data_lang/htm8

OILS / data_lang / htm8_test.py View on Github | oils.pub

413 lines, 268 significant

1	#!/usr/bin/env python2
2	from __future__ import print_function
3
4	from _devbuild.gen.htm8_asdl import (h8_id, h8_id_t, h8_id_str, attr_name,
5	attr_value_e, attr_value_str)
6
7	import unittest
8	import re
9
10	from typing import List, Tuple
11
12	from data_lang import htm8
13	from doctools.util import log
14
# Sample document lexed by LexerTest.testCommentParse.  It is read once, at
# import time, and the path is resolved against the current working directory.
with open('data_lang/testdata/hello.htm8') as html_file:
    TEST_HTML = html_file.read()
17
18
class RegexTest(unittest.TestCase):
    """Sanity checks of `re` newline handling, plus a smoke test of htm8._ATTR_RE."""

    def testDotAll(self):
        # type: () -> None
        """Show which patterns match a lone newline, and assert each result."""

        # Without re.DOTALL, '.' matches any character except a newline
        p1 = re.compile(r'.')
        m1 = p1.match('\n')
        print(m1)
        self.assertIsNone(m1)

        # With re.DOTALL, '.' matches a newline too
        p2 = re.compile(r'.', re.DOTALL)
        m2 = p2.match('\n')
        print(m2)
        self.assertIsNotNone(m2)

        # Inside a character class, '.' is a literal dot and \n is a newline,
        # so this class matches the newline explicitly
        #p3 = re.compile(r'[.\n]', re.VERBOSE)
        p3 = re.compile(r'[.\n]')
        m3 = p3.match('\n')
        print(m3)
        self.assertIsNotNone(m3)

        print('Negation')

        # A negated class matches a newline, regardless of re.DOTALL
        p4 = re.compile(r'[^>]')
        m4 = p4.match('\n')
        print(m4)
        self.assertIsNotNone(m4)

    def testAttrRe(self):
        # type: () -> None
        """Print the groups htm8._ATTR_RE captures for an attribute with '= '."""
        _ATTR_RE = htm8._ATTR_RE
        m = _ATTR_RE.match(' empty= val')
        # Report a failed match as a test failure, not as an AttributeError
        # raised by calling .groups() on None
        self.assertIsNotNone(m)
        print(m.groups())
45
46
class FunctionsTest(unittest.TestCase):
    """Exercises module-level helper functions of htm8."""

    def testFindLineNum(self):
        # type: () -> None
        """Print htm8._FindLineNum() for several positions; nothing is asserted."""
        three_lines = 'foo\n' * 3

        # The string is 12 characters long, so the last position, 50, is out
        # of bounds
        positions = (1, 5, 10, 50)
        for pos in positions:
            print(htm8._FindLineNum(three_lines, pos))
55
56
class AttrLexerTest(unittest.TestCase):
    """Drives htm8.AttrLexer over the attributes of a single tag."""

    def _AttrLexerFor(self, h, expected_ids):
        # type: (str, List[h8_id_t]) -> htm8.AttrLexer
        """Lex h, then return an AttrLexer positioned on the last token read.

        One token is read from an htm8.Lexer per entry of expected_ids, and
        its ID is asserted to equal that entry.  The AttrLexer is initialized
        with the lexer's TagNamePos() and the end position of the last token.
        """
        lx = htm8.Lexer(h)
        end_pos = -1
        for expected in expected_ids:
            tok_id, end_pos = lx.Read()
            self.assertEqual(expected, tok_id)

        attr_lx = htm8.AttrLexer(h)
        attr_lx.Init(lx.TagNamePos(), end_pos)
        return attr_lx

    def _ExpectReadRawValueFails(self, attr_lx):
        # type: (htm8.AttrLexer) -> None
        """Fail the test unless attr_lx.ReadRawValue() raises AssertionError."""
        raised = False
        try:
            attr_lx.ReadRawValue()
        except AssertionError as e:
            print(e)
            raised = True

        if not raised:
            self.fail('should have failed')

    def testNoAttrs(self):
        # type: () -> None

        # TODO: h8_id.StartTag and EndTag will expose the tag_name_pos - the
        # end of the tag name

        # The leading 'x ' is raw data, which is read and skipped before the
        # start tag
        attr_lx = self._AttrLexerFor('x <a>', [h8_id.RawData, h8_id.StartTag])

        # <a> has no attributes, so the first read reports Done without a span
        status, begin, end = attr_lx.ReadName()
        self.assertEqual(status, attr_name.Done)
        self.assertEqual(-1, begin)
        self.assertEqual(-1, end)

        # Reading a value when no name was read is a usage error
        self._ExpectReadRawValueFails(attr_lx)

    def testInvalid(self):
        attr_lx = self._AttrLexerFor('<a !>', [h8_id.StartTag])

        # '!' can't begin an attribute name
        status, begin, end = attr_lx.ReadName()
        self.assertEqual(status, attr_name.Invalid)
        self.assertEqual(-1, begin)
        self.assertEqual(-1, end)

        self._ExpectReadRawValueFails(attr_lx)

    def testEmpty(self):
        attr_lx = self._AttrLexerFor('<img src=/>', [h8_id.StartEndTag])

        # 'src' spans [5, 8), and the '=' after it means a value is expected
        status, begin, end = attr_lx.ReadName()
        self.assertEqual(status, attr_name.Ok)
        self.assertEqual(5, begin)
        self.assertEqual(8, end)
        self.assertEqual(False, attr_lx.next_value_is_missing)

        self.assertEqual(True, attr_lx.AttrNameEquals('src'))
        self.assertEqual(False, attr_lx.AttrNameEquals('srcz'))

        # Nothing follows the '=', so the value is Empty and has no span
        kind, val_begin, val_end = attr_lx.ReadRawValue()
        log('v = %s', attr_value_str(kind))
        self.assertEqual(attr_value_e.Empty, kind)
        self.assertEqual(-1, val_begin)
        self.assertEqual(-1, val_end)

    def testMissing(self):
        attr_lx = self._AttrLexerFor('<img SRC/>', [h8_id.StartEndTag])

        # 'SRC' spans [5, 8), and there is no '=' after it
        status, begin, end = attr_lx.ReadName()
        self.assertEqual(status, attr_name.Ok)
        self.assertEqual(5, begin)
        self.assertEqual(8, end)
        self.assertEqual(True, attr_lx.next_value_is_missing)

        # The upper case name 'SRC' compares equal to lower case 'src'
        self.assertEqual(True, attr_lx.AttrNameEquals('src'))
        self.assertEqual(False, attr_lx.AttrNameEquals('srcz'))

        kind, val_begin, val_end = attr_lx.ReadRawValue()
        self.assertEqual(attr_value_e.Missing, kind)
        self.assertEqual(-1, val_begin)
        self.assertEqual(-1, val_end)

    def testUnquoted(self):
        # CAREFUL: /> is a StartEndTag, and / is not part of unquoted value
        h = '<a x=foo/>'
        attr_lx = self._AttrLexerFor(h, [h8_id.StartEndTag])

        status, begin, end = attr_lx.ReadName()
        self.assertEqual(status, attr_name.Ok)
        self.assertEqual(3, begin)
        self.assertEqual(4, end)

        kind, val_begin, val_end = attr_lx.ReadRawValue()

        log('v = %s', attr_value_str(kind))
        log('unquoted val %r', h[val_begin:val_end])

        # The value 'foo' spans [5, 8), which excludes the trailing '/'
        self.assertEqual(attr_value_e.Unquoted, kind)
        self.assertEqual(5, val_begin)
        self.assertEqual(8, val_end)
185
186
def ValidTokenList(s, no_special_tags=False):
    # type: (str, bool) -> List[Tuple[h8_id_t, int]]
    """Lex all of s eagerly into a list of (token ID, end position) pairs.

    This is a plain loop over Lexer.Read() rather than an iterator, so it can
    be more easily translated to C++.

    The returned list always ends with the h8_id.EndOfStream token.

    Raises:
      htm8.LexError: if the lexer yields h8_id.Invalid.  The error carries the
        position where the offending token starts.
    """
    lexer = htm8.Lexer(s, no_special_tags=no_special_tags)
    result = []  # type: List[Tuple[h8_id_t, int]]

    tok_start = 0  # where the token being read begins
    done = False
    while not done:
        tok_id, tok_end = lexer.Read()
        if tok_id == h8_id.Invalid:
            raise htm8.LexError(s, tok_start)

        result.append((tok_id, tok_end))
        done = (tok_id == h8_id.EndOfStream)
        tok_start = tok_end

    return result
203
204
def Lex(h, no_special_tags=False):
    # type: (str, bool) -> List[Tuple[int, int]]
    """Lex h with ValidTokenList(), logging each token, and return the tokens.

    The repr of the input is printed first.  Then, for every token, its end
    position, its ID name, and the text it spans are logged.
    """
    print(repr(h))
    tokens = ValidTokenList(h, no_special_tags=no_special_tags)

    # Each token begins where the previous one ended
    frag_start = 0
    for tok_id, frag_end in tokens:
        log('%d %s %r', frag_end, h8_id_str(tok_id), h[frag_start:frag_end])
        frag_start = frag_end

    return tokens
215
216
class LexerTest(unittest.TestCase):
    """Checks the (token ID, end position) pairs produced by lexing HTM8.

    End positions are offsets into the input string, so the whitespace inside
    each test input is significant.  Multi-line inputs are therefore written
    as explicit per-line literals rather than as indented triple-quoted
    strings.
    """

    # NOTE(review): presumably a list of existing code that this lexer relates
    # to - confirm
    #
    # IndexLinker in devtools/make_help.py
    # <pre> sections in doc/html_help.py
    # TocExtractor in devtools/cmark.py

    def testPstrip(self):
        # type: () -> None
        """Remove anything like this.

        <p><pstrip> </pstrip></p>
        """
        pass

    def testCommentParse(self):
        # type: () -> None
        # Smoke test: the sample document must lex to the end without a
        # LexError.  The tokens themselves aren't checked.
        Lex(TEST_HTML)

    def testCommentParse2(self):
        # type: () -> None
        # A comment that spans two lines
        h = ('\n'
             '        hi <!-- line 1\n'
             '                line 2 --><br/>')
        tokens = Lex(h)

        expected = [
            (h8_id.RawData, 12),  # newline, indent, 'hi '
            (h8_id.Comment, 50),  # <!-- line 1 ... line 2 -->
            (h8_id.StartEndTag, 55),  # <br/>
            (h8_id.EndOfStream, 55),
        ]
        self.assertEqual(expected, tokens)

    def testProcessingInstruction(self):
        # type: () -> None
        # <?xml ?> header
        h = 'hi <? err ?>'
        tokens = Lex(h)

        expected = [
            (h8_id.RawData, 3),
            (h8_id.Processing, 12),  # <? err ?>
            (h8_id.EndOfStream, 12),
        ]
        self.assertEqual(expected, tokens)

    def testScriptStyle(self):
        # type: () -> None
        # The '<', '&&' and '>' inside <script> are not markup
        h = ('\n'
             '        hi <script src=""> if (x < 1 && y > 2 ) '
             '{ console.log(""); }\n'
             '        </script>\n'
             '        ')
        tokens = Lex(h)

        expected = [
            (h8_id.RawData, 12),
            (h8_id.StartTag, 27),  # <script src="">
            (h8_id.HtmlCData, 78),  # JavaScript code is HTML CData
            (h8_id.EndTag, 87),  # </script>
            (h8_id.RawData, 96),  # newline and trailing indent
            (h8_id.EndOfStream, 96),
        ]
        self.assertEqual(expected, tokens)

        # Test case matching
        tokens = Lex(h.replace('script', 'scrIPT'))
        self.assertEqual(expected, tokens)

    def testScriptStyleXml(self):
        # type: () -> None
        # The entity is 4 characters long, which is what the CharEntity span
        # 19..23 below requires
        h = 'hi <script src=""> &lt; </script>'
        # XML mode
        tokens = Lex(h, no_special_tags=True)

        expected = [
            (h8_id.RawData, 3),
            (h8_id.StartTag, 18),  # <script src="">
            (h8_id.RawData, 19),  # space
            (h8_id.CharEntity, 23),  # &lt;
            (h8_id.RawData, 24),  # space
            (h8_id.EndTag, 33),  # </script>
            (h8_id.EndOfStream, 33),
        ]
        self.assertEqual(expected, tokens)

    def testCData(self):
        # type: () -> None

        # from
        # /home/andy/src/languages/Python-3.11.5/Lib/test/xmltestdata/c14n-20/inC14N4.xml
        h = '<compute><![CDATA[value>"0" && value<"10" ?"valid":"error"]]></compute>'
        tokens = Lex(h)

        expected = [
            (h8_id.StartTag, 9),
            (h8_id.CData, 61),
            (h8_id.EndTag, 71),
            (h8_id.EndOfStream, 71),
        ]
        self.assertEqual(expected, tokens)

    def testEntity(self):
        # type: () -> None

        # from
        # /home/andy/src/Python-3.12.4/Lib/test/xmltestdata/c14n-20/inC14N5.xml
        h = '&ent1;, &ent2;!'
        tokens = Lex(h)

        expected = [
            (h8_id.CharEntity, 6),
            (h8_id.RawData, 8),
            (h8_id.CharEntity, 14),
            (h8_id.RawData, 15),
            (h8_id.EndOfStream, 15),
        ]
        self.assertEqual(expected, tokens)

    def testStartTag(self):
        # type: () -> None

        h = '<a>hi</a>'
        tokens = Lex(h)

        expected = [
            (h8_id.StartTag, 3),
            (h8_id.RawData, 5),
            (h8_id.EndTag, 9),
            (h8_id.EndOfStream, 9),
        ]
        self.assertEqual(expected, tokens)

        # Make sure we don't consume too much
        h = '<a><source>1.7</source></a>'
        tokens = Lex(h)

        expected = [
            (h8_id.StartTag, 3),
            (h8_id.StartTag, 11),
            (h8_id.RawData, 14),
            (h8_id.EndTag, 23),
            (h8_id.EndTag, 27),
            (h8_id.EndOfStream, 27),
        ]
        self.assertEqual(expected, tokens)

        # TODO: also cover a multi-line document such as
        #
        #   <configuration>
        #     <source>1.7</source>
        #   </configuration>
        #
        # This case used to sit below a bare `return`, so it never ran, and
        # its expected token list was an unfinished placeholder.

    def testBad(self):
        # type: () -> None
        # A lone '&' or '>' gets its own token ID, rather than being Invalid
        h = '&'
        tokens = Lex(h)

        expected = [
            (h8_id.BadAmpersand, 1),
            (h8_id.EndOfStream, 1),
        ]
        self.assertEqual(expected, tokens)

        h = '>'
        tokens = Lex(h)

        expected = [
            (h8_id.BadGreaterThan, 1),
            (h8_id.EndOfStream, 1),
        ]
        self.assertEqual(expected, tokens)

    def testEndOfStream(self):
        # type: () -> None

        # NUL is end: the 'b' after it is never lexed
        h = 'a\0b'
        tokens = Lex(h)

        expected = [
            (h8_id.RawData, 1),
            (h8_id.EndOfStream, 2),
        ]
        self.assertEqual(expected, tokens)
410
411
# Run every TestCase in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()