data_lang/htm8

OILS / data_lang / htm8_test.py View on Github | oils.pub

472 lines, 311 significant

1	#!/usr/bin/env python2
2	from __future__ import print_function
3
4	from _devbuild.gen.htm8_asdl import (h8_id, h8_id_t, h8_id_str, attr_name,
5	attr_value_e, attr_value_str)
6
7	import unittest
8	import re
9
10	from typing import List, Tuple, Any
11
12	from data_lang import htm8
13	from doctools.util import log
14
# Sample HTM8 document, read once at import time and shared by the tests below.
# The path is relative, so the tests must be run from the repository root.
with open('data_lang/testdata/hello.htm8') as f:
    TEST_HTML = f.read()
17
18
class RegexTest(unittest.TestCase):
    """Checks of stdlib `re` behavior around newlines, plus a smoke test of
    htm8._ATTR_RE."""

    def testDotAll(self):
        # type: () -> None
        """Show how '.', re.DOTALL and character classes treat a newline."""

        # Without re.DOTALL, '.' matches any character EXCEPT a newline
        p1 = re.compile(r'.')
        m1 = p1.match('\n')
        print(m1)
        self.assertIsNone(m1)

        # With re.DOTALL, '.' matches a newline too
        p2 = re.compile(r'.', re.DOTALL)
        m2 = p2.match('\n')
        print(m2)
        self.assertIsNotNone(m2)

        # Inside a character class '.' is a literal dot; the \n alternative is
        # what matches here
        p3 = re.compile(r'[.\n]')
        m3 = p3.match('\n')
        print(m3)
        self.assertIsNotNone(m3)

        print('Negation')

        # A negated class matches a newline, with or without re.DOTALL
        p4 = re.compile(r'[^>]')
        m4 = p4.match('\n')
        print(m4)
        self.assertIsNotNone(m4)

    def testAttrRe(self):
        # type: () -> None
        """Print the groups _ATTR_RE captures for an attribute with '= val'."""
        _ATTR_RE = htm8._ATTR_RE
        m = _ATTR_RE.match(' empty= val')
        # Fail cleanly rather than with AttributeError on None.groups()
        self.assertIsNotNone(m)
        print(m.groups())
45
46
class FunctionsTest(unittest.TestCase):
    """Smoke tests for module-level helper functions in htm8."""

    def testFindLineNum(self):
        # type: () -> None
        """Print the line number reported for several positions."""
        text = 'foo\n' * 3
        # The text is 12 bytes long, so the last position (50) is out of bounds
        positions = [1, 5, 10, 50]
        for pos in positions:
            print(htm8._FindLineNum(text, pos))
55
56
def _MakeAttrLexer(t, h, expected_tag=h8_id.StartTag):
    # type: (Any, str, h8_id_t) -> htm8.AttrLexer
    """Return an AttrLexer positioned on the first tag of `h`.

    Args:
      t: the running unittest.TestCase; used to assert on the first token
      h: HTM8 text whose first token must be a tag
      expected_tag: the token ID the first token is asserted to have
    """
    lx = htm8.Lexer(h)

    # The first token must be the tag whose attributes we want to lex
    tok_id, end_pos = lx.Read()
    t.assertEqual(expected_tag, tok_id)

    # The attribute lexer is initialized with the range from the tag name
    # position to the end of the tag token
    attr_lx = htm8.AttrLexer(h)
    attr_lx.Init(lx.TagNamePos(), end_pos)

    return attr_lx
69
70
class AttrLexerTest(unittest.TestCase):
    """Drives htm8.AttrLexer over the attributes of a single tag.

    Each test calls ReadName() and then ReadValue(), checking the returned
    status and the (start, end) offsets into the input string.
    """

    def testNoAttrs(self):
        # type: () -> None

        # TODO: h8_id.StartTag and EndTag will expose the tag_name_pos - the
        # end of the tag name

        h = 'x <a>'
        lx = htm8.Lexer(h)

        # Skip raw data
        tok_id, end_pos = lx.Read()
        self.assertEqual(h8_id.RawData, tok_id)

        tok_id, end_pos = lx.Read()
        self.assertEqual(h8_id.StartTag, tok_id)

        attr_lx = htm8.AttrLexer(h)
        attr_lx.Init(lx.TagNamePos(), end_pos)

        # There is no attribute
        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Done)
        self.assertEqual(-1, name_start)
        self.assertEqual(-1, name_end)

        # ReadValue() is expected to raise AssertionError when no attribute
        # name was read
        with self.assertRaises(AssertionError) as cm:
            attr_lx.ReadValue()
        print(cm.exception)

    def testInvalid(self):
        # type: () -> None
        h = '<a !>'
        attr_lx = _MakeAttrLexer(self, h)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Invalid)
        self.assertEqual(-1, name_start)
        self.assertEqual(-1, name_end)

        # ReadValue() is expected to raise AssertionError after an invalid
        # name
        with self.assertRaises(AssertionError) as cm:
            attr_lx.ReadValue()
        print(cm.exception)

    def testEmpty(self):
        # type: () -> None
        # 'src=' is followed directly by '/>', so the value is Empty
        h = '<img src=/>'
        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartEndTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(5, name_start)
        self.assertEqual(8, name_end)
        self.assertEqual(False, attr_lx.next_value_is_missing)

        self.assertEqual(True, attr_lx.AttrNameEquals('src'))
        self.assertEqual(False, attr_lx.AttrNameEquals('srcz'))

        v, attr_start, attr_end = attr_lx.ReadValue()
        log('v = %s', attr_value_str(v))
        self.assertEqual(attr_value_e.Empty, v)
        self.assertEqual(-1, attr_start)
        self.assertEqual(-1, attr_end)

    def testMissing(self):
        # type: () -> None
        # No '=' after the name, so the value is Missing
        h = '<img SRC/>'
        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartEndTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(5, name_start)
        self.assertEqual(8, name_end)
        self.assertEqual(True, attr_lx.next_value_is_missing)

        # The name in the input is upper case, but it equals 'src'
        self.assertEqual(True, attr_lx.AttrNameEquals('src'))
        self.assertEqual(False, attr_lx.AttrNameEquals('srcz'))

        v, attr_start, attr_end = attr_lx.ReadValue()
        self.assertEqual(attr_value_e.Missing, v)
        self.assertEqual(-1, attr_start)
        self.assertEqual(-1, attr_end)

    def testUnquoted(self):
        # type: () -> None
        # CAREFUL: /> is a StartEndTag, and / is not part of unquoted value
        h = '<a x=foo/>'
        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartEndTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(3, name_start)
        self.assertEqual(4, name_end)

        v, attr_start, attr_end = attr_lx.ReadValue()

        log('v = %s', attr_value_str(v))
        log('unquoted val %r', h[attr_start:attr_end])

        self.assertEqual(attr_value_e.Unquoted, v)
        self.assertEqual(5, attr_start)
        self.assertEqual(8, attr_end)

    def testDoubleQuoted(self):
        # type: () -> None
        h = '<a x="f&">'
        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(3, name_start)
        self.assertEqual(4, name_end)

        v, attr_start, attr_end = attr_lx.ReadValue()

        log('v = %s', attr_value_str(v))
        log('val %r', h[attr_start:attr_end])

        # The offsets exclude the surrounding quotes
        self.assertEqual(attr_value_e.DoubleQuoted, v)
        self.assertEqual(6, attr_start)
        self.assertEqual(8, attr_end)

    def testSingleQuoted(self):
        # type: () -> None
        h = "<a x='&f'>"
        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(3, name_start)
        self.assertEqual(4, name_end)

        v, attr_start, attr_end = attr_lx.ReadValue()

        log('v = %s', attr_value_str(v))
        # Same label as testDoubleQuoted; this value is not unquoted
        log('val %r', h[attr_start:attr_end])

        # The offsets exclude the surrounding quotes
        self.assertEqual(attr_value_e.SingleQuoted, v)
        self.assertEqual(6, attr_start)
        self.assertEqual(8, attr_end)

    def testDoubleQuoted_Bad(self):
        # type: () -> None
        # The closing double quote is absent
        h = '<a x="foo>'
        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(3, name_start)
        self.assertEqual(4, name_end)

        with self.assertRaises(htm8.LexError) as cm:
            attr_lx.ReadValue()
        print(cm.exception)

    def testSingleQuoted_Bad(self):
        # type: () -> None
        # The closing single quote is absent
        h = "<a x='foo>"
        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(3, name_start)
        self.assertEqual(4, name_end)

        with self.assertRaises(htm8.LexError) as cm:
            attr_lx.ReadValue()
        print(cm.exception)
244
245
def ValidTokenList(s, no_special_tags=False):
    # type: (str, bool) -> List[Tuple[h8_id_t, int]]
    """Lex all of `s` into a list of (token ID, end position) pairs.

    A plain loop rather than an iterator, so it's easier to translate to C++.
    The list ends with the EndOfStream token.  An Invalid token raises
    htm8.LexError at the position where that token started.
    """
    lexer = htm8.Lexer(s, no_special_tags=no_special_tags)
    result = []
    tok_start = 0  # where the token being read begins

    while True:
        tok_id, tok_end = lexer.Read()
        result.append((tok_id, tok_end))

        if tok_id == h8_id.EndOfStream:
            break
        elif tok_id == h8_id.Invalid:
            raise htm8.LexError(s, tok_start)

        # The next token begins where this one ended
        tok_start = tok_end

    return result
262
263
def Lex(h, no_special_tags=False):
    # type: (str, bool) -> List[Tuple[int, int]]
    """Lex `h` with ValidTokenList(), logging each token, and return the list.

    Prints repr(h) first, then logs the end position, token name and source
    fragment of every token.
    """
    print(repr(h))
    tokens = ValidTokenList(h, no_special_tags=no_special_tags)

    prev_end = 0
    for tok_id, end_pos in tokens:
        # Each token spans from the end of the previous one to its own end
        log('%d %s %r', end_pos, h8_id_str(tok_id), h[prev_end:end_pos])
        prev_end = end_pos

    return tokens
274
275
class LexerTest(unittest.TestCase):
    """Checks the (token ID, end position) pairs the lexer yields.

    The expected positions are byte offsets into the input, so whitespace in
    the inputs is significant.  Multi-line inputs are spelled out as explicit
    string pieces to keep their leading whitespace visible.
    """

    # IndexLinker in devtools/make_help.py
    #  <pre> sections in doc/html_help.py
    # TocExtractor in devtools/cmark.py

    def testPstrip(self):
        # type: () -> None
        """Remove anything like this.

        <p><pstrip> </pstrip></p>
        """
        pass

    def testCommentParse(self):
        # type: () -> None
        # Smoke test: the sample document must lex without raising
        Lex(TEST_HTML)

    def testCommentParse2(self):
        # type: () -> None
        # A comment that spans two lines
        h = ('\n'
             '        hi <!-- line 1\n'
             '                line 2 --><br/>')
        tokens = Lex(h)

        self.assertEqual(
            [
                (h8_id.RawData, 12),  # newline, indent, 'hi '
                (h8_id.Comment, 50),  # <!-- line 1 ... line 2 -->
                (h8_id.StartEndTag, 55),  # <br/>
                (h8_id.EndOfStream, 55),
            ],
            tokens)

    def testProcessingInstruction(self):
        # type: () -> None
        # <?xml ?> header
        h = 'hi <? err ?>'
        tokens = Lex(h)

        self.assertEqual(
            [
                (h8_id.RawData, 3),
                (h8_id.Processing, 12),  # <? err ?>
                (h8_id.EndOfStream, 12),
            ],
            tokens)

    def testScriptStyle(self):
        # type: () -> None
        h = ('\n'
             '        hi <script src=""> if (x < 1 && y > 2 ) { console.log(""); }\n'
             '        </script>\n'
             '        ')
        tokens = Lex(h)

        expected = [
            (h8_id.RawData, 12),  # newline, indent, 'hi '
            (h8_id.StartTag, 27),  # <script src="">
            (h8_id.HtmlCData, 78),  # JavaScript code is HTML CData
            (h8_id.EndTag, 87),  # </script>
            (h8_id.RawData, 96),  # newline and trailing indent
            (h8_id.EndOfStream, 96),
        ]
        self.assertEqual(expected, tokens)

        # Test case matching
        tokens = Lex(h.replace('script', 'scrIPT'))
        self.assertEqual(expected, tokens)

    def testScriptStyleXml(self):
        # type: () -> None
        # The 4-byte entity &lt; is required: the expected CharEntity token
        # spans offsets 19-23 and the whole input is 33 bytes long
        h = 'hi <script src=""> &lt; </script>'
        # XML mode
        tokens = Lex(h, no_special_tags=True)

        self.assertEqual(
            [
                (h8_id.RawData, 3),  # 'hi '
                (h8_id.StartTag, 18),  # <script src="">
                (h8_id.RawData, 19),  # space
                (h8_id.CharEntity, 23),  # &lt;
                (h8_id.RawData, 24),  # space
                (h8_id.EndTag, 33),  # </script>
                (h8_id.EndOfStream, 33),
            ],
            tokens)

    def testCData(self):
        # type: () -> None

        # from
        # /home/andy/src/languages/Python-3.11.5/Lib/test/xmltestdata/c14n-20/inC14N4.xml
        h = '<compute><![CDATA[value>"0" && value<"10" ?"valid":"error"]]></compute>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 9),
            (h8_id.CData, 61),
            (h8_id.EndTag, 71),
            (h8_id.EndOfStream, 71),
        ], tokens)

    def testEntity(self):
        # type: () -> None

        # from
        # /home/andy/src/Python-3.12.4/Lib/test/xmltestdata/c14n-20/inC14N5.xml
        h = '&ent1;, &ent2;!'

        tokens = Lex(h)

        self.assertEqual([
            (h8_id.CharEntity, 6),
            (h8_id.RawData, 8),
            (h8_id.CharEntity, 14),
            (h8_id.RawData, 15),
            (h8_id.EndOfStream, 15),
        ], tokens)

    def testStartTag(self):
        # type: () -> None

        h = '<a>hi</a>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 3),
            (h8_id.RawData, 5),
            (h8_id.EndTag, 9),
            (h8_id.EndOfStream, 9),
        ], tokens)

        # Make sure we don't consume too much
        h = '<a><source>1.7</source></a>'

        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 3),
            (h8_id.StartTag, 11),
            (h8_id.RawData, 14),
            (h8_id.EndTag, 23),
            (h8_id.EndTag, 27),
            (h8_id.EndOfStream, 27),
        ], tokens)

        # TODO: add a multi-line case with <source> nested in an indented
        # <configuration> element, with verified expected offsets.

    def testBad(self):
        # type: () -> None
        # A lone & or > gets its own "Bad" token ID, not h8_id.Invalid
        h = '&'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.BadAmpersand, 1),
            (h8_id.EndOfStream, 1),
        ], tokens)

        h = '>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.BadGreaterThan, 1),
            (h8_id.EndOfStream, 1),
        ], tokens)

    def testEndOfStream(self):
        # type: () -> None

        # NUL is end
        h = 'a\0b'
        tokens = Lex(h)

        # The 'b' after the NUL is never tokenized
        self.assertEqual([
            (h8_id.RawData, 1),
            (h8_id.EndOfStream, 2),
        ], tokens)
469
470
471	if __name__ == '__main__':
472	unittest.main()