data_lang/htm8

OILS / data_lang / htm8_test.py View on Github | oils.pub

501 lines, 336 significant

1	#!/usr/bin/env python2
2	from __future__ import print_function
3
4	from _devbuild.gen.htm8_asdl import (h8_id, h8_id_t, h8_id_str, attr_name,
5	attr_value_e, attr_value_str)
6
7	import unittest
8	import re
9
10	from typing import List, Tuple
11
12	from data_lang import htm8
13	from doctools.util import log
14
# Sample document, read once when this module is imported.  The path is
# relative, so it is resolved against the current working directory.
with open('data_lang/testdata/hello.htm8') as f:
    TEST_HTML = f.read()
17
18
class RegexTest(unittest.TestCase):
    """Exploratory checks of Python `re` matching behavior.

    These tests only print results; they make no assertions.
    """

    def testDotAll(self):
        # type: () -> None
        """Print how '.', re.DOTALL and character classes treat a newline."""
        newline = '\n'

        # A bare '.' does not match a newline unless re.DOTALL is given
        plain_dot = re.compile(r'.')
        print(plain_dot.match(newline))

        dotall_dot = re.compile(r'.', re.DOTALL)
        print(dotall_dot.match(newline))

        # Character class containing a literal dot and a newline.  (An
        # earlier variant of this pattern was compiled with re.VERBOSE.)
        dot_or_newline = re.compile(r'[.\n]')
        print(dot_or_newline.match(newline))

        print('Negation')

        # A negated class matches a newline too
        not_greater_than = re.compile(r'[^>]')
        print(not_greater_than.match(newline))

    def testAttrRe(self):
        # type: () -> None
        """Print the groups htm8._ATTR_RE captures for ' empty= val'."""
        match = htm8._ATTR_RE.match(' empty= val')
        print(match.groups())
45
46
class FunctionsTest(unittest.TestCase):
    """Exercises module-level helper functions in htm8."""

    def testFindLineNum(self):
        # type: () -> None
        """Print htm8._FindLineNum() results for several positions."""
        text = 'foo\n' * 3

        # The text is 12 characters long, so the last position is out of
        # bounds
        positions = [1, 5, 10, 50]
        for pos in positions:
            print(htm8._FindLineNum(text, pos))
55
56
class AttrLexerTest(unittest.TestCase):
    """Tests for htm8.AttrLexer.

    Each test tokenizes a small document with htm8.Lexer, then points an
    AttrLexer at the tag that was read and checks the attribute names and
    raw values it reports.
    """

    def _InitAttrLexer(self, h, lx, expected_tag_id):
        # type: (str, htm8.Lexer, h8_id_t) -> htm8.AttrLexer
        """Read the next token from lx and return an AttrLexer for it.

        Args:
          h: the document that lx is lexing
          lx: lexer whose next token should be the tag under test
          expected_tag_id: token ID that the next token must have

        Returns:
          An AttrLexer over h, initialized with the lexer's tag name
          position and the end position of the token just read.
        """
        tok_id, end_pos = lx.Read()
        self.assertEqual(expected_tag_id, tok_id)

        attr_lx = htm8.AttrLexer(h)
        attr_lx.Init(lx.TagNamePos(), end_pos)
        return attr_lx

    def testNoAttrs(self):
        # type: () -> None
        """A tag with no attributes reports attr_name.Done right away."""

        # TODO: h8_id.StartTag and EndTag will expose the tag_name_pos - the
        # end of the tag name

        h = 'x <a>'
        lx = htm8.Lexer(h)

        # Skip raw data
        tok_id, _ = lx.Read()
        self.assertEqual(h8_id.RawData, tok_id)

        attr_lx = self._InitAttrLexer(h, lx, h8_id.StartTag)

        # The tag has no attributes
        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Done)
        self.assertEqual(-1, name_start)
        self.assertEqual(-1, name_end)

        # No name was read, so asking for a value is expected to trip an
        # assertion inside the lexer
        try:
            attr_lx.ReadRawValue()
        except AssertionError as e:
            print(e)
        else:
            self.fail('should have failed')

    def testInvalid(self):
        # type: () -> None
        """A stray '!' where a name is expected yields attr_name.Invalid."""
        h = '<a !>'
        attr_lx = self._InitAttrLexer(h, htm8.Lexer(h), h8_id.StartTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Invalid)
        self.assertEqual(-1, name_start)
        self.assertEqual(-1, name_end)

        # No valid name was read, so asking for a value is expected to trip
        # an assertion inside the lexer
        try:
            attr_lx.ReadRawValue()
        except AssertionError as e:
            print(e)
        else:
            self.fail('should have failed')

    def testEmpty(self):
        # type: () -> None
        """'src=' followed directly by '/>' has an Empty value."""
        h = '<img src=/>'
        attr_lx = self._InitAttrLexer(h, htm8.Lexer(h), h8_id.StartEndTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(5, name_start)
        self.assertEqual(8, name_end)
        self.assertEqual(False, attr_lx.next_value_is_missing)

        self.assertEqual(True, attr_lx.AttrNameEquals('src'))
        self.assertEqual(False, attr_lx.AttrNameEquals('srcz'))

        v, attr_start, attr_end = attr_lx.ReadRawValue()
        log('v = %s', attr_value_str(v))
        self.assertEqual(attr_value_e.Empty, v)
        self.assertEqual(-1, attr_start)
        self.assertEqual(-1, attr_end)

    def testMissing(self):
        # type: () -> None
        """A name with no '=' at all has a Missing value."""
        h = '<img SRC/>'
        attr_lx = self._InitAttrLexer(h, htm8.Lexer(h), h8_id.StartEndTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(5, name_start)
        self.assertEqual(8, name_end)
        self.assertEqual(True, attr_lx.next_value_is_missing)

        # The document spells the name 'SRC', yet it must equal 'src'
        self.assertEqual(True, attr_lx.AttrNameEquals('src'))
        self.assertEqual(False, attr_lx.AttrNameEquals('srcz'))

        v, attr_start, attr_end = attr_lx.ReadRawValue()
        self.assertEqual(attr_value_e.Missing, v)
        self.assertEqual(-1, attr_start)
        self.assertEqual(-1, attr_end)

    def testUnquoted(self):
        # type: () -> None
        """An unquoted value stops before the '/>' that closes the tag."""
        # CAREFUL: /> is a StartEndTag, and / is not part of unquoted value
        h = '<a x=foo/>'
        attr_lx = self._InitAttrLexer(h, htm8.Lexer(h), h8_id.StartEndTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(3, name_start)
        self.assertEqual(4, name_end)

        v, attr_start, attr_end = attr_lx.ReadRawValue()

        log('v = %s', attr_value_str(v))
        log('unquoted val %r', h[attr_start:attr_end])

        self.assertEqual(attr_value_e.Unquoted, v)
        self.assertEqual(5, attr_start)
        self.assertEqual(8, attr_end)

    def testDoubleQuoted(self):
        # type: () -> None
        """A double-quoted value's span excludes the quotes."""
        h = '<a x="f&">'
        attr_lx = self._InitAttrLexer(h, htm8.Lexer(h), h8_id.StartTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(3, name_start)
        self.assertEqual(4, name_end)

        v, attr_start, attr_end = attr_lx.ReadRawValue()

        log('v = %s', attr_value_str(v))
        log('val %r', h[attr_start:attr_end])

        self.assertEqual(attr_value_e.DoubleQuoted, v)
        self.assertEqual(6, attr_start)
        self.assertEqual(8, attr_end)

    def testSingleQuoted(self):
        # type: () -> None
        """A single-quoted value's span excludes the quotes."""
        h = "<a x='&f'>"
        attr_lx = self._InitAttrLexer(h, htm8.Lexer(h), h8_id.StartTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(3, name_start)
        self.assertEqual(4, name_end)

        v, attr_start, attr_end = attr_lx.ReadRawValue()

        log('v = %s', attr_value_str(v))
        log('val %r', h[attr_start:attr_end])

        self.assertEqual(attr_value_e.SingleQuoted, v)
        self.assertEqual(6, attr_start)
        self.assertEqual(8, attr_end)

    def testDoubleQuoted_Bad(self):
        # type: () -> None
        """An unterminated double-quoted value raises htm8.LexError."""
        h = '<a x="foo>'
        attr_lx = self._InitAttrLexer(h, htm8.Lexer(h), h8_id.StartTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(3, name_start)
        self.assertEqual(4, name_end)

        try:
            attr_lx.ReadRawValue()
        except htm8.LexError as e:
            print(e)
        else:
            self.fail('Expected LexError')

    def testSingleQuoted_Bad(self):
        # type: () -> None
        """An unterminated single-quoted value raises htm8.LexError."""
        h = "<a x='foo>"
        attr_lx = self._InitAttrLexer(h, htm8.Lexer(h), h8_id.StartTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(3, name_start)
        self.assertEqual(4, name_end)

        try:
            attr_lx.ReadRawValue()
        except htm8.LexError as e:
            print(e)
        else:
            self.fail('Expected LexError')
273
274
def ValidTokenList(s, no_special_tags=False):
    # type: (str, bool) -> List[Tuple[h8_id_t, int]]
    """Lex all of s and return its (token ID, end position) pairs.

    Written as a plain loop, without iterators, so that it can be more
    easily translated to C++.

    The returned list ends with the h8_id.EndOfStream token.

    Raises:
      htm8.LexError: if the lexer returns h8_id.Invalid.  The error carries
        the position where the offending token starts.
    """
    lexer = htm8.Lexer(s, no_special_tags=no_special_tags)
    result = []
    tok_start = 0  # end of the previous token == start of the current one

    while True:
        tok_id, end_pos = lexer.Read()

        if tok_id == h8_id.Invalid:
            raise htm8.LexError(s, tok_start)

        result.append((tok_id, end_pos))
        if tok_id == h8_id.EndOfStream:
            return result

        tok_start = end_pos
291
292
def Lex(h, no_special_tags=False):
    # type: (str, bool) -> List[Tuple[int, int]]
    """Lex h, log every token with the text it spans, and return the tokens.

    The input is printed with repr() first.  The return value is whatever
    ValidTokenList() produced, unchanged.
    """
    print(repr(h))
    result = ValidTokenList(h, no_special_tags=no_special_tags)

    prev_end = 0
    for tok_id, end_pos in result:
        # Each token spans from the end of the previous one to its own end
        log('%d %s %r', end_pos, h8_id_str(tok_id), h[prev_end:end_pos])
        prev_end = end_pos

    return result
303
304
class LexerTest(unittest.TestCase):
    """Tests for htm8.Lexer, driven through the Lex() helper.

    Expected values are lists of (token ID, end position) pairs.

    NOTE: the triple-quoted documents below are lexed verbatim, so their
    leading indentation is part of the input and is counted in the expected
    end positions.  Re-indenting those literals breaks the tests.
    """

    # IndexLinker in devtools/make_help.py
    #  <pre> sections in doc/html_help.py
    # TocExtractor in devtools/cmark.py

    def testPstrip(self):
        # type: () -> None
        """Remove anything like this.

        <p><pstrip> </pstrip></p>
        """
        pass

    def testCommentParse(self):
        # type: () -> None
        """Smoke test: the sample document lexes without raising."""
        Lex(TEST_HTML)

    def testCommentParse2(self):
        # type: () -> None
        """A comment spanning two lines is a single Comment token."""
        # 8 spaces before 'hi' and 16 before 'line 2' are part of the input
        h = '''
        hi <!-- line 1
                line 2 --><br/>'''
        tokens = Lex(h)

        self.assertEqual(
            [
                (h8_id.RawData, 12),  # newline, indentation, 'hi '
                (h8_id.Comment, 50),  # <!-- line 1 ... line 2 -->
                (h8_id.StartEndTag, 55),  # <br/>
                (h8_id.EndOfStream, 55),
            ],
            tokens)

    def testProcessingInstruction(self):
        # type: () -> None
        """<? ... ?> is a single Processing token."""
        # <?xml ?> header
        h = 'hi <? err ?>'
        tokens = Lex(h)

        self.assertEqual(
            [
                (h8_id.RawData, 3),
                (h8_id.Processing, 12),  # <? err ?>
                (h8_id.EndOfStream, 12),
            ],
            tokens)

    def testScriptStyle(self):
        # type: () -> None
        """The body of <script> is one HtmlCData token, in any letter case."""
        # Every line of the literal after the first starts with 8 spaces
        h = '''
        hi <script src=""> if (x < 1 && y > 2 ) { console.log(""); }
        </script>
        '''
        tokens = Lex(h)

        expected = [
            (h8_id.RawData, 12),
            (h8_id.StartTag, 27),  # <script src="">
            (h8_id.HtmlCData, 78),  # JavaScript code is HTML CData
            (h8_id.EndTag, 87),  # </script>
            (h8_id.RawData, 96),  # newline and indentation
            (h8_id.EndOfStream, 96),
        ]
        self.assertEqual(expected, tokens)

        # Test case matching
        tokens = Lex(h.replace('script', 'scrIPT'))
        self.assertEqual(expected, tokens)

    def testScriptStyleXml(self):
        # type: () -> None
        """With no_special_tags=True, <script> content is lexed normally."""
        # The entity must be spelled out: it is the 4-character CharEntity
        # token that the expected offsets below account for
        h = 'hi <script src=""> &lt; </script>'
        # XML mode
        tokens = Lex(h, no_special_tags=True)

        self.assertEqual(
            [
                (h8_id.RawData, 3),  # 'hi '
                (h8_id.StartTag, 18),  # <script src="">
                (h8_id.RawData, 19),  # space
                (h8_id.CharEntity, 23),  # &lt;
                (h8_id.RawData, 24),  # space
                (h8_id.EndTag, 33),  # </script>
                (h8_id.EndOfStream, 33),
            ],
            tokens)

    def testCData(self):
        # type: () -> None
        """<![CDATA[ ... ]]> is a single CData token."""

        # from
        # /home/andy/src/languages/Python-3.11.5/Lib/test/xmltestdata/c14n-20/inC14N4.xml
        h = '<compute><![CDATA[value>"0" && value<"10" ?"valid":"error"]]></compute>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 9),
            (h8_id.CData, 61),
            (h8_id.EndTag, 71),
            (h8_id.EndOfStream, 71),
        ], tokens)

    def testEntity(self):
        # type: () -> None
        """Named entities are CharEntity tokens separated by RawData."""

        # from
        # /home/andy/src/Python-3.12.4/Lib/test/xmltestdata/c14n-20/inC14N5.xml
        h = '&ent1;, &ent2;!'

        tokens = Lex(h)

        self.assertEqual([
            (h8_id.CharEntity, 6),
            (h8_id.RawData, 8),
            (h8_id.CharEntity, 14),
            (h8_id.RawData, 15),
            (h8_id.EndOfStream, 15),
        ], tokens)

    def testStartTag(self):
        # type: () -> None
        """Start and end tags are tokenized one tag at a time."""

        h = '<a>hi</a>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 3),
            (h8_id.RawData, 5),
            (h8_id.EndTag, 9),
            (h8_id.EndOfStream, 9),
        ], tokens)

        # Make sure we don't consume too much
        h = '<a><source>1.7</source></a>'

        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 3),
            (h8_id.StartTag, 11),
            (h8_id.RawData, 14),
            (h8_id.EndTag, 23),
            (h8_id.EndTag, 27),
            (h8_id.EndOfStream, 27),
        ], tokens)

        # TODO: add a multi-line case with <source> nested inside
        # <configuration>.  A draft of it used to sit here behind an early
        # 'return', with placeholder offsets, and never ran.

    def testBad(self):
        # type: () -> None
        """A lone '&' or '>' gets its own Bad* token ID."""
        h = '&'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.BadAmpersand, 1),
            (h8_id.EndOfStream, 1),
        ], tokens)

        h = '>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.BadGreaterThan, 1),
            (h8_id.EndOfStream, 1),
        ], tokens)

    def testEndOfStream(self):
        # type: () -> None
        """Lexing stops at a NUL byte; the text after it is not tokenized."""

        # NUL is end
        h = 'a\0b'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.RawData, 1),
            (h8_id.EndOfStream, 2),
        ], tokens)
498
499
# Run the tests in this module when the file is executed as a script
if __name__ == '__main__':
    unittest.main()