data_lang/htm8

OILS / data_lang / htm8_test.py View on Github | oils.pub

608 lines, 400 significant

1	#!/usr/bin/env python2
2	from __future__ import print_function
3
4	from _devbuild.gen.htm8_asdl import (h8_id, h8_id_t, h8_id_str, attr_name,
5	attr_name_str, attr_value_e,
6	attr_value_str)
7
8	import unittest
9	import re
10
11	from typing import List, Tuple, Any
12
13	from data_lang import htm8
14	from doctools.util import log
15
# Sample document used by LexerTest.testCommentParse.  The path is relative,
# so it is resolved against the current working directory.
with open('data_lang/testdata/hello.htm8') as html_file:
    TEST_HTML = html_file.read()
18
19
20	class RegexTest(unittest.TestCase):
21
22	def testDotAll(self):
23	# type: () -> None
24
25	# Note that $ matches end of line, not end of string
26	p1 = re.compile(r'.')
27	print(p1.match('\n'))
28
29	p2 = re.compile(r'.', re.DOTALL)
30	print(p2.match('\n'))
31
32	#p3 = re.compile(r'[.\n]', re.VERBOSE)
33	p3 = re.compile(r'[.\n]')
34	print(p3.match('\n'))
35
36	print('Negation')
37
38	p4 = re.compile(r'[^>]')
39	print(p4.match('\n'))
40
41	def testAttrRe(self):
42	# type: () -> None
43	_ATTR_RE = htm8._ATTR_RE
44	m = _ATTR_RE.match(' empty= val')
45	print(m.groups())
46
47
class FunctionsTest(unittest.TestCase):
    """Smoke test for a module-level helper in htm8."""

    def testFindLineNum(self):
        # type: () -> None
        """Print what htm8._FindLineNum() reports at several offsets."""
        text = 'foo\n' * 3
        # The string is 12 characters long, so 50 is out of bounds
        offsets = [1, 5, 10, 50]
        for offset in offsets:
            print(htm8._FindLineNum(text, offset))
56
57
def _MakeAttrLexer(t, h, expected_tag=h8_id.StartTag):
    # type: (Any, str, h8_id_t) -> htm8.AttrLexer
    """Lex the first token of h and return an AttrLexer initialized on it.

    Args:
      t: object providing assertEqual(); the tests pass their TestCase
      h: text to lex; its first token must be the tag under test
      expected_tag: token id that the first Lexer.Read() must return

    Returns:
      An htm8.AttrLexer on which Init() was called with the token id, the
      lexer's TagNamePos(), and the token's end position.
    """
    lx = htm8.Lexer(h)

    tok_id, end_pos = lx.Read()
    t.assertEqual(expected_tag, tok_id)

    attr_lx = htm8.AttrLexer(h)
    attr_lx.Init(tok_id, lx.TagNamePos(), end_pos)

    return attr_lx
70
71
class AttrLexerTest(unittest.TestCase):
    """Drives htm8.AttrLexer by hand: ReadName(), then ReadValue().

    The start/end positions asserted below are indices into the test string
    h, e.g. h[attr_start:attr_end] is the attribute value.
    """

    def testNoAttrs(self):
        # type: () -> None
        """A tag without attributes reports Done on the first ReadName()."""

        # TODO: h8_id.StartTag and EndTag will expose the tag_name_pos - the
        # end of the tag name

        h = 'x <a>'
        lx = htm8.Lexer(h)

        # Skip raw data
        tok_id, end_pos = lx.Read()
        self.assertEqual(h8_id.RawData, tok_id)

        tok_id, end_pos = lx.Read()
        self.assertEqual(h8_id.StartTag, tok_id)

        attr_lx = htm8.AttrLexer(h)
        attr_lx.Init(tok_id, lx.TagNamePos(), end_pos)

        # The tag has no attributes
        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Done)
        self.assertEqual(-1, name_start)
        self.assertEqual(-1, name_end)

        # After Done, both ReadValue() and ReadName() must raise
        # AssertionError
        try:
            attr_lx.ReadValue()
        except AssertionError as e:
            print(e)
        else:
            self.fail('should have failed')

        try:
            attr_lx.ReadName()
        except AssertionError as e:
            print(e)
        else:
            self.fail('should have failed')

    def testInvalid(self):
        # type: () -> None
        """A '!' where a name is expected yields attr_name.Invalid."""
        h = '<a !>'
        attr_lx = _MakeAttrLexer(self, h)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Invalid)
        self.assertEqual(-1, name_start)
        self.assertEqual(-1, name_end)

        # There is no valid name, so asking for its value must raise
        try:
            attr_lx.ReadValue()
        except AssertionError as e:
            print(e)
        else:
            self.fail('should have failed')

    def testEmpty(self):
        # type: () -> None
        """'src=' immediately followed by '>' has an Empty value."""
        h = '<img src=>'
        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(5, name_start)
        self.assertEqual(8, name_end)
        self.assertEqual(False, attr_lx.next_value_is_missing)

        self.assertEqual(True, attr_lx.AttrNameEquals('src'))
        self.assertEqual(False, attr_lx.AttrNameEquals('srcz'))

        v, attr_start, attr_end = attr_lx.ReadValue()
        log('v = %s', attr_value_str(v))
        self.assertEqual(attr_value_e.Empty, v)
        self.assertEqual(-1, attr_start)
        self.assertEqual(-1, attr_end)

        n, _, _ = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Done)

    def testMissing(self):
        # type: () -> None
        """A name with no '=' at all has a Missing value."""
        h = '<img SRC/>'
        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartEndTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(5, name_start)
        self.assertEqual(8, name_end)
        self.assertEqual(True, attr_lx.next_value_is_missing)

        # 'SRC' in the source compares equal to 'src'
        self.assertEqual(True, attr_lx.AttrNameEquals('src'))
        self.assertEqual(False, attr_lx.AttrNameEquals('srcz'))

        v, attr_start, attr_end = attr_lx.ReadValue()
        self.assertEqual(attr_value_e.Missing, v)
        self.assertEqual(-1, attr_start)
        self.assertEqual(-1, attr_end)

        n, _, _ = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Done)

    def testUnquoted(self):
        # type: () -> None
        """An unquoted value stops before the whitespace preceding '/>'."""
        # CAREFUL: /> is a StartEndTag, and / is not part of unquoted value
        h = '<a x=foo />'
        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartEndTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(3, name_start)
        self.assertEqual(4, name_end)

        v, attr_start, attr_end = attr_lx.ReadValue()

        log('v = %s', attr_value_str(v))
        log('unquoted val %r', h[attr_start:attr_end])

        self.assertEqual(attr_value_e.Unquoted, v)
        self.assertEqual(5, attr_start)
        self.assertEqual(8, attr_end)

        n, _, _ = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Done)

    def testDoubleQuoted(self):
        # type: () -> None
        """The reported span of a double-quoted value excludes the quotes."""
        h = '<a x="f&">'
        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(3, name_start)
        self.assertEqual(4, name_end)

        v, attr_start, attr_end = attr_lx.ReadValue()

        log('v = %s', attr_value_str(v))
        log('val %r', h[attr_start:attr_end])

        self.assertEqual(attr_value_e.DoubleQuoted, v)
        self.assertEqual(6, attr_start)
        self.assertEqual(8, attr_end)
        # The lexer position is one past the closing quote
        self.assertEqual(9, attr_lx.pos)

        n, _, _ = attr_lx.ReadName()
        log('n = %r', attr_name_str(n))
        self.assertEqual(n, attr_name.Done)

    def testSingleQuoted(self):
        # type: () -> None
        """The reported span of a single-quoted value excludes the quotes."""
        h = "<a x='&f'>"
        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(3, name_start)
        self.assertEqual(4, name_end)

        v, attr_start, attr_end = attr_lx.ReadValue()

        log('v = %s', attr_value_str(v))
        # This value is single-quoted, so don't label it 'unquoted'
        log('val %r', h[attr_start:attr_end])

        self.assertEqual(attr_value_e.SingleQuoted, v)
        self.assertEqual(6, attr_start)
        self.assertEqual(8, attr_end)
        # The lexer position is one past the closing quote
        self.assertEqual(9, attr_lx.pos)

        n, _, _ = attr_lx.ReadName()
        #log('n = %r', attr_name_str(n))
        self.assertEqual(n, attr_name.Done)

    def testDoubleQuoted_Bad(self):
        # type: () -> None
        """An unterminated double-quoted value raises htm8.LexError."""
        h = '<a x="foo>'
        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(3, name_start)
        self.assertEqual(4, name_end)

        try:
            attr_lx.ReadValue()
        except htm8.LexError as e:
            print(e)
        else:
            self.fail('Expected LexError')

    def testSingleQuoted_Bad(self):
        # type: () -> None
        """An unterminated single-quoted value raises htm8.LexError."""
        h = "<a x='foo>"
        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(3, name_start)
        self.assertEqual(4, name_end)

        try:
            attr_lx.ReadValue()
        except htm8.LexError as e:
            print(e)
        else:
            self.fail('Expected LexError')
271
272
class AttrLexerWrapperTest(unittest.TestCase):
    """Covers the htm8.GetAttrRaw() and htm8.AllAttrsRaw() helpers."""

    def testGetAttrRaw(self):
        # type: () -> None
        """An absent attribute gives None; one without a value gives ''."""
        attr_lx = _MakeAttrLexer(self, '<a>')
        #_PrintTokens(attr_lx)
        found = htm8.GetAttrRaw(attr_lx, 'oops')
        self.assertEqual(None, found)

        # <a novalue> means lex.Get('novalue') == ''
        # https://developer.mozilla.org/en-US/docs/Web/API/Element/hasAttribute
        # We are not distinguishing <a novalue=""> from <a novalue> in this API
        attr_lx = _MakeAttrLexer(self, '<a novalue>')
        #_PrintTokens(attr_lx)
        found = htm8.GetAttrRaw(attr_lx, 'novalue')
        self.assertEqual('', found)

    def testGetAttrRaw2(self):
        # type: () -> None
        """Failed lookup first, then Reset(), then a successful lookup."""
        attr_lx = _MakeAttrLexer(self, '<a href="double quoted">')
        #_PrintTokens(attr_lx)

        log('*** OOPS')
        missing = htm8.GetAttrRaw(attr_lx, 'oops')
        self.assertEqual(None, missing)

        attr_lx.Reset()

        log('*** DOUBLE')
        href = htm8.GetAttrRaw(attr_lx, 'href')
        self.assertEqual('double quoted', href)

    def testGetAttrRaw3(self):
        # type: () -> None
        """Reverse order vs. testGetAttrRaw2"""
        attr_lx = _MakeAttrLexer(self, '<a href="double quoted">')
        #_PrintTokens(attr_lx)

        href = htm8.GetAttrRaw(attr_lx, 'href')
        self.assertEqual('double quoted', href)

        attr_lx.Reset()

        missing = htm8.GetAttrRaw(attr_lx, 'oops')
        self.assertEqual(None, missing)

    def testGetAttrRaw4(self):
        # type: () -> None
        """Look up an attribute in open and self-closing tags."""
        attr_lx = _MakeAttrLexer(self, '<a href=foo class="bar">')
        #_PrintTokens(attr_lx)
        self.assertEqual('bar', htm8.GetAttrRaw(attr_lx, 'class'))

        self_closing = '<a href=foo class="bar" />'
        attr_lx = _MakeAttrLexer(self,
                                 self_closing,
                                 expected_tag=h8_id.StartEndTag)
        #_PrintTokens(attr_lx)
        self.assertEqual('bar', htm8.GetAttrRaw(attr_lx, 'class'))

        with_query = '<a href="?foo=1&bar=2" />'
        attr_lx = _MakeAttrLexer(self,
                                 with_query,
                                 expected_tag=h8_id.StartEndTag)
        self.assertEqual('?foo=1&bar=2', htm8.GetAttrRaw(attr_lx, 'href'))

    def testAllAttrs(self):
        # type: () -> None
        """
        AllAttrsRaw() gives [('key', 'value')] for all attributes
        """
        # closed
        attr_lx = _MakeAttrLexer(self,
                                 '<a href=foo class="bar" />',
                                 expected_tag=h8_id.StartEndTag)
        pairs = htm8.AllAttrsRaw(attr_lx)
        self.assertEqual([('href', 'foo'), ('class', 'bar')], pairs)

        attr_lx = _MakeAttrLexer(self,
                                 '<a href="?foo=1&bar=2" />',
                                 expected_tag=h8_id.StartEndTag)
        pairs = htm8.AllAttrsRaw(attr_lx)
        self.assertEqual([('href', '?foo=1&bar=2')], pairs)

    def testEmptyMissingValues(self):
        # type: () -> None
        """Empty and missing values both come back as ''."""
        # equivalent to <button disabled="">
        attr_lx = _MakeAttrLexer(self, '<button disabled>')
        all_attrs = htm8.AllAttrsRaw(attr_lx)
        self.assertEqual([('disabled', '')], all_attrs)

        # TODO: restore this
        if 0:
            slices = attr_lx.AllAttrsRawSlice()
            log('slices %s', slices)

        tag = '''<p double="" single='' empty= value missing empty2=>'''
        attr_lx = _MakeAttrLexer(self, tag)
        all_attrs = htm8.AllAttrsRaw(attr_lx)
        expected = [
            ('double', ''),
            ('single', ''),
            ('empty', 'value'),
            ('missing', ''),
            ('empty2', ''),
        ]
        self.assertEqual(expected, all_attrs)
        # TODO: should have
        log('all %s', all_attrs)

        if 0:
            slices = attr_lx.AllAttrsRawSlice()
            log('slices %s', slices)

    def testInvalidTag(self):
        # type: () -> None
        """A stray '!' among the attributes must produce htm8.LexError."""
        try:
            # Both calls stay inside the try, so a LexError from either one
            # counts
            attr_lx = _MakeAttrLexer(self, '<a foo=bar !></a>')
            htm8.AllAttrsRaw(attr_lx)
        except htm8.LexError as e:
            print(e)
        else:
            self.fail('Expected LexError')
379
380
def ValidTokenList(s, no_special_tags=False):
    # type: (str, bool) -> List[Tuple[h8_id_t, int]]
    """Lex all of s into a list of (token id, end position) pairs.

    A wrapper that can be more easily translated to C++.  Doesn't use
    iterators.

    The list ends with the EndOfStream token.  An Invalid token raises
    htm8.LexError, reported at the position where that token started.
    """
    lexer = htm8.Lexer(s, no_special_tags=no_special_tags)
    result = []
    tok_start = 0

    while True:
        tok_id, tok_end = lexer.Read()
        result.append((tok_id, tok_end))

        if tok_id == h8_id.EndOfStream:
            return result

        if tok_id == h8_id.Invalid:
            raise htm8.LexError('ValidTokenList() got invalid token', s,
                                tok_start)

        # The next token begins where this one ended
        tok_start = tok_end
398
399
def Lex(h, no_special_tags=False):
    # type: (str, bool) -> List[Tuple[int, int]]
    """Tokenize h with ValidTokenList(), logging each token, and return the
    token list.

    repr(h) is printed first; then each token is logged as its end
    position, its id name, and the text it covers.
    """
    print(repr(h))
    tokens = ValidTokenList(h, no_special_tags=no_special_tags)

    # Token i starts where token i-1 ended; the first one starts at 0
    starts = [0] + [end for _, end in tokens[:-1]]
    for begin, (tok_id, end_pos) in zip(starts, tokens):
        log('%d %s %r', end_pos, h8_id_str(tok_id), h[begin:end_pos])

    return tokens
410
411
412	class LexerTest(unittest.TestCase):
413
414	# IndexLinker in devtools/make_help.py
415	# <pre> sections in doc/html_help.py
416	# TocExtractor in devtools/cmark.py
417
418	def testPstrip(self):
419	# type: () -> None
420	"""Remove anything like this.
421
422	<p><pstrip> </pstrip></p>
423	"""
424	pass
425
426	def testCommentParse(self):
427	# type: () -> None
428	n = len(TEST_HTML)
429	tokens = Lex(TEST_HTML)
430
431	def testCommentParse2(self):
432	# type: () -> None
433	h = '''
434	hi <!-- line 1
435	line 2 --><br/>'''
436	tokens = Lex(h)
437
438	self.assertEqual(
439	[
440	(h8_id.RawData, 12),
441	(h8_id.Comment, 50), # <? err ?>
442	(h8_id.StartEndTag, 55),
443	(h8_id.EndOfStream, 55),
444	],
445	tokens)
446
447	def testProcessingInstruction(self):
448	# type: () -> None
449	# <?xml ?> header
450	h = 'hi <? err ?>'
451	tokens = Lex(h)
452
453	self.assertEqual(
454	[
455	(h8_id.RawData, 3),
456	(h8_id.Processing, 12), # <? err ?>
457	(h8_id.EndOfStream, 12),
458	],
459	tokens)
460
461	def testScriptStyle(self):
462	# type: () -> None
463	h = '''
464	hi <script src=""> if (x < 1 && y > 2 ) { console.log(""); }
465	</script>
466	'''
467	tokens = Lex(h)
468
469	expected = [
470	(h8_id.RawData, 12),
471	(h8_id.StartTag, 27), # <script>
472	(h8_id.HtmlCData, 78), # JavaScript code is HTML CData
473	(h8_id.EndTag, 87), # </script>
474	(h8_id.RawData, 96), # \n
475	(h8_id.EndOfStream, 96), # \n
476	]
477	self.assertEqual(expected, tokens)
478
479	# Test case matching
480	tokens = Lex(h.replace('script', 'scrIPT'))
481	self.assertEqual(expected, tokens)
482
483	def testScriptStyleXml(self):
484	# type: () -> None
485	h = 'hi <script src=""> < </script>'
486	# XML mode
487	tokens = Lex(h, no_special_tags=True)
488
489	self.assertEqual(
490	[
491	(h8_id.RawData, 3),
492	(h8_id.StartTag, 18), # <script>
493	(h8_id.RawData, 19), # space
494	(h8_id.CharEntity, 23), # </script>
495	(h8_id.RawData, 24), # \n
496	(h8_id.EndTag, 33), # \n
497	(h8_id.EndOfStream, 33), # \n
498	],
499	tokens)
500
501	def testCData(self):
502	# type: () -> None
503
504	# from
505	# /home/andy/src/languages/Python-3.11.5/Lib/test/xmltestdata/c14n-20/inC14N4.xml
506	h = '<compute><![CDATA[value>"0" && value<"10" ?"valid":"error"]]></compute>'
507	tokens = Lex(h)
508
509	self.assertEqual([
510	(h8_id.StartTag, 9),
511	(h8_id.CData, 61),
512	(h8_id.EndTag, 71),
513	(h8_id.EndOfStream, 71),
514	], tokens)
515
516	def testEntity(self):
517	# type: () -> None
518
519	# from
520	# /home/andy/src/Python-3.12.4/Lib/test/xmltestdata/c14n-20/inC14N5.xml
521	h = '&ent1;, &ent2;!'
522
523	tokens = Lex(h)
524
525	self.assertEqual([
526	(h8_id.CharEntity, 6),
527	(h8_id.RawData, 8),
528	(h8_id.CharEntity, 14),
529	(h8_id.RawData, 15),
530	(h8_id.EndOfStream, 15),
531	], tokens)
532
533	def testStartTag(self):
534	# type: () -> None
535
536	h = '<a>hi</a>'
537	tokens = Lex(h)
538
539	self.assertEqual([
540	(h8_id.StartTag, 3),
541	(h8_id.RawData, 5),
542	(h8_id.EndTag, 9),
543	(h8_id.EndOfStream, 9),
544	], tokens)
545
546	# Make sure we don't consume too much
547	h = '<a><source>1.7</source></a>'
548
549	tokens = Lex(h)
550
551	self.assertEqual([
552	(h8_id.StartTag, 3),
553	(h8_id.StartTag, 11),
554	(h8_id.RawData, 14),
555	(h8_id.EndTag, 23),
556	(h8_id.EndTag, 27),
557	(h8_id.EndOfStream, 27),
558	], tokens)
559
560	return
561
562	h = '''
563	<configuration>
564	<source>1.7</source>
565	</configuration>'''
566
567	tokens = Lex(h)
568
569	self.assertEqual([
570	(h8_id.RawData, 9),
571	(h8_id.StartTag, 24),
572	(h8_id.RawData, 9),
573	(h8_id.EndOfStream, 9),
574	], tokens)
575
576	def testBad(self):
577	# type: () -> None
578	h = '&'
579	tokens = Lex(h)
580
581	self.assertEqual([
582	(h8_id.BadAmpersand, 1),
583	(h8_id.EndOfStream, 1),
584	], tokens)
585
586	h = '>'
587	tokens = Lex(h)
588
589	self.assertEqual([
590	(h8_id.BadGreaterThan, 1),
591	(h8_id.EndOfStream, 1),
592	], tokens)
593
594	def testEndOfStream(self):
595	# type: () -> None
596
597	# NUL is end
598	h = 'a\0b'
599	tokens = Lex(h)
600
601	self.assertEqual([
602	(h8_id.RawData, 1),
603	(h8_id.EndOfStream, 2),
604	], tokens)
605
606
# Run every TestCase in this module when the file is executed directly
if __name__ == '__main__':
    unittest.main()