data_lang/htm8

OILS / data_lang / htm8_test.py View on Github | oils.pub

607 lines, 399 significant

1	#!/usr/bin/env python2
2	from __future__ import print_function
3
4	from _devbuild.gen.htm8_asdl import (h8_id, h8_id_t, h8_id_str, attr_name,
5	attr_name_str, attr_value_e,
6	attr_value_str)
7
8	import unittest
9	import re
10
11	from typing import List, Tuple, Any
12
13	from data_lang import htm8
14	from doctools.util import log
15
# Sample document used by LexerTest.testCommentParse.  It is read once, at
# import time, and the relative path is resolved against the current working
# directory.
with open('data_lang/testdata/hello.htm8') as f:
    TEST_HTML = f.read()
18
19
class RegexTest(unittest.TestCase):
    """Exploratory prints of regex matches; nothing here asserts a result."""

    def testDotAll(self):
        # type: () -> None
        # Each pattern below is matched against a lone newline and the match
        # object (or None) is printed.
        newline = '\n'

        # Note that $ matches end of line, not end of string
        plain_dot = re.compile(r'.')
        print(plain_dot.match(newline))

        dotall_dot = re.compile(r'.', re.DOTALL)
        print(dotall_dot.match(newline))

        # Character class containing a literal dot and a newline
        #p3 = re.compile(r'[.\n]', re.VERBOSE)
        dot_or_newline = re.compile(r'[.\n]')
        print(dot_or_newline.match(newline))

        print('Negation')

        not_gt = re.compile(r'[^>]')
        print(not_gt.match(newline))

    def testAttrRe(self):
        # type: () -> None
        # Print what htm8._ATTR_RE captures when a space follows the '='.
        match = htm8._ATTR_RE.match(' empty= val')
        print(match.groups())
46
47
class FunctionsTest(unittest.TestCase):
    """Smoke test for module-level helpers in htm8."""

    def testFindLineNum(self):
        # type: () -> None
        # Three 4-character lines, 12 characters in total
        text = 'foo\n' * 3
        # The last position, 50, is out of bounds for this text
        positions = [1, 5, 10, 50]
        for pos in positions:
            print(htm8._FindLineNum(text, pos))
56
57
def _MakeAttrLexer(t, h, expected_tag=h8_id.StartTag):
    # type: (Any, str, h8_id_t) -> htm8.AttrLexer
    """Return an AttrLexer initialized on the first token of h.

    Args:
      t: the calling TestCase; only its assertEqual() is used.
      h: document text; its first token must have ID expected_tag.
      expected_tag: token ID that the first Lexer.Read() must return.

    Returns:
      An htm8.AttrLexer over h, initialized with the lexer's TagNamePos() and
      the end position of that first token.
    """
    lx = htm8.Lexer(h)

    # Fail the calling test right away if h does not start with the
    # expected kind of tag.
    tok_id, end_pos = lx.Read()
    t.assertEqual(expected_tag, tok_id)

    attr_lx = htm8.AttrLexer(h)
    attr_lx.Init(lx.TagNamePos(), end_pos)

    return attr_lx
70
71
class AttrLexerTest(unittest.TestCase):
    """Drives htm8.AttrLexer through ReadName() / ReadValue() on single tags.

    Positions asserted below are offsets into the test string h.
    """

    def testNoAttrs(self):
        # type: () -> None

        # TODO: h8_id.StartTag and EndTag will expose the tag_name_pos - the
        # end of the tag name

        h = 'x <a>'
        lx = htm8.Lexer(h)

        # Skip raw data
        tok_id, end_pos = lx.Read()
        self.assertEqual(h8_id.RawData, tok_id)

        tok_id, end_pos = lx.Read()
        self.assertEqual(h8_id.StartTag, tok_id)

        attr_lx = htm8.AttrLexer(h)
        attr_lx.Init(lx.TagNamePos(), end_pos)

        # The tag has no attributes, so the very first name read is Done
        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Done)
        self.assertEqual(-1, name_start)
        self.assertEqual(-1, name_end)

        # After Done, both ReadValue() and ReadName() are expected to raise
        # AssertionError.  self.fail() sits in the else clause, so its own
        # AssertionError is not swallowed by the except.
        try:
            result = attr_lx.ReadValue()
        except AssertionError as e:
            print(e)
        else:
            self.fail('should have failed')

        try:
            result = attr_lx.ReadName()
        except AssertionError as e:
            print(e)
        else:
            self.fail('should have failed')

    def testInvalid(self):
        # type: () -> None
        h = '<a !>'
        attr_lx = _MakeAttrLexer(self, h)

        # '!' cannot start an attribute name
        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Invalid)
        self.assertEqual(-1, name_start)
        self.assertEqual(-1, name_end)

        # Reading a value after an Invalid name is expected to assert
        try:
            result = attr_lx.ReadValue()
        except AssertionError as e:
            print(e)
        else:
            self.fail('should have failed')

    def testEmpty(self):
        # type: () -> None
        # 'src=' is followed directly by '/>', so the value is Empty
        h = '<img src=/>'
        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartEndTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(5, name_start)
        self.assertEqual(8, name_end)
        self.assertEqual(False, attr_lx.next_value_is_missing)

        self.assertEqual(True, attr_lx.AttrNameEquals('src'))
        self.assertEqual(False, attr_lx.AttrNameEquals('srcz'))

        v, attr_start, attr_end = attr_lx.ReadValue()
        log('v = %s', attr_value_str(v))
        self.assertEqual(attr_value_e.Empty, v)
        self.assertEqual(-1, attr_start)
        self.assertEqual(-1, attr_end)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Done)

    def testMissing(self):
        # type: () -> None
        # No '=' at all, so the value is Missing
        h = '<img SRC/>'
        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartEndTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(5, name_start)
        self.assertEqual(8, name_end)
        self.assertEqual(True, attr_lx.next_value_is_missing)

        # The name in h is upper case, yet it equals lower-case 'src'
        self.assertEqual(True, attr_lx.AttrNameEquals('src'))
        self.assertEqual(False, attr_lx.AttrNameEquals('srcz'))

        v, attr_start, attr_end = attr_lx.ReadValue()
        self.assertEqual(attr_value_e.Missing, v)
        self.assertEqual(-1, attr_start)
        self.assertEqual(-1, attr_end)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Done)

    def testUnquoted(self):
        # type: () -> None
        # CAREFUL: /> is a StartEndTag, and / is not part of unquoted value
        h = '<a x=foo/>'
        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartEndTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(3, name_start)
        self.assertEqual(4, name_end)

        v, attr_start, attr_end = attr_lx.ReadValue()

        log('v = %s', attr_value_str(v))
        log('unquoted val %r', h[attr_start:attr_end])

        self.assertEqual(attr_value_e.Unquoted, v)
        self.assertEqual(5, attr_start)
        self.assertEqual(8, attr_end)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Done)

    def testDoubleQuoted(self):
        # type: () -> None
        h = '<a x="f&">'
        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(3, name_start)
        self.assertEqual(4, name_end)

        v, attr_start, attr_end = attr_lx.ReadValue()

        log('v = %s', attr_value_str(v))
        log('val %r', h[attr_start:attr_end])

        # The span 6..8 excludes the quotes; pos ends up after the closing one
        self.assertEqual(attr_value_e.DoubleQuoted, v)
        self.assertEqual(6, attr_start)
        self.assertEqual(8, attr_end)
        self.assertEqual(9, attr_lx.pos)

        n, name_start, name_end = attr_lx.ReadName()
        log('n = %r', attr_name_str(n))
        self.assertEqual(n, attr_name.Done)

    def testSingleQuoted(self):
        # type: () -> None
        h = "<a x='&f'>"
        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(3, name_start)
        self.assertEqual(4, name_end)

        v, attr_start, attr_end = attr_lx.ReadValue()

        log('v = %s', attr_value_str(v))
        # Same label as testDoubleQuoted; this value is quoted, not unquoted
        log('val %r', h[attr_start:attr_end])

        # The span 6..8 excludes the quotes; pos ends up after the closing one
        self.assertEqual(attr_value_e.SingleQuoted, v)
        self.assertEqual(6, attr_start)
        self.assertEqual(8, attr_end)
        self.assertEqual(9, attr_lx.pos)

        n, name_start, name_end = attr_lx.ReadName()
        #log('n = %r', attr_name_str(n))
        self.assertEqual(n, attr_name.Done)

    def testDoubleQuoted_Bad(self):
        # type: () -> None
        # The closing double quote is missing
        h = '<a x="foo>'
        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(3, name_start)
        self.assertEqual(4, name_end)

        try:
            v, attr_start, attr_end = attr_lx.ReadValue()
        except htm8.LexError as e:
            print(e)
        else:
            self.fail('Expected LexError')

    def testSingleQuoted_Bad(self):
        # type: () -> None
        # The closing single quote is missing
        h = "<a x='foo>"
        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(3, name_start)
        self.assertEqual(4, name_end)

        try:
            v, attr_start, attr_end = attr_lx.ReadValue()
        except htm8.LexError as e:
            print(e)
        else:
            self.fail('Expected LexError')
271
272
class AttrLexerWrapperTest(unittest.TestCase):
    """Checks htm8.GetAttrRaw() and htm8.AllAttrsRaw() on single tags."""

    def testGetAttrRaw(self):
        # type: () -> None
        # A name that does not occur in the tag gives None
        attr_lx = _MakeAttrLexer(self, '<a>')
        self.assertEqual(None, htm8.GetAttrRaw(attr_lx, 'oops'))

        # <a novalue> means lex.Get('novalue') == ''
        # https://developer.mozilla.org/en-US/docs/Web/API/Element/hasAttribute
        # We are not distinguishing <a novalue=""> from <a novalue> in this API
        attr_lx = _MakeAttrLexer(self, '<a novalue>')
        self.assertEqual('', htm8.GetAttrRaw(attr_lx, 'novalue'))

    def testGetAttrRaw2(self):
        # type: () -> None
        # Failed lookup first, then Reset() and a successful lookup
        attr_lx = _MakeAttrLexer(self, '<a href="double quoted">')

        log('*** OOPS')
        not_found = htm8.GetAttrRaw(attr_lx, 'oops')
        self.assertEqual(None, not_found)
        attr_lx.Reset()
        log('*** DOUBLE')
        found = htm8.GetAttrRaw(attr_lx, 'href')
        self.assertEqual('double quoted', found)

    def testGetAttrRaw3(self):
        # type: () -> None
        """Reverse order vs. testGetAttrRaw2"""
        attr_lx = _MakeAttrLexer(self, '<a href="double quoted">')

        found = htm8.GetAttrRaw(attr_lx, 'href')
        self.assertEqual('double quoted', found)
        attr_lx.Reset()
        not_found = htm8.GetAttrRaw(attr_lx, 'oops')
        self.assertEqual(None, not_found)

    def testGetAttrRaw4(self):
        # type: () -> None
        # (document, ID of its first token, attribute name, expected raw value)
        cases = [
            ('<a href=foo class="bar">', h8_id.StartTag, 'class', 'bar'),
            ('<a href=foo class="bar" />', h8_id.StartEndTag, 'class', 'bar'),
            ('<a href="?foo=1&bar=2" />', h8_id.StartEndTag, 'href',
             '?foo=1&bar=2'),
        ]
        for doc, first_tok, name, expected in cases:
            attr_lx = _MakeAttrLexer(self, doc, expected_tag=first_tok)
            self.assertEqual(expected, htm8.GetAttrRaw(attr_lx, name))

    def testAllAttrs(self):
        # type: () -> None
        """
        [('key', 'value')] for all
        """
        # Both documents are self-closed tags
        cases = [
            ('<a href=foo class="bar" />', [('href', 'foo'),
                                            ('class', 'bar')]),
            ('<a href="?foo=1&bar=2" />', [('href', '?foo=1&bar=2')]),
        ]
        for doc, expected in cases:
            attr_lx = _MakeAttrLexer(self,
                                     doc,
                                     expected_tag=h8_id.StartEndTag)
            self.assertEqual(expected, htm8.AllAttrsRaw(attr_lx))

    def testEmptyMissingValues(self):
        # type: () -> None
        # equivalent to <button disabled="">
        lex = _MakeAttrLexer(self, '<button disabled>')
        all_attrs = htm8.AllAttrsRaw(lex)
        self.assertEqual([('disabled', '')], all_attrs)

        # TODO: restore this
        if 0:
            slices = lex.AllAttrsRawSlice()
            log('slices %s', slices)

        # Empty-quoted, space-after-'=', missing, and trailing-'=' attributes
        # side by side in one tag
        doc = '''<p double="" single='' empty= value missing empty2=>'''
        expected = [
            ('double', ''),
            ('single', ''),
            ('empty', 'value'),
            ('missing', ''),
            ('empty2', ''),
        ]
        lex = _MakeAttrLexer(self, doc)
        all_attrs = htm8.AllAttrsRaw(lex)
        self.assertEqual(expected, all_attrs)
        # TODO: should have
        log('all %s', all_attrs)

        if 0:
            slices = lex.AllAttrsRawSlice()
            log('slices %s', slices)

    def testInvalidTag(self):
        # type: () -> None
        # The stray '!' must surface as LexError from somewhere in the try
        # body; the test does not pin down which of the two calls raises.
        try:
            lex = _MakeAttrLexer(self, '<a foo=bar !></a>')
            all_attrs = htm8.AllAttrsRaw(lex)
        except htm8.LexError as e:
            print(e)
        else:
            self.fail('Expected LexError')
379
380
def ValidTokenList(s, no_special_tags=False):
    # type: (str, bool) -> List[Tuple[h8_id_t, int]]
    """Lex all of s into a list of (token ID, end position) pairs.

    Written as a plain loop, without iterators, so that it can be more easily
    translated to C++.

    Raises:
      htm8.LexError: at the start position of the first Invalid token.
    """
    lexer = htm8.Lexer(s, no_special_tags=no_special_tags)
    result = []
    token_start = 0  # where the token currently being read begins

    while True:
        tok_id, end_pos = lexer.Read()
        result.append((tok_id, end_pos))

        if tok_id == h8_id.Invalid:
            raise htm8.LexError(s, token_start)
        if tok_id == h8_id.EndOfStream:
            # The EndOfStream token itself is part of the returned list
            return result

        token_start = end_pos
397
398
def Lex(h, no_special_tags=False):
    # type: (str, bool) -> List[Tuple[int, int]]
    """Tokenize h with ValidTokenList(), logging every token on the way.

    The input is printed as a repr first; then one line is logged per token
    with its end position, its ID name, and the text it covers.

    Returns:
      The unmodified list from ValidTokenList().
    """
    print(repr(h))
    tokens = ValidTokenList(h, no_special_tags=no_special_tags)

    # Each token starts where the previous one ended; the first starts at 0
    begin = 0
    for tok_id, end in tokens:
        log('%d %s %r', end, h8_id_str(tok_id), h[begin:end])
        begin = end

    return tokens
409
410
class LexerTest(unittest.TestCase):
    """Token-level tests: each one lexes a document with Lex() and compares
    the resulting (token ID, end position) pairs.

    Several inputs are multi-line string literals.  Their leading whitespace
    is part of the input, and the asserted end positions depend on it.
    """

    # IndexLinker in devtools/make_help.py
    # <pre> sections in doc/html_help.py
    # TocExtractor in devtools/cmark.py

    def testPstrip(self):
        # type: () -> None
        """Remove anything like this.

        <p><pstrip> </pstrip></p>
        """
        pass

    def testCommentParse(self):
        # type: () -> None
        # Smoke test: the fixture document must lex without LexError
        Lex(TEST_HTML)

    def testCommentParse2(self):
        # type: () -> None
        # The comment spans two lines.  Line 2 of the comment is indented by
        # 16 spaces, which the positions below account for.
        h = '''
        hi <!-- line 1
                line 2 --><br/>'''
        tokens = Lex(h)

        self.assertEqual(
            [
                (h8_id.RawData, 12),  # newline, indentation, 'hi '
                (h8_id.Comment, 50),  # <!-- line 1 ... line 2 -->
                (h8_id.StartEndTag, 55),  # <br/>
                (h8_id.EndOfStream, 55),
            ],
            tokens)

    def testProcessingInstruction(self):
        # type: () -> None
        # <?xml ?> header
        h = 'hi <? err ?>'
        tokens = Lex(h)

        self.assertEqual(
            [
                (h8_id.RawData, 3),
                (h8_id.Processing, 12),  # <? err ?>
                (h8_id.EndOfStream, 12),
            ],
            tokens)

    def testScriptStyle(self):
        # type: () -> None
        h = '''
        hi <script src=""> if (x < 1 && y > 2 ) { console.log(""); }
        </script>
        '''
        tokens = Lex(h)

        expected = [
            (h8_id.RawData, 12),
            (h8_id.StartTag, 27),  # <script src="">
            (h8_id.HtmlCData, 78),  # JavaScript code is HTML CData
            (h8_id.EndTag, 87),  # </script>
            (h8_id.RawData, 96),  # newline and trailing indentation
            (h8_id.EndOfStream, 96),
        ]
        self.assertEqual(expected, tokens)

        # Test case matching
        tokens = Lex(h.replace('script', 'scrIPT'))
        self.assertEqual(expected, tokens)

    def testScriptStyleXml(self):
        # type: () -> None
        # The entity must be spelled out: the expected CharEntity covers the
        # four characters 19..23, and the whole input is 33 characters long.
        h = 'hi <script src=""> &lt; </script>'
        # XML mode
        tokens = Lex(h, no_special_tags=True)

        self.assertEqual(
            [
                (h8_id.RawData, 3),
                (h8_id.StartTag, 18),  # <script src="">
                (h8_id.RawData, 19),  # space
                (h8_id.CharEntity, 23),  # &lt;
                (h8_id.RawData, 24),  # space
                (h8_id.EndTag, 33),  # </script>
                (h8_id.EndOfStream, 33),
            ],
            tokens)

    def testCData(self):
        # type: () -> None

        # from
        # /home/andy/src/languages/Python-3.11.5/Lib/test/xmltestdata/c14n-20/inC14N4.xml
        h = '<compute><![CDATA[value>"0" && value<"10" ?"valid":"error"]]></compute>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 9),
            (h8_id.CData, 61),
            (h8_id.EndTag, 71),
            (h8_id.EndOfStream, 71),
        ], tokens)

    def testEntity(self):
        # type: () -> None

        # from
        # /home/andy/src/Python-3.12.4/Lib/test/xmltestdata/c14n-20/inC14N5.xml
        h = '&ent1;, &ent2;!'

        tokens = Lex(h)

        self.assertEqual([
            (h8_id.CharEntity, 6),
            (h8_id.RawData, 8),
            (h8_id.CharEntity, 14),
            (h8_id.RawData, 15),
            (h8_id.EndOfStream, 15),
        ], tokens)

    def testStartTag(self):
        # type: () -> None

        h = '<a>hi</a>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 3),
            (h8_id.RawData, 5),
            (h8_id.EndTag, 9),
            (h8_id.EndOfStream, 9),
        ], tokens)

        # Make sure we don't consume too much
        h = '<a><source>1.7</source></a>'

        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 3),
            (h8_id.StartTag, 11),
            (h8_id.RawData, 14),
            (h8_id.EndTag, 23),
            (h8_id.EndTag, 27),
            (h8_id.EndOfStream, 27),
        ], tokens)

        # NOTE: everything below is disabled by this early return.  The
        # expected list is not usable as written: its end positions go
        # backwards (24, then 9).
        # NOTE(review): the whitespace inside this literal is a best guess;
        # confirm it before re-enabling the case.
        return

        h = '''
        <configuration>
          <source>1.7</source>
        </configuration>'''

        tokens = Lex(h)

        self.assertEqual([
            (h8_id.RawData, 9),
            (h8_id.StartTag, 24),
            (h8_id.RawData, 9),
            (h8_id.EndOfStream, 9),
        ], tokens)

    def testBad(self):
        # type: () -> None
        # A lone '&' and a lone '>' each get a dedicated Bad* token ID
        h = '&'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.BadAmpersand, 1),
            (h8_id.EndOfStream, 1),
        ], tokens)

        h = '>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.BadGreaterThan, 1),
            (h8_id.EndOfStream, 1),
        ], tokens)

    def testEndOfStream(self):
        # type: () -> None

        # NUL is end: the 'b' after it is never tokenized
        h = 'a\0b'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.RawData, 1),
            (h8_id.EndOfStream, 2),
        ], tokens)
604
605
# Run every TestCase defined in this module when it is executed as a script.
if __name__ == '__main__':
    unittest.main()