lazylex/html

OILS / lazylex / html_test.py View on Github | oils.pub

556 lines, 340 significant

1	#!/usr/bin/env python2
2	from __future__ import print_function
3
4	import unittest
5
6	from _devbuild.gen.htm8_asdl import h8_id, h8_id_str
7	from data_lang import htm8
8	from lazylex import html # module under test log = html.log
9
10	from typing import List, Tuple
11
# Short alias for the logging helper exported by the module under test.
log = html.log

# Fixture document consumed by LexerTest.testCommentParse.  The path is
# relative, so it is resolved against the current working directory when this
# module is imported.
with open('data_lang/testdata/hello.htm8') as f:
    TEST_HTML = f.read()
16
17
class RegexTest(unittest.TestCase):
    """Pins the regex behavior that matters when matching across newlines."""

    def testDotAll(self):
        # type: () -> None
        """Check which patterns match a lone newline character."""
        import re

        # Without re.DOTALL, '.' matches any character EXCEPT a newline
        p1 = re.compile(r'.')
        m1 = p1.match('\n')
        print(m1)
        self.assertIsNone(m1)

        # With re.DOTALL, '.' matches a newline too
        p2 = re.compile(r'.', re.DOTALL)
        m2 = p2.match('\n')
        print(m2)
        self.assertIsNotNone(m2)

        # Inside a character class, '.' is a literal dot; \n still matches
        #p3 = re.compile(r'[.\n]', re.VERBOSE)
        p3 = re.compile(r'[.\n]')
        m3 = p3.match('\n')
        print(m3)
        self.assertIsNotNone(m3)

        print('Negation')

        # A negated class matches newlines, with or without re.DOTALL
        p4 = re.compile(r'[^>]')
        m4 = p4.match('\n')
        print(m4)
        self.assertIsNotNone(m4)

    def testAttrRe(self):
        # type: () -> None
        """Print the groups htm8._ATTR_RE captures for ' empty= val'."""
        _ATTR_RE = htm8._ATTR_RE
        m = _ATTR_RE.match(' empty= val')
        # Fail with a readable message rather than an AttributeError on None
        self.assertIsNotNone(m, "_ATTR_RE should match ' empty= val'")
        print(m.groups())
45
46
class FunctionsTest(unittest.TestCase):
    """Exercises free functions: htm8.FindLineNum() and html.ToText()."""

    def testFindLineNum(self):
        # type: () -> None
        """Print the line number reported for a handful of offsets."""
        text = 'foo\n' * 3
        # The text is 12 characters long, so the last offset is out of bounds
        for offset in [1, 5, 10, 50]:
            print(htm8.FindLineNum(text, offset))

    def testToText(self):
        # type: () -> None
        """Compare ToText() output for a small <b> element."""
        # NOTE(review): the input literal looks like it went through HTML
        # entity decoding somewhere (bare '&' and '<' inside markup) --
        # confirm against the canonical copy of this file.
        expected = ' three < four && five '
        actual = html.ToText('<b name="&"> three < four && five </b>')
        self.assertEqual(expected, actual)
60
61
def _MakeTagLexer(s):
    # type: (str) -> html.TagLexer
    """Return an html.TagLexer over `s`, reset to span the whole string."""
    tag_lexer = html.TagLexer(s)
    whole_len = len(s)
    tag_lexer.Reset(0, whole_len)
    return tag_lexer
67
68
def _PrintTokens(lex):
    # type: (html.TagLexer) -> None
    """Log the tag name of `lex`, then each token with the text it covers."""
    log('')
    tag_name = lex.GetTagName()
    log('tag = %r', tag_name)
    for tok_id, left, right in lex.Tokens():
        covered = lex.s[left:right]
        log('%s %r', tok_id, covered)
75
76
class TagLexerTest(unittest.TestCase):
    """Tests for html.TagLexer: tag names, attribute lookup, bad attributes."""

    def testTagLexer(self):
        # type: () -> None
        """Tokenize several start tags and look up attributes by name."""
        # Not exercised because the input is invalid:
        #lex = _MakeTagLexer('< >')
        #print(lex.Tag())

        _PrintTokens(_MakeTagLexer('<a>'))

        bare = _MakeTagLexer('<a novalue>')
        _PrintTokens(bare)

        # Note: we could have a different HasAttr() method
        # <a novalue> means lex.Get('novalue') == ''
        # https://developer.mozilla.org/en-US/docs/Web/API/Element/hasAttribute
        self.assertEqual('', bare.GetAttrRaw('novalue'))

        quoted = _MakeTagLexer('<a href="double quoted">')
        _PrintTokens(quoted)

        self.assertEqual('double quoted', quoted.GetAttrRaw('href'))
        # An attribute that is absent comes back as None, not ''
        self.assertEqual(None, quoted.GetAttrRaw('oops'))

        # Same attributes on an open tag and on a self-closing tag
        for tag in ('<a href=foo class="bar">', '<a href=foo class="bar" />'):
            _PrintTokens(_MakeTagLexer(tag))

        query = _MakeTagLexer('<a href="?foo=1&bar=2" />')
        self.assertEqual('?foo=1&bar=2', query.GetAttrRaw('href'))

    def testTagName(self):
        # type: () -> None
        """GetTagName() returns the element name, here 'a'."""
        tag_lexer = _MakeTagLexer('<a href=foo class="bar" />')
        name = tag_lexer.GetTagName()
        self.assertEqual('a', name)

    def testAllAttrs(self):
        # type: () -> None
        """AllAttrsRaw() returns [(name, value)] pairs in source order."""
        # Self-closing tag with an unquoted and a quoted value
        closed = _MakeTagLexer('<a href=foo class="bar" />')
        expected = [('href', 'foo'), ('class', 'bar')]
        self.assertEqual(expected, closed.AllAttrsRaw())

        query = _MakeTagLexer('<a href="?foo=1&bar=2" />')
        self.assertEqual([('href', '?foo=1&bar=2')], query.AllAttrsRaw())

    def testEmptyMissingValues(self):
        # type: () -> None
        """Attributes with empty or missing values read back as ''."""
        # equivalent to <button disabled="">
        button = _MakeTagLexer('<button disabled>')
        self.assertEqual([('disabled', '')], button.AllAttrsRaw())

        log('slices %s', button.AllAttrsRawSlice())

        para = _MakeTagLexer(
            '''<p double="" single='' empty= value missing empty2=>''')
        para_attrs = para.AllAttrsRaw()
        expected = [
            ('double', ''),
            ('single', ''),
            ('empty', 'value'),
            ('missing', ''),
            ('empty2', ''),
        ]
        self.assertEqual(expected, para_attrs)
        # TODO: the note left here in the original ("should have") was
        # never finished
        log('all %s', para_attrs)

        log('slices %s', para.AllAttrsRawSlice())

    def testInvalidTag(self):
        # type: () -> None
        """A stray '!' among the attributes raises html.LexError."""
        try:
            _MakeTagLexer('<a foo=bar !></a>').AllAttrsRaw()
        except html.LexError as e:
            print(e)
            return
        self.fail('Expected LexError')
164
165
def _MakeAttrValueLexer(s):
    # type: (str) -> html.AttrValueLexer
    """Return an html.AttrValueLexer over `s`, reset to span all of it."""
    value_lexer = html.AttrValueLexer(s)
    whole_len = len(s)
    value_lexer.Reset(0, whole_len)
    return value_lexer
171
172
class AttrValueLexerTest(unittest.TestCase):
    """Tests for html.AttrValueLexer."""

    def testGood(self):
        # type: () -> None
        """A query string with one '&' separator lexes into three tokens."""
        value_lexer = _MakeAttrValueLexer('?foo=42&bar=99')
        self.assertEqual(3, value_lexer.NumTokens())
180
181
def Lex(h, no_special_tags=False):
    # type: (str, bool) -> List[Tuple[int, int]]
    """Lex `h` with html.ValidTokenList(), logging every token on the way.

    Prints repr(h) first, then logs one line per token: its end offset, the
    name of its ID, and the slice of `h` it covers.  Returns the token list
    unchanged, as (token id, end offset) pairs.
    """
    print(repr(h))
    tokens = html.ValidTokenList(h, no_special_tags=no_special_tags)

    # Each token starts where the previous one ended
    prev_end = 0
    for tok_id, end_pos in tokens:
        log('%d %s %r', end_pos, h8_id_str(tok_id), h[prev_end:end_pos])
        prev_end = end_pos
    return tokens
192
193
class LexerTest(unittest.TestCase):
    """Checks the (token id, end offset) pairs returned by Lex().

    The offsets are absolute positions in the input, so leading whitespace
    inside the multi-line inputs is significant.  It is spelled out
    explicitly below so that re-indenting this file cannot shift the
    expected offsets.
    """

    # IndexLinker in devtools/make_help.py
    # <pre> sections in doc/html_help.py
    # TocExtractor in devtools/cmark.py

    def testPstrip(self):
        # type: () -> None
        """Remove anything like this.

        <p><pstrip> </pstrip></p>
        """
        pass

    def testCommentParse(self):
        # type: () -> None
        """Lex the hello.htm8 fixture; the tokens are logged, not asserted."""
        Lex(TEST_HTML)

    def testCommentParse2(self):
        # type: () -> None
        """A comment spanning two lines is a single Comment token."""
        # 8 spaces before 'hi' and 16 before 'line 2' are what the offsets
        # 12 and 50 below require
        h = ('\n' + ' ' * 8 + 'hi <!-- line 1\n' + ' ' * 16 +
             'line 2 --><br/>')
        tokens = Lex(h)

        self.assertEqual(
            [
                (h8_id.RawData, 12),  # newline, indent, 'hi '
                (h8_id.Comment, 50),  # <!-- line 1 ... line 2 -->
                (h8_id.StartEndTag, 55),  # <br/>
                (h8_id.EndOfStream, 55),
            ],
            tokens)

    def testProcessingInstruction(self):
        # type: () -> None
        """<? ... ?> lexes as a Processing token."""
        # <?xml ?> header
        h = 'hi <? err ?>'
        tokens = Lex(h)

        self.assertEqual(
            [
                (h8_id.RawData, 3),
                (h8_id.Processing, 12),  # <? err ?>
                (h8_id.EndOfStream, 12),
            ],
            tokens)

    def testScriptStyle(self):
        # type: () -> None
        """The body of <script> is one HtmlCData token, in any letter case."""
        indent = ' ' * 8
        h = ('\n' + indent +
             'hi <script src=""> if (x < 1 && y > 2 ) { console.log(""); }\n' +
             indent + '</script>\n' + indent)
        tokens = Lex(h)

        expected = [
            (h8_id.RawData, 12),
            (h8_id.StartTag, 27),  # <script src="">
            (h8_id.HtmlCData, 78),  # JavaScript code is HTML CData
            (h8_id.EndTag, 87),  # </script>
            (h8_id.RawData, 96),  # trailing newline and indent
            (h8_id.EndOfStream, 96),
        ]
        self.assertEqual(expected, tokens)

        # Test case matching
        tokens = Lex(h.replace('script', 'scrIPT'))
        self.assertEqual(expected, tokens)

    def testScriptStyleXml(self):
        # type: () -> None
        """With no_special_tags, <script> content is lexed like other text."""
        # The entity must be spelled '&lt;': the CharEntity token below spans
        # offsets 19-23 and the whole input is 33 characters long.
        h = 'hi <script src=""> &lt; </script>'
        # XML mode
        tokens = Lex(h, no_special_tags=True)

        self.assertEqual(
            [
                (h8_id.RawData, 3),
                (h8_id.StartTag, 18),  # <script src="">
                (h8_id.RawData, 19),  # space
                (h8_id.CharEntity, 23),  # &lt;
                (h8_id.RawData, 24),  # space
                (h8_id.EndTag, 33),  # </script>
                (h8_id.EndOfStream, 33),
            ],
            tokens)

    def testCData(self):
        # type: () -> None
        """A <![CDATA[ ... ]]> section is a single CData token."""

        # from
        # /home/andy/src/languages/Python-3.11.5/Lib/test/xmltestdata/c14n-20/inC14N4.xml
        h = '<compute><![CDATA[value>"0" && value<"10" ?"valid":"error"]]></compute>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 9),
            (h8_id.CData, 61),
            (h8_id.EndTag, 71),
            (h8_id.EndOfStream, 71),
        ], tokens)

    def testEntity(self):
        # type: () -> None
        """Named entities lex as CharEntity tokens between RawData runs."""

        # from
        # /home/andy/src/Python-3.12.4/Lib/test/xmltestdata/c14n-20/inC14N5.xml
        h = '&ent1;, &ent2;!'

        tokens = Lex(h)

        self.assertEqual([
            (h8_id.CharEntity, 6),
            (h8_id.RawData, 8),
            (h8_id.CharEntity, 14),
            (h8_id.RawData, 15),
            (h8_id.EndOfStream, 15),
        ], tokens)

    def testStartTag(self):
        # type: () -> None
        """Start and end tags are separate tokens, also when nested."""

        h = '<a>hi</a>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 3),
            (h8_id.RawData, 5),
            (h8_id.EndTag, 9),
            (h8_id.EndOfStream, 9),
        ], tokens)

        # Make sure we don't consume too much
        h = '<a><source>1.7</source></a>'

        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 3),
            (h8_id.StartTag, 11),
            (h8_id.RawData, 14),
            (h8_id.EndTag, 23),
            (h8_id.EndTag, 27),
            (h8_id.EndOfStream, 27),
        ], tokens)

        # TODO: a multi-line variant of this case sat behind an early
        # `return` here and never ran; its expected token list was never
        # completed.  The input was:
        #
        #   <configuration>
        #     <source>1.7</source>
        #   </configuration>

    def testBad(self):
        # type: () -> None
        """A lone '&' or '>' gets its own Bad* token instead of an error."""
        h = '&'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.BadAmpersand, 1),
            (h8_id.EndOfStream, 1),
        ], tokens)

        h = '>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.BadGreaterThan, 1),
            (h8_id.EndOfStream, 1),
        ], tokens)

    def testInvalid(self):
        # type: () -> None
        """Every INVALID_LEX input makes ValidTokenList() raise LexError."""
        for s in INVALID_LEX:
            try:
                html.ValidTokenList(s)
            except html.LexError as e:
                print(e)
            else:
                self.fail('Expected LexError %r' % s)

    def testValid(self):
        # type: () -> None
        """Every VALID_LEX input lexes without raising."""
        for s, _ in VALID_LEX:
            Lex(s)
            print()
391
392
# Inputs that must raise html.LexError.  Used by LexerTest.testInvalid (via
# html.ValidTokenList) and by ValidateTest.testInvalid (via html.Validate).
INVALID_LEX = [
    '<a><',
    '&amp<',
    '&<',
    # Hm > is allowed?
    #'a > b',
    'a < b',
    '<!-- unfinished comment',
    '<? unfinished processing',
    '</div bad=attr> <a> <b>',

    # not allowed, but 3 > 4 is allowed
    '<a> 3 < 4 </a>',
    # Not a CDATA tag
    '<STYLEz><</STYLEz>',
]
409
# Markers for the second element of the (input, expected XML) pairs in the
# VALID_* tables.  XmlTest.testValid compares against UNCHANGED to require
# that the output equals the input.  SKIP is not referenced by the tables
# visible in this file; they use '' to skip the comparison.
SKIP = 0
UNCHANGED = 1
412
# (input, expected XML) pairs that must lex without error.  The second
# element is UNCHANGED, '' (no comparison is made), or the exact output
# expected from html.ToXml().
VALID_LEX = [
    # TODO: convert these to XML
    ('<foo></foo>', UNCHANGED),
    ('<foo x=y></foo>', ''),
    #('<foo x="&"></foo>', '<foo x="&"></foo>'),
    ('<foo x="&"></foo>', ''),

    # Allowed with BadAmpersand
    # NOTE(review): expected output is identical to the input here, which
    # would normally be written as UNCHANGED -- an entity such as &amp; may
    # have been decoded in this copy of the file; confirm.
    ('<p> x & y </p>', '<p> x & y </p>'),
]
423
# Inputs for which html.Validate() with html.BALANCED_TAGS must raise
# html.ParseError (see ValidateTest.testInvalid).
INVALID_PARSE = [
    '<a></b>',
    '<a>',  # missing closing tag
    '<meta></meta>',  # this is a self-closing tag
]
429
# (input, expected XML) pairs that html.Validate() must accept with
# html.BALANCED_TAGS (see ValidateTest.testValid).  XmlTest.testValid also
# runs them through html.ToXml(); the second element is UNCHANGED, ''
# (no comparison is made), or the exact expected output.
VALID_PARSE = [
    ('<!DOCTYPE html>\n', ''),
    ('<!DOCTYPE>', ''),

    # empty strings
    ('<p x=""></p>', UNCHANGED),
    ("<p x=''></p>", UNCHANGED),
    ('<self-closing a="b" />', UNCHANGED),

    # We could also normalize CDATA?
    # Note that CDATA has an escaping problem: you need to handle it ]]> with
    # concatenation.  It just "pushes the problem around".
    # So I think it's better to use ONE kind of escaping, which is &lt;
    ('<script><![CDATA[ <wtf> >< ]]></script>', UNCHANGED),

    # allowed, but 3 < 4 is not allowed
    # NOTE(review): expected output equals the input; a &gt; entity may have
    # been decoded in this copy of the file -- confirm.
    ('<a> 3 > 4 </a>', '<a> 3 > 4 </a>'),
    # allowed, but 3 > 4 is not allowed
    ('<p x="3 < 4"></p>', ''),
    ('<b><a href="foo">link</a></b>', UNCHANGED),

    # TODO: should be self-closing
    #('<meta><a></a>', '<meta/><a></a>'),
    ('<meta><a></a>', ''),

    # no attribute
    ('<button disabled></button>', ''),
    ('<button disabled=></button>', ''),
    ('<button disabled= ></button>', ''),

    # single quoted is pretty common
    ("<a href='single'></a>", ''),

    # Conceding to reality - I used these myself
    ('<a href=ble.sh></a>', ''),
    ('<a href=foo.html></a>', ''),
    ('<foo x="&"></foo>', ''),

    # caps
    ('<foo></FOO>', ''),
    ('<Foo></fOO>', ''),

    # capital VOID tag
    ('<META><a></a>', ''),
    ('<script><</script>', ''),
    # matching
    ('<SCRipt><</SCRipt>', ''),
    ('<SCRIPT><</SCRIPT>', ''),
    ('<STYLE><</STYLE>', ''),
    #'<SCRipt><</script>',

    # Note: Python HTMLParser.py does DYNAMIC compilation of regex with re.I
    # flag to handle this!  Gah I want something faster.
    #'<script><</SCRIPT>',

    # TODO: Test <svg> and <math> ?
]
487
# Inputs that html.Validate() must accept when html.NO_SPECIAL_TAGS is
# combined with html.BALANCED_TAGS (see ValidateTest.testValidXml).  Note
# that '<meta></meta>' also appears in INVALID_PARSE, where it is validated
# without NO_SPECIAL_TAGS.
VALID_XML = [
    '<meta></meta>',
]
491
# Inputs that must make html.Validate() raise html.LexError.  They are only
# used by ValidateTest.testInvalid, not by LexerTest.testInvalid.
INVALID_TAG_LEX = [
    # not allowed, but 3 < 4 is allowed
    '<p x="3 > 4"></p>',
    # same thing
    '<a href=">"></a>',
    '<a foo=bar !></a>',  # bad attr
]
499
500
class ValidateTest(unittest.TestCase):
    """Runs html.Validate() over the valid and invalid input tables."""

    def testInvalid(self):
        # type: () -> None
        """Bad inputs raise LexError or ParseError, depending on the table."""
        counters = html.Counters()
        for s in INVALID_LEX + INVALID_TAG_LEX:
            try:
                html.Validate(s, html.BALANCED_TAGS, counters)
            except html.LexError as e:
                print(e)
            else:
                self.fail('Expected LexError %r' % s)

        for s in INVALID_PARSE:
            try:
                html.Validate(s, html.BALANCED_TAGS, counters)
            except html.ParseError as e:
                print(e)
            else:
                # Name the offending input, like the LexError case above
                self.fail('Expected ParseError %r' % s)

    def testValid(self):
        # type: () -> None
        """Every VALID_PARSE input validates with BALANCED_TAGS."""
        counters = html.Counters()
        for s, _ in VALID_PARSE:
            html.Validate(s, html.BALANCED_TAGS, counters)
            print('HTML5 %r' % s)
        #print('HTML5 attrs %r' % counters.debug_attrs)

    def testValidXml(self):
        # type: () -> None
        """Every VALID_XML input validates once NO_SPECIAL_TAGS is added."""
        counters = html.Counters()
        for s in VALID_XML:
            # The flags are combined with bitwise OR
            html.Validate(s, html.BALANCED_TAGS | html.NO_SPECIAL_TAGS,
                          counters)
            print('XML %r' % s)
        #print('XML attrs %r' % counters.debug_attrs)
538
539
class XmlTest(unittest.TestCase):
    """Checks html.ToXml() against the VALID_LEX and VALID_PARSE tables."""

    def testValid(self):
        # type: () -> None
        """Convert every valid input and compare where an output is given."""
        for h, expected_xml in VALID_LEX + VALID_PARSE:
            # Skipped cases are still converted, so ToXml() must at least
            # not raise on them
            actual = html.ToXml(h)

            # The tables spell "skip" as ''; the SKIP constant is accepted
            # as well so that either spelling works
            if expected_xml == '' or expected_xml == SKIP:
                continue

            if expected_xml == UNCHANGED:
                self.assertEqual(h, actual)
            else:
                self.assertEqual(expected_xml, actual)
553
554
# Run every TestCase in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()