lazylex/html

OILS / lazylex / html_test.py View on Github | oils.pub

519 lines, 318 significant

1	#!/usr/bin/env python2
2	from __future__ import print_function
3
4	import unittest
5
6	from _devbuild.gen.htm8_asdl import h8_id, h8_id_str
7	from lazylex import html # module under test log = html.log
8
9	from typing import List, Tuple
10
# Shorthand for the logging helper exported by the module under test.
log = html.log

# Sample HTM8 document used by LexerTest.  It is read once, at import time,
# and the path is relative to the current working directory.
with open('data_lang/testdata/hello.htm8') as f:
    TEST_HTML = f.read()
15
16
class FunctionsTest(unittest.TestCase):
    """Tests for the module-level helper functions of lazylex.html."""

    def testToText(self):
        # type: () -> None
        """ToText() keeps only the text and decodes entities in it."""
        # A bare '<' in text is a lex error (see 'a < b' in INVALID_LEX), so
        # the less-than sign has to be spelled &lt; in the input.  A bare '&'
        # is tolerated (see "Allowed with BadAmpersand" in VALID_LEX).
        t = html.ToText('<b name="&"> three &lt; four && five </b>')
        self.assertEqual(' three < four && five ', t)
23
24
def _MakeTagLexer(s):
    # type: (str) -> html.TagLexer
    """Build a TagLexer over `s` and reset it to span the whole string."""
    tag_lexer = html.TagLexer(s)
    tag_lexer.Reset(0, len(s))
    return tag_lexer
30
31
def _PrintTokens(lex):
    # type: (html.TagLexer) -> None
    """Log the tag name, then every token together with the text it spans."""
    log('')
    log('tag = %r', lex.GetTagName())
    for tok_id, begin, finish in lex.Tokens():
        frag = lex.s[begin:finish]
        log('%s %r', tok_id, frag)
38
39
class TagLexerTest(unittest.TestCase):
    """Exercises html.TagLexer: tag name, attribute lookup, invalid tags."""

    def testTagLexer(self):
        # type: () -> None
        """Dump the tokens of several tags and check GetAttrRaw() lookups."""
        # Invalid!
        #lex = _MakeTagLexer('< >')
        #print(lex.Tag())

        _PrintTokens(_MakeTagLexer('<a>'))

        no_value = _MakeTagLexer('<a novalue>')
        _PrintTokens(no_value)

        # Note: we could have a different HasAttr() method
        # <a novalue> means lex.Get('novalue') == ''
        # https://developer.mozilla.org/en-US/docs/Web/API/Element/hasAttribute
        self.assertEqual('', no_value.GetAttrRaw('novalue'))

        quoted = _MakeTagLexer('<a href="double quoted">')
        _PrintTokens(quoted)

        self.assertEqual('double quoted', quoted.GetAttrRaw('href'))
        # An attribute that is not present at all yields None, not ''.
        self.assertEqual(None, quoted.GetAttrRaw('oops'))

        for tag in ('<a href=foo class="bar">', '<a href=foo class="bar" />'):
            _PrintTokens(_MakeTagLexer(tag))

        query = _MakeTagLexer('<a href="?foo=1&bar=2" />')
        self.assertEqual('?foo=1&bar=2', query.GetAttrRaw('href'))

    def testTagName(self):
        # type: () -> None
        """GetTagName() returns the element name of the tag."""
        tag = _MakeTagLexer('<a href=foo class="bar" />')
        self.assertEqual('a', tag.GetTagName())

    def testAllAttrs(self):
        # type: () -> None
        """
        [('key', 'value')] for all
        """
        # closed
        closed = _MakeTagLexer('<a href=foo class="bar" />')
        expected = [('href', 'foo'), ('class', 'bar')]
        self.assertEqual(expected, closed.AllAttrsRaw())

        query = _MakeTagLexer('<a href="?foo=1&bar=2" />')
        self.assertEqual([('href', '?foo=1&bar=2')], query.AllAttrsRaw())

    def testEmptyMissingValues(self):
        # type: () -> None
        """Attributes with an empty or missing value come back as ''."""
        # equivalent to <button disabled="">
        button = _MakeTagLexer('<button disabled>')
        button_attrs = button.AllAttrsRaw()
        self.assertEqual([('disabled', '')], button_attrs)

        log('slices %s', button.AllAttrsRawSlice())

        para = _MakeTagLexer(
            '''<p double="" single='' empty= value missing empty2=>''')
        para_attrs = para.AllAttrsRaw()
        expected = [
            ('double', ''),
            ('single', ''),
            ('empty', 'value'),
            ('missing', ''),
            ('empty2', ''),
        ]
        self.assertEqual(expected, para_attrs)
        # TODO: should have
        log('all %s', para_attrs)

        log('slices %s', para.AllAttrsRawSlice())

    def testInvalidTag(self):
        # type: () -> None
        """A stray '!' among the attributes raises LexError."""
        try:
            bad = _MakeTagLexer('<a foo=bar !></a>')
            bad.AllAttrsRaw()
        except html.LexError as e:
            print(e)
        else:
            self.fail('Expected LexError')
127
128
def _MakeAttrValueLexer(s):
    # type: (str) -> html.AttrValueLexer
    """Build an AttrValueLexer over `s`, reset to span the whole string."""
    value_lexer = html.AttrValueLexer(s)
    value_lexer.Reset(0, len(s))
    return value_lexer
134
135
class AttrValueLexerTest(unittest.TestCase):
    """Exercises html.AttrValueLexer on an attribute value."""

    def testGood(self):
        # type: () -> None
        """The value splits into three tokens around the ampersand."""
        value_lexer = _MakeAttrValueLexer('?foo=42&bar=99')
        self.assertEqual(3, value_lexer.NumTokens())
143
144
def Lex(h, no_special_tags=False):
    # type: (str, bool) -> List[Tuple[int, int]]
    """Tokenize `h`, log each token, and return the token list.

    The result is whatever html.ValidTokenList() produced: a list of
    (token id, end position) pairs.  Each token's text runs from the previous
    token's end position up to its own.
    """
    print(repr(h))
    tokens = html.ValidTokenList(h, no_special_tags=no_special_tags)

    prev_end = 0
    for tok_id, end_pos in tokens:
        log('%d %s %r', end_pos, h8_id_str(tok_id), h[prev_end:end_pos])
        prev_end = end_pos
    return tokens
155
156
class LexerTest(unittest.TestCase):
    """Token-level tests for html.ValidTokenList(), driven through Lex().

    Expected values are (token id, end offset) pairs, so the exact text of
    every input matters -- including the leading whitespace on the
    continuation lines of the triple-quoted literals below.
    """

    # IndexLinker in devtools/make_help.py
    #  <pre> sections in doc/html_help.py
    # TocExtractor in devtools/cmark.py

    def testPstrip(self):
        # type: () -> None
        """Remove anything like this.

        <p><pstrip> </pstrip></p>
        """
        pass

    def testCommentParse(self):
        # type: () -> None
        """The sample document lexes without raising."""
        Lex(TEST_HTML)

    def testCommentParse2(self):
        # type: () -> None
        """A comment may span lines; it is a single Comment token."""
        # Whitespace is significant: 8 spaces before 'hi', 16 before 'line 2'.
        h = '''
        hi <!-- line 1
                line 2 --><br/>'''
        tokens = Lex(h)

        self.assertEqual(
            [
                (h8_id.RawData, 12),  # newline, indent, 'hi '
                (h8_id.Comment, 50),  # <!-- line 1 ... line 2 -->
                (h8_id.StartEndTag, 55),  # <br/>
                (h8_id.EndOfStream, 55),
            ],
            tokens)

    def testProcessingInstruction(self):
        # type: () -> None
        """<? ... ?> is lexed as one Processing token."""
        # <?xml ?> header
        h = 'hi <? err ?>'
        tokens = Lex(h)

        self.assertEqual(
            [
                (h8_id.RawData, 3),
                (h8_id.Processing, 12),  # <? err ?>
                (h8_id.EndOfStream, 12),
            ],
            tokens)

    def testScriptStyle(self):
        # type: () -> None
        """The body of <script> is one HtmlCData token, in any letter case."""
        # Whitespace is significant: each continuation line has 8 spaces.
        h = '''
        hi <script src=""> if (x < 1 && y > 2 ) { console.log(""); }
        </script>
        '''
        tokens = Lex(h)

        expected = [
            (h8_id.RawData, 12),
            (h8_id.StartTag, 27),  # <script src="">
            (h8_id.HtmlCData, 78),  # JavaScript code is HTML CData
            (h8_id.EndTag, 87),  # </script>
            (h8_id.RawData, 96),  # trailing newline and indent
            (h8_id.EndOfStream, 96),
        ]
        self.assertEqual(expected, tokens)

        # Test case matching
        tokens = Lex(h.replace('script', 'scrIPT'))
        self.assertEqual(expected, tokens)

    def testScriptStyleXml(self):
        # type: () -> None
        """With no_special_tags=True, <script> content is lexed as markup."""
        # The 4-character entity is required by the offsets below
        # (CharEntity ends at 23, the whole input is 33 characters long).
        h = 'hi <script src=""> &lt; </script>'
        # XML mode
        tokens = Lex(h, no_special_tags=True)

        self.assertEqual(
            [
                (h8_id.RawData, 3),
                (h8_id.StartTag, 18),  # <script src="">
                (h8_id.RawData, 19),  # space
                (h8_id.CharEntity, 23),  # &lt;
                (h8_id.RawData, 24),  # space
                (h8_id.EndTag, 33),  # </script>
                (h8_id.EndOfStream, 33),
            ],
            tokens)

    def testCData(self):
        # type: () -> None
        """A <![CDATA[ ... ]]> section is a single CData token."""

        # from
        # /home/andy/src/languages/Python-3.11.5/Lib/test/xmltestdata/c14n-20/inC14N4.xml
        h = '<compute><![CDATA[value>"0" && value<"10" ?"valid":"error"]]></compute>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 9),
            (h8_id.CData, 61),
            (h8_id.EndTag, 71),
            (h8_id.EndOfStream, 71),
        ], tokens)

    def testEntity(self):
        # type: () -> None
        """Each &name; reference is lexed as a CharEntity token."""

        # from
        # /home/andy/src/Python-3.12.4/Lib/test/xmltestdata/c14n-20/inC14N5.xml
        h = '&ent1;, &ent2;!'

        tokens = Lex(h)

        self.assertEqual([
            (h8_id.CharEntity, 6),
            (h8_id.RawData, 8),
            (h8_id.CharEntity, 14),
            (h8_id.RawData, 15),
            (h8_id.EndOfStream, 15),
        ], tokens)

    def testStartTag(self):
        # type: () -> None
        """Start and end tags are separate tokens, also when nested."""

        h = '<a>hi</a>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 3),
            (h8_id.RawData, 5),
            (h8_id.EndTag, 9),
            (h8_id.EndOfStream, 9),
        ], tokens)

        # Make sure we don't consume too much
        h = '<a><source>1.7</source></a>'

        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 3),
            (h8_id.StartTag, 11),
            (h8_id.RawData, 14),
            (h8_id.EndTag, 23),
            (h8_id.EndTag, 27),
            (h8_id.EndOfStream, 27),
        ], tokens)

        # NOTE(review): everything below is unreachable because of this
        # return; the expected offsets look like unfinished placeholders.
        return

        h = '''
        <configuration>
        <source>1.7</source>
        </configuration>'''

        tokens = Lex(h)

        self.assertEqual([
            (h8_id.RawData, 9),
            (h8_id.StartTag, 24),
            (h8_id.RawData, 9),
            (h8_id.EndOfStream, 9),
        ], tokens)

    def testBad(self):
        # type: () -> None
        """A lone '&' or '>' is accepted, with a dedicated Bad* token id."""
        h = '&'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.BadAmpersand, 1),
            (h8_id.EndOfStream, 1),
        ], tokens)

        h = '>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.BadGreaterThan, 1),
            (h8_id.EndOfStream, 1),
        ], tokens)

    def testInvalid(self):
        # type: () -> None
        """Every INVALID_LEX input makes ValidTokenList() raise LexError."""
        for s in INVALID_LEX:
            try:
                html.ValidTokenList(s)
            except html.LexError as e:
                print(e)
            else:
                self.fail('Expected LexError %r' % s)

    def testValid(self):
        # type: () -> None
        """Every VALID_LEX input lexes without raising."""
        for s, _ in VALID_LEX:
            Lex(s)
            print()
354
355
# Inputs the test classes in this file expect to fail with html.LexError.
INVALID_LEX = [
    '<a><',
    '&amp<',
    '&<',
    # Hm > is allowed?
    #'a > b',
    'a < b',
    '<!-- unfinished comment',
    '<? unfinished processing',
    '</div bad=attr> <a> <b>',

    # not allowed, but 3 > 4 is allowed
    '<a> 3 < 4 </a>',
    # Not a CDATA tag
    '<STYLEz><</STYLEz>',
]
372
# Markers for the "expected XML" slot of the (input, expected) pairs below.
# NOTE: entries that should be skipped are currently written as '' rather
# than SKIP, and XmlTest compares against '' as well.
SKIP = 0
UNCHANGED = 1

# (input, expected XML) pairs that must lex without error.
VALID_LEX = [
    # TODO: convert these to XML
    ('<foo></foo>', UNCHANGED),
    ('<foo x=y></foo>', ''),
    #('<foo x="&"></foo>', '<foo x="&amp;"></foo>'),
    ('<foo x="&"></foo>', ''),

    # Allowed with BadAmpersand
    # The expected XML has the ampersand escaped; output identical to the
    # input would be written as UNCHANGED instead.
    ('<p> x & y </p>', '<p> x &amp; y </p>'),
]
386
# Inputs ValidateTest expects html.Validate() to reject with ParseError.
INVALID_PARSE = [
    '<a></b>',
    '<a>',  # missing closing tag
    '<meta></meta>',  # this is a self-closing tag
]
392
# (input, expected XML) pairs that html.Validate() must accept.  The second
# element is UNCHANGED when the XML equals the input, and '' when the XML
# comparison is skipped.
VALID_PARSE = [
    ('<!DOCTYPE html>\n', ''),
    ('<!DOCTYPE>', ''),

    # empty strings
    ('<p x=""></p>', UNCHANGED),
    ("<p x=''></p>", UNCHANGED),
    ('<self-closing a="b" />', UNCHANGED),

    # We could also normalize CDATA?
    # Note that CDATA has an escaping problem: you need to handle it ]]> with
    # concatenation.  It just "pushes the problem around".
    # So I think it's better to use ONE kind of escaping, which is &lt;
    ('<script><![CDATA[ <wtf> >< ]]></script>', UNCHANGED),

    # allowed, but 3 < 4 is not allowed
    # The expected XML has the '>' escaped; output identical to the input
    # would be written as UNCHANGED instead.
    ('<a> 3 > 4 </a>', '<a> 3 &gt; 4 </a>'),
    # allowed, but 3 > 4 is not allowed
    ('<p x="3 < 4"></p>', ''),
    ('<b><a href="foo">link</a></b>', UNCHANGED),

    # TODO: should be self-closing
    #('<meta><a></a>', '<meta/><a></a>'),
    ('<meta><a></a>', ''),

    # no attribute
    ('<button disabled></button>', ''),
    ('<button disabled=></button>', ''),
    ('<button disabled=   ></button>', ''),

    # single quoted is pretty common
    ("<a href='single'></a>", ''),

    # Conceding to reality - I used these myself
    ('<a href=ble.sh></a>', ''),
    ('<a href=foo.html></a>', ''),
    ('<foo x="&"></foo>', ''),

    # caps
    ('<foo></FOO>', ''),
    ('<Foo></fOO>', ''),

    # capital VOID tag
    ('<META><a></a>', ''),
    ('<script><</script>', ''),
    # matching
    ('<SCRipt><</SCRipt>', ''),
    ('<SCRIPT><</SCRIPT>', ''),
    ('<STYLE><</STYLE>', ''),
    #'<SCRipt><</script>',

    # Note: Python HTMLParser.py does DYNAMIC compilation of regex with re.I
    # flag to handle this!  Gah I want something faster.
    #'<script><</SCRIPT>',

    # TODO: Test <svg> and <math> ?
]
450
# Inputs ValidateTest checks with NO_SPECIAL_TAGS added to the flags.
VALID_XML = [
    '<meta></meta>',
]
454
# Inputs ValidateTest expects html.Validate() to reject with LexError.  They
# are kept apart from INVALID_LEX, which is also fed to ValidTokenList().
INVALID_TAG_LEX = [
    # not allowed, but 3 < 4 is allowed
    '<p x="3 > 4"></p>',
    # same thing
    '<a href=">"></a>',
    '<a foo=bar !></a>',  # bad attr
]
462
463
class ValidateTest(unittest.TestCase):
    """html.Validate() must reject the INVALID_* inputs and accept VALID_*."""

    def testInvalid(self):
        # type: () -> None
        """Lexically bad input raises LexError; unbalanced tags ParseError."""
        counters = html.Counters()
        for s in INVALID_LEX + INVALID_TAG_LEX:
            try:
                html.Validate(s, html.BALANCED_TAGS, counters)
            except html.LexError as e:
                print(e)
            else:
                self.fail('Expected LexError %r' % s)

        for s in INVALID_PARSE:
            try:
                html.Validate(s, html.BALANCED_TAGS, counters)
            except html.ParseError as e:
                print(e)
            else:
                # Name the offending input, like the LexError loop above.
                self.fail('Expected ParseError %r' % s)

    def testValid(self):
        # type: () -> None
        """Every VALID_PARSE input validates with BALANCED_TAGS."""
        counters = html.Counters()
        for s, _ in VALID_PARSE:
            html.Validate(s, html.BALANCED_TAGS, counters)
            print('HTML5 %r' % s)
            #print('HTML5 attrs %r' % counters.debug_attrs)

    def testValidXml(self):
        # type: () -> None
        """Every VALID_XML input validates once NO_SPECIAL_TAGS is added."""
        counters = html.Counters()
        for s in VALID_XML:
            # The two flags are combined with bitwise OR.
            html.Validate(s, html.BALANCED_TAGS | html.NO_SPECIAL_TAGS,
                          counters)
            print('XML %r' % s)
            #print('XML attrs %r' % counters.debug_attrs)
501
502
class XmlTest(unittest.TestCase):
    """Checks html.ToXml() against the (input, expected XML) tables."""

    def testValid(self):
        # type: () -> None
        """Convert every valid input and compare where an expectation exists."""
        counters = html.Counters()  # not used below
        for h, expected_xml in VALID_LEX + VALID_PARSE:
            # Convert first, so inputs whose comparison is skipped still
            # have to convert without raising.
            actual = html.ToXml(h)

            if expected_xml == '':  # Skip
                continue

            if expected_xml == UNCHANGED:  # Unchanged
                self.assertEqual(h, actual)
            else:
                self.assertEqual(expected_xml, actual)
516
517
# Run the whole suite when this file is executed as a script.
if __name__ == '__main__':
    unittest.main()