lazylex/html

OILS / lazylex / html_test.py View on Github | oils.pub

522 lines, 321 significant

1	#!/usr/bin/env python2
2	from __future__ import print_function
3
4	import unittest
5
6	from _devbuild.gen.htm8_asdl import h8_id, h8_id_str
7	from data_lang import htm8
8	from lazylex import html # module under test log = html.log
9
10	from typing import List, Tuple
11
# Short alias for the logging helper exported by the module under test.
log = html.log

# Read the shared HTM8 fixture once, at import time.  The path is relative,
# so it is resolved against the current working directory of the test run.
with open('data_lang/testdata/hello.htm8') as f:
    TEST_HTML = f.read()
16
17
class FunctionsTest(unittest.TestCase):
    """Smoke tests for free functions in data_lang.htm8."""

    def testFindLineNum(self):
        # type: () -> None
        """Call FindLineNum at several offsets and print each result.

        Nothing is asserted; the test only checks that the calls do not raise.
        """
        text = 'foo\n' * 3
        # The last offset (50) lies past the end of the 12-character string.
        offsets = (1, 5, 10, 50)
        for offset in offsets:
            print(htm8.FindLineNum(text, offset))
26
27
def _MakeTagLexer(s):
    # type: (str) -> html.TagLexer
    """Build an html.TagLexer over `s`, reset to span the whole string."""
    tag_lexer = html.TagLexer(s)
    whole_length = len(s)
    tag_lexer.Reset(0, whole_length)
    return tag_lexer
33
34
def _PrintTokens(lex):
    # type: (html.TagLexer) -> None
    """Log the tag name, then one line per token with its source text."""
    log('')
    tag_name = lex.GetTagName()
    log('tag = %r', tag_name)
    for token in lex.Tokens():
        tok_id, begin, finish = token
        log('%s %r', tok_id, lex.s[begin:finish])
41
42
class TagLexerTest(unittest.TestCase):
    """Tests html.TagLexer: tag names, raw attribute lookup, invalid tags."""

    def testTagLexer(self):
        # type: () -> None
        # Invalid!
        #lex = _MakeTagLexer('< >')
        #print(lex.Tag())

        _PrintTokens(_MakeTagLexer('<a>'))

        valueless = _MakeTagLexer('<a novalue>')
        _PrintTokens(valueless)

        # Note: we could have a different HasAttr() method
        # <a novalue> means lex.Get('novalue') == ''
        # https://developer.mozilla.org/en-US/docs/Web/API/Element/hasAttribute
        self.assertEqual('', valueless.GetAttrRaw('novalue'))

        quoted = _MakeTagLexer('<a href="double quoted">')
        _PrintTokens(quoted)

        self.assertEqual('double quoted', quoted.GetAttrRaw('href'))
        # An attribute that is absent yields None, not ''.
        self.assertEqual(None, quoted.GetAttrRaw('oops'))

        # Unquoted + quoted attributes, without and with the trailing slash.
        for tag in ('<a href=foo class="bar">', '<a href=foo class="bar" />'):
            _PrintTokens(_MakeTagLexer(tag))

        query = _MakeTagLexer('<a href="?foo=1&bar=2" />')
        self.assertEqual('?foo=1&bar=2', query.GetAttrRaw('href'))

    def testTagName(self):
        # type: () -> None
        tag_name = _MakeTagLexer('<a href=foo class="bar" />').GetTagName()
        self.assertEqual('a', tag_name)

    def testAllAttrs(self):
        # type: () -> None
        """
        [('key', 'value')] for all
        """
        # closed
        closed = _MakeTagLexer('<a href=foo class="bar" />')
        expected_pairs = [('href', 'foo'), ('class', 'bar')]
        self.assertEqual(expected_pairs, closed.AllAttrsRaw())

        query = _MakeTagLexer('<a href="?foo=1&bar=2" />')
        self.assertEqual([('href', '?foo=1&bar=2')], query.AllAttrsRaw())

    def testEmptyMissingValues(self):
        # type: () -> None
        # equivalent to <button disabled="">
        button = _MakeTagLexer('<button disabled>')
        button_attrs = button.AllAttrsRaw()
        self.assertEqual([('disabled', '')], button_attrs)

        log('slices %s', button.AllAttrsRawSlice())

        para = _MakeTagLexer(
            '''<p double="" single='' empty= value missing empty2=>''')
        para_attrs = para.AllAttrsRaw()
        expected_pairs = [
            ('double', ''),
            ('single', ''),
            # 'empty= value' takes the following word as its value
            ('empty', 'value'),
            ('missing', ''),
            ('empty2', ''),
        ]
        self.assertEqual(expected_pairs, para_attrs)
        # TODO: should have ... (note left unfinished by the original author)
        log('all %s', para_attrs)

        log('slices %s', para.AllAttrsRawSlice())

    def testInvalidTag(self):
        # type: () -> None
        # The stray '!' is expected to raise LexError, either while the lexer
        # is built or while the attributes are read.
        try:
            _MakeTagLexer('<a foo=bar !></a>').AllAttrsRaw()
        except html.LexError as e:
            print(e)
        else:
            self.fail('Expected LexError')
130
131
def _MakeAttrValueLexer(s):
    # type: (str) -> html.AttrValueLexer
    """Build an html.AttrValueLexer over `s`, reset to span the whole string."""
    value_lexer = html.AttrValueLexer(s)
    whole_length = len(s)
    value_lexer.Reset(0, whole_length)
    return value_lexer
137
138
class AttrValueLexerTest(unittest.TestCase):
    """Tests html.AttrValueLexer on an attribute value."""

    def testGood(self):
        # type: () -> None
        value_lexer = _MakeAttrValueLexer('?foo=42&bar=99')
        # Three tokens are expected for this value.
        self.assertEqual(3, value_lexer.NumTokens())
146
147
def Lex(h, no_special_tags=False):
    # type: (str, bool) -> List[Tuple[int, int]]
    """Tokenize `h` with html.ValidTokenList and log every token.

    Args:
      h: the markup to lex.
      no_special_tags: forwarded unchanged to html.ValidTokenList.

    Returns:
      The list of (token id, end position) pairs from html.ValidTokenList.
    """
    print(repr(h))
    tokens = html.ValidTokenList(h, no_special_tags=no_special_tags)

    # Each token starts where the previous one ended.
    prev_end = 0
    for tok_id, end_pos in tokens:
        log('%d %s %r', end_pos, h8_id_str(tok_id), h[prev_end:end_pos])
        prev_end = end_pos
    return tokens
158
159
class LexerTest(unittest.TestCase):
    """Tests the token stream returned by html.ValidTokenList (via Lex).

    Every expected list pairs an h8_id token type with the offset at which
    that token ENDS, so the numbers count every character of the input,
    including leading whitespace.  Multi-line inputs are therefore written as
    explicit string pieces rather than indented triple-quoted literals.
    """

    # IndexLinker in devtools/make_help.py
    #  <pre> sections in doc/html_help.py
    # TocExtractor in devtools/cmark.py

    def testPstrip(self):
        # type: () -> None
        """Remove anything like this.

        <p><pstrip> </pstrip></p>
        """
        pass

    def testCommentParse(self):
        # type: () -> None
        # Smoke test: the fixture must lex without raising.
        Lex(TEST_HTML)

    def testCommentParse2(self):
        # type: () -> None
        h = ('\n'
             '        hi <!-- line 1\n'
             '                line 2 --><br/>')
        tokens = Lex(h)

        self.assertEqual(
            [
                (h8_id.RawData, 12),  # '\n' + 8 spaces + 'hi '
                (h8_id.Comment, 50),  # <!-- line 1 ... line 2 -->
                (h8_id.StartEndTag, 55),  # <br/>
                (h8_id.EndOfStream, 55),
            ],
            tokens)

    def testProcessingInstruction(self):
        # type: () -> None
        # <?xml ?> header
        h = 'hi <? err ?>'
        tokens = Lex(h)

        self.assertEqual(
            [
                (h8_id.RawData, 3),
                (h8_id.Processing, 12),  # <? err ?>
                (h8_id.EndOfStream, 12),
            ],
            tokens)

    def testScriptStyle(self):
        # type: () -> None
        h = ('\n'
             '        hi <script src=""> if (x < 1 && y > 2 ) { console.log(""); }\n'
             '        </script>\n'
             '        ')
        tokens = Lex(h)

        expected = [
            (h8_id.RawData, 12),  # '\n' + 8 spaces + 'hi '
            (h8_id.StartTag, 27),  # <script src="">
            (h8_id.HtmlCData, 78),  # JavaScript code is HTML CData
            (h8_id.EndTag, 87),  # </script>
            (h8_id.RawData, 96),  # '\n' + 8 trailing spaces
            (h8_id.EndOfStream, 96),
        ]
        self.assertEqual(expected, tokens)

        # Test case matching
        tokens = Lex(h.replace('script', 'scrIPT'))
        self.assertEqual(expected, tokens)

    def testScriptStyleXml(self):
        # type: () -> None
        # The entity must be spelled '&lt;': the CharEntity token below is
        # 4 characters wide (19..23) and the stream ends at offset 33.
        h = 'hi <script src=""> &lt; </script>'
        # XML mode
        tokens = Lex(h, no_special_tags=True)

        self.assertEqual(
            [
                (h8_id.RawData, 3),  # 'hi '
                (h8_id.StartTag, 18),  # <script src="">
                (h8_id.RawData, 19),  # space
                (h8_id.CharEntity, 23),  # &lt;
                (h8_id.RawData, 24),  # space
                (h8_id.EndTag, 33),  # </script>
                (h8_id.EndOfStream, 33),
            ],
            tokens)

    def testCData(self):
        # type: () -> None

        # from
        # /home/andy/src/languages/Python-3.11.5/Lib/test/xmltestdata/c14n-20/inC14N4.xml
        h = '<compute><![CDATA[value>"0" && value<"10" ?"valid":"error"]]></compute>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 9),
            (h8_id.CData, 61),
            (h8_id.EndTag, 71),
            (h8_id.EndOfStream, 71),
        ], tokens)

    def testEntity(self):
        # type: () -> None

        # from
        # /home/andy/src/Python-3.12.4/Lib/test/xmltestdata/c14n-20/inC14N5.xml
        h = '&ent1;, &ent2;!'

        tokens = Lex(h)

        self.assertEqual([
            (h8_id.CharEntity, 6),
            (h8_id.RawData, 8),
            (h8_id.CharEntity, 14),
            (h8_id.RawData, 15),
            (h8_id.EndOfStream, 15),
        ], tokens)

    def testStartTag(self):
        # type: () -> None

        h = '<a>hi</a>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 3),
            (h8_id.RawData, 5),
            (h8_id.EndTag, 9),
            (h8_id.EndOfStream, 9),
        ], tokens)

        # Make sure we don't consume too much
        h = '<a><source>1.7</source></a>'

        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 3),
            (h8_id.StartTag, 11),
            (h8_id.RawData, 14),
            (h8_id.EndTag, 23),
            (h8_id.EndTag, 27),
            (h8_id.EndOfStream, 27),
        ], tokens)

        # TODO: add a multi-line case with <source>1.7</source> nested inside
        # <configuration> ... </configuration>.  An earlier draft of it sat
        # here behind a bare `return` and never ran; its expected end offsets
        # went backwards (24, then 9), so they have to be recomputed first.

    def testBad(self):
        # type: () -> None
        h = '&'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.BadAmpersand, 1),
            (h8_id.EndOfStream, 1),
        ], tokens)

        h = '>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.BadGreaterThan, 1),
            (h8_id.EndOfStream, 1),
        ], tokens)

    def testInvalid(self):
        # type: () -> None
        # Every entry of INVALID_LEX must make the lexer raise LexError.
        for s in INVALID_LEX:
            try:
                html.ValidTokenList(s)
            except html.LexError as e:
                print(e)
            else:
                self.fail('Expected LexError %r' % s)

    def testValid(self):
        # type: () -> None
        # Only the inputs are used here; the expected XML is checked elsewhere.
        for s, _ in VALID_LEX:
            Lex(s)
            print()
357
358
# Inputs the lexer must reject with html.LexError.
INVALID_LEX = [
    '<a><',
    '&amp<',
    '&<',
    # Hm > is allowed?
    #'a > b',
    'a < b',
    '<!-- unfinished comment',
    '<? unfinished processing',
    '</div bad=attr> <a> <b>',

    # not allowed, but 3 > 4 is allowed
    '<a> 3 < 4 </a>',
    # Not a CDATA tag
    '<STYLEz><</STYLEz>',
]

# Markers for the "expected XML" slot of the (input, expected) pairs below.
# UNCHANGED means the conversion must return the input verbatim.  SKIP is not
# referenced in this file: entries whose XML output is not checked use ''.
SKIP = 0
UNCHANGED = 1

# (input, expected XML) pairs that must lex without error.
VALID_LEX = [
    # TODO: convert these to XML
    ('<foo></foo>', UNCHANGED),
    ('<foo x=y></foo>', ''),
    #('<foo x="&"></foo>', '<foo x="&amp;"></foo>'),
    ('<foo x="&"></foo>', ''),

    # Allowed with BadAmpersand
    ('<p> x & y </p>', '<p> x &amp; y </p>'),
]

# Inputs that must fail validation with html.ParseError.
INVALID_PARSE = [
    '<a></b>',
    '<a>',  # missing closing tag
    '<meta></meta>',  # this is a self-closing tag
]

# (input, expected XML) pairs that must pass validation.
VALID_PARSE = [
    ('<!DOCTYPE html>\n', ''),
    ('<!DOCTYPE>', ''),

    # empty strings
    ('<p x=""></p>', UNCHANGED),
    ("<p x=''></p>", UNCHANGED),
    ('<self-closing a="b" />', UNCHANGED),

    # We could also normalize CDATA?
    # Note that CDATA has an escaping problem: you need to handle it ]]> with
    # concatenation.  It just "pushes the problem around".
    # So I think it's better to use ONE kind of escaping, which is &lt;
    ('<script><![CDATA[ <wtf> >< ]]></script>', UNCHANGED),

    # allowed, but 3 < 4 is not allowed
    ('<a> 3 > 4 </a>', '<a> 3 &gt; 4 </a>'),
    # allowed, but 3 > 4 is not allowed
    ('<p x="3 < 4"></p>', ''),
    ('<b><a href="foo">link</a></b>', UNCHANGED),

    # TODO: should be self-closing
    #('<meta><a></a>', '<meta/><a></a>'),
    ('<meta><a></a>', ''),

    # no attribute
    ('<button disabled></button>', ''),
    ('<button disabled=></button>', ''),
    ('<button disabled= ></button>', ''),

    # single quoted is pretty common
    ("<a href='single'></a>", ''),

    # Conceding to reality - I used these myself
    ('<a href=ble.sh></a>', ''),
    ('<a href=foo.html></a>', ''),
    ('<foo x="&"></foo>', ''),

    # caps
    ('<foo></FOO>', ''),
    ('<Foo></fOO>', ''),

    # capital VOID tag
    ('<META><a></a>', ''),
    ('<script><</script>', ''),
    # matching
    ('<SCRipt><</SCRipt>', ''),
    ('<SCRIPT><</SCRIPT>', ''),
    ('<STYLE><</STYLE>', ''),
    #'<SCRipt><</script>',

    # Note: Python HTMLParser.py does DYNAMIC compilation of regex with re.I
    # flag to handle this!  Gah I want something faster.
    #'<script><</SCRIPT>',

    # TODO: Test <svg> and <math> ?
]

# Inputs that must pass validation when NO_SPECIAL_TAGS is also set.
# Note that '<meta></meta>' is listed in INVALID_PARSE for the default mode.
VALID_XML = [
    '<meta></meta>',
]

# Inputs that lex as a document but must still fail validation with
# html.LexError.
INVALID_TAG_LEX = [
    # not allowed, but 3 < 4 is allowed
    '<p x="3 > 4"></p>',
    # same thing
    '<a href=">"></a>',
    '<a foo=bar !></a>',  # bad attr
]
465
466
class ValidateTest(unittest.TestCase):
    """Tests html.Validate against the module-level valid/invalid inputs."""

    def testInvalid(self):
        # type: () -> None
        counters = html.Counters()

        # Lexical errors, at the document level and inside a tag.
        for s in INVALID_LEX + INVALID_TAG_LEX:
            try:
                html.Validate(s, html.BALANCED_TAGS, counters)
            except html.LexError as e:
                print(e)
            else:
                self.fail('Expected LexError %r' % s)

        # Inputs that lex fine but are not well-formed.
        for s in INVALID_PARSE:
            try:
                html.Validate(s, html.BALANCED_TAGS, counters)
            except html.ParseError as e:
                print(e)
            else:
                # Name the offending input, as the LexError branch does.
                self.fail('Expected ParseError %r' % s)

    def testValid(self):
        # type: () -> None
        counters = html.Counters()
        # Only the inputs matter here; passing means Validate did not raise.
        for s, _ in VALID_PARSE:
            html.Validate(s, html.BALANCED_TAGS, counters)
            print('HTML5 %r' % s)
        #print('HTML5 attrs %r' % counters.debug_attrs)

    def testValidXml(self):
        # type: () -> None
        counters = html.Counters()
        # The two flags are combined with bitwise OR.
        flags = html.BALANCED_TAGS | html.NO_SPECIAL_TAGS
        for s in VALID_XML:
            html.Validate(s, flags, counters)
            print('XML %r' % s)
        #print('XML attrs %r' % counters.debug_attrs)
504
505
class XmlTest(unittest.TestCase):
    """Tests html.ToXml against the expected-XML slot of the valid inputs."""

    def testValid(self):
        # type: () -> None
        for h, expected_xml in VALID_LEX + VALID_PARSE:
            # Convert every input, including skipped ones, so a crash in
            # ToXml is still caught.
            actual = html.ToXml(h)
            if expected_xml == UNCHANGED:  # Unchanged
                self.assertEqual(h, actual)
            elif expected_xml == '':  # Skip: output is not checked
                pass
            else:
                self.assertEqual(expected_xml, actual)
519
520
# Run every TestCase in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()