lazylex/html

OILS / lazylex / html_test.py View on Github | oils.pub

425 lines, 275 significant

1	#!/usr/bin/env python2
2	from __future__ import print_function
3
4	import unittest
5
6	from lazylex import html # module under test log = html.log
7
# Shorthand for the log function of the module under test; it is called below
# with a format string followed by its arguments.
log = html.log

# Shared fixture lexed by LexerTest.testCommentParse.  The path is relative,
# so it is resolved against the current working directory at import time.
with open('lazylex/testdata.html') as f:
    TEST_HTML = f.read()
12
13
class RegexTest(unittest.TestCase):
    """Pins down the `re` behaviors around newlines that an HTML lexer relies on."""

    def testDotAll(self):
        """Check how '.', re.DOTALL and character classes treat a newline.

        Each result is still printed for eyeballing, and is also asserted so
        that the test fails if the assumption ever stops holding.
        """
        import re

        # Note: '$' also matches just before a trailing newline, not only at
        # the very end of the string.

        # Without flags, '.' matches anything except a newline.
        m1 = re.compile(r'.').match('\n')
        print(m1)
        self.assertIsNone(m1)

        # With DOTALL, '.' matches a newline too.
        m2 = re.compile(r'.', re.DOTALL).match('\n')
        print(m2)
        self.assertIsNotNone(m2)
        self.assertEqual('\n', m2.group(0))

        # Inside a character class '.' is a literal dot; the class matches the
        # newline only because of the explicit \n.
        #p3 = re.compile(r'[.\n]', re.VERBOSE)
        p3 = re.compile(r'[.\n]')
        m3 = p3.match('\n')
        print(m3)
        self.assertIsNotNone(m3)
        self.assertIsNotNone(p3.match('.'))
        self.assertIsNone(p3.match('a'))

        print('Negation')

        # A negated class matches a newline, so [^>] can span lines.
        p4 = re.compile(r'[^>]')
        m4 = p4.match('\n')
        print(m4)
        self.assertIsNotNone(m4)
        self.assertIsNone(p4.match('>'))
34
35
class FunctionsTest(unittest.TestCase):
    """Smoke tests for free functions in the html module."""

    def testFindLineNum(self):
        """Print what html.FindLineNum() reports for several offsets."""
        text = 'foo\n' * 3
        # The text is 12 characters long, so the last offset is out of bounds.
        offsets = [1, 5, 10, 50]
        for offset in offsets:
            print(html.FindLineNum(text, offset))
43
44
def _MakeTagLexer(s):
    """Return an html.TagLexer for `s`, reset to cover the whole string."""
    tag_lexer = html.TagLexer(s)
    tag_lexer.Reset(0, len(s))
    return tag_lexer
49
50
def _PrintTokens(lex):
    """Log the tag name of `lex`, then every token next to the text it spans."""
    log('')
    log('tag = %r', lex.TagName())
    for tok_id, begin, finish in lex.Tokens():
        covered = lex.s[begin:finish]
        log('%s %r', tok_id, covered)
56
57
class TagLexerTest(unittest.TestCase):
    """Exercises html.TagLexer on single tags: tokens, tag name, attributes."""

    def testTagLexer(self):
        """Lex several tags, logging their tokens and checking GetAttrRaw()."""
        # '< >' is invalid, so it is intentionally not lexed here.

        bare = _MakeTagLexer('<a>')
        _PrintTokens(bare)

        no_value = _MakeTagLexer('<a novalue>')
        _PrintTokens(no_value)

        # Note: we could have a different HasAttr() method.
        # For <a novalue>, looking up 'novalue' yields None.
        # https://developer.mozilla.org/en-US/docs/Web/API/Element/hasAttribute
        self.assertEqual(None, no_value.GetAttrRaw('novalue'))

        quoted = _MakeTagLexer('<a href="double quoted">')
        _PrintTokens(quoted)

        # The surrounding quotes are not part of the raw value, and a missing
        # attribute reads back as None.
        self.assertEqual('double quoted', quoted.GetAttrRaw('href'))
        self.assertEqual(None, quoted.GetAttrRaw('oops'))

        # Unquoted next to quoted values, with and without a trailing slash.
        for tag in ('<a href=foo class="bar">', '<a href=foo class="bar" />'):
            _PrintTokens(_MakeTagLexer(tag))

        query = _MakeTagLexer('<a href="?foo=1&bar=2" />')
        self.assertEqual('?foo=1&bar=2', query.GetAttrRaw('href'))

    def testTagName(self):
        """TagName() returns just the element name."""
        tag_lexer = _MakeTagLexer('<a href=foo class="bar" />')
        self.assertEqual('a', tag_lexer.TagName())

    def testAllAttrs(self):
        """AllAttrsRaw() returns [(name, raw value), ...] in source order."""
        # closed
        closed = _MakeTagLexer('<a href=foo class="bar" />')
        expected = [('href', 'foo'), ('class', 'bar')]
        self.assertEqual(expected, closed.AllAttrsRaw())

        query = _MakeTagLexer('<a href="?foo=1&bar=2" />')
        self.assertEqual([('href', '?foo=1&bar=2')], query.AllAttrsRaw())

    def testAttrWithoutValue(self):
        """A valueless attribute is accepted; a stray '!' raises LexError."""
        # equivalent to <button disabled="">
        log('all %s', _MakeTagLexer('<button disabled>').AllAttrsRaw())

        try:
            _MakeTagLexer('<a foo=bar !></a>').AllAttrsRaw()
        except html.LexError as e:
            print(e)
        else:
            self.fail('Expected LexError')
120
121
def _MakeAttrValueLexer(s):
    """Return an html.AttrValueLexer for `s`, reset to cover the whole string."""
    value_lexer = html.AttrValueLexer(s)
    value_lexer.Reset(0, len(s))
    return value_lexer
126
127
class AttrValueLexerTest(unittest.TestCase):
    """Exercises html.AttrValueLexer on the text of an attribute value."""

    def testGood(self):
        """A query-string style value is split into three tokens."""
        # NOTE(review): three tokens suggests the separator was meant to be
        # the entity '&amp;' rather than a bare '&' -- confirm against the
        # upstream file before relying on this input.
        value_lexer = _MakeAttrValueLexer('?foo=42&bar=99')
        self.assertEqual(3, value_lexer.NumTokens())
134
135
def Lex(h, no_special_tags=False):
    """Tokenize `h`, log every token with its text, and return the token list.

    Args:
        h: the HTML (or XML) string to lex.
        no_special_tags: forwarded unchanged to html.ValidTokenList().

    Returns:
        Whatever html.ValidTokenList() returned; it is iterated here as
        (token id, end offset) pairs.
    """
    print(repr(h))
    tokens = html.ValidTokenList(h, no_special_tags=no_special_tags)

    # Each pair only records where a token ends, so remember the previous end
    # in order to slice out the text the token covers.
    prev_end = 0
    for tok_id, end_pos in tokens:
        text = h[prev_end:end_pos]
        name = html.TokenName(tok_id)
        log('%d %s %r', end_pos, name, text)
        prev_end = end_pos
    return tokens
145
146
class LexerTest(unittest.TestCase):
    """Checks the (token id, end offset) lists produced for whole documents.

    Every expected list holds the offset at which each token ENDS, so the
    final entry before EndOfStream always equals len(h).
    """

    # IndexLinker in devtools/make_help.py
    #  <pre> sections in doc/html_help.py
    # TocExtractor in devtools/cmark.py

    def testPstrip(self):
        """Remove anything like this.

        <p><pstrip> </pstrip></p>
        """
        pass

    def testCommentParse(self):
        """Smoke test: the shared TEST_HTML fixture lexes without raising."""
        Lex(TEST_HTML)

    def testCommentParse2(self):
        """A comment may span several lines and is a single token."""
        Tok = html.Tok

        # The whitespace inside this literal is significant: the offsets below
        # assume 8 spaces before 'hi' and 16 spaces before 'line 2'.
        h = '''
        hi <!-- line 1
                line 2 --><br/>'''
        tokens = Lex(h)

        self.assertEqual(
            [
                (Tok.RawData, 12),  # newline, indentation, 'hi '
                (Tok.Comment, 50),  # <!-- line 1 ... line 2 -->
                (Tok.StartEndTag, 55),  # <br/>
                (Tok.EndOfStream, 55),
            ],
            tokens)

    def testProcessingInstruction(self):
        """<? ... ?> is lexed as a single Processing token."""
        # <?xml ?> header
        Tok = html.Tok
        h = 'hi <? err ?>'
        tokens = Lex(h)

        self.assertEqual(
            [
                (Tok.RawData, 3),
                (Tok.Processing, 12),  # <? err ?>
                (Tok.EndOfStream, 12),
            ],
            tokens)

    def testScriptStyle(self):
        """By default the body of <script> is one HtmlCData token."""
        Tok = html.Tok

        # The whitespace inside this literal is significant: the offsets below
        # assume every continuation line is indented by 8 spaces.
        h = '''
        hi <script src=""> if (x < 1 && y > 2 ) { console.log(""); }
        </script>
        '''
        tokens = Lex(h)

        self.assertEqual(
            [
                (Tok.RawData, 12),
                (Tok.StartTag, 27),  # <script src="">
                (Tok.HtmlCData, 78),  # JavaScript code is HTML CData
                (Tok.EndTag, 87),  # </script>
                (Tok.RawData, 96),  # \n and trailing indentation
                (Tok.EndOfStream, 96),
            ],
            tokens)

    def testScriptStyleXml(self):
        """With no_special_tags=True the <script> body is lexed as markup."""
        Tok = html.Tok

        # The body must hold the 4-character entity '&lt;': the expected
        # offsets end at 33 == len(h) and include a CharEntity from 19 to 23.
        h = 'hi <script src=""> &lt; </script>'
        # XML mode
        tokens = Lex(h, no_special_tags=True)

        self.assertEqual(
            [
                (Tok.RawData, 3),  # 'hi '
                (Tok.StartTag, 18),  # <script src="">
                (Tok.RawData, 19),  # space
                (Tok.CharEntity, 23),  # &lt;
                (Tok.RawData, 24),  # space
                (Tok.EndTag, 33),  # </script>
                (Tok.EndOfStream, 33),
            ],
            tokens)

    def testCData(self):
        """A <![CDATA[ ... ]]> section is a single CData token."""
        Tok = html.Tok

        # from
        # /home/andy/src/languages/Python-3.11.5/Lib/test/xmltestdata/c14n-20/inC14N4.xml
        h = '<compute><![CDATA[value>"0" && value<"10" ?"valid":"error"]]></compute>'
        tokens = Lex(h)

        self.assertEqual([
            (Tok.StartTag, 9),
            (Tok.CData, 61),
            (Tok.EndTag, 71),
            (Tok.EndOfStream, 71),
        ], tokens)

    def testEntity(self):
        """Named entities are CharEntity tokens; the text between is RawData."""
        Tok = html.Tok

        # from
        # /home/andy/src/Python-3.12.4/Lib/test/xmltestdata/c14n-20/inC14N5.xml
        h = '&ent1;, &ent2;!'

        tokens = Lex(h)

        self.assertEqual([
            (Tok.CharEntity, 6),
            (Tok.RawData, 8),
            (Tok.CharEntity, 14),
            (Tok.RawData, 15),
            (Tok.EndOfStream, 15),
        ], tokens)

    def testStartTag(self):
        """Start and end tags are separate tokens, including when nested."""
        Tok = html.Tok

        h = '<a>hi</a>'
        tokens = Lex(h)

        self.assertEqual([
            (Tok.StartTag, 3),
            (Tok.RawData, 5),
            (Tok.EndTag, 9),
            (Tok.EndOfStream, 9),
        ], tokens)

        # Make sure we don't consume too much
        h = '<a><source>1.7</source></a>'

        tokens = Lex(h)

        self.assertEqual([
            (Tok.StartTag, 3),
            (Tok.StartTag, 11),
            (Tok.RawData, 14),
            (Tok.EndTag, 23),
            (Tok.EndTag, 27),
            (Tok.EndOfStream, 27),
        ], tokens)

        # NOTE(review): everything after this return is unreachable.  The
        # expected list below looks like a placeholder (its offsets go
        # backwards), so fill it in before removing the return.
        return

        h = '''
        <configuration>
          <source>1.7</source>
        </configuration>'''

        tokens = Lex(h)

        self.assertEqual([
            (Tok.RawData, 9),
            (Tok.StartTag, 24),
            (Tok.RawData, 9),
            (Tok.EndOfStream, 9),
        ], tokens)

    def testInvalid(self):
        """Every string in INVALID_LEX makes ValidTokenList() raise LexError."""
        for s in INVALID_LEX:
            try:
                html.ValidTokenList(s)
            except html.LexError as e:
                print(e)
            else:
                self.fail('Expected LexError %r' % s)
317
318
# Inputs for which html.ValidTokenList() (LexerTest.testInvalid) and
# html.Validate() (ValidateTest.testInvalid) are expected to raise
# html.LexError.
INVALID_LEX = [
    # A bare '&' should be written as the entity &amp;
    '<a>&',
    '&amp',  # not finished: no terminating ';'
    '&#',  # not finished
    # Hm > is allowed?
    #'a > b',
    'a < b',
    '<!-- unfinished comment',
    '<? unfinished processing',
    '</div bad=attr> <a> <b>',

    # not allowed, but 3 > 4 is allowed (see VALID_PARSE)
    '<a> 3 < 4 </a>',
]
334
# Inputs for which html.Validate(s, html.BALANCED_TAGS, ...) is expected to
# raise html.ParseError (see ValidateTest.testInvalid).
INVALID_PARSE = [
    '<a></b>',  # closing tag does not match the open tag
    '<a>',  # missing closing tag
    # this is a self-closing tag; the same input is listed in VALID_XML
    '<meta></meta>',
]
340
# Inputs that html.Validate(s, html.BALANCED_TAGS, ...) is expected to accept
# without raising (see ValidateTest.testValid).
VALID_PARSE = [
    '<!DOCTYPE html>\n',
    '<!DOCTYPE>',

    # empty strings as attribute values
    '<p x=""></p>',
    "<p x=''></p>",

    # In text, '>' is allowed, but 3 < 4 is not allowed (see INVALID_LEX)
    '<a> 3 > 4 </a>',
    # In an attribute value, '<' is allowed, but 3 > 4 is not allowed
    # (see INVALID_TAG_LEX)
    '<p x="3 < 4"></p>',
    '<b><a href="foo">link</a></b>',
    # <meta> needs no closing tag
    '<meta><a></a>',
    # attribute with no value, or with '=' but nothing after it
    '<button disabled></button>',
    '<button disabled=></button>',
    '<button disabled= ></button>',

    # single quoted is pretty common
    "<a href='single'></a>",

    # Conceding to reality - I used these myself
    '<a href=ble.sh></a>',
    '<a href=foo.html></a>',

    # TODO: capitalization should be allowed
    #'<META><a></a>',

    # TODO: Test <svg> and <math> ?
]
372
# Inputs that are expected to validate when html.NO_SPECIAL_TAGS is added to
# the flags (see ValidateTest.testValidXml).
VALID_XML = [
    # Listed in INVALID_PARSE for the default flags.
    '<meta></meta>',
]
376
# Inputs for which html.Validate() is expected to raise html.LexError.  Unlike
# INVALID_LEX, these are not fed to html.ValidTokenList() by
# LexerTest.testInvalid.
INVALID_TAG_LEX = [
    # not allowed, but 3 < 4 is allowed (see VALID_PARSE)
    '<p x="3 > 4"></p>',
    '<a foo=bar !></a>',  # bad attr

    # should be escaped
    #'<a href="&"></a>',
    #'<a href=">"></a>',
]
386
387
class ValidateTest(unittest.TestCase):
    """Runs html.Validate() over the module-level valid/invalid input tables."""

    def testInvalid(self):
        """Invalid inputs raise LexError or ParseError, depending on the table."""
        counters = html.Counters()

        # Lexing problems, both at document level and inside a tag.
        for s in INVALID_LEX + INVALID_TAG_LEX:
            try:
                html.Validate(s, html.BALANCED_TAGS, counters)
            except html.LexError as e:
                print(e)
            else:
                self.fail('Expected LexError %r' % s)

        # Inputs expected to fail the balanced-tags check instead.
        for s in INVALID_PARSE:
            try:
                html.Validate(s, html.BALANCED_TAGS, counters)
            except html.ParseError as e:
                print(e)
            else:
                # Name the offending input, like the LexError case above.
                self.fail('Expected ParseError %r' % s)

    def testValid(self):
        """Every VALID_PARSE input validates with BALANCED_TAGS."""
        counters = html.Counters()
        for s in VALID_PARSE:
            html.Validate(s, html.BALANCED_TAGS, counters)
            print('HTML5 %r' % s)
            print('HTML5 attrs %r' % counters.debug_attrs)

    def testValidXml(self):
        """Every VALID_XML input validates once NO_SPECIAL_TAGS is added."""
        counters = html.Counters()
        # The two flags are combined with bitwise OR.
        flags = html.BALANCED_TAGS | html.NO_SPECIAL_TAGS
        for s in VALID_XML:
            html.Validate(s, flags, counters)
            print('XML %r' % s)
            print('XML attrs %r' % counters.debug_attrs)
422
423
# Run the tests in this file when it is executed as a script.
if __name__ == '__main__':
    unittest.main()