lazylex/html

OILS / lazylex / html_test.py View on Github | oils.pub

376 lines, 235 significant

1	#!/usr/bin/env python2
2	from __future__ import print_function
3
4	import unittest
5
6	from lazylex import html # module under test log = html.log
7	from doctools.util import log
8
9
class FunctionsTest(unittest.TestCase):
    """Tests for module-level functions of lazylex.html."""

    def testToText(self):
        # type: () -> None
        # ToText() on a single <b> element is expected to give back the text
        # between the start and end tags.
        #
        # NOTE(review): the input contains a bare '<' and bare '&', while
        # INVALID_LEX in this file lists 'a < b' as a lex error.  The literal
        # may have lost character entities (e.g. &lt;, &amp;) somewhere along
        # the way -- confirm against the upstream file.
        markup = '<b name="&"> three < four && five </b>'
        self.assertEqual(' three < four && five ', html.ToText(markup))
16
17
def _MakeTagLexer(s):
    # type: (str) -> html.TagLexer
    """Return an html.TagLexer for `s`, Reset() to the range [0, len(s))."""
    tag_lexer = html.TagLexer(s)
    tag_lexer.Reset(0, len(s))
    return tag_lexer
23
24
def _PrintTokens(lex):
    # type: (html.TagLexer) -> None
    """Log the tag name of `lex`, then each token with the text it spans."""
    log('')
    log('tag = %r', lex.GetTagName())
    for tok_id, begin, stop in lex.Tokens():
        # Tokens() yields offsets into lex.s; slice out the covered text.
        covered = lex.s[begin:stop]
        log('%s %r', tok_id, covered)
31
32
class TagLexerTest(unittest.TestCase):
    """Tests for html.TagLexer: tag name and raw attribute access."""

    def testTagName_DEPRECATED(self):
        # type: () -> None
        self_closing = _MakeTagLexer('<a href=foo class="bar" />')
        self.assertEqual('a', self_closing.GetTagName())

    def testGetAttrRaw(self):
        # type: () -> None
        # An attribute that is not present gives None.
        no_attrs = _MakeTagLexer('<a>')
        _PrintTokens(no_attrs)
        self.assertEqual(None, no_attrs.GetAttrRaw('oops'))

        # <a novalue> means lex.GetAttrRaw('novalue') == ''
        # https://developer.mozilla.org/en-US/docs/Web/API/Element/hasAttribute
        # We are not distinguishing <a novalue=""> from <a novalue> in this API
        bare_name = _MakeTagLexer('<a novalue>')
        _PrintTokens(bare_name)
        self.assertEqual('', bare_name.GetAttrRaw('novalue'))

        # Double-quoted value: the quotes are not part of the raw value.
        quoted = _MakeTagLexer('<a href="double quoted">')
        _PrintTokens(quoted)
        self.assertEqual('double quoted', quoted.GetAttrRaw('href'))
        self.assertEqual(None, quoted.GetAttrRaw('oops'))

        # Second attribute after an unquoted one, open tag ...
        open_tag = _MakeTagLexer('<a href=foo class="bar">')
        _PrintTokens(open_tag)
        self.assertEqual('bar', open_tag.GetAttrRaw('class'))

        # ... and the same attributes on a self-closing tag.
        closed_tag = _MakeTagLexer('<a href=foo class="bar" />')
        _PrintTokens(closed_tag)
        self.assertEqual('bar', closed_tag.GetAttrRaw('class'))

        # '&' inside the value comes back unchanged (this is the *raw* value).
        query = _MakeTagLexer('<a href="?foo=1&bar=2" />')
        self.assertEqual('?foo=1&bar=2', query.GetAttrRaw('href'))

    def testAllAttrs(self):
        # type: () -> None
        """
        [('key', 'value')] for all
        """
        # closed
        closed_tag = _MakeTagLexer('<a href=foo class="bar" />')
        expected_pairs = [('href', 'foo'), ('class', 'bar')]
        self.assertEqual(expected_pairs, closed_tag.AllAttrsRaw())

        query = _MakeTagLexer('<a href="?foo=1&bar=2" />')
        self.assertEqual([('href', '?foo=1&bar=2')], query.AllAttrsRaw())

    def testEmptyMissingValues(self):
        # type: () -> None
        # equivalent to <button disabled="">
        button = _MakeTagLexer('<button disabled>')
        self.assertEqual([('disabled', '')], button.AllAttrsRaw())
        log('slices %s', button.AllAttrsRawSlice())

        # Empty double-quoted, empty single-quoted, 'name= value' with a space
        # after '=', a bare name, and a trailing 'name=' with nothing after it.
        para = _MakeTagLexer(
            '''<p double="" single='' empty= value missing empty2=>''')
        pairs = para.AllAttrsRaw()
        expected_pairs = [
            ('double', ''),
            ('single', ''),
            ('empty', 'value'),
            ('missing', ''),
            ('empty2', ''),
        ]
        self.assertEqual(expected_pairs, pairs)
        # TODO: should have
        # (the TODO above is cut off in the original; its intent is unknown)
        log('all %s', pairs)
        log('slices %s', para.AllAttrsRawSlice())

    def testInvalidTag(self):
        # type: () -> None
        # The stray '!' must make lexing the attributes raise LexError.
        try:
            _MakeTagLexer('<a foo=bar !></a>').AllAttrsRaw()
        except html.LexError as e:
            print(e)
            return
        self.fail('Expected LexError')
118
119
def _MakeAttrValueLexer(s):
    # type: (str) -> html.AttrValueLexer
    """Return an html.AttrValueLexer for `s`, Reset() to [0, len(s))."""
    value_lexer = html.AttrValueLexer(s)
    value_lexer.Reset(0, len(s))
    return value_lexer
125
126
class AttrValueLexerTest(unittest.TestCase):
    """Tests for html.AttrValueLexer."""

    def testGood(self):
        # type: () -> None
        # This query-string value is expected to lex into exactly 3 tokens.
        value_lexer = _MakeAttrValueLexer('?foo=42&bar=99')
        self.assertEqual(3, value_lexer.NumTokens())
134
135
class LexerTest(unittest.TestCase):
    """Feeds INVALID_LEX / VALID_LEX to helpers from data_lang.htm8_test."""

    def testInvalid(self):
        # type: () -> None
        # Imported inside the test, as in the original, rather than at
        # module level.
        from data_lang.htm8_test import ValidTokenList
        for bad_input in INVALID_LEX:
            try:
                ValidTokenList(bad_input)
            except html.LexError as e:
                print(e)
                continue
            # Reaching this point means no LexError was raised.
            self.fail('Expected LexError %r' % bad_input)

    def testValid(self):
        # type: () -> None

        from data_lang.htm8_test import Lex

        # Only the input column is used here; the expected-XML column is
        # consumed by XmlTest.
        for good_input, _ in VALID_LEX:
            Lex(good_input)
            print()
157
158
# Inputs that the tests in this file expect to fail with html.LexError.
INVALID_LEX = [
    '< >',
    '<a><',
    '&amp<',
    '&<',
    # Hm > is allowed?
    #'a > b',
    'a < b',
    '<!-- unfinished comment',
    '<? unfinished processing',
    '</div bad=attr> <a> <b>',

    # not allowed, but 3 > 4 is allowed
    '<a> 3 < 4 </a>',
    # Not a CDATA tag
    '<STYLEz><</STYLEz>',
]
176
# Markers for the "expected XML" column of the (input, expected) tables.
# NOTE: SKIP is not referenced anywhere in this file; skipped entries are
# written as '' and XmlTest compares against '' directly.
SKIP = 0
UNCHANGED = 1

# (input, expected XML) pairs that must lex without error.  The second item is
# UNCHANGED (output equals input), '' (no expectation yet), or the exact
# expected output.
VALID_LEX = [
    # TODO: convert these to XML
    ('<foo></foo>', UNCHANGED),
    ('<foo x=y></foo>', ''),
    #('<foo x="&"></foo>', '<foo x="&"></foo>'),
    ('<foo x="&"></foo>', ''),

    # Allowed with BadAmpersand
    # NOTE(review): input and expected output are identical here, yet the
    # entry does not use UNCHANGED -- the expected string may have lost an
    # entity such as &amp; in extraction.  Confirm against upstream.
    ('<p> x & y </p>', '<p> x & y </p>'),

    # No ambiguity
    ('<img src=/ >', ''),
    ('<img src="/">', ''),
    ('<img src=foo/ >', ''),
]
195
# Inputs that the tests in this file expect to fail with html.ParseError.
INVALID_PARSE = [
    '<a></b>',
    '<a>',  # missing closing tag
    '<meta></meta>',  # this is a self-closing tag
]
201
# Unquoted attribute values that run into '/>'.  ValidateTest.testInvalid
# expects html.Validate() to raise LexError for each of these.
INVALID_ATTR_LEX = [
    # Ambiguous, should be ""
    '<img src=/>',
    '<img src= />',
    '<img src=foo/>',
    '<img src= foo/>',
]
209
# (input, expected XML) pairs that must validate as balanced HTML.  The second
# item is UNCHANGED (output equals input), '' (no expectation yet), or the
# exact expected output.
VALID_PARSE = [
    ('<!DOCTYPE html>\n', ''),
    ('<!DOCTYPE>', ''),

    # empty strings
    ('<p x=""></p>', UNCHANGED),
    ("<p x=''></p>", UNCHANGED),
    ('<self-closing a="b" />', UNCHANGED),

    # We could also normalize CDATA?
    # Note that CDATA has an escaping problem: you need to handle it ]]> with
    # concatenation.  It just "pushes the problem around".
    # So I think it's better to use ONE kind of escaping, which is <
    ('<script><![CDATA[ <wtf> >< ]]></script>', UNCHANGED),

    # allowed, but 3 < 4 is not allowed
    # NOTE(review): input and expected output are identical here, yet the
    # entry does not use UNCHANGED -- the expected string may have lost an
    # entity such as &gt; in extraction.  Confirm against upstream.
    ('<a> 3 > 4 </a>', '<a> 3 > 4 </a>'),
    # allowed, but 3 > 4 is not allowed
    ('<p x="3 < 4"></p>', ''),
    ('<b><a href="foo">link</a></b>', UNCHANGED),

    # TODO: should be self-closing
    #('<meta><a></a>', '<meta/><a></a>'),
    ('<meta><a></a>', ''),

    # no attribute
    ('<button disabled></button>', ''),
    ('<button disabled=></button>', ''),
    ('<button disabled= ></button>', ''),

    # single quoted is pretty common
    ("<a href='single'></a>", ''),

    # Conceding to reality - I used these myself
    ('<a href=ble.sh></a>', ''),
    ('<a href=foo.html></a>', ''),
    ('<foo x="&"></foo>', ''),

    # caps
    ('<foo></FOO>', ''),
    ('<Foo></fOO>', ''),

    # capital VOID tag
    ('<META><a></a>', ''),
    ('<script><</script>', ''),
    # matching
    ('<SCRipt><</SCRipt>', ''),
    ('<SCRIPT><</SCRIPT>', ''),
    ('<STYLE><</STYLE>', ''),
    #'<SCRipt><</script>',

    # Regression test from blog
    ('<script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>',
     ''),

    # Note: Python HTMLParser.py does DYNAMIC compilation of regex with re.I
    # flag to handle this!  Gah I want something faster.
    #'<script><</SCRIPT>',

    # TODO: Test <svg> and <math> ?
]
271
# Inputs validated with BALANCED_TAGS | NO_SPECIAL_TAGS.  '<meta></meta>' also
# appears in INVALID_PARSE, where it is validated with BALANCED_TAGS alone.
VALID_XML = [
    '<meta></meta>',
]
275
# Tags whose attributes the tests expect to be rejected with html.LexError.
INVALID_TAG_LEX = [
    # not allowed, but 3 < 4 is allowed
    '<p x="3 > 4"></p>',
    # same thing
    '<a href=">"></a>',
    '<a foo=bar !></a>',  # bad attr
]
283
284
class ValidateTest(unittest.TestCase):
    """Runs html.ValidateOld() and html.Validate() over the input tables.

    Every input in the INVALID_* lists must raise the named exception; every
    input in the VALID_* lists must validate without raising.  Each test
    shares one html.Counters() instance across all of its inputs.
    """

    def testInvalidOld(self):
        # type: () -> None
        counters = html.Counters()
        for s in INVALID_LEX + INVALID_TAG_LEX:
            try:
                html.ValidateOld(s, html.BALANCED_TAGS, counters)
            except html.LexError as e:
                print(e)
            else:
                self.fail('Expected LexError %r' % s)

        for s in INVALID_PARSE:
            try:
                html.ValidateOld(s, html.BALANCED_TAGS, counters)
            except html.ParseError as e:
                print(e)
            else:
                # Name the offending input, like the LexError case above.
                self.fail('Expected ParseError %r' % s)

    def testValidOld(self):
        # type: () -> None
        counters = html.Counters()
        for s, _ in VALID_PARSE:
            html.ValidateOld(s, html.BALANCED_TAGS, counters)
            print('HTML5 %r' % s)
        #print('HTML5 attrs %r' % counters.debug_attrs)

    def testValidXmlOld(self):
        # type: () -> None
        counters = html.Counters()
        # The two flags are combined with bitwise OR.
        xml_flags = html.BALANCED_TAGS | html.NO_SPECIAL_TAGS
        for s in VALID_XML:
            html.ValidateOld(s, xml_flags, counters)
            print('XML %r' % s)
        #print('XML attrs %r' % counters.debug_attrs)

    def testInvalid(self):
        # type: () -> None
        counters = html.Counters()
        # Unlike testInvalidOld, this also covers INVALID_ATTR_LEX.
        for s in INVALID_LEX + INVALID_TAG_LEX + INVALID_ATTR_LEX:
            try:
                html.Validate(s, html.BALANCED_TAGS, counters)
            except html.LexError as e:
                print(e)
            else:
                self.fail('Expected LexError %r' % s)

        for s in INVALID_PARSE:
            try:
                html.Validate(s, html.BALANCED_TAGS, counters)
            except html.ParseError as e:
                print(e)
            else:
                # Name the offending input, like the LexError case above.
                self.fail('Expected ParseError %r' % s)

    def testValid(self):
        # type: () -> None
        counters = html.Counters()
        for s, _ in VALID_PARSE:
            # Print first, so the input is visible if validation raises.
            print('HTML5 %r' % s)
            html.Validate(s, html.BALANCED_TAGS, counters)
        #print('HTML5 attrs %r' % counters.debug_attrs)

    def testValidXml(self):
        # type: () -> None
        counters = html.Counters()
        # The two flags are combined with bitwise OR.
        xml_flags = html.BALANCED_TAGS | html.NO_SPECIAL_TAGS
        for s in VALID_XML:
            print('XML %r' % s)
            html.Validate(s, xml_flags, counters)
        #print('XML attrs %r' % counters.debug_attrs)
358
359
class XmlTest(unittest.TestCase):
    """Checks html.ToXml() against the expected column of the VALID_* tables."""

    def testValid(self):
        # type: () -> None
        for h, expected_xml in VALID_LEX + VALID_PARSE:
            # Convert every input, including skipped ones, so a conversion
            # that raises still fails the test.
            actual = html.ToXml(h)

            if expected_xml == '':
                # '' marks an entry with no expected output yet: skip it.
                continue

            if expected_xml == UNCHANGED:
                # Output must be identical to the input.
                self.assertEqual(h, actual)
            else:
                self.assertEqual(expected_xml, actual)
373
374
# Run all tests in this module when it is executed as a script.
if __name__ == '__main__':
    unittest.main()