lazylex/html

OILS / lazylex / html_test.py View on Github | oils.pub

357 lines, 217 significant

1	#!/usr/bin/env python2
2	from __future__ import print_function
3
4	import re
5	import unittest
6
7	from lazylex import html # module under test log = html.log
8	from doctools.util import log
9
10
class RegexTest(unittest.TestCase):
    """Sanity checks on stdlib `re` newline matching, plus a smoke test of
    html._ATTR_RE.

    The match results are still printed for eyeballing, and are now also
    asserted so that a behaviour change fails the test instead of passing
    silently.
    """

    def testDotAll(self):
        # type: () -> None
        """Check which patterns match a lone newline character."""
        # Aside kept from the original author: `$` can match just before a
        # trailing newline, not only at the very end of the string.

        # Without re.DOTALL, '.' does not match a newline.
        p1 = re.compile(r'.')
        m1 = p1.match('\n')
        print(m1)
        self.assertEqual(None, m1)

        # With re.DOTALL, '.' matches any character, newline included.
        p2 = re.compile(r'.', re.DOTALL)
        m2 = p2.match('\n')
        print(m2)
        self.assertTrue(m2 is not None, "'.' with DOTALL should match newline")
        self.assertEqual('\n', m2.group(0))

        # Inside a character class '.' is a literal dot; the class matches the
        # newline only because '\n' is listed explicitly.
        p3 = re.compile(r'[.\n]')
        m3 = p3.match('\n')
        print(m3)
        self.assertTrue(m3 is not None, r"'[.\n]' should match newline")
        self.assertEqual('\n', m3.group(0))

        print('Negation')

        # A negated class matches a newline unless the newline is excluded.
        p4 = re.compile(r'[^>]')
        m4 = p4.match('\n')
        print(m4)
        self.assertTrue(m4 is not None, "'[^>]' should match newline")
        self.assertEqual('\n', m4.group(0))

    def testAttrRe(self):
        # type: () -> None
        """Smoke test: html._ATTR_RE matches an attribute with a spaced value."""
        _ATTR_RE = html._ATTR_RE
        m = _ATTR_RE.match(' empty= val')
        # Fail with a clear message rather than an AttributeError on None.
        self.assertTrue(m is not None, "_ATTR_RE should match ' empty= val'")
        print(m.groups())
37
38
class FunctionsTest(unittest.TestCase):
    """Tests for free functions exposed by the html module."""

    def testToText(self):
        # type: () -> None
        """html.ToText() returns the text between the tags."""
        # NOTE(review): the input has a bare '<' in text, which INVALID_LEX
        # lists as a lex error ('a < b'). Confirm these literals were not
        # entity-decoded somewhere between the repository and this copy.
        markup = '<b name="&"> three < four && five </b>'
        expected = ' three < four && five '
        self.assertEqual(expected, html.ToText(markup))
45
46
def _MakeTagLexer(s):
    # type: (str) -> html.TagLexer
    """Return an html.TagLexer over `s`, reset to span the whole string."""
    tag_lexer = html.TagLexer(s)
    whole_length = len(s)
    tag_lexer.Reset(0, whole_length)
    return tag_lexer
52
53
def _PrintTokens(lex):
    # type: (html.TagLexer) -> None
    """Log the tag name of `lex`, then each token with the text it covers."""
    log('')
    log('tag = %r', lex.GetTagName())
    for tok_id, begin, stop in lex.Tokens():
        # Tokens are (id, start, end) triples indexing into the lexer's string.
        covered = lex.s[begin:stop]
        log('%s %r', tok_id, covered)
60
61
class TagLexerTest(unittest.TestCase):
    """Tag-name and attribute lookups on a single tag via html.TagLexer."""

    def testTagName_DEPRECATED(self):
        # type: () -> None
        tag = _MakeTagLexer('<a href=foo class="bar" />')
        self.assertEqual('a', tag.GetTagName())

    def testGetAttrRaw(self):
        # type: () -> None
        """GetAttrRaw() returns the raw value, '' if valueless, None if absent."""
        # A tag without attributes: the lookup misses.
        tag = _MakeTagLexer('<a>')
        _PrintTokens(tag)
        self.assertEqual(None, tag.GetAttrRaw('oops'))

        # A valueless attribute reads back as the empty string, so this API
        # does not distinguish <a novalue> from <a novalue="">.
        # https://developer.mozilla.org/en-US/docs/Web/API/Element/hasAttribute
        tag = _MakeTagLexer('<a novalue>')
        _PrintTokens(tag)
        self.assertEqual('', tag.GetAttrRaw('novalue'))

        tag = _MakeTagLexer('<a href="double quoted">')
        _PrintTokens(tag)
        self.assertEqual('double quoted', tag.GetAttrRaw('href'))
        self.assertEqual(None, tag.GetAttrRaw('oops'))

        # Unquoted value followed by a quoted one, without and then with a
        # trailing slash.
        for markup in ('<a href=foo class="bar">',
                       '<a href=foo class="bar" />'):
            tag = _MakeTagLexer(markup)
            _PrintTokens(tag)
            self.assertEqual('bar', tag.GetAttrRaw('class'))

        # '&' inside a quoted value comes back as-is (tokens not printed here).
        tag = _MakeTagLexer('<a href="?foo=1&bar=2" />')
        self.assertEqual('?foo=1&bar=2', tag.GetAttrRaw('href'))

    def testAllAttrs(self):
        # type: () -> None
        """AllAttrsRaw() returns [(name, value)] pairs for every attribute."""
        # Self-closing tag with one unquoted and one quoted value.
        tag = _MakeTagLexer('<a href=foo class="bar" />')
        expected = [('href', 'foo'), ('class', 'bar')]
        self.assertEqual(expected, tag.AllAttrsRaw())

        tag = _MakeTagLexer('<a href="?foo=1&bar=2" />')
        expected = [('href', '?foo=1&bar=2')]
        self.assertEqual(expected, tag.AllAttrsRaw())

    def testEmptyMissingValues(self):
        # type: () -> None
        """Empty-quoted, '='-only and valueless attributes all yield ''."""
        # Equivalent to <button disabled="">.
        button = _MakeTagLexer('<button disabled>')
        attrs = button.AllAttrsRaw()
        self.assertEqual([('disabled', '')], attrs)

        log('slices %s', button.AllAttrsRawSlice())

        para = _MakeTagLexer(
            '''<p double="" single='' empty= value missing empty2=>''')
        attrs = para.AllAttrsRaw()
        expected = [
            ('double', ''),
            ('single', ''),
            # 'empty= value' binds the following word as the value.
            ('empty', 'value'),
            ('missing', ''),
            ('empty2', ''),
        ]
        self.assertEqual(expected, attrs)
        # TODO: the original note here ("should have") was left unfinished.
        log('all %s', attrs)

        log('slices %s', para.AllAttrsRawSlice())

    def testInvalidTag(self):
        # type: () -> None
        """A stray '!' among the attributes must raise html.LexError."""
        try:
            _MakeTagLexer('<a foo=bar !></a>').AllAttrsRaw()
        except html.LexError as err:
            print(err)
        else:
            self.fail('Expected LexError')
147
148
class LexerTest(unittest.TestCase):
    """Feeds the INVALID_LEX and VALID_LEX corpora to helpers imported from
    data_lang.htm8_test."""

    def testInvalid(self):
        # type: () -> None
        """Every INVALID_LEX string must raise html.LexError."""
        from data_lang.htm8_test import ValidTokenList

        for bad_markup in INVALID_LEX:
            try:
                ValidTokenList(bad_markup)
            except html.LexError as err:
                print(err)
            else:
                self.fail('Expected LexError %r' % bad_markup)

    def testValid(self):
        # type: () -> None
        """Every VALID_LEX input must lex without raising."""
        from data_lang.htm8_test import Lex

        # Only the input half of each pair matters here; the second element
        # is the XML expectation consumed by XmlTest.
        for markup, _ in VALID_LEX:
            Lex(markup)
            print()
170
171
# Inputs that the lexer must reject with html.LexError.
INVALID_LEX = [
    '< >',
    '<a><',
    '&amp<',
    '&<',
    # A bare '>' in text (e.g. 'a > b') is apparently accepted, so only the
    # '<' variant is listed.
    'a < b',
    '<!-- unfinished comment',
    '<? unfinished processing',
    '</div bad=attr> <a> <b>',

    # '<' in text is rejected, whereas '3 > 4' is allowed.
    '<a> 3 < 4 </a>',
    # STYLEz is not a CDATA tag, so the inner '<' is an error.
    '<STYLEz><</STYLEz>',
]
189
# Markers for the "expected XML" half of the (html, expected_xml) pairs.
# NOTE: the tables in this file write '' rather than SKIP for entries with
# no XML expectation.
SKIP = 0
UNCHANGED = 1

# (html, expected_xml) pairs that must lex cleanly.
# TODO: convert these to XML
VALID_LEX = [
    ('<foo></foo>', UNCHANGED),
    ('<foo x=y></foo>', ''),
    # Candidate expectation, currently disabled:
    #('<foo x="&"></foo>', '<foo x="&"></foo>'),
    ('<foo x="&"></foo>', ''),

    # Allowed with BadAmpersand
    ('<p> x & y </p>', '<p> x & y </p>'),

    # No ambiguity: the '/' is either followed by a space or quoted.
    ('<img src=/ >', ''),
    ('<img src="/">', ''),
    ('<img src=foo/ >', ''),
]
208
# Inputs that lex but must be rejected with html.ParseError when tags are
# required to balance.
INVALID_PARSE = [
    # closing tag does not match the opening tag
    '<a></b>',
    # missing closing tag
    '<a>',
    # this is a self-closing tag
    '<meta></meta>',
]
214
# Attribute spellings that must be rejected with html.LexError.
INVALID_ATTR_LEX = [
    # Unquoted value running into '/>' is ambiguous
    # (original note: 'should be ""').
    '<img src=/>',
    '<img src= />',
    '<img src=foo/>',
    '<img src= foo/>',

    # Quote characters inside an unquoted value.
    '<img src=x"y">',
    "<img src=j''>",
]
226
# (html, expected_xml) pairs that must validate with balanced tags.
# expected_xml is UNCHANGED, an explicit string, or '' when there is no XML
# expectation for the entry.
VALID_PARSE = [
    ('<!DOCTYPE html>\n', ''),
    ('<!DOCTYPE>', ''),

    # Empty attribute values, with either quote style.
    ('<p x=""></p>', UNCHANGED),
    ("<p x=''></p>", UNCHANGED),
    ('<self-closing a="b" />', UNCHANGED),

    # CDATA could be normalized too, but it has its own escaping problem: a
    # literal ]]> has to be handled with concatenation, which just "pushes
    # the problem around". Better to use ONE kind of escaping.
    # NOTE(review): the original comment names that escaping as '<',
    # presumably meaning the &lt; entity - confirm.
    ('<script><![CDATA[ <wtf> >< ]]></script>', UNCHANGED),

    # In text, '>' is allowed but '<' is not.
    ('<a> 3 > 4 </a>', '<a> 3 > 4 </a>'),
    # In an attribute value, '<' is allowed but '>' is not.
    ('<p x="3 < 4"></p>', ''),
    ('<b><a href="foo">link</a></b>', UNCHANGED),

    # TODO: should be self-closing, i.e. expect '<meta/><a></a>'
    #('<meta><a></a>', '<meta/><a></a>'),
    ('<meta><a></a>', ''),

    # Attribute with no value, with '=' only, and with '=' plus a space.
    ('<button disabled></button>', ''),
    ('<button disabled=></button>', ''),
    ('<button disabled= ></button>', ''),

    # Single-quoted values are pretty common.
    ("<a href='single'></a>", ''),

    # Conceding to reality - the author used unquoted values like these.
    ('<a href=ble.sh></a>', ''),
    ('<a href=foo.html></a>', ''),
    ('<foo x="&"></foo>', ''),

    # Tag names differing only in case still match.
    ('<foo></FOO>', ''),
    ('<Foo></fOO>', ''),

    # Capitalized VOID tag.
    ('<META><a></a>', ''),
    ('<script><</script>', ''),
    # Special tags whose opening and closing case is identical.
    ('<SCRipt><</SCRipt>', ''),
    ('<SCRIPT><</SCRIPT>', ''),
    ('<STYLE><</STYLE>', ''),
    # Closing tag in a different case is not covered:
    #'<SCRipt><</script>',

    # Regression test from blog
    ('<script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>',
     ''),

    # Note: Python HTMLParser.py does DYNAMIC compilation of regex with re.I
    # flag to handle this! Gah I want something faster.
    #'<script><</SCRIPT>',

    # TODO: Test <svg> and <math> ?
]
288
# Inputs that must validate once special-tag handling is switched off.
# '<meta></meta>' also appears in INVALID_PARSE, where it is rejected under
# the default (HTML) flags.
VALID_XML = [
    '<meta></meta>',
]
292
# Tags whose interior must be rejected with html.LexError.
INVALID_TAG_LEX = [
    # '>' inside an attribute value is not allowed, but '3 < 4' is.
    '<p x="3 > 4"></p>',
    # same thing
    '<a href=">"></a>',
    # bad attr
    '<a foo=bar !></a>',
]
300
301
class ValidateTest(unittest.TestCase):
    """Runs html.Validate() over the valid and invalid corpora in this file."""

    def testInvalid(self):
        # type: () -> None
        """Invalid inputs raise LexError or ParseError, depending on the corpus."""
        counters = html.Counters()

        # Lexical errors: text, tag-interior and attribute-level problems.
        for s in INVALID_LEX + INVALID_TAG_LEX + INVALID_ATTR_LEX:
            try:
                html.Validate(s, html.BALANCED_TAGS, counters)
            except html.LexError as e:
                print(e)
            else:
                self.fail('Expected LexError %r' % s)

        # Structural errors, expected to surface as ParseError.
        for s in INVALID_PARSE:
            try:
                html.Validate(s, html.BALANCED_TAGS, counters)
            except html.ParseError as e:
                print(e)
            else:
                # Name the offending input, as the LexError branch does.
                self.fail('Expected ParseError %r' % s)

    def testValid(self):
        # type: () -> None
        """Every VALID_PARSE input validates with balanced tags."""
        counters = html.Counters()
        for s, _ in VALID_PARSE:
            print('HTML5 %r' % s)
            html.Validate(s, html.BALANCED_TAGS, counters)
            #print('HTML5 attrs %r' % counters.debug_attrs)

    def testValidXml(self):
        # type: () -> None
        """Every VALID_XML input validates when NO_SPECIAL_TAGS is also set."""
        counters = html.Counters()
        for s in VALID_XML:
            print('XML %r' % s)
            # Flags are combined with bitwise OR.
            html.Validate(s, html.BALANCED_TAGS | html.NO_SPECIAL_TAGS,
                          counters)
            #print('XML attrs %r' % counters.debug_attrs)
339
340
class XmlTest(unittest.TestCase):
    """Checks html.ToXml() against the expectations stored in VALID_LEX and
    VALID_PARSE."""

    def testValid(self):
        # type: () -> None
        """Convert each valid input and compare against its expectation.

        The expectation is UNCHANGED (output must equal the input), a skip
        marker (SKIP or the '' used throughout the tables), or the exact
        expected XML string.
        """
        for h, expected_xml in VALID_LEX + VALID_PARSE:
            # Conversion runs for every entry, including skipped ones, so an
            # exception from ToXml() still fails the test.
            actual = html.ToXml(h)
            if expected_xml == UNCHANGED:
                self.assertEqual(h, actual)
            elif expected_xml == SKIP or expected_xml == '':
                # No XML expectation recorded for this input.
                pass
            else:
                self.assertEqual(expected_xml, actual)
354
355
# Run the tests in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()