lazylex/html

OILS / lazylex / html_test.py View on Github | oils.pub

324 lines, 193 significant

1	#!/usr/bin/env python2
2	from __future__ import print_function
3
4	import unittest
5
6	from lazylex import html # module under test log = html.log
7	from doctools.util import log
8
9
class FunctionsTest(unittest.TestCase):
    """Tests for free functions exposed by the html module."""

    def testToText(self):
        # type: () -> None
        # The expected value is the text found between the <b> tags.
        # NOTE(review): the input looks as if character entities were decoded
        # somewhere upstream ('<' appears as raw text here, while INVALID_LEX
        # lists raw '<' in text as an error) -- confirm against the canonical
        # source before relying on this literal.
        expected = ' three < four && five '
        actual = html.ToText('<b name="&"> three < four && five </b>')
        self.assertEqual(expected, actual)
16
17
def _MakeTagLexer(s):
    # type: (str) -> html.TagLexer
    """Build an html.TagLexer for s and Reset() it to the range [0, len(s))."""
    tag_lexer = html.TagLexer(s)
    tag_lexer.Reset(0, len(s))
    return tag_lexer
23
24
def _PrintTokens(lex):
    # type: (html.TagLexer) -> None
    """Log the tag name of lex, then every token with the slice it covers."""
    log('')
    log('tag = %r', lex.GetTagName())
    for tok_id, begin, stop in lex.Tokens():
        # Each token is reported alongside the raw substring lex.s[begin:stop].
        log('%s %r', tok_id, lex.s[begin:stop])
31
32
class TagLexerTest(unittest.TestCase):
    """Exercises html.TagLexer: token dump, tag name, and raw attributes."""

    def testTagLexer(self):
        # type: () -> None
        # Invalid!
        #lex = _MakeTagLexer('< >')
        #print(lex.Tag())

        _PrintTokens(_MakeTagLexer('<a>'))

        no_value = _MakeTagLexer('<a novalue>')
        _PrintTokens(no_value)

        # Note: we could have a different HasAttr() method
        # <a novalue> means lex.Get('novalue') == ''
        # https://developer.mozilla.org/en-US/docs/Web/API/Element/hasAttribute
        self.assertEqual('', no_value.GetAttrRaw('novalue'))

        quoted = _MakeTagLexer('<a href="double quoted">')
        _PrintTokens(quoted)

        self.assertEqual('double quoted', quoted.GetAttrRaw('href'))
        # An attribute that does not appear in the tag is expected to be None.
        self.assertEqual(None, quoted.GetAttrRaw('oops'))

        # Unquoted + quoted attributes, open and self-closing: token dump only.
        for tag in ('<a href=foo class="bar">', '<a href=foo class="bar" />'):
            _PrintTokens(_MakeTagLexer(tag))

        query = _MakeTagLexer('<a href="?foo=1&bar=2" />')
        self.assertEqual('?foo=1&bar=2', query.GetAttrRaw('href'))

    def testTagName(self):
        # type: () -> None
        tag_lexer = _MakeTagLexer('<a href=foo class="bar" />')
        self.assertEqual('a', tag_lexer.GetTagName())

    def testAllAttrs(self):
        # type: () -> None
        """
        [('key', 'value')] for all
        """
        # closed
        closed = _MakeTagLexer('<a href=foo class="bar" />')
        expected = [('href', 'foo'), ('class', 'bar')]
        self.assertEqual(expected, closed.AllAttrsRaw())

        query = _MakeTagLexer('<a href="?foo=1&bar=2" />')
        self.assertEqual([('href', '?foo=1&bar=2')], query.AllAttrsRaw())

    def testEmptyMissingValues(self):
        # type: () -> None
        # equivalent to <button disabled="">
        button = _MakeTagLexer('<button disabled>')
        self.assertEqual([('disabled', '')], button.AllAttrsRaw())

        log('slices %s', button.AllAttrsRawSlice())

        # Covers: empty double-quoted, empty single-quoted, '=' followed by
        # whitespace then a value, a bare name, and a trailing '=' with no value.
        para = _MakeTagLexer(
            '''<p double="" single='' empty= value missing empty2=>''')
        all_attrs = para.AllAttrsRaw()
        expected = [
            ('double', ''),
            ('single', ''),
            ('empty', 'value'),
            ('missing', ''),
            ('empty2', ''),
        ]
        self.assertEqual(expected, all_attrs)
        # TODO: should have
        log('all %s', all_attrs)

        log('slices %s', para.AllAttrsRawSlice())

    def testInvalidTag(self):
        # type: () -> None
        # The stray '!' inside the tag must surface as html.LexError.
        try:
            bad = _MakeTagLexer('<a foo=bar !></a>')
            bad.AllAttrsRaw()
        except html.LexError as e:
            print(e)
        else:
            self.fail('Expected LexError')
120
121
def _MakeAttrValueLexer(s):
    # type: (str) -> html.AttrValueLexer
    """Build an html.AttrValueLexer for s, Reset() to the range [0, len(s))."""
    value_lexer = html.AttrValueLexer(s)
    value_lexer.Reset(0, len(s))
    return value_lexer
127
128
129	class AttrValueLexerTest(unittest.TestCase):
130
131	def testGood(self):
132	# type: () -> None
133	lex = _MakeAttrValueLexer('?foo=42&bar=99')
134	n = lex.NumTokens()
135	self.assertEqual(3, n)
136
137
class LexerTest(unittest.TestCase):
    """Feeds INVALID_LEX / VALID_LEX through the data_lang.htm8_test helpers."""

    def testInvalid(self):
        # type: () -> None
        # Imported inside the test, as in the original, rather than at module
        # scope.
        from data_lang.htm8_test import ValidTokenList

        for bad_input in INVALID_LEX:
            try:
                ValidTokenList(bad_input)
            except html.LexError as e:
                print(e)
            else:
                self.fail('Expected LexError %r' % bad_input)

    def testValid(self):
        # type: () -> None

        from data_lang.htm8_test import Lex

        # Only the input half of each pair is used; the expected XML is
        # ignored here.
        for good_input, _ in VALID_LEX:
            Lex(good_input)
            print()
159
160
# Shared input tables for the test classes in this module.
#
# NOTE(review): several literals and comments below look as if HTML character
# entities were decoded somewhere upstream (for example, pairs whose expected
# value is identical to the input instead of using UNCHANGED). The strings are
# reproduced exactly as found -- confirm against the canonical source.

# Inputs expected to raise html.LexError.
INVALID_LEX = [
    '<a><',
    '&amp<',
    '&<',
    # Hm > is allowed?
    #'a > b',
    'a < b',
    '<!-- unfinished comment',
    '<? unfinished processing',
    '</div bad=attr> <a> <b>',

    # not allowed, but 3 > 4 is allowed
    '<a> 3 < 4 </a>',
    # Not a CDATA tag
    '<STYLEz><</STYLEz>',
]

# Markers for the "expected XML" slot of the (input, expected) pairs below.
# SKIP is not referenced by the tables in this section; they use '' instead,
# which XmlTest treats as "no expectation".
SKIP = 0
UNCHANGED = 1

# (input, expected XML) pairs that must lex successfully.
VALID_LEX = [
    # TODO: convert these to XML
    ('<foo></foo>', UNCHANGED),
    ('<foo x=y></foo>', ''),
    #('<foo x="&"></foo>', '<foo x="&"></foo>'),
    ('<foo x="&"></foo>', ''),

    # Allowed with BadAmpersand
    ('<p> x & y </p>', '<p> x & y </p>'),
]

# Inputs expected to raise html.ParseError under html.BALANCED_TAGS.
INVALID_PARSE = [
    '<a></b>',
    '<a>',  # missing closing tag
    '<meta></meta>',  # this is a self-closing tag
]

# (input, expected XML) pairs that must validate under html.BALANCED_TAGS.
VALID_PARSE = [
    ('<!DOCTYPE html>\n', ''),
    ('<!DOCTYPE>', ''),

    # empty strings
    ('<p x=""></p>', UNCHANGED),
    ("<p x=''></p>", UNCHANGED),
    ('<self-closing a="b" />', UNCHANGED),

    # We could also normalize CDATA?
    # Note that CDATA has an escaping problem: you need to handle it ]]> with
    # concatenation. It just "pushes the problem around".
    # So I think it's better to use ONE kind of escaping, which is <
    # NOTE(review): the line above presumably named an entity originally.
    ('<script><![CDATA[ <wtf> >< ]]></script>', UNCHANGED),

    # allowed, but 3 < 4 is not allowed
    ('<a> 3 > 4 </a>', '<a> 3 > 4 </a>'),
    # allowed, but 3 > 4 is not allowed
    ('<p x="3 < 4"></p>', ''),
    ('<b><a href="foo">link</a></b>', UNCHANGED),

    # TODO: should be self-closing
    #('<meta><a></a>', '<meta/><a></a>'),
    ('<meta><a></a>', ''),

    # no attribute
    ('<button disabled></button>', ''),
    ('<button disabled=></button>', ''),
    ('<button disabled= ></button>', ''),

    # single quoted is pretty common
    ("<a href='single'></a>", ''),

    # Conceding to reality - I used these myself
    ('<a href=ble.sh></a>', ''),
    ('<a href=foo.html></a>', ''),
    ('<foo x="&"></foo>', ''),

    # caps
    ('<foo></FOO>', ''),
    ('<Foo></fOO>', ''),

    # capital VOID tag
    ('<META><a></a>', ''),
    ('<script><</script>', ''),
    # matching
    ('<SCRipt><</SCRipt>', ''),
    ('<SCRIPT><</SCRIPT>', ''),
    ('<STYLE><</STYLE>', ''),
    #'<SCRipt><</script>',

    # Note: Python HTMLParser.py does DYNAMIC compilation of regex with re.I
    # flag to handle this! Gah I want something faster.
    #'<script><</SCRIPT>',

    # TODO: Test <svg> and <math> ?
]

# Inputs validated with html.BALANCED_TAGS | html.NO_SPECIAL_TAGS.
VALID_XML = [
    '<meta></meta>',
]

# Inputs expected to raise html.LexError from inside a tag.
INVALID_TAG_LEX = [
    # not allowed, but 3 < 4 is allowed
    '<p x="3 > 4"></p>',
    # same thing
    '<a href=">"></a>',
    '<a foo=bar !></a>',  # bad attr
]
267
268
class ValidateTest(unittest.TestCase):
    """Runs html.Validate() over the shared valid / invalid input tables."""

    def testInvalid(self):
        # type: () -> None
        counters = html.Counters()

        # Lexing problems, in the document or inside a tag, must raise
        # LexError.
        for s in INVALID_LEX + INVALID_TAG_LEX:
            try:
                html.Validate(s, html.BALANCED_TAGS, counters)
            except html.LexError as e:
                print(e)
            else:
                self.fail('Expected LexError %r' % s)

        # Inputs in INVALID_PARSE must raise ParseError instead.
        for s in INVALID_PARSE:
            try:
                html.Validate(s, html.BALANCED_TAGS, counters)
            except html.ParseError as e:
                print(e)
            else:
                # Report the offending input, as the LexError branch does.
                self.fail('Expected ParseError %r' % s)

    def testValid(self):
        # type: () -> None
        counters = html.Counters()
        # Only the input half of each pair matters here; passing means
        # Validate() returned without raising.
        for s, _ in VALID_PARSE:
            html.Validate(s, html.BALANCED_TAGS, counters)
            print('HTML5 %r' % s)
            #print('HTML5 attrs %r' % counters.debug_attrs)

    def testValidXml(self):
        # type: () -> None
        counters = html.Counters()
        for s in VALID_XML:
            # The two flags are combined with a plain bitwise OR. The stray
            # backslash before '|' was an escaping artifact and a syntax error.
            html.Validate(s, html.BALANCED_TAGS | html.NO_SPECIAL_TAGS,
                          counters)
            print('XML %r' % s)
            #print('XML attrs %r' % counters.debug_attrs)
306
307
class XmlTest(unittest.TestCase):
    """Checks html.ToXml() against the expectations in VALID_LEX/VALID_PARSE."""

    def testValid(self):
        # type: () -> None
        for h, expected_xml in VALID_LEX + VALID_PARSE:
            # Always run the conversion, even for skipped entries, so that an
            # exception from ToXml() still fails the test.
            actual = html.ToXml(h)

            if expected_xml == '':
                # Skip: no expected output is recorded for this input.
                continue

            if expected_xml == UNCHANGED:
                # The XML form must be identical to the HTML input.
                self.assertEqual(h, actual)
            else:
                self.assertEqual(expected_xml, actual)
321
322
if __name__ == '__main__':
    # Run every TestCase defined in this module when executed as a script.
    unittest.main()