OILS / lazylex / html_test.py View on Github | oils.pub

376 lines, 235 significant
1#!/usr/bin/env python2
2from __future__ import print_function
3
4import unittest
5
6from lazylex import html # module under test log = html.log
7from doctools.util import log
8
9
class FunctionsTest(unittest.TestCase):
    """Tests for free functions exposed by the html module."""

    def testToText(self):
        # type: () -> None
        """ToText() keeps only the text between the tags and decodes &lt;."""
        markup = '<b name="&amp;"> three &lt; four && five </b>'
        text = html.ToText(markup)
        # The bare && in the input is expected to come back unchanged.
        self.assertEqual(' three < four && five ', text)
16
17
def _MakeTagLexer(s):
    # type: (str) -> html.TagLexer
    """Build a TagLexer for `s` and reset it to the range [0, len(s))."""
    tag_lexer = html.TagLexer(s)
    end_pos = len(s)
    tag_lexer.Reset(0, end_pos)
    return tag_lexer
23
24
def _PrintTokens(lex):
    # type: (html.TagLexer) -> None
    """Log the tag name of `lex`, then one line per token it yields.

    Debugging aid only: every line goes through log() and nothing is returned.
    """
    log('')
    log('tag = %r', lex.GetTagName())
    for tok_id, begin, stop in lex.Tokens():
        # Show the token next to the exact source text it covers.
        log('%s %r', tok_id, lex.s[begin:stop])
31
32
class TagLexerTest(unittest.TestCase):
    """Checks html.TagLexer through its tag-name and attribute accessors."""

    def testTagName_DEPRECATED(self):
        # type: () -> None
        """GetTagName() reports the element name of a self-closing tag."""
        tag = _MakeTagLexer('<a href=foo class="bar" />')
        self.assertEqual('a', tag.GetTagName())

    def testGetAttrRaw(self):
        # type: () -> None
        """GetAttrRaw(): None when absent, '' when valueless, else raw text."""
        bare = _MakeTagLexer('<a>')
        _PrintTokens(bare)
        self.assertEqual(None, bare.GetAttrRaw('oops'))

        # A valueless attribute reads back as '', so <a novalue> and
        # <a novalue=""> cannot be told apart through this API.
        # https://developer.mozilla.org/en-US/docs/Web/API/Element/hasAttribute
        valueless = _MakeTagLexer('<a novalue>')
        _PrintTokens(valueless)
        self.assertEqual('', valueless.GetAttrRaw('novalue'))

        quoted = _MakeTagLexer('<a href="double quoted">')
        _PrintTokens(quoted)

        self.assertEqual('double quoted', quoted.GetAttrRaw('href'))
        self.assertEqual(None, quoted.GetAttrRaw('oops'))

        # An unquoted attribute before the one being looked up.
        mixed = _MakeTagLexer('<a href=foo class="bar">')
        _PrintTokens(mixed)
        self.assertEqual('bar', mixed.GetAttrRaw('class'))

        # Same tag, written in self-closing form.
        mixed_closed = _MakeTagLexer('<a href=foo class="bar" />')
        _PrintTokens(mixed_closed)
        self.assertEqual('bar', mixed_closed.GetAttrRaw('class'))

        # "Raw" means &amp; comes back as written, not decoded.
        with_entity = _MakeTagLexer('<a href="?foo=1&amp;bar=2" />')
        self.assertEqual('?foo=1&amp;bar=2', with_entity.GetAttrRaw('href'))

    def testAllAttrs(self):
        # type: () -> None
        """
        AllAttrsRaw() returns [('key', 'value')] pairs for every attribute.
        """
        # Self-closing tag with one unquoted and one quoted attribute.
        closed = _MakeTagLexer('<a href=foo class="bar" />')
        expected_pairs = [('href', 'foo'), ('class', 'bar')]
        self.assertEqual(expected_pairs, closed.AllAttrsRaw())

        with_entity = _MakeTagLexer('<a href="?foo=1&amp;bar=2" />')
        self.assertEqual([('href', '?foo=1&amp;bar=2')],
                         with_entity.AllAttrsRaw())

    def testEmptyMissingValues(self):
        # type: () -> None
        """Attributes with an empty or missing value are reported as ''."""
        # <button disabled> is equivalent to <button disabled="">
        button = _MakeTagLexer('<button disabled>')
        button_attrs = button.AllAttrsRaw()
        self.assertEqual([('disabled', '')], button_attrs)

        button_slices = button.AllAttrsRawSlice()
        log('slices %s', button_slices)

        para = _MakeTagLexer(
            '''<p double="" single='' empty= value missing empty2=>''')
        para_attrs = para.AllAttrsRaw()
        # Note that `empty= value` is expected to bind 'value' to `empty`.
        expected_pairs = [
            ('double', ''),
            ('single', ''),
            ('empty', 'value'),
            ('missing', ''),
            ('empty2', ''),
        ]
        self.assertEqual(expected_pairs, para_attrs)
        # TODO: should have ... (this note was left unfinished)
        log('all %s', para_attrs)

        para_slices = para.AllAttrsRawSlice()
        log('slices %s', para_slices)

    def testInvalidTag(self):
        # type: () -> None
        """A stray `!` among the attributes must raise html.LexError."""
        try:
            bad = _MakeTagLexer('<a foo=bar !></a>')
            bad.AllAttrsRaw()
        except html.LexError as err:
            print(err)
        else:
            self.fail('Expected LexError')
118
119
def _MakeAttrValueLexer(s):
    # type: (str) -> html.AttrValueLexer
    """Build an AttrValueLexer for `s` and reset it to the range [0, len(s))."""
    value_lexer = html.AttrValueLexer(s)
    end_pos = len(s)
    value_lexer.Reset(0, end_pos)
    return value_lexer
125
126
class AttrValueLexerTest(unittest.TestCase):
    """Tests for html.AttrValueLexer."""

    def testGood(self):
        # type: () -> None
        """NumTokens() reports 3 for a value with a single &amp; in it."""
        value_lexer = _MakeAttrValueLexer('?foo=42&amp;bar=99')
        num_tokens = value_lexer.NumTokens()
        self.assertEqual(3, num_tokens)
134
135
class LexerTest(unittest.TestCase):
    """Feeds the shared lexer fixtures to helpers from data_lang.htm8_test."""

    def testInvalid(self):
        # type: () -> None
        """Every INVALID_LEX entry must make ValidTokenList() raise LexError."""
        # The helper is imported inside the test rather than at module level.
        from data_lang.htm8_test import ValidTokenList
        for bad_input in INVALID_LEX:
            try:
                ValidTokenList(bad_input)
            except html.LexError as err:
                print(err)
            else:
                self.fail('Expected LexError %r' % bad_input)

    def testValid(self):
        # type: () -> None
        """Every VALID_LEX input must lex without raising."""

        from data_lang.htm8_test import Lex

        # Only the input column is used; the expected-XML column is ignored.
        for good_input, _ in VALID_LEX:
            Lex(good_input)
            print()
157
158
# Inputs that the tests in this file expect to be rejected with html.LexError.
INVALID_LEX = [
    '< >',
    '<a><',
    '&amp<',
    '&<',
    # Open question: a bare > appears to be allowed, so 'a > b' is not listed.
    'a < b',
    '<!-- unfinished comment',
    '<? unfinished processing',
    '</div bad=attr> <a> <b>',

    # A bare < in text is not allowed, although 3 > 4 is.
    '<a> 3 < 4 </a>',
    # STYLEz is not a CDATA tag
    '<STYLEz><</STYLEz>',
]
176
# Markers for the second column of the (input, expected_xml) fixture pairs.
SKIP = 0
UNCHANGED = 1  # the XML output must equal the input

# (input, expected_xml) pairs that must lex cleanly.  An expected_xml of ''
# means the XML conversion result is not checked for that input.
VALID_LEX = [
    # TODO: convert these to XML
    ('<foo></foo>', UNCHANGED),
    ('<foo x=y></foo>', ''),
    # TODO: the expected XML here would be '<foo x="&amp;"></foo>'
    ('<foo x="&"></foo>', ''),

    # A bare & in text is allowed with BadAmpersand
    ('<p> x & y </p>', '<p> x &amp; y </p>'),

    # Unquoted values ending in / followed by a space: no ambiguity
    ('<img src=/ >', ''),
    ('<img src="/">', ''),
    ('<img src=foo/ >', ''),
]
195
# Inputs that the validation tests expect to be rejected with html.ParseError.
INVALID_PARSE = [
    '<a></b>',
    '<a>',  # the closing tag is missing
    '<meta></meta>',  # meta is a self-closing tag
]
201
# Unquoted attribute values directly followed by />.  These are ambiguous and
# should be written with "" instead.
INVALID_ATTR_LEX = [
    '<img src=/>',
    '<img src= />',
    '<img src=foo/>',
    '<img src= foo/>',
]
209
# (input, expected_xml) pairs that must validate.  An expected_xml of ''
# means the XML conversion result is not checked for that input.
VALID_PARSE = [
    ('<!DOCTYPE html>\n', ''),
    ('<!DOCTYPE>', ''),

    # Empty attribute values, in both quote styles
    ('<p x=""></p>', UNCHANGED),
    ("<p x=''></p>", UNCHANGED),
    ('<self-closing a="b" />', UNCHANGED),

    # CDATA could be normalized as well.  It has its own escaping problem,
    # though: a literal ]]> has to be handled with concatenation, which just
    # "pushes the problem around".  Using ONE kind of escaping, namely &lt;,
    # seems better.
    ('<script><![CDATA[ <wtf> >< ]]></script>', UNCHANGED),

    # A bare > in text is allowed, but 3 < 4 is not
    ('<a> 3 > 4 </a>', '<a> 3 &gt; 4 </a>'),
    # A bare < in an attribute value is allowed, but 3 > 4 is not
    ('<p x="3 < 4"></p>', ''),
    ('<b><a href="foo">link</a></b>', UNCHANGED),

    # TODO: should be converted to self-closing form, i.e. '<meta/><a></a>'
    ('<meta><a></a>', ''),

    # Attribute without a value
    ('<button disabled></button>', ''),
    ('<button disabled=></button>', ''),
    ('<button disabled= ></button>', ''),

    # Single quotes are pretty common
    ("<a href='single'></a>", ''),

    # Conceding to reality - unquoted values like these are in real use
    ('<a href=ble.sh></a>', ''),
    ('<a href=foo.html></a>', ''),
    ('<foo x="&"></foo>', ''),

    # Mixed-case tag names
    ('<foo></FOO>', ''),
    ('<Foo></fOO>', ''),

    # Capitalized VOID tag
    ('<META><a></a>', ''),
    ('<script><</script>', ''),
    # Open and close tags with matching case
    ('<SCRipt><</SCRipt>', ''),
    ('<SCRIPT><</SCRIPT>', ''),
    ('<STYLE><</STYLE>', ''),
    # Not listed: '<SCRipt><</script>' (case differs between open and close)

    # Regression test from blog
    ('<script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>',
     ''),

    # Note: Python HTMLParser.py does DYNAMIC compilation of regex with re.I
    # flag to handle '<script><</SCRIPT>'!  Gah I want something faster.

    # TODO: Test <svg> and <math> ?
]
271
# Inputs validated with html.NO_SPECIAL_TAGS added to the flags.  Note that
# '<meta></meta>' is listed in INVALID_PARSE for the default HTML validation.
VALID_XML = [
    '<meta></meta>',
]
275
# Tags whose attributes the validation tests expect to fail with
# html.LexError.
INVALID_TAG_LEX = [
    # A bare > in an attribute value is not allowed, although 3 < 4 is
    '<p x="3 > 4"></p>',
    # Same problem, with > as the whole value
    '<a href=">"></a>',
    '<a foo=bar !></a>',  # bad attribute
]
283
284
class ValidateTest(unittest.TestCase):
    """Runs the fixture lists through html.ValidateOld() and html.Validate().

    Each test shares one html.Counters() instance across all of its inputs.
    Every failure message names the input that was wrongly accepted, so a
    regression can be traced to a single fixture.
    """

    def testInvalidOld(self):
        # type: () -> None
        """ValidateOld() must raise LexError / ParseError on the bad inputs."""
        counters = html.Counters()
        # INVALID_ATTR_LEX is not included here; only Validate() is checked
        # against it (see testInvalid).
        for s in INVALID_LEX + INVALID_TAG_LEX:
            try:
                html.ValidateOld(s, html.BALANCED_TAGS, counters)
            except html.LexError as e:
                print(e)
            else:
                self.fail('Expected LexError %r' % s)

        for s in INVALID_PARSE:
            try:
                html.ValidateOld(s, html.BALANCED_TAGS, counters)
            except html.ParseError as e:
                print(e)
            else:
                # Report the offending input, like the LexError loop does.
                self.fail('Expected ParseError %r' % s)

    def testValidOld(self):
        # type: () -> None
        """ValidateOld() must accept every VALID_PARSE input."""
        counters = html.Counters()
        for s, _ in VALID_PARSE:
            html.ValidateOld(s, html.BALANCED_TAGS, counters)
            print('HTML5 %r' % s)
        #print('HTML5 attrs %r' % counters.debug_attrs)

    def testValidXmlOld(self):
        # type: () -> None
        """ValidateOld() must accept VALID_XML when NO_SPECIAL_TAGS is set."""
        counters = html.Counters()
        for s in VALID_XML:
            html.ValidateOld(s, html.BALANCED_TAGS | html.NO_SPECIAL_TAGS,
                             counters)
            print('XML %r' % s)
        #print('XML attrs %r' % counters.debug_attrs)

    def testInvalid(self):
        # type: () -> None
        """Validate() must raise LexError / ParseError on the bad inputs."""
        counters = html.Counters()
        for s in INVALID_LEX + INVALID_TAG_LEX + INVALID_ATTR_LEX:
            try:
                html.Validate(s, html.BALANCED_TAGS, counters)
            except html.LexError as e:
                print(e)
            else:
                self.fail('Expected LexError %r' % s)

        for s in INVALID_PARSE:
            try:
                html.Validate(s, html.BALANCED_TAGS, counters)
            except html.ParseError as e:
                print(e)
            else:
                # Report the offending input, like the LexError loop does.
                self.fail('Expected ParseError %r' % s)

    def testValid(self):
        # type: () -> None
        """Validate() must accept every VALID_PARSE input."""
        counters = html.Counters()
        for s, _ in VALID_PARSE:
            # Printed before validating, so a raised error follows its input.
            print('HTML5 %r' % s)
            html.Validate(s, html.BALANCED_TAGS, counters)
        #print('HTML5 attrs %r' % counters.debug_attrs)

    def testValidXml(self):
        # type: () -> None
        """Validate() must accept VALID_XML when NO_SPECIAL_TAGS is set."""
        counters = html.Counters()
        for s in VALID_XML:
            print('XML %r' % s)
            html.Validate(s, html.BALANCED_TAGS | html.NO_SPECIAL_TAGS,
                          counters)
        #print('XML attrs %r' % counters.debug_attrs)
358
359
class XmlTest(unittest.TestCase):
    """Checks html.ToXml() against the expected_xml column of the fixtures."""

    def testValid(self):
        # type: () -> None
        """Convert every VALID_LEX and VALID_PARSE input and compare."""
        counters = html.Counters()  # constructed here, not otherwise used
        for html_in, want in VALID_LEX + VALID_PARSE:
            # The conversion runs for every input, including skipped ones.
            got = html.ToXml(html_in)
            if want == '':  # Skip: the result is not checked
                continue
            if want == UNCHANGED:
                # The output must be identical to the input.
                self.assertEqual(html_in, got)
            else:
                self.assertEqual(want, got)
374
if __name__ == '__main__':
    # Run the TestCase classes above when this file is executed as a script.
    unittest.main()