OILS / lazylex / html_test.py View on Github | oils.pub

357 lines, 217 significant
1#!/usr/bin/env python2
2from __future__ import print_function
3
4import re
5import unittest
6
7from lazylex import html # module under test log = html.log
8from doctools.util import log
9
10
11class RegexTest(unittest.TestCase):
12
13 def testDotAll(self):
14 # type: () -> None
15
16 # Note that $ matches end of line, not end of string
17 p1 = re.compile(r'.')
18 print(p1.match('\n'))
19
20 p2 = re.compile(r'.', re.DOTALL)
21 print(p2.match('\n'))
22
23 #p3 = re.compile(r'[.\n]', re.VERBOSE)
24 p3 = re.compile(r'[.\n]')
25 print(p3.match('\n'))
26
27 print('Negation')
28
29 p4 = re.compile(r'[^>]')
30 print(p4.match('\n'))
31
32 def testAttrRe(self):
33 # type: () -> None
34 _ATTR_RE = html._ATTR_RE
35 m = _ATTR_RE.match(' empty= val')
36 print(m.groups())
37
38
class FunctionsTest(unittest.TestCase):
    """Tests for free functions exported by lazylex.html."""

    def testToText(self):
        # type: () -> None
        """ToText() drops the tag markup and decodes &lt; in the text."""
        markup = '<b name="&amp;"> three &lt; four && five </b>'
        expected = ' three < four && five '
        self.assertEqual(expected, html.ToText(markup))
45
46
def _MakeTagLexer(s):
    # type: (str) -> html.TagLexer
    """Build a TagLexer for s and call Reset(0, len(s)) on it before returning."""
    tag_lexer = html.TagLexer(s)
    tag_lexer.Reset(0, len(s))
    return tag_lexer
52
53
def _PrintTokens(lex):
    # type: (html.TagLexer) -> None
    """Log the tag name of lex, then every token it yields with its source text."""
    log('')
    log('tag = %r', lex.GetTagName())
    for tok_id, begin, finish in lex.Tokens():
        # Tokens are (id, start, end); the offsets index into lex.s
        text = lex.s[begin:finish]
        log('%s %r', tok_id, text)
60
61
class TagLexerTest(unittest.TestCase):
    """Tests for the tag-name and raw-attribute accessors of html.TagLexer."""

    def testTagName_DEPRECATED(self):
        # type: () -> None
        """GetTagName() reports the element name of a self-closed tag."""
        tag_lexer = _MakeTagLexer('<a href=foo class="bar" />')
        self.assertEqual('a', tag_lexer.GetTagName())

    def testGetAttrRaw(self):
        # type: () -> None
        """GetAttrRaw() gives the raw value, '' if valueless, None if absent."""
        # Each case: (tag source, [(attribute name, expected raw value), ...])
        cases = [
            # No attributes at all
            ('<a>', [('oops', None)]),

            # <a novalue> means lex.Get('novalue') == ''
            # https://developer.mozilla.org/en-US/docs/Web/API/Element/hasAttribute
            # We are not distinguishing <a novalue=""> from <a novalue> in
            # this API
            ('<a novalue>', [('novalue', '')]),

            ('<a href="double quoted">', [
                ('href', 'double quoted'),
                ('oops', None),
            ]),
            ('<a href=foo class="bar">', [('class', 'bar')]),
            ('<a href=foo class="bar" />', [('class', 'bar')]),
        ]
        for tag, expectations in cases:
            lex = _MakeTagLexer(tag)
            _PrintTokens(lex)
            for attr_name, want in expectations:
                self.assertEqual(want, lex.GetAttrRaw(attr_name))

        # The raw value keeps &amp; undecoded (no token dump for this case)
        lex = _MakeTagLexer('<a href="?foo=1&amp;bar=2" />')
        self.assertEqual('?foo=1&amp;bar=2', lex.GetAttrRaw('href'))

    def testAllAttrs(self):
        # type: () -> None
        """
        AllAttrsRaw() gives [('key', 'value')] pairs for every attribute
        """
        # closed
        closed = _MakeTagLexer('<a href=foo class="bar" />')
        expected = [('href', 'foo'), ('class', 'bar')]
        self.assertEqual(expected, closed.AllAttrsRaw())

        with_entity = _MakeTagLexer('<a href="?foo=1&amp;bar=2" />')
        self.assertEqual([('href', '?foo=1&amp;bar=2')],
                         with_entity.AllAttrsRaw())

    def testEmptyMissingValues(self):
        # type: () -> None
        """Valueless and empty-valued attributes all come back as ''."""
        # equivalent to <button disabled="">
        button = _MakeTagLexer('<button disabled>')
        button_attrs = button.AllAttrsRaw()
        self.assertEqual([('disabled', '')], button_attrs)

        log('slices %s', button.AllAttrsRawSlice())

        para = _MakeTagLexer(
            '''<p double="" single='' empty= value missing empty2=>''')
        para_attrs = para.AllAttrsRaw()
        expected = [
            ('double', ''),
            ('single', ''),
            ('empty', 'value'),
            ('missing', ''),
            ('empty2', ''),
        ]
        self.assertEqual(expected, para_attrs)
        # TODO: should have  (note was left unfinished)
        log('all %s', para_attrs)

        log('slices %s', para.AllAttrsRawSlice())

    def testInvalidTag(self):
        # type: () -> None
        """A stray '!' among the attributes must raise html.LexError."""
        try:
            _MakeTagLexer('<a foo=bar !></a>').AllAttrsRaw()
        except html.LexError as err:
            print(err)
            return
        self.fail('Expected LexError')
147
148
class LexerTest(unittest.TestCase):
    """Feeds the INVALID_LEX / VALID_LEX samples to the htm8_test lex helpers."""

    def testInvalid(self):
        # type: () -> None
        """Each INVALID_LEX sample must make ValidTokenList raise LexError."""
        from data_lang.htm8_test import ValidTokenList
        for sample in INVALID_LEX:
            try:
                ValidTokenList(sample)
            except html.LexError as err:
                print(err)
                continue
            self.fail('Expected LexError %r' % sample)

    def testValid(self):
        # type: () -> None
        """Each VALID_LEX input must go through Lex without raising."""

        from data_lang.htm8_test import Lex

        # Only the input column is used here; the expected XML is ignored
        for sample, _ in VALID_LEX:
            Lex(sample)
            print()
170
171
# Inputs for which the tests in this file expect html.LexError.
INVALID_LEX = [
    '< >',
    '<a><',
    '&amp<',
    '&<',
    # Hm > is allowed?
    #'a > b',
    'a < b',
    '<!-- unfinished comment',
    '<? unfinished processing',
    '</div bad=attr> <a> <b>',

    # A bare < in text is rejected, although 3 > 4 is allowed
    '<a> 3 < 4 </a>',
    # Not a CDATA tag
    '<STYLEz><</STYLEz>',
]

# Markers for the second column of the (input, expected XML) tables below.
# UNCHANGED means the XML output must equal the input.  Note that the tables
# currently spell "don't check the output" as '' rather than SKIP.
SKIP = 0
UNCHANGED = 1

# (input, expected XML) pairs that must lex without error.
VALID_LEX = [
    # TODO: convert these to XML
    ('<foo></foo>', UNCHANGED),
    ('<foo x=y></foo>', ''),
    # Possible future expectation:
    #('<foo x="&"></foo>', '<foo x="&amp;"></foo>'),
    ('<foo x="&"></foo>', ''),

    # Allowed with BadAmpersand
    ('<p> x & y </p>', '<p> x &amp; y </p>'),

    # No ambiguity
    ('<img src=/ >', ''),
    ('<img src="/">', ''),
    ('<img src=foo/ >', ''),
]

# Inputs for which html.Validate is expected to raise html.ParseError.
INVALID_PARSE = [
    '<a></b>',
    '<a>',  # missing closing tag
    '<meta></meta>',  # this is a self-closing tag
]

# Attribute syntax for which html.Validate is expected to raise html.LexError.
INVALID_ATTR_LEX = [
    # Ambiguous, should be ""
    '<img src=/>',
    '<img src= />',
    '<img src=foo/>',
    '<img src= foo/>',

    # Quoting
    '<img src=x"y">',
    "<img src=j''>",
]

# (input, expected XML) pairs that must validate as balanced HTML.
VALID_PARSE = [
    ('<!DOCTYPE html>\n', ''),
    ('<!DOCTYPE>', ''),

    # empty strings
    ('<p x=""></p>', UNCHANGED),
    ("<p x=''></p>", UNCHANGED),
    ('<self-closing a="b" />', UNCHANGED),

    # We could also normalize CDATA?
    # Note that CDATA has an escaping problem: you need to handle ]]> with
    # concatenation.  It just "pushes the problem around".
    # So I think it's better to use ONE kind of escaping, which is &lt;
    ('<script><![CDATA[ <wtf> >< ]]></script>', UNCHANGED),

    # In text: 3 > 4 is allowed, but 3 < 4 is not
    ('<a> 3 > 4 </a>', '<a> 3 &gt; 4 </a>'),
    # In an attribute value: 3 < 4 is allowed, but 3 > 4 is not
    ('<p x="3 < 4"></p>', ''),
    ('<b><a href="foo">link</a></b>', UNCHANGED),

    # TODO: should be self-closing
    #('<meta><a></a>', '<meta/><a></a>'),
    ('<meta><a></a>', ''),

    # no attribute
    ('<button disabled></button>', ''),
    ('<button disabled=></button>', ''),
    ('<button disabled= ></button>', ''),

    # single quoted is pretty common
    ("<a href='single'></a>", ''),

    # Conceding to reality - I used these myself
    ('<a href=ble.sh></a>', ''),
    ('<a href=foo.html></a>', ''),
    ('<foo x="&"></foo>', ''),

    # caps
    ('<foo></FOO>', ''),
    ('<Foo></fOO>', ''),

    # capital VOID tag
    ('<META><a></a>', ''),
    ('<script><</script>', ''),
    # matching
    ('<SCRipt><</SCRipt>', ''),
    ('<SCRIPT><</SCRIPT>', ''),
    ('<STYLE><</STYLE>', ''),
    #'<SCRipt><</script>',

    # Regression test from blog
    ('<script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>',
     '')

    # Note: Python HTMLParser.py does DYNAMIC compilation of regex with re.I
    # flag to handle this!  Gah I want something faster.
    #'<script><</SCRIPT>',

    # TODO: Test <svg> and <math> ?
]

# Inputs validated with html.NO_SPECIAL_TAGS added to the flags.
VALID_XML = [
    '<meta></meta>',
]

# Tag contents for which html.Validate is expected to raise html.LexError.
INVALID_TAG_LEX = [
    # > inside an attribute value is rejected, although 3 < 4 is allowed
    '<p x="3 > 4"></p>',
    # same thing
    '<a href=">"></a>',
    '<a foo=bar !></a>',  # bad attr
]
300
301
class ValidateTest(unittest.TestCase):
    """Runs html.Validate() over the valid / invalid sample tables."""

    def testInvalid(self):
        # type: () -> None
        """Invalid samples must raise LexError or ParseError, per their table."""
        counters = html.Counters()
        for s in INVALID_LEX + INVALID_TAG_LEX + INVALID_ATTR_LEX:
            try:
                html.Validate(s, html.BALANCED_TAGS, counters)
            except html.LexError as e:
                print(e)
            else:
                self.fail('Expected LexError %r' % s)

        for s in INVALID_PARSE:
            try:
                html.Validate(s, html.BALANCED_TAGS, counters)
            except html.ParseError as e:
                print(e)
            else:
                # Include the offending input, as the LexError loop does, so
                # a failure identifies which sample was wrongly accepted
                self.fail('Expected ParseError %r' % s)

    def testValid(self):
        # type: () -> None
        """Every VALID_PARSE input must validate with BALANCED_TAGS."""
        counters = html.Counters()
        # The expected-XML column is not used by this test
        for s, _ in VALID_PARSE:
            print('HTML5 %r' % s)
            html.Validate(s, html.BALANCED_TAGS, counters)
        #print('HTML5 attrs %r' % counters.debug_attrs)

    def testValidXml(self):
        # type: () -> None
        """Every VALID_XML input must validate with NO_SPECIAL_TAGS added."""
        counters = html.Counters()
        for s in VALID_XML:
            print('XML %r' % s)
            html.Validate(s, html.BALANCED_TAGS | html.NO_SPECIAL_TAGS,
                          counters)
        #print('XML attrs %r' % counters.debug_attrs)
339
340
class XmlTest(unittest.TestCase):
    """Checks html.ToXml() against the (input, expected XML) sample tables."""

    def testValid(self):
        # type: () -> None
        """Convert every valid sample and compare where an expectation exists.

        The second column of each pair is either UNCHANGED (output must equal
        the input), '' (output is not checked), or the exact expected XML.
        ToXml() is still called for unchecked samples, so it must not raise.
        """
        # The unused html.Counters() local was removed: nothing here reads it
        # and it is not passed to ToXml().
        for h, expected_xml in VALID_LEX + VALID_PARSE:
            actual = html.ToXml(h)
            if expected_xml == UNCHANGED:  # Unchanged
                self.assertEqual(h, actual)
            elif expected_xml == '':  # Skip (the tables use '', not SKIP)
                pass
            else:
                self.assertEqual(expected_xml, actual)
355
# Run the tests in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()