OILS / lazylex / html_test.py View on Github | oils.pub

450 lines, 295 significant
1#!/usr/bin/env python2
2from __future__ import print_function
3
4import unittest
5
6from lazylex import html # module under test log = html.log
7
# Short alias for the printf-style log() function exported by the module
# under test; it is called below as log(fmt, *args).
log = html.log

# Sample document used by LexerTest.testCommentParse.  It is read once, when
# this module is imported.  The path is relative, so it is resolved against
# the current working directory.
_TEST_HTML_PATH = 'lazylex/testdata.html'

with open(_TEST_HTML_PATH) as f:
    TEST_HTML = f.read()
12
13
class RegexTest(unittest.TestCase):
    """Checks of plain `re` behavior, plus a smoke test of html._ATTR_RE."""

    def testDotAll(self):
        # Pins down which patterns match a newline.  Each match object is
        # still printed, and is now also asserted so the test can fail.
        import re

        # Side note on '$': without re.MULTILINE it matches at the end of the
        # string and just before a trailing newline.

        # Without re.DOTALL, '.' matches any character except a newline.
        p1 = re.compile(r'.')
        m1 = p1.match('\n')
        print(m1)
        self.assertIsNone(m1)

        # With re.DOTALL, '.' matches a newline too.
        p2 = re.compile(r'.', re.DOTALL)
        m2 = p2.match('\n')
        print(m2)
        self.assertIsNotNone(m2)

        # Inside a character class '.' is a literal dot; the class matches a
        # newline only because '\n' is listed explicitly.
        p3 = re.compile(r'[.\n]')
        m3 = p3.match('\n')
        print(m3)
        self.assertIsNotNone(m3)
        self.assertIsNotNone(p3.match('.'))
        self.assertIsNone(p3.match('a'))

        print('Negation')

        # A negated class matches a newline regardless of re.DOTALL.
        p4 = re.compile(r'[^>]')
        m4 = p4.match('\n')
        print(m4)
        self.assertIsNotNone(m4)
        self.assertIsNone(p4.match('>'))

    def testAttrRe(self):
        # Prints the groups captured for an attribute written as 'empty='
        # followed by another attribute name.
        text = ' empty= missing'
        m = html._ATTR_RE.match(text)
        # Fail with a readable message instead of an AttributeError on None.
        self.assertIsNotNone(m, '_ATTR_RE did not match %r' % text)
        print(m.groups())
39
40
class FunctionsTest(unittest.TestCase):
    """Smoke test for html.FindLineNum; results are printed, not asserted."""

    def testFindLineNum(self):
        three_lines = 'foo\n' * 3  # 12 characters in total
        # The last offset (50) is past the end of the string: out of bounds.
        offsets = [1, 5, 10, 50]
        for offset in offsets:
            print(html.FindLineNum(three_lines, offset))
48
49
def _MakeTagLexer(s):
    """Return an html.TagLexer for `s`, Reset() to the span [0, len(s))."""
    tag_lexer = html.TagLexer(s)
    end = len(s)
    tag_lexer.Reset(0, end)
    return tag_lexer
54
55
def _PrintTokens(lex):
    """Log the tag name of `lex`, then every token with the text it covers.

    `lex` must provide TagName(), Tokens() yielding (token, start, end)
    triples, and the underlying string as `lex.s`.
    """
    log('')
    log('tag = %r', lex.TagName())
    for triple in lex.Tokens():
        tok, start, end = triple
        covered = lex.s[start:end]
        log('%s %r', tok, covered)
61
62
class TagLexerTest(unittest.TestCase):
    """Exercises html.TagLexer on individual tag strings."""

    def testTagLexer(self):
        # Deliberately not exercised, because the input is invalid:
        #   lex = _MakeTagLexer('< >')
        #   print(lex.Tag())

        bare = _MakeTagLexer('<a>')
        _PrintTokens(bare)

        valueless = _MakeTagLexer('<a novalue>')
        _PrintTokens(valueless)

        # An attribute written without a value reads back as '' (not None).
        # A separate HasAttr() method would be an alternative design; compare
        # https://developer.mozilla.org/en-US/docs/Web/API/Element/hasAttribute
        self.assertEqual('', valueless.GetAttrRaw('novalue'))

        quoted = _MakeTagLexer('<a href="double quoted">')
        _PrintTokens(quoted)

        self.assertEqual('double quoted', quoted.GetAttrRaw('href'))
        # An attribute that is absent from the tag yields None.
        self.assertEqual(None, quoted.GetAttrRaw('oops'))

        # Unquoted value, with and without a trailing '/': tokens are only
        # dumped, nothing is asserted.
        for tag in ('<a href=foo class="bar">', '<a href=foo class="bar" />'):
            _PrintTokens(_MakeTagLexer(tag))

        # The raw value is returned as written: '&amp;' stays encoded.
        escaped = _MakeTagLexer('<a href="?foo=1&amp;bar=2" />')
        self.assertEqual('?foo=1&amp;bar=2', escaped.GetAttrRaw('href'))

    def testTagName(self):
        tag_lexer = _MakeTagLexer('<a href=foo class="bar" />')
        self.assertEqual('a', tag_lexer.TagName())

    def testAllAttrs(self):
        """
        [('key', 'value')] for all
        """
        # Tag closed with '/>': attributes come back in source order.
        closed = _MakeTagLexer('<a href=foo class="bar" />')
        expected = [('href', 'foo'), ('class', 'bar')]
        self.assertEqual(expected, closed.AllAttrsRaw())

        # Entity references in a value are left undecoded.
        escaped = _MakeTagLexer('<a href="?foo=1&amp;bar=2" />')
        self.assertEqual([('href', '?foo=1&amp;bar=2')], escaped.AllAttrsRaw())

    def testEmptyMissingValues(self):
        # <button disabled> is equivalent to <button disabled="">
        button = _MakeTagLexer('<button disabled>')
        button_attrs = button.AllAttrsRaw()
        self.assertEqual([('disabled', '')], button_attrs)

        button_slices = button.AllAttrsRawSlice()
        log('slices %s', button_slices)

        # Every spelling of "no value" must produce the empty string:
        # "" and '' quoting, a bare '=', and a missing '=' altogether.
        para = _MakeTagLexer(
            '''<p double="" single='' empty= missing missing2>''')
        para_attrs = para.AllAttrsRaw()
        expected = [
            ('double', ''),
            ('single', ''),
            ('empty', ''),
            ('missing', ''),
            ('missing2', ''),
        ]
        self.assertEqual(expected, para_attrs)
        # TODO: should have  (note left unfinished in the original)
        log('all %s', para_attrs)

        para_slices = para.AllAttrsRawSlice()
        log('slices %s', para_slices)

    def testInvalidTag(self):
        # Listing the attributes of this tag must raise html.LexError; the
        # same input is labelled "bad attr" in INVALID_TAG_LEX.
        try:
            _MakeTagLexer('<a foo=bar !></a>').AllAttrsRaw()
        except html.LexError as err:
            print(err)
        else:
            self.fail('Expected LexError')
145
146
def _MakeAttrValueLexer(s):
    """Return an html.AttrValueLexer for `s`, Reset() to the span [0, len(s))."""
    value_lexer = html.AttrValueLexer(s)
    end = len(s)
    value_lexer.Reset(0, end)
    return value_lexer
151
152
class AttrValueLexerTest(unittest.TestCase):
    """Exercises html.AttrValueLexer on a single attribute value."""

    def testGood(self):
        # Three tokens are expected for this value, which contains one
        # '&amp;' entity between two runs of plain text.
        value_lexer = _MakeAttrValueLexer('?foo=42&amp;bar=99')
        self.assertEqual(3, value_lexer.NumTokens())
159
160
def Lex(h, no_special_tags=False):
    """Tokenize `h` with html.ValidTokenList, logging every token.

    Args:
      h: the document text to lex.
      no_special_tags: passed through unchanged to html.ValidTokenList.

    Returns:
      Whatever html.ValidTokenList returned; it is iterated here as
      (token id, end offset) pairs.
    """
    print(repr(h))
    token_list = html.ValidTokenList(h, no_special_tags=no_special_tags)

    # Each token starts where the previous one ended.
    prev_end = 0
    for tok_id, tok_end in token_list:
        text = h[prev_end:tok_end]
        log('%d %s %r', tok_end, html.TokenName(tok_id), text)
        prev_end = tok_end
    return token_list
170
171
class LexerTest(unittest.TestCase):
    """Token-level tests of html.ValidTokenList, driven through Lex().

    Expectations are lists of (token id, end offset) pairs.  Because the
    offsets count every character, multi-line fixtures are written as
    explicit per-line literals so their leading whitespace is visible and
    cannot be altered by re-indenting this file.
    """

    # IndexLinker in devtools/make_help.py
    # <pre> sections in doc/html_help.py
    # TocExtractor in devtools/cmark.py

    def testPstrip(self):
        """Remove anything like this.

        <p><pstrip> </pstrip></p>
        """
        # Placeholder: nothing is exercised yet.
        pass

    def testCommentParse(self):
        # Smoke test: lexing the whole fixture file must not raise.
        Lex(TEST_HTML)

    def testCommentParse2(self):
        Tok = html.Tok

        # A comment that spans two lines, followed by a self-closing tag.
        h = ('\n'
             '        hi <!-- line 1\n'
             '                line 2 --><br/>')
        tokens = Lex(h)

        self.assertEqual(
            [
                (Tok.RawData, 12),  # newline, indentation, 'hi '
                (Tok.Comment, 50),  # <!-- line 1 ... line 2 -->
                (Tok.StartEndTag, 55),  # <br/>
                (Tok.EndOfStream, 55),
            ],
            tokens)

    def testProcessingInstruction(self):
        # Same syntax as an <?xml ?> header
        Tok = html.Tok
        h = 'hi <? err ?>'
        tokens = Lex(h)

        self.assertEqual(
            [
                (Tok.RawData, 3),
                (Tok.Processing, 12),  # <? err ?>
                (Tok.EndOfStream, 12),
            ],
            tokens)

    def testScriptStyle(self):
        Tok = html.Tok

        # The '<', '&&' and '>' inside <script> must not be lexed as markup.
        h = ('\n'
             '        hi <script src="">'
             ' if (x < 1 && y > 2 ) { console.log(""); }\n'
             '        </script>\n'
             '        ')
        tokens = Lex(h)

        self.assertEqual(
            [
                (Tok.RawData, 12),
                (Tok.StartTag, 27),  # <script src="">
                (Tok.HtmlCData, 78),  # JavaScript code is HTML CData
                (Tok.EndTag, 87),  # </script>
                (Tok.RawData, 96),  # trailing newline and indentation
                (Tok.EndOfStream, 96),
            ],
            tokens)

    def testScriptStyleXml(self):
        Tok = html.Tok
        h = 'hi <script src=""> &lt; </script>'
        # XML mode: the content of <script> is lexed like any other element
        tokens = Lex(h, no_special_tags=True)

        self.assertEqual(
            [
                (Tok.RawData, 3),
                (Tok.StartTag, 18),  # <script src="">
                (Tok.RawData, 19),  # space
                (Tok.CharEntity, 23),  # &lt;
                (Tok.RawData, 24),  # space
                (Tok.EndTag, 33),  # </script>
                (Tok.EndOfStream, 33),
            ],
            tokens)

    def testCData(self):
        Tok = html.Tok

        # from
        # /home/andy/src/languages/Python-3.11.5/Lib/test/xmltestdata/c14n-20/inC14N4.xml
        h = '<compute><![CDATA[value>"0" && value<"10" ?"valid":"error"]]></compute>'
        tokens = Lex(h)

        self.assertEqual([
            (Tok.StartTag, 9),
            (Tok.CData, 61),
            (Tok.EndTag, 71),
            (Tok.EndOfStream, 71),
        ], tokens)

    def testEntity(self):
        Tok = html.Tok

        # from
        # /home/andy/src/Python-3.12.4/Lib/test/xmltestdata/c14n-20/inC14N5.xml
        h = '&ent1;, &ent2;!'

        tokens = Lex(h)

        self.assertEqual([
            (Tok.CharEntity, 6),
            (Tok.RawData, 8),
            (Tok.CharEntity, 14),
            (Tok.RawData, 15),
            (Tok.EndOfStream, 15),
        ], tokens)

    def testStartTag(self):
        Tok = html.Tok

        h = '<a>hi</a>'
        tokens = Lex(h)

        self.assertEqual([
            (Tok.StartTag, 3),
            (Tok.RawData, 5),
            (Tok.EndTag, 9),
            (Tok.EndOfStream, 9),
        ], tokens)

        # Make sure we don't consume too much
        h = '<a><source>1.7</source></a>'

        tokens = Lex(h)

        self.assertEqual([
            (Tok.StartTag, 3),
            (Tok.StartTag, 11),
            (Tok.RawData, 14),
            (Tok.EndTag, 23),
            (Tok.EndTag, 27),
            (Tok.EndOfStream, 27),
        ], tokens)

        # Everything below is intentionally disabled by this early return.
        # NOTE(review): the expected list looks unfinished (offset 9 repeats
        # and the list stops after the first tag); complete it before
        # removing the return.  The indentation of the fixture's inner lines
        # is a best guess -- TODO confirm.
        return

        h = ('\n'
             '        <configuration>\n'
             '          <source>1.7</source>\n'
             '        </configuration>')

        tokens = Lex(h)

        self.assertEqual([
            (Tok.RawData, 9),
            (Tok.StartTag, 24),
            (Tok.RawData, 9),
            (Tok.EndOfStream, 9),
        ], tokens)

    def testInvalid(self):
        # Every string in INVALID_LEX must be rejected with html.LexError.
        for s in INVALID_LEX:
            try:
                html.ValidTokenList(s)
            except html.LexError as e:
                print(e)
            else:
                self.fail('Expected LexError %r' % s)
342
343
# Inputs the lexer must reject: LexerTest.testInvalid and
# ValidateTest.testInvalid expect html.LexError for each of them.
INVALID_LEX = [
    # A bare ampersand should have been written as &amp;
    '<a>&',
    '&amp',  # entity without the closing ';'
    '&#',  # numeric entity that was never finished

    # Open question from the original author: a lone '>' is allowed?
    #'a > b',
    'a < b',
    '<!-- unfinished comment',
    '<? unfinished processing',
    '</div bad=attr> <a> <b>',

    # '<' in text is rejected, whereas '3 > 4' is accepted (see VALID_PARSE)
    '<a> 3 < 4 </a>',
]

# Inputs for which ValidateTest.testInvalid expects html.ParseError.
INVALID_PARSE = [
    '<a></b>',
    '<a>',  # closing tag is missing
    '<meta></meta>',  # <meta> is a self-closing tag
]

# Inputs that ValidateTest.testValid validates with html.BALANCED_TAGS.
VALID_PARSE = [
    '<!DOCTYPE html>\n',
    '<!DOCTYPE>',

    # attribute values that are empty strings
    '<p x=""></p>',
    "<p x=''></p>",

    # '>' in text is accepted, whereas '3 < 4' is rejected (see INVALID_LEX)
    '<a> 3 > 4 </a>',
    # '<' in an attribute value is accepted, whereas '3 > 4' is rejected
    # (see INVALID_TAG_LEX)
    '<p x="3 < 4"></p>',
    '<b><a href="foo">link</a></b>',
    '<meta><a></a>',

    # attribute without a value
    '<button disabled></button>',
    '<button disabled=></button>',
    '<button disabled= ></button>',

    # single-quoted values are pretty common
    "<a href='single'></a>",

    # Conceding to reality - the original author used unquoted values
    '<a href=ble.sh></a>',
    '<a href=foo.html></a>',

    # TODO: capitalization should be allowed
    #'<META><a></a>',

    # TODO: Test <svg> and <math> ?
]

# Inputs that ValidateTest.testValidXml validates with
# html.BALANCED_TAGS | html.NO_SPECIAL_TAGS.
VALID_XML = [
    '<meta></meta>',
]

# Inputs that ValidateTest.testInvalid expects html.LexError for, in
# addition to INVALID_LEX.
INVALID_TAG_LEX = [
    # '>' in an attribute value is rejected, whereas '3 < 4' is accepted
    # (see VALID_PARSE)
    '<p x="3 > 4"></p>',
    '<a foo=bar !></a>',  # bad attr

    # these should be escaped, but are not checked yet
    #'<a href="&"></a>',
    #'<a href=">"></a>',
]
411
412
class ValidateTest(unittest.TestCase):
    """Runs html.Validate over the module-level fixture lists."""

    def testInvalid(self):
        # One Counters object is shared by every Validate() call below.
        counters = html.Counters()

        # Lexing failures: every input must raise html.LexError.
        for s in INVALID_LEX + INVALID_TAG_LEX:
            try:
                html.Validate(s, html.BALANCED_TAGS, counters)
            except html.LexError as e:
                print(e)
            else:
                self.fail('Expected LexError %r' % s)

        # Parsing failures: every input must raise html.ParseError.
        for s in INVALID_PARSE:
            try:
                html.Validate(s, html.BALANCED_TAGS, counters)
            except html.ParseError as e:
                print(e)
            else:
                # Name the offending input, as the LexError loop does.
                self.fail('Expected ParseError %r' % s)

    def testValid(self):
        # Passes as long as no input raises; the collected attrs are printed.
        counters = html.Counters()
        for s in VALID_PARSE:
            html.Validate(s, html.BALANCED_TAGS, counters)
            print('HTML5 %r' % s)
        print('HTML5 attrs %r' % counters.debug_attrs)

    def testValidXml(self):
        # Same as testValid, with the NO_SPECIAL_TAGS flag added.
        counters = html.Counters()
        for s in VALID_XML:
            html.Validate(s, html.BALANCED_TAGS | html.NO_SPECIAL_TAGS,
                          counters)
            print('XML %r' % s)
        print('XML attrs %r' % counters.debug_attrs)
447
448
# Run every TestCase in this module when the file is executed as a script.
if '__main__' == __name__:
    unittest.main()