OILS / lazylex / html_test.py View on Github | oils.pub

522 lines, 321 significant
1#!/usr/bin/env python2
2from __future__ import print_function
3
4import unittest
5
6from _devbuild.gen.htm8_asdl import h8_id, h8_id_str
7from data_lang import htm8
8from lazylex import html # module under test log = html.log
9
10from typing import List, Tuple
11
# Shorthand for html.log, used for debug output throughout these tests.
log = html.log

# Read the shared fixture once, at import time.  The path is relative, so it
# is resolved against the current working directory.
with open('data_lang/testdata/hello.htm8') as f:
    TEST_HTML = f.read()
16
17
class FunctionsTest(unittest.TestCase):
    """Smoke test for htm8.FindLineNum(); results are printed, not asserted."""

    def testFindLineNum(self):
        # type: () -> None
        text = 'foo\n' * 3
        # The text is 12 characters long, so the last position (50) is out
        # of bounds.
        positions = [1, 5, 10, 50]
        for pos in positions:
            print(htm8.FindLineNum(text, pos))
26
27
def _MakeTagLexer(s):
    # type: (str) -> html.TagLexer
    """Return an html.TagLexer over `s`, reset to span the whole string."""
    tag_lexer = html.TagLexer(s)
    whole_length = len(s)
    tag_lexer.Reset(0, whole_length)
    return tag_lexer
33
34
def _PrintTokens(lex):
    # type: (html.TagLexer) -> None
    """Log the tag name of `lex`, then every token with the text it covers."""
    log('')
    tag_name = lex.GetTagName()
    log('tag = %r', tag_name)
    for tok_id, begin, stop in lex.Tokens():
        # Each token is reported as (id, start, end) into the lexer's source.
        log('%s %r', tok_id, lex.s[begin:stop])
41
42
class TagLexerTest(unittest.TestCase):
    """Exercises html.TagLexer: tag name lookup and raw attribute access."""

    def testTagLexer(self):
        # type: () -> None
        # Note: '< >' is invalid, so it is deliberately not lexed here.

        _PrintTokens(_MakeTagLexer('<a>'))

        novalue = _MakeTagLexer('<a novalue>')
        _PrintTokens(novalue)

        # An attribute without a value reads back as the empty string.  A
        # separate HasAttr() method could tell "absent" from "empty":
        # https://developer.mozilla.org/en-US/docs/Web/API/Element/hasAttribute
        self.assertEqual('', novalue.GetAttrRaw('novalue'))

        quoted = _MakeTagLexer('<a href="double quoted">')
        _PrintTokens(quoted)

        self.assertEqual('double quoted', quoted.GetAttrRaw('href'))
        # A name that does not occur in the tag yields None.
        self.assertEqual(None, quoted.GetAttrRaw('oops'))

        # These two tags are only dumped to the log, nothing is asserted.
        for tag in ('<a href=foo class="bar">', '<a href=foo class="bar" />'):
            _PrintTokens(_MakeTagLexer(tag))

        # The raw value keeps the character entity undecoded.
        escaped = _MakeTagLexer('<a href="?foo=1&amp;bar=2" />')
        self.assertEqual('?foo=1&amp;bar=2', escaped.GetAttrRaw('href'))

    def testTagName(self):
        # type: () -> None
        tag_name = _MakeTagLexer('<a href=foo class="bar" />').GetTagName()
        self.assertEqual('a', tag_name)

    def testAllAttrs(self):
        # type: () -> None
        """AllAttrsRaw() returns a (name, raw value) pair for each attribute."""
        cases = [
            # self-closing tag
            ('<a href=foo class="bar" />', [('href', 'foo'), ('class', 'bar')]),
            ('<a href="?foo=1&amp;bar=2" />', [('href', '?foo=1&amp;bar=2')]),
        ]
        for tag, expected in cases:
            self.assertEqual(expected, _MakeTagLexer(tag).AllAttrsRaw())

    def testEmptyMissingValues(self):
        # type: () -> None
        # <button disabled> is equivalent to <button disabled="">
        button = _MakeTagLexer('<button disabled>')
        self.assertEqual([('disabled', '')], button.AllAttrsRaw())
        log('slices %s', button.AllAttrsRawSlice())

        para = _MakeTagLexer(
            '''<p double="" single='' empty= value missing empty2=>''')
        pairs = para.AllAttrsRaw()
        # Note that 'empty= value' binds the following word to 'empty'.
        expected = [
            ('double', ''),
            ('single', ''),
            ('empty', 'value'),
            ('missing', ''),
            ('empty2', ''),
        ]
        self.assertEqual(expected, pairs)
        # TODO: should have ... (this note was left unfinished)
        log('all %s', pairs)

        log('slices %s', para.AllAttrsRawSlice())

    def testInvalidTag(self):
        # type: () -> None
        # The stray '!' is not a valid attribute, so lexing must fail.
        try:
            _MakeTagLexer('<a foo=bar !></a>').AllAttrsRaw()
        except html.LexError as e:
            print(e)
        else:
            self.fail('Expected LexError')
130
131
def _MakeAttrValueLexer(s):
    # type: (str) -> html.AttrValueLexer
    """Return an html.AttrValueLexer over `s`, reset to span the whole string."""
    value_lexer = html.AttrValueLexer(s)
    whole_length = len(s)
    value_lexer.Reset(0, whole_length)
    return value_lexer
137
138
class AttrValueLexerTest(unittest.TestCase):
    """Checks token counting in html.AttrValueLexer."""

    def testGood(self):
        # type: () -> None
        # Presumably the three tokens are the text before the entity, the
        # &amp; entity, and the text after it -- confirm against the lexer.
        value_lexer = _MakeAttrValueLexer('?foo=42&amp;bar=99')
        self.assertEqual(3, value_lexer.NumTokens())
146
147
def Lex(h, no_special_tags=False):
    # type: (str, bool) -> List[Tuple[int, int]]
    """Lex `h` with html.ValidTokenList(), log every token, return the list.

    Args:
      h: the document text to lex
      no_special_tags: passed through to html.ValidTokenList()

    Returns:
      The (token id, end position) pairs from html.ValidTokenList().
    """
    print(repr(h))
    token_list = html.ValidTokenList(h, no_special_tags=no_special_tags)

    # Only end positions are stored; each token starts where the previous
    # one ended.
    begin = 0
    for kind, stop in token_list:
        log('%d %s %r', stop, h8_id_str(kind), h[begin:stop])
        begin = stop
    return token_list
158
159
class LexerTest(unittest.TestCase):
    """Checks the token lists produced for whole documents.

    Every expected list pairs an h8_id token id with the END offset of that
    token in the input string; Lex() slices the input the same way when it
    logs the tokens.

    Inputs whose leading whitespace contributes to the asserted offsets are
    spelled as explicit string literals, so that re-indenting this file
    cannot silently change them.
    """

    # IndexLinker in devtools/make_help.py
    # <pre> sections in doc/html_help.py
    # TocExtractor in devtools/cmark.py

    def testPstrip(self):
        # type: () -> None
        """Remove anything like this.

        <p><pstrip> </pstrip></p>

        Placeholder: nothing is exercised yet.
        """
        pass

    def testCommentParse(self):
        # type: () -> None
        # Smoke test: the whole fixture is lexed and logged, nothing is
        # asserted about the resulting tokens.
        Lex(TEST_HTML)

    def testCommentParse2(self):
        # type: () -> None
        # 8 spaces before 'hi' and 16 before 'line 2': the offsets below
        # depend on this whitespace.
        h = ('\n'
             '        hi <!-- line 1\n'
             '                line 2 --><br/>')
        tokens = Lex(h)

        self.assertEqual(
            [
                (h8_id.RawData, 12),  # newline, indentation, 'hi '
                (h8_id.Comment, 50),  # <!-- line 1 ... line 2 -->
                (h8_id.StartEndTag, 55),  # <br/>
                (h8_id.EndOfStream, 55),
            ],
            tokens)

    def testProcessingInstruction(self):
        # type: () -> None
        # <?xml ?> header
        h = 'hi <? err ?>'
        tokens = Lex(h)

        self.assertEqual(
            [
                (h8_id.RawData, 3),
                (h8_id.Processing, 12),  # <? err ?>
                (h8_id.EndOfStream, 12),
            ],
            tokens)

    def testScriptStyle(self):
        # type: () -> None
        # Every line after the first is indented by 8 spaces: the offsets
        # below depend on this whitespace.
        h = ('\n'
             '        hi <script src="">'
             ' if (x < 1 && y > 2 ) { console.log(""); }\n'
             '        </script>\n'
             '        ')
        tokens = Lex(h)

        expected = [
            (h8_id.RawData, 12),  # newline, indentation, 'hi '
            (h8_id.StartTag, 27),  # <script src="">
            (h8_id.HtmlCData, 78),  # JavaScript code is HTML CData
            (h8_id.EndTag, 87),  # </script>
            (h8_id.RawData, 96),  # trailing newline and indentation
            (h8_id.EndOfStream, 96),
        ]
        self.assertEqual(expected, tokens)

        # Test case matching: a mixed-case tag name gives the same tokens.
        tokens = Lex(h.replace('script', 'scrIPT'))
        self.assertEqual(expected, tokens)

    def testScriptStyleXml(self):
        # type: () -> None
        h = 'hi <script src=""> &lt; </script>'
        # XML mode: the body of <script> is lexed like any other element.
        tokens = Lex(h, no_special_tags=True)

        self.assertEqual(
            [
                (h8_id.RawData, 3),  # 'hi '
                (h8_id.StartTag, 18),  # <script src="">
                (h8_id.RawData, 19),  # space
                (h8_id.CharEntity, 23),  # &lt;
                (h8_id.RawData, 24),  # space
                (h8_id.EndTag, 33),  # </script>
                (h8_id.EndOfStream, 33),
            ],
            tokens)

    def testCData(self):
        # type: () -> None

        # from
        # /home/andy/src/languages/Python-3.11.5/Lib/test/xmltestdata/c14n-20/inC14N4.xml
        h = '<compute><![CDATA[value>"0" && value<"10" ?"valid":"error"]]></compute>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 9),
            (h8_id.CData, 61),
            (h8_id.EndTag, 71),
            (h8_id.EndOfStream, 71),
        ], tokens)

    def testEntity(self):
        # type: () -> None

        # from
        # /home/andy/src/Python-3.12.4/Lib/test/xmltestdata/c14n-20/inC14N5.xml
        h = '&ent1;, &ent2;!'

        tokens = Lex(h)

        self.assertEqual([
            (h8_id.CharEntity, 6),
            (h8_id.RawData, 8),
            (h8_id.CharEntity, 14),
            (h8_id.RawData, 15),
            (h8_id.EndOfStream, 15),
        ], tokens)

    def testStartTag(self):
        # type: () -> None

        h = '<a>hi</a>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 3),
            (h8_id.RawData, 5),
            (h8_id.EndTag, 9),
            (h8_id.EndOfStream, 9),
        ], tokens)

        # Make sure we don't consume too much
        h = '<a><source>1.7</source></a>'

        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 3),
            (h8_id.StartTag, 11),
            (h8_id.RawData, 14),
            (h8_id.EndTag, 23),
            (h8_id.EndTag, 27),
            (h8_id.EndOfStream, 27),
        ], tokens)

        # TODO: the multi-line case below is disabled by this early return.
        # Its expected list is an unfinished placeholder (the end positions
        # go backwards), so it cannot simply be switched on.
        return

        # The 8-space indentation matches the first two expected offsets;
        # the indentation of the <source> line is not pinned by anything.
        h = ('\n'
             '        <configuration>\n'
             '          <source>1.7</source>\n'
             '        </configuration>')

        tokens = Lex(h)

        self.assertEqual([
            (h8_id.RawData, 9),
            (h8_id.StartTag, 24),
            (h8_id.RawData, 9),
            (h8_id.EndOfStream, 9),
        ], tokens)

    def testBad(self):
        # type: () -> None
        # A lone '&' or '>' is still tokenized, under a dedicated "bad" id.
        h = '&'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.BadAmpersand, 1),
            (h8_id.EndOfStream, 1),
        ], tokens)

        h = '>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.BadGreaterThan, 1),
            (h8_id.EndOfStream, 1),
        ], tokens)

    def testInvalid(self):
        # type: () -> None
        # Every entry of INVALID_LEX must be rejected with LexError.
        for s in INVALID_LEX:
            try:
                html.ValidTokenList(s)
            except html.LexError as e:
                print(e)
            else:
                self.fail('Expected LexError %r' % s)

    def testValid(self):
        # type: () -> None
        # Only the input half of each VALID_LEX pair is used here; the test
        # passes as long as lexing does not raise.
        for s, _ in VALID_LEX:
            Lex(s)
            print()
357
358
# Inputs that must be rejected with html.LexError.  They are fed to
# html.ValidTokenList() by LexerTest.testInvalid and to html.Validate() by
# ValidateTest.testInvalid.
INVALID_LEX = [
    '<a><',
    '&amp<',
    '&<',
    # Hm > is allowed?
    #'a > b',
    'a < b',
    '<!-- unfinished comment',
    '<? unfinished processing',
    '</div bad=attr> <a> <b>',

    # not allowed, but 3 > 4 is allowed
    '<a> 3 < 4 </a>',
    # Not a CDATA tag
    '<STYLEz><</STYLEz>',
]
375
# Markers for the second element of the (html, expected_xml) pairs below, as
# interpreted by XmlTest.testValid:
#
#   SKIP      - no expectation yet; the ToXml() output is not compared
#   UNCHANGED - ToXml() must return the input unmodified
#
# Any other value is the exact XML expected from ToXml().
#
# SKIP is the empty string because that is the value XmlTest.testValid
# compares against; the tables previously spelled it as a bare ''.
SKIP = ''
UNCHANGED = 1

# Inputs that lex without error (LexerTest.testValid).
VALID_LEX = [
    # TODO: convert these to XML
    ('<foo></foo>', UNCHANGED),
    ('<foo x=y></foo>', SKIP),
    #('<foo x="&"></foo>', '<foo x="&amp;"></foo>'),
    ('<foo x="&"></foo>', SKIP),

    # Allowed with BadAmpersand
    ('<p> x & y </p>', '<p> x &amp; y </p>'),
]

# Inputs for which html.Validate() with html.BALANCED_TAGS must raise
# html.ParseError (ValidateTest.testInvalid).
INVALID_PARSE = [
    '<a></b>',
    '<a>',  # missing closing tag
    '<meta></meta>',  # this is a self-closing tag
]

# Inputs that html.Validate() with html.BALANCED_TAGS must accept
# (ValidateTest.testValid).
VALID_PARSE = [
    ('<!DOCTYPE html>\n', SKIP),
    ('<!DOCTYPE>', SKIP),

    # empty strings
    ('<p x=""></p>', UNCHANGED),
    ("<p x=''></p>", UNCHANGED),
    ('<self-closing a="b" />', UNCHANGED),

    # We could also normalize CDATA?
    # Note that CDATA has an escaping problem: you need to handle it ]]> with
    # concatenation.  It just "pushes the problem around".
    # So I think it's better to use ONE kind of escaping, which is &lt;
    ('<script><![CDATA[ <wtf> >< ]]></script>', UNCHANGED),

    # allowed, but 3 < 4 is not allowed
    ('<a> 3 > 4 </a>', '<a> 3 &gt; 4 </a>'),
    # allowed, but 3 > 4 is not allowed
    ('<p x="3 < 4"></p>', SKIP),
    ('<b><a href="foo">link</a></b>', UNCHANGED),

    # TODO: should be self-closing
    #('<meta><a></a>', '<meta/><a></a>'),
    ('<meta><a></a>', SKIP),

    # no attribute
    ('<button disabled></button>', SKIP),
    ('<button disabled=></button>', SKIP),
    ('<button disabled= ></button>', SKIP),

    # single quoted is pretty common
    ("<a href='single'></a>", SKIP),

    # Conceding to reality - I used these myself
    ('<a href=ble.sh></a>', SKIP),
    ('<a href=foo.html></a>', SKIP),
    ('<foo x="&"></foo>', SKIP),

    # caps
    ('<foo></FOO>', SKIP),
    ('<Foo></fOO>', SKIP),

    # capital VOID tag
    ('<META><a></a>', SKIP),
    ('<script><</script>', SKIP),
    # matching
    ('<SCRipt><</SCRipt>', SKIP),
    ('<SCRIPT><</SCRIPT>', SKIP),
    ('<STYLE><</STYLE>', SKIP),
    #'<SCRipt><</script>',

    # Note: Python HTMLParser.py does DYNAMIC compilation of regex with re.I
    # flag to handle this!  Gah I want something faster.
    #'<script><</SCRIPT>',

    # TODO: Test <svg> and <math> ?
]
453
# Inputs accepted when html.NO_SPECIAL_TAGS is added to the flags
# (ValidateTest.testValidXml).  Note that '<meta></meta>' also appears in
# INVALID_PARSE, where it is validated without that flag.
VALID_XML = [
    '<meta></meta>',
]

# Further inputs that html.Validate() must reject with html.LexError; they
# are appended to INVALID_LEX in ValidateTest.testInvalid only.
INVALID_TAG_LEX = [
    # not allowed, but 3 < 4 is allowed
    '<p x="3 > 4"></p>',
    # same thing
    '<a href=">"></a>',
    '<a foo=bar !></a>',  # bad attr
]
465
466
class ValidateTest(unittest.TestCase):
    """Runs html.Validate() over the module-level tables of inputs."""

    def testInvalid(self):
        # type: () -> None
        counters = html.Counters()

        # Lexing errors, both at the document and at the tag level.
        for s in INVALID_LEX + INVALID_TAG_LEX:
            try:
                html.Validate(s, html.BALANCED_TAGS, counters)
            except html.LexError as e:
                print(e)
            else:
                self.fail('Expected LexError %r' % s)

        # Inputs that lex but must be rejected with ParseError.
        for s in INVALID_PARSE:
            try:
                html.Validate(s, html.BALANCED_TAGS, counters)
            except html.ParseError as e:
                print(e)
            else:
                # Name the offending input, as the LexError loop does.
                self.fail('Expected ParseError %r' % s)

    def testValid(self):
        # type: () -> None
        # Passes as long as Validate() does not raise; the expected-XML half
        # of each pair is not used here.
        counters = html.Counters()
        for s, _ in VALID_PARSE:
            html.Validate(s, html.BALANCED_TAGS, counters)
            print('HTML5 %r' % s)
            #print('HTML5 attrs %r' % counters.debug_attrs)

    def testValidXml(self):
        # type: () -> None
        # Same check with NO_SPECIAL_TAGS added to the flags.
        counters = html.Counters()
        for s in VALID_XML:
            html.Validate(s, html.BALANCED_TAGS | html.NO_SPECIAL_TAGS,
                          counters)
            print('XML %r' % s)
            #print('XML attrs %r' % counters.debug_attrs)
504
505
class XmlTest(unittest.TestCase):
    """Checks html.ToXml() against the expectations in the input tables."""

    def testValid(self):
        # type: () -> None
        for h, expected_xml in VALID_LEX + VALID_PARSE:
            # Convert every input, even those without an expectation, so
            # that a crash in ToXml() is still caught.
            actual = html.ToXml(h)

            if expected_xml == '':
                # Skip: no expected output has been recorded yet.
                continue

            if expected_xml == UNCHANGED:
                # The input must come back unmodified.
                expected_xml = h
            self.assertEqual(expected_xml, actual)
519
520
# Run every TestCase in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()