OILS / data_lang / htm8_test.py View on Github | oils.pub

608 lines, 400 significant
1#!/usr/bin/env python2
2from __future__ import print_function
3
4from _devbuild.gen.htm8_asdl import (h8_id, h8_id_t, h8_id_str, attr_name,
5 attr_name_str, attr_value_e,
6 attr_value_str)
7
8import unittest
9import re
10
11from typing import List, Tuple, Any
12
13from data_lang import htm8
14from doctools.util import log
15
# Sample HTM8 document, read once at import time; LexerTest.testCommentParse
# runs the lexer over it.
with open('data_lang/testdata/hello.htm8') as f:
    TEST_HTML = f.read()
18
19
class RegexTest(unittest.TestCase):
    """Checks of how Python regexes treat newlines, plus a smoke test of
    htm8._ATTR_RE."""

    def testDotAll(self):
        # type: () -> None
        """Pin down which single-character patterns match a newline."""

        # '.' does not match a newline by default ...
        p1 = re.compile(r'.')
        m1 = p1.match('\n')
        print(m1)
        self.assertEqual(None, m1)

        # ... but it does with re.DOTALL
        p2 = re.compile(r'.', re.DOTALL)
        m2 = p2.match('\n')
        print(m2)
        self.assertEqual('\n', m2.group(0))

        # Inside a character class '.' is a literal dot, so it is the \n
        # member that matches here
        #p3 = re.compile(r'[.\n]', re.VERBOSE)
        p3 = re.compile(r'[.\n]')
        m3 = p3.match('\n')
        print(m3)
        self.assertEqual('\n', m3.group(0))

        print('Negation')

        # A negated character class matches newlines too
        p4 = re.compile(r'[^>]')
        m4 = p4.match('\n')
        print(m4)
        self.assertEqual('\n', m4.group(0))

    def testAttrRe(self):
        # type: () -> None
        """Print the groups htm8._ATTR_RE captures on ' empty= val'."""
        m = htm8._ATTR_RE.match(' empty= val')
        print(m.groups())
46
47
class FunctionsTest(unittest.TestCase):
    """Smoke test for htm8._FindLineNum()."""

    def testFindLineNum(self):
        # type: () -> None
        """Print the line number reported for several positions."""
        text = 'foo\n' * 3

        # The last position, 50, is out of bounds for this 12-character string
        positions = [1, 5, 10, 50]
        for p in positions:
            print(htm8._FindLineNum(text, p))
56
57
def _MakeAttrLexer(t, h, expected_tag=h8_id.StartTag):
    # type: (Any, str, h8_id_t) -> htm8.AttrLexer
    """Read the first token of `h` and return an AttrLexer initialized on it.

    Args:
      t: the running test case; only its assertEqual() is used
      h: HTM8 source text whose first token is the tag to examine
      expected_tag: token id that the first token is asserted to equal

    Returns:
      An htm8.AttrLexer over `h`, after Init() was called with the token id,
      the lexer's TagNamePos(), and the token's end position.
    """
    lx = htm8.Lexer(h)

    tok_id, end_pos = lx.Read()
    t.assertEqual(expected_tag, tok_id)

    attr_lx = htm8.AttrLexer(h)
    attr_lx.Init(tok_id, lx.TagNamePos(), end_pos)

    return attr_lx
70
71
class AttrLexerTest(unittest.TestCase):
    """Tests for htm8.AttrLexer: ReadName() and ReadValue() within one tag."""

    def testNoAttrs(self):
        # type: () -> None
        """A tag without attributes yields Done right away."""

        # TODO: h8_id.StartTag and EndTag will expose the tag_name_pos - the
        # end of the tag name

        h = 'x <a>'
        lx = htm8.Lexer(h)

        # Skip raw data
        tok_id, end_pos = lx.Read()
        self.assertEqual(h8_id.RawData, tok_id)

        tok_id, end_pos = lx.Read()
        self.assertEqual(h8_id.StartTag, tok_id)

        attr_lx = htm8.AttrLexer(h)
        attr_lx.Init(tok_id, lx.TagNamePos(), end_pos)

        # There is no attribute
        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Done)
        self.assertEqual(-1, name_start)
        self.assertEqual(-1, name_end)

        # Once Done was returned, both readers are expected to trip an
        # assertion
        with self.assertRaises(AssertionError) as cm:
            attr_lx.ReadValue()
        print(cm.exception)

        with self.assertRaises(AssertionError) as cm:
            attr_lx.ReadName()
        print(cm.exception)

    def testInvalid(self):
        # type: () -> None
        """A '!' where an attribute name should be yields Invalid."""
        h = '<a !>'
        attr_lx = _MakeAttrLexer(self, h)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Invalid)
        self.assertEqual(-1, name_start)
        self.assertEqual(-1, name_end)

        # Reading a value after an Invalid name is expected to trip an
        # assertion
        with self.assertRaises(AssertionError) as cm:
            attr_lx.ReadValue()
        print(cm.exception)

    def testEmpty(self):
        # type: () -> None
        """name= followed directly by '>' gives an Empty value."""
        h = '<img src=>'
        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(5, name_start)
        self.assertEqual(8, name_end)
        self.assertEqual(False, attr_lx.next_value_is_missing)

        self.assertEqual(True, attr_lx.AttrNameEquals('src'))
        self.assertEqual(False, attr_lx.AttrNameEquals('srcz'))

        v, attr_start, attr_end = attr_lx.ReadValue()
        log('v = %s', attr_value_str(v))
        self.assertEqual(attr_value_e.Empty, v)
        self.assertEqual(-1, attr_start)
        self.assertEqual(-1, attr_end)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Done)

    def testMissing(self):
        # type: () -> None
        """A name with no '=' gives a Missing value."""
        h = '<img SRC/>'
        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartEndTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(5, name_start)
        self.assertEqual(8, name_end)
        self.assertEqual(True, attr_lx.next_value_is_missing)

        # The name in the source is upper case, yet 'src' compares equal
        self.assertEqual(True, attr_lx.AttrNameEquals('src'))
        self.assertEqual(False, attr_lx.AttrNameEquals('srcz'))

        v, attr_start, attr_end = attr_lx.ReadValue()
        self.assertEqual(attr_value_e.Missing, v)
        self.assertEqual(-1, attr_start)
        self.assertEqual(-1, attr_end)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Done)

    def testUnquoted(self):
        # type: () -> None
        """An unquoted value stops before the trailing ' />'."""
        # CAREFUL: /> is a StartEndTag, and / is not part of unquoted value
        h = '<a x=foo />'
        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartEndTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(3, name_start)
        self.assertEqual(4, name_end)

        v, attr_start, attr_end = attr_lx.ReadValue()

        log('v = %s', attr_value_str(v))
        log('unquoted val %r', h[attr_start:attr_end])

        self.assertEqual(attr_value_e.Unquoted, v)
        self.assertEqual(5, attr_start)
        self.assertEqual(8, attr_end)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Done)

    def testDoubleQuoted(self):
        # type: () -> None
        """The span of a double quoted value excludes the quotes."""
        h = '<a x="f&">'
        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(3, name_start)
        self.assertEqual(4, name_end)

        v, attr_start, attr_end = attr_lx.ReadValue()

        log('v = %s', attr_value_str(v))
        log('val %r', h[attr_start:attr_end])

        self.assertEqual(attr_value_e.DoubleQuoted, v)
        self.assertEqual(6, attr_start)
        self.assertEqual(8, attr_end)
        # The lexer position ends up past the closing quote
        self.assertEqual(9, attr_lx.pos)

        n, name_start, name_end = attr_lx.ReadName()
        log('n = %r', attr_name_str(n))
        self.assertEqual(n, attr_name.Done)

    def testSingleQuoted(self):
        # type: () -> None
        """The span of a single quoted value excludes the quotes."""
        h = "<a x='&f'>"
        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(3, name_start)
        self.assertEqual(4, name_end)

        v, attr_start, attr_end = attr_lx.ReadValue()

        log('v = %s', attr_value_str(v))
        log('val %r', h[attr_start:attr_end])

        self.assertEqual(attr_value_e.SingleQuoted, v)
        self.assertEqual(6, attr_start)
        self.assertEqual(8, attr_end)
        # The lexer position ends up past the closing quote
        self.assertEqual(9, attr_lx.pos)

        n, name_start, name_end = attr_lx.ReadName()
        #log('n = %r', attr_name_str(n))
        self.assertEqual(n, attr_name.Done)

    def testDoubleQuoted_Bad(self):
        # type: () -> None
        """An unterminated double quoted value raises LexError."""
        h = '<a x="foo>'
        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(3, name_start)
        self.assertEqual(4, name_end)

        with self.assertRaises(htm8.LexError) as cm:
            attr_lx.ReadValue()
        print(cm.exception)

    def testSingleQuoted_Bad(self):
        # type: () -> None
        """An unterminated single quoted value raises LexError."""
        h = "<a x='foo>"
        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(3, name_start)
        self.assertEqual(4, name_end)

        with self.assertRaises(htm8.LexError) as cm:
            attr_lx.ReadValue()
        print(cm.exception)
271
272
class AttrLexerWrapperTest(unittest.TestCase):
    """Tests for htm8.GetAttrRaw() and htm8.AllAttrsRaw()."""

    def testGetAttrRaw(self):
        # type: () -> None
        """Absent attributes give None; a bare name gives ''."""
        attr_lx = _MakeAttrLexer(self, '<a>')
        #_PrintTokens(attr_lx)
        found = htm8.GetAttrRaw(attr_lx, 'oops')
        self.assertEqual(None, found)

        # <a novalue> means lex.Get('novalue') == ''
        # https://developer.mozilla.org/en-US/docs/Web/API/Element/hasAttribute
        # We are not distinguishing <a novalue=""> from <a novalue> in this API
        attr_lx = _MakeAttrLexer(self, '<a novalue>')
        #_PrintTokens(attr_lx)
        found = htm8.GetAttrRaw(attr_lx, 'novalue')
        self.assertEqual('', found)

    def testGetAttrRaw2(self):
        # type: () -> None
        """Look up a missing name first, then Reset() and find a real one."""
        attr_lx = _MakeAttrLexer(self, '<a href="double quoted">')
        #_PrintTokens(attr_lx)

        log('*** OOPS')
        missing = htm8.GetAttrRaw(attr_lx, 'oops')
        self.assertEqual(None, missing)

        attr_lx.Reset()

        log('*** DOUBLE')
        href = htm8.GetAttrRaw(attr_lx, 'href')
        self.assertEqual('double quoted', href)

    def testGetAttrRaw3(self):
        # type: () -> None
        """Same lookups as testGetAttrRaw2, in the opposite order."""
        attr_lx = _MakeAttrLexer(self, '<a href="double quoted">')
        #_PrintTokens(attr_lx)

        href = htm8.GetAttrRaw(attr_lx, 'href')
        self.assertEqual('double quoted', href)

        attr_lx.Reset()

        missing = htm8.GetAttrRaw(attr_lx, 'oops')
        self.assertEqual(None, missing)

    def testGetAttrRaw4(self):
        # type: () -> None
        """Lookups in tags with several attributes, open and self-closing."""
        # (html, expected first token, attribute name, expected raw value)
        cases = [
            ('<a href=foo class="bar">', h8_id.StartTag, 'class', 'bar'),
            ('<a href=foo class="bar" />', h8_id.StartEndTag, 'class', 'bar'),
            ('<a href="?foo=1&amp;bar=2" />', h8_id.StartEndTag, 'href',
             '?foo=1&amp;bar=2'),
        ]
        for html, tag_id, attr, expected in cases:
            # A fresh lexer per case
            attr_lx = _MakeAttrLexer(self, html, expected_tag=tag_id)
            #_PrintTokens(attr_lx)
            self.assertEqual(expected, htm8.GetAttrRaw(attr_lx, attr))

    def testAllAttrs(self):
        # type: () -> None
        """
        [('key', 'value')] for all
        """
        # Both tags are closed, i.e. StartEndTag
        cases = [
            ('<a href=foo class="bar" />', [('href', 'foo'),
                                            ('class', 'bar')]),
            ('<a href="?foo=1&amp;bar=2" />', [('href', '?foo=1&amp;bar=2')]),
        ]
        for html, expected in cases:
            attr_lx = _MakeAttrLexer(self,
                                     html,
                                     expected_tag=h8_id.StartEndTag)
            self.assertEqual(expected, htm8.AllAttrsRaw(attr_lx))

    def testEmptyMissingValues(self):
        # type: () -> None
        """Empty and missing values both come back as ''."""
        # equivalent to <button disabled="">
        attr_lx = _MakeAttrLexer(self, '<button disabled>')
        all_attrs = htm8.AllAttrsRaw(attr_lx)
        self.assertEqual([('disabled', '')], all_attrs)

        # TODO: restore this
        if 0:
            log('slices %s', attr_lx.AllAttrsRawSlice())

        html = '''<p double="" single='' empty= value missing empty2=>'''
        attr_lx = _MakeAttrLexer(self, html)
        all_attrs = htm8.AllAttrsRaw(attr_lx)
        expected = [
            ('double', ''),
            ('single', ''),
            ('empty', 'value'),
            ('missing', ''),
            ('empty2', ''),
        ]
        self.assertEqual(expected, all_attrs)
        # TODO: should have
        log('all %s', all_attrs)

        if 0:
            log('slices %s', attr_lx.AllAttrsRawSlice())

    def testInvalidTag(self):
        # type: () -> None
        """A stray '!' among the attributes ends in LexError."""
        try:
            attr_lx = _MakeAttrLexer(self, '<a foo=bar !></a>')
            htm8.AllAttrsRaw(attr_lx)
        except htm8.LexError as e:
            print(e)
            return
        self.fail('Expected LexError')
379
380
def ValidTokenList(s, no_special_tags=False):
    # type: (str, bool) -> List[Tuple[h8_id_t, int]]
    """Lex all of `s` and return its (token id, end position) pairs.

    Written as a plain loop, without iterators, so that it can be translated
    to C++ more easily.

    The returned list ends with the h8_id.EndOfStream token.  If the lexer
    produces h8_id.Invalid, htm8.LexError is raised with `s` and the end
    position of the previous token (0 when it is the first token).
    """
    lexer = htm8.Lexer(s, no_special_tags=no_special_tags)
    result = []
    prev_end = 0  # end of the last good token, reported on error
    done = False
    while not done:
        tok_id, tok_end = lexer.Read()
        result.append((tok_id, tok_end))
        if tok_id == h8_id.EndOfStream:
            done = True
        elif tok_id == h8_id.Invalid:
            raise htm8.LexError('ValidTokenList() got invalid token', s,
                                prev_end)
        else:
            prev_end = tok_end
    return result
398
399
def Lex(h, no_special_tags=False):
    # type: (str, bool) -> List[Tuple[h8_id_t, int]]
    """Lex `h` with ValidTokenList(), logging every token along the way.

    Prints repr(h) first, then logs one line per token: its end position,
    the token id name, and the source text the token covers.

    Returns the token list from ValidTokenList() unchanged, so any LexError
    raised there propagates to the caller.
    """
    print(repr(h))
    tokens = ValidTokenList(h, no_special_tags=no_special_tags)
    start_pos = 0
    for tok_id, end_pos in tokens:
        # Each token covers the text from the previous token's end to its own
        frag = h[start_pos:end_pos]
        log('%d %s %r', end_pos, h8_id_str(tok_id), frag)
        start_pos = end_pos
    return tokens
410
411
class LexerTest(unittest.TestCase):
    """Tests of the token stream produced by htm8.Lexer, via Lex().

    Expected values are (token id, end position) pairs, so whitespace inside
    the inputs is significant.  Multi-line inputs are spelled out as explicit
    string pieces to keep their leading spaces visible.
    """

    # IndexLinker in devtools/make_help.py
    #  <pre> sections in doc/html_help.py
    # TocExtractor in devtools/cmark.py

    def testPstrip(self):
        # type: () -> None
        """Remove anything like this.

        <p><pstrip> </pstrip></p>
        """
        pass

    def testCommentParse(self):
        # type: () -> None
        """Smoke test: the sample document lexes without an Invalid token."""
        Lex(TEST_HTML)

    def testCommentParse2(self):
        # type: () -> None
        """A comment may span several lines."""
        h = ('\n'
             '        hi <!-- line 1\n'
             '                line 2 --><br/>')
        tokens = Lex(h)

        self.assertEqual(
            [
                (h8_id.RawData, 12),
                (h8_id.Comment, 50),  # <!-- line 1 ... line 2 -->
                (h8_id.StartEndTag, 55),  # <br/>
                (h8_id.EndOfStream, 55),
            ],
            tokens)

    def testProcessingInstruction(self):
        # type: () -> None
        # <?xml ?> header
        h = 'hi <? err ?>'
        tokens = Lex(h)

        self.assertEqual(
            [
                (h8_id.RawData, 3),
                (h8_id.Processing, 12),  # <? err ?>
                (h8_id.EndOfStream, 12),
            ],
            tokens)

    def testScriptStyle(self):
        # type: () -> None
        """The body of <script> is one HtmlCData token, in any letter case."""
        h = ('\n'
             '        hi <script src=""> if (x < 1 && y > 2 ) { console.log(""); }\n'
             '        </script>\n'
             '        ')
        tokens = Lex(h)

        expected = [
            (h8_id.RawData, 12),
            (h8_id.StartTag, 27),  # <script src="">
            (h8_id.HtmlCData, 78),  # JavaScript code is HTML CData
            (h8_id.EndTag, 87),  # </script>
            (h8_id.RawData, 96),  # newline and trailing spaces
            (h8_id.EndOfStream, 96),
        ]
        self.assertEqual(expected, tokens)

        # Test case matching
        tokens = Lex(h.replace('script', 'scrIPT'))
        self.assertEqual(expected, tokens)

    def testScriptStyleXml(self):
        # type: () -> None
        """With no_special_tags, <script> content is lexed like any other."""
        h = 'hi <script src=""> &lt; </script>'
        # XML mode
        tokens = Lex(h, no_special_tags=True)

        self.assertEqual(
            [
                (h8_id.RawData, 3),
                (h8_id.StartTag, 18),  # <script src="">
                (h8_id.RawData, 19),  # space
                (h8_id.CharEntity, 23),  # &lt;
                (h8_id.RawData, 24),  # space
                (h8_id.EndTag, 33),  # </script>
                (h8_id.EndOfStream, 33),
            ],
            tokens)

    def testCData(self):
        # type: () -> None

        # from
        # /home/andy/src/languages/Python-3.11.5/Lib/test/xmltestdata/c14n-20/inC14N4.xml
        h = '<compute><![CDATA[value>"0" && value<"10" ?"valid":"error"]]></compute>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 9),
            (h8_id.CData, 61),
            (h8_id.EndTag, 71),
            (h8_id.EndOfStream, 71),
        ], tokens)

    def testEntity(self):
        # type: () -> None

        # from
        # /home/andy/src/Python-3.12.4/Lib/test/xmltestdata/c14n-20/inC14N5.xml
        h = '&ent1;, &ent2;!'

        tokens = Lex(h)

        self.assertEqual([
            (h8_id.CharEntity, 6),
            (h8_id.RawData, 8),
            (h8_id.CharEntity, 14),
            (h8_id.RawData, 15),
            (h8_id.EndOfStream, 15),
        ], tokens)

    def testStartTag(self):
        # type: () -> None

        h = '<a>hi</a>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 3),
            (h8_id.RawData, 5),
            (h8_id.EndTag, 9),
            (h8_id.EndOfStream, 9),
        ], tokens)

        # Make sure we don't consume too much
        h = '<a><source>1.7</source></a>'

        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 3),
            (h8_id.StartTag, 11),
            (h8_id.RawData, 14),
            (h8_id.EndTag, 23),
            (h8_id.EndTag, 27),
            (h8_id.EndOfStream, 27),
        ], tokens)

        # Disabled case: this used to sit, unreachable, after a bare `return`.
        # NOTE(review): the expected list looks unfinished (end positions go
        # backwards), and the indentation of the inner lines of `h` is a
        # guess - confirm both before enabling.
        if 0:
            h = ('\n'
                 '        <configuration>\n'
                 '          <source>1.7</source>\n'
                 '        </configuration>')

            tokens = Lex(h)

            self.assertEqual([
                (h8_id.RawData, 9),
                (h8_id.StartTag, 24),
                (h8_id.RawData, 9),
                (h8_id.EndOfStream, 9),
            ], tokens)

    def testBad(self):
        # type: () -> None
        """A lone & or > gets its own 'Bad' token id."""
        h = '&'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.BadAmpersand, 1),
            (h8_id.EndOfStream, 1),
        ], tokens)

        h = '>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.BadGreaterThan, 1),
            (h8_id.EndOfStream, 1),
        ], tokens)

    def testEndOfStream(self):
        # type: () -> None

        # NUL is end
        h = 'a\0b'
        tokens = Lex(h)

        # Nothing after the NUL byte is lexed
        self.assertEqual([
            (h8_id.RawData, 1),
            (h8_id.EndOfStream, 2),
        ], tokens)
605
606
if __name__ == '__main__':
    # Run every TestCase in this module when executed as a script
    unittest.main()