OILS / data_lang / htm8_test.py View on Github | oils.pub

607 lines, 399 significant
1#!/usr/bin/env python2
2from __future__ import print_function
3
4from _devbuild.gen.htm8_asdl import (h8_id, h8_id_t, h8_id_str, attr_name,
5 attr_name_str, attr_value_e,
6 attr_value_str)
7
8import unittest
9import re
10
11from typing import List, Tuple, Any
12
13from data_lang import htm8
14from doctools.util import log
15
# Sample HTM8 document, read once at import time and lexed by
# LexerTest.testCommentParse.  The path is relative, so it is resolved against
# the current working directory (presumably the repository root - confirm
# against how the test runner invokes this file).
with open('data_lang/testdata/hello.htm8') as f:
    TEST_HTML = f.read()
18
19
class RegexTest(unittest.TestCase):
    """Sanity checks on Python `re` behavior and on htm8._ATTR_RE."""

    def testDotAll(self):
        # type: () -> None

        # Without re.DOTALL, '.' matches any character except a newline
        p1 = re.compile(r'.')
        m1 = p1.match('\n')
        print(m1)
        self.assertIsNone(m1)

        # With re.DOTALL, '.' also matches a newline
        p2 = re.compile(r'.', re.DOTALL)
        m2 = p2.match('\n')
        print(m2)
        self.assertIsNotNone(m2)
        self.assertEqual('\n', m2.group(0))

        # Inside a character class '.' is a literal dot, so the newline has
        # to be listed explicitly
        #p3 = re.compile(r'[.\n]', re.VERBOSE)
        p3 = re.compile(r'[.\n]')
        m3 = p3.match('\n')
        print(m3)
        self.assertIsNotNone(m3)
        self.assertEqual('\n', m3.group(0))

        print('Negation')

        # A negated character class matches a newline, even without re.DOTALL
        p4 = re.compile(r'[^>]')
        m4 = p4.match('\n')
        print(m4)
        self.assertIsNotNone(m4)
        self.assertEqual('\n', m4.group(0))

    def testAttrRe(self):
        # type: () -> None
        _ATTR_RE = htm8._ATTR_RE
        m = _ATTR_RE.match(' empty= val')
        # Fail with a clear message rather than an AttributeError on None
        self.assertIsNotNone(m)
        print(m.groups())
46
47
class FunctionsTest(unittest.TestCase):
    """Smoke test for htm8._FindLineNum; it prints results and asserts nothing."""

    def testFindLineNum(self):
        # type: () -> None
        text = 'foo\n' * 3
        # The text is 12 characters long, so the last offset is out of bounds
        offsets = [1, 5, 10, 50]
        for offset in offsets:
            print(htm8._FindLineNum(text, offset))
56
57
def _MakeAttrLexer(t, h, expected_tag=h8_id.StartTag):
    # type: (Any, str, h8_id_t) -> htm8.AttrLexer
    """Build an htm8.AttrLexer for the tag that is the first token of `h`.

    Args:
      t: test case; its assertEqual() checks the kind of the first token
      h: HTM8 source text, which must begin with the tag to examine
      expected_tag: the h8_id the first token is required to have

    Returns:
      An htm8.AttrLexer over `h`, initialized with the lexer's TagNamePos()
      and the end position of the first token.
    """
    lx = htm8.Lexer(h)

    tok_id, end_pos = lx.Read()
    t.assertEqual(expected_tag, tok_id)

    attr_lx = htm8.AttrLexer(h)
    attr_lx.Init(lx.TagNamePos(), end_pos)

    return attr_lx
70
71
class AttrLexerTest(unittest.TestCase):
    """Drives htm8.AttrLexer with ReadName() / ReadValue(), one attribute at a time.

    Expected positions are offsets into the test string `h`.
    """

    def testNoAttrs(self):
        # type: () -> None

        # TODO: h8_id.StartTag and EndTag will expose the tag_name_pos - the
        # end of the tag name

        h = 'x <a>'
        lx = htm8.Lexer(h)

        # Skip raw data
        tok_id, end_pos = lx.Read()
        self.assertEqual(h8_id.RawData, tok_id)

        tok_id, end_pos = lx.Read()
        self.assertEqual(h8_id.StartTag, tok_id)

        attr_lx = htm8.AttrLexer(h)
        attr_lx.Init(lx.TagNamePos(), end_pos)

        # The tag has no attributes
        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Done)
        self.assertEqual(-1, name_start)
        self.assertEqual(-1, name_end)

        # Once Done has been returned, reading a value or another name is
        # expected to trip an assertion
        with self.assertRaises(AssertionError) as cm:
            attr_lx.ReadValue()
        print(cm.exception)

        with self.assertRaises(AssertionError) as cm:
            attr_lx.ReadName()
        print(cm.exception)

    def testInvalid(self):
        h = '<a !>'
        attr_lx = _MakeAttrLexer(self, h)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Invalid)
        self.assertEqual(-1, name_start)
        self.assertEqual(-1, name_end)

        # There is no value to read after an Invalid name
        with self.assertRaises(AssertionError) as cm:
            attr_lx.ReadValue()
        print(cm.exception)

    def testEmpty(self):
        # 'src=' is followed directly by the end of the tag
        h = '<img src=/>'
        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartEndTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        # h[5:8] == 'src'
        self.assertEqual(5, name_start)
        self.assertEqual(8, name_end)
        self.assertEqual(False, attr_lx.next_value_is_missing)

        self.assertEqual(True, attr_lx.AttrNameEquals('src'))
        self.assertEqual(False, attr_lx.AttrNameEquals('srcz'))

        v, attr_start, attr_end = attr_lx.ReadValue()
        log('v = %s', attr_value_str(v))
        self.assertEqual(attr_value_e.Empty, v)
        self.assertEqual(-1, attr_start)
        self.assertEqual(-1, attr_end)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Done)

    def testMissing(self):
        # The attribute name has no '=' at all
        h = '<img SRC/>'
        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartEndTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        # h[5:8] == 'SRC'
        self.assertEqual(5, name_start)
        self.assertEqual(8, name_end)
        self.assertEqual(True, attr_lx.next_value_is_missing)

        # The name in the source is upper case, the query is lower case
        self.assertEqual(True, attr_lx.AttrNameEquals('src'))
        self.assertEqual(False, attr_lx.AttrNameEquals('srcz'))

        v, attr_start, attr_end = attr_lx.ReadValue()
        self.assertEqual(attr_value_e.Missing, v)
        self.assertEqual(-1, attr_start)
        self.assertEqual(-1, attr_end)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Done)

    def testUnquoted(self):
        # CAREFUL: /> is a StartEndTag, and / is not part of unquoted value
        h = '<a x=foo/>'
        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartEndTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(3, name_start)
        self.assertEqual(4, name_end)

        v, attr_start, attr_end = attr_lx.ReadValue()

        log('v = %s', attr_value_str(v))
        log('unquoted val %r', h[attr_start:attr_end])

        self.assertEqual(attr_value_e.Unquoted, v)
        # h[5:8] == 'foo'
        self.assertEqual(5, attr_start)
        self.assertEqual(8, attr_end)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Done)

    def testDoubleQuoted(self):
        h = '<a x="f&">'
        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(3, name_start)
        self.assertEqual(4, name_end)

        v, attr_start, attr_end = attr_lx.ReadValue()

        log('v = %s', attr_value_str(v))
        log('val %r', h[attr_start:attr_end])

        self.assertEqual(attr_value_e.DoubleQuoted, v)
        # The value span excludes the quotes: h[6:8] == 'f&'
        self.assertEqual(6, attr_start)
        self.assertEqual(8, attr_end)
        # The lexer position is just past the closing quote
        self.assertEqual(9, attr_lx.pos)

        n, name_start, name_end = attr_lx.ReadName()
        log('n = %r', attr_name_str(n))
        self.assertEqual(n, attr_name.Done)

    def testSingleQuoted(self):
        h = "<a x='&f'>"
        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(3, name_start)
        self.assertEqual(4, name_end)

        v, attr_start, attr_end = attr_lx.ReadValue()

        log('v = %s', attr_value_str(v))
        log('val %r', h[attr_start:attr_end])

        self.assertEqual(attr_value_e.SingleQuoted, v)
        # The value span excludes the quotes: h[6:8] == '&f'
        self.assertEqual(6, attr_start)
        self.assertEqual(8, attr_end)
        # The lexer position is just past the closing quote
        self.assertEqual(9, attr_lx.pos)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Done)

    def testDoubleQuoted_Bad(self):
        # The closing double quote is missing
        h = '<a x="foo>'
        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(3, name_start)
        self.assertEqual(4, name_end)

        with self.assertRaises(htm8.LexError) as cm:
            attr_lx.ReadValue()
        print(cm.exception)

    def testSingleQuoted_Bad(self):
        # The closing single quote is missing
        h = "<a x='foo>"
        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(3, name_start)
        self.assertEqual(4, name_end)

        with self.assertRaises(htm8.LexError) as cm:
            attr_lx.ReadValue()
        print(cm.exception)
271
272
class AttrLexerWrapperTest(unittest.TestCase):
    """Checks htm8.GetAttrRaw() and htm8.AllAttrsRaw() on lexers from _MakeAttrLexer()."""

    def testGetAttrRaw(self):
        # type: () -> None
        # An attribute that is not present is reported as None
        without_attrs = _MakeAttrLexer(self, '<a>')
        self.assertEqual(None, htm8.GetAttrRaw(without_attrs, 'oops'))

        # <a novalue> means lex.Get('novalue') == ''
        # https://developer.mozilla.org/en-US/docs/Web/API/Element/hasAttribute
        # We are not distinguishing <a novalue=""> from <a novalue> in this API
        bare_attr = _MakeAttrLexer(self, '<a novalue>')
        self.assertEqual('', htm8.GetAttrRaw(bare_attr, 'novalue'))

    def testGetAttrRaw2(self):
        # A failed lookup first, then (after Reset) a successful one
        attr_lx = _MakeAttrLexer(self, '<a href="double quoted">')

        log('*** OOPS')
        missing = htm8.GetAttrRaw(attr_lx, 'oops')
        self.assertEqual(None, missing)

        attr_lx.Reset()

        log('*** DOUBLE')
        found = htm8.GetAttrRaw(attr_lx, 'href')
        self.assertEqual('double quoted', found)

    def testGetAttrRaw3(self):
        """Same lookups as testGetAttrRaw2, in the opposite order."""
        attr_lx = _MakeAttrLexer(self, '<a href="double quoted">')

        found = htm8.GetAttrRaw(attr_lx, 'href')
        self.assertEqual('double quoted', found)

        attr_lx.Reset()

        missing = htm8.GetAttrRaw(attr_lx, 'oops')
        self.assertEqual(None, missing)

    def testGetAttrRaw4(self):
        # The wanted attribute is the second one in a start tag
        in_start_tag = _MakeAttrLexer(self, '<a href=foo class="bar">')
        self.assertEqual('bar', htm8.GetAttrRaw(in_start_tag, 'class'))

        # Same attributes, in a self-closing tag
        in_self_closing = _MakeAttrLexer(self,
                                         '<a href=foo class="bar" />',
                                         expected_tag=h8_id.StartEndTag)
        self.assertEqual('bar', htm8.GetAttrRaw(in_self_closing, 'class'))

        # The raw value comes back with its character entity still encoded
        with_entity = _MakeAttrLexer(self,
                                     '<a href="?foo=1&amp;bar=2" />',
                                     expected_tag=h8_id.StartEndTag)
        self.assertEqual('?foo=1&amp;bar=2',
                         htm8.GetAttrRaw(with_entity, 'href'))

    def testAllAttrs(self):
        # type: () -> None
        """AllAttrsRaw() gives a list of (name, raw value) pairs, in source order."""
        # Self-closing tag with two attributes
        two_attrs = _MakeAttrLexer(self,
                                   '<a href=foo class="bar" />',
                                   expected_tag=h8_id.StartEndTag)
        expected_pairs = [('href', 'foo'), ('class', 'bar')]
        self.assertEqual(expected_pairs, htm8.AllAttrsRaw(two_attrs))

        # The character entity in the value is not decoded
        with_entity = _MakeAttrLexer(self,
                                     '<a href="?foo=1&amp;bar=2" />',
                                     expected_tag=h8_id.StartEndTag)
        expected_pairs = [('href', '?foo=1&amp;bar=2')]
        self.assertEqual(expected_pairs, htm8.AllAttrsRaw(with_entity))

    def testEmptyMissingValues(self):
        # type: () -> None
        # equivalent to <button disabled="">
        attr_lx = _MakeAttrLexer(self, '<button disabled>')
        pairs = htm8.AllAttrsRaw(attr_lx)
        self.assertEqual([('disabled', '')], pairs)

        # TODO: restore this
        if 0:
            slices = attr_lx.AllAttrsRawSlice()
            log('slices %s', slices)

        # Every way of writing an empty or missing value in a single tag.
        # Note that 'empty= value' yields the value 'value'.
        attr_lx = _MakeAttrLexer(
            self, '''<p double="" single='' empty= value missing empty2=>''')
        pairs = htm8.AllAttrsRaw(attr_lx)
        expected_pairs = [
            ('double', ''),
            ('single', ''),
            ('empty', 'value'),
            ('missing', ''),
            ('empty2', ''),
        ]
        self.assertEqual(expected_pairs, pairs)
        # TODO: the original note here is truncated ("should have")
        log('all %s', pairs)

        if 0:
            slices = attr_lx.AllAttrsRawSlice()
            log('slices %s', slices)

    def testInvalidTag(self):
        # type: () -> None
        # A stray '!' after a valid attribute must be reported as a LexError
        try:
            attr_lx = _MakeAttrLexer(self, '<a foo=bar !></a>')
            htm8.AllAttrsRaw(attr_lx)
        except htm8.LexError as err:
            print(err)
        else:
            self.fail('Expected LexError')
379
380
def ValidTokenList(s, no_special_tags=False):
    # type: (str, bool) -> List[Tuple[h8_id_t, int]]
    """Lex all of `s` and return its tokens as a list.

    Written as a plain loop rather than an iterator, so that it can be more
    easily translated to C++.

    Args:
      s: the text to lex
      no_special_tags: passed through to htm8.Lexer

    Returns:
      A list of (token id, end position) pairs.  The last pair is always the
      EndOfStream token.

    Raises:
      htm8.LexError: on an Invalid token; constructed with `s` and the position
        where that token starts.
    """
    lexer = htm8.Lexer(s, no_special_tags=no_special_tags)
    result = []  # type: List[Tuple[h8_id_t, int]]
    # End of the previous token, i.e. the start of the one being read
    token_start = 0
    while True:
        kind, token_end = lexer.Read()
        result.append((kind, token_end))
        if kind == h8_id.EndOfStream:
            return result
        if kind == h8_id.Invalid:
            raise htm8.LexError(s, token_start)
        token_start = token_end
397
398
def Lex(h, no_special_tags=False):
    # type: (str, bool) -> List[Tuple[h8_id_t, int]]
    """Lex `h` with ValidTokenList(), logging each token, and return the tokens.

    Args:
      h: the text to lex; its repr() is printed first
      no_special_tags: passed through to ValidTokenList()

    Returns:
      The list of (token id, end position) pairs from ValidTokenList().
    """
    print(repr(h))
    tokens = ValidTokenList(h, no_special_tags=no_special_tags)
    start_pos = 0
    for tok_id, end_pos in tokens:
        # Each token spans from the end of the previous one to its own end
        frag = h[start_pos:end_pos]
        log('%d %s %r', end_pos, h8_id_str(tok_id), frag)
        start_pos = end_pos
    return tokens
409
410
class LexerTest(unittest.TestCase):
    """Checks the (token id, end position) pairs that Lex() returns.

    NOTE: whitespace inside the multi-line string literals is significant,
    because the expected end positions are offsets into those strings.
    """

    # IndexLinker in devtools/make_help.py
    # <pre> sections in doc/html_help.py
    # TocExtractor in devtools/cmark.py

    def testPstrip(self):
        # type: () -> None
        """Remove anything like this.

        <p><pstrip> </pstrip></p>
        """
        pass

    def testCommentParse(self):
        # type: () -> None
        # Lex() raises htm8.LexError on an Invalid token, so getting here
        # means the whole sample document was lexed
        tokens = Lex(TEST_HTML)

        # ValidTokenList() only returns after appending the EndOfStream token
        self.assertEqual(h8_id.EndOfStream, tokens[-1][0])

    def testCommentParse2(self):
        # type: () -> None
        h = '''
        hi <!-- line 1
                line 2 --><br/>'''
        tokens = Lex(h)

        self.assertEqual(
            [
                (h8_id.RawData, 12),  # newline, indentation, 'hi '
                (h8_id.Comment, 50),  # <!-- ... -->, spanning two lines
                (h8_id.StartEndTag, 55),  # <br/>
                (h8_id.EndOfStream, 55),
            ],
            tokens)

    def testProcessingInstruction(self):
        # type: () -> None
        # <?xml ?> header
        h = 'hi <? err ?>'
        tokens = Lex(h)

        self.assertEqual(
            [
                (h8_id.RawData, 3),
                (h8_id.Processing, 12),  # <? err ?>
                (h8_id.EndOfStream, 12),
            ],
            tokens)

    def testScriptStyle(self):
        # type: () -> None
        h = '''
        hi <script src=""> if (x < 1 && y > 2 ) { console.log(""); }
        </script>
        '''
        tokens = Lex(h)

        expected = [
            (h8_id.RawData, 12),
            (h8_id.StartTag, 27),  # <script src="">
            (h8_id.HtmlCData, 78),  # JavaScript code is HTML CData
            (h8_id.EndTag, 87),  # </script>
            (h8_id.RawData, 96),  # newline and trailing indentation
            (h8_id.EndOfStream, 96),
        ]
        self.assertEqual(expected, tokens)

        # Test case matching
        tokens = Lex(h.replace('script', 'scrIPT'))
        self.assertEqual(expected, tokens)

    def testScriptStyleXml(self):
        # type: () -> None
        h = 'hi <script src=""> &lt; </script>'
        # XML mode
        tokens = Lex(h, no_special_tags=True)

        self.assertEqual(
            [
                (h8_id.RawData, 3),
                (h8_id.StartTag, 18),  # <script src="">
                (h8_id.RawData, 19),  # space
                (h8_id.CharEntity, 23),  # &lt;
                (h8_id.RawData, 24),  # space
                (h8_id.EndTag, 33),  # </script>
                (h8_id.EndOfStream, 33),
            ],
            tokens)

    def testCData(self):
        # type: () -> None

        # from
        # /home/andy/src/languages/Python-3.11.5/Lib/test/xmltestdata/c14n-20/inC14N4.xml
        h = '<compute><![CDATA[value>"0" && value<"10" ?"valid":"error"]]></compute>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 9),
            (h8_id.CData, 61),
            (h8_id.EndTag, 71),
            (h8_id.EndOfStream, 71),
        ], tokens)

    def testEntity(self):
        # type: () -> None

        # from
        # /home/andy/src/Python-3.12.4/Lib/test/xmltestdata/c14n-20/inC14N5.xml
        h = '&ent1;, &ent2;!'

        tokens = Lex(h)

        self.assertEqual([
            (h8_id.CharEntity, 6),
            (h8_id.RawData, 8),
            (h8_id.CharEntity, 14),
            (h8_id.RawData, 15),
            (h8_id.EndOfStream, 15),
        ], tokens)

    def testStartTag(self):
        # type: () -> None

        h = '<a>hi</a>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 3),
            (h8_id.RawData, 5),
            (h8_id.EndTag, 9),
            (h8_id.EndOfStream, 9),
        ], tokens)

        # Make sure we don't consume too much
        h = '<a><source>1.7</source></a>'

        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 3),
            (h8_id.StartTag, 11),
            (h8_id.RawData, 14),
            (h8_id.EndTag, 23),
            (h8_id.EndTag, 27),
            (h8_id.EndOfStream, 27),
        ], tokens)

        # NOTE(review): everything after this `return` is unreachable.  The
        # expected list below looks like a placeholder (its end positions are
        # not increasing), so the case can't simply be re-enabled - confirm
        # the intended tokens before removing the `return`.
        return

        h = '''
        <configuration>
          <source>1.7</source>
        </configuration>'''

        tokens = Lex(h)

        self.assertEqual([
            (h8_id.RawData, 9),
            (h8_id.StartTag, 24),
            (h8_id.RawData, 9),
            (h8_id.EndOfStream, 9),
        ], tokens)

    def testBad(self):
        # type: () -> None
        # A lone '&' gets its own token kind
        h = '&'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.BadAmpersand, 1),
            (h8_id.EndOfStream, 1),
        ], tokens)

        # So does a lone '>'
        h = '>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.BadGreaterThan, 1),
            (h8_id.EndOfStream, 1),
        ], tokens)

    def testEndOfStream(self):
        # type: () -> None

        # NUL is end: the 'b' after it is never returned as a token
        h = 'a\0b'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.RawData, 1),
            (h8_id.EndOfStream, 2),
        ], tokens)
604
605
# Run all the test cases in this module when the file is executed as a script
if __name__ == '__main__':
    unittest.main()