OILS / data_lang / htm8_test.py View on Github | oils.pub

579 lines, 384 significant
1#!/usr/bin/env python2
2from __future__ import print_function
3
4from _devbuild.gen.htm8_asdl import (h8_id, h8_id_t, h8_id_str, attr_name,
5 attr_name_str, attr_value_e,
6 attr_value_str)
7
8import unittest
9
10from typing import List, Tuple, Any
11
12from data_lang import htm8
13from doctools.util import log
14
# Shared fixture, read once when this module is imported.  The path is
# relative, so it is resolved against the current working directory.
with open('data_lang/testdata/hello.htm8') as f:
    TEST_HTML = f.read()
17
18
class FunctionsTest(unittest.TestCase):
    """Smoke tests for module-level helper functions in htm8."""

    def testFindLineNum(self):
        # type: () -> None

        # Three lines, 12 characters in total.
        text = 'foo\n' * 3

        # The last offset, 50, lies past the end of the text.  Results are
        # only printed; nothing is asserted about them.
        offsets = (1, 5, 10, 50)
        for offset in offsets:
            print(htm8._FindLineNum(text, offset))
27
28
def _MakeAttrLexer(t, h, expected_tag=h8_id.StartTag):
    # type: (Any, str, h8_id_t) -> htm8.AttrLexer
    """Return an AttrLexer initialized on the first token of `h`.

    Args:
      t: test case object; its assertEqual() checks the first token's id
      h: HTM8 source text whose first token is the tag to examine
      expected_tag: token id the first token must have

    Returns:
      An htm8.AttrLexer over `h`, after Init() was called with the token
      id, the lexer's TagNamePos(), and the token's end position.
    """
    lx = htm8.Lexer(h)

    tok_id, end_pos = lx.Read()
    t.assertEqual(expected_tag, tok_id)

    attr_lx = htm8.AttrLexer(h)
    attr_lx.Init(tok_id, lx.TagNamePos(), end_pos)

    return attr_lx
41
42
class AttrLexerTest(unittest.TestCase):
    """Tests for htm8.AttrLexer: alternating ReadName() / ReadValue() calls."""

    def testNoAttrs(self):
        # type: () -> None

        # TODO: h8_id.StartTag and EndTag will expose the tag_name_pos - the
        # end of the tag name

        h = 'x <a>'
        lx = htm8.Lexer(h)

        # Skip raw data
        tok_id, end_pos = lx.Read()
        self.assertEqual(h8_id.RawData, tok_id)

        tok_id, end_pos = lx.Read()
        self.assertEqual(h8_id.StartTag, tok_id)

        attr_lx = htm8.AttrLexer(h)
        attr_lx.Init(tok_id, lx.TagNamePos(), end_pos)

        # The tag has no attributes
        n, name_start, name_end, _ = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Done)
        self.assertEqual(-1, name_start)
        self.assertEqual(-1, name_end)

        # Once Done was returned, further reads must trip an assertion
        with self.assertRaises(AssertionError) as cm:
            attr_lx.ReadValue()
        print(cm.exception)

        with self.assertRaises(AssertionError) as cm:
            attr_lx.ReadName()
        print(cm.exception)

    def testInvalid(self):
        # type: () -> None
        h = '<a !>'
        attr_lx = _MakeAttrLexer(self, h)

        n, name_start, name_end, _ = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Invalid)
        self.assertEqual(-1, name_start)
        self.assertEqual(-1, name_end)

        # There is no value to read after an Invalid name
        with self.assertRaises(AssertionError) as cm:
            attr_lx.ReadValue()
        print(cm.exception)

    def testEmpty(self):
        # type: () -> None
        h = '<img src=>'
        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartTag)

        n, name_start, name_end, _ = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(5, name_start)
        self.assertEqual(8, name_end)
        self.assertEqual(False, attr_lx.next_value_is_missing)

        self.assertEqual(True, attr_lx.AttrNameEquals('src'))
        self.assertEqual(False, attr_lx.AttrNameEquals('srcz'))

        v, attr_start, attr_end = attr_lx.ReadValue()
        log('v = %s', attr_value_str(v))
        self.assertEqual(attr_value_e.Empty, v)
        self.assertEqual(-1, attr_start)
        self.assertEqual(-1, attr_end)

        n, name_start, name_end, _ = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Done)

    def testMissing(self):
        # type: () -> None
        h = '<img SRC/>'
        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartEndTag)

        n, name_start, name_end, _ = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(5, name_start)
        self.assertEqual(8, name_end)
        self.assertEqual(True, attr_lx.next_value_is_missing)

        # The source spells the name 'SRC'; it must still equal 'src'
        self.assertEqual(True, attr_lx.AttrNameEquals('src'))
        self.assertEqual(False, attr_lx.AttrNameEquals('srcz'))

        v, attr_start, attr_end = attr_lx.ReadValue()
        self.assertEqual(attr_value_e.Missing, v)
        self.assertEqual(-1, attr_start)
        self.assertEqual(-1, attr_end)

        n, name_start, name_end, _ = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Done)

    def testUnquoted(self):
        # type: () -> None
        # CAREFUL: /> is a StartEndTag, and / is not part of unquoted value
        h = '<a x=foo />'
        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartEndTag)

        n, name_start, name_end, _ = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(3, name_start)
        self.assertEqual(4, name_end)

        v, attr_start, attr_end = attr_lx.ReadValue()

        log('v = %s', attr_value_str(v))
        log('unquoted val %r', h[attr_start:attr_end])

        self.assertEqual(attr_value_e.Unquoted, v)
        self.assertEqual(5, attr_start)
        self.assertEqual(8, attr_end)

        n, name_start, name_end, _ = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Done)

    def testDoubleQuoted(self):
        # type: () -> None
        h = '<a x="f&">'
        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartTag)

        n, name_start, name_end, _ = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(3, name_start)
        self.assertEqual(4, name_end)

        v, attr_start, attr_end = attr_lx.ReadValue()

        log('v = %s', attr_value_str(v))
        log('val %r', h[attr_start:attr_end])

        # The value span excludes the quotes; pos ends up after the
        # closing quote
        self.assertEqual(attr_value_e.DoubleQuoted, v)
        self.assertEqual(6, attr_start)
        self.assertEqual(8, attr_end)
        self.assertEqual(9, attr_lx.pos)

        n, name_start, name_end, _ = attr_lx.ReadName()
        log('n = %r', attr_name_str(n))
        self.assertEqual(n, attr_name.Done)

    def testSingleQuoted(self):
        # type: () -> None
        h = "<a x='&f'>"
        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartTag)

        n, name_start, name_end, _ = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(3, name_start)
        self.assertEqual(4, name_end)

        v, attr_start, attr_end = attr_lx.ReadValue()

        log('v = %s', attr_value_str(v))
        log('val %r', h[attr_start:attr_end])

        # The value span excludes the quotes; pos ends up after the
        # closing quote
        self.assertEqual(attr_value_e.SingleQuoted, v)
        self.assertEqual(6, attr_start)
        self.assertEqual(8, attr_end)
        self.assertEqual(9, attr_lx.pos)

        n, name_start, name_end, _ = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Done)

    def testDoubleQuoted_Bad(self):
        # type: () -> None
        # The closing double quote is missing
        h = '<a x="foo>'
        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartTag)

        n, name_start, name_end, _ = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(3, name_start)
        self.assertEqual(4, name_end)

        with self.assertRaises(htm8.LexError) as cm:
            attr_lx.ReadValue()
        print(cm.exception)

    def testSingleQuoted_Bad(self):
        # type: () -> None
        # The closing single quote is missing
        h = "<a x='foo>"
        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartTag)

        n, name_start, name_end, _ = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(3, name_start)
        self.assertEqual(4, name_end)

        with self.assertRaises(htm8.LexError) as cm:
            attr_lx.ReadValue()
        print(cm.exception)
242
243
class AttrLexerWrapperTest(unittest.TestCase):
    """Tests for htm8.GetAttrRaw() and htm8.AllAttrsRaw() over an AttrLexer."""

    def testGetAttrRaw(self):
        # type: () -> None
        lex = _MakeAttrLexer(self, '<a>')
        self.assertEqual(None, htm8.GetAttrRaw(lex, 'oops'))

        # <a novalue> means lex.Get('novalue') == ''
        # https://developer.mozilla.org/en-US/docs/Web/API/Element/hasAttribute
        # We are not distinguishing <a novalue=""> from <a novalue> in this API
        lex = _MakeAttrLexer(self, '<a novalue>')
        self.assertEqual('', htm8.GetAttrRaw(lex, 'novalue'))

    def testGetAttrRaw2(self):
        # type: () -> None
        lex = _MakeAttrLexer(self, '<a href="double quoted">')

        log('*** OOPS')
        self.assertEqual(None, htm8.GetAttrRaw(lex, 'oops'))
        # Reset() is called between the two lookups on the same lexer
        lex.Reset()
        log('*** DOUBLE')
        self.assertEqual('double quoted', htm8.GetAttrRaw(lex, 'href'))

    def testGetAttrRaw3(self):
        # type: () -> None
        """Reverse order vs. testGetAttrRaw2"""
        lex = _MakeAttrLexer(self, '<a href="double quoted">')

        self.assertEqual('double quoted', htm8.GetAttrRaw(lex, 'href'))
        lex.Reset()
        self.assertEqual(None, htm8.GetAttrRaw(lex, 'oops'))

    def testGetAttrRaw4(self):
        # type: () -> None

        # The requested attribute is not the first one in the tag
        lex = _MakeAttrLexer(self, '<a href=foo class="bar">')
        self.assertEqual('bar', htm8.GetAttrRaw(lex, 'class'))

        lex = _MakeAttrLexer(self,
                             '<a href=foo class="bar" />',
                             expected_tag=h8_id.StartEndTag)
        self.assertEqual('bar', htm8.GetAttrRaw(lex, 'class'))

        # "Raw" value: the &amp; entity comes back as written
        lex = _MakeAttrLexer(self,
                             '<a href="?foo=1&amp;bar=2" />',
                             expected_tag=h8_id.StartEndTag)
        self.assertEqual('?foo=1&amp;bar=2', htm8.GetAttrRaw(lex, 'href'))

    def testAllAttrs(self):
        # type: () -> None
        """
        [('key', 'value')] for all
        """
        # closed
        lex = _MakeAttrLexer(self,
                             '<a href=foo class="bar" />',
                             expected_tag=h8_id.StartEndTag)
        self.assertEqual([('href', 'foo'), ('class', 'bar')],
                         htm8.AllAttrsRaw(lex))

        lex = _MakeAttrLexer(self,
                             '<a href="?foo=1&amp;bar=2" />',
                             expected_tag=h8_id.StartEndTag)
        self.assertEqual([('href', '?foo=1&amp;bar=2')], htm8.AllAttrsRaw(lex))

    def testEmptyMissingValues(self):
        # type: () -> None
        # equivalent to <button disabled="">
        lex = _MakeAttrLexer(self, '<button disabled>')
        all_attrs = htm8.AllAttrsRaw(lex)
        self.assertEqual([('disabled', '')], all_attrs)

        # TODO: restore this
        if 0:
            slices = lex.AllAttrsRawSlice()
            log('slices %s', slices)

        # Note 'empty= value': the unquoted value follows the space
        lex = _MakeAttrLexer(
            self, '''<p double="" single='' empty= value missing empty2=>''')
        all_attrs = htm8.AllAttrsRaw(lex)
        self.assertEqual([
            ('double', ''),
            ('single', ''),
            ('empty', 'value'),
            ('missing', ''),
            ('empty2', ''),
        ], all_attrs)
        log('all %s', all_attrs)

        # TODO: restore this
        if 0:
            slices = lex.AllAttrsRawSlice()
            log('slices %s', slices)

    def testInvalidTag(self):
        # type: () -> None
        # Both calls stay inside the block: LexError from either one counts
        with self.assertRaises(htm8.LexError) as cm:
            lex = _MakeAttrLexer(self, '<a foo=bar !></a>')
            htm8.AllAttrsRaw(lex)
        print(cm.exception)
350
351
def ValidTokenList(s, no_special_tags=False):
    # type: (str, bool) -> List[Tuple[h8_id_t, int]]
    """Lex all of `s` into a list of (token id, end position) pairs.

    Written as a plain loop, without iterators, so that it can be translated
    to C++ more easily.  The returned list ends with the EndOfStream token.

    Raises:
      htm8.LexError: when the lexer returns an Invalid token; the error is
        given the position where that token started.
    """
    lexer = htm8.Lexer(s, no_special_tags=no_special_tags)
    result = []
    tok_start = 0
    while True:
        tok_id, tok_end = lexer.Read()

        if tok_id == h8_id.Invalid:
            raise htm8.LexError('ValidTokenList() got invalid token', s,
                                tok_start)

        result.append((tok_id, tok_end))
        if tok_id == h8_id.EndOfStream:
            return result

        # The next token begins where this one ended
        tok_start = tok_end
369
370
def Lex(h, no_special_tags=False):
    # type: (str, bool) -> List[Tuple[h8_id_t, int]]
    """Lex `h` with ValidTokenList(), logging every token, and return the list.

    Args:
      h: source text to lex
      no_special_tags: passed through to ValidTokenList()

    Returns:
      The list from ValidTokenList(), unchanged: (token id, end position)
      pairs.
    """
    print(repr(h))
    tokens = ValidTokenList(h, no_special_tags=no_special_tags)

    # Each token covers h[start_pos:end_pos]; start_pos is the previous
    # token's end position
    start_pos = 0
    for tok_id, end_pos in tokens:
        frag = h[start_pos:end_pos]
        log('%d %s %r', end_pos, h8_id_str(tok_id), frag)
        start_pos = end_pos
    return tokens
381
382
class LexerTest(unittest.TestCase):
    """Tests for the token stream of htm8.Lexer, obtained through Lex().

    Expected values are (token id, end position) pairs, so whitespace inside
    the inputs is significant.  Multi-line inputs are therefore spelled as
    explicit string literals rather than indented triple-quoted strings.
    """

    # IndexLinker in devtools/make_help.py
    # <pre> sections in doc/html_help.py
    # TocExtractor in devtools/cmark.py

    def testPstrip(self):
        # type: () -> None
        """Remove anything like this.

        <p><pstrip> </pstrip></p>
        """
        pass

    def testCommentParse(self):
        # type: () -> None
        # Smoke test: Lex() raises LexError if the fixture yields an Invalid
        # token
        Lex(TEST_HTML)

    def testCommentParse2(self):
        # type: () -> None
        h = ('\n'
             '        hi <!-- line 1\n'
             '                line 2 --><br/>')
        tokens = Lex(h)

        self.assertEqual(
            [
                (h8_id.RawData, 12),
                (h8_id.Comment, 50),  # <!-- line 1 ... line 2 -->
                (h8_id.StartEndTag, 55),  # <br/>
                (h8_id.EndOfStream, 55),
            ],
            tokens)

    def testProcessingInstruction(self):
        # type: () -> None
        # <?xml ?> header
        h = 'hi <? err ?>'
        tokens = Lex(h)

        self.assertEqual(
            [
                (h8_id.RawData, 3),
                (h8_id.Processing, 12),  # <? err ?>
                (h8_id.EndOfStream, 12),
            ],
            tokens)

    def testScriptStyle(self):
        # type: () -> None
        h = ('\n'
             '        hi <script src=""> if (x < 1 && y > 2 ) '
             '{ console.log(""); }\n'
             '        </script>\n'
             '        ')
        tokens = Lex(h)

        expected = [
            (h8_id.RawData, 12),
            (h8_id.StartTag, 27),  # <script src="">
            (h8_id.HtmlCData, 78),  # JavaScript code is HTML CData
            (h8_id.EndTag, 87),  # </script>
            (h8_id.RawData, 96),  # trailing newline and indentation
            (h8_id.EndOfStream, 96),
        ]
        self.assertEqual(expected, tokens)

        # Test case matching
        tokens = Lex(h.replace('script', 'scrIPT'))
        self.assertEqual(expected, tokens)

    def testScriptStyleXml(self):
        # type: () -> None
        h = 'hi <script src=""> &lt; </script>'
        # XML mode
        tokens = Lex(h, no_special_tags=True)

        self.assertEqual(
            [
                (h8_id.RawData, 3),
                (h8_id.StartTag, 18),  # <script src="">
                (h8_id.RawData, 19),  # space
                (h8_id.CharEntity, 23),  # &lt;
                (h8_id.RawData, 24),  # space
                (h8_id.EndTag, 33),  # </script>
                (h8_id.EndOfStream, 33),
            ],
            tokens)

    def testCData(self):
        # type: () -> None

        # from
        # /home/andy/src/languages/Python-3.11.5/Lib/test/xmltestdata/c14n-20/inC14N4.xml
        h = '<compute><![CDATA[value>"0" && value<"10" ?"valid":"error"]]></compute>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 9),
            (h8_id.CData, 61),
            (h8_id.EndTag, 71),
            (h8_id.EndOfStream, 71),
        ], tokens)

    def testEntity(self):
        # type: () -> None

        # from
        # /home/andy/src/Python-3.12.4/Lib/test/xmltestdata/c14n-20/inC14N5.xml
        h = '&ent1;, &ent2;!'

        tokens = Lex(h)

        self.assertEqual([
            (h8_id.CharEntity, 6),
            (h8_id.RawData, 8),
            (h8_id.CharEntity, 14),
            (h8_id.RawData, 15),
            (h8_id.EndOfStream, 15),
        ], tokens)

    def testStartTag(self):
        # type: () -> None

        h = '<a>hi</a>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 3),
            (h8_id.RawData, 5),
            (h8_id.EndTag, 9),
            (h8_id.EndOfStream, 9),
        ], tokens)

        # Make sure we don't consume too much
        h = '<a><source>1.7</source></a>'

        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 3),
            (h8_id.StartTag, 11),
            (h8_id.RawData, 14),
            (h8_id.EndTag, 23),
            (h8_id.EndTag, 27),
            (h8_id.EndOfStream, 27),
        ], tokens)

        # TODO: add a multi-line case: <source>1.7</source> nested in
        # <configuration>, with newlines and indentation between the tags.
        # An earlier draft of it sat here behind a bare 'return' and never
        # ran; its expected token list was an unfinished placeholder.

    def testBad(self):
        # type: () -> None
        h = '&'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.BadAmpersand, 1),
            (h8_id.EndOfStream, 1),
        ], tokens)

        h = '>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.BadGreaterThan, 1),
            (h8_id.EndOfStream, 1),
        ], tokens)

    def testEndOfStream(self):
        # type: () -> None

        # NUL is end
        h = 'a\0b'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.RawData, 1),
            (h8_id.EndOfStream, 2),
        ], tokens)
576
577
if __name__ == '__main__':
    # Run all TestCase classes in this module when executed as a script.
    unittest.main()