1 | #!/usr/bin/env python2
|
2 | from __future__ import print_function
|
3 |
|
4 | from _devbuild.gen.htm8_asdl import (h8_id, h8_id_t, h8_id_str, attr_name,
|
5 | attr_name_str, attr_value_e,
|
6 | attr_value_str)
|
7 |
|
8 | import unittest
|
9 |
|
10 | from typing import List, Tuple, Any
|
11 |
|
12 | from data_lang import htm8
|
13 | from doctools.util import log
|
14 |
|
# Shared fixture: the sample HTM8 document is read once, when this module is
# imported.  The path is relative, so it is resolved against the current
# working directory of the test run.
with open('data_lang/testdata/hello.htm8') as f:
    TEST_HTML = f.read()
|
17 |
|
18 |
|
class FunctionsTest(unittest.TestCase):
    """Smoke tests for module-level helper functions in htm8."""

    def testFindLineNum(self):
        # type: () -> None
        # Nothing is asserted here: the results are only printed, so this
        # checks that htm8._FindLineNum() does not raise for these offsets.
        text = 'foo\n' * 3

        # The text has 12 characters, so the last offset is out of bounds
        offsets = [1, 5, 10, 50]
        for offset in offsets:
            print(htm8._FindLineNum(text, offset))
|
27 |
|
28 |
|
def _MakeAttrLexer(t, h, expected_tag=h8_id.StartTag):
    # type: (Any, str, h8_id_t) -> htm8.AttrLexer
    """Return an AttrLexer initialized on the first token of `h`.

    Args:
      t: the running test case; only its assertEqual() method is used
      h: document text whose first token must be `expected_tag`
      expected_tag: token ID that the first token is asserted to have

    Returns:
      An htm8.AttrLexer over `h`, on which Init() has been called with the
      token ID, the lexer's TagNamePos() and the end position of that token.
    """
    lx = htm8.Lexer(h)

    # Only the first token is read; anything after it is ignored
    tok_id, end_pos = lx.Read()
    t.assertEqual(expected_tag, tok_id)

    attr_lx = htm8.AttrLexer(h)
    attr_lx.Init(tok_id, lx.TagNamePos(), end_pos)

    return attr_lx
|
41 |
|
42 |
|
class AttrLexerTest(unittest.TestCase):
    """Drives htm8.AttrLexer by hand with ReadName() and ReadValue()."""

    def _ExpectRaise(self, exc_type, func, fail_msg):
        # type: (Any, Any, str) -> None
        # Shared by the negative cases: func() must raise exc_type, which is
        # then printed.  If nothing is raised, the test fails with fail_msg.
        try:
            func()
        except exc_type as e:
            print(e)
        else:
            self.fail(fail_msg)

    def testNoAttrs(self):
        # type: () -> None

        # TODO: h8_id.StartTag and EndTag will expose the tag_name_pos - the
        # end of the tag name

        doc = 'x <a>'
        lexer = htm8.Lexer(doc)

        # The first token is the raw data in front of the tag
        tok_id, end_pos = lexer.Read()
        self.assertEqual(h8_id.RawData, tok_id)

        tok_id, end_pos = lexer.Read()
        self.assertEqual(h8_id.StartTag, tok_id)

        attr_lx = htm8.AttrLexer(doc)
        attr_lx.Init(tok_id, lexer.TagNamePos(), end_pos)

        # <a> has no attributes, so the very first read reports Done
        status, start, end, _ = attr_lx.ReadName()
        self.assertEqual(status, attr_name.Done)
        self.assertEqual(-1, start)
        self.assertEqual(-1, end)

        # Reading anything more is expected to trip an assertion
        self._ExpectRaise(AssertionError, attr_lx.ReadValue,
                          'should have failed')
        self._ExpectRaise(AssertionError, attr_lx.ReadName,
                          'should have failed')

    def testInvalid(self):
        doc = '<a !>'
        attr_lx = _MakeAttrLexer(self, doc)

        status, start, end, _ = attr_lx.ReadName()
        self.assertEqual(status, attr_name.Invalid)
        self.assertEqual(-1, start)
        self.assertEqual(-1, end)

        # There is no value to read after an Invalid name
        self._ExpectRaise(AssertionError, attr_lx.ReadValue,
                          'should have failed')

    def testEmpty(self):
        doc = '<img src=>'
        attr_lx = _MakeAttrLexer(self, doc, expected_tag=h8_id.StartTag)

        status, start, end, _ = attr_lx.ReadName()
        self.assertEqual(status, attr_name.Ok)
        self.assertEqual(5, start)
        self.assertEqual(8, end)
        # 'src=' has an equals sign, so the value is Empty rather than Missing
        self.assertEqual(False, attr_lx.next_value_is_missing)

        self.assertEqual(True, attr_lx.AttrNameEquals('src'))
        self.assertEqual(False, attr_lx.AttrNameEquals('srcz'))

        val_kind, val_start, val_end = attr_lx.ReadValue()
        log('v = %s', attr_value_str(val_kind))
        self.assertEqual(attr_value_e.Empty, val_kind)
        self.assertEqual(-1, val_start)
        self.assertEqual(-1, val_end)

        status, start, end, _ = attr_lx.ReadName()
        self.assertEqual(status, attr_name.Done)

    def testMissing(self):
        doc = '<img SRC/>'
        attr_lx = _MakeAttrLexer(self, doc, expected_tag=h8_id.StartEndTag)

        status, start, end, _ = attr_lx.ReadName()
        self.assertEqual(status, attr_name.Ok)
        self.assertEqual(5, start)
        self.assertEqual(8, end)
        self.assertEqual(True, attr_lx.next_value_is_missing)

        # The name in the document is upper case; 'src' is expected to match
        self.assertEqual(True, attr_lx.AttrNameEquals('src'))
        self.assertEqual(False, attr_lx.AttrNameEquals('srcz'))

        val_kind, val_start, val_end = attr_lx.ReadValue()
        self.assertEqual(attr_value_e.Missing, val_kind)
        self.assertEqual(-1, val_start)
        self.assertEqual(-1, val_end)

        status, start, end, _ = attr_lx.ReadName()
        self.assertEqual(status, attr_name.Done)

    def testUnquoted(self):
        # CAREFUL: /> is a StartEndTag, and / is not part of unquoted value
        doc = '<a x=foo />'
        attr_lx = _MakeAttrLexer(self, doc, expected_tag=h8_id.StartEndTag)

        status, start, end, _ = attr_lx.ReadName()
        self.assertEqual(status, attr_name.Ok)
        self.assertEqual(3, start)
        self.assertEqual(4, end)

        val_kind, val_start, val_end = attr_lx.ReadValue()

        log('v = %s', attr_value_str(val_kind))
        log('unquoted val %r', doc[val_start:val_end])

        self.assertEqual(attr_value_e.Unquoted, val_kind)
        self.assertEqual(5, val_start)
        self.assertEqual(8, val_end)

        status, start, end, _ = attr_lx.ReadName()
        self.assertEqual(status, attr_name.Done)

    def testDoubleQuoted(self):
        doc = '<a x="f&">'
        attr_lx = _MakeAttrLexer(self, doc, expected_tag=h8_id.StartTag)

        status, start, end, _ = attr_lx.ReadName()
        self.assertEqual(status, attr_name.Ok)
        self.assertEqual(3, start)
        self.assertEqual(4, end)

        val_kind, val_start, val_end = attr_lx.ReadValue()

        log('v = %s', attr_value_str(val_kind))
        log('val %r', doc[val_start:val_end])

        # The span excludes the quotes; pos ends up after the closing quote
        self.assertEqual(attr_value_e.DoubleQuoted, val_kind)
        self.assertEqual(6, val_start)
        self.assertEqual(8, val_end)
        self.assertEqual(9, attr_lx.pos)

        status, start, end, _ = attr_lx.ReadName()
        log('n = %r', attr_name_str(status))
        self.assertEqual(status, attr_name.Done)

    def testSingleQuoted(self):
        doc = "<a x='&f'>"
        attr_lx = _MakeAttrLexer(self, doc, expected_tag=h8_id.StartTag)

        status, start, end, _ = attr_lx.ReadName()
        self.assertEqual(status, attr_name.Ok)
        self.assertEqual(3, start)
        self.assertEqual(4, end)

        val_kind, val_start, val_end = attr_lx.ReadValue()

        log('v = %s', attr_value_str(val_kind))
        log('unquoted val %r', doc[val_start:val_end])

        # The span excludes the quotes; pos ends up after the closing quote
        self.assertEqual(attr_value_e.SingleQuoted, val_kind)
        self.assertEqual(6, val_start)
        self.assertEqual(8, val_end)
        self.assertEqual(9, attr_lx.pos)

        status, start, end, _ = attr_lx.ReadName()
        self.assertEqual(status, attr_name.Done)

    def testDoubleQuoted_Bad(self):
        doc = '<a x="foo>'
        attr_lx = _MakeAttrLexer(self, doc, expected_tag=h8_id.StartTag)

        status, start, end, _ = attr_lx.ReadName()
        self.assertEqual(status, attr_name.Ok)
        self.assertEqual(3, start)
        self.assertEqual(4, end)

        # The closing double quote is missing
        self._ExpectRaise(htm8.LexError, attr_lx.ReadValue,
                          'Expected LexError')

    def testSingleQuoted_Bad(self):
        doc = "<a x='foo>"
        attr_lx = _MakeAttrLexer(self, doc, expected_tag=h8_id.StartTag)

        status, start, end, _ = attr_lx.ReadName()
        self.assertEqual(status, attr_name.Ok)
        self.assertEqual(3, start)
        self.assertEqual(4, end)

        # The closing single quote is missing
        self._ExpectRaise(htm8.LexError, attr_lx.ReadValue,
                          'Expected LexError')
|
242 |
|
243 |
|
class AttrLexerWrapperTest(unittest.TestCase):
    """Covers htm8.GetAttrRaw() and htm8.AllAttrsRaw(), built on AttrLexer."""

    def testGetAttrRaw(self):
        # type: () -> None
        attr_lx = _MakeAttrLexer(self, '<a>')
        self.assertEqual(None, htm8.GetAttrRaw(attr_lx, 'oops'))

        # An attribute without a value, as in <a novalue>, reads back as ''.
        # https://developer.mozilla.org/en-US/docs/Web/API/Element/hasAttribute
        # This API does not distinguish <a novalue=""> from <a novalue>.
        attr_lx = _MakeAttrLexer(self, '<a novalue>')
        self.assertEqual('', htm8.GetAttrRaw(attr_lx, 'novalue'))

    def testGetAttrRaw2(self):
        attr_lx = _MakeAttrLexer(self, '<a href="double quoted">')

        # Failed lookup first, then Reset() and a successful one
        log('*** OOPS')
        self.assertEqual(None, htm8.GetAttrRaw(attr_lx, 'oops'))
        attr_lx.Reset()
        log('*** DOUBLE')
        self.assertEqual('double quoted', htm8.GetAttrRaw(attr_lx, 'href'))

    def testGetAttrRaw3(self):
        """Reverse order vs. testGetAttrRaw2"""
        attr_lx = _MakeAttrLexer(self, '<a href="double quoted">')

        self.assertEqual('double quoted', htm8.GetAttrRaw(attr_lx, 'href'))
        attr_lx.Reset()
        self.assertEqual(None, htm8.GetAttrRaw(attr_lx, 'oops'))

    def testGetAttrRaw4(self):
        # The attribute being looked up comes second, after an unquoted one
        attr_lx = _MakeAttrLexer(self, '<a href=foo class="bar">')
        self.assertEqual('bar', htm8.GetAttrRaw(attr_lx, 'class'))

        # Same attributes in a self-closing tag
        attr_lx = _MakeAttrLexer(self,
                                 '<a href=foo class="bar" />',
                                 expected_tag=h8_id.StartEndTag)
        self.assertEqual('bar', htm8.GetAttrRaw(attr_lx, 'class'))

        # The raw value is returned exactly as written in the document
        attr_lx = _MakeAttrLexer(self,
                                 '<a href="?foo=1&bar=2" />',
                                 expected_tag=h8_id.StartEndTag)
        self.assertEqual('?foo=1&bar=2', htm8.GetAttrRaw(attr_lx, 'href'))

    def testAllAttrs(self):
        # type: () -> None
        """
        [('key', 'value')] for all
        """
        # Self-closing tag with two attributes
        attr_lx = _MakeAttrLexer(self,
                                 '<a href=foo class="bar" />',
                                 expected_tag=h8_id.StartEndTag)
        self.assertEqual([('href', 'foo'), ('class', 'bar')],
                         htm8.AllAttrsRaw(attr_lx))

        attr_lx = _MakeAttrLexer(self,
                                 '<a href="?foo=1&bar=2" />',
                                 expected_tag=h8_id.StartEndTag)
        self.assertEqual([('href', '?foo=1&bar=2')],
                         htm8.AllAttrsRaw(attr_lx))

    def testEmptyMissingValues(self):
        # type: () -> None
        # <button disabled> is equivalent to <button disabled="">
        attr_lx = _MakeAttrLexer(self, '<button disabled>')
        pairs = htm8.AllAttrsRaw(attr_lx)
        self.assertEqual([('disabled', '')], pairs)

        # TODO: restore this
        if 0:
            slices = attr_lx.AllAttrsRawSlice()
            log('slices %s', slices)

        # Note that 'empty= value' yields the pair ('empty', 'value')
        attr_lx = _MakeAttrLexer(
            self, '''<p double="" single='' empty= value missing empty2=>''')
        pairs = htm8.AllAttrsRaw(attr_lx)
        expected = [
            ('double', ''),
            ('single', ''),
            ('empty', 'value'),
            ('missing', ''),
            ('empty2', ''),
        ]
        self.assertEqual(expected, pairs)
        # TODO: should have
        log('all %s', pairs)

        if 0:
            slices = attr_lx.AllAttrsRawSlice()
            log('slices %s', slices)

    def testInvalidTag(self):
        # type: () -> None
        # A LexError is expected for the tag containing a stray '!'
        try:
            attr_lx = _MakeAttrLexer(self, '<a foo=bar !></a>')
            htm8.AllAttrsRaw(attr_lx)
        except htm8.LexError as err:
            print(err)
        else:
            self.fail('Expected LexError')
|
350 |
|
351 |
|
def ValidTokenList(s, no_special_tags=False):
    # type: (str, bool) -> List[Tuple[h8_id_t, int]]
    """Lex all of `s` and return its tokens as (token ID, end position) pairs.

    This is a wrapper that can be more easily translated to C++, because it
    doesn't use iterators.

    The last pair in the returned list is the EndOfStream token.

    Raises:
      htm8.LexError: if the lexer produces an Invalid token.  The error is
        given the position where that token started.
    """
    lexer = htm8.Lexer(s, no_special_tags=no_special_tags)
    result = []
    tok_start = 0  # end position of the previous token

    while True:
        tok_id, end_pos = lexer.Read()
        result.append((tok_id, end_pos))

        if tok_id == h8_id.EndOfStream:
            return result

        if tok_id == h8_id.Invalid:
            raise htm8.LexError('ValidTokenList() got invalid token', s,
                                tok_start)

        tok_start = end_pos
|
369 |
|
370 |
|
def Lex(h, no_special_tags=False):
    # type: (str, bool) -> List[Tuple[int, int]]
    """Tokenize `h` with ValidTokenList(), showing each token along the way.

    The repr() of the input is printed first.  Then one line is logged per
    token: its end position, the name of its ID, and the text it covers.

    Returns:
      The token list from ValidTokenList(), unchanged.
    """
    print(repr(h))
    tokens = ValidTokenList(h, no_special_tags=no_special_tags)

    frag_start = 0
    for tok_id, frag_end in tokens:
        log('%d %s %r', frag_end, h8_id_str(tok_id), h[frag_start:frag_end])
        frag_start = frag_end  # the next token begins where this one ended

    return tokens
|
381 |
|
382 |
|
class LexerTest(unittest.TestCase):
    """Checks the (token ID, end position) pairs that Lex() returns.

    Every expected position is the offset just past the token, so the input
    literals must match the expectations character for character.  Literals
    that span several lines spell out their leading whitespace explicitly,
    which keeps the offsets independent of how this file is indented.
    """

    # IndexLinker in devtools/make_help.py
    # <pre> sections in doc/html_help.py
    # TocExtractor in devtools/cmark.py

    def testPstrip(self):
        # type: () -> None
        """Remove anything like this.

        <p><pstrip> </pstrip></p>
        """
        pass

    def testCommentParse(self):
        # type: () -> None
        # Smoke test: nothing is asserted about the tokens, but Lex() raises
        # if the sample document contains an Invalid token.
        Lex(TEST_HTML)

    def testCommentParse2(self):
        # type: () -> None
        indent = ' ' * 8
        # The comment spans two lines; its second line is indented twice
        h = ('\n' + indent + 'hi <!-- line 1\n' + indent * 2 +
             'line 2 --><br/>')
        tokens = Lex(h)

        self.assertEqual(
            [
                (h8_id.RawData, 12),  # newline, indent, 'hi '
                (h8_id.Comment, 50),  # <!-- line 1 ... line 2 -->
                (h8_id.StartEndTag, 55),  # <br/>
                (h8_id.EndOfStream, 55),
            ],
            tokens)

    def testProcessingInstruction(self):
        # type: () -> None
        # Same syntax as the <?xml ?> header
        h = 'hi <? err ?>'
        tokens = Lex(h)

        self.assertEqual(
            [
                (h8_id.RawData, 3),
                (h8_id.Processing, 12),  # <? err ?>
                (h8_id.EndOfStream, 12),
            ],
            tokens)

    def testScriptStyle(self):
        # type: () -> None
        indent = ' ' * 8
        h = ('\n' + indent +
             'hi <script src=""> if (x < 1 && y > 2 ) { console.log(""); }\n' +
             indent + '</script>\n' + indent)
        tokens = Lex(h)

        expected = [
            (h8_id.RawData, 12),  # newline, indent, 'hi '
            (h8_id.StartTag, 27),  # <script src="">
            (h8_id.HtmlCData, 78),  # JavaScript code is HTML CData
            (h8_id.EndTag, 87),  # </script>
            (h8_id.RawData, 96),  # newline and trailing indent
            (h8_id.EndOfStream, 96),
        ]
        self.assertEqual(expected, tokens)

        # Test case matching: the tag name is recognized in mixed case too
        tokens = Lex(h.replace('script', 'scrIPT'))
        self.assertEqual(expected, tokens)

    def testScriptStyleXml(self):
        # type: () -> None
        # The entity takes 4 characters, which is what offsets 19..23 require
        h = 'hi <script src=""> &lt; </script>'
        # XML mode: <script> gets no special treatment, so its content is
        # lexed like any other text
        tokens = Lex(h, no_special_tags=True)

        self.assertEqual(
            [
                (h8_id.RawData, 3),
                (h8_id.StartTag, 18),  # <script src="">
                (h8_id.RawData, 19),  # space
                (h8_id.CharEntity, 23),  # &lt;
                (h8_id.RawData, 24),  # space
                (h8_id.EndTag, 33),  # </script>
                (h8_id.EndOfStream, 33),
            ],
            tokens)

    def testCData(self):
        # type: () -> None

        # from
        # /home/andy/src/languages/Python-3.11.5/Lib/test/xmltestdata/c14n-20/inC14N4.xml
        h = '<compute><![CDATA[value>"0" && value<"10" ?"valid":"error"]]></compute>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 9),
            (h8_id.CData, 61),
            (h8_id.EndTag, 71),
            (h8_id.EndOfStream, 71),
        ], tokens)

    def testEntity(self):
        # type: () -> None

        # from
        # /home/andy/src/Python-3.12.4/Lib/test/xmltestdata/c14n-20/inC14N5.xml
        h = '&ent1;, &ent2;!'

        tokens = Lex(h)

        self.assertEqual([
            (h8_id.CharEntity, 6),
            (h8_id.RawData, 8),
            (h8_id.CharEntity, 14),
            (h8_id.RawData, 15),
            (h8_id.EndOfStream, 15),
        ], tokens)

    def testStartTag(self):
        # type: () -> None

        h = '<a>hi</a>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 3),
            (h8_id.RawData, 5),
            (h8_id.EndTag, 9),
            (h8_id.EndOfStream, 9),
        ], tokens)

        # Make sure we don't consume too much
        h = '<a><source>1.7</source></a>'

        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 3),
            (h8_id.StartTag, 11),
            (h8_id.RawData, 14),
            (h8_id.EndTag, 23),
            (h8_id.EndTag, 27),
            (h8_id.EndOfStream, 27),
        ], tokens)

        # TODO: finish and enable this case.  It was unreachable before as
        # well (it followed a bare return), and the expected list is still a
        # placeholder.
        if 0:
            indent = ' ' * 8
            # NOTE(review): the indentation of the <source> line is a guess;
            # confirm it when the expectations are filled in.
            h = ('\n' + indent + '<configuration>\n' + indent +
                 '  <source>1.7</source>\n' + indent + '</configuration>')

            tokens = Lex(h)

            self.assertEqual([
                (h8_id.RawData, 9),
                (h8_id.StartTag, 24),
                (h8_id.RawData, 9),
                (h8_id.EndOfStream, 9),
            ], tokens)

    def testBad(self):
        # type: () -> None
        # A lone '&' or '>' gets its own token ID instead of being an error
        h = '&'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.BadAmpersand, 1),
            (h8_id.EndOfStream, 1),
        ], tokens)

        h = '>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.BadGreaterThan, 1),
            (h8_id.EndOfStream, 1),
        ], tokens)

    def testEndOfStream(self):
        # type: () -> None

        # NUL is end: the 'b' after it is never lexed
        h = 'a\0b'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.RawData, 1),
            (h8_id.EndOfStream, 2),
        ], tokens)
|
576 |
|
577 |
|
# Run every TestCase in this module when the file is executed as a script
if __name__ == '__main__':
    unittest.main()
|