OILS / data_lang / htm8_test.py View on Github | oils.pub

501 lines, 336 significant
1#!/usr/bin/env python2
2from __future__ import print_function
3
4from _devbuild.gen.htm8_asdl import (h8_id, h8_id_t, h8_id_str, attr_name,
5 attr_value_e, attr_value_str)
6
7import unittest
8import re
9
10from typing import List, Tuple
11
12from data_lang import htm8
13from doctools.util import log
14
# Shared HTML fixture, read once when this module is imported.  The path is
# relative, so it is resolved against the current working directory.
with open('data_lang/testdata/hello.htm8') as f:
    TEST_HTML = f.read()
17
18
class RegexTest(unittest.TestCase):
    """Sanity checks for regex behavior, plus a smoke test of htm8._ATTR_RE."""

    def testDotAll(self):
        # type: () -> None
        """Pin down which patterns match a newline character."""

        # Without re.DOTALL, '.' matches any character except a newline
        p1 = re.compile(r'.')
        m1 = p1.match('\n')
        print(m1)
        self.assertEqual(None, m1)

        # With re.DOTALL, '.' matches a newline as well
        p2 = re.compile(r'.', re.DOTALL)
        m2 = p2.match('\n')
        print(m2)
        self.assertTrue(m2 is not None)
        self.assertEqual('\n', m2.group(0))

        # Inside a character class, '.' is a literal dot, so the newline has
        # to be listed explicitly
        p3 = re.compile(r'[.\n]')
        m3 = p3.match('\n')
        print(m3)
        self.assertTrue(m3 is not None)
        self.assertEqual('\n', m3.group(0))
        self.assertEqual(None, p3.match('x'))

        print('Negation')

        # A negated character class matches a newline without any flags
        p4 = re.compile(r'[^>]')
        m4 = p4.match('\n')
        print(m4)
        self.assertTrue(m4 is not None)
        self.assertEqual('\n', m4.group(0))

    def testAttrRe(self):
        # type: () -> None
        """Smoke test: htm8._ATTR_RE matches a name followed by '=' and a value."""
        _ATTR_RE = htm8._ATTR_RE
        m = _ATTR_RE.match(' empty= val')
        # Fail with a clear message instead of an AttributeError on None
        self.assertTrue(m is not None)
        print(m.groups())
45
46
class FunctionsTest(unittest.TestCase):
    """Smoke tests for module-level helper functions in htm8."""

    def testFindLineNum(self):
        # type: () -> None
        """Print the line number reported for a few positions in a 3-line string."""
        text = 'foo\n' * 3
        # The string is 12 characters long, so the last position is out of
        # bounds
        positions = [1, 5, 10, 50]
        for p in positions:
            print(htm8._FindLineNum(text, p))
55
56
class AttrLexerTest(unittest.TestCase):
    """Tests for htm8.AttrLexer: attribute names and raw values within a tag."""

    def _AttrLexerForNextTag(self, h, lx, expected_tok_id):
        # type: (str, htm8.Lexer, h8_id_t) -> htm8.AttrLexer
        """Read the next token from lx and return an AttrLexer set up for it.

        Asserts that the token ID equals expected_tok_id, then initializes a
        fresh AttrLexer over h with lx.TagNamePos() and the token's end
        position.
        """
        tok_id, end_pos = lx.Read()
        self.assertEqual(expected_tok_id, tok_id)

        attr_lx = htm8.AttrLexer(h)
        attr_lx.Init(lx.TagNamePos(), end_pos)
        return attr_lx

    def testNoAttrs(self):
        # type: () -> None

        # TODO: h8_id.StartTag and EndTag will expose the tag_name_pos - the
        # end of the tag name

        h = 'x <a>'
        lx = htm8.Lexer(h)

        # Skip raw data
        tok_id, _ = lx.Read()
        self.assertEqual(h8_id.RawData, tok_id)

        attr_lx = self._AttrLexerForNextTag(h, lx, h8_id.StartTag)

        # The tag has no attributes
        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Done)
        self.assertEqual(-1, name_start)
        self.assertEqual(-1, name_end)

        # With no attribute name read, ReadRawValue() is expected to raise
        # AssertionError
        try:
            attr_lx.ReadRawValue()
        except AssertionError as e:
            print(e)
        else:
            self.fail('should have failed')

    def testInvalid(self):
        # type: () -> None
        h = '<a !>'
        lx = htm8.Lexer(h)
        attr_lx = self._AttrLexerForNextTag(h, lx, h8_id.StartTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Invalid)
        self.assertEqual(-1, name_start)
        self.assertEqual(-1, name_end)

        # After an invalid name, ReadRawValue() is expected to raise
        # AssertionError
        try:
            attr_lx.ReadRawValue()
        except AssertionError as e:
            print(e)
        else:
            self.fail('should have failed')

    def testEmpty(self):
        # type: () -> None
        h = '<img src=/>'
        lx = htm8.Lexer(h)
        attr_lx = self._AttrLexerForNextTag(h, lx, h8_id.StartEndTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(5, name_start)
        self.assertEqual(8, name_end)
        self.assertEqual(False, attr_lx.next_value_is_missing)

        self.assertEqual(True, attr_lx.AttrNameEquals('src'))
        self.assertEqual(False, attr_lx.AttrNameEquals('srcz'))

        # '=' is present but nothing follows it: the value is Empty
        v, attr_start, attr_end = attr_lx.ReadRawValue()
        log('v = %s', attr_value_str(v))
        self.assertEqual(attr_value_e.Empty, v)
        self.assertEqual(-1, attr_start)
        self.assertEqual(-1, attr_end)

    def testMissing(self):
        # type: () -> None
        h = '<img SRC/>'
        lx = htm8.Lexer(h)
        attr_lx = self._AttrLexerForNextTag(h, lx, h8_id.StartEndTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(5, name_start)
        self.assertEqual(8, name_end)
        self.assertEqual(True, attr_lx.next_value_is_missing)

        # The name in the document is upper case, yet 'src' compares equal
        self.assertEqual(True, attr_lx.AttrNameEquals('src'))
        self.assertEqual(False, attr_lx.AttrNameEquals('srcz'))

        # There is no '=' at all: the value is Missing
        v, attr_start, attr_end = attr_lx.ReadRawValue()
        self.assertEqual(attr_value_e.Missing, v)
        self.assertEqual(-1, attr_start)
        self.assertEqual(-1, attr_end)

    def testUnquoted(self):
        # type: () -> None
        # CAREFUL: /> is a StartEndTag, and / is not part of unquoted value
        h = '<a x=foo/>'
        lx = htm8.Lexer(h)
        attr_lx = self._AttrLexerForNextTag(h, lx, h8_id.StartEndTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(3, name_start)
        self.assertEqual(4, name_end)

        v, attr_start, attr_end = attr_lx.ReadRawValue()

        log('v = %s', attr_value_str(v))
        log('unquoted val %r', h[attr_start:attr_end])

        self.assertEqual(attr_value_e.Unquoted, v)
        self.assertEqual(5, attr_start)
        self.assertEqual(8, attr_end)

    def testDoubleQuoted(self):
        # type: () -> None
        h = '<a x="f&">'
        lx = htm8.Lexer(h)
        attr_lx = self._AttrLexerForNextTag(h, lx, h8_id.StartTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(3, name_start)
        self.assertEqual(4, name_end)

        v, attr_start, attr_end = attr_lx.ReadRawValue()

        log('v = %s', attr_value_str(v))
        log('val %r', h[attr_start:attr_end])

        # The reported span excludes the surrounding quotes
        self.assertEqual(attr_value_e.DoubleQuoted, v)
        self.assertEqual(6, attr_start)
        self.assertEqual(8, attr_end)

    def testSingleQuoted(self):
        # type: () -> None
        h = "<a x='&f'>"
        lx = htm8.Lexer(h)
        attr_lx = self._AttrLexerForNextTag(h, lx, h8_id.StartTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(3, name_start)
        self.assertEqual(4, name_end)

        v, attr_start, attr_end = attr_lx.ReadRawValue()

        log('v = %s', attr_value_str(v))
        log('val %r', h[attr_start:attr_end])

        # The reported span excludes the surrounding quotes
        self.assertEqual(attr_value_e.SingleQuoted, v)
        self.assertEqual(6, attr_start)
        self.assertEqual(8, attr_end)

    def testDoubleQuoted_Bad(self):
        # type: () -> None
        h = '<a x="foo>'
        lx = htm8.Lexer(h)
        attr_lx = self._AttrLexerForNextTag(h, lx, h8_id.StartTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(3, name_start)
        self.assertEqual(4, name_end)

        # The closing double quote is absent, so a LexError is expected
        try:
            attr_lx.ReadRawValue()
        except htm8.LexError as e:
            print(e)
        else:
            self.fail('Expected LexError')

    def testSingleQuoted_Bad(self):
        # type: () -> None
        h = "<a x='foo>"
        lx = htm8.Lexer(h)
        attr_lx = self._AttrLexerForNextTag(h, lx, h8_id.StartTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(3, name_start)
        self.assertEqual(4, name_end)

        # The closing single quote is absent, so a LexError is expected
        try:
            attr_lx.ReadRawValue()
        except htm8.LexError as e:
            print(e)
        else:
            self.fail('Expected LexError')
273
274
def ValidTokenList(s, no_special_tags=False):
    # type: (str, bool) -> List[Tuple[h8_id_t, int]]
    """Lex all of s eagerly and return the (token ID, end position) pairs.

    This is a plain loop rather than an iterator so that it can be more easily
    translated to C++.  The returned list ends with the EndOfStream token.

    Raises:
      htm8.LexError: with the start position of the offending token, when the
        lexer returns h8_id.Invalid.
    """
    lexer = htm8.Lexer(s, no_special_tags=no_special_tags)
    result = []
    tok_start = 0
    while True:
        tok_id, tok_end = lexer.Read()
        if tok_id == h8_id.Invalid:
            raise htm8.LexError(s, tok_start)

        result.append((tok_id, tok_end))
        if tok_id == h8_id.EndOfStream:
            return result

        # The next token begins where this one ended
        tok_start = tok_end
291
292
def Lex(h, no_special_tags=False):
    # type: (str, bool) -> List[Tuple[h8_id_t, int]]
    """Lex h with ValidTokenList(), log every token, and return the tokens.

    The input is printed with repr() first.  Then, for each token, its end
    position, its ID name and the source fragment it covers are logged.

    Args:
      h: the document to lex
      no_special_tags: passed through to ValidTokenList()

    Returns:
      The list from ValidTokenList(), unchanged.

    Raises:
      htm8.LexError: propagated from ValidTokenList() on an invalid token.
    """
    print(repr(h))
    tokens = ValidTokenList(h, no_special_tags=no_special_tags)

    start_pos = 0
    for tok_id, end_pos in tokens:
        # A token covers the text between the previous end position and its own
        log('%d %s %r', end_pos, h8_id_str(tok_id), h[start_pos:end_pos])
        start_pos = end_pos
    return tokens
303
304
class LexerTest(unittest.TestCase):
    """Token-level tests for htm8.Lexer, driven through Lex().

    Every expected entry is a (token ID, end position) pair, where the end
    position is an offset into the input string.
    """

    # IndexLinker in devtools/make_help.py
    # <pre> sections in doc/html_help.py
    # TocExtractor in devtools/cmark.py

    def testPstrip(self):
        # type: () -> None
        """Remove anything like this.

        <p><pstrip> </pstrip></p>
        """
        pass

    def testCommentParse(self):
        # type: () -> None
        # Smoke test: the fixture must lex without raising LexError
        tokens = Lex(TEST_HTML)
        self.assertEqual(h8_id.EndOfStream, tokens[-1][0])

    def testCommentParse2(self):
        # type: () -> None
        # Leading whitespace is spelled out because the end positions below
        # count it
        h = ('\n'
             '        hi <!-- line 1\n'
             '                line 2 --><br/>')
        tokens = Lex(h)

        self.assertEqual(
            [
                (h8_id.RawData, 12),  # newline, indent and 'hi '
                (h8_id.Comment, 50),  # <!-- line 1 ... line 2 -->
                (h8_id.StartEndTag, 55),  # <br/>
                (h8_id.EndOfStream, 55),
            ],
            tokens)

    def testProcessingInstruction(self):
        # type: () -> None
        # <?xml ?> header
        h = 'hi <? err ?>'
        tokens = Lex(h)

        self.assertEqual(
            [
                (h8_id.RawData, 3),  # 'hi '
                (h8_id.Processing, 12),  # <? err ?>
                (h8_id.EndOfStream, 12),
            ],
            tokens)

    def testScriptStyle(self):
        # type: () -> None
        # Leading whitespace is spelled out because the end positions below
        # count it
        h = ('\n'
             '        hi <script src=""> if (x < 1 && y > 2 ) { console.log(""); }\n'
             '        </script>\n'
             '        ')
        tokens = Lex(h)

        expected = [
            (h8_id.RawData, 12),  # newline, indent and 'hi '
            (h8_id.StartTag, 27),  # <script src="">
            (h8_id.HtmlCData, 78),  # JavaScript code is HTML CData
            (h8_id.EndTag, 87),  # </script>
            (h8_id.RawData, 96),  # trailing newline and indent
            (h8_id.EndOfStream, 96),
        ]
        self.assertEqual(expected, tokens)

        # Test case matching
        tokens = Lex(h.replace('script', 'scrIPT'))
        self.assertEqual(expected, tokens)

    def testScriptStyleXml(self):
        # type: () -> None
        h = 'hi <script src=""> &lt; </script>'
        # XML mode
        tokens = Lex(h, no_special_tags=True)

        self.assertEqual(
            [
                (h8_id.RawData, 3),  # 'hi '
                (h8_id.StartTag, 18),  # <script src="">
                (h8_id.RawData, 19),  # space
                (h8_id.CharEntity, 23),  # &lt;
                (h8_id.RawData, 24),  # space
                (h8_id.EndTag, 33),  # </script>
                (h8_id.EndOfStream, 33),
            ],
            tokens)

    def testCData(self):
        # type: () -> None

        # from
        # /home/andy/src/languages/Python-3.11.5/Lib/test/xmltestdata/c14n-20/inC14N4.xml
        h = '<compute><![CDATA[value>"0" && value<"10" ?"valid":"error"]]></compute>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 9),  # <compute>
            (h8_id.CData, 61),  # <![CDATA[ ... ]]>
            (h8_id.EndTag, 71),  # </compute>
            (h8_id.EndOfStream, 71),
        ], tokens)

    def testEntity(self):
        # type: () -> None

        # from
        # /home/andy/src/Python-3.12.4/Lib/test/xmltestdata/c14n-20/inC14N5.xml
        h = '&ent1;, &ent2;!'

        tokens = Lex(h)

        self.assertEqual([
            (h8_id.CharEntity, 6),  # &ent1;
            (h8_id.RawData, 8),  # ', '
            (h8_id.CharEntity, 14),  # &ent2;
            (h8_id.RawData, 15),  # '!'
            (h8_id.EndOfStream, 15),
        ], tokens)

    def testStartTag(self):
        # type: () -> None

        h = '<a>hi</a>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 3),
            (h8_id.RawData, 5),
            (h8_id.EndTag, 9),
            (h8_id.EndOfStream, 9),
        ], tokens)

        # Make sure we don't consume too much
        h = '<a><source>1.7</source></a>'

        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 3),
            (h8_id.StartTag, 11),
            (h8_id.RawData, 14),
            (h8_id.EndTag, 23),
            (h8_id.EndTag, 27),
            (h8_id.EndOfStream, 27),
        ], tokens)

        # TODO: also cover nested tags split across lines, e.g. <configuration>
        # containing <source>1.7</source> on its own line.  An earlier draft of
        # that case sat unreachable after a bare `return`, and its expected
        # token list was unfinished, so it has been dropped.

    def testBad(self):
        # type: () -> None
        # A lone '&' gets its own token ID rather than being an error
        h = '&'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.BadAmpersand, 1),
            (h8_id.EndOfStream, 1),
        ], tokens)

        # Likewise for a lone '>'
        h = '>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.BadGreaterThan, 1),
            (h8_id.EndOfStream, 1),
        ], tokens)

    def testEndOfStream(self):
        # type: () -> None

        # NUL is end: the 'b' after it is never tokenized
        h = 'a\0b'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.RawData, 1),
            (h8_id.EndOfStream, 2),
        ], tokens)
499
# Run every TestCase in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()