OILS / data_lang / htm8_test.py View on Github | oils.pub

472 lines, 311 significant
1#!/usr/bin/env python2
2from __future__ import print_function
3
4from _devbuild.gen.htm8_asdl import (h8_id, h8_id_t, h8_id_str, attr_name,
5 attr_value_e, attr_value_str)
6
7import unittest
8import re
9
10from typing import List, Tuple, Any
11
12from data_lang import htm8
13from doctools.util import log
14
# Shared HTML fixture, read once when this module is imported.  The path is
# relative, so it is resolved against the current working directory
# (presumably the repo root -- confirm against how the tests are launched).
with open('data_lang/testdata/hello.htm8') as f:
    TEST_HTML = f.read()
17
18
19class RegexTest(unittest.TestCase):
20
21 def testDotAll(self):
22 # type: () -> None
23
24 # Note that $ matches end of line, not end of string
25 p1 = re.compile(r'.')
26 print(p1.match('\n'))
27
28 p2 = re.compile(r'.', re.DOTALL)
29 print(p2.match('\n'))
30
31 #p3 = re.compile(r'[.\n]', re.VERBOSE)
32 p3 = re.compile(r'[.\n]')
33 print(p3.match('\n'))
34
35 print('Negation')
36
37 p4 = re.compile(r'[^>]')
38 print(p4.match('\n'))
39
40 def testAttrRe(self):
41 # type: () -> None
42 _ATTR_RE = htm8._ATTR_RE
43 m = _ATTR_RE.match(' empty= val')
44 print(m.groups())
45
46
class FunctionsTest(unittest.TestCase):
    """Smoke tests for module-level helpers in htm8."""

    def testFindLineNum(self):
        # type: () -> None
        """Print the line number htm8._FindLineNum reports for a few offsets."""
        text = 'foo\n' * 3
        # The text is 12 characters long, so the last offset is out of bounds.
        offsets = [1, 5, 10, 50]
        for offset in offsets:
            print(htm8._FindLineNum(text, offset))
55
56
def _MakeAttrLexer(t, h, expected_tag=h8_id.StartTag):
    # type: (Any, str, h8_id_t) -> htm8.AttrLexer
    """Read the first token of h and return an AttrLexer set up for that tag.

    Args:
      t: the running test case; only its assertEqual() is used
      h: HTM8 source text whose first token must be a tag
      expected_tag: token id the first token is asserted to have

    Returns:
      An htm8.AttrLexer over h, initialized with the lexer's TagNamePos() and
      the end position of the first token.
    """
    lx = htm8.Lexer(h)

    # The first token must be the tag whose attributes the caller will read.
    tok_id, end_pos = lx.Read()
    t.assertEqual(expected_tag, tok_id)

    attr_lx = htm8.AttrLexer(h)
    attr_lx.Init(lx.TagNamePos(), end_pos)

    return attr_lx
69
70
class AttrLexerTest(unittest.TestCase):
    """Tests for htm8.AttrLexer: ReadName() followed by ReadValue() on one tag.

    The integer positions asserted below are offsets into the source string h.
    """

    def testNoAttrs(self):
        # type: () -> None
        """A tag without attributes reports Done, and ReadValue() then asserts."""

        # TODO: h8_id.StartTag and EndTag will expose the tag_name_pos - the
        # end of the tag name

        h = 'x <a>'
        lx = htm8.Lexer(h)

        # Skip raw data
        tok_id, end_pos = lx.Read()
        self.assertEqual(h8_id.RawData, tok_id)

        tok_id, end_pos = lx.Read()
        self.assertEqual(h8_id.StartTag, tok_id)

        attr_lx = htm8.AttrLexer(h)
        attr_lx.Init(lx.TagNamePos(), end_pos)

        # There are no attributes
        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Done)
        self.assertEqual(-1, name_start)
        self.assertEqual(-1, name_end)

        # ReadValue() is expected to raise AssertionError when no name was
        # read.  self.fail() sits in the else branch so that its own
        # AssertionError is not swallowed by the except clause.
        try:
            attr_lx.ReadValue()
        except AssertionError as e:
            print(e)
        else:
            self.fail('should have failed')

    def testInvalid(self):
        # type: () -> None
        """An invalid attribute name reports Invalid, and ReadValue() then asserts."""
        h = '<a !>'
        attr_lx = _MakeAttrLexer(self, h)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Invalid)
        self.assertEqual(-1, name_start)
        self.assertEqual(-1, name_end)

        # Same pattern as testNoAttrs: the failure must come from ReadValue().
        try:
            attr_lx.ReadValue()
        except AssertionError as e:
            print(e)
        else:
            self.fail('should have failed')

    def testEmpty(self):
        # type: () -> None
        """name= with nothing after the '=' yields an Empty value."""
        h = '<img src=/>'
        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartEndTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(5, name_start)
        self.assertEqual(8, name_end)
        self.assertEqual(False, attr_lx.next_value_is_missing)

        self.assertEqual(True, attr_lx.AttrNameEquals('src'))
        self.assertEqual(False, attr_lx.AttrNameEquals('srcz'))

        v, attr_start, attr_end = attr_lx.ReadValue()
        log('v = %s', attr_value_str(v))
        self.assertEqual(attr_value_e.Empty, v)
        self.assertEqual(-1, attr_start)
        self.assertEqual(-1, attr_end)

    def testMissing(self):
        # type: () -> None
        """A bare name without '=' yields a Missing value."""
        h = '<img SRC/>'
        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartEndTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(5, name_start)
        self.assertEqual(8, name_end)
        self.assertEqual(True, attr_lx.next_value_is_missing)

        # The source says SRC, so this also checks that the comparison
        # ignores case.
        self.assertEqual(True, attr_lx.AttrNameEquals('src'))
        self.assertEqual(False, attr_lx.AttrNameEquals('srcz'))

        v, attr_start, attr_end = attr_lx.ReadValue()
        self.assertEqual(attr_value_e.Missing, v)
        self.assertEqual(-1, attr_start)
        self.assertEqual(-1, attr_end)

    def testUnquoted(self):
        # type: () -> None
        """An unquoted value stops before the closing '/>'."""
        # CAREFUL: /> is a StartEndTag, and / is not part of unquoted value
        h = '<a x=foo/>'
        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartEndTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(3, name_start)
        self.assertEqual(4, name_end)

        v, attr_start, attr_end = attr_lx.ReadValue()

        log('v = %s', attr_value_str(v))
        log('unquoted val %r', h[attr_start:attr_end])

        self.assertEqual(attr_value_e.Unquoted, v)
        self.assertEqual(5, attr_start)
        self.assertEqual(8, attr_end)

    def testDoubleQuoted(self):
        # type: () -> None
        """A double-quoted value is reported without its quotes."""
        h = '<a x="f&">'
        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(3, name_start)
        self.assertEqual(4, name_end)

        v, attr_start, attr_end = attr_lx.ReadValue()

        log('v = %s', attr_value_str(v))
        log('val %r', h[attr_start:attr_end])

        self.assertEqual(attr_value_e.DoubleQuoted, v)
        self.assertEqual(6, attr_start)
        self.assertEqual(8, attr_end)

    def testSingleQuoted(self):
        # type: () -> None
        """A single-quoted value is reported without its quotes."""
        h = "<a x='&f'>"
        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(3, name_start)
        self.assertEqual(4, name_end)

        v, attr_start, attr_end = attr_lx.ReadValue()

        log('v = %s', attr_value_str(v))
        # Label matches testDoubleQuoted; this value is quoted, not unquoted.
        log('val %r', h[attr_start:attr_end])

        self.assertEqual(attr_value_e.SingleQuoted, v)
        self.assertEqual(6, attr_start)
        self.assertEqual(8, attr_end)

    def testDoubleQuoted_Bad(self):
        # type: () -> None
        """An unterminated double-quoted value raises htm8.LexError."""
        h = '<a x="foo>'
        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(3, name_start)
        self.assertEqual(4, name_end)

        try:
            attr_lx.ReadValue()
        except htm8.LexError as e:
            print(e)
        else:
            self.fail('Expected LexError')

    def testSingleQuoted_Bad(self):
        # type: () -> None
        """An unterminated single-quoted value raises htm8.LexError."""
        h = "<a x='foo>"
        attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartTag)

        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(3, name_start)
        self.assertEqual(4, name_end)

        try:
            attr_lx.ReadValue()
        except htm8.LexError as e:
            print(e)
        else:
            self.fail('Expected LexError')
244
245
def ValidTokenList(s, no_special_tags=False):
    # type: (str, bool) -> List[Tuple[h8_id_t, int]]
    """Lex all of s and return its (token id, end position) pairs.

    Deliberately a plain loop rather than an iterator, so that it can be more
    easily translated to C++.

    The returned list ends with the EndOfStream token.

    Raises:
      htm8.LexError: when the lexer returns an Invalid token; it is
        constructed with s and the position where that token started.
    """
    lexer = htm8.Lexer(s, no_special_tags=no_special_tags)
    result = []
    tok_start = 0
    finished = False
    while not finished:
        tok_id, tok_end = lexer.Read()
        # Appended before any check, so EndOfStream is part of the result.
        result.append((tok_id, tok_end))
        if tok_id == h8_id.EndOfStream:
            finished = True
        elif tok_id == h8_id.Invalid:
            raise htm8.LexError(s, tok_start)
        else:
            # The next token starts where this one ended.
            tok_start = tok_end
    return result
262
263
def Lex(h, no_special_tags=False):
    # type: (str, bool) -> List[Tuple[h8_id_t, int]]
    """Lex h, log each token next to the text it covers, and return the tokens.

    Args:
      h: source text to lex
      no_special_tags: passed through to ValidTokenList()

    Returns:
      The list produced by ValidTokenList(), unchanged: (token id, end
      position) pairs.
    """
    print(repr(h))
    tokens = ValidTokenList(h, no_special_tags=no_special_tags)

    # Tokens only carry their end position, so track the previous end to
    # recover the text each one spans.
    start_pos = 0
    for tok_id, end_pos in tokens:
        frag = h[start_pos:end_pos]
        log('%d %s %r', end_pos, h8_id_str(tok_id), frag)
        start_pos = end_pos
    return tokens
274
275
class LexerTest(unittest.TestCase):
    """Tests for htm8.Lexer, driven through the Lex() helper.

    Each expected entry is (token id, end position).  Positions are offsets
    into the source string, so whitespace inside the multi-line literals below
    is significant.
    """

    # IndexLinker in devtools/make_help.py
    #  <pre> sections in doc/html_help.py
    # TocExtractor in devtools/cmark.py

    def testPstrip(self):
        # type: () -> None
        """Remove anything like this.

        <p><pstrip> </pstrip></p>
        """
        pass

    def testCommentParse(self):
        # type: () -> None
        """Smoke test: lexing the shared fixture must not raise."""
        Lex(TEST_HTML)

    def testCommentParse2(self):
        # type: () -> None
        """A comment spanning two lines is a single Comment token."""
        # The leading whitespace is counted by the offsets below: 8 spaces
        # before 'hi' and 16 before 'line 2'.
        h = '''
        hi <!-- line 1
                line 2 --><br/>'''
        tokens = Lex(h)

        self.assertEqual(
            [
                (h8_id.RawData, 12),
                (h8_id.Comment, 50),  # <!-- line 1 ... line 2 -->
                (h8_id.StartEndTag, 55),  # <br/>
                (h8_id.EndOfStream, 55),
            ],
            tokens)

    def testProcessingInstruction(self):
        # type: () -> None
        """<? ... ?> is lexed as a Processing token."""
        # <?xml ?> header
        h = 'hi <? err ?>'
        tokens = Lex(h)

        self.assertEqual(
            [
                (h8_id.RawData, 3),
                (h8_id.Processing, 12),  # <? err ?>
                (h8_id.EndOfStream, 12),
            ],
            tokens)

    def testScriptStyle(self):
        # type: () -> None
        """The body of <script> is one HtmlCData token; the tag name is case-insensitive."""
        # Every line inside the literal is indented by 8 spaces, which the
        # offsets below count.
        h = '''
        hi <script src=""> if (x < 1 && y > 2 ) { console.log(""); }
        </script>
        '''
        tokens = Lex(h)

        expected = [
            (h8_id.RawData, 12),
            (h8_id.StartTag, 27),  # <script src="">
            (h8_id.HtmlCData, 78),  # JavaScript code is HTML CData
            (h8_id.EndTag, 87),  # </script>
            (h8_id.RawData, 96),  # \n and the trailing indentation
            (h8_id.EndOfStream, 96),
        ]
        self.assertEqual(expected, tokens)

        # Test case matching
        tokens = Lex(h.replace('script', 'scrIPT'))
        self.assertEqual(expected, tokens)

    def testScriptStyleXml(self):
        # type: () -> None
        """With no_special_tags=True, <script> content is lexed like any other content."""
        h = 'hi <script src=""> &lt; </script>'
        # XML mode
        tokens = Lex(h, no_special_tags=True)

        self.assertEqual(
            [
                (h8_id.RawData, 3),
                (h8_id.StartTag, 18),  # <script src="">
                (h8_id.RawData, 19),  # space
                (h8_id.CharEntity, 23),  # &lt;
                (h8_id.RawData, 24),  # space
                (h8_id.EndTag, 33),  # </script>
                (h8_id.EndOfStream, 33),
            ],
            tokens)

    def testCData(self):
        # type: () -> None
        """A <![CDATA[ ... ]]> section is a single CData token."""

        # from
        # /home/andy/src/languages/Python-3.11.5/Lib/test/xmltestdata/c14n-20/inC14N4.xml
        h = '<compute><![CDATA[value>"0" && value<"10" ?"valid":"error"]]></compute>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 9),
            (h8_id.CData, 61),
            (h8_id.EndTag, 71),
            (h8_id.EndOfStream, 71),
        ], tokens)

    def testEntity(self):
        # type: () -> None
        """Named entities are CharEntity tokens; the text between them is RawData."""

        # from
        # /home/andy/src/Python-3.12.4/Lib/test/xmltestdata/c14n-20/inC14N5.xml
        h = '&ent1;, &ent2;!'

        tokens = Lex(h)

        self.assertEqual([
            (h8_id.CharEntity, 6),
            (h8_id.RawData, 8),
            (h8_id.CharEntity, 14),
            (h8_id.RawData, 15),
            (h8_id.EndOfStream, 15),
        ], tokens)

    def testStartTag(self):
        # type: () -> None
        """Start and end tags, including nested ones, are lexed one tag at a time."""

        h = '<a>hi</a>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 3),
            (h8_id.RawData, 5),
            (h8_id.EndTag, 9),
            (h8_id.EndOfStream, 9),
        ], tokens)

        # Make sure we don't consume too much
        h = '<a><source>1.7</source></a>'

        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 3),
            (h8_id.StartTag, 11),
            (h8_id.RawData, 14),
            (h8_id.EndTag, 23),
            (h8_id.EndTag, 27),
            (h8_id.EndOfStream, 27),
        ], tokens)

        # TODO: the case below is disabled by this return and never runs.
        # Its expected list is unfinished (end positions must not decrease),
        # so complete it before removing the return.
        return

        h = '''
        <configuration>
          <source>1.7</source>
        </configuration>'''

        tokens = Lex(h)

        self.assertEqual([
            (h8_id.RawData, 9),
            (h8_id.StartTag, 24),
            (h8_id.RawData, 9),
            (h8_id.EndOfStream, 9),
        ], tokens)

    def testBad(self):
        # type: () -> None
        """A lone '&' or '>' gets its own Bad* token instead of a lex error."""
        h = '&'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.BadAmpersand, 1),
            (h8_id.EndOfStream, 1),
        ], tokens)

        h = '>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.BadGreaterThan, 1),
            (h8_id.EndOfStream, 1),
        ], tokens)

    def testEndOfStream(self):
        # type: () -> None
        """A NUL byte ends the stream; the text after it is not lexed."""

        # NUL is end
        h = 'a\0b'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.RawData, 1),
            (h8_id.EndOfStream, 2),
        ], tokens)
470
# Run the tests in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()