OILS / data_lang / htm8_test.py View on Github | oils.pub

413 lines, 268 significant
1#!/usr/bin/env python2
2from __future__ import print_function
3
4from _devbuild.gen.htm8_asdl import (h8_id, h8_id_t, h8_id_str, attr_name,
5 attr_value_e, attr_value_str)
6
7import unittest
8import re
9
10from typing import List, Tuple
11
12from data_lang import htm8
13from doctools.util import log
14
# Sample document used by LexerTest.testCommentParse.  It is read once at
# import time; the relative path is resolved against the current working
# directory.
with open('data_lang/testdata/hello.htm8') as f:
    TEST_HTML = f.read()
17
18
19class RegexTest(unittest.TestCase):
20
21 def testDotAll(self):
22 # type: () -> None
23
24 # Note that $ matches end of line, not end of string
25 p1 = re.compile(r'.')
26 print(p1.match('\n'))
27
28 p2 = re.compile(r'.', re.DOTALL)
29 print(p2.match('\n'))
30
31 #p3 = re.compile(r'[.\n]', re.VERBOSE)
32 p3 = re.compile(r'[.\n]')
33 print(p3.match('\n'))
34
35 print('Negation')
36
37 p4 = re.compile(r'[^>]')
38 print(p4.match('\n'))
39
40 def testAttrRe(self):
41 # type: () -> None
42 _ATTR_RE = htm8._ATTR_RE
43 m = _ATTR_RE.match(' empty= val')
44 print(m.groups())
45
46
class FunctionsTest(unittest.TestCase):
    """Smoke tests for module-level helper functions in htm8."""

    def testFindLineNum(self):
        # type: () -> None

        # Print what htm8._FindLineNum reports for several positions.
        text = 'foo\n' * 3

        # The text is 12 characters long, so the last position is out of
        # bounds.
        positions = [1, 5, 10, 50]
        for position in positions:
            print(htm8._FindLineNum(text, position))
55
56
class AttrLexerTest(unittest.TestCase):
    """Tests for htm8.AttrLexer: reading attribute names and raw values."""

    def _InitAttrLexer(self, h, lx, expected_id):
        # type: (str, htm8.Lexer, h8_id_t) -> htm8.AttrLexer
        """Read the next token from lx, check its ID, and set up an AttrLexer.

        Args:
          h: the full input string that lx is lexing
          lx: a lexer whose next token is the tag under test
          expected_id: the token ID that the tag must have

        Returns:
          An AttrLexer over h, initialized with lx.TagNamePos() and the end
          position of the token that was just read.
        """
        tok_id, end_pos = lx.Read()
        self.assertEqual(expected_id, tok_id)

        attr_lx = htm8.AttrLexer(h)
        attr_lx.Init(lx.TagNamePos(), end_pos)
        return attr_lx

    def _AssertReadRawValueFails(self, attr_lx):
        # type: (htm8.AttrLexer) -> None
        """Check that ReadRawValue() raises AssertionError, and print it."""
        try:
            attr_lx.ReadRawValue()
        except AssertionError as e:
            print(e)
        else:
            # Outside the try block, so this failure is not swallowed by the
            # except clause above.
            self.fail('should have failed')

    def testNoAttrs(self):
        # type: () -> None

        # TODO: h8_id.StartTag and EndTag will expose the tag_name_pos - the
        # end of the tag name

        h = 'x <a>'
        lx = htm8.Lexer(h)

        # Skip raw data
        tok_id, _ = lx.Read()
        self.assertEqual(h8_id.RawData, tok_id)

        attr_lx = self._InitAttrLexer(h, lx, h8_id.StartTag)

        # The tag has no attributes, so there is no name to read
        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Done)
        self.assertEqual(-1, name_start)
        self.assertEqual(-1, name_end)

        # No name was read, so asking for a value is expected to assert
        self._AssertReadRawValueFails(attr_lx)

    def testInvalid(self):
        # type: () -> None
        h = '<a !>'
        lx = htm8.Lexer(h)

        attr_lx = self._InitAttrLexer(h, lx, h8_id.StartTag)

        # '!' is reported as an invalid attribute name
        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Invalid)
        self.assertEqual(-1, name_start)
        self.assertEqual(-1, name_end)

        # No valid name was read, so asking for a value is expected to assert
        self._AssertReadRawValueFails(attr_lx)

    def testEmpty(self):
        # type: () -> None
        h = '<img src=/>'
        lx = htm8.Lexer(h)

        attr_lx = self._InitAttrLexer(h, lx, h8_id.StartEndTag)

        # The name 'src' spans h[5:8]
        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(5, name_start)
        self.assertEqual(8, name_end)
        self.assertEqual(False, attr_lx.next_value_is_missing)

        self.assertEqual(True, attr_lx.AttrNameEquals('src'))
        self.assertEqual(False, attr_lx.AttrNameEquals('srcz'))

        # There is an '=' but nothing after it: value is Empty, with no span
        v, attr_start, attr_end = attr_lx.ReadRawValue()
        log('v = %s', attr_value_str(v))
        self.assertEqual(attr_value_e.Empty, v)
        self.assertEqual(-1, attr_start)
        self.assertEqual(-1, attr_end)

    def testMissing(self):
        # type: () -> None
        h = '<img SRC/>'
        lx = htm8.Lexer(h)

        attr_lx = self._InitAttrLexer(h, lx, h8_id.StartEndTag)

        # The name 'SRC' spans h[5:8]
        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(5, name_start)
        self.assertEqual(8, name_end)
        self.assertEqual(True, attr_lx.next_value_is_missing)

        # The upper case name 'SRC' is expected to equal lower case 'src'
        self.assertEqual(True, attr_lx.AttrNameEquals('src'))
        self.assertEqual(False, attr_lx.AttrNameEquals('srcz'))

        # There is no '=' at all: value is Missing, with no span
        v, attr_start, attr_end = attr_lx.ReadRawValue()
        self.assertEqual(attr_value_e.Missing, v)
        self.assertEqual(-1, attr_start)
        self.assertEqual(-1, attr_end)

    def testUnquoted(self):
        # type: () -> None

        # CAREFUL: /> is a StartEndTag, and / is not part of unquoted value
        h = '<a x=foo/>'
        lx = htm8.Lexer(h)

        attr_lx = self._InitAttrLexer(h, lx, h8_id.StartEndTag)

        # The name 'x' spans h[3:4]
        n, name_start, name_end = attr_lx.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(3, name_start)
        self.assertEqual(4, name_end)

        v, attr_start, attr_end = attr_lx.ReadRawValue()

        log('v = %s', attr_value_str(v))
        log('unquoted val %r', h[attr_start:attr_end])

        # The value 'foo' spans h[5:8]; the trailing '/' is excluded
        self.assertEqual(attr_value_e.Unquoted, v)
        self.assertEqual(5, attr_start)
        self.assertEqual(8, attr_end)
185
186
def ValidTokenList(s, no_special_tags=False):
    # type: (str, bool) -> List[Tuple[h8_id_t, int]]
    """Lex all of s eagerly and return its (token ID, end position) pairs.

    This is a plain loop rather than an iterator, so that it can be more
    easily translated to C++.

    Args:
      s: the string to lex
      no_special_tags: passed through to htm8.Lexer

    Returns:
      Every token in order, ending with the h8_id.EndOfStream token.

    Raises:
      htm8.LexError: if the lexer yields h8_id.Invalid.  It is constructed
        with s and the position where the invalid token starts.
    """
    lexer = htm8.Lexer(s, no_special_tags=no_special_tags)
    result = []  # type: List[Tuple[h8_id_t, int]]

    tok_start = 0  # where the token being read begins
    done = False
    while not done:
        tok_id, tok_end = lexer.Read()
        if tok_id == h8_id.Invalid:
            raise htm8.LexError(s, tok_start)

        result.append((tok_id, tok_end))
        done = (tok_id == h8_id.EndOfStream)
        tok_start = tok_end
    return result
203
204
def Lex(h, no_special_tags=False):
    # type: (str, bool) -> List[Tuple[h8_id_t, int]]
    """Tokenize h, logging the input and each token, and return the tokens.

    Args:
      h: the string to lex
      no_special_tags: passed through to ValidTokenList()

    Returns:
      The list from ValidTokenList(), unchanged.  The return type is
      declared to match that function's.

    Any exception raised by ValidTokenList() propagates to the caller.
    """
    print(repr(h))
    tokens = ValidTokenList(h, no_special_tags=no_special_tags)

    # Each token only records where it ends, so track where it began in order
    # to log the text it covers.
    start_pos = 0
    for tok_id, end_pos in tokens:
        frag = h[start_pos:end_pos]
        log('%d %s %r', end_pos, h8_id_str(tok_id), frag)
        start_pos = end_pos
    return tokens
215
216
class LexerTest(unittest.TestCase):
    """Tests for htm8.Lexer, checking (token ID, end position) pairs.

    Several inputs are triple-quoted literals, so the expected end positions
    depend on the exact indentation inside those literals.
    """

    # IndexLinker in devtools/make_help.py
    #  <pre> sections in doc/html_help.py
    # TocExtractor in devtools/cmark.py

    def testPstrip(self):
        # type: () -> None
        """Remove anything like this.

        <p><pstrip> </pstrip></p>
        """
        pass

    def testCommentParse(self):
        # type: () -> None

        # Smoke test: the sample document must lex without raising LexError
        Lex(TEST_HTML)

    def testCommentParse2(self):
        # type: () -> None
        h = '''
        hi <!-- line 1
                line 2 --><br/>'''
        tokens = Lex(h)

        self.assertEqual(
            [
                (h8_id.RawData, 12),  # newline, indent, 'hi '
                (h8_id.Comment, 50),  # <!-- line 1 ... line 2 -->
                (h8_id.StartEndTag, 55),  # <br/>
                (h8_id.EndOfStream, 55),
            ],
            tokens)

    def testProcessingInstruction(self):
        # type: () -> None
        # <?xml ?> header
        h = 'hi <? err ?>'
        tokens = Lex(h)

        self.assertEqual(
            [
                (h8_id.RawData, 3),
                (h8_id.Processing, 12),  # <? err ?>
                (h8_id.EndOfStream, 12),
            ],
            tokens)

    def testScriptStyle(self):
        # type: () -> None
        h = '''
        hi <script src=""> if (x < 1 && y > 2 ) { console.log(""); }
        </script>
        '''
        tokens = Lex(h)

        expected = [
            (h8_id.RawData, 12),
            (h8_id.StartTag, 27),  # <script src="">
            (h8_id.HtmlCData, 78),  # JavaScript code is HTML CData
            (h8_id.EndTag, 87),  # </script>
            (h8_id.RawData, 96),  # newline and trailing indent
            (h8_id.EndOfStream, 96),
        ]
        self.assertEqual(expected, tokens)

        # Test case matching
        tokens = Lex(h.replace('script', 'scrIPT'))
        self.assertEqual(expected, tokens)

    def testScriptStyleXml(self):
        # type: () -> None
        h = 'hi <script src=""> &lt; </script>'
        # XML mode: the <script> body is lexed like ordinary markup
        tokens = Lex(h, no_special_tags=True)

        self.assertEqual(
            [
                (h8_id.RawData, 3),  # 'hi '
                (h8_id.StartTag, 18),  # <script src="">
                (h8_id.RawData, 19),  # space
                (h8_id.CharEntity, 23),  # &lt;
                (h8_id.RawData, 24),  # space
                (h8_id.EndTag, 33),  # </script>
                (h8_id.EndOfStream, 33),
            ],
            tokens)

    def testCData(self):
        # type: () -> None

        # from
        # /home/andy/src/languages/Python-3.11.5/Lib/test/xmltestdata/c14n-20/inC14N4.xml
        h = '<compute><![CDATA[value>"0" && value<"10" ?"valid":"error"]]></compute>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 9),  # <compute>
            (h8_id.CData, 61),  # <![CDATA[ ... ]]>
            (h8_id.EndTag, 71),  # </compute>
            (h8_id.EndOfStream, 71),
        ], tokens)

    def testEntity(self):
        # type: () -> None

        # from
        # /home/andy/src/Python-3.12.4/Lib/test/xmltestdata/c14n-20/inC14N5.xml
        h = '&ent1;, &ent2;!'

        tokens = Lex(h)

        self.assertEqual([
            (h8_id.CharEntity, 6),  # &ent1;
            (h8_id.RawData, 8),  # ', '
            (h8_id.CharEntity, 14),  # &ent2;
            (h8_id.RawData, 15),  # '!'
            (h8_id.EndOfStream, 15),
        ], tokens)

    def testStartTag(self):
        # type: () -> None

        h = '<a>hi</a>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 3),
            (h8_id.RawData, 5),
            (h8_id.EndTag, 9),
            (h8_id.EndOfStream, 9),
        ], tokens)

        # Make sure we don't consume too much
        h = '<a><source>1.7</source></a>'

        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 3),
            (h8_id.StartTag, 11),
            (h8_id.RawData, 14),
            (h8_id.EndTag, 23),
            (h8_id.EndTag, 27),
            (h8_id.EndOfStream, 27),
        ], tokens)

        # TODO: add a multi-line case, with <source>1.7</source> nested in
        # <configuration> on separate indented lines.  An earlier draft of
        # that case sat here after a bare `return`, so it never ran, and its
        # expected token list was never completed.

    def testBad(self):
        # type: () -> None

        # A lone '&' gets its own token ID
        h = '&'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.BadAmpersand, 1),
            (h8_id.EndOfStream, 1),
        ], tokens)

        # So does a lone '>'
        h = '>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.BadGreaterThan, 1),
            (h8_id.EndOfStream, 1),
        ], tokens)

    def testEndOfStream(self):
        # type: () -> None

        # NUL is end: the EndOfStream token ends at position 2, just past the
        # NUL, and the 'b' after it is never lexed
        h = 'a\0b'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.RawData, 1),
            (h8_id.EndOfStream, 2),
        ], tokens)
410
411
# Run every test case in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()