#!/usr/bin/env python2
from __future__ import print_function

from _devbuild.gen.htm8_asdl import (h8_id, h8_id_t, h8_id_str, attr_name)

import unittest
import re

from typing import List, Tuple

from data_lang import htm8
from doctools.util import log

with open('data_lang/testdata/hello.htm8') as f:
    TEST_HTML = f.read()


class RegexTest(unittest.TestCase):

    def testDotAll(self):
        # type: () -> None

        # Note: '.' does not match a newline unless re.DOTALL is set
        p1 = re.compile(r'.')
        print(p1.match('\n'))

        p2 = re.compile(r'.', re.DOTALL)
        print(p2.match('\n'))

        #p3 = re.compile(r'[.\n]', re.VERBOSE)
        p3 = re.compile(r'[.\n]')
        print(p3.match('\n'))

        print('Negation')

        p4 = re.compile(r'[^>]')
        print(p4.match('\n'))

    def testAttrRe(self):
        # type: () -> None
        _ATTR_RE = htm8._ATTR_RE
        m = _ATTR_RE.match(' empty= val')
        print(m.groups())


class FunctionsTest(unittest.TestCase):

    def testFindLineNum(self):
        # type: () -> None
        s = 'foo\n' * 3
        for pos in [1, 5, 10, 50]:  # 50 is out of bounds
            line_num = htm8._FindLineNum(s, pos)
            print(line_num)


class AttrLexerTest(unittest.TestCase):

    def testNoAttrs(self):
        # type: () -> None

        # TODO: h8_id.StartTag and EndTag will expose the tag_name_pos - the
        # end of the tag name

        h = 'x <a>'
        lx = htm8.Lexer(h)

        # Skip raw data
        tok_id, end_pos = lx.Read()
        self.assertEqual(h8_id.RawData, tok_id)

        tok_id, end_pos = lx.Read()
        self.assertEqual(h8_id.StartTag, tok_id)

        attr_lexer = htm8.AttrLexer(h)
        attr_lexer.Init(lx.TagNamePos(), end_pos)

        # There are no attributes on <a>
        n, name_start, name_end = attr_lexer.ReadName()
        self.assertEqual(n, attr_name.Done)
        self.assertEqual(-1, name_start)
        self.assertEqual(-1, name_end)

    def testInvalid(self):
        # type: () -> None
        h = '<a !>'
        lx = htm8.Lexer(h)

        tok_id, end_pos = lx.Read()
        self.assertEqual(h8_id.StartTag, tok_id)

        attr_lexer = htm8.AttrLexer(h)
        attr_lexer.Init(lx.TagNamePos(), end_pos)

        n, name_start, name_end = attr_lexer.ReadName()
        self.assertEqual(n, attr_name.Invalid)
        self.assertEqual(-1, name_start)
        self.assertEqual(-1, name_end)

    def testEmpty(self):
        # type: () -> None
        h = '<img src=/>'
        lx = htm8.Lexer(h)

        tok_id, end_pos = lx.Read()
        self.assertEqual(h8_id.StartEndTag, tok_id)

        attr_lexer = htm8.AttrLexer(h)
        attr_lexer.Init(lx.TagNamePos(), end_pos)

        n, name_start, name_end = attr_lexer.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(5, name_start)
        self.assertEqual(8, name_end)
        self.assertEqual(False, attr_lexer.next_value_is_missing)

        self.assertEqual(True, attr_lexer.AttrNameEquals('src'))
        self.assertEqual(False, attr_lexer.AttrNameEquals('srcz'))

    def testMissing(self):
        # type: () -> None
        h = '<img SRC/>'
        lx = htm8.Lexer(h)

        tok_id, end_pos = lx.Read()
        self.assertEqual(h8_id.StartEndTag, tok_id)

        attr_lexer = htm8.AttrLexer(h)
        attr_lexer.Init(lx.TagNamePos(), end_pos)

        n, name_start, name_end = attr_lexer.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(5, name_start)
        self.assertEqual(8, name_end)
        self.assertEqual(True, attr_lexer.next_value_is_missing)

        self.assertEqual(True, attr_lexer.AttrNameEquals('src'))
        self.assertEqual(False, attr_lexer.AttrNameEquals('srcz'))

    def testAttr(self):
        # type: () -> None
        h = '<a x=foo>'
        lx = htm8.Lexer(h)

        tok_id, end_pos = lx.Read()
        self.assertEqual(h8_id.StartTag, tok_id)

        attr_lexer = htm8.AttrLexer(h)
        attr_lexer.Init(lx.TagNamePos(), end_pos)
        n, name_start, name_end = attr_lexer.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(3, name_start)
        self.assertEqual(4, name_end)

        # Note: internal state is set according to the '=' after the name


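# Sketch of how Lexer and AttrLexer compose, using only calls exercised by the
# tests above.  FirstAttrIsSrc is an illustrative helper for this file, not
# part of the htm8 API.  Per testEmpty above, FirstAttrIsSrc('<img src=/>')
# should return True.
def FirstAttrIsSrc(h):
    # type: (str) -> bool
    """Return True if the first tag's first attribute is named 'src'."""
    lx = htm8.Lexer(h)
    while True:
        tok_id, end_pos = lx.Read()
        if tok_id == h8_id.EndOfStream:
            return False
        if tok_id == h8_id.Invalid:
            return False  # give up on lex errors (ValidTokenList raises instead)
        if tok_id == h8_id.StartTag or tok_id == h8_id.StartEndTag:
            attr_lexer = htm8.AttrLexer(h)
            attr_lexer.Init(lx.TagNamePos(), end_pos)
            n, _, _ = attr_lexer.ReadName()
            return n == attr_name.Ok and attr_lexer.AttrNameEquals('src')

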
def ValidTokenList(s, no_special_tags=False):
    # type: (str, bool) -> List[Tuple[h8_id_t, int]]
    """A wrapper that can be more easily translated to C++.  Doesn't use iterators."""

    start_pos = 0
    tokens = []
    lx = htm8.Lexer(s, no_special_tags=no_special_tags)
    while True:
        tok_id, end_pos = lx.Read()
        tokens.append((tok_id, end_pos))
        if tok_id == h8_id.EndOfStream:
            break
        if tok_id == h8_id.Invalid:
            raise htm8.LexError(s, start_pos)
        start_pos = end_pos
    return tokens


def Lex(h, no_special_tags=False):
    # type: (str, bool) -> List[Tuple[int, int]]
    print(repr(h))
    tokens = ValidTokenList(h, no_special_tags=no_special_tags)
    start_pos = 0
    for tok_id, end_pos in tokens:
        frag = h[start_pos:end_pos]
        log('%d %s %r', end_pos, h8_id_str(tok_id), frag)
        start_pos = end_pos
    return tokens


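# Sketch: the same Read() loop can collect only the h8_id.RawData fragments.
# This mirrors ValidTokenList() above; ExtractRawData is an illustrative name,
# not part of the htm8 API.  Per testStartTag below, ExtractRawData('<a>hi</a>')
# should give 'hi'.
def ExtractRawData(s):
    # type: (str) -> str
    chunks = []
    start_pos = 0
    lx = htm8.Lexer(s)
    while True:
        tok_id, end_pos = lx.Read()
        if tok_id == h8_id.EndOfStream:
            break
        if tok_id == h8_id.Invalid:
            raise htm8.LexError(s, start_pos)
        if tok_id == h8_id.RawData:
            chunks.append(s[start_pos:end_pos])
        start_pos = end_pos
    return ''.join(chunks)

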
class LexerTest(unittest.TestCase):

    # IndexLinker in devtools/make_help.py
    # <pre> sections in doc/html_help.py
    # TocExtractor in devtools/cmark.py

    def testPstrip(self):
        # type: () -> None
        """Remove anything like this.

        <p><pstrip> </pstrip></p>
        """
        pass

    def testCommentParse(self):
        # type: () -> None
        n = len(TEST_HTML)
        tokens = Lex(TEST_HTML)

    def testCommentParse2(self):
        # type: () -> None
        h = '''
        hi <!-- line 1
                line 2 --><br/>'''
        tokens = Lex(h)

        self.assertEqual(
            [
                (h8_id.RawData, 12),
                (h8_id.Comment, 50),  # <!-- line 1 ... line 2 -->
                (h8_id.StartEndTag, 55),  # <br/>
                (h8_id.EndOfStream, 55),
            ],
            tokens)

    def testProcessingInstruction(self):
        # type: () -> None
        # <?xml ?> header
        h = 'hi <? err ?>'
        tokens = Lex(h)

        self.assertEqual(
            [
                (h8_id.RawData, 3),
                (h8_id.Processing, 12),  # <? err ?>
                (h8_id.EndOfStream, 12),
            ],
            tokens)

    def testScriptStyle(self):
        # type: () -> None
        h = '''
        hi <script src=""> if (x < 1 && y > 2 ) { console.log(""); }
        </script>
        '''
        tokens = Lex(h)

        expected = [
            (h8_id.RawData, 12),
            (h8_id.StartTag, 27),  # <script src="">
            (h8_id.HtmlCData, 78),  # JavaScript code is HTML CData
            (h8_id.EndTag, 87),  # </script>
            (h8_id.RawData, 96),  # trailing newline and indent
            (h8_id.EndOfStream, 96),
        ]
        self.assertEqual(expected, tokens)

        # The tag name is matched case-insensitively
        tokens = Lex(h.replace('script', 'scrIPT'))
        self.assertEqual(expected, tokens)

    def testScriptStyleXml(self):
        # type: () -> None
        h = 'hi <script src=""> &lt; </script>'
        # XML mode: no special <script> handling
        tokens = Lex(h, no_special_tags=True)

        self.assertEqual(
            [
                (h8_id.RawData, 3),  # hi
                (h8_id.StartTag, 18),  # <script src="">
                (h8_id.RawData, 19),  # space
                (h8_id.CharEntity, 23),  # &lt;
                (h8_id.RawData, 24),  # space
                (h8_id.EndTag, 33),  # </script>
                (h8_id.EndOfStream, 33),
            ],
            tokens)

    def testCData(self):
        # type: () -> None

        # from
        # /home/andy/src/languages/Python-3.11.5/Lib/test/xmltestdata/c14n-20/inC14N4.xml
        h = '<compute><![CDATA[value>"0" && value<"10" ?"valid":"error"]]></compute>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 9),
            (h8_id.CData, 61),
            (h8_id.EndTag, 71),
            (h8_id.EndOfStream, 71),
        ], tokens)

    def testEntity(self):
        # type: () -> None

        # from
        # /home/andy/src/Python-3.12.4/Lib/test/xmltestdata/c14n-20/inC14N5.xml
        h = '&ent1;, &ent2;!'

        tokens = Lex(h)

        self.assertEqual([
            (h8_id.CharEntity, 6),
            (h8_id.RawData, 8),
            (h8_id.CharEntity, 14),
            (h8_id.RawData, 15),
            (h8_id.EndOfStream, 15),
        ], tokens)

    def testStartTag(self):
        # type: () -> None

        h = '<a>hi</a>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 3),
            (h8_id.RawData, 5),
            (h8_id.EndTag, 9),
            (h8_id.EndOfStream, 9),
        ], tokens)

        # Make sure we don't consume too much
        h = '<a><source>1.7</source></a>'

        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 3),
            (h8_id.StartTag, 11),
            (h8_id.RawData, 14),
            (h8_id.EndTag, 23),
            (h8_id.EndTag, 27),
            (h8_id.EndOfStream, 27),
        ], tokens)

        return

        h = '''
        <configuration>
          <source>1.7</source>
        </configuration>'''

        tokens = Lex(h)

        self.assertEqual([
            (h8_id.RawData, 9),
            (h8_id.StartTag, 24),
            (h8_id.RawData, 9),
            (h8_id.EndOfStream, 9),
        ], tokens)

    def testBad(self):
        # type: () -> None
        h = '&'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.BadAmpersand, 1),
            (h8_id.EndOfStream, 1),
        ], tokens)

        h = '>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.BadGreaterThan, 1),
            (h8_id.EndOfStream, 1),
        ], tokens)

    def testEndOfStream(self):
        # type: () -> None

        # NUL is end
        h = 'a\0b'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.RawData, 1),
            (h8_id.EndOfStream, 2),
        ], tokens)


if __name__ == '__main__':
    unittest.main()