OILS / data_lang / htm8_test.py View on Github | oils.pub

322 lines, 198 significant
1#!/usr/bin/env python2
2from __future__ import print_function
3
4from _devbuild.gen.htm8_asdl import (h8_id, h8_id_t, h8_id_str, attr_name)
5
6import unittest
7import re
8
9from typing import List, Tuple
10
11from data_lang import htm8
12from doctools.util import log
13
# Load the shared HTML fixture once at import time; several test classes use it.
f = open('data_lang/testdata/hello.htm8')
try:
    TEST_HTML = f.read()
finally:
    f.close()
16
17
18class RegexTest(unittest.TestCase):
19
20 def testDotAll(self):
21 # type: () -> None
22
23 # Note that $ matches end of line, not end of string
24 p1 = re.compile(r'.')
25 print(p1.match('\n'))
26
27 p2 = re.compile(r'.', re.DOTALL)
28 print(p2.match('\n'))
29
30 #p3 = re.compile(r'[.\n]', re.VERBOSE)
31 p3 = re.compile(r'[.\n]')
32 print(p3.match('\n'))
33
34 print('Negation')
35
36 p4 = re.compile(r'[^>]')
37 print(p4.match('\n'))
38
39 def testAttrRe(self):
40 # type: () -> None
41 _ATTR_RE = htm8._ATTR_RE
42 m = _ATTR_RE.match(' empty= val')
43 print(m.groups())
44
45
class FunctionsTest(unittest.TestCase):
    """Tests for module-level helper functions in htm8."""

    def testFindLineNum(self):
        # type: () -> None
        text = 'foo\n' * 3
        # 1, 5, 10 land on lines 1-3; 50 is out of bounds
        for offset in [1, 5, 10, 50]:
            print(htm8._FindLineNum(text, offset))
54
55
class AttrLexerTest(unittest.TestCase):
    """Tests for htm8.AttrLexer, which scans attributes inside a start tag."""

    def testNoAttrs(self):
        # type: () -> None

        # TODO: h8_id.StartTag and EndTag will expose the tag_name_pos - the
        # end of the tag name

        doc = 'x <a>'
        lexer = htm8.Lexer(doc)

        # The first token is the raw text before <a>; skip it.
        tok_id, end_pos = lexer.Read()
        self.assertEqual(h8_id.RawData, tok_id)

        tok_id, end_pos = lexer.Read()
        self.assertEqual(h8_id.StartTag, tok_id)

        attr_lexer = htm8.AttrLexer(doc)
        attr_lexer.Init(lexer.TagNamePos(), end_pos)

        # <a> has no attributes, so the attr lexer reports Done right away,
        # with -1 for both name positions.
        n, name_start, name_end = attr_lexer.ReadName()
        self.assertEqual(n, attr_name.Done)
        self.assertEqual(-1, name_start)
        self.assertEqual(-1, name_end)

    def testAttr(self):
        # type: () -> None

        doc = '<a href=foo>'
        lexer = htm8.Lexer(doc)

        tok_id, end_pos = lexer.Read()
        self.assertEqual(h8_id.StartTag, tok_id)

        attr_lexer = htm8.AttrLexer(doc)
        attr_lexer.Init(lexer.TagNamePos(), end_pos)
        n, name_start, name_end = attr_lexer.ReadName()
94
95
def ValidTokenList(s, no_special_tags=False):
    # type: (str, bool) -> List[Tuple[h8_id_t, int]]
    """A wrapper that can be more easily translated to C++. Doesn't use iterators."""

    lx = htm8.Lexer(s, no_special_tags=no_special_tags)
    out = []
    pos = 0  # start of the token currently being read
    while True:
        tok_id, end_pos = lx.Read()
        out.append((tok_id, end_pos))
        if tok_id == h8_id.EndOfStream:
            return out
        if tok_id == h8_id.Invalid:
            # Report the error at the start of the bad token
            raise htm8.LexError(s, pos)
        pos = end_pos
112
113
def Lex(h, no_special_tags=False):
    # type: (str, bool) -> List[Tuple[int, int]]
    """Lex h, log each (end_pos, token id, fragment) triple, and return the tokens."""
    print(repr(h))
    tokens = ValidTokenList(h, no_special_tags=no_special_tags)
    left = 0
    for tok_id, right in tokens:
        # Each token covers h[left:right], where left is the previous end
        log('%d %s %r', right, h8_id_str(tok_id), h[left:right])
        left = right
    return tokens
124
125
class LexerTest(unittest.TestCase):
    """Token-stream checks for htm8.Lexer.

    Each expected tuple is (h8_id, end_pos): a token covers the input from
    the previous token's end position up to end_pos.
    """

    # Other consumers of this kind of lexing:
    # IndexLinker in devtools/make_help.py
    # <pre> sections in doc/html_help.py
    # TocExtractor in devtools/cmark.py

    def testPstrip(self):
        # type: () -> None
        """Remove anything like this.

        <p><pstrip> </pstrip></p>
        """
        pass

    def testCommentParse(self):
        # type: () -> None
        # Lex() raises LexError on h8_id.Invalid, so this validates the whole
        # hello.htm8 fixture end to end.
        tokens = Lex(TEST_HTML)

    def testCommentParse2(self):
        # type: () -> None
        h = '''
        hi <!-- line 1
                line 2 --><br/>'''
        tokens = Lex(h)

        self.assertEqual(
            [
                (h8_id.RawData, 12),
                (h8_id.Comment, 50),  # <!-- line 1 ... line 2 -->
                (h8_id.StartEndTag, 55),  # <br/>
                (h8_id.EndOfStream, 55),
            ],
            tokens)

    def testProcessingInstruction(self):
        # type: () -> None
        # <?xml ?> style header
        h = 'hi <? err ?>'
        tokens = Lex(h)

        self.assertEqual(
            [
                (h8_id.RawData, 3),  # 'hi '
                (h8_id.Processing, 12),  # <? err ?>
                (h8_id.EndOfStream, 12),
            ],
            tokens)

    def testScriptStyle(self):
        # type: () -> None
        h = '''
        hi <script src=""> if (x < 1 && y > 2 ) { console.log(""); }
        </script>
        '''
        tokens = Lex(h)

        expected = [
            (h8_id.RawData, 12),
            (h8_id.StartTag, 27),  # <script src="">
            (h8_id.HtmlCData, 78),  # JavaScript code is HTML CData
            (h8_id.EndTag, 87),  # </script>
            (h8_id.RawData, 96),  # trailing newline and indent
            (h8_id.EndOfStream, 96),
        ]
        self.assertEqual(expected, tokens)

        # Tag name matching is case-insensitive
        tokens = Lex(h.replace('script', 'scrIPT'))
        self.assertEqual(expected, tokens)

    def testScriptStyleXml(self):
        # type: () -> None
        h = 'hi <script src=""> &lt; </script>'
        # XML mode: <script> contents are lexed normally, not as CData
        tokens = Lex(h, no_special_tags=True)

        self.assertEqual(
            [
                (h8_id.RawData, 3),  # 'hi '
                (h8_id.StartTag, 18),  # <script src="">
                (h8_id.RawData, 19),  # space
                (h8_id.CharEntity, 23),  # &lt;
                (h8_id.RawData, 24),  # space
                (h8_id.EndTag, 33),  # </script>
                (h8_id.EndOfStream, 33),
            ],
            tokens)

    def testCData(self):
        # type: () -> None

        # from
        # /home/andy/src/languages/Python-3.11.5/Lib/test/xmltestdata/c14n-20/inC14N4.xml
        h = '<compute><![CDATA[value>"0" && value<"10" ?"valid":"error"]]></compute>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 9),  # <compute>
            (h8_id.CData, 61),  # <![CDATA[ ... ]]>
            (h8_id.EndTag, 71),  # </compute>
            (h8_id.EndOfStream, 71),
        ], tokens)

    def testEntity(self):
        # type: () -> None

        # from
        # /home/andy/src/Python-3.12.4/Lib/test/xmltestdata/c14n-20/inC14N5.xml
        h = '&ent1;, &ent2;!'

        tokens = Lex(h)

        self.assertEqual([
            (h8_id.CharEntity, 6),  # &ent1;
            (h8_id.RawData, 8),  # ', '
            (h8_id.CharEntity, 14),  # &ent2;
            (h8_id.RawData, 15),  # '!'
            (h8_id.EndOfStream, 15),
        ], tokens)

    def testStartTag(self):
        # type: () -> None

        h = '<a>hi</a>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 3),
            (h8_id.RawData, 5),
            (h8_id.EndTag, 9),
            (h8_id.EndOfStream, 9),
        ], tokens)

        # Make sure we don't consume too much
        h = '<a><source>1.7</source></a>'

        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 3),
            (h8_id.StartTag, 11),  # <source> is an ordinary tag here
            (h8_id.RawData, 14),
            (h8_id.EndTag, 23),
            (h8_id.EndTag, 27),
            (h8_id.EndOfStream, 27),
        ], tokens)

        # TODO: A multi-line <configuration><source> case was disabled here
        # behind an early 'return'; its expected token list was never
        # finished (it repeated placeholder positions), so the unreachable
        # code was removed.

    def testBad(self):
        # type: () -> None
        # A bare '&' that doesn't start an entity
        h = '&'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.BadAmpersand, 1),
            (h8_id.EndOfStream, 1),
        ], tokens)

        # A '>' outside of any tag
        h = '>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.BadGreaterThan, 1),
            (h8_id.EndOfStream, 1),
        ], tokens)

    def testEndOfStream(self):
        # type: () -> None

        # NUL terminates lexing early
        h = 'a\0b'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.RawData, 1),
            (h8_id.EndOfStream, 2),
        ], tokens)
319
320
# Run all TestCase classes in this module when executed directly.
if __name__ == '__main__':
    unittest.main()