1 | #!/usr/bin/env python2
|
2 | from __future__ import print_function
|
3 |
|
4 | from _devbuild.gen.htm8_asdl import (h8_id, h8_id_t, h8_id_str, attr_name)
|
5 |
|
6 | import unittest
|
7 | import re
|
8 |
|
9 | from typing import List, Tuple
|
10 |
|
11 | from data_lang import htm8
|
12 | from doctools.util import log
|
13 |
|
# Sample HTM8 document used by LexerTest.testCommentParse.  The path is
# relative, so it is resolved against the current working directory.
with open('data_lang/testdata/hello.htm8') as html_file:
    TEST_HTML = html_file.read()
|
16 |
|
17 |
|
class RegexTest(unittest.TestCase):
    """Checks of `re` matching behavior around newlines, and of htm8._ATTR_RE."""

    def testDotAll(self):
        # type: () -> None
        """Show which patterns match a lone newline character."""

        # Without re.DOTALL, '.' matches any character except a newline
        p1 = re.compile(r'.')
        m1 = p1.match('\n')
        print(m1)
        self.assertIsNone(m1)

        # With re.DOTALL, '.' matches a newline too
        p2 = re.compile(r'.', re.DOTALL)
        m2 = p2.match('\n')
        print(m2)
        self.assertIsNotNone(m2)
        self.assertEqual('\n', m2.group(0))

        # Inside a character class '.' is a literal dot, so the match below
        # comes from the explicit \n
        p3 = re.compile(r'[.\n]')
        m3 = p3.match('\n')
        print(m3)
        self.assertIsNotNone(m3)
        self.assertEqual('\n', m3.group(0))

        print('Negation')

        # A negated character class matches a newline without any flags
        p4 = re.compile(r'[^>]')
        m4 = p4.match('\n')
        print(m4)
        self.assertIsNotNone(m4)
        self.assertEqual('\n', m4.group(0))

    def testAttrRe(self):
        # type: () -> None
        """Print the groups htm8._ATTR_RE captures for an attribute with an empty-looking value."""
        _ATTR_RE = htm8._ATTR_RE
        m = _ATTR_RE.match(' empty= val')
        # Fail with a readable message instead of an AttributeError on None
        self.assertIsNotNone(m, "_ATTR_RE did not match ' empty= val'")
        print(m.groups())
|
44 |
|
45 |
|
class FunctionsTest(unittest.TestCase):
    """Smoke tests for module-level helper functions in htm8."""

    def testFindLineNum(self):
        # type: () -> None
        """Print the line number htm8._FindLineNum reports for several positions."""
        text = 'foo\n' * 3
        # The last position (50) is out of bounds for the 12-character string
        positions = [1, 5, 10, 50]
        for offset in positions:
            print(htm8._FindLineNum(text, offset))
|
54 |
|
55 |
|
class AttrLexerTest(unittest.TestCase):
    """Tests that drive htm8.AttrLexer over the start tag found by htm8.Lexer."""

    def testNoAttrs(self):
        # type: () -> None
        """A start tag without attributes yields attr_name.Done and -1 positions."""

        # TODO: h8_id.StartTag and EndTag will expose the tag_name_pos - the
        # end of the tag name

        html = 'x <a>'
        lexer = htm8.Lexer(html)

        # Skip the raw data that precedes the tag
        first_id, tag_end = lexer.Read()
        self.assertEqual(h8_id.RawData, first_id)

        second_id, tag_end = lexer.Read()
        self.assertEqual(h8_id.StartTag, second_id)

        attrs = htm8.AttrLexer(html)
        attrs.Init(lexer.TagNamePos(), tag_end)

        # The tag has no attributes, so the first read is already Done
        status, start, end = attrs.ReadName()
        self.assertEqual(status, attr_name.Done)
        self.assertEqual(-1, start)
        self.assertEqual(-1, end)

    def testAttr(self):
        # type: () -> None
        """Read the first attribute name of a tag; only checks that the call unpacks."""

        html = '<a href=foo>'
        lexer = htm8.Lexer(html)

        first_id, tag_end = lexer.Read()
        self.assertEqual(h8_id.StartTag, first_id)

        attrs = htm8.AttrLexer(html)
        attrs.Init(lexer.TagNamePos(), tag_end)
        # The result is not asserted yet; the unpacking only requires a
        # 3-element return value
        status, start, end = attrs.ReadName()
|
94 |
|
95 |
|
def ValidTokenList(s, no_special_tags=False):
    # type: (str, bool) -> List[Tuple[h8_id_t, int]]
    """Lex all of `s` into a list of (token id, end position) pairs.

    A wrapper that can be more easily translated to C++.  Doesn't use
    iterators.

    The returned list always ends with the h8_id.EndOfStream token.

    Raises:
      htm8.LexError: if the lexer produces h8_id.Invalid; the error is
        constructed with the position where that token started.
    """
    lexer = htm8.Lexer(s, no_special_tags=no_special_tags)
    result = []
    tok_start = 0

    tok_id, end_pos = lexer.Read()
    result.append((tok_id, end_pos))
    while tok_id != h8_id.EndOfStream:
        if tok_id == h8_id.Invalid:
            raise htm8.LexError(s, tok_start)
        # The next token begins where this one ended
        tok_start = end_pos
        tok_id, end_pos = lexer.Read()
        result.append((tok_id, end_pos))
    return result
|
112 |
|
113 |
|
def Lex(h, no_special_tags=False):
    # type: (str, bool) -> List[Tuple[h8_id_t, int]]
    """Lex `h`, log every token, and return the token list.

    Prints repr(h), then logs one line per token: its end position, the
    token id name, and the fragment of `h` the token covers.

    Args:
      h: the text to lex
      no_special_tags: passed through to ValidTokenList

    Returns:
      The list of (token id, end position) pairs from ValidTokenList,
      unchanged.
    """
    print(repr(h))
    tokens = ValidTokenList(h, no_special_tags=no_special_tags)

    start_pos = 0
    for tok_id, end_pos in tokens:
        # Each token covers the text from the previous token's end to its own
        frag = h[start_pos:end_pos]
        log('%d %s %r', end_pos, h8_id_str(tok_id), frag)
        start_pos = end_pos
    return tokens
|
124 |
|
125 |
|
class LexerTest(unittest.TestCase):
    """Lex small snippets and compare the (token id, end position) pairs."""

    # IndexLinker in devtools/make_help.py
    # <pre> sections in doc/html_help.py
    # TocExtractor in devtools/cmark.py

    def testPstrip(self):
        # type: () -> None
        """Remove anything like this.

        <p><pstrip> </pstrip></p>
        """
        pass

    def testCommentParse(self):
        # type: () -> None
        """The sample document lexes without error and ends with EndOfStream."""
        tokens = Lex(TEST_HTML)

        # ValidTokenList only returns after appending the EndOfStream token
        self.assertEqual(h8_id.EndOfStream, tokens[-1][0])

    def testCommentParse2(self):
        # type: () -> None
        """A comment that spans two lines is a single Comment token."""
        h = '''
        hi <!-- line 1
                line 2 --><br/>'''
        tokens = Lex(h)

        self.assertEqual(
            [
                (h8_id.RawData, 12),  # newline, indentation, 'hi '
                (h8_id.Comment, 50),  # <!-- line 1 ... line 2 -->
                (h8_id.StartEndTag, 55),  # <br/>
                (h8_id.EndOfStream, 55),
            ],
            tokens)

    def testProcessingInstruction(self):
        # type: () -> None
        # <?xml ?> header
        h = 'hi <? err ?>'
        tokens = Lex(h)

        self.assertEqual(
            [
                (h8_id.RawData, 3),
                (h8_id.Processing, 12),  # <? err ?>
                (h8_id.EndOfStream, 12),
            ],
            tokens)

    def testScriptStyle(self):
        # type: () -> None
        """The body of <script> is one HtmlCData token, even with < and & in it."""
        h = '''
        hi <script src=""> if (x < 1 && y > 2 ) { console.log(""); }
        </script>
        '''
        tokens = Lex(h)

        expected = [
            (h8_id.RawData, 12),  # newline, indentation, 'hi '
            (h8_id.StartTag, 27),  # <script src="">
            (h8_id.HtmlCData, 78),  # JavaScript code is HTML CData
            (h8_id.EndTag, 87),  # </script>
            (h8_id.RawData, 96),  # \n and the trailing indentation
            (h8_id.EndOfStream, 96),
        ]
        self.assertEqual(expected, tokens)

        # The same tokens are expected when the tag name is in mixed case
        tokens = Lex(h.replace('script', 'scrIPT'))
        self.assertEqual(expected, tokens)

    def testScriptStyleXml(self):
        # type: () -> None
        """With no_special_tags=True, the <script> body is lexed like any other text."""
        h = 'hi <script src=""> &lt; </script>'
        # XML mode
        tokens = Lex(h, no_special_tags=True)

        self.assertEqual(
            [
                (h8_id.RawData, 3),  # 'hi '
                (h8_id.StartTag, 18),  # <script src="">
                (h8_id.RawData, 19),  # space
                (h8_id.CharEntity, 23),  # &lt;
                (h8_id.RawData, 24),  # space
                (h8_id.EndTag, 33),  # </script>
                (h8_id.EndOfStream, 33),
            ],
            tokens)

    def testCData(self):
        # type: () -> None

        # from
        # /home/andy/src/languages/Python-3.11.5/Lib/test/xmltestdata/c14n-20/inC14N4.xml
        h = '<compute><![CDATA[value>"0" && value<"10" ?"valid":"error"]]></compute>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 9),  # <compute>
            (h8_id.CData, 61),  # <![CDATA[ ... ]]>
            (h8_id.EndTag, 71),  # </compute>
            (h8_id.EndOfStream, 71),
        ], tokens)

    def testEntity(self):
        # type: () -> None

        # from
        # /home/andy/src/Python-3.12.4/Lib/test/xmltestdata/c14n-20/inC14N5.xml
        h = '&ent1;, &ent2;!'

        tokens = Lex(h)

        self.assertEqual([
            (h8_id.CharEntity, 6),  # &ent1;
            (h8_id.RawData, 8),  # ', '
            (h8_id.CharEntity, 14),  # &ent2;
            (h8_id.RawData, 15),  # !
            (h8_id.EndOfStream, 15),
        ], tokens)

    def testStartTag(self):
        # type: () -> None

        h = '<a>hi</a>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 3),
            (h8_id.RawData, 5),
            (h8_id.EndTag, 9),
            (h8_id.EndOfStream, 9),
        ], tokens)

        # Make sure we don't consume too much
        h = '<a><source>1.7</source></a>'

        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 3),
            (h8_id.StartTag, 11),
            (h8_id.RawData, 14),
            (h8_id.EndTag, 23),
            (h8_id.EndTag, 27),
            (h8_id.EndOfStream, 27),
        ], tokens)

        # NOTE(review): the early return below disables the rest of this
        # test.  The expected list after it looks like an unfinished
        # placeholder (it stops after the first start tag) - TODO finish it
        # or remove it.
        return

        h = '''
        <configuration>
          <source>1.7</source>
        </configuration>'''

        tokens = Lex(h)

        self.assertEqual([
            (h8_id.RawData, 9),
            (h8_id.StartTag, 24),
            (h8_id.RawData, 9),
            (h8_id.EndOfStream, 9),
        ], tokens)

    def testBad(self):
        # type: () -> None
        """A lone & or > gets its own Bad* token id."""
        h = '&'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.BadAmpersand, 1),
            (h8_id.EndOfStream, 1),
        ], tokens)

        h = '>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.BadGreaterThan, 1),
            (h8_id.EndOfStream, 1),
        ], tokens)

    def testEndOfStream(self):
        # type: () -> None

        # NUL is end: EndOfStream is expected at position 2, and no token is
        # expected for the 'b' after the NUL
        h = 'a\0b'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.RawData, 1),
            (h8_id.EndOfStream, 2),
        ], tokens)
|
319 |
|
320 |
|
# Run every test case in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()
|