OILS / lazylex / html_test.py View on Github | oils.pub

425 lines, 275 significant
1#!/usr/bin/env python2
2from __future__ import print_function
3
4import unittest
5
6from lazylex import html # module under test log = html.log
7
# Shorthand for the module's logging helper.
log = html.log

# Shared fixture: the whole test corpus, read once at import time.
with open('lazylex/testdata.html') as f:
    TEST_HTML = f.read()
12
13
class RegexTest(unittest.TestCase):
    """Sanity checks on Python regex behavior that the HTML lexer relies on.

    The original version only printed the match objects; it now asserts the
    expected results as well, so regressions actually fail the test.
    """

    def testDotAll(self):
        import re

        # Note that $ matches end of line, not end of string; likewise a
        # plain '.' does NOT match a newline.
        p1 = re.compile(r'.')
        print(p1.match('\n'))
        self.assertIsNone(p1.match('\n'))

        # With re.DOTALL, '.' matches any character, including newline.
        p2 = re.compile(r'.', re.DOTALL)
        print(p2.match('\n'))
        self.assertIsNotNone(p2.match('\n'))

        # Inside a character class, '.' is literal; the '\n' member is what
        # matches the newline here.
        #p3 = re.compile(r'[.\n]', re.VERBOSE)
        p3 = re.compile(r'[.\n]')
        print(p3.match('\n'))
        self.assertIsNotNone(p3.match('\n'))

        print('Negation')

        # A negated class like [^>] DOES match newline, which is why the
        # lexer can use it to consume multi-line spans.
        p4 = re.compile(r'[^>]')
        print(p4.match('\n'))
        self.assertIsNotNone(p4.match('\n'))
34
35
class FunctionsTest(unittest.TestCase):

    def testFindLineNum(self):
        """Smoke test FindLineNum at in-bounds and out-of-bounds positions."""
        text = 'foo\n' * 3
        # 50 is past the end of the string on purpose.
        for position in [1, 5, 10, 50]:
            print(html.FindLineNum(text, position))
43
44
def _MakeTagLexer(s):
    """Return a TagLexer positioned over the entire string s."""
    tag_lexer = html.TagLexer(s)
    tag_lexer.Reset(0, len(s))
    return tag_lexer
49
50
def _PrintTokens(lex):
    """Log the tag name, then every token of the given TagLexer."""
    log('')
    log('tag = %r', lex.TagName())
    for tok_id, begin, finish in lex.Tokens():
        log('%s %r', tok_id, lex.s[begin:finish])
56
57
class TagLexerTest(unittest.TestCase):
    """Tests for TagLexer: tag names, attribute tokens, and error cases."""

    def testTagLexer(self):
        # Something like '< >' would be invalid input; not exercised here.

        _PrintTokens(_MakeTagLexer('<a>'))

        lex = _MakeTagLexer('<a novalue>')
        _PrintTokens(lex)
        # A bare attribute has no value, so GetAttrRaw() returns None.
        # Note: we could have a different HasAttr() method.
        # <a novalue> means lex.Get('novalue') == None
        # https://developer.mozilla.org/en-US/docs/Web/API/Element/hasAttribute
        self.assertEqual(None, lex.GetAttrRaw('novalue'))

        lex = _MakeTagLexer('<a href="double quoted">')
        _PrintTokens(lex)
        self.assertEqual('double quoted', lex.GetAttrRaw('href'))
        self.assertEqual(None, lex.GetAttrRaw('oops'))

        _PrintTokens(_MakeTagLexer('<a href=foo class="bar">'))
        _PrintTokens(_MakeTagLexer('<a href=foo class="bar" />'))

        # Entities inside attribute values are returned raw (unescaped).
        lex = _MakeTagLexer('<a href="?foo=1&amp;bar=2" />')
        self.assertEqual('?foo=1&amp;bar=2', lex.GetAttrRaw('href'))

    def testTagName(self):
        lex = _MakeTagLexer('<a href=foo class="bar" />')
        self.assertEqual('a', lex.TagName())

    def testAllAttrs(self):
        """AllAttrsRaw() returns [(key, value)] pairs in document order."""
        # self-closing tag
        lex = _MakeTagLexer('<a href=foo class="bar" />')
        self.assertEqual([('href', 'foo'), ('class', 'bar')],
                         lex.AllAttrsRaw())

        lex = _MakeTagLexer('<a href="?foo=1&amp;bar=2" />')
        self.assertEqual([('href', '?foo=1&amp;bar=2')], lex.AllAttrsRaw())

    def testAttrWithoutValue(self):
        # Equivalent to <button disabled="">
        lex = _MakeTagLexer('<button disabled>')
        all_attrs = lex.AllAttrsRaw()
        log('all %s', all_attrs)

        # A stray '!' inside the tag must be rejected with LexError.
        try:
            lex = _MakeTagLexer('<a foo=bar !></a>')
            all_attrs = lex.AllAttrsRaw()
        except html.LexError as e:
            print(e)
        else:
            self.fail('Expected LexError')
120
121
def _MakeAttrValueLexer(s):
    """Return an AttrValueLexer positioned over the entire string s."""
    value_lexer = html.AttrValueLexer(s)
    value_lexer.Reset(0, len(s))
    return value_lexer
126
127
class AttrValueLexerTest(unittest.TestCase):
    """Tests for AttrValueLexer, which tokenizes attribute values."""

    def testGood(self):
        # A value with a character entity in the middle yields 3 tokens.
        lex = _MakeAttrValueLexer('?foo=42&amp;bar=99')
        self.assertEqual(3, lex.NumTokens())
134
135
def Lex(h, no_special_tags=False):
    """Lex h, log each (end position, token name, fragment), return tokens.

    Each token is a (token id, end position) pair; the start position is
    implicitly the previous token's end.
    """
    print(repr(h))
    tokens = html.ValidTokenList(h, no_special_tags=no_special_tags)
    left = 0
    for tok_id, right in tokens:
        log('%d %s %r', right, html.TokenName(tok_id), h[left:right])
        left = right
    return tokens
145
146
class LexerTest(unittest.TestCase):
    """Token-level tests for html.ValidTokenList().

    Expected tokens are (token id, end position) pairs; start positions are
    implicit (the previous token's end).
    """

    # Known consumers of this lexer:
    # IndexLinker in devtools/make_help.py
    # <pre> sections in doc/html_help.py
    # TocExtractor in devtools/cmark.py

    def testPstrip(self):
        """Remove anything like this.

        <p><pstrip> </pstrip></p>
        """
        pass

    def testCommentParse(self):
        # Smoke test: the whole test corpus lexes without raising LexError.
        n = len(TEST_HTML)
        tokens = Lex(TEST_HTML)

    def testCommentParse2(self):

        Tok = html.Tok
        h = '''
        hi <!-- line 1
                line 2 --><br/>'''
        tokens = Lex(h)

        self.assertEqual(
            [
                (Tok.RawData, 12),
                (Tok.Comment, 50),  # the multi-line comment is ONE token
                (Tok.StartEndTag, 55),
                (Tok.EndOfStream, 55),
            ],
            tokens)

    def testProcessingInstruction(self):
        # <?xml ?> header
        Tok = html.Tok
        h = 'hi <? err ?>'
        tokens = Lex(h)

        self.assertEqual(
            [
                (Tok.RawData, 3),
                (Tok.Processing, 12),  # <? err ?>
                (Tok.EndOfStream, 12),
            ],
            tokens)

    def testScriptStyle(self):
        Tok = html.Tok
        h = '''
        hi <script src=""> if (x < 1 && y > 2 ) { console.log(""); }
        </script>
        '''
        tokens = Lex(h)

        self.assertEqual(
            [
                (Tok.RawData, 12),
                (Tok.StartTag, 27),  # <script src="">
                (Tok.HtmlCData, 78),  # JavaScript code is HTML CData
                (Tok.EndTag, 87),  # </script>
                (Tok.RawData, 96),  # trailing newline + indent
                (Tok.EndOfStream, 96),
            ],
            tokens)

    def testScriptStyleXml(self):
        Tok = html.Tok
        h = 'hi <script src=""> &lt; </script>'
        # XML mode: <script> gets no special CData treatment
        tokens = Lex(h, no_special_tags=True)

        self.assertEqual(
            [
                (Tok.RawData, 3),  # 'hi '
                (Tok.StartTag, 18),  # <script src="">
                (Tok.RawData, 19),  # space
                (Tok.CharEntity, 23),  # &lt; lexed as an entity, not CData
                (Tok.RawData, 24),  # space
                (Tok.EndTag, 33),  # </script>
                (Tok.EndOfStream, 33),
            ],
            tokens)

    def testCData(self):
        Tok = html.Tok

        # from
        # /home/andy/src/languages/Python-3.11.5/Lib/test/xmltestdata/c14n-20/inC14N4.xml
        h = '<compute><![CDATA[value>"0" && value<"10" ?"valid":"error"]]></compute>'
        tokens = Lex(h)

        self.assertEqual([
            (Tok.StartTag, 9),
            (Tok.CData, 61),  # the whole <![CDATA[...]]> span is one token
            (Tok.EndTag, 71),
            (Tok.EndOfStream, 71),
        ], tokens)

    def testEntity(self):
        Tok = html.Tok

        # from
        # /home/andy/src/Python-3.12.4/Lib/test/xmltestdata/c14n-20/inC14N5.xml
        h = '&ent1;, &ent2;!'

        tokens = Lex(h)

        self.assertEqual([
            (Tok.CharEntity, 6),
            (Tok.RawData, 8),  # ', '
            (Tok.CharEntity, 14),
            (Tok.RawData, 15),  # '!'
            (Tok.EndOfStream, 15),
        ], tokens)

    def testStartTag(self):
        Tok = html.Tok

        h = '<a>hi</a>'
        tokens = Lex(h)

        self.assertEqual([
            (Tok.StartTag, 3),
            (Tok.RawData, 5),
            (Tok.EndTag, 9),
            (Tok.EndOfStream, 9),
        ], tokens)

        # Make sure we don't consume too much
        h = '<a><source>1.7</source></a>'

        tokens = Lex(h)

        self.assertEqual([
            (Tok.StartTag, 3),
            (Tok.StartTag, 11),
            (Tok.RawData, 14),
            (Tok.EndTag, 23),
            (Tok.EndTag, 27),
            (Tok.EndOfStream, 27),
        ], tokens)

        # NOTE(review): dead code below -- the expected token list looks
        # stale (the end positions don't line up), so it is skipped.
        return

        h = '''
        <configuration>
          <source>1.7</source>
        </configuration>'''

        tokens = Lex(h)

        self.assertEqual([
            (Tok.RawData, 9),
            (Tok.StartTag, 24),
            (Tok.RawData, 9),
            (Tok.EndOfStream, 9),
        ], tokens)

    def testInvalid(self):
        Tok = html.Tok

        # Every entry in INVALID_LEX must raise LexError.
        for s in INVALID_LEX:
            try:
                tokens = html.ValidTokenList(s)
            except html.LexError as e:
                print(e)
            else:
                self.fail('Expected LexError %r' % s)
317
318
# Inputs that must raise LexError (lexer-level failures).
INVALID_LEX = [
    # Should be &amp;
    '<a>&',
    '&amp',  # not finished
    '&#',  # not finished
    # Hm > is allowed?
    #'a > b',
    'a < b',
    '<!-- unfinished comment',
    '<? unfinished processing',
    '</div bad=attr> <a> <b>',

    # not allowed, but 3 > 4 is allowed
    '<a> 3 < 4 </a>',
]

# Inputs that lex fine but must raise ParseError (unbalanced tags).
INVALID_PARSE = [
    '<a></b>',
    '<a>',  # missing closing tag
    '<meta></meta>',  # this is a self-closing tag
]

# Inputs that must validate cleanly in HTML5 mode.
VALID_PARSE = [
    '<!DOCTYPE html>\n',
    '<!DOCTYPE>',

    # empty strings
    '<p x=""></p>',
    "<p x=''></p>",

    # allowed, but 3 < 4 is not allowed
    '<a> 3 > 4 </a>',
    # allowed, but 3 > 4 is not allowed
    '<p x="3 < 4"></p>',
    '<b><a href="foo">link</a></b>',
    '<meta><a></a>',
    # no attribute
    '<button disabled></button>',
    '<button disabled=></button>',
    '<button disabled= ></button>',

    # single quoted is pretty common
    "<a href='single'></a>",

    # Conceding to reality - I used these myself
    '<a href=ble.sh></a>',
    '<a href=foo.html></a>',

    # TODO: capitalization should be allowed
    #'<META><a></a>',

    # TODO: Test <svg> and <math> ?
]

# Inputs valid only in XML mode (no special-cased HTML tags).
VALID_XML = [
    '<meta></meta>',
]

# Inputs whose TAG bodies must raise LexError.
INVALID_TAG_LEX = [
    # not allowed, but 3 < 4 is allowed
    '<p x="3 > 4"></p>',
    '<a foo=bar !></a>',  # bad attr

    # should be escaped
    #'<a href="&"></a>',
    #'<a href=">"></a>',
]
386
387
class ValidateTest(unittest.TestCase):
    """End-to-end tests for html.Validate() in HTML5 and XML modes."""

    def testInvalid(self):
        counters = html.Counters()

        # Lexer-level failures.
        for s in INVALID_LEX + INVALID_TAG_LEX:
            try:
                html.Validate(s, html.BALANCED_TAGS, counters)
            except html.LexError as e:
                print(e)
            else:
                self.fail('Expected LexError %r' % s)

        # Parser-level (tag-balancing) failures.
        for s in INVALID_PARSE:
            try:
                html.Validate(s, html.BALANCED_TAGS, counters)
            except html.ParseError as e:
                print(e)
            else:
                # Fix: include the offending input, consistent with the
                # LexError branch above.
                self.fail('Expected ParseError %r' % s)

    def testValid(self):
        counters = html.Counters()
        for s in VALID_PARSE:
            html.Validate(s, html.BALANCED_TAGS, counters)
            print('HTML5 %r' % s)
            print('HTML5 attrs %r' % counters.debug_attrs)

    def testValidXml(self):
        counters = html.Counters()
        for s in VALID_XML:
            html.Validate(s, html.BALANCED_TAGS | html.NO_SPECIAL_TAGS,
                          counters)
            print('XML %r' % s)
            print('XML attrs %r' % counters.debug_attrs)
422
423
if __name__ == '__main__':
    # Discover and run all TestCase classes defined above.
    unittest.main()