OILS / lazylex / html_test.py View on Github | oils.pub

324 lines, 193 significant
1#!/usr/bin/env python2
2from __future__ import print_function
3
4import unittest
5
6from lazylex import html # module under test log = html.log
7from doctools.util import log
8
9
class FunctionsTest(unittest.TestCase):
    """Checks for the free functions exposed by lazylex.html."""

    def testToText(self):
        # type: () -> None
        """ToText() is expected to drop the <b> tag, decode &lt;, keep '&&'."""
        markup = '<b name="&amp;"> three &lt; four && five </b>'
        expected = ' three < four && five '
        self.assertEqual(expected, html.ToText(markup))
16
17
def _MakeTagLexer(s):
    # type: (str) -> html.TagLexer
    """Build an html.TagLexer over `s` and Reset() it to span all of `s`."""
    tag_lexer = html.TagLexer(s)
    # Reset(start, end): cover the string from offset 0 through len(s).
    tag_lexer.Reset(0, len(s))
    return tag_lexer
23
24
def _PrintTokens(lex):
    # type: (html.TagLexer) -> None
    """Log the tag name of `lex`, then one line per token it yields.

    Each token line shows the token id and the slice of the lexer's source
    string (lex.s) between the token's start and end offsets.
    """
    log('')  # blank separator line
    log('tag = %r', lex.GetTagName())
    for kind, begin, stop in lex.Tokens():
        covered = lex.s[begin:stop]
        log('%s %r', kind, covered)
31
32
class TagLexerTest(unittest.TestCase):
    """Exercises html.TagLexer (via _MakeTagLexer) on individual tags."""

    def testTagLexer(self):
        # type: () -> None
        """Dump tokens for several tags and check GetAttrRaw() lookups."""
        # Invalid!
        #lex = _MakeTagLexer('< >')
        #print(lex.Tag())

        bare = _MakeTagLexer('<a>')
        _PrintTokens(bare)

        no_value = _MakeTagLexer('<a novalue>')
        _PrintTokens(no_value)

        # Note: we could have a different HasAttr() method
        # <a novalue> means lex.Get('novalue') == ''
        # https://developer.mozilla.org/en-US/docs/Web/API/Element/hasAttribute
        self.assertEqual('', no_value.GetAttrRaw('novalue'))

        quoted = _MakeTagLexer('<a href="double quoted">')
        _PrintTokens(quoted)

        self.assertEqual('double quoted', quoted.GetAttrRaw('href'))
        # An attribute that does not appear in the tag is expected to be None.
        self.assertEqual(None, quoted.GetAttrRaw('oops'))

        # Unquoted value, without and with a trailing slash: printed only.
        for tag in ('<a href=foo class="bar">', '<a href=foo class="bar" />'):
            _PrintTokens(_MakeTagLexer(tag))

        # The raw accessor is expected to return the entity undecoded.
        with_entity = _MakeTagLexer('<a href="?foo=1&amp;bar=2" />')
        self.assertEqual('?foo=1&amp;bar=2', with_entity.GetAttrRaw('href'))

    def testTagName(self):
        # type: () -> None
        """GetTagName() is expected to return the element name, here 'a'."""
        tag_lex = _MakeTagLexer('<a href=foo class="bar" />')
        self.assertEqual('a', tag_lex.GetTagName())

    def testAllAttrs(self):
        # type: () -> None
        """
        AllAttrsRaw() is expected to give [('key', 'value')] for all
        attributes, in the order they are written in the tag.
        """
        # closed
        self_closing = _MakeTagLexer('<a href=foo class="bar" />')
        expected = [('href', 'foo'), ('class', 'bar')]
        self.assertEqual(expected, self_closing.AllAttrsRaw())

        with_entity = _MakeTagLexer('<a href="?foo=1&amp;bar=2" />')
        self.assertEqual([('href', '?foo=1&amp;bar=2')],
                         with_entity.AllAttrsRaw())

    def testEmptyMissingValues(self):
        # type: () -> None
        """Valueless and empty-valued attributes are expected to map to ''.

        In the second tag, 'empty= value' is expected to bind 'value' to
        'empty' rather than produce an empty string.
        """
        # equivalent to <button disabled="">
        button = _MakeTagLexer('<button disabled>')
        self.assertEqual([('disabled', '')], button.AllAttrsRaw())

        # Logged only; the slice form is not asserted on.
        log('slices %s', button.AllAttrsRawSlice())

        para = _MakeTagLexer(
            '''<p double="" single='' empty= value missing empty2=>''')
        attr_pairs = para.AllAttrsRaw()
        expected = [
            ('double', ''),
            ('single', ''),
            ('empty', 'value'),
            ('missing', ''),
            ('empty2', ''),
        ]
        self.assertEqual(expected, attr_pairs)
        # TODO: should have  (note left unfinished in the original)
        log('all %s', attr_pairs)

        log('slices %s', para.AllAttrsRawSlice())

    def testInvalidTag(self):
        # type: () -> None
        """A tag containing a stray '!' is expected to raise html.LexError."""
        try:
            bad = _MakeTagLexer('<a foo=bar !></a>')
            bad.AllAttrsRaw()
        except html.LexError as err:
            print(err)
        else:
            self.fail('Expected LexError')
120
121
def _MakeAttrValueLexer(s):
    # type: (str) -> html.AttrValueLexer
    """Build an html.AttrValueLexer over `s`, Reset() to span all of `s`."""
    value_lexer = html.AttrValueLexer(s)
    # Reset(start, end): cover the string from offset 0 through len(s).
    value_lexer.Reset(0, len(s))
    return value_lexer
127
128
class AttrValueLexerTest(unittest.TestCase):
    """Exercises html.AttrValueLexer (via _MakeAttrValueLexer)."""

    def testGood(self):
        # type: () -> None
        """A value with one '&amp;' entity is expected to give 3 tokens."""
        value_lex = _MakeAttrValueLexer('?foo=42&amp;bar=99')
        self.assertEqual(3, value_lex.NumTokens())
136
137
class LexerTest(unittest.TestCase):
    """Feeds the INVALID_LEX / VALID_LEX tables to the htm8 test helpers."""

    def testInvalid(self):
        # type: () -> None
        """Each INVALID_LEX input must make ValidTokenList() raise LexError."""
        # Imported inside the test, as in the original.
        from data_lang.htm8_test import ValidTokenList
        for bad_input in INVALID_LEX:
            try:
                ValidTokenList(bad_input)
            except html.LexError as err:
                print(err)
            else:
                self.fail('Expected LexError %r' % bad_input)

    def testValid(self):
        # type: () -> None
        """Lex() is run on the input half of each VALID_LEX pair.

        Nothing is asserted; the test passes as long as Lex() does not raise.
        """
        from data_lang.htm8_test import Lex

        for doc, _expected_xml in VALID_LEX:
            Lex(doc)
            print()
159
160
# Inputs the tests in this file expect to be rejected with html.LexError.
INVALID_LEX = [
    '<a><',
    '&amp<',
    '&<',
    # Hm > is allowed?
    #'a > b',
    'a < b',
    '<!-- unfinished comment',
    '<? unfinished processing',
    '</div bad=attr> <a> <b>',

    # not allowed, but 3 > 4 is allowed
    '<a> 3 < 4 </a>',
    # Not a CDATA tag
    '<STYLEz><</STYLEz>',
]
177
# Sentinels for the second element of the (input, expected XML) pairs.
# UNCHANGED means the XML output is expected to equal the input.
# SKIP is declared here, but the rows below mark "no expectation" with ''.
SKIP = 0
UNCHANGED = 1

# (input, expected XML) pairs for inputs that must lex without error.
VALID_LEX = [
    # TODO: convert these to XML
    ('<foo></foo>', UNCHANGED),
    ('<foo x=y></foo>', ''),
    #('<foo x="&"></foo>', '<foo x="&amp;"></foo>'),
    ('<foo x="&"></foo>', ''),

    # Allowed with BadAmpersand
    ('<p> x & y </p>', '<p> x &amp; y </p>'),
]
191
# Inputs for which validation with balanced-tag checking is expected to raise
# html.ParseError.
INVALID_PARSE = [
    '<a></b>',
    '<a>',  # missing closing tag
    '<meta></meta>',  # this is a self-closing tag
]
197
# (input, expected XML) pairs for inputs that must validate with balanced
# tags.  The second element is UNCHANGED (output equals input), '' (output is
# not compared), or the literal expected XML.
VALID_PARSE = [
    ('<!DOCTYPE html>\n', ''),
    ('<!DOCTYPE>', ''),

    # empty strings
    ('<p x=""></p>', UNCHANGED),
    ("<p x=''></p>", UNCHANGED),
    ('<self-closing a="b" />', UNCHANGED),

    # We could also normalize CDATA?
    # Note that CDATA has an escaping problem: you need to handle it ]]> with
    # concatenation.  It just "pushes the problem around".
    # So I think it's better to use ONE kind of escaping, which is &lt;
    ('<script><![CDATA[ <wtf> >< ]]></script>', UNCHANGED),

    # allowed, but 3 < 4 is not allowed
    ('<a> 3 > 4 </a>', '<a> 3 &gt; 4 </a>'),
    # allowed, but 3 > 4 is not allowed
    ('<p x="3 < 4"></p>', ''),
    ('<b><a href="foo">link</a></b>', UNCHANGED),

    # TODO: should be self-closing
    #('<meta><a></a>', '<meta/><a></a>'),
    ('<meta><a></a>', ''),

    # no attribute
    ('<button disabled></button>', ''),
    ('<button disabled=></button>', ''),
    ('<button disabled= ></button>', ''),

    # single quoted is pretty common
    ("<a href='single'></a>", ''),

    # Conceding to reality - I used these myself
    ('<a href=ble.sh></a>', ''),
    ('<a href=foo.html></a>', ''),
    ('<foo x="&"></foo>', ''),

    # caps
    ('<foo></FOO>', ''),
    ('<Foo></fOO>', ''),

    # capital VOID tag
    ('<META><a></a>', ''),
    ('<script><</script>', ''),
    # matching
    ('<SCRipt><</SCRipt>', ''),
    ('<SCRIPT><</SCRIPT>', ''),
    ('<STYLE><</STYLE>', ''),
    #'<SCRipt><</script>',

    # Note: Python HTMLParser.py does DYNAMIC compilation of regex with re.I
    # flag to handle this!  Gah I want something faster.
    #'<script><</SCRIPT>',

    # TODO: Test <svg> and <math> ?
]
255
# Inputs validated with BALANCED_TAGS | NO_SPECIAL_TAGS.  '<meta></meta>' is
# also listed in INVALID_PARSE, where it is validated with BALANCED_TAGS alone.
VALID_XML = [
    '<meta></meta>',
]
259
# Inputs expected to raise html.LexError during validation; each has the
# offending text inside a start tag.
INVALID_TAG_LEX = [
    # not allowed, but 3 < 4 is allowed
    '<p x="3 > 4"></p>',
    # same thing
    '<a href=">"></a>',
    '<a foo=bar !></a>',  # bad attr
]
267
268
class ValidateTest(unittest.TestCase):
    """Runs html.Validate() over the valid / invalid input tables."""

    def testInvalid(self):
        # type: () -> None
        """Bad inputs must raise: LexError for the lex tables, ParseError
        for INVALID_PARSE.  All calls share one html.Counters() instance.
        """
        counters = html.Counters()
        for s in INVALID_LEX + INVALID_TAG_LEX:
            try:
                html.Validate(s, html.BALANCED_TAGS, counters)
            except html.LexError as e:
                print(e)
            else:
                self.fail('Expected LexError %r' % s)

        for s in INVALID_PARSE:
            try:
                html.Validate(s, html.BALANCED_TAGS, counters)
            except html.ParseError as e:
                print(e)
            else:
                # Name the offending input, as the LexError loop does, so a
                # failure identifies which table row was wrongly accepted.
                self.fail('Expected ParseError %r' % s)

    def testValid(self):
        # type: () -> None
        """Every VALID_PARSE input must validate with BALANCED_TAGS.

        Only the input half of each pair is used; the expected-XML half is
        ignored here.
        """
        counters = html.Counters()
        for s, _ in VALID_PARSE:
            html.Validate(s, html.BALANCED_TAGS, counters)
            print('HTML5 %r' % s)
            #print('HTML5 attrs %r' % counters.debug_attrs)

    def testValidXml(self):
        # type: () -> None
        """Every VALID_XML input must validate with BALANCED_TAGS combined
        with NO_SPECIAL_TAGS.
        """
        counters = html.Counters()
        for s in VALID_XML:
            html.Validate(s, html.BALANCED_TAGS | html.NO_SPECIAL_TAGS,
                          counters)
            print('XML %r' % s)
            #print('XML attrs %r' % counters.debug_attrs)
306
307
class XmlTest(unittest.TestCase):
    """Checks html.ToXml() against the expectations in the VALID_* tables."""

    def testValid(self):
        # type: () -> None
        """Convert every VALID_LEX / VALID_PARSE input with html.ToXml().

        The second element of each pair selects the check:
          UNCHANGED      -- output must equal the input
          '' or SKIP     -- conversion only has to complete; output is not
                            compared
          anything else  -- output must equal that string
        """
        for h, expected_xml in VALID_LEX + VALID_PARSE:
            # Converted even for skipped rows, so ToXml() must not raise.
            actual = html.ToXml(h)
            if expected_xml == UNCHANGED:  # Unchanged
                self.assertEqual(h, actual)
            elif expected_xml == '' or expected_xml == SKIP:  # Skip
                # Honour the declared SKIP sentinel as well as the '' marker
                # the tables currently use.
                pass
            else:
                self.assertEqual(expected_xml, actual)
321
322
# Run every TestCase in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()