# OILS / data_lang / htm8_util_test.py -- View on Github | oils.pub
#
# 234 lines, 134 significant
#!/usr/bin/env python2
from __future__ import print_function

import unittest

from data_lang import htm8
from data_lang import htm8_util
#from doctools.util import log
9
10
11class LexerTest(unittest.TestCase):
12
13 def testInvalid(self):
14 # type: () -> None
15 from data_lang.htm8_test import ValidTokenList
16 for s in INVALID_LEX:
17 try:
18 tokens = ValidTokenList(s)
19 except htm8.LexError as e:
20 print(e)
21 else:
22 self.fail('Expected LexError %r' % s)
23
24 def testValid(self):
25 # type: () -> None
26
27 from data_lang.htm8_test import Lex
28
29 for s, _ in VALID_LEX:
30 tokens = Lex(s)
31 print()
32
33
# Inputs for which the tests in this file expect htm8.LexError.
INVALID_LEX = [
    '< >',
    '<a><',
    '&amp<',
    '&<',
    # Hm > is allowed?
    #'a > b',
    'a < b',
    '<!-- unfinished comment',
    '<? unfinished processing',
    '</div bad=attr> <a> <b>',

    # not allowed, but 3 > 4 is allowed
    '<a> 3 < 4 </a>',
    # Not a CDATA tag
    '<STYLEz><</STYLEz>',
]
51
# Markers for the second element of the (input, expected XML) pairs in the
# tables of this file.
SKIP = 0  # not used by the tables in this file, which write '' instead
UNCHANGED = 1  # the expected XML is the input itself

# (input, expected XML) pairs.  An expected value of '' means the output is
# not compared.
VALID_LEX = [
    # TODO: convert these to XML
    ('<foo></foo>', UNCHANGED),
    ('<foo x=y></foo>', '<foo x="y"></foo>'),
    #('<foo x="&"></foo>', '<foo x="&amp;"></foo>'),
    ('<foo x="&"></foo>', ''),

    # Allowed with BadAmpersand
    ('<p> x & y </p>', '<p> x &amp; y </p>'),

    # No ambiguity
    ('<img src=/ >', '<img src="/" >'),
    ('<img src="/">', UNCHANGED),
    ('<img src=foo/ >', '<img src="foo/" >'),
]
70
# Inputs for which ValidateTest expects htm8.ParseError.
INVALID_PARSE = [
    '<a></b>',
    '<a>',  # missing closing tag
    '<meta></meta>',  # this is a self-closing tag
]
76
# Attribute-value inputs for which ValidateTest expects htm8.LexError.
INVALID_ATTR_LEX = [
    # Ambiguous, should be ""
    '<img src=/>',
    '<img src= />',
    '<img src=foo/>',
    '<img src= foo/>',

    # Quoting
    '<img src=x"y">',
    "<img src=j''>",
]
88
# (input, expected XML) pairs.  The second element is UNCHANGED when the
# expected XML is the input itself, and '' when the output is not compared.
VALID_PARSE = [
    ('<!DOCTYPE html>\n', ''),
    ('<!DOCTYPE>', ''),

    # empty strings
    ('<p x=""></p>', UNCHANGED),
    ("<p x=''></p>", UNCHANGED),
    ('<self-closing a="b" />', UNCHANGED),

    # We could also normalize CDATA?
    # Note that CDATA has an escaping problem: you need to handle ]]> with
    # concatenation.  It just "pushes the problem around".
    # So I think it's better to use ONE kind of escaping, which is &lt;
    ('<script><![CDATA[ <wtf> >< ]]></script>', UNCHANGED),

    # allowed, but 3 < 4 is not allowed
    ('<a> 3 > 4 </a>', '<a> 3 &gt; 4 </a>'),
    # allowed, but 3 > 4 is not allowed
    ('<p x="3 < 4"></p>', ''),
    ('<b><a href="foo">link</a></b>', UNCHANGED),

    # TODO: should be self-closing
    #('<meta><a></a>', '<meta/><a></a>'),
    ('<meta><a></a>', ''),

    # no attribute value
    ('<button disabled></button>', '<button disabled=""></button>'),
    ('<button disabled=></button>', '<button disabled=""></button>'),
    ('<button disabled= ></button>', '<button disabled= ""></button>'),

    # single quoted is pretty common
    ("<a href='single'></a>", ''),

    # Conceding to reality - I used these myself
    ('<a href=ble.sh></a>', '<a href="ble.sh"></a>'),
    ('<a href=foo.html></a>', '<a href="foo.html"></a>'),
    ('<foo x="&"></foo>', ''),

    # caps
    ('<foo></FOO>', ''),
    ('<Foo></fOO>', ''),

    # capital VOID tag
    ('<META><a></a>', ''),
    ('<script><</script>', ''),
    # matching
    ('<SCRipt><</SCRipt>', ''),
    ('<SCRIPT><</SCRIPT>', ''),
    ('<STYLE><</STYLE>', ''),
    #'<SCRipt><</script>',

    # Regression test from blog
    ('<script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>',
     ''),

    # Note: Python HTMLParser.py does DYNAMIC compilation of regex with re.I
    # flag to handle this!  Gah I want something faster.
    #'<script><</SCRIPT>',

    # TODO: Test <svg> and <math> ?
]
150
# Inputs that ValidateTest.testValidXml validates with
# BALANCED_TAGS | NO_SPECIAL_TAGS.
VALID_XML = [
    '<meta></meta>',
]
154
# Tag-level inputs for which ValidateTest expects htm8.LexError.
INVALID_TAG_LEX = [
    # bad attr
    '<a foo=bar !></a>',

    # BUG: are we "overshooting" here?  We don't have a sentinel
    # I wonder if a one-pass lex is just simpler:
    # - It works with micro-syntax
    # - And it doesn't have this problem, as well as the stupid / problem
    # - You can add a sentinel, but then you mess up COW of forked processes,
    #   potentially
    # - As long as you don't allocate, I think it's not going to be any faster
    #   to skip the attributes
    # - We could also handle <a href=">"> then

    # Not allowed, but 3 < 4 is allowed
    '<p x="3 > 4"></p>',
    # with single quotes
    "<p x='3 > 4'></p>",
    # Same thing
    '<a href=">"></a>',
]
176
177
178class ValidateTest(unittest.TestCase):
179
180 def testInvalid(self):
181 # type: () -> None
182 counters = htm8_util.Counters()
183 for s in INVALID_LEX + INVALID_TAG_LEX + INVALID_ATTR_LEX:
184 try:
185 htm8_util.Validate(s, htm8_util.BALANCED_TAGS, counters)
186 except htm8.LexError as e:
187 print(e)
188 else:
189 self.fail('Expected LexError %r' % s)
190
191 for s in INVALID_PARSE:
192 try:
193 htm8_util.Validate(s, htm8_util.BALANCED_TAGS, counters)
194 except htm8.ParseError as e:
195 print(e)
196 else:
197 self.fail('Expected ParseError')
198
199 def testValid(self):
200 # type: () -> None
201 counters = htm8_util.Counters()
202 for s, _ in VALID_PARSE:
203 print('HTML5 %r' % s)
204 htm8_util.Validate(s, htm8_util.BALANCED_TAGS, counters)
205 #print('HTML5 attrs %r' % counters.debug_attrs)
206
207 def testValidXml(self):
208 # type: () -> None
209 counters = htm8_util.Counters()
210 for s in VALID_XML:
211 print('XML %r' % s)
212 htm8_util.Validate(
213 s, htm8_util.BALANCED_TAGS | htm8_util.NO_SPECIAL_TAGS,
214 counters)
215 #print('XML attrs %r' % counters.debug_attrs)
216
217
218class XmlTest(unittest.TestCase):
219
220 def testValid(self):
221 # type: () -> None
222 counters = htm8_util.Counters()
223 for h, expected_xml in VALID_LEX + VALID_PARSE:
224 actual = htm8_util.ToXml(h)
225 if expected_xml == UNCHANGED: # Unchanged
226 self.assertEqual(h, actual)
227 elif expected_xml == '': # Skip
228 pass
229 else:
230 self.assertEqual(expected_xml, actual)
231
232
# Run every TestCase in this module when the file is executed directly.
if __name__ == '__main__':
    unittest.main()