OILS / data_lang / htm8_util_test.py View on Github | oils.pub

220 lines, 133 significant
1#!/usr/bin/env python2
2from __future__ import print_function
3
4import unittest
5
6from data_lang import htm8
7from data_lang import htm8_util
8#from doctools.util import log
9
10
class LexerTest(unittest.TestCase):
    """Runs the lexer helpers from htm8_test over INVALID_LEX and VALID_LEX."""

    def testInvalid(self):
        # type: () -> None
        """Each INVALID_LEX input must make ValidTokenList raise LexError."""
        # Function-scope import, kept exactly where the original had it.
        from data_lang.htm8_test import ValidTokenList

        for s in INVALID_LEX:
            try:
                # Only the raised error matters here, so the returned value
                # is not bound to a name.
                ValidTokenList(s)
            except htm8.LexError as e:
                print(e)
            else:
                self.fail('Expected LexError %r' % s)

    def testValid(self):
        # type: () -> None
        """Each VALID_LEX input must go through Lex() without raising."""
        # Function-scope import, kept exactly where the original had it.
        from data_lang.htm8_test import Lex

        # The second column (expected XML) is not used by this test.
        for s, _ in VALID_LEX:
            Lex(s)
            # Blank line after each input.
            print()
32
33
# Inputs expected to raise htm8.LexError.  LexerTest.testInvalid feeds them to
# ValidTokenList, and ValidateTest.testInvalid feeds them to
# htm8_util.Validate.
INVALID_LEX = [
    '< >',
    '<a><',
    '&amp<',
    '&<',

    # A bare '>' in text appears to be accepted, so this entry stays disabled.
    #'a > b',
    'a < b',

    # Constructs that are opened but never terminated.
    '<!-- unfinished comment',
    '<? unfinished processing',

    '</div bad=attr> <a> <b>',

    # '<' in text is rejected, whereas '3 > 4' is accepted.
    '<a> 3 < 4 </a>',

    # STYLEz is not a CDATA tag.
    '<STYLEz><</STYLEz>',
]
51
# Sentinels for the second ("expected XML") column of the VALID_* tables.
# Note that the tables spell "skip" as the empty string '' rather than SKIP.
SKIP = 0
# XmlTest expects the ToXml() output to equal the input for these entries.
UNCHANGED = 1

# (input, expected XML) pairs.  LexerTest.testValid lexes the first column;
# XmlTest.testValid converts it with htm8_util.ToXml() and compares against
# the second column unless that column is ''.
VALID_LEX = [
    # TODO: convert these to XML
    ('<foo></foo>', UNCHANGED),
    ('<foo x=y></foo>', ''),

    # Disabled variant of the next entry, with an escaped expected value:
    #('<foo x="&"></foo>', '<foo x="&amp;"></foo>'),
    ('<foo x="&"></foo>', ''),

    # Allowed with BadAmpersand
    ('<p> x & y </p>', '<p> x &amp; y </p>'),

    # No ambiguity
    ('<img src=/ >', ''),
    ('<img src="/">', ''),
    ('<img src=foo/ >', ''),
]
70
# Inputs for which ValidateTest.testInvalid expects htm8.ParseError from
# htm8_util.Validate(..., BALANCED_TAGS, ...).
INVALID_PARSE = [
    # Closing tag name differs from the opening tag name.
    '<a></b>',
    # Missing closing tag.
    '<a>',
    # This is a self-closing tag.
    '<meta></meta>',
]
76
# Attribute-level inputs for which ValidateTest.testInvalid expects
# htm8.LexError.
INVALID_ATTR_LEX = [
    # Ambiguous: the '/' sits directly before '>' after an unquoted value.
    # NOTE(review): the original note read 'should be ""' - presumably the
    # value should be written with quotes; confirm.
    '<img src=/>',
    '<img src= />',
    '<img src=foo/>',
    '<img src= foo/>',

    # Quote characters inside an unquoted value.
    '<img src=x"y">',
    "<img src=j''>",
]
88
# (input, expected XML) pairs.  ValidateTest.testValid passes the first column
# to htm8_util.Validate() with BALANCED_TAGS; XmlTest.testValid converts it
# with htm8_util.ToXml() and compares against the second column unless that
# column is ''.
VALID_PARSE = [
    ('<!DOCTYPE html>\n', ''),
    ('<!DOCTYPE>', ''),

    # Empty attribute values, and a self-closing tag.
    ('<p x=""></p>', UNCHANGED),
    ("<p x=''></p>", UNCHANGED),
    ('<self-closing a="b" />', UNCHANGED),

    # We could also normalize CDATA?
    # CDATA has an escaping problem of its own: a literal ]]> has to be
    # handled with concatenation, which just "pushes the problem around".
    # So it seems better to use ONE kind of escaping, which is &lt;
    ('<script><![CDATA[ <wtf> >< ]]></script>', UNCHANGED),

    # '>' in text is allowed, but 3 < 4 is not allowed.
    ('<a> 3 > 4 </a>', '<a> 3 &gt; 4 </a>'),
    # '<' inside a quoted attribute is allowed, but 3 > 4 is not allowed.
    ('<p x="3 < 4"></p>', ''),
    ('<b><a href="foo">link</a></b>', UNCHANGED),

    # TODO: should be self-closing
    #('<meta><a></a>', '<meta/><a></a>'),
    ('<meta><a></a>', ''),

    # Attribute with no value, or with '=' and nothing after it.
    ('<button disabled></button>', ''),
    ('<button disabled=></button>', ''),
    ('<button disabled= ></button>', ''),

    # Single-quoted values are pretty common.
    ("<a href='single'></a>", ''),

    # Conceding to reality - I used these myself
    ('<a href=ble.sh></a>', ''),
    ('<a href=foo.html></a>', ''),
    ('<foo x="&"></foo>', ''),

    # Opening and closing tags that differ only in letter case.
    ('<foo></FOO>', ''),
    ('<Foo></fOO>', ''),

    # Capital VOID tag.
    ('<META><a></a>', ''),
    ('<script><</script>', ''),
    # Opening and closing tag spelled with the same letter case.
    ('<SCRipt><</SCRipt>', ''),
    ('<SCRIPT><</SCRIPT>', ''),
    ('<STYLE><</STYLE>', ''),
    # Disabled.  It is a bare string, so it would have to become an
    # (input, expected) pair before being re-enabled.
    #'<SCRipt><</script>',

    # Regression test from blog
    ('<script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>',
     ''),

    # Note: Python HTMLParser.py does DYNAMIC compilation of regex with re.I
    # flag to handle this!  Gah I want something faster.
    #'<script><</SCRIPT>',

    # TODO: Test <svg> and <math> ?
]
150
# Inputs that ValidateTest.testValidXml validates with
# BALANCED_TAGS | NO_SPECIAL_TAGS.  The same string is listed in
# INVALID_PARSE for the plain BALANCED_TAGS case.
VALID_XML = [
    '<meta></meta>',
]
154
# Tag-level inputs for which ValidateTest.testInvalid expects htm8.LexError.
INVALID_TAG_LEX = [
    # '>' inside a quoted attribute is not allowed, but 3 < 4 is allowed.
    '<p x="3 > 4"></p>',
    # Same thing: a '>' inside the quoted value.
    '<a href=">"></a>',
    # Bad attr.
    '<a foo=bar !></a>',
]
162
163
class ValidateTest(unittest.TestCase):
    """Exercises htm8_util.Validate() with the valid and invalid tables."""

    def testInvalid(self):
        # type: () -> None
        """Invalid inputs must raise LexError or ParseError, per table."""
        counters = htm8_util.Counters()

        # All three *_LEX tables are expected to fail with LexError.
        for s in INVALID_LEX + INVALID_TAG_LEX + INVALID_ATTR_LEX:
            try:
                htm8_util.Validate(s, htm8_util.BALANCED_TAGS, counters)
            except htm8.LexError as e:
                print(e)
            else:
                self.fail('Expected LexError %r' % s)

        for s in INVALID_PARSE:
            try:
                htm8_util.Validate(s, htm8_util.BALANCED_TAGS, counters)
            except htm8.ParseError as e:
                print(e)
            else:
                # Name the offending input, as the LexError loop does, so a
                # failure can be traced to a table entry.
                self.fail('Expected ParseError %r' % s)

    def testValid(self):
        # type: () -> None
        """Each VALID_PARSE input must validate without raising."""
        counters = htm8_util.Counters()
        # The second column (expected XML) is not used by this test.
        for s, _ in VALID_PARSE:
            print('HTML5 %r' % s)
            htm8_util.Validate(s, htm8_util.BALANCED_TAGS, counters)
            #print('HTML5 attrs %r' % counters.debug_attrs)

    def testValidXml(self):
        # type: () -> None
        """Each VALID_XML input must validate with NO_SPECIAL_TAGS added."""
        counters = htm8_util.Counters()
        for s in VALID_XML:
            print('XML %r' % s)
            htm8_util.Validate(
                s, htm8_util.BALANCED_TAGS | htm8_util.NO_SPECIAL_TAGS,
                counters)
            #print('XML attrs %r' % counters.debug_attrs)
202
203
class XmlTest(unittest.TestCase):
    """Compares htm8_util.ToXml() output with the tables' expected column."""

    def testValid(self):
        # type: () -> None
        """Convert every VALID_LEX and VALID_PARSE input and check the result.

        The second column of each pair selects the check:
          UNCHANGED  - the output must equal the input
          SKIP or '' - the output is not compared
          other      - the output must equal that string
        """
        for h, expected_xml in VALID_LEX + VALID_PARSE:
            # ToXml() runs even for skipped entries, so it still has to
            # complete without raising on them.
            actual = htm8_util.ToXml(h)
            if expected_xml == UNCHANGED:
                self.assertEqual(h, actual)
            elif expected_xml in (SKIP, ''):
                # The tables currently mark skipped entries with ''; the
                # SKIP sentinel is accepted as well.
                continue
            else:
                self.assertEqual(expected_xml, actual)
218
if __name__ == '__main__':
    # Executed as a script: hand control to the unittest runner.
    unittest.main()