1 | #!/usr/bin/env python2
|
2 | from __future__ import print_function
|
3 |
|
4 | import unittest
|
5 |
|
6 | from data_lang import htm8
|
7 | from data_lang import htm8_util
|
8 | #from doctools.util import log
|
9 |
|
10 |
|
class LexerTest(unittest.TestCase):
    """Lexer checks driven by the INVALID_LEX and VALID_LEX tables."""

    def testInvalid(self):
        # type: () -> None
        # Each INVALID_LEX input must make ValidTokenList raise htm8.LexError.
        # The helper is imported inside the test, not at module scope.
        from data_lang.htm8_test import ValidTokenList

        for s in INVALID_LEX:
            try:
                # Only the exception matters; the token list is discarded.
                ValidTokenList(s)
            except htm8.LexError as e:
                print(e)
            else:
                self.fail('Expected LexError %r' % s)

    def testValid(self):
        # type: () -> None
        # Each VALID_LEX input must go through Lex without raising.  The
        # expected-XML half of each pair is not used by this test.
        from data_lang.htm8_test import Lex

        for s, _ in VALID_LEX:
            Lex(s)
            print()
|
32 |
|
33 |
|
# Inputs for which LexerTest.testInvalid and ValidateTest.testInvalid both
# expect htm8.LexError.
INVALID_LEX = [
    '< >',
    '<a><',
    '&<',
    # NOTE(review): byte-identical to the entry above, so the same input is
    # exercised twice -- confirm whether a different input was intended.
    '&<',
    # Hm > is allowed?
    #'a > b',
    'a < b',
    '<!-- unfinished comment',
    '<? unfinished processing',
    '</div bad=attr> <a> <b>',

    # not allowed, but 3 > 4 is allowed
    '<a> 3 < 4 </a>',
    # Not a CDATA tag
    '<STYLEz><</STYLEz>',
]
|
51 |
|
# Markers for the second ("expected XML") slot of VALID_LEX / VALID_PARSE
# entries.

# NOTE(review): presumably means "no XML expectation for this entry"; the
# tables in this file spell that as '' instead -- confirm which is intended.
SKIP = 0

# XmlTest compares the ToXml() output against the input itself.
UNCHANGED = 1
|
54 |
|
# (input, expected XML) pairs.  LexerTest.testValid lexes the inputs;
# XmlTest.testValid checks the ToXml() output, where UNCHANGED means "same as
# the input" and '' means "no expectation".
VALID_LEX = [
    # TODO: convert these to XML
    ('<foo></foo>', UNCHANGED),
    ('<foo x=y></foo>', '<foo x="y"></foo>'),
    #('<foo x="&"></foo>', '<foo x="&"></foo>'),
    ('<foo x="&"></foo>', ''),

    # Allowed with BadAmpersand
    ('<p> x & y </p>', '<p> x & y </p>'),

    # No ambiguity
    ('<img src=/ >', '<img src="/" >'),
    ('<img src="/">', UNCHANGED),
    ('<img src=foo/ >', '<img src="foo/" >'),
]
|
70 |
|
# Inputs for which ValidateTest.testInvalid expects htm8.ParseError.
INVALID_PARSE = [
    '<a></b>',
    '<a>',  # missing closing tag
    '<meta></meta>',  # this is a self-closing tag
]
|
76 |
|
# Attribute-value inputs for which ValidateTest.testInvalid expects
# htm8.LexError.
INVALID_ATTR_LEX = [
    # Ambiguous, should be ""
    '<img src=/>',
    '<img src= />',
    '<img src=foo/>',
    '<img src= foo/>',

    # Quoting
    '<img src=x"y">',
    "<img src=j''>",
]
|
88 |
|
# (input, expected XML) pairs.  ValidateTest.testValid validates the inputs;
# XmlTest.testValid checks the ToXml() output, where UNCHANGED means "same as
# the input" and '' means "no expectation".
VALID_PARSE = [
    ('<!DOCTYPE html>\n', ''),
    ('<!DOCTYPE>', ''),

    # empty strings
    ('<p x=""></p>', UNCHANGED),
    ("<p x=''></p>", UNCHANGED),
    ('<self-closing a="b" />', UNCHANGED),

    # We could also normalize CDATA?
    # Note that CDATA has an escaping problem: you need to handle ]]> with
    # concatenation.  It just "pushes the problem around".
    # So I think it's better to use ONE kind of escaping
    # NOTE(review): the original note ends with "which is <" -- presumably it
    # referred to the entity form of '<'; confirm.
    ('<script><![CDATA[ <wtf> >< ]]></script>', UNCHANGED),

    # allowed, but 3 < 4 is not allowed
    ('<a> 3 > 4 </a>', '<a> 3 > 4 </a>'),
    # allowed, but 3 > 4 is not allowed
    ('<p x="3 < 4"></p>', ''),
    ('<b><a href="foo">link</a></b>', UNCHANGED),

    # TODO: should be self-closing
    #('<meta><a></a>', '<meta/><a></a>'),
    ('<meta><a></a>', ''),

    # no attribute
    ('<button disabled></button>', '<button disabled=""></button>'),
    ('<button disabled=></button>', '<button disabled=""></button>'),
    ('<button disabled= ></button>', '<button disabled= ""></button>'),

    # single quoted is pretty common
    ("<a href='single'></a>", ''),

    # Conceding to reality - I used these myself
    ('<a href=ble.sh></a>', '<a href="ble.sh"></a>'),
    ('<a href=foo.html></a>', '<a href="foo.html"></a>'),
    ('<foo x="&"></foo>', ''),

    # caps
    ('<foo></FOO>', ''),
    ('<Foo></fOO>', ''),

    # capital VOID tag
    ('<META><a></a>', ''),
    ('<script><</script>', ''),
    # matching
    ('<SCRipt><</SCRipt>', ''),
    ('<SCRIPT><</SCRIPT>', ''),
    ('<STYLE><</STYLE>', ''),
    #'<SCRipt><</script>',

    # Regression test from blog
    ('<script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>',
     ''),

    # Note: Python HTMLParser.py does DYNAMIC compilation of regex with re.I
    # flag to handle this!  Gah I want something faster.
    #'<script><</SCRIPT>',

    # TODO: Test <svg> and <math> ?
]
|
150 |
|
# Inputs that ValidateTest.testValidXml validates with
# BALANCED_TAGS | NO_SPECIAL_TAGS.
VALID_XML = [
    '<meta></meta>',
]
|
154 |
|
# Tag-level inputs for which ValidateTest.testInvalid expects htm8.LexError.
INVALID_TAG_LEX = [
    # bad attr
    '<a foo=bar !></a>',

    # BUG: are we "overshooting" here?  We don't have a sentinel
    # I wonder if a one-pass lex is just simpler:
    # - It works with micro-syntax
    # - And it doesn't have this problem, as well as the stupid / problem
    # - You can add a sentinel, but then you mess up COW of forked processes,
    #   potentially
    # - As long as you don't allocate, I think it's not going to be any faster
    #   to skip the attributes
    # - We could also handle <a href=">"> then

    # Not allowed, but 3 < 4 is allowed
    '<p x="3 > 4"></p>',
    # with single quotes
    "<p x='3 > 4'></p>",
    # Same thing
    '<a href=">"></a>',
]
|
176 |
|
177 |
|
class ValidateTest(unittest.TestCase):
    """Runs htm8_util.Validate over the valid and invalid input tables."""

    def testInvalid(self):
        # type: () -> None
        # One Counters instance is shared by every Validate call in this test.
        counters = htm8_util.Counters()

        # Every entry of the three *_LEX tables must raise LexError.
        for s in INVALID_LEX + INVALID_TAG_LEX + INVALID_ATTR_LEX:
            try:
                htm8_util.Validate(s, htm8_util.BALANCED_TAGS, counters)
            except htm8.LexError as e:
                print(e)
            else:
                self.fail('Expected LexError %r' % s)

        # Every INVALID_PARSE entry must raise ParseError.
        for s in INVALID_PARSE:
            try:
                htm8_util.Validate(s, htm8_util.BALANCED_TAGS, counters)
            except htm8.ParseError as e:
                print(e)
            else:
                # Name the offending input, as the LexError branch does, so a
                # failure points at a specific table entry.
                self.fail('Expected ParseError %r' % s)

    def testValid(self):
        # type: () -> None
        # Each VALID_PARSE input must validate without raising.  The
        # expected-XML half of each pair is not used by this test.
        counters = htm8_util.Counters()
        for s, _ in VALID_PARSE:
            print('HTML5 %r' % s)
            htm8_util.Validate(s, htm8_util.BALANCED_TAGS, counters)
        #print('HTML5 attrs %r' % counters.debug_attrs)

    def testValidXml(self):
        # type: () -> None
        # Each VALID_XML input must validate with NO_SPECIAL_TAGS added to
        # the flags.
        counters = htm8_util.Counters()
        for s in VALID_XML:
            print('XML %r' % s)
            htm8_util.Validate(
                s, htm8_util.BALANCED_TAGS | htm8_util.NO_SPECIAL_TAGS,
                counters)
        #print('XML attrs %r' % counters.debug_attrs)
|
216 |
|
217 |
|
class XmlTest(unittest.TestCase):
    """Checks htm8_util.ToXml against the expectations in the VALID_* tables."""

    def testValid(self):
        # type: () -> None
        # The second element of each table entry selects the check:
        #   UNCHANGED  -> output must equal the input
        #   SKIP or '' -> no expectation
        #   other      -> output must equal that string
        for h, expected_xml in VALID_LEX + VALID_PARSE:
            # Converted even for skipped entries, so ToXml must still not
            # raise on them.
            actual = htm8_util.ToXml(h)

            if expected_xml == UNCHANGED:
                self.assertEqual(h, actual)
            elif expected_xml == SKIP or expected_xml == '':
                # The tables spell "skip" as ''; the SKIP sentinel is
                # accepted too so that it is usable.
                continue
            else:
                self.assertEqual(expected_xml, actual)
|
231 |
|
232 |
|
# Run every TestCase in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()
|