data_lang/htm8_util

OILS / data_lang / htm8_util_test.py View on GitHub | oils.pub

220 lines, 133 significant

1	#!/usr/bin/env python2
2	from __future__ import print_function
3
4	import unittest
5
6	from data_lang import htm8
7	from data_lang import htm8_util
8	#from doctools.util import log
9
10
class LexerTest(unittest.TestCase):
    """Runs the lexing helpers from htm8_test over INVALID_LEX and VALID_LEX."""

    def testInvalid(self):
        # type: () -> None
        """Each string in INVALID_LEX must make ValidTokenList() raise LexError."""
        # Imported inside the test, not at module level, as the file already did.
        from data_lang.htm8_test import ValidTokenList

        for s in INVALID_LEX:
            try:
                # Only the exception matters; the returned tokens are not used.
                ValidTokenList(s)
            except htm8.LexError as e:
                print(e)
            else:
                self.fail('Expected LexError %r' % s)

    def testValid(self):
        # type: () -> None
        """Each input in VALID_LEX must lex without raising."""
        from data_lang.htm8_test import Lex

        # The second field (expected XML) is not consulted by this test.
        for s, _ in VALID_LEX:
            Lex(s)
            print()
32
33
# Inputs for which the tests in this file expect htm8.LexError
# (LexerTest.testInvalid and ValidateTest.testInvalid).
INVALID_LEX = [
    '< >',
    '<a><',
    '&amp<',
    '&<',
    # Hm > is allowed?
    #'a > b',
    'a < b',
    '<!-- unfinished comment',
    '<? unfinished processing',
    '</div bad=attr> <a> <b>',

    # A bare < in text is rejected, whereas 3 > 4 is accepted
    '<a> 3 < 4 </a>',
    # STYLEz is not a CDATA tag
    '<STYLEz><</STYLEz>',
]
51
# Markers for the "expected XML" field of the (html, expected_xml) tables.
# XmlTest compares expected_xml against UNCHANGED to mean "ToXml() output must
# equal the input".  SKIP is declared here, but the tables in this file mark
# skipped entries with '' instead.
SKIP, UNCHANGED = 0, 1
54
# (html, expected_xml) pairs that must lex cleanly.  LexerTest.testValid uses
# only the first field.  XmlTest.testValid reads the second: UNCHANGED means
# the output must equal the input, '' means the conversion is not checked, and
# any other string is the exact expected output.
VALID_LEX = [
    # TODO: convert these to XML
    ('<foo></foo>', UNCHANGED),
    ('<foo x=y></foo>', ''),
    #('<foo x="&"></foo>', '<foo x="&"></foo>'),
    ('<foo x="&"></foo>', ''),

    # Allowed with BadAmpersand
    # NOTE(review): the expected text equals the input yet is not marked
    # UNCHANGED; it may have been meant to carry an escaped ampersand --
    # confirm against ToXml().
    ('<p> x & y </p>', '<p> x & y </p>'),

    # No ambiguity
    ('<img src=/ >', ''),
    ('<img src="/">', ''),
    ('<img src=foo/ >', ''),
]
70
# Inputs for which ValidateTest.testInvalid expects htm8.ParseError when
# validating with BALANCED_TAGS.
INVALID_PARSE = [
    '<a></b>',
    # missing closing tag
    '<a>',
    # this is a self-closing tag
    '<meta></meta>',
]
76
# Attribute forms for which ValidateTest.testInvalid expects htm8.LexError.
INVALID_ATTR_LEX = [
    # Ambiguous, should be ""
    '<img src=/>',
    '<img src= />',
    '<img src=foo/>',
    '<img src= foo/>',

    # Quote characters inside an unquoted value
    '<img src=x"y">',
    "<img src=j''>",
]
88
# (html, expected_xml) pairs that ValidateTest.testValid passes to
# htm8_util.Validate() with BALANCED_TAGS.  XmlTest.testValid reads the second
# field: UNCHANGED means the output must equal the input, '' means the
# conversion is not checked, and any other string is the exact expected output.
VALID_PARSE = [
    ('<!DOCTYPE html>\n', ''),
    ('<!DOCTYPE>', ''),

    # Empty attribute values
    ('<p x=""></p>', UNCHANGED),
    ("<p x=''></p>", UNCHANGED),
    ('<self-closing a="b" />', UNCHANGED),

    # We could also normalize CDATA?
    # CDATA has an escaping problem of its own: a literal ]]> has to be handled
    # with concatenation, which just "pushes the problem around".  Using ONE
    # kind of escaping seems better.
    # NOTE(review): the earlier wording named that escaping as "<"; presumably
    # the &lt; entity was meant -- confirm.
    ('<script><![CDATA[ <wtf> >< ]]></script>', UNCHANGED),

    # > in text is allowed, but 3 < 4 is not
    # NOTE(review): the expected text equals the input yet is not marked
    # UNCHANGED; it may have been meant to be entity-escaped -- confirm
    # against ToXml().
    ('<a> 3 > 4 </a>', '<a> 3 > 4 </a>'),
    # < inside a quoted value is allowed, but 3 > 4 is not
    ('<p x="3 < 4"></p>', ''),
    ('<b><a href="foo">link</a></b>', UNCHANGED),

    # TODO: should be self-closing
    #('<meta><a></a>', '<meta/><a></a>'),
    ('<meta><a></a>', ''),

    # Attribute with no value
    ('<button disabled></button>', ''),
    ('<button disabled=></button>', ''),
    ('<button disabled= ></button>', ''),

    # Single-quoted values are pretty common
    ("<a href='single'></a>", ''),

    # Conceding to reality - I used these myself
    ('<a href=ble.sh></a>', ''),
    ('<a href=foo.html></a>', ''),
    ('<foo x="&"></foo>', ''),

    # Mixed-case tag names
    ('<foo></FOO>', ''),
    ('<Foo></fOO>', ''),

    # Capitalized VOID tag
    ('<META><a></a>', ''),
    ('<script><</script>', ''),
    # Open and close tags spelled with matching case
    ('<SCRipt><</SCRipt>', ''),
    ('<SCRIPT><</SCRIPT>', ''),
    ('<STYLE><</STYLE>', ''),
    #'<SCRipt><</script>',

    # Regression test from blog
    ('<script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>',
     ''),

    # Note: Python HTMLParser.py does DYNAMIC compilation of regex with re.I
    # flag to handle this!  Gah I want something faster.
    #'<script><</SCRIPT>',

    # TODO: Test <svg> and <math> ?
]
150
# Documents that ValidateTest.testValidXml expects htm8_util.Validate() to
# accept.  '<meta></meta>' also appears in INVALID_PARSE, where it is
# validated with BALANCED_TAGS alone.
VALID_XML = ['<meta></meta>']
154
# Tag forms for which ValidateTest.testInvalid expects htm8.LexError.
INVALID_TAG_LEX = [
    # > inside a quoted value is rejected, whereas 3 < 4 is accepted
    '<p x="3 > 4"></p>',
    # same thing
    '<a href=">"></a>',
    # bad attr
    '<a foo=bar !></a>',
]
162
163
class ValidateTest(unittest.TestCase):
    """Checks htm8_util.Validate() against the valid/invalid tables in this file."""

    def testInvalid(self):
        # type: () -> None
        """Invalid inputs must raise LexError or ParseError, per their table."""
        counters = htm8_util.Counters()
        for s in INVALID_LEX + INVALID_TAG_LEX + INVALID_ATTR_LEX:
            try:
                htm8_util.Validate(s, htm8_util.BALANCED_TAGS, counters)
            except htm8.LexError as e:
                print(e)
            else:
                self.fail('Expected LexError %r' % s)

        for s in INVALID_PARSE:
            try:
                htm8_util.Validate(s, htm8_util.BALANCED_TAGS, counters)
            except htm8.ParseError as e:
                print(e)
            else:
                # Name the offending input, as the LexError branch does.
                self.fail('Expected ParseError %r' % s)

    def testValid(self):
        # type: () -> None
        """Every input in VALID_PARSE must validate without raising."""
        counters = htm8_util.Counters()
        # The expected-XML field is not consulted by this test.
        for s, _ in VALID_PARSE:
            print('HTML5 %r' % s)
            htm8_util.Validate(s, htm8_util.BALANCED_TAGS, counters)
        #print('HTML5 attrs %r' % counters.debug_attrs)

    def testValidXml(self):
        # type: () -> None
        """Every input in VALID_XML must validate with both flags set."""
        counters = htm8_util.Counters()
        for s in VALID_XML:
            print('XML %r' % s)
            # The two flags are combined with bitwise OR.
            htm8_util.Validate(
                s, htm8_util.BALANCED_TAGS | htm8_util.NO_SPECIAL_TAGS,
                counters)
        #print('XML attrs %r' % counters.debug_attrs)
202
203
class XmlTest(unittest.TestCase):
    """Checks htm8_util.ToXml() against VALID_LEX and VALID_PARSE expectations."""

    def testValid(self):
        # type: () -> None
        """Convert each input and compare it with its expected-XML field.

        UNCHANGED means the output must equal the input.  SKIP or '' means the
        output is not checked.  Any other value is the exact expected output.
        """
        for h, expected_xml in VALID_LEX + VALID_PARSE:
            # ToXml() runs even for skipped entries, so it must not raise.
            actual = htm8_util.ToXml(h)
            if expected_xml == UNCHANGED:
                self.assertEqual(h, actual)
            elif expected_xml in (SKIP, ''):
                continue
            else:
                self.assertEqual(expected_xml, actual)
217
218
# Run the tests in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()