OILS / data_lang / htm8_test.py View on Github | oils.pub

322 lines, 198 significant
1#!/usr/bin/env python2
2from __future__ import print_function
3
4from _devbuild.gen.htm8_asdl import (h8_id, h8_id_t, h8_id_str, attr_name)
5
6import unittest
7import re
8
9from typing import List, Tuple
10
11from data_lang import htm8
12from doctools.util import log
13
# Load the shared HTML fixture once at import time; several test classes use it.
f = open('data_lang/testdata/hello.htm8')
try:
    TEST_HTML = f.read()
finally:
    f.close()
16
17
18class RegexTest(unittest.TestCase):
19
20 def testDotAll(self):
21 # type: () -> None
22
23 # Note that $ matches end of line, not end of string
24 p1 = re.compile(r'.')
25 print(p1.match('\n'))
26
27 p2 = re.compile(r'.', re.DOTALL)
28 print(p2.match('\n'))
29
30 #p3 = re.compile(r'[.\n]', re.VERBOSE)
31 p3 = re.compile(r'[.\n]')
32 print(p3.match('\n'))
33
34 print('Negation')
35
36 p4 = re.compile(r'[^>]')
37 print(p4.match('\n'))
38
39 def testAttrRe(self):
40 # type: () -> None
41 _ATTR_RE = htm8._ATTR_RE
42 m = _ATTR_RE.match(' empty= val')
43 print(m.groups())
44
45
class FunctionsTest(unittest.TestCase):
    """Tests for module-level helper functions in htm8."""

    def testFindLineNum(self):
        # type: () -> None
        text = 'foo\n' * 3
        # 1, 5, 10 land on lines 1-3; 50 is out of bounds
        for offset in [1, 5, 10, 50]:
            print(htm8._FindLineNum(text, offset))
54
55
class AttrLexerTest(unittest.TestCase):
    """Tests for htm8.AttrLexer, which scans attributes inside a start tag."""

    def testNoAttrs(self):
        # type: () -> None

        # TODO: h8_id.StartTag and EndTag will expose the tag_name_pos - the
        # end of the tag name

        doc = 'x <a>'
        lexer = htm8.Lexer(doc)

        # The first token is the raw text before <a>; skip it.
        tok_id, end_pos = lexer.Read()
        self.assertEqual(h8_id.RawData, tok_id)

        tok_id, end_pos = lexer.Read()
        self.assertEqual(h8_id.StartTag, tok_id)

        attr_lexer = htm8.AttrLexer(doc)
        attr_lexer.Init(lexer.TagNamePos(), end_pos)

        # <a> has no attributes, so the attr lexer reports Done right away,
        # with -1 for both name positions.
        n, name_start, name_end = attr_lexer.ReadName()
        self.assertEqual(n, attr_name.Done)
        self.assertEqual(-1, name_start)
        self.assertEqual(-1, name_end)

    def testAttr(self):
        # type: () -> None

        doc = '<a href=foo>'
        lexer = htm8.Lexer(doc)

        tok_id, end_pos = lexer.Read()
        self.assertEqual(h8_id.StartTag, tok_id)

        attr_lexer = htm8.AttrLexer(doc)
        attr_lexer.Init(lexer.TagNamePos(), end_pos)
        n, name_start, name_end = attr_lexer.ReadName()
94
95
def ValidTokenList(s, no_special_tags=False):
    # type: (str, bool) -> List[Tuple[h8_id_t, int]]
    """A wrapper that can be more easily translated to C++. Doesn't use iterators."""

    lx = htm8.Lexer(s, no_special_tags=no_special_tags)
    out = []
    pos = 0  # start of the token currently being read
    while True:
        tok_id, end_pos = lx.Read()
        out.append((tok_id, end_pos))
        if tok_id == h8_id.EndOfStream:
            return out
        if tok_id == h8_id.Invalid:
            # Report the error at the start of the bad token
            raise htm8.LexError(s, pos)
        pos = end_pos
112
113
def Lex(h, no_special_tags=False):
    # type: (str, bool) -> List[Tuple[int, int]]
    """Lex h, log each (end_pos, token id, fragment) triple, and return the tokens."""
    print(repr(h))
    tokens = ValidTokenList(h, no_special_tags=no_special_tags)
    left = 0
    for tok_id, right in tokens:
        # Each token covers h[left:right], where left is the previous end
        log('%d %s %r', right, h8_id_str(tok_id), h[left:right])
        left = right
    return tokens
124
125
class LexerTest(unittest.TestCase):
    """Token-stream checks for htm8.Lexer.

    Each expected tuple is (h8_id, end_pos): a token covers the input from
    the previous token's end position up to end_pos.
    """

    # Other consumers of this kind of lexing:
    # IndexLinker in devtools/make_help.py
    # <pre> sections in doc/html_help.py
    # TocExtractor in devtools/cmark.py

    def testPstrip(self):
        # type: () -> None
        """Remove anything like this.

        <p><pstrip> </pstrip></p>
        """
        pass

    def testCommentParse(self):
        # type: () -> None
        # Lex() raises LexError on h8_id.Invalid, so this validates the whole
        # hello.htm8 fixture end to end.
        tokens = Lex(TEST_HTML)

    def testCommentParse2(self):
        # type: () -> None
        h = '''
        hi <!-- line 1
                line 2 --><br/>'''
        tokens = Lex(h)

        self.assertEqual(
            [
                (h8_id.RawData, 12),
                (h8_id.Comment, 50),  # <!-- line 1 ... line 2 -->
                (h8_id.StartEndTag, 55),  # <br/>
                (h8_id.EndOfStream, 55),
            ],
            tokens)

    def testProcessingInstruction(self):
        # type: () -> None
        # <?xml ?> style header
        h = 'hi <? err ?>'
        tokens = Lex(h)

        self.assertEqual(
            [
                (h8_id.RawData, 3),  # 'hi '
                (h8_id.Processing, 12),  # <? err ?>
                (h8_id.EndOfStream, 12),
            ],
            tokens)

    def testScriptStyle(self):
        # type: () -> None
        h = '''
        hi <script src=""> if (x < 1 && y > 2 ) { console.log(""); }
        </script>
        '''
        tokens = Lex(h)

        expected = [
            (h8_id.RawData, 12),
            (h8_id.StartTag, 27),  # <script src="">
            (h8_id.HtmlCData, 78),  # JavaScript code is HTML CData
            (h8_id.EndTag, 87),  # </script>
            (h8_id.RawData, 96),  # trailing newline and indent
            (h8_id.EndOfStream, 96),
        ]
        self.assertEqual(expected, tokens)

        # Tag name matching is case-insensitive
        tokens = Lex(h.replace('script', 'scrIPT'))
        self.assertEqual(expected, tokens)

    def testScriptStyleXml(self):
        # type: () -> None
        h = 'hi <script src=""> &lt; </script>'
        # XML mode: <script> contents are lexed normally, not as CData
        tokens = Lex(h, no_special_tags=True)

        self.assertEqual(
            [
                (h8_id.RawData, 3),  # 'hi '
                (h8_id.StartTag, 18),  # <script src="">
                (h8_id.RawData, 19),  # space
                (h8_id.CharEntity, 23),  # &lt;
                (h8_id.RawData, 24),  # space
                (h8_id.EndTag, 33),  # </script>
                (h8_id.EndOfStream, 33),
            ],
            tokens)

    def testCData(self):
        # type: () -> None

        # from
        # /home/andy/src/languages/Python-3.11.5/Lib/test/xmltestdata/c14n-20/inC14N4.xml
        h = '<compute><![CDATA[value>"0" && value<"10" ?"valid":"error"]]></compute>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 9),  # <compute>
            (h8_id.CData, 61),  # <![CDATA[ ... ]]>
            (h8_id.EndTag, 71),  # </compute>
            (h8_id.EndOfStream, 71),
        ], tokens)

    def testEntity(self):
        # type: () -> None

        # from
        # /home/andy/src/Python-3.12.4/Lib/test/xmltestdata/c14n-20/inC14N5.xml
        h = '&ent1;, &ent2;!'

        tokens = Lex(h)

        self.assertEqual([
            (h8_id.CharEntity, 6),  # &ent1;
            (h8_id.RawData, 8),  # ', '
            (h8_id.CharEntity, 14),  # &ent2;
            (h8_id.RawData, 15),  # '!'
            (h8_id.EndOfStream, 15),
        ], tokens)

    def testStartTag(self):
        # type: () -> None

        h = '<a>hi</a>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 3),
            (h8_id.RawData, 5),
            (h8_id.EndTag, 9),
            (h8_id.EndOfStream, 9),
        ], tokens)

        # Make sure we don't consume too much
        h = '<a><source>1.7</source></a>'

        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 3),
            (h8_id.StartTag, 11),  # <source> is an ordinary tag here
            (h8_id.RawData, 14),
            (h8_id.EndTag, 23),
            (h8_id.EndTag, 27),
            (h8_id.EndOfStream, 27),
        ], tokens)

        # TODO: A multi-line <configuration><source> case was disabled here
        # behind an early 'return'; its expected token list was never
        # finished (it repeated placeholder positions), so the unreachable
        # code was removed.

    def testBad(self):
        # type: () -> None
        # A bare '&' that doesn't start an entity
        h = '&'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.BadAmpersand, 1),
            (h8_id.EndOfStream, 1),
        ], tokens)

        # A '>' outside of any tag
        h = '>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.BadGreaterThan, 1),
            (h8_id.EndOfStream, 1),
        ], tokens)

    def testEndOfStream(self):
        # type: () -> None

        # NUL terminates lexing early
        h = 'a\0b'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.RawData, 1),
            (h8_id.EndOfStream, 2),
        ], tokens)
319
320
# Run all TestCase classes in this module when executed directly.
if __name__ == '__main__':
    unittest.main()