#!/usr/bin/env python2
from __future__ import print_function

from _devbuild.gen.htm8_asdl import (h8_id, h8_id_t, h8_id_str, attr_name)

import unittest
import re

from typing import List, Tuple

from data_lang import htm8
from doctools.util import log

with open('data_lang/testdata/hello.htm8') as f:
    TEST_HTML = f.read()


class RegexTest(unittest.TestCase):

    def testDotAll(self):
        # type: () -> None

        # Note: '.' does not match a newline unless re.DOTALL is set
        p1 = re.compile(r'.')
        print(p1.match('\n'))

        p2 = re.compile(r'.', re.DOTALL)
        print(p2.match('\n'))

        #p3 = re.compile(r'[.\n]', re.VERBOSE)
        p3 = re.compile(r'[.\n]')
        print(p3.match('\n'))

        print('Negation')

        p4 = re.compile(r'[^>]')
        print(p4.match('\n'))

    def testAttrRe(self):
        # type: () -> None
        _ATTR_RE = htm8._ATTR_RE
        m = _ATTR_RE.match(' empty= val')
        print(m.groups())


class FunctionsTest(unittest.TestCase):

    def testFindLineNum(self):
        # type: () -> None
        s = 'foo\n' * 3
        for pos in [1, 5, 10, 50]:  # 50 is out of bounds
            line_num = htm8._FindLineNum(s, pos)
            print(line_num)


class AttrLexerTest(unittest.TestCase):

    def testNoAttrs(self):
        # type: () -> None

        # TODO: h8_id.StartTag and EndTag will expose the tag_name_pos - the
        # end of the tag name

        h = 'x <a>'
        lx = htm8.Lexer(h)

        # Skip raw data
        tok_id, end_pos = lx.Read()
        self.assertEqual(h8_id.RawData, tok_id)

        tok_id, end_pos = lx.Read()
        self.assertEqual(h8_id.StartTag, tok_id)

        attr_lexer = htm8.AttrLexer(h)
        attr_lexer.Init(lx.TagNamePos(), end_pos)

        # There are no attributes on <a>
        n, name_start, name_end = attr_lexer.ReadName()
        self.assertEqual(n, attr_name.Done)
        self.assertEqual(-1, name_start)
        self.assertEqual(-1, name_end)

    def testInvalid(self):
        # type: () -> None
        h = '<a !>'
        lx = htm8.Lexer(h)

        tok_id, end_pos = lx.Read()
        self.assertEqual(h8_id.StartTag, tok_id)

        attr_lexer = htm8.AttrLexer(h)
        attr_lexer.Init(lx.TagNamePos(), end_pos)

        n, name_start, name_end = attr_lexer.ReadName()
        self.assertEqual(n, attr_name.Invalid)
        self.assertEqual(-1, name_start)
        self.assertEqual(-1, name_end)

    def testEmpty(self):
        # type: () -> None
        h = '<img src=/>'
        lx = htm8.Lexer(h)

        tok_id, end_pos = lx.Read()
        self.assertEqual(h8_id.StartEndTag, tok_id)

        attr_lexer = htm8.AttrLexer(h)
        attr_lexer.Init(lx.TagNamePos(), end_pos)

        n, name_start, name_end = attr_lexer.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(5, name_start)
        self.assertEqual(8, name_end)
        self.assertEqual(False, attr_lexer.next_value_is_missing)

        self.assertEqual(True, attr_lexer.AttrNameEquals('src'))
        self.assertEqual(False, attr_lexer.AttrNameEquals('srcz'))

    def testMissing(self):
        # type: () -> None
        h = '<img SRC/>'
        lx = htm8.Lexer(h)

        tok_id, end_pos = lx.Read()
        self.assertEqual(h8_id.StartEndTag, tok_id)

        attr_lexer = htm8.AttrLexer(h)
        attr_lexer.Init(lx.TagNamePos(), end_pos)

        n, name_start, name_end = attr_lexer.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(5, name_start)
        self.assertEqual(8, name_end)
        self.assertEqual(True, attr_lexer.next_value_is_missing)

        self.assertEqual(True, attr_lexer.AttrNameEquals('src'))
        self.assertEqual(False, attr_lexer.AttrNameEquals('srcz'))

    def testAttr(self):
        # type: () -> None
        h = '<a x=foo>'
        lx = htm8.Lexer(h)

        tok_id, end_pos = lx.Read()
        self.assertEqual(h8_id.StartTag, tok_id)

        attr_lexer = htm8.AttrLexer(h)
        attr_lexer.Init(lx.TagNamePos(), end_pos)
        n, name_start, name_end = attr_lexer.ReadName()
        self.assertEqual(n, attr_name.Ok)
        self.assertEqual(3, name_start)
        self.assertEqual(4, name_end)

        # Note: internal state is set according to the '=' after the name


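# Sketch of how Lexer and AttrLexer compose, using only calls exercised by the
# tests above.  FirstAttrIsSrc is an illustrative helper for this file, not
# part of the htm8 API.  Per testEmpty above, FirstAttrIsSrc('<img src=/>')
# should return True.
def FirstAttrIsSrc(h):
    # type: (str) -> bool
    """Return True if the first tag's first attribute is named 'src'."""
    lx = htm8.Lexer(h)
    while True:
        tok_id, end_pos = lx.Read()
        if tok_id == h8_id.EndOfStream:
            return False
        if tok_id == h8_id.Invalid:
            return False  # give up on lex errors (ValidTokenList raises instead)
        if tok_id == h8_id.StartTag or tok_id == h8_id.StartEndTag:
            attr_lexer = htm8.AttrLexer(h)
            attr_lexer.Init(lx.TagNamePos(), end_pos)
            n, _, _ = attr_lexer.ReadName()
            return n == attr_name.Ok and attr_lexer.AttrNameEquals('src')

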
def ValidTokenList(s, no_special_tags=False):
    # type: (str, bool) -> List[Tuple[h8_id_t, int]]
    """A wrapper that can be more easily translated to C++.  Doesn't use iterators."""

    start_pos = 0
    tokens = []
    lx = htm8.Lexer(s, no_special_tags=no_special_tags)
    while True:
        tok_id, end_pos = lx.Read()
        tokens.append((tok_id, end_pos))
        if tok_id == h8_id.EndOfStream:
            break
        if tok_id == h8_id.Invalid:
            raise htm8.LexError(s, start_pos)
        start_pos = end_pos
    return tokens


def Lex(h, no_special_tags=False):
    # type: (str, bool) -> List[Tuple[int, int]]
    print(repr(h))
    tokens = ValidTokenList(h, no_special_tags=no_special_tags)
    start_pos = 0
    for tok_id, end_pos in tokens:
        frag = h[start_pos:end_pos]
        log('%d %s %r', end_pos, h8_id_str(tok_id), frag)
        start_pos = end_pos
    return tokens


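# Sketch: the same Read() loop can collect only the h8_id.RawData fragments.
# This mirrors ValidTokenList() above; ExtractRawData is an illustrative name,
# not part of the htm8 API.  Per testStartTag below, ExtractRawData('<a>hi</a>')
# should give 'hi'.
def ExtractRawData(s):
    # type: (str) -> str
    chunks = []
    start_pos = 0
    lx = htm8.Lexer(s)
    while True:
        tok_id, end_pos = lx.Read()
        if tok_id == h8_id.EndOfStream:
            break
        if tok_id == h8_id.Invalid:
            raise htm8.LexError(s, start_pos)
        if tok_id == h8_id.RawData:
            chunks.append(s[start_pos:end_pos])
        start_pos = end_pos
    return ''.join(chunks)

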
class LexerTest(unittest.TestCase):

    # IndexLinker in devtools/make_help.py
    # <pre> sections in doc/html_help.py
    # TocExtractor in devtools/cmark.py

    def testPstrip(self):
        # type: () -> None
        """Remove anything like this.

        <p><pstrip> </pstrip></p>
        """
        pass

    def testCommentParse(self):
        # type: () -> None
        n = len(TEST_HTML)
        tokens = Lex(TEST_HTML)

    def testCommentParse2(self):
        # type: () -> None
        h = '''
        hi <!-- line 1
                line 2 --><br/>'''
        tokens = Lex(h)

        self.assertEqual(
            [
                (h8_id.RawData, 12),
                (h8_id.Comment, 50),  # <!-- line 1 ... line 2 -->
                (h8_id.StartEndTag, 55),  # <br/>
                (h8_id.EndOfStream, 55),
            ],
            tokens)

    def testProcessingInstruction(self):
        # type: () -> None
        # <?xml ?> header
        h = 'hi <? err ?>'
        tokens = Lex(h)

        self.assertEqual(
            [
                (h8_id.RawData, 3),
                (h8_id.Processing, 12),  # <? err ?>
                (h8_id.EndOfStream, 12),
            ],
            tokens)

    def testScriptStyle(self):
        # type: () -> None
        h = '''
        hi <script src=""> if (x < 1 && y > 2 ) { console.log(""); }
        </script>
        '''
        tokens = Lex(h)

        expected = [
            (h8_id.RawData, 12),
            (h8_id.StartTag, 27),  # <script src="">
            (h8_id.HtmlCData, 78),  # JavaScript code is HTML CData
            (h8_id.EndTag, 87),  # </script>
            (h8_id.RawData, 96),  # trailing newline and indent
            (h8_id.EndOfStream, 96),
        ]
        self.assertEqual(expected, tokens)

        # The tag name is matched case-insensitively
        tokens = Lex(h.replace('script', 'scrIPT'))
        self.assertEqual(expected, tokens)

    def testScriptStyleXml(self):
        # type: () -> None
        h = 'hi <script src=""> &lt; </script>'
        # XML mode: no special <script> handling
        tokens = Lex(h, no_special_tags=True)

        self.assertEqual(
            [
                (h8_id.RawData, 3),  # hi
                (h8_id.StartTag, 18),  # <script src="">
                (h8_id.RawData, 19),  # space
                (h8_id.CharEntity, 23),  # &lt;
                (h8_id.RawData, 24),  # space
                (h8_id.EndTag, 33),  # </script>
                (h8_id.EndOfStream, 33),
            ],
            tokens)

    def testCData(self):
        # type: () -> None

        # from
        # /home/andy/src/languages/Python-3.11.5/Lib/test/xmltestdata/c14n-20/inC14N4.xml
        h = '<compute><![CDATA[value>"0" && value<"10" ?"valid":"error"]]></compute>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 9),
            (h8_id.CData, 61),
            (h8_id.EndTag, 71),
            (h8_id.EndOfStream, 71),
        ], tokens)

    def testEntity(self):
        # type: () -> None

        # from
        # /home/andy/src/Python-3.12.4/Lib/test/xmltestdata/c14n-20/inC14N5.xml
        h = '&ent1;, &ent2;!'

        tokens = Lex(h)

        self.assertEqual([
            (h8_id.CharEntity, 6),
            (h8_id.RawData, 8),
            (h8_id.CharEntity, 14),
            (h8_id.RawData, 15),
            (h8_id.EndOfStream, 15),
        ], tokens)

    def testStartTag(self):
        # type: () -> None

        h = '<a>hi</a>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 3),
            (h8_id.RawData, 5),
            (h8_id.EndTag, 9),
            (h8_id.EndOfStream, 9),
        ], tokens)

        # Make sure we don't consume too much
        h = '<a><source>1.7</source></a>'

        tokens = Lex(h)

        self.assertEqual([
            (h8_id.StartTag, 3),
            (h8_id.StartTag, 11),
            (h8_id.RawData, 14),
            (h8_id.EndTag, 23),
            (h8_id.EndTag, 27),
            (h8_id.EndOfStream, 27),
        ], tokens)

        return

        h = '''
        <configuration>
          <source>1.7</source>
        </configuration>'''

        tokens = Lex(h)

        self.assertEqual([
            (h8_id.RawData, 9),
            (h8_id.StartTag, 24),
            (h8_id.RawData, 9),
            (h8_id.EndOfStream, 9),
        ], tokens)

    def testBad(self):
        # type: () -> None
        h = '&'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.BadAmpersand, 1),
            (h8_id.EndOfStream, 1),
        ], tokens)

        h = '>'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.BadGreaterThan, 1),
            (h8_id.EndOfStream, 1),
        ], tokens)

    def testEndOfStream(self):
        # type: () -> None

        # NUL is end
        h = 'a\0b'
        tokens = Lex(h)

        self.assertEqual([
            (h8_id.RawData, 1),
            (h8_id.EndOfStream, 2),
        ], tokens)


if __name__ == '__main__':
    unittest.main()