data_lang/htm8

OILS / data_lang / htm8_util.py View on Github | oils.pub

294 lines, 196 significant

1	#!/usr/bin/env python2
2
3	try:
4	from cStringIO import StringIO
5	except ImportError:
6	# for python3
7	from io import StringIO # type: ignore
8	import sys
9
10	from typing import List
11
12	from _devbuild.gen.htm8_asdl import (h8_id, h8_id_str, attr_value_e)
13	from data_lang import htm8
14	from data_lang.htm8 import (Lexer, LexError, ParseError, Output)
15	from doctools.util import log
16
# Elements that never take a closing tag, so a bare <br> or <meta> does not
# open a new nesting level.  The list follows MDN:
# https://developer.mozilla.org/en-US/docs/Glossary/Void_element
VOID_ELEMENTS = [
    'area', 'base', 'br', 'col', 'embed', 'hr', 'img',
    'input', 'link', 'meta', 'param', 'source', 'track', 'wbr',
]
34
# Bit flags accepted by Validate(); combine them with bitwise OR.
NO_LEX_ATTRS = 0b0010  # skip href="?x=42&y=99"
NO_SPECIAL_TAGS = 0b0100  # <script> <style>, VOID tags, etc.
BALANCED_TAGS = 0b1000  # are tags balanced?
38
39
def Validate(contents, flags, counters):
    # type: (str, int, Counters) -> None
    """Lex a document, optionally check tag balance, and update counters.

    Args:
      contents: text of the document to scan
      flags: bitwise OR of NO_LEX_ATTRS, NO_SPECIAL_TAGS and BALANCED_TAGS
      counters: Counters instance; its fields are incremented in place

    Raises:
      LexError: the lexer returned h8_id.Invalid
      ParseError: only with BALANCED_TAGS -- an end tag has no matching start
        tag, an end tag differs from the innermost open tag, or tags are
        still open at the end of the document
    """
    # Decode the flag bits once rather than on every token.
    lex_attrs = not (flags & NO_LEX_ATTRS)
    check_balance = bool(flags & BALANCED_TAGS)
    no_special_tags = bool(flags & NO_SPECIAL_TAGS)

    attr_lx = htm8.AttrLexer(contents)
    lx = htm8.Lexer(contents, no_special_tags=no_special_tags)

    # Only the number of tokens is reported, so count them instead of
    # keeping every (tok_id, end_pos) pair alive for the whole document.
    num_tokens = 0
    start_pos = 0
    tag_stack = []
    while True:
        tok_id, end_pos = lx.Read()

        if tok_id == h8_id.Invalid:
            raise LexError('Validate() got invalid token', contents, start_pos)
        if tok_id == h8_id.EndOfStream:
            break

        num_tokens += 1

        if tok_id == h8_id.StartEndTag:
            counters.num_start_end_tags += 1

            attr_lx.Init(tok_id, lx.TagNamePos(), end_pos)
            if lex_attrs:
                counters.num_attrs += len(htm8.AllAttrsRaw(attr_lx))
                # TODO: val_lexer.NumTokens() can be replaced with tokens_out

        elif tok_id == h8_id.StartTag:
            counters.num_start_tags += 1

            attr_lx.Init(tok_id, lx.TagNamePos(), end_pos)
            if lex_attrs:
                counters.num_attrs += len(htm8.AllAttrsRaw(attr_lx))

            if check_balance:
                tag_name = lx.CanonicalTagName()
                # With NO_SPECIAL_TAGS every start tag needs a closing tag.
                # Otherwise void elements, e.g. <meta>, are considered
                # self-closing, like <meta/>, and are never pushed.
                if no_special_tags or tag_name not in VOID_ELEMENTS:
                    tag_stack.append(tag_name)

            counters.max_tag_stack = max(counters.max_tag_stack,
                                         len(tag_stack))

        elif tok_id == h8_id.EndTag:
            if check_balance:
                if not tag_stack:
                    raise ParseError('Tag stack empty',
                                     s=contents,
                                     start_pos=start_pos)
                expected = tag_stack.pop()

                actual = lx.CanonicalTagName()
                if expected != actual:
                    raise ParseError(
                        'Got unexpected closing tag %r; opening tag was %r' %
                        (contents[start_pos:end_pos], expected),
                        s=contents,
                        start_pos=start_pos)

        start_pos = end_pos

    if tag_stack:
        raise ParseError('Missing closing tags at end of doc: %s' %
                         ' '.join(tag_stack),
                         s=contents,
                         start_pos=start_pos)

    # Reached only when the whole document was accepted, so a failed document
    # contributes nothing to the token total.
    counters.num_tokens += num_tokens
117
118
def ToXml(htm8_str):
    # type: (str) -> str
    """Rewrite an HTM8 document so that it is closer to well-formed XML.

    Implemented so far:
    - attributes with a missing, empty or unquoted value get a double-quoted
      value
    - a token lexed as BadAmpersand is written as &amp;
    - a token lexed as BadGreaterThan is written as &gt;
    All other tokens are copied through unchanged.

    Args:
      htm8_str: the document text

    Returns:
      The converted document text.

    Raises:
      LexError: the lexer returned h8_id.Invalid
    """
    # TODO:
    # 1. Lex it
    # 2. < & > must be escaped
    #    a. in raw data
    #    b. in quoted strings
    # 3. <script> turned into CDATA
    # 4. void tags turned into self-closing tags
    # 5. case-sensitive tag matching - not sure about this

    attr_lexer = htm8.AttrLexer(htm8_str)

    f = StringIO()
    out = Output(htm8_str, f)

    lx = Lexer(htm8_str)

    pos = 0
    while True:
        tok_id, end_pos = lx.Read()

        if tok_id == h8_id.Invalid:
            raise LexError('ToXml() got invalid token', htm8_str, pos)
        if tok_id == h8_id.EndOfStream:
            break

        if tok_id in (h8_id.RawData, h8_id.CharEntity, h8_id.HexChar,
                      h8_id.DecChar):
            out.PrintUntil(end_pos)

        elif tok_id in (h8_id.StartTag, h8_id.StartEndTag):
            attr_lexer.Init(tok_id, lx.TagNamePos(), end_pos)
            all_attrs = htm8.AllAttrsRawSlice(attr_lexer)
            for name_start, name_end, equal_end, v, val_start, val_end in all_attrs:
                if v == attr_value_e.Missing:  # <a missing>
                    out.PrintUntil(name_end)
                    out.Print('=""')
                elif v == attr_value_e.Empty:  # <a empty=>
                    out.PrintUntil(equal_end)
                    out.Print('""')
                elif v == attr_value_e.Unquoted:  # <a foo=bar>
                    # Because we disallow ", we can just surround with quotes
                    out.PrintUntil(val_start)
                    out.Print('"')
                    out.PrintUntil(val_end)
                    out.Print('"')

                # TODO: get the kind of string
                #
                # Quoted:   we need to replace & with &amp; and < with &lt;
                #           note > is not allowed
                # Unquoted: right now, we can just surround with double quotes
                #           because we don't allow any bad chars
                # Empty   : add "", so empty= becomes =""
                # Missing : add ="", so missing becomes missing=""

            tag_name = lx.CanonicalTagName()
            if tok_id == h8_id.StartTag and tag_name in VOID_ELEMENTS:
                # TODO: instead of closing >, print />
                pass

        elif tok_id == h8_id.BadAmpersand:
            # A bare & is not allowed in XML character data; write the
            # entity, then skip over the original character.
            out.Print('&amp;')
            out.SkipTo(end_pos)

        elif tok_id == h8_id.BadGreaterThan:
            # Likewise replace the stray > with its entity.
            out.Print('&gt;')
            out.SkipTo(end_pos)

        else:
            out.PrintUntil(end_pos)

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()
199
200
class Counters(object):
    """Running totals that Validate() adds to and main() prints.

    One instance can be shared across many documents; every field starts at
    zero.
    """

    def __init__(self):
        # type: () -> None

        # Token and tag tallies
        self.num_start_tags = 0
        self.num_start_end_tags = 0
        self.num_tokens = 0

        # Attribute tallies.  num_val_tokens is printed by main() but nothing
        # in this file increments it.
        self.num_attrs = 0
        self.num_val_tokens = 0

        # Largest number of simultaneously open tags seen so far
        self.max_tag_stack = 0
213
214
def main(argv):
    # type: (List[str]) -> int
    """Command line entry point; argv[1] selects the action.

    Actions:
      tokens      Lex the document on stdin and log one line per token.
      quick-scan  Validate each file named on stdin, without lexing attrs.
      lex-htm8    Validate each file named on stdin, lexing attrs.
      parse-htm8  Like lex-htm8, and also require balanced tags.
      parse-xml   Like parse-htm8, with special tag handling turned off.
      todo        Placeholder; does nothing.

    Returns:
      Process exit status: 0 on success, 1 if any file failed validation.

    Raises:
      RuntimeError: the action is missing or not recognized
      LexError: 'tokens' hit an invalid token
    """
    if len(argv) < 2:
        # Report a usage problem instead of a bare IndexError.
        raise RuntimeError('Action required')
    action = argv[1]

    if action == 'tokens':
        contents = sys.stdin.read()

        lx = Lexer(contents)
        start_pos = 0
        while True:
            tok_id, end_pos = lx.Read()
            if tok_id == h8_id.Invalid:
                raise LexError('Invalid token', contents, start_pos)
            if tok_id == h8_id.EndOfStream:
                break

            frag = contents[start_pos:end_pos]
            log('%d %s %r', end_pos, h8_id_str(tok_id), frag)
            start_pos = end_pos

        return 0

    elif action in ('quick-scan', 'lex-htm8', 'parse-htm8', 'parse-xml'):

        errors = []
        counters = Counters()

        flags = 0
        if action == 'quick-scan':
            flags |= NO_LEX_ATTRS
        if action.startswith('parse-'):
            flags |= BALANCED_TAGS
        if action == 'parse-xml':
            flags |= NO_SPECIAL_TAGS

        # stdin carries one file name per line.
        for line in sys.stdin:
            filename = line.strip()
            if not filename:
                # A blank line would otherwise become open('').
                continue
            with open(filename) as f:
                contents = f.read()

            # Keep going after a bad file so that every failure is reported.
            try:
                Validate(contents, flags, counters)
            except LexError as e:
                log('Lex error in %r: %s', filename, e)
                errors.append((filename, e))
            except ParseError as e:
                log('Parse error in %r: %s', filename, e)
                errors.append((filename, e))

        log('')
        log('%10d tokens', counters.num_tokens)
        log('%10d start/end tags', counters.num_start_end_tags)
        log('%10d start tags', counters.num_start_tags)
        log('%10d attrs', counters.num_attrs)
        log('%10d max tag stack depth', counters.max_tag_stack)
        log('%10d attr val tokens', counters.num_val_tokens)
        log('%10d errors', len(errors))
        if errors:
            return 1
        return 0

    elif action == 'todo':
        # Other algorithms:
        #
        # - select first subtree with given ID
        #   - this requires understanding the void tags I suppose
        # - select all subtrees that have a class
        # - materialize DOM

        # Safe-HTM8?  This is a filter
        return 0

    else:
        raise RuntimeError('Invalid action %r' % action)
291
292
if __name__ == '__main__':
    # Run only when executed as a script; main()'s return value becomes the
    # process exit status.
    exit_status = main(sys.argv)
    sys.exit(exit_status)