data_lang/htm8

OILS / data_lang / htm8_util.py View on Github | oils.pub

278 lines, 182 significant

1	#!/usr/bin/env python2
2
3	try:
4	from cStringIO import StringIO
5	except ImportError:
6	# for python3
7	from io import StringIO # type: ignore
8	import sys
9
10	from typing import List
11
12	from _devbuild.gen.htm8_asdl import (h8_id, h8_id_str)
13	from data_lang import htm8
14	from data_lang.htm8 import (Lexer, LexError, ParseError, Output)
15	from doctools.util import log
16
# HTML void elements, i.e. tags written without a matching end tag.
# Reference: https://developer.mozilla.org/en-US/docs/Glossary/Void_element
#
# Validate() does not push these onto its tag stack, unless NO_SPECIAL_TAGS
# is set.  ToXml() also looks them up.
VOID_ELEMENTS = [
    'area', 'base', 'br', 'col', 'embed', 'hr', 'img',
    'input', 'link', 'meta', 'param', 'source', 'track', 'wbr',
]
34
# Bit flags for the `flags` argument of Validate(); combine them with |.
# Bit 0 (0x01) is not assigned.
#
# NOTE(review): in this file main() sets LEX_ATTRS and LEX_QUOTED_VALUES, but
# Validate() only tests NO_SPECIAL_TAGS and BALANCED_TAGS -- confirm whether
# the first two are still meant to have an effect.
LEX_ATTRS = 0x02  # bit 1
LEX_QUOTED_VALUES = 0x04  # bit 2: href="?x=42&y=99"
NO_SPECIAL_TAGS = 0x08  # bit 3: <script> <style>, VOID tags, etc.
BALANCED_TAGS = 0x10  # bit 4: are tags balanced?
39
40
def Validate(contents, flags, counters):
    # type: (str, int, Counters) -> None
    """Lex `contents` as HTM8, check it according to `flags`, update `counters`.

    Args:
      contents: The whole document to check.
      flags: Bitwise OR of the module-level flag constants.  Only
        NO_SPECIAL_TAGS and BALANCED_TAGS are consulted here.
      counters: Counters instance, mutated in place.  The tag, attribute and
        stack-depth fields are updated while tokens are read, so they keep
        partial totals when an error is raised.  num_tokens is only added
        after the whole document has been accepted.

    Raises:
      LexError: The lexer returned h8_id.Invalid.
      ParseError: With BALANCED_TAGS set: an end tag appeared with no open
        tag, an end tag did not match the innermost open tag, or open tags
        were left over at the end of the document.
    """
    attr_lx = htm8.AttrLexer(contents)

    no_special_tags = bool(flags & NO_SPECIAL_TAGS)
    lx = htm8.Lexer(contents, no_special_tags=no_special_tags)

    # Only the token count is reported, so keep a counter rather than a list
    # holding every (tok_id, end_pos) pair of the document.
    num_tokens = 0
    start_pos = 0  # offset where the token being examined begins
    tag_stack = []  # canonical names of the currently open tags
    while True:
        tok_id, end_pos = lx.Read()
        #log('TOP %s %r', h8_id_str(tok_id), contents[start_pos:end_pos])

        if tok_id == h8_id.Invalid:
            raise LexError('Validate() got invalid token', contents, start_pos)
        if tok_id == h8_id.EndOfStream:
            break

        num_tokens += 1

        if tok_id == h8_id.StartEndTag:
            counters.num_start_end_tags += 1

            attr_lx.Init(tok_id, lx.TagNamePos(), end_pos)
            all_attrs = htm8.AllAttrsRaw(attr_lx)
            counters.num_attrs += len(all_attrs)
            # TODO: val_lexer.NumTokens() can be replaced with tokens_out

        elif tok_id == h8_id.StartTag:
            counters.num_start_tags += 1

            attr_lx.Init(tok_id, lx.TagNamePos(), end_pos)
            all_attrs = htm8.AllAttrsRaw(attr_lx)
            counters.num_attrs += len(all_attrs)

            #counters.debug_attrs.extend(all_attrs)

            if flags & BALANCED_TAGS:
                tag_name = lx.CanonicalTagName()
                # Without NO_SPECIAL_TAGS, a void element such as <meta> is
                # considered self-closing, like <meta/>, so it is not pushed.
                if no_special_tags or tag_name not in VOID_ELEMENTS:
                    tag_stack.append(tag_name)

            counters.max_tag_stack = max(counters.max_tag_stack,
                                         len(tag_stack))

        elif tok_id == h8_id.EndTag:
            if flags & BALANCED_TAGS:
                try:
                    expected = tag_stack.pop()
                except IndexError:
                    raise ParseError('Tag stack empty',
                                     s=contents,
                                     start_pos=start_pos)

                actual = lx.CanonicalTagName()
                if expected != actual:
                    raise ParseError(
                        'Got unexpected closing tag %r; opening tag was %r' %
                        (contents[start_pos:end_pos], expected),
                        s=contents,
                        start_pos=start_pos)

        start_pos = end_pos

    # Nothing is ever pushed without BALANCED_TAGS, so this can only fire
    # when that flag is set.
    if tag_stack:
        raise ParseError('Missing closing tags at end of doc: %s' %
                         ' '.join(tag_stack),
                         s=contents,
                         start_pos=start_pos)

    counters.num_tokens += num_tokens
116
117
def ToXml(htm8_str):
    # type: (str) -> str
    """Return `htm8_str` with stray '&' and '>' characters escaped for XML.

    Work in progress: the input is lexed, BadAmpersand tokens are replaced by
    '&amp;' and BadGreaterThan tokens by '&gt;'.  All other text, including
    tags and attributes, is copied through unchanged.

    Args:
      htm8_str: The HTM8 document to convert.

    Returns:
      The converted document as a string.

    Raises:
      LexError: The lexer returned h8_id.Invalid.
    """
    # TODO:
    # 1. Lex it
    # 2. < & > must be escaped
    #    a. in raw data
    #    b. in quoted strings
    # 3. <script> turned into CDATA
    # 4. void tags turned into self-closing tags
    # 5. case-sensitive tag matching - not sure about this

    attr_lexer = htm8.AttrLexer(htm8_str)

    f = StringIO()
    out = Output(htm8_str, f)

    lx = Lexer(htm8_str)

    pos = 0  # offset where the current token begins
    while True:
        tok_id, end_pos = lx.Read()

        if tok_id == h8_id.Invalid:
            raise LexError('ToXml() got invalid token', htm8_str, pos)
        if tok_id == h8_id.EndOfStream:
            break

        if tok_id in (h8_id.RawData, h8_id.CharEntity, h8_id.HexChar,
                      h8_id.DecChar):
            out.PrintUntil(end_pos)

        elif tok_id in (h8_id.StartTag, h8_id.StartEndTag):
            # Nothing is written here: the tag text stays pending in `out`
            # until the next PrintUntil() / PrintTheRest() call copies it.
            attr_lexer.Init(tok_id, lx.TagNamePos(), end_pos)
            all_attrs = htm8.AllAttrsRawSlice(attr_lexer)
            for name_start, name_end, v, val_start, val_end in all_attrs:
                #val_lexer.Reset(val_start, val_end)
                pass
                # TODO: get the kind of string
                #
                # Quoted:   we need to replace & with &amp; and < with &lt;
                #           note > is not allowed
                # Unquoted: right now, we can just surround with double quotes
                #           because we don't allow any bad chars
                # Empty   : add "", so empty= becomes =""
                # Missing : add ="", so missing becomes missing=""

            tag_name = lx.CanonicalTagName()
            if tok_id == h8_id.StartTag and tag_name in VOID_ELEMENTS:
                # TODO: instead of closing >, print />
                pass

        elif tok_id == h8_id.BadAmpersand:
            # Flush text that is still pending (e.g. a start tag directly
            # before this token); otherwise SkipTo() below would drop it.
            # NOTE(review): relies on Output copying from its own cursor up to
            # the given offset, as the tag branch above already assumes.
            out.PrintUntil(pos)
            out.Print('&amp;')
            out.SkipTo(end_pos)

        elif tok_id == h8_id.BadGreaterThan:
            # Same flush-then-replace sequence as for BadAmpersand.
            out.PrintUntil(pos)
            out.Print('&gt;')
            out.SkipTo(end_pos)

        else:
            out.PrintUntil(end_pos)

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()
185
186
class Counters(object):
    """Running totals filled in by Validate() and reported by main().

    One instance can be passed to several Validate() calls; the fields then
    accumulate across documents.
    """

    def __init__(self):
        # type: () -> None

        # Incremented per StartTag / StartEndTag token
        self.num_start_tags = 0
        self.num_start_end_tags = 0

        # Attributes found on start tags and start/end tags
        self.num_attrs = 0

        # Not updated anywhere in this file; main() still prints it
        self.num_val_tokens = 0

        # Deepest stack of open tags seen so far
        self.max_tag_stack = 0

        # Tokens in the documents that validated without an error
        self.num_tokens = 0

        #self.debug_attrs = []
199
200
def main(argv):
    # type: (List[str]) -> int
    """Command-line driver; argv[1] selects the action.

    Actions:
      tokens      Lex stdin and log one line per token.
      lex-htm8    Validate each file named on stdin (one path per line).
      parse-htm8  Like lex-htm8, and also require balanced tags.
      parse-xml   Like parse-htm8, with NO_SPECIAL_TAGS set as well.
      todo        Placeholder; does nothing.

    Returns:
      0 on success; 1 if any file failed with a LexError or ParseError.

    Raises:
      LexError: The 'tokens' action hit an invalid token.
      RuntimeError: The action is not one of the above.
    """
    action = argv[1]

    if action == 'tokens':
        contents = sys.stdin.read()

        lexer = Lexer(contents)
        tok_start = 0
        while True:
            tok_id, tok_end = lexer.Read()
            if tok_id == h8_id.Invalid:
                raise LexError('Invalid token', contents, tok_start)
            if tok_id == h8_id.EndOfStream:
                break

            log('%d %s %r', tok_end, h8_id_str(tok_id),
                contents[tok_start:tok_end])
            tok_start = tok_end

        return 0

    if action in ('lex-htm8', 'parse-htm8', 'parse-xml'):
        flags = LEX_ATTRS | LEX_QUOTED_VALUES
        if action.startswith('parse-'):
            flags |= BALANCED_TAGS
        if action == 'parse-xml':
            flags |= NO_SPECIAL_TAGS

        counters = Counters()
        failures = []  # (filename, exception) pairs

        for line in sys.stdin:
            filename = line.strip()
            with open(filename) as f:
                contents = f.read()

            # A bad file is logged and recorded; the remaining files are
            # still processed.
            try:
                Validate(contents, flags, counters)
            except LexError as e:
                log('Lex error in %r: %s', filename, e)
                failures.append((filename, e))
            except ParseError as e:
                log('Parse error in %r: %s', filename, e)
                failures.append((filename, e))

        log('')
        log('%10d tokens', counters.num_tokens)
        log('%10d start/end tags', counters.num_start_end_tags)
        log('%10d start tags', counters.num_start_tags)
        log('%10d attrs', counters.num_attrs)
        log('%10d max tag stack depth', counters.max_tag_stack)
        log('%10d attr val tokens', counters.num_val_tokens)
        log('%10d errors', len(failures))

        return 1 if failures else 0

    if action == 'todo':
        # Other algorithms:
        #
        # - select first subtree with given ID
        #   - this requires understanding the void tags I suppose
        # - select all subtrees that have a class
        # - materialize DOM

        # Safe-HTM8?  This is a filter
        return 0

    raise RuntimeError('Invalid action %r' % action)
275
276
if __name__ == '__main__':
    # When run as a script, main()'s return value is the process exit status.
    status = main(sys.argv)
    sys.exit(status)