#!/usr/bin/env python2
"""
lazylex/html.py - Wrapper around HTM8

See doc/lazylex.md for details.

"""
from __future__ import print_function

from _devbuild.gen.htm8_asdl import (h8_id, h8_id_t, h8_id_str)
from data_lang.htm8 import (Lexer, TagLexer, AttrValueLexer, LexError,
                            ParseError, Output)
from doctools.util import log

try:
    from cStringIO import StringIO
except ImportError:
    # for python3
    from io import StringIO  # type: ignore
import sys

if sys.version_info.major == 2:
    from typing import List, Tuple, Iterator


def _Tokens(s, left_pos, right_pos):
    # type: (str, int, int) -> Iterator[Tuple[h8_id_t, int]]
    """
    Args:
      s: string to parse
      left_pos, right_pos: Optional span boundaries.
    """
    lx = Lexer(s, left_pos, right_pos)
    while True:
        tok_id, pos = lx.Read()
        yield tok_id, pos
        if tok_id == h8_id.EndOfStream:
            break


def ValidTokens(s, left_pos=0, right_pos=-1):
    # type: (str, int, int) -> Iterator[Tuple[h8_id_t, int]]
    """Wrapper around _Tokens() to prevent callers from having to handle Invalid.

    The two functions aren't combined because we may want to do a 'yield'
    transformation on _Tokens(), and exceptions might complicate that.
    """
    pos = left_pos
    for tok_id, end_pos in _Tokens(s, left_pos, right_pos):
        if tok_id == h8_id.Invalid:
            raise LexError(s, pos)
        yield tok_id, end_pos
        pos = end_pos

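# A minimal usage sketch (not part of the original file): iterate over the
# token stream and print each token type with its end position.  LexError
# propagates out of the generator if the input doesn't lex.
#
#     html = '<p>hi &amp; bye</p>'
#     for tok_id, end_pos in ValidTokens(html):
#         print(h8_id_str(tok_id), end_pos)
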

def ValidTokenList(s, no_special_tags=False):
    # type: (str, bool) -> List[Tuple[h8_id_t, int]]
    """A wrapper that can be more easily translated to C++. Doesn't use iterators."""

    start_pos = 0
    tokens = []
    lx = Lexer(s, no_special_tags=no_special_tags)
    while True:
        tok_id, end_pos = lx.Read()
        tokens.append((tok_id, end_pos))
        if tok_id == h8_id.EndOfStream:
            break
        if tok_id == h8_id.Invalid:
            raise LexError(s, start_pos)
        start_pos = end_pos
    return tokens


def ReadUntilStartTag(it, tag_lexer, tag_name):
    # type: (Iterator[Tuple[h8_id_t, int]], TagLexer, str) -> Tuple[int, int]
    """Find the next <foo>, returning its (start, end) position.

    Raises ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == h8_id.StartTag and tag_lexer.GetTagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No start tag %r' % tag_name)


def ReadUntilEndTag(it, tag_lexer, tag_name):
    # type: (Iterator[Tuple[h8_id_t, int]], TagLexer, str) -> Tuple[int, int]
    """Find the next </foo>, returning its (start, end) position.

    Raises ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == h8_id.EndTag and tag_lexer.GetTagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No end tag %r' % tag_name)


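# A usage sketch (not in the original file): find the span of the first
# <title> ... </title> pair.  The same iterator is advanced by both calls, so
# the end tag search continues where the start tag search stopped.
#
#     it = ValidTokens(html)
#     tag_lexer = TagLexer(html)
#     start, _ = ReadUntilStartTag(it, tag_lexer, 'title')
#     _, end = ReadUntilEndTag(it, tag_lexer, 'title')
#     title_span = html[start:end]

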
CHAR_ENTITY = {
    'amp': '&',
    'lt': '<',
    'gt': '>',
    'quot': '"',
    'apos': "'",
}


def ToText(s, left_pos=0, right_pos=-1):
    # type: (str, int, int) -> str
    """Given HTML, return text by unquoting &gt; and &lt; etc.

    Used by:
      doctools/oils_doc.py: PygmentsPlugin
      doctools/help_gen.py: HelpIndexCards

    In the latter case, we could process some tags, like:

    - Blue Link (not clickable, but still useful)
    - Red X

    That should be html.ToAnsi.
    """
    f = StringIO()
    out = Output(s, f, left_pos, right_pos)

    pos = left_pos
    for tok_id, end_pos in ValidTokens(s, left_pos, right_pos):
        if tok_id in (h8_id.RawData, h8_id.BadAmpersand, h8_id.BadGreaterThan,
                      h8_id.BadLessThan):
            out.SkipTo(pos)
            out.PrintUntil(end_pos)

        elif tok_id == h8_id.CharEntity:  # &amp;

            entity = s[pos + 1:end_pos - 1]

            out.SkipTo(pos)
            out.Print(CHAR_ENTITY[entity])
            out.SkipTo(end_pos)

        # Not handling these yet
        elif tok_id == h8_id.HexChar:
            raise AssertionError('Hex Char %r' % s[pos:pos + 20])

        elif tok_id == h8_id.DecChar:
            raise AssertionError('Dec Char %r' % s[pos:pos + 20])

        else:
            # Skip everything else
            out.SkipTo(end_pos)

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()


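# A minimal sketch of the behavior (not in the original file): tags are
# dropped, raw data is kept, and the five entities in CHAR_ENTITY are
# decoded, so something like this should hold:
#
#     ToText('foo &amp; <b>bar</b>')  # => 'foo & bar'

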
# https://developer.mozilla.org/en-US/docs/Glossary/Void_element
VOID_ELEMENTS = [
    'area',
    'base',
    'br',
    'col',
    'embed',
    'hr',
    'img',
    'input',
    'link',
    'meta',
    'param',
    'source',
    'track',
    'wbr',
]

LEX_ATTRS = 1 << 1
LEX_QUOTED_VALUES = 1 << 2  # href="?x=42&amp;y=99"
NO_SPECIAL_TAGS = 1 << 3  # <script> <style>, VOID tags, etc.
BALANCED_TAGS = 1 << 4  # are tags balanced?
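
# How main() below combines these flags (a note, not in the original file):
# 'lex-htm8' uses LEX_ATTRS | LEX_QUOTED_VALUES; 'parse-htm8' adds
# BALANCED_TAGS; and 'parse-xml' adds NO_SPECIAL_TAGS on top of that, e.g.:
#
#     flags = LEX_ATTRS | LEX_QUOTED_VALUES | BALANCED_TAGS | NO_SPECIAL_TAGS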


def Validate(contents, flags, counters):
    # type: (str, int, Counters) -> None

    tag_lexer = TagLexer(contents)
    val_lexer = AttrValueLexer(contents)

    no_special_tags = bool(flags & NO_SPECIAL_TAGS)
    lx = Lexer(contents, no_special_tags=no_special_tags)
    tokens = []
    start_pos = 0
    tag_stack = []
    while True:
        tok_id, end_pos = lx.Read()
        #log('TOP %s %r', h8_id_str(tok_id), contents[start_pos:end_pos])

        if tok_id == h8_id.Invalid:
            raise LexError(contents, start_pos)
        if tok_id == h8_id.EndOfStream:
            break

        tokens.append((tok_id, end_pos))

        if tok_id == h8_id.StartEndTag:
            counters.num_start_end_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRawSlice()
            counters.num_attrs += len(all_attrs)
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                counters.num_val_tokens += val_lexer.NumTokens()

            #counters.debug_attrs.extend(all_attrs)

        elif tok_id == h8_id.StartTag:
            counters.num_start_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRawSlice()
            counters.num_attrs += len(all_attrs)
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                counters.num_val_tokens += val_lexer.NumTokens()

            #counters.debug_attrs.extend(all_attrs)

            if flags & BALANCED_TAGS:
                tag_name = lx.CanonicalTagName()
                if flags & NO_SPECIAL_TAGS:
                    tag_stack.append(tag_name)
                else:
                    # e.g. <meta> is considered self-closing, like <meta/>
                    if tag_name not in VOID_ELEMENTS:
                        tag_stack.append(tag_name)

            counters.max_tag_stack = max(counters.max_tag_stack,
                                         len(tag_stack))
        elif tok_id == h8_id.EndTag:
            if flags & BALANCED_TAGS:
                try:
                    expected = tag_stack.pop()
                except IndexError:
                    raise ParseError('Tag stack empty',
                                     s=contents,
                                     start_pos=start_pos)

                actual = lx.CanonicalTagName()
                if expected != actual:
                    raise ParseError(
                        'Got unexpected closing tag %r; opening tag was %r' %
                        (contents[start_pos:end_pos], expected),
                        s=contents,
                        start_pos=start_pos)

        start_pos = end_pos

    if len(tag_stack) != 0:
        raise ParseError('Missing closing tags at end of doc: %s' %
                         ' '.join(tag_stack),
                         s=contents,
                         start_pos=start_pos)

    counters.num_tokens += len(tokens)

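# A usage sketch (not in the original file): validate a document and read the
# counts back off the Counters object.
#
#     counters = Counters()
#     Validate('<p class="x">hi</p>',
#              LEX_ATTRS | LEX_QUOTED_VALUES | BALANCED_TAGS, counters)
#     print(counters.num_start_tags)  # => 1

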
def ToXml(htm8_str):
    # type: (str) -> str

    # TODO:
    # 1. Lex it
    # 2. < & > must be escaped
    #    a. in raw data
    #    b. in quoted strings
    # 3. <script> turned into CDATA
    # 4. void tags turned into self-closing tags
    # 5. case-sensitive tag matching - not sure about this

    tag_lexer = TagLexer(htm8_str)
    val_lexer = AttrValueLexer(htm8_str)

    f = StringIO()
    out = Output(htm8_str, f)

    lx = Lexer(htm8_str)

    pos = 0
    while True:
        tok_id, end_pos = lx.Read()

        if tok_id == h8_id.Invalid:
            raise LexError(htm8_str, pos)
        if tok_id == h8_id.EndOfStream:
            break

        if tok_id in (h8_id.RawData, h8_id.CharEntity, h8_id.HexChar,
                      h8_id.DecChar):
            out.PrintUntil(end_pos)
        elif tok_id in (h8_id.StartTag, h8_id.StartEndTag):
            tag_lexer.Reset(pos, end_pos)
            # TODO: reduce allocations here
            all_attrs = tag_lexer.AllAttrsRawSlice()
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                # TODO: get the kind of string
                #
                # Quoted:   we need to replace & with &amp; and < with &lt;
                #           note: > is not allowed
                # Unquoted: right now we can just surround it with double
                #           quotes, because we don't allow any bad chars
                # Empty:    add "", so empty= becomes =""
                # Missing:  add ="", so missing becomes missing=""

            tag_name = lx.CanonicalTagName()
            if tok_id == h8_id.StartTag and tag_name in VOID_ELEMENTS:
                # TODO: instead of closing >, print />
                pass

        elif tok_id == h8_id.BadAmpersand:
            #out.SkipTo(pos)
            out.Print('&amp;')
            out.SkipTo(end_pos)

        elif tok_id == h8_id.BadGreaterThan:
            #out.SkipTo(pos)
            out.Print('&gt;')
            out.SkipTo(end_pos)
        else:
            out.PrintUntil(end_pos)

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()

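# A sketch of what's implemented so far (not in the original file): stray '&'
# and '>' in raw data are escaped; the TODOs above (CDATA, self-closing void
# tags, attribute quoting) are not done yet.
#
#     ToXml('<p>a & b</p>')  # => '<p>a &amp; b</p>'

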
class Counters(object):

    def __init__(self):
        # type: () -> None
        self.num_tokens = 0
        self.num_start_tags = 0
        self.num_start_end_tags = 0
        self.num_attrs = 0
        self.max_tag_stack = 0
        self.num_val_tokens = 0

        #self.debug_attrs = []


def main(argv):
    # type: (List[str]) -> int
    action = argv[1]

    if action == 'tokens':
        contents = sys.stdin.read()

        lx = Lexer(contents)
        start_pos = 0
        while True:
            tok_id, end_pos = lx.Read()
            if tok_id == h8_id.Invalid:
                raise LexError(contents, start_pos)
            if tok_id == h8_id.EndOfStream:
                break

            frag = contents[start_pos:end_pos]
            log('%d %s %r', end_pos, h8_id_str(tok_id), frag)
            start_pos = end_pos

        return 0

    elif action in ('lex-htm8', 'parse-htm8', 'parse-xml'):

        errors = []
        counters = Counters()

        flags = LEX_ATTRS | LEX_QUOTED_VALUES
        if action.startswith('parse-'):
            flags |= BALANCED_TAGS
        if action == 'parse-xml':
            flags |= NO_SPECIAL_TAGS

        i = 0
        for line in sys.stdin:
            filename = line.strip()
            with open(filename) as f:
                contents = f.read()

            try:
                Validate(contents, flags, counters)
            except LexError as e:
                log('Lex error in %r: %s', filename, e)
                errors.append((filename, e))
            except ParseError as e:
                log('Parse error in %r: %s', filename, e)
                errors.append((filename, e))
            i += 1

        log('')
        log('%10d tokens', counters.num_tokens)
        log('%10d start/end tags', counters.num_start_end_tags)
        log('%10d start tags', counters.num_start_tags)
        log('%10d attrs', counters.num_attrs)
        log('%10d max tag stack depth', counters.max_tag_stack)
        log('%10d attr val tokens', counters.num_val_tokens)
        log('%10d errors', len(errors))
        if len(errors):
            return 1
        return 0

    elif action == 'todo':
        # Other algorithms:
        #
        # - select first subtree with given ID
        #   - this requires understanding the void tags I suppose
        # - select all subtrees that have a class
        # - materialize DOM

        # Safe-HTM8? This is a filter
        return 0

    else:
        raise RuntimeError('Invalid action %r' % action)

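# Example invocations, as a sketch (this file is executable via the shebang
# above; the exact paths are assumptions, not from the original file):
#
#     echo '<p>hi</p>' | lazylex/html.py tokens
#     find . -name '*.html' | lazylex/html.py parse-htm8
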
if __name__ == '__main__':
    sys.exit(main(sys.argv))