#!/usr/bin/env python2
"""
lazylex/html.py - Wrapper around HTM8

See doc/lazylex.md for details.

"""
from __future__ import print_function

from _devbuild.gen.htm8_asdl import (h8_id, h8_id_t, h8_id_str)
from data_lang import htm8
from data_lang.htm8 import (Lexer, TagLexer, AttrValueLexer, LexError,
                            ParseError, Output)
from doctools.util import log

try:
    from cStringIO import StringIO
except ImportError:
    # for python3
    from io import StringIO  # type: ignore
import sys

if sys.version_info.major == 2:
    from typing import List, Tuple, Iterator


def _Tokens(s, left_pos, right_pos):
    # type: (str, int, int) -> Iterator[Tuple[h8_id_t, int]]
    """
    Args:
      s: string to parse
      left_pos, right_pos: Optional span boundaries.
    """
    lx = Lexer(s, left_pos, right_pos)
    while True:
        tok_id, pos = lx.Read()
        yield tok_id, pos
        if tok_id == h8_id.EndOfStream:
            break


def ValidTokens(s, left_pos=0, right_pos=-1):
    # type: (str, int, int) -> Iterator[Tuple[h8_id_t, int]]
44 """Wrapper around _Tokens to prevent callers from having to handle Invalid.
45
46 I'm not combining the two functions because I might want to do a
47 'yield' transformation on Tokens()? Exceptions might complicate the
48 issue?
49 """
    pos = left_pos
    for tok_id, end_pos in _Tokens(s, left_pos, right_pos):
        if tok_id == h8_id.Invalid:
            raise LexError('ValidTokens() got invalid token', s, pos)
        yield tok_id, end_pos
        pos = end_pos

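# Sketch of how ValidTokens() is consumed (hypothetical input; h8_id_str is
# the same helper used in main() below):
#
#   for tok_id, end_pos in ValidTokens('<p>hi</p>'):
#       print(h8_id_str(tok_id), end_pos)
#
# Each pair is (token ID, end position); a token starts where the previous
# one ended, which is why callers track 'pos' themselves.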

def ReadUntilStartTag(it, tag_lexer, tag_name):
    # type: (Iterator[Tuple[h8_id_t, int]], TagLexer, str) -> Tuple[int, int]
    """Find the next <foo>, returning its (start, end) positions

    Raise ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == h8_id.StartTag and tag_lexer.GetTagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No start tag %r' % tag_name)


def ReadUntilEndTag(it, tag_lexer, tag_name):
    # type: (Iterator[Tuple[h8_id_t, int]], TagLexer, str) -> Tuple[int, int]
    """Find the next </foo>, returning its (start, end) positions

    Raise ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == h8_id.EndTag and tag_lexer.GetTagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No end tag %r' % tag_name)

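# Sketch of how the two helpers compose (hypothetical <pre> example; assumes
# the iterator and the TagLexer were built from the same string 's'):
#
#   it = ValidTokens(s)
#   tag_lexer = TagLexer(s)
#   start, _ = ReadUntilStartTag(it, tag_lexer, 'pre')
#   _, end = ReadUntilEndTag(it, tag_lexer, 'pre')
#   pre_slice = s[start:end]  # spans <pre> ... </pre> inclusive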

CHAR_ENTITY = {
    'amp': '&',
    'lt': '<',
    'gt': '>',
    'quot': '"',
    'apos': "'",
}

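# For example, a CharEntity token spanning '&amp;' has name
# s[pos + 1:end_pos - 1] == 'amp', which CHAR_ENTITY maps to '&' -- this is
# exactly the slice ToText() uses below.  Numeric entities are separate
# HexChar/DecChar tokens.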

def ToText(s, left_pos=0, right_pos=-1):
    # type: (str, int, int) -> str
    """Given HTML, return text by unquoting &gt; and &lt; etc.

    Used by:
      doctools/oils_doc.py: PygmentsPlugin
      doctools/help_gen.py: HelpIndexCards

    In the latter case, we could process some tags, like:

    - Blue Link (not clickable, but still useful)
    - Red X

    That should be html.ToAnsi.
    """
    f = StringIO()
    out = Output(s, f, left_pos, right_pos)

    pos = left_pos
    for tok_id, end_pos in ValidTokens(s, left_pos, right_pos):
        if tok_id in (h8_id.RawData, h8_id.BadAmpersand, h8_id.BadGreaterThan,
                      h8_id.BadLessThan):
            out.SkipTo(pos)
            out.PrintUntil(end_pos)

        elif tok_id == h8_id.CharEntity:  # &amp;

            entity = s[pos + 1:end_pos - 1]

            out.SkipTo(pos)
            out.Print(CHAR_ENTITY[entity])
            out.SkipTo(end_pos)

        # Not handling these yet
        elif tok_id == h8_id.HexChar:
            raise AssertionError('Hex Char %r' % s[pos:pos + 20])

        elif tok_id == h8_id.DecChar:
            raise AssertionError('Dec Char %r' % s[pos:pos + 20])

        else:
            # Skip everything else
            out.SkipTo(end_pos)

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()

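# Usage sketch (hypothetical input; the result follows from the loop above):
#
#   ToText('<b>x &amp; y</b>')  # => 'x & y'
#
# Tags are skipped, raw data is copied through, and known named entities are
# decoded via CHAR_ENTITY.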

# https://developer.mozilla.org/en-US/docs/Glossary/Void_element
VOID_ELEMENTS = [
    'area',
    'base',
    'br',
    'col',
    'embed',
    'hr',
    'img',
    'input',
    'link',
    'meta',
    'param',
    'source',
    'track',
    'wbr',
]

LEX_ATTRS = 1 << 1
LEX_QUOTED_VALUES = 1 << 2  # href="?x=42&amp;y=99"
NO_SPECIAL_TAGS = 1 << 3  # <script> <style>, VOID tags, etc.
BALANCED_TAGS = 1 << 4  # are tags balanced?

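# The flags are combined with bitwise OR, as in main() below:
#
#   flags = LEX_ATTRS | LEX_QUOTED_VALUES
#   flags |= BALANCED_TAGS     # 'parse-*' actions check tag nesting
#   flags |= NO_SPECIAL_TAGS   # 'parse-xml' treats <script> like any tag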

def ValidateOld(contents, flags, counters):
    # type: (str, int, Counters) -> None

    tag_lexer = TagLexer(contents)
    val_lexer = AttrValueLexer(contents)

    no_special_tags = bool(flags & NO_SPECIAL_TAGS)
    lx = Lexer(contents, no_special_tags=no_special_tags)
    tokens = []
    start_pos = 0
    tag_stack = []
    while True:
        tok_id, end_pos = lx.Read()
        #log('TOP %s %r', h8_id_str(tok_id), contents[start_pos:end_pos])

        if tok_id == h8_id.Invalid:
            raise LexError('ValidateOld() got invalid token', contents,
                           start_pos)
        if tok_id == h8_id.EndOfStream:
            break

        tokens.append((tok_id, end_pos))

        if tok_id == h8_id.StartEndTag:
            counters.num_start_end_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRawSlice()
            counters.num_attrs += len(all_attrs)
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                counters.num_val_tokens += val_lexer.NumTokens()

            #counters.debug_attrs.extend(all_attrs)

        elif tok_id == h8_id.StartTag:
            counters.num_start_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRawSlice()
            counters.num_attrs += len(all_attrs)
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                counters.num_val_tokens += val_lexer.NumTokens()

            #counters.debug_attrs.extend(all_attrs)

            if flags & BALANCED_TAGS:
                tag_name = lx.CanonicalTagName()
                if flags & NO_SPECIAL_TAGS:
                    tag_stack.append(tag_name)
                else:
                    # e.g. <meta> is considered self-closing, like <meta/>
                    if tag_name not in VOID_ELEMENTS:
                        tag_stack.append(tag_name)

            counters.max_tag_stack = max(counters.max_tag_stack,
                                         len(tag_stack))
        elif tok_id == h8_id.EndTag:
            if flags & BALANCED_TAGS:
                try:
                    expected = tag_stack.pop()
                except IndexError:
                    raise ParseError('Tag stack empty',
                                     s=contents,
                                     start_pos=start_pos)

                actual = lx.CanonicalTagName()
                if expected != actual:
                    raise ParseError(
                        'Got unexpected closing tag %r; opening tag was %r' %
                        (contents[start_pos:end_pos], expected),
                        s=contents,
                        start_pos=start_pos)

        start_pos = end_pos

    if len(tag_stack) != 0:
        raise ParseError('Missing closing tags at end of doc: %s' %
                         ' '.join(tag_stack),
                         s=contents,
                         start_pos=start_pos)

    counters.num_tokens += len(tokens)


def Validate(contents, flags, counters):
    # type: (str, int, Counters) -> None

    attr_lx = htm8.AttrLexer(contents)

    no_special_tags = bool(flags & NO_SPECIAL_TAGS)
    lx = htm8.Lexer(contents, no_special_tags=no_special_tags)
    tokens = []
    start_pos = 0
    tag_stack = []
    while True:
        tok_id, end_pos = lx.Read()
        #log('TOP %s %r', h8_id_str(tok_id), contents[start_pos:end_pos])

        if tok_id == h8_id.Invalid:
            raise LexError('Validate() got invalid token', contents, start_pos)
        if tok_id == h8_id.EndOfStream:
            break

        tokens.append((tok_id, end_pos))

        if tok_id == h8_id.StartEndTag:
            counters.num_start_end_tags += 1

            attr_lx.Init(lx.TagNamePos(), end_pos)
            all_attrs = htm8.AllAttrsRaw(attr_lx)
            counters.num_attrs += len(all_attrs)
            # TODO: val_lexer.NumTokens() can be replaced with tokens_out

        elif tok_id == h8_id.StartTag:
            counters.num_start_tags += 1

            attr_lx.Init(lx.TagNamePos(), end_pos)
            all_attrs = htm8.AllAttrsRaw(attr_lx)
            counters.num_attrs += len(all_attrs)

            #counters.debug_attrs.extend(all_attrs)

            if flags & BALANCED_TAGS:
                tag_name = lx.CanonicalTagName()
                if flags & NO_SPECIAL_TAGS:
                    tag_stack.append(tag_name)
                else:
                    # e.g. <meta> is considered self-closing, like <meta/>
                    if tag_name not in VOID_ELEMENTS:
                        tag_stack.append(tag_name)

            counters.max_tag_stack = max(counters.max_tag_stack,
                                         len(tag_stack))
        elif tok_id == h8_id.EndTag:
            if flags & BALANCED_TAGS:
                try:
                    expected = tag_stack.pop()
                except IndexError:
                    raise ParseError('Tag stack empty',
                                     s=contents,
                                     start_pos=start_pos)

                actual = lx.CanonicalTagName()
                if expected != actual:
                    raise ParseError(
                        'Got unexpected closing tag %r; opening tag was %r' %
                        (contents[start_pos:end_pos], expected),
                        s=contents,
                        start_pos=start_pos)

        start_pos = end_pos

    if len(tag_stack) != 0:
        raise ParseError('Missing closing tags at end of doc: %s' %
                         ' '.join(tag_stack),
                         s=contents,
                         start_pos=start_pos)

    counters.num_tokens += len(tokens)

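# Usage sketch (hypothetical document; counters accumulate across calls):
#
#   counters = Counters()
#   Validate('<p class="x">hi</p>', LEX_ATTRS | BALANCED_TAGS, counters)
#   # counters.num_start_tags == 1, counters.num_attrs == 1
#
# LexError is raised for invalid tokens, ParseError for unbalanced tags.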

def ToXml(htm8_str):
    # type: (str) -> str

    # TODO:
    # 1. Lex it
    # 2. < & > must be escaped
    #    a. in raw data
    #    b. in quoted strings
    # 3. <script> turned into CDATA
    # 4. void tags turned into self-closing tags
    # 5. case-sensitive tag matching - not sure about this

    tag_lexer = TagLexer(htm8_str)
    val_lexer = AttrValueLexer(htm8_str)

    f = StringIO()
    out = Output(htm8_str, f)

    lx = Lexer(htm8_str)

    pos = 0
    while True:
        tok_id, end_pos = lx.Read()

        if tok_id == h8_id.Invalid:
            raise LexError('ToXml() got invalid token', htm8_str, pos)
        if tok_id == h8_id.EndOfStream:
            break

        if tok_id in (h8_id.RawData, h8_id.CharEntity, h8_id.HexChar,
                      h8_id.DecChar):
            out.PrintUntil(end_pos)
        elif tok_id in (h8_id.StartTag, h8_id.StartEndTag):
            tag_lexer.Reset(pos, end_pos)
            # TODO: reduce allocations here
            all_attrs = tag_lexer.AllAttrsRawSlice()
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                # TODO: get the kind of string
                #
                # Quoted  : we need to replace & with &amp; and < with &lt;
                #           note > is not allowed
                # Unquoted: right now, we can just surround with double quotes
                #           because we don't allow any bad chars
                # Empty   : add "", so empty= becomes =""
                # Missing : add ="", so missing becomes missing=""

            tag_name = lx.CanonicalTagName()
            if tok_id == h8_id.StartTag and tag_name in VOID_ELEMENTS:
                # TODO: instead of closing >, print />
                pass

        elif tok_id == h8_id.BadAmpersand:
            #out.SkipTo(pos)
            out.Print('&amp;')
            out.SkipTo(end_pos)

        elif tok_id == h8_id.BadGreaterThan:
            #out.SkipTo(pos)
            out.Print('&gt;')
            out.SkipTo(end_pos)
        else:
            out.PrintUntil(end_pos)

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()

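# Sketch of the current behavior (several TODO items above are still open, so
# this is not a complete XML conversion yet).  Assuming the lexer emits
# BadAmpersand/BadGreaterThan for stray characters in raw data:
#
#   ToXml('x & y > z')  # => 'x &amp; y &gt; z'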

class Counters(object):

    def __init__(self):
        # type: () -> None
        self.num_tokens = 0
        self.num_start_tags = 0
        self.num_start_end_tags = 0
        self.num_attrs = 0
        self.max_tag_stack = 0
        self.num_val_tokens = 0

        #self.debug_attrs = []


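# Example invocations (sketch; 'tokens' reads a document from stdin, while
# the lex/parse actions read a list of filenames from stdin):
#
#   echo '<p>hi</p>' | python2 lazylex/html.py tokens
#   find . -name '*.html' | python2 lazylex/html.py parse-htm8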
def main(argv):
    # type: (List[str]) -> int
    action = argv[1]

    if action == 'tokens':
        contents = sys.stdin.read()

        lx = Lexer(contents)
        start_pos = 0
        while True:
            tok_id, end_pos = lx.Read()
            if tok_id == h8_id.Invalid:
                raise LexError('Invalid token', contents, start_pos)
            if tok_id == h8_id.EndOfStream:
                break

            frag = contents[start_pos:end_pos]
            log('%d %s %r', end_pos, h8_id_str(tok_id), frag)
            start_pos = end_pos

        return 0

    elif action in ('lex-htm8', 'parse-htm8', 'parse-xml'):

        errors = []
        counters = Counters()

        flags = LEX_ATTRS | LEX_QUOTED_VALUES
        if action.startswith('parse-'):
            flags |= BALANCED_TAGS
        if action == 'parse-xml':
            flags |= NO_SPECIAL_TAGS

        i = 0
        for line in sys.stdin:
            filename = line.strip()
            with open(filename) as f:
                contents = f.read()

            try:
                Validate(contents, flags, counters)
            except LexError as e:
                log('Lex error in %r: %s', filename, e)
                errors.append((filename, e))
            except ParseError as e:
                log('Parse error in %r: %s', filename, e)
                errors.append((filename, e))
            i += 1

        log('')
        log('%10d tokens', counters.num_tokens)
        log('%10d start/end tags', counters.num_start_end_tags)
        log('%10d start tags', counters.num_start_tags)
        log('%10d attrs', counters.num_attrs)
        log('%10d max tag stack depth', counters.max_tag_stack)
        log('%10d attr val tokens', counters.num_val_tokens)
        log('%10d errors', len(errors))
        if len(errors):
            return 1
        return 0

    elif action == 'todo':
        # Other algorithms:
        #
        # - select first subtree with given ID
        #   - this requires understanding the void tags I suppose
        # - select all subtrees that have a class
        # - materialize DOM

        # Safe-HTM8? This is a filter
        return 0

    else:
        raise RuntimeError('Invalid action %r' % action)


if __name__ == '__main__':
    sys.exit(main(sys.argv))