#!/usr/bin/env python2
"""
lazylex/html.py - Wrapper around HTM8

See doc/lazylex.md for details.

"""
from __future__ import print_function

from _devbuild.gen.htm8_asdl import (h8_id, h8_id_t, h8_id_str)
from data_lang.htm8 import (Lexer, TagLexer, AttrValueLexer, LexError,
                            ParseError, Output)
from doctools.util import log

try:
    from cStringIO import StringIO
except ImportError:
    # for python3
    from io import StringIO  # type: ignore
import sys

if sys.version_info.major == 2:
    from typing import List, Tuple, Iterator


def _Tokens(s, left_pos, right_pos):
    # type: (str, int, int) -> Iterator[Tuple[h8_id_t, int]]
    """
    Args:
      s: string to parse
      left_pos, right_pos: Optional span boundaries.
    """
    lx = Lexer(s, left_pos, right_pos)
    while True:
        tok_id, pos = lx.Read()
        yield tok_id, pos
        if tok_id == h8_id.EndOfStream:
            break


def ValidTokens(s, left_pos=0, right_pos=-1):
    # type: (str, int, int) -> Iterator[Tuple[h8_id_t, int]]
    """Wrapper around _Tokens that saves callers from handling h8_id.Invalid.

    The two functions aren't combined because we may want to do a 'yield'
    transformation on _Tokens(), and exceptions could complicate that.
    """
    pos = left_pos
    for tok_id, end_pos in _Tokens(s, left_pos, right_pos):
        if tok_id == h8_id.Invalid:
            raise LexError(s, pos)
        yield tok_id, end_pos
        pos = end_pos
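

# A sketch of typical use: tokens are (id, end_pos) pairs, and a token's text
# is the slice between the previous end position and its own.
#
#   s = '<p>hi &amp; bye</p>'
#   pos = 0
#   for tok_id, end_pos in ValidTokens(s):
#       print(h8_id_str(tok_id), s[pos:end_pos])
#       pos = end_pos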


def ReadUntilStartTag(it, tag_lexer, tag_name):
    # type: (Iterator[Tuple[h8_id_t, int]], TagLexer, str) -> Tuple[int, int]
    """Find the next <foo>, returning its (start, end) positions.

    Raises ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == h8_id.StartTag and tag_lexer.GetTagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No start tag %r' % tag_name)


def ReadUntilEndTag(it, tag_lexer, tag_name):
    # type: (Iterator[Tuple[h8_id_t, int]], TagLexer, str) -> Tuple[int, int]
    """Find the next </foo>, returning its (start, end) positions.

    Raises ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == h8_id.EndTag and tag_lexer.GetTagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No end tag %r' % tag_name)
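

# A sketch of how these helpers compose (illustrative, not called here):
# slice out everything between <body> and </body>.  The start tag's end
# position and the end tag's start position bound the inner content.
#
#   it = ValidTokens(s)
#   tag_lexer = TagLexer(s)
#   _, body_start = ReadUntilStartTag(it, tag_lexer, 'body')
#   body_end, _ = ReadUntilEndTag(it, tag_lexer, 'body')
#   inner = s[body_start:body_end]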


CHAR_ENTITY = {
    'amp': '&',
    'lt': '<',
    'gt': '>',
    'quot': '"',
    'apos': "'",
}


def ToText(s, left_pos=0, right_pos=-1):
    # type: (str, int, int) -> str
    """Given HTML, return text by unquoting &gt; and &lt; etc.

    Used by:
      doctools/oils_doc.py: PygmentsPlugin
      doctools/help_gen.py: HelpIndexCards

    In the latter case, we could process some tags, like:

    - Blue Link (not clickable, but still useful)
    - Red X

    That should be html.ToAnsi.
    """
    f = StringIO()
    out = Output(s, f, left_pos, right_pos)

    pos = left_pos
    for tok_id, end_pos in ValidTokens(s, left_pos, right_pos):
        if tok_id in (h8_id.RawData, h8_id.BadAmpersand, h8_id.BadGreaterThan,
                      h8_id.BadLessThan):
            out.SkipTo(pos)
            out.PrintUntil(end_pos)

        elif tok_id == h8_id.CharEntity:  # &amp;

            entity = s[pos + 1:end_pos - 1]

            out.SkipTo(pos)
            out.Print(CHAR_ENTITY[entity])
            out.SkipTo(end_pos)

        # Not handling these yet
        elif tok_id == h8_id.HexChar:
            raise AssertionError('Hex Char %r' % s[pos:pos + 20])

        elif tok_id == h8_id.DecChar:
            raise AssertionError('Dec Char %r' % s[pos:pos + 20])

        else:
            # Skip everything else
            out.SkipTo(end_pos)

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()
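

# For example (a sketch; assumes the input only uses entities in CHAR_ENTITY):
#
#   ToText('<b>1 &lt; 2 &amp;&amp; 4 &gt; 3</b>')
#   # => '1 < 2 && 4 > 3'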


# https://developer.mozilla.org/en-US/docs/Glossary/Void_element
VOID_ELEMENTS = [
    'area',
    'base',
    'br',
    'col',
    'embed',
    'hr',
    'img',
    'input',
    'link',
    'meta',
    'param',
    'source',
    'track',
    'wbr',
]

LEX_ATTRS = 1 << 1
LEX_QUOTED_VALUES = 1 << 2  # href="?x=42&amp;y=99"
NO_SPECIAL_TAGS = 1 << 3  # <script> <style>, VOID tags, etc.
BALANCED_TAGS = 1 << 4  # are tags balanced?
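

# The flags combine bitwise.  For instance, main() below validates XML-like
# input with all four:
#
#   flags = LEX_ATTRS | LEX_QUOTED_VALUES | BALANCED_TAGS | NO_SPECIAL_TAGS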


def Validate(contents, flags, counters):
    # type: (str, int, Counters) -> None

    tag_lexer = TagLexer(contents)
    val_lexer = AttrValueLexer(contents)

    no_special_tags = bool(flags & NO_SPECIAL_TAGS)
    lx = Lexer(contents, no_special_tags=no_special_tags)
    tokens = []
    start_pos = 0
    tag_stack = []
    while True:
        tok_id, end_pos = lx.Read()
        #log('TOP %s %r', h8_id_str(tok_id), contents[start_pos:end_pos])

        if tok_id == h8_id.Invalid:
            raise LexError(contents, start_pos)
        if tok_id == h8_id.EndOfStream:
            break

        tokens.append((tok_id, end_pos))

        if tok_id == h8_id.StartEndTag:
            counters.num_start_end_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRawSlice()
            counters.num_attrs += len(all_attrs)
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                counters.num_val_tokens += val_lexer.NumTokens()

            #counters.debug_attrs.extend(all_attrs)

        elif tok_id == h8_id.StartTag:
            counters.num_start_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRawSlice()
            counters.num_attrs += len(all_attrs)
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                counters.num_val_tokens += val_lexer.NumTokens()

            #counters.debug_attrs.extend(all_attrs)

            if flags & BALANCED_TAGS:
                tag_name = lx.CanonicalTagName()
                if flags & NO_SPECIAL_TAGS:
                    tag_stack.append(tag_name)
                else:
                    # e.g. <meta> is considered self-closing, like <meta/>
                    if tag_name not in VOID_ELEMENTS:
                        tag_stack.append(tag_name)

                counters.max_tag_stack = max(counters.max_tag_stack,
                                             len(tag_stack))
        elif tok_id == h8_id.EndTag:
            if flags & BALANCED_TAGS:
                try:
                    expected = tag_stack.pop()
                except IndexError:
                    raise ParseError('Tag stack empty',
                                     s=contents,
                                     start_pos=start_pos)

                actual = lx.CanonicalTagName()
                if expected != actual:
                    raise ParseError(
                        'Got unexpected closing tag %r; opening tag was %r' %
                        (contents[start_pos:end_pos], expected),
                        s=contents,
                        start_pos=start_pos)

        start_pos = end_pos

    if len(tag_stack) != 0:
        raise ParseError('Missing closing tags at end of doc: %s' %
                         ' '.join(tag_stack),
                         s=contents,
                         start_pos=start_pos)

    counters.num_tokens += len(tokens)
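

# A usage sketch, mirroring the 'parse-htm8' action in main() below:
#
#   counters = Counters()
#   flags = LEX_ATTRS | LEX_QUOTED_VALUES | BALANCED_TAGS
#   Validate(contents, flags, counters)  # raises LexError or ParseError
#   log('%d tokens', counters.num_tokens)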


def ToXml(htm8_str):
    # type: (str) -> str

    # TODO:
    # 1. Lex it
    # 2. < & > must be escaped
    #    a. in raw data
    #    b. in quoted strings
    # 3. <script> turned into CDATA
    # 4. void tags turned into self-closing tags
    # 5. case-sensitive tag matching - not sure about this

    tag_lexer = TagLexer(htm8_str)
    val_lexer = AttrValueLexer(htm8_str)

    f = StringIO()
    out = Output(htm8_str, f)

    lx = Lexer(htm8_str)

    pos = 0
    while True:
        tok_id, end_pos = lx.Read()

        if tok_id == h8_id.Invalid:
            raise LexError(htm8_str, pos)
        if tok_id == h8_id.EndOfStream:
            break

        if tok_id in (h8_id.RawData, h8_id.CharEntity, h8_id.HexChar,
                      h8_id.DecChar):
            out.PrintUntil(end_pos)
        elif tok_id in (h8_id.StartTag, h8_id.StartEndTag):
            tag_lexer.Reset(pos, end_pos)
            # TODO: reduce allocations here
            all_attrs = tag_lexer.AllAttrsRawSlice()
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                # TODO: get the kind of string
                #
                # Quoted:   we need to replace & with &amp; and < with &lt;
                #           note > is not allowed
                # Unquoted: right now, we can just surround with double quotes
                #           because we don't allow any bad chars
                # Empty:    add "", so empty= becomes =""
                # Missing:  add ="", so missing becomes missing=""

            tag_name = lx.CanonicalTagName()
            if tok_id == h8_id.StartTag and tag_name in VOID_ELEMENTS:
                # TODO: instead of closing >, print />
                pass

        elif tok_id == h8_id.BadAmpersand:
            #out.SkipTo(pos)
            out.Print('&amp;')
            out.SkipTo(end_pos)

        elif tok_id == h8_id.BadGreaterThan:
            #out.SkipTo(pos)
            out.Print('&gt;')
            out.SkipTo(end_pos)
        else:
            out.PrintUntil(end_pos)

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()
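

# The TODOs above are incomplete; what works now is escaping stray '&' and '>'
# in raw data.  An illustrative example (assuming the lexer emits BadAmpersand
# and BadGreaterThan for them):
#
#   ToXml('<p>a > b &</p>')
#   # => '<p>a &gt; b &amp;</p>'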


class Counters(object):

    def __init__(self):
        # type: () -> None
        self.num_tokens = 0
        self.num_start_tags = 0
        self.num_start_end_tags = 0
        self.num_attrs = 0
        self.max_tag_stack = 0
        self.num_val_tokens = 0

        #self.debug_attrs = []
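

# Invocation sketches (assuming this file is run as a script from the repo
# root):
#
#   echo '<p>hi</p>' | lazylex/html.py tokens
#   echo foo.html | lazylex/html.py parse-htm8    # filenames on stdin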


def main(argv):
    # type: (List[str]) -> int
    action = argv[1]

    if action == 'tokens':
        contents = sys.stdin.read()

        lx = Lexer(contents)
        start_pos = 0
        while True:
            tok_id, end_pos = lx.Read()
            if tok_id == h8_id.Invalid:
                raise LexError(contents, start_pos)
            if tok_id == h8_id.EndOfStream:
                break

            frag = contents[start_pos:end_pos]
            log('%d %s %r', end_pos, h8_id_str(tok_id), frag)
            start_pos = end_pos

        return 0

    elif action in ('lex-htm8', 'parse-htm8', 'parse-xml'):

        errors = []
        counters = Counters()

        flags = LEX_ATTRS | LEX_QUOTED_VALUES
        if action.startswith('parse-'):
            flags |= BALANCED_TAGS
        if action == 'parse-xml':
            flags |= NO_SPECIAL_TAGS

        i = 0
        for line in sys.stdin:
            filename = line.strip()
            with open(filename) as f:
                contents = f.read()

            try:
                Validate(contents, flags, counters)
            except LexError as e:
                log('Lex error in %r: %s', filename, e)
                errors.append((filename, e))
            except ParseError as e:
                log('Parse error in %r: %s', filename, e)
                errors.append((filename, e))
            i += 1

        log('')
        log('%10d tokens', counters.num_tokens)
        log('%10d start/end tags', counters.num_start_end_tags)
        log('%10d start tags', counters.num_start_tags)
        log('%10d attrs', counters.num_attrs)
        log('%10d max tag stack depth', counters.max_tag_stack)
        log('%10d attr val tokens', counters.num_val_tokens)
        log('%10d errors', len(errors))
        if len(errors):
            return 1
        return 0

    elif action == 'todo':
        # Other algorithms:
        #
        # - select first subtree with given ID
        #   - this requires understanding the void tags I suppose
        # - select all subtrees that have a class
        # - materialize DOM

        # Safe-HTM8? This is a filter
        return 0

    else:
        raise RuntimeError('Invalid action %r' % action)


if __name__ == '__main__':
    sys.exit(main(sys.argv))