#!/usr/bin/env python2
"""
lazylex/html.py - Low-Level HTML Processing.

See lazylex/README.md for details.

TODO:
- Get rid of AttrValueLexer - this should be in the TagLexer
  - this also means that unquoted values can be more similar
  - We can use a single lexer mode for everything inside <>
    - the SPACE is the only difference
- UTF-8 check, like JSON8
- Static typing

"""
from __future__ import print_function

from _devbuild.gen.htm8_asdl import (h8_id, h8_id_t, h8_id_str)
from data_lang.htm8 import (Lexer, TagLexer, AttrValueLexer, LexError,
                            ParseError, Output)
from doctools.util import log

try:
    from cStringIO import StringIO
except ImportError:
    # for python3
    from io import StringIO  # type: ignore
import sys

if sys.version_info.major == 2:
    from typing import List, Tuple, Iterator


def _Tokens(s, left_pos, right_pos):
    # type: (str, int, int) -> Iterator[Tuple[h8_id_t, int]]
    """
    Args:
      s: string to parse
      left_pos, right_pos: Optional span boundaries.
    """
    lx = Lexer(s, left_pos, right_pos)
    while True:
        tok_id, pos = lx.Read()
        yield tok_id, pos
        if tok_id == h8_id.EndOfStream:
            break


def ValidTokens(s, left_pos=0, right_pos=-1):
    # type: (str, int, int) -> Iterator[Tuple[h8_id_t, int]]
    """Wrapper around _Tokens to prevent callers from having to handle Invalid.

    The two functions aren't combined, because we might want to do a 'yield'
    transformation on _Tokens(), and exceptions might complicate that.
    """
    pos = left_pos
    for tok_id, end_pos in _Tokens(s, left_pos, right_pos):
        if tok_id == h8_id.Invalid:
            raise LexError(s, pos)
        yield tok_id, end_pos
        pos = end_pos

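# Usage sketch: iterate over valid tokens and print their IDs.  The input
# string is an arbitrary example; h8_id_str comes from the generated ASDL
# module imported above.
#
#   for tok_id, end_pos in ValidTokens('<p>hi</p>'):
#       print(h8_id_str(tok_id), end_pos)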

def ValidTokenList(s, no_special_tags=False):
    # type: (str, bool) -> List[Tuple[h8_id_t, int]]
    """A wrapper that can be more easily translated to C++.  Doesn't use iterators."""

    start_pos = 0
    tokens = []
    lx = Lexer(s, no_special_tags=no_special_tags)
    while True:
        tok_id, end_pos = lx.Read()
        tokens.append((tok_id, end_pos))
        if tok_id == h8_id.EndOfStream:
            break
        if tok_id == h8_id.Invalid:
            raise LexError(s, start_pos)
        start_pos = end_pos
    return tokens

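# Sketch of the list form (same token stream, no generator state); again the
# input string is just an illustration:
#
#   toks = ValidTokenList('<p>hi</p>')
#   assert toks[-1][0] == h8_id.EndOfStream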

def ReadUntilStartTag(it, tag_lexer, tag_name):
    # type: (Iterator[Tuple[h8_id_t, int]], TagLexer, str) -> Tuple[int, int]
    """Find the next <foo>, returning its (start, end) position.

    Raises ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == h8_id.StartTag and tag_lexer.GetTagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No start tag %r' % tag_name)


def ReadUntilEndTag(it, tag_lexer, tag_name):
    # type: (Iterator[Tuple[h8_id_t, int]], TagLexer, str) -> Tuple[int, int]
    """Find the next </foo>, returning its (start, end) position.

    Raises ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == h8_id.EndTag and tag_lexer.GetTagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No end tag %r' % tag_name)

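# Sketch of finding a tag's span with a fresh token iterator; assumes the
# document in s actually contains a <title> start tag:
#
#   it = ValidTokens(s)
#   tag_lexer = TagLexer(s)
#   start, end = ReadUntilStartTag(it, tag_lexer, 'title')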

CHAR_ENTITY = {
    'amp': '&',
    'lt': '<',
    'gt': '>',
    'quot': '"',
    'apos': "'",
}


def ToText(s, left_pos=0, right_pos=-1):
    # type: (str, int, int) -> str
    """Given HTML, return text by unquoting &gt; and &lt; etc.

    Used by:
      doctools/oils_doc.py: PygmentsPlugin
      doctools/help_gen.py: HelpIndexCards

    In the latter case, we could process some tags, like:

    - Blue Link (not clickable, but still useful)
    - Red X

    That should be html.ToAnsi.
    """
    f = StringIO()
    out = Output(s, f, left_pos, right_pos)

    pos = left_pos
    for tok_id, end_pos in ValidTokens(s, left_pos, right_pos):
        if tok_id in (h8_id.RawData, h8_id.BadAmpersand, h8_id.BadGreaterThan,
                      h8_id.BadLessThan):
            out.SkipTo(pos)
            out.PrintUntil(end_pos)

        elif tok_id == h8_id.CharEntity:  # &amp;

            entity = s[pos + 1:end_pos - 1]

            out.SkipTo(pos)
            out.Print(CHAR_ENTITY[entity])
            out.SkipTo(end_pos)

        # Not handling these yet
        elif tok_id == h8_id.HexChar:
            raise AssertionError('Hex Char %r' % s[pos:pos + 20])

        elif tok_id == h8_id.DecChar:
            raise AssertionError('Dec Char %r' % s[pos:pos + 20])

        else:
            # Skip everything else
            out.SkipTo(end_pos)

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()

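# A sketch of the expected behavior on an arbitrary input: named entities are
# decoded via CHAR_ENTITY and tags are dropped, so this should hold:
#
#   ToText('<b>x &amp; y</b>')  # => 'x & y'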

# https://developer.mozilla.org/en-US/docs/Glossary/Void_element
VOID_ELEMENTS = [
    'area',
    'base',
    'br',
    'col',
    'embed',
    'hr',
    'img',
    'input',
    'link',
    'meta',
    'param',
    'source',
    'track',
    'wbr',
]

LEX_ATTRS = 1 << 1
LEX_QUOTED_VALUES = 1 << 2  # href="?x=42&amp;y=99"
NO_SPECIAL_TAGS = 1 << 3  # <script> <style>, VOID tags, etc.
BALANCED_TAGS = 1 << 4  # are tags balanced?

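# Sketch of combining these flags, mirroring main() below (the HTML string is
# an arbitrary example):
#
#   counters = Counters()
#   flags = LEX_ATTRS | LEX_QUOTED_VALUES | BALANCED_TAGS
#   Validate('<p>hi</p>', flags, counters)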

def Validate(contents, flags, counters):
    # type: (str, int, Counters) -> None

    tag_lexer = TagLexer(contents)
    val_lexer = AttrValueLexer(contents)

    no_special_tags = bool(flags & NO_SPECIAL_TAGS)
    lx = Lexer(contents, no_special_tags=no_special_tags)
    tokens = []
    start_pos = 0
    tag_stack = []
    while True:
        tok_id, end_pos = lx.Read()
        #log('TOP %s %r', h8_id_str(tok_id), contents[start_pos:end_pos])

        if tok_id == h8_id.Invalid:
            raise LexError(contents, start_pos)
        if tok_id == h8_id.EndOfStream:
            break

        tokens.append((tok_id, end_pos))

        if tok_id == h8_id.StartEndTag:
            counters.num_start_end_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRawSlice()
            counters.num_attrs += len(all_attrs)
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                counters.num_val_tokens += val_lexer.NumTokens()

            #counters.debug_attrs.extend(all_attrs)

        elif tok_id == h8_id.StartTag:
            counters.num_start_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRawSlice()
            counters.num_attrs += len(all_attrs)
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                counters.num_val_tokens += val_lexer.NumTokens()

            #counters.debug_attrs.extend(all_attrs)

            if flags & BALANCED_TAGS:
                tag_name = lx.CanonicalTagName()
                if flags & NO_SPECIAL_TAGS:
                    tag_stack.append(tag_name)
                else:
                    # e.g. <meta> is considered self-closing, like <meta/>
                    if tag_name not in VOID_ELEMENTS:
                        tag_stack.append(tag_name)

            counters.max_tag_stack = max(counters.max_tag_stack,
                                         len(tag_stack))
        elif tok_id == h8_id.EndTag:
            if flags & BALANCED_TAGS:
                try:
                    expected = tag_stack.pop()
                except IndexError:
                    raise ParseError('Tag stack empty',
                                     s=contents,
                                     start_pos=start_pos)

                actual = lx.CanonicalTagName()
                if expected != actual:
                    raise ParseError(
                        'Got unexpected closing tag %r; opening tag was %r' %
                        (contents[start_pos:end_pos], expected),
                        s=contents,
                        start_pos=start_pos)

        start_pos = end_pos

    if len(tag_stack) != 0:
        raise ParseError('Missing closing tags at end of doc: %s' %
                         ' '.join(tag_stack),
                         s=contents,
                         start_pos=start_pos)

    counters.num_tokens += len(tokens)


def ToXml(htm8_str):
    # type: (str) -> str

    # TODO:
    # 1. Lex it
    # 2. < & > must be escaped
    #    a. in raw data
    #    b. in quoted strings
    # 3. <script> turned into CDATA
    # 4. void tags turned into self-closing tags
    # 5. case-sensitive tag matching - not sure about this

    tag_lexer = TagLexer(htm8_str)
    val_lexer = AttrValueLexer(htm8_str)

    f = StringIO()
    out = Output(htm8_str, f)

    lx = Lexer(htm8_str)

    pos = 0
    while True:
        tok_id, end_pos = lx.Read()

        if tok_id == h8_id.Invalid:
            raise LexError(htm8_str, pos)
        if tok_id == h8_id.EndOfStream:
            break

        if tok_id in (h8_id.RawData, h8_id.CharEntity, h8_id.HexChar,
                      h8_id.DecChar):
            out.PrintUntil(end_pos)
        elif tok_id in (h8_id.StartTag, h8_id.StartEndTag):
            tag_lexer.Reset(pos, end_pos)
            # TODO: reduce allocations here
            all_attrs = tag_lexer.AllAttrsRawSlice()
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                # TODO: get the kind of string
                #
                # Quoted:   we need to replace & with &amp; and < with &lt;
                #           note: > is not allowed
                # Unquoted: right now, we can just surround with double quotes
                #           because we don't allow any bad chars
                # Empty:    add "", so empty= becomes =""
                # Missing:  add ="", so missing becomes missing=""

            tag_name = lx.CanonicalTagName()
            if tok_id == h8_id.StartTag and tag_name in VOID_ELEMENTS:
                # TODO: instead of closing >, print />
                pass

        elif tok_id == h8_id.BadAmpersand:
            #out.SkipTo(pos)
            out.Print('&amp;')
            out.SkipTo(end_pos)

        elif tok_id == h8_id.BadGreaterThan:
            #out.SkipTo(pos)
            out.Print('&gt;')
            out.SkipTo(end_pos)
        else:
            out.PrintUntil(end_pos)

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()

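# Sketch of the escaping that's already implemented (the TODOs above are
# pending): a bare & in raw data comes out escaped, assuming the lexer
# classifies it as BadAmpersand:
#
#   ToXml('a & b')  # => 'a &amp; b'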

class Counters(object):

    def __init__(self):
        # type: () -> None
        self.num_tokens = 0
        self.num_start_tags = 0
        self.num_start_end_tags = 0
        self.num_attrs = 0
        self.max_tag_stack = 0
        self.num_val_tokens = 0

        #self.debug_attrs = []


def main(argv):
    # type: (List[str]) -> int
    action = argv[1]

    if action == 'tokens':
        contents = sys.stdin.read()

        lx = Lexer(contents)
        start_pos = 0
        while True:
            tok_id, end_pos = lx.Read()
            if tok_id == h8_id.Invalid:
                raise LexError(contents, start_pos)
            if tok_id == h8_id.EndOfStream:
                break

            frag = contents[start_pos:end_pos]
            log('%d %s %r', end_pos, h8_id_str(tok_id), frag)
            start_pos = end_pos

        return 0

    elif action in ('lex-htm8', 'parse-htm8', 'parse-xml'):

        errors = []
        counters = Counters()

        flags = LEX_ATTRS | LEX_QUOTED_VALUES
        if action.startswith('parse-'):
            flags |= BALANCED_TAGS
        if action == 'parse-xml':
            flags |= NO_SPECIAL_TAGS

        i = 0
        for line in sys.stdin:
            filename = line.strip()
            with open(filename) as f:
                contents = f.read()

            try:
                Validate(contents, flags, counters)
            except LexError as e:
                log('Lex error in %r: %s', filename, e)
                errors.append((filename, e))
            except ParseError as e:
                log('Parse error in %r: %s', filename, e)
                errors.append((filename, e))
            i += 1

        log('')
        log('%10d tokens', counters.num_tokens)
        log('%10d start/end tags', counters.num_start_end_tags)
        log('%10d start tags', counters.num_start_tags)
        log('%10d attrs', counters.num_attrs)
        log('%10d max tag stack depth', counters.max_tag_stack)
        log('%10d attr val tokens', counters.num_val_tokens)
        log('%10d errors', len(errors))
        if len(errors):
            return 1
        return 0

    elif action == 'todo':
        # Other algorithms:
        #
        # - select first subtree with given ID
        #   - this requires understanding the void tags I suppose
        # - select all subtrees that have a class
        # - materialize DOM

        # Safe-HTM8?  This is a filter
        return 0

    else:
        raise RuntimeError('Invalid action %r' % action)

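# Usage sketch, following the actions above ('tokens' reads the HTML itself
# from stdin; the lex/parse actions read a list of filenames, one per line):
#
#   echo '<p>hi</p>' | lazylex/html.py tokens
#   echo doc.html | lazylex/html.py parse-htm8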

if __name__ == '__main__':
    sys.exit(main(sys.argv))