lazylex/html.py

OILS / lazylex / html.py View on Github | oils.pub

758 lines, 378 significant

1	#!/usr/bin/env python2
2	"""
3	lazylex/html.py - Low-Level HTML Processing.
4
5	See lazylex/README.md for details.
6
7	TODO: This should be an Oils library eventually. It's a "lazily-parsed data
8	structure" like TSV8
9	"""
10	from __future__ import print_function
11
12	try:
13	from cStringIO import StringIO
14	except ImportError:
15	from io import StringIO # python3
16	import re
17	import sys
18
19	if sys.version_info.major == 2:
20	from typing import List, Tuple, Optional
21
22
def log(msg, *args):
    """Write a %-formatted diagnostic line to stderr.

    Args:
      msg: format string; it is always interpolated, even when args is empty
      *args: values substituted into msg
    """
    formatted = msg % args
    print(formatted, file=sys.stderr)
26
27
class LexError(Exception):
    """Raised when the input cannot be split into tokens.

    Examples of lex errors:

    - Tok.Invalid, like <> or &&
    - Unclosed <!-- <? <![CDATA[ <script> <style>
    """

    def __init__(self, s, start_pos):
        # The whole input string and the offset where lexing failed.
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        # Show at most 20 characters of context, starting at the failure.
        begin = self.start_pos
        snippet = self.s[begin:begin + 20]
        return '(LexError %r)' % snippet
42
43
class ParseError(Exception):
    """Raised when the tokens are valid but their structure is not.

    Examples of parse errors

    - unbalanced tag structure
    - ul_table.py errors
    """

    def __init__(self, msg, s=None, start_pos=-1):
        self.msg = msg
        # Optional location: the input string and an offset into it.
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        snippet = ''
        if self.s is not None:
            # A string without a position is a programming error.
            assert self.start_pos != -1, self.start_pos
            begin = self.start_pos
            snippet = self.s[begin:begin + 20]
        return '(ParseError %r %r)' % (self.msg, snippet)
64
65
class Output(object):
    """Copies pieces of an input string to a file-like object.

    Holds a cursor (self.pos) into the input buffer s.  Callers either
    print FROM the input, which moves the cursor, or print new text to the
    output, which does not.
    """

    def __init__(self, s, f, left_pos=0, right_pos=-1):
        self.s = s
        self.f = f
        self.pos = left_pos
        # -1 means "up to the end of s"
        if right_pos == -1:
            right_pos = len(s)
        self.right_pos = right_pos

    def SkipTo(self, pos):
        """Move the cursor to pos without writing anything."""
        self.pos = pos

    def PrintUntil(self, pos):
        """Write the input from the cursor up to pos, then move the cursor."""
        self.f.write(self.s[self.pos:pos])
        self.pos = pos

    def PrintTheRest(self):
        """Write the input from the cursor up to right_pos."""
        self.PrintUntil(self.right_pos)

    def Print(self, s):
        """Write new text to the output; the cursor does not move."""
        self.f.write(s)
96
97
# HTML Tokens
# CommentBegin, ProcessingBegin, CDataBegin are "pseudo-tokens", not visible
TOKENS = [
    'Decl',
    'Comment',
    'CommentBegin',
    'Processing',
    'ProcessingBegin',
    'CData',
    'CDataBegin',
    'StartTag',
    'StartEndTag',
    'EndTag',
    'DecChar',
    'HexChar',
    'CharEntity',
    'RawData',
    'HtmlCData',
    'Invalid',
    'EndOfStream',
]


class Tok(object):
    """
    Namespace for the token IDs, e.g. Tok.StartTag.

    The attributes are attached by the loop below.  Spelling them Tok.X
    avoids lint errors about undefined module-level names.
    """


# Token ID -> token name.  IDs are positions in TOKENS.
TOKEN_NAMES = list(TOKENS)  # type: List[str]

# Each token ID is published twice: as a module-level constant and as an
# attribute of Tok.
this_module = sys.modules[__name__]
for tok_id, tok_name in enumerate(TOKENS):
    setattr(this_module, tok_name, tok_id)
    setattr(Tok, tok_name, tok_id)
118
119
def TokenName(tok_id):
    """Return the name of a token ID, e.g. 'StartTag' for Tok.StartTag."""
    name = TOKEN_NAMES[tok_id]
    return name
122
123
def MakeLexer(rules):
    """Compile a list of (pattern string, token id) rules.

    Patterns are compiled with re.VERBOSE, so unescaped whitespace and
    # comments inside them are ignored.

    Returns:
      A list of (compiled pattern, token id) pairs, in the same order.
    """
    compiled = []
    for pattern, tok_id in rules:
        compiled.append((re.compile(pattern, re.VERBOSE), tok_id))
    return compiled
126
127
128	#
129	# Eggex
130	#
131	# Tag = / ~['>']+ /
132
133	# Is this valid? A single character?
134	# Tag = / ~'>'* /
135
136	# Maybe better: / [NOT '>']+/
137	# capital letters not allowed there?
138	#
139	# But then this is confusing:
140	# / [NOT ~digit]+/
141	#
142	# / [NOT digit] / is [^\d]
143	# / ~digit / is \D
144	#
145	# Or maybe:
146	#
147	# / [~ digit]+ /
148	# / [~ '>']+ /
149	# / [NOT '>']+ /
150
151	# End = / '</' Tag '>' /
152	# StartEnd = / '<' Tag '/>' /
153	# Start = / '<' Tag '>' /
154	#
155	# EntityRef = / '&' dot{* N} ';' /
156
157	# Tag name, or attribute name
158	# colon is used in XML
159
160	# https://www.w3.org/TR/xml/#NT-Name
161	# Hm there is a lot of unicode stuff. We are simplifying parsing
162
_NAME = r'[a-zA-Z][a-zA-Z0-9:_\-]*'  # must start with letter

# Rules are (pattern, token id) pairs.  They are compiled with re.VERBOSE (see
# MakeLexer), so the spaces inside the patterns are not significant.  The
# lexer takes the FIRST rule that matches, so the order of this list matters.
LEXER = [
    # Pseudo-token: the lexer searches for the closing --> itself
    (r'<!--', Tok.CommentBegin),

    # Processing instruction are used for the XML header:
    # <?xml version="1.0" encoding="UTF-8"?>
    # They are technically XML-only, but in HTML5, they are another kind of
    # comment:
    #
    # https://developer.mozilla.org/en-US/docs/Web/API/ProcessingInstruction
    #
    (r'<\?', Tok.ProcessingBegin),
    # Not necessary in HTML5, but occurs in XML
    (r'<!\[CDATA\[', Tok.CDataBegin),  # <![CDATA[

    # Markup declarations
    # - In HTML5, there is only <!DOCTYPE html>
    # - XML has 4 more declarations: <!ELEMENT ...> ATTLIST ENTITY NOTATION
    # - these seem to be part of DTD
    # - it's useful to skip these, and be able to parse the rest of the document
    # - Note: < is allowed?
    (r'<! [^>]+ >', Tok.Decl),

    # Tags
    # Notes:
    # - We look for a valid tag name, but we don't validate attributes.
    # That's done in the tag lexer.
    # - We don't allow leading whitespace
    # - Group 1 of each tag pattern is the tag name
    (r'</ (%s) >' % _NAME, Tok.EndTag),  # end </a>
    # self-closing <br/> comes before StartTag, because the StartTag pattern
    # would also match it
    (r'< (%s) [^>]* />' % _NAME, Tok.StartEndTag),  # self-closing <br/>
    (r'< (%s) [^>]* >' % _NAME, Tok.StartTag),  # start <a>

    # Characters
    # https://www.w3.org/TR/xml/#sec-references
    (r'&\# [0-9]+ ;', Tok.DecChar),
    (r'&\# x[0-9a-fA-F]+ ;', Tok.HexChar),
    (r'& %s ;' % _NAME, Tok.CharEntity),

    # HTML5 allows unescaped > in raw data, but < is not allowed.
    # https://stackoverflow.com/questions/10462348/right-angle-bracket-in-html
    #
    # - My early blog has THREE errors when disallowing >
    # - So do some .wwz files
    (r'[^&<]+', Tok.RawData),
    # Catch-all: a < or & that no rule above accepted
    (r'.', Tok.Invalid),  # error!
]
211
212	# Old notes:
213	#
214	# Non-greedy matches are regular and can be matched in linear time
215	# with RE2.
216	#
217	# https://news.ycombinator.com/item?id=27099798
218	#
219	# Maybe try combining all of these for speed.
220
221	# . is any char except newline
222	# https://re2c.org/manual/manual_c.html
223
224	# Discarded options
225	#(r'<!-- .*? -->', Tok.Comment),
226
227	# Hack from Claude: \s\S instead of re.DOTALL. I don't like this
228	#(r'<!-- [\s\S]*? -->', Tok.Comment),
229	#(r'<!-- (?:.\|[\n])*? -->', Tok.Comment),
230
# Replace the (pattern string, token id) rules with (compiled regex, token id)
# pairs, in the same order.  From here on LEXER holds compiled patterns.
LEXER = MakeLexer(LEXER)
232
233
class Lexer(object):
    """Splits the span s[left_pos:right_pos] into HTML tokens.

    Each call to Read() returns one (token id, end position) pair.  A token
    starts where the previous one ended, so start positions are implicit.

    Nothing past right_pos is examined: patterns, closing-delimiter searches
    and LookAhead() all treat right_pos as the end of the input.
    """

    def __init__(self, s, left_pos=0, right_pos=-1):
        self.s = s
        self.pos = left_pos
        # -1 means "up to the end of s"
        self.right_pos = len(s) if right_pos == -1 else right_pos
        self.cache = {}  # string -> compiled regex pattern object

        # either </script> or </style> - we search until we see that
        self.search_state = None  # type: Optional[str]

        # Position of tag name, if applicable
        # - Set after you get a StartTag, EndTag, or StartEndTag
        # - Unset on other tags
        self.tag_pos_left = -1
        self.tag_pos_right = -1

    def _Peek(self):
        # type: () -> Tuple[int, int]
        """Compute the next token without advancing self.pos.

        Returns:
          (token id, end position).  At the end of the span this is
          (Tok.EndOfStream, self.pos).

        Raises:
          LexError: for an unterminated <!-- <? <![CDATA[ <script> or <style>

        Note that this is not a pure lookahead: it updates search_state and
        the tag name position.  Read() is the only caller in this class.
        """
        if self.pos == self.right_pos:
            return Tok.EndOfStream, self.pos

        assert self.pos < self.right_pos, self.pos

        if self.search_state is not None:
            # We are just past <script> or <style>.  Everything up to the
            # matching end tag is a single HtmlCData token.
            pos = self.s.find(self.search_state, self.pos, self.right_pos)
            if pos == -1:
                # unterminated <script> or <style>
                raise LexError(self.s, self.pos)
            self.search_state = None
            # beginning of the end tag, which is lexed on the next call
            return Tok.HtmlCData, pos

        # Find the first match.
        # Note: frontend/match.py uses _LongestMatch(), which is different!
        # TODO: reconcile them.  This lexer should be expressible in re2c.

        for pat, tok_id in LEXER:
            # Passing right_pos as endpos keeps a token from running past the
            # span; otherwise the assertion above would fail on the next call.
            m = pat.match(self.s, self.pos, self.right_pos)
            if not m:
                continue

            if tok_id in (Tok.StartTag, Tok.EndTag, Tok.StartEndTag):
                self.tag_pos_left = m.start(1)
                self.tag_pos_right = m.end(1)
            else:
                # Reset state
                self.tag_pos_left = -1
                self.tag_pos_right = -1

            # The three "Begin" pseudo-tokens only match the opening
            # delimiter.  The token ends after the closing delimiter, which
            # is found with a plain string search.
            if tok_id == Tok.CommentBegin:
                pos = self.s.find('-->', self.pos, self.right_pos)
                if pos == -1:
                    # unterminated <!--
                    raise LexError(self.s, self.pos)
                return Tok.Comment, pos + 3  # -->

            if tok_id == Tok.ProcessingBegin:
                pos = self.s.find('?>', self.pos, self.right_pos)
                if pos == -1:
                    # unterminated <?
                    raise LexError(self.s, self.pos)
                return Tok.Processing, pos + 2  # ?>

            if tok_id == Tok.CDataBegin:
                pos = self.s.find(']]>', self.pos, self.right_pos)
                if pos == -1:
                    # unterminated <![CDATA[
                    raise LexError(self.s, self.pos)
                return Tok.CData, pos + 3  # ]]>

            if tok_id == Tok.StartTag:
                # The body of these two elements is not lexed as HTML
                if self.TagNameEquals('script'):
                    self.search_state = '</script>'
                elif self.TagNameEquals('style'):
                    self.search_state = '</style>'

            return tok_id, m.end()
        else:
            raise AssertionError('Tok.Invalid rule should have matched')

    def TagNameEquals(self, expected):
        # type: (str) -> bool
        """Compare the name of the tag that was just read with expected.

        Only valid right after a StartTag, EndTag or StartEndTag.
        """
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        # TODO: In C++, this does not need an allocation
        return expected == self.s[self.tag_pos_left:self.tag_pos_right]

    def TagName(self):
        # type: () -> str
        """Return the name of the tag that was just read.

        Only valid right after a StartTag, EndTag or StartEndTag.
        """
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        return self.s[self.tag_pos_left:self.tag_pos_right]

    def Read(self):
        # type: () -> Tuple[int, int]
        """Return the next (token id, end position) and advance past it."""
        tok_id, end_pos = self._Peek()
        self.pos = end_pos  # advance
        return tok_id, end_pos

    def LookAhead(self, regex):
        """Return whether regex matches at the current position.

        The position is not changed, and the match may not extend past
        right_pos.

        Args:
          regex: pattern string; it is compiled WITHOUT re.VERBOSE
        """
        # Cache the regex compilation.  This could also be LookAheadFor(THEAD)
        # or something.
        pat = self.cache.get(regex)
        if pat is None:
            pat = re.compile(regex)
            self.cache[regex] = pat

        m = pat.match(self.s, self.pos, self.right_pos)
        return m is not None
347
348
def _Tokens(s, left_pos, right_pos):
    """Yield (token id, end position) pairs for a span of s.

    The last pair yielded is always Tok.EndOfStream.  Tok.Invalid is passed
    through; see ValidTokens() for the checked version.

    Args:
      s: string to parse
      left_pos, right_pos: Optional span boundaries.
    """
    lexer = Lexer(s, left_pos, right_pos)
    tok_id = None
    while tok_id != Tok.EndOfStream:
        tok_id, end_pos = lexer.Read()
        yield tok_id, end_pos
361
362
def ValidTokens(s, left_pos=0, right_pos=-1):
    """Like _Tokens(), but raise LexError instead of yielding Tok.Invalid.

    This saves callers from handling Invalid themselves.  The error points
    at the start of the invalid token.

    The two functions are kept separate in case Tokens() later needs a
    'yield' transformation, which exceptions might complicate.
    """
    tok_start = left_pos
    for tok_id, tok_end in _Tokens(s, left_pos, right_pos):
        if tok_id == Tok.Invalid:
            raise LexError(s, tok_start)
        yield tok_id, tok_end
        # The next token begins where this one ended
        tok_start = tok_end
376
377
378	# Tag names:
379	# Match <a or </a
380	# Match <h2, but not <2h
381	#
382	# HTML 5 doesn't restrict tag names at all
383	# https://html.spec.whatwg.org/#toc-syntax
384	#
385	# XML allows : - .
386	# https://www.w3.org/TR/xml/#NT-NameChar
387
388	# Namespaces for MathML, SVG
389	# XLink, XML, XMLNS
390	#
391	# https://infra.spec.whatwg.org/#namespaces
392	#
393	# Allow - for td-attrs
394
_ATTR_VALUE = r'[a-zA-Z0-9_\-]+'  # allow hyphens

# Matches the tag name right after the opening <, with an optional / for end
# tags.  Group 1 is the name.
# TODO: we don't need to capture the tag name here?  That's done at the top
# level
_TAG_RE = re.compile(r'/? \s* (%s)' % _NAME, re.VERBOSE)

# To match href="foo"
#
# Groups: 1 is the attribute name, 2 is a double quoted value without its
# quotes, 3 is an unquoted value.  At most one of groups 2 and 3 is set.
#
# The | between the two value forms must be a bare alternation operator.  An
# escaped \| matches a literal pipe character, and then unquoted values like
# class=foo never match.

_ATTR_RE = re.compile(
    r'''
\s+ # Leading whitespace is required
(%s) # Attribute name
(?: # Optional attribute value
\s* = \s*
(?:
" ([^>"]*) " # double quoted value
| (%s) # Attribute value
# TODO: relax this? for href=$foo
)
)?
''' % (_NAME, _ATTR_VALUE), re.VERBOSE)

# Kinds of token yielded by TagLexer.Tokens()
TagName, AttrName, UnquotedValue, QuotedValue = range(4)
418
419
class TagLexer(object):
    """
    Given a tag like <a href="..."> or <link type="..." />, the TagLexer
    provides a few operations:

    - What is the tag?
    - Iterate through the attributes, giving (name, value_start_pos, value_end_pos)

    One instance can be reused for many tags in the same string: call Reset()
    with the span of each tag before querying it.
    """

    def __init__(self, s):
        self.s = s
        self.start_pos = -1  # Invalid
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        """Reuse instances of this object."""
        self.start_pos = start_pos
        self.end_pos = end_pos

    def TagString(self):
        """Return the whole tag, i.e. the current span of the input."""
        return self.s[self.start_pos:self.end_pos]

    def TagName(self):
        """Return the tag name, e.g. 'a' for <a href="...">."""
        # First event
        tok_id, start, end = next(self.Tokens())
        return self.s[start:end]

    def GetSpanForAttrValue(self, attr_name):
        """Return the (start, end) span of the value of an attribute.

        Returns:
          The span of the first value that directly follows an attribute
          named attr_name, or (-1, -1) if there is none.  An attribute
          without a value, like 'disabled', has no span.
        """
        # Algorithm: search for QuotedValue or UnquotedValue after AttrName
        # TODO: Could also cache these

        # Whether the most recent AttrName token was attr_name.  Tracking
        # this, rather than pulling a second token after each name, means a
        # valueless attribute can't consume the name of the next attribute.
        is_match = False
        for tok_id, start, end in self.Tokens():
            if tok_id == AttrName:
                is_match = (self.s[start:end] == attr_name)
            elif is_match and tok_id in (QuotedValue, UnquotedValue):
                # Note: quoted values may have &
                # We would need ANOTHER lexer to unescape them.
                return start, end
        return -1, -1

    def GetAttrRaw(self, attr_name):
        """
        Return the raw value of an attribute, or None if it has no value.

        The value is NOT unescaped, so it may still contain entities like
        &amp;
        """
        start, end = self.GetSpanForAttrValue(attr_name)
        if start == -1:
            return None
        return self.s[start:end]

    def AllAttrsRaw(self):
        """
        Get a list of pairs [('class', 'foo'), ('href', '?foo=1&amp;bar=2')]

        The quoted values may be escaped.  We would need another lexer to
        unescape them, but we don't need that for ul-table.

        Attributes without a value, like 'disabled', are left out of the
        list.  They do not affect the attributes that follow them.
        """
        pairs = []
        # Name of the attribute whose value may come next, if any
        name = None
        for tok_id, start, end in self.Tokens():
            if tok_id == AttrName:
                # If the previous attribute had no value, this replaces it
                name = self.s[start:end]
            elif name is not None and tok_id in (QuotedValue, UnquotedValue):
                pairs.append((name, self.s[start:end]))
                name = None
        return pairs

    def Tokens(self):
        """
        Yields a sequence of tokens: Tag (AttrName AttrValue?)*

        Where each Token is (Type, start_pos, end_pos)

        Note that start and end are NOT redundant!  We skip over some unwanted
        characters.

        Raises:
          RuntimeError: if there is no tag name after the opening <
        """
        # + 1 skips the opening <
        m = _TAG_RE.match(self.s, self.start_pos + 1)
        if not m:
            raise RuntimeError("Couldn't find HTML tag in %r" %
                               self.TagString())
        yield TagName, m.start(1), m.end(1)

        pos = m.end(0)

        while True:
            # don't search past the end
            m = _ATTR_RE.match(self.s, pos, self.end_pos)
            if not m:
                # A validating parser would check that > or /> is next --
                # there's no junk
                break

            yield AttrName, m.start(1), m.end(1)

            # Quoted is group 2, unquoted is group 3.
            if m.group(2) is not None:
                yield QuotedValue, m.start(2), m.end(2)
            elif m.group(3) is not None:
                yield UnquotedValue, m.start(3), m.end(3)

            # Skip past the "
            pos = m.end(0)
545
546
def ReadUntilStartTag(it, tag_lexer, tag_name):
    """Consume tokens from it up to and including the next <tag_name>.

    Args:
      it: iterator of (token id, end position) pairs.  Positions are tracked
        from 0, so it is expected to start at the beginning of the input.
      tag_lexer: TagLexer on the same string.  It is RESET to each token that
        is examined, so on return it points at the tag that was found.
      tag_name: name to look for, without angle brackets

    Returns:
      The (start, end) positions of the start tag.

    Raises:
      ParseError: if the tokens run out before the tag is found.
    """
    tok_start = 0
    for tok_id, tok_end in it:
        tag_lexer.Reset(tok_start, tok_end)
        if tok_id == Tok.StartTag and tag_lexer.TagName() == tag_name:
            return tok_start, tok_end
        tok_start = tok_end

    raise ParseError('No start tag %r' % tag_name)
567
568
def ReadUntilEndTag(it, tag_lexer, tag_name):
    """Consume tokens from it up to and including the next </tag_name>.

    Args:
      it: iterator of (token id, end position) pairs.  Positions are tracked
        from 0, so it is expected to start at the beginning of the input.
      tag_lexer: TagLexer on the same string.  It is RESET to each token that
        is examined, so on return it points at the tag that was found.
      tag_name: name to look for, without angle brackets or slash

    Returns:
      The (start, end) positions of the end tag.

    Raises:
      ParseError: if the tokens run out before the tag is found.
    """
    tok_start = 0
    for tok_id, tok_end in it:
        tag_lexer.Reset(tok_start, tok_end)
        if tok_id == Tok.EndTag and tag_lexer.TagName() == tag_name:
            return tok_start, tok_end
        tok_start = tok_end

    raise ParseError('No end tag %r' % tag_name)
589
590
# Named character entities that ToText() can decode: the five entities
# predefined by XML.  Keys are the names without & and ;
# https://www.w3.org/TR/xml/#sec-predefined-ent
CHAR_ENTITY = {
    'amp': '&',
    'lt': '<',
    'gt': '>',
    'quot': '"',
    'apos': "'",
}
597
598
def ToText(s, left_pos=0, right_pos=-1):
    """Given HTML, return text by unquoting &gt; and &lt; etc.

    Raw data is copied, named character entities are decoded with
    CHAR_ENTITY, and every other token (tags, comments, ...) is dropped,
    wherever it appears in the span.

    Used by:
      doctools/oils_doc.py: PygmentsPlugin
      doctools/help_gen.py: HelpIndexCards

    In the latter case, we could process some tags, like:

    - Blue Link (not clickable, but still useful)
    - Red X

    That should be html.ToAnsi.

    Args:
      s: string containing HTML
      left_pos, right_pos: Optional span boundaries.

    Raises:
      LexError: if the span contains an invalid token
      KeyError: for a named entity that is not in CHAR_ENTITY
      AssertionError: for numeric entities like &#65; and &#x41;, which are
        not handled yet
    """
    f = StringIO()

    pos = left_pos  # start of the current token
    for tok_id, end_pos in ValidTokens(s, left_pos, right_pos):
        if tok_id == Tok.RawData:
            f.write(s[pos:end_pos])

        elif tok_id == Tok.CharEntity:  # &amp;
            # Strip the leading & and the trailing ;
            entity = s[pos + 1:end_pos - 1]
            f.write(CHAR_ENTITY[entity])

        # Not handling these yet
        elif tok_id == Tok.HexChar:
            raise AssertionError('Hex Char %r' % s[pos:pos + 20])

        elif tok_id == Tok.DecChar:
            raise AssertionError('Dec Char %r' % s[pos:pos + 20])

        pos = end_pos

    # The token stream covers the whole span, so there is nothing left to
    # copy here.  Copying "the rest" of the input would emit trailing tags
    # verbatim, even though the same tags are dropped everywhere else.
    return f.getvalue()
641
642
643	# https://developer.mozilla.org/en-US/docs/Glossary/Void_element
# Names of the void elements: start tags that never have an end tag, so they
# are not pushed on the tag stack when checking that tags are balanced.
VOID_ELEMENTS = [
    'area', 'base', 'br', 'col', 'embed', 'hr', 'img',
    'input', 'link', 'meta', 'param', 'source', 'track', 'wbr',
]
660
661
def main(argv):
    """Lex the HTML files named on stdin, and log statistics to stderr.

    Args:
      argv: command line.  argv[1] is the action: 'lex-tags', 'lex-attrs',
        'lex-attr-values', or 'well-formed'.

    File names are read from stdin, one per line.  Every file is lexed and
    checked for balanced tags.  For every action except 'lex-tags', the
    attributes of each start tag and self-closing tag are lexed and counted
    too.  A file with a lex or parse error is logged, and its tokens are left
    out of the token count.

    Raises:
      RuntimeError: if the action is not one of the above
    """
    action = argv[1]

    if action in ('lex-tags', 'lex-attrs', 'lex-attr-values', 'well-formed'):
        # The same for every token, so decide it once
        lex_attrs = action in ('lex-attrs', 'lex-attr-values', 'well-formed')

        num_tokens = 0
        num_start_tags = 0
        num_start_end_tags = 0
        num_attrs = 0
        max_tag_stack = 0

        errors = []
        i = 0  # number of files processed
        for line in sys.stdin:
            name = line.strip()
            with open(name) as f:
                contents = f.read()

            tag_lexer = TagLexer(contents)
            lx = Lexer(contents)
            tokens = []
            start_pos = 0
            tag_stack = []  # names of the elements that are currently open
            try:
                while True:
                    tok_id, end_pos = lx.Read()

                    if tok_id == Tok.Invalid:
                        raise LexError(contents, start_pos)
                    if tok_id == Tok.EndOfStream:
                        # NOTE(review): elements still open at this point are
                        # not reported as an error.
                        break

                    tokens.append((tok_id, end_pos))

                    if tok_id == Tok.StartEndTag:
                        num_start_end_tags += 1
                        if lex_attrs:
                            tag_lexer.Reset(start_pos, end_pos)
                            all_attrs = tag_lexer.AllAttrsRaw()
                            num_attrs += len(all_attrs)
                    elif tok_id == Tok.StartTag:
                        num_start_tags += 1
                        if lex_attrs:
                            tag_lexer.Reset(start_pos, end_pos)
                            all_attrs = tag_lexer.AllAttrsRaw()
                            # Count these the same way as the attributes of
                            # self-closing tags above
                            num_attrs += len(all_attrs)

                        tag_name = lx.TagName()
                        # Don't bother to check
                        if tag_name not in VOID_ELEMENTS:
                            tag_stack.append(tag_name)
                            max_tag_stack = max(max_tag_stack, len(tag_stack))
                    elif tok_id == Tok.EndTag:
                        try:
                            expected = tag_stack.pop()
                        except IndexError:
                            raise ParseError('Tag stack empty',
                                             s=contents,
                                             start_pos=start_pos)

                        actual = lx.TagName()
                        if expected != actual:
                            raise ParseError(
                                'Expected closing tag %r, got %r' %
                                (expected, actual),
                                s=contents,
                                start_pos=start_pos)

                    start_pos = end_pos
            except LexError as e:
                log('Lex error in %r: %s', name, e)
                errors.append((name, e))
            except ParseError as e:
                log('Parse error in %r: %s', name, e)
                errors.append((name, e))
            else:
                # Only files without errors contribute to the token count
                num_tokens += len(tokens)

            #print('%d %s' % (len(tokens), name))
            i += 1

        log('')
        log(
            ' %d tokens, %d start/end tags, %d start tags, %d attrs, %d max tag stack depth in %d files',
            num_tokens, num_start_end_tags, num_start_tags, num_attrs,
            max_tag_stack, i)
        log(' %d errors', len(errors))
        # Disabled: repeat all the errors at the end
        if 0:
            for name, e in errors:
                log('Error in %r: %s', name, e)

    else:
        raise RuntimeError('Invalid action %r' % action)
755
756
# Run main() only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main(sys.argv)