#!/usr/bin/env python2
"""
lazylex/html.py - Low-Level HTML Processing.

See lazylex/README.md for details.

TODO: This should be an Oils library eventually. It's a "lazily-parsed data
structure" like TSV8
"""
from __future__ import print_function

try:
    from cStringIO import StringIO
except ImportError:
    from io import StringIO  # python3
import re
import sys

if sys.version_info.major == 2:
    from typing import List, Tuple, Optional


def log(msg, *args):
    msg = msg % args
    print(msg, file=sys.stderr)


class LexError(Exception):
    """
    Examples of lex errors:

    - Tok.Invalid, like <> or &&
    - Unclosed <!--  <?  <![CDATA[  <script>  <style>
    """

    def __init__(self, s, start_pos):
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        return '(LexError %r)' % (self.s[self.start_pos:self.start_pos + 20])


def FindLineNum(s, error_pos):
    current_pos = 0
    line_num = 1
    while True:
        newline_pos = s.find('\n', current_pos)
        #log('current = %d, N %d, line %d', current_pos, newline_pos, line_num)

        if newline_pos == -1:  # this is the last line
            return line_num
        if newline_pos >= error_pos:
            return line_num
        line_num += 1
        current_pos = newline_pos + 1
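
# Example (sketch, not in the original file): position 4 is the 'd' on the
# second line, so:
#
#   FindLineNum('ab\ncd\nef', 4)  # => 2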


class ParseError(Exception):
    """
    Examples of parse errors:

    - unbalanced tag structure
    - ul_table.py errors
    """

    def __init__(self, msg, s=None, start_pos=-1):
        self.msg = msg
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        if self.s is not None:
            assert self.start_pos != -1, self.start_pos
            snippet = self.s[self.start_pos:self.start_pos + 20]

            line_num = FindLineNum(self.s, self.start_pos)
        else:
            snippet = ''
            line_num = -1
        msg = 'line %d: %r %r' % (line_num, self.msg, snippet)
        return msg


class Output(object):
    """Takes an underlying input buffer and an output file. Maintains a
    position in the input buffer.

    Print FROM the input, or print new text to the output.
    """

    def __init__(self, s, f, left_pos=0, right_pos=-1):
        self.s = s
        self.f = f
        self.pos = left_pos
        self.right_pos = len(s) if right_pos == -1 else right_pos

    def SkipTo(self, pos):
        """Skip to a position without printing."""
        self.pos = pos

    def PrintUntil(self, pos):
        """Print from the current position up to pos."""
        piece = self.s[self.pos:pos]
        self.f.write(piece)
        self.pos = pos

    def PrintTheRest(self):
        """Print until the end of the string."""
        self.PrintUntil(self.right_pos)

    def Print(self, s):
        """Print new text to the output file."""
        self.f.write(s)
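
# Usage sketch (not in the original file): copy some input, substitute new
# text, then skip the replaced span.
#
#   f = StringIO()
#   out = Output('a X b', f)
#   out.PrintUntil(2)    # copy 'a '
#   out.Print('Y')       # write replacement text
#   out.SkipTo(3)        # skip over 'X'
#   out.PrintTheRest()   # copy ' b'
#   f.getvalue()         # => 'a Y b'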


# HTML Tokens
# CommentBegin, ProcessingBegin, CDataBegin are "pseudo-tokens", not visible
TOKENS = ('Decl Comment CommentBegin Processing ProcessingBegin CData '
          'CDataBegin StartTag StartEndTag EndTag DecChar HexChar '
          'CharEntity RawData HtmlCData Invalid EndOfStream').split()


class Tok(object):
    """
    Avoid lint errors by using these aliases
    """
    pass


TOKEN_NAMES = [None] * len(TOKENS)  # type: List[str]

this_module = sys.modules[__name__]
for i, tok_str in enumerate(TOKENS):
    setattr(this_module, tok_str, i)
    setattr(Tok, tok_str, i)
    TOKEN_NAMES[i] = tok_str


def TokenName(tok_id):
    return TOKEN_NAMES[tok_id]
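
# Example (sketch): token IDs are small integers assigned in TOKENS order,
# e.g. Tok.Decl == 0, and TokenName(Tok.StartTag) == 'StartTag'.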


def MakeLexer(rules):
    return [(re.compile(pat, re.VERBOSE), i) for (pat, i) in rules]


#
# Eggex
#
# Tag = / ~['>']+ /

# Is this valid? A single character?
# Tag = / ~'>'* /

# Maybe better: / [NOT '>']+ /
# capital letters not allowed there?
#
# But then this is confusing:
# / [NOT ~digit]+ /
#
# / [NOT digit] / is [^\d]
# / ~digit /     is \D
#
# Or maybe:
#
# / [~ digit]+ /
# / [~ '>']+ /
# / [NOT '>']+ /

# End = / '</' Tag '>' /
# StartEnd = / '<' Tag '/>' /
# Start = / '<' Tag '>' /
#
# EntityRef = / '&' dot{* N} ';' /

# Tag name, or attribute name
# colon is used in XML

# https://www.w3.org/TR/xml/#NT-Name
# Hm, there is a lot of Unicode stuff there. We are simplifying parsing.

_NAME = r'[a-zA-Z][a-zA-Z0-9:_\-]*'  # must start with a letter

LEXER = [
    (r'<!--', Tok.CommentBegin),

    # Processing instructions are used for the XML header:
    # <?xml version="1.0" encoding="UTF-8"?>
    # They are technically XML-only, but in HTML5, they are another kind of
    # comment:
    #
    # https://developer.mozilla.org/en-US/docs/Web/API/ProcessingInstruction
    #
    (r'<\?', Tok.ProcessingBegin),
    # Not necessary in HTML5, but occurs in XML
    (r'<!\[CDATA\[', Tok.CDataBegin),  # <![CDATA[

    # Markup declarations
    # - In HTML5, there is only <!DOCTYPE html>
    # - XML has 4 more declarations: <!ELEMENT ...> ATTLIST ENTITY NOTATION
    #   - these seem to be part of the DTD
    # - it's useful to skip these, and be able to parse the rest of the
    #   document
    # - Note: < is allowed?
    (r'<! [^>]+ >', Tok.Decl),

    # Tags
    # Notes:
    # - We look for a valid tag name, but we don't validate attributes.
    #   That's done in the tag lexer.
    # - We don't allow leading whitespace
    (r'</ (%s) >' % _NAME, Tok.EndTag),  # end tag </a>
    # self-closing <br/> must come before StartTag
    (r'< (%s) [^>]* />' % _NAME, Tok.StartEndTag),  # self-closing <br/>
    (r'< (%s) [^>]* >' % _NAME, Tok.StartTag),  # start tag <a>

    # Characters
    # https://www.w3.org/TR/xml/#sec-references
    (r'&\# [0-9]+ ;', Tok.DecChar),
    (r'&\# x[0-9a-fA-F]+ ;', Tok.HexChar),
    (r'& %s ;' % _NAME, Tok.CharEntity),

    # HTML5 allows unescaped > in raw data, but < is not allowed.
    # https://stackoverflow.com/questions/10462348/right-angle-bracket-in-html
    #
    # - My early blog has THREE errors when disallowing >
    # - So do some .wwz files
    (r'[^&<]+', Tok.RawData),
    (r'.', Tok.Invalid),  # error!
]

# Old notes:
#
# Non-greedy matches are regular and can be matched in linear time
# with RE2.
#
# https://news.ycombinator.com/item?id=27099798
#
# Maybe try combining all of these for speed.

# . is any char except newline
# https://re2c.org/manual/manual_c.html

# Discarded options:
#(r'<!-- .*? -->', Tok.Comment),

# Hack from Claude: \s\S instead of re.DOTALL. I don't like this
#(r'<!-- [\s\S]*? -->', Tok.Comment),
#(r'<!-- (?:.|[\n])*? -->', Tok.Comment),

LEXER = MakeLexer(LEXER)
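
# Rule order matters because _Peek() below takes the FIRST match, not the
# longest. For example (sketch, not in the original), '<br/>' matches the
# StartEndTag rule; if the StartTag rule came first, it would match instead,
# since its [^>]* can consume the '/'.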


class Lexer(object):

    def __init__(self, s, left_pos=0, right_pos=-1):
        self.s = s
        self.pos = left_pos
        self.right_pos = len(s) if right_pos == -1 else right_pos
        self.cache = {}  # string -> compiled regex pattern object

        # either </script> or </style> - we search until we see that
        self.search_state = None  # type: Optional[str]

        # Position of tag name, if applicable
        # - Set after you get a StartTag, EndTag, or StartEndTag
        # - Unset on other tags
        self.tag_pos_left = -1
        self.tag_pos_right = -1

    def _Peek(self):
        # type: () -> Tuple[int, int]
        """
        Note: not using _Peek() now
        """
        if self.pos == self.right_pos:
            return Tok.EndOfStream, self.pos

        assert self.pos < self.right_pos, self.pos

        if self.search_state is not None:
            pos = self.s.find(self.search_state, self.pos)
            if pos == -1:
                # unterminated <script> or <style>
                raise LexError(self.s, self.pos)
            self.search_state = None
            # Everything up to the closing tag is HtmlCData
            return Tok.HtmlCData, pos

        # Find the first match.
        # Note: frontend/match.py uses _LongestMatch(), which is different!
        # TODO: reconcile them. This lexer should be expressible in re2c.

        for pat, tok_id in LEXER:
            m = pat.match(self.s, self.pos)
            if m:
                if tok_id in (Tok.StartTag, Tok.EndTag, Tok.StartEndTag):
                    self.tag_pos_left = m.start(1)
                    self.tag_pos_right = m.end(1)
                else:
                    # Reset state
                    self.tag_pos_left = -1
                    self.tag_pos_right = -1

                if tok_id == Tok.CommentBegin:
                    pos = self.s.find('-->', self.pos)
                    if pos == -1:
                        # unterminated <!--
                        raise LexError(self.s, self.pos)
                    return Tok.Comment, pos + 3  # -->

                if tok_id == Tok.ProcessingBegin:
                    pos = self.s.find('?>', self.pos)
                    if pos == -1:
                        # unterminated <?
                        raise LexError(self.s, self.pos)
                    return Tok.Processing, pos + 2  # ?>

                if tok_id == Tok.CDataBegin:
                    pos = self.s.find(']]>', self.pos)
                    if pos == -1:
                        # unterminated <![CDATA[
                        raise LexError(self.s, self.pos)
                    return Tok.CData, pos + 3  # ]]>

                if tok_id == Tok.StartTag:
                    if self.TagNameEquals('script'):
                        self.search_state = '</script>'
                    elif self.TagNameEquals('style'):
                        self.search_state = '</style>'

                return tok_id, m.end()
        else:
            raise AssertionError('Tok.Invalid rule should have matched')

    def TagNameEquals(self, expected):
        # type: (str) -> bool
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        # TODO: In C++, this does not need an allocation
        return expected == self.s[self.tag_pos_left:self.tag_pos_right]

    def TagName(self):
        # type: () -> str
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        return self.s[self.tag_pos_left:self.tag_pos_right]

    def Read(self):
        # type: () -> Tuple[int, int]
        tok_id, end_pos = self._Peek()
        self.pos = end_pos  # advance
        return tok_id, end_pos

    def LookAhead(self, regex):
        # Cache the regex compilation. This could also be LookAheadFor(THEAD)
        # or something.
        pat = self.cache.get(regex)
        if pat is None:
            pat = re.compile(regex)
            self.cache[regex] = pat

        m = pat.match(self.s, self.pos)
        return m is not None
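
# Usage sketch (not in the original file):
#
#   lx = Lexer('<p>hi</p>')
#   while True:
#       tok_id, end_pos = lx.Read()
#       if tok_id == Tok.EndOfStream:
#           break
#       print(TokenName(tok_id), end_pos)
#
# prints: StartTag 3, RawData 5, EndTag 9.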


def _Tokens(s, left_pos, right_pos):
    """
    Args:
      s: string to parse
      left_pos, right_pos: Optional span boundaries.
    """
    lx = Lexer(s, left_pos, right_pos)
    while True:
        tok_id, pos = lx.Read()
        yield tok_id, pos
        if tok_id == Tok.EndOfStream:
            break


def ValidTokens(s, left_pos=0, right_pos=-1):
    """Wrapper around _Tokens to prevent callers from having to handle Invalid.

    I'm not combining the two functions because I might want to do a
    'yield' transformation on Tokens()?  Exceptions might complicate the
    issue?
    """
    pos = left_pos
    for tok_id, end_pos in _Tokens(s, left_pos, right_pos):
        if tok_id == Tok.Invalid:
            raise LexError(s, pos)
        yield tok_id, end_pos
        pos = end_pos
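
# Example (sketch, not in the original file):
#
#   list(ValidTokens('a &amp; b'))
#   # => [(Tok.RawData, 2), (Tok.CharEntity, 7), (Tok.RawData, 9),
#   #     (Tok.EndOfStream, 9)]
#
# Each position is the END of the token; the start is the previous end.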


# Tag names:
#   Match <a  or </a
#   Match <h2, but not <2h
#
# HTML 5 doesn't restrict tag names at all
#   https://html.spec.whatwg.org/#toc-syntax
#
# XML allows : - .
#   https://www.w3.org/TR/xml/#NT-NameChar

# Namespaces for MathML, SVG
# XLink, XML, XMLNS
#
# https://infra.spec.whatwg.org/#namespaces
#
# Allow - for td-attrs

_ATTR_VALUE = r'[a-zA-Z0-9_\-]+'  # allow hyphens

# TODO: we don't need to capture the tag name here?  That's done at the top
# level.
_TAG_RE = re.compile(r'/? \s* (%s)' % _NAME, re.VERBOSE)

# To match href="foo" or href=foo

_ATTR_RE = re.compile(
    r'''
\s+             # Leading whitespace is required
(%s)            # Attribute name
(?:             # Optional attribute value
  \s* = \s*
  (?:
    " ([^>"]*) "   # double-quoted value
    | (%s)         # unquoted value
    # TODO: relax this? for href=$foo
  )
)?
''' % (_NAME, _ATTR_VALUE), re.VERBOSE)
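
# Example (sketch): _ATTR_RE.match(' href="foo" x=y') captures
# group(1) == 'href' and group(2) == 'foo' (group(3) is None).  Matching
# again at m.end(0) picks up the unquoted x=y, with group(1) == 'x' and
# group(3) == 'y'.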

TagName, AttrName, UnquotedValue, QuotedValue = range(4)


class TagLexer(object):
    """
    Given a tag like <a href="..."> or <link type="..." />, the TagLexer
    provides a few operations:

    - What is the tag?
    - Iterate through the attributes, giving (name, value_start_pos,
      value_end_pos)
    """

    def __init__(self, s):
        self.s = s
        self.start_pos = -1  # Invalid
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        """Reuse instances of this object."""
        self.start_pos = start_pos
        self.end_pos = end_pos

    def TagString(self):
        return self.s[self.start_pos:self.end_pos]

    def TagName(self):
        # First event
        tok_id, start, end = next(self.Tokens())
        return self.s[start:end]

    def GetSpanForAttrValue(self, attr_name):
        # Algorithm: search for QuotedValue or UnquotedValue after AttrName
        # TODO: Could also cache these

        events = self.Tokens()
        val = (-1, -1)
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == AttrName:
                    name = self.s[start:end]
                    if name == attr_name:
                        # The value should come next
                        tok_id, start, end = next(events)
                        if tok_id in (QuotedValue, UnquotedValue):
                            # Note: quoted values may have &amp;
                            # We would need ANOTHER lexer to unescape them.
                            # Right now help_gen.py and oils_doc.py don't
                            # need that.
                            val = start, end
                        break

        except StopIteration:
            pass
        return val

    def GetAttrRaw(self, attr_name):
        """
        Return the value, which may be UNESCAPED.
        """
        # Algorithm: search for QuotedValue or UnquotedValue after AttrName
        # TODO: Could also cache these
        start, end = self.GetSpanForAttrValue(attr_name)
        if start == -1:
            return None
        return self.s[start:end]

    def AllAttrsRaw(self):
        """
        Get a list of pairs [('class', 'foo'), ('href', '?foo=1&amp;bar=2')]

        The quoted values may be escaped. We would need another lexer to
        unescape them.
        """
        pairs = []
        events = self.Tokens()
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == AttrName:
                    name = self.s[start:end]

                    # The value should come next
                    tok_id, start, end = next(events)
                    if tok_id in (QuotedValue, UnquotedValue):
                        # Note: quoted values may have &amp;
                        # We would need ANOTHER lexer to unescape them, but we
                        # don't need that for ul-table

                        val = self.s[start:end]
                        pairs.append((name, val))
        except StopIteration:
            pass
        return pairs

    def Tokens(self):
        """
        Yields a sequence of tokens: Tag (AttrName AttrValue?)*

        Each token is (Type, start_pos, end_pos).

        Note that start and end are NOT redundant!  We skip over some unwanted
        characters.
        """
        m = _TAG_RE.match(self.s, self.start_pos + 1)
        if not m:
            raise RuntimeError("Couldn't find HTML tag in %r" %
                               self.TagString())
        yield TagName, m.start(1), m.end(1)

        pos = m.end(0)

        while True:
            # don't search past the end
            m = _ATTR_RE.match(self.s, pos, self.end_pos)
            if not m:
                # A validating parser would check that > or /> is next --
                # there's no junk
                break

            yield AttrName, m.start(1), m.end(1)

            # Quoted is group 2, unquoted is group 3.
            if m.group(2) is not None:
                yield QuotedValue, m.start(2), m.end(2)
            elif m.group(3) is not None:
                yield UnquotedValue, m.start(3), m.end(3)

            # Advance past the whole match, including the closing quote
            pos = m.end(0)
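
# Usage sketch (not in the original file):
#
#   s = '<a href="/faq.html" class=big>'
#   tag_lexer = TagLexer(s)
#   tag_lexer.Reset(0, len(s))
#   tag_lexer.TagName()           # => 'a'
#   tag_lexer.GetAttrRaw('href')  # => '/faq.html'
#   tag_lexer.AllAttrsRaw()       # => [('href', '/faq.html'),
#                                 #     ('class', 'big')]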


def ReadUntilStartTag(it, tag_lexer, tag_name):
    """Find the next <foo>, returning its (start, end) position.

    Raise ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == Tok.StartTag and tag_lexer.TagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No start tag %r' % tag_name)


def ReadUntilEndTag(it, tag_lexer, tag_name):
    """Find the next </foo>, returning its (start, end) position.

    Raise ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == Tok.EndTag and tag_lexer.TagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No end tag %r' % tag_name)
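
# Usage sketch (not in the original file): find the span of the first
# <table> in a document.
#
#   it = ValidTokens(contents)
#   tag_lexer = TagLexer(contents)
#   start, end = ReadUntilStartTag(it, tag_lexer, 'table')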


CHAR_ENTITY = {
    'amp': '&',
    'lt': '<',
    'gt': '>',
    'quot': '"',
}


def ToText(s, left_pos=0, right_pos=-1):
    """Given HTML, return text by unquoting &gt; and &lt; etc.

    Used by:
      doctools/oils_doc.py: PygmentsPlugin
      doctools/help_gen.py: HelpIndexCards

    In the latter case, we could process some tags, like:

    - Blue Link (not clickable, but still useful)
    - Red X

    That should be html.ToAnsi.
    """
    f = StringIO()
    out = Output(s, f, left_pos, right_pos)

    pos = left_pos
    for tok_id, end_pos in ValidTokens(s, left_pos, right_pos):
        if tok_id == Tok.RawData:
            out.SkipTo(pos)
            out.PrintUntil(end_pos)

        elif tok_id == Tok.CharEntity:  # &amp;

            entity = s[pos + 1:end_pos - 1]

            out.SkipTo(pos)
            out.Print(CHAR_ENTITY[entity])
            out.SkipTo(end_pos)

        # Not handling these yet
        elif tok_id == Tok.HexChar:
            raise AssertionError('Hex Char %r' % s[pos:pos + 20])

        elif tok_id == Tok.DecChar:
            raise AssertionError('Dec Char %r' % s[pos:pos + 20])

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()
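
# Example (sketch): ToText('1 &lt; 2') => '1 < 2', and
# ToText('<b>bold</b> text') => 'bold text'; tags are skipped and entities
# listed in CHAR_ENTITY are unquoted.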


# https://developer.mozilla.org/en-US/docs/Glossary/Void_element
VOID_ELEMENTS = [
    'area',
    'base',
    'br',
    'col',
    'embed',
    'hr',
    'img',
    'input',
    'link',
    'meta',
    'param',
    'source',
    'track',
    'wbr',
]


def main(argv):
    action = argv[1]

    if action in ('lex-tags', 'lex-attrs', 'lex-attr-values', 'well-formed'):
        num_tokens = 0
        num_start_tags = 0
        num_start_end_tags = 0
        num_attrs = 0
        max_tag_stack = 0

        errors = []
        i = 0
        for line in sys.stdin:
            name = line.strip()
            with open(name) as f:
                contents = f.read()

            tag_lexer = TagLexer(contents)
            lx = Lexer(contents)
            tokens = []
            start_pos = 0
            tag_stack = []
            try:
                while True:
                    tok_id, end_pos = lx.Read()

                    if tok_id == Tok.Invalid:
                        raise LexError(contents, start_pos)
                    if tok_id == Tok.EndOfStream:
                        break

                    tokens.append((tok_id, end_pos))

                    if tok_id == Tok.StartEndTag:
                        num_start_end_tags += 1
                        if action in ('lex-attrs', 'lex-attr-values',
                                      'well-formed'):
                            tag_lexer.Reset(start_pos, end_pos)
                            all_attrs = tag_lexer.AllAttrsRaw()
                            num_attrs += len(all_attrs)
                    elif tok_id == Tok.StartTag:
                        num_start_tags += 1
                        if action in ('lex-attrs', 'lex-attr-values',
                                      'well-formed'):
                            tag_lexer.Reset(start_pos, end_pos)
                            all_attrs = tag_lexer.AllAttrsRaw()

                        tag_name = lx.TagName()
                        # Don't bother to check void elements -- they're
                        # never closed, so they aren't pushed
                        if tag_name not in VOID_ELEMENTS:
                            tag_stack.append(tag_name)
                        max_tag_stack = max(max_tag_stack, len(tag_stack))
                    elif tok_id == Tok.EndTag:
                        try:
                            expected = tag_stack.pop()
                        except IndexError:
                            raise ParseError('Tag stack empty',
                                             s=contents,
                                             start_pos=start_pos)

                        actual = lx.TagName()
                        if expected != actual:
                            raise ParseError(
                                'Expected closing tag %r, got %r' %
                                (expected, actual),
                                s=contents,
                                start_pos=start_pos)

                    start_pos = end_pos
            except LexError as e:
                log('Lex error in %r: %s', name, e)
                errors.append((name, e))
            except ParseError as e:
                log('Parse error in %r: %s', name, e)
                errors.append((name, e))
            else:
                num_tokens += len(tokens)

            #print('%d %s' % (len(tokens), name))
            i += 1

        log('')
        log(
            ' %d tokens, %d start/end tags, %d start tags, %d attrs, '
            '%d max tag stack depth in %d files', num_tokens,
            num_start_end_tags, num_start_tags, num_attrs, max_tag_stack, i)
        log(' %d errors', len(errors))
        if 0:
            for name, e in errors:
                log('Error in %r: %s', name, e)

    else:
        raise RuntimeError('Invalid action %r' % action)


if __name__ == '__main__':
    main(sys.argv)