#!/usr/bin/env python2
"""
lazylex/html.py - Low-Level HTML Processing.

See lazylex/README.md for details.
"""
from __future__ import print_function

try:
    from cStringIO import StringIO
except ImportError:
    from io import StringIO  # python3
import re
import sys

if sys.version_info.major == 2:
    from typing import List, Tuple, Optional


def log(msg, *args):
    msg = msg % args
    print(msg, file=sys.stderr)


class LexError(Exception):
    """
    Examples of lex errors:

    - Tok.Invalid, like <> or &&
    - Unclosed <!-- <? <![CDATA[ <script> <style>
    """

    def __init__(self, s, start_pos):
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        return '(LexError %r)' % (self.s[self.start_pos:self.start_pos + 20])


def FindLineNum(s, error_pos):
    current_pos = 0
    line_num = 1
    while True:
        newline_pos = s.find('\n', current_pos)
        #log('current = %d, N %d, line %d', current_pos, newline_pos, line_num)

        if newline_pos == -1:  # this is the last line
            return line_num
        if newline_pos >= error_pos:
            return line_num
        line_num += 1
        current_pos = newline_pos + 1


class ParseError(Exception):
    """
    Examples of parse errors

    - unbalanced tag structure
    - ul_table.py errors
    """

    def __init__(self, msg, s=None, start_pos=-1):
        self.msg = msg
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        if self.s is not None:
            assert self.start_pos != -1, self.start_pos
            snippet = (self.s[self.start_pos:self.start_pos + 20])

            line_num = FindLineNum(self.s, self.start_pos)
        else:
            snippet = ''
            line_num = -1
        msg = 'line %d: %r %r' % (line_num, self.msg, snippet)
        return msg


class Output(object):
    """Takes an underlying input buffer and an output file. Maintains a
    position in the input buffer.

    Print FROM the input or print new text to the output.
    """

    def __init__(self, s, f, left_pos=0, right_pos=-1):
        self.s = s
        self.f = f
        self.pos = left_pos
        self.right_pos = len(s) if right_pos == -1 else right_pos

    def SkipTo(self, pos):
        """Skip to a position."""
        self.pos = pos

    def PrintUntil(self, pos):
        """Print until a position."""
        piece = self.s[self.pos:pos]
        self.f.write(piece)
        self.pos = pos

    def PrintTheRest(self):
        """Print until the end of the string."""
        self.PrintUntil(self.right_pos)

    def Print(self, s):
        """Print text to the underlying buffer."""
        self.f.write(s)


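# Usage sketch (illustrative, not part of the module API): Output is meant for
# "edit by copying" - skip over spans of the input you want to drop or
# rewrite, and copy the rest through unchanged.
#
#   f = StringIO()
#   out = Output('<b>hi</b>', f)
#   out.PrintUntil(3)    # copy '<b>'
#   out.Print('HI')      # emit replacement text
#   out.SkipTo(5)        # skip over 'hi' in the input
#   out.PrintTheRest()   # copy '</b>'
#   # f.getvalue() == '<b>HI</b>'
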
# HTML Tokens
# CommentBegin, ProcessingBegin, CDataBegin are "pseudo-tokens", not visible
TOKENS = 'Decl Comment CommentBegin Processing ProcessingBegin CData CDataBegin StartTag StartEndTag EndTag DecChar HexChar CharEntity RawData HtmlCData BadAmpersand Invalid EndOfStream'.split(
)


class Tok(object):
    """
    Avoid lint errors by using these aliases
    """
    pass


TOKEN_NAMES = [None] * len(TOKENS)  # type: List[str]

this_module = sys.modules[__name__]
for i, tok_str in enumerate(TOKENS):
    setattr(this_module, tok_str, i)
    setattr(Tok, tok_str, i)
    TOKEN_NAMES[i] = tok_str


def TokenName(tok_id):
    return TOKEN_NAMES[tok_id]

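# The loop above turns each name in TOKENS into a small integer constant,
# available both as a module attribute and as a class attribute on Tok.
# For example (the values depend only on the order of TOKENS):
#
#   Tok.StartTag              # -> an int, e.g. 7
#   TokenName(Tok.StartTag)   # -> 'StartTag'
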

def MakeLexer(rules):
    return [(re.compile(pat, re.VERBOSE), i) for (pat, i) in rules]


#
# Eggex
#
# Tag = / ~['>']+ /

# Is this valid? A single character?
# Tag = / ~'>'* /

# Maybe better: / [NOT '>']+/
# capital letters not allowed there?
#
# But then this is confusing:
# / [NOT ~digit]+/
#
# / [NOT digit] / is [^\d]
# / ~digit / is \D
#
# Or maybe:
#
# / [~ digit]+ /
# / [~ '>']+ /
# / [NOT '>']+ /

# End = / '</' Tag '>' /
# StartEnd = / '<' Tag '/>' /
# Start = / '<' Tag '>' /
#
# EntityRef = / '&' dot{* N} ';' /

# Tag name, or attribute name
# colon is used in XML

# https://www.w3.org/TR/xml/#NT-Name
# Hm there is a lot of unicode stuff. We are simplifying parsing

_NAME = r'[a-zA-Z][a-zA-Z0-9:_\-]*'  # must start with letter

CHAR_LEX = [
    # Characters
    # https://www.w3.org/TR/xml/#sec-references
    (r'&\# [0-9]+ ;', Tok.DecChar),
    (r'&\# x[0-9a-fA-F]+ ;', Tok.HexChar),
    (r'& %s ;' % _NAME, Tok.CharEntity),
    # Allow unquoted, and quoted
    (r'&', Tok.BadAmpersand),
]

LEXER = CHAR_LEX + [
    (r'<!--', Tok.CommentBegin),

    # Processing instructions are used for the XML header:
    # <?xml version="1.0" encoding="UTF-8"?>
    # They are technically XML-only, but in HTML5, they are another kind of
    # comment:
    #
    # https://developer.mozilla.org/en-US/docs/Web/API/ProcessingInstruction
    #
    (r'<\?', Tok.ProcessingBegin),
    # Not necessary in HTML5, but occurs in XML
    (r'<!\[CDATA\[', Tok.CDataBegin),  # <![CDATA[

    # Markup declarations
    # - In HTML5, there is only <!DOCTYPE html>
    # - XML has 4 more declarations: <!ELEMENT ...> ATTLIST ENTITY NOTATION
    #   - these seem to be part of DTD
    # - it's useful to skip these, and be able to parse the rest of the document
    # - Note: < is allowed?
    (r'<! [^>\x00]+ >', Tok.Decl),

    # Tags
    # Notes:
    # - We look for a valid tag name, but we don't validate attributes.
    #   That's done in the tag lexer.
    # - We don't allow leading whitespace
    (r'</ (%s) >' % _NAME, Tok.EndTag),  # end tag </a>
    # self-closing <br/> comes before StartTag
    # could/should these be collapsed into one rule?
    (r'< (%s) [^>\x00]* />' % _NAME, Tok.StartEndTag),  # self-closing <br/>
    (r'< (%s) [^>\x00]* >' % _NAME, Tok.StartTag),  # start tag <a>

    # HTML5 allows unescaped > in raw data, but < is not allowed.
    # https://stackoverflow.com/questions/10462348/right-angle-bracket-in-html
    #
    # - My early blog has THREE errors when disallowing >
    # - So do some .wwz files
    (r'[^&<\x00]+', Tok.RawData),
    (r'.', Tok.Invalid),  # error!
]

# Old notes:
#
# Non-greedy matches are regular and can be matched in linear time
# with RE2.
#
# https://news.ycombinator.com/item?id=27099798
#
# Maybe try combining all of these for speed.

# . is any char except newline
# https://re2c.org/manual/manual_c.html

# Discarded options
#(r'<!-- .*? -->', Tok.Comment),

# Hack from Claude: \s\S instead of re.DOTALL. I don't like this
#(r'<!-- [\s\S]*? -->', Tok.Comment),
#(r'<!-- (?:.|[\n])*? -->', Tok.Comment),

LEXER = MakeLexer(LEXER)


class Lexer(object):

    def __init__(self, s, left_pos=0, right_pos=-1, no_special_tags=False):
        self.s = s
        self.pos = left_pos
        self.right_pos = len(s) if right_pos == -1 else right_pos
        self.no_special_tags = no_special_tags

        self.cache = {}  # string -> compiled regex pattern object

        # either </script> or </style> - we search until we see that
        self.search_state = None  # type: Optional[str]

        # Position of tag name, if applicable
        # - Set after you get a StartTag, EndTag, or StartEndTag
        # - Unset on other tags
        self.tag_pos_left = -1
        self.tag_pos_right = -1

    def _Peek(self):
        # type: () -> Tuple[int, int]
        """
        Note: not using _Peek() now
        """
        if self.pos == self.right_pos:
            return Tok.EndOfStream, self.pos

        assert self.pos < self.right_pos, self.pos

        if self.search_state is not None and not self.no_special_tags:
            # TODO: case-insensitive search for </SCRIPT> <SCRipt> ?
            #
            # Another strategy: enter a mode where we find ONLY the end tag
            # regex, and any data that's not <, and then check the canonical
            # tag name for 'script' or 'style'.
            pos = self.s.find(self.search_state, self.pos)
            if pos == -1:
                # unterminated <script> or <style>
                raise LexError(self.s, self.pos)
            self.search_state = None
            # beginning
            return Tok.HtmlCData, pos

        # Find the first match.
        # Note: frontend/match.py uses _LongestMatch(), which is different!
        # TODO: reconcile them. This lexer should be expressible in re2c.

        for pat, tok_id in LEXER:
            m = pat.match(self.s, self.pos)
            if m:
                if tok_id in (Tok.StartTag, Tok.EndTag, Tok.StartEndTag):
                    self.tag_pos_left = m.start(1)
                    self.tag_pos_right = m.end(1)
                else:
                    # Reset state
                    self.tag_pos_left = -1
                    self.tag_pos_right = -1

                if tok_id == Tok.CommentBegin:
                    pos = self.s.find('-->', self.pos)
                    if pos == -1:
                        # unterminated <!--
                        raise LexError(self.s, self.pos)
                    return Tok.Comment, pos + 3  # -->

                if tok_id == Tok.ProcessingBegin:
                    pos = self.s.find('?>', self.pos)
                    if pos == -1:
                        # unterminated <?
                        raise LexError(self.s, self.pos)
                    return Tok.Processing, pos + 2  # ?>

                if tok_id == Tok.CDataBegin:
                    pos = self.s.find(']]>', self.pos)
                    if pos == -1:
                        # unterminated <![CDATA[
                        raise LexError(self.s, self.pos)
                    return Tok.CData, pos + 3  # ]]>

                if tok_id == Tok.StartTag:
                    # TODO: reduce allocations
                    if (self.TagNameEquals('script') or
                            self.TagNameEquals('style')):
                        # <SCRipt a=b> -> </SCRipt>
                        self.search_state = '</' + self._LiteralTagName() + '>'

                return tok_id, m.end()
        else:
            raise AssertionError('Tok.Invalid rule should have matched')

    def TagNameEquals(self, expected):
        # type: (str) -> bool
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        # TODO: In C++, this does not need an allocation. Can we test
        # directly?
        return expected == self.CanonicalTagName()

    def _LiteralTagName(self):
        # type: () -> str
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        return self.s[self.tag_pos_left:self.tag_pos_right]

    def CanonicalTagName(self):
        # type: () -> str
        tag_name = self._LiteralTagName()
        # Most tags are already lower case, so avoid allocation with this conditional
        # TODO: this could go in the mycpp runtime?
        if tag_name.islower():
            return tag_name
        else:
            return tag_name.lower()

    def Read(self):
        # type: () -> Tuple[int, int]
        tok_id, end_pos = self._Peek()
        self.pos = end_pos  # advance
        return tok_id, end_pos

    def LookAhead(self, regex):
        # Cache the regex compilation. This could also be LookAheadFor(THEAD)
        # or something.
        pat = self.cache.get(regex)
        if pat is None:
            pat = re.compile(regex)
            self.cache[regex] = pat

        m = pat.match(self.s, self.pos)
        return m is not None


def _Tokens(s, left_pos, right_pos):
    """
    Args:
      s: string to parse
      left_pos, right_pos: Optional span boundaries.
    """
    lx = Lexer(s, left_pos, right_pos)
    while True:
        tok_id, pos = lx.Read()
        yield tok_id, pos
        if tok_id == Tok.EndOfStream:
            break


def ValidTokens(s, left_pos=0, right_pos=-1):
    """Wrapper around _Tokens to prevent callers from having to handle Invalid.

    I'm not combining the two functions because I might want to do a
    'yield' transformation on Tokens()?  Exceptions might complicate the
    issue?
    """
    pos = left_pos
    for tok_id, end_pos in _Tokens(s, left_pos, right_pos):
        if tok_id == Tok.Invalid:
            raise LexError(s, pos)
        yield tok_id, end_pos
        pos = end_pos


def ValidTokenList(s, no_special_tags=False):
    """A wrapper that can be more easily translated to C++. Doesn't use iterators."""

    start_pos = 0
    tokens = []
    lx = Lexer(s, no_special_tags=no_special_tags)
    while True:
        tok_id, end_pos = lx.Read()
        tokens.append((tok_id, end_pos))
        if tok_id == Tok.EndOfStream:
            break
        if tok_id == Tok.Invalid:
            raise LexError(s, start_pos)
        start_pos = end_pos
    return tokens

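# Illustration (not part of the module): the lexer yields (tok_id, end_pos)
# pairs, and a token's text is s[start:end] where start is the previous
# token's end_pos.  For example:
#
#   ValidTokenList('<p>hi</p>')
#   # -> roughly [(Tok.StartTag, 3), (Tok.RawData, 5),
#   #             (Tok.EndTag, 9), (Tok.EndOfStream, 9)]
#
# Special tags: after a <script> or <style> StartTag, the lexer searches for
# the literal end tag and returns everything in between as one Tok.HtmlCData
# token (unless no_special_tags=True).
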
# Tag names:
# Match <a or </a
# Match <h2, but not <2h
#
# HTML 5 doesn't restrict tag names at all
# https://html.spec.whatwg.org/#toc-syntax
#
# XML allows : - .
# https://www.w3.org/TR/xml/#NT-NameChar

# Namespaces for MathML, SVG
# XLink, XML, XMLNS
#
# https://infra.spec.whatwg.org/#namespaces
#
# Allow - for td-attrs

# Be very lenient - just no whitespace or special HTML chars
# I don't think this is more lenient than HTML5, though we should check.
_UNQUOTED_VALUE = r'''[^ \t\r\n<>&"'\x00]*'''

# TODO: we don't need to capture the tag name here? That's done at the top
# level
_TAG_RE = re.compile(r'/? \s* (%s)' % _NAME, re.VERBOSE)

_TAG_LAST_RE = re.compile(r'\s* /? >', re.VERBOSE)

# To match href="foo"
# Note: in HTML5 and XML, single quoted attributes are also valid

# <button disabled> is standard usage

# NOTE: This used to allow whitespace around =
# <a foo = "bar"> makes sense in XML
# But then you also have
# <a foo= bar> - which is TWO attributes, in HTML5
# So the space is problematic

_ATTR_RE = re.compile(
    r'''
\s+                   # Leading whitespace is required
(%s)                  # Attribute name
(?:                   # Optional attribute value
  \s* = \s*           # Spaces allowed around =
  (?:
    " ([^>"\x00]*) "  # double quoted value
  | ' ([^>'\x00]*) '  # single quoted value
  | (%s)              # Attribute value
  )
)?
''' % (_NAME, _UNQUOTED_VALUE), re.VERBOSE)

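# Illustration (hypothetical input): how the capture groups of _ATTR_RE line
# up with the token types yielded by TagLexer.Tokens() below.
#
#   m = _ATTR_RE.match(' href="foo"')
#   m.group(1)  # -> 'href'  (attribute name)
#   m.group(2)  # -> 'foo'   (double quoted value)
#
# group(3) is set for single quoted values, group(4) for unquoted values, and
# all three are None for a bare attribute like <button disabled>.
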
TagName, AttrName, UnquotedValue, QuotedValue, MissingValue = range(5)


class TagLexer(object):
    """
    Given a tag like <a href="..."> or <link type="..." />, the TagLexer
    provides a few operations:

    - What is the tag?
    - Iterate through the attributes, giving (name, value_start_pos, value_end_pos)
    """

    def __init__(self, s):
        self.s = s
        self.start_pos = -1  # Invalid
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        """Reuse instances of this object."""
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos

        self.start_pos = start_pos
        self.end_pos = end_pos

    def TagString(self):
        return self.s[self.start_pos:self.end_pos]

    def TagName(self):
        # First event
        tok_id, start, end = next(self.Tokens())
        return self.s[start:end]

    def GetSpanForAttrValue(self, attr_name):
        """
        Used by oils_doc.py, for href shortcuts
        """
        # Algorithm: search for QuotedValue or UnquotedValue after AttrName
        # TODO: Could also cache these

        events = self.Tokens()
        val = (-1, -1)
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == AttrName:
                    name = self.s[start:end]
                    if name == attr_name:
                        # The value should come next
                        tok_id, start, end = next(events)
                        assert tok_id in (QuotedValue, UnquotedValue,
                                          MissingValue), TokenName(tok_id)
                        val = start, end
                        break

        except StopIteration:
            pass
        return val

    def GetAttrRaw(self, attr_name):
        """
        Return the value, which may be UNESCAPED.
        """
        start, end = self.GetSpanForAttrValue(attr_name)
        if start == -1:
            return None
        return self.s[start:end]

    def AllAttrsRawSlice(self):
        """
        Get a list of triples [('class', 3, 5), ('href', 9, 12)]
        """
        slices = []
        events = self.Tokens()
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == AttrName:
                    name = self.s[start:end]

                    # The value should come next
                    tok_id, start, end = next(events)
                    assert tok_id in (QuotedValue, UnquotedValue,
                                      MissingValue), TokenName(tok_id)
                    # Note: quoted values may have &amp;
                    # We would need ANOTHER lexer to unescape them, but we
                    # don't need that for ul-table
                    slices.append((name, start, end))
        except StopIteration:
            pass
        return slices

    def AllAttrsRaw(self):
        """
        Get a list of pairs [('class', 'foo'), ('href', '?foo=1&amp;bar=2')]

        The quoted values may be escaped. We would need another lexer to
        unescape them.
        """
        slices = self.AllAttrsRawSlice()
        pairs = []
        for name, start, end in slices:
            pairs.append((name, self.s[start:end]))
        return pairs

    def Tokens(self):
        """
        Yields a sequence of tokens: Tag (AttrName AttrValue?)*

        Where each Token is (Type, start_pos, end_pos)

        Note that start and end are NOT redundant! We skip over some unwanted
        characters.
        """
        m = _TAG_RE.match(self.s, self.start_pos + 1)
        if not m:
            raise RuntimeError("Couldn't find HTML tag in %r" %
                               self.TagString())
        yield TagName, m.start(1), m.end(1)

        pos = m.end(0)
        #log('POS %d', pos)

        while True:
            # don't search past the end
            m = _ATTR_RE.match(self.s, pos, self.end_pos)
            if not m:
                #log('BREAK pos %d', pos)
                break
            #log('AttrName %r', m.group(1))

            yield AttrName, m.start(1), m.end(1)

            #log('m.groups() %r', m.groups())
            if m.group(2) is not None:
                # double quoted
                yield QuotedValue, m.start(2), m.end(2)
            elif m.group(3) is not None:
                # single quoted - TODO: could have different token types
                yield QuotedValue, m.start(3), m.end(3)
            elif m.group(4) is not None:
                yield UnquotedValue, m.start(4), m.end(4)
            else:
                # <button disabled>
                end = m.end(0)
                yield MissingValue, end, end

            # Skip past the "
            pos = m.end(0)

        #log('TOK %r', self.s)

        m = _TAG_LAST_RE.match(self.s, pos)
        #log('_TAG_LAST_RE match %r', self.s[pos:])
        if not m:
            # Extra data at end of tag. TODO: add messages for all these.
            raise LexError(self.s, pos)


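# Usage sketch (illustrative): TagLexer works on a span of the original
# document, typically the span of a StartTag token from Lexer.
#
#   s = '<a href="foo">'
#   tag_lexer = TagLexer(s)
#   tag_lexer.Reset(0, len(s))
#   tag_lexer.TagName()           # -> 'a'
#   tag_lexer.GetAttrRaw('href')  # -> 'foo'
#   tag_lexer.AllAttrsRaw()       # -> [('href', 'foo')]
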
# This is similar but not identical to
#    " ([^>"\x00]*) "  # double quoted value
#  | ' ([^>'\x00]*) '  # single quoted value
#
# Note: for unquoted values, & isn't allowed, and thus &amp; and &#99; and
# &#x99; are not allowed. We could relax that?
ATTR_VALUE_LEXER = CHAR_LEX + [
    (r'[^>&\x00]+', Tok.RawData),
    (r'.', Tok.Invalid),
]

ATTR_VALUE_LEXER = MakeLexer(ATTR_VALUE_LEXER)


class AttrValueLexer(object):
    """
    <a href="foo=99&amp;bar">
    <a href='foo=99&amp;bar'>
    <a href=unquoted>
    """

    def __init__(self, s):
        self.s = s
        self.start_pos = -1  # Invalid
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        """Reuse instances of this object."""
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos

        self.start_pos = start_pos
        self.end_pos = end_pos

    def NumTokens(self):
        num_tokens = 0
        pos = self.start_pos
        for tok_id, end_pos in self.Tokens():
            if tok_id == Tok.Invalid:
                raise LexError(self.s, pos)
            pos = end_pos
            #log('pos %d', pos)
            num_tokens += 1
        return num_tokens

    def Tokens(self):
        pos = self.start_pos
        while pos < self.end_pos:
            # Find the first match, like above.
            # Note: frontend/match.py uses _LongestMatch(), which is different!
            # TODO: reconcile them. This lexer should be expressible in re2c.
            for pat, tok_id in ATTR_VALUE_LEXER:
                m = pat.match(self.s, pos)
                if m:
                    if 0:
                        tok_str = m.group(0)
                        log('token = %r', tok_str)

                    end_pos = m.end(0)
                    yield tok_id, end_pos
                    pos = end_pos
                    break
            else:
                raise AssertionError('Tok.Invalid rule should have matched')

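# Usage sketch (illustrative): the attribute value 'foo=99&amp;bar' lexes
# into three tokens - RawData, CharEntity, RawData - so NumTokens() both
# validates the value and reports how finely it splits.
#
#   s = '<a href="foo=99&amp;bar">'
#   val_lexer = AttrValueLexer(s)
#   val_lexer.Reset(9, 23)   # span of foo=99&amp;bar
#   val_lexer.NumTokens()    # -> 3
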

def ReadUntilStartTag(it, tag_lexer, tag_name):
    """Find the next <foo>, returning its (start, end) positions

    Raise ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == Tok.StartTag and tag_lexer.TagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No start tag %r' % tag_name)


def ReadUntilEndTag(it, tag_lexer, tag_name):
    """Find the next </foo>, returning its (start, end) position

    Raise ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == Tok.EndTag and tag_lexer.TagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No end tag %r' % tag_name)


CHAR_ENTITY = {
    'amp': '&',
    'lt': '<',
    'gt': '>',
    'quot': '"',
    'apos': "'",
}


def ToText(s, left_pos=0, right_pos=-1):
    """Given HTML, return text by unquoting &gt; and &lt; etc.

    Used by:
      doctools/oils_doc.py: PygmentsPlugin
      doctools/help_gen.py: HelpIndexCards

    In the latter case, we could process some tags, like:

    - Blue Link (not clickable, but still useful)
    - Red X

    That should be html.ToAnsi.
    """
    f = StringIO()
    out = Output(s, f, left_pos, right_pos)

    pos = left_pos
    for tok_id, end_pos in ValidTokens(s, left_pos, right_pos):
        if tok_id in (Tok.RawData, Tok.BadAmpersand):
            out.SkipTo(pos)
            out.PrintUntil(end_pos)

        elif tok_id == Tok.CharEntity:  # &amp;

            entity = s[pos + 1:end_pos - 1]

            out.SkipTo(pos)
            out.Print(CHAR_ENTITY[entity])
            out.SkipTo(end_pos)

        # Not handling these yet
        elif tok_id == Tok.HexChar:
            raise AssertionError('Hex Char %r' % s[pos:pos + 20])

        elif tok_id == Tok.DecChar:
            raise AssertionError('Dec Char %r' % s[pos:pos + 20])

        else:
            # Skip everything else
            out.SkipTo(end_pos)

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()


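# Usage sketch (illustrative):
#
#   ToText('<b>x &amp; y</b>')   # -> 'x & y'
#
# Tags are dropped, and the named entities in CHAR_ENTITY are decoded;
# numeric entities like &#99; currently raise AssertionError.
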
# https://developer.mozilla.org/en-US/docs/Glossary/Void_element
VOID_ELEMENTS = [
    'area',
    'base',
    'br',
    'col',
    'embed',
    'hr',
    'img',
    'input',
    'link',
    'meta',
    'param',
    'source',
    'track',
    'wbr',
]

LEX_ATTRS = 1 << 1
LEX_QUOTED_VALUES = 1 << 2  # href="?x=42&amp;y=99"
NO_SPECIAL_TAGS = 1 << 3  # <script> <style>, VOID tags, etc.
BALANCED_TAGS = 1 << 4  # are tags balanced?


def Validate(contents, flags, counters):
    # type: (str, int, Counters) -> None

    tag_lexer = TagLexer(contents)
    val_lexer = AttrValueLexer(contents)

    no_special_tags = bool(flags & NO_SPECIAL_TAGS)
    lx = Lexer(contents, no_special_tags=no_special_tags)
    tokens = []
    start_pos = 0
    tag_stack = []
    while True:
        tok_id, end_pos = lx.Read()
        #log('TOP %s %r', TokenName(tok_id), contents[start_pos:end_pos])

        if tok_id == Tok.Invalid:
            raise LexError(contents, start_pos)
        if tok_id == Tok.EndOfStream:
            break

        tokens.append((tok_id, end_pos))

        if tok_id == Tok.StartEndTag:
            counters.num_start_end_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRawSlice()
            counters.num_attrs += len(all_attrs)
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                counters.num_val_tokens += val_lexer.NumTokens()

            counters.debug_attrs.extend(all_attrs)

        elif tok_id == Tok.StartTag:
            counters.num_start_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRawSlice()
            counters.num_attrs += len(all_attrs)
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                counters.num_val_tokens += val_lexer.NumTokens()

            counters.debug_attrs.extend(all_attrs)

            if flags & BALANCED_TAGS:
                tag_name = lx.CanonicalTagName()
                if flags & NO_SPECIAL_TAGS:
                    tag_stack.append(tag_name)
                else:
                    # e.g. <meta> is considered self-closing, like <meta/>
                    if tag_name not in VOID_ELEMENTS:
                        tag_stack.append(tag_name)

                counters.max_tag_stack = max(counters.max_tag_stack,
                                             len(tag_stack))
        elif tok_id == Tok.EndTag:
            if flags & BALANCED_TAGS:
                try:
                    expected = tag_stack.pop()
                except IndexError:
                    raise ParseError('Tag stack empty',
                                     s=contents,
                                     start_pos=start_pos)

                actual = lx.CanonicalTagName()
                if expected != actual:
                    raise ParseError(
                        'Got unexpected closing tag %r; opening tag was %r' %
                        (contents[start_pos:end_pos], expected),
                        s=contents,
                        start_pos=start_pos)

        start_pos = end_pos

    if len(tag_stack) != 0:
        raise ParseError('Missing closing tags at end of doc: %s' %
                         ' '.join(tag_stack),
                         s=contents,
                         start_pos=start_pos)

    counters.num_tokens += len(tokens)


def ToXml(h):
    # type: (str) -> str

    # TODO:
    # 1. Lex it
    # 2. < & > must be escaped
    #    a. in raw data
    #    b. in quoted strings
    # 3. <script> turned into CDATA
    # 4. void tags turned into self-closing tags
    # 5. case-sensitive tag matching - not sure about this
    return h


class Counters(object):

    def __init__(self):
        self.num_tokens = 0
        self.num_start_tags = 0
        self.num_start_end_tags = 0
        self.num_attrs = 0
        self.max_tag_stack = 0
        self.num_val_tokens = 0

        self.debug_attrs = []


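# Usage sketch (illustrative): Validate() is the driver used by main() below.
#
#   counters = Counters()
#   Validate('<p class="x">hi</p>',
#            LEX_ATTRS | LEX_QUOTED_VALUES | BALANCED_TAGS, counters)
#   counters.num_start_tags   # -> 1
#   counters.num_attrs        # -> 1
#
# With BALANCED_TAGS set, a mismatched or missing closing tag raises
# ParseError instead.

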
def main(argv):
    action = argv[1]

    if action == 'tokens':
        contents = sys.stdin.read()

        lx = Lexer(contents)
        start_pos = 0
        while True:
            tok_id, end_pos = lx.Read()
            if tok_id == Tok.Invalid:
                raise LexError(contents, start_pos)
            if tok_id == Tok.EndOfStream:
                break

            frag = contents[start_pos:end_pos]
            log('%d %s %r', end_pos, TokenName(tok_id), frag)
            start_pos = end_pos

        return 0

    elif action in ('lex-htm8', 'parse-htm8', 'parse-xml'):

        errors = []
        counters = Counters()

        flags = LEX_ATTRS | LEX_QUOTED_VALUES
        if action.startswith('parse-'):
            flags |= BALANCED_TAGS
        if action == 'parse-xml':
            flags |= NO_SPECIAL_TAGS

        i = 0
        for line in sys.stdin:
            filename = line.strip()
            with open(filename) as f:
                contents = f.read()

            try:
                Validate(contents, flags, counters)
            except LexError as e:
                log('Lex error in %r: %s', filename, e)
                errors.append((filename, e))
            except ParseError as e:
                log('Parse error in %r: %s', filename, e)
                errors.append((filename, e))
            i += 1

        log('')
        log('%10d tokens', counters.num_tokens)
        log('%10d start/end tags', counters.num_start_end_tags)
        log('%10d start tags', counters.num_start_tags)
        log('%10d attrs', counters.num_attrs)
        log('%10d max tag stack depth', counters.max_tag_stack)
        log('%10d attr val tokens', counters.num_val_tokens)
        log('%10d errors', len(errors))
        if len(errors):
            return 1
        return 0

    elif action == 'todo':
        # Other algorithms:
        #
        # - select first subtree with given ID
        #   - this requires understanding the void tags I suppose
        # - select all subtrees that have a class
        # - materialize DOM

        # Safe-HTM8? This is a filter
        return 0

    else:
        raise RuntimeError('Invalid action %r' % action)


if __name__ == '__main__':
    sys.exit(main(sys.argv))