#!/usr/bin/env python2
"""
lazylex/html.py - Low-Level HTML Processing.

See lazylex/README.md for details.
"""
from __future__ import print_function

try:
    from cStringIO import StringIO
except ImportError:
    from io import StringIO  # python3
import re
import sys

if sys.version_info.major == 2:
    from typing import List, Tuple, Optional


def log(msg, *args):
    msg = msg % args
    print(msg, file=sys.stderr)


class LexError(Exception):
    """
    Examples of lex errors:

    - Tok.Invalid, like <> or &&
    - Unclosed <!-- <? <![CDATA[ <script> <style>
    """

    def __init__(self, s, start_pos):
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        return '(LexError %r)' % (self.s[self.start_pos:self.start_pos + 20])


def FindLineNum(s, error_pos):
    current_pos = 0
    line_num = 1
    while True:
        newline_pos = s.find('\n', current_pos)
        #log('current = %d, N %d, line %d', current_pos, newline_pos, line_num)

        if newline_pos == -1:  # this is the last line
            return line_num
        if newline_pos >= error_pos:
            return line_num
        line_num += 1
        current_pos = newline_pos + 1


class ParseError(Exception):
    """
    Examples of parse errors:

    - unbalanced tag structure
    - ul_table.py errors
    """

    def __init__(self, msg, s=None, start_pos=-1):
        self.msg = msg
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        if self.s is not None:
            assert self.start_pos != -1, self.start_pos
            snippet = (self.s[self.start_pos:self.start_pos + 20])

            line_num = FindLineNum(self.s, self.start_pos)
        else:
            snippet = ''
            line_num = -1
        msg = 'line %d: %r %r' % (line_num, self.msg, snippet)
        return msg


class Output(object):
    """Takes an underlying input buffer and an output file.  Maintains a
    position in the input buffer.

    Print spans FROM the input buffer, or print new text to the output.
    """

    def __init__(self, s, f, left_pos=0, right_pos=-1):
        self.s = s
        self.f = f
        self.pos = left_pos
        self.right_pos = len(s) if right_pos == -1 else right_pos

    def SkipTo(self, pos):
        """Skip to a position in the input, without printing."""
        self.pos = pos

    def PrintUntil(self, pos):
        """Print the input from the current position up to pos."""
        piece = self.s[self.pos:pos]
        self.f.write(piece)
        self.pos = pos

    def PrintTheRest(self):
        """Print the rest of the input, up to right_pos."""
        self.PrintUntil(self.right_pos)

    def Print(self, s):
        """Print new text directly to the output file."""
        self.f.write(s)
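

# Illustrative sketch, not part of the original module: how Output is meant
# to be used.  _DemoOutput is a hypothetical name.
def _DemoOutput():
    s = '<b>bold</b>'
    f = StringIO()
    out = Output(s, f)
    out.PrintUntil(3)  # copy '<b>' from the input
    out.Print('BOLD')  # print replacement text to the output
    out.SkipTo(7)  # skip over 'bold' in the input
    out.PrintTheRest()  # copy '</b>'
    return f.getvalue()  # '<b>BOLD</b>'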


# HTML Tokens
# CommentBegin, ProcessingBegin, CDataBegin are "pseudo-tokens", not visible
TOKENS = 'Decl Comment CommentBegin Processing ProcessingBegin CData CDataBegin StartTag StartEndTag EndTag DecChar HexChar CharEntity RawData HtmlCData Invalid EndOfStream'.split()


class Tok(object):
    """
    Avoid lint errors by using these aliases
    """
    pass


TOKEN_NAMES = [None] * len(TOKENS)  # type: List[str]

this_module = sys.modules[__name__]
for i, tok_str in enumerate(TOKENS):
    setattr(this_module, tok_str, i)
    setattr(Tok, tok_str, i)
    TOKEN_NAMES[i] = tok_str


def TokenName(tok_id):
    return TOKEN_NAMES[tok_id]
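

# Illustrative check, not part of the original module: the loop above makes
# each token name an integer attribute on both Tok and this module, with
# TOKEN_NAMES as the inverse mapping.
def _DemoTokenNames():
    assert isinstance(Tok.StartTag, int)
    assert TokenName(Tok.StartTag) == 'StartTag'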


def MakeLexer(rules):
    return [(re.compile(pat, re.VERBOSE), i) for (pat, i) in rules]


#
# Eggex
#
# Tag = / ~['>']+ /

# Is this valid?  A single character?
# Tag = / ~'>'* /

# Maybe better: / [NOT '>']+/
# capital letters not allowed there?
#
# But then this is confusing:
# / [NOT ~digit]+/
#
# / [NOT digit] / is [^\d]
# / ~digit / is \D
#
# Or maybe:
#
# / [~ digit]+ /
# / [~ '>']+ /
# / [NOT '>']+ /

# End = / '</' Tag '>' /
# StartEnd = / '<' Tag '/>' /
# Start = / '<' Tag '>' /
#
# EntityRef = / '&' dot{* N} ';' /

# Tag name, or attribute name
# colon is used in XML

# https://www.w3.org/TR/xml/#NT-Name
# Hm, there is a lot of Unicode stuff there.  We are simplifying parsing.

_NAME = r'[a-zA-Z][a-zA-Z0-9:_\-]*'  # must start with letter

CHAR_LEX = [
    # Characters
    # https://www.w3.org/TR/xml/#sec-references
    (r'&\# [0-9]+ ;', Tok.DecChar),
    (r'&\# x[0-9a-fA-F]+ ;', Tok.HexChar),
    (r'& %s ;' % _NAME, Tok.CharEntity),
]

LEXER = CHAR_LEX + [
    (r'<!--', Tok.CommentBegin),

    # Processing instructions are used for the XML header:
    # <?xml version="1.0" encoding="UTF-8"?>
    # They are technically XML-only, but in HTML5, they are another kind of
    # comment:
    #
    # https://developer.mozilla.org/en-US/docs/Web/API/ProcessingInstruction
    #
    (r'<\?', Tok.ProcessingBegin),
    # Not necessary in HTML5, but occurs in XML
    (r'<!\[CDATA\[', Tok.CDataBegin),  # <![CDATA[

    # Markup declarations
    # - In HTML5, there is only <!DOCTYPE html>
    # - XML has 4 more declarations: <!ELEMENT ...> ATTLIST ENTITY NOTATION
    #   - these seem to be part of DTD
    # - it's useful to skip these, and be able to parse the rest of the document
    # - Note: < is allowed?
    (r'<! [^>\x00]+ >', Tok.Decl),

    # Tags
    # Notes:
    # - We look for a valid tag name, but we don't validate attributes.
    #   That's done in the tag lexer.
    # - We don't allow leading whitespace
    (r'</ (%s) >' % _NAME, Tok.EndTag),  # end </a>
    # self-closing <br/> comes before StartTag
    # could/should these be collapsed into one rule?
    (r'< (%s) [^>\x00]* />' % _NAME, Tok.StartEndTag),  # self-closing <br/>
    (r'< (%s) [^>\x00]* >' % _NAME, Tok.StartTag),  # start <a>

    # HTML5 allows unescaped > in raw data, but < is not allowed.
    # https://stackoverflow.com/questions/10462348/right-angle-bracket-in-html
    #
    # - My early blog has THREE errors when disallowing >
    # - So do some .wwz files
    (r'[^&<\x00]+', Tok.RawData),
    (r'.', Tok.Invalid),  # error!
]

# Old notes:
#
# Non-greedy matches are regular and can be matched in linear time
# with RE2.
#
# https://news.ycombinator.com/item?id=27099798
#
# Maybe try combining all of these for speed.

# . is any char except newline
# https://re2c.org/manual/manual_c.html

# Discarded options
#(r'<!-- .*? -->', Tok.Comment),

# Hack from Claude: \s\S instead of re.DOTALL.  I don't like this
#(r'<!-- [\s\S]*? -->', Tok.Comment),
#(r'<!-- (?:.|[\n])*? -->', Tok.Comment),

LEXER = MakeLexer(LEXER)


class Lexer(object):

    def __init__(self, s, left_pos=0, right_pos=-1, no_special_tags=False):
        self.s = s
        self.pos = left_pos
        self.right_pos = len(s) if right_pos == -1 else right_pos
        self.no_special_tags = no_special_tags

        self.cache = {}  # string -> compiled regex pattern object

        # either </script> or </style> - we search until we see that
        self.search_state = None  # type: Optional[str]

        # Position of tag name, if applicable
        # - Set after you get a StartTag, EndTag, or StartEndTag
        # - Unset on other tags
        self.tag_pos_left = -1
        self.tag_pos_right = -1

    def _Peek(self):
        # type: () -> Tuple[int, int]
        """Match the next token and return (tok_id, end_pos).

        Note: nothing calls _Peek() directly now; use Read().
        """
        if self.pos == self.right_pos:
            return Tok.EndOfStream, self.pos

        assert self.pos < self.right_pos, self.pos

        if self.search_state is not None and not self.no_special_tags:
            pos = self.s.find(self.search_state, self.pos)
            if pos == -1:
                # unterminated <script> or <style>
                raise LexError(self.s, self.pos)
            self.search_state = None
            # the CData ends at the beginning of </script> or </style>
            return Tok.HtmlCData, pos

        # Find the first match.
        # Note: frontend/match.py uses _LongestMatch(), which is different!
        # TODO: reconcile them.  This lexer should be expressible in re2c.

        for pat, tok_id in LEXER:
            m = pat.match(self.s, self.pos)
            if m:
                if tok_id in (Tok.StartTag, Tok.EndTag, Tok.StartEndTag):
                    self.tag_pos_left = m.start(1)
                    self.tag_pos_right = m.end(1)
                else:
                    # Reset state
                    self.tag_pos_left = -1
                    self.tag_pos_right = -1

                if tok_id == Tok.CommentBegin:
                    pos = self.s.find('-->', self.pos)
                    if pos == -1:
                        # unterminated <!--
                        raise LexError(self.s, self.pos)
                    return Tok.Comment, pos + 3  # -->

                if tok_id == Tok.ProcessingBegin:
                    pos = self.s.find('?>', self.pos)
                    if pos == -1:
                        # unterminated <?
                        raise LexError(self.s, self.pos)
                    return Tok.Processing, pos + 2  # ?>

                if tok_id == Tok.CDataBegin:
                    pos = self.s.find(']]>', self.pos)
                    if pos == -1:
                        # unterminated <![CDATA[
                        raise LexError(self.s, self.pos)
                    return Tok.CData, pos + 3  # ]]>

                if tok_id == Tok.StartTag:
                    if self.TagNameEquals('script'):
                        self.search_state = '</script>'
                    elif self.TagNameEquals('style'):
                        self.search_state = '</style>'

                return tok_id, m.end()
        else:
            raise AssertionError('Tok.Invalid rule should have matched')

    def TagNameEquals(self, expected):
        # type: (str) -> bool
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        # TODO: In C++, this does not need an allocation
        # TODO: conditionally lower() case here (maybe not in XML mode)
        return expected == self.s[self.tag_pos_left:self.tag_pos_right]

    def TagName(self):
        # type: () -> str
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        # TODO: conditionally lower() case here (maybe not in XML mode)
        return self.s[self.tag_pos_left:self.tag_pos_right]

    def Read(self):
        # type: () -> Tuple[int, int]
        tok_id, end_pos = self._Peek()
        self.pos = end_pos  # advance
        return tok_id, end_pos

    def LookAhead(self, regex):
        # Cache the regex compilation.  This could also be LookAheadFor(THEAD)
        # or something.
        pat = self.cache.get(regex)
        if pat is None:
            pat = re.compile(regex)
            self.cache[regex] = pat

        m = pat.match(self.s, self.pos)
        return m is not None
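

# Illustrative sketch, not part of the original module: a <script> body is
# returned as a single HtmlCData token, so the unescaped < inside it doesn't
# cause a LexError.  Passing no_special_tags=True disables this.
def _DemoSpecialTags():
    lx = Lexer('<script>if (a < b) { f(); }</script>')
    tok_id, end_pos = lx.Read()
    assert tok_id == Tok.StartTag, TokenName(tok_id)
    tok_id, end_pos = lx.Read()
    assert tok_id == Tok.HtmlCData, TokenName(tok_id)
    tok_id, end_pos = lx.Read()
    assert tok_id == Tok.EndTag, TokenName(tok_id)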


def _Tokens(s, left_pos, right_pos):
    """
    Args:
      s: string to parse
      left_pos, right_pos: Optional span boundaries.
    """
    lx = Lexer(s, left_pos, right_pos)
    while True:
        tok_id, pos = lx.Read()
        yield tok_id, pos
        if tok_id == Tok.EndOfStream:
            break


def ValidTokens(s, left_pos=0, right_pos=-1):
    """Wrapper around _Tokens to prevent callers from having to handle Invalid.

    I'm not combining the two functions because I might want to do a
    'yield' transformation on Tokens()?  Exceptions might complicate the
    issue?
    """
    pos = left_pos
    for tok_id, end_pos in _Tokens(s, left_pos, right_pos):
        if tok_id == Tok.Invalid:
            raise LexError(s, pos)
        yield tok_id, end_pos
        pos = end_pos


def ValidTokenList(s, no_special_tags=False):
    """A wrapper that can be more easily translated to C++.  Doesn't use
    iterators."""

    start_pos = 0
    tokens = []
    lx = Lexer(s, no_special_tags=no_special_tags)
    while True:
        tok_id, end_pos = lx.Read()
        tokens.append((tok_id, end_pos))
        if tok_id == Tok.EndOfStream:
            break
        if tok_id == Tok.Invalid:
            raise LexError(s, start_pos)
        start_pos = end_pos
    return tokens
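

# Illustrative sketch, not part of the original module: lex a whole document
# eagerly, or get a LexError with position info for input like '<>'.
def _DemoValidTokenList():
    tokens = ValidTokenList('<p>hi</p>')
    assert tokens == [(Tok.StartTag, 3), (Tok.RawData, 5), (Tok.EndTag, 9),
                      (Tok.EndOfStream, 9)], tokens
    try:
        ValidTokenList('<>')
    except LexError as e:
        log('%s', e)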


# Tag names:
# Match <a or </a
# Match <h2, but not <2h
#
# HTML 5 doesn't restrict tag names at all
# https://html.spec.whatwg.org/#toc-syntax
#
# XML allows : - .
# https://www.w3.org/TR/xml/#NT-NameChar

# Namespaces for MathML, SVG
# XLink, XML, XMLNS
#
# https://infra.spec.whatwg.org/#namespaces
#
# Allow - for td-attrs

# Be very lenient - just no whitespace or special HTML chars
# I don't think this is more lenient than HTML5, though we should check.
_UNQUOTED_VALUE = r'''[^ \t\r\n<>&"'\x00]*'''

# TODO: we don't need to capture the tag name here?  That's done at the top
# level
_TAG_RE = re.compile(r'/? \s* (%s)' % _NAME, re.VERBOSE)

_TAG_LAST_RE = re.compile(r'\s* /? >', re.VERBOSE)

# To match href="foo"
# Note: in HTML5 and XML, single quoted attributes are also valid

# <button disabled> is standard usage

# NOTE: This used to allow whitespace around =
# <a foo = "bar"> makes sense in XML
# But then you also have
# <a foo= bar> - which is TWO attributes, in HTML5
# So the space is problematic

_ATTR_RE = re.compile(
    r'''
\s+                     # Leading whitespace is required
(%s)                    # Attribute name
(?:                     # Optional attribute value
  \s* = \s*             # Spaces allowed around =
  (?:
    " ([^>"\x00]*) "    # double quoted value
  | ' ([^>'\x00]*) '    # single quoted value
  | (%s)                # unquoted value
  )
)?
''' % (_NAME, _UNQUOTED_VALUE), re.VERBOSE)

TagName, AttrName, UnquotedValue, QuotedValue, MissingValue = range(5)


class TagLexer(object):
    """
    Given a tag like <a href="..."> or <link type="..." />, the TagLexer
    provides a few operations:

    - What is the tag?
    - Iterate through the attributes, giving (name, value_start_pos, value_end_pos)
    """

    def __init__(self, s):
        self.s = s
        self.start_pos = -1  # Invalid
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        """Reuse instances of this object."""
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos

        self.start_pos = start_pos
        self.end_pos = end_pos

    def TagString(self):
        return self.s[self.start_pos:self.end_pos]

    def TagName(self):
        # First event
        tok_id, start, end = next(self.Tokens())
        return self.s[start:end]

    def GetSpanForAttrValue(self, attr_name):
        """
        Used by oils_doc.py, for href shortcuts
        """
        # Algorithm: search for QuotedValue or UnquotedValue after AttrName
        # TODO: Could also cache these

        events = self.Tokens()
        val = (-1, -1)
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == AttrName:
                    name = self.s[start:end]
                    if name == attr_name:
                        # The value should come next
                        tok_id, start, end = next(events)
                        assert tok_id in (QuotedValue, UnquotedValue,
                                          MissingValue), TokenName(tok_id)
                        val = start, end
                        break

        except StopIteration:
            pass
        return val

    def GetAttrRaw(self, attr_name):
        """
        Return the value, which may be UNESCAPED.
        """
        start, end = self.GetSpanForAttrValue(attr_name)
        if start == -1:
            return None
        return self.s[start:end]

    def AllAttrsRawSlice(self):
        """
        Get a list of (name, start, end) triples, e.g.
        [('class', 3, 5), ('href', 9, 12)]
        """
        slices = []
        events = self.Tokens()
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == AttrName:
                    name = self.s[start:end]

                    # The value should come next
                    tok_id, start, end = next(events)
                    assert tok_id in (QuotedValue, UnquotedValue,
                                      MissingValue), TokenName(tok_id)
                    # Note: quoted values may have &amp;
                    # We would need ANOTHER lexer to unescape them, but we
                    # don't need that for ul-table
                    slices.append((name, start, end))
        except StopIteration:
            pass
        return slices

    def AllAttrsRaw(self):
        """
        Get a list of pairs [('class', 'foo'), ('href', '?foo=1&amp;bar=2')]

        The quoted values may be escaped.  We would need another lexer to
        unescape them.
        """
        slices = self.AllAttrsRawSlice()
        pairs = []
        for name, start, end in slices:
            pairs.append((name, self.s[start:end]))
        return pairs

    def Tokens(self):
        """
        Yields a sequence of tokens: Tag (AttrName AttrValue?)*

        Where each Token is (Type, start_pos, end_pos)

        Note that start and end are NOT redundant!  We skip over some unwanted
        characters.
        """
        m = _TAG_RE.match(self.s, self.start_pos + 1)
        if not m:
            raise RuntimeError("Couldn't find HTML tag in %r" %
                               self.TagString())
        yield TagName, m.start(1), m.end(1)

        pos = m.end(0)
        #log('POS %d', pos)

        while True:
            # don't search past the end
            m = _ATTR_RE.match(self.s, pos, self.end_pos)
            if not m:
                #log('BREAK pos %d', pos)
                break
            #log('AttrName %r', m.group(1))

            yield AttrName, m.start(1), m.end(1)

            #log('m.groups() %r', m.groups())
            if m.group(2) is not None:
                # double quoted
                yield QuotedValue, m.start(2), m.end(2)
            elif m.group(3) is not None:
                # single quoted - TODO: could have different token types
                yield QuotedValue, m.start(3), m.end(3)
            elif m.group(4) is not None:
                yield UnquotedValue, m.start(4), m.end(4)
            else:
                # <button disabled>
                end = m.end(0)
                yield MissingValue, end, end

            # Skip past the closing " (or the end of the value)
            pos = m.end(0)

            #log('TOK %r', self.s)

        m = _TAG_LAST_RE.match(self.s, pos)
        #log('_TAG_LAST_RE match %r', self.s[pos:])
        if not m:
            # Extra data at end of tag.  TODO: add messages for all these.
            raise LexError(self.s, pos)
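

# Illustrative sketch, not part of the original module: point a TagLexer at
# the span of one tag, then query the name and attributes.
def _DemoTagLexer():
    s = '<a href="/blog/" class="nav">'
    tag_lexer = TagLexer(s)
    tag_lexer.Reset(0, len(s))
    assert tag_lexer.TagName() == 'a'
    assert tag_lexer.GetAttrRaw('href') == '/blog/'
    assert tag_lexer.AllAttrsRaw() == [('href', '/blog/'), ('class', 'nav')]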


# This is similar but not identical to
#    " ([^>"\x00]*) "    # double quoted value
#  | ' ([^>'\x00]*) '    # single quoted value
#
# Note: for unquoted values, & isn't allowed, and thus &amp; and &#99; and
# &#x99; are not allowed.  We could relax that?
ATTR_VALUE_LEXER = CHAR_LEX + [
    (r'[^>&\x00]+', Tok.RawData),
    (r'.', Tok.Invalid),
]

ATTR_VALUE_LEXER = MakeLexer(ATTR_VALUE_LEXER)


class AttrValueLexer(object):
    """
    <a href="foo=99&amp;bar">
    <a href='foo=99&amp;bar'>
    <a href=unquoted>
    """

    def __init__(self, s):
        self.s = s
        self.start_pos = -1  # Invalid
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        """Reuse instances of this object."""
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos

        self.start_pos = start_pos
        self.end_pos = end_pos

    def NumTokens(self):
        num_tokens = 0
        pos = self.start_pos
        for tok_id, end_pos in self.Tokens():
            if tok_id == Tok.Invalid:
                raise LexError(self.s, pos)
            pos = end_pos
            #log('pos %d', pos)
            num_tokens += 1
        return num_tokens

    def Tokens(self):
        pos = self.start_pos
        while pos < self.end_pos:
            # Find the first match, like above.
            # Note: frontend/match.py uses _LongestMatch(), which is different!
            # TODO: reconcile them.  This lexer should be expressible in re2c.
            for pat, tok_id in ATTR_VALUE_LEXER:
                m = pat.match(self.s, pos)
                if m:
                    if 0:
                        tok_str = m.group(0)
                        log('token = %r', tok_str)

                    end_pos = m.end(0)
                    yield tok_id, end_pos
                    pos = end_pos
                    break
            else:
                raise AssertionError('Tok.Invalid rule should have matched')
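

# Illustrative sketch, not part of the original module: count the tokens in
# one attribute value, e.g. to check that its entities are well-formed.
def _DemoAttrValueLexer():
    s = '<a href="foo=99&amp;bar">'
    val_lexer = AttrValueLexer(s)
    val_lexer.Reset(9, 23)  # the span of foo=99&amp;bar
    assert val_lexer.NumTokens() == 3  # RawData, CharEntity, RawData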


def ReadUntilStartTag(it, tag_lexer, tag_name):
    """Find the next <foo>, returning its (start, end) positions

    Raise ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == Tok.StartTag and tag_lexer.TagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No start tag %r' % tag_name)


def ReadUntilEndTag(it, tag_lexer, tag_name):
    """Find the next </foo>, returning its (start, end) position

    Raise ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == Tok.EndTag and tag_lexer.TagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No end tag %r' % tag_name)


CHAR_ENTITY = {
    'amp': '&',
    'lt': '<',
    'gt': '>',
    'quot': '"',
}


def ToText(s, left_pos=0, right_pos=-1):
    """Given HTML, return text by unquoting &gt; and &lt; etc.

    Used by:
      doctools/oils_doc.py: PygmentsPlugin
      doctools/help_gen.py: HelpIndexCards

    In the latter case, we could process some tags, like:

    - Blue Link (not clickable, but still useful)
    - Red X

    That should be html.ToAnsi.
    """
    f = StringIO()
    out = Output(s, f, left_pos, right_pos)

    pos = left_pos
    for tok_id, end_pos in ValidTokens(s, left_pos, right_pos):
        if tok_id == Tok.RawData:
            out.SkipTo(pos)
            out.PrintUntil(end_pos)

        elif tok_id == Tok.CharEntity:  # &amp;

            entity = s[pos + 1:end_pos - 1]

            out.SkipTo(pos)
            out.Print(CHAR_ENTITY[entity])
            out.SkipTo(end_pos)

        # Not handling these yet
        elif tok_id == Tok.HexChar:
            raise AssertionError('Hex Char %r' % s[pos:pos + 20])

        elif tok_id == Tok.DecChar:
            raise AssertionError('Dec Char %r' % s[pos:pos + 20])

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()
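

# Illustrative sketch, not part of the original module: decode entities in a
# span of raw text.  Note that we pass the span inside <p>, not the whole
# document.
def _DemoToText():
    s = '<p>1 &lt; 2</p>'
    assert ToText(s, 3, 11) == '1 < 2'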


# https://developer.mozilla.org/en-US/docs/Glossary/Void_element
VOID_ELEMENTS = [
    'area',
    'base',
    'br',
    'col',
    'embed',
    'hr',
    'img',
    'input',
    'link',
    'meta',
    'param',
    'source',
    'track',
    'wbr',
]

LEX_ATTRS = 1 << 1
LEX_QUOTED_VALUES = 1 << 2  # href="?x=42&amp;y=99"
NO_SPECIAL_TAGS = 1 << 3  # <script> <style>, VOID tags, etc.
BALANCED_TAGS = 1 << 4  # are tags balanced?


def Validate(contents, flags, counters):
    # type: (str, int, Counters) -> None

    tag_lexer = TagLexer(contents)
    val_lexer = AttrValueLexer(contents)

    no_special_tags = bool(flags & NO_SPECIAL_TAGS)
    lx = Lexer(contents, no_special_tags=no_special_tags)
    tokens = []
    start_pos = 0
    tag_stack = []
    while True:
        tok_id, end_pos = lx.Read()
        #log('TOP %s %r', TokenName(tok_id), contents[start_pos:end_pos])

        if tok_id == Tok.Invalid:
            raise LexError(contents, start_pos)
        if tok_id == Tok.EndOfStream:
            break

        tokens.append((tok_id, end_pos))

        if tok_id == Tok.StartEndTag:
            counters.num_start_end_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRawSlice()
            counters.num_attrs += len(all_attrs)
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                counters.num_val_tokens += val_lexer.NumTokens()

            counters.debug_attrs.extend(all_attrs)

        elif tok_id == Tok.StartTag:
            counters.num_start_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRawSlice()
            counters.num_attrs += len(all_attrs)
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                counters.num_val_tokens += val_lexer.NumTokens()

            counters.debug_attrs.extend(all_attrs)

            if flags & BALANCED_TAGS:
                tag_name = lx.TagName()
                if flags & NO_SPECIAL_TAGS:
                    tag_stack.append(tag_name)
                else:
                    # e.g. <meta> is considered self-closing, like <meta/>
                    if tag_name not in VOID_ELEMENTS:
                        tag_stack.append(tag_name)

            counters.max_tag_stack = max(counters.max_tag_stack,
                                         len(tag_stack))
        elif tok_id == Tok.EndTag:
            if flags & BALANCED_TAGS:
                try:
                    expected = tag_stack.pop()
                except IndexError:
                    raise ParseError('Tag stack empty',
                                     s=contents,
                                     start_pos=start_pos)

                actual = lx.TagName()
                if expected != actual:
                    raise ParseError(
                        'Got unexpected closing tag %r; opening tag was %r' %
                        (contents[start_pos:end_pos], expected),
                        s=contents,
                        start_pos=start_pos)

        start_pos = end_pos

    if len(tag_stack) != 0:
        raise ParseError('Missing closing tags at end of doc: %s' %
                         ' '.join(tag_stack),
                         s=contents,
                         start_pos=start_pos)

    counters.num_tokens += len(tokens)


class Counters(object):

    def __init__(self):
        self.num_tokens = 0
        self.num_start_tags = 0
        self.num_start_end_tags = 0
        self.num_attrs = 0
        self.max_tag_stack = 0
        self.num_val_tokens = 0

        self.debug_attrs = []
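

# Illustrative sketch, not part of the original module: validate a document
# for balanced tags, collecting statistics in a Counters instance.
def _DemoValidate():
    counters = Counters()
    flags = LEX_ATTRS | LEX_QUOTED_VALUES | BALANCED_TAGS
    Validate('<p class="big">hi <br/> bye</p>', flags, counters)
    log('%d tokens, %d attrs', counters.num_tokens, counters.num_attrs)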


def main(argv):
    action = argv[1]

    if action == 'tokens':
        contents = sys.stdin.read()

        lx = Lexer(contents)
        start_pos = 0
        while True:
            tok_id, end_pos = lx.Read()
            if tok_id == Tok.Invalid:
                raise LexError(contents, start_pos)
            if tok_id == Tok.EndOfStream:
                break

            frag = contents[start_pos:end_pos]
            log('%d %s %r', end_pos, TokenName(tok_id), frag)
            start_pos = end_pos

        return 0

    elif action in ('lex-htm8', 'parse-htm8', 'parse-xml'):

        errors = []
        counters = Counters()

        flags = LEX_ATTRS | LEX_QUOTED_VALUES
        if action.startswith('parse-'):
            flags |= BALANCED_TAGS
        if action == 'parse-xml':
            flags |= NO_SPECIAL_TAGS

        i = 0
        for line in sys.stdin:
            filename = line.strip()
            with open(filename) as f:
                contents = f.read()

            try:
                Validate(contents, flags, counters)
            except LexError as e:
                log('Lex error in %r: %s', filename, e)
                errors.append((filename, e))
            except ParseError as e:
                log('Parse error in %r: %s', filename, e)
                errors.append((filename, e))
            i += 1

        log('')
        log('%10d tokens', counters.num_tokens)
        log('%10d start/end tags', counters.num_start_end_tags)
        log('%10d start tags', counters.num_start_tags)
        log('%10d attrs', counters.num_attrs)
        log('%10d max tag stack depth', counters.max_tag_stack)
        log('%10d attr val tokens', counters.num_val_tokens)
        log('%10d errors', len(errors))
        if len(errors):
            return 1
        return 0

    elif action == 'todo':
        # Other algorithms:
        #
        # - select first subtree with given ID
        #   - this requires understanding the void tags I suppose
        # - select all subtrees that have a class
        # - materialize DOM

        # Safe-HTM8?  This is a filter
        return 0

    else:
        raise RuntimeError('Invalid action %r' % action)


if __name__ == '__main__':
    sys.exit(main(sys.argv))