#!/usr/bin/env python2
"""
lazylex/html.py - Low-Level HTML Processing.

See lazylex/README.md for details.

Conflicts between HTML5 and XML:

- In XML, <source> is like any tag, and must be closed.
- In HTML, <source> is a VOID tag, and must NOT be closed.

- In XML, <script> and <style> don't have special treatment.
- In HTML, they do.

- The header is different - <!DOCTYPE html> vs. <?xml version= ... ?>

So should we have a mode for <script> <style> and void tags?  Upgrade HX8
into HTM8?
"""
from __future__ import print_function

try:
    from cStringIO import StringIO
except ImportError:
    from io import StringIO  # python3
import re
import sys

if sys.version_info.major == 2:
    from typing import List, Tuple, Optional


def log(msg, *args):
    msg = msg % args
    print(msg, file=sys.stderr)


class LexError(Exception):
    """
    Examples of lex errors:

    - Tok.Invalid, like <> or &&
    - Unclosed <!-- <? <![CDATA[ <script> <style>
    """

    def __init__(self, s, start_pos):
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        return '(LexError %r)' % (self.s[self.start_pos:self.start_pos + 20])


def FindLineNum(s, error_pos):
    current_pos = 0
    line_num = 1
    while True:
        newline_pos = s.find('\n', current_pos)
        #log('current = %d, N %d, line %d', current_pos, newline_pos, line_num)

        if newline_pos == -1:  # this is the last line
            return line_num
        if newline_pos >= error_pos:
            return line_num
        line_num += 1
        current_pos = newline_pos + 1
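

# Illustrative sketch, added for this edit and not part of the original
# module: a worked example of FindLineNum.  Positions are 0-based; line
# numbers are 1-based.
def _DemoFindLineNum():
    return FindLineNum('ab\ncd\nef', 4)  # position 4 is 'd', on line 2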


class ParseError(Exception):
    """
    Examples of parse errors:

    - unbalanced tag structure
    - ul_table.py errors
    """

    def __init__(self, msg, s=None, start_pos=-1):
        self.msg = msg
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        if self.s is not None:
            assert self.start_pos != -1, self.start_pos
            snippet = (self.s[self.start_pos:self.start_pos + 20])

            line_num = FindLineNum(self.s, self.start_pos)
        else:
            snippet = ''
            line_num = -1
        msg = 'line %d: %r %r' % (line_num, self.msg, snippet)
        return msg


class Output(object):
    """Takes an underlying input buffer and an output file.  Maintains a
    position in the input buffer.

    Print FROM the input, or print new text to the output.
    """

    def __init__(self, s, f, left_pos=0, right_pos=-1):
        self.s = s
        self.f = f
        self.pos = left_pos
        self.right_pos = len(s) if right_pos == -1 else right_pos

    def SkipTo(self, pos):
        """Skip to a position."""
        self.pos = pos

    def PrintUntil(self, pos):
        """Print from the current position until a position."""
        piece = self.s[self.pos:pos]
        self.f.write(piece)
        self.pos = pos

    def PrintTheRest(self):
        """Print until the end of the string."""
        self.PrintUntil(self.right_pos)

    def Print(self, s):
        """Print new text to the output file."""
        self.f.write(s)
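

# Illustrative sketch, added for this edit and not part of the original
# module: Output supports "splice" edits - copy input up to a position, emit
# new text, then skip the span that was replaced.
def _DemoOutput():
    s = 'x <b>y</b> z'
    f = StringIO()
    out = Output(s, f)
    out.PrintUntil(2)  # copy 'x ' from the input
    out.Print('<strong>')  # emit replacement text
    out.SkipTo(5)  # skip over the original '<b>'
    out.PrintTheRest()  # copy the remainder unchanged
    return f.getvalue()  # 'x <strong>y</b> z'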


# HTML Tokens
# CommentBegin, ProcessingBegin, CDataBegin are "pseudo-tokens", not visible
TOKENS = ('Decl Comment CommentBegin Processing ProcessingBegin CData '
          'CDataBegin StartTag StartEndTag EndTag DecChar HexChar CharEntity '
          'RawData HtmlCData Invalid EndOfStream').split()


class Tok(object):
    """
    Avoid lint errors by using these aliases
    """
    pass


TOKEN_NAMES = [None] * len(TOKENS)  # type: List[str]

this_module = sys.modules[__name__]
for i, tok_str in enumerate(TOKENS):
    setattr(this_module, tok_str, i)
    setattr(Tok, tok_str, i)
    TOKEN_NAMES[i] = tok_str


def TokenName(tok_id):
    return TOKEN_NAMES[tok_id]
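

# Illustrative sketch, added for this edit and not part of the original
# module: the loop above turns each name in TOKENS into an integer attribute
# on Tok, and TokenName() maps the integer back.
def _DemoTokenName():
    assert TokenName(Tok.StartTag) == 'StartTag'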


def MakeLexer(rules):
    return [(re.compile(pat, re.VERBOSE), i) for (pat, i) in rules]


#
# Eggex
#
# Tag = / ~['>']+ /

# Is this valid?  A single character?
# Tag = / ~'>'* /

# Maybe better: / [NOT '>']+/
# capital letters not allowed there?
#
# But then this is confusing:
#   / [NOT ~digit]+/
#
# / [NOT digit] / is [^\d]
# / ~digit /     is \D
#
# Or maybe:
#
# / [~ digit]+ /
# / [~ '>']+ /
# / [NOT '>']+ /

# End = / '</' Tag '>' /
# StartEnd = / '<' Tag '/>' /
# Start = / '<' Tag '>' /
#
# EntityRef = / '&' dot{* N} ';' /

# Tag name, or attribute name
# colon is used in XML

# https://www.w3.org/TR/xml/#NT-Name
# Hm there is a lot of unicode stuff.  We are simplifying parsing

_NAME = r'[a-zA-Z][a-zA-Z0-9:_\-]*'  # must start with letter
LEXER = [
    (r'<!--', Tok.CommentBegin),

    # Processing instructions are used for the XML header:
    # <?xml version="1.0" encoding="UTF-8"?>
    # They are technically XML-only, but in HTML5, they are another kind of
    # comment:
    #
    # https://developer.mozilla.org/en-US/docs/Web/API/ProcessingInstruction
    #
    (r'<\?', Tok.ProcessingBegin),
    # Not necessary in HTML5, but occurs in XML
    (r'<!\[CDATA\[', Tok.CDataBegin),  # <![CDATA[

    # Markup declarations
    # - In HTML5, there is only <!DOCTYPE html>
    # - XML has 4 more declarations: <!ELEMENT ...> ATTLIST ENTITY NOTATION
    #   - these seem to be part of DTD
    # - it's useful to skip these, and be able to parse the rest of the
    #   document
    # - Note: < is allowed?
    (r'<! [^>]+ >', Tok.Decl),

    # Tags
    # Notes:
    # - We look for a valid tag name, but we don't validate attributes.
    #   That's done in the tag lexer.
    # - We don't allow leading whitespace
    (r'</ (%s) >' % _NAME, Tok.EndTag),
    # self-closing <br/> comes before StartTag
    # could/should these be collapsed into one rule?
    (r'< (%s) [^>]* />' % _NAME, Tok.StartEndTag),  # self-closing <br/>
    (r'< (%s) [^>]* >' % _NAME, Tok.StartTag),  # start <a>

    # Characters
    # https://www.w3.org/TR/xml/#sec-references
    (r'&\# [0-9]+ ;', Tok.DecChar),
    (r'&\# x[0-9a-fA-F]+ ;', Tok.HexChar),
    (r'& %s ;' % _NAME, Tok.CharEntity),

    # HTML5 allows unescaped > in raw data, but < is not allowed.
    # https://stackoverflow.com/questions/10462348/right-angle-bracket-in-html
    #
    # - My early blog has THREE errors when disallowing >
    # - So do some .wwz files
    (r'[^&<]+', Tok.RawData),
    (r'.', Tok.Invalid),  # error!
]

# Old notes:
#
# Non-greedy matches are regular and can be matched in linear time
# with RE2.
#
# https://news.ycombinator.com/item?id=27099798
#
# Maybe try combining all of these for speed.

# . is any char except newline
# https://re2c.org/manual/manual_c.html

# Discarded options
#(r'<!-- .*? -->', Tok.Comment),

# Hack from Claude: \s\S instead of re.DOTALL.  I don't like this
#(r'<!-- [\s\S]*? -->', Tok.Comment),
#(r'<!-- (?:.|[\n])*? -->', Tok.Comment),

LEXER = MakeLexer(LEXER)


class Lexer(object):

    def __init__(self, s, left_pos=0, right_pos=-1, no_special_tags=False):
        self.s = s
        self.pos = left_pos
        self.right_pos = len(s) if right_pos == -1 else right_pos
        self.no_special_tags = no_special_tags

        self.cache = {}  # string -> compiled regex pattern object

        # either </script> or </style> - we search until we see that
        self.search_state = None  # type: Optional[str]

        # Position of tag name, if applicable
        # - Set after you get a StartTag, EndTag, or StartEndTag
        # - Unset on other tags
        self.tag_pos_left = -1
        self.tag_pos_right = -1

    def _Peek(self):
        # type: () -> Tuple[int, int]
        """
        Note: not using _Peek() now
        """
        if self.pos == self.right_pos:
            return Tok.EndOfStream, self.pos

        assert self.pos < self.right_pos, self.pos

        if self.search_state is not None and not self.no_special_tags:
            pos = self.s.find(self.search_state, self.pos)
            if pos == -1:
                # unterminated <script> or <style>
                raise LexError(self.s, self.pos)
            self.search_state = None
            # beginning
            return Tok.HtmlCData, pos

        # Find the first match.
        # Note: frontend/match.py uses _LongestMatch(), which is different!
        # TODO: reconcile them.  This lexer should be expressible in re2c.

        for pat, tok_id in LEXER:
            m = pat.match(self.s, self.pos)
            if m:
                if tok_id in (Tok.StartTag, Tok.EndTag, Tok.StartEndTag):
                    self.tag_pos_left = m.start(1)
                    self.tag_pos_right = m.end(1)
                else:
                    # Reset state
                    self.tag_pos_left = -1
                    self.tag_pos_right = -1

                if tok_id == Tok.CommentBegin:
                    pos = self.s.find('-->', self.pos)
                    if pos == -1:
                        # unterminated <!--
                        raise LexError(self.s, self.pos)
                    return Tok.Comment, pos + 3  # -->

                if tok_id == Tok.ProcessingBegin:
                    pos = self.s.find('?>', self.pos)
                    if pos == -1:
                        # unterminated <?
                        raise LexError(self.s, self.pos)
                    return Tok.Processing, pos + 2  # ?>

                if tok_id == Tok.CDataBegin:
                    pos = self.s.find(']]>', self.pos)
                    if pos == -1:
                        # unterminated <![CDATA[
                        raise LexError(self.s, self.pos)
                    return Tok.CData, pos + 3  # ]]>

                if tok_id == Tok.StartTag:
                    if self.TagNameEquals('script'):
                        self.search_state = '</script>'
                    elif self.TagNameEquals('style'):
                        self.search_state = '</style>'

                return tok_id, m.end()
        else:
            raise AssertionError('Tok.Invalid rule should have matched')

    def TagNameEquals(self, expected):
        # type: (str) -> bool
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        # TODO: In C++, this does not need an allocation
        # TODO: conditionally lower() case here (maybe not in XML mode)
        return expected == self.s[self.tag_pos_left:self.tag_pos_right]

    def TagName(self):
        # type: () -> str
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        # TODO: conditionally lower() case here (maybe not in XML mode)
        return self.s[self.tag_pos_left:self.tag_pos_right]

    def Read(self):
        # type: () -> Tuple[int, int]
        tok_id, end_pos = self._Peek()
        self.pos = end_pos  # advance
        return tok_id, end_pos

    def LookAhead(self, regex):
        # Cache the regex compilation.  This could also be LookAheadFor(THEAD)
        # or something.
        pat = self.cache.get(regex)
        if pat is None:
            pat = re.compile(regex)
            self.cache[regex] = pat

        m = pat.match(self.s, self.pos)
        return m is not None
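

# Illustrative sketch, added for this edit and not part of the original
# module: drive the Lexer by hand.  Read() returns (tok_id, end_pos); the
# token's text is the span between the previous end position and the new one.
def _DemoLexer():
    s = '<p class="hi">A &amp; B</p>'
    lx = Lexer(s)
    tokens = []
    pos = 0
    while True:
        tok_id, end_pos = lx.Read()
        if tok_id == Tok.EndOfStream:
            break
        tokens.append((TokenName(tok_id), s[pos:end_pos]))
        pos = end_pos
    return tokens
    # [('StartTag', '<p class="hi">'), ('RawData', 'A '),
    #  ('CharEntity', '&amp;'), ('RawData', ' B'), ('EndTag', '</p>')]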


def _Tokens(s, left_pos, right_pos):
    """
    Args:
      s: string to parse
      left_pos, right_pos: Optional span boundaries.
    """
    lx = Lexer(s, left_pos, right_pos)
    while True:
        tok_id, pos = lx.Read()
        yield tok_id, pos
        if tok_id == Tok.EndOfStream:
            break


def ValidTokens(s, left_pos=0, right_pos=-1):
    """Wrapper around _Tokens to prevent callers from having to handle Invalid.

    I'm not combining the two functions because I might want to do a
    'yield' transformation on Tokens()?  Exceptions might complicate the
    issue?
    """
    pos = left_pos
    for tok_id, end_pos in _Tokens(s, left_pos, right_pos):
        if tok_id == Tok.Invalid:
            raise LexError(s, pos)
        yield tok_id, end_pos
        pos = end_pos
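

# Illustrative sketch, added for this edit and not part of the original
# module: '&&' hits the Tok.Invalid rule (see the LexError docstring), so
# ValidTokens raises instead of yielding Invalid.
def _DemoValidTokens():
    try:
        list(ValidTokens('a && b'))
    except LexError as e:
        return str(e)  # shows the offending text, starting at '&& b'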


def ValidTokenList(s, no_special_tags=False):
    """A wrapper that can be more easily translated to C++.  Doesn't use
    iterators."""

    start_pos = 0
    tokens = []
    lx = Lexer(s, no_special_tags=no_special_tags)
    while True:
        tok_id, end_pos = lx.Read()
        tokens.append((tok_id, end_pos))
        if tok_id == Tok.EndOfStream:
            break
        if tok_id == Tok.Invalid:
            raise LexError(s, start_pos)
        start_pos = end_pos
    return tokens


# Tag names:
#   Match <a or </a
#   Match <h2, but not <2h
#
# HTML 5 doesn't restrict tag names at all
#   https://html.spec.whatwg.org/#toc-syntax
#
# XML allows : - .
#   https://www.w3.org/TR/xml/#NT-NameChar

# Namespaces for MathML, SVG
# XLink, XML, XMLNS
#
# https://infra.spec.whatwg.org/#namespaces
#
# Allow - for td-attrs

# Be very lenient - just no whitespace or special HTML chars
# I don't think this is more lenient than HTML5, though we should check.
_UNQUOTED_VALUE = r'''[^\x00 \t\r\n<>&"']*'''

# TODO: we don't need to capture the tag name here?  That's done at the top
# level
_TAG_RE = re.compile(r'/? \s* (%s)' % _NAME, re.VERBOSE)

_TAG_LAST_RE = re.compile(r'\s* /? >', re.VERBOSE)

# To match href="foo"
# Note: in HTML5 and XML, single quoted attributes are also valid

# <button disabled> is standard usage

_ATTR_RE = re.compile(
    r'''
\s+           # Leading whitespace is required
(%s)          # Attribute name
(?:           # Optional attribute value
  \s* = \s*
  (?:
    " ([^>"]*) "   # double quoted value
  | ' ([^>']*) '   # single quoted value
  | (%s)           # Attribute value
  )
)?
''' % (_NAME, _UNQUOTED_VALUE), re.VERBOSE)
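

# Illustrative sketch, added for this edit and not part of the original
# module: which groups of _ATTR_RE fire.  Group 2 is a double-quoted value,
# group 4 an unquoted value; a bare attribute like 'disabled' has no value
# groups at all.
def _DemoAttrRe():
    m = _ATTR_RE.match(' href="foo"')
    assert (m.group(1), m.group(2)) == ('href', 'foo')
    m = _ATTR_RE.match(' disabled>')
    assert (m.group(1), m.group(2)) == ('disabled', None)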


TagName, AttrName, UnquotedValue, QuotedValue = range(4)


class TagLexer(object):
    """
    Given a tag like <a href="..."> or <link type="..." />, the TagLexer
    provides a few operations:

    - What is the tag?
    - Iterate through the attributes, giving (name, value_start_pos,
      value_end_pos)
    """

    def __init__(self, s):
        self.s = s
        self.start_pos = -1  # Invalid
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        """Reuse instances of this object."""
        self.start_pos = start_pos
        self.end_pos = end_pos

    def TagString(self):
        return self.s[self.start_pos:self.end_pos]

    def TagName(self):
        # First event
        tok_id, start, end = next(self.Tokens())
        return self.s[start:end]

    def GetSpanForAttrValue(self, attr_name):
        """
        Used by oils_doc.py, for href shortcuts
        """
        # Algorithm: search for QuotedValue or UnquotedValue after AttrName
        # TODO: Could also cache these

        events = self.Tokens()
        val = (-1, -1)
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == AttrName:
                    name = self.s[start:end]
                    if name == attr_name:
                        # The value should come next
                        tok_id, start, end = next(events)
                        if tok_id in (QuotedValue, UnquotedValue):
                            # Note: quoted values may have &amp;
                            # We would need ANOTHER lexer to unescape them.
                            # Right now help_gen.py and oils_doc.py don't
                            # need that.
                            val = start, end
                            break

        except StopIteration:
            pass
        return val

    def GetAttrRaw(self, attr_name):
        """
        Return the value, which may be UNESCAPED.
        """
        start, end = self.GetSpanForAttrValue(attr_name)
        if start == -1:
            return None
        return self.s[start:end]

    def AllAttrsRaw(self):
        """
        Get a list of pairs [('class', 'foo'), ('href', '?foo=1&amp;bar=2')]

        The quoted values may be escaped.  We would need another lexer to
        unescape them.
        """
        pairs = []
        events = self.Tokens()
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == AttrName:
                    name = self.s[start:end]

                    # The value should come next
                    tok_id, start, end = next(events)
                    if tok_id in (QuotedValue, UnquotedValue):
                        # Note: quoted values may have &amp;
                        # We would need ANOTHER lexer to unescape them, but we
                        # don't need that for ul-table

                        val = self.s[start:end]
                        pairs.append((name, val))
        except StopIteration:
            pass
        return pairs

    def Tokens(self):
        """
        Yields a sequence of tokens: Tag (AttrName AttrValue?)*

        Where each Token is (Type, start_pos, end_pos)

        Note that start and end are NOT redundant!  We skip over some unwanted
        characters.
        """
        m = _TAG_RE.match(self.s, self.start_pos + 1)
        if not m:
            raise RuntimeError("Couldn't find HTML tag in %r" %
                               self.TagString())
        yield TagName, m.start(1), m.end(1)

        pos = m.end(0)
        #log('POS %d', pos)

        while True:
            # don't search past the end
            m = _ATTR_RE.match(self.s, pos, self.end_pos)
            if not m:
                #log('BREAK pos %d', pos)
                break
            #log('AttrName %r', m.group(1))

            yield AttrName, m.start(1), m.end(1)

            if m.group(2) is not None:
                # double quoted
                yield QuotedValue, m.start(2), m.end(2)
            elif m.group(3) is not None:
                # single quoted - TODO: could have different token types
                yield QuotedValue, m.start(3), m.end(3)
            elif m.group(4) is not None:
                yield UnquotedValue, m.start(4), m.end(4)

            # Skip past the closing quote (or the end of the unquoted value)
            pos = m.end(0)

        #log('TOK %r', self.s)

        m = _TAG_LAST_RE.match(self.s, pos)
        #log('_TAG_LAST_RE match %r', self.s[pos:])
        if not m:
            # Extra data at end of tag.  TODO: add messages for all these.
            raise LexError(self.s, pos)
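

# Illustrative sketch, added for this edit and not part of the original
# module: Reset() points the lexer at one tag's span; AllAttrsRaw() returns
# (name, value) pairs with the values still HTML-escaped.
def _DemoTagLexer():
    s = '<a href="?x=1&amp;y=2" class="big">'
    tag_lexer = TagLexer(s)
    tag_lexer.Reset(0, len(s))
    return tag_lexer.TagName(), tag_lexer.AllAttrsRaw()
    # ('a', [('href', '?x=1&amp;y=2'), ('class', 'big')])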


def ReadUntilStartTag(it, tag_lexer, tag_name):
    """Find the next <foo>, returning its (start, end) positions

    Raise ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == Tok.StartTag and tag_lexer.TagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No start tag %r' % tag_name)
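

# Illustrative sketch, added for this edit and not part of the original
# module: pair ReadUntilStartTag with the ValidTokens iterator to find a
# tag's span.
def _DemoReadUntilStartTag():
    s = '<div><p>hi</p></div>'
    tag_lexer = TagLexer(s)
    return ReadUntilStartTag(ValidTokens(s), tag_lexer, 'p')  # (5, 8)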


def ReadUntilEndTag(it, tag_lexer, tag_name):
    """Find the next </foo>, returning its (start, end) position

    Raise ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == Tok.EndTag and tag_lexer.TagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No end tag %r' % tag_name)


CHAR_ENTITY = {
    'amp': '&',
    'lt': '<',
    'gt': '>',
    'quot': '"',
}


def ToText(s, left_pos=0, right_pos=-1):
    """Given HTML, return text by unquoting &gt; and &lt; etc.

    Used by:
        doctools/oils_doc.py: PygmentsPlugin
        doctools/help_gen.py: HelpIndexCards

    In the latter case, we could process some tags, like:

    - Blue Link (not clickable, but still useful)
    - Red X

    That should be html.ToAnsi.
    """
    f = StringIO()
    out = Output(s, f, left_pos, right_pos)

    pos = left_pos
    for tok_id, end_pos in ValidTokens(s, left_pos, right_pos):
        if tok_id == Tok.RawData:
            out.SkipTo(pos)
            out.PrintUntil(end_pos)

        elif tok_id == Tok.CharEntity:  # &amp;

            entity = s[pos + 1:end_pos - 1]

            out.SkipTo(pos)
            out.Print(CHAR_ENTITY[entity])
            out.SkipTo(end_pos)

        # Not handling these yet
        elif tok_id == Tok.HexChar:
            raise AssertionError('Hex Char %r' % s[pos:pos + 20])

        elif tok_id == Tok.DecChar:
            raise AssertionError('Dec Char %r' % s[pos:pos + 20])

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()
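

# Illustrative sketch, added for this edit and not part of the original
# module: tags are dropped, and the four entities in CHAR_ENTITY are
# unescaped.  Numeric references like &#64; would hit the AssertionError
# above.
def _DemoToText():
    return ToText('1 &lt; 2 &amp;&amp; 4 &gt; 3')  # '1 < 2 && 4 > 3'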


# https://developer.mozilla.org/en-US/docs/Glossary/Void_element
VOID_ELEMENTS = [
    'area',
    'base',
    'br',
    'col',
    'embed',
    'hr',
    'img',
    'input',
    'link',
    'meta',
    'param',
    'source',
    'track',
    'wbr',
]

LEX_ATTRS = 1 << 1
LEX_QUOTED_VALUES = 1 << 2  # href="?x=42&amp;y=99"
NO_SPECIAL_TAGS = 1 << 3  # <script> <style>, VOID tags, etc.
BALANCED_TAGS = 1 << 4  # are tags balanced?


def Validate(contents, flags, counters):
    # type: (str, int, Counters) -> None

    tag_lexer = TagLexer(contents)
    no_special_tags = bool(flags & NO_SPECIAL_TAGS)
    lx = Lexer(contents, no_special_tags=no_special_tags)
    tokens = []
    start_pos = 0
    tag_stack = []
    while True:
        tok_id, end_pos = lx.Read()

        if tok_id == Tok.Invalid:
            raise LexError(contents, start_pos)
        if tok_id == Tok.EndOfStream:
            break

        tokens.append((tok_id, end_pos))

        if tok_id == Tok.StartEndTag:
            counters.num_start_end_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRaw()
            counters.num_attrs += len(all_attrs)
            counters.debug_attrs.extend(all_attrs)

        elif tok_id == Tok.StartTag:
            counters.num_start_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRaw()
            counters.num_attrs += len(all_attrs)
            counters.debug_attrs.extend(all_attrs)

            if flags & BALANCED_TAGS:
                tag_name = lx.TagName()
                if flags & NO_SPECIAL_TAGS:
                    tag_stack.append(tag_name)
                else:
                    # e.g. <meta> is considered self-closing, like <meta/>
                    if tag_name not in VOID_ELEMENTS:
                        tag_stack.append(tag_name)

            counters.max_tag_stack = max(counters.max_tag_stack,
                                         len(tag_stack))
        elif tok_id == Tok.EndTag:
            if flags & BALANCED_TAGS:
                try:
                    expected = tag_stack.pop()
                except IndexError:
                    raise ParseError('Tag stack empty',
                                     s=contents,
                                     start_pos=start_pos)

                actual = lx.TagName()
                if expected != actual:
                    raise ParseError(
                        'Got unexpected closing tag %r; opening tag was %r' %
                        (contents[start_pos:end_pos], expected),
                        s=contents,
                        start_pos=start_pos)

        start_pos = end_pos

    if len(tag_stack) != 0:
        raise ParseError('Missing closing tags at end of doc: %s' %
                         ' '.join(tag_stack),
                         s=contents,
                         start_pos=start_pos)

    counters.num_tokens += len(tokens)
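

# Illustrative sketch, added for this edit and not part of the original
# module: validate a fragment with tag balancing.  <br> is a void element,
# so it doesn't need a closing tag in HTML mode.
def _DemoValidate():
    counters = Counters()
    flags = LEX_ATTRS | LEX_QUOTED_VALUES | BALANCED_TAGS
    Validate('<p class="x">a<br>b</p>', flags, counters)
    return counters.num_start_tags, counters.num_attrs  # (2, 1)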


class Counters(object):

    def __init__(self):
        self.num_tokens = 0
        self.num_start_tags = 0
        self.num_start_end_tags = 0
        self.num_attrs = 0
        self.max_tag_stack = 0

        self.debug_attrs = []


def main(argv):
    action = argv[1]

    if action == 'tokens':
        contents = sys.stdin.read()

        lx = Lexer(contents)
        start_pos = 0
        while True:
            tok_id, end_pos = lx.Read()
            if tok_id == Tok.Invalid:
                raise LexError(contents, start_pos)
            if tok_id == Tok.EndOfStream:
                break

            frag = contents[start_pos:end_pos]
            log('%d %s %r', end_pos, TokenName(tok_id), frag)
            start_pos = end_pos

        return 0

    elif action in ('lex-htm8', 'parse-htm8', 'parse-xml'):

        errors = []
        counters = Counters()

        flags = LEX_ATTRS | LEX_QUOTED_VALUES
        if action.startswith('parse-'):
            flags |= BALANCED_TAGS
        if action == 'parse-xml':
            flags |= NO_SPECIAL_TAGS

        i = 0
        for line in sys.stdin:
            filename = line.strip()
            with open(filename) as f:
                contents = f.read()

            try:
                Validate(contents, flags, counters)
            except LexError as e:
                log('Lex error in %r: %s', filename, e)
                errors.append((filename, e))
            except ParseError as e:
                log('Parse error in %r: %s', filename, e)
                errors.append((filename, e))
            i += 1

        log('')
        log(
            '  %d tokens, %d start/end tags, %d start tags, %d attrs, '
            '%d max tag stack depth in %d files', counters.num_tokens,
            counters.num_start_end_tags, counters.num_start_tags,
            counters.num_attrs, counters.max_tag_stack, i)
        log('  %d errors', len(errors))
        if len(errors):
            return 1
        return 0

    elif action == 'todo':
        # Other algorithms:
        #
        # - select first subtree with given ID
        #   - this requires understanding the void tags I suppose
        # - select all subtrees that have a class
        # - materialize DOM

        # Safe-HTM8?  This is a filter
        return 0

    else:
        raise RuntimeError('Invalid action %r' % action)


if __name__ == '__main__':
    sys.exit(main(sys.argv))