lazylex/html.py

OILS / lazylex / html.py View on Github | oils.pub

872 lines, 440 significant

1	#!/usr/bin/env python2
2	"""
3	lazylex/html.py - Low-Level HTML Processing.
4
5	See lazylex/README.md for details.
6
7	Conflicts between HTML5 and XML:
8
9	- In XML, <source> is like any tag, and must be closed,
10	- In HTML, <source> is a VOID tag, and must NOT be closed
11
12	- In XML, <script> and <style> don't have special treatment
13	- In HTML, they do
14
15	- The header is different - <!DOCTYPE html> vs. <?xml version= ... ?>
16
17	So do we have a mode for <script> <style> and void tags? Upgrade HX8 into HTM8?
18
19	TODO:
20
21	- Are there special rules for <svg> and <math>?
22	- Do we need to know about <textarea> <pre>? Those don't have the same
23	whitespace rules
24	"""
25	from __future__ import print_function
26
27	try:
28	from cStringIO import StringIO
29	except ImportError:
30	from io import StringIO # python3
31	import re
32	import sys
33
34	if sys.version_info.major == 2:
35	from typing import List, Tuple, Optional, Dict
36
37
def log(msg, *args):
    """Write a printf-style formatted message to stderr, followed by a newline.

    Args:
      msg: format string using % placeholders
      *args: values substituted into msg
    """
    print(msg % args, file=sys.stderr)
41
42
class LexError(Exception):
    """Raised when the input can't be split into valid tokens.

    Examples of lex errors:

    - Tok.Invalid, like <> or &&
    - Unclosed <!-- <? <![CDATA[ <script> <style>
    """

    def __init__(self, s, start_pos):
        # The whole string being lexed, and the offset where the problem starts
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        # Show at most 20 characters of input, beginning at the error position
        begin = self.start_pos
        snippet = self.s[begin:begin + 20]
        return '(LexError %r)' % snippet
57
58
def FindLineNum(s, error_pos):
    """Return the 1-based line number that contains offset error_pos in s.

    A position that falls exactly on a newline character is counted as part of
    the line that the newline terminates.
    """
    # The line number is one more than the number of newlines strictly before
    # error_pos.  Clamp at 0 so a negative offset isn't interpreted as an index
    # relative to the end of the string.
    newlines_before = s.count('\n', 0, max(error_pos, 0))
    return newlines_before + 1
72
73
class ParseError(Exception):
    """Raised for structural problems found after lexing.

    Examples of parse errors

    - unbalanced tag structure
    - ul_table.py errors
    """

    def __init__(self, msg, s=None, start_pos=-1):
        self.msg = msg
        # Optional location info: the input string and the offset of the error
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        if self.s is None:
            # No input was attached, so there is no location to report
            line_num, snippet = -1, ''
        else:
            assert self.start_pos != -1, self.start_pos
            # Up to 20 characters of context starting at the error
            snippet = self.s[self.start_pos:self.start_pos + 20]
            line_num = FindLineNum(self.s, self.start_pos)
        return 'line %d: %r %r' % (line_num, self.msg, snippet)
98
99
class Output(object):
    """Copies spans of an input string to an output file, tracking a cursor.

    The caller either echoes text FROM the input (PrintUntil, PrintTheRest),
    jumps over input it doesn't want (SkipTo), or emits brand new text (Print).
    """

    def __init__(self, s, f, left_pos=0, right_pos=-1):
        self.s = s  # input buffer
        self.f = f  # output file-like object; only write() is called on it
        self.pos = left_pos  # cursor into the input buffer
        # -1 means "through the end of the input"
        if right_pos == -1:
            self.right_pos = len(s)
        else:
            self.right_pos = right_pos

    def SkipTo(self, pos):
        """Move the cursor to pos without writing anything."""
        self.pos = pos

    def PrintUntil(self, pos):
        """Write the input between the cursor and pos, then move the cursor."""
        self.f.write(self.s[self.pos:pos])
        self.pos = pos

    def PrintTheRest(self):
        """Write the input from the cursor up to the right boundary."""
        self.PrintUntil(self.right_pos)

    def Print(self, s):
        """Write new text to the output; the input cursor doesn't move."""
        self.f.write(s)
130
131
# HTML Tokens
# CommentBegin, ProcessingBegin, CDataBegin are "pseudo-tokens", not visible
TOKENS = [
    'Decl',
    'Comment',
    'CommentBegin',
    'Processing',
    'ProcessingBegin',
    'CData',
    'CDataBegin',
    'StartTag',
    'StartEndTag',
    'EndTag',
    'DecChar',
    'HexChar',
    'CharEntity',
    'RawData',
    'HtmlCData',
    'Invalid',
    'EndOfStream',
]


class Tok(object):
    """
    Namespace for token IDs, e.g. Tok.StartTag.

    The attributes are attached dynamically below; referring to them through
    this class avoids lint errors about undefined module-level names.
    """
    pass


# Maps token ID (its index in TOKENS) -> token name
TOKEN_NAMES = list(TOKENS)  # type: List[str]

# Expose each token ID both as Tok.<Name> and as a module-level <Name>
this_module = sys.modules[__name__]
for i, tok_str in enumerate(TOKENS):
    setattr(Tok, tok_str, i)
    setattr(this_module, tok_str, i)


def TokenName(tok_id):
    """Return the name of a token ID, e.g. 'StartTag' for Tok.StartTag."""
    return TOKEN_NAMES[tok_id]
156
157
def MakeLexer(rules):
    """Compile a list of (pattern string, token ID) rules.

    Each pattern is compiled with re.VERBOSE, so unescaped whitespace in the
    pattern is ignored.

    Returns:
      A list of (compiled pattern, token ID) pairs, in the same order as rules.
    """
    compiled = []
    for pattern, tok_id in rules:
        compiled.append((re.compile(pattern, re.VERBOSE), tok_id))
    return compiled
160
161
162	#
163	# Eggex
164	#
165	# Tag = / ~['>']+ /
166
167	# Is this valid? A single character?
168	# Tag = / ~'>'* /
169
170	# Maybe better: / [NOT '>']+/
171	# capital letters not allowed there?
172	#
173	# But then this is confusing:
174	# / [NOT ~digit]+/
175	#
176	# / [NOT digit] / is [^\d]
177	# / ~digit / is \D
178	#
179	# Or maybe:
180	#
181	# / [~ digit]+ /
182	# / [~ '>']+ /
183	# / [NOT '>']+ /
184
185	# End = / '</' Tag '>' /
186	# StartEnd = / '<' Tag '/>' /
187	# Start = / '<' Tag '>' /
188	#
189	# EntityRef = / '&' dot{* N} ';' /
190
191	# Tag name, or attribute name
192	# colon is used in XML
193
194	# https://www.w3.org/TR/xml/#NT-Name
195	# Hm there is a lot of unicode stuff. We are simplifying parsing
196
_NAME = r'[a-zA-Z][a-zA-Z0-9:_\-]*'  # must start with letter

# Ordered lexer rules.  Lexer._Peek() tries them from top to bottom and takes
# the FIRST pattern that matches at the current position, so more specific
# rules must come before more general ones.  MakeLexer() compiles every pattern
# with re.VERBOSE, which is why the spaces inside the patterns are not
# significant.
LEXER = [
    (r'<!--', Tok.CommentBegin),

    # Processing instruction are used for the XML header:
    # <?xml version="1.0" encoding="UTF-8"?>
    # They are technically XML-only, but in HTML5, they are another kind of
    # comment:
    #
    #   https://developer.mozilla.org/en-US/docs/Web/API/ProcessingInstruction
    #
    (r'<\?', Tok.ProcessingBegin),
    # Not necessary in HTML5, but occurs in XML
    (r'<!\[CDATA\[', Tok.CDataBegin),  # <![CDATA[

    # Markup declarations
    # - In HTML5, there is only <!DOCTYPE html>
    # - XML has 4 more declarations: <!ELEMENT ...> ATTLIST ENTITY NOTATION
    #   - these seem to be part of DTD
    #   - it's useful to skip these, and be able to parse the rest of the document
    # - Note: < is allowed?
    (r'<! [^>]+ >', Tok.Decl),

    # Tags
    # Notes:
    # - We look for a valid tag name, but we don't validate attributes.
    #   That's done in the tag lexer.
    # - We don't allow leading whitespace
    # - Group 1 captures the tag name; Lexer._Peek() records its span
    (r'</ (%s) >' % _NAME, Tok.EndTag),
    # self-closing <br/> comes before StartTag
    (r'< (%s) [^>]* />' % _NAME, Tok.StartEndTag),  # self-closing <br/>
    (r'< (%s) [^>]* >' % _NAME, Tok.StartTag),  # start <a>

    # Characters
    # https://www.w3.org/TR/xml/#sec-references
    (r'&\# [0-9]+ ;', Tok.DecChar),
    (r'&\# x[0-9a-fA-F]+ ;', Tok.HexChar),
    (r'& %s ;' % _NAME, Tok.CharEntity),

    # HTML5 allows unescaped > in raw data, but < is not allowed.
    # https://stackoverflow.com/questions/10462348/right-angle-bracket-in-html
    #
    # - My early blog has THREE errors when disallowing >
    # - So do some .wwz files
    (r'[^&<]+', Tok.RawData),
    (r'.', Tok.Invalid),  # error!
]

# Old notes:
#
# Non-greedy matches are regular and can be matched in linear time
# with RE2.
#
# https://news.ycombinator.com/item?id=27099798
#
# Maybe try combining all of these for speed.

# . is any char except newline
# https://re2c.org/manual/manual_c.html

# Discarded options
#(r'<!-- .*? -->', Tok.Comment),

# Hack from Claude: \s\S instead of re.DOTALL.  I don't like this
#(r'<!-- [\s\S]*? -->', Tok.Comment),
#(r'<!-- (?:.|[\n])*? -->', Tok.Comment),

# Replace the list of pattern strings with compiled (pattern, token ID) pairs
LEXER = MakeLexer(LEXER)
266
267
class Lexer(object):
    """Splits s[left_pos:right_pos] into top-level HTML/XML tokens.

    Each call to Read() returns (token ID, end position).  The start position
    of a token is the end position of the previous one (or left_pos for the
    first token).

    Unless no_special_tags is set, the text after <script> or <style> is
    returned as one Tok.HtmlCData token that runs up to the matching end tag.

    Nothing at or beyond right_pos is ever examined: tokens and closing
    delimiters must lie entirely inside the span.
    """

    def __init__(self, s, left_pos=0, right_pos=-1, no_special_tags=False):
        self.s = s
        self.pos = left_pos
        # -1 means "through the end of s"
        self.right_pos = len(s) if right_pos == -1 else right_pos
        self.no_special_tags = no_special_tags

        self.cache = {}  # string -> compiled regex pattern object

        # either </script> or </style> - we search until we see that
        self.search_state = None  # type: Optional[str]

        # Position of tag name, if applicable
        # - Set after you get a StartTag, EndTag, or StartEndTag
        # - Unset on other tags
        self.tag_pos_left = -1
        self.tag_pos_right = -1

    def _Peek(self):
        # type: () -> Tuple[int, int]
        """Return (token ID, end position) for the token starting at self.pos.

        self.pos is not advanced (Read() does that), but the tag name span and
        the <script>/<style> search state ARE updated.

        Raises:
          LexError: for an unterminated <!-- <? <![CDATA[ <script> or <style>
        """
        if self.pos == self.right_pos:
            return Tok.EndOfStream, self.pos

        assert self.pos < self.right_pos, self.pos

        if self.search_state is not None and not self.no_special_tags:
            # Bounded by right_pos so we never run past the requested span
            pos = self.s.find(self.search_state, self.pos, self.right_pos)
            if pos == -1:
                # unterminated <script> or <style>
                raise LexError(self.s, self.pos)
            self.search_state = None
            # The token ends at the beginning of the closing tag
            return Tok.HtmlCData, pos

        # Find the first match.
        # Note: frontend/match.py uses _LongestMatch(), which is different!
        # TODO: reconcile them.  This lexer should be expressible in re2c.

        for pat, tok_id in LEXER:
            # endpos=right_pos makes the pattern behave as if the string ended
            # there.  Without it, a token like RawData could extend past
            # right_pos, and the next _Peek() would fail its assertion.
            m = pat.match(self.s, self.pos, self.right_pos)
            if m:
                if tok_id in (Tok.StartTag, Tok.EndTag, Tok.StartEndTag):
                    self.tag_pos_left = m.start(1)
                    self.tag_pos_right = m.end(1)
                else:
                    # Reset state
                    self.tag_pos_left = -1
                    self.tag_pos_right = -1

                # The three "Begin" pseudo-tokens only match the opening
                # delimiter; the real token extends through the closing one.
                if tok_id == Tok.CommentBegin:
                    pos = self.s.find('-->', self.pos, self.right_pos)
                    if pos == -1:
                        # unterminated <!--
                        raise LexError(self.s, self.pos)
                    return Tok.Comment, pos + 3  # -->

                if tok_id == Tok.ProcessingBegin:
                    pos = self.s.find('?>', self.pos, self.right_pos)
                    if pos == -1:
                        # unterminated <?
                        raise LexError(self.s, self.pos)
                    return Tok.Processing, pos + 2  # ?>

                if tok_id == Tok.CDataBegin:
                    pos = self.s.find(']]>', self.pos, self.right_pos)
                    if pos == -1:
                        # unterminated <![CDATA[
                        raise LexError(self.s, self.pos)
                    return Tok.CData, pos + 3  # ]]>

                if tok_id == Tok.StartTag:
                    # The NEXT token will be everything up to the end tag
                    if self.TagNameEquals('script'):
                        self.search_state = '</script>'
                    elif self.TagNameEquals('style'):
                        self.search_state = '</style>'

                return tok_id, m.end()
        else:
            raise AssertionError('Tok.Invalid rule should have matched')

    def TagNameEquals(self, expected):
        # type: (str) -> bool
        """Is the name of the tag that was just read equal to expected?

        Only valid right after a StartTag, EndTag, or StartEndTag.
        """
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        # TODO: In C++, this does not need an allocation
        # TODO: conditionally lower() case here (maybe not in XML mode)
        return expected == self.s[self.tag_pos_left:self.tag_pos_right]

    def TagName(self):
        # type: () -> str
        """Return the name of the tag that was just read.

        Only valid right after a StartTag, EndTag, or StartEndTag.
        """
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        # TODO: conditionally lower() case here (maybe not in XML mode)
        return self.s[self.tag_pos_left:self.tag_pos_right]

    def Read(self):
        # type: () -> Tuple[int, int]
        """Return the next (token ID, end position) and advance past it."""
        tok_id, end_pos = self._Peek()
        self.pos = end_pos  # advance
        return tok_id, end_pos

    def LookAhead(self, regex):
        """Does regex match at the current position?  Doesn't consume input.

        Like the tokens themselves, the match is confined to the span that
        ends at right_pos.
        """
        # Cache the regex compilation.  This could also be LookAheadFor(THEAD)
        # or something.
        pat = self.cache.get(regex)
        if pat is None:
            pat = re.compile(regex)
            self.cache[regex] = pat

        m = pat.match(self.s, self.pos, self.right_pos)
        return m is not None
385
386
def _Tokens(s, left_pos, right_pos):
    """Yield (token ID, end position) pairs, finishing with Tok.EndOfStream.

    Tok.Invalid tokens are passed through; see ValidTokens() for a wrapper that
    rejects them.

    Args:
      s: string to parse
      left_pos, right_pos: Optional span boundaries.
    """
    lx = Lexer(s, left_pos, right_pos)
    tok_id = None
    # EndOfStream itself is yielded before the loop stops
    while tok_id != Tok.EndOfStream:
        tok_id, pos = lx.Read()
        yield tok_id, pos
399
400
def ValidTokens(s, left_pos=0, right_pos=-1):
    """Like _Tokens(), but raise LexError instead of yielding Tok.Invalid.

    This saves callers from having to handle Invalid themselves.

    The two functions aren't combined because we might want to do a 'yield'
    transformation on Tokens(), and exceptions might complicate that.

    Raises:
      LexError: pointing at the start of the first invalid token
    """
    tok_start = left_pos
    for tok_id, tok_end in _Tokens(s, left_pos, right_pos):
        if tok_id == Tok.Invalid:
            raise LexError(s, tok_start)
        yield tok_id, tok_end
        # The next token begins where this one ended
        tok_start = tok_end
414
415
def ValidTokenList(s, no_special_tags=False):
    """Lex all of s and return a list of (token ID, end position) pairs.

    The list ends with the Tok.EndOfStream token.  This is a wrapper that can
    be more easily translated to C++, since it doesn't use iterators.

    Raises:
      LexError: pointing at the start of the first invalid token
    """
    lx = Lexer(s, no_special_tags=no_special_tags)
    tokens = []
    tok_start = 0
    while True:
        tok_id, tok_end = lx.Read()
        if tok_id == Tok.Invalid:
            raise LexError(s, tok_start)
        tokens.append((tok_id, tok_end))
        if tok_id == Tok.EndOfStream:
            return tokens
        tok_start = tok_end
431
432
433	# Tag names:
434	# Match <a or </a
435	# Match <h2, but not <2h
436	#
437	# HTML 5 doesn't restrict tag names at all
438	# https://html.spec.whatwg.org/#toc-syntax
439	#
440	# XML allows : - .
441	# https://www.w3.org/TR/xml/#NT-NameChar
442
443	# Namespaces for MathML, SVG
444	# XLink, XML, XMLNS
445	#
446	# https://infra.spec.whatwg.org/#namespaces
447	#
448	# Allow - for td-attrs
449
_ATTR_VALUE = r'[a-zA-Z0-9_\-]+'  # allow hyphens

# Matches the tag name right after the opening <, with an optional / for end
# tags.
# TODO: we don't need to capture the tag name here?  That's done at the top
# level
_TAG_RE = re.compile(r'/? \s* (%s)' % _NAME, re.VERBOSE)

# To match href="foo"
#
# Groups: 1 = attribute name, 2 = double quoted value (without the quotes),
# 3 = unquoted value.  Groups 2 and 3 are both None when there's no value.
#
# The | between the two value forms must NOT be backslash-escaped: under
# re.VERBOSE, \| is a literal pipe character rather than alternation, and then
# neither kind of value can match.

_ATTR_RE = re.compile(
    r'''
\s+                     # Leading whitespace is required
(%s)                    # Attribute name
(?:                     # Optional attribute value
  \s* = \s*
  (?:
    " ([^>"]*) "        # double quoted value
  | (%s)                # Attribute value
                        # TODO: relax this?  for href=$foo
  )
)?
''' % (_NAME, _ATTR_VALUE), re.VERBOSE)

# Token kinds yielded by TagLexer.Tokens()
TagName, AttrName, UnquotedValue, QuotedValue = range(4)
473
474
class TagLexer(object):
    """
    Given a tag like <a href="..."> or <link type="..." />, the TagLexer
    provides a few operations:

    - What is the tag?
    - Iterate through the attributes, giving (name, value_start_pos, value_end_pos)

    The tag to examine is selected with Reset(start_pos, end_pos), where the
    positions are offsets into s.  One instance can be reused for many tags.
    """

    def __init__(self, s):
        self.s = s
        self.start_pos = -1  # Invalid
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        """Reuse instances of this object."""
        self.start_pos = start_pos
        self.end_pos = end_pos

    def TagString(self):
        """Return the text of the whole tag, including < and >."""
        return self.s[self.start_pos:self.end_pos]

    def TagName(self):
        """Return the tag name, e.g. 'a' for <a href="..."> or </a>."""
        # First event
        _, start, end = next(self.Tokens())
        return self.s[start:end]

    def GetSpanForAttrValue(self, attr_name):
        """Return the (start, end) span of the value of attribute attr_name.

        Returns (-1, -1) if the attribute is absent or has no value.
        """
        # Algorithm: search for QuotedValue or UnquotedValue after AttrName
        # TODO: Could also cache these

        # True while the most recent AttrName token was attr_name.  Any value
        # token belongs to the attribute name immediately before it, so there
        # is no need to read ahead -- reading ahead would swallow the NEXT
        # attribute's name whenever the current attribute has no value.
        name_matched = False
        for tok_id, start, end in self.Tokens():
            if tok_id == AttrName:
                name_matched = (self.s[start:end] == attr_name)
            elif name_matched and tok_id in (QuotedValue, UnquotedValue):
                # Note: quoted values may have &
                # We would need ANOTHER lexer to unescape them.
                # Right now help_gen.py and oils_doc.py
                return start, end
        return -1, -1

    def GetAttrRaw(self, attr_name):
        """
        Return the value, which may be UNESCAPED.

        Returns None if the attribute is absent or has no value.
        """
        start, end = self.GetSpanForAttrValue(attr_name)
        if start == -1:
            return None
        return self.s[start:end]

    def AllAttrsRaw(self):
        """
        Get a list of pairs [('class', 'foo'), ('href', '?foo=1&bar=2')]

        The quoted values may be escaped.  We would need another lexer to
        unescape them.

        Attributes without a value, like 'selected' in
        <option selected value="x">, are not included -- but the attributes
        that follow them are.
        """
        pairs = []
        # Most recent attribute name that hasn't been paired with a value yet
        pending_name = None
        for tok_id, start, end in self.Tokens():
            if tok_id == AttrName:
                # If the previous name is still pending, it had no value; it's
                # simply replaced
                pending_name = self.s[start:end]
            elif (tok_id in (QuotedValue, UnquotedValue) and
                  pending_name is not None):
                # Note: quoted values may have &
                # We would need ANOTHER lexer to unescape them, but we
                # don't need that for ul-table
                pairs.append((pending_name, self.s[start:end]))
                pending_name = None
        return pairs

    def Tokens(self):
        """
        Yields a sequence of tokens: Tag (AttrName AttrValue?)*

        Where each Token is (Type, start_pos, end_pos)

        Note that start and end are NOT redundant!  We skip over some unwanted
        characters.

        Raises:
          RuntimeError: if no tag name is found after the opening <
        """
        # + 1 skips the opening <
        m = _TAG_RE.match(self.s, self.start_pos + 1)
        if not m:
            raise RuntimeError("Couldn't find HTML tag in %r" %
                               self.TagString())
        yield TagName, m.start(1), m.end(1)

        pos = m.end(0)

        while True:
            # don't search past the end
            m = _ATTR_RE.match(self.s, pos, self.end_pos)
            if not m:
                # A validating parser would check that > or /> is next -- there's no junk
                break

            yield AttrName, m.start(1), m.end(1)

            # Quoted is group 2, unquoted is group 3.
            if m.group(2) is not None:
                yield QuotedValue, m.start(2), m.end(2)
            elif m.group(3) is not None:
                yield UnquotedValue, m.start(3), m.end(3)

            # Skip past the "
            pos = m.end(0)
600
601
def ReadUntilStartTag(it, tag_lexer, tag_name):
    """Advance the token iterator to the next <tag_name> start tag.

    Args:
      it: iterator of (token ID, end position) pairs.  Positions are tracked
          starting from 0, so the iterator should begin at the start of the
          string.
      tag_lexer: TagLexer over the same string.  It is RESET to each token
                 that's examined.
      tag_name: name to look for, e.g. 'table'

    Returns:
      The (start, end) positions of the tag.

    Raises:
      ParseError: if the iterator is exhausted without finding the tag
    """
    tok_start = 0
    for tok_id, tok_end in it:
        tag_lexer.Reset(tok_start, tok_end)
        if tok_id == Tok.StartTag and tag_lexer.TagName() == tag_name:
            return tok_start, tok_end
        tok_start = tok_end

    raise ParseError('No start tag %r' % tag_name)
622
623
def ReadUntilEndTag(it, tag_lexer, tag_name):
    """Advance the token iterator to the next </tag_name> end tag.

    Args:
      it: iterator of (token ID, end position) pairs.  Positions are tracked
          starting from 0, so the iterator should begin at the start of the
          string.
      tag_lexer: TagLexer over the same string.  It is RESET to each token
                 that's examined.
      tag_name: name to look for, e.g. 'table'

    Returns:
      The (start, end) positions of the tag.

    Raises:
      ParseError: if the iterator is exhausted without finding the tag
    """
    tok_start = 0
    for tok_id, tok_end in it:
        tag_lexer.Reset(tok_start, tok_end)
        if tok_id == Tok.EndTag and tag_lexer.TagName() == tag_name:
            return tok_start, tok_end
        tok_start = tok_end

    raise ParseError('No end tag %r' % tag_name)
644
645
# Named character entities that ToText() knows how to decode
CHAR_ENTITY = {
    'amp': '&',
    'lt': '<',
    'gt': '>',
    'quot': '"',
}


def ToText(s, left_pos=0, right_pos=-1):
    """Given HTML, return text by unquoting &gt; and &lt; etc.

    Raw data is copied through, the named entities in CHAR_ENTITY are decoded,
    and every other token (tags, comments, declarations, ...) is dropped, no
    matter where in the input it appears.

    Used by:
      doctools/oils_doc.py: PygmentsPlugin
      doctools/help_gen.py: HelpIndexCards

    In the latter case, we could process some tags, like:

    - Blue Link (not clickable, but still useful)
    - Red X

    That should be html.ToAnsi.

    Args:
      s: HTML string
      left_pos, right_pos: Optional span boundaries.

    Raises:
      LexError: if the input has an invalid token
      KeyError: for a named entity that isn't in CHAR_ENTITY
      AssertionError: for numeric entities like &#65; or &#x41;, which aren't
                      handled yet
    """
    f = StringIO()
    out = Output(s, f, left_pos, right_pos)

    pos = left_pos
    for tok_id, end_pos in ValidTokens(s, left_pos, right_pos):
        if tok_id == Tok.RawData:
            # Jump over any dropped tokens before this one, then copy it
            out.SkipTo(pos)
            out.PrintUntil(end_pos)

        elif tok_id == Tok.CharEntity:  # &amp;

            # Strip the leading & and the trailing ;
            entity = s[pos + 1:end_pos - 1]

            out.SkipTo(pos)
            out.Print(CHAR_ENTITY[entity])
            out.SkipTo(end_pos)

        # Not handling these yet
        elif tok_id == Tok.HexChar:
            raise AssertionError('Hex Char %r' % s[pos:pos + 20])

        elif tok_id == Tok.DecChar:
            raise AssertionError('Dec Char %r' % s[pos:pos + 20])

        pos = end_pos

    # Every token has been either written or deliberately dropped, so there is
    # nothing left to flush.  Calling out.PrintTheRest() here would echo
    # trailing markup verbatim, e.g. 'a<b>' -> 'a<b>', even though the same
    # tag is dropped when it's followed by text.
    return f.getvalue()
696
697
# Tags that Validate() treats as self-closing when checking balance in HTML
# mode: a start tag like <meta> pushes nothing onto the tag stack.
# https://developer.mozilla.org/en-US/docs/Glossary/Void_element
VOID_ELEMENTS = [
    'area',
    'base',
    'br',
    'col',
    'embed',
    'hr',
    'img',
    'input',
    'link',
    'meta',
    'param',
    'source',
    'track',
    'wbr',
]

# Bit flags for Validate(); combine them with |
LEX_ATTRS = 1 << 1
LEX_QUOTED_VALUES = 1 << 2  # href="?x=42&y=99"
NO_SPECIAL_TAGS = 1 << 3  # <script> <style>, VOID tags, etc.
BALANCED_TAGS = 1 << 4  # are tags balanced?
720
721
def Validate(contents, flags, counters):
    # type: (str, int, Counters) -> None
    """Lex a whole document, and optionally check that its tags are balanced.

    Args:
      contents: the document
      flags: bitwise OR of the flag constants.  Only two are consulted here:
        NO_SPECIAL_TAGS: XML mode -- no special handling of <script> and
          <style>, and void elements like <meta> must be closed like any tag
        BALANCED_TAGS: check that each end tag matches the most recent
          unclosed start tag, and that nothing is left open at the end
      counters: statistics are ADDED to this object.  The tag and attribute
        counts are updated as the document is read; num_tokens is only
        updated once the whole document has been lexed.

    Raises:
      LexError: if the document has an invalid token
      ParseError: if BALANCED_TAGS is set and the tags aren't balanced
    """
    tag_lexer = TagLexer(contents)
    no_special_tags = bool(flags & NO_SPECIAL_TAGS)
    check_balance = bool(flags & BALANCED_TAGS)
    lx = Lexer(contents, no_special_tags=no_special_tags)
    # Only the count is needed, so tokens aren't accumulated in a list
    num_tokens = 0
    start_pos = 0
    tag_stack = []  # names of the start tags that haven't been closed yet
    while True:
        tok_id, end_pos = lx.Read()

        if tok_id == Tok.Invalid:
            raise LexError(contents, start_pos)
        if tok_id == Tok.EndOfStream:
            break

        num_tokens += 1

        if tok_id == Tok.StartEndTag:
            counters.num_start_end_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRaw()
            counters.num_attrs += len(all_attrs)

        elif tok_id == Tok.StartTag:
            counters.num_start_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRaw()
            counters.num_attrs += len(all_attrs)

            if check_balance:
                tag_name = lx.TagName()
                # In HTML mode, e.g. <meta> is considered self-closing, like
                # <meta/>, so it's not pushed
                if no_special_tags or tag_name not in VOID_ELEMENTS:
                    tag_stack.append(tag_name)

            counters.max_tag_stack = max(counters.max_tag_stack,
                                         len(tag_stack))
        elif tok_id == Tok.EndTag:
            if check_balance:
                try:
                    expected = tag_stack.pop()
                except IndexError:
                    raise ParseError('Tag stack empty',
                                     s=contents,
                                     start_pos=start_pos)

                actual = lx.TagName()
                if expected != actual:
                    raise ParseError(
                        'Got unexpected closing tag %r; opening tag was %r' %
                        (contents[start_pos:end_pos], expected),
                        s=contents,
                        start_pos=start_pos)

        start_pos = end_pos

    counters.num_tokens += num_tokens

    # Tags that were opened but never closed are unbalanced too.  Without this
    # check, a document like '<div>' would pass.
    if check_balance and tag_stack:
        raise ParseError('Missing closing tags at end of doc: %s' %
                         ' '.join(tag_stack),
                         s=contents,
                         start_pos=start_pos)
785
786
class Counters(object):
    """Statistics that Validate() accumulates, possibly across many files."""

    def __init__(self):
        # Tag counts
        self.num_start_tags = 0  # like <a>
        self.num_start_end_tags = 0  # like <br/>
        # Attributes that AllAttrsRaw() reported on those tags
        self.num_attrs = 0
        # All tokens, not counting EndOfStream
        self.num_tokens = 0
        # Deepest nesting of unclosed start tags seen so far
        self.max_tag_stack = 0
795
796
def main(argv):
    """Command line entry point.  argv[1] is the action:

      tokens      Lex the document on stdin, and log each token to stderr
      lex-htm8    Read file names from stdin, one per line, and lex each file
      parse-htm8  Like lex-htm8, and also check that tags are balanced
      parse-xml   Like parse-htm8, but with no special handling of <script>,
                  <style>, and void tags
      todo        Placeholder; does nothing

    Returns:
      The process exit status: 1 if any file had a lex or parse error, else 0.

    Raises:
      LexError: from the 'tokens' action, on an invalid token
      RuntimeError: if the action is invalid
    """
    action = argv[1]

    if action == 'tokens':
        contents = sys.stdin.read()

        lx = Lexer(contents)
        start_pos = 0
        while True:
            tok_id, end_pos = lx.Read()
            if tok_id == Tok.Invalid:
                raise LexError(contents, start_pos)
            if tok_id == Tok.EndOfStream:
                break

            frag = contents[start_pos:end_pos]
            log('%d %s %r', end_pos, TokenName(tok_id), frag)
            start_pos = end_pos

        return 0

    elif action in ('lex-htm8', 'parse-htm8', 'parse-xml'):

        errors = []
        counters = Counters()

        flags = LEX_ATTRS | LEX_QUOTED_VALUES
        if action.startswith('parse-'):
            flags |= BALANCED_TAGS
        if action == 'parse-xml':
            flags |= NO_SPECIAL_TAGS

        num_files = 0
        for line in sys.stdin:
            filename = line.strip()
            if not filename:
                # A blank line isn't a file name; open('') would crash
                continue
            with open(filename) as f:
                contents = f.read()

            # Keep going after an error, so that one run reports every bad file
            try:
                Validate(contents, flags, counters)
            except LexError as e:
                log('Lex error in %r: %s', filename, e)
                errors.append((filename, e))
            except ParseError as e:
                log('Parse error in %r: %s', filename, e)
                errors.append((filename, e))
            num_files += 1

        log('')
        log(
            '  %d tokens, %d start/end tags, %d start tags, %d attrs, %d max tag stack depth in %d files',
            counters.num_tokens, counters.num_start_end_tags,
            counters.num_start_tags, counters.num_attrs,
            counters.max_tag_stack, num_files)
        log('  %d errors', len(errors))
        if len(errors):
            return 1
        return 0

    elif action == 'todo':
        # Other algorithms:
        #
        # - select first subtree with given ID
        #   - this requires understanding the void tags I suppose
        # - select all subtrees that have a class
        # - materialize DOM

        # Safe-HTM8?  This is a filter
        return 0

    else:
        raise RuntimeError('Invalid action %r' % action)
869
870
871	if __name__ == '__main__':
872	sys.exit(main(sys.argv))