#!/usr/bin/env python2
"""
lazylex/html.py - Low-Level HTML Processing.

See lazylex/README.md for details.

Conflicts between HTML5 and XML:

- In XML, <source> is like any tag, and must be closed
- In HTML, <source> is a VOID tag, and must NOT be closed

- In XML, <script> and <style> don't have special treatment
- In HTML, they do

- The header is different - <!DOCTYPE html> vs. <?xml version= ... ?>

So should we have a mode for <script> <style> and void tags?  Upgrade HX8
into HTM8?
"""
from __future__ import print_function

try:
    from cStringIO import StringIO
except ImportError:
    from io import StringIO  # python3
import re
import sys

if sys.version_info.major == 2:
    from typing import List, Tuple, Optional


def log(msg, *args):
    msg = msg % args
    print(msg, file=sys.stderr)


class LexError(Exception):
    """
    Examples of lex errors:

    - Tok.Invalid, like <> or &&
    - Unclosed <!-- <? <![CDATA[ <script> <style>
    """

    def __init__(self, s, start_pos):
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        return '(LexError %r)' % (self.s[self.start_pos:self.start_pos + 20])


def FindLineNum(s, error_pos):
    current_pos = 0
    line_num = 1
    while True:
        newline_pos = s.find('\n', current_pos)
        #log('current = %d, N %d, line %d', current_pos, newline_pos, line_num)

        if newline_pos == -1:  # this is the last line
            return line_num
        if newline_pos >= error_pos:
            return line_num
        line_num += 1
        current_pos = newline_pos + 1


class ParseError(Exception):
    """
    Examples of parse errors

    - unbalanced tag structure
    - ul_table.py errors
    """

    def __init__(self, msg, s=None, start_pos=-1):
        self.msg = msg
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        if self.s is not None:
            assert self.start_pos != -1, self.start_pos
            snippet = (self.s[self.start_pos:self.start_pos + 20])

            line_num = FindLineNum(self.s, self.start_pos)
        else:
            snippet = ''
            line_num = -1
        msg = 'line %d: %r %r' % (line_num, self.msg, snippet)
        return msg


class Output(object):
    """Takes an underlying input buffer and an output file. Maintains a
    position in the input buffer.

    Print FROM the input or print new text to the output.
    """

    def __init__(self, s, f, left_pos=0, right_pos=-1):
        self.s = s
        self.f = f
        self.pos = left_pos
        self.right_pos = len(s) if right_pos == -1 else right_pos

    def SkipTo(self, pos):
        """Skip to a position."""
        self.pos = pos

    def PrintUntil(self, pos):
        """Print from the current position until pos."""
        piece = self.s[self.pos:pos]
        self.f.write(piece)
        self.pos = pos

    def PrintTheRest(self):
        """Print until the end of the string."""
        self.PrintUntil(self.right_pos)

    def Print(self, s):
        """Print new text to the output file."""
        self.f.write(s)

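# A usage sketch for Output, added as illustration; it is NOT called by Oils.
# It copies the input through while replacing one span.  Real callers compute
# the positions with a lexer, the way ToText() does below.
def _DemoOutput():
    f = StringIO()
    out = Output('<b>hi</b>', f)
    out.PrintUntil(3)   # emit '<b>'
    out.Print('HI')     # emit replacement text
    out.SkipTo(5)       # skip over 'hi' in the input
    out.PrintTheRest()  # emit '</b>'
    assert f.getvalue() == '<b>HI</b>'
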
|
# HTML Tokens
# CommentBegin, ProcessingBegin, CDataBegin are "pseudo-tokens", not visible
TOKENS = 'Decl Comment CommentBegin Processing ProcessingBegin CData CDataBegin StartTag StartEndTag EndTag DecChar HexChar CharEntity RawData HtmlCData Invalid EndOfStream'.split(
)


class Tok(object):
    """
    Avoid lint errors by using these aliases
    """
    pass


TOKEN_NAMES = [None] * len(TOKENS)  # type: List[str]

this_module = sys.modules[__name__]
for i, tok_str in enumerate(TOKENS):
    setattr(this_module, tok_str, i)
    setattr(Tok, tok_str, i)
    TOKEN_NAMES[i] = tok_str


def TokenName(tok_id):
    return TOKEN_NAMES[tok_id]


def MakeLexer(rules):
    return [(re.compile(pat, re.VERBOSE), i) for (pat, i) in rules]


#
# Eggex
#
# Tag = / ~['>']+ /

# Is this valid? A single character?
# Tag = / ~'>'* /

# Maybe better: / [NOT '>']+/
# capital letters not allowed there?
#
# But then this is confusing:
# / [NOT ~digit]+/
#
# / [NOT digit] / is [^\d]
# / ~digit / is \D
#
# Or maybe:
#
# / [~ digit]+ /
# / [~ '>']+ /
# / [NOT '>']+ /

# End = / '</' Tag '>' /
# StartEnd = / '<' Tag '/>' /
# Start = / '<' Tag '>' /
#
# EntityRef = / '&' dot{* N} ';' /

# Tag name, or attribute name
# colon is used in XML

# https://www.w3.org/TR/xml/#NT-Name
# Hm there is a lot of unicode stuff. We are simplifying parsing

_NAME = r'[a-zA-Z][a-zA-Z0-9:_\-]*'  # must start with letter

CHAR_LEX = [
    # Characters
    # https://www.w3.org/TR/xml/#sec-references
    (r'&\# [0-9]+ ;', Tok.DecChar),
    (r'&\# x[0-9a-fA-F]+ ;', Tok.HexChar),
    (r'& %s ;' % _NAME, Tok.CharEntity),
]

LEXER = CHAR_LEX + [
    (r'<!--', Tok.CommentBegin),

    # Processing instructions are used for the XML header:
    # <?xml version="1.0" encoding="UTF-8"?>
    # They are technically XML-only, but in HTML5, they are another kind of
    # comment:
    #
    # https://developer.mozilla.org/en-US/docs/Web/API/ProcessingInstruction
    #
    (r'<\?', Tok.ProcessingBegin),
    # Not necessary in HTML5, but occurs in XML
    (r'<!\[CDATA\[', Tok.CDataBegin),  # <![CDATA[

    # Markup declarations
    # - In HTML5, there is only <!DOCTYPE html>
    # - XML has 4 more declarations: <!ELEMENT ...> ATTLIST ENTITY NOTATION
    #   - these seem to be part of DTD
    # - it's useful to skip these, and be able to parse the rest of the document
    # - Note: < is allowed?
    (r'<! [^>\x00]+ >', Tok.Decl),

    # Tags
    # Notes:
    # - We look for a valid tag name, but we don't validate attributes.
    #   That's done in the tag lexer.
    # - We don't allow leading whitespace
    (r'</ (%s) >' % _NAME, Tok.EndTag),
    # self-closing <br/> comes before StartTag
    # could/should these be collapsed into one rule?
    (r'< (%s) [^>\x00]* />' % _NAME, Tok.StartEndTag),  # self-closing <br/>
    (r'< (%s) [^>\x00]* >' % _NAME, Tok.StartTag),  # start <a>

    # HTML5 allows unescaped > in raw data, but < is not allowed.
    # https://stackoverflow.com/questions/10462348/right-angle-bracket-in-html
    #
    # - My early blog has THREE errors when disallowing >
    # - So do some .wwz files
    (r'[^&<\x00]+', Tok.RawData),
    (r'.', Tok.Invalid),  # error!
]

# Old notes:
#
# Non-greedy matches are regular and can be matched in linear time
# with RE2.
#
# https://news.ycombinator.com/item?id=27099798
#
# Maybe try combining all of these for speed.

# . is any char except newline
# https://re2c.org/manual/manual_c.html

# Discarded options
#(r'<!-- .*? -->', Tok.Comment),

# Hack from Claude: \s\S instead of re.DOTALL. I don't like this
#(r'<!-- [\s\S]*? -->', Tok.Comment),
#(r'<!-- (?:.|[\n])*? -->', Tok.Comment),

LEXER = MakeLexer(LEXER)


class Lexer(object):

    def __init__(self, s, left_pos=0, right_pos=-1, no_special_tags=False):
        self.s = s
        self.pos = left_pos
        self.right_pos = len(s) if right_pos == -1 else right_pos
        self.no_special_tags = no_special_tags

        self.cache = {}  # string -> compiled regex pattern object

        # either </script> or </style> - we search until we see that
        self.search_state = None  # type: Optional[str]

        # Position of tag name, if applicable
        # - Set after you get a StartTag, EndTag, or StartEndTag
        # - Unset on other tags
        self.tag_pos_left = -1
        self.tag_pos_right = -1

    def _Peek(self):
        # type: () -> Tuple[int, int]
        """
        Note: not using _Peek() now
        """
        if self.pos == self.right_pos:
            return Tok.EndOfStream, self.pos

        assert self.pos < self.right_pos, self.pos

        if self.search_state is not None and not self.no_special_tags:
            pos = self.s.find(self.search_state, self.pos)
            if pos == -1:
                # unterminated <script> or <style>
                raise LexError(self.s, self.pos)
            self.search_state = None
            # beginning
            return Tok.HtmlCData, pos

        # Find the first match.
        # Note: frontend/match.py uses _LongestMatch(), which is different!
        # TODO: reconcile them. This lexer should be expressible in re2c.

        for pat, tok_id in LEXER:
            m = pat.match(self.s, self.pos)
            if m:
                if tok_id in (Tok.StartTag, Tok.EndTag, Tok.StartEndTag):
                    self.tag_pos_left = m.start(1)
                    self.tag_pos_right = m.end(1)
                else:
                    # Reset state
                    self.tag_pos_left = -1
                    self.tag_pos_right = -1

                if tok_id == Tok.CommentBegin:
                    pos = self.s.find('-->', self.pos)
                    if pos == -1:
                        # unterminated <!--
                        raise LexError(self.s, self.pos)
                    return Tok.Comment, pos + 3  # -->

                if tok_id == Tok.ProcessingBegin:
                    pos = self.s.find('?>', self.pos)
                    if pos == -1:
                        # unterminated <?
                        raise LexError(self.s, self.pos)
                    return Tok.Processing, pos + 2  # ?>

                if tok_id == Tok.CDataBegin:
                    pos = self.s.find(']]>', self.pos)
                    if pos == -1:
                        # unterminated <![CDATA[
                        raise LexError(self.s, self.pos)
                    return Tok.CData, pos + 3  # ]]>

                if tok_id == Tok.StartTag:
                    if self.TagNameEquals('script'):
                        self.search_state = '</script>'
                    elif self.TagNameEquals('style'):
                        self.search_state = '</style>'

                return tok_id, m.end()
        else:
            raise AssertionError('Tok.Invalid rule should have matched')

    def TagNameEquals(self, expected):
        # type: (str) -> bool
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        # TODO: In C++, this does not need an allocation
        # TODO: conditionally lower() case here (maybe not in XML mode)
        return expected == self.s[self.tag_pos_left:self.tag_pos_right]

    def TagName(self):
        # type: () -> str
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        # TODO: conditionally lower() case here (maybe not in XML mode)
        return self.s[self.tag_pos_left:self.tag_pos_right]

    def Read(self):
        # type: () -> Tuple[int, int]
        tok_id, end_pos = self._Peek()
        self.pos = end_pos  # advance
        return tok_id, end_pos

    def LookAhead(self, regex):
        # Cache the regex compilation. This could also be LookAheadFor(THEAD)
        # or something.
        pat = self.cache.get(regex)
        if pat is None:
            pat = re.compile(regex)
            self.cache[regex] = pat

        m = pat.match(self.s, self.pos)
        return m is not None

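# An illustrative sketch, NOT called by Oils: the Read() loop protocol that
# ValidTokenList() below also follows.  Read() returns a token ID and an end
# position; the token's text is the span since the previous end position.
def _DemoLexer():
    s = '<p class="big">hi &amp; bye</p>'
    lx = Lexer(s)
    pos = 0
    while True:
        tok_id, end_pos = lx.Read()
        if tok_id == Tok.EndOfStream:
            break
        log('%s %r', TokenName(tok_id), s[pos:end_pos])
        pos = end_pos
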
|
def _Tokens(s, left_pos, right_pos):
    """
    Args:
      s: string to parse
      left_pos, right_pos: Optional span boundaries.
    """
    lx = Lexer(s, left_pos, right_pos)
    while True:
        tok_id, pos = lx.Read()
        yield tok_id, pos
        if tok_id == Tok.EndOfStream:
            break


def ValidTokens(s, left_pos=0, right_pos=-1):
    """Wrapper around _Tokens to prevent callers from having to handle Invalid.

    I'm not combining the two functions because I might want to do a
    'yield' transformation on Tokens()?  Exceptions might complicate the
    issue?
    """
    pos = left_pos
    for tok_id, end_pos in _Tokens(s, left_pos, right_pos):
        if tok_id == Tok.Invalid:
            raise LexError(s, pos)
        yield tok_id, end_pos
        pos = end_pos


def ValidTokenList(s, no_special_tags=False):
    """A wrapper that can be more easily translated to C++. Doesn't use iterators."""

    start_pos = 0
    tokens = []
    lx = Lexer(s, no_special_tags=no_special_tags)
    while True:
        tok_id, end_pos = lx.Read()
        tokens.append((tok_id, end_pos))
        if tok_id == Tok.EndOfStream:
            break
        if tok_id == Tok.Invalid:
            raise LexError(s, start_pos)
        start_pos = end_pos
    return tokens

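# An illustrative sketch, NOT called by Oils: token IDs and end positions for
# a small snippet, using the generator API above.
def _DemoValidTokens():
    s = '<b>bold</b>'
    for tok_id, end_pos in ValidTokens(s):
        log('%s ends at %d', TokenName(tok_id), end_pos)
    # => StartTag ends at 3, RawData ends at 7, EndTag ends at 11,
    #    EndOfStream ends at 11
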
|
# Tag names:
# Match <a or </a
# Match <h2, but not <2h
#
# HTML 5 doesn't restrict tag names at all
# https://html.spec.whatwg.org/#toc-syntax
#
# XML allows : - .
# https://www.w3.org/TR/xml/#NT-NameChar

# Namespaces for MathML, SVG
# XLink, XML, XMLNS
#
# https://infra.spec.whatwg.org/#namespaces
#
# Allow - for td-attrs

# Be very lenient - just no whitespace or special HTML chars
# I don't think this is more lenient than HTML5, though we should check.
_UNQUOTED_VALUE = r'''[^ \t\r\n<>&"'\x00]*'''

# TODO: we don't need to capture the tag name here? That's done at the top
# level
_TAG_RE = re.compile(r'/? \s* (%s)' % _NAME, re.VERBOSE)

_TAG_LAST_RE = re.compile(r'\s* /? >', re.VERBOSE)

# To match href="foo"
# Note: in HTML5 and XML, single quoted attributes are also valid

# <button disabled> is standard usage

# NOTE: This used to allow whitespace around =
# <a foo = "bar"> makes sense in XML
# But then you also have
# <a foo= bar> - which is TWO attributes, in HTML5
# So the space is problematic

_ATTR_RE = re.compile(
    r'''
\s+                 # Leading whitespace is required
(%s)                # Attribute name
(?:                 # Optional attribute value
  =
  (?:
    " ([^>"\x00]*) "   # double quoted value
  | ' ([^>'\x00]*) '   # single quoted value
  | (%s)               # Attribute value
  )
)?
''' % (_NAME, _UNQUOTED_VALUE), re.VERBOSE)

TagName, AttrName, UnquotedValue, QuotedValue, MissingValue = range(5)


class TagLexer(object):
    """
    Given a tag like <a href="..."> or <link type="..." />, the TagLexer
    provides a few operations:

    - What is the tag?
    - Iterate through the attributes, giving (name, value_start_pos, value_end_pos)
    """

    def __init__(self, s):
        self.s = s
        self.start_pos = -1  # Invalid
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        """Reuse instances of this object."""
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos

        self.start_pos = start_pos
        self.end_pos = end_pos

    def TagString(self):
        return self.s[self.start_pos:self.end_pos]

    def TagName(self):
        # First event
        tok_id, start, end = next(self.Tokens())
        return self.s[start:end]

    def GetSpanForAttrValue(self, attr_name):
        """
        Used by oils_doc.py, for href shortcuts
        """
        # Algorithm: search for QuotedValue or UnquotedValue after AttrName
        # TODO: Could also cache these

        events = self.Tokens()
        val = (-1, -1)
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == AttrName:
                    name = self.s[start:end]
                    if name == attr_name:
                        # The value should come next
                        tok_id, start, end = next(events)
                        assert tok_id in (QuotedValue, UnquotedValue,
                                          MissingValue), TokenName(tok_id)
                        val = start, end
                        break

        except StopIteration:
            pass
        return val

    def GetAttrRaw(self, attr_name):
        """
        Return the value, which may be UNESCAPED.
        """
        start, end = self.GetSpanForAttrValue(attr_name)
        if start == -1:
            return None
        return self.s[start:end]

    def AllAttrsRawSlice(self):
        """
        Get a list of (name, start, end) triples like [('class', 3, 5), ('href', 9, 12)]
        """
        slices = []
        events = self.Tokens()
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == AttrName:
                    name = self.s[start:end]

                    # The value should come next
                    tok_id, start, end = next(events)
                    assert tok_id in (QuotedValue, UnquotedValue,
                                      MissingValue), TokenName(tok_id)
                    # Note: quoted values may have &amp;
                    # We would need ANOTHER lexer to unescape them, but we
                    # don't need that for ul-table
                    slices.append((name, start, end))
        except StopIteration:
            pass
        return slices

    def AllAttrsRaw(self):
        """
        Get a list of pairs [('class', 'foo'), ('href', '?foo=1&amp;bar=2')]

        The quoted values may be escaped.  We would need another lexer to
        unescape them.
        """
        slices = self.AllAttrsRawSlice()
        pairs = []
        for name, start, end in slices:
            pairs.append((name, self.s[start:end]))
        return pairs

    def Tokens(self):
        """
        Yields a sequence of tokens: Tag (AttrName AttrValue?)*

        Where each Token is (Type, start_pos, end_pos)

        Note that start and end are NOT redundant!  We skip over some unwanted
        characters.
        """
        m = _TAG_RE.match(self.s, self.start_pos + 1)
        if not m:
            raise RuntimeError("Couldn't find HTML tag in %r" %
                               self.TagString())
        yield TagName, m.start(1), m.end(1)

        pos = m.end(0)
        #log('POS %d', pos)

        while True:
            # don't search past the end
            m = _ATTR_RE.match(self.s, pos, self.end_pos)
            if not m:
                #log('BREAK pos %d', pos)
                break
            #log('AttrName %r', m.group(1))

            yield AttrName, m.start(1), m.end(1)

            #log('m.groups() %r', m.groups())
            if m.group(2) is not None:
                # double quoted
                yield QuotedValue, m.start(2), m.end(2)
            elif m.group(3) is not None:
                # single quoted - TODO: could have different token types
                yield QuotedValue, m.start(3), m.end(3)
            elif m.group(4) is not None:
                yield UnquotedValue, m.start(4), m.end(4)
            else:
                # <button disabled>
                end = m.end(0)
                yield MissingValue, end, end

            # Skip past the closing quote (the end of the whole match)
            pos = m.end(0)

        #log('TOK %r', self.s)

        m = _TAG_LAST_RE.match(self.s, pos)
        #log('_TAG_LAST_RE match %r', self.s[pos:])
        if not m:
            # Extra data at end of tag.  TODO: add messages for all these.
            raise LexError(self.s, pos)

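# An illustrative sketch, NOT called by Oils: lex a single tag.  Reset()
# takes the span of the tag within the document; here it's the whole string.
def _DemoTagLexer():
    s = '<a href="/foo" class="big">'
    tag_lexer = TagLexer(s)
    tag_lexer.Reset(0, len(s))
    log('tag   = %s', tag_lexer.TagName())           # => a
    log('href  = %s', tag_lexer.GetAttrRaw('href'))  # => /foo
    log('attrs = %s', tag_lexer.AllAttrsRaw())
    # => [('href', '/foo'), ('class', 'big')]
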
|
# This is similar but not identical to
#    " ([^>"\x00]*) "    # double quoted value
#  | ' ([^>'\x00]*) '    # single quoted value
#
# Note: for unquoted values, & isn't allowed, and thus entity references like
# &amp; and &#99; and &#x2122; are not allowed.  We could relax that?
ATTR_VALUE_LEXER = CHAR_LEX + [
    (r'[^>&\x00]+', Tok.RawData),
    (r'.', Tok.Invalid),
]

ATTR_VALUE_LEXER = MakeLexer(ATTR_VALUE_LEXER)


class AttrValueLexer(object):
    """
    <a href="foo=99&amp;bar">
    <a href='foo=99&amp;bar'>
    <a href=unquoted>
    """

    def __init__(self, s):
        self.s = s
        self.start_pos = -1  # Invalid
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        """Reuse instances of this object."""
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos

        self.start_pos = start_pos
        self.end_pos = end_pos

    def NumTokens(self):
        num_tokens = 0
        pos = self.start_pos
        for tok_id, end_pos in self.Tokens():
            if tok_id == Tok.Invalid:
                raise LexError(self.s, pos)
            pos = end_pos
            #log('pos %d', pos)
            num_tokens += 1
        return num_tokens

    def Tokens(self):
        pos = self.start_pos
        while pos < self.end_pos:
            # Find the first match, like above.
            # Note: frontend/match.py uses _LongestMatch(), which is different!
            # TODO: reconcile them. This lexer should be expressible in re2c.
            for pat, tok_id in ATTR_VALUE_LEXER:
                m = pat.match(self.s, pos)
                if m:
                    if 0:
                        tok_str = m.group(0)
                        log('token = %r', tok_str)

                    end_pos = m.end(0)
                    yield tok_id, end_pos
                    pos = end_pos
                    break
            else:
                raise AssertionError('Tok.Invalid rule should have matched')

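# An illustrative sketch, NOT called by Oils: count the tokens in one
# attribute value.  The span (9, 23) is the text inside the double quotes,
# which lexes as RawData, CharEntity, RawData.
def _DemoAttrValueLexer():
    s = '<a href="foo=99&amp;bar">'
    val_lexer = AttrValueLexer(s)
    val_lexer.Reset(9, 23)
    log('%d tokens', val_lexer.NumTokens())  # => 3
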
|
def ReadUntilStartTag(it, tag_lexer, tag_name):
    """Find the next <foo>, returning its (start, end) positions

    Raise ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == Tok.StartTag and tag_lexer.TagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No start tag %r' % tag_name)


def ReadUntilEndTag(it, tag_lexer, tag_name):
    """Find the next </foo>, returning its (start, end) positions

    Raise ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == Tok.EndTag and tag_lexer.TagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No end tag %r' % tag_name)

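# An illustrative sketch, NOT called by Oils: extract the text between
# <title> and </title> by combining the token iterator with a TagLexer.
def _DemoReadUntil():
    s = '<html><title>Hi</title></html>'
    it = ValidTokens(s)
    tag_lexer = TagLexer(s)
    _, text_start = ReadUntilStartTag(it, tag_lexer, 'title')
    text_end, _ = ReadUntilEndTag(it, tag_lexer, 'title')
    log('title = %r', s[text_start:text_end])  # => 'Hi'
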
|
CHAR_ENTITY = {
    'amp': '&',
    'lt': '<',
    'gt': '>',
    'quot': '"',
}


def ToText(s, left_pos=0, right_pos=-1):
    """Given HTML, return text by unquoting &gt; and &lt; etc.

    Used by:
      doctools/oils_doc.py: PygmentsPlugin
      doctools/help_gen.py: HelpIndexCards

    In the latter case, we could process some tags, like:

    - Blue Link (not clickable, but still useful)
    - Red X

    That should be html.ToAnsi.
    """
    f = StringIO()
    out = Output(s, f, left_pos, right_pos)

    pos = left_pos
    for tok_id, end_pos in ValidTokens(s, left_pos, right_pos):
        if tok_id == Tok.RawData:
            out.SkipTo(pos)
            out.PrintUntil(end_pos)

        elif tok_id == Tok.CharEntity:  # e.g. &amp;

            entity = s[pos + 1:end_pos - 1]

            out.SkipTo(pos)
            out.Print(CHAR_ENTITY[entity])
            out.SkipTo(end_pos)

        # Not handling these yet
        elif tok_id == Tok.HexChar:
            raise AssertionError('Hex Char %r' % s[pos:pos + 20])

        elif tok_id == Tok.DecChar:
            raise AssertionError('Dec Char %r' % s[pos:pos + 20])

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()

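# An illustrative sketch, NOT called by Oils: tags are skipped, and the
# entities in CHAR_ENTITY are unquoted.
def _DemoToText():
    log('%r', ToText('<b>x</b> &amp; y'))  # => 'x & y'
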
|
# https://developer.mozilla.org/en-US/docs/Glossary/Void_element
VOID_ELEMENTS = [
    'area',
    'base',
    'br',
    'col',
    'embed',
    'hr',
    'img',
    'input',
    'link',
    'meta',
    'param',
    'source',
    'track',
    'wbr',
]

LEX_ATTRS = 1 << 1
LEX_QUOTED_VALUES = 1 << 2  # href="?x=42&y=99"
NO_SPECIAL_TAGS = 1 << 3  # <script> <style>, VOID tags, etc.
BALANCED_TAGS = 1 << 4  # are tags balanced?


def Validate(contents, flags, counters):
    # type: (str, int, Counters) -> None

    tag_lexer = TagLexer(contents)
    val_lexer = AttrValueLexer(contents)

    no_special_tags = bool(flags & NO_SPECIAL_TAGS)
    lx = Lexer(contents, no_special_tags=no_special_tags)
    tokens = []
    start_pos = 0
    tag_stack = []
    while True:
        tok_id, end_pos = lx.Read()
        #log('TOP %s %r', TokenName(tok_id), contents[start_pos:end_pos])

        if tok_id == Tok.Invalid:
            raise LexError(contents, start_pos)
        if tok_id == Tok.EndOfStream:
            break

        tokens.append((tok_id, end_pos))

        if tok_id == Tok.StartEndTag:
            counters.num_start_end_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRawSlice()
            counters.num_attrs += len(all_attrs)
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                counters.num_val_tokens += val_lexer.NumTokens()

            counters.debug_attrs.extend(all_attrs)

        elif tok_id == Tok.StartTag:
            counters.num_start_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRawSlice()
            counters.num_attrs += len(all_attrs)
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                counters.num_val_tokens += val_lexer.NumTokens()

            counters.debug_attrs.extend(all_attrs)

            if flags & BALANCED_TAGS:
                tag_name = lx.TagName()
                if flags & NO_SPECIAL_TAGS:
                    tag_stack.append(tag_name)
                else:
                    # e.g. <meta> is considered self-closing, like <meta/>
                    if tag_name not in VOID_ELEMENTS:
                        tag_stack.append(tag_name)

                counters.max_tag_stack = max(counters.max_tag_stack,
                                             len(tag_stack))
        elif tok_id == Tok.EndTag:
            if flags & BALANCED_TAGS:
                try:
                    expected = tag_stack.pop()
                except IndexError:
                    raise ParseError('Tag stack empty',
                                     s=contents,
                                     start_pos=start_pos)

                actual = lx.TagName()
                if expected != actual:
                    raise ParseError(
                        'Got unexpected closing tag %r; opening tag was %r' %
                        (contents[start_pos:end_pos], expected),
                        s=contents,
                        start_pos=start_pos)

        start_pos = end_pos

    if len(tag_stack) != 0:
        raise ParseError('Missing closing tags at end of doc: %s' %
                         ' '.join(tag_stack),
                         s=contents,
                         start_pos=start_pos)

    counters.num_tokens += len(tokens)


class Counters(object):

    def __init__(self):
        self.num_tokens = 0
        self.num_start_tags = 0
        self.num_start_end_tags = 0
        self.num_attrs = 0
        self.max_tag_stack = 0
        self.num_val_tokens = 0

        self.debug_attrs = []

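# An illustrative sketch, NOT called by Oils: validate one document string
# with balanced tags, then inspect a couple of the counters.
def _DemoValidate():
    counters = Counters()
    flags = LEX_ATTRS | LEX_QUOTED_VALUES | BALANCED_TAGS
    Validate('<p class="big">hi</p>', flags, counters)
    log('%d start tags, %d attrs', counters.num_start_tags,
        counters.num_attrs)  # => 1 start tags, 1 attrs
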
|
def main(argv):
    action = argv[1]

    if action == 'tokens':
        contents = sys.stdin.read()

        lx = Lexer(contents)
        start_pos = 0
        while True:
            tok_id, end_pos = lx.Read()
            if tok_id == Tok.Invalid:
                raise LexError(contents, start_pos)
            if tok_id == Tok.EndOfStream:
                break

            frag = contents[start_pos:end_pos]
            log('%d %s %r', end_pos, TokenName(tok_id), frag)
            start_pos = end_pos

        return 0

    elif action in ('lex-htm8', 'parse-htm8', 'parse-xml'):

        errors = []
        counters = Counters()

        flags = LEX_ATTRS | LEX_QUOTED_VALUES
        if action.startswith('parse-'):
            flags |= BALANCED_TAGS
        if action == 'parse-xml':
            flags |= NO_SPECIAL_TAGS

        i = 0
        for line in sys.stdin:
            filename = line.strip()
            with open(filename) as f:
                contents = f.read()

            try:
                Validate(contents, flags, counters)
            except LexError as e:
                log('Lex error in %r: %s', filename, e)
                errors.append((filename, e))
            except ParseError as e:
                log('Parse error in %r: %s', filename, e)
                errors.append((filename, e))
            i += 1

        log('')
        log('%10d tokens', counters.num_tokens)
        log('%10d start/end tags', counters.num_start_end_tags)
        log('%10d start tags', counters.num_start_tags)
        log('%10d attrs', counters.num_attrs)
        log('%10d max tag stack depth', counters.max_tag_stack)
        log('%10d attr val tokens', counters.num_val_tokens)
        log('%10d errors', len(errors))
        if len(errors):
            return 1
        return 0

    elif action == 'todo':
        # Other algorithms:
        #
        # - select first subtree with given ID
        #   - this requires understanding the void tags I suppose
        # - select all subtrees that have a class
        # - materialize DOM

        # Safe-HTM8?  This is a filter
        return 0

    else:
        raise RuntimeError('Invalid action %r' % action)


if __name__ == '__main__':
    sys.exit(main(sys.argv))