lazylex/html.py

OILS / lazylex / html.py View on Github | oils.pub

1105 lines, 551 significant

1	#!/usr/bin/env python2
2	"""
3	lazylex/html.py - Low-Level HTML Processing.
4
5	See lazylex/README.md for details.
6
7	TODO:
8	- Get rid of AttrValueLexer - this should be in the TagLexer
9	- this also means that unquoted values can be more similar
10	- We can use a single lexer mode for everything inside <>
11	- the SPACE is the only difference
12	- UTF-8 check, like JSON8
13	- Static typing
14
15	"""
16	from __future__ import print_function
17
18	from _devbuild.gen.htm8_asdl import (h8_id, h8_id_t, h8_id_str, h8_tag_id,
19	h8_tag_id_t, h8_tag_id_str)
20	from typing import Dict, Iterator, Any, IO
21
22	try:
23	from cStringIO import StringIO
24	except ImportError:
25	# for python3
26	from io import StringIO # type: ignore
27	import re
28	import sys
29
30	if sys.version_info.major == 2:
31	from typing import List, Tuple, Optional
32
33
def log(msg, *args):
    # type: (str, *Any) -> None
    """Write one %-formatted diagnostic line to stderr.

    Args:
      msg: printf-style format string.
      *args: values interpolated into msg with the % operator.
    """
    print(msg % args, file=sys.stderr)
38
39
class LexError(Exception):
    """
    Examples of lex errors:

    - h8_id.Invalid, like <> or &&
    - Unclosed <!--  <?  <![CDATA[  <script>  <style>
    """

    def __init__(self, s, start_pos):
        # type: (str, int) -> None
        # s is the whole input; start_pos is the offset where lexing failed.
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        # type: () -> str
        # Show at most 20 characters of input starting at the error position.
        return '(LexError %r)' % (self.s[self.start_pos:self.start_pos + 20])
56
57
def FindLineNum(s, error_pos):
    # type: (str, int) -> int
    """Return the 1-based line number that contains offset error_pos in s.

    A newline character is counted as part of the line it terminates.
    """
    current_pos = 0
    line_num = 1
    while True:
        newline_pos = s.find('\n', current_pos)
        #log('current = %d, N %d, line %d', current_pos, newline_pos, line_num)

        if newline_pos == -1:  # this is the last line
            return line_num
        if newline_pos >= error_pos:
            # The error is on or before this line's terminating newline
            return line_num
        line_num += 1
        current_pos = newline_pos + 1
72
73
class ParseError(Exception):
    """
    Examples of parse errors

    - unbalanced tag structure
    - ul_table.py errors
    """

    def __init__(self, msg, s=None, start_pos=-1):
        # type: (str, Optional[str], int) -> None
        # s and start_pos are optional; when s is given, start_pos must be
        # given too (checked in __str__).
        self.msg = msg
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        # type: () -> str
        if self.s is not None:
            assert self.start_pos != -1, self.start_pos
            # Up to 20 characters of input, starting at the error position
            snippet = (self.s[self.start_pos:self.start_pos + 20])

            line_num = FindLineNum(self.s, self.start_pos)
        else:
            # No source location available
            snippet = ''
            line_num = -1
        msg = 'line %d: %r %r' % (line_num, self.msg, snippet)
        return msg
100
101
class Output(object):
    """Takes an underlying input buffer and an output file.  Maintains a
    position in the input buffer.

    Print FROM the input or print new text to the output.
    """

    def __init__(self, s, f, left_pos=0, right_pos=-1):
        # type: (str, IO[str], int, int) -> None
        self.s = s
        self.f = f
        self.pos = left_pos
        # -1 means "up to the end of s"
        self.right_pos = len(s) if right_pos == -1 else right_pos

    def SkipTo(self, pos):
        # type: (int) -> None
        """Skip to a position, without writing anything."""
        self.pos = pos

    def PrintUntil(self, pos):
        # type: (int) -> None
        """Write the input from the current position up to pos, and move there."""
        piece = self.s[self.pos:pos]
        self.f.write(piece)
        self.pos = pos

    def PrintTheRest(self):
        # type: () -> None
        """Write the input from the current position up to right_pos."""
        self.PrintUntil(self.right_pos)

    def Print(self, s):
        # type: (str) -> None
        """Write new text to the output file; the input position is unchanged."""
        self.f.write(s)
137
138
def MakeLexer(rules):
    """Compile a list of (pattern, token id) rules.

    Each pattern is compiled with re.VERBOSE, so unescaped whitespace in a
    pattern is ignored and '#' must be escaped.

    Returns:
      A list of (compiled pattern, token id) pairs, in the same order.
    """
    return [(re.compile(pat, re.VERBOSE), i) for (pat, i) in rules]
141
142
143	#
144	# Eggex
145	#
146	# Tag = / ~['>']+ /
147
148	# Is this valid? A single character?
149	# Tag = / ~'>'* /
150
151	# Maybe better: / [NOT '>']+/
152	# capital letters not allowed there?
153	#
154	# But then this is confusing:
155	# / [NOT ~digit]+/
156	#
157	# / [NOT digit] / is [^\d]
158	# / ~digit / is \D
159	#
160	# Or maybe:
161	#
162	# / [~ digit]+ /
163	# / [~ '>']+ /
164	# / [NOT '>']+ /
165
166	# End = / '</' Tag '>' /
167	# StartEnd = / '<' Tag '/>' /
168	# Start = / '<' Tag '>' /
169	#
170	# EntityRef = / '&' dot{* N} ';' /
171
172	# Tag name, or attribute name
173	# colon is used in XML
174
175	# https://www.w3.org/TR/xml/#NT-Name
176	# Hm there is a lot of unicode stuff. We are simplifying parsing
177
_NAME = r'[a-zA-Z][a-zA-Z0-9:_\-]*'  # must start with letter

# Rules for character references.  Patterns are compiled with re.VERBOSE (see
# MakeLexer), so spaces are insignificant and '#' has to be escaped.
CHAR_LEX = [
    # Characters
    # https://www.w3.org/TR/xml/#sec-references
    (r'&\# [0-9]+ ;', h8_id.DecChar),
    (r'&\# x[0-9a-fA-F]+ ;', h8_id.HexChar),
    (r'& %s ;' % _NAME, h8_id.CharEntity),
    # Allow unquoted, and quoted
    (r'&', h8_id.BadAmpersand),
]
189
# Top-level rules.  Order matters: the lexer takes the FIRST rule that matches.
HTM8_LEX = CHAR_LEX + [
    (r'<!--', h8_id.CommentBegin),

    # Processing instruction are used for the XML header:
    # <?xml version="1.0" encoding="UTF-8"?>
    # They are technically XML-only, but in HTML5, they are another kind of
    # comment:
    #
    #   https://developer.mozilla.org/en-US/docs/Web/API/ProcessingInstruction
    #
    (r'<\?', h8_id.ProcessingBegin),
    # Not necessary in HTML5, but occurs in XML
    (r'<!\[CDATA\[', h8_id.CDataBegin),  # <![CDATA[

    # Markup declarations
    # - In HTML5, there is only <!DOCTYPE html>
    # - XML has 4 more declarations: <!ELEMENT ...> ATTLIST ENTITY NOTATION
    #   - these seem to be part of DTD
    #   - it's useful to skip these, and be able to parse the rest of the document
    # - Note: < is allowed?
    (r'<! [^>\x00]+ >', h8_id.Decl),

    # Tags
    # Notes:
    # - We look for a valid tag name, but we don't validate attributes.
    #   That's done in the tag lexer.
    # - We don't allow leading whitespace
    (r'</ (%s) >' % _NAME, h8_id.EndTag),
    # self-closing <br/> comes before StartTag
    # could/should these be collapsed into one rule?
    (r'< (%s) [^>\x00]* />' % _NAME, h8_id.StartEndTag),  # self-closing <br/>
    (r'< (%s) [^>\x00]* >' % _NAME, h8_id.StartTag),  # start <a>

    # HTML5 allows unescaped > in raw data, but < is not allowed.
    # https://stackoverflow.com/questions/10462348/right-angle-bracket-in-html
    #
    # - My early blog has THREE errors when disallowing >
    # - So do some .wwz files
    (r'[^&<>\x00]+', h8_id.RawData),
    (r'>', h8_id.BadGreaterThan),
    # < is an error
    (r'.', h8_id.Invalid),
]
233
234	# Old notes:
235	#
236	# Non-greedy matches are regular and can be matched in linear time
237	# with RE2.
238	#
239	# https://news.ycombinator.com/item?id=27099798
240	#
241	# Maybe try combining all of these for speed.
242
243	# . is any char except newline
244	# https://re2c.org/manual/manual_c.html
245
246	# Discarded options
247	#(r'<!-- .*? -->', h8_id.Comment),
248
249	# Hack from Claude: \s\S instead of re.DOTALL. I don't like this
250	#(r'<!-- [\s\S]*? -->', h8_id.Comment),
251	#(r'<!-- (?:.\|[\n])*? -->', h8_id.Comment),
252
# (compiled pattern, token id) pairs, in the same order as HTM8_LEX
HTM8_LEX_COMPILED = MakeLexer(HTM8_LEX)
254
255
class Lexer(object):
    """Splits the span s[left_pos:right_pos] into top-level HTM8 tokens.

    Read() returns (token id, end position); a token's start position is the
    end position of the previous token (or left_pos for the first one).

    All matching is confined to the span, so a token never extends past
    right_pos.
    """

    def __init__(self, s, left_pos=0, right_pos=-1, no_special_tags=False):
        # type: (str, int, int, bool) -> None
        self.s = s
        self.pos = left_pos
        # -1 means "up to the end of s"
        self.right_pos = len(s) if right_pos == -1 else right_pos
        self.no_special_tags = no_special_tags

        # string -> compiled regex pattern object
        self.cache = {}  # type: Dict[str, Any]

        # either </script> or </style> - we search until we see that
        self.search_state = None  # type: Optional[str]

        # Position of tag name, if applicable
        # - Set after you get a StartTag, EndTag, or StartEndTag
        # - Unset on other tags
        self.tag_pos_left = -1
        self.tag_pos_right = -1

    def _Read(self):
        # type: () -> Tuple[h8_id_t, int]
        """Return the next (token id, end position) without advancing self.pos.

        Raises:
          LexError: for an unterminated comment, processing instruction,
            CDATA section, or <script>/<style> body.
        """
        if self.pos == self.right_pos:
            return h8_id.EndOfStream, self.pos

        assert self.pos < self.right_pos, self.pos

        if self.search_state is not None and not self.no_special_tags:
            # TODO: case-insensitive search for </SCRIPT> <SCRipt> ?
            #
            # Another strategy: enter a mode where we find ONLY the end tag
            # regex, and any data that's not <, and then check the canonical
            # tag name for 'script' or 'style'.
            pos = self.s.find(self.search_state, self.pos, self.right_pos)
            if pos == -1:
                # unterminated <script> or <style>
                raise LexError(self.s, self.pos)
            self.search_state = None
            # The token ends at the beginning of the closing tag
            return h8_id.HtmlCData, pos

        # Find the first match.
        # Note: frontend/match.py uses _LongestMatch(), which is different!
        # TODO: reconcile them.  This lexer should be expressible in re2c.

        for pat, tok_id in HTM8_LEX_COMPILED:
            # Passing right_pos as endpos keeps the token inside the span;
            # otherwise self.pos could move past right_pos and trip the
            # assertion above on the next call.
            m = pat.match(self.s, self.pos, self.right_pos)
            if m:
                if tok_id in (h8_id.StartTag, h8_id.EndTag, h8_id.StartEndTag):
                    self.tag_pos_left = m.start(1)
                    self.tag_pos_right = m.end(1)
                else:
                    # Reset state
                    self.tag_pos_left = -1
                    self.tag_pos_right = -1

                if tok_id == h8_id.CommentBegin:
                    pos = self.s.find('-->', self.pos, self.right_pos)
                    if pos == -1:
                        # unterminated <!--
                        raise LexError(self.s, self.pos)
                    return h8_id.Comment, pos + 3  # -->

                if tok_id == h8_id.ProcessingBegin:
                    pos = self.s.find('?>', self.pos, self.right_pos)
                    if pos == -1:
                        # unterminated <?
                        raise LexError(self.s, self.pos)
                    return h8_id.Processing, pos + 2  # ?>

                if tok_id == h8_id.CDataBegin:
                    pos = self.s.find(']]>', self.pos, self.right_pos)
                    if pos == -1:
                        # unterminated <![CDATA[
                        raise LexError(self.s, self.pos)
                    return h8_id.CData, pos + 3  # ]]>

                if tok_id == h8_id.StartTag:
                    # TODO: reduce allocations
                    if (self.TagNameEquals('script') or
                            self.TagNameEquals('style')):
                        # <SCRipt a=b> -> </SCRipt>
                        self.search_state = '</' + self._LiteralTagName() + '>'

                return tok_id, m.end()
        else:
            raise AssertionError('h8_id.Invalid rule should have matched')

    def TagNameEquals(self, expected):
        # type: (str) -> bool
        """Compare the current tag's lower-cased name with expected.

        Only valid right after a StartTag, EndTag, or StartEndTag.
        """
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        # TODO: In C++, this does not need an allocation.  Can we test
        # directly?
        return expected == self.CanonicalTagName()

    def _LiteralTagName(self):
        # type: () -> str
        """Return the current tag's name exactly as written in the input."""
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        return self.s[self.tag_pos_left:self.tag_pos_right]

    def CanonicalTagName(self):
        # type: () -> str
        """Return the current tag's name in lower case."""
        tag_name = self._LiteralTagName()
        # Most tags are already lower case, so avoid allocation with this conditional
        # TODO: this could go in the mycpp runtime?
        if tag_name.islower():
            return tag_name
        else:
            return tag_name.lower()

    def Read(self):
        # type: () -> Tuple[h8_id_t, int]
        """Return the next (token id, end position) and advance past it."""
        tok_id, end_pos = self._Read()
        self.pos = end_pos  # advance
        return tok_id, end_pos

    def LookAhead(self, regex):
        # type: (str) -> bool
        """Return whether regex matches at the current position, within the span."""
        # Cache the regex compilation.  This could also be LookAheadFor(THEAD)
        # or something.
        pat = self.cache.get(regex)
        if pat is None:
            pat = re.compile(regex)
            self.cache[regex] = pat

        m = pat.match(self.s, self.pos, self.right_pos)
        return m is not None
388
389
def _Tokens(s, left_pos, right_pos):
    # type: (str, int, int) -> Iterator[Tuple[h8_id_t, int]]
    """Yield (token id, end position) pairs, ending with EndOfStream.

    Invalid tokens are yielded like any other; see ValidTokens().

    Args:
      s: string to parse
      left_pos, right_pos: Optional span boundaries.
    """
    lx = Lexer(s, left_pos, right_pos)
    while True:
        tok_id, pos = lx.Read()
        yield tok_id, pos
        if tok_id == h8_id.EndOfStream:
            break
403
404
def ValidTokens(s, left_pos=0, right_pos=-1):
    # type: (str, int, int) -> Iterator[Tuple[h8_id_t, int]]
    """Wrapper around _Tokens to prevent callers from having to handle Invalid.

    I'm not combining the two functions because I might want to do a
    'yield' transformation on Tokens()?  Exceptions might complicate the
    issue?

    Raises:
      LexError: at the start position of the first Invalid token.
    """
    pos = left_pos  # start of the token about to be yielded
    for tok_id, end_pos in _Tokens(s, left_pos, right_pos):
        if tok_id == h8_id.Invalid:
            raise LexError(s, pos)
        yield tok_id, end_pos
        pos = end_pos
419
420
def ValidTokenList(s, no_special_tags=False):
    # type: (str, bool) -> List[Tuple[h8_id_t, int]]
    """A wrapper that can be more easily translated to C++.  Doesn't use iterators.

    Returns:
      All (token id, end position) pairs for s, including the final
      EndOfStream.

    Raises:
      LexError: at the start position of the first Invalid token.
    """
    start_pos = 0
    tokens = []
    lx = Lexer(s, no_special_tags=no_special_tags)
    while True:
        tok_id, end_pos = lx.Read()
        tokens.append((tok_id, end_pos))
        if tok_id == h8_id.EndOfStream:
            break
        if tok_id == h8_id.Invalid:
            raise LexError(s, start_pos)
        start_pos = end_pos
    return tokens
437
438
439	# Tag names:
440	# Match <a or </a
441	# Match <h2, but not <2h
442	#
443	# HTML 5 doesn't restrict tag names at all
444	# https://html.spec.whatwg.org/#toc-syntax
445	#
446	# XML allows : - .
447	# https://www.w3.org/TR/xml/#NT-NameChar
448
449	# Namespaces for MathML, SVG
450	# XLink, XML, XMLNS
451	#
452	# https://infra.spec.whatwg.org/#namespaces
453	#
454	# Allow - for td-attrs
455
456	# Be very lenient - just no whitespace or special HTML chars
457	# I don't think this is more lenient than HTML5, though we should check.
_UNQUOTED_VALUE = r'''[^ \t\r\n<>&"'\x00]*'''

# TODO: we don't need to capture the tag name here?  That's done at the top
# level
_TAG_RE = re.compile(r'/? \s* (%s)' % _NAME, re.VERBOSE)

_TAG_LAST_RE = re.compile(r'\s* /? >', re.VERBOSE)

# To match href="foo"
# Note: in HTML5 and XML, single quoted attributes are also valid

# <button disabled> is standard usage

# NOTE: This used to allow whitespace around =
# <a foo = "bar">  makes sense in XML
# But then you also have
# <a foo= bar> - which is TWO attributes, in HTML5
# So the space is problematic
#
# NOTE(review): the pattern below still has \s* on both sides of '=', so
# whitespace around '=' is currently accepted.

# Groups: 1 = attribute name, 2 = double quoted value, 3 = single quoted
# value, 4 = unquoted value.  The three value forms are ALTERNATIVES, so they
# must be separated by a bare '|'.
_ATTR_RE = re.compile(
    r'''
\s+                     # Leading whitespace is required
(%s)                    # Attribute name
(?:                     # Optional attribute value
  \s* = \s*             # Spaces allowed around =
  (?:
    " ([^>"\x00]*) "    # double quoted value
  | ' ([^>'\x00]*) '    # single quoted value
  | (%s)                # Attribute value
  )
)?
''' % (_NAME, _UNQUOTED_VALUE), re.VERBOSE)
490
491
class TagLexer(object):
    """
    Given a tag like <a href="..."> or <link type="..." />, the TagLexer
    provides a few operations:

    - What is the tag?
    - Iterate through the attributes, giving (name, value_start_pos, value_end_pos)

    Call Reset() with the tag's span before using the other methods.
    """

    def __init__(self, s):
        # type: (str) -> None
        self.s = s
        self.start_pos = -1  # Invalid
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        # type: (int, int) -> None
        """Reuse instances of this object."""
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos

        self.start_pos = start_pos
        self.end_pos = end_pos

    def TagString(self):
        # type: () -> str
        """Return the entire tag string, e.g. <a href='foo'>"""
        return self.s[self.start_pos:self.end_pos]

    def TagName(self):
        # type: () -> str
        """Return the tag name as written in the input."""
        # First event
        tok_id, start, end = next(self.Tokens())
        return self.s[start:end]

    def GetSpanForAttrValue(self, attr_name):
        # type: (str) -> Tuple[int, int]
        """
        Used by oils_doc.py, for href shortcuts

        Returns:
          (start, end) of the value of the first attribute named attr_name,
          or (-1, -1) if the tag has no such attribute.
        """
        # Algorithm: search for QuotedValue or UnquotedValue after AttrName
        # TODO: Could also cache these

        events = self.Tokens()
        val = (-1, -1)
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == h8_tag_id.AttrName:
                    name = self.s[start:end]
                    if name == attr_name:
                        # The value should come next
                        tok_id, start, end = next(events)
                        assert tok_id in (
                            h8_tag_id.QuotedValue, h8_tag_id.UnquotedValue,
                            h8_tag_id.MissingValue), h8_tag_id_str(tok_id)
                        val = start, end
                        break

        except StopIteration:
            pass
        return val

    def GetAttrRaw(self, attr_name):
        # type: (str) -> Optional[str]
        """
        Return the value, which may be UNESCAPED.

        Returns None if the tag has no attribute named attr_name.
        """
        start, end = self.GetSpanForAttrValue(attr_name)
        if start == -1:
            return None
        return self.s[start:end]

    def AllAttrsRawSlice(self):
        # type: () -> List[Tuple[str, int, int]]
        """
        Get a list of pairs [('class', 3, 5), ('href', 9, 12)]

        Raises:
          LexError: if there is unexpected data before the closing > (raised
            by Tokens()).
        """
        slices = []
        events = self.Tokens()
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == h8_tag_id.AttrName:
                    name = self.s[start:end]

                    # The value should come next
                    tok_id, start, end = next(events)
                    assert tok_id in (
                        h8_tag_id.QuotedValue, h8_tag_id.UnquotedValue,
                        h8_tag_id.MissingValue), h8_tag_id_str(tok_id)
                    # Note: quoted values may have &amp;
                    # We would need ANOTHER lexer to unescape them, but we
                    # don't need that for ul-table
                    slices.append((name, start, end))
        except StopIteration:
            pass
        return slices

    def AllAttrsRaw(self):
        # type: () -> List[Tuple[str, str]]
        """
        Get a list of pairs [('class', 'foo'), ('href', '?foo=1&amp;bar=2')]

        The quoted values may be escaped.  We would need another lexer to
        unescape them.
        """
        slices = self.AllAttrsRawSlice()
        pairs = []
        for name, start, end in slices:
            pairs.append((name, self.s[start:end]))
        return pairs

    def Tokens(self):
        # type: () -> Iterator[Tuple[h8_tag_id_t, int, int]]
        """
        Yields a sequence of tokens: Tag (AttrName AttrValue?)*

        Where each Token is (Type, start_pos, end_pos)

        Note that start and end are NOT redundant!  We skip over some unwanted
        characters.

        Raises:
          RuntimeError: if no tag name is found after the opening <.
          LexError: if the attributes are not followed by the closing >.
        """
        # All three regexes are confined to the tag's span (endpos), so
        # nothing outside [start_pos, end_pos) is ever matched.
        m = _TAG_RE.match(self.s, self.start_pos + 1, self.end_pos)
        if not m:
            raise RuntimeError("Couldn't find HTML tag in %r" %
                               self.TagString())
        yield h8_tag_id.TagName, m.start(1), m.end(1)

        pos = m.end(0)
        #log('POS %d', pos)

        while True:
            # don't search past the end
            m = _ATTR_RE.match(self.s, pos, self.end_pos)
            if not m:
                #log('BREAK pos %d', pos)
                break
            #log('AttrName %r', m.group(1))

            yield h8_tag_id.AttrName, m.start(1), m.end(1)

            #log('m.groups() %r', m.groups())
            if m.group(2) is not None:
                # double quoted
                yield h8_tag_id.QuotedValue, m.start(2), m.end(2)
            elif m.group(3) is not None:
                # single quoted - TODO: could have different token types
                yield h8_tag_id.QuotedValue, m.start(3), m.end(3)
            elif m.group(4) is not None:
                yield h8_tag_id.UnquotedValue, m.start(4), m.end(4)
            else:
                # <button disabled>
                end = m.end(0)
                yield h8_tag_id.MissingValue, end, end

            # Skip past the "
            pos = m.end(0)

        #log('TOK %r', self.s)

        m = _TAG_LAST_RE.match(self.s, pos, self.end_pos)
        #log('_TAG_LAST_RE match %r', self.s[pos:])
        if not m:
            # Extra data at end of tag.  TODO: add messages for all these.
            raise LexError(self.s, pos)
658
659
# This is similar but not identical to
#    " ([^>"\x00]*) "    # double quoted value
#  | ' ([^>'\x00]*) '    # single quoted value
#
# Note: for unquoted values, & isn't allowed, and thus &amp; and &#99; and
# &#x99; are not allowed.  We could relax that?
ATTR_VALUE_LEXER = CHAR_LEX + [
    (r'[^>&\x00]+', h8_id.RawData),
    (r'.', h8_id.Invalid),
]

# The name is rebound: from here on it holds (compiled pattern, token id) pairs
ATTR_VALUE_LEXER = MakeLexer(ATTR_VALUE_LEXER)
672
673
class AttrValueLexer(object):
    """
    Lexes the inside of an attribute value into character references and raw
    data:

    <a href="foo=99&amp;bar">
    <a href='foo=99&amp;bar'>
    <a href=unquoted>

    Call Reset() with the value's span before using the other methods.
    """

    def __init__(self, s):
        # type: (str) -> None
        self.s = s
        self.start_pos = -1  # Invalid
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        # type: (int, int) -> None
        """Reuse instances of this object."""
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos

        self.start_pos = start_pos
        self.end_pos = end_pos

    def NumTokens(self):
        # type: () -> int
        """Return the number of tokens in the value.

        Raises:
          LexError: at the start position of the first Invalid token.
        """
        num_tokens = 0
        pos = self.start_pos
        for tok_id, end_pos in self.Tokens():
            if tok_id == h8_id.Invalid:
                raise LexError(self.s, pos)
            pos = end_pos
            #log('pos %d', pos)
            num_tokens += 1
        return num_tokens

    def Tokens(self):
        # type: () -> Iterator[Tuple[h8_id_t, int]]
        """Yield (token id, end position) for each token in the value span."""
        pos = self.start_pos
        while pos < self.end_pos:
            # Find the first match, like above.
            # Note: frontend/match.py uses _LongestMatch(), which is different!
            # TODO: reconcile them.  This lexer should be expressible in re2c.
            for pat, tok_id in ATTR_VALUE_LEXER:
                # Confine the match to the value: without endpos, RawData
                # would run through the closing quote into the rest of the
                # document, and the yielded end position would be past
                # end_pos.
                m = pat.match(self.s, pos, self.end_pos)
                if m:
                    if 0:
                        tok_str = m.group(0)
                        log('token = %r', tok_str)

                    end_pos = m.end(0)
                    yield tok_id, end_pos
                    pos = end_pos
                    break
            else:
                raise AssertionError('h8_id.Invalid rule should have matched')
728
729
def ReadUntilStartTag(it, tag_lexer, tag_name):
    # type: (Iterator[Tuple[h8_id_t, int]], TagLexer, str) -> Tuple[int, int]
    """Find the next <foo>, returning its (start, end) positions

    Raise ParseError if it's not found.

    tag_lexer is RESET.

    NOTE(review): the first token is assumed to start at offset 0, so `it`
    presumably has to be a fresh token stream over the whole string - confirm
    against callers.
    """
    pos = 0  # start of the current token
    for tok_id, end_pos in it:
        tag_lexer.Reset(pos, end_pos)
        if tok_id == h8_id.StartTag and tag_lexer.TagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No start tag %r' % tag_name)
751
752
def ReadUntilEndTag(it, tag_lexer, tag_name):
    # type: (Iterator[Tuple[h8_id_t, int]], TagLexer, str) -> Tuple[int, int]
    """Find the next </foo>, returning its (start, end) position

    Raise ParseError if it's not found.

    tag_lexer is RESET.

    NOTE(review): token start positions are counted from offset 0, not from
    wherever `it` currently is - confirm that callers account for this.
    """
    pos = 0  # start of the current token
    for tok_id, end_pos in it:
        tag_lexer.Reset(pos, end_pos)
        if tok_id == h8_id.EndTag and tag_lexer.TagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No end tag %r' % tag_name)
774
775
# Entity name (without & and ;) -> the character it stands for
CHAR_ENTITY = {
    'amp': '&',
    'lt': '<',
    'gt': '>',
    'quot': '"',
    'apos': "'",
}
783
784
def ToText(s, left_pos=0, right_pos=-1):
    # type: (str, int, int) -> str
    """Given HTML, return text by unquoting &gt; and &lt; etc.

    Raw data is copied, named character entities are replaced using
    CHAR_ENTITY, and every other token (tags, comments, ...) is dropped.

    Used by:
      doctools/oils_doc.py: PygmentsPlugin
      doctools/help_gen.py: HelpIndexCards

    In the latter case, we could process some tags, like:

    - Blue Link (not clickable, but still useful)
    - Red X

    That should be html.ToAnsi.

    Raises:
      LexError: on invalid input (from ValidTokens).
      KeyError: for a named entity that is not in CHAR_ENTITY.
      AssertionError: for numeric character references, which aren't
        handled yet.
    """
    f = StringIO()
    out = Output(s, f, left_pos, right_pos)

    pos = left_pos  # start of the current token
    for tok_id, end_pos in ValidTokens(s, left_pos, right_pos):
        if tok_id in (h8_id.RawData, h8_id.BadAmpersand, h8_id.BadGreaterThan,
                      h8_id.BadLessThan):
            out.SkipTo(pos)
            out.PrintUntil(end_pos)

        elif tok_id == h8_id.CharEntity:  # &amp;

            # Strip the leading & and the trailing ;
            entity = s[pos + 1:end_pos - 1]

            out.SkipTo(pos)
            out.Print(CHAR_ENTITY[entity])
            out.SkipTo(end_pos)

        # Not handling these yet
        elif tok_id == h8_id.HexChar:
            raise AssertionError('Hex Char %r' % s[pos:pos + 20])

        elif tok_id == h8_id.DecChar:
            raise AssertionError('Dec Char %r' % s[pos:pos + 20])

        else:
            # Skip everything else
            out.SkipTo(end_pos)

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()
833
834
835	# https://developer.mozilla.org/en-US/docs/Glossary/Void_element
# Lower-case names of elements that never have a closing tag
VOID_ELEMENTS = [
    'area',
    'base',
    'br',
    'col',
    'embed',
    'hr',
    'img',
    'input',
    'link',
    'meta',
    'param',
    'source',
    'track',
    'wbr',
]
852
# Bit flags for Validate(); combine them with |
LEX_ATTRS = 1 << 1
LEX_QUOTED_VALUES = 1 << 2  # href="?x=42&amp;y=99"
NO_SPECIAL_TAGS = 1 << 3  # <script> <style>, VOID tags, etc.
BALANCED_TAGS = 1 << 4  # are tags balanced?
857
858
def Validate(contents, flags, counters):
    # type: (str, int, Counters) -> None
    """Lex a whole document, optionally check tag balance, and update counters.

    Attributes of every start tag and self-closing tag are lexed, and so are
    their values.

    Args:
      contents: the document
      flags: bitwise OR of the flag constants.  Only NO_SPECIAL_TAGS and
        BALANCED_TAGS are consulted here.
      counters: statistics, updated in place.  num_tokens is only updated
        when the whole document validates.

    Raises:
      LexError: on an invalid token, tag, or attribute value.
      ParseError: with BALANCED_TAGS, when tags don't balance.
    """
    tag_lexer = TagLexer(contents)
    val_lexer = AttrValueLexer(contents)

    no_special_tags = bool(flags & NO_SPECIAL_TAGS)
    lx = Lexer(contents, no_special_tags=no_special_tags)
    num_tokens = 0
    start_pos = 0
    tag_stack = []
    while True:
        tok_id, end_pos = lx.Read()
        #log('TOP %s %r', h8_id_str(tok_id), contents[start_pos:end_pos])

        if tok_id == h8_id.Invalid:
            raise LexError(contents, start_pos)
        if tok_id == h8_id.EndOfStream:
            break

        num_tokens += 1

        if tok_id in (h8_id.StartTag, h8_id.StartEndTag):
            # Count the tag BEFORE lexing its attributes, which may raise
            if tok_id == h8_id.StartEndTag:
                counters.num_start_end_tags += 1
            else:
                counters.num_start_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRawSlice()
            counters.num_attrs += len(all_attrs)
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                counters.num_val_tokens += val_lexer.NumTokens()

            #counters.debug_attrs.extend(all_attrs)

            if tok_id == h8_id.StartTag:
                if flags & BALANCED_TAGS:
                    tag_name = lx.CanonicalTagName()
                    if flags & NO_SPECIAL_TAGS:
                        tag_stack.append(tag_name)
                    else:
                        # e.g. <meta> is considered self-closing, like <meta/>
                        if tag_name not in VOID_ELEMENTS:
                            tag_stack.append(tag_name)

                counters.max_tag_stack = max(counters.max_tag_stack,
                                             len(tag_stack))

        elif tok_id == h8_id.EndTag:
            if flags & BALANCED_TAGS:
                try:
                    expected = tag_stack.pop()
                except IndexError:
                    raise ParseError('Tag stack empty',
                                     s=contents,
                                     start_pos=start_pos)

                actual = lx.CanonicalTagName()
                if expected != actual:
                    raise ParseError(
                        'Got unexpected closing tag %r; opening tag was %r' %
                        (contents[start_pos:end_pos], expected),
                        s=contents,
                        start_pos=start_pos)

        start_pos = end_pos

    # The stack is only ever pushed to when BALANCED_TAGS is set
    if len(tag_stack) != 0:
        raise ParseError('Missing closing tags at end of doc: %s' %
                         ' '.join(tag_stack),
                         s=contents,
                         start_pos=start_pos)

    counters.num_tokens += num_tokens
942
943
def ToXml(htm8_str):
    # type: (str) -> str
    """Convert HTM8 to XML.  Work in progress - see the TODO list below.

    What's implemented so far: a bare & becomes &amp; and a bare > becomes
    &gt;.  Everything else is copied through unchanged.

    Raises:
      LexError: on an invalid token or tag.
    """

    # TODO:
    # 1. Lex it
    # 2. < & > must be escaped
    #    a. in raw data
    #    b. in quoted strings
    # 3. <script> turned into CDATA
    # 4. void tags turned into self-closing tags
    # 5. case-sensitive tag matching - not sure about this

    tag_lexer = TagLexer(htm8_str)
    val_lexer = AttrValueLexer(htm8_str)

    f = StringIO()
    out = Output(htm8_str, f)

    lx = Lexer(htm8_str)

    pos = 0  # start of the current token
    while True:
        tok_id, end_pos = lx.Read()

        if tok_id == h8_id.Invalid:
            raise LexError(htm8_str, pos)
        if tok_id == h8_id.EndOfStream:
            break

        if tok_id in (h8_id.RawData, h8_id.CharEntity, h8_id.HexChar,
                      h8_id.DecChar):
            out.PrintUntil(end_pos)
        elif tok_id in (h8_id.StartTag, h8_id.StartEndTag):
            # Nothing is printed here: the tag's text stays pending, and is
            # written by the next PrintUntil() / PrintTheRest().
            tag_lexer.Reset(pos, end_pos)
            # TODO: reduce allocations here
            all_attrs = tag_lexer.AllAttrsRawSlice()
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                # TODO: get the kind of string
                #
                # Quoted:   we need to replace & with &amp; and < with &lt;
                #           note > is not allowed
                # Unquoted: right now, we can just surround with double quotes
                #           because we don't allow any bad chars
                # Empty   : add "", so empty= becomes =""
                # Missing : add ="", so missing becomes missing=""

            tag_name = lx.CanonicalTagName()
            if tok_id == h8_id.StartTag and tag_name in VOID_ELEMENTS:
                # TODO: instead of closing >, print />
                pass

        elif tok_id == h8_id.BadAmpersand:
            # Flush pending input (e.g. a start tag right before the &) so
            # that SkipTo() below drops ONLY the bad character
            out.PrintUntil(pos)
            out.Print('&amp;')
            out.SkipTo(end_pos)

        elif tok_id == h8_id.BadGreaterThan:
            # Same as above: flush first, then replace the bare >
            out.PrintUntil(pos)
            out.Print('&gt;')
            out.SkipTo(end_pos)
        else:
            out.PrintUntil(end_pos)

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()
1012
1013
class Counters(object):
    """Statistics accumulated by Validate(), possibly across many documents."""

    def __init__(self):
        # type: () -> None
        self.num_tokens = 0
        self.num_start_tags = 0
        self.num_start_end_tags = 0
        self.num_attrs = 0
        self.max_tag_stack = 0
        self.num_val_tokens = 0

        #self.debug_attrs = []
1026
1027
def main(argv):
    # type: (List[str]) -> int
    """Command line entry point.

    Actions (argv[1]):
      tokens: lex stdin, and log each token to stderr
      lex-htm8, parse-htm8, parse-xml: validate each file named on stdin, one
        per line, then log totals to stderr
      todo: does nothing

    Returns:
      The process exit status: 1 if any file failed validation, else 0.

    Raises:
      RuntimeError: if the action is missing or unknown.
      LexError: for the 'tokens' action, on invalid input.
    """
    if len(argv) < 2:
        raise RuntimeError('Action required')
    action = argv[1]

    if action == 'tokens':
        contents = sys.stdin.read()

        lx = Lexer(contents)
        start_pos = 0
        while True:
            tok_id, end_pos = lx.Read()
            if tok_id == h8_id.Invalid:
                raise LexError(contents, start_pos)
            if tok_id == h8_id.EndOfStream:
                break

            frag = contents[start_pos:end_pos]
            log('%d %s %r', end_pos, h8_id_str(tok_id), frag)
            start_pos = end_pos

        return 0

    elif action in ('lex-htm8', 'parse-htm8', 'parse-xml'):

        errors = []
        counters = Counters()

        flags = LEX_ATTRS | LEX_QUOTED_VALUES
        if action.startswith('parse-'):
            flags |= BALANCED_TAGS
        if action == 'parse-xml':
            flags |= NO_SPECIAL_TAGS

        for line in sys.stdin:
            filename = line.strip()
            if not filename:
                # Tolerate blank lines in the file list
                continue
            with open(filename) as f:
                contents = f.read()

            # Keep going after an error, so all bad files are reported
            try:
                Validate(contents, flags, counters)
            except LexError as e:
                log('Lex error in %r: %s', filename, e)
                errors.append((filename, e))
            except ParseError as e:
                log('Parse error in %r: %s', filename, e)
                errors.append((filename, e))

        log('')
        log('%10d tokens', counters.num_tokens)
        log('%10d start/end tags', counters.num_start_end_tags)
        log('%10d start tags', counters.num_start_tags)
        log('%10d attrs', counters.num_attrs)
        log('%10d max tag stack depth', counters.max_tag_stack)
        log('%10d attr val tokens', counters.num_val_tokens)
        log('%10d errors', len(errors))
        if len(errors):
            return 1
        return 0

    elif action == 'todo':
        # Other algorithms:
        #
        # - select first subtree with given ID
        #   - this requires understanding the void tags I suppose
        # - select all subtrees that have a class
        # - materialize DOM

        # Safe-HTM8?  This is a filter
        return 0

    else:
        raise RuntimeError('Invalid action %r' % action)
1102
1103
# Run as a script: the return value of main() is the exit status
if __name__ == '__main__':
    sys.exit(main(sys.argv))