#!/usr/bin/env python2
"""
lazylex/html.py - Low-Level HTML Processing.

See lazylex/README.md for details.

TODO:
- Get rid of AttrValueLexer - this should be in the TagLexer
  - this also means that unquoted values can be more similar
  - We can use a single lexer mode for everything inside <>
    - the SPACE is the only difference
- UTF-8 check, like JSON8
- Static typing

"""
from __future__ import print_function
from typing import Iterator
from typing import Union
from typing import Any
from typing import IO

try:
    from cStringIO import StringIO
except ImportError:
    # for python3
    from io import StringIO  # type: ignore
import re
import sys

if sys.version_info.major == 2:
    from typing import List, Tuple, Optional


def log(msg, *args):
    # type: (str, *Any) -> None
    msg = msg % args
    print(msg, file=sys.stderr)


class LexError(Exception):
    """
    Examples of lex errors:

    - Tok.Invalid, like <> or &&
    - Unclosed <!-- <? <![CDATA[ <script> <style>
    """

    def __init__(self, s, start_pos):
        # type: (str, int) -> None
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        # type: () -> str
        return '(LexError %r)' % (self.s[self.start_pos:self.start_pos + 20])


def FindLineNum(s, error_pos):
    # type: (str, int) -> int
    current_pos = 0
    line_num = 1
    while True:
        newline_pos = s.find('\n', current_pos)
        #log('current = %d, N %d, line %d', current_pos, newline_pos, line_num)

        if newline_pos == -1:  # this is the last line
            return line_num
        if newline_pos >= error_pos:
            return line_num
        line_num += 1
        current_pos = newline_pos + 1

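# A tiny worked example of FindLineNum (a sketch, not part of the API): with
# s = 'ab\ncd\n' and error_pos = 4 (the 'd'), the newline at index 2 comes
# before the error, so we advance to line 2 and return it.
#
#   FindLineNum('ab\ncd\n', 4)  # -> 2

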
class ParseError(Exception):
    """
    Examples of parse errors

    - unbalanced tag structure
    - ul_table.py errors
    """

    def __init__(self, msg, s=None, start_pos=-1):
        # type: (str, Optional[str], int) -> None
        self.msg = msg
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        # type: () -> str
        if self.s is not None:
            assert self.start_pos != -1, self.start_pos
            snippet = (self.s[self.start_pos:self.start_pos + 20])

            line_num = FindLineNum(self.s, self.start_pos)
        else:
            snippet = ''
            line_num = -1
        msg = 'line %d: %r %r' % (line_num, self.msg, snippet)
        return msg


class Output(object):
    """Takes an underlying input buffer and an output file. Maintains a
    position in the input buffer.

    Print FROM the input or print new text to the output.
    """

    def __init__(self, s, f, left_pos=0, right_pos=-1):
        # type: (str, IO[str], int, int) -> None
        self.s = s
        self.f = f
        self.pos = left_pos
        self.right_pos = len(s) if right_pos == -1 else right_pos

    def SkipTo(self, pos):
        # type: (int) -> None
        """Skip to a position."""
        self.pos = pos

    def PrintUntil(self, pos):
        # type: (int) -> None
        """Print until a position."""
        piece = self.s[self.pos:pos]
        self.f.write(piece)
        self.pos = pos

    def PrintTheRest(self):
        # type: () -> None
        """Print until the end of the string."""
        self.PrintUntil(self.right_pos)
    def Print(self, s):
        # type: (str) -> None
        """Print new text (not from the input buffer) to the output file."""
        self.f.write(s)

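# Sketch of how Output is used by ToText() and ToXml() below: copy spans of
# the input through unchanged, and splice in replacement text for the spans
# you want to rewrite.  (Positions below were computed by hand for this input.)
#
#   f = StringIO()
#   out = Output('a &amp; b', f)
#   out.PrintUntil(2)   # copy 'a '
#   out.Print('&')      # replacement text for the '&amp;' span [2, 7)
#   out.SkipTo(7)       # skip over the original '&amp;'
#   out.PrintTheRest()  # copy ' b'
#   f.getvalue()        # -> 'a & b'
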

# HTML Tokens
# CommentBegin, ProcessingBegin, CDataBegin are "pseudo-tokens", not visible
TOKENS = 'Decl Comment CommentBegin Processing ProcessingBegin CData CDataBegin StartTag StartEndTag EndTag DecChar HexChar CharEntity RawData HtmlCData BadAmpersand BadGreaterThan BadLessThan Invalid EndOfStream'.split(
)


class Tok(object):
    """
    Avoid lint errors by using these aliases
    """
    pass


TOKEN_NAMES = [None] * len(TOKENS)  # type: List[str]

this_module = sys.modules[__name__]
for i, tok_str in enumerate(TOKENS):
    setattr(this_module, tok_str, i)
    setattr(Tok, tok_str, i)
    TOKEN_NAMES[i] = tok_str


def TokenName(tok_id):
    # type: (int) -> str
    return TOKEN_NAMES[tok_id]


def MakeLexer(rules):
    return [(re.compile(pat, re.VERBOSE), i) for (pat, i) in rules]


#
# Eggex
#
# Tag = / ~['>']+ /

# Is this valid? A single character?
# Tag = / ~'>'* /

# Maybe better: / [NOT '>']+/
# capital letters not allowed there?
#
# But then this is confusing:
# / [NOT ~digit]+/
#
# / [NOT digit] / is [^\d]
# / ~digit / is \D
#
# Or maybe:
#
# / [~ digit]+ /
# / [~ '>']+ /
# / [NOT '>']+ /

# End = / '</' Tag '>' /
# StartEnd = / '<' Tag '/>' /
# Start = / '<' Tag '>' /
#
# EntityRef = / '&' dot{* N} ';' /

# Tag name, or attribute name
# colon is used in XML

# https://www.w3.org/TR/xml/#NT-Name
# Hm there is a lot of unicode stuff. We are simplifying parsing

_NAME = r'[a-zA-Z][a-zA-Z0-9:_\-]*'  # must start with letter

CHAR_LEX = [
    # Characters
    # https://www.w3.org/TR/xml/#sec-references
    (r'&\# [0-9]+ ;', Tok.DecChar),
    (r'&\# x[0-9a-fA-F]+ ;', Tok.HexChar),
    (r'& %s ;' % _NAME, Tok.CharEntity),
    # Allow unquoted, and quoted
    (r'&', Tok.BadAmpersand),
]

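# Informal examples of what each CHAR_LEX rule matches (a sketch; the token
# IDs are the small integers assigned to Tok above):
#
#   '&#64;'   -> Tok.DecChar        decimal character reference
#   '&#x40;'  -> Tok.HexChar        hex character reference
#   '&amp;'   -> Tok.CharEntity     named entity
#   '&'       -> Tok.BadAmpersand   a bare & that starts no reference
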
HTM8_LEX = CHAR_LEX + [
    (r'<!--', Tok.CommentBegin),

    # Processing instructions are used for the XML header:
    # <?xml version="1.0" encoding="UTF-8"?>
    # They are technically XML-only, but in HTML5, they are another kind of
    # comment:
    #
    # https://developer.mozilla.org/en-US/docs/Web/API/ProcessingInstruction
    #
    (r'<\?', Tok.ProcessingBegin),
    # Not necessary in HTML5, but occurs in XML
    (r'<!\[CDATA\[', Tok.CDataBegin),  # <![CDATA[

    # Markup declarations
    # - In HTML5, there is only <!DOCTYPE html>
    # - XML has 4 more declarations: <!ELEMENT ...> ATTLIST ENTITY NOTATION
    #   - these seem to be part of DTD
    # - it's useful to skip these, and be able to parse the rest of the document
    # - Note: < is allowed?
    (r'<! [^>\x00]+ >', Tok.Decl),

    # Tags
    # Notes:
    # - We look for a valid tag name, but we don't validate attributes.
    #   That's done in the tag lexer.
    # - We don't allow leading whitespace
    (r'</ (%s) >' % _NAME, Tok.EndTag),
    # self-closing <br/> comes before StartTag
    # could/should these be collapsed into one rule?
    (r'< (%s) [^>\x00]* />' % _NAME, Tok.StartEndTag),  # self-closing <br/>
    (r'< (%s) [^>\x00]* >' % _NAME, Tok.StartTag),  # start <a>

    # HTML5 allows unescaped > in raw data, but < is not allowed.
    # https://stackoverflow.com/questions/10462348/right-angle-bracket-in-html
    #
    # - My early blog has THREE errors when disallowing >
    # - So do some .wwz files
    (r'[^&<>\x00]+', Tok.RawData),
    (r'>', Tok.BadGreaterThan),
    # < is an error
    (r'.', Tok.Invalid),
]

# Old notes:
#
# Non-greedy matches are regular and can be matched in linear time
# with RE2.
#
# https://news.ycombinator.com/item?id=27099798
#
# Maybe try combining all of these for speed.

# . is any char except newline
# https://re2c.org/manual/manual_c.html

# Discarded options
#(r'<!-- .*? -->', Tok.Comment),

# Hack from Claude: \s\S instead of re.DOTALL. I don't like this
#(r'<!-- [\s\S]*? -->', Tok.Comment),
#(r'<!-- (?:.|[\n])*? -->', Tok.Comment),

HTM8_LEX_COMPILED = MakeLexer(HTM8_LEX)


class Lexer(object):

    def __init__(self, s, left_pos=0, right_pos=-1, no_special_tags=False):
        # type: (str, int, int, bool) -> None
        self.s = s
        self.pos = left_pos
        self.right_pos = len(s) if right_pos == -1 else right_pos
        self.no_special_tags = no_special_tags

        self.cache = {}  # string -> compiled regex pattern object

        # either </script> or </style> - we search until we see that
        self.search_state = None  # type: Optional[str]

        # Position of tag name, if applicable
        # - Set after you get a StartTag, EndTag, or StartEndTag
        # - Unset on other tags
        self.tag_pos_left = -1
        self.tag_pos_right = -1

    def _Peek(self):
        # type: () -> Tuple[int, int]
        """
        Note: not using _Peek() now
        """
        if self.pos == self.right_pos:
            return Tok.EndOfStream, self.pos

        assert self.pos < self.right_pos, self.pos

        if self.search_state is not None and not self.no_special_tags:
            # TODO: case-insensitive search for </SCRIPT> <SCRipt> ?
            #
            # Another strategy: enter a mode where we find ONLY the end tag
            # regex, and any data that's not <, and then check the canonical
            # tag name for 'script' or 'style'.
            pos = self.s.find(self.search_state, self.pos)
            if pos == -1:
                # unterminated <script> or <style>
                raise LexError(self.s, self.pos)
            self.search_state = None
            # beginning
            return Tok.HtmlCData, pos

        # Find the first match.
        # Note: frontend/match.py uses _LongestMatch(), which is different!
        # TODO: reconcile them. This lexer should be expressible in re2c.

        for pat, tok_id in HTM8_LEX_COMPILED:
            m = pat.match(self.s, self.pos)
            if m:
                if tok_id in (Tok.StartTag, Tok.EndTag, Tok.StartEndTag):
                    self.tag_pos_left = m.start(1)
                    self.tag_pos_right = m.end(1)
                else:
                    # Reset state
                    self.tag_pos_left = -1
                    self.tag_pos_right = -1

                if tok_id == Tok.CommentBegin:
                    pos = self.s.find('-->', self.pos)
                    if pos == -1:
                        # unterminated <!--
                        raise LexError(self.s, self.pos)
                    return Tok.Comment, pos + 3  # -->

                if tok_id == Tok.ProcessingBegin:
                    pos = self.s.find('?>', self.pos)
                    if pos == -1:
                        # unterminated <?
                        raise LexError(self.s, self.pos)
                    return Tok.Processing, pos + 2  # ?>

                if tok_id == Tok.CDataBegin:
                    pos = self.s.find(']]>', self.pos)
                    if pos == -1:
                        # unterminated <![CDATA[
                        raise LexError(self.s, self.pos)
                    return Tok.CData, pos + 3  # ]]>

                if tok_id == Tok.StartTag:
                    # TODO: reduce allocations
                    if (self.TagNameEquals('script') or
                            self.TagNameEquals('style')):
                        # <SCRipt a=b> -> </SCRipt>
                        self.search_state = '</' + self._LiteralTagName() + '>'

                return tok_id, m.end()
        else:
            raise AssertionError('Tok.Invalid rule should have matched')

    def TagNameEquals(self, expected):
        # type: (str) -> bool
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        # TODO: In C++, this does not need an allocation. Can we test
        # directly?
        return expected == self.CanonicalTagName()

    def _LiteralTagName(self):
        # type: () -> str
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        return self.s[self.tag_pos_left:self.tag_pos_right]

    def CanonicalTagName(self):
        # type: () -> str
        tag_name = self._LiteralTagName()
        # Most tags are already lower case, so avoid allocation with this conditional
        # TODO: this could go in the mycpp runtime?
        if tag_name.islower():
            return tag_name
        else:
            return tag_name.lower()

    def Read(self):
        # type: () -> Tuple[int, int]
        tok_id, end_pos = self._Peek()
        self.pos = end_pos  # advance
        return tok_id, end_pos

    def LookAhead(self, regex):
        # type: (str) -> bool
        # Cache the regex compilation. This could also be LookAheadFor(THEAD)
        # or something.
        pat = self.cache.get(regex)
        if pat is None:
            pat = re.compile(regex)
            self.cache[regex] = pat

        m = pat.match(self.s, self.pos)
        return m is not None

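# A minimal read loop over the Lexer (a sketch; the HTML snippet is just an
# illustration).  Each Read() returns (tok_id, end_pos); the token's text is
# the span from the previous end_pos to the new one.  Note that everything
# between <script> and </script> (or <style> and </style>) comes back as a
# single Tok.HtmlCData token, unless no_special_tags=True.
#
#   lx = Lexer('<p>hi &amp; bye</p>')
#   start = 0
#   while True:
#       tok_id, end = lx.Read()
#       if tok_id == Tok.EndOfStream:
#           break
#       log('%s %r', TokenName(tok_id), lx.s[start:end])
#       start = end
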

def _Tokens(s, left_pos, right_pos):
    # type: (str, int, int) -> Iterator[Tuple[int, int]]
    """
    Args:
      s: string to parse
      left_pos, right_pos: Optional span boundaries.
    """
    lx = Lexer(s, left_pos, right_pos)
    while True:
        tok_id, pos = lx.Read()
        yield tok_id, pos
        if tok_id == Tok.EndOfStream:
            break


def ValidTokens(s, left_pos=0, right_pos=-1):
    # type: (str, int, int) -> Iterator[Tuple[int, int]]
    """Wrapper around _Tokens to prevent callers from having to handle Invalid.

    I'm not combining the two functions because I might want to do a
    'yield' transformation on Tokens()? Exceptions might complicate the
    issue?
    """
    pos = left_pos
    for tok_id, end_pos in _Tokens(s, left_pos, right_pos):
        if tok_id == Tok.Invalid:
            raise LexError(s, pos)
        yield tok_id, end_pos
        pos = end_pos


def ValidTokenList(s, no_special_tags=False):
    # type: (str, bool) -> List[Tuple[int, int]]
    """A wrapper that can be more easily translated to C++. Doesn't use iterators."""

    start_pos = 0
    tokens = []
    lx = Lexer(s, no_special_tags=no_special_tags)
    while True:
        tok_id, end_pos = lx.Read()
        tokens.append((tok_id, end_pos))
        if tok_id == Tok.EndOfStream:
            break
        if tok_id == Tok.Invalid:
            raise LexError(s, start_pos)
        start_pos = end_pos
    return tokens

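# A hand-traced example of ValidTokenList (a sketch; positions were computed
# by hand for this input).  Each entry is (tok_id, end_pos), and each token
# starts where the previous one ended:
#
#   ValidTokenList('<p>hi</p>')
#   # -> [(Tok.StartTag, 3), (Tok.RawData, 5), (Tok.EndTag, 9),
#   #     (Tok.EndOfStream, 9)]
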

# Tag names:
# Match <a or </a
# Match <h2, but not <2h
#
# HTML 5 doesn't restrict tag names at all
# https://html.spec.whatwg.org/#toc-syntax
#
# XML allows : - .
# https://www.w3.org/TR/xml/#NT-NameChar

# Namespaces for MathML, SVG
# XLink, XML, XMLNS
#
# https://infra.spec.whatwg.org/#namespaces
#
# Allow - for td-attrs

# Be very lenient - just no whitespace or special HTML chars
# I don't think this is more lenient than HTML5, though we should check.
_UNQUOTED_VALUE = r'''[^ \t\r\n<>&"'\x00]*'''

# TODO: we don't need to capture the tag name here? That's done at the top
# level
_TAG_RE = re.compile(r'/? \s* (%s)' % _NAME, re.VERBOSE)

_TAG_LAST_RE = re.compile(r'\s* /? >', re.VERBOSE)

# To match href="foo"
# Note: in HTML5 and XML, single quoted attributes are also valid

# <button disabled> is standard usage

# NOTE: This used to allow whitespace around =
# <a foo = "bar"> makes sense in XML
# But then you also have
# <a foo= bar> - which is TWO attributes, in HTML5
# So the space is problematic

_ATTR_RE = re.compile(
    r'''
\s+                     # Leading whitespace is required
(%s)                    # Attribute name
(?:                     # Optional attribute value
  \s* = \s*             # Spaces allowed around =
  (?:
    " ([^>"\x00]*) "    # double quoted value
  | ' ([^>'\x00]*) '    # single quoted value
  | (%s)                # Attribute value
  )
)?
''' % (_NAME, _UNQUOTED_VALUE), re.VERBOSE)

TagName, AttrName, UnquotedValue, QuotedValue, MissingValue = range(5)

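# How _ATTR_RE's groups map onto the token kinds above (a sketch; in real use
# the match position is chosen by TagLexer.Tokens() below):
#
#   m = _ATTR_RE.match(' href="/x" class=big disabled>', 0)
#   m.group(1)  # 'href' -> AttrName
#   m.group(2)  # '/x'   -> QuotedValue (double quotes; group 3 is single quotes)
#   m.group(4)  # None   -> would be UnquotedValue, e.g. for class=big
#
# An attribute with no '=' at all, like 'disabled', yields MissingValue.
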

class TagLexer(object):
    """
    Given a tag like <a href="..."> or <link type="..." />, the TagLexer
    provides a few operations:

    - What is the tag?
    - Iterate through the attributes, giving (name, value_start_pos, value_end_pos)
    """

    def __init__(self, s):
        # type: (str) -> None
        self.s = s
        self.start_pos = -1  # Invalid
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        # type: (int, int) -> None
        """Reuse instances of this object."""
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos

        self.start_pos = start_pos
        self.end_pos = end_pos

    def TagString(self):
        # type: () -> str
        return self.s[self.start_pos:self.end_pos]

    def TagName(self):
        # type: () -> str
        # First event
        tok_id, start, end = next(self.Tokens())
        return self.s[start:end]

    def GetSpanForAttrValue(self, attr_name):
        # type: (str) -> Tuple[int, int]
        """
        Used by oils_doc.py, for href shortcuts
        """
        # Algorithm: search for QuotedValue or UnquotedValue after AttrName
        # TODO: Could also cache these

        events = self.Tokens()
        val = (-1, -1)
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == AttrName:
                    name = self.s[start:end]
                    if name == attr_name:
                        # The value should come next
                        tok_id, start, end = next(events)
                        assert tok_id in (QuotedValue, UnquotedValue,
                                          MissingValue), TokenName(tok_id)
                        val = start, end
                        break

        except StopIteration:
            pass
        return val

    def GetAttrRaw(self, attr_name):
        # type: (str) -> Optional[str]
        """
        Return the value, which may be UNESCAPED.
        """
        start, end = self.GetSpanForAttrValue(attr_name)
        if start == -1:
            return None
        return self.s[start:end]

    def AllAttrsRawSlice(self):
        # type: () -> List[Tuple[str, int, int]]
        """
        Get a list of triples [('class', 3, 5), ('href', 9, 12)]
        """
        slices = []
        events = self.Tokens()
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == AttrName:
                    name = self.s[start:end]

                    # The value should come next
                    tok_id, start, end = next(events)
                    assert tok_id in (QuotedValue, UnquotedValue,
                                      MissingValue), TokenName(tok_id)
                    # Note: quoted values may have &amp;
                    # We would need ANOTHER lexer to unescape them, but we
                    # don't need that for ul-table
                    slices.append((name, start, end))
        except StopIteration:
            pass
        return slices

    def AllAttrsRaw(self):
        # type: () -> List[Tuple[str, str]]
        """
        Get a list of pairs [('class', 'foo'), ('href', '?foo=1&amp;bar=2')]

        The quoted values may be escaped. We would need another lexer to
        unescape them.
        """
        slices = self.AllAttrsRawSlice()
        pairs = []
        for name, start, end in slices:
            pairs.append((name, self.s[start:end]))
        return pairs

    def Tokens(self):
        # type: () -> Iterator[Tuple[int, int, int]]
        """
        Yields a sequence of tokens: Tag (AttrName AttrValue?)*

        Where each Token is (Type, start_pos, end_pos)

        Note that start and end are NOT redundant! We skip over some unwanted
        characters.
        """
        m = _TAG_RE.match(self.s, self.start_pos + 1)
        if not m:
            raise RuntimeError("Couldn't find HTML tag in %r" %
                               self.TagString())
        yield TagName, m.start(1), m.end(1)

        pos = m.end(0)
        #log('POS %d', pos)

        while True:
            # don't search past the end
            m = _ATTR_RE.match(self.s, pos, self.end_pos)
            if not m:
                #log('BREAK pos %d', pos)
                break
            #log('AttrName %r', m.group(1))

            yield AttrName, m.start(1), m.end(1)

            #log('m.groups() %r', m.groups())
            if m.group(2) is not None:
                # double quoted
                yield QuotedValue, m.start(2), m.end(2)
            elif m.group(3) is not None:
                # single quoted - TODO: could have different token types
                yield QuotedValue, m.start(3), m.end(3)
            elif m.group(4) is not None:
                yield UnquotedValue, m.start(4), m.end(4)
            else:
                # <button disabled>
                yield MissingValue, m.end(0), m.end(0)

            # Skip past the closing quote (or the end of an unquoted value)
            pos = m.end(0)

        #log('TOK %r', self.s)

        m = _TAG_LAST_RE.match(self.s, pos)
        #log('_TAG_LAST_RE match %r', self.s[pos:])
        if not m:
            # Extra data at end of tag. TODO: add messages for all these.
            raise LexError(self.s, pos)

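# Sketch of TagLexer usage (here Reset() covers the whole string; in the
# validators below it gets the span of a StartTag or StartEndTag token):
#
#   tag = '<a href="/x" class="big">'
#   lex = TagLexer(tag)
#   lex.Reset(0, len(tag))
#   lex.TagName()           # -> 'a'
#   lex.GetAttrRaw('href')  # -> '/x'
#   lex.AllAttrsRaw()       # -> [('href', '/x'), ('class', 'big')]
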

# This is similar but not identical to
#    " ([^>"\x00]*) "    # double quoted value
#  | ' ([^>'\x00]*) '    # single quoted value
#
# Note: for unquoted values, & isn't allowed, and thus &amp; and &#99; and
# &#x99; are not allowed. We could relax that?
ATTR_VALUE_LEXER = CHAR_LEX + [
    (r'[^>&\x00]+', Tok.RawData),
    (r'.', Tok.Invalid),
]

ATTR_VALUE_LEXER = MakeLexer(ATTR_VALUE_LEXER)


class AttrValueLexer(object):
    """
    <a href="foo=99&amp;bar">
    <a href='foo=99&amp;bar'>
    <a href=unquoted>
    """

    def __init__(self, s):
        # type: (str) -> None
        self.s = s
        self.start_pos = -1  # Invalid
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        # type: (int, int) -> None
        """Reuse instances of this object."""
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos

        self.start_pos = start_pos
        self.end_pos = end_pos

    def NumTokens(self):
        # type: () -> int
        num_tokens = 0
        pos = self.start_pos
        for tok_id, end_pos in self.Tokens():
            if tok_id == Tok.Invalid:
                raise LexError(self.s, pos)
            pos = end_pos
            #log('pos %d', pos)
            num_tokens += 1
        return num_tokens

    def Tokens(self):
        # type: () -> Iterator[Tuple[int, int]]
        pos = self.start_pos
        while pos < self.end_pos:
            # Find the first match, like above.
            # Note: frontend/match.py uses _LongestMatch(), which is different!
            # TODO: reconcile them. This lexer should be expressible in re2c.
            for pat, tok_id in ATTR_VALUE_LEXER:
                m = pat.match(self.s, pos)
                if m:
                    if 0:
                        tok_str = m.group(0)
                        log('token = %r', tok_str)

                    end_pos = m.end(0)
                    yield tok_id, end_pos
                    pos = end_pos
                    break
            else:
                raise AssertionError('Tok.Invalid rule should have matched')

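# Sketch of AttrValueLexer usage: the value of href="foo=99&amp;bar" lexes
# into three tokens; a character that matches none of the rules (e.g. '>')
# comes out as Tok.Invalid, which NumTokens() turns into a LexError.
#
#   val = 'foo=99&amp;bar'
#   vl = AttrValueLexer(val)
#   vl.Reset(0, len(val))
#   vl.NumTokens()  # -> 3: RawData 'foo=99', CharEntity '&amp;', RawData 'bar'
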

def ReadUntilStartTag(it, tag_lexer, tag_name):
    # type: (Iterator, TagLexer, str) -> Tuple[int, int]
    """Find the next <foo>, returning its (start, end) positions

    Raise ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == Tok.StartTag and tag_lexer.TagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No start tag %r' % tag_name)


def ReadUntilEndTag(it, tag_lexer, tag_name):
    # type: (Iterator, TagLexer, str) -> Tuple[int, int]
    """Find the next </foo>, returning its (start, end) position

    Raise ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == Tok.EndTag and tag_lexer.TagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No end tag %r' % tag_name)


CHAR_ENTITY = {
    'amp': '&',
    'lt': '<',
    'gt': '>',
    'quot': '"',
    'apos': "'",
}


def ToText(s, left_pos=0, right_pos=-1):
    # type: (str, int, int) -> str
    """Given HTML, return text by unquoting &gt; and &lt; etc.

    Used by:
      doctools/oils_doc.py: PygmentsPlugin
      doctools/help_gen.py: HelpIndexCards

    In the latter case, we could process some tags, like:

    - Blue Link (not clickable, but still useful)
    - Red X

    That should be html.ToAnsi.
    """
    f = StringIO()
    out = Output(s, f, left_pos, right_pos)

    pos = left_pos
    for tok_id, end_pos in ValidTokens(s, left_pos, right_pos):
        if tok_id in (Tok.RawData, Tok.BadAmpersand, Tok.BadGreaterThan,
                      Tok.BadLessThan):
            out.SkipTo(pos)
            out.PrintUntil(end_pos)

        elif tok_id == Tok.CharEntity:  # &amp;

            entity = s[pos + 1:end_pos - 1]

            out.SkipTo(pos)
            out.Print(CHAR_ENTITY[entity])
            out.SkipTo(end_pos)

        # Not handling these yet
        elif tok_id == Tok.HexChar:
            raise AssertionError('Hex Char %r' % s[pos:pos + 20])

        elif tok_id == Tok.DecChar:
            raise AssertionError('Dec Char %r' % s[pos:pos + 20])

        else:
            # Skip everything else
            out.SkipTo(end_pos)

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()

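# Example of ToText (a sketch): tags are dropped, and named entities are
# decoded via CHAR_ENTITY.
#
#   ToText('<p>1 &lt; 2</p>')  # -> '1 < 2'
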

# https://developer.mozilla.org/en-US/docs/Glossary/Void_element
VOID_ELEMENTS = [
    'area',
    'base',
    'br',
    'col',
    'embed',
    'hr',
    'img',
    'input',
    'link',
    'meta',
    'param',
    'source',
    'track',
    'wbr',
]

LEX_ATTRS = 1 << 1
LEX_QUOTED_VALUES = 1 << 2  # href="?x=42&amp;y=99"
NO_SPECIAL_TAGS = 1 << 3  # <script> <style>, VOID tags, etc.
BALANCED_TAGS = 1 << 4  # are tags balanced?


def Validate(contents, flags, counters):
    # type: (str, int, Counters) -> None

    tag_lexer = TagLexer(contents)
    val_lexer = AttrValueLexer(contents)

    no_special_tags = bool(flags & NO_SPECIAL_TAGS)
    lx = Lexer(contents, no_special_tags=no_special_tags)
    tokens = []
    start_pos = 0
    tag_stack = []
    while True:
        tok_id, end_pos = lx.Read()
        #log('TOP %s %r', TokenName(tok_id), contents[start_pos:end_pos])

        if tok_id == Tok.Invalid:
            raise LexError(contents, start_pos)
        if tok_id == Tok.EndOfStream:
            break

        tokens.append((tok_id, end_pos))

        if tok_id == Tok.StartEndTag:
            counters.num_start_end_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRawSlice()
            counters.num_attrs += len(all_attrs)
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                counters.num_val_tokens += val_lexer.NumTokens()

            counters.debug_attrs.extend(all_attrs)

        elif tok_id == Tok.StartTag:
            counters.num_start_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRawSlice()
            counters.num_attrs += len(all_attrs)
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                counters.num_val_tokens += val_lexer.NumTokens()

            counters.debug_attrs.extend(all_attrs)

            if flags & BALANCED_TAGS:
                tag_name = lx.CanonicalTagName()
                if flags & NO_SPECIAL_TAGS:
                    tag_stack.append(tag_name)
                else:
                    # e.g. <meta> is considered self-closing, like <meta/>
                    if tag_name not in VOID_ELEMENTS:
                        tag_stack.append(tag_name)

            counters.max_tag_stack = max(counters.max_tag_stack,
                                         len(tag_stack))
        elif tok_id == Tok.EndTag:
            if flags & BALANCED_TAGS:
                try:
                    expected = tag_stack.pop()
                except IndexError:
                    raise ParseError('Tag stack empty',
                                     s=contents,
                                     start_pos=start_pos)

                actual = lx.CanonicalTagName()
                if expected != actual:
                    raise ParseError(
                        'Got unexpected closing tag %r; opening tag was %r' %
                        (contents[start_pos:end_pos], expected),
                        s=contents,
                        start_pos=start_pos)

        start_pos = end_pos

    if len(tag_stack) != 0:
        raise ParseError('Missing closing tags at end of doc: %s' %
                         ' '.join(tag_stack),
                         s=contents,
                         start_pos=start_pos)

    counters.num_tokens += len(tokens)

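# Sketch of driving Validate() directly (main() below does the same thing over
# a list of files):
#
#   counters = Counters()
#   Validate('<p>hi <br/></p>',
#            LEX_ATTRS | LEX_QUOTED_VALUES | BALANCED_TAGS, counters)
#   counters.num_start_tags      # -> 1  (<p>; <br/> counts as a StartEndTag)
#   counters.num_start_end_tags  # -> 1
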

def ToXml(htm8_str):
    # type: (str) -> str

    # TODO:
    # 1. Lex it
    # 2. < & > must be escaped
    #    a. in raw data
    #    b. in quoted strings
    # 3. <script> turned into CDATA
    # 4. void tags turned into self-closing tags
    # 5. case-sensitive tag matching - not sure about this

    tag_lexer = TagLexer(htm8_str)
    val_lexer = AttrValueLexer(htm8_str)

    f = StringIO()
    out = Output(htm8_str, f)

    lx = Lexer(htm8_str)

    pos = 0
    while True:
        tok_id, end_pos = lx.Read()

        if tok_id == Tok.Invalid:
            raise LexError(htm8_str, pos)
        if tok_id == Tok.EndOfStream:
            break

        if tok_id in (Tok.RawData, Tok.CharEntity, Tok.HexChar, Tok.DecChar):
            out.PrintUntil(end_pos)
        elif tok_id in (Tok.StartTag, Tok.StartEndTag):
            tag_lexer.Reset(pos, end_pos)
            # TODO: reduce allocations here
            all_attrs = tag_lexer.AllAttrsRawSlice()
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                # TODO: get the kind of string
                #
                # Quoted:   we need to replace & with &amp; and < with &lt;
                #           note > is not allowed
                # Unquoted: right now, we can just surround with double quotes
                #           because we don't allow any bad chars
                # Empty   : add "", so empty= becomes =""
                # Missing : add ="", so missing becomes missing=""

            tag_name = lx.CanonicalTagName()
            if tok_id == Tok.StartTag and tag_name in VOID_ELEMENTS:
                # TODO: instead of closing >, print />
                pass

        elif tok_id == Tok.BadAmpersand:
            #out.SkipTo(pos)
            out.Print('&amp;')
            out.SkipTo(end_pos)

        elif tok_id == Tok.BadGreaterThan:
            #out.SkipTo(pos)
            out.Print('&gt;')
            out.SkipTo(end_pos)
        else:
            out.PrintUntil(end_pos)

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()


class Counters(object):

    def __init__(self):
        # type: () -> None
        self.num_tokens = 0
        self.num_start_tags = 0
        self.num_start_end_tags = 0
        self.num_attrs = 0
        self.max_tag_stack = 0
        self.num_val_tokens = 0

        self.debug_attrs = []


def main(argv):
    # type: (List[str]) -> int
    action = argv[1]

    if action == 'tokens':
        contents = sys.stdin.read()

        lx = Lexer(contents)
        start_pos = 0
        while True:
            tok_id, end_pos = lx.Read()
            if tok_id == Tok.Invalid:
                raise LexError(contents, start_pos)
            if tok_id == Tok.EndOfStream:
                break

            frag = contents[start_pos:end_pos]
            log('%d %s %r', end_pos, TokenName(tok_id), frag)
            start_pos = end_pos

        return 0

    elif action in ('lex-htm8', 'parse-htm8', 'parse-xml'):

        errors = []
        counters = Counters()

        flags = LEX_ATTRS | LEX_QUOTED_VALUES
        if action.startswith('parse-'):
            flags |= BALANCED_TAGS
        if action == 'parse-xml':
            flags |= NO_SPECIAL_TAGS

        i = 0
        for line in sys.stdin:
            filename = line.strip()
            with open(filename) as f:
                contents = f.read()

            try:
                Validate(contents, flags, counters)
            except LexError as e:
                log('Lex error in %r: %s', filename, e)
                errors.append((filename, e))
            except ParseError as e:
                log('Parse error in %r: %s', filename, e)
                errors.append((filename, e))
            i += 1

        log('')
        log('%10d tokens', counters.num_tokens)
        log('%10d start/end tags', counters.num_start_end_tags)
        log('%10d start tags', counters.num_start_tags)
        log('%10d attrs', counters.num_attrs)
        log('%10d max tag stack depth', counters.max_tag_stack)
        log('%10d attr val tokens', counters.num_val_tokens)
        log('%10d errors', len(errors))
        if len(errors):
            return 1
        return 0

    elif action == 'todo':
        # Other algorithms:
        #
        # - select first subtree with given ID
        #   - this requires understanding the void tags I suppose
        # - select all subtrees that have a class
        # - materialize DOM

        # Safe-HTM8? This is a filter
        return 0

    else:
        raise RuntimeError('Invalid action %r' % action)


if __name__ == '__main__':
    sys.exit(main(sys.argv))