#!/usr/bin/env python2
"""
lazylex/html.py - Low-Level HTML Processing.

See lazylex/README.md for details.

TODO:
- Get rid of AttrValueLexer - this should be in the TagLexer
  - this also means that unquoted values can be more similar
  - We can use a single lexer mode for everything inside <>
    - the SPACE is the only difference
- UTF-8 check, like JSON8
- Static typing

"""
from __future__ import print_function

from _devbuild.gen.htm8_asdl import h8_id, h8_id_t, h8_id_str
from typing import Dict, Iterator, Union, Any, IO

try:
    from cStringIO import StringIO
except ImportError:
    # for python3
    from io import StringIO  # type: ignore
import re
import sys

if sys.version_info.major == 2:
    from typing import List, Tuple, Optional


def log(msg, *args):
    # type: (str, *Any) -> None
    msg = msg % args
    print(msg, file=sys.stderr)

class LexError(Exception):
    """
    Examples of lex errors:

    - h8_id.Invalid, like <> or &&
    - Unclosed <!-- <? <![CDATA[ <script> <style>
    """

    def __init__(self, s, start_pos):
        # type: (str, int) -> None
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        # type: () -> str
        return '(LexError %r)' % (self.s[self.start_pos:self.start_pos + 20])

def FindLineNum(s, error_pos):
    # type: (str, int) -> int
    current_pos = 0
    line_num = 1
    while True:
        newline_pos = s.find('\n', current_pos)
        #log('current = %d, N %d, line %d', current_pos, newline_pos, line_num)

        if newline_pos == -1:  # this is the last line
            return line_num
        if newline_pos >= error_pos:
            return line_num
        line_num += 1
        current_pos = newline_pos + 1

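# A quick sanity check (a sketch; line numbers are 1-based):
#
#   FindLineNum('a\nb\nc', 4)  # position 4 is 'c', on line 3
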

class ParseError(Exception):
    """
    Examples of parse errors:

    - unbalanced tag structure
    - ul_table.py errors
    """

    def __init__(self, msg, s=None, start_pos=-1):
        # type: (str, Optional[str], int) -> None
        self.msg = msg
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        # type: () -> str
        if self.s is not None:
            assert self.start_pos != -1, self.start_pos
            snippet = (self.s[self.start_pos:self.start_pos + 20])

            line_num = FindLineNum(self.s, self.start_pos)
        else:
            snippet = ''
            line_num = -1
        msg = 'line %d: %r %r' % (line_num, self.msg, snippet)
        return msg

class Output(object):
    """Takes an underlying input buffer and an output file. Maintains a
    position in the input buffer.

    Print FROM the input or print new text to the output.
    """

    def __init__(self, s, f, left_pos=0, right_pos=-1):
        # type: (str, IO[str], int, int) -> None
        self.s = s
        self.f = f
        self.pos = left_pos
        self.right_pos = len(s) if right_pos == -1 else right_pos

    def SkipTo(self, pos):
        # type: (int) -> None
        """Skip to a position."""
        self.pos = pos

    def PrintUntil(self, pos):
        # type: (int) -> None
        """Print from the current position until a position."""
        piece = self.s[self.pos:pos]
        self.f.write(piece)
        self.pos = pos

    def PrintTheRest(self):
        # type: () -> None
        """Print until the end of the string."""
        self.PrintUntil(self.right_pos)

    def Print(self, s):
        # type: (str) -> None
        """Print new text to the output file."""
        self.f.write(s)

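# The copy-and-patch pattern Output supports (a sketch, verified by hand):
#
#   f = StringIO()
#   out = Output('<b>bold</b>', f)
#   out.PrintUntil(3)   # copy '<b>'
#   out.Print('BOLD')   # splice in new text
#   out.SkipTo(7)       # skip over 'bold' in the input
#   out.PrintTheRest()  # copy '</b>'
#   # f.getvalue() == '<b>BOLD</b>'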

# HTML Tokens
# CommentBegin, ProcessingBegin, CDataBegin are "pseudo-tokens", not visible
TOKENS = 'Decl Comment CommentBegin Processing ProcessingBegin CData CDataBegin StartTag StartEndTag EndTag DecChar HexChar CharEntity RawData HtmlCData BadAmpersand BadGreaterThan BadLessThan Invalid EndOfStream'.split(
)


class Tok(object):
    """
    Avoid lint errors by using these aliases
    """
    pass


TOKEN_NAMES = [None] * len(TOKENS)  # type: List[str]

this_module = sys.modules[__name__]
for i, tok_str in enumerate(TOKENS):
    setattr(this_module, tok_str, i)
    setattr(Tok, tok_str, i)
    TOKEN_NAMES[i] = tok_str


def MakeLexer(rules):
    return [(re.compile(pat, re.VERBOSE), i) for (pat, i) in rules]

#
# Eggex
#
# Tag = / ~['>']+ /

# Is this valid? A single character?
# Tag = / ~'>'* /

# Maybe better: / [NOT '>']+ /
# capital letters not allowed there?
#
# But then this is confusing:
# / [NOT ~digit]+ /
#
# / [NOT digit] / is [^\d]
# / ~digit /     is \D
#
# Or maybe:
#
# / [~ digit]+ /
# / [~ '>']+ /
# / [NOT '>']+ /

# End = / '</' Tag '>' /
# StartEnd = / '<' Tag '/>' /
# Start = / '<' Tag '>' /
#
# EntityRef = / '&' dot{* N} ';' /

# Tag name, or attribute name
# colon is used in XML

# https://www.w3.org/TR/xml/#NT-Name
# Hm, there is a lot of Unicode stuff there. We are simplifying parsing.
_NAME = r'[a-zA-Z][a-zA-Z0-9:_\-]*'  # must start with letter

CHAR_LEX = [
    # Characters
    # https://www.w3.org/TR/xml/#sec-references
    (r'&\# [0-9]+ ;', h8_id.DecChar),
    (r'&\# x[0-9a-fA-F]+ ;', h8_id.HexChar),
    (r'& %s ;' % _NAME, h8_id.CharEntity),
    # A bare & that isn't part of an entity reference
    (r'&', h8_id.BadAmpersand),
]

HTM8_LEX = CHAR_LEX + [
    (r'<!--', h8_id.CommentBegin),

    # Processing instructions are used for the XML header:
    # <?xml version="1.0" encoding="UTF-8"?>
    # They are technically XML-only, but in HTML5, they are another kind of
    # comment:
    #
    # https://developer.mozilla.org/en-US/docs/Web/API/ProcessingInstruction
    #
    (r'<\?', h8_id.ProcessingBegin),
    # Not necessary in HTML5, but occurs in XML
    (r'<!\[CDATA\[', h8_id.CDataBegin),  # <![CDATA[

    # Markup declarations
    # - In HTML5, there is only <!DOCTYPE html>
    # - XML has 4 more declarations: <!ELEMENT ...> ATTLIST ENTITY NOTATION
    #   - these seem to be part of DTD
    # - it's useful to skip these, and be able to parse the rest of the document
    # - Note: < is allowed?
    (r'<! [^>\x00]+ >', h8_id.Decl),

    # Tags
    # Notes:
    # - We look for a valid tag name, but we don't validate attributes.
    #   That's done in the tag lexer.
    # - We don't allow leading whitespace
    (r'</ (%s) >' % _NAME, h8_id.EndTag),
    # self-closing <br/> comes before StartTag
    # could/should these be collapsed into one rule?
    (r'< (%s) [^>\x00]* />' % _NAME, h8_id.StartEndTag),  # self-closing <br/>
    (r'< (%s) [^>\x00]* >' % _NAME, h8_id.StartTag),  # start <a>

    # HTML5 allows unescaped > in raw data, but < is not allowed.
    # https://stackoverflow.com/questions/10462348/right-angle-bracket-in-html
    #
    # - My early blog has THREE errors when disallowing >
    # - So do some .wwz files
    (r'[^&<>\x00]+', h8_id.RawData),
    (r'>', h8_id.BadGreaterThan),
    # < is an error
    (r'.', h8_id.Invalid),
]

# Old notes:
#
# Non-greedy matches are regular and can be matched in linear time
# with RE2.
#
# https://news.ycombinator.com/item?id=27099798
#
# Maybe try combining all of these for speed.

# . is any char except newline
# https://re2c.org/manual/manual_c.html

# Discarded options
#(r'<!-- .*? -->', h8_id.Comment),

# Hack from Claude: \s\S instead of re.DOTALL. I don't like this
#(r'<!-- [\s\S]*? -->', h8_id.Comment),
#(r'<!-- (?:.|[\n])*? -->', h8_id.Comment),

HTM8_LEX_COMPILED = MakeLexer(HTM8_LEX)

class Lexer(object):

    def __init__(self, s, left_pos=0, right_pos=-1, no_special_tags=False):
        # type: (str, int, int, bool) -> None
        self.s = s
        self.pos = left_pos
        self.right_pos = len(s) if right_pos == -1 else right_pos
        self.no_special_tags = no_special_tags

        # string -> compiled regex pattern object
        self.cache = {}  # type: Dict[str, Any]

        # either </script> or </style> - we search until we see that
        self.search_state = None  # type: Optional[str]

        # Position of tag name, if applicable
        # - Set after you get a StartTag, EndTag, or StartEndTag
        # - Unset on other tags
        self.tag_pos_left = -1
        self.tag_pos_right = -1

    def _Read(self):
        # type: () -> Tuple[h8_id_t, int]
        if self.pos == self.right_pos:
            return h8_id.EndOfStream, self.pos

        assert self.pos < self.right_pos, self.pos

        if self.search_state is not None and not self.no_special_tags:
            # TODO: case-insensitive search for </SCRIPT> <SCRipt> ?
            #
            # Another strategy: enter a mode where we find ONLY the end tag
            # regex, and any data that's not <, and then check the canonical
            # tag name for 'script' or 'style'.
            pos = self.s.find(self.search_state, self.pos)
            if pos == -1:
                # unterminated <script> or <style>
                raise LexError(self.s, self.pos)
            self.search_state = None
            # 'pos' is the beginning of </script> or </style>, which is
            # lexed on the next call
            return h8_id.HtmlCData, pos

        # Find the first match.
        # Note: frontend/match.py uses _LongestMatch(), which is different!
        # TODO: reconcile them. This lexer should be expressible in re2c.

        for pat, tok_id in HTM8_LEX_COMPILED:
            m = pat.match(self.s, self.pos)
            if m:
                if tok_id in (h8_id.StartTag, h8_id.EndTag, h8_id.StartEndTag):
                    self.tag_pos_left = m.start(1)
                    self.tag_pos_right = m.end(1)
                else:
                    # Reset state
                    self.tag_pos_left = -1
                    self.tag_pos_right = -1

                if tok_id == h8_id.CommentBegin:
                    pos = self.s.find('-->', self.pos)
                    if pos == -1:
                        # unterminated <!--
                        raise LexError(self.s, self.pos)
                    return h8_id.Comment, pos + 3  # -->

                if tok_id == h8_id.ProcessingBegin:
                    pos = self.s.find('?>', self.pos)
                    if pos == -1:
                        # unterminated <?
                        raise LexError(self.s, self.pos)
                    return h8_id.Processing, pos + 2  # ?>

                if tok_id == h8_id.CDataBegin:
                    pos = self.s.find(']]>', self.pos)
                    if pos == -1:
                        # unterminated <![CDATA[
                        raise LexError(self.s, self.pos)
                    return h8_id.CData, pos + 3  # ]]>

                if tok_id == h8_id.StartTag:
                    # TODO: reduce allocations
                    if (self.TagNameEquals('script') or
                            self.TagNameEquals('style')):
                        # <SCRipt a=b> -> </SCRipt>
                        self.search_state = '</' + self._LiteralTagName() + '>'

                return tok_id, m.end()
        else:
            raise AssertionError('h8_id.Invalid rule should have matched')

    def TagNameEquals(self, expected):
        # type: (str) -> bool
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        # TODO: In C++, this does not need an allocation. Can we test
        # directly?
        return expected == self.CanonicalTagName()

    def _LiteralTagName(self):
        # type: () -> str
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        return self.s[self.tag_pos_left:self.tag_pos_right]

    def CanonicalTagName(self):
        # type: () -> str
        tag_name = self._LiteralTagName()
        # Most tags are already lower case, so avoid allocation with this
        # conditional
        # TODO: this could go in the mycpp runtime?
        if tag_name.islower():
            return tag_name
        else:
            return tag_name.lower()

    def Read(self):
        # type: () -> Tuple[h8_id_t, int]
        tok_id, end_pos = self._Read()
        self.pos = end_pos  # advance
        return tok_id, end_pos

    def LookAhead(self, regex):
        # type: (str) -> bool
        # Cache the regex compilation. This could also be LookAheadFor(THEAD)
        # or something.
        pat = self.cache.get(regex)
        if pat is None:
            pat = re.compile(regex)
            self.cache[regex] = pat

        m = pat.match(self.s, self.pos)
        return m is not None

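# Typical read loop over a Lexer (a sketch):
#
#   lx = Lexer('<p>hi</p>')
#   while True:
#       tok_id, end_pos = lx.Read()  # end_pos is the position AFTER the token
#       if tok_id == h8_id.EndOfStream:
#           break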

def _Tokens(s, left_pos, right_pos):
    # type: (str, int, int) -> Iterator[Tuple[h8_id_t, int]]
    """
    Args:
      s: string to parse
      left_pos, right_pos: Optional span boundaries.
    """
    lx = Lexer(s, left_pos, right_pos)
    while True:
        tok_id, pos = lx.Read()
        yield tok_id, pos
        if tok_id == h8_id.EndOfStream:
            break


def ValidTokens(s, left_pos=0, right_pos=-1):
    # type: (str, int, int) -> Iterator[Tuple[int, int]]
    """Wrapper around _Tokens to prevent callers from having to handle Invalid.

    I'm not combining the two functions because I might want to do a
    'yield' transformation on _Tokens(); exceptions might complicate that.
    """
    pos = left_pos
    for tok_id, end_pos in _Tokens(s, left_pos, right_pos):
        if tok_id == h8_id.Invalid:
            raise LexError(s, pos)
        yield tok_id, end_pos
        pos = end_pos

def ValidTokenList(s, no_special_tags=False):
    # type: (str, bool) -> List[Tuple[h8_id_t, int]]
    """A wrapper that can be more easily translated to C++. Doesn't use
    iterators."""

    start_pos = 0
    tokens = []
    lx = Lexer(s, no_special_tags=no_special_tags)
    while True:
        tok_id, end_pos = lx.Read()
        tokens.append((tok_id, end_pos))
        if tok_id == h8_id.EndOfStream:
            break
        if tok_id == h8_id.Invalid:
            raise LexError(s, start_pos)
        start_pos = end_pos
    return tokens

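# For example (a sketch, verified by hand against the rules above):
#
#   ValidTokenList('<p>hi</p>')
#   # -> [(h8_id.StartTag, 3), (h8_id.RawData, 5),
#   #     (h8_id.EndTag, 9), (h8_id.EndOfStream, 9)]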

# Tag names:
# Match <a or </a
# Match <h2, but not <2h
#
# HTML 5 doesn't restrict tag names at all
# https://html.spec.whatwg.org/#toc-syntax
#
# XML allows : - .
# https://www.w3.org/TR/xml/#NT-NameChar

# Namespaces for MathML, SVG
# XLink, XML, XMLNS
#
# https://infra.spec.whatwg.org/#namespaces
#
# Allow - for td-attrs

# Be very lenient - just no whitespace or special HTML chars
# I don't think this is more lenient than HTML5, though we should check.
_UNQUOTED_VALUE = r'''[^ \t\r\n<>&"'\x00]*'''

# TODO: we don't need to capture the tag name here? That's done at the top
# level
_TAG_RE = re.compile(r'/? \s* (%s)' % _NAME, re.VERBOSE)

_TAG_LAST_RE = re.compile(r'\s* /? >', re.VERBOSE)

# To match href="foo"
# Note: in HTML5 and XML, single quoted attributes are also valid

# <button disabled> is standard usage

# NOTE: This used to allow whitespace around =
# <a foo = "bar"> makes sense in XML
# But then you also have
# <a foo= bar> - which is TWO attributes, in HTML5
# So the space is problematic
_ATTR_RE = re.compile(
    r'''
\s+                  # Leading whitespace is required
(%s)                 # Attribute name
(?:                  # Optional attribute value
  \s* = \s*          # Spaces allowed around =
  (?:
    " ([^>"\x00]*) "   # double quoted value
  | ' ([^>'\x00]*) '   # single quoted value
  | (%s)               # unquoted value
  )
)?
''' % (_NAME, _UNQUOTED_VALUE), re.VERBOSE)

TagName, AttrName, UnquotedValue, QuotedValue, MissingValue = range(5)

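# For example (a sketch), matching the attribute in ' href="/foo"':
#
#   m = _ATTR_RE.match(' href="/foo"', 0)
#   m.group(1)  # -> 'href'
#   m.group(2)  # -> '/foo'  (double quoted; groups 3 and 4 are None)
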
class TagLexer(object):
    """
    Given a tag like <a href="..."> or <link type="..." />, the TagLexer
    provides a few operations:

    - What is the tag?
    - Iterate through the attributes, giving (name, value_start_pos,
      value_end_pos)
    """

    def __init__(self, s):
        # type: (str) -> None
        self.s = s
        self.start_pos = -1  # Invalid
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        # type: (int, int) -> None
        """Reuse instances of this object."""
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos

        self.start_pos = start_pos
        self.end_pos = end_pos

    def TagString(self):
        # type: () -> str
        """Return the entire tag string, e.g. <a href='foo'>"""
        return self.s[self.start_pos:self.end_pos]

    def TagName(self):
        # type: () -> str
        # First event
        tok_id, start, end = next(self.Tokens())
        return self.s[start:end]

    def GetSpanForAttrValue(self, attr_name):
        # type: (str) -> Tuple[int, int]
        """
        Used by oils_doc.py, for href shortcuts
        """
        # Algorithm: search for QuotedValue or UnquotedValue after AttrName
        # TODO: Could also cache these

        events = self.Tokens()
        val = (-1, -1)
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == AttrName:
                    name = self.s[start:end]
                    if name == attr_name:
                        # The value should come next
                        tok_id, start, end = next(events)
                        assert tok_id in (QuotedValue, UnquotedValue,
                                          MissingValue), h8_id_str(tok_id)
                        val = start, end
                        break

        except StopIteration:
            pass
        return val

    def GetAttrRaw(self, attr_name):
        # type: (str) -> Optional[str]
        """
        Return the value, which may be UNESCAPED.
        """
        start, end = self.GetSpanForAttrValue(attr_name)
        if start == -1:
            return None
        return self.s[start:end]

    def AllAttrsRawSlice(self):
        # type: () -> List[Tuple[str, int, int]]
        """
        Get a list of triples [('class', 3, 5), ('href', 9, 12)]
        """
        slices = []
        events = self.Tokens()
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == AttrName:
                    name = self.s[start:end]

                    # The value should come next
                    tok_id, start, end = next(events)
                    assert tok_id in (QuotedValue, UnquotedValue,
                                      MissingValue), h8_id_str(tok_id)
                    # Note: quoted values may have &amp;
                    # We would need ANOTHER lexer to unescape them, but we
                    # don't need that for ul-table
                    slices.append((name, start, end))
        except StopIteration:
            pass
        return slices

    def AllAttrsRaw(self):
        # type: () -> List[Tuple[str, str]]
        """
        Get a list of pairs [('class', 'foo'), ('href', '?foo=1&amp;bar=2')]

        The quoted values may be escaped. We would need another lexer to
        unescape them.
        """
        slices = self.AllAttrsRawSlice()
        pairs = []
        for name, start, end in slices:
            pairs.append((name, self.s[start:end]))
        return pairs

    def Tokens(self):
        # type: () -> Iterator[Tuple[int, int, int]]
        """
        Yields a sequence of tokens: Tag (AttrName AttrValue?)*

        Where each Token is (Type, start_pos, end_pos)

        Note that start and end are NOT redundant! We skip over some unwanted
        characters.
        """
        m = _TAG_RE.match(self.s, self.start_pos + 1)
        if not m:
            raise RuntimeError("Couldn't find HTML tag in %r" %
                               self.TagString())
        yield TagName, m.start(1), m.end(1)

        pos = m.end(0)
        #log('POS %d', pos)

        while True:
            # don't search past the end
            m = _ATTR_RE.match(self.s, pos, self.end_pos)
            if not m:
                #log('BREAK pos %d', pos)
                break
            #log('AttrName %r', m.group(1))

            yield AttrName, m.start(1), m.end(1)

            #log('m.groups() %r', m.groups())
            if m.group(2) is not None:
                # double quoted
                yield QuotedValue, m.start(2), m.end(2)
            elif m.group(3) is not None:
                # single quoted - TODO: could have different token types
                yield QuotedValue, m.start(3), m.end(3)
            elif m.group(4) is not None:
                yield UnquotedValue, m.start(4), m.end(4)
            else:
                # <button disabled>
                end = m.end(0)
                yield MissingValue, end, end

            # Skip past the closing quote (or the end of the unquoted value)
            pos = m.end(0)

            #log('TOK %r', self.s)

        m = _TAG_LAST_RE.match(self.s, pos)
        #log('_TAG_LAST_RE match %r', self.s[pos:])
        if not m:
            # Extra data at end of tag. TODO: add messages for all these.
            raise LexError(self.s, pos)

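# Example (a sketch, verified by hand):
#
#   s = '<a href="/foo" disabled>'
#   tag_lexer = TagLexer(s)
#   tag_lexer.Reset(0, len(s))
#   tag_lexer.TagName()           # -> 'a'
#   tag_lexer.GetAttrRaw('href')  # -> '/foo'
#   tag_lexer.AllAttrsRaw()       # -> [('href', '/foo'), ('disabled', '')]
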
# This is similar but not identical to
#    " ([^>"\x00]*) "    # double quoted value
#  | ' ([^>'\x00]*) '    # single quoted value
#
# Note: for unquoted values, & isn't allowed, and thus &amp; and &#99; and
# &#x99; are not allowed. We could relax that?
ATTR_VALUE_LEXER = CHAR_LEX + [
    (r'[^>&\x00]+', h8_id.RawData),
    (r'.', h8_id.Invalid),
]

ATTR_VALUE_LEXER = MakeLexer(ATTR_VALUE_LEXER)

class AttrValueLexer(object):
    """
    <a href="foo=99&amp;bar">
    <a href='foo=99&amp;bar'>
    <a href=unquoted>
    """

    def __init__(self, s):
        # type: (str) -> None
        self.s = s
        self.start_pos = -1  # Invalid
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        # type: (int, int) -> None
        """Reuse instances of this object."""
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos

        self.start_pos = start_pos
        self.end_pos = end_pos

    def NumTokens(self):
        # type: () -> int
        num_tokens = 0
        pos = self.start_pos
        for tok_id, end_pos in self.Tokens():
            if tok_id == h8_id.Invalid:
                raise LexError(self.s, pos)
            pos = end_pos
            #log('pos %d', pos)
            num_tokens += 1
        return num_tokens

    def Tokens(self):
        # type: () -> Iterator[Tuple[h8_id_t, int]]
        pos = self.start_pos
        while pos < self.end_pos:
            # Find the first match, like above.
            # Note: frontend/match.py uses _LongestMatch(), which is different!
            # TODO: reconcile them. This lexer should be expressible in re2c.
            for pat, tok_id in ATTR_VALUE_LEXER:
                m = pat.match(self.s, pos)
                if m:
                    if 0:
                        tok_str = m.group(0)
                        log('token = %r', tok_str)

                    end_pos = m.end(0)
                    yield tok_id, end_pos
                    pos = end_pos
                    break
            else:
                raise AssertionError('h8_id.Invalid rule should have matched')

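# Example (a sketch): lexing the value inside href="foo=99&amp;bar"
#
#   s = '<a href="foo=99&amp;bar">'
#   val_lexer = AttrValueLexer(s)
#   val_lexer.Reset(9, 23)  # the span between the double quotes
#   val_lexer.NumTokens()   # -> 3: RawData, CharEntity, RawData
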
def ReadUntilStartTag(it, tag_lexer, tag_name):
    # type: (Iterator[Tuple[h8_id_t, int]], TagLexer, str) -> Tuple[int, int]
    """Find the next <foo>, returning its (start, end) positions

    Raise ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == h8_id.StartTag and tag_lexer.TagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No start tag %r' % tag_name)


def ReadUntilEndTag(it, tag_lexer, tag_name):
    # type: (Iterator[Tuple[h8_id_t, int]], TagLexer, str) -> Tuple[int, int]
    """Find the next </foo>, returning its (start, end) position

    Raise ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == h8_id.EndTag and tag_lexer.TagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No end tag %r' % tag_name)

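# Example (a sketch, verified by hand):
#
#   s = '<ul><li>one</li></ul>'
#   tag_lexer = TagLexer(s)
#   it = ValidTokens(s)
#   ReadUntilStartTag(it, tag_lexer, 'li')  # -> (4, 8), the span of '<li>'
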
CHAR_ENTITY = {
    'amp': '&',
    'lt': '<',
    'gt': '>',
    'quot': '"',
    'apos': "'",
}

def ToText(s, left_pos=0, right_pos=-1):
    # type: (str, int, int) -> str
    """Given HTML, return text by unquoting &gt; and &lt; etc.

    Used by:
      doctools/oils_doc.py: PygmentsPlugin
      doctools/help_gen.py: HelpIndexCards

    In the latter case, we could process some tags, like:

    - Blue Link (not clickable, but still useful)
    - Red X

    That should be html.ToAnsi.
    """
    f = StringIO()
    out = Output(s, f, left_pos, right_pos)

    pos = left_pos
    for tok_id, end_pos in ValidTokens(s, left_pos, right_pos):
        if tok_id in (h8_id.RawData, h8_id.BadAmpersand, h8_id.BadGreaterThan,
                      h8_id.BadLessThan):
            out.SkipTo(pos)
            out.PrintUntil(end_pos)

        elif tok_id == h8_id.CharEntity:  # &amp;

            entity = s[pos + 1:end_pos - 1]

            out.SkipTo(pos)
            out.Print(CHAR_ENTITY[entity])
            out.SkipTo(end_pos)

        # Not handling these yet
        elif tok_id == h8_id.HexChar:
            raise AssertionError('Hex Char %r' % s[pos:pos + 20])

        elif tok_id == h8_id.DecChar:
            raise AssertionError('Dec Char %r' % s[pos:pos + 20])

        else:
            # Skip everything else
            out.SkipTo(end_pos)

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()

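# For instance (a sketch, verified by hand):
#
#   ToText('<b>1 &lt; 2 &amp;&amp; 4 &gt; 3</b>')
#   # -> '1 < 2 && 4 > 3'
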
# https://developer.mozilla.org/en-US/docs/Glossary/Void_element
VOID_ELEMENTS = [
    'area',
    'base',
    'br',
    'col',
    'embed',
    'hr',
    'img',
    'input',
    'link',
    'meta',
    'param',
    'source',
    'track',
    'wbr',
]

LEX_ATTRS = 1 << 1
LEX_QUOTED_VALUES = 1 << 2  # href="?x=42&amp;y=99"
NO_SPECIAL_TAGS = 1 << 3  # <script> <style>, VOID tags, etc.
BALANCED_TAGS = 1 << 4  # are tags balanced?

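# The flags combine with |, e.g. for XML-like validation as in main() below
# (a sketch):
#
#   flags = LEX_ATTRS | LEX_QUOTED_VALUES | NO_SPECIAL_TAGS | BALANCED_TAGS
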
def Validate(contents, flags, counters):
    # type: (str, int, Counters) -> None

    tag_lexer = TagLexer(contents)
    val_lexer = AttrValueLexer(contents)

    no_special_tags = bool(flags & NO_SPECIAL_TAGS)
    lx = Lexer(contents, no_special_tags=no_special_tags)
    tokens = []
    start_pos = 0
    tag_stack = []
    while True:
        tok_id, end_pos = lx.Read()
        #log('TOP %s %r', h8_id_str(tok_id), contents[start_pos:end_pos])

        if tok_id == h8_id.Invalid:
            raise LexError(contents, start_pos)
        if tok_id == h8_id.EndOfStream:
            break

        tokens.append((tok_id, end_pos))

        if tok_id == h8_id.StartEndTag:
            counters.num_start_end_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRawSlice()
            counters.num_attrs += len(all_attrs)
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                counters.num_val_tokens += val_lexer.NumTokens()

            #counters.debug_attrs.extend(all_attrs)

        elif tok_id == h8_id.StartTag:
            counters.num_start_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRawSlice()
            counters.num_attrs += len(all_attrs)
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                counters.num_val_tokens += val_lexer.NumTokens()

            #counters.debug_attrs.extend(all_attrs)

            if flags & BALANCED_TAGS:
                tag_name = lx.CanonicalTagName()
                if flags & NO_SPECIAL_TAGS:
                    tag_stack.append(tag_name)
                else:
                    # e.g. <meta> is considered self-closing, like <meta/>
                    if tag_name not in VOID_ELEMENTS:
                        tag_stack.append(tag_name)

            counters.max_tag_stack = max(counters.max_tag_stack,
                                         len(tag_stack))
        elif tok_id == h8_id.EndTag:
            if flags & BALANCED_TAGS:
                try:
                    expected = tag_stack.pop()
                except IndexError:
                    raise ParseError('Tag stack empty',
                                     s=contents,
                                     start_pos=start_pos)

                actual = lx.CanonicalTagName()
                if expected != actual:
                    raise ParseError(
                        'Got unexpected closing tag %r; opening tag was %r' %
                        (contents[start_pos:end_pos], expected),
                        s=contents,
                        start_pos=start_pos)

        start_pos = end_pos

    if len(tag_stack) != 0:
        raise ParseError('Missing closing tags at end of doc: %s' %
                         ' '.join(tag_stack),
                         s=contents,
                         start_pos=start_pos)

    counters.num_tokens += len(tokens)

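# Example (a sketch, verified by hand):
#
#   counters = Counters()
#   Validate('<p>hi</p>', BALANCED_TAGS, counters)
#   counters.num_start_tags  # -> 1
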
def ToXml(htm8_str):
    # type: (str) -> str

    # TODO:
    # 1. Lex it
    # 2. < & > must be escaped
    #    a. in raw data
    #    b. in quoted strings
    # 3. <script> turned into CDATA
    # 4. void tags turned into self-closing tags
    # 5. case-sensitive tag matching - not sure about this

    tag_lexer = TagLexer(htm8_str)
    val_lexer = AttrValueLexer(htm8_str)

    f = StringIO()
    out = Output(htm8_str, f)

    lx = Lexer(htm8_str)

    pos = 0
    while True:
        tok_id, end_pos = lx.Read()

        if tok_id == h8_id.Invalid:
            raise LexError(htm8_str, pos)
        if tok_id == h8_id.EndOfStream:
            break

        if tok_id in (h8_id.RawData, h8_id.CharEntity, h8_id.HexChar,
                      h8_id.DecChar):
            out.PrintUntil(end_pos)
        elif tok_id in (h8_id.StartTag, h8_id.StartEndTag):
            tag_lexer.Reset(pos, end_pos)
            # TODO: reduce allocations here
            all_attrs = tag_lexer.AllAttrsRawSlice()
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                # TODO: get the kind of string
                #
                # Quoted:   we need to replace & with &amp; and < with &lt;
                #           note > is not allowed
                # Unquoted: right now, we can just surround with double
                #           quotes because we don't allow any bad chars
                # Empty   : add "", so empty= becomes =""
                # Missing : add ="", so missing becomes missing=""

            tag_name = lx.CanonicalTagName()
            if tok_id == h8_id.StartTag and tag_name in VOID_ELEMENTS:
                # TODO: instead of closing >, print />
                pass

        elif tok_id == h8_id.BadAmpersand:
            #out.SkipTo(pos)
            out.Print('&amp;')
            out.SkipTo(end_pos)

        elif tok_id == h8_id.BadGreaterThan:
            #out.SkipTo(pos)
            out.Print('&gt;')
            out.SkipTo(end_pos)
        else:
            out.PrintUntil(end_pos)

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()

class Counters(object):

    def __init__(self):
        # type: () -> None
        self.num_tokens = 0
        self.num_start_tags = 0
        self.num_start_end_tags = 0
        self.num_attrs = 0
        self.max_tag_stack = 0
        self.num_val_tokens = 0

        #self.debug_attrs = []

def main(argv):
    # type: (List[str]) -> int
    action = argv[1]

    if action == 'tokens':
        contents = sys.stdin.read()

        lx = Lexer(contents)
        start_pos = 0
        while True:
            tok_id, end_pos = lx.Read()
            if tok_id == h8_id.Invalid:
                raise LexError(contents, start_pos)
            if tok_id == h8_id.EndOfStream:
                break

            frag = contents[start_pos:end_pos]
            log('%d %s %r', end_pos, h8_id_str(tok_id), frag)
            start_pos = end_pos

        return 0

    elif action in ('lex-htm8', 'parse-htm8', 'parse-xml'):

        errors = []
        counters = Counters()

        flags = LEX_ATTRS | LEX_QUOTED_VALUES
        if action.startswith('parse-'):
            flags |= BALANCED_TAGS
        if action == 'parse-xml':
            flags |= NO_SPECIAL_TAGS

        i = 0
        for line in sys.stdin:
            filename = line.strip()
            with open(filename) as f:
                contents = f.read()

            try:
                Validate(contents, flags, counters)
            except LexError as e:
                log('Lex error in %r: %s', filename, e)
                errors.append((filename, e))
            except ParseError as e:
                log('Parse error in %r: %s', filename, e)
                errors.append((filename, e))
            i += 1

        log('')
        log('%10d tokens', counters.num_tokens)
        log('%10d start/end tags', counters.num_start_end_tags)
        log('%10d start tags', counters.num_start_tags)
        log('%10d attrs', counters.num_attrs)
        log('%10d max tag stack depth', counters.max_tag_stack)
        log('%10d attr val tokens', counters.num_val_tokens)
        log('%10d errors', len(errors))
        if len(errors):
            return 1
        return 0

    elif action == 'todo':
        # Other algorithms:
        #
        # - select first subtree with given ID
        #   - this requires understanding the void tags I suppose
        # - select all subtrees that have a class
        # - materialize DOM

        # Safe-HTM8? This is a filter
        return 0

    else:
        raise RuntimeError('Invalid action %r' % action)


if __name__ == '__main__':
    sys.exit(main(sys.argv))