#!/usr/bin/env python2
"""
lazylex/html.py - Low-Level HTML Processing.

See lazylex/README.md for details.

TODO:
- Get rid of AttrValueLexer - this should be in the TagLexer
  - this also means that unquoted values can be more similar
  - We can use a single lexer mode for everything inside <>
    - the SPACE is the only difference
- UTF-8 check, like JSON8
- Static typing

"""
from __future__ import print_function

from _devbuild.gen.htm8_asdl import h8_id, h8_id_str
from typing import Iterator
from typing import Union
from typing import Any
from typing import IO

try:
    from cStringIO import StringIO
except ImportError:
    # for python3
    from io import StringIO  # type: ignore
import re
import sys

if sys.version_info.major == 2:
    from typing import List, Tuple, Optional


def log(msg, *args):
    # type: (str, *Any) -> None
    msg = msg % args
    print(msg, file=sys.stderr)


class LexError(Exception):
    """
    Examples of lex errors:

    - h8_id.Invalid, like <> or &&
    - Unclosed <!-- <? <![CDATA[ <script> <style>
    """

    def __init__(self, s, start_pos):
        # type: (str, int) -> None
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        # type: () -> str
        return '(LexError %r)' % (self.s[self.start_pos:self.start_pos + 20])


def FindLineNum(s, error_pos):
    # type: (str, int) -> int
    current_pos = 0
    line_num = 1
    while True:
        newline_pos = s.find('\n', current_pos)
        #log('current = %d, N %d, line %d', current_pos, newline_pos, line_num)

        if newline_pos == -1:  # this is the last line
            return line_num
        if newline_pos >= error_pos:
            return line_num
        line_num += 1
        current_pos = newline_pos + 1


class ParseError(Exception):
    """
    Examples of parse errors

    - unbalanced tag structure
    - ul_table.py errors
    """

    def __init__(self, msg, s=None, start_pos=-1):
        # type: (str, Optional[str], int) -> None
        self.msg = msg
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        # type: () -> str
        if self.s is not None:
            assert self.start_pos != -1, self.start_pos
            snippet = self.s[self.start_pos:self.start_pos + 20]

            line_num = FindLineNum(self.s, self.start_pos)
        else:
            snippet = ''
            line_num = -1
        msg = 'line %d: %r %r' % (line_num, self.msg, snippet)
        return msg


class Output(object):
    """Takes an underlying input buffer and an output file.  Maintains a
    position in the input buffer.

    Print FROM the input or print new text to the output.
    """

    def __init__(self, s, f, left_pos=0, right_pos=-1):
        # type: (str, IO[str], int, int) -> None
        self.s = s
        self.f = f
        self.pos = left_pos
        self.right_pos = len(s) if right_pos == -1 else right_pos

    def SkipTo(self, pos):
        # type: (int) -> None
        """Skip to a position."""
        self.pos = pos

    def PrintUntil(self, pos):
        # type: (int) -> None
        """Print from the current position until a position."""
        piece = self.s[self.pos:pos]
        self.f.write(piece)
        self.pos = pos

    def PrintTheRest(self):
        # type: () -> None
        """Print until the end of the string."""
        self.PrintUntil(self.right_pos)

    def Print(self, s):
        # type: (str) -> None
        """Print new text to the output file."""
        self.f.write(s)


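# A minimal usage sketch (not part of the library): Output interleaves
# verbatim copies of the input with replacement text.
def _DemoOutput():
    # type: () -> None
    s = '<b>hello</b>'
    f = StringIO()
    out = Output(s, f)
    out.PrintUntil(3)  # copy '<b>' from the input
    out.Print('HELLO')  # print new text
    out.SkipTo(8)  # skip over 'hello' in the input
    out.PrintTheRest()  # copy '</b>'
    log('%s', f.getvalue())  # -> '<b>HELLO</b>'

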
# HTML Tokens
# CommentBegin, ProcessingBegin, CDataBegin are "pseudo-tokens", not visible
TOKENS = 'Decl Comment CommentBegin Processing ProcessingBegin CData CDataBegin StartTag StartEndTag EndTag DecChar HexChar CharEntity RawData HtmlCData BadAmpersand BadGreaterThan BadLessThan Invalid EndOfStream'.split()


class Tok(object):
    """
    Avoid lint errors by using these aliases
    """
    pass


TOKEN_NAMES = [None] * len(TOKENS)  # type: List[str]

this_module = sys.modules[__name__]
for i, tok_str in enumerate(TOKENS):
    setattr(this_module, tok_str, i)
    setattr(Tok, tok_str, i)
    TOKEN_NAMES[i] = tok_str


def MakeLexer(rules):
    return [(re.compile(pat, re.VERBOSE), i) for (pat, i) in rules]


#
# Eggex
#
# Tag = / ~['>']+ /

# Is this valid? A single character?
# Tag = / ~'>'* /

# Maybe better: / [NOT '>']+/
# capital letters not allowed there?
#
# But then this is confusing:
# / [NOT ~digit]+/
#
# / [NOT digit] / is [^\d]
# / ~digit / is \D
#
# Or maybe:
#
# / [~ digit]+ /
# / [~ '>']+ /
# / [NOT '>']+ /

# End = / '</' Tag '>' /
# StartEnd = / '<' Tag '/>' /
# Start = / '<' Tag '>' /
#
# EntityRef = / '&' dot{* N} ';' /

# Tag name, or attribute name
# colon is used in XML

# https://www.w3.org/TR/xml/#NT-Name
# Hm there is a lot of unicode stuff.  We are simplifying parsing

_NAME = r'[a-zA-Z][a-zA-Z0-9:_\-]*'  # must start with letter

CHAR_LEX = [
    # Characters
    # https://www.w3.org/TR/xml/#sec-references
    (r'&\# [0-9]+ ;', h8_id.DecChar),
    (r'&\# x[0-9a-fA-F]+ ;', h8_id.HexChar),
    (r'& %s ;' % _NAME, h8_id.CharEntity),
    # Allow unquoted, and quoted
    (r'&', h8_id.BadAmpersand),
]

HTM8_LEX = CHAR_LEX + [
    (r'<!--', h8_id.CommentBegin),

    # Processing instructions are used for the XML header:
    # <?xml version="1.0" encoding="UTF-8"?>
    # They are technically XML-only, but in HTML5, they are another kind of
    # comment:
    #
    # https://developer.mozilla.org/en-US/docs/Web/API/ProcessingInstruction
    #
    (r'<\?', h8_id.ProcessingBegin),
    # Not necessary in HTML5, but occurs in XML
    (r'<!\[CDATA\[', h8_id.CDataBegin),  # <![CDATA[

    # Markup declarations
    # - In HTML5, there is only <!DOCTYPE html>
    # - XML has 4 more declarations: <!ELEMENT ...> ATTLIST ENTITY NOTATION
    #   - these seem to be part of DTD
    # - it's useful to skip these, and be able to parse the rest of the document
    # - Note: < is allowed?
    (r'<! [^>\x00]+ >', h8_id.Decl),

    # Tags
    # Notes:
    # - We look for a valid tag name, but we don't validate attributes.
    #   That's done in the tag lexer.
    # - We don't allow leading whitespace
    (r'</ (%s) >' % _NAME, h8_id.EndTag),
    # self-closing <br/> comes before StartTag
    # could/should these be collapsed into one rule?
    (r'< (%s) [^>\x00]* />' % _NAME, h8_id.StartEndTag),  # self-closing <br/>
    (r'< (%s) [^>\x00]* >' % _NAME, h8_id.StartTag),  # start <a>

    # HTML5 allows unescaped > in raw data, but < is not allowed.
    # https://stackoverflow.com/questions/10462348/right-angle-bracket-in-html
    #
    # - My early blog has THREE errors when disallowing >
    # - So do some .wwz files
    (r'[^&<>\x00]+', h8_id.RawData),
    (r'>', h8_id.BadGreaterThan),
    # < is an error
    (r'.', h8_id.Invalid),
]

# Old notes:
#
# Non-greedy matches are regular and can be matched in linear time
# with RE2.
#
# https://news.ycombinator.com/item?id=27099798
#
# Maybe try combining all of these for speed.

# . is any char except newline
# https://re2c.org/manual/manual_c.html

# Discarded options
#(r'<!-- .*? -->', h8_id.Comment),

# Hack from Claude: \s\S instead of re.DOTALL.  I don't like this
#(r'<!-- [\s\S]*? -->', h8_id.Comment),
#(r'<!-- (?:.|[\n])*? -->', h8_id.Comment),

HTM8_LEX_COMPILED = MakeLexer(HTM8_LEX)


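# A minimal sketch (not part of the library) of the leniency noted above:
# unescaped '&' and '>' in raw data lex as BadAmpersand / BadGreaterThan
# instead of failing, while a stray '<' lexes as h8_id.Invalid.
def _DemoLeniency():
    # type: () -> None
    for tok_id, end_pos in ValidTokenList('a > b &'):
        log('%s %d', h8_id_str(tok_id), end_pos)

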
class Lexer(object):

    def __init__(self, s, left_pos=0, right_pos=-1, no_special_tags=False):
        # type: (str, int, int, bool) -> None
        self.s = s
        self.pos = left_pos
        self.right_pos = len(s) if right_pos == -1 else right_pos
        self.no_special_tags = no_special_tags

        self.cache = {}  # string -> compiled regex pattern object

        # either </script> or </style> - we search until we see that
        self.search_state = None  # type: Optional[str]

        # Position of tag name, if applicable
        # - Set after you get a StartTag, EndTag, or StartEndTag
        # - Unset on other tags
        self.tag_pos_left = -1
        self.tag_pos_right = -1

    def _Peek(self):
        # type: () -> Tuple[int, int]
        """
        Note: not using _Peek() now
        """
        if self.pos == self.right_pos:
            return h8_id.EndOfStream, self.pos

        assert self.pos < self.right_pos, self.pos

        if self.search_state is not None and not self.no_special_tags:
            # TODO: case-insensitive search for </SCRIPT> <SCRipt> ?
            #
            # Another strategy: enter a mode where we find ONLY the end tag
            # regex, and any data that's not <, and then check the canonical
            # tag name for 'script' or 'style'.
            pos = self.s.find(self.search_state, self.pos)
            if pos == -1:
                # unterminated <script> or <style>
                raise LexError(self.s, self.pos)
            self.search_state = None
            # beginning
            return h8_id.HtmlCData, pos

        # Find the first match.
        # Note: frontend/match.py uses _LongestMatch(), which is different!
        # TODO: reconcile them.  This lexer should be expressible in re2c.

        for pat, tok_id in HTM8_LEX_COMPILED:
            m = pat.match(self.s, self.pos)
            if m:
                if tok_id in (h8_id.StartTag, h8_id.EndTag, h8_id.StartEndTag):
                    self.tag_pos_left = m.start(1)
                    self.tag_pos_right = m.end(1)
                else:
                    # Reset state
                    self.tag_pos_left = -1
                    self.tag_pos_right = -1

                if tok_id == h8_id.CommentBegin:
                    pos = self.s.find('-->', self.pos)
                    if pos == -1:
                        # unterminated <!--
                        raise LexError(self.s, self.pos)
                    return h8_id.Comment, pos + 3  # -->

                if tok_id == h8_id.ProcessingBegin:
                    pos = self.s.find('?>', self.pos)
                    if pos == -1:
                        # unterminated <?
                        raise LexError(self.s, self.pos)
                    return h8_id.Processing, pos + 2  # ?>

                if tok_id == h8_id.CDataBegin:
                    pos = self.s.find(']]>', self.pos)
                    if pos == -1:
                        # unterminated <![CDATA[
                        raise LexError(self.s, self.pos)
                    return h8_id.CData, pos + 3  # ]]>

                if tok_id == h8_id.StartTag:
                    # TODO: reduce allocations
                    if (self.TagNameEquals('script') or
                            self.TagNameEquals('style')):
                        # <SCRipt a=b> -> </SCRipt>
                        self.search_state = '</' + self._LiteralTagName() + '>'

                return tok_id, m.end()
        else:
            raise AssertionError('h8_id.Invalid rule should have matched')

    def TagNameEquals(self, expected):
        # type: (str) -> bool
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        # TODO: In C++, this does not need an allocation.  Can we test
        # directly?
        return expected == self.CanonicalTagName()

    def _LiteralTagName(self):
        # type: () -> str
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        return self.s[self.tag_pos_left:self.tag_pos_right]

    def CanonicalTagName(self):
        # type: () -> str
        tag_name = self._LiteralTagName()
        # Most tags are already lower case, so avoid allocation with this
        # conditional
        # TODO: this could go in the mycpp runtime?
        if tag_name.islower():
            return tag_name
        else:
            return tag_name.lower()

    def Read(self):
        # type: () -> Tuple[int, int]
        tok_id, end_pos = self._Peek()
        self.pos = end_pos  # advance
        return tok_id, end_pos

    def LookAhead(self, regex):
        # type: (str) -> bool
        # Cache the regex compilation.  This could also be LookAheadFor(THEAD)
        # or something.
        pat = self.cache.get(regex)
        if pat is None:
            pat = re.compile(regex)
            self.cache[regex] = pat

        m = pat.match(self.s, self.pos)
        return m is not None


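# A minimal sketch (not part of the library) of the special-tag handling in
# _Peek() above: everything between <script> and </script> comes back as a
# single h8_id.HtmlCData token, because the lexer switches to searching for
# the literal end tag.
def _DemoScriptCData():
    # type: () -> None
    lx = Lexer('<script>if (a < b) { f(); }</script>')
    tok_id, end_pos = lx.Read()  # h8_id.StartTag for <script>
    tok_id, end_pos = lx.Read()  # h8_id.HtmlCData, ends right before </script>
    log('%s ends at %d', h8_id_str(tok_id), end_pos)

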
def _Tokens(s, left_pos, right_pos):
    # type: (str, int, int) -> Iterator[Tuple[int, int]]
    """
    Args:
      s: string to parse
      left_pos, right_pos: Optional span boundaries.
    """
    lx = Lexer(s, left_pos, right_pos)
    while True:
        tok_id, pos = lx.Read()
        yield tok_id, pos
        if tok_id == h8_id.EndOfStream:
            break


def ValidTokens(s, left_pos=0, right_pos=-1):
    # type: (str, int, int) -> Iterator[Tuple[int, int]]
    """Wrapper around _Tokens to prevent callers from having to handle Invalid.

    I'm not combining the two functions because I might want to do a
    'yield' transformation on Tokens(), and exceptions might complicate
    that.
    """
    pos = left_pos
    for tok_id, end_pos in _Tokens(s, left_pos, right_pos):
        if tok_id == h8_id.Invalid:
            raise LexError(s, pos)
        yield tok_id, end_pos
        pos = end_pos


def ValidTokenList(s, no_special_tags=False):
    # type: (str, bool) -> List[Tuple[int, int]]
    """A wrapper that can be more easily translated to C++.  Doesn't use iterators."""

    start_pos = 0
    tokens = []
    lx = Lexer(s, no_special_tags=no_special_tags)
    while True:
        tok_id, end_pos = lx.Read()
        tokens.append((tok_id, end_pos))
        if tok_id == h8_id.EndOfStream:
            break
        if tok_id == h8_id.Invalid:
            raise LexError(s, start_pos)
        start_pos = end_pos
    return tokens


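# A minimal sketch (not part of the library): each token is (tok_id, end_pos),
# so a token's text is the span from the previous end_pos to its own.
def _DemoValidTokenList():
    # type: () -> None
    s = '<p>hi</p>'
    start_pos = 0
    for tok_id, end_pos in ValidTokenList(s):
        # StartTag '<p>', RawData 'hi', EndTag '</p>', then EndOfStream
        log('%s %r', h8_id_str(tok_id), s[start_pos:end_pos])
        start_pos = end_pos

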
# Tag names:
# Match <a or </a
# Match <h2, but not <2h
#
# HTML 5 doesn't restrict tag names at all
# https://html.spec.whatwg.org/#toc-syntax
#
# XML allows : - .
# https://www.w3.org/TR/xml/#NT-NameChar

# Namespaces for MathML, SVG
# XLink, XML, XMLNS
#
# https://infra.spec.whatwg.org/#namespaces
#
# Allow - for td-attrs

# Be very lenient - just no whitespace or special HTML chars
# I don't think this is more lenient than HTML5, though we should check.
_UNQUOTED_VALUE = r'''[^ \t\r\n<>&"'\x00]*'''

# TODO: we don't need to capture the tag name here?  That's done at the top
# level
_TAG_RE = re.compile(r'/? \s* (%s)' % _NAME, re.VERBOSE)

_TAG_LAST_RE = re.compile(r'\s* /? >', re.VERBOSE)

# To match href="foo"
# Note: in HTML5 and XML, single quoted attributes are also valid

# <button disabled> is standard usage

# NOTE: This used to allow whitespace around =
# <a foo = "bar"> makes sense in XML
# But then you also have
# <a foo= bar> - which is TWO attributes, in HTML5
# So the space is problematic

_ATTR_RE = re.compile(
    r'''
\s+                   # Leading whitespace is required
(%s)                  # Attribute name
(?:                   # Optional attribute value
    \s* = \s*         # Spaces allowed around =
    (?:
        " ([^>"\x00]*) "    # double quoted value
      | ' ([^>'\x00]*) '    # single quoted value
      | (%s)                # unquoted attribute value
    )
)?
''' % (_NAME, _UNQUOTED_VALUE), re.VERBOSE)

TagName, AttrName, UnquotedValue, QuotedValue, MissingValue = range(5)


class TagLexer(object):
    """
    Given a tag like <a href="..."> or <link type="..." />, the TagLexer
    provides a few operations:

    - What is the tag?
    - Iterate through the attributes, giving (name, value_start_pos, value_end_pos)
    """

    def __init__(self, s):
        # type: (str) -> None
        self.s = s
        self.start_pos = -1  # Invalid
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        # type: (int, int) -> None
        """Reuse instances of this object."""
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos

        self.start_pos = start_pos
        self.end_pos = end_pos

    def TagString(self):
        # type: () -> str
        return self.s[self.start_pos:self.end_pos]

    def TagName(self):
        # type: () -> str
        # First event
        tok_id, start, end = next(self.Tokens())
        return self.s[start:end]

    def GetSpanForAttrValue(self, attr_name):
        # type: (str) -> Tuple[int, int]
        """
        Used by oils_doc.py, for href shortcuts
        """
        # Algorithm: search for QuotedValue or UnquotedValue after AttrName
        # TODO: Could also cache these

        events = self.Tokens()
        val = (-1, -1)
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == AttrName:
                    name = self.s[start:end]
                    if name == attr_name:
                        # The value should come next
                        tok_id, start, end = next(events)
                        assert tok_id in (QuotedValue, UnquotedValue,
                                          MissingValue), tok_id
                        val = start, end
                        break

        except StopIteration:
            pass
        return val

    def GetAttrRaw(self, attr_name):
        # type: (str) -> Optional[str]
        """
        Return the value, which may be UNESCAPED.
        """
        start, end = self.GetSpanForAttrValue(attr_name)
        if start == -1:
            return None
        return self.s[start:end]

    def AllAttrsRawSlice(self):
        # type: () -> List[Tuple[str, int, int]]
        """
        Get a list of triples, like [('class', 3, 5), ('href', 9, 12)]
        """
        slices = []
        events = self.Tokens()
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == AttrName:
                    name = self.s[start:end]

                    # The value should come next
                    tok_id, start, end = next(events)
                    assert tok_id in (QuotedValue, UnquotedValue,
                                      MissingValue), tok_id
                    # Note: quoted values may have &amp;
                    # We would need ANOTHER lexer to unescape them, but we
                    # don't need that for ul-table
                    slices.append((name, start, end))
        except StopIteration:
            pass
        return slices

    def AllAttrsRaw(self):
        # type: () -> List[Tuple[str, str]]
        """
        Get a list of pairs [('class', 'foo'), ('href', '?foo=1&amp;bar=2')]

        The quoted values may be escaped.  We would need another lexer to
        unescape them.
        """
        slices = self.AllAttrsRawSlice()
        pairs = []
        for name, start, end in slices:
            pairs.append((name, self.s[start:end]))
        return pairs

    def Tokens(self):
        # type: () -> Iterator[Tuple[int, int, int]]
        """
        Yields a sequence of tokens: Tag (AttrName AttrValue?)*

        Where each Token is (Type, start_pos, end_pos)

        Note that start and end are NOT redundant!  We skip over some unwanted
        characters.
        """
        m = _TAG_RE.match(self.s, self.start_pos + 1)
        if not m:
            raise RuntimeError("Couldn't find HTML tag in %r" %
                               self.TagString())
        yield TagName, m.start(1), m.end(1)

        pos = m.end(0)
        #log('POS %d', pos)

        while True:
            # don't search past the end
            m = _ATTR_RE.match(self.s, pos, self.end_pos)
            if not m:
                #log('BREAK pos %d', pos)
                break
            #log('AttrName %r', m.group(1))

            yield AttrName, m.start(1), m.end(1)

            #log('m.groups() %r', m.groups())
            if m.group(2) is not None:
                # double quoted
                yield QuotedValue, m.start(2), m.end(2)
            elif m.group(3) is not None:
                # single quoted - TODO: could have different token types
                yield QuotedValue, m.start(3), m.end(3)
            elif m.group(4) is not None:
                yield UnquotedValue, m.start(4), m.end(4)
            else:
                # <button disabled>
                end = m.end(0)
                yield MissingValue, end, end

            # Skip past the "
            pos = m.end(0)

        #log('TOK %r', self.s)

        m = _TAG_LAST_RE.match(self.s, pos)
        #log('_TAG_LAST_RE match %r', self.s[pos:])
        if not m:
            # Extra data at end of tag.  TODO: add messages for all these.
            raise LexError(self.s, pos)


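# A minimal sketch (not part of the library): the positions passed to Reset()
# are the span of one raw tag, as found by Lexer.
def _DemoTagLexer():
    # type: () -> None
    s = '<a href="/foo" class=bar>'
    tag_lexer = TagLexer(s)
    tag_lexer.Reset(0, len(s))
    log('tag = %s', tag_lexer.TagName())  # -> a
    log('attrs = %s', tag_lexer.AllAttrsRaw())
    # -> [('href', '/foo'), ('class', 'bar')]

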
# This is similar but not identical to
#    " ([^>"\x00]*) "    # double quoted value
#  | ' ([^>'\x00]*) '    # single quoted value
#
# Note: for unquoted values, & isn't allowed, and thus &amp; and &#99; and
# &#x99; are not allowed.  We could relax that?
ATTR_VALUE_LEXER = CHAR_LEX + [
    (r'[^>&\x00]+', h8_id.RawData),
    (r'.', h8_id.Invalid),
]

ATTR_VALUE_LEXER = MakeLexer(ATTR_VALUE_LEXER)


class AttrValueLexer(object):
    """
    <a href="foo=99&amp;bar">
    <a href='foo=99&amp;bar'>
    <a href=unquoted>
    """

    def __init__(self, s):
        # type: (str) -> None
        self.s = s
        self.start_pos = -1  # Invalid
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        # type: (int, int) -> None
        """Reuse instances of this object."""
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos

        self.start_pos = start_pos
        self.end_pos = end_pos

    def NumTokens(self):
        # type: () -> int
        num_tokens = 0
        pos = self.start_pos
        for tok_id, end_pos in self.Tokens():
            if tok_id == h8_id.Invalid:
                raise LexError(self.s, pos)
            pos = end_pos
            #log('pos %d', pos)
            num_tokens += 1
        return num_tokens

    def Tokens(self):
        # type: () -> Iterator[Tuple[int, int]]
        pos = self.start_pos
        while pos < self.end_pos:
            # Find the first match, like above.
            # Note: frontend/match.py uses _LongestMatch(), which is different!
            # TODO: reconcile them.  This lexer should be expressible in re2c.
            for pat, tok_id in ATTR_VALUE_LEXER:
                m = pat.match(self.s, pos)
                if m:
                    if 0:
                        tok_str = m.group(0)
                        log('token = %r', tok_str)

                    end_pos = m.end(0)
                    yield tok_id, end_pos
                    pos = end_pos
                    break
            else:
                raise AssertionError('h8_id.Invalid rule should have matched')


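# A minimal sketch (not part of the library): validate the characters of one
# attribute value span.  'foo=99&amp;bar' lexes as RawData, CharEntity,
# RawData - 3 tokens.
def _DemoAttrValueLexer():
    # type: () -> None
    s = '<a href="foo=99&amp;bar">'
    val_lexer = AttrValueLexer(s)
    val_lexer.Reset(9, 23)  # the span between the double quotes
    log('%d tokens', val_lexer.NumTokens())

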
def ReadUntilStartTag(it, tag_lexer, tag_name):
    # type: (Iterator, TagLexer, str) -> Tuple[int, int]
    """Find the next <foo>, returning its (start, end) positions

    Raise ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == h8_id.StartTag and tag_lexer.TagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No start tag %r' % tag_name)


def ReadUntilEndTag(it, tag_lexer, tag_name):
    # type: (Iterator, TagLexer, str) -> Tuple[int, int]
    """Find the next </foo>, returning its (start, end) position

    Raise ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == h8_id.EndTag and tag_lexer.TagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No end tag %r' % tag_name)


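# A minimal sketch (not part of the library): find the span of the first
# <title> start tag in a document.
def _DemoReadUntil():
    # type: () -> None
    s = '<html><title>hi</title></html>'
    it = ValidTokens(s)
    tag_lexer = TagLexer(s)
    start, end = ReadUntilStartTag(it, tag_lexer, 'title')
    log('<title> spans [%d, %d)', start, end)

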
CHAR_ENTITY = {
    'amp': '&',
    'lt': '<',
    'gt': '>',
    'quot': '"',
    'apos': "'",
}


def ToText(s, left_pos=0, right_pos=-1):
    # type: (str, int, int) -> str
    """Given HTML, return text by unquoting &gt; and &lt; etc.

    Used by:
      doctools/oils_doc.py: PygmentsPlugin
      doctools/help_gen.py: HelpIndexCards

    In the latter case, we could process some tags, like:

    - Blue Link (not clickable, but still useful)
    - Red X

    That should be html.ToAnsi.
    """
    f = StringIO()
    out = Output(s, f, left_pos, right_pos)

    pos = left_pos
    for tok_id, end_pos in ValidTokens(s, left_pos, right_pos):
        if tok_id in (h8_id.RawData, h8_id.BadAmpersand, h8_id.BadGreaterThan,
                      h8_id.BadLessThan):
            out.SkipTo(pos)
            out.PrintUntil(end_pos)

        elif tok_id == h8_id.CharEntity:  # &amp;

            entity = s[pos + 1:end_pos - 1]

            out.SkipTo(pos)
            out.Print(CHAR_ENTITY[entity])
            out.SkipTo(end_pos)

        # Not handling these yet
        elif tok_id == h8_id.HexChar:
            raise AssertionError('Hex Char %r' % s[pos:pos + 20])

        elif tok_id == h8_id.DecChar:
            raise AssertionError('Dec Char %r' % s[pos:pos + 20])

        else:
            # Skip everything else
            out.SkipTo(end_pos)

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()


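# A minimal sketch (not part of the library): tags are dropped, and the five
# entities in CHAR_ENTITY are decoded.
def _DemoToText():
    # type: () -> None
    log('%r', ToText('<b>x &amp; y</b>'))  # -> 'x & y'

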
# https://developer.mozilla.org/en-US/docs/Glossary/Void_element
VOID_ELEMENTS = [
    'area',
    'base',
    'br',
    'col',
    'embed',
    'hr',
    'img',
    'input',
    'link',
    'meta',
    'param',
    'source',
    'track',
    'wbr',
]

LEX_ATTRS = 1 << 1
LEX_QUOTED_VALUES = 1 << 2  # href="?x=42&amp;y=99"
NO_SPECIAL_TAGS = 1 << 3  # <script> <style>, VOID tags, etc.
BALANCED_TAGS = 1 << 4  # are tags balanced?


def Validate(contents, flags, counters):
    # type: (str, int, Counters) -> None

    tag_lexer = TagLexer(contents)
    val_lexer = AttrValueLexer(contents)

    no_special_tags = bool(flags & NO_SPECIAL_TAGS)
    lx = Lexer(contents, no_special_tags=no_special_tags)
    tokens = []
    start_pos = 0
    tag_stack = []
    while True:
        tok_id, end_pos = lx.Read()
        #log('TOP %s %r', h8_id_str(tok_id), contents[start_pos:end_pos])

        if tok_id == h8_id.Invalid:
            raise LexError(contents, start_pos)
        if tok_id == h8_id.EndOfStream:
            break

        tokens.append((tok_id, end_pos))

        if tok_id == h8_id.StartEndTag:
            counters.num_start_end_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRawSlice()
            counters.num_attrs += len(all_attrs)
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                counters.num_val_tokens += val_lexer.NumTokens()

            counters.debug_attrs.extend(all_attrs)

        elif tok_id == h8_id.StartTag:
            counters.num_start_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRawSlice()
            counters.num_attrs += len(all_attrs)
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                counters.num_val_tokens += val_lexer.NumTokens()

            counters.debug_attrs.extend(all_attrs)

            if flags & BALANCED_TAGS:
                tag_name = lx.CanonicalTagName()
                if flags & NO_SPECIAL_TAGS:
                    tag_stack.append(tag_name)
                else:
                    # e.g. <meta> is considered self-closing, like <meta/>
                    if tag_name not in VOID_ELEMENTS:
                        tag_stack.append(tag_name)

                counters.max_tag_stack = max(counters.max_tag_stack,
                                             len(tag_stack))
        elif tok_id == h8_id.EndTag:
            if flags & BALANCED_TAGS:
                try:
                    expected = tag_stack.pop()
                except IndexError:
                    raise ParseError('Tag stack empty',
                                     s=contents,
                                     start_pos=start_pos)

                actual = lx.CanonicalTagName()
                if expected != actual:
                    raise ParseError(
                        'Got unexpected closing tag %r; opening tag was %r' %
                        (contents[start_pos:end_pos], expected),
                        s=contents,
                        start_pos=start_pos)

        start_pos = end_pos

    if len(tag_stack) != 0:
        raise ParseError('Missing closing tags at end of doc: %s' %
                         ' '.join(tag_stack),
                         s=contents,
                         start_pos=start_pos)

    counters.num_tokens += len(tokens)


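# A minimal sketch (not part of the library): validate a whole document and
# check that its tags are balanced.
def _DemoValidate():
    # type: () -> None
    counters = Counters()
    flags = LEX_ATTRS | LEX_QUOTED_VALUES | BALANCED_TAGS
    Validate('<p><b>hi</b></p>', flags, counters)
    log('%d tokens, max stack %d', counters.num_tokens,
        counters.max_tag_stack)

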
def ToXml(htm8_str):
    # type: (str) -> str

    # TODO:
    # 1. Lex it
    # 2. < & > must be escaped
    #    a. in raw data
    #    b. in quoted strings
    # 3. <script> turned into CDATA
    # 4. void tags turned into self-closing tags
    # 5. case-sensitive tag matching - not sure about this

    tag_lexer = TagLexer(htm8_str)
    val_lexer = AttrValueLexer(htm8_str)

    f = StringIO()
    out = Output(htm8_str, f)

    lx = Lexer(htm8_str)

    pos = 0
    while True:
        tok_id, end_pos = lx.Read()

        if tok_id == h8_id.Invalid:
            raise LexError(htm8_str, pos)
        if tok_id == h8_id.EndOfStream:
            break

        if tok_id in (h8_id.RawData, h8_id.CharEntity, h8_id.HexChar,
                      h8_id.DecChar):
            out.PrintUntil(end_pos)
        elif tok_id in (h8_id.StartTag, h8_id.StartEndTag):
            tag_lexer.Reset(pos, end_pos)
            # TODO: reduce allocations here
            all_attrs = tag_lexer.AllAttrsRawSlice()
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                # TODO: get the kind of string
                #
                # Quoted  : we need to replace & with &amp; and < with &lt;
                #           note > is not allowed
                # Unquoted: right now, we can just surround with double quotes
                #           because we don't allow any bad chars
                # Empty   : add "", so empty= becomes =""
                # Missing : add ="", so missing becomes missing=""

            tag_name = lx.CanonicalTagName()
            if tok_id == h8_id.StartTag and tag_name in VOID_ELEMENTS:
                # TODO: instead of closing >, print />
                pass

        elif tok_id == h8_id.BadAmpersand:
            #out.SkipTo(pos)
            out.Print('&amp;')
            out.SkipTo(end_pos)

        elif tok_id == h8_id.BadGreaterThan:
            #out.SkipTo(pos)
            out.Print('&gt;')
            out.SkipTo(end_pos)
        else:
            out.PrintUntil(end_pos)

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()


class Counters(object):

    def __init__(self):
        # type: () -> None
        self.num_tokens = 0
        self.num_start_tags = 0
        self.num_start_end_tags = 0
        self.num_attrs = 0
        self.max_tag_stack = 0
        self.num_val_tokens = 0

        self.debug_attrs = []


def main(argv):
    # type: (List[str]) -> int
    action = argv[1]

    if action == 'tokens':
        contents = sys.stdin.read()

        lx = Lexer(contents)
        start_pos = 0
        while True:
            tok_id, end_pos = lx.Read()
            if tok_id == h8_id.Invalid:
                raise LexError(contents, start_pos)
            if tok_id == h8_id.EndOfStream:
                break

            frag = contents[start_pos:end_pos]
            log('%d %s %r', end_pos, h8_id_str(tok_id), frag)
            start_pos = end_pos

        return 0

    elif action in ('lex-htm8', 'parse-htm8', 'parse-xml'):

        errors = []
        counters = Counters()

        flags = LEX_ATTRS | LEX_QUOTED_VALUES
        if action.startswith('parse-'):
            flags |= BALANCED_TAGS
        if action == 'parse-xml':
            flags |= NO_SPECIAL_TAGS

        i = 0
        for line in sys.stdin:
            filename = line.strip()
            with open(filename) as f:
                contents = f.read()

            try:
                Validate(contents, flags, counters)
            except LexError as e:
                log('Lex error in %r: %s', filename, e)
                errors.append((filename, e))
            except ParseError as e:
                log('Parse error in %r: %s', filename, e)
                errors.append((filename, e))
            i += 1

        log('')
        log('%10d tokens', counters.num_tokens)
        log('%10d start/end tags', counters.num_start_end_tags)
        log('%10d start tags', counters.num_start_tags)
        log('%10d attrs', counters.num_attrs)
        log('%10d max tag stack depth', counters.max_tag_stack)
        log('%10d attr val tokens', counters.num_val_tokens)
        log('%10d errors', len(errors))
        if len(errors):
            return 1
        return 0

    elif action == 'todo':
        # Other algorithms:
        #
        # - select first subtree with given ID
        #   - this requires understanding the void tags I suppose
        # - select all subtrees that have a class
        # - materialize DOM

        # Safe-HTM8?  This is a filter
        return 0

    else:
        raise RuntimeError('Invalid action %r' % action)


if __name__ == '__main__':
    sys.exit(main(sys.argv))