#!/usr/bin/env python2
"""
lazylex/html.py - Low-Level HTML Processing.

See lazylex/README.md for details.
"""
from __future__ import print_function

try:
    from cStringIO import StringIO
except ImportError:
    from io import StringIO  # python3
import re
import sys

if sys.version_info.major == 2:
    from typing import List, Tuple, Optional


def log(msg, *args):
    msg = msg % args
    print(msg, file=sys.stderr)


class LexError(Exception):
    """
    Examples of lex errors:

    - Tok.Invalid, like <> or &&
    - Unclosed <!--  <?  <![CDATA[  <script>  <style>
    """

    def __init__(self, s, start_pos):
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        return '(LexError %r)' % (self.s[self.start_pos:self.start_pos + 20])


def FindLineNum(s, error_pos):
    current_pos = 0
    line_num = 1
    while True:
        newline_pos = s.find('\n', current_pos)
        #log('current = %d, N %d, line %d', current_pos, newline_pos, line_num)

        if newline_pos == -1:  # this is the last line
            return line_num
        if newline_pos >= error_pos:
            return line_num
        line_num += 1
        current_pos = newline_pos + 1

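
# A quick sanity check for FindLineNum(), as a sketch; _ExampleFindLineNum is
# a hypothetical name, not part of the original module.
def _ExampleFindLineNum():
    s = 'one\ntwo\nthree\n'
    assert FindLineNum(s, 0) == 1  # 'o' in 'one' is on line 1
    assert FindLineNum(s, 4) == 2  # 't' in 'two' is on line 2
    assert FindLineNum(s, 8) == 3  # 't' in 'three' is on line 3
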

class ParseError(Exception):
    """
    Examples of parse errors:

    - unbalanced tag structure
    - ul_table.py errors
    """

    def __init__(self, msg, s=None, start_pos=-1):
        self.msg = msg
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        if self.s is not None:
            assert self.start_pos != -1, self.start_pos
            snippet = (self.s[self.start_pos:self.start_pos + 20])

            line_num = FindLineNum(self.s, self.start_pos)
        else:
            snippet = ''
            line_num = -1
        msg = 'line %d: %r %r' % (line_num, self.msg, snippet)
        return msg


class Output(object):
    """Takes an underlying input buffer and an output file.  Maintains a
    position in the input buffer.

    Print FROM the input, or print new text to the output.
    """

    def __init__(self, s, f, left_pos=0, right_pos=-1):
        self.s = s
        self.f = f
        self.pos = left_pos
        self.right_pos = len(s) if right_pos == -1 else right_pos

    def SkipTo(self, pos):
        """Skip to a position."""
        self.pos = pos

    def PrintUntil(self, pos):
        """Print until a position."""
        piece = self.s[self.pos:pos]
        self.f.write(piece)
        self.pos = pos

    def PrintTheRest(self):
        """Print until the end of the string."""
        self.PrintUntil(self.right_pos)

    def Print(self, s):
        """Print text to the underlying buffer."""
        self.f.write(s)

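
# Sketch of how Output is driven -- the same SkipTo/PrintUntil pattern that
# ToText() and ToXml() use at the bottom of this file.  _ExampleOutputUsage
# is a hypothetical name.
def _ExampleOutputUsage():
    s = '<b>hi</b>'
    f = StringIO()
    out = Output(s, f)
    out.PrintUntil(3)    # copy '<b>' from the input
    out.Print('HI')      # emit new text
    out.SkipTo(5)        # skip over 'hi' in the input
    out.PrintTheRest()   # copy '</b>'
    return f.getvalue()  # '<b>HI</b>'
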

# HTML Tokens
# CommentBegin, ProcessingBegin, CDataBegin are "pseudo-tokens", not visible
TOKENS = 'Decl Comment CommentBegin Processing ProcessingBegin CData CDataBegin StartTag StartEndTag EndTag DecChar HexChar CharEntity RawData HtmlCData BadAmpersand BadGreaterThan BadLessThan Invalid EndOfStream'.split()


class Tok(object):
    """
    Avoid lint errors by using these aliases
    """
    pass


TOKEN_NAMES = [None] * len(TOKENS)  # type: List[str]

this_module = sys.modules[__name__]
for i, tok_str in enumerate(TOKENS):
    setattr(this_module, tok_str, i)
    setattr(Tok, tok_str, i)
    TOKEN_NAMES[i] = tok_str


def TokenName(tok_id):
    return TOKEN_NAMES[tok_id]


def MakeLexer(rules):
    return [(re.compile(pat, re.VERBOSE), i) for (pat, i) in rules]


#
# Eggex
#
# Tag = / ~['>']+ /

# Is this valid?  A single character?
# Tag = / ~'>'* /

# Maybe better: / [NOT '>']+ /
# capital letters not allowed there?
#
# But then this is confusing:
#  / [NOT ~digit]+ /
#
#  / [NOT digit] / is [^\d]
#  / ~digit /      is \D
#
# Or maybe:
#
#  / [~ digit]+ /
#  / [~ '>']+ /
#  / [NOT '>']+ /

# End = / '</' Tag '>' /
# StartEnd = / '<' Tag '/>' /
# Start = / '<' Tag '>' /
#
# EntityRef = / '&' dot{* N} ';' /

# Tag name, or attribute name
# colon is used in XML

# https://www.w3.org/TR/xml/#NT-Name
# Hm there is a lot of unicode stuff.  We are simplifying parsing.

_NAME = r'[a-zA-Z][a-zA-Z0-9:_\-]*'  # must start with letter

CHAR_LEX = [
    # Characters
    # https://www.w3.org/TR/xml/#sec-references
    (r'&\# [0-9]+ ;', Tok.DecChar),
    (r'&\# x[0-9a-fA-F]+ ;', Tok.HexChar),
    (r'& %s ;' % _NAME, Tok.CharEntity),
    # Allow unquoted, and quoted
    (r'&', Tok.BadAmpersand),
]

LEXER = CHAR_LEX + [
    (r'<!--', Tok.CommentBegin),

    # Processing instructions are used for the XML header:
    #   <?xml version="1.0" encoding="UTF-8"?>
    # They are technically XML-only, but in HTML5, they are another kind of
    # comment:
    #
    #   https://developer.mozilla.org/en-US/docs/Web/API/ProcessingInstruction
    #
    (r'<\?', Tok.ProcessingBegin),
    # Not necessary in HTML5, but occurs in XML
    (r'<!\[CDATA\[', Tok.CDataBegin),  # <![CDATA[

    # Markup declarations
    # - In HTML5, there is only <!DOCTYPE html>
    # - XML has 4 more declarations: <!ELEMENT ...> ATTLIST ENTITY NOTATION
    #   - these seem to be part of DTD
    # - it's useful to skip these, and be able to parse the rest of the document
    # - Note: < is allowed?
    (r'<! [^>\x00]+ >', Tok.Decl),

    # Tags
    # Notes:
    # - We look for a valid tag name, but we don't validate attributes.
    #   That's done in the tag lexer.
    # - We don't allow leading whitespace
    (r'</ (%s) >' % _NAME, Tok.EndTag),
    # self-closing <br/> comes before StartTag
    # could/should these be collapsed into one rule?
    (r'< (%s) [^>\x00]* />' % _NAME, Tok.StartEndTag),  # self-closing <br/>
    (r'< (%s) [^>\x00]* >' % _NAME, Tok.StartTag),  # start <a>

    # HTML5 allows unescaped > in raw data, but < is not allowed.
    # https://stackoverflow.com/questions/10462348/right-angle-bracket-in-html
    #
    # - My early blog has THREE errors when disallowing >
    # - So do some .wwz files
    (r'[^&<>\x00]+', Tok.RawData),
    (r'>', Tok.BadGreaterThan),
    # < is an error
    (r'.', Tok.Invalid),
]

# Old notes:
#
# Non-greedy matches are regular and can be matched in linear time
# with RE2.
#
# https://news.ycombinator.com/item?id=27099798
#
# Maybe try combining all of these for speed.

# . is any char except newline
# https://re2c.org/manual/manual_c.html

# Discarded options
#(r'<!-- .*? -->', Tok.Comment),

# Hack from Claude: \s\S instead of re.DOTALL.  I don't like this
#(r'<!-- [\s\S]*? -->', Tok.Comment),
#(r'<!-- (?:.|[\n])*? -->', Tok.Comment),

LEXER = MakeLexer(LEXER)


class Lexer(object):

    def __init__(self, s, left_pos=0, right_pos=-1, no_special_tags=False):
        self.s = s
        self.pos = left_pos
        self.right_pos = len(s) if right_pos == -1 else right_pos
        self.no_special_tags = no_special_tags

        self.cache = {}  # string -> compiled regex pattern object

        # either </script> or </style> - we search until we see that
        self.search_state = None  # type: Optional[str]

        # Position of tag name, if applicable
        # - Set after you get a StartTag, EndTag, or StartEndTag
        # - Unset on other tags
        self.tag_pos_left = -1
        self.tag_pos_right = -1

    def _Peek(self):
        # type: () -> Tuple[int, int]
        """Return (tok_id, end_pos) of the next token, without advancing."""
        if self.pos == self.right_pos:
            return Tok.EndOfStream, self.pos

        assert self.pos < self.right_pos, self.pos

        if self.search_state is not None and not self.no_special_tags:
            # TODO: case-insensitive search for </SCRIPT> <SCRipt> ?
            #
            # Another strategy: enter a mode where we find ONLY the end tag
            # regex, and any data that's not <, and then check the canonical
            # tag name for 'script' or 'style'.
            pos = self.s.find(self.search_state, self.pos)
            if pos == -1:
                # unterminated <script> or <style>
                raise LexError(self.s, self.pos)
            self.search_state = None
            # beginning
            return Tok.HtmlCData, pos

        # Find the first match.
        # Note: frontend/match.py uses _LongestMatch(), which is different!
        # TODO: reconcile them.  This lexer should be expressible in re2c.

        for pat, tok_id in LEXER:
            m = pat.match(self.s, self.pos)
            if m:
                if tok_id in (Tok.StartTag, Tok.EndTag, Tok.StartEndTag):
                    self.tag_pos_left = m.start(1)
                    self.tag_pos_right = m.end(1)
                else:
                    # Reset state
                    self.tag_pos_left = -1
                    self.tag_pos_right = -1

                if tok_id == Tok.CommentBegin:
                    pos = self.s.find('-->', self.pos)
                    if pos == -1:
                        # unterminated <!--
                        raise LexError(self.s, self.pos)
                    return Tok.Comment, pos + 3  # -->

                if tok_id == Tok.ProcessingBegin:
                    pos = self.s.find('?>', self.pos)
                    if pos == -1:
                        # unterminated <?
                        raise LexError(self.s, self.pos)
                    return Tok.Processing, pos + 2  # ?>

                if tok_id == Tok.CDataBegin:
                    pos = self.s.find(']]>', self.pos)
                    if pos == -1:
                        # unterminated <![CDATA[
                        raise LexError(self.s, self.pos)
                    return Tok.CData, pos + 3  # ]]>

                if tok_id == Tok.StartTag:
                    # TODO: reduce allocations
                    if (self.TagNameEquals('script') or
                            self.TagNameEquals('style')):
                        # <SCRipt a=b> -> </SCRipt>
                        self.search_state = '</' + self._LiteralTagName() + '>'

                return tok_id, m.end()
        else:
            raise AssertionError('Tok.Invalid rule should have matched')

    def TagNameEquals(self, expected):
        # type: (str) -> bool
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        # TODO: In C++, this does not need an allocation.  Can we test
        # directly?
        return expected == self.CanonicalTagName()

    def _LiteralTagName(self):
        # type: () -> str
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        return self.s[self.tag_pos_left:self.tag_pos_right]

    def CanonicalTagName(self):
        # type: () -> str
        tag_name = self._LiteralTagName()
        # Most tags are already lower case, so avoid allocation with this
        # conditional
        # TODO: this could go in the mycpp runtime?
        if tag_name.islower():
            return tag_name
        else:
            return tag_name.lower()

    def Read(self):
        # type: () -> Tuple[int, int]
        tok_id, end_pos = self._Peek()
        self.pos = end_pos  # advance
        return tok_id, end_pos

    def LookAhead(self, regex):
        # Cache the regex compilation.  This could also be LookAheadFor(THEAD)
        # or something.
        pat = self.cache.get(regex)
        if pat is None:
            pat = re.compile(regex)
            self.cache[regex] = pat

        m = pat.match(self.s, self.pos)
        return m is not None

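
# Sketch of driving the Lexer directly; _ExampleLexerUsage is a hypothetical
# name.  Read() returns (tok_id, end_pos) pairs -- a token's start is the
# previous end_pos -- and LookAhead() tests a regex at the current position
# without consuming anything.
def _ExampleLexerUsage():
    s = '<ul><li>one</li></ul>'
    lx = Lexer(s)
    assert lx.LookAhead(r'<ul>')  # we are at the start
    tok_id, end_pos = lx.Read()   # consumes '<ul>'
    assert tok_id == Tok.StartTag, TokenName(tok_id)
    assert end_pos == 4, end_pos
    assert lx.LookAhead(r'<li>')  # now <li> is next
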

def _Tokens(s, left_pos, right_pos):
    """
    Args:
      s: string to parse
      left_pos, right_pos: Optional span boundaries.
    """
    lx = Lexer(s, left_pos, right_pos)
    while True:
        tok_id, pos = lx.Read()
        yield tok_id, pos
        if tok_id == Tok.EndOfStream:
            break


def ValidTokens(s, left_pos=0, right_pos=-1):
    """Wrapper around _Tokens to prevent callers from having to handle Invalid.

    I'm not combining the two functions, because I might want to do a 'yield'
    transformation on Tokens()?  Exceptions might complicate that.
    """
    pos = left_pos
    for tok_id, end_pos in _Tokens(s, left_pos, right_pos):
        if tok_id == Tok.Invalid:
            raise LexError(s, pos)
        yield tok_id, end_pos
        pos = end_pos


def ValidTokenList(s, no_special_tags=False):
    """A wrapper that can be more easily translated to C++.  Doesn't use
    iterators."""

    start_pos = 0
    tokens = []
    lx = Lexer(s, no_special_tags=no_special_tags)
    while True:
        tok_id, end_pos = lx.Read()
        tokens.append((tok_id, end_pos))
        if tok_id == Tok.EndOfStream:
            break
        if tok_id == Tok.Invalid:
            raise LexError(s, start_pos)
        start_pos = end_pos
    return tokens

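
# Sketch showing how to recover token text: tokens carry only end positions,
# so each token's span runs from the previous end_pos to its own.
# _ExampleTokenSpans is a hypothetical name.
def _ExampleTokenSpans():
    s = '<p>hi</p>'
    start_pos = 0
    for tok_id, end_pos in ValidTokens(s):
        if tok_id == Tok.EndOfStream:
            break
        frag = s[start_pos:end_pos]  # '<p>', then 'hi', then '</p>'
        log('%s %r', TokenName(tok_id), frag)
        start_pos = end_pos
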

# Tag names:
#   Match <a  or </a
#   Match <h2, but not <2h
#
# HTML 5 doesn't restrict tag names at all
#   https://html.spec.whatwg.org/#toc-syntax
#
# XML allows : - .
#   https://www.w3.org/TR/xml/#NT-NameChar

# Namespaces for MathML, SVG
# XLink, XML, XMLNS
#
# https://infra.spec.whatwg.org/#namespaces
#
# Allow - for td-attrs

# Be very lenient - just no whitespace or special HTML chars
# I don't think this is more lenient than HTML5, though we should check.
_UNQUOTED_VALUE = r'''[^ \t\r\n<>&"'\x00]*'''

# TODO: we don't need to capture the tag name here?  That's done at the top
# level
_TAG_RE = re.compile(r'/? \s* (%s)' % _NAME, re.VERBOSE)

_TAG_LAST_RE = re.compile(r'\s* /? >', re.VERBOSE)

# To match href="foo"
# Note: in HTML5 and XML, single quoted attributes are also valid

# <button disabled> is standard usage

# NOTE: This used to allow whitespace around =
# <a foo = "bar"> makes sense in XML
# But then you also have
# <a foo= bar> - which is TWO attributes, in HTML5
# So the space is problematic

_ATTR_RE = re.compile(
    r'''
\s+                     # Leading whitespace is required
(%s)                    # Attribute name
(?:                     # Optional attribute value
  \s* = \s*             # Spaces allowed around =
  (?:
    " ([^>"\x00]*) "    # double quoted value
  | ' ([^>'\x00]*) '    # single quoted value
  | (%s)                # unquoted value
  )
)?
''' % (_NAME, _UNQUOTED_VALUE), re.VERBOSE)

TagName, AttrName, UnquotedValue, QuotedValue, MissingValue = range(5)


class TagLexer(object):
    """
    Given a tag like <a href="..."> or <link type="..." />, the TagLexer
    provides a few operations:

    - What is the tag?
    - Iterate through the attributes, giving (name, value_start_pos,
      value_end_pos)
    """

    def __init__(self, s):
        self.s = s
        self.start_pos = -1  # Invalid
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        """Reuse instances of this object."""
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos

        self.start_pos = start_pos
        self.end_pos = end_pos

    def TagString(self):
        return self.s[self.start_pos:self.end_pos]

    def TagName(self):
        # First event
        tok_id, start, end = next(self.Tokens())
        return self.s[start:end]

    def GetSpanForAttrValue(self, attr_name):
        """
        Used by oils_doc.py, for href shortcuts
        """
        # Algorithm: search for QuotedValue or UnquotedValue after AttrName
        # TODO: Could also cache these

        events = self.Tokens()
        val = (-1, -1)
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == AttrName:
                    name = self.s[start:end]
                    if name == attr_name:
                        # The value should come next
                        tok_id, start, end = next(events)
                        assert tok_id in (QuotedValue, UnquotedValue,
                                          MissingValue), TokenName(tok_id)
                        val = start, end
                        break

        except StopIteration:
            pass
        return val

    def GetAttrRaw(self, attr_name):
        """
        Return the value, which may be UNESCAPED.
        """
        start, end = self.GetSpanForAttrValue(attr_name)
        if start == -1:
            return None
        return self.s[start:end]

    def AllAttrsRawSlice(self):
        """
        Get a list of (name, start, end) triples, like
        [('class', 3, 5), ('href', 9, 12)]
        """
        slices = []
        events = self.Tokens()
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == AttrName:
                    name = self.s[start:end]

                    # The value should come next
                    tok_id, start, end = next(events)
                    assert tok_id in (QuotedValue, UnquotedValue,
                                      MissingValue), TokenName(tok_id)
                    # Note: quoted values may have &amp;
                    # We would need ANOTHER lexer to unescape them, but we
                    # don't need that for ul-table
                    slices.append((name, start, end))
        except StopIteration:
            pass
        return slices

    def AllAttrsRaw(self):
        """
        Get a list of pairs [('class', 'foo'), ('href', '?foo=1&amp;bar=2')]

        The quoted values may be escaped.  We would need another lexer to
        unescape them.
        """
        slices = self.AllAttrsRawSlice()
        pairs = []
        for name, start, end in slices:
            pairs.append((name, self.s[start:end]))
        return pairs

    def Tokens(self):
        """
        Yields a sequence of tokens: Tag (AttrName AttrValue?)*

        Where each Token is (Type, start_pos, end_pos)

        Note that start and end are NOT redundant!  We skip over some unwanted
        characters.
        """
        m = _TAG_RE.match(self.s, self.start_pos + 1)
        if not m:
            raise RuntimeError("Couldn't find HTML tag in %r" %
                               self.TagString())
        yield TagName, m.start(1), m.end(1)

        pos = m.end(0)
        #log('POS %d', pos)

        while True:
            # don't search past the end
            m = _ATTR_RE.match(self.s, pos, self.end_pos)
            if not m:
                #log('BREAK pos %d', pos)
                break
            #log('AttrName %r', m.group(1))

            yield AttrName, m.start(1), m.end(1)

            #log('m.groups() %r', m.groups())
            if m.group(2) is not None:
                # double quoted
                yield QuotedValue, m.start(2), m.end(2)
            elif m.group(3) is not None:
                # single quoted - TODO: could have different token types
                yield QuotedValue, m.start(3), m.end(3)
            elif m.group(4) is not None:
                yield UnquotedValue, m.start(4), m.end(4)
            else:
                # <button disabled>
                end = m.end(0)
                yield MissingValue, end, end

            # Skip past the closing quote
            pos = m.end(0)

        #log('TOK %r', self.s)

        m = _TAG_LAST_RE.match(self.s, pos)
        #log('_TAG_LAST_RE match %r', self.s[pos:])
        if not m:
            # Extra data at end of tag.  TODO: add messages for all these.
            raise LexError(self.s, pos)

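
# Sketch of typical TagLexer usage, with the hypothetical name
# _ExampleTagLexer: Reset() it to the span of a StartTag token, then query it.
def _ExampleTagLexer():
    s = '<a href="/index.html" class="big">'
    tag_lexer = TagLexer(s)
    tag_lexer.Reset(0, len(s))  # the span of the whole <a ...> token
    assert tag_lexer.TagName() == 'a'
    assert tag_lexer.GetAttrRaw('href') == '/index.html'
    assert tag_lexer.GetAttrRaw('nope') is None
    # [('href', '/index.html'), ('class', 'big')]
    return tag_lexer.AllAttrsRaw()
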

# This is similar but not identical to
#    " ([^>"\x00]*) "        # double quoted value
#  | ' ([^>'\x00]*) '        # single quoted value
#
# Note: for unquoted values, & isn't allowed, and thus &amp; and &#99; and
# &#x99; are not allowed.  We could relax that?
ATTR_VALUE_LEXER = CHAR_LEX + [
    (r'[^>&\x00]+', Tok.RawData),
    (r'.', Tok.Invalid),
]

ATTR_VALUE_LEXER = MakeLexer(ATTR_VALUE_LEXER)


class AttrValueLexer(object):
    """
    <a href="foo=99&amp;bar">
    <a href='foo=99&amp;bar'>
    <a href=unquoted>
    """

    def __init__(self, s):
        self.s = s
        self.start_pos = -1  # Invalid
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        """Reuse instances of this object."""
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos

        self.start_pos = start_pos
        self.end_pos = end_pos

    def NumTokens(self):
        num_tokens = 0
        pos = self.start_pos
        for tok_id, end_pos in self.Tokens():
            if tok_id == Tok.Invalid:
                raise LexError(self.s, pos)
            pos = end_pos
            #log('pos %d', pos)
            num_tokens += 1
        return num_tokens

    def Tokens(self):
        pos = self.start_pos
        while pos < self.end_pos:
            # Find the first match, like above.
            # Note: frontend/match.py uses _LongestMatch(), which is different!
            # TODO: reconcile them.  This lexer should be expressible in re2c.
            for pat, tok_id in ATTR_VALUE_LEXER:
                # Bound by end_pos so we don't match past the attribute value
                # (mirrors the endpos in TagLexer.Tokens above)
                m = pat.match(self.s, pos, self.end_pos)
                if m:
                    if 0:
                        tok_str = m.group(0)
                        log('token = %r', tok_str)

                    end_pos = m.end(0)
                    yield tok_id, end_pos
                    pos = end_pos
                    break
            else:
                raise AssertionError('Tok.Invalid rule should have matched')

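
# Sketch of validating one attribute value with AttrValueLexer;
# _ExampleAttrValueLexer is a hypothetical name.  The value 'foo=99&amp;bar'
# lexes into RawData, CharEntity, RawData -- 3 tokens.
def _ExampleAttrValueLexer():
    s = '<a href="foo=99&amp;bar">'
    val_lexer = AttrValueLexer(s)
    val_lexer.Reset(9, 23)  # the span between the double quotes
    return val_lexer.NumTokens()  # 3
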

def ReadUntilStartTag(it, tag_lexer, tag_name):
    """Find the next <foo>, returning its (start, end) positions

    Raise ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == Tok.StartTag and tag_lexer.TagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No start tag %r' % tag_name)


def ReadUntilEndTag(it, tag_lexer, tag_name):
    """Find the next </foo>, returning its (start, end) position

    Raise ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == Tok.EndTag and tag_lexer.TagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No end tag %r' % tag_name)

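
# Sketch of how the ReadUntil* helpers are driven: they consume from a token
# iterator such as ValidTokens(), and Reset() the given tag_lexer as a side
# effect.  _ExampleReadUntil is a hypothetical name.
def _ExampleReadUntil():
    s = '<div><p>text</p></div>'
    it = ValidTokens(s)
    tag_lexer = TagLexer(s)
    start, end = ReadUntilStartTag(it, tag_lexer, 'p')
    assert s[start:end] == '<p>', s[start:end]
    start, end = ReadUntilEndTag(it, tag_lexer, 'p')
    assert s[start:end] == '</p>', s[start:end]
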

CHAR_ENTITY = {
    'amp': '&',
    'lt': '<',
    'gt': '>',
    'quot': '"',
    'apos': "'",
}


def ToText(s, left_pos=0, right_pos=-1):
    """Given HTML, return text by unquoting &gt; and &lt; etc.

    Used by:
      doctools/oils_doc.py: PygmentsPlugin
      doctools/help_gen.py: HelpIndexCards

    In the latter case, we could process some tags, like:

    - Blue Link (not clickable, but still useful)
    - Red X

    That should be html.ToAnsi.
    """
    f = StringIO()
    out = Output(s, f, left_pos, right_pos)

    pos = left_pos
    for tok_id, end_pos in ValidTokens(s, left_pos, right_pos):
        if tok_id in (Tok.RawData, Tok.BadAmpersand, Tok.BadGreaterThan,
                      Tok.BadLessThan):
            out.SkipTo(pos)
            out.PrintUntil(end_pos)

        elif tok_id == Tok.CharEntity:  # &amp;

            entity = s[pos + 1:end_pos - 1]

            out.SkipTo(pos)
            out.Print(CHAR_ENTITY[entity])
            out.SkipTo(end_pos)

        # Not handling these yet
        elif tok_id == Tok.HexChar:
            raise AssertionError('Hex Char %r' % s[pos:pos + 20])

        elif tok_id == Tok.DecChar:
            raise AssertionError('Dec Char %r' % s[pos:pos + 20])

        else:
            # Skip everything else
            out.SkipTo(end_pos)

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()

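
# Sketch: ToText() strips tags and decodes the entities in CHAR_ENTITY.
# _ExampleToText is a hypothetical name.
def _ExampleToText():
    assert ToText('<b>x &amp; y</b>') == 'x & y'
    assert ToText('3 &gt; 2') == '3 > 2'
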

# https://developer.mozilla.org/en-US/docs/Glossary/Void_element
VOID_ELEMENTS = [
    'area',
    'base',
    'br',
    'col',
    'embed',
    'hr',
    'img',
    'input',
    'link',
    'meta',
    'param',
    'source',
    'track',
    'wbr',
]

LEX_ATTRS = 1 << 1
LEX_QUOTED_VALUES = 1 << 2  # href="?x=42&amp;y=99"
NO_SPECIAL_TAGS = 1 << 3  # <script> <style>, VOID tags, etc.
BALANCED_TAGS = 1 << 4  # are tags balanced?


def Validate(contents, flags, counters):
    # type: (str, int, Counters) -> None

    tag_lexer = TagLexer(contents)
    val_lexer = AttrValueLexer(contents)

    no_special_tags = bool(flags & NO_SPECIAL_TAGS)
    lx = Lexer(contents, no_special_tags=no_special_tags)
    tokens = []
    start_pos = 0
    tag_stack = []
    while True:
        tok_id, end_pos = lx.Read()
        #log('TOP %s %r', TokenName(tok_id), contents[start_pos:end_pos])

        if tok_id == Tok.Invalid:
            raise LexError(contents, start_pos)
        if tok_id == Tok.EndOfStream:
            break

        tokens.append((tok_id, end_pos))

        if tok_id == Tok.StartEndTag:
            counters.num_start_end_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRawSlice()
            counters.num_attrs += len(all_attrs)
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                counters.num_val_tokens += val_lexer.NumTokens()

            counters.debug_attrs.extend(all_attrs)

        elif tok_id == Tok.StartTag:
            counters.num_start_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRawSlice()
            counters.num_attrs += len(all_attrs)
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                counters.num_val_tokens += val_lexer.NumTokens()

            counters.debug_attrs.extend(all_attrs)

            if flags & BALANCED_TAGS:
                tag_name = lx.CanonicalTagName()
                if flags & NO_SPECIAL_TAGS:
                    tag_stack.append(tag_name)
                else:
                    # e.g. <meta> is considered self-closing, like <meta/>
                    if tag_name not in VOID_ELEMENTS:
                        tag_stack.append(tag_name)

            counters.max_tag_stack = max(counters.max_tag_stack,
                                         len(tag_stack))
        elif tok_id == Tok.EndTag:
            if flags & BALANCED_TAGS:
                try:
                    expected = tag_stack.pop()
                except IndexError:
                    raise ParseError('Tag stack empty',
                                     s=contents,
                                     start_pos=start_pos)

                actual = lx.CanonicalTagName()
                if expected != actual:
                    raise ParseError(
                        'Got unexpected closing tag %r; opening tag was %r' %
                        (contents[start_pos:end_pos], expected),
                        s=contents,
                        start_pos=start_pos)

        start_pos = end_pos

    if len(tag_stack) != 0:
        raise ParseError('Missing closing tags at end of doc: %s' %
                         ' '.join(tag_stack),
                         s=contents,
                         start_pos=start_pos)

    counters.num_tokens += len(tokens)

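
# Sketch of calling Validate() with the flags defined above; Counters is
# defined at the bottom of this file, which is fine because Python resolves
# the name when _ExampleValidate (a hypothetical name) is actually called.
def _ExampleValidate():
    contents = '<p class="big">hi</p>'
    counters = Counters()
    Validate(contents, LEX_ATTRS | LEX_QUOTED_VALUES | BALANCED_TAGS, counters)
    assert counters.num_start_tags == 1, counters.num_start_tags
    assert counters.num_attrs == 1, counters.num_attrs
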

def ToXml(htm8_str):
    # type: (str) -> str

    # TODO:
    # 1. Lex it
    # 2. < & > must be escaped
    #    a. in raw data
    #    b. in quoted strings
    # 3. <script> turned into CDATA
    # 4. void tags turned into self-closing tags
    # 5. case-sensitive tag matching - not sure about this

    tag_lexer = TagLexer(htm8_str)
    val_lexer = AttrValueLexer(htm8_str)

    f = StringIO()
    out = Output(htm8_str, f)

    lx = Lexer(htm8_str)

    pos = 0
    while True:
        tok_id, end_pos = lx.Read()

        if tok_id == Tok.Invalid:
            raise LexError(htm8_str, pos)
        if tok_id == Tok.EndOfStream:
            break

        if tok_id in (Tok.RawData, Tok.CharEntity, Tok.HexChar, Tok.DecChar):
            out.PrintUntil(end_pos)
        elif tok_id in (Tok.StartTag, Tok.StartEndTag):
            tag_lexer.Reset(pos, end_pos)
            # TODO: reduce allocations here
            all_attrs = tag_lexer.AllAttrsRawSlice()
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                # TODO: get the kind of string
                #
                # Quoted:   we need to replace & with &amp; and < with &lt;
                #           note > is not allowed
                # Unquoted: right now, we can just surround with double quotes
                #           because we don't allow any bad chars
                # Empty   : add "", so empty= becomes =""
                # Missing : add ="", so missing becomes missing=""

            tag_name = lx.CanonicalTagName()
            if tok_id == Tok.StartTag and tag_name in VOID_ELEMENTS:
                # TODO: instead of closing >, print />
                pass

        elif tok_id == Tok.BadAmpersand:
            #out.SkipTo(pos)
            out.Print('&amp;')
            out.SkipTo(end_pos)

        elif tok_id == Tok.BadGreaterThan:
            #out.SkipTo(pos)
            out.Print('&gt;')
            out.SkipTo(end_pos)
        else:
            out.PrintUntil(end_pos)

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()

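
# Sketch of current ToXml() behavior: bare & and > in raw data get escaped,
# while the TODOs above (CDATA, void tags) are not done yet.  _ExampleToXml
# is a hypothetical name.
def _ExampleToXml():
    assert ToXml('<p>a & b</p>') == '<p>a &amp; b</p>'
    assert ToXml('<p>2 > 1</p>') == '<p>2 &gt; 1</p>'
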

class Counters(object):

    def __init__(self):
        self.num_tokens = 0
        self.num_start_tags = 0
        self.num_start_end_tags = 0
        self.num_attrs = 0
        self.max_tag_stack = 0
        self.num_val_tokens = 0

        self.debug_attrs = []


def main(argv):
    action = argv[1]

    if action == 'tokens':
        contents = sys.stdin.read()

        lx = Lexer(contents)
        start_pos = 0
        while True:
            tok_id, end_pos = lx.Read()
            if tok_id == Tok.Invalid:
                raise LexError(contents, start_pos)
            if tok_id == Tok.EndOfStream:
                break

            frag = contents[start_pos:end_pos]
            log('%d %s %r', end_pos, TokenName(tok_id), frag)
            start_pos = end_pos

        return 0

    elif action in ('lex-htm8', 'parse-htm8', 'parse-xml'):

        errors = []
        counters = Counters()

        flags = LEX_ATTRS | LEX_QUOTED_VALUES
        if action.startswith('parse-'):
            flags |= BALANCED_TAGS
        if action == 'parse-xml':
            flags |= NO_SPECIAL_TAGS

        i = 0
        for line in sys.stdin:
            filename = line.strip()
            with open(filename) as f:
                contents = f.read()

            try:
                Validate(contents, flags, counters)
            except LexError as e:
                log('Lex error in %r: %s', filename, e)
                errors.append((filename, e))
            except ParseError as e:
                log('Parse error in %r: %s', filename, e)
                errors.append((filename, e))
            i += 1

        log('')
        log('%10d tokens', counters.num_tokens)
        log('%10d start/end tags', counters.num_start_end_tags)
        log('%10d start tags', counters.num_start_tags)
        log('%10d attrs', counters.num_attrs)
        log('%10d max tag stack depth', counters.max_tag_stack)
        log('%10d attr val tokens', counters.num_val_tokens)
        log('%10d errors', len(errors))
        if len(errors):
            return 1
        return 0

    elif action == 'todo':
        # Other algorithms:
        #
        # - select first subtree with given ID
        #   - this requires understanding the void tags I suppose
        # - select all subtrees that have a class
        # - materialize DOM

        # Safe-HTM8?  This is a filter
        return 0

    else:
        raise RuntimeError('Invalid action %r' % action)


if __name__ == '__main__':
    sys.exit(main(sys.argv))