#!/usr/bin/env python2
"""
lazylex/html.py - Low-Level HTML Processing.

See lazylex/README.md for details.
"""
from __future__ import print_function

try:
    from cStringIO import StringIO
except ImportError:
    # for python3
    from io import StringIO  # type: ignore
import re
import sys

if sys.version_info.major == 2:
    from typing import List, Tuple, Optional


def log(msg, *args):
    msg = msg % args
    print(msg, file=sys.stderr)


class LexError(Exception):
    """
    Examples of lex errors:

    - Tok.Invalid, like <> or &&
    - Unclosed <!--  <?  <![CDATA[  <script>  <style>
    """

    def __init__(self, s, start_pos):
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        return '(LexError %r)' % (self.s[self.start_pos:self.start_pos + 20])


def FindLineNum(s, error_pos):
    current_pos = 0
    line_num = 1
    while True:
        newline_pos = s.find('\n', current_pos)
        #log('current = %d, N %d, line %d', current_pos, newline_pos, line_num)

        if newline_pos == -1:  # this is the last line
            return line_num
        if newline_pos >= error_pos:
            return line_num
        line_num += 1
        current_pos = newline_pos + 1


class ParseError(Exception):
    """
    Examples of parse errors:

    - unbalanced tag structure
    - ul_table.py errors
    """

    def __init__(self, msg, s=None, start_pos=-1):
        self.msg = msg
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        if self.s is not None:
            assert self.start_pos != -1, self.start_pos
            snippet = self.s[self.start_pos:self.start_pos + 20]

            line_num = FindLineNum(self.s, self.start_pos)
        else:
            snippet = ''
            line_num = -1
        msg = 'line %d: %r %r' % (line_num, self.msg, snippet)
        return msg


class Output(object):
    """Takes an underlying input buffer and an output file.  Maintains a
    position in the input buffer.

    Print FROM the input, or print new text to the output.
    """

    def __init__(self, s, f, left_pos=0, right_pos=-1):
        self.s = s
        self.f = f
        self.pos = left_pos
        self.right_pos = len(s) if right_pos == -1 else right_pos

    def SkipTo(self, pos):
        """Skip to a position in the input."""
        self.pos = pos

    def PrintUntil(self, pos):
        """Print from the current position up to pos."""
        piece = self.s[self.pos:pos]
        self.f.write(piece)
        self.pos = pos

    def PrintTheRest(self):
        """Print until the end of the input."""
        self.PrintUntil(self.right_pos)

    def Print(self, s):
        """Print new text to the output file."""
        self.f.write(s)

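# A hedged usage sketch of Output (this _Demo helper is ours, not part of the
# original module): copy most of the input unchanged, substituting one span.
def _DemoOutput():
    # type: () -> str
    f = StringIO()
    out = Output('<b>hi</b>', f)
    out.PrintUntil(3)  # copy '<b>' from the input
    out.Print('HI')  # print new text instead of 'hi'
    out.SkipTo(5)  # skip over 'hi' in the input
    out.PrintTheRest()  # copy '</b>'
    return f.getvalue()  # '<b>HI</b>'
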
# HTML Tokens
# CommentBegin, ProcessingBegin, CDataBegin are "pseudo-tokens", not visible
TOKENS = 'Decl Comment CommentBegin Processing ProcessingBegin CData CDataBegin StartTag StartEndTag EndTag DecChar HexChar CharEntity RawData HtmlCData BadAmpersand BadGreaterThan BadLessThan Invalid EndOfStream'.split(
)


class Tok(object):
    """
    Avoid lint errors by using these aliases.
    """
    pass


TOKEN_NAMES = [None] * len(TOKENS)  # type: List[str]

this_module = sys.modules[__name__]
for i, tok_str in enumerate(TOKENS):
    setattr(this_module, tok_str, i)
    setattr(Tok, tok_str, i)
    TOKEN_NAMES[i] = tok_str


def TokenName(tok_id):
    return TOKEN_NAMES[tok_id]


def MakeLexer(rules):
    return [(re.compile(pat, re.VERBOSE), i) for (pat, i) in rules]

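# A hedged sanity check (ours, not in the original module): the loop above
# assigns each token name a distinct integer ID, recoverable via TokenName().
def _DemoTokenIds():
    # type: () -> None
    assert Tok.StartTag != Tok.EndTag  # distinct integer IDs
    assert TokenName(Tok.StartTag) == 'StartTag'
    assert TokenName(Tok.EndOfStream) == 'EndOfStream'
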
#
# Eggex
#
# Tag      = / ~['>']+ /

# Is this valid? A single character?
# Tag      = / ~'>'* /

# Maybe better: / [NOT '>']+/
# capital letters not allowed there?
#
# But then this is confusing:
#   / [NOT ~digit]+/
#
# / [NOT digit] / is [^\d]
# / ~digit /     is \D
#
# Or maybe:
#
# / [~ digit]+ /
# / [~ '>']+ /
# / [NOT '>']+ /

# End      = / '</' Tag '>' /
# StartEnd = / '<' Tag '/>' /
# Start    = / '<' Tag '>' /
#
# EntityRef = / '&' dot{* N} ';' /

# Tag name, or attribute name
# colon is used in XML

# https://www.w3.org/TR/xml/#NT-Name
# Hm there is a lot of unicode stuff.  We are simplifying parsing.

_NAME = r'[a-zA-Z][a-zA-Z0-9:_\-]*'  # must start with letter

CHAR_LEX = [
    # Characters
    # https://www.w3.org/TR/xml/#sec-references
    (r'&\# [0-9]+ ;', Tok.DecChar),
    (r'&\# x[0-9a-fA-F]+ ;', Tok.HexChar),
    (r'& %s ;' % _NAME, Tok.CharEntity),
    # A bare & that doesn't start an entity
    (r'&', Tok.BadAmpersand),
]

HTM8_LEX = CHAR_LEX + [
    (r'<!--', Tok.CommentBegin),

    # Processing instructions are used for the XML header:
    #   <?xml version="1.0" encoding="UTF-8"?>
    # They are technically XML-only, but in HTML5, they are another kind of
    # comment:
    #
    #   https://developer.mozilla.org/en-US/docs/Web/API/ProcessingInstruction
    #
    (r'<\?', Tok.ProcessingBegin),
    # Not necessary in HTML5, but occurs in XML
    (r'<!\[CDATA\[', Tok.CDataBegin),  # <![CDATA[

    # Markup declarations
    # - In HTML5, there is only <!DOCTYPE html>
    # - XML has 4 more declarations: <!ELEMENT ...> ATTLIST ENTITY NOTATION
    #   - these seem to be part of the DTD
    # - it's useful to skip these, and be able to parse the rest of the document
    # - Note: < is allowed?
    (r'<! [^>\x00]+ >', Tok.Decl),

    # Tags
    # Notes:
    # - We look for a valid tag name, but we don't validate attributes.
    #   That's done in the tag lexer.
    # - We don't allow leading whitespace
    (r'</ (%s) >' % _NAME, Tok.EndTag),  # end </a>
    # self-closing <br/> comes before StartTag
    # could/should these be collapsed into one rule?
    (r'< (%s) [^>\x00]* />' % _NAME, Tok.StartEndTag),  # self-closing <br/>
    (r'< (%s) [^>\x00]* >' % _NAME, Tok.StartTag),  # start <a>

    # HTML5 allows unescaped > in raw data, but < is not allowed.
    # https://stackoverflow.com/questions/10462348/right-angle-bracket-in-html
    #
    # - My early blog has THREE errors when disallowing >
    # - So do some .wwz files
    (r'[^&<>\x00]+', Tok.RawData),
    (r'>', Tok.BadGreaterThan),
    # < is an error
    (r'.', Tok.Invalid),
]

# Old notes:
#
# Non-greedy matches are regular and can be matched in linear time
# with RE2.
#
#   https://news.ycombinator.com/item?id=27099798
#
# Maybe try combining all of these for speed.

# . is any char except newline
#   https://re2c.org/manual/manual_c.html

# Discarded options
#(r'<!-- .*? -->', Tok.Comment),

# Hack from Claude: \s\S instead of re.DOTALL.  I don't like this
#(r'<!-- [\s\S]*? -->', Tok.Comment),
#(r'<!-- (?:.|[\n])*? -->', Tok.Comment),

HTM8_LEX_COMPILED = MakeLexer(HTM8_LEX)

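# A hedged check (ours, not in the original module) of the rule-ordering note
# above: rules are tried in order, so '<br/>' must hit StartEndTag before the
# more general StartTag rule gets a chance.
def _DemoRuleOrder():
    # type: () -> None
    for pat, tok_id in HTM8_LEX_COMPILED:
        m = pat.match('<br/>')
        if m:
            assert tok_id == Tok.StartEndTag, TokenName(tok_id)
            break
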
class Lexer(object):

    def __init__(self, s, left_pos=0, right_pos=-1, no_special_tags=False):
        self.s = s
        self.pos = left_pos
        self.right_pos = len(s) if right_pos == -1 else right_pos
        self.no_special_tags = no_special_tags

        self.cache = {}  # string -> compiled regex pattern object

        # either </script> or </style> - we search until we see that
        self.search_state = None  # type: Optional[str]

        # Position of tag name, if applicable
        # - Set after you get a StartTag, EndTag, or StartEndTag
        # - Unset on other tags
        self.tag_pos_left = -1
        self.tag_pos_right = -1

    def _Peek(self):
        # type: () -> Tuple[int, int]
        """
        Note: not using _Peek() now
        """
        if self.pos == self.right_pos:
            return Tok.EndOfStream, self.pos

        assert self.pos < self.right_pos, self.pos

        if self.search_state is not None and not self.no_special_tags:
            # TODO: case-insensitive search for </SCRIPT> <SCRipt> ?
            #
            # Another strategy: enter a mode where we find ONLY the end tag
            # regex, and any data that's not <, and then check the canonical
            # tag name for 'script' or 'style'.
            pos = self.s.find(self.search_state, self.pos)
            if pos == -1:
                # unterminated <script> or <style>
                raise LexError(self.s, self.pos)
            self.search_state = None
            # beginning of the end tag
            return Tok.HtmlCData, pos

        # Find the first match.
        # Note: frontend/match.py uses _LongestMatch(), which is different!
        # TODO: reconcile them.  This lexer should be expressible in re2c.

        for pat, tok_id in HTM8_LEX_COMPILED:
            m = pat.match(self.s, self.pos)
            if m:
                if tok_id in (Tok.StartTag, Tok.EndTag, Tok.StartEndTag):
                    self.tag_pos_left = m.start(1)
                    self.tag_pos_right = m.end(1)
                else:
                    # Reset state
                    self.tag_pos_left = -1
                    self.tag_pos_right = -1

                if tok_id == Tok.CommentBegin:
                    pos = self.s.find('-->', self.pos)
                    if pos == -1:
                        # unterminated <!--
                        raise LexError(self.s, self.pos)
                    return Tok.Comment, pos + 3  # -->

                if tok_id == Tok.ProcessingBegin:
                    pos = self.s.find('?>', self.pos)
                    if pos == -1:
                        # unterminated <?
                        raise LexError(self.s, self.pos)
                    return Tok.Processing, pos + 2  # ?>

                if tok_id == Tok.CDataBegin:
                    pos = self.s.find(']]>', self.pos)
                    if pos == -1:
                        # unterminated <![CDATA[
                        raise LexError(self.s, self.pos)
                    return Tok.CData, pos + 3  # ]]>

                if tok_id == Tok.StartTag:
                    # TODO: reduce allocations
                    if (self.TagNameEquals('script') or
                            self.TagNameEquals('style')):
                        # <SCRipt a=b> -> </SCRipt>
                        self.search_state = '</' + self._LiteralTagName() + '>'

                return tok_id, m.end()
        else:
            raise AssertionError('Tok.Invalid rule should have matched')

    def TagNameEquals(self, expected):
        # type: (str) -> bool
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        # TODO: In C++, this does not need an allocation.  Can we test
        # directly?
        return expected == self.CanonicalTagName()

    def _LiteralTagName(self):
        # type: () -> str
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        return self.s[self.tag_pos_left:self.tag_pos_right]

    def CanonicalTagName(self):
        # type: () -> str
        tag_name = self._LiteralTagName()
        # Most tags are already lower case, so avoid allocation with this
        # conditional
        # TODO: this could go in the mycpp runtime?
        if tag_name.islower():
            return tag_name
        else:
            return tag_name.lower()

    def Read(self):
        # type: () -> Tuple[int, int]
        tok_id, end_pos = self._Peek()
        self.pos = end_pos  # advance
        return tok_id, end_pos

    def LookAhead(self, regex):
        # Cache the regex compilation.  This could also be LookAheadFor(THEAD)
        # or something.
        pat = self.cache.get(regex)
        if pat is None:
            pat = re.compile(regex)
            self.cache[regex] = pat

        m = pat.match(self.s, self.pos)
        return m is not None

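# A hedged usage sketch (ours, not in the original module): Lexer yields
# (tok_id, end_pos) pairs, and a token's text is the slice between successive
# end positions.
def _DemoLexer():
    # type: () -> None
    s = '<p>hi</p>'
    lx = Lexer(s)
    pos = 0
    while True:
        tok_id, end_pos = lx.Read()
        if tok_id == Tok.EndOfStream:
            break
        log('%s %r', TokenName(tok_id), s[pos:end_pos])  # e.g. StartTag '<p>'
        pos = end_pos
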
def _Tokens(s, left_pos, right_pos):
    """
    Args:
      s: string to parse
      left_pos, right_pos: Optional span boundaries.
    """
    lx = Lexer(s, left_pos, right_pos)
    while True:
        tok_id, pos = lx.Read()
        yield tok_id, pos
        if tok_id == Tok.EndOfStream:
            break


def ValidTokens(s, left_pos=0, right_pos=-1):
    """Wrapper around _Tokens to prevent callers from having to handle Invalid.

    I'm not combining the two functions because I might want to do a
    'yield' transformation on Tokens()?  Exceptions might complicate the
    issue?
    """
    pos = left_pos
    for tok_id, end_pos in _Tokens(s, left_pos, right_pos):
        if tok_id == Tok.Invalid:
            raise LexError(s, pos)
        yield tok_id, end_pos
        pos = end_pos


def ValidTokenList(s, no_special_tags=False):
    """A wrapper that can be more easily translated to C++.  Doesn't use
    iterators."""

    start_pos = 0
    tokens = []
    lx = Lexer(s, no_special_tags=no_special_tags)
    while True:
        tok_id, end_pos = lx.Read()
        tokens.append((tok_id, end_pos))
        if tok_id == Tok.EndOfStream:
            break
        if tok_id == Tok.Invalid:
            raise LexError(s, start_pos)
        start_pos = end_pos
    return tokens

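# A hedged error-handling sketch (ours): '<>' hits the Tok.Invalid rule, which
# ValidTokenList turns into a LexError.
def _DemoInvalid():
    # type: () -> None
    try:
        ValidTokenList('<>')
        raise AssertionError('should have raised LexError')
    except LexError as e:
        log('%s', e)
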
# Tag names:
#   Match <a  or </a
#   Match <h2, but not <2h
#
# HTML 5 doesn't restrict tag names at all
#   https://html.spec.whatwg.org/#toc-syntax
#
# XML allows : - .
#   https://www.w3.org/TR/xml/#NT-NameChar

# Namespaces for MathML, SVG
# XLink, XML, XMLNS
#
#   https://infra.spec.whatwg.org/#namespaces
#
# Allow - for td-attrs

# Be very lenient - just no whitespace or special HTML chars
# I don't think this is more lenient than HTML5, though we should check.
_UNQUOTED_VALUE = r'''[^ \t\r\n<>&"'\x00]*'''

# TODO: we don't need to capture the tag name here?  That's done at the top
# level
_TAG_RE = re.compile(r'/? \s* (%s)' % _NAME, re.VERBOSE)

_TAG_LAST_RE = re.compile(r'\s* /? >', re.VERBOSE)

# To match href="foo"
# Note: in HTML5 and XML, single quoted attributes are also valid

# <button disabled> is standard usage

# NOTE: This used to allow whitespace around =
#   <a foo = "bar"> makes sense in XML
# But then you also have
#   <a foo= bar> - which is TWO attributes, in HTML5
# So the space is problematic

_ATTR_RE = re.compile(
    r'''
\s+                     # Leading whitespace is required
(%s)                    # Attribute name
(?:                     # Optional attribute value
  \s* = \s*             # Spaces allowed around =
  (?:
    " ([^>"\x00]*) "    # double quoted value
  | ' ([^>'\x00]*) '    # single quoted value
  | (%s)                # unquoted value
  )
)?
''' % (_NAME, _UNQUOTED_VALUE), re.VERBOSE)

TagName, AttrName, UnquotedValue, QuotedValue, MissingValue = range(5)

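# A hedged illustration (ours, not in the original module) of _ATTR_RE's
# capture groups: group 1 is the attribute name, group 2 a double-quoted
# value, group 3 single-quoted, group 4 unquoted.
def _DemoAttrRe():
    # type: () -> None
    m = _ATTR_RE.match(' href="/x"')
    assert m.group(1) == 'href', m.group(1)
    assert m.group(2) == '/x', m.group(2)
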
class TagLexer(object):
    """
    Given a tag like <a href="..."> or <link type="..." />, the TagLexer
    provides a few operations:

    - What is the tag?
    - Iterate through the attributes, giving (name, value_start_pos,
      value_end_pos)
    """

    def __init__(self, s):
        self.s = s
        self.start_pos = -1  # Invalid
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        """Reuse instances of this object."""
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos

        self.start_pos = start_pos
        self.end_pos = end_pos

    def TagString(self):
        return self.s[self.start_pos:self.end_pos]

    def TagName(self):
        # First event
        tok_id, start, end = next(self.Tokens())
        return self.s[start:end]

    def GetSpanForAttrValue(self, attr_name):
        """
        Used by oils_doc.py, for href shortcuts.
        """
        # Algorithm: search for QuotedValue or UnquotedValue after AttrName
        # TODO: Could also cache these

        events = self.Tokens()
        val = (-1, -1)
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == AttrName:
                    name = self.s[start:end]
                    if name == attr_name:
                        # The value should come next
                        tok_id, start, end = next(events)
                        assert tok_id in (QuotedValue, UnquotedValue,
                                          MissingValue), TokenName(tok_id)
                        val = start, end
                        break

        except StopIteration:
            pass
        return val

    def GetAttrRaw(self, attr_name):
        """
        Return the value, which may be UNESCAPED.
        """
        start, end = self.GetSpanForAttrValue(attr_name)
        if start == -1:
            return None
        return self.s[start:end]

    def AllAttrsRawSlice(self):
        """
        Get a list of triples like [('class', 3, 5), ('href', 9, 12)].
        """
        slices = []
        events = self.Tokens()
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == AttrName:
                    name = self.s[start:end]

                    # The value should come next
                    tok_id, start, end = next(events)
                    assert tok_id in (QuotedValue, UnquotedValue,
                                      MissingValue), TokenName(tok_id)
                    # Note: quoted values may have &amp;
                    # We would need ANOTHER lexer to unescape them, but we
                    # don't need that for ul-table
                    slices.append((name, start, end))
        except StopIteration:
            pass
        return slices

    def AllAttrsRaw(self):
        """
        Get a list of pairs [('class', 'foo'), ('href', '?foo=1&amp;bar=2')]

        The quoted values may be escaped.  We would need another lexer to
        unescape them.
        """
        slices = self.AllAttrsRawSlice()
        pairs = []
        for name, start, end in slices:
            pairs.append((name, self.s[start:end]))
        return pairs

    def Tokens(self):
        """
        Yields a sequence of tokens: Tag (AttrName AttrValue?)*

        Where each Token is (Type, start_pos, end_pos)

        Note that start and end are NOT redundant!  We skip over some unwanted
        characters.
        """
        m = _TAG_RE.match(self.s, self.start_pos + 1)
        if not m:
            raise RuntimeError("Couldn't find HTML tag in %r" %
                               self.TagString())
        yield TagName, m.start(1), m.end(1)

        pos = m.end(0)
        #log('POS %d', pos)

        while True:
            # don't search past the end
            m = _ATTR_RE.match(self.s, pos, self.end_pos)
            if not m:
                #log('BREAK pos %d', pos)
                break
            #log('AttrName %r', m.group(1))

            yield AttrName, m.start(1), m.end(1)

            #log('m.groups() %r', m.groups())
            if m.group(2) is not None:
                # double quoted
                yield QuotedValue, m.start(2), m.end(2)
            elif m.group(3) is not None:
                # single quoted - TODO: could have different token types
                yield QuotedValue, m.start(3), m.end(3)
            elif m.group(4) is not None:
                yield UnquotedValue, m.start(4), m.end(4)
            else:
                # <button disabled>
                end = m.end(0)
                yield MissingValue, end, end

            # Skip past the closing quote (or the end of the match)
            pos = m.end(0)

            #log('TOK %r', self.s)

        m = _TAG_LAST_RE.match(self.s, pos)
        #log('_TAG_LAST_RE match %r', self.s[pos:])
        if not m:
            # Extra data at end of tag.  TODO: add messages for all these.
            raise LexError(self.s, pos)

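# A hedged usage sketch of TagLexer (ours, not in the original module):
# positions refer to the enclosing document, so we Reset() to the span of one
# start tag.
def _DemoTagLexer():
    # type: () -> None
    s = '<a href="/" class="big">x</a>'
    tag_lexer = TagLexer(s)
    tag_lexer.Reset(0, 24)  # span of '<a href="/" class="big">'
    assert tag_lexer.TagName() == 'a'
    assert tag_lexer.GetAttrRaw('href') == '/'
    assert tag_lexer.AllAttrsRaw() == [('href', '/'), ('class', 'big')]
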
# This is similar but not identical to
#    " ([^>"\x00]*) "        # double quoted value
#  | ' ([^>'\x00]*) '        # single quoted value
#
# Note: for unquoted values, & isn't allowed, and thus &amp; and &#99; and
# &#x99; are not allowed.  We could relax that?
ATTR_VALUE_LEXER = CHAR_LEX + [
    (r'[^>&\x00]+', Tok.RawData),
    (r'.', Tok.Invalid),
]

ATTR_VALUE_LEXER = MakeLexer(ATTR_VALUE_LEXER)


class AttrValueLexer(object):
    """
    Lexes attribute values, like:

    <a href="foo=99&amp;bar">
    <a href='foo=99&amp;bar'>
    <a href=unquoted>
    """

    def __init__(self, s):
        self.s = s
        self.start_pos = -1  # Invalid
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        """Reuse instances of this object."""
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos

        self.start_pos = start_pos
        self.end_pos = end_pos

    def NumTokens(self):
        num_tokens = 0
        pos = self.start_pos
        for tok_id, end_pos in self.Tokens():
            if tok_id == Tok.Invalid:
                raise LexError(self.s, pos)
            pos = end_pos
            #log('pos %d', pos)
            num_tokens += 1
        return num_tokens

    def Tokens(self):
        pos = self.start_pos
        while pos < self.end_pos:
            # Find the first match, like above.
            # Note: frontend/match.py uses _LongestMatch(), which is
            # different!
            # TODO: reconcile them.  This lexer should be expressible in re2c.
            for pat, tok_id in ATTR_VALUE_LEXER:
                m = pat.match(self.s, pos)
                if m:
                    if 0:
                        tok_str = m.group(0)
                        log('token = %r', tok_str)

                    end_pos = m.end(0)
                    yield tok_id, end_pos
                    pos = end_pos
                    break
            else:
                raise AssertionError('Tok.Invalid rule should have matched')

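# A hedged example (ours, not in the original module): counting the value
# tokens inside href="foo=99&amp;bar".
def _DemoAttrValueLexer():
    # type: () -> None
    s = '<a href="foo=99&amp;bar">'
    val_lexer = AttrValueLexer(s)
    val_lexer.Reset(9, 23)  # span between the double quotes
    assert val_lexer.NumTokens() == 3  # RawData, CharEntity, RawData
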
def ReadUntilStartTag(it, tag_lexer, tag_name):
    """Find the next <foo>, returning its (start, end) positions.

    Raise ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == Tok.StartTag and tag_lexer.TagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No start tag %r' % tag_name)


def ReadUntilEndTag(it, tag_lexer, tag_name):
    """Find the next </foo>, returning its (start, end) positions.

    Raise ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == Tok.EndTag and tag_lexer.TagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No end tag %r' % tag_name)


CHAR_ENTITY = {
    'amp': '&',
    'lt': '<',
    'gt': '>',
    'quot': '"',
    'apos': "'",
}

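# A hedged usage sketch (ours, not in the original module): finding the span
# of the first <ul> by pairing ValidTokens with a reusable TagLexer.
def _DemoReadUntil():
    # type: () -> None
    s = '<div><ul><li>x</li></ul></div>'
    it = ValidTokens(s)
    tag_lexer = TagLexer(s)
    start, end = ReadUntilStartTag(it, tag_lexer, 'ul')
    assert s[start:end] == '<ul>', s[start:end]
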
def ToText(s, left_pos=0, right_pos=-1):
    """Given HTML, return text by unquoting &gt; and &lt; etc.

    Used by:
      doctools/oils_doc.py: PygmentsPlugin
      doctools/help_gen.py: HelpIndexCards

    In the latter case, we could process some tags, like:

    - Blue Link (not clickable, but still useful)
    - Red X

    That should be html.ToAnsi.
    """
    f = StringIO()
    out = Output(s, f, left_pos, right_pos)

    pos = left_pos
    for tok_id, end_pos in ValidTokens(s, left_pos, right_pos):
        if tok_id in (Tok.RawData, Tok.BadAmpersand, Tok.BadGreaterThan,
                      Tok.BadLessThan):
            out.SkipTo(pos)
            out.PrintUntil(end_pos)

        elif tok_id == Tok.CharEntity:  # &amp;

            entity = s[pos + 1:end_pos - 1]

            out.SkipTo(pos)
            out.Print(CHAR_ENTITY[entity])
            out.SkipTo(end_pos)

        # Not handling these yet
        elif tok_id == Tok.HexChar:
            raise AssertionError('Hex Char %r' % s[pos:pos + 20])

        elif tok_id == Tok.DecChar:
            raise AssertionError('Dec Char %r' % s[pos:pos + 20])

        else:
            # Skip everything else
            out.SkipTo(end_pos)

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()

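# A hedged example (ours, not in the original module): tags are dropped and
# entities are decoded.
def _DemoToText():
    # type: () -> None
    assert ToText('<b>x &amp; y</b>') == 'x & y'
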
# https://developer.mozilla.org/en-US/docs/Glossary/Void_element
VOID_ELEMENTS = [
    'area',
    'base',
    'br',
    'col',
    'embed',
    'hr',
    'img',
    'input',
    'link',
    'meta',
    'param',
    'source',
    'track',
    'wbr',
]

LEX_ATTRS = 1 << 1
LEX_QUOTED_VALUES = 1 << 2  # href="?x=42&amp;y=99"
NO_SPECIAL_TAGS = 1 << 3  # <script> <style>, VOID tags, etc.
BALANCED_TAGS = 1 << 4  # are tags balanced?


def Validate(contents, flags, counters):
    # type: (str, int, Counters) -> None

    tag_lexer = TagLexer(contents)
    val_lexer = AttrValueLexer(contents)

    no_special_tags = bool(flags & NO_SPECIAL_TAGS)
    lx = Lexer(contents, no_special_tags=no_special_tags)
    tokens = []
    start_pos = 0
    tag_stack = []
    while True:
        tok_id, end_pos = lx.Read()
        #log('TOP %s %r', TokenName(tok_id), contents[start_pos:end_pos])

        if tok_id == Tok.Invalid:
            raise LexError(contents, start_pos)
        if tok_id == Tok.EndOfStream:
            break

        tokens.append((tok_id, end_pos))

        if tok_id == Tok.StartEndTag:
            counters.num_start_end_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRawSlice()
            counters.num_attrs += len(all_attrs)
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                counters.num_val_tokens += val_lexer.NumTokens()

            counters.debug_attrs.extend(all_attrs)

        elif tok_id == Tok.StartTag:
            counters.num_start_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRawSlice()
            counters.num_attrs += len(all_attrs)
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                counters.num_val_tokens += val_lexer.NumTokens()

            counters.debug_attrs.extend(all_attrs)

            if flags & BALANCED_TAGS:
                tag_name = lx.CanonicalTagName()
                if flags & NO_SPECIAL_TAGS:
                    tag_stack.append(tag_name)
                else:
                    # e.g. <meta> is considered self-closing, like <meta/>
                    if tag_name not in VOID_ELEMENTS:
                        tag_stack.append(tag_name)

            counters.max_tag_stack = max(counters.max_tag_stack,
                                         len(tag_stack))
        elif tok_id == Tok.EndTag:
            if flags & BALANCED_TAGS:
                try:
                    expected = tag_stack.pop()
                except IndexError:
                    raise ParseError('Tag stack empty',
                                     s=contents,
                                     start_pos=start_pos)

                actual = lx.CanonicalTagName()
                if expected != actual:
                    raise ParseError(
                        'Got unexpected closing tag %r; opening tag was %r' %
                        (contents[start_pos:end_pos], expected),
                        s=contents,
                        start_pos=start_pos)

        start_pos = end_pos

    if len(tag_stack) != 0:
        raise ParseError('Missing closing tags at end of doc: %s' %
                         ' '.join(tag_stack),
                         s=contents,
                         start_pos=start_pos)

    counters.num_tokens += len(tokens)

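# A hedged example (ours, not in the original module): validating a small
# balanced document and reading the counters afterward.
def _DemoValidate():
    # type: () -> None
    counters = Counters()
    flags = LEX_ATTRS | LEX_QUOTED_VALUES | BALANCED_TAGS
    Validate('<p class="x">hi</p>', flags, counters)
    assert counters.num_start_tags == 1
    assert counters.num_attrs == 1
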
def ToXml(htm8_str):
    # type: (str) -> str

    # TODO:
    # 1. Lex it
    # 2. < & > must be escaped
    #    a. in raw data
    #    b. in quoted strings
    # 3. <script> turned into CDATA
    # 4. void tags turned into self-closing tags
    # 5. case-sensitive tag matching - not sure about this

    tag_lexer = TagLexer(htm8_str)
    val_lexer = AttrValueLexer(htm8_str)

    f = StringIO()
    out = Output(htm8_str, f)

    lx = Lexer(htm8_str)

    pos = 0
    while True:
        tok_id, end_pos = lx.Read()

        if tok_id == Tok.Invalid:
            raise LexError(htm8_str, pos)
        if tok_id == Tok.EndOfStream:
            break

        if tok_id in (Tok.RawData, Tok.CharEntity, Tok.HexChar, Tok.DecChar):
            out.PrintUntil(end_pos)
        elif tok_id in (Tok.StartTag, Tok.StartEndTag):
            tag_lexer.Reset(pos, end_pos)
            # TODO: reduce allocations here
            all_attrs = tag_lexer.AllAttrsRawSlice()
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                # TODO: get the kind of string
                #
                # Quoted: we need to replace & with &amp; and < with &lt;
                #         note > is not allowed
                # Unquoted: right now, we can just surround with double quotes
                #           because we don't allow any bad chars
                # Empty   : add "", so empty= becomes =""
                # Missing : add ="", so missing becomes missing=""

            tag_name = lx.CanonicalTagName()
            if tok_id == Tok.StartTag and tag_name in VOID_ELEMENTS:
                # TODO: instead of closing >, print />
                pass

        elif tok_id == Tok.BadAmpersand:
            # Print any pending input (e.g. a preceding tag, which the branch
            # above defers) before substituting the escape
            out.PrintUntil(pos)
            out.Print('&amp;')
            out.SkipTo(end_pos)

        elif tok_id == Tok.BadGreaterThan:
            out.PrintUntil(pos)
            out.Print('&gt;')
            out.SkipTo(end_pos)
        else:
            out.PrintUntil(end_pos)

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()

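# A hedged example (ours, not in the original module): ToXml currently
# escapes stray & and >; the TODOs above (CDATA, void tags) are not
# implemented yet.
def _DemoToXml():
    # type: () -> None
    assert ToXml('<p>a & b</p>') == '<p>a &amp; b</p>'
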
class Counters(object):

    def __init__(self):
        self.num_tokens = 0
        self.num_start_tags = 0
        self.num_start_end_tags = 0
        self.num_attrs = 0
        self.max_tag_stack = 0
        self.num_val_tokens = 0

        self.debug_attrs = []


def main(argv):
    action = argv[1]

    if action == 'tokens':
        contents = sys.stdin.read()

        lx = Lexer(contents)
        start_pos = 0
        while True:
            tok_id, end_pos = lx.Read()
            if tok_id == Tok.Invalid:
                raise LexError(contents, start_pos)
            if tok_id == Tok.EndOfStream:
                break

            frag = contents[start_pos:end_pos]
            log('%d %s %r', end_pos, TokenName(tok_id), frag)
            start_pos = end_pos

        return 0

    elif action in ('lex-htm8', 'parse-htm8', 'parse-xml'):

        errors = []
        counters = Counters()

        flags = LEX_ATTRS | LEX_QUOTED_VALUES
        if action.startswith('parse-'):
            flags |= BALANCED_TAGS
        if action == 'parse-xml':
            flags |= NO_SPECIAL_TAGS

        i = 0
        for line in sys.stdin:
            filename = line.strip()
            with open(filename) as f:
                contents = f.read()

            try:
                Validate(contents, flags, counters)
            except LexError as e:
                log('Lex error in %r: %s', filename, e)
                errors.append((filename, e))
            except ParseError as e:
                log('Parse error in %r: %s', filename, e)
                errors.append((filename, e))
            i += 1

        log('')
        log('%10d tokens', counters.num_tokens)
        log('%10d start/end tags', counters.num_start_end_tags)
        log('%10d start tags', counters.num_start_tags)
        log('%10d attrs', counters.num_attrs)
        log('%10d max tag stack depth', counters.max_tag_stack)
        log('%10d attr val tokens', counters.num_val_tokens)
        log('%10d errors', len(errors))
        if len(errors):
            return 1
        return 0

    elif action == 'todo':
        # Other algorithms:
        #
        # - select first subtree with given ID
        #   - this requires understanding the void tags I suppose
        # - select all subtrees that have a class
        # - materialize DOM

        # Safe-HTM8?  This is a filter
        return 0

    else:
        raise RuntimeError('Invalid action %r' % action)


if __name__ == '__main__':
    sys.exit(main(sys.argv))