OILS / data_lang / htm8.py View on Github | oils.pub

670 lines, 280 significant
"""data_lang/htm8.py

TODO

API:
- Get rid of AttrValueLexer - this should be in the TagLexer
  - this also means that unquoted values can be more similar
  - We can use a single lexer mode for everything inside <>
    - the SPACE is the only difference
- Deprecate tag_lexer.GetTagName() in favor of lx.CanonicalTagName() or
  _LiteralTagName()
- UTF-8 check, like JSON8
- re2c
  - port lexer, which will fix static typing issues
  - the abstraction needs to support submatch?
    - for finding the end of a tag, etc.?

- LexError and ParseError need details
  - harmonize with data_lang/j8.py, which uses error.Decode(msg, ...,
    cur_line_num)
"""
22
23import re
24
25from typing import Dict, List, Tuple, Optional, IO, Iterator, Any
26
27from _devbuild.gen.htm8_asdl import (h8_id, h8_id_t, h8_tag_id, h8_tag_id_t,
28 h8_tag_id_str)
29from doctools.util import log
30
31
class LexError(Exception):
    """Raised when the input cannot be tokenized.

    Examples of lex errors:

    - h8_id.Invalid, like <> or &&
    - Unclosed <!-- <? <![CDATA[ <script> <style>
    """

    def __init__(self, s, start_pos):
        # type: (str, int) -> None
        self.s = s  # the document being lexed
        self.start_pos = start_pos  # offset where lexing failed

    def __str__(self):
        # type: () -> str
        # Show a 20-character snippet starting at the error position.
        snippet = self.s[self.start_pos:self.start_pos + 20]
        return '(LexError %r)' % snippet
48
49
def FindLineNum(s, error_pos):
    # type: (str, int) -> int
    """Return the 1-based line number containing offset error_pos in s.

    This equals the number of newlines strictly before error_pos, plus one.
    Positions at or past the end of s report the last line.
    """
    # str.count scans in C; it replaces the original hand-written find()
    # loop with identical results.
    return s.count('\n', 0, error_pos) + 1
64
65
class ParseError(Exception):
    """Raised for structural errors above the lexer level.

    Examples of parse errors:

    - unbalanced tag structure
    - ul_table.py errors
    """

    def __init__(self, msg, s=None, start_pos=-1):
        # type: (str, Optional[str], int) -> None
        self.msg = msg
        self.s = s  # optional: the document, for context in __str__
        self.start_pos = start_pos  # optional: error offset within s

    def __str__(self):
        # type: () -> str
        if self.s is None:
            line_num = -1
            snippet = ''
        else:
            # A document without a position makes no sense.
            assert self.start_pos != -1, self.start_pos
            snippet = self.s[self.start_pos:self.start_pos + 20]
            line_num = FindLineNum(self.s, self.start_pos)
        return 'line %d: %r %r' % (line_num, self.msg, snippet)
92
93
class Output(object):
    """Takes an underlying input buffer and an output file. Maintains a
    position in the input buffer.

    Print FROM the input or print new text to the output.
    """

    def __init__(self, s, f, left_pos=0, right_pos=-1):
        # type: (str, IO[str], int, int) -> None
        self.s = s
        self.f = f
        self.pos = left_pos
        # -1 means "until the end of s"
        if right_pos == -1:
            self.right_pos = len(s)
        else:
            self.right_pos = right_pos

    def SkipTo(self, pos):
        # type: (int) -> None
        """Advance the input position without emitting anything."""
        self.pos = pos

    def PrintUntil(self, pos):
        # type: (int) -> None
        """Emit input text from the current position up to pos."""
        self.f.write(self.s[self.pos:pos])
        self.pos = pos

    def PrintTheRest(self):
        # type: () -> None
        """Emit the remaining input, up to right_pos."""
        self.PrintUntil(self.right_pos)

    def Print(self, s):
        # type: (str) -> None
        """Write new text s directly to the output file."""
        self.f.write(s)
129
130
def MakeLexer(rules):
    """Compile each (pattern, token id) rule; patterns use re.VERBOSE."""
    return [(re.compile(pat, re.VERBOSE), tok_id) for pat, tok_id in rules]
133
134
135#
136# Eggex
137#
138# Tag = / ~['>']+ /
139
140# Is this valid? A single character?
141# Tag = / ~'>'* /
142
143# Maybe better: / [NOT '>']+/
144# capital letters not allowed there?
145#
146# But then this is confusing:
147# / [NOT ~digit]+/
148#
149# / [NOT digit] / is [^\d]
150# / ~digit / is \D
151#
152# Or maybe:
153#
154# / [~ digit]+ /
155# / [~ '>']+ /
156# / [NOT '>']+ /
157
158# End = / '</' Tag '>' /
159# StartEnd = / '<' Tag '/>' /
160# Start = / '<' Tag '>' /
161#
162# EntityRef = / '&' dot{* N} ';' /
163
164# Tag name, or attribute name
165# colon is used in XML
166
167# https://www.w3.org/TR/xml/#NT-Name
168# Hm there is a lot of unicode stuff. We are simplifying parsing
169
170_NAME = r'[a-zA-Z][a-zA-Z0-9:_\-]*' # must start with letter
171
# Character references, shared by the main lexer and the attr value lexer.
# https://www.w3.org/TR/xml/#sec-references
CHAR_LEX = [
    (r'&\# [0-9]+ ;', h8_id.DecChar),  # decimal, e.g. &#123;
    (r'&\# x[0-9a-fA-F]+ ;', h8_id.HexChar),  # hex, e.g. &#x7F;
    (r'& %s ;' % _NAME, h8_id.CharEntity),  # named, e.g. &amp;
    # A bare & that doesn't start a reference gets its own token,
    # so callers can allow or reject it.
    (r'&', h8_id.BadAmpersand),
]
181
HTM8_LEX = CHAR_LEX + [
    # Terminated lazily by the Lexer (see the CommentBegin handling).
    (r'<!--', h8_id.CommentBegin),

    # Processing instructions are used for the XML header:
    #   <?xml version="1.0" encoding="UTF-8"?>
    # They are technically XML-only, but in HTML5 they are another kind of
    # comment:
    #
    # https://developer.mozilla.org/en-US/docs/Web/API/ProcessingInstruction
    #
    (r'<\?', h8_id.ProcessingBegin),

    # Not necessary in HTML5, but occurs in XML
    (r'<!\[CDATA\[', h8_id.CDataBegin),  # <![CDATA[

    # Markup declarations
    # - In HTML5, there is only <!DOCTYPE html>
    # - XML has 4 more declarations: <!ELEMENT ...> ATTLIST ENTITY NOTATION
    #   - these seem to be part of DTD
    # - it's useful to skip these, and be able to parse the rest of the
    #   document
    # - Note: < is allowed?
    (r'<! [^>\x00]+ >', h8_id.Decl),

    # Tags
    # Notes:
    # - We look for a valid tag name, but we don't validate attributes.
    #   That's done in the tag lexer.
    # - We don't allow leading whitespace
    (r'</ (%s) >' % _NAME, h8_id.EndTag),
    # self-closing <br/> comes before StartTag
    # could/should these be collapsed into one rule?
    (r'< (%s) [^>\x00]* />' % _NAME, h8_id.StartEndTag),  # end </a>
    (r'< (%s) [^>\x00]* >' % _NAME, h8_id.StartTag),  # start <a>

    # HTML5 allows unescaped > in raw data, but < is not allowed.
    # https://stackoverflow.com/questions/10462348/right-angle-bracket-in-html
    #
    # - My early blog has THREE errors when disallowing >
    # - So do some .wwz files
    (r'[^&<>\x00]+', h8_id.RawData),
    (r'>', h8_id.BadGreaterThan),
    # Anything else, including a stray <, is an error.
    (r'.', h8_id.Invalid),
]

# Old notes:
#
# Non-greedy matches are regular and can be matched in linear time with RE2.
# https://news.ycombinator.com/item?id=27099798
#
# Maybe try combining all of these for speed.

# . is any char except newline
# https://re2c.org/manual/manual_c.html

# Discarded options:
#(r'<!-- .*? -->', h8_id.Comment),
# Hack from Claude: \s\S instead of re.DOTALL. I don't like this
#(r'<!-- [\s\S]*? -->', h8_id.Comment),
#(r'<!-- (?:.|[\n])*? -->', h8_id.Comment),

HTM8_LEX_COMPILED = MakeLexer(HTM8_LEX)
246
247
class Lexer(object):
    """Top-level lexer for an HTM8 document.

    Yields (token id, end position) pairs; the token text is the slice of
    the input between successive end positions.
    """

    def __init__(self, s, left_pos=0, right_pos=-1, no_special_tags=False):
        # type: (str, int, int, bool) -> None
        self.s = s
        self.pos = left_pos
        self.right_pos = len(s) if right_pos == -1 else right_pos
        # When true, <script> and <style> bodies are NOT treated as raw text.
        self.no_special_tags = no_special_tags

        # Lazily compiled patterns for LookAhead(), keyed by regex text.
        self.cache = {}  # type: Dict[str, Any]

        # either </script> or </style> - we search until we see that
        self.search_state = None  # type: Optional[str]

        # Position of tag name, if applicable
        # - Set after you get a StartTag, EndTag, or StartEndTag
        # - Unset on other tags
        self.tag_pos_left = -1
        self.tag_pos_right = -1

    def _Read(self):
        # type: () -> Tuple[h8_id_t, int]
        """Compute the next token WITHOUT advancing self.pos."""
        if self.pos == self.right_pos:
            return h8_id.EndOfStream, self.pos

        assert self.pos < self.right_pos, self.pos

        if self.search_state is not None and not self.no_special_tags:
            # TODO: case-insensitive search for </SCRIPT> <SCRipt> ?
            #
            # Another strategy: enter a mode where we find ONLY the end tag
            # regex, and any data that's not <, and then check the canonical
            # tag name for 'script' or 'style'.
            found = self.s.find(self.search_state, self.pos)
            if found == -1:
                # unterminated <script> or <style>
                raise LexError(self.s, self.pos)
            self.search_state = None
            # The returned position is the BEGINNING of the end tag.
            return h8_id.HtmlCData, found

        # First matching rule wins.
        # Note: frontend/match.py uses _LongestMatch(), which is different!
        # TODO: reconcile them. This lexer should be expressible in re2c.
        for pat, tok_id in HTM8_LEX_COMPILED:
            m = pat.match(self.s, self.pos)
            if not m:
                continue

            if tok_id in (h8_id.StartTag, h8_id.EndTag, h8_id.StartEndTag):
                # Remember where the tag name is, for CanonicalTagName().
                self.tag_pos_left = m.start(1)
                self.tag_pos_right = m.end(1)
            else:
                # Reset state
                self.tag_pos_left = -1
                self.tag_pos_right = -1

            # Lazy termination: scan forward for the closing delimiter of
            # comments / processing instructions / CDATA sections.
            if tok_id == h8_id.CommentBegin:
                end = self.s.find('-->', self.pos)
                if end == -1:
                    raise LexError(self.s, self.pos)  # unterminated <!--
                return h8_id.Comment, end + 3  # past -->

            if tok_id == h8_id.ProcessingBegin:
                end = self.s.find('?>', self.pos)
                if end == -1:
                    raise LexError(self.s, self.pos)  # unterminated <?
                return h8_id.Processing, end + 2  # past ?>

            if tok_id == h8_id.CDataBegin:
                end = self.s.find(']]>', self.pos)
                if end == -1:
                    raise LexError(self.s, self.pos)  # unterminated <![CDATA[
                return h8_id.CData, end + 3  # past ]]>

            if tok_id == h8_id.StartTag:
                # TODO: reduce allocations
                if (self.TagNameEquals('script') or
                        self.TagNameEquals('style')):
                    # <SCRipt a=b> ends at the literal </SCRipt>
                    self.search_state = '</' + self._LiteralTagName() + '>'

            return tok_id, m.end()

        raise AssertionError('h8_id.Invalid rule should have matched')

    def TagNameEquals(self, expected):
        # type: (str) -> bool
        """Compare the current tag name, case-insensitively."""
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right
        # TODO: In C++ this could compare in place, without an allocation.
        return expected == self.CanonicalTagName()

    def _LiteralTagName(self):
        # type: () -> str
        """Tag name exactly as written in the source (case preserved)."""
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right
        return self.s[self.tag_pos_left:self.tag_pos_right]

    def CanonicalTagName(self):
        # type: () -> str
        """Lower-cased tag name."""
        name = self._LiteralTagName()
        # Most tags are already lower case, so avoid allocating in that case.
        # TODO: this could go in the mycpp runtime?
        if name.islower():
            return name
        return name.lower()

    def Read(self):
        # type: () -> Tuple[h8_id_t, int]
        """Return the next (token id, end position), advancing the lexer."""
        tok_id, end_pos = self._Read()
        self.pos = end_pos  # advance
        return tok_id, end_pos

    def LookAhead(self, regex):
        # type: (str) -> bool
        """Test whether `regex` matches at the current position."""
        # Cache the compiled pattern. This could also be LookAheadFor(THEAD)
        # or something.
        pat = self.cache.get(regex)
        if pat is None:
            pat = re.compile(regex)
            self.cache[regex] = pat
        return pat.match(self.s, self.pos) is not None
380
381
# Tag names:
#   Match <a or </a
#   Match <h2, but not <2h
#
# HTML 5 doesn't restrict tag names at all:
#   https://html.spec.whatwg.org/#toc-syntax
# XML allows : - .
#   https://www.w3.org/TR/xml/#NT-NameChar
#
# Namespaces for MathML, SVG; XLink, XML, XMLNS
#   https://infra.spec.whatwg.org/#namespaces
#
# Allow - for td-attrs

# Unquoted attribute values: be very lenient - just no whitespace or special
# HTML chars.  I don't think this is more lenient than HTML5, though we
# should check.
_UNQUOTED_VALUE = r'''[^ \t\r\n<>&"'\x00]*'''

# TODO: we don't need to capture the tag name here? That's done at the top
# level
_TAG_RE = re.compile(r'/? \s* (%s)' % _NAME, re.VERBOSE)

# Matches the end of a tag: optional /, then >
_TAG_LAST_RE = re.compile(r'\s* /? >', re.VERBOSE)

# Matches one attribute, e.g. href="foo".
# Notes:
# - In HTML5 and XML, single quoted attributes are also valid.
# - <button disabled> is standard usage (value omitted entirely).
# - This used to allow whitespace around =, as in <a foo = "bar"> (XML),
#   but <a foo= bar> is TWO attributes in HTML5, so the space is
#   problematic.
_ATTR_RE = re.compile(
    r'''
\s+                       # Leading whitespace is required
(%s)                      # Attribute name
(?:                       # Optional attribute value
  \s* = \s*               # Spaces allowed around =
  (?:
    " ([^>"\x00]*) "      # double quoted value
  | ' ([^>'\x00]*) '      # single quoted value
  | (%s)                  # Attribute value
  )
)?
''' % (_NAME, _UNQUOTED_VALUE), re.VERBOSE)
433
434
class TagLexer(object):
    """
    Given a tag like <a href="..."> or <link type="..." />, the TagLexer
    provides a few operations:

    - What is the tag?
    - Iterate through the attributes, giving (name, value_start_pos,
      value_end_pos)
    """

    def __init__(self, s):
        # type: (str) -> None
        self.s = s
        self.start_pos = -1  # Invalid until Reset()
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        # type: (int, int) -> None
        """Point this reusable instance at a new tag span."""
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos
        self.start_pos = start_pos
        self.end_pos = end_pos

    def WholeTagString(self):
        # type: () -> str
        """Return the entire tag string, e.g. <a href='foo'>"""
        return self.s[self.start_pos:self.end_pos]

    def GetTagName(self):
        # type: () -> str
        """Return the tag name, which is always the first token."""
        tok_id, start, end = next(self.Tokens())
        return self.s[start:end]

    def GetSpanForAttrValue(self, attr_name):
        # type: (str) -> Tuple[int, int]
        """
        Used by oils_doc.py, for href shortcuts
        """
        # Algorithm: search for QuotedValue or UnquotedValue after AttrName
        # TODO: Could also cache these
        events = self.Tokens()
        val = (-1, -1)
        try:
            while True:
                tok_id, start, end = next(events)
                if (tok_id == h8_tag_id.AttrName and
                        self.s[start:end] == attr_name):
                    # The value token always comes next
                    tok_id, start, end = next(events)
                    assert tok_id in (
                        h8_tag_id.QuotedValue, h8_tag_id.UnquotedValue,
                        h8_tag_id.MissingValue), h8_tag_id_str(tok_id)
                    val = start, end
                    break
        except StopIteration:
            pass
        return val

    def GetAttrRaw(self, attr_name):
        # type: (str) -> Optional[str]
        """
        Return the value, which may be UNESCAPED.
        """
        start, end = self.GetSpanForAttrValue(attr_name)
        return None if start == -1 else self.s[start:end]

    def AllAttrsRawSlice(self):
        # type: () -> List[Tuple[str, int, int]]
        """
        Get a list of pairs [('class', 3, 5), ('href', 9, 12)]
        """
        slices = []
        events = self.Tokens()
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id != h8_tag_id.AttrName:
                    continue
                name = self.s[start:end]

                # The value token always comes next
                tok_id, start, end = next(events)
                assert tok_id in (
                    h8_tag_id.QuotedValue, h8_tag_id.UnquotedValue,
                    h8_tag_id.MissingValue), h8_tag_id_str(tok_id)
                # Note: quoted values may have &amp;
                # We would need ANOTHER lexer to unescape them, but we
                # don't need that for ul-table
                slices.append((name, start, end))
        except StopIteration:
            pass
        return slices

    def AllAttrsRaw(self):
        # type: () -> List[Tuple[str, str]]
        """
        Get a list of pairs [('class', 'foo'), ('href', '?foo=1&amp;bar=2')]

        The quoted values may be escaped. We would need another lexer to
        unescape them.
        """
        return [(name, self.s[start:end])
                for name, start, end in self.AllAttrsRawSlice()]

    def Tokens(self):
        # type: () -> Iterator[Tuple[h8_tag_id_t, int, int]]
        """
        Yields a sequence of tokens: Tag (AttrName AttrValue?)*

        Where each Token is (Type, start_pos, end_pos)

        Note that start and end are NOT redundant! We skip over some
        unwanted characters.
        """
        # Skip the leading <, then match the (possibly /-prefixed) tag name.
        m = _TAG_RE.match(self.s, self.start_pos + 1)
        if not m:
            raise RuntimeError("Couldn't find HTML tag in %r" %
                               self.WholeTagString())
        yield h8_tag_id.TagName, m.start(1), m.end(1)

        pos = m.end(0)

        while True:
            # Don't search past the end of this tag.
            m = _ATTR_RE.match(self.s, pos, self.end_pos)
            if not m:
                break

            yield h8_tag_id.AttrName, m.start(1), m.end(1)

            if m.group(2) is not None:
                # double quoted
                yield h8_tag_id.QuotedValue, m.start(2), m.end(2)
            elif m.group(3) is not None:
                # single quoted - TODO: could have different token types
                yield h8_tag_id.QuotedValue, m.start(3), m.end(3)
            elif m.group(4) is not None:
                # unquoted
                yield h8_tag_id.UnquotedValue, m.start(4), m.end(4)
            else:
                # <button disabled> - no value at all
                end = m.end(0)
                yield h8_tag_id.MissingValue, end, end

            # Skip past the closing quote
            pos = m.end(0)

        m = _TAG_LAST_RE.match(self.s, pos)
        if not m:
            # Extra data at end of tag. TODO: add messages for all these.
            raise LexError(self.s, pos)
601
602
# This is similar but not identical to the quoted-value rules in _ATTR_RE:
#     " ([^>"\x00]*) "      double quoted value
#   | ' ([^>'\x00]*) '      single quoted value
#
# Note: for unquoted values, & isn't allowed, and thus &amp; and &#99; and
# &#x99; are not allowed. We could relax that?
ATTR_VALUE_LEX = CHAR_LEX + [
    (r'[^>&\x00]+', h8_id.RawData),
    (r'.', h8_id.Invalid),
]

ATTR_VALUE_LEX_COMPILED = MakeLexer(ATTR_VALUE_LEX)
615
616
class AttrValueLexer(object):
    """Lexes the text of a single attribute value, e.g. the href in:

    <a href="foo=99&amp;bar">
    <a href='foo=99&amp;bar'>
    <a href=unquoted>
    """

    def __init__(self, s):
        # type: (str) -> None
        self.s = s
        self.start_pos = -1  # Invalid until Reset()
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        # type: (int, int) -> None
        """Point this reusable instance at a new value span."""
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos
        self.start_pos = start_pos
        self.end_pos = end_pos

    def NumTokens(self):
        # type: () -> int
        """Count the value's tokens, raising LexError on invalid input."""
        num_tokens = 0
        pos = self.start_pos
        for tok_id, end_pos in self.Tokens():
            if tok_id == h8_id.Invalid:
                raise LexError(self.s, pos)
            pos = end_pos
            num_tokens += 1
        return num_tokens

    def Tokens(self):
        # type: () -> Iterator[Tuple[h8_id_t, int]]
        """Yield (token id, end position) pairs over [start_pos, end_pos)."""
        pos = self.start_pos
        while pos < self.end_pos:
            # First matching rule wins, like the main Lexer.
            # Note: frontend/match.py uses _LongestMatch(), which is
            # different!  TODO: reconcile them; this should be expressible
            # in re2c.
            for pat, tok_id in ATTR_VALUE_LEX_COMPILED:
                m = pat.match(self.s, pos)
                if m:
                    #log('token = %r', m.group(0))
                    end_pos = m.end(0)
                    yield tok_id, end_pos
                    pos = end_pos
                    break
            else:
                raise AssertionError('h8_id.Invalid rule should have matched')