data_lang/htm8.py

OILS / data_lang / htm8.py View on Github | oils.pub

648 lines, 280 significant

1	import re
2
3	from typing import Dict, List, Tuple, Optional, IO, Iterator, Any
4
5	from _devbuild.gen.htm8_asdl import (h8_id, h8_id_t, h8_tag_id, h8_tag_id_t,
6	h8_tag_id_str)
7	from doctools.util import log
8
9
class LexError(Exception):
    """
    Raised when the input cannot be lexed.

    Examples of lex errors:

    - h8_id.Invalid, like <> or &&
    - Unclosed <!-- <? <![CDATA[ <script> <style>
    """

    def __init__(self, s, start_pos):
        # type: (str, int) -> None
        # The whole input string, and the offset in it where lexing failed.
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        # type: () -> str
        # Show at most 20 characters of input, starting at the error.
        begin = self.start_pos
        snippet = self.s[begin:begin + 20]
        return '(LexError %r)' % snippet
26
27
def FindLineNum(s, error_pos):
    # type: (str, int) -> int
    """Return the 1-based number of the line that contains error_pos.

    A newline character belongs to the line it terminates, so the result is
    one more than the number of newlines strictly before error_pos.
    """
    # A negative end index would be interpreted relative to the end of the
    # string by count(), so clamp it: nothing precedes position 0.
    limit = error_pos if error_pos > 0 else 0
    return 1 + s.count('\n', 0, limit)
42
43
class ParseError(Exception):
    """
    Raised for errors above the lexer level.

    Examples of parse errors

    - unbalanced tag structure
    - ul_table.py errors
    """

    def __init__(self, msg, s=None, start_pos=-1):
        # type: (str, Optional[str], int) -> None
        self.msg = msg
        # Optional location: the input string and an offset into it.
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        # type: () -> str
        # Without an input string there is no location to report.
        line_num = -1
        snippet = ''

        if self.s is not None:
            # A string without a position is a caller error.
            assert self.start_pos != -1, self.start_pos
            begin = self.start_pos
            snippet = self.s[begin:begin + 20]
            line_num = FindLineNum(self.s, begin)

        return 'line %d: %r %r' % (line_num, self.msg, snippet)
70
71
class Output(object):
    """Pairs an input buffer with an output file, and tracks a position in
    the input buffer.

    Callers either copy text FROM the input (PrintUntil, PrintTheRest), drop
    input text (SkipTo), or emit brand new text (Print).
    """

    def __init__(self, s, f, left_pos=0, right_pos=-1):
        # type: (str, IO[str], int, int) -> None
        self.s = s
        self.f = f
        self.pos = left_pos
        # -1 means "up to the end of the input"
        if right_pos == -1:
            right_pos = len(s)
        self.right_pos = right_pos

    def SkipTo(self, pos):
        # type: (int) -> None
        """Move the input position without writing anything."""
        self.pos = pos

    def PrintUntil(self, pos):
        # type: (int) -> None
        """Write the input from the current position up to pos, then move
        the current position there."""
        self.f.write(self.s[self.pos:pos])
        self.pos = pos

    def PrintTheRest(self):
        # type: () -> None
        """Write the input from the current position up to right_pos."""
        self.PrintUntil(self.right_pos)

    def Print(self, s):
        # type: (str) -> None
        """Write new text to the output; the input position is unchanged."""
        self.f.write(s)
107
108
def MakeLexer(rules):
    """Compile a list of (pattern, token id) rules.

    Every pattern is compiled with re.VERBOSE, so unescaped whitespace in a
    pattern is ignored.  Returns a list of (compiled pattern, token id)
    pairs in the same order as the rules.
    """
    compiled = []
    for pattern, tok_id in rules:
        compiled.append((re.compile(pattern, re.VERBOSE), tok_id))
    return compiled
111
112
113	#
114	# Eggex
115	#
116	# Tag = / ~['>']+ /
117
118	# Is this valid? A single character?
119	# Tag = / ~'>'* /
120
121	# Maybe better: / [NOT '>']+/
122	# capital letters not allowed there?
123	#
124	# But then this is confusing:
125	# / [NOT ~digit]+/
126	#
127	# / [NOT digit] / is [^\d]
128	# / ~digit / is \D
129	#
130	# Or maybe:
131	#
132	# / [~ digit]+ /
133	# / [~ '>']+ /
134	# / [NOT '>']+ /
135
136	# End = / '</' Tag '>' /
137	# StartEnd = / '<' Tag '/>' /
138	# Start = / '<' Tag '>' /
139	#
140	# EntityRef = / '&' dot{* N} ';' /
141
142	# Tag name, or attribute name
143	# colon is used in XML
144
145	# https://www.w3.org/TR/xml/#NT-Name
146	# Hm there is a lot of unicode stuff. We are simplifying parsing
147
# Pattern fragment for a tag name or attribute name.  It is interpolated
# into the larger patterns below with %s.
_NAME = r'[a-zA-Z][a-zA-Z0-9:_\-]*'  # must start with letter

# Rules for character references.  These (pattern, token id) pairs are
# compiled with re.VERBOSE by MakeLexer(), so the spaces inside the patterns
# are ignored, and a literal # has to be written as \#.
#
# The rules are tried in order and the first match wins, so the bare '&' rule
# must stay last.
CHAR_LEX = [
    # Characters
    # https://www.w3.org/TR/xml/#sec-references
    (r'&\# [0-9]+ ;', h8_id.DecChar),
    (r'&\# x[0-9a-fA-F]+ ;', h8_id.HexChar),
    (r'& %s ;' % _NAME, h8_id.CharEntity),
    # Allow unquoted, and quoted
    (r'&', h8_id.BadAmpersand),
]
159
# Rules for the top-level lexer: the character rules, followed by markup and
# raw data.
#
# Lexer._Read() tries the compiled rules in order and takes the FIRST pattern
# that matches at the current position, so the order of this list matters.
# Patterns are compiled with re.VERBOSE by MakeLexer(), so the spaces inside
# them are ignored.
HTM8_LEX = CHAR_LEX + [
    # Only the opening delimiter is matched here; Lexer._Read() then searches
    # for the closing -->
    (r'<!--', h8_id.CommentBegin),

    # Processing instruction are used for the XML header:
    # <?xml version="1.0" encoding="UTF-8"?>
    # They are technically XML-only, but in HTML5, they are another kind of
    # comment:
    #
    #   https://developer.mozilla.org/en-US/docs/Web/API/ProcessingInstruction
    #
    (r'<\?', h8_id.ProcessingBegin),
    # Not necessary in HTML5, but occurs in XML
    (r'<!\[CDATA\[', h8_id.CDataBegin),  # <![CDATA[

    # Markup declarations
    # - In HTML5, there is only <!DOCTYPE html>
    # - XML has 4 more declarations: <!ELEMENT ...> ATTLIST ENTITY NOTATION
    #   - these seem to be part of DTD
    #   - it's useful to skip these, and be able to parse the rest of the document
    # - Note: < is allowed?
    (r'<! [^>\x00]+ >', h8_id.Decl),

    # Tags
    # Notes:
    # - We look for a valid tag name, but we don't validate attributes.
    #   That's done in the tag lexer.
    # - We don't allow leading whitespace
    # - Group 1 is the tag name; Lexer._Read() records its span.
    (r'</ (%s) >' % _NAME, h8_id.EndTag),
    # self-closing <br/> comes before StartTag, because the StartTag pattern
    # would also match it
    # could/should these be collapsed into one rule?
    (r'< (%s) [^>\x00]* />' % _NAME, h8_id.StartEndTag),  # self-closing <br/>
    (r'< (%s) [^>\x00]* >' % _NAME, h8_id.StartTag),  # start <a>

    # HTML5 allows unescaped > in raw data, but < is not allowed.
    # https://stackoverflow.com/questions/10462348/right-angle-bracket-in-html
    #
    # - My early blog has THREE errors when disallowing >
    # - So do some .wwz files
    (r'[^&<>\x00]+', h8_id.RawData),
    (r'>', h8_id.BadGreaterThan),
    # < is an error
    # Catch-all for whatever the rules above rejected.
    (r'.', h8_id.Invalid),
]

# Old notes:
#
# Non-greedy matches are regular and can be matched in linear time
# with RE2.
#
#     https://news.ycombinator.com/item?id=27099798
#
# Maybe try combining all of these for speed.

# . is any char except newline
# https://re2c.org/manual/manual_c.html

# Discarded options
#(r'<!-- .*? -->', h8_id.Comment),

# Hack from Claude: \s\S instead of re.DOTALL.  I don't like this
#(r'<!-- [\s\S]*? -->', h8_id.Comment),
#(r'<!-- (?:.|[\n])*? -->', h8_id.Comment),

# List of (compiled pattern, token id) pairs, in the same order as HTM8_LEX.
HTM8_LEX_COMPILED = MakeLexer(HTM8_LEX)
224
225
class Lexer(object):
    """Top-level HTM8 lexer over the region s[left_pos:right_pos].

    Read() returns (token id, end position).  A token starts where the
    previous one ended, so callers track the start position themselves.

    Tokenizing never looks past right_pos: pattern matches and searches for
    closing delimiters are both bounded by it, so a token can't extend
    beyond the region.  LookAhead() is the exception; it is not bounded.
    """

    def __init__(self, s, left_pos=0, right_pos=-1, no_special_tags=False):
        # type: (str, int, int, bool) -> None
        self.s = s
        self.pos = left_pos
        # -1 means "up to the end of the string"
        self.right_pos = len(s) if right_pos == -1 else right_pos
        # If True, the contents of <script> and <style> are lexed like any
        # other markup, rather than skipped as one HtmlCData token.
        self.no_special_tags = no_special_tags

        # string -> compiled regex pattern object
        self.cache = {}  # type: Dict[str, Any]

        # either </script> or </style> - we search until we see that
        self.search_state = None  # type: Optional[str]

        # Position of tag name, if applicable
        # - Set after you get a StartTag, EndTag, or StartEndTag
        # - Unset on other tags
        self.tag_pos_left = -1
        self.tag_pos_right = -1

    def _EndOfDelimited(self, closer):
        # type: (str) -> int
        """Return the position just past the next `closer` string.

        The search starts at the current position and stops at right_pos.
        Raises LexError at the current position when `closer` isn't found,
        i.e. for an unterminated <!-- or <? or <![CDATA[
        """
        pos = self.s.find(closer, self.pos, self.right_pos)
        if pos == -1:
            raise LexError(self.s, self.pos)
        return pos + len(closer)

    def _Read(self):
        # type: () -> Tuple[h8_id_t, int]
        """Return (token id, end position) WITHOUT advancing self.pos.

        Raises LexError for an unterminated comment, processing instruction,
        CDATA section, <script> or <style>.
        """
        if self.pos == self.right_pos:
            return h8_id.EndOfStream, self.pos

        assert self.pos < self.right_pos, self.pos

        if self.search_state is not None and not self.no_special_tags:
            # TODO: case-insensitive search for </SCRIPT> <SCRipt> ?
            #
            # Another strategy: enter a mode where we find ONLY the end tag
            # regex, and any data that's not <, and then check the canonical
            # tag name for 'script' or 'style'.
            pos = self.s.find(self.search_state, self.pos, self.right_pos)
            if pos == -1:
                # unterminated <script> or <style>
                raise LexError(self.s, self.pos)
            self.search_state = None
            # The token ends at the beginning of the end tag, which is then
            # lexed as a regular EndTag.
            return h8_id.HtmlCData, pos

        # Find the first match.
        # Note: frontend/match.py uses _LongestMatch(), which is different!
        # TODO: reconcile them.  This lexer should be expressible in re2c.

        for pat, tok_id in HTM8_LEX_COMPILED:
            # Passing right_pos as the end keeps the match inside the region.
            m = pat.match(self.s, self.pos, self.right_pos)
            if m is None:
                continue

            if tok_id in (h8_id.StartTag, h8_id.EndTag, h8_id.StartEndTag):
                self.tag_pos_left = m.start(1)
                self.tag_pos_right = m.end(1)
            else:
                # Reset state
                self.tag_pos_left = -1
                self.tag_pos_right = -1

            # For these three, the pattern only matched the opening
            # delimiter.  The token extends through the closing one.
            if tok_id == h8_id.CommentBegin:
                return h8_id.Comment, self._EndOfDelimited('-->')

            if tok_id == h8_id.ProcessingBegin:
                return h8_id.Processing, self._EndOfDelimited('?>')

            if tok_id == h8_id.CDataBegin:
                return h8_id.CData, self._EndOfDelimited(']]>')

            if tok_id == h8_id.StartTag:
                # TODO: reduce allocations
                if (self.TagNameEquals('script') or
                        self.TagNameEquals('style')):
                    # <SCRipt a=b> -> </SCRipt>
                    self.search_state = '</' + self._LiteralTagName() + '>'

            return tok_id, m.end()

        raise AssertionError('h8_id.Invalid rule should have matched')

    def TagNameEquals(self, expected):
        # type: (str) -> bool
        """Compare the last tag's name, lower-cased, against `expected`.

        Only valid right after a StartTag, EndTag, or StartEndTag.
        """
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        # TODO: In C++, this does not need an allocation.  Can we test
        # directly?
        return expected == self.CanonicalTagName()

    def _LiteralTagName(self):
        # type: () -> str
        """Return the last tag's name exactly as written in the input."""
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        return self.s[self.tag_pos_left:self.tag_pos_right]

    def CanonicalTagName(self):
        # type: () -> str
        """Return the last tag's name in lower case."""
        tag_name = self._LiteralTagName()
        # Most tags are already lower case, so avoid allocation with this conditional
        # TODO: this could go in the mycpp runtime?
        if tag_name.islower():
            return tag_name
        else:
            return tag_name.lower()

    def Read(self):
        # type: () -> Tuple[h8_id_t, int]
        """Return the next (token id, end position), and advance past it."""
        tok_id, end_pos = self._Read()
        self.pos = end_pos  # advance
        return tok_id, end_pos

    def LookAhead(self, regex):
        # type: (str) -> bool
        """Return whether `regex` matches at the current position.

        Doesn't consume input.
        """
        # Cache the regex compilation.  This could also be LookAheadFor(THEAD)
        # or something.
        pat = self.cache.get(regex)
        if pat is None:
            pat = re.compile(regex)
            self.cache[regex] = pat

        m = pat.match(self.s, self.pos)
        return m is not None
358
359
360	# Tag names:
361	# Match <a or </a
362	# Match <h2, but not <2h
363	#
364	# HTML 5 doesn't restrict tag names at all
365	# https://html.spec.whatwg.org/#toc-syntax
366	#
367	# XML allows : - .
368	# https://www.w3.org/TR/xml/#NT-NameChar
369
370	# Namespaces for MathML, SVG
371	# XLink, XML, XMLNS
372	#
373	# https://infra.spec.whatwg.org/#namespaces
374	#
375	# Allow - for td-attrs
376
377	# Be very lenient - just no whitespace or special HTML chars
378	# I don't think this is more lenient than HTML5, though we should check.
# Pattern fragment for an unquoted attribute value.  It may match the empty
# string.  It's interpolated into _ATTR_RE, which is compiled with
# re.VERBOSE; whitespace inside a character class is still significant in
# verbose mode, so the space here really excludes spaces.
_UNQUOTED_VALUE = r'''[^ \t\r\n<>&"'\x00]*'''

# Matches the start of a tag, beginning right AFTER the '<': an optional /
# and the tag name, which is group 1.
# TODO: we don't need to capture the tag name here?  That's done at the top
# level
_TAG_RE = re.compile(r'/? \s* (%s)' % _NAME, re.VERBOSE)

# Matches the end of a tag: optional whitespace, an optional / and then >
_TAG_LAST_RE = re.compile(r'\s* /? >', re.VERBOSE)
386
387	# To match href="foo"
388	# Note: in HTML5 and XML, single quoted attributes are also valid
389
390	# <button disabled> is standard usage
391
392	# NOTE: This used to allow whitespace around =
393	# <a foo = "bar"> makes sense in XML
394	# But then you also have
395	# <a foo= bar> - which is TWO attributes, in HTML5
396	# So the space is problematic
397
# Matches one attribute, including the whitespace before it.
#
# Groups, as used by TagLexer.Tokens():
#   1 - attribute name
#   2 - contents of a double quoted value, without the quotes
#   3 - contents of a single quoted value, without the quotes
#   4 - unquoted value
# Groups 2-4 are ALTERNATIVES, so at most one of them is set.  None of them
# is set for an attribute without a value, like <button disabled>.
#
# The alternatives must be separated by a bare | since \| would match a
# literal pipe character, and then the three value forms would be required
# one after the other.
_ATTR_RE = re.compile(
    r'''
\s+                     # Leading whitespace is required
(%s)                    # Attribute name
(?:                     # Optional attribute value
  \s* = \s*             # Spaces allowed around =
  (?:
    " ([^>"\x00]*) "    # double quoted value
  | ' ([^>'\x00]*) '    # single quoted value
  | (%s)                # Attribute value
  )
)?
''' % (_NAME, _UNQUOTED_VALUE), re.VERBOSE)
411
412
class TagLexer(object):
    """
    Lexes the inside of one tag, like <a href="..."> or <link type="..." />.

    Operations:

    - What is the tag name?
    - Iterate through the attributes, as (name, value_start_pos, value_end_pos)

    An instance is bound to a string; Reset() selects which tag in that
    string to look at.
    """

    def __init__(self, s):
        # type: (str) -> None
        self.s = s
        self.start_pos = -1  # Invalid
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        # type: (int, int) -> None
        """Point this object at the tag s[start_pos:end_pos], so instances
        can be reused."""
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos

        self.start_pos, self.end_pos = start_pos, end_pos

    def WholeTagString(self):
        # type: () -> str
        """Return the entire tag string, e.g. <a href='foo'>"""
        return self.s[self.start_pos:self.end_pos]

    def GetTagName(self):
        # type: () -> str
        """Return the tag name, which is always the first token."""
        _, name_start, name_end = next(self.Tokens())
        return self.s[name_start:name_end]

    def GetSpanForAttrValue(self, attr_name):
        # type: (str) -> Tuple[int, int]
        """
        Return the (start, end) span of the value of the first attribute
        named attr_name, or (-1, -1) if there's no such attribute.

        Used by oils_doc.py, for href shortcuts
        """
        # Algorithm: search for QuotedValue or UnquotedValue after AttrName
        # TODO: Could also cache these

        events = self.Tokens()
        try:
            for tok_id, start, end in events:
                if tok_id != h8_tag_id.AttrName:
                    continue
                if self.s[start:end] != attr_name:
                    continue

                # The value should come next
                val_id, val_start, val_end = next(events)
                assert val_id in (
                    h8_tag_id.QuotedValue, h8_tag_id.UnquotedValue,
                    h8_tag_id.MissingValue), h8_tag_id_str(val_id)
                return val_start, val_end
        except StopIteration:
            pass
        return -1, -1

    def GetAttrRaw(self, attr_name):
        # type: (str) -> Optional[str]
        """
        Return the value of an attribute, which may be UNESCAPED, or None if
        the attribute isn't present.
        """
        start, end = self.GetSpanForAttrValue(attr_name)
        if start == -1:
            return None
        return self.s[start:end]

    def AllAttrsRawSlice(self):
        # type: () -> List[Tuple[str, int, int]]
        """
        Get a list of (name, value start, value end), like
        [('class', 3, 5), ('href', 9, 12)]
        """
        slices = []
        events = self.Tokens()
        try:
            for tok_id, start, end in events:
                if tok_id != h8_tag_id.AttrName:
                    continue
                name = self.s[start:end]

                # The value should come next
                val_id, val_start, val_end = next(events)
                assert val_id in (
                    h8_tag_id.QuotedValue, h8_tag_id.UnquotedValue,
                    h8_tag_id.MissingValue), h8_tag_id_str(val_id)
                # Note: quoted values may have &amp;
                # We would need ANOTHER lexer to unescape them, but we
                # don't need that for ul-table
                slices.append((name, val_start, val_end))
        except StopIteration:
            pass
        return slices

    def AllAttrsRaw(self):
        # type: () -> List[Tuple[str, str]]
        """
        Get a list of pairs [('class', 'foo'), ('href', '?foo=1&amp;bar=2')]

        The quoted values may be escaped.  We would need another lexer to
        unescape them.
        """
        return [(name, self.s[start:end])
                for name, start, end in self.AllAttrsRawSlice()]

    def Tokens(self):
        # type: () -> Iterator[Tuple[h8_tag_id_t, int, int]]
        """
        Yields a sequence of tokens: Tag (AttrName AttrValue?)*

        Where each Token is (Type, start_pos, end_pos)

        Note that start and end are NOT redundant!  We skip over some unwanted
        characters.

        Raises RuntimeError if there's no tag name, and LexError if there's
        unexpected data after the last attribute.
        """
        # Start after the <
        tag_match = _TAG_RE.match(self.s, self.start_pos + 1)
        if tag_match is None:
            raise RuntimeError("Couldn't find HTML tag in %r" %
                               self.WholeTagString())
        yield h8_tag_id.TagName, tag_match.start(1), tag_match.end(1)

        # Which regex group holds the value, for each kind of value.
        # single quoted - TODO: could have different token types
        value_groups = (
            (2, h8_tag_id.QuotedValue),  # double quoted
            (3, h8_tag_id.QuotedValue),  # single quoted
            (4, h8_tag_id.UnquotedValue),
        )

        pos = tag_match.end(0)
        # don't search past the end
        attr = _ATTR_RE.match(self.s, pos, self.end_pos)
        while attr is not None:
            yield h8_tag_id.AttrName, attr.start(1), attr.end(1)

            for group_num, value_id in value_groups:
                if attr.group(group_num) is not None:
                    yield (value_id, attr.start(group_num),
                           attr.end(group_num))
                    break
            else:
                # <button disabled>
                end = attr.end(0)
                yield h8_tag_id.MissingValue, end, end

            # Skip past the "
            pos = attr.end(0)
            attr = _ATTR_RE.match(self.s, pos, self.end_pos)

        if _TAG_LAST_RE.match(self.s, pos) is None:
            # Extra data at end of tag.  TODO: add messages for all these.
            raise LexError(self.s, pos)
579
580
581	# This is similar but not identical to
582	# " ([^>"\x00]*) " # double quoted value
583	# | ' ([^>'\x00]*) ' # single quoted value
584	#
585	# Note: for unquoted values, & isn't allowed, and thus &amp; and &#99; are
586	# not allowed. We could relax that?
# Compiled rules for lexing the inside of an attribute value: the character
# rules, then raw data, then a catch-all for what's left.  Rules are tried in
# order, and the first match wins.
ATTR_VALUE_LEXER = MakeLexer(CHAR_LEX + [
    (r'[^>&\x00]+', h8_id.RawData),
    (r'.', h8_id.Invalid),
])
593
594
class AttrValueLexer(object):
    """
    Lexes the value of one attribute, i.e. the region s[start_pos:end_pos].

    <a href="foo=99&amp;bar">
    <a href='foo=99&amp;bar'>
    <a href=unquoted>

    Tokens never extend past end_pos, so what follows the value (a closing
    quote, more attributes) is never included in a token.
    """

    def __init__(self, s):
        # type: (str) -> None
        self.s = s
        self.start_pos = -1  # Invalid
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        # type: (int, int) -> None
        """Point this object at the value s[start_pos:end_pos], so instances
        can be reused."""
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos

        self.start_pos = start_pos
        self.end_pos = end_pos

    def NumTokens(self):
        # type: () -> int
        """Return the number of tokens in the value.

        Raises LexError at the start of the first h8_id.Invalid token.
        """
        num_tokens = 0
        pos = self.start_pos
        for tok_id, end_pos in self.Tokens():
            if tok_id == h8_id.Invalid:
                raise LexError(self.s, pos)
            pos = end_pos
            num_tokens += 1
        return num_tokens

    def Tokens(self):
        # type: () -> Iterator[Tuple[h8_id_t, int]]
        """Yield (token id, end position) for each token in the value.

        A token starts where the previous one ended; the first one starts at
        start_pos.
        """
        pos = self.start_pos
        while pos < self.end_pos:
            # Find the first match, like above.
            # Note: frontend/match.py uses _LongestMatch(), which is different!
            # TODO: reconcile them.  This lexer should be expressible in re2c.
            for pat, tok_id in ATTR_VALUE_LEXER:
                # Passing end_pos keeps the match inside the value.  Without
                # it, raw data would run on past the closing quote.
                m = pat.match(self.s, pos, self.end_pos)
                if m:
                    end_pos = m.end(0)
                    yield tok_id, end_pos
                    pos = end_pos
                    break
            else:
                raise AssertionError('h8_id.Invalid rule should have matched')