data_lang/htm8.py

OILS / data_lang / htm8.py View on Github | oils.pub

1043 lines, 450 significant

1	"""data_lang/htm8.py
2
3	TODO
4
5	API:
6	- Get rid of AttrValueLexer - this should be in the TagLexer
7	- this also means that unquoted values can be more similar
8	- We can use a single lexer mode for everything inside <>
9	- the SPACE is the only difference
10	- Deprecate tag_lexer.GetTagName() in favor of lx.CanonicalTagName() or
11	_LiteralTagName()
12	- UTF-8 check, like JSON8
13	- re2c
14	- port lexer, which will fix static typing issues
15	- the abstraction needs to support submatch?
16	- for finding the end of a tag, etc.?
17
18	- LexError and ParseError need details
19	- harmonize with data_lang/j8.py, which uses error.Decode(msg, ...,
20	cur_line_num)
21
22	- Copy all errors into doc/ref/chap-errors.md
23	- This helps understand the language
24
25	- Update doc/htm8.md
26	- list of Algorithms:
27	- lex just the top level
28	- lex both levels
29	- and match tags - this is the level for value.Htm8Frag?
30	- convert to XML!
31	- lazy selection by tag, or attr (id= and class=)
32	- lazy selection by CSS selector expression
33	- convert to DOMTree
34	- sed-like replacement of DOM Tree or element
35	- untrusted HTML filter, e.g. like StackOverflow / Reddit
36	- this is Safe HTM8
37	- should have a zero alloc way to support this, with good errors?
38	- I think most of them silently strip data
39	"""
40
41	import re
42
43	from typing import Dict, List, Tuple, Optional, IO, Iterator, Any
44
45	from _devbuild.gen.htm8_asdl import (h8_id, h8_id_t, h8_tag_id, h8_tag_id_t,
46	h8_tag_id_str, attr_name, attr_name_t,
47	attr_name_str, attr_value_e, attr_value_t,
48	h8_val_id)
49	from doctools.util import log
50
51
class LexError(Exception):
    """Raised when the input can't be split into tokens.

    Examples of lex errors:

    - h8_id.Invalid, like <> or &&
    - Unclosed <!-- <? <![CDATA[ <script> <style>
    """

    def __init__(self, s, start_pos):
        # type: (str, int) -> None
        # The whole input string, and the offset where lexing failed
        self.start_pos = start_pos
        self.s = s

    def __str__(self):
        # type: () -> str
        # Quote up to 20 characters of input, starting at the error position
        begin = self.start_pos
        snippet = self.s[begin:begin + 20]
        return '(LexError %r)' % (snippet, )
68
69
70	def _FindLineNum(s, error_pos):
71	# type: (str, int) -> int
72	current_pos = 0
73	line_num = 1
74	while True:
75	newline_pos = s.find('\n', current_pos)
76	#log('current = %d, N %d, line %d', current_pos, newline_pos, line_num)
77
78	if newline_pos == -1: # this is the last line
79	return line_num
80	if newline_pos >= error_pos:
81	return line_num
82	line_num += 1
83	current_pos = newline_pos + 1
84
85
class ParseError(Exception):
    """Raised when tokens are valid, but their structure is not.

    Examples of parse errors

    - unbalanced tag structure
    - ul_table.py errors
    """

    def __init__(self, msg, s=None, start_pos=-1):
        # type: (str, Optional[str], int) -> None
        self.msg = msg
        # Optional location: the input string and an offset into it
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        # type: () -> str
        template = 'line %d: %r %r'

        if self.s is None:
            # No location: line -1 and an empty snippet
            return template % (-1, self.msg, '')

        # A string without a position is a caller bug
        assert self.start_pos != -1, self.start_pos
        begin = self.start_pos
        snippet = self.s[begin:begin + 20]
        return template % (_FindLineNum(self.s, begin), self.msg, snippet)
112
113
class Output(object):
    """Copies pieces of an input string to a file-like object.

    Keeps a cursor (self.pos) into the input.  You can either print FROM
    the input, which moves the cursor, or print new text, which doesn't.
    """

    def __init__(self, s, f, left_pos=0, right_pos=-1):
        # type: (str, IO[str], int, int) -> None
        self.s = s
        self.f = f
        self.pos = left_pos
        # -1 means "until the end of the string"
        if right_pos == -1:
            right_pos = len(s)
        self.right_pos = right_pos

    def SkipTo(self, pos):
        # type: (int) -> None
        """Move the cursor to pos without writing anything."""
        self.pos = pos

    def PrintUntil(self, pos):
        # type: (int) -> None
        """Write the input from the cursor up to pos, and move the cursor."""
        self.f.write(self.s[self.pos:pos])
        self.pos = pos

    def PrintTheRest(self):
        # type: () -> None
        """Write the input from the cursor up to right_pos."""
        self.PrintUntil(self.right_pos)

    def Print(self, s):
        # type: (str) -> None
        """Write new text to the output; the cursor doesn't move."""
        self.f.write(s)
149
150
def MakeLexer(rules):
    """Compile a list of (pattern string, id) rules.

    Every pattern is compiled with re.VERBOSE.  Returns a list of
    (compiled pattern, id) pairs in the same order as the input.
    """
    compiled = []
    for pat, tok_id in rules:
        compiled.append((re.compile(pat, re.VERBOSE), tok_id))
    return compiled
153
154
155	#
156	# Eggex
157	#
158	# Tag = / ~['>']+ /
159
160	# Is this valid? A single character?
161	# Tag = / ~'>'* /
162
163	# Maybe better: / [NOT '>']+/
164	# capital letters not allowed there?
165	#
166	# But then this is confusing:
167	# / [NOT ~digit]+/
168	#
169	# / [NOT digit] / is [^\d]
170	# / ~digit / is \D
171	#
172	# Or maybe:
173	#
174	# / [~ digit]+ /
175	# / [~ '>']+ /
176	# / [NOT '>']+ /
177
178	# End = / '</' Tag '>' /
179	# StartEnd = / '<' Tag '/>' /
180	# Start = / '<' Tag '>' /
181	#
182	# EntityRef = / '&' dot{* N} ';' /
183
184	# Tag name, or attribute name
185	# colon is used in XML
186
187	# https://www.w3.org/TR/xml/#NT-Name
188	# Hm there is a lot of unicode stuff. We are simplifying parsing
189
_NAME = r'[a-zA-Z][a-zA-Z0-9:_\-]*'  # must start with letter

# Rules for character references.  This list is reused as the prefix of the
# other rule tables below.
#
# All rule tables are compiled by MakeLexer() with re.VERBOSE, so spaces
# inside the patterns are insignificant, and a literal # must be escaped.
CHAR_LEX = [
    # Characters
    # https://www.w3.org/TR/xml/#sec-references
    (r'&\# [0-9]+ ;', h8_id.DecChar),
    (r'&\# x[0-9a-fA-F]+ ;', h8_id.HexChar),
    (r'& %s ;' % _NAME, h8_id.CharEntity),
    # Allow unquoted, and quoted
    (r'&', h8_id.BadAmpersand),
]

# Rules for the top level of a document.  The Lexer takes the FIRST rule that
# matches, so the order of the entries matters.
HTM8_LEX = CHAR_LEX + [
    # TODO: CommentBegin, ProcessingBegin, CDataBegin could have an additional
    # action associated with them?  The ending substring
    (r'<!--', h8_id.CommentBegin),

    # Processing instruction are used for the XML header:
    # <?xml version="1.0" encoding="UTF-8"?>
    # They are technically XML-only, but in HTML5, they are another kind of
    # comment:
    #
    #   https://developer.mozilla.org/en-US/docs/Web/API/ProcessingInstruction
    #
    (r'<\?', h8_id.ProcessingBegin),
    # Not necessary in HTML5, but occurs in XML
    (r'<!\[CDATA\[', h8_id.CDataBegin),  # <![CDATA[

    # Markup declarations
    # - In HTML5, there is only <!DOCTYPE html>
    # - XML has 4 more declarations: <!ELEMENT ...> ATTLIST ENTITY NOTATION
    #   - these seem to be part of DTD
    #   - it's useful to skip these, and be able to parse the rest of the document
    # - Note: < is allowed?
    (r'<! [^>\x00]+ >', h8_id.Decl),

    # Tags
    # Notes:
    # - We look for a valid tag name, but we don't validate attributes.
    #   That's done in the tag lexer.
    # - We don't allow leading whitespace
    # - Group 1 is the tag name; the Lexer records its position
    (r'</ (%s) >' % _NAME, h8_id.EndTag),  # end </a>
    # self-closing <br/> comes before StartTag
    # could/should these be collapsed into one rule?
    (r'< (%s) [^>\x00]* />' % _NAME, h8_id.StartEndTag),  # self-closing <br/>
    (r'< (%s) [^>\x00]* >' % _NAME, h8_id.StartTag),  # start <a>

    # HTML5 allows unescaped > in raw data, but < is not allowed.
    # https://stackoverflow.com/questions/10462348/right-angle-bracket-in-html
    #
    # - My early blog has THREE errors when disallowing >
    # - So do some .wwz files
    (r'[^&<>\x00]+', h8_id.RawData),
    (r'>', h8_id.BadGreaterThan),
    # NUL is the end, an accommodation for re2c.  Like we do in frontend/match.
    (r'\x00', h8_id.EndOfStream),
    # This includes < - it is not BadLessThan because it's NOT recoverable
    (r'.', h8_id.Invalid),
]
249
250	# Old notes:
251	#
252	# Non-greedy matches are regular and can be matched in linear time
253	# with RE2.
254	#
255	# https://news.ycombinator.com/item?id=27099798
256	#
257
258	# This person tried to do it with a regex:
259	#
260	# https://skeptric.com/html-comment-regexp/index.html
261
262	# . is any char except newline
263	# https://re2c.org/manual/manual_c.html
264
265	# Discarded options
266	#(r'<!-- .*? -->', h8_id.Comment),
267
268	# Hack from Claude: \s\S instead of re.DOTALL. I don't like this
269	#(r'<!-- [\s\S]*? -->', h8_id.Comment),
#(r'<!-- (?:.|[\n])*? -->', h8_id.Comment),
271
# Compiled once at import time.  Lexer._Read() tries these rules in order and
# takes the first one that matches.
HTM8_LEX_COMPILED = MakeLexer(HTM8_LEX)
273
274
class Lexer(object):
    """Top-level HTM8 lexer.

    Call Read() repeatedly.  Each call returns (token id, end position) and
    moves self.pos to that end position.  After a StartTag, EndTag, or
    StartEndTag, the tag name can be queried with TagNameEquals() and
    CanonicalTagName().
    """

    def __init__(self, s, left_pos=0, right_pos=-1, no_special_tags=False):
        # type: (str, int, int, bool) -> None
        self.s = s
        self.pos = left_pos
        # -1 means "until the end of the string"
        if right_pos == -1:
            right_pos = len(s)
        self.right_pos = right_pos
        # If true, what follows <script> or <style> is lexed like anything
        # else, instead of being searched for the closing tag.
        self.no_special_tags = no_special_tags

        # regex string -> compiled pattern object, filled in by LookAhead()
        self.cache = {}  # type: Dict[str, Any]

        # The closing tag we're looking for, like </script> or </style>.
        # It's set after the corresponding start tag is read.
        self.search_state = None  # type: Optional[str]

        # Position of tag name, if applicable
        # - Set after you get a StartTag, EndTag, or StartEndTag
        # - Unset on other tags
        self.tag_pos_left = -1
        self.tag_pos_right = -1

    def _EndOfConstruct(self, terminator):
        # type: (str) -> int
        """Return the position just past the next terminator.

        The search starts at self.pos.  Raises LexError at self.pos if the
        terminator isn't found, i.e. the construct is unclosed.
        """
        found = self.s.find(terminator, self.pos)
        if found == -1:
            raise LexError(self.s, self.pos)
        return found + len(terminator)

    def _Read(self):
        # type: () -> Tuple[h8_id_t, int]
        """Return the next (token id, end position), without advancing."""
        if self.pos == self.right_pos:
            return h8_id.EndOfStream, self.pos

        assert self.pos < self.right_pos, self.pos

        if self.search_state is not None and not self.no_special_tags:
            # TODO: case-insensitive search for </SCRIPT> <SCRipt> ?
            #
            # Another strategy: enter a mode where we find ONLY the end tag
            # regex, and any data that's not <, and then check the canonical
            # tag name for 'script' or 'style'.
            closing_pos = self.s.find(self.search_state, self.pos)
            if closing_pos == -1:
                # unterminated <script> or <style>
                raise LexError(self.s, self.pos)
            self.search_state = None
            # The token ends where the closing tag BEGINS
            return h8_id.HtmlCData, closing_pos

        # Find the first match.
        # Note: frontend/match.py uses _LongestMatch(), which is different!
        # TODO: reconcile them.  This lexer should be expressible in re2c.
        m = None
        tok_id = None
        for pat, candidate_id in HTM8_LEX_COMPILED:
            m = pat.match(self.s, self.pos)
            if m is not None:
                tok_id = candidate_id
                break
        if m is None:
            raise AssertionError('h8_id.Invalid rule should have matched')

        # Record where the tag name is, or forget the previous one
        if tok_id in (h8_id.StartTag, h8_id.EndTag, h8_id.StartEndTag):
            self.tag_pos_left, self.tag_pos_right = m.span(1)
        else:
            self.tag_pos_left = self.tag_pos_right = -1

        # These three rules only match the opening delimiter.  The token
        # extends through the closing delimiter.
        if tok_id == h8_id.CommentBegin:
            return h8_id.Comment, self._EndOfConstruct('-->')

        if tok_id == h8_id.ProcessingBegin:
            return h8_id.Processing, self._EndOfConstruct('?>')

        if tok_id == h8_id.CDataBegin:
            return h8_id.CData, self._EndOfConstruct(']]>')

        if tok_id == h8_id.StartTag:
            # TODO: reduce allocations
            if self.TagNameEquals('script') or self.TagNameEquals('style'):
                # <SCRipt a=b> -> </SCRipt>
                self.search_state = '</%s>' % self._LiteralTagName()

        return tok_id, m.end()

    def TagNamePos(self):
        """Return the position where the tag name ends."""
        right = self.tag_pos_right
        assert right != -1, right
        return right

    def TagNameEquals(self, expected):
        # type: (str) -> bool
        """Compare the lower-cased tag name with expected.

        Asserts that a tag was just read; the check happens in
        _LiteralTagName().
        """
        # TODO: In C++, this does not need an allocation.  Can we test
        # directly?
        return self.CanonicalTagName() == expected

    def _LiteralTagName(self):
        # type: () -> str
        """Return the tag name exactly as it's spelled in the input."""
        left = self.tag_pos_left
        right = self.tag_pos_right
        assert left != -1, left
        assert right != -1, right

        return self.s[left:right]

    def CanonicalTagName(self):
        # type: () -> str
        """Return the tag name in lower case."""
        literal = self._LiteralTagName()
        # Most tags are already lower case, so avoid allocation with this
        # conditional
        # TODO: this could go in the mycpp runtime?
        return literal if literal.islower() else literal.lower()

    def Read(self):
        # type: () -> Tuple[h8_id_t, int]
        """Return the next (token id, end position), and advance to it."""
        result = self._Read()
        self.pos = result[1]  # advance
        return result

    def LookAhead(self, regex):
        # type: (str) -> bool
        """Does the regex match at the current position?

        Currently used for ul_table.py.  But taking a dynamic regex string is
        not the right interface.
        """
        # Cache the regex compilation.  This could also be LookAheadFor(THEAD)
        # or something.
        try:
            pat = self.cache[regex]
        except KeyError:
            pat = re.compile(regex)
            self.cache[regex] = pat

        return pat.match(self.s, self.pos) is not None
416
417
# Rules for reading one attribute name.  AttrLexer.ReadName() takes the first
# rule that matches.
A_NAME_LEX = [
    # Leading whitespace is required, to separate attributes.
    #
    # Group 1 is the attribute name, and group 2 is the optional = sign.
    #
    # If the = is not present, then we set the lexer in a state for
    # attr_value_e.Missing.
    (r'\s+ (%s) \s* (=)? \s*' % _NAME, attr_name.Ok),
    # unexpected EOF

    # The closing > or /> is treated as end of stream, and it's not an error.
    (r'\s* /? >', attr_name.Done),

    # NUL should not be possible, because the top-level tag rules only accept
    # [^>\x00]* between the tag name and the closing >

    # This includes < - it is not BadLessThan because it's NOT recoverable
    (r'.', attr_name.Invalid),
]

A_NAME_LEX_COMPILED = MakeLexer(A_NAME_LEX)
436
437	# Here we just loop on regular tokens
438	#
439	# Examples:
440	# <a href = unquoted&foo >
441	# <a href = unquoted&foo > # BadAmpersand is allowed I guess
442	# <a href ="unquoted&foo" > # double quoted
443	# <a href ='unquoted&foo' > # single quoted
444	# <a href = what"foo" > # HTML5 allows this, but we could disallow it if
445	# it's not common. It opens up the j"" and $"" extensions
446	# <a href = what'foo' > # ditto
447	#
448	# Problem: <a href=foo/> - this is hard to recognize
449	# Because is the unquoted value "foo/" or "foo" ?
450
451	# Be very lenient - just no whitespace or special HTML chars
452	# I don't think this is more lenient than HTML5, though we should check.
453	#
454	# Bug fix: Also disallow /
455
456	# TODO: get rid of OLD copy
# Both patterns exclude whitespace, < > & / " ' and NUL.
#
# - The OLD copy can match the empty string (*).  In this file it's only used
#   by _ATTR_RE.
# - The new copy requires at least one character (+).
_UNQUOTED_VALUE_OLD = r'''[^ \t\r\n<>&/"'\x00]*'''
_UNQUOTED_VALUE = r'''[^ \t\r\n<>&/"'\x00]+'''

# Restrictive definition, similar to _NAME
# I was trying to capture #ble.sh and so forth
# I also had unquoted //github.com, etc.

# _UNQUOTED_VALUE = r'''[a-zA-Z0-9:_\-]+'''
#
# For now, I guess we live with <a href=?foo/>

# What comes after = ?
A_VALUE_LEX = [
    (r'"', h8_val_id.DoubleQuote),
    (r"'", h8_val_id.SingleQuote),
    (_UNQUOTED_VALUE, h8_val_id.UnquotedVal),

    #(r'[ \r\n\t]', h8_id.Whitespace), # terminates unquoted values
    #(r'[^ \r\n\t&>\x00]', h8_id.RawData),
    #(r'[>\x00]', h8_id.EndOfStream),
    # e.g. < is an error
    #
    # AttrLexer.ReadValue() reports NoMatch as attr_value_e.Empty, as in
    # <a foo = >
    (r'.', h8_val_id.NoMatch),
]

A_VALUE_LEX_COMPILED = MakeLexer(A_VALUE_LEX)

# What's inside "" or '' ?
#
# Both quote characters are tokens here.  The caller decides which one
# terminates the value.
QUOTED_VALUE_LEX = CHAR_LEX + [
    (r'"', h8_id.DoubleQuote),
    (r"'", h8_id.SingleQuote),
    (r'<', h8_id.BadLessThan),  # BadAmpersand is in CharLex
    (r'''[^"'<>&\x00]+''', h8_id.RawData),
    # This includes > - it is not BadGreaterThan because it's NOT recoverable
    (r'.', h8_id.Invalid),
]

QUOTED_VALUE_LEX_COMPILED = MakeLexer(QUOTED_VALUE_LEX)
494
495
class AttrLexer(object):
    """Reads the attributes of a single tag, one name and value at a time.

    Typical usage:

        while True:
            n, start_pos, end_pos = attr_lx.ReadName()
            if n == attr_name.Ok:
                if attr_lx.AttrNameEquals('div'):
                    print('div')

                # TODO: also pass Optional[List[]] out_tokens?
                v, start_pos, end_pos = attr_lx.ReadValue()

    The API is stateful: after ReadName() returns attr_name.Ok, you must call
    ReadValue() before the next ReadName().
    """

    def __init__(self, s):
        # type: (str) -> None
        self.s = s
        self.tag_name_pos = -1  # Invalid
        # Not read by this class; kept so existing users of the attribute
        # don't break.  Init() and Reset() use end_pos.
        self.tag_end_pos = -1
        self.end_pos = -1  # Invalid until Init()
        self.pos = -1

        # Span of the name found by the last successful ReadName()
        self.name_start = -1
        self.name_end = -1
        # True when the last name had no =, so ReadValue() returns Missing
        self.next_value_is_missing = False

        # Arguments of the last Init(), for Reset()
        self.init_t = -1
        self.init_e = -1

    def Init(self, tag_name_pos, end_pos):
        # type: (int, int) -> None
        """Initialize so we can read names and values.

        Example:
          'x <a y>'  # tag_name_pos=4, end_pos=6
          'x <a>'    # tag_name_pos=4, end_pos=4

        The Reset() method is used to reuse instances of the AttrLexer object.
        """
        assert tag_name_pos >= 0, tag_name_pos
        assert end_pos >= 0, end_pos

        # For Reset()
        self.init_t = tag_name_pos
        self.init_e = end_pos

        self.Reset()

    def Reset(self):
        # type: () -> None
        """Rewind to the positions passed to the last Init().

        Per-attribute state is cleared too, so that a half-read attribute
        from a previous pass can't affect the next one.
        """
        self.tag_name_pos = self.init_t
        self.end_pos = self.init_e
        self.pos = self.init_t

        self.name_start = -1
        self.name_end = -1
        self.next_value_is_missing = False

    def ReadName(self):
        # type: () -> Tuple[attr_name_t, int, int]
        """Reads the attribute name

        Returns (attr_name.Ok, name_start, name_end), or (attr_name.Done, -1,
        -1) at the closing > or />, or (attr_name.Invalid, -1, -1).  The
        position is not advanced in the Invalid case.

        EOF case:
          <a>
          <a >

        Error case:
          <a !>
          <a foo=bar !>
        """
        for pat, a in A_NAME_LEX_COMPILED:
            m = pat.match(self.s, self.pos)
            if m:
                if a == attr_name.Invalid:
                    return attr_name.Invalid, -1, -1

                self.pos = m.end(0)  # Advance if it's not invalid

                if a == attr_name.Ok:
                    self.name_start = m.start(1)
                    self.name_end = m.end(1)
                    # Is the equals sign missing?  Set state.
                    if m.group(2) is None:
                        self.next_value_is_missing = True
                        # HACK: REWIND, since we don't want to consume
                        # whitespace
                        self.pos = self.name_end
                    else:
                        # Must be cleared here: otherwise, every attribute
                        # after a valueless one, as in <a disabled href=x>,
                        # would be reported as Missing.
                        self.next_value_is_missing = False
                    return attr_name.Ok, self.name_start, self.name_end
                else:
                    # Reset state - e.g. you must call AttrNameEquals
                    self.name_start = -1
                    self.name_end = -1
                    self.next_value_is_missing = False

                if a == attr_name.Done:
                    return attr_name.Done, -1, -1
        else:
            context = self.s[self.pos:]
            raise AssertionError('h8_id.Invalid rule should have matched %r' %
                                 context)

    def _CanonicalAttrName(self):
        # type: () -> str
        """Return the lower case attribute name.

        Must call after ReadName()
        """
        assert self.name_start >= 0, self.name_start
        assert self.name_end >= 0, self.name_end

        attr_name = self.s[self.name_start:self.name_end]
        # Most names are already lower case, so avoid the copy
        if attr_name.islower():
            return attr_name
        else:
            return attr_name.lower()

    def AttrNameEquals(self, expected):
        # type: (str) -> bool
        """Compare the lower-cased attribute name with expected.

        Must call after ReadName()

        TODO: This can be optimized to be "in place", with zero allocs.
        """
        return expected == self._CanonicalAttrName()

    def _QuotedRead(self):
        # type: () -> Tuple[h8_id_t, int]
        """Return (token id, end position) for one token inside quotes.

        Does not advance self.pos; the caller does that.
        """
        for pat, tok_id in QUOTED_VALUE_LEX_COMPILED:
            m = pat.match(self.s, self.pos)
            if m:
                return tok_id, m.end(0)
        else:
            context = self.s[self.pos:self.pos + 10]
            raise AssertionError('h8_id.Invalid rule should have matched %r' %
                                 context)

    def _ReadQuotedValue(self, closing_id, result, value_start):
        # type: (h8_id_t, attr_value_t, int) -> Tuple[attr_value_t, int, int]
        """Read tokens until the closing quote, and advance past it.

        Args:
          closing_id: h8_id.DoubleQuote or h8_id.SingleQuote
          result: the attr_value_e variant to return
          value_start: position right after the opening quote

        Returns (result, value_start, position of the closing quote).
        Raises LexError if there's an invalid token inside the quotes.
        """
        self.pos = value_start
        while True:
            tok_id, q_end_pos = self._QuotedRead()
            if tok_id == h8_id.Invalid:
                raise LexError(self.s, self.pos)
            if tok_id == closing_id:
                value_end = self.pos  # the value excludes the quote
                self.pos = q_end_pos  # Advance past the closing quote
                return result, value_start, value_end
            # Any other token, including the OTHER kind of quote, is part of
            # the value
            self.pos = q_end_pos

    def ReadValue(self, tokens_out=None):
        # type: (Optional[List[Tuple[h8_id_t, int]]]) -> Tuple[attr_value_t, int, int]
        """Read the attribute value.

        In general, it is escaped or "raw"

        Can only be called after a SUCCESSFUL ReadName().
        Assuming ReadName() returned a value, this should NOT fail.

        Returns (kind, value_start, value_end).  The positions are -1 for
        attr_value_e.Missing (no =) and attr_value_e.Empty (nothing usable
        after =).  For quoted values, they exclude the quotes.

        tokens_out is accepted, but not used yet.
        """
        # ReadName() invariant
        assert self.name_start >= 0, self.name_start
        assert self.name_end >= 0, self.name_end

        self.name_start = -1
        self.name_end = -1

        if self.next_value_is_missing:
            # Do not advance self.pos
            return attr_value_e.Missing, -1, -1

        # Now read " ', unquoted or empty= is valid too.
        for pat, a in A_VALUE_LEX_COMPILED:
            m = pat.match(self.s, self.pos)
            if m:
                first_end_pos = m.end(0)

                # Note: Unquoted value can't contain & etc. now, so there
                # is no unquoting, and no respecting tokens_raw.
                if a == h8_val_id.UnquotedVal:
                    self.pos = first_end_pos  # Advance
                    return attr_value_e.Unquoted, m.start(0), first_end_pos

                # TODO: respect tokens_out
                if a == h8_val_id.DoubleQuote:
                    return self._ReadQuotedValue(h8_id.DoubleQuote,
                                                 attr_value_e.DoubleQuoted,
                                                 first_end_pos)

                # TODO: respect tokens_out
                if a == h8_val_id.SingleQuote:
                    return self._ReadQuotedValue(h8_id.SingleQuote,
                                                 attr_value_e.SingleQuoted,
                                                 first_end_pos)

                if a == h8_val_id.NoMatch:
                    # <a foo = >
                    return attr_value_e.Empty, -1, -1
        else:
            raise AssertionError('h8_val_id.NoMatch rule should have matched')
704
705
def GetAttrRaw(attr_lx, name):
    # type: (AttrLexer, str) -> Optional[str]
    """Return the raw value of the first attribute called name, or None.

    Reads attributes from the lexer's current position.  The name is
    compared with AttrNameEquals().  Raises LexError on an invalid attribute
    name.
    """
    while True:
        n, unused_start, unused_end = attr_lx.ReadName()

        if n == attr_name.Done:
            return None
        if n == attr_name.Invalid:
            raise LexError(attr_lx.s, attr_lx.pos)
        if n != attr_name.Ok:
            raise AssertionError()

        is_match = attr_lx.AttrNameEquals(name)
        # Problem with stateful API: the value has to be read even when it's
        # not the attribute we want
        unused_kind, val_start, val_end = attr_lx.ReadValue()
        if is_match:
            return attr_lx.s[val_start:val_end]
727
728
def AllAttrsRaw(attr_lx):
    # type: (AttrLexer) -> List[Tuple[str,str]]
    """Return all (name, raw value) pairs, in the order they appear.

    Reads attributes from the lexer's current position until the end of the
    tag.  Raises LexError on an invalid attribute name.
    """
    s = attr_lx.s
    pairs = []
    while True:
        n, name_start, name_end = attr_lx.ReadName()

        if n == attr_name.Done:
            return pairs
        if n == attr_name.Invalid:
            raise LexError(attr_lx.s, attr_lx.pos)
        if n != attr_name.Ok:
            raise AssertionError()

        unused_kind, val_start, val_end = attr_lx.ReadValue()
        pairs.append((s[name_start:name_end], s[val_start:val_end]))
753
754
755	#
756	# OLD API - REMOVE THIS
757	#
758
759	# Tag names:
760	# Match <a or </a
761	# Match <h2, but not <2h
762	#
763	# HTML 5 doesn't restrict tag names at all
764	# https://html.spec.whatwg.org/#toc-syntax
765	#
766	# XML allows : - .
767	# https://www.w3.org/TR/xml/#NT-NameChar
768
769	# Namespaces for MathML, SVG
770	# XLink, XML, XMLNS
771	#
772	# https://infra.spec.whatwg.org/#namespaces
773	#
774	# Allow - for td-attrs
775
776	# TODO: we don't need to capture the tag name here? That's done at the top
777	# level
# Matches the start of a tag, after the <.  Group 1 is the tag name.
_TAG_RE = re.compile(r'/? \s* (%s)' % _NAME, re.VERBOSE)

# Matches the end of a tag: > or />
_TAG_LAST_RE = re.compile(r'\s* /? >', re.VERBOSE)

# To match href="foo"
# Note: in HTML5 and XML, single quoted attributes are also valid

# <button disabled> is standard usage

# NOTE: This used to allow whitespace around =
# <a foo = "bar">  makes sense in XML
# But then you also have
# <a foo= bar> - which is TWO attributes, in HTML5
# So the space is problematic

# Groups: 1 is the name, and then AT MOST ONE of 2 (double quoted value),
# 3 (single quoted value), or 4 (unquoted value) is set.
#
# The alternatives must be separated by a bare | .  An escaped \| is a literal
# pipe character, which would require all three forms in a row.
_ATTR_RE = re.compile(
    r'''
\s+                     # Leading whitespace is required
(%s)                    # Attribute name
(?:                     # Optional attribute value
  \s* = \s*             # Spaces allowed around =
  (?:
    " ([^>"\x00]*) "    # double quoted value
  | ' ([^>'\x00]*) '    # single quoted value
  | (%s)                # Attribute value
  )
)?
''' % (_NAME, _UNQUOTED_VALUE_OLD), re.VERBOSE)
806
807
class TagLexer(object):
    """
    Given a tag like <a href="..."> or <link type="..." />, the TagLexer
    provides a few operations:

    - What is the tag?
    - Iterate through the attributes, giving (name, value_start_pos, value_end_pos)

    Call Reset() with the span of the tag before using the other methods.
    """

    def __init__(self, s):
        # type: (str) -> None
        self.s = s
        self.start_pos = -1  # Invalid
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        # type: (int, int) -> None
        """Point this object at another tag, so instances can be reused."""
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos

        self.start_pos, self.end_pos = start_pos, end_pos

    def WholeTagString(self):
        # type: () -> str
        """Return the entire tag string, e.g. <a href='foo'>"""
        begin, end = self.start_pos, self.end_pos
        return self.s[begin:end]

    def GetTagName(self):
        # type: () -> str
        """Return the tag name, which is the first token."""
        unused_id, name_start, name_end = next(self.Tokens())
        return self.s[name_start:name_end]

    def GetSpanForAttrValue(self, attr_name):
        # type: (str) -> Tuple[int, int]
        """Return the (start, end) span of an attribute value, or (-1, -1).

        Used by oils_doc.py, for href shortcuts
        """
        # Algorithm: search for QuotedValue or UnquotedValue after AttrName
        # TODO: Could also cache these
        events = self.Tokens()
        try:
            for tok_id, start, end in events:
                if tok_id != h8_tag_id.AttrName:
                    continue
                if self.s[start:end] != attr_name:
                    continue

                # The value should come next
                val_id, val_start, val_end = next(events)
                assert val_id in (
                    h8_tag_id.QuotedValue, h8_tag_id.UnquotedValue,
                    h8_tag_id.MissingValue), h8_tag_id_str(val_id)
                return val_start, val_end
        except StopIteration:
            # The token stream ended right after a name
            pass
        return -1, -1

    def GetAttrRaw(self, attr_name):
        # type: (str) -> Optional[str]
        """
        Return the value, which may be UNESCAPED.
        """
        span = self.GetSpanForAttrValue(attr_name)
        if span[0] == -1:
            return None
        return self.s[span[0]:span[1]]

    def AllAttrsRawSlice(self):
        # type: () -> List[Tuple[str, int, int]]
        """
        Get a list of pairs [('class', 3, 5), ('href', 9, 12)]
        """
        slices = []
        events = self.Tokens()
        try:
            for tok_id, start, end in events:
                if tok_id != h8_tag_id.AttrName:
                    continue
                name = self.s[start:end]

                # The value should come next
                val_id, val_start, val_end = next(events)
                assert val_id in (
                    h8_tag_id.QuotedValue, h8_tag_id.UnquotedValue,
                    h8_tag_id.MissingValue), h8_tag_id_str(val_id)
                # Note: quoted values may have &
                # We would need ANOTHER lexer to unescape them, but we
                # don't need that for ul-table
                slices.append((name, val_start, val_end))
        except StopIteration:
            # The token stream ended right after a name
            pass
        return slices

    def AllAttrsRaw(self):
        # type: () -> List[Tuple[str, str]]
        """
        Get a list of pairs [('class', 'foo'), ('href', '?foo=1&bar=2')]

        The quoted values may be escaped.  We would need another lexer to
        unescape them.
        """
        return [(name, self.s[start:end])
                for name, start, end in self.AllAttrsRawSlice()]

    def Tokens(self):
        # type: () -> Iterator[Tuple[h8_tag_id_t, int, int]]
        """
        Yields a sequence of tokens: Tag (AttrName AttrValue?)*

        Where each Token is (Type, start_pos, end_pos)

        Note that start and end are NOT redundant!  We skip over some unwanted
        characters.

        Raises RuntimeError if there's no tag name, and LexError if there's
        something other than > or /> after the attributes.
        """
        # + 1 to skip the <
        tag_m = _TAG_RE.match(self.s, self.start_pos + 1)
        if tag_m is None:
            raise RuntimeError("Couldn't find HTML tag in %r" %
                               self.WholeTagString())
        yield h8_tag_id.TagName, tag_m.start(1), tag_m.end(1)

        pos = tag_m.end(0)

        # don't search past the end
        attr_m = _ATTR_RE.match(self.s, pos, self.end_pos)
        while attr_m is not None:
            yield h8_tag_id.AttrName, attr_m.start(1), attr_m.end(1)

            # Group 2 is double quoted, 3 is single quoted, and 4 is unquoted.
            # TODO: single quoted could have a different token type
            for group_num, value_id in ((2, h8_tag_id.QuotedValue),
                                        (3, h8_tag_id.QuotedValue),
                                        (4, h8_tag_id.UnquotedValue)):
                if attr_m.group(group_num) is not None:
                    yield (value_id, attr_m.start(group_num),
                           attr_m.end(group_num))
                    break
            else:
                # <button disabled>
                match_end = attr_m.end(0)
                yield h8_tag_id.MissingValue, match_end, match_end

            # Skip past the "
            pos = attr_m.end(0)
            attr_m = _ATTR_RE.match(self.s, pos, self.end_pos)

        if _TAG_LAST_RE.match(self.s, pos) is None:
            # Extra data at end of tag.  TODO: add messages for all these.
            raise LexError(self.s, pos)
974
975
# This is similar but not identical to
#     " ([^>"\x00]*) "    # double quoted value
#   | ' ([^>'\x00]*) '    # single quoted value
#
# Note: for unquoted values, & isn't allowed, and thus &amp; and &#99; and
# &#x99; are not allowed.  We could relax that?
#
# Used by AttrValueLexer, which takes the first rule that matches.
ATTR_VALUE_LEX = CHAR_LEX + [
    (r'[^>&\x00]+', h8_id.RawData),
    (r'.', h8_id.Invalid),
]

ATTR_VALUE_LEX_COMPILED = MakeLexer(ATTR_VALUE_LEX)
988
989
class AttrValueLexer(object):
    """Splits one attribute value into tokens.

      <a href="foo=99&bar">
      <a href='foo=99&bar'>
      <a href=unquoted>

    Call Reset() with the span of the value before using the other methods.
    """

    def __init__(self, s):
        # type: (str) -> None
        self.s = s
        self.start_pos = -1  # Invalid
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        # type: (int, int) -> None
        """Point this object at another value, so instances can be reused."""
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos

        self.start_pos, self.end_pos = start_pos, end_pos

    def NumTokens(self):
        # type: () -> int
        """Count the tokens in the value.

        Raises LexError at the start of the first h8_id.Invalid token.
        """
        count = 0
        tok_start = self.start_pos
        for tok_id, tok_end in self.Tokens():
            if tok_id == h8_id.Invalid:
                raise LexError(self.s, tok_start)
            tok_start = tok_end
            count += 1
        return count

    def Tokens(self):
        # type: () -> Iterator[Tuple[h8_id_t, int]]
        """Yield (token id, end position) for each token in the value."""
        pos = self.start_pos
        while pos < self.end_pos:
            # Find the first match, like above.
            # Note: frontend/match.py uses _LongestMatch(), which is different!
            # TODO: reconcile them.  This lexer should be expressible in re2c.
            token = None
            for pat, tok_id in ATTR_VALUE_LEX_COMPILED:
                m = pat.match(self.s, pos)
                if m is not None:
                    token = (tok_id, m.end(0))
                    break
            if token is None:
                raise AssertionError('h8_id.Invalid rule should have matched')

            yield token
            pos = token[1]