data_lang/htm8.py

OILS / data_lang / htm8.py View on Github | oils.pub

813 lines, 300 significant

1	"""data_lang/htm8.py
2
3	TODO
4
5	API:
6	- Get rid of AttrValueLexer - this should be in the TagLexer
7	- this also means that unquoted values can be more similar
8	- We can use a single lexer mode for everything inside <>
9	- the SPACE is the only difference
10	- Deprecate tag_lexer.GetTagName() in favor of lx.CanonicalTagName() or
11	_LiteralTagName()
12	- UTF-8 check, like JSON8
13	- re2c
14	- port lexer, which will fix static typing issues
15	- the abstraction needs to support submatch?
16	- for finding the end of a tag, etc.?
17
18	- LexError and ParseError need details
19	- harmonize with data_lang/j8.py, which uses error.Decode(msg, ...,
20	cur_line_num)
21
22	- Copy all errors into doc/ref/chap-errors.md
23	- This helps understand the language
24
25	- Update doc/htm8.md
26	- list of Algorithms:
27	- lex just the top level
28	- lex both levels
29	- and match tags - this is the level for value.Htm8Frag?
30	- convert to XML!
31	- lazy selection by tag, or attr (id= and class=)
32	- lazy selection by CSS selector expression
33	- convert to DOMTree
34	- sed-like replacement of DOM Tree or element
35	- untrusted HTML filter, e.g. like StackOverflow / Reddit
36	- this is Safe HTM8
37	- should have a zero alloc way to support this, with good errors?
38	- I think most of them silently strip data
39	"""
40
41	import re
42
43	from typing import Dict, List, Tuple, Optional, IO, Iterator, Any
44
45	from _devbuild.gen.htm8_asdl import (h8_id, h8_id_t, h8_tag_id, h8_tag_id_t,
46	h8_tag_id_str, attr_name_t, attr_value_t)
47	from doctools.util import log
48
49
class LexError(Exception):
    """Raised when the input cannot be split into tokens.

    Examples of lex errors:

    - h8_id.Invalid, like <> or &&
    - Unclosed <!-- <? <![CDATA[ <script> <style>
    """

    def __init__(self, s, start_pos):
        # type: (str, int) -> None
        # The whole buffer is kept (not just a snippet) so that callers can
        # compute their own context from start_pos.
        self.start_pos = start_pos
        self.s = s

    def __str__(self):
        # type: () -> str
        # Show at most 20 characters of input, starting at the error.
        begin = self.start_pos
        snippet = self.s[begin:begin + 20]
        return '(LexError %r)' % snippet
66
67
def _FindLineNum(s, error_pos):
    # type: (str, int) -> int
    """Return the 1-based number of the line that contains error_pos.

    A position that falls on a newline character belongs to the line that the
    newline terminates.  Positions past the end of s map to the last line.
    """
    # Every newline strictly before error_pos starts a new line.  A negative
    # position is clamped to 0 so it is not read as an offset from the end.
    newlines_before = s.count('\n', 0, max(error_pos, 0))
    return newlines_before + 1
82
83
class ParseError(Exception):
    """Raised when the tokens are fine but their structure is not.

    Examples of parse errors

    - unbalanced tag structure
    - ul_table.py errors
    """

    def __init__(self, msg, s=None, start_pos=-1):
        # type: (str, Optional[str], int) -> None
        # s and start_pos are optional as a pair: when the buffer is given, a
        # real position is expected too (checked in __str__).
        self.start_pos = start_pos
        self.s = s
        self.msg = msg

    def __str__(self):
        # type: () -> str
        if self.s is None:
            # No buffer: there is no location to report
            line_num = -1
            snippet = ''
        else:
            assert self.start_pos != -1, self.start_pos
            begin = self.start_pos
            # At most 20 characters of context
            snippet = self.s[begin:begin + 20]
            line_num = _FindLineNum(self.s, begin)

        return 'line %d: %r %r' % (line_num, self.msg, snippet)
110
111
class Output(object):
    """Copies pieces of an input buffer, and new text, to an output file.

    A cursor (self.pos) remembers how much of the input has been handled, so
    callers can alternate between echoing input (PrintUntil), dropping input
    (SkipTo) and inserting new text (Print).
    """

    def __init__(self, s, f, left_pos=0, right_pos=-1):
        # type: (str, IO[str], int, int) -> None
        # -1 is the sentinel for "up to the end of the buffer"
        if right_pos == -1:
            right_pos = len(s)

        self.s = s
        self.f = f
        self.pos = left_pos
        self.right_pos = right_pos

    def SkipTo(self, pos):
        # type: (int) -> None
        """Move the cursor to pos without writing anything."""
        self.pos = pos

    def PrintUntil(self, pos):
        # type: (int) -> None
        """Write the input between the cursor and pos, then move the cursor."""
        self.f.write(self.s[self.pos:pos])
        self.pos = pos

    def PrintTheRest(self):
        # type: () -> None
        """Write the input between the cursor and right_pos."""
        self.PrintUntil(self.right_pos)

    def Print(self, s):
        # type: (str) -> None
        """Write new text to the output; the cursor does not move."""
        self.f.write(s)
147
148
def MakeLexer(rules):
    """Compile a list of (pattern, id) rules into (compiled regex, id) pairs.

    Patterns are compiled with re.VERBOSE, so whitespace inside them is not
    significant.  The order of the rules is preserved.
    """
    compiled = []
    for pattern, tok_id in rules:
        compiled.append((re.compile(pattern, re.VERBOSE), tok_id))
    return compiled
151
152
153	#
154	# Eggex
155	#
156	# Tag = / ~['>']+ /
157
158	# Is this valid? A single character?
159	# Tag = / ~'>'* /
160
161	# Maybe better: / [NOT '>']+/
162	# capital letters not allowed there?
163	#
164	# But then this is confusing:
165	# / [NOT ~digit]+/
166	#
167	# / [NOT digit] / is [^\d]
168	# / ~digit / is \D
169	#
170	# Or maybe:
171	#
172	# / [~ digit]+ /
173	# / [~ '>']+ /
174	# / [NOT '>']+ /
175
176	# End = / '</' Tag '>' /
177	# StartEnd = / '<' Tag '/>' /
178	# Start = / '<' Tag '>' /
179	#
180	# EntityRef = / '&' dot{* N} ';' /
181
182	# Tag name, or attribute name
183	# colon is used in XML
184
185	# https://www.w3.org/TR/xml/#NT-Name
186	# Hm there is a lot of unicode stuff. We are simplifying parsing
187
# A name is an ASCII letter followed by any number of letters, digits, colons,
# underscores or hyphens.  It is interpolated into the tag, attribute and
# entity patterns below.
_NAME = r'[a-zA-Z][a-zA-Z0-9:_\-]*'  # must start with letter

# Rules for character references, as (pattern, token id) pairs.  This list is
# the shared prefix of HTM8_LEX and ATTR_VALUE_LEX.
#
# The patterns are compiled with re.VERBOSE by MakeLexer(), so the spaces in
# them are only for readability, and a literal # has to be written as \#.
#
# Order matters: the bare '&' rule is last, so it only applies when none of
# the well-formed references matched.
CHAR_LEX = [
    # Characters
    # https://www.w3.org/TR/xml/#sec-references
    (r'&\# [0-9]+ ;', h8_id.DecChar),
    (r'&\# x[0-9a-fA-F]+ ;', h8_id.HexChar),
    (r'& %s ;' % _NAME, h8_id.CharEntity),
    # Allow unquoted, and quoted
    (r'&', h8_id.BadAmpersand),
]
199
# Rules for the top-level lexer, as (pattern, token id) pairs.  Lexer._Read()
# tries them in order and takes the FIRST one that matches, so the order of
# this list is significant.
HTM8_LEX = CHAR_LEX + [
    # TODO: CommentBegin, ProcessingBegin, CDataBegin could have an additional
    # action associated with them?  The ending substring
    (r'<!--', h8_id.CommentBegin),

    # Processing instruction are used for the XML header:
    # <?xml version="1.0" encoding="UTF-8"?>
    # They are technically XML-only, but in HTML5, they are another kind of
    # comment:
    #
    #   https://developer.mozilla.org/en-US/docs/Web/API/ProcessingInstruction
    #
    (r'<\?', h8_id.ProcessingBegin),
    # Not necessary in HTML5, but occurs in XML
    (r'<!\[CDATA\[', h8_id.CDataBegin),  # <![CDATA[

    # Markup declarations
    # - In HTML5, there is only <!DOCTYPE html>
    # - XML has 4 more declarations: <!ELEMENT ...> ATTLIST ENTITY NOTATION
    #   - these seem to be part of DTD
    #   - it's useful to skip these, and be able to parse the rest of the document
    # - Note: < is allowed?
    #
    # This rule must stay after CommentBegin and CDataBegin, which also start
    # with <!
    (r'<! [^>\x00]+ >', h8_id.Decl),

    # Tags
    # Notes:
    # - We look for a valid tag name, but we don't validate attributes.
    #   That's done in the tag lexer.
    # - We don't allow leading whitespace
    # - Group 1 is the tag name; Lexer._Read() records its span
    (r'</ (%s) >' % _NAME, h8_id.EndTag),  # end </a>
    # self-closing <br/> comes before StartTag
    # could/should these be collapsed into one rule?
    (r'< (%s) [^>\x00]* />' % _NAME, h8_id.StartEndTag),  # self-closing <br/>
    (r'< (%s) [^>\x00]* >' % _NAME, h8_id.StartTag),  # start <a>

    # HTML5 allows unescaped > in raw data, but < is not allowed.
    # https://stackoverflow.com/questions/10462348/right-angle-bracket-in-html
    #
    # - My early blog has THREE errors when disallowing >
    # - So do some .wwz files
    (r'[^&<>\x00]+', h8_id.RawData),
    (r'>', h8_id.BadGreaterThan),
    # < is an error
    # Catch-all: any other single character (except a newline, which RawData
    # already accepts) is Invalid.
    (r'.', h8_id.Invalid),
]
245
246	# Old notes:
247	#
248	# Non-greedy matches are regular and can be matched in linear time
249	# with RE2.
250	#
251	# https://news.ycombinator.com/item?id=27099798
252	#
253
254	# This person tried to do it with a regex:
255	#
256	# https://skeptric.com/html-comment-regexp/index.html
257
258	# . is any char except newline
259	# https://re2c.org/manual/manual_c.html
260
261	# Discarded options
262	#(r'<!-- .*? -->', h8_id.Comment),
263
264	# Hack from Claude: \s\S instead of re.DOTALL. I don't like this
265	#(r'<!-- [\s\S]*? -->', h8_id.Comment),
#(r'<!-- (?:.|[\n])*? -->', h8_id.Comment),
267
# The top-level rules, compiled once when the module is imported.  This is the
# table that Lexer._Read() walks for every token.
HTM8_LEX_COMPILED = MakeLexer(HTM8_LEX)
269
270
class Lexer(object):
    """Top-level HTM8 lexer over the range [left_pos, right_pos) of a string.

    Read() returns one (token id, end position) pair per call; a token starts
    where the previous one ended.  The rules in HTM8_LEX_COMPILED are tried in
    order and the first one that matches wins.

    Nothing at or beyond right_pos is examined: not by the rules, not by the
    searches for closing delimiters, and not by LookAhead().  A token
    therefore never ends past right_pos.
    """

    def __init__(self, s, left_pos=0, right_pos=-1, no_special_tags=False):
        # type: (str, int, int, bool) -> None
        """
        Args:
          s: the buffer to lex
          left_pos: index of the first character to lex
          right_pos: index where lexing stops; -1 means len(s)
          no_special_tags: if True, what follows <script> and <style> is
            lexed with the ordinary rules, instead of being skipped up to the
            matching end tag as a single HtmlCData token
        """
        self.s = s
        self.pos = left_pos
        self.right_pos = len(s) if right_pos == -1 else right_pos
        self.no_special_tags = no_special_tags

        # string -> compiled regex pattern object
        self.cache = {}  # type: Dict[str, Any]

        # either </script> or </style> - we search until we see that
        self.search_state = None  # type: Optional[str]

        # Position of tag name, if applicable
        # - Set after you get a StartTag, EndTag, or StartEndTag
        # - Unset on other tags
        self.tag_pos_left = -1
        self.tag_pos_right = -1

    def _FindCloser(self, closer):
        # type: (str) -> int
        """Return the position just past the next `closer` in the range.

        Used for tokens that run from an opening delimiter to a fixed closing
        string: <!-- -->, <? ?> and <![CDATA[ ]]>.

        Raises:
          LexError: if `closer` doesn't occur in [self.pos, self.right_pos)
        """
        pos = self.s.find(closer, self.pos, self.right_pos)
        if pos == -1:
            raise LexError(self.s, self.pos)
        return pos + len(closer)

    def _Read(self):
        # type: () -> Tuple[h8_id_t, int]
        """Return the next (token id, end position).

        This does not advance self.pos (Read() does that), but it does update
        the tag name span and the <script> / <style> search state.

        Raises:
          LexError: for an unterminated <script>, <style>, <!--, <? or
            <![CDATA[ in [self.pos, self.right_pos)
        """
        if self.pos == self.right_pos:
            return h8_id.EndOfStream, self.pos

        assert self.pos < self.right_pos, self.pos

        if self.search_state is not None and not self.no_special_tags:
            # TODO: case-insensitive search for </SCRIPT> <SCRipt> ?
            #
            # Another strategy: enter a mode where we find ONLY the end tag
            # regex, and any data that's not <, and then check the canonical
            # tag name for 'script' or 'style'.
            pos = self.s.find(self.search_state, self.pos, self.right_pos)
            if pos == -1:
                # unterminated <script> or <style>
                raise LexError(self.s, self.pos)
            self.search_state = None
            # The token ends at the beginning of the end tag, which is then
            # lexed as a normal EndTag.
            return h8_id.HtmlCData, pos

        # Find the first match.
        # Note: frontend/match.py uses _LongestMatch(), which is different!
        # TODO: reconcile them.  This lexer should be expressible in re2c.

        for pat, tok_id in HTM8_LEX_COMPILED:
            # endpos makes the regex behave as if the string ended at
            # right_pos
            m = pat.match(self.s, self.pos, self.right_pos)
            if not m:
                continue

            if tok_id in (h8_id.StartTag, h8_id.EndTag, h8_id.StartEndTag):
                self.tag_pos_left = m.start(1)
                self.tag_pos_right = m.end(1)
            else:
                # Reset state
                self.tag_pos_left = -1
                self.tag_pos_right = -1

            # The *Begin rules only recognize the opening delimiter; the
            # token that's returned extends through the closing one.
            if tok_id == h8_id.CommentBegin:
                return h8_id.Comment, self._FindCloser('-->')

            if tok_id == h8_id.ProcessingBegin:
                return h8_id.Processing, self._FindCloser('?>')

            if tok_id == h8_id.CDataBegin:
                return h8_id.CData, self._FindCloser(']]>')

            if tok_id == h8_id.StartTag:
                # TODO: reduce allocations
                if (self.TagNameEquals('script') or
                        self.TagNameEquals('style')):
                    # <SCRipt a=b> -> </SCRipt>
                    self.search_state = '</' + self._LiteralTagName() + '>'

            return tok_id, m.end()

        raise AssertionError('h8_id.Invalid rule should have matched')

    def TagNameEquals(self, expected):
        # type: (str) -> bool
        """Compare the current tag's lower-cased name with `expected`.

        Only valid right after a StartTag, EndTag or StartEndTag token.
        """
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        # TODO: In C++, this does not need an allocation.  Can we test
        # directly?
        return expected == self.CanonicalTagName()

    def _LiteralTagName(self):
        # type: () -> str
        """Return the current tag's name exactly as written in the input."""
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        return self.s[self.tag_pos_left:self.tag_pos_right]

    def CanonicalTagName(self):
        # type: () -> str
        """Return the current tag's name in lower case."""
        tag_name = self._LiteralTagName()
        # Most tags are already lower case, so avoid allocation with this conditional
        # TODO: this could go in the mycpp runtime?
        if tag_name.islower():
            return tag_name
        else:
            return tag_name.lower()

    def Read(self):
        # type: () -> Tuple[h8_id_t, int]
        """Return the next (token id, end position) and advance past it."""
        tok_id, end_pos = self._Read()
        self.pos = end_pos  # advance
        return tok_id, end_pos

    def LookAhead(self, regex):
        # type: (str) -> bool
        """Return whether `regex` matches at the current position.

        The match has to fit inside the lexer's range, i.e. end at or before
        right_pos.  The position is not changed.

        Currently used for ul_table.py.  But taking a dynamic regex string is
        not the right interface.
        """
        # Cache the regex compilation.  This could also be LookAheadFor(THEAD)
        # or something.
        pat = self.cache.get(regex)
        if pat is None:
            pat = re.compile(regex)
            self.cache[regex] = pat

        m = pat.match(self.s, self.pos, self.right_pos)
        return m is not None
407
408
class AttrLexer(object):
    """Work in progress: reads the attributes of a single tag.

    Only the position bookkeeping is implemented.  ReadName(),
    AttrNameEquals(), ReadRawValue() and ReadValueAndDecode() are stubs that
    return None.

    Design notes
    ------------

    We can also invert this

    Unquoted (List[h8_id] tok_ids, List[int] end_pos)

    It would be nice to have a special case for the singleton, since that is
    very common.

    Simple (int tag_name_start, int tag_name_end, int attr_value_tag,
            int value_start, int value_end)
    This would cover many cases

    The other option is to create many different events, and have AttrValueLexer
    But I think that is annoying and overly detailed.

    Operations:
    - GetAttrRaw('foo')
    - AllAttrsRaw()
    - AllAttrsRawSlice()

    class= query - well we can do this with Space tokens I think - we should
    have an optimization
    id= query - ditto, we should just have a predicate

    Zero allocs:
    tag query - TagNameEquals()

    So I guess we have to write a HasClass('foo') and IdEquals('bar') on top of
    this.  Yes.

    tag_lx.Reset2(...)  # we should pass it the tag_name_end position
    tag_lx.Read() -> bool  # success or fail?  Or Attr or Invalid
    .AttrNameEquals('foo') -> bool  # id and class query
    .GetAttrName() -> str  # for getting them all
    .GetRawValue() -> Tuple[h8_tag_id, start, end]  # just beginning and end
    .GetValueTokens() -> Tuple[h8_tag_id, TokenList]
    .TokenList = Tuple[List[h8_id], List[int end_pos]]

    You could also have

    tag_lx.GetValueTokenId() -> Tuple[h8_id, end_pos]

    And then you read it until it's " or ' or space ?  We probably won't have
    that use case to start.
    """

    def __init__(self, s):
        # type: (str) -> None
        self.s = s
        self.tag_name_pos = -1  # Invalid

        # Init() has always stored its end position as `end_pos`, while this
        # constructor only created `tag_end_pos`.  Both names are defined
        # here and kept in sync, so either one can be read before and after
        # Init().
        self.tag_end_pos = -1
        self.end_pos = -1

    def Init(self, tag_name_pos, end_pos):
        # type: (int, int) -> None
        """Initialize so we can read names and values.

        Example:
          'x <a y>'  # tag_name_pos=4, end_pos=6
          'x <a>'    # tag_name_pos=4, end_pos=4

        In both examples tag_name_pos is the index just past the tag name, and
        end_pos is the index of the closing >.

        Calling Init() again reuses an AttrLexer instance for another tag.
        """
        assert tag_name_pos >= 0, tag_name_pos
        assert end_pos >= 0, end_pos

        self.tag_name_pos = tag_name_pos
        self.end_pos = end_pos
        self.tag_end_pos = end_pos

    def ReadName(self):
        # type: () -> Tuple[attr_name_t, int, int]
        """Reads the attribute name

        Not implemented yet: currently does nothing and returns None.

        EOF case:
          <a>
          <a >

        Error case:
          <a !>
          <a foo=bar !>
        """
        pass

    def AttrNameEquals(self, s):
        # type: (str) -> bool
        """
        Not implemented yet: currently does nothing and returns None.

        TODO: Must call this after ReadName() ?
        Because that can FAIL.
        """
        pass

    def ReadRawValue(self):
        # type: () -> Tuple[attr_value_t, int, int]
        """Read the attribute value.

        Not implemented yet: currently does nothing and returns None.

        In general, it is escaped or "raw"

        Note: Assuming ReadName() returned a value, this should NOT fail.
        """
        # NOTE: if = is not found, set state

        pass

    def SkipValue(self):
        # type: () -> None
        """Read the attribute value and discard it."""
        # Just ignore it and return
        self.ReadRawValue()

    def ReadValueAndDecode(self):
        # type: () -> str
        """Read the attribute value

        Not implemented yet: currently does nothing and returns None.
        """
        # TODO: tokenize it
        pass
523
524
525	# Tag names:
526	# Match <a or </a
527	# Match <h2, but not <2h
528	#
529	# HTML 5 doesn't restrict tag names at all
530	# https://html.spec.whatwg.org/#toc-syntax
531	#
532	# XML allows : - .
533	# https://www.w3.org/TR/xml/#NT-NameChar
534
535	# Namespaces for MathML, SVG
536	# XLink, XML, XMLNS
537	#
538	# https://infra.spec.whatwg.org/#namespaces
539	#
540	# Allow - for td-attrs
541
542	# Be very lenient - just no whitespace or special HTML chars
543	# I don't think this is more lenient than HTML5, though we should check.
# An unquoted attribute value: any run (possibly empty) of characters other
# than whitespace, < > & " ' and NUL.
_UNQUOTED_VALUE = r'''[^ \t\r\n<>&"'\x00]*'''

# TODO: we don't need to capture the tag name here?  That's done at the top
# level
#
# Matched just after the opening <: an optional / (for end tags), optional
# whitespace, then the tag name as group 1.
_TAG_RE = re.compile(r'/? \s* (%s)' % _NAME, re.VERBOSE)

# What must follow the last attribute: optional whitespace, an optional /
# (for self-closing tags), and the closing >.
_TAG_LAST_RE = re.compile(r'\s* /? >', re.VERBOSE)
551
552	# To match href="foo"
553	# Note: in HTML5 and XML, single quoted attributes are also valid
554
555	# <button disabled> is standard usage
556
557	# NOTE: This used to allow whitespace around =
558	# <a foo = "bar"> makes sense in XML
559	# But then you also have
560	# <a foo= bar> - which is TWO attributes, in HTML5
561	# So the space is problematic
562
# Matches one attribute, including the whitespace that must precede it.
#
# Groups:
#   1  attribute name
#   2  contents of a double quoted value (without the quotes)
#   3  contents of a single quoted value (without the quotes)
#   4  unquoted value
#
# Groups 2, 3 and 4 are ALTERNATIVES, so at most one of them participates in
# a match, and none of them does for a bare attribute like <button disabled>.
# They must be separated by the alternation operator |.  An escaped \| would
# match a literal pipe character, turning the three forms into a sequence
# that no real attribute satisfies.
_ATTR_RE = re.compile(
    r'''
\s+                     # Leading whitespace is required
(%s)                    # Attribute name
(?:                     # Optional attribute value
  \s* = \s*             # Spaces allowed around =
  (?:
    " ([^>"\x00]*) "    # double quoted value
  | ' ([^>'\x00]*) '    # single quoted value
  | (%s)                # Attribute value
  )
)?
''' % (_NAME, _UNQUOTED_VALUE), re.VERBOSE)
576
577
class TagLexer(object):
    """Lexes the inside of a single tag.

    Given a tag like <a href="..."> or <link type="..." />, the TagLexer
    provides a few operations:

    - What is the tag?
    - Iterate through the attributes, giving (name, value_start_pos, value_end_pos)

    Point it at a tag with Reset(start_pos, end_pos) before using it.
    """

    def __init__(self, s):
        # type: (str) -> None
        self.s = s
        self.start_pos = -1  # Invalid
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        # type: (int, int) -> None
        """Point this lexer at another tag, so that instances can be reused."""
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos

        self.start_pos, self.end_pos = start_pos, end_pos

    def WholeTagString(self):
        # type: () -> str
        """Return the entire tag string, e.g. <a href='foo'>"""
        return self.s[self.start_pos:self.end_pos]

    def GetTagName(self):
        # type: () -> str
        """Return the tag name as written in the input."""
        # The tag name is always the first event
        _, name_start, name_end = next(self.Tokens())
        return self.s[name_start:name_end]

    def GetSpanForAttrValue(self, attr_name):
        # type: (str) -> Tuple[int, int]
        """Return (start, end) of the value of the first attribute with the
        given name, or (-1, -1) if there is no such attribute.

        Used by oils_doc.py, for href shortcuts
        """
        # Algorithm: search for QuotedValue or UnquotedValue after AttrName
        # TODO: Could also cache these

        found_name = False
        for tok_id, start, end in self.Tokens():
            if found_name:
                # This is the token right after the matching name
                assert tok_id in (
                    h8_tag_id.QuotedValue, h8_tag_id.UnquotedValue,
                    h8_tag_id.MissingValue), h8_tag_id_str(tok_id)
                return start, end

            if tok_id == h8_tag_id.AttrName:
                if self.s[start:end] == attr_name:
                    # The value should come next
                    found_name = True

        return -1, -1

    def GetAttrRaw(self, attr_name):
        # type: (str) -> Optional[str]
        """
        Return the value, which may be UNESCAPED.

        Returns None when the tag has no such attribute.
        """
        start, end = self.GetSpanForAttrValue(attr_name)
        return None if start == -1 else self.s[start:end]

    def AllAttrsRawSlice(self):
        # type: () -> List[Tuple[str, int, int]]
        """
        Get a list of pairs [('class', 3, 5), ('href', 9, 12)]

        The two integers are the start and end of the attribute's value.
        """
        slices = []
        pending_name = None  # type: Optional[str]

        for tok_id, start, end in self.Tokens():
            if pending_name is not None:
                # The value comes right after the name
                assert tok_id in (
                    h8_tag_id.QuotedValue, h8_tag_id.UnquotedValue,
                    h8_tag_id.MissingValue), h8_tag_id_str(tok_id)
                # Note: quoted values may have &amp;
                # We would need ANOTHER lexer to unescape them, but we
                # don't need that for ul-table
                slices.append((pending_name, start, end))
                pending_name = None
            elif tok_id == h8_tag_id.AttrName:
                pending_name = self.s[start:end]

        return slices

    def AllAttrsRaw(self):
        # type: () -> List[Tuple[str, str]]
        """
        Get a list of pairs [('class', 'foo'), ('href', '?foo=1&amp;bar=2')]

        The quoted values may be escaped.  We would need another lexer to
        unescape them.
        """
        return [(name, self.s[start:end])
                for name, start, end in self.AllAttrsRawSlice()]

    def Tokens(self):
        # type: () -> Iterator[Tuple[h8_tag_id_t, int, int]]
        """
        Yields a sequence of tokens: Tag (AttrName AttrValue?)*

        Where each Token is (Type, start_pos, end_pos)

        Note that start and end are NOT redundant!  We skip over some unwanted
        characters.

        Raises RuntimeError if no tag name is found, and LexError if something
        other than the end of the tag follows the last attribute.
        """
        # + 1 skips the opening <
        tag_match = _TAG_RE.match(self.s, self.start_pos + 1)
        if tag_match is None:
            raise RuntimeError("Couldn't find HTML tag in %r" %
                               self.WholeTagString())
        yield h8_tag_id.TagName, tag_match.start(1), tag_match.end(1)

        pos = tag_match.end(0)

        # don't search past the end
        attr = _ATTR_RE.match(self.s, pos, self.end_pos)
        while attr is not None:
            yield h8_tag_id.AttrName, attr.start(1), attr.end(1)

            if attr.group(2) is not None:
                # double quoted
                value = (h8_tag_id.QuotedValue, attr.start(2), attr.end(2))
            elif attr.group(3) is not None:
                # single quoted - TODO: could have different token types
                value = (h8_tag_id.QuotedValue, attr.start(3), attr.end(3))
            elif attr.group(4) is not None:
                value = (h8_tag_id.UnquotedValue, attr.start(4), attr.end(4))
            else:
                # <button disabled> - an empty span right after the name
                value = (h8_tag_id.MissingValue, attr.end(0), attr.end(0))
            yield value

            # Skip past the closing quote, if any
            pos = attr.end(0)
            attr = _ATTR_RE.match(self.s, pos, self.end_pos)

        if _TAG_LAST_RE.match(self.s, pos) is None:
            # Extra data at end of tag.  TODO: add messages for all these.
            raise LexError(self.s, pos)
744
745
# Rules for the inside of an attribute value, as (pattern, token id) pairs.
#
# This is similar but not identical to
#    " ([^>"\x00]*) "    # double quoted value
#  | ' ([^>'\x00]*) '    # single quoted value
#
# Note: for unquoted values, & isn't allowed, and thus &amp; and &#99; and
# &#x99; are not allowed.  We could relax that?
ATTR_VALUE_LEX = CHAR_LEX + [
    # Anything except > & and NUL is plain data.  Quote characters are NOT
    # excluded here.
    (r'[^>&\x00]+', h8_id.RawData),
    # Catch-all for a single remaining character
    (r'.', h8_id.Invalid),
]

# Compiled once when the module is imported; used by AttrValueLexer.Tokens()
ATTR_VALUE_LEX_COMPILED = MakeLexer(ATTR_VALUE_LEX)
758
759
class AttrValueLexer(object):
    """Lexes a single attribute value, given as a [start_pos, end_pos) span.

    <a href="foo=99&amp;bar">
    <a href='foo=99&amp;bar'>
    <a href=unquoted>

    Point it at a value with Reset(start_pos, end_pos) before using it.
    """

    def __init__(self, s):
        # type: (str) -> None
        self.s = s
        self.start_pos = -1  # Invalid
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        # type: (int, int) -> None
        """Point this lexer at another value, so that instances can be reused."""
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos

        self.start_pos = start_pos
        self.end_pos = end_pos

    def NumTokens(self):
        # type: () -> int
        """Return the number of tokens in the value.

        Raises:
          LexError: if the value contains an Invalid token.  The error
            position is the start of that token.
        """
        num_tokens = 0
        pos = self.start_pos
        for tok_id, end_pos in self.Tokens():
            if tok_id == h8_id.Invalid:
                raise LexError(self.s, pos)
            pos = end_pos
            num_tokens += 1
        return num_tokens

    def Tokens(self):
        # type: () -> Iterator[Tuple[h8_id_t, int]]
        """Yield (token id, end position) for each token in the value.

        A token starts where the previous one ended; the first one starts at
        start_pos.  No token extends past end_pos.
        """
        pos = self.start_pos
        while pos < self.end_pos:
            # Find the first match, like above.
            # Note: frontend/match.py uses _LongestMatch(), which is different!
            # TODO: reconcile them.  This lexer should be expressible in re2c.
            for pat, tok_id in ATTR_VALUE_LEX_COMPILED:
                # Without the endpos argument, RawData would run past the end
                # of the value (through the closing quote and any following
                # attributes), and the reported end position would lie
                # outside the span we were given.
                m = pat.match(self.s, pos, self.end_pos)
                if m:
                    pos = m.end(0)
                    yield tok_id, pos
                    break
            else:
                raise AssertionError('h8_id.Invalid rule should have matched')