data_lang/htm8.py

OILS / data_lang / htm8.py View on Github | oils.pub

980 lines, 444 significant

1	"""data_lang/htm8.py
2
3	TODO
4
5	Migrate:
6
7	- maybe: migrate everything off of TagLexer()
8	- and AttrValueLexer() - this should require Validate()
9
10	API:
11	- Get rid of Reset()?
12	- Deprecate tag_lexer.GetTagName() in favor of lx.CanonicalTagName() or
13	_LiteralTagName()
14	- UTF-8 check, like JSON8
15	- re2c
16	- port lexer, which will fix static typing issues
17	- the abstraction needs to support submatch?
18	- for finding the end of a tag, etc.?
19	- and what about no match?
20
21	- harmonize LexError and ParseError with data_lang/j8.py, which uses
22	error.Decode(msg, ..., cur_line_num)
23
24	- Copy all errors into doc/ref/chap-errors.md
25	- This helps understand the language
26
27	- Update doc/htm8.md
28	- list of Algorithms:
29	- lex just the top level
30	- lex both levels
31	- and match tags - this is the level for value.Htm8Frag?
32	- convert to XML!
33	- lazy selection by tag, or attr (id= and class=)
34	- lazy selection by CSS selector expression
35	- convert to DOMTree
36	- sed-like replacement of DOM Tree or element
37	- untrusted HTML filter, e.g. like StackOverflow / Reddit
38	- this is Safe HTM8
39	- should have a zero alloc way to support this, with good errors?
40	- I think most of them silently strip data
41	"""
42
43	import re
44
45	from typing import Dict, List, Tuple, Optional, IO, Iterator, Any
46
47	from _devbuild.gen.htm8_asdl import (h8_id, h8_id_t, h8_tag_id, h8_tag_id_t,
48	h8_tag_id_str, attr_name, attr_name_t,
49	attr_name_str, attr_value_e, attr_value_t,
50	h8_val_id)
51	from doctools.util import log
52
53
class LexError(Exception):
    """Raised when the input can't be split into valid tokens.

    Examples of lex errors:

    - h8_id.Invalid, like <> or &&
    - Unclosed <!-- <? <![CDATA[ <script> <style>

    Attributes:
      msg: description of the problem
      code_str: the whole string being lexed
      start_pos: offset into code_str where the problem was detected
    """

    def __init__(self, msg, code_str, start_pos):
        # type: (str, str, int) -> None
        self.msg = msg
        self.code_str = code_str
        self.start_pos = start_pos

    def __str__(self):
        # type: () -> str
        # Show at most 20 characters of context, starting at the error.
        begin = self.start_pos
        snippet = self.code_str[begin:begin + 20]
        return '(LexError %r %r)' % (self.msg, snippet)
72
73
74	def _FindLineNum(s, error_pos):
75	# type: (str, int) -> int
76	current_pos = 0
77	line_num = 1
78	while True:
79	newline_pos = s.find('\n', current_pos)
80	#log('current = %d, N %d, line %d', current_pos, newline_pos, line_num)
81
82	if newline_pos == -1: # this is the last line
83	return line_num
84	if newline_pos >= error_pos:
85	return line_num
86	line_num += 1
87	current_pos = newline_pos + 1
88
89
class ParseError(Exception):
    """Raised when the tokens are valid but their structure is not.

    Examples of parse errors

    - unbalanced tag structure
    - ul_table.py errors

    Attributes:
      msg: description of the problem
      s: the string being parsed, or None when there is no location
      start_pos: offset into s; must not be -1 when s is given
    """

    def __init__(self, msg, s=None, start_pos=-1):
        # type: (str, Optional[str], int) -> None
        self.msg = msg
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        # type: () -> str
        if self.s is None:
            # No source location is available
            line_num = -1
            snippet = ''
        else:
            assert self.start_pos != -1, self.start_pos
            begin = self.start_pos
            snippet = self.s[begin:begin + 20]
            line_num = _FindLineNum(self.s, begin)

        return 'line %d: %r %r' % (line_num, self.msg, snippet)
116
117
class Output(object):
    """Writer for the sed-like "replacement" model.

    Wraps an input string and an output file, and keeps a cursor into the
    input.  Callers either copy text FROM the input up to some position, or
    write brand new text to the output.
    """

    def __init__(self, s, f, left_pos=0, right_pos=-1):
        # type: (str, IO[str], int, int) -> None
        self.s = s
        self.f = f
        self.pos = left_pos
        # -1 is the sentinel for "through the end of the input"
        if right_pos == -1:
            right_pos = len(s)
        self.right_pos = right_pos

    def SkipTo(self, pos):
        # type: (int) -> None
        """Move the input cursor to pos without writing anything."""
        self.pos = pos

    def PrintUntil(self, pos):
        # type: (int) -> None
        """Copy the input from the cursor up to pos, then move the cursor."""
        self.f.write(self.s[self.pos:pos])
        self.pos = pos

    def PrintTheRest(self):
        # type: () -> None
        """Copy the input from the cursor up to right_pos."""
        self.PrintUntil(self.right_pos)

    def Print(self, s):
        # type: (str) -> None
        """Write new text to the output; the input cursor doesn't move."""
        self.f.write(s)
155
156
def MakeLexer(rules):
    """Compile a list of (pattern, id) rules.

    Every pattern is compiled with re.VERBOSE, so unescaped whitespace in a
    pattern is ignored and a literal '#' must be escaped.

    Returns a list of (compiled pattern, id) pairs, in the same order.
    """
    compiled = []
    for pat, tok_id in rules:
        compiled.append((re.compile(pat, re.VERBOSE), tok_id))
    return compiled
159
160
161	#
162	# Lexers
163	#
164
# Shared by tag names, attribute names, and character entity names like &amp;
_NAME = r'[a-zA-Z][a-zA-Z0-9:_\-]*'  # must start with letter

# Rules for character references.  They are reused by both HTM8_LEX and
# QUOTED_VALUE_LEX.  Order matters: the bare & rule must come last.
CHAR_LEX = [
    # Characters
    # https://www.w3.org/TR/xml/#sec-references
    (r'&\# [0-9]+ ;', h8_id.DecChar),
    (r'&\# x[0-9a-fA-F]+ ;', h8_id.HexChar),
    (r'& %s ;' % _NAME, h8_id.CharEntity),
    # Allow unquoted, and quoted
    (r'&', h8_id.BadAmpersand),
]

# Top-level rules.  The Lexer tries them in order and takes the FIRST match,
# so more specific rules must come before more general ones.
HTM8_LEX = CHAR_LEX + [
    # TODO: CommentBegin, ProcessingBegin, CDataBegin could have an additional
    # action associated with them?  The ending substring
    (r'<!--', h8_id.CommentBegin),

    # Processing instructions are used for the XML header:
    # <?xml version="1.0" encoding="UTF-8"?>
    # They are technically XML-only, but in HTML5, they are another kind of
    # comment:
    #
    #   https://developer.mozilla.org/en-US/docs/Web/API/ProcessingInstruction
    #
    (r'<\?', h8_id.ProcessingBegin),
    # Not necessary in HTML5, but occurs in XML
    (r'<!\[CDATA\[', h8_id.CDataBegin),  # <![CDATA[

    # Markup declarations
    # - In HTML5, there is only <!DOCTYPE html>
    # - XML has 4 more declarations: <!ELEMENT ...> ATTLIST ENTITY NOTATION
    #   - these seem to be part of DTD
    #   - it's useful to skip these, and be able to parse the rest of the document
    # - Note: < is allowed?
    (r'<! [^>\x00]+ >', h8_id.Decl),

    # Tags
    # Notes:
    # - We look for a valid tag name, but we don't validate attributes.
    #   That's done in the tag lexer.
    # - We don't allow leading whitespace
    # - Group 1 is the tag name; the Lexer records its span
    (r'</ (%s) >' % _NAME, h8_id.EndTag),
    # self-closing <br/> comes before StartTag
    # could/should these be collapsed into one rule?
    (r'< (%s) [^>\x00]* />' % _NAME, h8_id.StartEndTag),  # self-closing <br/>
    (r'< (%s) [^>\x00]* >' % _NAME, h8_id.StartTag),  # start <a>

    # HTML5 allows unescaped > in raw data, but < is not allowed.
    # https://stackoverflow.com/questions/10462348/right-angle-bracket-in-html
    #
    # - My early blog has THREE errors when disallowing >
    # - So do some .wwz files
    (r'[^&<>\x00]+', h8_id.RawData),
    (r'>', h8_id.BadGreaterThan),
    # NUL is the end, an accommodation for re2c.  Like we do in frontend/match.
    (r'\x00', h8_id.EndOfStream),
    # This includes < - it is not BadLessThan because it's NOT recoverable
    (r'.', h8_id.Invalid),
]

# Old notes:
#
# Non-greedy matches are regular and can be matched in linear time
# with RE2.
#
# https://news.ycombinator.com/item?id=27099798
#

# This person tried to do it with a regex:
#
# https://skeptric.com/html-comment-regexp/index.html

# . is any char except newline
# https://re2c.org/manual/manual_c.html

# Discarded options
#(r'<!-- .*? -->', h8_id.Comment),

# Hack from Claude: \s\S instead of re.DOTALL.  I don't like this
#(r'<!-- [\s\S]*? -->', h8_id.Comment),
#(r'<!-- (?:.|[\n])*? -->', h8_id.Comment),

HTM8_LEX_COMPILED = MakeLexer(HTM8_LEX)
248
249
class Lexer(object):
    """Top-level HTM8 lexer over s[left_pos:right_pos].

    Each Read() returns (token id, end position).  A token starts where the
    previous one ended.  Rules in HTM8_LEX_COMPILED are tried in order, and
    the first one that matches wins.

    Matching and searching never look past right_pos, so a token can't extend
    beyond the region the caller asked for.

    Constructor args:
      s: the whole document
      left_pos: offset where lexing starts
      right_pos: offset where lexing stops; -1 means len(s)
      no_special_tags: if True, the contents of <script> and <style> are
        lexed like any other markup, rather than returned as one HtmlCData
        token
    """

    def __init__(self, s, left_pos=0, right_pos=-1, no_special_tags=False):
        # type: (str, int, int, bool) -> None
        self.s = s
        self.pos = left_pos
        self.right_pos = len(s) if right_pos == -1 else right_pos
        self.no_special_tags = no_special_tags

        # string -> compiled regex pattern object
        self.cache = {}  # type: Dict[str, Any]

        # either </script> or </style> - we search until we see that
        self.search_state = None  # type: Optional[str]

        # Position of tag name, if applicable
        # - Set after you get a StartTag, EndTag, or StartEndTag
        # - Unset on other tags
        self.tag_pos_left = -1
        self.tag_pos_right = -1

    def _EndOfDelimited(self, terminator, err_msg):
        # type: (str, str) -> int
        """Return the position just past the next terminator.

        The search starts at the current position and stops at right_pos.

        Raises:
          LexError: with err_msg, if the terminator isn't found
        """
        pos = self.s.find(terminator, self.pos, self.right_pos)
        if pos == -1:
            raise LexError(err_msg, self.s, self.pos)
        return pos + len(terminator)

    def _Read(self):
        # type: () -> Tuple[h8_id_t, int]
        """Compute the next token without advancing self.pos.

        Raises:
          LexError: for an unterminated comment, processing instruction,
            CDATA section, <script> or <style>
        """
        if self.pos == self.right_pos:
            return h8_id.EndOfStream, self.pos

        assert self.pos < self.right_pos, self.pos

        if self.search_state is not None and not self.no_special_tags:
            # TODO: case-insensitive search for </SCRIPT> <SCRipt> ?
            #
            # Another strategy: enter a mode where we find ONLY the end tag
            # regex, and any data that's not <, and then check the canonical
            # tag name for 'script' or 'style'.
            pos = self.s.find(self.search_state, self.pos, self.right_pos)
            if pos == -1:
                raise LexError('Unterminated <script> or <style>', self.s,
                               self.pos)
            self.search_state = None
            # The token ends at the BEGINNING of the closing tag, which is
            # lexed as a regular EndTag next.
            return h8_id.HtmlCData, pos

        # Find the first match.
        # Note: frontend/match.py uses _LongestMatch(), which is different!
        # TODO: reconcile them.  This lexer should be expressible in re2c.

        for pat, tok_id in HTM8_LEX_COMPILED:
            # The third argument keeps the match inside [pos, right_pos)
            m = pat.match(self.s, self.pos, self.right_pos)
            if m is None:
                continue

            if tok_id in (h8_id.StartTag, h8_id.EndTag, h8_id.StartEndTag):
                self.tag_pos_left = m.start(1)
                self.tag_pos_right = m.end(1)
            else:
                # Reset state
                self.tag_pos_left = -1
                self.tag_pos_right = -1

            # These three rules only match the opening delimiter.  The token
            # extends through the closing delimiter.
            if tok_id == h8_id.CommentBegin:
                return h8_id.Comment, self._EndOfDelimited(
                    '-->', 'Unterminated <!--')

            if tok_id == h8_id.ProcessingBegin:
                return h8_id.Processing, self._EndOfDelimited(
                    '?>', 'Unterminated <?')

            if tok_id == h8_id.CDataBegin:
                return h8_id.CData, self._EndOfDelimited(
                    ']]>', 'Unterminated <![CDATA[')

            if tok_id == h8_id.StartTag:
                # TODO: reduce allocations
                if (self.TagNameEquals('script') or
                        self.TagNameEquals('style')):
                    # <SCRipt a=b> -> </SCRipt>
                    self.search_state = '</' + self._LiteralTagName() + '>'

            return tok_id, m.end()

        raise AssertionError('h8_id.Invalid rule should have matched')

    def TagNamePos(self):
        # type: () -> int
        """Return the position just past the tag name.

        Only valid right after a StartTag, EndTag, or StartEndTag.
        """
        assert self.tag_pos_right != -1, self.tag_pos_right
        return self.tag_pos_right

    def TagNameEquals(self, expected):
        # type: (str) -> bool
        """Compare the lower-cased tag name of the last tag token to expected.

        Only valid right after a StartTag, EndTag, or StartEndTag.
        """
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        # TODO: In C++, this does not need an allocation.  Can we test
        # directly?
        return expected == self.CanonicalTagName()

    def _LiteralTagName(self):
        # type: () -> str
        """Return the tag name exactly as written, e.g. 'SCRipt'."""
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        return self.s[self.tag_pos_left:self.tag_pos_right]

    def CanonicalTagName(self):
        # type: () -> str
        """Return the tag name in lower case."""
        tag_name = self._LiteralTagName()
        # Most tags are already lower case, so avoid allocation with this
        # conditional
        # TODO: this could go in the mycpp runtime?
        if tag_name.islower():
            return tag_name
        else:
            return tag_name.lower()

    def Read(self):
        # type: () -> Tuple[h8_id_t, int]
        """Return the next (token id, end position), and advance past it."""
        tok_id, end_pos = self._Read()
        self.pos = end_pos  # advance
        return tok_id, end_pos

    def LookAhead(self, regex):
        # type: (str) -> bool
        """Return whether regex matches at the current position.

        The position is not advanced, and the match may not extend past
        right_pos.

        Currently used for ul_table.py.  But taking a dynamic regex string is
        not the right interface.
        """
        # Cache the regex compilation.  This could also be LookAheadFor(THEAD)
        # or something.
        pat = self.cache.get(regex)
        if pat is None:
            pat = re.compile(regex)
            self.cache[regex] = pat

        m = pat.match(self.s, self.pos, self.right_pos)
        return m is not None
391
392
# Rules for AttrLexer.ReadName(), tried in order.  Group 1 of the first rule is
# the attribute name, and group 2 is the optional = sign.
A_NAME_LEX = [
    # Leading whitespace is required, to separate attributes.
    #
    # If the = is not present, then we set the lexer in a state for
    # attr_value_e.Missing.
    (r'\s+ (%s) \s* (=)? \s*' % _NAME, attr_name.Ok),
    # unexpected EOF

    # The closing > or /> is treated as end of stream, and it's not an error.
    (r'\s* /? >', attr_name.Done),

    # NUL should not be possible, because the top-level tag rules in HTM8_LEX
    # use [^>\x00]*, so a tag token never contains NUL.

    # This includes < - it is not BadLessThan because it's NOT recoverable
    # Note: . does not match a newline.
    (r'.', attr_name.Invalid),
]

A_NAME_LEX_COMPILED = MakeLexer(A_NAME_LEX)

# Here we just loop on regular tokens
#
# Examples:
# <a href = unquoted&foo >
# <a href = unquoted&foo >  # BadAmpersand is allowed I guess
# <a href ="unquoted&foo" >  # double quoted
# <a href ='unquoted&foo' >  # single quoted
# <a href = what"foo" >  # HTML5 allows this, but we could disallow it if
# it's not common.  It opens up the j"" and $"" extensions
# <a href = what'foo' >  # ditto

# TODO: get rid of OLD copy
# The OLD pattern may match the empty string (*), the new one may not (+).
_UNQUOTED_VALUE_OLD = r'''[^ \t\r\n<>&"'\x00]*'''
_UNQUOTED_VALUE = r'''[^ \t\r\n<>&"'\x00]+'''

# What comes after = ?
A_VALUE_LEX = [
    (r'"', h8_val_id.DoubleQuote),
    (r"'", h8_val_id.SingleQuote),
    (_UNQUOTED_VALUE, h8_val_id.UnquotedVal),
    # Anything else, e.g. the > in <a foo = >
    (r'.', h8_val_id.NoMatch),
]

A_VALUE_LEX_COMPILED = MakeLexer(A_VALUE_LEX)

# What's inside "" or '' ?
QUOTED_VALUE_LEX = CHAR_LEX + [
    (r'"', h8_id.DoubleQuote),
    (r"'", h8_id.SingleQuote),
    (r'<', h8_id.BadLessThan),  # BadAmpersand is in CharLex

    # TODO: think about whitespace for efficient class= queries?
    #(r'[ \r\n\t]', h8_id.Whitespace), # terminates unquoted values
    (r'''[^"'<>&\x00]+''', h8_id.RawData),
    # This includes > - it is not BadGreaterThan because it's NOT recoverable
    (r'.', h8_id.Invalid),
]

QUOTED_VALUE_LEX_COMPILED = MakeLexer(QUOTED_VALUE_LEX)
451
452
class AttrLexer(object):
    """Reads the attribute names and values inside one start tag.

    Typical usage:

        while True:
            n, start_pos, end_pos = attr_lx.ReadName()
            if n == attr_name.Ok:
                if attr_lx.AttrNameEquals('div'):
                    print('div')

                # TODO: also pass Optional[List[]] out_tokens?
                v, start_pos, end_pos = attr_lx.ReadValue()

    The API is stateful: after every ReadName() that returns attr_name.Ok,
    you must call ReadValue() before the next ReadName().
    """

    def __init__(self, s):
        # type: (str) -> None
        self.s = s

        self.tok_id = h8_id.Invalid  # Uninitialized
        self.tag_name_pos = -1  # Invalid
        self.tag_end_pos = -1
        # Set by Init().  Initialized here too, so the attribute always exists.
        self.end_pos = -1
        self.must_not_exceed_pos = -1

        self.pos = -1

        self.name_start = -1
        self.name_end = -1
        self.next_value_is_missing = False

        self.init_t = -1
        self.init_e = -1

    def Init(self, tok_id, tag_name_pos, end_pos):
        # type: (h8_id_t, int, int) -> None
        """Initialize so we can read names and values.

        Example:
          'x <a y>'  # tag_name_pos=4, end_pos=6
          'x <a>'    # tag_name_pos=4, end_pos=4

        The Init() method is used to reuse instances of the AttrLexer object.

        Args:
          tok_id: h8_id.StartTag or h8_id.StartEndTag
          tag_name_pos: position just past the tag name
          end_pos: end position of the tag token

        Raises:
          AssertionError: if tok_id is any other token type
        """
        assert tag_name_pos >= 0, tag_name_pos
        assert end_pos >= 0, end_pos

        self.tok_id = tok_id
        self.tag_name_pos = tag_name_pos
        self.end_pos = end_pos

        # Check for ambiguous <img src=/>
        if tok_id == h8_id.StartTag:
            self.must_not_exceed_pos = end_pos - 1  # account for >
        elif tok_id == h8_id.StartEndTag:
            self.must_not_exceed_pos = end_pos - 2  # account for />
        else:
            raise AssertionError(tok_id)

        self.pos = tag_name_pos

        # For Reset()
        self.init_t = tag_name_pos
        self.init_e = end_pos

    def Reset(self):
        # type: () -> None
        """Rewind to the positions given to the last Init()."""

        # TODO: maybe GetAttrRaw() should call this directly?  But not any of
        # the AllAttrs() methods?
        self.tag_name_pos = self.init_t
        self.end_pos = self.init_e
        self.pos = self.init_t

    def ReadName(self):
        # type: () -> Tuple[attr_name_t, int, int]
        """Reads the attribute name

        EOF case:
          <a>
          <a >

        Error case:
          <a !>
          <a foo=bar !>

        Returns:
          (attr_name.Ok, name_start, name_end), or
          (attr_name.Done, -1, -1) at the closing > or />, or
          (attr_name.Invalid, -1, -1) on bad input.  The position is not
          advanced in the Invalid case.
        """
        for pat, a in A_NAME_LEX_COMPILED:
            m = pat.match(self.s, self.pos)
            if m is None:
                continue

            if a == attr_name.Invalid:
                return attr_name.Invalid, -1, -1

            self.pos = m.end(0)  # Advance if it's not invalid

            if a == attr_name.Ok:
                self.name_start = m.start(1)
                self.name_end = m.end(1)
                # Is the equals sign missing?  Set state.
                if m.group(2) is None:
                    self.next_value_is_missing = True
                    # HACK: REWIND, since we don't want to consume whitespace
                    self.pos = self.name_end
                else:
                    self.next_value_is_missing = False
                return attr_name.Ok, self.name_start, self.name_end

            # Reset state - e.g. you must call AttrNameEquals
            self.name_start = -1
            self.name_end = -1

            if a == attr_name.Done:
                return attr_name.Done, -1, -1

            raise AssertionError(a)

        # No rule matched.  The catch-all '.' rule doesn't match a newline, so
        # input like '<a\n!>' ends up here.  Report it like any other bad
        # character, so callers raise LexError rather than crashing.
        return attr_name.Invalid, -1, -1

    def _CanonicalAttrName(self):
        # type: () -> str
        """Return the lower case attribute name.

        Must call after ReadName()
        """
        assert self.name_start >= 0, self.name_start
        assert self.name_end >= 0, self.name_end

        # Not called 'attr_name', which would shadow the imported module
        name = self.s[self.name_start:self.name_end]
        if name.islower():
            return name
        else:
            return name.lower()

    def AttrNameEquals(self, expected):
        # type: (str) -> bool
        """
        Must call after ReadName()

        TODO: This can be optimized to be "in place", with zero allocs.
        """
        return expected == self._CanonicalAttrName()

    def _QuotedRead(self):
        # type: () -> Tuple[h8_id_t, int]
        """Return (token id, end position) for the next token inside quotes.

        Doesn't advance self.pos.
        """
        for pat, tok_id in QUOTED_VALUE_LEX_COMPILED:
            m = pat.match(self.s, self.pos)
            if m:
                return tok_id, m.end(0)

        context = self.s[self.pos:self.pos + 10]
        raise AssertionError('h8_id.Invalid rule should have matched %r' %
                             context)

    def _ReadQuotedValue(self, left_pos, closing_id, value_type, err_msg):
        # type: (int, h8_id_t, attr_value_t, str) -> Tuple[attr_value_t, int, int]
        """Read tokens until the closing quote.

        Args:
          left_pos: position just past the opening quote
          closing_id: h8_id.DoubleQuote or h8_id.SingleQuote
          value_type: what to return as the first element of the result
          err_msg: message for the LexError

        Returns:
          (value_type, left_pos, position of the closing quote).  self.pos is
          left just past the closing quote.

        Raises:
          LexError: if an h8_id.Invalid token appears inside the quotes
        """
        self.pos = left_pos
        while True:
            tok_id, q_end_pos = self._QuotedRead()
            if tok_id == h8_id.Invalid:
                raise LexError(err_msg, self.s, self.pos)
            if tok_id == closing_id:
                right_pos = self.pos
                self.pos = q_end_pos  # Advance past the closing quote
                return value_type, left_pos, right_pos
            self.pos = q_end_pos  # Advance _QuotedRead

    def ReadValue(self, tokens_out=None):
        # type: (Optional[List[Tuple[h8_id_t, int]]]) -> Tuple[attr_value_t, int, int]
        """Read the attribute value.

        In general, it is escaped or "raw"

        Can only be called after a SUCCESSFUL ReadName().
        Assuming ReadName() returned a value, this should NOT fail.

        Args:
          tokens_out: not used yet

        Returns:
          (value type, start, end).  The positions exclude the quotes, and are
          -1 for attr_value_e.Missing and attr_value_e.Empty.

        Raises:
          LexError: for an invalid token inside quotes, or for an unquoted
            value that runs into the closing > or />
        """
        # ReadName() invariant
        assert self.name_start >= 0, self.name_start
        assert self.name_end >= 0, self.name_end

        self.name_start = -1
        self.name_end = -1

        if self.next_value_is_missing:
            # Do not advance self.pos
            return attr_value_e.Missing, -1, -1

        # Now read " ', unquoted or empty= is valid too.
        for pat, a in A_VALUE_LEX_COMPILED:
            m = pat.match(self.s, self.pos)
            if m is None:
                continue

            first_end_pos = m.end(0)
            # We shouldn't go past the end
            assert first_end_pos <= self.end_pos, \
                'first_end_pos = %d should be less than self.end_pos = %d' % (first_end_pos, self.end_pos)

            # Note: Unquoted value can't contain & etc. now, so there
            # is no unquoting, and no respecting tokens_raw.
            if a == h8_val_id.UnquotedVal:
                if first_end_pos > self.must_not_exceed_pos:
                    raise LexError(
                        'Ambiguous slash: last attribute should be quoted',
                        self.s, first_end_pos)
                self.pos = first_end_pos  # Advance
                return attr_value_e.Unquoted, m.start(0), first_end_pos

            # TODO: respect tokens_out
            if a == h8_val_id.DoubleQuote:
                return self._ReadQuotedValue(
                    first_end_pos, h8_id.DoubleQuote,
                    attr_value_e.DoubleQuoted,
                    'ReadValue() got invalid token (DQ)')

            # TODO: respect tokens_out
            if a == h8_val_id.SingleQuote:
                return self._ReadQuotedValue(
                    first_end_pos, h8_id.SingleQuote,
                    attr_value_e.SingleQuoted,
                    'ReadValue() got invalid token (SQ)')

            if a == h8_val_id.NoMatch:
                # <a foo = >
                return attr_value_e.Empty, -1, -1

        raise AssertionError('h8_val_id.NoMatch rule should have matched')
692
693
def GetAttrRaw(attr_lx, name):
    # type: (AttrLexer, str) -> Optional[str]
    """Return the raw value of the first attribute called name, or None.

    The name is compared with the lower-cased attribute name.

    Raises:
      LexError: if the lexer reports an invalid attribute name
    """
    while True:
        n, _name_start, _name_end = attr_lx.ReadName()

        if n == attr_name.Done:
            return None

        if n == attr_name.Invalid:
            raise LexError('GetAttrRaw() got invalid token', attr_lx.s,
                           attr_lx.pos)

        if n != attr_name.Ok:
            raise AssertionError()

        # The name must be tested BEFORE ReadValue(), which clears it.
        found = attr_lx.AttrNameEquals(name)

        # Problem with stateful API: You are forced to either ReadValue()
        # or SkipValue(), even for attributes we don't care about.
        _v, val_start, val_end = attr_lx.ReadValue()
        if found:
            return attr_lx.s[val_start:val_end]
716
717
def AllAttrsRawSlice(attr_lx):
    # type: (AttrLexer) -> List[Tuple[int, int, attr_value_t, int, int]]
    """Return one tuple per attribute, in order.

    Each tuple is (name_start, name_end, value type, val_start, val_end),
    where the positions index into attr_lx.s.

    Raises:
      LexError: if the lexer reports an invalid attribute name
    """
    result = []
    while True:
        n, name_start, name_end = attr_lx.ReadName()

        if n == attr_name.Done:
            return result

        if n == attr_name.Invalid:
            raise LexError('AllAttrsRaw() got invalid token', attr_lx.s,
                           attr_lx.pos)

        if n != attr_name.Ok:
            raise AssertionError()

        v, val_start, val_end = attr_lx.ReadValue()
        result.append((name_start, name_end, v, val_start, val_end))
743
744
def AllAttrsRaw(attr_lx):
    # type: (AttrLexer) -> List[Tuple[str, str]]
    """Return (name, raw value) string pairs for every attribute.

    Example: [('class', 'foo'), ('href', '?foo=1&bar=2')]

    The values are sliced straight out of the input, so quoted values may
    still be escaped.  We would need another lexer to unescape them.
    """
    slices = AllAttrsRawSlice(attr_lx)
    s = attr_lx.s
    return [(s[name_start:name_end], s[val_start:val_end])
            for name_start, name_end, _val_id, val_start, val_end in slices]
761
762
763	#
764	# OLD API - REMOVE THIS
765	#
766
767	# Tag names:
768	# Match <a or </a
769	# Match <h2, but not <2h
770	#
771	# HTML 5 doesn't restrict tag names at all
772	# https://html.spec.whatwg.org/#toc-syntax
773	#
774	# XML allows : - .
775	# https://www.w3.org/TR/xml/#NT-NameChar
776
777	# Namespaces for MathML, SVG
778	# XLink, XML, XMLNS
779	#
780	# https://infra.spec.whatwg.org/#namespaces
781	#
782	# Allow - for td-attrs
783
784	# TODO: we don't need to capture the tag name here? That's done at the top
785	# level
# Matches what follows the < of a tag: an optional / and then the tag name,
# which is group 1.
_TAG_RE = re.compile(r'/? \s* (%s)' % _NAME, re.VERBOSE)

# Matches the end of a tag: > or />
_TAG_LAST_RE = re.compile(r'\s* /? >', re.VERBOSE)

# To match href="foo"
# Note: in HTML5 and XML, single quoted attributes are also valid

# <button disabled> is standard usage

# NOTE: This used to allow whitespace around =
# <a foo = "bar">  makes sense in XML
# But then you also have
# <a foo= bar>  - which is TWO attributes, in HTML5
# So the space is problematic

# Groups:
#   1 - attribute name
#   2 - double quoted value, without the quotes
#   3 - single quoted value, without the quotes
#   4 - unquoted value
# Groups 2-4 are all None when there is no = sign.
#
# The three kinds of value are ALTERNATIVES, so they must be separated by an
# unescaped |.  An escaped \| would match a literal pipe character instead.
_ATTR_RE = re.compile(
    r'''
\s+                     # Leading whitespace is required
(%s)                    # Attribute name
(?:                     # Optional attribute value
  \s* = \s*             # Spaces allowed around =
  (?:
    " ([^>"\x00]*) "    # double quoted value
  | ' ([^>'\x00]*) '    # single quoted value
  | (%s)                # Attribute value
  )
)?
''' % (_NAME, _UNQUOTED_VALUE_OLD), re.VERBOSE)
814
815
class TagLexer(object):
    """
    Given a tag like <a href="..."> or <link type="..." />, the TagLexer
    provides a few operations:

    - What is the tag?
    - Iterate through the attributes, giving (name, value_start_pos, value_end_pos)

    Call Reset() with the span of a tag before using the other methods.
    """

    def __init__(self, s):
        # type: (str) -> None
        self.s = s
        self.start_pos = -1  # Invalid
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        # type: (int, int) -> None
        """Reuse instances of this object."""
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos

        self.start_pos = start_pos
        self.end_pos = end_pos

    def WholeTagString(self):
        # type: () -> str
        """Return the entire tag string, e.g. <a href='foo'>"""
        return self.s[self.start_pos:self.end_pos]

    def GetTagName(self):
        # type: () -> str
        """Return the tag name as written, which is the first token."""
        _tok_id, left, right = next(self.Tokens())
        return self.s[left:right]

    def GetSpanForAttrValue(self, attr_name):
        # type: (str) -> Tuple[int, int]
        """
        Used by oils_doc.py, for href shortcuts

        Returns the (start, end) span of the value of the first attribute
        whose name is exactly attr_name, or (-1, -1) if there is none.
        """
        # Algorithm: search for QuotedValue or UnquotedValue after AttrName
        # TODO: Could also cache these

        events = self.Tokens()
        for tok_id, start, end in events:
            if tok_id != h8_tag_id.AttrName:
                continue
            if self.s[start:end] != attr_name:
                continue

            # The value should come next.  Tokens() always yields a value
            # token right after an AttrName.
            val_id, val_start, val_end = next(events)
            assert val_id in (
                h8_tag_id.QuotedValue, h8_tag_id.UnquotedValue,
                h8_tag_id.MissingValue), h8_tag_id_str(val_id)
            return val_start, val_end

        return -1, -1

    def GetAttrRaw(self, attr_name):
        # type: (str) -> Optional[str]
        """
        Return the value, which may be UNESCAPED.

        Returns None if there is no such attribute.
        """
        left, right = self.GetSpanForAttrValue(attr_name)
        if left == -1:
            return None
        return self.s[left:right]

    def AllAttrsRawSlice(self):
        # type: () -> List[Tuple[str, int, int]]
        """
        Get a list of pairs [('class', 3, 5), ('href', 9, 12)]
        """
        slices = []
        events = self.Tokens()
        for tok_id, start, end in events:
            if tok_id != h8_tag_id.AttrName:
                continue
            name = self.s[start:end]

            # The value should come next.  Tokens() always yields a value
            # token right after an AttrName.
            val_id, val_start, val_end = next(events)
            assert val_id in (
                h8_tag_id.QuotedValue, h8_tag_id.UnquotedValue,
                h8_tag_id.MissingValue), h8_tag_id_str(val_id)
            # Note: quoted values may have &
            # We would need ANOTHER lexer to unescape them, but we
            # don't need that for ul-table
            slices.append((name, val_start, val_end))

        return slices

    def AllAttrsRaw(self):
        # type: () -> List[Tuple[str, str]]
        """
        Get a list of pairs [('class', 'foo'), ('href', '?foo=1&bar=2')]

        The quoted values may be escaped.  We would need another lexer to
        unescape them.
        """
        return [(name, self.s[left:right])
                for name, left, right in self.AllAttrsRawSlice()]

    def Tokens(self):
        # type: () -> Iterator[Tuple[h8_tag_id_t, int, int]]
        """
        Yields a sequence of tokens: Tag (AttrName AttrValue?)*

        Where each Token is (Type, start_pos, end_pos)

        Note that start and end are NOT redundant!  We skip over some unwanted
        characters.

        Raises:
          RuntimeError: if no tag name is found
          LexError: if something other than > or /> follows the attributes
        """
        # + 1 skips the <
        m = _TAG_RE.match(self.s, self.start_pos + 1)
        if not m:
            raise RuntimeError("Couldn't find HTML tag in %r" %
                               self.WholeTagString())
        yield h8_tag_id.TagName, m.start(1), m.end(1)

        pos = m.end(0)

        # Regex group -> token type, in the order the groups are checked
        value_groups = (
            (2, h8_tag_id.QuotedValue),  # double quoted
            # single quoted - TODO: could have different token types
            (3, h8_tag_id.QuotedValue),
            (4, h8_tag_id.UnquotedValue),
        )

        while True:
            # don't search past the end
            m = _ATTR_RE.match(self.s, pos, self.end_pos)
            if not m:
                break

            yield h8_tag_id.AttrName, m.start(1), m.end(1)

            for group, val_id in value_groups:
                if m.group(group) is not None:
                    yield val_id, m.start(group), m.end(group)
                    break
            else:
                # <button disabled>
                end = m.end(0)
                yield h8_tag_id.MissingValue, end, end

            # Skip past the "
            pos = m.end(0)

        m = _TAG_LAST_RE.match(self.s, pos)
        if not m:
            raise LexError('Extra data at end of tag', self.s, pos)