"""data_lang/htm8.py

TODO

Migrate:

- doctools/ul_table.py should use new AttrLexer
  - AllAttrsRaw()
- maybe: migrate everything off of TagLexer()
  - and AttrValueLexer() - this should require Validate()

API:
- Get rid of Reset()?
- Deprecate tag_lexer.GetTagName() in favor of lx.CanonicalTagName() or
  _LiteralTagName()
- UTF-8 check, like JSON8
- re2c
  - port lexer, which will fix static typing issues
  - the abstraction needs to support submatch?
    - for finding the end of a tag, etc.?
    - and what about no match?

- harmonize LexError and ParseError with data_lang/j8.py, which uses
  error.Decode(msg, ..., cur_line_num)

- Copy all errors into doc/ref/chap-errors.md
  - This helps understand the language

- Update doc/htm8.md
- List of algorithms:
  - lex just the top level
  - lex both levels
    - and match tags - this is the level for value.Htm8Frag?
  - convert to XML!
  - lazy selection by tag, or attr (id= and class=)
  - lazy selection by CSS selector expression
  - convert to DOMTree
  - sed-like replacement of DOM Tree or element
  - untrusted HTML filter, e.g. StackOverflow / Reddit
    - this is Safe HTM8
    - should have a zero alloc way to support this, with good errors?
    - I think most of them silently strip data
"""

import re

from typing import Dict, List, Tuple, Optional, IO, Iterator, Any

from _devbuild.gen.htm8_asdl import (h8_id, h8_id_t, h8_tag_id, h8_tag_id_t,
                                     h8_tag_id_str, attr_name, attr_name_t,
                                     attr_name_str, attr_value_e, attr_value_t,
                                     h8_val_id)
from doctools.util import log


class LexError(Exception):
    """
    Examples of lex errors:

    - h8_id.Invalid, like <> or &&
    - Unclosed <!-- <? <![CDATA[ <script> <style>
    """

    def __init__(self, msg, code_str, start_pos):
        # type: (str, str, int) -> None
        self.msg = msg
        self.code_str = code_str
        self.start_pos = start_pos

    def __str__(self):
        # type: () -> str
        return '(LexError %r %r)' % (
            self.msg, self.code_str[self.start_pos:self.start_pos + 20])


def _FindLineNum(s, error_pos):
    # type: (str, int) -> int
    current_pos = 0
    line_num = 1
    while True:
        newline_pos = s.find('\n', current_pos)
        #log('current = %d, N %d, line %d', current_pos, newline_pos, line_num)

        if newline_pos == -1:  # this is the last line
            return line_num
        if newline_pos >= error_pos:
            return line_num
        line_num += 1
        current_pos = newline_pos + 1

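# For reference (illustrative example, not in the original file):
# _FindLineNum('a\nb\nc', 4) == 3, since position 4 falls on the third line.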

class ParseError(Exception):
    """
    Examples of parse errors

    - unbalanced tag structure
    - ul_table.py errors
    """

    def __init__(self, msg, s=None, start_pos=-1):
        # type: (str, Optional[str], int) -> None
        self.msg = msg
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        # type: () -> str
        if self.s is not None:
            assert self.start_pos != -1, self.start_pos
            snippet = (self.s[self.start_pos:self.start_pos + 20])

            line_num = _FindLineNum(self.s, self.start_pos)
        else:
            snippet = ''
            line_num = -1
        msg = 'line %d: %r %r' % (line_num, self.msg, snippet)
        return msg


class Output(object):
    """Output for sed-like "replacement" model.

    Takes an underlying input buffer and an output file. Maintains a position
    in the input buffer.

    Print FROM the input or print new text to the output.
    """

    def __init__(self, s, f, left_pos=0, right_pos=-1):
        # type: (str, IO[str], int, int) -> None
        self.s = s
        self.f = f
        self.pos = left_pos
        self.right_pos = len(s) if right_pos == -1 else right_pos

    def SkipTo(self, pos):
        # type: (int) -> None
        """Skip to a position."""
        self.pos = pos

    def PrintUntil(self, pos):
        # type: (int) -> None
        """Print until a position."""
        piece = self.s[self.pos:pos]
        self.f.write(piece)
        self.pos = pos

    def PrintTheRest(self):
        # type: () -> None
        """Print until the end of the string."""
        self.PrintUntil(self.right_pos)

    def Print(self, s):
        # type: (str) -> None
        """Print text to the underlying buffer."""
        self.f.write(s)

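# Usage sketch for the Output class above (illustrative, not from the original
# file): copy from the input buffer, splice in replacement text, skip a span.
#
#   out = Output('<p>hi</p>', sys.stdout)   # assumes 'import sys'
#   out.PrintUntil(3)    # writes '<p>'
#   out.Print('hello')   # writes replacement text; input position unchanged
#   out.SkipTo(5)        # skip over 'hi' in the input
#   out.PrintTheRest()   # writes '</p>'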

def MakeLexer(rules):
    return [(re.compile(pat, re.VERBOSE), i) for (pat, i) in rules]


#
# Lexers
#

_NAME = r'[a-zA-Z][a-zA-Z0-9:_\-]*'  # must start with letter

CHAR_LEX = [
    # Characters
    # https://www.w3.org/TR/xml/#sec-references
    (r'&\# [0-9]+ ;', h8_id.DecChar),
    (r'&\# x[0-9a-fA-F]+ ;', h8_id.HexChar),
    (r'& %s ;' % _NAME, h8_id.CharEntity),
    # Allow unquoted, and quoted
    (r'&', h8_id.BadAmpersand),
]
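# Illustrative matches for CHAR_LEX (example added here, not in the original):
#   '&#38;'   -> h8_id.DecChar
#   '&#x26;'  -> h8_id.HexChar
#   '&amp;'   -> h8_id.CharEntity
#   '&'       -> h8_id.BadAmpersand  (a bare ampersand is tolerated)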

HTM8_LEX = CHAR_LEX + [
    # TODO: CommentBegin, ProcessingBegin, CDataBegin could have an additional
    # action associated with them? The ending substring
    (r'<!--', h8_id.CommentBegin),

    # Processing instructions are used for the XML header:
    # <?xml version="1.0" encoding="UTF-8"?>
    # They are technically XML-only, but in HTML5, they are another kind of
    # comment:
    #
    # https://developer.mozilla.org/en-US/docs/Web/API/ProcessingInstruction
    #
    (r'<\?', h8_id.ProcessingBegin),
    # Not necessary in HTML5, but occurs in XML
    (r'<!\[CDATA\[', h8_id.CDataBegin),  # <![CDATA[

    # Markup declarations
    # - In HTML5, there is only <!DOCTYPE html>
    # - XML has 4 more declarations: <!ELEMENT ...> ATTLIST ENTITY NOTATION
    #   - these seem to be part of DTD
    # - it's useful to skip these, and be able to parse the rest of the document
    # - Note: < is allowed?
    (r'<! [^>\x00]+ >', h8_id.Decl),

    # Tags
    # Notes:
    # - We look for a valid tag name, but we don't validate attributes.
    #   That's done in the tag lexer.
    # - We don't allow leading whitespace
    (r'</ (%s) >' % _NAME, h8_id.EndTag),
    # self-closing <br/> comes before StartTag
    # could/should these be collapsed into one rule?
    (r'< (%s) [^>\x00]* />' % _NAME, h8_id.StartEndTag),  # self-closing <br/>
    (r'< (%s) [^>\x00]* >' % _NAME, h8_id.StartTag),  # start <a>

    # HTML5 allows unescaped > in raw data, but < is not allowed.
    # https://stackoverflow.com/questions/10462348/right-angle-bracket-in-html
    #
    # - My early blog has THREE errors when disallowing >
    # - So do some .wwz files
    (r'[^&<>\x00]+', h8_id.RawData),
    (r'>', h8_id.BadGreaterThan),
    # NUL is the end, an accommodation for re2c. Like we do in frontend/match.
    (r'\x00', h8_id.EndOfStream),
    # This includes < - it is not BadLessThan because it's NOT recoverable
    (r'.', h8_id.Invalid),
]

# Old notes:
#
# Non-greedy matches are regular and can be matched in linear time
# with RE2.
#
# https://news.ycombinator.com/item?id=27099798
#

# This person tried to do it with a regex:
#
# https://skeptric.com/html-comment-regexp/index.html

# . is any char except newline
# https://re2c.org/manual/manual_c.html

# Discarded options
#(r'<!-- .*? -->', h8_id.Comment),

# Hack from Claude: \s\S instead of re.DOTALL. I don't like this
#(r'<!-- [\s\S]*? -->', h8_id.Comment),
#(r'<!-- (?:.|[\n])*? -->', h8_id.Comment),

HTM8_LEX_COMPILED = MakeLexer(HTM8_LEX)


class Lexer(object):

    def __init__(self, s, left_pos=0, right_pos=-1, no_special_tags=False):
        # type: (str, int, int, bool) -> None
        self.s = s
        self.pos = left_pos
        self.right_pos = len(s) if right_pos == -1 else right_pos
        self.no_special_tags = no_special_tags

        # string -> compiled regex pattern object
        self.cache = {}  # type: Dict[str, Any]

        # either </script> or </style> - we search until we see that
        self.search_state = None  # type: Optional[str]

        # Position of tag name, if applicable
        # - Set after you get a StartTag, EndTag, or StartEndTag
        # - Unset on other tags
        self.tag_pos_left = -1
        self.tag_pos_right = -1

    def _Read(self):
        # type: () -> Tuple[h8_id_t, int]
        if self.pos == self.right_pos:
            return h8_id.EndOfStream, self.pos

        assert self.pos < self.right_pos, self.pos

        if self.search_state is not None and not self.no_special_tags:
            # TODO: case-insensitive search for </SCRIPT> <SCRipt> ?
            #
            # Another strategy: enter a mode where we find ONLY the end tag
            # regex, and any data that's not <, and then check the canonical
            # tag name for 'script' or 'style'.
            pos = self.s.find(self.search_state, self.pos)
            if pos == -1:
                raise LexError('Unterminated <script> or <style>', self.s,
                               self.pos)
            self.search_state = None
            # The token ends at the beginning of the end tag
            return h8_id.HtmlCData, pos

        # Find the first match.
        # Note: frontend/match.py uses _LongestMatch(), which is different!
        # TODO: reconcile them. This lexer should be expressible in re2c.

        for pat, tok_id in HTM8_LEX_COMPILED:
            m = pat.match(self.s, self.pos)
            if m:
                if tok_id in (h8_id.StartTag, h8_id.EndTag, h8_id.StartEndTag):
                    self.tag_pos_left = m.start(1)
                    self.tag_pos_right = m.end(1)
                else:
                    # Reset state
                    self.tag_pos_left = -1
                    self.tag_pos_right = -1

                if tok_id == h8_id.CommentBegin:
                    pos = self.s.find('-->', self.pos)
                    if pos == -1:
                        raise LexError('Unterminated <!--', self.s, self.pos)
                    return h8_id.Comment, pos + 3  # -->

                if tok_id == h8_id.ProcessingBegin:
                    pos = self.s.find('?>', self.pos)
                    if pos == -1:
                        raise LexError('Unterminated <?', self.s, self.pos)
                    return h8_id.Processing, pos + 2  # ?>

                if tok_id == h8_id.CDataBegin:
                    pos = self.s.find(']]>', self.pos)
                    if pos == -1:
                        # unterminated <![CDATA[
                        raise LexError('Unterminated <![CDATA[', self.s,
                                       self.pos)
                    return h8_id.CData, pos + 3  # ]]>

                if tok_id == h8_id.StartTag:
                    # TODO: reduce allocations
                    if (self.TagNameEquals('script') or
                            self.TagNameEquals('style')):
                        # <SCRipt a=b> -> </SCRipt>
                        self.search_state = '</' + self._LiteralTagName() + '>'

                return tok_id, m.end()
        else:
            raise AssertionError('h8_id.Invalid rule should have matched')

    def TagNamePos(self):
        # type: () -> int
        """The right position of the tag name."""
        assert self.tag_pos_right != -1, self.tag_pos_right
        return self.tag_pos_right

    def TagNameEquals(self, expected):
        # type: (str) -> bool
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        # TODO: In C++, this does not need an allocation. Can we test
        # directly?
        return expected == self.CanonicalTagName()

    def _LiteralTagName(self):
        # type: () -> str
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        return self.s[self.tag_pos_left:self.tag_pos_right]

    def CanonicalTagName(self):
        # type: () -> str
        tag_name = self._LiteralTagName()
        # Most tags are already lower case, so avoid allocation with this conditional
        # TODO: this could go in the mycpp runtime?
        if tag_name.islower():
            return tag_name
        else:
            return tag_name.lower()

    def Read(self):
        # type: () -> Tuple[h8_id_t, int]
        tok_id, end_pos = self._Read()
        self.pos = end_pos  # advance
        return tok_id, end_pos

    def LookAhead(self, regex):
        # type: (str) -> bool
        """
        Currently used for ul_table.py. But taking a dynamic regex string is
        not the right interface.
        """
        # Cache the regex compilation. This could also be LookAheadFor(THEAD)
        # or something.
        pat = self.cache.get(regex)
        if pat is None:
            pat = re.compile(regex)
            self.cache[regex] = pat

        m = pat.match(self.s, self.pos)
        return m is not None

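# Usage sketch for Lexer (illustrative, not from the original file). Token
# text is the slice between the previous end position and the current one.
#
#   lx = Lexer('<p id="x">hi</p>')
#   start = 0
#   while True:
#       tok_id, end_pos = lx.Read()
#       if tok_id == h8_id.EndOfStream:
#           break
#       print(tok_id, repr(lx.s[start:end_pos]))
#       start = end_pos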

A_NAME_LEX = [
    # Leading whitespace is required, to separate attributes.
    #
    # If the = is not present, then we set the lexer in a state for
    # attr_value_e.Missing.
    (r'\s+ (%s) \s* (=)? \s*' % _NAME, attr_name.Ok),
    # unexpected EOF

    # The closing > or /> is treated as end of stream, and it's not an error.
    (r'\s* /? >', attr_name.Done),

    # NUL should not be possible, because the top-level lexer treats \x00 as
    # EndOfStream

    # This includes < - it is not BadLessThan because it's NOT recoverable
    (r'.', attr_name.Invalid),
]

A_NAME_LEX_COMPILED = MakeLexer(A_NAME_LEX)

# Here we just loop on regular tokens
#
# Examples:
# <a href = unquoted&amp;foo >
# <a href = unquoted&foo >   # BadAmpersand is allowed I guess
# <a href ="unquoted&foo" >  # double quoted
# <a href ='unquoted&foo' >  # single quoted
# <a href = what"foo" >      # HTML5 allows this, but we could disallow it if
#                              it's not common. It opens up the j"" and $"" extensions
# <a href = what'foo' >      # ditto

# TODO: get rid of OLD copy
_UNQUOTED_VALUE_OLD = r'''[^ \t\r\n<>&"'\x00]*'''
_UNQUOTED_VALUE = r'''[^ \t\r\n<>&"'\x00]+'''

# What comes after = ?
A_VALUE_LEX = [
    (r'"', h8_val_id.DoubleQuote),
    (r"'", h8_val_id.SingleQuote),
    (_UNQUOTED_VALUE, h8_val_id.UnquotedVal),
    (r'.', h8_val_id.NoMatch),
]

A_VALUE_LEX_COMPILED = MakeLexer(A_VALUE_LEX)

# What's inside "" or '' ?
QUOTED_VALUE_LEX = CHAR_LEX + [
    (r'"', h8_id.DoubleQuote),
    (r"'", h8_id.SingleQuote),
    (r'<', h8_id.BadLessThan),  # BadAmpersand is in CHAR_LEX

    # TODO: think about whitespace for efficient class= queries?
    #(r'[ \r\n\t]', h8_id.Whitespace),  # terminates unquoted values
    (r'''[^"'<>&\x00]+''', h8_id.RawData),
    # This includes > - it is not BadGreaterThan because it's NOT recoverable
    (r'.', h8_id.Invalid),
]

QUOTED_VALUE_LEX_COMPILED = MakeLexer(QUOTED_VALUE_LEX)


class AttrLexer(object):
    """
    Typical usage:

    while True:
        n, start_pos, end_pos = attr_lx.ReadName()
        if n == attr_name.Ok:
            if attr_lx.AttrNameEquals('div'):
                print('div')

            # TODO: also pass Optional[List[]] out_tokens?
            v, start_pos, end_pos = attr_lx.ReadValue()
    """

    def __init__(self, s):
        # type: (str) -> None
        self.s = s

        self.tok_id = h8_id.Invalid  # Uninitialized
        self.tag_name_pos = -1  # Invalid
        self.tag_end_pos = -1
        self.must_not_exceed_pos = -1

        self.pos = -1

        self.name_start = -1
        self.name_end = -1
        self.next_value_is_missing = False

        self.init_t = -1
        self.init_e = -1

    def Init(self, tok_id, tag_name_pos, end_pos):
        # type: (h8_id_t, int, int) -> None
        """Initialize so we can read names and values.

        Example:
        'x <a y>'  # tag_name_pos=4, end_pos=6
        'x <a>'    # tag_name_pos=4, end_pos=4

        The Init() method is used to reuse instances of the AttrLexer object.
        """
        assert tag_name_pos >= 0, tag_name_pos
        assert end_pos >= 0, end_pos

        #log('TAG NAME POS %d', tag_name_pos)

        self.tok_id = tok_id
        self.tag_name_pos = tag_name_pos
        self.end_pos = end_pos

        # Check for ambiguous <img src=/>
        if tok_id == h8_id.StartTag:
            self.must_not_exceed_pos = end_pos - 1  # account for >
        elif tok_id == h8_id.StartEndTag:
            self.must_not_exceed_pos = end_pos - 2  # account for />
        else:
            raise AssertionError(tok_id)

        self.pos = tag_name_pos

        # For Reset()
        self.init_t = tag_name_pos
        self.init_e = end_pos

    def Reset(self):
        # type: () -> None

        # TODO: maybe GetAttrRaw() should call this directly? But not any of
        # the AllAttrs() methods?
        self.tag_name_pos = self.init_t
        self.end_pos = self.init_e
        self.pos = self.init_t

    def ReadName(self):
        # type: () -> Tuple[attr_name_t, int, int]
        """Reads the attribute name

        EOF case:
        <a>
        <a >

        Error case:
        <a !>
        <a foo=bar !>
        """
        for pat, a in A_NAME_LEX_COMPILED:
            m = pat.match(self.s, self.pos)
            #log('ReadName() matching %r at %d', self.s, self.pos)
            if m:
                #log('ReadName() tag_name_pos %d pos, %d %s', self.tag_name_pos, self.pos, m.groups())
                if a == attr_name.Invalid:
                    #log('m.groups %s', m.groups())
                    return attr_name.Invalid, -1, -1

                self.pos = m.end(0)  # Advance if it's not invalid

                if a == attr_name.Ok:
                    #log('%r', m.groups())
                    self.name_start = m.start(1)
                    self.name_end = m.end(1)
                    # Is the equals sign missing? Set state.
                    if m.group(2) is None:
                        self.next_value_is_missing = True
                        # HACK: REWIND, since we don't want to consume whitespace
                        self.pos = self.name_end
                    else:
                        self.next_value_is_missing = False
                    return attr_name.Ok, self.name_start, self.name_end
                else:
                    # Reset state - AttrNameEquals() can't be used until the
                    # next successful ReadName()
                    self.name_start = -1
                    self.name_end = -1

                    if a == attr_name.Done:
                        return attr_name.Done, -1, -1
        else:
            context = self.s[self.pos:]
            #log('s %r %d', self.s, self.pos)
            raise AssertionError(
                'attr_name.Invalid rule should have matched %r' % context)

    def _CanonicalAttrName(self):
        # type: () -> str
        """Return the lower case attribute name.

        Must call after ReadName()
        """
        assert self.name_start >= 0, self.name_start
        assert self.name_end >= 0, self.name_end

        attr_name = self.s[self.name_start:self.name_end]
        if attr_name.islower():
            return attr_name
        else:
            return attr_name.lower()

    def AttrNameEquals(self, expected):
        # type: (str) -> bool
        """
        Must call after ReadName()

        TODO: This can be optimized to be "in place", with zero allocs.
        """
        return expected == self._CanonicalAttrName()

    def _QuotedRead(self):
        # type: () -> Tuple[h8_id_t, int]

        for pat, tok_id in QUOTED_VALUE_LEX_COMPILED:
            m = pat.match(self.s, self.pos)
            if m:
                end_pos = m.end(0)  # Advance
                #log('_QuotedRead %r', self.s[self.pos:end_pos])
                return tok_id, end_pos
        else:
            context = self.s[self.pos:self.pos + 10]
            raise AssertionError(
                'h8_id.Invalid rule should have matched %r' % context)

    def ReadValue(self, tokens_out=None):
        # type: (Optional[List[Tuple[h8_id_t, int]]]) -> Tuple[attr_value_t, int, int]
        """Read the attribute value.

        In general, it is escaped or "raw"

        Can only be called after a SUCCESSFUL ReadName().
        Assuming ReadName() returned a value, this should NOT fail.
        """
        # ReadName() invariant
        assert self.name_start >= 0, self.name_start
        assert self.name_end >= 0, self.name_end

        self.name_start = -1
        self.name_end = -1

        if self.next_value_is_missing:
            # Do not advance self.pos
            #log('-> MISSING pos %d : %r', self.pos, self.s[self.pos:])
            return attr_value_e.Missing, -1, -1

        # Now read " ', unquoted or empty= is valid too.
        for pat, a in A_VALUE_LEX_COMPILED:
            m = pat.match(self.s, self.pos)
            if m:
                first_end_pos = m.end(0)
                # We shouldn't go past the end
                assert first_end_pos <= self.end_pos, \
                    'first_end_pos = %d should not exceed self.end_pos = %d' % (
                        first_end_pos, self.end_pos)
                #log('m %s', m.groups())

                # Note: Unquoted value can't contain &amp; etc. now, so there
                # is no unquoting, and no respecting tokens_out.
                if a == h8_val_id.UnquotedVal:
                    if first_end_pos > self.must_not_exceed_pos:
                        #log('first_end_pos %d', first_end_pos)
                        #log('must_not_exceed_pos %d', self.must_not_exceed_pos)
                        raise LexError(
                            'Ambiguous slash: last attribute should be quoted',
                            self.s, first_end_pos)
                    self.pos = first_end_pos  # Advance
                    return attr_value_e.Unquoted, m.start(0), first_end_pos

                # TODO: respect tokens_out
                if a == h8_val_id.DoubleQuote:
                    self.pos = first_end_pos
                    while True:
                        tok_id, q_end_pos = self._QuotedRead()
                        #log('self.pos %d q_end_pos %d', self.pos, q_end_pos)
                        if tok_id == h8_id.Invalid:
                            raise LexError(
                                'ReadValue() got invalid token (DQ)', self.s,
                                self.pos)
                        if tok_id == h8_id.DoubleQuote:
                            right_pos = self.pos
                            self.pos = q_end_pos  # Advance past "
                            return attr_value_e.DoubleQuoted, first_end_pos, right_pos
                        self.pos = q_end_pos  # Advance _QuotedRead

                # TODO: respect tokens_out
                if a == h8_val_id.SingleQuote:
                    self.pos = first_end_pos
                    while True:
                        tok_id, q_end_pos = self._QuotedRead()
                        if tok_id == h8_id.Invalid:
                            raise LexError(
                                'ReadValue() got invalid token (SQ)', self.s,
                                self.pos)
                        if tok_id == h8_id.SingleQuote:
                            right_pos = self.pos
                            self.pos = q_end_pos  # Advance past '
                            return attr_value_e.SingleQuoted, first_end_pos, right_pos
                        self.pos = q_end_pos  # Advance _QuotedRead

                if a == h8_val_id.NoMatch:
                    # <a foo = >
                    return attr_value_e.Empty, -1, -1
        else:
            raise AssertionError('h8_val_id.NoMatch rule should have matched')


def GetAttrRaw(attr_lx, name):
    # type: (AttrLexer, str) -> Optional[str]
    while True:
        n, name_start, name_end = attr_lx.ReadName()
        #log('==> ReadName %s %d %d', attr_name_str(n), name_start, name_end)
        if n == attr_name.Ok:
            if attr_lx.AttrNameEquals(name):
                v, val_start, val_end = attr_lx.ReadValue()
                return attr_lx.s[val_start:val_end]
            else:
                # Problem with stateful API: You are forced to either
                # ReadValue() or SkipValue()
                attr_lx.ReadValue()
        elif n == attr_name.Done:
            break
        elif n == attr_name.Invalid:
            raise LexError('GetAttrRaw() got invalid token', attr_lx.s,
                           attr_lx.pos)
        else:
            raise AssertionError()

    return None


def AllAttrsRaw(attr_lx):
    # type: (AttrLexer) -> List[Tuple[str, str]]
    result = []
    while True:
        n, name_start, name_end = attr_lx.ReadName()
        if 0:
            log(' AllAttrsRaw ==> ReadName %s %d %d %r', attr_name_str(n),
                name_start, name_end, attr_lx.s[attr_lx.pos:attr_lx.pos + 10])
        if n == attr_name.Ok:
            name = attr_lx.s[name_start:name_end]
            #log('  Name %r', name)

            v, val_start, val_end = attr_lx.ReadValue()
            val = attr_lx.s[val_start:val_end]
            #log('  ReadValue %r', val)
            result.append((name, val))
        elif n == attr_name.Done:
            break
        elif n == attr_name.Invalid:
            raise LexError('AllAttrsRaw() got invalid token', attr_lx.s,
                           attr_lx.pos)
        else:
            raise AssertionError()

    return result

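# Illustrative end-to-end sketch (not part of the original file): feed tag
# positions from Lexer into an AttrLexer.
#
#   lx = Lexer('<a href="/x" id=y>link</a>')
#   attr_lx = AttrLexer(lx.s)
#   while True:
#       tok_id, end_pos = lx.Read()
#       if tok_id == h8_id.EndOfStream:
#           break
#       if tok_id in (h8_id.StartTag, h8_id.StartEndTag):
#           attr_lx.Init(tok_id, lx.TagNamePos(), end_pos)
#           print(AllAttrsRaw(attr_lx))  # expect [('href', '/x'), ('id', 'y')]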

#
# OLD API - REMOVE THIS
#

# Tag names:
# Match <a or </a
# Match <h2, but not <2h
#
# HTML 5 doesn't restrict tag names at all
# https://html.spec.whatwg.org/#toc-syntax
#
# XML allows : - .
# https://www.w3.org/TR/xml/#NT-NameChar

# Namespaces for MathML, SVG
# XLink, XML, XMLNS
#
# https://infra.spec.whatwg.org/#namespaces
#
# Allow - for td-attrs

# TODO: we don't need to capture the tag name here? That's done at the top
# level
_TAG_RE = re.compile(r'/? \s* (%s)' % _NAME, re.VERBOSE)

_TAG_LAST_RE = re.compile(r'\s* /? >', re.VERBOSE)

# To match href="foo"
# Note: in HTML5 and XML, single quoted attributes are also valid

# <button disabled> is standard usage

# NOTE: This used to allow whitespace around =
# <a foo = "bar"> makes sense in XML
# But then you also have
# <a foo= bar> - which is TWO attributes, in HTML5
# So the space is problematic

_ATTR_RE = re.compile(
    r'''
\s+                     # Leading whitespace is required
(%s)                    # Attribute name
(?:                     # Optional attribute value
  \s* = \s*             # Spaces allowed around =
  (?:
    " ([^>"\x00]*) "    # double quoted value
  | ' ([^>'\x00]*) '    # single quoted value
  | (%s)                # Attribute value
  )
)?
''' % (_NAME, _UNQUOTED_VALUE_OLD), re.VERBOSE)
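# Illustrative _ATTR_RE matches (added example, not in the original file);
# groups are (name, double_quoted, single_quoted, unquoted):
#   ' href="foo"'  -> ('href', 'foo', None, None)
#   " href='foo'"  -> ('href', None, 'foo', None)
#   ' href=foo'    -> ('href', None, None, 'foo')
#   ' disabled'    -> ('disabled', None, None, None)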


class TagLexer(object):
    """
    Given a tag like <a href="..."> or <link type="..." />, the TagLexer
    provides a few operations:

    - What is the tag?
    - Iterate through the attributes, giving (name, value_start_pos, value_end_pos)
    """

    def __init__(self, s):
        # type: (str) -> None
        self.s = s
        self.start_pos = -1  # Invalid
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        # type: (int, int) -> None
        """Reuse instances of this object."""
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos

        self.start_pos = start_pos
        self.end_pos = end_pos

    def WholeTagString(self):
        # type: () -> str
        """Return the entire tag string, e.g. <a href='foo'>"""
        return self.s[self.start_pos:self.end_pos]

    def GetTagName(self):
        # type: () -> str
        # First event
        tok_id, start, end = next(self.Tokens())
        return self.s[start:end]

    def GetSpanForAttrValue(self, attr_name):
        # type: (str) -> Tuple[int, int]
        """
        Used by oils_doc.py, for href shortcuts
        """
        # Algorithm: search for QuotedValue or UnquotedValue after AttrName
        # TODO: Could also cache these

        events = self.Tokens()
        val = (-1, -1)
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == h8_tag_id.AttrName:
                    name = self.s[start:end]
                    if name == attr_name:
                        # The value should come next
                        tok_id, start, end = next(events)
                        assert tok_id in (
                            h8_tag_id.QuotedValue, h8_tag_id.UnquotedValue,
                            h8_tag_id.MissingValue), h8_tag_id_str(tok_id)
                        val = start, end
                        break

        except StopIteration:
            pass
        return val

    def GetAttrRaw(self, attr_name):
        # type: (str) -> Optional[str]
        """
        Return the value, which may be UNESCAPED.
        """
        start, end = self.GetSpanForAttrValue(attr_name)
        if start == -1:
            return None
        return self.s[start:end]

    def AllAttrsRawSlice(self):
        # type: () -> List[Tuple[str, int, int]]
        """
        Get a list of (name, start, end) tuples, e.g.
        [('class', 3, 5), ('href', 9, 12)]
        """
        slices = []
        events = self.Tokens()
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == h8_tag_id.AttrName:
                    name = self.s[start:end]

                    # The value should come next
                    tok_id, start, end = next(events)
                    assert tok_id in (
                        h8_tag_id.QuotedValue, h8_tag_id.UnquotedValue,
                        h8_tag_id.MissingValue), h8_tag_id_str(tok_id)
                    # Note: quoted values may have &amp;
                    # We would need ANOTHER lexer to unescape them, but we
                    # don't need that for ul-table
                    slices.append((name, start, end))
        except StopIteration:
            pass
        return slices

    def AllAttrsRaw(self):
        # type: () -> List[Tuple[str, str]]
        """
        Get a list of pairs [('class', 'foo'), ('href', '?foo=1&amp;bar=2')]

        The quoted values may be escaped. We would need another lexer to
        unescape them.
        """
        slices = self.AllAttrsRawSlice()
        pairs = []
        for name, start, end in slices:
            pairs.append((name, self.s[start:end]))
        return pairs

    def Tokens(self):
        # type: () -> Iterator[Tuple[h8_tag_id_t, int, int]]
        """
        Yields a sequence of tokens: Tag (AttrName AttrValue?)*

        Where each Token is (Type, start_pos, end_pos)

        Note that start and end are NOT redundant! We skip over some unwanted
        characters.
        """
        m = _TAG_RE.match(self.s, self.start_pos + 1)
        if not m:
            raise RuntimeError("Couldn't find HTML tag in %r" %
                               self.WholeTagString())
        yield h8_tag_id.TagName, m.start(1), m.end(1)

        pos = m.end(0)
        #log('POS %d', pos)

        while True:
            # don't search past the end
            m = _ATTR_RE.match(self.s, pos, self.end_pos)
            if not m:
                #log('BREAK pos %d', pos)
                break
            #log('AttrName %r', m.group(1))

            yield h8_tag_id.AttrName, m.start(1), m.end(1)

            #log('m.groups() %r', m.groups())
            if m.group(2) is not None:
                # double quoted
                yield h8_tag_id.QuotedValue, m.start(2), m.end(2)
            elif m.group(3) is not None:
                # single quoted - TODO: could have different token types
                yield h8_tag_id.QuotedValue, m.start(3), m.end(3)
            elif m.group(4) is not None:
                yield h8_tag_id.UnquotedValue, m.start(4), m.end(4)
            else:
                # <button disabled>
                end = m.end(0)
                yield h8_tag_id.MissingValue, end, end

            # Skip past the closing quote (or the end of the match)
            pos = m.end(0)

        #log('TOK %r', self.s)

        m = _TAG_LAST_RE.match(self.s, pos)
        #log('_TAG_LAST_RE match %r', self.s[pos:])
        if not m:
            raise LexError('Extra data at end of tag', self.s, pos)