# OILS / data_lang / htm8.py
1"""data_lang/htm8.py
2
3TODO
4
5Migrate:
6
7- maybe: migrate everything off of TagLexer()
 - and AttrValueLexer() - this should require Validate()
9
10API:
11- Get rid of Reset()?
12- Deprecate tag_lexer.GetTagName() in favor of lx.CanonicalTagName() or
13 _LiteralTagName()
14- UTF-8 check, like JSON8
15- re2c
16 - port lexer, which will fix static typing issues
17 - the abstraction needs to support submatch?
18 - for finding the end of a tag, etc.?
19 - and what about no match?
20
21- harmonize LexError and ParseError with data_lang/j8.py, which uses
22 error.Decode(msg, ..., cur_line_num)
23
24- Copy all errors into doc/ref/chap-errors.md
25 - This helps understand the language
26
27- Update doc/htm8.md
28- list of Algorithms:
29 - lex just the top level
30 - lex both levels
31 - and match tags - this is the level for value.Htm8Frag?
32 - convert to XML!
33 - lazy selection by tag, or attr (id= and class=)
34 - lazy selection by CSS selector expression
35 - convert to DOMTree
36 - sed-like replacement of DOM Tree or element
37 - untrusted HTML filter, e.g. like StackOverflow / Reddit
38 - this is Safe HTM8
39 - should have a zero alloc way to support this, with good errors?
40 - I think most of them silently strip data
41"""
42
43import re
44
45from typing import Dict, List, Tuple, Optional, IO, Iterator, Any
46
47from _devbuild.gen.htm8_asdl import (h8_id, h8_id_t, h8_tag_id, h8_tag_id_t,
48 h8_tag_id_str, attr_name, attr_name_t,
49 attr_name_str, attr_value_e, attr_value_t,
50 h8_val_id)
51from doctools.util import log
52
53
class LexError(Exception):
    """Raised when the lexer hits an unrecoverable token.

    Examples of lex errors:

    - h8_id.Invalid, like <> or &&
    - Unclosed <!-- <? <![CDATA[ <script> <style>
    """

    def __init__(self, msg, code_str, start_pos):
        # type: (str, str, int) -> None
        self.msg = msg
        self.code_str = code_str
        self.start_pos = start_pos

    def __str__(self):
        # type: () -> str
        # Show a short snippet of the input, starting at the error position.
        snippet = self.code_str[self.start_pos:self.start_pos + 20]
        return '(LexError %r %r)' % (self.msg, snippet)
72
73
74def _FindLineNum(s, error_pos):
75 # type: (str, int) -> int
76 current_pos = 0
77 line_num = 1
78 while True:
79 newline_pos = s.find('\n', current_pos)
80 #log('current = %d, N %d, line %d', current_pos, newline_pos, line_num)
81
82 if newline_pos == -1: # this is the last line
83 return line_num
84 if newline_pos >= error_pos:
85 return line_num
86 line_num += 1
87 current_pos = newline_pos + 1
88
89
class ParseError(Exception):
    """Raised for structural errors above the lexer level.

    Examples of parse errors

    - unbalanced tag structure
    - ul_table.py errors
    """

    def __init__(self, msg, s=None, start_pos=-1):
        # type: (str, Optional[str], int) -> None
        self.msg = msg
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        # type: () -> str
        if self.s is None:
            # No source context was attached to the error.
            snippet = ''
            line_num = -1
        else:
            assert self.start_pos != -1, self.start_pos
            snippet = self.s[self.start_pos:self.start_pos + 20]
            line_num = _FindLineNum(self.s, self.start_pos)
        return 'line %d: %r %r' % (line_num, self.msg, snippet)
116
117
class Output(object):
    """Output for sed-like "replacement" model.

    Wraps an input buffer and an output file, and keeps a cursor into the
    input.  Callers either copy spans FROM the input or write new text
    directly to the output.
    """

    def __init__(self, s, f, left_pos=0, right_pos=-1):
        # type: (str, IO[str], int, int) -> None
        self.s = s
        self.f = f
        self.pos = left_pos
        if right_pos == -1:
            self.right_pos = len(s)
        else:
            self.right_pos = right_pos

    def SkipTo(self, pos):
        # type: (int) -> None
        """Move the cursor without emitting anything."""
        self.pos = pos

    def PrintUntil(self, pos):
        # type: (int) -> None
        """Copy input from the cursor up to pos, advancing the cursor."""
        self.f.write(self.s[self.pos:pos])
        self.pos = pos

    def PrintTheRest(self):
        # type: () -> None
        """Copy the remaining input, up to right_pos."""
        self.PrintUntil(self.right_pos)

    def Print(self, s):
        # type: (str) -> None
        """Write new text s directly to the output file."""
        self.f.write(s)
155
156
def MakeLexer(rules):
    """Compile a rule table of (pattern, token_id) pairs.

    Patterns are compiled with re.VERBOSE, so literal whitespace in them
    is insignificant.
    """
    compiled = []
    for pat, tok_id in rules:
        compiled.append((re.compile(pat, re.VERBOSE), tok_id))
    return compiled
159
160
#
# Lexers
#

# Tag, attribute, and entity names: must start with a letter; then letters,
# digits, ':', '_', or '-' are allowed.
_NAME = r'[a-zA-Z][a-zA-Z0-9:_\-]*'  # must start with letter

# Character references, shared by the top-level lexer and the quoted
# attribute value lexer.
CHAR_LEX = [
    # Characters
    # https://www.w3.org/TR/xml/#sec-references
    (r'&\# [0-9]+ ;', h8_id.DecChar),
    (r'&\# x[0-9a-fA-F]+ ;', h8_id.HexChar),
    (r'& %s ;' % _NAME, h8_id.CharEntity),
    # A bare & that doesn't form a valid reference - recoverable.
    # Allow unquoted, and quoted
    (r'&', h8_id.BadAmpersand),
]

# Top-level HTM8 lexer: character references plus markup.  First match wins,
# so order matters.
HTM8_LEX = CHAR_LEX + [
    # TODO: CommentBegin, ProcessingBegin, CDataBegin could have an additional
    # action associated with them?  The ending substring
    (r'<!--', h8_id.CommentBegin),

    # Processing instruction are used for the XML header:
    # <?xml version="1.0" encoding="UTF-8"?>
    # They are technically XML-only, but in HTML5, they are another kind of
    # comment:
    #
    # https://developer.mozilla.org/en-US/docs/Web/API/ProcessingInstruction
    #
    (r'<\?', h8_id.ProcessingBegin),
    # Not necessary in HTML5, but occurs in XML
    (r'<!\[CDATA\[', h8_id.CDataBegin),  # <![CDATA[

    # Markup declarations
    # - In HTML5, there is only <!DOCTYPE html>
    # - XML has 4 more declarations: <!ELEMENT ...> ATTLIST ENTITY NOTATION
    #   - these seem to be part of DTD
    # - it's useful to skip these, and be able to parse the rest of the document
    # - Note: < is allowed?
    (r'<! [^>\x00]+ >', h8_id.Decl),

    # Tags
    # Notes:
    # - We look for a valid tag name, but we don't validate attributes.
    #   That's done in the tag lexer.
    # - We don't allow leading whitespace
    (r'</ (%s) >' % _NAME, h8_id.EndTag),
    # self-closing <br/> comes before StartTag
    # could/should these be collapsed into one rule?
    (r'< (%s) [^>\x00]* />' % _NAME, h8_id.StartEndTag),  # end </a>
    (r'< (%s) [^>\x00]* >' % _NAME, h8_id.StartTag),  # start <a>

    # HTML5 allows unescaped > in raw data, but < is not allowed.
    # https://stackoverflow.com/questions/10462348/right-angle-bracket-in-html
    #
    # - My early blog has THREE errors when disallowing >
    # - So do some .wwz files
    (r'[^&<>\x00]+', h8_id.RawData),
    (r'>', h8_id.BadGreaterThan),
    # NUL is the end, an accommodation for re2c.  Like we do in frontend/match.
    (r'\x00', h8_id.EndOfStream),
    # This includes < - it is not BadLessThan because it's NOT recoverable
    (r'.', h8_id.Invalid),
]

# Old notes:
#
# Non-greedy matches are regular and can be matched in linear time
# with RE2.
#
# https://news.ycombinator.com/item?id=27099798
#

# This person tried to do it with a regex:
#
# https://skeptric.com/html-comment-regexp/index.html

# . is any char except newline
# https://re2c.org/manual/manual_c.html

# Discarded options
#(r'<!-- .*? -->', h8_id.Comment),

# Hack from Claude: \s\S instead of re.DOTALL.  I don't like this
#(r'<!-- [\s\S]*? -->', h8_id.Comment),
#(r'<!-- (?:.|[\n])*? -->', h8_id.Comment),

HTM8_LEX_COMPILED = MakeLexer(HTM8_LEX)
248
249
class Lexer(object):
    """Top-level HTM8 lexer over s[left_pos : right_pos].

    Call Read() repeatedly; it returns (h8_id, end_pos) pairs, and the
    token's text is the slice between the previous end position and
    end_pos.  Raises LexError on unterminated constructs.
    """

    def __init__(self, s, left_pos=0, right_pos=-1, no_special_tags=False):
        # type: (str, int, int, bool) -> None
        self.s = s
        self.pos = left_pos
        # right_pos == -1 means "to the end of s"
        self.right_pos = len(s) if right_pos == -1 else right_pos
        # If true, <script> and <style> bodies are lexed like any other
        # content instead of being skipped as HtmlCData.
        self.no_special_tags = no_special_tags

        # string -> compiled regex pattern object
        self.cache = {}  # type: Dict[str, Any]

        # either </script> or </style> - we search until we see that
        self.search_state = None  # type: Optional[str]

        # Position of tag name, if applicable
        # - Set after you get a StartTag, EndTag, or StartEndTag
        # - Unset on other tags
        self.tag_pos_left = -1
        self.tag_pos_right = -1

    def _Read(self):
        # type: () -> Tuple[h8_id_t, int]
        """Compute the next token WITHOUT advancing self.pos.

        Returns (token id, end position).  Read() is the public wrapper
        that advances the position.
        """
        if self.pos == self.right_pos:
            return h8_id.EndOfStream, self.pos

        assert self.pos < self.right_pos, self.pos

        if self.search_state is not None and not self.no_special_tags:
            # TODO: case-insensitive search for </SCRIPT> <SCRipt> ?
            #
            # Another strategy: enter a mode where we find ONLY the end tag
            # regex, and any data that's not <, and then check the canonical
            # tag name for 'script' or 'style'.
            pos = self.s.find(self.search_state, self.pos)
            if pos == -1:
                raise LexError('Unterminated <script> or <style>', self.s,
                               self.pos)
            self.search_state = None
            # Return everything up to the closing tag as one HtmlCData
            # token; the end tag itself is lexed on the next call.
            return h8_id.HtmlCData, pos

        # Find the first match.
        # Note: frontend/match.py uses _LongestMatch(), which is different!
        # TODO: reconcile them.  This lexer should be expressible in re2c.

        for pat, tok_id in HTM8_LEX_COMPILED:
            m = pat.match(self.s, self.pos)
            if m:
                if tok_id in (h8_id.StartTag, h8_id.EndTag, h8_id.StartEndTag):
                    # Group 1 of the tag rules captures the tag name.
                    self.tag_pos_left = m.start(1)
                    self.tag_pos_right = m.end(1)
                else:
                    # Reset state
                    self.tag_pos_left = -1
                    self.tag_pos_right = -1

                # The *Begin tokens only match the opening delimiter; scan
                # forward for the matching terminator and emit one token.
                if tok_id == h8_id.CommentBegin:
                    pos = self.s.find('-->', self.pos)
                    if pos == -1:
                        raise LexError('Unterminated <!--', self.s, self.pos)
                    return h8_id.Comment, pos + 3  # -->

                if tok_id == h8_id.ProcessingBegin:
                    pos = self.s.find('?>', self.pos)
                    if pos == -1:
                        raise LexError('Unterminated <?', self.s, self.pos)
                    return h8_id.Processing, pos + 2  # ?>

                if tok_id == h8_id.CDataBegin:
                    pos = self.s.find(']]>', self.pos)
                    if pos == -1:
                        # unterminated <![CDATA[
                        raise LexError('Unterminated <![CDATA[', self.s,
                                       self.pos)
                    return h8_id.CData, pos + 3  # ]]>

                if tok_id == h8_id.StartTag:
                    # TODO: reduce allocations
                    if (self.TagNameEquals('script') or
                            self.TagNameEquals('style')):
                        # <SCRipt a=b>  -> </SCRipt>
                        self.search_state = '</' + self._LiteralTagName() + '>'

                return tok_id, m.end()
        else:
            # The final catch-all rule (r'.') matches any character, so
            # falling off the loop means the rule table is broken.
            raise AssertionError('h8_id.Invalid rule should have matched')

    def TagNamePos(self):
        # type: () -> int
        """The right position of the tag pos"""
        assert self.tag_pos_right != -1, self.tag_pos_right
        return self.tag_pos_right

    def TagNameEquals(self, expected):
        # type: (str) -> bool
        """Case-insensitive comparison against the last tag name read."""
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        # TODO: In C++, this does not need an allocation.  Can we test
        # directly?
        return expected == self.CanonicalTagName()

    def _LiteralTagName(self):
        # type: () -> str
        """The tag name exactly as written, e.g. 'SCRipt'."""
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        return self.s[self.tag_pos_left:self.tag_pos_right]

    def CanonicalTagName(self):
        # type: () -> str
        """The lower-case tag name of the last Start/End/StartEnd tag."""
        tag_name = self._LiteralTagName()
        # Most tags are already lower case, so avoid allocation with this conditional
        # TODO: this could go in the mycpp runtime?
        if tag_name.islower():
            return tag_name
        else:
            return tag_name.lower()

    def Read(self):
        # type: () -> Tuple[h8_id_t, int]
        """Return the next (token id, end position) and advance."""
        tok_id, end_pos = self._Read()
        self.pos = end_pos  # advance
        return tok_id, end_pos

    def LookAhead(self, regex):
        # type: (str) -> bool
        """
        Currently used for ul_table.py.  But taking a dynamic regex string is
        not the right interface.
        """
        # Cache the regex compilation.  This could also be LookAheadFor(THEAD)
        # or something.
        pat = self.cache.get(regex)
        if pat is None:
            pat = re.compile(regex)
            self.cache[regex] = pat

        m = pat.match(self.s, self.pos)
        return m is not None
391
392
# Attribute NAME lexer - used by AttrLexer.ReadName().
A_NAME_LEX = [
    # Leading whitespace is required, to separate attributes.
    #
    # If the = is not present, then we set the lexer in a state for
    # attr_value_e.Missing.
    (r'\s+ (%s) \s* (=)? \s*' % _NAME, attr_name.Ok),
    # unexpected EOF

    # The closing > or /> is treated as end of stream, and it's not an error.
    (r'\s* /? >', attr_name.Done),

    # NUL should not be possible, because the top-level

    # This includes < - it is not BadLessThan because it's NOT recoverable
    (r'.', attr_name.Invalid),
]

A_NAME_LEX_COMPILED = MakeLexer(A_NAME_LEX)

# Here we just loop on regular tokens
#
# Examples:
# <a href = unquoted&amp;foo >
# <a href = unquoted&foo >      # BadAmpersand is allowed I guess
# <a href ="unquoted&foo" >     # double quoted
# <a href ='unquoted&foo' >     # single quoted
# <a href = what"foo" >         # HTML5 allows this, but we could disallow it if
#                                 it's not common.  It opens up the j"" and $""
#                                 extensions
# <a href = what'foo' >         # ditto

# TODO: get rid of OLD copy
_UNQUOTED_VALUE_OLD = r'''[^ \t\r\n<>&"'\x00]*'''
_UNQUOTED_VALUE = r'''[^ \t\r\n<>&"'\x00]+'''

# What comes after = ?  Classifies the first character of the value.
A_VALUE_LEX = [
    (r'"', h8_val_id.DoubleQuote),
    (r"'", h8_val_id.SingleQuote),
    (_UNQUOTED_VALUE, h8_val_id.UnquotedVal),
    (r'.', h8_val_id.NoMatch),
]

A_VALUE_LEX_COMPILED = MakeLexer(A_VALUE_LEX)

# What's inside "" or '' ?  Reuses CHAR_LEX so character references are
# tokenized inside quoted values too.
QUOTED_VALUE_LEX = CHAR_LEX + [
    (r'"', h8_id.DoubleQuote),
    (r"'", h8_id.SingleQuote),
    (r'<', h8_id.BadLessThan),  # BadAmpersand is in CharLex

    # TODO: think about whitespace for efficient class= queries?
    #(r'[ \r\n\t]', h8_id.Whitespace),   # terminates unquoted values
    (r'''[^"'<>&\x00]+''', h8_id.RawData),
    # This includes > - it is not BadGreaterThan because it's NOT recoverable
    (r'.', h8_id.Invalid),
]

QUOTED_VALUE_LEX_COMPILED = MakeLexer(QUOTED_VALUE_LEX)
451
452
class AttrLexer(object):
    """Lexer for the attributes inside a single start tag.

    Typical usage:

    while True:
        n, start_pos, end_pos = attr_lx.ReadName()
        if n == attr_name.Ok:
            if attr_lx.AttrNameEquals('div'):
                print('div')

            # TODO: also pass Optional[List[]] out_tokens?
            v, start_pos, end_pos = attr_lx.ReadValue()

    Stateful protocol: Init() once per tag, then alternate ReadName() /
    ReadValue() until ReadName() returns attr_name.Done.
    """

    def __init__(self, s):
        # type: (str) -> None
        self.s = s

        self.tok_id = h8_id.Invalid  # Uninitialized
        self.tag_name_pos = -1  # Invalid
        # NOTE(review): tag_end_pos appears unused; Init() assigns
        # self.end_pos instead - presumably vestigial, confirm.
        self.tag_end_pos = -1
        self.must_not_exceed_pos = -1

        self.pos = -1

        # Span of the last attribute name read (set by ReadName)
        self.name_start = -1
        self.name_end = -1
        self.next_value_is_missing = False

        self.init_t = -1
        self.init_e = -1

    def Init(self, tok_id, tag_name_pos, end_pos):
        # type: (h8_id_t, int, int) -> None
        """Initialize so we can read names and values.

        Example:
          'x <a y>' # tag_name_pos=4, end_pos=6
          'x <a>'   # tag_name_pos=4, end_pos=4

        The Init() method is used to reuse instances of the AttrLexer object.
        """
        assert tag_name_pos >= 0, tag_name_pos
        assert end_pos >= 0, end_pos

        #log('TAG NAME POS %d', tag_name_pos)

        self.tok_id = tok_id
        self.tag_name_pos = tag_name_pos
        self.end_pos = end_pos

        # Check for ambiguous <img src=/>
        if tok_id == h8_id.StartTag:
            self.must_not_exceed_pos = end_pos - 1  # account for >
        elif tok_id == h8_id.StartEndTag:
            self.must_not_exceed_pos = end_pos - 2  # account for />
        else:
            raise AssertionError(tok_id)

        self.pos = tag_name_pos

        # For Reset()
        self.init_t = tag_name_pos
        self.init_e = end_pos

    def Reset(self):
        # type: () -> None
        """Rewind to the state right after the last Init()."""

        # TODO: maybe GetAttrRaw() should call this directly?  But not any of
        # the AllAttrs() methods?
        self.tag_name_pos = self.init_t
        self.end_pos = self.init_e
        self.pos = self.init_t

    def ReadName(self):
        # type: () -> Tuple[attr_name_t, int, int]
        """Reads the attribute name

        Returns (status, name_start, name_end); positions are -1 unless
        status is attr_name.Ok.

        EOF case:
          <a>
          <a >

        Error case:
          <a !>
          <a foo=bar !>
        """
        for pat, a in A_NAME_LEX_COMPILED:
            m = pat.match(self.s, self.pos)
            #log('ReadName() matching %r at %d', self.s, self.pos)
            if m:
                #log('ReadName() tag_name_pos %d pos, %d %s', self.tag_name_pos, self.pos, m.groups())
                if a == attr_name.Invalid:
                    #log('m.groups %s', m.groups())
                    return attr_name.Invalid, -1, -1

                self.pos = m.end(0)  # Advance if it's not invalid

                if a == attr_name.Ok:
                    #log('%r', m.groups())
                    self.name_start = m.start(1)
                    self.name_end = m.end(1)
                    # Is the equals sign missing?  Set state.
                    if m.group(2) is None:
                        self.next_value_is_missing = True
                        # HACK: REWIND, since we don't want to consume
                        # whitespace
                        self.pos = self.name_end
                    else:
                        self.next_value_is_missing = False
                    return attr_name.Ok, self.name_start, self.name_end
                else:
                    # Reset state - e.g. you must call AttrNameEquals
                    self.name_start = -1
                    self.name_end = -1

                    if a == attr_name.Done:
                        return attr_name.Done, -1, -1
        else:
            # The final catch-all rule (r'.') always matches, so this is
            # unreachable unless the rule table is broken.
            context = self.s[self.pos:]
            #log('s %r %d', self.s, self.pos)
            raise AssertionError('h8_id.Invalid rule should have matched %r' %
                                 context)

    def _CanonicalAttrName(self):
        # type: () -> str
        """Return the lower case attribute name.

        Must call after ReadName()
        """
        assert self.name_start >= 0, self.name_start
        assert self.name_end >= 0, self.name_end

        attr_name = self.s[self.name_start:self.name_end]
        # Avoid an allocation in the common already-lower-case case.
        if attr_name.islower():
            return attr_name
        else:
            return attr_name.lower()

    def AttrNameEquals(self, expected):
        # type: (str) -> bool
        """
        Must call after ReadName()

        TODO: This can be optimized to be "in place", with zero allocs.
        """
        return expected == self._CanonicalAttrName()

    def _QuotedRead(self):
        # type: () -> Tuple[h8_id_t, int]
        """Read one token inside a quoted value (see QUOTED_VALUE_LEX)."""

        for pat, tok_id in QUOTED_VALUE_LEX_COMPILED:
            m = pat.match(self.s, self.pos)
            if m:
                end_pos = m.end(0)  # Advance
                #log('_QuotedRead %r', self.s[self.pos:end_pos])
                return tok_id, end_pos
        else:
            context = self.s[self.pos:self.pos + 10]
            raise AssertionError('h8_id.Invalid rule should have matched %r' %
                                 context)

    def ReadValue(self, tokens_out=None):
        # type: (Optional[List[Tuple[h8_id, int]]]) -> Tuple[attr_value_t, int, int]
        """Read the attribute value.

        In general, it is escaped or "raw"

        Returns (value kind, value_start, value_end); positions are -1 for
        Missing/Empty values.

        Can only be called after a SUCCESSFUL ReadName().
        Assuming ReadName() returned a value, this should NOT fail.
        """
        # ReadName() invariant
        assert self.name_start >= 0, self.name_start
        assert self.name_end >= 0, self.name_end

        self.name_start = -1
        self.name_end = -1

        if self.next_value_is_missing:
            # Do not advance self.pos
            #log('-> MISSING pos %d : %r', self.pos, self.s[self.pos:])
            return attr_value_e.Missing, -1, -1

        # Now read " ', unquoted or empty= is valid too.
        for pat, a in A_VALUE_LEX_COMPILED:
            m = pat.match(self.s, self.pos)
            if m:
                first_end_pos = m.end(0)
                # We shouldn't go past the end
                assert first_end_pos <= self.end_pos, \
                    'first_end_pos = %d should be less than self.end_pos = %d' % (first_end_pos, self.end_pos)
                #log('m %s', m.groups())

                # Note: Unquoted value can't contain &amp; etc. now, so there
                # is no unquoting, and no respecting tokens_raw.
                if a == h8_val_id.UnquotedVal:
                    if first_end_pos > self.must_not_exceed_pos:
                        #log('first_end_pos %d', first_end_pos)
                        #log('must_not_exceed_pos %d', self.must_not_exceed_pos)
                        raise LexError(
                            'Ambiguous slash: last attribute should be quoted',
                            self.s, first_end_pos)
                    self.pos = first_end_pos  # Advance
                    return attr_value_e.Unquoted, m.start(0), first_end_pos

                # TODO: respect tokens_out
                if a == h8_val_id.DoubleQuote:
                    self.pos = first_end_pos
                    while True:
                        tok_id, q_end_pos = self._QuotedRead()
                        #log('self.pos %d q_end_pos %d', self.pos, q_end_pos)
                        if tok_id == h8_id.Invalid:
                            raise LexError(
                                'ReadValue() got invalid token (DQ)', self.s,
                                self.pos)
                        if tok_id == h8_id.DoubleQuote:
                            right_pos = self.pos
                            self.pos = q_end_pos  # Advance past "
                            return attr_value_e.DoubleQuoted, first_end_pos, right_pos
                        self.pos = q_end_pos  # Advance _QuotedRead

                # TODO: respect tokens_out
                if a == h8_val_id.SingleQuote:
                    self.pos = first_end_pos
                    while True:
                        tok_id, q_end_pos = self._QuotedRead()
                        if tok_id == h8_id.Invalid:
                            raise LexError(
                                'ReadValue() got invalid token (SQ)', self.s,
                                self.pos)
                        if tok_id == h8_id.SingleQuote:
                            right_pos = self.pos
                            self.pos = q_end_pos  # Advance past "
                            return attr_value_e.SingleQuoted, first_end_pos, right_pos
                        self.pos = q_end_pos  # Advance _QuotedRead

                if a == h8_val_id.NoMatch:
                    # <a foo = >
                    return attr_value_e.Empty, -1, -1
        else:
            raise AssertionError('h8_val_id.NoMatch rule should have matched')
692
693
def GetAttrRaw(attr_lx, name):
    # type: (AttrLexer, str) -> Optional[str]
    """Return the raw (possibly escaped) value of attribute 'name', or None.

    attr_lx must already be Init()'d for the current tag.
    """
    while True:
        n, name_start, name_end = attr_lx.ReadName()
        #log('==> ReadName %s %d %d', attr_name_str(n), name_start, name_end)
        if n == attr_name.Done:
            return None
        if n == attr_name.Invalid:
            raise LexError('GetAttrRaw() got invalid token', attr_lx.s,
                           attr_lx.pos)
        if n != attr_name.Ok:
            raise AssertionError()

        if attr_lx.AttrNameEquals(name):
            v, val_start, val_end = attr_lx.ReadValue()
            return attr_lx.s[val_start:val_end]

        # Problem with stateful API: a successful ReadName() must be
        # paired with a ReadValue() (there is no SkipValue()).
        attr_lx.ReadValue()
716
717
def AllAttrsRawSlice(attr_lx):
    # type: (AttrLexer) -> List[Tuple[int, int, attr_value_t, int, int]]
    """Collect (name_start, name_end, value_kind, val_start, val_end) tuples.

    attr_lx must already be Init()'d for the current tag.
    """
    result = []
    while True:
        n, name_start, name_end = attr_lx.ReadName()
        if 0:
            log(' AllAttrsRaw ==> ReadName %s %d %d %r', attr_name_str(n),
                name_start, name_end, attr_lx.s[attr_lx.pos:attr_lx.pos + 10])

        if n == attr_name.Done:
            return result

        if n == attr_name.Invalid:
            raise LexError('AllAttrsRaw() got invalid token', attr_lx.s,
                           attr_lx.pos)

        if n == attr_name.Ok:
            # Every name must be paired with a ReadValue() call.
            v, val_start, val_end = attr_lx.ReadValue()
            result.append((name_start, name_end, v, val_start, val_end))
        else:
            raise AssertionError()
743
744
def AllAttrsRaw(attr_lx):
    # type: (AttrLexer) -> List[Tuple[str, str]]
    """
    Get a list of pairs [('class', 'foo'), ('href', '?foo=1&amp;bar=2')]

    The quoted values may be escaped.  We would need another lexer to
    unescape them.
    """
    s = attr_lx.s
    return [(s[name_start:name_end], s[val_start:val_end])
            for name_start, name_end, _, val_start, val_end in
            AllAttrsRawSlice(attr_lx)]
761
762
#
# OLD API - REMOVE THIS
#

# Tag names:
#   Match <a  or </a
#   Match <h2, but not <2h
#
# HTML 5 doesn't restrict tag names at all
#   https://html.spec.whatwg.org/#toc-syntax
#
# XML allows : - .
#  https://www.w3.org/TR/xml/#NT-NameChar

# Namespaces for MathML, SVG
# XLink, XML, XMLNS
#
# https://infra.spec.whatwg.org/#namespaces
#
# Allow - for td-attrs

# TODO: we don't need to capture the tag name here?  That's done at the top
# level
_TAG_RE = re.compile(r'/? \s* (%s)' % _NAME, re.VERBOSE)

# Matches the end of a tag: optional whitespace, optional /, then >.
_TAG_LAST_RE = re.compile(r'\s* /? >', re.VERBOSE)

# To match href="foo"
# Note: in HTML5 and XML, single quoted attributes are also valid

# <button disabled> is standard usage

# NOTE: This used to allow whitespace around =
# <a foo = "bar"> makes sense in XML
# But then you also have
# <a foo= bar> - which is TWO attributes, in HTML5
# So the space is problematic

_ATTR_RE = re.compile(
    r'''
\s+ # Leading whitespace is required
(%s) # Attribute name
(?: # Optional attribute value
 \s* = \s* # Spaces allowed around =
 (?:
 " ([^>"\x00]*) " # double quoted value
 | ' ([^>'\x00]*) ' # single quoted value
 | (%s) # Attribute value
 )
)?
''' % (_NAME, _UNQUOTED_VALUE_OLD), re.VERBOSE)
814
815
class TagLexer(object):
    """
    Given a tag like <a href="..."> or <link type="..." />, the TagLexer
    provides a few operations:

    - What is the tag?
    - Iterate through the attributes, giving (name, value_start_pos, value_end_pos)

    (OLD API - see AttrLexer above for the replacement.)
    """

    def __init__(self, s):
        # type: (str) -> None
        self.s = s
        self.start_pos = -1  # Invalid
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        # type: (int, int) -> None
        """Reuse instances of this object."""
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos

        self.start_pos = start_pos
        self.end_pos = end_pos

    def WholeTagString(self):
        # type: () -> str
        """Return the entire tag string, e.g. <a href='foo'>"""
        return self.s[self.start_pos:self.end_pos]

    def GetTagName(self):
        # type: () -> str
        """Return the tag name, taken from the first token."""
        # First event
        tok_id, start, end = next(self.Tokens())
        return self.s[start:end]

    def GetSpanForAttrValue(self, attr_name):
        # type: (str) -> Tuple[int, int]
        """Return (start, end) of the value of attr_name, or (-1, -1).

        Used by oils_doc.py, for href shortcuts
        """
        # Algorithm: search for QuotedValue or UnquotedValue after AttrName
        # TODO: Could also cache these

        events = self.Tokens()
        val = (-1, -1)
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == h8_tag_id.AttrName:
                    name = self.s[start:end]
                    if name == attr_name:
                        # The value should come next
                        tok_id, start, end = next(events)
                        assert tok_id in (
                            h8_tag_id.QuotedValue, h8_tag_id.UnquotedValue,
                            h8_tag_id.MissingValue), h8_tag_id_str(tok_id)
                        val = start, end
                        break

        except StopIteration:
            pass
        return val

    def GetAttrRaw(self, attr_name):
        # type: (str) -> Optional[str]
        """
        Return the value, which may be UNESCAPED.
        """
        start, end = self.GetSpanForAttrValue(attr_name)
        if start == -1:
            return None
        return self.s[start:end]

    def AllAttrsRawSlice(self):
        # type: () -> List[Tuple[str, int, int]]
        """
        Get a list of pairs [('class', 3, 5), ('href', 9, 12)]
        """
        slices = []
        events = self.Tokens()
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == h8_tag_id.AttrName:
                    name = self.s[start:end]

                    # The value should come next
                    tok_id, start, end = next(events)
                    assert tok_id in (
                        h8_tag_id.QuotedValue, h8_tag_id.UnquotedValue,
                        h8_tag_id.MissingValue), h8_tag_id_str(tok_id)
                    # Note: quoted values may have &amp;
                    # We would need ANOTHER lexer to unescape them, but we
                    # don't need that for ul-table
                    slices.append((name, start, end))
        except StopIteration:
            pass
        return slices

    def AllAttrsRaw(self):
        # type: () -> List[Tuple[str, str]]
        """
        Get a list of pairs [('class', 'foo'), ('href', '?foo=1&amp;bar=2')]

        The quoted values may be escaped.  We would need another lexer to
        unescape them.
        """
        slices = self.AllAttrsRawSlice()
        pairs = []
        for name, start, end in slices:
            pairs.append((name, self.s[start:end]))
        return pairs

    def Tokens(self):
        # type: () -> Iterator[Tuple[h8_tag_id_t, int, int]]
        """
        Yields a sequence of tokens: Tag (AttrName AttrValue?)*

        Where each Token is (Type, start_pos, end_pos)

        Note that start and end are NOT redundant!  We skip over some unwanted
        characters.
        """
        # Skip the leading < before matching the tag name.
        m = _TAG_RE.match(self.s, self.start_pos + 1)
        if not m:
            raise RuntimeError("Couldn't find HTML tag in %r" %
                               self.WholeTagString())
        yield h8_tag_id.TagName, m.start(1), m.end(1)

        pos = m.end(0)
        #log('POS %d', pos)

        while True:
            # don't search past the end
            m = _ATTR_RE.match(self.s, pos, self.end_pos)
            if not m:
                #log('BREAK pos %d', pos)
                break
            #log('AttrName %r', m.group(1))

            yield h8_tag_id.AttrName, m.start(1), m.end(1)

            #log('m.groups() %r', m.groups())
            if m.group(2) is not None:
                # double quoted
                yield h8_tag_id.QuotedValue, m.start(2), m.end(2)
            elif m.group(3) is not None:
                # single quoted - TODO: could have different token types
                yield h8_tag_id.QuotedValue, m.start(3), m.end(3)
            elif m.group(4) is not None:
                yield h8_tag_id.UnquotedValue, m.start(4), m.end(4)
            else:
                # <button disabled>
                end = m.end(0)
                yield h8_tag_id.MissingValue, end, end

            # Skip past the "
            pos = m.end(0)

        #log('TOK %r', self.s)

        # After all attributes, the tag must close cleanly with > or />;
        # anything else is trailing junk.
        m = _TAG_LAST_RE.match(self.s, pos)
        #log('_TAG_LAST_RE match %r', self.s[pos:])
        if not m:
            raise LexError('Extra data at end of tag', self.s, pos)