"""data_lang/htm8.py

TODO

Migrate:

- doctools/ul_table.py should use new AttrLexer
  - AllAttrsRaw()
- maybe: migrate everything off of TagLexer()
  - and AttrValueLexer() - this should require Validate()

API:
- Get rid of Reset()?
- Deprecate tag_lexer.GetTagName() in favor of lx.CanonicalTagName() or
  _LiteralTagName()
- UTF-8 check, like JSON8
- re2c
  - port lexer, which will fix static typing issues
  - the abstraction needs to support submatch?
    - for finding the end of a tag, etc.?
    - and what about no match?

- harmonize LexError and ParseError with data_lang/j8.py, which uses
  error.Decode(msg, ..., cur_line_num)

- Copy all errors into doc/ref/chap-errors.md
  - This helps understand the language

- Update doc/htm8.md
- List of algorithms:
  - lex just the top level
  - lex both levels
    - and match tags - this is the level for value.Htm8Frag?
  - convert to XML!
  - lazy selection by tag, or attr (id= and class=)
  - lazy selection by CSS selector expression
  - convert to DOMTree
  - sed-like replacement of DOM Tree or element
  - untrusted HTML filter, e.g. StackOverflow / Reddit
    - this is Safe HTM8
    - should have a zero alloc way to support this, with good errors?
    - I think most of them silently strip data
"""

import re

from typing import Dict, List, Tuple, Optional, IO, Iterator, Any

from _devbuild.gen.htm8_asdl import (h8_id, h8_id_t, h8_tag_id, h8_tag_id_t,
                                     h8_tag_id_str, attr_name, attr_name_t,
                                     attr_name_str, attr_value_e, attr_value_t,
                                     h8_val_id)
from doctools.util import log


class LexError(Exception):
    """
    Examples of lex errors:

    - h8_id.Invalid, like <> or &&
    - Unclosed <!-- <? <![CDATA[ <script> <style>
    """

    def __init__(self, msg, code_str, start_pos):
        # type: (str, str, int) -> None
        self.msg = msg
        self.code_str = code_str
        self.start_pos = start_pos

    def __str__(self):
        # type: () -> str
        return '(LexError %r %r)' % (
            self.msg, self.code_str[self.start_pos:self.start_pos + 20])


def _FindLineNum(s, error_pos):
    # type: (str, int) -> int
    current_pos = 0
    line_num = 1
    while True:
        newline_pos = s.find('\n', current_pos)
        #log('current = %d, N %d, line %d', current_pos, newline_pos, line_num)

        if newline_pos == -1:  # this is the last line
            return line_num
        if newline_pos >= error_pos:
            return line_num
        line_num += 1
        current_pos = newline_pos + 1

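# For reference (illustrative example, not in the original file):
# _FindLineNum('a\nb\nc', 4) == 3, since position 4 falls on the third line.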

class ParseError(Exception):
    """
    Examples of parse errors

    - unbalanced tag structure
    - ul_table.py errors
    """

    def __init__(self, msg, s=None, start_pos=-1):
        # type: (str, Optional[str], int) -> None
        self.msg = msg
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        # type: () -> str
        if self.s is not None:
            assert self.start_pos != -1, self.start_pos
            snippet = (self.s[self.start_pos:self.start_pos + 20])

            line_num = _FindLineNum(self.s, self.start_pos)
        else:
            snippet = ''
            line_num = -1
        msg = 'line %d: %r %r' % (line_num, self.msg, snippet)
        return msg


class Output(object):
    """Output for sed-like "replacement" model.

    Takes an underlying input buffer and an output file. Maintains a position
    in the input buffer.

    Print FROM the input or print new text to the output.
    """

    def __init__(self, s, f, left_pos=0, right_pos=-1):
        # type: (str, IO[str], int, int) -> None
        self.s = s
        self.f = f
        self.pos = left_pos
        self.right_pos = len(s) if right_pos == -1 else right_pos

    def SkipTo(self, pos):
        # type: (int) -> None
        """Skip to a position."""
        self.pos = pos

    def PrintUntil(self, pos):
        # type: (int) -> None
        """Print until a position."""
        piece = self.s[self.pos:pos]
        self.f.write(piece)
        self.pos = pos

    def PrintTheRest(self):
        # type: () -> None
        """Print until the end of the string."""
        self.PrintUntil(self.right_pos)

    def Print(self, s):
        # type: (str) -> None
        """Print text to the underlying buffer."""
        self.f.write(s)

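# Usage sketch for the Output class above (illustrative, not from the original
# file): copy from the input buffer, splice in replacement text, skip a span.
#
#   out = Output('<p>hi</p>', sys.stdout)   # assumes 'import sys'
#   out.PrintUntil(3)    # writes '<p>'
#   out.Print('hello')   # writes replacement text; input position unchanged
#   out.SkipTo(5)        # skip over 'hi' in the input
#   out.PrintTheRest()   # writes '</p>'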

def MakeLexer(rules):
    return [(re.compile(pat, re.VERBOSE), i) for (pat, i) in rules]


#
# Lexers
#

_NAME = r'[a-zA-Z][a-zA-Z0-9:_\-]*'  # must start with letter

CHAR_LEX = [
    # Characters
    # https://www.w3.org/TR/xml/#sec-references
    (r'&\# [0-9]+ ;', h8_id.DecChar),
    (r'&\# x[0-9a-fA-F]+ ;', h8_id.HexChar),
    (r'& %s ;' % _NAME, h8_id.CharEntity),
    # Allow unquoted, and quoted
    (r'&', h8_id.BadAmpersand),
]
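# Illustrative matches for CHAR_LEX (example added here, not in the original):
#   '&#38;'   -> h8_id.DecChar
#   '&#x26;'  -> h8_id.HexChar
#   '&amp;'   -> h8_id.CharEntity
#   '&'       -> h8_id.BadAmpersand  (a bare ampersand is tolerated)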

HTM8_LEX = CHAR_LEX + [
    # TODO: CommentBegin, ProcessingBegin, CDataBegin could have an additional
    # action associated with them? The ending substring
    (r'<!--', h8_id.CommentBegin),

    # Processing instructions are used for the XML header:
    # <?xml version="1.0" encoding="UTF-8"?>
    # They are technically XML-only, but in HTML5, they are another kind of
    # comment:
    #
    # https://developer.mozilla.org/en-US/docs/Web/API/ProcessingInstruction
    #
    (r'<\?', h8_id.ProcessingBegin),
    # Not necessary in HTML5, but occurs in XML
    (r'<!\[CDATA\[', h8_id.CDataBegin),  # <![CDATA[

    # Markup declarations
    # - In HTML5, there is only <!DOCTYPE html>
    # - XML has 4 more declarations: <!ELEMENT ...> ATTLIST ENTITY NOTATION
    #   - these seem to be part of DTD
    # - it's useful to skip these, and be able to parse the rest of the document
    # - Note: < is allowed?
    (r'<! [^>\x00]+ >', h8_id.Decl),

    # Tags
    # Notes:
    # - We look for a valid tag name, but we don't validate attributes.
    #   That's done in the tag lexer.
    # - We don't allow leading whitespace
    (r'</ (%s) >' % _NAME, h8_id.EndTag),
    # self-closing <br/> comes before StartTag
    # could/should these be collapsed into one rule?
    (r'< (%s) [^>\x00]* />' % _NAME, h8_id.StartEndTag),  # self-closing <br/>
    (r'< (%s) [^>\x00]* >' % _NAME, h8_id.StartTag),  # start <a>

    # HTML5 allows unescaped > in raw data, but < is not allowed.
    # https://stackoverflow.com/questions/10462348/right-angle-bracket-in-html
    #
    # - My early blog has THREE errors when disallowing >
    # - So do some .wwz files
    (r'[^&<>\x00]+', h8_id.RawData),
    (r'>', h8_id.BadGreaterThan),
    # NUL is the end, an accommodation for re2c. Like we do in frontend/match.
    (r'\x00', h8_id.EndOfStream),
    # This includes < - it is not BadLessThan because it's NOT recoverable
    (r'.', h8_id.Invalid),
]

# Old notes:
#
# Non-greedy matches are regular and can be matched in linear time
# with RE2.
#
# https://news.ycombinator.com/item?id=27099798
#

# This person tried to do it with a regex:
#
# https://skeptric.com/html-comment-regexp/index.html

# . is any char except newline
# https://re2c.org/manual/manual_c.html

# Discarded options
#(r'<!-- .*? -->', h8_id.Comment),

# Hack from Claude: \s\S instead of re.DOTALL. I don't like this
#(r'<!-- [\s\S]*? -->', h8_id.Comment),
#(r'<!-- (?:.|[\n])*? -->', h8_id.Comment),

HTM8_LEX_COMPILED = MakeLexer(HTM8_LEX)


class Lexer(object):

    def __init__(self, s, left_pos=0, right_pos=-1, no_special_tags=False):
        # type: (str, int, int, bool) -> None
        self.s = s
        self.pos = left_pos
        self.right_pos = len(s) if right_pos == -1 else right_pos
        self.no_special_tags = no_special_tags

        # string -> compiled regex pattern object
        self.cache = {}  # type: Dict[str, Any]

        # either </script> or </style> - we search until we see that
        self.search_state = None  # type: Optional[str]

        # Position of tag name, if applicable
        # - Set after you get a StartTag, EndTag, or StartEndTag
        # - Unset on other tags
        self.tag_pos_left = -1
        self.tag_pos_right = -1

    def _Read(self):
        # type: () -> Tuple[h8_id_t, int]
        if self.pos == self.right_pos:
            return h8_id.EndOfStream, self.pos

        assert self.pos < self.right_pos, self.pos

        if self.search_state is not None and not self.no_special_tags:
            # TODO: case-insensitive search for </SCRIPT> <SCRipt> ?
            #
            # Another strategy: enter a mode where we find ONLY the end tag
            # regex, and any data that's not <, and then check the canonical
            # tag name for 'script' or 'style'.
            pos = self.s.find(self.search_state, self.pos)
            if pos == -1:
                raise LexError('Unterminated <script> or <style>', self.s,
                               self.pos)
            self.search_state = None
            # The token ends at the beginning of the end tag
            return h8_id.HtmlCData, pos

        # Find the first match.
        # Note: frontend/match.py uses _LongestMatch(), which is different!
        # TODO: reconcile them. This lexer should be expressible in re2c.

        for pat, tok_id in HTM8_LEX_COMPILED:
            m = pat.match(self.s, self.pos)
            if m:
                if tok_id in (h8_id.StartTag, h8_id.EndTag, h8_id.StartEndTag):
                    self.tag_pos_left = m.start(1)
                    self.tag_pos_right = m.end(1)
                else:
                    # Reset state
                    self.tag_pos_left = -1
                    self.tag_pos_right = -1

                if tok_id == h8_id.CommentBegin:
                    pos = self.s.find('-->', self.pos)
                    if pos == -1:
                        raise LexError('Unterminated <!--', self.s, self.pos)
                    return h8_id.Comment, pos + 3  # -->

                if tok_id == h8_id.ProcessingBegin:
                    pos = self.s.find('?>', self.pos)
                    if pos == -1:
                        raise LexError('Unterminated <?', self.s, self.pos)
                    return h8_id.Processing, pos + 2  # ?>

                if tok_id == h8_id.CDataBegin:
                    pos = self.s.find(']]>', self.pos)
                    if pos == -1:
                        # unterminated <![CDATA[
                        raise LexError('Unterminated <![CDATA[', self.s,
                                       self.pos)
                    return h8_id.CData, pos + 3  # ]]>

                if tok_id == h8_id.StartTag:
                    # TODO: reduce allocations
                    if (self.TagNameEquals('script') or
                            self.TagNameEquals('style')):
                        # <SCRipt a=b> -> </SCRipt>
                        self.search_state = '</' + self._LiteralTagName() + '>'

                return tok_id, m.end()
        else:
            raise AssertionError('h8_id.Invalid rule should have matched')

    def TagNamePos(self):
        # type: () -> int
        """The right position of the tag name."""
        assert self.tag_pos_right != -1, self.tag_pos_right
        return self.tag_pos_right

    def TagNameEquals(self, expected):
        # type: (str) -> bool
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        # TODO: In C++, this does not need an allocation. Can we test
        # directly?
        return expected == self.CanonicalTagName()

    def _LiteralTagName(self):
        # type: () -> str
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        return self.s[self.tag_pos_left:self.tag_pos_right]

    def CanonicalTagName(self):
        # type: () -> str
        tag_name = self._LiteralTagName()
        # Most tags are already lower case, so avoid allocation with this conditional
        # TODO: this could go in the mycpp runtime?
        if tag_name.islower():
            return tag_name
        else:
            return tag_name.lower()

    def Read(self):
        # type: () -> Tuple[h8_id_t, int]
        tok_id, end_pos = self._Read()
        self.pos = end_pos  # advance
        return tok_id, end_pos

    def LookAhead(self, regex):
        # type: (str) -> bool
        """
        Currently used for ul_table.py. But taking a dynamic regex string is
        not the right interface.
        """
        # Cache the regex compilation. This could also be LookAheadFor(THEAD)
        # or something.
        pat = self.cache.get(regex)
        if pat is None:
            pat = re.compile(regex)
            self.cache[regex] = pat

        m = pat.match(self.s, self.pos)
        return m is not None

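# Usage sketch for Lexer (illustrative, not from the original file). Token
# text is the slice between the previous end position and the current one.
#
#   lx = Lexer('<p id="x">hi</p>')
#   start = 0
#   while True:
#       tok_id, end_pos = lx.Read()
#       if tok_id == h8_id.EndOfStream:
#           break
#       print(tok_id, repr(lx.s[start:end_pos]))
#       start = end_pos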

A_NAME_LEX = [
    # Leading whitespace is required, to separate attributes.
    #
    # If the = is not present, then we set the lexer in a state for
    # attr_value_e.Missing.
    (r'\s+ (%s) \s* (=)? \s*' % _NAME, attr_name.Ok),
    # unexpected EOF

    # The closing > or /> is treated as end of stream, and it's not an error.
    (r'\s* /? >', attr_name.Done),

    # NUL should not be possible, because the top-level lexer treats \x00 as
    # EndOfStream

    # This includes < - it is not BadLessThan because it's NOT recoverable
    (r'.', attr_name.Invalid),
]

A_NAME_LEX_COMPILED = MakeLexer(A_NAME_LEX)

# Here we just loop on regular tokens
#
# Examples:
# <a href = unquoted&amp;foo >
# <a href = unquoted&foo >   # BadAmpersand is allowed I guess
# <a href ="unquoted&foo" >  # double quoted
# <a href ='unquoted&foo' >  # single quoted
# <a href = what"foo" >      # HTML5 allows this, but we could disallow it if
#                              it's not common. It opens up the j"" and $"" extensions
# <a href = what'foo' >      # ditto

# TODO: get rid of OLD copy
_UNQUOTED_VALUE_OLD = r'''[^ \t\r\n<>&"'\x00]*'''
_UNQUOTED_VALUE = r'''[^ \t\r\n<>&"'\x00]+'''

# What comes after = ?
A_VALUE_LEX = [
    (r'"', h8_val_id.DoubleQuote),
    (r"'", h8_val_id.SingleQuote),
    (_UNQUOTED_VALUE, h8_val_id.UnquotedVal),
    (r'.', h8_val_id.NoMatch),
]

A_VALUE_LEX_COMPILED = MakeLexer(A_VALUE_LEX)

# What's inside "" or '' ?
QUOTED_VALUE_LEX = CHAR_LEX + [
    (r'"', h8_id.DoubleQuote),
    (r"'", h8_id.SingleQuote),
    (r'<', h8_id.BadLessThan),  # BadAmpersand is in CHAR_LEX

    # TODO: think about whitespace for efficient class= queries?
    #(r'[ \r\n\t]', h8_id.Whitespace),  # terminates unquoted values
    (r'''[^"'<>&\x00]+''', h8_id.RawData),
    # This includes > - it is not BadGreaterThan because it's NOT recoverable
    (r'.', h8_id.Invalid),
]

QUOTED_VALUE_LEX_COMPILED = MakeLexer(QUOTED_VALUE_LEX)


class AttrLexer(object):
    """
    Typical usage:

    while True:
        n, start_pos, end_pos = attr_lx.ReadName()
        if n == attr_name.Ok:
            if attr_lx.AttrNameEquals('div'):
                print('div')

            # TODO: also pass Optional[List[]] out_tokens?
            v, start_pos, end_pos = attr_lx.ReadValue()
    """

    def __init__(self, s):
        # type: (str) -> None
        self.s = s

        self.tok_id = h8_id.Invalid  # Uninitialized
        self.tag_name_pos = -1  # Invalid
        self.tag_end_pos = -1
        self.must_not_exceed_pos = -1

        self.pos = -1

        self.name_start = -1
        self.name_end = -1
        self.next_value_is_missing = False

        self.init_t = -1
        self.init_e = -1

    def Init(self, tok_id, tag_name_pos, end_pos):
        # type: (h8_id_t, int, int) -> None
        """Initialize so we can read names and values.

        Example:
        'x <a y>'  # tag_name_pos=4, end_pos=6
        'x <a>'    # tag_name_pos=4, end_pos=4

        The Init() method is used to reuse instances of the AttrLexer object.
        """
        assert tag_name_pos >= 0, tag_name_pos
        assert end_pos >= 0, end_pos

        #log('TAG NAME POS %d', tag_name_pos)

        self.tok_id = tok_id
        self.tag_name_pos = tag_name_pos
        self.end_pos = end_pos

        # Check for ambiguous <img src=/>
        if tok_id == h8_id.StartTag:
            self.must_not_exceed_pos = end_pos - 1  # account for >
        elif tok_id == h8_id.StartEndTag:
            self.must_not_exceed_pos = end_pos - 2  # account for />
        else:
            raise AssertionError(tok_id)

        self.pos = tag_name_pos

        # For Reset()
        self.init_t = tag_name_pos
        self.init_e = end_pos

    def Reset(self):
        # type: () -> None

        # TODO: maybe GetAttrRaw() should call this directly? But not any of
        # the AllAttrs() methods?
        self.tag_name_pos = self.init_t
        self.end_pos = self.init_e
        self.pos = self.init_t

    def ReadName(self):
        # type: () -> Tuple[attr_name_t, int, int]
        """Reads the attribute name

        EOF case:
        <a>
        <a >

        Error case:
        <a !>
        <a foo=bar !>
        """
        for pat, a in A_NAME_LEX_COMPILED:
            m = pat.match(self.s, self.pos)
            #log('ReadName() matching %r at %d', self.s, self.pos)
            if m:
                #log('ReadName() tag_name_pos %d pos, %d %s', self.tag_name_pos, self.pos, m.groups())
                if a == attr_name.Invalid:
                    #log('m.groups %s', m.groups())
                    return attr_name.Invalid, -1, -1

                self.pos = m.end(0)  # Advance if it's not invalid

                if a == attr_name.Ok:
                    #log('%r', m.groups())
                    self.name_start = m.start(1)
                    self.name_end = m.end(1)
                    # Is the equals sign missing? Set state.
                    if m.group(2) is None:
                        self.next_value_is_missing = True
                        # HACK: REWIND, since we don't want to consume whitespace
                        self.pos = self.name_end
                    else:
                        self.next_value_is_missing = False
                    return attr_name.Ok, self.name_start, self.name_end
                else:
                    # Reset state - AttrNameEquals() can't be used until the
                    # next successful ReadName()
                    self.name_start = -1
                    self.name_end = -1

                    if a == attr_name.Done:
                        return attr_name.Done, -1, -1
        else:
            context = self.s[self.pos:]
            #log('s %r %d', self.s, self.pos)
            raise AssertionError(
                'attr_name.Invalid rule should have matched %r' % context)

    def _CanonicalAttrName(self):
        # type: () -> str
        """Return the lower case attribute name.

        Must call after ReadName()
        """
        assert self.name_start >= 0, self.name_start
        assert self.name_end >= 0, self.name_end

        attr_name = self.s[self.name_start:self.name_end]
        if attr_name.islower():
            return attr_name
        else:
            return attr_name.lower()

    def AttrNameEquals(self, expected):
        # type: (str) -> bool
        """
        Must call after ReadName()

        TODO: This can be optimized to be "in place", with zero allocs.
        """
        return expected == self._CanonicalAttrName()

    def _QuotedRead(self):
        # type: () -> Tuple[h8_id_t, int]

        for pat, tok_id in QUOTED_VALUE_LEX_COMPILED:
            m = pat.match(self.s, self.pos)
            if m:
                end_pos = m.end(0)  # Advance
                #log('_QuotedRead %r', self.s[self.pos:end_pos])
                return tok_id, end_pos
        else:
            context = self.s[self.pos:self.pos + 10]
            raise AssertionError(
                'h8_id.Invalid rule should have matched %r' % context)

    def ReadValue(self, tokens_out=None):
        # type: (Optional[List[Tuple[h8_id_t, int]]]) -> Tuple[attr_value_t, int, int]
        """Read the attribute value.

        In general, it is escaped or "raw"

        Can only be called after a SUCCESSFUL ReadName().
        Assuming ReadName() returned a value, this should NOT fail.
        """
        # ReadName() invariant
        assert self.name_start >= 0, self.name_start
        assert self.name_end >= 0, self.name_end

        self.name_start = -1
        self.name_end = -1

        if self.next_value_is_missing:
            # Do not advance self.pos
            #log('-> MISSING pos %d : %r', self.pos, self.s[self.pos:])
            return attr_value_e.Missing, -1, -1

        # Now read " ', unquoted or empty= is valid too.
        for pat, a in A_VALUE_LEX_COMPILED:
            m = pat.match(self.s, self.pos)
            if m:
                first_end_pos = m.end(0)
                # We shouldn't go past the end
                assert first_end_pos <= self.end_pos, \
                    'first_end_pos = %d should not exceed self.end_pos = %d' % (
                        first_end_pos, self.end_pos)
                #log('m %s', m.groups())

                # Note: Unquoted value can't contain &amp; etc. now, so there
                # is no unquoting, and no respecting tokens_out.
                if a == h8_val_id.UnquotedVal:
                    if first_end_pos > self.must_not_exceed_pos:
                        #log('first_end_pos %d', first_end_pos)
                        #log('must_not_exceed_pos %d', self.must_not_exceed_pos)
                        raise LexError(
                            'Ambiguous slash: last attribute should be quoted',
                            self.s, first_end_pos)
                    self.pos = first_end_pos  # Advance
                    return attr_value_e.Unquoted, m.start(0), first_end_pos

                # TODO: respect tokens_out
                if a == h8_val_id.DoubleQuote:
                    self.pos = first_end_pos
                    while True:
                        tok_id, q_end_pos = self._QuotedRead()
                        #log('self.pos %d q_end_pos %d', self.pos, q_end_pos)
                        if tok_id == h8_id.Invalid:
                            raise LexError(
                                'ReadValue() got invalid token (DQ)', self.s,
                                self.pos)
                        if tok_id == h8_id.DoubleQuote:
                            right_pos = self.pos
                            self.pos = q_end_pos  # Advance past "
                            return attr_value_e.DoubleQuoted, first_end_pos, right_pos
                        self.pos = q_end_pos  # Advance _QuotedRead

                # TODO: respect tokens_out
                if a == h8_val_id.SingleQuote:
                    self.pos = first_end_pos
                    while True:
                        tok_id, q_end_pos = self._QuotedRead()
                        if tok_id == h8_id.Invalid:
                            raise LexError(
                                'ReadValue() got invalid token (SQ)', self.s,
                                self.pos)
                        if tok_id == h8_id.SingleQuote:
                            right_pos = self.pos
                            self.pos = q_end_pos  # Advance past '
                            return attr_value_e.SingleQuoted, first_end_pos, right_pos
                        self.pos = q_end_pos  # Advance _QuotedRead

                if a == h8_val_id.NoMatch:
                    # <a foo = >
                    return attr_value_e.Empty, -1, -1
        else:
            raise AssertionError('h8_val_id.NoMatch rule should have matched')


def GetAttrRaw(attr_lx, name):
    # type: (AttrLexer, str) -> Optional[str]
    while True:
        n, name_start, name_end = attr_lx.ReadName()
        #log('==> ReadName %s %d %d', attr_name_str(n), name_start, name_end)
        if n == attr_name.Ok:
            if attr_lx.AttrNameEquals(name):
                v, val_start, val_end = attr_lx.ReadValue()
                return attr_lx.s[val_start:val_end]
            else:
                # Problem with stateful API: You are forced to either
                # ReadValue() or SkipValue()
                attr_lx.ReadValue()
        elif n == attr_name.Done:
            break
        elif n == attr_name.Invalid:
            raise LexError('GetAttrRaw() got invalid token', attr_lx.s,
                           attr_lx.pos)
        else:
            raise AssertionError()

    return None


def AllAttrsRaw(attr_lx):
    # type: (AttrLexer) -> List[Tuple[str, str]]
    result = []
    while True:
        n, name_start, name_end = attr_lx.ReadName()
        if 0:
            log(' AllAttrsRaw ==> ReadName %s %d %d %r', attr_name_str(n),
                name_start, name_end, attr_lx.s[attr_lx.pos:attr_lx.pos + 10])
        if n == attr_name.Ok:
            name = attr_lx.s[name_start:name_end]
            #log('  Name %r', name)

            v, val_start, val_end = attr_lx.ReadValue()
            val = attr_lx.s[val_start:val_end]
            #log('  ReadValue %r', val)
            result.append((name, val))
        elif n == attr_name.Done:
            break
        elif n == attr_name.Invalid:
            raise LexError('AllAttrsRaw() got invalid token', attr_lx.s,
                           attr_lx.pos)
        else:
            raise AssertionError()

    return result

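# Illustrative end-to-end sketch (not part of the original file): feed tag
# positions from Lexer into an AttrLexer.
#
#   lx = Lexer('<a href="/x" id=y>link</a>')
#   attr_lx = AttrLexer(lx.s)
#   while True:
#       tok_id, end_pos = lx.Read()
#       if tok_id == h8_id.EndOfStream:
#           break
#       if tok_id in (h8_id.StartTag, h8_id.StartEndTag):
#           attr_lx.Init(tok_id, lx.TagNamePos(), end_pos)
#           print(AllAttrsRaw(attr_lx))  # expect [('href', '/x'), ('id', 'y')]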

#
# OLD API - REMOVE THIS
#

# Tag names:
# Match <a or </a
# Match <h2, but not <2h
#
# HTML 5 doesn't restrict tag names at all
# https://html.spec.whatwg.org/#toc-syntax
#
# XML allows : - .
# https://www.w3.org/TR/xml/#NT-NameChar

# Namespaces for MathML, SVG
# XLink, XML, XMLNS
#
# https://infra.spec.whatwg.org/#namespaces
#
# Allow - for td-attrs

# TODO: we don't need to capture the tag name here? That's done at the top
# level
_TAG_RE = re.compile(r'/? \s* (%s)' % _NAME, re.VERBOSE)

_TAG_LAST_RE = re.compile(r'\s* /? >', re.VERBOSE)

# To match href="foo"
# Note: in HTML5 and XML, single quoted attributes are also valid

# <button disabled> is standard usage

# NOTE: This used to allow whitespace around =
# <a foo = "bar"> makes sense in XML
# But then you also have
# <a foo= bar> - which is TWO attributes, in HTML5
# So the space is problematic

_ATTR_RE = re.compile(
    r'''
\s+                     # Leading whitespace is required
(%s)                    # Attribute name
(?:                     # Optional attribute value
  \s* = \s*             # Spaces allowed around =
  (?:
    " ([^>"\x00]*) "    # double quoted value
  | ' ([^>'\x00]*) '    # single quoted value
  | (%s)                # Attribute value
  )
)?
''' % (_NAME, _UNQUOTED_VALUE_OLD), re.VERBOSE)
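# Illustrative _ATTR_RE matches (added example, not in the original file);
# groups are (name, double_quoted, single_quoted, unquoted):
#   ' href="foo"'  -> ('href', 'foo', None, None)
#   " href='foo'"  -> ('href', None, 'foo', None)
#   ' href=foo'    -> ('href', None, None, 'foo')
#   ' disabled'    -> ('disabled', None, None, None)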


class TagLexer(object):
    """
    Given a tag like <a href="..."> or <link type="..." />, the TagLexer
    provides a few operations:

    - What is the tag?
    - Iterate through the attributes, giving (name, value_start_pos, value_end_pos)
    """

    def __init__(self, s):
        # type: (str) -> None
        self.s = s
        self.start_pos = -1  # Invalid
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        # type: (int, int) -> None
        """Reuse instances of this object."""
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos

        self.start_pos = start_pos
        self.end_pos = end_pos

    def WholeTagString(self):
        # type: () -> str
        """Return the entire tag string, e.g. <a href='foo'>"""
        return self.s[self.start_pos:self.end_pos]

    def GetTagName(self):
        # type: () -> str
        # First event
        tok_id, start, end = next(self.Tokens())
        return self.s[start:end]

    def GetSpanForAttrValue(self, attr_name):
        # type: (str) -> Tuple[int, int]
        """
        Used by oils_doc.py, for href shortcuts
        """
        # Algorithm: search for QuotedValue or UnquotedValue after AttrName
        # TODO: Could also cache these

        events = self.Tokens()
        val = (-1, -1)
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == h8_tag_id.AttrName:
                    name = self.s[start:end]
                    if name == attr_name:
                        # The value should come next
                        tok_id, start, end = next(events)
                        assert tok_id in (
                            h8_tag_id.QuotedValue, h8_tag_id.UnquotedValue,
                            h8_tag_id.MissingValue), h8_tag_id_str(tok_id)
                        val = start, end
                        break

        except StopIteration:
            pass
        return val

    def GetAttrRaw(self, attr_name):
        # type: (str) -> Optional[str]
        """
        Return the value, which may be UNESCAPED.
        """
        start, end = self.GetSpanForAttrValue(attr_name)
        if start == -1:
            return None
        return self.s[start:end]

    def AllAttrsRawSlice(self):
        # type: () -> List[Tuple[str, int, int]]
        """
        Get a list of (name, start, end) tuples, e.g.
        [('class', 3, 5), ('href', 9, 12)]
        """
        slices = []
        events = self.Tokens()
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == h8_tag_id.AttrName:
                    name = self.s[start:end]

                    # The value should come next
                    tok_id, start, end = next(events)
                    assert tok_id in (
                        h8_tag_id.QuotedValue, h8_tag_id.UnquotedValue,
                        h8_tag_id.MissingValue), h8_tag_id_str(tok_id)
                    # Note: quoted values may have &amp;
                    # We would need ANOTHER lexer to unescape them, but we
                    # don't need that for ul-table
                    slices.append((name, start, end))
        except StopIteration:
            pass
        return slices

    def AllAttrsRaw(self):
        # type: () -> List[Tuple[str, str]]
        """
        Get a list of pairs [('class', 'foo'), ('href', '?foo=1&amp;bar=2')]

        The quoted values may be escaped. We would need another lexer to
        unescape them.
        """
        slices = self.AllAttrsRawSlice()
        pairs = []
        for name, start, end in slices:
            pairs.append((name, self.s[start:end]))
        return pairs

    def Tokens(self):
        # type: () -> Iterator[Tuple[h8_tag_id_t, int, int]]
        """
        Yields a sequence of tokens: Tag (AttrName AttrValue?)*

        Where each Token is (Type, start_pos, end_pos)

        Note that start and end are NOT redundant! We skip over some unwanted
        characters.
        """
        m = _TAG_RE.match(self.s, self.start_pos + 1)
        if not m:
            raise RuntimeError("Couldn't find HTML tag in %r" %
                               self.WholeTagString())
        yield h8_tag_id.TagName, m.start(1), m.end(1)

        pos = m.end(0)
        #log('POS %d', pos)

        while True:
            # don't search past the end
            m = _ATTR_RE.match(self.s, pos, self.end_pos)
            if not m:
                #log('BREAK pos %d', pos)
                break
            #log('AttrName %r', m.group(1))

            yield h8_tag_id.AttrName, m.start(1), m.end(1)

            #log('m.groups() %r', m.groups())
            if m.group(2) is not None:
                # double quoted
                yield h8_tag_id.QuotedValue, m.start(2), m.end(2)
            elif m.group(3) is not None:
                # single quoted - TODO: could have different token types
                yield h8_tag_id.QuotedValue, m.start(3), m.end(3)
            elif m.group(4) is not None:
                yield h8_tag_id.UnquotedValue, m.start(4), m.end(4)
            else:
                # <button disabled>
                end = m.end(0)
                yield h8_tag_id.MissingValue, end, end

            # Skip past the closing quote (or the end of the match)
            pos = m.end(0)

        #log('TOK %r', self.s)

        m = _TAG_LAST_RE.match(self.s, pos)
        #log('_TAG_LAST_RE match %r', self.s[pos:])
        if not m:
            raise LexError('Extra data at end of tag', self.s, pos)