# OILS / data_lang / htm8.py
"""data_lang/htm8.py

TODO

API:
- Get rid of AttrValueLexer - this should be in the TagLexer
  - this also means that unquoted values can be more similar
  - We can use a single lexer mode for everything inside <>
    - the SPACE is the only difference
- Deprecate tag_lexer.GetTagName() in favor of lx.CanonicalTagName() or
  _LiteralTagName()
- UTF-8 check, like JSON8
- re2c
  - port lexer, which will fix static typing issues
  - the abstraction needs to support submatch?
    - for finding the end of a tag, etc.?

- LexError and ParseError need details
  - harmonize with data_lang/j8.py, which uses error.Decode(msg, ...,
    cur_line_num)

- Copy all errors into doc/ref/chap-errors.md
  - This helps understand the language

- Update doc/htm8.md
- list of Algorithms:
  - lex just the top level
  - lex both levels
    - and match tags - this is the level for value.Htm8Frag?
  - convert to XML!
  - lazy selection by tag, or attr (id= and class=)
  - lazy selection by CSS selector expression
  - convert to DOMTree
  - sed-like replacement of DOM Tree or element
  - untrusted HTML filter, e.g. like StackOverflow / Reddit
    - this is Safe HTM8
    - should have a zero alloc way to support this, with good errors?
      - I think most of them silently strip data
"""
40
41import re
42
43from typing import Dict, List, Tuple, Optional, IO, Iterator, Any
44
45from _devbuild.gen.htm8_asdl import (h8_id, h8_id_t, h8_tag_id, h8_tag_id_t,
46 h8_tag_id_str, attr_name_t, attr_value_t)
47from doctools.util import log
48
49
class LexError(Exception):
    """Raised when the lexer can't recognize the input.

    Examples of lex errors:

    - h8_id.Invalid, like <> or &&
    - Unclosed <!-- <? <![CDATA[ <script> <style>
    """

    def __init__(self, s, start_pos):
        # type: (str, int) -> None
        self.s = s  # the whole input string
        self.start_pos = start_pos  # offset where the error was detected

    def __str__(self):
        # type: () -> str
        # Show up to 20 chars of context starting at the error position.
        snippet = self.s[self.start_pos:self.start_pos + 20]
        return '(LexError %r)' % snippet
66
67
68def _FindLineNum(s, error_pos):
69 # type: (str, int) -> int
70 current_pos = 0
71 line_num = 1
72 while True:
73 newline_pos = s.find('\n', current_pos)
74 #log('current = %d, N %d, line %d', current_pos, newline_pos, line_num)
75
76 if newline_pos == -1: # this is the last line
77 return line_num
78 if newline_pos >= error_pos:
79 return line_num
80 line_num += 1
81 current_pos = newline_pos + 1
82
83
class ParseError(Exception):
    """Raised on structural errors, above the lexical level.

    Examples of parse errors:

    - unbalanced tag structure
    - ul_table.py errors
    """

    def __init__(self, msg, s=None, start_pos=-1):
        # type: (str, Optional[str], int) -> None
        self.msg = msg
        self.s = s  # optional: the input, for context in the message
        self.start_pos = start_pos  # required when s is given

    def __str__(self):
        # type: () -> str
        if self.s is None:
            # No input context available
            line_num = -1
            snippet = ''
        else:
            assert self.start_pos != -1, self.start_pos
            snippet = self.s[self.start_pos:self.start_pos + 20]
            line_num = _FindLineNum(self.s, self.start_pos)
        return 'line %d: %r %r' % (line_num, self.msg, snippet)
110
111
class Output(object):
    """Streams portions of an input buffer to an output file.

    Maintains a position in the input buffer.  You can either copy text FROM
    the input buffer, or print brand-new text to the output.
    """

    def __init__(self, s, f, left_pos=0, right_pos=-1):
        # type: (str, IO[str], int, int) -> None
        self.s = s
        self.f = f
        self.pos = left_pos
        # right_pos == -1 means "until the end of the string"
        if right_pos == -1:
            self.right_pos = len(s)
        else:
            self.right_pos = right_pos

    def SkipTo(self, pos):
        # type: (int) -> None
        """Advance the input position without printing anything."""
        self.pos = pos

    def PrintUntil(self, pos):
        # type: (int) -> None
        """Copy input from the current position up to pos, then advance."""
        self.f.write(self.s[self.pos:pos])
        self.pos = pos

    def PrintTheRest(self):
        # type: () -> None
        """Copy the remaining input, up to right_pos."""
        self.PrintUntil(self.right_pos)

    def Print(self, s):
        # type: (str) -> None
        """Write new text to the output; the input position is unchanged."""
        self.f.write(s)
147
148
def MakeLexer(rules):
    """Compile (pattern, token_id) rules; patterns use re.VERBOSE syntax."""
    compiled = []
    for pat, tok_id in rules:
        compiled.append((re.compile(pat, re.VERBOSE), tok_id))
    return compiled
151
152
153#
154# Eggex
155#
156# Tag = / ~['>']+ /
157
158# Is this valid? A single character?
159# Tag = / ~'>'* /
160
161# Maybe better: / [NOT '>']+/
162# capital letters not allowed there?
163#
164# But then this is confusing:
165# / [NOT ~digit]+/
166#
167# / [NOT digit] / is [^\d]
168# / ~digit / is \D
169#
170# Or maybe:
171#
172# / [~ digit]+ /
173# / [~ '>']+ /
174# / [NOT '>']+ /
175
176# End = / '</' Tag '>' /
177# StartEnd = / '<' Tag '/>' /
178# Start = / '<' Tag '>' /
179#
180# EntityRef = / '&' dot{* N} ';' /
181
182# Tag name, or attribute name
183# colon is used in XML
184
185# https://www.w3.org/TR/xml/#NT-Name
186# Hm there is a lot of unicode stuff. We are simplifying parsing
187
188_NAME = r'[a-zA-Z][a-zA-Z0-9:_\-]*' # must start with letter
189
190CHAR_LEX = [
191 # Characters
192 # https://www.w3.org/TR/xml/#sec-references
193 (r'&\# [0-9]+ ;', h8_id.DecChar),
194 (r'&\# x[0-9a-fA-F]+ ;', h8_id.HexChar),
195 (r'& %s ;' % _NAME, h8_id.CharEntity),
196 # Allow unquoted, and quoted
197 (r'&', h8_id.BadAmpersand),
198]
199
200HTM8_LEX = CHAR_LEX + [
201 # TODO: CommentBegin, ProcessingBegin, CDataBegin could have an additional
202 # action associated with them? The ending substring
203 (r'<!--', h8_id.CommentBegin),
204
205 # Processing instruction are used for the XML header:
206 # <?xml version="1.0" encoding="UTF-8"?>
207 # They are technically XML-only, but in HTML5, they are another kind of
208 # comment:
209 #
210 # https://developer.mozilla.org/en-US/docs/Web/API/ProcessingInstruction
211 #
212 (r'<\?', h8_id.ProcessingBegin),
213 # Not necessary in HTML5, but occurs in XML
214 (r'<!\[CDATA\[', h8_id.CDataBegin), # <![CDATA[
215
216 # Markup declarations
217 # - In HTML5, there is only <!DOCTYPE html>
218 # - XML has 4 more declarations: <!ELEMENT ...> ATTLIST ENTITY NOTATION
219 # - these seem to be part of DTD
220 # - it's useful to skip these, and be able to parse the rest of the document
221 # - Note: < is allowed?
222 (r'<! [^>\x00]+ >', h8_id.Decl),
223
224 # Tags
225 # Notes:
226 # - We look for a valid tag name, but we don't validate attributes.
227 # That's done in the tag lexer.
228 # - We don't allow leading whitespace
229 (r'</ (%s) >' % _NAME, h8_id.EndTag),
230 # self-closing <br/> comes before StartTag
231 # could/should these be collapsed into one rule?
232 (r'< (%s) [^>\x00]* />' % _NAME, h8_id.StartEndTag), # end </a>
233 (r'< (%s) [^>\x00]* >' % _NAME, h8_id.StartTag), # start <a>
234
235 # HTML5 allows unescaped > in raw data, but < is not allowed.
236 # https://stackoverflow.com/questions/10462348/right-angle-bracket-in-html
237 #
238 # - My early blog has THREE errors when disallowing >
239 # - So do some .wwz files
240 (r'[^&<>\x00]+', h8_id.RawData),
241 (r'>', h8_id.BadGreaterThan),
242 # < is an error
243 (r'.', h8_id.Invalid),
244]
245
246# Old notes:
247#
248# Non-greedy matches are regular and can be matched in linear time
249# with RE2.
250#
251# https://news.ycombinator.com/item?id=27099798
252#
253
254# This person tried to do it with a regex:
255#
256# https://skeptric.com/html-comment-regexp/index.html
257
258# . is any char except newline
259# https://re2c.org/manual/manual_c.html
260
261# Discarded options
262#(r'<!-- .*? -->', h8_id.Comment),
263
264# Hack from Claude: \s\S instead of re.DOTALL. I don't like this
265#(r'<!-- [\s\S]*? -->', h8_id.Comment),
266#(r'<!-- (?:.|[\n])*? -->', h8_id.Comment),
267
268HTM8_LEX_COMPILED = MakeLexer(HTM8_LEX)
269
270
class Lexer(object):
    """Top-level HTM8 lexer: yields (h8_id, end_pos) tokens over a string.

    Lexes s[left_pos : right_pos].  After a <script> or <style> StartTag,
    the lexer enters a raw-search mode that consumes everything up to the
    literal matching end tag, unless no_special_tags is set (XML mode).
    """

    def __init__(self, s, left_pos=0, right_pos=-1, no_special_tags=False):
        # type: (str, int, int, bool) -> None
        self.s = s
        self.pos = left_pos
        # -1 means "until the end of the string"
        self.right_pos = len(s) if right_pos == -1 else right_pos
        self.no_special_tags = no_special_tags

        # string -> compiled regex pattern object
        self.cache = {}  # type: Dict[str, Any]

        # either </script> or </style> - we search until we see that
        self.search_state = None  # type: Optional[str]

        # Position of tag name, if applicable
        # - Set after you get a StartTag, EndTag, or StartEndTag
        # - Unset on other tags
        self.tag_pos_left = -1
        self.tag_pos_right = -1

    def _Read(self):
        # type: () -> Tuple[h8_id_t, int]
        """Match one token at self.pos and return (token id, end position).

        Does NOT advance self.pos; that's done by Read().

        Raises:
          LexError: on unterminated <!--, <?, <![CDATA[, <script>, <style>
        """
        if self.pos == self.right_pos:
            return h8_id.EndOfStream, self.pos

        assert self.pos < self.right_pos, self.pos

        if self.search_state is not None and not self.no_special_tags:
            # TODO: case-insensitive search for </SCRIPT> <SCRipt> ?
            #
            # Another strategy: enter a mode where we find ONLY the end tag
            # regex, and any data that's not <, and then check the canonical
            # tag name for 'script' or 'style'.
            pos = self.s.find(self.search_state, self.pos)
            if pos == -1:
                # unterminated <script> or <style>
                raise LexError(self.s, self.pos)
            self.search_state = None
            # 'pos' is the beginning of the end tag, which is lexed on the
            # NEXT call; this token covers only the raw data before it.
            return h8_id.HtmlCData, pos

        # Find the first match.
        # Note: frontend/match.py uses _LongestMatch(), which is different!
        # TODO: reconcile them.  This lexer should be expressible in re2c.

        for pat, tok_id in HTM8_LEX_COMPILED:
            m = pat.match(self.s, self.pos)
            if m:
                if tok_id in (h8_id.StartTag, h8_id.EndTag, h8_id.StartEndTag):
                    # Remember where the tag name is, for the TagName*
                    # methods below.
                    self.tag_pos_left = m.start(1)
                    self.tag_pos_right = m.end(1)
                else:
                    # Reset state
                    self.tag_pos_left = -1
                    self.tag_pos_right = -1

                if tok_id == h8_id.CommentBegin:
                    pos = self.s.find('-->', self.pos)
                    if pos == -1:
                        # unterminated <!--
                        raise LexError(self.s, self.pos)
                    return h8_id.Comment, pos + 3  # -->

                if tok_id == h8_id.ProcessingBegin:
                    pos = self.s.find('?>', self.pos)
                    if pos == -1:
                        # unterminated <?
                        raise LexError(self.s, self.pos)
                    return h8_id.Processing, pos + 2  # ?>

                if tok_id == h8_id.CDataBegin:
                    pos = self.s.find(']]>', self.pos)
                    if pos == -1:
                        # unterminated <![CDATA[
                        raise LexError(self.s, self.pos)
                    return h8_id.CData, pos + 3  # ]]>

                if tok_id == h8_id.StartTag:
                    # TODO: reduce allocations
                    if (self.TagNameEquals('script') or
                            self.TagNameEquals('style')):
                        # <SCRipt a=b> -> </SCRipt>
                        self.search_state = '</' + self._LiteralTagName() + '>'

                return tok_id, m.end()
        else:
            # The final (r'.', h8_id.Invalid) rule matches any single char,
            # so falling off the loop should be impossible.
            raise AssertionError('h8_id.Invalid rule should have matched')

    def TagNameEquals(self, expected):
        # type: (str) -> bool
        """Case-insensitively compare the current tag name against expected.

        Only valid after a StartTag / EndTag / StartEndTag token.
        """
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        # TODO: In C++, this does not need an allocation.  Can we test
        # directly?
        return expected == self.CanonicalTagName()

    def _LiteralTagName(self):
        # type: () -> str
        """Return the tag name exactly as written in the input (case kept)."""
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        return self.s[self.tag_pos_left:self.tag_pos_right]

    def CanonicalTagName(self):
        # type: () -> str
        """Return the lower-cased tag name of the current tag token."""
        tag_name = self._LiteralTagName()
        # Most tags are already lower case, so avoid allocation with this
        # conditional
        # TODO: this could go in the mycpp runtime?
        if tag_name.islower():
            return tag_name
        else:
            return tag_name.lower()

    def Read(self):
        # type: () -> Tuple[h8_id_t, int]
        """Return the next (token id, end position) and advance the lexer."""
        tok_id, end_pos = self._Read()
        self.pos = end_pos  # advance
        return tok_id, end_pos

    def LookAhead(self, regex):
        # type: (str) -> bool
        """Test whether `regex` matches at the current position.

        Currently used for ul_table.py.  But taking a dynamic regex string is
        not the right interface.
        """
        # Cache the regex compilation.  This could also be LookAheadFor(THEAD)
        # or something.
        pat = self.cache.get(regex)
        if pat is None:
            pat = re.compile(regex)
            self.cache[regex] = pat

        m = pat.match(self.s, self.pos)
        return m is not None
407
408
class AttrLexer(object):
    """Lexer for the attribute portion of a tag (work in progress).

    We can also invert this

      Unquoted   (List[h8_id] tok_ids, List[int] end_pos)

    It would be nice to have a special case for the singleton, since that is
    very common.

      Simple (int tag_name_start, int tag_name_end, int attr_value_tag,
              int value_start, int value_end)
        This would cover many cases

    The other option is to create many different events, and have
    AttrValueLexer.  But I think that is annoying and overly detailed.

    Operations:
    - GetAttrRaw('foo')
    - AllAttrsRaw()
    - AllAttrsRawSlice()

    class= query - well we can do this with Space tokens I think - we should
      have an optimization
    id= query - ditto, we should just have a predicate

    Zero allocs:
      tag query - TagNameEquals()

    So I guess we have to write a HasClass('foo') and IdEquals('bar') on top
    of this.  Yes.

      tag_lx.Reset2(...)      # we should pass it the tag_name_end position
      tag_lx.Read() -> bool   # success or fail?  Or Attr or Invalid
        .AttrNameEquals('foo') -> bool  # id and class query
        .GetAttrName() -> str           # for getting them all
        .GetRawValue() -> Tuple[h8_tag_id, start, end]  # just beginning and end
        .GetValueTokens() -> Tuple[h8_tag_id, TokenList]
           .TokenList = Tuple[List[h8_id], List[int end_pos]

    You could also have

      tag_lx.GetValueTokenId() -> Tuple[h8_id, end_pos]

    And then you read it until it's " or ' or space?  We probably won't have
    that use case to start.
    """

    def __init__(self, s):
        # type: (str) -> None
        self.s = s
        self.tag_name_pos = -1  # Invalid until Init() is called
        # Fix: was previously initialized as self.tag_end_pos, but Init()
        # assigns self.end_pos - use one consistent attribute name so the
        # "uninitialized" sentinel actually applies to the field that's read.
        self.end_pos = -1

    def Init(self, tag_name_pos, end_pos):
        # type: (int, int) -> None
        """Initialize so we can read names and values.

        Example:
          'x <a y>'  # tag_name_pos=4, end_pos=6
          'x <a>'    # tag_name_pos=4, end_pos=4

        The Init() method is used to reuse instances of the AttrLexer object.
        """
        assert tag_name_pos >= 0, tag_name_pos
        assert end_pos >= 0, end_pos

        self.tag_name_pos = tag_name_pos
        self.end_pos = end_pos

    def ReadName(self):
        # type: () -> Tuple[attr_name_t, int, int]
        """Read the attribute name.  (Not implemented yet.)

        EOF case:
          <a>
          <a >

        Error case:
          <a !>
          <a foo=bar !>
        """
        pass

    def AttrNameEquals(self, s):
        # type: (str) -> bool
        """Compare the attribute name just read.  (Not implemented yet.)

        TODO: Must call this after ReadName() ?
        Because that can FAIL.
        """
        pass

    def ReadRawValue(self):
        # type: () -> Tuple[attr_value_t, int, int]
        """Read the attribute value.  (Not implemented yet.)

        In general, it is escaped or "raw"

        Note: Assuming ReadName() returned a value, this should NOT fail.
        """
        # NOTE: if = is not found, set state

        pass

    def SkipValue(self):
        # type: () -> None
        """Skip over the attribute value without using it."""
        # Just ignore it and return
        self.ReadRawValue()

    def ReadValueAndDecode(self):
        # type: () -> str
        """Read the attribute value and decode entities.  (Not implemented.)"""
        # TODO: tokenize it
        pass
523
524
# Tag names:
#   Match <a  or </a
#   Match <h2, but not <2h
#
# HTML 5 doesn't restrict tag names at all
#   https://html.spec.whatwg.org/#toc-syntax
#
# XML allows : - .
#   https://www.w3.org/TR/xml/#NT-NameChar

# Namespaces for MathML, SVG
# XLink, XML, XMLNS
#
# https://infra.spec.whatwg.org/#namespaces
#
# Allow - for td-attrs

# Be very lenient - just no whitespace or special HTML chars
# I don't think this is more lenient than HTML5, though we should check.
_UNQUOTED_VALUE = r'''[^ \t\r\n<>&"'\x00]*'''

# TODO: we don't need to capture the tag name here?  That's done at the top
# level
_TAG_RE = re.compile(r'/? \s* (%s)' % _NAME, re.VERBOSE)

# Matches the end of a tag: optional self-closing slash, then >
_TAG_LAST_RE = re.compile(r'\s* /? >', re.VERBOSE)

# To match href="foo"
# Note: in HTML5 and XML, single quoted attributes are also valid

# <button disabled> is standard usage

# NOTE: This used to allow whitespace around =
# <a foo = "bar"> makes sense in XML
# But then you also have
# <a foo= bar> - which is TWO attributes, in HTML5
# So the space is problematic

_ATTR_RE = re.compile(
    r'''
\s+ # Leading whitespace is required
(%s) # Attribute name
(?: # Optional attribute value
 \s* = \s* # Spaces allowed around =
 (?:
 " ([^>"\x00]*) " # double quoted value
 | ' ([^>'\x00]*) ' # single quoted value
 | (%s) # Attribute value
 )
)?
''' % (_NAME, _UNQUOTED_VALUE), re.VERBOSE)
576
577
class TagLexer(object):
    """
    Given a tag like <a href="..."> or <link type="..." />, the TagLexer
    provides a few operations:

    - What is the tag?
    - Iterate through the attributes, giving (name, value_start_pos,
      value_end_pos)
    """

    def __init__(self, s):
        # type: (str) -> None
        self.s = s
        self.start_pos = -1  # Invalid until Reset() is called
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        # type: (int, int) -> None
        """Reuse instances of this object."""
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos

        self.start_pos = start_pos
        self.end_pos = end_pos

    def WholeTagString(self):
        # type: () -> str
        """Return the entire tag string, e.g. <a href='foo'>"""
        return self.s[self.start_pos:self.end_pos]

    def GetTagName(self):
        # type: () -> str
        """Return the tag name, e.g. 'a' for <a href='foo'>."""
        # First event
        tok_id, start, end = next(self.Tokens())
        return self.s[start:end]

    def GetSpanForAttrValue(self, attr_name):
        # type: (str) -> Tuple[int, int]
        """Return the (start, end) span of attr_name's value, or (-1, -1).

        Used by oils_doc.py, for href shortcuts.
        """
        # Algorithm: search for QuotedValue or UnquotedValue after AttrName
        # TODO: Could also cache these

        events = self.Tokens()
        val = (-1, -1)
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == h8_tag_id.AttrName:
                    name = self.s[start:end]
                    if name == attr_name:
                        # The value should come next
                        tok_id, start, end = next(events)
                        assert tok_id in (
                            h8_tag_id.QuotedValue, h8_tag_id.UnquotedValue,
                            h8_tag_id.MissingValue), h8_tag_id_str(tok_id)
                        val = start, end
                        break

        except StopIteration:
            pass
        return val

    def GetAttrRaw(self, attr_name):
        # type: (str) -> Optional[str]
        """
        Return the value, which may be UNESCAPED.  None if not present.
        """
        start, end = self.GetSpanForAttrValue(attr_name)
        if start == -1:
            return None
        return self.s[start:end]

    def AllAttrsRawSlice(self):
        # type: () -> List[Tuple[str, int, int]]
        """
        Get a list of pairs [('class', 3, 5), ('href', 9, 12)]
        """
        slices = []
        events = self.Tokens()
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == h8_tag_id.AttrName:
                    name = self.s[start:end]

                    # The value should come next
                    tok_id, start, end = next(events)
                    assert tok_id in (
                        h8_tag_id.QuotedValue, h8_tag_id.UnquotedValue,
                        h8_tag_id.MissingValue), h8_tag_id_str(tok_id)
                    # Note: quoted values may have &amp;
                    # We would need ANOTHER lexer to unescape them, but we
                    # don't need that for ul-table
                    slices.append((name, start, end))
        except StopIteration:
            pass
        return slices

    def AllAttrsRaw(self):
        # type: () -> List[Tuple[str, str]]
        """
        Get a list of pairs [('class', 'foo'), ('href', '?foo=1&amp;bar=2')]

        The quoted values may be escaped.  We would need another lexer to
        unescape them.
        """
        slices = self.AllAttrsRawSlice()
        pairs = []
        for name, start, end in slices:
            pairs.append((name, self.s[start:end]))
        return pairs

    def Tokens(self):
        # type: () -> Iterator[Tuple[h8_tag_id_t, int, int]]
        """
        Yields a sequence of tokens: Tag (AttrName AttrValue?)*

        Where each Token is (Type, start_pos, end_pos)

        Note that start and end are NOT redundant!  We skip over some unwanted
        characters.

        Raises:
          RuntimeError: if no tag name is found after the opening <
          LexError: on extra data at the end of the tag
        """
        # start_pos + 1 skips over the opening <
        m = _TAG_RE.match(self.s, self.start_pos + 1)
        if not m:
            raise RuntimeError("Couldn't find HTML tag in %r" %
                               self.WholeTagString())
        yield h8_tag_id.TagName, m.start(1), m.end(1)

        pos = m.end(0)
        #log('POS %d', pos)

        while True:
            # don't search past the end
            m = _ATTR_RE.match(self.s, pos, self.end_pos)
            if not m:
                #log('BREAK pos %d', pos)
                break
            #log('AttrName %r', m.group(1))

            yield h8_tag_id.AttrName, m.start(1), m.end(1)

            # Exactly one of groups 2/3/4 is set when a value is present.
            #log('m.groups() %r', m.groups())
            if m.group(2) is not None:
                # double quoted
                yield h8_tag_id.QuotedValue, m.start(2), m.end(2)
            elif m.group(3) is not None:
                # single quoted - TODO: could have different token types
                yield h8_tag_id.QuotedValue, m.start(3), m.end(3)
            elif m.group(4) is not None:
                yield h8_tag_id.UnquotedValue, m.start(4), m.end(4)
            else:
                # <button disabled>
                end = m.end(0)
                yield h8_tag_id.MissingValue, end, end

            # Skip past the "
            pos = m.end(0)

        #log('TOK %r', self.s)

        m = _TAG_LAST_RE.match(self.s, pos)
        #log('_TAG_LAST_RE match %r', self.s[pos:])
        if not m:
            # Extra data at end of tag.  TODO: add messages for all these.
            raise LexError(self.s, pos)
744
745
# This is similar but not identical to
#    " ([^>"\x00]*) "    # double quoted value
#  | ' ([^>'\x00]*) '    # single quoted value
#
# Note: for unquoted values, & isn't allowed, and thus &amp; and &#99; and
# &#x99; are not allowed.  We could relax that?
ATTR_VALUE_LEX = CHAR_LEX + [
    (r'[^>&\x00]+', h8_id.RawData),
    # catch-all: any other single char is invalid
    (r'.', h8_id.Invalid),
]

ATTR_VALUE_LEX_COMPILED = MakeLexer(ATTR_VALUE_LEX)
758
759
class AttrValueLexer(object):
    """Lexes a single attribute value into h8_id tokens.

    Handles all three value forms:

      <a href="foo=99&amp;bar">
      <a href='foo=99&amp;bar'>
      <a href=unquoted>
    """

    def __init__(self, s):
        # type: (str) -> None
        self.s = s
        self.start_pos = -1  # Invalid until Reset() is called
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        # type: (int, int) -> None
        """Reuse instances of this object."""
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos

        self.start_pos = start_pos
        self.end_pos = end_pos

    def NumTokens(self):
        # type: () -> int
        """Count the tokens in the value.

        Raises:
          LexError: if an h8_id.Invalid token is encountered.
        """
        num_tokens = 0
        pos = self.start_pos
        for tok_id, end_pos in self.Tokens():
            if tok_id == h8_id.Invalid:
                raise LexError(self.s, pos)
            pos = end_pos
            #log('pos %d', pos)
            num_tokens += 1
        return num_tokens

    def Tokens(self):
        # type: () -> Iterator[Tuple[h8_id_t, int]]
        """Yield (h8_id, end_pos) tokens over s[start_pos : end_pos]."""
        pos = self.start_pos
        while pos < self.end_pos:
            # Find the first match, like above.
            # Note: frontend/match.py uses _LongestMatch(), which is different!
            # TODO: reconcile them.  This lexer should be expressible in re2c.
            for pat, tok_id in ATTR_VALUE_LEX_COMPILED:
                m = pat.match(self.s, pos)
                if m:
                    if 0:
                        tok_str = m.group(0)
                        log('token = %r', tok_str)

                    end_pos = m.end(0)
                    yield tok_id, end_pos
                    pos = end_pos
                    break
            else:
                # The final (r'.', h8_id.Invalid) rule matches any single
                # char, so falling off the loop should be impossible.
                raise AssertionError('h8_id.Invalid rule should have matched')