OILS / data_lang / htm8.py View on Github | oils.pub

754 lines, 356 significant
1"""data_lang/htm8.py
2
TODO

- would be nice: migrate everything off of TagLexer()
  - oils_doc.py and help_gen.py
  - this old API is stateful and uses Python iterators, which is problematic
  - maybe we can use a better CSS selector abstraction

API:
- Get rid of Reset()?

Features:

- work on ToXml() test cases?  This is another test of AttrLexer

Docs:

- Copy all errors into doc/ref/chap-errors.md
  - This helps understand the language

C++:
- UTF-8 check, like JSON8
- re2c
  - port lexer, which will fix static typing issues
  - the abstraction needs to support submatch?
    - for finding the end of a tag, etc.?
    - and what about no match?

- harmonize LexError and ParseError with data_lang/j8.py, which uses
  error.Decode(msg, ..., cur_line_num)
32"""
33
34import re
35
36from typing import Dict, List, Tuple, Optional, IO, Any
37
38from _devbuild.gen.htm8_asdl import (h8_id, h8_id_t, attr_name, attr_name_t,
39 attr_name_str, attr_value_e, attr_value_t,
40 h8_val_id)
41from doctools.util import log
42
43
class LexError(Exception):
    """Raised when the input cannot be split into valid tokens.

    Examples of lex errors:

    - h8_id.Invalid, like <> or &&
    - Unclosed <!-- <? <![CDATA[ <script> <style>

    Attributes:
      msg: description of the problem
      code_str: the complete string that was being lexed
      start_pos: offset into code_str where the problem was detected
    """

    def __init__(self, msg, code_str, start_pos):
        # type: (str, str, int) -> None
        self.msg, self.code_str, self.start_pos = msg, code_str, start_pos

    def __str__(self):
        # type: () -> str
        # Show at most 20 characters of input, starting at the error.
        begin = self.start_pos
        snippet = self.code_str[begin:begin + 20]
        return '(LexError %r %r)' % (self.msg, snippet)
62
63
64def _FindLineNum(s, error_pos):
65 # type: (str, int) -> int
66 current_pos = 0
67 line_num = 1
68 while True:
69 newline_pos = s.find('\n', current_pos)
70 #log('current = %d, N %d, line %d', current_pos, newline_pos, line_num)
71
72 if newline_pos == -1: # this is the last line
73 return line_num
74 if newline_pos >= error_pos:
75 return line_num
76 line_num += 1
77 current_pos = newline_pos + 1
78
79
class ParseError(Exception):
    """Raised when tokens are valid but their arrangement is not.

    Examples of parse errors

    - unbalanced tag structure
    - ul_table.py errors

    Attributes:
      msg: description of the problem
      s: the string being parsed, or None when no location is available
      start_pos: offset into s where the problem was detected; it must not be
        -1 when s is given
    """

    def __init__(self, msg, s=None, start_pos=-1):
        # type: (str, Optional[str], int) -> None
        self.msg = msg
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        # type: () -> str
        if self.s is None:
            # No source text: there is no line number or snippet to show.
            return 'line %d: %r %r' % (-1, self.msg, '')

        assert self.start_pos != -1, self.start_pos
        begin = self.start_pos
        # Show at most 20 characters of input, starting at the error.
        excerpt = self.s[begin:begin + 20]
        return 'line %d: %r %r' % (_FindLineNum(self.s, begin), self.msg,
                                   excerpt)
106
107
class Output(object):
    """Writer for a sed-like "replacement" model.

    Wraps an input string and an output file, and keeps a cursor into the
    input.  Callers either copy text FROM the input up to some position, or
    write brand new text, or move the cursor to skip input.
    """

    def __init__(self, s, f, left_pos=0, right_pos=-1):
        # type: (str, IO[str], int, int) -> None
        self.s = s
        self.f = f
        self.pos = left_pos
        # -1 means "through the end of the input"
        if right_pos == -1:
            right_pos = len(s)
        self.right_pos = right_pos

    def SkipTo(self, pos):
        # type: (int) -> None
        """Move the cursor to pos without writing anything."""
        self.pos = pos

    def PrintUntil(self, pos):
        # type: (int) -> None
        """Copy input from the cursor up to pos, then leave the cursor at pos."""
        self.f.write(self.s[self.pos:pos])
        self.pos = pos

    def PrintTheRest(self):
        # type: () -> None
        """Copy input from the cursor up to right_pos."""
        self.PrintUntil(self.right_pos)

    def Print(self, s):
        # type: (str) -> None
        """Write new text to the output file; the cursor is not moved."""
        self.f.write(s)
145
146
def MakeLexer(rules):
    """Compile a list of (pattern string, token id) rules.

    Every pattern is compiled with re.VERBOSE, so unescaped whitespace inside
    a pattern is ignored and a literal # has to be escaped.

    Returns a list of (compiled pattern, token id) pairs, in the same order as
    the rules.
    """
    compiled = []
    for pattern, tok_id in rules:
        compiled.append((re.compile(pattern, re.VERBOSE), tok_id))
    return compiled
149
150
151#
152# Lexers
153#
154
# Shared name pattern: it is substituted into the entity rule below, the tag
# rules, and the attribute name rule.
_NAME_RE = r'[a-zA-Z][a-zA-Z0-9:_\-]*'  # must start with letter

# Rules for character references.  These are a prefix of both the top-level
# rules and the quoted attribute value rules.
#
# The patterns are compiled with re.VERBOSE by MakeLexer(), so the spaces in
# them are not significant, and # must be written as \#.
CHAR_LEX = [
    # Characters
    # https://www.w3.org/TR/xml/#sec-references
    (r'&\# [0-9]+ ;', h8_id.DecChar),
    (r'&\# x[0-9a-fA-F]+ ;', h8_id.HexChar),
    # TODO: shouldn't use _NAME_RE?  Just letters
    (r'& %s ;' % _NAME_RE, h8_id.CharEntity),
    # Allow unquoted, and quoted
    # This must come last: it matches any & that the rules above rejected.
    (r'&', h8_id.BadAmpersand),
]
167
# Rules for the top-level lexer.  Lexer._Read() tries them in order and takes
# the FIRST pattern that matches, so more specific rules must come before more
# general ones.
HTM8_LEX = CHAR_LEX + [
    # TODO: CommentBegin, ProcessingBegin, CDataBegin could have an additional
    # action associated with them?  The ending substring
    #
    # For now, Lexer._Read() special-cases these three: it searches for the
    # closing delimiter and returns Comment / Processing / CData instead.
    (r'<!--', h8_id.CommentBegin),

    # Processing instruction are used for the XML header:
    # <?xml version="1.0" encoding="UTF-8"?>
    # They are technically XML-only, but in HTML5, they are another kind of
    # comment:
    #
    #   https://developer.mozilla.org/en-US/docs/Web/API/ProcessingInstruction
    #
    (r'<\?', h8_id.ProcessingBegin),
    # Not necessary in HTML5, but occurs in XML
    (r'<!\[CDATA\[', h8_id.CDataBegin),  # <![CDATA[

    # Markup declarations
    # - In HTML5, there is only <!DOCTYPE html>
    # - XML has 4 more declarations: <!ELEMENT ...> ATTLIST ENTITY NOTATION
    #   - these seem to be part of DTD
    #   - it's useful to skip these, and be able to parse the rest of the document
    # - Note: < is allowed?
    (r'<! [^>\x00]+ >', h8_id.Decl),

    # Tags
    # Notes:
    # - We look for a valid tag name, but we don't validate attributes.
    #   That's done in the tag lexer.
    # - We don't allow leading whitespace
    # - Group 1 is the tag name; Lexer._Read() records its position.
    (r'</ (%s) >' % _NAME_RE, h8_id.EndTag),
    # self-closing <br/> comes before StartTag
    # could/should these be collapsed into one rule?
    (r'< (%s) [^>\x00]* />' % _NAME_RE, h8_id.StartEndTag),  # self-closing <br/>
    (r'< (%s) [^>\x00]* >' % _NAME_RE, h8_id.StartTag),  # start <a>

    # HTML5 allows unescaped > in raw data, but < is not allowed.
    # https://stackoverflow.com/questions/10462348/right-angle-bracket-in-html
    #
    # - My early blog has THREE errors when disallowing >
    # - So do some .wwz files
    (r'[^&<>\x00]+', h8_id.RawData),
    (r'>', h8_id.BadGreaterThan),
    # NUL is the end, an accommodation for re2c.  Like we do in frontend/match.
    (r'\x00', h8_id.EndOfStream),
    # This includes < - it is not BadLessThan because it's NOT recoverable
    (r'.', h8_id.Invalid),
]

# Old notes:
#
# Non-greedy matches are regular and can be matched in linear time
# with RE2.
#
# https://news.ycombinator.com/item?id=27099798
#

# This person tried to do it with a regex:
#
# https://skeptric.com/html-comment-regexp/index.html

# . is any char except newline
# https://re2c.org/manual/manual_c.html

# Discarded options
#(r'<!-- .*? -->', h8_id.Comment),

# Hack from Claude: \s\S instead of re.DOTALL.  I don't like this
#(r'<!-- [\s\S]*? -->', h8_id.Comment),
#(r'<!-- (?:.|[\n])*? -->', h8_id.Comment),

# (compiled pattern, token id) pairs, in the same order as HTM8_LEX
HTM8_LEX_COMPILED = MakeLexer(HTM8_LEX)
239
240
class Lexer(object):
    """Top-level HTM8 lexer over the window s[left_pos:right_pos].

    Read() returns (token id, end position) pairs.  The start of a token is
    the end position of the previous one.

    After a StartTag whose name is script or style (unless no_special_tags is
    set), the next Read() returns everything up to the matching end tag as a
    single HtmlCData token.

    All matching and searching is bounded by right_pos, so the lexer never
    returns a position past the end of its window.
    """

    def __init__(self, s, left_pos=0, right_pos=-1, no_special_tags=False):
        # type: (str, int, int, bool) -> None
        """
        Args:
          s: the string to lex
          left_pos: where to start lexing
          right_pos: where to stop lexing; -1 means len(s)
          no_special_tags: if True, the contents of <script> and <style> are
            lexed like any other content
        """
        self.s = s
        self.pos = left_pos
        self.right_pos = len(s) if right_pos == -1 else right_pos
        self.no_special_tags = no_special_tags

        # string -> compiled regex pattern object
        self.cache = {}  # type: Dict[str, Any]

        # either </script> or </style> - we search until we see that
        self.search_state = None  # type: Optional[str]

        # Position of tag name, if applicable
        # - Set after you get a StartTag, EndTag, or StartEndTag
        # - Unset on other tags
        self.tag_pos_left = -1
        self.tag_pos_right = -1

    def _EndOfDelimited(self, closing, opening):
        # type: (str, str) -> int
        """Return the position just past the next `closing` delimiter.

        The search starts at the current position and does not look past
        right_pos.

        Raises:
          LexError: if `closing` doesn't occur inside the window.  `opening`
            is only used for the error message.
        """
        pos = self.s.find(closing, self.pos, self.right_pos)
        if pos == -1:
            raise LexError('Unterminated %s' % opening, self.s, self.pos)
        return pos + len(closing)

    def _Read(self):
        # type: () -> Tuple[h8_id_t, int]
        """Return the next (token id, end position) without advancing.

        Raises:
          LexError: for an unterminated comment, processing instruction, CDATA
            section, or <script> / <style> element.
        """
        if self.pos == self.right_pos:
            return h8_id.EndOfStream, self.pos

        assert self.pos < self.right_pos, self.pos

        if self.search_state is not None and not self.no_special_tags:
            # TODO: case-insensitive search for </SCRIPT> <SCRipt> ?
            #
            # Another strategy: enter a mode where we find ONLY the end tag
            # regex, and any data that's not <, and then check the canonical
            # tag name for 'script' or 'style'.
            pos = self.s.find(self.search_state, self.pos, self.right_pos)
            if pos == -1:
                raise LexError('Unterminated <script> or <style>', self.s,
                               self.pos)
            self.search_state = None
            # The token ends at the BEGINNING of the end tag, which is lexed
            # as an EndTag by the next call.
            return h8_id.HtmlCData, pos

        # Find the first match.
        # Note: frontend/match.py uses _LongestMatch(), which is different!
        # TODO: reconcile them.  This lexer should be expressible in re2c.

        for pat, tok_id in HTM8_LEX_COMPILED:
            # Passing right_pos as endpos makes the pattern treat the window
            # end as the end of the string, so a token can't over-read it.
            m = pat.match(self.s, self.pos, self.right_pos)
            if m is None:
                continue

            if tok_id in (h8_id.StartTag, h8_id.EndTag, h8_id.StartEndTag):
                # Group 1 is the tag name
                self.tag_pos_left = m.start(1)
                self.tag_pos_right = m.end(1)
            else:
                # Reset state
                self.tag_pos_left = -1
                self.tag_pos_right = -1

            # The three *Begin rules only match the opening delimiter; the
            # token extends through the closing delimiter.
            if tok_id == h8_id.CommentBegin:
                return h8_id.Comment, self._EndOfDelimited('-->', '<!--')

            if tok_id == h8_id.ProcessingBegin:
                return h8_id.Processing, self._EndOfDelimited('?>', '<?')

            if tok_id == h8_id.CDataBegin:
                return h8_id.CData, self._EndOfDelimited(']]>', '<![CDATA[')

            if tok_id == h8_id.StartTag:
                # TODO: reduce allocations
                if (self.TagNameEquals('script') or
                        self.TagNameEquals('style')):
                    # The end tag must be spelled exactly like the start tag:
                    # <SCRipt a=b> -> </SCRipt>
                    self.search_state = '</' + self._LiteralTagName() + '>'

            return tok_id, m.end()

        raise AssertionError('h8_id.Invalid rule should have matched')

    def TagNamePos(self):
        # type: () -> int
        """The right position of the tag name.

        Only valid after Read() returned StartTag, EndTag, or StartEndTag.
        """
        assert self.tag_pos_right != -1, self.tag_pos_right
        return self.tag_pos_right

    def TagNameEquals(self, expected):
        # type: (str) -> bool
        """Compare the lower-cased tag name to expected.

        Only valid after Read() returned StartTag, EndTag, or StartEndTag.
        """
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        # TODO: In C++, this does not need an allocation.  Can we test
        # directly?
        return expected == self.CanonicalTagName()

    def _LiteralTagName(self):
        # type: () -> str
        """The tag name exactly as it's spelled in the input."""
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        return self.s[self.tag_pos_left:self.tag_pos_right]

    def CanonicalTagName(self):
        # type: () -> str
        """The tag name in lower case."""
        tag_name = self._LiteralTagName()
        # Most tags are already lower case, so avoid allocation with this conditional
        # TODO: this could go in the mycpp runtime?
        if tag_name.islower():
            return tag_name
        else:
            return tag_name.lower()

    def Read(self):
        # type: () -> Tuple[h8_id_t, int]
        """Return the next (token id, end position), and advance past it."""
        tok_id, end_pos = self._Read()
        self.pos = end_pos  # advance
        return tok_id, end_pos

    def LookAhead(self, regex):
        # type: (str) -> bool
        """Does regex match at the current position, inside the window?

        The position is not changed.

        Currently used for ul_table.py.  But taking a dynamic regex string is
        not the right interface.
        """
        # Cache the regex compilation.  This could also be LookAheadFor(THEAD)
        # or something.
        pat = self.cache.get(regex)
        if pat is None:
            pat = re.compile(regex)
            self.cache[regex] = pat

        # Like _Read(), don't look past right_pos
        m = pat.match(self.s, self.pos, self.right_pos)
        return m is not None
382
383
# Rules for AttrLexer.ReadName(), which tries them in order and takes the
# first match.  In the Ok rule, group 1 is the attribute name and group 2 is
# the optional = sign.
A_NAME_LEX = [
    # Leading whitespace is required, to separate attributes.
    #
    # If the = is not present, then we set the lexer in a state for
    # attr_value_e.Missing.
    (r'\s+ (%s) \s* (=)? \s*' % _NAME_RE, attr_name.Ok),
    # unexpected EOF

    # The closing > or /> is treated as end of stream, and it's not an error.
    (r'\s* /? >', attr_name.Done),

    # NUL should not be possible, because the top-level tag rules exclude it
    # from a tag ([^>\x00]*).

    # This includes < - it is not BadLessThan because it's NOT recoverable
    #
    # The catch-all has to match a newline too, and a bare . doesn't (the
    # patterns aren't compiled with re.DOTALL).  Otherwise a tag like
    # '<a\n!>', which the top-level lexer accepts as a StartTag, matches NO
    # rule here, and ReadName() fails an assertion instead of reporting
    # attr_name.Invalid.
    (r'. | \n', attr_name.Invalid),
]

# (compiled pattern, attr_name id) pairs, in the same order as A_NAME_LEX
A_NAME_LEX_COMPILED = MakeLexer(A_NAME_LEX)
402
# Here we just loop on regular tokens
#
# Examples:
# <a href = unquoted&amp;foo >
# <a href = unquoted&foo >  # BadAmpersand is allowed I guess
# <a href ="unquoted&foo" >  # double quoted
# <a href ='unquoted&foo' >  # single quoted
# <a href = what"foo" >  # HTML5 allows this, but we could disallow it if
#                        # it's not common.  It opens up the j"" and $"" extensions
# <a href = what'foo' >  # ditto

# An unquoted value ends at whitespace, < > & or a quote.  Note that / is NOT
# excluded, which is why AttrLexer.ReadValue() has a check for <img src=/>.
_UNQUOTED_VALUE = r'''[^ \t\r\n<>&"'\x00]+'''

# What comes after = ?
#
# AttrLexer.ReadValue() tries these in order.  A quote only OPENS a quoted
# value; the contents are read with QUOTED_VALUE_LEX.
A_VALUE_LEX = [
    (r'"', h8_val_id.DoubleQuote),
    (r"'", h8_val_id.SingleQuote),
    (_UNQUOTED_VALUE, h8_val_id.UnquotedVal),
    # Anything else, e.g. the > in <a foo = >, means the value is empty
    (r'.', h8_val_id.NoMatch),
]

# (compiled pattern, h8_val_id) pairs, in the same order as A_VALUE_LEX
A_VALUE_LEX_COMPILED = MakeLexer(A_VALUE_LEX)
425
# What's inside "" or '' ?
#
# AttrLexer._QuotedRead() tries these in order.  Both quote characters are
# tokens here; ReadValue() decides which one closes the value, and treats the
# other one as ordinary content.
QUOTED_VALUE_LEX = CHAR_LEX + [
    (r'"', h8_id.DoubleQuote),
    (r"'", h8_id.SingleQuote),
    (r'<', h8_id.BadLessThan),  # BadAmpersand is in CharLex

    # TODO: think about whitespace for efficient class= queries?
    #(r'[ \r\n\t]', h8_id.Whitespace),  # terminates unquoted values
    (r'''[^"'<>&\x00]+''', h8_id.RawData),
    # This includes > - it is not BadGreaterThan because it's NOT recoverable
    (r'.', h8_id.Invalid),
]

# (compiled pattern, h8_id) pairs, in the same order as QUOTED_VALUE_LEX
QUOTED_VALUE_LEX_COMPILED = MakeLexer(QUOTED_VALUE_LEX)
440
441
class AttrLexer(object):
    """Reads the attribute names and values of one start tag.

    The API is stateful: call Init() for a tag, then alternate ReadName() and
    ReadValue().

    Typical usage:

        while True:
            n, name_start, name_end, equal_end = attr_lx.ReadName()
            if n == attr_name.Ok:
                if attr_lx.AttrNameEquals('div'):
                    print('div')

                # TODO: also pass Optional[List[]] out_tokens?
                v, start_pos, end_pos = attr_lx.ReadValue()
            else:
                break
    """

    def __init__(self, s):
        # type: (str) -> None
        self.s = s

        self.tok_id = h8_id.Invalid  # Uninitialized
        self.tag_name_pos = -1  # Invalid
        # Not read anywhere in this class; kept in case outside code uses it.
        self.tag_end_pos = -1
        # Set by Init() and Reset(), and checked by ReadValue()
        self.end_pos = -1
        self.must_not_exceed_pos = -1

        self.pos = -1

        self.name_start = -1
        self.name_end = -1
        self.equal_end = -1
        self.next_value_is_missing = False

        self.init_t = -1
        self.init_e = -1

    def Init(self, tok_id, tag_name_pos, end_pos):
        # type: (h8_id_t, int, int) -> None
        """Initialize so we can read names and values.

        Args:
          tok_id: h8_id.StartTag or h8_id.StartEndTag; anything else is an
            AssertionError
          tag_name_pos: position just past the tag name, where reading starts
          end_pos: position just past the closing > of the tag

        Example:
          'x <a y>'  # tag_name_pos=4, end_pos=7
          'x <a>'    # tag_name_pos=4, end_pos=5

        The Init() method is used to reuse instances of the AttrLexer object.
        """
        assert tag_name_pos >= 0, tag_name_pos
        assert end_pos >= 0, end_pos

        #log('TAG NAME POS %d', tag_name_pos)

        self.tok_id = tok_id
        self.tag_name_pos = tag_name_pos
        self.end_pos = end_pos

        # Check for ambiguous <img src=/>
        # An unquoted value must end before the closing delimiter.
        if tok_id == h8_id.StartTag:
            self.must_not_exceed_pos = end_pos - 1  # account for >
        elif tok_id == h8_id.StartEndTag:
            self.must_not_exceed_pos = end_pos - 2  # account for />
        else:
            raise AssertionError(tok_id)

        self.pos = tag_name_pos

        # For Reset()
        self.init_t = tag_name_pos
        self.init_e = end_pos

    def Reset(self):
        # type: () -> None
        """Go back to the position given to the last Init()."""

        # TODO: maybe GetAttrRaw() should call this directly?  But not any of
        # the AllAttrs() methods?
        self.tag_name_pos = self.init_t
        self.end_pos = self.init_e
        self.pos = self.init_t

    def ReadName(self):
        # type: () -> Tuple[attr_name_t, int, int, int]
        """Reads the attribute name

        Returns:
          (attr_name.Ok, name_start, name_end, equal_end) for a name.  Then
            ReadValue() must be called before the next ReadName().
          (attr_name.Done, -1, -1, -1) at the closing > or />
          (attr_name.Invalid, -1, -1, -1) otherwise; the position isn't
            advanced.

        EOF case:
          <a>
          <a >

        Error case:
          <a !>
          <a foo=bar !>
        """
        for pat, a in A_NAME_LEX_COMPILED:
            m = pat.match(self.s, self.pos)
            #log('ReadName() matching %r at %d', self.s, self.pos)
            if m:
                #log('ReadName() tag_name_pos %d pos, %d %s', self.tag_name_pos, self.pos, m.groups())
                if a == attr_name.Invalid:
                    #log('m.groups %s', m.groups())
                    return attr_name.Invalid, -1, -1, -1

                self.pos = m.end(0)  # Advance if it's not invalid

                if a == attr_name.Ok:
                    #log('%r', m.groups())
                    self.name_start = m.start(1)
                    self.name_end = m.end(1)
                    self.equal_end = m.end(0)  # XML conversion needs this
                    # Is the equals sign missing?  Set state.
                    if m.group(2) is None:
                        self.next_value_is_missing = True
                        # HACK: REWIND, since we don't want to consume whitespace
                        self.pos = self.name_end
                    else:
                        self.next_value_is_missing = False
                    return attr_name.Ok, self.name_start, self.name_end, self.equal_end
                else:
                    # Reset state - e.g. you must call AttrNameEquals
                    self.name_start = -1
                    self.name_end = -1

                if a == attr_name.Done:
                    return attr_name.Done, -1, -1, -1
        else:
            context = self.s[self.pos:]
            #log('s %r %d', self.s, self.pos)
            raise AssertionError('h8_id.Invalid rule should have matched %r' %
                                 context)

    def _CanonicalAttrName(self):
        # type: () -> str
        """Return the lower case attribute name.

        Must call after ReadName()
        """
        assert self.name_start >= 0, self.name_start
        assert self.name_end >= 0, self.name_end

        attr_name = self.s[self.name_start:self.name_end]
        # Most names are already lower case; then no new string is needed
        if attr_name.islower():
            return attr_name
        else:
            return attr_name.lower()

    def AttrNameEquals(self, expected):
        # type: (str) -> bool
        """Compare the lower-cased attribute name to expected.

        Must call after ReadName()

        TODO: This can be optimized to be "in place", with zero allocs.
        """
        return expected == self._CanonicalAttrName()

    def _QuotedRead(self):
        # type: () -> Tuple[h8_id_t, int]
        """Return the next (token id, end position) inside a quoted value.

        Does not advance; the caller does.
        """
        for pat, tok_id in QUOTED_VALUE_LEX_COMPILED:
            # BUG: We can OVER-READ what the segment lexer gave us, e.g. with
            # <a href=">"> - the inside > ends it
            m = pat.match(self.s, self.pos)
            if m:
                end_pos = m.end(0)  # Advance
                #log('_QuotedRead %r', self.s[self.pos:end_pos])
                return tok_id, end_pos
        else:
            context = self.s[self.pos:self.pos + 10]
            raise AssertionError('h8_id.Invalid rule should have matched %r' %
                                 context)

    def _ReadQuotedBody(self, body_start, closing_id, label):
        # type: (int, h8_id_t, str) -> int
        """Read the rest of a quoted value, through the closing quote.

        Args:
          body_start: position just past the opening quote
          closing_id: h8_id.DoubleQuote or h8_id.SingleQuote.  The other kind
            of quote is part of the value.
          label: 'DQ' or 'SQ', for the error message

        Returns:
          The position of the closing quote, i.e. the end of the value.  The
          lexer is left just past the closing quote.

        Raises:
          LexError: on an h8_id.Invalid token inside the quotes
        """
        self.pos = body_start
        while True:
            tok_id, q_end_pos = self._QuotedRead()
            #log('self.pos %d q_end_pos %d', self.pos, q_end_pos)
            if tok_id == h8_id.Invalid:
                raise LexError('ReadValue() got invalid token (%s)' % label,
                               self.s, self.pos)
            if tok_id == closing_id:
                right_pos = self.pos
                self.pos = q_end_pos  # Advance past the closing quote
                return right_pos
            self.pos = q_end_pos  # Advance _QuotedRead

    def ReadValue(self, tokens_out=None):
        # type: (Optional[List[Tuple[h8_id_t, int]]]) -> Tuple[attr_value_t, int, int]
        """Read the attribute value.

        In general, it is escaped or "raw"

        Can only be called after a SUCCESSFUL ReadName().
        Assuming ReadName() returned a value, this should NOT fail.

        Args:
          tokens_out: not used yet (TODO)

        Returns:
          (attr_value_e.Missing, -1, -1) when the name had no = sign
          (attr_value_e.Empty, -1, -1) when nothing follows the = sign
          (attr_value_e.Unquoted, start, end)
          (attr_value_e.DoubleQuoted, start, end) and
          (attr_value_e.SingleQuoted, start, end), where start and end
            exclude the quotes

        Raises:
          LexError: for an unquoted value that runs into the closing > or />,
            or an invalid token inside quotes
        """
        # ReadName() invariant
        assert self.name_start >= 0, self.name_start
        assert self.name_end >= 0, self.name_end

        # Consume the name, so a second ReadValue() in a row fails the asserts
        self.name_start = -1
        self.name_end = -1

        if self.next_value_is_missing:
            # Do not advance self.pos
            #log('-> MISSING pos %d : %r', self.pos, self.s[self.pos:])
            return attr_value_e.Missing, -1, -1

        # Now read " ', unquoted or empty= is valid too.
        for pat, a in A_VALUE_LEX_COMPILED:
            m = pat.match(self.s, self.pos)
            if m:
                first_end_pos = m.end(0)
                # We shouldn't go past the end
                assert first_end_pos <= self.end_pos, \
                    'first_end_pos = %d should be less than self.end_pos = %d' % (first_end_pos, self.end_pos)
                #log('m %s', m.groups())

                # Note: Unquoted value can't contain &amp; etc. now, so there
                # is no unquoting, and no respecting tokens_raw.
                if a == h8_val_id.UnquotedVal:
                    if first_end_pos > self.must_not_exceed_pos:
                        #log('first_end_pos %d', first_end_pos)
                        #log('must_not_exceed_pos %d', self.must_not_exceed_pos)
                        raise LexError(
                            'Ambiguous slash: last attribute should be quoted',
                            self.s, first_end_pos)
                    self.pos = first_end_pos  # Advance
                    return attr_value_e.Unquoted, m.start(0), first_end_pos

                # TODO: respect tokens_out
                if a == h8_val_id.DoubleQuote:
                    right_pos = self._ReadQuotedBody(first_end_pos,
                                                     h8_id.DoubleQuote, 'DQ')
                    return attr_value_e.DoubleQuoted, first_end_pos, right_pos

                # TODO: respect tokens_out
                if a == h8_val_id.SingleQuote:
                    right_pos = self._ReadQuotedBody(first_end_pos,
                                                     h8_id.SingleQuote, 'SQ')
                    return attr_value_e.SingleQuoted, first_end_pos, right_pos

                if a == h8_val_id.NoMatch:
                    # <a foo = >
                    # Do not advance: the next ReadName() looks at this char
                    return attr_value_e.Empty, -1, -1
        else:
            raise AssertionError('h8_val_id.NoMatch rule should have matched')
685
686
def GetAttrRaw(attr_lx, name):
    # type: (AttrLexer, str) -> Optional[str]
    """Return the raw value of the first attribute called `name`.

    Reads from the current position of attr_lx, and stops right after the
    value of the attribute that was found.

    Args:
      attr_lx: an AttrLexer positioned by Init() or Reset()
      name: the attribute name, in lower case

    Returns:
      The value as it appears in the input, without quotes and without
      unescaping.  It's '' when the value is missing or empty.  It's None
      when the tag has no such attribute.

    Raises:
      LexError: if the attributes can't be lexed
    """
    while True:
        status, _, _, _ = attr_lx.ReadName()

        if status == attr_name.Done:
            return None

        if status == attr_name.Invalid:
            raise LexError('GetAttrRaw() got invalid token', attr_lx.s,
                           attr_lx.pos)

        if status != attr_name.Ok:
            raise AssertionError()

        # Problem with stateful API: the value has to be read even when the
        # name isn't the one we want, and the name has to be tested BEFORE
        # the value is read.
        is_wanted = attr_lx.AttrNameEquals(name)
        _, val_start, val_end = attr_lx.ReadValue()
        if is_wanted:
            return attr_lx.s[val_start:val_end]
709
710
def AllAttrsRawSlice(attr_lx):
    # type: (AttrLexer) -> List[Tuple[int, int, int, attr_value_t, int, int]]
    """Return the positions of all remaining attributes, without slicing.

    Returns:
      A list with one tuple per attribute:

        (name_start, name_end, equal_end, value kind, val_start, val_end)

      The first three are from ReadName(), and the last three are from
      ReadValue().

    Raises:
      LexError: if the attributes can't be lexed
    """
    slices = []
    while True:
        status, name_start, name_end, equal_end = attr_lx.ReadName()

        if status == attr_name.Done:
            return slices

        if status == attr_name.Invalid:
            raise LexError('AllAttrsRaw() got invalid token', attr_lx.s,
                           attr_lx.pos)

        if status != attr_name.Ok:
            raise AssertionError()

        val_kind, val_start, val_end = attr_lx.ReadValue()
        slices.append((name_start, name_end, equal_end, val_kind, val_start,
                       val_end))
737
738
def AllAttrsRaw(attr_lx):
    # type: (AttrLexer) -> List[Tuple[str, str]]
    """Return all remaining attributes as (name, raw value) pairs.

    For example: [('class', 'foo'), ('href', '?foo=1&amp;bar=2')]

    Names are returned as they're spelled in the input.  Values are NOT
    unescaped; we would need another lexer for that.
    """
    slices = AllAttrsRawSlice(attr_lx)
    text = attr_lx.s
    return [(text[name_start:name_end], text[val_start:val_end])
            for name_start, name_end, _, _, val_start, val_end in slices]