OILS / data_lang / htm8.py View on Github | oils.pub

670 lines, 280 significant
"""data_lang/htm8.py

TODO

API:
- Get rid of AttrValueLexer - this should be in the TagLexer
  - this also means that unquoted values can be more similar
  - We can use a single lexer mode for everything inside <>
    - the SPACE is the only difference
- Deprecate tag_lexer.GetTagName() in favor of lx.CanonicalTagName() or
  _LiteralTagName()
- UTF-8 check, like JSON8
- re2c
  - port lexer, which will fix static typing issues
  - the abstraction needs to support submatch?
    - for finding the end of a tag, etc.?

- LexError and ParseError need details
  - harmonize with data_lang/j8.py, which uses error.Decode(msg, ...,
    cur_line_num)
"""
22
23import re
24
25from typing import Dict, List, Tuple, Optional, IO, Iterator, Any
26
27from _devbuild.gen.htm8_asdl import (h8_id, h8_id_t, h8_tag_id, h8_tag_id_t,
28 h8_tag_id_str)
29from doctools.util import log
30
31
class LexError(Exception):
    """Raised when the input cannot be tokenized.

    Examples of lex errors:

    - h8_id.Invalid, like <> or &&
    - Unclosed <!-- <? <![CDATA[ <script> <style>
    """

    def __init__(self, s, start_pos):
        # type: (str, int) -> None
        self.s = s  # the document being lexed
        self.start_pos = start_pos  # offset where lexing failed

    def __str__(self):
        # type: () -> str
        # Show a 20-character snippet starting at the error position.
        snippet = self.s[self.start_pos:self.start_pos + 20]
        return '(LexError %r)' % snippet
48
49
def FindLineNum(s, error_pos):
    # type: (str, int) -> int
    """Return the 1-based line number containing offset error_pos in s.

    This equals the number of newlines strictly before error_pos, plus one.
    Positions at or past the end of s report the last line.
    """
    # str.count scans in C; it replaces the original hand-written find()
    # loop with identical results.
    return s.count('\n', 0, error_pos) + 1
64
65
class ParseError(Exception):
    """Raised for structural errors above the lexer level.

    Examples of parse errors:

    - unbalanced tag structure
    - ul_table.py errors
    """

    def __init__(self, msg, s=None, start_pos=-1):
        # type: (str, Optional[str], int) -> None
        self.msg = msg
        self.s = s  # optional: the document, for context in __str__
        self.start_pos = start_pos  # optional: error offset within s

    def __str__(self):
        # type: () -> str
        if self.s is None:
            line_num = -1
            snippet = ''
        else:
            # A document without a position makes no sense.
            assert self.start_pos != -1, self.start_pos
            snippet = self.s[self.start_pos:self.start_pos + 20]
            line_num = FindLineNum(self.s, self.start_pos)
        return 'line %d: %r %r' % (line_num, self.msg, snippet)
92
93
class Output(object):
    """Takes an underlying input buffer and an output file. Maintains a
    position in the input buffer.

    Print FROM the input or print new text to the output.
    """

    def __init__(self, s, f, left_pos=0, right_pos=-1):
        # type: (str, IO[str], int, int) -> None
        self.s = s
        self.f = f
        self.pos = left_pos
        # -1 means "until the end of s"
        if right_pos == -1:
            self.right_pos = len(s)
        else:
            self.right_pos = right_pos

    def SkipTo(self, pos):
        # type: (int) -> None
        """Advance the input position without emitting anything."""
        self.pos = pos

    def PrintUntil(self, pos):
        # type: (int) -> None
        """Emit input text from the current position up to pos."""
        self.f.write(self.s[self.pos:pos])
        self.pos = pos

    def PrintTheRest(self):
        # type: () -> None
        """Emit the remaining input, up to right_pos."""
        self.PrintUntil(self.right_pos)

    def Print(self, s):
        # type: (str) -> None
        """Write new text s directly to the output file."""
        self.f.write(s)
129
130
def MakeLexer(rules):
    """Compile each (pattern, token id) rule; patterns use re.VERBOSE."""
    return [(re.compile(pat, re.VERBOSE), tok_id) for pat, tok_id in rules]
133
134
135#
136# Eggex
137#
138# Tag = / ~['>']+ /
139
140# Is this valid? A single character?
141# Tag = / ~'>'* /
142
143# Maybe better: / [NOT '>']+/
144# capital letters not allowed there?
145#
146# But then this is confusing:
147# / [NOT ~digit]+/
148#
149# / [NOT digit] / is [^\d]
150# / ~digit / is \D
151#
152# Or maybe:
153#
154# / [~ digit]+ /
155# / [~ '>']+ /
156# / [NOT '>']+ /
157
158# End = / '</' Tag '>' /
159# StartEnd = / '<' Tag '/>' /
160# Start = / '<' Tag '>' /
161#
162# EntityRef = / '&' dot{* N} ';' /
163
164# Tag name, or attribute name
165# colon is used in XML
166
167# https://www.w3.org/TR/xml/#NT-Name
168# Hm there is a lot of unicode stuff. We are simplifying parsing
169
170_NAME = r'[a-zA-Z][a-zA-Z0-9:_\-]*' # must start with letter
171
# Character references, shared by the main lexer and the attr value lexer.
# https://www.w3.org/TR/xml/#sec-references
CHAR_LEX = [
    (r'&\# [0-9]+ ;', h8_id.DecChar),  # decimal, e.g. &#123;
    (r'&\# x[0-9a-fA-F]+ ;', h8_id.HexChar),  # hex, e.g. &#x7F;
    (r'& %s ;' % _NAME, h8_id.CharEntity),  # named, e.g. &amp;
    # A bare & that doesn't start a reference gets its own token,
    # so callers can allow or reject it.
    (r'&', h8_id.BadAmpersand),
]
181
HTM8_LEX = CHAR_LEX + [
    # Terminated lazily by the Lexer (see the CommentBegin handling).
    (r'<!--', h8_id.CommentBegin),

    # Processing instructions are used for the XML header:
    #   <?xml version="1.0" encoding="UTF-8"?>
    # They are technically XML-only, but in HTML5 they are another kind of
    # comment:
    #
    # https://developer.mozilla.org/en-US/docs/Web/API/ProcessingInstruction
    #
    (r'<\?', h8_id.ProcessingBegin),

    # Not necessary in HTML5, but occurs in XML
    (r'<!\[CDATA\[', h8_id.CDataBegin),  # <![CDATA[

    # Markup declarations
    # - In HTML5, there is only <!DOCTYPE html>
    # - XML has 4 more declarations: <!ELEMENT ...> ATTLIST ENTITY NOTATION
    #   - these seem to be part of DTD
    # - it's useful to skip these, and be able to parse the rest of the
    #   document
    # - Note: < is allowed?
    (r'<! [^>\x00]+ >', h8_id.Decl),

    # Tags
    # Notes:
    # - We look for a valid tag name, but we don't validate attributes.
    #   That's done in the tag lexer.
    # - We don't allow leading whitespace
    (r'</ (%s) >' % _NAME, h8_id.EndTag),
    # self-closing <br/> comes before StartTag
    # could/should these be collapsed into one rule?
    (r'< (%s) [^>\x00]* />' % _NAME, h8_id.StartEndTag),  # end </a>
    (r'< (%s) [^>\x00]* >' % _NAME, h8_id.StartTag),  # start <a>

    # HTML5 allows unescaped > in raw data, but < is not allowed.
    # https://stackoverflow.com/questions/10462348/right-angle-bracket-in-html
    #
    # - My early blog has THREE errors when disallowing >
    # - So do some .wwz files
    (r'[^&<>\x00]+', h8_id.RawData),
    (r'>', h8_id.BadGreaterThan),
    # Anything else, including a stray <, is an error.
    (r'.', h8_id.Invalid),
]

# Old notes:
#
# Non-greedy matches are regular and can be matched in linear time with RE2.
# https://news.ycombinator.com/item?id=27099798
#
# Maybe try combining all of these for speed.

# . is any char except newline
# https://re2c.org/manual/manual_c.html

# Discarded options:
#(r'<!-- .*? -->', h8_id.Comment),
# Hack from Claude: \s\S instead of re.DOTALL. I don't like this
#(r'<!-- [\s\S]*? -->', h8_id.Comment),
#(r'<!-- (?:.|[\n])*? -->', h8_id.Comment),

HTM8_LEX_COMPILED = MakeLexer(HTM8_LEX)
246
247
class Lexer(object):
    """Top-level lexer for an HTM8 document.

    Yields (token id, end position) pairs; the token text is the slice of
    the input between successive end positions.
    """

    def __init__(self, s, left_pos=0, right_pos=-1, no_special_tags=False):
        # type: (str, int, int, bool) -> None
        self.s = s
        self.pos = left_pos
        self.right_pos = len(s) if right_pos == -1 else right_pos
        # When true, <script> and <style> bodies are NOT treated as raw text.
        self.no_special_tags = no_special_tags

        # Lazily compiled patterns for LookAhead(), keyed by regex text.
        self.cache = {}  # type: Dict[str, Any]

        # either </script> or </style> - we search until we see that
        self.search_state = None  # type: Optional[str]

        # Position of tag name, if applicable
        # - Set after you get a StartTag, EndTag, or StartEndTag
        # - Unset on other tags
        self.tag_pos_left = -1
        self.tag_pos_right = -1

    def _Read(self):
        # type: () -> Tuple[h8_id_t, int]
        """Compute the next token WITHOUT advancing self.pos."""
        if self.pos == self.right_pos:
            return h8_id.EndOfStream, self.pos

        assert self.pos < self.right_pos, self.pos

        if self.search_state is not None and not self.no_special_tags:
            # TODO: case-insensitive search for </SCRIPT> <SCRipt> ?
            #
            # Another strategy: enter a mode where we find ONLY the end tag
            # regex, and any data that's not <, and then check the canonical
            # tag name for 'script' or 'style'.
            found = self.s.find(self.search_state, self.pos)
            if found == -1:
                # unterminated <script> or <style>
                raise LexError(self.s, self.pos)
            self.search_state = None
            # The returned position is the BEGINNING of the end tag.
            return h8_id.HtmlCData, found

        # First matching rule wins.
        # Note: frontend/match.py uses _LongestMatch(), which is different!
        # TODO: reconcile them. This lexer should be expressible in re2c.
        for pat, tok_id in HTM8_LEX_COMPILED:
            m = pat.match(self.s, self.pos)
            if not m:
                continue

            if tok_id in (h8_id.StartTag, h8_id.EndTag, h8_id.StartEndTag):
                # Remember where the tag name is, for CanonicalTagName().
                self.tag_pos_left = m.start(1)
                self.tag_pos_right = m.end(1)
            else:
                # Reset state
                self.tag_pos_left = -1
                self.tag_pos_right = -1

            # Lazy termination: scan forward for the closing delimiter of
            # comments / processing instructions / CDATA sections.
            if tok_id == h8_id.CommentBegin:
                end = self.s.find('-->', self.pos)
                if end == -1:
                    raise LexError(self.s, self.pos)  # unterminated <!--
                return h8_id.Comment, end + 3  # past -->

            if tok_id == h8_id.ProcessingBegin:
                end = self.s.find('?>', self.pos)
                if end == -1:
                    raise LexError(self.s, self.pos)  # unterminated <?
                return h8_id.Processing, end + 2  # past ?>

            if tok_id == h8_id.CDataBegin:
                end = self.s.find(']]>', self.pos)
                if end == -1:
                    raise LexError(self.s, self.pos)  # unterminated <![CDATA[
                return h8_id.CData, end + 3  # past ]]>

            if tok_id == h8_id.StartTag:
                # TODO: reduce allocations
                if (self.TagNameEquals('script') or
                        self.TagNameEquals('style')):
                    # <SCRipt a=b> ends at the literal </SCRipt>
                    self.search_state = '</' + self._LiteralTagName() + '>'

            return tok_id, m.end()

        raise AssertionError('h8_id.Invalid rule should have matched')

    def TagNameEquals(self, expected):
        # type: (str) -> bool
        """Compare the current tag name, case-insensitively."""
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right
        # TODO: In C++ this could compare in place, without an allocation.
        return expected == self.CanonicalTagName()

    def _LiteralTagName(self):
        # type: () -> str
        """Tag name exactly as written in the source (case preserved)."""
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right
        return self.s[self.tag_pos_left:self.tag_pos_right]

    def CanonicalTagName(self):
        # type: () -> str
        """Lower-cased tag name."""
        name = self._LiteralTagName()
        # Most tags are already lower case, so avoid allocating in that case.
        # TODO: this could go in the mycpp runtime?
        if name.islower():
            return name
        return name.lower()

    def Read(self):
        # type: () -> Tuple[h8_id_t, int]
        """Return the next (token id, end position), advancing the lexer."""
        tok_id, end_pos = self._Read()
        self.pos = end_pos  # advance
        return tok_id, end_pos

    def LookAhead(self, regex):
        # type: (str) -> bool
        """Test whether `regex` matches at the current position."""
        # Cache the compiled pattern. This could also be LookAheadFor(THEAD)
        # or something.
        pat = self.cache.get(regex)
        if pat is None:
            pat = re.compile(regex)
            self.cache[regex] = pat
        return pat.match(self.s, self.pos) is not None
380
381
# Tag names:
#   Match <a or </a
#   Match <h2, but not <2h
#
# HTML 5 doesn't restrict tag names at all:
#   https://html.spec.whatwg.org/#toc-syntax
# XML allows : - .
#   https://www.w3.org/TR/xml/#NT-NameChar
#
# Namespaces for MathML, SVG; XLink, XML, XMLNS
#   https://infra.spec.whatwg.org/#namespaces
#
# Allow - for td-attrs

# Unquoted attribute values: be very lenient - just no whitespace or special
# HTML chars.  I don't think this is more lenient than HTML5, though we
# should check.
_UNQUOTED_VALUE = r'''[^ \t\r\n<>&"'\x00]*'''

# TODO: we don't need to capture the tag name here? That's done at the top
# level
_TAG_RE = re.compile(r'/? \s* (%s)' % _NAME, re.VERBOSE)

# Matches the end of a tag: optional /, then >
_TAG_LAST_RE = re.compile(r'\s* /? >', re.VERBOSE)

# Matches one attribute, e.g. href="foo".
# Notes:
# - In HTML5 and XML, single quoted attributes are also valid.
# - <button disabled> is standard usage (value omitted entirely).
# - This used to allow whitespace around =, as in <a foo = "bar"> (XML),
#   but <a foo= bar> is TWO attributes in HTML5, so the space is
#   problematic.
_ATTR_RE = re.compile(
    r'''
\s+                       # Leading whitespace is required
(%s)                      # Attribute name
(?:                       # Optional attribute value
  \s* = \s*               # Spaces allowed around =
  (?:
    " ([^>"\x00]*) "      # double quoted value
  | ' ([^>'\x00]*) '      # single quoted value
  | (%s)                  # Attribute value
  )
)?
''' % (_NAME, _UNQUOTED_VALUE), re.VERBOSE)
433
434
class TagLexer(object):
    """
    Given a tag like <a href="..."> or <link type="..." />, the TagLexer
    provides a few operations:

    - What is the tag?
    - Iterate through the attributes, giving (name, value_start_pos,
      value_end_pos)
    """

    def __init__(self, s):
        # type: (str) -> None
        self.s = s
        self.start_pos = -1  # Invalid until Reset()
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        # type: (int, int) -> None
        """Point this reusable instance at a new tag span."""
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos
        self.start_pos = start_pos
        self.end_pos = end_pos

    def WholeTagString(self):
        # type: () -> str
        """Return the entire tag string, e.g. <a href='foo'>"""
        return self.s[self.start_pos:self.end_pos]

    def GetTagName(self):
        # type: () -> str
        """Return the tag name, which is always the first token."""
        tok_id, start, end = next(self.Tokens())
        return self.s[start:end]

    def GetSpanForAttrValue(self, attr_name):
        # type: (str) -> Tuple[int, int]
        """
        Used by oils_doc.py, for href shortcuts
        """
        # Algorithm: search for QuotedValue or UnquotedValue after AttrName
        # TODO: Could also cache these
        events = self.Tokens()
        val = (-1, -1)
        try:
            while True:
                tok_id, start, end = next(events)
                if (tok_id == h8_tag_id.AttrName and
                        self.s[start:end] == attr_name):
                    # The value token always comes next
                    tok_id, start, end = next(events)
                    assert tok_id in (
                        h8_tag_id.QuotedValue, h8_tag_id.UnquotedValue,
                        h8_tag_id.MissingValue), h8_tag_id_str(tok_id)
                    val = start, end
                    break
        except StopIteration:
            pass
        return val

    def GetAttrRaw(self, attr_name):
        # type: (str) -> Optional[str]
        """
        Return the value, which may be UNESCAPED.
        """
        start, end = self.GetSpanForAttrValue(attr_name)
        return None if start == -1 else self.s[start:end]

    def AllAttrsRawSlice(self):
        # type: () -> List[Tuple[str, int, int]]
        """
        Get a list of pairs [('class', 3, 5), ('href', 9, 12)]
        """
        slices = []
        events = self.Tokens()
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id != h8_tag_id.AttrName:
                    continue
                name = self.s[start:end]

                # The value token always comes next
                tok_id, start, end = next(events)
                assert tok_id in (
                    h8_tag_id.QuotedValue, h8_tag_id.UnquotedValue,
                    h8_tag_id.MissingValue), h8_tag_id_str(tok_id)
                # Note: quoted values may have &amp;
                # We would need ANOTHER lexer to unescape them, but we
                # don't need that for ul-table
                slices.append((name, start, end))
        except StopIteration:
            pass
        return slices

    def AllAttrsRaw(self):
        # type: () -> List[Tuple[str, str]]
        """
        Get a list of pairs [('class', 'foo'), ('href', '?foo=1&amp;bar=2')]

        The quoted values may be escaped. We would need another lexer to
        unescape them.
        """
        return [(name, self.s[start:end])
                for name, start, end in self.AllAttrsRawSlice()]

    def Tokens(self):
        # type: () -> Iterator[Tuple[h8_tag_id_t, int, int]]
        """
        Yields a sequence of tokens: Tag (AttrName AttrValue?)*

        Where each Token is (Type, start_pos, end_pos)

        Note that start and end are NOT redundant! We skip over some
        unwanted characters.
        """
        # Skip the leading <, then match the (possibly /-prefixed) tag name.
        m = _TAG_RE.match(self.s, self.start_pos + 1)
        if not m:
            raise RuntimeError("Couldn't find HTML tag in %r" %
                               self.WholeTagString())
        yield h8_tag_id.TagName, m.start(1), m.end(1)

        pos = m.end(0)

        while True:
            # Don't search past the end of this tag.
            m = _ATTR_RE.match(self.s, pos, self.end_pos)
            if not m:
                break

            yield h8_tag_id.AttrName, m.start(1), m.end(1)

            if m.group(2) is not None:
                # double quoted
                yield h8_tag_id.QuotedValue, m.start(2), m.end(2)
            elif m.group(3) is not None:
                # single quoted - TODO: could have different token types
                yield h8_tag_id.QuotedValue, m.start(3), m.end(3)
            elif m.group(4) is not None:
                # unquoted
                yield h8_tag_id.UnquotedValue, m.start(4), m.end(4)
            else:
                # <button disabled> - no value at all
                end = m.end(0)
                yield h8_tag_id.MissingValue, end, end

            # Skip past the closing quote
            pos = m.end(0)

        m = _TAG_LAST_RE.match(self.s, pos)
        if not m:
            # Extra data at end of tag. TODO: add messages for all these.
            raise LexError(self.s, pos)
601
602
# This is similar but not identical to the quoted-value rules in _ATTR_RE:
#     " ([^>"\x00]*) "      double quoted value
#   | ' ([^>'\x00]*) '      single quoted value
#
# Note: for unquoted values, & isn't allowed, and thus &amp; and &#99; and
# &#x99; are not allowed. We could relax that?
ATTR_VALUE_LEX = CHAR_LEX + [
    (r'[^>&\x00]+', h8_id.RawData),
    (r'.', h8_id.Invalid),
]

ATTR_VALUE_LEX_COMPILED = MakeLexer(ATTR_VALUE_LEX)
615
616
class AttrValueLexer(object):
    """Lexes the text of a single attribute value, e.g. the href in:

    <a href="foo=99&amp;bar">
    <a href='foo=99&amp;bar'>
    <a href=unquoted>
    """

    def __init__(self, s):
        # type: (str) -> None
        self.s = s
        self.start_pos = -1  # Invalid until Reset()
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        # type: (int, int) -> None
        """Point this reusable instance at a new value span."""
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos
        self.start_pos = start_pos
        self.end_pos = end_pos

    def NumTokens(self):
        # type: () -> int
        """Count the value's tokens, raising LexError on invalid input."""
        num_tokens = 0
        pos = self.start_pos
        for tok_id, end_pos in self.Tokens():
            if tok_id == h8_id.Invalid:
                raise LexError(self.s, pos)
            pos = end_pos
            num_tokens += 1
        return num_tokens

    def Tokens(self):
        # type: () -> Iterator[Tuple[h8_id_t, int]]
        """Yield (token id, end position) pairs over [start_pos, end_pos)."""
        pos = self.start_pos
        while pos < self.end_pos:
            # First matching rule wins, like the main Lexer.
            # Note: frontend/match.py uses _LongestMatch(), which is
            # different!  TODO: reconcile them; this should be expressible
            # in re2c.
            for pat, tok_id in ATTR_VALUE_LEX_COMPILED:
                m = pat.match(self.s, pos)
                if m:
                    #log('token = %r', m.group(0))
                    end_pos = m.end(0)
                    yield tok_id, end_pos
                    pos = end_pos
                    break
            else:
                raise AssertionError('h8_id.Invalid rule should have matched')