OILS / data_lang / htm8.py View on Github | oils.pub

648 lines, 280 significant
1import re
2
3from typing import Dict, List, Tuple, Optional, IO, Iterator, Any
4
5from _devbuild.gen.htm8_asdl import (h8_id, h8_id_t, h8_tag_id, h8_tag_id_t,
6 h8_tag_id_str)
7from doctools.util import log
8
9
class LexError(Exception):
    """
    Raised when the input can't be split into tokens.

    Examples of lex errors:

    - h8_id.Invalid, like <> or &&
    - Unclosed <!-- <? <![CDATA[ <script> <style>

    Attributes:
      s: the whole input string being lexed
      start_pos: index into s where the problem was detected
    """

    def __init__(self, s, start_pos):
        # type: (str, int) -> None
        self.start_pos = start_pos
        self.s = s

    def __str__(self):
        # type: () -> str
        # Show up to 20 characters of input, starting at the error position
        begin = self.start_pos
        snippet = self.s[begin:begin + 20]
        return '(LexError %r)' % (snippet,)
26
27
def FindLineNum(s, error_pos):
    # type: (str, int) -> int
    """Return the 1-based number of the line in s that contains error_pos.

    Only newlines strictly before error_pos start a new line, so a newline
    located exactly at error_pos still counts as part of the line it ends.
    """
    # Clamp so that a negative position isn't interpreted by count() as an
    # offset from the end of the string; nothing precedes such a position.
    limit = max(error_pos, 0)
    return s.count('\n', 0, limit) + 1
42
43
class ParseError(Exception):
    """
    Raised when the token stream doesn't have the expected structure.

    Examples of parse errors

    - unbalanced tag structure
    - ul_table.py errors

    Attributes:
      msg: description of the problem
      s: the input string, or None when no location is available
      start_pos: index into s where the problem was found, or -1
    """

    def __init__(self, msg, s=None, start_pos=-1):
        # type: (str, Optional[str], int) -> None
        self.msg = msg
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        # type: () -> str
        """Format as 'line N: <msg repr> <snippet repr>'.

        When there is no usable location (no input string, or a negative
        position), the line number is -1 and the snippet is empty.
        """
        # Formatting an error must never raise: an exception here would
        # replace the parse error being reported.  So a missing position is
        # rendered as "no location" rather than asserted on.
        if self.s is not None and self.start_pos >= 0:
            snippet = self.s[self.start_pos:self.start_pos + 20]
            line_num = FindLineNum(self.s, self.start_pos)
        else:
            snippet = ''
            line_num = -1
        return 'line %d: %r %r' % (line_num, self.msg, snippet)
70
71
class Output(object):
    """Copies pieces of an input string to an output file.

    Keeps a cursor (self.pos) into the input string.  Text can be copied
    FROM the input, starting at the cursor, or new text can be written
    directly to the output.
    """

    def __init__(self, s, f, left_pos=0, right_pos=-1):
        # type: (str, IO[str], int, int) -> None
        self.s = s
        self.f = f
        self.pos = left_pos
        # -1 means "through the end of the input"
        if right_pos == -1:
            right_pos = len(s)
        self.right_pos = right_pos

    def SkipTo(self, pos):
        # type: (int) -> None
        """Move the cursor to pos without writing anything."""
        self.pos = pos

    def PrintUntil(self, pos):
        # type: (int) -> None
        """Write the input from the cursor up to pos, then move the cursor there."""
        self.f.write(self.s[self.pos:pos])
        self.pos = pos

    def PrintTheRest(self):
        # type: () -> None
        """Write the input from the cursor up to right_pos."""
        self.PrintUntil(self.right_pos)

    def Print(self, s):
        # type: (str) -> None
        """Write new text to the output; the cursor doesn't move."""
        self.f.write(s)
107
108
def MakeLexer(rules):
    """Compile a list of (pattern, id) rules into (compiled regex, id) pairs.

    Patterns are compiled with re.VERBOSE, and the order of the rules is
    preserved.
    """
    compiled = []
    for pattern, tok_id in rules:
        compiled.append((re.compile(pattern, re.VERBOSE), tok_id))
    return compiled
111
112
113#
114# Eggex
115#
116# Tag = / ~['>']+ /
117
118# Is this valid? A single character?
119# Tag = / ~'>'* /
120
121# Maybe better: / [NOT '>']+/
122# capital letters not allowed there?
123#
124# But then this is confusing:
125# / [NOT ~digit]+/
126#
127# / [NOT digit] / is [^\d]
128# / ~digit / is \D
129#
130# Or maybe:
131#
132# / [~ digit]+ /
133# / [~ '>']+ /
134# / [NOT '>']+ /
135
136# End = / '</' Tag '>' /
137# StartEnd = / '<' Tag '/>' /
138# Start = / '<' Tag '>' /
139#
140# EntityRef = / '&' dot{* N} ';' /
141
142# Tag name, or attribute name
143# colon is used in XML
144
145# https://www.w3.org/TR/xml/#NT-Name
146# Hm there is a lot of unicode stuff. We are simplifying parsing
147
_NAME = r'[a-zA-Z][a-zA-Z0-9:_\-]*'  # must start with a letter

# Rules are tried in order, and the first one that matches wins.  They are
# compiled with re.VERBOSE, so spaces inside the patterns are not significant.

CHAR_LEX = [
    # Character references
    # https://www.w3.org/TR/xml/#sec-references
    (r'&\# [0-9]+ ;', h8_id.DecChar),
    (r'&\# x[0-9a-fA-F]+ ;', h8_id.HexChar),
    (r'& %s ;' % _NAME, h8_id.CharEntity),
    # Allow unquoted, and quoted
    (r'&', h8_id.BadAmpersand),
]

# Start from a copy, so that extending HTM8_LEX doesn't modify CHAR_LEX
HTM8_LEX = list(CHAR_LEX)

# Comments, processing instructions, CDATA, and declarations
HTM8_LEX += [
    (r'<!--', h8_id.CommentBegin),

    # Processing instructions are used for the XML header:
    #   <?xml version="1.0" encoding="UTF-8"?>
    # They are technically XML-only, but in HTML5, they are another kind of
    # comment:
    #
    # https://developer.mozilla.org/en-US/docs/Web/API/ProcessingInstruction
    #
    (r'<\?', h8_id.ProcessingBegin),
    # Not necessary in HTML5, but occurs in XML
    (r'<!\[CDATA\[', h8_id.CDataBegin),  # <![CDATA[

    # Markup declarations
    # - In HTML5, there is only <!DOCTYPE html>
    # - XML has 4 more declarations: <!ELEMENT ...> ATTLIST ENTITY NOTATION
    #   - these seem to be part of DTD
    #   - it's useful to skip these, and be able to parse the rest of the
    #     document
    # - Note: < is allowed?
    (r'<! [^>\x00]+ >', h8_id.Decl),
]

# Tags
# Notes:
# - We look for a valid tag name, but we don't validate attributes.
#   That's done in the tag lexer.
# - We don't allow leading whitespace
HTM8_LEX += [
    (r'</ (%s) >' % _NAME, h8_id.EndTag),
    # self-closing <br/> comes before StartTag
    # could/should these be collapsed into one rule?
    (r'< (%s) [^>\x00]* />' % _NAME, h8_id.StartEndTag),  # self-closing <br/>
    (r'< (%s) [^>\x00]* >' % _NAME, h8_id.StartTag),  # start <a>
]

# Everything else
HTM8_LEX += [
    # HTML5 allows unescaped > in raw data, but < is not allowed.
    # https://stackoverflow.com/questions/10462348/right-angle-bracket-in-html
    #
    # - My early blog has THREE errors when disallowing >
    # - So do some .wwz files
    (r'[^&<>\x00]+', h8_id.RawData),
    (r'>', h8_id.BadGreaterThan),
    # < is an error
    (r'.', h8_id.Invalid),
]

# Old notes:
#
# Non-greedy matches are regular and can be matched in linear time
# with RE2.
#
# https://news.ycombinator.com/item?id=27099798
#
# Maybe try combining all of these for speed.

# . is any char except newline
# https://re2c.org/manual/manual_c.html

# Discarded options
#(r'<!-- .*? -->', h8_id.Comment),

# Hack from Claude: \s\S instead of re.DOTALL.  I don't like this
#(r'<!-- [\s\S]*? -->', h8_id.Comment),
#(r'<!-- (?:.|[\n])*? -->', h8_id.Comment),

HTM8_LEX_COMPILED = MakeLexer(HTM8_LEX)
224
225
class Lexer(object):
    """Top-level HTM8 lexer over the region s[left_pos:right_pos].

    Read() returns a (token id, end position) pair and advances self.pos to
    that end position, so a token spans from the previous position to the
    returned one.

    After a StartTag whose canonical name is 'script' or 'style', the next
    Read() returns everything up to the matching literal end tag as a single
    HtmlCData token, unless no_special_tags is set.

    Nothing at or beyond right_pos is examined: regex matches and searches
    for closing delimiters are all bounded by it, so a token never ends past
    right_pos.
    """

    def __init__(self, s, left_pos=0, right_pos=-1, no_special_tags=False):
        # type: (str, int, int, bool) -> None
        self.s = s
        self.pos = left_pos
        # -1 means "through the end of the input"
        self.right_pos = len(s) if right_pos == -1 else right_pos
        self.no_special_tags = no_special_tags

        # string -> compiled regex pattern object
        self.cache = {}  # type: Dict[str, Any]

        # either </script> or </style> - we search until we see that
        self.search_state = None  # type: Optional[str]

        # Position of tag name, if applicable
        # - Set after you get a StartTag, EndTag, or StartEndTag
        # - Unset on other tags
        self.tag_pos_left = -1
        self.tag_pos_right = -1

    def _SkipPast(self, terminator):
        # type: (str) -> int
        """Return the position just past the next terminator.

        The search starts at the current position and stops at right_pos.

        Raises:
          LexError: if the terminator doesn't occur, e.g. an unterminated
            <!-- or <? or <![CDATA[
        """
        pos = self.s.find(terminator, self.pos, self.right_pos)
        if pos == -1:
            raise LexError(self.s, self.pos)
        return pos + len(terminator)

    def _Read(self):
        # type: () -> Tuple[h8_id_t, int]
        """Return the next (token id, end position) without advancing.

        Raises:
          LexError: on unterminated <!-- <? <![CDATA[ <script> <style>
        """
        if self.pos == self.right_pos:
            return h8_id.EndOfStream, self.pos

        assert self.pos < self.right_pos, self.pos

        if self.search_state is not None and not self.no_special_tags:
            # TODO: case-insensitive search for </SCRIPT> <SCRipt> ?
            #
            # Another strategy: enter a mode where we find ONLY the end tag
            # regex, and any data that's not <, and then check the canonical
            # tag name for 'script' or 'style'.
            pos = self.s.find(self.search_state, self.pos, self.right_pos)
            if pos == -1:
                # unterminated <script> or <style>
                raise LexError(self.s, self.pos)
            self.search_state = None
            # The token ends at the BEGINNING of the end tag, which is then
            # lexed as a regular EndTag.
            return h8_id.HtmlCData, pos

        # Find the first match.
        # Note: frontend/match.py uses _LongestMatch(), which is different!
        # TODO: reconcile them.  This lexer should be expressible in re2c.

        for pat, tok_id in HTM8_LEX_COMPILED:
            # Passing right_pos as endpos keeps the token inside our region
            m = pat.match(self.s, self.pos, self.right_pos)
            if m is None:
                continue

            if tok_id in (h8_id.StartTag, h8_id.EndTag, h8_id.StartEndTag):
                self.tag_pos_left = m.start(1)
                self.tag_pos_right = m.end(1)
            else:
                # Reset state
                self.tag_pos_left = -1
                self.tag_pos_right = -1

            # For these three, the regex only matches the opening delimiter;
            # the token extends through the closing delimiter.
            if tok_id == h8_id.CommentBegin:
                return h8_id.Comment, self._SkipPast('-->')

            if tok_id == h8_id.ProcessingBegin:
                return h8_id.Processing, self._SkipPast('?>')

            if tok_id == h8_id.CDataBegin:
                return h8_id.CData, self._SkipPast(']]>')

            if tok_id == h8_id.StartTag:
                # TODO: reduce allocations
                if (self.TagNameEquals('script') or
                        self.TagNameEquals('style')):
                    # <SCRipt a=b> -> </SCRipt>
                    self.search_state = '</' + self._LiteralTagName() + '>'

            return tok_id, m.end()

        raise AssertionError('h8_id.Invalid rule should have matched')

    def TagNameEquals(self, expected):
        # type: (str) -> bool
        """Compare the current tag's canonical (lower case) name to expected.

        Only valid right after a StartTag, EndTag, or StartEndTag.
        """
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        # TODO: In C++, this does not need an allocation.  Can we test
        # directly?
        return expected == self.CanonicalTagName()

    def _LiteralTagName(self):
        # type: () -> str
        """Return the current tag's name exactly as written in the input."""
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        return self.s[self.tag_pos_left:self.tag_pos_right]

    def CanonicalTagName(self):
        # type: () -> str
        """Return the current tag's name in lower case."""
        tag_name = self._LiteralTagName()
        # Most tags are already lower case, so avoid allocation with this
        # conditional
        # TODO: this could go in the mycpp runtime?
        if tag_name.islower():
            return tag_name
        return tag_name.lower()

    def Read(self):
        # type: () -> Tuple[h8_id_t, int]
        """Return the next (token id, end position), and advance past it."""
        tok_id, end_pos = self._Read()
        self.pos = end_pos  # advance
        return tok_id, end_pos

    def LookAhead(self, regex):
        # type: (str) -> bool
        """Return whether regex matches at the current position.

        The position isn't changed, and the match can't extend past
        right_pos.
        """
        # Cache the regex compilation.  This could also be LookAheadFor(THEAD)
        # or something.
        pat = self.cache.get(regex)
        if pat is None:
            pat = re.compile(regex)
            self.cache[regex] = pat

        m = pat.match(self.s, self.pos, self.right_pos)
        return m is not None
358
359
360# Tag names:
361# Match <a or </a
362# Match <h2, but not <2h
363#
364# HTML 5 doesn't restrict tag names at all
365# https://html.spec.whatwg.org/#toc-syntax
366#
367# XML allows : - .
368# https://www.w3.org/TR/xml/#NT-NameChar
369
370# Namespaces for MathML, SVG
371# XLink, XML, XMLNS
372#
373# https://infra.spec.whatwg.org/#namespaces
374#
375# Allow - for td-attrs
376
377# Be very lenient - just no whitespace or special HTML chars
378# I don't think this is more lenient than HTML5, though we should check.
_UNQUOTED_VALUE = r'''[^ \t\r\n<>&"'\x00]*'''

# TODO: we don't need to capture the tag name here?  That's done at the top
# level
_TAG_RE = re.compile(r'/? \s* (%s)' % _NAME, flags=re.VERBOSE)

_TAG_LAST_RE = re.compile(r'\s* /? >', flags=re.VERBOSE)

# To match href="foo"
# Note: in HTML5 and XML, single quoted attributes are also valid

# <button disabled> is standard usage

# NOTE(review): an earlier version of this note said whitespace around = was
# no longer allowed, but the pattern below still accepts it - confirm which
# is intended.  The concern that was recorded:
#   <a foo = "bar"> makes sense in XML
# But then you also have
#   <a foo= bar> - which is TWO attributes, in HTML5
# So the space is problematic

# Groups: 1 = name, 2 = double quoted value, 3 = single quoted value,
# 4 = unquoted value.  Groups 2-4 are all None when there's no value.
_ATTR_RE = re.compile(
    r'''
\s+                     # Leading whitespace is required
(%s)                    # Attribute name
(?:                     # Optional attribute value
  \s* = \s*             # Spaces allowed around =
  (?:
    " ([^>"\x00]*) "    # double quoted value
  | ' ([^>'\x00]*) '    # single quoted value
  | (%s)                # Attribute value
  )
)?
''' % (_NAME, _UNQUOTED_VALUE),
    flags=re.VERBOSE)
411
412
class TagLexer(object):
    """
    Lexes the inside of a single tag, like <a href="..."> or
    <link type="..." />.  It provides a few operations:

    - What is the tag?
    - Iterate through the attributes, giving (name, value_start_pos,
      value_end_pos)

    One instance can be reused for many tags in the same string: call Reset()
    with the span of each tag.
    """

    def __init__(self, s):
        # type: (str) -> None
        self.s = s
        # Both are invalid until Reset() is called
        self.start_pos = -1
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        # type: (int, int) -> None
        """Point this lexer at the tag spanning s[start_pos:end_pos]."""
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos

        self.start_pos = start_pos
        self.end_pos = end_pos

    def WholeTagString(self):
        # type: () -> str
        """Return the entire tag string, e.g. <a href='foo'>"""
        return self.s[self.start_pos:self.end_pos]

    def GetTagName(self):
        # type: () -> str
        """Return the tag name as written in the input."""
        # The tag name is always the first token
        _, name_start, name_end = next(self.Tokens())
        return self.s[name_start:name_end]

    def GetSpanForAttrValue(self, attr_name):
        # type: (str) -> Tuple[int, int]
        """
        Return the (start, end) span of the value of the first attribute
        named attr_name, or (-1, -1) if there's no such attribute.

        Used by oils_doc.py, for href shortcuts
        """
        # Algorithm: search for QuotedValue or UnquotedValue after AttrName
        # TODO: Could also cache these

        events = self.Tokens()
        try:
            for tok_id, start, end in events:
                if tok_id != h8_tag_id.AttrName:
                    continue
                if self.s[start:end] != attr_name:
                    continue

                # The value should come next
                val_id, val_start, val_end = next(events)
                assert val_id in (h8_tag_id.QuotedValue,
                                  h8_tag_id.UnquotedValue,
                                  h8_tag_id.MissingValue), h8_tag_id_str(val_id)
                return val_start, val_end
        except StopIteration:
            # Ran out of tokens while looking for the value
            pass
        return -1, -1

    def GetAttrRaw(self, attr_name):
        # type: (str) -> Optional[str]
        """
        Return the value of attr_name, or None if there's no such attribute.
        The value may be UNESCAPED.
        """
        start, end = self.GetSpanForAttrValue(attr_name)
        return None if start == -1 else self.s[start:end]

    def AllAttrsRawSlice(self):
        # type: () -> List[Tuple[str, int, int]]
        """
        Get a list of (name, value start, value end), like
        [('class', 3, 5), ('href', 9, 12)]
        """
        slices = []
        events = self.Tokens()
        try:
            for tok_id, start, end in events:
                if tok_id != h8_tag_id.AttrName:
                    continue
                name = self.s[start:end]

                # The value should come next
                val_id, val_start, val_end = next(events)
                assert val_id in (h8_tag_id.QuotedValue,
                                  h8_tag_id.UnquotedValue,
                                  h8_tag_id.MissingValue), h8_tag_id_str(val_id)
                # Note: quoted values may have &amp;
                # We would need ANOTHER lexer to unescape them, but we
                # don't need that for ul-table
                slices.append((name, val_start, val_end))
        except StopIteration:
            # Ran out of tokens while looking for a value
            pass
        return slices

    def AllAttrsRaw(self):
        # type: () -> List[Tuple[str, str]]
        """
        Get a list of pairs [('class', 'foo'), ('href', '?foo=1&amp;bar=2')]

        The quoted values may be escaped.  We would need another lexer to
        unescape them.
        """
        return [(name, self.s[start:end])
                for name, start, end in self.AllAttrsRawSlice()]

    def Tokens(self):
        # type: () -> Iterator[Tuple[h8_tag_id_t, int, int]]
        """
        Yields a sequence of tokens: Tag (AttrName AttrValue)*

        Where each Token is (Type, start_pos, end_pos), and every AttrName is
        followed by a QuotedValue, UnquotedValue, or MissingValue token.

        Note that start and end are NOT redundant!  We skip over some unwanted
        characters.

        Raises:
          RuntimeError: if there's no tag name
          LexError: if there's extra data before the closing > or />
        """
        # + 1 to skip the leading <
        tag_match = _TAG_RE.match(self.s, self.start_pos + 1)
        if tag_match is None:
            raise RuntimeError("Couldn't find HTML tag in %r" %
                               self.WholeTagString())
        yield h8_tag_id.TagName, tag_match.start(1), tag_match.end(1)

        # Regex group -> the kind of value it captures
        # TODO: single quoted values could have a different token type
        value_groups = (
            (2, h8_tag_id.QuotedValue),  # double quoted
            (3, h8_tag_id.QuotedValue),  # single quoted
            (4, h8_tag_id.UnquotedValue),
        )

        pos = tag_match.end(0)
        while True:
            # don't search past the end
            attr = _ATTR_RE.match(self.s, pos, self.end_pos)
            if attr is None:
                break

            yield h8_tag_id.AttrName, attr.start(1), attr.end(1)

            for group, value_id in value_groups:
                if attr.group(group) is not None:
                    yield value_id, attr.start(group), attr.end(group)
                    break
            else:
                # <button disabled>
                # The value is an empty span right after the name
                after = attr.end(0)
                yield h8_tag_id.MissingValue, after, after

            # Skip past the whole attribute, including a closing quote
            pos = attr.end(0)

        if _TAG_LAST_RE.match(self.s, pos) is None:
            # Extra data at end of tag.  TODO: add messages for all these.
            raise LexError(self.s, pos)
579
580
581# This is similar but not identical to
582# " ([^>"\x00]*) " # double quoted value
583# | ' ([^>'\x00]*) ' # single quoted value
584#
585# Note: for unquoted values, & isn't allowed, and thus &amp; and &#99; and
586# &#x99; are not allowed. We could relax that?
# Compiled rules for lexing an attribute value: the character reference rules,
# followed by raw data and a catch-all.
ATTR_VALUE_LEXER = MakeLexer(CHAR_LEX + [
    (r'[^>&\x00]+', h8_id.RawData),
    (r'.', h8_id.Invalid),
])
593
594
class AttrValueLexer(object):
    """
    Lexes the value of a single attribute, given its span within the input.

    <a href="foo=99&amp;bar">
    <a href='foo=99&amp;bar'>
    <a href=unquoted>

    One instance can be reused for many values in the same string: call
    Reset() with the span of each value.
    """

    def __init__(self, s):
        # type: (str) -> None
        self.s = s
        # Both are invalid until Reset() is called
        self.start_pos = -1
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        # type: (int, int) -> None
        """Point this lexer at the value spanning s[start_pos:end_pos]."""
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos

        self.start_pos = start_pos
        self.end_pos = end_pos

    def NumTokens(self):
        # type: () -> int
        """Return the number of tokens in the value.

        Raises:
          LexError: if the value has an h8_id.Invalid token; the error
            position is the start of that token
        """
        num_tokens = 0
        pos = self.start_pos
        for tok_id, end_pos in self.Tokens():
            if tok_id == h8_id.Invalid:
                raise LexError(self.s, pos)
            pos = end_pos
            num_tokens += 1
        return num_tokens

    def Tokens(self):
        # type: () -> Iterator[Tuple[h8_id_t, int]]
        """Yield (token id, end position) for each token in the value.

        Every end position is at most self.end_pos.
        """
        pos = self.start_pos
        while pos < self.end_pos:
            # Find the first match, like above.
            # Note: frontend/match.py uses _LongestMatch(), which is different!
            # TODO: reconcile them.  This lexer should be expressible in re2c.
            for pat, tok_id in ATTR_VALUE_LEXER:
                # Bound the match by end_pos.  Otherwise a token would run
                # past the value, through the closing quote and into the
                # following attributes.
                m = pat.match(self.s, pos, self.end_pos)
                if m:
                    end_pos = m.end(0)
                    yield tok_id, end_pos
                    pos = end_pos
                    break
            else:
                raise AssertionError('h8_id.Invalid rule should have matched')