#!/usr/bin/env python2
"""
lazylex/html.py - Low-Level HTML Processing.

See lazylex/README.md for details.

TODO: This should be an Oils library eventually. It's a "lazily-parsed data
structure" like TSV8
"""
from __future__ import print_function

try:
    from cStringIO import StringIO
except ImportError:
    from io import StringIO  # python3
import re
import sys

if sys.version_info.major == 2:
    from typing import List, Tuple, Optional


def log(msg, *args):
    msg = msg % args
    print(msg, file=sys.stderr)


class LexError(Exception):
    """
    Examples of lex errors:

    - Tok.Invalid, like <> or &&
    - Unclosed <!--  <?  <![CDATA[  <script>  <style>
    """

    def __init__(self, s, start_pos):
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        return '(LexError %r)' % (self.s[self.start_pos:self.start_pos + 20])


def FindLineNum(s, error_pos):
    current_pos = 0
    line_num = 1
    while True:
        newline_pos = s.find('\n', current_pos)
        #log('current = %d, N %d, line %d', current_pos, newline_pos, line_num)

        if newline_pos == -1:  # this is the last line
            return line_num
        if newline_pos >= error_pos:
            return line_num
        line_num += 1
        current_pos = newline_pos + 1
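
# Example (sketch, not in the original file): position 4 is the 'd' on the
# second line, so:
#
#   FindLineNum('ab\ncd\nef', 4)  # => 2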


class ParseError(Exception):
    """
    Examples of parse errors:

    - unbalanced tag structure
    - ul_table.py errors
    """

    def __init__(self, msg, s=None, start_pos=-1):
        self.msg = msg
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        if self.s is not None:
            assert self.start_pos != -1, self.start_pos
            snippet = self.s[self.start_pos:self.start_pos + 20]

            line_num = FindLineNum(self.s, self.start_pos)
        else:
            snippet = ''
            line_num = -1
        msg = 'line %d: %r %r' % (line_num, self.msg, snippet)
        return msg


class Output(object):
    """Takes an underlying input buffer and an output file. Maintains a
    position in the input buffer.

    Print FROM the input, or print new text to the output.
    """

    def __init__(self, s, f, left_pos=0, right_pos=-1):
        self.s = s
        self.f = f
        self.pos = left_pos
        self.right_pos = len(s) if right_pos == -1 else right_pos

    def SkipTo(self, pos):
        """Skip to a position without printing."""
        self.pos = pos

    def PrintUntil(self, pos):
        """Print from the current position up to pos."""
        piece = self.s[self.pos:pos]
        self.f.write(piece)
        self.pos = pos

    def PrintTheRest(self):
        """Print until the end of the string."""
        self.PrintUntil(self.right_pos)

    def Print(self, s):
        """Print new text to the output file."""
        self.f.write(s)
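
# Usage sketch (not in the original file): copy some input, substitute new
# text, then skip the replaced span.
#
#   f = StringIO()
#   out = Output('a X b', f)
#   out.PrintUntil(2)    # copy 'a '
#   out.Print('Y')       # write replacement text
#   out.SkipTo(3)        # skip over 'X'
#   out.PrintTheRest()   # copy ' b'
#   f.getvalue()         # => 'a Y b'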


# HTML Tokens
# CommentBegin, ProcessingBegin, CDataBegin are "pseudo-tokens", not visible
TOKENS = ('Decl Comment CommentBegin Processing ProcessingBegin CData '
          'CDataBegin StartTag StartEndTag EndTag DecChar HexChar '
          'CharEntity RawData HtmlCData Invalid EndOfStream').split()


class Tok(object):
    """
    Avoid lint errors by using these aliases
    """
    pass


TOKEN_NAMES = [None] * len(TOKENS)  # type: List[str]

this_module = sys.modules[__name__]
for i, tok_str in enumerate(TOKENS):
    setattr(this_module, tok_str, i)
    setattr(Tok, tok_str, i)
    TOKEN_NAMES[i] = tok_str


def TokenName(tok_id):
    return TOKEN_NAMES[tok_id]
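
# Example (sketch): token IDs are small integers assigned in TOKENS order,
# e.g. Tok.Decl == 0, and TokenName(Tok.StartTag) == 'StartTag'.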


def MakeLexer(rules):
    return [(re.compile(pat, re.VERBOSE), i) for (pat, i) in rules]


#
# Eggex
#
# Tag = / ~['>']+ /

# Is this valid? A single character?
# Tag = / ~'>'* /

# Maybe better: / [NOT '>']+ /
# capital letters not allowed there?
#
# But then this is confusing:
# / [NOT ~digit]+ /
#
# / [NOT digit] / is [^\d]
# / ~digit /     is \D
#
# Or maybe:
#
# / [~ digit]+ /
# / [~ '>']+ /
# / [NOT '>']+ /

# End = / '</' Tag '>' /
# StartEnd = / '<' Tag '/>' /
# Start = / '<' Tag '>' /
#
# EntityRef = / '&' dot{* N} ';' /

# Tag name, or attribute name
# colon is used in XML

# https://www.w3.org/TR/xml/#NT-Name
# Hm, there is a lot of Unicode stuff there. We are simplifying parsing.

_NAME = r'[a-zA-Z][a-zA-Z0-9:_\-]*'  # must start with a letter

LEXER = [
    (r'<!--', Tok.CommentBegin),

    # Processing instructions are used for the XML header:
    # <?xml version="1.0" encoding="UTF-8"?>
    # They are technically XML-only, but in HTML5, they are another kind of
    # comment:
    #
    # https://developer.mozilla.org/en-US/docs/Web/API/ProcessingInstruction
    #
    (r'<\?', Tok.ProcessingBegin),
    # Not necessary in HTML5, but occurs in XML
    (r'<!\[CDATA\[', Tok.CDataBegin),  # <![CDATA[

    # Markup declarations
    # - In HTML5, there is only <!DOCTYPE html>
    # - XML has 4 more declarations: <!ELEMENT ...> ATTLIST ENTITY NOTATION
    #   - these seem to be part of the DTD
    # - it's useful to skip these, and be able to parse the rest of the
    #   document
    # - Note: < is allowed?
    (r'<! [^>]+ >', Tok.Decl),

    # Tags
    # Notes:
    # - We look for a valid tag name, but we don't validate attributes.
    #   That's done in the tag lexer.
    # - We don't allow leading whitespace
    (r'</ (%s) >' % _NAME, Tok.EndTag),  # end tag </a>
    # self-closing <br/> must come before StartTag
    (r'< (%s) [^>]* />' % _NAME, Tok.StartEndTag),  # self-closing <br/>
    (r'< (%s) [^>]* >' % _NAME, Tok.StartTag),  # start tag <a>

    # Characters
    # https://www.w3.org/TR/xml/#sec-references
    (r'&\# [0-9]+ ;', Tok.DecChar),
    (r'&\# x[0-9a-fA-F]+ ;', Tok.HexChar),
    (r'& %s ;' % _NAME, Tok.CharEntity),

    # HTML5 allows unescaped > in raw data, but < is not allowed.
    # https://stackoverflow.com/questions/10462348/right-angle-bracket-in-html
    #
    # - My early blog has THREE errors when disallowing >
    # - So do some .wwz files
    (r'[^&<]+', Tok.RawData),
    (r'.', Tok.Invalid),  # error!
]

# Old notes:
#
# Non-greedy matches are regular and can be matched in linear time
# with RE2.
#
# https://news.ycombinator.com/item?id=27099798
#
# Maybe try combining all of these for speed.

# . is any char except newline
# https://re2c.org/manual/manual_c.html

# Discarded options:
#(r'<!-- .*? -->', Tok.Comment),

# Hack from Claude: \s\S instead of re.DOTALL. I don't like this
#(r'<!-- [\s\S]*? -->', Tok.Comment),
#(r'<!-- (?:.|[\n])*? -->', Tok.Comment),

LEXER = MakeLexer(LEXER)
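
# Rule order matters because _Peek() below takes the FIRST match, not the
# longest. For example (sketch, not in the original), '<br/>' matches the
# StartEndTag rule; if the StartTag rule came first, it would match instead,
# since its [^>]* can consume the '/'.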


class Lexer(object):

    def __init__(self, s, left_pos=0, right_pos=-1):
        self.s = s
        self.pos = left_pos
        self.right_pos = len(s) if right_pos == -1 else right_pos
        self.cache = {}  # string -> compiled regex pattern object

        # either </script> or </style> - we search until we see that
        self.search_state = None  # type: Optional[str]

        # Position of tag name, if applicable
        # - Set after you get a StartTag, EndTag, or StartEndTag
        # - Unset on other tags
        self.tag_pos_left = -1
        self.tag_pos_right = -1

    def _Peek(self):
        # type: () -> Tuple[int, int]
        """
        Note: not using _Peek() now
        """
        if self.pos == self.right_pos:
            return Tok.EndOfStream, self.pos

        assert self.pos < self.right_pos, self.pos

        if self.search_state is not None:
            pos = self.s.find(self.search_state, self.pos)
            if pos == -1:
                # unterminated <script> or <style>
                raise LexError(self.s, self.pos)
            self.search_state = None
            # Everything up to the closing tag is HtmlCData
            return Tok.HtmlCData, pos

        # Find the first match.
        # Note: frontend/match.py uses _LongestMatch(), which is different!
        # TODO: reconcile them. This lexer should be expressible in re2c.

        for pat, tok_id in LEXER:
            m = pat.match(self.s, self.pos)
            if m:
                if tok_id in (Tok.StartTag, Tok.EndTag, Tok.StartEndTag):
                    self.tag_pos_left = m.start(1)
                    self.tag_pos_right = m.end(1)
                else:
                    # Reset state
                    self.tag_pos_left = -1
                    self.tag_pos_right = -1

                if tok_id == Tok.CommentBegin:
                    pos = self.s.find('-->', self.pos)
                    if pos == -1:
                        # unterminated <!--
                        raise LexError(self.s, self.pos)
                    return Tok.Comment, pos + 3  # -->

                if tok_id == Tok.ProcessingBegin:
                    pos = self.s.find('?>', self.pos)
                    if pos == -1:
                        # unterminated <?
                        raise LexError(self.s, self.pos)
                    return Tok.Processing, pos + 2  # ?>

                if tok_id == Tok.CDataBegin:
                    pos = self.s.find(']]>', self.pos)
                    if pos == -1:
                        # unterminated <![CDATA[
                        raise LexError(self.s, self.pos)
                    return Tok.CData, pos + 3  # ]]>

                if tok_id == Tok.StartTag:
                    if self.TagNameEquals('script'):
                        self.search_state = '</script>'
                    elif self.TagNameEquals('style'):
                        self.search_state = '</style>'

                return tok_id, m.end()
        else:
            raise AssertionError('Tok.Invalid rule should have matched')

    def TagNameEquals(self, expected):
        # type: (str) -> bool
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        # TODO: In C++, this does not need an allocation
        return expected == self.s[self.tag_pos_left:self.tag_pos_right]

    def TagName(self):
        # type: () -> str
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        return self.s[self.tag_pos_left:self.tag_pos_right]

    def Read(self):
        # type: () -> Tuple[int, int]
        tok_id, end_pos = self._Peek()
        self.pos = end_pos  # advance
        return tok_id, end_pos

    def LookAhead(self, regex):
        # Cache the regex compilation. This could also be LookAheadFor(THEAD)
        # or something.
        pat = self.cache.get(regex)
        if pat is None:
            pat = re.compile(regex)
            self.cache[regex] = pat

        m = pat.match(self.s, self.pos)
        return m is not None
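
# Usage sketch (not in the original file):
#
#   lx = Lexer('<p>hi</p>')
#   while True:
#       tok_id, end_pos = lx.Read()
#       if tok_id == Tok.EndOfStream:
#           break
#       print(TokenName(tok_id), end_pos)
#
# prints: StartTag 3, RawData 5, EndTag 9.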


def _Tokens(s, left_pos, right_pos):
    """
    Args:
      s: string to parse
      left_pos, right_pos: Optional span boundaries.
    """
    lx = Lexer(s, left_pos, right_pos)
    while True:
        tok_id, pos = lx.Read()
        yield tok_id, pos
        if tok_id == Tok.EndOfStream:
            break


def ValidTokens(s, left_pos=0, right_pos=-1):
    """Wrapper around _Tokens to prevent callers from having to handle Invalid.

    I'm not combining the two functions because I might want to do a
    'yield' transformation on Tokens()?  Exceptions might complicate the
    issue?
    """
    pos = left_pos
    for tok_id, end_pos in _Tokens(s, left_pos, right_pos):
        if tok_id == Tok.Invalid:
            raise LexError(s, pos)
        yield tok_id, end_pos
        pos = end_pos
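
# Example (sketch, not in the original file):
#
#   list(ValidTokens('a &amp; b'))
#   # => [(Tok.RawData, 2), (Tok.CharEntity, 7), (Tok.RawData, 9),
#   #     (Tok.EndOfStream, 9)]
#
# Each position is the END of the token; the start is the previous end.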


# Tag names:
#   Match <a  or </a
#   Match <h2, but not <2h
#
# HTML 5 doesn't restrict tag names at all
#   https://html.spec.whatwg.org/#toc-syntax
#
# XML allows : - .
#   https://www.w3.org/TR/xml/#NT-NameChar

# Namespaces for MathML, SVG
# XLink, XML, XMLNS
#
# https://infra.spec.whatwg.org/#namespaces
#
# Allow - for td-attrs

_ATTR_VALUE = r'[a-zA-Z0-9_\-]+'  # allow hyphens

# TODO: we don't need to capture the tag name here?  That's done at the top
# level.
_TAG_RE = re.compile(r'/? \s* (%s)' % _NAME, re.VERBOSE)

# To match href="foo" or href=foo

_ATTR_RE = re.compile(
    r'''
\s+             # Leading whitespace is required
(%s)            # Attribute name
(?:             # Optional attribute value
  \s* = \s*
  (?:
    " ([^>"]*) "   # double-quoted value
    | (%s)         # unquoted value
    # TODO: relax this? for href=$foo
  )
)?
''' % (_NAME, _ATTR_VALUE), re.VERBOSE)
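
# Example (sketch): _ATTR_RE.match(' href="foo" x=y') captures
# group(1) == 'href' and group(2) == 'foo' (group(3) is None).  Matching
# again at m.end(0) picks up the unquoted x=y, with group(1) == 'x' and
# group(3) == 'y'.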

TagName, AttrName, UnquotedValue, QuotedValue = range(4)


class TagLexer(object):
    """
    Given a tag like <a href="..."> or <link type="..." />, the TagLexer
    provides a few operations:

    - What is the tag?
    - Iterate through the attributes, giving (name, value_start_pos,
      value_end_pos)
    """

    def __init__(self, s):
        self.s = s
        self.start_pos = -1  # Invalid
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        """Reuse instances of this object."""
        self.start_pos = start_pos
        self.end_pos = end_pos

    def TagString(self):
        return self.s[self.start_pos:self.end_pos]

    def TagName(self):
        # First event
        tok_id, start, end = next(self.Tokens())
        return self.s[start:end]

    def GetSpanForAttrValue(self, attr_name):
        # Algorithm: search for QuotedValue or UnquotedValue after AttrName
        # TODO: Could also cache these

        events = self.Tokens()
        val = (-1, -1)
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == AttrName:
                    name = self.s[start:end]
                    if name == attr_name:
                        # The value should come next
                        tok_id, start, end = next(events)
                        if tok_id in (QuotedValue, UnquotedValue):
                            # Note: quoted values may have &amp;
                            # We would need ANOTHER lexer to unescape them.
                            # Right now help_gen.py and oils_doc.py don't
                            # need that.
                            val = start, end
                        break

        except StopIteration:
            pass
        return val

    def GetAttrRaw(self, attr_name):
        """
        Return the value, which may be UNESCAPED.
        """
        # Algorithm: search for QuotedValue or UnquotedValue after AttrName
        # TODO: Could also cache these
        start, end = self.GetSpanForAttrValue(attr_name)
        if start == -1:
            return None
        return self.s[start:end]

    def AllAttrsRaw(self):
        """
        Get a list of pairs [('class', 'foo'), ('href', '?foo=1&amp;bar=2')]

        The quoted values may be escaped. We would need another lexer to
        unescape them.
        """
        pairs = []
        events = self.Tokens()
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == AttrName:
                    name = self.s[start:end]

                    # The value should come next
                    tok_id, start, end = next(events)
                    if tok_id in (QuotedValue, UnquotedValue):
                        # Note: quoted values may have &amp;
                        # We would need ANOTHER lexer to unescape them, but we
                        # don't need that for ul-table

                        val = self.s[start:end]
                        pairs.append((name, val))
        except StopIteration:
            pass
        return pairs

    def Tokens(self):
        """
        Yields a sequence of tokens: Tag (AttrName AttrValue?)*

        Each token is (Type, start_pos, end_pos).

        Note that start and end are NOT redundant!  We skip over some unwanted
        characters.
        """
        m = _TAG_RE.match(self.s, self.start_pos + 1)
        if not m:
            raise RuntimeError("Couldn't find HTML tag in %r" %
                               self.TagString())
        yield TagName, m.start(1), m.end(1)

        pos = m.end(0)

        while True:
            # don't search past the end
            m = _ATTR_RE.match(self.s, pos, self.end_pos)
            if not m:
                # A validating parser would check that > or /> is next --
                # there's no junk
                break

            yield AttrName, m.start(1), m.end(1)

            # Quoted is group 2, unquoted is group 3.
            if m.group(2) is not None:
                yield QuotedValue, m.start(2), m.end(2)
            elif m.group(3) is not None:
                yield UnquotedValue, m.start(3), m.end(3)

            # Advance past the whole match, including the closing quote
            pos = m.end(0)
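
# Usage sketch (not in the original file):
#
#   s = '<a href="/faq.html" class=big>'
#   tag_lexer = TagLexer(s)
#   tag_lexer.Reset(0, len(s))
#   tag_lexer.TagName()           # => 'a'
#   tag_lexer.GetAttrRaw('href')  # => '/faq.html'
#   tag_lexer.AllAttrsRaw()       # => [('href', '/faq.html'),
#                                 #     ('class', 'big')]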


def ReadUntilStartTag(it, tag_lexer, tag_name):
    """Find the next <foo>, returning its (start, end) position.

    Raise ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == Tok.StartTag and tag_lexer.TagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No start tag %r' % tag_name)


def ReadUntilEndTag(it, tag_lexer, tag_name):
    """Find the next </foo>, returning its (start, end) position.

    Raise ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == Tok.EndTag and tag_lexer.TagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No end tag %r' % tag_name)
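
# Usage sketch (not in the original file): find the span of the first
# <table> in a document.
#
#   it = ValidTokens(contents)
#   tag_lexer = TagLexer(contents)
#   start, end = ReadUntilStartTag(it, tag_lexer, 'table')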


CHAR_ENTITY = {
    'amp': '&',
    'lt': '<',
    'gt': '>',
    'quot': '"',
}


def ToText(s, left_pos=0, right_pos=-1):
    """Given HTML, return text by unquoting &gt; and &lt; etc.

    Used by:
      doctools/oils_doc.py: PygmentsPlugin
      doctools/help_gen.py: HelpIndexCards

    In the latter case, we could process some tags, like:

    - Blue Link (not clickable, but still useful)
    - Red X

    That should be html.ToAnsi.
    """
    f = StringIO()
    out = Output(s, f, left_pos, right_pos)

    pos = left_pos
    for tok_id, end_pos in ValidTokens(s, left_pos, right_pos):
        if tok_id == Tok.RawData:
            out.SkipTo(pos)
            out.PrintUntil(end_pos)

        elif tok_id == Tok.CharEntity:  # &amp;

            entity = s[pos + 1:end_pos - 1]

            out.SkipTo(pos)
            out.Print(CHAR_ENTITY[entity])
            out.SkipTo(end_pos)

        # Not handling these yet
        elif tok_id == Tok.HexChar:
            raise AssertionError('Hex Char %r' % s[pos:pos + 20])

        elif tok_id == Tok.DecChar:
            raise AssertionError('Dec Char %r' % s[pos:pos + 20])

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()
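
# Example (sketch): ToText('1 &lt; 2') => '1 < 2', and
# ToText('<b>bold</b> text') => 'bold text'; tags are skipped and entities
# listed in CHAR_ENTITY are unquoted.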


# https://developer.mozilla.org/en-US/docs/Glossary/Void_element
VOID_ELEMENTS = [
    'area',
    'base',
    'br',
    'col',
    'embed',
    'hr',
    'img',
    'input',
    'link',
    'meta',
    'param',
    'source',
    'track',
    'wbr',
]


def main(argv):
    action = argv[1]

    if action in ('lex-tags', 'lex-attrs', 'lex-attr-values', 'well-formed'):
        num_tokens = 0
        num_start_tags = 0
        num_start_end_tags = 0
        num_attrs = 0
        max_tag_stack = 0

        errors = []
        i = 0
        for line in sys.stdin:
            name = line.strip()
            with open(name) as f:
                contents = f.read()

            tag_lexer = TagLexer(contents)
            lx = Lexer(contents)
            tokens = []
            start_pos = 0
            tag_stack = []
            try:
                while True:
                    tok_id, end_pos = lx.Read()

                    if tok_id == Tok.Invalid:
                        raise LexError(contents, start_pos)
                    if tok_id == Tok.EndOfStream:
                        break

                    tokens.append((tok_id, end_pos))

                    if tok_id == Tok.StartEndTag:
                        num_start_end_tags += 1
                        if action in ('lex-attrs', 'lex-attr-values',
                                      'well-formed'):
                            tag_lexer.Reset(start_pos, end_pos)
                            all_attrs = tag_lexer.AllAttrsRaw()
                            num_attrs += len(all_attrs)
                    elif tok_id == Tok.StartTag:
                        num_start_tags += 1
                        if action in ('lex-attrs', 'lex-attr-values',
                                      'well-formed'):
                            tag_lexer.Reset(start_pos, end_pos)
                            all_attrs = tag_lexer.AllAttrsRaw()

                        tag_name = lx.TagName()
                        # Don't bother to check void elements -- they're
                        # never closed, so they aren't pushed
                        if tag_name not in VOID_ELEMENTS:
                            tag_stack.append(tag_name)
                        max_tag_stack = max(max_tag_stack, len(tag_stack))
                    elif tok_id == Tok.EndTag:
                        try:
                            expected = tag_stack.pop()
                        except IndexError:
                            raise ParseError('Tag stack empty',
                                             s=contents,
                                             start_pos=start_pos)

                        actual = lx.TagName()
                        if expected != actual:
                            raise ParseError(
                                'Expected closing tag %r, got %r' %
                                (expected, actual),
                                s=contents,
                                start_pos=start_pos)

                    start_pos = end_pos
            except LexError as e:
                log('Lex error in %r: %s', name, e)
                errors.append((name, e))
            except ParseError as e:
                log('Parse error in %r: %s', name, e)
                errors.append((name, e))
            else:
                num_tokens += len(tokens)

            #print('%d %s' % (len(tokens), name))
            i += 1

        log('')
        log(
            ' %d tokens, %d start/end tags, %d start tags, %d attrs, '
            '%d max tag stack depth in %d files', num_tokens,
            num_start_end_tags, num_start_tags, num_attrs, max_tag_stack, i)
        log(' %d errors', len(errors))
        if 0:
            for name, e in errors:
                log('Error in %r: %s', name, e)

    else:
        raise RuntimeError('Invalid action %r' % action)


if __name__ == '__main__':
    main(sys.argv)