#!/usr/bin/env python2
"""
lazylex/html.py - Low-Level HTML Processing.

See lazylex/README.md for details.
"""
from __future__ import print_function

try:
    from cStringIO import StringIO
except ImportError:
    from io import StringIO  # python3
import re
import sys

if sys.version_info.major == 2:
    from typing import List, Tuple, Optional


def log(msg, *args):
    msg = msg % args
    print(msg, file=sys.stderr)


class LexError(Exception):
    """
    Examples of lex errors:

    - Tok.Invalid, like <> or &&
    - Unclosed <!-- <? <![CDATA[ <script> <style>
    """

    def __init__(self, s, start_pos):
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        return '(LexError %r)' % (self.s[self.start_pos:self.start_pos + 20])


def FindLineNum(s, error_pos):
    current_pos = 0
    line_num = 1
    while True:
        newline_pos = s.find('\n', current_pos)
        #log('current = %d, N %d, line %d', current_pos, newline_pos, line_num)

        if newline_pos == -1:  # this is the last line
            return line_num
        if newline_pos >= error_pos:
            return line_num
        line_num += 1
        current_pos = newline_pos + 1


class ParseError(Exception):
    """
    Examples of parse errors:

    - unbalanced tag structure
    - ul_table.py errors
    """

    def __init__(self, msg, s=None, start_pos=-1):
        self.msg = msg
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        if self.s is not None:
            assert self.start_pos != -1, self.start_pos
            snippet = self.s[self.start_pos:self.start_pos + 20]

            line_num = FindLineNum(self.s, self.start_pos)
        else:
            snippet = ''
            line_num = -1
        msg = 'line %d: %r %r' % (line_num, self.msg, snippet)
        return msg


class Output(object):
    """Takes an underlying input buffer and an output file. Maintains a
    position in the input buffer.

    Print FROM the input or print new text to the output.
    """

    def __init__(self, s, f, left_pos=0, right_pos=-1):
        self.s = s
        self.f = f
        self.pos = left_pos
        self.right_pos = len(s) if right_pos == -1 else right_pos

    def SkipTo(self, pos):
        """Skip to a position."""
        self.pos = pos

    def PrintUntil(self, pos):
        """Print from the current position until a position."""
        piece = self.s[self.pos:pos]
        self.f.write(piece)
        self.pos = pos

    def PrintTheRest(self):
        """Print until the end of the input."""
        self.PrintUntil(self.right_pos)

    def Print(self, s):
        """Print new text directly to the output file."""
        self.f.write(s)


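# Usage sketch for Output (illustrative, not exercised by this module): copy
# the input through unchanged, except for rewriting one span.
#
#   s = '<a href="/old">x</a>'
#   f = StringIO()
#   out = Output(s, f)
#   out.PrintUntil(9)       # copy '<a href="'
#   out.Print('/new')       # substitute new text
#   out.SkipTo(13)          # skip over '/old' in the input
#   out.PrintTheRest()      # copy '">x</a>'
#   # f.getvalue() == '<a href="/new">x</a>'
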
# HTML Tokens
# CommentBegin, ProcessingBegin, CDataBegin are "pseudo-tokens", not visible
TOKENS = 'Decl Comment CommentBegin Processing ProcessingBegin CData CDataBegin StartTag StartEndTag EndTag DecChar HexChar CharEntity RawData HtmlCData BadAmpersand Invalid EndOfStream'.split()


class Tok(object):
    """
    Avoid lint errors by using these aliases
    """
    pass


TOKEN_NAMES = [None] * len(TOKENS)  # type: List[str]

this_module = sys.modules[__name__]
for i, tok_str in enumerate(TOKENS):
    setattr(this_module, tok_str, i)
    setattr(Tok, tok_str, i)
    TOKEN_NAMES[i] = tok_str


def TokenName(tok_id):
    return TOKEN_NAMES[tok_id]


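# Each token name becomes an integer attribute of Tok (and of this module),
# and TokenName() maps the integer back, e.g.:
#
#   assert isinstance(Tok.StartTag, int)
#   assert TokenName(Tok.StartTag) == 'StartTag'
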
def MakeLexer(rules):
    return [(re.compile(pat, re.VERBOSE), i) for (pat, i) in rules]


#
# Eggex
#
# Tag = / ~['>']+ /

# Is this valid? A single character?
# Tag = / ~'>'* /

# Maybe better: / [NOT '>']+/
# capital letters not allowed there?
#
# But then this is confusing:
# / [NOT ~digit]+/
#
# / [NOT digit] / is [^\d]
# / ~digit / is \D
#
# Or maybe:
#
# / [~ digit]+ /
# / [~ '>']+ /
# / [NOT '>']+ /

# End = / '</' Tag '>' /
# StartEnd = / '<' Tag '/>' /
# Start = / '<' Tag '>' /
#
# EntityRef = / '&' dot{* N} ';' /

# Tag name, or attribute name
# colon is used in XML

# https://www.w3.org/TR/xml/#NT-Name
# Hm there is a lot of unicode stuff. We are simplifying parsing

_NAME = r'[a-zA-Z][a-zA-Z0-9:_\-]*'  # must start with letter

CHAR_LEX = [
    # Characters
    # https://www.w3.org/TR/xml/#sec-references
    (r'&\# [0-9]+ ;', Tok.DecChar),
    (r'&\# x[0-9a-fA-F]+ ;', Tok.HexChar),
    (r'& %s ;' % _NAME, Tok.CharEntity),
    # A bare & that's not part of an entity; HTML5 tolerates these
    (r'&', Tok.BadAmpersand),
]

LEXER = CHAR_LEX + [
    (r'<!--', Tok.CommentBegin),

    # Processing instructions are used for the XML header:
    # <?xml version="1.0" encoding="UTF-8"?>
    # They are technically XML-only, but in HTML5, they are another kind of
    # comment:
    #
    # https://developer.mozilla.org/en-US/docs/Web/API/ProcessingInstruction
    #
    (r'<\?', Tok.ProcessingBegin),
    # Not necessary in HTML5, but occurs in XML
    (r'<!\[CDATA\[', Tok.CDataBegin),  # <![CDATA[

    # Markup declarations
    # - In HTML5, there is only <!DOCTYPE html>
    # - XML has 4 more declarations: <!ELEMENT ...> ATTLIST ENTITY NOTATION
    #   - these seem to be part of DTD
    # - it's useful to skip these, and be able to parse the rest of the document
    # - Note: < is allowed?
    (r'<! [^>\x00]+ >', Tok.Decl),

    # Tags
    # Notes:
    # - We look for a valid tag name, but we don't validate attributes.
    #   That's done in the tag lexer.
    # - We don't allow leading whitespace
    (r'</ (%s) >' % _NAME, Tok.EndTag),
    # self-closing <br/> comes before StartTag
    # could/should these be collapsed into one rule?
    (r'< (%s) [^>\x00]* />' % _NAME, Tok.StartEndTag),  # self-closing <br/>
    (r'< (%s) [^>\x00]* >' % _NAME, Tok.StartTag),  # start <a>

    # HTML5 allows unescaped > in raw data, but < is not allowed.
    # https://stackoverflow.com/questions/10462348/right-angle-bracket-in-html
    #
    # - My early blog has THREE errors when disallowing >
    # - So do some .wwz files
    (r'[^&<\x00]+', Tok.RawData),
    (r'.', Tok.Invalid),  # error!
]

# Old notes:
#
# Non-greedy matches are regular and can be matched in linear time
# with RE2.
#
# https://news.ycombinator.com/item?id=27099798
#
# Maybe try combining all of these for speed.

# . is any char except newline
# https://re2c.org/manual/manual_c.html

# Discarded options
#(r'<!-- .*? -->', Tok.Comment),

# Hack from Claude: \s\S instead of re.DOTALL. I don't like this
#(r'<!-- [\s\S]*? -->', Tok.Comment),
#(r'<!-- (?:.|[\n])*? -->', Tok.Comment),

LEXER = MakeLexer(LEXER)


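# Note (illustrative): the rules above are tried in order, and the FIRST
# match wins:
#
#   for pat, tok_id in LEXER:
#       m = pat.match('<!-- c -->', 0)
#       if m:
#           break
#   assert tok_id == Tok.CommentBegin
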
class Lexer(object):

    def __init__(self, s, left_pos=0, right_pos=-1, no_special_tags=False):
        self.s = s
        self.pos = left_pos
        self.right_pos = len(s) if right_pos == -1 else right_pos
        self.no_special_tags = no_special_tags

        self.cache = {}  # string -> compiled regex pattern object

        # either </script> or </style> - we search until we see that
        self.search_state = None  # type: Optional[str]

        # Position of tag name, if applicable
        # - Set after you get a StartTag, EndTag, or StartEndTag
        # - Unset on other tags
        self.tag_pos_left = -1
        self.tag_pos_right = -1

    def _Peek(self):
        # type: () -> Tuple[int, int]
        """
        Return the next token id and its end position, without advancing
        self.pos.
        """
        if self.pos == self.right_pos:
            return Tok.EndOfStream, self.pos

        assert self.pos < self.right_pos, self.pos

        if self.search_state is not None and not self.no_special_tags:
            # TODO: case-insensitive search for </SCRIPT> <SCRipt> ?
            pos = self.s.find(self.search_state, self.pos)
            if pos == -1:
                # unterminated <script> or <style>
                raise LexError(self.s, self.pos)
            self.search_state = None
            # beginning
            return Tok.HtmlCData, pos

        # Find the first match.
        # Note: frontend/match.py uses _LongestMatch(), which is different!
        # TODO: reconcile them. This lexer should be expressible in re2c.

        for pat, tok_id in LEXER:
            m = pat.match(self.s, self.pos)
            if m:
                if tok_id in (Tok.StartTag, Tok.EndTag, Tok.StartEndTag):
                    self.tag_pos_left = m.start(1)
                    self.tag_pos_right = m.end(1)
                else:
                    # Reset state
                    self.tag_pos_left = -1
                    self.tag_pos_right = -1

                if tok_id == Tok.CommentBegin:
                    pos = self.s.find('-->', self.pos)
                    if pos == -1:
                        # unterminated <!--
                        raise LexError(self.s, self.pos)
                    return Tok.Comment, pos + 3  # -->

                if tok_id == Tok.ProcessingBegin:
                    pos = self.s.find('?>', self.pos)
                    if pos == -1:
                        # unterminated <?
                        raise LexError(self.s, self.pos)
                    return Tok.Processing, pos + 2  # ?>

                if tok_id == Tok.CDataBegin:
                    pos = self.s.find(']]>', self.pos)
                    if pos == -1:
                        # unterminated <![CDATA[
                        raise LexError(self.s, self.pos)
                    return Tok.CData, pos + 3  # ]]>

                if tok_id == Tok.StartTag:
                    if self.TagNameEquals('script'):
                        self.search_state = '</script>'
                    elif self.TagNameEquals('style'):
                        self.search_state = '</style>'

                return tok_id, m.end()
        else:
            raise AssertionError('Tok.Invalid rule should have matched')

    def TagNameEquals(self, expected):
        # type: (str) -> bool
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        # TODO: In C++, this does not need an allocation. Can we test
        # directly?
        return expected == self.CanonicalTagName()

    def CanonicalTagName(self):
        # type: () -> str
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        tag_name = self.s[self.tag_pos_left:self.tag_pos_right]
        # Most tags are already lower case, so avoid allocation with this conditional
        # TODO: this could go in the mycpp runtime?
        if tag_name.islower():
            return tag_name
        else:
            return tag_name.lower()

    def Read(self):
        # type: () -> Tuple[int, int]
        tok_id, end_pos = self._Peek()
        self.pos = end_pos  # advance
        return tok_id, end_pos

    def LookAhead(self, regex):
        # Cache the regex compilation. This could also be LookAheadFor(THEAD)
        # or something.
        pat = self.cache.get(regex)
        if pat is None:
            pat = re.compile(regex)
            self.cache[regex] = pat

        m = pat.match(self.s, self.pos)
        return m is not None


def _Tokens(s, left_pos, right_pos):
    """
    Args:
      s: string to parse
      left_pos, right_pos: Optional span boundaries.
    """
    lx = Lexer(s, left_pos, right_pos)
    while True:
        tok_id, pos = lx.Read()
        yield tok_id, pos
        if tok_id == Tok.EndOfStream:
            break


def ValidTokens(s, left_pos=0, right_pos=-1):
    """Wrapper around _Tokens to prevent callers from having to handle Invalid.

    The two functions aren't combined because we might want to do a 'yield'
    transformation on _Tokens()?  Exceptions might complicate the issue?
    """
    pos = left_pos
    for tok_id, end_pos in _Tokens(s, left_pos, right_pos):
        if tok_id == Tok.Invalid:
            raise LexError(s, pos)
        yield tok_id, end_pos
        pos = end_pos


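# Usage sketch (illustrative): tokens are (id, end_pos) pairs, and a token's
# text is recovered by slicing from the previous end position:
#
#   pos = 0
#   for tok_id, end_pos in ValidTokens('<p>hi</p>'):
#       print(TokenName(tok_id), repr('<p>hi</p>'[pos:end_pos]))
#       pos = end_pos
#
# which should print StartTag '<p>', RawData 'hi', EndTag '</p>', and
# EndOfStream ''.
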
def ValidTokenList(s, no_special_tags=False):
    """A wrapper that can be more easily translated to C++. Doesn't use iterators."""

    start_pos = 0
    tokens = []
    lx = Lexer(s, no_special_tags=no_special_tags)
    while True:
        tok_id, end_pos = lx.Read()
        tokens.append((tok_id, end_pos))
        if tok_id == Tok.EndOfStream:
            break
        if tok_id == Tok.Invalid:
            raise LexError(s, start_pos)
        start_pos = end_pos
    return tokens


# Tag names:
#   Match <a or </a
#   Match <h2, but not <2h
#
# HTML 5 doesn't restrict tag names at all
#   https://html.spec.whatwg.org/#toc-syntax
#
# XML allows : - .
#   https://www.w3.org/TR/xml/#NT-NameChar

# Namespaces for MathML, SVG
# XLink, XML, XMLNS
#
# https://infra.spec.whatwg.org/#namespaces
#
# Allow - for td-attrs

# Be very lenient - just no whitespace or special HTML chars
# I don't think this is more lenient than HTML5, though we should check.
_UNQUOTED_VALUE = r'''[^ \t\r\n<>&"'\x00]*'''

# TODO: we don't need to capture the tag name here? That's done at the top
# level
_TAG_RE = re.compile(r'/? \s* (%s)' % _NAME, re.VERBOSE)

_TAG_LAST_RE = re.compile(r'\s* /? >', re.VERBOSE)

# To match href="foo"
# Note: in HTML5 and XML, single quoted attributes are also valid

# <button disabled> is standard usage

# NOTE: This used to allow whitespace around =
# <a foo = "bar"> makes sense in XML
# But then you also have
# <a foo= bar> - which is TWO attributes, in HTML5
# So the space is problematic

_ATTR_RE = re.compile(
    r'''
\s+                   # Leading whitespace is required
(%s)                  # Attribute name
(?:                   # Optional attribute value
  \s* = \s*           # Spaces allowed around =
  (?:
    " ([^>"\x00]*) "  # double quoted value
  | ' ([^>'\x00]*) '  # single quoted value
  | (%s)              # unquoted value
  )
)?
''' % (_NAME, _UNQUOTED_VALUE), re.VERBOSE)

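# For example (illustrative), matching against ' href="/" selected':
#
#   m = _ATTR_RE.match(' href="/" selected', 0)
#   assert m.group(1) == 'href'  # attribute name
#   assert m.group(2) == '/'     # double quoted value
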
TagName, AttrName, UnquotedValue, QuotedValue, MissingValue = range(5)


class TagLexer(object):
    """
    Given a tag like <a href="..."> or <link type="..." />, the TagLexer
    provides a few operations:

    - What is the tag?
    - Iterate through the attributes, giving (name, value_start_pos, value_end_pos)
    """

    def __init__(self, s):
        self.s = s
        self.start_pos = -1  # Invalid
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        """Reuse instances of this object."""
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos

        self.start_pos = start_pos
        self.end_pos = end_pos

    def TagString(self):
        return self.s[self.start_pos:self.end_pos]

    def TagName(self):
        # First event
        tok_id, start, end = next(self.Tokens())
        return self.s[start:end]

    def GetSpanForAttrValue(self, attr_name):
        """
        Used by oils_doc.py, for href shortcuts
        """
        # Algorithm: search for QuotedValue or UnquotedValue after AttrName
        # TODO: Could also cache these

        events = self.Tokens()
        val = (-1, -1)
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == AttrName:
                    name = self.s[start:end]
                    if name == attr_name:
                        # The value should come next
                        tok_id, start, end = next(events)
                        assert tok_id in (QuotedValue, UnquotedValue,
                                          MissingValue), TokenName(tok_id)
                        val = start, end
                        break

        except StopIteration:
            pass
        return val

    def GetAttrRaw(self, attr_name):
        """
        Return the value, which may be UNESCAPED.
        """
        start, end = self.GetSpanForAttrValue(attr_name)
        if start == -1:
            return None
        return self.s[start:end]

    def AllAttrsRawSlice(self):
        """
        Get a list of triples, like [('class', 3, 5), ('href', 9, 12)]
        """
        slices = []
        events = self.Tokens()
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == AttrName:
                    name = self.s[start:end]

                    # The value should come next
                    tok_id, start, end = next(events)
                    assert tok_id in (QuotedValue, UnquotedValue,
                                      MissingValue), TokenName(tok_id)
                    # Note: quoted values may have &amp;
                    # We would need ANOTHER lexer to unescape them, but we
                    # don't need that for ul-table
                    slices.append((name, start, end))
        except StopIteration:
            pass
        return slices

    def AllAttrsRaw(self):
        """
        Get a list of pairs [('class', 'foo'), ('href', '?foo=1&amp;bar=2')]

        The quoted values may be escaped. We would need another lexer to
        unescape them.
        """
        slices = self.AllAttrsRawSlice()
        pairs = []
        for name, start, end in slices:
            pairs.append((name, self.s[start:end]))
        return pairs

    def Tokens(self):
        """
        Yields a sequence of tokens: Tag (AttrName AttrValue?)*

        Where each Token is (Type, start_pos, end_pos)

        Note that start and end are NOT redundant! We skip over some unwanted
        characters.
        """
        m = _TAG_RE.match(self.s, self.start_pos + 1)
        if not m:
            raise RuntimeError("Couldn't find HTML tag in %r" %
                               self.TagString())
        yield TagName, m.start(1), m.end(1)

        pos = m.end(0)
        #log('POS %d', pos)

        while True:
            # don't search past the end
            m = _ATTR_RE.match(self.s, pos, self.end_pos)
            if not m:
                #log('BREAK pos %d', pos)
                break
            #log('AttrName %r', m.group(1))

            yield AttrName, m.start(1), m.end(1)

            #log('m.groups() %r', m.groups())
            if m.group(2) is not None:
                # double quoted
                yield QuotedValue, m.start(2), m.end(2)
            elif m.group(3) is not None:
                # single quoted - TODO: could have different token types
                yield QuotedValue, m.start(3), m.end(3)
            elif m.group(4) is not None:
                yield UnquotedValue, m.start(4), m.end(4)
            else:
                # <button disabled>
                end = m.end(0)
                yield MissingValue, end, end

            # Skip past the value (and any closing quote)
            pos = m.end(0)

        #log('TOK %r', self.s)

        m = _TAG_LAST_RE.match(self.s, pos)
        #log('_TAG_LAST_RE match %r', self.s[pos:])
        if not m:
            # Extra data at end of tag. TODO: add messages for all these.
            raise LexError(self.s, pos)


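# Usage sketch (illustrative):
#
#   s = '<a href="/" class="big">'
#   tag_lexer = TagLexer(s)
#   tag_lexer.Reset(0, len(s))
#   tag_lexer.TagName()           # 'a'
#   tag_lexer.GetAttrRaw('href')  # '/'
#   tag_lexer.AllAttrsRaw()       # [('href', '/'), ('class', 'big')]
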
# This is similar but not identical to
#   " ([^>"\x00]*) "    # double quoted value
# | ' ([^>'\x00]*) '    # single quoted value
#
# Note: for unquoted values, & isn't allowed, and thus &amp; and &#99; and
# &#x99; are not allowed. We could relax that?
ATTR_VALUE_LEXER = CHAR_LEX + [
    (r'[^>&\x00]+', Tok.RawData),
    (r'.', Tok.Invalid),
]

ATTR_VALUE_LEXER = MakeLexer(ATTR_VALUE_LEXER)


class AttrValueLexer(object):
    """
    <a href="foo=99&amp;bar">
    <a href='foo=99&amp;bar'>
    <a href=unquoted>
    """

    def __init__(self, s):
        self.s = s
        self.start_pos = -1  # Invalid
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        """Reuse instances of this object."""
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos

        self.start_pos = start_pos
        self.end_pos = end_pos

    def NumTokens(self):
        num_tokens = 0
        pos = self.start_pos
        for tok_id, end_pos in self.Tokens():
            if tok_id == Tok.Invalid:
                raise LexError(self.s, pos)
            pos = end_pos
            #log('pos %d', pos)
            num_tokens += 1
        return num_tokens

    def Tokens(self):
        pos = self.start_pos
        while pos < self.end_pos:
            # Find the first match, like above.
            # Note: frontend/match.py uses _LongestMatch(), which is different!
            # TODO: reconcile them. This lexer should be expressible in re2c.
            for pat, tok_id in ATTR_VALUE_LEXER:
                m = pat.match(self.s, pos)
                if m:
                    if 0:
                        tok_str = m.group(0)
                        log('token = %r', tok_str)

                    end_pos = m.end(0)
                    yield tok_id, end_pos
                    pos = end_pos
                    break
            else:
                raise AssertionError('Tok.Invalid rule should have matched')


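# Usage sketch (illustrative), pairing it with TagLexer to find the span:
#
#   s = '<a href="foo=99&amp;bar">'
#   tag_lexer = TagLexer(s)
#   tag_lexer.Reset(0, len(s))
#   start, end = tag_lexer.GetSpanForAttrValue('href')
#
#   val_lexer = AttrValueLexer(s)
#   val_lexer.Reset(start, end)
#   val_lexer.NumTokens()  # 3: RawData, CharEntity, RawData
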
def ReadUntilStartTag(it, tag_lexer, tag_name):
    """Find the next <foo>, returning its (start, end) positions

    Raise ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == Tok.StartTag and tag_lexer.TagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No start tag %r' % tag_name)


def ReadUntilEndTag(it, tag_lexer, tag_name):
    """Find the next </foo>, returning its (start, end) position

    Raise ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == Tok.EndTag and tag_lexer.TagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No end tag %r' % tag_name)


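# Usage sketch (illustrative): the iterator and TagLexer share the document,
# and the same iterator can be consumed incrementally:
#
#   s = '<div><ul><li>x</li></ul></div>'
#   it = ValidTokens(s)
#   tag_lexer = TagLexer(s)
#   left, right = ReadUntilStartTag(it, tag_lexer, 'ul')  # (5, 9)
#   left, right = ReadUntilEndTag(it, tag_lexer, 'ul')    # (19, 24)
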
CHAR_ENTITY = {
    'amp': '&',
    'lt': '<',
    'gt': '>',
    'quot': '"',
    'apos': "'",
}


def ToText(s, left_pos=0, right_pos=-1):
    """Given HTML, return text by unquoting &gt; and &lt; etc.

    Used by:
      doctools/oils_doc.py: PygmentsPlugin
      doctools/help_gen.py: HelpIndexCards

    In the latter case, we could process some tags, like:

    - Blue Link (not clickable, but still useful)
    - Red X

    That should be html.ToAnsi.
    """
    f = StringIO()
    out = Output(s, f, left_pos, right_pos)

    pos = left_pos
    for tok_id, end_pos in ValidTokens(s, left_pos, right_pos):
        if tok_id in (Tok.RawData, Tok.BadAmpersand):
            out.SkipTo(pos)
            out.PrintUntil(end_pos)

        elif tok_id == Tok.CharEntity:  # &amp;

            entity = s[pos + 1:end_pos - 1]

            out.SkipTo(pos)
            out.Print(CHAR_ENTITY[entity])
            out.SkipTo(end_pos)

        # Not handling these yet
        elif tok_id == Tok.HexChar:
            raise AssertionError('Hex Char %r' % s[pos:pos + 20])

        elif tok_id == Tok.DecChar:
            raise AssertionError('Dec Char %r' % s[pos:pos + 20])

        else:
            # Skip everything else
            out.SkipTo(end_pos)

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()


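# Usage sketch (illustrative): tags are dropped, and known entities are
# decoded:
#
#   ToText('<b>x &amp; y</b>')  # returns 'x & y'
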
# https://developer.mozilla.org/en-US/docs/Glossary/Void_element
VOID_ELEMENTS = [
    'area',
    'base',
    'br',
    'col',
    'embed',
    'hr',
    'img',
    'input',
    'link',
    'meta',
    'param',
    'source',
    'track',
    'wbr',
]

LEX_ATTRS = 1 << 1
LEX_QUOTED_VALUES = 1 << 2  # href="?x=42&amp;y=99"
NO_SPECIAL_TAGS = 1 << 3  # <script> <style>, VOID tags, etc.
BALANCED_TAGS = 1 << 4  # are tags balanced?


def Validate(contents, flags, counters):
    # type: (str, int, Counters) -> None

    tag_lexer = TagLexer(contents)
    val_lexer = AttrValueLexer(contents)

    no_special_tags = bool(flags & NO_SPECIAL_TAGS)
    lx = Lexer(contents, no_special_tags=no_special_tags)
    tokens = []
    start_pos = 0
    tag_stack = []
    while True:
        tok_id, end_pos = lx.Read()
        #log('TOP %s %r', TokenName(tok_id), contents[start_pos:end_pos])

        if tok_id == Tok.Invalid:
            raise LexError(contents, start_pos)
        if tok_id == Tok.EndOfStream:
            break

        tokens.append((tok_id, end_pos))

        if tok_id == Tok.StartEndTag:
            counters.num_start_end_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRawSlice()
            counters.num_attrs += len(all_attrs)
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                counters.num_val_tokens += val_lexer.NumTokens()

            counters.debug_attrs.extend(all_attrs)

        elif tok_id == Tok.StartTag:
            counters.num_start_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRawSlice()
            counters.num_attrs += len(all_attrs)
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                counters.num_val_tokens += val_lexer.NumTokens()

            counters.debug_attrs.extend(all_attrs)

            if flags & BALANCED_TAGS:
                tag_name = lx.CanonicalTagName()
                if flags & NO_SPECIAL_TAGS:
                    tag_stack.append(tag_name)
                else:
                    # e.g. <meta> is considered self-closing, like <meta/>
                    if tag_name not in VOID_ELEMENTS:
                        tag_stack.append(tag_name)

                counters.max_tag_stack = max(counters.max_tag_stack,
                                             len(tag_stack))
        elif tok_id == Tok.EndTag:
            if flags & BALANCED_TAGS:
                try:
                    expected = tag_stack.pop()
                except IndexError:
                    raise ParseError('Tag stack empty',
                                     s=contents,
                                     start_pos=start_pos)

                actual = lx.CanonicalTagName()
                if expected != actual:
                    raise ParseError(
                        'Got unexpected closing tag %r; opening tag was %r' %
                        (contents[start_pos:end_pos], expected),
                        s=contents,
                        start_pos=start_pos)

        start_pos = end_pos

    if len(tag_stack) != 0:
        raise ParseError('Missing closing tags at end of doc: %s' %
                         ' '.join(tag_stack),
                         s=contents,
                         start_pos=start_pos)

    counters.num_tokens += len(tokens)


class Counters(object):

    def __init__(self):
        self.num_tokens = 0
        self.num_start_tags = 0
        self.num_start_end_tags = 0
        self.num_attrs = 0
        self.max_tag_stack = 0
        self.num_val_tokens = 0

        self.debug_attrs = []


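# Usage sketch (illustrative):
#
#   counters = Counters()
#   Validate('<ul><li>one</li></ul>', BALANCED_TAGS, counters)
#   counters.num_start_tags  # 2
#
# An unbalanced document like '<ul></li>' raises ParseError instead.
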
def main(argv):
    action = argv[1]

    if action == 'tokens':
        contents = sys.stdin.read()

        lx = Lexer(contents)
        start_pos = 0
        while True:
            tok_id, end_pos = lx.Read()
            if tok_id == Tok.Invalid:
                raise LexError(contents, start_pos)
            if tok_id == Tok.EndOfStream:
                break

            frag = contents[start_pos:end_pos]
            log('%d %s %r', end_pos, TokenName(tok_id), frag)
            start_pos = end_pos

        return 0

    elif action in ('lex-htm8', 'parse-htm8', 'parse-xml'):

        errors = []
        counters = Counters()

        flags = LEX_ATTRS | LEX_QUOTED_VALUES
        if action.startswith('parse-'):
            flags |= BALANCED_TAGS
        if action == 'parse-xml':
            flags |= NO_SPECIAL_TAGS

        i = 0
        for line in sys.stdin:
            filename = line.strip()
            with open(filename) as f:
                contents = f.read()

            try:
                Validate(contents, flags, counters)
            except LexError as e:
                log('Lex error in %r: %s', filename, e)
                errors.append((filename, e))
            except ParseError as e:
                log('Parse error in %r: %s', filename, e)
                errors.append((filename, e))
            i += 1

        log('')
        log('%10d tokens', counters.num_tokens)
        log('%10d start/end tags', counters.num_start_end_tags)
        log('%10d start tags', counters.num_start_tags)
        log('%10d attrs', counters.num_attrs)
        log('%10d max tag stack depth', counters.max_tag_stack)
        log('%10d attr val tokens', counters.num_val_tokens)
        log('%10d errors', len(errors))
        if len(errors):
            return 1
        return 0

    elif action == 'todo':
        # Other algorithms:
        #
        # - select first subtree with given ID
        #   - this requires understanding the void tags I suppose
        # - select all subtrees that have a class
        # - materialize DOM

        # Safe-HTM8? This is a filter
        return 0

    else:
        raise RuntimeError('Invalid action %r' % action)


if __name__ == '__main__':
    sys.exit(main(sys.argv))