#!/usr/bin/env python2
"""
lazylex/html.py - Low-Level HTML Processing.

See lazylex/README.md for details.

Conflicts between HTML5 and XML:

- In XML, <source> is like any tag, and must be closed.
- In HTML, <source> is a VOID tag, and must NOT be closed.

- In XML, <script> and <style> don't have special treatment.
- In HTML, they do.

- The header is different - <!DOCTYPE html> vs. <?xml version= ... ?>

So should we have a mode for <script> <style> and void tags?  Upgrade HX8
into HTM8?
"""
from __future__ import print_function

try:
    from cStringIO import StringIO
except ImportError:
    from io import StringIO  # python3
import re
import sys

if sys.version_info.major == 2:
    from typing import List, Tuple, Optional


def log(msg, *args):
    msg = msg % args
    print(msg, file=sys.stderr)


class LexError(Exception):
    """
    Examples of lex errors:

    - Tok.Invalid, like <> or &&
    - Unclosed <!-- <? <![CDATA[ <script> <style>
    """

    def __init__(self, s, start_pos):
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        return '(LexError %r)' % (self.s[self.start_pos:self.start_pos + 20])


def FindLineNum(s, error_pos):
    current_pos = 0
    line_num = 1
    while True:
        newline_pos = s.find('\n', current_pos)
        #log('current = %d, N %d, line %d', current_pos, newline_pos, line_num)

        if newline_pos == -1:  # this is the last line
            return line_num
        if newline_pos >= error_pos:
            return line_num
        line_num += 1
        current_pos = newline_pos + 1
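

# Illustrative sketch, added for this edit and not part of the original
# module: a worked example of FindLineNum.  Positions are 0-based; line
# numbers are 1-based.
def _DemoFindLineNum():
    return FindLineNum('ab\ncd\nef', 4)  # position 4 is 'd', on line 2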


class ParseError(Exception):
    """
    Examples of parse errors:

    - unbalanced tag structure
    - ul_table.py errors
    """

    def __init__(self, msg, s=None, start_pos=-1):
        self.msg = msg
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        if self.s is not None:
            assert self.start_pos != -1, self.start_pos
            snippet = (self.s[self.start_pos:self.start_pos + 20])

            line_num = FindLineNum(self.s, self.start_pos)
        else:
            snippet = ''
            line_num = -1
        msg = 'line %d: %r %r' % (line_num, self.msg, snippet)
        return msg


class Output(object):
    """Takes an underlying input buffer and an output file.  Maintains a
    position in the input buffer.

    Print FROM the input, or print new text to the output.
    """

    def __init__(self, s, f, left_pos=0, right_pos=-1):
        self.s = s
        self.f = f
        self.pos = left_pos
        self.right_pos = len(s) if right_pos == -1 else right_pos

    def SkipTo(self, pos):
        """Skip to a position."""
        self.pos = pos

    def PrintUntil(self, pos):
        """Print from the current position until a position."""
        piece = self.s[self.pos:pos]
        self.f.write(piece)
        self.pos = pos

    def PrintTheRest(self):
        """Print until the end of the string."""
        self.PrintUntil(self.right_pos)

    def Print(self, s):
        """Print new text to the output file."""
        self.f.write(s)
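

# Illustrative sketch, added for this edit and not part of the original
# module: Output supports "splice" edits - copy input up to a position, emit
# new text, then skip the span that was replaced.
def _DemoOutput():
    s = 'x <b>y</b> z'
    f = StringIO()
    out = Output(s, f)
    out.PrintUntil(2)  # copy 'x ' from the input
    out.Print('<strong>')  # emit replacement text
    out.SkipTo(5)  # skip over the original '<b>'
    out.PrintTheRest()  # copy the remainder unchanged
    return f.getvalue()  # 'x <strong>y</b> z'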


# HTML Tokens
# CommentBegin, ProcessingBegin, CDataBegin are "pseudo-tokens", not visible
TOKENS = ('Decl Comment CommentBegin Processing ProcessingBegin CData '
          'CDataBegin StartTag StartEndTag EndTag DecChar HexChar CharEntity '
          'RawData HtmlCData Invalid EndOfStream').split()


class Tok(object):
    """
    Avoid lint errors by using these aliases
    """
    pass


TOKEN_NAMES = [None] * len(TOKENS)  # type: List[str]

this_module = sys.modules[__name__]
for i, tok_str in enumerate(TOKENS):
    setattr(this_module, tok_str, i)
    setattr(Tok, tok_str, i)
    TOKEN_NAMES[i] = tok_str


def TokenName(tok_id):
    return TOKEN_NAMES[tok_id]
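

# Illustrative sketch, added for this edit and not part of the original
# module: the loop above turns each name in TOKENS into an integer attribute
# on Tok, and TokenName() maps the integer back.
def _DemoTokenName():
    assert TokenName(Tok.StartTag) == 'StartTag'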


def MakeLexer(rules):
    return [(re.compile(pat, re.VERBOSE), i) for (pat, i) in rules]


#
# Eggex
#
# Tag = / ~['>']+ /

# Is this valid?  A single character?
# Tag = / ~'>'* /

# Maybe better: / [NOT '>']+/
# capital letters not allowed there?
#
# But then this is confusing:
#   / [NOT ~digit]+/
#
# / [NOT digit] / is [^\d]
# / ~digit /     is \D
#
# Or maybe:
#
# / [~ digit]+ /
# / [~ '>']+ /
# / [NOT '>']+ /

# End = / '</' Tag '>' /
# StartEnd = / '<' Tag '/>' /
# Start = / '<' Tag '>' /
#
# EntityRef = / '&' dot{* N} ';' /

# Tag name, or attribute name
# colon is used in XML

# https://www.w3.org/TR/xml/#NT-Name
# Hm there is a lot of unicode stuff.  We are simplifying parsing

_NAME = r'[a-zA-Z][a-zA-Z0-9:_\-]*'  # must start with letter
LEXER = [
    (r'<!--', Tok.CommentBegin),

    # Processing instructions are used for the XML header:
    # <?xml version="1.0" encoding="UTF-8"?>
    # They are technically XML-only, but in HTML5, they are another kind of
    # comment:
    #
    # https://developer.mozilla.org/en-US/docs/Web/API/ProcessingInstruction
    #
    (r'<\?', Tok.ProcessingBegin),
    # Not necessary in HTML5, but occurs in XML
    (r'<!\[CDATA\[', Tok.CDataBegin),  # <![CDATA[

    # Markup declarations
    # - In HTML5, there is only <!DOCTYPE html>
    # - XML has 4 more declarations: <!ELEMENT ...> ATTLIST ENTITY NOTATION
    #   - these seem to be part of DTD
    # - it's useful to skip these, and be able to parse the rest of the
    #   document
    # - Note: < is allowed?
    (r'<! [^>]+ >', Tok.Decl),

    # Tags
    # Notes:
    # - We look for a valid tag name, but we don't validate attributes.
    #   That's done in the tag lexer.
    # - We don't allow leading whitespace
    (r'</ (%s) >' % _NAME, Tok.EndTag),
    # self-closing <br/> comes before StartTag
    # could/should these be collapsed into one rule?
    (r'< (%s) [^>]* />' % _NAME, Tok.StartEndTag),  # self-closing <br/>
    (r'< (%s) [^>]* >' % _NAME, Tok.StartTag),  # start <a>

    # Characters
    # https://www.w3.org/TR/xml/#sec-references
    (r'&\# [0-9]+ ;', Tok.DecChar),
    (r'&\# x[0-9a-fA-F]+ ;', Tok.HexChar),
    (r'& %s ;' % _NAME, Tok.CharEntity),

    # HTML5 allows unescaped > in raw data, but < is not allowed.
    # https://stackoverflow.com/questions/10462348/right-angle-bracket-in-html
    #
    # - My early blog has THREE errors when disallowing >
    # - So do some .wwz files
    (r'[^&<]+', Tok.RawData),
    (r'.', Tok.Invalid),  # error!
]

# Old notes:
#
# Non-greedy matches are regular and can be matched in linear time
# with RE2.
#
# https://news.ycombinator.com/item?id=27099798
#
# Maybe try combining all of these for speed.

# . is any char except newline
# https://re2c.org/manual/manual_c.html

# Discarded options
#(r'<!-- .*? -->', Tok.Comment),

# Hack from Claude: \s\S instead of re.DOTALL.  I don't like this
#(r'<!-- [\s\S]*? -->', Tok.Comment),
#(r'<!-- (?:.|[\n])*? -->', Tok.Comment),

LEXER = MakeLexer(LEXER)


class Lexer(object):

    def __init__(self, s, left_pos=0, right_pos=-1, no_special_tags=False):
        self.s = s
        self.pos = left_pos
        self.right_pos = len(s) if right_pos == -1 else right_pos
        self.no_special_tags = no_special_tags

        self.cache = {}  # string -> compiled regex pattern object

        # either </script> or </style> - we search until we see that
        self.search_state = None  # type: Optional[str]

        # Position of tag name, if applicable
        # - Set after you get a StartTag, EndTag, or StartEndTag
        # - Unset on other tags
        self.tag_pos_left = -1
        self.tag_pos_right = -1

    def _Peek(self):
        # type: () -> Tuple[int, int]
        """
        Note: not using _Peek() now
        """
        if self.pos == self.right_pos:
            return Tok.EndOfStream, self.pos

        assert self.pos < self.right_pos, self.pos

        if self.search_state is not None and not self.no_special_tags:
            pos = self.s.find(self.search_state, self.pos)
            if pos == -1:
                # unterminated <script> or <style>
                raise LexError(self.s, self.pos)
            self.search_state = None
            # beginning
            return Tok.HtmlCData, pos

        # Find the first match.
        # Note: frontend/match.py uses _LongestMatch(), which is different!
        # TODO: reconcile them.  This lexer should be expressible in re2c.

        for pat, tok_id in LEXER:
            m = pat.match(self.s, self.pos)
            if m:
                if tok_id in (Tok.StartTag, Tok.EndTag, Tok.StartEndTag):
                    self.tag_pos_left = m.start(1)
                    self.tag_pos_right = m.end(1)
                else:
                    # Reset state
                    self.tag_pos_left = -1
                    self.tag_pos_right = -1

                if tok_id == Tok.CommentBegin:
                    pos = self.s.find('-->', self.pos)
                    if pos == -1:
                        # unterminated <!--
                        raise LexError(self.s, self.pos)
                    return Tok.Comment, pos + 3  # -->

                if tok_id == Tok.ProcessingBegin:
                    pos = self.s.find('?>', self.pos)
                    if pos == -1:
                        # unterminated <?
                        raise LexError(self.s, self.pos)
                    return Tok.Processing, pos + 2  # ?>

                if tok_id == Tok.CDataBegin:
                    pos = self.s.find(']]>', self.pos)
                    if pos == -1:
                        # unterminated <![CDATA[
                        raise LexError(self.s, self.pos)
                    return Tok.CData, pos + 3  # ]]>

                if tok_id == Tok.StartTag:
                    if self.TagNameEquals('script'):
                        self.search_state = '</script>'
                    elif self.TagNameEquals('style'):
                        self.search_state = '</style>'

                return tok_id, m.end()
        else:
            raise AssertionError('Tok.Invalid rule should have matched')

    def TagNameEquals(self, expected):
        # type: (str) -> bool
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        # TODO: In C++, this does not need an allocation
        # TODO: conditionally lower() case here (maybe not in XML mode)
        return expected == self.s[self.tag_pos_left:self.tag_pos_right]

    def TagName(self):
        # type: () -> str
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        # TODO: conditionally lower() case here (maybe not in XML mode)
        return self.s[self.tag_pos_left:self.tag_pos_right]

    def Read(self):
        # type: () -> Tuple[int, int]
        tok_id, end_pos = self._Peek()
        self.pos = end_pos  # advance
        return tok_id, end_pos

    def LookAhead(self, regex):
        # Cache the regex compilation.  This could also be LookAheadFor(THEAD)
        # or something.
        pat = self.cache.get(regex)
        if pat is None:
            pat = re.compile(regex)
            self.cache[regex] = pat

        m = pat.match(self.s, self.pos)
        return m is not None
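

# Illustrative sketch, added for this edit and not part of the original
# module: drive the Lexer by hand.  Read() returns (tok_id, end_pos); the
# token's text is the span between the previous end position and the new one.
def _DemoLexer():
    s = '<p class="hi">A &amp; B</p>'
    lx = Lexer(s)
    tokens = []
    pos = 0
    while True:
        tok_id, end_pos = lx.Read()
        if tok_id == Tok.EndOfStream:
            break
        tokens.append((TokenName(tok_id), s[pos:end_pos]))
        pos = end_pos
    return tokens
    # [('StartTag', '<p class="hi">'), ('RawData', 'A '),
    #  ('CharEntity', '&amp;'), ('RawData', ' B'), ('EndTag', '</p>')]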


def _Tokens(s, left_pos, right_pos):
    """
    Args:
      s: string to parse
      left_pos, right_pos: Optional span boundaries.
    """
    lx = Lexer(s, left_pos, right_pos)
    while True:
        tok_id, pos = lx.Read()
        yield tok_id, pos
        if tok_id == Tok.EndOfStream:
            break


def ValidTokens(s, left_pos=0, right_pos=-1):
    """Wrapper around _Tokens to prevent callers from having to handle Invalid.

    I'm not combining the two functions because I might want to do a
    'yield' transformation on Tokens()?  Exceptions might complicate the
    issue?
    """
    pos = left_pos
    for tok_id, end_pos in _Tokens(s, left_pos, right_pos):
        if tok_id == Tok.Invalid:
            raise LexError(s, pos)
        yield tok_id, end_pos
        pos = end_pos
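

# Illustrative sketch, added for this edit and not part of the original
# module: '&&' hits the Tok.Invalid rule (see the LexError docstring), so
# ValidTokens raises instead of yielding Invalid.
def _DemoValidTokens():
    try:
        list(ValidTokens('a && b'))
    except LexError as e:
        return str(e)  # shows the offending text, starting at '&& b'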


def ValidTokenList(s, no_special_tags=False):
    """A wrapper that can be more easily translated to C++.  Doesn't use
    iterators."""

    start_pos = 0
    tokens = []
    lx = Lexer(s, no_special_tags=no_special_tags)
    while True:
        tok_id, end_pos = lx.Read()
        tokens.append((tok_id, end_pos))
        if tok_id == Tok.EndOfStream:
            break
        if tok_id == Tok.Invalid:
            raise LexError(s, start_pos)
        start_pos = end_pos
    return tokens


# Tag names:
#   Match <a or </a
#   Match <h2, but not <2h
#
# HTML 5 doesn't restrict tag names at all
#   https://html.spec.whatwg.org/#toc-syntax
#
# XML allows : - .
#   https://www.w3.org/TR/xml/#NT-NameChar

# Namespaces for MathML, SVG
# XLink, XML, XMLNS
#
# https://infra.spec.whatwg.org/#namespaces
#
# Allow - for td-attrs

# Be very lenient - just no whitespace or special HTML chars
# I don't think this is more lenient than HTML5, though we should check.
_UNQUOTED_VALUE = r'''[^\x00 \t\r\n<>&"']*'''

# TODO: we don't need to capture the tag name here?  That's done at the top
# level
_TAG_RE = re.compile(r'/? \s* (%s)' % _NAME, re.VERBOSE)

_TAG_LAST_RE = re.compile(r'\s* /? >', re.VERBOSE)

# To match href="foo"
# Note: in HTML5 and XML, single quoted attributes are also valid

# <button disabled> is standard usage

_ATTR_RE = re.compile(
    r'''
\s+           # Leading whitespace is required
(%s)          # Attribute name
(?:           # Optional attribute value
  \s* = \s*
  (?:
    " ([^>"]*) "   # double quoted value
  | ' ([^>']*) '   # single quoted value
  | (%s)           # Attribute value
  )
)?
''' % (_NAME, _UNQUOTED_VALUE), re.VERBOSE)
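

# Illustrative sketch, added for this edit and not part of the original
# module: which groups of _ATTR_RE fire.  Group 2 is a double-quoted value,
# group 4 an unquoted value; a bare attribute like 'disabled' has no value
# groups at all.
def _DemoAttrRe():
    m = _ATTR_RE.match(' href="foo"')
    assert (m.group(1), m.group(2)) == ('href', 'foo')
    m = _ATTR_RE.match(' disabled>')
    assert (m.group(1), m.group(2)) == ('disabled', None)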


TagName, AttrName, UnquotedValue, QuotedValue = range(4)


class TagLexer(object):
    """
    Given a tag like <a href="..."> or <link type="..." />, the TagLexer
    provides a few operations:

    - What is the tag?
    - Iterate through the attributes, giving (name, value_start_pos,
      value_end_pos)
    """

    def __init__(self, s):
        self.s = s
        self.start_pos = -1  # Invalid
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        """Reuse instances of this object."""
        self.start_pos = start_pos
        self.end_pos = end_pos

    def TagString(self):
        return self.s[self.start_pos:self.end_pos]

    def TagName(self):
        # First event
        tok_id, start, end = next(self.Tokens())
        return self.s[start:end]

    def GetSpanForAttrValue(self, attr_name):
        """
        Used by oils_doc.py, for href shortcuts
        """
        # Algorithm: search for QuotedValue or UnquotedValue after AttrName
        # TODO: Could also cache these

        events = self.Tokens()
        val = (-1, -1)
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == AttrName:
                    name = self.s[start:end]
                    if name == attr_name:
                        # The value should come next
                        tok_id, start, end = next(events)
                        if tok_id in (QuotedValue, UnquotedValue):
                            # Note: quoted values may have &amp;
                            # We would need ANOTHER lexer to unescape them.
                            # Right now help_gen.py and oils_doc.py don't
                            # need that.
                            val = start, end
                            break

        except StopIteration:
            pass
        return val

    def GetAttrRaw(self, attr_name):
        """
        Return the value, which may be UNESCAPED.
        """
        start, end = self.GetSpanForAttrValue(attr_name)
        if start == -1:
            return None
        return self.s[start:end]

    def AllAttrsRaw(self):
        """
        Get a list of pairs [('class', 'foo'), ('href', '?foo=1&amp;bar=2')]

        The quoted values may be escaped.  We would need another lexer to
        unescape them.
        """
        pairs = []
        events = self.Tokens()
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == AttrName:
                    name = self.s[start:end]

                    # The value should come next
                    tok_id, start, end = next(events)
                    if tok_id in (QuotedValue, UnquotedValue):
                        # Note: quoted values may have &amp;
                        # We would need ANOTHER lexer to unescape them, but we
                        # don't need that for ul-table

                        val = self.s[start:end]
                        pairs.append((name, val))
        except StopIteration:
            pass
        return pairs

    def Tokens(self):
        """
        Yields a sequence of tokens: Tag (AttrName AttrValue?)*

        Where each Token is (Type, start_pos, end_pos)

        Note that start and end are NOT redundant!  We skip over some unwanted
        characters.
        """
        m = _TAG_RE.match(self.s, self.start_pos + 1)
        if not m:
            raise RuntimeError("Couldn't find HTML tag in %r" %
                               self.TagString())
        yield TagName, m.start(1), m.end(1)

        pos = m.end(0)
        #log('POS %d', pos)

        while True:
            # don't search past the end
            m = _ATTR_RE.match(self.s, pos, self.end_pos)
            if not m:
                #log('BREAK pos %d', pos)
                break
            #log('AttrName %r', m.group(1))

            yield AttrName, m.start(1), m.end(1)

            if m.group(2) is not None:
                # double quoted
                yield QuotedValue, m.start(2), m.end(2)
            elif m.group(3) is not None:
                # single quoted - TODO: could have different token types
                yield QuotedValue, m.start(3), m.end(3)
            elif m.group(4) is not None:
                yield UnquotedValue, m.start(4), m.end(4)

            # Skip past the closing quote (or the end of the unquoted value)
            pos = m.end(0)

        #log('TOK %r', self.s)

        m = _TAG_LAST_RE.match(self.s, pos)
        #log('_TAG_LAST_RE match %r', self.s[pos:])
        if not m:
            # Extra data at end of tag.  TODO: add messages for all these.
            raise LexError(self.s, pos)
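

# Illustrative sketch, added for this edit and not part of the original
# module: Reset() points the lexer at one tag's span; AllAttrsRaw() returns
# (name, value) pairs with the values still HTML-escaped.
def _DemoTagLexer():
    s = '<a href="?x=1&amp;y=2" class="big">'
    tag_lexer = TagLexer(s)
    tag_lexer.Reset(0, len(s))
    return tag_lexer.TagName(), tag_lexer.AllAttrsRaw()
    # ('a', [('href', '?x=1&amp;y=2'), ('class', 'big')])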


def ReadUntilStartTag(it, tag_lexer, tag_name):
    """Find the next <foo>, returning its (start, end) positions

    Raise ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == Tok.StartTag and tag_lexer.TagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No start tag %r' % tag_name)
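

# Illustrative sketch, added for this edit and not part of the original
# module: pair ReadUntilStartTag with the ValidTokens iterator to find a
# tag's span.
def _DemoReadUntilStartTag():
    s = '<div><p>hi</p></div>'
    tag_lexer = TagLexer(s)
    return ReadUntilStartTag(ValidTokens(s), tag_lexer, 'p')  # (5, 8)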


def ReadUntilEndTag(it, tag_lexer, tag_name):
    """Find the next </foo>, returning its (start, end) position

    Raise ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == Tok.EndTag and tag_lexer.TagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No end tag %r' % tag_name)


CHAR_ENTITY = {
    'amp': '&',
    'lt': '<',
    'gt': '>',
    'quot': '"',
}


def ToText(s, left_pos=0, right_pos=-1):
    """Given HTML, return text by unquoting &gt; and &lt; etc.

    Used by:
        doctools/oils_doc.py: PygmentsPlugin
        doctools/help_gen.py: HelpIndexCards

    In the latter case, we could process some tags, like:

    - Blue Link (not clickable, but still useful)
    - Red X

    That should be html.ToAnsi.
    """
    f = StringIO()
    out = Output(s, f, left_pos, right_pos)

    pos = left_pos
    for tok_id, end_pos in ValidTokens(s, left_pos, right_pos):
        if tok_id == Tok.RawData:
            out.SkipTo(pos)
            out.PrintUntil(end_pos)

        elif tok_id == Tok.CharEntity:  # &amp;

            entity = s[pos + 1:end_pos - 1]

            out.SkipTo(pos)
            out.Print(CHAR_ENTITY[entity])
            out.SkipTo(end_pos)

        # Not handling these yet
        elif tok_id == Tok.HexChar:
            raise AssertionError('Hex Char %r' % s[pos:pos + 20])

        elif tok_id == Tok.DecChar:
            raise AssertionError('Dec Char %r' % s[pos:pos + 20])

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()
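

# Illustrative sketch, added for this edit and not part of the original
# module: tags are dropped, and the four entities in CHAR_ENTITY are
# unescaped.  Numeric references like &#64; would hit the AssertionError
# above.
def _DemoToText():
    return ToText('1 &lt; 2 &amp;&amp; 4 &gt; 3')  # '1 < 2 && 4 > 3'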


# https://developer.mozilla.org/en-US/docs/Glossary/Void_element
VOID_ELEMENTS = [
    'area',
    'base',
    'br',
    'col',
    'embed',
    'hr',
    'img',
    'input',
    'link',
    'meta',
    'param',
    'source',
    'track',
    'wbr',
]

LEX_ATTRS = 1 << 1
LEX_QUOTED_VALUES = 1 << 2  # href="?x=42&amp;y=99"
NO_SPECIAL_TAGS = 1 << 3  # <script> <style>, VOID tags, etc.
BALANCED_TAGS = 1 << 4  # are tags balanced?


def Validate(contents, flags, counters):
    # type: (str, int, Counters) -> None

    tag_lexer = TagLexer(contents)
    no_special_tags = bool(flags & NO_SPECIAL_TAGS)
    lx = Lexer(contents, no_special_tags=no_special_tags)
    tokens = []
    start_pos = 0
    tag_stack = []
    while True:
        tok_id, end_pos = lx.Read()

        if tok_id == Tok.Invalid:
            raise LexError(contents, start_pos)
        if tok_id == Tok.EndOfStream:
            break

        tokens.append((tok_id, end_pos))

        if tok_id == Tok.StartEndTag:
            counters.num_start_end_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRaw()
            counters.num_attrs += len(all_attrs)
            counters.debug_attrs.extend(all_attrs)

        elif tok_id == Tok.StartTag:
            counters.num_start_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRaw()
            counters.num_attrs += len(all_attrs)
            counters.debug_attrs.extend(all_attrs)

            if flags & BALANCED_TAGS:
                tag_name = lx.TagName()
                if flags & NO_SPECIAL_TAGS:
                    tag_stack.append(tag_name)
                else:
                    # e.g. <meta> is considered self-closing, like <meta/>
                    if tag_name not in VOID_ELEMENTS:
                        tag_stack.append(tag_name)

            counters.max_tag_stack = max(counters.max_tag_stack,
                                         len(tag_stack))
        elif tok_id == Tok.EndTag:
            if flags & BALANCED_TAGS:
                try:
                    expected = tag_stack.pop()
                except IndexError:
                    raise ParseError('Tag stack empty',
                                     s=contents,
                                     start_pos=start_pos)

                actual = lx.TagName()
                if expected != actual:
                    raise ParseError(
                        'Got unexpected closing tag %r; opening tag was %r' %
                        (contents[start_pos:end_pos], expected),
                        s=contents,
                        start_pos=start_pos)

        start_pos = end_pos

    if len(tag_stack) != 0:
        raise ParseError('Missing closing tags at end of doc: %s' %
                         ' '.join(tag_stack),
                         s=contents,
                         start_pos=start_pos)

    counters.num_tokens += len(tokens)
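

# Illustrative sketch, added for this edit and not part of the original
# module: validate a fragment with tag balancing.  <br> is a void element,
# so it doesn't need a closing tag in HTML mode.
def _DemoValidate():
    counters = Counters()
    flags = LEX_ATTRS | LEX_QUOTED_VALUES | BALANCED_TAGS
    Validate('<p class="x">a<br>b</p>', flags, counters)
    return counters.num_start_tags, counters.num_attrs  # (2, 1)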


class Counters(object):

    def __init__(self):
        self.num_tokens = 0
        self.num_start_tags = 0
        self.num_start_end_tags = 0
        self.num_attrs = 0
        self.max_tag_stack = 0

        self.debug_attrs = []


def main(argv):
    action = argv[1]

    if action == 'tokens':
        contents = sys.stdin.read()

        lx = Lexer(contents)
        start_pos = 0
        while True:
            tok_id, end_pos = lx.Read()
            if tok_id == Tok.Invalid:
                raise LexError(contents, start_pos)
            if tok_id == Tok.EndOfStream:
                break

            frag = contents[start_pos:end_pos]
            log('%d %s %r', end_pos, TokenName(tok_id), frag)
            start_pos = end_pos

        return 0

    elif action in ('lex-htm8', 'parse-htm8', 'parse-xml'):

        errors = []
        counters = Counters()

        flags = LEX_ATTRS | LEX_QUOTED_VALUES
        if action.startswith('parse-'):
            flags |= BALANCED_TAGS
        if action == 'parse-xml':
            flags |= NO_SPECIAL_TAGS

        i = 0
        for line in sys.stdin:
            filename = line.strip()
            with open(filename) as f:
                contents = f.read()

            try:
                Validate(contents, flags, counters)
            except LexError as e:
                log('Lex error in %r: %s', filename, e)
                errors.append((filename, e))
            except ParseError as e:
                log('Parse error in %r: %s', filename, e)
                errors.append((filename, e))
            i += 1

        log('')
        log(
            '  %d tokens, %d start/end tags, %d start tags, %d attrs, '
            '%d max tag stack depth in %d files', counters.num_tokens,
            counters.num_start_end_tags, counters.num_start_tags,
            counters.num_attrs, counters.max_tag_stack, i)
        log('  %d errors', len(errors))
        if len(errors):
            return 1
        return 0

    elif action == 'todo':
        # Other algorithms:
        #
        # - select first subtree with given ID
        #   - this requires understanding the void tags I suppose
        # - select all subtrees that have a class
        # - materialize DOM

        # Safe-HTM8?  This is a filter
        return 0

    else:
        raise RuntimeError('Invalid action %r' % action)


if __name__ == '__main__':
    sys.exit(main(sys.argv))