#!/usr/bin/env python2
"""
lazylex/html.py - Low-Level HTML Processing.

See lazylex/README.md for details.

Conflicts between HTML5 and XML:

- In XML, <source> is like any tag, and must be closed.
- In HTML, <source> is a VOID tag, and must NOT be closed.

- In XML, <script> and <style> don't have special treatment.
- In HTML, they do.

- The header is different - <!DOCTYPE html> vs. <?xml version= ... ?>

So do we have a mode for <script> <style> and void tags?  Upgrade HX8 into
HTM8?

TODO:

- Are there special rules for <svg> and <math>?
- Do we need to know about <textarea> <pre>?  Those don't have the same
  whitespace rules.
"""
from __future__ import print_function

try:
    from cStringIO import StringIO
except ImportError:
    from io import StringIO  # python3
import re
import sys

if sys.version_info.major == 2:
    from typing import List, Tuple, Optional, Dict


def log(msg, *args):
    msg = msg % args
    print(msg, file=sys.stderr)


class LexError(Exception):
    """
    Examples of lex errors:

    - Tok.Invalid, like <> or &&
    - Unclosed <!--  <?  <![CDATA[  <script>  <style>
    """

    def __init__(self, s, start_pos):
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        return '(LexError %r)' % (self.s[self.start_pos:self.start_pos + 20])


def FindLineNum(s, error_pos):
    current_pos = 0
    line_num = 1
    while True:
        newline_pos = s.find('\n', current_pos)
        #log('current = %d, N %d, line %d', current_pos, newline_pos, line_num)

        if newline_pos == -1:  # this is the last line
            return line_num
        if newline_pos >= error_pos:
            return line_num
        line_num += 1
        current_pos = newline_pos + 1

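
# A minimal usage sketch (hypothetical _Demo* helper, added for illustration
# and never called by this module): FindLineNum() maps a byte offset to a
# 1-based line number, which ParseError uses for error messages.
def _DemoFindLineNum():
    s = 'a\nb\nc\n'
    assert FindLineNum(s, 0) == 1  # offset 0 falls on line 1
    assert FindLineNum(s, 2) == 2  # offset 2 is the 'b' on line 2
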

class ParseError(Exception):
    """
    Examples of parse errors:

    - unbalanced tag structure
    - ul_table.py errors
    """

    def __init__(self, msg, s=None, start_pos=-1):
        self.msg = msg
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        if self.s is not None:
            assert self.start_pos != -1, self.start_pos
            snippet = (self.s[self.start_pos:self.start_pos + 20])

            line_num = FindLineNum(self.s, self.start_pos)
        else:
            snippet = ''
            line_num = -1
        msg = 'line %d: %r %r' % (line_num, self.msg, snippet)
        return msg


class Output(object):
    """Takes an underlying input buffer and an output file.  Maintains a
    position in the input buffer.

    Print FROM the input or print new text to the output.
    """

    def __init__(self, s, f, left_pos=0, right_pos=-1):
        self.s = s
        self.f = f
        self.pos = left_pos
        self.right_pos = len(s) if right_pos == -1 else right_pos

    def SkipTo(self, pos):
        """Skip to a position."""
        self.pos = pos

    def PrintUntil(self, pos):
        """Print from the current position until a position."""
        piece = self.s[self.pos:pos]
        self.f.write(piece)
        self.pos = pos

    def PrintTheRest(self):
        """Print until the end of the string."""
        self.PrintUntil(self.right_pos)

    def Print(self, s):
        """Print new text to the output."""
        self.f.write(s)

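
# Illustrative sketch (hypothetical helper, not called by this module): copy
# selected spans of the input to the output, skipping the rest.
def _DemoOutput():
    f = StringIO()
    out = Output('abcdef', f)
    out.PrintUntil(2)   # writes 'ab'
    out.SkipTo(4)       # skip over 'cd' without printing
    out.PrintTheRest()  # writes 'ef'
    assert f.getvalue() == 'abef'
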

# HTML Tokens
# CommentBegin, ProcessingBegin, CDataBegin are "pseudo-tokens", not visible
TOKENS = 'Decl Comment CommentBegin Processing ProcessingBegin CData CDataBegin StartTag StartEndTag EndTag DecChar HexChar CharEntity RawData HtmlCData Invalid EndOfStream'.split()


class Tok(object):
    """
    Avoid lint errors by using these aliases
    """
    pass


TOKEN_NAMES = [None] * len(TOKENS)  # type: List[str]

this_module = sys.modules[__name__]
for i, tok_str in enumerate(TOKENS):
    setattr(this_module, tok_str, i)
    setattr(Tok, tok_str, i)
    TOKEN_NAMES[i] = tok_str


def TokenName(tok_id):
    return TOKEN_NAMES[tok_id]

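
# Illustrative sketch (hypothetical helper, not called by this module): the
# loop above turns each name in TOKENS into an integer attribute on both this
# module and the Tok class, and TokenName() is the reverse mapping.
def _DemoTokenIds():
    assert Tok.StartTag != Tok.EndTag  # distinct small integers
    assert TokenName(Tok.StartTag) == 'StartTag'
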

def MakeLexer(rules):
    return [(re.compile(pat, re.VERBOSE), i) for (pat, i) in rules]


#
# Eggex
#
# Tag      = / ~['>']+ /

# Is this valid?  A single character?
# Tag      = / ~'>'* /

# Maybe better: / [NOT '>']+/
# capital letters not allowed there?
#
# But then this is confusing:
# / [NOT ~digit]+/
#
# / [NOT digit] / is [^\d]
# / ~digit /      is \D
#
# Or maybe:
#
# / [~ digit]+ /
# / [~ '>']+ /
# / [NOT '>']+ /

# End      = / '</' Tag '>' /
# StartEnd = / '<' Tag '/>' /
# Start    = / '<' Tag '>' /
#
# EntityRef = / '&' dot{* N} ';' /

# Tag name, or attribute name
# colon is used in XML

# https://www.w3.org/TR/xml/#NT-Name
# Hm there is a lot of unicode stuff.  We are simplifying parsing.

_NAME = r'[a-zA-Z][a-zA-Z0-9:_\-]*'  # must start with letter

LEXER = [
    (r'<!--', Tok.CommentBegin),

    # Processing instructions are used for the XML header:
    # <?xml version="1.0" encoding="UTF-8"?>
    # They are technically XML-only, but in HTML5, they are another kind of
    # comment:
    #
    # https://developer.mozilla.org/en-US/docs/Web/API/ProcessingInstruction
    #
    (r'<\?', Tok.ProcessingBegin),
    # Not necessary in HTML5, but occurs in XML
    (r'<!\[CDATA\[', Tok.CDataBegin),  # <![CDATA[

    # Markup declarations
    # - In HTML5, there is only <!DOCTYPE html>
    # - XML has 4 more declarations: <!ELEMENT ...> ATTLIST ENTITY NOTATION
    #   - these seem to be part of DTD
    # - it's useful to skip these, and be able to parse the rest of the document
    # - Note: < is allowed?
    (r'<! [^>]+ >', Tok.Decl),

    # Tags
    # Notes:
    # - We look for a valid tag name, but we don't validate attributes.
    #   That's done in the tag lexer.
    # - We don't allow leading whitespace
    (r'</ (%s) >' % _NAME, Tok.EndTag),
    # self-closing <br/> comes before StartTag
    (r'< (%s) [^>]* />' % _NAME, Tok.StartEndTag),  # self-closing <br/>
    (r'< (%s) [^>]* >' % _NAME, Tok.StartTag),  # start <a>

    # Characters
    # https://www.w3.org/TR/xml/#sec-references
    (r'&\# [0-9]+ ;', Tok.DecChar),
    (r'&\# x[0-9a-fA-F]+ ;', Tok.HexChar),
    (r'& %s ;' % _NAME, Tok.CharEntity),

    # HTML5 allows unescaped > in raw data, but < is not allowed.
    # https://stackoverflow.com/questions/10462348/right-angle-bracket-in-html
    #
    # - My early blog has THREE errors when disallowing >
    # - So do some .wwz files
    (r'[^&<]+', Tok.RawData),
    (r'.', Tok.Invalid),  # error!
]

# Old notes:
#
# Non-greedy matches are regular and can be matched in linear time
# with RE2.
#
# https://news.ycombinator.com/item?id=27099798
#
# Maybe try combining all of these for speed.

# . is any char except newline
# https://re2c.org/manual/manual_c.html

# Discarded options
#(r'<!-- .*? -->', Tok.Comment),

# Hack from Claude: \s\S instead of re.DOTALL.  I don't like this
#(r'<!-- [\s\S]*? -->', Tok.Comment),
#(r'<!-- (?:.|[\n])*? -->', Tok.Comment),

LEXER = MakeLexer(LEXER)


class Lexer(object):

    def __init__(self, s, left_pos=0, right_pos=-1, no_special_tags=False):
        self.s = s
        self.pos = left_pos
        self.right_pos = len(s) if right_pos == -1 else right_pos
        self.no_special_tags = no_special_tags

        self.cache = {}  # string -> compiled regex pattern object

        # either </script> or </style> - we search until we see that
        self.search_state = None  # type: Optional[str]

        # Position of tag name, if applicable
        # - Set after you get a StartTag, EndTag, or StartEndTag
        # - Unset on other tags
        self.tag_pos_left = -1
        self.tag_pos_right = -1

    def _Peek(self):
        # type: () -> Tuple[int, int]
        """
        Note: not using _Peek() now
        """
        if self.pos == self.right_pos:
            return Tok.EndOfStream, self.pos

        assert self.pos < self.right_pos, self.pos

        if self.search_state is not None and not self.no_special_tags:
            pos = self.s.find(self.search_state, self.pos)
            if pos == -1:
                # unterminated <script> or <style>
                raise LexError(self.s, self.pos)
            self.search_state = None
            # beginning
            return Tok.HtmlCData, pos

        # Find the first match.
        # Note: frontend/match.py uses _LongestMatch(), which is different!
        # TODO: reconcile them.  This lexer should be expressible in re2c.

        for pat, tok_id in LEXER:
            m = pat.match(self.s, self.pos)
            if m:
                if tok_id in (Tok.StartTag, Tok.EndTag, Tok.StartEndTag):
                    self.tag_pos_left = m.start(1)
                    self.tag_pos_right = m.end(1)
                else:
                    # Reset state
                    self.tag_pos_left = -1
                    self.tag_pos_right = -1

                if tok_id == Tok.CommentBegin:
                    pos = self.s.find('-->', self.pos)
                    if pos == -1:
                        # unterminated <!--
                        raise LexError(self.s, self.pos)
                    return Tok.Comment, pos + 3  # -->

                if tok_id == Tok.ProcessingBegin:
                    pos = self.s.find('?>', self.pos)
                    if pos == -1:
                        # unterminated <?
                        raise LexError(self.s, self.pos)
                    return Tok.Processing, pos + 2  # ?>

                if tok_id == Tok.CDataBegin:
                    pos = self.s.find(']]>', self.pos)
                    if pos == -1:
                        # unterminated <![CDATA[
                        raise LexError(self.s, self.pos)
                    return Tok.CData, pos + 3  # ]]>

                if tok_id == Tok.StartTag:
                    if self.TagNameEquals('script'):
                        self.search_state = '</script>'
                    elif self.TagNameEquals('style'):
                        self.search_state = '</style>'

                return tok_id, m.end()
        else:
            raise AssertionError('Tok.Invalid rule should have matched')

    def TagNameEquals(self, expected):
        # type: (str) -> bool
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        # TODO: In C++, this does not need an allocation
        # TODO: conditionally lower() case here (maybe not in XML mode)
        return expected == self.s[self.tag_pos_left:self.tag_pos_right]

    def TagName(self):
        # type: () -> str
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        # TODO: conditionally lower() case here (maybe not in XML mode)
        return self.s[self.tag_pos_left:self.tag_pos_right]

    def Read(self):
        # type: () -> Tuple[int, int]
        tok_id, end_pos = self._Peek()
        self.pos = end_pos  # advance
        return tok_id, end_pos

    def LookAhead(self, regex):
        # Cache the regex compilation.  This could also be LookAheadFor(THEAD)
        # or something.
        pat = self.cache.get(regex)
        if pat is None:
            pat = re.compile(regex)
            self.cache[regex] = pat

        m = pat.match(self.s, self.pos)
        return m is not None

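
# Illustrative sketch (hypothetical helper, not called by this module):
# Read() advances through the input, returning (token id, end position)
# pairs until Tok.EndOfStream.
def _DemoLexer():
    lx = Lexer('<p>hi</p>')
    assert lx.Read() == (Tok.StartTag, 3)  # <p>
    assert lx.Read() == (Tok.RawData, 5)   # hi
    assert lx.Read() == (Tok.EndTag, 9)    # </p>
    assert lx.Read() == (Tok.EndOfStream, 9)
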

def _Tokens(s, left_pos, right_pos):
    """
    Args:
      s: string to parse
      left_pos, right_pos: Optional span boundaries.
    """
    lx = Lexer(s, left_pos, right_pos)
    while True:
        tok_id, pos = lx.Read()
        yield tok_id, pos
        if tok_id == Tok.EndOfStream:
            break


def ValidTokens(s, left_pos=0, right_pos=-1):
    """Wrapper around _Tokens to prevent callers from having to handle Invalid.

    I'm not combining the two functions because I might want to do a
    'yield' transformation on Tokens()?  Exceptions might complicate the
    issue?
    """
    pos = left_pos
    for tok_id, end_pos in _Tokens(s, left_pos, right_pos):
        if tok_id == Tok.Invalid:
            raise LexError(s, pos)
        yield tok_id, end_pos
        pos = end_pos

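
# Illustrative sketch (hypothetical helper, not called by this module):
# unlike _Tokens(), ValidTokens() raises LexError on Tok.Invalid, so callers
# only ever see well-formed tokens.
def _DemoValidTokens():
    ids = [tok_id for tok_id, _ in ValidTokens('<b>x</b>')]
    assert ids == [Tok.StartTag, Tok.RawData, Tok.EndTag, Tok.EndOfStream]
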

def ValidTokenList(s, no_special_tags=False):
    """A wrapper that can be more easily translated to C++.  Doesn't use
    iterators."""

    start_pos = 0
    tokens = []
    lx = Lexer(s, no_special_tags=no_special_tags)
    while True:
        tok_id, end_pos = lx.Read()
        tokens.append((tok_id, end_pos))
        if tok_id == Tok.EndOfStream:
            break
        if tok_id == Tok.Invalid:
            raise LexError(s, start_pos)
        start_pos = end_pos
    return tokens

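
# Illustrative sketch (hypothetical helper, not called by this module): by
# default, everything up to </script> is lexed as a single HtmlCData token;
# with no_special_tags=True, the body would be lexed as ordinary markup (and
# the unescaped '<' here would raise LexError).
def _DemoSpecialTags():
    ids = [tok_id for tok_id, _ in ValidTokenList('<script>1 < 2</script>')]
    assert ids == [Tok.StartTag, Tok.HtmlCData, Tok.EndTag, Tok.EndOfStream]
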

# Tag names:
#   Match <a  or </a
#   Match <h2, but not <2h
#
# HTML 5 doesn't restrict tag names at all
#   https://html.spec.whatwg.org/#toc-syntax
#
# XML allows : - .
#   https://www.w3.org/TR/xml/#NT-NameChar

# Namespaces for MathML, SVG
# XLink, XML, XMLNS
#
# https://infra.spec.whatwg.org/#namespaces
#
# Allow - for td-attrs

_ATTR_VALUE = r'[a-zA-Z0-9_\-]+'  # allow hyphens

# TODO: we don't need to capture the tag name here?  That's done at the top
# level.
_TAG_RE = re.compile(r'/? \s* (%s)' % _NAME, re.VERBOSE)

# To match href="foo"

_ATTR_RE = re.compile(
    r'''
\s+               # Leading whitespace is required
(%s)              # Attribute name
(?:               # Optional attribute value
  \s* = \s*
  (?:
    " ([^>"]*) "  # double quoted value
  | (%s)          # Attribute value
                  # TODO: relax this?  for href=$foo
  )
)?
''' % (_NAME, _ATTR_VALUE), re.VERBOSE)

TagName, AttrName, UnquotedValue, QuotedValue = range(4)


class TagLexer(object):
    """
    Given a tag like <a href="..."> or <link type="..." />, the TagLexer
    provides a few operations:

    - What is the tag?
    - Iterate through the attributes, giving (name, value_start_pos,
      value_end_pos)
    """

    def __init__(self, s):
        self.s = s
        self.start_pos = -1  # Invalid
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        """Reuse instances of this object."""
        self.start_pos = start_pos
        self.end_pos = end_pos

    def TagString(self):
        return self.s[self.start_pos:self.end_pos]

    def TagName(self):
        # First event
        tok_id, start, end = next(self.Tokens())
        return self.s[start:end]

    def GetSpanForAttrValue(self, attr_name):
        # Algorithm: search for QuotedValue or UnquotedValue after AttrName
        # TODO: Could also cache these

        events = self.Tokens()
        val = (-1, -1)
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == AttrName:
                    name = self.s[start:end]
                    if name == attr_name:
                        # The value should come next
                        tok_id, start, end = next(events)
                        if tok_id in (QuotedValue, UnquotedValue):
                            # Note: quoted values may have &amp;
                            # We would need ANOTHER lexer to unescape them.
                            # Right now help_gen.py and oils_doc.py work with
                            # the raw values.
                            val = start, end
                        break

        except StopIteration:
            pass
        return val

    def GetAttrRaw(self, attr_name):
        """
        Return the value, which may be UNESCAPED.
        """
        # Algorithm: search for QuotedValue or UnquotedValue after AttrName
        # TODO: Could also cache these
        start, end = self.GetSpanForAttrValue(attr_name)
        if start == -1:
            return None
        return self.s[start:end]

    def AllAttrsRaw(self):
        """
        Get a list of pairs [('class', 'foo'), ('href', '?foo=1&amp;bar=2')]

        The quoted values may be escaped.  We would need another lexer to
        unescape them.
        """
        pairs = []
        events = self.Tokens()
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == AttrName:
                    name = self.s[start:end]

                    # The value should come next
                    tok_id, start, end = next(events)
                    if tok_id in (QuotedValue, UnquotedValue):
                        # Note: quoted values may have &amp;
                        # We would need ANOTHER lexer to unescape them, but we
                        # don't need that for ul-table

                        val = self.s[start:end]
                        pairs.append((name, val))
        except StopIteration:
            pass
        return pairs

    def Tokens(self):
        """
        Yields a sequence of tokens: Tag (AttrName AttrValue?)*

        Where each Token is (Type, start_pos, end_pos)

        Note that start and end are NOT redundant!  We skip over some
        unwanted characters.
        """
        m = _TAG_RE.match(self.s, self.start_pos + 1)
        if not m:
            raise RuntimeError("Couldn't find HTML tag in %r" %
                               self.TagString())
        yield TagName, m.start(1), m.end(1)

        pos = m.end(0)

        while True:
            # don't search past the end
            m = _ATTR_RE.match(self.s, pos, self.end_pos)
            if not m:
                # A validating parser would check that > or /> is next --
                # there's no junk
                break

            yield AttrName, m.start(1), m.end(1)

            # Quoted is group 2, unquoted is group 3.
            if m.group(2) is not None:
                yield QuotedValue, m.start(2), m.end(2)
            elif m.group(3) is not None:
                yield UnquotedValue, m.start(3), m.end(3)

            # Skip past the closing quote
            pos = m.end(0)

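
# Illustrative sketch (hypothetical helper, not called by this module): point
# a TagLexer at the span of one tag, then query the name and the raw
# (possibly still-escaped) attribute values.
def _DemoTagLexer():
    s = '<a href="/foo" class="px">'
    tag_lexer = TagLexer(s)
    tag_lexer.Reset(0, len(s))
    assert tag_lexer.TagName() == 'a'
    assert tag_lexer.GetAttrRaw('href') == '/foo'
    assert tag_lexer.AllAttrsRaw() == [('href', '/foo'), ('class', 'px')]
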

def ReadUntilStartTag(it, tag_lexer, tag_name):
    """Find the next <foo>, returning its (start, end) positions.

    Raise ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == Tok.StartTag and tag_lexer.TagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No start tag %r' % tag_name)


def ReadUntilEndTag(it, tag_lexer, tag_name):
    """Find the next </foo>, returning its (start, end) positions.

    Raise ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == Tok.EndTag and tag_lexer.TagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No end tag %r' % tag_name)

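
# Illustrative sketch (hypothetical helper, not called by this module): scan
# a token stream for the first <p>, getting the byte span of the tag itself.
def _DemoReadUntilStartTag():
    s = 'x <p>hi</p>'
    tag_lexer = TagLexer(s)
    start, end = ReadUntilStartTag(ValidTokens(s), tag_lexer, 'p')
    assert s[start:end] == '<p>'
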

CHAR_ENTITY = {
    'amp': '&',
    'lt': '<',
    'gt': '>',
    'quot': '"',
}


def ToText(s, left_pos=0, right_pos=-1):
    """Given HTML, return text by unquoting &gt; and &lt; etc.

    Used by:
      doctools/oils_doc.py: PygmentsPlugin
      doctools/help_gen.py: HelpIndexCards

    In the latter case, we could process some tags, like:

    - Blue Link (not clickable, but still useful)
    - Red X

    That should be html.ToAnsi.
    """
    f = StringIO()
    out = Output(s, f, left_pos, right_pos)

    pos = left_pos
    for tok_id, end_pos in ValidTokens(s, left_pos, right_pos):
        if tok_id == Tok.RawData:
            out.SkipTo(pos)
            out.PrintUntil(end_pos)

        elif tok_id == Tok.CharEntity:  # &amp;

            entity = s[pos + 1:end_pos - 1]

            out.SkipTo(pos)
            out.Print(CHAR_ENTITY[entity])
            out.SkipTo(end_pos)

        # Not handling these yet
        elif tok_id == Tok.HexChar:
            raise AssertionError('Hex Char %r' % s[pos:pos + 20])

        elif tok_id == Tok.DecChar:
            raise AssertionError('Dec Char %r' % s[pos:pos + 20])

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()

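
# Illustrative sketch (hypothetical helper, not called by this module): tags
# between text runs are skipped, and the four entities in CHAR_ENTITY are
# decoded.
def _DemoToText():
    assert ToText('x <b>a &amp; b</b> y') == 'x a & b y'
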

# https://developer.mozilla.org/en-US/docs/Glossary/Void_element
VOID_ELEMENTS = [
    'area',
    'base',
    'br',
    'col',
    'embed',
    'hr',
    'img',
    'input',
    'link',
    'meta',
    'param',
    'source',
    'track',
    'wbr',
]

LEX_ATTRS = 1 << 1
LEX_QUOTED_VALUES = 1 << 2  # href="?x=42&amp;y=99"
NO_SPECIAL_TAGS = 1 << 3  # <script> <style>, VOID tags, etc.
BALANCED_TAGS = 1 << 4  # are tags balanced?


def Validate(contents, flags, counters):
    # type: (str, int, Counters) -> None

    tag_lexer = TagLexer(contents)
    no_special_tags = bool(flags & NO_SPECIAL_TAGS)
    lx = Lexer(contents, no_special_tags=no_special_tags)
    tokens = []
    start_pos = 0
    tag_stack = []
    while True:
        tok_id, end_pos = lx.Read()

        if tok_id == Tok.Invalid:
            raise LexError(contents, start_pos)
        if tok_id == Tok.EndOfStream:
            break

        tokens.append((tok_id, end_pos))

        if tok_id == Tok.StartEndTag:
            counters.num_start_end_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRaw()
            counters.num_attrs += len(all_attrs)

        elif tok_id == Tok.StartTag:
            counters.num_start_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRaw()
            counters.num_attrs += len(all_attrs)

            if flags & BALANCED_TAGS:
                tag_name = lx.TagName()
                if flags & NO_SPECIAL_TAGS:
                    tag_stack.append(tag_name)
                else:
                    # e.g. <meta> is considered self-closing, like <meta/>
                    if tag_name not in VOID_ELEMENTS:
                        tag_stack.append(tag_name)

            counters.max_tag_stack = max(counters.max_tag_stack,
                                         len(tag_stack))
        elif tok_id == Tok.EndTag:
            if flags & BALANCED_TAGS:
                try:
                    expected = tag_stack.pop()
                except IndexError:
                    raise ParseError('Tag stack empty',
                                     s=contents,
                                     start_pos=start_pos)

                actual = lx.TagName()
                if expected != actual:
                    raise ParseError(
                        'Got unexpected closing tag %r; opening tag was %r' %
                        (contents[start_pos:end_pos], expected),
                        s=contents,
                        start_pos=start_pos)

        start_pos = end_pos
    counters.num_tokens += len(tokens)

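
# Illustrative sketch (hypothetical helper, not called by this module): with
# BALANCED_TAGS, Validate() checks that every non-void start tag has a
# matching end tag, and counts tokens and attributes along the way.
def _DemoValidate():
    counters = Counters()
    Validate('<div><p>hi</p></div>', LEX_ATTRS | BALANCED_TAGS, counters)
    assert counters.num_start_tags == 2
    assert counters.max_tag_stack == 2
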

class Counters(object):

    def __init__(self):
        self.num_tokens = 0
        self.num_start_tags = 0
        self.num_start_end_tags = 0
        self.num_attrs = 0
        self.max_tag_stack = 0


def main(argv):
    action = argv[1]

    if action == 'tokens':
        contents = sys.stdin.read()

        lx = Lexer(contents)
        start_pos = 0
        while True:
            tok_id, end_pos = lx.Read()
            if tok_id == Tok.Invalid:
                raise LexError(contents, start_pos)
            if tok_id == Tok.EndOfStream:
                break

            frag = contents[start_pos:end_pos]
            log('%d %s %r', end_pos, TokenName(tok_id), frag)
            start_pos = end_pos

        return 0

    elif action in ('lex-htm8', 'parse-htm8', 'parse-xml'):

        errors = []
        counters = Counters()

        flags = LEX_ATTRS | LEX_QUOTED_VALUES
        if action.startswith('parse-'):
            flags |= BALANCED_TAGS
        if action == 'parse-xml':
            flags |= NO_SPECIAL_TAGS

        i = 0
        for line in sys.stdin:
            filename = line.strip()
            with open(filename) as f:
                contents = f.read()

            try:
                Validate(contents, flags, counters)
            except LexError as e:
                log('Lex error in %r: %s', filename, e)
                errors.append((filename, e))
            except ParseError as e:
                log('Parse error in %r: %s', filename, e)
                errors.append((filename, e))
            i += 1

        log('')
        log(
            '  %d tokens, %d start/end tags, %d start tags, %d attrs, %d max tag stack depth in %d files',
            counters.num_tokens, counters.num_start_end_tags,
            counters.num_start_tags, counters.num_attrs,
            counters.max_tag_stack, i)
        log('  %d errors', len(errors))
        if len(errors):
            return 1
        return 0

    elif action == 'todo':
        # Other algorithms:
        #
        # - select first subtree with given ID
        #   - this requires understanding the void tags I suppose
        # - select all subtrees that have a class
        # - materialize DOM

        # Safe-HTM8?  This is a filter
        return 0

    else:
        raise RuntimeError('Invalid action %r' % action)


if __name__ == '__main__':
    sys.exit(main(sys.argv))