#!/usr/bin/env python2
"""
lazylex/html.py - Low-Level HTML Processing.

See lazylex/README.md for details.
"""
from __future__ import print_function

try:
    from cStringIO import StringIO
except ImportError:
    from io import StringIO  # python3
import re
import sys

if sys.version_info.major == 2:
    from typing import List, Tuple, Optional


def log(msg, *args):
    msg = msg % args
    print(msg, file=sys.stderr)


class LexError(Exception):
    """
    Examples of lex errors:

    - Tok.Invalid, like <> or &&
    - Unclosed <!-- <? <![CDATA[ <script> <style>
    """

    def __init__(self, s, start_pos):
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        return '(LexError %r)' % (self.s[self.start_pos:self.start_pos + 20])


def FindLineNum(s, error_pos):
    current_pos = 0
    line_num = 1
    while True:
        newline_pos = s.find('\n', current_pos)
        #log('current = %d, N %d, line %d', current_pos, newline_pos, line_num)

        if newline_pos == -1:  # this is the last line
            return line_num
        if newline_pos >= error_pos:
            return line_num
        line_num += 1
        current_pos = newline_pos + 1


class ParseError(Exception):
    """
    Examples of parse errors:

    - unbalanced tag structure
    - ul_table.py errors
    """

    def __init__(self, msg, s=None, start_pos=-1):
        self.msg = msg
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        if self.s is not None:
            assert self.start_pos != -1, self.start_pos
            snippet = (self.s[self.start_pos:self.start_pos + 20])

            line_num = FindLineNum(self.s, self.start_pos)
        else:
            snippet = ''
            line_num = -1
        msg = 'line %d: %r %r' % (line_num, self.msg, snippet)
        return msg


class Output(object):
    """Takes an underlying input buffer and an output file.  Maintains a
    position in the input buffer.

    Print spans FROM the input buffer, or print new text to the output.
    """

    def __init__(self, s, f, left_pos=0, right_pos=-1):
        self.s = s
        self.f = f
        self.pos = left_pos
        self.right_pos = len(s) if right_pos == -1 else right_pos

    def SkipTo(self, pos):
        """Skip to a position in the input, without printing."""
        self.pos = pos

    def PrintUntil(self, pos):
        """Print the input from the current position up to pos."""
        piece = self.s[self.pos:pos]
        self.f.write(piece)
        self.pos = pos

    def PrintTheRest(self):
        """Print the rest of the input, up to right_pos."""
        self.PrintUntil(self.right_pos)

    def Print(self, s):
        """Print new text directly to the output file."""
        self.f.write(s)
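

# Illustrative sketch, not part of the original module: how Output is meant
# to be used.  _DemoOutput is a hypothetical name.
def _DemoOutput():
    s = '<b>bold</b>'
    f = StringIO()
    out = Output(s, f)
    out.PrintUntil(3)  # copy '<b>' from the input
    out.Print('BOLD')  # print replacement text to the output
    out.SkipTo(7)  # skip over 'bold' in the input
    out.PrintTheRest()  # copy '</b>'
    return f.getvalue()  # '<b>BOLD</b>'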


# HTML Tokens
# CommentBegin, ProcessingBegin, CDataBegin are "pseudo-tokens", not visible
TOKENS = 'Decl Comment CommentBegin Processing ProcessingBegin CData CDataBegin StartTag StartEndTag EndTag DecChar HexChar CharEntity RawData HtmlCData Invalid EndOfStream'.split()


class Tok(object):
    """
    Avoid lint errors by using these aliases
    """
    pass


TOKEN_NAMES = [None] * len(TOKENS)  # type: List[str]

this_module = sys.modules[__name__]
for i, tok_str in enumerate(TOKENS):
    setattr(this_module, tok_str, i)
    setattr(Tok, tok_str, i)
    TOKEN_NAMES[i] = tok_str


def TokenName(tok_id):
    return TOKEN_NAMES[tok_id]
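

# Illustrative check, not part of the original module: the loop above makes
# each token name an integer attribute on both Tok and this module, with
# TOKEN_NAMES as the inverse mapping.
def _DemoTokenNames():
    assert isinstance(Tok.StartTag, int)
    assert TokenName(Tok.StartTag) == 'StartTag'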


def MakeLexer(rules):
    return [(re.compile(pat, re.VERBOSE), i) for (pat, i) in rules]


#
# Eggex
#
# Tag = / ~['>']+ /

# Is this valid?  A single character?
# Tag = / ~'>'* /

# Maybe better: / [NOT '>']+/
# capital letters not allowed there?
#
# But then this is confusing:
# / [NOT ~digit]+/
#
# / [NOT digit] / is [^\d]
# / ~digit / is \D
#
# Or maybe:
#
# / [~ digit]+ /
# / [~ '>']+ /
# / [NOT '>']+ /

# End = / '</' Tag '>' /
# StartEnd = / '<' Tag '/>' /
# Start = / '<' Tag '>' /
#
# EntityRef = / '&' dot{* N} ';' /

# Tag name, or attribute name
# colon is used in XML

# https://www.w3.org/TR/xml/#NT-Name
# Hm, there is a lot of Unicode stuff there.  We are simplifying parsing.

_NAME = r'[a-zA-Z][a-zA-Z0-9:_\-]*'  # must start with letter

CHAR_LEX = [
    # Characters
    # https://www.w3.org/TR/xml/#sec-references
    (r'&\# [0-9]+ ;', Tok.DecChar),
    (r'&\# x[0-9a-fA-F]+ ;', Tok.HexChar),
    (r'& %s ;' % _NAME, Tok.CharEntity),
]

LEXER = CHAR_LEX + [
    (r'<!--', Tok.CommentBegin),

    # Processing instructions are used for the XML header:
    # <?xml version="1.0" encoding="UTF-8"?>
    # They are technically XML-only, but in HTML5, they are another kind of
    # comment:
    #
    # https://developer.mozilla.org/en-US/docs/Web/API/ProcessingInstruction
    #
    (r'<\?', Tok.ProcessingBegin),
    # Not necessary in HTML5, but occurs in XML
    (r'<!\[CDATA\[', Tok.CDataBegin),  # <![CDATA[

    # Markup declarations
    # - In HTML5, there is only <!DOCTYPE html>
    # - XML has 4 more declarations: <!ELEMENT ...> ATTLIST ENTITY NOTATION
    #   - these seem to be part of DTD
    # - it's useful to skip these, and be able to parse the rest of the document
    # - Note: < is allowed?
    (r'<! [^>\x00]+ >', Tok.Decl),

    # Tags
    # Notes:
    # - We look for a valid tag name, but we don't validate attributes.
    #   That's done in the tag lexer.
    # - We don't allow leading whitespace
    (r'</ (%s) >' % _NAME, Tok.EndTag),  # end </a>
    # self-closing <br/> comes before StartTag
    # could/should these be collapsed into one rule?
    (r'< (%s) [^>\x00]* />' % _NAME, Tok.StartEndTag),  # self-closing <br/>
    (r'< (%s) [^>\x00]* >' % _NAME, Tok.StartTag),  # start <a>

    # HTML5 allows unescaped > in raw data, but < is not allowed.
    # https://stackoverflow.com/questions/10462348/right-angle-bracket-in-html
    #
    # - My early blog has THREE errors when disallowing >
    # - So do some .wwz files
    (r'[^&<\x00]+', Tok.RawData),
    (r'.', Tok.Invalid),  # error!
]

# Old notes:
#
# Non-greedy matches are regular and can be matched in linear time
# with RE2.
#
# https://news.ycombinator.com/item?id=27099798
#
# Maybe try combining all of these for speed.

# . is any char except newline
# https://re2c.org/manual/manual_c.html

# Discarded options
#(r'<!-- .*? -->', Tok.Comment),

# Hack from Claude: \s\S instead of re.DOTALL.  I don't like this
#(r'<!-- [\s\S]*? -->', Tok.Comment),
#(r'<!-- (?:.|[\n])*? -->', Tok.Comment),

LEXER = MakeLexer(LEXER)


class Lexer(object):

    def __init__(self, s, left_pos=0, right_pos=-1, no_special_tags=False):
        self.s = s
        self.pos = left_pos
        self.right_pos = len(s) if right_pos == -1 else right_pos
        self.no_special_tags = no_special_tags

        self.cache = {}  # string -> compiled regex pattern object

        # either </script> or </style> - we search until we see that
        self.search_state = None  # type: Optional[str]

        # Position of tag name, if applicable
        # - Set after you get a StartTag, EndTag, or StartEndTag
        # - Unset on other tags
        self.tag_pos_left = -1
        self.tag_pos_right = -1

    def _Peek(self):
        # type: () -> Tuple[int, int]
        """Match the next token and return (tok_id, end_pos).

        Note: nothing calls _Peek() directly now; use Read().
        """
        if self.pos == self.right_pos:
            return Tok.EndOfStream, self.pos

        assert self.pos < self.right_pos, self.pos

        if self.search_state is not None and not self.no_special_tags:
            pos = self.s.find(self.search_state, self.pos)
            if pos == -1:
                # unterminated <script> or <style>
                raise LexError(self.s, self.pos)
            self.search_state = None
            # the CData ends at the beginning of </script> or </style>
            return Tok.HtmlCData, pos

        # Find the first match.
        # Note: frontend/match.py uses _LongestMatch(), which is different!
        # TODO: reconcile them.  This lexer should be expressible in re2c.

        for pat, tok_id in LEXER:
            m = pat.match(self.s, self.pos)
            if m:
                if tok_id in (Tok.StartTag, Tok.EndTag, Tok.StartEndTag):
                    self.tag_pos_left = m.start(1)
                    self.tag_pos_right = m.end(1)
                else:
                    # Reset state
                    self.tag_pos_left = -1
                    self.tag_pos_right = -1

                if tok_id == Tok.CommentBegin:
                    pos = self.s.find('-->', self.pos)
                    if pos == -1:
                        # unterminated <!--
                        raise LexError(self.s, self.pos)
                    return Tok.Comment, pos + 3  # -->

                if tok_id == Tok.ProcessingBegin:
                    pos = self.s.find('?>', self.pos)
                    if pos == -1:
                        # unterminated <?
                        raise LexError(self.s, self.pos)
                    return Tok.Processing, pos + 2  # ?>

                if tok_id == Tok.CDataBegin:
                    pos = self.s.find(']]>', self.pos)
                    if pos == -1:
                        # unterminated <![CDATA[
                        raise LexError(self.s, self.pos)
                    return Tok.CData, pos + 3  # ]]>

                if tok_id == Tok.StartTag:
                    if self.TagNameEquals('script'):
                        self.search_state = '</script>'
                    elif self.TagNameEquals('style'):
                        self.search_state = '</style>'

                return tok_id, m.end()
        else:
            raise AssertionError('Tok.Invalid rule should have matched')

    def TagNameEquals(self, expected):
        # type: (str) -> bool
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        # TODO: In C++, this does not need an allocation
        # TODO: conditionally lower() case here (maybe not in XML mode)
        return expected == self.s[self.tag_pos_left:self.tag_pos_right]

    def TagName(self):
        # type: () -> str
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        # TODO: conditionally lower() case here (maybe not in XML mode)
        return self.s[self.tag_pos_left:self.tag_pos_right]

    def Read(self):
        # type: () -> Tuple[int, int]
        tok_id, end_pos = self._Peek()
        self.pos = end_pos  # advance
        return tok_id, end_pos

    def LookAhead(self, regex):
        # Cache the regex compilation.  This could also be LookAheadFor(THEAD)
        # or something.
        pat = self.cache.get(regex)
        if pat is None:
            pat = re.compile(regex)
            self.cache[regex] = pat

        m = pat.match(self.s, self.pos)
        return m is not None
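

# Illustrative sketch, not part of the original module: a <script> body is
# returned as a single HtmlCData token, so the unescaped < inside it doesn't
# cause a LexError.  Passing no_special_tags=True disables this.
def _DemoSpecialTags():
    lx = Lexer('<script>if (a < b) { f(); }</script>')
    tok_id, end_pos = lx.Read()
    assert tok_id == Tok.StartTag, TokenName(tok_id)
    tok_id, end_pos = lx.Read()
    assert tok_id == Tok.HtmlCData, TokenName(tok_id)
    tok_id, end_pos = lx.Read()
    assert tok_id == Tok.EndTag, TokenName(tok_id)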


def _Tokens(s, left_pos, right_pos):
    """
    Args:
      s: string to parse
      left_pos, right_pos: Optional span boundaries.
    """
    lx = Lexer(s, left_pos, right_pos)
    while True:
        tok_id, pos = lx.Read()
        yield tok_id, pos
        if tok_id == Tok.EndOfStream:
            break


def ValidTokens(s, left_pos=0, right_pos=-1):
    """Wrapper around _Tokens to prevent callers from having to handle Invalid.

    I'm not combining the two functions because I might want to do a
    'yield' transformation on Tokens()?  Exceptions might complicate the
    issue?
    """
    pos = left_pos
    for tok_id, end_pos in _Tokens(s, left_pos, right_pos):
        if tok_id == Tok.Invalid:
            raise LexError(s, pos)
        yield tok_id, end_pos
        pos = end_pos


def ValidTokenList(s, no_special_tags=False):
    """A wrapper that can be more easily translated to C++.  Doesn't use
    iterators."""

    start_pos = 0
    tokens = []
    lx = Lexer(s, no_special_tags=no_special_tags)
    while True:
        tok_id, end_pos = lx.Read()
        tokens.append((tok_id, end_pos))
        if tok_id == Tok.EndOfStream:
            break
        if tok_id == Tok.Invalid:
            raise LexError(s, start_pos)
        start_pos = end_pos
    return tokens
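

# Illustrative sketch, not part of the original module: lex a whole document
# eagerly, or get a LexError with position info for input like '<>'.
def _DemoValidTokenList():
    tokens = ValidTokenList('<p>hi</p>')
    assert tokens == [(Tok.StartTag, 3), (Tok.RawData, 5), (Tok.EndTag, 9),
                      (Tok.EndOfStream, 9)], tokens
    try:
        ValidTokenList('<>')
    except LexError as e:
        log('%s', e)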


# Tag names:
# Match <a or </a
# Match <h2, but not <2h
#
# HTML 5 doesn't restrict tag names at all
# https://html.spec.whatwg.org/#toc-syntax
#
# XML allows : - .
# https://www.w3.org/TR/xml/#NT-NameChar

# Namespaces for MathML, SVG
# XLink, XML, XMLNS
#
# https://infra.spec.whatwg.org/#namespaces
#
# Allow - for td-attrs

# Be very lenient - just no whitespace or special HTML chars
# I don't think this is more lenient than HTML5, though we should check.
_UNQUOTED_VALUE = r'''[^ \t\r\n<>&"'\x00]*'''

# TODO: we don't need to capture the tag name here?  That's done at the top
# level
_TAG_RE = re.compile(r'/? \s* (%s)' % _NAME, re.VERBOSE)

_TAG_LAST_RE = re.compile(r'\s* /? >', re.VERBOSE)

# To match href="foo"
# Note: in HTML5 and XML, single quoted attributes are also valid

# <button disabled> is standard usage

# NOTE: This used to allow whitespace around =
# <a foo = "bar"> makes sense in XML
# But then you also have
# <a foo= bar> - which is TWO attributes, in HTML5
# So the space is problematic

_ATTR_RE = re.compile(
    r'''
\s+                     # Leading whitespace is required
(%s)                    # Attribute name
(?:                     # Optional attribute value
  \s* = \s*             # Spaces allowed around =
  (?:
    " ([^>"\x00]*) "    # double quoted value
  | ' ([^>'\x00]*) '    # single quoted value
  | (%s)                # unquoted value
  )
)?
''' % (_NAME, _UNQUOTED_VALUE), re.VERBOSE)

TagName, AttrName, UnquotedValue, QuotedValue, MissingValue = range(5)


class TagLexer(object):
    """
    Given a tag like <a href="..."> or <link type="..." />, the TagLexer
    provides a few operations:

    - What is the tag?
    - Iterate through the attributes, giving (name, value_start_pos, value_end_pos)
    """

    def __init__(self, s):
        self.s = s
        self.start_pos = -1  # Invalid
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        """Reuse instances of this object."""
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos

        self.start_pos = start_pos
        self.end_pos = end_pos

    def TagString(self):
        return self.s[self.start_pos:self.end_pos]

    def TagName(self):
        # First event
        tok_id, start, end = next(self.Tokens())
        return self.s[start:end]

    def GetSpanForAttrValue(self, attr_name):
        """
        Used by oils_doc.py, for href shortcuts
        """
        # Algorithm: search for QuotedValue or UnquotedValue after AttrName
        # TODO: Could also cache these

        events = self.Tokens()
        val = (-1, -1)
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == AttrName:
                    name = self.s[start:end]
                    if name == attr_name:
                        # The value should come next
                        tok_id, start, end = next(events)
                        assert tok_id in (QuotedValue, UnquotedValue,
                                          MissingValue), TokenName(tok_id)
                        val = start, end
                        break

        except StopIteration:
            pass
        return val

    def GetAttrRaw(self, attr_name):
        """
        Return the value, which may be UNESCAPED.
        """
        start, end = self.GetSpanForAttrValue(attr_name)
        if start == -1:
            return None
        return self.s[start:end]

    def AllAttrsRawSlice(self):
        """
        Get a list of (name, start, end) triples, e.g.
        [('class', 3, 5), ('href', 9, 12)]
        """
        slices = []
        events = self.Tokens()
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == AttrName:
                    name = self.s[start:end]

                    # The value should come next
                    tok_id, start, end = next(events)
                    assert tok_id in (QuotedValue, UnquotedValue,
                                      MissingValue), TokenName(tok_id)
                    # Note: quoted values may have &amp;
                    # We would need ANOTHER lexer to unescape them, but we
                    # don't need that for ul-table
                    slices.append((name, start, end))
        except StopIteration:
            pass
        return slices

    def AllAttrsRaw(self):
        """
        Get a list of pairs [('class', 'foo'), ('href', '?foo=1&amp;bar=2')]

        The quoted values may be escaped.  We would need another lexer to
        unescape them.
        """
        slices = self.AllAttrsRawSlice()
        pairs = []
        for name, start, end in slices:
            pairs.append((name, self.s[start:end]))
        return pairs

    def Tokens(self):
        """
        Yields a sequence of tokens: Tag (AttrName AttrValue?)*

        Where each Token is (Type, start_pos, end_pos)

        Note that start and end are NOT redundant!  We skip over some unwanted
        characters.
        """
        m = _TAG_RE.match(self.s, self.start_pos + 1)
        if not m:
            raise RuntimeError("Couldn't find HTML tag in %r" %
                               self.TagString())
        yield TagName, m.start(1), m.end(1)

        pos = m.end(0)
        #log('POS %d', pos)

        while True:
            # don't search past the end
            m = _ATTR_RE.match(self.s, pos, self.end_pos)
            if not m:
                #log('BREAK pos %d', pos)
                break
            #log('AttrName %r', m.group(1))

            yield AttrName, m.start(1), m.end(1)

            #log('m.groups() %r', m.groups())
            if m.group(2) is not None:
                # double quoted
                yield QuotedValue, m.start(2), m.end(2)
            elif m.group(3) is not None:
                # single quoted - TODO: could have different token types
                yield QuotedValue, m.start(3), m.end(3)
            elif m.group(4) is not None:
                yield UnquotedValue, m.start(4), m.end(4)
            else:
                # <button disabled>
                end = m.end(0)
                yield MissingValue, end, end

            # Skip past the closing " (or the end of the value)
            pos = m.end(0)

            #log('TOK %r', self.s)

        m = _TAG_LAST_RE.match(self.s, pos)
        #log('_TAG_LAST_RE match %r', self.s[pos:])
        if not m:
            # Extra data at end of tag.  TODO: add messages for all these.
            raise LexError(self.s, pos)
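

# Illustrative sketch, not part of the original module: point a TagLexer at
# the span of one tag, then query the name and attributes.
def _DemoTagLexer():
    s = '<a href="/blog/" class="nav">'
    tag_lexer = TagLexer(s)
    tag_lexer.Reset(0, len(s))
    assert tag_lexer.TagName() == 'a'
    assert tag_lexer.GetAttrRaw('href') == '/blog/'
    assert tag_lexer.AllAttrsRaw() == [('href', '/blog/'), ('class', 'nav')]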


# This is similar but not identical to
#    " ([^>"\x00]*) "    # double quoted value
#  | ' ([^>'\x00]*) '    # single quoted value
#
# Note: for unquoted values, & isn't allowed, and thus &amp; and &#99; and
# &#x99; are not allowed.  We could relax that?
ATTR_VALUE_LEXER = CHAR_LEX + [
    (r'[^>&\x00]+', Tok.RawData),
    (r'.', Tok.Invalid),
]

ATTR_VALUE_LEXER = MakeLexer(ATTR_VALUE_LEXER)


class AttrValueLexer(object):
    """
    <a href="foo=99&amp;bar">
    <a href='foo=99&amp;bar'>
    <a href=unquoted>
    """

    def __init__(self, s):
        self.s = s
        self.start_pos = -1  # Invalid
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        """Reuse instances of this object."""
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos

        self.start_pos = start_pos
        self.end_pos = end_pos

    def NumTokens(self):
        num_tokens = 0
        pos = self.start_pos
        for tok_id, end_pos in self.Tokens():
            if tok_id == Tok.Invalid:
                raise LexError(self.s, pos)
            pos = end_pos
            #log('pos %d', pos)
            num_tokens += 1
        return num_tokens

    def Tokens(self):
        pos = self.start_pos
        while pos < self.end_pos:
            # Find the first match, like above.
            # Note: frontend/match.py uses _LongestMatch(), which is different!
            # TODO: reconcile them.  This lexer should be expressible in re2c.
            for pat, tok_id in ATTR_VALUE_LEXER:
                m = pat.match(self.s, pos)
                if m:
                    if 0:
                        tok_str = m.group(0)
                        log('token = %r', tok_str)

                    end_pos = m.end(0)
                    yield tok_id, end_pos
                    pos = end_pos
                    break
            else:
                raise AssertionError('Tok.Invalid rule should have matched')
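

# Illustrative sketch, not part of the original module: count the tokens in
# one attribute value, e.g. to check that its entities are well-formed.
def _DemoAttrValueLexer():
    s = '<a href="foo=99&amp;bar">'
    val_lexer = AttrValueLexer(s)
    val_lexer.Reset(9, 23)  # the span of foo=99&amp;bar
    assert val_lexer.NumTokens() == 3  # RawData, CharEntity, RawData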


def ReadUntilStartTag(it, tag_lexer, tag_name):
    """Find the next <foo>, returning its (start, end) positions

    Raise ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == Tok.StartTag and tag_lexer.TagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No start tag %r' % tag_name)


def ReadUntilEndTag(it, tag_lexer, tag_name):
    """Find the next </foo>, returning its (start, end) position

    Raise ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == Tok.EndTag and tag_lexer.TagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No end tag %r' % tag_name)


CHAR_ENTITY = {
    'amp': '&',
    'lt': '<',
    'gt': '>',
    'quot': '"',
}


def ToText(s, left_pos=0, right_pos=-1):
    """Given HTML, return text by unquoting &gt; and &lt; etc.

    Used by:
      doctools/oils_doc.py: PygmentsPlugin
      doctools/help_gen.py: HelpIndexCards

    In the latter case, we could process some tags, like:

    - Blue Link (not clickable, but still useful)
    - Red X

    That should be html.ToAnsi.
    """
    f = StringIO()
    out = Output(s, f, left_pos, right_pos)

    pos = left_pos
    for tok_id, end_pos in ValidTokens(s, left_pos, right_pos):
        if tok_id == Tok.RawData:
            out.SkipTo(pos)
            out.PrintUntil(end_pos)

        elif tok_id == Tok.CharEntity:  # &amp;

            entity = s[pos + 1:end_pos - 1]

            out.SkipTo(pos)
            out.Print(CHAR_ENTITY[entity])
            out.SkipTo(end_pos)

        # Not handling these yet
        elif tok_id == Tok.HexChar:
            raise AssertionError('Hex Char %r' % s[pos:pos + 20])

        elif tok_id == Tok.DecChar:
            raise AssertionError('Dec Char %r' % s[pos:pos + 20])

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()
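

# Illustrative sketch, not part of the original module: decode entities in a
# span of raw text.  Note that we pass the span inside <p>, not the whole
# document.
def _DemoToText():
    s = '<p>1 &lt; 2</p>'
    assert ToText(s, 3, 11) == '1 < 2'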


# https://developer.mozilla.org/en-US/docs/Glossary/Void_element
VOID_ELEMENTS = [
    'area',
    'base',
    'br',
    'col',
    'embed',
    'hr',
    'img',
    'input',
    'link',
    'meta',
    'param',
    'source',
    'track',
    'wbr',
]

LEX_ATTRS = 1 << 1
LEX_QUOTED_VALUES = 1 << 2  # href="?x=42&amp;y=99"
NO_SPECIAL_TAGS = 1 << 3  # <script> <style>, VOID tags, etc.
BALANCED_TAGS = 1 << 4  # are tags balanced?


def Validate(contents, flags, counters):
    # type: (str, int, Counters) -> None

    tag_lexer = TagLexer(contents)
    val_lexer = AttrValueLexer(contents)

    no_special_tags = bool(flags & NO_SPECIAL_TAGS)
    lx = Lexer(contents, no_special_tags=no_special_tags)
    tokens = []
    start_pos = 0
    tag_stack = []
    while True:
        tok_id, end_pos = lx.Read()
        #log('TOP %s %r', TokenName(tok_id), contents[start_pos:end_pos])

        if tok_id == Tok.Invalid:
            raise LexError(contents, start_pos)
        if tok_id == Tok.EndOfStream:
            break

        tokens.append((tok_id, end_pos))

        if tok_id == Tok.StartEndTag:
            counters.num_start_end_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRawSlice()
            counters.num_attrs += len(all_attrs)
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                counters.num_val_tokens += val_lexer.NumTokens()

            counters.debug_attrs.extend(all_attrs)

        elif tok_id == Tok.StartTag:
            counters.num_start_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRawSlice()
            counters.num_attrs += len(all_attrs)
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                counters.num_val_tokens += val_lexer.NumTokens()

            counters.debug_attrs.extend(all_attrs)

            if flags & BALANCED_TAGS:
                tag_name = lx.TagName()
                if flags & NO_SPECIAL_TAGS:
                    tag_stack.append(tag_name)
                else:
                    # e.g. <meta> is considered self-closing, like <meta/>
                    if tag_name not in VOID_ELEMENTS:
                        tag_stack.append(tag_name)

            counters.max_tag_stack = max(counters.max_tag_stack,
                                         len(tag_stack))
        elif tok_id == Tok.EndTag:
            if flags & BALANCED_TAGS:
                try:
                    expected = tag_stack.pop()
                except IndexError:
                    raise ParseError('Tag stack empty',
                                     s=contents,
                                     start_pos=start_pos)

                actual = lx.TagName()
                if expected != actual:
                    raise ParseError(
                        'Got unexpected closing tag %r; opening tag was %r' %
                        (contents[start_pos:end_pos], expected),
                        s=contents,
                        start_pos=start_pos)

        start_pos = end_pos

    if len(tag_stack) != 0:
        raise ParseError('Missing closing tags at end of doc: %s' %
                         ' '.join(tag_stack),
                         s=contents,
                         start_pos=start_pos)

    counters.num_tokens += len(tokens)


class Counters(object):

    def __init__(self):
        self.num_tokens = 0
        self.num_start_tags = 0
        self.num_start_end_tags = 0
        self.num_attrs = 0
        self.max_tag_stack = 0
        self.num_val_tokens = 0

        self.debug_attrs = []
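

# Illustrative sketch, not part of the original module: validate a document
# for balanced tags, collecting statistics in a Counters instance.
def _DemoValidate():
    counters = Counters()
    flags = LEX_ATTRS | LEX_QUOTED_VALUES | BALANCED_TAGS
    Validate('<p class="big">hi <br/> bye</p>', flags, counters)
    log('%d tokens, %d attrs', counters.num_tokens, counters.num_attrs)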


def main(argv):
    action = argv[1]

    if action == 'tokens':
        contents = sys.stdin.read()

        lx = Lexer(contents)
        start_pos = 0
        while True:
            tok_id, end_pos = lx.Read()
            if tok_id == Tok.Invalid:
                raise LexError(contents, start_pos)
            if tok_id == Tok.EndOfStream:
                break

            frag = contents[start_pos:end_pos]
            log('%d %s %r', end_pos, TokenName(tok_id), frag)
            start_pos = end_pos

        return 0

    elif action in ('lex-htm8', 'parse-htm8', 'parse-xml'):

        errors = []
        counters = Counters()

        flags = LEX_ATTRS | LEX_QUOTED_VALUES
        if action.startswith('parse-'):
            flags |= BALANCED_TAGS
        if action == 'parse-xml':
            flags |= NO_SPECIAL_TAGS

        i = 0
        for line in sys.stdin:
            filename = line.strip()
            with open(filename) as f:
                contents = f.read()

            try:
                Validate(contents, flags, counters)
            except LexError as e:
                log('Lex error in %r: %s', filename, e)
                errors.append((filename, e))
            except ParseError as e:
                log('Parse error in %r: %s', filename, e)
                errors.append((filename, e))
            i += 1

        log('')
        log('%10d tokens', counters.num_tokens)
        log('%10d start/end tags', counters.num_start_end_tags)
        log('%10d start tags', counters.num_start_tags)
        log('%10d attrs', counters.num_attrs)
        log('%10d max tag stack depth', counters.max_tag_stack)
        log('%10d attr val tokens', counters.num_val_tokens)
        log('%10d errors', len(errors))
        if len(errors):
            return 1
        return 0

    elif action == 'todo':
        # Other algorithms:
        #
        # - select first subtree with given ID
        #   - this requires understanding the void tags I suppose
        # - select all subtrees that have a class
        # - materialize DOM

        # Safe-HTM8?  This is a filter
        return 0

    else:
        raise RuntimeError('Invalid action %r' % action)


if __name__ == '__main__':
    sys.exit(main(sys.argv))