#!/usr/bin/env python2
"""
lazylex/html.py - Low-Level HTML Processing.

See lazylex/README.md for details.

Conflicts between HTML5 and XML:

- In XML, <source> is like any tag, and must be closed.
- In HTML, <source> is a VOID tag, and must NOT be closed.

- In XML, <script> and <style> don't have special treatment
- In HTML, they do

- The header is different - <!DOCTYPE html> vs. <?xml version= ... ?>

So do we have a mode for <script> <style> and void tags? Upgrade HX8 into HTM8?
"""
from __future__ import print_function

try:
    from cStringIO import StringIO
except ImportError:
    from io import StringIO  # python3
import re
import sys

if sys.version_info.major == 2:
    from typing import List, Tuple, Optional


def log(msg, *args):
    msg = msg % args
    print(msg, file=sys.stderr)


class LexError(Exception):
    """
    Examples of lex errors:

    - Tok.Invalid, like <> or &&
    - Unclosed <!-- <? <![CDATA[ <script> <style>
    """

    def __init__(self, s, start_pos):
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        return '(LexError %r)' % (self.s[self.start_pos:self.start_pos + 20])


def FindLineNum(s, error_pos):
    current_pos = 0
    line_num = 1
    while True:
        newline_pos = s.find('\n', current_pos)
        #log('current = %d, N %d, line %d', current_pos, newline_pos, line_num)

        if newline_pos == -1:  # this is the last line
            return line_num
        if newline_pos >= error_pos:
            return line_num
        line_num += 1
        current_pos = newline_pos + 1


class ParseError(Exception):
    """
    Examples of parse errors

    - unbalanced tag structure
    - ul_table.py errors
    """

    def __init__(self, msg, s=None, start_pos=-1):
        self.msg = msg
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        if self.s is not None:
            assert self.start_pos != -1, self.start_pos
            snippet = (self.s[self.start_pos:self.start_pos + 20])

            line_num = FindLineNum(self.s, self.start_pos)
        else:
            snippet = ''
            line_num = -1
        msg = 'line %d: %r %r' % (line_num, self.msg, snippet)
        return msg


class Output(object):
    """Takes an underlying input buffer and an output file. Maintains a
    position in the input buffer.

    Print FROM the input or print new text to the output.
    """

    def __init__(self, s, f, left_pos=0, right_pos=-1):
        self.s = s
        self.f = f
        self.pos = left_pos
        self.right_pos = len(s) if right_pos == -1 else right_pos

    def SkipTo(self, pos):
        """Skip to a position."""
        self.pos = pos

    def PrintUntil(self, pos):
        """Print until a position."""
        piece = self.s[self.pos:pos]
        self.f.write(piece)
        self.pos = pos

    def PrintTheRest(self):
        """Print until the end of the string."""
        self.PrintUntil(self.right_pos)

    def Print(self, s):
        """Print text to the underlying buffer."""
        self.f.write(s)

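
# Example (illustrative sketch, not called anywhere): the typical way Output is
# driven.  The string and positions here are made up; the pattern is "copy the
# input up to a span, emit replacement text, skip the span, copy the rest".
def _ExampleOutput():
    s = 'hello WORLD goodbye'
    f = StringIO()
    out = Output(s, f)
    out.PrintUntil(6)  # copy 'hello ' from the input
    out.Print('world')  # emit new text in place of s[6:11]
    out.SkipTo(11)  # skip over 'WORLD' in the input
    out.PrintTheRest()  # copy ' goodbye'
    return f.getvalue()  # 'hello world goodbye'
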
# HTML Tokens
# CommentBegin, ProcessingBegin, CDataBegin are "pseudo-tokens", not visible
TOKENS = 'Decl Comment CommentBegin Processing ProcessingBegin CData CDataBegin StartTag StartEndTag EndTag DecChar HexChar CharEntity RawData HtmlCData Invalid EndOfStream'.split(
)


class Tok(object):
    """
    Avoid lint errors by using these aliases
    """
    pass


TOKEN_NAMES = [None] * len(TOKENS)  # type: List[str]

this_module = sys.modules[__name__]
for i, tok_str in enumerate(TOKENS):
    setattr(this_module, tok_str, i)
    setattr(Tok, tok_str, i)
    TOKEN_NAMES[i] = tok_str


def TokenName(tok_id):
    return TOKEN_NAMES[tok_id]


def MakeLexer(rules):
    return [(re.compile(pat, re.VERBOSE), i) for (pat, i) in rules]


#
# Eggex
#
# Tag = / ~['>']+ /

# Is this valid? A single character?
# Tag = / ~'>'* /

# Maybe better: / [NOT '>']+/
# capital letters not allowed there?
#
# But then this is confusing:
# / [NOT ~digit]+/
#
# / [NOT digit] / is [^\d]
# / ~digit / is \D
#
# Or maybe:
#
# / [~ digit]+ /
# / [~ '>']+ /
# / [NOT '>']+ /

# End = / '</' Tag '>' /
# StartEnd = / '<' Tag '/>' /
# Start = / '<' Tag '>' /
#
# EntityRef = / '&' dot{* N} ';' /

# Tag name, or attribute name
# colon is used in XML

# https://www.w3.org/TR/xml/#NT-Name
# Hm there is a lot of unicode stuff. We are simplifying parsing

_NAME = r'[a-zA-Z][a-zA-Z0-9:_\-]*'  # must start with letter

CHAR_LEX = [
    # Characters
    # https://www.w3.org/TR/xml/#sec-references
    (r'&\# [0-9]+ ;', Tok.DecChar),
    (r'&\# x[0-9a-fA-F]+ ;', Tok.HexChar),
    (r'& %s ;' % _NAME, Tok.CharEntity),
]

LEXER = CHAR_LEX + [
    (r'<!--', Tok.CommentBegin),

    # Processing instructions are used for the XML header:
    # <?xml version="1.0" encoding="UTF-8"?>
    # They are technically XML-only, but in HTML5, they are another kind of
    # comment:
    #
    # https://developer.mozilla.org/en-US/docs/Web/API/ProcessingInstruction
    #
    (r'<\?', Tok.ProcessingBegin),
    # Not necessary in HTML5, but occurs in XML
    (r'<!\[CDATA\[', Tok.CDataBegin),  # <![CDATA[

    # Markup declarations
    # - In HTML5, there is only <!DOCTYPE html>
    # - XML has 4 more declarations: <!ELEMENT ...> ATTLIST ENTITY NOTATION
    # - these seem to be part of DTD
    # - it's useful to skip these, and be able to parse the rest of the document
    # - Note: < is allowed?
    (r'<! [^>\x00]+ >', Tok.Decl),

    # Tags
    # Notes:
    # - We look for a valid tag name, but we don't validate attributes.
    #   That's done in the tag lexer.
    # - We don't allow leading whitespace
    (r'</ (%s) >' % _NAME, Tok.EndTag),  # end tag, e.g. </a>
    # self-closing <br/> comes before StartTag
    # could/should these be collapsed into one rule?
    (r'< (%s) [^>\x00]* />' % _NAME, Tok.StartEndTag),  # self-closing, e.g. <br/>
    (r'< (%s) [^>\x00]* >' % _NAME, Tok.StartTag),  # start tag, e.g. <a>

    # HTML5 allows unescaped > in raw data, but < is not allowed.
    # https://stackoverflow.com/questions/10462348/right-angle-bracket-in-html
    #
    # - My early blog has THREE errors when disallowing >
    # - So do some .wwz files
    (r'[^&<\x00]+', Tok.RawData),
    (r'.', Tok.Invalid),  # error!
]

# Old notes:
#
# Non-greedy matches are regular and can be matched in linear time
# with RE2.
#
# https://news.ycombinator.com/item?id=27099798
#
# Maybe try combining all of these for speed.

# . is any char except newline
# https://re2c.org/manual/manual_c.html

# Discarded options
#(r'<!-- .*? -->', Tok.Comment),

# Hack from Claude: \s\S instead of re.DOTALL. I don't like this
#(r'<!-- [\s\S]*? -->', Tok.Comment),
#(r'<!-- (?:.|[\n])*? -->', Tok.Comment),

LEXER = MakeLexer(LEXER)


class Lexer(object):

    def __init__(self, s, left_pos=0, right_pos=-1, no_special_tags=False):
        self.s = s
        self.pos = left_pos
        self.right_pos = len(s) if right_pos == -1 else right_pos
        self.no_special_tags = no_special_tags

        self.cache = {}  # string -> compiled regex pattern object

        # either </script> or </style> - we search until we see that
        self.search_state = None  # type: Optional[str]

        # Position of tag name, if applicable
        # - Set after you get a StartTag, EndTag, or StartEndTag
        # - Unset on other tags
        self.tag_pos_left = -1
        self.tag_pos_right = -1

    def _Peek(self):
        # type: () -> Tuple[int, int]
        """
        Note: not using _Peek() now
        """
        if self.pos == self.right_pos:
            return Tok.EndOfStream, self.pos

        assert self.pos < self.right_pos, self.pos

        if self.search_state is not None and not self.no_special_tags:
            pos = self.s.find(self.search_state, self.pos)
            if pos == -1:
                # unterminated <script> or <style>
                raise LexError(self.s, self.pos)
            self.search_state = None
            # beginning
            return Tok.HtmlCData, pos

        # Find the first match.
        # Note: frontend/match.py uses _LongestMatch(), which is different!
        # TODO: reconcile them. This lexer should be expressible in re2c.

        for pat, tok_id in LEXER:
            m = pat.match(self.s, self.pos)
            if m:
                if tok_id in (Tok.StartTag, Tok.EndTag, Tok.StartEndTag):
                    self.tag_pos_left = m.start(1)
                    self.tag_pos_right = m.end(1)
                else:
                    # Reset state
                    self.tag_pos_left = -1
                    self.tag_pos_right = -1

                if tok_id == Tok.CommentBegin:
                    pos = self.s.find('-->', self.pos)
                    if pos == -1:
                        # unterminated <!--
                        raise LexError(self.s, self.pos)
                    return Tok.Comment, pos + 3  # -->

                if tok_id == Tok.ProcessingBegin:
                    pos = self.s.find('?>', self.pos)
                    if pos == -1:
                        # unterminated <?
                        raise LexError(self.s, self.pos)
                    return Tok.Processing, pos + 2  # ?>

                if tok_id == Tok.CDataBegin:
                    pos = self.s.find(']]>', self.pos)
                    if pos == -1:
                        # unterminated <![CDATA[
                        raise LexError(self.s, self.pos)
                    return Tok.CData, pos + 3  # ]]>

                if tok_id == Tok.StartTag:
                    if self.TagNameEquals('script'):
                        self.search_state = '</script>'
                    elif self.TagNameEquals('style'):
                        self.search_state = '</style>'

                return tok_id, m.end()
        else:
            raise AssertionError('Tok.Invalid rule should have matched')

    def TagNameEquals(self, expected):
        # type: (str) -> bool
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        # TODO: In C++, this does not need an allocation
        # TODO: conditionally lower() case here (maybe not in XML mode)
        return expected == self.s[self.tag_pos_left:self.tag_pos_right]

    def TagName(self):
        # type: () -> str
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        # TODO: conditionally lower() case here (maybe not in XML mode)
        return self.s[self.tag_pos_left:self.tag_pos_right]

    def Read(self):
        # type: () -> Tuple[int, int]
        tok_id, end_pos = self._Peek()
        self.pos = end_pos  # advance
        return tok_id, end_pos

    def LookAhead(self, regex):
        # Cache the regex compilation. This could also be LookAheadFor(THEAD)
        # or something.
        pat = self.cache.get(regex)
        if pat is None:
            pat = re.compile(regex)
            self.cache[regex] = pat

        m = pat.match(self.s, self.pos)
        return m is not None

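# Example (illustrative sketch, not called anywhere): driving the Lexer
# directly.  Read() returns END positions only; the caller tracks the start.
# The <script> body comes back as a single HtmlCData token because of the
# search_state mechanism above.  The input string is made up.
def _ExampleLexer():
    s = '<script>if (1 < 2) { x(); }</script><p>hi</p>'
    lx = Lexer(s)
    start_pos = 0
    results = []
    while True:
        tok_id, end_pos = lx.Read()
        if tok_id == Tok.EndOfStream:
            break
        results.append((TokenName(tok_id), s[start_pos:end_pos]))
        start_pos = end_pos
    return results
    # [('StartTag', '<script>'), ('HtmlCData', 'if (1 < 2) { x(); }'),
    #  ('EndTag', '</script>'), ('StartTag', '<p>'), ('RawData', 'hi'),
    #  ('EndTag', '</p>')]
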
def _Tokens(s, left_pos, right_pos):
    """
    Args:
      s: string to parse
      left_pos, right_pos: Optional span boundaries.
    """
    lx = Lexer(s, left_pos, right_pos)
    while True:
        tok_id, pos = lx.Read()
        yield tok_id, pos
        if tok_id == Tok.EndOfStream:
            break


def ValidTokens(s, left_pos=0, right_pos=-1):
    """Wrapper around _Tokens to prevent callers from having to handle Invalid.

    I'm not combining the two functions because I might want to do a
    'yield' transformation on Tokens()? Exceptions might complicate the
    issue?
    """
    pos = left_pos
    for tok_id, end_pos in _Tokens(s, left_pos, right_pos):
        if tok_id == Tok.Invalid:
            raise LexError(s, pos)
        yield tok_id, end_pos
        pos = end_pos


def ValidTokenList(s, no_special_tags=False):
    """A wrapper that can be more easily translated to C++. Doesn't use iterators."""

    start_pos = 0
    tokens = []
    lx = Lexer(s, no_special_tags=no_special_tags)
    while True:
        tok_id, end_pos = lx.Read()
        tokens.append((tok_id, end_pos))
        if tok_id == Tok.EndOfStream:
            break
        if tok_id == Tok.Invalid:
            raise LexError(s, start_pos)
        start_pos = end_pos
    return tokens


# Tag names:
# Match <a or </a
# Match <h2, but not <2h
#
# HTML 5 doesn't restrict tag names at all
# https://html.spec.whatwg.org/#toc-syntax
#
# XML allows : - .
# https://www.w3.org/TR/xml/#NT-NameChar

# Namespaces for MathML, SVG
# XLink, XML, XMLNS
#
# https://infra.spec.whatwg.org/#namespaces
#
# Allow - for td-attrs

# Be very lenient - just no whitespace or special HTML chars
# I don't think this is more lenient than HTML5, though we should check.
_UNQUOTED_VALUE = r'''[^ \t\r\n<>&"'\x00]*'''

# TODO: we don't need to capture the tag name here? That's done at the top
# level
_TAG_RE = re.compile(r'/? \s* (%s)' % _NAME, re.VERBOSE)

_TAG_LAST_RE = re.compile(r'\s* /? >', re.VERBOSE)

# To match href="foo"
# Note: in HTML5 and XML, single quoted attributes are also valid

# <button disabled> is standard usage

# NOTE: This used to allow whitespace around =
# <a foo = "bar"> makes sense in XML
# But then you also have
# <a foo= bar> - which is TWO attributes, in HTML5
# So the space is problematic

_ATTR_RE = re.compile(
    r'''
\s+                     # Leading whitespace is required
(%s)                    # Attribute name
(?:                     # Optional attribute value
  =
  (?:
    " ([^>"\x00]*) "    # double quoted value
  | ' ([^>'\x00]*) '    # single quoted value
  | (%s)                # Attribute value
  )
)?
''' % (_NAME, _UNQUOTED_VALUE), re.VERBOSE)

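
# Example (illustrative sketch, not called anywhere): what _ATTR_RE captures
# for the three value styles plus the missing-value case.  Group 1 is the
# attribute name; at most one of groups 2 (double-quoted), 3 (single-quoted),
# or 4 (unquoted) is set, and all three are None for <button disabled>.
def _ExampleAttrRe():
    out = []
    for attr in (' href="x?a=1&amp;b=2"', " href='x'", ' href=x', ' disabled'):
        m = _ATTR_RE.match(attr)
        out.append((m.group(1), m.group(2), m.group(3), m.group(4)))
    return out
    # [('href', 'x?a=1&amp;b=2', None, None),
    #  ('href', None, 'x', None),
    #  ('href', None, None, 'x'),
    #  ('disabled', None, None, None)]
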
TagName, AttrName, UnquotedValue, QuotedValue, MissingValue = range(5)


class TagLexer(object):
    """
    Given a tag like <a href="..."> or <link type="..." />, the TagLexer
    provides a few operations:

    - What is the tag?
    - Iterate through the attributes, giving (name, value_start_pos, value_end_pos)
    """

    def __init__(self, s):
        self.s = s
        self.start_pos = -1  # Invalid
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        """Reuse instances of this object."""
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos

        self.start_pos = start_pos
        self.end_pos = end_pos

    def TagString(self):
        return self.s[self.start_pos:self.end_pos]

    def TagName(self):
        # First event
        tok_id, start, end = next(self.Tokens())
        return self.s[start:end]

    def GetSpanForAttrValue(self, attr_name):
        """
        Used by oils_doc.py, for href shortcuts
        """
        # Algorithm: search for QuotedValue or UnquotedValue after AttrName
        # TODO: Could also cache these

        events = self.Tokens()
        val = (-1, -1)
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == AttrName:
                    name = self.s[start:end]
                    if name == attr_name:
                        # The value should come next
                        tok_id, start, end = next(events)
                        assert tok_id in (QuotedValue, UnquotedValue,
                                          MissingValue), TokenName(tok_id)
                        val = start, end
                        break

        except StopIteration:
            pass
        return val

    def GetAttrRaw(self, attr_name):
        """
        Return the raw value, which may still be escaped (e.g. contain &amp;).
        """
        start, end = self.GetSpanForAttrValue(attr_name)
        if start == -1:
            return None
        return self.s[start:end]

    def AllAttrsRawSlice(self):
        """
        Get a list of (name, start, end) tuples, like [('class', 3, 5), ('href', 9, 12)]
        """
        slices = []
        events = self.Tokens()
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == AttrName:
                    name = self.s[start:end]

                    # The value should come next
                    tok_id, start, end = next(events)
                    assert tok_id in (QuotedValue, UnquotedValue,
                                      MissingValue), TokenName(tok_id)
                    # Note: quoted values may have &amp;
                    # We would need ANOTHER lexer to unescape them, but we
                    # don't need that for ul-table
                    slices.append((name, start, end))
        except StopIteration:
            pass
        return slices

    def AllAttrsRaw(self):
        """
        Get a list of pairs [('class', 'foo'), ('href', '?foo=1&amp;bar=2')]

        The quoted values may be escaped. We would need another lexer to
        unescape them.
        """
        slices = self.AllAttrsRawSlice()
        pairs = []
        for name, start, end in slices:
            pairs.append((name, self.s[start:end]))
        return pairs

    def Tokens(self):
        """
        Yields a sequence of tokens: Tag (AttrName AttrValue?)*

        Where each Token is (Type, start_pos, end_pos)

        Note that start and end are NOT redundant! We skip over some unwanted
        characters.
        """
        m = _TAG_RE.match(self.s, self.start_pos + 1)
        if not m:
            raise RuntimeError("Couldn't find HTML tag in %r" %
                               self.TagString())
        yield TagName, m.start(1), m.end(1)

        pos = m.end(0)
        #log('POS %d', pos)

        while True:
            # don't search past the end
            m = _ATTR_RE.match(self.s, pos, self.end_pos)
            if not m:
                #log('BREAK pos %d', pos)
                break
            #log('AttrName %r', m.group(1))

            yield AttrName, m.start(1), m.end(1)

            #log('m.groups() %r', m.groups())
            if m.group(2) is not None:
                # double quoted
                yield QuotedValue, m.start(2), m.end(2)
            elif m.group(3) is not None:
                # single quoted - TODO: could have different token types
                yield QuotedValue, m.start(3), m.end(3)
            elif m.group(4) is not None:
                yield UnquotedValue, m.start(4), m.end(4)
            else:
                # <button disabled>
                end = m.end(0)
                yield MissingValue, end, end

            # Skip past the end of the attribute match
            pos = m.end(0)

        #log('TOK %r', self.s)

        m = _TAG_LAST_RE.match(self.s, pos)
        #log('_TAG_LAST_RE match %r', self.s[pos:])
        if not m:
            # Extra data at end of tag. TODO: add messages for all these.
            raise LexError(self.s, pos)

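# Example (illustrative sketch, not called anywhere): using TagLexer on the
# span of a single start tag.  In real callers the (start_pos, end_pos) span
# comes from the top-level Lexer; here it's just the whole (made-up) string.
def _ExampleTagLexer():
    s = '<a href="/index.html?a=1&amp;b=2" class=big>'
    tag_lexer = TagLexer(s)
    tag_lexer.Reset(0, len(s))
    name = tag_lexer.TagName()  # 'a'
    href = tag_lexer.GetAttrRaw('href')  # '/index.html?a=1&amp;b=2' - still escaped
    attrs = tag_lexer.AllAttrsRaw()  # [('href', '...'), ('class', 'big')]
    return name, href, attrs
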
# This is similar but not identical to
# " ([^>"\x00]*) " # double quoted value
# | ' ([^>'\x00]*) ' # single quoted value
#
# Note: for unquoted values, & isn't allowed, and thus &amp; and &#99; and
# &#x99; are not allowed. We could relax that?
ATTR_VALUE_LEXER = CHAR_LEX + [
    (r'[^>&\x00]+', Tok.RawData),
    (r'.', Tok.Invalid),
]

ATTR_VALUE_LEXER = MakeLexer(ATTR_VALUE_LEXER)


class AttrValueLexer(object):
    """
    <a href="foo=99&amp;bar">
    <a href='foo=99&amp;bar'>
    <a href=unquoted>
    """

    def __init__(self, s):
        self.s = s
        self.start_pos = -1  # Invalid
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        """Reuse instances of this object."""
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos

        self.start_pos = start_pos
        self.end_pos = end_pos

    def NumTokens(self):
        num_tokens = 0
        pos = self.start_pos
        for tok_id, end_pos in self.Tokens():
            if tok_id == Tok.Invalid:
                raise LexError(self.s, pos)
            pos = end_pos
            #log('pos %d', pos)
            num_tokens += 1
        return num_tokens

    def Tokens(self):
        pos = self.start_pos
        while pos < self.end_pos:
            # Find the first match, like above.
            # Note: frontend/match.py uses _LongestMatch(), which is different!
            # TODO: reconcile them. This lexer should be expressible in re2c.
            for pat, tok_id in ATTR_VALUE_LEXER:
                m = pat.match(self.s, pos)
                if m:
                    if 0:
                        tok_str = m.group(0)
                        log('token = %r', tok_str)

                    end_pos = m.end(0)
                    yield tok_id, end_pos
                    pos = end_pos
                    break
            else:
                raise AssertionError('Tok.Invalid rule should have matched')

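# Example (illustrative sketch, not called anywhere): validating a single
# attribute value.  The (start, end) span would normally come from
# TagLexer.AllAttrsRawSlice(); here it covers a whole made-up string.
def _ExampleAttrValueLexer():
    s = 'x=1&amp;y=2'
    val_lexer = AttrValueLexer(s)
    val_lexer.Reset(0, len(s))
    # 3 tokens: RawData 'x=1', CharEntity '&amp;', RawData 'y=2'
    return val_lexer.NumTokens()
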
def ReadUntilStartTag(it, tag_lexer, tag_name):
    """Find the next <foo>, returning its (start, end) positions

    Raise ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == Tok.StartTag and tag_lexer.TagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No start tag %r' % tag_name)


def ReadUntilEndTag(it, tag_lexer, tag_name):
    """Find the next </foo>, returning its (start, end) position

    Raise ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == Tok.EndTag and tag_lexer.TagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No end tag %r' % tag_name)

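# Example (illustrative sketch, not called anywhere): slicing out the body of
# the first <ul> ... </ul> in a made-up document.  Both calls share one token
# iterator, so the end-tag search continues where the start-tag search stopped.
def _ExampleReadUntil():
    s = '<p>intro</p> <ul><li>one</li></ul> <p>outro</p>'
    tag_lexer = TagLexer(s)
    it = ValidTokens(s)
    _, ul_start_end = ReadUntilStartTag(it, tag_lexer, 'ul')
    ul_end_start, _ = ReadUntilEndTag(it, tag_lexer, 'ul')
    return s[ul_start_end:ul_end_start]  # '<li>one</li>'
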
CHAR_ENTITY = {
    'amp': '&',
    'lt': '<',
    'gt': '>',
    'quot': '"',
}


def ToText(s, left_pos=0, right_pos=-1):
    """Given HTML, return text by unquoting &gt; and &lt; etc.

    Used by:
      doctools/oils_doc.py: PygmentsPlugin
      doctools/help_gen.py: HelpIndexCards

    In the latter case, we could process some tags, like:

    - Blue Link (not clickable, but still useful)
    - Red X

    That should be html.ToAnsi.
    """
    f = StringIO()
    out = Output(s, f, left_pos, right_pos)

    pos = left_pos
    for tok_id, end_pos in ValidTokens(s, left_pos, right_pos):
        if tok_id == Tok.RawData:
            out.SkipTo(pos)
            out.PrintUntil(end_pos)

        elif tok_id == Tok.CharEntity:  # &amp;

            entity = s[pos + 1:end_pos - 1]

            out.SkipTo(pos)
            out.Print(CHAR_ENTITY[entity])
            out.SkipTo(end_pos)

        # Not handling these yet
        elif tok_id == Tok.HexChar:
            raise AssertionError('Hex Char %r' % s[pos:pos + 20])

        elif tok_id == Tok.DecChar:
            raise AssertionError('Dec Char %r' % s[pos:pos + 20])

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()

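# Example (illustrative sketch, not called anywhere): ToText() expands the
# entities in CHAR_ENTITY and skips over the markup between text runs.
def _ExampleToText():
    return ToText('<b>x &amp; y</b> &lt;ok&gt;')  # 'x & y <ok>'
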
# https://developer.mozilla.org/en-US/docs/Glossary/Void_element
VOID_ELEMENTS = [
    'area',
    'base',
    'br',
    'col',
    'embed',
    'hr',
    'img',
    'input',
    'link',
    'meta',
    'param',
    'source',
    'track',
    'wbr',
]

LEX_ATTRS = 1 << 1
LEX_QUOTED_VALUES = 1 << 2  # href="?x=42&amp;y=99"
NO_SPECIAL_TAGS = 1 << 3  # <script> <style>, VOID tags, etc.
BALANCED_TAGS = 1 << 4  # are tags balanced?


def Validate(contents, flags, counters):
    # type: (str, int, Counters) -> None

    tag_lexer = TagLexer(contents)
    val_lexer = AttrValueLexer(contents)

    no_special_tags = bool(flags & NO_SPECIAL_TAGS)
    lx = Lexer(contents, no_special_tags=no_special_tags)
    tokens = []
    start_pos = 0
    tag_stack = []
    while True:
        tok_id, end_pos = lx.Read()
        #log('TOP %s %r', TokenName(tok_id), contents[start_pos:end_pos])

        if tok_id == Tok.Invalid:
            raise LexError(contents, start_pos)
        if tok_id == Tok.EndOfStream:
            break

        tokens.append((tok_id, end_pos))

        if tok_id == Tok.StartEndTag:
            counters.num_start_end_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRawSlice()
            counters.num_attrs += len(all_attrs)
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                counters.num_val_tokens += val_lexer.NumTokens()

            counters.debug_attrs.extend(all_attrs)

        elif tok_id == Tok.StartTag:
            counters.num_start_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRawSlice()
            counters.num_attrs += len(all_attrs)
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                counters.num_val_tokens += val_lexer.NumTokens()

            counters.debug_attrs.extend(all_attrs)

            if flags & BALANCED_TAGS:
                tag_name = lx.TagName()
                if flags & NO_SPECIAL_TAGS:
                    tag_stack.append(tag_name)
                else:
                    # e.g. <meta> is considered self-closing, like <meta/>
                    if tag_name not in VOID_ELEMENTS:
                        tag_stack.append(tag_name)

                counters.max_tag_stack = max(counters.max_tag_stack,
                                             len(tag_stack))
        elif tok_id == Tok.EndTag:
            if flags & BALANCED_TAGS:
                try:
                    expected = tag_stack.pop()
                except IndexError:
                    raise ParseError('Tag stack empty',
                                     s=contents,
                                     start_pos=start_pos)

                actual = lx.TagName()
                if expected != actual:
                    raise ParseError(
                        'Got unexpected closing tag %r; opening tag was %r' %
                        (contents[start_pos:end_pos], expected),
                        s=contents,
                        start_pos=start_pos)

        start_pos = end_pos

    if len(tag_stack) != 0:
        raise ParseError('Missing closing tags at end of doc: %s' %
                         ' '.join(tag_stack),
                         s=contents,
                         start_pos=start_pos)

    counters.num_tokens += len(tokens)


class Counters(object):

    def __init__(self):
        self.num_tokens = 0
        self.num_start_tags = 0
        self.num_start_end_tags = 0
        self.num_attrs = 0
        self.max_tag_stack = 0
        self.num_val_tokens = 0

        self.debug_attrs = []

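# Example (illustrative sketch, not called anywhere): running the validator
# over a small made-up document.  With BALANCED_TAGS, an unmatched tag raises
# ParseError; void elements like <br> don't need a closing tag unless
# NO_SPECIAL_TAGS (XML mode) is also set.
def _ExampleValidate():
    counters = Counters()
    flags = LEX_ATTRS | LEX_QUOTED_VALUES | BALANCED_TAGS
    Validate('<p>one<br>two</p>', flags, counters)
    return counters.num_start_tags, counters.num_attrs  # (2, 0)
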
def main(argv):
    action = argv[1]

    if action == 'tokens':
        contents = sys.stdin.read()

        lx = Lexer(contents)
        start_pos = 0
        while True:
            tok_id, end_pos = lx.Read()
            if tok_id == Tok.Invalid:
                raise LexError(contents, start_pos)
            if tok_id == Tok.EndOfStream:
                break

            frag = contents[start_pos:end_pos]
            log('%d %s %r', end_pos, TokenName(tok_id), frag)
            start_pos = end_pos

        return 0

    elif action in ('lex-htm8', 'parse-htm8', 'parse-xml'):

        errors = []
        counters = Counters()

        flags = LEX_ATTRS | LEX_QUOTED_VALUES
        if action.startswith('parse-'):
            flags |= BALANCED_TAGS
        if action == 'parse-xml':
            flags |= NO_SPECIAL_TAGS

        i = 0
        for line in sys.stdin:
            filename = line.strip()
            with open(filename) as f:
                contents = f.read()

            try:
                Validate(contents, flags, counters)
            except LexError as e:
                log('Lex error in %r: %s', filename, e)
                errors.append((filename, e))
            except ParseError as e:
                log('Parse error in %r: %s', filename, e)
                errors.append((filename, e))
            i += 1

        log('')
        log('%10d tokens', counters.num_tokens)
        log('%10d start/end tags', counters.num_start_end_tags)
        log('%10d start tags', counters.num_start_tags)
        log('%10d attrs', counters.num_attrs)
        log('%10d max tag stack depth', counters.max_tag_stack)
        log('%10d attr val tokens', counters.num_val_tokens)
        log('%10d errors', len(errors))
        if len(errors):
            return 1
        return 0

    elif action == 'todo':
        # Other algorithms:
        #
        # - select first subtree with given ID
        #   - this requires understanding the void tags I suppose
        # - select all subtrees that have a class
        # - materialize DOM

        # Safe-HTM8? This is a filter
        return 0

    else:
        raise RuntimeError('Invalid action %r' % action)


if __name__ == '__main__':
    sys.exit(main(sys.argv))