#!/usr/bin/env python2
"""
lazylex/html.py - Low-Level HTML Processing.

See lazylex/README.md for details.
"""
from __future__ import print_function

try:
    from cStringIO import StringIO
except ImportError:
    from io import StringIO  # python3
import re
import sys

if sys.version_info.major == 2:
    from typing import List, Tuple, Optional


def log(msg, *args):
    msg = msg % args
    print(msg, file=sys.stderr)


class LexError(Exception):
    """
    Examples of lex errors:

    - Tok.Invalid, like <> or &&
    - Unclosed <!--  <?  <![CDATA[  <script>  <style>
    """

    def __init__(self, s, start_pos):
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        return '(LexError %r)' % (self.s[self.start_pos:self.start_pos + 20])


def FindLineNum(s, error_pos):
    current_pos = 0
    line_num = 1
    while True:
        newline_pos = s.find('\n', current_pos)
        #log('current = %d, N %d, line %d', current_pos, newline_pos, line_num)

        if newline_pos == -1:  # this is the last line
            return line_num
        if newline_pos >= error_pos:
            return line_num
        line_num += 1
        current_pos = newline_pos + 1

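
# A quick sanity check for FindLineNum(), as a sketch; _ExampleFindLineNum is
# a hypothetical name, not part of the original module.
def _ExampleFindLineNum():
    s = 'one\ntwo\nthree\n'
    assert FindLineNum(s, 0) == 1  # 'o' in 'one' is on line 1
    assert FindLineNum(s, 4) == 2  # 't' in 'two' is on line 2
    assert FindLineNum(s, 8) == 3  # 't' in 'three' is on line 3
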

class ParseError(Exception):
    """
    Examples of parse errors:

    - unbalanced tag structure
    - ul_table.py errors
    """

    def __init__(self, msg, s=None, start_pos=-1):
        self.msg = msg
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        if self.s is not None:
            assert self.start_pos != -1, self.start_pos
            snippet = (self.s[self.start_pos:self.start_pos + 20])

            line_num = FindLineNum(self.s, self.start_pos)
        else:
            snippet = ''
            line_num = -1
        msg = 'line %d: %r %r' % (line_num, self.msg, snippet)
        return msg


class Output(object):
    """Takes an underlying input buffer and an output file.  Maintains a
    position in the input buffer.

    Print FROM the input, or print new text to the output.
    """

    def __init__(self, s, f, left_pos=0, right_pos=-1):
        self.s = s
        self.f = f
        self.pos = left_pos
        self.right_pos = len(s) if right_pos == -1 else right_pos

    def SkipTo(self, pos):
        """Skip to a position."""
        self.pos = pos

    def PrintUntil(self, pos):
        """Print until a position."""
        piece = self.s[self.pos:pos]
        self.f.write(piece)
        self.pos = pos

    def PrintTheRest(self):
        """Print until the end of the string."""
        self.PrintUntil(self.right_pos)

    def Print(self, s):
        """Print text to the underlying buffer."""
        self.f.write(s)

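
# Sketch of how Output is driven -- the same SkipTo/PrintUntil pattern that
# ToText() and ToXml() use at the bottom of this file.  _ExampleOutputUsage
# is a hypothetical name.
def _ExampleOutputUsage():
    s = '<b>hi</b>'
    f = StringIO()
    out = Output(s, f)
    out.PrintUntil(3)    # copy '<b>' from the input
    out.Print('HI')      # emit new text
    out.SkipTo(5)        # skip over 'hi' in the input
    out.PrintTheRest()   # copy '</b>'
    return f.getvalue()  # '<b>HI</b>'
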

# HTML Tokens
# CommentBegin, ProcessingBegin, CDataBegin are "pseudo-tokens", not visible
TOKENS = 'Decl Comment CommentBegin Processing ProcessingBegin CData CDataBegin StartTag StartEndTag EndTag DecChar HexChar CharEntity RawData HtmlCData BadAmpersand BadGreaterThan BadLessThan Invalid EndOfStream'.split()


class Tok(object):
    """
    Avoid lint errors by using these aliases
    """
    pass


TOKEN_NAMES = [None] * len(TOKENS)  # type: List[str]

this_module = sys.modules[__name__]
for i, tok_str in enumerate(TOKENS):
    setattr(this_module, tok_str, i)
    setattr(Tok, tok_str, i)
    TOKEN_NAMES[i] = tok_str


def TokenName(tok_id):
    return TOKEN_NAMES[tok_id]


def MakeLexer(rules):
    return [(re.compile(pat, re.VERBOSE), i) for (pat, i) in rules]


#
# Eggex
#
# Tag = / ~['>']+ /

# Is this valid?  A single character?
# Tag = / ~'>'* /

# Maybe better: / [NOT '>']+ /
# capital letters not allowed there?
#
# But then this is confusing:
#  / [NOT ~digit]+ /
#
#  / [NOT digit] / is [^\d]
#  / ~digit /      is \D
#
# Or maybe:
#
#  / [~ digit]+ /
#  / [~ '>']+ /
#  / [NOT '>']+ /

# End = / '</' Tag '>' /
# StartEnd = / '<' Tag '/>' /
# Start = / '<' Tag '>' /
#
# EntityRef = / '&' dot{* N} ';' /

# Tag name, or attribute name
# colon is used in XML

# https://www.w3.org/TR/xml/#NT-Name
# Hm there is a lot of unicode stuff.  We are simplifying parsing.

_NAME = r'[a-zA-Z][a-zA-Z0-9:_\-]*'  # must start with letter

CHAR_LEX = [
    # Characters
    # https://www.w3.org/TR/xml/#sec-references
    (r'&\# [0-9]+ ;', Tok.DecChar),
    (r'&\# x[0-9a-fA-F]+ ;', Tok.HexChar),
    (r'& %s ;' % _NAME, Tok.CharEntity),
    # Allow unquoted, and quoted
    (r'&', Tok.BadAmpersand),
]

LEXER = CHAR_LEX + [
    (r'<!--', Tok.CommentBegin),

    # Processing instructions are used for the XML header:
    #   <?xml version="1.0" encoding="UTF-8"?>
    # They are technically XML-only, but in HTML5, they are another kind of
    # comment:
    #
    #   https://developer.mozilla.org/en-US/docs/Web/API/ProcessingInstruction
    #
    (r'<\?', Tok.ProcessingBegin),
    # Not necessary in HTML5, but occurs in XML
    (r'<!\[CDATA\[', Tok.CDataBegin),  # <![CDATA[

    # Markup declarations
    # - In HTML5, there is only <!DOCTYPE html>
    # - XML has 4 more declarations: <!ELEMENT ...> ATTLIST ENTITY NOTATION
    #   - these seem to be part of DTD
    # - it's useful to skip these, and be able to parse the rest of the document
    # - Note: < is allowed?
    (r'<! [^>\x00]+ >', Tok.Decl),

    # Tags
    # Notes:
    # - We look for a valid tag name, but we don't validate attributes.
    #   That's done in the tag lexer.
    # - We don't allow leading whitespace
    (r'</ (%s) >' % _NAME, Tok.EndTag),
    # self-closing <br/> comes before StartTag
    # could/should these be collapsed into one rule?
    (r'< (%s) [^>\x00]* />' % _NAME, Tok.StartEndTag),  # self-closing <br/>
    (r'< (%s) [^>\x00]* >' % _NAME, Tok.StartTag),  # start <a>

    # HTML5 allows unescaped > in raw data, but < is not allowed.
    # https://stackoverflow.com/questions/10462348/right-angle-bracket-in-html
    #
    # - My early blog has THREE errors when disallowing >
    # - So do some .wwz files
    (r'[^&<>\x00]+', Tok.RawData),
    (r'>', Tok.BadGreaterThan),
    # < is an error
    (r'.', Tok.Invalid),
]

# Old notes:
#
# Non-greedy matches are regular and can be matched in linear time
# with RE2.
#
# https://news.ycombinator.com/item?id=27099798
#
# Maybe try combining all of these for speed.

# . is any char except newline
# https://re2c.org/manual/manual_c.html

# Discarded options
#(r'<!-- .*? -->', Tok.Comment),

# Hack from Claude: \s\S instead of re.DOTALL.  I don't like this
#(r'<!-- [\s\S]*? -->', Tok.Comment),
#(r'<!-- (?:.|[\n])*? -->', Tok.Comment),

LEXER = MakeLexer(LEXER)


class Lexer(object):

    def __init__(self, s, left_pos=0, right_pos=-1, no_special_tags=False):
        self.s = s
        self.pos = left_pos
        self.right_pos = len(s) if right_pos == -1 else right_pos
        self.no_special_tags = no_special_tags

        self.cache = {}  # string -> compiled regex pattern object

        # either </script> or </style> - we search until we see that
        self.search_state = None  # type: Optional[str]

        # Position of tag name, if applicable
        # - Set after you get a StartTag, EndTag, or StartEndTag
        # - Unset on other tags
        self.tag_pos_left = -1
        self.tag_pos_right = -1

    def _Peek(self):
        # type: () -> Tuple[int, int]
        """Return (tok_id, end_pos) of the next token, without advancing."""
        if self.pos == self.right_pos:
            return Tok.EndOfStream, self.pos

        assert self.pos < self.right_pos, self.pos

        if self.search_state is not None and not self.no_special_tags:
            # TODO: case-insensitive search for </SCRIPT> <SCRipt> ?
            #
            # Another strategy: enter a mode where we find ONLY the end tag
            # regex, and any data that's not <, and then check the canonical
            # tag name for 'script' or 'style'.
            pos = self.s.find(self.search_state, self.pos)
            if pos == -1:
                # unterminated <script> or <style>
                raise LexError(self.s, self.pos)
            self.search_state = None
            # beginning
            return Tok.HtmlCData, pos

        # Find the first match.
        # Note: frontend/match.py uses _LongestMatch(), which is different!
        # TODO: reconcile them.  This lexer should be expressible in re2c.

        for pat, tok_id in LEXER:
            m = pat.match(self.s, self.pos)
            if m:
                if tok_id in (Tok.StartTag, Tok.EndTag, Tok.StartEndTag):
                    self.tag_pos_left = m.start(1)
                    self.tag_pos_right = m.end(1)
                else:
                    # Reset state
                    self.tag_pos_left = -1
                    self.tag_pos_right = -1

                if tok_id == Tok.CommentBegin:
                    pos = self.s.find('-->', self.pos)
                    if pos == -1:
                        # unterminated <!--
                        raise LexError(self.s, self.pos)
                    return Tok.Comment, pos + 3  # -->

                if tok_id == Tok.ProcessingBegin:
                    pos = self.s.find('?>', self.pos)
                    if pos == -1:
                        # unterminated <?
                        raise LexError(self.s, self.pos)
                    return Tok.Processing, pos + 2  # ?>

                if tok_id == Tok.CDataBegin:
                    pos = self.s.find(']]>', self.pos)
                    if pos == -1:
                        # unterminated <![CDATA[
                        raise LexError(self.s, self.pos)
                    return Tok.CData, pos + 3  # ]]>

                if tok_id == Tok.StartTag:
                    # TODO: reduce allocations
                    if (self.TagNameEquals('script') or
                            self.TagNameEquals('style')):
                        # <SCRipt a=b> -> </SCRipt>
                        self.search_state = '</' + self._LiteralTagName() + '>'

                return tok_id, m.end()
        else:
            raise AssertionError('Tok.Invalid rule should have matched')

    def TagNameEquals(self, expected):
        # type: (str) -> bool
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        # TODO: In C++, this does not need an allocation.  Can we test
        # directly?
        return expected == self.CanonicalTagName()

    def _LiteralTagName(self):
        # type: () -> str
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        return self.s[self.tag_pos_left:self.tag_pos_right]

    def CanonicalTagName(self):
        # type: () -> str
        tag_name = self._LiteralTagName()
        # Most tags are already lower case, so avoid allocation with this
        # conditional
        # TODO: this could go in the mycpp runtime?
        if tag_name.islower():
            return tag_name
        else:
            return tag_name.lower()

    def Read(self):
        # type: () -> Tuple[int, int]
        tok_id, end_pos = self._Peek()
        self.pos = end_pos  # advance
        return tok_id, end_pos

    def LookAhead(self, regex):
        # Cache the regex compilation.  This could also be LookAheadFor(THEAD)
        # or something.
        pat = self.cache.get(regex)
        if pat is None:
            pat = re.compile(regex)
            self.cache[regex] = pat

        m = pat.match(self.s, self.pos)
        return m is not None

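
# Sketch of driving the Lexer directly; _ExampleLexerUsage is a hypothetical
# name.  Read() returns (tok_id, end_pos) pairs -- a token's start is the
# previous end_pos -- and LookAhead() tests a regex at the current position
# without consuming anything.
def _ExampleLexerUsage():
    s = '<ul><li>one</li></ul>'
    lx = Lexer(s)
    assert lx.LookAhead(r'<ul>')  # we are at the start
    tok_id, end_pos = lx.Read()   # consumes '<ul>'
    assert tok_id == Tok.StartTag, TokenName(tok_id)
    assert end_pos == 4, end_pos
    assert lx.LookAhead(r'<li>')  # now <li> is next
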

def _Tokens(s, left_pos, right_pos):
    """
    Args:
      s: string to parse
      left_pos, right_pos: Optional span boundaries.
    """
    lx = Lexer(s, left_pos, right_pos)
    while True:
        tok_id, pos = lx.Read()
        yield tok_id, pos
        if tok_id == Tok.EndOfStream:
            break


def ValidTokens(s, left_pos=0, right_pos=-1):
    """Wrapper around _Tokens to prevent callers from having to handle Invalid.

    I'm not combining the two functions, because I might want to do a 'yield'
    transformation on Tokens()?  Exceptions might complicate that.
    """
    pos = left_pos
    for tok_id, end_pos in _Tokens(s, left_pos, right_pos):
        if tok_id == Tok.Invalid:
            raise LexError(s, pos)
        yield tok_id, end_pos
        pos = end_pos


def ValidTokenList(s, no_special_tags=False):
    """A wrapper that can be more easily translated to C++.  Doesn't use
    iterators."""

    start_pos = 0
    tokens = []
    lx = Lexer(s, no_special_tags=no_special_tags)
    while True:
        tok_id, end_pos = lx.Read()
        tokens.append((tok_id, end_pos))
        if tok_id == Tok.EndOfStream:
            break
        if tok_id == Tok.Invalid:
            raise LexError(s, start_pos)
        start_pos = end_pos
    return tokens

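
# Sketch showing how to recover token text: tokens carry only end positions,
# so each token's span runs from the previous end_pos to its own.
# _ExampleTokenSpans is a hypothetical name.
def _ExampleTokenSpans():
    s = '<p>hi</p>'
    start_pos = 0
    for tok_id, end_pos in ValidTokens(s):
        if tok_id == Tok.EndOfStream:
            break
        frag = s[start_pos:end_pos]  # '<p>', then 'hi', then '</p>'
        log('%s %r', TokenName(tok_id), frag)
        start_pos = end_pos
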

# Tag names:
#   Match <a  or </a
#   Match <h2, but not <2h
#
# HTML 5 doesn't restrict tag names at all
#   https://html.spec.whatwg.org/#toc-syntax
#
# XML allows : - .
#   https://www.w3.org/TR/xml/#NT-NameChar

# Namespaces for MathML, SVG
# XLink, XML, XMLNS
#
# https://infra.spec.whatwg.org/#namespaces
#
# Allow - for td-attrs

# Be very lenient - just no whitespace or special HTML chars
# I don't think this is more lenient than HTML5, though we should check.
_UNQUOTED_VALUE = r'''[^ \t\r\n<>&"'\x00]*'''

# TODO: we don't need to capture the tag name here?  That's done at the top
# level
_TAG_RE = re.compile(r'/? \s* (%s)' % _NAME, re.VERBOSE)

_TAG_LAST_RE = re.compile(r'\s* /? >', re.VERBOSE)

# To match href="foo"
# Note: in HTML5 and XML, single quoted attributes are also valid

# <button disabled> is standard usage

# NOTE: This used to allow whitespace around =
# <a foo = "bar"> makes sense in XML
# But then you also have
# <a foo= bar> - which is TWO attributes, in HTML5
# So the space is problematic

_ATTR_RE = re.compile(
    r'''
\s+                     # Leading whitespace is required
(%s)                    # Attribute name
(?:                     # Optional attribute value
  \s* = \s*             # Spaces allowed around =
  (?:
    " ([^>"\x00]*) "    # double quoted value
  | ' ([^>'\x00]*) '    # single quoted value
  | (%s)                # unquoted value
  )
)?
''' % (_NAME, _UNQUOTED_VALUE), re.VERBOSE)

TagName, AttrName, UnquotedValue, QuotedValue, MissingValue = range(5)


class TagLexer(object):
    """
    Given a tag like <a href="..."> or <link type="..." />, the TagLexer
    provides a few operations:

    - What is the tag?
    - Iterate through the attributes, giving (name, value_start_pos,
      value_end_pos)
    """

    def __init__(self, s):
        self.s = s
        self.start_pos = -1  # Invalid
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        """Reuse instances of this object."""
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos

        self.start_pos = start_pos
        self.end_pos = end_pos

    def TagString(self):
        return self.s[self.start_pos:self.end_pos]

    def TagName(self):
        # First event
        tok_id, start, end = next(self.Tokens())
        return self.s[start:end]

    def GetSpanForAttrValue(self, attr_name):
        """
        Used by oils_doc.py, for href shortcuts
        """
        # Algorithm: search for QuotedValue or UnquotedValue after AttrName
        # TODO: Could also cache these

        events = self.Tokens()
        val = (-1, -1)
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == AttrName:
                    name = self.s[start:end]
                    if name == attr_name:
                        # The value should come next
                        tok_id, start, end = next(events)
                        assert tok_id in (QuotedValue, UnquotedValue,
                                          MissingValue), TokenName(tok_id)
                        val = start, end
                        break

        except StopIteration:
            pass
        return val

    def GetAttrRaw(self, attr_name):
        """
        Return the value, which may be UNESCAPED.
        """
        start, end = self.GetSpanForAttrValue(attr_name)
        if start == -1:
            return None
        return self.s[start:end]

    def AllAttrsRawSlice(self):
        """
        Get a list of (name, start, end) triples, like
        [('class', 3, 5), ('href', 9, 12)]
        """
        slices = []
        events = self.Tokens()
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == AttrName:
                    name = self.s[start:end]

                    # The value should come next
                    tok_id, start, end = next(events)
                    assert tok_id in (QuotedValue, UnquotedValue,
                                      MissingValue), TokenName(tok_id)
                    # Note: quoted values may have &amp;
                    # We would need ANOTHER lexer to unescape them, but we
                    # don't need that for ul-table
                    slices.append((name, start, end))
        except StopIteration:
            pass
        return slices

    def AllAttrsRaw(self):
        """
        Get a list of pairs [('class', 'foo'), ('href', '?foo=1&amp;bar=2')]

        The quoted values may be escaped.  We would need another lexer to
        unescape them.
        """
        slices = self.AllAttrsRawSlice()
        pairs = []
        for name, start, end in slices:
            pairs.append((name, self.s[start:end]))
        return pairs

    def Tokens(self):
        """
        Yields a sequence of tokens: Tag (AttrName AttrValue?)*

        Where each Token is (Type, start_pos, end_pos)

        Note that start and end are NOT redundant!  We skip over some unwanted
        characters.
        """
        m = _TAG_RE.match(self.s, self.start_pos + 1)
        if not m:
            raise RuntimeError("Couldn't find HTML tag in %r" %
                               self.TagString())
        yield TagName, m.start(1), m.end(1)

        pos = m.end(0)
        #log('POS %d', pos)

        while True:
            # don't search past the end
            m = _ATTR_RE.match(self.s, pos, self.end_pos)
            if not m:
                #log('BREAK pos %d', pos)
                break
            #log('AttrName %r', m.group(1))

            yield AttrName, m.start(1), m.end(1)

            #log('m.groups() %r', m.groups())
            if m.group(2) is not None:
                # double quoted
                yield QuotedValue, m.start(2), m.end(2)
            elif m.group(3) is not None:
                # single quoted - TODO: could have different token types
                yield QuotedValue, m.start(3), m.end(3)
            elif m.group(4) is not None:
                yield UnquotedValue, m.start(4), m.end(4)
            else:
                # <button disabled>
                end = m.end(0)
                yield MissingValue, end, end

            # Skip past the closing quote
            pos = m.end(0)

        #log('TOK %r', self.s)

        m = _TAG_LAST_RE.match(self.s, pos)
        #log('_TAG_LAST_RE match %r', self.s[pos:])
        if not m:
            # Extra data at end of tag.  TODO: add messages for all these.
            raise LexError(self.s, pos)

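
# Sketch of typical TagLexer usage, with the hypothetical name
# _ExampleTagLexer: Reset() it to the span of a StartTag token, then query it.
def _ExampleTagLexer():
    s = '<a href="/index.html" class="big">'
    tag_lexer = TagLexer(s)
    tag_lexer.Reset(0, len(s))  # the span of the whole <a ...> token
    assert tag_lexer.TagName() == 'a'
    assert tag_lexer.GetAttrRaw('href') == '/index.html'
    assert tag_lexer.GetAttrRaw('nope') is None
    # [('href', '/index.html'), ('class', 'big')]
    return tag_lexer.AllAttrsRaw()
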

# This is similar but not identical to
#    " ([^>"\x00]*) "        # double quoted value
#  | ' ([^>'\x00]*) '        # single quoted value
#
# Note: for unquoted values, & isn't allowed, and thus &amp; and &#99; and
# &#x99; are not allowed.  We could relax that?
ATTR_VALUE_LEXER = CHAR_LEX + [
    (r'[^>&\x00]+', Tok.RawData),
    (r'.', Tok.Invalid),
]

ATTR_VALUE_LEXER = MakeLexer(ATTR_VALUE_LEXER)


class AttrValueLexer(object):
    """
    <a href="foo=99&amp;bar">
    <a href='foo=99&amp;bar'>
    <a href=unquoted>
    """

    def __init__(self, s):
        self.s = s
        self.start_pos = -1  # Invalid
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        """Reuse instances of this object."""
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos

        self.start_pos = start_pos
        self.end_pos = end_pos

    def NumTokens(self):
        num_tokens = 0
        pos = self.start_pos
        for tok_id, end_pos in self.Tokens():
            if tok_id == Tok.Invalid:
                raise LexError(self.s, pos)
            pos = end_pos
            #log('pos %d', pos)
            num_tokens += 1
        return num_tokens

    def Tokens(self):
        pos = self.start_pos
        while pos < self.end_pos:
            # Find the first match, like above.
            # Note: frontend/match.py uses _LongestMatch(), which is different!
            # TODO: reconcile them.  This lexer should be expressible in re2c.
            for pat, tok_id in ATTR_VALUE_LEXER:
                # Bound by end_pos so we don't match past the attribute value
                # (mirrors the endpos in TagLexer.Tokens above)
                m = pat.match(self.s, pos, self.end_pos)
                if m:
                    if 0:
                        tok_str = m.group(0)
                        log('token = %r', tok_str)

                    end_pos = m.end(0)
                    yield tok_id, end_pos
                    pos = end_pos
                    break
            else:
                raise AssertionError('Tok.Invalid rule should have matched')

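
# Sketch of validating one attribute value with AttrValueLexer;
# _ExampleAttrValueLexer is a hypothetical name.  The value 'foo=99&amp;bar'
# lexes into RawData, CharEntity, RawData -- 3 tokens.
def _ExampleAttrValueLexer():
    s = '<a href="foo=99&amp;bar">'
    val_lexer = AttrValueLexer(s)
    val_lexer.Reset(9, 23)  # the span between the double quotes
    return val_lexer.NumTokens()  # 3
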

def ReadUntilStartTag(it, tag_lexer, tag_name):
    """Find the next <foo>, returning its (start, end) positions

    Raise ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == Tok.StartTag and tag_lexer.TagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No start tag %r' % tag_name)


def ReadUntilEndTag(it, tag_lexer, tag_name):
    """Find the next </foo>, returning its (start, end) position

    Raise ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == Tok.EndTag and tag_lexer.TagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No end tag %r' % tag_name)

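
# Sketch of how the ReadUntil* helpers are driven: they consume from a token
# iterator such as ValidTokens(), and Reset() the given tag_lexer as a side
# effect.  _ExampleReadUntil is a hypothetical name.
def _ExampleReadUntil():
    s = '<div><p>text</p></div>'
    it = ValidTokens(s)
    tag_lexer = TagLexer(s)
    start, end = ReadUntilStartTag(it, tag_lexer, 'p')
    assert s[start:end] == '<p>', s[start:end]
    start, end = ReadUntilEndTag(it, tag_lexer, 'p')
    assert s[start:end] == '</p>', s[start:end]
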

CHAR_ENTITY = {
    'amp': '&',
    'lt': '<',
    'gt': '>',
    'quot': '"',
    'apos': "'",
}


def ToText(s, left_pos=0, right_pos=-1):
    """Given HTML, return text by unquoting &gt; and &lt; etc.

    Used by:
      doctools/oils_doc.py: PygmentsPlugin
      doctools/help_gen.py: HelpIndexCards

    In the latter case, we could process some tags, like:

    - Blue Link (not clickable, but still useful)
    - Red X

    That should be html.ToAnsi.
    """
    f = StringIO()
    out = Output(s, f, left_pos, right_pos)

    pos = left_pos
    for tok_id, end_pos in ValidTokens(s, left_pos, right_pos):
        if tok_id in (Tok.RawData, Tok.BadAmpersand, Tok.BadGreaterThan,
                      Tok.BadLessThan):
            out.SkipTo(pos)
            out.PrintUntil(end_pos)

        elif tok_id == Tok.CharEntity:  # &amp;

            entity = s[pos + 1:end_pos - 1]

            out.SkipTo(pos)
            out.Print(CHAR_ENTITY[entity])
            out.SkipTo(end_pos)

        # Not handling these yet
        elif tok_id == Tok.HexChar:
            raise AssertionError('Hex Char %r' % s[pos:pos + 20])

        elif tok_id == Tok.DecChar:
            raise AssertionError('Dec Char %r' % s[pos:pos + 20])

        else:
            # Skip everything else
            out.SkipTo(end_pos)

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()

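
# Sketch: ToText() strips tags and decodes the entities in CHAR_ENTITY.
# _ExampleToText is a hypothetical name.
def _ExampleToText():
    assert ToText('<b>x &amp; y</b>') == 'x & y'
    assert ToText('3 &gt; 2') == '3 > 2'
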

# https://developer.mozilla.org/en-US/docs/Glossary/Void_element
VOID_ELEMENTS = [
    'area',
    'base',
    'br',
    'col',
    'embed',
    'hr',
    'img',
    'input',
    'link',
    'meta',
    'param',
    'source',
    'track',
    'wbr',
]

LEX_ATTRS = 1 << 1
LEX_QUOTED_VALUES = 1 << 2  # href="?x=42&amp;y=99"
NO_SPECIAL_TAGS = 1 << 3  # <script> <style>, VOID tags, etc.
BALANCED_TAGS = 1 << 4  # are tags balanced?


def Validate(contents, flags, counters):
    # type: (str, int, Counters) -> None

    tag_lexer = TagLexer(contents)
    val_lexer = AttrValueLexer(contents)

    no_special_tags = bool(flags & NO_SPECIAL_TAGS)
    lx = Lexer(contents, no_special_tags=no_special_tags)
    tokens = []
    start_pos = 0
    tag_stack = []
    while True:
        tok_id, end_pos = lx.Read()
        #log('TOP %s %r', TokenName(tok_id), contents[start_pos:end_pos])

        if tok_id == Tok.Invalid:
            raise LexError(contents, start_pos)
        if tok_id == Tok.EndOfStream:
            break

        tokens.append((tok_id, end_pos))

        if tok_id == Tok.StartEndTag:
            counters.num_start_end_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRawSlice()
            counters.num_attrs += len(all_attrs)
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                counters.num_val_tokens += val_lexer.NumTokens()

            counters.debug_attrs.extend(all_attrs)

        elif tok_id == Tok.StartTag:
            counters.num_start_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRawSlice()
            counters.num_attrs += len(all_attrs)
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                counters.num_val_tokens += val_lexer.NumTokens()

            counters.debug_attrs.extend(all_attrs)

            if flags & BALANCED_TAGS:
                tag_name = lx.CanonicalTagName()
                if flags & NO_SPECIAL_TAGS:
                    tag_stack.append(tag_name)
                else:
                    # e.g. <meta> is considered self-closing, like <meta/>
                    if tag_name not in VOID_ELEMENTS:
                        tag_stack.append(tag_name)

            counters.max_tag_stack = max(counters.max_tag_stack,
                                         len(tag_stack))
        elif tok_id == Tok.EndTag:
            if flags & BALANCED_TAGS:
                try:
                    expected = tag_stack.pop()
                except IndexError:
                    raise ParseError('Tag stack empty',
                                     s=contents,
                                     start_pos=start_pos)

                actual = lx.CanonicalTagName()
                if expected != actual:
                    raise ParseError(
                        'Got unexpected closing tag %r; opening tag was %r' %
                        (contents[start_pos:end_pos], expected),
                        s=contents,
                        start_pos=start_pos)

        start_pos = end_pos

    if len(tag_stack) != 0:
        raise ParseError('Missing closing tags at end of doc: %s' %
                         ' '.join(tag_stack),
                         s=contents,
                         start_pos=start_pos)

    counters.num_tokens += len(tokens)

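
# Sketch of calling Validate() with the flags defined above; Counters is
# defined at the bottom of this file, which is fine because Python resolves
# the name when _ExampleValidate (a hypothetical name) is actually called.
def _ExampleValidate():
    contents = '<p class="big">hi</p>'
    counters = Counters()
    Validate(contents, LEX_ATTRS | LEX_QUOTED_VALUES | BALANCED_TAGS, counters)
    assert counters.num_start_tags == 1, counters.num_start_tags
    assert counters.num_attrs == 1, counters.num_attrs
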

def ToXml(htm8_str):
    # type: (str) -> str

    # TODO:
    # 1. Lex it
    # 2. < & > must be escaped
    #    a. in raw data
    #    b. in quoted strings
    # 3. <script> turned into CDATA
    # 4. void tags turned into self-closing tags
    # 5. case-sensitive tag matching - not sure about this

    tag_lexer = TagLexer(htm8_str)
    val_lexer = AttrValueLexer(htm8_str)

    f = StringIO()
    out = Output(htm8_str, f)

    lx = Lexer(htm8_str)

    pos = 0
    while True:
        tok_id, end_pos = lx.Read()

        if tok_id == Tok.Invalid:
            raise LexError(htm8_str, pos)
        if tok_id == Tok.EndOfStream:
            break

        if tok_id in (Tok.RawData, Tok.CharEntity, Tok.HexChar, Tok.DecChar):
            out.PrintUntil(end_pos)
        elif tok_id in (Tok.StartTag, Tok.StartEndTag):
            tag_lexer.Reset(pos, end_pos)
            # TODO: reduce allocations here
            all_attrs = tag_lexer.AllAttrsRawSlice()
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                # TODO: get the kind of string
                #
                # Quoted:   we need to replace & with &amp; and < with &lt;
                #           note > is not allowed
                # Unquoted: right now, we can just surround with double quotes
                #           because we don't allow any bad chars
                # Empty   : add "", so empty= becomes =""
                # Missing : add ="", so missing becomes missing=""

            tag_name = lx.CanonicalTagName()
            if tok_id == Tok.StartTag and tag_name in VOID_ELEMENTS:
                # TODO: instead of closing >, print />
                pass

        elif tok_id == Tok.BadAmpersand:
            #out.SkipTo(pos)
            out.Print('&amp;')
            out.SkipTo(end_pos)

        elif tok_id == Tok.BadGreaterThan:
            #out.SkipTo(pos)
            out.Print('&gt;')
            out.SkipTo(end_pos)
        else:
            out.PrintUntil(end_pos)

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()

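
# Sketch of current ToXml() behavior: bare & and > in raw data get escaped,
# while the TODOs above (CDATA, void tags) are not done yet.  _ExampleToXml
# is a hypothetical name.
def _ExampleToXml():
    assert ToXml('<p>a & b</p>') == '<p>a &amp; b</p>'
    assert ToXml('<p>2 > 1</p>') == '<p>2 &gt; 1</p>'
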

class Counters(object):

    def __init__(self):
        self.num_tokens = 0
        self.num_start_tags = 0
        self.num_start_end_tags = 0
        self.num_attrs = 0
        self.max_tag_stack = 0
        self.num_val_tokens = 0

        self.debug_attrs = []


def main(argv):
    action = argv[1]

    if action == 'tokens':
        contents = sys.stdin.read()

        lx = Lexer(contents)
        start_pos = 0
        while True:
            tok_id, end_pos = lx.Read()
            if tok_id == Tok.Invalid:
                raise LexError(contents, start_pos)
            if tok_id == Tok.EndOfStream:
                break

            frag = contents[start_pos:end_pos]
            log('%d %s %r', end_pos, TokenName(tok_id), frag)
            start_pos = end_pos

        return 0

    elif action in ('lex-htm8', 'parse-htm8', 'parse-xml'):

        errors = []
        counters = Counters()

        flags = LEX_ATTRS | LEX_QUOTED_VALUES
        if action.startswith('parse-'):
            flags |= BALANCED_TAGS
        if action == 'parse-xml':
            flags |= NO_SPECIAL_TAGS

        i = 0
        for line in sys.stdin:
            filename = line.strip()
            with open(filename) as f:
                contents = f.read()

            try:
                Validate(contents, flags, counters)
            except LexError as e:
                log('Lex error in %r: %s', filename, e)
                errors.append((filename, e))
            except ParseError as e:
                log('Parse error in %r: %s', filename, e)
                errors.append((filename, e))
            i += 1

        log('')
        log('%10d tokens', counters.num_tokens)
        log('%10d start/end tags', counters.num_start_end_tags)
        log('%10d start tags', counters.num_start_tags)
        log('%10d attrs', counters.num_attrs)
        log('%10d max tag stack depth', counters.max_tag_stack)
        log('%10d attr val tokens', counters.num_val_tokens)
        log('%10d errors', len(errors))
        if len(errors):
            return 1
        return 0

    elif action == 'todo':
        # Other algorithms:
        #
        # - select first subtree with given ID
        #   - this requires understanding the void tags I suppose
        # - select all subtrees that have a class
        # - materialize DOM

        # Safe-HTM8?  This is a filter
        return 0

    else:
        raise RuntimeError('Invalid action %r' % action)


if __name__ == '__main__':
    sys.exit(main(sys.argv))