#!/usr/bin/env python2
"""
lazylex/html.py - Low-Level HTML Processing.

See lazylex/README.md for details.
"""
from __future__ import print_function

try:
    from cStringIO import StringIO
except ImportError:
    # for python3
    from io import StringIO  # type: ignore
import re
import sys

if sys.version_info.major == 2:
    from typing import List, Tuple, Optional


def log(msg, *args):
    msg = msg % args
    print(msg, file=sys.stderr)


class LexError(Exception):
    """
    Examples of lex errors:

    - Tok.Invalid, like <> or &&
    - Unclosed <!--  <?  <![CDATA[  <script>  <style>
    """

    def __init__(self, s, start_pos):
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        return '(LexError %r)' % (self.s[self.start_pos:self.start_pos + 20])


def FindLineNum(s, error_pos):
    current_pos = 0
    line_num = 1
    while True:
        newline_pos = s.find('\n', current_pos)
        #log('current = %d, N %d, line %d', current_pos, newline_pos, line_num)

        if newline_pos == -1:  # this is the last line
            return line_num
        if newline_pos >= error_pos:
            return line_num
        line_num += 1
        current_pos = newline_pos + 1


class ParseError(Exception):
    """
    Examples of parse errors:

    - unbalanced tag structure
    - ul_table.py errors
    """

    def __init__(self, msg, s=None, start_pos=-1):
        self.msg = msg
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        if self.s is not None:
            assert self.start_pos != -1, self.start_pos
            snippet = self.s[self.start_pos:self.start_pos + 20]

            line_num = FindLineNum(self.s, self.start_pos)
        else:
            snippet = ''
            line_num = -1
        msg = 'line %d: %r %r' % (line_num, self.msg, snippet)
        return msg


class Output(object):
    """Takes an underlying input buffer and an output file.  Maintains a
    position in the input buffer.

    Print FROM the input, or print new text to the output.
    """

    def __init__(self, s, f, left_pos=0, right_pos=-1):
        self.s = s
        self.f = f
        self.pos = left_pos
        self.right_pos = len(s) if right_pos == -1 else right_pos

    def SkipTo(self, pos):
        """Skip to a position in the input."""
        self.pos = pos

    def PrintUntil(self, pos):
        """Print from the current position up to pos."""
        piece = self.s[self.pos:pos]
        self.f.write(piece)
        self.pos = pos

    def PrintTheRest(self):
        """Print until the end of the input."""
        self.PrintUntil(self.right_pos)

    def Print(self, s):
        """Print new text to the output file."""
        self.f.write(s)

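# A hedged usage sketch of Output (this _Demo helper is ours, not part of the
# original module): copy most of the input unchanged, substituting one span.
def _DemoOutput():
    # type: () -> str
    f = StringIO()
    out = Output('<b>hi</b>', f)
    out.PrintUntil(3)  # copy '<b>' from the input
    out.Print('HI')  # print new text instead of 'hi'
    out.SkipTo(5)  # skip over 'hi' in the input
    out.PrintTheRest()  # copy '</b>'
    return f.getvalue()  # '<b>HI</b>'
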
# HTML Tokens
# CommentBegin, ProcessingBegin, CDataBegin are "pseudo-tokens", not visible
TOKENS = 'Decl Comment CommentBegin Processing ProcessingBegin CData CDataBegin StartTag StartEndTag EndTag DecChar HexChar CharEntity RawData HtmlCData BadAmpersand BadGreaterThan BadLessThan Invalid EndOfStream'.split(
)


class Tok(object):
    """
    Avoid lint errors by using these aliases.
    """
    pass


TOKEN_NAMES = [None] * len(TOKENS)  # type: List[str]

this_module = sys.modules[__name__]
for i, tok_str in enumerate(TOKENS):
    setattr(this_module, tok_str, i)
    setattr(Tok, tok_str, i)
    TOKEN_NAMES[i] = tok_str


def TokenName(tok_id):
    return TOKEN_NAMES[tok_id]


def MakeLexer(rules):
    return [(re.compile(pat, re.VERBOSE), i) for (pat, i) in rules]

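# A hedged sanity check (ours, not in the original module): the loop above
# assigns each token name a distinct integer ID, recoverable via TokenName().
def _DemoTokenIds():
    # type: () -> None
    assert Tok.StartTag != Tok.EndTag  # distinct integer IDs
    assert TokenName(Tok.StartTag) == 'StartTag'
    assert TokenName(Tok.EndOfStream) == 'EndOfStream'
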
#
# Eggex
#
# Tag      = / ~['>']+ /

# Is this valid? A single character?
# Tag      = / ~'>'* /

# Maybe better: / [NOT '>']+/
# capital letters not allowed there?
#
# But then this is confusing:
#   / [NOT ~digit]+/
#
# / [NOT digit] / is [^\d]
# / ~digit /     is \D
#
# Or maybe:
#
# / [~ digit]+ /
# / [~ '>']+ /
# / [NOT '>']+ /

# End      = / '</' Tag '>' /
# StartEnd = / '<' Tag '/>' /
# Start    = / '<' Tag '>' /
#
# EntityRef = / '&' dot{* N} ';' /

# Tag name, or attribute name
# colon is used in XML

# https://www.w3.org/TR/xml/#NT-Name
# Hm there is a lot of unicode stuff.  We are simplifying parsing.

_NAME = r'[a-zA-Z][a-zA-Z0-9:_\-]*'  # must start with letter

CHAR_LEX = [
    # Characters
    # https://www.w3.org/TR/xml/#sec-references
    (r'&\# [0-9]+ ;', Tok.DecChar),
    (r'&\# x[0-9a-fA-F]+ ;', Tok.HexChar),
    (r'& %s ;' % _NAME, Tok.CharEntity),
    # A bare & that doesn't start an entity
    (r'&', Tok.BadAmpersand),
]

HTM8_LEX = CHAR_LEX + [
    (r'<!--', Tok.CommentBegin),

    # Processing instructions are used for the XML header:
    #   <?xml version="1.0" encoding="UTF-8"?>
    # They are technically XML-only, but in HTML5, they are another kind of
    # comment:
    #
    #   https://developer.mozilla.org/en-US/docs/Web/API/ProcessingInstruction
    #
    (r'<\?', Tok.ProcessingBegin),
    # Not necessary in HTML5, but occurs in XML
    (r'<!\[CDATA\[', Tok.CDataBegin),  # <![CDATA[

    # Markup declarations
    # - In HTML5, there is only <!DOCTYPE html>
    # - XML has 4 more declarations: <!ELEMENT ...> ATTLIST ENTITY NOTATION
    #   - these seem to be part of the DTD
    # - it's useful to skip these, and be able to parse the rest of the document
    # - Note: < is allowed?
    (r'<! [^>\x00]+ >', Tok.Decl),

    # Tags
    # Notes:
    # - We look for a valid tag name, but we don't validate attributes.
    #   That's done in the tag lexer.
    # - We don't allow leading whitespace
    (r'</ (%s) >' % _NAME, Tok.EndTag),  # end </a>
    # self-closing <br/> comes before StartTag
    # could/should these be collapsed into one rule?
    (r'< (%s) [^>\x00]* />' % _NAME, Tok.StartEndTag),  # self-closing <br/>
    (r'< (%s) [^>\x00]* >' % _NAME, Tok.StartTag),  # start <a>

    # HTML5 allows unescaped > in raw data, but < is not allowed.
    # https://stackoverflow.com/questions/10462348/right-angle-bracket-in-html
    #
    # - My early blog has THREE errors when disallowing >
    # - So do some .wwz files
    (r'[^&<>\x00]+', Tok.RawData),
    (r'>', Tok.BadGreaterThan),
    # < is an error
    (r'.', Tok.Invalid),
]

# Old notes:
#
# Non-greedy matches are regular and can be matched in linear time
# with RE2.
#
#   https://news.ycombinator.com/item?id=27099798
#
# Maybe try combining all of these for speed.

# . is any char except newline
#   https://re2c.org/manual/manual_c.html

# Discarded options
#(r'<!-- .*? -->', Tok.Comment),

# Hack from Claude: \s\S instead of re.DOTALL.  I don't like this
#(r'<!-- [\s\S]*? -->', Tok.Comment),
#(r'<!-- (?:.|[\n])*? -->', Tok.Comment),

HTM8_LEX_COMPILED = MakeLexer(HTM8_LEX)

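# A hedged check (ours, not in the original module) of the rule-ordering note
# above: rules are tried in order, so '<br/>' must hit StartEndTag before the
# more general StartTag rule gets a chance.
def _DemoRuleOrder():
    # type: () -> None
    for pat, tok_id in HTM8_LEX_COMPILED:
        m = pat.match('<br/>')
        if m:
            assert tok_id == Tok.StartEndTag, TokenName(tok_id)
            break
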
class Lexer(object):

    def __init__(self, s, left_pos=0, right_pos=-1, no_special_tags=False):
        self.s = s
        self.pos = left_pos
        self.right_pos = len(s) if right_pos == -1 else right_pos
        self.no_special_tags = no_special_tags

        self.cache = {}  # string -> compiled regex pattern object

        # either </script> or </style> - we search until we see that
        self.search_state = None  # type: Optional[str]

        # Position of tag name, if applicable
        # - Set after you get a StartTag, EndTag, or StartEndTag
        # - Unset on other tags
        self.tag_pos_left = -1
        self.tag_pos_right = -1

    def _Peek(self):
        # type: () -> Tuple[int, int]
        """
        Note: not using _Peek() now
        """
        if self.pos == self.right_pos:
            return Tok.EndOfStream, self.pos

        assert self.pos < self.right_pos, self.pos

        if self.search_state is not None and not self.no_special_tags:
            # TODO: case-insensitive search for </SCRIPT> <SCRipt> ?
            #
            # Another strategy: enter a mode where we find ONLY the end tag
            # regex, and any data that's not <, and then check the canonical
            # tag name for 'script' or 'style'.
            pos = self.s.find(self.search_state, self.pos)
            if pos == -1:
                # unterminated <script> or <style>
                raise LexError(self.s, self.pos)
            self.search_state = None
            # beginning of the end tag
            return Tok.HtmlCData, pos

        # Find the first match.
        # Note: frontend/match.py uses _LongestMatch(), which is different!
        # TODO: reconcile them.  This lexer should be expressible in re2c.

        for pat, tok_id in HTM8_LEX_COMPILED:
            m = pat.match(self.s, self.pos)
            if m:
                if tok_id in (Tok.StartTag, Tok.EndTag, Tok.StartEndTag):
                    self.tag_pos_left = m.start(1)
                    self.tag_pos_right = m.end(1)
                else:
                    # Reset state
                    self.tag_pos_left = -1
                    self.tag_pos_right = -1

                if tok_id == Tok.CommentBegin:
                    pos = self.s.find('-->', self.pos)
                    if pos == -1:
                        # unterminated <!--
                        raise LexError(self.s, self.pos)
                    return Tok.Comment, pos + 3  # -->

                if tok_id == Tok.ProcessingBegin:
                    pos = self.s.find('?>', self.pos)
                    if pos == -1:
                        # unterminated <?
                        raise LexError(self.s, self.pos)
                    return Tok.Processing, pos + 2  # ?>

                if tok_id == Tok.CDataBegin:
                    pos = self.s.find(']]>', self.pos)
                    if pos == -1:
                        # unterminated <![CDATA[
                        raise LexError(self.s, self.pos)
                    return Tok.CData, pos + 3  # ]]>

                if tok_id == Tok.StartTag:
                    # TODO: reduce allocations
                    if (self.TagNameEquals('script') or
                            self.TagNameEquals('style')):
                        # <SCRipt a=b> -> </SCRipt>
                        self.search_state = '</' + self._LiteralTagName() + '>'

                return tok_id, m.end()
        else:
            raise AssertionError('Tok.Invalid rule should have matched')

    def TagNameEquals(self, expected):
        # type: (str) -> bool
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        # TODO: In C++, this does not need an allocation.  Can we test
        # directly?
        return expected == self.CanonicalTagName()

    def _LiteralTagName(self):
        # type: () -> str
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        return self.s[self.tag_pos_left:self.tag_pos_right]

    def CanonicalTagName(self):
        # type: () -> str
        tag_name = self._LiteralTagName()
        # Most tags are already lower case, so avoid allocation with this
        # conditional
        # TODO: this could go in the mycpp runtime?
        if tag_name.islower():
            return tag_name
        else:
            return tag_name.lower()

    def Read(self):
        # type: () -> Tuple[int, int]
        tok_id, end_pos = self._Peek()
        self.pos = end_pos  # advance
        return tok_id, end_pos

    def LookAhead(self, regex):
        # Cache the regex compilation.  This could also be LookAheadFor(THEAD)
        # or something.
        pat = self.cache.get(regex)
        if pat is None:
            pat = re.compile(regex)
            self.cache[regex] = pat

        m = pat.match(self.s, self.pos)
        return m is not None

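# A hedged usage sketch (ours, not in the original module): Lexer yields
# (tok_id, end_pos) pairs, and a token's text is the slice between successive
# end positions.
def _DemoLexer():
    # type: () -> None
    s = '<p>hi</p>'
    lx = Lexer(s)
    pos = 0
    while True:
        tok_id, end_pos = lx.Read()
        if tok_id == Tok.EndOfStream:
            break
        log('%s %r', TokenName(tok_id), s[pos:end_pos])  # e.g. StartTag '<p>'
        pos = end_pos
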
def _Tokens(s, left_pos, right_pos):
    """
    Args:
      s: string to parse
      left_pos, right_pos: Optional span boundaries.
    """
    lx = Lexer(s, left_pos, right_pos)
    while True:
        tok_id, pos = lx.Read()
        yield tok_id, pos
        if tok_id == Tok.EndOfStream:
            break


def ValidTokens(s, left_pos=0, right_pos=-1):
    """Wrapper around _Tokens to prevent callers from having to handle Invalid.

    I'm not combining the two functions because I might want to do a
    'yield' transformation on Tokens()?  Exceptions might complicate the
    issue?
    """
    pos = left_pos
    for tok_id, end_pos in _Tokens(s, left_pos, right_pos):
        if tok_id == Tok.Invalid:
            raise LexError(s, pos)
        yield tok_id, end_pos
        pos = end_pos


def ValidTokenList(s, no_special_tags=False):
    """A wrapper that can be more easily translated to C++.  Doesn't use
    iterators."""

    start_pos = 0
    tokens = []
    lx = Lexer(s, no_special_tags=no_special_tags)
    while True:
        tok_id, end_pos = lx.Read()
        tokens.append((tok_id, end_pos))
        if tok_id == Tok.EndOfStream:
            break
        if tok_id == Tok.Invalid:
            raise LexError(s, start_pos)
        start_pos = end_pos
    return tokens

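# A hedged error-handling sketch (ours): '<>' hits the Tok.Invalid rule, which
# ValidTokenList turns into a LexError.
def _DemoInvalid():
    # type: () -> None
    try:
        ValidTokenList('<>')
        raise AssertionError('should have raised LexError')
    except LexError as e:
        log('%s', e)
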
# Tag names:
#   Match <a  or </a
#   Match <h2, but not <2h
#
# HTML 5 doesn't restrict tag names at all
#   https://html.spec.whatwg.org/#toc-syntax
#
# XML allows : - .
#   https://www.w3.org/TR/xml/#NT-NameChar

# Namespaces for MathML, SVG
# XLink, XML, XMLNS
#
#   https://infra.spec.whatwg.org/#namespaces
#
# Allow - for td-attrs

# Be very lenient - just no whitespace or special HTML chars
# I don't think this is more lenient than HTML5, though we should check.
_UNQUOTED_VALUE = r'''[^ \t\r\n<>&"'\x00]*'''

# TODO: we don't need to capture the tag name here?  That's done at the top
# level
_TAG_RE = re.compile(r'/? \s* (%s)' % _NAME, re.VERBOSE)

_TAG_LAST_RE = re.compile(r'\s* /? >', re.VERBOSE)

# To match href="foo"
# Note: in HTML5 and XML, single quoted attributes are also valid

# <button disabled> is standard usage

# NOTE: This used to allow whitespace around =
#   <a foo = "bar"> makes sense in XML
# But then you also have
#   <a foo= bar> - which is TWO attributes, in HTML5
# So the space is problematic

_ATTR_RE = re.compile(
    r'''
\s+                     # Leading whitespace is required
(%s)                    # Attribute name
(?:                     # Optional attribute value
  \s* = \s*             # Spaces allowed around =
  (?:
    " ([^>"\x00]*) "    # double quoted value
  | ' ([^>'\x00]*) '    # single quoted value
  | (%s)                # unquoted value
  )
)?
''' % (_NAME, _UNQUOTED_VALUE), re.VERBOSE)

TagName, AttrName, UnquotedValue, QuotedValue, MissingValue = range(5)

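# A hedged illustration (ours, not in the original module) of _ATTR_RE's
# capture groups: group 1 is the attribute name, group 2 a double-quoted
# value, group 3 single-quoted, group 4 unquoted.
def _DemoAttrRe():
    # type: () -> None
    m = _ATTR_RE.match(' href="/x"')
    assert m.group(1) == 'href', m.group(1)
    assert m.group(2) == '/x', m.group(2)
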
class TagLexer(object):
    """
    Given a tag like <a href="..."> or <link type="..." />, the TagLexer
    provides a few operations:

    - What is the tag?
    - Iterate through the attributes, giving (name, value_start_pos,
      value_end_pos)
    """

    def __init__(self, s):
        self.s = s
        self.start_pos = -1  # Invalid
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        """Reuse instances of this object."""
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos

        self.start_pos = start_pos
        self.end_pos = end_pos

    def TagString(self):
        return self.s[self.start_pos:self.end_pos]

    def TagName(self):
        # First event
        tok_id, start, end = next(self.Tokens())
        return self.s[start:end]

    def GetSpanForAttrValue(self, attr_name):
        """
        Used by oils_doc.py, for href shortcuts.
        """
        # Algorithm: search for QuotedValue or UnquotedValue after AttrName
        # TODO: Could also cache these

        events = self.Tokens()
        val = (-1, -1)
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == AttrName:
                    name = self.s[start:end]
                    if name == attr_name:
                        # The value should come next
                        tok_id, start, end = next(events)
                        assert tok_id in (QuotedValue, UnquotedValue,
                                          MissingValue), TokenName(tok_id)
                        val = start, end
                        break

        except StopIteration:
            pass
        return val

    def GetAttrRaw(self, attr_name):
        """
        Return the value, which may be UNESCAPED.
        """
        start, end = self.GetSpanForAttrValue(attr_name)
        if start == -1:
            return None
        return self.s[start:end]

    def AllAttrsRawSlice(self):
        """
        Get a list of triples like [('class', 3, 5), ('href', 9, 12)].
        """
        slices = []
        events = self.Tokens()
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == AttrName:
                    name = self.s[start:end]

                    # The value should come next
                    tok_id, start, end = next(events)
                    assert tok_id in (QuotedValue, UnquotedValue,
                                      MissingValue), TokenName(tok_id)
                    # Note: quoted values may have &amp;
                    # We would need ANOTHER lexer to unescape them, but we
                    # don't need that for ul-table
                    slices.append((name, start, end))
        except StopIteration:
            pass
        return slices

    def AllAttrsRaw(self):
        """
        Get a list of pairs [('class', 'foo'), ('href', '?foo=1&amp;bar=2')]

        The quoted values may be escaped.  We would need another lexer to
        unescape them.
        """
        slices = self.AllAttrsRawSlice()
        pairs = []
        for name, start, end in slices:
            pairs.append((name, self.s[start:end]))
        return pairs

    def Tokens(self):
        """
        Yields a sequence of tokens: Tag (AttrName AttrValue?)*

        Where each Token is (Type, start_pos, end_pos)

        Note that start and end are NOT redundant!  We skip over some unwanted
        characters.
        """
        m = _TAG_RE.match(self.s, self.start_pos + 1)
        if not m:
            raise RuntimeError("Couldn't find HTML tag in %r" %
                               self.TagString())
        yield TagName, m.start(1), m.end(1)

        pos = m.end(0)
        #log('POS %d', pos)

        while True:
            # don't search past the end
            m = _ATTR_RE.match(self.s, pos, self.end_pos)
            if not m:
                #log('BREAK pos %d', pos)
                break
            #log('AttrName %r', m.group(1))

            yield AttrName, m.start(1), m.end(1)

            #log('m.groups() %r', m.groups())
            if m.group(2) is not None:
                # double quoted
                yield QuotedValue, m.start(2), m.end(2)
            elif m.group(3) is not None:
                # single quoted - TODO: could have different token types
                yield QuotedValue, m.start(3), m.end(3)
            elif m.group(4) is not None:
                yield UnquotedValue, m.start(4), m.end(4)
            else:
                # <button disabled>
                end = m.end(0)
                yield MissingValue, end, end

            # Skip past the closing quote (or the end of the match)
            pos = m.end(0)

            #log('TOK %r', self.s)

        m = _TAG_LAST_RE.match(self.s, pos)
        #log('_TAG_LAST_RE match %r', self.s[pos:])
        if not m:
            # Extra data at end of tag.  TODO: add messages for all these.
            raise LexError(self.s, pos)

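# A hedged usage sketch of TagLexer (ours, not in the original module):
# positions refer to the enclosing document, so we Reset() to the span of one
# start tag.
def _DemoTagLexer():
    # type: () -> None
    s = '<a href="/" class="big">x</a>'
    tag_lexer = TagLexer(s)
    tag_lexer.Reset(0, 24)  # span of '<a href="/" class="big">'
    assert tag_lexer.TagName() == 'a'
    assert tag_lexer.GetAttrRaw('href') == '/'
    assert tag_lexer.AllAttrsRaw() == [('href', '/'), ('class', 'big')]
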
# This is similar but not identical to
#    " ([^>"\x00]*) "        # double quoted value
#  | ' ([^>'\x00]*) '        # single quoted value
#
# Note: for unquoted values, & isn't allowed, and thus &amp; and &#99; and
# &#x99; are not allowed.  We could relax that?
ATTR_VALUE_LEXER = CHAR_LEX + [
    (r'[^>&\x00]+', Tok.RawData),
    (r'.', Tok.Invalid),
]

ATTR_VALUE_LEXER = MakeLexer(ATTR_VALUE_LEXER)


class AttrValueLexer(object):
    """
    Lexes attribute values, like:

    <a href="foo=99&amp;bar">
    <a href='foo=99&amp;bar'>
    <a href=unquoted>
    """

    def __init__(self, s):
        self.s = s
        self.start_pos = -1  # Invalid
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        """Reuse instances of this object."""
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos

        self.start_pos = start_pos
        self.end_pos = end_pos

    def NumTokens(self):
        num_tokens = 0
        pos = self.start_pos
        for tok_id, end_pos in self.Tokens():
            if tok_id == Tok.Invalid:
                raise LexError(self.s, pos)
            pos = end_pos
            #log('pos %d', pos)
            num_tokens += 1
        return num_tokens

    def Tokens(self):
        pos = self.start_pos
        while pos < self.end_pos:
            # Find the first match, like above.
            # Note: frontend/match.py uses _LongestMatch(), which is
            # different!
            # TODO: reconcile them.  This lexer should be expressible in re2c.
            for pat, tok_id in ATTR_VALUE_LEXER:
                m = pat.match(self.s, pos)
                if m:
                    if 0:
                        tok_str = m.group(0)
                        log('token = %r', tok_str)

                    end_pos = m.end(0)
                    yield tok_id, end_pos
                    pos = end_pos
                    break
            else:
                raise AssertionError('Tok.Invalid rule should have matched')

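# A hedged example (ours, not in the original module): counting the value
# tokens inside href="foo=99&amp;bar".
def _DemoAttrValueLexer():
    # type: () -> None
    s = '<a href="foo=99&amp;bar">'
    val_lexer = AttrValueLexer(s)
    val_lexer.Reset(9, 23)  # span between the double quotes
    assert val_lexer.NumTokens() == 3  # RawData, CharEntity, RawData
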
def ReadUntilStartTag(it, tag_lexer, tag_name):
    """Find the next <foo>, returning its (start, end) positions.

    Raise ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == Tok.StartTag and tag_lexer.TagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No start tag %r' % tag_name)


def ReadUntilEndTag(it, tag_lexer, tag_name):
    """Find the next </foo>, returning its (start, end) positions.

    Raise ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == Tok.EndTag and tag_lexer.TagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No end tag %r' % tag_name)


CHAR_ENTITY = {
    'amp': '&',
    'lt': '<',
    'gt': '>',
    'quot': '"',
    'apos': "'",
}

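# A hedged usage sketch (ours, not in the original module): finding the span
# of the first <ul> by pairing ValidTokens with a reusable TagLexer.
def _DemoReadUntil():
    # type: () -> None
    s = '<div><ul><li>x</li></ul></div>'
    it = ValidTokens(s)
    tag_lexer = TagLexer(s)
    start, end = ReadUntilStartTag(it, tag_lexer, 'ul')
    assert s[start:end] == '<ul>', s[start:end]
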
def ToText(s, left_pos=0, right_pos=-1):
    """Given HTML, return text by unquoting &gt; and &lt; etc.

    Used by:
      doctools/oils_doc.py: PygmentsPlugin
      doctools/help_gen.py: HelpIndexCards

    In the latter case, we could process some tags, like:

    - Blue Link (not clickable, but still useful)
    - Red X

    That should be html.ToAnsi.
    """
    f = StringIO()
    out = Output(s, f, left_pos, right_pos)

    pos = left_pos
    for tok_id, end_pos in ValidTokens(s, left_pos, right_pos):
        if tok_id in (Tok.RawData, Tok.BadAmpersand, Tok.BadGreaterThan,
                      Tok.BadLessThan):
            out.SkipTo(pos)
            out.PrintUntil(end_pos)

        elif tok_id == Tok.CharEntity:  # &amp;

            entity = s[pos + 1:end_pos - 1]

            out.SkipTo(pos)
            out.Print(CHAR_ENTITY[entity])
            out.SkipTo(end_pos)

        # Not handling these yet
        elif tok_id == Tok.HexChar:
            raise AssertionError('Hex Char %r' % s[pos:pos + 20])

        elif tok_id == Tok.DecChar:
            raise AssertionError('Dec Char %r' % s[pos:pos + 20])

        else:
            # Skip everything else
            out.SkipTo(end_pos)

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()

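# A hedged example (ours, not in the original module): tags are dropped and
# entities are decoded.
def _DemoToText():
    # type: () -> None
    assert ToText('<b>x &amp; y</b>') == 'x & y'
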
# https://developer.mozilla.org/en-US/docs/Glossary/Void_element
VOID_ELEMENTS = [
    'area',
    'base',
    'br',
    'col',
    'embed',
    'hr',
    'img',
    'input',
    'link',
    'meta',
    'param',
    'source',
    'track',
    'wbr',
]

LEX_ATTRS = 1 << 1
LEX_QUOTED_VALUES = 1 << 2  # href="?x=42&amp;y=99"
NO_SPECIAL_TAGS = 1 << 3  # <script> <style>, VOID tags, etc.
BALANCED_TAGS = 1 << 4  # are tags balanced?


def Validate(contents, flags, counters):
    # type: (str, int, Counters) -> None

    tag_lexer = TagLexer(contents)
    val_lexer = AttrValueLexer(contents)

    no_special_tags = bool(flags & NO_SPECIAL_TAGS)
    lx = Lexer(contents, no_special_tags=no_special_tags)
    tokens = []
    start_pos = 0
    tag_stack = []
    while True:
        tok_id, end_pos = lx.Read()
        #log('TOP %s %r', TokenName(tok_id), contents[start_pos:end_pos])

        if tok_id == Tok.Invalid:
            raise LexError(contents, start_pos)
        if tok_id == Tok.EndOfStream:
            break

        tokens.append((tok_id, end_pos))

        if tok_id == Tok.StartEndTag:
            counters.num_start_end_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRawSlice()
            counters.num_attrs += len(all_attrs)
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                counters.num_val_tokens += val_lexer.NumTokens()

            counters.debug_attrs.extend(all_attrs)

        elif tok_id == Tok.StartTag:
            counters.num_start_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRawSlice()
            counters.num_attrs += len(all_attrs)
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                counters.num_val_tokens += val_lexer.NumTokens()

            counters.debug_attrs.extend(all_attrs)

            if flags & BALANCED_TAGS:
                tag_name = lx.CanonicalTagName()
                if flags & NO_SPECIAL_TAGS:
                    tag_stack.append(tag_name)
                else:
                    # e.g. <meta> is considered self-closing, like <meta/>
                    if tag_name not in VOID_ELEMENTS:
                        tag_stack.append(tag_name)

            counters.max_tag_stack = max(counters.max_tag_stack,
                                         len(tag_stack))
        elif tok_id == Tok.EndTag:
            if flags & BALANCED_TAGS:
                try:
                    expected = tag_stack.pop()
                except IndexError:
                    raise ParseError('Tag stack empty',
                                     s=contents,
                                     start_pos=start_pos)

                actual = lx.CanonicalTagName()
                if expected != actual:
                    raise ParseError(
                        'Got unexpected closing tag %r; opening tag was %r' %
                        (contents[start_pos:end_pos], expected),
                        s=contents,
                        start_pos=start_pos)

        start_pos = end_pos

    if len(tag_stack) != 0:
        raise ParseError('Missing closing tags at end of doc: %s' %
                         ' '.join(tag_stack),
                         s=contents,
                         start_pos=start_pos)

    counters.num_tokens += len(tokens)

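# A hedged example (ours, not in the original module): validating a small
# balanced document and reading the counters afterward.
def _DemoValidate():
    # type: () -> None
    counters = Counters()
    flags = LEX_ATTRS | LEX_QUOTED_VALUES | BALANCED_TAGS
    Validate('<p class="x">hi</p>', flags, counters)
    assert counters.num_start_tags == 1
    assert counters.num_attrs == 1
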
def ToXml(htm8_str):
    # type: (str) -> str

    # TODO:
    # 1. Lex it
    # 2. < & > must be escaped
    #    a. in raw data
    #    b. in quoted strings
    # 3. <script> turned into CDATA
    # 4. void tags turned into self-closing tags
    # 5. case-sensitive tag matching - not sure about this

    tag_lexer = TagLexer(htm8_str)
    val_lexer = AttrValueLexer(htm8_str)

    f = StringIO()
    out = Output(htm8_str, f)

    lx = Lexer(htm8_str)

    pos = 0
    while True:
        tok_id, end_pos = lx.Read()

        if tok_id == Tok.Invalid:
            raise LexError(htm8_str, pos)
        if tok_id == Tok.EndOfStream:
            break

        if tok_id in (Tok.RawData, Tok.CharEntity, Tok.HexChar, Tok.DecChar):
            out.PrintUntil(end_pos)
        elif tok_id in (Tok.StartTag, Tok.StartEndTag):
            tag_lexer.Reset(pos, end_pos)
            # TODO: reduce allocations here
            all_attrs = tag_lexer.AllAttrsRawSlice()
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                # TODO: get the kind of string
                #
                # Quoted: we need to replace & with &amp; and < with &lt;
                #         note > is not allowed
                # Unquoted: right now, we can just surround with double quotes
                #           because we don't allow any bad chars
                # Empty   : add "", so empty= becomes =""
                # Missing : add ="", so missing becomes missing=""

            tag_name = lx.CanonicalTagName()
            if tok_id == Tok.StartTag and tag_name in VOID_ELEMENTS:
                # TODO: instead of closing >, print />
                pass

        elif tok_id == Tok.BadAmpersand:
            # Print any pending input (e.g. a preceding tag, which the branch
            # above defers) before substituting the escape
            out.PrintUntil(pos)
            out.Print('&amp;')
            out.SkipTo(end_pos)

        elif tok_id == Tok.BadGreaterThan:
            out.PrintUntil(pos)
            out.Print('&gt;')
            out.SkipTo(end_pos)
        else:
            out.PrintUntil(end_pos)

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()

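# A hedged example (ours, not in the original module): ToXml currently
# escapes stray & and >; the TODOs above (CDATA, void tags) are not
# implemented yet.
def _DemoToXml():
    # type: () -> None
    assert ToXml('<p>a & b</p>') == '<p>a &amp; b</p>'
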
class Counters(object):

    def __init__(self):
        self.num_tokens = 0
        self.num_start_tags = 0
        self.num_start_end_tags = 0
        self.num_attrs = 0
        self.max_tag_stack = 0
        self.num_val_tokens = 0

        self.debug_attrs = []


def main(argv):
    action = argv[1]

    if action == 'tokens':
        contents = sys.stdin.read()

        lx = Lexer(contents)
        start_pos = 0
        while True:
            tok_id, end_pos = lx.Read()
            if tok_id == Tok.Invalid:
                raise LexError(contents, start_pos)
            if tok_id == Tok.EndOfStream:
                break

            frag = contents[start_pos:end_pos]
            log('%d %s %r', end_pos, TokenName(tok_id), frag)
            start_pos = end_pos

        return 0

    elif action in ('lex-htm8', 'parse-htm8', 'parse-xml'):

        errors = []
        counters = Counters()

        flags = LEX_ATTRS | LEX_QUOTED_VALUES
        if action.startswith('parse-'):
            flags |= BALANCED_TAGS
        if action == 'parse-xml':
            flags |= NO_SPECIAL_TAGS

        i = 0
        for line in sys.stdin:
            filename = line.strip()
            with open(filename) as f:
                contents = f.read()

            try:
                Validate(contents, flags, counters)
            except LexError as e:
                log('Lex error in %r: %s', filename, e)
                errors.append((filename, e))
            except ParseError as e:
                log('Parse error in %r: %s', filename, e)
                errors.append((filename, e))
            i += 1

        log('')
        log('%10d tokens', counters.num_tokens)
        log('%10d start/end tags', counters.num_start_end_tags)
        log('%10d start tags', counters.num_start_tags)
        log('%10d attrs', counters.num_attrs)
        log('%10d max tag stack depth', counters.max_tag_stack)
        log('%10d attr val tokens', counters.num_val_tokens)
        log('%10d errors', len(errors))
        if len(errors):
            return 1
        return 0

    elif action == 'todo':
        # Other algorithms:
        #
        # - select first subtree with given ID
        #   - this requires understanding the void tags I suppose
        # - select all subtrees that have a class
        # - materialize DOM

        # Safe-HTM8?  This is a filter
        return 0

    else:
        raise RuntimeError('Invalid action %r' % action)


if __name__ == '__main__':
    sys.exit(main(sys.argv))