#!/usr/bin/env python2
"""
lazylex/html.py - Low-Level HTML Processing.

See lazylex/README.md for details.

TODO:
- Get rid of AttrValueLexer - this should be in the TagLexer
  - this also means that unquoted values can be more similar
  - We can use a single lexer mode for everything inside <>
    - the SPACE is the only difference
- UTF-8 check, like JSON8
- Static typing

"""
from __future__ import print_function
from typing import Iterator
from typing import Union
from typing import Any
from typing import IO

try:
    from cStringIO import StringIO
except ImportError:
    # for python3
    from io import StringIO  # type: ignore
import re
import sys

if sys.version_info.major == 2:
    from typing import List, Tuple, Optional


def log(msg, *args):
    # type: (str, *Any) -> None
    msg = msg % args
    print(msg, file=sys.stderr)


class LexError(Exception):
    """
    Examples of lex errors:

    - Tok.Invalid, like <> or &&
    - Unclosed <!-- <? <![CDATA[ <script> <style>
    """

    def __init__(self, s, start_pos):
        # type: (str, int) -> None
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        # type: () -> str
        return '(LexError %r)' % (self.s[self.start_pos:self.start_pos + 20])


def FindLineNum(s, error_pos):
    # type: (str, int) -> int
    current_pos = 0
    line_num = 1
    while True:
        newline_pos = s.find('\n', current_pos)
        #log('current = %d, N %d, line %d', current_pos, newline_pos, line_num)

        if newline_pos == -1:  # this is the last line
            return line_num
        if newline_pos >= error_pos:
            return line_num
        line_num += 1
        current_pos = newline_pos + 1

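# A tiny worked example of FindLineNum (a sketch, not part of the API): with
# s = 'ab\ncd\n' and error_pos = 4 (the 'd'), the newline at index 2 comes
# before the error, so we advance to line 2 and return it.
#
#   FindLineNum('ab\ncd\n', 4)  # -> 2

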
class ParseError(Exception):
    """
    Examples of parse errors

    - unbalanced tag structure
    - ul_table.py errors
    """

    def __init__(self, msg, s=None, start_pos=-1):
        # type: (str, Optional[str], int) -> None
        self.msg = msg
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        # type: () -> str
        if self.s is not None:
            assert self.start_pos != -1, self.start_pos
            snippet = (self.s[self.start_pos:self.start_pos + 20])

            line_num = FindLineNum(self.s, self.start_pos)
        else:
            snippet = ''
            line_num = -1
        msg = 'line %d: %r %r' % (line_num, self.msg, snippet)
        return msg


class Output(object):
    """Takes an underlying input buffer and an output file. Maintains a
    position in the input buffer.

    Print FROM the input or print new text to the output.
    """

    def __init__(self, s, f, left_pos=0, right_pos=-1):
        # type: (str, IO[str], int, int) -> None
        self.s = s
        self.f = f
        self.pos = left_pos
        self.right_pos = len(s) if right_pos == -1 else right_pos

    def SkipTo(self, pos):
        # type: (int) -> None
        """Skip to a position."""
        self.pos = pos

    def PrintUntil(self, pos):
        # type: (int) -> None
        """Print until a position."""
        piece = self.s[self.pos:pos]
        self.f.write(piece)
        self.pos = pos

    def PrintTheRest(self):
        # type: () -> None
        """Print until the end of the string."""
        self.PrintUntil(self.right_pos)
    def Print(self, s):
        # type: (str) -> None
        """Print new text (not from the input buffer) to the output file."""
        self.f.write(s)

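# Sketch of how Output is used by ToText() and ToXml() below: copy spans of
# the input through unchanged, and splice in replacement text for the spans
# you want to rewrite.  (Positions below were computed by hand for this input.)
#
#   f = StringIO()
#   out = Output('a &amp; b', f)
#   out.PrintUntil(2)   # copy 'a '
#   out.Print('&')      # replacement text for the '&amp;' span [2, 7)
#   out.SkipTo(7)       # skip over the original '&amp;'
#   out.PrintTheRest()  # copy ' b'
#   f.getvalue()        # -> 'a & b'
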

# HTML Tokens
# CommentBegin, ProcessingBegin, CDataBegin are "pseudo-tokens", not visible
TOKENS = 'Decl Comment CommentBegin Processing ProcessingBegin CData CDataBegin StartTag StartEndTag EndTag DecChar HexChar CharEntity RawData HtmlCData BadAmpersand BadGreaterThan BadLessThan Invalid EndOfStream'.split(
)


class Tok(object):
    """
    Avoid lint errors by using these aliases
    """
    pass


TOKEN_NAMES = [None] * len(TOKENS)  # type: List[str]

this_module = sys.modules[__name__]
for i, tok_str in enumerate(TOKENS):
    setattr(this_module, tok_str, i)
    setattr(Tok, tok_str, i)
    TOKEN_NAMES[i] = tok_str


def TokenName(tok_id):
    # type: (int) -> str
    return TOKEN_NAMES[tok_id]


def MakeLexer(rules):
    return [(re.compile(pat, re.VERBOSE), i) for (pat, i) in rules]


#
# Eggex
#
# Tag = / ~['>']+ /

# Is this valid? A single character?
# Tag = / ~'>'* /

# Maybe better: / [NOT '>']+/
# capital letters not allowed there?
#
# But then this is confusing:
# / [NOT ~digit]+/
#
# / [NOT digit] / is [^\d]
# / ~digit / is \D
#
# Or maybe:
#
# / [~ digit]+ /
# / [~ '>']+ /
# / [NOT '>']+ /

# End = / '</' Tag '>' /
# StartEnd = / '<' Tag '/>' /
# Start = / '<' Tag '>' /
#
# EntityRef = / '&' dot{* N} ';' /

# Tag name, or attribute name
# colon is used in XML

# https://www.w3.org/TR/xml/#NT-Name
# Hm there is a lot of unicode stuff. We are simplifying parsing

_NAME = r'[a-zA-Z][a-zA-Z0-9:_\-]*'  # must start with letter

CHAR_LEX = [
    # Characters
    # https://www.w3.org/TR/xml/#sec-references
    (r'&\# [0-9]+ ;', Tok.DecChar),
    (r'&\# x[0-9a-fA-F]+ ;', Tok.HexChar),
    (r'& %s ;' % _NAME, Tok.CharEntity),
    # Allow unquoted, and quoted
    (r'&', Tok.BadAmpersand),
]

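# Informal examples of what each CHAR_LEX rule matches (a sketch; the token
# IDs are the small integers assigned to Tok above):
#
#   '&#64;'   -> Tok.DecChar        decimal character reference
#   '&#x40;'  -> Tok.HexChar        hex character reference
#   '&amp;'   -> Tok.CharEntity     named entity
#   '&'       -> Tok.BadAmpersand   a bare & that starts no reference
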
HTM8_LEX = CHAR_LEX + [
    (r'<!--', Tok.CommentBegin),

    # Processing instructions are used for the XML header:
    # <?xml version="1.0" encoding="UTF-8"?>
    # They are technically XML-only, but in HTML5, they are another kind of
    # comment:
    #
    # https://developer.mozilla.org/en-US/docs/Web/API/ProcessingInstruction
    #
    (r'<\?', Tok.ProcessingBegin),
    # Not necessary in HTML5, but occurs in XML
    (r'<!\[CDATA\[', Tok.CDataBegin),  # <![CDATA[

    # Markup declarations
    # - In HTML5, there is only <!DOCTYPE html>
    # - XML has 4 more declarations: <!ELEMENT ...> ATTLIST ENTITY NOTATION
    #   - these seem to be part of DTD
    # - it's useful to skip these, and be able to parse the rest of the document
    # - Note: < is allowed?
    (r'<! [^>\x00]+ >', Tok.Decl),

    # Tags
    # Notes:
    # - We look for a valid tag name, but we don't validate attributes.
    #   That's done in the tag lexer.
    # - We don't allow leading whitespace
    (r'</ (%s) >' % _NAME, Tok.EndTag),
    # self-closing <br/> comes before StartTag
    # could/should these be collapsed into one rule?
    (r'< (%s) [^>\x00]* />' % _NAME, Tok.StartEndTag),  # self-closing <br/>
    (r'< (%s) [^>\x00]* >' % _NAME, Tok.StartTag),  # start <a>

    # HTML5 allows unescaped > in raw data, but < is not allowed.
    # https://stackoverflow.com/questions/10462348/right-angle-bracket-in-html
    #
    # - My early blog has THREE errors when disallowing >
    # - So do some .wwz files
    (r'[^&<>\x00]+', Tok.RawData),
    (r'>', Tok.BadGreaterThan),
    # < is an error
    (r'.', Tok.Invalid),
]

# Old notes:
#
# Non-greedy matches are regular and can be matched in linear time
# with RE2.
#
# https://news.ycombinator.com/item?id=27099798
#
# Maybe try combining all of these for speed.

# . is any char except newline
# https://re2c.org/manual/manual_c.html

# Discarded options
#(r'<!-- .*? -->', Tok.Comment),

# Hack from Claude: \s\S instead of re.DOTALL. I don't like this
#(r'<!-- [\s\S]*? -->', Tok.Comment),
#(r'<!-- (?:.|[\n])*? -->', Tok.Comment),

HTM8_LEX_COMPILED = MakeLexer(HTM8_LEX)


class Lexer(object):

    def __init__(self, s, left_pos=0, right_pos=-1, no_special_tags=False):
        # type: (str, int, int, bool) -> None
        self.s = s
        self.pos = left_pos
        self.right_pos = len(s) if right_pos == -1 else right_pos
        self.no_special_tags = no_special_tags

        self.cache = {}  # string -> compiled regex pattern object

        # either </script> or </style> - we search until we see that
        self.search_state = None  # type: Optional[str]

        # Position of tag name, if applicable
        # - Set after you get a StartTag, EndTag, or StartEndTag
        # - Unset on other tags
        self.tag_pos_left = -1
        self.tag_pos_right = -1

    def _Peek(self):
        # type: () -> Tuple[int, int]
        """
        Note: not using _Peek() now
        """
        if self.pos == self.right_pos:
            return Tok.EndOfStream, self.pos

        assert self.pos < self.right_pos, self.pos

        if self.search_state is not None and not self.no_special_tags:
            # TODO: case-insensitive search for </SCRIPT> <SCRipt> ?
            #
            # Another strategy: enter a mode where we find ONLY the end tag
            # regex, and any data that's not <, and then check the canonical
            # tag name for 'script' or 'style'.
            pos = self.s.find(self.search_state, self.pos)
            if pos == -1:
                # unterminated <script> or <style>
                raise LexError(self.s, self.pos)
            self.search_state = None
            # beginning
            return Tok.HtmlCData, pos

        # Find the first match.
        # Note: frontend/match.py uses _LongestMatch(), which is different!
        # TODO: reconcile them. This lexer should be expressible in re2c.

        for pat, tok_id in HTM8_LEX_COMPILED:
            m = pat.match(self.s, self.pos)
            if m:
                if tok_id in (Tok.StartTag, Tok.EndTag, Tok.StartEndTag):
                    self.tag_pos_left = m.start(1)
                    self.tag_pos_right = m.end(1)
                else:
                    # Reset state
                    self.tag_pos_left = -1
                    self.tag_pos_right = -1

                if tok_id == Tok.CommentBegin:
                    pos = self.s.find('-->', self.pos)
                    if pos == -1:
                        # unterminated <!--
                        raise LexError(self.s, self.pos)
                    return Tok.Comment, pos + 3  # -->

                if tok_id == Tok.ProcessingBegin:
                    pos = self.s.find('?>', self.pos)
                    if pos == -1:
                        # unterminated <?
                        raise LexError(self.s, self.pos)
                    return Tok.Processing, pos + 2  # ?>

                if tok_id == Tok.CDataBegin:
                    pos = self.s.find(']]>', self.pos)
                    if pos == -1:
                        # unterminated <![CDATA[
                        raise LexError(self.s, self.pos)
                    return Tok.CData, pos + 3  # ]]>

                if tok_id == Tok.StartTag:
                    # TODO: reduce allocations
                    if (self.TagNameEquals('script') or
                            self.TagNameEquals('style')):
                        # <SCRipt a=b> -> </SCRipt>
                        self.search_state = '</' + self._LiteralTagName() + '>'

                return tok_id, m.end()
        else:
            raise AssertionError('Tok.Invalid rule should have matched')

    def TagNameEquals(self, expected):
        # type: (str) -> bool
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        # TODO: In C++, this does not need an allocation. Can we test
        # directly?
        return expected == self.CanonicalTagName()

    def _LiteralTagName(self):
        # type: () -> str
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        return self.s[self.tag_pos_left:self.tag_pos_right]

    def CanonicalTagName(self):
        # type: () -> str
        tag_name = self._LiteralTagName()
        # Most tags are already lower case, so avoid allocation with this conditional
        # TODO: this could go in the mycpp runtime?
        if tag_name.islower():
            return tag_name
        else:
            return tag_name.lower()

    def Read(self):
        # type: () -> Tuple[int, int]
        tok_id, end_pos = self._Peek()
        self.pos = end_pos  # advance
        return tok_id, end_pos

    def LookAhead(self, regex):
        # type: (str) -> bool
        # Cache the regex compilation. This could also be LookAheadFor(THEAD)
        # or something.
        pat = self.cache.get(regex)
        if pat is None:
            pat = re.compile(regex)
            self.cache[regex] = pat

        m = pat.match(self.s, self.pos)
        return m is not None

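# A minimal read loop over the Lexer (a sketch; the HTML snippet is just an
# illustration).  Each Read() returns (tok_id, end_pos); the token's text is
# the span from the previous end_pos to the new one.  Note that everything
# between <script> and </script> (or <style> and </style>) comes back as a
# single Tok.HtmlCData token, unless no_special_tags=True.
#
#   lx = Lexer('<p>hi &amp; bye</p>')
#   start = 0
#   while True:
#       tok_id, end = lx.Read()
#       if tok_id == Tok.EndOfStream:
#           break
#       log('%s %r', TokenName(tok_id), lx.s[start:end])
#       start = end
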

def _Tokens(s, left_pos, right_pos):
    # type: (str, int, int) -> Iterator[Tuple[int, int]]
    """
    Args:
      s: string to parse
      left_pos, right_pos: Optional span boundaries.
    """
    lx = Lexer(s, left_pos, right_pos)
    while True:
        tok_id, pos = lx.Read()
        yield tok_id, pos
        if tok_id == Tok.EndOfStream:
            break


def ValidTokens(s, left_pos=0, right_pos=-1):
    # type: (str, int, int) -> Iterator[Tuple[int, int]]
    """Wrapper around _Tokens to prevent callers from having to handle Invalid.

    I'm not combining the two functions because I might want to do a
    'yield' transformation on Tokens()? Exceptions might complicate the
    issue?
    """
    pos = left_pos
    for tok_id, end_pos in _Tokens(s, left_pos, right_pos):
        if tok_id == Tok.Invalid:
            raise LexError(s, pos)
        yield tok_id, end_pos
        pos = end_pos


def ValidTokenList(s, no_special_tags=False):
    # type: (str, bool) -> List[Tuple[int, int]]
    """A wrapper that can be more easily translated to C++. Doesn't use iterators."""

    start_pos = 0
    tokens = []
    lx = Lexer(s, no_special_tags=no_special_tags)
    while True:
        tok_id, end_pos = lx.Read()
        tokens.append((tok_id, end_pos))
        if tok_id == Tok.EndOfStream:
            break
        if tok_id == Tok.Invalid:
            raise LexError(s, start_pos)
        start_pos = end_pos
    return tokens

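# A hand-traced example of ValidTokenList (a sketch; positions were computed
# by hand for this input).  Each entry is (tok_id, end_pos), and each token
# starts where the previous one ended:
#
#   ValidTokenList('<p>hi</p>')
#   # -> [(Tok.StartTag, 3), (Tok.RawData, 5), (Tok.EndTag, 9),
#   #     (Tok.EndOfStream, 9)]
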

# Tag names:
# Match <a or </a
# Match <h2, but not <2h
#
# HTML 5 doesn't restrict tag names at all
# https://html.spec.whatwg.org/#toc-syntax
#
# XML allows : - .
# https://www.w3.org/TR/xml/#NT-NameChar

# Namespaces for MathML, SVG
# XLink, XML, XMLNS
#
# https://infra.spec.whatwg.org/#namespaces
#
# Allow - for td-attrs

# Be very lenient - just no whitespace or special HTML chars
# I don't think this is more lenient than HTML5, though we should check.
_UNQUOTED_VALUE = r'''[^ \t\r\n<>&"'\x00]*'''

# TODO: we don't need to capture the tag name here? That's done at the top
# level
_TAG_RE = re.compile(r'/? \s* (%s)' % _NAME, re.VERBOSE)

_TAG_LAST_RE = re.compile(r'\s* /? >', re.VERBOSE)

# To match href="foo"
# Note: in HTML5 and XML, single quoted attributes are also valid

# <button disabled> is standard usage

# NOTE: This used to allow whitespace around =
# <a foo = "bar"> makes sense in XML
# But then you also have
# <a foo= bar> - which is TWO attributes, in HTML5
# So the space is problematic

_ATTR_RE = re.compile(
    r'''
\s+                     # Leading whitespace is required
(%s)                    # Attribute name
(?:                     # Optional attribute value
  \s* = \s*             # Spaces allowed around =
  (?:
    " ([^>"\x00]*) "    # double quoted value
  | ' ([^>'\x00]*) '    # single quoted value
  | (%s)                # Attribute value
  )
)?
''' % (_NAME, _UNQUOTED_VALUE), re.VERBOSE)

TagName, AttrName, UnquotedValue, QuotedValue, MissingValue = range(5)

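# How _ATTR_RE's groups map onto the token kinds above (a sketch; in real use
# the match position is chosen by TagLexer.Tokens() below):
#
#   m = _ATTR_RE.match(' href="/x" class=big disabled>', 0)
#   m.group(1)  # 'href' -> AttrName
#   m.group(2)  # '/x'   -> QuotedValue (double quotes; group 3 is single quotes)
#   m.group(4)  # None   -> would be UnquotedValue, e.g. for class=big
#
# An attribute with no '=' at all, like 'disabled', yields MissingValue.
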

class TagLexer(object):
    """
    Given a tag like <a href="..."> or <link type="..." />, the TagLexer
    provides a few operations:

    - What is the tag?
    - Iterate through the attributes, giving (name, value_start_pos, value_end_pos)
    """

    def __init__(self, s):
        # type: (str) -> None
        self.s = s
        self.start_pos = -1  # Invalid
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        # type: (int, int) -> None
        """Reuse instances of this object."""
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos

        self.start_pos = start_pos
        self.end_pos = end_pos

    def TagString(self):
        # type: () -> str
        return self.s[self.start_pos:self.end_pos]

    def TagName(self):
        # type: () -> str
        # First event
        tok_id, start, end = next(self.Tokens())
        return self.s[start:end]

    def GetSpanForAttrValue(self, attr_name):
        # type: (str) -> Tuple[int, int]
        """
        Used by oils_doc.py, for href shortcuts
        """
        # Algorithm: search for QuotedValue or UnquotedValue after AttrName
        # TODO: Could also cache these

        events = self.Tokens()
        val = (-1, -1)
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == AttrName:
                    name = self.s[start:end]
                    if name == attr_name:
                        # The value should come next
                        tok_id, start, end = next(events)
                        assert tok_id in (QuotedValue, UnquotedValue,
                                          MissingValue), TokenName(tok_id)
                        val = start, end
                        break

        except StopIteration:
            pass
        return val

    def GetAttrRaw(self, attr_name):
        # type: (str) -> Optional[str]
        """
        Return the value, which may be UNESCAPED.
        """
        start, end = self.GetSpanForAttrValue(attr_name)
        if start == -1:
            return None
        return self.s[start:end]

    def AllAttrsRawSlice(self):
        # type: () -> List[Tuple[str, int, int]]
        """
        Get a list of triples [('class', 3, 5), ('href', 9, 12)]
        """
        slices = []
        events = self.Tokens()
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == AttrName:
                    name = self.s[start:end]

                    # The value should come next
                    tok_id, start, end = next(events)
                    assert tok_id in (QuotedValue, UnquotedValue,
                                      MissingValue), TokenName(tok_id)
                    # Note: quoted values may have &amp;
                    # We would need ANOTHER lexer to unescape them, but we
                    # don't need that for ul-table
                    slices.append((name, start, end))
        except StopIteration:
            pass
        return slices

    def AllAttrsRaw(self):
        # type: () -> List[Tuple[str, str]]
        """
        Get a list of pairs [('class', 'foo'), ('href', '?foo=1&amp;bar=2')]

        The quoted values may be escaped. We would need another lexer to
        unescape them.
        """
        slices = self.AllAttrsRawSlice()
        pairs = []
        for name, start, end in slices:
            pairs.append((name, self.s[start:end]))
        return pairs

    def Tokens(self):
        # type: () -> Iterator[Tuple[int, int, int]]
        """
        Yields a sequence of tokens: Tag (AttrName AttrValue?)*

        Where each Token is (Type, start_pos, end_pos)

        Note that start and end are NOT redundant! We skip over some unwanted
        characters.
        """
        m = _TAG_RE.match(self.s, self.start_pos + 1)
        if not m:
            raise RuntimeError("Couldn't find HTML tag in %r" %
                               self.TagString())
        yield TagName, m.start(1), m.end(1)

        pos = m.end(0)
        #log('POS %d', pos)

        while True:
            # don't search past the end
            m = _ATTR_RE.match(self.s, pos, self.end_pos)
            if not m:
                #log('BREAK pos %d', pos)
                break
            #log('AttrName %r', m.group(1))

            yield AttrName, m.start(1), m.end(1)

            #log('m.groups() %r', m.groups())
            if m.group(2) is not None:
                # double quoted
                yield QuotedValue, m.start(2), m.end(2)
            elif m.group(3) is not None:
                # single quoted - TODO: could have different token types
                yield QuotedValue, m.start(3), m.end(3)
            elif m.group(4) is not None:
                yield UnquotedValue, m.start(4), m.end(4)
            else:
                # <button disabled>
                yield MissingValue, m.end(0), m.end(0)

            # Skip past the closing quote (or the end of an unquoted value)
            pos = m.end(0)

        #log('TOK %r', self.s)

        m = _TAG_LAST_RE.match(self.s, pos)
        #log('_TAG_LAST_RE match %r', self.s[pos:])
        if not m:
            # Extra data at end of tag. TODO: add messages for all these.
            raise LexError(self.s, pos)

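# Sketch of TagLexer usage (here Reset() covers the whole string; in the
# validators below it gets the span of a StartTag or StartEndTag token):
#
#   tag = '<a href="/x" class="big">'
#   lex = TagLexer(tag)
#   lex.Reset(0, len(tag))
#   lex.TagName()           # -> 'a'
#   lex.GetAttrRaw('href')  # -> '/x'
#   lex.AllAttrsRaw()       # -> [('href', '/x'), ('class', 'big')]
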

# This is similar but not identical to
#    " ([^>"\x00]*) "    # double quoted value
#  | ' ([^>'\x00]*) '    # single quoted value
#
# Note: for unquoted values, & isn't allowed, and thus &amp; and &#99; and
# &#x99; are not allowed. We could relax that?
ATTR_VALUE_LEXER = CHAR_LEX + [
    (r'[^>&\x00]+', Tok.RawData),
    (r'.', Tok.Invalid),
]

ATTR_VALUE_LEXER = MakeLexer(ATTR_VALUE_LEXER)


class AttrValueLexer(object):
    """
    <a href="foo=99&amp;bar">
    <a href='foo=99&amp;bar'>
    <a href=unquoted>
    """

    def __init__(self, s):
        # type: (str) -> None
        self.s = s
        self.start_pos = -1  # Invalid
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        # type: (int, int) -> None
        """Reuse instances of this object."""
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos

        self.start_pos = start_pos
        self.end_pos = end_pos

    def NumTokens(self):
        # type: () -> int
        num_tokens = 0
        pos = self.start_pos
        for tok_id, end_pos in self.Tokens():
            if tok_id == Tok.Invalid:
                raise LexError(self.s, pos)
            pos = end_pos
            #log('pos %d', pos)
            num_tokens += 1
        return num_tokens

    def Tokens(self):
        # type: () -> Iterator[Tuple[int, int]]
        pos = self.start_pos
        while pos < self.end_pos:
            # Find the first match, like above.
            # Note: frontend/match.py uses _LongestMatch(), which is different!
            # TODO: reconcile them. This lexer should be expressible in re2c.
            for pat, tok_id in ATTR_VALUE_LEXER:
                m = pat.match(self.s, pos)
                if m:
                    if 0:
                        tok_str = m.group(0)
                        log('token = %r', tok_str)

                    end_pos = m.end(0)
                    yield tok_id, end_pos
                    pos = end_pos
                    break
            else:
                raise AssertionError('Tok.Invalid rule should have matched')

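# Sketch of AttrValueLexer usage: the value of href="foo=99&amp;bar" lexes
# into three tokens; a character that matches none of the rules (e.g. '>')
# comes out as Tok.Invalid, which NumTokens() turns into a LexError.
#
#   val = 'foo=99&amp;bar'
#   vl = AttrValueLexer(val)
#   vl.Reset(0, len(val))
#   vl.NumTokens()  # -> 3: RawData 'foo=99', CharEntity '&amp;', RawData 'bar'
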

def ReadUntilStartTag(it, tag_lexer, tag_name):
    # type: (Iterator, TagLexer, str) -> Tuple[int, int]
    """Find the next <foo>, returning its (start, end) positions

    Raise ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == Tok.StartTag and tag_lexer.TagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No start tag %r' % tag_name)


def ReadUntilEndTag(it, tag_lexer, tag_name):
    # type: (Iterator, TagLexer, str) -> Tuple[int, int]
    """Find the next </foo>, returning its (start, end) position

    Raise ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == Tok.EndTag and tag_lexer.TagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No end tag %r' % tag_name)


CHAR_ENTITY = {
    'amp': '&',
    'lt': '<',
    'gt': '>',
    'quot': '"',
    'apos': "'",
}


def ToText(s, left_pos=0, right_pos=-1):
    # type: (str, int, int) -> str
    """Given HTML, return text by unquoting &gt; and &lt; etc.

    Used by:
      doctools/oils_doc.py: PygmentsPlugin
      doctools/help_gen.py: HelpIndexCards

    In the latter case, we could process some tags, like:

    - Blue Link (not clickable, but still useful)
    - Red X

    That should be html.ToAnsi.
    """
    f = StringIO()
    out = Output(s, f, left_pos, right_pos)

    pos = left_pos
    for tok_id, end_pos in ValidTokens(s, left_pos, right_pos):
        if tok_id in (Tok.RawData, Tok.BadAmpersand, Tok.BadGreaterThan,
                      Tok.BadLessThan):
            out.SkipTo(pos)
            out.PrintUntil(end_pos)

        elif tok_id == Tok.CharEntity:  # &amp;

            entity = s[pos + 1:end_pos - 1]

            out.SkipTo(pos)
            out.Print(CHAR_ENTITY[entity])
            out.SkipTo(end_pos)

        # Not handling these yet
        elif tok_id == Tok.HexChar:
            raise AssertionError('Hex Char %r' % s[pos:pos + 20])

        elif tok_id == Tok.DecChar:
            raise AssertionError('Dec Char %r' % s[pos:pos + 20])

        else:
            # Skip everything else
            out.SkipTo(end_pos)

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()

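# Example of ToText (a sketch): tags are dropped, and named entities are
# decoded via CHAR_ENTITY.
#
#   ToText('<p>1 &lt; 2</p>')  # -> '1 < 2'
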

# https://developer.mozilla.org/en-US/docs/Glossary/Void_element
VOID_ELEMENTS = [
    'area',
    'base',
    'br',
    'col',
    'embed',
    'hr',
    'img',
    'input',
    'link',
    'meta',
    'param',
    'source',
    'track',
    'wbr',
]

LEX_ATTRS = 1 << 1
LEX_QUOTED_VALUES = 1 << 2  # href="?x=42&amp;y=99"
NO_SPECIAL_TAGS = 1 << 3  # <script> <style>, VOID tags, etc.
BALANCED_TAGS = 1 << 4  # are tags balanced?


def Validate(contents, flags, counters):
    # type: (str, int, Counters) -> None

    tag_lexer = TagLexer(contents)
    val_lexer = AttrValueLexer(contents)

    no_special_tags = bool(flags & NO_SPECIAL_TAGS)
    lx = Lexer(contents, no_special_tags=no_special_tags)
    tokens = []
    start_pos = 0
    tag_stack = []
    while True:
        tok_id, end_pos = lx.Read()
        #log('TOP %s %r', TokenName(tok_id), contents[start_pos:end_pos])

        if tok_id == Tok.Invalid:
            raise LexError(contents, start_pos)
        if tok_id == Tok.EndOfStream:
            break

        tokens.append((tok_id, end_pos))

        if tok_id == Tok.StartEndTag:
            counters.num_start_end_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRawSlice()
            counters.num_attrs += len(all_attrs)
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                counters.num_val_tokens += val_lexer.NumTokens()

            counters.debug_attrs.extend(all_attrs)

        elif tok_id == Tok.StartTag:
            counters.num_start_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRawSlice()
            counters.num_attrs += len(all_attrs)
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                counters.num_val_tokens += val_lexer.NumTokens()

            counters.debug_attrs.extend(all_attrs)

            if flags & BALANCED_TAGS:
                tag_name = lx.CanonicalTagName()
                if flags & NO_SPECIAL_TAGS:
                    tag_stack.append(tag_name)
                else:
                    # e.g. <meta> is considered self-closing, like <meta/>
                    if tag_name not in VOID_ELEMENTS:
                        tag_stack.append(tag_name)

            counters.max_tag_stack = max(counters.max_tag_stack,
                                         len(tag_stack))
        elif tok_id == Tok.EndTag:
            if flags & BALANCED_TAGS:
                try:
                    expected = tag_stack.pop()
                except IndexError:
                    raise ParseError('Tag stack empty',
                                     s=contents,
                                     start_pos=start_pos)

                actual = lx.CanonicalTagName()
                if expected != actual:
                    raise ParseError(
                        'Got unexpected closing tag %r; opening tag was %r' %
                        (contents[start_pos:end_pos], expected),
                        s=contents,
                        start_pos=start_pos)

        start_pos = end_pos

    if len(tag_stack) != 0:
        raise ParseError('Missing closing tags at end of doc: %s' %
                         ' '.join(tag_stack),
                         s=contents,
                         start_pos=start_pos)

    counters.num_tokens += len(tokens)

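# Sketch of driving Validate() directly (main() below does the same thing over
# a list of files):
#
#   counters = Counters()
#   Validate('<p>hi <br/></p>',
#            LEX_ATTRS | LEX_QUOTED_VALUES | BALANCED_TAGS, counters)
#   counters.num_start_tags      # -> 1  (<p>; <br/> counts as a StartEndTag)
#   counters.num_start_end_tags  # -> 1
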

def ToXml(htm8_str):
    # type: (str) -> str

    # TODO:
    # 1. Lex it
    # 2. < & > must be escaped
    #    a. in raw data
    #    b. in quoted strings
    # 3. <script> turned into CDATA
    # 4. void tags turned into self-closing tags
    # 5. case-sensitive tag matching - not sure about this

    tag_lexer = TagLexer(htm8_str)
    val_lexer = AttrValueLexer(htm8_str)

    f = StringIO()
    out = Output(htm8_str, f)

    lx = Lexer(htm8_str)

    pos = 0
    while True:
        tok_id, end_pos = lx.Read()

        if tok_id == Tok.Invalid:
            raise LexError(htm8_str, pos)
        if tok_id == Tok.EndOfStream:
            break

        if tok_id in (Tok.RawData, Tok.CharEntity, Tok.HexChar, Tok.DecChar):
            out.PrintUntil(end_pos)
        elif tok_id in (Tok.StartTag, Tok.StartEndTag):
            tag_lexer.Reset(pos, end_pos)
            # TODO: reduce allocations here
            all_attrs = tag_lexer.AllAttrsRawSlice()
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                # TODO: get the kind of string
                #
                # Quoted:   we need to replace & with &amp; and < with &lt;
                #           note > is not allowed
                # Unquoted: right now, we can just surround with double quotes
                #           because we don't allow any bad chars
                # Empty   : add "", so empty= becomes =""
                # Missing : add ="", so missing becomes missing=""

            tag_name = lx.CanonicalTagName()
            if tok_id == Tok.StartTag and tag_name in VOID_ELEMENTS:
                # TODO: instead of closing >, print />
                pass

        elif tok_id == Tok.BadAmpersand:
            #out.SkipTo(pos)
            out.Print('&amp;')
            out.SkipTo(end_pos)

        elif tok_id == Tok.BadGreaterThan:
            #out.SkipTo(pos)
            out.Print('&gt;')
            out.SkipTo(end_pos)
        else:
            out.PrintUntil(end_pos)

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()


class Counters(object):

    def __init__(self):
        # type: () -> None
        self.num_tokens = 0
        self.num_start_tags = 0
        self.num_start_end_tags = 0
        self.num_attrs = 0
        self.max_tag_stack = 0
        self.num_val_tokens = 0

        self.debug_attrs = []


def main(argv):
    # type: (List[str]) -> int
    action = argv[1]

    if action == 'tokens':
        contents = sys.stdin.read()

        lx = Lexer(contents)
        start_pos = 0
        while True:
            tok_id, end_pos = lx.Read()
            if tok_id == Tok.Invalid:
                raise LexError(contents, start_pos)
            if tok_id == Tok.EndOfStream:
                break

            frag = contents[start_pos:end_pos]
            log('%d %s %r', end_pos, TokenName(tok_id), frag)
            start_pos = end_pos

        return 0

    elif action in ('lex-htm8', 'parse-htm8', 'parse-xml'):

        errors = []
        counters = Counters()

        flags = LEX_ATTRS | LEX_QUOTED_VALUES
        if action.startswith('parse-'):
            flags |= BALANCED_TAGS
        if action == 'parse-xml':
            flags |= NO_SPECIAL_TAGS

        i = 0
        for line in sys.stdin:
            filename = line.strip()
            with open(filename) as f:
                contents = f.read()

            try:
                Validate(contents, flags, counters)
            except LexError as e:
                log('Lex error in %r: %s', filename, e)
                errors.append((filename, e))
            except ParseError as e:
                log('Parse error in %r: %s', filename, e)
                errors.append((filename, e))
            i += 1

        log('')
        log('%10d tokens', counters.num_tokens)
        log('%10d start/end tags', counters.num_start_end_tags)
        log('%10d start tags', counters.num_start_tags)
        log('%10d attrs', counters.num_attrs)
        log('%10d max tag stack depth', counters.max_tag_stack)
        log('%10d attr val tokens', counters.num_val_tokens)
        log('%10d errors', len(errors))
        if len(errors):
            return 1
        return 0

    elif action == 'todo':
        # Other algorithms:
        #
        # - select first subtree with given ID
        #   - this requires understanding the void tags I suppose
        # - select all subtrees that have a class
        # - materialize DOM

        # Safe-HTM8? This is a filter
        return 0

    else:
        raise RuntimeError('Invalid action %r' % action)


if __name__ == '__main__':
    sys.exit(main(sys.argv))