#!/usr/bin/env python2
"""
lazylex/html.py - Low-Level HTML Processing.

See lazylex/README.md for details.

TODO:
- Get rid of AttrValueLexer - this should be in the TagLexer
  - this also means that unquoted values can be more similar
  - We can use a single lexer mode for everything inside <>
    - the SPACE is the only difference
- UTF-8 check, like JSON8
- Static typing

"""
from __future__ import print_function

from _devbuild.gen.htm8_asdl import h8_id, h8_id_str
from typing import Iterator
from typing import Union
from typing import Any
from typing import IO

try:
    from cStringIO import StringIO
except ImportError:
    # for python3
    from io import StringIO  # type: ignore
import re
import sys

if sys.version_info.major == 2:
    from typing import List, Tuple, Optional


def log(msg, *args):
    # type: (str, *Any) -> None
    msg = msg % args
    print(msg, file=sys.stderr)


class LexError(Exception):
    """
    Examples of lex errors:

    - h8_id.Invalid, like <> or &&
    - Unclosed <!-- <? <![CDATA[ <script> <style>
    """

    def __init__(self, s, start_pos):
        # type: (str, int) -> None
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        # type: () -> str
        return '(LexError %r)' % (self.s[self.start_pos:self.start_pos + 20])


def FindLineNum(s, error_pos):
    # type: (str, int) -> int
    current_pos = 0
    line_num = 1
    while True:
        newline_pos = s.find('\n', current_pos)
        #log('current = %d, N %d, line %d', current_pos, newline_pos, line_num)

        if newline_pos == -1:  # this is the last line
            return line_num
        if newline_pos >= error_pos:
            return line_num
        line_num += 1
        current_pos = newline_pos + 1


class ParseError(Exception):
    """
    Examples of parse errors

    - unbalanced tag structure
    - ul_table.py errors
    """

    def __init__(self, msg, s=None, start_pos=-1):
        # type: (str, Optional[str], int) -> None
        self.msg = msg
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        # type: () -> str
        if self.s is not None:
            assert self.start_pos != -1, self.start_pos
            snippet = self.s[self.start_pos:self.start_pos + 20]

            line_num = FindLineNum(self.s, self.start_pos)
        else:
            snippet = ''
            line_num = -1
        msg = 'line %d: %r %r' % (line_num, self.msg, snippet)
        return msg


class Output(object):
    """Takes an underlying input buffer and an output file.  Maintains a
    position in the input buffer.

    Print FROM the input or print new text to the output.
    """

    def __init__(self, s, f, left_pos=0, right_pos=-1):
        # type: (str, IO[str], int, int) -> None
        self.s = s
        self.f = f
        self.pos = left_pos
        self.right_pos = len(s) if right_pos == -1 else right_pos

    def SkipTo(self, pos):
        # type: (int) -> None
        """Skip to a position."""
        self.pos = pos

    def PrintUntil(self, pos):
        # type: (int) -> None
        """Print from the current position until a position."""
        piece = self.s[self.pos:pos]
        self.f.write(piece)
        self.pos = pos

    def PrintTheRest(self):
        # type: () -> None
        """Print until the end of the string."""
        self.PrintUntil(self.right_pos)

    def Print(self, s):
        # type: (str) -> None
        """Print new text to the output file."""
        self.f.write(s)


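# A minimal usage sketch (not part of the library): Output interleaves
# verbatim copies of the input with replacement text.
def _DemoOutput():
    # type: () -> None
    s = '<b>hello</b>'
    f = StringIO()
    out = Output(s, f)
    out.PrintUntil(3)  # copy '<b>' from the input
    out.Print('HELLO')  # print new text
    out.SkipTo(8)  # skip over 'hello' in the input
    out.PrintTheRest()  # copy '</b>'
    log('%s', f.getvalue())  # -> '<b>HELLO</b>'

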
# HTML Tokens
# CommentBegin, ProcessingBegin, CDataBegin are "pseudo-tokens", not visible
TOKENS = 'Decl Comment CommentBegin Processing ProcessingBegin CData CDataBegin StartTag StartEndTag EndTag DecChar HexChar CharEntity RawData HtmlCData BadAmpersand BadGreaterThan BadLessThan Invalid EndOfStream'.split()


class Tok(object):
    """
    Avoid lint errors by using these aliases
    """
    pass


TOKEN_NAMES = [None] * len(TOKENS)  # type: List[str]

this_module = sys.modules[__name__]
for i, tok_str in enumerate(TOKENS):
    setattr(this_module, tok_str, i)
    setattr(Tok, tok_str, i)
    TOKEN_NAMES[i] = tok_str


def MakeLexer(rules):
    return [(re.compile(pat, re.VERBOSE), i) for (pat, i) in rules]


#
# Eggex
#
# Tag = / ~['>']+ /

# Is this valid? A single character?
# Tag = / ~'>'* /

# Maybe better: / [NOT '>']+/
# capital letters not allowed there?
#
# But then this is confusing:
# / [NOT ~digit]+/
#
# / [NOT digit] / is [^\d]
# / ~digit / is \D
#
# Or maybe:
#
# / [~ digit]+ /
# / [~ '>']+ /
# / [NOT '>']+ /

# End = / '</' Tag '>' /
# StartEnd = / '<' Tag '/>' /
# Start = / '<' Tag '>' /
#
# EntityRef = / '&' dot{* N} ';' /

# Tag name, or attribute name
# colon is used in XML

# https://www.w3.org/TR/xml/#NT-Name
# Hm there is a lot of unicode stuff.  We are simplifying parsing

_NAME = r'[a-zA-Z][a-zA-Z0-9:_\-]*'  # must start with letter

CHAR_LEX = [
    # Characters
    # https://www.w3.org/TR/xml/#sec-references
    (r'&\# [0-9]+ ;', h8_id.DecChar),
    (r'&\# x[0-9a-fA-F]+ ;', h8_id.HexChar),
    (r'& %s ;' % _NAME, h8_id.CharEntity),
    # Allow unquoted, and quoted
    (r'&', h8_id.BadAmpersand),
]

HTM8_LEX = CHAR_LEX + [
    (r'<!--', h8_id.CommentBegin),

    # Processing instructions are used for the XML header:
    # <?xml version="1.0" encoding="UTF-8"?>
    # They are technically XML-only, but in HTML5, they are another kind of
    # comment:
    #
    # https://developer.mozilla.org/en-US/docs/Web/API/ProcessingInstruction
    #
    (r'<\?', h8_id.ProcessingBegin),
    # Not necessary in HTML5, but occurs in XML
    (r'<!\[CDATA\[', h8_id.CDataBegin),  # <![CDATA[

    # Markup declarations
    # - In HTML5, there is only <!DOCTYPE html>
    # - XML has 4 more declarations: <!ELEMENT ...> ATTLIST ENTITY NOTATION
    #   - these seem to be part of DTD
    # - it's useful to skip these, and be able to parse the rest of the document
    # - Note: < is allowed?
    (r'<! [^>\x00]+ >', h8_id.Decl),

    # Tags
    # Notes:
    # - We look for a valid tag name, but we don't validate attributes.
    #   That's done in the tag lexer.
    # - We don't allow leading whitespace
    (r'</ (%s) >' % _NAME, h8_id.EndTag),
    # self-closing <br/> comes before StartTag
    # could/should these be collapsed into one rule?
    (r'< (%s) [^>\x00]* />' % _NAME, h8_id.StartEndTag),  # self-closing <br/>
    (r'< (%s) [^>\x00]* >' % _NAME, h8_id.StartTag),  # start <a>

    # HTML5 allows unescaped > in raw data, but < is not allowed.
    # https://stackoverflow.com/questions/10462348/right-angle-bracket-in-html
    #
    # - My early blog has THREE errors when disallowing >
    # - So do some .wwz files
    (r'[^&<>\x00]+', h8_id.RawData),
    (r'>', h8_id.BadGreaterThan),
    # < is an error
    (r'.', h8_id.Invalid),
]

# Old notes:
#
# Non-greedy matches are regular and can be matched in linear time
# with RE2.
#
# https://news.ycombinator.com/item?id=27099798
#
# Maybe try combining all of these for speed.

# . is any char except newline
# https://re2c.org/manual/manual_c.html

# Discarded options
#(r'<!-- .*? -->', h8_id.Comment),

# Hack from Claude: \s\S instead of re.DOTALL.  I don't like this
#(r'<!-- [\s\S]*? -->', h8_id.Comment),
#(r'<!-- (?:.|[\n])*? -->', h8_id.Comment),

HTM8_LEX_COMPILED = MakeLexer(HTM8_LEX)


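# A minimal sketch (not part of the library) of the leniency noted above:
# unescaped '&' and '>' in raw data lex as BadAmpersand / BadGreaterThan
# instead of failing, while a stray '<' lexes as h8_id.Invalid.
def _DemoLeniency():
    # type: () -> None
    for tok_id, end_pos in ValidTokenList('a > b &'):
        log('%s %d', h8_id_str(tok_id), end_pos)

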
class Lexer(object):

    def __init__(self, s, left_pos=0, right_pos=-1, no_special_tags=False):
        # type: (str, int, int, bool) -> None
        self.s = s
        self.pos = left_pos
        self.right_pos = len(s) if right_pos == -1 else right_pos
        self.no_special_tags = no_special_tags

        self.cache = {}  # string -> compiled regex pattern object

        # either </script> or </style> - we search until we see that
        self.search_state = None  # type: Optional[str]

        # Position of tag name, if applicable
        # - Set after you get a StartTag, EndTag, or StartEndTag
        # - Unset on other tags
        self.tag_pos_left = -1
        self.tag_pos_right = -1

    def _Peek(self):
        # type: () -> Tuple[int, int]
        """
        Note: not using _Peek() now
        """
        if self.pos == self.right_pos:
            return h8_id.EndOfStream, self.pos

        assert self.pos < self.right_pos, self.pos

        if self.search_state is not None and not self.no_special_tags:
            # TODO: case-insensitive search for </SCRIPT> <SCRipt> ?
            #
            # Another strategy: enter a mode where we find ONLY the end tag
            # regex, and any data that's not <, and then check the canonical
            # tag name for 'script' or 'style'.
            pos = self.s.find(self.search_state, self.pos)
            if pos == -1:
                # unterminated <script> or <style>
                raise LexError(self.s, self.pos)
            self.search_state = None
            # beginning
            return h8_id.HtmlCData, pos

        # Find the first match.
        # Note: frontend/match.py uses _LongestMatch(), which is different!
        # TODO: reconcile them.  This lexer should be expressible in re2c.

        for pat, tok_id in HTM8_LEX_COMPILED:
            m = pat.match(self.s, self.pos)
            if m:
                if tok_id in (h8_id.StartTag, h8_id.EndTag, h8_id.StartEndTag):
                    self.tag_pos_left = m.start(1)
                    self.tag_pos_right = m.end(1)
                else:
                    # Reset state
                    self.tag_pos_left = -1
                    self.tag_pos_right = -1

                if tok_id == h8_id.CommentBegin:
                    pos = self.s.find('-->', self.pos)
                    if pos == -1:
                        # unterminated <!--
                        raise LexError(self.s, self.pos)
                    return h8_id.Comment, pos + 3  # -->

                if tok_id == h8_id.ProcessingBegin:
                    pos = self.s.find('?>', self.pos)
                    if pos == -1:
                        # unterminated <?
                        raise LexError(self.s, self.pos)
                    return h8_id.Processing, pos + 2  # ?>

                if tok_id == h8_id.CDataBegin:
                    pos = self.s.find(']]>', self.pos)
                    if pos == -1:
                        # unterminated <![CDATA[
                        raise LexError(self.s, self.pos)
                    return h8_id.CData, pos + 3  # ]]>

                if tok_id == h8_id.StartTag:
                    # TODO: reduce allocations
                    if (self.TagNameEquals('script') or
                            self.TagNameEquals('style')):
                        # <SCRipt a=b> -> </SCRipt>
                        self.search_state = '</' + self._LiteralTagName() + '>'

                return tok_id, m.end()
        else:
            raise AssertionError('h8_id.Invalid rule should have matched')

    def TagNameEquals(self, expected):
        # type: (str) -> bool
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        # TODO: In C++, this does not need an allocation.  Can we test
        # directly?
        return expected == self.CanonicalTagName()

    def _LiteralTagName(self):
        # type: () -> str
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        return self.s[self.tag_pos_left:self.tag_pos_right]

    def CanonicalTagName(self):
        # type: () -> str
        tag_name = self._LiteralTagName()
        # Most tags are already lower case, so avoid allocation with this
        # conditional
        # TODO: this could go in the mycpp runtime?
        if tag_name.islower():
            return tag_name
        else:
            return tag_name.lower()

    def Read(self):
        # type: () -> Tuple[int, int]
        tok_id, end_pos = self._Peek()
        self.pos = end_pos  # advance
        return tok_id, end_pos

    def LookAhead(self, regex):
        # type: (str) -> bool
        # Cache the regex compilation.  This could also be LookAheadFor(THEAD)
        # or something.
        pat = self.cache.get(regex)
        if pat is None:
            pat = re.compile(regex)
            self.cache[regex] = pat

        m = pat.match(self.s, self.pos)
        return m is not None


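# A minimal sketch (not part of the library) of the special-tag handling in
# _Peek() above: everything between <script> and </script> comes back as a
# single h8_id.HtmlCData token, because the lexer switches to searching for
# the literal end tag.
def _DemoScriptCData():
    # type: () -> None
    lx = Lexer('<script>if (a < b) { f(); }</script>')
    tok_id, end_pos = lx.Read()  # h8_id.StartTag for <script>
    tok_id, end_pos = lx.Read()  # h8_id.HtmlCData, ends right before </script>
    log('%s ends at %d', h8_id_str(tok_id), end_pos)

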
def _Tokens(s, left_pos, right_pos):
    # type: (str, int, int) -> Iterator[Tuple[int, int]]
    """
    Args:
      s: string to parse
      left_pos, right_pos: Optional span boundaries.
    """
    lx = Lexer(s, left_pos, right_pos)
    while True:
        tok_id, pos = lx.Read()
        yield tok_id, pos
        if tok_id == h8_id.EndOfStream:
            break


def ValidTokens(s, left_pos=0, right_pos=-1):
    # type: (str, int, int) -> Iterator[Tuple[int, int]]
    """Wrapper around _Tokens to prevent callers from having to handle Invalid.

    I'm not combining the two functions because I might want to do a
    'yield' transformation on Tokens(), and exceptions might complicate
    that.
    """
    pos = left_pos
    for tok_id, end_pos in _Tokens(s, left_pos, right_pos):
        if tok_id == h8_id.Invalid:
            raise LexError(s, pos)
        yield tok_id, end_pos
        pos = end_pos


def ValidTokenList(s, no_special_tags=False):
    # type: (str, bool) -> List[Tuple[int, int]]
    """A wrapper that can be more easily translated to C++.  Doesn't use iterators."""

    start_pos = 0
    tokens = []
    lx = Lexer(s, no_special_tags=no_special_tags)
    while True:
        tok_id, end_pos = lx.Read()
        tokens.append((tok_id, end_pos))
        if tok_id == h8_id.EndOfStream:
            break
        if tok_id == h8_id.Invalid:
            raise LexError(s, start_pos)
        start_pos = end_pos
    return tokens


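# A minimal sketch (not part of the library): each token is (tok_id, end_pos),
# so a token's text is the span from the previous end_pos to its own.
def _DemoValidTokenList():
    # type: () -> None
    s = '<p>hi</p>'
    start_pos = 0
    for tok_id, end_pos in ValidTokenList(s):
        # StartTag '<p>', RawData 'hi', EndTag '</p>', then EndOfStream
        log('%s %r', h8_id_str(tok_id), s[start_pos:end_pos])
        start_pos = end_pos

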
# Tag names:
# Match <a or </a
# Match <h2, but not <2h
#
# HTML 5 doesn't restrict tag names at all
# https://html.spec.whatwg.org/#toc-syntax
#
# XML allows : - .
# https://www.w3.org/TR/xml/#NT-NameChar

# Namespaces for MathML, SVG
# XLink, XML, XMLNS
#
# https://infra.spec.whatwg.org/#namespaces
#
# Allow - for td-attrs

# Be very lenient - just no whitespace or special HTML chars
# I don't think this is more lenient than HTML5, though we should check.
_UNQUOTED_VALUE = r'''[^ \t\r\n<>&"'\x00]*'''

# TODO: we don't need to capture the tag name here?  That's done at the top
# level
_TAG_RE = re.compile(r'/? \s* (%s)' % _NAME, re.VERBOSE)

_TAG_LAST_RE = re.compile(r'\s* /? >', re.VERBOSE)

# To match href="foo"
# Note: in HTML5 and XML, single quoted attributes are also valid

# <button disabled> is standard usage

# NOTE: This used to allow whitespace around =
# <a foo = "bar"> makes sense in XML
# But then you also have
# <a foo= bar> - which is TWO attributes, in HTML5
# So the space is problematic

_ATTR_RE = re.compile(
    r'''
\s+                   # Leading whitespace is required
(%s)                  # Attribute name
(?:                   # Optional attribute value
    \s* = \s*         # Spaces allowed around =
    (?:
        " ([^>"\x00]*) "    # double quoted value
      | ' ([^>'\x00]*) '    # single quoted value
      | (%s)                # unquoted attribute value
    )
)?
''' % (_NAME, _UNQUOTED_VALUE), re.VERBOSE)

TagName, AttrName, UnquotedValue, QuotedValue, MissingValue = range(5)


class TagLexer(object):
    """
    Given a tag like <a href="..."> or <link type="..." />, the TagLexer
    provides a few operations:

    - What is the tag?
    - Iterate through the attributes, giving (name, value_start_pos, value_end_pos)
    """

    def __init__(self, s):
        # type: (str) -> None
        self.s = s
        self.start_pos = -1  # Invalid
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        # type: (int, int) -> None
        """Reuse instances of this object."""
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos

        self.start_pos = start_pos
        self.end_pos = end_pos

    def TagString(self):
        # type: () -> str
        return self.s[self.start_pos:self.end_pos]

    def TagName(self):
        # type: () -> str
        # First event
        tok_id, start, end = next(self.Tokens())
        return self.s[start:end]

    def GetSpanForAttrValue(self, attr_name):
        # type: (str) -> Tuple[int, int]
        """
        Used by oils_doc.py, for href shortcuts
        """
        # Algorithm: search for QuotedValue or UnquotedValue after AttrName
        # TODO: Could also cache these

        events = self.Tokens()
        val = (-1, -1)
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == AttrName:
                    name = self.s[start:end]
                    if name == attr_name:
                        # The value should come next
                        tok_id, start, end = next(events)
                        assert tok_id in (QuotedValue, UnquotedValue,
                                          MissingValue), tok_id
                        val = start, end
                        break

        except StopIteration:
            pass
        return val

    def GetAttrRaw(self, attr_name):
        # type: (str) -> Optional[str]
        """
        Return the value, which may be UNESCAPED.
        """
        start, end = self.GetSpanForAttrValue(attr_name)
        if start == -1:
            return None
        return self.s[start:end]

    def AllAttrsRawSlice(self):
        # type: () -> List[Tuple[str, int, int]]
        """
        Get a list of triples, like [('class', 3, 5), ('href', 9, 12)]
        """
        slices = []
        events = self.Tokens()
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == AttrName:
                    name = self.s[start:end]

                    # The value should come next
                    tok_id, start, end = next(events)
                    assert tok_id in (QuotedValue, UnquotedValue,
                                      MissingValue), tok_id
                    # Note: quoted values may have &amp;
                    # We would need ANOTHER lexer to unescape them, but we
                    # don't need that for ul-table
                    slices.append((name, start, end))
        except StopIteration:
            pass
        return slices

    def AllAttrsRaw(self):
        # type: () -> List[Tuple[str, str]]
        """
        Get a list of pairs [('class', 'foo'), ('href', '?foo=1&amp;bar=2')]

        The quoted values may be escaped.  We would need another lexer to
        unescape them.
        """
        slices = self.AllAttrsRawSlice()
        pairs = []
        for name, start, end in slices:
            pairs.append((name, self.s[start:end]))
        return pairs

    def Tokens(self):
        # type: () -> Iterator[Tuple[int, int, int]]
        """
        Yields a sequence of tokens: Tag (AttrName AttrValue?)*

        Where each Token is (Type, start_pos, end_pos)

        Note that start and end are NOT redundant!  We skip over some unwanted
        characters.
        """
        m = _TAG_RE.match(self.s, self.start_pos + 1)
        if not m:
            raise RuntimeError("Couldn't find HTML tag in %r" %
                               self.TagString())
        yield TagName, m.start(1), m.end(1)

        pos = m.end(0)
        #log('POS %d', pos)

        while True:
            # don't search past the end
            m = _ATTR_RE.match(self.s, pos, self.end_pos)
            if not m:
                #log('BREAK pos %d', pos)
                break
            #log('AttrName %r', m.group(1))

            yield AttrName, m.start(1), m.end(1)

            #log('m.groups() %r', m.groups())
            if m.group(2) is not None:
                # double quoted
                yield QuotedValue, m.start(2), m.end(2)
            elif m.group(3) is not None:
                # single quoted - TODO: could have different token types
                yield QuotedValue, m.start(3), m.end(3)
            elif m.group(4) is not None:
                yield UnquotedValue, m.start(4), m.end(4)
            else:
                # <button disabled>
                end = m.end(0)
                yield MissingValue, end, end

            # Skip past the "
            pos = m.end(0)

        #log('TOK %r', self.s)

        m = _TAG_LAST_RE.match(self.s, pos)
        #log('_TAG_LAST_RE match %r', self.s[pos:])
        if not m:
            # Extra data at end of tag.  TODO: add messages for all these.
            raise LexError(self.s, pos)


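# A minimal sketch (not part of the library): the positions passed to Reset()
# are the span of one raw tag, as found by Lexer.
def _DemoTagLexer():
    # type: () -> None
    s = '<a href="/foo" class=bar>'
    tag_lexer = TagLexer(s)
    tag_lexer.Reset(0, len(s))
    log('tag = %s', tag_lexer.TagName())  # -> a
    log('attrs = %s', tag_lexer.AllAttrsRaw())
    # -> [('href', '/foo'), ('class', 'bar')]

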
# This is similar but not identical to
#    " ([^>"\x00]*) "    # double quoted value
#  | ' ([^>'\x00]*) '    # single quoted value
#
# Note: for unquoted values, & isn't allowed, and thus &amp; and &#99; and
# &#x99; are not allowed.  We could relax that?
ATTR_VALUE_LEXER = CHAR_LEX + [
    (r'[^>&\x00]+', h8_id.RawData),
    (r'.', h8_id.Invalid),
]

ATTR_VALUE_LEXER = MakeLexer(ATTR_VALUE_LEXER)


class AttrValueLexer(object):
    """
    <a href="foo=99&amp;bar">
    <a href='foo=99&amp;bar'>
    <a href=unquoted>
    """

    def __init__(self, s):
        # type: (str) -> None
        self.s = s
        self.start_pos = -1  # Invalid
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        # type: (int, int) -> None
        """Reuse instances of this object."""
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos

        self.start_pos = start_pos
        self.end_pos = end_pos

    def NumTokens(self):
        # type: () -> int
        num_tokens = 0
        pos = self.start_pos
        for tok_id, end_pos in self.Tokens():
            if tok_id == h8_id.Invalid:
                raise LexError(self.s, pos)
            pos = end_pos
            #log('pos %d', pos)
            num_tokens += 1
        return num_tokens

    def Tokens(self):
        # type: () -> Iterator[Tuple[int, int]]
        pos = self.start_pos
        while pos < self.end_pos:
            # Find the first match, like above.
            # Note: frontend/match.py uses _LongestMatch(), which is different!
            # TODO: reconcile them.  This lexer should be expressible in re2c.
            for pat, tok_id in ATTR_VALUE_LEXER:
                m = pat.match(self.s, pos)
                if m:
                    if 0:
                        tok_str = m.group(0)
                        log('token = %r', tok_str)

                    end_pos = m.end(0)
                    yield tok_id, end_pos
                    pos = end_pos
                    break
            else:
                raise AssertionError('h8_id.Invalid rule should have matched')


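# A minimal sketch (not part of the library): validate the characters of one
# attribute value span.  'foo=99&amp;bar' lexes as RawData, CharEntity,
# RawData - 3 tokens.
def _DemoAttrValueLexer():
    # type: () -> None
    s = '<a href="foo=99&amp;bar">'
    val_lexer = AttrValueLexer(s)
    val_lexer.Reset(9, 23)  # the span between the double quotes
    log('%d tokens', val_lexer.NumTokens())

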
def ReadUntilStartTag(it, tag_lexer, tag_name):
    # type: (Iterator, TagLexer, str) -> Tuple[int, int]
    """Find the next <foo>, returning its (start, end) positions

    Raise ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == h8_id.StartTag and tag_lexer.TagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No start tag %r' % tag_name)


def ReadUntilEndTag(it, tag_lexer, tag_name):
    # type: (Iterator, TagLexer, str) -> Tuple[int, int]
    """Find the next </foo>, returning its (start, end) position

    Raise ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == h8_id.EndTag and tag_lexer.TagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No end tag %r' % tag_name)


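# A minimal sketch (not part of the library): find the span of the first
# <title> start tag in a document.
def _DemoReadUntil():
    # type: () -> None
    s = '<html><title>hi</title></html>'
    it = ValidTokens(s)
    tag_lexer = TagLexer(s)
    start, end = ReadUntilStartTag(it, tag_lexer, 'title')
    log('<title> spans [%d, %d)', start, end)

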
CHAR_ENTITY = {
    'amp': '&',
    'lt': '<',
    'gt': '>',
    'quot': '"',
    'apos': "'",
}


def ToText(s, left_pos=0, right_pos=-1):
    # type: (str, int, int) -> str
    """Given HTML, return text by unquoting &gt; and &lt; etc.

    Used by:
      doctools/oils_doc.py: PygmentsPlugin
      doctools/help_gen.py: HelpIndexCards

    In the latter case, we could process some tags, like:

    - Blue Link (not clickable, but still useful)
    - Red X

    That should be html.ToAnsi.
    """
    f = StringIO()
    out = Output(s, f, left_pos, right_pos)

    pos = left_pos
    for tok_id, end_pos in ValidTokens(s, left_pos, right_pos):
        if tok_id in (h8_id.RawData, h8_id.BadAmpersand, h8_id.BadGreaterThan,
                      h8_id.BadLessThan):
            out.SkipTo(pos)
            out.PrintUntil(end_pos)

        elif tok_id == h8_id.CharEntity:  # &amp;

            entity = s[pos + 1:end_pos - 1]

            out.SkipTo(pos)
            out.Print(CHAR_ENTITY[entity])
            out.SkipTo(end_pos)

        # Not handling these yet
        elif tok_id == h8_id.HexChar:
            raise AssertionError('Hex Char %r' % s[pos:pos + 20])

        elif tok_id == h8_id.DecChar:
            raise AssertionError('Dec Char %r' % s[pos:pos + 20])

        else:
            # Skip everything else
            out.SkipTo(end_pos)

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()


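# A minimal sketch (not part of the library): tags are dropped, and the five
# entities in CHAR_ENTITY are decoded.
def _DemoToText():
    # type: () -> None
    log('%r', ToText('<b>x &amp; y</b>'))  # -> 'x & y'

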
# https://developer.mozilla.org/en-US/docs/Glossary/Void_element
VOID_ELEMENTS = [
    'area',
    'base',
    'br',
    'col',
    'embed',
    'hr',
    'img',
    'input',
    'link',
    'meta',
    'param',
    'source',
    'track',
    'wbr',
]

LEX_ATTRS = 1 << 1
LEX_QUOTED_VALUES = 1 << 2  # href="?x=42&amp;y=99"
NO_SPECIAL_TAGS = 1 << 3  # <script> <style>, VOID tags, etc.
BALANCED_TAGS = 1 << 4  # are tags balanced?


def Validate(contents, flags, counters):
    # type: (str, int, Counters) -> None

    tag_lexer = TagLexer(contents)
    val_lexer = AttrValueLexer(contents)

    no_special_tags = bool(flags & NO_SPECIAL_TAGS)
    lx = Lexer(contents, no_special_tags=no_special_tags)
    tokens = []
    start_pos = 0
    tag_stack = []
    while True:
        tok_id, end_pos = lx.Read()
        #log('TOP %s %r', h8_id_str(tok_id), contents[start_pos:end_pos])

        if tok_id == h8_id.Invalid:
            raise LexError(contents, start_pos)
        if tok_id == h8_id.EndOfStream:
            break

        tokens.append((tok_id, end_pos))

        if tok_id == h8_id.StartEndTag:
            counters.num_start_end_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRawSlice()
            counters.num_attrs += len(all_attrs)
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                counters.num_val_tokens += val_lexer.NumTokens()

            counters.debug_attrs.extend(all_attrs)

        elif tok_id == h8_id.StartTag:
            counters.num_start_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRawSlice()
            counters.num_attrs += len(all_attrs)
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                counters.num_val_tokens += val_lexer.NumTokens()

            counters.debug_attrs.extend(all_attrs)

            if flags & BALANCED_TAGS:
                tag_name = lx.CanonicalTagName()
                if flags & NO_SPECIAL_TAGS:
                    tag_stack.append(tag_name)
                else:
                    # e.g. <meta> is considered self-closing, like <meta/>
                    if tag_name not in VOID_ELEMENTS:
                        tag_stack.append(tag_name)

                counters.max_tag_stack = max(counters.max_tag_stack,
                                             len(tag_stack))
        elif tok_id == h8_id.EndTag:
            if flags & BALANCED_TAGS:
                try:
                    expected = tag_stack.pop()
                except IndexError:
                    raise ParseError('Tag stack empty',
                                     s=contents,
                                     start_pos=start_pos)

                actual = lx.CanonicalTagName()
                if expected != actual:
                    raise ParseError(
                        'Got unexpected closing tag %r; opening tag was %r' %
                        (contents[start_pos:end_pos], expected),
                        s=contents,
                        start_pos=start_pos)

        start_pos = end_pos

    if len(tag_stack) != 0:
        raise ParseError('Missing closing tags at end of doc: %s' %
                         ' '.join(tag_stack),
                         s=contents,
                         start_pos=start_pos)

    counters.num_tokens += len(tokens)


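# A minimal sketch (not part of the library): validate a whole document and
# check that its tags are balanced.
def _DemoValidate():
    # type: () -> None
    counters = Counters()
    flags = LEX_ATTRS | LEX_QUOTED_VALUES | BALANCED_TAGS
    Validate('<p><b>hi</b></p>', flags, counters)
    log('%d tokens, max stack %d', counters.num_tokens,
        counters.max_tag_stack)

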
def ToXml(htm8_str):
    # type: (str) -> str

    # TODO:
    # 1. Lex it
    # 2. < & > must be escaped
    #    a. in raw data
    #    b. in quoted strings
    # 3. <script> turned into CDATA
    # 4. void tags turned into self-closing tags
    # 5. case-sensitive tag matching - not sure about this

    tag_lexer = TagLexer(htm8_str)
    val_lexer = AttrValueLexer(htm8_str)

    f = StringIO()
    out = Output(htm8_str, f)

    lx = Lexer(htm8_str)

    pos = 0
    while True:
        tok_id, end_pos = lx.Read()

        if tok_id == h8_id.Invalid:
            raise LexError(htm8_str, pos)
        if tok_id == h8_id.EndOfStream:
            break

        if tok_id in (h8_id.RawData, h8_id.CharEntity, h8_id.HexChar,
                      h8_id.DecChar):
            out.PrintUntil(end_pos)
        elif tok_id in (h8_id.StartTag, h8_id.StartEndTag):
            tag_lexer.Reset(pos, end_pos)
            # TODO: reduce allocations here
            all_attrs = tag_lexer.AllAttrsRawSlice()
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                # TODO: get the kind of string
                #
                # Quoted  : we need to replace & with &amp; and < with &lt;
                #           note > is not allowed
                # Unquoted: right now, we can just surround with double quotes
                #           because we don't allow any bad chars
                # Empty   : add "", so empty= becomes =""
                # Missing : add ="", so missing becomes missing=""

            tag_name = lx.CanonicalTagName()
            if tok_id == h8_id.StartTag and tag_name in VOID_ELEMENTS:
                # TODO: instead of closing >, print />
                pass

        elif tok_id == h8_id.BadAmpersand:
            #out.SkipTo(pos)
            out.Print('&amp;')
            out.SkipTo(end_pos)

        elif tok_id == h8_id.BadGreaterThan:
            #out.SkipTo(pos)
            out.Print('&gt;')
            out.SkipTo(end_pos)
        else:
            out.PrintUntil(end_pos)

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()


class Counters(object):

    def __init__(self):
        # type: () -> None
        self.num_tokens = 0
        self.num_start_tags = 0
        self.num_start_end_tags = 0
        self.num_attrs = 0
        self.max_tag_stack = 0
        self.num_val_tokens = 0

        self.debug_attrs = []


def main(argv):
    # type: (List[str]) -> int
    action = argv[1]

    if action == 'tokens':
        contents = sys.stdin.read()

        lx = Lexer(contents)
        start_pos = 0
        while True:
            tok_id, end_pos = lx.Read()
            if tok_id == h8_id.Invalid:
                raise LexError(contents, start_pos)
            if tok_id == h8_id.EndOfStream:
                break

            frag = contents[start_pos:end_pos]
            log('%d %s %r', end_pos, h8_id_str(tok_id), frag)
            start_pos = end_pos

        return 0

    elif action in ('lex-htm8', 'parse-htm8', 'parse-xml'):

        errors = []
        counters = Counters()

        flags = LEX_ATTRS | LEX_QUOTED_VALUES
        if action.startswith('parse-'):
            flags |= BALANCED_TAGS
        if action == 'parse-xml':
            flags |= NO_SPECIAL_TAGS

        i = 0
        for line in sys.stdin:
            filename = line.strip()
            with open(filename) as f:
                contents = f.read()

            try:
                Validate(contents, flags, counters)
            except LexError as e:
                log('Lex error in %r: %s', filename, e)
                errors.append((filename, e))
            except ParseError as e:
                log('Parse error in %r: %s', filename, e)
                errors.append((filename, e))
            i += 1

        log('')
        log('%10d tokens', counters.num_tokens)
        log('%10d start/end tags', counters.num_start_end_tags)
        log('%10d start tags', counters.num_start_tags)
        log('%10d attrs', counters.num_attrs)
        log('%10d max tag stack depth', counters.max_tag_stack)
        log('%10d attr val tokens', counters.num_val_tokens)
        log('%10d errors', len(errors))
        if len(errors):
            return 1
        return 0

    elif action == 'todo':
        # Other algorithms:
        #
        # - select first subtree with given ID
        #   - this requires understanding the void tags I suppose
        # - select all subtrees that have a class
        # - materialize DOM

        # Safe-HTM8?  This is a filter
        return 0

    else:
        raise RuntimeError('Invalid action %r' % action)


if __name__ == '__main__':
    sys.exit(main(sys.argv))