#!/usr/bin/env python2
"""
lazylex/html.py - Low-Level HTML Processing.

See lazylex/README.md for details.

TODO:
- Get rid of AttrValueLexer - this should be in the TagLexer
  - this also means that unquoted values can be more similar
  - We can use a single lexer mode for everything inside <>
    - the SPACE is the only difference
- UTF-8 check, like JSON8
- Static typing

"""
from __future__ import print_function

from _devbuild.gen.htm8_asdl import h8_id, h8_id_t, h8_id_str
from typing import Dict, Iterator, Union, Any, IO

try:
    from cStringIO import StringIO
except ImportError:
    # for python3
    from io import StringIO  # type: ignore
import re
import sys

if sys.version_info.major == 2:
    from typing import List, Tuple, Optional


def log(msg, *args):
    # type: (str, *Any) -> None
    msg = msg % args
    print(msg, file=sys.stderr)

class LexError(Exception):
    """
    Examples of lex errors:

    - h8_id.Invalid, like <> or &&
    - Unclosed <!-- <? <![CDATA[ <script> <style>
    """

    def __init__(self, s, start_pos):
        # type: (str, int) -> None
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        # type: () -> str
        return '(LexError %r)' % (self.s[self.start_pos:self.start_pos + 20])

def FindLineNum(s, error_pos):
    # type: (str, int) -> int
    current_pos = 0
    line_num = 1
    while True:
        newline_pos = s.find('\n', current_pos)
        #log('current = %d, N %d, line %d', current_pos, newline_pos, line_num)

        if newline_pos == -1:  # this is the last line
            return line_num
        if newline_pos >= error_pos:
            return line_num
        line_num += 1
        current_pos = newline_pos + 1

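# A quick sanity check (a sketch; line numbers are 1-based):
#
#   FindLineNum('a\nb\nc', 4)  # position 4 is 'c', on line 3
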

class ParseError(Exception):
    """
    Examples of parse errors:

    - unbalanced tag structure
    - ul_table.py errors
    """

    def __init__(self, msg, s=None, start_pos=-1):
        # type: (str, Optional[str], int) -> None
        self.msg = msg
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        # type: () -> str
        if self.s is not None:
            assert self.start_pos != -1, self.start_pos
            snippet = (self.s[self.start_pos:self.start_pos + 20])

            line_num = FindLineNum(self.s, self.start_pos)
        else:
            snippet = ''
            line_num = -1
        msg = 'line %d: %r %r' % (line_num, self.msg, snippet)
        return msg

class Output(object):
    """Takes an underlying input buffer and an output file. Maintains a
    position in the input buffer.

    Print FROM the input or print new text to the output.
    """

    def __init__(self, s, f, left_pos=0, right_pos=-1):
        # type: (str, IO[str], int, int) -> None
        self.s = s
        self.f = f
        self.pos = left_pos
        self.right_pos = len(s) if right_pos == -1 else right_pos

    def SkipTo(self, pos):
        # type: (int) -> None
        """Skip to a position."""
        self.pos = pos

    def PrintUntil(self, pos):
        # type: (int) -> None
        """Print from the current position until a position."""
        piece = self.s[self.pos:pos]
        self.f.write(piece)
        self.pos = pos

    def PrintTheRest(self):
        # type: () -> None
        """Print until the end of the string."""
        self.PrintUntil(self.right_pos)

    def Print(self, s):
        # type: (str) -> None
        """Print new text to the output file."""
        self.f.write(s)

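# The copy-and-patch pattern Output supports (a sketch, verified by hand):
#
#   f = StringIO()
#   out = Output('<b>bold</b>', f)
#   out.PrintUntil(3)   # copy '<b>'
#   out.Print('BOLD')   # splice in new text
#   out.SkipTo(7)       # skip over 'bold' in the input
#   out.PrintTheRest()  # copy '</b>'
#   # f.getvalue() == '<b>BOLD</b>'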

# HTML Tokens
# CommentBegin, ProcessingBegin, CDataBegin are "pseudo-tokens", not visible
TOKENS = 'Decl Comment CommentBegin Processing ProcessingBegin CData CDataBegin StartTag StartEndTag EndTag DecChar HexChar CharEntity RawData HtmlCData BadAmpersand BadGreaterThan BadLessThan Invalid EndOfStream'.split(
)


class Tok(object):
    """
    Avoid lint errors by using these aliases
    """
    pass


TOKEN_NAMES = [None] * len(TOKENS)  # type: List[str]

this_module = sys.modules[__name__]
for i, tok_str in enumerate(TOKENS):
    setattr(this_module, tok_str, i)
    setattr(Tok, tok_str, i)
    TOKEN_NAMES[i] = tok_str


def MakeLexer(rules):
    return [(re.compile(pat, re.VERBOSE), i) for (pat, i) in rules]

#
# Eggex
#
# Tag = / ~['>']+ /

# Is this valid? A single character?
# Tag = / ~'>'* /

# Maybe better: / [NOT '>']+ /
# capital letters not allowed there?
#
# But then this is confusing:
# / [NOT ~digit]+ /
#
# / [NOT digit] / is [^\d]
# / ~digit /     is \D
#
# Or maybe:
#
# / [~ digit]+ /
# / [~ '>']+ /
# / [NOT '>']+ /

# End = / '</' Tag '>' /
# StartEnd = / '<' Tag '/>' /
# Start = / '<' Tag '>' /
#
# EntityRef = / '&' dot{* N} ';' /

# Tag name, or attribute name
# colon is used in XML

# https://www.w3.org/TR/xml/#NT-Name
# Hm, there is a lot of Unicode stuff there. We are simplifying parsing.
_NAME = r'[a-zA-Z][a-zA-Z0-9:_\-]*'  # must start with letter

CHAR_LEX = [
    # Characters
    # https://www.w3.org/TR/xml/#sec-references
    (r'&\# [0-9]+ ;', h8_id.DecChar),
    (r'&\# x[0-9a-fA-F]+ ;', h8_id.HexChar),
    (r'& %s ;' % _NAME, h8_id.CharEntity),
    # A bare & that isn't part of an entity reference
    (r'&', h8_id.BadAmpersand),
]

HTM8_LEX = CHAR_LEX + [
    (r'<!--', h8_id.CommentBegin),

    # Processing instructions are used for the XML header:
    # <?xml version="1.0" encoding="UTF-8"?>
    # They are technically XML-only, but in HTML5, they are another kind of
    # comment:
    #
    # https://developer.mozilla.org/en-US/docs/Web/API/ProcessingInstruction
    #
    (r'<\?', h8_id.ProcessingBegin),
    # Not necessary in HTML5, but occurs in XML
    (r'<!\[CDATA\[', h8_id.CDataBegin),  # <![CDATA[

    # Markup declarations
    # - In HTML5, there is only <!DOCTYPE html>
    # - XML has 4 more declarations: <!ELEMENT ...> ATTLIST ENTITY NOTATION
    #   - these seem to be part of DTD
    # - it's useful to skip these, and be able to parse the rest of the document
    # - Note: < is allowed?
    (r'<! [^>\x00]+ >', h8_id.Decl),

    # Tags
    # Notes:
    # - We look for a valid tag name, but we don't validate attributes.
    #   That's done in the tag lexer.
    # - We don't allow leading whitespace
    (r'</ (%s) >' % _NAME, h8_id.EndTag),
    # self-closing <br/> comes before StartTag
    # could/should these be collapsed into one rule?
    (r'< (%s) [^>\x00]* />' % _NAME, h8_id.StartEndTag),  # self-closing <br/>
    (r'< (%s) [^>\x00]* >' % _NAME, h8_id.StartTag),  # start <a>

    # HTML5 allows unescaped > in raw data, but < is not allowed.
    # https://stackoverflow.com/questions/10462348/right-angle-bracket-in-html
    #
    # - My early blog has THREE errors when disallowing >
    # - So do some .wwz files
    (r'[^&<>\x00]+', h8_id.RawData),
    (r'>', h8_id.BadGreaterThan),
    # < is an error
    (r'.', h8_id.Invalid),
]

# Old notes:
#
# Non-greedy matches are regular and can be matched in linear time
# with RE2.
#
# https://news.ycombinator.com/item?id=27099798
#
# Maybe try combining all of these for speed.

# . is any char except newline
# https://re2c.org/manual/manual_c.html

# Discarded options
#(r'<!-- .*? -->', h8_id.Comment),

# Hack from Claude: \s\S instead of re.DOTALL. I don't like this
#(r'<!-- [\s\S]*? -->', h8_id.Comment),
#(r'<!-- (?:.|[\n])*? -->', h8_id.Comment),

HTM8_LEX_COMPILED = MakeLexer(HTM8_LEX)

class Lexer(object):

    def __init__(self, s, left_pos=0, right_pos=-1, no_special_tags=False):
        # type: (str, int, int, bool) -> None
        self.s = s
        self.pos = left_pos
        self.right_pos = len(s) if right_pos == -1 else right_pos
        self.no_special_tags = no_special_tags

        # string -> compiled regex pattern object
        self.cache = {}  # type: Dict[str, Any]

        # either </script> or </style> - we search until we see that
        self.search_state = None  # type: Optional[str]

        # Position of tag name, if applicable
        # - Set after you get a StartTag, EndTag, or StartEndTag
        # - Unset on other tags
        self.tag_pos_left = -1
        self.tag_pos_right = -1

    def _Read(self):
        # type: () -> Tuple[h8_id_t, int]
        if self.pos == self.right_pos:
            return h8_id.EndOfStream, self.pos

        assert self.pos < self.right_pos, self.pos

        if self.search_state is not None and not self.no_special_tags:
            # TODO: case-insensitive search for </SCRIPT> <SCRipt> ?
            #
            # Another strategy: enter a mode where we find ONLY the end tag
            # regex, and any data that's not <, and then check the canonical
            # tag name for 'script' or 'style'.
            pos = self.s.find(self.search_state, self.pos)
            if pos == -1:
                # unterminated <script> or <style>
                raise LexError(self.s, self.pos)
            self.search_state = None
            # 'pos' is the beginning of </script> or </style>, which is
            # lexed on the next call
            return h8_id.HtmlCData, pos

        # Find the first match.
        # Note: frontend/match.py uses _LongestMatch(), which is different!
        # TODO: reconcile them. This lexer should be expressible in re2c.

        for pat, tok_id in HTM8_LEX_COMPILED:
            m = pat.match(self.s, self.pos)
            if m:
                if tok_id in (h8_id.StartTag, h8_id.EndTag, h8_id.StartEndTag):
                    self.tag_pos_left = m.start(1)
                    self.tag_pos_right = m.end(1)
                else:
                    # Reset state
                    self.tag_pos_left = -1
                    self.tag_pos_right = -1

                if tok_id == h8_id.CommentBegin:
                    pos = self.s.find('-->', self.pos)
                    if pos == -1:
                        # unterminated <!--
                        raise LexError(self.s, self.pos)
                    return h8_id.Comment, pos + 3  # -->

                if tok_id == h8_id.ProcessingBegin:
                    pos = self.s.find('?>', self.pos)
                    if pos == -1:
                        # unterminated <?
                        raise LexError(self.s, self.pos)
                    return h8_id.Processing, pos + 2  # ?>

                if tok_id == h8_id.CDataBegin:
                    pos = self.s.find(']]>', self.pos)
                    if pos == -1:
                        # unterminated <![CDATA[
                        raise LexError(self.s, self.pos)
                    return h8_id.CData, pos + 3  # ]]>

                if tok_id == h8_id.StartTag:
                    # TODO: reduce allocations
                    if (self.TagNameEquals('script') or
                            self.TagNameEquals('style')):
                        # <SCRipt a=b> -> </SCRipt>
                        self.search_state = '</' + self._LiteralTagName() + '>'

                return tok_id, m.end()
        else:
            raise AssertionError('h8_id.Invalid rule should have matched')

    def TagNameEquals(self, expected):
        # type: (str) -> bool
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        # TODO: In C++, this does not need an allocation. Can we test
        # directly?
        return expected == self.CanonicalTagName()

    def _LiteralTagName(self):
        # type: () -> str
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        return self.s[self.tag_pos_left:self.tag_pos_right]

    def CanonicalTagName(self):
        # type: () -> str
        tag_name = self._LiteralTagName()
        # Most tags are already lower case, so avoid allocation with this
        # conditional
        # TODO: this could go in the mycpp runtime?
        if tag_name.islower():
            return tag_name
        else:
            return tag_name.lower()

    def Read(self):
        # type: () -> Tuple[h8_id_t, int]
        tok_id, end_pos = self._Read()
        self.pos = end_pos  # advance
        return tok_id, end_pos

    def LookAhead(self, regex):
        # type: (str) -> bool
        # Cache the regex compilation. This could also be LookAheadFor(THEAD)
        # or something.
        pat = self.cache.get(regex)
        if pat is None:
            pat = re.compile(regex)
            self.cache[regex] = pat

        m = pat.match(self.s, self.pos)
        return m is not None

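# Typical read loop over a Lexer (a sketch):
#
#   lx = Lexer('<p>hi</p>')
#   while True:
#       tok_id, end_pos = lx.Read()  # end_pos is the position AFTER the token
#       if tok_id == h8_id.EndOfStream:
#           break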

def _Tokens(s, left_pos, right_pos):
    # type: (str, int, int) -> Iterator[Tuple[h8_id_t, int]]
    """
    Args:
      s: string to parse
      left_pos, right_pos: Optional span boundaries.
    """
    lx = Lexer(s, left_pos, right_pos)
    while True:
        tok_id, pos = lx.Read()
        yield tok_id, pos
        if tok_id == h8_id.EndOfStream:
            break


def ValidTokens(s, left_pos=0, right_pos=-1):
    # type: (str, int, int) -> Iterator[Tuple[int, int]]
    """Wrapper around _Tokens to prevent callers from having to handle Invalid.

    I'm not combining the two functions because I might want to do a
    'yield' transformation on _Tokens(); exceptions might complicate that.
    """
    pos = left_pos
    for tok_id, end_pos in _Tokens(s, left_pos, right_pos):
        if tok_id == h8_id.Invalid:
            raise LexError(s, pos)
        yield tok_id, end_pos
        pos = end_pos

def ValidTokenList(s, no_special_tags=False):
    # type: (str, bool) -> List[Tuple[h8_id_t, int]]
    """A wrapper that can be more easily translated to C++. Doesn't use
    iterators."""

    start_pos = 0
    tokens = []
    lx = Lexer(s, no_special_tags=no_special_tags)
    while True:
        tok_id, end_pos = lx.Read()
        tokens.append((tok_id, end_pos))
        if tok_id == h8_id.EndOfStream:
            break
        if tok_id == h8_id.Invalid:
            raise LexError(s, start_pos)
        start_pos = end_pos
    return tokens

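# For example (a sketch, verified by hand against the rules above):
#
#   ValidTokenList('<p>hi</p>')
#   # -> [(h8_id.StartTag, 3), (h8_id.RawData, 5),
#   #     (h8_id.EndTag, 9), (h8_id.EndOfStream, 9)]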

# Tag names:
# Match <a or </a
# Match <h2, but not <2h
#
# HTML 5 doesn't restrict tag names at all
# https://html.spec.whatwg.org/#toc-syntax
#
# XML allows : - .
# https://www.w3.org/TR/xml/#NT-NameChar

# Namespaces for MathML, SVG
# XLink, XML, XMLNS
#
# https://infra.spec.whatwg.org/#namespaces
#
# Allow - for td-attrs

# Be very lenient - just no whitespace or special HTML chars
# I don't think this is more lenient than HTML5, though we should check.
_UNQUOTED_VALUE = r'''[^ \t\r\n<>&"'\x00]*'''

# TODO: we don't need to capture the tag name here? That's done at the top
# level
_TAG_RE = re.compile(r'/? \s* (%s)' % _NAME, re.VERBOSE)

_TAG_LAST_RE = re.compile(r'\s* /? >', re.VERBOSE)

# To match href="foo"
# Note: in HTML5 and XML, single quoted attributes are also valid

# <button disabled> is standard usage

# NOTE: This used to allow whitespace around =
# <a foo = "bar"> makes sense in XML
# But then you also have
# <a foo= bar> - which is TWO attributes, in HTML5
# So the space is problematic
_ATTR_RE = re.compile(
    r'''
\s+                  # Leading whitespace is required
(%s)                 # Attribute name
(?:                  # Optional attribute value
  \s* = \s*          # Spaces allowed around =
  (?:
    " ([^>"\x00]*) "   # double quoted value
  | ' ([^>'\x00]*) '   # single quoted value
  | (%s)               # unquoted value
  )
)?
''' % (_NAME, _UNQUOTED_VALUE), re.VERBOSE)

TagName, AttrName, UnquotedValue, QuotedValue, MissingValue = range(5)

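# For example (a sketch), matching the attribute in ' href="/foo"':
#
#   m = _ATTR_RE.match(' href="/foo"', 0)
#   m.group(1)  # -> 'href'
#   m.group(2)  # -> '/foo'  (double quoted; groups 3 and 4 are None)
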
class TagLexer(object):
    """
    Given a tag like <a href="..."> or <link type="..." />, the TagLexer
    provides a few operations:

    - What is the tag?
    - Iterate through the attributes, giving (name, value_start_pos,
      value_end_pos)
    """

    def __init__(self, s):
        # type: (str) -> None
        self.s = s
        self.start_pos = -1  # Invalid
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        # type: (int, int) -> None
        """Reuse instances of this object."""
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos

        self.start_pos = start_pos
        self.end_pos = end_pos

    def TagString(self):
        # type: () -> str
        """Return the entire tag string, e.g. <a href='foo'>"""
        return self.s[self.start_pos:self.end_pos]

    def TagName(self):
        # type: () -> str
        # First event
        tok_id, start, end = next(self.Tokens())
        return self.s[start:end]

    def GetSpanForAttrValue(self, attr_name):
        # type: (str) -> Tuple[int, int]
        """
        Used by oils_doc.py, for href shortcuts
        """
        # Algorithm: search for QuotedValue or UnquotedValue after AttrName
        # TODO: Could also cache these

        events = self.Tokens()
        val = (-1, -1)
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == AttrName:
                    name = self.s[start:end]
                    if name == attr_name:
                        # The value should come next
                        tok_id, start, end = next(events)
                        assert tok_id in (QuotedValue, UnquotedValue,
                                          MissingValue), h8_id_str(tok_id)
                        val = start, end
                        break

        except StopIteration:
            pass
        return val

    def GetAttrRaw(self, attr_name):
        # type: (str) -> Optional[str]
        """
        Return the value, which may be UNESCAPED.
        """
        start, end = self.GetSpanForAttrValue(attr_name)
        if start == -1:
            return None
        return self.s[start:end]

    def AllAttrsRawSlice(self):
        # type: () -> List[Tuple[str, int, int]]
        """
        Get a list of triples [('class', 3, 5), ('href', 9, 12)]
        """
        slices = []
        events = self.Tokens()
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == AttrName:
                    name = self.s[start:end]

                    # The value should come next
                    tok_id, start, end = next(events)
                    assert tok_id in (QuotedValue, UnquotedValue,
                                      MissingValue), h8_id_str(tok_id)
                    # Note: quoted values may have &amp;
                    # We would need ANOTHER lexer to unescape them, but we
                    # don't need that for ul-table
                    slices.append((name, start, end))
        except StopIteration:
            pass
        return slices

    def AllAttrsRaw(self):
        # type: () -> List[Tuple[str, str]]
        """
        Get a list of pairs [('class', 'foo'), ('href', '?foo=1&amp;bar=2')]

        The quoted values may be escaped. We would need another lexer to
        unescape them.
        """
        slices = self.AllAttrsRawSlice()
        pairs = []
        for name, start, end in slices:
            pairs.append((name, self.s[start:end]))
        return pairs

    def Tokens(self):
        # type: () -> Iterator[Tuple[int, int, int]]
        """
        Yields a sequence of tokens: Tag (AttrName AttrValue?)*

        Where each Token is (Type, start_pos, end_pos)

        Note that start and end are NOT redundant! We skip over some unwanted
        characters.
        """
        m = _TAG_RE.match(self.s, self.start_pos + 1)
        if not m:
            raise RuntimeError("Couldn't find HTML tag in %r" %
                               self.TagString())
        yield TagName, m.start(1), m.end(1)

        pos = m.end(0)
        #log('POS %d', pos)

        while True:
            # don't search past the end
            m = _ATTR_RE.match(self.s, pos, self.end_pos)
            if not m:
                #log('BREAK pos %d', pos)
                break
            #log('AttrName %r', m.group(1))

            yield AttrName, m.start(1), m.end(1)

            #log('m.groups() %r', m.groups())
            if m.group(2) is not None:
                # double quoted
                yield QuotedValue, m.start(2), m.end(2)
            elif m.group(3) is not None:
                # single quoted - TODO: could have different token types
                yield QuotedValue, m.start(3), m.end(3)
            elif m.group(4) is not None:
                yield UnquotedValue, m.start(4), m.end(4)
            else:
                # <button disabled>
                end = m.end(0)
                yield MissingValue, end, end

            # Skip past the closing quote (or the end of the unquoted value)
            pos = m.end(0)

            #log('TOK %r', self.s)

        m = _TAG_LAST_RE.match(self.s, pos)
        #log('_TAG_LAST_RE match %r', self.s[pos:])
        if not m:
            # Extra data at end of tag. TODO: add messages for all these.
            raise LexError(self.s, pos)

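# Example (a sketch, verified by hand):
#
#   s = '<a href="/foo" disabled>'
#   tag_lexer = TagLexer(s)
#   tag_lexer.Reset(0, len(s))
#   tag_lexer.TagName()           # -> 'a'
#   tag_lexer.GetAttrRaw('href')  # -> '/foo'
#   tag_lexer.AllAttrsRaw()       # -> [('href', '/foo'), ('disabled', '')]
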
# This is similar but not identical to
#    " ([^>"\x00]*) "    # double quoted value
#  | ' ([^>'\x00]*) '    # single quoted value
#
# Note: for unquoted values, & isn't allowed, and thus &amp; and &#99; and
# &#x99; are not allowed. We could relax that?
ATTR_VALUE_LEXER = CHAR_LEX + [
    (r'[^>&\x00]+', h8_id.RawData),
    (r'.', h8_id.Invalid),
]

ATTR_VALUE_LEXER = MakeLexer(ATTR_VALUE_LEXER)

class AttrValueLexer(object):
    """
    <a href="foo=99&amp;bar">
    <a href='foo=99&amp;bar'>
    <a href=unquoted>
    """

    def __init__(self, s):
        # type: (str) -> None
        self.s = s
        self.start_pos = -1  # Invalid
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        # type: (int, int) -> None
        """Reuse instances of this object."""
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos

        self.start_pos = start_pos
        self.end_pos = end_pos

    def NumTokens(self):
        # type: () -> int
        num_tokens = 0
        pos = self.start_pos
        for tok_id, end_pos in self.Tokens():
            if tok_id == h8_id.Invalid:
                raise LexError(self.s, pos)
            pos = end_pos
            #log('pos %d', pos)
            num_tokens += 1
        return num_tokens

    def Tokens(self):
        # type: () -> Iterator[Tuple[h8_id_t, int]]
        pos = self.start_pos
        while pos < self.end_pos:
            # Find the first match, like above.
            # Note: frontend/match.py uses _LongestMatch(), which is different!
            # TODO: reconcile them. This lexer should be expressible in re2c.
            for pat, tok_id in ATTR_VALUE_LEXER:
                m = pat.match(self.s, pos)
                if m:
                    if 0:
                        tok_str = m.group(0)
                        log('token = %r', tok_str)

                    end_pos = m.end(0)
                    yield tok_id, end_pos
                    pos = end_pos
                    break
            else:
                raise AssertionError('h8_id.Invalid rule should have matched')

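# Example (a sketch): lexing the value inside href="foo=99&amp;bar"
#
#   s = '<a href="foo=99&amp;bar">'
#   val_lexer = AttrValueLexer(s)
#   val_lexer.Reset(9, 23)  # the span between the double quotes
#   val_lexer.NumTokens()   # -> 3: RawData, CharEntity, RawData
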
def ReadUntilStartTag(it, tag_lexer, tag_name):
    # type: (Iterator[Tuple[h8_id_t, int]], TagLexer, str) -> Tuple[int, int]
    """Find the next <foo>, returning its (start, end) positions

    Raise ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == h8_id.StartTag and tag_lexer.TagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No start tag %r' % tag_name)


def ReadUntilEndTag(it, tag_lexer, tag_name):
    # type: (Iterator[Tuple[h8_id_t, int]], TagLexer, str) -> Tuple[int, int]
    """Find the next </foo>, returning its (start, end) position

    Raise ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == h8_id.EndTag and tag_lexer.TagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No end tag %r' % tag_name)

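# Example (a sketch, verified by hand):
#
#   s = '<ul><li>one</li></ul>'
#   tag_lexer = TagLexer(s)
#   it = ValidTokens(s)
#   ReadUntilStartTag(it, tag_lexer, 'li')  # -> (4, 8), the span of '<li>'
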
CHAR_ENTITY = {
    'amp': '&',
    'lt': '<',
    'gt': '>',
    'quot': '"',
    'apos': "'",
}

def ToText(s, left_pos=0, right_pos=-1):
    # type: (str, int, int) -> str
    """Given HTML, return text by unquoting &gt; and &lt; etc.

    Used by:
      doctools/oils_doc.py: PygmentsPlugin
      doctools/help_gen.py: HelpIndexCards

    In the latter case, we could process some tags, like:

    - Blue Link (not clickable, but still useful)
    - Red X

    That should be html.ToAnsi.
    """
    f = StringIO()
    out = Output(s, f, left_pos, right_pos)

    pos = left_pos
    for tok_id, end_pos in ValidTokens(s, left_pos, right_pos):
        if tok_id in (h8_id.RawData, h8_id.BadAmpersand, h8_id.BadGreaterThan,
                      h8_id.BadLessThan):
            out.SkipTo(pos)
            out.PrintUntil(end_pos)

        elif tok_id == h8_id.CharEntity:  # &amp;

            entity = s[pos + 1:end_pos - 1]

            out.SkipTo(pos)
            out.Print(CHAR_ENTITY[entity])
            out.SkipTo(end_pos)

        # Not handling these yet
        elif tok_id == h8_id.HexChar:
            raise AssertionError('Hex Char %r' % s[pos:pos + 20])

        elif tok_id == h8_id.DecChar:
            raise AssertionError('Dec Char %r' % s[pos:pos + 20])

        else:
            # Skip everything else
            out.SkipTo(end_pos)

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()

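# For instance (a sketch, verified by hand):
#
#   ToText('<b>1 &lt; 2 &amp;&amp; 4 &gt; 3</b>')
#   # -> '1 < 2 && 4 > 3'
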
# https://developer.mozilla.org/en-US/docs/Glossary/Void_element
VOID_ELEMENTS = [
    'area',
    'base',
    'br',
    'col',
    'embed',
    'hr',
    'img',
    'input',
    'link',
    'meta',
    'param',
    'source',
    'track',
    'wbr',
]

LEX_ATTRS = 1 << 1
LEX_QUOTED_VALUES = 1 << 2  # href="?x=42&amp;y=99"
NO_SPECIAL_TAGS = 1 << 3  # <script> <style>, VOID tags, etc.
BALANCED_TAGS = 1 << 4  # are tags balanced?

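# The flags combine with |, e.g. for XML-like validation as in main() below
# (a sketch):
#
#   flags = LEX_ATTRS | LEX_QUOTED_VALUES | NO_SPECIAL_TAGS | BALANCED_TAGS
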
def Validate(contents, flags, counters):
    # type: (str, int, Counters) -> None

    tag_lexer = TagLexer(contents)
    val_lexer = AttrValueLexer(contents)

    no_special_tags = bool(flags & NO_SPECIAL_TAGS)
    lx = Lexer(contents, no_special_tags=no_special_tags)
    tokens = []
    start_pos = 0
    tag_stack = []
    while True:
        tok_id, end_pos = lx.Read()
        #log('TOP %s %r', h8_id_str(tok_id), contents[start_pos:end_pos])

        if tok_id == h8_id.Invalid:
            raise LexError(contents, start_pos)
        if tok_id == h8_id.EndOfStream:
            break

        tokens.append((tok_id, end_pos))

        if tok_id == h8_id.StartEndTag:
            counters.num_start_end_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRawSlice()
            counters.num_attrs += len(all_attrs)
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                counters.num_val_tokens += val_lexer.NumTokens()

            #counters.debug_attrs.extend(all_attrs)

        elif tok_id == h8_id.StartTag:
            counters.num_start_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRawSlice()
            counters.num_attrs += len(all_attrs)
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                counters.num_val_tokens += val_lexer.NumTokens()

            #counters.debug_attrs.extend(all_attrs)

            if flags & BALANCED_TAGS:
                tag_name = lx.CanonicalTagName()
                if flags & NO_SPECIAL_TAGS:
                    tag_stack.append(tag_name)
                else:
                    # e.g. <meta> is considered self-closing, like <meta/>
                    if tag_name not in VOID_ELEMENTS:
                        tag_stack.append(tag_name)

            counters.max_tag_stack = max(counters.max_tag_stack,
                                         len(tag_stack))
        elif tok_id == h8_id.EndTag:
            if flags & BALANCED_TAGS:
                try:
                    expected = tag_stack.pop()
                except IndexError:
                    raise ParseError('Tag stack empty',
                                     s=contents,
                                     start_pos=start_pos)

                actual = lx.CanonicalTagName()
                if expected != actual:
                    raise ParseError(
                        'Got unexpected closing tag %r; opening tag was %r' %
                        (contents[start_pos:end_pos], expected),
                        s=contents,
                        start_pos=start_pos)

        start_pos = end_pos

    if len(tag_stack) != 0:
        raise ParseError('Missing closing tags at end of doc: %s' %
                         ' '.join(tag_stack),
                         s=contents,
                         start_pos=start_pos)

    counters.num_tokens += len(tokens)

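# Example (a sketch, verified by hand):
#
#   counters = Counters()
#   Validate('<p>hi</p>', BALANCED_TAGS, counters)
#   counters.num_start_tags  # -> 1
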
def ToXml(htm8_str):
    # type: (str) -> str

    # TODO:
    # 1. Lex it
    # 2. < & > must be escaped
    #    a. in raw data
    #    b. in quoted strings
    # 3. <script> turned into CDATA
    # 4. void tags turned into self-closing tags
    # 5. case-sensitive tag matching - not sure about this

    tag_lexer = TagLexer(htm8_str)
    val_lexer = AttrValueLexer(htm8_str)

    f = StringIO()
    out = Output(htm8_str, f)

    lx = Lexer(htm8_str)

    pos = 0
    while True:
        tok_id, end_pos = lx.Read()

        if tok_id == h8_id.Invalid:
            raise LexError(htm8_str, pos)
        if tok_id == h8_id.EndOfStream:
            break

        if tok_id in (h8_id.RawData, h8_id.CharEntity, h8_id.HexChar,
                      h8_id.DecChar):
            out.PrintUntil(end_pos)
        elif tok_id in (h8_id.StartTag, h8_id.StartEndTag):
            tag_lexer.Reset(pos, end_pos)
            # TODO: reduce allocations here
            all_attrs = tag_lexer.AllAttrsRawSlice()
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                # TODO: get the kind of string
                #
                # Quoted:   we need to replace & with &amp; and < with &lt;
                #           note > is not allowed
                # Unquoted: right now, we can just surround with double
                #           quotes because we don't allow any bad chars
                # Empty   : add "", so empty= becomes =""
                # Missing : add ="", so missing becomes missing=""

            tag_name = lx.CanonicalTagName()
            if tok_id == h8_id.StartTag and tag_name in VOID_ELEMENTS:
                # TODO: instead of closing >, print />
                pass

        elif tok_id == h8_id.BadAmpersand:
            #out.SkipTo(pos)
            out.Print('&amp;')
            out.SkipTo(end_pos)

        elif tok_id == h8_id.BadGreaterThan:
            #out.SkipTo(pos)
            out.Print('&gt;')
            out.SkipTo(end_pos)
        else:
            out.PrintUntil(end_pos)

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()

class Counters(object):

    def __init__(self):
        # type: () -> None
        self.num_tokens = 0
        self.num_start_tags = 0
        self.num_start_end_tags = 0
        self.num_attrs = 0
        self.max_tag_stack = 0
        self.num_val_tokens = 0

        #self.debug_attrs = []

def main(argv):
    # type: (List[str]) -> int
    action = argv[1]

    if action == 'tokens':
        contents = sys.stdin.read()

        lx = Lexer(contents)
        start_pos = 0
        while True:
            tok_id, end_pos = lx.Read()
            if tok_id == h8_id.Invalid:
                raise LexError(contents, start_pos)
            if tok_id == h8_id.EndOfStream:
                break

            frag = contents[start_pos:end_pos]
            log('%d %s %r', end_pos, h8_id_str(tok_id), frag)
            start_pos = end_pos

        return 0

    elif action in ('lex-htm8', 'parse-htm8', 'parse-xml'):

        errors = []
        counters = Counters()

        flags = LEX_ATTRS | LEX_QUOTED_VALUES
        if action.startswith('parse-'):
            flags |= BALANCED_TAGS
        if action == 'parse-xml':
            flags |= NO_SPECIAL_TAGS

        i = 0
        for line in sys.stdin:
            filename = line.strip()
            with open(filename) as f:
                contents = f.read()

            try:
                Validate(contents, flags, counters)
            except LexError as e:
                log('Lex error in %r: %s', filename, e)
                errors.append((filename, e))
            except ParseError as e:
                log('Parse error in %r: %s', filename, e)
                errors.append((filename, e))
            i += 1

        log('')
        log('%10d tokens', counters.num_tokens)
        log('%10d start/end tags', counters.num_start_end_tags)
        log('%10d start tags', counters.num_start_tags)
        log('%10d attrs', counters.num_attrs)
        log('%10d max tag stack depth', counters.max_tag_stack)
        log('%10d attr val tokens', counters.num_val_tokens)
        log('%10d errors', len(errors))
        if len(errors):
            return 1
        return 0

    elif action == 'todo':
        # Other algorithms:
        #
        # - select first subtree with given ID
        #   - this requires understanding the void tags I suppose
        # - select all subtrees that have a class
        # - materialize DOM

        # Safe-HTM8? This is a filter
        return 0

    else:
        raise RuntimeError('Invalid action %r' % action)


if __name__ == '__main__':
    sys.exit(main(sys.argv))