# OILS / lazylex / html.py  (View on Github | oils.pub)
# 1105 lines, 551 significant
1#!/usr/bin/env python2
2"""
3lazylex/html.py - Low-Level HTML Processing.
4
5See lazylex/README.md for details.
6
7TODO:
8- Get rid of AttrValueLexer - this should be in the TagLexer
9 - this also means that unquoted values can be more similar
10 - We can use a single lexer mode for everything inside <>
11 - the SPACE is the only difference
12- UTF-8 check, like JSON8
13- Static typing
14
15"""
16from __future__ import print_function
17
18from _devbuild.gen.htm8_asdl import (h8_id, h8_id_t, h8_id_str, h8_tag_id,
19 h8_tag_id_t, h8_tag_id_str)
20from typing import Dict, Iterator, Any, IO
21
22try:
23 from cStringIO import StringIO
24except ImportError:
25 # for python3
26 from io import StringIO # type: ignore
27import re
28import sys
29
30if sys.version_info.major == 2:
31 from typing import List, Tuple, Optional
32
33
def log(msg, *args):
    # type: (str, *Any) -> None
    """Format *msg* printf-style with *args* and write it to stderr."""
    formatted = msg % args
    print(formatted, file=sys.stderr)
39
class LexError(Exception):
    """Raised when lexing fails at a position in the input.

    Examples of lex errors:

    - h8_id.Invalid, like <> or &&
    - Unclosed <!-- <? <![CDATA[ <script> <style>
    """

    def __init__(self, s, start_pos):
        # type: (str, int) -> None
        self.s = s  # the full input string
        self.start_pos = start_pos  # where lexing failed

    def __str__(self):
        # type: () -> str
        # Show a 20-char snippet of the input starting at the error.
        snippet = self.s[self.start_pos:self.start_pos + 20]
        return '(LexError %r)' % snippet
57
def FindLineNum(s, error_pos):
    # type: (str, int) -> int
    """Return the 1-based line number containing error_pos in s.

    A newline character itself counts as part of the line it terminates,
    which matches the original scanning loop.  Assumes error_pos >= 0
    (callers assert this before calling).
    """
    # Count newlines strictly before error_pos.  str.count runs at C speed,
    # replacing the Python-level find() loop the original used.
    return s.count('\n', 0, error_pos) + 1
73
class ParseError(Exception):
    """Raised for structural errors above the lexer level.

    Examples of parse errors:

    - unbalanced tag structure
    - ul_table.py errors
    """

    def __init__(self, msg, s=None, start_pos=-1):
        # type: (str, Optional[str], int) -> None
        self.msg = msg  # human-readable description
        self.s = s  # optional: the input, for snippet/line reporting
        self.start_pos = start_pos  # position in s, or -1 if s is None

    def __str__(self):
        # type: () -> str
        if self.s is None:
            # No input attached: report without location info.
            line_num = -1
            snippet = ''
        else:
            assert self.start_pos != -1, self.start_pos
            snippet = self.s[self.start_pos:self.start_pos + 20]
            line_num = FindLineNum(self.s, self.start_pos)
        return 'line %d: %r %r' % (line_num, self.msg, snippet)
101
class Output(object):
    """Tracks a position in an input buffer while writing to an output file.

    Two modes of emission: echo spans FROM the input buffer, or print brand
    new text to the output.
    """

    def __init__(self, s, f, left_pos=0, right_pos=-1):
        # type: (str, IO[str], int, int) -> None
        self.s = s  # input buffer
        self.f = f  # output file object
        self.pos = left_pos  # cursor into the input
        if right_pos == -1:
            self.right_pos = len(s)  # default: end of input
        else:
            self.right_pos = right_pos

    def SkipTo(self, pos):
        # type: (int) -> None
        """Advance the input cursor without writing anything."""
        self.pos = pos

    def PrintUntil(self, pos):
        # type: (int) -> None
        """Echo input from the cursor up to pos, then move the cursor there."""
        self.f.write(self.s[self.pos:pos])
        self.pos = pos

    def PrintTheRest(self):
        # type: () -> None
        """Echo the remaining input span, up to right_pos."""
        self.PrintUntil(self.right_pos)

    def Print(self, s):
        # type: (str) -> None
        """Write new text to the output; the input cursor does not move."""
        self.f.write(s)
138
def MakeLexer(rules):
    """Compile each (pattern, token id) pair with re.VERBOSE."""
    compiled = []
    for pattern, tok_id in rules:
        compiled.append((re.compile(pattern, re.VERBOSE), tok_id))
    return compiled
141
142
143#
144# Eggex
145#
146# Tag = / ~['>']+ /
147
148# Is this valid? A single character?
149# Tag = / ~'>'* /
150
151# Maybe better: / [NOT '>']+/
152# capital letters not allowed there?
153#
154# But then this is confusing:
155# / [NOT ~digit]+/
156#
157# / [NOT digit] / is [^\d]
158# / ~digit / is \D
159#
160# Or maybe:
161#
162# / [~ digit]+ /
163# / [~ '>']+ /
164# / [NOT '>']+ /
165
166# End = / '</' Tag '>' /
167# StartEnd = / '<' Tag '/>' /
168# Start = / '<' Tag '>' /
169#
170# EntityRef = / '&' dot{* N} ';' /
171
# Tag name, or attribute name.
# Colon is used in XML (namespaces); we also allow '_' and '-'.

# https://www.w3.org/TR/xml/#NT-Name
# Hm there is a lot of unicode stuff.  We are simplifying parsing.

_NAME = r'[a-zA-Z][a-zA-Z0-9:_\-]*'  # must start with letter

# Character-reference rules, shared by the top-level lexer (HTM8_LEX) and the
# attribute-value lexer (ATTR_VALUE_LEXER).  Patterns are re.VERBOSE, so the
# embedded spaces are insignificant.
CHAR_LEX = [
    # Characters
    # https://www.w3.org/TR/xml/#sec-references
    (r'&\# [0-9]+ ;', h8_id.DecChar),  # e.g. &#123;
    (r'&\# x[0-9a-fA-F]+ ;', h8_id.HexChar),  # e.g. &#x7B;
    (r'& %s ;' % _NAME, h8_id.CharEntity),  # e.g. &amp;
    # A bare '&' that doesn't form a reference; callers decide how to treat it
    (r'&', h8_id.BadAmpersand),
]
189
# Top-level lexer rules, tried IN ORDER by Lexer._Read() (first match wins,
# not longest match).  Patterns are re.VERBOSE.
HTM8_LEX = CHAR_LEX + [
    # Comment body is consumed lazily: _Read() searches for the literal '-->'
    (r'<!--', h8_id.CommentBegin),

    # Processing instruction are used for the XML header:
    # <?xml version="1.0" encoding="UTF-8"?>
    # They are technically XML-only, but in HTML5, they are another kind of
    # comment:
    #
    # https://developer.mozilla.org/en-US/docs/Web/API/ProcessingInstruction
    #
    (r'<\?', h8_id.ProcessingBegin),
    # Not necessary in HTML5, but occurs in XML
    (r'<!\[CDATA\[', h8_id.CDataBegin),  # <![CDATA[

    # Markup declarations
    # - In HTML5, there is only <!DOCTYPE html>
    # - XML has 4 more declarations: <!ELEMENT ...> ATTLIST ENTITY NOTATION
    #   - these seem to be part of DTD
    # - it's useful to skip these, and be able to parse the rest of the
    #   document
    # - Note: < is allowed?
    (r'<! [^>\x00]+ >', h8_id.Decl),

    # Tags
    # Notes:
    # - We look for a valid tag name, but we don't validate attributes.
    #   That's done in the tag lexer.
    # - We don't allow leading whitespace
    (r'</ (%s) >' % _NAME, h8_id.EndTag),
    # self-closing <br/> comes before StartTag
    # could/should these be collapsed into one rule?
    (r'< (%s) [^>\x00]* />' % _NAME, h8_id.StartEndTag),  # e.g. <br/>
    (r'< (%s) [^>\x00]* >' % _NAME, h8_id.StartTag),  # start <a>

    # HTML5 allows unescaped > in raw data, but < is not allowed.
    # https://stackoverflow.com/questions/10462348/right-angle-bracket-in-html
    #
    # - My early blog has THREE errors when disallowing >
    # - So do some .wwz files
    (r'[^&<>\x00]+', h8_id.RawData),
    (r'>', h8_id.BadGreaterThan),
    # < (and anything else unmatched) is an error; '.' always matches, so
    # this rule is the catch-all
    (r'.', h8_id.Invalid),
]
233
234# Old notes:
235#
236# Non-greedy matches are regular and can be matched in linear time
237# with RE2.
238#
239# https://news.ycombinator.com/item?id=27099798
240#
241# Maybe try combining all of these for speed.
242
243# . is any char except newline
244# https://re2c.org/manual/manual_c.html
245
246# Discarded options
247#(r'<!-- .*? -->', h8_id.Comment),
248
249# Hack from Claude: \s\S instead of re.DOTALL. I don't like this
250#(r'<!-- [\s\S]*? -->', h8_id.Comment),
251#(r'<!-- (?:.|[\n])*? -->', h8_id.Comment),
252
# Compiled (pattern, token id) pairs, iterated in order by Lexer._Read().
HTM8_LEX_COMPILED = MakeLexer(HTM8_LEX)
254
255
class Lexer(object):
    """HTM8 lexer: yields (h8_id, end_pos) tokens over s[left_pos:right_pos].

    Stateful in two ways:
    - self.pos advances through the input on each Read().
    - After a StartTag for <script> or <style>, search_state holds the literal
      end tag, and the next Read() returns everything up to it as HtmlCData
      (unless no_special_tags is set).
    """

    def __init__(self, s, left_pos=0, right_pos=-1, no_special_tags=False):
        # type: (str, int, int, bool) -> None
        self.s = s
        self.pos = left_pos
        self.right_pos = len(s) if right_pos == -1 else right_pos
        self.no_special_tags = no_special_tags

        # string -> compiled regex pattern object (for LookAhead)
        self.cache = {}  # type: Dict[str, Any]

        # either </script> or </style> - we search until we see that
        self.search_state = None  # type: Optional[str]

        # Position of tag name, if applicable
        # - Set after you get a StartTag, EndTag, or StartEndTag
        # - Unset on other tags
        self.tag_pos_left = -1
        self.tag_pos_right = -1

    def _Read(self):
        # type: () -> Tuple[h8_id_t, int]
        """Compute the next token WITHOUT advancing self.pos (Read() advances).

        Returns (token id, end position).  Raises LexError for unterminated
        comments, processing instructions, CDATA sections, and <script>/<style>
        bodies.
        """
        if self.pos == self.right_pos:
            return h8_id.EndOfStream, self.pos

        assert self.pos < self.right_pos, self.pos

        if self.search_state is not None and not self.no_special_tags:
            # TODO: case-insensitive search for </SCRIPT> <SCRipt> ?
            #
            # Another strategy: enter a mode where we find ONLY the end tag
            # regex, and any data that's not <, and then check the canonical
            # tag name for 'script' or 'style'.
            pos = self.s.find(self.search_state, self.pos)
            if pos == -1:
                # unterminated <script> or <style>
                raise LexError(self.s, self.pos)
            self.search_state = None
            # Return everything up to (not including) the end tag; the end
            # tag itself is lexed by the normal rules on the next call.
            return h8_id.HtmlCData, pos

        # Find the first match.
        # Note: frontend/match.py uses _LongestMatch(), which is different!
        # TODO: reconcile them.  This lexer should be expressible in re2c.

        for pat, tok_id in HTM8_LEX_COMPILED:
            m = pat.match(self.s, self.pos)
            if m:
                if tok_id in (h8_id.StartTag, h8_id.EndTag, h8_id.StartEndTag):
                    # Remember where the tag name is, for CanonicalTagName()
                    self.tag_pos_left = m.start(1)
                    self.tag_pos_right = m.end(1)
                else:
                    # Reset state
                    self.tag_pos_left = -1
                    self.tag_pos_right = -1

                # The *Begin tokens only matched the opener; consume the body
                # lazily by searching for the closing delimiter.
                if tok_id == h8_id.CommentBegin:
                    pos = self.s.find('-->', self.pos)
                    if pos == -1:
                        # unterminated <!--
                        raise LexError(self.s, self.pos)
                    return h8_id.Comment, pos + 3  # -->

                if tok_id == h8_id.ProcessingBegin:
                    pos = self.s.find('?>', self.pos)
                    if pos == -1:
                        # unterminated <?
                        raise LexError(self.s, self.pos)
                    return h8_id.Processing, pos + 2  # ?>

                if tok_id == h8_id.CDataBegin:
                    pos = self.s.find(']]>', self.pos)
                    if pos == -1:
                        # unterminated <![CDATA[
                        raise LexError(self.s, self.pos)
                    return h8_id.CData, pos + 3  # ]]>

                if tok_id == h8_id.StartTag:
                    # TODO: reduce allocations
                    # NOTE(review): search_state is set even when
                    # no_special_tags is true (only its *use* above is
                    # guarded) - confirm that's intentional.
                    if (self.TagNameEquals('script') or
                            self.TagNameEquals('style')):
                        # <SCRipt a=b> -> </SCRipt>
                        self.search_state = '</' + self._LiteralTagName() + '>'

                return tok_id, m.end()
        else:
            # Unreachable in practice: the '.' Invalid rule matches any char.
            raise AssertionError('h8_id.Invalid rule should have matched')

    def TagNameEquals(self, expected):
        # type: (str) -> bool
        """Case-insensitive comparison against the current tag name."""
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        # TODO: In C++, this does not need an allocation.  Can we test
        # directly?
        return expected == self.CanonicalTagName()

    def _LiteralTagName(self):
        # type: () -> str
        """Return the tag name exactly as written (original case)."""
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        return self.s[self.tag_pos_left:self.tag_pos_right]

    def CanonicalTagName(self):
        # type: () -> str
        """Return the lower-cased tag name of the last Start/End/StartEnd tag."""
        tag_name = self._LiteralTagName()
        # Most tags are already lower case, so avoid allocation with this
        # conditional
        # TODO: this could go in the mycpp runtime?
        if tag_name.islower():
            return tag_name
        else:
            return tag_name.lower()

    def Read(self):
        # type: () -> Tuple[h8_id_t, int]
        """Return the next (token id, end position) and advance the cursor."""
        tok_id, end_pos = self._Read()
        self.pos = end_pos  # advance
        return tok_id, end_pos

    def LookAhead(self, regex):
        # type: (str) -> bool
        """Test whether regex matches at the current position (no advance)."""
        # Cache the regex compilation.  This could also be LookAheadFor(THEAD)
        # or something.
        pat = self.cache.get(regex)
        if pat is None:
            pat = re.compile(regex)
            self.cache[regex] = pat

        m = pat.match(self.s, self.pos)
        return m is not None
388
389
def _Tokens(s, left_pos, right_pos):
    # type: (str, int, int) -> Iterator[Tuple[h8_id_t, int]]
    """Generate (token id, end position) pairs over s[left_pos:right_pos].

    Args:
      s: string to parse
      left_pos, right_pos: Optional span boundaries.

    The final EndOfStream token is yielded before the generator finishes.
    """
    lx = Lexer(s, left_pos, right_pos)
    tok_id = None
    while tok_id != h8_id.EndOfStream:
        tok_id, pos = lx.Read()
        yield tok_id, pos
403
404
def ValidTokens(s, left_pos=0, right_pos=-1):
    # type: (str, int, int) -> Iterator[Tuple[h8_id_t, int]]
    """Wrapper around _Tokens to prevent callers from having to handle Invalid.

    Raises LexError (with the token's START position) on h8_id.Invalid.

    I'm not combining the two functions because I might want to do a
    'yield' transformation on Tokens()?  Exceptions might complicate the
    issue?
    """
    prev_end = left_pos
    for tok_id, end_pos in _Tokens(s, left_pos, right_pos):
        if tok_id == h8_id.Invalid:
            raise LexError(s, prev_end)
        yield tok_id, end_pos
        prev_end = end_pos
419
420
def ValidTokenList(s, no_special_tags=False):
    # type: (str, bool) -> List[Tuple[h8_id_t, int]]
    """A wrapper that can be more easily translated to C++.  Doesn't use iterators.

    Returns the full token list (including the trailing EndOfStream token);
    raises LexError on h8_id.Invalid.
    """
    lx = Lexer(s, no_special_tags=no_special_tags)
    tokens = []
    start_pos = 0
    while True:
        tok_id, end_pos = lx.Read()
        tokens.append((tok_id, end_pos))
        if tok_id == h8_id.EndOfStream:
            return tokens
        if tok_id == h8_id.Invalid:
            raise LexError(s, start_pos)
        start_pos = end_pos
437
438
439# Tag names:
440# Match <a or </a
441# Match <h2, but not <2h
442#
443# HTML 5 doesn't restrict tag names at all
444# https://html.spec.whatwg.org/#toc-syntax
445#
446# XML allows : - .
447# https://www.w3.org/TR/xml/#NT-NameChar
448
449# Namespaces for MathML, SVG
450# XLink, XML, XMLNS
451#
452# https://infra.spec.whatwg.org/#namespaces
453#
454# Allow - for td-attrs
455
# Be very lenient - just no whitespace or special HTML chars
# I don't think this is more lenient than HTML5, though we should check.
_UNQUOTED_VALUE = r'''[^ \t\r\n<>&"'\x00]*'''

# Matches the tag name at the start of a tag body (optionally preceded by '/'
# for end tags).
# TODO: we don't need to capture the tag name here?  That's done at the top
# level.
_TAG_RE = re.compile(r'/? \s* (%s)' % _NAME, re.VERBOSE)

# Matches the end of a tag: optional whitespace, optional self-closing '/',
# then '>'.
_TAG_LAST_RE = re.compile(r'\s* /? >', re.VERBOSE)

# To match href="foo"
# Note: in HTML5 and XML, single quoted attributes are also valid

# <button disabled> is standard usage

# NOTE: This used to allow whitespace around =
# <a foo = "bar"> makes sense in XML
# But then you also have
# <a foo= bar> - which is TWO attributes, in HTML5
# So the space is problematic

# Matches one attribute: group 1 = name, group 2 = double-quoted value,
# group 3 = single-quoted value, group 4 = unquoted value.  All value groups
# may be None (e.g. <button disabled>).
_ATTR_RE = re.compile(
    r'''
\s+ # Leading whitespace is required
(%s) # Attribute name
(?: # Optional attribute value
 \s* = \s* # Spaces allowed around =
 (?:
 " ([^>"\x00]*) " # double quoted value
 | ' ([^>'\x00]*) ' # single quoted value
 | (%s) # Attribute value
 )
)?
''' % (_NAME, _UNQUOTED_VALUE), re.VERBOSE)
490
491
class TagLexer(object):
    """
    Given a tag like <a href="..."> or <link type="..." />, the TagLexer
    provides a few operations:

    - What is the tag?
    - Iterate through the attributes, giving (name, value_start_pos, value_end_pos)

    Positions are relative to the SAME string the outer Lexer used, so spans
    from the two lexers are interchangeable.
    """

    def __init__(self, s):
        # type: (str) -> None
        self.s = s
        self.start_pos = -1  # Invalid until Reset() is called
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        # type: (int, int) -> None
        """Reuse instances of this object.

        The span [start_pos, end_pos) should cover one tag, '<' through '>'.
        """
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos

        self.start_pos = start_pos
        self.end_pos = end_pos

    def TagString(self):
        # type: () -> str
        """Return the entire tag string, e.g. <a href='foo'>"""
        return self.s[self.start_pos:self.end_pos]

    def TagName(self):
        # type: () -> str
        """Return the tag name as written, e.g. 'a' for <a href='foo'>."""
        # First event from Tokens() is always TagName
        tok_id, start, end = next(self.Tokens())
        return self.s[start:end]

    def GetSpanForAttrValue(self, attr_name):
        # type: (str) -> Tuple[int, int]
        """Return the (start, end) span of attr_name's value, or (-1, -1).

        Used by oils_doc.py, for href shortcuts
        """
        # Algorithm: search for QuotedValue or UnquotedValue after AttrName
        # TODO: Could also cache these

        events = self.Tokens()
        val = (-1, -1)
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == h8_tag_id.AttrName:
                    name = self.s[start:end]
                    if name == attr_name:
                        # The value should come next
                        tok_id, start, end = next(events)
                        assert tok_id in (
                            h8_tag_id.QuotedValue, h8_tag_id.UnquotedValue,
                            h8_tag_id.MissingValue), h8_tag_id_str(tok_id)
                        val = start, end
                        break

        except StopIteration:
            pass
        return val

    def GetAttrRaw(self, attr_name):
        # type: (str) -> Optional[str]
        """
        Return the value, which may be UNESCAPED.  None if attr_name is absent.
        """
        start, end = self.GetSpanForAttrValue(attr_name)
        if start == -1:
            return None
        return self.s[start:end]

    def AllAttrsRawSlice(self):
        # type: () -> List[Tuple[str, int, int]]
        """
        Get a list of pairs [('class', 3, 5), ('href', 9, 12)]
        """
        slices = []
        events = self.Tokens()
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == h8_tag_id.AttrName:
                    name = self.s[start:end]

                    # The value should come next
                    tok_id, start, end = next(events)
                    assert tok_id in (
                        h8_tag_id.QuotedValue, h8_tag_id.UnquotedValue,
                        h8_tag_id.MissingValue), h8_tag_id_str(tok_id)
                    # Note: quoted values may have &amp;
                    # We would need ANOTHER lexer to unescape them, but we
                    # don't need that for ul-table
                    slices.append((name, start, end))
        except StopIteration:
            pass
        return slices

    def AllAttrsRaw(self):
        # type: () -> List[Tuple[str, str]]
        """
        Get a list of pairs [('class', 'foo'), ('href', '?foo=1&amp;bar=2')]

        The quoted values may be escaped.  We would need another lexer to
        unescape them.
        """
        slices = self.AllAttrsRawSlice()
        pairs = []
        for name, start, end in slices:
            pairs.append((name, self.s[start:end]))
        return pairs

    def Tokens(self):
        # type: () -> Iterator[Tuple[h8_tag_id_t, int, int]]
        """
        Yields a sequence of tokens: Tag (AttrName AttrValue?)*

        Where each Token is (Type, start_pos, end_pos)

        Note that start and end are NOT redundant!  We skip over some unwanted
        characters.
        """
        # start_pos points at '<'; the tag name starts right after it
        m = _TAG_RE.match(self.s, self.start_pos + 1)
        if not m:
            raise RuntimeError("Couldn't find HTML tag in %r" %
                               self.TagString())
        yield h8_tag_id.TagName, m.start(1), m.end(1)

        pos = m.end(0)
        #log('POS %d', pos)

        while True:
            # don't search past the end
            m = _ATTR_RE.match(self.s, pos, self.end_pos)
            if not m:
                #log('BREAK pos %d', pos)
                break
            #log('AttrName %r', m.group(1))

            yield h8_tag_id.AttrName, m.start(1), m.end(1)

            #log('m.groups() %r', m.groups())
            if m.group(2) is not None:
                # double quoted
                yield h8_tag_id.QuotedValue, m.start(2), m.end(2)
            elif m.group(3) is not None:
                # single quoted - TODO: could have different token types
                yield h8_tag_id.QuotedValue, m.start(3), m.end(3)
            elif m.group(4) is not None:
                yield h8_tag_id.UnquotedValue, m.start(4), m.end(4)
            else:
                # <button disabled> - zero-width span at the match's end
                end = m.end(0)
                yield h8_tag_id.MissingValue, end, end

            # Skip past the "
            pos = m.end(0)

        #log('TOK %r', self.s)

        # After all attributes, the tag must close cleanly
        m = _TAG_LAST_RE.match(self.s, pos)
        #log('_TAG_LAST_RE match %r', self.s[pos:])
        if not m:
            # Extra data at end of tag.  TODO: add messages for all these.
            raise LexError(self.s, pos)
658
659
# This is similar but not identical to
# " ([^>"\x00]*) " # double quoted value
# | ' ([^>'\x00]*) ' # single quoted value
#
# Note: for unquoted values, & isn't allowed, and thus &amp; and &#99; and
# &#x99; are not allowed.  We could relax that?

# Raw rule table.  (The original rebound ATTR_VALUE_LEXER itself, so the
# name's type changed mid-module; a separate name keeps both forms available
# and unambiguous.)
_ATTR_VALUE_LEX = CHAR_LEX + [
    (r'[^>&\x00]+', h8_id.RawData),
    (r'.', h8_id.Invalid),
]

# Compiled form, used by AttrValueLexer.Tokens() below.
ATTR_VALUE_LEXER = MakeLexer(_ATTR_VALUE_LEX)
672
673
class AttrValueLexer(object):
    """Lexes one attribute value span into RawData / character-entity tokens.

    Handles all three value forms:

    <a href="foo=99&amp;bar">
    <a href='foo=99&amp;bar'>
    <a href=unquoted>
    """

    def __init__(self, s):
        # type: (str) -> None
        self.s = s
        self.start_pos = -1  # Invalid until Reset() is called
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        # type: (int, int) -> None
        """Reuse instances of this object.

        The span [start_pos, end_pos) is a value span from TagLexer.
        """
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos

        self.start_pos = start_pos
        self.end_pos = end_pos

    def NumTokens(self):
        # type: () -> int
        """Count tokens, raising LexError on h8_id.Invalid (validates the span)."""
        num_tokens = 0
        pos = self.start_pos
        for tok_id, end_pos in self.Tokens():
            if tok_id == h8_id.Invalid:
                raise LexError(self.s, pos)
            pos = end_pos
            #log('pos %d', pos)
            num_tokens += 1
        return num_tokens

    def Tokens(self):
        # type: () -> Iterator[Tuple[h8_id_t, int]]
        """Yield (token id, end position) pairs over the current span."""
        pos = self.start_pos
        while pos < self.end_pos:
            # Find the first match, like above.
            # Note: frontend/match.py uses _LongestMatch(), which is different!
            # TODO: reconcile them.  This lexer should be expressible in re2c.
            for pat, tok_id in ATTR_VALUE_LEXER:
                m = pat.match(self.s, pos)
                if m:
                    if 0:
                        tok_str = m.group(0)
                        log('token = %r', tok_str)

                    end_pos = m.end(0)
                    yield tok_id, end_pos
                    pos = end_pos
                    break
            else:
                # Unreachable in practice: the '.' Invalid rule matches any char
                raise AssertionError('h8_id.Invalid rule should have matched')
728
729
def ReadUntilStartTag(it, tag_lexer, tag_name):
    # type: (Iterator[Tuple[h8_id_t, int]], TagLexer, str) -> Tuple[int, int]
    """Find the next <foo>, returning its (start, end) positions

    Raise ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    for tok_id, end_pos in it:
        tag_lexer.Reset(pos, end_pos)
        if tok_id == h8_id.StartTag and tag_lexer.TagName() == tag_name:
            return pos, end_pos
        pos = end_pos

    raise ParseError('No start tag %r' % tag_name)
751
752
def ReadUntilEndTag(it, tag_lexer, tag_name):
    # type: (Iterator[Tuple[h8_id_t, int]], TagLexer, str) -> Tuple[int, int]
    """Find the next </foo>, returning its (start, end) position

    Raise ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    for tok_id, end_pos in it:
        tag_lexer.Reset(pos, end_pos)
        if tok_id == h8_id.EndTag and tag_lexer.TagName() == tag_name:
            return pos, end_pos
        pos = end_pos

    raise ParseError('No end tag %r' % tag_name)
774
775
# Entity name -> replacement character, for the 5 predefined XML entities.
# ToText() raises KeyError for any other named entity.
CHAR_ENTITY = {
    'amp': '&',
    'lt': '<',
    'gt': '>',
    'quot': '"',
    'apos': "'",
}
783
784
def ToText(s, left_pos=0, right_pos=-1):
    # type: (str, int, int) -> str
    """Given HTML, return text by unquoting &gt; and &lt; etc.

    Used by:
      doctools/oils_doc.py: PygmentsPlugin
      doctools/help_gen.py: HelpIndexCards

    In the latter case, we could process some tags, like:

    - Blue Link (not clickable, but still useful)
    - Red X

    That should be html.ToAnsi.

    Raises KeyError for named entities not in CHAR_ENTITY, and
    AssertionError for numeric character references (not handled yet).
    """
    f = StringIO()
    out = Output(s, f, left_pos, right_pos)

    pos = left_pos
    for tok_id, end_pos in ValidTokens(s, left_pos, right_pos):
        if tok_id in (h8_id.RawData, h8_id.BadAmpersand, h8_id.BadGreaterThan,
                      h8_id.BadLessThan):
            # NOTE(review): h8_id.BadLessThan is never produced by HTM8_LEX
            # above - confirm it exists in the h8_id enum.
            out.SkipTo(pos)
            out.PrintUntil(end_pos)

        elif tok_id == h8_id.CharEntity:  # &amp;

            # Strip the leading '&' and trailing ';' to get the entity name
            entity = s[pos + 1:end_pos - 1]

            out.SkipTo(pos)
            out.Print(CHAR_ENTITY[entity])
            out.SkipTo(end_pos)

        # Not handling these yet
        elif tok_id == h8_id.HexChar:
            raise AssertionError('Hex Char %r' % s[pos:pos + 20])

        elif tok_id == h8_id.DecChar:
            raise AssertionError('Dec Char %r' % s[pos:pos + 20])

        else:
            # Skip everything else (tags, comments, ...), leaving only text
            out.SkipTo(end_pos)

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()
833
834
# HTML5 void elements: tags that never have children and never get a closing
# tag, so Validate() treats <meta> like <meta/>.
# https://developer.mozilla.org/en-US/docs/Glossary/Void_element
VOID_ELEMENTS = [
    'area',
    'base',
    'br',
    'col',
    'embed',
    'hr',
    'img',
    'input',
    'link',
    'meta',
    'param',
    'source',
    'track',
    'wbr',
]
852
# Bit flags for Validate() and main()'s actions.
LEX_ATTRS = 1 << 1
LEX_QUOTED_VALUES = 1 << 2  # href="?x=42&amp;y=99"
NO_SPECIAL_TAGS = 1 << 3  # <script> <style>, VOID tags, etc.
BALANCED_TAGS = 1 << 4  # are tags balanced?
857
858
def Validate(contents, flags, counters):
    # type: (str, int, Counters) -> None
    """Lex (and, with BALANCED_TAGS, parse) contents, accumulating stats.

    Raises LexError on bad tokens/attribute values, and ParseError on
    unbalanced or mismatched tags when BALANCED_TAGS is in flags.
    """

    tag_lexer = TagLexer(contents)
    val_lexer = AttrValueLexer(contents)

    no_special_tags = bool(flags & NO_SPECIAL_TAGS)
    lx = Lexer(contents, no_special_tags=no_special_tags)
    tokens = []
    start_pos = 0
    tag_stack = []  # open tags awaiting their closing tag (BALANCED_TAGS)
    while True:
        tok_id, end_pos = lx.Read()
        #log('TOP %s %r', h8_id_str(tok_id), contents[start_pos:end_pos])

        if tok_id == h8_id.Invalid:
            raise LexError(contents, start_pos)
        if tok_id == h8_id.EndOfStream:
            break

        tokens.append((tok_id, end_pos))

        if tok_id == h8_id.StartEndTag:
            counters.num_start_end_tags += 1

            # Validate all attribute values in this tag
            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRawSlice()
            counters.num_attrs += len(all_attrs)
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                counters.num_val_tokens += val_lexer.NumTokens()

            #counters.debug_attrs.extend(all_attrs)

        elif tok_id == h8_id.StartTag:
            counters.num_start_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRawSlice()
            counters.num_attrs += len(all_attrs)
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                counters.num_val_tokens += val_lexer.NumTokens()

            #counters.debug_attrs.extend(all_attrs)

            if flags & BALANCED_TAGS:
                tag_name = lx.CanonicalTagName()
                if flags & NO_SPECIAL_TAGS:
                    # XML mode: every start tag needs a closing tag
                    tag_stack.append(tag_name)
                else:
                    # e.g. <meta> is considered self-closing, like <meta/>
                    if tag_name not in VOID_ELEMENTS:
                        tag_stack.append(tag_name)

                counters.max_tag_stack = max(counters.max_tag_stack,
                                             len(tag_stack))
        elif tok_id == h8_id.EndTag:
            if flags & BALANCED_TAGS:
                try:
                    expected = tag_stack.pop()
                except IndexError:
                    raise ParseError('Tag stack empty',
                                     s=contents,
                                     start_pos=start_pos)

                actual = lx.CanonicalTagName()
                if expected != actual:
                    raise ParseError(
                        'Got unexpected closing tag %r; opening tag was %r' %
                        (contents[start_pos:end_pos], expected),
                        s=contents,
                        start_pos=start_pos)

        start_pos = end_pos

    if len(tag_stack) != 0:
        raise ParseError('Missing closing tags at end of doc: %s' %
                         ' '.join(tag_stack),
                         s=contents,
                         start_pos=start_pos)

    counters.num_tokens += len(tokens)
942
943
def ToXml(htm8_str):
    # type: (str) -> str
    """Convert an HTM8 string toward XML.  WORK IN PROGRESS.

    Currently escapes stray & and > in raw data; the TODOs below are not
    implemented yet, so the output is not yet valid XML in general.

    TODO:
    1. Lex it
    2. < & > must be escaped
       a. in raw data
       b. in quoted strings
    3. <script> turned into CDATA
    4. void tags turned into self-closing tags
    5. case-sensitive tag matching - not sure about this
    """

    tag_lexer = TagLexer(htm8_str)
    val_lexer = AttrValueLexer(htm8_str)

    f = StringIO()
    out = Output(htm8_str, f)

    lx = Lexer(htm8_str)

    pos = 0
    while True:
        tok_id, end_pos = lx.Read()

        if tok_id == h8_id.Invalid:
            raise LexError(htm8_str, pos)
        if tok_id == h8_id.EndOfStream:
            break

        if tok_id in (h8_id.RawData, h8_id.CharEntity, h8_id.HexChar,
                      h8_id.DecChar):
            out.PrintUntil(end_pos)
        elif tok_id in (h8_id.StartTag, h8_id.StartEndTag):
            tag_lexer.Reset(pos, end_pos)
            # TODO: reduce allocations here
            all_attrs = tag_lexer.AllAttrsRawSlice()
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                # TODO: get the kind of string
                #
                # Quoted: we need to replace & with &amp; and < with &lt;
                #         note > is not allowed
                # Unquoted: right now, we can just surround with double quotes
                #           because we don't allow any bad chars
                # Empty   : add "", so empty= becomes =""
                # Missing : add ="", so missing becomes missing=""

            tag_name = lx.CanonicalTagName()
            if tok_id == h8_id.StartTag and tag_name in VOID_ELEMENTS:
                # TODO: instead of closing >, print />
                pass
            # NOTE(review): this branch prints nothing itself; the tag text is
            # still emitted later because out.pos lags behind and the next
            # PrintUntil covers it - confirm this is intentional while the
            # TODOs above are unfinished.

        elif tok_id == h8_id.BadAmpersand:
            #out.SkipTo(pos)
            out.Print('&amp;')
            out.SkipTo(end_pos)

        elif tok_id == h8_id.BadGreaterThan:
            #out.SkipTo(pos)
            out.Print('&gt;')
            out.SkipTo(end_pos)
        else:
            out.PrintUntil(end_pos)

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()
1012
1013
class Counters(object):
    """Statistics accumulated across Validate() calls."""

    def __init__(self):
        # type: () -> None
        self.num_tokens = 0  # total tokens lexed
        self.num_start_tags = 0  # e.g. <a>
        self.num_start_end_tags = 0  # e.g. <br/>
        self.num_attrs = 0  # attributes across all tags
        self.max_tag_stack = 0  # deepest nesting seen (BALANCED_TAGS only)
        self.num_val_tokens = 0  # tokens inside attribute values

        #self.debug_attrs = []
1026
1027
def main(argv):
    # type: (List[str]) -> int
    """CLI driver.

    Actions:
      tokens      - lex stdin, log each token
      lex-htm8    - validate files listed on stdin (one filename per line)
      parse-htm8  - like lex-htm8, plus balanced-tag checking
      parse-xml   - like parse-htm8, with no special <script>/<style> handling
      todo        - placeholder

    Returns 0 on success, 1 if any file had errors.
    """
    action = argv[1]

    if action == 'tokens':
        contents = sys.stdin.read()

        lx = Lexer(contents)
        start_pos = 0
        while True:
            tok_id, end_pos = lx.Read()
            if tok_id == h8_id.Invalid:
                raise LexError(contents, start_pos)
            if tok_id == h8_id.EndOfStream:
                break

            frag = contents[start_pos:end_pos]
            log('%d %s %r', end_pos, h8_id_str(tok_id), frag)
            start_pos = end_pos

        return 0

    elif action in ('lex-htm8', 'parse-htm8', 'parse-xml'):

        errors = []
        counters = Counters()

        # Build the flag set from the action name
        flags = LEX_ATTRS | LEX_QUOTED_VALUES
        if action.startswith('parse-'):
            flags |= BALANCED_TAGS
        if action == 'parse-xml':
            flags |= NO_SPECIAL_TAGS

        i = 0
        for line in sys.stdin:
            filename = line.strip()
            with open(filename) as f:
                contents = f.read()

            try:
                Validate(contents, flags, counters)
            except LexError as e:
                log('Lex error in %r: %s', filename, e)
                errors.append((filename, e))
            except ParseError as e:
                log('Parse error in %r: %s', filename, e)
                errors.append((filename, e))
            i += 1

        # Summary of accumulated counters
        log('')
        log('%10d tokens', counters.num_tokens)
        log('%10d start/end tags', counters.num_start_end_tags)
        log('%10d start tags', counters.num_start_tags)
        log('%10d attrs', counters.num_attrs)
        log('%10d max tag stack depth', counters.max_tag_stack)
        log('%10d attr val tokens', counters.num_val_tokens)
        log('%10d errors', len(errors))
        if len(errors):
            return 1
        return 0

    elif action == 'todo':
        # Other algorithms:
        #
        # - select first subtree with given ID
        #   - this requires understanding the void tags I suppose
        # - select all subtrees that have a class
        # - materialize DOM

        # Safe-HTM8?  This is a filter
        return 0

    else:
        raise RuntimeError('Invalid action %r' % action)
1102
1103
if __name__ == '__main__':
    # CLI entry point; see main() for the supported actions.
    sys.exit(main(sys.argv))