#!/usr/bin/env python2
"""
lazylex/html.py - Low-Level HTML Processing.

See lazylex/README.md for details.

TODO: This should be an Oils library eventually.  It's a "lazily-parsed data
structure" like TSV8.
"""
from __future__ import print_function

try:
    from cStringIO import StringIO
except ImportError:
    from io import StringIO  # python3
import re
import sys

if sys.version_info.major == 2:
    from typing import List, Tuple


def log(msg, *args):
    msg = msg % args
    print(msg, file=sys.stderr)


class LexError(Exception):
    """For bad lexical elements like <> or &&"""

    def __init__(self, s, pos):
        self.s = s
        self.pos = pos

    def __str__(self):
        return '(LexError %r)' % (self.s[self.pos:self.pos + 20])


class ParseError(Exception):
    """For errors in the tag structure."""

    def __init__(self, msg, *args):
        self.msg = msg
        self.args = args

    def __str__(self):
        return '(ParseError %s)' % (self.msg % self.args)


class Output(object):
    """Takes an underlying input buffer and an output file.  Maintains a
    position in the input buffer.

    Print FROM the input or print new text to the output.
    """

    def __init__(self, s, f, left_pos=0, right_pos=-1):
        self.s = s
        self.f = f
        self.pos = left_pos
        self.right_pos = len(s) if right_pos == -1 else right_pos

    def SkipTo(self, pos):
        """Skip to a position in the input."""
        self.pos = pos

    def PrintUntil(self, pos):
        """Print the input from the current position up to pos."""
        piece = self.s[self.pos:pos]
        self.f.write(piece)
        self.pos = pos

    def PrintTheRest(self):
        """Print the input from the current position to the end."""
        self.PrintUntil(self.right_pos)

    def Print(self, s):
        """Print new text (not from the input) to the output file."""
        self.f.write(s)


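# Editor's sketch (not part of the original module): typical use of Output is
# to copy spans of the input through unchanged while splicing in replacement
# text.  The input string and positions below are hypothetical.
def _ExampleOutput():
    s = 'a &amp; b'
    f = StringIO()
    out = Output(s, f)
    out.PrintUntil(2)    # copy 'a ' from the input
    out.Print('&')       # splice in new text
    out.SkipTo(7)        # skip over '&amp;' in the input
    out.PrintTheRest()   # copy ' b'
    assert f.getvalue() == 'a & b'

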
# HTML Tokens
TOKENS = ('Decl Comment Processing StartTag StartEndTag EndTag '
          'DecChar HexChar CharEntity RawData Invalid EndOfStream').split()


class Tok(object):
    """
    Avoid lint errors by using these aliases
    """
    pass


assert len(TOKENS) == 12, TOKENS

TOKEN_NAMES = [None] * len(TOKENS)  # type: List[str]

this_module = sys.modules[__name__]
for i, tok_str in enumerate(TOKENS):
    setattr(this_module, tok_str, i)
    setattr(Tok, tok_str, i)
    TOKEN_NAMES[i] = tok_str


def TokenName(tok_id):
    return TOKEN_NAMES[tok_id]


def MakeLexer(rules):
    return [(re.compile(pat, re.VERBOSE), i) for (pat, i) in rules]


#
# Eggex
#
# Tag      = / ~['>']+ /

# Is this valid?  A single character?
# Tag      = / ~'>'* /

# Maybe better: / [NOT '>']+ /
# capital letters not allowed there?
#
# But then this is confusing:
#   / [NOT ~digit]+ /
#
# / [NOT digit] / is [^\d]
# / ~digit /      is \D
#
# Or maybe:
#
# / [~ digit]+ /
# / [~ '>']+ /
# / [NOT '>']+ /

# End      = / '</' Tag '>' /
# StartEnd = / '<' Tag '/>' /
# Start    = / '<' Tag '>' /
#
# EntityRef = / '&' dot{* N} ';' /

LEXER = [
    # TODO: instead of nongreedy matches, the loop can just do .find('-->')
    # and .find('?>')

    # Actually non-greedy matches are regular and can be matched in linear
    # time with RE2.
    #
    # https://news.ycombinator.com/item?id=27099798
    #
    # Maybe try combining all of these for speed.

    # . is any char except newline
    # https://re2c.org/manual/manual_c.html

    # Hack from Claude: \s\S instead of re.DOTALL.  I don't like this
    #(r'<!-- [\s\S]*? -->', Tok.Comment),
    (r'<!-- (?:.|[\n])*? -->', Tok.Comment),

    #(r'<!-- .*? -->', Tok.Comment),

    # Processing instructions are XML-only, but they are treated like a
    # comment in HTML:
    #
    # https://developer.mozilla.org/en-US/docs/Web/API/ProcessingInstruction
    #
    # We don't want to confuse them with start tags, so we recognize them at
    # the top level.
    (r'<\? (?:.|\n)*? \?>', Tok.Processing),

    # NOTE: < is allowed in these.
    (r'<! [^>]+ >', Tok.Decl),  # <!DOCTYPE html>
    (r'</ [^>]+ >', Tok.EndTag),  # end tag, like </a>
    (r'< [^>]+ />', Tok.StartEndTag),  # self-closing <br/> -- must come before StartTag
    (r'< [^>]+ >', Tok.StartTag),  # start tag, like <a>
    (r'&\# [0-9]+ ;', Tok.DecChar),
    (r'&\# x[0-9a-fA-F]+ ;', Tok.HexChar),
    (r'& [a-zA-Z]+ ;', Tok.CharEntity),

    # Note: > is allowed in raw data.
    # https://stackoverflow.com/questions/10462348/right-angle-bracket-in-html
    (r'[^&<]+', Tok.RawData),
    (r'.', Tok.Invalid),  # error!
]

LEXER = MakeLexer(LEXER)


class Lexer(object):

    def __init__(self, s, left_pos=0, right_pos=-1):
        self.s = s
        self.pos = left_pos
        self.right_pos = len(s) if right_pos == -1 else right_pos
        self.cache = {}  # string -> compiled regex pattern object

    def _Peek(self):
        # type: () -> Tuple[int, int]
        """
        Note: not using _Peek() now
        """
        if self.pos == self.right_pos:
            return Tok.EndOfStream, self.pos

        assert self.pos < self.right_pos, self.pos

        # Find the first match.
        # Note: frontend/match.py uses _LongestMatch(), which is different!
        # TODO: reconcile them.  This lexer should be expressible in re2c.

        # TODO: Get rid of non-greedy match

        for pat, tok_id in LEXER:
            m = pat.match(self.s, self.pos)
            if m:
                return tok_id, m.end()
        else:
            raise AssertionError('Tok.Invalid rule should have matched')

    def Read(self):
        # type: () -> Tuple[int, int]
        tok_id, end_pos = self._Peek()
        self.pos = end_pos  # advance
        return tok_id, end_pos

    def LookAhead(self, regex):
        # Cache the regex compilation.  This could also be LookAheadFor(THEAD)
        # or something.
        pat = self.cache.get(regex)
        if pat is None:
            pat = re.compile(regex)
            self.cache[regex] = pat

        m = pat.match(self.s, self.pos)
        return m is not None


def _Tokens(s, left_pos, right_pos):
    """
    Args:
      s: string to parse
      left_pos, right_pos: Optional span boundaries.
    """
    lx = Lexer(s, left_pos, right_pos)
    while True:
        tok_id, pos = lx.Read()
        yield tok_id, pos
        if tok_id == Tok.EndOfStream:
            break


def ValidTokens(s, left_pos=0, right_pos=-1):
    """Wrapper around _Tokens to prevent callers from having to handle Invalid.

    I'm not combining the two functions because I might want to do a
    'yield' transformation on Tokens()?  Exceptions might complicate the
    issue?
    """
    pos = left_pos
    for tok_id, end_pos in _Tokens(s, left_pos, right_pos):
        if tok_id == Tok.Invalid:
            raise LexError(s, pos)
        yield tok_id, end_pos
        pos = end_pos


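# Editor's sketch (not part of the original module): each token is a
# (token_id, end_pos) pair, so the text of a token is the slice from the
# previous end position to its own.  The HTML snippet is hypothetical.
def _ExampleTokenize():
    html = 'x &amp; <b>y</b>'
    toks = []
    pos = 0
    for tok_id, end_pos in ValidTokens(html):
        toks.append((TokenName(tok_id), html[pos:end_pos]))
        pos = end_pos
    assert toks[0] == ('RawData', 'x ')
    assert toks[1] == ('CharEntity', '&amp;')
    assert toks[3] == ('StartTag', '<b>')

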
# Tag names:
#   Match <a or </a
#   Match <h2, but not <2h
#
# HTML 5 doesn't restrict tag names at all
#   https://html.spec.whatwg.org/#toc-syntax
#
# XML allows : - .
#   https://www.w3.org/TR/xml/#NT-NameChar

# Namespaces for MathML, SVG
# XLink, XML, XMLNS
#
# https://infra.spec.whatwg.org/#namespaces
#
# Allow - for td-attrs

# Tag name, or attribute name
_NAME = r'[a-zA-Z][a-zA-Z0-9_\-]*'  # must start with letter

_ATTR_VALUE = r'[a-zA-Z0-9_\-]+'  # allow hyphens

_TAG_RE = re.compile(r'/? \s* (%s)' % _NAME, re.VERBOSE)

# To match href="foo"

_ATTR_RE = re.compile(
    r'''
\s+                 # Leading whitespace is required
(%s)                # Attribute name
(?:                 # Optional attribute value
  \s* = \s*
  (?:
      " ([^>"]*) "  # double quoted value
    | (%s)          # unquoted value
                    # TODO: relax this?  for href=$foo
  )
)?
''' % (_NAME, _ATTR_VALUE), re.VERBOSE)

TagName, AttrName, UnquotedValue, QuotedValue = range(4)


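# Editor's sketch (not part of the original module): what _ATTR_RE captures.
# Group 1 is the attribute name, group 2 a double-quoted value, group 3 an
# unquoted value.  The input string is hypothetical.
def _ExampleAttrMatch():
    s = '<a href="/foo" selected>'
    m = _ATTR_RE.match(s, 2)  # start right after the tag name
    assert m.group(1) == 'href'
    assert m.group(2) == '/foo'  # quoted value

    m = _ATTR_RE.match(s, m.end(0))
    assert m.group(1) == 'selected'
    assert m.group(2) is None and m.group(3) is None  # no value

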
class TagLexer(object):
    """
    Given a tag like <a href="..."> or <link type="..." />, the TagLexer
    provides a few operations:

    - What is the tag?
    - Iterate through the attributes, giving (name, value_start_pos, value_end_pos)
    """

    def __init__(self, s):
        self.s = s
        self.start_pos = -1  # Invalid
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        """Reuse instances of this object."""
        self.start_pos = start_pos
        self.end_pos = end_pos

    def TagString(self):
        return self.s[self.start_pos:self.end_pos]

    def TagName(self):
        # First event
        tok_id, start, end = next(self.Tokens())
        return self.s[start:end]

    def GetSpanForAttrValue(self, attr_name):
        # Algorithm: search for QuotedValue or UnquotedValue after AttrName
        # TODO: Could also cache these

        events = self.Tokens()
        val = (-1, -1)
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == AttrName:
                    name = self.s[start:end]
                    if name == attr_name:
                        # The value should come next
                        tok_id, start, end = next(events)
                        if tok_id in (QuotedValue, UnquotedValue):
                            # Note: quoted values may have &amp;
                            # We would need ANOTHER lexer to unescape them.
                            # Right now help_gen.py and oils_doc.py
                            val = start, end
                        break

        except StopIteration:
            pass
        return val

    def GetAttrRaw(self, attr_name):
        """
        Return the value, which may be UNESCAPED.
        """
        # Algorithm: search for QuotedValue or UnquotedValue after AttrName
        # TODO: Could also cache these
        start, end = self.GetSpanForAttrValue(attr_name)
        if start == -1:
            return None
        return self.s[start:end]

    def AllAttrsRaw(self):
        """
        Get a list of pairs [('class', 'foo'), ('href', '?foo=1&amp;bar=2')]

        The quoted values may be escaped.  We would need another lexer to
        unescape them.
        """
        pairs = []
        events = self.Tokens()
        try:
            while True:
                tok_id, start, end = next(events)
                if tok_id == AttrName:
                    name = self.s[start:end]

                    # The value should come next
                    tok_id, start, end = next(events)
                    if tok_id in (QuotedValue, UnquotedValue):
                        # Note: quoted values may have &amp;
                        # We would need ANOTHER lexer to unescape them, but we
                        # don't need that for ul-table

                        val = self.s[start:end]
                        pairs.append((name, val))
        except StopIteration:
            pass
        return pairs

    def Tokens(self):
        """
        Yields a sequence of tokens: Tag (AttrName AttrValue?)*

        Where each Token is (Type, start_pos, end_pos)

        Note that start and end are NOT redundant!  We skip over some unwanted
        characters.
        """
        m = _TAG_RE.match(self.s, self.start_pos + 1)
        if not m:
            raise RuntimeError("Couldn't find HTML tag in %r" %
                               self.TagString())
        yield TagName, m.start(1), m.end(1)

        pos = m.end(0)

        while True:
            # don't search past the end
            m = _ATTR_RE.match(self.s, pos, self.end_pos)
            if not m:
                # A validating parser would check that > or /> is next --
                # there's no junk
                break

            yield AttrName, m.start(1), m.end(1)

            # Quoted is group 2, unquoted is group 3.
            if m.group(2) is not None:
                yield QuotedValue, m.start(2), m.end(2)
            elif m.group(3) is not None:
                yield UnquotedValue, m.start(3), m.end(3)

            # Skip past the "
            pos = m.end(0)


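# Editor's sketch (not part of the original module): reading the tag name and
# attributes out of a start tag with TagLexer.  The tag string is hypothetical.
def _ExampleTagLexer():
    s = '<a href="/blog/" class="nav">'
    tag_lexer = TagLexer(s)
    tag_lexer.Reset(0, len(s))
    assert tag_lexer.TagName() == 'a'
    assert tag_lexer.GetAttrRaw('href') == '/blog/'
    assert tag_lexer.AllAttrsRaw() == [('href', '/blog/'), ('class', 'nav')]

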
def ReadUntilStartTag(it, tag_lexer, tag_name):
    """Find the next <foo>, returning its (start, end) positions

    Raise ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == Tok.StartTag and tag_lexer.TagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No start tag %r', tag_name)


def ReadUntilEndTag(it, tag_lexer, tag_name):
    """Find the next </foo>, returning its (start, end) position

    Raise ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == Tok.EndTag and tag_lexer.TagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No end tag %r', tag_name)


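# Editor's sketch (not part of the original module): finding the first
# <title> ... </title> span in a document.  The HTML and variable names are
# hypothetical.
def _ExampleReadUntil():
    html = '<html><title>Hi</title></html>'
    it = ValidTokens(html)
    tag_lexer = TagLexer(html)

    start, end = ReadUntilStartTag(it, tag_lexer, 'title')
    assert html[start:end] == '<title>'

    start, end = ReadUntilEndTag(it, tag_lexer, 'title')
    assert html[start:end] == '</title>'

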
CHAR_ENTITY = {
    'amp': '&',
    'lt': '<',
    'gt': '>',
    'quot': '"',
}


def ToText(s, left_pos=0, right_pos=-1):
    """Given HTML, return text by unquoting &gt; and &lt; etc.

    Used by:
      doctools/oils_doc.py: PygmentsPlugin
      doctools/make_help.py: HelpIndexCards

    In the latter case, we could process some tags, like:

    - Blue Link (not clickable, but still useful)
    - Red X

    That should be html.ToAnsi.
    """
    f = StringIO()
    out = Output(s, f, left_pos, right_pos)

    pos = left_pos
    for tok_id, end_pos in ValidTokens(s, left_pos, right_pos):
        if tok_id == Tok.RawData:
            out.SkipTo(pos)
            out.PrintUntil(end_pos)

        elif tok_id == Tok.CharEntity:  # &amp;
            entity = s[pos + 1:end_pos - 1]

            out.SkipTo(pos)
            out.Print(CHAR_ENTITY[entity])
            out.SkipTo(end_pos)

        # Not handling these yet
        elif tok_id == Tok.HexChar:
            raise AssertionError('Hex Char %r' % s[pos:pos + 20])

        elif tok_id == Tok.DecChar:
            raise AssertionError('Dec Char %r' % s[pos:pos + 20])

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()


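# Editor's sketch (not part of the original module): ToText() copies raw data
# through, decodes the entities in CHAR_ENTITY, and skips over tags that come
# before raw text.  The input is hypothetical.
def _ExampleToText():
    assert ToText('<b>x</b> &amp; y') == 'x & y'

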
def main(argv):
    action = argv[1]

    if action == 'well-formed':
        num_tokens = 0
        errors = []
        i = 0
        for line in sys.stdin:
            name = line.strip()
            with open(name) as f:
                contents = f.read()

            lx = ValidTokens(contents)
            try:
                tokens = list(lx)
            except LexError as e:
                log('Error in %r: %s', name, e)
                errors.append((name, e))
            else:
                num_tokens += len(tokens)
                #print('%d %s' % (len(tokens), name))
            i += 1

        log('')
        log('  %d tokens in %d files', num_tokens, i)
        log('  %d errors', len(errors))
        if 0:
            for name, e in errors:
                log('Error in %r: %s', name, e)

    else:
        raise RuntimeError('Invalid action %r' % action)


if __name__ == '__main__':
    main(sys.argv)