OILS / lazylex / html.py View on Github | oils.pub

378 lines, 172 significant
1#!/usr/bin/env python2
2"""
3lazylex/html.py - Wrapper around HTM8
4
5See doc/lazylex.md for details.
6
7"""
8from __future__ import print_function
9
10import re
11
12from _devbuild.gen.htm8_asdl import (h8_id, h8_id_t, h8_tag_id, h8_tag_id_t,
13 h8_tag_id_str)
14from data_lang.htm8 import (Lexer, LexError, ParseError, Output, _NAME_RE)
15from doctools.util import log
16
17_ = log
18
19try:
20 from cStringIO import StringIO
21except ImportError:
22 # for python3
23 from io import StringIO # type: ignore
24import sys
25
26if sys.version_info.major == 2:
27 from typing import List, Tuple, Iterator, Optional
28
29
def _Tokens(s, left_pos, right_pos):
    # type: (str, int, int) -> Iterator[Tuple[h8_id_t, int]]
    """Yield (token id, position) pairs read from a Lexer over s.

    The position is whatever Lexer.Read() returns; callers in this file treat
    it as the END position of the token.

    The h8_id.EndOfStream token is yielded too, and it is always the last
    item.  Nothing is filtered here, so h8_id.Invalid may be yielded.

    Args:
      s: string to parse
      left_pos, right_pos: Optional span boundaries, handed to Lexer as is.
    """
    lexer = Lexer(s, left_pos, right_pos)
    done = False
    while not done:
        tok_id, end_pos = lexer.Read()
        # Decide now, but still hand EndOfStream to the consumer below
        done = (tok_id == h8_id.EndOfStream)
        yield tok_id, end_pos
43
44
def ValidTokens(s, left_pos=0, right_pos=-1):
    # type: (str, int, int) -> Iterator[Tuple[h8_id_t, int]]
    """Like _Tokens(), but callers never see h8_id.Invalid.

    Yields the same (token id, end position) pairs as _Tokens().  When an
    Invalid token shows up, LexError is raised instead, pointing at the
    position where that token STARTS (the end of the previous token, or
    left_pos for the very first one).

    This is deliberately a separate function from _Tokens(): a 'yield'
    transformation on the raw token stream may be wanted later, and mixing
    exceptions into it might complicate that.
    """
    tok_start = left_pos
    for tok_id, tok_end in _Tokens(s, left_pos, right_pos):
        if tok_id == h8_id.Invalid:
            raise LexError('ValidTokens() got invalid token', s, tok_start)
        yield tok_id, tok_end
        # The next token begins where this one ended
        tok_start = tok_end
59
60
def _ReadUntilTag(it, tag_lexer, want_id, tag_name, not_found_fmt, start_pos):
    # type: (Iterator[Tuple[h8_id_t, int]], TagLexer, h8_id_t, str, str, int) -> Tuple[int, int]
    """Shared scan loop behind ReadUntilStartTag() and ReadUntilEndTag().

    Args:
      it: iterator of (token id, end position) pairs; it is consumed up to
          and including the matching token, and can be used again afterward.
      tag_lexer: reset to the span of every token that is read
      want_id: token id to look for (h8_id.StartTag or h8_id.EndTag)
      tag_name: tag name the token must have
      not_found_fmt: '%r' format string for the ParseError message
      start_pos: position where the NEXT token from 'it' begins

    Returns:
      (start, end) positions of the first matching tag.

    Raises:
      ParseError if 'it' is exhausted without a match.
    """
    pos = start_pos
    for tok_id, end_pos in it:
        # Reset on every token, not only on matches, so the tag_lexer always
        # reflects the last token read.
        tag_lexer.Reset(pos, end_pos)
        # GetTagName() is only evaluated for tokens of the wanted kind
        if tok_id == want_id and tag_lexer.GetTagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError(not_found_fmt % tag_name)


def ReadUntilStartTag(it, tag_lexer, tag_name, start_pos=0):
    # type: (Iterator[Tuple[h8_id_t, int]], TagLexer, str, int) -> Tuple[int, int]
    """Find the next <foo>, returning its (start, end) positions

    Raise ParseError if it's not found.

    tag_lexer is RESET.

    Args:
      start_pos: Where the next token from 'it' begins.  The tokens only
          carry END positions, so the start of the first one has to be
          supplied.  The default 0 is right for a fresh iterator over the
          whole string; pass left_pos for ValidTokens(s, left_pos), or the
          end position of the last token already consumed from 'it'.
    """
    return _ReadUntilTag(it, tag_lexer, h8_id.StartTag, tag_name,
                         'No start tag %r', start_pos)


def ReadUntilEndTag(it, tag_lexer, tag_name, start_pos=0):
    # type: (Iterator[Tuple[h8_id_t, int]], TagLexer, str, int) -> Tuple[int, int]
    """Find the next </foo>, returning its (start, end) position

    Raise ParseError if it's not found.

    tag_lexer is RESET.

    Args:
      start_pos: Where the next token from 'it' begins.  The tokens only
          carry END positions, so the start of the first one has to be
          supplied.  The default 0 is right for a fresh iterator over the
          whole string; when continuing on an iterator that was already
          advanced (e.g. by ReadUntilStartTag), pass the end position that
          call returned.
    """
    return _ReadUntilTag(it, tag_lexer, h8_id.EndTag, tag_name,
                         'No end tag %r', start_pos)
105
106
# Named character entities that ToText() knows how to decode.  The key is the
# name between '&' and ';', and the value is the character it stands for.
CHAR_ENTITY = dict(
    amp='&',
    lt='<',
    gt='>',
    quot='"',
    apos="'",
)
114
115
def ToText(s, left_pos=0, right_pos=-1):
    # type: (str, int, int) -> str
    """Given HTML, return text by unquoting &gt; and &lt; etc.

    Raw data and the "bad" single characters (& < >) are copied through, the
    named entities in CHAR_ENTITY are decoded, and every other token (tags,
    comments, ...) is dropped.

    Used by:
      doctools/oils_doc.py: PygmentsPlugin
      doctools/help_gen.py: HelpIndexCards

    In the latter case, we could process some tags, like:

    - Blue Link (not clickable, but still useful)
    - Red X

    That should be html.ToAnsi.

    Raises:
      LexError: from ValidTokens(), on an invalid token
      KeyError: for a named entity that is not in CHAR_ENTITY
      AssertionError: for &#x...; and &#...; which are not handled yet
    """
    buf = StringIO()
    out = Output(s, buf, left_pos, right_pos)

    # Token kinds whose source text IS the output text
    verbatim_ids = (h8_id.RawData, h8_id.BadAmpersand, h8_id.BadGreaterThan,
                    h8_id.BadLessThan)

    tok_start = left_pos
    for tok_id, tok_end in ValidTokens(s, left_pos, right_pos):
        # Not handling these yet
        if tok_id == h8_id.HexChar:
            raise AssertionError('Hex Char %r' % s[tok_start:tok_start + 20])
        if tok_id == h8_id.DecChar:
            raise AssertionError('Dec Char %r' % s[tok_start:tok_start + 20])

        if tok_id in verbatim_ids:
            out.SkipTo(tok_start)
            out.PrintUntil(tok_end)

        elif tok_id == h8_id.CharEntity:  # &amp;
            # Strip the leading & and the trailing ;
            entity_name = s[tok_start + 1:tok_end - 1]

            out.SkipTo(tok_start)
            out.Print(CHAR_ENTITY[entity_name])
            out.SkipTo(tok_end)

        else:
            # Skip everything else
            out.SkipTo(tok_end)

        tok_start = tok_end

    out.PrintTheRest()
    return buf.getvalue()
164
165
166#
167# OLD TagLexer API - REMOVE THIS
168#
169# HTML 5 doesn't restrict tag names at all
170# https://html.spec.whatwg.org/#toc-syntax
171#
172# XML allows : - .
173# https://www.w3.org/TR/xml/#NT-NameChar
174
175# Namespaces for MathML, SVG
176# XLink, XML, XMLNS
177#
178# https://infra.spec.whatwg.org/#namespaces
179#
180# Allow - for td-attrs
181
# Similar to _UNQUOTED_VALUE in data_lang/htm8.py
# Zero or more chars that are not whitespace, < > & quotes, or NUL.  Because
# of the trailing *, this can match the empty string.
_UNQUOTED_VALUE_OLD = r'''[^ \t\r\n<>&"'\x00]*'''

# Beginning of a tag: optional / (end tag), optional whitespace, then the tag
# name as group 1.  TagLexer.Tokens() applies this at start_pos + 1.
_TAG_RE = re.compile(r'/? \s* (%s)' % _NAME_RE, re.VERBOSE)

# End of a tag: optional whitespace, optional / (self-closing), then >
_TAG_LAST_RE = re.compile(r'\s* /? >', re.VERBOSE)

# To match href="foo"

# <button disabled> is standard usage

# NOTE: This OLD regex still allows whitespace around = (see \s* = \s* below)
# <a foo = "bar"> makes sense in XML
# But then you also have
# <a foo= bar> - which is TWO attributes, in HTML5
# So the space is problematic

# Groups, as used by TagLexer.Tokens():
#   1 - attribute name
#   2 - double quoted value, without the quotes
#   3 - single quoted value, without the quotes
#   4 - unquoted value
# Groups 2-4 are all None when there is no value, e.g. <button disabled>
_ATTR_RE = re.compile(
    r'''
\s+ # Leading whitespace is required
(%s) # Attribute name
(?: # Optional attribute value
 \s* = \s* # Spaces allowed around =
 (?:
 " ([^>"\x00]*) " # double quoted value
 | ' ([^>'\x00]*) ' # single quoted value
 | (%s) # Attribute value
 )
)?
''' % (_NAME_RE, _UNQUOTED_VALUE_OLD), re.VERBOSE)
212
213
class TagLexer(object):
    """
    Lexes the inside of ONE tag, like <a href="..."> or <link type="..." />.

    The instance holds the whole string; Reset() selects the span of the tag
    to look at.  Then it can answer:

    - What is the tag?
    - What are the attributes, as (name, value_start_pos, value_end_pos)?
    """

    def __init__(self, s):
        # type: (str) -> None
        self.s = s
        # -1 is invalid: no tag is selected until Reset() is called
        self.start_pos = self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        # type: (int, int) -> None
        """Select a new tag span, so that one instance can be reused."""
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos

        self.start_pos, self.end_pos = start_pos, end_pos

    def WholeTagString(self):
        # type: () -> str
        """Return the text of the selected span, e.g. <a href='foo'>"""
        lo, hi = self.start_pos, self.end_pos
        return self.s[lo:hi]

    def GetTagName(self):
        # type: () -> str
        """Return the tag name, which is the first token of Tokens()."""
        _unused_id, name_start, name_end = next(self.Tokens())
        return self.s[name_start:name_end]

    def GetSpanForAttrValue(self, attr_name):
        # type: (str) -> Tuple[int, int]
        """
        Return the (start, end) span of the value of the first attribute named
        attr_name, or (-1, -1) if there is no such attribute.

        Used by oils_doc.py, for href shortcuts
        """
        # Algorithm: search for QuotedValue or UnquotedValue after AttrName
        # TODO: Could also cache these

        events = self.Tokens()
        for tok_id, start, end in events:
            if tok_id != h8_tag_id.AttrName:
                continue
            if self.s[start:end] != attr_name:
                continue

            # The value should come next
            try:
                val_id, val_start, val_end = next(events)
            except StopIteration:
                break
            assert val_id in (h8_tag_id.QuotedValue, h8_tag_id.UnquotedValue,
                              h8_tag_id.MissingValue), h8_tag_id_str(val_id)
            # Stop at the first match; the rest of the tag is not lexed
            return val_start, val_end

        return -1, -1

    def GetAttrRaw(self, attr_name):
        # type: (str) -> Optional[str]
        """
        Return the value text of attr_name, or None if the attribute is absent.

        The text is the raw slice of the source, so it may be UNESCAPED.
        """
        start, end = self.GetSpanForAttrValue(attr_name)
        return None if start == -1 else self.s[start:end]

    def AllAttrsRawSlice(self):
        # type: () -> List[Tuple[str, int, int]]
        """
        Get a list of (name, value_start, value_end), like
        [('class', 3, 5), ('href', 9, 12)]
        """
        slices = []  # type: List[Tuple[str, int, int]]
        events = self.Tokens()
        for tok_id, start, end in events:
            if tok_id != h8_tag_id.AttrName:
                continue
            name = self.s[start:end]

            # The value should come next
            try:
                val_id, val_start, val_end = next(events)
            except StopIteration:
                break
            assert val_id in (h8_tag_id.QuotedValue, h8_tag_id.UnquotedValue,
                              h8_tag_id.MissingValue), h8_tag_id_str(val_id)
            # Note: quoted values may have &amp;
            # We would need ANOTHER lexer to unescape them, but we
            # don't need that for ul-table
            slices.append((name, val_start, val_end))
        return slices

    def AllAttrsRaw(self):
        # type: () -> List[Tuple[str, str]]
        """
        Get a list of pairs [('class', 'foo'), ('href', '?foo=1&amp;bar=2')]

        The values are raw slices of the source, so quoted values may still be
        escaped.  We would need another lexer to unescape them.
        """
        return [(name, self.s[start:end])
                for name, start, end in self.AllAttrsRawSlice()]

    def Tokens(self):
        # type: () -> Iterator[Tuple[h8_tag_id_t, int, int]]
        """
        Yields a sequence of tokens: Tag (AttrName AttrValue?)*

        Where each Token is (Type, start_pos, end_pos)

        Note that start and end are NOT redundant!  We skip over some unwanted
        characters, like quotes and whitespace.

        Raises:
          RuntimeError: if no tag name is found after the first char
          LexError: if what follows the last attribute isn't the tag's end
        """
        tag_m = _TAG_RE.match(self.s, self.start_pos + 1)
        if tag_m is None:
            raise RuntimeError("Couldn't find HTML tag in %r" %
                               self.WholeTagString())
        yield h8_tag_id.TagName, tag_m.start(1), tag_m.end(1)

        # (group of _ATTR_RE, token type), tried in this order
        value_groups = (
            (2, h8_tag_id.QuotedValue),  # double quoted
            # single quoted - TODO: could have different token types
            (3, h8_tag_id.QuotedValue),
            (4, h8_tag_id.UnquotedValue),
        )

        pos = tag_m.end(0)
        # don't search past the end
        attr_m = _ATTR_RE.match(self.s, pos, self.end_pos)
        while attr_m is not None:
            yield h8_tag_id.AttrName, attr_m.start(1), attr_m.end(1)

            for group, value_id in value_groups:
                if attr_m.group(group) is not None:
                    yield value_id, attr_m.start(group), attr_m.end(group)
                    break
            else:
                # <button disabled> - empty span right after the name
                after = attr_m.end(0)
                yield h8_tag_id.MissingValue, after, after

            # Skip past the whole attribute, including a closing quote
            pos = attr_m.end(0)
            attr_m = _ATTR_RE.match(self.s, pos, self.end_pos)

        if _TAG_LAST_RE.match(self.s, pos) is None:
            raise LexError('Extra data at end of tag', self.s, pos)