OILS / doctools / html_old.py View on Github | oils.pub

377 lines, 172 significant
1#!/usr/bin/env python2
2"""
3doctools/html_old.py - APIs that should be replaced by data_lang/htm8.py
4
5See doc/lazylex.md for details.
6"""
7from __future__ import print_function
8
9import re
10
11from _devbuild.gen.htm8_asdl import (h8_id, h8_id_t, h8_tag_id, h8_tag_id_t,
12 h8_tag_id_str)
13from data_lang.htm8 import (Lexer, LexError, ParseError, Output, _NAME_RE)
14from doctools.util import log
15
16_ = log
17
18try:
19 from cStringIO import StringIO
20except ImportError:
21 # for python3
22 from io import StringIO # type: ignore
23import sys
24
25if sys.version_info.major == 2:
26 from typing import List, Tuple, Iterator, Optional
27
28
def _Tokens(s, left_pos, right_pos):
    # type: (str, int, int) -> Iterator[Tuple[h8_id_t, int]]
    """Generate (token id, end position) pairs for an HTML string.

    Args:
      s: string to parse
      left_pos, right_pos: Optional span boundaries.
    """
    lexer = Lexer(s, left_pos, right_pos)
    tok_id = None
    # Read() returns h8_id.EndOfStream exactly once; yield it, then stop.
    while tok_id != h8_id.EndOfStream:
        tok_id, end_pos = lexer.Read()
        yield tok_id, end_pos
42
43
def ValidTokens(s, left_pos=0, right_pos=-1):
    # type: (str, int, int) -> Iterator[Tuple[h8_id_t, int]]
    """Wrapper around _Tokens to prevent callers from having to handle Invalid.

    I'm not combining the two functions because I might want to do a
    'yield' transformation on Tokens()?  Exceptions might complicate the
    issue?
    """
    start = left_pos
    for tok_id, end_pos in _Tokens(s, left_pos, right_pos):
        # Report the error at the START of the bad token, not its end.
        if tok_id == h8_id.Invalid:
            raise LexError('ValidTokens() got invalid token', s, start)
        yield tok_id, end_pos
        start = end_pos
58
59
def ReadUntilStartTag(it, tag_lexer, tag_name):
    # type: (Iterator[Tuple[h8_id_t, int]], TagLexer, str) -> Tuple[int, int]
    """Find the next <foo>, returning its (start, end) positions.

    Raises:
      ParseError: if no such start tag is found before the stream ends.

    tag_lexer is RESET.
    """
    pos = 0
    # A plain for loop terminates on StopIteration, like the explicit
    # next()/except version would.
    for tok_id, end_pos in it:
        tag_lexer.Reset(pos, end_pos)
        if tok_id == h8_id.StartTag and tag_lexer.GetTagName() == tag_name:
            return pos, end_pos
        pos = end_pos

    raise ParseError('No start tag %r' % tag_name)
81
82
def ReadUntilEndTag(it, tag_lexer, tag_name):
    # type: (Iterator[Tuple[h8_id_t, int]], TagLexer, str) -> Tuple[int, int]
    """Find the next </foo>, returning its (start, end) positions.

    Raises:
      ParseError: if no such end tag is found before the stream ends.

    tag_lexer is RESET.
    """
    pos = 0
    # A plain for loop terminates on StopIteration, like the explicit
    # next()/except version would.
    for tok_id, end_pos in it:
        tag_lexer.Reset(pos, end_pos)
        if tok_id == h8_id.EndTag and tag_lexer.GetTagName() == tag_name:
            return pos, end_pos
        pos = end_pos

    raise ParseError('No end tag %r' % tag_name)
104
105
# The named character references that ToText() knows how to decode.
# Numeric references (&#x..; / &#..;) are not handled -- see ToText().
CHAR_ENTITY = {
    'amp': '&',
    'lt': '<',
    'gt': '>',
    'quot': '"',
    'apos': "'",
}
113
114
def ToText(s, left_pos=0, right_pos=-1):
    # type: (str, int, int) -> str
    """Given HTML, return text by unquoting &gt; and &lt; etc.

    Used by:
      doctools/oils_doc.py: PygmentsPlugin
      doctools/help_gen.py: HelpIndexCards

    In the latter case, we could process some tags, like:

    - Blue Link (not clickable, but still useful)
    - Red X

    That should be html.ToAnsi.
    """
    f = StringIO()
    out = Output(s, f, left_pos, right_pos)

    pos = left_pos
    for tok_id, end_pos in ValidTokens(s, left_pos, right_pos):
        if tok_id in (h8_id.RawData, h8_id.BadAmpersand, h8_id.BadGreaterThan,
                      h8_id.BadLessThan):
            # Plain text, and bare & > < characters, pass through unchanged.
            out.SkipTo(pos)
            out.PrintUntil(end_pos)

        elif tok_id == h8_id.CharEntity:  # e.g. &amp;
            # Strip leading '&' and trailing ';' to get the entity name.
            name = s[pos + 1:end_pos - 1]

            out.SkipTo(pos)
            out.Print(CHAR_ENTITY[name])
            out.SkipTo(end_pos)

        # Not handling these yet
        elif tok_id == h8_id.HexChar:
            raise AssertionError('Hex Char %r' % s[pos:pos + 20])

        elif tok_id == h8_id.DecChar:
            raise AssertionError('Dec Char %r' % s[pos:pos + 20])

        else:
            # Tags, comments, etc. contribute no text.
            out.SkipTo(end_pos)

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()
163
164
165#
166# OLD TagLexer API - REMOVE THIS
167#
168# HTML 5 doesn't restrict tag names at all
169# https://html.spec.whatwg.org/#toc-syntax
170#
171# XML allows : - .
172# https://www.w3.org/TR/xml/#NT-NameChar
173
174# Namespaces for MathML, SVG
175# XLink, XML, XMLNS
176#
177# https://infra.spec.whatwg.org/#namespaces
178#
179# Allow - for td-attrs
180
# Similar to _UNQUOTED_VALUE in data_lang/htm8.py
# Forbids whitespace, tag/entity delimiters, quotes, and NUL.
_UNQUOTED_VALUE_OLD = r'''[^ \t\r\n<>&"'\x00]*'''

# Matches an optional leading / (for end tags) and the tag name.
_TAG_RE = re.compile(r'/? \s* (%s)' % _NAME_RE, re.VERBOSE)

# Matches the close of a tag: optional / (self-closing), then >.
_TAG_LAST_RE = re.compile(r'\s* /? >', re.VERBOSE)

# To match href="foo"

# <button disabled> is standard usage

# NOTE: This used to allow whitespace around =
# <a foo = "bar"> makes sense in XML
# But then you also have
# <a foo= bar> - which is TWO attributes, in HTML5
# So the space is problematic

# Groups: 1 = attribute name, 2 = double-quoted value, 3 = single-quoted
# value, 4 = unquoted value.  All of 2/3/4 unset means a bare attribute.
_ATTR_RE = re.compile(
    r'''
\s+           # Leading whitespace is required
(%s)          # Attribute name
(?:           # Optional attribute value
  \s* = \s*   # Spaces allowed around =
  (?:
    " ([^>"\x00]*) "    # double quoted value
  | ' ([^>'\x00]*) '    # single quoted value
  | (%s)                # Attribute value
  )
)?
''' % (_NAME_RE, _UNQUOTED_VALUE_OLD), re.VERBOSE)
211
212
class TagLexer(object):
    """
    Given a tag like <a href="..."> or <link type="..." />, the TagLexer
    provides a few operations:

    - What is the tag?
    - Iterate through the attributes, giving (name, value_start_pos, value_end_pos)
    """

    def __init__(self, s):
        # type: (str) -> None
        self.s = s  # the HTML string being lexed; positions index into it
        self.start_pos = -1  # Invalid until Reset() is called
        self.end_pos = -1
227
228 def Reset(self, start_pos, end_pos):
229 # type: (int, int) -> None
230 """Reuse instances of this object."""
231 assert start_pos >= 0, start_pos
232 assert end_pos >= 0, end_pos
233
234 self.start_pos = start_pos
235 self.end_pos = end_pos
236
237 def WholeTagString(self):
238 # type: () -> str
239 """Return the entire tag string, e.g. <a href='foo'>"""
240 return self.s[self.start_pos:self.end_pos]
241
242 def GetTagName(self):
243 # type: () -> str
244 # First event
245 tok_id, start, end = next(self.Tokens())
246 return self.s[start:end]
247
248 def GetSpanForAttrValue(self, attr_name):
249 # type: (str) -> Tuple[int, int]
250 """
251 Used by oils_doc.py, for href shortcuts
252 """
253 # Algorithm: search for QuotedValue or UnquotedValue after AttrName
254 # TODO: Could also cache these
255
256 events = self.Tokens()
257 val = (-1, -1)
258 try:
259 while True:
260 tok_id, start, end = next(events)
261 if tok_id == h8_tag_id.AttrName:
262 name = self.s[start:end]
263 if name == attr_name:
264 # The value should come next
265 tok_id, start, end = next(events)
266 assert tok_id in (
267 h8_tag_id.QuotedValue, h8_tag_id.UnquotedValue,
268 h8_tag_id.MissingValue), h8_tag_id_str(tok_id)
269 val = start, end
270 break
271
272 except StopIteration:
273 pass
274 return val
275
276 def GetAttrRaw(self, attr_name):
277 # type: (str) -> Optional[str]
278 """
279 Return the value, which may be UNESCAPED.
280 """
281 start, end = self.GetSpanForAttrValue(attr_name)
282 if start == -1:
283 return None
284 return self.s[start:end]
285
286 def AllAttrsRawSlice(self):
287 # type: () -> List[Tuple[str, int, int]]
288 """
289 Get a list of pairs [('class', 3, 5), ('href', 9, 12)]
290 """
291 slices = []
292 events = self.Tokens()
293 try:
294 while True:
295 tok_id, start, end = next(events)
296 if tok_id == h8_tag_id.AttrName:
297 name = self.s[start:end]
298
299 # The value should come next
300 tok_id, start, end = next(events)
301 assert tok_id in (
302 h8_tag_id.QuotedValue, h8_tag_id.UnquotedValue,
303 h8_tag_id.MissingValue), h8_tag_id_str(tok_id)
304 # Note: quoted values may have &amp;
305 # We would need ANOTHER lexer to unescape them, but we
306 # don't need that for ul-table
307 slices.append((name, start, end))
308 except StopIteration:
309 pass
310 return slices
311
312 def AllAttrsRaw(self):
313 # type: () -> List[Tuple[str, str]]
314 """
315 Get a list of pairs [('class', 'foo'), ('href', '?foo=1&amp;bar=2')]
316
317 The quoted values may be escaped. We would need another lexer to
318 unescape them.
319 """
320 slices = self.AllAttrsRawSlice()
321 pairs = []
322 for name, start, end in slices:
323 pairs.append((name, self.s[start:end]))
324 return pairs
325
    def Tokens(self):
        # type: () -> Iterator[Tuple[h8_tag_id_t, int, int]]
        """
        Yields a sequence of tokens: Tag (AttrName AttrValue?)*

        Where each Token is (Type, start_pos, end_pos)

        Note that start and end are NOT redundant!  We skip over some unwanted
        characters.
        """
        # start_pos is the position of '<'; the tag name starts right after it.
        m = _TAG_RE.match(self.s, self.start_pos + 1)
        if not m:
            raise RuntimeError("Couldn't find HTML tag in %r" %
                               self.WholeTagString())
        yield h8_tag_id.TagName, m.start(1), m.end(1)

        pos = m.end(0)
        #log('POS %d', pos)

        # One _ATTR_RE match per attribute; groups 2/3/4 are the
        # double-quoted / single-quoted / unquoted value alternatives.
        while True:
            # don't search past the end
            m = _ATTR_RE.match(self.s, pos, self.end_pos)
            if not m:
                #log('BREAK pos %d', pos)
                break
            #log('AttrName %r', m.group(1))

            yield h8_tag_id.AttrName, m.start(1), m.end(1)

            #log('m.groups() %r', m.groups())
            if m.group(2) is not None:
                # double quoted
                yield h8_tag_id.QuotedValue, m.start(2), m.end(2)
            elif m.group(3) is not None:
                # single quoted - TODO: could have different token types
                yield h8_tag_id.QuotedValue, m.start(3), m.end(3)
            elif m.group(4) is not None:
                yield h8_tag_id.UnquotedValue, m.start(4), m.end(4)
            else:
                # <button disabled> - a zero-width MissingValue token
                end = m.end(0)
                yield h8_tag_id.MissingValue, end, end

            # Skip past the "
            pos = m.end(0)

        #log('TOK %r', self.s)

        # After the attributes we must be at the tag close (/? >);
        # anything else is a lex error.
        m = _TAG_LAST_RE.match(self.s, pos)
        #log('_TAG_LAST_RE match %r', self.s[pos:])
        if not m:
            raise LexError('Extra data at end of tag', self.s, pos)