#!/usr/bin/env python2
"""
lazylex/html.py - Wrapper around HTM8

See doc/lazylex.md for details.

"""
from __future__ import print_function

from _devbuild.gen.htm8_asdl import (h8_id, h8_id_t, h8_id_str)
from data_lang.htm8 import (Lexer, TagLexer, AttrValueLexer, LexError,
                            ParseError, Output)
from doctools.util import log

try:
    from cStringIO import StringIO
except ImportError:
    # for python3
    from io import StringIO  # type: ignore
import sys

if sys.version_info.major == 2:
    from typing import List, Tuple, Iterator


def _Tokens(s, left_pos, right_pos):
    # type: (str, int, int) -> Iterator[Tuple[h8_id_t, int]]
    """
    Args:
      s: string to parse
      left_pos, right_pos: Optional span boundaries.
    """
    lx = Lexer(s, left_pos, right_pos)
    while True:
        tok_id, pos = lx.Read()
        yield tok_id, pos
        if tok_id == h8_id.EndOfStream:
            break


def ValidTokens(s, left_pos=0, right_pos=-1):
    # type: (str, int, int) -> Iterator[Tuple[h8_id_t, int]]
    """Wrapper around _Tokens that saves callers from handling h8_id.Invalid.

    The two functions aren't combined because we may want to do a 'yield'
    transformation on _Tokens(), and exceptions could complicate that.
    """
    pos = left_pos
    for tok_id, end_pos in _Tokens(s, left_pos, right_pos):
        if tok_id == h8_id.Invalid:
            raise LexError(s, pos)
        yield tok_id, end_pos
        pos = end_pos
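

# A sketch of typical use: tokens are (id, end_pos) pairs, and a token's text
# is the slice between the previous end position and its own.
#
#   s = '<p>hi &amp; bye</p>'
#   pos = 0
#   for tok_id, end_pos in ValidTokens(s):
#       print(h8_id_str(tok_id), s[pos:end_pos])
#       pos = end_pos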


def ReadUntilStartTag(it, tag_lexer, tag_name):
    # type: (Iterator[Tuple[h8_id_t, int]], TagLexer, str) -> Tuple[int, int]
    """Find the next <foo>, returning its (start, end) positions.

    Raises ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == h8_id.StartTag and tag_lexer.GetTagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No start tag %r' % tag_name)


def ReadUntilEndTag(it, tag_lexer, tag_name):
    # type: (Iterator[Tuple[h8_id_t, int]], TagLexer, str) -> Tuple[int, int]
    """Find the next </foo>, returning its (start, end) positions.

    Raises ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == h8_id.EndTag and tag_lexer.GetTagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No end tag %r' % tag_name)
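

# A sketch of how these helpers compose (illustrative, not called here):
# slice out everything between <body> and </body>.  The start tag's end
# position and the end tag's start position bound the inner content.
#
#   it = ValidTokens(s)
#   tag_lexer = TagLexer(s)
#   _, body_start = ReadUntilStartTag(it, tag_lexer, 'body')
#   body_end, _ = ReadUntilEndTag(it, tag_lexer, 'body')
#   inner = s[body_start:body_end]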


CHAR_ENTITY = {
    'amp': '&',
    'lt': '<',
    'gt': '>',
    'quot': '"',
    'apos': "'",
}


def ToText(s, left_pos=0, right_pos=-1):
    # type: (str, int, int) -> str
    """Given HTML, return text by unquoting &gt; and &lt; etc.

    Used by:
      doctools/oils_doc.py: PygmentsPlugin
      doctools/help_gen.py: HelpIndexCards

    In the latter case, we could process some tags, like:

    - Blue Link (not clickable, but still useful)
    - Red X

    That should be html.ToAnsi.
    """
    f = StringIO()
    out = Output(s, f, left_pos, right_pos)

    pos = left_pos
    for tok_id, end_pos in ValidTokens(s, left_pos, right_pos):
        if tok_id in (h8_id.RawData, h8_id.BadAmpersand, h8_id.BadGreaterThan,
                      h8_id.BadLessThan):
            out.SkipTo(pos)
            out.PrintUntil(end_pos)

        elif tok_id == h8_id.CharEntity:  # &amp;

            entity = s[pos + 1:end_pos - 1]

            out.SkipTo(pos)
            out.Print(CHAR_ENTITY[entity])
            out.SkipTo(end_pos)

        # Not handling these yet
        elif tok_id == h8_id.HexChar:
            raise AssertionError('Hex Char %r' % s[pos:pos + 20])

        elif tok_id == h8_id.DecChar:
            raise AssertionError('Dec Char %r' % s[pos:pos + 20])

        else:
            # Skip everything else
            out.SkipTo(end_pos)

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()
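

# For example (a sketch; assumes the input only uses entities in CHAR_ENTITY):
#
#   ToText('<b>1 &lt; 2 &amp;&amp; 4 &gt; 3</b>')
#   # => '1 < 2 && 4 > 3'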


# https://developer.mozilla.org/en-US/docs/Glossary/Void_element
VOID_ELEMENTS = [
    'area',
    'base',
    'br',
    'col',
    'embed',
    'hr',
    'img',
    'input',
    'link',
    'meta',
    'param',
    'source',
    'track',
    'wbr',
]

LEX_ATTRS = 1 << 1
LEX_QUOTED_VALUES = 1 << 2  # href="?x=42&amp;y=99"
NO_SPECIAL_TAGS = 1 << 3  # <script> <style>, VOID tags, etc.
BALANCED_TAGS = 1 << 4  # are tags balanced?
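

# The flags combine bitwise.  For instance, main() below validates XML-like
# input with all four:
#
#   flags = LEX_ATTRS | LEX_QUOTED_VALUES | BALANCED_TAGS | NO_SPECIAL_TAGS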


def Validate(contents, flags, counters):
    # type: (str, int, Counters) -> None

    tag_lexer = TagLexer(contents)
    val_lexer = AttrValueLexer(contents)

    no_special_tags = bool(flags & NO_SPECIAL_TAGS)
    lx = Lexer(contents, no_special_tags=no_special_tags)
    tokens = []
    start_pos = 0
    tag_stack = []
    while True:
        tok_id, end_pos = lx.Read()
        #log('TOP %s %r', h8_id_str(tok_id), contents[start_pos:end_pos])

        if tok_id == h8_id.Invalid:
            raise LexError(contents, start_pos)
        if tok_id == h8_id.EndOfStream:
            break

        tokens.append((tok_id, end_pos))

        if tok_id == h8_id.StartEndTag:
            counters.num_start_end_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRawSlice()
            counters.num_attrs += len(all_attrs)
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                counters.num_val_tokens += val_lexer.NumTokens()

            #counters.debug_attrs.extend(all_attrs)

        elif tok_id == h8_id.StartTag:
            counters.num_start_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRawSlice()
            counters.num_attrs += len(all_attrs)
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                counters.num_val_tokens += val_lexer.NumTokens()

            #counters.debug_attrs.extend(all_attrs)

            if flags & BALANCED_TAGS:
                tag_name = lx.CanonicalTagName()
                if flags & NO_SPECIAL_TAGS:
                    tag_stack.append(tag_name)
                else:
                    # e.g. <meta> is considered self-closing, like <meta/>
                    if tag_name not in VOID_ELEMENTS:
                        tag_stack.append(tag_name)

                counters.max_tag_stack = max(counters.max_tag_stack,
                                             len(tag_stack))
        elif tok_id == h8_id.EndTag:
            if flags & BALANCED_TAGS:
                try:
                    expected = tag_stack.pop()
                except IndexError:
                    raise ParseError('Tag stack empty',
                                     s=contents,
                                     start_pos=start_pos)

                actual = lx.CanonicalTagName()
                if expected != actual:
                    raise ParseError(
                        'Got unexpected closing tag %r; opening tag was %r' %
                        (contents[start_pos:end_pos], expected),
                        s=contents,
                        start_pos=start_pos)

        start_pos = end_pos

    if len(tag_stack) != 0:
        raise ParseError('Missing closing tags at end of doc: %s' %
                         ' '.join(tag_stack),
                         s=contents,
                         start_pos=start_pos)

    counters.num_tokens += len(tokens)
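

# A usage sketch, mirroring the 'parse-htm8' action in main() below:
#
#   counters = Counters()
#   flags = LEX_ATTRS | LEX_QUOTED_VALUES | BALANCED_TAGS
#   Validate(contents, flags, counters)  # raises LexError or ParseError
#   log('%d tokens', counters.num_tokens)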


def ToXml(htm8_str):
    # type: (str) -> str

    # TODO:
    # 1. Lex it
    # 2. < & > must be escaped
    #    a. in raw data
    #    b. in quoted strings
    # 3. <script> turned into CDATA
    # 4. void tags turned into self-closing tags
    # 5. case-sensitive tag matching - not sure about this

    tag_lexer = TagLexer(htm8_str)
    val_lexer = AttrValueLexer(htm8_str)

    f = StringIO()
    out = Output(htm8_str, f)

    lx = Lexer(htm8_str)

    pos = 0
    while True:
        tok_id, end_pos = lx.Read()

        if tok_id == h8_id.Invalid:
            raise LexError(htm8_str, pos)
        if tok_id == h8_id.EndOfStream:
            break

        if tok_id in (h8_id.RawData, h8_id.CharEntity, h8_id.HexChar,
                      h8_id.DecChar):
            out.PrintUntil(end_pos)
        elif tok_id in (h8_id.StartTag, h8_id.StartEndTag):
            tag_lexer.Reset(pos, end_pos)
            # TODO: reduce allocations here
            all_attrs = tag_lexer.AllAttrsRawSlice()
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                # TODO: get the kind of string
                #
                # Quoted:   we need to replace & with &amp; and < with &lt;
                #           note > is not allowed
                # Unquoted: right now, we can just surround with double quotes
                #           because we don't allow any bad chars
                # Empty:    add "", so empty= becomes =""
                # Missing:  add ="", so missing becomes missing=""

            tag_name = lx.CanonicalTagName()
            if tok_id == h8_id.StartTag and tag_name in VOID_ELEMENTS:
                # TODO: instead of closing >, print />
                pass

        elif tok_id == h8_id.BadAmpersand:
            #out.SkipTo(pos)
            out.Print('&amp;')
            out.SkipTo(end_pos)

        elif tok_id == h8_id.BadGreaterThan:
            #out.SkipTo(pos)
            out.Print('&gt;')
            out.SkipTo(end_pos)
        else:
            out.PrintUntil(end_pos)

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()
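

# The TODOs above are incomplete; what works now is escaping stray '&' and '>'
# in raw data.  An illustrative example (assuming the lexer emits BadAmpersand
# and BadGreaterThan for them):
#
#   ToXml('<p>a > b &</p>')
#   # => '<p>a &gt; b &amp;</p>'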


class Counters(object):

    def __init__(self):
        # type: () -> None
        self.num_tokens = 0
        self.num_start_tags = 0
        self.num_start_end_tags = 0
        self.num_attrs = 0
        self.max_tag_stack = 0
        self.num_val_tokens = 0

        #self.debug_attrs = []
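

# Invocation sketches (assuming this file is run as a script from the repo
# root):
#
#   echo '<p>hi</p>' | lazylex/html.py tokens
#   echo foo.html | lazylex/html.py parse-htm8    # filenames on stdin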


def main(argv):
    # type: (List[str]) -> int
    action = argv[1]

    if action == 'tokens':
        contents = sys.stdin.read()

        lx = Lexer(contents)
        start_pos = 0
        while True:
            tok_id, end_pos = lx.Read()
            if tok_id == h8_id.Invalid:
                raise LexError(contents, start_pos)
            if tok_id == h8_id.EndOfStream:
                break

            frag = contents[start_pos:end_pos]
            log('%d %s %r', end_pos, h8_id_str(tok_id), frag)
            start_pos = end_pos

        return 0

    elif action in ('lex-htm8', 'parse-htm8', 'parse-xml'):

        errors = []
        counters = Counters()

        flags = LEX_ATTRS | LEX_QUOTED_VALUES
        if action.startswith('parse-'):
            flags |= BALANCED_TAGS
        if action == 'parse-xml':
            flags |= NO_SPECIAL_TAGS

        i = 0
        for line in sys.stdin:
            filename = line.strip()
            with open(filename) as f:
                contents = f.read()

            try:
                Validate(contents, flags, counters)
            except LexError as e:
                log('Lex error in %r: %s', filename, e)
                errors.append((filename, e))
            except ParseError as e:
                log('Parse error in %r: %s', filename, e)
                errors.append((filename, e))
            i += 1

        log('')
        log('%10d tokens', counters.num_tokens)
        log('%10d start/end tags', counters.num_start_end_tags)
        log('%10d start tags', counters.num_start_tags)
        log('%10d attrs', counters.num_attrs)
        log('%10d max tag stack depth', counters.max_tag_stack)
        log('%10d attr val tokens', counters.num_val_tokens)
        log('%10d errors', len(errors))
        if len(errors):
            return 1
        return 0

    elif action == 'todo':
        # Other algorithms:
        #
        # - select first subtree with given ID
        #   - this requires understanding the void tags I suppose
        # - select all subtrees that have a class
        # - materialize DOM

        # Safe-HTM8? This is a filter
        return 0

    else:
        raise RuntimeError('Invalid action %r' % action)


if __name__ == '__main__':
    sys.exit(main(sys.argv))