#!/usr/bin/env python2
"""
lazylex/html.py - Wrapper around HTM8

See doc/lazylex.md for details.

"""
from __future__ import print_function

from _devbuild.gen.htm8_asdl import (h8_id, h8_id_t, h8_id_str)
from data_lang import htm8
from data_lang.htm8 import (Lexer, TagLexer, AttrValueLexer, LexError,
                            ParseError, Output)
from doctools.util import log

try:
    from cStringIO import StringIO
except ImportError:
    # for python3
    from io import StringIO  # type: ignore
import sys

if sys.version_info.major == 2:
    from typing import List, Tuple, Iterator


def _Tokens(s, left_pos, right_pos):
    # type: (str, int, int) -> Iterator[Tuple[h8_id_t, int]]
    """
    Args:
      s: string to parse
      left_pos, right_pos: Optional span boundaries.
    """
    lx = Lexer(s, left_pos, right_pos)
    while True:
        tok_id, pos = lx.Read()
        yield tok_id, pos
        if tok_id == h8_id.EndOfStream:
            break


def ValidTokens(s, left_pos=0, right_pos=-1):
    # type: (str, int, int) -> Iterator[Tuple[h8_id_t, int]]
44 """Wrapper around _Tokens to prevent callers from having to handle Invalid.
45
46 I'm not combining the two functions because I might want to do a
47 'yield' transformation on Tokens()? Exceptions might complicate the
48 issue?
49 """
    pos = left_pos
    for tok_id, end_pos in _Tokens(s, left_pos, right_pos):
        if tok_id == h8_id.Invalid:
            raise LexError('ValidTokens() got invalid token', s, pos)
        yield tok_id, end_pos
        pos = end_pos

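# Sketch of how ValidTokens() is consumed (hypothetical input; h8_id_str is
# the same helper used in main() below):
#
#   for tok_id, end_pos in ValidTokens('<p>hi</p>'):
#       print(h8_id_str(tok_id), end_pos)
#
# Each pair is (token ID, end position); a token starts where the previous
# one ended, which is why callers track 'pos' themselves.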

def ReadUntilStartTag(it, tag_lexer, tag_name):
    # type: (Iterator[Tuple[h8_id_t, int]], TagLexer, str) -> Tuple[int, int]
    """Find the next <foo>, returning its (start, end) positions

    Raise ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == h8_id.StartTag and tag_lexer.GetTagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No start tag %r' % tag_name)


def ReadUntilEndTag(it, tag_lexer, tag_name):
    # type: (Iterator[Tuple[h8_id_t, int]], TagLexer, str) -> Tuple[int, int]
    """Find the next </foo>, returning its (start, end) positions

    Raise ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == h8_id.EndTag and tag_lexer.GetTagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No end tag %r' % tag_name)

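# Sketch of how the two helpers compose (hypothetical <pre> example; assumes
# the iterator and the TagLexer were built from the same string 's'):
#
#   it = ValidTokens(s)
#   tag_lexer = TagLexer(s)
#   start, _ = ReadUntilStartTag(it, tag_lexer, 'pre')
#   _, end = ReadUntilEndTag(it, tag_lexer, 'pre')
#   pre_slice = s[start:end]  # spans <pre> ... </pre> inclusive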

CHAR_ENTITY = {
    'amp': '&',
    'lt': '<',
    'gt': '>',
    'quot': '"',
    'apos': "'",
}

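# For example, a CharEntity token spanning '&amp;' has name
# s[pos + 1:end_pos - 1] == 'amp', which CHAR_ENTITY maps to '&' -- this is
# exactly the slice ToText() uses below.  Numeric entities are separate
# HexChar/DecChar tokens.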

def ToText(s, left_pos=0, right_pos=-1):
    # type: (str, int, int) -> str
    """Given HTML, return text by unquoting &gt; and &lt; etc.

    Used by:
      doctools/oils_doc.py: PygmentsPlugin
      doctools/help_gen.py: HelpIndexCards

    In the latter case, we could process some tags, like:

    - Blue Link (not clickable, but still useful)
    - Red X

    That should be html.ToAnsi.
    """
    f = StringIO()
    out = Output(s, f, left_pos, right_pos)

    pos = left_pos
    for tok_id, end_pos in ValidTokens(s, left_pos, right_pos):
        if tok_id in (h8_id.RawData, h8_id.BadAmpersand, h8_id.BadGreaterThan,
                      h8_id.BadLessThan):
            out.SkipTo(pos)
            out.PrintUntil(end_pos)

        elif tok_id == h8_id.CharEntity:  # &amp;

            entity = s[pos + 1:end_pos - 1]

            out.SkipTo(pos)
            out.Print(CHAR_ENTITY[entity])
            out.SkipTo(end_pos)

        # Not handling these yet
        elif tok_id == h8_id.HexChar:
            raise AssertionError('Hex Char %r' % s[pos:pos + 20])

        elif tok_id == h8_id.DecChar:
            raise AssertionError('Dec Char %r' % s[pos:pos + 20])

        else:
            # Skip everything else
            out.SkipTo(end_pos)

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()

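# Usage sketch (hypothetical input; the result follows from the loop above):
#
#   ToText('<b>x &amp; y</b>')  # => 'x & y'
#
# Tags are skipped, raw data is copied through, and known named entities are
# decoded via CHAR_ENTITY.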

# https://developer.mozilla.org/en-US/docs/Glossary/Void_element
VOID_ELEMENTS = [
    'area',
    'base',
    'br',
    'col',
    'embed',
    'hr',
    'img',
    'input',
    'link',
    'meta',
    'param',
    'source',
    'track',
    'wbr',
]

LEX_ATTRS = 1 << 1
LEX_QUOTED_VALUES = 1 << 2  # href="?x=42&amp;y=99"
NO_SPECIAL_TAGS = 1 << 3  # <script> <style>, VOID tags, etc.
BALANCED_TAGS = 1 << 4  # are tags balanced?

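# The flags are combined with bitwise OR, as in main() below:
#
#   flags = LEX_ATTRS | LEX_QUOTED_VALUES
#   flags |= BALANCED_TAGS     # 'parse-*' actions check tag nesting
#   flags |= NO_SPECIAL_TAGS   # 'parse-xml' treats <script> like any tag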

def ValidateOld(contents, flags, counters):
    # type: (str, int, Counters) -> None

    tag_lexer = TagLexer(contents)
    val_lexer = AttrValueLexer(contents)

    no_special_tags = bool(flags & NO_SPECIAL_TAGS)
    lx = Lexer(contents, no_special_tags=no_special_tags)
    tokens = []
    start_pos = 0
    tag_stack = []
    while True:
        tok_id, end_pos = lx.Read()
        #log('TOP %s %r', h8_id_str(tok_id), contents[start_pos:end_pos])

        if tok_id == h8_id.Invalid:
            raise LexError('ValidateOld() got invalid token', contents,
                           start_pos)
        if tok_id == h8_id.EndOfStream:
            break

        tokens.append((tok_id, end_pos))

        if tok_id == h8_id.StartEndTag:
            counters.num_start_end_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRawSlice()
            counters.num_attrs += len(all_attrs)
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                counters.num_val_tokens += val_lexer.NumTokens()

            #counters.debug_attrs.extend(all_attrs)

        elif tok_id == h8_id.StartTag:
            counters.num_start_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRawSlice()
            counters.num_attrs += len(all_attrs)
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                counters.num_val_tokens += val_lexer.NumTokens()

            #counters.debug_attrs.extend(all_attrs)

            if flags & BALANCED_TAGS:
                tag_name = lx.CanonicalTagName()
                if flags & NO_SPECIAL_TAGS:
                    tag_stack.append(tag_name)
                else:
                    # e.g. <meta> is considered self-closing, like <meta/>
                    if tag_name not in VOID_ELEMENTS:
                        tag_stack.append(tag_name)

            counters.max_tag_stack = max(counters.max_tag_stack,
                                         len(tag_stack))
        elif tok_id == h8_id.EndTag:
            if flags & BALANCED_TAGS:
                try:
                    expected = tag_stack.pop()
                except IndexError:
                    raise ParseError('Tag stack empty',
                                     s=contents,
                                     start_pos=start_pos)

                actual = lx.CanonicalTagName()
                if expected != actual:
                    raise ParseError(
                        'Got unexpected closing tag %r; opening tag was %r' %
                        (contents[start_pos:end_pos], expected),
                        s=contents,
                        start_pos=start_pos)

        start_pos = end_pos

    if len(tag_stack) != 0:
        raise ParseError('Missing closing tags at end of doc: %s' %
                         ' '.join(tag_stack),
                         s=contents,
                         start_pos=start_pos)

    counters.num_tokens += len(tokens)


def Validate(contents, flags, counters):
    # type: (str, int, Counters) -> None

    attr_lx = htm8.AttrLexer(contents)

    no_special_tags = bool(flags & NO_SPECIAL_TAGS)
    lx = htm8.Lexer(contents, no_special_tags=no_special_tags)
    tokens = []
    start_pos = 0
    tag_stack = []
    while True:
        tok_id, end_pos = lx.Read()
        #log('TOP %s %r', h8_id_str(tok_id), contents[start_pos:end_pos])

        if tok_id == h8_id.Invalid:
            raise LexError('Validate() got invalid token', contents, start_pos)
        if tok_id == h8_id.EndOfStream:
            break

        tokens.append((tok_id, end_pos))

        if tok_id == h8_id.StartEndTag:
            counters.num_start_end_tags += 1

            attr_lx.Init(lx.TagNamePos(), end_pos)
            all_attrs = htm8.AllAttrsRaw(attr_lx)
            counters.num_attrs += len(all_attrs)
            # TODO: val_lexer.NumTokens() can be replaced with tokens_out

        elif tok_id == h8_id.StartTag:
            counters.num_start_tags += 1

            attr_lx.Init(lx.TagNamePos(), end_pos)
            all_attrs = htm8.AllAttrsRaw(attr_lx)
            counters.num_attrs += len(all_attrs)

            #counters.debug_attrs.extend(all_attrs)

            if flags & BALANCED_TAGS:
                tag_name = lx.CanonicalTagName()
                if flags & NO_SPECIAL_TAGS:
                    tag_stack.append(tag_name)
                else:
                    # e.g. <meta> is considered self-closing, like <meta/>
                    if tag_name not in VOID_ELEMENTS:
                        tag_stack.append(tag_name)

            counters.max_tag_stack = max(counters.max_tag_stack,
                                         len(tag_stack))
        elif tok_id == h8_id.EndTag:
            if flags & BALANCED_TAGS:
                try:
                    expected = tag_stack.pop()
                except IndexError:
                    raise ParseError('Tag stack empty',
                                     s=contents,
                                     start_pos=start_pos)

                actual = lx.CanonicalTagName()
                if expected != actual:
                    raise ParseError(
                        'Got unexpected closing tag %r; opening tag was %r' %
                        (contents[start_pos:end_pos], expected),
                        s=contents,
                        start_pos=start_pos)

        start_pos = end_pos

    if len(tag_stack) != 0:
        raise ParseError('Missing closing tags at end of doc: %s' %
                         ' '.join(tag_stack),
                         s=contents,
                         start_pos=start_pos)

    counters.num_tokens += len(tokens)

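# Usage sketch (hypothetical document; counters accumulate across calls):
#
#   counters = Counters()
#   Validate('<p class="x">hi</p>', LEX_ATTRS | BALANCED_TAGS, counters)
#   # counters.num_start_tags == 1, counters.num_attrs == 1
#
# LexError is raised for invalid tokens, ParseError for unbalanced tags.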

def ToXml(htm8_str):
    # type: (str) -> str

    # TODO:
    # 1. Lex it
    # 2. < & > must be escaped
    #    a. in raw data
    #    b. in quoted strings
    # 3. <script> turned into CDATA
    # 4. void tags turned into self-closing tags
    # 5. case-sensitive tag matching - not sure about this

    tag_lexer = TagLexer(htm8_str)
    val_lexer = AttrValueLexer(htm8_str)

    f = StringIO()
    out = Output(htm8_str, f)

    lx = Lexer(htm8_str)

    pos = 0
    while True:
        tok_id, end_pos = lx.Read()

        if tok_id == h8_id.Invalid:
            raise LexError('ToXml() got invalid token', htm8_str, pos)
        if tok_id == h8_id.EndOfStream:
            break

        if tok_id in (h8_id.RawData, h8_id.CharEntity, h8_id.HexChar,
                      h8_id.DecChar):
            out.PrintUntil(end_pos)
        elif tok_id in (h8_id.StartTag, h8_id.StartEndTag):
            tag_lexer.Reset(pos, end_pos)
            # TODO: reduce allocations here
            all_attrs = tag_lexer.AllAttrsRawSlice()
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                # TODO: get the kind of string
                #
                # Quoted  : we need to replace & with &amp; and < with &lt;
                #           note > is not allowed
                # Unquoted: right now, we can just surround with double quotes
                #           because we don't allow any bad chars
                # Empty   : add "", so empty= becomes =""
                # Missing : add ="", so missing becomes missing=""

            tag_name = lx.CanonicalTagName()
            if tok_id == h8_id.StartTag and tag_name in VOID_ELEMENTS:
                # TODO: instead of closing >, print />
                pass

        elif tok_id == h8_id.BadAmpersand:
            #out.SkipTo(pos)
            out.Print('&amp;')
            out.SkipTo(end_pos)

        elif tok_id == h8_id.BadGreaterThan:
            #out.SkipTo(pos)
            out.Print('&gt;')
            out.SkipTo(end_pos)
        else:
            out.PrintUntil(end_pos)

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()

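# Sketch of the current behavior (several TODO items above are still open, so
# this is not a complete XML conversion yet).  Assuming the lexer emits
# BadAmpersand/BadGreaterThan for stray characters in raw data:
#
#   ToXml('x & y > z')  # => 'x &amp; y &gt; z'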

class Counters(object):

    def __init__(self):
        # type: () -> None
        self.num_tokens = 0
        self.num_start_tags = 0
        self.num_start_end_tags = 0
        self.num_attrs = 0
        self.max_tag_stack = 0
        self.num_val_tokens = 0

        #self.debug_attrs = []


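# Example invocations (sketch; 'tokens' reads a document from stdin, while
# the lex/parse actions read a list of filenames from stdin):
#
#   echo '<p>hi</p>' | python2 lazylex/html.py tokens
#   find . -name '*.html' | python2 lazylex/html.py parse-htm8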
def main(argv):
    # type: (List[str]) -> int
    action = argv[1]

    if action == 'tokens':
        contents = sys.stdin.read()

        lx = Lexer(contents)
        start_pos = 0
        while True:
            tok_id, end_pos = lx.Read()
            if tok_id == h8_id.Invalid:
                raise LexError('Invalid token', contents, start_pos)
            if tok_id == h8_id.EndOfStream:
                break

            frag = contents[start_pos:end_pos]
            log('%d %s %r', end_pos, h8_id_str(tok_id), frag)
            start_pos = end_pos

        return 0

    elif action in ('lex-htm8', 'parse-htm8', 'parse-xml'):

        errors = []
        counters = Counters()

        flags = LEX_ATTRS | LEX_QUOTED_VALUES
        if action.startswith('parse-'):
            flags |= BALANCED_TAGS
        if action == 'parse-xml':
            flags |= NO_SPECIAL_TAGS

        i = 0
        for line in sys.stdin:
            filename = line.strip()
            with open(filename) as f:
                contents = f.read()

            try:
                Validate(contents, flags, counters)
            except LexError as e:
                log('Lex error in %r: %s', filename, e)
                errors.append((filename, e))
            except ParseError as e:
                log('Parse error in %r: %s', filename, e)
                errors.append((filename, e))
            i += 1

        log('')
        log('%10d tokens', counters.num_tokens)
        log('%10d start/end tags', counters.num_start_end_tags)
        log('%10d start tags', counters.num_start_tags)
        log('%10d attrs', counters.num_attrs)
        log('%10d max tag stack depth', counters.max_tag_stack)
        log('%10d attr val tokens', counters.num_val_tokens)
        log('%10d errors', len(errors))
        if len(errors):
            return 1
        return 0

    elif action == 'todo':
        # Other algorithms:
        #
        # - select first subtree with given ID
        #   - this requires understanding the void tags I suppose
        # - select all subtrees that have a class
        # - materialize DOM

        # Safe-HTM8? This is a filter
        return 0

    else:
        raise RuntimeError('Invalid action %r' % action)


if __name__ == '__main__':
    sys.exit(main(sys.argv))