#!/usr/bin/env python2
"""
lazylex/html.py - Wrapper around HTM8

See doc/lazylex.md for details.

"""
from __future__ import print_function

from _devbuild.gen.htm8_asdl import (h8_id, h8_id_t, h8_id_str)
from data_lang import htm8
from data_lang.htm8 import (Lexer, TagLexer, AttrValueLexer, LexError,
                            ParseError, Output)
from doctools.util import log

try:
    from cStringIO import StringIO
except ImportError:
    # for python3
    from io import StringIO  # type: ignore
import sys

if sys.version_info.major == 2:
    from typing import List, Tuple, Iterator


def _Tokens(s, left_pos, right_pos):
    # type: (str, int, int) -> Iterator[Tuple[h8_id_t, int]]
    """
    Args:
      s: string to parse
      left_pos, right_pos: Optional span boundaries.
    """
    lx = Lexer(s, left_pos, right_pos)
    while True:
        tok_id, pos = lx.Read()
        yield tok_id, pos
        if tok_id == h8_id.EndOfStream:
            break


def ValidTokens(s, left_pos=0, right_pos=-1):
    # type: (str, int, int) -> Iterator[Tuple[h8_id_t, int]]
    """Wrapper around _Tokens() that saves callers from handling h8_id.Invalid.

    The two functions aren't combined because we may want to do a 'yield'
    transformation on _Tokens(), and exceptions might complicate that.
    """
    pos = left_pos
    for tok_id, end_pos in _Tokens(s, left_pos, right_pos):
        if tok_id == h8_id.Invalid:
            raise LexError('ValidTokens() got invalid token', s, pos)
        yield tok_id, end_pos
        pos = end_pos
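

# Illustrative sketch (added; _ShowTokens is hypothetical, not part of the
# original API).  It shows the core iteration pattern: each token's text is
# the slice of s between successive end positions.
def _ShowTokens(s):
    # type: (str) -> None
    pos = 0
    for tok_id, end_pos in ValidTokens(s):
        log('%s %r', h8_id_str(tok_id), s[pos:end_pos])
        pos = end_pos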


def ReadUntilStartTag(it, tag_lexer, tag_name):
    # type: (Iterator[Tuple[h8_id_t, int]], TagLexer, str) -> Tuple[int, int]
    """Find the next <foo>, returning its (start, end) position.

    Raise ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == h8_id.StartTag and tag_lexer.GetTagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No start tag %r' % tag_name)


def ReadUntilEndTag(it, tag_lexer, tag_name):
    # type: (Iterator[Tuple[h8_id_t, int]], TagLexer, str) -> Tuple[int, int]
    """Find the next </foo>, returning its (start, end) position.

    Raise ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == h8_id.EndTag and tag_lexer.GetTagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No end tag %r' % tag_name)
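

# Hedged usage sketch (the helper name is hypothetical, not part of the
# original API): the typical call pattern is to share one token iterator
# and one TagLexer across ReadUntil*() calls.
def _FindStartTag(s, tag_name):
    # type: (str, str) -> Tuple[int, int]
    tag_lexer = TagLexer(s)
    it = ValidTokens(s)
    return ReadUntilStartTag(it, tag_lexer, tag_name)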


CHAR_ENTITY = {
    'amp': '&',
    'lt': '<',
    'gt': '>',
    'quot': '"',
    'apos': "'",
}


def ToText(s, left_pos=0, right_pos=-1):
    # type: (str, int, int) -> str
    """Given HTML, return text by unquoting &gt; and &lt;, etc.

    Used by:
      doctools/oils_doc.py: PygmentsPlugin
      doctools/help_gen.py: HelpIndexCards

    In the latter case, we could process some tags, like:

    - Blue Link (not clickable, but still useful)
    - Red X

    That should be html.ToAnsi.
    """
    f = StringIO()
    out = Output(s, f, left_pos, right_pos)

    pos = left_pos
    for tok_id, end_pos in ValidTokens(s, left_pos, right_pos):
        if tok_id in (h8_id.RawData, h8_id.BadAmpersand, h8_id.BadGreaterThan,
                      h8_id.BadLessThan):
            out.SkipTo(pos)
            out.PrintUntil(end_pos)

        elif tok_id == h8_id.CharEntity:  # &amp;

            entity = s[pos + 1:end_pos - 1]

            out.SkipTo(pos)
            out.Print(CHAR_ENTITY[entity])
            out.SkipTo(end_pos)

        # Not handling these yet
        elif tok_id == h8_id.HexChar:
            raise AssertionError('Hex Char %r' % s[pos:pos + 20])

        elif tok_id == h8_id.DecChar:
            raise AssertionError('Dec Char %r' % s[pos:pos + 20])

        else:
            # Skip everything else
            out.SkipTo(end_pos)

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()
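

# Hedged example (added for illustration): ToText() drops tags and decodes
# the five named entities in CHAR_ENTITY, so something like
#
#     ToText('<b>1 &lt; 2 &amp;&amp; 4 &gt; 3</b>')
#
# should yield '1 < 2 && 4 > 3'.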


# https://developer.mozilla.org/en-US/docs/Glossary/Void_element
VOID_ELEMENTS = [
    'area',
    'base',
    'br',
    'col',
    'embed',
    'hr',
    'img',
    'input',
    'link',
    'meta',
    'param',
    'source',
    'track',
    'wbr',
]

LEX_ATTRS = 1 << 1
LEX_QUOTED_VALUES = 1 << 2  # href="?x=42&amp;y=99"
NO_SPECIAL_TAGS = 1 << 3  # <script> <style>, VOID tags, etc.
BALANCED_TAGS = 1 << 4  # are tags balanced?


def ValidateOld(contents, flags, counters):
    # type: (str, int, Counters) -> None

    tag_lexer = TagLexer(contents)
    val_lexer = AttrValueLexer(contents)

    no_special_tags = bool(flags & NO_SPECIAL_TAGS)
    lx = Lexer(contents, no_special_tags=no_special_tags)
    tokens = []
    start_pos = 0
    tag_stack = []
    while True:
        tok_id, end_pos = lx.Read()
        #log('TOP %s %r', h8_id_str(tok_id), contents[start_pos:end_pos])

        if tok_id == h8_id.Invalid:
            raise LexError('ValidateOld() got invalid token', contents,
                           start_pos)
        if tok_id == h8_id.EndOfStream:
            break

        tokens.append((tok_id, end_pos))

        if tok_id == h8_id.StartEndTag:
            counters.num_start_end_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRawSlice()
            counters.num_attrs += len(all_attrs)
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                counters.num_val_tokens += val_lexer.NumTokens()

            #counters.debug_attrs.extend(all_attrs)

        elif tok_id == h8_id.StartTag:
            counters.num_start_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRawSlice()
            counters.num_attrs += len(all_attrs)
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                counters.num_val_tokens += val_lexer.NumTokens()

            #counters.debug_attrs.extend(all_attrs)

            if flags & BALANCED_TAGS:
                tag_name = lx.CanonicalTagName()
                if flags & NO_SPECIAL_TAGS:
                    tag_stack.append(tag_name)
                else:
                    # e.g. <meta> is considered self-closing, like <meta/>
                    if tag_name not in VOID_ELEMENTS:
                        tag_stack.append(tag_name)

                counters.max_tag_stack = max(counters.max_tag_stack,
                                             len(tag_stack))
        elif tok_id == h8_id.EndTag:
            if flags & BALANCED_TAGS:
                try:
                    expected = tag_stack.pop()
                except IndexError:
                    raise ParseError('Tag stack empty',
                                     s=contents,
                                     start_pos=start_pos)

                actual = lx.CanonicalTagName()
                if expected != actual:
                    raise ParseError(
                        'Got unexpected closing tag %r; opening tag was %r' %
                        (contents[start_pos:end_pos], expected),
                        s=contents,
                        start_pos=start_pos)

        start_pos = end_pos

    if len(tag_stack) != 0:
        raise ParseError('Missing closing tags at end of doc: %s' %
                         ' '.join(tag_stack),
                         s=contents,
                         start_pos=start_pos)

    counters.num_tokens += len(tokens)


def Validate(contents, flags, counters):
    # type: (str, int, Counters) -> None

    attr_lx = htm8.AttrLexer(contents)

    no_special_tags = bool(flags & NO_SPECIAL_TAGS)
    lx = htm8.Lexer(contents, no_special_tags=no_special_tags)
    tokens = []
    start_pos = 0
    tag_stack = []
    while True:
        tok_id, end_pos = lx.Read()
        #log('TOP %s %r', h8_id_str(tok_id), contents[start_pos:end_pos])

        if tok_id == h8_id.Invalid:
            raise LexError('Validate() got invalid token', contents, start_pos)
        if tok_id == h8_id.EndOfStream:
            break

        tokens.append((tok_id, end_pos))

        if tok_id == h8_id.StartEndTag:
            counters.num_start_end_tags += 1

            attr_lx.Init(lx.TagNamePos(), end_pos)
            all_attrs = htm8.AllAttrsRaw(attr_lx)
            counters.num_attrs += len(all_attrs)
            # TODO: val_lexer.NumTokens() can be replaced with tokens_out

        elif tok_id == h8_id.StartTag:
            counters.num_start_tags += 1

            attr_lx.Init(lx.TagNamePos(), end_pos)
            all_attrs = htm8.AllAttrsRaw(attr_lx)
            counters.num_attrs += len(all_attrs)

            #counters.debug_attrs.extend(all_attrs)

            if flags & BALANCED_TAGS:
                tag_name = lx.CanonicalTagName()
                if flags & NO_SPECIAL_TAGS:
                    tag_stack.append(tag_name)
                else:
                    # e.g. <meta> is considered self-closing, like <meta/>
                    if tag_name not in VOID_ELEMENTS:
                        tag_stack.append(tag_name)

                counters.max_tag_stack = max(counters.max_tag_stack,
                                             len(tag_stack))
        elif tok_id == h8_id.EndTag:
            if flags & BALANCED_TAGS:
                try:
                    expected = tag_stack.pop()
                except IndexError:
                    raise ParseError('Tag stack empty',
                                     s=contents,
                                     start_pos=start_pos)

                actual = lx.CanonicalTagName()
                if expected != actual:
                    raise ParseError(
                        'Got unexpected closing tag %r; opening tag was %r' %
                        (contents[start_pos:end_pos], expected),
                        s=contents,
                        start_pos=start_pos)

        start_pos = end_pos

    if len(tag_stack) != 0:
        raise ParseError('Missing closing tags at end of doc: %s' %
                         ' '.join(tag_stack),
                         s=contents,
                         start_pos=start_pos)

    counters.num_tokens += len(tokens)
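

# Hedged usage sketch (added; _ValidateDemo is hypothetical): validate one
# document and read back the counts.  The flag values mirror what main()
# below constructs for 'parse-htm8'.
def _ValidateDemo(contents):
    # type: (str) -> None
    counters = Counters()
    flags = LEX_ATTRS | LEX_QUOTED_VALUES | BALANCED_TAGS
    Validate(contents, flags, counters)
    log('%d tokens, %d attrs', counters.num_tokens, counters.num_attrs)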


def ToXml(htm8_str):
    # type: (str) -> str

    # TODO:
    # 1. Lex it
    # 2. < & > must be escaped
    #    a. in raw data
    #    b. in quoted strings
    # 3. <script> turned into CDATA
    # 4. void tags turned into self-closing tags
    # 5. case-sensitive tag matching - not sure about this

    tag_lexer = TagLexer(htm8_str)
    val_lexer = AttrValueLexer(htm8_str)

    f = StringIO()
    out = Output(htm8_str, f)

    lx = Lexer(htm8_str)

    pos = 0
    while True:
        tok_id, end_pos = lx.Read()

        if tok_id == h8_id.Invalid:
            raise LexError('ToXml() got invalid token', htm8_str, pos)
        if tok_id == h8_id.EndOfStream:
            break

        if tok_id in (h8_id.RawData, h8_id.CharEntity, h8_id.HexChar,
                      h8_id.DecChar):
            out.PrintUntil(end_pos)
        elif tok_id in (h8_id.StartTag, h8_id.StartEndTag):
            tag_lexer.Reset(pos, end_pos)
            # TODO: reduce allocations here
            all_attrs = tag_lexer.AllAttrsRawSlice()
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                # TODO: get the kind of string
                #
                # Quoted  : we need to replace & with &amp; and < with &lt;
                #           note > is not allowed
                # Unquoted: right now, we can just surround with double quotes
                #           because we don't allow any bad chars
                # Empty   : add "", so empty= becomes =""
                # Missing : add ="", so missing becomes missing=""

            tag_name = lx.CanonicalTagName()
            if tok_id == h8_id.StartTag and tag_name in VOID_ELEMENTS:
                # TODO: instead of closing >, print />
                pass

        elif tok_id == h8_id.BadAmpersand:
            #out.SkipTo(pos)
            out.Print('&amp;')
            out.SkipTo(end_pos)

        elif tok_id == h8_id.BadGreaterThan:
            #out.SkipTo(pos)
            out.Print('&gt;')
            out.SkipTo(end_pos)
        else:
            out.PrintUntil(end_pos)

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()
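

# Hedged example (added for illustration): given the lexing rules above, a
# stray '&' or '>' in raw data gets escaped on the way out, so something like
#
#     ToXml('<p>a & b</p>')
#
# should yield '<p>a &amp; b</p>'.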


class Counters(object):

    def __init__(self):
        # type: () -> None
        self.num_tokens = 0
        self.num_start_tags = 0
        self.num_start_end_tags = 0
        self.num_attrs = 0
        self.max_tag_stack = 0
        self.num_val_tokens = 0

        #self.debug_attrs = []


def main(argv):
    # type: (List[str]) -> int
    action = argv[1]

    if action == 'tokens':
        contents = sys.stdin.read()

        lx = Lexer(contents)
        start_pos = 0
        while True:
            tok_id, end_pos = lx.Read()
            if tok_id == h8_id.Invalid:
                raise LexError('Invalid token', contents, start_pos)
            if tok_id == h8_id.EndOfStream:
                break

            frag = contents[start_pos:end_pos]
            log('%d %s %r', end_pos, h8_id_str(tok_id), frag)
            start_pos = end_pos

        return 0

    elif action in ('lex-htm8', 'parse-htm8', 'parse-xml'):

        errors = []
        counters = Counters()

        flags = LEX_ATTRS | LEX_QUOTED_VALUES
        if action.startswith('parse-'):
            flags |= BALANCED_TAGS
        if action == 'parse-xml':
            flags |= NO_SPECIAL_TAGS

        i = 0
        for line in sys.stdin:
            filename = line.strip()
            with open(filename) as f:
                contents = f.read()

            try:
                Validate(contents, flags, counters)
            except LexError as e:
                log('Lex error in %r: %s', filename, e)
                errors.append((filename, e))
            except ParseError as e:
                log('Parse error in %r: %s', filename, e)
                errors.append((filename, e))
            i += 1

        log('')
        log('%10d tokens', counters.num_tokens)
        log('%10d start/end tags', counters.num_start_end_tags)
        log('%10d start tags', counters.num_start_tags)
        log('%10d attrs', counters.num_attrs)
        log('%10d max tag stack depth', counters.max_tag_stack)
        log('%10d attr val tokens', counters.num_val_tokens)
        log('%10d errors', len(errors))
        if len(errors):
            return 1
        return 0

    elif action == 'todo':
        # Other algorithms:
        #
        # - select first subtree with given ID
        #   - this requires understanding the void tags I suppose
        # - select all subtrees that have a class
        # - materialize DOM

        # Safe-HTM8?  This is a filter
        return 0

    else:
        raise RuntimeError('Invalid action %r' % action)


if __name__ == '__main__':
    sys.exit(main(sys.argv))