#!/usr/bin/env python2
"""
lazylex/html.py - Wrapper around HTM8

See doc/lazylex.md for details.

"""
from __future__ import print_function

from _devbuild.gen.htm8_asdl import (h8_id, h8_id_t, h8_id_str)
from data_lang.htm8 import (Lexer, TagLexer, AttrValueLexer, LexError,
                            ParseError, Output)
from doctools.util import log

try:
    from cStringIO import StringIO
except ImportError:
    # for python3
    from io import StringIO  # type: ignore
import sys

if sys.version_info.major == 2:
    from typing import List, Tuple, Iterator


def _Tokens(s, left_pos, right_pos):
    # type: (str, int, int) -> Iterator[Tuple[h8_id_t, int]]
    """
    Args:
      s: string to parse
      left_pos, right_pos: Optional span boundaries.
    """
    lx = Lexer(s, left_pos, right_pos)
    while True:
        tok_id, pos = lx.Read()
        yield tok_id, pos
        if tok_id == h8_id.EndOfStream:
            break


def ValidTokens(s, left_pos=0, right_pos=-1):
    # type: (str, int, int) -> Iterator[Tuple[h8_id_t, int]]
    """Wrapper around _Tokens() to prevent callers from having to handle Invalid.

    The two functions aren't combined because we may want to do a 'yield'
    transformation on _Tokens(), and exceptions might complicate that.
    """
    pos = left_pos
    for tok_id, end_pos in _Tokens(s, left_pos, right_pos):
        if tok_id == h8_id.Invalid:
            raise LexError(s, pos)
        yield tok_id, end_pos
        pos = end_pos

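# A minimal usage sketch (not part of the original file): iterate over the
# token stream and print each token type with its end position.  LexError
# propagates out of the generator if the input doesn't lex.
#
#     html = '<p>hi &amp; bye</p>'
#     for tok_id, end_pos in ValidTokens(html):
#         print(h8_id_str(tok_id), end_pos)
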

def ValidTokenList(s, no_special_tags=False):
    # type: (str, bool) -> List[Tuple[h8_id_t, int]]
    """A wrapper that can be more easily translated to C++. Doesn't use iterators."""

    start_pos = 0
    tokens = []
    lx = Lexer(s, no_special_tags=no_special_tags)
    while True:
        tok_id, end_pos = lx.Read()
        tokens.append((tok_id, end_pos))
        if tok_id == h8_id.EndOfStream:
            break
        if tok_id == h8_id.Invalid:
            raise LexError(s, start_pos)
        start_pos = end_pos
    return tokens


def ReadUntilStartTag(it, tag_lexer, tag_name):
    # type: (Iterator[Tuple[h8_id_t, int]], TagLexer, str) -> Tuple[int, int]
    """Find the next <foo>, returning its (start, end) position.

    Raises ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == h8_id.StartTag and tag_lexer.GetTagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No start tag %r' % tag_name)


def ReadUntilEndTag(it, tag_lexer, tag_name):
    # type: (Iterator[Tuple[h8_id_t, int]], TagLexer, str) -> Tuple[int, int]
    """Find the next </foo>, returning its (start, end) position.

    Raises ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == h8_id.EndTag and tag_lexer.GetTagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No end tag %r' % tag_name)


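# A usage sketch (not in the original file): find the span of the first
# <title> ... </title> pair.  The same iterator is advanced by both calls, so
# the end tag search continues where the start tag search stopped.
#
#     it = ValidTokens(html)
#     tag_lexer = TagLexer(html)
#     start, _ = ReadUntilStartTag(it, tag_lexer, 'title')
#     _, end = ReadUntilEndTag(it, tag_lexer, 'title')
#     title_span = html[start:end]

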
CHAR_ENTITY = {
    'amp': '&',
    'lt': '<',
    'gt': '>',
    'quot': '"',
    'apos': "'",
}


def ToText(s, left_pos=0, right_pos=-1):
    # type: (str, int, int) -> str
    """Given HTML, return text by unquoting &gt; and &lt; etc.

    Used by:
      doctools/oils_doc.py: PygmentsPlugin
      doctools/help_gen.py: HelpIndexCards

    In the latter case, we could process some tags, like:

    - Blue Link (not clickable, but still useful)
    - Red X

    That should be html.ToAnsi.
    """
    f = StringIO()
    out = Output(s, f, left_pos, right_pos)

    pos = left_pos
    for tok_id, end_pos in ValidTokens(s, left_pos, right_pos):
        if tok_id in (h8_id.RawData, h8_id.BadAmpersand, h8_id.BadGreaterThan,
                      h8_id.BadLessThan):
            out.SkipTo(pos)
            out.PrintUntil(end_pos)

        elif tok_id == h8_id.CharEntity:  # &amp;

            entity = s[pos + 1:end_pos - 1]

            out.SkipTo(pos)
            out.Print(CHAR_ENTITY[entity])
            out.SkipTo(end_pos)

        # Not handling these yet
        elif tok_id == h8_id.HexChar:
            raise AssertionError('Hex Char %r' % s[pos:pos + 20])

        elif tok_id == h8_id.DecChar:
            raise AssertionError('Dec Char %r' % s[pos:pos + 20])

        else:
            # Skip everything else
            out.SkipTo(end_pos)

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()


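# A minimal sketch of the behavior (not in the original file): tags are
# dropped, raw data is kept, and the five entities in CHAR_ENTITY are
# decoded, so something like this should hold:
#
#     ToText('foo &amp; <b>bar</b>')  # => 'foo & bar'

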
# https://developer.mozilla.org/en-US/docs/Glossary/Void_element
VOID_ELEMENTS = [
    'area',
    'base',
    'br',
    'col',
    'embed',
    'hr',
    'img',
    'input',
    'link',
    'meta',
    'param',
    'source',
    'track',
    'wbr',
]

LEX_ATTRS = 1 << 1
LEX_QUOTED_VALUES = 1 << 2  # href="?x=42&amp;y=99"
NO_SPECIAL_TAGS = 1 << 3  # <script> <style>, VOID tags, etc.
BALANCED_TAGS = 1 << 4  # are tags balanced?
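
# How main() below combines these flags (a note, not in the original file):
# 'lex-htm8' uses LEX_ATTRS | LEX_QUOTED_VALUES; 'parse-htm8' adds
# BALANCED_TAGS; and 'parse-xml' adds NO_SPECIAL_TAGS on top of that, e.g.:
#
#     flags = LEX_ATTRS | LEX_QUOTED_VALUES | BALANCED_TAGS | NO_SPECIAL_TAGS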


def Validate(contents, flags, counters):
    # type: (str, int, Counters) -> None

    tag_lexer = TagLexer(contents)
    val_lexer = AttrValueLexer(contents)

    no_special_tags = bool(flags & NO_SPECIAL_TAGS)
    lx = Lexer(contents, no_special_tags=no_special_tags)
    tokens = []
    start_pos = 0
    tag_stack = []
    while True:
        tok_id, end_pos = lx.Read()
        #log('TOP %s %r', h8_id_str(tok_id), contents[start_pos:end_pos])

        if tok_id == h8_id.Invalid:
            raise LexError(contents, start_pos)
        if tok_id == h8_id.EndOfStream:
            break

        tokens.append((tok_id, end_pos))

        if tok_id == h8_id.StartEndTag:
            counters.num_start_end_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRawSlice()
            counters.num_attrs += len(all_attrs)
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                counters.num_val_tokens += val_lexer.NumTokens()

            #counters.debug_attrs.extend(all_attrs)

        elif tok_id == h8_id.StartTag:
            counters.num_start_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRawSlice()
            counters.num_attrs += len(all_attrs)
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                counters.num_val_tokens += val_lexer.NumTokens()

            #counters.debug_attrs.extend(all_attrs)

            if flags & BALANCED_TAGS:
                tag_name = lx.CanonicalTagName()
                if flags & NO_SPECIAL_TAGS:
                    tag_stack.append(tag_name)
                else:
                    # e.g. <meta> is considered self-closing, like <meta/>
                    if tag_name not in VOID_ELEMENTS:
                        tag_stack.append(tag_name)

            counters.max_tag_stack = max(counters.max_tag_stack,
                                         len(tag_stack))
        elif tok_id == h8_id.EndTag:
            if flags & BALANCED_TAGS:
                try:
                    expected = tag_stack.pop()
                except IndexError:
                    raise ParseError('Tag stack empty',
                                     s=contents,
                                     start_pos=start_pos)

                actual = lx.CanonicalTagName()
                if expected != actual:
                    raise ParseError(
                        'Got unexpected closing tag %r; opening tag was %r' %
                        (contents[start_pos:end_pos], expected),
                        s=contents,
                        start_pos=start_pos)

        start_pos = end_pos

    if len(tag_stack) != 0:
        raise ParseError('Missing closing tags at end of doc: %s' %
                         ' '.join(tag_stack),
                         s=contents,
                         start_pos=start_pos)

    counters.num_tokens += len(tokens)

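# A usage sketch (not in the original file): validate a document and read the
# counts back off the Counters object.
#
#     counters = Counters()
#     Validate('<p class="x">hi</p>',
#              LEX_ATTRS | LEX_QUOTED_VALUES | BALANCED_TAGS, counters)
#     print(counters.num_start_tags)  # => 1

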
def ToXml(htm8_str):
    # type: (str) -> str

    # TODO:
    # 1. Lex it
    # 2. < & > must be escaped
    #    a. in raw data
    #    b. in quoted strings
    # 3. <script> turned into CDATA
    # 4. void tags turned into self-closing tags
    # 5. case-sensitive tag matching - not sure about this

    tag_lexer = TagLexer(htm8_str)
    val_lexer = AttrValueLexer(htm8_str)

    f = StringIO()
    out = Output(htm8_str, f)

    lx = Lexer(htm8_str)

    pos = 0
    while True:
        tok_id, end_pos = lx.Read()

        if tok_id == h8_id.Invalid:
            raise LexError(htm8_str, pos)
        if tok_id == h8_id.EndOfStream:
            break

        if tok_id in (h8_id.RawData, h8_id.CharEntity, h8_id.HexChar,
                      h8_id.DecChar):
            out.PrintUntil(end_pos)
        elif tok_id in (h8_id.StartTag, h8_id.StartEndTag):
            tag_lexer.Reset(pos, end_pos)
            # TODO: reduce allocations here
            all_attrs = tag_lexer.AllAttrsRawSlice()
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                # TODO: get the kind of string
                #
                # Quoted:   we need to replace & with &amp; and < with &lt;
                #           note: > is not allowed
                # Unquoted: right now we can just surround it with double
                #           quotes, because we don't allow any bad chars
                # Empty:    add "", so empty= becomes =""
                # Missing:  add ="", so missing becomes missing=""

            tag_name = lx.CanonicalTagName()
            if tok_id == h8_id.StartTag and tag_name in VOID_ELEMENTS:
                # TODO: instead of closing >, print />
                pass

        elif tok_id == h8_id.BadAmpersand:
            #out.SkipTo(pos)
            out.Print('&amp;')
            out.SkipTo(end_pos)

        elif tok_id == h8_id.BadGreaterThan:
            #out.SkipTo(pos)
            out.Print('&gt;')
            out.SkipTo(end_pos)
        else:
            out.PrintUntil(end_pos)

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()

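# A sketch of what's implemented so far (not in the original file): stray '&'
# and '>' in raw data are escaped; the TODOs above (CDATA, self-closing void
# tags, attribute quoting) are not done yet.
#
#     ToXml('<p>a & b</p>')  # => '<p>a &amp; b</p>'

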
class Counters(object):

    def __init__(self):
        # type: () -> None
        self.num_tokens = 0
        self.num_start_tags = 0
        self.num_start_end_tags = 0
        self.num_attrs = 0
        self.max_tag_stack = 0
        self.num_val_tokens = 0

        #self.debug_attrs = []


def main(argv):
    # type: (List[str]) -> int
    action = argv[1]

    if action == 'tokens':
        contents = sys.stdin.read()

        lx = Lexer(contents)
        start_pos = 0
        while True:
            tok_id, end_pos = lx.Read()
            if tok_id == h8_id.Invalid:
                raise LexError(contents, start_pos)
            if tok_id == h8_id.EndOfStream:
                break

            frag = contents[start_pos:end_pos]
            log('%d %s %r', end_pos, h8_id_str(tok_id), frag)
            start_pos = end_pos

        return 0

    elif action in ('lex-htm8', 'parse-htm8', 'parse-xml'):

        errors = []
        counters = Counters()

        flags = LEX_ATTRS | LEX_QUOTED_VALUES
        if action.startswith('parse-'):
            flags |= BALANCED_TAGS
        if action == 'parse-xml':
            flags |= NO_SPECIAL_TAGS

        i = 0
        for line in sys.stdin:
            filename = line.strip()
            with open(filename) as f:
                contents = f.read()

            try:
                Validate(contents, flags, counters)
            except LexError as e:
                log('Lex error in %r: %s', filename, e)
                errors.append((filename, e))
            except ParseError as e:
                log('Parse error in %r: %s', filename, e)
                errors.append((filename, e))
            i += 1

        log('')
        log('%10d tokens', counters.num_tokens)
        log('%10d start/end tags', counters.num_start_end_tags)
        log('%10d start tags', counters.num_start_tags)
        log('%10d attrs', counters.num_attrs)
        log('%10d max tag stack depth', counters.max_tag_stack)
        log('%10d attr val tokens', counters.num_val_tokens)
        log('%10d errors', len(errors))
        if len(errors):
            return 1
        return 0

    elif action == 'todo':
        # Other algorithms:
        #
        # - select first subtree with given ID
        #   - this requires understanding the void tags I suppose
        # - select all subtrees that have a class
        # - materialize DOM

        # Safe-HTM8? This is a filter
        return 0

    else:
        raise RuntimeError('Invalid action %r' % action)

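# Example invocations, as a sketch (this file is executable via the shebang
# above; the exact paths are assumptions, not from the original file):
#
#     echo '<p>hi</p>' | lazylex/html.py tokens
#     find . -name '*.html' | lazylex/html.py parse-htm8
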
if __name__ == '__main__':
    sys.exit(main(sys.argv))