#!/usr/bin/env python2
"""
lazylex/html.py - Low-Level HTML Processing.

See lazylex/README.md for details.

TODO:
- Get rid of AttrValueLexer - this should be in the TagLexer
  - this also means that unquoted values can be more similar
  - We can use a single lexer mode for everything inside <>
    - the SPACE is the only difference
- UTF-8 check, like JSON8
- Static typing

"""
from __future__ import print_function

from _devbuild.gen.htm8_asdl import (h8_id, h8_id_t, h8_id_str)
from data_lang.htm8 import (Lexer, TagLexer, AttrValueLexer, LexError,
                            ParseError, Output)
from doctools.util import log

try:
    from cStringIO import StringIO
except ImportError:
    # for python3
    from io import StringIO  # type: ignore
import sys

if sys.version_info.major == 2:
    from typing import List, Tuple, Iterator


def _Tokens(s, left_pos, right_pos):
    # type: (str, int, int) -> Iterator[Tuple[h8_id_t, int]]
    """
    Args:
      s: string to parse
      left_pos, right_pos: Optional span boundaries.
    """
    lx = Lexer(s, left_pos, right_pos)
    while True:
        tok_id, pos = lx.Read()
        yield tok_id, pos
        if tok_id == h8_id.EndOfStream:
            break


def ValidTokens(s, left_pos=0, right_pos=-1):
    # type: (str, int, int) -> Iterator[Tuple[h8_id_t, int]]
    """Wrapper around _Tokens to prevent callers from having to handle Invalid.

    The two functions aren't combined, because we might want to do a 'yield'
    transformation on _Tokens(), and exceptions might complicate that.
    """
    pos = left_pos
    for tok_id, end_pos in _Tokens(s, left_pos, right_pos):
        if tok_id == h8_id.Invalid:
            raise LexError(s, pos)
        yield tok_id, end_pos
        pos = end_pos

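# Usage sketch: iterate over valid tokens and print their IDs.  The input
# string is an arbitrary example; h8_id_str comes from the generated ASDL
# module imported above.
#
#   for tok_id, end_pos in ValidTokens('<p>hi</p>'):
#       print(h8_id_str(tok_id), end_pos)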

def ValidTokenList(s, no_special_tags=False):
    # type: (str, bool) -> List[Tuple[h8_id_t, int]]
    """A wrapper that can be more easily translated to C++.  Doesn't use iterators."""

    start_pos = 0
    tokens = []
    lx = Lexer(s, no_special_tags=no_special_tags)
    while True:
        tok_id, end_pos = lx.Read()
        tokens.append((tok_id, end_pos))
        if tok_id == h8_id.EndOfStream:
            break
        if tok_id == h8_id.Invalid:
            raise LexError(s, start_pos)
        start_pos = end_pos
    return tokens

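# Sketch of the list form (same token stream, no generator state); again the
# input string is just an illustration:
#
#   toks = ValidTokenList('<p>hi</p>')
#   assert toks[-1][0] == h8_id.EndOfStream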

def ReadUntilStartTag(it, tag_lexer, tag_name):
    # type: (Iterator[Tuple[h8_id_t, int]], TagLexer, str) -> Tuple[int, int]
    """Find the next <foo>, returning its (start, end) position.

    Raises ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == h8_id.StartTag and tag_lexer.GetTagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No start tag %r' % tag_name)


def ReadUntilEndTag(it, tag_lexer, tag_name):
    # type: (Iterator[Tuple[h8_id_t, int]], TagLexer, str) -> Tuple[int, int]
    """Find the next </foo>, returning its (start, end) position.

    Raises ParseError if it's not found.

    tag_lexer is RESET.
    """
    pos = 0
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break
        tag_lexer.Reset(pos, end_pos)
        if tok_id == h8_id.EndTag and tag_lexer.GetTagName() == tag_name:
            return pos, end_pos

        pos = end_pos

    raise ParseError('No end tag %r' % tag_name)

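# Sketch of finding a tag's span with a fresh token iterator; assumes the
# document in s actually contains a <title> start tag:
#
#   it = ValidTokens(s)
#   tag_lexer = TagLexer(s)
#   start, end = ReadUntilStartTag(it, tag_lexer, 'title')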

CHAR_ENTITY = {
    'amp': '&',
    'lt': '<',
    'gt': '>',
    'quot': '"',
    'apos': "'",
}


def ToText(s, left_pos=0, right_pos=-1):
    # type: (str, int, int) -> str
    """Given HTML, return text by unquoting &gt; and &lt; etc.

    Used by:
      doctools/oils_doc.py: PygmentsPlugin
      doctools/help_gen.py: HelpIndexCards

    In the latter case, we could process some tags, like:

    - Blue Link (not clickable, but still useful)
    - Red X

    That should be html.ToAnsi.
    """
    f = StringIO()
    out = Output(s, f, left_pos, right_pos)

    pos = left_pos
    for tok_id, end_pos in ValidTokens(s, left_pos, right_pos):
        if tok_id in (h8_id.RawData, h8_id.BadAmpersand, h8_id.BadGreaterThan,
                      h8_id.BadLessThan):
            out.SkipTo(pos)
            out.PrintUntil(end_pos)

        elif tok_id == h8_id.CharEntity:  # &amp;

            entity = s[pos + 1:end_pos - 1]

            out.SkipTo(pos)
            out.Print(CHAR_ENTITY[entity])
            out.SkipTo(end_pos)

        # Not handling these yet
        elif tok_id == h8_id.HexChar:
            raise AssertionError('Hex Char %r' % s[pos:pos + 20])

        elif tok_id == h8_id.DecChar:
            raise AssertionError('Dec Char %r' % s[pos:pos + 20])

        else:
            # Skip everything else
            out.SkipTo(end_pos)

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()

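# A sketch of the expected behavior on an arbitrary input: named entities are
# decoded via CHAR_ENTITY and tags are dropped, so this should hold:
#
#   ToText('<b>x &amp; y</b>')  # => 'x & y'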

# https://developer.mozilla.org/en-US/docs/Glossary/Void_element
VOID_ELEMENTS = [
    'area',
    'base',
    'br',
    'col',
    'embed',
    'hr',
    'img',
    'input',
    'link',
    'meta',
    'param',
    'source',
    'track',
    'wbr',
]

LEX_ATTRS = 1 << 1
LEX_QUOTED_VALUES = 1 << 2  # href="?x=42&amp;y=99"
NO_SPECIAL_TAGS = 1 << 3  # <script> <style>, VOID tags, etc.
BALANCED_TAGS = 1 << 4  # are tags balanced?

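# Sketch of combining these flags, mirroring main() below (the HTML string is
# an arbitrary example):
#
#   counters = Counters()
#   flags = LEX_ATTRS | LEX_QUOTED_VALUES | BALANCED_TAGS
#   Validate('<p>hi</p>', flags, counters)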

def Validate(contents, flags, counters):
    # type: (str, int, Counters) -> None

    tag_lexer = TagLexer(contents)
    val_lexer = AttrValueLexer(contents)

    no_special_tags = bool(flags & NO_SPECIAL_TAGS)
    lx = Lexer(contents, no_special_tags=no_special_tags)
    tokens = []
    start_pos = 0
    tag_stack = []
    while True:
        tok_id, end_pos = lx.Read()
        #log('TOP %s %r', h8_id_str(tok_id), contents[start_pos:end_pos])

        if tok_id == h8_id.Invalid:
            raise LexError(contents, start_pos)
        if tok_id == h8_id.EndOfStream:
            break

        tokens.append((tok_id, end_pos))

        if tok_id == h8_id.StartEndTag:
            counters.num_start_end_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRawSlice()
            counters.num_attrs += len(all_attrs)
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                counters.num_val_tokens += val_lexer.NumTokens()

            #counters.debug_attrs.extend(all_attrs)

        elif tok_id == h8_id.StartTag:
            counters.num_start_tags += 1

            tag_lexer.Reset(start_pos, end_pos)
            all_attrs = tag_lexer.AllAttrsRawSlice()
            counters.num_attrs += len(all_attrs)
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                counters.num_val_tokens += val_lexer.NumTokens()

            #counters.debug_attrs.extend(all_attrs)

            if flags & BALANCED_TAGS:
                tag_name = lx.CanonicalTagName()
                if flags & NO_SPECIAL_TAGS:
                    tag_stack.append(tag_name)
                else:
                    # e.g. <meta> is considered self-closing, like <meta/>
                    if tag_name not in VOID_ELEMENTS:
                        tag_stack.append(tag_name)

            counters.max_tag_stack = max(counters.max_tag_stack,
                                         len(tag_stack))
        elif tok_id == h8_id.EndTag:
            if flags & BALANCED_TAGS:
                try:
                    expected = tag_stack.pop()
                except IndexError:
                    raise ParseError('Tag stack empty',
                                     s=contents,
                                     start_pos=start_pos)

                actual = lx.CanonicalTagName()
                if expected != actual:
                    raise ParseError(
                        'Got unexpected closing tag %r; opening tag was %r' %
                        (contents[start_pos:end_pos], expected),
                        s=contents,
                        start_pos=start_pos)

        start_pos = end_pos

    if len(tag_stack) != 0:
        raise ParseError('Missing closing tags at end of doc: %s' %
                         ' '.join(tag_stack),
                         s=contents,
                         start_pos=start_pos)

    counters.num_tokens += len(tokens)


def ToXml(htm8_str):
    # type: (str) -> str

    # TODO:
    # 1. Lex it
    # 2. < & > must be escaped
    #    a. in raw data
    #    b. in quoted strings
    # 3. <script> turned into CDATA
    # 4. void tags turned into self-closing tags
    # 5. case-sensitive tag matching - not sure about this

    tag_lexer = TagLexer(htm8_str)
    val_lexer = AttrValueLexer(htm8_str)

    f = StringIO()
    out = Output(htm8_str, f)

    lx = Lexer(htm8_str)

    pos = 0
    while True:
        tok_id, end_pos = lx.Read()

        if tok_id == h8_id.Invalid:
            raise LexError(htm8_str, pos)
        if tok_id == h8_id.EndOfStream:
            break

        if tok_id in (h8_id.RawData, h8_id.CharEntity, h8_id.HexChar,
                      h8_id.DecChar):
            out.PrintUntil(end_pos)
        elif tok_id in (h8_id.StartTag, h8_id.StartEndTag):
            tag_lexer.Reset(pos, end_pos)
            # TODO: reduce allocations here
            all_attrs = tag_lexer.AllAttrsRawSlice()
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                # TODO: get the kind of string
                #
                # Quoted:   we need to replace & with &amp; and < with &lt;
                #           note: > is not allowed
                # Unquoted: right now, we can just surround with double quotes
                #           because we don't allow any bad chars
                # Empty:    add "", so empty= becomes =""
                # Missing:  add ="", so missing becomes missing=""

            tag_name = lx.CanonicalTagName()
            if tok_id == h8_id.StartTag and tag_name in VOID_ELEMENTS:
                # TODO: instead of closing >, print />
                pass

        elif tok_id == h8_id.BadAmpersand:
            #out.SkipTo(pos)
            out.Print('&amp;')
            out.SkipTo(end_pos)

        elif tok_id == h8_id.BadGreaterThan:
            #out.SkipTo(pos)
            out.Print('&gt;')
            out.SkipTo(end_pos)
        else:
            out.PrintUntil(end_pos)

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()

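# Sketch of the escaping that's already implemented (the TODOs above are
# pending): a bare & in raw data comes out escaped, assuming the lexer
# classifies it as BadAmpersand:
#
#   ToXml('a & b')  # => 'a &amp; b'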

class Counters(object):

    def __init__(self):
        # type: () -> None
        self.num_tokens = 0
        self.num_start_tags = 0
        self.num_start_end_tags = 0
        self.num_attrs = 0
        self.max_tag_stack = 0
        self.num_val_tokens = 0

        #self.debug_attrs = []


def main(argv):
    # type: (List[str]) -> int
    action = argv[1]

    if action == 'tokens':
        contents = sys.stdin.read()

        lx = Lexer(contents)
        start_pos = 0
        while True:
            tok_id, end_pos = lx.Read()
            if tok_id == h8_id.Invalid:
                raise LexError(contents, start_pos)
            if tok_id == h8_id.EndOfStream:
                break

            frag = contents[start_pos:end_pos]
            log('%d %s %r', end_pos, h8_id_str(tok_id), frag)
            start_pos = end_pos

        return 0

    elif action in ('lex-htm8', 'parse-htm8', 'parse-xml'):

        errors = []
        counters = Counters()

        flags = LEX_ATTRS | LEX_QUOTED_VALUES
        if action.startswith('parse-'):
            flags |= BALANCED_TAGS
        if action == 'parse-xml':
            flags |= NO_SPECIAL_TAGS

        i = 0
        for line in sys.stdin:
            filename = line.strip()
            with open(filename) as f:
                contents = f.read()

            try:
                Validate(contents, flags, counters)
            except LexError as e:
                log('Lex error in %r: %s', filename, e)
                errors.append((filename, e))
            except ParseError as e:
                log('Parse error in %r: %s', filename, e)
                errors.append((filename, e))
            i += 1

        log('')
        log('%10d tokens', counters.num_tokens)
        log('%10d start/end tags', counters.num_start_end_tags)
        log('%10d start tags', counters.num_start_tags)
        log('%10d attrs', counters.num_attrs)
        log('%10d max tag stack depth', counters.max_tag_stack)
        log('%10d attr val tokens', counters.num_val_tokens)
        log('%10d errors', len(errors))
        if len(errors):
            return 1
        return 0

    elif action == 'todo':
        # Other algorithms:
        #
        # - select first subtree with given ID
        #   - this requires understanding the void tags I suppose
        # - select all subtrees that have a class
        # - materialize DOM

        # Safe-HTM8?  This is a filter
        return 0

    else:
        raise RuntimeError('Invalid action %r' % action)

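# Usage sketch, following the actions above ('tokens' reads the HTML itself
# from stdin; the lex/parse actions read a list of filenames, one per line):
#
#   echo '<p>hi</p>' | lazylex/html.py tokens
#   echo doc.html | lazylex/html.py parse-htm8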

if __name__ == '__main__':
    sys.exit(main(sys.argv))