OILS / data_lang / htm8_util.py View on Github | oils.pub

294 lines, 196 significant
1#!/usr/bin/env python2
2
3try:
4 from cStringIO import StringIO
5except ImportError:
6 # for python3
7 from io import StringIO # type: ignore
8import sys
9
10from typing import List
11
12from _devbuild.gen.htm8_asdl import (h8_id, h8_id_str, attr_value_e)
13from data_lang import htm8
14from data_lang.htm8 import (Lexer, LexError, ParseError, Output)
15from doctools.util import log
16
# Tags that are treated as self-closing even when written without a
# trailing slash, e.g. <meta> is handled like <meta/>.
# https://developer.mozilla.org/en-US/docs/Glossary/Void_element
VOID_ELEMENTS = (
    'area base br col embed hr img input '
    'link meta param source track wbr').split()
34
# Bit flags accepted by Validate(); combine them with |.  Bit 0 is unused.
NO_LEX_ATTRS = 0x02  # don't lex attributes, i.e. skip href="?x=42&amp;y=99"
NO_SPECIAL_TAGS = 0x04  # no special case for <script> <style>, VOID tags, etc.
BALANCED_TAGS = 0x08  # check that start and end tags are balanced
38
39
def Validate(contents, flags, counters):
    # type: (str, int, Counters) -> None
    """Lex a whole document, optionally check tag balance, and tally stats.

    Args:
      contents: the document text
      flags: bitwise OR of NO_LEX_ATTRS, NO_SPECIAL_TAGS and BALANCED_TAGS
      counters: updated in place.  The tag and attribute counters are bumped
        as tokens are read, so they may be partially updated when an error is
        raised; num_tokens is only added once the document is fully accepted.

    Raises:
      LexError: the lexer returned an Invalid token.
      ParseError: BALANCED_TAGS is set and either an end tag has no open
        start tag, an end tag doesn't match the innermost open start tag, or
        start tags are still open at the end of the document.
    """
    attr_lx = htm8.AttrLexer(contents)

    no_special_tags = bool(flags & NO_SPECIAL_TAGS)
    lex_attrs = not (flags & NO_LEX_ATTRS)
    check_balance = bool(flags & BALANCED_TAGS)

    lx = htm8.Lexer(contents, no_special_tags=no_special_tags)

    # Only the number of tokens is reported, so count them rather than
    # accumulating a list of every token.
    num_tokens = 0
    start_pos = 0
    tag_stack = []
    while True:
        tok_id, end_pos = lx.Read()
        #log('TOP %s %r', h8_id_str(tok_id), contents[start_pos:end_pos])

        if tok_id == h8_id.Invalid:
            raise LexError('Validate() got invalid token', contents, start_pos)
        if tok_id == h8_id.EndOfStream:
            break

        num_tokens += 1

        if tok_id == h8_id.StartEndTag:
            counters.num_start_end_tags += 1

            attr_lx.Init(tok_id, lx.TagNamePos(), end_pos)
            if lex_attrs:
                all_attrs = htm8.AllAttrsRaw(attr_lx)
                counters.num_attrs += len(all_attrs)
                # TODO: val_lexer.NumTokens() can be replaced with tokens_out

        elif tok_id == h8_id.StartTag:
            counters.num_start_tags += 1

            attr_lx.Init(tok_id, lx.TagNamePos(), end_pos)
            if lex_attrs:
                all_attrs = htm8.AllAttrsRaw(attr_lx)
                counters.num_attrs += len(all_attrs)

                #counters.debug_attrs.extend(all_attrs)

            if check_balance:
                tag_name = lx.CanonicalTagName()
                # Unless special tags are disabled, void elements don't open
                # a scope: e.g. <meta> is considered self-closing, like
                # <meta/>
                if no_special_tags or tag_name not in VOID_ELEMENTS:
                    tag_stack.append(tag_name)

            counters.max_tag_stack = max(counters.max_tag_stack,
                                         len(tag_stack))

        elif tok_id == h8_id.EndTag:
            if check_balance:
                if not tag_stack:
                    raise ParseError('Tag stack empty',
                                     s=contents,
                                     start_pos=start_pos)
                expected = tag_stack.pop()

                actual = lx.CanonicalTagName()
                if expected != actual:
                    raise ParseError(
                        'Got unexpected closing tag %r; opening tag was %r' %
                        (contents[start_pos:end_pos], expected),
                        s=contents,
                        start_pos=start_pos)

        start_pos = end_pos

    # tag_stack is only ever pushed to when BALANCED_TAGS is set
    if tag_stack:
        raise ParseError('Missing closing tags at end of doc: %s' %
                         ' '.join(tag_stack),
                         s=contents,
                         start_pos=start_pos)

    counters.num_tokens += num_tokens
117
118
def ToXml(htm8_str):
    # type: (str) -> str
    """Rewrite an HTM8 string toward XML.  Work in progress.

    What is converted so far:
      - attribute without a value    <a missing>  becomes  missing=""
      - attribute with empty value   <a empty=>   becomes  empty=""
      - unquoted attribute value     <a foo=bar>  becomes  foo="bar"
      - BadAmpersand token           becomes  &amp;
      - BadGreaterThan token         becomes  &gt;
    Everything else is copied through unchanged.

    Raises:
      LexError: the lexer returned an Invalid token.
    """

    # TODO:
    # 1. Lex it
    # 2. < & > must be escaped
    #    a. in raw data
    #    b. in quoted strings
    # 3. <script> turned into CDATA
    # 4. void tags turned into self-closing tags
    # 5. case-sensitive tag matching - not sure about this

    attr_lexer = htm8.AttrLexer(htm8_str)

    f = StringIO()
    out = Output(htm8_str, f)

    lx = Lexer(htm8_str)

    pos = 0  # start of the current token
    while True:
        tok_id, end_pos = lx.Read()

        if tok_id == h8_id.Invalid:
            raise LexError('ToXml() got invalid token', htm8_str, pos)
        if tok_id == h8_id.EndOfStream:
            break

        if tok_id in (h8_id.RawData, h8_id.CharEntity, h8_id.HexChar,
                      h8_id.DecChar):
            out.PrintUntil(end_pos)

        elif tok_id in (h8_id.StartTag, h8_id.StartEndTag):
            attr_lexer.Init(tok_id, lx.TagNamePos(), end_pos)
            all_attrs = htm8.AllAttrsRawSlice(attr_lexer)
            for (name_start, name_end, equal_end, v, val_start,
                 val_end) in all_attrs:
                if v == attr_value_e.Missing:  # <a missing>
                    out.PrintUntil(name_end)
                    out.Print('=""')
                elif v == attr_value_e.Empty:  # <a empty=>
                    out.PrintUntil(equal_end)
                    out.Print('""')
                elif v == attr_value_e.Unquoted:  # <a foo=bar>
                    # Because we disallow ", we can just surround with quotes
                    out.PrintUntil(val_start)
                    out.Print('"')
                    out.PrintUntil(val_end)
                    out.Print('"')

                #val_lexer.Reset(val_start, val_end)
                # TODO: get the kind of string
                #
                # Quoted: we need to replace & with &amp; and < with &lt;
                #   note > is not allowed
                # Unquoted: right now, we can just surround with double quotes
                #   because we don't allow any bad chars
                # Empty  : add "", so empty= becomes =""
                # Missing : add ="", so missing becomes missing=""

            tag_name = lx.CanonicalTagName()
            if tok_id == h8_id.StartTag and tag_name in VOID_ELEMENTS:
                # TODO: instead of closing >, print />
                pass

            # NOTE: the rest of the tag, up to end_pos, has NOT been copied
            # yet.  It stays pending until the next PrintUntil() call.

        elif tok_id == h8_id.BadAmpersand:
            # Copy any pending input first.  Without this, a bad & that
            # directly follows a tag would be printed ahead of the tag's
            # text, and SkipTo() would then drop that text.
            out.PrintUntil(pos)
            out.Print('&amp;')
            out.SkipTo(end_pos)

        elif tok_id == h8_id.BadGreaterThan:
            # Same as above: flush pending input before the replacement
            out.PrintUntil(pos)
            out.Print('&gt;')
            out.SkipTo(end_pos)

        else:
            out.PrintUntil(end_pos)

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()
199
200
class Counters(object):
    """Running totals that Validate() adds to; all of them start at zero."""

    def __init__(self):
        # type: () -> None

        # Tags and attributes seen
        self.num_start_tags = 0
        self.num_start_end_tags = 0
        self.num_attrs = 0

        # Tokens in the documents that validated successfully
        self.num_tokens = 0

        # Deepest stack of open tags
        self.max_tag_stack = 0

        # Reported by main(), but nothing in this file increments it
        self.num_val_tokens = 0

        #self.debug_attrs = []
213
214
def _PrintTokens(contents):
    # type: (str) -> None
    """Log one line per token: end position, token ID name, and its text.

    Raises:
      LexError: the lexer returned an Invalid token.
    """
    lx = Lexer(contents)
    start_pos = 0
    while True:
        tok_id, end_pos = lx.Read()
        if tok_id == h8_id.Invalid:
            raise LexError('Invalid token', contents, start_pos)
        if tok_id == h8_id.EndOfStream:
            break

        frag = contents[start_pos:end_pos]
        log('%d %s %r', end_pos, h8_id_str(tok_id), frag)
        start_pos = end_pos


def _ValidateFiles(action):
    # type: (str) -> int
    """Validate() every file named on stdin (one path per line); log totals.

    LexError and ParseError are logged per file and do not stop the run.

    Returns:
      1 if any file failed, otherwise 0.
    """
    flags = 0
    if action == 'quick-scan':
        flags |= NO_LEX_ATTRS
    if action.startswith('parse-'):
        flags |= BALANCED_TAGS
    if action == 'parse-xml':
        flags |= NO_SPECIAL_TAGS

    counters = Counters()
    num_errors = 0

    for line in sys.stdin:
        filename = line.strip()
        with open(filename) as f:
            contents = f.read()

        try:
            Validate(contents, flags, counters)
        except LexError as e:
            log('Lex error in %r: %s', filename, e)
            num_errors += 1
        except ParseError as e:
            log('Parse error in %r: %s', filename, e)
            num_errors += 1

    log('')
    log('%10d tokens', counters.num_tokens)
    log('%10d start/end tags', counters.num_start_end_tags)
    log('%10d start tags', counters.num_start_tags)
    log('%10d attrs', counters.num_attrs)
    log('%10d max tag stack depth', counters.max_tag_stack)
    log('%10d attr val tokens', counters.num_val_tokens)
    log('%10d errors', num_errors)

    return 1 if num_errors else 0


def main(argv):
    # type: (List[str]) -> int
    """Command line entry point; argv[1] selects the action.

    Actions:
      tokens      lex stdin and log each token
      quick-scan  validate the files listed on stdin, without lexing attrs
      lex-htm8    validate the files listed on stdin
      parse-htm8  same, and also check that tags are balanced
      parse-xml   like parse-htm8, with special tag handling turned off
      todo        placeholder; does nothing

    Returns:
      The process exit status: 1 if any file failed validation, else 0.

    Raises:
      RuntimeError: the action is missing or not recognized.
      LexError: the 'tokens' action hit an invalid token.
    """
    if len(argv) < 2:
        raise RuntimeError('Action required')
    action = argv[1]

    if action == 'tokens':
        _PrintTokens(sys.stdin.read())
        return 0

    elif action in ('quick-scan', 'lex-htm8', 'parse-htm8', 'parse-xml'):
        return _ValidateFiles(action)

    elif action == 'todo':
        # Other algorithms:
        #
        # - select first subtree with given ID
        #   - this requires understanding the void tags I suppose
        # - select all subtrees that have a class
        # - materialize DOM

        # Safe-HTM8?  This is a filter
        return 0

    else:
        raise RuntimeError('Invalid action %r' % action)
291
292
# When run as a script, main()'s return value becomes the exit status.
if __name__ == '__main__':
    sys.exit(main(sys.argv))