OILS / data_lang / htm8_util.py View on Github | oils.pub

278 lines, 182 significant
1#!/usr/bin/env python2
2
3try:
4 from cStringIO import StringIO
5except ImportError:
6 # for python3
7 from io import StringIO # type: ignore
8import sys
9
10from typing import List
11
12from _devbuild.gen.htm8_asdl import (h8_id, h8_id_str)
13from data_lang import htm8
14from data_lang.htm8 import (Lexer, LexError, ParseError, Output)
15from doctools.util import log
16
# Tag names that never take a closing tag.  Validate() skips pushing these on
# its tag stack (unless NO_SPECIAL_TAGS is set), and ToXml() checks start tags
# against this list.
# https://developer.mozilla.org/en-US/docs/Glossary/Void_element
VOID_ELEMENTS = [
    'area', 'base', 'br', 'col', 'embed', 'hr', 'img',
    'input', 'link', 'meta', 'param', 'source', 'track', 'wbr',
]
34
# Bit flags, combined with | and passed as the `flags` argument of Validate().
# Bit 0 (1 << 0) is not assigned in this file.  Within this file, main() sets
# LEX_ATTRS and LEX_QUOTED_VALUES, but Validate() only tests NO_SPECIAL_TAGS
# and BALANCED_TAGS.
LEX_ATTRS = 1 << 1
LEX_QUOTED_VALUES = 1 << 2  # href="?x=42&amp;y=99"
NO_SPECIAL_TAGS = 1 << 3  # <script> <style>, VOID tags, etc.
BALANCED_TAGS = 1 << 4  # are tags balanced?
39
40
def Validate(contents, flags, counters):
    # type: (str, int, Counters) -> None
    """Lex one document, optionally check tag balance, and update counters.

    Args:
      contents: the document text to lex.
      flags: bitwise OR of the module-level flags.  Only NO_SPECIAL_TAGS and
        BALANCED_TAGS are consulted here.
      counters: Counters object, mutated in place.  Tag, attribute and
        stack-depth counts are updated as tokens are read, so they include
        tokens from a document that later fails.  num_tokens is only added
        once the whole document has validated.

    Raises:
      LexError: the lexer returned h8_id.Invalid.
      ParseError: with BALANCED_TAGS set, an end tag had no matching start
        tag, did not match the innermost open tag, or tags were still open at
        the end of the document.
    """
    attr_lx = htm8.AttrLexer(contents)

    no_special_tags = bool(flags & NO_SPECIAL_TAGS)
    check_balance = bool(flags & BALANCED_TAGS)
    lx = htm8.Lexer(contents, no_special_tags=no_special_tags)

    # Only the number of tokens is reported, so count them instead of keeping
    # every (tok_id, end_pos) pair alive for the whole document.
    num_tokens = 0
    start_pos = 0
    tag_stack = []
    while True:
        tok_id, end_pos = lx.Read()

        if tok_id == h8_id.Invalid:
            raise LexError('Validate() got invalid token', contents, start_pos)
        if tok_id == h8_id.EndOfStream:
            break

        num_tokens += 1

        if tok_id == h8_id.StartEndTag:
            counters.num_start_end_tags += 1

            attr_lx.Init(tok_id, lx.TagNamePos(), end_pos)
            all_attrs = htm8.AllAttrsRaw(attr_lx)
            counters.num_attrs += len(all_attrs)
            # TODO: val_lexer.NumTokens() can be replaced with tokens_out

        elif tok_id == h8_id.StartTag:
            counters.num_start_tags += 1

            attr_lx.Init(tok_id, lx.TagNamePos(), end_pos)
            all_attrs = htm8.AllAttrsRaw(attr_lx)
            counters.num_attrs += len(all_attrs)

            if check_balance:
                tag_name = lx.CanonicalTagName()
                # Without NO_SPECIAL_TAGS, a void element such as <meta> is
                # considered self-closing, like <meta/>, so it is not pushed.
                if no_special_tags or tag_name not in VOID_ELEMENTS:
                    tag_stack.append(tag_name)

            counters.max_tag_stack = max(counters.max_tag_stack,
                                         len(tag_stack))

        elif tok_id == h8_id.EndTag:
            if check_balance:
                if not tag_stack:
                    raise ParseError('Tag stack empty',
                                     s=contents,
                                     start_pos=start_pos)
                expected = tag_stack.pop()

                actual = lx.CanonicalTagName()
                if expected != actual:
                    raise ParseError(
                        'Got unexpected closing tag %r; opening tag was %r' %
                        (contents[start_pos:end_pos], expected),
                        s=contents,
                        start_pos=start_pos)

        # The next token starts where this one ended.
        start_pos = end_pos

    if tag_stack:
        raise ParseError('Missing closing tags at end of doc: %s' %
                         ' '.join(tag_stack),
                         s=contents,
                         start_pos=start_pos)

    counters.num_tokens += num_tokens
116
117
def ToXml(htm8_str):
    # type: (str) -> str
    """Return htm8_str with stray '&' and '>' escaped (work in progress).

    BadAmpersand tokens are replaced by '&amp;' and BadGreaterThan tokens by
    '&gt;'.  Every other token is passed to Output.PrintUntil(), which
    presumably copies the source text through unchanged -- confirm against
    htm8.Output.  Start tags have their attributes lexed but are not rewritten
    yet.

    Raises:
      LexError: the lexer returned h8_id.Invalid.
    """
    # TODO:
    # 1. Lex it
    # 2. < & > must be escaped
    #    a. in raw data
    #    b. in quoted strings
    # 3. <script> turned into CDATA
    # 4. void tags turned into self-closing tags
    # 5. case-sensitive tag matching - not sure about this

    attr_lexer = htm8.AttrLexer(htm8_str)

    buf = StringIO()
    out = Output(htm8_str, buf)

    lexer = Lexer(htm8_str)

    tok_start = 0
    while True:
        tok_id, tok_end = lexer.Read()

        if tok_id == h8_id.EndOfStream:
            break
        if tok_id == h8_id.Invalid:
            raise LexError('ToXml() got invalid token', htm8_str, tok_start)

        if tok_id == h8_id.BadAmpersand:
            out.Print('&amp;')
            out.SkipTo(tok_end)

        elif tok_id == h8_id.BadGreaterThan:
            out.Print('&gt;')
            out.SkipTo(tok_end)

        elif tok_id == h8_id.StartTag or tok_id == h8_id.StartEndTag:
            # Nothing is printed in this branch; the tag text is presumably
            # emitted by a later PrintUntil() / PrintTheRest() call.
            attr_lexer.Init(tok_id, lexer.TagNamePos(), tok_end)
            attr_slices = htm8.AllAttrsRawSlice(attr_lexer)
            for _name_start, _name_end, _v, _val_start, _val_end in attr_slices:
                # TODO: get the kind of string
                #
                # Quoted:   we need to replace & with &amp; and < with &lt;
                #           note > is not allowed
                # Unquoted: right now, we can just surround with double quotes
                #           because we don't allow any bad chars
                # Empty   : add "", so empty= becomes =""
                # Missing : add ="", so missing becomes missing=""
                pass

            tag_name = lexer.CanonicalTagName()
            if tok_id == h8_id.StartTag and tag_name in VOID_ELEMENTS:
                # TODO: instead of closing >, print />
                pass

        else:
            # RawData, CharEntity, HexChar, DecChar and all remaining tokens
            out.PrintUntil(tok_end)

        tok_start = tok_end

    out.PrintTheRest()
    return buf.getvalue()
185
186
class Counters(object):
    """Running totals that Validate() accumulates across documents."""

    def __init__(self):
        # type: () -> None

        # Tag and attribute tallies
        self.num_start_tags = 0
        self.num_start_end_tags = 0
        self.num_attrs = 0

        # Deepest tag stack seen so far
        self.max_tag_stack = 0

        # Token tallies.  num_val_tokens is reported by main() but is not
        # incremented anywhere in this file.
        self.num_tokens = 0
        self.num_val_tokens = 0

        # Debugging aid, currently disabled:
        #self.debug_attrs = []
199
200
def main(argv):
    # type: (List[str]) -> int
    """Command line entry point; argv[1] selects the action.

    Actions:
      tokens      Lex stdin and log one line per token.
      lex-htm8    Read file names from stdin, one per line, and lex each file.
      parse-htm8  Like lex-htm8, and also check that tags are balanced.
      parse-xml   Like parse-htm8, with NO_SPECIAL_TAGS set.
      todo        Placeholder; does nothing.

    Returns:
      Process exit status: 1 if any file had a lex or parse error, else 0.

    Raises:
      RuntimeError: the action is missing or not recognized.
      LexError: for 'tokens', when the lexer returns an invalid token.
    """
    try:
        action = argv[1]
    except IndexError:
        # Same exception type as an unknown action, instead of a bare
        # IndexError.
        raise RuntimeError('Action required')

    if action == 'tokens':
        contents = sys.stdin.read()

        lx = Lexer(contents)
        start_pos = 0
        while True:
            tok_id, end_pos = lx.Read()
            if tok_id == h8_id.Invalid:
                raise LexError('Invalid token', contents, start_pos)
            if tok_id == h8_id.EndOfStream:
                break

            frag = contents[start_pos:end_pos]
            log('%d %s %r', end_pos, h8_id_str(tok_id), frag)
            start_pos = end_pos

        return 0

    elif action in ('lex-htm8', 'parse-htm8', 'parse-xml'):

        errors = []
        counters = Counters()

        flags = LEX_ATTRS | LEX_QUOTED_VALUES
        if action.startswith('parse-'):
            flags |= BALANCED_TAGS
        if action == 'parse-xml':
            flags |= NO_SPECIAL_TAGS

        for line in sys.stdin:
            filename = line.strip()
            if not filename:
                # A blank line (e.g. trailing newline in the file list) is not
                # a file name.
                continue

            with open(filename) as f:
                contents = f.read()

            # Errors are collected rather than fatal, so one bad file doesn't
            # stop the run.
            try:
                Validate(contents, flags, counters)
            except LexError as e:
                log('Lex error in %r: %s', filename, e)
                errors.append((filename, e))
            except ParseError as e:
                log('Parse error in %r: %s', filename, e)
                errors.append((filename, e))

        log('')
        log('%10d tokens', counters.num_tokens)
        log('%10d start/end tags', counters.num_start_end_tags)
        log('%10d start tags', counters.num_start_tags)
        log('%10d attrs', counters.num_attrs)
        log('%10d max tag stack depth', counters.max_tag_stack)
        log('%10d attr val tokens', counters.num_val_tokens)
        log('%10d errors', len(errors))
        if errors:
            return 1
        return 0

    elif action == 'todo':
        # Other algorithms:
        #
        # - select first subtree with given ID
        #   - this requires understanding the void tags I suppose
        # - select all subtrees that have a class
        # - materialize DOM

        # Safe-HTM8?  This is a filter
        return 0

    else:
        raise RuntimeError('Invalid action %r' % action)
275
276
if __name__ == '__main__':
    # Run the CLI and use main()'s return value as the process exit status.
    exit_status = main(sys.argv)
    sys.exit(exit_status)