1 | #!/usr/bin/env python2
2 |
3 | try:
4 | from cStringIO import StringIO
5 | except ImportError:
6 | # for python3
7 | from io import StringIO # type: ignore
8 | import sys
9 |
10 | from typing import List
11 |
12 | from _devbuild.gen.htm8_asdl import (h8_id, h8_id_str)
13 | from data_lang import htm8
14 | from data_lang.htm8 import (Lexer, LexError, ParseError, Output)
15 | from doctools.util import log
16 |
# Void elements never take a closing tag, so a start tag such as <br> or
# <meta> is complete on its own.
# https://developer.mozilla.org/en-US/docs/Glossary/Void_element
VOID_ELEMENTS = [
    'area',
    'base',
    'br',
    'col',
    'embed',
    'hr',
    'img',
    'input',
    'link',
    'meta',
    'param',
    'source',
    'track',
    'wbr',
]
34 |
# Bit masks for the `flags` argument of Validate(); combine them with |.
# Bit 0 is not used.
LEX_ATTRS = 0x02
LEX_QUOTED_VALUES = 0x04  # href="?x=42&y=99"
NO_SPECIAL_TAGS = 0x08  # <script> <style>, VOID tags, etc.
BALANCED_TAGS = 0x10  # are tags balanced?
39 |
40 |
def Validate(contents, flags, counters):
    # type: (str, int, Counters) -> None
    """Lex one HTM8 document, update counters, and optionally check balance.

    Args:
      contents: text of the document to validate.
      flags: bitwise OR of the module-level flag constants.  Only two are
        consulted here: NO_SPECIAL_TAGS is forwarded to the lexer and turns
        off the void-element exemption, and BALANCED_TAGS enables the check
        that every start tag is closed by a matching end tag.
      counters: Counters instance, mutated in place.  Tag and attribute
        counts are updated as tokens are read; num_tokens is only added once
        the whole document has been accepted.

    Raises:
      LexError: the lexer returned an Invalid token.
      ParseError: BALANCED_TAGS is set and an end tag has no open tag, does
        not match the innermost open tag, or tags remain open at the end.
    """
    attr_lx = htm8.AttrLexer(contents)

    check_balance = bool(flags & BALANCED_TAGS)
    no_special_tags = bool(flags & NO_SPECIAL_TAGS)
    lx = htm8.Lexer(contents, no_special_tags=no_special_tags)

    # Only the token count is reported, so keep an integer instead of
    # accumulating a list that grows with the size of the document.
    num_tokens = 0
    start_pos = 0  # where the current token begins; used in error locations
    tag_stack = []  # names of the start tags that are still open

    while True:
        tok_id, end_pos = lx.Read()
        #log('TOP %s %r', h8_id_str(tok_id), contents[start_pos:end_pos])

        if tok_id == h8_id.Invalid:
            raise LexError('Validate() got invalid token', contents, start_pos)
        if tok_id == h8_id.EndOfStream:
            break

        num_tokens += 1

        if tok_id == h8_id.StartEndTag:
            counters.num_start_end_tags += 1

            attr_lx.Init(tok_id, lx.TagNamePos(), end_pos)
            counters.num_attrs += len(htm8.AllAttrsRaw(attr_lx))
            # TODO: val_lexer.NumTokens() can be replaced with tokens_out

        elif tok_id == h8_id.StartTag:
            counters.num_start_tags += 1

            attr_lx.Init(tok_id, lx.TagNamePos(), end_pos)
            counters.num_attrs += len(htm8.AllAttrsRaw(attr_lx))

            #counters.debug_attrs.extend(all_attrs)

            if check_balance:
                tag_name = lx.CanonicalTagName()
                # e.g. <meta> is considered self-closing, like <meta/>,
                # unless special tags are disabled.
                if no_special_tags or tag_name not in VOID_ELEMENTS:
                    tag_stack.append(tag_name)

            counters.max_tag_stack = max(counters.max_tag_stack,
                                         len(tag_stack))

        elif tok_id == h8_id.EndTag and check_balance:
            if not tag_stack:
                raise ParseError('Tag stack empty',
                                 s=contents,
                                 start_pos=start_pos)
            expected = tag_stack.pop()

            actual = lx.CanonicalTagName()
            if expected != actual:
                raise ParseError(
                    'Got unexpected closing tag %r; opening tag was %r' %
                    (contents[start_pos:end_pos], expected),
                    s=contents,
                    start_pos=start_pos)

        start_pos = end_pos

    if tag_stack:
        raise ParseError('Missing closing tags at end of doc: %s' %
                         ' '.join(tag_stack),
                         s=contents,
                         start_pos=start_pos)

    counters.num_tokens += num_tokens
116 |
117 |
def ToXml(htm8_str):
    # type: (str) -> str
    """Convert an HTM8 document to XML (work in progress).

    What is implemented so far: stray '&' and '>' characters, which the lexer
    reports as BadAmpersand and BadGreaterThan tokens, are replaced with the
    entities &amp; and &gt;.  Attributes are lexed but not yet rewritten, and
    void tags are not yet turned into self-closing tags.

    Args:
      htm8_str: the HTM8 document text.

    Returns:
      The converted document as a string.

    Raises:
      LexError: the lexer returned an Invalid token.
    """
    # TODO:
    # 1. Lex it
    # 2. < & > must be escaped
    #    a. in raw data
    #    b. in quoted strings
    # 3. <script> turned into CDATA
    # 4. void tags turned into self-closing tags
    # 5. case-sensitive tag matching - not sure about this

    attr_lexer = htm8.AttrLexer(htm8_str)

    f = StringIO()
    out = Output(htm8_str, f)

    lx = Lexer(htm8_str)

    pos = 0  # start of the current token
    while True:
        tok_id, end_pos = lx.Read()

        if tok_id == h8_id.Invalid:
            raise LexError('ToXml() got invalid token', htm8_str, pos)
        if tok_id == h8_id.EndOfStream:
            break

        if tok_id in (h8_id.RawData, h8_id.CharEntity, h8_id.HexChar,
                      h8_id.DecChar):
            out.PrintUntil(end_pos)
        elif tok_id in (h8_id.StartTag, h8_id.StartEndTag):
            attr_lexer.Init(tok_id, lx.TagNamePos(), end_pos)
            all_attrs = htm8.AllAttrsRawSlice(attr_lexer)
            for name_start, name_end, v, val_start, val_end in all_attrs:
                #val_lexer.Reset(val_start, val_end)
                pass
                # TODO: get the kind of string
                #
                # Quoted:   we need to replace & with &amp; and < with &lt;
                #           note > is not allowed
                # Unquoted: right now, we can just surround with double quotes
                #           because we don't allow any bad chars
                # Empty   : add "", so empty= becomes =""
                # Missing : add ="", so missing becomes missing=""

            tag_name = lx.CanonicalTagName()
            if tok_id == h8_id.StartTag and tag_name in VOID_ELEMENTS:
                # TODO: instead of closing >, print />
                pass

        elif tok_id == h8_id.BadAmpersand:
            # The tag branch above prints nothing, so first flush input that
            # is still pending before this token; otherwise SkipTo() below
            # would drop it.
            # NOTE(review): relies on Output.PrintUntil() copying the input up
            # to the given position -- confirm against htm8.Output.
            out.PrintUntil(pos)
            # A bare & is not well-formed XML; emit the entity instead.
            out.Print('&amp;')
            out.SkipTo(end_pos)

        elif tok_id == h8_id.BadGreaterThan:
            # Same flush as for BadAmpersand, then escape the bare >.
            out.PrintUntil(pos)
            out.Print('&gt;')
            out.SkipTo(end_pos)
        else:
            out.PrintUntil(end_pos)

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()
185 |
186 |
class Counters(object):
    """Integer statistics, all starting at zero.

    Validate() updates these in place and main() logs them as a summary.
    """

    def __init__(self):
        # type: () -> None

        # Totals across tokens of every kind
        self.num_tokens = 0

        # Per-kind tag totals, plus the attributes found on those tags
        self.num_start_tags = 0
        self.num_start_end_tags = 0
        self.num_attrs = 0

        # Largest number of simultaneously open tags seen so far
        self.max_tag_stack = 0

        # Logged by main(); nothing in this file increments it
        self.num_val_tokens = 0

        #self.debug_attrs = []
199 |
200 |
def main(argv):
    # type: (List[str]) -> int
    """Command line entry point; argv[1] selects the action.

    Actions:
      tokens: read one document from stdin and log every token.
      lex-htm8, parse-htm8, parse-xml: read file names from stdin, one per
        line, run Validate() on each file, then log the accumulated
        counters.  The parse-* actions also check that tags are balanced,
        and parse-xml additionally disables special tag handling.
      todo: placeholder, does nothing.

    Returns:
      The process exit status: 1 if any file failed validation, else 0.

    Raises:
      RuntimeError: the action is missing or not recognized.
      LexError: the 'tokens' action hit an invalid token.
    """
    if len(argv) < 2:
        # Fail with a clear message instead of a bare IndexError
        raise RuntimeError('Action required')
    action = argv[1]

    if action == 'tokens':
        contents = sys.stdin.read()

        lx = Lexer(contents)
        start_pos = 0
        while True:
            tok_id, end_pos = lx.Read()
            if tok_id == h8_id.Invalid:
                raise LexError('Invalid token', contents, start_pos)
            if tok_id == h8_id.EndOfStream:
                break

            frag = contents[start_pos:end_pos]
            log('%d %s %r', end_pos, h8_id_str(tok_id), frag)
            start_pos = end_pos

        return 0

    elif action in ('lex-htm8', 'parse-htm8', 'parse-xml'):

        errors = []
        counters = Counters()

        # Every validating action starts from the lexing flags; the parse-*
        # actions add stricter checks on top.
        flags = LEX_ATTRS | LEX_QUOTED_VALUES
        if action.startswith('parse-'):
            flags |= BALANCED_TAGS
        if action == 'parse-xml':
            flags |= NO_SPECIAL_TAGS

        for line in sys.stdin:
            filename = line.strip()
            with open(filename) as f:
                contents = f.read()

            # Record the failure and keep going so that one bad file doesn't
            # hide problems in the rest.
            try:
                Validate(contents, flags, counters)
            except LexError as e:
                log('Lex error in %r: %s', filename, e)
                errors.append((filename, e))
            except ParseError as e:
                log('Parse error in %r: %s', filename, e)
                errors.append((filename, e))

        log('')
        log('%10d tokens', counters.num_tokens)
        log('%10d start/end tags', counters.num_start_end_tags)
        log('%10d start tags', counters.num_start_tags)
        log('%10d attrs', counters.num_attrs)
        log('%10d max tag stack depth', counters.max_tag_stack)
        log('%10d attr val tokens', counters.num_val_tokens)
        log('%10d errors', len(errors))
        return 1 if errors else 0

    elif action == 'todo':
        # Other algorithms:
        #
        # - select first subtree with given ID
        #   - this requires understanding the void tags I suppose
        # - select all subtrees that have a class
        # - materialize DOM

        # Safe-HTM8?  This is a filter
        return 0

    else:
        raise RuntimeError('Invalid action %r' % action)
275 |
276 |
if __name__ == '__main__':
    # Run as a script: the value returned by main() becomes the exit status.
    exit_status = main(sys.argv)
    sys.exit(exit_status)