OILS / asdl / parse.py View on Github | oils.pub

473 lines, 258 significant
1#!/usr/bin/env python2
2"""
3parse.py
4"""
5from __future__ import print_function
6
7import re
8
9from asdl import ast
10from asdl.ast import (Use, Module, TypeDecl, SubTypeDecl, Constructor, Field,
11 Sum, SimpleSum, Product, Extern)
12
13from typing import Any, IO
14
# Reserved words; any other identifier lexes as a Name token.
_KEYWORDS = ['use', 'module', 'generate', 'extern']

# Table of (token kind name, operator string).  The list index becomes the
# TokenKind integer for that kind (see the loop below TokenKind).
_TOKENS = [
    ('Keyword', ''),
    ('Name', ''),

    # For operators, the string matters
    ('Equals', '='),
    ('Comma', ','),
    ('Question', '?'),
    ('Pipe', '|'),
    ('Asterisk', '*'),
    ('LParen', '('),
    ('RParen', ')'),
    ('LBrace', '{'),
    ('RBrace', '}'),
    ('Percent', '%'),
    ('LessThan', '<'),  # for subtyping CompoundWord < List[word_part]

    # Oils addition for parameterized types.
    ('LBracket', '['),
    ('RBracket', ']'),

    # - Start with Dict[string, bool].
    # - List[string] is an alias for string*
    #
    # statically typed: Dict and List
    # dynamically typed: dict and list
]

_TOKEN_STR = [name for name, _ in _TOKENS]  # integer -> string like LParen
_TOKEN_INT = {}  # string like '(' -> integer
47
48
class TokenKind(object):
    """Empty namespace holding one integer constant per ASDL token kind.

    The attributes (e.g. TokenKind.LBrace == 5) are attached dynamically
    from the _TOKENS table at import time.
    """
    pass
55
56
# Attach TokenKind.<name> = <index> for every entry in _TOKENS, and build
# the reverse map from operator text (e.g. '(') to that same integer.
for index, (tok_name, tok_str) in enumerate(_TOKENS):
    setattr(TokenKind, tok_name, index)
    _TOKEN_INT[tok_str] = index
60
61
class Token(object):
    """A single lexical token produced by _Tokenize."""

    def __init__(self, kind, value, lineno):
        # type: (int, str, int) -> None
        self.kind = kind  # integer from TokenKind, e.g. TokenKind.LParen
        self.value = value  # the token's text, e.g. '(' or 'module'
        self.lineno = lineno  # 1-based line number in the input buffer
69
70
class ASDLSyntaxError(Exception):
    """Raised for malformed ASDL input, carrying the offending line number."""

    def __init__(self, msg, lineno=None):
        # type: (str, Any) -> None
        self.msg = msg
        # Substitute a placeholder when the line number isn't known.
        self.lineno = lineno if lineno else '<unknown>'

    def __str__(self):
        return 'Syntax error on line %s: %s' % (self.lineno, self.msg)
80
81
# A token is: a run of word chars, a comment to end of line ('--' ASDL style
# or '#'), or any single other character (operators; validated in _Tokenize).
TOKEN_RE = r'\s*(\w+|--.*|#.*|.)'
83
84
def _Tokenize(f):
    # type: (IO[bytes]) -> Any
    """Tokenize the given buffer, yielding Token objects line by line."""
    for lineno, line in enumerate(f, 1):
        for match in re.finditer(TOKEN_RE, line.strip()):
            text = match.group(1)

            # ASDL comments start with --; we also allow # comments like
            # Python and shell.  Either one consumes the rest of the line.
            if text.startswith('--') or text.startswith('#'):
                break

            if text in _KEYWORDS:
                yield Token(TokenKind.Keyword, text, lineno)  # type: ignore
            elif text[0].isalpha() or text[0] == '_':
                yield Token(TokenKind.Name, text, lineno)  # type: ignore
            else:
                # Single-character operators; anything unknown is an error.
                try:
                    op_kind = _TOKEN_INT[text]
                except KeyError:
                    raise ASDLSyntaxError('Invalid operator %s' % text, lineno)
                yield Token(op_kind, text, lineno)
112
113
114def _SumIsSimple(variant_list):
115 """Return True if a sum is a simple.
116
117 A sum is simple if its types have no fields, e.g.
118 unaryop = Invert | Not | UAdd | USub
119 """
120 for t in variant_list:
121 if t.fields or t.shared_type:
122 return False
123 return True
124
125
# Options accepted in a 'generate [...]' clause on a sum type; validated in
# ASDLParser._parse_compound_type().
_CODE_GEN_OPTIONS = [
    'no_namespace_suffix',  # Id.Foo instead of Id_e.Foo
    'integers',  # integer builtin_i instead of strongly typed builtin_e
    'uint16',  # like integers, but use uint16_t instead
    'bit_set',  # not implemented: 1 << n instead of n

    # probably don't need this
    # 'common_synthetic_field:left_tok',

    # Put this type, and transitive closure of types it references, in the
    # unique "first class variant" namespace, and generate type reflection.
    'reflect_all_types',

    # Squeeze and Freeze, with the number of bits as an option.  Hm the
    # headers here still need type reflection.  Probably OK.
    'mirror_all_types:16',
]
143
144
class ASDLParser(object):
    """Parser for ASDL files.

    Create, then call the parse method on a buffer containing ASDL. This
    is a simple recursive descent parser that uses _Tokenize for the
    lexing.  self.cur_token always holds the one-token lookahead, or None
    at end of input.
    """

    def __init__(self):
        # type: () -> None
        self._tokenizer = None
        self.cur_token = None  # type: Token

    def parse(self, f):
        # type: (IO[bytes]) -> ast.Module
        """Parse the ASDL in the file and return an AST with a Module root."""
        self._tokenizer = _Tokenize(f)
        self._advance()
        return self._parse_module()

    def _peek_kind(self):
        # type: () -> Any
        """Kind of the current token, or None at end of input.

        Using this rather than self.cur_token.kind avoids an AttributeError
        on truncated input; loops simply stop and _match reports the error.
        """
        return None if self.cur_token is None else self.cur_token.kind

    def _cur_value(self):
        # type: () -> str
        """Describe the current token for error messages."""
        if self.cur_token is None:
            return '<end of input>'
        return self.cur_token.value

    def _cur_lineno(self):
        # type: () -> Any
        """Line number of the current token, or None at end of input."""
        return None if self.cur_token is None else self.cur_token.lineno

    def _parse_module(self):
        # type: () -> Module
        """
        type_decl : NAME '=' compound_type
        module    : 'module' NAME '{'
                    use*
                    extern*
                    type_decl*
                    '}'

        We added:
        - use for imports
        - generate on sum types
        """
        if not self._at_keyword('module'):
            raise ASDLSyntaxError(
                'Expected "module" (found {})'.format(self._cur_value()),
                self._cur_lineno())
        self._advance()
        name = self._match(TokenKind.Name)
        self._match(TokenKind.LBrace)

        uses = []
        while self._at_keyword('use'):
            uses.append(self._parse_use())

        externs = []
        while self._at_keyword('extern'):
            externs.append(self._parse_extern())

        defs = []
        while self._peek_kind() == TokenKind.Name:
            typename = self._advance()
            if self._peek_kind() == TokenKind.Equals:
                self._advance()
                type_ = self._parse_compound_type()
                defs.append(TypeDecl(typename, type_))
            elif self._peek_kind() == TokenKind.LessThan:
                self._advance()
                type_ = self._parse_type_expr()
                defs.append(SubTypeDecl(typename, type_))
            else:
                raise ASDLSyntaxError(
                    'Expected = or < after type name (found {})'.format(
                        self._cur_value()), self._cur_lineno())

        self._match(TokenKind.RBrace)
        return Module(name, uses, externs, defs)

    def _parse_use(self):
        # type: () -> Use
        """
        use: 'use' NAME+ '{' NAME+ '}'

        example: use frontend syntax { Token }

        This means frontend/syntax.asdl.h :: Token
        """
        self._advance()  # past 'use'
        module_parts = []
        while self._peek_kind() == TokenKind.Name:
            part = self._advance()
            module_parts.append(part)

        self._match(TokenKind.LBrace)

        type_names = []
        while self._peek_kind() == TokenKind.Name:
            t = self._advance()
            type_names.append(t)
            # The type list is brace-delimited, so stop at '}'.  (This
            # previously tested for RParen, which can never appear here.)
            if self._peek_kind() == TokenKind.RBrace:
                break
            elif self._peek_kind() == TokenKind.Comma:
                self._advance()

        self._match(TokenKind.RBrace)
        #print('MOD %s' % module_parts)
        return Use(module_parts, type_names)

    def _parse_extern(self):
        # type: () -> Extern
        """
        extern: 'extern' '[' NAME+ ']'

        Examples:
          extern [ _Builtin ]
          extern [ _Callable ]
        """
        self._advance()  # past 'extern'

        self._match(TokenKind.LBracket)

        # At least one name
        names = [self._match(TokenKind.Name)]

        while self._peek_kind() == TokenKind.Name:
            names.append(self._advance())

        self._match(TokenKind.RBracket)

        return Extern(names)

    def _parse_compound_type(self):
        """
        constructor : NAME fields?
                    | NAME '%' NAME  # shared variant

        sum : constructor ('|' constructor)* generate?

        compound_type : product
                      | sum

        Examples:
          alloc_members =
            List
          | Dict
          | Struct
            generate [bit_set]

          -- color::Red, not color_e::Red or color_i::Red
          color = Red | Green
            generate [integers, no_namespace_suffix]
        """
        if self._peek_kind() == TokenKind.LParen:
            # If we see a (, it's a product
            return self._parse_product()
        else:
            # Otherwise it's a sum.  Look for ConstructorId
            sumlist = []
            while True:
                cons_name = self._match(TokenKind.Name)

                shared_type = None
                fields = None
                if self._peek_kind() == TokenKind.LParen:
                    fields = self._parse_fields()
                elif self._peek_kind() == TokenKind.Percent:
                    self._advance()
                    shared_type = self._match(TokenKind.Name)
                else:
                    # A variant with no fields, e.g. Red in a simple sum.
                    pass

                cons = Constructor(cons_name, shared_type, fields)
                sumlist.append(cons)

                if self._peek_kind() != TokenKind.Pipe:
                    break
                self._advance()
            generate = self._parse_optional_generate()

            # Additional validation
            if generate is not None:
                for g in generate:
                    if g not in _CODE_GEN_OPTIONS:
                        raise ASDLSyntaxError('Invalid code gen option %r' % g,
                                              self._cur_lineno())

            if _SumIsSimple(sumlist):
                return SimpleSum(sumlist, generate)
            else:
                return Sum(sumlist, generate)

    def _parse_type_expr(self):
        """One or two params:

        type_params : '[' type_expr ( ',' type_expr )* ']'

        type_expr : NAME type_params? ('?' | '*')?  # allow one suffix

        NAME is validated against Optional, List, Dict afterward
        """
        type_name = self._match(TokenKind.Name)

        # Accept Python-like naming!
        if type_name == 'str':
            type_name = 'string'

        children = []
        if self._peek_kind() == TokenKind.LBracket:
            self._advance()
            children.append(self._parse_type_expr())
            if self._peek_kind() == TokenKind.Comma:
                self._advance()
                children.append(self._parse_type_expr())

            self._match(TokenKind.RBracket)

        # Validate the arity of the parameterized types.
        if type_name in ('List', 'Optional'):
            if len(children) != 1:
                raise ASDLSyntaxError(
                    'Expected 1 type param to {}'.format(type_name),
                    self._cur_lineno())
        elif type_name == 'Dict':
            if len(children) != 2:
                raise ASDLSyntaxError(
                    'Expected 2 type params to {}'.format(type_name),
                    self._cur_lineno())
        else:
            if len(children) != 0:
                raise ASDLSyntaxError(
                    'Expected zero type params to {}'.format(type_name),
                    self._cur_lineno())

        if len(children):
            typ = ast.ParameterizedType(type_name, children)
        else:
            typ = ast.NamedType(type_name)

        if self._peek_kind() == TokenKind.Asterisk:
            # string* is equivalent to List[string]
            typ = ast.ParameterizedType('List', [typ])
            self._advance()

        elif self._peek_kind() == TokenKind.Question:
            # string? is equivalent to Optional[string]
            typ = ast.ParameterizedType('Optional', [typ])
            self._advance()

        return typ

    def _parse_fields(self):
        """
        fields_inner: type_expr NAME ( ',' type_expr NAME )* ','?

        fields      : '(' fields_inner? ')'

        NOTE: Name Quantifier? should be changed to typename.
        """
        fields = []
        self._match(TokenKind.LParen)
        while self._peek_kind() == TokenKind.Name:
            typ = self._parse_type_expr()
            field_name = self._match(TokenKind.Name)

            fields.append(Field(typ, field_name))

            if self._peek_kind() == TokenKind.RParen:
                break
            elif self._peek_kind() == TokenKind.Comma:
                self._advance()

        self._match(TokenKind.RParen)
        return fields

    def _parse_list(self):
        """
        list_inner: NAME ( ',' NAME )* ','?

        list      : '[' list_inner? ']'
        """
        generate = []
        self._match(TokenKind.LBracket)
        while self._peek_kind() == TokenKind.Name:
            name = self._match(TokenKind.Name)

            generate.append(name)

            if self._peek_kind() == TokenKind.RBracket:
                break
            elif self._peek_kind() == TokenKind.Comma:
                self._advance()

        self._match(TokenKind.RBracket)
        return generate

    def _parse_optional_generate(self):
        """
        generate : 'generate' list
        """
        if self._at_keyword('generate'):
            self._advance()
            return self._parse_list()
        else:
            return None

    def _parse_product(self):
        """Product: fields attributes?"""
        return Product(self._parse_fields())

    def _advance(self):
        """Return current token's value; read next token into self.cur_token.

        self.cur_token becomes None at end of input.
        """
        cur_val = None if self.cur_token is None else self.cur_token.value
        try:
            self.cur_token = next(self._tokenizer)
        except StopIteration:
            self.cur_token = None
        return cur_val

    def _match(self, kind):
        """The 'match' primitive of RD parsers.

        * Verifies that the current token is of the given kind (kind can
          be a tuple, in which case the kind must match one of its members).
        * Returns the value of the current token
        * Reads in the next token

        Args:
          kind: A TokenKind, or a tuple of TokenKind

        Raises:
          ASDLSyntaxError: on a mismatched token or at end of input.
        """
        kinds = kind if isinstance(kind, tuple) else (kind,)
        if self.cur_token is not None and self.cur_token.kind in kinds:
            value = self.cur_token.value
            self._advance()
            return value
        else:
            expected = ' or '.join(_TOKEN_STR[k] for k in kinds)
            raise ASDLSyntaxError(
                'Expected token {}, got {}'.format(expected,
                                                   self._cur_value()),
                self._cur_lineno())

    def _at_keyword(self, keyword):
        # type: (str) -> bool
        """True if the lookahead is the given keyword (False at end of input)."""
        return (self.cur_token is not None and
                self.cur_token.kind == TokenKind.Keyword and
                self.cur_token.value == keyword)