OILS / asdl / front_end.py View on Github | oilshell.org

525 lines, 297 significant
1"""front_end.py: Lexer and parser for the ASDL schema language."""
2from __future__ import print_function
3
4import re
5
6from asdl import ast
7from asdl.ast import (AST, Use, Module, TypeDecl, Constructor, Field, Sum,
8 SimpleSum, Product)
9from asdl.util import log
10
11_ = log
12
# Words treated as keywords by the tokenizer; any other word is a Name token.
_KEYWORDS = ['use', 'module', 'generate']

# (kind name, operator string) pairs. The index of each pair becomes the
# integer token kind (see TokenKind below). Keyword and Name tokens carry
# variable text, so their string entry is empty.
_TOKENS = [
    ('Keyword', ''),
    ('Name', ''),

    # For operators, the string matters
    ('Equals', '='),
    ('Comma', ','),
    ('Question', '?'),
    ('Pipe', '|'),
    ('Asterisk', '*'),
    ('LParen', '('),
    ('RParen', ')'),
    ('LBrace', '{'),
    ('RBrace', '}'),
    ('Percent', '%'),

    # Oils addition for parameterized types.
    ('LBracket', '['),
    ('RBracket', ']'),

    # - Start with Dict[string, bool].
    # - List[string] is an alias for string*
    #
    # statically typed: Dict and List
    # dynamically typed: dict and list
]

_TOKEN_STR = [name for name, _ in _TOKENS]  # integer -> string like LParen
_TOKEN_INT = {}  # string like '(' -> integer; populated below TokenKind
44
45
class TokenKind(object):
    """ASDL tokens.

    TokenKind.LBrace = 5, etc.

    Attributes are attached dynamically by the loop below, one integer
    constant per entry in _TOKENS.
    """
    pass


# Assign TokenKind.<name> = index for every token, and build the operator
# string -> integer table used by the tokenizer. (The empty strings for
# Keyword/Name overwrite each other in _TOKEN_INT, but '' is never looked up.)
for i, (name, val) in enumerate(_TOKENS):
    setattr(TokenKind, name, i)
    _TOKEN_INT[val] = i
57
58
class Token(object):
    """A single lexed token: an integer kind, its text, and its line number."""

    def __init__(self, kind, value, lineno):
        # type: (int, str, int) -> None
        self.kind = kind  # integer TokenKind.* constant
        self.value = value  # the token's text, e.g. 'module' or '('
        self.lineno = lineno  # 1-based line number, for error messages

    def __repr__(self):
        # Added for debuggability when inspecting token streams.
        return 'Token(%r, %r, %r)' % (self.kind, self.value, self.lineno)
65
66
class ASDLSyntaxError(Exception):
    """Raised on malformed ASDL input; remembers the offending line."""

    def __init__(self, msg, lineno=None):
        self.msg = msg
        # When no line number is known, show a placeholder instead.
        self.lineno = lineno if lineno else '<unknown>'

    def __str__(self):
        return 'Syntax error on line {}: {}'.format(self.lineno, self.msg)
75
76
def _Tokenize(f):
    """Tokenize the given buffer.

    Yields Token objects, one per lexeme, tagged with 1-based line numbers.
    """
    # Each match is a word, a comment to end-of-line, or a single character.
    token_pat = re.compile(r'\s*(\w+|--.*|#.*|.)')
    for line_num, raw_line in enumerate(f, 1):
        for match in token_pat.finditer(raw_line.strip()):
            lexeme = match.group(1)
            # Keywords must be checked before the generic Name case, since
            # they are also alphabetic words.
            if lexeme in _KEYWORDS:
                yield Token(TokenKind.Keyword, lexeme, line_num)

            elif lexeme[0].isalpha():
                yield Token(TokenKind.Name, lexeme, line_num)

            elif lexeme.startswith(('--', '#')):
                # ASDL comments start with --
                # Added # comments like Python and shell
                break  # rest of the line is comment

            else:
                # Single-character operators like ( ) { } = | * ? ,
                try:
                    op_kind = _TOKEN_INT[lexeme]
                except KeyError:
                    raise ASDLSyntaxError('Invalid operator %s' % lexeme,
                                          line_num)
                yield Token(op_kind, lexeme, line_num)
103
104
105def _SumIsSimple(variant_list):
106 """Return True if a sum is a simple.
107
108 A sum is simple if its types have no fields, e.g. unaryop = Invert |
109 Not | UAdd | USub
110 """
111 for t in variant_list:
112 if t.fields or t.shared_type:
113 return False
114 return True
115
116
# Options accepted in a sum type's 'generate [...]' list; validated in
# ASDLParser._parse_compound_type.
_CODE_GEN_OPTIONS = [
    'no_namespace_suffix',  # Id.Foo instead of Id_e.Foo
    'integers',  # integer builtin_i instead of strongly typed builtin_e
    'uint16',  # like integers, but use uint16_t instead
    'bit_set',  # not implemented: 1 << n instead of n

    # probably don't need this
    # 'common_synthetic_field:left_tok',

    # Put this type, and transitive closure of types it references, in the
    # unique "first class variant" namespace, and generate type reflection.
    'reflect_all_types',

    # Squeeze and Freeze, with the number of bits as an option. Hm, the
    # headers here still need type reflection. Probably OK.
    'mirror_all_types:16',
]
134
135
class ASDLParser(object):
    """Parser for ASDL files.

    Create, then call the parse method on a buffer containing ASDL. This
    is a simple recursive descent parser that uses _Tokenize for the
    lexing.
    """

    def __init__(self):
        self._tokenizer = None  # generator of Token, created by parse()
        self.cur_token = None  # one token of lookahead; None at EOF

    def parse(self, f):
        """Parse the ASDL in the file and return an AST with a Module root."""
        self._tokenizer = _Tokenize(f)
        self._advance()
        return self._parse_module()

    def _parse_module(self):
        """
        type_decl : NAME '=' compound_type
        module : 'module' NAME '{' use* type_decl* '}'

        We added:
        - use for imports
        - generate on sum types
        """
        if not self._at_keyword('module'):
            raise ASDLSyntaxError(
                'Expected "module" (found {})'.format(self.cur_token.value),
                self.cur_token.lineno)
        self._advance()
        name = self._match(TokenKind.Name)
        self._match(TokenKind.LBrace)

        uses = []
        while self._at_keyword('use'):
            uses.append(self._parse_use())

        defs = []
        while self.cur_token.kind == TokenKind.Name:
            typename = self._advance()
            self._match(TokenKind.Equals)
            type_ = self._parse_compound_type()
            defs.append(TypeDecl(typename, type_))

        self._match(TokenKind.RBrace)
        return Module(name, uses, defs)

    def _parse_use(self):
        """
        use: 'use' NAME+ '{' NAME+ '}'

        example: use frontend syntax { Token }

        This means frontend/syntax.asdl.h :: Token
        """
        self._advance()  # past 'use'
        module_parts = []
        while self.cur_token.kind == TokenKind.Name:
            part = self._advance()
            module_parts.append(part)

        self._match(TokenKind.LBrace)

        type_names = []
        while self.cur_token.kind == TokenKind.Name:
            t = self._advance()
            type_names.append(t)
            # BUG FIX: this list is brace-delimited, so the terminator is
            # RBrace.  It previously compared against RParen, which can
            # never follow a Name here; the loop only terminated via the
            # enclosing while condition.
            if self.cur_token.kind == TokenKind.RBrace:
                break
            elif self.cur_token.kind == TokenKind.Comma:
                self._advance()

        self._match(TokenKind.RBrace)
        #print('MOD %s' % module_parts)
        return Use(module_parts, type_names)

    def _parse_compound_type(self):
        """
        constructor : NAME fields?
                    | NAME '%' NAME  # shared variant

        sum : constructor ('|' constructor)* generate?

        compound_type : product
                      | sum

        Examples:
          alloc_members =
            List
          | Dict
          | Struct
            generate [bit_set]

          -- color::Red, not color_e::Red or color_i::Red
          color = Red | Green
            generate [integers, no_namespace_suffix]
        """
        if self.cur_token.kind == TokenKind.LParen:
            # If we see a (, it's a product
            return self._parse_product()
        else:
            # Otherwise it's a sum. Look for ConstructorId
            sumlist = []
            while True:
                cons_name = self._match(TokenKind.Name)

                shared_type = None
                fields = None
                if self.cur_token.kind == TokenKind.LParen:
                    fields = self._parse_fields()
                elif self.cur_token.kind == TokenKind.Percent:
                    self._advance()
                    shared_type = self._match(TokenKind.Name)
                else:
                    # Field-less variant, e.g. one arm of a simple sum.
                    pass

                cons = Constructor(cons_name, shared_type, fields)
                sumlist.append(cons)

                if self.cur_token.kind != TokenKind.Pipe:
                    break
                self._advance()
            generate = self._parse_optional_generate()

            # Additional validation
            if generate is not None:
                for g in generate:
                    if g not in _CODE_GEN_OPTIONS:
                        raise ASDLSyntaxError('Invalid code gen option %r' % g,
                                              self.cur_token.lineno)

            if _SumIsSimple(sumlist):
                return SimpleSum(sumlist, generate)
            else:
                return Sum(sumlist, generate)

    def _parse_type_expr(self):
        """One or two params:

        type_params : '[' type_expr ( ',' type_expr )* ']'

        type_expr : NAME type_params? ('?' | '*')?  # allow one suffix

        NAME is validated against Optional, List, Dict afterward
        """
        type_name = self._match(TokenKind.Name)

        # Accept Python-like naming!
        if type_name == 'str':
            type_name = 'string'

        children = []
        if self.cur_token.kind == TokenKind.LBracket:
            self._advance()
            children.append(self._parse_type_expr())
            if self.cur_token.kind == TokenKind.Comma:
                self._advance()
                children.append(self._parse_type_expr())

            self._match(TokenKind.RBracket)

        # Arity check: List/Optional take 1 param, Dict takes 2, anything
        # else takes none.
        if type_name in ('List', 'Optional'):
            if len(children) != 1:
                raise ASDLSyntaxError(
                    'Expected 1 type param to {}'.format(type_name),
                    self.cur_token.lineno)
        elif type_name == 'Dict':
            if len(children) != 2:
                raise ASDLSyntaxError(
                    'Expected 2 type params to {}'.format(type_name),
                    self.cur_token.lineno)
        else:
            if len(children) != 0:
                raise ASDLSyntaxError(
                    'Expected zero type params to {}'.format(type_name),
                    self.cur_token.lineno)

        if len(children):
            typ = ast.ParameterizedType(type_name, children)
        else:
            typ = ast.NamedType(type_name)

        if self.cur_token.kind == TokenKind.Asterisk:
            # string* is equivalent to List[string]
            typ = ast.ParameterizedType('List', [typ])
            self._advance()

        elif self.cur_token.kind == TokenKind.Question:
            # string? is equivalent to Optional[string]
            typ = ast.ParameterizedType('Optional', [typ])
            self._advance()

        return typ

    def _parse_fields(self):
        """
        fields_inner: type_expr NAME ( ',' type_expr NAME )* ','?

        fields : '(' fields_inner? ')'

        Name Quantifier? should be changed to typename.
        """
        fields = []
        self._match(TokenKind.LParen)
        while self.cur_token.kind == TokenKind.Name:
            typ = self._parse_type_expr()
            field_name = self._match(TokenKind.Name)

            fields.append(Field(typ, field_name))

            if self.cur_token.kind == TokenKind.RParen:
                break
            elif self.cur_token.kind == TokenKind.Comma:
                self._advance()

        self._match(TokenKind.RParen)
        return fields

    def _parse_list(self):
        """
        list_inner: NAME ( ',' NAME )* ','?

        list : '[' list_inner? ']'
        """
        generate = []
        self._match(TokenKind.LBracket)
        while self.cur_token.kind == TokenKind.Name:
            name = self._match(TokenKind.Name)

            generate.append(name)

            if self.cur_token.kind == TokenKind.RBracket:
                break
            elif self.cur_token.kind == TokenKind.Comma:
                self._advance()

        self._match(TokenKind.RBracket)
        return generate

    def _parse_optional_generate(self):
        """
        generate : 'generate' list

        Returns the list of option names, or None if 'generate' is absent.
        """
        if self._at_keyword('generate'):
            self._advance()
            return self._parse_list()
        else:
            return None

    def _parse_product(self):
        """Product: fields attributes?"""
        return Product(self._parse_fields())

    def _advance(self):
        """Return current token's value; read next token into self.cur_token.

        self.cur_token becomes None at end of input.
        """
        cur_val = None if self.cur_token is None else self.cur_token.value
        try:
            self.cur_token = next(self._tokenizer)
        except StopIteration:
            self.cur_token = None
        return cur_val

    def _match(self, kind):
        """The 'match' primitive of RD parsers.

        * Verifies that the current token is of the given kind (kind can
          be a tuple, in which case the kind must match one of its members).
        * Returns the value of the current token
        * Reads in the next token

        Args:
          kind: A TokenKind, or a tuple of TokenKind
        """
        # BUG FIX: report truncated input as a syntax error instead of
        # crashing with AttributeError on None.
        if self.cur_token is None:
            raise ASDLSyntaxError('Unexpected end of input')

        # BUG FIX: the docstring promised tuple support (as in the upstream
        # CPython asdl.py parser), but only a single kind was handled.
        if isinstance(kind, tuple):
            matched = self.cur_token.kind in kind
            expected = '/'.join(_TOKEN_STR[k] for k in kind)
        else:
            matched = self.cur_token.kind == kind
            expected = _TOKEN_STR[kind]

        if matched:
            value = self.cur_token.value
            self._advance()
            return value
        else:
            raise ASDLSyntaxError(
                'Expected token {}, got {}'.format(expected,
                                                   self.cur_token.value),
                self.cur_token.lineno)

    def _at_keyword(self, keyword):
        # None-safe so a truncated file doesn't raise AttributeError.
        return (self.cur_token is not None and
                self.cur_token.kind == TokenKind.Keyword and
                self.cur_token.value == keyword)
424
425
# Built-in type names; _ResolveType leaves these unresolved rather than
# looking them up in the module's declarations.
_PRIMITIVE_TYPES = [
    'string',
    'int',
    'uint16',  # used for Token length - should we generalize this?
    'BigInt',
    'float',
    'bool',

    # 'any' is used for value.{BuiltinProc,BuiltinFunc}, to cast from class
    # type
    'any',
]
438
439
def _ResolveType(typ, type_lookup):
    # type: (AST, dict) -> None
    """Recursively attach a 'resolved' field to AST nodes.

    Args:
      typ: an ast.NamedType or ast.ParameterizedType
      type_lookup: maps declared type names to their AST nodes

    Raises:
      ASDLSyntaxError: on an unknown type name, or an invalid Optional
        parameter (primitive other than string, or simple sum).
    """
    if isinstance(typ, ast.NamedType):
        # Primitives are built in; everything else must be declared.
        if typ.name not in _PRIMITIVE_TYPES:
            ast_node = type_lookup.get(typ.name)
            if ast_node is None:
                raise ASDLSyntaxError("Couldn't find type %r" % typ.name)
            typ.resolved = ast_node

    elif isinstance(typ, ast.ParameterizedType):
        for child in typ.children:
            _ResolveType(child, type_lookup)

        if typ.type_name == 'Optional':
            child = typ.children[0]
            if isinstance(child, ast.NamedType):
                if child.name in _PRIMITIVE_TYPES and child.name != 'string':
                    raise ASDLSyntaxError(
                        'Optional primitive type {} not allowed'.format(
                            child.name))

                if child.resolved and isinstance(child.resolved,
                                                 ast.SimpleSum):
                    raise ASDLSyntaxError(
                        'Optional simple sum type {} not allowed'.format(
                            child.name))

    else:
        # BUG FIX: include the offending node, consistent with the
        # AssertionError in _ResolveModule; a bare AssertionError() gave no
        # clue what went wrong.
        raise AssertionError('Unexpected type %r' % typ)
470
471
def _ResolveFields(field_ast_nodes, type_lookup):
    """Resolve the declared type of every field in the given list.

    Args:
      field_ast_nodes: list of ast.Field
      type_lookup: Populated by name resolution
    """
    for f in field_ast_nodes:
        _ResolveType(f.typ, type_lookup)
479
480
def _ResolveModule(module, app_types):
    """Name resolution for NamedType."""
    # The table holds the names fields may be declared with (int, id,
    # word_part, ...). Constructor names never appear in it.
    type_lookup = dict(app_types)

    # We don't actually load imported modules; MyPy / C++ handle that
    # later. A consequence of this is TypeNameHeuristic().
    for use in module.uses:
        for type_name in use.type_names:
            type_lookup[type_name] = use  # type: ast.Use()

    # Two passes are required because types can be mutually recursive, e.g.
    # asdl/arith.asdl.

    # Pass 1: record every declared type.
    for d in module.dfns:
        type_lookup[d.name] = d.value

    # Pass 2: attach NamedType.resolved throughout each declaration.
    for d in module.dfns:
        node = d.value
        if isinstance(node, ast.Product):
            #log('fields %s', node.fields)
            _ResolveFields(node.fields, type_lookup)

        elif isinstance(node, ast.Sum):
            for variant in node.types:
                _ResolveFields(variant.fields, type_lookup)

        else:
            raise AssertionError(node)
513
514
def LoadSchema(f, app_types, verbose=False):
    """Parse and name-resolve an ASDL schema; return its Module AST."""
    parser = ASDLParser()
    schema_ast = parser.parse(f)
    if verbose:
        import sys
        schema_ast.Print(sys.stdout, 0)

    # Make sure all the names are valid
    _ResolveModule(schema_ast, app_types)
    return schema_ast