1 | """front_end.py: Lexer and parser for the ASDL schema language."""
|
2 | from __future__ import print_function
|
3 |
|
4 | import re
|
5 |
|
6 | from asdl import ast
|
7 | from asdl.ast import (AST, Use, Module, TypeDecl, SubTypeDecl, Constructor,
|
8 | Field, Sum, SimpleSum, Product)
|
9 | from asdl.util import log
|
10 |
|
11 | # type checking not turned on yet
|
12 | from typing import Dict, Any
|
13 |
|
14 | _ = log
|
15 |
|
16 | _KEYWORDS = ['use', 'module', 'generate']
|
17 |
|
18 | _TOKENS = [
|
19 | ('Keyword', ''),
|
20 | ('Name', ''),
|
21 |
|
22 | # For operators, the string matters
|
23 | ('Equals', '='),
|
24 | ('Comma', ','),
|
25 | ('Question', '?'),
|
26 | ('Pipe', '|'),
|
27 | ('Asterisk', '*'),
|
28 | ('LParen', '('),
|
29 | ('RParen', ')'),
|
30 | ('LBrace', '{'),
|
31 | ('RBrace', '}'),
|
32 | ('Percent', '%'),
|
33 | ('LessThan', '<'), # for subtyping CompoundWord < List[word_part]
|
34 |
|
35 | # Oils addition for parameterized types.
|
36 | ('LBracket', '['),
|
37 | ('RBracket', ']'),
|
38 |
|
39 | # - Start with Dict[string, bool].
|
40 | # - List[string] is an alias for string*
|
41 | #
|
42 | # statically typed: Dict and List
|
43 | # dynamically typed: dict and list
|
44 | ]
|
45 |
|
46 | _TOKEN_STR = [name for name, _ in _TOKENS] # integer -> string like LParen
|
47 | _TOKEN_INT = {} # string like '(' -> integer
|
48 |
|
49 |
|
50 | class TokenKind(object):
|
51 | """ASDL tokens.
|
52 |
|
53 | TokenKind.LBrace = 5, etc.
|
54 | """
|
55 | pass
|
56 |
|
57 |
|
58 | for i, (name, val) in enumerate(_TOKENS):
|
59 | setattr(TokenKind, name, i)
|
60 | _TOKEN_INT[val] = i
|
61 |
|
62 |
|
63 | class Token(object):
|
64 |
|
65 | def __init__(self, kind, value, lineno):
|
66 | self.kind = kind
|
67 | self.value = value
|
68 | self.lineno = lineno
|
69 |
|
70 |
|
71 | class ASDLSyntaxError(Exception):
|
72 |
|
73 | def __init__(self, msg, lineno=None):
|
74 | self.msg = msg
|
75 | self.lineno = lineno or '<unknown>'
|
76 |
|
77 | def __str__(self):
|
78 | return 'Syntax error on line {0.lineno}: {0.msg}'.format(self)
|
79 |
|
80 |
|
81 | def _Tokenize(f):
|
82 | """Tokenize the given buffer.
|
83 |
|
84 | Yield Token objects.
|
85 | """
|
86 | for lineno, line in enumerate(f, 1):
|
87 | for m in re.finditer(r'\s*(\w+|--.*|#.*|.)', line.strip()):
|
88 | c = m.group(1)
|
89 | if c in _KEYWORDS:
|
90 | yield Token(TokenKind.Keyword, c, lineno)
|
91 |
|
92 | elif c[0].isalpha():
|
93 | yield Token(TokenKind.Name, c, lineno)
|
94 |
|
95 | elif c.startswith('--') or c.startswith('#'):
|
96 | # ASDL comments start with --
|
97 | # Added # comments like Python and shell
|
98 | break
|
99 |
|
100 | else:
|
101 | # Operators
|
102 | try:
|
103 | op_kind = _TOKEN_INT[c]
|
104 | except KeyError:
|
105 | raise ASDLSyntaxError('Invalid operator %s' % c, lineno)
|
106 | yield Token(op_kind, c, lineno)
|
107 |
|
108 |
|
109 | def _SumIsSimple(variant_list):
|
110 | """Return True if a sum is a simple.
|
111 |
|
112 | A sum is simple if its types have no fields, e.g. unaryop = Invert |
|
113 | Not | UAdd | USub
|
114 | """
|
115 | for t in variant_list:
|
116 | if t.fields or t.shared_type:
|
117 | return False
|
118 | return True
|
119 |
|
120 |
|
121 | _CODE_GEN_OPTIONS = [
|
122 | 'no_namespace_suffix', # Id.Foo instead of Id_e.Foo
|
123 | 'integers', # integer builtin_i instead of strongly typed builtin_e
|
124 | 'uint16', # like integers, but use uint16_t instead
|
125 | 'bit_set', # not implemented: 1 << n instead of n
|
126 |
|
127 | # probably don't need this
|
128 | # 'common_synthetic_field:left_tok',
|
129 |
|
130 | # Put this type, and transitive closure of types it references, in the
|
131 | # unique "first class variant" namespace, and generate type reflection.
|
132 | 'reflect_all_types',
|
133 |
|
134 | # Squeeze and Freeze, with the number of bits as a option Hm the headers
|
135 | # here still need type reflection. Probably OK.
|
136 | 'mirror_all_types:16',
|
137 | ]
|
138 |
|
139 |
|
140 | class ASDLParser(object):
|
141 | """Parser for ASDL files.
|
142 |
|
143 | Create, then call the parse method on a buffer containing ASDL. This
|
144 | is a simple recursive descent parser that uses _Tokenize for the
|
145 | lexing.
|
146 | """
|
147 |
|
148 | def __init__(self):
|
149 | self._tokenizer = None
|
150 | self.cur_token = None
|
151 |
|
152 | def parse(self, f):
|
153 | """Parse the ASDL in the file and return an AST with a Module root."""
|
154 | self._tokenizer = _Tokenize(f)
|
155 | self._advance()
|
156 | return self._parse_module()
|
157 |
|
158 | def _parse_module(self):
|
159 | """
|
160 | type_decl : NAME '=' compound_type
|
161 | module : 'module' NAME '{' use* type_decl* '}'
|
162 |
|
163 | We added:
|
164 | - use for imports
|
165 | - generate on sum types
|
166 | """
|
167 | if not self._at_keyword('module'):
|
168 | raise ASDLSyntaxError(
|
169 | 'Expected "module" (found {})'.format(self.cur_token.value),
|
170 | self.cur_token.lineno)
|
171 | self._advance()
|
172 | name = self._match(TokenKind.Name)
|
173 | self._match(TokenKind.LBrace)
|
174 |
|
175 | uses = []
|
176 | while self._at_keyword('use'):
|
177 | uses.append(self._parse_use())
|
178 |
|
179 | defs = []
|
180 | while self.cur_token.kind == TokenKind.Name:
|
181 | typename = self._advance()
|
182 | if self.cur_token.kind == TokenKind.Equals:
|
183 | self._advance()
|
184 | type_ = self._parse_compound_type()
|
185 | defs.append(TypeDecl(typename, type_))
|
186 | elif self.cur_token.kind == TokenKind.LessThan:
|
187 | self._advance()
|
188 | type_ = self._parse_type_expr()
|
189 | defs.append(SubTypeDecl(typename, type_))
|
190 | else:
|
191 | raise ASDLSyntaxError(
|
192 | 'Expected = or < after type name (found {})'.format(
|
193 | self.cur_token.value), self.cur_token.lineno)
|
194 |
|
195 | self._match(TokenKind.RBrace)
|
196 | return Module(name, uses, defs)
|
197 |
|
198 | def _parse_use(self):
|
199 | """
|
200 | use: 'use' NAME+ '{' NAME+ '}'
|
201 |
|
202 | example: use frontend syntax { Token }
|
203 |
|
204 | This means frontend/syntax.asdl.h :: Token
|
205 | """
|
206 | self._advance() # past 'use'
|
207 | module_parts = []
|
208 | while self.cur_token.kind == TokenKind.Name:
|
209 | part = self._advance()
|
210 | module_parts.append(part)
|
211 |
|
212 | self._match(TokenKind.LBrace)
|
213 |
|
214 | type_names = []
|
215 | while self.cur_token.kind == TokenKind.Name:
|
216 | t = self._advance()
|
217 | type_names.append(t)
|
218 | if self.cur_token.kind == TokenKind.RParen:
|
219 | break
|
220 | elif self.cur_token.kind == TokenKind.Comma:
|
221 | self._advance()
|
222 |
|
223 | self._match(TokenKind.RBrace)
|
224 | #print('MOD %s' % module_parts)
|
225 | return Use(module_parts, type_names)
|
226 |
|
227 | def _parse_compound_type(self):
|
228 | """
|
229 | constructor : NAME fields?
|
230 | | NAME '%' NAME # shared variant
|
231 |
|
232 | sum : constructor ('|' constructor)* generate?
|
233 |
|
234 | compound_type : product
|
235 | | sum
|
236 |
|
237 | Examples:
|
238 | alloc_members =
|
239 | List
|
240 | | Dict
|
241 | | Struct
|
242 | generate [bit_set]
|
243 |
|
244 | -- color::Red, not color_e::Red or color_i::Red
|
245 | color = Red | Green
|
246 | generate [integers, no_sum_suffix]
|
247 | """
|
248 | if self.cur_token.kind == TokenKind.LParen:
|
249 | # If we see a (, it's a product
|
250 | return self._parse_product()
|
251 | else:
|
252 | # Otherwise it's a sum. Look for ConstructorId
|
253 | sumlist = []
|
254 | while True:
|
255 | cons_name = self._match(TokenKind.Name)
|
256 |
|
257 | shared_type = None
|
258 | fields = None
|
259 | if self.cur_token.kind == TokenKind.LParen:
|
260 | fields = self._parse_fields()
|
261 | elif self.cur_token.kind == TokenKind.Percent:
|
262 | self._advance()
|
263 | shared_type = self._match(TokenKind.Name)
|
264 | else:
|
265 | pass
|
266 |
|
267 | cons = Constructor(cons_name, shared_type, fields)
|
268 | sumlist.append(cons)
|
269 |
|
270 | if self.cur_token.kind != TokenKind.Pipe:
|
271 | break
|
272 | self._advance()
|
273 | generate = self._parse_optional_generate()
|
274 |
|
275 | # Additional validation
|
276 | if generate is not None:
|
277 | for g in generate:
|
278 | if g not in _CODE_GEN_OPTIONS:
|
279 | raise ASDLSyntaxError('Invalid code gen option %r' % g,
|
280 | self.cur_token.lineno)
|
281 |
|
282 | if _SumIsSimple(sumlist):
|
283 | return SimpleSum(sumlist, generate)
|
284 | else:
|
285 | return Sum(sumlist, generate)
|
286 |
|
287 | def _parse_type_expr(self):
|
288 | """One or two params:
|
289 |
|
290 | type_params : '[' type_expr ( ',' type_expr )* ']'
|
291 |
|
292 | type_expr : NAME type_params? (''?' | '*')? # allow one suffix
|
293 |
|
294 | NAME is validated against Optional, List, Dict afterward
|
295 | """
|
296 | type_name = self._match(TokenKind.Name)
|
297 |
|
298 | # Accept Python-like naming!
|
299 | if type_name == 'str':
|
300 | type_name = 'string'
|
301 |
|
302 | children = []
|
303 | if self.cur_token.kind == TokenKind.LBracket:
|
304 | self._advance()
|
305 | children.append(self._parse_type_expr())
|
306 | if self.cur_token.kind == TokenKind.Comma:
|
307 | self._advance()
|
308 | children.append(self._parse_type_expr())
|
309 |
|
310 | self._match(TokenKind.RBracket)
|
311 |
|
312 | if type_name in ('List', 'Optional'):
|
313 | if len(children) != 1:
|
314 | raise ASDLSyntaxError(
|
315 | 'Expected 1 type param to {}'.format(type_name),
|
316 | self.cur_token.lineno)
|
317 | elif type_name == 'Dict':
|
318 | if len(children) != 2:
|
319 | raise ASDLSyntaxError(
|
320 | 'Expected 2 type params to {}'.format(type_name),
|
321 | self.cur_token.lineno)
|
322 | else:
|
323 | if len(children) != 0:
|
324 | raise ASDLSyntaxError(
|
325 | 'Expected zero type params to {}'.format(type_name),
|
326 | self.cur_token.lineno)
|
327 |
|
328 | if len(children):
|
329 | typ = ast.ParameterizedType(type_name, children)
|
330 | else:
|
331 | typ = ast.NamedType(type_name)
|
332 |
|
333 | if self.cur_token.kind == TokenKind.Asterisk:
|
334 | # string* is equivalent to List[string]
|
335 | typ = ast.ParameterizedType('List', [typ])
|
336 | self._advance()
|
337 |
|
338 | elif self.cur_token.kind == TokenKind.Question:
|
339 | # string* is equivalent to Optional[string]
|
340 | typ = ast.ParameterizedType('Optional', [typ])
|
341 | self._advance()
|
342 |
|
343 | return typ
|
344 |
|
345 | def _parse_fields(self):
|
346 | """
|
347 | fields_inner: type_expr NAME ( ',' type_expr NAME )* ','?
|
348 |
|
349 | fields : '(' fields_inner? ')'
|
350 |
|
351 | Name Quantifier? should be changed to typename.
|
352 | """
|
353 | fields = []
|
354 | self._match(TokenKind.LParen)
|
355 | while self.cur_token.kind == TokenKind.Name:
|
356 | typ = self._parse_type_expr()
|
357 | field_name = self._match(TokenKind.Name)
|
358 |
|
359 | fields.append(Field(typ, field_name))
|
360 |
|
361 | if self.cur_token.kind == TokenKind.RParen:
|
362 | break
|
363 | elif self.cur_token.kind == TokenKind.Comma:
|
364 | self._advance()
|
365 |
|
366 | self._match(TokenKind.RParen)
|
367 | return fields
|
368 |
|
369 | def _parse_list(self):
|
370 | """
|
371 | list_inner: NAME ( ',' NAME )* ','?
|
372 |
|
373 | list : '[' list_inner? ']'
|
374 | """
|
375 | generate = []
|
376 | self._match(TokenKind.LBracket)
|
377 | while self.cur_token.kind == TokenKind.Name:
|
378 | name = self._match(TokenKind.Name)
|
379 |
|
380 | generate.append(name)
|
381 |
|
382 | if self.cur_token.kind == TokenKind.RBracket:
|
383 | break
|
384 | elif self.cur_token.kind == TokenKind.Comma:
|
385 | self._advance()
|
386 |
|
387 | self._match(TokenKind.RBracket)
|
388 | return generate
|
389 |
|
390 | def _parse_optional_generate(self):
|
391 | """
|
392 | generate : 'generate' list
|
393 | """
|
394 | if self._at_keyword('generate'):
|
395 | self._advance()
|
396 | return self._parse_list()
|
397 | else:
|
398 | return None
|
399 |
|
400 | def _parse_product(self):
|
401 | """Product: fields attributes?"""
|
402 | return Product(self._parse_fields())
|
403 |
|
404 | def _advance(self):
|
405 | """Return current token; read next token into self.cur_token."""
|
406 | cur_val = None if self.cur_token is None else self.cur_token.value
|
407 | try:
|
408 | self.cur_token = next(self._tokenizer)
|
409 | except StopIteration:
|
410 | self.cur_token = None
|
411 | return cur_val
|
412 |
|
413 | def _match(self, kind):
|
414 | """The 'match' primitive of RD parsers.
|
415 |
|
416 | * Verifies that the current token is of the given kind (kind can
|
417 | be a tuple, in which the kind must match one of its members).
|
418 | * Returns the value of the current token
|
419 | * Reads in the next token
|
420 |
|
421 | Args:
|
422 | kind: A TokenKind, or a tuple of TokenKind
|
423 | """
|
424 | if self.cur_token.kind == kind:
|
425 | value = self.cur_token.value
|
426 | self._advance()
|
427 | return value
|
428 | else:
|
429 | raise ASDLSyntaxError(
|
430 | 'Expected token {}, got {}'.format(_TOKEN_STR[kind],
|
431 | self.cur_token.value),
|
432 | self.cur_token.lineno)
|
433 |
|
434 | def _at_keyword(self, keyword):
|
435 | return (self.cur_token.kind == TokenKind.Keyword and
|
436 | self.cur_token.value == keyword)
|
437 |
|
438 |
|
439 | _PRIMITIVE_TYPES = [
|
440 | 'string',
|
441 | 'int',
|
442 | 'uint16', # used for Token length - should we generalize this?
|
443 | 'BigInt',
|
444 | 'float',
|
445 | 'bool',
|
446 |
|
447 | # 'any' is used for value.{BuiltinProc,BuiltinFunc}, to cast from class
|
448 | # type
|
449 | 'any',
|
450 | ]
|
451 |
|
452 |
|
453 | def _ResolveType(typ, type_lookup):
|
454 | # type: (AST, Dict[str, Any]) -> None
|
455 | """Recursively attach a 'resolved' field to AST nodes."""
|
456 | if isinstance(typ, ast.NamedType):
|
457 | if typ.name not in _PRIMITIVE_TYPES:
|
458 | ast_node = type_lookup.get(typ.name)
|
459 | if ast_node is None:
|
460 | raise ASDLSyntaxError("Couldn't find type %r" % typ.name)
|
461 | typ.resolved = ast_node
|
462 |
|
463 | elif isinstance(typ, ast.ParameterizedType):
|
464 | for child in typ.children:
|
465 | _ResolveType(child, type_lookup)
|
466 |
|
467 | if typ.type_name == 'Optional':
|
468 | child = typ.children[0]
|
469 | if isinstance(child, ast.NamedType):
|
470 | if child.name in _PRIMITIVE_TYPES and child.name != 'string':
|
471 | raise ASDLSyntaxError(
|
472 | 'Optional primitive type {} not allowed'.format(
|
473 | child.name))
|
474 |
|
475 | if child.resolved and isinstance(child.resolved,
|
476 | ast.SimpleSum):
|
477 | raise ASDLSyntaxError(
|
478 | 'Optional simple sum type {} not allowed'.format(
|
479 | child.name))
|
480 |
|
481 | else:
|
482 | raise AssertionError()
|
483 |
|
484 |
|
485 | def _ResolveFields(field_ast_nodes, type_lookup):
|
486 | """
|
487 | Args:
|
488 | type_lookup: Populated by name resolution
|
489 | """
|
490 | for field in field_ast_nodes:
|
491 | _ResolveType(field.typ, type_lookup)
|
492 |
|
493 |
|
494 | def _ResolveModule(module, app_types):
|
495 | """Name resolution for NamedType."""
|
496 | # Types that fields are declared with: int, id, word_part, etc.
|
497 | # Fields are NOT declared with Constructor names.
|
498 | type_lookup = dict(app_types)
|
499 |
|
500 | # Note: we don't actually load the type, and instead leave that to MyPy /
|
501 | # C++. A consequence of this is TypeNameHeuristic().
|
502 | for u in module.uses:
|
503 | for type_name in u.type_names:
|
504 | type_lookup[type_name] = u # type: ast.Use()
|
505 |
|
506 | # NOTE: We need two passes because types can be mutually recursive, e.g.
|
507 | # asdl/arith.asdl.
|
508 |
|
509 | # First pass: collect declared types and make entries for them.
|
510 | for d in module.dfns:
|
511 | if d.name in type_lookup:
|
512 | raise ASDLSyntaxError('Type %r was already defined' % d.name)
|
513 |
|
514 | if isinstance(d, SubTypeDecl):
|
515 | # e.g. CompoundWord < List[str]
|
516 | type_lookup[d.name] = d.base_class
|
517 | else:
|
518 | # e.g. Token = (str a)
|
519 | type_lookup[d.name] = d.value
|
520 |
|
521 | # Second pass: add NamedType.resolved field
|
522 | for d in module.dfns:
|
523 | if isinstance(d, SubTypeDecl): # no fields
|
524 | _ResolveType(d.base_class, type_lookup)
|
525 | continue
|
526 |
|
527 | ast_node = d.value
|
528 | if isinstance(ast_node, ast.Product):
|
529 | #log('fields %s', ast_node.fields)
|
530 | _ResolveFields(ast_node.fields, type_lookup)
|
531 | continue
|
532 |
|
533 | if isinstance(ast_node, ast.Sum):
|
534 | for cons in ast_node.types:
|
535 | _ResolveFields(cons.fields, type_lookup)
|
536 | continue
|
537 |
|
538 | raise AssertionError(ast_node)
|
539 |
|
540 |
|
541 | def LoadSchema(f, app_types, verbose=False):
|
542 | """Returns an AST for the schema."""
|
543 | p = ASDLParser()
|
544 | schema_ast = p.parse(f)
|
545 | if verbose:
|
546 | import sys
|
547 | schema_ast.Print(sys.stdout, 0)
|
548 |
|
549 | # Make sure all the names are valid
|
550 | _ResolveModule(schema_ast, app_types)
|
551 | return schema_ast
|