"""front_end.py: Lexer and parser for the ASDL schema language."""
from __future__ import print_function

import re

from asdl import ast
from asdl.ast import (Use, Module, TypeDecl, SubTypeDecl, Constructor, Field,
                      Sum, SimpleSum, Product, Extern)
from asdl.util import log

from typing import List, Dict, Tuple, IO, Any, cast, TYPE_CHECKING

if TYPE_CHECKING:
    TypeLookup = Dict[str, ast.asdl_type_t]

_ = log

_KEYWORDS = ['use', 'module', 'generate', 'extern']

_TOKENS = [
    ('Keyword', ''),
    ('Name', ''),

    # For operators, the string matters
    ('Equals', '='),
    ('Comma', ','),
    ('Question', '?'),
    ('Pipe', '|'),
    ('Asterisk', '*'),
    ('LParen', '('),
    ('RParen', ')'),
    ('LBrace', '{'),
    ('RBrace', '}'),
    ('Percent', '%'),
    ('LessThan', '<'),  # for subtyping CompoundWord < List[word_part]

    # Oils addition for parameterized types.
    ('LBracket', '['),
    ('RBracket', ']'),

    # - Start with Dict[string, bool].
    # - List[string] is an alias for string*
    #
    # statically typed: Dict and List
    # dynamically typed: dict and list
]

_TOKEN_STR = [name for name, _ in _TOKENS]  # integer -> string like LParen
_TOKEN_INT = {}  # string like '(' -> integer


class TokenKind(object):
    """ASDL tokens.

    TokenKind.LBrace = 9, etc.
    """
    pass


for i, (name, val) in enumerate(_TOKENS):
    setattr(TokenKind, name, i)
    _TOKEN_INT[val] = i

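# A hedged sanity check (not part of the original module): the loop above
# turns each _TOKENS entry into an integer class attribute, and maps operator
# strings back to that integer.
#
#   assert TokenKind.LParen == _TOKEN_INT['(']
#   assert _TOKEN_STR[TokenKind.LParen] == 'LParen'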

class Token(object):

    def __init__(self, kind, value, lineno):
        # type: (int, str, int) -> None
        self.kind = kind
        self.value = value
        self.lineno = lineno


class ASDLSyntaxError(Exception):

    def __init__(self, msg, lineno=None):
        # type: (str, Any) -> None
        self.msg = msg
        self.lineno = lineno or '<unknown>'

    def __str__(self):
        return 'Syntax error on line {0.lineno}: {0.msg}'.format(self)


TOKEN_RE = r'\s*(\w+|--.*|#.*|.)'


def _Tokenize(f):
    # type: (IO) -> Any
    """Tokenize the given buffer.

    Yield Token objects.
    """
    for lineno, line in enumerate(f, 1):
        for m in re.finditer(TOKEN_RE, line.strip()):
            c = m.group(1)
            if c in _KEYWORDS:
                yield Token(TokenKind.Keyword, c, lineno)  # type: ignore

            elif c[0].isalpha() or c[0] == '_':
                yield Token(TokenKind.Name, c, lineno)  # type: ignore

            elif c.startswith('--') or c.startswith('#'):
                # ASDL comments start with --
                # Added # comments like Python and shell
                break

            else:
                # Operators
                try:
                    op_kind = _TOKEN_INT[c]
                except KeyError:
                    raise ASDLSyntaxError('Invalid operator %s' % c, lineno)
                yield Token(op_kind, c, lineno)

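# A hedged usage sketch (not part of the original module): _Tokenize only
# needs an iterable of lines, so an io.StringIO buffer works as input.
#
#   import io
#   buf = io.StringIO('module demo { point = (int x, int y) }\n')
#   for tok in _Tokenize(buf):
#       print(tok.lineno, _TOKEN_STR[tok.kind], tok.value)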

def _SumIsSimple(variant_list):
    """Return True if a sum is simple.

    A sum is simple if its types have no fields, e.g.
        unaryop = Invert | Not | UAdd | USub
    """
    for t in variant_list:
        if t.fields or t.shared_type:
            return False
    return True

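# A hedged illustration (not part of the original module), assuming
# asdl.ast.Constructor takes (name, shared_type, fields) positionally, as in
# _parse_compound_type below:
#
#   simple = [Constructor('Invert', None, None), Constructor('Not', None, None)]
#   compound = [Constructor('Slice', None, [Field(ast.NamedType('int'), 'lower')])]
#   # _SumIsSimple(simple) is True; _SumIsSimple(compound) is False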

_CODE_GEN_OPTIONS = [
    'no_namespace_suffix',  # Id.Foo instead of Id_e.Foo
    'integers',  # integer builtin_i instead of strongly typed builtin_e
    'uint16',  # like integers, but use uint16_t instead
    'bit_set',  # not implemented: 1 << n instead of n

    # probably don't need this
    # 'common_synthetic_field:left_tok',

    # Put this type, and the transitive closure of types it references, in the
    # unique "first class variant" namespace, and generate type reflection.
    'reflect_all_types',

    # Squeeze and Freeze, with the number of bits as an option.  Hm, the
    # headers here still need type reflection.  Probably OK.
    'mirror_all_types:16',
]


class ASDLParser(object):
    """Parser for ASDL files.

    Create, then call the parse method on a buffer containing ASDL. This
    is a simple recursive descent parser that uses _Tokenize for the
    lexing.
    """

    def __init__(self):
        self._tokenizer = None
        self.cur_token = None  # type: Token

    def parse(self, f):
        # type: (IO) -> ast.Module
        """Parse the ASDL in the file and return an AST with a Module root."""
        self._tokenizer = _Tokenize(f)
        self._advance()
        return self._parse_module()

    def _parse_module(self):
        """
        type_decl : NAME '=' compound_type
        module    : 'module' NAME '{'
                    use*
                    extern*
                    type_decl*
                    '}'

        We added:
        - use for imports
        - generate on sum types
        """
        if not self._at_keyword('module'):
            raise ASDLSyntaxError(
                'Expected "module" (found {})'.format(self.cur_token.value),
                self.cur_token.lineno)
        self._advance()
        name = self._match(TokenKind.Name)
        self._match(TokenKind.LBrace)

        uses = []
        while self._at_keyword('use'):
            uses.append(self._parse_use())

        externs = []
        while self._at_keyword('extern'):
            externs.append(self._parse_extern())

        defs = []
        while self.cur_token.kind == TokenKind.Name:
            typename = self._advance()
            if self.cur_token.kind == TokenKind.Equals:
                self._advance()
                type_ = self._parse_compound_type()
                defs.append(TypeDecl(typename, type_))
            elif self.cur_token.kind == TokenKind.LessThan:
                self._advance()
                type_ = self._parse_type_expr()
                defs.append(SubTypeDecl(typename, type_))
            else:
                raise ASDLSyntaxError(
                    'Expected = or < after type name (found {})'.format(
                        self.cur_token.value), self.cur_token.lineno)

        self._match(TokenKind.RBrace)
        return Module(name, uses, externs, defs)

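    # A hedged illustration (not in the original source): a hypothetical schema
    # accepted by the grammar above, exercising 'use', 'extern', '=' and '<'
    # declarations.
    #
    #   module demo {
    #     use frontend syntax { Token }
    #     extern [ _Callable ]
    #     color = Red | Green
    #     point = (int x, int y)
    #     Points < List[point]
    #   }
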
    def _parse_use(self):
        """
        use: 'use' NAME+ '{' NAME+ '}'

        example: use frontend syntax { Token }

        This means frontend/syntax.asdl.h :: Token
        """
        self._advance()  # past 'use'
        module_parts = []
        while self.cur_token.kind == TokenKind.Name:
            part = self._advance()
            module_parts.append(part)

        self._match(TokenKind.LBrace)

        type_names = []
        while self.cur_token.kind == TokenKind.Name:
            t = self._advance()
            type_names.append(t)
            if self.cur_token.kind == TokenKind.RParen:
                break
            elif self.cur_token.kind == TokenKind.Comma:
                self._advance()

        self._match(TokenKind.RBrace)
        #print('MOD %s' % module_parts)
        return Use(module_parts, type_names)

    def _parse_extern(self):
        """
        extern: 'extern' '[' NAME+ ']'

        Examples:
            extern [ _Builtin ]
            extern [ _Callable ]
        """
        self._advance()  # past 'extern'

        self._match(TokenKind.LBracket)

        # At least one name
        names = [self._match(TokenKind.Name)]

        while self.cur_token.kind == TokenKind.Name:
            names.append(self._advance())

        self._match(TokenKind.RBracket)

        return Extern(names)

    def _parse_compound_type(self):
        """
        constructor : NAME fields?
                    | NAME '%' NAME  # shared variant

        sum : constructor ('|' constructor)* generate?

        compound_type : product
                      | sum

        Examples:
          alloc_members =
            List
          | Dict
          | Struct
          generate [bit_set]

          -- color::Red, not color_e::Red or color_i::Red
          color = Red | Green
          generate [integers, no_namespace_suffix]
        """
        if self.cur_token.kind == TokenKind.LParen:
            # If we see a (, it's a product
            return self._parse_product()
        else:
            # Otherwise it's a sum. Look for ConstructorId
            sumlist = []
            while True:
                cons_name = self._match(TokenKind.Name)

                shared_type = None
                fields = None
                if self.cur_token.kind == TokenKind.LParen:
                    fields = self._parse_fields()
                elif self.cur_token.kind == TokenKind.Percent:
                    self._advance()
                    shared_type = self._match(TokenKind.Name)
                else:
                    pass

                cons = Constructor(cons_name, shared_type, fields)
                sumlist.append(cons)

                if self.cur_token.kind != TokenKind.Pipe:
                    break
                self._advance()
            generate = self._parse_optional_generate()

            # Additional validation
            if generate is not None:
                for g in generate:
                    if g not in _CODE_GEN_OPTIONS:
                        raise ASDLSyntaxError('Invalid code gen option %r' % g,
                                              self.cur_token.lineno)

            if _SumIsSimple(sumlist):
                return SimpleSum(sumlist, generate)
            else:
                return Sum(sumlist, generate)

    def _parse_type_expr(self):
        """One or two params:

        type_params : '[' type_expr ( ',' type_expr )* ']'

        type_expr : NAME type_params? ('?' | '*')?  # allow one suffix

        NAME is validated against Optional, List, Dict afterward
        """
        type_name = self._match(TokenKind.Name)

        # Accept Python-like naming!
        if type_name == 'str':
            type_name = 'string'

        children = []
        if self.cur_token.kind == TokenKind.LBracket:
            self._advance()
            children.append(self._parse_type_expr())
            if self.cur_token.kind == TokenKind.Comma:
                self._advance()
                children.append(self._parse_type_expr())

            self._match(TokenKind.RBracket)

        if type_name in ('List', 'Optional'):
            if len(children) != 1:
                raise ASDLSyntaxError(
                    'Expected 1 type param to {}'.format(type_name),
                    self.cur_token.lineno)
        elif type_name == 'Dict':
            if len(children) != 2:
                raise ASDLSyntaxError(
                    'Expected 2 type params to {}'.format(type_name),
                    self.cur_token.lineno)
        else:
            if len(children) != 0:
                raise ASDLSyntaxError(
                    'Expected zero type params to {}'.format(type_name),
                    self.cur_token.lineno)

        if len(children):
            typ = ast.ParameterizedType(type_name, children)
        else:
            typ = ast.NamedType(type_name)

        if self.cur_token.kind == TokenKind.Asterisk:
            # string* is equivalent to List[string]
            typ = ast.ParameterizedType('List', [typ])
            self._advance()

        elif self.cur_token.kind == TokenKind.Question:
            # string? is equivalent to Optional[string]
            typ = ast.ParameterizedType('Optional', [typ])
            self._advance()

        return typ

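    # A hedged illustration (not in the original source) of what this method
    # returns for a few field type spellings:
    #
    #   string          -> NamedType('string')
    #   string*         -> ParameterizedType('List', [NamedType('string')])
    #   Token?          -> ParameterizedType('Optional', [NamedType('Token')])
    #   Dict[str, int]  -> ParameterizedType('Dict',
    #                          [NamedType('string'), NamedType('int')])
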
    def _parse_fields(self):
        """
        fields_inner: type_expr NAME ( ',' type_expr NAME )* ','?

        fields : '(' fields_inner? ')'

        Name Quantifier? should be changed to typename.
        """
        fields = []
        self._match(TokenKind.LParen)
        while self.cur_token.kind == TokenKind.Name:
            typ = self._parse_type_expr()
            field_name = self._match(TokenKind.Name)

            fields.append(Field(typ, field_name))

            if self.cur_token.kind == TokenKind.RParen:
                break
            elif self.cur_token.kind == TokenKind.Comma:
                self._advance()

        self._match(TokenKind.RParen)
        return fields

    def _parse_list(self):
        """
        list_inner: NAME ( ',' NAME )* ','?

        list : '[' list_inner? ']'
        """
        generate = []
        self._match(TokenKind.LBracket)
        while self.cur_token.kind == TokenKind.Name:
            name = self._match(TokenKind.Name)

            generate.append(name)

            if self.cur_token.kind == TokenKind.RBracket:
                break
            elif self.cur_token.kind == TokenKind.Comma:
                self._advance()

        self._match(TokenKind.RBracket)
        return generate

    def _parse_optional_generate(self):
        """
        generate : 'generate' list
        """
        if self._at_keyword('generate'):
            self._advance()
            return self._parse_list()
        else:
            return None

    def _parse_product(self):
        """Product: fields attributes?"""
        return Product(self._parse_fields())

    def _advance(self):
        """Return the current token's value; read the next token into self.cur_token."""
        cur_val = None if self.cur_token is None else self.cur_token.value
        try:
            self.cur_token = next(self._tokenizer)
        except StopIteration:
            self.cur_token = None
        return cur_val

    def _match(self, kind):
        """The 'match' primitive of RD parsers.

        * Verifies that the current token is of the given kind (kind can
          be a tuple, in which case the kind must match one of its members).
        * Returns the value of the current token
        * Reads in the next token

        Args:
          kind: A TokenKind, or a tuple of TokenKind
        """
        if self.cur_token.kind == kind:
            value = self.cur_token.value
            self._advance()
            return value
        else:
            raise ASDLSyntaxError(
                'Expected token {}, got {}'.format(_TOKEN_STR[kind],
                                                   self.cur_token.value),
                self.cur_token.lineno)

    def _at_keyword(self, keyword):
        return (self.cur_token.kind == TokenKind.Keyword and
                self.cur_token.value == keyword)


_PRIMITIVE_TYPES = [
    'string',
    'int',
    'uint16',  # used for Token length - should we generalize this?
    'BigInt',
    'float',
    'bool',

    # 'any' is used for value.{BuiltinProc,BuiltinFunc}, to cast from class
    # type
    'any',
]


def _ResolveType(typ, type_lookup):
    # type: (ast.type_expr_t, TypeLookup) -> None
    """Recursively attach a 'resolved' field to AST nodes."""
    if isinstance(typ, ast.NamedType):
        if typ.name not in _PRIMITIVE_TYPES:
            ast_node = type_lookup.get(typ.name)
            if ast_node is None:
                raise ASDLSyntaxError("Couldn't find type %r" % typ.name)
            typ.resolved = ast_node

    elif isinstance(typ, ast.ParameterizedType):
        for child in typ.children:
            _ResolveType(child, type_lookup)

        if typ.type_name == 'Optional':
            child = typ.children[0]
            if isinstance(child, ast.NamedType):
                if child.name in _PRIMITIVE_TYPES and child.name != 'string':
                    raise ASDLSyntaxError(
                        'Optional primitive type {} not allowed'.format(
                            child.name))

                if child.resolved and isinstance(child.resolved,
                                                 ast.SimpleSum):
                    raise ASDLSyntaxError(
                        'Optional simple sum type {} not allowed'.format(
                            child.name))

    else:
        raise AssertionError()

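# A hedged sketch (not in the original source): resolving a single NamedType by
# hand, assuming the schema declared a product type named 'point'.
#
#   field_type = ast.NamedType('point')
#   _ResolveType(field_type, type_lookup)  # type_lookup as built by LoadSchema
#   # field_type.resolved now refers to the ast.Product node stored for 'point'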

def _ResolveFields(field_ast_nodes, type_lookup):
    # type: (List[Field], TypeLookup) -> None
    """
    Args:
      type_lookup: Populated by name resolution
    """
    for field in field_ast_nodes:
        _ResolveType(field.typ, type_lookup)

def _ResolveModule(module, type_lookup):
    # type: (Module, TypeLookup) -> None
    """Name resolution for NamedType."""

    # Note: we don't actually load the type, and instead leave that to MyPy /
    # C++.  A consequence of this is TypeNameHeuristic().
    for u in module.uses:
        for type_name in u.type_names:
            type_lookup[type_name] = u

    # NOTE: We need two passes because types can be mutually recursive, e.g.
    # asdl/arith.asdl.

    # First pass: collect declared types and make entries for them.
    for ex in module.externs:
        last = ex.names[-1]  # e.g. _Callable
        if last in type_lookup:
            raise ASDLSyntaxError('Type %r was already defined' % last)
        type_lookup[last] = ex

    for d in module.dfns:

        if isinstance(d, SubTypeDecl):
            if d.name in type_lookup:
                raise ASDLSyntaxError('Type %r was already defined' % d.name)

            # e.g. CompoundWord < List[str]
            type_lookup[d.name] = ast.DummyType()  # this value isn't used?

        elif isinstance(d, TypeDecl):
            if d.name in type_lookup:
                raise ASDLSyntaxError('Type %r was already defined' % d.name)

            # e.g. Token = (str a)
            type_lookup[d.name] = d.value

        else:
            raise AssertionError()

    # Second pass: add NamedType.resolved field
    for d in module.dfns:
        if isinstance(d, SubTypeDecl):  # no fields
            _ResolveType(d.base_class, type_lookup)
            continue

        d = cast(ast.TypeDecl, d)

        ast_node = d.value
        if isinstance(ast_node, ast.Product):
            #log('fields %s', ast_node.fields)
            _ResolveFields(ast_node.fields, type_lookup)
            continue

        if isinstance(ast_node, ast.Sum):
            for cons in ast_node.types:
                _ResolveFields(cons.fields, type_lookup)
            continue

        raise AssertionError(ast_node)


def LoadSchema(f, verbose=False):
    # type: (IO, bool) -> Tuple[ast.Module, TypeLookup]
    """Returns an AST for the schema."""
    p = ASDLParser()
    schema_ast = p.parse(f)
    if verbose:
        import sys
        schema_ast.Print(sys.stdout, 0)

    # Make sure all the names are valid
    type_lookup = {}  # type: TypeLookup
    _ResolveModule(schema_ast, type_lookup)
    return schema_ast, type_lookup
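
# A hedged end-to-end sketch (not part of the original module): LoadSchema
# accepts any file-like object, so a schema can also be parsed from a string.
#
#   import io
#   SCHEMA = '''
#   module demo {
#     color = Red | Green
#     point = (int x, int y)
#     edge = (point a, point b)
#   }
#   '''
#   schema_ast, type_lookup = LoadSchema(io.StringIO(SCHEMA))
#   print(sorted(type_lookup.keys()))  # ['color', 'edge', 'point']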