1"""front_end.py: Lexer and parser for the ASDL schema language."""
2from __future__ import print_function
3
4import re
5
6from asdl import ast
7from asdl.ast import (AST, Use, Module, TypeDecl, SubTypeDecl, Constructor,
8 Field, Sum, SimpleSum, Product)
9from asdl.util import log
10
11# type checking not turned on yet
12from typing import Dict, Any
13
14_ = log
15
16_KEYWORDS = ['use', 'module', 'generate']
17
_TOKENS = [
    ('Keyword', ''),
    ('Name', ''),

    # For operators, the string matters
    ('Equals', '='),
    ('Comma', ','),
    ('Question', '?'),
    ('Pipe', '|'),
    ('Asterisk', '*'),
    ('LParen', '('),
    ('RParen', ')'),
    ('LBrace', '{'),
    ('RBrace', '}'),
    ('Percent', '%'),
    ('LessThan', '<'),  # for subtyping CompoundWord < List[word_part]

    # Oils addition for parameterized types.
    ('LBracket', '['),
    ('RBracket', ']'),

    # - Start with Dict[string, bool].
    # - List[string] is an alias for string*
    #
    # statically typed: Dict and List
    # dynamically typed: dict and list
]

_TOKEN_STR = [name for name, _ in _TOKENS]  # integer -> string like LParen
_TOKEN_INT = {}  # string like '(' -> integer


class TokenKind(object):
    """ASDL tokens.

    TokenKind.LBrace = 9, etc.
    """
    pass


for i, (name, val) in enumerate(_TOKENS):
    setattr(TokenKind, name, i)
    _TOKEN_INT[val] = i
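
# For example, TokenKind.Equals is now an integer index into _TOKENS, with
# _TOKEN_STR[TokenKind.Equals] == 'Equals' and _TOKEN_INT['='] == TokenKind.Equals.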


class Token(object):

    def __init__(self, kind, value, lineno):
        self.kind = kind
        self.value = value
        self.lineno = lineno


class ASDLSyntaxError(Exception):

    def __init__(self, msg, lineno=None):
        self.msg = msg
        self.lineno = lineno or '<unknown>'

    def __str__(self):
        return 'Syntax error on line {0.lineno}: {0.msg}'.format(self)


def _Tokenize(f):
    """Tokenize the given buffer.

    Yield Token objects.
    """
    for lineno, line in enumerate(f, 1):
        for m in re.finditer(r'\s*(\w+|--.*|#.*|.)', line.strip()):
            c = m.group(1)
            if c in _KEYWORDS:
                yield Token(TokenKind.Keyword, c, lineno)

            elif c[0].isalpha():
                yield Token(TokenKind.Name, c, lineno)

            elif c.startswith('--') or c.startswith('#'):
                # ASDL comments start with --
                # Added # comments like Python and shell
                break

            else:
                # Operators
                try:
                    op_kind = _TOKEN_INT[c]
                except KeyError:
                    raise ASDLSyntaxError('Invalid operator %s' % c, lineno)
                yield Token(op_kind, c, lineno)

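# For example, the schema line 'word = (str value)' tokenizes to
#   Name 'word', Equals '=', LParen '(', Name 'str', Name 'value', RParen ')'
# while 'module', 'use', and 'generate' become TokenKind.Keyword tokens.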

def _SumIsSimple(variant_list):
    """Return True if a sum is simple.

    A sum is simple if its types have no fields, e.g. unaryop = Invert |
    Not | UAdd | USub
    """
    for t in variant_list:
        if t.fields or t.shared_type:
            return False
    return True


_CODE_GEN_OPTIONS = [
    'no_namespace_suffix',  # Id.Foo instead of Id_e.Foo
    'integers',  # integer builtin_i instead of strongly typed builtin_e
    'uint16',  # like integers, but use uint16_t instead
    'bit_set',  # not implemented: 1 << n instead of n

    # probably don't need this
    # 'common_synthetic_field:left_tok',

    # Put this type, and the transitive closure of types it references, in the
    # unique "first class variant" namespace, and generate type reflection.
    'reflect_all_types',

    # Squeeze and Freeze, with the number of bits as an option.  Hm, the
    # headers here still need type reflection.  Probably OK.
    'mirror_all_types:16',
]


class ASDLParser(object):
    """Parser for ASDL files.

    Create, then call the parse method on a buffer containing ASDL. This
    is a simple recursive descent parser that uses _Tokenize for the
    lexing.
    """

    def __init__(self):
        self._tokenizer = None
        self.cur_token = None

    def parse(self, f):
        """Parse the ASDL in the file and return an AST with a Module root."""
        self._tokenizer = _Tokenize(f)
        self._advance()
        return self._parse_module()

    def _parse_module(self):
        """
        type_decl : NAME '=' compound_type
                  | NAME '<' type_expr   # subtyping, e.g. CompoundWord < List[word_part]
        module : 'module' NAME '{' use* type_decl* '}'

        We added:
        - use for imports
        - generate on sum types
        """
        if not self._at_keyword('module'):
            raise ASDLSyntaxError(
                'Expected "module" (found {})'.format(self.cur_token.value),
                self.cur_token.lineno)
        self._advance()
        name = self._match(TokenKind.Name)
        self._match(TokenKind.LBrace)

        uses = []
        while self._at_keyword('use'):
            uses.append(self._parse_use())

        defs = []
        while self.cur_token.kind == TokenKind.Name:
            typename = self._advance()
            if self.cur_token.kind == TokenKind.Equals:
                self._advance()
                type_ = self._parse_compound_type()
                defs.append(TypeDecl(typename, type_))
            elif self.cur_token.kind == TokenKind.LessThan:
                self._advance()
                type_ = self._parse_type_expr()
                defs.append(SubTypeDecl(typename, type_))
            else:
                raise ASDLSyntaxError(
                    'Expected = or < after type name (found {})'.format(
                        self.cur_token.value), self.cur_token.lineno)

        self._match(TokenKind.RBrace)
        return Module(name, uses, defs)

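    # Illustrative example (not a comment from the original source): a schema
    # like
    #
    #   module demo {
    #     word = (str value)
    #     bool_expr = Const(bool b) | Binary(bool_expr left, bool_expr right)
    #   }
    #
    # parses to Module('demo', [], [TypeDecl('word', Product(...)),
    # TypeDecl('bool_expr', Sum(...))]).
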
    def _parse_use(self):
        """
        use: 'use' NAME+ '{' NAME+ '}'

        example: use frontend syntax { Token }

        This means frontend/syntax.asdl.h :: Token
        """
        self._advance()  # past 'use'
        module_parts = []
        while self.cur_token.kind == TokenKind.Name:
            part = self._advance()
            module_parts.append(part)

        self._match(TokenKind.LBrace)

        type_names = []
        while self.cur_token.kind == TokenKind.Name:
            t = self._advance()
            type_names.append(t)
            if self.cur_token.kind == TokenKind.RBrace:
                break
            elif self.cur_token.kind == TokenKind.Comma:
                self._advance()

        self._match(TokenKind.RBrace)
        #print('MOD %s' % module_parts)
        return Use(module_parts, type_names)

    def _parse_compound_type(self):
        """
        constructor : NAME fields?
                    | NAME '%' NAME  # shared variant

        sum : constructor ('|' constructor)* generate?

        compound_type : product
                      | sum

        Examples:
          alloc_members =
            List
          | Dict
          | Struct
            generate [bit_set]

          -- color::Red, not color_e::Red or color_i::Red
          color = Red | Green
          generate [integers, no_namespace_suffix]
        """
        if self.cur_token.kind == TokenKind.LParen:
            # If we see a (, it's a product
            return self._parse_product()
        else:
            # Otherwise it's a sum. Look for ConstructorId
            sumlist = []
            while True:
                cons_name = self._match(TokenKind.Name)

                shared_type = None
                fields = None
                if self.cur_token.kind == TokenKind.LParen:
                    fields = self._parse_fields()
                elif self.cur_token.kind == TokenKind.Percent:
                    self._advance()
                    shared_type = self._match(TokenKind.Name)
                else:
                    pass

                cons = Constructor(cons_name, shared_type, fields)
                sumlist.append(cons)

                if self.cur_token.kind != TokenKind.Pipe:
                    break
                self._advance()
            generate = self._parse_optional_generate()

            # Additional validation
            if generate is not None:
                for g in generate:
                    if g not in _CODE_GEN_OPTIONS:
                        raise ASDLSyntaxError('Invalid code gen option %r' % g,
                                              self.cur_token.lineno)

            if _SumIsSimple(sumlist):
                return SimpleSum(sumlist, generate)
            else:
                return Sum(sumlist, generate)

    def _parse_type_expr(self):
        """One or two params:

        type_params : '[' type_expr ( ',' type_expr )* ']'

        type_expr : NAME type_params? ('?' | '*')?  # allow one suffix

        NAME is validated against Optional, List, Dict afterward
        """
        type_name = self._match(TokenKind.Name)

        # Accept Python-like naming!
        if type_name == 'str':
            type_name = 'string'

        children = []
        if self.cur_token.kind == TokenKind.LBracket:
            self._advance()
            children.append(self._parse_type_expr())
            if self.cur_token.kind == TokenKind.Comma:
                self._advance()
                children.append(self._parse_type_expr())

            self._match(TokenKind.RBracket)

        if type_name in ('List', 'Optional'):
            if len(children) != 1:
                raise ASDLSyntaxError(
                    'Expected 1 type param to {}'.format(type_name),
                    self.cur_token.lineno)
        elif type_name == 'Dict':
            if len(children) != 2:
                raise ASDLSyntaxError(
                    'Expected 2 type params to {}'.format(type_name),
                    self.cur_token.lineno)
        else:
            if len(children) != 0:
                raise ASDLSyntaxError(
                    'Expected zero type params to {}'.format(type_name),
                    self.cur_token.lineno)

        if len(children):
            typ = ast.ParameterizedType(type_name, children)
        else:
            typ = ast.NamedType(type_name)

        if self.cur_token.kind == TokenKind.Asterisk:
            # string* is equivalent to List[string]
            typ = ast.ParameterizedType('List', [typ])
            self._advance()

        elif self.cur_token.kind == TokenKind.Question:
            # string? is equivalent to Optional[string]
            typ = ast.ParameterizedType('Optional', [typ])
            self._advance()

        return typ

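    # Illustrative examples (not comments from the original source): per the
    # rules above, 'Dict[string, bool]?' parses to
    #   ast.ParameterizedType('Optional',
    #       [ast.ParameterizedType('Dict',
    #           [ast.NamedType('string'), ast.NamedType('bool')])])
    # and 'string*' parses to ast.ParameterizedType('List', [ast.NamedType('string')]).
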
    def _parse_fields(self):
        """
        fields_inner: type_expr NAME ( ',' type_expr NAME )* ','?

        fields : '(' fields_inner? ')'

        Name Quantifier? should be changed to typename.
        """
        fields = []
        self._match(TokenKind.LParen)
        while self.cur_token.kind == TokenKind.Name:
            typ = self._parse_type_expr()
            field_name = self._match(TokenKind.Name)

            fields.append(Field(typ, field_name))

            if self.cur_token.kind == TokenKind.RParen:
                break
            elif self.cur_token.kind == TokenKind.Comma:
                self._advance()

        self._match(TokenKind.RParen)
        return fields
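
    # Illustrative example (not a comment from the original source): the input
    # '(string key, List[int] values)' yields
    #   [Field(ast.NamedType('string'), 'key'),
    #    Field(ast.ParameterizedType('List', [ast.NamedType('int')]), 'values')]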

    def _parse_list(self):
        """
        list_inner: NAME ( ',' NAME )* ','?

        list : '[' list_inner? ']'
        """
        generate = []
        self._match(TokenKind.LBracket)
        while self.cur_token.kind == TokenKind.Name:
            name = self._match(TokenKind.Name)

            generate.append(name)

            if self.cur_token.kind == TokenKind.RBracket:
                break
            elif self.cur_token.kind == TokenKind.Comma:
                self._advance()

        self._match(TokenKind.RBracket)
        return generate

    def _parse_optional_generate(self):
        """
        generate : 'generate' list
        """
        if self._at_keyword('generate'):
            self._advance()
            return self._parse_list()
        else:
            return None

    def _parse_product(self):
        """Product: fields attributes?"""
        return Product(self._parse_fields())

    def _advance(self):
        """Return current token; read next token into self.cur_token."""
        cur_val = None if self.cur_token is None else self.cur_token.value
        try:
            self.cur_token = next(self._tokenizer)
        except StopIteration:
            self.cur_token = None
        return cur_val

    def _match(self, kind):
        """The 'match' primitive of RD parsers.

        * Verifies that the current token is of the given kind
        * Returns the value of the current token
        * Reads in the next token

        Args:
          kind: A TokenKind
        """
        if self.cur_token.kind == kind:
            value = self.cur_token.value
            self._advance()
            return value
        else:
            raise ASDLSyntaxError(
                'Expected token {}, got {}'.format(_TOKEN_STR[kind],
                                                   self.cur_token.value),
                self.cur_token.lineno)

    def _at_keyword(self, keyword):
        return (self.cur_token.kind == TokenKind.Keyword and
                self.cur_token.value == keyword)


_PRIMITIVE_TYPES = [
    'string',
    'int',
    'uint16',  # used for Token length - should we generalize this?
    'BigInt',
    'float',
    'bool',

    # 'any' is used for value.{BuiltinProc,BuiltinFunc}, to cast from class
    # type
    'any',
]


def _ResolveType(typ, type_lookup):
    # type: (AST, Dict[str, Any]) -> None
    """Recursively attach a 'resolved' field to AST nodes."""
    if isinstance(typ, ast.NamedType):
        if typ.name not in _PRIMITIVE_TYPES:
            ast_node = type_lookup.get(typ.name)
            if ast_node is None:
                raise ASDLSyntaxError("Couldn't find type %r" % typ.name)
            typ.resolved = ast_node

    elif isinstance(typ, ast.ParameterizedType):
        for child in typ.children:
            _ResolveType(child, type_lookup)

        if typ.type_name == 'Optional':
            child = typ.children[0]
            if isinstance(child, ast.NamedType):
                if child.name in _PRIMITIVE_TYPES and child.name != 'string':
                    raise ASDLSyntaxError(
                        'Optional primitive type {} not allowed'.format(
                            child.name))

                if child.resolved and isinstance(child.resolved,
                                                 ast.SimpleSum):
                    raise ASDLSyntaxError(
                        'Optional simple sum type {} not allowed'.format(
                            child.name))

    else:
        raise AssertionError()


def _ResolveFields(field_ast_nodes, type_lookup):
    """
    Args:
      type_lookup: Populated by name resolution
    """
    for field in field_ast_nodes:
        _ResolveType(field.typ, type_lookup)


def _ResolveModule(module, app_types):
    """Name resolution for NamedType."""
    # Types that fields are declared with: int, id, word_part, etc.
    # Fields are NOT declared with Constructor names.
    type_lookup = dict(app_types)

    # Note: we don't actually load the type, and instead leave that to MyPy /
    # C++.  A consequence of this is TypeNameHeuristic().
    for u in module.uses:
        for type_name in u.type_names:
            type_lookup[type_name] = u  # type: ast.Use()

    # NOTE: We need two passes because types can be mutually recursive, e.g.
    # asdl/arith.asdl.

    # First pass: collect declared types and make entries for them.
    for d in module.dfns:
        if d.name in type_lookup:
            raise ASDLSyntaxError('Type %r was already defined' % d.name)

        if isinstance(d, SubTypeDecl):
            # e.g. CompoundWord < List[str]
            type_lookup[d.name] = d.base_class
        else:
            # e.g. Token = (str a)
            type_lookup[d.name] = d.value

    # Second pass: add NamedType.resolved field
    for d in module.dfns:
        if isinstance(d, SubTypeDecl):  # no fields
            _ResolveType(d.base_class, type_lookup)
            continue

        ast_node = d.value
        if isinstance(ast_node, ast.Product):
            #log('fields %s', ast_node.fields)
            _ResolveFields(ast_node.fields, type_lookup)
            continue

        if isinstance(ast_node, ast.Sum):
            for cons in ast_node.types:
                _ResolveFields(cons.fields, type_lookup)
            continue

        raise AssertionError(ast_node)


def LoadSchema(f, app_types, verbose=False):
    """Returns an AST for the schema."""
    p = ASDLParser()
    schema_ast = p.parse(f)
    if verbose:
        import sys
        schema_ast.Print(sys.stdout, 0)

    # Make sure all the names are valid
    _ResolveModule(schema_ast, app_types)
    return schema_ast
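

# Illustrative usage sketch, not part of the original module.  It relies only
# on what is defined above: LoadSchema() reads ASDL text from a file-like
# object, and the hypothetical 'demo' schema below uses primitive types only,
# so app_types can be an empty dict.
if __name__ == '__main__':
    import io

    _DEMO_SCHEMA = u'''
module demo {
  point = (int x, int y)
  op = Plus | Minus
}
'''
    demo_module = LoadSchema(io.StringIO(_DEMO_SCHEMA), {})
    for decl in demo_module.dfns:
        print(decl.name)  # prints 'point', then 'op'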