| 1 | #!/usr/bin/env python2
|
| 2 | """
|
| 3 | parse.py
|
| 4 | """
|
| 5 | from __future__ import print_function
|
| 6 |
|
| 7 | import re
|
| 8 |
|
| 9 | from asdl import ast
|
| 10 | from asdl.ast import (Use, Module, TypeDecl, SubTypeDecl, Constructor, Field,
|
| 11 | Sum, SimpleSum, Product, Extern)
|
| 12 |
|
| 13 | from typing import Any, IO
|
| 14 |
|
# Words that _Tokenize reports as Keyword tokens instead of Name tokens.
_KEYWORDS = ['use', 'module', 'generate', 'extern']

# (kind name, literal text) pairs.  The position of a pair in this list is the
# integer value of that token kind: the module-level loop over
# enumerate(_TOKENS) assigns TokenKind.<name> = index.
_TOKENS = [
    # Keyword and Name have no fixed spelling, so their text is empty.
    ('Keyword', ''),
    ('Name', ''),

    # For operators, the string matters: it is the key used to look the
    # operator up in _TOKEN_INT.
    ('Equals', '='),
    ('Comma', ','),
    ('Question', '?'),
    ('Pipe', '|'),
    ('Asterisk', '*'),
    ('LParen', '('),
    ('RParen', ')'),
    ('LBrace', '{'),
    ('RBrace', '}'),
    ('Percent', '%'),
    ('LessThan', '<'),  # for subtyping CompoundWord < List[word_part]

    # Oils addition for parameterized types.
    ('LBracket', '['),
    ('RBracket', ']'),

    # - Start with Dict[string, bool].
    # - List[string] is an alias for string*
    #
    # statically typed: Dict and List
    # dynamically typed: dict and list
]

_TOKEN_STR = [name for name, _ in _TOKENS]  # integer -> string like LParen
_TOKEN_INT = {}  # string like '(' -> integer; filled in by the loop below
|
| 47 |
|
| 48 |
|
class TokenKind(object):
    """Namespace for the integer ASDL token kinds.

    The class body is intentionally empty.  One integer attribute per entry
    of _TOKENS is attached at import time, numbered by position in that
    list, e.g. TokenKind.Keyword == 0 and TokenKind.Name == 1.
    """
|
| 55 |
|
| 56 |
|
# Number the token kinds by their position in _TOKENS: TokenKind.<name> = i,
# and remember the reverse mapping from literal text to that integer.
# Keyword and Name both have '' as their text, so _TOKEN_INT[''] ends up
# holding the later of the two (Name).
for i, entry in enumerate(_TOKENS):
    name, val = entry
    _TOKEN_INT[val] = i
    setattr(TokenKind, name, i)
|
| 60 |
|
| 61 |
|
class Token(object):
    """A single lexeme: its integer kind, its text, and its source line."""

    def __init__(self, kind, value, lineno):
        # type: (int, str, int) -> None
        self.kind, self.value, self.lineno = kind, value, lineno
|
| 69 |
|
| 70 |
|
class ASDLSyntaxError(Exception):
    """Error raised for malformed ASDL input.

    Attributes:
      msg: description of the problem.
      lineno: line the problem was found on, or the string '<unknown>' when
        no (truthy) line number was supplied.
    """

    def __init__(self, msg, lineno=None):
        # type: (str, Any) -> None
        self.lineno = lineno if lineno else '<unknown>'
        self.msg = msg

    def __str__(self):
        template = 'Syntax error on line {0.lineno}: {0.msg}'
        return template.format(self)
|
| 80 |
|
| 81 |
|
# One token per match: optional leading whitespace, then (captured as group 1)
# either a run of word characters, a '--' or '#' comment running to the end
# of the line, or any other single character.
TOKEN_RE = r'\s*(\w+|--.*|#.*|.)'
|
| 83 |
|
| 84 |
|
def _Tokenize(f):
    # type: (IO[bytes]) -> Any
    """Lex ASDL source, one line at a time.

    Args:
      f: iterable of lines; line numbers reported in tokens start at 1.

    Yields:
      Token objects for keywords, names and operators.  Comments, which
      start with '--' or '#' and run to the end of the line, produce no
      tokens.

    Raises:
      ASDLSyntaxError: on text that is not a keyword, a name, a comment or
        one of the operators listed in _TOKENS.
    """
    for lineno, line in enumerate(f, 1):
        for m in re.finditer(TOKEN_RE, line.strip()):
            text = m.group(1)

            if text in _KEYWORDS:
                kind = TokenKind.Keyword  # type: ignore
            elif text[0].isalpha() or text[0] == '_':
                kind = TokenKind.Name  # type: ignore
            elif text.startswith(('--', '#')):
                # The rest of the line is a comment: ASDL style (--) or
                # Python/shell style (#).
                break
            else:
                # Anything else has to be a known operator.
                try:
                    kind = _TOKEN_INT[text]
                except KeyError:
                    raise ASDLSyntaxError('Invalid operator %s' % text, lineno)

            yield Token(kind, text, lineno)
|
| 112 |
|
| 113 |
|
def _SumIsSimple(variant_list):
    """Tell whether a sum type is "simple".

    A sum is simple when none of its variants has fields or a shared type,
    e.g.

        unaryop = Invert | Not | UAdd | USub

    Args:
      variant_list: the variants of the sum; each must have `fields` and
        `shared_type` attributes.

    Returns:
      True if every variant has falsy `fields` and `shared_type` (also for
      an empty list), False otherwise.
    """
    return not any(v.fields or v.shared_type for v in variant_list)
|
| 124 |
|
| 125 |
|
# Option names accepted in the `generate [...]` list that may follow a sum
# type; the parser rejects any name that is not in this list.
_CODE_GEN_OPTIONS = [
    'no_namespace_suffix',  # Id.Foo instead of Id_e.Foo
    'integers',  # integer builtin_i instead of strongly typed builtin_e
    'uint16',  # like integers, but use uint16_t instead
    'bit_set',  # not implemented: 1 << n instead of n

    # probably don't need this
    # 'common_synthetic_field:left_tok',

    # Put this type, and transitive closure of types it references, in the
    # unique "first class variant" namespace, and generate type reflection.
    'reflect_all_types',

    # Squeeze and Freeze, with the number of bits as an option.  Hm, the
    # headers here still need type reflection.  Probably OK.
    # NOTE(review): the tokenizer has no ':' operator and names are matched
    # with \w+, so a parsed option can never equal this string -- confirm
    # whether the ':16' suffix is meant to be supported.
    'mirror_all_types:16',
]
|
| 143 |
|
| 144 |
|
class ASDLParser(object):
    """Parser for ASDL files.

    Create, then call the parse method on a buffer containing ASDL.  This is
    a simple recursive descent parser that uses _Tokenize for the lexing.

    self.cur_token is the lookahead token, or None once the input is
    exhausted.  Running out of input in the middle of a construct (including
    a completely empty input) is reported as an ASDLSyntaxError.
    """

    def __init__(self):
        # type: () -> None
        self._tokenizer = None
        self.cur_token = None  # type: Token
        # Line of the last token read, so that errors detected at the end of
        # the input can still point somewhere useful.
        self._last_lineno = None

    def parse(self, f):
        # type: (IO[bytes]) -> ast.Module
        """Parse the ASDL in the file and return an AST with a Module root.

        Raises:
          ASDLSyntaxError: if the input is not a well-formed ASDL module.
        """
        self._tokenizer = _Tokenize(f)
        self._last_lineno = None
        self._advance()
        return self._parse_module()

    def _parse_module(self):
        """
        type_decl : NAME '=' compound_type
                  | NAME '<' type_expr      # subtype declaration

        module    : 'module' NAME '{'
                        use*
                        extern*
                        type_decl*
                    '}'

        We added:
        - use for imports
        - generate on sum types
        """
        if not self._at_keyword('module'):
            raise ASDLSyntaxError(
                'Expected "module" (found {})'.format(self._cur_value()),
                self._cur_lineno())
        self._advance()
        name = self._match(TokenKind.Name)
        self._match(TokenKind.LBrace)

        uses = []
        while self._at_keyword('use'):
            uses.append(self._parse_use())

        externs = []
        while self._at_keyword('extern'):
            externs.append(self._parse_extern())

        defs = []
        while self._cur_kind() == TokenKind.Name:
            typename = self._advance()
            kind = self._cur_kind()
            if kind == TokenKind.Equals:
                self._advance()
                type_ = self._parse_compound_type()
                defs.append(TypeDecl(typename, type_))
            elif kind == TokenKind.LessThan:
                self._advance()
                type_ = self._parse_type_expr()
                defs.append(SubTypeDecl(typename, type_))
            else:
                raise ASDLSyntaxError(
                    'Expected = or < after type name (found {})'.format(
                        self._cur_value()), self._cur_lineno())

        self._match(TokenKind.RBrace)
        return Module(name, uses, externs, defs)

    def _parse_use(self):
        """
        use: 'use' NAME+ '{' NAME+ '}'

        example: use frontend syntax { Token }

        This means frontend/syntax.asdl.h :: Token

        The type names inside the braces may be separated by commas.
        """
        self._advance()  # past 'use'
        module_parts = []
        while self._cur_kind() == TokenKind.Name:
            module_parts.append(self._advance())

        self._match(TokenKind.LBrace)
        type_names = self._parse_names()
        self._match(TokenKind.RBrace)
        return Use(module_parts, type_names)

    def _parse_extern(self):
        """
        extern: 'extern' '[' NAME+ ']'

        Examples:
            extern [ _Builtin ]
            extern [ _Callable ]
        """
        self._advance()  # past 'extern'

        self._match(TokenKind.LBracket)

        # At least one name
        names = [self._match(TokenKind.Name)]
        while self._cur_kind() == TokenKind.Name:
            names.append(self._advance())

        self._match(TokenKind.RBracket)

        return Extern(names)

    def _parse_compound_type(self):
        """
        constructor : NAME fields?
                    | NAME '%' NAME  # shared variant

        sum         : constructor ('|' constructor)* generate?

        compound_type : product
                      | sum

        Examples:
            alloc_members =
              List
            | Dict
            | Struct
            generate [bit_set]

            -- color::Red, not color_e::Red or color_i::Red
            color = Red | Green
                    generate [integers, no_namespace_suffix]

        Raises:
          ASDLSyntaxError: also when a `generate` option is not listed in
            _CODE_GEN_OPTIONS.
        """
        if self._cur_kind() == TokenKind.LParen:
            # If we see a (, it's a product
            return self._parse_product()

        # Otherwise it's a sum: one or more constructors separated by |
        sumlist = [self._parse_constructor()]
        while self._cur_kind() == TokenKind.Pipe:
            self._advance()
            sumlist.append(self._parse_constructor())

        generate = self._parse_optional_generate()

        # Additional validation
        if generate is not None:
            for g in generate:
                if g not in _CODE_GEN_OPTIONS:
                    raise ASDLSyntaxError('Invalid code gen option %r' % g,
                                          self._cur_lineno())

        if _SumIsSimple(sumlist):
            return SimpleSum(sumlist, generate)
        else:
            return Sum(sumlist, generate)

    def _parse_constructor(self):
        """Parse one variant of a sum.

        constructor : NAME fields?
                    | NAME '%' NAME  # shared variant
        """
        cons_name = self._match(TokenKind.Name)

        shared_type = None
        fields = None
        if self._cur_kind() == TokenKind.LParen:
            fields = self._parse_fields()
        elif self._cur_kind() == TokenKind.Percent:
            self._advance()
            shared_type = self._match(TokenKind.Name)

        return Constructor(cons_name, shared_type, fields)

    def _parse_type_expr(self):
        """One or two params:

        type_params : '[' type_expr ( ',' type_expr )? ']'

        type_expr   : NAME type_params? ('?' | '*')?  # allow one suffix

        The number of params is validated against NAME: List and Optional
        take one, Dict takes two, and every other name takes none.  The
        Python-like name 'str' is accepted as an alias for 'string'.
        """
        type_name = self._match(TokenKind.Name)

        # Accept Python-like naming!
        if type_name == 'str':
            type_name = 'string'

        children = []
        if self._cur_kind() == TokenKind.LBracket:
            self._advance()
            children.append(self._parse_type_expr())
            if self._cur_kind() == TokenKind.Comma:
                self._advance()
                children.append(self._parse_type_expr())

            self._match(TokenKind.RBracket)

        if type_name in ('List', 'Optional'):
            if len(children) != 1:
                raise ASDLSyntaxError(
                    'Expected 1 type param to {}'.format(type_name),
                    self._cur_lineno())
        elif type_name == 'Dict':
            if len(children) != 2:
                raise ASDLSyntaxError(
                    'Expected 2 type params to {}'.format(type_name),
                    self._cur_lineno())
        else:
            if len(children) != 0:
                raise ASDLSyntaxError(
                    'Expected zero type params to {}'.format(type_name),
                    self._cur_lineno())

        if children:
            typ = ast.ParameterizedType(type_name, children)
        else:
            typ = ast.NamedType(type_name)

        if self._cur_kind() == TokenKind.Asterisk:
            # string* is equivalent to List[string]
            typ = ast.ParameterizedType('List', [typ])
            self._advance()

        elif self._cur_kind() == TokenKind.Question:
            # string? is equivalent to Optional[string]
            typ = ast.ParameterizedType('Optional', [typ])
            self._advance()

        return typ

    def _parse_fields(self):
        """
        fields_inner: type_expr NAME ( ',' type_expr NAME )* ','?

        fields      : '(' fields_inner? ')'

        Note that the comma between two fields is consumed when present but
        is not required.
        """
        fields = []
        self._match(TokenKind.LParen)
        while self._cur_kind() == TokenKind.Name:
            typ = self._parse_type_expr()
            field_name = self._match(TokenKind.Name)
            fields.append(Field(typ, field_name))

            if self._cur_kind() == TokenKind.Comma:
                self._advance()

        self._match(TokenKind.RParen)
        return fields

    def _parse_list(self):
        """
        list_inner: NAME ( ',' NAME )* ','?

        list      : '[' list_inner? ']'
        """
        self._match(TokenKind.LBracket)
        generate = self._parse_names()
        self._match(TokenKind.RBracket)
        return generate

    def _parse_names(self):
        """Parse a run of NAME tokens, each optionally followed by a comma.

        Stops in front of the first token that is not a NAME (or at the end
        of the input) and returns the list of names, which may be empty.
        """
        names = []
        while self._cur_kind() == TokenKind.Name:
            names.append(self._advance())
            if self._cur_kind() == TokenKind.Comma:
                self._advance()
        return names

    def _parse_optional_generate(self):
        """
        generate : 'generate' list

        Returns the list of option names, or None if there is no 'generate'
        clause here.
        """
        if self._at_keyword('generate'):
            self._advance()
            return self._parse_list()
        else:
            return None

    def _parse_product(self):
        """product : fields"""
        return Product(self._parse_fields())

    def _advance(self):
        """Return current token; read next token into self.cur_token.

        Returns the *value* of the token that was current on entry (None if
        there was none).  self.cur_token becomes None when the tokenizer is
        exhausted.
        """
        cur_val = None if self.cur_token is None else self.cur_token.value
        try:
            self.cur_token = next(self._tokenizer)
        except StopIteration:
            self.cur_token = None
        else:
            self._last_lineno = self.cur_token.lineno
        return cur_val

    def _match(self, kind):
        """The 'match' primitive of RD parsers.

        * Verifies that the current token is of the given kind
        * Returns the value of the current token
        * Reads in the next token

        Args:
          kind: A TokenKind integer

        Raises:
          ASDLSyntaxError: if the current token has another kind, or if the
            input has ended.
        """
        if self._cur_kind() == kind:
            value = self.cur_token.value
            self._advance()
            return value

        raise ASDLSyntaxError(
            'Expected token {}, got {}'.format(_TOKEN_STR[kind],
                                               self._cur_value()),
            self._cur_lineno())

    def _at_keyword(self, keyword):
        """Return True if the current token is the given keyword.

        Returns False at the end of the input.
        """
        tok = self.cur_token
        return (tok is not None and tok.kind == TokenKind.Keyword and
                tok.value == keyword)

    def _cur_kind(self):
        """Return the kind of the current token, or None at end of input."""
        return None if self.cur_token is None else self.cur_token.kind

    def _cur_value(self):
        """Return the current token's text, for use in error messages."""
        if self.cur_token is None:
            return '<end of input>'
        return self.cur_token.value

    def _cur_lineno(self):
        """Return the line to blame in an error message.

        This is the current token's line, or at end of input the line of the
        last token that was read (None if there never was one).
        """
        if self.cur_token is None:
            return self._last_lineno
        return self.cur_token.lineno
|