asdl/front_end.py

OILS / asdl / front_end.py View on Github | oils.pub

590 lines, 334 significant

1	"""front_end.py: Lexer and parser for the ASDL schema language."""
2	from __future__ import print_function
3
4	import re
5
6	from asdl import ast
7	from asdl.ast import (AST, Use, Module, TypeDecl, SubTypeDecl, Constructor,
8	Field, Sum, SimpleSum, Product, Extern)
9	from asdl.util import log
10
11	# type checking not turned on yet
12	from typing import Dict, Any
13
14	_ = log
15
16	_KEYWORDS = ['use', 'module', 'generate', 'extern']
17
18	_TOKENS = [
19	('Keyword', ''),
20	('Name', ''),
21
22	# For operators, the string matters
23	('Equals', '='),
24	('Comma', ','),
25	('Question', '?'),
26	('Pipe', '\|'),
27	('Asterisk', '*'),
28	('LParen', '('),
29	('RParen', ')'),
30	('LBrace', '{'),
31	('RBrace', '}'),
32	('Percent', '%'),
33	('LessThan', '<'), # for subtyping CompoundWord < List[word_part]
34
35	# Oils addition for parameterized types.
36	('LBracket', '['),
37	('RBracket', ']'),
38
39	# - Start with Dict[string, bool].
40	# - List[string] is an alias for string*
41	#
42	# statically typed: Dict and List
43	# dynamically typed: dict and list
44	]
45
46	_TOKEN_STR = [name for name, _ in _TOKENS] # integer -> string like LParen
47	_TOKEN_INT = {} # string like '(' -> integer
48
49
50	class TokenKind(object):
51	"""ASDL tokens.
52
53	TokenKind.LBrace = 5, etc.
54	"""
55	pass
56
57
58	for i, (name, val) in enumerate(_TOKENS):
59	setattr(TokenKind, name, i)
60	_TOKEN_INT[val] = i
61
62
63	class Token(object):
64
65	def __init__(self, kind, value, lineno):
66	self.kind = kind
67	self.value = value
68	self.lineno = lineno
69
70
71	class ASDLSyntaxError(Exception):
72
73	def __init__(self, msg, lineno=None):
74	self.msg = msg
75	self.lineno = lineno or '<unknown>'
76
77	def __str__(self):
78	return 'Syntax error on line {0.lineno}: {0.msg}'.format(self)
79
80
81	TOKEN_RE = r'\s(\w+\|--.\|#.*\|.)'
82
83
84	def _Tokenize(f):
85	"""Tokenize the given buffer.
86
87	Yield Token objects.
88	"""
89	for lineno, line in enumerate(f, 1):
90	for m in re.finditer(TOKEN_RE, line.strip()):
91	c = m.group(1)
92	if c in _KEYWORDS:
93	yield Token(TokenKind.Keyword, c, lineno)
94
95	elif c[0].isalpha() or c[0] == '_':
96	yield Token(TokenKind.Name, c, lineno)
97
98	elif c.startswith('--') or c.startswith('#'):
99	# ASDL comments start with --
100	# Added # comments like Python and shell
101	break
102
103	else:
104	# Operators
105	try:
106	op_kind = _TOKEN_INT[c]
107	except KeyError:
108	raise ASDLSyntaxError('Invalid operator %s' % c, lineno)
109	yield Token(op_kind, c, lineno)
110
111
112	def _SumIsSimple(variant_list):
113	"""Return True if a sum is a simple.
114
115	A sum is simple if its types have no fields, e.g. unaryop = Invert \|
116	Not \| UAdd \| USub
117	"""
118	for t in variant_list:
119	if t.fields or t.shared_type:
120	return False
121	return True
122
123
124	_CODE_GEN_OPTIONS = [
125	'no_namespace_suffix', # Id.Foo instead of Id_e.Foo
126	'integers', # integer builtin_i instead of strongly typed builtin_e
127	'uint16', # like integers, but use uint16_t instead
128	'bit_set', # not implemented: 1 << n instead of n
129
130	# probably don't need this
131	# 'common_synthetic_field:left_tok',
132
133	# Put this type, and transitive closure of types it references, in the
134	# unique "first class variant" namespace, and generate type reflection.
135	'reflect_all_types',
136
137	# Squeeze and Freeze, with the number of bits as a option Hm the headers
138	# here still need type reflection. Probably OK.
139	'mirror_all_types:16',
140	]
141
142
143	class ASDLParser(object):
144	"""Parser for ASDL files.
145
146	Create, then call the parse method on a buffer containing ASDL. This
147	is a simple recursive descent parser that uses _Tokenize for the
148	lexing.
149	"""
150
151	def __init__(self):
152	self._tokenizer = None
153	self.cur_token = None
154
155	def parse(self, f):
156	"""Parse the ASDL in the file and return an AST with a Module root."""
157	self._tokenizer = _Tokenize(f)
158	self._advance()
159	return self._parse_module()
160
161	def _parse_module(self):
162	"""
163	type_decl : NAME '=' compound_type
164	module : 'module' NAME '{'
165	use*
166	extern*
167	type_decl*
168	'}'
169
170	We added:
171	- use for imports
172	- generate on sum types
173	"""
174	if not self._at_keyword('module'):
175	raise ASDLSyntaxError(
176	'Expected "module" (found {})'.format(self.cur_token.value),
177	self.cur_token.lineno)
178	self._advance()
179	name = self._match(TokenKind.Name)
180	self._match(TokenKind.LBrace)
181
182	uses = []
183	while self._at_keyword('use'):
184	uses.append(self._parse_use())
185
186	externs = []
187	while self._at_keyword('extern'):
188	externs.append(self._parse_extern())
189
190	defs = []
191	while self.cur_token.kind == TokenKind.Name:
192	typename = self._advance()
193	if self.cur_token.kind == TokenKind.Equals:
194	self._advance()
195	type_ = self._parse_compound_type()
196	defs.append(TypeDecl(typename, type_))
197	elif self.cur_token.kind == TokenKind.LessThan:
198	self._advance()
199	type_ = self._parse_type_expr()
200	defs.append(SubTypeDecl(typename, type_))
201	else:
202	raise ASDLSyntaxError(
203	'Expected = or < after type name (found {})'.format(
204	self.cur_token.value), self.cur_token.lineno)
205
206	self._match(TokenKind.RBrace)
207	return Module(name, uses, externs, defs)
208
209	def _parse_use(self):
210	"""
211	use: 'use' NAME+ '{' NAME+ '}'
212
213	example: use frontend syntax { Token }
214
215	This means frontend/syntax.asdl.h :: Token
216	"""
217	self._advance() # past 'use'
218	module_parts = []
219	while self.cur_token.kind == TokenKind.Name:
220	part = self._advance()
221	module_parts.append(part)
222
223	self._match(TokenKind.LBrace)
224
225	type_names = []
226	while self.cur_token.kind == TokenKind.Name:
227	t = self._advance()
228	type_names.append(t)
229	if self.cur_token.kind == TokenKind.RParen:
230	break
231	elif self.cur_token.kind == TokenKind.Comma:
232	self._advance()
233
234	self._match(TokenKind.RBrace)
235	#print('MOD %s' % module_parts)
236	return Use(module_parts, type_names)
237
238	def _parse_extern(self):
239	"""
240	extern: 'extern' NAME+
241
242	Examples:
243	extern _Builtin
244	extern _Callable
245	"""
246	self._advance() # past 'extern'
247
248	self._match(TokenKind.LBracket)
249
250	# At least one name
251	names = [self._match(TokenKind.Name)]
252
253	while self.cur_token.kind == TokenKind.Name:
254	names.append(self._advance())
255
256	self._match(TokenKind.RBracket)
257
258	return Extern(names)
259
260	def _parse_compound_type(self):
261	"""
262	constructor : NAME fields?
263	\| NAME '%' NAME # shared variant
264
265	sum : constructor ('\|' constructor)* generate?
266
267	compound_type : product
268	\| sum
269
270	Examples:
271	alloc_members =
272	List
273	\| Dict
274	\| Struct
275	generate [bit_set]
276
277	-- color::Red, not color_e::Red or color_i::Red
278	color = Red \| Green
279	generate [integers, no_sum_suffix]
280	"""
281	if self.cur_token.kind == TokenKind.LParen:
282	# If we see a (, it's a product
283	return self._parse_product()
284	else:
285	# Otherwise it's a sum. Look for ConstructorId
286	sumlist = []
287	while True:
288	cons_name = self._match(TokenKind.Name)
289
290	shared_type = None
291	fields = None
292	if self.cur_token.kind == TokenKind.LParen:
293	fields = self._parse_fields()
294	elif self.cur_token.kind == TokenKind.Percent:
295	self._advance()
296	shared_type = self._match(TokenKind.Name)
297	else:
298	pass
299
300	cons = Constructor(cons_name, shared_type, fields)
301	sumlist.append(cons)
302
303	if self.cur_token.kind != TokenKind.Pipe:
304	break
305	self._advance()
306	generate = self._parse_optional_generate()
307
308	# Additional validation
309	if generate is not None:
310	for g in generate:
311	if g not in _CODE_GEN_OPTIONS:
312	raise ASDLSyntaxError('Invalid code gen option %r' % g,
313	self.cur_token.lineno)
314
315	if _SumIsSimple(sumlist):
316	return SimpleSum(sumlist, generate)
317	else:
318	return Sum(sumlist, generate)
319
320	def _parse_type_expr(self):
321	"""One or two params:
322
323	type_params : '[' type_expr ( ',' type_expr )* ']'
324
325	type_expr : NAME type_params? (''?' \| '*')? # allow one suffix
326
327	NAME is validated against Optional, List, Dict afterward
328	"""
329	type_name = self._match(TokenKind.Name)
330
331	# Accept Python-like naming!
332	if type_name == 'str':
333	type_name = 'string'
334
335	children = []
336	if self.cur_token.kind == TokenKind.LBracket:
337	self._advance()
338	children.append(self._parse_type_expr())
339	if self.cur_token.kind == TokenKind.Comma:
340	self._advance()
341	children.append(self._parse_type_expr())
342
343	self._match(TokenKind.RBracket)
344
345	if type_name in ('List', 'Optional'):
346	if len(children) != 1:
347	raise ASDLSyntaxError(
348	'Expected 1 type param to {}'.format(type_name),
349	self.cur_token.lineno)
350	elif type_name == 'Dict':
351	if len(children) != 2:
352	raise ASDLSyntaxError(
353	'Expected 2 type params to {}'.format(type_name),
354	self.cur_token.lineno)
355	else:
356	if len(children) != 0:
357	raise ASDLSyntaxError(
358	'Expected zero type params to {}'.format(type_name),
359	self.cur_token.lineno)
360
361	if len(children):
362	typ = ast.ParameterizedType(type_name, children)
363	else:
364	typ = ast.NamedType(type_name)
365
366	if self.cur_token.kind == TokenKind.Asterisk:
367	# string* is equivalent to List[string]
368	typ = ast.ParameterizedType('List', [typ])
369	self._advance()
370
371	elif self.cur_token.kind == TokenKind.Question:
372	# string* is equivalent to Optional[string]
373	typ = ast.ParameterizedType('Optional', [typ])
374	self._advance()
375
376	return typ
377
378	def _parse_fields(self):
379	"""
380	fields_inner: type_expr NAME ( ',' type_expr NAME )* ','?
381
382	fields : '(' fields_inner? ')'
383
384	Name Quantifier? should be changed to typename.
385	"""
386	fields = []
387	self._match(TokenKind.LParen)
388	while self.cur_token.kind == TokenKind.Name:
389	typ = self._parse_type_expr()
390	field_name = self._match(TokenKind.Name)
391
392	fields.append(Field(typ, field_name))
393
394	if self.cur_token.kind == TokenKind.RParen:
395	break
396	elif self.cur_token.kind == TokenKind.Comma:
397	self._advance()
398
399	self._match(TokenKind.RParen)
400	return fields
401
402	def _parse_list(self):
403	"""
404	list_inner: NAME ( ',' NAME )* ','?
405
406	list : '[' list_inner? ']'
407	"""
408	generate = []
409	self._match(TokenKind.LBracket)
410	while self.cur_token.kind == TokenKind.Name:
411	name = self._match(TokenKind.Name)
412
413	generate.append(name)
414
415	if self.cur_token.kind == TokenKind.RBracket:
416	break
417	elif self.cur_token.kind == TokenKind.Comma:
418	self._advance()
419
420	self._match(TokenKind.RBracket)
421	return generate
422
423	def _parse_optional_generate(self):
424	"""
425	generate : 'generate' list
426	"""
427	if self._at_keyword('generate'):
428	self._advance()
429	return self._parse_list()
430	else:
431	return None
432
433	def _parse_product(self):
434	"""Product: fields attributes?"""
435	return Product(self._parse_fields())
436
437	def _advance(self):
438	"""Return current token; read next token into self.cur_token."""
439	cur_val = None if self.cur_token is None else self.cur_token.value
440	try:
441	self.cur_token = next(self._tokenizer)
442	except StopIteration:
443	self.cur_token = None
444	return cur_val
445
446	def _match(self, kind):
447	"""The 'match' primitive of RD parsers.
448
449	* Verifies that the current token is of the given kind (kind can
450	be a tuple, in which the kind must match one of its members).
451	* Returns the value of the current token
452	* Reads in the next token
453
454	Args:
455	kind: A TokenKind, or a tuple of TokenKind
456	"""
457	if self.cur_token.kind == kind:
458	value = self.cur_token.value
459	self._advance()
460	return value
461	else:
462	raise ASDLSyntaxError(
463	'Expected token {}, got {}'.format(_TOKEN_STR[kind],
464	self.cur_token.value),
465	self.cur_token.lineno)
466
467	def _at_keyword(self, keyword):
468	return (self.cur_token.kind == TokenKind.Keyword and
469	self.cur_token.value == keyword)
470
471
472	_PRIMITIVE_TYPES = [
473	'string',
474	'int',
475	'uint16', # used for Token length - should we generalize this?
476	'BigInt',
477	'float',
478	'bool',
479
480	# 'any' is used for value.{BuiltinProc,BuiltinFunc}, to cast from class
481	# type
482	'any',
483	]
484
485
486	def _ResolveType(typ, type_lookup):
487	# type: (AST, Dict[str, Any]) -> None
488	"""Recursively attach a 'resolved' field to AST nodes."""
489	if isinstance(typ, ast.NamedType):
490	if typ.name not in _PRIMITIVE_TYPES:
491	ast_node = type_lookup.get(typ.name)
492	if ast_node is None:
493	raise ASDLSyntaxError("Couldn't find type %r" % typ.name)
494	typ.resolved = ast_node
495
496	elif isinstance(typ, ast.ParameterizedType):
497	for child in typ.children:
498	_ResolveType(child, type_lookup)
499
500	if typ.type_name == 'Optional':
501	child = typ.children[0]
502	if isinstance(child, ast.NamedType):
503	if child.name in _PRIMITIVE_TYPES and child.name != 'string':
504	raise ASDLSyntaxError(
505	'Optional primitive type {} not allowed'.format(
506	child.name))
507
508	if child.resolved and isinstance(child.resolved,
509	ast.SimpleSum):
510	raise ASDLSyntaxError(
511	'Optional simple sum type {} not allowed'.format(
512	child.name))
513
514	else:
515	raise AssertionError()
516
517
518	def _ResolveFields(field_ast_nodes, type_lookup):
519	"""
520	Args:
521	type_lookup: Populated by name resolution
522	"""
523	for field in field_ast_nodes:
524	_ResolveType(field.typ, type_lookup)
525
526
527	def _ResolveModule(module, app_types):
528	"""Name resolution for NamedType."""
529	# Types that fields are declared with: int, id, word_part, etc.
530	# Fields are NOT declared with Constructor names.
531	type_lookup = dict(app_types)
532
533	# Note: we don't actually load the type, and instead leave that to MyPy /
534	# C++. A consequence of this is TypeNameHeuristic().
535	for u in module.uses:
536	for type_name in u.type_names:
537	type_lookup[type_name] = u # type: ast.Use()
538
539	# NOTE: We need two passes because types can be mutually recursive, e.g.
540	# asdl/arith.asdl.
541
542	# First pass: collect declared types and make entries for them.
543	for ex in module.externs:
544	last = ex.names[-1] # e.g. _Callable
545	if last in type_lookup:
546	raise ASDLSyntaxError('Type %r was already defined' % last)
547	type_lookup[last] = ex
548
549	for d in module.dfns:
550	if d.name in type_lookup:
551	raise ASDLSyntaxError('Type %r was already defined' % d.name)
552
553	if isinstance(d, SubTypeDecl):
554	# e.g. CompoundWord < List[str]
555	type_lookup[d.name] = d.base_class
556	else:
557	# e.g. Token = (str a)
558	type_lookup[d.name] = d.value
559
560	# Second pass: add NamedType.resolved field
561	for d in module.dfns:
562	if isinstance(d, SubTypeDecl): # no fields
563	_ResolveType(d.base_class, type_lookup)
564	continue
565
566	ast_node = d.value
567	if isinstance(ast_node, ast.Product):
568	#log('fields %s', ast_node.fields)
569	_ResolveFields(ast_node.fields, type_lookup)
570	continue
571
572	if isinstance(ast_node, ast.Sum):
573	for cons in ast_node.types:
574	_ResolveFields(cons.fields, type_lookup)
575	continue
576
577	raise AssertionError(ast_node)
578
579
580	def LoadSchema(f, app_types, verbose=False):
581	"""Returns an AST for the schema."""
582	p = ASDLParser()
583	schema_ast = p.parse(f)
584	if verbose:
585	import sys
586	schema_ast.Print(sys.stdout, 0)
587
588	# Make sure all the names are valid
589	_ResolveModule(schema_ast, app_types)
590	return schema_ast