asdl/front_end.py

OILS / asdl / front_end.py View on Github | oilshell.org

548 lines, 316 significant

1	"""front_end.py: Lexer and parser for the ASDL schema language."""
2	from __future__ import print_function
3
4	import re
5
6	from asdl import ast
7	from asdl.ast import (AST, Use, Module, TypeDecl, SubTypeDecl, Constructor,
8	Field, Sum, SimpleSum, Product)
9	from asdl.util import log
10
11	_ = log
12
13	_KEYWORDS = ['use', 'module', 'generate']
14
15	_TOKENS = [
16	('Keyword', ''),
17	('Name', ''),
18
19	# For operators, the string matters
20	('Equals', '='),
21	('Comma', ','),
22	('Question', '?'),
23	('Pipe', '\|'),
24	('Asterisk', '*'),
25	('LParen', '('),
26	('RParen', ')'),
27	('LBrace', '{'),
28	('RBrace', '}'),
29	('Percent', '%'),
30	('LessThan', '<'), # for subtyping CompoundWord < List[word_part]
31
32	# Oils addition for parameterized types.
33	('LBracket', '['),
34	('RBracket', ']'),
35
36	# - Start with Dict[string, bool].
37	# - List[string] is an alias for string*
38	#
39	# statically typed: Dict and List
40	# dynamically typed: dict and list
41	]
42
43	_TOKEN_STR = [name for name, _ in _TOKENS] # integer -> string like LParen
44	_TOKEN_INT = {} # string like '(' -> integer
45
46
47	class TokenKind(object):
48	"""ASDL tokens.
49
50	TokenKind.LBrace = 5, etc.
51	"""
52	pass
53
54
55	for i, (name, val) in enumerate(_TOKENS):
56	setattr(TokenKind, name, i)
57	_TOKEN_INT[val] = i
58
59
60	class Token(object):
61
62	def __init__(self, kind, value, lineno):
63	self.kind = kind
64	self.value = value
65	self.lineno = lineno
66
67
68	class ASDLSyntaxError(Exception):
69
70	def __init__(self, msg, lineno=None):
71	self.msg = msg
72	self.lineno = lineno or '<unknown>'
73
74	def __str__(self):
75	return 'Syntax error on line {0.lineno}: {0.msg}'.format(self)
76
77
78	def _Tokenize(f):
79	"""Tokenize the given buffer.
80
81	Yield Token objects.
82	"""
83	for lineno, line in enumerate(f, 1):
84	for m in re.finditer(r'\s(\w+\|--.\|#.*\|.)', line.strip()):
85	c = m.group(1)
86	if c in _KEYWORDS:
87	yield Token(TokenKind.Keyword, c, lineno)
88
89	elif c[0].isalpha():
90	yield Token(TokenKind.Name, c, lineno)
91
92	elif c.startswith('--') or c.startswith('#'):
93	# ASDL comments start with --
94	# Added # comments like Python and shell
95	break
96
97	else:
98	# Operators
99	try:
100	op_kind = _TOKEN_INT[c]
101	except KeyError:
102	raise ASDLSyntaxError('Invalid operator %s' % c, lineno)
103	yield Token(op_kind, c, lineno)
104
105
106	def _SumIsSimple(variant_list):
107	"""Return True if a sum is a simple.
108
109	A sum is simple if its types have no fields, e.g. unaryop = Invert \|
110	Not \| UAdd \| USub
111	"""
112	for t in variant_list:
113	if t.fields or t.shared_type:
114	return False
115	return True
116
117
118	_CODE_GEN_OPTIONS = [
119	'no_namespace_suffix', # Id.Foo instead of Id_e.Foo
120	'integers', # integer builtin_i instead of strongly typed builtin_e
121	'uint16', # like integers, but use uint16_t instead
122	'bit_set', # not implemented: 1 << n instead of n
123
124	# probably don't need this
125	# 'common_synthetic_field:left_tok',
126
127	# Put this type, and transitive closure of types it references, in the
128	# unique "first class variant" namespace, and generate type reflection.
129	'reflect_all_types',
130
131	# Squeeze and Freeze, with the number of bits as a option Hm the headers
132	# here still need type reflection. Probably OK.
133	'mirror_all_types:16',
134	]
135
136
137	class ASDLParser(object):
138	"""Parser for ASDL files.
139
140	Create, then call the parse method on a buffer containing ASDL. This
141	is a simple recursive descent parser that uses _Tokenize for the
142	lexing.
143	"""
144
145	def __init__(self):
146	self._tokenizer = None
147	self.cur_token = None
148
149	def parse(self, f):
150	"""Parse the ASDL in the file and return an AST with a Module root."""
151	self._tokenizer = _Tokenize(f)
152	self._advance()
153	return self._parse_module()
154
155	def _parse_module(self):
156	"""
157	type_decl : NAME '=' compound_type
158	module : 'module' NAME '{' use* type_decl* '}'
159
160	We added:
161	- use for imports
162	- generate on sum types
163	"""
164	if not self._at_keyword('module'):
165	raise ASDLSyntaxError(
166	'Expected "module" (found {})'.format(self.cur_token.value),
167	self.cur_token.lineno)
168	self._advance()
169	name = self._match(TokenKind.Name)
170	self._match(TokenKind.LBrace)
171
172	uses = []
173	while self._at_keyword('use'):
174	uses.append(self._parse_use())
175
176	defs = []
177	while self.cur_token.kind == TokenKind.Name:
178	typename = self._advance()
179	if self.cur_token.kind == TokenKind.Equals:
180	self._advance()
181	type_ = self._parse_compound_type()
182	defs.append(TypeDecl(typename, type_))
183	elif self.cur_token.kind == TokenKind.LessThan:
184	self._advance()
185	type_ = self._parse_type_expr()
186	defs.append(SubTypeDecl(typename, type_))
187	else:
188	raise ASDLSyntaxError(
189	'Expected = or < after type name (found {})'.format(
190	self.cur_token.value), self.cur_token.lineno)
191
192	self._match(TokenKind.RBrace)
193	return Module(name, uses, defs)
194
195	def _parse_use(self):
196	"""
197	use: 'use' NAME+ '{' NAME+ '}'
198
199	example: use frontend syntax { Token }
200
201	This means frontend/syntax.asdl.h :: Token
202	"""
203	self._advance() # past 'use'
204	module_parts = []
205	while self.cur_token.kind == TokenKind.Name:
206	part = self._advance()
207	module_parts.append(part)
208
209	self._match(TokenKind.LBrace)
210
211	type_names = []
212	while self.cur_token.kind == TokenKind.Name:
213	t = self._advance()
214	type_names.append(t)
215	if self.cur_token.kind == TokenKind.RParen:
216	break
217	elif self.cur_token.kind == TokenKind.Comma:
218	self._advance()
219
220	self._match(TokenKind.RBrace)
221	#print('MOD %s' % module_parts)
222	return Use(module_parts, type_names)
223
224	def _parse_compound_type(self):
225	"""
226	constructor : NAME fields?
227	\| NAME '%' NAME # shared variant
228
229	sum : constructor ('\|' constructor)* generate?
230
231	compound_type : product
232	\| sum
233
234	Examples:
235	alloc_members =
236	List
237	\| Dict
238	\| Struct
239	generate [bit_set]
240
241	-- color::Red, not color_e::Red or color_i::Red
242	color = Red \| Green
243	generate [integers, no_sum_suffix]
244	"""
245	if self.cur_token.kind == TokenKind.LParen:
246	# If we see a (, it's a product
247	return self._parse_product()
248	else:
249	# Otherwise it's a sum. Look for ConstructorId
250	sumlist = []
251	while True:
252	cons_name = self._match(TokenKind.Name)
253
254	shared_type = None
255	fields = None
256	if self.cur_token.kind == TokenKind.LParen:
257	fields = self._parse_fields()
258	elif self.cur_token.kind == TokenKind.Percent:
259	self._advance()
260	shared_type = self._match(TokenKind.Name)
261	else:
262	pass
263
264	cons = Constructor(cons_name, shared_type, fields)
265	sumlist.append(cons)
266
267	if self.cur_token.kind != TokenKind.Pipe:
268	break
269	self._advance()
270	generate = self._parse_optional_generate()
271
272	# Additional validation
273	if generate is not None:
274	for g in generate:
275	if g not in _CODE_GEN_OPTIONS:
276	raise ASDLSyntaxError('Invalid code gen option %r' % g,
277	self.cur_token.lineno)
278
279	if _SumIsSimple(sumlist):
280	return SimpleSum(sumlist, generate)
281	else:
282	return Sum(sumlist, generate)
283
284	def _parse_type_expr(self):
285	"""One or two params:
286
287	type_params : '[' type_expr ( ',' type_expr )* ']'
288
289	type_expr : NAME type_params? (''?' \| '*')? # allow one suffix
290
291	NAME is validated against Optional, List, Dict afterward
292	"""
293	type_name = self._match(TokenKind.Name)
294
295	# Accept Python-like naming!
296	if type_name == 'str':
297	type_name = 'string'
298
299	children = []
300	if self.cur_token.kind == TokenKind.LBracket:
301	self._advance()
302	children.append(self._parse_type_expr())
303	if self.cur_token.kind == TokenKind.Comma:
304	self._advance()
305	children.append(self._parse_type_expr())
306
307	self._match(TokenKind.RBracket)
308
309	if type_name in ('List', 'Optional'):
310	if len(children) != 1:
311	raise ASDLSyntaxError(
312	'Expected 1 type param to {}'.format(type_name),
313	self.cur_token.lineno)
314	elif type_name == 'Dict':
315	if len(children) != 2:
316	raise ASDLSyntaxError(
317	'Expected 2 type params to {}'.format(type_name),
318	self.cur_token.lineno)
319	else:
320	if len(children) != 0:
321	raise ASDLSyntaxError(
322	'Expected zero type params to {}'.format(type_name),
323	self.cur_token.lineno)
324
325	if len(children):
326	typ = ast.ParameterizedType(type_name, children)
327	else:
328	typ = ast.NamedType(type_name)
329
330	if self.cur_token.kind == TokenKind.Asterisk:
331	# string* is equivalent to List[string]
332	typ = ast.ParameterizedType('List', [typ])
333	self._advance()
334
335	elif self.cur_token.kind == TokenKind.Question:
336	# string* is equivalent to Optional[string]
337	typ = ast.ParameterizedType('Optional', [typ])
338	self._advance()
339
340	return typ
341
342	def _parse_fields(self):
343	"""
344	fields_inner: type_expr NAME ( ',' type_expr NAME )* ','?
345
346	fields : '(' fields_inner? ')'
347
348	Name Quantifier? should be changed to typename.
349	"""
350	fields = []
351	self._match(TokenKind.LParen)
352	while self.cur_token.kind == TokenKind.Name:
353	typ = self._parse_type_expr()
354	field_name = self._match(TokenKind.Name)
355
356	fields.append(Field(typ, field_name))
357
358	if self.cur_token.kind == TokenKind.RParen:
359	break
360	elif self.cur_token.kind == TokenKind.Comma:
361	self._advance()
362
363	self._match(TokenKind.RParen)
364	return fields
365
366	def _parse_list(self):
367	"""
368	list_inner: NAME ( ',' NAME )* ','?
369
370	list : '[' list_inner? ']'
371	"""
372	generate = []
373	self._match(TokenKind.LBracket)
374	while self.cur_token.kind == TokenKind.Name:
375	name = self._match(TokenKind.Name)
376
377	generate.append(name)
378
379	if self.cur_token.kind == TokenKind.RBracket:
380	break
381	elif self.cur_token.kind == TokenKind.Comma:
382	self._advance()
383
384	self._match(TokenKind.RBracket)
385	return generate
386
387	def _parse_optional_generate(self):
388	"""
389	generate : 'generate' list
390	"""
391	if self._at_keyword('generate'):
392	self._advance()
393	return self._parse_list()
394	else:
395	return None
396
397	def _parse_product(self):
398	"""Product: fields attributes?"""
399	return Product(self._parse_fields())
400
401	def _advance(self):
402	"""Return current token; read next token into self.cur_token."""
403	cur_val = None if self.cur_token is None else self.cur_token.value
404	try:
405	self.cur_token = next(self._tokenizer)
406	except StopIteration:
407	self.cur_token = None
408	return cur_val
409
410	def _match(self, kind):
411	"""The 'match' primitive of RD parsers.
412
413	* Verifies that the current token is of the given kind (kind can
414	be a tuple, in which the kind must match one of its members).
415	* Returns the value of the current token
416	* Reads in the next token
417
418	Args:
419	kind: A TokenKind, or a tuple of TokenKind
420	"""
421	if self.cur_token.kind == kind:
422	value = self.cur_token.value
423	self._advance()
424	return value
425	else:
426	raise ASDLSyntaxError(
427	'Expected token {}, got {}'.format(_TOKEN_STR[kind],
428	self.cur_token.value),
429	self.cur_token.lineno)
430
431	def _at_keyword(self, keyword):
432	return (self.cur_token.kind == TokenKind.Keyword and
433	self.cur_token.value == keyword)
434
435
436	_PRIMITIVE_TYPES = [
437	'string',
438	'int',
439	'uint16', # used for Token length - should we generalize this?
440	'BigInt',
441	'float',
442	'bool',
443
444	# 'any' is used for value.{BuiltinProc,BuiltinFunc}, to cast from class
445	# type
446	'any',
447	]
448
449
450	def _ResolveType(typ, type_lookup):
451	# type: (AST, Dict[str, Any]) -> None
452	"""Recursively attach a 'resolved' field to AST nodes."""
453	if isinstance(typ, ast.NamedType):
454	if typ.name not in _PRIMITIVE_TYPES:
455	ast_node = type_lookup.get(typ.name)
456	if ast_node is None:
457	raise ASDLSyntaxError("Couldn't find type %r" % typ.name)
458	typ.resolved = ast_node
459
460	elif isinstance(typ, ast.ParameterizedType):
461	for child in typ.children:
462	_ResolveType(child, type_lookup)
463
464	if typ.type_name == 'Optional':
465	child = typ.children[0]
466	if isinstance(child, ast.NamedType):
467	if child.name in _PRIMITIVE_TYPES and child.name != 'string':
468	raise ASDLSyntaxError(
469	'Optional primitive type {} not allowed'.format(
470	child.name))
471
472	if child.resolved and isinstance(child.resolved,
473	ast.SimpleSum):
474	raise ASDLSyntaxError(
475	'Optional simple sum type {} not allowed'.format(
476	child.name))
477
478	else:
479	raise AssertionError()
480
481
482	def _ResolveFields(field_ast_nodes, type_lookup):
483	"""
484	Args:
485	type_lookup: Populated by name resolution
486	"""
487	for field in field_ast_nodes:
488	_ResolveType(field.typ, type_lookup)
489
490
491	def _ResolveModule(module, app_types):
492	"""Name resolution for NamedType."""
493	# Types that fields are declared with: int, id, word_part, etc.
494	# Fields are NOT declared with Constructor names.
495	type_lookup = dict(app_types)
496
497	# Note: we don't actually load the type, and instead leave that to MyPy /
498	# C++. A consequence of this is TypeNameHeuristic().
499	for u in module.uses:
500	for type_name in u.type_names:
501	type_lookup[type_name] = u # type: ast.Use()
502
503	# NOTE: We need two passes because types can be mutually recursive, e.g.
504	# asdl/arith.asdl.
505
506	# First pass: collect declared types and make entries for them.
507	for d in module.dfns:
508	if d.name in type_lookup:
509	raise ASDLSyntaxError('Type %r was already defined' % d.name)
510
511	if isinstance(d, SubTypeDecl):
512	# e.g. CompoundWord < List[str]
513	type_lookup[d.name] = d.base_class
514	else:
515	# e.g. Token = (str a)
516	type_lookup[d.name] = d.value
517
518	# Second pass: add NamedType.resolved field
519	for d in module.dfns:
520	if isinstance(d, SubTypeDecl): # no fields
521	_ResolveType(d.base_class, type_lookup)
522	continue
523
524	ast_node = d.value
525	if isinstance(ast_node, ast.Product):
526	#log('fields %s', ast_node.fields)
527	_ResolveFields(ast_node.fields, type_lookup)
528	continue
529
530	if isinstance(ast_node, ast.Sum):
531	for cons in ast_node.types:
532	_ResolveFields(cons.fields, type_lookup)
533	continue
534
535	raise AssertionError(ast_node)
536
537
538	def LoadSchema(f, app_types, verbose=False):
539	"""Returns an AST for the schema."""
540	p = ASDLParser()
541	schema_ast = p.parse(f)
542	if verbose:
543	import sys
544	schema_ast.Print(sys.stdout, 0)
545
546	# Make sure all the names are valid
547	_ResolveModule(schema_ast, app_types)
548	return schema_ast