OILS / asdl / front_end.py View on Github | oilshell.org

525 lines, 297 significant
1"""front_end.py: Lexer and parser for the ASDL schema language."""
2from __future__ import print_function
3
4import re
5
6from asdl import ast
7from asdl.ast import (AST, Use, Module, TypeDecl, Constructor, Field, Sum,
8 SimpleSum, Product)
9from asdl.util import log
10
11_ = log
12
# Words treated as keywords by the tokenizer; any other word is a Name token.
_KEYWORDS = ['use', 'module', 'generate']

# (kind name, operator string) pairs. The index of each pair becomes the
# integer token kind (see TokenKind below). Keyword and Name tokens carry
# variable text, so their string entry is empty.
_TOKENS = [
    ('Keyword', ''),
    ('Name', ''),

    # For operators, the string matters
    ('Equals', '='),
    ('Comma', ','),
    ('Question', '?'),
    ('Pipe', '|'),
    ('Asterisk', '*'),
    ('LParen', '('),
    ('RParen', ')'),
    ('LBrace', '{'),
    ('RBrace', '}'),
    ('Percent', '%'),

    # Oils addition for parameterized types.
    ('LBracket', '['),
    ('RBracket', ']'),

    # - Start with Dict[string, bool].
    # - List[string] is an alias for string*
    #
    # statically typed: Dict and List
    # dynamically typed: dict and list
]

_TOKEN_STR = [name for name, _ in _TOKENS]  # integer -> string like LParen
_TOKEN_INT = {}  # string like '(' -> integer; populated below TokenKind
44
45
class TokenKind(object):
    """ASDL tokens.

    TokenKind.LBrace = 5, etc.

    Attributes are attached dynamically by the loop below, one integer
    constant per entry in _TOKENS.
    """
    pass


# Assign TokenKind.<name> = index for every token, and build the operator
# string -> integer table used by the tokenizer. (The empty strings for
# Keyword/Name overwrite each other in _TOKEN_INT, but '' is never looked up.)
for i, (name, val) in enumerate(_TOKENS):
    setattr(TokenKind, name, i)
    _TOKEN_INT[val] = i
57
58
class Token(object):
    """A single lexed token: an integer kind, its text, and its line number."""

    def __init__(self, kind, value, lineno):
        # type: (int, str, int) -> None
        self.kind = kind  # integer TokenKind.* constant
        self.value = value  # the token's text, e.g. 'module' or '('
        self.lineno = lineno  # 1-based line number, for error messages

    def __repr__(self):
        # Added for debuggability when inspecting token streams.
        return 'Token(%r, %r, %r)' % (self.kind, self.value, self.lineno)
65
66
class ASDLSyntaxError(Exception):
    """Raised on malformed ASDL input; remembers the offending line."""

    def __init__(self, msg, lineno=None):
        self.msg = msg
        # When no line number is known, show a placeholder instead.
        self.lineno = lineno if lineno else '<unknown>'

    def __str__(self):
        return 'Syntax error on line {}: {}'.format(self.lineno, self.msg)
75
76
def _Tokenize(f):
    """Tokenize the given buffer.

    Yields Token objects, one per lexeme, tagged with 1-based line numbers.
    """
    # Each match is a word, a comment to end-of-line, or a single character.
    token_pat = re.compile(r'\s*(\w+|--.*|#.*|.)')
    for line_num, raw_line in enumerate(f, 1):
        for match in token_pat.finditer(raw_line.strip()):
            lexeme = match.group(1)
            # Keywords must be checked before the generic Name case, since
            # they are also alphabetic words.
            if lexeme in _KEYWORDS:
                yield Token(TokenKind.Keyword, lexeme, line_num)

            elif lexeme[0].isalpha():
                yield Token(TokenKind.Name, lexeme, line_num)

            elif lexeme.startswith(('--', '#')):
                # ASDL comments start with --
                # Added # comments like Python and shell
                break  # rest of the line is comment

            else:
                # Single-character operators like ( ) { } = | * ? ,
                try:
                    op_kind = _TOKEN_INT[lexeme]
                except KeyError:
                    raise ASDLSyntaxError('Invalid operator %s' % lexeme,
                                          line_num)
                yield Token(op_kind, lexeme, line_num)
103
104
105def _SumIsSimple(variant_list):
106 """Return True if a sum is a simple.
107
108 A sum is simple if its types have no fields, e.g. unaryop = Invert |
109 Not | UAdd | USub
110 """
111 for t in variant_list:
112 if t.fields or t.shared_type:
113 return False
114 return True
115
116
# Options accepted in a sum type's 'generate [...]' list; validated in
# ASDLParser._parse_compound_type.
_CODE_GEN_OPTIONS = [
    'no_namespace_suffix',  # Id.Foo instead of Id_e.Foo
    'integers',  # integer builtin_i instead of strongly typed builtin_e
    'uint16',  # like integers, but use uint16_t instead
    'bit_set',  # not implemented: 1 << n instead of n

    # probably don't need this
    # 'common_synthetic_field:left_tok',

    # Put this type, and transitive closure of types it references, in the
    # unique "first class variant" namespace, and generate type reflection.
    'reflect_all_types',

    # Squeeze and Freeze, with the number of bits as an option. Hm, the
    # headers here still need type reflection. Probably OK.
    'mirror_all_types:16',
]
134
135
class ASDLParser(object):
    """Parser for ASDL files.

    Create, then call the parse method on a buffer containing ASDL. This
    is a simple recursive descent parser that uses _Tokenize for the
    lexing.
    """

    def __init__(self):
        self._tokenizer = None  # generator of Token, created by parse()
        self.cur_token = None  # one token of lookahead; None at EOF

    def parse(self, f):
        """Parse the ASDL in the file and return an AST with a Module root."""
        self._tokenizer = _Tokenize(f)
        self._advance()
        return self._parse_module()

    def _parse_module(self):
        """
        type_decl : NAME '=' compound_type
        module : 'module' NAME '{' use* type_decl* '}'

        We added:
        - use for imports
        - generate on sum types
        """
        if not self._at_keyword('module'):
            raise ASDLSyntaxError(
                'Expected "module" (found {})'.format(self.cur_token.value),
                self.cur_token.lineno)
        self._advance()
        name = self._match(TokenKind.Name)
        self._match(TokenKind.LBrace)

        uses = []
        while self._at_keyword('use'):
            uses.append(self._parse_use())

        defs = []
        while self.cur_token.kind == TokenKind.Name:
            typename = self._advance()
            self._match(TokenKind.Equals)
            type_ = self._parse_compound_type()
            defs.append(TypeDecl(typename, type_))

        self._match(TokenKind.RBrace)
        return Module(name, uses, defs)

    def _parse_use(self):
        """
        use: 'use' NAME+ '{' NAME+ '}'

        example: use frontend syntax { Token }

        This means frontend/syntax.asdl.h :: Token
        """
        self._advance()  # past 'use'
        module_parts = []
        while self.cur_token.kind == TokenKind.Name:
            part = self._advance()
            module_parts.append(part)

        self._match(TokenKind.LBrace)

        type_names = []
        while self.cur_token.kind == TokenKind.Name:
            t = self._advance()
            type_names.append(t)
            # BUG FIX: this list is brace-delimited, so the terminator is
            # RBrace.  It previously compared against RParen, which can
            # never follow a Name here; the loop only terminated via the
            # enclosing while condition.
            if self.cur_token.kind == TokenKind.RBrace:
                break
            elif self.cur_token.kind == TokenKind.Comma:
                self._advance()

        self._match(TokenKind.RBrace)
        #print('MOD %s' % module_parts)
        return Use(module_parts, type_names)

    def _parse_compound_type(self):
        """
        constructor : NAME fields?
                    | NAME '%' NAME  # shared variant

        sum : constructor ('|' constructor)* generate?

        compound_type : product
                      | sum

        Examples:
          alloc_members =
            List
          | Dict
          | Struct
            generate [bit_set]

          -- color::Red, not color_e::Red or color_i::Red
          color = Red | Green
            generate [integers, no_namespace_suffix]
        """
        if self.cur_token.kind == TokenKind.LParen:
            # If we see a (, it's a product
            return self._parse_product()
        else:
            # Otherwise it's a sum. Look for ConstructorId
            sumlist = []
            while True:
                cons_name = self._match(TokenKind.Name)

                shared_type = None
                fields = None
                if self.cur_token.kind == TokenKind.LParen:
                    fields = self._parse_fields()
                elif self.cur_token.kind == TokenKind.Percent:
                    self._advance()
                    shared_type = self._match(TokenKind.Name)
                else:
                    # Field-less variant, e.g. one arm of a simple sum.
                    pass

                cons = Constructor(cons_name, shared_type, fields)
                sumlist.append(cons)

                if self.cur_token.kind != TokenKind.Pipe:
                    break
                self._advance()
            generate = self._parse_optional_generate()

            # Additional validation
            if generate is not None:
                for g in generate:
                    if g not in _CODE_GEN_OPTIONS:
                        raise ASDLSyntaxError('Invalid code gen option %r' % g,
                                              self.cur_token.lineno)

            if _SumIsSimple(sumlist):
                return SimpleSum(sumlist, generate)
            else:
                return Sum(sumlist, generate)

    def _parse_type_expr(self):
        """One or two params:

        type_params : '[' type_expr ( ',' type_expr )* ']'

        type_expr : NAME type_params? ('?' | '*')?  # allow one suffix

        NAME is validated against Optional, List, Dict afterward
        """
        type_name = self._match(TokenKind.Name)

        # Accept Python-like naming!
        if type_name == 'str':
            type_name = 'string'

        children = []
        if self.cur_token.kind == TokenKind.LBracket:
            self._advance()
            children.append(self._parse_type_expr())
            if self.cur_token.kind == TokenKind.Comma:
                self._advance()
                children.append(self._parse_type_expr())

            self._match(TokenKind.RBracket)

        # Arity check: List/Optional take 1 param, Dict takes 2, anything
        # else takes none.
        if type_name in ('List', 'Optional'):
            if len(children) != 1:
                raise ASDLSyntaxError(
                    'Expected 1 type param to {}'.format(type_name),
                    self.cur_token.lineno)
        elif type_name == 'Dict':
            if len(children) != 2:
                raise ASDLSyntaxError(
                    'Expected 2 type params to {}'.format(type_name),
                    self.cur_token.lineno)
        else:
            if len(children) != 0:
                raise ASDLSyntaxError(
                    'Expected zero type params to {}'.format(type_name),
                    self.cur_token.lineno)

        if len(children):
            typ = ast.ParameterizedType(type_name, children)
        else:
            typ = ast.NamedType(type_name)

        if self.cur_token.kind == TokenKind.Asterisk:
            # string* is equivalent to List[string]
            typ = ast.ParameterizedType('List', [typ])
            self._advance()

        elif self.cur_token.kind == TokenKind.Question:
            # string? is equivalent to Optional[string]
            typ = ast.ParameterizedType('Optional', [typ])
            self._advance()

        return typ

    def _parse_fields(self):
        """
        fields_inner: type_expr NAME ( ',' type_expr NAME )* ','?

        fields : '(' fields_inner? ')'

        Name Quantifier? should be changed to typename.
        """
        fields = []
        self._match(TokenKind.LParen)
        while self.cur_token.kind == TokenKind.Name:
            typ = self._parse_type_expr()
            field_name = self._match(TokenKind.Name)

            fields.append(Field(typ, field_name))

            if self.cur_token.kind == TokenKind.RParen:
                break
            elif self.cur_token.kind == TokenKind.Comma:
                self._advance()

        self._match(TokenKind.RParen)
        return fields

    def _parse_list(self):
        """
        list_inner: NAME ( ',' NAME )* ','?

        list : '[' list_inner? ']'
        """
        generate = []
        self._match(TokenKind.LBracket)
        while self.cur_token.kind == TokenKind.Name:
            name = self._match(TokenKind.Name)

            generate.append(name)

            if self.cur_token.kind == TokenKind.RBracket:
                break
            elif self.cur_token.kind == TokenKind.Comma:
                self._advance()

        self._match(TokenKind.RBracket)
        return generate

    def _parse_optional_generate(self):
        """
        generate : 'generate' list

        Returns the list of option names, or None if 'generate' is absent.
        """
        if self._at_keyword('generate'):
            self._advance()
            return self._parse_list()
        else:
            return None

    def _parse_product(self):
        """Product: fields attributes?"""
        return Product(self._parse_fields())

    def _advance(self):
        """Return current token's value; read next token into self.cur_token.

        self.cur_token becomes None at end of input.
        """
        cur_val = None if self.cur_token is None else self.cur_token.value
        try:
            self.cur_token = next(self._tokenizer)
        except StopIteration:
            self.cur_token = None
        return cur_val

    def _match(self, kind):
        """The 'match' primitive of RD parsers.

        * Verifies that the current token is of the given kind (kind can
          be a tuple, in which case the kind must match one of its members).
        * Returns the value of the current token
        * Reads in the next token

        Args:
          kind: A TokenKind, or a tuple of TokenKind
        """
        # BUG FIX: report truncated input as a syntax error instead of
        # crashing with AttributeError on None.
        if self.cur_token is None:
            raise ASDLSyntaxError('Unexpected end of input')

        # BUG FIX: the docstring promised tuple support (as in the upstream
        # CPython asdl.py parser), but only a single kind was handled.
        if isinstance(kind, tuple):
            matched = self.cur_token.kind in kind
            expected = '/'.join(_TOKEN_STR[k] for k in kind)
        else:
            matched = self.cur_token.kind == kind
            expected = _TOKEN_STR[kind]

        if matched:
            value = self.cur_token.value
            self._advance()
            return value
        else:
            raise ASDLSyntaxError(
                'Expected token {}, got {}'.format(expected,
                                                   self.cur_token.value),
                self.cur_token.lineno)

    def _at_keyword(self, keyword):
        # None-safe so a truncated file doesn't raise AttributeError.
        return (self.cur_token is not None and
                self.cur_token.kind == TokenKind.Keyword and
                self.cur_token.value == keyword)
424
425
# Built-in type names; _ResolveType leaves these unresolved rather than
# looking them up in the module's declarations.
_PRIMITIVE_TYPES = [
    'string',
    'int',
    'uint16',  # used for Token length - should we generalize this?
    'BigInt',
    'float',
    'bool',

    # 'any' is used for value.{BuiltinProc,BuiltinFunc}, to cast from class
    # type
    'any',
]
438
439
def _ResolveType(typ, type_lookup):
    # type: (AST, dict) -> None
    """Recursively attach a 'resolved' field to AST nodes.

    Args:
      typ: an ast.NamedType or ast.ParameterizedType
      type_lookup: maps declared type names to their AST nodes

    Raises:
      ASDLSyntaxError: on an unknown type name, or an invalid Optional
        parameter (primitive other than string, or simple sum).
    """
    if isinstance(typ, ast.NamedType):
        # Primitives are built in; everything else must be declared.
        if typ.name not in _PRIMITIVE_TYPES:
            ast_node = type_lookup.get(typ.name)
            if ast_node is None:
                raise ASDLSyntaxError("Couldn't find type %r" % typ.name)
            typ.resolved = ast_node

    elif isinstance(typ, ast.ParameterizedType):
        for child in typ.children:
            _ResolveType(child, type_lookup)

        if typ.type_name == 'Optional':
            child = typ.children[0]
            if isinstance(child, ast.NamedType):
                if child.name in _PRIMITIVE_TYPES and child.name != 'string':
                    raise ASDLSyntaxError(
                        'Optional primitive type {} not allowed'.format(
                            child.name))

                if child.resolved and isinstance(child.resolved,
                                                 ast.SimpleSum):
                    raise ASDLSyntaxError(
                        'Optional simple sum type {} not allowed'.format(
                            child.name))

    else:
        # BUG FIX: include the offending node, consistent with the
        # AssertionError in _ResolveModule; a bare AssertionError() gave no
        # clue what went wrong.
        raise AssertionError('Unexpected type %r' % typ)
470
471
def _ResolveFields(field_ast_nodes, type_lookup):
    """Resolve the declared type of every field in the given list.

    Args:
      field_ast_nodes: list of ast.Field
      type_lookup: Populated by name resolution
    """
    for f in field_ast_nodes:
        _ResolveType(f.typ, type_lookup)
479
480
def _ResolveModule(module, app_types):
    """Name resolution for NamedType."""
    # The table holds the names fields may be declared with (int, id,
    # word_part, ...). Constructor names never appear in it.
    type_lookup = dict(app_types)

    # We don't actually load imported modules; MyPy / C++ handle that
    # later. A consequence of this is TypeNameHeuristic().
    for use in module.uses:
        for type_name in use.type_names:
            type_lookup[type_name] = use  # type: ast.Use()

    # Two passes are required because types can be mutually recursive, e.g.
    # asdl/arith.asdl.

    # Pass 1: record every declared type.
    for d in module.dfns:
        type_lookup[d.name] = d.value

    # Pass 2: attach NamedType.resolved throughout each declaration.
    for d in module.dfns:
        node = d.value
        if isinstance(node, ast.Product):
            #log('fields %s', node.fields)
            _ResolveFields(node.fields, type_lookup)

        elif isinstance(node, ast.Sum):
            for variant in node.types:
                _ResolveFields(variant.fields, type_lookup)

        else:
            raise AssertionError(node)
513
514
def LoadSchema(f, app_types, verbose=False):
    """Parse and name-resolve an ASDL schema; return its Module AST."""
    parser = ASDLParser()
    schema_ast = parser.parse(f)
    if verbose:
        import sys
        schema_ast.Print(sys.stdout, 0)

    # Make sure all the names are valid
    _ResolveModule(schema_ast, app_types)
    return schema_ast