asdl/front_end.py

OILS / asdl / front_end.py View on Github | oilshell.org

551 lines, 317 significant

1	"""front_end.py: Lexer and parser for the ASDL schema language."""
2	from __future__ import print_function
3
4	import re
5
6	from asdl import ast
7	from asdl.ast import (AST, Use, Module, TypeDecl, SubTypeDecl, Constructor,
8	Field, Sum, SimpleSum, Product)
9	from asdl.util import log
10
11	# type checking not turned on yet
12	from typing import Dict, Any
13
14	_ = log
15
16	_KEYWORDS = ['use', 'module', 'generate']
17
18	_TOKENS = [
19	('Keyword', ''),
20	('Name', ''),
21
22	# For operators, the string matters
23	('Equals', '='),
24	('Comma', ','),
25	('Question', '?'),
26	('Pipe', '\|'),
27	('Asterisk', '*'),
28	('LParen', '('),
29	('RParen', ')'),
30	('LBrace', '{'),
31	('RBrace', '}'),
32	('Percent', '%'),
33	('LessThan', '<'), # for subtyping CompoundWord < List[word_part]
34
35	# Oils addition for parameterized types.
36	('LBracket', '['),
37	('RBracket', ']'),
38
39	# - Start with Dict[string, bool].
40	# - List[string] is an alias for string*
41	#
42	# statically typed: Dict and List
43	# dynamically typed: dict and list
44	]
45
46	_TOKEN_STR = [name for name, _ in _TOKENS] # integer -> string like LParen
47	_TOKEN_INT = {} # string like '(' -> integer
48
49
50	class TokenKind(object):
51	"""ASDL tokens.
52
53	TokenKind.LBrace = 5, etc.
54	"""
55	pass
56
57
58	for i, (name, val) in enumerate(_TOKENS):
59	setattr(TokenKind, name, i)
60	_TOKEN_INT[val] = i
61
62
63	class Token(object):
64
65	def __init__(self, kind, value, lineno):
66	self.kind = kind
67	self.value = value
68	self.lineno = lineno
69
70
71	class ASDLSyntaxError(Exception):
72
73	def __init__(self, msg, lineno=None):
74	self.msg = msg
75	self.lineno = lineno or '<unknown>'
76
77	def __str__(self):
78	return 'Syntax error on line {0.lineno}: {0.msg}'.format(self)
79
80
81	def _Tokenize(f):
82	"""Tokenize the given buffer.
83
84	Yield Token objects.
85	"""
86	for lineno, line in enumerate(f, 1):
87	for m in re.finditer(r'\s(\w+\|--.\|#.*\|.)', line.strip()):
88	c = m.group(1)
89	if c in _KEYWORDS:
90	yield Token(TokenKind.Keyword, c, lineno)
91
92	elif c[0].isalpha():
93	yield Token(TokenKind.Name, c, lineno)
94
95	elif c.startswith('--') or c.startswith('#'):
96	# ASDL comments start with --
97	# Added # comments like Python and shell
98	break
99
100	else:
101	# Operators
102	try:
103	op_kind = _TOKEN_INT[c]
104	except KeyError:
105	raise ASDLSyntaxError('Invalid operator %s' % c, lineno)
106	yield Token(op_kind, c, lineno)
107
108
109	def _SumIsSimple(variant_list):
110	"""Return True if a sum is a simple.
111
112	A sum is simple if its types have no fields, e.g. unaryop = Invert \|
113	Not \| UAdd \| USub
114	"""
115	for t in variant_list:
116	if t.fields or t.shared_type:
117	return False
118	return True
119
120
121	_CODE_GEN_OPTIONS = [
122	'no_namespace_suffix', # Id.Foo instead of Id_e.Foo
123	'integers', # integer builtin_i instead of strongly typed builtin_e
124	'uint16', # like integers, but use uint16_t instead
125	'bit_set', # not implemented: 1 << n instead of n
126
127	# probably don't need this
128	# 'common_synthetic_field:left_tok',
129
130	# Put this type, and transitive closure of types it references, in the
131	# unique "first class variant" namespace, and generate type reflection.
132	'reflect_all_types',
133
134	# Squeeze and Freeze, with the number of bits as a option Hm the headers
135	# here still need type reflection. Probably OK.
136	'mirror_all_types:16',
137	]
138
139
140	class ASDLParser(object):
141	"""Parser for ASDL files.
142
143	Create, then call the parse method on a buffer containing ASDL. This
144	is a simple recursive descent parser that uses _Tokenize for the
145	lexing.
146	"""
147
148	def __init__(self):
149	self._tokenizer = None
150	self.cur_token = None
151
152	def parse(self, f):
153	"""Parse the ASDL in the file and return an AST with a Module root."""
154	self._tokenizer = _Tokenize(f)
155	self._advance()
156	return self._parse_module()
157
158	def _parse_module(self):
159	"""
160	type_decl : NAME '=' compound_type
161	module : 'module' NAME '{' use* type_decl* '}'
162
163	We added:
164	- use for imports
165	- generate on sum types
166	"""
167	if not self._at_keyword('module'):
168	raise ASDLSyntaxError(
169	'Expected "module" (found {})'.format(self.cur_token.value),
170	self.cur_token.lineno)
171	self._advance()
172	name = self._match(TokenKind.Name)
173	self._match(TokenKind.LBrace)
174
175	uses = []
176	while self._at_keyword('use'):
177	uses.append(self._parse_use())
178
179	defs = []
180	while self.cur_token.kind == TokenKind.Name:
181	typename = self._advance()
182	if self.cur_token.kind == TokenKind.Equals:
183	self._advance()
184	type_ = self._parse_compound_type()
185	defs.append(TypeDecl(typename, type_))
186	elif self.cur_token.kind == TokenKind.LessThan:
187	self._advance()
188	type_ = self._parse_type_expr()
189	defs.append(SubTypeDecl(typename, type_))
190	else:
191	raise ASDLSyntaxError(
192	'Expected = or < after type name (found {})'.format(
193	self.cur_token.value), self.cur_token.lineno)
194
195	self._match(TokenKind.RBrace)
196	return Module(name, uses, defs)
197
198	def _parse_use(self):
199	"""
200	use: 'use' NAME+ '{' NAME+ '}'
201
202	example: use frontend syntax { Token }
203
204	This means frontend/syntax.asdl.h :: Token
205	"""
206	self._advance() # past 'use'
207	module_parts = []
208	while self.cur_token.kind == TokenKind.Name:
209	part = self._advance()
210	module_parts.append(part)
211
212	self._match(TokenKind.LBrace)
213
214	type_names = []
215	while self.cur_token.kind == TokenKind.Name:
216	t = self._advance()
217	type_names.append(t)
218	if self.cur_token.kind == TokenKind.RParen:
219	break
220	elif self.cur_token.kind == TokenKind.Comma:
221	self._advance()
222
223	self._match(TokenKind.RBrace)
224	#print('MOD %s' % module_parts)
225	return Use(module_parts, type_names)
226
227	def _parse_compound_type(self):
228	"""
229	constructor : NAME fields?
230	\| NAME '%' NAME # shared variant
231
232	sum : constructor ('\|' constructor)* generate?
233
234	compound_type : product
235	\| sum
236
237	Examples:
238	alloc_members =
239	List
240	\| Dict
241	\| Struct
242	generate [bit_set]
243
244	-- color::Red, not color_e::Red or color_i::Red
245	color = Red \| Green
246	generate [integers, no_sum_suffix]
247	"""
248	if self.cur_token.kind == TokenKind.LParen:
249	# If we see a (, it's a product
250	return self._parse_product()
251	else:
252	# Otherwise it's a sum. Look for ConstructorId
253	sumlist = []
254	while True:
255	cons_name = self._match(TokenKind.Name)
256
257	shared_type = None
258	fields = None
259	if self.cur_token.kind == TokenKind.LParen:
260	fields = self._parse_fields()
261	elif self.cur_token.kind == TokenKind.Percent:
262	self._advance()
263	shared_type = self._match(TokenKind.Name)
264	else:
265	pass
266
267	cons = Constructor(cons_name, shared_type, fields)
268	sumlist.append(cons)
269
270	if self.cur_token.kind != TokenKind.Pipe:
271	break
272	self._advance()
273	generate = self._parse_optional_generate()
274
275	# Additional validation
276	if generate is not None:
277	for g in generate:
278	if g not in _CODE_GEN_OPTIONS:
279	raise ASDLSyntaxError('Invalid code gen option %r' % g,
280	self.cur_token.lineno)
281
282	if _SumIsSimple(sumlist):
283	return SimpleSum(sumlist, generate)
284	else:
285	return Sum(sumlist, generate)
286
287	def _parse_type_expr(self):
288	"""One or two params:
289
290	type_params : '[' type_expr ( ',' type_expr )* ']'
291
292	type_expr : NAME type_params? (''?' \| '*')? # allow one suffix
293
294	NAME is validated against Optional, List, Dict afterward
295	"""
296	type_name = self._match(TokenKind.Name)
297
298	# Accept Python-like naming!
299	if type_name == 'str':
300	type_name = 'string'
301
302	children = []
303	if self.cur_token.kind == TokenKind.LBracket:
304	self._advance()
305	children.append(self._parse_type_expr())
306	if self.cur_token.kind == TokenKind.Comma:
307	self._advance()
308	children.append(self._parse_type_expr())
309
310	self._match(TokenKind.RBracket)
311
312	if type_name in ('List', 'Optional'):
313	if len(children) != 1:
314	raise ASDLSyntaxError(
315	'Expected 1 type param to {}'.format(type_name),
316	self.cur_token.lineno)
317	elif type_name == 'Dict':
318	if len(children) != 2:
319	raise ASDLSyntaxError(
320	'Expected 2 type params to {}'.format(type_name),
321	self.cur_token.lineno)
322	else:
323	if len(children) != 0:
324	raise ASDLSyntaxError(
325	'Expected zero type params to {}'.format(type_name),
326	self.cur_token.lineno)
327
328	if len(children):
329	typ = ast.ParameterizedType(type_name, children)
330	else:
331	typ = ast.NamedType(type_name)
332
333	if self.cur_token.kind == TokenKind.Asterisk:
334	# string* is equivalent to List[string]
335	typ = ast.ParameterizedType('List', [typ])
336	self._advance()
337
338	elif self.cur_token.kind == TokenKind.Question:
339	# string* is equivalent to Optional[string]
340	typ = ast.ParameterizedType('Optional', [typ])
341	self._advance()
342
343	return typ
344
345	def _parse_fields(self):
346	"""
347	fields_inner: type_expr NAME ( ',' type_expr NAME )* ','?
348
349	fields : '(' fields_inner? ')'
350
351	Name Quantifier? should be changed to typename.
352	"""
353	fields = []
354	self._match(TokenKind.LParen)
355	while self.cur_token.kind == TokenKind.Name:
356	typ = self._parse_type_expr()
357	field_name = self._match(TokenKind.Name)
358
359	fields.append(Field(typ, field_name))
360
361	if self.cur_token.kind == TokenKind.RParen:
362	break
363	elif self.cur_token.kind == TokenKind.Comma:
364	self._advance()
365
366	self._match(TokenKind.RParen)
367	return fields
368
369	def _parse_list(self):
370	"""
371	list_inner: NAME ( ',' NAME )* ','?
372
373	list : '[' list_inner? ']'
374	"""
375	generate = []
376	self._match(TokenKind.LBracket)
377	while self.cur_token.kind == TokenKind.Name:
378	name = self._match(TokenKind.Name)
379
380	generate.append(name)
381
382	if self.cur_token.kind == TokenKind.RBracket:
383	break
384	elif self.cur_token.kind == TokenKind.Comma:
385	self._advance()
386
387	self._match(TokenKind.RBracket)
388	return generate
389
390	def _parse_optional_generate(self):
391	"""
392	generate : 'generate' list
393	"""
394	if self._at_keyword('generate'):
395	self._advance()
396	return self._parse_list()
397	else:
398	return None
399
400	def _parse_product(self):
401	"""Product: fields attributes?"""
402	return Product(self._parse_fields())
403
404	def _advance(self):
405	"""Return current token; read next token into self.cur_token."""
406	cur_val = None if self.cur_token is None else self.cur_token.value
407	try:
408	self.cur_token = next(self._tokenizer)
409	except StopIteration:
410	self.cur_token = None
411	return cur_val
412
413	def _match(self, kind):
414	"""The 'match' primitive of RD parsers.
415
416	* Verifies that the current token is of the given kind (kind can
417	be a tuple, in which the kind must match one of its members).
418	* Returns the value of the current token
419	* Reads in the next token
420
421	Args:
422	kind: A TokenKind, or a tuple of TokenKind
423	"""
424	if self.cur_token.kind == kind:
425	value = self.cur_token.value
426	self._advance()
427	return value
428	else:
429	raise ASDLSyntaxError(
430	'Expected token {}, got {}'.format(_TOKEN_STR[kind],
431	self.cur_token.value),
432	self.cur_token.lineno)
433
434	def _at_keyword(self, keyword):
435	return (self.cur_token.kind == TokenKind.Keyword and
436	self.cur_token.value == keyword)
437
438
439	_PRIMITIVE_TYPES = [
440	'string',
441	'int',
442	'uint16', # used for Token length - should we generalize this?
443	'BigInt',
444	'float',
445	'bool',
446
447	# 'any' is used for value.{BuiltinProc,BuiltinFunc}, to cast from class
448	# type
449	'any',
450	]
451
452
453	def _ResolveType(typ, type_lookup):
454	# type: (AST, Dict[str, Any]) -> None
455	"""Recursively attach a 'resolved' field to AST nodes."""
456	if isinstance(typ, ast.NamedType):
457	if typ.name not in _PRIMITIVE_TYPES:
458	ast_node = type_lookup.get(typ.name)
459	if ast_node is None:
460	raise ASDLSyntaxError("Couldn't find type %r" % typ.name)
461	typ.resolved = ast_node
462
463	elif isinstance(typ, ast.ParameterizedType):
464	for child in typ.children:
465	_ResolveType(child, type_lookup)
466
467	if typ.type_name == 'Optional':
468	child = typ.children[0]
469	if isinstance(child, ast.NamedType):
470	if child.name in _PRIMITIVE_TYPES and child.name != 'string':
471	raise ASDLSyntaxError(
472	'Optional primitive type {} not allowed'.format(
473	child.name))
474
475	if child.resolved and isinstance(child.resolved,
476	ast.SimpleSum):
477	raise ASDLSyntaxError(
478	'Optional simple sum type {} not allowed'.format(
479	child.name))
480
481	else:
482	raise AssertionError()
483
484
485	def _ResolveFields(field_ast_nodes, type_lookup):
486	"""
487	Args:
488	type_lookup: Populated by name resolution
489	"""
490	for field in field_ast_nodes:
491	_ResolveType(field.typ, type_lookup)
492
493
494	def _ResolveModule(module, app_types):
495	"""Name resolution for NamedType."""
496	# Types that fields are declared with: int, id, word_part, etc.
497	# Fields are NOT declared with Constructor names.
498	type_lookup = dict(app_types)
499
500	# Note: we don't actually load the type, and instead leave that to MyPy /
501	# C++. A consequence of this is TypeNameHeuristic().
502	for u in module.uses:
503	for type_name in u.type_names:
504	type_lookup[type_name] = u # type: ast.Use()
505
506	# NOTE: We need two passes because types can be mutually recursive, e.g.
507	# asdl/arith.asdl.
508
509	# First pass: collect declared types and make entries for them.
510	for d in module.dfns:
511	if d.name in type_lookup:
512	raise ASDLSyntaxError('Type %r was already defined' % d.name)
513
514	if isinstance(d, SubTypeDecl):
515	# e.g. CompoundWord < List[str]
516	type_lookup[d.name] = d.base_class
517	else:
518	# e.g. Token = (str a)
519	type_lookup[d.name] = d.value
520
521	# Second pass: add NamedType.resolved field
522	for d in module.dfns:
523	if isinstance(d, SubTypeDecl): # no fields
524	_ResolveType(d.base_class, type_lookup)
525	continue
526
527	ast_node = d.value
528	if isinstance(ast_node, ast.Product):
529	#log('fields %s', ast_node.fields)
530	_ResolveFields(ast_node.fields, type_lookup)
531	continue
532
533	if isinstance(ast_node, ast.Sum):
534	for cons in ast_node.types:
535	_ResolveFields(cons.fields, type_lookup)
536	continue
537
538	raise AssertionError(ast_node)
539
540
541	def LoadSchema(f, app_types, verbose=False):
542	"""Returns an AST for the schema."""
543	p = ASDLParser()
544	schema_ast = p.parse(f)
545	if verbose:
546	import sys
547	schema_ast.Print(sys.stdout, 0)
548
549	# Make sure all the names are valid
550	_ResolveModule(schema_ast, app_types)
551	return schema_ast