asdl/parse.py

OILS / asdl / parse.py View on Github | oils.pub

473 lines, 258 significant

1	#!/usr/bin/env python2
2	"""
3	parse.py
4	"""
5	from __future__ import print_function
6
7	import re
8
9	from asdl import ast
10	from asdl.ast import (Use, Module, TypeDecl, SubTypeDecl, Constructor, Field,
11	Sum, SimpleSum, Product, Extern)
12
13	from typing import Any, IO
14
# Words that the tokenizer reports as Keyword tokens instead of Name tokens.
_KEYWORDS = ['use', 'module', 'generate', 'extern']

# (kind name, operator text) rows.  The position of a row in this list is the
# integer value of that token kind, so the order must not change.
_TOKENS = [
    # Keyword and Name have no fixed text, hence the empty string.
    ('Keyword', ''),
    ('Name', ''),

    # For operators, the string matters
    ('Equals', '='),
    ('Comma', ','),
    ('Question', '?'),
    ('Pipe', '|'),
    ('Asterisk', '*'),
    ('LParen', '('),
    ('RParen', ')'),
    ('LBrace', '{'),
    ('RBrace', '}'),
    ('Percent', '%'),
    ('LessThan', '<'),  # for subtyping CompoundWord < List[word_part]

    # Oils addition for parameterized types.
    ('LBracket', '['),
    ('RBracket', ']'),

    # - Start with Dict[string, bool].
    # - List[string] is an alias for string*
    #
    # statically typed: Dict and List
    # dynamically typed: dict and list
]

_TOKEN_STR = [name for name, _ in _TOKENS]  # integer -> string like LParen
_TOKEN_INT = {}  # string like '(' -> integer
47
48
class TokenKind(object):
    """ASDL token kinds, exposed as integer class attributes.

    Nothing is declared in the class body.  The module-level loop that
    follows this class sets one attribute per row of _TOKENS, using the row's
    index as the value: TokenKind.Keyword == 0, TokenKind.Name == 1,
    TokenKind.Equals == 2, and so on.
    """
    pass
55
56
# Give every token kind its index in _TOKENS as an integer attribute of
# TokenKind, and map each operator's text to that same integer.
for i, (name, val) in enumerate(_TOKENS):
    setattr(TokenKind, name, i)
    if val:
        # Keyword and Name have no fixed text (their entry is ''), so they
        # must not appear in the operator lookup table.
        _TOKEN_INT[val] = i
60
61
class Token(object):
    """A single lexed token: its kind, its text, and its source line."""

    def __init__(self, kind, value, lineno):
        # type: (int, str, int) -> None
        # kind:   integer token kind (a TokenKind attribute value)
        # value:  the text that was matched
        # lineno: line number the token was read from
        self.kind, self.value, self.lineno = kind, value, lineno
69
70
class ASDLSyntaxError(Exception):
    """Raised when ASDL input cannot be tokenized or parsed.

    Attributes:
      msg: description of the problem.
      lineno: line number of the offending token, or the string '<unknown>'
        when no line number was supplied.
    """

    def __init__(self, msg, lineno=None):
        # type: (str, Any) -> None
        # Hand the arguments to Exception so that .args, repr() and pickling
        # carry them; without this call .args is empty and the exception
        # cannot be reconstructed from it.
        super(ASDLSyntaxError, self).__init__(msg, lineno)
        self.msg = msg
        self.lineno = lineno or '<unknown>'

    def __str__(self):
        return 'Syntax error on line {0.lineno}: {0.msg}'.format(self)
80
81
# One token per match, after optional leading whitespace.  The alternatives
# are tried in order: a word (keyword or name), a '--' comment running to the
# end of the line, a '#' comment running to the end of the line, or any
# single character (an operator).
TOKEN_RE = r'\s*(\w+|--.*|#.*|.)'
83
84
def _Tokenize(f):
    # type: (IO[bytes]) -> Any
    """Lex an ASDL source, one line at a time.

    Each line of f is stripped and scanned with TOKEN_RE.  Line numbers
    attached to the tokens start at 1.

    Yields:
      Token objects, in source order.

    Raises:
      ASDLSyntaxError: a piece of text is neither a keyword, a name, a
        comment, nor a known operator.
    """
    for lineno, line in enumerate(f, 1):
        for match in re.finditer(TOKEN_RE, line.strip()):
            text = match.group(1)

            # ASDL comments start with --
            # Added # comments like Python and shell
            # Either way the rest of the line is ignored.
            if text.startswith(('--', '#')):
                break

            if text in _KEYWORDS:
                yield Token(TokenKind.Keyword, text, lineno)  # type: ignore
                continue

            first = text[0]
            if first == '_' or first.isalpha():
                yield Token(TokenKind.Name, text, lineno)  # type: ignore
                continue

            # Whatever is left has to be an operator
            if text not in _TOKEN_INT:
                raise ASDLSyntaxError('Invalid operator %s' % text, lineno)
            yield Token(_TOKEN_INT[text], text, lineno)
112
113
def _SumIsSimple(variant_list):
    """Tell whether a sum type is "simple".

    A sum is simple when none of its variants has fields or a shared type,
    e.g.

        unaryop = Invert | Not | UAdd | USub

    Args:
      variant_list: the variants of the sum; each must have `fields` and
        `shared_type` attributes.

    Returns:
      True if every variant has falsy `fields` and falsy `shared_type`
      (which includes the empty list), otherwise False.
    """
    return not any(v.fields or v.shared_type for v in variant_list)
124
125
# Names accepted inside a sum type's `generate [...]` list.  The parser
# compares each listed name against this list and raises ASDLSyntaxError
# ('Invalid code gen option ...') for anything else.
_CODE_GEN_OPTIONS = [
    'no_namespace_suffix',  # Id.Foo instead of Id_e.Foo
    'integers',  # integer builtin_i instead of strongly typed builtin_e
    'uint16',  # like integers, but use uint16_t instead
    'bit_set',  # not implemented: 1 << n instead of n

    # probably don't need this
    # 'common_synthetic_field:left_tok',

    # Put this type, and transitive closure of types it references, in the
    # unique "first class variant" namespace, and generate type reflection.
    'reflect_all_types',

    # Squeeze and Freeze, with the number of bits as an option.  Hm, the
    # headers here still need type reflection.  Probably OK.
    # NOTE(review): the tokenizer has no ':' operator, so it is unclear how
    # a name containing ':16' could reach the membership check -- confirm.
    'mirror_all_types:16',
]
143
144
class ASDLParser(object):
    """Parser for ASDL files.

    Create, then call the parse method on a buffer containing ASDL.  This
    is a simple recursive descent parser that uses _Tokenize for the
    lexing.

    The parser looks one token ahead: self.cur_token is always the next
    token that has not been consumed yet.  Once the tokenizer runs dry,
    cur_token becomes a sentinel token of kind _EOF_KIND, so input that ends
    too early is reported as ASDLSyntaxError rather than failing on a
    missing token.
    """

    # Kind of the sentinel token installed at end of input.  TokenKind values
    # are indices into _TOKENS (0 and up), so a negative number never
    # compares equal to a real kind.
    _EOF_KIND = -1

    def __init__(self):
        # type: () -> None
        self._tokenizer = None
        self.cur_token = None  # type: Token

    def parse(self, f):
        # type: (IO[bytes]) -> ast.Module
        """Parse the ASDL in the file and return an AST with a Module root.

        Raises:
          ASDLSyntaxError: the input does not follow the grammar.
        """
        self._tokenizer = _Tokenize(f)
        self._advance()  # load the first token into cur_token
        return self._parse_module()

    def _parse_module(self):
        """
        type_decl  : NAME '=' compound_type
                   | NAME '<' type_expr
        module     : 'module' NAME '{'
                         use*
                         extern*
                         type_decl*
                     '}'

        We added:
        - use for imports
        - extern declarations
        - NAME '<' type_expr for subtyping
        - generate on sum types

        Returns:
          Module(name, uses, externs, defs)
        """
        if not self._at_keyword('module'):
            raise ASDLSyntaxError(
                'Expected "module" (found {})'.format(self.cur_token.value),
                self.cur_token.lineno)
        self._advance()
        name = self._match(TokenKind.Name)
        self._match(TokenKind.LBrace)

        uses = []
        while self._at_keyword('use'):
            uses.append(self._parse_use())

        externs = []
        while self._at_keyword('extern'):
            externs.append(self._parse_extern())

        defs = []
        while self.cur_token.kind == TokenKind.Name:
            typename = self._advance()
            if self.cur_token.kind == TokenKind.Equals:
                self._advance()
                type_ = self._parse_compound_type()
                defs.append(TypeDecl(typename, type_))
            elif self.cur_token.kind == TokenKind.LessThan:
                self._advance()
                type_ = self._parse_type_expr()
                defs.append(SubTypeDecl(typename, type_))
            else:
                raise ASDLSyntaxError(
                    'Expected = or < after type name (found {})'.format(
                        self.cur_token.value), self.cur_token.lineno)

        self._match(TokenKind.RBrace)
        return Module(name, uses, externs, defs)

    def _parse_use(self):
        """
        use: 'use' NAME+ '{' NAME+ '}'

        example: use frontend syntax { Token }

        This means frontend/syntax.asdl.h :: Token

        A comma after a type name is accepted and skipped.  The NAME+ counts
        in the grammar are not enforced here: an empty module path or an
        empty type list is parsed without error.

        Returns:
          Use(module_parts, type_names)
        """
        self._advance()  # past 'use'
        module_parts = []
        while self.cur_token.kind == TokenKind.Name:
            part = self._advance()
            module_parts.append(part)

        self._match(TokenKind.LBrace)

        type_names = []
        while self.cur_token.kind == TokenKind.Name:
            t = self._advance()
            type_names.append(t)
            # The list is closed by '}', not ')'.
            if self.cur_token.kind == TokenKind.RBrace:
                break
            elif self.cur_token.kind == TokenKind.Comma:
                self._advance()

        self._match(TokenKind.RBrace)
        return Use(module_parts, type_names)

    def _parse_extern(self):
        """
        extern: 'extern' '[' NAME+ ']'

        Examples:
            extern [ _Builtin ]
            extern [ _Callable ]

        Returns:
          Extern(names), where names holds at least one name.
        """
        self._advance()  # past 'extern'

        self._match(TokenKind.LBracket)

        # At least one name
        names = [self._match(TokenKind.Name)]

        while self.cur_token.kind == TokenKind.Name:
            names.append(self._advance())

        self._match(TokenKind.RBracket)

        return Extern(names)

    def _parse_compound_type(self):
        """
        constructor : NAME fields?
                    | NAME '%' NAME  # shared variant

        sum         : constructor ('|' constructor)* generate?

        compound_type : product
                      | sum

        Examples:
            alloc_members =
              List
            | Dict
            | Struct
            generate [bit_set]

            -- color::Red, not color_e::Red or color_i::Red
            color = Red | Green
                    generate [integers, no_namespace_suffix]

        Returns:
          Product for a type that starts with '(', otherwise SimpleSum when
          no variant has fields or a shared type, otherwise Sum.

        Raises:
          ASDLSyntaxError: on a grammar error, or when a name in the
            generate list is not in _CODE_GEN_OPTIONS.
        """
        if self.cur_token.kind == TokenKind.LParen:
            # If we see a (, it's a product
            return self._parse_product()

        # Otherwise it's a sum.  Look for ConstructorId
        sumlist = []
        while True:
            cons_name = self._match(TokenKind.Name)

            shared_type = None
            fields = None
            if self.cur_token.kind == TokenKind.LParen:
                fields = self._parse_fields()
            elif self.cur_token.kind == TokenKind.Percent:
                self._advance()
                shared_type = self._match(TokenKind.Name)

            sumlist.append(Constructor(cons_name, shared_type, fields))

            if self.cur_token.kind != TokenKind.Pipe:
                break
            self._advance()

        generate = self._parse_optional_generate()

        # Additional validation
        if generate is not None:
            for g in generate:
                if g not in _CODE_GEN_OPTIONS:
                    raise ASDLSyntaxError('Invalid code gen option %r' % g,
                                          self.cur_token.lineno)

        if _SumIsSimple(sumlist):
            return SimpleSum(sumlist, generate)
        else:
            return Sum(sumlist, generate)

    def _parse_type_expr(self):
        """One or two params:

        type_params : '[' type_expr ( ',' type_expr )? ']'

        type_expr   : NAME type_params? ('?' | '*')?  # allow one suffix

        The number of params is checked against NAME: List and Optional take
        exactly one, Dict takes exactly two, and any other name takes none.
        The name 'str' is accepted as a spelling of 'string'.

        Returns:
          ast.NamedType or ast.ParameterizedType.  A '*' suffix wraps the
          type in List and a '?' suffix wraps it in Optional.

        Raises:
          ASDLSyntaxError: on a grammar error or a wrong number of params.
        """
        type_name = self._match(TokenKind.Name)

        # Accept Python-like naming!
        if type_name == 'str':
            type_name = 'string'

        children = []
        if self.cur_token.kind == TokenKind.LBracket:
            self._advance()
            children.append(self._parse_type_expr())
            if self.cur_token.kind == TokenKind.Comma:
                self._advance()
                children.append(self._parse_type_expr())

            self._match(TokenKind.RBracket)

        if type_name in ('List', 'Optional'):
            if len(children) != 1:
                raise ASDLSyntaxError(
                    'Expected 1 type param to {}'.format(type_name),
                    self.cur_token.lineno)
        elif type_name == 'Dict':
            if len(children) != 2:
                raise ASDLSyntaxError(
                    'Expected 2 type params to {}'.format(type_name),
                    self.cur_token.lineno)
        else:
            if len(children) != 0:
                raise ASDLSyntaxError(
                    'Expected zero type params to {}'.format(type_name),
                    self.cur_token.lineno)

        if children:
            typ = ast.ParameterizedType(type_name, children)
        else:
            typ = ast.NamedType(type_name)

        if self.cur_token.kind == TokenKind.Asterisk:
            # string* is equivalent to List[string]
            typ = ast.ParameterizedType('List', [typ])
            self._advance()

        elif self.cur_token.kind == TokenKind.Question:
            # string? is equivalent to Optional[string]
            typ = ast.ParameterizedType('Optional', [typ])
            self._advance()

        return typ

    def _parse_fields(self):
        """
        fields_inner: type_expr NAME ( ',' type_expr NAME )* ','?

        fields      : '(' fields_inner? ')'

        A comma after a field is skipped when present, but it is not
        required: the loop continues as long as the next token is a NAME.

        Returns:
          A list of Field(type, name), possibly empty.
        """
        fields = []
        self._match(TokenKind.LParen)
        while self.cur_token.kind == TokenKind.Name:
            typ = self._parse_type_expr()
            field_name = self._match(TokenKind.Name)

            fields.append(Field(typ, field_name))

            if self.cur_token.kind == TokenKind.RParen:
                break
            elif self.cur_token.kind == TokenKind.Comma:
                self._advance()

        self._match(TokenKind.RParen)
        return fields

    def _parse_list(self):
        """
        list_inner: NAME ( ',' NAME )* ','?

        list      : '[' list_inner? ']'

        As with fields, a comma after a name is skipped when present but is
        not required.

        Returns:
          A list of the names as strings, possibly empty.
        """
        names = []
        self._match(TokenKind.LBracket)
        while self.cur_token.kind == TokenKind.Name:
            names.append(self._match(TokenKind.Name))

            if self.cur_token.kind == TokenKind.RBracket:
                break
            elif self.cur_token.kind == TokenKind.Comma:
                self._advance()

        self._match(TokenKind.RBracket)
        return names

    def _parse_optional_generate(self):
        """
        generate : 'generate' list

        Returns:
          The list of names if the current token is the 'generate' keyword,
          otherwise None (and nothing is consumed).
        """
        if self._at_keyword('generate'):
            self._advance()
            return self._parse_list()
        else:
            return None

    def _parse_product(self):
        """Product: fields

        Returns:
          Product wrapping the parsed field list.
        """
        return Product(self._parse_fields())

    def _advance(self):
        """Return current token's value; read next token into self.cur_token.

        Returns None when there is no current token yet, which is the case
        for the first call made by parse() on a new parser.  When the
        tokenizer is exhausted, cur_token is set to a sentinel of kind
        _EOF_KIND whose value is '<EOF>' and whose line number is that of the
        last token read (None if there was none).
        """
        prev = self.cur_token
        try:
            self.cur_token = next(self._tokenizer)
        except StopIteration:
            last_lineno = None if prev is None else prev.lineno
            self.cur_token = Token(self._EOF_KIND, '<EOF>', last_lineno)
        return None if prev is None else prev.value

    def _match(self, kind):
        """The 'match' primitive of RD parsers.

        * Verifies that the current token is of the given kind
        * Returns the value of the current token
        * Reads in the next token

        Args:
          kind: A TokenKind value

        Raises:
          ASDLSyntaxError: the current token is of a different kind, which
            includes having reached the end of the input.
        """
        if self.cur_token.kind == kind:
            value = self.cur_token.value
            self._advance()
            return value
        else:
            raise ASDLSyntaxError(
                'Expected token {}, got {}'.format(_TOKEN_STR[kind],
                                                   self.cur_token.value),
                self.cur_token.lineno)

    def _at_keyword(self, keyword):
        """Return True if the current token is the given keyword."""
        return (self.cur_token.kind == TokenKind.Keyword and
                self.cur_token.value == keyword)