ysh/expr_parse.py

OILS / ysh / expr_parse.py View on Github | oilshell.org

389 lines, 236 significant

1	"""expr_parse.py."""
2	from __future__ import print_function
3
4	from _devbuild.gen.syntax_asdl import (loc, Token, DoubleQuoted, SingleQuoted,
5	CommandSub, ShArrayLiteral,
6	CompoundWord, word_part_t, word_e)
7	from _devbuild.gen.id_kind_asdl import Id, Kind, Id_str
8	from _devbuild.gen.types_asdl import lex_mode_e
9
10	from display import ui
11	from core.error import p_die
12	from frontend import consts
13	from frontend import lexer
14	from frontend import reader
15	from mycpp import mylib
16	from mycpp.mylib import log, tagswitch
17	from osh import braces
18	from osh import word_
19	from osh import word_compile
20	from pgen2 import parse
21	from pgen2.pnode import PNodeAllocator
22
23	_ = log
24
25	from typing import TYPE_CHECKING, Any, Dict, Tuple, List, cast, Optional
26	if TYPE_CHECKING:
27	from frontend.lexer import Lexer
28	from frontend.parse_lib import ParseContext
29	from pgen2.grammar import Grammar
30	from pgen2.pnode import PNode
31
if mylib.PYTHON:

    class ParseTreePrinter(object):
        """Debugging aid: writes a PNode tree as indented text, one node per line."""

        def __init__(self, names):
            # type: (Dict[int, str]) -> None
            # Maps a node's 'typ' number to the name shown in the dump.
            self.names = names
            self.f = mylib.Stdout()

        def _Print(self, pnode, indent, i):
            # type: (PNode, int, int) -> None
            """Write one line for pnode, then recurse into its children.

            Args:
              pnode: node to print
              indent: nesting depth; one leading space is written per level
              i: position of this node among its siblings
            """
            # NOTE:
            # - why isn't 'tok' None for PRODUCTIONS?  There is some redundancy
            #   to get rid of.
            tok = pnode.tok
            if not tok:
                val = '-'
            elif isinstance(tok, Token):
                val = lexer.TokenVal(tok)
            else:
                # e.g. CommandSub for x = $(echo hi)
                val = repr(tok)

            line = '%s%d %s %s\n' % (' ' * indent, i, self.names[pnode.typ],
                                     val)
            self.f.write(line)

            kids = pnode.children
            if kids is None:
                return
            for child_index, child in enumerate(kids):
                self._Print(child, indent + 1, child_index)

        def Print(self, pnode):
            # type: (PNode) -> None
            """Print the whole tree rooted at pnode, starting at depth 0."""
            self._Print(pnode, 0, 0)
65
66
def _Classify(gr, tok):
    # type: (Grammar, Token) -> int
    """Return the grammar label for tok, or die with a targeted message.

    The lookup has to match up with what ParserGenerator.make_grammar() did
    when calling make_label() and make_first().  See classify() in
    opy/pgen2/driver.py.

    Args:
      gr: grammar whose 'tokens' table maps token ids to labels
      tok: the token to classify; also the location blamed on failure

    Returns:
      gr.tokens[tok.id] when the id is present in the table.  Otherwise
      p_die() is called, which is expected not to return.
    """
    id_ = tok.id  # mycpp fix: we need C++ to do uint16_t -> int conversion

    # TODO: use something more efficient than a Dict
    if id_ in gr.tokens:
        return gr.tokens[id_]

    # The token isn't part of the grammar.  Give a hint for operators that
    # people are likely to type out of habit from other languages.
    # Not possible to check '!' as it conflicts with Id.Expr_Bang
    if id_ == Id.Unknown_DEqual:
        msg = 'Use === to be exact, or ~== to convert types'
    elif id_ == Id.Unknown_DAmp:
        msg = "Use 'and' in expression mode (OILS-ERR-15)"
    elif id_ == Id.Unknown_DPipe:
        msg = "Use 'or' in expression mode (OILS-ERR-15)"
    elif id_ == Id.Unknown_DDot:
        msg = 'Use ..< for half-open range, or ..= for closed range (OILS-ERR-16)'
    else:
        # Generic failure; the token type is named unless it's Unknown_Tok.
        if id_ == Id.Unknown_Tok:
            suffix = ''
        else:
            suffix = ' (%s)' % ui.PrettyId(tok.id)
        msg = 'Unexpected token in expression mode%s' % suffix

    p_die(msg, tok)
99
100
# Newlines are ignored between these pairs.
#
# Maps a bracket token id to the amount it changes the nesting depth:
# +1 for an opener, -1 for a closer.  _PushYshTokens() adds these values to a
# running balance and skips Op_Newline tokens while that balance is positive.
# yapf: disable
_OTHER_BALANCE = {

    # Parenthesized expressions (tuples) and func/proc parameter lists
    Id.Op_LParen: 1,
    Id.Op_RParen: -1,
    Id.Op_LBracket: 1,
    Id.Op_RBracket: -1,

    # Dicts are {}, and the grammar respects Op_Newline.
    # So the brace tokens are not listed in this table.
}
# yapf: enable
114
115
def _PushYshTokens(parse_ctx, gr, p, lex):
    # type: (ParseContext, Grammar, parse.Parser, Lexer) -> Token
    """Push tokens onto pgen2's parser.

    Tokens are read from 'lex' in lex_mode_e.Expr and fed to p.addtoken() until
    it reports that the parse is complete.  Along the way:

    - Tokens of Kind.Ignored (comments and whitespace) are dropped.
    - Op_Newline is dropped while inside the brackets listed in _OTHER_BALANCE,
      and runs of consecutive newlines are collapsed into one.
    - Tokens that open an array literal, command sub, double quoted string,
      ${} or single quoted string are handed to the word / command parsers.
      The resulting node is pushed as a single Id.Expr_CastedDummy token, with
      the node itself cast to Token so that expr_to_ast can recover it.

    Args:
      parse_ctx: used for its arena and to create word / command parsers
      gr: the pgen2 grammar; gr.tokens maps token ids to labels
      p: the pgen2 push parser that receives the tokens
      lex: the lexer to read from; it's shared with the sub-parsers

    Returns:
      The last token so it can be reused/seen by the CommandParser.

    Raises:
      parse.ParseError: raised explicitly only in the while/else clause at the
        bottom.  ExprParser.Parse() also catches it around this call, so
        presumably p.addtoken() can raise it too.
      Other failures are reported with p_die(), here and in _Classify().
    """
    #log('keywords = %s', gr.keywords)
    #log('tokens = %s', gr.tokens)

    # Set by the quoted-string and ${} branches below: the token handed back by
    # the word parser, which is processed on the next iteration instead of
    # reading a new one from the lexer.
    last_token = None  # type: Optional[Token]
    prev_was_newline = False

    balance = 0  # to ignore newlines

    while True:
        if last_token:  # e.g. left over from WordParser
            tok = last_token
            #log('last_token = %s', last_token)
            last_token = None
        else:
            tok = lex.Read(lex_mode_e.Expr)
            #log('tok = %s', tok)

        # Comments and whitespace.  Newlines aren't ignored.
        if consts.GetKind(tok.id) == Kind.Ignored:
            continue

        # For multiline lists, maps, etc.
        if tok.id == Id.Op_Newline:
            if balance > 0:
                #log('*** SKIPPING NEWLINE')
                continue
            # Eliminate duplicate newline tokens.  It makes the grammar simpler,
            # and it's consistent with CPython's lexer and our own WordParser.
            if prev_was_newline:
                continue
            prev_was_newline = True
        else:
            prev_was_newline = False

        balance += _OTHER_BALANCE.get(tok.id, 0)
        #log('BALANCE after seeing %s = %d', tok.id, balance)

        if tok.id == Id.Op_LParen:
            # For nesting inside $()
            lex.PushHint(Id.Op_RParen, Id.Op_RParen)

        #if tok.id == Id.Expr_Name and tok.val in KEYWORDS:
        #  tok.id = KEYWORDS[tok.val]
        #  log('Replaced with %s', tok.id)

        assert tok.id < 256, Id_str(tok.id)

        ilabel = _Classify(gr, tok)
        #log('tok = %s, ilabel = %d', tok, ilabel)

        if p.addtoken(tok.id, tok, ilabel):
            return tok

        #
        # Mutually recursive calls into the command/word parsers.
        #

        if tok.id in (Id.Left_ColonPipe,
                      Id.Left_PercentParen):  # :\| %(  LEGACY!
            left_tok = tok
            if tok.id == Id.Left_PercentParen:
                lex.PushHint(Id.Op_RParen, Id.Right_ShArrayLiteral)

            # Blame the opening token
            line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
            w_parser = parse_ctx.MakeWordParser(lex, line_reader)
            words = []  # type: List[CompoundWord]
            close_tok = None  # type: Optional[Token]
            done = False
            while not done:
                w = w_parser.ReadWord(lex_mode_e.ShCommand)
                with tagswitch(w) as case:
                    if case(word_e.Operator):
                        # A separate local, so the outer 'tok' keeps referring
                        # to the opening token.
                        op_tok = cast(Token, w)
                        if op_tok.id == Id.Right_ShArrayLiteral:
                            # Got ')', which only closes '%('
                            if left_tok.id != Id.Left_PercentParen:
                                p_die('Expected | to close', left_tok)
                            close_tok = op_tok
                            done = True  # can't use break here
                        elif op_tok.id == Id.Op_Pipe:
                            # Got '|', which only closes ':|'
                            if left_tok.id != Id.Left_ColonPipe:
                                p_die('Expected ) to close', left_tok)
                            close_tok = op_tok
                            done = True  # can't use break here
                        elif op_tok.id == Id.Op_Newline:  # internal newlines allowed
                            continue
                        else:
                            p_die('Unexpected token in array literal',
                                  loc.Word(w))

                    elif case(word_e.Compound):
                        words.append(cast(CompoundWord, w))

                    else:
                        raise AssertionError()

            words2 = braces.BraceDetectAll(words)
            words3 = word_.TildeDetectAll(words2)

            typ = Id.Expr_CastedDummy

            lit_part = ShArrayLiteral(left_tok, words3, close_tok)
            opaque = cast(Token, lit_part)  # HACK for expr_to_ast
            done = p.addtoken(typ, opaque, gr.tokens[typ])
            assert not done  # can't end the expression

            # Now push the closing ) or |
            ilabel = _Classify(gr, close_tok)
            done = p.addtoken(close_tok.id, close_tok, ilabel)
            assert not done  # can't end the expression

            continue

        # $(  @(  ^(
        if tok.id in (Id.Left_DollarParen, Id.Left_AtParen,
                      Id.Left_CaretParen):

            left_token = tok

            lex.PushHint(Id.Op_RParen, Id.Eof_RParen)
            line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
            c_parser = parse_ctx.MakeParserForCommandSub(
                line_reader, lex, Id.Eof_RParen)
            node = c_parser.ParseCommandSub()
            # A little gross: Copied from osh/word_parse.py
            right_token = c_parser.w_parser.cur_token

            cs_part = CommandSub(left_token, node, right_token)

            typ = Id.Expr_CastedDummy
            opaque = cast(Token, cs_part)  # HACK for expr_to_ast
            done = p.addtoken(typ, opaque, gr.tokens[typ])
            assert not done  # can't end the expression

            # Now push the closing )
            ilabel = _Classify(gr, right_token)
            done = p.addtoken(right_token.id, right_token, ilabel)
            assert not done  # can't end the expression

            continue

        # "   $"   """   $"""   ^"
        if tok.id in (Id.Left_DoubleQuote, Id.Left_DollarDoubleQuote,
                      Id.Left_TDoubleQuote, Id.Left_DollarTDoubleQuote,
                      Id.Left_CaretDoubleQuote):

            left_token = tok
            line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
            w_parser = parse_ctx.MakeWordParser(lex, line_reader)

            parts = []  # type: List[word_part_t]
            last_token = w_parser.ReadDoubleQuoted(left_token, parts)
            expr_dq_part = DoubleQuoted(left_token, parts, last_token)

            typ = Id.Expr_CastedDummy
            opaque = cast(Token, expr_dq_part)  # HACK for expr_to_ast
            done = p.addtoken(typ, opaque, gr.tokens[typ])
            assert not done  # can't end the expression

            continue

        # ${
        if tok.id == Id.Left_DollarBrace:
            left_token = tok
            line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
            w_parser = parse_ctx.MakeWordParser(lex, line_reader)

            part, last_token = w_parser.ReadBracedVarSub(left_token)

            # It's casted word_part.BracedVarSub -> dummy -> expr.BracedVarSub!
            typ = Id.Expr_CastedDummy
            opaque = cast(Token, part)  # HACK for expr_to_ast
            done = p.addtoken(typ, opaque, gr.tokens[typ])
            assert not done  # can't end the expression

            continue

        # 'x'  '''x'''
        # r'x' r'''x'''
        # u'x' u'''x'''
        # b'x' b'''x'''
        # $'x'
        if tok.id in (Id.Left_SingleQuote, Id.Left_TSingleQuote,
                      Id.Left_RSingleQuote, Id.Left_RTSingleQuote,
                      Id.Left_USingleQuote, Id.Left_UTSingleQuote,
                      Id.Left_BSingleQuote, Id.Left_BTSingleQuote,
                      Id.Left_DollarSingleQuote):
            # The kind of opening quote selects the lexer mode for the body.
            if tok.id == Id.Left_DollarSingleQuote:
                sq_mode = lex_mode_e.SQ_C
            elif tok.id in (Id.Left_USingleQuote, Id.Left_UTSingleQuote,
                            Id.Left_BSingleQuote, Id.Left_BTSingleQuote):
                sq_mode = lex_mode_e.J8_Str
            else:
                sq_mode = lex_mode_e.SQ_Raw

            left_token = tok
            line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
            w_parser = parse_ctx.MakeWordParser(lex, line_reader)

            tokens = []  # type: List[Token]
            last_token = w_parser.ReadSingleQuoted(sq_mode, left_token, tokens,
                                                   True)

            sval = word_compile.EvalSingleQuoted(left_token.id, tokens)
            sq_part = SingleQuoted(left_token, sval, last_token)

            typ = Id.Expr_CastedDummy
            opaque = cast(Token, sq_part)  # HACK for expr_to_ast
            done = p.addtoken(typ, opaque, gr.tokens[typ])
            assert not done  # can't end the expression
            continue

    else:
        # This is the 'else' of the 'while True' loop above.
        # We never broke out -- EOF is too soon (how can this happen???)
        raise parse.ParseError("incomplete input", tok.id, tok)
337
338
class ExprParser(object):
    """Parses YSH expressions by driving a reusable pgen2 push parser."""

    def __init__(self, parse_ctx, gr):
        # type: (ParseContext, Grammar) -> None
        self.parse_ctx = parse_ctx
        self.gr = gr
        # One Parser instance serves every Parse() call; setup() resets it.
        self.push_parser = parse.Parser(gr)
        # Installed and removed by ctx_PNodeAllocator.
        self.pnode_alloc = None  # type: Optional[PNodeAllocator]

    def Parse(self, lexer, start_symbol):
        # type: (Lexer, int) -> Tuple[PNode, Token]
        """Parse one expression starting from the given grammar symbol.

        Args:
          lexer: where tokens are read from
          start_symbol: grammar symbol number to start parsing at

        Returns:
          (root of the parse tree, last token that was pushed)

        A parse.ParseError is reported through p_die(), blaming the token
        attached to the error.
        """
        parser = self.push_parser
        parser.setup(start_symbol, self.pnode_alloc)

        try:
            last_token = _PushYshTokens(self.parse_ctx, self.gr, parser, lexer)
        except parse.ParseError as e:
            #log('ERROR %s', e)
            # TODO:
            # - Describe what lexer mode we're in (Invalid syntax in regex)
            #   - Maybe say where the mode started
            # - Id.Unknown_Tok could say "This character is invalid"

            # ParseError has a "too much input" case but I haven't been able
            # to tickle it.  Maybe it's because of the Eof tokens?
            bad_tok = e.tok
            near = ui.PrettyId(bad_tok.id)
            p_die('Syntax error in expression (near %s)' % near, bad_tok)

        return parser.rootnode, last_token
373
374
class ctx_PNodeAllocator(object):
    """Context manager that gives an ExprParser a PNodeAllocator for a while.

    The allocator is created and attached on construction.  On exit it's
    cleared and detached, so the parser's pnode_alloc goes back to None.
    """

    def __init__(self, ep):
        # type: (ExprParser) -> None
        ep.pnode_alloc = PNodeAllocator()
        self.expr_parser = ep

    def __enter__(self):
        # type: () -> None
        # Nothing to do: the allocator was already attached in __init__.
        pass

    def __exit__(self, type, value, traceback):
        # type: (Any, Any, Any) -> None
        ep = self.expr_parser
        ep.pnode_alloc.Clear()
        ep.pnode_alloc = None