ysh/expr_parse.py

OILS / ysh / expr_parse.py View on Github | oilshell.org

383 lines, 232 significant

1	"""expr_parse.py."""
2	from __future__ import print_function
3
4	from _devbuild.gen.syntax_asdl import (loc, Token, DoubleQuoted, SingleQuoted,
5	CommandSub, ShArrayLiteral,
6	CompoundWord, word_part_t, word_e)
7	from _devbuild.gen.id_kind_asdl import Id, Kind, Id_str
8	from _devbuild.gen.types_asdl import lex_mode_e
9
10	from display import ui
11	from core.error import p_die
12	from frontend import consts
13	from frontend import lexer
14	from frontend import reader
15	from mycpp import mylib
16	from mycpp.mylib import log, tagswitch
17	from osh import braces
18	from osh import word_
19	from osh import word_compile
20	from pgen2 import parse
21	from pgen2.pnode import PNodeAllocator
22
23	_ = log
24
25	from typing import TYPE_CHECKING, Any, Dict, Tuple, List, cast, Optional
26	if TYPE_CHECKING:
27	from frontend.lexer import Lexer
28	from frontend.parse_lib import ParseContext
29	from pgen2.grammar import Grammar
30	from pgen2.pnode import PNode
31
if mylib.PYTHON:

    class ParseTreePrinter(object):
        """Prints a tree of PNode instances.

        Only defined when mylib.PYTHON is true.
        """

        def __init__(self, names):
            # type: (Dict[int, str]) -> None
            """
            Args:
              names: maps a PNode's 'typ' to the name printed for it.
            """
            self.names = names
            self.f = mylib.Stdout()

        def _Print(self, pnode, indent, i):
            # type: (PNode, int, int) -> None
            """Write one line for pnode, then recurse into its children.

            Args:
              pnode: node to print
              indent: nesting depth; one space of indentation per level
              i: index of pnode among its siblings (0 for the root)
            """
            ind = ' ' * indent
            # NOTE:
            # - why isn't 'tok' None for PRODUCTIONS?  There is some redundancy
            #   to get rid of.
            if pnode.tok:
                if isinstance(pnode.tok, Token):
                    v = lexer.TokenVal(pnode.tok)
                else:
                    # e.g. CommandSub for x = $(echo hi)
                    v = repr(pnode.tok)
            else:
                v = '-'
            self.f.write('%s%d %s %s\n' % (ind, i, self.names[pnode.typ], v))

            if pnode.children is not None:
                # A distinct loop variable, so the 'i' parameter isn't
                # shadowed.
                for child_index, child in enumerate(pnode.children):
                    self._Print(child, indent + 1, child_index)

        def Print(self, pnode):
            # type: (PNode) -> None
            """Print the tree rooted at pnode, starting at depth 0."""
            self._Print(pnode, 0, 0)
65
66
def _Classify(gr, tok):
    # type: (Grammar, Token) -> int
    """Return the grammar label (ilabel) for a token.

    Args:
      gr: grammar whose 'tokens' dict maps a token Id to its label
      tok: the token to classify

    Returns:
      gr.tokens[tok.id] when the Id is known to the grammar.

    Otherwise p_die() is called with a message specific to the token; there
    is no return statement after it, so p_die is presumably a non-returning
    (raising) call.
    """
    # We have to match up what ParserGenerator.make_grammar() did when
    # calling make_label() and make_first().  See classify() in
    # opy/pgen2/driver.py.

    id_ = tok.id  # mycpp fix: we need C++ to do uint16_t -> int conversion

    # TODO: use something more efficient than a Dict
    if id_ in gr.tokens:
        return gr.tokens[id_]

    # Operators from other languages get a hint about the YSH spelling.
    if id_ == Id.Unknown_DEqual:
        p_die('Use === to be exact, or ~== to convert types', tok)
    if id_ == Id.Unknown_DAmp:
        p_die("Use 'and' in expression mode (OILS-ERR-15)", tok)
    if id_ == Id.Unknown_DPipe:
        p_die("Use 'or' in expression mode (OILS-ERR-15)", tok)
    # Not possible to check '!' as it conflicts with Id.Expr_Bang

    # Generic error; the Id is omitted for Id.Unknown_Tok.
    if id_ == Id.Unknown_Tok:
        type_str = ''
    else:
        type_str = ' (%s)' % ui.PrettyId(tok.id)
    p_die('Unexpected token in expression mode%s' % type_str, tok)
93
94
# Newlines are ignored between these pairs.
# Maps a token Id to the amount it adds to the nesting balance kept by
# _PushYshTokens: +1 for an opener, -1 for a closer.
# yapf: disable
_OTHER_BALANCE = {

    # Parenthesized expressions (tuples) and func/proc parameter lists
    Id.Op_LParen: 1,
    Id.Op_RParen: -1,
    Id.Op_LBracket: 1,
    Id.Op_RBracket: -1,

    # Dicts are {}, and the grammar respects Op_Newline.
}
# yapf: enable
108
109
def _PushYshTokens(parse_ctx, gr, p, lex):
    # type: (ParseContext, Grammar, parse.Parser, Lexer) -> Token
    """Push tokens onto pgen2's parser.

    Tokens are read from 'lex' in lex_mode_e.Expr and fed to 'p' until
    p.addtoken() returns true.  A token that opens a shell construct (array
    literal, command sub, double/single quoted string, ${}) is handed to a
    word or command parser made by 'parse_ctx'; the resulting node is pushed
    as a single Id.Expr_CastedDummy token.

    Args:
      parse_ctx: makes the nested word / command parsers
      gr: grammar used to look up token labels
      p: the pgen2 parser that receives tokens
      lex: lexer to read from

    Returns:
      The last token so it can be reused/seen by the CommandParser.
    """
    #log('keywords = %s', gr.keywords)
    #log('tokens = %s', gr.tokens)

    last_token = None  # type: Optional[Token]
    prev_was_newline = False

    balance = 0  # to ignore newlines

    while True:
        if last_token:  # e.g. left over from WordParser
            tok = last_token
            #log('last_token = %s', last_token)
            last_token = None
        else:
            tok = lex.Read(lex_mode_e.Expr)
            #log('tok = %s', tok)

        # Comments and whitespace.  Newlines aren't ignored.
        if consts.GetKind(tok.id) == Kind.Ignored:
            continue

        # For multiline lists, maps, etc.
        if tok.id == Id.Op_Newline:
            if balance > 0:
                #log('*** SKIPPING NEWLINE')
                continue
            # Eliminate duplicate newline tokens.  It makes the grammar
            # simpler, and it's consistent with CPython's lexer and our own
            # WordParser.
            if prev_was_newline:
                continue
            prev_was_newline = True
        else:
            prev_was_newline = False

        balance += _OTHER_BALANCE.get(tok.id, 0)
        #log('BALANCE after seeing %s = %d', tok.id, balance)

        if tok.id == Id.Op_LParen:
            # For nesting inside $()
            lex.PushHint(Id.Op_RParen, Id.Op_RParen)

        #if tok.id == Id.Expr_Name and tok.val in KEYWORDS:
        #    tok.id = KEYWORDS[tok.val]
        #    log('Replaced with %s', tok.id)

        assert tok.id < 256, Id_str(tok.id)

        ilabel = _Classify(gr, tok)
        #log('tok = %s, ilabel = %d', tok, ilabel)

        if p.addtoken(tok.id, tok, ilabel):
            return tok

        #
        # Mutually recursive calls into the command/word parsers.
        #

        if tok.id in (Id.Left_ColonPipe,
                      Id.Left_PercentParen):  # :\|  %(  LEGACY!
            left_tok = tok
            if tok.id == Id.Left_PercentParen:
                lex.PushHint(Id.Op_RParen, Id.Right_ShArrayLiteral)

            # Blame the opening token
            line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
            w_parser = parse_ctx.MakeWordParser(lex, line_reader)
            words = []  # type: List[CompoundWord]
            close_tok = None  # type: Optional[Token]
            done = False
            while not done:
                w = w_parser.ReadWord(lex_mode_e.ShCommand)
                with tagswitch(w) as case:
                    if case(word_e.Operator):
                        tok = cast(Token, w)
                        if tok.id == Id.Right_ShArrayLiteral:
                            # NOTE(review): this fires when ')' closes
                            # something that wasn't opened with %( -- the
                            # message text looks copied from the branch
                            # below; confirm the intended wording.
                            if left_tok.id != Id.Left_PercentParen:
                                p_die('Expected ) to close', left_tok)
                            close_tok = tok
                            done = True  # can't use break here
                        elif tok.id == Id.Op_Pipe:
                            if left_tok.id != Id.Left_ColonPipe:
                                p_die('Expected ) to close', left_tok)
                            close_tok = tok
                            done = True  # can't use break here
                        elif tok.id == Id.Op_Newline:  # internal newlines allowed
                            continue
                        else:
                            p_die('Unexpected token in array literal',
                                  loc.Word(w))

                    elif case(word_e.Compound):
                        words.append(cast(CompoundWord, w))

                    else:
                        raise AssertionError()

            words2 = braces.BraceDetectAll(words)
            words3 = word_.TildeDetectAll(words2)

            typ = Id.Expr_CastedDummy

            lit_part = ShArrayLiteral(left_tok, words3, close_tok)
            opaque = cast(Token, lit_part)  # HACK for expr_to_ast
            done = p.addtoken(typ, opaque, gr.tokens[typ])
            assert not done  # can't end the expression

            # Now push the closing )
            # 'tok' is the same object as close_tok here: the loop above only
            # finishes right after assigning close_tok = tok.
            ilabel = _Classify(gr, close_tok)
            done = p.addtoken(tok.id, close_tok, ilabel)
            assert not done  # can't end the expression

            continue

        # $(  @(  ^(
        if tok.id in (Id.Left_DollarParen, Id.Left_AtParen,
                      Id.Left_CaretParen):

            left_token = tok

            lex.PushHint(Id.Op_RParen, Id.Eof_RParen)
            line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
            c_parser = parse_ctx.MakeParserForCommandSub(
                line_reader, lex, Id.Eof_RParen)
            node = c_parser.ParseCommandSub()
            # A little gross: Copied from osh/word_parse.py
            right_token = c_parser.w_parser.cur_token

            cs_part = CommandSub(left_token, node, right_token)

            typ = Id.Expr_CastedDummy
            opaque = cast(Token, cs_part)  # HACK for expr_to_ast
            done = p.addtoken(typ, opaque, gr.tokens[typ])
            assert not done  # can't end the expression

            # Now push the closing )
            ilabel = _Classify(gr, right_token)
            done = p.addtoken(right_token.id, right_token, ilabel)
            assert not done  # can't end the expression

            continue

        # "   $"   """   $"""   ^"
        if tok.id in (Id.Left_DoubleQuote, Id.Left_DollarDoubleQuote,
                      Id.Left_TDoubleQuote, Id.Left_DollarTDoubleQuote,
                      Id.Left_CaretDoubleQuote):

            left_token = tok
            line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
            w_parser = parse_ctx.MakeWordParser(lex, line_reader)

            parts = []  # type: List[word_part_t]
            # last_token is picked up at the top of the outer loop instead of
            # reading a new token from the lexer.
            last_token = w_parser.ReadDoubleQuoted(left_token, parts)
            expr_dq_part = DoubleQuoted(left_token, parts, last_token)

            typ = Id.Expr_CastedDummy
            opaque = cast(Token, expr_dq_part)  # HACK for expr_to_ast
            done = p.addtoken(typ, opaque, gr.tokens[typ])
            assert not done  # can't end the expression

            continue

        # ${
        if tok.id == Id.Left_DollarBrace:
            left_token = tok
            line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
            w_parser = parse_ctx.MakeWordParser(lex, line_reader)

            part, last_token = w_parser.ReadBracedVarSub(left_token)

            # It's casted word_part.BracedVarSub -> dummy -> expr.BracedVarSub!
            typ = Id.Expr_CastedDummy
            opaque = cast(Token, part)  # HACK for expr_to_ast
            done = p.addtoken(typ, opaque, gr.tokens[typ])
            assert not done  # can't end the expression

            continue

        # 'x'  '''x'''
        # r'x' r'''x'''
        # u'x' u'''x'''
        # b'x' b'''x'''
        # $'x'
        if tok.id in (Id.Left_SingleQuote, Id.Left_TSingleQuote,
                      Id.Left_RSingleQuote, Id.Left_RTSingleQuote,
                      Id.Left_USingleQuote, Id.Left_UTSingleQuote,
                      Id.Left_BSingleQuote, Id.Left_BTSingleQuote,
                      Id.Left_DollarSingleQuote):
            # Choose the lexer mode for the string body from its opening
            # token.
            if tok.id == Id.Left_DollarSingleQuote:
                sq_mode = lex_mode_e.SQ_C
            elif tok.id in (Id.Left_USingleQuote, Id.Left_UTSingleQuote,
                            Id.Left_BSingleQuote, Id.Left_BTSingleQuote):
                sq_mode = lex_mode_e.J8_Str
            else:
                sq_mode = lex_mode_e.SQ_Raw

            left_token = tok
            line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
            w_parser = parse_ctx.MakeWordParser(lex, line_reader)

            tokens = []  # type: List[Token]
            last_token = w_parser.ReadSingleQuoted(sq_mode, left_token, tokens,
                                                   True)

            sval = word_compile.EvalSingleQuoted(left_token.id, tokens)
            sq_part = SingleQuoted(left_token, sval, last_token)

            typ = Id.Expr_CastedDummy
            opaque = cast(Token, sq_part)  # HACK for expr_to_ast
            done = p.addtoken(typ, opaque, gr.tokens[typ])
            assert not done  # can't end the expression
            continue

    else:
        # We never broke out -- EOF is too soon (how can this happen???)
        raise parse.ParseError("incomplete input", tok.id, tok)
331
332
class ExprParser(object):
    """A wrapper around a pgen2 parser."""

    def __init__(self, parse_ctx, gr):
        # type: (ParseContext, Grammar) -> None
        """
        Args:
          parse_ctx: passed through to _PushYshTokens on each Parse()
          gr: grammar for the pgen2 parser and for token classification
        """
        self.parse_ctx = parse_ctx
        self.gr = gr
        # Reused multiple times.
        self.push_parser = parse.Parser(gr)
        # None until ctx_PNodeAllocator installs an allocator; passed to
        # push_parser.setup() on each Parse().
        self.pnode_alloc = None  # type: Optional[PNodeAllocator]

    def Parse(self, lexer, start_symbol):
        # type: (Lexer, int) -> Tuple[PNode, Token]
        """Parse one expression from the lexer.

        Args:
          lexer: token source
          start_symbol: grammar symbol to start parsing from

        Returns:
          (root PNode of the parse tree, last token pushed to the parser)

        A parse.ParseError from the pgen2 parser is reported through p_die(),
        blaming the token stored on the error.
        """
        # Reuse the parser
        self.push_parser.setup(start_symbol, self.pnode_alloc)
        try:
            last_token = _PushYshTokens(self.parse_ctx, self.gr,
                                        self.push_parser, lexer)
        except parse.ParseError as e:
            #log('ERROR %s', e)
            # TODO:
            # - Describe what lexer mode we're in (Invalid syntax in regex)
            #   - Maybe say where the mode started
            # - Id.Unknown_Tok could say "This character is invalid"

            # ParseError has a "too much input" case but I haven't been able
            # to tickle it.  Maybe it's because of the Eof tokens?

            p_die(
                'Syntax error in expression (near %s)' % ui.PrettyId(e.tok.id),
                e.tok)

        return self.push_parser.rootnode, last_token
367
368
class ctx_PNodeAllocator(object):
    """Context manager that lends an ExprParser a PNodeAllocator.

    The allocator is created and attached in the constructor (not in
    __enter__).  On exit it is cleared and detached again.
    """

    def __init__(self, ep):
        # type: (ExprParser) -> None
        """
        Args:
          ep: the ExprParser whose pnode_alloc is set for the 'with' block
        """
        self.expr_parser = ep
        self.expr_parser.pnode_alloc = PNodeAllocator()

    def __enter__(self):
        # type: () -> None
        # Nothing to do: the allocator was already installed by __init__.
        pass

    def __exit__(self, type, value, traceback):
        # type: (Any, Any, Any) -> None
        # Clear the allocator, then detach it so the parser doesn't keep
        # using it.  Returns None, so exceptions from the block propagate.
        self.expr_parser.pnode_alloc.Clear()
        self.expr_parser.pnode_alloc = None