ysh/expr_parse.py

OILS / ysh / expr_parse.py View on Github | oilshell.org

387 lines, 234 significant

1	"""expr_parse.py."""
2	from __future__ import print_function
3
4	from _devbuild.gen.syntax_asdl import (loc, Token, DoubleQuoted, SingleQuoted,
5	CommandSub, ShArrayLiteral,
6	CompoundWord, word_part_t, word_e)
7	from _devbuild.gen.id_kind_asdl import Id, Kind, Id_str
8	from _devbuild.gen.types_asdl import lex_mode_e
9
10	from display import ui
11	from core.error import p_die
12	from frontend import consts
13	from frontend import lexer
14	from frontend import reader
15	from mycpp import mylib
16	from mycpp.mylib import log, tagswitch
17	from osh import braces
18	from osh import word_
19	from osh import word_compile
20	from pgen2 import parse
21	from pgen2.pnode import PNodeAllocator
22
23	_ = log
24
25	from typing import TYPE_CHECKING, Any, Dict, Tuple, List, cast, Optional
26	if TYPE_CHECKING:
27	from frontend.lexer import Lexer
28	from frontend.parse_lib import ParseContext
29	from pgen2.grammar import Grammar
30	from pgen2.pnode import PNode
31
if mylib.PYTHON:

    class ParseTreePrinter(object):
        """Debugging aid: dumps a tree of PNode instances as indented text.

        Only defined when mylib.PYTHON is true.
        """

        def __init__(self, names):
            # type: (Dict[int, str]) -> None
            # Maps pnode.typ to the label printed for that node.
            self.names = names
            # All output goes here.
            self.f = mylib.Stdout()

        def _Print(self, pnode, indent, i):
            # type: (PNode, int, int) -> None
            """Write one line for pnode, then recurse into its children.

            Args:
              pnode: the node to print
              indent: nesting depth; one space is emitted per level
              i: position of pnode among its siblings
            """
            # NOTE:
            # - why isn't 'tok' None for PRODUCTIONS?  There is some
            #   redundancy to get rid of.
            tok = pnode.tok
            if not tok:
                v = '-'
            elif isinstance(tok, Token):
                v = lexer.TokenVal(tok)
            else:
                # e.g. CommandSub for x = $(echo hi)
                v = repr(tok)

            prefix = ' ' * indent
            self.f.write('%s%d %s %s\n' %
                         (prefix, i, self.names[pnode.typ], v))

            if pnode.children is None:
                return
            for child_index, child in enumerate(pnode.children):
                self._Print(child, indent + 1, child_index)

        def Print(self, pnode):
            # type: (PNode) -> None
            """Print the whole tree rooted at pnode, starting at depth 0."""
            self._Print(pnode, 0, 0)
65
66
67	def _Classify(gr, tok):
68	# type: (Grammar, Token) -> int
69
70	# We have to match up what ParserGenerator.make_grammar() did when
71	# calling make_label() and make_first(). See classify() in
72	# opy/pgen2/driver.py.
73
74	id_ = tok.id # mycpp fix: we need C++ to do uint16_t -> int conversion
75
76	# TODO: use something more efficient than a Dict
77	if id_ in gr.tokens:
78	return gr.tokens[id_]
79
80	if id_ == Id.Unknown_DEqual:
81	p_die('Use === to be exact, or ~== to convert types', tok)
82
83	if id_ == Id.Unknown_DAmp:
84	p_die("Use 'and' in expression mode (OILS-ERR-15)", tok)
85	if id_ == Id.Unknown_DPipe:
86	p_die("Use 'or' in expression mode (OILS-ERR-15)", tok)
87	# Not possible to check '!' as it conflicts with Id.Expr_Bang
88
89	if id_ == Id.Unknown_DDot:
90	p_die('Use 1..<5 for half-open range, or 1..=5 for closed range (OILS-ERR-16)', tok)
91
92	if id_ == Id.Unknown_Tok:
93	type_str = ''
94	else:
95	type_str = ' (%s)' % ui.PrettyId(tok.id)
96	p_die('Unexpected token in expression mode%s' % type_str, tok)
97
98
# Nesting delta per token: +1 for an opener, -1 for its closer.
# _PushYshTokens sums these deltas and ignores newlines while the sum is > 0.
# yapf: disable
_OTHER_BALANCE = {

    # Parens: parenthesized expressions (tuples) and func/proc parameter lists
    Id.Op_LParen: 1,
    Id.Op_RParen: -1,

    # Brackets
    Id.Op_LBracket: 1,
    Id.Op_RBracket: -1,

    # No entry for braces: dicts are {}, and the grammar respects Op_Newline.
}
# yapf: enable
112
113
def _PushYshTokens(parse_ctx, gr, p, lex):
    # type: (ParseContext, Grammar, parse.Parser, Lexer) -> Token
    """Push tokens onto pgen2's parser.

    Tokens are read from `lex` in lex_mode_e.Expr and fed to `p` until
    p.addtoken() returns true.  Constructs that start with a "left" token
    (array literals, command subs, quoted strings, ${}) are read by the
    word/command parsers instead; each result is cast to Token and pushed as
    a single Id.Expr_CastedDummy token.

    Args:
      parse_ctx: supplies the arena and creates the word/command parsers
      gr: grammar; gr.tokens maps token Ids to pgen2 labels
      p: the pgen2 push parser being fed
      lex: lexer, shared with the sub-parsers created here

    Returns:
      The last token so it can be reused/seen by the CommandParser.
    """
    #log('keywords = %s', gr.keywords)
    #log('tokens = %s', gr.tokens)

    # Set by the quoted-string and ${ branches below to the token their
    # sub-parser returned; the next iteration pushes it instead of reading
    # from the lexer.
    last_token = None  # type: Optional[Token]
    prev_was_newline = False

    balance = 0  # to ignore newlines

    while True:
        if last_token:  # e.g. left over from WordParser
            tok = last_token
            #log('last_token = %s', last_token)
            last_token = None
        else:
            tok = lex.Read(lex_mode_e.Expr)
            #log('tok = %s', tok)

        # Comments and whitespace.  Newlines aren't ignored.
        if consts.GetKind(tok.id) == Kind.Ignored:
            continue

        # For multiline lists, maps, etc.
        if tok.id == Id.Op_Newline:
            if balance > 0:
                #log('*** SKIPPING NEWLINE')
                continue
            # Eliminate duplicate newline tokens.  It makes the grammar
            # simpler, and it's consistent with CPython's lexer and our own
            # WordParser.
            if prev_was_newline:
                continue
            prev_was_newline = True
        else:
            prev_was_newline = False

        balance += _OTHER_BALANCE.get(tok.id, 0)
        #log('BALANCE after seeing %s = %d', tok.id, balance)

        if tok.id == Id.Op_LParen:
            # For nesting inside $()
            lex.PushHint(Id.Op_RParen, Id.Op_RParen)

        #if tok.id == Id.Expr_Name and tok.val in KEYWORDS:
        #    tok.id = KEYWORDS[tok.val]
        #    log('Replaced with %s', tok.id)

        assert tok.id < 256, Id_str(tok.id)

        ilabel = _Classify(gr, tok)
        #log('tok = %s, ilabel = %d', tok, ilabel)

        if p.addtoken(tok.id, tok, ilabel):
            return tok

        #
        # Mutually recursive calls into the command/word parsers.
        #

        if tok.id in (Id.Left_ColonPipe,
                      Id.Left_PercentParen):  # :|  %(  LEGACY!
            left_tok = tok
            if tok.id == Id.Left_PercentParen:
                lex.PushHint(Id.Op_RParen, Id.Right_ShArrayLiteral)

            # Blame the opening token
            line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
            w_parser = parse_ctx.MakeWordParser(lex, line_reader)
            words = []  # type: List[CompoundWord]
            close_tok = None  # type: Optional[Token]
            done = False
            while not done:
                w = w_parser.ReadWord(lex_mode_e.ShCommand)
                with tagswitch(w) as case:
                    if case(word_e.Operator):
                        op_tok = cast(Token, w)
                        if op_tok.id == Id.Right_ShArrayLiteral:
                            if left_tok.id != Id.Left_PercentParen:
                                # Opened with :| but closed with )
                                p_die('Expected | to close', left_tok)
                            close_tok = op_tok
                            done = True  # can't use break here
                        elif op_tok.id == Id.Op_Pipe:
                            if left_tok.id != Id.Left_ColonPipe:
                                # Opened with %( but closed with |
                                p_die('Expected ) to close', left_tok)
                            close_tok = op_tok
                            done = True  # can't use break here
                        elif op_tok.id == Id.Op_Newline:  # internal newlines allowed
                            continue
                        else:
                            p_die('Unexpected token in array literal',
                                  loc.Word(w))

                    elif case(word_e.Compound):
                        words.append(cast(CompoundWord, w))

                    else:
                        raise AssertionError()

            # The loop above only sets done after assigning close_tok.
            assert close_tok is not None

            words2 = braces.BraceDetectAll(words)
            words3 = word_.TildeDetectAll(words2)

            typ = Id.Expr_CastedDummy

            lit_part = ShArrayLiteral(left_tok, words3, close_tok)
            opaque = cast(Token, lit_part)  # HACK for expr_to_ast
            done = p.addtoken(typ, opaque, gr.tokens[typ])
            assert not done  # can't end the expression

            # Now push the closing ) or |
            ilabel = _Classify(gr, close_tok)
            done = p.addtoken(close_tok.id, close_tok, ilabel)
            assert not done  # can't end the expression

            continue

        # $(  @(  ^(
        if tok.id in (Id.Left_DollarParen, Id.Left_AtParen,
                      Id.Left_CaretParen):

            left_token = tok

            lex.PushHint(Id.Op_RParen, Id.Eof_RParen)
            line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
            c_parser = parse_ctx.MakeParserForCommandSub(
                line_reader, lex, Id.Eof_RParen)
            node = c_parser.ParseCommandSub()
            # A little gross: Copied from osh/word_parse.py
            right_token = c_parser.w_parser.cur_token

            cs_part = CommandSub(left_token, node, right_token)

            typ = Id.Expr_CastedDummy
            opaque = cast(Token, cs_part)  # HACK for expr_to_ast
            done = p.addtoken(typ, opaque, gr.tokens[typ])
            assert not done  # can't end the expression

            # Now push the closing )
            ilabel = _Classify(gr, right_token)
            done = p.addtoken(right_token.id, right_token, ilabel)
            assert not done  # can't end the expression

            continue

        # "   $"   """   $"""   ^"
        if tok.id in (Id.Left_DoubleQuote, Id.Left_DollarDoubleQuote,
                      Id.Left_TDoubleQuote, Id.Left_DollarTDoubleQuote,
                      Id.Left_CaretDoubleQuote):

            left_token = tok
            line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
            w_parser = parse_ctx.MakeWordParser(lex, line_reader)

            parts = []  # type: List[word_part_t]
            # last_token is pushed by the next loop iteration.
            last_token = w_parser.ReadDoubleQuoted(left_token, parts)
            expr_dq_part = DoubleQuoted(left_token, parts, last_token)

            typ = Id.Expr_CastedDummy
            opaque = cast(Token, expr_dq_part)  # HACK for expr_to_ast
            done = p.addtoken(typ, opaque, gr.tokens[typ])
            assert not done  # can't end the expression

            continue

        # ${
        if tok.id == Id.Left_DollarBrace:
            left_token = tok
            line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
            w_parser = parse_ctx.MakeWordParser(lex, line_reader)

            # last_token is pushed by the next loop iteration.
            part, last_token = w_parser.ReadBracedVarSub(left_token)

            # It's casted word_part.BracedVarSub -> dummy -> expr.BracedVarSub!
            typ = Id.Expr_CastedDummy
            opaque = cast(Token, part)  # HACK for expr_to_ast
            done = p.addtoken(typ, opaque, gr.tokens[typ])
            assert not done  # can't end the expression

            continue

        # 'x'  '''x'''
        # r'x' r'''x'''
        # u'x' u'''x'''
        # b'x' b'''x'''
        # $'x'
        if tok.id in (Id.Left_SingleQuote, Id.Left_TSingleQuote,
                      Id.Left_RSingleQuote, Id.Left_RTSingleQuote,
                      Id.Left_USingleQuote, Id.Left_UTSingleQuote,
                      Id.Left_BSingleQuote, Id.Left_BTSingleQuote,
                      Id.Left_DollarSingleQuote):
            if tok.id == Id.Left_DollarSingleQuote:
                sq_mode = lex_mode_e.SQ_C
            elif tok.id in (Id.Left_USingleQuote, Id.Left_UTSingleQuote,
                            Id.Left_BSingleQuote, Id.Left_BTSingleQuote):
                sq_mode = lex_mode_e.J8_Str
            else:
                sq_mode = lex_mode_e.SQ_Raw

            left_token = tok
            line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
            w_parser = parse_ctx.MakeWordParser(lex, line_reader)

            tokens = []  # type: List[Token]
            # last_token is pushed by the next loop iteration.
            last_token = w_parser.ReadSingleQuoted(sq_mode, left_token, tokens,
                                                   True)

            sval = word_compile.EvalSingleQuoted(left_token.id, tokens)
            sq_part = SingleQuoted(left_token, sval, last_token)

            typ = Id.Expr_CastedDummy
            opaque = cast(Token, sq_part)  # HACK for expr_to_ast
            done = p.addtoken(typ, opaque, gr.tokens[typ])
            assert not done  # can't end the expression
            continue

    else:
        # We never broke out -- EOF is too soon (how can this happen???)
        # NOTE: the loop above is 'while True' and contains no 'break', so
        # this clause is not reached by normal control flow.
        raise parse.ParseError("incomplete input", tok.id, tok)
335
336
class ExprParser(object):
    """Feeds YSH expression tokens to a reusable pgen2 push parser."""

    def __init__(self, parse_ctx, gr):
        # type: (ParseContext, Grammar) -> None
        self.parse_ctx = parse_ctx
        self.gr = gr
        # A single parser instance; Parse() calls setup() on it every time
        # instead of constructing a new one.
        self.push_parser = parse.Parser(gr)
        # Starts out as None; ctx_PNodeAllocator assigns and resets it.
        self.pnode_alloc = None  # type: Optional[PNodeAllocator]

    def Parse(self, lexer, start_symbol):
        # type: (Lexer, int) -> Tuple[PNode, Token]
        """Parse one expression beginning at start_symbol.

        Args:
          lexer: token source (this parameter shadows the imported `lexer`
            module inside the method)
          start_symbol: grammar symbol passed to the parser's setup()

        Returns:
          (root PNode of the parse tree, last token that was pushed)

        A parse.ParseError from the push parser is reported through p_die(),
        blaming the token attached to the error.
        """
        parser = self.push_parser

        # Reuse the parser
        parser.setup(start_symbol, self.pnode_alloc)
        try:
            last_token = _PushYshTokens(self.parse_ctx, self.gr, parser,
                                        lexer)
        except parse.ParseError as err:
            #log('ERROR %s', err)
            # TODO:
            # - Describe what lexer mode we're in (Invalid syntax in regex)
            #   - Maybe say where the mode started
            # - Id.Unknown_Tok could say "This character is invalid"

            # ParseError has a "too much input" case but I haven't been able
            # to tickle it.  Maybe it's because of the Eof tokens?

            near = ui.PrettyId(err.tok.id)
            p_die('Syntax error in expression (near %s)' % near, err.tok)

        return parser.rootnode, last_token
371
372
class ctx_PNodeAllocator(object):
    """Gives an ExprParser a PNodeAllocator for the duration of a with block.

    The allocator is created and attached in the constructor, not in
    __enter__.  On exit it is cleared and the parser's pnode_alloc is reset
    to None.
    """

    def __init__(self, ep):
        # type: (ExprParser) -> None
        self.expr_parser = ep
        ep.pnode_alloc = PNodeAllocator()

    def __enter__(self):
        # type: () -> None
        # Nothing to do: setup already happened in __init__.
        pass

    def __exit__(self, type, value, traceback):
        # type: (Any, Any, Any) -> None
        ep = self.expr_parser
        ep.pnode_alloc.Clear()
        ep.pnode_alloc = None