OILS / ysh / expr_parse.py

1"""expr_parse.py."""
from __future__ import print_function

from _devbuild.gen.syntax_asdl import (loc, Token, DoubleQuoted, SingleQuoted,
                                       CommandSub, ShArrayLiteral,
                                       CompoundWord, word_part_t, word_e)
from _devbuild.gen.id_kind_asdl import Id, Kind, Id_str
from _devbuild.gen.types_asdl import lex_mode_e

from display import ui
from core.error import p_die
from frontend import consts
from frontend import lexer
from frontend import reader
from mycpp import mylib
from mycpp.mylib import log, tagswitch
from osh import braces
from osh import word_
from osh import word_compile
from pgen2 import parse
from pgen2.pnode import PNodeAllocator

_ = log

from typing import TYPE_CHECKING, Any, Dict, Tuple, List, cast, Optional
if TYPE_CHECKING:
    from frontend.lexer import Lexer
    from frontend.parse_lib import ParseContext
    from pgen2.grammar import Grammar
    from pgen2.pnode import PNode

if mylib.PYTHON:

    class ParseTreePrinter(object):
        """Prints a tree of PNode instances."""

        def __init__(self, names):
            # type: (Dict[int, str]) -> None
            self.names = names
            self.f = mylib.Stdout()

        def _Print(self, pnode, indent, i):
            # type: (PNode, int, int) -> None

            ind = '  ' * indent
            # NOTE:
            # - why isn't 'tok' None for PRODUCTIONS?  There is some
            #   redundancy to get rid of.
            if pnode.tok:
                if isinstance(pnode.tok, Token):
                    v = lexer.TokenVal(pnode.tok)
                else:
                    # e.g. CommandSub for x = $(echo hi)
                    v = repr(pnode.tok)
            else:
                v = '-'
            self.f.write('%s%d %s %s\n' % (ind, i, self.names[pnode.typ], v))
            if pnode.children is not None:
                for i, child in enumerate(pnode.children):
                    self._Print(child, indent + 1, i)

        def Print(self, pnode):
            # type: (PNode) -> None
            self._Print(pnode, 0, 0)

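    # Usage sketch (hypothetical caller; 'names' would be the number -> name
    # map from the loaded Grammar):
    #
    #   printer = ParseTreePrinter(names)
    #   printer.Print(pnode)
    #
    # Each output line is '<indent><child index> <node type name> <token value>'.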

def _Classify(gr, tok):
    # type: (Grammar, Token) -> int

    # We have to match up what ParserGenerator.make_grammar() did when
    # calling make_label() and make_first().  See classify() in
    # opy/pgen2/driver.py.

    id_ = tok.id  # mycpp fix: we need C++ to do uint16_t -> int conversion

    # TODO: use something more efficient than a Dict
    if id_ in gr.tokens:
        return gr.tokens[id_]

    if id_ == Id.Unknown_DEqual:
        p_die('Use === to be exact, or ~== to convert types', tok)

    if id_ == Id.Unknown_DAmp:
        p_die("Use 'and' in expression mode (OILS-ERR-15)", tok)
    if id_ == Id.Unknown_DPipe:
        p_die("Use 'or' in expression mode (OILS-ERR-15)", tok)
    # Not possible to check '!' as it conflicts with Id.Expr_Bang

    if id_ == Id.Unknown_DDot:
        p_die(
            'Use 1..<5 for half-open range, or 1..=5 for closed range (OILS-ERR-16)',
            tok)

    if id_ == Id.Unknown_Tok:
        type_str = ''
    else:
        type_str = ' (%s)' % ui.PrettyId(tok.id)
    p_die('Unexpected token in expression mode%s' % type_str, tok)


# Newlines are ignored between these pairs.
# yapf: disable
_OTHER_BALANCE = {

    # Parenthesized expressions (tuples) and func/proc parameter lists
    Id.Op_LParen: 1,
    Id.Op_RParen: -1,
    Id.Op_LBracket: 1,
    Id.Op_RBracket: -1,

    # Dicts are {}, and the grammar respects Op_Newline.
}
# yapf: enable


def _PushYshTokens(parse_ctx, gr, p, lex):
    # type: (ParseContext, Grammar, parse.Parser, Lexer) -> Token
    """Push tokens onto pgen2's parser.

    Returns the last token so it can be reused/seen by the CommandParser.
    """
    #log('keywords = %s', gr.keywords)
    #log('tokens = %s', gr.tokens)

    last_token = None  # type: Optional[Token]
    prev_was_newline = False

    balance = 0  # to ignore newlines

    while True:
        if last_token:  # e.g. left over from WordParser
            tok = last_token
            #log('last_token = %s', last_token)
            last_token = None
        else:
            tok = lex.Read(lex_mode_e.Expr)
            #log('tok = %s', tok)

        # Comments and whitespace.  Newlines aren't ignored.
        if consts.GetKind(tok.id) == Kind.Ignored:
            continue

        # For multiline lists, maps, etc.
        if tok.id == Id.Op_Newline:
            if balance > 0:
                #log('*** SKIPPING NEWLINE')
                continue
            # Eliminate duplicate newline tokens.  It makes the grammar
            # simpler, and it's consistent with CPython's lexer and our own
            # WordParser.
            if prev_was_newline:
                continue
            prev_was_newline = True
        else:
            prev_was_newline = False

        balance += _OTHER_BALANCE.get(tok.id, 0)
        #log('BALANCE after seeing %s = %d', tok.id, balance)

        if tok.id == Id.Op_LParen:
            # For nesting inside $()
            lex.PushHint(Id.Op_RParen, Id.Op_RParen)

        #if tok.id == Id.Expr_Name and tok.val in KEYWORDS:
        #  tok.id = KEYWORDS[tok.val]
        #  log('Replaced with %s', tok.id)

        assert tok.id < 256, Id_str(tok.id)

        ilabel = _Classify(gr, tok)
        #log('tok = %s, ilabel = %d', tok, ilabel)

        if p.addtoken(tok.id, tok, ilabel):
            return tok

        #
        # Mutually recursive calls into the command/word parsers.
        #

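        # Shell array literals, e.g. 'var fruits = :| apple banana |', plus
        # the legacy 'var a = %( x y )' form.  The words are read with the
        # shell WordParser, then spliced into pgen2 as one opaque token.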
        if tok.id in (Id.Left_ColonPipe,
                      Id.Left_PercentParen):  # :|  %(  LEGACY!
            left_tok = tok
            if tok.id == Id.Left_PercentParen:
                lex.PushHint(Id.Op_RParen, Id.Right_ShArrayLiteral)

            # Blame the opening token
            line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
            w_parser = parse_ctx.MakeWordParser(lex, line_reader)
            words = []  # type: List[CompoundWord]
            close_tok = None  # type: Optional[Token]
            done = False
            while not done:
                w = w_parser.ReadWord(lex_mode_e.ShCommand)
                with tagswitch(w) as case:
                    if case(word_e.Operator):
                        tok = cast(Token, w)
                        if tok.id == Id.Right_ShArrayLiteral:
                            if left_tok.id != Id.Left_PercentParen:
                                p_die('Expected | to close', left_tok)
                            close_tok = tok
                            done = True  # can't use break here
                        elif tok.id == Id.Op_Pipe:
                            if left_tok.id != Id.Left_ColonPipe:
                                p_die('Expected ) to close', left_tok)
                            close_tok = tok
                            done = True  # can't use break here
                        elif tok.id == Id.Op_Newline:  # internal newlines allowed
                            continue
                        else:
                            p_die('Unexpected token in array literal',
                                  loc.Word(w))

                    elif case(word_e.Compound):
                        words.append(cast(CompoundWord, w))

                    else:
                        raise AssertionError()

            words2 = braces.BraceDetectAll(words)
            words3 = word_.TildeDetectAll(words2)

            typ = Id.Expr_CastedDummy

            lit_part = ShArrayLiteral(left_tok, words3, close_tok)
            opaque = cast(Token, lit_part)  # HACK for expr_to_ast
            done = p.addtoken(typ, opaque, gr.tokens[typ])
            assert not done  # can't end the expression

            # Now push the closing token
            ilabel = _Classify(gr, close_tok)
            done = p.addtoken(tok.id, close_tok, ilabel)
            assert not done  # can't end the expression

            continue

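        # Command sub, e.g. 'var x = $(echo hi)': the command is parsed by
        # the CommandParser, and the resulting CommandSub node is pushed
        # into pgen2 as a single dummy token, followed by the closing paren.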
        # $(  @(  ^(
        if tok.id in (Id.Left_DollarParen, Id.Left_AtParen,
                      Id.Left_CaretParen):

            left_token = tok

            lex.PushHint(Id.Op_RParen, Id.Eof_RParen)
            line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
            c_parser = parse_ctx.MakeParserForCommandSub(
                line_reader, lex, Id.Eof_RParen)
            node = c_parser.ParseCommandSub()
            # A little gross: Copied from osh/word_parse.py
            right_token = c_parser.w_parser.cur_token

            cs_part = CommandSub(left_token, node, right_token)

            typ = Id.Expr_CastedDummy
            opaque = cast(Token, cs_part)  # HACK for expr_to_ast
            done = p.addtoken(typ, opaque, gr.tokens[typ])
            assert not done  # can't end the expression

            # Now push the closing )
            ilabel = _Classify(gr, right_token)
            done = p.addtoken(right_token.id, right_token, ilabel)
            assert not done  # can't end the expression

            continue

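        # Double-quoted strings, e.g. 'var greeting = "hi $name"'.  The
        # WordParser reads the parts, and the DoubleQuoted node is pushed
        # as one token.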
261 # " $" """ $""" ^"
262 if tok.id in (Id.Left_DoubleQuote, Id.Left_DollarDoubleQuote,
263 Id.Left_TDoubleQuote, Id.Left_DollarTDoubleQuote,
264 Id.Left_CaretDoubleQuote):
265
266 left_token = tok
267 line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
268 w_parser = parse_ctx.MakeWordParser(lex, line_reader)
269
270 parts = [] # type: List[word_part_t]
271 last_token = w_parser.ReadDoubleQuoted(left_token, parts)
272 expr_dq_part = DoubleQuoted(left_token, parts, last_token)
273
274 typ = Id.Expr_CastedDummy
275 opaque = cast(Token, expr_dq_part) # HACK for expr_to_ast
276 done = p.addtoken(typ, opaque, gr.tokens[typ])
277 assert not done # can't end the expression
278
279 continue
280
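        # Braced var sub in an expression, e.g. 'var x = ${PATH}'.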
        # ${
        if tok.id == Id.Left_DollarBrace:
            left_token = tok
            line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
            w_parser = parse_ctx.MakeWordParser(lex, line_reader)

            part, last_token = w_parser.ReadBracedVarSub(left_token)

            # It's casted word_part.BracedVarSub -> dummy -> expr.BracedVarSub!
            typ = Id.Expr_CastedDummy
            opaque = cast(Token, part)  # HACK for expr_to_ast
            done = p.addtoken(typ, opaque, gr.tokens[typ])
            assert not done  # can't end the expression

            continue

        # 'x'  '''x'''
        # r'x'  r'''x'''
        # u'x'  u'''x'''
        # b'x'  b'''x'''
        # $'x'
        if tok.id in (Id.Left_SingleQuote, Id.Left_TSingleQuote,
                      Id.Left_RSingleQuote, Id.Left_RTSingleQuote,
                      Id.Left_USingleQuote, Id.Left_UTSingleQuote,
                      Id.Left_BSingleQuote, Id.Left_BTSingleQuote,
                      Id.Left_DollarSingleQuote):
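            # Choose the lexer mode for the string body: $'' takes C-style
            # escapes, u''/b'' take J8-style escapes, and the rest are raw.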
            if tok.id == Id.Left_DollarSingleQuote:
                sq_mode = lex_mode_e.SQ_C
            elif tok.id in (Id.Left_USingleQuote, Id.Left_UTSingleQuote,
                            Id.Left_BSingleQuote, Id.Left_BTSingleQuote):
                sq_mode = lex_mode_e.J8_Str
            else:
                sq_mode = lex_mode_e.SQ_Raw

            left_token = tok
            line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
            w_parser = parse_ctx.MakeWordParser(lex, line_reader)

            tokens = []  # type: List[Token]
            last_token = w_parser.ReadSingleQuoted(sq_mode, left_token, tokens,
                                                   True)

            sval = word_compile.EvalSingleQuoted(left_token.id, tokens)
            sq_part = SingleQuoted(left_token, sval, last_token)

            typ = Id.Expr_CastedDummy
            opaque = cast(Token, sq_part)  # HACK for expr_to_ast
            done = p.addtoken(typ, opaque, gr.tokens[typ])
            assert not done  # can't end the expression
            continue

    else:
        # We never broke out -- EOF is too soon (how can this happen???)
        raise parse.ParseError("incomplete input", tok.id, tok)


class ExprParser(object):
    """A wrapper around a pgen2 parser."""

    def __init__(self, parse_ctx, gr):
        # type: (ParseContext, Grammar) -> None
        self.parse_ctx = parse_ctx
        self.gr = gr
        # Reused multiple times.
        self.push_parser = parse.Parser(gr)
        self.pnode_alloc = None  # type: Optional[PNodeAllocator]

    def Parse(self, lexer, start_symbol):
        # type: (Lexer, int) -> Tuple[PNode, Token]

        # Reuse the parser
        self.push_parser.setup(start_symbol, self.pnode_alloc)
        try:
            last_token = _PushYshTokens(self.parse_ctx, self.gr,
                                        self.push_parser, lexer)
        except parse.ParseError as e:
            #log('ERROR %s', e)
            # TODO:
            # - Describe what lexer mode we're in (Invalid syntax in regex)
            # - Maybe say where the mode started
            # - Id.Unknown_Tok could say "This character is invalid"

            # ParseError has a "too much input" case but I haven't been able to
            # tickle it.  Maybe it's because of the Eof tokens?

            p_die(
                'Syntax error in expression (near %s)' % ui.PrettyId(e.tok.id),
                e.tok)

        return self.push_parser.rootnode, last_token


class ctx_PNodeAllocator(object):

    def __init__(self, ep):
        # type: (ExprParser) -> None
        self.expr_parser = ep
        self.expr_parser.pnode_alloc = PNodeAllocator()

    def __enter__(self):
        # type: () -> None
        pass

    def __exit__(self, type, value, traceback):
        # type: (Any, Any, Any) -> None
        self.expr_parser.pnode_alloc.Clear()
        self.expr_parser.pnode_alloc = None
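

# Hypothetical usage sketch -- in reality ExprParser is constructed and driven
# by ParseContext (frontend/parse_lib.py):
#
#   e_parser = ExprParser(parse_ctx, ysh_grammar)
#   with ctx_PNodeAllocator(e_parser):
#       pnode, last_token = e_parser.Parse(lex, start_symbol)
#
# ctx_PNodeAllocator gives the parser a fresh PNodeAllocator and clears it on
# exit, so PNode memory doesn't outlive the parse.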