1 | """expr_parse.py."""
|
2 | from __future__ import print_function
|
3 |
|
4 | from _devbuild.gen.syntax_asdl import (loc, Token, DoubleQuoted, SingleQuoted,
|
5 | CommandSub, ShArrayLiteral,
|
6 | CompoundWord, word_part_t, word_e)
|
7 | from _devbuild.gen.id_kind_asdl import Id, Kind, Id_str
|
8 | from _devbuild.gen.types_asdl import lex_mode_e
|
9 |
|
10 | from display import ui
|
11 | from core.error import p_die
|
12 | from frontend import consts
|
13 | from frontend import lexer
|
14 | from frontend import reader
|
15 | from mycpp import mylib
|
16 | from mycpp.mylib import log, tagswitch
|
17 | from osh import braces
|
18 | from osh import word_
|
19 | from osh import word_compile
|
20 | from pgen2 import parse
|
21 | from pgen2.pnode import PNodeAllocator
|
22 |
|
23 | _ = log
|
24 |
|
25 | from typing import TYPE_CHECKING, Any, Dict, Tuple, List, cast, Optional
|
26 | if TYPE_CHECKING:
|
27 | from frontend.lexer import Lexer
|
28 | from frontend.parse_lib import ParseContext
|
29 | from pgen2.grammar import Grammar
|
30 | from pgen2.pnode import PNode
|
31 |
|
if mylib.PYTHON:

    class ParseTreePrinter(object):
        """Prints a tree of PNode instances."""

        def __init__(self, names):
            # type: (Dict[int, str]) -> None
            self.names = names
            self.f = mylib.Stdout()

        def _Print(self, pnode, indent, i):
            # type: (PNode, int, int) -> None

            ind = ' ' * indent
            # NOTE:
            # - why isn't 'tok' None for PRODUCTIONS?  There is some redundancy
            #   to get rid of.
            if pnode.tok:
                if isinstance(pnode.tok, Token):
                    v = lexer.TokenVal(pnode.tok)
                else:
                    # e.g. CommandSub for x = $(echo hi)
                    v = repr(pnode.tok)
            else:
                v = '-'
            self.f.write('%s%d %s %s\n' % (ind, i, self.names[pnode.typ], v))
            if pnode.children is not None:
                for i, child in enumerate(pnode.children):
                    self._Print(child, indent + 1, i)

        def Print(self, pnode):
            # type: (PNode) -> None
            self._Print(pnode, 0, 0)
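
# Usage sketch for debugging (hypothetical; 'names' is a Dict[int, str] that
# maps pgen2 token/nonterminal numbers to readable strings):
#
#     printer = ParseTreePrinter(names)
#     printer.Print(pnode)  # one line per PNode, indented by tree depth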
|
def _Classify(gr, tok):
    # type: (Grammar, Token) -> int

    # We have to match up what ParserGenerator.make_grammar() did when
    # calling make_label() and make_first().  See classify() in
    # opy/pgen2/driver.py.

    id_ = tok.id  # mycpp fix: we need C++ to do uint16_t -> int conversion

    # TODO: use something more efficient than a Dict
    if id_ in gr.tokens:
        return gr.tokens[id_]

    if id_ == Id.Unknown_DEqual:
        p_die('Use === to be exact, or ~== to convert types', tok)

    if id_ == Id.Unknown_DAmp:
        p_die("Use 'and' in expression mode (OILS-ERR-15)", tok)
    if id_ == Id.Unknown_DPipe:
        p_die("Use 'or' in expression mode (OILS-ERR-15)", tok)
    # Not possible to check '!' as it conflicts with Id.Expr_Bang

    if id_ == Id.Unknown_DDot:
        p_die(
            'Use ..< for half-open range, or ..= for closed range (OILS-ERR-16)',
            tok)

    if id_ == Id.Unknown_Tok:
        type_str = ''
    else:
        type_str = ' (%s)' % ui.PrettyId(tok.id)
    p_die('Unexpected token in expression mode%s' % type_str, tok)
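
# Example of the guidance above (sketch): in YSH expression mode, C-style
# operators are rejected with a hint, e.g.
#
#     var b = x && y    # => "Use 'and' in expression mode (OILS-ERR-15)"
#     var r = 1..5      # => "Use ..< for half-open range, or ..= for
#                       #     closed range (OILS-ERR-16)"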
|
# Newlines are ignored between these pairs.
# yapf: disable
_OTHER_BALANCE = {

    # Parenthesized expressions (tuples) and func/proc parameter lists
    Id.Op_LParen: 1,
    Id.Op_RParen: -1,
    Id.Op_LBracket: 1,
    Id.Op_RBracket: -1,

    # Dicts are {}, and the grammar respects Op_Newline.
}
# yapf: enable
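
# Example (sketch): in a multi-line expression like
#
#     var x = [1,
#              2]
#
# Op_LBracket bumps the balance to 1, so _PushYshTokens below skips the
# Op_Newline after '1,' instead of feeding it to the grammar.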
|
def _PushYshTokens(parse_ctx, gr, p, lex):
    # type: (ParseContext, Grammar, parse.Parser, Lexer) -> Token
    """Push tokens onto pgen2's parser.

    Returns the last token so it can be reused/seen by the CommandParser.
    """
    #log('keywords = %s', gr.keywords)
    #log('tokens = %s', gr.tokens)

    last_token = None  # type: Optional[Token]
    prev_was_newline = False

    balance = 0  # to ignore newlines

    while True:
        if last_token:  # e.g. left over from WordParser
            tok = last_token
            #log('last_token = %s', last_token)
            last_token = None
        else:
            tok = lex.Read(lex_mode_e.Expr)
            #log('tok = %s', tok)

        # Comments and whitespace.  Newlines aren't ignored.
        if consts.GetKind(tok.id) == Kind.Ignored:
            continue

        # For multiline lists, maps, etc.
        if tok.id == Id.Op_Newline:
            if balance > 0:
                #log('*** SKIPPING NEWLINE')
                continue
            # Eliminate duplicate newline tokens.  It makes the grammar
            # simpler, and it's consistent with CPython's lexer and our own
            # WordParser.
            if prev_was_newline:
                continue
            prev_was_newline = True
        else:
            prev_was_newline = False

        balance += _OTHER_BALANCE.get(tok.id, 0)
        #log('BALANCE after seeing %s = %d', tok.id, balance)

        if tok.id == Id.Op_LParen:
            # For nesting inside $()
            lex.PushHint(Id.Op_RParen, Id.Op_RParen)

        #if tok.id == Id.Expr_Name and tok.val in KEYWORDS:
        #  tok.id = KEYWORDS[tok.val]
        #  log('Replaced with %s', tok.id)

        assert tok.id < 256, Id_str(tok.id)

        ilabel = _Classify(gr, tok)
        #log('tok = %s, ilabel = %d', tok, ilabel)

        if p.addtoken(tok.id, tok, ilabel):
            return tok
|
        #
        # Mutually recursive calls into the command/word parsers.
        #
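        # Each section below parses a sub-language with the word or command
        # parser, then smuggles the resulting node into pgen2 as a single
        # Id.Expr_CastedDummy token -- the cast to Token is a lie that
        # expr_to_ast undoes on the other side.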
|
        if tok.id in (Id.Left_ColonPipe,
                      Id.Left_PercentParen):  # :|  %(  LEGACY!
            left_tok = tok
            if tok.id == Id.Left_PercentParen:
                lex.PushHint(Id.Op_RParen, Id.Right_ShArrayLiteral)

            # Blame the opening token
            line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
            w_parser = parse_ctx.MakeWordParser(lex, line_reader)
            words = []  # type: List[CompoundWord]
            close_tok = None  # type: Optional[Token]
            done = False
            while not done:
                w = w_parser.ReadWord(lex_mode_e.ShCommand)
                with tagswitch(w) as case:
                    if case(word_e.Operator):
                        tok = cast(Token, w)
                        if tok.id == Id.Right_ShArrayLiteral:
                            if left_tok.id != Id.Left_PercentParen:
                                p_die('Expected ) to close', left_tok)
                            close_tok = tok
                            done = True  # can't use break here
                        elif tok.id == Id.Op_Pipe:
                            if left_tok.id != Id.Left_ColonPipe:
                                p_die('Expected ) to close', left_tok)
                            close_tok = tok
                            done = True  # can't use break here
                        elif tok.id == Id.Op_Newline:  # internal newlines allowed
                            continue
                        else:
                            p_die('Unexpected token in array literal',
                                  loc.Word(w))

                    elif case(word_e.Compound):
                        words.append(cast(CompoundWord, w))

                    else:
                        raise AssertionError()

            words2 = braces.BraceDetectAll(words)
            words3 = word_.TildeDetectAll(words2)

            typ = Id.Expr_CastedDummy

            lit_part = ShArrayLiteral(left_tok, words3, close_tok)
            opaque = cast(Token, lit_part)  # HACK for expr_to_ast
            done = p.addtoken(typ, opaque, gr.tokens[typ])
            assert not done  # can't end the expression

            # Now push the closing )
            ilabel = _Classify(gr, close_tok)
            done = p.addtoken(tok.id, close_tok, ilabel)
            assert not done  # can't end the expression

            continue

        # $(  @(  ^(
        if tok.id in (Id.Left_DollarParen, Id.Left_AtParen,
                      Id.Left_CaretParen):

            left_token = tok

            lex.PushHint(Id.Op_RParen, Id.Eof_RParen)
            line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
            c_parser = parse_ctx.MakeParserForCommandSub(
                line_reader, lex, Id.Eof_RParen)
            node = c_parser.ParseCommandSub()
            # A little gross: Copied from osh/word_parse.py
            right_token = c_parser.w_parser.cur_token

            cs_part = CommandSub(left_token, node, right_token)

            typ = Id.Expr_CastedDummy
            opaque = cast(Token, cs_part)  # HACK for expr_to_ast
            done = p.addtoken(typ, opaque, gr.tokens[typ])
            assert not done  # can't end the expression

            # Now push the closing )
            ilabel = _Classify(gr, right_token)
            done = p.addtoken(right_token.id, right_token, ilabel)
            assert not done  # can't end the expression

            continue

        # "  $"  """  $"""  ^"
        if tok.id in (Id.Left_DoubleQuote, Id.Left_DollarDoubleQuote,
                      Id.Left_TDoubleQuote, Id.Left_DollarTDoubleQuote,
                      Id.Left_CaretDoubleQuote):

            left_token = tok
            line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
            w_parser = parse_ctx.MakeWordParser(lex, line_reader)

            parts = []  # type: List[word_part_t]
            last_token = w_parser.ReadDoubleQuoted(left_token, parts)
            expr_dq_part = DoubleQuoted(left_token, parts, last_token)

            typ = Id.Expr_CastedDummy
            opaque = cast(Token, expr_dq_part)  # HACK for expr_to_ast
            done = p.addtoken(typ, opaque, gr.tokens[typ])
            assert not done  # can't end the expression

            continue

        # ${
        if tok.id == Id.Left_DollarBrace:
            left_token = tok
            line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
            w_parser = parse_ctx.MakeWordParser(lex, line_reader)

            part, last_token = w_parser.ReadBracedVarSub(left_token)

            # It's casted word_part.BracedVarSub -> dummy -> expr.BracedVarSub!
            typ = Id.Expr_CastedDummy
            opaque = cast(Token, part)  # HACK for expr_to_ast
            done = p.addtoken(typ, opaque, gr.tokens[typ])
            assert not done  # can't end the expression

            continue

        # 'x'  '''x'''
        # r'x'  r'''x'''
        # u'x'  u'''x'''
        # b'x'  b'''x'''
        # $'x'
        if tok.id in (Id.Left_SingleQuote, Id.Left_TSingleQuote,
                      Id.Left_RSingleQuote, Id.Left_RTSingleQuote,
                      Id.Left_USingleQuote, Id.Left_UTSingleQuote,
                      Id.Left_BSingleQuote, Id.Left_BTSingleQuote,
                      Id.Left_DollarSingleQuote):
            if tok.id == Id.Left_DollarSingleQuote:
                sq_mode = lex_mode_e.SQ_C
            elif tok.id in (Id.Left_USingleQuote, Id.Left_UTSingleQuote,
                            Id.Left_BSingleQuote, Id.Left_BTSingleQuote):
                sq_mode = lex_mode_e.J8_Str
            else:
                sq_mode = lex_mode_e.SQ_Raw

            left_token = tok
            line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
            w_parser = parse_ctx.MakeWordParser(lex, line_reader)

            tokens = []  # type: List[Token]
            last_token = w_parser.ReadSingleQuoted(sq_mode, left_token, tokens,
                                                   True)

            sval = word_compile.EvalSingleQuoted(left_token.id, tokens)
            sq_part = SingleQuoted(left_token, sval, last_token)

            typ = Id.Expr_CastedDummy
            opaque = cast(Token, sq_part)  # HACK for expr_to_ast
            done = p.addtoken(typ, opaque, gr.tokens[typ])
            assert not done  # can't end the expression
            continue

    else:
        # We never broke out -- EOF is too soon (how can this happen???)
        raise parse.ParseError("incomplete input", tok.id, tok)

|
class ExprParser(object):
    """A wrapper around a pgen2 parser."""

    def __init__(self, parse_ctx, gr):
        # type: (ParseContext, Grammar) -> None
        self.parse_ctx = parse_ctx
        self.gr = gr
        # Reused multiple times.
        self.push_parser = parse.Parser(gr)
        self.pnode_alloc = None  # type: Optional[PNodeAllocator]

    def Parse(self, lexer, start_symbol):
        # type: (Lexer, int) -> Tuple[PNode, Token]

        # Reuse the parser
        self.push_parser.setup(start_symbol, self.pnode_alloc)
        try:
            last_token = _PushYshTokens(self.parse_ctx, self.gr,
                                        self.push_parser, lexer)
        except parse.ParseError as e:
            #log('ERROR %s', e)
            # TODO:
            # - Describe what lexer mode we're in (Invalid syntax in regex)
            # - Maybe say where the mode started
            # - Id.Unknown_Tok could say "This character is invalid"

            # ParseError has a "too much input" case but I haven't been able to
            # tickle it.  Maybe it's because of the Eof tokens?

            p_die(
                'Syntax error in expression (near %s)' % ui.PrettyId(e.tok.id),
                e.tok)

        return self.push_parser.rootnode, last_token
|

class ctx_PNodeAllocator(object):

    def __init__(self, ep):
        # type: (ExprParser) -> None
        self.expr_parser = ep
        self.expr_parser.pnode_alloc = PNodeAllocator()

    def __enter__(self):
        # type: () -> None
        pass

    def __exit__(self, type, value, traceback):
        # type: (Any, Any, Any) -> None
        self.expr_parser.pnode_alloc.Clear()
        self.expr_parser.pnode_alloc = None
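
# Intended usage (sketch; the start symbol is a generated nonterminal
# constant, so the exact name below is illustrative):
#
#     ep = ExprParser(parse_ctx, ysh_grammar)
#     with ctx_PNodeAllocator(ep):
#         pnode, last_token = ep.Parse(lex, start_symbol)
#         # transform pnode into an AST here -- __exit__ clears the PNodes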
|