1 | # Copyright 2016 Andy Chu. All rights reserved.
|
2 | # Licensed under the Apache License, Version 2.0 (the "License");
|
3 | # you may not use this file except in compliance with the License.
|
4 | # You may obtain a copy of the License at
|
5 | #
|
6 | # http://www.apache.org/licenses/LICENSE-2.0
|
7 | """
|
8 | word_parse.py - Parse the shell word language.
|
9 |
|
10 | Hairy example:
|
11 |
|
12 | hi$((1 + 2))"$(echo hi)"${var:-__"$(echo default)"__}
|
13 |
|
14 | Substitutions can be nested, but which inner subs are allowed depends on the
|
15 | outer sub. Notes:
|
16 |
|
17 | lex_mode_e.ShCommand (_ReadUnquotedLeftParts)
|
18 | All subs and quotes are allowed:
|
19 | $v ${v} $() `` $(()) '' "" $'' $"" <() >()
|
20 |
|
21 | lex_mode_e.DQ (_ReadDoubleQuotedLeftParts)
|
22 | Var, Command, Arith, but no quotes.
|
23 | $v ${v} $() `` $(())
|
24 | No process substitution.
|
25 |
|
26 | lex_mode_e.Arith
|
27 | Similar to DQ: Var, Command, and Arith sub, but no process sub. bash doesn't
|
28 | allow quotes, but OSH does. We allow ALL FOUR kinds of quotes, because we
|
29 | need those for associative array indexing.
|
30 |
|
31 | lex_mode_e.VSub_ArgUnquoted
|
32 | Like ShCommand, everything is allowed (even process substitutions), but we
|
33 | stop at }, and space is SIGNIFICANT.
|
34 |
|
35 | Example: ${a:- b }
|
36 |
|
37 | ${X:-$v} ${X:-${v}} ${X:-$(echo hi)} ${X:-`echo hi`} ${X:-$((1+2))}
|
38 | ${X:-'single'} ${X:-"double"} ${X:-$'\n'} ${X:-<(echo hi)}
|
39 |
|
40 | lex_mode_e.VSub_ArgDQ
|
41 |   In contrast to DQ, VSub_ArgDQ accepts nested "" and $'' and $"", e.g.
|
42 | "${x:-"default"}".
|
43 |
|
44 | In contrast, VSub_ArgUnquoted respects single quotes and process
|
45 | substitution.
|
46 |
|
47 | It's weird that double quotes are allowed. Space is also significant here,
|
48 | e.g. "${x:-a "b"}".
|
49 | """
|
50 |
|
51 | from _devbuild.gen import grammar_nt
|
52 | from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str, Kind
|
53 | from _devbuild.gen.types_asdl import (lex_mode_t, lex_mode_e)
|
54 | from _devbuild.gen.syntax_asdl import (
|
55 | BoolParamBox,
|
56 | Token,
|
57 | SimpleVarSub,
|
58 | loc,
|
59 | source,
|
60 | DoubleQuoted,
|
61 | SingleQuoted,
|
62 | BracedVarSub,
|
63 | CommandSub,
|
64 | ShArrayLiteral,
|
65 | AssocPair,
|
66 | bracket_op,
|
67 | bracket_op_t,
|
68 | suffix_op,
|
69 | suffix_op_t,
|
70 | rhs_word,
|
71 | rhs_word_e,
|
72 | rhs_word_t,
|
73 | word_e,
|
74 | word_t,
|
75 | CompoundWord,
|
76 | word_part,
|
77 | word_part_t,
|
78 | y_lhs_e,
|
79 | arith_expr_t,
|
80 | command,
|
81 | expr,
|
82 | expr_e,
|
83 | expr_t,
|
84 | pat_t,
|
85 | ArgList,
|
86 | Proc,
|
87 | Func,
|
88 | Subscript,
|
89 | Attribute,
|
90 | arith_expr,
|
91 | )
|
92 | from core import alloc
|
93 | from core.error import p_die
|
94 | from mycpp.mylib import log
|
95 | from core import pyutil
|
96 | from display import ui
|
97 | from frontend import consts
|
98 | from frontend import lexer
|
99 | from frontend import reader
|
100 | from osh import tdop
|
101 | from osh import arith_parse
|
102 | from osh import braces
|
103 | from osh import word_
|
104 | from osh import word_compile
|
105 | from mycpp.mylib import tagswitch
|
106 |
|
107 | from libc import HAVE_FNM_EXTMATCH
|
108 |
|
109 | from typing import List, Optional, Tuple, cast
|
110 | from typing import TYPE_CHECKING
|
111 | if TYPE_CHECKING:
|
112 | from frontend.lexer import Lexer
|
113 | from frontend.parse_lib import ParseContext
|
114 | from frontend.reader import _Reader
|
115 | from osh.cmd_parse import VarChecker
|
116 |
|
117 | unused1 = log
|
118 | unused2 = Id_str
|
119 |
|
120 | KINDS_THAT_END_WORDS = [Kind.Eof, Kind.WS, Kind.Op, Kind.Right]
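
# Illustrative sketch (not part of this module): callers obtain a word parser
# from ParseContext and drive it in a loop, roughly:
#
#     w_parser = parse_ctx.MakeWordParser(lexer, line_reader)
#     while True:
#         w = w_parser.ReadWord(lex_mode_e.ShCommand)
#         ...  # stop on an Eof token; see osh/cmd_parse.py for the real loop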
|
121 |
|
122 |
|
123 | class WordEmitter(object):
|
124 | """Common interface for [ and [["""
|
125 |
|
126 | def __init__(self):
|
127 | # type: () -> None
|
128 | """Empty constructor for mycpp."""
|
129 | pass
|
130 |
|
131 | def ReadWord(self, lex_mode):
|
132 | # type: (lex_mode_t) -> word_t
|
133 | raise NotImplementedError()
|
134 |
|
135 |
|
136 | class WordParser(WordEmitter):
|
137 |
|
138 | def __init__(self, parse_ctx, lexer, line_reader):
|
139 | # type: (ParseContext, Lexer, _Reader) -> None
|
140 | self.parse_ctx = parse_ctx
|
141 | self.lexer = lexer
|
142 | self.line_reader = line_reader
|
143 | self.arena = line_reader.arena
|
144 |
|
145 | self.parse_opts = parse_ctx.parse_opts
|
146 | self.a_parser = tdop.TdopParser(arith_parse.Spec(), self,
|
147 | self.parse_opts)
|
148 | self.Reset()
|
149 |
|
150 | def Init(self, lex_mode):
|
151 | # type: (lex_mode_t) -> None
|
152 | """Used to parse arithmetic, see ParseContext."""
|
153 | self.next_lex_mode = lex_mode
|
154 |
|
155 | def Reset(self):
|
156 | # type: () -> None
|
157 | """Called by interactive loop."""
|
158 | # For _GetToken()
|
159 | self.cur_token = None # type: Token
|
160 | self.token_kind = Kind.Undefined
|
161 | self.token_type = Id.Undefined_Tok
|
162 |
|
163 | self.next_lex_mode = lex_mode_e.ShCommand
|
164 |
|
165 | # Boolean mutated by CommandParser via word_.ctx_EmitDocToken. For ### doc
|
166 | # comments
|
167 | self.emit_doc_token = False
|
168 | # Boolean mutated by CommandParser via word_.ctx_Multiline. '...' starts
|
169 | # multiline mode.
|
170 | self.multiline = False
|
171 |
|
172 | # For detecting invalid \n\n in multiline mode. Counts what we got
|
173 | # directly from the lexer.
|
174 | self.newline_state = 0
|
175 | # For consolidating \n\n -> \n for the CALLER. This simplifies the parsers
|
176 | # that consume words.
|
177 | self.returned_newline = False
|
178 |
|
179 | # For integration with pgen2
|
180 | self.buffered_word = None # type: word_t
|
181 |
|
182 | def _GetToken(self):
|
183 | # type: () -> None
|
184 | """Call this when you need to make a decision based on any of:
|
185 |
|
186 | self.token_type
|
187 | self.token_kind
|
188 | self.cur_token
|
189 | """
|
190 | if self.next_lex_mode == lex_mode_e.Undefined:
|
191 | return # _SetNext() not called, so do nothing
|
192 |
|
193 | is_fake = self.next_lex_mode == lex_mode_e.BashRegexFakeInner
|
194 | real_mode = (lex_mode_e.BashRegex if is_fake else self.next_lex_mode)
|
195 |
|
196 | self.cur_token = self.lexer.Read(real_mode)
|
197 |
|
198 | # MUTATE TOKEN for fake lexer mode.
|
199 | # This is for crazy stuff bash allows, like [[ s =~ (< >) ]]
|
200 | if (is_fake and self.cur_token.id
|
201 | in (Id.WS_Space, Id.BashRegex_AllowedInParens)):
|
202 | self.cur_token.id = Id.Lit_Chars
|
203 |
|
204 | self.token_type = self.cur_token.id
|
205 | self.token_kind = consts.GetKind(self.token_type)
|
206 |
|
207 | # number of consecutive newlines, ignoring whitespace
|
208 | if self.token_type == Id.Op_Newline:
|
209 | self.newline_state += 1
|
210 | elif self.token_kind != Kind.WS:
|
211 | self.newline_state = 0
|
212 |
|
213 | self.parse_ctx.trail.AppendToken(self.cur_token) # For completion
|
214 | self.next_lex_mode = lex_mode_e.Undefined
|
215 |
|
216 | def _SetNext(self, lex_mode):
|
217 | # type: (lex_mode_t) -> None
|
218 | """Set the next lex state, but don't actually read a token.
|
219 |
|
220 | We need this for proper interactive parsing.
|
221 | """
|
222 | self.next_lex_mode = lex_mode
|
223 |
|
224 | def _ReadVarOpArg(self, arg_lex_mode):
|
225 | # type: (lex_mode_t) -> rhs_word_t
|
226 |
|
227 | # NOTE: Operators like | and < are not treated as special, so ${a:- | >} is
|
228 | # valid, even when unquoted.
|
229 | self._SetNext(arg_lex_mode)
|
230 | self._GetToken()
|
231 |
|
232 | w = self._ReadVarOpArg2(arg_lex_mode, Id.Undefined_Tok,
|
233 | True) # empty_ok
|
234 |
|
235 | # If the Compound has no parts, and we're in a double-quoted VarSub
|
236 | # arg, and empty_ok, then return Empty. This is so it can evaluate to
|
237 | # the empty string and not get elided.
|
238 | #
|
239 | # Examples:
|
240 | # - "${s:-}", "${s/%pat/}"
|
241 | # It's similar to LooksLikeShAssignment where we turn x= into x=''. And it
|
242 | # has the same potential problem of not having Token location info.
|
243 | #
|
244 | # NOTE: empty_ok is False only for the PatSub pattern, which means we'll
|
245 | # return a Compound with no parts, which is explicitly checked with a
|
246 | # custom error message.
|
247 | if len(w.parts) == 0 and arg_lex_mode == lex_mode_e.VSub_ArgDQ:
|
248 | return rhs_word.Empty
|
249 |
|
250 | return w
|
251 |
|
252 | def _ReadVarOpArg2(self, arg_lex_mode, eof_type, empty_ok):
|
253 | # type: (lex_mode_t, Id_t, bool) -> CompoundWord
|
254 | """Return a CompoundWord.
|
255 |
|
256 | Helper function for _ReadVarOpArg and used directly by
|
257 | _ReadPatSubVarOp.
|
258 | """
|
259 | w = self._ReadCompoundWord3(arg_lex_mode, eof_type, empty_ok)
|
260 | #log('w %s', w)
|
261 | tilde = word_.TildeDetect(w)
|
262 | if tilde:
|
263 | w = tilde
|
264 | return w
|
265 |
|
266 | def _ReadSliceVarOp(self):
|
267 | # type: () -> suffix_op.Slice
|
268 | """
|
269 |         Looking at the token after the first ':'
|
270 |
|
271 | ArithExpr? (':' ArithExpr? )? '}'
|
272 | """
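
        # Illustrative results, per the quirk handling below:

        #   ${a:1:2}  ->  Slice(1, 2)

        #   ${a:1}    ->  Slice(1, None)               # length N

        #   ${a:1:}   ->  Slice(1, EmptyZero)          # bash quirk: length ZERO

        #   ${a::}    ->  Slice(EmptyZero, EmptyZero)

        # (with strict_parse_slice on, the length-ZERO cases are parse errors)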
|
273 | self._NextNonSpace()
|
274 |
|
275 | cur_id = self.token_type
|
276 |
|
277 | if cur_id in (Id.Arith_RBrace, Id.Arith_Colon): # ${a:} or ${a::}
|
278 | begin = arith_expr.EmptyZero # type: arith_expr_t
|
279 | else:
|
280 | begin = self.a_parser.Parse()
|
281 | cur_id = self.a_parser.CurrentId() # advance
|
282 |
|
283 | if cur_id == Id.Arith_RBrace: # ${a:1} or ${@:1}
|
284 | # No length specified, so it's N
|
285 | no_length = None # type: Optional[arith_expr_t]
|
286 | return suffix_op.Slice(begin, no_length)
|
287 |
|
288 | elif cur_id == Id.Arith_Colon: # ${a:1:} or ${@:1:}
|
289 | colon_tok = self.cur_token
|
290 | self._NextNonSpace()
|
291 |
|
292 | if self.token_type == Id.Arith_RBrace:
|
293 | # quirky bash behavior:
|
294 | # ${a:1:} or ${a::} means length ZERO
|
295 | # but ${a:1} or ${a:} means length N
|
296 | if self.parse_opts.strict_parse_slice():
|
297 | p_die(
|
298 | "Slice length: Add explicit zero, or omit : for N (strict_parse_slice)",
|
299 | colon_tok)
|
300 |
|
301 | length = arith_expr.EmptyZero # type: arith_expr_t
|
302 | else:
|
303 | length = self._ReadArithExpr(Id.Arith_RBrace)
|
304 |
|
305 | return suffix_op.Slice(begin, length)
|
306 |
|
307 | else:
|
308 | p_die("Expected : or } in slice", self.cur_token)
|
309 |
|
310 | raise AssertionError() # for MyPy
|
311 |
|
312 | def _ReadPatSubVarOp(self):
|
313 | # type: () -> suffix_op.PatSub
|
314 | """Looking at the first '/' after VarOf:
|
315 |
|
316 | VarSub = ...
|
317 | | VarOf '/' Match ( '/' WORD? )?
|
318 | Match = '/' WORD # can't be empty
|
319 | | '#' WORD? # may be empty
|
320 | | '%' WORD?
|
321 | """
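
        # Illustrative examples, with replace_mode in parens:

        #   ${x/pat/repl}    replace first match   (Id.Undefined_Tok)

        #   ${x//pat/repl}   replace all matches   (Id.Lit_Slash)

        #   ${x/#pat/repl}   anchor at start       (Id.Lit_Pound)

        #   ${x/%pat/repl}   anchor at end         (Id.Lit_Percent)

        #   ${x/pat}         empty replacement, same as ${x/pat/}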
|
322 | slash_tok = self.cur_token # location info
|
323 |         replace_mode = Id.Undefined_Tok  # set below to one of / # % (bizarre syntax)
|
324 |
|
325 | self._SetNext(lex_mode_e.VSub_ArgUnquoted) # advance past /
|
326 |
|
327 | self._GetToken()
|
328 | if self.token_type == Id.Right_DollarBrace:
|
329 | pat = CompoundWord([])
|
330 | return suffix_op.PatSub(pat, rhs_word.Empty, replace_mode,
|
331 | slash_tok)
|
332 |
|
333 | if self.token_type in (Id.Lit_Slash, Id.Lit_Pound, Id.Lit_Percent):
|
334 | replace_mode = self.token_type
|
335 | self._SetNext(lex_mode_e.VSub_ArgUnquoted)
|
336 |
|
337 | # Bash quirk:
|
338 | # echo ${x/#/replace} has an empty pattern
|
339 | # echo ${x////replace} is non-empty; it means echo ${x//'/'/replace}
|
340 | empty_ok = replace_mode != Id.Lit_Slash
|
341 | pat = self._ReadVarOpArg2(lex_mode_e.VSub_ArgUnquoted, Id.Lit_Slash,
|
342 | empty_ok)
|
343 | #log('pat 1 %r', pat)
|
344 |
|
345 | if self.token_type == Id.Lit_Slash:
|
346 | # read until }
|
347 | replace = self._ReadVarOpArg(
|
348 | lex_mode_e.VSub_ArgUnquoted) # type: rhs_word_t
|
349 | #log('r 1 %r', replace)
|
350 | else:
|
351 | # e.g. ${v/a} is the same as ${v/a/} -- empty replacement string
|
352 | replace = rhs_word.Empty
|
353 |
|
354 | self._GetToken()
|
355 | if self.token_type != Id.Right_DollarBrace:
|
356 | # This happens on invalid code
|
357 | p_die(
|
358 | "Expected } after replacement string, got %s" %
|
359 | ui.PrettyId(self.token_type), self.cur_token)
|
360 |
|
361 | return suffix_op.PatSub(pat, replace, replace_mode, slash_tok)
|
362 |
|
363 | def _ReadSubscript(self):
|
364 | # type: () -> bracket_op_t
|
365 | """ Subscript = '[' ('@' | '*' | ArithExpr) ']' """
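
        # E.g. ${a[@]} and ${a[*]} yield bracket_op.WholeArray, while ${a[i+1]}

        # yields bracket_op.ArrayIndex with a parsed arith expression.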
|
366 | # Lookahead to see if we get @ or *. Otherwise read a full arithmetic
|
367 | # expression.
|
368 | next_id = self.lexer.LookPastSpace(lex_mode_e.Arith)
|
369 | if next_id in (Id.Lit_At, Id.Arith_Star):
|
370 | op = bracket_op.WholeArray(next_id) # type: bracket_op_t
|
371 |
|
372 | self._SetNext(lex_mode_e.Arith) # skip past [
|
373 | self._GetToken()
|
374 | self._SetNext(lex_mode_e.Arith) # skip past @
|
375 | self._GetToken()
|
376 | else:
|
377 | self._SetNext(lex_mode_e.Arith) # skip past [
|
378 | anode = self._ReadArithExpr(Id.Arith_RBracket)
|
379 | op = bracket_op.ArrayIndex(anode)
|
380 |
|
381 | if self.token_type != Id.Arith_RBracket: # Should be looking at ]
|
382 | p_die('Expected ] to close subscript', self.cur_token)
|
383 |
|
384 | self._SetNext(lex_mode_e.VSub_2) # skip past ]
|
385 | self._GetToken() # Needed to be in the same spot as no subscript
|
386 |
|
387 | return op
|
388 |
|
389 | def _ParseVarOf(self):
|
390 | # type: () -> BracedVarSub
|
391 | """
|
392 | VarOf = NAME Subscript?
|
393 | | NUMBER # no subscript allowed, none of these are arrays
|
394 | # ${@[1]} doesn't work, even though slicing does
|
395 | | VarSymbol
|
396 | """
|
397 | self._GetToken()
|
398 | name_token = self.cur_token
|
399 | self._SetNext(lex_mode_e.VSub_2)
|
400 |
|
401 | self._GetToken() # Check for []
|
402 | if self.token_type == Id.VOp2_LBracket:
|
403 | bracket_op = self._ReadSubscript()
|
404 | else:
|
405 | bracket_op = None
|
406 |
|
407 | part = BracedVarSub.CreateNull()
|
408 | part.name_tok = name_token
|
409 | part.var_name = lexer.TokenVal(name_token)
|
410 | part.bracket_op = bracket_op
|
411 | return part
|
412 |
|
413 | def _ParseVarExpr(self, arg_lex_mode, allow_query=False):
|
414 | # type: (lex_mode_t, bool) -> BracedVarSub
|
415 | """Start parsing at the op -- we already skipped past the name."""
|
416 | part = self._ParseVarOf()
|
417 |
|
418 | self._GetToken()
|
419 | if self.token_type == Id.Right_DollarBrace:
|
420 | return part # no ops
|
421 |
|
422 | op_kind = self.token_kind
|
423 |
|
424 | if op_kind == Kind.VTest:
|
425 | tok = self.cur_token
|
426 | arg_word = self._ReadVarOpArg(arg_lex_mode)
|
427 | if self.token_type != Id.Right_DollarBrace:
|
428 | p_die('Expected } to close ${', self.cur_token)
|
429 |
|
430 | part.suffix_op = suffix_op.Unary(tok, arg_word)
|
431 |
|
432 | elif op_kind == Kind.VOpYsh:
|
433 | tok = self.cur_token
|
434 | arg_word = self._ReadVarOpArg(arg_lex_mode)
|
435 | if self.token_type != Id.Right_DollarBrace:
|
436 | p_die('Expected } to close ${', self.cur_token)
|
437 |
|
438 | UP_arg_word = arg_word
|
439 | with tagswitch(arg_word) as case:
|
440 | if case(rhs_word_e.Empty):
|
441 | pass
|
442 | elif case(rhs_word_e.Compound):
|
443 | arg_word = cast(CompoundWord, UP_arg_word)
|
444 | # This handles ${x|html} and ${x %.3f} now
|
445 | # However I think ${x %.3f} should be statically parsed? It can enter
|
446 | # the printf lexer modes.
|
447 | ok, arg, quoted = word_.StaticEval(arg_word)
|
448 | if not ok or quoted:
|
449 | p_die('Expected a constant argument',
|
450 | loc.Word(arg_word))
|
451 |
|
452 | part.suffix_op = suffix_op.Static(tok, arg)
|
453 |
|
454 | elif op_kind == Kind.VOp0:
|
455 | part.suffix_op = self.cur_token # Nullary
|
456 | self._SetNext(lex_mode_e.VSub_2) # Expecting }
|
457 | self._GetToken()
|
458 |
|
459 | elif op_kind == Kind.VOp1: # % %% # ## etc.
|
460 | tok = self.cur_token
|
461 | # Weird exception that all shells have: these operators take a glob
|
462 | # pattern, so they're lexed as VSub_ArgUnquoted, not VSub_ArgDQ
|
463 | arg_word = self._ReadVarOpArg(lex_mode_e.VSub_ArgUnquoted)
|
464 | if self.token_type != Id.Right_DollarBrace:
|
465 | p_die('Expected } to close ${', self.cur_token)
|
466 |
|
467 | part.suffix_op = suffix_op.Unary(tok, arg_word)
|
468 |
|
469 | elif op_kind == Kind.VOp2: # / : [ ]
|
470 | if self.token_type == Id.VOp2_Slash:
|
471 | patsub_op = self._ReadPatSubVarOp() # type: suffix_op_t
|
472 | part.suffix_op = patsub_op
|
473 |
|
474 | # Checked by the method above
|
475 | assert self.token_type == Id.Right_DollarBrace, self.cur_token
|
476 |
|
477 | elif self.token_type == Id.VOp2_Colon:
|
478 | part.suffix_op = self._ReadSliceVarOp()
|
479 | # NOTE: } in arithmetic mode.
|
480 | if self.token_type != Id.Arith_RBrace:
|
481 |                 # Note: the error token is imprecise; it doesn't point at the X in ${a:1:2 X
|
482 | p_die('Expected } to close ${', self.cur_token)
|
483 |
|
484 | else:
|
485 | # TODO: Does this ever happen?
|
486 | p_die('Unexpected token in ${} (%s)' % 'VOp2', self.cur_token)
|
487 |
|
488 | elif op_kind == Kind.VOp3: # ${prefix@} etc.
|
489 | if allow_query:
|
490 | part.suffix_op = self.cur_token # Nullary
|
491 | self._SetNext(lex_mode_e.VSub_2) # Expecting }
|
492 | self._GetToken()
|
493 | else:
|
494 | p_die("Unexpected token in ${} (%s)" % 'VOp3', self.cur_token)
|
495 |
|
496 | # NOTE: Arith_RBrace is for slicing, because it reads } in arithmetic
|
497 | # mode. It's redundantly checked above.
|
498 | if self.token_type not in (Id.Right_DollarBrace, Id.Arith_RBrace):
|
499 | # ${a.} or ${!a.}
|
500 | p_die('Expected } to close ${', self.cur_token)
|
501 |
|
502 | # Now look for ops
|
503 | return part
|
504 |
|
505 | def _ReadZshVarSub(self, left_token):
|
506 | # type: (Token) -> word_part.ZshVarSub
|
507 |
|
508 | self._SetNext(lex_mode_e.VSub_Zsh) # Move past ${(foo)
|
509 |
|
510 | # Can be empty
|
511 | w = self._ReadCompoundWord3(lex_mode_e.VSub_Zsh, Id.Right_DollarBrace,
|
512 | True)
|
513 | self._GetToken()
|
514 | return word_part.ZshVarSub(left_token, w, self.cur_token)
|
515 |
|
516 | def ReadBracedVarSub(self, left_token):
|
517 | # type: (Token) -> Tuple[BracedVarSub, Token]
|
518 | """ For YSH expressions like var x = ${x:-"default"}. """
|
519 | part = self._ReadBracedVarSub(left_token, d_quoted=False)
|
520 | last_token = self.cur_token
|
521 | return part, last_token
|
522 |
|
523 | def _ReadBracedVarSub(self, left_token, d_quoted):
|
524 | # type: (Token, bool) -> BracedVarSub
|
525 | """For the ${} expression language.
|
526 |
|
527 | NAME = [a-zA-Z_][a-zA-Z0-9_]*
|
528 | NUMBER = [0-9]+ # ${10}, ${11}, ...
|
529 |
|
530 | Subscript = '[' ('@' | '*' | ArithExpr) ']'
|
531 | VarSymbol = '!' | '@' | '#' | ...
|
532 | VarOf = NAME Subscript?
|
533 | | NUMBER # no subscript allowed, none of these are arrays
|
534 | # ${@[1]} doesn't work, even though slicing does
|
535 | | VarSymbol
|
536 |
|
537 | NULLARY_OP = '@Q' | '@E' | '@P' | '@A' | '@a' # VOp0
|
538 |
|
539 | TEST_OP = '-' | ':-' | '=' | ':=' | '+' | ':+' | '?' | ':?'
|
540 | STRIP_OP = '#' | '##' | '%' | '%%'
|
541 | CASE_OP = ',' | ',,' | '^' | '^^'
|
542 | UnaryOp = TEST_OP | STRIP_OP | CASE_OP
|
543 |
|
544 | YSH_UNARY = '|' | ' ' # ${x|html} and ${x %.3f}.
|
545 |                                     # the SPACE is the operator, not %
|
546 | Match = ('/' | '#' | '%') WORD # match all / prefix / suffix
|
547 | VarExpr = VarOf
|
548 | | VarOf NULLARY_OP
|
549 | | VarOf UnaryOp WORD
|
550 | | VarOf YSH_UNARY STATIC_WORD
|
551 | | VarOf ':' ArithExpr (':' ArithExpr )?
|
552 | | VarOf '/' Match '/' WORD
|
553 |
|
554 | LengthExpr = '#' VarOf # can't apply operators after length
|
555 |
|
556 | RefOrKeys = '!' VarExpr # CAN apply operators after a named ref
|
557 | # ${!ref[0]} vs ${!keys[@]} resolved later
|
558 |
|
559 | PrefixQuery = '!' NAME ('*' | '@') # list variable names with a prefix
|
560 |
|
561 | BuiltinSub = '.' WORD+ # ${.myproc 'builtin' $sub}
|
562 |
|
563 | VarSub = LengthExpr
|
564 | | RefOrKeys
|
565 | | PrefixQuery
|
566 | | VarExpr
|
567 | | BuiltinSub
|
568 |
|
569 | NOTES:
|
570 | - Arithmetic expressions are used twice, inside subscripts ${a[x+1]} and
|
571 | slicing ${a:x+1:y+2}
|
572 | - ${#} and ${!} need LL(2) lookahead (considering how my tokenizer works)
|
573 | - @ and * are technically arithmetic expressions in this implementation
|
574 | - We don't account for bash 4.4: ${param@operator} -- Q E P A a. Note that
|
575 | it's also vectorized.
|
576 |
|
577 | Strictness over bash:
|
578 | - echo ${a[0][0]} doesn't do anything useful, so we disallow it from the
|
579 | grammar
|
580 | - ! and # prefixes can't be composed, even though named refs can be
|
581 | composed with other operators
|
582 | - '#' means 4 different things: length prefix, VarSymbol, UnaryOp to strip
|
583 | a prefix, and it can also be a literal part of WORD.
|
584 |
|
585 | From the parser's point of view, the prefix # can't be combined with
|
586 | UnaryOp/slicing/matching, and the ! can. However
|
587 |
|
588 | - ${a[@]:1:2} is not allowed
|
589 | - ${#a[@]:1:2} is allowed, but gives the wrong answer
|
590 | """
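
        # Examples, mapped onto the grammar above:

        #   ${#s}    -> LengthExpr  ('#' is a prefix; disambiguated below with

        #                            LookPastSpace)

        #   ${#}     -> VarExpr     ('#' is the variable itself)

        #   ${!ref}  -> RefOrKeys

        #   ${s:-x}  -> VarExpr with TEST_OP ':-'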
|
591 | if d_quoted:
|
592 | arg_lex_mode = lex_mode_e.VSub_ArgDQ
|
593 | else:
|
594 | arg_lex_mode = lex_mode_e.VSub_ArgUnquoted
|
595 |
|
596 | self._SetNext(lex_mode_e.VSub_1)
|
597 | self._GetToken()
|
598 |
|
599 | ty = self.token_type
|
600 | first_tok = self.cur_token
|
601 |
|
602 | if ty == Id.VSub_Pound:
|
603 | # Disambiguate
|
604 | next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
|
605 | if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
|
606 | # e.g. a name, '#' is the prefix
|
607 | self._SetNext(lex_mode_e.VSub_1)
|
608 | part = self._ParseVarOf()
|
609 |
|
610 | self._GetToken()
|
611 | if self.token_type != Id.Right_DollarBrace:
|
612 | p_die('Expected } after length expression', self.cur_token)
|
613 |
|
614 | part.prefix_op = first_tok
|
615 |
|
616 | else: # not a prefix, '#' is the variable
|
617 | part = self._ParseVarExpr(arg_lex_mode)
|
618 |
|
619 | elif ty == Id.VSub_Bang:
|
620 | next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
|
621 | if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
|
622 | # e.g. a name, '!' is the prefix
|
623 | # ${!a} -- this is a ref
|
624 |                 # ${!3} -- this is a ref
|
625 | # ${!a[1]} -- this is a ref
|
626 | # ${!a[@]} -- this is a keys
|
627 | # No lookahead -- do it in a second step, or at runtime
|
628 | self._SetNext(lex_mode_e.VSub_1)
|
629 | part = self._ParseVarExpr(arg_lex_mode, allow_query=True)
|
630 |
|
631 | part.prefix_op = first_tok
|
632 |
|
633 | else: # not a prefix, '!' is the variable
|
634 | part = self._ParseVarExpr(arg_lex_mode)
|
635 |
|
636 | elif ty == Id.VSub_Dot:
|
637 | # Note: this will become a new builtin_sub type, so this method must
|
638 | # return word_part_t rather than BracedVarSub. I don't think that
|
639 | # should cause problems.
|
640 | p_die('TODO: ${.myproc builtin sub}', self.cur_token)
|
641 |
|
642 | # VS_NAME, VS_NUMBER, symbol that isn't # or !
|
643 | elif self.token_kind == Kind.VSub:
|
644 | part = self._ParseVarExpr(arg_lex_mode)
|
645 |
|
646 | else:
|
647 | # e.g. ${^}
|
648 | p_die('Unexpected token in ${}', self.cur_token)
|
649 |
|
650 | part.left = left_token # attach the argument
|
651 | part.right = self.cur_token
|
652 | return part
|
653 |
|
654 | def _ReadSingleQuoted(self, left_token, lex_mode):
|
655 | # type: (Token, lex_mode_t) -> SingleQuoted
|
656 | """Internal method to read a word_part."""
|
657 | tokens = [] # type: List[Token]
|
658 | # In command mode, we never disallow backslashes like '\'
|
659 | right_quote = self.ReadSingleQuoted(lex_mode, left_token, tokens,
|
660 | False)
|
661 | sval = word_compile.EvalSingleQuoted(left_token.id, tokens)
|
662 | node = SingleQuoted(left_token, sval, right_quote)
|
663 | return node
|
664 |
|
665 | def ReadSingleQuoted(self, lex_mode, left_token, out_tokens, is_ysh_expr):
|
666 | # type: (lex_mode_t, Token, List[Token], bool) -> Token
|
667 | """Appends to out_tokens; returns last token
|
668 |
|
669 | Used by expr_parse.py
|
670 | """
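
        # The counting below handles triple quotes: r''' etc. must see 3 Right

        # tokens in a ROW, so num_end_tokens resets whenever any other token

        # intervenes.  E.g. in r''' a ' b ''' the lone ' does not terminate it.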
|
671 | # TODO: Remove and use out_tokens
|
672 | tokens = [] # type: List[Token]
|
673 |
|
674 | # echo '\' is allowed, but x = '\' is invalid, in favor of x = r'\'
|
675 | no_backslashes = is_ysh_expr and left_token.id == Id.Left_SingleQuote
|
676 |
|
677 | expected_end_tokens = 3 if left_token.id in (
|
678 | Id.Left_TSingleQuote, Id.Left_RTSingleQuote, Id.Left_UTSingleQuote,
|
679 | Id.Left_BTSingleQuote) else 1
|
680 | num_end_tokens = 0
|
681 |
|
682 | while num_end_tokens < expected_end_tokens:
|
683 | self._SetNext(lex_mode)
|
684 | self._GetToken()
|
685 |
|
686 |             # Kind.Char is emitted in lex_mode_e.SQ_C
|
687 | if self.token_kind in (Kind.Lit, Kind.Char):
|
688 | tok = self.cur_token
|
689 | # Happens in lex_mode_e.SQ: 'one\two' is ambiguous, should be
|
690 | # r'one\two' or c'one\\two'
|
691 | if no_backslashes and lexer.TokenContains(tok, '\\'):
|
692 | p_die(
|
693 | r"Strings with backslashes should look like r'\n' or u'\n' or b'\n'",
|
694 | tok)
|
695 |
|
696 | if is_ysh_expr:
|
697 | # Disallow var x = $'\001'. Arguably we don't need these
|
698 | # checks because u'\u{1}' is the way to write it.
|
699 | if self.token_type == Id.Char_Octal3:
|
700 | p_die(
|
701 | r"Use \xhh or \u{...} instead of octal escapes in YSH strings",
|
702 | tok)
|
703 |
|
704 | if self.token_type == Id.Char_Hex and self.cur_token.length != 4:
|
705 | # disallow \xH
|
706 | p_die(
|
707 | r'Invalid hex escape in YSH string (must be \xHH)',
|
708 | tok)
|
709 |
|
710 | tokens.append(tok)
|
711 |
|
712 | elif self.token_kind == Kind.Unknown:
|
713 | tok = self.cur_token
|
714 | assert tok.id == Id.Unknown_Backslash, tok
|
715 |
|
716 | # x = $'\z' is disallowed; ditto for echo $'\z' if shopt -u parse_backslash
|
717 | if is_ysh_expr or not self.parse_opts.parse_backslash():
|
718 | p_die(
|
719 | "Invalid char escape in C-style string literal (OILS-ERR-11)",
|
720 | tok)
|
721 |
|
722 | tokens.append(tok)
|
723 |
|
724 | elif self.token_kind == Kind.Eof:
|
725 | p_die('Unexpected EOF in single-quoted string that began here',
|
726 | left_token)
|
727 |
|
728 | elif self.token_kind == Kind.Right:
|
729 | # assume Id.Right_SingleQuote
|
730 | num_end_tokens += 1
|
731 | tokens.append(self.cur_token)
|
732 |
|
733 | else:
|
734 | raise AssertionError(self.cur_token)
|
735 |
|
736 | if self.token_kind != Kind.Right:
|
737 | num_end_tokens = 0 # we need three in a ROW
|
738 |
|
739 | if expected_end_tokens == 1:
|
740 | tokens.pop()
|
741 | elif expected_end_tokens == 3: # Get rid of spurious end tokens
|
742 | tokens.pop()
|
743 | tokens.pop()
|
744 | tokens.pop()
|
745 |
|
746 | # Remove space from ''' r''' $''' in both expression mode and command mode
|
747 | if left_token.id in (Id.Left_TSingleQuote, Id.Left_RTSingleQuote,
|
748 | Id.Left_UTSingleQuote, Id.Left_BTSingleQuote):
|
749 | word_compile.RemoveLeadingSpaceSQ(tokens)
|
750 |
|
751 | # Validation after lexing - same 2 checks in j8.LexerDecoder
|
752 | is_u_string = left_token.id in (Id.Left_USingleQuote,
|
753 | Id.Left_UTSingleQuote)
|
754 |
|
755 | for tok in tokens:
|
756 | # u'\yff' is not valid, but b'\yff' is
|
757 | if is_u_string and tok.id == Id.Char_YHex:
|
758 | p_die(
|
759 | r"%s escapes not allowed in u'' strings" %
|
760 | lexer.TokenVal(tok), tok)
|
761 |
|
762 | out_tokens.extend(tokens)
|
763 | return self.cur_token
|
764 |
|
765 | def _ReadDoubleQuotedLeftParts(self):
|
766 | # type: () -> word_part_t
|
767 | """Read substitution parts in a double quoted context."""
|
768 | if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick):
|
769 | return self._ReadCommandSub(self.token_type, d_quoted=True)
|
770 |
|
771 | if self.token_type == Id.Left_DollarBrace:
|
772 | return self._ReadBracedVarSub(self.cur_token, d_quoted=True)
|
773 |
|
774 | if self.token_type == Id.Left_DollarDParen:
|
775 | return self._ReadArithSub()
|
776 |
|
777 | if self.token_type == Id.Left_DollarBracket:
|
778 | return self._ReadExprSub(lex_mode_e.DQ)
|
779 |
|
780 | if self.token_type == Id.Left_DollarBraceZsh:
|
781 | return self._ReadZshVarSub(self.cur_token)
|
782 |
|
783 | raise AssertionError(self.cur_token)
|
784 |
|
785 | def _ReadYshSingleQuoted(self, left_id):
|
786 | # type: (Id_t) -> CompoundWord
|
787 | """Read YSH style strings
|
788 |
|
789 | r'' u'' b''
|
790 | r''' ''' u''' ''' b''' '''
|
791 | """
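
        # Triple-quote detection: u''' first lexes as an EMPTY u'' string; if

        # the very next byte is another quote (ByteLookAhead below), the third

        # quote is retagged so that ''' becomes the terminator.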
|
792 | #log('BEF self.cur_token %s', self.cur_token)
|
793 | if left_id == Id.Left_RSingleQuote:
|
794 | lexer_mode = lex_mode_e.SQ_Raw
|
795 | triple_left_id = Id.Left_RTSingleQuote
|
796 | elif left_id == Id.Left_USingleQuote:
|
797 | lexer_mode = lex_mode_e.J8_Str
|
798 | triple_left_id = Id.Left_UTSingleQuote
|
799 | elif left_id == Id.Left_BSingleQuote:
|
800 | lexer_mode = lex_mode_e.J8_Str
|
801 | triple_left_id = Id.Left_BTSingleQuote
|
802 | else:
|
803 | raise AssertionError(left_id)
|
804 |
|
805 | # Needed for syntax checks
|
806 | left_tok = self.cur_token
|
807 | left_tok.id = left_id
|
808 |
|
809 | sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)
|
810 |
|
811 | if (len(sq_part.sval) == 0 and self.lexer.ByteLookAhead() == "'"):
|
812 | self._SetNext(lex_mode_e.ShCommand)
|
813 | self._GetToken()
|
814 |
|
815 | assert self.token_type == Id.Left_SingleQuote
|
816 | # HACK: magically transform the third ' in u''' to
|
817 | # Id.Left_UTSingleQuote, so that ''' is the terminator
|
818 | left_tok = self.cur_token
|
819 | left_tok.id = triple_left_id
|
820 |
|
821 | # Handles stripping leading whitespace
|
822 | sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)
|
823 |
|
824 | # Advance and validate
|
825 | self._SetNext(lex_mode_e.ShCommand)
|
826 |
|
827 | self._GetToken()
|
828 | if self.token_kind not in KINDS_THAT_END_WORDS:
|
829 | p_die('Unexpected token after YSH single-quoted string',
|
830 | self.cur_token)
|
831 |
|
832 | return CompoundWord([sq_part])
|
833 |
|
834 | def _ReadUnquotedLeftParts(self, triple_out):
|
835 | # type: (Optional[BoolParamBox]) -> word_part_t
|
836 | """Read substitutions and quoted strings (for lex_mode_e.ShCommand).
|
837 |
|
838 | If triple_out is set, then we try parsing triple quoted strings,
|
839 | and set its value to True if we got one.
|
840 | """
|
841 | if self.token_type in (Id.Left_DoubleQuote, Id.Left_DollarDoubleQuote):
|
842 | # Note: $"" is a synonym for "". It might make sense if it added
|
843 | # \n \0 \x00 \u{123} etc. But that's not what bash does!
|
844 | dq_part = self._ReadDoubleQuoted(self.cur_token)
|
845 | # Got empty word "" and there's a " after
|
846 | if (triple_out and len(dq_part.parts) == 0 and
|
847 | self.lexer.ByteLookAhead() == '"'):
|
848 |
|
849 | self._SetNext(lex_mode_e.ShCommand)
|
850 | self._GetToken()
|
851 | # HACK: magically transform the third " in """ to
|
852 | # Id.Left_TDoubleQuote, so that """ is the terminator
|
853 | left_dq_token = self.cur_token
|
854 | left_dq_token.id = Id.Left_TDoubleQuote
|
855 | triple_out.b = True # let caller know we got it
|
856 | return self._ReadDoubleQuoted(left_dq_token)
|
857 |
|
858 | return dq_part
|
859 |
|
860 | if self.token_type in (Id.Left_SingleQuote, Id.Left_RSingleQuote,
|
861 | Id.Left_DollarSingleQuote):
|
862 | if self.token_type == Id.Left_SingleQuote:
|
863 | lexer_mode = lex_mode_e.SQ_Raw
|
864 | triple_left_id = Id.Left_TSingleQuote
|
865 | elif self.token_type == Id.Left_RSingleQuote:
|
866 | lexer_mode = lex_mode_e.SQ_Raw
|
867 | triple_left_id = Id.Left_RTSingleQuote
|
868 | else:
|
869 | lexer_mode = lex_mode_e.SQ_C
|
870 | # there is no such thing as $'''
|
871 | triple_left_id = Id.Undefined_Tok
|
872 |
|
873 | sq_part = self._ReadSingleQuoted(self.cur_token, lexer_mode)
|
874 |
|
875 | # Got empty '' or r'' and there's a ' after
|
876 | # u'' and b'' are handled in _ReadYshSingleQuoted
|
877 | if (triple_left_id != Id.Undefined_Tok and
|
878 | triple_out is not None and len(sq_part.sval) == 0 and
|
879 | self.lexer.ByteLookAhead() == "'"):
|
880 |
|
881 | self._SetNext(lex_mode_e.ShCommand)
|
882 | self._GetToken()
|
883 |
|
884 | # HACK: magically transform the third ' in ''' to
|
885 | # Id.Left_TSingleQuote, so that ''' is the terminator
|
886 | left_sq_token = self.cur_token
|
887 | left_sq_token.id = triple_left_id
|
888 |
|
889 | triple_out.b = True # let caller know we got it
|
890 | return self._ReadSingleQuoted(left_sq_token, lexer_mode)
|
891 |
|
892 | return sq_part
|
893 |
|
894 | if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick,
|
895 | Id.Left_ProcSubIn, Id.Left_ProcSubOut):
|
896 | return self._ReadCommandSub(self.token_type, d_quoted=False)
|
897 |
|
898 | if self.token_type == Id.Left_DollarBrace:
|
899 | return self._ReadBracedVarSub(self.cur_token, d_quoted=False)
|
900 |
|
901 | if self.token_type == Id.Left_DollarDParen:
|
902 | return self._ReadArithSub()
|
903 |
|
904 | if self.token_type == Id.Left_DollarBracket:
|
905 | return self._ReadExprSub(lex_mode_e.ShCommand)
|
906 |
|
907 | if self.token_type == Id.Left_DollarBraceZsh:
|
908 | return self._ReadZshVarSub(self.cur_token)
|
909 |
|
910 | raise AssertionError(self.cur_token)
|
911 |
|
912 | def _ReadExtGlob(self):
|
913 | # type: () -> word_part.ExtGlob
|
914 | """
|
915 | Grammar:
|
916 | Item = CompoundWord | EPSILON # important: @(foo|) is allowed
|
917 | LEFT = '@(' | '*(' | '+(' | '?(' | '!('
|
918 | RIGHT = ')'
|
919 | ExtGlob = LEFT (Item '|')* Item RIGHT # ITEM may be empty
|
920 | Compound includes ExtGlob
|
921 | """
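
        # Examples: @(foo|bar) has two arms; @(foo|) and @(||) contain empty

        # arms, which is why the read_word flag is tracked below.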
|
922 | left_token = self.cur_token
|
923 | right_token = None # type: Token
|
924 | arms = [] # type: List[CompoundWord]
|
925 |
|
926 | self.lexer.PushHint(Id.Op_RParen, Id.Right_ExtGlob)
|
927 | self._SetNext(lex_mode_e.ExtGlob) # advance past LEFT
|
928 |
|
929 |         read_word = False  # did we just read a word? To handle @(||).
|
930 |
|
931 | while True:
|
932 | self._GetToken()
|
933 |
|
934 | if self.token_type == Id.Right_ExtGlob:
|
935 | if not read_word:
|
936 | arms.append(CompoundWord([]))
|
937 | right_token = self.cur_token
|
938 | break
|
939 |
|
940 | elif self.token_type == Id.Op_Pipe:
|
941 | if not read_word:
|
942 | arms.append(CompoundWord([]))
|
943 | read_word = False
|
944 | self._SetNext(lex_mode_e.ExtGlob)
|
945 |
|
946 | # lex_mode_e.ExtGlob should only produce these 4 kinds of tokens
|
947 | elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub,
|
948 | Kind.ExtGlob):
|
949 | w = self._ReadCompoundWord(lex_mode_e.ExtGlob)
|
950 | arms.append(w)
|
951 | read_word = True
|
952 |
|
953 | elif self.token_kind == Kind.Eof:
|
954 | p_die('Unexpected EOF reading extended glob that began here',
|
955 | left_token)
|
956 |
|
957 | else:
|
958 | raise AssertionError(self.cur_token)
|
959 |
|
960 | return word_part.ExtGlob(left_token, arms, right_token)
|
961 |
|
962 | def _ReadBashRegexGroup(self):
|
963 | # type: () -> word_part.BashRegexGroup
|
964 | """
|
965 | Grammar:
|
966 |             BashRegexGroup = '(' WORD? ')'
|
967 | """
|
968 | left_token = self.cur_token
|
969 | assert left_token.id == Id.BashRegex_LParen, left_token
|
970 |
|
971 | arms = [] # type: List[CompoundWord]
|
972 |
|
973 | self.lexer.PushHint(Id.Op_RParen, Id.Right_BashRegexGroup)
|
974 | self._SetNext(lex_mode_e.BashRegexFakeInner) # advance past LEFT
|
975 |
|
976 | self._GetToken()
|
977 | if self.token_type == Id.Right_BashRegexGroup: # empty ()
|
978 | return word_part.BashRegexGroup(left_token, None, self.cur_token)
|
979 |
|
980 | # lex_mode_e.BashRegex should only produce these 4 kinds of tokens
|
981 | if self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub, Kind.BashRegex):
|
982 | # Fake lexer mode that translates Id.WS_Space to Id.Lit_Chars
|
983 | # To allow bash style [[ s =~ (a b) ]]
|
984 | w = self._ReadCompoundWord(lex_mode_e.BashRegexFakeInner)
|
985 | arms.append(w)
|
986 |
|
987 | self._GetToken()
|
988 | if self.token_type != Id.Right_BashRegexGroup:
|
989 | p_die('Expected ) to close bash regex group', self.cur_token)
|
990 |
|
991 | return word_part.BashRegexGroup(left_token, w, self.cur_token)
|
992 |
|
993 | p_die('Expected word after ( opening bash regex group', self.cur_token)
|
994 |
|
995 | def _ReadLikeDQ(self, left_token, is_ysh_expr, out_parts):
|
996 | # type: (Optional[Token], bool, List[word_part_t]) -> None
|
997 | """
|
998 | Args:
|
999 | left_token: A token if we are reading a double quoted part, or None if
|
1000 | we're reading a here doc.
|
1001 | is_ysh_expr: Whether to disallow backticks and invalid char escapes
|
1002 | out_parts: list of word_part to append to
|
1003 | """
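
        # E.g. "a $x $(date)" appends three parts: a Lit_Chars token, a

        # SimpleVarSub, and a CommandSub (via _ReadDoubleQuotedLeftParts).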
|
1004 | if left_token:
|
1005 | if left_token.id in (Id.Left_TDoubleQuote,
|
1006 | Id.Left_DollarTDoubleQuote):
|
1007 | expected_end_tokens = 3
|
1008 | else:
|
1009 | expected_end_tokens = 1
|
1010 | else:
|
1011 | expected_end_tokens = 1000 # here doc will break
|
1012 |
|
1013 | num_end_tokens = 0
|
1014 | while num_end_tokens < expected_end_tokens:
|
1015 | self._SetNext(lex_mode_e.DQ)
|
1016 | self._GetToken()
|
1017 |
|
1018 | if self.token_kind == Kind.Lit:
|
1019 | if self.token_type == Id.Lit_EscapedChar:
|
1020 | tok = self.cur_token
|
1021 | ch = lexer.TokenSliceLeft(tok, 1)
|
1022 | part = word_part.EscapedLiteral(tok,
|
1023 | ch) # type: word_part_t
|
1024 | else:
|
1025 | if self.token_type == Id.Lit_BadBackslash:
|
1026 |                         # echo "\z" is OK in shell, but x = "\z" is a syntax error in

1027 |                         # YSH.

1028 |                         # Slight hole: We don't catch x = ${undef:-"\z"} because of the
|
1029 | # recursion (unless parse_backslash)
|
1030 | if (is_ysh_expr or
|
1031 | not self.parse_opts.parse_backslash()):
|
1032 | p_die(
|
1033 | "Invalid char escape in double quoted string (OILS-ERR-12)",
|
1034 | self.cur_token)
|
1035 | elif self.token_type == Id.Lit_Dollar:
|
1036 | if is_ysh_expr or not self.parse_opts.parse_dollar():
|
1037 | p_die("Literal $ should be quoted like \$",
|
1038 | self.cur_token)
|
1039 |
|
1040 | part = self.cur_token
|
1041 | out_parts.append(part)
|
1042 |
|
1043 | elif self.token_kind == Kind.Left:
|
1044 | if self.token_type == Id.Left_Backtick and is_ysh_expr:
|
1045 | p_die("Invalid backtick: use $(cmd) or \\` in YSH strings",
|
1046 | self.cur_token)
|
1047 |
|
1048 | part = self._ReadDoubleQuotedLeftParts()
|
1049 | out_parts.append(part)
|
1050 |
|
1051 | elif self.token_kind == Kind.VSub:
|
1052 | tok = self.cur_token
|
1053 | part = SimpleVarSub(tok)
|
1054 | out_parts.append(part)
|
1055 |                 # NOTE: parsing "$f(x)" would BREAK CODE. Could add a mode for it
|
1056 | # later.
|
1057 |
|
1058 | elif self.token_kind == Kind.Right:
|
1059 | assert self.token_type == Id.Right_DoubleQuote, self.token_type
|
1060 | if left_token:
|
1061 | num_end_tokens += 1
|
1062 |
|
1063 | # In a here doc, the right quote is literal!
|
1064 | out_parts.append(self.cur_token)
|
1065 |
|
1066 | elif self.token_kind == Kind.Eof:
|
1067 | if left_token:
|
1068 | p_die(
|
1069 | 'Unexpected EOF reading double-quoted string that began here',
|
1070 | left_token)
|
1071 | else: # here docs will have an EOF in their token stream
|
1072 | break
|
1073 |
|
1074 | else:
|
1075 | raise AssertionError(self.cur_token)
|
1076 |
|
1077 | if self.token_kind != Kind.Right:
|
1078 | num_end_tokens = 0 # """ must be CONSECUTIVE
|
1079 |
|
1080 | if expected_end_tokens == 1:
|
1081 | out_parts.pop()
|
1082 | elif expected_end_tokens == 3:
|
1083 | out_parts.pop()
|
1084 | out_parts.pop()
|
1085 | out_parts.pop()
|
1086 |
|
1087 | # Remove space from """ in both expression mode and command mode
|
1088 | if (left_token and left_token.id
|
1089 | in (Id.Left_TDoubleQuote, Id.Left_DollarTDoubleQuote)):
|
1090 | word_compile.RemoveLeadingSpaceDQ(out_parts)
|
1091 |
|
1092 | # Return nothing, since we appended to 'out_parts'
|
1093 |
|
1094 | def _ReadDoubleQuoted(self, left_token):
|
1095 | # type: (Token) -> DoubleQuoted
|
1096 | """Helper function for "hello $name".
|
1097 |
|
1098 |         Args:

1099 |           left_token: the opening quote token

1100 |

1101 |         Also ${foo%%a b c} -- treat this as double quoted until you hit

1102 |         the closing }
|
1103 | """
|
1104 | parts = [] # type: List[word_part_t]
|
1105 | self._ReadLikeDQ(left_token, False, parts)
|
1106 |
|
1107 | right_quote = self.cur_token
|
1108 | return DoubleQuoted(left_token, parts, right_quote)
|
1109 |
|
1110 | def ReadDoubleQuoted(self, left_token, parts):
|
1111 | # type: (Token, List[word_part_t]) -> Token
|
1112 | """For expression mode.
|
1113 |
|
1114 | Read var x = "${dir:-}/$name"; etc.
|
1115 | """
|
1116 | self._ReadLikeDQ(left_token, True, parts)
|
1117 | return self.cur_token
|
1118 |
|
1119 | def _ReadCommandSub(self, left_id, d_quoted=False):
|
1120 | # type: (Id_t, bool) -> CommandSub
|
1121 | """
|
1122 | NOTE: This is not in the grammar, because word parts aren't in the grammar!
|
1123 |
|
1124 | command_sub = '$(' command_list ')'
|
1125 | | '@(' command_list ')'
|
1126 | | '<(' command_list ')'
|
1127 | | '>(' command_list ')'
|
1128 | | ` command_list `
|
1129 | """
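
        # Note on plain backticks: the text between ` and ` is collected into

        # code_str below and re-parsed with a fresh StringLineReader, after

        # unescaping \` (and \" when double-quoted).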
|
1130 | left_token = self.cur_token
|
1131 |
|
1132 | # Set the lexer in a state so ) becomes the EOF token.
|
1133 | if left_id in (Id.Left_DollarParen, Id.Left_AtParen, Id.Left_ProcSubIn,
|
1134 | Id.Left_ProcSubOut):
|
1135 | self._SetNext(lex_mode_e.ShCommand) # advance past $( etc.
|
1136 |
|
1137 | right_id = Id.Eof_RParen
|
1138 | self.lexer.PushHint(Id.Op_RParen, right_id)
|
1139 | c_parser = self.parse_ctx.MakeParserForCommandSub(
|
1140 | self.line_reader, self.lexer, right_id)
|
1141 | # NOTE: This doesn't use something like main_loop because we don't want
|
1142 | # to interleave parsing and execution! Unlike 'source' and 'eval'.
|
1143 | node = c_parser.ParseCommandSub()
|
1144 |
|
1145 | right_token = c_parser.w_parser.cur_token
|
1146 |
|
1147 | elif left_id == Id.Left_Backtick and self.parse_ctx.do_lossless:
|
1148 | # NOTE: This is an APPROXIMATE solution for translation ONLY. See
|
1149 | # test/osh2oil.
|
1150 |
|
1151 | right_id = Id.Eof_Backtick
|
1152 | self.lexer.PushHint(Id.Left_Backtick, right_id)
|
1153 | c_parser = self.parse_ctx.MakeParserForCommandSub(
|
1154 | self.line_reader, self.lexer, right_id)
|
1155 | node = c_parser.ParseCommandSub()
|
1156 | right_token = c_parser.w_parser.cur_token
|
1157 |
|
1158 | elif left_id == Id.Left_Backtick:
|
1159 | if not self.parse_opts.parse_backticks():
|
1160 | p_die('Use $(cmd) instead of backticks (parse_backticks)',
|
1161 | left_token)
|
1162 |
|
1163 | self._SetNext(lex_mode_e.Backtick) # advance past `
|
1164 |
|
1165 | parts = [] # type: List[str]
|
1166 | while True:
|
1167 | self._GetToken()
|
1168 | #log("TOK %s", self.cur_token)
|
1169 |
|
1170 | if self.token_type == Id.Backtick_Quoted:
|
1171 | # Remove leading \
|
1172 | parts.append(lexer.TokenSliceLeft(self.cur_token, 1))
|
1173 |
|
1174 | elif self.token_type == Id.Backtick_DoubleQuote:
|
1175 | # Compatibility: If backticks are double quoted, then double quotes
|
1176 | # within them have to be \"
|
1177 | # Shells aren't smart enough to match nested " and ` quotes (but OSH
|
1178 | # is)
|
1179 | if d_quoted:
|
1180 | # Remove leading \
|
1181 | parts.append(lexer.TokenSliceLeft(self.cur_token, 1))
|
1182 | else:
|
1183 | parts.append(lexer.TokenVal(self.cur_token))
|
1184 |
|
1185 | elif self.token_type == Id.Backtick_Other:
|
1186 | parts.append(lexer.TokenVal(self.cur_token))
|
1187 |
|
1188 | elif self.token_type == Id.Backtick_Right:
|
1189 | break
|
1190 |
|
1191 | elif self.token_type == Id.Eof_Real:
|
1192 | # Note: this parse error is in the ORIGINAL context. No code_str yet.
|
1193 | p_die('Unexpected EOF while looking for closing backtick',
|
1194 | left_token)
|
1195 |
|
1196 | else:
|
1197 | raise AssertionError(self.cur_token)
|
1198 |
|
1199 | self._SetNext(lex_mode_e.Backtick)
|
1200 |
|
1201 | # Calculate right SPID on CommandSub BEFORE re-parsing.
|
1202 | right_token = self.cur_token
|
1203 |
|
1204 | code_str = ''.join(parts)
|
1205 | #log('code %r', code_str)
|
1206 |
|
1207 | # NOTE: This is similar to how we parse aliases in osh/cmd_parse.py. It
|
1208 | # won't have the same location info as MakeParserForCommandSub(), because
|
1209 | # the lexer is different.
|
1210 | arena = self.parse_ctx.arena
|
1211 | #arena = alloc.Arena()
|
1212 | line_reader = reader.StringLineReader(code_str, arena)
|
1213 | c_parser = self.parse_ctx.MakeOshParser(line_reader)
|
1214 | src = source.Reparsed('backticks', left_token, right_token)
|
1215 | with alloc.ctx_SourceCode(arena, src):
|
1216 | node = c_parser.ParseCommandSub()
|
1217 |
|
1218 | else:
|
1219 | raise AssertionError(left_id)
|
1220 |
|
1221 | return CommandSub(left_token, node, right_token)
|
1222 |
|
1223 | def _ReadExprSub(self, lex_mode):
|
1224 | # type: (lex_mode_t) -> word_part.ExprSub
|
1225 | """$[d->key] $[obj.method()] etc."""
|
1226 | left_token = self.cur_token
|
1227 |
|
1228 | self._SetNext(lex_mode_e.Expr)
|
1229 | enode, right_token = self.parse_ctx.ParseYshExpr(
|
1230 | self.lexer, grammar_nt.ysh_expr_sub)
|
1231 |
|
1232 | self._SetNext(lex_mode) # Move past ]
|
1233 | return word_part.ExprSub(left_token, enode, right_token)
|
1234 |
|
1235 | def ParseVarDecl(self, kw_token):
|
1236 | # type: (Token) -> command.VarDecl
|
1237 | """
|
1238 | oil_var_decl: name_type_list '=' testlist end_stmt
|
1239 |
|
1240 | Note that assignments must end with \n ; } or EOF. Unlike shell
|
1241 | assignments, we disallow:
|
1242 |
|
1243 | var x = 42 | wc -l
|
1244 | var x = 42 && echo hi
|
1245 | """
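
        # E.g. for 'var x = 42;' the expression parser consumes through the

        # ';', which is buffered below so the CommandParser still sees it.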
|
1246 | self._SetNext(lex_mode_e.Expr)
|
1247 | enode, last_token = self.parse_ctx.ParseVarDecl(kw_token, self.lexer)
|
1248 |         # Hack to move } from what the Expr lexer mode gives to what CommandParser
|
1249 | # wants
|
1250 | if last_token.id == Id.Op_RBrace:
|
1251 | last_token.id = Id.Lit_RBrace
|
1252 |
|
1253 | # Let the CommandParser see the Op_Semi or Op_Newline.
|
1254 | self.buffered_word = last_token
|
1255 | self._SetNext(lex_mode_e.ShCommand) # always back to this
|
1256 | return enode
|
1257 |
|
1258 | def ParseMutation(self, kw_token, var_checker):
|
1259 | # type: (Token, VarChecker) -> command.Mutation
|
1260 | """
|
1261 | setvar i = 42
|
1262 | setvar i += 1
|
1263 | setvar a[i] = 42
|
1264 | setvar a[i] += 1
|
1265 | setvar d.key = 42
|
1266 | setvar d.key += 1
|
1267 | """
|
1268 | self._SetNext(lex_mode_e.Expr)
|
1269 | enode, last_token = self.parse_ctx.ParseMutation(kw_token, self.lexer)
|
1270 |         # Hack to move } from what the Expr lexer mode gives to what CommandParser
|
1271 | # wants
|
1272 | if last_token.id == Id.Op_RBrace:
|
1273 | last_token.id = Id.Lit_RBrace
|
1274 |
|
1275 | for lhs in enode.lhs:
|
1276 | UP_lhs = lhs
|
1277 | with tagswitch(lhs) as case:
|
1278 | if case(y_lhs_e.Var):
|
1279 | lhs = cast(Token, UP_lhs)
|
1280 | var_checker.Check(kw_token.id, lexer.LazyStr(lhs), lhs)
|
1281 |
|
1282 | # Note: this does not cover cases like
|
1283 | # setvar (a[0])[1] = v
|
1284 | # setvar (d.key).other = v
|
1285 | # This leaks into catching all typos statically, which may be
|
1286 | # possible if 'use' makes all names explicit.
|
1287 | elif case(y_lhs_e.Subscript):
|
1288 | lhs = cast(Subscript, UP_lhs)
|
1289 | if lhs.obj.tag() == expr_e.Var:
|
1290 | v = cast(expr.Var, lhs.obj)
|
1291 | var_checker.Check(kw_token.id, v.name, v.left)
|
1292 |
|
1293 | elif case(y_lhs_e.Attribute):
|
1294 | lhs = cast(Attribute, UP_lhs)
|
1295 | if lhs.obj.tag() == expr_e.Var:
|
1296 | v = cast(expr.Var, lhs.obj)
|
1297 | var_checker.Check(kw_token.id, v.name, v.left)
|
1298 |
|
1299 | # Let the CommandParser see the Op_Semi or Op_Newline.
|
1300 | self.buffered_word = last_token
|
1301 | self._SetNext(lex_mode_e.ShCommand) # always back to this
|
1302 | return enode
|
1303 |
|
1304 | def ParseBareDecl(self):
|
1305 | # type: () -> expr_t
|
1306 | """
|
1307 | x = {name: val}
|
1308 | """
|
1309 | self._SetNext(lex_mode_e.Expr)
|
1310 | self._GetToken()
|
1311 | enode, last_token = self.parse_ctx.ParseYshExpr(
|
1312 | self.lexer, grammar_nt.command_expr)
|
1313 | if last_token.id == Id.Op_RBrace:
|
1314 | last_token.id = Id.Lit_RBrace
|
1315 | self.buffered_word = last_token
|
1316 | self._SetNext(lex_mode_e.ShCommand)
|
1317 | return enode
|
1318 |
|
1319 | def ParseYshExprForCommand(self):
|
1320 | # type: () -> expr_t
|
1321 |
|
1322 | # Fudge for this case
|
1323 | # for x in(y) {
|
1324 | # versus
|
1325 | # for x in (y) {
|
1326 | #
|
1327 | # In the former case, ReadWord on 'in' puts the lexer past (.
|
1328 |         # Also see LookPastSpace in CommandParser.
|
1329 | # A simpler solution would be nicer.
|
1330 |
|
1331 | if self.token_type == Id.Op_LParen:
|
1332 | self.lexer.MaybeUnreadOne()
|
1333 |
|
1334 | enode, _ = self.parse_ctx.ParseYshExpr(self.lexer, grammar_nt.ysh_expr)
|
1335 |
|
1336 | self._SetNext(lex_mode_e.ShCommand)
|
1337 | return enode
|
1338 |
|
1339 | def ParseCommandExpr(self):
|
1340 | # type: () -> expr_t
|
1341 | """
|
1342 | = 1+2
|
1343 | """
|
1344 | enode, last_token = self.parse_ctx.ParseYshExpr(
|
1345 | self.lexer, grammar_nt.command_expr)
|
1346 |
|
1347 | # In some cases, such as the case statement, we expect *the lexer* to be
|
1348 | # pointing at the token right after the expression. But the expression
|
1349 | # parser must have read to the `last_token`. Unreading places the lexer
|
1350 | # back in the expected state. Ie:
|
1351 | #
|
1352 | # case (x) { case (x) {
|
1353 | # (else) { = x } (else) { = x }
|
1354 | # ^ The lexer is here ^ Unread to here
|
1355 | # } }
|
1356 | assert last_token.id in (Id.Op_Newline, Id.Eof_Real, Id.Op_Semi,
|
1357 | Id.Op_RBrace), last_token
|
1358 | if last_token.id != Id.Eof_Real:
|
1359 | # Eof_Real is the only token we cannot unread
|
1360 | self.lexer.MaybeUnreadOne()
|
1361 |
|
1362 | return enode
|
1363 |
|
1364 | def ParseProc(self, node):
|
1365 | # type: (Proc) -> None
|
1366 |
|
1367 | # proc name-with-hyphens() must be accepted
|
1368 | self._SetNext(lex_mode_e.ShCommand)
|
1369 | self._GetToken()
|
1370 | # example: 'proc f[' gets you Lit_ArrayLhsOpen
|
1371 | if self.token_type != Id.Lit_Chars:
|
1372 | p_die('Invalid proc name %s' % ui.PrettyToken(self.cur_token),
|
1373 | self.cur_token)
|
1374 |
|
1375 | # TODO: validate this more. Disallow proc 123 { }, which isn't disallowed
|
1376 | # for shell functions. Similar to IsValidVarName().
|
1377 | node.name = self.cur_token
|
1378 |
|
1379 | last_token = self.parse_ctx.ParseProc(self.lexer, node)
|
1380 |
|
1381 | # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
|
1382 | assert last_token.id == Id.Op_LBrace
|
1383 | last_token.id = Id.Lit_LBrace
|
1384 | self.buffered_word = last_token
|
1385 |
|
1386 | self._SetNext(lex_mode_e.ShCommand)
|
1387 |
|
1388 | def ParseFunc(self, node):
|
1389 | # type: (Func) -> None
|
1390 | last_token = self.parse_ctx.ParseFunc(self.lexer, node)
|
1391 |
|
1392 | # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
|
1393 | assert last_token.id == Id.Op_LBrace
|
1394 | last_token.id = Id.Lit_LBrace
|
1395 | self.buffered_word = last_token
|
1396 |
|
1397 | self._SetNext(lex_mode_e.ShCommand)
|
1398 |
|
1399 | def ParseYshCasePattern(self):
|
1400 | # type: () -> Tuple[pat_t, Token]
|
1401 | pat, left_tok, last_token = self.parse_ctx.ParseYshCasePattern(
|
1402 | self.lexer)
|
1403 |
|
1404 | if last_token.id == Id.Op_LBrace:
|
1405 | last_token.id = Id.Lit_LBrace
|
1406 | self.buffered_word = last_token
|
1407 |
|
1408 | return pat, left_tok
|
1409 |
|
1410 | def NewlineOkForYshCase(self):
|
1411 | # type: () -> Id_t
|
1412 | """Check for optional newline and consume it.
|
1413 |
|
1414 |         This is a special case of `_NewlineOk` which fixes some "off-by-one" issues

1415 |         that crop up while parsing YSH case arms. For more details, see
|
1416 | #oil-dev > Progress On YSH Case Grammar on zulip.
|
1417 |
|
1418 |         Returns a token id indicating which of the following comes next:
|
1419 |
|
1420 | word { echo word }
|
1421 | (3) { echo expr }
|
1422 | /e/ { echo eggex }
|
1423 | } # right brace
|
1424 | """
|
1425 | while True:
|
1426 | next_id = self.lexer.LookAheadOne(lex_mode_e.Expr)
|
1427 |
|
1428 |             # Cannot look ahead past lines
|
1429 | if next_id == Id.Unknown_Tok:
|
1430 | if not self.lexer.MoveToNextLine(): # Try to move to next line
|
1431 | break # EOF
|
1432 | continue
|
1433 |
|
1434 | next_kind = consts.GetKind(next_id)
|
1435 | if next_id != Id.Op_Newline and next_kind != Kind.Ignored:
|
1436 | break
|
1437 |
|
1438 | self.lexer.Read(lex_mode_e.Expr)
|
1439 |
|
1440 | if next_id in (Id.Op_RBrace, Id.Op_LParen, Id.Arith_Slash):
|
1441 | self._SetNext(lex_mode_e.Expr) # Continue in expression mode
|
1442 | else:
|
1443 | # Consume the trailing Op_Newline
|
1444 | self._SetNext(lex_mode_e.ShCommand)
|
1445 | self._GetToken()
|
1446 |
|
1447 | return next_id
|
1448 |
|
1449 | def _ReadArithExpr(self, end_id):
|
1450 | # type: (Id_t) -> arith_expr_t
|
1451 | """Read and parse an arithmetic expression in various contexts.
|
1452 |
|
1453 | $(( 1+2 ))
|
1454 | (( a=1+2 ))
|
1455 | ${a[ 1+2 ]}
|
1456 | ${a : 1+2 : 1+2}
|
1457 |
|
1458 | See tests/arith-context.test.sh for ambiguous cases.
|
1459 |
|
1460 | ${a[a[0]]} is valid # VS_RBRACKET vs Id.Arith_RBracket
|
1461 |
|
1462 | ${s : a<b?0:1 : 1} # VS_COLON vs Id.Arith_Colon
|
1463 |
|
1464 | See the assertion in ArithParser.Parse() -- unexpected extra input.
|
1465 | """
|
1466 | # calls self.ReadWord(lex_mode_e.Arith)
|
1467 | anode = self.a_parser.Parse()
|
1468 | cur_id = self.a_parser.CurrentId()
|
1469 | if end_id != Id.Undefined_Tok and cur_id != end_id:
|
1470 | p_die(
|
1471 | 'Unexpected token after arithmetic expression (%s != %s)' %
|
1472 | (ui.PrettyId(cur_id), ui.PrettyId(end_id)),
|
1473 | loc.Word(self.a_parser.cur_word))
|
1474 | return anode
|
1475 |
|
1476 | def _ReadArithSub(self):
|
1477 | # type: () -> word_part.ArithSub
|
1478 | """Read an arith substitution, which contains an arith expression, e.g.
|
1479 |
|
1480 | $((a + 1)).
|
1481 | """
|
1482 | left_tok = self.cur_token
|
1483 |
|
1484 |         # The second ) needs to be disambiguated in stuff like:
|
1485 | # $(echo $(( 1+2 )) )
|
1486 | self.lexer.PushHint(Id.Op_RParen, Id.Right_DollarDParen)
|
1487 |
|
1488 | # NOTE: To disambiguate $(( as arith sub vs. command sub and subshell, we
|
1489 | # could save the lexer/reader state here, and retry if the arithmetic parse
|
1490 | # fails. But we can almost always catch this at parse time. There could
|
1491 | # be some exceptions like:
|
1492 | # $((echo * foo)) # looks like multiplication
|
1493 | # $((echo / foo)) # looks like division
|
1494 |
|
1495 | # $(( )) is valid
|
1496 | anode = arith_expr.EmptyZero # type: arith_expr_t
|
1497 |
|
1498 | self._NextNonSpace()
|
1499 | if self.token_type != Id.Arith_RParen:
|
1500 | anode = self._ReadArithExpr(Id.Arith_RParen)
|
1501 |
|
1502 | self._SetNext(lex_mode_e.ShCommand)
|
1503 |
|
1504 | # Ensure we get closing )
|
1505 | self._GetToken()
|
1506 | if self.token_type != Id.Right_DollarDParen:
|
1507 | p_die('Expected second ) to end arith sub', self.cur_token)
|
1508 |
|
1509 | right_tok = self.cur_token
|
1510 | return word_part.ArithSub(left_tok, anode, right_tok)
|
1511 |
|
1512 | def ReadDParen(self):
|
1513 | # type: () -> Tuple[arith_expr_t, Token]
|
1514 | """Read ((1+ 2)) -- command context.
|
1515 |
|
1516 | We're using the word parser because it's very similar to _ReadArithExpr
|
1517 | above.
|
1518 |
|
1519 | This also returns the terminating Id.Op_DRightParen token for location
|
1520 | info.
|
1521 | """
|
1522 | # (( )) is valid
|
1523 | anode = arith_expr.EmptyZero # type: arith_expr_t
|
1524 |
|
1525 | self.lexer.PushHint(Id.Op_RParen, Id.Op_DRightParen)
|
1526 |
|
1527 | self._NextNonSpace()
|
1528 | if self.token_type != Id.Arith_RParen:
|
1529 | anode = self._ReadArithExpr(Id.Arith_RParen)
|
1530 |
|
1531 | self._SetNext(lex_mode_e.ShCommand)
|
1532 |
|
1533 | # Ensure we get the second )
|
1534 | self._GetToken()
|
1535 | right = self.cur_token
|
1536 | if right.id != Id.Op_DRightParen:
|
1537 | p_die('Expected second ) to end arith statement', right)
|
1538 |
|
1539 | self._SetNext(lex_mode_e.ShCommand)
|
1540 |
|
1541 | return anode, right
|
1542 |
|
1543 | def _NextNonSpace(self):
|
1544 | # type: () -> None
|
1545 | """Advance in lex_mode_e.Arith until non-space token.
|
1546 |
|
1547 | Same logic as _ReadWord, but used in
|
1548 | $(( ))
|
1549 | (( ))
|
1550 | for (( ))
|
1551 |
|
1552 | You can read self.token_type after this, without calling _GetToken.
|
1553 | """
|
1554 | while True:
|
1555 | self._SetNext(lex_mode_e.Arith)
|
1556 | self._GetToken()
|
1557 | if self.token_kind not in (Kind.Ignored, Kind.WS):
|
1558 | break
|
1559 |
|
1560 | def ReadForExpression(self):
|
1561 | # type: () -> command.ForExpr
|
1562 | """Read ((i=0; i<5; ++i)) -- part of command context."""
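
        # E.g. (( i=0; i<5; ++i )) yields init/cond/update arith nodes.

        # Omitted sections default to EmptyZero, except an empty condition,

        # which means TRUE (EmptyOne).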
|
1563 | self._NextNonSpace() # skip over ((
|
1564 | cur_id = self.token_type # for end of arith expressions
|
1565 |
|
1566 | if cur_id == Id.Arith_Semi: # for (( ; i < 10; i++ ))
|
1567 | init_node = arith_expr.EmptyZero # type: arith_expr_t
|
1568 | else:
|
1569 | init_node = self.a_parser.Parse()
|
1570 | cur_id = self.a_parser.CurrentId()
|
1571 | self._NextNonSpace()
|
1572 |
|
1573 | # It's odd to keep track of both cur_id and self.token_type in this
|
1574 | # function, but it works, and is tested in 'test/parse_error.sh
|
1575 | # arith-integration'
|
1576 | if cur_id != Id.Arith_Semi: # for (( x=0 b; ... ))
|
1577 | p_die("Expected ; here", loc.Word(self.a_parser.cur_word))
|
1578 |
|
1579 | self._GetToken()
|
1580 | cur_id = self.token_type
|
1581 |
|
1582 | if cur_id == Id.Arith_Semi: # for (( ; ; i++ ))
|
1583 | # empty condition is TRUE
|
1584 | cond_node = arith_expr.EmptyOne # type: arith_expr_t
|
1585 | else:
|
1586 | cond_node = self.a_parser.Parse()
|
1587 | cur_id = self.a_parser.CurrentId()
|
1588 |
|
1589 | if cur_id != Id.Arith_Semi: # for (( x=0; x<5 b ))
|
1590 | p_die("Expected ; here", loc.Word(self.a_parser.cur_word))
|
1591 |
|
1592 | self._NextNonSpace()
|
1593 | if self.token_type == Id.Arith_RParen: # for (( ; ; ))
|
1594 | update_node = arith_expr.EmptyZero # type: arith_expr_t
|
1595 | else:
|
1596 | update_node = self._ReadArithExpr(Id.Arith_RParen)
|
1597 |
|
1598 | self._NextNonSpace()
|
1599 | if self.token_type != Id.Arith_RParen:
|
1600 | p_die('Expected ) to end for loop expression', self.cur_token)
|
1601 | self._SetNext(lex_mode_e.ShCommand)
|
1602 |
|
1603 | # redirects is None, will be assigned in CommandEvaluator
|
1604 | node = command.ForExpr.CreateNull()
|
1605 | node.init = init_node
|
1606 | node.cond = cond_node
|
1607 | node.update = update_node
|
1608 | return node
|
1609 |
|
1610 | def _ReadArrayLiteral(self):
|
1611 | # type: () -> word_part_t
|
1612 | """a=(1 2 3)
|
1613 |
|
1614 | TODO: See osh/cmd_parse.py:164 for Id.Lit_ArrayLhsOpen, for a[x++]=1
|
1615 |
|
1616 | We want:
|
1617 |
|
1618 | A=(['x']=1 ["x"]=2 [$x$y]=3)
|
1619 |
|
1620 | Maybe allow this as a literal string?  I think I've seen it before.
|
1621 | Or maybe force people to patch their scripts, so they learn the rule.
|
1622 |
|
1623 | A=([x]=4)
|
1624 |
|
1625 | The key starts with Lit_Other '[' and ends with Lit_ArrayLhsClose.
|
1626 | Maybe enforce that ALL of them have keys, or NONE of them do.
|
1627 | """
|
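# An informal sketch of the outcomes below:
#
#   a=()                 -> ShArrayLiteral with no words
#   a=(1 2 3)            -> ShArrayLiteral with 3 words
#   a=([k1]=v1 [k2]=v2)  -> BashAssocLiteral with 2 pairs
#   a=([k1]=v1 2)        -> p_die: 'Expected associative array pair'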
1628 | self._SetNext(lex_mode_e.ShCommand) # advance past (
|
1629 | self._GetToken()
|
1630 | if self.cur_token.id != Id.Op_LParen:
|
1631 | p_die('Expected ( after =', self.cur_token)
|
1632 | left_token = self.cur_token
|
1633 | right_token = None # type: Token
|
1634 |
|
1635 | # MUST use a new word parser (with same lexer).
|
1636 | w_parser = self.parse_ctx.MakeWordParser(self.lexer, self.line_reader)
|
1637 | words = [] # type: List[CompoundWord]
|
1638 | done = False
|
1639 | while not done:
|
1640 | w = w_parser.ReadWord(lex_mode_e.ShCommand)
|
1641 | with tagswitch(w) as case:
|
1642 | if case(word_e.Operator):
|
1643 | tok = cast(Token, w)
|
1644 | if tok.id == Id.Right_ShArrayLiteral:
|
1645 | right_token = tok
|
1646 | done = True # can't use break here
|
1647 | # Unlike command parsing, array parsing allows embedded \n.
|
1648 | elif tok.id == Id.Op_Newline:
|
1649 | continue
|
1650 | else:
|
1651 | p_die('Unexpected token in array literal', loc.Word(w))
|
1652 |
|
1653 | elif case(word_e.Compound):
|
1654 | words.append(cast(CompoundWord, w))
|
1655 |
|
1656 | else:
|
1657 | raise AssertionError()
|
1658 |
|
1659 | if len(words) == 0: # a=() is empty indexed array
|
1660 | # Needed for type safety, doh
|
1661 | no_words = [] # type: List[word_t]
|
1662 | node = ShArrayLiteral(left_token, no_words, right_token)
|
1663 | return node
|
1664 |
|
1665 | pairs = [] # type: List[AssocPair]
|
1666 | # If the first one is a key/value pair, then the rest are assumed to be.
|
1667 | pair = word_.DetectAssocPair(words[0])
|
1668 | if pair:
|
1669 | pairs.append(pair)
|
1670 |
|
1671 | n = len(words)
|
1672 | for i in xrange(1, n):
|
1673 | w2 = words[i]
|
1674 | pair = word_.DetectAssocPair(w2)
|
1675 | if not pair:
|
1676 | p_die("Expected associative array pair", loc.Word(w2))
|
1677 |
|
1678 | pairs.append(pair)
|
1679 |
|
1680 | # invariant List?
|
1681 | return word_part.BashAssocLiteral(left_token, pairs, right_token)
|
1682 |
|
1683 | # Brace detection for arrays but NOT associative arrays
|
1684 | words2 = braces.BraceDetectAll(words)
|
1685 | words3 = word_.TildeDetectAll(words2)
|
1686 | return ShArrayLiteral(left_token, words3, right_token)
|
1687 |
|
1688 | def ParseProcCallArgs(self, start_symbol):
|
1689 | # type: (int) -> ArgList
|
1690 | """ json write (x) """
|
1691 | self.lexer.MaybeUnreadOne()
|
1692 |
|
1693 | arg_list = ArgList.CreateNull(alloc_lists=True)
|
1694 | arg_list.left = self.cur_token
|
1695 | self.parse_ctx.ParseProcCallArgs(self.lexer, arg_list, start_symbol)
|
1696 | return arg_list
|
1697 |
|
1698 | def _MaybeReadWordPart(self, is_first, lex_mode, parts):
|
1699 | # type: (bool, lex_mode_t, List[word_part_t]) -> bool
|
1700 | """Helper for _ReadCompoundWord3."""
|
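# The is_first cases below handle parts that must START a word, e.g.:
#
#   foo=(1 2)     # Lit_VarLike followed by an array literal
#   @myarray      # Lit_Splice, with parse_at
#   @[split(x)]   # Lit_AtLBracket, with parse_at
#
# Each of these must also END the word, so foo=(1 2)x is rejected.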
1701 | done = False
|
1702 |
|
1703 | if self.token_type == Id.Lit_EscapedChar:
|
1704 | tok = self.cur_token
|
1705 | assert tok.length == 2
|
1706 | ch = lexer.TokenSliceLeft(tok, 1)
|
1707 | if not self.parse_opts.parse_backslash():
|
1708 | if not pyutil.IsValidCharEscape(ch):
|
1709 | p_die('Invalid char escape in unquoted word (OILS-ERR-13)',
|
1710 | self.cur_token)
|
1711 |
|
1712 | part = word_part.EscapedLiteral(self.cur_token,
|
1713 | ch) # type: word_part_t
|
1714 | else:
|
1715 | part = self.cur_token
|
1716 |
|
1717 | if is_first and self.token_type == Id.Lit_VarLike: # foo=
|
1718 | parts.append(part)
|
1719 | # Unfortunately it's awkward to pull the check for a=(1 2) up to
|
1720 | # _ReadWord.
|
1721 | next_id = self.lexer.LookPastSpace(lex_mode)
|
1722 | if next_id == Id.Op_LParen:
|
1723 | self.lexer.PushHint(Id.Op_RParen, Id.Right_ShArrayLiteral)
|
1724 | part2 = self._ReadArrayLiteral()
|
1725 | parts.append(part2)
|
1726 |
|
1727 | # Array literal must be the last part of the word.
|
1728 | self._SetNext(lex_mode)
|
1729 | self._GetToken()
|
1730 | # EOF, whitespace, newline, Right_Subshell
|
1731 | if self.token_kind not in KINDS_THAT_END_WORDS:
|
1732 | p_die('Unexpected token after array literal',
|
1733 | self.cur_token)
|
1734 | done = True
|
1735 |
|
1736 | elif (is_first and self.parse_opts.parse_at() and
|
1737 | self.token_type == Id.Lit_Splice):
|
1738 |
|
1739 | splice_tok = self.cur_token
|
1740 | part2 = word_part.Splice(splice_tok,
|
1741 | lexer.TokenSliceLeft(splice_tok, 1))
|
1742 |
|
1743 | parts.append(part2)
|
1744 |
|
1745 | # @words must be the last part of the word
|
1746 | self._SetNext(lex_mode)
|
1747 | self._GetToken()
|
1748 | # EOF, whitespace, newline, Right_Subshell
|
1749 | if self.token_kind not in KINDS_THAT_END_WORDS:
|
1750 | p_die('Unexpected token after array splice', self.cur_token)
|
1751 | done = True
|
1752 |
|
1753 | elif (is_first and self.parse_opts.parse_at() and
|
1754 | self.token_type == Id.Lit_AtLBracket): # @[split(x)]
|
1755 | part2 = self._ReadExprSub(lex_mode_e.DQ)
|
1756 | parts.append(part2)
|
1757 |
|
1758 | # @[split(x)]
|
1759 | self._SetNext(lex_mode)
|
1760 | self._GetToken()
|
1761 | # EOF, whitespace, newline, Right_Subshell
|
1762 | if self.token_kind not in KINDS_THAT_END_WORDS:
|
1763 | p_die('Unexpected token after Expr splice', self.cur_token)
|
1764 | done = True
|
1765 |
|
1766 | elif (is_first and self.parse_opts.parse_at() and
|
1767 | self.token_type == Id.Lit_AtLBraceDot):
|
1768 | p_die('TODO: @{.myproc builtin sub}', self.cur_token)
|
1769 |
|
1770 | elif (is_first and self.parse_opts.parse_at_all() and
|
1771 | self.token_type == Id.Lit_At):
|
1772 | # Because $[x] ${x} and perhaps $/x/ are reserved, it makes sense for @
|
1773 | # at the beginning of a word to be reserved.
|
1774 |
|
1775 | # Although should we relax 'echo @'?  I'm tempted to have a shortcut for
|
1776 | # @_argv and similar.
|
1777 | p_die('Literal @ starting a word must be quoted (parse_at_all)',
|
1778 | self.cur_token)
|
1779 |
|
1780 | else:
|
1781 | # not a literal with lookahead; append it
|
1782 | parts.append(part)
|
1783 |
|
1784 | return done
|
1785 |
|
1786 | def _ReadCompoundWord(self, lex_mode):
|
1787 | # type: (lex_mode_t) -> CompoundWord
|
1788 | return self._ReadCompoundWord3(lex_mode, Id.Undefined_Tok, True)
|
1789 |
|
1790 | def _ReadCompoundWord3(self, lex_mode, eof_type, empty_ok):
|
1791 | # type: (lex_mode_t, Id_t, bool) -> CompoundWord
|
1792 | """
|
1793 | Precondition: Looking at the first token of the first word part
|
1794 | Postcondition: Looking at the token after, e.g. space or operator
|
1795 |
|
1796 | NOTE: eof_type is necessary because / is a literal, i.e. Lit_Slash, but it
|
1797 | could be an operator delimiting a compound word. Can we change lexer modes
|
1798 | and remove this special case?
|
1799 | """
|
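# For example, hi$x"${y}" is a single CompoundWord with three parts,
# roughly: Lit_Chars 'hi', SimpleVarSub $x, DoubleQuoted "${y}".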
1800 | w = CompoundWord([])
|
1801 | num_parts = 0
|
1802 | brace_count = 0
|
1803 | done = False
|
1804 | is_triple_quoted = None # type: Optional[BoolParamBox]
|
1805 |
|
1806 | while not done:
|
1807 | self._GetToken()
|
1808 |
|
1809 | allow_done = empty_ok or num_parts != 0
|
1810 | if allow_done and self.token_type == eof_type:
|
1811 | done = True # e.g. for ${foo//pat/replace}
|
1812 |
|
1813 | # Keywords like "for" are treated like literals
|
1814 | elif self.token_kind in (Kind.Lit, Kind.History, Kind.KW,
|
1815 | Kind.ControlFlow, Kind.BoolUnary,
|
1816 | Kind.BoolBinary):
|
1817 |
|
1818 | # Syntax error for { and }
|
1819 | if self.token_type == Id.Lit_LBrace:
|
1820 | brace_count += 1
|
1821 | elif self.token_type == Id.Lit_RBrace:
|
1822 | brace_count -= 1
|
1823 | elif self.token_type == Id.Lit_Dollar:
|
1824 | if not self.parse_opts.parse_dollar():
|
1825 | if num_parts == 0 and lex_mode == lex_mode_e.ShCommand:
|
1826 | next_byte = self.lexer.ByteLookAhead()
|
1827 | # TODO: switch lexer modes and parse $/d+/. But not ${a:-$/d+/}
|
1828 | if next_byte == '/':
|
1829 | #log('next_byte %r', next_byte)
|
1830 | pass
|
1831 |
|
1832 | p_die('Literal $ should be quoted like \$',
|
1833 | self.cur_token)
|
1834 |
|
1835 | done = self._MaybeReadWordPart(num_parts == 0, lex_mode,
|
1836 | w.parts)
|
1837 |
|
1838 | elif self.token_kind == Kind.VSub:
|
1839 | vsub_token = self.cur_token
|
1840 |
|
1841 | part = SimpleVarSub(vsub_token) # type: word_part_t
|
1842 | w.parts.append(part)
|
1843 |
|
1844 | elif self.token_kind == Kind.ExtGlob:
|
1845 | # With parse_at, we can take over @( to start a command sub like @(seq 3)
|
1846 | # Users can also use ,(*.py|*.sh)
|
1847 | if (self.parse_opts.parse_at() and
|
1848 | self.token_type == Id.ExtGlob_At and num_parts == 0):
|
1849 | cs_part = self._ReadCommandSub(Id.Left_AtParen,
|
1850 | d_quoted=False)
|
1851 | # RARE mutation of tok.id!
|
1852 | cs_part.left_token.id = Id.Left_AtParen
|
1853 | part = cs_part # for type safety
|
1854 |
|
1855 | # Same check as _MaybeReadWordPart. @(seq 3)x is illegal, just like
|
1856 | # a=(one two)x and @arrayfunc(3)x.
|
1857 | self._GetToken()
|
1858 | if self.token_kind not in KINDS_THAT_END_WORDS:
|
1859 | p_die('Unexpected token after @()', self.cur_token)
|
1860 | done = True
|
1861 |
|
1862 | else:
|
1863 | if HAVE_FNM_EXTMATCH == 0:
|
1864 | p_die(
|
1865 | "Extended glob won't work without FNM_EXTMATCH support in libc",
|
1866 | self.cur_token)
|
1867 | part = self._ReadExtGlob()
|
1868 | w.parts.append(part)
|
1869 |
|
1870 | elif self.token_kind == Kind.BashRegex:
|
1871 | if self.token_type == Id.BashRegex_LParen: # Opening (
|
1872 | part = self._ReadBashRegexGroup()
|
1873 | w.parts.append(part)
|
1874 | else:
|
1875 | assert self.token_type == Id.BashRegex_AllowedInParens
|
1876 | p_die('Invalid token in bash regex', self.cur_token)
|
1877 |
|
1878 | elif self.token_kind == Kind.Left:
|
1879 | try_triple_quote = (self.parse_opts.parse_triple_quote() and
|
1880 | lex_mode == lex_mode_e.ShCommand and
|
1881 | num_parts == 0)
|
1882 |
|
1883 | # Save allocation
|
1884 | if try_triple_quote:
|
1885 | is_triple_quoted = BoolParamBox(False)
|
1886 |
|
1887 | part = self._ReadUnquotedLeftParts(is_triple_quoted)
|
1888 | w.parts.append(part)
|
1889 |
|
1890 | # NOT done yet, will advance below
|
1891 | elif self.token_kind == Kind.Right:
|
1892 | # Still part of the word; will be done on the next iter.
|
1893 | if self.token_type == Id.Right_DoubleQuote:
|
1894 | pass
|
1895 | # Never happens, no PushHint for this case.
|
1896 | #elif self.token_type == Id.Right_DollarParen:
|
1897 | # pass
|
1898 | elif self.token_type == Id.Right_Subshell:
|
1899 | # LEXER HACK for (case x in x) ;; esac )
|
1900 | # Rewind before it's used
|
1901 | assert self.next_lex_mode == lex_mode_e.Undefined
|
1902 | if self.lexer.MaybeUnreadOne():
|
1903 | self.lexer.PushHint(Id.Op_RParen, Id.Right_Subshell)
|
1904 | self._SetNext(lex_mode)
|
1905 | done = True
|
1906 | else:
|
1907 | done = True
|
1908 |
|
1909 | elif self.token_kind == Kind.Ignored:
|
1910 | done = True
|
1911 |
|
1912 | else:
|
1913 | # LEXER HACK for an unbalanced case clause. 'case foo in esac' is valid,
|
1914 | # so while testing for esac, we may read ) before getting a chance to
|
1915 | # PushHint(Id.Op_RParen, Id.Right_CasePat). So here we unread one
|
1916 | # token and do it again.
|
1917 |
|
1918 | # We get Id.Op_RParen at top level: case x in x) ;; esac
|
1919 | # We get Id.Eof_RParen inside ComSub: $(case x in x) ;; esac )
|
1920 | if self.token_type in (Id.Op_RParen, Id.Eof_RParen):
|
1921 | # Rewind before it's used
|
1922 | assert self.next_lex_mode == lex_mode_e.Undefined
|
1923 | if self.lexer.MaybeUnreadOne():
|
1924 | if self.token_type == Id.Eof_RParen:
|
1925 | # Redo translation
|
1926 | self.lexer.PushHint(Id.Op_RParen, Id.Eof_RParen)
|
1927 | self._SetNext(lex_mode)
|
1928 |
|
1929 | done = True # anything we don't recognize means we're done
|
1930 |
|
1931 | if not done:
|
1932 | self._SetNext(lex_mode)
|
1933 | num_parts += 1
|
1934 |
|
1935 | if (self.parse_opts.parse_brace() and num_parts > 1 and
|
1936 | brace_count != 0):
|
1937 | # accept { and }, but not foo{
|
1938 | p_die(
|
1939 | 'Word has unbalanced { }. Maybe add a space or quote it like \{',
|
1940 | loc.Word(w))
|
1941 |
|
1942 | if is_triple_quoted and is_triple_quoted.b and num_parts > 1:
|
1943 | p_die('Unexpected parts after triple quoted string',
|
1944 | loc.WordPart(w.parts[-1]))
|
1945 |
|
1946 | if 0:
|
1947 | from _devbuild.gen.syntax_asdl import word_part_str
|
1948 | word_key = ' '.join(word_part_str(p.tag()) for p in w.parts)
|
1949 | WORD_HIST[word_key] += 1
|
1950 | return w
|
1951 |
|
1952 | def _ReadArithWord(self):
|
1953 | # type: () -> Optional[word_t]
|
1954 | """ Helper for ReadArithWord() """
|
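# A sketch of the dispatch below:
#
#   Kind.Ignored (space)    -> None; ReadArithWord() will call us again
#   Kind.Eof                -> the Eof token itself
#   Kind.Arith, Kind.Right  -> the operator token, for the ArithParser
#   Kind.Lit, Left, VSub    -> a compound word read in lex_mode_e.Arith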
1955 | self._GetToken()
|
1956 |
|
1957 | if self.token_kind == Kind.Unknown:
|
1958 | # e.g. happened during dynamic parsing of unset 'a[$foo]' in gherkin
|
1959 | p_die(
|
1960 | 'Unexpected token while parsing arithmetic: %r' %
|
1961 | lexer.TokenVal(self.cur_token), self.cur_token)
|
1962 |
|
1963 | elif self.token_kind == Kind.Eof:
|
1964 | return self.cur_token
|
1965 |
|
1966 | elif self.token_kind == Kind.Ignored:
|
1967 | # Space should be ignored.
|
1968 | self._SetNext(lex_mode_e.Arith)
|
1969 | return None
|
1970 |
|
1971 | elif self.token_kind in (Kind.Arith, Kind.Right):
|
1972 | # Id.Right_DollarDParen IS just a normal token, handled by ArithParser
|
1973 | self._SetNext(lex_mode_e.Arith)
|
1974 | return self.cur_token
|
1975 |
|
1976 | elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub):
|
1977 | return self._ReadCompoundWord(lex_mode_e.Arith)
|
1978 |
|
1979 | else:
|
1980 | raise AssertionError(self.cur_token)
|
1981 |
|
1982 | def _ReadWord(self, word_mode):
|
1983 | # type: (lex_mode_t) -> Optional[word_t]
|
1984 | """Helper function for ReadWord()."""
|
1985 |
|
1986 | # Change the pseudo lexer mode to a real lexer mode
|
1987 | if word_mode == lex_mode_e.ShCommandFakeBrack:
|
1988 | lex_mode = lex_mode_e.ShCommand
|
1989 | else:
|
1990 | lex_mode = word_mode
|
1991 |
|
1992 | self._GetToken()
|
1993 |
|
1994 | if self.token_kind == Kind.Eof:
|
1995 | # No advance
|
1996 | return self.cur_token
|
1997 |
|
1998 | # Allow Arith for ) at end of for loop?
|
1999 | elif self.token_kind in (Kind.Op, Kind.Redir, Kind.Arith):
|
2000 | self._SetNext(lex_mode)
|
2001 |
|
2002 | # Newlines are complicated. See 3x2 matrix in the comment about
|
2003 | # self.multiline and self.newline_state above.
|
2004 | if self.token_type == Id.Op_Newline:
|
2005 | if self.multiline:
|
2006 | if self.newline_state > 1:
|
2007 | # This points at a blank line, but at least it gives the line number
|
2008 | p_die('Invalid blank line in multiline mode',
|
2009 | self.cur_token)
|
2010 | return None
|
2011 |
|
2012 | if self.returned_newline: # skip
|
2013 | return None
|
2014 |
|
2015 | return self.cur_token
|
2016 |
|
2017 | elif self.token_kind == Kind.Right:
|
2018 | if self.token_type not in (Id.Right_Subshell, Id.Right_ShFunction,
|
2019 | Id.Right_CasePat,
|
2020 | Id.Right_ShArrayLiteral):
|
2021 | raise AssertionError(self.cur_token)
|
2022 |
|
2023 | self._SetNext(lex_mode)
|
2024 | return self.cur_token
|
2025 |
|
2026 | elif self.token_kind in (Kind.Ignored, Kind.WS):
|
2027 | self._SetNext(lex_mode)
|
2028 | return None
|
2029 |
|
2030 | else:
|
2031 | assert self.token_kind in (Kind.VSub, Kind.Lit, Kind.History,
|
2032 | Kind.Left, Kind.KW, Kind.ControlFlow,
|
2033 | Kind.BoolUnary, Kind.BoolBinary,
|
2034 | Kind.ExtGlob,
|
2035 | Kind.BashRegex), 'Unhandled token kind'
|
2036 |
|
2037 | if (word_mode == lex_mode_e.ShCommandFakeBrack and
|
2038 | self.parse_opts.parse_bracket() and
|
2039 | self.token_type == Id.Lit_LBracket):
|
2040 | # Change [ from Kind.Lit -> Kind.Op
|
2041 | # So CommandParser can treat
|
2042 | # assert [42 === x]
|
2043 | # like
|
2044 | # json write (x)
|
2045 | bracket_word = self.cur_token
|
2046 | bracket_word.id = Id.Op_LBracket
|
2047 |
|
2048 | self._SetNext(lex_mode)
|
2049 | return bracket_word
|
2050 |
|
2051 | # We're beginning a word. If we see Id.Lit_Pound, change to
|
2052 | # lex_mode_e.Comment and read until end of line.
|
2053 | if self.token_type == Id.Lit_Pound:
|
2054 | self._SetNext(lex_mode_e.Comment)
|
2055 | self._GetToken()
|
2056 |
|
2057 | # NOTE: The # could be the last character in the file. It can't be
|
2058 | # Eof_{RParen,Backtick} because #) and #` are comments.
|
2059 | assert self.token_type in (Id.Ignored_Comment, Id.Eof_Real), \
|
2060 | self.cur_token
|
2061 |
|
2062 | # The next iteration will go into Kind.Ignored and set lex state to
|
2063 | # lex_mode_e.ShCommand/etc.
|
2064 | return None # tell ReadWord() to try again after comment
|
2065 |
|
2066 | elif self.token_type == Id.Lit_TPound: ### doc comment
|
2067 | self._SetNext(lex_mode_e.Comment)
|
2068 | self._GetToken()
|
2069 |
|
2070 | if self.token_type == Id.Ignored_Comment and self.emit_doc_token:
|
2071 | return self.cur_token
|
2072 |
|
2073 | return None # tell ReadWord() to try again after comment
|
2074 |
|
2075 | else:
|
2076 | # r'' u'' b''
|
2077 | if (self.token_type == Id.Lit_Chars and
|
2078 | self.lexer.LookAheadOne(
|
2079 | lex_mode_e.ShCommand) == Id.Left_SingleQuote):
|
2080 |
|
2081 | # When shopt -s parse_ysh_string (checked below):
|
2082 | # echo r'hi' is like echo 'hi'
|
2083 | #
|
2084 | # echo u'\u{3bc}' b'\yff' works
|
2085 |
|
2086 | tok = self.cur_token
|
2087 | if self.parse_opts.parse_ysh_string():
|
2088 | if lexer.TokenEquals(tok, 'r'):
|
2089 | left_id = Id.Left_RSingleQuote
|
2090 | elif lexer.TokenEquals(tok, 'u'):
|
2091 | left_id = Id.Left_USingleQuote
|
2092 | elif lexer.TokenEquals(tok, 'b'):
|
2093 | left_id = Id.Left_BSingleQuote
|
2094 | else:
|
2095 | left_id = Id.Undefined_Tok
|
2096 |
|
2097 | if left_id != Id.Undefined_Tok:
|
2098 | # skip the r, and then 'foo' will be read as normal
|
2099 | self._SetNext(lex_mode_e.ShCommand)
|
2100 |
|
2101 | self._GetToken()
|
2102 | assert self.token_type == Id.Left_SingleQuote, self.token_type
|
2103 |
|
2104 | # Read the word in a different lexer mode
|
2105 | return self._ReadYshSingleQuoted(left_id)
|
2106 |
|
2107 | return self._ReadCompoundWord(lex_mode)
|
2108 |
|
2109 | def ParseVarRef(self):
|
2110 | # type: () -> BracedVarSub
|
2111 | """DYNAMIC parsing of what's inside ${!ref}
|
2112 |
|
2113 | # Same as VarOf production
|
2114 | VarRefExpr = VarOf EOF
|
2115 | """
|
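# Sketch: this re-parses the VALUE of the ref at runtime, e.g.:
#
#   ref='a[0]'; echo ${!ref}   # this method sees the string 'a[0]'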
2116 | self._SetNext(lex_mode_e.VSub_1)
|
2117 |
|
2118 | self._GetToken()
|
2119 | if self.token_kind != Kind.VSub:
|
2120 | p_die('Expected var name', self.cur_token)
|
2121 |
|
2122 | part = self._ParseVarOf()
|
2123 | # NOTE: no ${ } means no part.left and part.right
|
2124 | part.left = part.name_tok # cheat to make test pass
|
2125 | part.right = part.name_tok
|
2126 |
|
2127 | self._GetToken()
|
2128 | if self.token_type != Id.Eof_Real:
|
2129 | p_die('Expected end of var ref expression', self.cur_token)
|
2130 | return part
|
2131 |
|
2132 | def LookPastSpace(self):
|
2133 | # type: () -> Id_t
|
2134 | """Look ahead to the next token.
|
2135 |
|
2136 | For the CommandParser to recognize
|
2137 | array= (1 2 3)
|
2138 | YSH for ( versus bash for ((
|
2139 | YSH if ( versus if test
|
2140 | YSH while ( versus while test
|
2141 | YSH bare assignment 'grep =' versus 'grep foo'
|
2142 | """
|
2143 | assert self.token_type != Id.Undefined_Tok
|
2144 | if self.cur_token.id == Id.WS_Space:
|
2145 | id_ = self.lexer.LookPastSpace(lex_mode_e.ShCommand)
|
2146 | else:
|
2147 | id_ = self.cur_token.id
|
2148 | return id_
|
2149 |
|
2150 | def LookAheadFuncParens(self):
|
2151 | # type: () -> bool
|
2152 | """Special lookahead for f( ) { echo hi; } to check for ( )"""
|
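# The two cases below correspond to, e.g.:
#
#   f() { echo hi; }    # buffered token is Op_LParen; look back one char
#   f () { echo hi; }   # buffered token is WS_Space; look from here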
2153 | assert self.token_type != Id.Undefined_Tok
|
2154 |
|
2155 | # We have to handle 2 cases because we buffer a token
|
2156 | if self.cur_token.id == Id.Op_LParen: # saw funcname(
|
2157 | return self.lexer.LookAheadFuncParens(1) # go back one char
|
2158 |
|
2159 | elif self.cur_token.id == Id.WS_Space: # saw funcname WHITESPACE
|
2160 | return self.lexer.LookAheadFuncParens(0)
|
2161 |
|
2162 | else:
|
2163 | return False
|
2164 |
|
2165 | def ReadWord(self, word_mode):
|
2166 | # type: (lex_mode_t) -> word_t
|
2167 | """Read the next word, using the given lexer mode.
|
2168 |
|
2169 | This is a stateful wrapper for the stateless _ReadWord function.
|
2170 | """
|
2171 | assert word_mode in (lex_mode_e.ShCommand,
|
2172 | lex_mode_e.ShCommandFakeBrack,
|
2173 | lex_mode_e.DBracket, lex_mode_e.BashRegex)
|
2174 |
|
2175 | if self.buffered_word: # For integration with pgen2
|
2176 | w = self.buffered_word
|
2177 | self.buffered_word = None
|
2178 | else:
|
2179 | while True:
|
2180 | w = self._ReadWord(word_mode)
|
2181 | if w is not None:
|
2182 | break
|
2183 |
|
2184 | self.returned_newline = (word_.CommandId(w) == Id.Op_Newline)
|
2185 | return w
|
2186 |
|
2187 | def ReadArithWord(self):
|
2188 | # type: () -> word_t
|
2189 | while True:
|
2190 | w = self._ReadArithWord()
|
2191 | if w is not None:
|
2192 | break
|
2193 | return w
|
2194 |
|
2195 | def ReadHereDocBody(self, parts):
|
2196 | # type: (List[word_part_t]) -> None
|
2197 | """
|
2198 | A here doc is like a double quoted context, except " isn't special.
|
2199 | """
|
2200 | self._ReadLikeDQ(None, False, parts)
|
2201 | # Returns nothing
|
2202 |
|
2203 | def ReadForPlugin(self):
|
2204 | # type: () -> CompoundWord
|
2205 | """For $PS1, $PS4, etc.
|
2206 |
|
2207 | This is just like reading a here doc line. "\n" is allowed, as
|
2208 | well as the typical substitutions ${x} $(echo hi) $((1 + 2)).
|
2209 | """
|
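# E.g. a hypothetical prompt value using the allowed substitutions:
#
#   PS1='${PWD} $(date) $((1 + 2)) $ '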
2210 | w = CompoundWord([])
|
2211 | self._ReadLikeDQ(None, False, w.parts)
|
2212 | return w
|
2213 |
|
2214 | def EmitDocToken(self, b):
|
2215 | # type: (bool) -> None
|
2216 | self.emit_doc_token = b
|
2217 |
|
2218 | def Multiline(self, b):
|
2219 | # type: (bool) -> None
|
2220 | self.multiline = b
|
2221 |
|
2222 |
|
2223 | if 0:
|
2224 | import collections
|
2225 | WORD_HIST = collections.Counter()
|
2226 |
|
2227 | # vim: sw=4
|