OILS / osh / word_parse.py

2380 lines, 1262 significant
# Copyright 2016 Andy Chu. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
"""
word_parse.py - Parse the shell word language.

Hairy example:

    hi$((1 + 2))"$(echo hi)"${var:-__"$(echo default)"__}

Substitutions can be nested, but which inner subs are allowed depends on the
outer sub.  Notes:

lex_mode_e.ShCommand (_ReadUnquotedLeftParts)
  All subs and quotes are allowed:
  $v ${v}   $() ``   $(())   '' ""   $'' $""   <() >()

lex_mode_e.DQ (_ReadDoubleQuotedLeftParts)
  Var, Command, Arith, but no quotes.
  $v ${v}   $() ``   $(())
  No process substitution.

lex_mode_e.Arith
  Similar to DQ: Var, Command, and Arith sub, but no process sub.  bash doesn't
  allow quotes, but OSH does.  We allow ALL FOUR kinds of quotes, because we
  need those for associative array indexing.

lex_mode_e.VSub_ArgUnquoted
  Like ShCommand, everything is allowed (even process substitutions), but we
  stop at }, and space is SIGNIFICANT.

  Example: ${a:- b }

  ${X:-$v}  ${X:-${v}}  ${X:-$(echo hi)}  ${X:-`echo hi`}  ${X:-$((1+2))}
  ${X:-'single'}  ${X:-"double"}  ${X:-$'\n'}  ${X:-<(echo hi)}

lex_mode_e.VSub_ArgDQ
  In contrast to DQ, VSub_ArgDQ accepts nested "" and $'' and $"", e.g.
  "${x:-"default"}".

  In contrast, VSub_ArgUnquoted respects single quotes and process
  substitution.

  It's weird that double quotes are allowed.  Space is also significant here,
  e.g. "${x:-a "b"}".
"""

from _devbuild.gen import grammar_nt
from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str, Kind
from _devbuild.gen.types_asdl import (lex_mode_t, lex_mode_e)
from _devbuild.gen.syntax_asdl import (
    ExprSub,
    BoolParamBox,
    Token,
    SimpleVarSub,
    loc,
    source,
    word,
    DoubleQuoted,
    SingleQuoted,
    BracedVarSub,
    CommandSub,
    InitializerWord,
    InitializerWord_t,
    bracket_op,
    bracket_op_t,
    suffix_op,
    suffix_op_t,
    rhs_word,
    rhs_word_e,
    rhs_word_t,
    word_e,
    word_t,
    CompoundWord,
    word_part,
    word_part_t,
    y_lhs_e,
    arith_expr_t,
    command,
    expr,
    expr_e,
    expr_t,
    pat_t,
    ArgList,
    Proc,
    Func,
    Subscript,
    Attribute,
    arith_expr,
    VarDecl,
    Mutation,
    word_part_e,
)
from core import alloc
from core.error import p_die
from mycpp.mylib import log
from core import pyutil
from display import ui
from frontend import consts
from frontend import lexer
from frontend import reader
from osh import tdop
from osh import arith_parse
from osh import braces
from osh import word_
from osh import word_compile
from mycpp.mylib import tagswitch

from libc import HAVE_FNM_EXTMATCH

from typing import List, Optional, Tuple, cast
from typing import TYPE_CHECKING
if TYPE_CHECKING:
    from frontend.lexer import Lexer
    from frontend.parse_lib import ParseContext
    from frontend.reader import _Reader
    from osh.cmd_parse import VarChecker

unused1 = log
unused2 = Id_str

KINDS_THAT_END_WORDS = [Kind.Eof, Kind.WS, Kind.Op, Kind.Right]


def _IsValidYshWord(w):
    # type: (CompoundWord) -> bool
    """YSH word restriction

    Allowed:
      'foo'  r'foo'  --flag r'foo'
      --flag='foo'
      --flag="foo"
    Not allowed:
      --flag=r'bar'  NAME=u'value'  # ambiguous
      --flag=b''' multi '''
    """
    parts = w.parts
    n = len(parts)

    if n != 0 and word_.LiteralId(parts[0]) == Id.Lit_Tilde:
        # ~bob/src/'dir with spaces' is allowed
        # ~bob/src/u'dir with spaces' is ambiguous, but allowed for simplicity
        return True  # early return

    ok = True
    if n >= 2:
        # spec/ysh-TODO-deprecate - allow ''/usr/* workaround!
        # note: ""/usr/* not allowed
        part0 = parts[0]
        if part0.tag() == word_part_e.SingleQuoted:
            sq = cast(SingleQuoted, part0)
            # Make sure $''' is still disallowed
            if (sq.left.id == Id.Left_SingleQuote and len(sq.sval) == 0):
                return True

    for part in parts:
        if part.tag() in (word_part_e.SingleQuoted,
                          word_part_e.DoubleQuoted):
            ok = False

    # Allow special cases:
    #   --flag='val'  NAME='bar'
    # But NOT
    #   --flag=r'val'  NAME=r'val'
    if not ok:
        if (n == 2 and word_.LiteralId(parts[0]) == Id.Lit_VarLike):
            ok = True
        elif (n == 3 and word_.LiteralId(parts[0]) == Id.Lit_Chars and
              word_.LiteralId(parts[1]) == Id.Lit_Equals):
            ok = True

    return ok


class WordEmitter(object):
    """Common interface for [ and [["""

    def __init__(self):
        # type: () -> None
        """Empty constructor for mycpp."""
        pass

    def ReadWord(self, lex_mode):
        # type: (lex_mode_t) -> word_t
        raise NotImplementedError()


class WordParser(WordEmitter):

    def __init__(self, parse_ctx, lexer, line_reader):
        # type: (ParseContext, Lexer, _Reader) -> None
        self.parse_ctx = parse_ctx
        self.lexer = lexer
        self.line_reader = line_reader
        self.arena = line_reader.arena

        self.parse_opts = parse_ctx.parse_opts
        self.a_parser = tdop.TdopParser(arith_parse.Spec(), self,
                                        self.parse_opts)
        self.Reset()

    def Init(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Used to parse arithmetic, see ParseContext."""
        self.next_lex_mode = lex_mode

    def Reset(self):
        # type: () -> None
        """Called by interactive loop."""
        # For _GetToken()
        self.cur_token = None  # type: Token
        self.token_kind = Kind.Undefined
        self.token_type = Id.Undefined_Tok

        self.next_lex_mode = lex_mode_e.ShCommand

        # Boolean mutated by CommandParser via word_.ctx_EmitDocToken.  For
        # ### doc comments
        self.emit_doc_token = False
        # Boolean mutated by CommandParser via word_.ctx_Multiline.  '...'
        # starts multiline mode.
        self.multiline = False

        # For detecting invalid \n\n in multiline mode.  Counts what we got
        # directly from the lexer.
        self.newline_state = 0
        # For consolidating \n\n -> \n for the CALLER.  This simplifies the
        # parsers that consume words.
        self.returned_newline = False

        # For integration with pgen2
        self.buffered_word = None  # type: word_t

    def _GetToken(self):
        # type: () -> None
        """Call this when you need to make a decision based on any of:

        self.token_type
        self.token_kind
        self.cur_token
        """
        if self.next_lex_mode == lex_mode_e.Undefined:
            return  # _SetNext() not called, so do nothing

        is_fake = self.next_lex_mode == lex_mode_e.BashRegexFakeInner
        real_mode = (lex_mode_e.BashRegex if is_fake else self.next_lex_mode)

        self.cur_token = self.lexer.Read(real_mode)

        # MUTATE TOKEN for fake lexer mode.
        # This is for crazy stuff bash allows, like [[ s =~ (< >) ]]
        if (is_fake and self.cur_token.id
                in (Id.WS_Space, Id.BashRegex_AllowedInParens)):
            self.cur_token.id = Id.Lit_Chars

        self.token_type = self.cur_token.id
        self.token_kind = consts.GetKind(self.token_type)

        # number of consecutive newlines, ignoring whitespace
        if self.token_type == Id.Op_Newline:
            self.newline_state += 1
        elif self.token_kind != Kind.WS:
            self.newline_state = 0

        self.parse_ctx.trail.AppendToken(self.cur_token)  # For completion
        self.next_lex_mode = lex_mode_e.Undefined

    def _SetNext(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Set the next lex state, but don't actually read a token.

        We need this for proper interactive parsing.
        """
        self.next_lex_mode = lex_mode

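    # Typical call pattern in this class (a sketch): _SetNext() chooses the
    # lexer mode, then _GetToken() actually reads.
    #
    #   self._SetNext(lex_mode_e.VSub_1)
    #   self._GetToken()
    #
    # Calling _GetToken() again without another _SetNext() is a no-op, because
    # _GetToken() resets next_lex_mode to lex_mode_e.Undefined after reading.
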
    def _ReadVarOpArg(self, arg_lex_mode):
        # type: (lex_mode_t) -> rhs_word_t

        # NOTE: Operators like | and < are not treated as special, so
        # ${a:- | >} is valid, even when unquoted.
        self._SetNext(arg_lex_mode)
        self._GetToken()

        w = self._ReadVarOpArg2(arg_lex_mode, Id.Undefined_Tok,
                                True)  # empty_ok

        # If the Compound has no parts, and we're in a double-quoted VarSub
        # arg, and empty_ok, then return Empty.  This is so it can evaluate to
        # the empty string and not get elided.
        #
        # Examples:
        # - "${s:-}", "${s/%pat/}"
        # It's similar to LooksLikeShAssignment where we turn x= into x=''.
        # And it has the same potential problem of not having Token location
        # info.
        #
        # NOTE: empty_ok is False only for the PatSub pattern, which means
        # we'll return a Compound with no parts, which is explicitly checked
        # with a custom error message.
        if len(w.parts) == 0 and arg_lex_mode == lex_mode_e.VSub_ArgDQ:
            return rhs_word.Empty

        return w

    def _ReadVarOpArg2(self, arg_lex_mode, eof_type, empty_ok):
        # type: (lex_mode_t, Id_t, bool) -> CompoundWord
        """Helper function for _ReadVarOpArg and _ReadPatSubVarOp."""
        w = self._ReadCompoundWord3(arg_lex_mode, eof_type, empty_ok)
        tilde = word_.TildeDetect(w)
        if tilde:
            w = tilde
        return w

    def _ReadSliceVarOp(self):
        # type: () -> suffix_op.Slice
        """
        Looking at the token after the first ':'

        ArithExpr? (':' ArithExpr? )? '}'
        """
        self._NextNonSpace()

        cur_id = self.token_type

        if cur_id in (Id.Arith_RBrace, Id.Arith_Colon):  # ${a:} or ${a::}
            begin = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            begin = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()  # advance

        if cur_id == Id.Arith_RBrace:  # ${a:1} or ${@:1}
            # No length specified, so it's N
            no_length = None  # type: Optional[arith_expr_t]
            return suffix_op.Slice(begin, no_length)

        elif cur_id == Id.Arith_Colon:  # ${a:1:} or ${@:1:}
            colon_tok = self.cur_token
            self._NextNonSpace()

            if self.token_type == Id.Arith_RBrace:
                # quirky bash behavior:
                # ${a:1:} or ${a::} means length ZERO
                # but ${a:1} or ${a:} means length N
                if self.parse_opts.strict_parse_slice():
                    p_die(
                        "Slice length: Add explicit zero, or omit : for N (strict_parse_slice)",
                        colon_tok)

                length = arith_expr.EmptyZero  # type: arith_expr_t
            else:
                length = self._ReadArithExpr(Id.Arith_RBrace)

            return suffix_op.Slice(begin, length)

        else:
            p_die("Expected : or } in slice", self.cur_token)

        raise AssertionError()  # for MyPy

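    # Worked examples of the slice quirk handled above (bash behavior; a
    # sketch, assuming s=12345):
    #
    #   ${s:1}     -> 2345   no length, so "N" (take the rest)
    #   ${s:1:2}   -> 23     explicit length
    #   ${s:1:}    -> ''     trailing : means length ZERO -- rejected under
    #                        strict_parse_slice
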
    def _ReadPatSubVarOp(self):
        # type: () -> suffix_op.PatSub
        """Looking at the first '/' after VarOf:

        VarSub = ...
               | VarOf '/' Match ( '/' WORD? )?
        Match  = '/' WORD   # can't be empty
               | '#' WORD?  # may be empty
               | '%' WORD?
        """
        slash_tok = self.cur_token  # location info
        replace_mode = Id.Undefined_Tok  # bizarre syntax / # %

        self._SetNext(lex_mode_e.VSub_ArgUnquoted)  # advance past /

        self._GetToken()
        if self.token_type == Id.Right_DollarBrace:
            pat = CompoundWord([])
            return suffix_op.PatSub(pat, rhs_word.Empty, replace_mode,
                                    slash_tok)

        if self.token_type in (Id.Lit_Slash, Id.Lit_Pound, Id.Lit_Percent):
            replace_mode = self.token_type
            self._SetNext(lex_mode_e.VSub_ArgUnquoted)

        # Bash quirk:
        # echo ${x/#/replace} has an empty pattern
        # echo ${x////replace} is non-empty; it means echo ${x//'/'/replace}
        empty_ok = replace_mode != Id.Lit_Slash
        pat = self._ReadVarOpArg2(lex_mode_e.VSub_ArgUnquoted, Id.Lit_Slash,
                                  empty_ok)
        #log('pat 1 %r', pat)

        if self.token_type == Id.Lit_Slash:
            # read until }
            replace = self._ReadVarOpArg(
                lex_mode_e.VSub_ArgUnquoted)  # type: rhs_word_t
            #log('r 1 %r', replace)
        else:
            # e.g. ${v/a} is the same as ${v/a/} -- empty replacement string
            replace = rhs_word.Empty

        self._GetToken()
        if self.token_type != Id.Right_DollarBrace:
            # This happens on invalid code
            p_die(
                "Expected } after replacement string, got %s" %
                ui.PrettyId(self.token_type), self.cur_token)

        return suffix_op.PatSub(pat, replace, replace_mode, slash_tok)

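    # Examples of the Match grammar above (a sketch of bash behavior):
    #
    #   ${x/pat/repl}    replace first match    (replace_mode Undefined_Tok)
    #   ${x//pat/repl}   replace all matches    (replace_mode Lit_Slash)
    #   ${x/#pat/repl}   anchor match at start  (replace_mode Lit_Pound)
    #   ${x/%pat/repl}   anchor match at end    (replace_mode Lit_Percent)
    #   ${x/pat}         same as ${x/pat/}, i.e. empty replacement
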
    def _ReadSubscript(self):
        # type: () -> bracket_op_t
        """ Subscript = '[' ('@' | '*' | ArithExpr) ']' """
        # Lookahead to see if we get @ or *.  Otherwise read a full arithmetic
        # expression.
        next_id = self.lexer.LookPastSpace(lex_mode_e.Arith)
        if next_id in (Id.Lit_At, Id.Arith_Star):
            op = bracket_op.WholeArray(next_id)  # type: bracket_op_t

            self._SetNext(lex_mode_e.Arith)  # skip past [
            self._GetToken()
            self._SetNext(lex_mode_e.Arith)  # skip past @
            self._GetToken()
        else:
            self._SetNext(lex_mode_e.Arith)  # skip past [
            anode = self._ReadArithExpr(Id.Arith_RBracket)
            op = bracket_op.ArrayIndex(anode)

        if self.token_type != Id.Arith_RBracket:  # Should be looking at ]
            p_die('Expected ] to close subscript', self.cur_token)

        self._SetNext(lex_mode_e.VSub_2)  # skip past ]
        self._GetToken()  # Needed to be in the same spot as no subscript

        return op

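    # Examples accepted by the Subscript grammar above:
    #
    #   ${a[@]}  ${a[*]}    whole array       (bracket_op.WholeArray)
    #   ${a[0]}  ${a[i+1]}  arithmetic index  (bracket_op.ArrayIndex)
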
    def _ParseVarOf(self):
        # type: () -> BracedVarSub
        """
        VarOf = NAME Subscript?
              | NUMBER    # no subscript allowed, none of these are arrays
                          # ${@[1]} doesn't work, even though slicing does
              | VarSymbol
        """
        self._GetToken()
        name_token = self.cur_token
        self._SetNext(lex_mode_e.VSub_2)

        self._GetToken()  # Check for []
        if self.token_type == Id.VOp2_LBracket:
            bracket_op = self._ReadSubscript()
        else:
            bracket_op = None

        part = BracedVarSub.CreateNull()
        part.name_tok = name_token
        part.var_name = lexer.TokenVal(name_token)
        part.bracket_op = bracket_op
        return part

    def _ParseVarExpr(self, arg_lex_mode, allow_query=False):
        # type: (lex_mode_t, bool) -> BracedVarSub
        """Start parsing at the op -- we already skipped past the name."""
        part = self._ParseVarOf()

        self._GetToken()
        if self.token_type == Id.Right_DollarBrace:
            return part  # no ops

        op_kind = self.token_kind

        if op_kind == Kind.VTest:
            tok = self.cur_token
            arg_word = self._ReadVarOpArg(arg_lex_mode)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            part.suffix_op = suffix_op.Unary(tok, arg_word)

        elif op_kind == Kind.VOpYsh:
            tok = self.cur_token
            arg_word = self._ReadVarOpArg(arg_lex_mode)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            UP_arg_word = arg_word
            with tagswitch(arg_word) as case:
                if case(rhs_word_e.Empty):
                    pass
                elif case(rhs_word_e.Compound):
                    arg_word = cast(CompoundWord, UP_arg_word)
                    # This handles ${x|html} and ${x %.3f} now
                    # However I think ${x %.3f} should be statically parsed?
                    # It can enter the printf lexer modes.
                    ok, arg, quoted = word_.StaticEval(arg_word)
                    if not ok or quoted:
                        p_die('Expected a constant argument',
                              loc.Word(arg_word))

            part.suffix_op = suffix_op.Static(tok, arg)

        elif op_kind == Kind.VOp0:
            part.suffix_op = self.cur_token  # Nullary
            self._SetNext(lex_mode_e.VSub_2)  # Expecting }
            self._GetToken()

        elif op_kind == Kind.VOp1:  # % %% # ## etc.
            tok = self.cur_token
            # Weird exception that all shells have: these operators take a
            # glob pattern, so they're lexed as VSub_ArgUnquoted, not
            # VSub_ArgDQ
            arg_word = self._ReadVarOpArg(lex_mode_e.VSub_ArgUnquoted)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            part.suffix_op = suffix_op.Unary(tok, arg_word)

        elif op_kind == Kind.VOp2:  # / : [ ]
            if self.token_type == Id.VOp2_Slash:
                patsub_op = self._ReadPatSubVarOp()  # type: suffix_op_t
                part.suffix_op = patsub_op

                # Checked by the method above
                assert self.token_type == Id.Right_DollarBrace, self.cur_token

            elif self.token_type == Id.VOp2_Colon:
                part.suffix_op = self._ReadSliceVarOp()
                # NOTE: } in arithmetic mode.
                if self.token_type != Id.Arith_RBrace:
                    # Token seems off; doesn't point to X in ${a:1:2 X
                    p_die('Expected } to close ${', self.cur_token)

            else:
                # TODO: Does this ever happen?
                p_die('Unexpected token in ${} (%s)' % 'VOp2', self.cur_token)

        elif op_kind == Kind.VOp3:  # ${prefix@} etc.
            if allow_query:
                part.suffix_op = self.cur_token  # Nullary
                self._SetNext(lex_mode_e.VSub_2)  # Expecting }
                self._GetToken()
            else:
                p_die("Unexpected token in ${} (%s)" % 'VOp3', self.cur_token)

        # NOTE: Arith_RBrace is for slicing, because it reads } in arithmetic
        # mode.  It's redundantly checked above.
        if self.token_type not in (Id.Right_DollarBrace, Id.Arith_RBrace):
            # ${a.} or ${!a.}
            p_die('Expected } to close ${', self.cur_token)

        # Now look for ops
        return part

    def _ReadZshVarSub(self, left_token):
        # type: (Token) -> word_part.ZshVarSub

        self._SetNext(lex_mode_e.VSub_Zsh)  # Move past ${(foo)

        # Can be empty
        w = self._ReadCompoundWord3(lex_mode_e.VSub_Zsh, Id.Right_DollarBrace,
                                    True)
        self._GetToken()
        return word_part.ZshVarSub(left_token, w, self.cur_token)

    def ReadBracedVarSub(self, left_token):
        # type: (Token) -> Tuple[BracedVarSub, Token]
        """For YSH expressions like var x = ${x:-"default"}."""
        part = self._ReadBracedVarSub(left_token, d_quoted=False)
        last_token = self.cur_token
        return part, last_token

    def _ReadBracedVarSub(self, left_token, d_quoted):
        # type: (Token, bool) -> BracedVarSub
        """For the ${} expression language.

        NAME = [a-zA-Z_][a-zA-Z0-9_]*
        NUMBER = [0-9]+                    # ${10}, ${11}, ...

        Subscript = '[' ('@' | '*' | ArithExpr) ']'
        VarSymbol = '!' | '@' | '#' | ...
        VarOf = NAME Subscript?
              | NUMBER     # no subscript allowed, none of these are arrays
                           # ${@[1]} doesn't work, even though slicing does
              | VarSymbol

        NULLARY_OP = '@Q' | '@E' | '@P' | '@A' | '@a'  # VOp0

        TEST_OP  = '-' | ':-' | '=' | ':=' | '+' | ':+' | '?' | ':?'
        STRIP_OP = '#' | '##' | '%' | '%%'
        CASE_OP  = ',' | ',,' | '^' | '^^'
        UnaryOp  = TEST_OP | STRIP_OP | CASE_OP

        YSH_UNARY = '|' | ' '  # ${x|html} and ${x %.3f}.
                               # SPACE is operator not %
        Match = ('/' | '#' | '%') WORD  # match all / prefix / suffix
        VarExpr = VarOf
                | VarOf NULLARY_OP
                | VarOf UnaryOp WORD
                | VarOf YSH_UNARY STATIC_WORD
                | VarOf ':' ArithExpr (':' ArithExpr )?
                | VarOf '/' Match '/' WORD

        LengthExpr = '#' VarOf   # can't apply operators after length

        RefOrKeys = '!' VarExpr  # CAN apply operators after a named ref
                                 # ${!ref[0]} vs ${!keys[@]} resolved later

        PrefixQuery = '!' NAME ('*' | '@')  # list variable names with a prefix

        BuiltinSub = '.' WORD+  # ${.myproc 'builtin' $sub}

        VarSub = LengthExpr
               | RefOrKeys
               | PrefixQuery
               | VarExpr
               | BuiltinSub

        NOTES:
        - Arithmetic expressions are used twice, inside subscripts ${a[x+1]}
          and slicing ${a:x+1:y+2}
        - ${#} and ${!} need LL(2) lookahead (considering how my tokenizer
          works)
        - @ and * are technically arithmetic expressions in this implementation
        - We don't account for bash 4.4: ${param@operator} -- Q E P A a.  Note
          that it's also vectorized.

        Strictness over bash:
        - echo ${a[0][0]} doesn't do anything useful, so we disallow it from
          the grammar
        - ! and # prefixes can't be composed, even though named refs can be
          composed with other operators
        - '#' means 4 different things: length prefix, VarSymbol, UnaryOp to
          strip a prefix, and it can also be a literal part of WORD.

        From the parser's point of view, the prefix # can't be combined with
        UnaryOp/slicing/matching, and the ! can.  However:

        - ${a[@]:1:2} is not allowed
        - ${#a[@]:1:2} is allowed, but gives the wrong answer
        """
        if d_quoted:
            arg_lex_mode = lex_mode_e.VSub_ArgDQ
        else:
            arg_lex_mode = lex_mode_e.VSub_ArgUnquoted

        self._SetNext(lex_mode_e.VSub_1)
        self._GetToken()

        ty = self.token_type
        first_tok = self.cur_token

        if ty == Id.VSub_Pound:
            # Disambiguate
            next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
            if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
                # e.g. a name, '#' is the prefix
                self._SetNext(lex_mode_e.VSub_1)
                part = self._ParseVarOf()

                self._GetToken()
                if self.token_type != Id.Right_DollarBrace:
                    p_die('Expected } after length expression',
                          self.cur_token)

                part.prefix_op = first_tok

            else:  # not a prefix, '#' is the variable
                part = self._ParseVarExpr(arg_lex_mode)

        elif ty == Id.VSub_Bang:
            next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
            if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
                # e.g. a name, '!' is the prefix
                # ${!a} -- this is a ref
                # ${!3} -- this is a ref
                # ${!a[1]} -- this is a ref
                # ${!a[@]} -- this is a keys
                # No lookahead -- do it in a second step, or at runtime
                self._SetNext(lex_mode_e.VSub_1)
                part = self._ParseVarExpr(arg_lex_mode, allow_query=True)

                part.prefix_op = first_tok

            else:  # not a prefix, '!' is the variable
                part = self._ParseVarExpr(arg_lex_mode)

        elif ty == Id.VSub_Dot:
            # Note: this will become a new builtin_sub type, so this method
            # must return word_part_t rather than BracedVarSub.  I don't think
            # that should cause problems.
            p_die('TODO: ${.myproc builtin sub}', self.cur_token)

        # VS_NAME, VS_NUMBER, symbol that isn't # or !
        elif self.token_kind == Kind.VSub:
            part = self._ParseVarExpr(arg_lex_mode)

        else:
            # e.g. ${^}
            p_die('Unexpected token in ${}', self.cur_token)

        part.left = left_token  # attach the argument
        part.right = self.cur_token
        return part

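    # A few inputs mapped to the grammar above (illustrative, not exhaustive):
    #
    #   ${#s}      LengthExpr
    #   ${!ref}    RefOrKeys
    #   ${!pre@}   PrefixQuery
    #   ${s:-x}    VarExpr with TEST_OP ':-'
    #   ${s@Q}     VarExpr with NULLARY_OP '@Q'
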
    def _ReadSingleQuoted(self, left_token, lex_mode):
        # type: (Token, lex_mode_t) -> SingleQuoted
        """Internal method to read a word_part."""
        tokens = []  # type: List[Token]
        # In command mode, we never disallow backslashes like '\'
        right_quote = self.ReadSingleQuoted(lex_mode, left_token, tokens,
                                            False)
        sval = word_compile.EvalSingleQuoted(left_token.id, tokens)
        node = SingleQuoted(left_token, sval, right_quote)
        return node

    def ReadSingleQuoted(self, lex_mode, left_token, out_tokens, is_ysh_expr):
        # type: (lex_mode_t, Token, List[Token], bool) -> Token
        """Appends to out_tokens; returns last token

        Used by expr_parse.py
        """
        if (left_token.id == Id.Left_DollarSingleQuote and
                self.parse_opts.no_parse_osh()):
            p_die("Instead of $'', use J8 strings like b'' (no_parse_osh)",
                  left_token)

        # echo '\' is allowed, but x = '\' is invalid, in favor of x = r'\'
        # enforce for triple-quoted strings: ''' \u ''' requires r''' \u '''
        no_backslashes = is_ysh_expr and left_token.id in (
            Id.Left_SingleQuote, Id.Left_TSingleQuote)

        expected_end_tokens = 3 if left_token.id in (
            Id.Left_TSingleQuote, Id.Left_RTSingleQuote, Id.Left_UTSingleQuote,
            Id.Left_BTSingleQuote) else 1
        num_end_tokens = 0

        # TODO: could we directly append to out_tokens?
        tokens = []  # type: List[Token]
        while num_end_tokens < expected_end_tokens:
            self._SetNext(lex_mode)
            self._GetToken()

            # Kind.Char emitted in lex_mode.SQ_C
            if self.token_kind in (Kind.Lit, Kind.Char):
                tok = self.cur_token
                # Happens in lex_mode_e.SQ: 'one\two' is ambiguous, should be
                # r'one\two' or c'one\\two'
                if no_backslashes and lexer.TokenContains(tok, '\\'):
                    p_die(
                        "Ambiguous backslash: add explicit r'' or u'' prefix (OILS-ERR-20)",
                        tok)

                if is_ysh_expr:
                    # Disallow var x = $'\001'.  Arguably we don't need these
                    # checks because u'\u{1}' is the way to write it.
                    if self.token_type == Id.Char_Octal3:
                        p_die(
                            r"Use \xhh or \u{...} instead of octal escapes in YSH strings",
                            tok)

                    if self.token_type == Id.Char_Hex and self.cur_token.length != 4:
                        # disallow \xH
                        p_die(
                            r'Invalid hex escape in YSH string (must be \xHH)',
                            tok)

                tokens.append(tok)

            elif self.token_kind == Kind.Unknown:
                tok = self.cur_token
                assert tok.id == Id.Unknown_Backslash, tok

                # x = $'\z' is disallowed; ditto for echo $'\z' if
                # shopt --set no_parse_backslash
                if is_ysh_expr or self.parse_opts.no_parse_backslash():
                    p_die(
                        "Invalid char escape in C-style string literal (OILS-ERR-11)",
                        tok)

                tokens.append(tok)

            elif self.token_kind == Kind.Eof:
                p_die('Unexpected EOF in single-quoted string that began here',
                      left_token)

            elif self.token_kind == Kind.Right:
                # assume Id.Right_SingleQuote
                num_end_tokens += 1
                tokens.append(self.cur_token)

            else:
                raise AssertionError(self.cur_token)

            if self.token_kind != Kind.Right:
                num_end_tokens = 0  # we need three in a ROW

        if expected_end_tokens == 1:
            tokens.pop()
        elif expected_end_tokens == 3:  # Get rid of spurious end tokens
            tokens.pop()
            tokens.pop()
            tokens.pop()

        # Remove space from ''' r''' $''' in both expression mode and command
        # mode
        if left_token.id in (Id.Left_TSingleQuote, Id.Left_RTSingleQuote,
                             Id.Left_UTSingleQuote, Id.Left_BTSingleQuote):
            word_compile.RemoveLeadingSpaceSQ(tokens)

        # Validation after lexing - same 2 checks in j8.LexerDecoder
        is_u_string = left_token.id in (Id.Left_USingleQuote,
                                        Id.Left_UTSingleQuote)

        for tok in tokens:
            # u'\yff' is not valid, but b'\yff' is
            if is_u_string and tok.id == Id.Char_YHex:
                p_die(
                    r"%s escapes not allowed in u'' strings" %
                    lexer.TokenVal(tok), tok)

        out_tokens.extend(tokens)
        return self.cur_token

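    # Sketch of the end-token counting above: for r''' ... ''', three Right
    # tokens must appear in a row.  Any intervening token resets
    # num_end_tokens to 0, and the spurious closing-quote tokens collected
    # along the way are popped off before evaluation.
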
    def _ReadDoubleQuotedLeftParts(self):
        # type: () -> word_part_t
        """Read substitution parts in a double quoted context."""
        if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick):
            return self._ReadCommandSub(self.token_type, d_quoted=True)

        if self.token_type == Id.Left_DollarBrace:
            return self._ReadBracedVarSub(self.cur_token, d_quoted=True)

        if self.token_type == Id.Left_DollarDParen:
            # TODO: Uncomment this after another regtest/aports run
            # if (self.LookAheadDParens(shift_back=1)):
            return self._ReadArithSub()
            # else:
            #   Mutate token - we treat this '$((' as '$( ('
            #   self.cur_token.id = Id.Left_DollarParen
            #   return self._ReadCommandSub(Id.Left_DollarParen, d_quoted=True)

        if self.token_type == Id.Left_DollarBracket:
            if self.parse_opts.parse_ysh_expr_sub():
                return self._ReadExprSub(lex_mode_e.DQ)
            else:
                return self._ReadArithSub(end_id=Id.Arith_RBracket)

        if self.token_type == Id.Left_DollarBraceZsh:
            return self._ReadZshVarSub(self.cur_token)

        raise AssertionError(self.cur_token)

    def _ReadYshSingleQuoted(self, left_id):
        # type: (Id_t) -> CompoundWord
        """Read YSH style strings

        r'' u'' b''
        r''' ''' u''' ''' b''' '''
        """
        #log('BEF self.cur_token %s', self.cur_token)
        if left_id == Id.Left_RSingleQuote:
            lexer_mode = lex_mode_e.SQ_Raw
            triple_left_id = Id.Left_RTSingleQuote
        elif left_id == Id.Left_USingleQuote:
            lexer_mode = lex_mode_e.J8_Str
            triple_left_id = Id.Left_UTSingleQuote
        elif left_id == Id.Left_BSingleQuote:
            lexer_mode = lex_mode_e.J8_Str
            triple_left_id = Id.Left_BTSingleQuote
        else:
            raise AssertionError(left_id)

        # Needed for syntax checks
        left_tok = self.cur_token
        left_tok.id = left_id

        sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)

        if (len(sq_part.sval) == 0 and self.lexer.ByteLookAhead() == "'"):
            self._SetNext(lex_mode_e.ShCommand)
            self._GetToken()

            assert self.token_type == Id.Left_SingleQuote
            # HACK: magically transform the third ' in u''' to
            # Id.Left_UTSingleQuote, so that ''' is the terminator
            left_tok = self.cur_token
            left_tok.id = triple_left_id

            # Handles stripping leading whitespace
            sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)

        # Advance and validate
        self._SetNext(lex_mode_e.ShCommand)

        self._GetToken()
        if self.token_kind not in KINDS_THAT_END_WORDS:
            p_die('Unexpected token after YSH single-quoted string',
                  self.cur_token)

        return CompoundWord([sq_part])

    def _ReadUnquotedLeftParts(self, triple_out):
        # type: (Optional[BoolParamBox]) -> word_part_t
        """Read substitutions and quoted strings (for lex_mode_e.ShCommand).

        If triple_out is set, then we try parsing triple quoted strings,
        and set its value to True if we got one.
        """
        if self.token_type in (Id.Left_DoubleQuote, Id.Left_DollarDoubleQuote):
            # Note: $"" is a synonym for "".  It might make sense if it added
            # \n \0 \x00 \u{123} etc.  But that's not what bash does!
            dq_part = self._ReadDoubleQuoted(self.cur_token)
            # Got empty word "" and there's a " after
            if (triple_out and len(dq_part.parts) == 0 and
                    self.lexer.ByteLookAhead() == '"'):

                self._SetNext(lex_mode_e.ShCommand)
                self._GetToken()
                # HACK: magically transform the third " in """ to
                # Id.Left_TDoubleQuote, so that """ is the terminator
                left_dq_token = self.cur_token
                left_dq_token.id = Id.Left_TDoubleQuote
                triple_out.b = True  # let caller know we got it
                return self._ReadDoubleQuoted(left_dq_token)

            return dq_part

        if self.token_type in (Id.Left_SingleQuote, Id.Left_RSingleQuote,
                               Id.Left_DollarSingleQuote):
            if self.token_type == Id.Left_SingleQuote:
                lexer_mode = lex_mode_e.SQ_Raw
                triple_left_id = Id.Left_TSingleQuote
            elif self.token_type == Id.Left_RSingleQuote:
                lexer_mode = lex_mode_e.SQ_Raw
                triple_left_id = Id.Left_RTSingleQuote
            else:
                lexer_mode = lex_mode_e.SQ_C
                # there is no such thing as $'''
                triple_left_id = Id.Undefined_Tok

            sq_part = self._ReadSingleQuoted(self.cur_token, lexer_mode)

            # Got empty '' or r'' and there's a ' after
            # u'' and b'' are handled in _ReadYshSingleQuoted
            if (triple_left_id != Id.Undefined_Tok and
                    triple_out is not None and len(sq_part.sval) == 0 and
                    self.lexer.ByteLookAhead() == "'"):

                self._SetNext(lex_mode_e.ShCommand)
                self._GetToken()

                # HACK: magically transform the third ' in ''' to
                # Id.Left_TSingleQuote, so that ''' is the terminator
                left_sq_token = self.cur_token
                left_sq_token.id = triple_left_id

                triple_out.b = True  # let caller know we got it
                return self._ReadSingleQuoted(left_sq_token, lexer_mode)

            return sq_part

        if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick,
                               Id.Left_ProcSubIn, Id.Left_ProcSubOut):
            return self._ReadCommandSub(self.token_type, d_quoted=False)

        if self.token_type == Id.Left_DollarBrace:
            return self._ReadBracedVarSub(self.cur_token, d_quoted=False)

        if self.token_type == Id.Left_DollarDParen:
            # TODO: Uncomment this after another regtest/aports run
            # if (self.LookAheadDParens(shift_back=1)):
            return self._ReadArithSub()
            # else:
            #   Mutate token - we treat this '$((' as '$( ('
            #   self.cur_token.id = Id.Left_DollarParen
            #   return self._ReadCommandSub(Id.Left_DollarParen, d_quoted=True)

        if self.token_type == Id.Left_DollarBracket:
            if self.parse_opts.parse_ysh_expr_sub():
                return self._ReadExprSub(lex_mode_e.ShCommand)
            else:
                return self._ReadArithSub(end_id=Id.Arith_RBracket)

        if self.token_type == Id.Left_DollarBraceZsh:
            return self._ReadZshVarSub(self.cur_token)

        raise AssertionError(self.cur_token)

    def _ReadExtGlob(self):
        # type: () -> word_part.ExtGlob
        """
        Grammar:
          Item    = CompoundWord | EPSILON  # important: @(foo|) is allowed
          LEFT    = '@(' | '*(' | '+(' | '?(' | '!('
          RIGHT   = ')'
          ExtGlob = LEFT (Item '|')* Item RIGHT  # ITEM may be empty
          Compound includes ExtGlob
        """
        left_token = self.cur_token
        right_token = None  # type: Token
        arms = []  # type: List[CompoundWord]

        self.lexer.PushHint(Id.Op_RParen, Id.Right_ExtGlob)
        self._SetNext(lex_mode_e.ExtGlob)  # advance past LEFT

        read_word = False  # did we just read a word?  To handle @(||).

        while True:
            self._GetToken()

            if self.token_type == Id.Right_ExtGlob:
                if not read_word:
                    arms.append(CompoundWord([]))
                right_token = self.cur_token
                break

            elif self.token_type == Id.Op_Pipe:
                if not read_word:
                    arms.append(CompoundWord([]))
                read_word = False
                self._SetNext(lex_mode_e.ExtGlob)

            # lex_mode_e.ExtGlob should only produce these 4 kinds of tokens
            elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub,
                                     Kind.ExtGlob):
                w = self._ReadCompoundWord(lex_mode_e.ExtGlob)
                arms.append(w)
                read_word = True

            elif self.token_kind == Kind.Eof:
                p_die('Unexpected EOF reading extended glob that began here',
                      left_token)

            else:
                raise AssertionError(self.cur_token)

        return word_part.ExtGlob(left_token, arms, right_token)

    def _ReadBashRegexGroup(self):
        # type: () -> word_part.BashRegexGroup
        """
        Grammar:
          BashRegexGroup = '(' WORD? ')'
        """
        left_token = self.cur_token
        assert left_token.id == Id.BashRegex_LParen, left_token

        arms = []  # type: List[CompoundWord]

        self.lexer.PushHint(Id.Op_RParen, Id.Right_BashRegexGroup)
        self._SetNext(lex_mode_e.BashRegexFakeInner)  # advance past LEFT

        self._GetToken()
        if self.token_type == Id.Right_BashRegexGroup:  # empty ()
            return word_part.BashRegexGroup(left_token, None, self.cur_token)

        # lex_mode_e.BashRegex should only produce these 4 kinds of tokens
        if self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub, Kind.BashRegex):
            # Fake lexer mode that translates Id.WS_Space to Id.Lit_Chars
            # To allow bash style [[ s =~ (a b) ]]
            w = self._ReadCompoundWord(lex_mode_e.BashRegexFakeInner)
            arms.append(w)

            self._GetToken()
            if self.token_type != Id.Right_BashRegexGroup:
                p_die('Expected ) to close bash regex group', self.cur_token)

            return word_part.BashRegexGroup(left_token, w, self.cur_token)

        p_die('Expected word after ( opening bash regex group', self.cur_token)

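    # Example of the fake lexer mode used above: in [[ s =~ (a b) ]], the
    # space between 'a' and 'b' is re-tagged from WS_Space to Lit_Chars (see
    # _GetToken), so (a b) parses as a single regex group.
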
    # DQ and here documents both act similarly, so these parts are shared
    # between them
    def _ParseDQToken(self, is_ysh_expr, caller_kind):
        # type: (bool, str) -> Optional[word_part_t]
        if self.token_kind == Kind.Lit:
            if self.token_type == Id.Lit_EscapedChar:
                tok = self.cur_token
                ch = lexer.TokenSliceLeft(tok, 1)
                return word_part.EscapedLiteral(tok, ch)
            elif self.token_type == Id.Lit_BadBackslash:
                # echo "\z" is OK in shell, but x = "\z" is a syntax error in
                # YSH.
                # Slight hole: We don't catch x = ${undef:-"\z"} because of the
                # recursion (unless no_parse_backslash)
                if (is_ysh_expr or self.parse_opts.no_parse_backslash()):
                    p_die(
                        "Invalid char escape in %s (OILS-ERR-12)"
                        % caller_kind, self.cur_token)
            elif self.token_type == Id.Lit_Dollar:
                if is_ysh_expr or self.parse_opts.no_parse_dollar():
                    p_die("Literal $ should be quoted like \$",
                          self.cur_token)

            return self.cur_token

        elif self.token_kind == Kind.Left:
            if self.token_type == Id.Left_Backtick and is_ysh_expr:
                p_die("Backtick should be $(cmd) or \\` (OILS-ERR-18)",
                      self.cur_token)

            return self._ReadDoubleQuotedLeftParts()

        elif self.token_kind == Kind.VSub:
            return SimpleVarSub(self.cur_token)
            # NOTE: parsing "$f(x)" would BREAK CODE.  Could add a mode for it
            # later.

        return None

    def _ReadLikeDQ(self, left_token, is_ysh_expr, out_parts):
        # type: (Token, bool, List[word_part_t]) -> None
        """
        Args:
          left_token: A token if we are reading a double quoted part, or None
            if we're reading a here doc.
          is_ysh_expr: Whether to disallow backticks and invalid char escapes
          out_parts: list of word_part to append to
        """
        if left_token.id in (Id.Left_TDoubleQuote,
                             Id.Left_DollarTDoubleQuote):
            expected_end_tokens = 3
        else:
            expected_end_tokens = 1

        num_end_tokens = 0
        while num_end_tokens < expected_end_tokens:
            self._SetNext(lex_mode_e.DQ)
            self._GetToken()

            part = self._ParseDQToken(is_ysh_expr, "double quoted string")
            if part is not None:
                out_parts.append(part)
                continue

            if self.token_kind == Kind.Right:
                assert self.token_type == Id.Right_DoubleQuote, self.token_type
                num_end_tokens += 1
                out_parts.append(self.cur_token)

            elif self.token_kind == Kind.Eof:
                p_die(
                    'Unexpected EOF reading double-quoted string that began here',
                    left_token)

            else:
                raise AssertionError(self.cur_token)

            if self.token_kind != Kind.Right:
                num_end_tokens = 0  # """ must be CONSECUTIVE

        if expected_end_tokens == 1:
            out_parts.pop()
        elif expected_end_tokens == 3:
            out_parts.pop()
            out_parts.pop()
            out_parts.pop()

        # Remove space from """ in both expression mode and command mode
        if (left_token and left_token.id
                in (Id.Left_TDoubleQuote, Id.Left_DollarTDoubleQuote)):
            word_compile.RemoveLeadingSpaceDQ(out_parts)

        # Return nothing, since we appended to 'out_parts'

    def _ReadLikeHereDoc(self, is_ysh_expr, out_parts):
        # type: (bool, List[word_part_t]) -> None
        """
        Args:
          is_ysh_expr: Whether to disallow backticks and invalid char escapes
          out_parts: list of word_part to append to
        """
        while True:
            self._SetNext(lex_mode_e.HereDoc)
            self._GetToken()

            part = self._ParseDQToken(is_ysh_expr, "here document")
            if part is not None:
                out_parts.append(part)
                continue

            if self.token_kind == Kind.Eof:
                # Return nothing, since we appended to 'out_parts'
                return

            else:
                raise AssertionError(self.cur_token)

    def _ReadDoubleQuoted(self, left_token):
        # type: (Token) -> DoubleQuoted
        """Helper function for "hello $name".

        Args:
          eof_type: for stopping at }, Id.Lit_RBrace
          here_doc: Whether we are reading in a here doc context

        Also ${foo%%a b c}  # treat this as double quoted.  until you hit
        """
        parts = []  # type: List[word_part_t]
        self._ReadLikeDQ(left_token, False, parts)

        right_quote = self.cur_token
        return DoubleQuoted(left_token, parts, right_quote)

    def ReadDoubleQuoted(self, left_token, parts):
        # type: (Token, List[word_part_t]) -> Token
        """For expression mode.

        Read var x = "${dir:-}/$name"; etc.
        """
        self._ReadLikeDQ(left_token, True, parts)
        return self.cur_token

    def _ReadCommandSub(self, left_id, d_quoted=False):
        # type: (Id_t, bool) -> CommandSub
        """
        NOTE: This is not in the grammar, because word parts aren't in the
        grammar!

        command_sub = '$(' command_list ')'
                    | '@(' command_list ')'
                    | '<(' command_list ')'
                    | '>(' command_list ')'
                    | ` command_list `
        """
        left_token = self.cur_token

        # Set the lexer in a state so ) becomes the EOF token.
        if left_id in (Id.Left_DollarParen, Id.Left_AtParen, Id.Left_ProcSubIn,
                       Id.Left_ProcSubOut):
            self._SetNext(lex_mode_e.ShCommand)  # advance past $( etc.

            right_id = Id.Eof_RParen
            self.lexer.PushHint(Id.Op_RParen, right_id)
            c_parser = self.parse_ctx.MakeParserForCommandSub(
                self.line_reader, self.lexer, right_id)
            # NOTE: This doesn't use something like main_loop because we don't
            # want to interleave parsing and execution!  Unlike 'source' and
            # 'eval'.
            node = c_parser.ParseCommandSub()

            right_token = c_parser.w_parser.cur_token

        elif left_id == Id.Left_Backtick and self.parse_ctx.do_lossless:
            # NOTE: This is an APPROXIMATE solution for translation ONLY.  See
            # test/osh2oil.

            right_id = Id.Eof_Backtick
            self.lexer.PushHint(Id.Left_Backtick, right_id)
            c_parser = self.parse_ctx.MakeParserForCommandSub(
                self.line_reader, self.lexer, right_id)
            node = c_parser.ParseCommandSub()
            right_token = c_parser.w_parser.cur_token

        elif left_id == Id.Left_Backtick:
            if self.parse_opts.no_parse_backticks():
                p_die(
                    'Backtick should be $(cmd) or \\` (no_parse_backticks, OILS-ERR-18)',
                    left_token)

            self._SetNext(lex_mode_e.Backtick)  # advance past `

            parts = []  # type: List[str]
            while True:
                self._GetToken()
                #log("TOK %s", self.cur_token)

                if self.token_type == Id.Backtick_Quoted:
                    # Remove leading \
                    parts.append(lexer.TokenSliceLeft(self.cur_token, 1))

                elif self.token_type == Id.Backtick_DoubleQuote:
                    # Compatibility: If backticks are double quoted, then
                    # double quotes within them have to be \"
                    # Shells aren't smart enough to match nested " and `
                    # quotes (but OSH is)
                    if d_quoted:
                        # Remove leading \
                        parts.append(lexer.TokenSliceLeft(self.cur_token, 1))
                    else:
                        parts.append(lexer.TokenVal(self.cur_token))

                elif self.token_type == Id.Backtick_Other:
                    parts.append(lexer.TokenVal(self.cur_token))

                elif self.token_type == Id.Backtick_Right:
                    break

                elif self.token_type == Id.Eof_Real:
                    # Note: this parse error is in the ORIGINAL context.  No
                    # code_str yet.
                    p_die('Unexpected EOF while looking for closing backtick',
                          left_token)

                else:
                    raise AssertionError(self.cur_token)

                self._SetNext(lex_mode_e.Backtick)

            # Calculate right SPID on CommandSub BEFORE re-parsing.
            right_token = self.cur_token

            code_str = ''.join(parts)
            #log('code %r', code_str)

            # Save lines into a new, temporary arena, so SnipCodeBlock() isn't
            # messed up.  Note: This is similar to how we parse aliases in
            # osh/cmd_parse.py.  It won't have the same location info as
            # MakeParserForCommandSub(), because the reader is different.
            arena = alloc.Arena()
            # TODO: arena.PushSource()?

            line_reader = reader.StringLineReader(code_str, arena)
            c_parser = self.parse_ctx.MakeOshParser(line_reader)
            src = source.Reparsed('backticks', left_token, right_token)
            with alloc.ctx_SourceCode(arena, src):
                node = c_parser.ParseCommandSub()

        else:
            raise AssertionError(left_id)

        return CommandSub(left_token, node, right_token)

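    # Left tokens mapped to the alternatives above (illustrative):
    #
    #   $(date)          Left_DollarParen -- parsed in place; ) becomes
    #                    Eof_RParen via PushHint
    #   <(cmd)  >(cmd)   process substitution, same code path
    #   `date`           Left_Backtick -- the body is collected as a string,
    #                    then re-parsed from a temporary arena
    #                    (source.Reparsed)
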
    def _ReadExprSub(self, lex_mode):
        # type: (lex_mode_t) -> ExprSub
        """$[d->key]  $[obj.method()]  etc."""
        left_token = self.cur_token

        self._SetNext(lex_mode_e.Expr)
        enode, right_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.ysh_expr_sub)

        self._SetNext(lex_mode)  # Move past ]
        return ExprSub(left_token, enode, right_token)

    def ParseVarDecl(self, kw_token):
        # type: (Token) -> VarDecl
        """
        oil_var_decl: name_type_list '=' testlist end_stmt

        Note that assignments must end with \n  ;  }  or EOF.  Unlike shell
        assignments, we disallow:

        var x = 42 | wc -l
        var x = 42 && echo hi
        """
        self._SetNext(lex_mode_e.Expr)
        enode, last_token = self.parse_ctx.ParseVarDecl(kw_token, self.lexer)
        # Hack to move } from what the Expr lexer modes give to what
        # CommandParser wants
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace

        # Let the CommandParser see the Op_Semi or Op_Newline.
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)  # always back to this
        return enode

    def ParseMutation(self, kw_token, var_checker):
        # type: (Token, VarChecker) -> Mutation
        """
        setvar i = 42
        setvar i += 1
        setvar a[i] = 42
        setvar a[i] += 1
        setvar d.key = 42
        setvar d.key += 1
        """
        self._SetNext(lex_mode_e.Expr)
        enode, last_token = self.parse_ctx.ParseMutation(kw_token, self.lexer)
        # Hack to move } from what the Expr lexer modes give to what
        # CommandParser wants
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace

        for lhs in enode.lhs:
            UP_lhs = lhs
            with tagswitch(lhs) as case:
                if case(y_lhs_e.Var):
                    lhs = cast(Token, UP_lhs)
                    var_checker.Check(kw_token.id, lexer.LazyStr(lhs), lhs)

                # Note: this does not cover cases like
                #   setvar (a[0])[1] = v
                #   setvar (d.key).other = v
                # This leaks into catching all typos statically, which may be
                # possible if 'use' makes all names explicit.
                elif case(y_lhs_e.Subscript):
                    lhs = cast(Subscript, UP_lhs)
                    if lhs.obj.tag() == expr_e.Var:
                        v = cast(expr.Var, lhs.obj)
                        var_checker.Check(kw_token.id, v.name, v.left)

                elif case(y_lhs_e.Attribute):
                    lhs = cast(Attribute, UP_lhs)
                    if lhs.obj.tag() == expr_e.Var:
                        v = cast(expr.Var, lhs.obj)
                        var_checker.Check(kw_token.id, v.name, v.left)

        # Let the CommandParser see the Op_Semi or Op_Newline.
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)  # always back to this
        return enode

    def ParseBareDecl(self):
        # type: () -> expr_t
        """
        x = {name: val}
        """
        self._SetNext(lex_mode_e.Expr)
        self._GetToken()
        enode, last_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.command_expr)
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)
        return enode

    def ParseYshExprForCommand(self):
        # type: () -> expr_t

        # Fudge for this case
        #   for x in(y) {
        # versus
        #   for x in (y) {
        #
        # In the former case, ReadWord on 'in' puts the lexer past (.
        # Also see LookPastSpace in CommandParser.
        # A simpler solution would be nicer.

        if self.token_type == Id.Op_LParen:
            self.lexer.MaybeUnreadOne()

        enode, _ = self.parse_ctx.ParseYshExpr(self.lexer, grammar_nt.ysh_expr)

        self._SetNext(lex_mode_e.ShCommand)
        return enode

    def ParseCommandExpr(self):
        # type: () -> expr_t
        """
        = 1+2
        """
        enode, last_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.command_expr)

        # In some cases, such as the case statement, we expect *the lexer* to
        # be pointing at the token right after the expression.  But the
        # expression parser must have read to the `last_token`.  Unreading
        # places the lexer back in the expected state.  I.e.:
        #
        # case (x) {                    case (x) {
        #   (else) { = x }                (else) { = x }
        #          ^ The lexer is here          ^ Unread to here
        # }                             }
        assert last_token.id in (Id.Op_Newline, Id.Eof_Real, Id.Op_Semi,
                                 Id.Op_RBrace), last_token
        if last_token.id != Id.Eof_Real:
            # Eof_Real is the only token we cannot unread
            self.lexer.MaybeUnreadOne()

        return enode

    def ParseProc(self, node):
        # type: (Proc) -> None

        # proc name-with-hyphens() must be accepted
        self._SetNext(lex_mode_e.ShCommand)
        self._GetToken()
        # example: 'proc f[' gets you Lit_ArrayLhsOpen
        if self.token_type != Id.Lit_Chars:
            p_die('Invalid proc name %s' % ui.PrettyToken(self.cur_token),
                  self.cur_token)

        # TODO: validate this more.  Disallow proc 123 { }, which isn't
        # disallowed for shell functions.  Similar to IsValidVarName().
        node.name = self.cur_token

        last_token = self.parse_ctx.ParseProc(self.lexer, node)

        # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
        assert last_token.id == Id.Op_LBrace
        last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        self._SetNext(lex_mode_e.ShCommand)

    def ParseFunc(self, node):
        # type: (Func) -> None
        last_token = self.parse_ctx.ParseFunc(self.lexer, node)

        # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
        assert last_token.id == Id.Op_LBrace
        last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        self._SetNext(lex_mode_e.ShCommand)

    def ParseYshCasePattern(self):
        # type: () -> Tuple[pat_t, Token]
        pat, left_tok, last_token = self.parse_ctx.ParseYshCasePattern(
            self.lexer)

        if last_token.id == Id.Op_LBrace:
            last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        return pat, left_tok

    def NewlineOkForYshCase(self):
        # type: () -> Id_t
        """Check for optional newline and consume it.

        This is a special case of `_NewlineOk` which fixes some "off-by-one"
        issues that crop up while parsing YSH case arms.  For more details,
        see #oil-dev > Progress On YSH Case Grammar on zulip.

        Returns a token id which is filled with the choice of

          word   { echo word }
          (3)    { echo expr }
          /e/    { echo eggex }
          }      # right brace
        """
        while True:
            next_id = self.lexer.LookAheadOne(lex_mode_e.Expr)

            # Cannot lookahead past lines
            if next_id == Id.Unknown_Tok:
                if not self.lexer.MoveToNextLine():  # Try to move to next line
                    break  # EOF
                continue

            next_kind = consts.GetKind(next_id)
            if next_id != Id.Op_Newline and next_kind != Kind.Ignored:
                break

            self.lexer.Read(lex_mode_e.Expr)

        if next_id in (Id.Op_RBrace, Id.Op_LParen, Id.Arith_Slash):
            self._SetNext(lex_mode_e.Expr)  # Continue in expression mode
        else:
            # Consume the trailing Op_Newline
            self._SetNext(lex_mode_e.ShCommand)
            self._GetToken()

        return next_id

    def _ReadArithExpr(self, end_id):
        # type: (Id_t) -> arith_expr_t
        """Read and parse an arithmetic expression in various contexts.

        $(( 1+2 ))
        (( a=1+2 ))
        ${a[ 1+2 ]}
        ${a : 1+2 : 1+2}

        See tests/arith-context.test.sh for ambiguous cases.

        ${a[a[0]]} is valid  # VS_RBRACKET vs Id.Arith_RBracket

        ${s : a<b?0:1 : 1}   # VS_COLON vs Id.Arith_Colon

        See the assertion in ArithParser.Parse() -- unexpected extra input.
        """
        # calls self.ReadWord(lex_mode_e.Arith)
        anode = self.a_parser.Parse()
        cur_id = self.a_parser.CurrentId()
        if end_id != Id.Undefined_Tok and cur_id != end_id:
            p_die(
                'Unexpected token after arithmetic expression (%s != %s)' %
                (ui.PrettyId(cur_id), ui.PrettyId(end_id)),
                loc.Word(self.a_parser.cur_word))
        return anode

    def _ReadArithSub(self, end_id=Id.Arith_RParen):
        # type: (Id_t) -> word_part.ArithSub
        """Read an arith substitution, which contains an arith expression,
        e.g.

        $((a + 1)).
        """
        assert end_id in (Id.Arith_RParen, Id.Arith_RBracket)

        left_tok = self.cur_token

        # The second one needs to be disambiguated in stuff like:
        #   $(echo $(( 1+2 )) )
        self.lexer.PushHint(Id.Op_RParen, Id.Right_DollarDParen)

        # NOTE: To disambiguate $(( as arith sub vs. command sub and subshell,
        # we could save the lexer/reader state here, and retry if the
        # arithmetic parse fails.  But we can almost always catch this at
        # parse time.  There could be some exceptions like:
        #   $((echo * foo))  # looks like multiplication
        #   $((echo / foo))  # looks like division

        # $(( )) is valid
        anode = arith_expr.EmptyZero  # type: arith_expr_t

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            anode = self._ReadArithExpr(end_id)

        self._SetNext(lex_mode_e.ShCommand)

        if end_id == Id.Arith_RParen:
            # Ensure we get closing ) if we are looking for double ))
            # (In backwards compat mode, ] can also be the closing bracket,
            # and it would already be the current token, so there's no need to
            # skip further.)
            self._GetToken()
            if self.token_type != Id.Right_DollarDParen:
                p_die('Expected second ) to end arith sub', self.cur_token)

        right_tok = self.cur_token
        return word_part.ArithSub(left_tok, anode, right_tok)

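    # Disambiguation sketch: PushHint above makes the next ) lexed in command
    # mode come back as Right_DollarDParen, so in $(echo $(( 1+2 )) ) the ))
    # closes the arith sub while the outer ) still ends the command sub.
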
    def ReadDParen(self):
        # type: () -> Tuple[arith_expr_t, Token]
        """Read ((1+ 2)) -- command context.

        We're using the word parser because it's very similar to
        _ReadArithExpr above.

        This also returns the terminating Id.Op_DRightParen token for location
        info.
        """
        # (( )) is valid
        anode = arith_expr.EmptyZero  # type: arith_expr_t

        self.lexer.PushHint(Id.Op_RParen, Id.Op_DRightParen)

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            anode = self._ReadArithExpr(Id.Arith_RParen)

        self._SetNext(lex_mode_e.ShCommand)

        # Ensure we get the second )
        self._GetToken()
        right = self.cur_token
        if right.id != Id.Op_DRightParen:
            p_die('Expected second ) to end arith statement', right)

        self._SetNext(lex_mode_e.ShCommand)

        return anode, right

    def _NextNonSpace(self):
        # type: () -> None
        """Advance in lex_mode_e.Arith until non-space token.

        Same logic as _ReadWord, but used in
          $(( ))
          (( ))
          for (( ))

        You can read self.token_type after this, without calling _GetToken.
        """
        while True:
            self._SetNext(lex_mode_e.Arith)
            self._GetToken()
            if self.token_kind not in (Kind.Ignored, Kind.WS):
                break

    def ReadForExpression(self):
        # type: () -> command.ForExpr
        """Read ((i=0; i<5; ++i)) -- part of command context."""
        self._NextNonSpace()  # skip over ((
        cur_id = self.token_type  # for end of arith expressions

        if cur_id == Id.Arith_Semi:  # for (( ; i < 10; i++ ))
            init_node = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            init_node = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()
        self._NextNonSpace()

        # It's odd to keep track of both cur_id and self.token_type in this
        # function, but it works, and is tested in 'test/parse_error.sh
        # arith-integration'
        if cur_id != Id.Arith_Semi:  # for (( x=0 b; ... ))
            p_die("Expected ; here", loc.Word(self.a_parser.cur_word))

        self._GetToken()
        cur_id = self.token_type

        if cur_id == Id.Arith_Semi:  # for (( ; ; i++ ))
            # empty condition is TRUE
            cond_node = arith_expr.EmptyOne  # type: arith_expr_t
        else:
            cond_node = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()

        if cur_id != Id.Arith_Semi:  # for (( x=0; x<5 b ))
            p_die("Expected ; here", loc.Word(self.a_parser.cur_word))

        self._NextNonSpace()
        if self.token_type == Id.Arith_RParen:  # for (( ; ; ))
            update_node = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            update_node = self._ReadArithExpr(Id.Arith_RParen)

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            p_die('Expected ) to end for loop expression', self.cur_token)
        self._SetNext(lex_mode_e.ShCommand)

        # redirects is None, will be assigned in CommandEvaluator
        node = command.ForExpr.CreateNull()
        node.init = init_node
        node.cond = cond_node
        node.update = update_node
        return node

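    # Defaults for empty sections, per the code above:
    #
    #   for (( ; ; ))         -> init=EmptyZero, cond=EmptyOne (TRUE),
    #                            update=EmptyZero
    #   for (( ; i<5; ++i ))  -> only init defaults to EmptyZero
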
1710 def _ReadArrayLiteral(self):
1711 # type: () -> word_part_t
1712 """a=(1 2 3)
1713
1714 TODO: See osh/cmd_parse.py:164 for Id.Lit_ArrayLhsOpen, for a[x++]=1
1715
1716 We want:
1717
1718 A=(['x']=1 ["x"]=2 [$x$y]=3)
1719
1720 Maybe allow this as a literal string? Because I think I've seen it before?
1721 Or maybe force people to patch to learn the rule.
1722
1723 A=([x]=4)
1724
1725 Starts with Lit_Other '[', and then it has Lit_ArrayLhsClose
1726 Maybe enforce that ALL have keys or NONE of have keys.
1727 """
1728 self._SetNext(lex_mode_e.ShCommand) # advance past (
1729 self._GetToken()
1730 if self.cur_token.id != Id.Op_LParen:
1731 p_die('Expected ( after =', self.cur_token)
1732 left_token = self.cur_token
1733 right_token = None # type: Token
1734
1735 # MUST use a new word parser (with same lexer).
1736 w_parser = self.parse_ctx.MakeWordParser(self.lexer, self.line_reader)
1737 words = [] # type: List[CompoundWord]
1738 done = False
1739 while not done:
1740 w = w_parser.ReadWord(lex_mode_e.ShCommand)
1741 with tagswitch(w) as case:
1742 if case(word_e.Operator):
1743 tok = cast(Token, w)
1744 if tok.id == Id.Right_Initializer:
1745 right_token = tok
1746 done = True # can't use break here
1747 # Unlike command parsing, array parsing allows embedded \n.
1748 elif tok.id == Id.Op_Newline:
1749 continue
1750 else:
1751 p_die('Unexpected token in array literal', loc.Word(w))
1752
1753 elif case(word_e.Compound):
1754 words.append(cast(CompoundWord, w))
1755
1756 else:
1757 raise AssertionError()
1758
1759 initializer_words = [] # type: List[InitializerWord_t]
1760 for w in words:
1761 pair = word_.DetectAssocPair(w)
1762 if pair is not None:
1763 word_.TildeDetectAssign(pair.value) # pair.value is modified
1764 initializer_words.append(pair)
1765 else:
1766 w2 = braces.BraceDetect(w) # type: word_t
1767 if w2 is None:
1768 w2 = w
1769 w3 = word_.TildeDetect(w2) # type: word_t
1770 if w3 is None:
1771 w3 = w2
1772 initializer_words.append(InitializerWord.ArrayWord(w3))
1773
1774 # invariant List?
1775 return word_part.InitializerLiteral(left_token, initializer_words,
1776 right_token)
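
    # Informal sketch of how the loop above classifies words:
    #
    #   a=(1 2 3)         three InitializerWord.ArrayWord entries
    #   A=([k]=v other)   [k]=v is detected as an assoc pair by
    #                     word_.DetectAssocPair; 'other' stays an ArrayWord
    #   a=(~/bin {x,y})   each plain word goes through BraceDetect and
    #                     TildeDetect before being wrapped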

    def ParseProcCallArgs(self, start_symbol):
        # type: (int) -> ArgList
        """ json write (x) """
        self.lexer.MaybeUnreadOne()

        arg_list = ArgList.CreateNull(alloc_lists=True)
        arg_list.left = self.cur_token
        self.parse_ctx.ParseProcCallArgs(self.lexer, arg_list, start_symbol)
        return arg_list

    def _MaybeReadWordPart(self, is_first, lex_mode, parts):
        # type: (bool, lex_mode_t, List[word_part_t]) -> bool
        """Helper for _ReadCompoundWord3."""
        done = False

        if self.token_type == Id.Lit_EscapedChar:
            tok = self.cur_token
            assert tok.length == 2
            ch = lexer.TokenSliceLeft(tok, 1)
            if self.parse_opts.no_parse_backslash():
                if not pyutil.IsValidCharEscape(ch):
                    p_die('Invalid char escape in unquoted word (OILS-ERR-13)',
                          self.cur_token)

            part = word_part.EscapedLiteral(self.cur_token,
                                            ch)  # type: word_part_t
        else:
            part = self.cur_token

        if is_first and self.token_type == Id.Lit_VarLike:  # foo=
            parts.append(part)
            # Unfortunately it's awkward to pull the check for a=(1 2) up to
            # _ReadWord.
            next_id = self.lexer.LookPastSpace(lex_mode)
            if next_id == Id.Op_LParen:
                self.lexer.PushHint(Id.Op_RParen, Id.Right_Initializer)
                part2 = self._ReadArrayLiteral()
                parts.append(part2)

                # Array literal must be the last part of the word.
                self._SetNext(lex_mode)
                self._GetToken()
                # EOF, whitespace, newline, Right_Subshell
                if self.token_kind not in KINDS_THAT_END_WORDS:
                    p_die('Unexpected token after array literal',
                          self.cur_token)
                done = True

        elif (is_first and self.parse_opts.parse_at() and
              self.token_type == Id.Lit_Splice):

            splice_tok = self.cur_token
            part2 = word_part.Splice(splice_tok,
                                     lexer.TokenSliceLeft(splice_tok, 1))

            parts.append(part2)

            # @words must be the last part of the word
            self._SetNext(lex_mode)
            self._GetToken()
            # EOF, whitespace, newline, Right_Subshell
            if self.token_kind not in KINDS_THAT_END_WORDS:
                p_die('Unexpected token after array splice', self.cur_token)
            done = True

        elif (is_first and self.parse_opts.parse_at() and
              self.token_type == Id.Lit_AtLBracket):  # @[split(x)]
            part2 = self._ReadExprSub(lex_mode_e.DQ)
            parts.append(part2)

            # @[split(x)] must also be the last part of the word
            self._SetNext(lex_mode)
            self._GetToken()
            # EOF, whitespace, newline, Right_Subshell
            if self.token_kind not in KINDS_THAT_END_WORDS:
                p_die('Unexpected token after Expr splice', self.cur_token)
            done = True

        elif (is_first and self.parse_opts.parse_at() and
              self.token_type == Id.Lit_AtLBraceDot):
            p_die('TODO: @{.myproc builtin sub}', self.cur_token)

        elif (is_first and self.parse_opts.parse_at_all() and
              self.token_type == Id.Lit_At):
            # Because $[x] ${x} and perhaps $/x/ are reserved, it makes sense
            # for @ at the beginning of a word to be reserved.

            # Although, should we relax 'echo @'?  I'm tempted to have a
            # shortcut for @_argv.
            p_die('Literal @ starting a word must be quoted (parse_at_all)',
                  self.cur_token)

        else:
            # not a literal with lookahead; append it
            parts.append(part)

        return done
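
    # Examples of the 'must be the last part' rule enforced above (informal,
    # mirroring the p_die messages):
    #
    #   a=(1 2)x       Unexpected token after array literal
    #   @myarray/x     Unexpected token after array splice
    #   @[split(x)]y   Unexpected token after Expr splice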

    def _ReadCompoundWord(self, lex_mode):
        # type: (lex_mode_t) -> CompoundWord

        # lex_mode_e.ShCommand is the ONLY lexer mode that can return
        # word.Redir, so it's excluded here: this helper must return a
        # CompoundWord.
        assert lex_mode != lex_mode_e.ShCommand, lex_mode

        w = self._ReadCompoundOrRedir(lex_mode)
        assert w.tag() == word_e.Compound, w
        return cast(CompoundWord, w)

    def _ReadCompoundWord3(self, lex_mode, eof_type, empty_ok):
        # type: (lex_mode_t, Id_t, bool) -> CompoundWord

        # Same restriction as _ReadCompoundWord: exclude the one mode that
        # can produce word.Redir.
        assert lex_mode != lex_mode_e.ShCommand, lex_mode

        w = self._ReadCompoundOrRedir3(lex_mode, eof_type, empty_ok)
        assert w.tag() == word_e.Compound, w
        return cast(CompoundWord, w)

    def _ReadCompoundOrRedir(self, lex_mode):
        # type: (lex_mode_t) -> word_t
        """Returns either word.Compound or word.Redir"""
        return self._ReadCompoundOrRedir3(lex_mode, Id.Undefined_Tok, True)

    def _ReadCompoundOrRedir3(self, lex_mode, eof_type, empty_ok):
        # type: (lex_mode_t, Id_t, bool) -> word_t
        """
        Precondition: Looking at the first token of the first word part
        Postcondition: Looking at the token after, e.g. space or operator

        NOTE: eof_type is necessary because / is a literal, i.e. Lit_Slash,
        but it could be an operator delimiting a compound word.  Can we change
        lexer modes and remove this special case?

        Returns either word.Compound or word.Redir
        """
        w = CompoundWord([])
        num_parts = 0
        brace_count = 0
        done = False
        is_triple_quoted = None  # type: Optional[BoolParamBox]
        saw_redir_left_tok = False

        while not done:
            self._GetToken()

            allow_done = empty_ok or num_parts != 0
            if allow_done and self.token_type == eof_type:
                done = True  # e.g. for ${foo//pat/replace}

            # Keywords like "for" are treated like literals
            elif self.token_kind in (Kind.Lit, Kind.History, Kind.KW,
                                     Kind.ControlFlow, Kind.BoolUnary,
                                     Kind.BoolBinary):

                # Syntax error for { and }
                if self.token_type == Id.Lit_LBrace:
                    brace_count += 1
                elif self.token_type == Id.Lit_RBrace:
                    brace_count -= 1
                elif self.token_type == Id.Lit_Dollar:
                    if self.parse_opts.no_parse_dollar():
                        if num_parts == 0 and lex_mode == lex_mode_e.ShCommand:
                            next_byte = self.lexer.ByteLookAhead()
                            # TODO: switch lexer modes and parse $/d+/.
                            # But not ${a:-$/d+/}
                            if next_byte == '/':
                                #log('next_byte %r', next_byte)
                                pass

                        p_die(
                            'Literal $ should be quoted like \$ (no_parse_dollar)',
                            self.cur_token)
                elif self.token_type in (Id.Lit_Number, Id.Lit_RedirVarName):
                    saw_redir_left_tok = True

                done = self._MaybeReadWordPart(num_parts == 0, lex_mode,
                                               w.parts)

            elif self.token_kind == Kind.VSub:
                vsub_token = self.cur_token

                part = SimpleVarSub(vsub_token)  # type: word_part_t
                w.parts.append(part)

            elif self.token_kind == Kind.ExtGlob:
                # If parse_at, we can take over @( to start @(seq 3)
                # Users can also write ,(*.py|*.sh)
                if (self.parse_opts.parse_at() and
                        self.token_type == Id.ExtGlob_At and num_parts == 0):
                    cs_part = self._ReadCommandSub(Id.Left_AtParen,
                                                   d_quoted=False)
                    # RARE mutation of tok.id!
                    cs_part.left_token.id = Id.Left_AtParen
                    part = cs_part  # for type safety

                    # Same check as _MaybeReadWordPart.  @(seq 3)x is illegal,
                    # just like a=(one two)x and @arrayfunc(3)x.
                    self._GetToken()
                    if self.token_kind not in KINDS_THAT_END_WORDS:
                        p_die('Unexpected token after @()', self.cur_token)
                    done = True

                else:
                    if HAVE_FNM_EXTMATCH == 0:
                        p_die(
                            "Extended glob won't work without FNM_EXTMATCH support in libc",
                            self.cur_token)
                    part = self._ReadExtGlob()
                w.parts.append(part)

            elif self.token_kind == Kind.BashRegex:
                if self.token_type == Id.BashRegex_LParen:  # Opening (
                    part = self._ReadBashRegexGroup()
                    w.parts.append(part)
                else:
                    assert self.token_type == Id.BashRegex_AllowedInParens
                    p_die('Invalid token in bash regex', self.cur_token)

            elif self.token_kind == Kind.Left:
                try_triple_quote = (self.parse_opts.parse_triple_quote() and
                                    lex_mode == lex_mode_e.ShCommand and
                                    num_parts == 0)

                # Save allocation
                if try_triple_quote:
                    is_triple_quoted = BoolParamBox(False)

                part = self._ReadUnquotedLeftParts(is_triple_quoted)
                w.parts.append(part)

            # NOT done yet, will advance below
            elif self.token_kind == Kind.Right:
                # Still part of the word; will be done on the next iter.
                if self.token_type == Id.Right_DoubleQuote:
                    pass
                # Never happens, no PushHint for this case.
                #elif self.token_type == Id.Right_DollarParen:
                #    pass
                elif self.token_type == Id.Right_Subshell:
                    # LEXER HACK for (case x in x) ;; esac )
                    # Rewind before it's used
                    assert self.next_lex_mode == lex_mode_e.Undefined
                    if self.lexer.MaybeUnreadOne():
                        self.lexer.PushHint(Id.Op_RParen, Id.Right_Subshell)
                        self._SetNext(lex_mode)
                    done = True
                else:
                    done = True

            elif self.token_kind == Kind.Redir:
                # If the previous token was a possible left_tok for a redirect
                # operator, attach it to a word.Redir, and return that instead
                # of the CompoundWord.

                # &> and &>> don't have a leading descriptor (2 is implied)
                if (saw_redir_left_tok and num_parts == 1 and self.token_type
                        not in (Id.Redir_AndGreat, Id.Redir_AndDGreat)):

                    self._SetNext(lex_mode)
                    left_tok = cast(Token, w.parts.pop())
                    r = word.Redir(left_tok, self.cur_token)
                    return r  # EARLY RETURN

                done = True

            elif self.token_kind == Kind.Ignored:
                done = True

            else:
                # LEXER HACK for unbalanced case clause.  'case foo in esac'
                # is valid, so to test for ESAC, we can read ) before getting
                # a chance to PushHint(Id.Op_RParen, Id.Right_CasePat).  So
                # here we unread one token and do it again.

                # We get Id.Op_RParen at top level: case x in x) ;; esac
                # We get Id.Eof_RParen inside ComSub: $(case x in x) ;; esac )
                if self.token_type in (Id.Op_RParen, Id.Eof_RParen):
                    # Rewind before it's used
                    assert self.next_lex_mode == lex_mode_e.Undefined
                    if self.lexer.MaybeUnreadOne():
                        if self.token_type == Id.Eof_RParen:
                            # Redo translation
                            self.lexer.PushHint(Id.Op_RParen, Id.Eof_RParen)
                        self._SetNext(lex_mode)

                done = True  # anything we don't recognize means we're done

            if not done:
                self._SetNext(lex_mode)
                num_parts += 1

        if (self.parse_opts.parse_brace() and num_parts > 1 and
                brace_count != 0):
            # accept { and }, but not foo{
            p_die(
                'Word has unbalanced { }. Maybe add a space or quote it like \{',
                loc.Word(w))

        if is_triple_quoted and is_triple_quoted.b and num_parts > 1:
            p_die('Unexpected parts after triple quoted string',
                  loc.WordPart(w.parts[-1]))

        if 0:
            from _devbuild.gen.syntax_asdl import word_part_str
            word_key = ' '.join(word_part_str(p.tag()) for p in w.parts)
            WORD_HIST[word_key] += 1

        # YSH word restriction
        # (r'' u'' b'' are stripped on shopt -s parse_ysh_string)
        if self.parse_opts.no_parse_word_join() and not _IsValidYshWord(w):
            p_die("Invalid quoted word part in YSH (OILS-ERR-17)",
                  loc.WordPart(part))

        return w
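
    # Informal sketch of the word.Redir early return above:
    #
    #   echo hi 2>err    '2' lexes as Id.Lit_Number and sets
    #                    saw_redir_left_tok; when Kind.Redir follows, '2' is
    #                    popped off and becomes left_tok of a word.Redir
    #   echo hi &>all    &> implies the descriptors, so no left_tok attaches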

    def _ReadArithWord(self):
        # type: () -> Optional[word_t]
        """ Helper for ReadArithWord() """
        self._GetToken()

        if self.token_kind == Kind.Unknown:
            # e.g. happened during dynamic parsing of unset 'a[$foo]' in gherkin
            p_die(
                'Unexpected token while parsing arithmetic: %r' %
                lexer.TokenVal(self.cur_token), self.cur_token)

        elif self.token_kind == Kind.Eof:
            return self.cur_token

        elif self.token_kind == Kind.Ignored:
            # Space should be ignored.
            self._SetNext(lex_mode_e.Arith)
            return None

        elif self.token_kind in (Kind.Arith, Kind.Right):
            # Id.Right_DollarDParen IS just a normal token, handled by ArithParser
            self._SetNext(lex_mode_e.Arith)
            return self.cur_token

        elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub):
            return self._ReadCompoundWord(lex_mode_e.Arith)

        else:
            raise AssertionError(self.cur_token)

    def _ReadWord(self, word_mode):
        # type: (lex_mode_t) -> Optional[word_t]
        """Helper function for ReadWord()."""

        # Change the pseudo lexer mode to a real lexer mode
        if word_mode == lex_mode_e.ShCommandFakeBrack:
            lex_mode = lex_mode_e.ShCommand
        else:
            lex_mode = word_mode

        self._GetToken()

        if self.token_kind == Kind.Eof:
            # No advance
            return self.cur_token

        elif self.token_kind == Kind.Redir:
            self._SetNext(lex_mode)
            # This is >out -- 3>out is handled below
            return word.Redir(None, self.cur_token)

        # Allow Arith for ) at end of for loop?
        elif self.token_kind in (Kind.Op, Kind.Arith):
            self._SetNext(lex_mode)

            # Newlines are complicated.  See 3x2 matrix in the comment about
            # self.multiline and self.newline_state above.
            if self.token_type == Id.Op_Newline:
                if self.multiline:
                    if self.newline_state > 1:
                        # This points at a blank line, but at least it gives
                        # the line number
                        p_die('Invalid blank line in multiline mode',
                              self.cur_token)
                    return None

                if self.returned_newline:  # skip
                    return None

            return self.cur_token

        elif self.token_kind == Kind.Right:
            if self.token_type not in (Id.Right_Subshell, Id.Right_ShFunction,
                                       Id.Right_CasePat, Id.Right_Initializer):
                raise AssertionError(self.cur_token)

            self._SetNext(lex_mode)
            return self.cur_token

        elif self.token_kind in (Kind.Ignored, Kind.WS):
            self._SetNext(lex_mode)
            return None

        else:
            assert self.token_kind in (Kind.VSub, Kind.Lit, Kind.History,
                                       Kind.Left, Kind.KW, Kind.ControlFlow,
                                       Kind.BoolUnary, Kind.BoolBinary,
                                       Kind.ExtGlob,
                                       Kind.BashRegex), 'Unhandled token kind'

            if (word_mode == lex_mode_e.ShCommandFakeBrack and
                    self.parse_opts.parse_bracket() and
                    self.token_type == Id.Lit_LBracket):
                # Change [ from Kind.Lit -> Kind.Op
                # So CommandParser can treat
                #   assert [42 === x]
                # like
                #   json write (x)
                bracket_word = self.cur_token
                bracket_word.id = Id.Op_LBracket

                self._SetNext(lex_mode)
                return bracket_word

            # We're beginning a word.  If we see Id.Lit_Pound, change to
            # lex_mode_e.Comment and read until end of line.
            if self.token_type == Id.Lit_Pound:
                self._SetNext(lex_mode_e.Comment)
                self._GetToken()

                # NOTE: The # could be the last character in the file.  It
                # can't be Eof_{RParen,Backtick} because #) and #` are
                # comments.
                assert self.token_type in (Id.Ignored_Comment, Id.Eof_Real), \
                    self.cur_token

                # The next iteration will go into Kind.Ignored and set lex
                # state to lex_mode_e.ShCommand/etc.
                return None  # tell ReadWord() to try again after comment

            elif self.token_type == Id.Lit_TPound:  ### doc comment
                self._SetNext(lex_mode_e.Comment)
                self._GetToken()

                if self.token_type == Id.Ignored_Comment and self.emit_doc_token:
                    return self.cur_token

                return None  # tell ReadWord() to try again after comment

            else:
                # r'' u'' b'' at the beginning of a word
                if (self.token_type == Id.Lit_Chars and
                        self.lexer.LookAheadOne(
                            lex_mode_e.ShCommand) == Id.Left_SingleQuote):

                    # When shopt -s parse_ysh_string:
                    #   echo r'hi' is like echo 'hi'
                    #   echo u'\u{3bc}' and b'\yff' also work

                    tok = self.cur_token
                    if self.parse_opts.parse_ysh_string():
                        if lexer.TokenEquals(tok, 'r'):
                            left_id = Id.Left_RSingleQuote
                        elif lexer.TokenEquals(tok, 'u'):
                            left_id = Id.Left_USingleQuote
                        elif lexer.TokenEquals(tok, 'b'):
                            left_id = Id.Left_BSingleQuote
                        else:
                            left_id = Id.Undefined_Tok

                        if left_id != Id.Undefined_Tok:
                            # skip the r, and then 'foo' will be read as normal
                            self._SetNext(lex_mode_e.ShCommand)

                            self._GetToken()
                            assert self.token_type == Id.Left_SingleQuote, \
                                self.token_type

                            # Read the word in a different lexer mode
                            return self._ReadYshSingleQuoted(left_id)

                return self._ReadCompoundOrRedir(lex_mode)
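
    # Informal sketch of the r'' u'' b'' lookahead above, under
    # shopt -s parse_ysh_string:
    #
    #   echo r'c:\dir'    'r' is skipped; the quoted part is read raw
    #   echo u'\u{3bc}'   Lit_Chars 'u' followed by Left_SingleQuote
    #   echo x'y'         'x' is not a recognized prefix, so the word falls
    #                     through to normal compound-word parsing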

    def ParseVarRef(self):
        # type: () -> BracedVarSub
        """DYNAMIC parsing of what's inside ${!ref}

        # Same as VarOf production
        VarRefExpr = VarOf EOF
        """
        self._SetNext(lex_mode_e.VSub_1)

        self._GetToken()
        if self.token_kind != Kind.VSub:
            p_die('Expected var name', self.cur_token)

        part = self._ParseVarOf()
        # NOTE: no ${ } means no part.left and part.right
        part.left = part.name_tok  # cheat to make test pass
        part.right = part.name_tok

        self._GetToken()
        if self.token_type != Id.Eof_Real:
            p_die('Expected end of var ref expression', self.cur_token)
        return part
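
    # Example (informal): for ${!ref} with ref='x[1]', this method re-parses
    # the *string* 'x[1]' as VarOf, yielding a BracedVarSub with a bracket
    # op; trailing garbage like 'x[1]y' should fail with 'Expected end of
    # var ref expression'.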

    def LookPastSpace(self):
        # type: () -> Id_t
        """Look ahead to the next token.

        For the CommandParser to recognize
           array= (1 2 3)
           YSH for ( versus bash for ((
           YSH if ( versus if test
           YSH while ( versus while test
           YSH bare assignment 'grep =' versus 'grep foo'
        """
        assert self.token_type != Id.Undefined_Tok
        if self.cur_token.id == Id.WS_Space:
            id_ = self.lexer.LookPastSpace(lex_mode_e.ShCommand)
        else:
            id_ = self.cur_token.id
        return id_
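
    # Sketch: after the CommandParser reads the word 'if', it calls
    # LookPastSpace(); seeing Id.Op_LParen selects the YSH 'if (x)' form,
    # while anything else falls through to the shell 'if test' form.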

    def LookAheadDParens(self, shift_back=0):
        # type: (int) -> bool
        """Special lookahead for (( )), to make sure it's an arithmetic
        expression (i.e. that the closing parens are a single token, not
        separated by anything).
        """
        assert self.token_type in (Id.Op_DLeftParen, Id.Left_DollarDParen)

        return self.lexer.LookAheadDParens(shift_back)

    def LookAheadFuncParens(self):
        # type: () -> bool
        """Special lookahead for f( ) { echo hi; } to check for ( )"""
        assert self.token_type != Id.Undefined_Tok

        # We have to handle 2 cases because we buffer a token
        if self.cur_token.id == Id.Op_LParen:  # saw funcname(
            return self.lexer.LookAheadFuncParens(1)  # go back one char

        elif self.cur_token.id == Id.WS_Space:  # saw funcname WHITESPACE
            return self.lexer.LookAheadFuncParens(0)

        else:
            return False

    def ReadWord(self, word_mode):
        # type: (lex_mode_t) -> word_t
        """Read the next word, using the given lexer mode.

        This is a stateful wrapper for the stateless _ReadWord function.
        """
        assert word_mode in (lex_mode_e.ShCommand,
                             lex_mode_e.ShCommandFakeBrack,
                             lex_mode_e.DBracket, lex_mode_e.BashRegex)

        if self.buffered_word:  # For integration with pgen2
            w = self.buffered_word
            self.buffered_word = None
        else:
            while True:
                w = self._ReadWord(word_mode)
                if w is not None:
                    break

        self.returned_newline = (word_.CommandId(w) == Id.Op_Newline)
        return w
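
    # Usage sketch (informal): the CommandParser drives this in a loop.
    # _ReadWord returns None for skipped tokens like spaces and comments, so
    # the retry loop above spins until it has a real word, an operator
    # token, or Eof_Real.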

    def ReadArithWord(self):
        # type: () -> word_t
        while True:
            w = self._ReadArithWord()
            if w is not None:
                break
        return w

    def ReadHereDocBody(self, parts):
        # type: (List[word_part_t]) -> None
        """
        A here doc is like a double quoted context, except " and \" aren't special.
        """
        self._ReadLikeHereDoc(False, parts)

    def ReadForPlugin(self):
        # type: () -> CompoundWord
        """For $PS1, $PS4, etc.

        This is just like reading a here doc line.  "\n" is allowed, as
        well as the typical substitutions ${x} $(echo hi) $((1 + 2)).
        """
        w = CompoundWord([])
        self._ReadLikeHereDoc(False, w.parts)
        return w
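
    # Example (informal): a $PS1 value like 'hi $(date) $((1+2)) ' parses
    # into a CompoundWord whose parts include a CommandSub and an arith sub;
    # quotes aren't special here, just as in a here doc body.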

    def EmitDocToken(self, b):
        # type: (bool) -> None
        self.emit_doc_token = b

    def Multiline(self, b):
        # type: (bool) -> None
        self.multiline = b

if 0:
    import collections
    WORD_HIST = collections.Counter()
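
# A minimal driving sketch, in the same spirit as the disabled WORD_HIST
# counter above.  It assumes a ParseContext, Lexer, and _Reader constructed
# elsewhere (their setup lives in frontend/parse_lib.py, not here), so it
# stays disabled under 'if 0'.
if 0:

    def _DemoReadWords(parse_ctx, lx, line_reader):
        # type: (ParseContext, Lexer, _Reader) -> None
        """Read and log words until Eof_Real."""
        w_parser = parse_ctx.MakeWordParser(lx, line_reader)
        while True:
            w = w_parser.ReadWord(lex_mode_e.ShCommand)
            log('word %s', w)
            if word_.CommandId(w) == Id.Eof_Real:
                break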

# vim: sw=4