# Copyright 2016 Andy Chu. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
"""
word_parse.py - Parse the shell word language.

Hairy example:

    hi$((1 + 2))"$(echo hi)"${var:-__"$(echo default)"__}

Substitutions can be nested, but which inner subs are allowed depends on the
outer sub.  Notes:

lex_mode_e.ShCommand (_ReadUnquotedLeftParts)
  All subs and quotes are allowed:
  $v ${v}   $() ``   $(())   '' ""   $'' $""   <() >()

lex_mode_e.DQ (_ReadDoubleQuotedLeftParts)
  Var, Command, Arith, but no quotes.
  $v ${v}   $() ``   $(())
  No process substitution.

lex_mode_e.Arith
  Similar to DQ: Var, Command, and Arith sub, but no process sub.  bash
  doesn't allow quotes, but OSH does.  We allow ALL FOUR kinds of quotes,
  because we need those for associative array indexing.

lex_mode_e.VSub_ArgUnquoted
  Like ShCommand, everything is allowed (even process substitutions), but we
  stop at }, and space is SIGNIFICANT.

  Example: ${a:- b }

  ${X:-$v}  ${X:-${v}}  ${X:-$(echo hi)}  ${X:-`echo hi`}  ${X:-$((1+2))}
  ${X:-'single'}  ${X:-"double"}  ${X:-$'\n'}  ${X:-<(echo hi)}

lex_mode_e.VSub_ArgDQ
  In contrast to DQ, VS_ARG_DQ accepts nested "" and $'' and $"", e.g.
  "${x:-"default"}".

  In contrast, VSub_ArgUnquoted respects single quotes and process
  substitution.

  It's weird that double quotes are allowed.  Space is also significant here,
  e.g. "${x:-a "b"}".
"""

from _devbuild.gen import grammar_nt
from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str, Kind
from _devbuild.gen.types_asdl import (lex_mode_t, lex_mode_e)
from _devbuild.gen.syntax_asdl import (
    ExprSub,
    BoolParamBox,
    Token,
    SimpleVarSub,
    loc,
    source,
    word,
    DoubleQuoted,
    SingleQuoted,
    BracedVarSub,
    CommandSub,
    InitializerWord,
    InitializerWord_t,
    bracket_op,
    bracket_op_t,
    suffix_op,
    suffix_op_t,
    rhs_word,
    rhs_word_e,
    rhs_word_t,
    word_e,
    word_t,
    CompoundWord,
    word_part,
    word_part_t,
    y_lhs_e,
    arith_expr_t,
    command,
    expr,
    expr_e,
    expr_t,
    pat_t,
    ArgList,
    Proc,
    Func,
    Subscript,
    Attribute,
    arith_expr,
    VarDecl,
    Mutation,
    word_part_e,
)
from core import alloc
from core.error import p_die
from mycpp.mylib import log
from core import pyutil
from display import ui
from frontend import consts
from frontend import lexer
from frontend import reader
from osh import tdop
from osh import arith_parse
from osh import braces
from osh import word_
from osh import word_compile
from mycpp.mylib import tagswitch

from libc import HAVE_FNM_EXTMATCH

from typing import List, Optional, Tuple, cast
from typing import TYPE_CHECKING
if TYPE_CHECKING:
    from frontend.lexer import Lexer
    from frontend.parse_lib import ParseContext
    from frontend.reader import _Reader
    from osh.cmd_parse import VarChecker

unused1 = log
unused2 = Id_str

KINDS_THAT_END_WORDS = [Kind.Eof, Kind.WS, Kind.Op, Kind.Right]
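# e.g. a word ends at whitespace or a newline (Kind.WS), an operator like ;
# or | (Kind.Op), a closing token like ) (Kind.Right), or end of input
# (Kind.Eof)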


def _IsValidYshWord(w):
    # type: (CompoundWord) -> bool
    """YSH word restriction

    Allowed:
      'foo'  r'foo'       --flag r'foo'
      --flag='foo'
      --flag="foo"
    Not allowed:
      --flag=r'bar'  NAME=u'value'   # ambiguous
      --flag=b''' multi '''
    """
    parts = w.parts
    n = len(parts)

    if n != 0 and word_.LiteralId(parts[0]) == Id.Lit_Tilde:
        # ~bob/src/'dir with spaces' is allowed
        # ~bob/src/u'dir with spaces' is ambiguous, but allowed for simplicity
        return True  # early return

    ok = True
    if n >= 2:
        # spec/ysh-TODO-deprecate - allow ''/usr/* workaround!
        # note: ""/usr/* not allowed
        part0 = parts[0]
        if part0.tag() == word_part_e.SingleQuoted:
            sq = cast(SingleQuoted, part0)
            # Make sure $''' is still disallowed
            if (sq.left.id == Id.Left_SingleQuote and len(sq.sval) == 0):
                return True

    for part in parts:
        if part.tag() in (word_part_e.SingleQuoted,
                          word_part_e.DoubleQuoted):
            ok = False

    # Allow special cases:
    #   --flag='val'  NAME='bar'
    # But NOT
    #   --flag=r'val'  NAME=r'val'
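    # (From the checks below: the n == 2 case is the NAME='bar' shape, since
    # Lit_VarLike is a NAME= token, and the n == 3 case is the --flag='val'
    # shape: a chars part, an = part, then the quoted part.)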
    if not ok:
        if (n == 2 and word_.LiteralId(parts[0]) == Id.Lit_VarLike):
            ok = True
        elif (n == 3 and word_.LiteralId(parts[0]) == Id.Lit_Chars and
              word_.LiteralId(parts[1]) == Id.Lit_Equals):
            ok = True

    return ok


class WordEmitter(object):
    """Common interface for [ and [["""

    def __init__(self):
        # type: () -> None
        """Empty constructor for mycpp."""
        pass

    def ReadWord(self, lex_mode):
        # type: (lex_mode_t) -> word_t
        raise NotImplementedError()


class WordParser(WordEmitter):

    def __init__(self, parse_ctx, lexer, line_reader):
        # type: (ParseContext, Lexer, _Reader) -> None
        self.parse_ctx = parse_ctx
        self.lexer = lexer
        self.line_reader = line_reader
        self.arena = line_reader.arena

        self.parse_opts = parse_ctx.parse_opts
        self.a_parser = tdop.TdopParser(arith_parse.Spec(), self,
                                        self.parse_opts)
        self.Reset()

    def Init(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Used to parse arithmetic, see ParseContext."""
        self.next_lex_mode = lex_mode

    def Reset(self):
        # type: () -> None
        """Called by interactive loop."""
        # For _GetToken()
        self.cur_token = None  # type: Token
        self.token_kind = Kind.Undefined
        self.token_type = Id.Undefined_Tok

        self.next_lex_mode = lex_mode_e.ShCommand

        # Boolean mutated by CommandParser via word_.ctx_EmitDocToken.  For
        # ### doc comments
        self.emit_doc_token = False
        # Boolean mutated by CommandParser via word_.ctx_Multiline.  '...'
        # starts multiline mode.
        self.multiline = False

        # For detecting invalid \n\n in multiline mode.  Counts what we got
        # directly from the lexer.
        self.newline_state = 0
        # For consolidating \n\n -> \n for the CALLER.  This simplifies the
        # parsers that consume words.
        self.returned_newline = False

        # For integration with pgen2
        self.buffered_word = None  # type: word_t

    def _GetToken(self):
        # type: () -> None
        """Call this when you need to make a decision based on any of:

        self.token_type
        self.token_kind
        self.cur_token
        """
        if self.next_lex_mode == lex_mode_e.Undefined:
            return  # _SetNext() not called, so do nothing

        is_fake = self.next_lex_mode == lex_mode_e.BashRegexFakeInner
        real_mode = (lex_mode_e.BashRegex if is_fake else self.next_lex_mode)

        self.cur_token = self.lexer.Read(real_mode)

        # MUTATE TOKEN for fake lexer mode.
        # This is for crazy stuff bash allows, like [[ s =~ (< >) ]]
        if (is_fake and self.cur_token.id
                in (Id.WS_Space, Id.BashRegex_AllowedInParens)):
            self.cur_token.id = Id.Lit_Chars

        self.token_type = self.cur_token.id
        self.token_kind = consts.GetKind(self.token_type)

        # number of consecutive newlines, ignoring whitespace
        if self.token_type == Id.Op_Newline:
            self.newline_state += 1
        elif self.token_kind != Kind.WS:
            self.newline_state = 0

        self.parse_ctx.trail.AppendToken(self.cur_token)  # For completion
        self.next_lex_mode = lex_mode_e.Undefined

    def _SetNext(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Set the next lex state, but don't actually read a token.

        We need this for proper interactive parsing.
        """
        self.next_lex_mode = lex_mode
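
        # A sketch of the two-step pattern used throughout this file:
        #
        #   self._SetNext(lex_mode_e.VSub_1)  # 1. decide the next lex mode
        #   self._GetToken()                  # 2. actually read the token
        #
        # Deferring the read is what lets the interactive loop stop at a line
        # boundary without consuming a token that isn't available yet.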

    def _ReadVarOpArg(self, arg_lex_mode):
        # type: (lex_mode_t) -> rhs_word_t

        # NOTE: Operators like | and < are not treated as special, so
        # ${a:- | >} is valid, even when unquoted.
        self._SetNext(arg_lex_mode)
        self._GetToken()

        w = self._ReadVarOpArg2(arg_lex_mode, Id.Undefined_Tok,
                                True)  # empty_ok

        # If the Compound has no parts, and we're in a double-quoted VarSub
        # arg, and empty_ok, then return Empty.  This is so it can evaluate to
        # the empty string and not get elided.
        #
        # Examples:
        # - "${s:-}", "${s/%pat/}"
        # It's similar to LooksLikeShAssignment where we turn x= into x=''.
        # And it has the same potential problem of not having Token location
        # info.
        #
        # NOTE: empty_ok is False only for the PatSub pattern, which means
        # we'll return a Compound with no parts, which is explicitly checked
        # with a custom error message.
        if len(w.parts) == 0 and arg_lex_mode == lex_mode_e.VSub_ArgDQ:
            return rhs_word.Empty

        return w

    def _ReadVarOpArg2(self, arg_lex_mode, eof_type, empty_ok):
        # type: (lex_mode_t, Id_t, bool) -> CompoundWord
        """Helper function for _ReadVarOpArg and _ReadPatSubVarOp"""
        w = self._ReadCompoundWord3(arg_lex_mode, eof_type, empty_ok)
        tilde = word_.TildeDetect(w)
        if tilde:
            w = tilde
        return w

    def _ReadSliceVarOp(self):
        # type: () -> suffix_op.Slice
        """
        Looking at token after first ':'

        ArithExpr? (':' ArithExpr? )? '}'
        """
        self._NextNonSpace()

        cur_id = self.token_type

        if cur_id in (Id.Arith_RBrace, Id.Arith_Colon):  # ${a:} or ${a::}
            begin = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            begin = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()  # advance

        if cur_id == Id.Arith_RBrace:  # ${a:1} or ${@:1}
            # No length specified, so it's N
            no_length = None  # type: Optional[arith_expr_t]
            return suffix_op.Slice(begin, no_length)

        elif cur_id == Id.Arith_Colon:  # ${a:1:} or ${@:1:}
            colon_tok = self.cur_token
            self._NextNonSpace()

            if self.token_type == Id.Arith_RBrace:
                # quirky bash behavior:
                # ${a:1:} or ${a::} means length ZERO
                # but ${a:1} or ${a:} means length N
                if self.parse_opts.strict_parse_slice():
                    p_die(
                        "Slice length: Add explicit zero, or omit : for N (strict_parse_slice)",
                        colon_tok)

                length = arith_expr.EmptyZero  # type: arith_expr_t
            else:
                length = self._ReadArithExpr(Id.Arith_RBrace)

            return suffix_op.Slice(begin, length)

        else:
            p_die("Expected : or } in slice", self.cur_token)

        raise AssertionError()  # for MyPy

    def _ReadPatSubVarOp(self):
        # type: () -> suffix_op.PatSub
        """Looking at the first '/' after VarOf:

        VarSub = ...
               | VarOf '/' Match ( '/' WORD? )?
        Match  = '/' WORD   # can't be empty
               | '#' WORD?  # may be empty
               | '%' WORD?
        """
        slash_tok = self.cur_token  # location info
        replace_mode = Id.Undefined_Tok  # bizarre syntax / # %

        self._SetNext(lex_mode_e.VSub_ArgUnquoted)  # advance past /

        self._GetToken()
        if self.token_type == Id.Right_DollarBrace:
            pat = CompoundWord([])
            return suffix_op.PatSub(pat, rhs_word.Empty, replace_mode,
                                    slash_tok)

        if self.token_type in (Id.Lit_Slash, Id.Lit_Pound, Id.Lit_Percent):
            replace_mode = self.token_type
            self._SetNext(lex_mode_e.VSub_ArgUnquoted)

        # Bash quirk:
        # echo ${x/#/replace} has an empty pattern
        # echo ${x////replace} is non-empty; it means echo ${x//'/'/replace}
        empty_ok = replace_mode != Id.Lit_Slash
        pat = self._ReadVarOpArg2(lex_mode_e.VSub_ArgUnquoted, Id.Lit_Slash,
                                  empty_ok)
        #log('pat 1 %r', pat)

        if self.token_type == Id.Lit_Slash:
            # read until }
            replace = self._ReadVarOpArg(
                lex_mode_e.VSub_ArgUnquoted)  # type: rhs_word_t
            #log('r 1 %r', replace)
        else:
            # e.g. ${v/a} is the same as ${v/a/} -- empty replacement string
            replace = rhs_word.Empty

        self._GetToken()
        if self.token_type != Id.Right_DollarBrace:
            # This happens on invalid code
            p_die(
                "Expected } after replacement string, got %s" %
                ui.PrettyId(self.token_type), self.cur_token)

        return suffix_op.PatSub(pat, replace, replace_mode, slash_tok)

    def _ReadSubscript(self):
        # type: () -> bracket_op_t
        """ Subscript = '[' ('@' | '*' | ArithExpr) ']' """
        # Lookahead to see if we get @ or *.  Otherwise read a full arithmetic
        # expression.
        next_id = self.lexer.LookPastSpace(lex_mode_e.Arith)
        if next_id in (Id.Lit_At, Id.Arith_Star):
            op = bracket_op.WholeArray(next_id)  # type: bracket_op_t

            self._SetNext(lex_mode_e.Arith)  # skip past [
            self._GetToken()
            self._SetNext(lex_mode_e.Arith)  # skip past @
            self._GetToken()
        else:
            self._SetNext(lex_mode_e.Arith)  # skip past [
            anode = self._ReadArithExpr(Id.Arith_RBracket)
            op = bracket_op.ArrayIndex(anode)

        if self.token_type != Id.Arith_RBracket:  # Should be looking at ]
            p_die('Expected ] to close subscript', self.cur_token)

        self._SetNext(lex_mode_e.VSub_2)  # skip past ]
        self._GetToken()  # Needed to be in the same spot as no subscript

        return op

    def _ParseVarOf(self):
        # type: () -> BracedVarSub
        """
        VarOf = NAME Subscript?
              | NUMBER      # no subscript allowed, none of these are arrays
                            # ${@[1]} doesn't work, even though slicing does
              | VarSymbol
        """
        self._GetToken()
        name_token = self.cur_token
        self._SetNext(lex_mode_e.VSub_2)

        self._GetToken()  # Check for []
        if self.token_type == Id.VOp2_LBracket:
            bracket_op = self._ReadSubscript()
        else:
            bracket_op = None

        part = BracedVarSub.CreateNull()
        part.name_tok = name_token
        part.var_name = lexer.TokenVal(name_token)
        part.bracket_op = bracket_op
        return part

    def _ParseVarExpr(self, arg_lex_mode, allow_query=False):
        # type: (lex_mode_t, bool) -> BracedVarSub
        """Start parsing at the op -- we already skipped past the name."""
        part = self._ParseVarOf()

        self._GetToken()
        if self.token_type == Id.Right_DollarBrace:
            return part  # no ops

        op_kind = self.token_kind

        if op_kind == Kind.VTest:
            tok = self.cur_token
            arg_word = self._ReadVarOpArg(arg_lex_mode)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            part.suffix_op = suffix_op.Unary(tok, arg_word)

        elif op_kind == Kind.VOpYsh:
            tok = self.cur_token
            arg_word = self._ReadVarOpArg(arg_lex_mode)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            UP_arg_word = arg_word
            with tagswitch(arg_word) as case:
                if case(rhs_word_e.Empty):
                    pass
                elif case(rhs_word_e.Compound):
                    arg_word = cast(CompoundWord, UP_arg_word)
                    # This handles ${x|html} and ${x %.3f} now
                    # However I think ${x %.3f} should be statically parsed?
                    # It can enter the printf lexer modes.
                    ok, arg, quoted = word_.StaticEval(arg_word)
                    if not ok or quoted:
                        p_die('Expected a constant argument',
                              loc.Word(arg_word))

            part.suffix_op = suffix_op.Static(tok, arg)

        elif op_kind == Kind.VOp0:
            part.suffix_op = self.cur_token  # Nullary
            self._SetNext(lex_mode_e.VSub_2)  # Expecting }
            self._GetToken()

        elif op_kind == Kind.VOp1:  # % %% # ## etc.
            tok = self.cur_token
            # Weird exception that all shells have: these operators take a
            # glob pattern, so they're lexed as VSub_ArgUnquoted, not
            # VSub_ArgDQ
            arg_word = self._ReadVarOpArg(lex_mode_e.VSub_ArgUnquoted)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            part.suffix_op = suffix_op.Unary(tok, arg_word)

        elif op_kind == Kind.VOp2:  # / : [ ]
            if self.token_type == Id.VOp2_Slash:
                patsub_op = self._ReadPatSubVarOp()  # type: suffix_op_t
                part.suffix_op = patsub_op

                # Checked by the method above
                assert self.token_type == Id.Right_DollarBrace, self.cur_token

            elif self.token_type == Id.VOp2_Colon:
                part.suffix_op = self._ReadSliceVarOp()
                # NOTE: } in arithmetic mode.
                if self.token_type != Id.Arith_RBrace:
                    # Token seems off; doesn't point to X in ${a:1:2 X
                    p_die('Expected } to close ${', self.cur_token)

            else:
                # TODO: Does this ever happen?
                p_die('Unexpected token in ${} (%s)' % 'VOp2', self.cur_token)

        elif op_kind == Kind.VOp3:  # ${prefix@} etc.
            if allow_query:
                part.suffix_op = self.cur_token  # Nullary
                self._SetNext(lex_mode_e.VSub_2)  # Expecting }
                self._GetToken()
            else:
                p_die("Unexpected token in ${} (%s)" % 'VOp3', self.cur_token)

        # NOTE: Arith_RBrace is for slicing, because it reads } in arithmetic
        # mode.  It's redundantly checked above.
        if self.token_type not in (Id.Right_DollarBrace, Id.Arith_RBrace):
            # ${a.} or ${!a.}
            p_die('Expected } to close ${', self.cur_token)

        # Now look for ops
        return part

    def _ReadZshVarSub(self, left_token):
        # type: (Token) -> word_part.ZshVarSub

        self._SetNext(lex_mode_e.VSub_Zsh)  # Move past ${(foo)

        # Can be empty
        w = self._ReadCompoundWord3(lex_mode_e.VSub_Zsh, Id.Right_DollarBrace,
                                    True)
        self._GetToken()
        return word_part.ZshVarSub(left_token, w, self.cur_token)

    def ReadBracedVarSub(self, left_token):
        # type: (Token) -> Tuple[BracedVarSub, Token]
        """ For YSH expressions like var x = ${x:-"default"}. """
        part = self._ReadBracedVarSub(left_token, d_quoted=False)
        last_token = self.cur_token
        return part, last_token

    def _ReadBracedVarSub(self, left_token, d_quoted):
        # type: (Token, bool) -> BracedVarSub
        """For the ${} expression language.

        NAME = [a-zA-Z_][a-zA-Z0-9_]*
        NUMBER = [0-9]+                    # ${10}, ${11}, ...

        Subscript = '[' ('@' | '*' | ArithExpr) ']'
        VarSymbol = '!' | '@' | '#' | ...
        VarOf = NAME Subscript?
              | NUMBER      # no subscript allowed, none of these are arrays
                            # ${@[1]} doesn't work, even though slicing does
              | VarSymbol

        NULLARY_OP = '@Q' | '@E' | '@P' | '@A' | '@a'  # VOp0

        TEST_OP  = '-' | ':-' | '=' | ':=' | '+' | ':+' | '?' | ':?'
        STRIP_OP = '#' | '##' | '%' | '%%'
        CASE_OP  = ',' | ',,' | '^' | '^^'
        UnaryOp  = TEST_OP | STRIP_OP | CASE_OP

        YSH_UNARY = '|' | ' '  # ${x|html} and ${x %.3f}.
                               # SPACE is operator not %
        Match     = ('/' | '#' | '%') WORD  # match all / prefix / suffix
        VarExpr   = VarOf
                  | VarOf NULLARY_OP
                  | VarOf UnaryOp WORD
                  | VarOf YSH_UNARY STATIC_WORD
                  | VarOf ':' ArithExpr (':' ArithExpr )?
                  | VarOf '/' Match '/' WORD

        LengthExpr = '#' VarOf    # can't apply operators after length

        RefOrKeys  = '!' VarExpr  # CAN apply operators after a named ref
                                  # ${!ref[0]} vs ${!keys[@]} resolved later

        PrefixQuery = '!' NAME ('*' | '@')  # list variable names with a prefix

        BuiltinSub = '.' WORD+  # ${.myproc 'builtin' $sub}

        VarSub = LengthExpr
               | RefOrKeys
               | PrefixQuery
               | VarExpr
               | BuiltinSub

        NOTES:
        - Arithmetic expressions are used twice, inside subscripts ${a[x+1]}
          and slicing ${a:x+1:y+2}
        - ${#} and ${!} need LL(2) lookahead (considering how my tokenizer
          works)
        - @ and * are technically arithmetic expressions in this implementation
        - We don't account for bash 4.4: ${param@operator} -- Q E P A a.  Note
          that it's also vectorized.

        Strictness over bash:
        - echo ${a[0][0]} doesn't do anything useful, so we disallow it from
          the grammar
        - ! and # prefixes can't be composed, even though named refs can be
          composed with other operators
        - '#' means 4 different things: length prefix, VarSymbol, UnaryOp to
          strip a prefix, and it can also be a literal part of WORD.

        From the parser's point of view, the prefix # can't be combined with
        UnaryOp/slicing/matching, and the ! can.  However

        - ${a[@]:1:2} is not allowed
        - ${#a[@]:1:2} is allowed, but gives the wrong answer
        """
        if d_quoted:
            arg_lex_mode = lex_mode_e.VSub_ArgDQ
        else:
            arg_lex_mode = lex_mode_e.VSub_ArgUnquoted

        self._SetNext(lex_mode_e.VSub_1)
        self._GetToken()

        ty = self.token_type
        first_tok = self.cur_token

        if ty == Id.VSub_Pound:
            # Disambiguate
            next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
            if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
                # e.g. a name, '#' is the prefix
                self._SetNext(lex_mode_e.VSub_1)
                part = self._ParseVarOf()

                self._GetToken()
                if self.token_type != Id.Right_DollarBrace:
                    p_die('Expected } after length expression',
                          self.cur_token)

                part.prefix_op = first_tok

            else:  # not a prefix, '#' is the variable
                part = self._ParseVarExpr(arg_lex_mode)

        elif ty == Id.VSub_Bang:
            next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
            if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
                # e.g. a name, '!' is the prefix
                # ${!a} -- this is a ref
                # ${!3} -- this is ref
                # ${!a[1]} -- this is a ref
                # ${!a[@]} -- this is a keys
                # No lookahead -- do it in a second step, or at runtime
                self._SetNext(lex_mode_e.VSub_1)
                part = self._ParseVarExpr(arg_lex_mode, allow_query=True)

                part.prefix_op = first_tok

            else:  # not a prefix, '!' is the variable
                part = self._ParseVarExpr(arg_lex_mode)

        elif ty == Id.VSub_Dot:
            # Note: this will become a new builtin_sub type, so this method
            # must return word_part_t rather than BracedVarSub.  I don't
            # think that should cause problems.
            p_die('TODO: ${.myproc builtin sub}', self.cur_token)

        # VS_NAME, VS_NUMBER, symbol that isn't # or !
        elif self.token_kind == Kind.VSub:
            part = self._ParseVarExpr(arg_lex_mode)

        else:
            # e.g. ${^}
            p_die('Unexpected token in ${}', self.cur_token)

        part.left = left_token  # attach the argument
        part.right = self.cur_token
        return part

    def _ReadSingleQuoted(self, left_token, lex_mode):
        # type: (Token, lex_mode_t) -> SingleQuoted
        """Internal method to read a word_part."""
        tokens = []  # type: List[Token]
        # In command mode, we never disallow backslashes like '\'
        right_quote = self.ReadSingleQuoted(lex_mode, left_token, tokens,
                                            False)
        sval = word_compile.EvalSingleQuoted(left_token.id, tokens)
        node = SingleQuoted(left_token, sval, right_quote)
        return node

    def ReadSingleQuoted(self, lex_mode, left_token, out_tokens, is_ysh_expr):
        # type: (lex_mode_t, Token, List[Token], bool) -> Token
        """Appends to out_tokens; returns last token

        Used by expr_parse.py
        """
        if (left_token.id == Id.Left_DollarSingleQuote and
                self.parse_opts.no_parse_osh()):
            p_die("Instead of $'', use J8 strings like b'' (no_parse_osh)",
                  left_token)

        # echo '\' is allowed, but x = '\' is invalid, in favor of x = r'\'
        # enforce for triple-quoted strings: ''' \u ''' requires r''' \u '''
        no_backslashes = is_ysh_expr and left_token.id in (
            Id.Left_SingleQuote, Id.Left_TSingleQuote)

        expected_end_tokens = 3 if left_token.id in (
            Id.Left_TSingleQuote, Id.Left_RTSingleQuote,
            Id.Left_UTSingleQuote, Id.Left_BTSingleQuote) else 1
        num_end_tokens = 0
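        # e.g. a plain '' closes after one Right token, while r''' ... '''
        # closes only after three CONSECUTIVE Right tokens (see the reset at
        # the bottom of the loop below)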

        # TODO: could we directly append to out_tokens?
        tokens = []  # type: List[Token]
        while num_end_tokens < expected_end_tokens:
            self._SetNext(lex_mode)
            self._GetToken()

            # Kind.Char emitted in lex_mode.SQ_C
            if self.token_kind in (Kind.Lit, Kind.Char):
                tok = self.cur_token
                # Happens in lex_mode_e.SQ: 'one\two' is ambiguous, should be
                # r'one\two' or c'one\\two'
                if no_backslashes and lexer.TokenContains(tok, '\\'):
                    p_die(
                        "Ambiguous backslash: add explicit r'' or u'' prefix (OILS-ERR-20)",
                        tok)

                if is_ysh_expr:
                    # Disallow var x = $'\001'.  Arguably we don't need these
                    # checks because u'\u{1}' is the way to write it.
                    if self.token_type == Id.Char_Octal3:
                        p_die(
                            r"Use \xhh or \u{...} instead of octal escapes in YSH strings",
                            tok)

                    if (self.token_type == Id.Char_Hex and
                            self.cur_token.length != 4):
                        # disallow \xH
                        p_die(
                            r'Invalid hex escape in YSH string (must be \xHH)',
                            tok)

                tokens.append(tok)

            elif self.token_kind == Kind.Unknown:
                tok = self.cur_token
                assert tok.id == Id.Unknown_Backslash, tok

                # x = $'\z' is disallowed; ditto for echo $'\z' if
                # shopt --set no_parse_backslash
                if is_ysh_expr or self.parse_opts.no_parse_backslash():
                    p_die(
                        "Invalid char escape in C-style string literal (OILS-ERR-11)",
                        tok)

                tokens.append(tok)

            elif self.token_kind == Kind.Eof:
                p_die('Unexpected EOF in single-quoted string that began here',
                      left_token)

            elif self.token_kind == Kind.Right:
                # assume Id.Right_SingleQuote
                num_end_tokens += 1
                tokens.append(self.cur_token)

            else:
                raise AssertionError(self.cur_token)

            if self.token_kind != Kind.Right:
                num_end_tokens = 0  # we need three in a ROW

        if expected_end_tokens == 1:
            tokens.pop()
        elif expected_end_tokens == 3:  # Get rid of spurious end tokens
            tokens.pop()
            tokens.pop()
            tokens.pop()

        # Remove space from ''' r''' $''' in both expression mode and command
        # mode
        if left_token.id in (Id.Left_TSingleQuote, Id.Left_RTSingleQuote,
                             Id.Left_UTSingleQuote, Id.Left_BTSingleQuote):
            word_compile.RemoveLeadingSpaceSQ(tokens)

        # Validation after lexing - same 2 checks in j8.LexerDecoder
        is_u_string = left_token.id in (Id.Left_USingleQuote,
                                        Id.Left_UTSingleQuote)

        for tok in tokens:
            # u'\yff' is not valid, but b'\yff' is
            if is_u_string and tok.id == Id.Char_YHex:
                p_die(
                    r"%s escapes not allowed in u'' strings" %
                    lexer.TokenVal(tok), tok)

        out_tokens.extend(tokens)
        return self.cur_token

    def _ReadDoubleQuotedLeftParts(self):
        # type: () -> word_part_t
        """Read substitution parts in a double quoted context."""
        if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick):
            return self._ReadCommandSub(self.token_type, d_quoted=True)

        if self.token_type == Id.Left_DollarBrace:
            return self._ReadBracedVarSub(self.cur_token, d_quoted=True)

        if self.token_type == Id.Left_DollarDParen:
            # TODO: Uncomment this after another regtest/aports run
            # if (self.LookAheadDParens(shift_back=1)):
            return self._ReadArithSub()
            # else:
            #     Mutate token - we treat this '$((' as '$( ('
            #     self.cur_token.id = Id.Left_DollarParen
            #     return self._ReadCommandSub(Id.Left_DollarParen, d_quoted=True)

        if self.token_type == Id.Left_DollarBracket:
            return self._ReadExprSub(lex_mode_e.DQ)

        if self.token_type == Id.Left_DollarBraceZsh:
            return self._ReadZshVarSub(self.cur_token)

        raise AssertionError(self.cur_token)

    def _ReadYshSingleQuoted(self, left_id):
        # type: (Id_t) -> CompoundWord
        """Read YSH style strings

        r'' u'' b''
        r''' ''' u''' ''' b''' '''
        """
        #log('BEF self.cur_token %s', self.cur_token)
        if left_id == Id.Left_RSingleQuote:
            lexer_mode = lex_mode_e.SQ_Raw
            triple_left_id = Id.Left_RTSingleQuote
        elif left_id == Id.Left_USingleQuote:
            lexer_mode = lex_mode_e.J8_Str
            triple_left_id = Id.Left_UTSingleQuote
        elif left_id == Id.Left_BSingleQuote:
            lexer_mode = lex_mode_e.J8_Str
            triple_left_id = Id.Left_BTSingleQuote
        else:
            raise AssertionError(left_id)

        # Needed for syntax checks
        left_tok = self.cur_token
        left_tok.id = left_id

        sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)

        if (len(sq_part.sval) == 0 and self.lexer.ByteLookAhead() == "'"):
            self._SetNext(lex_mode_e.ShCommand)
            self._GetToken()

            assert self.token_type == Id.Left_SingleQuote
            # HACK: magically transform the third ' in u''' to
            # Id.Left_UTSingleQuote, so that ''' is the terminator
            left_tok = self.cur_token
            left_tok.id = triple_left_id

            # Handles stripping leading whitespace
            sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)

        # Advance and validate
        self._SetNext(lex_mode_e.ShCommand)

        self._GetToken()
        if self.token_kind not in KINDS_THAT_END_WORDS:
            p_die('Unexpected token after YSH single-quoted string',
                  self.cur_token)

        return CompoundWord([sq_part])

    def _ReadUnquotedLeftParts(self, triple_out):
        # type: (Optional[BoolParamBox]) -> word_part_t
        """Read substitutions and quoted strings (for lex_mode_e.ShCommand).

        If triple_out is set, then we try parsing triple quoted strings,
        and set its value to True if we got one.
        """
        if self.token_type in (Id.Left_DoubleQuote,
                               Id.Left_DollarDoubleQuote):
            # Note: $"" is a synonym for "".  It might make sense if it added
            # \n \0 \x00 \u{123} etc.  But that's not what bash does!
            dq_part = self._ReadDoubleQuoted(self.cur_token)
            # Got empty word "" and there's a " after
            if (triple_out and len(dq_part.parts) == 0 and
                    self.lexer.ByteLookAhead() == '"'):

                self._SetNext(lex_mode_e.ShCommand)
                self._GetToken()
                # HACK: magically transform the third " in """ to
                # Id.Left_TDoubleQuote, so that """ is the terminator
                left_dq_token = self.cur_token
                left_dq_token.id = Id.Left_TDoubleQuote
                triple_out.b = True  # let caller know we got it
                return self._ReadDoubleQuoted(left_dq_token)

            return dq_part

        if self.token_type in (Id.Left_SingleQuote, Id.Left_RSingleQuote,
                               Id.Left_DollarSingleQuote):
            if self.token_type == Id.Left_SingleQuote:
                lexer_mode = lex_mode_e.SQ_Raw
                triple_left_id = Id.Left_TSingleQuote
            elif self.token_type == Id.Left_RSingleQuote:
                lexer_mode = lex_mode_e.SQ_Raw
                triple_left_id = Id.Left_RTSingleQuote
            else:
                lexer_mode = lex_mode_e.SQ_C
                # there is no such thing as $'''
                triple_left_id = Id.Undefined_Tok

            sq_part = self._ReadSingleQuoted(self.cur_token, lexer_mode)

            # Got empty '' or r'' and there's a ' after
            # u'' and b'' are handled in _ReadYshSingleQuoted
            if (triple_left_id != Id.Undefined_Tok and
                    triple_out is not None and len(sq_part.sval) == 0 and
                    self.lexer.ByteLookAhead() == "'"):

                self._SetNext(lex_mode_e.ShCommand)
                self._GetToken()

                # HACK: magically transform the third ' in ''' to
                # Id.Left_TSingleQuote, so that ''' is the terminator
                left_sq_token = self.cur_token
                left_sq_token.id = triple_left_id

                triple_out.b = True  # let caller know we got it
                return self._ReadSingleQuoted(left_sq_token, lexer_mode)

            return sq_part

        if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick,
                               Id.Left_ProcSubIn, Id.Left_ProcSubOut):
            return self._ReadCommandSub(self.token_type, d_quoted=False)

        if self.token_type == Id.Left_DollarBrace:
            return self._ReadBracedVarSub(self.cur_token, d_quoted=False)

        if self.token_type == Id.Left_DollarDParen:
            # TODO: Uncomment this after another regtest/aports run
            # if (self.LookAheadDParens(shift_back=1)):
            return self._ReadArithSub()
            # else:
            #     Mutate token - we treat this '$((' as '$( ('
            #     self.cur_token.id = Id.Left_DollarParen
            #     return self._ReadCommandSub(Id.Left_DollarParen, d_quoted=True)

        if self.token_type == Id.Left_DollarBracket:
            return self._ReadExprSub(lex_mode_e.ShCommand)

        if self.token_type == Id.Left_DollarBraceZsh:
            return self._ReadZshVarSub(self.cur_token)

        raise AssertionError(self.cur_token)

    def _ReadExtGlob(self):
        # type: () -> word_part.ExtGlob
        """
        Grammar:
          Item    = CompoundWord | EPSILON  # important: @(foo|) is allowed
          LEFT    = '@(' | '*(' | '+(' | '?(' | '!('
          RIGHT   = ')'
          ExtGlob = LEFT (Item '|')* Item RIGHT  # ITEM may be empty
          Compound includes ExtGlob
        """
        left_token = self.cur_token
        right_token = None  # type: Token
        arms = []  # type: List[CompoundWord]

        self.lexer.PushHint(Id.Op_RParen, Id.Right_ExtGlob)
        self._SetNext(lex_mode_e.ExtGlob)  # advance past LEFT

        read_word = False  # did we just read a word?  To handle @(||).
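        # e.g. @(foo|) yields the arms [foo, empty] and @(||) yields three
        # empty arms; the read_word flag is what lets empty arms through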

        while True:
            self._GetToken()

            if self.token_type == Id.Right_ExtGlob:
                if not read_word:
                    arms.append(CompoundWord([]))
                right_token = self.cur_token
                break

            elif self.token_type == Id.Op_Pipe:
                if not read_word:
                    arms.append(CompoundWord([]))
                read_word = False
                self._SetNext(lex_mode_e.ExtGlob)

            # lex_mode_e.ExtGlob should only produce these 4 kinds of tokens
            elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub,
                                     Kind.ExtGlob):
                w = self._ReadCompoundWord(lex_mode_e.ExtGlob)
                arms.append(w)
                read_word = True

            elif self.token_kind == Kind.Eof:
                p_die('Unexpected EOF reading extended glob that began here',
                      left_token)

            else:
                raise AssertionError(self.cur_token)

        return word_part.ExtGlob(left_token, arms, right_token)

    def _ReadBashRegexGroup(self):
        # type: () -> word_part.BashRegexGroup
        """
        Grammar:
          BashRegexGroup = '(' WORD? ')'
        """
        left_token = self.cur_token
        assert left_token.id == Id.BashRegex_LParen, left_token

        arms = []  # type: List[CompoundWord]

        self.lexer.PushHint(Id.Op_RParen, Id.Right_BashRegexGroup)
        self._SetNext(lex_mode_e.BashRegexFakeInner)  # advance past LEFT

        self._GetToken()
        if self.token_type == Id.Right_BashRegexGroup:  # empty ()
            return word_part.BashRegexGroup(left_token, None, self.cur_token)

        # lex_mode_e.BashRegex should only produce these 4 kinds of tokens
        if self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub, Kind.BashRegex):
            # Fake lexer mode that translates Id.WS_Space to Id.Lit_Chars
            # To allow bash style [[ s =~ (a b) ]]
            w = self._ReadCompoundWord(lex_mode_e.BashRegexFakeInner)
            arms.append(w)

            self._GetToken()
            if self.token_type != Id.Right_BashRegexGroup:
                p_die('Expected ) to close bash regex group', self.cur_token)

            return word_part.BashRegexGroup(left_token, w, self.cur_token)

        p_die('Expected word after ( opening bash regex group', self.cur_token)

    def _ReadLikeDQ(self, left_token, is_ysh_expr, out_parts):
        # type: (Optional[Token], bool, List[word_part_t]) -> None
        """
        Args:
          left_token: A token if we are reading a double quoted part, or None
            if we're reading a here doc.
          is_ysh_expr: Whether to disallow backticks and invalid char escapes
          out_parts: list of word_part to append to
        """
        if left_token:
            if left_token.id in (Id.Left_TDoubleQuote,
                                 Id.Left_DollarTDoubleQuote):
                expected_end_tokens = 3
            else:
                expected_end_tokens = 1
        else:
            expected_end_tokens = 1000  # here doc will break

        num_end_tokens = 0
        while num_end_tokens < expected_end_tokens:
            self._SetNext(lex_mode_e.DQ)
            self._GetToken()

            if self.token_kind == Kind.Lit:
                if self.token_type == Id.Lit_EscapedChar:
                    tok = self.cur_token
                    ch = lexer.TokenSliceLeft(tok, 1)
                    part = word_part.EscapedLiteral(tok,
                                                    ch)  # type: word_part_t
                else:
                    if self.token_type == Id.Lit_BadBackslash:
                        # echo "\z" is OK in shell, but x = "\z" is a syntax
                        # error in YSH.
                        # Slight hole: We don't catch x = ${undef:-"\z"}
                        # because of the recursion (unless no_parse_backslash)
                        if (is_ysh_expr or
                                self.parse_opts.no_parse_backslash()):
                            p_die(
                                "Invalid char escape in double quoted string (OILS-ERR-12)",
                                self.cur_token)
                    elif self.token_type == Id.Lit_Dollar:
                        if is_ysh_expr or self.parse_opts.no_parse_dollar():
                            p_die("Literal $ should be quoted like \$",
                                  self.cur_token)

                    part = self.cur_token
                out_parts.append(part)

            elif self.token_kind == Kind.Left:
                if self.token_type == Id.Left_Backtick and is_ysh_expr:
                    p_die("Backtick should be $(cmd) or \\` (OILS-ERR-18)",
                          self.cur_token)

                part = self._ReadDoubleQuotedLeftParts()
                out_parts.append(part)

            elif self.token_kind == Kind.VSub:
                tok = self.cur_token
                part = SimpleVarSub(tok)
                out_parts.append(part)
                # NOTE: parsing "$f(x)" would BREAK CODE.  Could add a mode
                # for it later.

            elif self.token_kind == Kind.Right:
                assert self.token_type == Id.Right_DoubleQuote, self.token_type
                if left_token:
                    num_end_tokens += 1

                # In a here doc, the right quote is literal!
                out_parts.append(self.cur_token)

            elif self.token_kind == Kind.Eof:
                if left_token:
                    p_die(
                        'Unexpected EOF reading double-quoted string that began here',
                        left_token)
                else:  # here docs will have an EOF in their token stream
                    break

            else:
                raise AssertionError(self.cur_token)

            if self.token_kind != Kind.Right:
                num_end_tokens = 0  # """ must be CONSECUTIVE

        if expected_end_tokens == 1:
            out_parts.pop()
        elif expected_end_tokens == 3:
            out_parts.pop()
            out_parts.pop()
            out_parts.pop()

        # Remove space from """ in both expression mode and command mode
        if (left_token and left_token.id
                in (Id.Left_TDoubleQuote, Id.Left_DollarTDoubleQuote)):
            word_compile.RemoveLeadingSpaceDQ(out_parts)

        # Return nothing, since we appended to 'out_parts'

    def _ReadDoubleQuoted(self, left_token):
        # type: (Token) -> DoubleQuoted
        """Helper function for "hello $name".

        Args:
          eof_type: for stopping at }, Id.Lit_RBrace
          here_doc: Whether we are reading in a here doc context

        Also ${foo%%a b c}  # treat this as double quoted.  until you hit
        """
        parts = []  # type: List[word_part_t]
        self._ReadLikeDQ(left_token, False, parts)

        right_quote = self.cur_token
        return DoubleQuoted(left_token, parts, right_quote)

    def ReadDoubleQuoted(self, left_token, parts):
        # type: (Token, List[word_part_t]) -> Token
        """For expression mode.

        Read var x = "${dir:-}/$name"; etc.
        """
        self._ReadLikeDQ(left_token, True, parts)
        return self.cur_token

    def _ReadCommandSub(self, left_id, d_quoted=False):
        # type: (Id_t, bool) -> CommandSub
        """
        NOTE: This is not in the grammar, because word parts aren't in the
        grammar!

        command_sub = '$(' command_list ')'
                    | '@(' command_list ')'
                    | '<(' command_list ')'
                    | '>(' command_list ')'
                    | ` command_list `
        """
        left_token = self.cur_token

        # Set the lexer in a state so ) becomes the EOF token.
        if left_id in (Id.Left_DollarParen, Id.Left_AtParen,
                       Id.Left_ProcSubIn, Id.Left_ProcSubOut):
            self._SetNext(lex_mode_e.ShCommand)  # advance past $( etc.

            right_id = Id.Eof_RParen
            self.lexer.PushHint(Id.Op_RParen, right_id)
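            # With this hint, the ) that balances $( comes back as Eof_RParen,
            # so the sub-parser below stops exactly there (the same PushHint
            # pattern is used for extended globs and arith subs in this file)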
            c_parser = self.parse_ctx.MakeParserForCommandSub(
                self.line_reader, self.lexer, right_id)
            # NOTE: This doesn't use something like main_loop because we don't
            # want to interleave parsing and execution!  Unlike 'source' and
            # 'eval'.
            node = c_parser.ParseCommandSub()

            right_token = c_parser.w_parser.cur_token

        elif left_id == Id.Left_Backtick and self.parse_ctx.do_lossless:
            # NOTE: This is an APPROXIMATE solution for translation ONLY.  See
            # test/osh2oil.

            right_id = Id.Eof_Backtick
            self.lexer.PushHint(Id.Left_Backtick, right_id)
            c_parser = self.parse_ctx.MakeParserForCommandSub(
                self.line_reader, self.lexer, right_id)
            node = c_parser.ParseCommandSub()
            right_token = c_parser.w_parser.cur_token

        elif left_id == Id.Left_Backtick:
            if self.parse_opts.no_parse_backticks():
                p_die(
                    'Backtick should be $(cmd) or \\` (no_parse_backticks, OILS-ERR-18)',
                    left_token)

            self._SetNext(lex_mode_e.Backtick)  # advance past `

            parts = []  # type: List[str]
            while True:
                self._GetToken()
                #log("TOK %s", self.cur_token)

                if self.token_type == Id.Backtick_Quoted:
                    # Remove leading \
                    parts.append(lexer.TokenSliceLeft(self.cur_token, 1))

                elif self.token_type == Id.Backtick_DoubleQuote:
                    # Compatibility: If backticks are double quoted, then
                    # double quotes within them have to be \"
                    # Shells aren't smart enough to match nested " and `
                    # quotes (but OSH is)
                    if d_quoted:
                        # Remove leading \
                        parts.append(lexer.TokenSliceLeft(self.cur_token, 1))
                    else:
                        parts.append(lexer.TokenVal(self.cur_token))

                elif self.token_type == Id.Backtick_Other:
                    parts.append(lexer.TokenVal(self.cur_token))

                elif self.token_type == Id.Backtick_Right:
                    break

                elif self.token_type == Id.Eof_Real:
                    # Note: this parse error is in the ORIGINAL context.  No
                    # code_str yet.
                    p_die('Unexpected EOF while looking for closing backtick',
                          left_token)

                else:
                    raise AssertionError(self.cur_token)

                self._SetNext(lex_mode_e.Backtick)

            # Calculate right SPID on CommandSub BEFORE re-parsing.
            right_token = self.cur_token

            code_str = ''.join(parts)
            #log('code %r', code_str)

            # Save lines into a new, temporary arena, so SnipCodeBlock() isn't
            # messed up.  Note: This is similar to how we parse aliases in
            # osh/cmd_parse.py.  It won't have the same location info as
            # MakeParserForCommandSub(), because the reader is different.
            arena = alloc.Arena()
            # TODO: arena.PushSource()?

            line_reader = reader.StringLineReader(code_str, arena)
            c_parser = self.parse_ctx.MakeOshParser(line_reader)
            src = source.Reparsed('backticks', left_token, right_token)
            with alloc.ctx_SourceCode(arena, src):
                node = c_parser.ParseCommandSub()

        else:
            raise AssertionError(left_id)

        return CommandSub(left_token, node, right_token)

    def _ReadExprSub(self, lex_mode):
        # type: (lex_mode_t) -> ExprSub
        """$[d->key] $[obj.method()] etc."""
        left_token = self.cur_token

        self._SetNext(lex_mode_e.Expr)
        enode, right_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.ysh_expr_sub)

        self._SetNext(lex_mode)  # Move past ]
        return ExprSub(left_token, enode, right_token)

    def ParseVarDecl(self, kw_token):
        # type: (Token) -> VarDecl
        """
        oil_var_decl: name_type_list '=' testlist end_stmt

        Note that assignments must end with \n ; } or EOF.  Unlike shell
        assignments, we disallow:

        var x = 42 | wc -l
        var x = 42 && echo hi
        """
        self._SetNext(lex_mode_e.Expr)
        enode, last_token = self.parse_ctx.ParseVarDecl(kw_token, self.lexer)
        # Hack to move } from what the Expr lexer modes gives to what
        # CommandParser wants
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace

        # Let the CommandParser see the Op_Semi or Op_Newline.
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)  # always back to this
        return enode

    def ParseMutation(self, kw_token, var_checker):
        # type: (Token, VarChecker) -> Mutation
        """
        setvar i = 42
        setvar i += 1
        setvar a[i] = 42
        setvar a[i] += 1
        setvar d.key = 42
        setvar d.key += 1
        """
        self._SetNext(lex_mode_e.Expr)
        enode, last_token = self.parse_ctx.ParseMutation(kw_token, self.lexer)
        # Hack to move } from what the Expr lexer modes gives to what
        # CommandParser wants
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace

        for lhs in enode.lhs:
            UP_lhs = lhs
            with tagswitch(lhs) as case:
                if case(y_lhs_e.Var):
                    lhs = cast(Token, UP_lhs)
                    var_checker.Check(kw_token.id, lexer.LazyStr(lhs), lhs)

                # Note: this does not cover cases like
                #   setvar (a[0])[1] = v
                #   setvar (d.key).other = v
                # This leaks into catching all typos statically, which may be
                # possible if 'use' makes all names explicit.
                elif case(y_lhs_e.Subscript):
                    lhs = cast(Subscript, UP_lhs)
                    if lhs.obj.tag() == expr_e.Var:
                        v = cast(expr.Var, lhs.obj)
                        var_checker.Check(kw_token.id, v.name, v.left)

                elif case(y_lhs_e.Attribute):
                    lhs = cast(Attribute, UP_lhs)
                    if lhs.obj.tag() == expr_e.Var:
                        v = cast(expr.Var, lhs.obj)
                        var_checker.Check(kw_token.id, v.name, v.left)

        # Let the CommandParser see the Op_Semi or Op_Newline.
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)  # always back to this
        return enode

    def ParseBareDecl(self):
        # type: () -> expr_t
        """
        x = {name: val}
        """
        self._SetNext(lex_mode_e.Expr)
        self._GetToken()
        enode, last_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.command_expr)
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)
        return enode

    def ParseYshExprForCommand(self):
        # type: () -> expr_t

        # Fudge for this case
        #  for x in(y) {
        # versus
        #  for x in (y) {
        #
        # In the former case, ReadWord on 'in' puts the lexer past (.
        # Also see LookPastSpace in CommandParser.
        # A simpler solution would be nicer.

        if self.token_type == Id.Op_LParen:
            self.lexer.MaybeUnreadOne()

        enode, _ = self.parse_ctx.ParseYshExpr(self.lexer, grammar_nt.ysh_expr)

        self._SetNext(lex_mode_e.ShCommand)
        return enode

    def ParseCommandExpr(self):
        # type: () -> expr_t
        """
        = 1+2
        """
        enode, last_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.command_expr)

        # In some cases, such as the case statement, we expect *the lexer* to
        # be pointing at the token right after the expression.  But the
        # expression parser must have read to the `last_token`.  Unreading
        # places the lexer back in the expected state.  Ie:
        #
        # case (x) {                       case (x) {
        #   (else) { = x }                   (else) { = x }
        #          ^ The lexer is here              ^ Unread to here
        # }                                }
        assert last_token.id in (Id.Op_Newline, Id.Eof_Real, Id.Op_Semi,
                                 Id.Op_RBrace), last_token
        if last_token.id != Id.Eof_Real:
            # Eof_Real is the only token we cannot unread
            self.lexer.MaybeUnreadOne()

        return enode

    def ParseProc(self, node):
        # type: (Proc) -> None

        # proc name-with-hyphens() must be accepted
        self._SetNext(lex_mode_e.ShCommand)
        self._GetToken()
        # example: 'proc f[' gets you Lit_ArrayLhsOpen
        if self.token_type != Id.Lit_Chars:
            p_die('Invalid proc name %s' % ui.PrettyToken(self.cur_token),
                  self.cur_token)

        # TODO: validate this more.  Disallow proc 123 { }, which isn't
        # disallowed for shell functions.  Similar to IsValidVarName().
        node.name = self.cur_token

        last_token = self.parse_ctx.ParseProc(self.lexer, node)

        # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
        assert last_token.id == Id.Op_LBrace
        last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        self._SetNext(lex_mode_e.ShCommand)

    def ParseFunc(self, node):
        # type: (Func) -> None
        last_token = self.parse_ctx.ParseFunc(self.lexer, node)

        # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
        assert last_token.id == Id.Op_LBrace
        last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        self._SetNext(lex_mode_e.ShCommand)

    def ParseYshCasePattern(self):
        # type: () -> Tuple[pat_t, Token]
        pat, left_tok, last_token = self.parse_ctx.ParseYshCasePattern(
            self.lexer)

        if last_token.id == Id.Op_LBrace:
            last_token.id = Id.Lit_LBrace
            self.buffered_word = last_token

        return pat, left_tok

    def NewlineOkForYshCase(self):
        # type: () -> Id_t
        """Check for optional newline and consume it.

        This is a special case of `_NewlineOk` which fixes some "off-by-one"
        issues that crop up while parsing Ysh Case Arms.  For more details,
        see #oil-dev > Progress On YSH Case Grammar on zulip.

        Returns a token id which is filled with the choice of

            word  { echo word }
            (3)   { echo expr }
            /e/   { echo eggex }
        }         # right brace
        """
        while True:
            next_id = self.lexer.LookAheadOne(lex_mode_e.Expr)

            # Cannot lookahead past lines
            if next_id == Id.Unknown_Tok:
                if not self.lexer.MoveToNextLine():  # Try to move to next line
                    break  # EOF
                continue

            next_kind = consts.GetKind(next_id)
            if next_id != Id.Op_Newline and next_kind != Kind.Ignored:
                break

            self.lexer.Read(lex_mode_e.Expr)

        if next_id in (Id.Op_RBrace, Id.Op_LParen, Id.Arith_Slash):
            self._SetNext(lex_mode_e.Expr)  # Continue in expression mode
        else:
            # Consume the trailing Op_Newline
            self._SetNext(lex_mode_e.ShCommand)
            self._GetToken()

        return next_id

    def _ReadArithExpr(self, end_id):
        # type: (Id_t) -> arith_expr_t
        """Read and parse an arithmetic expression in various contexts.

        $(( 1+2 ))
        (( a=1+2 ))
        ${a[ 1+2 ]}
        ${a : 1+2 : 1+2}

        See tests/arith-context.test.sh for ambiguous cases.

        ${a[a[0]]} is valid  # VS_RBRACKET vs Id.Arith_RBracket

        ${s : a<b?0:1 : 1}  # VS_COLON vs Id.Arith_Colon

        See the assertion in ArithParser.Parse() -- unexpected extra input.
        """
        # calls self.ReadWord(lex_mode_e.Arith)
        anode = self.a_parser.Parse()
        cur_id = self.a_parser.CurrentId()
        if end_id != Id.Undefined_Tok and cur_id != end_id:
            p_die(
                'Unexpected token after arithmetic expression (%s != %s)' %
                (ui.PrettyId(cur_id), ui.PrettyId(end_id)),
                loc.Word(self.a_parser.cur_word))
        return anode

    def _ReadArithSub(self):
        # type: () -> word_part.ArithSub
        """Read an arith substitution, which contains an arith expression,
        e.g. $((a + 1)).
        """
        left_tok = self.cur_token

        # The second one needs to be disambiguated in stuff like:
        #   $(echo $(( 1+2 )) )
        self.lexer.PushHint(Id.Op_RParen, Id.Right_DollarDParen)

        # NOTE: To disambiguate $(( as arith sub vs. command sub and subshell,
        # we could save the lexer/reader state here, and retry if the
        # arithmetic parse fails.  But we can almost always catch this at
        # parse time.  There could be some exceptions like:
        #   $((echo * foo))  # looks like multiplication
        #   $((echo / foo))  # looks like division

        # $(( )) is valid
        anode = arith_expr.EmptyZero  # type: arith_expr_t

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            anode = self._ReadArithExpr(Id.Arith_RParen)

        self._SetNext(lex_mode_e.ShCommand)

        # Ensure we get closing )
        self._GetToken()
        if self.token_type != Id.Right_DollarDParen:
            p_die('Expected second ) to end arith sub', self.cur_token)

        right_tok = self.cur_token
        return word_part.ArithSub(left_tok, anode, right_tok)

    def ReadDParen(self):
        # type: () -> Tuple[arith_expr_t, Token]
        """Read ((1+ 2))  -- command context.

        We're using the word parser because it's very similar to
        _ReadArithExpr above.

        This also returns the terminating Id.Op_DRightParen token for location
        info.
        """
        # (( )) is valid
        anode = arith_expr.EmptyZero  # type: arith_expr_t

        self.lexer.PushHint(Id.Op_RParen, Id.Op_DRightParen)

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            anode = self._ReadArithExpr(Id.Arith_RParen)

        self._SetNext(lex_mode_e.ShCommand)

        # Ensure we get the second )
        self._GetToken()
        right = self.cur_token
        if right.id != Id.Op_DRightParen:
            p_die('Expected second ) to end arith statement', right)

        self._SetNext(lex_mode_e.ShCommand)

        return anode, right

    def _NextNonSpace(self):
        # type: () -> None
        """Advance in lex_mode_e.Arith until non-space token.

        Same logic as _ReadWord, but used in
           $(( ))
           (( ))
           for (( ))

        You can read self.token_type after this, without calling _GetToken.
        """
        while True:
            self._SetNext(lex_mode_e.Arith)
            self._GetToken()
            if self.token_kind not in (Kind.Ignored, Kind.WS):
                break

    def ReadForExpression(self):
        # type: () -> command.ForExpr
        """Read ((i=0; i<5; ++i)) -- part of command context."""
        self._NextNonSpace()  # skip over ((
        cur_id = self.token_type  # for end of arith expressions

        if cur_id == Id.Arith_Semi:  # for (( ; i < 10; i++ ))
            init_node = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            init_node = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()
        self._NextNonSpace()

        # It's odd to keep track of both cur_id and self.token_type in this
        # function, but it works, and is tested in 'test/parse_error.sh
        # arith-integration'
        if cur_id != Id.Arith_Semi:  # for (( x=0 b; ... ))
            p_die("Expected ; here", loc.Word(self.a_parser.cur_word))

        self._GetToken()
        cur_id = self.token_type

        if cur_id == Id.Arith_Semi:  # for (( ; ; i++ ))
            # empty condition is TRUE
            cond_node = arith_expr.EmptyOne  # type: arith_expr_t
        else:
            cond_node = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()

        if cur_id != Id.Arith_Semi:  # for (( x=0; x<5 b ))
            p_die("Expected ; here", loc.Word(self.a_parser.cur_word))

        self._NextNonSpace()
        if self.token_type == Id.Arith_RParen:  # for (( ; ; ))
            update_node = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            update_node = self._ReadArithExpr(Id.Arith_RParen)

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            p_die('Expected ) to end for loop expression', self.cur_token)
        self._SetNext(lex_mode_e.ShCommand)

        # redirects is None, will be assigned in CommandEvaluator
        node = command.ForExpr.CreateNull()
        node.init = init_node
        node.cond = cond_node
        node.update = update_node
        return node

    def _ReadArrayLiteral(self):
        # type: () -> word_part_t
        """a=(1 2 3)

        TODO: See osh/cmd_parse.py:164 for Id.Lit_ArrayLhsOpen, for a[x++]=1

        We want:

        A=(['x']=1 ["x"]=2 [$x$y]=3)

        Maybe allow this as a literal string?  Because I think I've seen it
        before?  Or maybe force people to patch to learn the rule.

        A=([x]=4)

        Starts with Lit_Other '[', and then it has Lit_ArrayLhsClose
        Maybe enforce that ALL have keys or NONE have keys.
        """
        self._SetNext(lex_mode_e.ShCommand)  # advance past (
        self._GetToken()
        if self.cur_token.id != Id.Op_LParen:
            p_die('Expected ( after =', self.cur_token)
        left_token = self.cur_token
        right_token = None  # type: Token

        # MUST use a new word parser (with same lexer).
        w_parser = self.parse_ctx.MakeWordParser(self.lexer, self.line_reader)
        words = []  # type: List[CompoundWord]
        done = False
        while not done:
            w = w_parser.ReadWord(lex_mode_e.ShCommand)
            with tagswitch(w) as case:
                if case(word_e.Operator):
                    tok = cast(Token, w)
                    if tok.id == Id.Right_Initializer:
                        right_token = tok
                        done = True  # can't use break here
                    # Unlike command parsing, array parsing allows embedded
                    # \n.
                    elif tok.id == Id.Op_Newline:
                        continue
                    else:
                        p_die('Unexpected token in array literal',
                              loc.Word(w))

                elif case(word_e.Compound):
                    words.append(cast(CompoundWord, w))

                else:
                    raise AssertionError()

        initializer_words = []  # type: List[InitializerWord_t]
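        # e.g. for a=(['k']=v ~/src {a,b}), the first word becomes an assoc
        # pair below, and the plain words get brace and tilde detection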
1731 for w in words:
1732 pair = word_.DetectAssocPair(w)
1733 if pair is not None:
1734 word_.TildeDetectAssign(pair.value) # pair.value is modified
1735 initializer_words.append(pair)
1736 else:
1737 w2 = braces.BraceDetect(w) # type: word_t
1738 if w2 is None:
1739 w2 = w
1740 w3 = word_.TildeDetect(w2) # type: word_t
1741 if w3 is None:
1742 w3 = w2
1743 initializer_words.append(InitializerWord.ArrayWord(w3))
1744
1745 # invariant List?
1746 return word_part.InitializerLiteral(left_token, initializer_words,
1747 right_token)
1748
1749 def ParseProcCallArgs(self, start_symbol):
1750 # type: (int) -> ArgList
1751 """ json write (x) """
1752 self.lexer.MaybeUnreadOne()
1753
1754 arg_list = ArgList.CreateNull(alloc_lists=True)
1755 arg_list.left = self.cur_token
1756 self.parse_ctx.ParseProcCallArgs(self.lexer, arg_list, start_symbol)
1757 return arg_list
1758
1759 def _MaybeReadWordPart(self, is_first, lex_mode, parts):
1760 # type: (bool, lex_mode_t, List[word_part_t]) -> bool
1761 """Helper for _ReadCompoundWord3."""
1762 done = False
1763
1764 if self.token_type == Id.Lit_EscapedChar:
1765 tok = self.cur_token
1766 assert tok.length == 2
1767 ch = lexer.TokenSliceLeft(tok, 1)
1768 if self.parse_opts.no_parse_backslash():
1769 if not pyutil.IsValidCharEscape(ch):
1770 p_die('Invalid char escape in unquoted word (OILS-ERR-13)',
1771 self.cur_token)
1772
1773 part = word_part.EscapedLiteral(self.cur_token,
1774 ch) # type: word_part_t
1775 else:
1776 part = self.cur_token
1777
1778 if is_first and self.token_type == Id.Lit_VarLike: # foo=
1779 parts.append(part)
1780 # Unfortunately it's awkward to pull the check for a=(1 2) up to
1781 # _ReadWord.
1782 next_id = self.lexer.LookPastSpace(lex_mode)
1783 if next_id == Id.Op_LParen:
1784 self.lexer.PushHint(Id.Op_RParen, Id.Right_Initializer)
1785 part2 = self._ReadArrayLiteral()
1786 parts.append(part2)
1787
1788 # Array literal must be the last part of the word.
1789 self._SetNext(lex_mode)
1790 self._GetToken()
1791 # EOF, whitespace, newline, Right_Subshell
1792 if self.token_kind not in KINDS_THAT_END_WORDS:
1793 p_die('Unexpected token after array literal',
1794 self.cur_token)
1795 done = True
1796
1797 elif (is_first and self.parse_opts.parse_at() and
1798 self.token_type == Id.Lit_Splice):
1799
1800 splice_tok = self.cur_token
1801 part2 = word_part.Splice(splice_tok,
1802 lexer.TokenSliceLeft(splice_tok, 1))
1803
1804 parts.append(part2)
1805
1806 # @words must be the last part of the word
1807 self._SetNext(lex_mode)
1808 self._GetToken()
1809 # EOF, whitespace, newline, Right_Subshell
1810 if self.token_kind not in KINDS_THAT_END_WORDS:
1811 p_die('Unexpected token after array splice', self.cur_token)
1812 done = True
1813
1814 elif (is_first and self.parse_opts.parse_at() and
1815 self.token_type == Id.Lit_AtLBracket): # @[split(x)]
1816 part2 = self._ReadExprSub(lex_mode_e.DQ)
1817 parts.append(part2)
1818
1819 # @[split(x)]
1820 self._SetNext(lex_mode)
1821 self._GetToken()
1822 # EOF, whitespace, newline, Right_Subshell
1823 if self.token_kind not in KINDS_THAT_END_WORDS:
1824 p_die('Unexpected token after Expr splice', self.cur_token)
1825 done = True
1826
1827 elif (is_first and self.parse_opts.parse_at() and
1828 self.token_type == Id.Lit_AtLBraceDot):
1829 p_die('TODO: @{.myproc builtin sub}', self.cur_token)
1830
1831 elif (is_first and self.parse_opts.parse_at_all() and
1832 self.token_type == Id.Lit_At):
1833 # Because $[x] ${x} and perhaps $/x/ are reserved, it makes sense for @
1834 # at the beginning of a word to be reserved.
1835
1836 # Although should we relax 'echo @' ? I'm tempted to have a shortcut for
1837 # @_argv and the like
1838 p_die('Literal @ starting a word must be quoted (parse_at_all)',
1839 self.cur_token)
1840
1841 else:
1842 # not a literal with lookahead; append it
1843 parts.append(part)
1844
1845 return done
1846
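# --- Hedged standalone sketch of the "must end the word" rule ---
# Array literals, @splices, and @[expr] subs have to be the LAST part
# of a word: the next token's kind must be in KINDS_THAT_END_WORDS.
# Kind names are plain strings here, for illustration only.
def ends_word(token_kind):
    # type: (str) -> bool
    return token_kind in ('Eof', 'WS', 'Op', 'Right')

assert ends_word('WS')       # a=(1 2) followed by a space: OK
assert not ends_word('Lit')  # a=(1 2)x: 'Unexpected token after array literal'
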
1847 def _ReadCompoundWord(self, lex_mode):
1848 # type: (lex_mode_t) -> CompoundWord
1849
1850 # ShCommand is the ONLY lexer mode that can return word.Redir, so it's excluded here
1851 assert lex_mode != lex_mode_e.ShCommand, lex_mode
1852
1853 w = self._ReadCompoundOrRedir(lex_mode)
1854 assert w.tag() == word_e.Compound, w
1855 return cast(CompoundWord, w)
1856
1857 def _ReadCompoundWord3(self, lex_mode, eof_type, empty_ok):
1858 # type: (lex_mode_t, Id_t, bool) -> CompoundWord
1859
1860 # ShCommand is the ONLY lexer mode that can return word.Redir, so it's excluded here
1861 assert lex_mode != lex_mode_e.ShCommand, lex_mode
1862
1863 w = self._ReadCompoundOrRedir3(lex_mode, eof_type, empty_ok)
1864 assert w.tag() == word_e.Compound, w
1865 return cast(CompoundWord, w)
1866
1867 def _ReadCompoundOrRedir(self, lex_mode):
1868 # type: (lex_mode_t) -> word_t
1869 """Returns either word.Compound or word.Redir"""
1870 return self._ReadCompoundOrRedir3(lex_mode, Id.Undefined_Tok, True)
1871
1872 def _ReadCompoundOrRedir3(self, lex_mode, eof_type, empty_ok):
1873 # type: (lex_mode_t, Id_t, bool) -> word_t
1874 """
1875 Precondition: Looking at the first token of the first word part
1876 Postcondition: Looking at the token after, e.g. space or operator
1877
1878 NOTE: eof_type is necessary because / is a literal, i.e. Lit_Slash, but it
1879 could be an operator delimiting a compound word. Can we change lexer modes
1880 and remove this special case?
1881
1882 Returns either word.Compound or word.Redir
1883 """
1884 w = CompoundWord([])
1885 num_parts = 0
1886 brace_count = 0
1887 done = False
1888 is_triple_quoted = None # type: Optional[BoolParamBox]
1889 saw_redir_left_tok = False
1890
1891 while not done:
1892 self._GetToken()
1893
1894 allow_done = empty_ok or num_parts != 0
1895 if allow_done and self.token_type == eof_type:
1896 done = True # e.g. for ${foo//pat/replace}
1897
1898 # Keywords like "for" are treated like literals
1899 elif self.token_kind in (Kind.Lit, Kind.History, Kind.KW,
1900 Kind.ControlFlow, Kind.BoolUnary,
1901 Kind.BoolBinary):
1902
1903 # Syntax error for { and }
1904 if self.token_type == Id.Lit_LBrace:
1905 brace_count += 1
1906 elif self.token_type == Id.Lit_RBrace:
1907 brace_count -= 1
1908 elif self.token_type == Id.Lit_Dollar:
1909 if self.parse_opts.no_parse_dollar():
1910 if num_parts == 0 and lex_mode == lex_mode_e.ShCommand:
1911 next_byte = self.lexer.ByteLookAhead()
1912 # TODO: switch lexer modes and parse $/d+/. But not ${a:-$/d+/}
1913 if next_byte == '/':
1914 #log('next_byte %r', next_byte)
1915 pass
1916
1917 p_die(
1918 'Literal $ should be quoted like \$ (no_parse_dollar)',
1919 self.cur_token)
1920 elif self.token_type in (Id.Lit_Number, Id.Lit_RedirVarName):
1921 saw_redir_left_tok = True
1922
1923 done = self._MaybeReadWordPart(num_parts == 0, lex_mode,
1924 w.parts)
1925
1926 elif self.token_kind == Kind.VSub:
1927 vsub_token = self.cur_token
1928
1929 part = SimpleVarSub(vsub_token) # type: word_part_t
1930 w.parts.append(part)
1931
1932 elif self.token_kind == Kind.ExtGlob:
1933 # If parse_at, we can take over @( to start @(seq 3)
1934 # Users can also use ,(*.py|*.sh)
1935 if (self.parse_opts.parse_at() and
1936 self.token_type == Id.ExtGlob_At and num_parts == 0):
1937 cs_part = self._ReadCommandSub(Id.Left_AtParen,
1938 d_quoted=False)
1939 # RARE mutation of tok.id!
1940 cs_part.left_token.id = Id.Left_AtParen
1941 part = cs_part # for type safety
1942
1943 # Same check as _MaybeReadWordPart. @(seq 3)x is illegal, just like
1944 # a=(one two)x and @arrayfunc(3)x.
1945 self._GetToken()
1946 if self.token_kind not in KINDS_THAT_END_WORDS:
1947 p_die('Unexpected token after @()', self.cur_token)
1948 done = True
1949
1950 else:
1951 if HAVE_FNM_EXTMATCH == 0:
1952 p_die(
1953 "Extended glob won't work without FNM_EXTMATCH support in libc",
1954 self.cur_token)
1955 part = self._ReadExtGlob()
1956 w.parts.append(part)
1957
1958 elif self.token_kind == Kind.BashRegex:
1959 if self.token_type == Id.BashRegex_LParen: # Opening (
1960 part = self._ReadBashRegexGroup()
1961 w.parts.append(part)
1962 else:
1963 assert self.token_type == Id.BashRegex_AllowedInParens
1964 p_die('Invalid token in bash regex', self.cur_token)
1965
1966 elif self.token_kind == Kind.Left:
1967 try_triple_quote = (self.parse_opts.parse_triple_quote() and
1968 lex_mode == lex_mode_e.ShCommand and
1969 num_parts == 0)
1970
1971 # Save allocation
1972 if try_triple_quote:
1973 is_triple_quoted = BoolParamBox(False)
1974
1975 part = self._ReadUnquotedLeftParts(is_triple_quoted)
1976 w.parts.append(part)
1977
1978 # NOT done yet, will advance below
1979 elif self.token_kind == Kind.Right:
1980 # Still part of the word; will be done on the next iter.
1981 if self.token_type == Id.Right_DoubleQuote:
1982 pass
1983 # Never happens, no PushHint for this case.
1984 #elif self.token_type == Id.Right_DollarParen:
1985 # pass
1986 elif self.token_type == Id.Right_Subshell:
1987 # LEXER HACK for (case x in x) ;; esac )
1988 # Rewind before it's used
1989 assert self.next_lex_mode == lex_mode_e.Undefined
1990 if self.lexer.MaybeUnreadOne():
1991 self.lexer.PushHint(Id.Op_RParen, Id.Right_Subshell)
1992 self._SetNext(lex_mode)
1993 done = True
1994 else:
1995 done = True
1996
1997 elif self.token_kind == Kind.Redir:
1998 # If the previous token could be the left_tok of a redirect
1999 # operator, attach it to a word.Redir and return that instead
2000 # of the CompoundWord.
2001
2002 # &> and &>> don't have a leading descriptor (2 is implied)
2003 if (saw_redir_left_tok and num_parts == 1 and self.token_type
2004 not in (Id.Redir_AndGreat, Id.Redir_AndDGreat)):
2005
2006 self._SetNext(lex_mode)
2007 left_tok = cast(Token, w.parts.pop())
2008 r = word.Redir(left_tok, self.cur_token)
2009 return r # EARLY RETURN
2010
2011 done = True
2012
2013 elif self.token_kind == Kind.Ignored:
2014 done = True
2015
2016 else:
2017 # LEXER HACK for unbalanced case clause. 'case foo in esac' is valid,
2018 # so while testing for ESAC, we may read ) before getting a chance to
2019 # PushHint(Id.Op_RParen, Id.Right_CasePat). So here we unread one
2020 # token and do it again.
2021
2022 # We get Id.Op_RParen at top level: case x in x) ;; esac
2023 # We get Id.Eof_RParen inside ComSub: $(case x in x) ;; esac )
2024 if self.token_type in (Id.Op_RParen, Id.Eof_RParen):
2025 # Rewind before it's used
2026 assert self.next_lex_mode == lex_mode_e.Undefined
2027 if self.lexer.MaybeUnreadOne():
2028 if self.token_type == Id.Eof_RParen:
2029 # Redo translation
2030 self.lexer.PushHint(Id.Op_RParen, Id.Eof_RParen)
2031 self._SetNext(lex_mode)
2032
2033 done = True # anything we don't recognize means we're done
2034
2035 if not done:
2036 self._SetNext(lex_mode)
2037 num_parts += 1
2038
2039 if (self.parse_opts.parse_brace() and num_parts > 1 and
2040 brace_count != 0):
2041 # accept { and }, but not foo{
2042 p_die(
2043 'Word has unbalanced { }. Maybe add a space or quote it like \{',
2044 loc.Word(w))
2045
2046 if is_triple_quoted and is_triple_quoted.b and num_parts > 1:
2047 p_die('Unexpected parts after triple quoted string',
2048 loc.WordPart(w.parts[-1]))
2049
2050 if 0:
2051 from _devbuild.gen.syntax_asdl import word_part_str
2052 word_key = ' '.join(word_part_str(p.tag()) for p in w.parts)
2053 WORD_HIST[word_key] += 1
2054
2055 # YSH word restriction
2056 # (r'' u'' b'' are stripped on shopt -s parse_ysh_string)
2057 if self.parse_opts.no_parse_word_join() and not _IsValidYshWord(w):
2058 p_die("Invalid quoted word part in YSH (OILS-ERR-17)",
2059 loc.WordPart(part))
2060
2061 return w
2062
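# --- Hedged standalone sketch of the early-return Redir case ---
# A word of exactly one part, where that part was a plausible left
# token (Lit_Number or Lit_RedirVarName), attaches to the redirect
# operator: 3>out or {fd}>out. The operators &> and &>> never take a
# prefix, since their descriptors are implied. Plain strings stand in
# for tokens.
def attaches_to_redir(num_parts, saw_left_tok, op):
    # type: (int, bool, str) -> bool
    return saw_left_tok and num_parts == 1 and op not in ('&>', '&>>')

assert attaches_to_redir(1, True, '>')       # 3>out -> word.Redir
assert not attaches_to_redir(1, True, '&>')  # &>out keeps implied fds
assert not attaches_to_redir(2, True, '>')   # multi-part word stays Compound
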
2063 def _ReadArithWord(self):
2064 # type: () -> Optional[word_t]
2065 """ Helper for ReadArithWord() """
2066 self._GetToken()
2067
2068 if self.token_kind == Kind.Unknown:
2069 # e.g. happened during dynamic parsing of unset 'a[$foo]' in gherkin
2070 p_die(
2071 'Unexpected token while parsing arithmetic: %r' %
2072 lexer.TokenVal(self.cur_token), self.cur_token)
2073
2074 elif self.token_kind == Kind.Eof:
2075 return self.cur_token
2076
2077 elif self.token_kind == Kind.Ignored:
2078 # Space should be ignored.
2079 self._SetNext(lex_mode_e.Arith)
2080 return None
2081
2082 elif self.token_kind in (Kind.Arith, Kind.Right):
2083 # Id.Right_DollarDParen IS just a normal token, handled by ArithParser
2084 self._SetNext(lex_mode_e.Arith)
2085 return self.cur_token
2086
2087 elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub):
2088 return self._ReadCompoundWord(lex_mode_e.Arith)
2089
2090 else:
2091 raise AssertionError(self.cur_token)
2092
2093 def _ReadWord(self, word_mode):
2094 # type: (lex_mode_t) -> Optional[word_t]
2095 """Helper function for ReadWord()."""
2096
2097 # Change the pseudo lexer mode to a real lexer mode
2098 if word_mode == lex_mode_e.ShCommandFakeBrack:
2099 lex_mode = lex_mode_e.ShCommand
2100 else:
2101 lex_mode = word_mode
2102
2103 self._GetToken()
2104
2105 if self.token_kind == Kind.Eof:
2106 # No advance
2107 return self.cur_token
2108
2109 elif self.token_kind == Kind.Redir:
2110 self._SetNext(lex_mode)
2111 # This is >out -- 3>out is handled below
2112 return word.Redir(None, self.cur_token)
2113
2114 # Allow Arith for ) at end of for loop?
2115 elif self.token_kind in (Kind.Op, Kind.Arith):
2116 self._SetNext(lex_mode)
2117
2118 # Newlines are complicated. See 3x2 matrix in the comment about
2119 # self.multiline and self.newline_state above.
2120 if self.token_type == Id.Op_Newline:
2121 if self.multiline:
2122 if self.newline_state > 1:
2123 # This points at a blank line, but at least it gives the line number
2124 p_die('Invalid blank line in multiline mode',
2125 self.cur_token)
2126 return None
2127
2128 if self.returned_newline: # skip
2129 return None
2130
2131 return self.cur_token
2132
2133 elif self.token_kind == Kind.Right:
2134 if self.token_type not in (Id.Right_Subshell, Id.Right_ShFunction,
2135 Id.Right_CasePat, Id.Right_Initializer):
2136 raise AssertionError(self.cur_token)
2137
2138 self._SetNext(lex_mode)
2139 return self.cur_token
2140
2141 elif self.token_kind in (Kind.Ignored, Kind.WS):
2142 self._SetNext(lex_mode)
2143 return None
2144
2145 else:
2146 assert self.token_kind in (Kind.VSub, Kind.Lit, Kind.History,
2147 Kind.Left, Kind.KW, Kind.ControlFlow,
2148 Kind.BoolUnary, Kind.BoolBinary,
2149 Kind.ExtGlob,
2150 Kind.BashRegex), 'Unhandled token kind'
2151
2152 if (word_mode == lex_mode_e.ShCommandFakeBrack and
2153 self.parse_opts.parse_bracket() and
2154 self.token_type == Id.Lit_LBracket):
2155 # Change [ from Kind.Lit -> Kind.Op
2156 # So CommandParser can treat
2157 # assert [42 === x]
2158 # like
2159 # json write (x)
2160 bracket_word = self.cur_token
2161 bracket_word.id = Id.Op_LBracket
2162
2163 self._SetNext(lex_mode)
2164 return bracket_word
2165
2166 # We're beginning a word. If we see Id.Lit_Pound, change to
2167 # lex_mode_e.Comment and read until end of line.
2168 if self.token_type == Id.Lit_Pound:
2169 self._SetNext(lex_mode_e.Comment)
2170 self._GetToken()
2171
2172 # NOTE: The # could be the last character in the file. It can't be
2173 # Eof_{RParen,Backtick} because #) and #` are comments.
2174 assert self.token_type in (Id.Ignored_Comment, Id.Eof_Real), \
2175 self.cur_token
2176
2177 # The next iteration will go into Kind.Ignored and set lex state to
2178 # lex_mode_e.ShCommand/etc.
2179 return None # tell ReadWord() to try again after comment
2180
2181 elif self.token_type == Id.Lit_TPound: ### doc comment
2182 self._SetNext(lex_mode_e.Comment)
2183 self._GetToken()
2184
2185 if self.token_type == Id.Ignored_Comment and self.emit_doc_token:
2186 return self.cur_token
2187
2188 return None # tell ReadWord() to try again after comment
2189
2190 else:
2191 # r'' u'' b'' at the beginning of a word
2192 if (self.token_type == Id.Lit_Chars and
2193 self.lexer.LookAheadOne(
2194 lex_mode_e.ShCommand) == Id.Left_SingleQuote):
2195
2196 # When shopt -s parse_ysh_string:
2197 # echo r'hi' is like echo 'hi'
2198 #
2199 # echo u'\u{3bc}' b'\yff' works
2200
2201 tok = self.cur_token
2202 if self.parse_opts.parse_ysh_string():
2203 if lexer.TokenEquals(tok, 'r'):
2204 left_id = Id.Left_RSingleQuote
2205 elif lexer.TokenEquals(tok, 'u'):
2206 left_id = Id.Left_USingleQuote
2207 elif lexer.TokenEquals(tok, 'b'):
2208 left_id = Id.Left_BSingleQuote
2209 else:
2210 left_id = Id.Undefined_Tok
2211
2212 if left_id != Id.Undefined_Tok:
2213 # skip the r, and then 'foo' will be read as normal
2214 self._SetNext(lex_mode_e.ShCommand)
2215
2216 self._GetToken()
2217 assert self.token_type == Id.Left_SingleQuote, self.token_type
2218
2219 # Read the word in a different lexer mode
2220 return self._ReadYshSingleQuoted(left_id)
2221
2222 return self._ReadCompoundOrRedir(lex_mode)
2223
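# --- Hedged standalone sketch of the r'' u'' b'' dispatch ---
# With shopt -s parse_ysh_string, a Lit_Chars token of r, u, or b that
# is immediately followed by a single quote selects a YSH string mode.
# The dict stands in for the if/elif chain; values name the real Ids.
YSH_STRING_PREFIX = {
    'r': 'Id.Left_RSingleQuote',  # r'hi' is like 'hi'
    'u': 'Id.Left_USingleQuote',  # u'\u{3bc}'
    'b': 'Id.Left_BSingleQuote',  # b'\yff'
}

def ysh_left_id(prefix):
    # type: (str) -> str
    return YSH_STRING_PREFIX.get(prefix, 'Id.Undefined_Tok')

assert ysh_left_id('r') == 'Id.Left_RSingleQuote'
assert ysh_left_id('z') == 'Id.Undefined_Tok'  # not a string prefix
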
2224 def ParseVarRef(self):
2225 # type: () -> BracedVarSub
2226 """DYNAMIC parsing of what's inside ${!ref}
2227
2228 # Same as VarOf production
2229 VarRefExpr = VarOf EOF
2230 """
2231 self._SetNext(lex_mode_e.VSub_1)
2232
2233 self._GetToken()
2234 if self.token_kind != Kind.VSub:
2235 p_die('Expected var name', self.cur_token)
2236
2237 part = self._ParseVarOf()
2238 # NOTE: no ${ } means no part.left and part.right
2239 part.left = part.name_tok # cheat to make test pass
2240 part.right = part.name_tok
2241
2242 self._GetToken()
2243 if self.token_type != Id.Eof_Real:
2244 p_die('Expected end of var ref expression', self.cur_token)
2245 return part
2246
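# Hedged usage note for ParseVarRef: bash-style ${!ref} takes the VALUE
# of ref and re-parses it as a variable reference at runtime, e.g.
#   ref='x[1]'; echo ${!ref}   # behaves like echo ${x[1]}
# which is why this is DYNAMIC parsing: one VarOf production, then
# Eof_Real must follow.
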
2247 def LookPastSpace(self):
2248 # type: () -> Id_t
2249 """Look ahead to the next token.
2250
2251 For the CommandParser to recognize
2252 array= (1 2 3)
2253 YSH for ( versus bash for ((
2254 YSH if ( versus if test
2255 YSH while ( versus while test
2256 YSH bare assignment 'grep =' versus 'grep foo'
2257 """
2258 assert self.token_type != Id.Undefined_Tok
2259 if self.cur_token.id == Id.WS_Space:
2260 id_ = self.lexer.LookPastSpace(lex_mode_e.ShCommand)
2261 else:
2262 id_ = self.cur_token.id
2263 return id_
2264
2265 def LookAheadDParens(self, shift_back=0):
2266 # type: (int) -> bool
2267 """Special lookahead for (( )), to make sure it's an arithmetic
2268 expression (i.e. that the closing parens are a single token, not
2269 separated by anything).
2270 """
2271 assert self.token_type in (Id.Op_DLeftParen, Id.Left_DollarDParen)
2272
2273 return self.lexer.LookAheadDParens(shift_back)
2274
2275 def LookAheadFuncParens(self):
2276 # type: () -> bool
2277 """Special lookahead for f( ) { echo hi; } to check for ( )"""
2278 assert self.token_type != Id.Undefined_Tok
2279
2280 # We have to handle 2 cases because we buffer a token
2281 if self.cur_token.id == Id.Op_LParen: # saw funcname(
2282 return self.lexer.LookAheadFuncParens(1) # go back one char
2283
2284 elif self.cur_token.id == Id.WS_Space: # saw funcname WHITESPACE
2285 return self.lexer.LookAheadFuncParens(0)
2286
2287 else:
2288 return False
2289
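# --- Hedged standalone sketch of the two buffered-token cases ---
# The parser always holds one token, so lookahead for `f( ) {` starts
# either one char back (the '(' was already consumed) or at the current
# position (we stopped on whitespace). Offsets are illustrative.
def func_parens_offset(buffered_id):
    # type: (str) -> int
    if buffered_id == 'Op_LParen':  # saw funcname(
        return 1  # go back one char
    if buffered_id == 'WS_Space':  # saw funcname SPACE
        return 0
    return -1  # not a shell function definition

assert func_parens_offset('Op_LParen') == 1
assert func_parens_offset('WS_Space') == 0
assert func_parens_offset('Lit_Chars') == -1
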
2290 def ReadWord(self, word_mode):
2291 # type: (lex_mode_t) -> word_t
2292 """Read the next word, using the given lexer mode.
2293
2294 This is a stateful wrapper for the stateless _ReadWord function.
2295 """
2296 assert word_mode in (lex_mode_e.ShCommand,
2297 lex_mode_e.ShCommandFakeBrack,
2298 lex_mode_e.DBracket, lex_mode_e.BashRegex)
2299
2300 if self.buffered_word: # For integration with pgen2
2301 w = self.buffered_word
2302 self.buffered_word = None
2303 else:
2304 while True:
2305 w = self._ReadWord(word_mode)
2306 if w is not None:
2307 break
2308
2309 self.returned_newline = (word_.CommandId(w) == Id.Op_Newline)
2310 return w
2311
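# --- Hedged standalone sketch of the retry loop pattern ---
# _ReadWord and _ReadArithWord return None for tokens that yield no
# word (comments, ignored whitespace, skipped newlines), so the public
# wrappers spin until a real word comes back. Generic illustration:
def read_until_word(read_one):
    # type: (...) -> object
    while True:
        w = read_one()
        if w is not None:
            return w

toks = iter([None, None, 'echo'])
assert read_until_word(lambda: next(toks)) == 'echo'
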
2312 def ReadArithWord(self):
2313 # type: () -> word_t
2314 while True:
2315 w = self._ReadArithWord()
2316 if w is not None:
2317 break
2318 return w
2319
2320 def ReadHereDocBody(self, parts):
2321 # type: (List[word_part_t]) -> None
2322 """
2323 A here doc is like a double quoted context, except " isn't special.
2324 """
2325 self._ReadLikeDQ(None, False, parts)
2326 # Returns nothing
2327
2328 def ReadForPlugin(self):
2329 # type: () -> CompoundWord
2330 """For $PS1, $PS4, etc.
2331
2332 This is just like reading a here doc line. "\n" is allowed, as
2333 well as the typical substitutions ${x} $(echo hi) $((1 + 2)).
2334 """
2335 w = CompoundWord([])
2336 self._ReadLikeDQ(None, False, w.parts)
2337 return w
2338
2339 def EmitDocToken(self, b):
2340 # type: (bool) -> None
2341 self.emit_doc_token = b
2342
2343 def Multiline(self, b):
2344 # type: (bool) -> None
2345 self.multiline = b
2346
2347
2348if 0:
2349 import collections
2350 WORD_HIST = collections.Counter()
2351
2352# vim: sw=4