# Copyright 2016 Andy Chu. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
"""
word_parse.py - Parse the shell word language.

Hairy example:

    hi$((1 + 2))"$(echo hi)"${var:-__"$(echo default)"__}

Substitutions can be nested, but which inner subs are allowed depends on the
outer sub.  Notes:

lex_mode_e.ShCommand (_ReadUnquotedLeftParts)
  All subs and quotes are allowed:
    $v ${v}   $() ``   $(())   '' ""   $'' $""   <() >()

lex_mode_e.DQ (_ReadDoubleQuotedLeftParts)
  Var, Command, Arith, but no quotes.
    $v ${v}   $() ``   $(())
  No process substitution.

lex_mode_e.Arith
  Similar to DQ: Var, Command, and Arith sub, but no process sub.  bash
  doesn't allow quotes, but OSH does.  We allow ALL FOUR kinds of quotes,
  because we need those for associative array indexing.

lex_mode_e.VSub_ArgUnquoted
  Like ShCommand, everything is allowed (even process substitutions), but we
  stop at }, and space is SIGNIFICANT.

  Example: ${a:- b }

  ${X:-$v}  ${X:-${v}}  ${X:-$(echo hi)}  ${X:-`echo hi`}  ${X:-$((1+2))}
  ${X:-'single'}  ${X:-"double"}  ${X:-$'\n'}  ${X:-<(echo hi)}

lex_mode_e.VSub_ArgDQ
  In contrast to DQ, VSub_ArgDQ accepts nested "" and $'' and $"", e.g.
  "${x:-"default"}".

  In contrast, VSub_ArgUnquoted respects single quotes and process
  substitution.

  It's weird that double quotes are allowed.  Space is also significant here,
  e.g. "${x:-a "b"}".
"""

from _devbuild.gen import grammar_nt
from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str, Kind
from _devbuild.gen.types_asdl import (lex_mode_t, lex_mode_e)
from _devbuild.gen.syntax_asdl import (
    BoolParamBox,
    Token,
    SimpleVarSub,
    loc,
    source,
    DoubleQuoted,
    SingleQuoted,
    BracedVarSub,
    CommandSub,
    InitializerWord,
    InitializerWord_t,
    bracket_op,
    bracket_op_t,
    suffix_op,
    suffix_op_t,
    rhs_word,
    rhs_word_e,
    rhs_word_t,
    word_e,
    word_t,
    CompoundWord,
    word_part,
    word_part_t,
    y_lhs_e,
    arith_expr_t,
    command,
    expr,
    expr_e,
    expr_t,
    pat_t,
    ArgList,
    Proc,
    Func,
    Subscript,
    Attribute,
    arith_expr,
    VarDecl,
    Mutation,
)
from core import alloc
from core.error import p_die
from mycpp.mylib import log
from core import pyutil
from display import ui
from frontend import consts
from frontend import lexer
from frontend import reader
from osh import tdop
from osh import arith_parse
from osh import braces
from osh import word_
from osh import word_compile
from mycpp.mylib import tagswitch

from libc import HAVE_FNM_EXTMATCH

from typing import List, Optional, Tuple, cast
from typing import TYPE_CHECKING
if TYPE_CHECKING:
    from frontend.lexer import Lexer
    from frontend.parse_lib import ParseContext
    from frontend.reader import _Reader
    from osh.cmd_parse import VarChecker

unused1 = log
unused2 = Id_str

KINDS_THAT_END_WORDS = [Kind.Eof, Kind.WS, Kind.Op, Kind.Right]


class WordEmitter(object):
    """Common interface for [ and [["""

    def __init__(self):
        # type: () -> None
        """Empty constructor for mycpp."""
        pass

    def ReadWord(self, lex_mode):
        # type: (lex_mode_t) -> word_t
        raise NotImplementedError()


class WordParser(WordEmitter):

    def __init__(self, parse_ctx, lexer, line_reader):
        # type: (ParseContext, Lexer, _Reader) -> None
        self.parse_ctx = parse_ctx
        self.lexer = lexer
        self.line_reader = line_reader
        self.arena = line_reader.arena

        self.parse_opts = parse_ctx.parse_opts
        self.a_parser = tdop.TdopParser(arith_parse.Spec(), self,
                                        self.parse_opts)
        self.Reset()

    def Init(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Used to parse arithmetic, see ParseContext."""
        self.next_lex_mode = lex_mode

    def Reset(self):
        # type: () -> None
        """Called by interactive loop."""
        # For _GetToken()
        self.cur_token = None  # type: Token
        self.token_kind = Kind.Undefined
        self.token_type = Id.Undefined_Tok

        self.next_lex_mode = lex_mode_e.ShCommand

        # Boolean mutated by CommandParser via word_.ctx_EmitDocToken.
        # For ### doc comments
        self.emit_doc_token = False
        # Boolean mutated by CommandParser via word_.ctx_Multiline.
        # '...' starts multiline mode.
        self.multiline = False

        # For detecting invalid \n\n in multiline mode.  Counts what we got
        # directly from the lexer.
        self.newline_state = 0
        # For consolidating \n\n -> \n for the CALLER.  This simplifies the
        # parsers that consume words.
        self.returned_newline = False

        # For integration with pgen2
        self.buffered_word = None  # type: word_t

    def _GetToken(self):
        # type: () -> None
        """Call this when you need to make a decision based on any of:

        self.token_type
        self.token_kind
        self.cur_token
        """
        if self.next_lex_mode == lex_mode_e.Undefined:
            return  # _SetNext() not called, so do nothing

        is_fake = self.next_lex_mode == lex_mode_e.BashRegexFakeInner
        real_mode = (lex_mode_e.BashRegex if is_fake else self.next_lex_mode)

        self.cur_token = self.lexer.Read(real_mode)

        # MUTATE TOKEN for fake lexer mode.
        # This is for crazy stuff bash allows, like [[ s =~ (< >) ]]
        if (is_fake and self.cur_token.id
                in (Id.WS_Space, Id.BashRegex_AllowedInParens)):
            self.cur_token.id = Id.Lit_Chars

        self.token_type = self.cur_token.id
        self.token_kind = consts.GetKind(self.token_type)

        # number of consecutive newlines, ignoring whitespace
        if self.token_type == Id.Op_Newline:
            self.newline_state += 1
        elif self.token_kind != Kind.WS:
            self.newline_state = 0

        self.parse_ctx.trail.AppendToken(self.cur_token)  # For completion
        self.next_lex_mode = lex_mode_e.Undefined

    def _SetNext(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Set the next lex state, but don't actually read a token.

        We need this for proper interactive parsing.
        """
        self.next_lex_mode = lex_mode
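
    # A minimal sketch (not in the original source) of the two-phase pattern
    # used throughout this class:
    #
    #   self._SetNext(lex_mode_e.VSub_1)   # 1. choose the lexer mode ...
    #   self._GetToken()                   # 2. ... then lazily read one token
    #
    # The actual lexer.Read() is deferred until _GetToken(), which _SetNext's
    # docstring notes is needed for proper interactive parsing.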

    def _ReadVarOpArg(self, arg_lex_mode):
        # type: (lex_mode_t) -> rhs_word_t

        # NOTE: Operators like | and < are not treated as special, so
        # ${a:- | >} is valid, even when unquoted.
        self._SetNext(arg_lex_mode)
        self._GetToken()

        w = self._ReadVarOpArg2(arg_lex_mode, Id.Undefined_Tok,
                                True)  # empty_ok

        # If the Compound has no parts, and we're in a double-quoted VarSub
        # arg, and empty_ok, then return Empty.  This is so it can evaluate
        # to the empty string and not get elided.
        #
        # Examples:
        # - "${s:-}", "${s/%pat/}"
        # It's similar to LooksLikeShAssignment where we turn x= into x=''.
        # And it has the same potential problem of not having Token location
        # info.
        #
        # NOTE: empty_ok is False only for the PatSub pattern, which means
        # we'll return a Compound with no parts, which is explicitly checked
        # with a custom error message.
        if len(w.parts) == 0 and arg_lex_mode == lex_mode_e.VSub_ArgDQ:
            return rhs_word.Empty

        return w

    def _ReadVarOpArg2(self, arg_lex_mode, eof_type, empty_ok):
        # type: (lex_mode_t, Id_t, bool) -> CompoundWord
        """Return a CompoundWord.

        Helper function for _ReadVarOpArg and used directly by
        _ReadPatSubVarOp.
        """
        w = self._ReadCompoundWord3(arg_lex_mode, eof_type, empty_ok)
        #log('w %s', w)
        tilde = word_.TildeDetect(w)
        if tilde:
            w = tilde
        return w

    def _ReadSliceVarOp(self):
        # type: () -> suffix_op.Slice
        """
        Looking at the token after the first ':'

        ArithExpr? (':' ArithExpr? )? '}'
        """
        self._NextNonSpace()

        cur_id = self.token_type

        if cur_id in (Id.Arith_RBrace, Id.Arith_Colon):  # ${a:} or ${a::}
            begin = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            begin = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()  # advance

        if cur_id == Id.Arith_RBrace:  # ${a:1} or ${@:1}
            # No length specified, so it's N
            no_length = None  # type: Optional[arith_expr_t]
            return suffix_op.Slice(begin, no_length)

        elif cur_id == Id.Arith_Colon:  # ${a:1:} or ${@:1:}
            colon_tok = self.cur_token
            self._NextNonSpace()

            if self.token_type == Id.Arith_RBrace:
                # quirky bash behavior:
                # ${a:1:} or ${a::} means length ZERO
                # but ${a:1} or ${a:} means length N
                if self.parse_opts.strict_parse_slice():
                    p_die(
                        "Slice length: Add explicit zero, or omit : for N (strict_parse_slice)",
                        colon_tok)

                length = arith_expr.EmptyZero  # type: arith_expr_t
            else:
                length = self._ReadArithExpr(Id.Arith_RBrace)

            return suffix_op.Slice(begin, length)

        else:
            p_die("Expected : or } in slice", self.cur_token)

        raise AssertionError()  # for MyPy
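
    # Illustrative inputs (not from the original source) for _ReadSliceVarOp:
    #
    #   ${a:1}     -> Slice(begin=1, length=None)    # length N, to the end
    #   ${a:1:2}   -> Slice(begin=1, length=2)
    #   ${a::}     -> Slice(EmptyZero, EmptyZero)    # bash quirk: length ZERO
    #   ${a:1:}    -> length ZERO too; strict_parse_slice rejects this form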

    def _ReadPatSubVarOp(self):
        # type: () -> suffix_op.PatSub
        """Looking at the first '/' after VarOf:

        VarSub = ...
               | VarOf '/' Match ( '/' WORD? )?
        Match  = '/' WORD       # can't be empty
               | '#' WORD?      # may be empty
               | '%' WORD?
        """
        slash_tok = self.cur_token  # location info
        replace_mode = Id.Undefined_Tok  # bizarre syntax / # %

        self._SetNext(lex_mode_e.VSub_ArgUnquoted)  # advance past /

        self._GetToken()
        if self.token_type == Id.Right_DollarBrace:
            pat = CompoundWord([])
            return suffix_op.PatSub(pat, rhs_word.Empty, replace_mode,
                                    slash_tok)

        if self.token_type in (Id.Lit_Slash, Id.Lit_Pound, Id.Lit_Percent):
            replace_mode = self.token_type
            self._SetNext(lex_mode_e.VSub_ArgUnquoted)

        # Bash quirk:
        # echo ${x/#/replace} has an empty pattern
        # echo ${x////replace} is non-empty; it means echo ${x//'/'/replace}
        empty_ok = replace_mode != Id.Lit_Slash
        pat = self._ReadVarOpArg2(lex_mode_e.VSub_ArgUnquoted, Id.Lit_Slash,
                                  empty_ok)
        #log('pat 1 %r', pat)

        if self.token_type == Id.Lit_Slash:
            # read until }
            replace = self._ReadVarOpArg(
                lex_mode_e.VSub_ArgUnquoted)  # type: rhs_word_t
            #log('r 1 %r', replace)
        else:
            # e.g. ${v/a} is the same as ${v/a/} -- empty replacement string
            replace = rhs_word.Empty

        self._GetToken()
        if self.token_type != Id.Right_DollarBrace:
            # This happens on invalid code
            p_die(
                "Expected } after replacement string, got %s" %
                ui.PrettyId(self.token_type), self.cur_token)

        return suffix_op.PatSub(pat, replace, replace_mode, slash_tok)
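
    # Illustrative inputs (not from the original source) for _ReadPatSubVarOp:
    #
    #   ${x/pat/repl}   -> PatSub(pat, repl, replace_mode=Undefined_Tok)
    #   ${x//pat/repl}  -> replace_mode=Lit_Slash    # replace ALL matches
    #   ${x/#pat/repl}  -> replace_mode=Lit_Pound    # anchor at prefix
    #   ${x/%pat/repl}  -> replace_mode=Lit_Percent  # anchor at suffix
    #   ${x/pat}        -> empty replacement, same as ${x/pat/}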

    def _ReadSubscript(self):
        # type: () -> bracket_op_t
        """ Subscript = '[' ('@' | '*' | ArithExpr) ']' """
        # Lookahead to see if we get @ or *.  Otherwise read a full
        # arithmetic expression.
        next_id = self.lexer.LookPastSpace(lex_mode_e.Arith)
        if next_id in (Id.Lit_At, Id.Arith_Star):
            op = bracket_op.WholeArray(next_id)  # type: bracket_op_t

            self._SetNext(lex_mode_e.Arith)  # skip past [
            self._GetToken()
            self._SetNext(lex_mode_e.Arith)  # skip past @
            self._GetToken()
        else:
            self._SetNext(lex_mode_e.Arith)  # skip past [
            anode = self._ReadArithExpr(Id.Arith_RBracket)
            op = bracket_op.ArrayIndex(anode)

        if self.token_type != Id.Arith_RBracket:  # Should be looking at ]
            p_die('Expected ] to close subscript', self.cur_token)

        self._SetNext(lex_mode_e.VSub_2)  # skip past ]
        self._GetToken()  # Needed to be in the same spot as no subscript

        return op
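
    # Illustrative inputs (not from the original source) for _ReadSubscript:
    #
    #   ${a[@]}    -> bracket_op.WholeArray(Lit_At)
    #   ${a[*]}    -> bracket_op.WholeArray(Arith_Star)
    #   ${a[i+1]}  -> bracket_op.ArrayIndex(<arith expr>)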

    def _ParseVarOf(self):
        # type: () -> BracedVarSub
        """
        VarOf = NAME Subscript?
              | NUMBER      # no subscript allowed, none of these are arrays
                            # ${@[1]} doesn't work, even though slicing does
              | VarSymbol
        """
        self._GetToken()
        name_token = self.cur_token
        self._SetNext(lex_mode_e.VSub_2)

        self._GetToken()  # Check for []
        if self.token_type == Id.VOp2_LBracket:
            bracket_op = self._ReadSubscript()
        else:
            bracket_op = None

        part = BracedVarSub.CreateNull()
        part.name_tok = name_token
        part.var_name = lexer.TokenVal(name_token)
        part.bracket_op = bracket_op
        return part

    def _ParseVarExpr(self, arg_lex_mode, allow_query=False):
        # type: (lex_mode_t, bool) -> BracedVarSub
        """Start parsing at the op -- we already skipped past the name."""
        part = self._ParseVarOf()

        self._GetToken()
        if self.token_type == Id.Right_DollarBrace:
            return part  # no ops

        op_kind = self.token_kind

        if op_kind == Kind.VTest:
            tok = self.cur_token
            arg_word = self._ReadVarOpArg(arg_lex_mode)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            part.suffix_op = suffix_op.Unary(tok, arg_word)

        elif op_kind == Kind.VOpYsh:
            tok = self.cur_token
            arg_word = self._ReadVarOpArg(arg_lex_mode)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            UP_arg_word = arg_word
            with tagswitch(arg_word) as case:
                if case(rhs_word_e.Empty):
                    pass
                elif case(rhs_word_e.Compound):
                    arg_word = cast(CompoundWord, UP_arg_word)
                    # This handles ${x|html} and ${x %.3f} now
                    # However I think ${x %.3f} should be statically parsed?
                    # It can enter the printf lexer modes.
                    ok, arg, quoted = word_.StaticEval(arg_word)
                    if not ok or quoted:
                        p_die('Expected a constant argument',
                              loc.Word(arg_word))

            part.suffix_op = suffix_op.Static(tok, arg)

        elif op_kind == Kind.VOp0:
            part.suffix_op = self.cur_token  # Nullary
            self._SetNext(lex_mode_e.VSub_2)  # Expecting }
            self._GetToken()

        elif op_kind == Kind.VOp1:  # % %% # ## etc.
            tok = self.cur_token
            # Weird exception that all shells have: these operators take a
            # glob pattern, so they're lexed as VSub_ArgUnquoted, not
            # VSub_ArgDQ
            arg_word = self._ReadVarOpArg(lex_mode_e.VSub_ArgUnquoted)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            part.suffix_op = suffix_op.Unary(tok, arg_word)

        elif op_kind == Kind.VOp2:  # / : [ ]
            if self.token_type == Id.VOp2_Slash:
                patsub_op = self._ReadPatSubVarOp()  # type: suffix_op_t
                part.suffix_op = patsub_op

                # Checked by the method above
                assert self.token_type == Id.Right_DollarBrace, self.cur_token

            elif self.token_type == Id.VOp2_Colon:
                part.suffix_op = self._ReadSliceVarOp()
                # NOTE: } in arithmetic mode.
                if self.token_type != Id.Arith_RBrace:
                    # Token seems off; doesn't point to X in # ${a:1:2 X
                    p_die('Expected } to close ${', self.cur_token)

            else:
                # TODO: Does this ever happen?
                p_die('Unexpected token in ${} (%s)' % 'VOp2', self.cur_token)

        elif op_kind == Kind.VOp3:  # ${prefix@} etc.
            if allow_query:
                part.suffix_op = self.cur_token  # Nullary
                self._SetNext(lex_mode_e.VSub_2)  # Expecting }
                self._GetToken()
            else:
                p_die("Unexpected token in ${} (%s)" % 'VOp3', self.cur_token)

        # NOTE: Arith_RBrace is for slicing, because it reads } in arithmetic
        # mode.  It's redundantly checked above.
        if self.token_type not in (Id.Right_DollarBrace, Id.Arith_RBrace):
            # ${a.} or ${!a.}
            p_die('Expected } to close ${', self.cur_token)

        # Now look for ops
        return part

    def _ReadZshVarSub(self, left_token):
        # type: (Token) -> word_part.ZshVarSub

        self._SetNext(lex_mode_e.VSub_Zsh)  # Move past ${(foo)

        # Can be empty
        w = self._ReadCompoundWord3(lex_mode_e.VSub_Zsh, Id.Right_DollarBrace,
                                    True)
        self._GetToken()
        return word_part.ZshVarSub(left_token, w, self.cur_token)

    def ReadBracedVarSub(self, left_token):
        # type: (Token) -> Tuple[BracedVarSub, Token]
        """ For YSH expressions like var x = ${x:-"default"}.  """
        part = self._ReadBracedVarSub(left_token, d_quoted=False)
        last_token = self.cur_token
        return part, last_token

    def _ReadBracedVarSub(self, left_token, d_quoted):
        # type: (Token, bool) -> BracedVarSub
        """For the ${} expression language.

        NAME        = [a-zA-Z_][a-zA-Z0-9_]*
        NUMBER      = [0-9]+                 # ${10}, ${11}, ...

        Subscript   = '[' ('@' | '*' | ArithExpr) ']'
        VarSymbol   = '!' | '@' | '#' | ...
        VarOf       = NAME Subscript?
                    | NUMBER     # no subscript allowed, none of these are
                                 # arrays; ${@[1]} doesn't work, even though
                                 # slicing does
                    | VarSymbol

        NULLARY_OP  = '@Q' | '@E' | '@P' | '@A' | '@a'  # VOp0

        TEST_OP     = '-' | ':-' | '=' | ':=' | '+' | ':+' | '?' | ':?'
        STRIP_OP    = '#' | '##' | '%' | '%%'
        CASE_OP     = ',' | ',,' | '^' | '^^'
        UnaryOp     = TEST_OP | STRIP_OP | CASE_OP

        YSH_UNARY   = '|' | ' '              # ${x|html} and ${x %.3f}.
                                             # SPACE is operator not %
        Match       = ('/' | '#' | '%') WORD  # match all / prefix / suffix
        VarExpr     = VarOf
                    | VarOf NULLARY_OP
                    | VarOf UnaryOp WORD
                    | VarOf YSH_UNARY STATIC_WORD
                    | VarOf ':' ArithExpr (':' ArithExpr )?
                    | VarOf '/' Match '/' WORD

        LengthExpr  = '#' VarOf    # can't apply operators after length

        RefOrKeys   = '!' VarExpr  # CAN apply operators after a named ref
                                   # ${!ref[0]} vs ${!keys[@]} resolved later

        PrefixQuery = '!' NAME ('*' | '@')  # list variable names with a
                                            # prefix

        BuiltinSub  = '.' WORD+    # ${.myproc 'builtin' $sub}

        VarSub      = LengthExpr
                    | RefOrKeys
                    | PrefixQuery
                    | VarExpr
                    | BuiltinSub

        NOTES:
        - Arithmetic expressions are used twice, inside subscripts ${a[x+1]}
          and slicing ${a:x+1:y+2}
        - ${#} and ${!} need LL(2) lookahead (considering how my tokenizer
          works)
        - @ and * are technically arithmetic expressions in this
          implementation
        - We don't account for bash 4.4: ${param@operator} -- Q E P A a.
          Note that it's also vectorized.

        Strictness over bash:
        - echo ${a[0][0]} doesn't do anything useful, so we disallow it from
          the grammar
        - ! and # prefixes can't be composed, even though named refs can be
          composed with other operators
        - '#' means 4 different things: length prefix, VarSymbol, UnaryOp to
          strip a prefix, and it can also be a literal part of WORD.

        From the parser's point of view, the prefix # can't be combined with
        UnaryOp/slicing/matching, and the ! can.  However

        - ${a[@]:1:2} is not allowed
        - ${#a[@]:1:2} is allowed, but gives the wrong answer
        """
        if d_quoted:
            arg_lex_mode = lex_mode_e.VSub_ArgDQ
        else:
            arg_lex_mode = lex_mode_e.VSub_ArgUnquoted

        self._SetNext(lex_mode_e.VSub_1)
        self._GetToken()

        ty = self.token_type
        first_tok = self.cur_token

        if ty == Id.VSub_Pound:
            # Disambiguate
            next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
            if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
                # e.g. a name, '#' is the prefix
                self._SetNext(lex_mode_e.VSub_1)
                part = self._ParseVarOf()

                self._GetToken()
                if self.token_type != Id.Right_DollarBrace:
                    p_die('Expected } after length expression',
                          self.cur_token)

                part.prefix_op = first_tok

            else:  # not a prefix, '#' is the variable
                part = self._ParseVarExpr(arg_lex_mode)

        elif ty == Id.VSub_Bang:
            next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
            if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
                # e.g. a name, '!' is the prefix
                # ${!a} -- this is a ref
                # ${!3} -- this is a ref
                # ${!a[1]} -- this is a ref
                # ${!a[@]} -- this is a keys
                # No lookahead -- do it in a second step, or at runtime
                self._SetNext(lex_mode_e.VSub_1)
                part = self._ParseVarExpr(arg_lex_mode, allow_query=True)

                part.prefix_op = first_tok

            else:  # not a prefix, '!' is the variable
                part = self._ParseVarExpr(arg_lex_mode)

        elif ty == Id.VSub_Dot:
            # Note: this will become a new builtin_sub type, so this method
            # must return word_part_t rather than BracedVarSub.  I don't
            # think that should cause problems.
            p_die('TODO: ${.myproc builtin sub}', self.cur_token)

        # VS_NAME, VS_NUMBER, symbol that isn't # or !
        elif self.token_kind == Kind.VSub:
            part = self._ParseVarExpr(arg_lex_mode)

        else:
            # e.g. ${^}
            p_die('Unexpected token in ${}', self.cur_token)

        part.left = left_token  # attach the argument
        part.right = self.cur_token
        return part
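
    # Illustrative examples (not from the original source) of the
    # LookPastSpace disambiguation above:
    #
    #   ${#foo}  -> '#' is the length prefix; a name follows
    #   ${#}     -> '#' is the variable itself (number of args)
    #   ${!ref}  -> '!' is the ref/keys prefix
    #   ${!}     -> '!' is the variable itself (last background PID)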

    def _ReadSingleQuoted(self, left_token, lex_mode):
        # type: (Token, lex_mode_t) -> SingleQuoted
        """Internal method to read a word_part."""
        tokens = []  # type: List[Token]
        # In command mode, we never disallow backslashes like '\'
        right_quote = self.ReadSingleQuoted(lex_mode, left_token, tokens,
                                            False)
        sval = word_compile.EvalSingleQuoted(left_token.id, tokens)
        node = SingleQuoted(left_token, sval, right_quote)
        return node

    def ReadSingleQuoted(self, lex_mode, left_token, out_tokens, is_ysh_expr):
        # type: (lex_mode_t, Token, List[Token], bool) -> Token
        """Appends to out_tokens; returns last token

        Used by expr_parse.py
        """
        # TODO: Remove and use out_tokens
        tokens = []  # type: List[Token]

        # echo '\' is allowed, but x = '\' is invalid, in favor of x = r'\'
        no_backslashes = is_ysh_expr and left_token.id == Id.Left_SingleQuote

        expected_end_tokens = 3 if left_token.id in (
            Id.Left_TSingleQuote, Id.Left_RTSingleQuote,
            Id.Left_UTSingleQuote, Id.Left_BTSingleQuote) else 1
        num_end_tokens = 0

        while num_end_tokens < expected_end_tokens:
            self._SetNext(lex_mode)
            self._GetToken()

            # Kind.Char emitted in lex_mode.SQ_C
            if self.token_kind in (Kind.Lit, Kind.Char):
                tok = self.cur_token
                # Happens in lex_mode_e.SQ: 'one\two' is ambiguous, should be
                # r'one\two' or c'one\\two'
                if no_backslashes and lexer.TokenContains(tok, '\\'):
                    p_die(
                        r"Strings with backslashes should look like r'\n' or u'\n' or b'\n'",
                        tok)

                if is_ysh_expr:
                    # Disallow var x = $'\001'.  Arguably we don't need these
                    # checks because u'\u{1}' is the way to write it.
                    if self.token_type == Id.Char_Octal3:
                        p_die(
                            r"Use \xhh or \u{...} instead of octal escapes in YSH strings",
                            tok)

                    if self.token_type == Id.Char_Hex and self.cur_token.length != 4:
                        # disallow \xH
                        p_die(
                            r'Invalid hex escape in YSH string (must be \xHH)',
                            tok)

                tokens.append(tok)

            elif self.token_kind == Kind.Unknown:
                tok = self.cur_token
                assert tok.id == Id.Unknown_Backslash, tok

                # x = $'\z' is disallowed; ditto for echo $'\z' if
                # shopt -u parse_backslash
                if is_ysh_expr or not self.parse_opts.parse_backslash():
                    p_die(
                        "Invalid char escape in C-style string literal (OILS-ERR-11)",
                        tok)

                tokens.append(tok)

            elif self.token_kind == Kind.Eof:
                p_die('Unexpected EOF in single-quoted string that began here',
                      left_token)

            elif self.token_kind == Kind.Right:
                # assume Id.Right_SingleQuote
                num_end_tokens += 1
                tokens.append(self.cur_token)

            else:
                raise AssertionError(self.cur_token)

            if self.token_kind != Kind.Right:
                num_end_tokens = 0  # we need three in a ROW

        if expected_end_tokens == 1:
            tokens.pop()
        elif expected_end_tokens == 3:  # Get rid of spurious end tokens
            tokens.pop()
            tokens.pop()
            tokens.pop()

        # Remove space from '''  r'''  $''' in both expression mode and
        # command mode
        if left_token.id in (Id.Left_TSingleQuote, Id.Left_RTSingleQuote,
                             Id.Left_UTSingleQuote, Id.Left_BTSingleQuote):
            word_compile.RemoveLeadingSpaceSQ(tokens)

        # Validation after lexing - same 2 checks in j8.LexerDecoder
        is_u_string = left_token.id in (Id.Left_USingleQuote,
                                        Id.Left_UTSingleQuote)

        for tok in tokens:
            # u'\yff' is not valid, but b'\yff' is
            if is_u_string and tok.id == Id.Char_YHex:
                p_die(
                    r"%s escapes not allowed in u'' strings" %
                    lexer.TokenVal(tok), tok)

        out_tokens.extend(tokens)
        return self.cur_token
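
    # Illustrative trace (not from the original source): for u'''x''',
    # expected_end_tokens is 3, so the loop consumes until it has seen three
    # Kind.Right quote tokens IN A ROW -- any literal in between resets
    # num_end_tokens to 0 -- and then pops the three spurious end tokens.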

    def _ReadDoubleQuotedLeftParts(self):
        # type: () -> word_part_t
        """Read substitution parts in a double quoted context."""
        if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick):
            return self._ReadCommandSub(self.token_type, d_quoted=True)

        if self.token_type == Id.Left_DollarBrace:
            return self._ReadBracedVarSub(self.cur_token, d_quoted=True)

        if self.token_type == Id.Left_DollarDParen:
            return self._ReadArithSub()

        if self.token_type == Id.Left_DollarBracket:
            return self._ReadExprSub(lex_mode_e.DQ)

        if self.token_type == Id.Left_DollarBraceZsh:
            return self._ReadZshVarSub(self.cur_token)

        raise AssertionError(self.cur_token)

    def _ReadYshSingleQuoted(self, left_id):
        # type: (Id_t) -> CompoundWord
        """Read YSH style strings

        r''  u''  b''
        r''' '''  u''' '''  b''' '''
        """
        #log('BEF self.cur_token %s', self.cur_token)
        if left_id == Id.Left_RSingleQuote:
            lexer_mode = lex_mode_e.SQ_Raw
            triple_left_id = Id.Left_RTSingleQuote
        elif left_id == Id.Left_USingleQuote:
            lexer_mode = lex_mode_e.J8_Str
            triple_left_id = Id.Left_UTSingleQuote
        elif left_id == Id.Left_BSingleQuote:
            lexer_mode = lex_mode_e.J8_Str
            triple_left_id = Id.Left_BTSingleQuote
        else:
            raise AssertionError(left_id)

        # Needed for syntax checks
        left_tok = self.cur_token
        left_tok.id = left_id

        sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)

        if (len(sq_part.sval) == 0 and self.lexer.ByteLookAhead() == "'"):
            self._SetNext(lex_mode_e.ShCommand)
            self._GetToken()

            assert self.token_type == Id.Left_SingleQuote
            # HACK: magically transform the third ' in u''' to
            # Id.Left_UTSingleQuote, so that ''' is the terminator
            left_tok = self.cur_token
            left_tok.id = triple_left_id

            # Handles stripping leading whitespace
            sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)

        # Advance and validate
        self._SetNext(lex_mode_e.ShCommand)

        self._GetToken()
        if self.token_kind not in KINDS_THAT_END_WORDS:
            p_die('Unexpected token after YSH single-quoted string',
                  self.cur_token)

        return CompoundWord([sq_part])

    def _ReadUnquotedLeftParts(self, triple_out):
        # type: (Optional[BoolParamBox]) -> word_part_t
        """Read substitutions and quoted strings (for lex_mode_e.ShCommand).

        If triple_out is set, then we try parsing triple quoted strings,
        and set its value to True if we got one.
        """
        if self.token_type in (Id.Left_DoubleQuote, Id.Left_DollarDoubleQuote):
            # Note: $"" is a synonym for "".  It might make sense if it added
            # \n \0 \x00 \u{123} etc.  But that's not what bash does!
            dq_part = self._ReadDoubleQuoted(self.cur_token)
            # Got empty word "" and there's a " after
            if (triple_out and len(dq_part.parts) == 0 and
                    self.lexer.ByteLookAhead() == '"'):

                self._SetNext(lex_mode_e.ShCommand)
                self._GetToken()
                # HACK: magically transform the third " in """ to
                # Id.Left_TDoubleQuote, so that """ is the terminator
                left_dq_token = self.cur_token
                left_dq_token.id = Id.Left_TDoubleQuote
                triple_out.b = True  # let caller know we got it
                return self._ReadDoubleQuoted(left_dq_token)

            return dq_part

        if self.token_type in (Id.Left_SingleQuote, Id.Left_RSingleQuote,
                               Id.Left_DollarSingleQuote):
            if self.token_type == Id.Left_SingleQuote:
                lexer_mode = lex_mode_e.SQ_Raw
                triple_left_id = Id.Left_TSingleQuote
            elif self.token_type == Id.Left_RSingleQuote:
                lexer_mode = lex_mode_e.SQ_Raw
                triple_left_id = Id.Left_RTSingleQuote
            else:
                lexer_mode = lex_mode_e.SQ_C
                # there is no such thing as $'''
                triple_left_id = Id.Undefined_Tok

            sq_part = self._ReadSingleQuoted(self.cur_token, lexer_mode)

            # Got empty '' or r'' and there's a ' after
            # u'' and b'' are handled in _ReadYshSingleQuoted
            if (triple_left_id != Id.Undefined_Tok and
                    triple_out is not None and len(sq_part.sval) == 0 and
                    self.lexer.ByteLookAhead() == "'"):

                self._SetNext(lex_mode_e.ShCommand)
                self._GetToken()

                # HACK: magically transform the third ' in ''' to
                # Id.Left_TSingleQuote, so that ''' is the terminator
                left_sq_token = self.cur_token
                left_sq_token.id = triple_left_id

                triple_out.b = True  # let caller know we got it
                return self._ReadSingleQuoted(left_sq_token, lexer_mode)

            return sq_part

        if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick,
                               Id.Left_ProcSubIn, Id.Left_ProcSubOut):
            return self._ReadCommandSub(self.token_type, d_quoted=False)

        if self.token_type == Id.Left_DollarBrace:
            return self._ReadBracedVarSub(self.cur_token, d_quoted=False)

        if self.token_type == Id.Left_DollarDParen:
            return self._ReadArithSub()

        if self.token_type == Id.Left_DollarBracket:
            return self._ReadExprSub(lex_mode_e.ShCommand)

        if self.token_type == Id.Left_DollarBraceZsh:
            return self._ReadZshVarSub(self.cur_token)

        raise AssertionError(self.cur_token)

    def _ReadExtGlob(self):
        # type: () -> word_part.ExtGlob
        """
        Grammar:
          Item    = CompoundWord | EPSILON  # important: @(foo|) is allowed
          LEFT    = '@(' | '*(' | '+(' | '?(' | '!('
          RIGHT   = ')'
          ExtGlob = LEFT (Item '|')* Item RIGHT  # ITEM may be empty
          Compound includes ExtGlob
        """
        left_token = self.cur_token
        right_token = None  # type: Token
        arms = []  # type: List[CompoundWord]

        self.lexer.PushHint(Id.Op_RParen, Id.Right_ExtGlob)
        self._SetNext(lex_mode_e.ExtGlob)  # advance past LEFT

        read_word = False  # did we just read a word?  To handle @(||).

        while True:
            self._GetToken()

            if self.token_type == Id.Right_ExtGlob:
                if not read_word:
                    arms.append(CompoundWord([]))
                right_token = self.cur_token
                break

            elif self.token_type == Id.Op_Pipe:
                if not read_word:
                    arms.append(CompoundWord([]))
                read_word = False
                self._SetNext(lex_mode_e.ExtGlob)

            # lex_mode_e.ExtGlob should only produce these 4 kinds of tokens
            elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub,
                                     Kind.ExtGlob):
                w = self._ReadCompoundWord(lex_mode_e.ExtGlob)
                arms.append(w)
                read_word = True

            elif self.token_kind == Kind.Eof:
                p_die('Unexpected EOF reading extended glob that began here',
                      left_token)

            else:
                raise AssertionError(self.cur_token)

        return word_part.ExtGlob(left_token, arms, right_token)
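
    # Illustrative inputs (not from the original source) for _ReadExtGlob:
    #
    #   @(foo|bar)  -> arms [foo, bar]
    #   @(foo|)     -> arms [foo, <empty>]    # EPSILON item is allowed
    #   @(||)       -> three empty arms, via the read_word flag above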

    def _ReadBashRegexGroup(self):
        # type: () -> word_part.BashRegexGroup
        """
        Grammar:
          BashRegexGroup = '(' WORD? ')'
        """
        left_token = self.cur_token
        assert left_token.id == Id.BashRegex_LParen, left_token

        arms = []  # type: List[CompoundWord]

        self.lexer.PushHint(Id.Op_RParen, Id.Right_BashRegexGroup)
        self._SetNext(lex_mode_e.BashRegexFakeInner)  # advance past LEFT

        self._GetToken()
        if self.token_type == Id.Right_BashRegexGroup:  # empty ()
            return word_part.BashRegexGroup(left_token, None, self.cur_token)

        # lex_mode_e.BashRegex should only produce these 4 kinds of tokens
        if self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub, Kind.BashRegex):
            # Fake lexer mode that translates Id.WS_Space to Id.Lit_Chars,
            # to allow bash style [[ s =~ (a b) ]]
            w = self._ReadCompoundWord(lex_mode_e.BashRegexFakeInner)
            arms.append(w)

            self._GetToken()
            if self.token_type != Id.Right_BashRegexGroup:
                p_die('Expected ) to close bash regex group', self.cur_token)

            return word_part.BashRegexGroup(left_token, w, self.cur_token)

        p_die('Expected word after ( opening bash regex group', self.cur_token)

    def _ReadLikeDQ(self, left_token, is_ysh_expr, out_parts):
        # type: (Optional[Token], bool, List[word_part_t]) -> None
        """
        Args:
          left_token: A token if we are reading a double quoted part, or None
            if we're reading a here doc.
          is_ysh_expr: Whether to disallow backticks and invalid char escapes
          out_parts: list of word_part to append to
        """
        if left_token:
            if left_token.id in (Id.Left_TDoubleQuote,
                                 Id.Left_DollarTDoubleQuote):
                expected_end_tokens = 3
            else:
                expected_end_tokens = 1
        else:
            expected_end_tokens = 1000  # here doc will break

        num_end_tokens = 0
        while num_end_tokens < expected_end_tokens:
            self._SetNext(lex_mode_e.DQ)
            self._GetToken()

            if self.token_kind == Kind.Lit:
                if self.token_type == Id.Lit_EscapedChar:
                    tok = self.cur_token
                    ch = lexer.TokenSliceLeft(tok, 1)
                    part = word_part.EscapedLiteral(tok,
                                                    ch)  # type: word_part_t
                else:
                    if self.token_type == Id.Lit_BadBackslash:
                        # echo "\z" is OK in shell, but 'x = "\z" is a syntax
                        # error in YSH.
                        # Slight hole: We don't catch 'x = ${undef:-"\z"}
                        # because of the recursion (unless parse_backslash)
                        if (is_ysh_expr or
                                not self.parse_opts.parse_backslash()):
                            p_die(
                                "Invalid char escape in double quoted string (OILS-ERR-12)",
                                self.cur_token)
                    elif self.token_type == Id.Lit_Dollar:
                        if is_ysh_expr or not self.parse_opts.parse_dollar():
                            p_die("Literal $ should be quoted like \$",
                                  self.cur_token)

                    part = self.cur_token
                out_parts.append(part)

            elif self.token_kind == Kind.Left:
                if self.token_type == Id.Left_Backtick and is_ysh_expr:
                    p_die("Invalid backtick: use $(cmd) or \\` in YSH strings",
                          self.cur_token)

                part = self._ReadDoubleQuotedLeftParts()
                out_parts.append(part)

            elif self.token_kind == Kind.VSub:
                tok = self.cur_token
                part = SimpleVarSub(tok)
                out_parts.append(part)
                # NOTE: parsing "$f(x)" would BREAK CODE.  Could add a mode
                # for it later.

            elif self.token_kind == Kind.Right:
                assert self.token_type == Id.Right_DoubleQuote, self.token_type
                if left_token:
                    num_end_tokens += 1

                # In a here doc, the right quote is literal!
                out_parts.append(self.cur_token)

            elif self.token_kind == Kind.Eof:
                if left_token:
                    p_die(
                        'Unexpected EOF reading double-quoted string that began here',
                        left_token)
                else:  # here docs will have an EOF in their token stream
                    break

            else:
                raise AssertionError(self.cur_token)

            if self.token_kind != Kind.Right:
                num_end_tokens = 0  # """ must be CONSECUTIVE

        if expected_end_tokens == 1:
            out_parts.pop()
        elif expected_end_tokens == 3:
            out_parts.pop()
            out_parts.pop()
            out_parts.pop()

        # Remove space from """ in both expression mode and command mode
        if (left_token and left_token.id
                in (Id.Left_TDoubleQuote, Id.Left_DollarTDoubleQuote)):
            word_compile.RemoveLeadingSpaceDQ(out_parts)

        # Return nothing, since we appended to 'out_parts'
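
    # Illustrative note (not from the original source): in the here-doc case
    # (left_token is None), expected_end_tokens is 1000, so the loop can only
    # exit through the Kind.Eof branch -- here docs have an EOF in their own
    # token stream, as the comment above says.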

    def _ReadDoubleQuoted(self, left_token):
        # type: (Token) -> DoubleQuoted
        """Helper function for "hello $name".

        Also ${foo%%a b c}  # treat this as double quoted, until you hit }
        """
        parts = []  # type: List[word_part_t]
        self._ReadLikeDQ(left_token, False, parts)

        right_quote = self.cur_token
        return DoubleQuoted(left_token, parts, right_quote)

    def ReadDoubleQuoted(self, left_token, parts):
        # type: (Token, List[word_part_t]) -> Token
        """For expression mode.

        Read var x = "${dir:-}/$name"; etc.
        """
        self._ReadLikeDQ(left_token, True, parts)
        return self.cur_token

    def _ReadCommandSub(self, left_id, d_quoted=False):
        # type: (Id_t, bool) -> CommandSub
        """
        NOTE: This is not in the grammar, because word parts aren't in the
        grammar!

        command_sub = '$(' command_list ')'
                    | '@(' command_list ')'
                    | '<(' command_list ')'
                    | '>(' command_list ')'
                    | ` command_list `
        """
        left_token = self.cur_token

        # Set the lexer in a state so ) becomes the EOF token.
        if left_id in (Id.Left_DollarParen, Id.Left_AtParen,
                       Id.Left_ProcSubIn, Id.Left_ProcSubOut):
            self._SetNext(lex_mode_e.ShCommand)  # advance past $( etc.

            right_id = Id.Eof_RParen
            self.lexer.PushHint(Id.Op_RParen, right_id)
            c_parser = self.parse_ctx.MakeParserForCommandSub(
                self.line_reader, self.lexer, right_id)
            # NOTE: This doesn't use something like main_loop because we
            # don't want to interleave parsing and execution!  Unlike
            # 'source' and 'eval'.
            node = c_parser.ParseCommandSub()

            right_token = c_parser.w_parser.cur_token

        elif left_id == Id.Left_Backtick and self.parse_ctx.do_lossless:
            # NOTE: This is an APPROXIMATE solution for translation ONLY.
            # See test/osh2oil.

            right_id = Id.Eof_Backtick
            self.lexer.PushHint(Id.Left_Backtick, right_id)
            c_parser = self.parse_ctx.MakeParserForCommandSub(
                self.line_reader, self.lexer, right_id)
            node = c_parser.ParseCommandSub()
            right_token = c_parser.w_parser.cur_token

        elif left_id == Id.Left_Backtick:
            if not self.parse_opts.parse_backticks():
                p_die('Use $(cmd) instead of backticks (parse_backticks)',
                      left_token)

            self._SetNext(lex_mode_e.Backtick)  # advance past `

            parts = []  # type: List[str]
            while True:
                self._GetToken()
                #log("TOK %s", self.cur_token)

                if self.token_type == Id.Backtick_Quoted:
                    # Remove leading \
                    parts.append(lexer.TokenSliceLeft(self.cur_token, 1))

                elif self.token_type == Id.Backtick_DoubleQuote:
                    # Compatibility: If backticks are double quoted, then
                    # double quotes within them have to be \"
                    # Shells aren't smart enough to match nested " and `
                    # quotes (but OSH is)
                    if d_quoted:
                        # Remove leading \
                        parts.append(lexer.TokenSliceLeft(self.cur_token, 1))
                    else:
                        parts.append(lexer.TokenVal(self.cur_token))

                elif self.token_type == Id.Backtick_Other:
                    parts.append(lexer.TokenVal(self.cur_token))

                elif self.token_type == Id.Backtick_Right:
                    break

                elif self.token_type == Id.Eof_Real:
                    # Note: this parse error is in the ORIGINAL context.
                    # No code_str yet.
                    p_die('Unexpected EOF while looking for closing backtick',
                          left_token)

                else:
                    raise AssertionError(self.cur_token)

                self._SetNext(lex_mode_e.Backtick)

            # Calculate right SPID on CommandSub BEFORE re-parsing.
            right_token = self.cur_token

            code_str = ''.join(parts)
            #log('code %r', code_str)

            # NOTE: This is similar to how we parse aliases in
            # osh/cmd_parse.py.  It won't have the same location info as
            # MakeParserForCommandSub(), because the lexer is different.
            arena = self.parse_ctx.arena
            #arena = alloc.Arena()
            line_reader = reader.StringLineReader(code_str, arena)
            c_parser = self.parse_ctx.MakeOshParser(line_reader)
            src = source.Reparsed('backticks', left_token, right_token)
            with alloc.ctx_SourceCode(arena, src):
                node = c_parser.ParseCommandSub()

        else:
            raise AssertionError(left_id)

        return CommandSub(left_token, node, right_token)
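
    # Illustrative example (not from the original source): for
    #
    #   echo `echo \`date\``
    #
    # the Backtick_Quoted branch strips each leading backslash, so code_str
    # becomes 'echo `date`', which is then re-parsed with a StringLineReader
    # under a source.Reparsed location.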

    def _ReadExprSub(self, lex_mode):
        # type: (lex_mode_t) -> word_part.ExprSub
        """$[d->key]  $[obj.method()]  etc."""
        left_token = self.cur_token

        self._SetNext(lex_mode_e.Expr)
        enode, right_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.ysh_expr_sub)

        self._SetNext(lex_mode)  # Move past ]
        return word_part.ExprSub(left_token, enode, right_token)

    def ParseVarDecl(self, kw_token):
        # type: (Token) -> VarDecl
        """
        oil_var_decl: name_type_list '=' testlist end_stmt

        Note that assignments must end with \n  ;  }  or EOF.  Unlike shell
        assignments, we disallow:

        var x = 42 | wc -l
        var x = 42 && echo hi
        """
        self._SetNext(lex_mode_e.Expr)
        enode, last_token = self.parse_ctx.ParseVarDecl(kw_token, self.lexer)
        # Hack to move } from what the Expr lexer mode gives to what
        # CommandParser wants
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace

        # Let the CommandParser see the Op_Semi or Op_Newline.
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)  # always back to this
        return enode

    def ParseMutation(self, kw_token, var_checker):
        # type: (Token, VarChecker) -> Mutation
        """
        setvar i = 42
        setvar i += 1
        setvar a[i] = 42
        setvar a[i] += 1
        setvar d.key = 42
        setvar d.key += 1
        """
        self._SetNext(lex_mode_e.Expr)
        enode, last_token = self.parse_ctx.ParseMutation(kw_token, self.lexer)
        # Hack to move } from what the Expr lexer mode gives to what
        # CommandParser wants
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace

        for lhs in enode.lhs:
            UP_lhs = lhs
            with tagswitch(lhs) as case:
                if case(y_lhs_e.Var):
                    lhs = cast(Token, UP_lhs)
                    var_checker.Check(kw_token.id, lexer.LazyStr(lhs), lhs)

                # Note: this does not cover cases like
                #   setvar (a[0])[1] = v
                #   setvar (d.key).other = v
                # This leaks into catching all typos statically, which may be
                # possible if 'use' makes all names explicit.
                elif case(y_lhs_e.Subscript):
                    lhs = cast(Subscript, UP_lhs)
                    if lhs.obj.tag() == expr_e.Var:
                        v = cast(expr.Var, lhs.obj)
                        var_checker.Check(kw_token.id, v.name, v.left)

                elif case(y_lhs_e.Attribute):
                    lhs = cast(Attribute, UP_lhs)
                    if lhs.obj.tag() == expr_e.Var:
                        v = cast(expr.Var, lhs.obj)
                        var_checker.Check(kw_token.id, v.name, v.left)

        # Let the CommandParser see the Op_Semi or Op_Newline.
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)  # always back to this
        return enode

    def ParseBareDecl(self):
        # type: () -> expr_t
        """
        x = {name: val}
        """
        self._SetNext(lex_mode_e.Expr)
        self._GetToken()
        enode, last_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.command_expr)
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)
        return enode

    def ParseYshExprForCommand(self):
        # type: () -> expr_t

        # Fudge for this case
        #  for x in(y) {
        # versus
        #  for x in (y) {
        #
        # In the former case, ReadWord on 'in' puts the lexer past (.
        # Also see LookPastSpace in CommandParser.
        # A simpler solution would be nicer.

        if self.token_type == Id.Op_LParen:
            self.lexer.MaybeUnreadOne()

        enode, _ = self.parse_ctx.ParseYshExpr(self.lexer, grammar_nt.ysh_expr)

        self._SetNext(lex_mode_e.ShCommand)
        return enode

    def ParseCommandExpr(self):
        # type: () -> expr_t
        """
        = 1+2
        """
        enode, last_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.command_expr)

        # In some cases, such as the case statement, we expect *the lexer* to
        # be pointing at the token right after the expression.  But the
        # expression parser must have read to the `last_token`.  Unreading
        # places the lexer back in the expected state.  Ie:
        #
        #   case (x) {                 case (x) {
        #     (else) { = x }             (else) { = x }
        #            ^ lexer is here            ^ unread to here
        #   }                          }
        assert last_token.id in (Id.Op_Newline, Id.Eof_Real, Id.Op_Semi,
                                 Id.Op_RBrace), last_token
        if last_token.id != Id.Eof_Real:
            # Eof_Real is the only token we cannot unread
            self.lexer.MaybeUnreadOne()

        return enode

    def ParseProc(self, node):
        # type: (Proc) -> None

        # proc name-with-hyphens() must be accepted
        self._SetNext(lex_mode_e.ShCommand)
        self._GetToken()
        # example: 'proc f[' gets you Lit_ArrayLhsOpen
        if self.token_type != Id.Lit_Chars:
            p_die('Invalid proc name %s' % ui.PrettyToken(self.cur_token),
                  self.cur_token)

        # TODO: validate this more.  Disallow proc 123 { }, which isn't
        # disallowed for shell functions.  Similar to IsValidVarName().
        node.name = self.cur_token

        last_token = self.parse_ctx.ParseProc(self.lexer, node)

        # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
        assert last_token.id == Id.Op_LBrace
        last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        self._SetNext(lex_mode_e.ShCommand)

    def ParseFunc(self, node):
        # type: (Func) -> None
        last_token = self.parse_ctx.ParseFunc(self.lexer, node)

        # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
        assert last_token.id == Id.Op_LBrace
        last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        self._SetNext(lex_mode_e.ShCommand)

    def ParseYshCasePattern(self):
        # type: () -> Tuple[pat_t, Token]
        pat, left_tok, last_token = self.parse_ctx.ParseYshCasePattern(
            self.lexer)

        if last_token.id == Id.Op_LBrace:
            last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        return pat, left_tok

    def NewlineOkForYshCase(self):
        # type: () -> Id_t
        """Check for optional newline and consume it.

        This is a special case of `_NewlineOk` which fixes some "off-by-one"
        issues that crop up while parsing YSH case arms.  For more details,
        see #oil-dev > Progress On YSH Case Grammar on zulip.

        Returns a token id which is filled with the choice of

          word   { echo word }
          (3)    { echo expr }
          /e/    { echo eggex }
          }      # right brace
        """
        while True:
            next_id = self.lexer.LookAheadOne(lex_mode_e.Expr)

            # Cannot lookahead past lines
            if next_id == Id.Unknown_Tok:
                if not self.lexer.MoveToNextLine():  # Try to move to next line
                    break  # EOF
                continue

            next_kind = consts.GetKind(next_id)
            if next_id != Id.Op_Newline and next_kind != Kind.Ignored:
                break

            self.lexer.Read(lex_mode_e.Expr)

        if next_id in (Id.Op_RBrace, Id.Op_LParen, Id.Arith_Slash):
            self._SetNext(lex_mode_e.Expr)  # Continue in expression mode
        else:
            # Consume the trailing Op_Newline
            self._SetNext(lex_mode_e.ShCommand)
            self._GetToken()

        return next_id

    def _ReadArithExpr(self, end_id):
        # type: (Id_t) -> arith_expr_t
        """Read and parse an arithmetic expression in various contexts.

        $(( 1+2 ))
        (( a=1+2 ))
        ${a[ 1+2 ]}
        ${a : 1+2 : 1+2}

        See tests/arith-context.test.sh for ambiguous cases.

        ${a[a[0]]} is valid  # VS_RBRACKET vs Id.Arith_RBracket

        ${s : a<b?0:1 : 1}   # VS_COLON vs Id.Arith_Colon

        See the assertion in ArithParser.Parse() -- unexpected extra input.
        """
        # calls self.ReadWord(lex_mode_e.Arith)
        anode = self.a_parser.Parse()
        cur_id = self.a_parser.CurrentId()
        if end_id != Id.Undefined_Tok and cur_id != end_id:
            p_die(
                'Unexpected token after arithmetic expression (%s != %s)' %
                (ui.PrettyId(cur_id), ui.PrettyId(end_id)),
                loc.Word(self.a_parser.cur_word))
        return anode

    def _ReadArithSub(self):
        # type: () -> word_part.ArithSub
        """Read an arith substitution, which contains an arith expression,
        e.g. $((a + 1)).
        """
        left_tok = self.cur_token

        # The second ) needs to be disambiguated in stuff like:
        #   $(echo $(( 1+2 )) )
        self.lexer.PushHint(Id.Op_RParen, Id.Right_DollarDParen)

        # NOTE: To disambiguate $(( as arith sub vs. command sub and
        # subshell, we could save the lexer/reader state here, and retry if
        # the arithmetic parse fails.  But we can almost always catch this at
        # parse time.  There could be some exceptions like:
        #   $((echo * foo))  # looks like multiplication
        #   $((echo / foo))  # looks like division

        # $(( )) is valid
        anode = arith_expr.EmptyZero  # type: arith_expr_t

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            anode = self._ReadArithExpr(Id.Arith_RParen)

        self._SetNext(lex_mode_e.ShCommand)

        # Ensure we get closing )
        self._GetToken()
        if self.token_type != Id.Right_DollarDParen:
            p_die('Expected second ) to end arith sub', self.cur_token)

        right_tok = self.cur_token
        return word_part.ArithSub(left_tok, anode, right_tok)
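
    # Illustrative note (not from the original source): in $(( 1+2 )), the
    # first ) is consumed as Id.Arith_RParen by _ReadArithExpr; the PushHint
    # above makes the second ) lex as Id.Right_DollarDParen rather than a
    # plain Id.Op_RParen, which matters when nested like $(echo $(( 1+2 )) ).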

    def ReadDParen(self):
        # type: () -> Tuple[arith_expr_t, Token]
        """Read ((1+ 2)) -- command context.

        We're using the word parser because it's very similar to
        _ReadArithExpr above.

        This also returns the terminating Id.Op_DRightParen token for
        location info.
        """
        # (( )) is valid
        anode = arith_expr.EmptyZero  # type: arith_expr_t

        self.lexer.PushHint(Id.Op_RParen, Id.Op_DRightParen)

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            anode = self._ReadArithExpr(Id.Arith_RParen)

        self._SetNext(lex_mode_e.ShCommand)

        # Ensure we get the second )
        self._GetToken()
        right = self.cur_token
        if right.id != Id.Op_DRightParen:
            p_die('Expected second ) to end arith statement', right)

        self._SetNext(lex_mode_e.ShCommand)

        return anode, right

    def _NextNonSpace(self):
        # type: () -> None
        """Advance in lex_mode_e.Arith until non-space token.

        Same logic as _ReadWord, but used in
          $(( ))
          (( ))
          for (( ))

        You can read self.token_type after this, without calling _GetToken.
        """
        while True:
            self._SetNext(lex_mode_e.Arith)
            self._GetToken()
            if self.token_kind not in (Kind.Ignored, Kind.WS):
                break

    def ReadForExpression(self):
        # type: () -> command.ForExpr
        """Read ((i=0; i<5; ++i)) -- part of command context."""
        self._NextNonSpace()  # skip over ((
        cur_id = self.token_type  # for end of arith expressions

        if cur_id == Id.Arith_Semi:  # for (( ; i < 10; i++ ))
            init_node = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            init_node = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()
        self._NextNonSpace()

        # It's odd to keep track of both cur_id and self.token_type in this
        # function, but it works, and is tested in 'test/parse_error.sh
        # arith-integration'
        if cur_id != Id.Arith_Semi:  # for (( x=0 b; ... ))
            p_die("Expected ; here", loc.Word(self.a_parser.cur_word))

        self._GetToken()
        cur_id = self.token_type

        if cur_id == Id.Arith_Semi:  # for (( ; ; i++ ))
            # empty condition is TRUE
            cond_node = arith_expr.EmptyOne  # type: arith_expr_t
        else:
            cond_node = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()

        if cur_id != Id.Arith_Semi:  # for (( x=0; x<5 b ))
            p_die("Expected ; here", loc.Word(self.a_parser.cur_word))

        self._NextNonSpace()
        if self.token_type == Id.Arith_RParen:  # for (( ; ; ))
            update_node = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            update_node = self._ReadArithExpr(Id.Arith_RParen)

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            p_die('Expected ) to end for loop expression', self.cur_token)
        self._SetNext(lex_mode_e.ShCommand)

        # redirects is None, will be assigned in CommandEvaluator
        node = command.ForExpr.CreateNull()
        node.init = init_node
        node.cond = cond_node
        node.update = update_node
        return node
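
    # Illustrative defaults (not from the original source):
    #
    #   for (( ; ; ))         -> init=EmptyZero cond=EmptyOne update=EmptyZero
    #   for (( i=0; ; i++ ))  -> only the middle (condition) defaults to TRUE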

    def _ReadArrayLiteral(self):
        # type: () -> word_part_t
        """a=(1 2 3)

        TODO: See osh/cmd_parse.py:164 for Id.Lit_ArrayLhsOpen, for a[x++]=1

        We want:

        A=(['x']=1 ["x"]=2 [$x$y]=3)

        Maybe allow this as a literal string?  Because I think I've seen it
        before?  Or maybe force people to patch to learn the rule.

        A=([x]=4)

        Starts with Lit_Other '[', and then it has Lit_ArrayLhsClose
        Maybe enforce that ALL have keys or NONE have keys.
        """
        self._SetNext(lex_mode_e.ShCommand)  # advance past (
        self._GetToken()
        if self.cur_token.id != Id.Op_LParen:
            p_die('Expected ( after =', self.cur_token)
        left_token = self.cur_token
        right_token = None  # type: Token

        # MUST use a new word parser (with same lexer).
        w_parser = self.parse_ctx.MakeWordParser(self.lexer, self.line_reader)
        words = []  # type: List[CompoundWord]
        done = False
        while not done:
            w = w_parser.ReadWord(lex_mode_e.ShCommand)
            with tagswitch(w) as case:
                if case(word_e.Operator):
                    tok = cast(Token, w)
                    if tok.id == Id.Right_Initializer:
                        right_token = tok
                        done = True  # can't use break here
                    # Unlike command parsing, array parsing allows embedded
                    # \n.
                    elif tok.id == Id.Op_Newline:
                        continue
                    else:
                        p_die('Unexpected token in array literal',
                              loc.Word(w))

                elif case(word_e.Compound):
                    words.append(cast(CompoundWord, w))

                else:
                    raise AssertionError()

        initializer_words = []  # type: List[InitializerWord_t]
        for w in words:
            pair = word_.DetectAssocPair(w)
            if pair is not None:
                word_.TildeDetectAssign(pair.value)  # pair.value is modified
                initializer_words.append(pair)
            else:
                w2 = braces.BraceDetect(w)  # type: word_t
                if w2 is None:
                    w2 = w
                w3 = word_.TildeDetect(w2)  # type: word_t
                if w3 is None:
                    w3 = w2
                initializer_words.append(InitializerWord.ArrayWord(w3))

        # invariant List?
        return word_part.InitializerLiteral(left_token, initializer_words,
                                            right_token)
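
    # Illustrative inputs (not from the original source) for
    # _ReadArrayLiteral:
    #
    #   a=(1 2 3)      -> three InitializerWord.ArrayWord items
    #   a=([k]=v 2)    -> one assoc pair (via DetectAssocPair) plus one word
    #   a=(~/d {x,y})  -> tilde and brace detection applied per word above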

    def ParseProcCallArgs(self, start_symbol):
        # type: (int) -> ArgList
        """ json write (x) """
        self.lexer.MaybeUnreadOne()

        arg_list = ArgList.CreateNull(alloc_lists=True)
        arg_list.left = self.cur_token
        self.parse_ctx.ParseProcCallArgs(self.lexer, arg_list, start_symbol)
        return arg_list
1689
1690 def _MaybeReadWordPart(self, is_first, lex_mode, parts):
1691 # type: (bool, lex_mode_t, List[word_part_t]) -> bool
1692 """Helper for _ReadCompoundWord3."""
1693 done = False
1694
1695 if self.token_type == Id.Lit_EscapedChar:
1696 tok = self.cur_token
1697 assert tok.length == 2
1698 ch = lexer.TokenSliceLeft(tok, 1)
1699 if not self.parse_opts.parse_backslash():
1700 if not pyutil.IsValidCharEscape(ch):
1701 p_die('Invalid char escape in unquoted word (OILS-ERR-13)',
1702 self.cur_token)
1703
1704 part = word_part.EscapedLiteral(self.cur_token,
1705 ch) # type: word_part_t
1706 else:
1707 part = self.cur_token
1708
1709 if is_first and self.token_type == Id.Lit_VarLike: # foo=
1710 parts.append(part)
1711 # Unfortunately it's awkward to pull the check for a=(1 2) up to
1712 # _ReadWord.
1713 next_id = self.lexer.LookPastSpace(lex_mode)
1714 if next_id == Id.Op_LParen:
1715 self.lexer.PushHint(Id.Op_RParen, Id.Right_Initializer)
1716 part2 = self._ReadArrayLiteral()
1717 parts.append(part2)
1718
1719 # Array literal must be the last part of the word.
1720 self._SetNext(lex_mode)
1721 self._GetToken()
1722 # EOF, whitespace, newline, Right_Subshell
1723 if self.token_kind not in KINDS_THAT_END_WORDS:
1724 p_die('Unexpected token after array literal',
1725 self.cur_token)
1726 done = True
1727
1728 elif (is_first and self.parse_opts.parse_at() and
1729 self.token_type == Id.Lit_Splice):
1730
1731 splice_tok = self.cur_token
1732 part2 = word_part.Splice(splice_tok,
1733 lexer.TokenSliceLeft(splice_tok, 1))
1734
1735 parts.append(part2)
1736
1737 # @words must be the last part of the word
1738 self._SetNext(lex_mode)
1739 self._GetToken()
1740 # EOF, whitespace, newline, Right_Subshell
1741 if self.token_kind not in KINDS_THAT_END_WORDS:
1742 p_die('Unexpected token after array splice', self.cur_token)
1743 done = True
1744
1745 elif (is_first and self.parse_opts.parse_at() and
1746 self.token_type == Id.Lit_AtLBracket): # @[split(x)]
1747 part2 = self._ReadExprSub(lex_mode_e.DQ)
1748 parts.append(part2)
1749
1750 # @[split(x)]
1751 self._SetNext(lex_mode)
1752 self._GetToken()
1753 # EOF, whitespace, newline, Right_Subshell
1754 if self.token_kind not in KINDS_THAT_END_WORDS:
1755 p_die('Unexpected token after Expr splice', self.cur_token)
1756 done = True
1757
1758 elif (is_first and self.parse_opts.parse_at() and
1759 self.token_type == Id.Lit_AtLBraceDot):
1760 p_die('TODO: @{.myproc builtin sub}', self.cur_token)
1761
1762 elif (is_first and self.parse_opts.parse_at_all() and
1763 self.token_type == Id.Lit_At):
1764 # Because $[x] ${x} and perhaps $/x/ are reserved, it makes sense for @
1765 # at the beginning of a word to be reserved.
1766
1767 # Though, should we relax 'echo @'? I'm tempted to have a shortcut
1768 # for @_argv.
1769 p_die('Literal @ starting a word must be quoted (parse_at_all)',
1770 self.cur_token)
1771
1772 else:
1773 # not a literal with lookahead; append it
1774 parts.append(part)
1775
1776 return done
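# First-part cases handled above, with shell examples (a sketch):
#
#   a=(1 2 3)      Lit_VarLike followed by ( starts an array literal
#   @myarray       Lit_Splice, with parse_at
#   @[split(x)]    Lit_AtLBracket, with parse_at
#
# Each construct must end the word: a=(one two)x and @arrayfunc(3)x
# are rejected with 'Unexpected token after ...'.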
1777
1778 def _ReadCompoundWord(self, lex_mode):
1779 # type: (lex_mode_t) -> CompoundWord
1780 return self._ReadCompoundWord3(lex_mode, Id.Undefined_Tok, True)
1781
1782 def _ReadCompoundWord3(self, lex_mode, eof_type, empty_ok):
1783 # type: (lex_mode_t, Id_t, bool) -> CompoundWord
1784 """
1785 Precondition: Looking at the first token of the first word part
1786 Postcondition: Looking at the token after, e.g. space or operator
1787
1788 NOTE: eof_type is necessary because / is a literal, i.e. Lit_Slash, but it
1789 could be an operator delimiting a compound word. Can we change lexer modes
1790 and remove this special case?
1791 """
1792 w = CompoundWord([])
1793 num_parts = 0
1794 brace_count = 0
1795 done = False
1796 is_triple_quoted = None # type: Optional[BoolParamBox]
1797
1798 while not done:
1799 self._GetToken()
1800
1801 allow_done = empty_ok or num_parts != 0
1802 if allow_done and self.token_type == eof_type:
1803 done = True # e.g. for ${foo//pat/replace}
1804
1805 # Keywords like "for" are treated like literals
1806 elif self.token_kind in (Kind.Lit, Kind.History, Kind.KW,
1807 Kind.ControlFlow, Kind.BoolUnary,
1808 Kind.BoolBinary):
1809
1810 # Count { and } so unbalanced braces become a syntax error (checked below)
1811 if self.token_type == Id.Lit_LBrace:
1812 brace_count += 1
1813 elif self.token_type == Id.Lit_RBrace:
1814 brace_count -= 1
1815 elif self.token_type == Id.Lit_Dollar:
1816 if not self.parse_opts.parse_dollar():
1817 if num_parts == 0 and lex_mode == lex_mode_e.ShCommand:
1818 next_byte = self.lexer.ByteLookAhead()
1819 # TODO: switch lexer modes and parse $/d+/. But not ${a:-$/d+/}
1820 if next_byte == '/':
1821 #log('next_byte %r', next_byte)
1822 pass
1823
1824 p_die('Literal $ should be quoted like \$',
1825 self.cur_token)
1826
1827 done = self._MaybeReadWordPart(num_parts == 0, lex_mode,
1828 w.parts)
1829
1830 elif self.token_kind == Kind.VSub:
1831 vsub_token = self.cur_token
1832
1833 part = SimpleVarSub(vsub_token) # type: word_part_t
1834 w.parts.append(part)
1835
1836 elif self.token_kind == Kind.ExtGlob:
1837 # If parse_at, we can take over @( to start @(seq 3)
1838 # Users can also use ,(*.py|*.sh)
1839 if (self.parse_opts.parse_at() and
1840 self.token_type == Id.ExtGlob_At and num_parts == 0):
1841 cs_part = self._ReadCommandSub(Id.Left_AtParen,
1842 d_quoted=False)
1843 # RARE mutation of tok.id!
1844 cs_part.left_token.id = Id.Left_AtParen
1845 part = cs_part # for type safety
1846
1847 # Same check as _MaybeReadWordPart. @(seq 3)x is illegal, just like
1848 # a=(one two)x and @arrayfunc(3)x.
1849 self._GetToken()
1850 if self.token_kind not in KINDS_THAT_END_WORDS:
1851 p_die('Unexpected token after @()', self.cur_token)
1852 done = True
1853
1854 else:
1855 if HAVE_FNM_EXTMATCH == 0:
1856 p_die(
1857 "Extended glob won't work without FNM_EXTMATCH support in libc",
1858 self.cur_token)
1859 part = self._ReadExtGlob()
1860 w.parts.append(part)
1861
1862 elif self.token_kind == Kind.BashRegex:
1863 if self.token_type == Id.BashRegex_LParen: # Opening (
1864 part = self._ReadBashRegexGroup()
1865 w.parts.append(part)
1866 else:
1867 assert self.token_type == Id.BashRegex_AllowedInParens
1868 p_die('Invalid token in bash regex', self.cur_token)
1869
1870 elif self.token_kind == Kind.Left:
1871 try_triple_quote = (self.parse_opts.parse_triple_quote() and
1872 lex_mode == lex_mode_e.ShCommand and
1873 num_parts == 0)
1874
1875 # Save allocation
1876 if try_triple_quote:
1877 is_triple_quoted = BoolParamBox(False)
1878
1879 part = self._ReadUnquotedLeftParts(is_triple_quoted)
1880 w.parts.append(part)
1881
1882 # NOT done yet, will advance below
1883 elif self.token_kind == Kind.Right:
1884 # Still part of the word; will be done on the next iter.
1885 if self.token_type == Id.Right_DoubleQuote:
1886 pass
1887 # Never happens, no PushHint for this case.
1888 #elif self.token_type == Id.Right_DollarParen:
1889 # pass
1890 elif self.token_type == Id.Right_Subshell:
1891 # LEXER HACK for (case x in x) ;; esac )
1892 # Rewind before it's used
1893 assert self.next_lex_mode == lex_mode_e.Undefined
1894 if self.lexer.MaybeUnreadOne():
1895 self.lexer.PushHint(Id.Op_RParen, Id.Right_Subshell)
1896 self._SetNext(lex_mode)
1897 done = True
1898 else:
1899 done = True
1900
1901 elif self.token_kind == Kind.Ignored:
1902 done = True
1903
1904 else:
1905 # LEXER HACK for unbalanced case clause. 'case foo in esac' is valid,
1906 # so to test for ESAC, we can read ) before getting a chance to
1907 # PushHint(Id.Op_RParen, Id.Right_CasePat). So here we unread one
1908 # token and do it again.
1909
1910 # We get Id.Op_RParen at top level: case x in x) ;; esac
1911 # We get Id.Eof_RParen inside ComSub: $(case x in x) ;; esac )
1912 if self.token_type in (Id.Op_RParen, Id.Eof_RParen):
1913 # Rewind before it's used
1914 assert self.next_lex_mode == lex_mode_e.Undefined
1915 if self.lexer.MaybeUnreadOne():
1916 if self.token_type == Id.Eof_RParen:
1917 # Redo translation
1918 self.lexer.PushHint(Id.Op_RParen, Id.Eof_RParen)
1919 self._SetNext(lex_mode)
1920
1921 done = True # anything we don't recognize means we're done
1922
1923 if not done:
1924 self._SetNext(lex_mode)
1925 num_parts += 1
1926
1927 if (self.parse_opts.parse_brace() and num_parts > 1 and
1928 brace_count != 0):
1929 # accept { and }, but not foo{
1930 p_die(
1931 'Word has unbalanced { }. Maybe add a space or quote it like \{',
1932 loc.Word(w))
1933
1934 if is_triple_quoted and is_triple_quoted.b and num_parts > 1:
1935 p_die('Unexpected parts after triple quoted string',
1936 loc.WordPart(w.parts[-1]))
1937
1938 if 0:
1939 from _devbuild.gen.syntax_asdl import word_part_str
1940 word_key = ' '.join(word_part_str(p.tag()) for p in w.parts)
1941 WORD_HIST[word_key] += 1
1942 return w
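# For example, hi$x"y" comes back as ONE CompoundWord whose parts are
# (roughly) a Lit_Chars token, a SimpleVarSub, and a DoubleQuoted part.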
1943
1944 def _ReadArithWord(self):
1945 # type: () -> Optional[word_t]
1946 """ Helper for ReadArithWord() """
1947 self._GetToken()
1948
1949 if self.token_kind == Kind.Unknown:
1950 # e.g. happened during dynamic parsing of unset 'a[$foo]' in gherkin
1951 p_die(
1952 'Unexpected token while parsing arithmetic: %r' %
1953 lexer.TokenVal(self.cur_token), self.cur_token)
1954
1955 elif self.token_kind == Kind.Eof:
1956 return self.cur_token
1957
1958 elif self.token_kind == Kind.Ignored:
1959 # Space should be ignored.
1960 self._SetNext(lex_mode_e.Arith)
1961 return None
1962
1963 elif self.token_kind in (Kind.Arith, Kind.Right):
1964 # Id.Right_DollarDParen IS just a normal token, handled by ArithParser
1965 self._SetNext(lex_mode_e.Arith)
1966 return self.cur_token
1967
1968 elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub):
1969 return self._ReadCompoundWord(lex_mode_e.Arith)
1970
1971 else:
1972 raise AssertionError(self.cur_token)
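# e.g. inside $(( x + 1 )): 'x' and '1' are read as compound words in
# lex_mode_e.Arith, while '+' (Kind.Arith) is returned as a plain token
# for the TDOP arithmetic parser to interpret.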
1973
1974 def _ReadWord(self, word_mode):
1975 # type: (lex_mode_t) -> Optional[word_t]
1976 """Helper function for ReadWord()."""
1977
1978 # Change the pseudo lexer mode to a real lexer mode
1979 if word_mode == lex_mode_e.ShCommandFakeBrack:
1980 lex_mode = lex_mode_e.ShCommand
1981 else:
1982 lex_mode = word_mode
1983
1984 self._GetToken()
1985
1986 if self.token_kind == Kind.Eof:
1987 # No advance
1988 return self.cur_token
1989
1990 # Allow Arith for ) at end of for loop?
1991 elif self.token_kind in (Kind.Op, Kind.Redir, Kind.Arith):
1992 self._SetNext(lex_mode)
1993
1994 # Newlines are complicated. See 3x2 matrix in the comment about
1995 # self.multiline and self.newline_state above.
1996 if self.token_type == Id.Op_Newline:
1997 if self.multiline:
1998 if self.newline_state > 1:
1999 # This points at a blank line, but at least it gives the line number
2000 p_die('Invalid blank line in multiline mode',
2001 self.cur_token)
2002 return None
2003
2004 if self.returned_newline: # skip
2005 return None
2006
2007 return self.cur_token
2008
2009 elif self.token_kind == Kind.Right:
2010 if self.token_type not in (Id.Right_Subshell, Id.Right_ShFunction,
2011 Id.Right_CasePat, Id.Right_Initializer):
2012 raise AssertionError(self.cur_token)
2013
2014 self._SetNext(lex_mode)
2015 return self.cur_token
2016
2017 elif self.token_kind in (Kind.Ignored, Kind.WS):
2018 self._SetNext(lex_mode)
2019 return None
2020
2021 else:
2022 assert self.token_kind in (Kind.VSub, Kind.Lit, Kind.History,
2023 Kind.Left, Kind.KW, Kind.ControlFlow,
2024 Kind.BoolUnary, Kind.BoolBinary,
2025 Kind.ExtGlob,
2026 Kind.BashRegex), 'Unhandled token kind'
2027
2028 if (word_mode == lex_mode_e.ShCommandFakeBrack and
2029 self.parse_opts.parse_bracket() and
2030 self.token_type == Id.Lit_LBracket):
2031 # Change [ from Kind.Lit -> Kind.Op
2032 # So CommandParser can treat
2033 # assert [42 === x]
2034 # like
2035 # json write (x)
2036 bracket_word = self.cur_token
2037 bracket_word.id = Id.Op_LBracket
2038
2039 self._SetNext(lex_mode)
2040 return bracket_word
2041
2042 # We're beginning a word. If we see Id.Lit_Pound, change to
2043 # lex_mode_e.Comment and read until end of line.
2044 if self.token_type == Id.Lit_Pound:
2045 self._SetNext(lex_mode_e.Comment)
2046 self._GetToken()
2047
2048 # NOTE: The # could be the last character in the file. It can't be
2049 # Eof_{RParen,Backtick} because #) and #` are comments.
2050 assert self.token_type in (Id.Ignored_Comment, Id.Eof_Real), \
2051 self.cur_token
2052
2053 # The next iteration will go into Kind.Ignored and set lex state to
2054 # lex_mode_e.ShCommand/etc.
2055 return None # tell ReadWord() to try again after comment
2056
2057 elif self.token_type == Id.Lit_TPound: ### doc comment
2058 self._SetNext(lex_mode_e.Comment)
2059 self._GetToken()
2060
2061 if self.token_type == Id.Ignored_Comment and self.emit_doc_token:
2062 return self.cur_token
2063
2064 return None # tell ReadWord() to try again after comment
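# e.g. given 'echo hi  # comment', the comment is skipped and ReadWord()
# retries; with '### doc comment', the Ignored_Comment token itself is
# returned when emit_doc_token is on.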
2065
2066 else:
2067 # r'' u'' b''
2068 if (self.token_type == Id.Lit_Chars and
2069 self.lexer.LookAheadOne(
2070 lex_mode_e.ShCommand) == Id.Left_SingleQuote):
2071
2072 # When shopt -s parse_raw_string:
2073 # echo r'hi' is like echo 'hi'
2074 #
2075 # echo u'\u{3bc}' b'\yff' works
2076
2077 tok = self.cur_token
2078 if self.parse_opts.parse_ysh_string():
2079 if lexer.TokenEquals(tok, 'r'):
2080 left_id = Id.Left_RSingleQuote
2081 elif lexer.TokenEquals(tok, 'u'):
2082 left_id = Id.Left_USingleQuote
2083 elif lexer.TokenEquals(tok, 'b'):
2084 left_id = Id.Left_BSingleQuote
2085 else:
2086 left_id = Id.Undefined_Tok
2087
2088 if left_id != Id.Undefined_Tok:
2089 # skip the r, and then 'foo' will be read as normal
2090 self._SetNext(lex_mode_e.ShCommand)
2091
2092 self._GetToken()
2093 assert self.token_type == Id.Left_SingleQuote, self.token_type
2094
2095 # Read the word in a different lexer mode
2096 return self._ReadYshSingleQuoted(left_id)
2097
2098 return self._ReadCompoundWord(lex_mode)
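# Lookahead examples, under parse_ysh_string: echo r'hi' is read like
# echo 'hi', and u'\u{3bc}' / b'\yff' get their own left tokens, while
# an unrecognized prefix like x'foo' falls through to a normal
# compound word.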
2099
2100 def ParseVarRef(self):
2101 # type: () -> BracedVarSub
2102 """DYNAMIC parsing of what's inside ${!ref}
2103
2104 # Same as VarOf production
2105 VarRefExpr = VarOf EOF
2106 """
2107 self._SetNext(lex_mode_e.VSub_1)
2108
2109 self._GetToken()
2110 if self.token_kind != Kind.VSub:
2111 p_die('Expected var name', self.cur_token)
2112
2113 part = self._ParseVarOf()
2114 # NOTE: no ${ } means no part.left and part.right
2115 part.left = part.name_tok # cheat to make test pass
2116 part.right = part.name_tok
2117
2118 self._GetToken()
2119 if self.token_type != Id.Eof_Real:
2120 p_die('Expected end of var ref expression', self.cur_token)
2121 return part
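# e.g. with ref=x, evaluating ${!ref} re-parses the string 'x' here at
# runtime, per the DYNAMIC note in the docstring.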
2122
2123 def LookPastSpace(self):
2124 # type: () -> Id_t
2125 """Look ahead to the next token.
2126
2127 For the CommandParser to recognize
2128 array= (1 2 3)
2129 YSH for ( versus bash for ((
2130 YSH if ( versus if test
2131 YSH while ( versus while test
2132 YSH bare assignment 'grep =' versus 'grep foo'
2133 """
2134 assert self.token_type != Id.Undefined_Tok
2135 if self.cur_token.id == Id.WS_Space:
2136 id_ = self.lexer.LookPastSpace(lex_mode_e.ShCommand)
2137 else:
2138 id_ = self.cur_token.id
2139 return id_
2140
2141 def LookAheadFuncParens(self):
2142 # type: () -> bool
2143 """Special lookahead for f( ) { echo hi; } to check for ( )"""
2144 assert self.token_type != Id.Undefined_Tok
2145
2146 # We have to handle 2 cases because we buffer a token
2147 if self.cur_token.id == Id.Op_LParen: # saw funcname(
2148 return self.lexer.LookAheadFuncParens(1) # go back one char
2149
2150 elif self.cur_token.id == Id.WS_Space: # saw funcname WHITESPACE
2151 return self.lexer.LookAheadFuncParens(0)
2152
2153 else:
2154 return False
2155
2156 def ReadWord(self, word_mode):
2157 # type: (lex_mode_t) -> word_t
2158 """Read the next word, using the given lexer mode.
2159
2160 This is a stateful wrapper for the stateless _ReadWord function.
2161 """
2162 assert word_mode in (lex_mode_e.ShCommand,
2163 lex_mode_e.ShCommandFakeBrack,
2164 lex_mode_e.DBracket, lex_mode_e.BashRegex)
2165
2166 if self.buffered_word: # For integration with pgen2
2167 w = self.buffered_word
2168 self.buffered_word = None
2169 else:
2170 while True:
2171 w = self._ReadWord(word_mode)
2172 if w is not None:
2173 break
2174
2175 self.returned_newline = (word_.CommandId(w) == Id.Op_Newline)
2176 return w
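# Note that _ReadWord() returns None to mean 'try again', e.g. after a
# comment or ignored whitespace, so this loop always yields a real word.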
2177
2178 def ReadArithWord(self):
2179 # type: () -> word_t
2180 while True:
2181 w = self._ReadArithWord()
2182 if w is not None:
2183 break
2184 return w
2185
2186 def ReadHereDocBody(self, parts):
2187 # type: (List[word_part_t]) -> None
2188 """
2189 A here doc is like a double quoted context, except " isn't special.
2190 """
2191 self._ReadLikeDQ(None, False, parts)
2192 # Returns nothing
2193
2194 def ReadForPlugin(self):
2195 # type: () -> CompoundWord
2196 """For $PS1, $PS4, etc.
2197
2198 This is just like reading a here doc line. "\n" is allowed, as
2199 well as the typical substitutions ${x} $(echo hi) $((1 + 2)).
2200 """
2201 w = CompoundWord([])
2202 self._ReadLikeDQ(None, False, w.parts)
2203 return w
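# e.g. a PS1 value like '$(date) $ ' is read as one CompoundWord; as
# with here doc lines, a literal " is not special.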
2204
2205 def EmitDocToken(self, b):
2206 # type: (bool) -> None
2207 self.emit_doc_token = b
2208
2209 def Multiline(self, b):
2210 # type: (bool) -> None
2211 self.multiline = b
2212
2213
2214if 0:
2215 import collections
2216 WORD_HIST = collections.Counter()
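if 0:
    # A minimal driver sketch (dead code, like the block above). The
    # MakeWordParser() factory name is an assumption for illustration,
    # not necessarily the real ParseContext API.
    def _DemoReadAllWords(parse_ctx, lx, line_reader):
        # type: (ParseContext, Lexer, _Reader) -> None
        w_parser = parse_ctx.MakeWordParser(lx, line_reader)
        while True:  # read until the lexer hits EOF
            w = w_parser.ReadWord(lex_mode_e.ShCommand)
            if word_.CommandId(w) == Id.Eof_Real:
                break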
2217
2218# vim: sw=4