OILS / osh / word_parse.py
# Copyright 2016 Andy Chu. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
"""
word_parse.py - Parse the shell word language.

Hairy example:

    hi$((1 + 2))"$(echo hi)"${var:-__"$(echo default)"__}

Substitutions can be nested, but which inner subs are allowed depends on the
outer sub.  Notes:

lex_mode_e.ShCommand (_ReadUnquotedLeftParts)
  All subs and quotes are allowed:
    $v ${v}  $() ``  $(())  '' ""  $'' $""  <() >()

lex_mode_e.DQ (_ReadDoubleQuotedLeftParts)
  Var, Command, Arith, but no quotes.
    $v ${v}  $() ``  $(())
  No process substitution.

lex_mode_e.Arith
  Similar to DQ: Var, Command, and Arith sub, but no process sub.  bash
  doesn't allow quotes, but OSH does.  We allow ALL FOUR kinds of quotes,
  because we need those for associative array indexing.

lex_mode_e.VSub_ArgUnquoted
  Like ShCommand, everything is allowed (even process substitutions), but we
  stop at }, and space is SIGNIFICANT.

  Example: ${a:- b }

  ${X:-$v}  ${X:-${v}}  ${X:-$(echo hi)}  ${X:-`echo hi`}  ${X:-$((1+2))}
  ${X:-'single'}  ${X:-"double"}  ${X:-$'\n'}  ${X:-<(echo hi)}
lex_mode_e.VSub_ArgDQ
  In contrast to DQ, VSub_ArgDQ accepts nested "" and $'' and $"", e.g.
  "${x:-"default"}".

  In contrast, VSub_ArgUnquoted respects single quotes and process
  substitution.

  It's weird that double quotes are allowed.  Space is also significant here,
  e.g. "${x:-a "b"}".
"""

from _devbuild.gen import grammar_nt
from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str, Kind
from _devbuild.gen.types_asdl import (lex_mode_t, lex_mode_e)
from _devbuild.gen.syntax_asdl import (
    BoolParamBox,
    Token,
    SimpleVarSub,
    loc,
    source,
    DoubleQuoted,
    SingleQuoted,
    BracedVarSub,
    CommandSub,
    InitializerWord,
    InitializerWord_t,
    bracket_op,
    bracket_op_t,
    suffix_op,
    suffix_op_t,
    rhs_word,
    rhs_word_e,
    rhs_word_t,
    word_e,
    word_t,
    CompoundWord,
    word_part,
    word_part_t,
    y_lhs_e,
    arith_expr_t,
    command,
    expr,
    expr_e,
    expr_t,
    pat_t,
    ArgList,
    Proc,
    Func,
    Subscript,
    Attribute,
    arith_expr,
    VarDecl,
    Mutation,
    word_part_e,
)
from core import alloc
from core.error import p_die
from mycpp.mylib import log
from core import pyutil
from display import ui
from frontend import consts
from frontend import lexer
from frontend import reader
from osh import tdop
from osh import arith_parse
from osh import braces
from osh import word_
from osh import word_compile
from mycpp.mylib import tagswitch

from libc import HAVE_FNM_EXTMATCH

from typing import List, Optional, Tuple, cast
from typing import TYPE_CHECKING
if TYPE_CHECKING:
    from frontend.lexer import Lexer
    from frontend.parse_lib import ParseContext
    from frontend.reader import _Reader
    from osh.cmd_parse import VarChecker

unused1 = log
unused2 = Id_str

KINDS_THAT_END_WORDS = [Kind.Eof, Kind.WS, Kind.Op, Kind.Right]


def _IsValidYshWord(w):
    # type: (CompoundWord) -> bool
    """YSH word restriction

    Allowed:
        'foo'  r'foo'  --flag r'foo'
        --flag='foo'
        --flag="foo"
    Not allowed:
        --flag=r'bar'  NAME=u'value'   # ambiguous
        --flag=b''' multi '''
    """
    parts = w.parts
    n = len(parts)

    if n != 0 and word_.LiteralId(parts[0]) == Id.Lit_Tilde:
        # ~bob/src/'dir with spaces' is allowed
        # ~bob/src/u'dir with spaces' is ambiguous, but allowed for simplicity
        return True  # early return

    ok = True
    if n >= 2:
        for part in parts:
            if part.tag() in (word_part_e.SingleQuoted,
                              word_part_e.DoubleQuoted):
                ok = False

    # Allow special cases:
    #     --flag='val'  NAME='bar'
    # But NOT
    #     --flag=r'val'  NAME=r'val'
    if not ok:
        if (n == 2 and word_.LiteralId(parts[0]) == Id.Lit_VarLike):
            ok = True
        elif (n == 3 and word_.LiteralId(parts[0]) == Id.Lit_Chars and
              word_.LiteralId(parts[1]) == Id.Lit_Equals):
            ok = True

    return ok


class WordEmitter(object):
    """Common interface for [ and [["""

    def __init__(self):
        # type: () -> None
        """Empty constructor for mycpp."""
        pass

    def ReadWord(self, lex_mode):
        # type: (lex_mode_t) -> word_t
        raise NotImplementedError()


class WordParser(WordEmitter):

    def __init__(self, parse_ctx, lexer, line_reader):
        # type: (ParseContext, Lexer, _Reader) -> None
        self.parse_ctx = parse_ctx
        self.lexer = lexer
        self.line_reader = line_reader
        self.arena = line_reader.arena

        self.parse_opts = parse_ctx.parse_opts
        self.a_parser = tdop.TdopParser(arith_parse.Spec(), self,
                                        self.parse_opts)
        self.Reset()

    def Init(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Used to parse arithmetic, see ParseContext."""
        self.next_lex_mode = lex_mode

    def Reset(self):
        # type: () -> None
        """Called by interactive loop."""
        # For _GetToken()
        self.cur_token = None  # type: Token
        self.token_kind = Kind.Undefined
        self.token_type = Id.Undefined_Tok

        self.next_lex_mode = lex_mode_e.ShCommand

        # Boolean mutated by CommandParser via word_.ctx_EmitDocToken.  For
        # ### doc comments
        self.emit_doc_token = False
        # Boolean mutated by CommandParser via word_.ctx_Multiline.  '...'
        # starts multiline mode.
        self.multiline = False

        # For detecting invalid \n\n in multiline mode.  Counts what we got
        # directly from the lexer.
        self.newline_state = 0
        # For consolidating \n\n -> \n for the CALLER.  This simplifies the
        # parsers that consume words.
        self.returned_newline = False

        # For integration with pgen2
        self.buffered_word = None  # type: word_t

    def _GetToken(self):
        # type: () -> None
        """Call this when you need to make a decision based on any of:

        self.token_type
        self.token_kind
        self.cur_token
        """
        if self.next_lex_mode == lex_mode_e.Undefined:
            return  # _SetNext() not called, so do nothing

        is_fake = self.next_lex_mode == lex_mode_e.BashRegexFakeInner
        real_mode = (lex_mode_e.BashRegex if is_fake else self.next_lex_mode)

        self.cur_token = self.lexer.Read(real_mode)

        # MUTATE TOKEN for fake lexer mode.
        # This is for crazy stuff bash allows, like [[ s =~ (< >) ]]
        if (is_fake and self.cur_token.id
                in (Id.WS_Space, Id.BashRegex_AllowedInParens)):
            self.cur_token.id = Id.Lit_Chars

        self.token_type = self.cur_token.id
        self.token_kind = consts.GetKind(self.token_type)

        # number of consecutive newlines, ignoring whitespace
        if self.token_type == Id.Op_Newline:
            self.newline_state += 1
        elif self.token_kind != Kind.WS:
            self.newline_state = 0

        self.parse_ctx.trail.AppendToken(self.cur_token)  # For completion
        self.next_lex_mode = lex_mode_e.Undefined

    def _SetNext(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Set the next lex state, but don't actually read a token.

        We need this for proper interactive parsing.
        """
        self.next_lex_mode = lex_mode

    def _ReadVarOpArg(self, arg_lex_mode):
        # type: (lex_mode_t) -> rhs_word_t

        # NOTE: Operators like | and < are not treated as special, so
        # ${a:- | >} is valid, even when unquoted.
        self._SetNext(arg_lex_mode)
        self._GetToken()

        w = self._ReadVarOpArg2(arg_lex_mode, Id.Undefined_Tok,
                                True)  # empty_ok

        # If the Compound has no parts, and we're in a double-quoted VarSub
        # arg, and empty_ok, then return Empty.  This is so it can evaluate to
        # the empty string and not get elided.
        #
        # Examples:
        # - "${s:-}", "${s/%pat/}"
        # It's similar to LooksLikeShAssignment where we turn x= into x=''.
        # And it has the same potential problem of not having Token location
        # info.
        #
        # NOTE: empty_ok is False only for the PatSub pattern, which means
        # we'll return a Compound with no parts, which is explicitly checked
        # with a custom error message.
        if len(w.parts) == 0 and arg_lex_mode == lex_mode_e.VSub_ArgDQ:
            return rhs_word.Empty

        return w

    def _ReadVarOpArg2(self, arg_lex_mode, eof_type, empty_ok):
        # type: (lex_mode_t, Id_t, bool) -> CompoundWord
        """Return a CompoundWord.

        Helper function for _ReadVarOpArg and used directly by
        _ReadPatSubVarOp.
        """
        w = self._ReadCompoundWord3(arg_lex_mode, eof_type, empty_ok)
        #log('w %s', w)
        tilde = word_.TildeDetect(w)
        if tilde:
            w = tilde
        return w

    def _ReadSliceVarOp(self):
        # type: () -> suffix_op.Slice
        """
        Looking at the token after the first ':'

        ArithExpr? (':' ArithExpr? )? '}'
        """
        self._NextNonSpace()

        cur_id = self.token_type

        if cur_id in (Id.Arith_RBrace, Id.Arith_Colon):  # ${a:} or ${a::}
            begin = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            begin = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()  # advance

        if cur_id == Id.Arith_RBrace:  # ${a:1} or ${@:1}
            # No length specified, so it's N
            no_length = None  # type: Optional[arith_expr_t]
            return suffix_op.Slice(begin, no_length)

        elif cur_id == Id.Arith_Colon:  # ${a:1:} or ${@:1:}
            colon_tok = self.cur_token
            self._NextNonSpace()

            if self.token_type == Id.Arith_RBrace:
                # quirky bash behavior:
                # ${a:1:} or ${a::} means length ZERO
                # but ${a:1} or ${a:} means length N
                if self.parse_opts.strict_parse_slice():
                    p_die(
                        "Slice length: Add explicit zero, or omit : for N (strict_parse_slice)",
                        colon_tok)

                length = arith_expr.EmptyZero  # type: arith_expr_t
            else:
                length = self._ReadArithExpr(Id.Arith_RBrace)

            return suffix_op.Slice(begin, length)

        else:
            p_die("Expected : or } in slice", self.cur_token)

        raise AssertionError()  # for MyPy

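    # What _ReadSliceVarOp accepts, as a sketch (begin/length are
    # arith_expr_t nodes):
    #
    #   ${a:1:2}  ->  suffix_op.Slice(begin=1, length=2)
    #   ${a:1}    ->  suffix_op.Slice(begin=1, length=None)  # length N
    #   ${a:1:}   ->  length is EmptyZero, i.e. ZERO; rejected under
    #                 strict_parse_slice
    #   ${a::}    ->  begin and length are both EmptyZero
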
    def _ReadPatSubVarOp(self):
        # type: () -> suffix_op.PatSub
        """Looking at the first '/' after VarOf:

        VarSub = ...
               | VarOf '/' Match ( '/' WORD? )?
        Match  = '/' WORD   # can't be empty
               | '#' WORD?  # may be empty
               | '%' WORD?
        """
        slash_tok = self.cur_token  # location info
        replace_mode = Id.Undefined_Tok  # bizarre syntax / # %

        self._SetNext(lex_mode_e.VSub_ArgUnquoted)  # advance past /

        self._GetToken()
        if self.token_type == Id.Right_DollarBrace:
            pat = CompoundWord([])
            return suffix_op.PatSub(pat, rhs_word.Empty, replace_mode,
                                    slash_tok)

        if self.token_type in (Id.Lit_Slash, Id.Lit_Pound, Id.Lit_Percent):
            replace_mode = self.token_type
            self._SetNext(lex_mode_e.VSub_ArgUnquoted)

        # Bash quirk:
        # echo ${x/#/replace} has an empty pattern
        # echo ${x////replace} is non-empty; it means echo ${x//'/'/replace}
        empty_ok = replace_mode != Id.Lit_Slash
        pat = self._ReadVarOpArg2(lex_mode_e.VSub_ArgUnquoted, Id.Lit_Slash,
                                  empty_ok)
        #log('pat 1 %r', pat)

        if self.token_type == Id.Lit_Slash:
            # read until }
            replace = self._ReadVarOpArg(
                lex_mode_e.VSub_ArgUnquoted)  # type: rhs_word_t
            #log('r 1 %r', replace)
        else:
            # e.g. ${v/a} is the same as ${v/a/} -- empty replacement string
            replace = rhs_word.Empty

        self._GetToken()
        if self.token_type != Id.Right_DollarBrace:
            # This happens on invalid code
            p_die(
                "Expected } after replacement string, got %s" %
                ui.PrettyId(self.token_type), self.cur_token)

        return suffix_op.PatSub(pat, replace, replace_mode, slash_tok)

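    # Pattern substitution forms handled by _ReadPatSubVarOp, as a sketch:
    #
    #   ${x/pat/repl}    replace first match  (replace_mode unset)
    #   ${x//pat/repl}   replace all matches  (replace_mode = Lit_Slash)
    #   ${x/#pat/repl}   anchor at prefix     (replace_mode = Lit_Pound)
    #   ${x/%pat/repl}   anchor at suffix     (replace_mode = Lit_Percent)
    #   ${x/pat}         same as ${x/pat/} -- empty replacement
    #   ${x////repl}     bash quirk: the pattern is '/' (empty_ok is False)
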
    def _ReadSubscript(self):
        # type: () -> bracket_op_t
        """ Subscript = '[' ('@' | '*' | ArithExpr) ']' """
        # Lookahead to see if we get @ or *.  Otherwise read a full arithmetic
        # expression.
        next_id = self.lexer.LookPastSpace(lex_mode_e.Arith)
        if next_id in (Id.Lit_At, Id.Arith_Star):
            op = bracket_op.WholeArray(next_id)  # type: bracket_op_t

            self._SetNext(lex_mode_e.Arith)  # skip past [
            self._GetToken()
            self._SetNext(lex_mode_e.Arith)  # skip past @
            self._GetToken()
        else:
            self._SetNext(lex_mode_e.Arith)  # skip past [
            anode = self._ReadArithExpr(Id.Arith_RBracket)
            op = bracket_op.ArrayIndex(anode)

        if self.token_type != Id.Arith_RBracket:  # Should be looking at ]
            p_die('Expected ] to close subscript', self.cur_token)

        self._SetNext(lex_mode_e.VSub_2)  # skip past ]
        self._GetToken()  # Needed to be in the same spot as no subscript

        return op

    def _ParseVarOf(self):
        # type: () -> BracedVarSub
        """
        VarOf = NAME Subscript?
              | NUMBER     # no subscript allowed, none of these are arrays
                           # ${@[1]} doesn't work, even though slicing does
              | VarSymbol
        """
        self._GetToken()
        name_token = self.cur_token
        self._SetNext(lex_mode_e.VSub_2)

        self._GetToken()  # Check for []
        if self.token_type == Id.VOp2_LBracket:
            bracket_op = self._ReadSubscript()
        else:
            bracket_op = None

        part = BracedVarSub.CreateNull()
        part.name_tok = name_token
        part.var_name = lexer.TokenVal(name_token)
        part.bracket_op = bracket_op
        return part

    def _ParseVarExpr(self, arg_lex_mode, allow_query=False):
        # type: (lex_mode_t, bool) -> BracedVarSub
        """Start parsing at the op -- we already skipped past the name."""
        part = self._ParseVarOf()

        self._GetToken()
        if self.token_type == Id.Right_DollarBrace:
            return part  # no ops

        op_kind = self.token_kind

        if op_kind == Kind.VTest:
            tok = self.cur_token
            arg_word = self._ReadVarOpArg(arg_lex_mode)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            part.suffix_op = suffix_op.Unary(tok, arg_word)

        elif op_kind == Kind.VOpYsh:
            tok = self.cur_token
            arg_word = self._ReadVarOpArg(arg_lex_mode)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            UP_arg_word = arg_word
            with tagswitch(arg_word) as case:
                if case(rhs_word_e.Empty):
                    pass
                elif case(rhs_word_e.Compound):
                    arg_word = cast(CompoundWord, UP_arg_word)
                    # This handles ${x|html} and ${x %.3f} now
                    # However I think ${x %.3f} should be statically parsed?
                    # It can enter the printf lexer modes.
                    ok, arg, quoted = word_.StaticEval(arg_word)
                    if not ok or quoted:
                        p_die('Expected a constant argument',
                              loc.Word(arg_word))

            part.suffix_op = suffix_op.Static(tok, arg)

        elif op_kind == Kind.VOp0:
            part.suffix_op = self.cur_token  # Nullary
            self._SetNext(lex_mode_e.VSub_2)  # Expecting }
            self._GetToken()

        elif op_kind == Kind.VOp1:  # % %% # ## etc.
            tok = self.cur_token
            # Weird exception that all shells have: these operators take a
            # glob pattern, so they're lexed as VSub_ArgUnquoted, not
            # VSub_ArgDQ
            arg_word = self._ReadVarOpArg(lex_mode_e.VSub_ArgUnquoted)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            part.suffix_op = suffix_op.Unary(tok, arg_word)

        elif op_kind == Kind.VOp2:  # / : [ ]
            if self.token_type == Id.VOp2_Slash:
                patsub_op = self._ReadPatSubVarOp()  # type: suffix_op_t
                part.suffix_op = patsub_op

                # Checked by the method above
                assert self.token_type == Id.Right_DollarBrace, self.cur_token

            elif self.token_type == Id.VOp2_Colon:
                part.suffix_op = self._ReadSliceVarOp()
                # NOTE: } in arithmetic mode.
                if self.token_type != Id.Arith_RBrace:
                    # Token seems off; doesn't point to X in ${a:1:2 X
                    p_die('Expected } to close ${', self.cur_token)

            else:
                # TODO: Does this ever happen?
                p_die('Unexpected token in ${} (%s)' % 'VOp2', self.cur_token)

        elif op_kind == Kind.VOp3:  # ${prefix@} etc.
            if allow_query:
                part.suffix_op = self.cur_token  # Nullary
                self._SetNext(lex_mode_e.VSub_2)  # Expecting }
                self._GetToken()
            else:
                p_die("Unexpected token in ${} (%s)" % 'VOp3', self.cur_token)

        # NOTE: Arith_RBrace is for slicing, because it reads } in arithmetic
        # mode.  It's redundantly checked above.
        if self.token_type not in (Id.Right_DollarBrace, Id.Arith_RBrace):
            # ${a.} or ${!a.}
            p_die('Expected } to close ${', self.cur_token)

        # Now look for ops
        return part

    def _ReadZshVarSub(self, left_token):
        # type: (Token) -> word_part.ZshVarSub

        self._SetNext(lex_mode_e.VSub_Zsh)  # Move past ${(foo)

        # Can be empty
        w = self._ReadCompoundWord3(lex_mode_e.VSub_Zsh, Id.Right_DollarBrace,
                                    True)
        self._GetToken()
        return word_part.ZshVarSub(left_token, w, self.cur_token)

    def ReadBracedVarSub(self, left_token):
        # type: (Token) -> Tuple[BracedVarSub, Token]
        """ For YSH expressions like var x = ${x:-"default"}. """
        part = self._ReadBracedVarSub(left_token, d_quoted=False)
        last_token = self.cur_token
        return part, last_token

    def _ReadBracedVarSub(self, left_token, d_quoted):
        # type: (Token, bool) -> BracedVarSub
        """For the ${} expression language.

        NAME        = [a-zA-Z_][a-zA-Z0-9_]*
        NUMBER      = [0-9]+                    # ${10}, ${11}, ...

        Subscript   = '[' ('@' | '*' | ArithExpr) ']'
        VarSymbol   = '!' | '@' | '#' | ...
        VarOf       = NAME Subscript?
                    | NUMBER     # no subscript allowed, none of these are
                                 # arrays; ${@[1]} doesn't work, even though
                                 # slicing does
                    | VarSymbol

        NULLARY_OP  = '@Q' | '@E' | '@P' | '@A' | '@a'  # VOp0

        TEST_OP     = '-' | ':-' | '=' | ':=' | '+' | ':+' | '?' | ':?'
        STRIP_OP    = '#' | '##' | '%' | '%%'
        CASE_OP     = ',' | ',,' | '^' | '^^'
        UnaryOp     = TEST_OP | STRIP_OP | CASE_OP

        YSH_UNARY   = '|' | ' '                 # ${x|html} and ${x %.3f}.
                                                # SPACE is operator not %
        Match       = ('/' | '#' | '%') WORD    # match all / prefix / suffix
        VarExpr     = VarOf
                    | VarOf NULLARY_OP
                    | VarOf UnaryOp WORD
                    | VarOf YSH_UNARY STATIC_WORD
                    | VarOf ':' ArithExpr (':' ArithExpr )?
                    | VarOf '/' Match '/' WORD

        LengthExpr  = '#' VarOf    # can't apply operators after length

        RefOrKeys   = '!' VarExpr  # CAN apply operators after a named ref
                                   # ${!ref[0]} vs ${!keys[@]} resolved later

        PrefixQuery = '!' NAME ('*' | '@')  # list variable names with a
                                            # prefix

        BuiltinSub  = '.' WORD+    # ${.myproc 'builtin' $sub}

        VarSub      = LengthExpr
                    | RefOrKeys
                    | PrefixQuery
                    | VarExpr
                    | BuiltinSub

        NOTES:
        - Arithmetic expressions are used twice, inside subscripts ${a[x+1]}
          and slicing ${a:x+1:y+2}
        - ${#} and ${!} need LL(2) lookahead (considering how my tokenizer
          works)
        - @ and * are technically arithmetic expressions in this
          implementation
        - We don't account for bash 4.4: ${param@operator} -- Q E P A a.
          Note that it's also vectorized.

        Strictness over bash:
        - echo ${a[0][0]} doesn't do anything useful, so we disallow it from
          the grammar
        - ! and # prefixes can't be composed, even though named refs can be
          composed with other operators
        - '#' means 4 different things: length prefix, VarSymbol, UnaryOp to
          strip a prefix, and it can also be a literal part of WORD.

        From the parser's point of view, the prefix # can't be combined with
        UnaryOp/slicing/matching, and the ! can.  However

        - ${a[@]:1:2} is not allowed
        - ${#a[@]:1:2} is allowed, but gives the wrong answer
        """
        if d_quoted:
            arg_lex_mode = lex_mode_e.VSub_ArgDQ
        else:
            arg_lex_mode = lex_mode_e.VSub_ArgUnquoted

        self._SetNext(lex_mode_e.VSub_1)
        self._GetToken()

        ty = self.token_type
        first_tok = self.cur_token

        if ty == Id.VSub_Pound:
            # Disambiguate
            next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
            if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
                # e.g. a name, '#' is the prefix
                self._SetNext(lex_mode_e.VSub_1)
                part = self._ParseVarOf()

                self._GetToken()
                if self.token_type != Id.Right_DollarBrace:
                    p_die('Expected } after length expression',
                          self.cur_token)

                part.prefix_op = first_tok

            else:  # not a prefix, '#' is the variable
                part = self._ParseVarExpr(arg_lex_mode)

        elif ty == Id.VSub_Bang:
            next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
            if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
                # e.g. a name, '!' is the prefix
                # ${!a} -- this is a ref
                # ${!3} -- this is ref
                # ${!a[1]} -- this is a ref
                # ${!a[@]} -- this is a keys
                # No lookahead -- do it in a second step, or at runtime
                self._SetNext(lex_mode_e.VSub_1)
                part = self._ParseVarExpr(arg_lex_mode, allow_query=True)

                part.prefix_op = first_tok

            else:  # not a prefix, '!' is the variable
                part = self._ParseVarExpr(arg_lex_mode)

        elif ty == Id.VSub_Dot:
            # Note: this will become a new builtin_sub type, so this method
            # must return word_part_t rather than BracedVarSub.  I don't
            # think that should cause problems.
            p_die('TODO: ${.myproc builtin sub}', self.cur_token)

        # VS_NAME, VS_NUMBER, symbol that isn't # or !
        elif self.token_kind == Kind.VSub:
            part = self._ParseVarExpr(arg_lex_mode)

        else:
            # e.g. ${^}
            p_die('Unexpected token in ${}', self.cur_token)

        part.left = left_token  # attach the argument
        part.right = self.cur_token
        return part

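    # A summary (not new behavior) of how the prefix characters are
    # disambiguated above, using LookPastSpace:
    #
    #   ${#}       '#' is the variable itself (the arg count)
    #   ${#x}      length: prefix_op is the VSub_Pound token
    #   ${x#pat}   strip prefix: suffix_op.Unary, lexed as VSub_ArgUnquoted
    #   ${!x}      named ref; ${!a[@]} may mean 'keys' -- resolved later
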
    def _ReadSingleQuoted(self, left_token, lex_mode):
        # type: (Token, lex_mode_t) -> SingleQuoted
        """Internal method to read a word_part."""
        tokens = []  # type: List[Token]
        # In command mode, we never disallow backslashes like '\'
        right_quote = self.ReadSingleQuoted(lex_mode, left_token, tokens,
                                            False)
        sval = word_compile.EvalSingleQuoted(left_token.id, tokens)
        node = SingleQuoted(left_token, sval, right_quote)
        return node

    def ReadSingleQuoted(self, lex_mode, left_token, out_tokens, is_ysh_expr):
        # type: (lex_mode_t, Token, List[Token], bool) -> Token
        """Appends to out_tokens; returns last token

        Used by expr_parse.py
        """
        # TODO: Remove and use out_tokens
        tokens = []  # type: List[Token]

        # echo '\' is allowed, but x = '\' is invalid, in favor of x = r'\'
        no_backslashes = is_ysh_expr and left_token.id == Id.Left_SingleQuote

        expected_end_tokens = 3 if left_token.id in (
            Id.Left_TSingleQuote, Id.Left_RTSingleQuote,
            Id.Left_UTSingleQuote, Id.Left_BTSingleQuote) else 1
        num_end_tokens = 0

        while num_end_tokens < expected_end_tokens:
            self._SetNext(lex_mode)
            self._GetToken()

            # Kind.Char emitted in lex_mode.SQ_C
            if self.token_kind in (Kind.Lit, Kind.Char):
                tok = self.cur_token
                # Happens in lex_mode_e.SQ: 'one\two' is ambiguous, should be
                # r'one\two' or c'one\\two'
                if no_backslashes and lexer.TokenContains(tok, '\\'):
                    p_die(
                        r"Strings with backslashes should look like r'\n' or u'\n' or b'\n'",
                        tok)

                if is_ysh_expr:
                    # Disallow var x = $'\001'.  Arguably we don't need these
                    # checks because u'\u{1}' is the way to write it.
                    if self.token_type == Id.Char_Octal3:
                        p_die(
                            r"Use \xhh or \u{...} instead of octal escapes in YSH strings",
                            tok)

                    if self.token_type == Id.Char_Hex and self.cur_token.length != 4:
                        # disallow \xH
                        p_die(
                            r'Invalid hex escape in YSH string (must be \xHH)',
                            tok)

                tokens.append(tok)

            elif self.token_kind == Kind.Unknown:
                tok = self.cur_token
                assert tok.id == Id.Unknown_Backslash, tok

                # x = $'\z' is disallowed; ditto for echo $'\z' if
                # shopt -u parse_backslash
                if is_ysh_expr or not self.parse_opts.parse_backslash():
                    p_die(
                        "Invalid char escape in C-style string literal (OILS-ERR-11)",
                        tok)

                tokens.append(tok)

            elif self.token_kind == Kind.Eof:
                p_die('Unexpected EOF in single-quoted string that began here',
                      left_token)

            elif self.token_kind == Kind.Right:
                # assume Id.Right_SingleQuote
                num_end_tokens += 1
                tokens.append(self.cur_token)

            else:
                raise AssertionError(self.cur_token)

            if self.token_kind != Kind.Right:
                num_end_tokens = 0  # we need three in a ROW

        if expected_end_tokens == 1:
            tokens.pop()
        elif expected_end_tokens == 3:  # Get rid of spurious end tokens
            tokens.pop()
            tokens.pop()
            tokens.pop()

        # Remove space from ''' r''' $''' in both expression mode and command
        # mode
        if left_token.id in (Id.Left_TSingleQuote, Id.Left_RTSingleQuote,
                             Id.Left_UTSingleQuote, Id.Left_BTSingleQuote):
            word_compile.RemoveLeadingSpaceSQ(tokens)

        # Validation after lexing - same 2 checks in j8.LexerDecoder
        is_u_string = left_token.id in (Id.Left_USingleQuote,
                                        Id.Left_UTSingleQuote)

        for tok in tokens:
            # u'\yff' is not valid, but b'\yff' is
            if is_u_string and tok.id == Id.Char_YHex:
                p_die(
                    r"%s escapes not allowed in u'' strings" %
                    lexer.TokenVal(tok), tok)

        out_tokens.extend(tokens)
        return self.cur_token

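    # Sketch of the end-token counting above: a triple-quoted string ends
    # only at three CONSECUTIVE right-quote tokens, so interior quotes reset
    # the counter:
    #
    #   x = '''       expected_end_tokens = 3
    #   a 'b' c       Right tokens here, but not consecutive -> reset to 0
    #   '''           three in a row -> loop exits; spurious end tokens popped
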
    def _ReadDoubleQuotedLeftParts(self):
        # type: () -> word_part_t
        """Read substitution parts in a double quoted context."""
        if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick):
            return self._ReadCommandSub(self.token_type, d_quoted=True)

        if self.token_type == Id.Left_DollarBrace:
            return self._ReadBracedVarSub(self.cur_token, d_quoted=True)

        if self.token_type == Id.Left_DollarDParen:
            return self._ReadArithSub()

        if self.token_type == Id.Left_DollarBracket:
            return self._ReadExprSub(lex_mode_e.DQ)

        if self.token_type == Id.Left_DollarBraceZsh:
            return self._ReadZshVarSub(self.cur_token)

        raise AssertionError(self.cur_token)

    def _ReadYshSingleQuoted(self, left_id):
        # type: (Id_t) -> CompoundWord
        """Read YSH style strings

        r''  u''  b''
        r''' '''  u''' '''  b''' '''
        """
        #log('BEF self.cur_token %s', self.cur_token)
        if left_id == Id.Left_RSingleQuote:
            lexer_mode = lex_mode_e.SQ_Raw
            triple_left_id = Id.Left_RTSingleQuote
        elif left_id == Id.Left_USingleQuote:
            lexer_mode = lex_mode_e.J8_Str
            triple_left_id = Id.Left_UTSingleQuote
        elif left_id == Id.Left_BSingleQuote:
            lexer_mode = lex_mode_e.J8_Str
            triple_left_id = Id.Left_BTSingleQuote
        else:
            raise AssertionError(left_id)

        # Needed for syntax checks
        left_tok = self.cur_token
        left_tok.id = left_id

        sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)

        if (len(sq_part.sval) == 0 and self.lexer.ByteLookAhead() == "'"):
            self._SetNext(lex_mode_e.ShCommand)
            self._GetToken()

            assert self.token_type == Id.Left_SingleQuote
            # HACK: magically transform the third ' in u''' to
            # Id.Left_UTSingleQuote, so that ''' is the terminator
            left_tok = self.cur_token
            left_tok.id = triple_left_id

            # Handles stripping leading whitespace
            sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)

        # Advance and validate
        self._SetNext(lex_mode_e.ShCommand)

        self._GetToken()
        if self.token_kind not in KINDS_THAT_END_WORDS:
            p_die('Unexpected token after YSH single-quoted string',
                  self.cur_token)

        return CompoundWord([sq_part])

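    # The empty-string-plus-lookahead trick above is how r''' u''' b''' are
    # recognized (a sketch of the control flow, not a spec): r'' first lexes
    # as an EMPTY raw string; if the very next byte is another single quote,
    # the third quote's token id is rewritten to the triple-quote variant, so
    # that ''' becomes the terminator.  Roughly:
    #
    #   u'''foo'''  ->  u'' (empty)  +  ' rewritten to Left_UTSingleQuote
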
    def _ReadUnquotedLeftParts(self, triple_out):
        # type: (Optional[BoolParamBox]) -> word_part_t
        """Read substitutions and quoted strings (for lex_mode_e.ShCommand).

        If triple_out is set, then we try parsing triple quoted strings,
        and set its value to True if we got one.
        """
        if self.token_type in (Id.Left_DoubleQuote, Id.Left_DollarDoubleQuote):
            # Note: $"" is a synonym for "".  It might make sense if it added
            # \n \0 \x00 \u{123} etc.  But that's not what bash does!
            dq_part = self._ReadDoubleQuoted(self.cur_token)
            # Got empty word "" and there's a " after
            if (triple_out and len(dq_part.parts) == 0 and
                    self.lexer.ByteLookAhead() == '"'):

                self._SetNext(lex_mode_e.ShCommand)
                self._GetToken()
                # HACK: magically transform the third " in """ to
                # Id.Left_TDoubleQuote, so that """ is the terminator
                left_dq_token = self.cur_token
                left_dq_token.id = Id.Left_TDoubleQuote
                triple_out.b = True  # let caller know we got it
                return self._ReadDoubleQuoted(left_dq_token)

            return dq_part

        if self.token_type in (Id.Left_SingleQuote, Id.Left_RSingleQuote,
                               Id.Left_DollarSingleQuote):
            if self.token_type == Id.Left_SingleQuote:
                lexer_mode = lex_mode_e.SQ_Raw
                triple_left_id = Id.Left_TSingleQuote
            elif self.token_type == Id.Left_RSingleQuote:
                lexer_mode = lex_mode_e.SQ_Raw
                triple_left_id = Id.Left_RTSingleQuote
            else:
                lexer_mode = lex_mode_e.SQ_C
                # there is no such thing as $'''
                triple_left_id = Id.Undefined_Tok

            sq_part = self._ReadSingleQuoted(self.cur_token, lexer_mode)

            # Got empty '' or r'' and there's a ' after
            # u'' and b'' are handled in _ReadYshSingleQuoted
            if (triple_left_id != Id.Undefined_Tok and
                    triple_out is not None and len(sq_part.sval) == 0 and
                    self.lexer.ByteLookAhead() == "'"):

                self._SetNext(lex_mode_e.ShCommand)
                self._GetToken()

                # HACK: magically transform the third ' in ''' to
                # Id.Left_TSingleQuote, so that ''' is the terminator
                left_sq_token = self.cur_token
                left_sq_token.id = triple_left_id

                triple_out.b = True  # let caller know we got it
                return self._ReadSingleQuoted(left_sq_token, lexer_mode)

            return sq_part

        if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick,
                               Id.Left_ProcSubIn, Id.Left_ProcSubOut):
            return self._ReadCommandSub(self.token_type, d_quoted=False)

        if self.token_type == Id.Left_DollarBrace:
            return self._ReadBracedVarSub(self.cur_token, d_quoted=False)

        if self.token_type == Id.Left_DollarDParen:
            return self._ReadArithSub()

        if self.token_type == Id.Left_DollarBracket:
            return self._ReadExprSub(lex_mode_e.ShCommand)

        if self.token_type == Id.Left_DollarBraceZsh:
            return self._ReadZshVarSub(self.cur_token)

        raise AssertionError(self.cur_token)

    def _ReadExtGlob(self):
        # type: () -> word_part.ExtGlob
        """
        Grammar:
        Item    = CompoundWord | EPSILON  # important: @(foo|) is allowed
        LEFT    = '@(' | '*(' | '+(' | '?(' | '!('
        RIGHT   = ')'
        ExtGlob = LEFT (Item '|')* Item RIGHT  # Item may be empty
        Compound includes ExtGlob
        """
        left_token = self.cur_token
        right_token = None  # type: Token
        arms = []  # type: List[CompoundWord]

        self.lexer.PushHint(Id.Op_RParen, Id.Right_ExtGlob)
        self._SetNext(lex_mode_e.ExtGlob)  # advance past LEFT
        read_word = False  # did we just read a word?  To handle @(||).

        while True:
            self._GetToken()

            if self.token_type == Id.Right_ExtGlob:
                if not read_word:
                    arms.append(CompoundWord([]))
                right_token = self.cur_token
                break

            elif self.token_type == Id.Op_Pipe:
                if not read_word:
                    arms.append(CompoundWord([]))
                read_word = False
                self._SetNext(lex_mode_e.ExtGlob)

            # lex_mode_e.ExtGlob should only produce these 4 kinds of tokens
            elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub,
                                     Kind.ExtGlob):
                w = self._ReadCompoundWord(lex_mode_e.ExtGlob)
                arms.append(w)
                read_word = True

            elif self.token_kind == Kind.Eof:
                p_die('Unexpected EOF reading extended glob that began here',
                      left_token)

            else:
                raise AssertionError(self.cur_token)

        return word_part.ExtGlob(left_token, arms, right_token)

    def _ReadBashRegexGroup(self):
        # type: () -> word_part.BashRegexGroup
        """
        Grammar:
        BashRegexGroup = '(' WORD? ')'
1011 """
1012 left_token = self.cur_token
1013 assert left_token.id == Id.BashRegex_LParen, left_token
1014
1015 arms = [] # type: List[CompoundWord]
1016
1017 self.lexer.PushHint(Id.Op_RParen, Id.Right_BashRegexGroup)
1018 self._SetNext(lex_mode_e.BashRegexFakeInner) # advance past LEFT
1019
1020 self._GetToken()
1021 if self.token_type == Id.Right_BashRegexGroup: # empty ()
1022 return word_part.BashRegexGroup(left_token, None, self.cur_token)
1023
1024 # lex_mode_e.BashRegex should only produce these 4 kinds of tokens
1025 if self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub, Kind.BashRegex):
1026 # Fake lexer mode that translates Id.WS_Space to Id.Lit_Chars
1027 # To allow bash style [[ s =~ (a b) ]]
1028 w = self._ReadCompoundWord(lex_mode_e.BashRegexFakeInner)
1029 arms.append(w)
1030
1031 self._GetToken()
1032 if self.token_type != Id.Right_BashRegexGroup:
1033 p_die('Expected ) to close bash regex group', self.cur_token)
1034
1035 return word_part.BashRegexGroup(left_token, w, self.cur_token)
1036
1037 p_die('Expected word after ( opening bash regex group', self.cur_token)
1038
    def _ReadLikeDQ(self, left_token, is_ysh_expr, out_parts):
        # type: (Optional[Token], bool, List[word_part_t]) -> None
        """
        Args:
          left_token: A token if we are reading a double quoted part, or None
            if we're reading a here doc.
          is_ysh_expr: Whether to disallow backticks and invalid char escapes
          out_parts: list of word_part to append to
        """
        if left_token:
            if left_token.id in (Id.Left_TDoubleQuote,
                                 Id.Left_DollarTDoubleQuote):
                expected_end_tokens = 3
            else:
                expected_end_tokens = 1
        else:
            expected_end_tokens = 1000  # here doc will break

        num_end_tokens = 0
        while num_end_tokens < expected_end_tokens:
            self._SetNext(lex_mode_e.DQ)
            self._GetToken()

            if self.token_kind == Kind.Lit:
                if self.token_type == Id.Lit_EscapedChar:
                    tok = self.cur_token
                    ch = lexer.TokenSliceLeft(tok, 1)
                    part = word_part.EscapedLiteral(tok,
                                                    ch)  # type: word_part_t
                else:
                    if self.token_type == Id.Lit_BadBackslash:
                        # echo "\z" is OK in shell, but 'x = "\z" is a syntax
                        # error in YSH.
                        # Slight hole: We don't catch 'x = ${undef:-"\z"}
                        # because of the recursion (unless parse_backslash)
                        if (is_ysh_expr or
                                not self.parse_opts.parse_backslash()):
                            p_die(
                                "Invalid char escape in double quoted string (OILS-ERR-12)",
                                self.cur_token)
                    elif self.token_type == Id.Lit_Dollar:
                        if is_ysh_expr or not self.parse_opts.parse_dollar():
                            p_die("Literal $ should be quoted like \$",
                                  self.cur_token)

                    part = self.cur_token
                out_parts.append(part)

            elif self.token_kind == Kind.Left:
                if self.token_type == Id.Left_Backtick and is_ysh_expr:
                    p_die("Invalid backtick: use $(cmd) or \\` in YSH strings",
                          self.cur_token)

                part = self._ReadDoubleQuotedLeftParts()
                out_parts.append(part)

            elif self.token_kind == Kind.VSub:
                tok = self.cur_token
                part = SimpleVarSub(tok)
                out_parts.append(part)
                # NOTE: parsing "$f(x)" would BREAK CODE.  Could add support
                # for it later.

            elif self.token_kind == Kind.Right:
                assert self.token_type == Id.Right_DoubleQuote, self.token_type
                if left_token:
                    num_end_tokens += 1

                # In a here doc, the right quote is literal!
                out_parts.append(self.cur_token)

            elif self.token_kind == Kind.Eof:
                if left_token:
                    p_die(
                        'Unexpected EOF reading double-quoted string that began here',
                        left_token)
                else:  # here docs will have an EOF in their token stream
                    break

            else:
                raise AssertionError(self.cur_token)

            if self.token_kind != Kind.Right:
                num_end_tokens = 0  # """ must be CONSECUTIVE

        if expected_end_tokens == 1:
            out_parts.pop()
        elif expected_end_tokens == 3:
            out_parts.pop()
            out_parts.pop()
            out_parts.pop()

        # Remove space from """ in both expression mode and command mode
        if (left_token and left_token.id
                in (Id.Left_TDoubleQuote, Id.Left_DollarTDoubleQuote)):
            word_compile.RemoveLeadingSpaceDQ(out_parts)

        # Return nothing, since we appended to 'out_parts'

    def _ReadDoubleQuoted(self, left_token):
        # type: (Token) -> DoubleQuoted
        """Helper function for "hello $name".

        Also handles ${foo%%a b c}  # treated as double quoted until you hit }
        """
        parts = []  # type: List[word_part_t]
        self._ReadLikeDQ(left_token, False, parts)

        right_quote = self.cur_token
        return DoubleQuoted(left_token, parts, right_quote)

    def ReadDoubleQuoted(self, left_token, parts):
        # type: (Token, List[word_part_t]) -> Token
        """For expression mode.

        Read var x = "${dir:-}/$name"; etc.
        """
        self._ReadLikeDQ(left_token, True, parts)
        return self.cur_token

    def _ReadCommandSub(self, left_id, d_quoted=False):
        # type: (Id_t, bool) -> CommandSub
        """
        NOTE: This is not in the grammar, because word parts aren't in the
        grammar!

        command_sub = '$(' command_list ')'
                    | '@(' command_list ')'
                    | '<(' command_list ')'
                    | '>(' command_list ')'
                    | ` command_list `
        """
        left_token = self.cur_token

        # Set the lexer in a state so ) becomes the EOF token.
        if left_id in (Id.Left_DollarParen, Id.Left_AtParen,
                       Id.Left_ProcSubIn, Id.Left_ProcSubOut):
            self._SetNext(lex_mode_e.ShCommand)  # advance past $( etc.

            right_id = Id.Eof_RParen
            self.lexer.PushHint(Id.Op_RParen, right_id)
            c_parser = self.parse_ctx.MakeParserForCommandSub(
                self.line_reader, self.lexer, right_id)
            # NOTE: This doesn't use something like main_loop because we
            # don't want to interleave parsing and execution!  Unlike 'source'
            # and 'eval'.
            node = c_parser.ParseCommandSub()

            right_token = c_parser.w_parser.cur_token

        elif left_id == Id.Left_Backtick and self.parse_ctx.do_lossless:
            # NOTE: This is an APPROXIMATE solution for translation ONLY.
            # See test/osh2oil.

            right_id = Id.Eof_Backtick
            self.lexer.PushHint(Id.Left_Backtick, right_id)
            c_parser = self.parse_ctx.MakeParserForCommandSub(
                self.line_reader, self.lexer, right_id)
            node = c_parser.ParseCommandSub()
            right_token = c_parser.w_parser.cur_token

        elif left_id == Id.Left_Backtick:
            if not self.parse_opts.parse_backticks():
                p_die('Use $(cmd) instead of backticks (parse_backticks)',
                      left_token)

            self._SetNext(lex_mode_e.Backtick)  # advance past `

            parts = []  # type: List[str]
            while True:
                self._GetToken()
                #log("TOK %s", self.cur_token)

                if self.token_type == Id.Backtick_Quoted:
                    # Remove leading \
                    parts.append(lexer.TokenSliceLeft(self.cur_token, 1))

                elif self.token_type == Id.Backtick_DoubleQuote:
                    # Compatibility: If backticks are double quoted, then
                    # double quotes within them have to be \"
                    # Shells aren't smart enough to match nested " and `
                    # quotes (but OSH is)
                    if d_quoted:
                        # Remove leading \
                        parts.append(lexer.TokenSliceLeft(self.cur_token, 1))
                    else:
                        parts.append(lexer.TokenVal(self.cur_token))

                elif self.token_type == Id.Backtick_Other:
                    parts.append(lexer.TokenVal(self.cur_token))

                elif self.token_type == Id.Backtick_Right:
                    break

                elif self.token_type == Id.Eof_Real:
                    # Note: this parse error is in the ORIGINAL context.  No
                    # code_str yet.
                    p_die('Unexpected EOF while looking for closing backtick',
                          left_token)

                else:
                    raise AssertionError(self.cur_token)

                self._SetNext(lex_mode_e.Backtick)

            # Calculate right SPID on CommandSub BEFORE re-parsing.
            right_token = self.cur_token

            code_str = ''.join(parts)
            #log('code %r', code_str)

            # NOTE: This is similar to how we parse aliases in
            # osh/cmd_parse.py.  It won't have the same location info as
            # MakeParserForCommandSub(), because the lexer is different.
            arena = self.parse_ctx.arena
            #arena = alloc.Arena()
            line_reader = reader.StringLineReader(code_str, arena)
            c_parser = self.parse_ctx.MakeOshParser(line_reader)
            src = source.Reparsed('backticks', left_token, right_token)
            with alloc.ctx_SourceCode(arena, src):
                node = c_parser.ParseCommandSub()

        else:
            raise AssertionError(left_id)

        return CommandSub(left_token, node, right_token)

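    # Note on the backtick branch above: the bytes between ` and ` are
    # collected as plain strings (with the leading \ of \` and, when double
    # quoted, \" removed), joined, and re-parsed with a StringLineReader
    # under a source.Reparsed span.  So `echo \`date\`` is handled in two
    # passes, unlike $(echo $(date)), which nests directly in one pass.
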
    def _ReadExprSub(self, lex_mode):
        # type: (lex_mode_t) -> word_part.ExprSub
        """$[d->key]  $[obj.method()]  etc."""
        left_token = self.cur_token

        self._SetNext(lex_mode_e.Expr)
        enode, right_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.ysh_expr_sub)

        self._SetNext(lex_mode)  # Move past ]
        return word_part.ExprSub(left_token, enode, right_token)

    def ParseVarDecl(self, kw_token):
        # type: (Token) -> VarDecl
        """
        oil_var_decl: name_type_list '=' testlist end_stmt

        Note that assignments must end with \n ; } or EOF.  Unlike shell
        assignments, we disallow:

        var x = 42 | wc -l
        var x = 42 && echo hi
        """
        self._SetNext(lex_mode_e.Expr)
        enode, last_token = self.parse_ctx.ParseVarDecl(kw_token, self.lexer)
        # Hack to move } from what the Expr lexer modes gives to what
        # CommandParser wants
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace

        # Let the CommandParser see the Op_Semi or Op_Newline.
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)  # always back to this
        return enode

    def ParseMutation(self, kw_token, var_checker):
        # type: (Token, VarChecker) -> Mutation
        """
        setvar i = 42
        setvar i += 1
        setvar a[i] = 42
        setvar a[i] += 1
        setvar d.key = 42
        setvar d.key += 1
        """
        self._SetNext(lex_mode_e.Expr)
        enode, last_token = self.parse_ctx.ParseMutation(kw_token, self.lexer)
        # Hack to move } from what the Expr lexer modes gives to what
        # CommandParser wants
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace

        for lhs in enode.lhs:
            UP_lhs = lhs
            with tagswitch(lhs) as case:
                if case(y_lhs_e.Var):
                    lhs = cast(Token, UP_lhs)
                    var_checker.Check(kw_token.id, lexer.LazyStr(lhs), lhs)

                # Note: this does not cover cases like
                #   setvar (a[0])[1] = v
                #   setvar (d.key).other = v
                # This leaks into catching all typos statically, which may be
                # possible if 'use' makes all names explicit.
                elif case(y_lhs_e.Subscript):
                    lhs = cast(Subscript, UP_lhs)
                    if lhs.obj.tag() == expr_e.Var:
                        v = cast(expr.Var, lhs.obj)
                        var_checker.Check(kw_token.id, v.name, v.left)

                elif case(y_lhs_e.Attribute):
                    lhs = cast(Attribute, UP_lhs)
                    if lhs.obj.tag() == expr_e.Var:
                        v = cast(expr.Var, lhs.obj)
                        var_checker.Check(kw_token.id, v.name, v.left)

        # Let the CommandParser see the Op_Semi or Op_Newline.
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)  # always back to this
        return enode

    def ParseBareDecl(self):
        # type: () -> expr_t
        """
        x = {name: val}
        """
        self._SetNext(lex_mode_e.Expr)
        self._GetToken()
        enode, last_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.command_expr)
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)
        return enode

    def ParseYshExprForCommand(self):
        # type: () -> expr_t

        # Fudge for this case
        #   for x in(y) {
        # versus
        #   for x in (y) {
        #
        # In the former case, ReadWord on 'in' puts the lexer past (.
        # Also see LookPastSpace in CommandParser.
        # A simpler solution would be nicer.

        if self.token_type == Id.Op_LParen:
            self.lexer.MaybeUnreadOne()

        enode, _ = self.parse_ctx.ParseYshExpr(self.lexer,
                                               grammar_nt.ysh_expr)

        self._SetNext(lex_mode_e.ShCommand)
        return enode

    def ParseCommandExpr(self):
        # type: () -> expr_t
        """
        = 1+2
        """
        enode, last_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.command_expr)

        # In some cases, such as the case statement, we expect *the lexer* to
        # be pointing at the token right after the expression.  But the
        # expression parser must have read to the `last_token`.  Unreading
        # places the lexer back in the expected state.  Ie:
        #
        # case (x) {                           case (x) {
        #   (else) { = x }                       (else) { = x }
        #          ^ The lexer is here                 ^ Unread to here
        # }                                    }
        assert last_token.id in (Id.Op_Newline, Id.Eof_Real, Id.Op_Semi,
                                 Id.Op_RBrace), last_token
        if last_token.id != Id.Eof_Real:
            # Eof_Real is the only token we cannot unread
            self.lexer.MaybeUnreadOne()

        return enode

    def ParseProc(self, node):
        # type: (Proc) -> None

        # proc name-with-hyphens() must be accepted
        self._SetNext(lex_mode_e.ShCommand)
        self._GetToken()
        # example: 'proc f[' gets you Lit_ArrayLhsOpen
        if self.token_type != Id.Lit_Chars:
            p_die('Invalid proc name %s' % ui.PrettyToken(self.cur_token),
                  self.cur_token)

        # TODO: validate this more.  Disallow proc 123 { }, which isn't
        # disallowed for shell functions.  Similar to IsValidVarName().
        node.name = self.cur_token

        last_token = self.parse_ctx.ParseProc(self.lexer, node)

        # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
        assert last_token.id == Id.Op_LBrace
        last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        self._SetNext(lex_mode_e.ShCommand)

    def ParseFunc(self, node):
        # type: (Func) -> None
        last_token = self.parse_ctx.ParseFunc(self.lexer, node)

        # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
        assert last_token.id == Id.Op_LBrace
        last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        self._SetNext(lex_mode_e.ShCommand)

    def ParseYshCasePattern(self):
        # type: () -> Tuple[pat_t, Token]
        pat, left_tok, last_token = self.parse_ctx.ParseYshCasePattern(
            self.lexer)

        if last_token.id == Id.Op_LBrace:
            last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        return pat, left_tok

    def NewlineOkForYshCase(self):
        # type: () -> Id_t
        """Check for optional newline and consume it.

        This is a special case of _NewlineOk which fixes some "off-by-one"
        issues that crop up while parsing YSH case arms.  For more details,
        see #oil-dev > Progress On YSH Case Grammar on Zulip.

        Returns a token id which is filled with the choice of

        word     { echo word }
        (3)      { echo expr }
        /e/      { echo eggex }
        }        # right brace
        """
        while True:
            next_id = self.lexer.LookAheadOne(lex_mode_e.Expr)

            # Cannot lookahead past lines
            if next_id == Id.Unknown_Tok:
                if not self.lexer.MoveToNextLine():  # Try to move to next line
                    break  # EOF
                continue

            next_kind = consts.GetKind(next_id)
            if next_id != Id.Op_Newline and next_kind != Kind.Ignored:
                break

            self.lexer.Read(lex_mode_e.Expr)

        if next_id in (Id.Op_RBrace, Id.Op_LParen, Id.Arith_Slash):
            self._SetNext(lex_mode_e.Expr)  # Continue in expression mode
        else:
            # Consume the trailing Op_Newline
            self._SetNext(lex_mode_e.ShCommand)
            self._GetToken()

        return next_id

    def _ReadArithExpr(self, end_id):
        # type: (Id_t) -> arith_expr_t
        """Read and parse an arithmetic expression in various contexts.

        $(( 1+2 ))
        (( a=1+2 ))
        ${a[ 1+2 ]}
        ${a : 1+2 : 1+2}

        See tests/arith-context.test.sh for ambiguous cases.

        ${a[a[0]]} is valid  # VS_RBRACKET vs Id.Arith_RBracket

        ${s : a<b?0:1 : 1}   # VS_COLON vs Id.Arith_Colon

        See the assertion in ArithParser.Parse() -- unexpected extra input.
        """
        # calls self.ReadWord(lex_mode_e.Arith)
        anode = self.a_parser.Parse()
        cur_id = self.a_parser.CurrentId()
        if end_id != Id.Undefined_Tok and cur_id != end_id:
            p_die(
                'Unexpected token after arithmetic expression (%s != %s)' %
                (ui.PrettyId(cur_id), ui.PrettyId(end_id)),
                loc.Word(self.a_parser.cur_word))
        return anode

    def _ReadArithSub(self):
        # type: () -> word_part.ArithSub
        """Read an arith substitution, which contains an arith expression,
        e.g.

        $((a + 1)).
        """
        left_tok = self.cur_token

        # The second one needs to be disambiguated in stuff like:
        #   $(echo $(( 1+2 )) )
        self.lexer.PushHint(Id.Op_RParen, Id.Right_DollarDParen)

        # NOTE: To disambiguate $(( as arith sub vs. command sub and subshell,
        # we could save the lexer/reader state here, and retry if the
        # arithmetic parse fails.  But we can almost always catch this at
        # parse time.  There could be some exceptions like:
        #   $((echo * foo))  # looks like multiplication
        #   $((echo / foo))  # looks like division

        # $(( )) is valid
        anode = arith_expr.EmptyZero  # type: arith_expr_t

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            anode = self._ReadArithExpr(Id.Arith_RParen)

        self._SetNext(lex_mode_e.ShCommand)

        # Ensure we get closing )
        self._GetToken()
        if self.token_type != Id.Right_DollarDParen:
            p_die('Expected second ) to end arith sub', self.cur_token)

        right_tok = self.cur_token
        return word_part.ArithSub(left_tok, anode, right_tok)

    def ReadDParen(self):
        # type: () -> Tuple[arith_expr_t, Token]
        """Read ((1+ 2))  -- command context.

        We're using the word parser because it's very similar to
        _ReadArithExpr above.

        This also returns the terminating Id.Op_DRightParen token for
        location info.
        """
        # (( )) is valid
        anode = arith_expr.EmptyZero  # type: arith_expr_t

        self.lexer.PushHint(Id.Op_RParen, Id.Op_DRightParen)

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            anode = self._ReadArithExpr(Id.Arith_RParen)

        self._SetNext(lex_mode_e.ShCommand)

        # Ensure we get the second )
        self._GetToken()
        right = self.cur_token
        if right.id != Id.Op_DRightParen:
            p_die('Expected second ) to end arith statement', right)

        self._SetNext(lex_mode_e.ShCommand)

        return anode, right

    def _NextNonSpace(self):
        # type: () -> None
        """Advance in lex_mode_e.Arith until non-space token.

        Same logic as _ReadWord, but used in
           $(( ))
           (( ))
           for (( ))

        You can read self.token_type after this, without calling _GetToken.
        """
        while True:
            self._SetNext(lex_mode_e.Arith)
            self._GetToken()
            if self.token_kind not in (Kind.Ignored, Kind.WS):
                break

    def ReadForExpression(self):
        # type: () -> command.ForExpr
        """Read ((i=0; i<5; ++i)) -- part of command context."""
        self._NextNonSpace()  # skip over ((
        cur_id = self.token_type  # for end of arith expressions

        if cur_id == Id.Arith_Semi:  # for (( ; i < 10; i++ ))
            init_node = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            init_node = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()
        self._NextNonSpace()

        # It's odd to keep track of both cur_id and self.token_type in this
        # function, but it works, and is tested in 'test/parse_error.sh
        # arith-integration'
        if cur_id != Id.Arith_Semi:  # for (( x=0 b; ... ))
            p_die("Expected ; here", loc.Word(self.a_parser.cur_word))

        self._GetToken()
        cur_id = self.token_type

        if cur_id == Id.Arith_Semi:  # for (( ; ; i++ ))
            # empty condition is TRUE
            cond_node = arith_expr.EmptyOne  # type: arith_expr_t
        else:
            cond_node = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()

        if cur_id != Id.Arith_Semi:  # for (( x=0; x<5 b ))
            p_die("Expected ; here", loc.Word(self.a_parser.cur_word))

        self._NextNonSpace()
        if self.token_type == Id.Arith_RParen:  # for (( ; ; ))
            update_node = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            update_node = self._ReadArithExpr(Id.Arith_RParen)

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            p_die('Expected ) to end for loop expression', self.cur_token)
        self._SetNext(lex_mode_e.ShCommand)

        # redirects is None, will be assigned in CommandEvaluator
        node = command.ForExpr.CreateNull()
        node.init = init_node
        node.cond = cond_node
        node.update = update_node
        return node

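    # Defaults when parts of the C-style header are omitted, per the code
    # above:
    #
    #   for (( ; i < 3; ))  ->  init = EmptyZero, update = EmptyZero
    #   for (( ; ; ))       ->  cond = EmptyOne; an empty condition is TRUE
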
1654 def _ReadArrayLiteral(self):
1655 # type: () -> word_part_t
1656 """a=(1 2 3)
1657
1658 TODO: See osh/cmd_parse.py:164 for Id.Lit_ArrayLhsOpen, for a[x++]=1
1659
1660 We want:
1661
1662 A=(['x']=1 ["x"]=2 [$x$y]=3)
1663
1664 Maybe allow this as a literal string? Because I think I've seen it before?
1665 Or maybe force people to patch to learn the rule.
1666
1667 A=([x]=4)
1668
1669 Starts with Lit_Other '[', and then it has Lit_ArrayLhsClose
1670 Maybe enforce that ALL have keys or NONE of have keys.
1671 """
1672 self._SetNext(lex_mode_e.ShCommand) # advance past (
1673 self._GetToken()
1674 if self.cur_token.id != Id.Op_LParen:
1675 p_die('Expected ( after =', self.cur_token)
1676 left_token = self.cur_token
1677 right_token = None # type: Token
1678
1679 # MUST use a new word parser (with same lexer).
1680 w_parser = self.parse_ctx.MakeWordParser(self.lexer, self.line_reader)
1681 words = [] # type: List[CompoundWord]
1682 done = False
1683 while not done:
1684 w = w_parser.ReadWord(lex_mode_e.ShCommand)
1685 with tagswitch(w) as case:
1686 if case(word_e.Operator):
1687 tok = cast(Token, w)
1688 if tok.id == Id.Right_Initializer:
1689 right_token = tok
1690 done = True # can't use break here
1691 # Unlike command parsing, array parsing allows embedded \n.
1692 elif tok.id == Id.Op_Newline:
1693 continue
1694 else:
1695 p_die('Unexpected token in array literal', loc.Word(w))
1696
1697 elif case(word_e.Compound):
1698 words.append(cast(CompoundWord, w))
1699
1700 else:
1701 raise AssertionError()
1702
1703 initializer_words = [] # type: List[InitializerWord_t]
1704 for w in words:
1705 pair = word_.DetectAssocPair(w)
1706 if pair is not None:
1707 word_.TildeDetectAssign(pair.value) # pair.value is modified
1708 initializer_words.append(pair)
1709 else:
1710 w2 = braces.BraceDetect(w) # type: word_t
1711 if w2 is None:
1712 w2 = w
1713 w3 = word_.TildeDetect(w2) # type: word_t
1714 if w3 is None:
1715 w3 = w2
1716 initializer_words.append(InitializerWord.ArrayWord(w3))
1717
1718 # invariant List?
1719 return word_part.InitializerLiteral(left_token, initializer_words,
1720 right_token)
1721
1722 def ParseProcCallArgs(self, start_symbol):
1723 # type: (int) -> ArgList
1724 """ json write (x) """
1725 self.lexer.MaybeUnreadOne()
1726
1727 arg_list = ArgList.CreateNull(alloc_lists=True)
1728 arg_list.left = self.cur_token
1729 self.parse_ctx.ParseProcCallArgs(self.lexer, arg_list, start_symbol)
1730 return arg_list
1731
1732 def _MaybeReadWordPart(self, is_first, lex_mode, parts):
1733 # type: (bool, lex_mode_t, List[word_part_t]) -> bool
1734 """Helper for _ReadCompoundWord3."""
1735 done = False
1736
1737 if self.token_type == Id.Lit_EscapedChar:
1738 tok = self.cur_token
1739 assert tok.length == 2
1740 ch = lexer.TokenSliceLeft(tok, 1)
1741 if not self.parse_opts.parse_backslash():
1742 if not pyutil.IsValidCharEscape(ch):
1743 p_die('Invalid char escape in unquoted word (OILS-ERR-13)',
1744 self.cur_token)
1745
1746 part = word_part.EscapedLiteral(self.cur_token,
1747 ch) # type: word_part_t
1748 else:
1749 part = self.cur_token
1750
1751 if is_first and self.token_type == Id.Lit_VarLike: # foo=
1752 parts.append(part)
1753 # Unfortunately it's awkward to pull the check for a=(1 2) up to
1754 # _ReadWord.
1755 next_id = self.lexer.LookPastSpace(lex_mode)
1756 if next_id == Id.Op_LParen:
                self.lexer.PushHint(Id.Op_RParen, Id.Right_Initializer)
                part2 = self._ReadArrayLiteral()
                parts.append(part2)

                # Array literal must be the last part of the word.
                self._SetNext(lex_mode)
                self._GetToken()
                # EOF, whitespace, newline, Right_Subshell
                if self.token_kind not in KINDS_THAT_END_WORDS:
                    p_die('Unexpected token after array literal',
                          self.cur_token)
                done = True

        elif (is_first and self.parse_opts.parse_at() and
              self.token_type == Id.Lit_Splice):

            splice_tok = self.cur_token
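            # A Lit_Splice token is @name; slicing at offset 1 strips the
            # leading @, leaving just the variable name.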
            part2 = word_part.Splice(splice_tok,
                                     lexer.TokenSliceLeft(splice_tok, 1))

            parts.append(part2)

            # @words must be the last part of the word
            self._SetNext(lex_mode)
            self._GetToken()
            # EOF, whitespace, newline, Right_Subshell
            if self.token_kind not in KINDS_THAT_END_WORDS:
                p_die('Unexpected token after array splice', self.cur_token)
            done = True

        elif (is_first and self.parse_opts.parse_at() and
              self.token_type == Id.Lit_AtLBracket):  # @[split(x)]
            part2 = self._ReadExprSub(lex_mode_e.DQ)
            parts.append(part2)

            # @[split(x)]
            self._SetNext(lex_mode)
            self._GetToken()
            # EOF, whitespace, newline, Right_Subshell
            if self.token_kind not in KINDS_THAT_END_WORDS:
                p_die('Unexpected token after Expr splice', self.cur_token)
            done = True

        elif (is_first and self.parse_opts.parse_at() and
              self.token_type == Id.Lit_AtLBraceDot):
            p_die('TODO: @{.myproc builtin sub}', self.cur_token)

        elif (is_first and self.parse_opts.parse_at_all() and
              self.token_type == Id.Lit_At):
            # Because $[x] ${x} and perhaps $/x/ are reserved, it makes sense
            # for @ at the beginning of a word to be reserved.

            # Although, should we relax 'echo @'?  I'm tempted to have a
            # shortcut for @_argv.
            p_die('Literal @ starting a word must be quoted (parse_at_all)',
                  self.cur_token)

        else:
            # not a literal with lookahead; append it
            parts.append(part)

        return done

    def _ReadCompoundWord(self, lex_mode):
        # type: (lex_mode_t) -> CompoundWord
        return self._ReadCompoundWord3(lex_mode, Id.Undefined_Tok, True)

    def _ReadCompoundWord3(self, lex_mode, eof_type, empty_ok):
        # type: (lex_mode_t, Id_t, bool) -> CompoundWord
        """
        Precondition: Looking at the first token of the first word part
        Postcondition: Looking at the token after, e.g. space or operator

        NOTE: eof_type is necessary because / is a literal, i.e. Lit_Slash, but it
        could be an operator delimiting a compound word.  Can we change lexer modes
        and remove this special case?
        """
        w = CompoundWord([])
        num_parts = 0
        brace_count = 0
        done = False
        is_triple_quoted = None  # type: Optional[BoolParamBox]

        while not done:
            self._GetToken()

            allow_done = empty_ok or num_parts != 0
            if allow_done and self.token_type == eof_type:
                done = True  # e.g. for ${foo//pat/replace}

            # Keywords like "for" are treated like literals
            elif self.token_kind in (Kind.Lit, Kind.History, Kind.KW,
                                     Kind.ControlFlow, Kind.BoolUnary,
                                     Kind.BoolBinary):

                # Syntax error for { and }
                if self.token_type == Id.Lit_LBrace:
                    brace_count += 1
                elif self.token_type == Id.Lit_RBrace:
                    brace_count -= 1
                elif self.token_type == Id.Lit_Dollar:
                    if not self.parse_opts.parse_dollar():
                        if num_parts == 0 and lex_mode == lex_mode_e.ShCommand:
                            next_byte = self.lexer.ByteLookAhead()
                            # TODO: switch lexer modes and parse $/d+/.  But not ${a:-$/d+/}
                            if next_byte == '/':
                                #log('next_byte %r', next_byte)
                                pass

                        p_die('Literal $ should be quoted like \$',
                              self.cur_token)

                done = self._MaybeReadWordPart(num_parts == 0, lex_mode,
                                               w.parts)

            elif self.token_kind == Kind.VSub:
                vsub_token = self.cur_token

                part = SimpleVarSub(vsub_token)  # type: word_part_t
                w.parts.append(part)

            elif self.token_kind == Kind.ExtGlob:
                # If parse_at, we can take over @( to start @(seq 3)
                # Users can also use ,(*.py|*.sh)
                if (self.parse_opts.parse_at() and
                        self.token_type == Id.ExtGlob_At and num_parts == 0):
                    cs_part = self._ReadCommandSub(Id.Left_AtParen,
                                                   d_quoted=False)
                    # RARE mutation of tok.id!
                    cs_part.left_token.id = Id.Left_AtParen
                    part = cs_part  # for type safety

                    # Same check as _MaybeReadWordPart.  @(seq 3)x is illegal, just like
                    # a=(one two)x and @arrayfunc(3)x.
                    self._GetToken()
                    if self.token_kind not in KINDS_THAT_END_WORDS:
                        p_die('Unexpected token after @()', self.cur_token)
                    done = True

                else:
                    if HAVE_FNM_EXTMATCH == 0:
                        p_die(
                            "Extended glob won't work without FNM_EXTMATCH support in libc",
                            self.cur_token)
                    part = self._ReadExtGlob()
                w.parts.append(part)

            elif self.token_kind == Kind.BashRegex:
                if self.token_type == Id.BashRegex_LParen:  # Opening (
                    part = self._ReadBashRegexGroup()
                    w.parts.append(part)
                else:
                    assert self.token_type == Id.BashRegex_AllowedInParens
                    p_die('Invalid token in bash regex', self.cur_token)

            elif self.token_kind == Kind.Left:
                try_triple_quote = (self.parse_opts.parse_triple_quote() and
                                    lex_mode == lex_mode_e.ShCommand and
                                    num_parts == 0)

                # Save allocation
                if try_triple_quote:
                    is_triple_quoted = BoolParamBox(False)

                part = self._ReadUnquotedLeftParts(is_triple_quoted)
                w.parts.append(part)

            # NOT done yet, will advance below
            elif self.token_kind == Kind.Right:
                # Still part of the word; will be done on the next iter.
                if self.token_type == Id.Right_DoubleQuote:
                    pass
                # Never happens, no PushHint for this case.
                #elif self.token_type == Id.Right_DollarParen:
                #    pass
                elif self.token_type == Id.Right_Subshell:
                    # LEXER HACK for (case x in x) ;; esac )
                    # Rewind before it's used
                    assert self.next_lex_mode == lex_mode_e.Undefined
                    if self.lexer.MaybeUnreadOne():
                        self.lexer.PushHint(Id.Op_RParen, Id.Right_Subshell)
                        self._SetNext(lex_mode)
                    done = True
                else:
                    done = True

            elif self.token_kind == Kind.Ignored:
                done = True

            else:
                # LEXER HACK for unbalanced case clause.  'case foo in esac' is valid,
                # so to test for ESAC, we can read ) before getting a chance to
                # PushHint(Id.Op_RParen, Id.Right_CasePat).  So here we unread one
                # token and do it again.

                # We get Id.Op_RParen at top level:     case x in x) ;; esac
                # We get Id.Eof_RParen inside ComSub:   $(case x in x) ;; esac )
                if self.token_type in (Id.Op_RParen, Id.Eof_RParen):
                    # Rewind before it's used
                    assert self.next_lex_mode == lex_mode_e.Undefined
                    if self.lexer.MaybeUnreadOne():
                        if self.token_type == Id.Eof_RParen:
                            # Redo translation
                            self.lexer.PushHint(Id.Op_RParen, Id.Eof_RParen)
                        self._SetNext(lex_mode)

                done = True  # anything we don't recognize means we're done

            if not done:
                self._SetNext(lex_mode)
                num_parts += 1

        if (self.parse_opts.parse_brace() and num_parts > 1 and
                brace_count != 0):
            # accept { and }, but not foo{
            p_die(
                'Word has unbalanced { }.  Maybe add a space or quote it like \{',
                loc.Word(w))

        if is_triple_quoted and is_triple_quoted.b and num_parts > 1:
            p_die('Unexpected parts after triple quoted string',
                  loc.WordPart(w.parts[-1]))

        if 0:
            from _devbuild.gen.syntax_asdl import word_part_str
            word_key = ' '.join(word_part_str(p.tag()) for p in w.parts)
            WORD_HIST[word_key] += 1

        # YSH word restriction
        # (r'' u'' b'' are stripped on shopt -s parse_ysh_string)
        if not self.parse_opts.parse_word_join() and not _IsValidYshWord(w):
            p_die("Invalid quoted word part in YSH (OILS-ERR-17)",
                  loc.WordPart(part))

        return w

    def _ReadArithWord(self):
        # type: () -> Optional[word_t]
        """ Helper for ReadArithWord() """
        self._GetToken()

        if self.token_kind == Kind.Unknown:
            # e.g. happened during dynamic parsing of unset 'a[$foo]' in gherkin
            p_die(
                'Unexpected token while parsing arithmetic: %r' %
                lexer.TokenVal(self.cur_token), self.cur_token)

        elif self.token_kind == Kind.Eof:
            return self.cur_token

        elif self.token_kind == Kind.Ignored:
            # Space should be ignored.
            self._SetNext(lex_mode_e.Arith)
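            # Returning None makes ReadArithWord() loop and try again.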
            return None

        elif self.token_kind in (Kind.Arith, Kind.Right):
            # Id.Right_DollarDParen IS just a normal token, handled by ArithParser
            self._SetNext(lex_mode_e.Arith)
            return self.cur_token

        elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub):
            return self._ReadCompoundWord(lex_mode_e.Arith)

        else:
            raise AssertionError(self.cur_token)

    def _ReadWord(self, word_mode):
        # type: (lex_mode_t) -> Optional[word_t]
        """Helper function for ReadWord()."""

        # Change the pseudo lexer mode to a real lexer mode
        if word_mode == lex_mode_e.ShCommandFakeBrack:
            lex_mode = lex_mode_e.ShCommand
        else:
            lex_mode = word_mode

        self._GetToken()

        if self.token_kind == Kind.Eof:
            # No advance
            return self.cur_token

        # Allow Arith for ) at end of for loop?
        elif self.token_kind in (Kind.Op, Kind.Redir, Kind.Arith):
            self._SetNext(lex_mode)

            # Newlines are complicated.  See 3x2 matrix in the comment about
            # self.multiline and self.newline_state above.
            if self.token_type == Id.Op_Newline:
                if self.multiline:
                    if self.newline_state > 1:
                        # This points at a blank line, but at least it gives the line number
                        p_die('Invalid blank line in multiline mode',
                              self.cur_token)
                    return None

                if self.returned_newline:  # skip
                    return None

            return self.cur_token

        elif self.token_kind == Kind.Right:
            if self.token_type not in (Id.Right_Subshell, Id.Right_ShFunction,
                                       Id.Right_CasePat, Id.Right_Initializer):
                raise AssertionError(self.cur_token)

            self._SetNext(lex_mode)
            return self.cur_token

        elif self.token_kind in (Kind.Ignored, Kind.WS):
            self._SetNext(lex_mode)
            return None

        else:
            assert self.token_kind in (Kind.VSub, Kind.Lit, Kind.History,
                                       Kind.Left, Kind.KW, Kind.ControlFlow,
                                       Kind.BoolUnary, Kind.BoolBinary,
                                       Kind.ExtGlob,
                                       Kind.BashRegex), 'Unhandled token kind'

            if (word_mode == lex_mode_e.ShCommandFakeBrack and
                    self.parse_opts.parse_bracket() and
                    self.token_type == Id.Lit_LBracket):
                # Change [ from Kind.Lit -> Kind.Op
                # So CommandParser can treat
                #   assert [42 === x]
                # like
                #   json write (x)
                bracket_word = self.cur_token
                bracket_word.id = Id.Op_LBracket

                self._SetNext(lex_mode)
                return bracket_word

            # We're beginning a word.  If we see Id.Lit_Pound, change to
            # lex_mode_e.Comment and read until end of line.
            if self.token_type == Id.Lit_Pound:
                self._SetNext(lex_mode_e.Comment)
                self._GetToken()

                # NOTE: The # could be the last character in the file.  It can't be
                # Eof_{RParen,Backtick} because #) and #` are comments.
                assert self.token_type in (Id.Ignored_Comment, Id.Eof_Real), \
                    self.cur_token

                # The next iteration will go into Kind.Ignored and set lex state to
                # lex_mode_e.ShCommand/etc.
                return None  # tell ReadWord() to try again after comment

            elif self.token_type == Id.Lit_TPound:  ### doc comment
                self._SetNext(lex_mode_e.Comment)
                self._GetToken()

                if self.token_type == Id.Ignored_Comment and self.emit_doc_token:
                    return self.cur_token

                return None  # tell ReadWord() to try again after comment

            else:
                # r'' u'' b'' at the beginning of a word
                if (self.token_type == Id.Lit_Chars and
                        self.lexer.LookAheadOne(
                            lex_mode_e.ShCommand) == Id.Left_SingleQuote):

                    # When shopt -s parse_ysh_string:
                    # echo r'hi' is like echo 'hi'
                    #
                    # echo u'\u{3bc}' b'\yff' works

                    tok = self.cur_token
                    if self.parse_opts.parse_ysh_string():
                        if lexer.TokenEquals(tok, 'r'):
                            left_id = Id.Left_RSingleQuote
                        elif lexer.TokenEquals(tok, 'u'):
                            left_id = Id.Left_USingleQuote
                        elif lexer.TokenEquals(tok, 'b'):
                            left_id = Id.Left_BSingleQuote
                        else:
                            left_id = Id.Undefined_Tok

                        if left_id != Id.Undefined_Tok:
                            # skip the r, and then 'foo' will be read as normal
                            self._SetNext(lex_mode_e.ShCommand)

                            self._GetToken()
                            assert self.token_type == Id.Left_SingleQuote, self.token_type

                            # Read the word in a different lexer mode
                            return self._ReadYshSingleQuoted(left_id)

                return self._ReadCompoundWord(lex_mode)

    def ParseVarRef(self):
        # type: () -> BracedVarSub
        """DYNAMIC parsing of what's inside ${!ref}

        # Same as VarOf production
        VarRefExpr = VarOf EOF
        """
        self._SetNext(lex_mode_e.VSub_1)

        self._GetToken()
        if self.token_kind != Kind.VSub:
            p_die('Expected var name', self.cur_token)

        part = self._ParseVarOf()
        # NOTE: no ${ } means no part.left and part.right
        part.left = part.name_tok  # cheat to make test pass
        part.right = part.name_tok

        self._GetToken()
        if self.token_type != Id.Eof_Real:
            p_die('Expected end of var ref expression', self.cur_token)
        return part

    def LookPastSpace(self):
        # type: () -> Id_t
        """Look ahead to the next token.

        For the CommandParser to recognize
          array= (1 2 3)
          YSH for (    versus bash for ((
          YSH if (     versus if test
          YSH while (  versus while test
          YSH bare assignment 'grep =' versus 'grep foo'
        """
        assert self.token_type != Id.Undefined_Tok
        if self.cur_token.id == Id.WS_Space:
            id_ = self.lexer.LookPastSpace(lex_mode_e.ShCommand)
        else:
            id_ = self.cur_token.id
        return id_

    def LookAheadFuncParens(self):
        # type: () -> bool
        """Special lookahead for f( ) { echo hi; } to check for ( )"""
        assert self.token_type != Id.Undefined_Tok

        # We have to handle 2 cases because we buffer a token
        if self.cur_token.id == Id.Op_LParen:  # saw funcname(
            return self.lexer.LookAheadFuncParens(1)  # go back one char

        elif self.cur_token.id == Id.WS_Space:  # saw funcname WHITESPACE
            return self.lexer.LookAheadFuncParens(0)

        else:
            return False

    def ReadWord(self, word_mode):
        # type: (lex_mode_t) -> word_t
        """Read the next word, using the given lexer mode.

        This is a stateful wrapper for the stateless _ReadWord function.
        """
        assert word_mode in (lex_mode_e.ShCommand,
                             lex_mode_e.ShCommandFakeBrack,
                             lex_mode_e.DBracket, lex_mode_e.BashRegex)

        if self.buffered_word:  # For integration with pgen2
            w = self.buffered_word
            self.buffered_word = None
        else:
            while True:
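                # _ReadWord() returns None for things that don't produce a
                # word (e.g. whitespace, comments), so keep reading.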
                w = self._ReadWord(word_mode)
                if w is not None:
                    break

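        # Record whether we just returned a newline, so the next _ReadWord()
        # call can skip consecutive newlines.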
        self.returned_newline = (word_.CommandId(w) == Id.Op_Newline)
        return w

    def ReadArithWord(self):
        # type: () -> word_t
        while True:
            w = self._ReadArithWord()
            if w is not None:
                break
        return w

    def ReadHereDocBody(self, parts):
        # type: (List[word_part_t]) -> None
        """
        A here doc is like a double quoted context, except " isn't special.
        """
        self._ReadLikeDQ(None, False, parts)
        # Returns nothing

    def ReadForPlugin(self):
        # type: () -> CompoundWord
        """For $PS1, $PS4, etc.

        This is just like reading a here doc line.  "\n" is allowed, as
        well as the typical substitutions ${x} $(echo hi) $((1 + 2)).
        """
        w = CompoundWord([])
        self._ReadLikeDQ(None, False, w.parts)
        return w

    def EmitDocToken(self, b):
        # type: (bool) -> None
        self.emit_doc_token = b

    def Multiline(self, b):
        # type: (bool) -> None
        self.multiline = b


if 0:
    import collections
    WORD_HIST = collections.Counter()

# vim: sw=4