# Copyright 2016 Andy Chu. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
"""
word_parse.py - Parse the shell word language.

Hairy example:

    hi$((1 + 2))"$(echo hi)"${var:-__"$(echo default)"__}

Substitutions can be nested, but which inner subs are allowed depends on the
outer sub.  Notes:

lex_mode_e.ShCommand (_ReadUnquotedLeftParts)
  All subs and quotes are allowed:
  $v ${v} $() `` $(()) '' "" $'' $"" <() >()

lex_mode_e.DQ (_ReadDoubleQuotedLeftParts)
  Var, Command, Arith, but no quotes.
  $v ${v} $() `` $(())
  No process substitution.

lex_mode_e.Arith
  Similar to DQ: Var, Command, and Arith sub, but no process sub.  bash
  doesn't allow quotes, but OSH does.  We allow ALL FOUR kinds of quotes,
  because we need those for associative array indexing.

lex_mode_e.VSub_ArgUnquoted
  Like ShCommand, everything is allowed (even process substitutions), but we
  stop at }, and space is SIGNIFICANT.

  Example: ${a:- b }

  ${X:-$v} ${X:-${v}} ${X:-$(echo hi)} ${X:-`echo hi`} ${X:-$((1+2))}
  ${X:-'single'} ${X:-"double"} ${X:-$'\n'} ${X:-<(echo hi)}

lex_mode_e.VSub_ArgDQ
  In contrast to DQ, VS_ARG_DQ accepts nested "" and $'' and $"", e.g.
  "${x:-"default"}".

  In contrast, VSub_ArgUnquoted respects single quotes and process
  substitution.

  It's weird that double quotes are allowed.  Space is also significant
  here, e.g. "${x:-a "b"}".
"""

from _devbuild.gen import grammar_nt
from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str, Kind
from _devbuild.gen.types_asdl import (lex_mode_t, lex_mode_e)
from _devbuild.gen.syntax_asdl import (
    BoolParamBox,
    Token,
    SimpleVarSub,
    loc,
    source,
    DoubleQuoted,
    SingleQuoted,
    BracedVarSub,
    CommandSub,
    ShArrayLiteral,
    AssocPair,
    bracket_op,
    bracket_op_t,
    suffix_op,
    suffix_op_t,
    rhs_word,
    rhs_word_e,
    rhs_word_t,
    word_e,
    word_t,
    CompoundWord,
    word_part,
    word_part_t,
    y_lhs_e,
    arith_expr_t,
    command,
    expr,
    expr_e,
    expr_t,
    pat_t,
    ArgList,
    Proc,
    Func,
    Subscript,
    Attribute,
    arith_expr,
)
from core import alloc
from core.error import p_die
from mycpp.mylib import log
from core import pyutil
from display import ui
from frontend import consts
from frontend import lexer
from frontend import reader
from osh import tdop
from osh import arith_parse
from osh import braces
from osh import word_
from osh import word_compile
from mycpp.mylib import tagswitch

from libc import HAVE_FNM_EXTMATCH

from typing import List, Optional, Tuple, cast
from typing import TYPE_CHECKING
if TYPE_CHECKING:
    from frontend.lexer import Lexer
    from frontend.parse_lib import ParseContext
    from frontend.reader import _Reader
    from osh.cmd_parse import VarChecker

unused1 = log
unused2 = Id_str

KINDS_THAT_END_WORDS = [Kind.Eof, Kind.WS, Kind.Op, Kind.Right]
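# Illustrative example: in 'echo hi; ls', the word 'echo' ends at whitespace
# (Kind.WS), 'hi' ends at the ; operator (Kind.Op), and 'ls' ends at EOF
# (Kind.Eof).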


class WordEmitter(object):
    """Common interface for [ and [["""

    def __init__(self):
        # type: () -> None
        """Empty constructor for mycpp."""
        pass

    def ReadWord(self, lex_mode):
        # type: (lex_mode_t) -> word_t
        raise NotImplementedError()


class WordParser(WordEmitter):

    def __init__(self, parse_ctx, lexer, line_reader):
        # type: (ParseContext, Lexer, _Reader) -> None
        self.parse_ctx = parse_ctx
        self.lexer = lexer
        self.line_reader = line_reader
        self.arena = line_reader.arena

        self.parse_opts = parse_ctx.parse_opts
        self.a_parser = tdop.TdopParser(arith_parse.Spec(), self,
                                        self.parse_opts)
        self.Reset()

    def Init(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Used to parse arithmetic, see ParseContext."""
        self.next_lex_mode = lex_mode

    def Reset(self):
        # type: () -> None
        """Called by interactive loop."""
        # For _GetToken()
        self.cur_token = None  # type: Token
        self.token_kind = Kind.Undefined
        self.token_type = Id.Undefined_Tok

        self.next_lex_mode = lex_mode_e.ShCommand

        # Boolean mutated by CommandParser via word_.ctx_EmitDocToken.  For
        # ### doc comments
        self.emit_doc_token = False
        # Boolean mutated by CommandParser via word_.ctx_Multiline.  '...'
        # starts multiline mode.
        self.multiline = False

        # For detecting invalid \n\n in multiline mode.  Counts what we got
        # directly from the lexer.
        self.newline_state = 0
        # For consolidating \n\n -> \n for the CALLER.  This simplifies the
        # parsers that consume words.
        self.returned_newline = False

        # For integration with pgen2
        self.buffered_word = None  # type: word_t

    def _GetToken(self):
        # type: () -> None
        """Call this when you need to make a decision based on any of:

        self.token_type
        self.token_kind
        self.cur_token
        """
        if self.next_lex_mode == lex_mode_e.Undefined:
            return  # _SetNext() not called, so do nothing

        is_fake = self.next_lex_mode == lex_mode_e.BashRegexFakeInner
        real_mode = (lex_mode_e.BashRegex if is_fake else self.next_lex_mode)

        self.cur_token = self.lexer.Read(real_mode)

        # MUTATE TOKEN for fake lexer mode.
        # This is for crazy stuff bash allows, like [[ s =~ (< >) ]]
        if (is_fake and self.cur_token.id
                in (Id.WS_Space, Id.BashRegex_AllowedInParens)):
            self.cur_token.id = Id.Lit_Chars

        self.token_type = self.cur_token.id
        self.token_kind = consts.GetKind(self.token_type)

        # number of consecutive newlines, ignoring whitespace
        if self.token_type == Id.Op_Newline:
            self.newline_state += 1
        elif self.token_kind != Kind.WS:
            self.newline_state = 0

        self.parse_ctx.trail.AppendToken(self.cur_token)  # For completion
        self.next_lex_mode = lex_mode_e.Undefined

    def _SetNext(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Set the next lex state, but don't actually read a token.

        We need this for proper interactive parsing.
        """
        self.next_lex_mode = lex_mode

    def _ReadVarOpArg(self, arg_lex_mode):
        # type: (lex_mode_t) -> rhs_word_t

        # NOTE: Operators like | and < are not treated as special, so
        # ${a:- | >} is valid, even when unquoted.
        self._SetNext(arg_lex_mode)
        self._GetToken()

        w = self._ReadVarOpArg2(arg_lex_mode, Id.Undefined_Tok,
                                True)  # empty_ok

        # If the Compound has no parts, and we're in a double-quoted VarSub
        # arg, and empty_ok, then return Empty.  This is so it can evaluate
        # to the empty string and not get elided.
        #
        # Examples:
        # - "${s:-}", "${s/%pat/}"
        # It's similar to LooksLikeShAssignment where we turn x= into x=''.
        # And it has the same potential problem of not having Token location
        # info.
        #
        # NOTE: empty_ok is False only for the PatSub pattern, which means
        # we'll return a Compound with no parts, which is explicitly checked
        # with a custom error message.
        if len(w.parts) == 0 and arg_lex_mode == lex_mode_e.VSub_ArgDQ:
            return rhs_word.Empty

        return w

    def _ReadVarOpArg2(self, arg_lex_mode, eof_type, empty_ok):
        # type: (lex_mode_t, Id_t, bool) -> CompoundWord
        """Return a CompoundWord.

        Helper function for _ReadVarOpArg and used directly by
        _ReadPatSubVarOp.
        """
        w = self._ReadCompoundWord3(arg_lex_mode, eof_type, empty_ok)
        #log('w %s', w)
        tilde = word_.TildeDetect(w)
        if tilde:
            w = tilde
        return w

    def _ReadSliceVarOp(self):
        # type: () -> suffix_op.Slice
        """
        Looking at the token after the first ':'

        ArithExpr? (':' ArithExpr? )? '}'
        """
        self._NextNonSpace()

        cur_id = self.token_type

        if cur_id in (Id.Arith_RBrace, Id.Arith_Colon):  # ${a:} or ${a::}
            begin = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            begin = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()  # advance

        if cur_id == Id.Arith_RBrace:  # ${a:1} or ${@:1}
            # No length specified, so it's N
            no_length = None  # type: Optional[arith_expr_t]
            return suffix_op.Slice(begin, no_length)

        elif cur_id == Id.Arith_Colon:  # ${a:1:} or ${@:1:}
            colon_tok = self.cur_token
            self._NextNonSpace()

            if self.token_type == Id.Arith_RBrace:
                # quirky bash behavior:
                # ${a:1:} or ${a::} means length ZERO
                # but ${a:1} or ${a:} means length N
                if self.parse_opts.strict_parse_slice():
                    p_die(
                        "Slice length: Add explicit zero, or omit : for N (strict_parse_slice)",
                        colon_tok)

                length = arith_expr.EmptyZero  # type: arith_expr_t
            else:
                length = self._ReadArithExpr(Id.Arith_RBrace)

            return suffix_op.Slice(begin, length)

        else:
            p_die("Expected : or } in slice", self.cur_token)

        raise AssertionError()  # for MyPy
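
    # Illustrative cases for the quirk handled above, with a=abcde:
    #   ${a:1}    => bcde  (no length, take the rest)
    #   ${a:1:}   => ''    (empty length means zero)
    #   ${a:1:2}  => bc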

    def _ReadPatSubVarOp(self):
        # type: () -> suffix_op.PatSub
        """Looking at the first '/' after VarOf:

        VarSub = ...
               | VarOf '/' Match ( '/' WORD? )?
        Match  = '/' WORD   # can't be empty
               | '#' WORD?  # may be empty
               | '%' WORD?
        """
        slash_tok = self.cur_token  # location info
        replace_mode = Id.Undefined_Tok  # bizarre syntax / # %

        self._SetNext(lex_mode_e.VSub_ArgUnquoted)  # advance past /

        self._GetToken()
        if self.token_type == Id.Right_DollarBrace:
            pat = CompoundWord([])
            return suffix_op.PatSub(pat, rhs_word.Empty, replace_mode,
                                    slash_tok)

        if self.token_type in (Id.Lit_Slash, Id.Lit_Pound, Id.Lit_Percent):
            replace_mode = self.token_type
            self._SetNext(lex_mode_e.VSub_ArgUnquoted)

        # Bash quirk:
        # echo ${x/#/replace} has an empty pattern
        # echo ${x////replace} is non-empty; it means echo ${x//'/'/replace}
        empty_ok = replace_mode != Id.Lit_Slash
        pat = self._ReadVarOpArg2(lex_mode_e.VSub_ArgUnquoted, Id.Lit_Slash,
                                  empty_ok)
        #log('pat 1 %r', pat)

        if self.token_type == Id.Lit_Slash:
            # read until }
            replace = self._ReadVarOpArg(
                lex_mode_e.VSub_ArgUnquoted)  # type: rhs_word_t
            #log('r 1 %r', replace)
        else:
            # e.g. ${v/a} is the same as ${v/a/} -- empty replacement string
            replace = rhs_word.Empty

        self._GetToken()
        if self.token_type != Id.Right_DollarBrace:
            # This happens on invalid code
            p_die(
                "Expected } after replacement string, got %s" %
                ui.PrettyId(self.token_type), self.cur_token)

        return suffix_op.PatSub(pat, replace, replace_mode, slash_tok)
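
    # Examples handled above:
    #   ${x/pat/repl}   normal substitution
    #   ${x//pat/repl}  replace_mode is Lit_Slash: replace all matches
    #   ${x/#pat/repl}  replace_mode is Lit_Pound: anchor at the start
    #   ${x/%pat/repl}  replace_mode is Lit_Percent: anchor at the end
    #   ${x/pat}        same as ${x/pat/} -- empty replacement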

    def _ReadSubscript(self):
        # type: () -> bracket_op_t
        """ Subscript = '[' ('@' | '*' | ArithExpr) ']' """
        # Lookahead to see if we get @ or *.  Otherwise read a full
        # arithmetic expression.
        next_id = self.lexer.LookPastSpace(lex_mode_e.Arith)
        if next_id in (Id.Lit_At, Id.Arith_Star):
            op = bracket_op.WholeArray(next_id)  # type: bracket_op_t

            self._SetNext(lex_mode_e.Arith)  # skip past [
            self._GetToken()
            self._SetNext(lex_mode_e.Arith)  # skip past @
            self._GetToken()
        else:
            self._SetNext(lex_mode_e.Arith)  # skip past [
            anode = self._ReadArithExpr(Id.Arith_RBracket)
            op = bracket_op.ArrayIndex(anode)

        if self.token_type != Id.Arith_RBracket:  # Should be looking at ]
            p_die('Expected ] to close subscript', self.cur_token)

        self._SetNext(lex_mode_e.VSub_2)  # skip past ]
        self._GetToken()  # Needed to be in the same spot as no subscript

        return op
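
    # Examples: ${a[i+1]} reads an arithmetic index, while ${a[@]} and
    # ${a[*]} are detected by lookahead as whole-array operations.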

    def _ParseVarOf(self):
        # type: () -> BracedVarSub
        """
        VarOf = NAME Subscript?
              | NUMBER     # no subscript allowed, none of these are arrays
                           # ${@[1]} doesn't work, even though slicing does
              | VarSymbol
        """
        self._GetToken()
        name_token = self.cur_token
        self._SetNext(lex_mode_e.VSub_2)

        self._GetToken()  # Check for []
        if self.token_type == Id.VOp2_LBracket:
            bracket_op = self._ReadSubscript()
        else:
            bracket_op = None

        part = BracedVarSub.CreateNull()
        part.name_tok = name_token
        part.var_name = lexer.TokenVal(name_token)
        part.bracket_op = bracket_op
        return part

    def _ParseVarExpr(self, arg_lex_mode, allow_query=False):
        # type: (lex_mode_t, bool) -> BracedVarSub
        """Start parsing at the op -- we already skipped past the name."""
        part = self._ParseVarOf()

        self._GetToken()
        if self.token_type == Id.Right_DollarBrace:
            return part  # no ops

        op_kind = self.token_kind

        if op_kind == Kind.VTest:
            tok = self.cur_token
            arg_word = self._ReadVarOpArg(arg_lex_mode)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            part.suffix_op = suffix_op.Unary(tok, arg_word)

        elif op_kind == Kind.VOpYsh:
            tok = self.cur_token
            arg_word = self._ReadVarOpArg(arg_lex_mode)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            UP_arg_word = arg_word
            with tagswitch(arg_word) as case:
                if case(rhs_word_e.Empty):
                    pass
                elif case(rhs_word_e.Compound):
                    arg_word = cast(CompoundWord, UP_arg_word)
                    # This handles ${x|html} and ${x %.3f} now
                    # However I think ${x %.3f} should be statically parsed?
                    # It can enter the printf lexer modes.
                    ok, arg, quoted = word_.StaticEval(arg_word)
                    if not ok or quoted:
                        p_die('Expected a constant argument',
                              loc.Word(arg_word))

            part.suffix_op = suffix_op.Static(tok, arg)

        elif op_kind == Kind.VOp0:
            part.suffix_op = self.cur_token  # Nullary
            self._SetNext(lex_mode_e.VSub_2)  # Expecting }
            self._GetToken()

        elif op_kind == Kind.VOp1:  # % %% # ## etc.
            tok = self.cur_token
            # Weird exception that all shells have: these operators take a
            # glob pattern, so they're lexed as VSub_ArgUnquoted, not
            # VSub_ArgDQ
            arg_word = self._ReadVarOpArg(lex_mode_e.VSub_ArgUnquoted)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            part.suffix_op = suffix_op.Unary(tok, arg_word)

        elif op_kind == Kind.VOp2:  # / : [ ]
            if self.token_type == Id.VOp2_Slash:
                patsub_op = self._ReadPatSubVarOp()  # type: suffix_op_t
                part.suffix_op = patsub_op

                # Checked by the method above
                assert self.token_type == Id.Right_DollarBrace, self.cur_token

            elif self.token_type == Id.VOp2_Colon:
                part.suffix_op = self._ReadSliceVarOp()
                # NOTE: } in arithmetic mode.
                if self.token_type != Id.Arith_RBrace:
                    # Token seems off; doesn't point to X in ${a:1:2 X
                    p_die('Expected } to close ${', self.cur_token)

            else:
                # TODO: Does this ever happen?
                p_die('Unexpected token in ${} (%s)' % 'VOp2',
                      self.cur_token)

        elif op_kind == Kind.VOp3:  # ${prefix@} etc.
            if allow_query:
                part.suffix_op = self.cur_token  # Nullary
                self._SetNext(lex_mode_e.VSub_2)  # Expecting }
                self._GetToken()
            else:
                p_die("Unexpected token in ${} (%s)" % 'VOp3',
                      self.cur_token)

        # NOTE: Arith_RBrace is for slicing, because it reads } in
        # arithmetic mode.  It's redundantly checked above.
        if self.token_type not in (Id.Right_DollarBrace, Id.Arith_RBrace):
            # ${a.} or ${!a.}
            p_die('Expected } to close ${', self.cur_token)

        # Now look for ops
        return part

    def _ReadZshVarSub(self, left_token):
        # type: (Token) -> word_part.ZshVarSub

        self._SetNext(lex_mode_e.VSub_Zsh)  # Move past ${(foo)

        # Can be empty
        w = self._ReadCompoundWord3(lex_mode_e.VSub_Zsh,
                                    Id.Right_DollarBrace, True)
        self._GetToken()
        return word_part.ZshVarSub(left_token, w, self.cur_token)

    def ReadBracedVarSub(self, left_token):
        # type: (Token) -> Tuple[BracedVarSub, Token]
        """ For YSH expressions like var x = ${x:-"default"}. """
        part = self._ReadBracedVarSub(left_token, d_quoted=False)
        last_token = self.cur_token
        return part, last_token

    def _ReadBracedVarSub(self, left_token, d_quoted):
        # type: (Token, bool) -> BracedVarSub
        """For the ${} expression language.

        NAME       = [a-zA-Z_][a-zA-Z0-9_]*
        NUMBER     = [0-9]+                 # ${10}, ${11}, ...

        Subscript  = '[' ('@' | '*' | ArithExpr) ']'
        VarSymbol  = '!' | '@' | '#' | ...
        VarOf      = NAME Subscript?
                   | NUMBER     # no subscript allowed, none of these are
                                # arrays; ${@[1]} doesn't work, even though
                                # slicing does
                   | VarSymbol

        NULLARY_OP = '@Q' | '@E' | '@P' | '@A' | '@a'  # VOp0

        TEST_OP    = '-' | ':-' | '=' | ':=' | '+' | ':+' | '?' | ':?'
        STRIP_OP   = '#' | '##' | '%' | '%%'
        CASE_OP    = ',' | ',,' | '^' | '^^'
        UnaryOp    = TEST_OP | STRIP_OP | CASE_OP

        YSH_UNARY  = '|' | ' '  # ${x|html} and ${x %.3f}.
                                # SPACE is the operator, not %
        Match      = ('/' | '#' | '%') WORD  # match all / prefix / suffix
        VarExpr    = VarOf
                   | VarOf NULLARY_OP
                   | VarOf UnaryOp WORD
                   | VarOf YSH_UNARY STATIC_WORD
                   | VarOf ':' ArithExpr (':' ArithExpr )?
                   | VarOf '/' Match '/' WORD

        LengthExpr = '#' VarOf  # can't apply operators after length

        RefOrKeys  = '!' VarExpr  # CAN apply operators after a named ref
                                  # ${!ref[0]} vs ${!keys[@]} resolved later

        PrefixQuery = '!' NAME ('*' | '@')  # list variable names with a
                                            # prefix

        BuiltinSub = '.' WORD+  # ${.myproc 'builtin' $sub}

        VarSub     = LengthExpr
                   | RefOrKeys
                   | PrefixQuery
                   | VarExpr
                   | BuiltinSub

        NOTES:
        - Arithmetic expressions are used twice, inside subscripts ${a[x+1]}
          and slicing ${a:x+1:y+2}
        - ${#} and ${!} need LL(2) lookahead (considering how my tokenizer
          works)
        - @ and * are technically arithmetic expressions in this
          implementation
        - We don't account for bash 4.4: ${param@operator} -- Q E P A a.
          Note that it's also vectorized.

        Strictness over bash:
        - echo ${a[0][0]} doesn't do anything useful, so we disallow it from
          the grammar
        - ! and # prefixes can't be composed, even though named refs can be
          composed with other operators
        - '#' means 4 different things: length prefix, VarSymbol, UnaryOp to
          strip a prefix, and it can also be a literal part of WORD.

        From the parser's point of view, the prefix # can't be combined with
        UnaryOp/slicing/matching, and the ! can.  However

        - ${a[@]:1:2} is not allowed
        - ${#a[@]:1:2} is allowed, but gives the wrong answer
        """
        if d_quoted:
            arg_lex_mode = lex_mode_e.VSub_ArgDQ
        else:
            arg_lex_mode = lex_mode_e.VSub_ArgUnquoted

        self._SetNext(lex_mode_e.VSub_1)
        self._GetToken()

        ty = self.token_type
        first_tok = self.cur_token

        if ty == Id.VSub_Pound:
            # Disambiguate
            next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
            if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
                # e.g. a name, '#' is the prefix
                self._SetNext(lex_mode_e.VSub_1)
                part = self._ParseVarOf()

                self._GetToken()
                if self.token_type != Id.Right_DollarBrace:
                    p_die('Expected } after length expression',
                          self.cur_token)

                part.prefix_op = first_tok

            else:  # not a prefix, '#' is the variable
                part = self._ParseVarExpr(arg_lex_mode)

        elif ty == Id.VSub_Bang:
            next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
            if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
                # e.g. a name, '!' is the prefix
                # ${!a} -- this is a ref
                # ${!3} -- this is a ref
                # ${!a[1]} -- this is a ref
                # ${!a[@]} -- this is a keys
                # No lookahead -- do it in a second step, or at runtime
                self._SetNext(lex_mode_e.VSub_1)
                part = self._ParseVarExpr(arg_lex_mode, allow_query=True)

                part.prefix_op = first_tok

            else:  # not a prefix, '!' is the variable
                part = self._ParseVarExpr(arg_lex_mode)

        elif ty == Id.VSub_Dot:
            # Note: this will become a new builtin_sub type, so this method
            # must return word_part_t rather than BracedVarSub.  I don't
            # think that should cause problems.
            p_die('TODO: ${.myproc builtin sub}', self.cur_token)

        # VS_NAME, VS_NUMBER, symbol that isn't # or !
        elif self.token_kind == Kind.VSub:
            part = self._ParseVarExpr(arg_lex_mode)

        else:
            # e.g. ${^}
            p_die('Unexpected token in ${}', self.cur_token)

        part.left = left_token  # attach the argument
        part.right = self.cur_token
        return part
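
    # Disambiguation examples for the # and ! prefixes:
    #   ${#}      -> number of args; '#' is the variable itself
    #   ${#a}     -> length of a; '#' is the length prefix
    #   ${!a}     -> named reference to whatever a contains
    #   ${!a[@]}  -> the keys of array a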

    def _ReadSingleQuoted(self, left_token, lex_mode):
        # type: (Token, lex_mode_t) -> SingleQuoted
        """Internal method to read a word_part."""
        tokens = []  # type: List[Token]
        # In command mode, we never disallow backslashes like '\'
        right_quote = self.ReadSingleQuoted(lex_mode, left_token, tokens,
                                            False)
        sval = word_compile.EvalSingleQuoted(left_token.id, tokens)
        node = SingleQuoted(left_token, sval, right_quote)
        return node

    def ReadSingleQuoted(self, lex_mode, left_token, out_tokens,
                         is_ysh_expr):
        # type: (lex_mode_t, Token, List[Token], bool) -> Token
        """Appends to out_tokens; returns last token

        Used by expr_parse.py
        """
        # TODO: Remove and use out_tokens
        tokens = []  # type: List[Token]

        # echo '\' is allowed, but x = '\' is invalid, in favor of x = r'\'
        no_backslashes = is_ysh_expr and left_token.id == Id.Left_SingleQuote

        expected_end_tokens = 3 if left_token.id in (
            Id.Left_TSingleQuote, Id.Left_RTSingleQuote,
            Id.Left_UTSingleQuote, Id.Left_BTSingleQuote) else 1
        num_end_tokens = 0

        while num_end_tokens < expected_end_tokens:
            self._SetNext(lex_mode)
            self._GetToken()

            # Kind.Char emitted in lex_mode.SQ_C
            if self.token_kind in (Kind.Lit, Kind.Char):
                tok = self.cur_token
                # Happens in lex_mode_e.SQ: 'one\two' is ambiguous, should
                # be r'one\two' or c'one\\two'
                if no_backslashes and lexer.TokenContains(tok, '\\'):
                    p_die(
                        r"Strings with backslashes should look like r'\n' or u'\n' or b'\n'",
                        tok)

                if is_ysh_expr:
                    # Disallow var x = $'\001'.  Arguably we don't need
                    # these checks because u'\u{1}' is the way to write it.
                    if self.token_type == Id.Char_Octal3:
                        p_die(
                            r"Use \xhh or \u{...} instead of octal escapes in YSH strings",
                            tok)

                    if self.token_type == Id.Char_Hex and self.cur_token.length != 4:
                        # disallow \xH
                        p_die(
                            r'Invalid hex escape in YSH string (must be \xHH)',
                            tok)

                tokens.append(tok)

            elif self.token_kind == Kind.Unknown:
                tok = self.cur_token
                assert tok.id == Id.Unknown_Backslash, tok

                # x = $'\z' is disallowed; ditto for echo $'\z' if
                # shopt -u parse_backslash
                if is_ysh_expr or not self.parse_opts.parse_backslash():
                    p_die(
                        "Invalid char escape in C-style string literal (OILS-ERR-11)",
                        tok)

                tokens.append(tok)

            elif self.token_kind == Kind.Eof:
                p_die('Unexpected EOF in single-quoted string that began here',
                      left_token)

            elif self.token_kind == Kind.Right:
                # assume Id.Right_SingleQuote
                num_end_tokens += 1
                tokens.append(self.cur_token)

            else:
                raise AssertionError(self.cur_token)

            if self.token_kind != Kind.Right:
                num_end_tokens = 0  # we need three in a ROW

        if expected_end_tokens == 1:
            tokens.pop()
        elif expected_end_tokens == 3:  # Get rid of spurious end tokens
            tokens.pop()
            tokens.pop()
            tokens.pop()

        # Remove space from ''' r''' $''' in both expression mode and
        # command mode
        if left_token.id in (Id.Left_TSingleQuote, Id.Left_RTSingleQuote,
                             Id.Left_UTSingleQuote, Id.Left_BTSingleQuote):
            word_compile.RemoveLeadingSpaceSQ(tokens)

        # Validation after lexing - same 2 checks in j8.LexerDecoder
        is_u_string = left_token.id in (Id.Left_USingleQuote,
                                        Id.Left_UTSingleQuote)

        for tok in tokens:
            # u'\yff' is not valid, but b'\yff' is
            if is_u_string and tok.id == Id.Char_YHex:
                p_die(
                    r"%s escapes not allowed in u'' strings" %
                    lexer.TokenVal(tok), tok)

        out_tokens.extend(tokens)
        return self.cur_token
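
    # Note on the counting above: a triple-quoted string ends only at three
    # consecutive quote tokens, so in ''' a'b ''' the quote after 'a' bumps
    # num_end_tokens to 1, and the following literal resets it to 0.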

    def _ReadDoubleQuotedLeftParts(self):
        # type: () -> word_part_t
        """Read substitution parts in a double quoted context."""
        if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick):
            return self._ReadCommandSub(self.token_type, d_quoted=True)

        if self.token_type == Id.Left_DollarBrace:
            return self._ReadBracedVarSub(self.cur_token, d_quoted=True)

        if self.token_type == Id.Left_DollarDParen:
            return self._ReadArithSub()

        if self.token_type == Id.Left_DollarBracket:
            return self._ReadExprSub(lex_mode_e.DQ)

        if self.token_type == Id.Left_DollarBraceZsh:
            return self._ReadZshVarSub(self.cur_token)

        raise AssertionError(self.cur_token)

    def _ReadYshSingleQuoted(self, left_id):
        # type: (Id_t) -> CompoundWord
        """Read YSH style strings

        r''  u''  b''
        r''' '''  u''' '''  b''' '''
        """
        #log('BEF self.cur_token %s', self.cur_token)
        if left_id == Id.Left_RSingleQuote:
            lexer_mode = lex_mode_e.SQ_Raw
            triple_left_id = Id.Left_RTSingleQuote
        elif left_id == Id.Left_USingleQuote:
            lexer_mode = lex_mode_e.J8_Str
            triple_left_id = Id.Left_UTSingleQuote
        elif left_id == Id.Left_BSingleQuote:
            lexer_mode = lex_mode_e.J8_Str
            triple_left_id = Id.Left_BTSingleQuote
        else:
            raise AssertionError(left_id)

        # Needed for syntax checks
        left_tok = self.cur_token
        left_tok.id = left_id

        sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)

        if (len(sq_part.sval) == 0 and self.lexer.ByteLookAhead() == "'"):
            self._SetNext(lex_mode_e.ShCommand)
            self._GetToken()

            assert self.token_type == Id.Left_SingleQuote
            # HACK: magically transform the third ' in u''' to
            # Id.Left_UTSingleQuote, so that ''' is the terminator
            left_tok = self.cur_token
            left_tok.id = triple_left_id

            # Handles stripping leading whitespace
            sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)

        # Advance and validate
        self._SetNext(lex_mode_e.ShCommand)

        self._GetToken()
        if self.token_kind not in KINDS_THAT_END_WORDS:
            p_die('Unexpected token after YSH single-quoted string',
                  self.cur_token)

        return CompoundWord([sq_part])

    def _ReadUnquotedLeftParts(self, triple_out):
        # type: (Optional[BoolParamBox]) -> word_part_t
        """Read substitutions and quoted strings (for lex_mode_e.ShCommand).

        If triple_out is set, then we try parsing triple quoted strings,
        and set its value to True if we got one.
        """
        if self.token_type in (Id.Left_DoubleQuote,
                               Id.Left_DollarDoubleQuote):
            # Note: $"" is a synonym for "".  It might make sense if it
            # added \n \0 \x00 \u{123} etc.  But that's not what bash does!
            dq_part = self._ReadDoubleQuoted(self.cur_token)
            # Got empty word "" and there's a " after
            if (triple_out and len(dq_part.parts) == 0 and
                    self.lexer.ByteLookAhead() == '"'):

                self._SetNext(lex_mode_e.ShCommand)
                self._GetToken()
                # HACK: magically transform the third " in """ to
                # Id.Left_TDoubleQuote, so that """ is the terminator
                left_dq_token = self.cur_token
                left_dq_token.id = Id.Left_TDoubleQuote
                triple_out.b = True  # let caller know we got it
                return self._ReadDoubleQuoted(left_dq_token)

            return dq_part

        if self.token_type in (Id.Left_SingleQuote, Id.Left_RSingleQuote,
                               Id.Left_DollarSingleQuote):
            if self.token_type == Id.Left_SingleQuote:
                lexer_mode = lex_mode_e.SQ_Raw
                triple_left_id = Id.Left_TSingleQuote
            elif self.token_type == Id.Left_RSingleQuote:
                lexer_mode = lex_mode_e.SQ_Raw
                triple_left_id = Id.Left_RTSingleQuote
            else:
                lexer_mode = lex_mode_e.SQ_C
                # there is no such thing as $'''
                triple_left_id = Id.Undefined_Tok

            sq_part = self._ReadSingleQuoted(self.cur_token, lexer_mode)

            # Got empty '' or r'' and there's a ' after
            # u'' and b'' are handled in _ReadYshSingleQuoted
            if (triple_left_id != Id.Undefined_Tok and
                    triple_out is not None and len(sq_part.sval) == 0 and
                    self.lexer.ByteLookAhead() == "'"):

                self._SetNext(lex_mode_e.ShCommand)
                self._GetToken()

                # HACK: magically transform the third ' in ''' to
                # Id.Left_TSingleQuote, so that ''' is the terminator
                left_sq_token = self.cur_token
                left_sq_token.id = triple_left_id

                triple_out.b = True  # let caller know we got it
                return self._ReadSingleQuoted(left_sq_token, lexer_mode)

            return sq_part

        if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick,
                               Id.Left_ProcSubIn, Id.Left_ProcSubOut):
            return self._ReadCommandSub(self.token_type, d_quoted=False)

        if self.token_type == Id.Left_DollarBrace:
            return self._ReadBracedVarSub(self.cur_token, d_quoted=False)

        if self.token_type == Id.Left_DollarDParen:
            return self._ReadArithSub()

        if self.token_type == Id.Left_DollarBracket:
            return self._ReadExprSub(lex_mode_e.ShCommand)

        if self.token_type == Id.Left_DollarBraceZsh:
            return self._ReadZshVarSub(self.cur_token)

        raise AssertionError(self.cur_token)

    def _ReadExtGlob(self):
        # type: () -> word_part.ExtGlob
        """
        Grammar:
          Item    = CompoundWord | EPSILON  # important: @(foo|) is allowed
          LEFT    = '@(' | '*(' | '+(' | '?(' | '!('
          RIGHT   = ')'
          ExtGlob = LEFT (Item '|')* Item RIGHT  # ITEM may be empty
          Compound includes ExtGlob
        """
        left_token = self.cur_token
        right_token = None  # type: Token
        arms = []  # type: List[CompoundWord]

        self.lexer.PushHint(Id.Op_RParen, Id.Right_ExtGlob)
        self._SetNext(lex_mode_e.ExtGlob)  # advance past LEFT

        read_word = False  # did we just read a word?  To handle @(||).

        while True:
            self._GetToken()

            if self.token_type == Id.Right_ExtGlob:
                if not read_word:
                    arms.append(CompoundWord([]))
                right_token = self.cur_token
                break

            elif self.token_type == Id.Op_Pipe:
                if not read_word:
                    arms.append(CompoundWord([]))
                read_word = False
                self._SetNext(lex_mode_e.ExtGlob)

            # lex_mode_e.ExtGlob should only produce these 4 kinds of tokens
            elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub,
                                     Kind.ExtGlob):
                w = self._ReadCompoundWord(lex_mode_e.ExtGlob)
                arms.append(w)
                read_word = True

            elif self.token_kind == Kind.Eof:
                p_die('Unexpected EOF reading extended glob that began here',
                      left_token)

            else:
                raise AssertionError(self.cur_token)

        return word_part.ExtGlob(left_token, arms, right_token)
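
    # Examples: @(foo|bar) has two arms; @(foo|) and @(||) have empty arms,
    # which is why read_word tracks whether a word preceded each | or ).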

    def _ReadBashRegexGroup(self):
        # type: () -> word_part.BashRegexGroup
        """
        Grammar:
          BashRegexGroup = '(' WORD? ')'
        """
        left_token = self.cur_token
        assert left_token.id == Id.BashRegex_LParen, left_token

        arms = []  # type: List[CompoundWord]

        self.lexer.PushHint(Id.Op_RParen, Id.Right_BashRegexGroup)
        self._SetNext(lex_mode_e.BashRegexFakeInner)  # advance past LEFT

        self._GetToken()
        if self.token_type == Id.Right_BashRegexGroup:  # empty ()
            return word_part.BashRegexGroup(left_token, None, self.cur_token)

        # lex_mode_e.BashRegex should only produce these 4 kinds of tokens
        if self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub,
                               Kind.BashRegex):
            # Fake lexer mode that translates Id.WS_Space to Id.Lit_Chars
            # To allow bash style [[ s =~ (a b) ]]
            w = self._ReadCompoundWord(lex_mode_e.BashRegexFakeInner)
            arms.append(w)

            self._GetToken()
            if self.token_type != Id.Right_BashRegexGroup:
                p_die('Expected ) to close bash regex group', self.cur_token)

            return word_part.BashRegexGroup(left_token, w, self.cur_token)

        p_die('Expected word after ( opening bash regex group',
              self.cur_token)

    def _ReadLikeDQ(self, left_token, is_ysh_expr, out_parts):
        # type: (Optional[Token], bool, List[word_part_t]) -> None
        """
        Args:
          left_token: A token if we are reading a double quoted part, or
            None if we're reading a here doc.
          is_ysh_expr: Whether to disallow backticks and invalid char
            escapes
          out_parts: list of word_part to append to
        """
        if left_token:
            if left_token.id in (Id.Left_TDoubleQuote,
                                 Id.Left_DollarTDoubleQuote):
                expected_end_tokens = 3
            else:
                expected_end_tokens = 1
        else:
            expected_end_tokens = 1000  # here doc will break

        num_end_tokens = 0
        while num_end_tokens < expected_end_tokens:
            self._SetNext(lex_mode_e.DQ)
            self._GetToken()

            if self.token_kind == Kind.Lit:
                if self.token_type == Id.Lit_EscapedChar:
                    tok = self.cur_token
                    ch = lexer.TokenSliceLeft(tok, 1)
                    part = word_part.EscapedLiteral(tok,
                                                    ch)  # type: word_part_t
                else:
                    if self.token_type == Id.Lit_BadBackslash:
                        # echo "\z" is OK in shell, but x = "\z" is a syntax
                        # error in YSH.
                        # Slight hole: We don't catch x = ${undef:-"\z"}
                        # because of the recursion (unless parse_backslash)
                        if (is_ysh_expr or
                                not self.parse_opts.parse_backslash()):
                            p_die(
                                "Invalid char escape in double quoted string (OILS-ERR-12)",
                                self.cur_token)
                    elif self.token_type == Id.Lit_Dollar:
                        if is_ysh_expr or not self.parse_opts.parse_dollar():
                            p_die("Literal $ should be quoted like \$",
                                  self.cur_token)

                    part = self.cur_token
                out_parts.append(part)

            elif self.token_kind == Kind.Left:
                if self.token_type == Id.Left_Backtick and is_ysh_expr:
                    p_die("Invalid backtick: use $(cmd) or \\` in YSH strings",
                          self.cur_token)

                part = self._ReadDoubleQuotedLeftParts()
                out_parts.append(part)

            elif self.token_kind == Kind.VSub:
                tok = self.cur_token
                part = SimpleVarSub(tok)
                out_parts.append(part)
                # NOTE: parsing "$f(x)" would BREAK CODE.  Could add support
                # for it later.

            elif self.token_kind == Kind.Right:
                assert self.token_type == Id.Right_DoubleQuote, self.token_type
                if left_token:
                    num_end_tokens += 1

                # In a here doc, the right quote is literal!
                out_parts.append(self.cur_token)

            elif self.token_kind == Kind.Eof:
                if left_token:
                    p_die(
                        'Unexpected EOF reading double-quoted string that began here',
                        left_token)
                else:  # here docs will have an EOF in their token stream
                    break

            else:
                raise AssertionError(self.cur_token)

            if self.token_kind != Kind.Right:
                num_end_tokens = 0  # """ must be CONSECUTIVE

        if expected_end_tokens == 1:
            out_parts.pop()
        elif expected_end_tokens == 3:
            out_parts.pop()
            out_parts.pop()
            out_parts.pop()

        # Remove space from """ in both expression mode and command mode
        if (left_token and left_token.id
                in (Id.Left_TDoubleQuote, Id.Left_DollarTDoubleQuote)):
            word_compile.RemoveLeadingSpaceDQ(out_parts)

        # Return nothing, since we appended to 'out_parts'
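
    # The triple-quoted case above mirrors ReadSingleQuoted: """ ends only
    # at three consecutive " tokens, then RemoveLeadingSpaceDQ strips the
    # leading whitespace, like the ''' handling for single quotes.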

    def _ReadDoubleQuoted(self, left_token):
        # type: (Token) -> DoubleQuoted
        """Helper function for "hello $name".

        Args:
          left_token: the opening quote token

        Also ${foo%%a b c}  # treated as double quoted, until you hit }
        """
        parts = []  # type: List[word_part_t]
        self._ReadLikeDQ(left_token, False, parts)

        right_quote = self.cur_token
        return DoubleQuoted(left_token, parts, right_quote)

    def ReadDoubleQuoted(self, left_token, parts):
        # type: (Token, List[word_part_t]) -> Token
        """For expression mode.

        Read var x = "${dir:-}/$name"; etc.
        """
        self._ReadLikeDQ(left_token, True, parts)
        return self.cur_token

    def _ReadCommandSub(self, left_id, d_quoted=False):
        # type: (Id_t, bool) -> CommandSub
        """
        NOTE: This is not in the grammar, because word parts aren't in the
        grammar!

        command_sub = '$(' command_list ')'
                    | '@(' command_list ')'
                    | '<(' command_list ')'
                    | '>(' command_list ')'
                    | ` command_list `
        """
        left_token = self.cur_token

        # Set the lexer in a state so ) becomes the EOF token.
        if left_id in (Id.Left_DollarParen, Id.Left_AtParen,
                       Id.Left_ProcSubIn, Id.Left_ProcSubOut):
            self._SetNext(lex_mode_e.ShCommand)  # advance past $( etc.

            right_id = Id.Eof_RParen
            self.lexer.PushHint(Id.Op_RParen, right_id)
            c_parser = self.parse_ctx.MakeParserForCommandSub(
                self.line_reader, self.lexer, right_id)
            # NOTE: This doesn't use something like main_loop because we
            # don't want to interleave parsing and execution!  Unlike
            # 'source' and 'eval'.
            node = c_parser.ParseCommandSub()

            right_token = c_parser.w_parser.cur_token

        elif left_id == Id.Left_Backtick and self.parse_ctx.do_lossless:
            # NOTE: This is an APPROXIMATE solution for translation ONLY.
            # See test/osh2oil.

            right_id = Id.Eof_Backtick
            self.lexer.PushHint(Id.Left_Backtick, right_id)
            c_parser = self.parse_ctx.MakeParserForCommandSub(
                self.line_reader, self.lexer, right_id)
            node = c_parser.ParseCommandSub()
            right_token = c_parser.w_parser.cur_token

        elif left_id == Id.Left_Backtick:
            if not self.parse_opts.parse_backticks():
                p_die('Use $(cmd) instead of backticks (parse_backticks)',
                      left_token)

            self._SetNext(lex_mode_e.Backtick)  # advance past `

            parts = []  # type: List[str]
            while True:
                self._GetToken()
                #log("TOK %s", self.cur_token)

                if self.token_type == Id.Backtick_Quoted:
                    # Remove leading \
                    parts.append(lexer.TokenSliceLeft(self.cur_token, 1))

                elif self.token_type == Id.Backtick_DoubleQuote:
                    # Compatibility: If backticks are double quoted, then
                    # double quotes within them have to be \"
                    # Shells aren't smart enough to match nested " and `
                    # quotes (but OSH is)
                    if d_quoted:
                        # Remove leading \
                        parts.append(lexer.TokenSliceLeft(self.cur_token, 1))
                    else:
                        parts.append(lexer.TokenVal(self.cur_token))

                elif self.token_type == Id.Backtick_Other:
                    parts.append(lexer.TokenVal(self.cur_token))

                elif self.token_type == Id.Backtick_Right:
                    break

                elif self.token_type == Id.Eof_Real:
                    # Note: this parse error is in the ORIGINAL context.  No
                    # code_str yet.
                    p_die('Unexpected EOF while looking for closing backtick',
                          left_token)

                else:
                    raise AssertionError(self.cur_token)

                self._SetNext(lex_mode_e.Backtick)

            # Calculate right SPID on CommandSub BEFORE re-parsing.
            right_token = self.cur_token

            code_str = ''.join(parts)
            #log('code %r', code_str)

            # NOTE: This is similar to how we parse aliases in
            # osh/cmd_parse.py.  It won't have the same location info as
            # MakeParserForCommandSub(), because the lexer is different.
            arena = self.parse_ctx.arena
            #arena = alloc.Arena()
            line_reader = reader.StringLineReader(code_str, arena)
            c_parser = self.parse_ctx.MakeOshParser(line_reader)
            src = source.Reparsed('backticks', left_token, right_token)
            with alloc.ctx_SourceCode(arena, src):
                node = c_parser.ParseCommandSub()

        else:
            raise AssertionError(left_id)

        return CommandSub(left_token, node, right_token)
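
    # Backtick examples handled above:
    #   `echo hi`         reparsed like the command sub $(echo hi)
    #   "`echo \"x\"`"    inside "", inner double quotes must be \"
    #   `echo \`date\``   nesting requires backslash-escaped backticks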

    def _ReadExprSub(self, lex_mode):
        # type: (lex_mode_t) -> word_part.ExprSub
        """$[d->key]  $[obj.method()]  etc."""
        left_token = self.cur_token

        self._SetNext(lex_mode_e.Expr)
        enode, right_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.ysh_expr_sub)

        self._SetNext(lex_mode)  # Move past ]
        return word_part.ExprSub(left_token, enode, right_token)

    def ParseVarDecl(self, kw_token):
        # type: (Token) -> command.VarDecl
        """
        oil_var_decl: name_type_list '=' testlist end_stmt

        Note that assignments must end with \n ; } or EOF.  Unlike shell
        assignments, we disallow:

        var x = 42 | wc -l
        var x = 42 && echo hi
        """
        self._SetNext(lex_mode_e.Expr)
        enode, last_token = self.parse_ctx.ParseVarDecl(kw_token, self.lexer)
        # Hack to move } from what the Expr lexer mode gives to what
        # CommandParser wants
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace

        # Let the CommandParser see the Op_Semi or Op_Newline.
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)  # always back to this
        return enode

    def ParseMutation(self, kw_token, var_checker):
        # type: (Token, VarChecker) -> command.Mutation
        """
        setvar i = 42
        setvar i += 1
        setvar a[i] = 42
        setvar a[i] += 1
        setvar d.key = 42
        setvar d.key += 1
        """
        self._SetNext(lex_mode_e.Expr)
        enode, last_token = self.parse_ctx.ParseMutation(kw_token,
                                                         self.lexer)
        # Hack to move } from what the Expr lexer mode gives to what
        # CommandParser wants
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace

        for lhs in enode.lhs:
            UP_lhs = lhs
            with tagswitch(lhs) as case:
                if case(y_lhs_e.Var):
                    lhs = cast(Token, UP_lhs)
                    var_checker.Check(kw_token.id, lexer.LazyStr(lhs), lhs)

                # Note: this does not cover cases like
                #   setvar (a[0])[1] = v
                #   setvar (d.key).other = v
                # This leaks into catching all typos statically, which may
                # be possible if 'use' makes all names explicit.
                elif case(y_lhs_e.Subscript):
                    lhs = cast(Subscript, UP_lhs)
                    if lhs.obj.tag() == expr_e.Var:
                        v = cast(expr.Var, lhs.obj)
                        var_checker.Check(kw_token.id, v.name, v.left)

                elif case(y_lhs_e.Attribute):
                    lhs = cast(Attribute, UP_lhs)
                    if lhs.obj.tag() == expr_e.Var:
                        v = cast(expr.Var, lhs.obj)
                        var_checker.Check(kw_token.id, v.name, v.left)

        # Let the CommandParser see the Op_Semi or Op_Newline.
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)  # always back to this
        return enode

    def ParseBareDecl(self):
        # type: () -> expr_t
        """
        x = {name: val}
        """
        self._SetNext(lex_mode_e.Expr)
        self._GetToken()
        enode, last_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.command_expr)
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)
        return enode

    def ParseYshExprForCommand(self):
        # type: () -> expr_t

        # Fudge for this case
        #  for x in(y) {
        # versus
        #  for x in (y) {
        #
        # In the former case, ReadWord on 'in' puts the lexer past (.
        # Also see LookPastSpace in CommandParser.
        # A simpler solution would be nicer.

        if self.token_type == Id.Op_LParen:
            self.lexer.MaybeUnreadOne()

        enode, _ = self.parse_ctx.ParseYshExpr(self.lexer,
                                               grammar_nt.ysh_expr)

        self._SetNext(lex_mode_e.ShCommand)
        return enode

    def ParseCommandExpr(self):
        # type: () -> expr_t
        """
        = 1+2
        """
        enode, last_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.command_expr)

        # In some cases, such as the case statement, we expect *the lexer*
        # to be pointing at the token right after the expression.  But the
        # expression parser must have read to the `last_token`.  Unreading
        # places the lexer back in the expected state.  I.e.:
        #
        # case (x) {                       case (x) {
        #   (else) { = x }                   (else) { = x }
        #          ^ The lexer is here              ^ Unread to here
        # }                                }
        assert last_token.id in (Id.Op_Newline, Id.Eof_Real, Id.Op_Semi,
                                 Id.Op_RBrace), last_token
        if last_token.id != Id.Eof_Real:
            # Eof_Real is the only token we cannot unread
            self.lexer.MaybeUnreadOne()

        return enode

    def ParseProc(self, node):
        # type: (Proc) -> None

        # proc name-with-hyphens() must be accepted
        self._SetNext(lex_mode_e.ShCommand)
        self._GetToken()
        # example: 'proc f[' gets you Lit_ArrayLhsOpen
        if self.token_type != Id.Lit_Chars:
            p_die('Invalid proc name %s' % ui.PrettyToken(self.cur_token),
                  self.cur_token)

        # TODO: validate this more.  Disallow proc 123 { }, which isn't
        # disallowed for shell functions.  Similar to IsValidVarName().
        node.name = self.cur_token

        last_token = self.parse_ctx.ParseProc(self.lexer, node)

        # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
        assert last_token.id == Id.Op_LBrace
        last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        self._SetNext(lex_mode_e.ShCommand)

    def ParseFunc(self, node):
        # type: (Func) -> None
        last_token = self.parse_ctx.ParseFunc(self.lexer, node)

        # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
        assert last_token.id == Id.Op_LBrace
        last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        self._SetNext(lex_mode_e.ShCommand)

    def ParseYshCasePattern(self):
        # type: () -> Tuple[pat_t, Token]
        pat, left_tok, last_token = self.parse_ctx.ParseYshCasePattern(
            self.lexer)

        if last_token.id == Id.Op_LBrace:
            last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        return pat, left_tok

    def NewlineOkForYshCase(self):
        # type: () -> Id_t
        """Check for optional newline and consume it.

        This is a special case of `_NewlineOk` which fixes some "off-by-one"
        issues that crop up while parsing YSH case arms.  For more details,
        see #oil-dev > Progress On YSH Case Grammar on zulip.

        Returns a token id which is filled with the choice of

          word   { echo word }
          (3)    { echo expr }
          /e/    { echo eggex }
          }      # right brace
        """
        while True:
            next_id = self.lexer.LookAheadOne(lex_mode_e.Expr)

            # Cannot lookahead past lines
            if next_id == Id.Unknown_Tok:
                if not self.lexer.MoveToNextLine():  # Try to move to next line
                    break  # EOF
                continue

            next_kind = consts.GetKind(next_id)
            if next_id != Id.Op_Newline and next_kind != Kind.Ignored:
                break

            self.lexer.Read(lex_mode_e.Expr)

        if next_id in (Id.Op_RBrace, Id.Op_LParen, Id.Arith_Slash):
            self._SetNext(lex_mode_e.Expr)  # Continue in expression mode
        else:
            # Consume the trailing Op_Newline
            self._SetNext(lex_mode_e.ShCommand)
            self._GetToken()

        return next_id

    def _ReadArithExpr(self, end_id):
        # type: (Id_t) -> arith_expr_t
        """Read and parse an arithmetic expression in various contexts.

        $(( 1+2 ))
        (( a=1+2 ))
        ${a[ 1+2 ]}
        ${a : 1+2 : 1+2}

        See tests/arith-context.test.sh for ambiguous cases.

        ${a[a[0]]} is valid  # VS_RBRACKET vs Id.Arith_RBracket

        ${s : a<b?0:1 : 1}  # VS_COLON vs Id.Arith_Colon

        See the assertion in ArithParser.Parse() -- unexpected extra input.
        """
        # calls self.ReadWord(lex_mode_e.Arith)
        anode = self.a_parser.Parse()
        cur_id = self.a_parser.CurrentId()
        if end_id != Id.Undefined_Tok and cur_id != end_id:
            p_die(
                'Unexpected token after arithmetic expression (%s != %s)' %
                (ui.PrettyId(cur_id), ui.PrettyId(end_id)),
                loc.Word(self.a_parser.cur_word))
        return anode

    def _ReadArithSub(self):
        # type: () -> word_part.ArithSub
        """Read an arith substitution, which contains an arith expression,
        e.g. $((a + 1)).
        """
        left_tok = self.cur_token

        # The second ) needs to be disambiguated in stuff like:
        #   $(echo $(( 1+2 )) )
        self.lexer.PushHint(Id.Op_RParen, Id.Right_DollarDParen)

        # NOTE: To disambiguate $(( as arith sub vs. command sub and
        # subshell, we could save the lexer/reader state here, and retry if
        # the arithmetic parse fails.  But we can almost always catch this
        # at parse time.  There could be some exceptions like:
        #   $((echo * foo))  # looks like multiplication
        #   $((echo / foo))  # looks like division

        # $(( )) is valid
        anode = arith_expr.EmptyZero  # type: arith_expr_t

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            anode = self._ReadArithExpr(Id.Arith_RParen)

        self._SetNext(lex_mode_e.ShCommand)

        # Ensure we get closing )
        self._GetToken()
        if self.token_type != Id.Right_DollarDParen:
            p_die('Expected second ) to end arith sub', self.cur_token)

        right_tok = self.cur_token
        return word_part.ArithSub(left_tok, anode, right_tok)

    def ReadDParen(self):
        # type: () -> Tuple[arith_expr_t, Token]
        """Read ((1+ 2)) -- command context.

        We're using the word parser because it's very similar to
        _ReadArithExpr above.

        This also returns the terminating Id.Op_DRightParen token for
        location info.
        """
        # (( )) is valid
        anode = arith_expr.EmptyZero  # type: arith_expr_t

        self.lexer.PushHint(Id.Op_RParen, Id.Op_DRightParen)

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            anode = self._ReadArithExpr(Id.Arith_RParen)

        self._SetNext(lex_mode_e.ShCommand)

        # Ensure we get the second )
        self._GetToken()
        right = self.cur_token
        if right.id != Id.Op_DRightParen:
            p_die('Expected second ) to end arith statement', right)

        self._SetNext(lex_mode_e.ShCommand)

        return anode, right

    def _NextNonSpace(self):
        # type: () -> None
        """Advance in lex_mode_e.Arith until non-space token.

        Same logic as _ReadWord, but used in
          $(( ))
          (( ))
          for (( ))

        You can read self.token_type after this, without calling _GetToken.
        """
        while True:
            self._SetNext(lex_mode_e.Arith)
            self._GetToken()
            if self.token_kind not in (Kind.Ignored, Kind.WS):
                break

    def ReadForExpression(self):
        # type: () -> command.ForExpr
        """Read ((i=0; i<5; ++i)) -- part of command context."""
        self._NextNonSpace()  # skip over ((
        cur_id = self.token_type  # for end of arith expressions

        if cur_id == Id.Arith_Semi:  # for (( ; i < 10; i++ ))
            init_node = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            init_node = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()
        self._NextNonSpace()

        # It's odd to keep track of both cur_id and self.token_type in this
        # function, but it works, and is tested in 'test/parse_error.sh
        # arith-integration'
        if cur_id != Id.Arith_Semi:  # for (( x=0 b; ... ))
            p_die("Expected ; here", loc.Word(self.a_parser.cur_word))

        self._GetToken()
        cur_id = self.token_type

        if cur_id == Id.Arith_Semi:  # for (( ; ; i++ ))
            # empty condition is TRUE
            cond_node = arith_expr.EmptyOne  # type: arith_expr_t
        else:
            cond_node = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()

        if cur_id != Id.Arith_Semi:  # for (( x=0; x<5 b ))
            p_die("Expected ; here", loc.Word(self.a_parser.cur_word))

        self._NextNonSpace()
        if self.token_type == Id.Arith_RParen:  # for (( ; ; ))
            update_node = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            update_node = self._ReadArithExpr(Id.Arith_RParen)

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            p_die('Expected ) to end for loop expression', self.cur_token)
        self._SetNext(lex_mode_e.ShCommand)

        # redirects is None, will be assigned in CommandEvaluator
        node = command.ForExpr.CreateNull()
        node.init = init_node
        node.cond = cond_node
        node.update = update_node
        return node
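
    # Examples of the empty sections handled above:
    #   for (( ; i < 3; ++i ))  init defaults to EmptyZero
    #   for (( ; ; ))           empty condition defaults to EmptyOne (true)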

    def _ReadArrayLiteral(self):
        # type: () -> word_part_t
        """a=(1 2 3)

        TODO: See osh/cmd_parse.py:164 for Id.Lit_ArrayLhsOpen, for a[x++]=1

        We want:

        A=(['x']=1 ["x"]=2 [$x$y]=3)

        Maybe allow this as a literal string?  Because I think I've seen it
        before?  Or maybe force people to patch to learn the rule.

        A=([x]=4)

        Starts with Lit_Other '[', and then it has Lit_ArrayLhsClose
        Maybe enforce that ALL have keys or NONE of them have keys.
        """
        self._SetNext(lex_mode_e.ShCommand)  # advance past (
        self._GetToken()
        if self.cur_token.id != Id.Op_LParen:
            p_die('Expected ( after =', self.cur_token)
        left_token = self.cur_token
        right_token = None  # type: Token

        # MUST use a new word parser (with same lexer).
        w_parser = self.parse_ctx.MakeWordParser(self.lexer,
                                                 self.line_reader)
        words = []  # type: List[CompoundWord]
        done = False
        while not done:
            w = w_parser.ReadWord(lex_mode_e.ShCommand)
            with tagswitch(w) as case:
                if case(word_e.Operator):
                    tok = cast(Token, w)
                    if tok.id == Id.Right_ShArrayLiteral:
                        right_token = tok
                        done = True  # can't use break here
                    # Unlike command parsing, array parsing allows embedded
                    # \n.
                    elif tok.id == Id.Op_Newline:
                        continue
                    else:
                        p_die('Unexpected token in array literal',
                              loc.Word(w))

                elif case(word_e.Compound):
                    words.append(cast(CompoundWord, w))

                else:
                    raise AssertionError()

        if len(words) == 0:  # a=() is empty indexed array
            # Needed for type safety, doh
            no_words = []  # type: List[word_t]
            node = ShArrayLiteral(left_token, no_words, right_token)
            return node

        pairs = []  # type: List[AssocPair]
        # If the first one is a key/value pair, then the rest are assumed
        # to be.
        pair = word_.DetectAssocPair(words[0])
        if pair:
            pairs.append(pair)

            n = len(words)
            for i in xrange(1, n):
                w2 = words[i]
                pair = word_.DetectAssocPair(w2)
                if not pair:
                    p_die("Expected associative array pair", loc.Word(w2))

                pairs.append(pair)

            # invariant List?
            return word_part.BashAssocLiteral(left_token, pairs,
                                              right_token)

        # Brace detection for arrays but NOT associative arrays
        words2 = braces.BraceDetectAll(words)
        words3 = word_.TildeDetectAll(words2)
        return ShArrayLiteral(left_token, words3, right_token)
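
    # Examples: a=(1 2 3) yields an indexed ShArrayLiteral, while
    # A=([k]=1 [k2]=2) becomes a BashAssocLiteral because the first word
    # looks like a key/value pair.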
1687
1688 def ParseProcCallArgs(self, start_symbol):
1689 # type: (int) -> ArgList
1690 """ json write (x) """
1691 self.lexer.MaybeUnreadOne()
1692
1693 arg_list = ArgList.CreateNull(alloc_lists=True)
1694 arg_list.left = self.cur_token
1695 self.parse_ctx.ParseProcCallArgs(self.lexer, arg_list, start_symbol)
1696 return arg_list
1697
1698 def _MaybeReadWordPart(self, is_first, lex_mode, parts):
1699 # type: (bool, lex_mode_t, List[word_part_t]) -> bool
1700 """Helper for _ReadCompoundWord3."""
1701 done = False
1702
1703 if self.token_type == Id.Lit_EscapedChar:
1704 tok = self.cur_token
1705 assert tok.length == 2
1706 ch = lexer.TokenSliceLeft(tok, 1)
1707 if not self.parse_opts.parse_backslash():
1708 if not pyutil.IsValidCharEscape(ch):
1709 p_die('Invalid char escape in unquoted word (OILS-ERR-13)',
1710 self.cur_token)
1711
1712 part = word_part.EscapedLiteral(self.cur_token,
1713 ch) # type: word_part_t
1714 else:
1715 part = self.cur_token
1716
1717 if is_first and self.token_type == Id.Lit_VarLike: # foo=
1718 parts.append(part)
1719 # Unfortunately it's awkward to pull the check for a=(1 2) up to
1720 # _ReadWord.
1721 next_id = self.lexer.LookPastSpace(lex_mode)
1722 if next_id == Id.Op_LParen:
1723 self.lexer.PushHint(Id.Op_RParen, Id.Right_ShArrayLiteral)
1724 part2 = self._ReadArrayLiteral()
1725 parts.append(part2)
1726
1727 # Array literal must be the last part of the word.
1728 self._SetNext(lex_mode)
1729 self._GetToken()
1730 # EOF, whitespace, newline, Right_Subshell
1731 if self.token_kind not in KINDS_THAT_END_WORDS:
1732 p_die('Unexpected token after array literal',
1733 self.cur_token)
1734 done = True
1735
1736 elif (is_first and self.parse_opts.parse_at() and
1737 self.token_type == Id.Lit_Splice):
1738
1739 splice_tok = self.cur_token
1740 part2 = word_part.Splice(splice_tok,
1741 lexer.TokenSliceLeft(splice_tok, 1))
1742
1743 parts.append(part2)
1744
1745 # @words must be the last part of the word
1746 self._SetNext(lex_mode)
1747 self._GetToken()
1748 # EOF, whitespace, newline, Right_Subshell
1749 if self.token_kind not in KINDS_THAT_END_WORDS:
1750 p_die('Unexpected token after array splice', self.cur_token)
1751 done = True
1752
1753 elif (is_first and self.parse_opts.parse_at() and
1754 self.token_type == Id.Lit_AtLBracket): # @[split(x)]
1755 part2 = self._ReadExprSub(lex_mode_e.DQ)
1756 parts.append(part2)
1757
1758 # @[split(x)]
1759 self._SetNext(lex_mode)
1760 self._GetToken()
1761 # EOF, whitespace, newline, Right_Subshell
1762 if self.token_kind not in KINDS_THAT_END_WORDS:
1763 p_die('Unexpected token after Expr splice', self.cur_token)
1764 done = True
1765
1766 elif (is_first and self.parse_opts.parse_at() and
1767 self.token_type == Id.Lit_AtLBraceDot):
1768 p_die('TODO: @{.myproc builtin sub}', self.cur_token)
1769
1770 elif (is_first and self.parse_opts.parse_at_all() and
1771 self.token_type == Id.Lit_At):
1772 # Because $[x] ${x} and perhaps $/x/ are reserved, it makes sense for @
1773 # at the beginning of a word to be reserved.
1774
1775 # Although should we relax 'echo @'?  I'm tempted to have a shortcut for
1776 # @_argv.
1777 p_die('Literal @ starting a word must be quoted (parse_at_all)',
1778 self.cur_token)
1779
1780 else:
1781 # not a literal with lookahead; append it
1782 parts.append(part)
1783
1784 return done
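# Examples of the "must end the word" rule enforced above (illustrative,
# with parse_at on for the splice cases):
#
#   a=(1 2)x      -> 'Unexpected token after array literal'
#   echo @words   -> OK; the space after @words ends the word
#   echo @words/x -> 'Unexpected token after array splice'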
1785
1786 def _ReadCompoundWord(self, lex_mode):
1787 # type: (lex_mode_t) -> CompoundWord
1788 return self._ReadCompoundWord3(lex_mode, Id.Undefined_Tok, True)
1789
1790 def _ReadCompoundWord3(self, lex_mode, eof_type, empty_ok):
1791 # type: (lex_mode_t, Id_t, bool) -> CompoundWord
1792 """
1793 Precondition: Looking at the first token of the first word part
1794 Postcondition: Looking at the token after, e.g. space or operator
1795
1796 NOTE: eof_type is necessary because / is a literal, i.e. Lit_Slash, but it
1797 could be an operator delimiting a compound word. Can we change lexer modes
1798 and remove this special case?
1799 """
1800 w = CompoundWord([])
1801 num_parts = 0
1802 brace_count = 0
1803 done = False
1804 is_triple_quoted = None # type: Optional[BoolParamBox]
1805
1806 while not done:
1807 self._GetToken()
1808
1809 allow_done = empty_ok or num_parts != 0
1810 if allow_done and self.token_type == eof_type:
1811 done = True # e.g. for ${foo//pat/replace}
1812
1813 # Keywords like "for" are treated like literals
1814 elif self.token_kind in (Kind.Lit, Kind.History, Kind.KW,
1815 Kind.ControlFlow, Kind.BoolUnary,
1816 Kind.BoolBinary):
1817
1818 # Count { and } here; unbalanced braces are a syntax error, checked below
1819 if self.token_type == Id.Lit_LBrace:
1820 brace_count += 1
1821 elif self.token_type == Id.Lit_RBrace:
1822 brace_count -= 1
1823 elif self.token_type == Id.Lit_Dollar:
1824 if not self.parse_opts.parse_dollar():
1825 if num_parts == 0 and lex_mode == lex_mode_e.ShCommand:
1826 next_byte = self.lexer.ByteLookAhead()
1827 # TODO: switch lexer modes and parse $/d+/. But not ${a:-$/d+/}
1828 if next_byte == '/':
1829 #log('next_byte %r', next_byte)
1830 pass
1831
1832 p_die('Literal $ should be quoted like \$',
1833 self.cur_token)
1834
1835 done = self._MaybeReadWordPart(num_parts == 0, lex_mode,
1836 w.parts)
1837
1838 elif self.token_kind == Kind.VSub:
1839 vsub_token = self.cur_token
1840
1841 part = SimpleVarSub(vsub_token) # type: word_part_t
1842 w.parts.append(part)
1843
1844 elif self.token_kind == Kind.ExtGlob:
1845 # If parse_at, we can take over @( to start @(seq 3)
1846 # Users can also use ,(*.py|*.sh)
1847 if (self.parse_opts.parse_at() and
1848 self.token_type == Id.ExtGlob_At and num_parts == 0):
1849 cs_part = self._ReadCommandSub(Id.Left_AtParen,
1850 d_quoted=False)
1851 # RARE mutation of tok.id!
1852 cs_part.left_token.id = Id.Left_AtParen
1853 part = cs_part # for type safety
1854
1855 # Same check as _MaybeReadWordPart. @(seq 3)x is illegal, just like
1856 # a=(one two)x and @arrayfunc(3)x.
1857 self._GetToken()
1858 if self.token_kind not in KINDS_THAT_END_WORDS:
1859 p_die('Unexpected token after @()', self.cur_token)
1860 done = True
1861
1862 else:
1863 if HAVE_FNM_EXTMATCH == 0:
1864 p_die(
1865 "Extended glob won't work without FNM_EXTMATCH support in libc",
1866 self.cur_token)
1867 part = self._ReadExtGlob()
1868 w.parts.append(part)
1869
1870 elif self.token_kind == Kind.BashRegex:
1871 if self.token_type == Id.BashRegex_LParen: # Opening (
1872 part = self._ReadBashRegexGroup()
1873 w.parts.append(part)
1874 else:
1875 assert self.token_type == Id.BashRegex_AllowedInParens
1876 p_die('Invalid token in bash regex', self.cur_token)
1877
1878 elif self.token_kind == Kind.Left:
1879 try_triple_quote = (self.parse_opts.parse_triple_quote() and
1880 lex_mode == lex_mode_e.ShCommand and
1881 num_parts == 0)
1882
1883 # Save allocation
1884 if try_triple_quote:
1885 is_triple_quoted = BoolParamBox(False)
1886
1887 part = self._ReadUnquotedLeftParts(is_triple_quoted)
1888 w.parts.append(part)
1889
1890 # NOT done yet, will advance below
1891 elif self.token_kind == Kind.Right:
1892 # Still part of the word; will be done on the next iter.
1893 if self.token_type == Id.Right_DoubleQuote:
1894 pass
1895 # Never happens, no PushHint for this case.
1896 #elif self.token_type == Id.Right_DollarParen:
1897 # pass
1898 elif self.token_type == Id.Right_Subshell:
1899 # LEXER HACK for (case x in x) ;; esac )
1900 # Rewind before it's used
1901 assert self.next_lex_mode == lex_mode_e.Undefined
1902 if self.lexer.MaybeUnreadOne():
1903 self.lexer.PushHint(Id.Op_RParen, Id.Right_Subshell)
1904 self._SetNext(lex_mode)
1905 done = True
1906 else:
1907 done = True
1908
1909 elif self.token_kind == Kind.Ignored:
1910 done = True
1911
1912 else:
1913 # LEXER HACK for unbalanced case clause. 'case foo in esac' is valid,
1914 # so to test for ESAC, we may read ) before getting a chance to
1915 # PushHint(Id.Op_RParen, Id.Right_CasePat). So here we unread one
1916 # token and read it again.
1917
1918 # We get Id.Op_RParen at top level: case x in x) ;; esac
1919 # We get Id.Eof_RParen inside ComSub: $(case x in x) ;; esac )
1920 if self.token_type in (Id.Op_RParen, Id.Eof_RParen):
1921 # Rewind before it's used
1922 assert self.next_lex_mode == lex_mode_e.Undefined
1923 if self.lexer.MaybeUnreadOne():
1924 if self.token_type == Id.Eof_RParen:
1925 # Redo translation
1926 self.lexer.PushHint(Id.Op_RParen, Id.Eof_RParen)
1927 self._SetNext(lex_mode)
1928
1929 done = True # anything we don't recognize means we're done
1930
1931 if not done:
1932 self._SetNext(lex_mode)
1933 num_parts += 1
1934
1935 if (self.parse_opts.parse_brace() and num_parts > 1 and
1936 brace_count != 0):
1937 # accept { and }, but not foo{
1938 p_die(
1939 'Word has unbalanced { }. Maybe add a space or quote it like \{',
1940 loc.Word(w))
1941
1942 if is_triple_quoted and is_triple_quoted.b and num_parts > 1:
1943 p_die('Unexpected parts after triple quoted string',
1944 loc.WordPart(w.parts[-1]))
1945
1946 if 0:
1947 from _devbuild.gen.syntax_asdl import word_part_str
1948 word_key = ' '.join(word_part_str(p.tag()) for p in w.parts)
1949 WORD_HIST[word_key] += 1
1950 return w
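# Walkthrough (illustrative): parsing foo${x}bar in lex_mode_e.ShCommand
# yields one CompoundWord with three parts:
#
#   [Lit_Chars 'foo', BracedVarSub 'x', Lit_Chars 'bar']
#
# while foo{ leaves brace_count at 1 across two parts, so with parse_brace
# it dies with 'Word has unbalanced { }'; {a,b} balances back to 0 and passes.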
1951
1952 def _ReadArithWord(self):
1953 # type: () -> Optional[word_t]
1954 """ Helper for ReadArithWord() """
1955 self._GetToken()
1956
1957 if self.token_kind == Kind.Unknown:
1958 # e.g. happened during dynamic parsing of unset 'a[$foo]' in gherkin
1959 p_die(
1960 'Unexpected token while parsing arithmetic: %r' %
1961 lexer.TokenVal(self.cur_token), self.cur_token)
1962
1963 elif self.token_kind == Kind.Eof:
1964 return self.cur_token
1965
1966 elif self.token_kind == Kind.Ignored:
1967 # Space should be ignored.
1968 self._SetNext(lex_mode_e.Arith)
1969 return None
1970
1971 elif self.token_kind in (Kind.Arith, Kind.Right):
1972 # Id.Right_DollarDParen IS just a normal token, handled by ArithParser
1973 self._SetNext(lex_mode_e.Arith)
1974 return self.cur_token
1975
1976 elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub):
1977 return self._ReadCompoundWord(lex_mode_e.Arith)
1978
1979 else:
1980 raise AssertionError(self.cur_token)
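# Illustrative: while parsing $(( x + 1 )), 'x' is Kind.Lit and becomes a
# CompoundWord, '+' is Kind.Arith and is returned as a bare token, and the
# spaces are Kind.Ignored, so this returns None and ReadArithWord() retries.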
1981
1982 def _ReadWord(self, word_mode):
1983 # type: (lex_mode_t) -> Optional[word_t]
1984 """Helper function for ReadWord()."""
1985
1986 # Change the pseudo lexer mode to a real lexer mode
1987 if word_mode == lex_mode_e.ShCommandFakeBrack:
1988 lex_mode = lex_mode_e.ShCommand
1989 else:
1990 lex_mode = word_mode
1991
1992 self._GetToken()
1993
1994 if self.token_kind == Kind.Eof:
1995 # No advance
1996 return self.cur_token
1997
1998 # Allow Arith for ) at end of for loop?
1999 elif self.token_kind in (Kind.Op, Kind.Redir, Kind.Arith):
2000 self._SetNext(lex_mode)
2001
2002 # Newlines are complicated. See 3x2 matrix in the comment about
2003 # self.multiline and self.newline_state above.
2004 if self.token_type == Id.Op_Newline:
2005 if self.multiline:
2006 if self.newline_state > 1:
2007 # This points at a blank line, but at least it gives the line number
2008 p_die('Invalid blank line in multiline mode',
2009 self.cur_token)
2010 return None
2011
2012 if self.returned_newline: # skip
2013 return None
2014
2015 return self.cur_token
2016
2017 elif self.token_kind == Kind.Right:
2018 if self.token_type not in (Id.Right_Subshell, Id.Right_ShFunction,
2019 Id.Right_CasePat,
2020 Id.Right_ShArrayLiteral):
2021 raise AssertionError(self.cur_token)
2022
2023 self._SetNext(lex_mode)
2024 return self.cur_token
2025
2026 elif self.token_kind in (Kind.Ignored, Kind.WS):
2027 self._SetNext(lex_mode)
2028 return None
2029
2030 else:
2031 assert self.token_kind in (Kind.VSub, Kind.Lit, Kind.History,
2032 Kind.Left, Kind.KW, Kind.ControlFlow,
2033 Kind.BoolUnary, Kind.BoolBinary,
2034 Kind.ExtGlob,
2035 Kind.BashRegex), 'Unhandled token kind'
2036
2037 if (word_mode == lex_mode_e.ShCommandFakeBrack and
2038 self.parse_opts.parse_bracket() and
2039 self.token_type == Id.Lit_LBracket):
2040 # Change [ from Kind.Lit -> Kind.Op
2041 # So CommandParser can treat
2042 # assert [42 === x]
2043 # like
2044 # json write (x)
2045 bracket_word = self.cur_token
2046 bracket_word.id = Id.Op_LBracket
2047
2048 self._SetNext(lex_mode)
2049 return bracket_word
2050
2051 # We're beginning a word. If we see Id.Lit_Pound, change to
2052 # lex_mode_e.Comment and read until end of line.
2053 if self.token_type == Id.Lit_Pound:
2054 self._SetNext(lex_mode_e.Comment)
2055 self._GetToken()
2056
2057 # NOTE: The # could be the last character in the file. It can't be
2058 # Eof_{RParen,Backtick} because #) and #` are comments.
2059 assert self.token_type in (Id.Ignored_Comment, Id.Eof_Real), \
2060 self.cur_token
2061
2062 # The next iteration will go into Kind.Ignored and set lex state to
2063 # lex_mode_e.ShCommand/etc.
2064 return None # tell ReadWord() to try again after comment
2065
2066 elif self.token_type == Id.Lit_TPound: ### doc comment
2067 self._SetNext(lex_mode_e.Comment)
2068 self._GetToken()
2069
2070 if self.token_type == Id.Ignored_Comment and self.emit_doc_token:
2071 return self.cur_token
2072
2073 return None # tell ReadWord() to try again after comment
2074
2075 else:
2076 # r'' u'' b''
2077 if (self.token_type == Id.Lit_Chars and
2078 self.lexer.LookAheadOne(
2079 lex_mode_e.ShCommand) == Id.Left_SingleQuote):
2080
2081 # When shopt -s parse_ysh_string:
2082 # echo r'hi' is like echo 'hi'
2083 #
2084 # echo u'\u{3bc}' b'\yff' works
2085
2086 tok = self.cur_token
2087 if self.parse_opts.parse_ysh_string():
2088 if lexer.TokenEquals(tok, 'r'):
2089 left_id = Id.Left_RSingleQuote
2090 elif lexer.TokenEquals(tok, 'u'):
2091 left_id = Id.Left_USingleQuote
2092 elif lexer.TokenEquals(tok, 'b'):
2093 left_id = Id.Left_BSingleQuote
2094 else:
2095 left_id = Id.Undefined_Tok
2096
2097 if left_id != Id.Undefined_Tok:
2098 # skip the r, and then 'foo' will be read as normal
2099 self._SetNext(lex_mode_e.ShCommand)
2100
2101 self._GetToken()
2102 assert self.token_type == Id.Left_SingleQuote, self.token_type
2103
2104 # Read the word in a different lexer mode
2105 return self._ReadYshSingleQuoted(left_id)
2106
2107 return self._ReadCompoundWord(lex_mode)
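# Illustrative: for echo r'hi', the lexer yields Lit_Chars 'r' with
# Left_SingleQuote right behind it, so with parse_ysh_string the 'r' is
# skipped and _ReadYshSingleQuoted(Id.Left_RSingleQuote) reads a raw string.
# A bare 'r' followed by anything else falls through to _ReadCompoundWord.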
2108
2109 def ParseVarRef(self):
2110 # type: () -> BracedVarSub
2111 """DYNAMIC parsing of what's inside ${!ref}
2112
2113 # Same as VarOf production
2114 VarRefExpr = VarOf EOF
2115 """
2116 self._SetNext(lex_mode_e.VSub_1)
2117
2118 self._GetToken()
2119 if self.token_kind != Kind.VSub:
2120 p_die('Expected var name', self.cur_token)
2121
2122 part = self._ParseVarOf()
2123 # NOTE: no ${ } means no part.left and part.right
2124 part.left = part.name_tok # cheat to make test pass
2125 part.right = part.name_tok
2126
2127 self._GetToken()
2128 if self.token_type != Id.Eof_Real:
2129 p_die('Expected end of var ref expression', self.cur_token)
2130 return part
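# Illustrative: for ref='myarray[0]'; echo ${!ref}, the string 'myarray[0]'
# is re-parsed here at runtime, producing a BracedVarSub with a bracket_op,
# as if the user had written ${myarray[0]}.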
2131
2132 def LookPastSpace(self):
2133 # type: () -> Id_t
2134 """Look ahead to the next token.
2135
2136 For the CommandParser to recognize
2137 array= (1 2 3)
2138 YSH for ( versus bash for ((
2139 YSH if ( versus if test
2140 YSH while ( versus while test
2141 YSH bare assignment 'grep =' versus 'grep foo'
2142 """
2143 assert self.token_type != Id.Undefined_Tok
2144 if self.cur_token.id == Id.WS_Space:
2145 id_ = self.lexer.LookPastSpace(lex_mode_e.ShCommand)
2146 else:
2147 id_ = self.cur_token.id
2148 return id_
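# Illustrative: in 'array= (1 2 3)', the buffered token after 'array=' is
# WS_Space, so we peek past it with lexer.LookPastSpace(); the Op_LParen
# that comes back tells the CommandParser this looks like a misplaced
# array literal rather than an ordinary word.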
2149
2150 def LookAheadFuncParens(self):
2151 # type: () -> bool
2152 """Special lookahead for f( ) { echo hi; } to check for ( )"""
2153 assert self.token_type != Id.Undefined_Tok
2154
2155 # We have to handle 2 cases because we buffer a token
2156 if self.cur_token.id == Id.Op_LParen: # saw funcname(
2157 return self.lexer.LookAheadFuncParens(1) # go back one char
2158
2159 elif self.cur_token.id == Id.WS_Space: # saw funcname WHITESPACE
2160 return self.lexer.LookAheadFuncParens(0)
2161
2162 else:
2163 return False
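# Illustrative: in 'f() { echo hi; }' the buffered token is Op_LParen, so we
# look back one char; in 'f () { echo hi; }' it's WS_Space, so we look from
# the current position. Both cases should find the '( )' pair.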
2164
2165 def ReadWord(self, word_mode):
2166 # type: (lex_mode_t) -> word_t
2167 """Read the next word, using the given lexer mode.
2168
2169 This is a stateful wrapper for the stateless _ReadWord function.
2170 """
2171 assert word_mode in (lex_mode_e.ShCommand,
2172 lex_mode_e.ShCommandFakeBrack,
2173 lex_mode_e.DBracket, lex_mode_e.BashRegex)
2174
2175 if self.buffered_word: # For integration with pgen2
2176 w = self.buffered_word
2177 self.buffered_word = None
2178 else:
2179 while True:
2180 w = self._ReadWord(word_mode)
2181 if w is not None:
2182 break
2183
2184 self.returned_newline = (word_.CommandId(w) == Id.Op_Newline)
2185 return w
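# A minimal usage sketch, not part of the module: it assumes a WordParser
# instance 'w_parser' built elsewhere (e.g. by ParseContext), and shows the
# read-until-EOF loop a caller like the CommandParser performs.
if 0:
    while True:
        w = w_parser.ReadWord(lex_mode_e.ShCommand)
        if word_.CommandId(w) == Id.Eof_Real:
            break
        # ... each word is fed to the caller's state machine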
2186
2187 def ReadArithWord(self):
2188 # type: () -> word_t
2189 while True:
2190 w = self._ReadArithWord()
2191 if w is not None:
2192 break
2193 return w
2194
2195 def ReadHereDocBody(self, parts):
2196 # type: (List[word_part_t]) -> None
2197 """
2198 A here doc is like a double quoted context, except " isn't special.
2199 """
2200 self._ReadLikeDQ(None, False, parts)
2201 # Returns nothing
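# Illustrative: in a here doc like
#   cat <<EOF
#   $x is expanded, "but quotes are literal"
#   EOF
# $x becomes a SimpleVarSub part, while the double quotes stay plain text.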
2202
2203 def ReadForPlugin(self):
2204 # type: () -> CompoundWord
2205 """For $PS1, $PS4, etc.
2206
2207 This is just like reading a here doc line. "\n" is allowed, as
2208 well as the typical substitutions ${x} $(echo hi) $((1 + 2)).
2209 """
2210 w = CompoundWord([])
2211 self._ReadLikeDQ(None, False, w.parts)
2212 return w
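# Illustrative: a $PS1 value like '${USER} $(hostname) ' parses into one
# CompoundWord whose parts include a BracedVarSub and a CommandSub; literal
# newlines are allowed, per _ReadLikeDQ.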
2213
2214 def EmitDocToken(self, b):
2215 # type: (bool) -> None
2216 self.emit_doc_token = b
2217
2218 def Multiline(self, b):
2219 # type: (bool) -> None
2220 self.multiline = b
2221
2222
2223if 0:
2224 import collections
2225 WORD_HIST = collections.Counter()
2226
2227# vim: sw=4