# Copyright 2016 Andy Chu. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
"""
word_parse.py - Parse the shell word language.

Hairy example:

    hi$((1 + 2))"$(echo hi)"${var:-__"$(echo default)"__}

Substitutions can be nested, but which inner subs are allowed depends on the
outer sub.  Notes:

lex_mode_e.ShCommand (_ReadUnquotedLeftParts)
  All subs and quotes are allowed:
    $v ${v}   $() ``   $(())   '' ""   $'' $""   <() >()

lex_mode_e.DQ (_ReadDoubleQuotedLeftParts)
  Var, Command, Arith, but no quotes.
    $v ${v}   $() ``   $(())
  No process substitution.

lex_mode_e.Arith
  Similar to DQ: Var, Command, and Arith sub, but no process sub.  bash
  doesn't allow quotes, but OSH does.  We allow ALL FOUR kinds of quotes,
  because we need those for associative array indexing.

lex_mode_e.VSub_ArgUnquoted
  Like ShCommand, everything is allowed (even process substitutions), but we
  stop at }, and space is SIGNIFICANT.

  Example: ${a:-  b   }

  ${X:-$v}  ${X:-${v}}  ${X:-$(echo hi)}  ${X:-`echo hi`}  ${X:-$((1+2))}
  ${X:-'single'}  ${X:-"double"}  ${X:-$'\n'}  ${X:-<(echo hi)}

lex_mode_e.VSub_ArgDQ
  In contrast to DQ, VS_ARG_DQ accepts nested "" and $'' and $"", e.g.
  "${x:-"default"}".

  In contrast, VSub_ArgUnquoted respects single quotes and process
  substitution.

  It's weird that double quotes are allowed.  Space is also significant here,
  e.g. "${x:-a  "b"}".
"""

from _devbuild.gen import grammar_nt
from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str, Kind
from _devbuild.gen.types_asdl import (lex_mode_t, lex_mode_e)
from _devbuild.gen.syntax_asdl import (
    BoolParamBox,
    Token,
    SimpleVarSub,
    loc,
    source,
    DoubleQuoted,
    SingleQuoted,
    BracedVarSub,
    CommandSub,
    InitializerWord,
    InitializerWord_t,
    bracket_op,
    bracket_op_t,
    suffix_op,
    suffix_op_t,
    rhs_word,
    rhs_word_e,
    rhs_word_t,
    word_e,
    word_t,
    CompoundWord,
    word_part,
    word_part_t,
    y_lhs_e,
    arith_expr_t,
    command,
    expr,
    expr_e,
    expr_t,
    pat_t,
    ArgList,
    Proc,
    Func,
    Subscript,
    Attribute,
    arith_expr,
)
from core import alloc
from core.error import p_die
from mycpp.mylib import log
from core import pyutil
from display import ui
from frontend import consts
from frontend import lexer
from frontend import reader
from osh import tdop
from osh import arith_parse
from osh import braces
from osh import word_
from osh import word_compile
from mycpp.mylib import tagswitch

from libc import HAVE_FNM_EXTMATCH

from typing import List, Optional, Tuple, cast
from typing import TYPE_CHECKING
if TYPE_CHECKING:
    from frontend.lexer import Lexer
    from frontend.parse_lib import ParseContext
    from frontend.reader import _Reader
    from osh.cmd_parse import VarChecker

unused1 = log
unused2 = Id_str

KINDS_THAT_END_WORDS = [Kind.Eof, Kind.WS, Kind.Op, Kind.Right]


class WordEmitter(object):
    """Common interface for [ and [["""

    def __init__(self):
        # type: () -> None
        """Empty constructor for mycpp."""
        pass

    def ReadWord(self, lex_mode):
        # type: (lex_mode_t) -> word_t
        raise NotImplementedError()


class WordParser(WordEmitter):

    def __init__(self, parse_ctx, lexer, line_reader):
        # type: (ParseContext, Lexer, _Reader) -> None
        self.parse_ctx = parse_ctx
        self.lexer = lexer
        self.line_reader = line_reader
        self.arena = line_reader.arena

        self.parse_opts = parse_ctx.parse_opts
        self.a_parser = tdop.TdopParser(arith_parse.Spec(), self,
                                        self.parse_opts)
        self.Reset()

    def Init(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Used to parse arithmetic, see ParseContext."""
        self.next_lex_mode = lex_mode

    def Reset(self):
        # type: () -> None
        """Called by interactive loop."""
        # For _GetToken()
        self.cur_token = None  # type: Token
        self.token_kind = Kind.Undefined
        self.token_type = Id.Undefined_Tok

        self.next_lex_mode = lex_mode_e.ShCommand

        # Boolean mutated by CommandParser via word_.ctx_EmitDocToken.  For
        # ### doc comments
        self.emit_doc_token = False
        # Boolean mutated by CommandParser via word_.ctx_Multiline.  '...'
        # starts multiline mode.
        self.multiline = False

        # For detecting invalid \n\n in multiline mode.  Counts what we got
        # directly from the lexer.
        self.newline_state = 0
        # For consolidating \n\n -> \n for the CALLER.  This simplifies the
        # parsers that consume words.
        self.returned_newline = False

        # For integration with pgen2
        self.buffered_word = None  # type: word_t

    def _GetToken(self):
        # type: () -> None
        """Call this when you need to make a decision based on any of:

        self.token_type
        self.token_kind
        self.cur_token
        """
        if self.next_lex_mode == lex_mode_e.Undefined:
            return  # _SetNext() not called, so do nothing

        is_fake = self.next_lex_mode == lex_mode_e.BashRegexFakeInner
        real_mode = (lex_mode_e.BashRegex if is_fake else self.next_lex_mode)

        self.cur_token = self.lexer.Read(real_mode)

        # MUTATE TOKEN for fake lexer mode.
        # This is for crazy stuff bash allows, like [[ s =~ (< >) ]]
        if (is_fake and self.cur_token.id
                in (Id.WS_Space, Id.BashRegex_AllowedInParens)):
            self.cur_token.id = Id.Lit_Chars

        self.token_type = self.cur_token.id
        self.token_kind = consts.GetKind(self.token_type)

        # number of consecutive newlines, ignoring whitespace
        if self.token_type == Id.Op_Newline:
            self.newline_state += 1
        elif self.token_kind != Kind.WS:
            self.newline_state = 0

        self.parse_ctx.trail.AppendToken(self.cur_token)  # For completion
        self.next_lex_mode = lex_mode_e.Undefined

    def _SetNext(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Set the next lex state, but don't actually read a token.

        We need this for proper interactive parsing.
        """
        self.next_lex_mode = lex_mode

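    # NOTE on the two-phase token read: _SetNext() only records the desired
    # lexer mode; _GetToken() performs at most one lexer.Read() and then resets
    # next_lex_mode to Undefined, so calling _GetToken() again is a no-op until
    # the next _SetNext().  A typical call site:
    #
    #   self._SetNext(lex_mode_e.VSub_1)  # choose the mode
    #   self._GetToken()                  # consume exactly one token
    #   if self.token_type == Id.VSub_Pound:
    #       ...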
    def _ReadVarOpArg(self, arg_lex_mode):
        # type: (lex_mode_t) -> rhs_word_t

        # NOTE: Operators like | and < are not treated as special, so
        # ${a:- | >} is valid, even when unquoted.
        self._SetNext(arg_lex_mode)
        self._GetToken()

        w = self._ReadVarOpArg2(arg_lex_mode, Id.Undefined_Tok,
                                True)  # empty_ok

        # If the Compound has no parts, and we're in a double-quoted VarSub
        # arg, and empty_ok, then return Empty.  This is so it can evaluate to
        # the empty string and not get elided.
        #
        # Examples:
        # - "${s:-}", "${s/%pat/}"
        # It's similar to LooksLikeShAssignment where we turn x= into x=''.
        # And it has the same potential problem of not having Token location
        # info.
        #
        # NOTE: empty_ok is False only for the PatSub pattern, which means
        # we'll return a Compound with no parts, which is explicitly checked
        # with a custom error message.
        if len(w.parts) == 0 and arg_lex_mode == lex_mode_e.VSub_ArgDQ:
            return rhs_word.Empty

        return w

    def _ReadVarOpArg2(self, arg_lex_mode, eof_type, empty_ok):
        # type: (lex_mode_t, Id_t, bool) -> CompoundWord
        """Return a CompoundWord.

        Helper function for _ReadVarOpArg and used directly by
        _ReadPatSubVarOp.
        """
        w = self._ReadCompoundWord3(arg_lex_mode, eof_type, empty_ok)
        #log('w %s', w)
        tilde = word_.TildeDetect(w)
        if tilde:
            w = tilde
        return w

    def _ReadSliceVarOp(self):
        # type: () -> suffix_op.Slice
        """
        Looking at the token after the first ':'

        ArithExpr? (':' ArithExpr? )? '}'
        """
        self._NextNonSpace()

        cur_id = self.token_type

        if cur_id in (Id.Arith_RBrace, Id.Arith_Colon):  # ${a:} or ${a::}
            begin = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            begin = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()  # advance

        if cur_id == Id.Arith_RBrace:  # ${a:1} or ${@:1}
            # No length specified, so it's N
            no_length = None  # type: Optional[arith_expr_t]
            return suffix_op.Slice(begin, no_length)

        elif cur_id == Id.Arith_Colon:  # ${a:1:} or ${@:1:}
            colon_tok = self.cur_token
            self._NextNonSpace()

            if self.token_type == Id.Arith_RBrace:
                # quirky bash behavior:
                # ${a:1:} or ${a::} means length ZERO
                # but ${a:1} or ${a:} means length N
                if self.parse_opts.strict_parse_slice():
                    p_die(
                        "Slice length: Add explicit zero, or omit : for N (strict_parse_slice)",
                        colon_tok)

                length = arith_expr.EmptyZero  # type: arith_expr_t
            else:
                length = self._ReadArithExpr(Id.Arith_RBrace)

            return suffix_op.Slice(begin, length)

        else:
            p_die("Expected : or } in slice", self.cur_token)

        raise AssertionError()  # for MyPy

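    # Summary of the slice forms handled above:
    #
    #   ${a:1:2}  -> Slice(begin=1, length=2)
    #   ${a:1}    -> Slice(begin=1, length=None), i.e. to the end
    #   ${a::}    -> begin and length both default to EmptyZero
    #   ${a:1:}   -> length ZERO; a parse error under strict_parse_slice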
    def _ReadPatSubVarOp(self):
        # type: () -> suffix_op.PatSub
        """Looking at the first '/' after VarOf:

        VarSub = ...
               | VarOf '/' Match ( '/' WORD? )?
        Match  = '/' WORD   # can't be empty
               | '#' WORD?  # may be empty
               | '%' WORD?
        """
        slash_tok = self.cur_token  # location info
        replace_mode = Id.Undefined_Tok  # bizarre syntax / # %

        self._SetNext(lex_mode_e.VSub_ArgUnquoted)  # advance past /

        self._GetToken()
        if self.token_type == Id.Right_DollarBrace:
            pat = CompoundWord([])
            return suffix_op.PatSub(pat, rhs_word.Empty, replace_mode,
                                    slash_tok)

        if self.token_type in (Id.Lit_Slash, Id.Lit_Pound, Id.Lit_Percent):
            replace_mode = self.token_type
            self._SetNext(lex_mode_e.VSub_ArgUnquoted)

        # Bash quirk:
        # echo ${x/#/replace} has an empty pattern
        # echo ${x////replace} is non-empty; it means echo ${x//'/'/replace}
        empty_ok = replace_mode != Id.Lit_Slash
        pat = self._ReadVarOpArg2(lex_mode_e.VSub_ArgUnquoted, Id.Lit_Slash,
                                  empty_ok)
        #log('pat 1 %r', pat)

        if self.token_type == Id.Lit_Slash:
            # read until }
            replace = self._ReadVarOpArg(
                lex_mode_e.VSub_ArgUnquoted)  # type: rhs_word_t
            #log('r 1 %r', replace)
        else:
            # e.g. ${v/a} is the same as ${v/a/} -- empty replacement string
            replace = rhs_word.Empty

        self._GetToken()
        if self.token_type != Id.Right_DollarBrace:
            # This happens on invalid code
            p_die(
                "Expected } after replacement string, got %s" %
                ui.PrettyId(self.token_type), self.cur_token)

        return suffix_op.PatSub(pat, replace, replace_mode, slash_tok)

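    # For example, following the branches above:
    #
    #   ${x/pat/repl}   -> replace_mode == Id.Undefined_Tok (first match)
    #   ${x//pat/repl}  -> replace_mode == Id.Lit_Slash (all matches)
    #   ${x/#pat/repl}  -> replace_mode == Id.Lit_Pound (anchored at start)
    #   ${x/%pat/repl}  -> replace_mode == Id.Lit_Percent (anchored at end)
    #   ${x/pat}        -> empty replacement, same as ${x/pat/}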
    def _ReadSubscript(self):
        # type: () -> bracket_op_t
        """ Subscript = '[' ('@' | '*' | ArithExpr) ']' """
        # Lookahead to see if we get @ or *.  Otherwise read a full arithmetic
        # expression.
        next_id = self.lexer.LookPastSpace(lex_mode_e.Arith)
        if next_id in (Id.Lit_At, Id.Arith_Star):
            op = bracket_op.WholeArray(next_id)  # type: bracket_op_t

            self._SetNext(lex_mode_e.Arith)  # skip past [
            self._GetToken()
            self._SetNext(lex_mode_e.Arith)  # skip past @
            self._GetToken()
        else:
            self._SetNext(lex_mode_e.Arith)  # skip past [
            anode = self._ReadArithExpr(Id.Arith_RBracket)
            op = bracket_op.ArrayIndex(anode)

        if self.token_type != Id.Arith_RBracket:  # Should be looking at ]
            p_die('Expected ] to close subscript', self.cur_token)

        self._SetNext(lex_mode_e.VSub_2)  # skip past ]
        self._GetToken()  # Needed to be in the same spot as no subscript

        return op

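    # e.g. ${a[@]} and ${a[*]} become bracket_op.WholeArray, while ${a[i+1]}
    # becomes bracket_op.ArrayIndex with a full arith expression; both paths
    # leave the parser positioned past the closing ].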
    def _ParseVarOf(self):
        # type: () -> BracedVarSub
        """
        VarOf = NAME Subscript?
              | NUMBER     # no subscript allowed, none of these are arrays
                           # ${@[1]} doesn't work, even though slicing does
              | VarSymbol
        """
        self._GetToken()
        name_token = self.cur_token
        self._SetNext(lex_mode_e.VSub_2)

        self._GetToken()  # Check for []
        if self.token_type == Id.VOp2_LBracket:
            bracket_op = self._ReadSubscript()
        else:
            bracket_op = None

        part = BracedVarSub.CreateNull()
        part.name_tok = name_token
        part.var_name = lexer.TokenVal(name_token)
        part.bracket_op = bracket_op
        return part

    def _ParseVarExpr(self, arg_lex_mode, allow_query=False):
        # type: (lex_mode_t, bool) -> BracedVarSub
        """Start parsing at the op -- we already skipped past the name."""
        part = self._ParseVarOf()

        self._GetToken()
        if self.token_type == Id.Right_DollarBrace:
            return part  # no ops

        op_kind = self.token_kind

        if op_kind == Kind.VTest:
            tok = self.cur_token
            arg_word = self._ReadVarOpArg(arg_lex_mode)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            part.suffix_op = suffix_op.Unary(tok, arg_word)

        elif op_kind == Kind.VOpYsh:
            tok = self.cur_token
            arg_word = self._ReadVarOpArg(arg_lex_mode)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            UP_arg_word = arg_word
            with tagswitch(arg_word) as case:
                if case(rhs_word_e.Empty):
                    pass
                elif case(rhs_word_e.Compound):
                    arg_word = cast(CompoundWord, UP_arg_word)
                    # This handles ${x|html} and ${x %.3f} now
                    # However I think ${x %.3f} should be statically parsed?
                    # It can enter the printf lexer modes.
                    ok, arg, quoted = word_.StaticEval(arg_word)
                    if not ok or quoted:
                        p_die('Expected a constant argument',
                              loc.Word(arg_word))

            part.suffix_op = suffix_op.Static(tok, arg)

        elif op_kind == Kind.VOp0:
            part.suffix_op = self.cur_token  # Nullary
            self._SetNext(lex_mode_e.VSub_2)  # Expecting }
            self._GetToken()

        elif op_kind == Kind.VOp1:  # % %% # ## etc.
            tok = self.cur_token
            # Weird exception that all shells have: these operators take a
            # glob pattern, so they're lexed as VSub_ArgUnquoted, not
            # VSub_ArgDQ
            arg_word = self._ReadVarOpArg(lex_mode_e.VSub_ArgUnquoted)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            part.suffix_op = suffix_op.Unary(tok, arg_word)

        elif op_kind == Kind.VOp2:  # / : [ ]
            if self.token_type == Id.VOp2_Slash:
                patsub_op = self._ReadPatSubVarOp()  # type: suffix_op_t
                part.suffix_op = patsub_op

                # Checked by the method above
                assert self.token_type == Id.Right_DollarBrace, self.cur_token

            elif self.token_type == Id.VOp2_Colon:
                part.suffix_op = self._ReadSliceVarOp()
                # NOTE: } in arithmetic mode.
                if self.token_type != Id.Arith_RBrace:
                    # Token seems off; doesn't point to X in ${a:1:2 X
                    p_die('Expected } to close ${', self.cur_token)

            else:
                # TODO: Does this ever happen?
                p_die('Unexpected token in ${} (%s)' % 'VOp2', self.cur_token)

        elif op_kind == Kind.VOp3:  # ${prefix@} etc.
            if allow_query:
                part.suffix_op = self.cur_token  # Nullary
                self._SetNext(lex_mode_e.VSub_2)  # Expecting }
                self._GetToken()
            else:
                p_die("Unexpected token in ${} (%s)" % 'VOp3', self.cur_token)

        # NOTE: Arith_RBrace is for slicing, because it reads } in arithmetic
        # mode.  It's redundantly checked above.
        if self.token_type not in (Id.Right_DollarBrace, Id.Arith_RBrace):
            # ${a.} or ${!a.}
            p_die('Expected } to close ${', self.cur_token)

        # Now look for ops
        return part

    def _ReadZshVarSub(self, left_token):
        # type: (Token) -> word_part.ZshVarSub

        self._SetNext(lex_mode_e.VSub_Zsh)  # Move past ${(foo)

        # Can be empty
        w = self._ReadCompoundWord3(lex_mode_e.VSub_Zsh, Id.Right_DollarBrace,
                                    True)
        self._GetToken()
        return word_part.ZshVarSub(left_token, w, self.cur_token)

    def ReadBracedVarSub(self, left_token):
        # type: (Token) -> Tuple[BracedVarSub, Token]
        """ For YSH expressions like var x = ${x:-"default"}. """
        part = self._ReadBracedVarSub(left_token, d_quoted=False)
        last_token = self.cur_token
        return part, last_token

    def _ReadBracedVarSub(self, left_token, d_quoted):
        # type: (Token, bool) -> BracedVarSub
        """For the ${} expression language.

        NAME       = [a-zA-Z_][a-zA-Z0-9_]*
        NUMBER     = [0-9]+                  # ${10}, ${11}, ...

        Subscript  = '[' ('@' | '*' | ArithExpr) ']'
        VarSymbol  = '!' | '@' | '#' | ...
        VarOf      = NAME Subscript?
                   | NUMBER     # no subscript allowed, none of these are
                                # arrays; ${@[1]} doesn't work, even though
                                # slicing does
                   | VarSymbol

        NULLARY_OP = '@Q' | '@E' | '@P' | '@A' | '@a'  # VOp0

        TEST_OP    = '-' | ':-' | '=' | ':=' | '+' | ':+' | '?' | ':?'
        STRIP_OP   = '#' | '##' | '%' | '%%'
        CASE_OP    = ',' | ',,' | '^' | '^^'
        UnaryOp    = TEST_OP | STRIP_OP | CASE_OP

        YSH_UNARY  = '|' | ' '               # ${x|html} and ${x %.3f}.
                                             # SPACE is operator not %
        Match      = ('/' | '#' | '%') WORD  # match all / prefix / suffix
        VarExpr    = VarOf
                   | VarOf NULLARY_OP
                   | VarOf UnaryOp WORD
                   | VarOf YSH_UNARY STATIC_WORD
                   | VarOf ':' ArithExpr (':' ArithExpr )?
                   | VarOf '/' Match '/' WORD

        LengthExpr = '#' VarOf    # can't apply operators after length

        RefOrKeys  = '!' VarExpr  # CAN apply operators after a named ref
                                  # ${!ref[0]} vs ${!keys[@]} resolved later

        PrefixQuery = '!' NAME ('*' | '@')  # list variable names with a prefix

        BuiltinSub = '.' WORD+  # ${.myproc 'builtin' $sub}

        VarSub = LengthExpr
               | RefOrKeys
               | PrefixQuery
               | VarExpr
               | BuiltinSub

        NOTES:
        - Arithmetic expressions are used twice, inside subscripts ${a[x+1]}
          and slicing ${a:x+1:y+2}
        - ${#} and ${!} need LL(2) lookahead (considering how my tokenizer
          works)
        - @ and * are technically arithmetic expressions in this implementation
        - We don't account for bash 4.4: ${param@operator} -- Q E P A a.  Note
          that it's also vectorized.

        Strictness over bash:
        - echo ${a[0][0]} doesn't do anything useful, so we disallow it from
          the grammar
        - ! and # prefixes can't be composed, even though named refs can be
          composed with other operators
        - '#' means 4 different things: length prefix, VarSymbol, UnaryOp to
          strip a prefix, and it can also be a literal part of WORD.

        From the parser's point of view, the prefix # can't be combined with
        UnaryOp/slicing/matching, and the ! can.  However

        - ${a[@]:1:2} is not allowed
        - ${#a[@]:1:2} is allowed, but gives the wrong answer
        """
        if d_quoted:
            arg_lex_mode = lex_mode_e.VSub_ArgDQ
        else:
            arg_lex_mode = lex_mode_e.VSub_ArgUnquoted

        self._SetNext(lex_mode_e.VSub_1)
        self._GetToken()

        ty = self.token_type
        first_tok = self.cur_token

        if ty == Id.VSub_Pound:
            # Disambiguate
            next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
            if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
                # e.g. a name, '#' is the prefix
                self._SetNext(lex_mode_e.VSub_1)
                part = self._ParseVarOf()

                self._GetToken()
                if self.token_type != Id.Right_DollarBrace:
                    p_die('Expected } after length expression',
                          self.cur_token)

                part.prefix_op = first_tok

            else:  # not a prefix, '#' is the variable
                part = self._ParseVarExpr(arg_lex_mode)

        elif ty == Id.VSub_Bang:
            next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
            if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
                # e.g. a name, '!' is the prefix
                # ${!a} -- this is a ref
                # ${!3} -- this is ref
                # ${!a[1]} -- this is a ref
                # ${!a[@]} -- this is a keys
                # No lookahead -- do it in a second step, or at runtime
                self._SetNext(lex_mode_e.VSub_1)
                part = self._ParseVarExpr(arg_lex_mode, allow_query=True)

                part.prefix_op = first_tok

            else:  # not a prefix, '!' is the variable
                part = self._ParseVarExpr(arg_lex_mode)

        elif ty == Id.VSub_Dot:
            # Note: this will become a new builtin_sub type, so this method
            # must return word_part_t rather than BracedVarSub.  I don't think
            # that should cause problems.
            p_die('TODO: ${.myproc builtin sub}', self.cur_token)

        # VS_NAME, VS_NUMBER, symbol that isn't # or !
        elif self.token_kind == Kind.VSub:
            part = self._ParseVarExpr(arg_lex_mode)

        else:
            # e.g. ${^}
            p_die('Unexpected token in ${}', self.cur_token)

        part.left = left_token  # attach the argument
        part.right = self.cur_token
        return part

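    # Lookahead summary for the two ambiguous prefixes handled above:
    #
    #   ${#}     -> '#' is the variable itself
    #   ${#a}    -> '#' is the length prefix: length of $a
    #   ${!x}    -> '!' is the prefix: a named ref through $x
    #   ${!a[@]} -> keys of a; ref vs. keys is resolved in a later stage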
    def _ReadSingleQuoted(self, left_token, lex_mode):
        # type: (Token, lex_mode_t) -> SingleQuoted
        """Internal method to read a word_part."""
        tokens = []  # type: List[Token]
        # In command mode, we never disallow backslashes like '\'
        right_quote = self.ReadSingleQuoted(lex_mode, left_token, tokens,
                                            False)
        sval = word_compile.EvalSingleQuoted(left_token.id, tokens)
        node = SingleQuoted(left_token, sval, right_quote)
        return node

    def ReadSingleQuoted(self, lex_mode, left_token, out_tokens, is_ysh_expr):
        # type: (lex_mode_t, Token, List[Token], bool) -> Token
        """Appends to out_tokens; returns last token

        Used by expr_parse.py
        """
        # TODO: Remove and use out_tokens
        tokens = []  # type: List[Token]

        # echo '\' is allowed, but x = '\' is invalid, in favor of x = r'\'
        no_backslashes = is_ysh_expr and left_token.id == Id.Left_SingleQuote

        expected_end_tokens = 3 if left_token.id in (
            Id.Left_TSingleQuote, Id.Left_RTSingleQuote,
            Id.Left_UTSingleQuote, Id.Left_BTSingleQuote) else 1
        num_end_tokens = 0

        while num_end_tokens < expected_end_tokens:
            self._SetNext(lex_mode)
            self._GetToken()

            # Kind.Char emitted in lex_mode.SQ_C
            if self.token_kind in (Kind.Lit, Kind.Char):
                tok = self.cur_token
                # Happens in lex_mode_e.SQ: 'one\two' is ambiguous, should be
                # r'one\two' or c'one\\two'
                if no_backslashes and lexer.TokenContains(tok, '\\'):
                    p_die(
                        r"Strings with backslashes should look like r'\n' or u'\n' or b'\n'",
                        tok)

                if is_ysh_expr:
                    # Disallow var x = $'\001'.  Arguably we don't need these
                    # checks because u'\u{1}' is the way to write it.
                    if self.token_type == Id.Char_Octal3:
                        p_die(
                            r"Use \xhh or \u{...} instead of octal escapes in YSH strings",
                            tok)

                    if self.token_type == Id.Char_Hex and self.cur_token.length != 4:
                        # disallow \xH
                        p_die(
                            r'Invalid hex escape in YSH string (must be \xHH)',
                            tok)

                tokens.append(tok)

            elif self.token_kind == Kind.Unknown:
                tok = self.cur_token
                assert tok.id == Id.Unknown_Backslash, tok

                # x = $'\z' is disallowed; ditto for echo $'\z' if
                # shopt -u parse_backslash
                if is_ysh_expr or not self.parse_opts.parse_backslash():
                    p_die(
                        "Invalid char escape in C-style string literal (OILS-ERR-11)",
                        tok)

                tokens.append(tok)

            elif self.token_kind == Kind.Eof:
                p_die('Unexpected EOF in single-quoted string that began here',
                      left_token)

            elif self.token_kind == Kind.Right:
                # assume Id.Right_SingleQuote
                num_end_tokens += 1
                tokens.append(self.cur_token)

            else:
                raise AssertionError(self.cur_token)

            if self.token_kind != Kind.Right:
                num_end_tokens = 0  # we need three in a ROW

        if expected_end_tokens == 1:
            tokens.pop()
        elif expected_end_tokens == 3:  # Get rid of spurious end tokens
            tokens.pop()
            tokens.pop()
            tokens.pop()

        # Remove space from '''  r'''  $''' in both expression mode and
        # command mode
        if left_token.id in (Id.Left_TSingleQuote, Id.Left_RTSingleQuote,
                             Id.Left_UTSingleQuote, Id.Left_BTSingleQuote):
            word_compile.RemoveLeadingSpaceSQ(tokens)

        # Validation after lexing - same 2 checks in j8.LexerDecoder
        is_u_string = left_token.id in (Id.Left_USingleQuote,
                                        Id.Left_UTSingleQuote)

        for tok in tokens:
            # u'\yff' is not valid, but b'\yff' is
            if is_u_string and tok.id == Id.Char_YHex:
                p_die(
                    r"%s escapes not allowed in u'' strings" %
                    lexer.TokenVal(tok), tok)

        out_tokens.extend(tokens)
        return self.cur_token

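    # Note on the end-token counting above: a triple-quoted string only
    # terminates after THREE consecutive Right_SingleQuote tokens, so any
    # other token kind resets num_end_tokens to 0.  The closing quote tokens
    # are then popped off, since they aren't part of the string value.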
    def _ReadDoubleQuotedLeftParts(self):
        # type: () -> word_part_t
        """Read substitution parts in a double quoted context."""
        if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick):
            return self._ReadCommandSub(self.token_type, d_quoted=True)

        if self.token_type == Id.Left_DollarBrace:
            return self._ReadBracedVarSub(self.cur_token, d_quoted=True)

        if self.token_type == Id.Left_DollarDParen:
            return self._ReadArithSub()

        if self.token_type == Id.Left_DollarBracket:
            return self._ReadExprSub(lex_mode_e.DQ)

        if self.token_type == Id.Left_DollarBraceZsh:
            return self._ReadZshVarSub(self.cur_token)

        raise AssertionError(self.cur_token)

    def _ReadYshSingleQuoted(self, left_id):
        # type: (Id_t) -> CompoundWord
        """Read YSH style strings

        r'' u'' b''
        r''' ''' u''' ''' b''' '''
        """
        #log('BEF self.cur_token %s', self.cur_token)
        if left_id == Id.Left_RSingleQuote:
            lexer_mode = lex_mode_e.SQ_Raw
            triple_left_id = Id.Left_RTSingleQuote
        elif left_id == Id.Left_USingleQuote:
            lexer_mode = lex_mode_e.J8_Str
            triple_left_id = Id.Left_UTSingleQuote
        elif left_id == Id.Left_BSingleQuote:
            lexer_mode = lex_mode_e.J8_Str
            triple_left_id = Id.Left_BTSingleQuote
        else:
            raise AssertionError(left_id)

        # Needed for syntax checks
        left_tok = self.cur_token
        left_tok.id = left_id

        sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)

        if (len(sq_part.sval) == 0 and self.lexer.ByteLookAhead() == "'"):
            self._SetNext(lex_mode_e.ShCommand)
            self._GetToken()

            assert self.token_type == Id.Left_SingleQuote
            # HACK: magically transform the third ' in u''' to
            # Id.Left_UTSingleQuote, so that ''' is the terminator
            left_tok = self.cur_token
            left_tok.id = triple_left_id

            # Handles stripping leading whitespace
            sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)

        # Advance and validate
        self._SetNext(lex_mode_e.ShCommand)

        self._GetToken()
        if self.token_kind not in KINDS_THAT_END_WORDS:
            p_die('Unexpected token after YSH single-quoted string',
                  self.cur_token)

        return CompoundWord([sq_part])

    def _ReadUnquotedLeftParts(self, triple_out):
        # type: (Optional[BoolParamBox]) -> word_part_t
        """Read substitutions and quoted strings (for lex_mode_e.ShCommand).

        If triple_out is set, then we try parsing triple quoted strings,
        and set its value to True if we got one.
        """
        if self.token_type in (Id.Left_DoubleQuote, Id.Left_DollarDoubleQuote):
            # Note: $"" is a synonym for "".  It might make sense if it added
            # \n \0 \x00 \u{123} etc.  But that's not what bash does!
            dq_part = self._ReadDoubleQuoted(self.cur_token)
            # Got empty word "" and there's a " after
            if (triple_out and len(dq_part.parts) == 0 and
                    self.lexer.ByteLookAhead() == '"'):

                self._SetNext(lex_mode_e.ShCommand)
                self._GetToken()
                # HACK: magically transform the third " in """ to
                # Id.Left_TDoubleQuote, so that """ is the terminator
                left_dq_token = self.cur_token
                left_dq_token.id = Id.Left_TDoubleQuote
                triple_out.b = True  # let caller know we got it
                return self._ReadDoubleQuoted(left_dq_token)

            return dq_part

        if self.token_type in (Id.Left_SingleQuote, Id.Left_RSingleQuote,
                               Id.Left_DollarSingleQuote):
            if self.token_type == Id.Left_SingleQuote:
                lexer_mode = lex_mode_e.SQ_Raw
                triple_left_id = Id.Left_TSingleQuote
            elif self.token_type == Id.Left_RSingleQuote:
                lexer_mode = lex_mode_e.SQ_Raw
                triple_left_id = Id.Left_RTSingleQuote
            else:
                lexer_mode = lex_mode_e.SQ_C
                # there is no such thing as $'''
                triple_left_id = Id.Undefined_Tok

            sq_part = self._ReadSingleQuoted(self.cur_token, lexer_mode)

            # Got empty '' or r'' and there's a ' after
            # u'' and b'' are handled in _ReadYshSingleQuoted
            if (triple_left_id != Id.Undefined_Tok and
                    triple_out is not None and len(sq_part.sval) == 0 and
                    self.lexer.ByteLookAhead() == "'"):

                self._SetNext(lex_mode_e.ShCommand)
                self._GetToken()

                # HACK: magically transform the third ' in ''' to
                # Id.Left_TSingleQuote, so that ''' is the terminator
                left_sq_token = self.cur_token
                left_sq_token.id = triple_left_id

                triple_out.b = True  # let caller know we got it
                return self._ReadSingleQuoted(left_sq_token, lexer_mode)

            return sq_part

        if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick,
                               Id.Left_ProcSubIn, Id.Left_ProcSubOut):
            return self._ReadCommandSub(self.token_type, d_quoted=False)

        if self.token_type == Id.Left_DollarBrace:
            return self._ReadBracedVarSub(self.cur_token, d_quoted=False)

        if self.token_type == Id.Left_DollarDParen:
            return self._ReadArithSub()

        if self.token_type == Id.Left_DollarBracket:
            return self._ReadExprSub(lex_mode_e.ShCommand)

        if self.token_type == Id.Left_DollarBraceZsh:
            return self._ReadZshVarSub(self.cur_token)

        raise AssertionError(self.cur_token)

    def _ReadExtGlob(self):
        # type: () -> word_part.ExtGlob
        """
        Grammar:
          Item    = CompoundWord | EPSILON  # important: @(foo|) is allowed
          LEFT    = '@(' | '*(' | '+(' | '?(' | '!('
          RIGHT   = ')'
          ExtGlob = LEFT (Item '|')* Item RIGHT  # ITEM may be empty
          Compound includes ExtGlob
        """
        left_token = self.cur_token
        right_token = None  # type: Token
        arms = []  # type: List[CompoundWord]

        self.lexer.PushHint(Id.Op_RParen, Id.Right_ExtGlob)
        self._SetNext(lex_mode_e.ExtGlob)  # advance past LEFT

        read_word = False  # did we just read a word?  To handle @(||).

        while True:
            self._GetToken()

            if self.token_type == Id.Right_ExtGlob:
                if not read_word:
                    arms.append(CompoundWord([]))
                right_token = self.cur_token
                break

            elif self.token_type == Id.Op_Pipe:
                if not read_word:
                    arms.append(CompoundWord([]))
                read_word = False
                self._SetNext(lex_mode_e.ExtGlob)

            # lex_mode_e.ExtGlob should only produce these 4 kinds of tokens
            elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub,
                                     Kind.ExtGlob):
                w = self._ReadCompoundWord(lex_mode_e.ExtGlob)
                arms.append(w)
                read_word = True

            elif self.token_kind == Kind.Eof:
                p_die('Unexpected EOF reading extended glob that began here',
                      left_token)

            else:
                raise AssertionError(self.cur_token)

        return word_part.ExtGlob(left_token, arms, right_token)

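    # e.g. @(foo|bar) parses to two arms, and @(foo|) also to two arms where
    # the second is an empty CompoundWord -- that's what read_word tracks.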
    def _ReadBashRegexGroup(self):
        # type: () -> word_part.BashRegexGroup
        """
        Grammar:
          BashRegexGroup = '(' WORD? ')'
        """
        left_token = self.cur_token
        assert left_token.id == Id.BashRegex_LParen, left_token

        arms = []  # type: List[CompoundWord]

        self.lexer.PushHint(Id.Op_RParen, Id.Right_BashRegexGroup)
        self._SetNext(lex_mode_e.BashRegexFakeInner)  # advance past LEFT

        self._GetToken()
        if self.token_type == Id.Right_BashRegexGroup:  # empty ()
            return word_part.BashRegexGroup(left_token, None, self.cur_token)

        # lex_mode_e.BashRegex should only produce these 4 kinds of tokens
        if self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub, Kind.BashRegex):
            # Fake lexer mode that translates Id.WS_Space to Id.Lit_Chars
            # To allow bash style [[ s =~ (a b) ]]
            w = self._ReadCompoundWord(lex_mode_e.BashRegexFakeInner)
            arms.append(w)

            self._GetToken()
            if self.token_type != Id.Right_BashRegexGroup:
                p_die('Expected ) to close bash regex group', self.cur_token)

            return word_part.BashRegexGroup(left_token, w, self.cur_token)

        p_die('Expected word after ( opening bash regex group', self.cur_token)

    def _ReadLikeDQ(self, left_token, is_ysh_expr, out_parts):
        # type: (Optional[Token], bool, List[word_part_t]) -> None
        """
        Args:
          left_token: A token if we are reading a double quoted part, or None
            if we're reading a here doc.
          is_ysh_expr: Whether to disallow backticks and invalid char escapes
          out_parts: list of word_part to append to
        """
        if left_token:
            if left_token.id in (Id.Left_TDoubleQuote,
                                 Id.Left_DollarTDoubleQuote):
                expected_end_tokens = 3
            else:
                expected_end_tokens = 1
        else:
            expected_end_tokens = 1000  # here doc will break

        num_end_tokens = 0
        while num_end_tokens < expected_end_tokens:
            self._SetNext(lex_mode_e.DQ)
            self._GetToken()

            if self.token_kind == Kind.Lit:
                if self.token_type == Id.Lit_EscapedChar:
                    tok = self.cur_token
                    ch = lexer.TokenSliceLeft(tok, 1)
                    part = word_part.EscapedLiteral(tok,
                                                    ch)  # type: word_part_t
                else:
                    if self.token_type == Id.Lit_BadBackslash:
                        # echo "\z" is OK in shell, but x = "\z" is a syntax
                        # error in YSH.
                        # Slight hole: We don't catch x = ${undef:-"\z"}
                        # because of the recursion (unless parse_backslash)
                        if (is_ysh_expr or
                                not self.parse_opts.parse_backslash()):
                            p_die(
                                "Invalid char escape in double quoted string (OILS-ERR-12)",
                                self.cur_token)
                    elif self.token_type == Id.Lit_Dollar:
                        if is_ysh_expr or not self.parse_opts.parse_dollar():
                            p_die("Literal $ should be quoted like \$",
                                  self.cur_token)

                    part = self.cur_token
                out_parts.append(part)

            elif self.token_kind == Kind.Left:
                if self.token_type == Id.Left_Backtick and is_ysh_expr:
                    p_die("Invalid backtick: use $(cmd) or \\` in YSH strings",
                          self.cur_token)

                part = self._ReadDoubleQuotedLeftParts()
                out_parts.append(part)

            elif self.token_kind == Kind.VSub:
                tok = self.cur_token
                part = SimpleVarSub(tok)
                out_parts.append(part)
                # NOTE: parsing "$f(x)" would BREAK CODE.  Could add support
                # for it later.

            elif self.token_kind == Kind.Right:
                assert self.token_type == Id.Right_DoubleQuote, self.token_type
                if left_token:
                    num_end_tokens += 1

                # In a here doc, the right quote is literal!
                out_parts.append(self.cur_token)

            elif self.token_kind == Kind.Eof:
                if left_token:
                    p_die(
                        'Unexpected EOF reading double-quoted string that began here',
                        left_token)
                else:  # here docs will have an EOF in their token stream
                    break

            else:
                raise AssertionError(self.cur_token)

            if self.token_kind != Kind.Right:
                num_end_tokens = 0  # """ must be CONSECUTIVE

        if expected_end_tokens == 1:
            out_parts.pop()
        elif expected_end_tokens == 3:
            out_parts.pop()
            out_parts.pop()
            out_parts.pop()

        # Remove space from """ in both expression mode and command mode
        if (left_token and left_token.id
                in (Id.Left_TDoubleQuote, Id.Left_DollarTDoubleQuote)):
            word_compile.RemoveLeadingSpaceDQ(out_parts)

        # Return nothing, since we appended to 'out_parts'

    def _ReadDoubleQuoted(self, left_token):
        # type: (Token) -> DoubleQuoted
        """Helper function for "hello $name".

        Also used for ${foo%%a b c} -- the operator argument is treated as
        double quoted until you hit }.
        """
        parts = []  # type: List[word_part_t]
        self._ReadLikeDQ(left_token, False, parts)

        right_quote = self.cur_token
        return DoubleQuoted(left_token, parts, right_quote)

    def ReadDoubleQuoted(self, left_token, parts):
        # type: (Token, List[word_part_t]) -> Token
        """For expression mode.

        Read var x = "${dir:-}/$name"; etc.
        """
        self._ReadLikeDQ(left_token, True, parts)
        return self.cur_token

    def _ReadCommandSub(self, left_id, d_quoted=False):
        # type: (Id_t, bool) -> CommandSub
        """
        NOTE: This is not in the grammar, because word parts aren't in the
        grammar!

        command_sub = '$(' command_list ')'
                    | '@(' command_list ')'
                    | '<(' command_list ')'
                    | '>(' command_list ')'
                    | ` command_list `
        """
        left_token = self.cur_token

        # Set the lexer in a state so ) becomes the EOF token.
        if left_id in (Id.Left_DollarParen, Id.Left_AtParen,
                       Id.Left_ProcSubIn, Id.Left_ProcSubOut):
            self._SetNext(lex_mode_e.ShCommand)  # advance past $( etc.

            right_id = Id.Eof_RParen
            self.lexer.PushHint(Id.Op_RParen, right_id)
            c_parser = self.parse_ctx.MakeParserForCommandSub(
                self.line_reader, self.lexer, right_id)
            # NOTE: This doesn't use something like main_loop because we don't
            # want to interleave parsing and execution!  Unlike 'source' and
            # 'eval'.
            node = c_parser.ParseCommandSub()

            right_token = c_parser.w_parser.cur_token

        elif left_id == Id.Left_Backtick and self.parse_ctx.do_lossless:
            # NOTE: This is an APPROXIMATE solution for translation ONLY.  See
            # test/osh2oil.

            right_id = Id.Eof_Backtick
            self.lexer.PushHint(Id.Left_Backtick, right_id)
            c_parser = self.parse_ctx.MakeParserForCommandSub(
                self.line_reader, self.lexer, right_id)
            node = c_parser.ParseCommandSub()
            right_token = c_parser.w_parser.cur_token

        elif left_id == Id.Left_Backtick:
            if not self.parse_opts.parse_backticks():
                p_die('Use $(cmd) instead of backticks (parse_backticks)',
                      left_token)

            self._SetNext(lex_mode_e.Backtick)  # advance past `

            parts = []  # type: List[str]
            while True:
                self._GetToken()
                #log("TOK %s", self.cur_token)

                if self.token_type == Id.Backtick_Quoted:
                    # Remove leading \
                    parts.append(lexer.TokenSliceLeft(self.cur_token, 1))

                elif self.token_type == Id.Backtick_DoubleQuote:
                    # Compatibility: If backticks are double quoted, then
                    # double quotes within them have to be \"
                    # Shells aren't smart enough to match nested " and `
                    # quotes (but OSH is)
                    if d_quoted:
                        # Remove leading \
                        parts.append(lexer.TokenSliceLeft(self.cur_token, 1))
                    else:
                        parts.append(lexer.TokenVal(self.cur_token))

                elif self.token_type == Id.Backtick_Other:
                    parts.append(lexer.TokenVal(self.cur_token))

                elif self.token_type == Id.Backtick_Right:
                    break

                elif self.token_type == Id.Eof_Real:
                    # Note: this parse error is in the ORIGINAL context.  No
                    # code_str yet.
                    p_die('Unexpected EOF while looking for closing backtick',
                          left_token)

                else:
                    raise AssertionError(self.cur_token)

                self._SetNext(lex_mode_e.Backtick)

            # Calculate right SPID on CommandSub BEFORE re-parsing.
            right_token = self.cur_token

            code_str = ''.join(parts)
            #log('code %r', code_str)

            # NOTE: This is similar to how we parse aliases in
            # osh/cmd_parse.py.  It won't have the same location info as
            # MakeParserForCommandSub(), because the lexer is different.
            arena = self.parse_ctx.arena
            #arena = alloc.Arena()
            line_reader = reader.StringLineReader(code_str, arena)
            c_parser = self.parse_ctx.MakeOshParser(line_reader)
            src = source.Reparsed('backticks', left_token, right_token)
            with alloc.ctx_SourceCode(arena, src):
                node = c_parser.ParseCommandSub()

        else:
            raise AssertionError(left_id)

        return CommandSub(left_token, node, right_token)

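    # Note the two strategies above: $( ... ) is parsed in place by a
    # sub-parser that treats the closing ) as EOF, while legacy ` ... ` is
    # first collected into a plain string (unescaping \` and \") and then
    # re-parsed, which is why its locations point at
    # source.Reparsed('backticks', ...).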
    def _ReadExprSub(self, lex_mode):
        # type: (lex_mode_t) -> word_part.ExprSub
        """$[d->key]  $[obj.method()]  etc."""
        left_token = self.cur_token

        self._SetNext(lex_mode_e.Expr)
        enode, right_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.ysh_expr_sub)

        self._SetNext(lex_mode)  # Move past ]
        return word_part.ExprSub(left_token, enode, right_token)

    def ParseVarDecl(self, kw_token):
        # type: (Token) -> command.VarDecl
        """
        oil_var_decl: name_type_list '=' testlist end_stmt

        Note that assignments must end with \n  ;  }  or EOF.  Unlike shell
        assignments, we disallow:

        var x = 42 | wc -l
        var x = 42 && echo hi
        """
        self._SetNext(lex_mode_e.Expr)
        enode, last_token = self.parse_ctx.ParseVarDecl(kw_token, self.lexer)
        # Hack to move } from what the Expr lexer modes gives to what
        # CommandParser wants
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace

        # Let the CommandParser see the Op_Semi or Op_Newline.
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)  # always back to this
        return enode

    def ParseMutation(self, kw_token, var_checker):
        # type: (Token, VarChecker) -> command.Mutation
        """
        setvar i = 42
        setvar i += 1
        setvar a[i] = 42
        setvar a[i] += 1
        setvar d.key = 42
        setvar d.key += 1
        """
        self._SetNext(lex_mode_e.Expr)
        enode, last_token = self.parse_ctx.ParseMutation(kw_token, self.lexer)
        # Hack to move } from what the Expr lexer modes gives to what
        # CommandParser wants
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace

        for lhs in enode.lhs:
            UP_lhs = lhs
            with tagswitch(lhs) as case:
                if case(y_lhs_e.Var):
                    lhs = cast(Token, UP_lhs)
                    var_checker.Check(kw_token.id, lexer.LazyStr(lhs), lhs)

                # Note: this does not cover cases like
                #   setvar (a[0])[1] = v
                #   setvar (d.key).other = v
                # This leaks into catching all typos statically, which may be
                # possible if 'use' makes all names explicit.
                elif case(y_lhs_e.Subscript):
                    lhs = cast(Subscript, UP_lhs)
                    if lhs.obj.tag() == expr_e.Var:
                        v = cast(expr.Var, lhs.obj)
                        var_checker.Check(kw_token.id, v.name, v.left)

                elif case(y_lhs_e.Attribute):
                    lhs = cast(Attribute, UP_lhs)
                    if lhs.obj.tag() == expr_e.Var:
                        v = cast(expr.Var, lhs.obj)
                        var_checker.Check(kw_token.id, v.name, v.left)

        # Let the CommandParser see the Op_Semi or Op_Newline.
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)  # always back to this
        return enode

    def ParseBareDecl(self):
        # type: () -> expr_t
        """
        x = {name: val}
        """
        self._SetNext(lex_mode_e.Expr)
        self._GetToken()
        enode, last_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.command_expr)
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)
        return enode

    def ParseYshExprForCommand(self):
        # type: () -> expr_t

        # Fudge for this case
        #  for x in(y) {
        # versus
        #  for x in (y) {
        #
        # In the former case, ReadWord on 'in' puts the lexer past (.
        # Also see LookPastSpace in CommandParser.
        # A simpler solution would be nicer.

        if self.token_type == Id.Op_LParen:
            self.lexer.MaybeUnreadOne()

        enode, _ = self.parse_ctx.ParseYshExpr(self.lexer, grammar_nt.ysh_expr)

        self._SetNext(lex_mode_e.ShCommand)
        return enode

    def ParseCommandExpr(self):
        # type: () -> expr_t
        """
        = 1+2
        """
        enode, last_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.command_expr)

        # In some cases, such as the case statement, we expect *the lexer* to
        # be pointing at the token right after the expression.  But the
        # expression parser must have read to the `last_token`.  Unreading
        # places the lexer back in the expected state.  Ie:
        #
        # case (x) {                           case (x) {
        #   (else) { = x }                       (else) { = x }
        #                ^ The lexer is here            ^ Unread to here
        # }                                    }
        assert last_token.id in (Id.Op_Newline, Id.Eof_Real, Id.Op_Semi,
                                 Id.Op_RBrace), last_token
        if last_token.id != Id.Eof_Real:
            # Eof_Real is the only token we cannot unread
            self.lexer.MaybeUnreadOne()

        return enode

    def ParseProc(self, node):
        # type: (Proc) -> None

        # proc name-with-hyphens() must be accepted
        self._SetNext(lex_mode_e.ShCommand)
        self._GetToken()
        # example: 'proc f[' gets you Lit_ArrayLhsOpen
        if self.token_type != Id.Lit_Chars:
            p_die('Invalid proc name %s' % ui.PrettyToken(self.cur_token),
                  self.cur_token)

        # TODO: validate this more.  Disallow proc 123 { }, which isn't
        # disallowed for shell functions.  Similar to IsValidVarName().
        node.name = self.cur_token

        last_token = self.parse_ctx.ParseProc(self.lexer, node)

        # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
        assert last_token.id == Id.Op_LBrace
        last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        self._SetNext(lex_mode_e.ShCommand)

    def ParseFunc(self, node):
        # type: (Func) -> None
        last_token = self.parse_ctx.ParseFunc(self.lexer, node)

        # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
        assert last_token.id == Id.Op_LBrace
        last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        self._SetNext(lex_mode_e.ShCommand)

    def ParseYshCasePattern(self):
        # type: () -> Tuple[pat_t, Token]
        pat, left_tok, last_token = self.parse_ctx.ParseYshCasePattern(
            self.lexer)

        if last_token.id == Id.Op_LBrace:
            last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        return pat, left_tok

    def NewlineOkForYshCase(self):
        # type: () -> Id_t
        """Check for optional newline and consume it.

        This is a special case of `_NewlineOk` which fixes some "off-by-one"
        issues that crop up while parsing YSH case arms.  For more details,
        see #oil-dev > Progress On YSH Case Grammar on zulip.

        Returns a token id which is filled with the choice of

            word { echo word }
            (3)  { echo expr }
            /e/  { echo eggex }
            }    # right brace
        """
        while True:
            next_id = self.lexer.LookAheadOne(lex_mode_e.Expr)

            # Cannot lookahead past lines
            if next_id == Id.Unknown_Tok:
                if not self.lexer.MoveToNextLine():  # Try to move to next line
                    break  # EOF
                continue

            next_kind = consts.GetKind(next_id)
            if next_id != Id.Op_Newline and next_kind != Kind.Ignored:
                break

            self.lexer.Read(lex_mode_e.Expr)

        if next_id in (Id.Op_RBrace, Id.Op_LParen, Id.Arith_Slash):
            self._SetNext(lex_mode_e.Expr)  # Continue in expression mode
        else:
            # Consume the trailing Op_Newline
            self._SetNext(lex_mode_e.ShCommand)
            self._GetToken()

        return next_id

    def _ReadArithExpr(self, end_id):
        # type: (Id_t) -> arith_expr_t
        """Read and parse an arithmetic expression in various contexts.

        $(( 1+2 ))
        (( a=1+2 ))
        ${a[ 1+2 ]}
        ${a : 1+2 : 1+2}

        See tests/arith-context.test.sh for ambiguous cases.

        ${a[a[0]]} is valid  # VS_RBRACKET vs Id.Arith_RBracket

        ${s : a<b?0:1 : 1}  # VS_COLON vs Id.Arith_Colon

        See the assertion in ArithParser.Parse() -- unexpected extra input.
        """
        # calls self.ReadWord(lex_mode_e.Arith)
        anode = self.a_parser.Parse()
        cur_id = self.a_parser.CurrentId()
        if end_id != Id.Undefined_Tok and cur_id != end_id:
            p_die(
                'Unexpected token after arithmetic expression (%s != %s)' %
                (ui.PrettyId(cur_id), ui.PrettyId(end_id)),
                loc.Word(self.a_parser.cur_word))
        return anode

    def _ReadArithSub(self):
        # type: () -> word_part.ArithSub
        """Read an arith substitution, which contains an arith expression, e.g.

        $((a + 1)).
        """
        left_tok = self.cur_token

        # The second ) needs to be disambiguated in stuff like:
        #   $(echo $(( 1+2 )) )
        self.lexer.PushHint(Id.Op_RParen, Id.Right_DollarDParen)

        # NOTE: To disambiguate $(( as arith sub vs. command sub and subshell,
        # we could save the lexer/reader state here, and retry if the
        # arithmetic parse fails.  But we can almost always catch this at
        # parse time.  There could be some exceptions like:
        #   $((echo * foo))  # looks like multiplication
        #   $((echo / foo))  # looks like division

        # $(( )) is valid
        anode = arith_expr.EmptyZero  # type: arith_expr_t

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            anode = self._ReadArithExpr(Id.Arith_RParen)

        self._SetNext(lex_mode_e.ShCommand)

        # Ensure we get closing )
        self._GetToken()
        if self.token_type != Id.Right_DollarDParen:
            p_die('Expected second ) to end arith sub', self.cur_token)

        right_tok = self.cur_token
        return word_part.ArithSub(left_tok, anode, right_tok)

    def ReadDParen(self):
        # type: () -> Tuple[arith_expr_t, Token]
        """Read ((1+ 2))  -- command context.

        We're using the word parser because it's very similar to
        _ReadArithExpr above.

        This also returns the terminating Id.Op_DRightParen token for location
        info.
        """
        # (( )) is valid
        anode = arith_expr.EmptyZero  # type: arith_expr_t

        self.lexer.PushHint(Id.Op_RParen, Id.Op_DRightParen)

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            anode = self._ReadArithExpr(Id.Arith_RParen)

        self._SetNext(lex_mode_e.ShCommand)

        # Ensure we get the second )
        self._GetToken()
        right = self.cur_token
        if right.id != Id.Op_DRightParen:
            p_die('Expected second ) to end arith statement', right)

        self._SetNext(lex_mode_e.ShCommand)

        return anode, right

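    # In both $(( )) and (( )), the FIRST ) is read in arith mode as
    # Arith_RParen, and lexer.PushHint() makes the SECOND ) lex as the closing
    # token (Right_DollarDParen or Op_DRightParen) instead of plain Op_RParen.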
    def _NextNonSpace(self):
        # type: () -> None
        """Advance in lex_mode_e.Arith until non-space token.

        Same logic as _ReadWord, but used in
           $(( ))
           (( ))
           for (( ))

        You can read self.token_type after this, without calling _GetToken.
        """
        while True:
            self._SetNext(lex_mode_e.Arith)
            self._GetToken()
            if self.token_kind not in (Kind.Ignored, Kind.WS):
                break

    def ReadForExpression(self):
        # type: () -> command.ForExpr
        """Read ((i=0; i<5; ++i)) -- part of command context."""
        self._NextNonSpace()  # skip over ((
        cur_id = self.token_type  # for end of arith expressions

        if cur_id == Id.Arith_Semi:  # for (( ; i < 10; i++ ))
            init_node = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            init_node = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()
        self._NextNonSpace()

        # It's odd to keep track of both cur_id and self.token_type in this
        # function, but it works, and is tested in 'test/parse_error.sh
        # arith-integration'
        if cur_id != Id.Arith_Semi:  # for (( x=0 b; ... ))
            p_die("Expected ; here", loc.Word(self.a_parser.cur_word))

        self._GetToken()
        cur_id = self.token_type

        if cur_id == Id.Arith_Semi:  # for (( ; ; i++ ))
            # empty condition is TRUE
            cond_node = arith_expr.EmptyOne  # type: arith_expr_t
        else:
            cond_node = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()

        if cur_id != Id.Arith_Semi:  # for (( x=0; x<5 b ))
            p_die("Expected ; here", loc.Word(self.a_parser.cur_word))

        self._NextNonSpace()
        if self.token_type == Id.Arith_RParen:  # for (( ; ; ))
            update_node = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            update_node = self._ReadArithExpr(Id.Arith_RParen)

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            p_die('Expected ) to end for loop expression', self.cur_token)
        self._SetNext(lex_mode_e.ShCommand)

        # redirects is None, will be assigned in CommandEvaluator
        node = command.ForExpr.CreateNull()
        node.init = init_node
        node.cond = cond_node
        node.update = update_node
        return node

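    # e.g. for (( i=0; i<5; ++i )) fills init/cond/update.  Each section may
    # be empty: an empty condition defaults to EmptyOne (i.e. true), while
    # empty init/update sections default to EmptyZero.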
    def _ReadArrayLiteral(self):
        # type: () -> word_part_t
        """a=(1 2 3)

        TODO: See osh/cmd_parse.py:164 for Id.Lit_ArrayLhsOpen, for a[x++]=1

        We want:

        A=(['x']=1 ["x"]=2 [$x$y]=3)

        Maybe allow this as a literal string?  Because I think I've seen it
        before?  Or maybe force people to patch to learn the rule.

        A=([x]=4)

        Starts with Lit_Other '[', and then it has Lit_ArrayLhsClose
        Maybe enforce that ALL have keys or NONE have keys.
        """
        self._SetNext(lex_mode_e.ShCommand)  # advance past (
        self._GetToken()
        if self.cur_token.id != Id.Op_LParen:
            p_die('Expected ( after =', self.cur_token)
        left_token = self.cur_token
        right_token = None  # type: Token

        # MUST use a new word parser (with same lexer).
        w_parser = self.parse_ctx.MakeWordParser(self.lexer, self.line_reader)
        words = []  # type: List[CompoundWord]
        done = False
        while not done:
            w = w_parser.ReadWord(lex_mode_e.ShCommand)
            with tagswitch(w) as case:
                if case(word_e.Operator):
                    tok = cast(Token, w)
                    if tok.id == Id.Right_Initializer:
                        right_token = tok
                        done = True  # can't use break here
                    # Unlike command parsing, array parsing allows embedded \n.
                    elif tok.id == Id.Op_Newline:
                        continue
                    else:
                        p_die('Unexpected token in array literal', loc.Word(w))

                elif case(word_e.Compound):
                    words.append(cast(CompoundWord, w))

                else:
                    raise AssertionError()

        initializer_words = []  # type: List[InitializerWord_t]
        for w in words:
            pair = word_.DetectAssocPair(w)
            if pair is not None:
                word_.TildeDetectAssign(pair.value)  # pair.value is modified
                initializer_words.append(pair)
            else:
                w2 = braces.BraceDetect(w)  # type: word_t
                if w2 is None:
                    w2 = w
                w3 = word_.TildeDetect(w2)  # type: word_t
                if w3 is None:
                    w3 = w2
                initializer_words.append(InitializerWord.ArrayWord(w3))

        # invariant List?
        return word_part.InitializerLiteral(left_token, initializer_words,
                                            right_token)

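    # e.g. a=(1 2 3) yields three ArrayWord items, while A=([x]=1 [y]=2)
    # yields assoc pairs detected by word_.DetectAssocPair; brace and tilde
    # detection are applied to each word without a key.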
    def ParseProcCallArgs(self, start_symbol):
        # type: (int) -> ArgList
        """ json write (x) """
        self.lexer.MaybeUnreadOne()

        arg_list = ArgList.CreateNull(alloc_lists=True)
        arg_list.left = self.cur_token
        self.parse_ctx.ParseProcCallArgs(self.lexer, arg_list, start_symbol)
        return arg_list

    def _MaybeReadWordPart(self, is_first, lex_mode, parts):
        # type: (bool, lex_mode_t, List[word_part_t]) -> bool
        """Helper for _ReadCompoundWord3."""
        done = False

        if self.token_type == Id.Lit_EscapedChar:
            tok = self.cur_token
            assert tok.length == 2
            ch = lexer.TokenSliceLeft(tok, 1)
            if not self.parse_opts.parse_backslash():
                if not pyutil.IsValidCharEscape(ch):
                    p_die('Invalid char escape in unquoted word (OILS-ERR-13)',
                          self.cur_token)

            part = word_part.EscapedLiteral(self.cur_token,
                                            ch)  # type: word_part_t
        else:
            part = self.cur_token

        if is_first and self.token_type == Id.Lit_VarLike:  # foo=
            parts.append(part)
            # Unfortunately it's awkward to pull the check for a=(1 2) up to
            # _ReadWord.
            next_id = self.lexer.LookPastSpace(lex_mode)
            if next_id == Id.Op_LParen:
                self.lexer.PushHint(Id.Op_RParen, Id.Right_Initializer)
                part2 = self._ReadArrayLiteral()
                parts.append(part2)

                # An array literal must be the last part of the word.
                self._SetNext(lex_mode)
                self._GetToken()
                # EOF, whitespace, newline, Right_Subshell
                if self.token_kind not in KINDS_THAT_END_WORDS:
                    p_die('Unexpected token after array literal',
                          self.cur_token)
                done = True

        elif (is_first and self.parse_opts.parse_at() and
              self.token_type == Id.Lit_Splice):

            splice_tok = self.cur_token
            part2 = word_part.Splice(splice_tok,
                                     lexer.TokenSliceLeft(splice_tok, 1))

            parts.append(part2)

            # @words must be the last part of the word
            self._SetNext(lex_mode)
            self._GetToken()
            # EOF, whitespace, newline, Right_Subshell
            if self.token_kind not in KINDS_THAT_END_WORDS:
                p_die('Unexpected token after array splice', self.cur_token)
            done = True

        elif (is_first and self.parse_opts.parse_at() and
              self.token_type == Id.Lit_AtLBracket):  # @[split(x)]
            part2 = self._ReadExprSub(lex_mode_e.DQ)
            parts.append(part2)

            # @[split(x)] must also end the word
            self._SetNext(lex_mode)
            self._GetToken()
            # EOF, whitespace, newline, Right_Subshell
            if self.token_kind not in KINDS_THAT_END_WORDS:
                p_die('Unexpected token after Expr splice', self.cur_token)
            done = True

        elif (is_first and self.parse_opts.parse_at() and
              self.token_type == Id.Lit_AtLBraceDot):
            p_die('TODO: @{.myproc builtin sub}', self.cur_token)

        elif (is_first and self.parse_opts.parse_at_all() and
              self.token_type == Id.Lit_At):
            # Because $[x] ${x} and perhaps $/x/ are reserved, it makes sense
            # for @ at the beginning of a word to be reserved.

            # Although should we relax 'echo @'?  I'm tempted to have a
            # shortcut for @_argv.
            p_die('Literal @ starting a word must be quoted (parse_at_all)',
                  self.cur_token)

        else:
            # not a literal with lookahead; append it
            parts.append(part)

        return done

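    # First-token special cases handled above (illustrative):
    #
    #   a=(1 2 3)      # Lit_VarLike then ( starts an array literal
    #   @words         # Lit_Splice: splice an array variable (parse_at)
    #   @[split(x)]    # Lit_AtLBracket: expression splice (parse_at)
    #
    # Each must end the word: 'a=(1 2)x' and '@words.x' are parse errors.
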
    def _ReadCompoundWord(self, lex_mode):
        # type: (lex_mode_t) -> CompoundWord
        return self._ReadCompoundWord3(lex_mode, Id.Undefined_Tok, True)

    def _ReadCompoundWord3(self, lex_mode, eof_type, empty_ok):
        # type: (lex_mode_t, Id_t, bool) -> CompoundWord
        """
        Precondition: Looking at the first token of the first word part
        Postcondition: Looking at the token after, e.g. space or operator

        NOTE: eof_type is necessary because / is a literal, i.e. Lit_Slash,
        but it could be an operator delimiting a compound word.  Can we change
        lexer modes and remove this special case?
        """
        w = CompoundWord([])
        num_parts = 0
        brace_count = 0
        done = False
        is_triple_quoted = None  # type: Optional[BoolParamBox]

        while not done:
            self._GetToken()

            allow_done = empty_ok or num_parts != 0
            if allow_done and self.token_type == eof_type:
                done = True  # e.g. for ${foo//pat/replace}

            # Keywords like "for" are treated like literals
            elif self.token_kind in (Kind.Lit, Kind.History, Kind.KW,
                                     Kind.ControlFlow, Kind.BoolUnary,
                                     Kind.BoolBinary):

                # Count { and } so the unbalanced-brace check below can fire
                if self.token_type == Id.Lit_LBrace:
                    brace_count += 1
                elif self.token_type == Id.Lit_RBrace:
                    brace_count -= 1
                elif self.token_type == Id.Lit_Dollar:
                    if not self.parse_opts.parse_dollar():
                        if num_parts == 0 and lex_mode == lex_mode_e.ShCommand:
                            next_byte = self.lexer.ByteLookAhead()
                            # TODO: switch lexer modes and parse $/d+/.  But not ${a:-$/d+/}
                            if next_byte == '/':
                                #log('next_byte %r', next_byte)
                                pass

                        p_die('Literal $ should be quoted like \$',
                              self.cur_token)

                done = self._MaybeReadWordPart(num_parts == 0, lex_mode,
                                               w.parts)

            elif self.token_kind == Kind.VSub:
                vsub_token = self.cur_token

                part = SimpleVarSub(vsub_token)  # type: word_part_t
                w.parts.append(part)

            elif self.token_kind == Kind.ExtGlob:
                # If parse_at, we can take over @( to start @(seq 3)
                # Users can also use ,(*.py|*.sh)
                if (self.parse_opts.parse_at() and
                        self.token_type == Id.ExtGlob_At and num_parts == 0):
                    cs_part = self._ReadCommandSub(Id.Left_AtParen,
                                                   d_quoted=False)
                    # RARE mutation of tok.id!
                    cs_part.left_token.id = Id.Left_AtParen
                    part = cs_part  # for type safety

                    # Same check as _MaybeReadWordPart.  @(seq 3)x is illegal,
                    # just like a=(one two)x and @arrayfunc(3)x.
                    self._GetToken()
                    if self.token_kind not in KINDS_THAT_END_WORDS:
                        p_die('Unexpected token after @()', self.cur_token)
                    done = True

                else:
                    if HAVE_FNM_EXTMATCH == 0:
                        p_die(
                            "Extended glob won't work without FNM_EXTMATCH support in libc",
                            self.cur_token)
                    part = self._ReadExtGlob()
                w.parts.append(part)

            elif self.token_kind == Kind.BashRegex:
                if self.token_type == Id.BashRegex_LParen:  # Opening (
                    part = self._ReadBashRegexGroup()
                    w.parts.append(part)
                else:
                    assert self.token_type == Id.BashRegex_AllowedInParens
                    p_die('Invalid token in bash regex', self.cur_token)

            elif self.token_kind == Kind.Left:
                try_triple_quote = (self.parse_opts.parse_triple_quote() and
                                    lex_mode == lex_mode_e.ShCommand and
                                    num_parts == 0)

                # Save an allocation
                if try_triple_quote:
                    is_triple_quoted = BoolParamBox(False)

                part = self._ReadUnquotedLeftParts(is_triple_quoted)
                w.parts.append(part)

            # NOT done yet, will advance below
            elif self.token_kind == Kind.Right:
                # Still part of the word; will be done on the next iter.
                if self.token_type == Id.Right_DoubleQuote:
                    pass
                # Never happens, no PushHint for this case.
                #elif self.token_type == Id.Right_DollarParen:
                #    pass
                elif self.token_type == Id.Right_Subshell:
                    # LEXER HACK for (case x in x) ;; esac )
                    # Rewind before it's used
                    assert self.next_lex_mode == lex_mode_e.Undefined
                    if self.lexer.MaybeUnreadOne():
                        self.lexer.PushHint(Id.Op_RParen, Id.Right_Subshell)
                        self._SetNext(lex_mode)
                    done = True
                else:
                    done = True

            elif self.token_kind == Kind.Ignored:
                done = True

            else:
                # LEXER HACK for unbalanced case clause.  'case foo in esac'
                # is valid, so to test for esac, we can read ) before getting
                # a chance to PushHint(Id.Op_RParen, Id.Right_CasePat).  So
                # here we unread one token and do it again.

                # We get Id.Op_RParen at top level: case x in x) ;; esac
                # We get Id.Eof_RParen inside ComSub: $(case x in x) ;; esac )
                if self.token_type in (Id.Op_RParen, Id.Eof_RParen):
                    # Rewind before it's used
                    assert self.next_lex_mode == lex_mode_e.Undefined
                    if self.lexer.MaybeUnreadOne():
                        if self.token_type == Id.Eof_RParen:
                            # Redo translation
                            self.lexer.PushHint(Id.Op_RParen, Id.Eof_RParen)
                        self._SetNext(lex_mode)

                done = True  # anything we don't recognize means we're done

            if not done:
                self._SetNext(lex_mode)
                num_parts += 1

        if (self.parse_opts.parse_brace() and num_parts > 1 and
                brace_count != 0):
            # accept { and }, but not foo{
            p_die(
                'Word has unbalanced { }. Maybe add a space or quote it like \{',
                loc.Word(w))

        if is_triple_quoted and is_triple_quoted.b and num_parts > 1:
            p_die('Unexpected parts after triple quoted string',
                  loc.WordPart(w.parts[-1]))

        if 0:
            from _devbuild.gen.syntax_asdl import word_part_str
            word_key = ' '.join(word_part_str(p.tag()) for p in w.parts)
            WORD_HIST[word_key] += 1
        return w

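    # Illustrative compound words (a sketch):
    #
    #   pre"mid"${x}   # three parts: Lit_Chars, DoubleQuoted, BracedVarSub
    #   foo{a,b}       # balanced { } are fine; bare 'foo{' fails parse_brace
    #
    # The Op_RParen/Eof_RParen rewind above is what lets 'case x in x) ;; esac'
    # parse without a dedicated lexer mode for case patterns.
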
    def _ReadArithWord(self):
        # type: () -> Optional[word_t]
        """ Helper for ReadArithWord() """
        self._GetToken()

        if self.token_kind == Kind.Unknown:
            # e.g. happened during dynamic parsing of unset 'a[$foo]' in gherkin
            p_die(
                'Unexpected token while parsing arithmetic: %r' %
                lexer.TokenVal(self.cur_token), self.cur_token)

        elif self.token_kind == Kind.Eof:
            return self.cur_token

        elif self.token_kind == Kind.Ignored:
            # Space should be ignored.
            self._SetNext(lex_mode_e.Arith)
            return None

        elif self.token_kind in (Kind.Arith, Kind.Right):
            # Id.Right_DollarDParen IS just a normal token, handled by ArithParser
            self._SetNext(lex_mode_e.Arith)
            return self.cur_token

        elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub):
            return self._ReadCompoundWord(lex_mode_e.Arith)

        else:
            raise AssertionError(self.cur_token)

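    # E.g. in $(( x + 1 )): 'x' comes back as a compound word via Kind.Lit,
    # '+' is returned directly as a Kind.Arith token, and the spaces return
    # None so the caller loops.
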
    def _ReadWord(self, word_mode):
        # type: (lex_mode_t) -> Optional[word_t]
        """Helper function for ReadWord()."""

        # Change the pseudo lexer mode to a real lexer mode
        if word_mode == lex_mode_e.ShCommandFakeBrack:
            lex_mode = lex_mode_e.ShCommand
        else:
            lex_mode = word_mode

        self._GetToken()

        if self.token_kind == Kind.Eof:
            # No advance
            return self.cur_token

        # Allow Arith for ) at end of for loop?
        elif self.token_kind in (Kind.Op, Kind.Redir, Kind.Arith):
            self._SetNext(lex_mode)

            # Newlines are complicated.  See the 3x2 matrix in the comment
            # about self.multiline and self.newline_state above.
            if self.token_type == Id.Op_Newline:
                if self.multiline:
                    if self.newline_state > 1:
                        # This points at a blank line, but at least it gives
                        # the line number
                        p_die('Invalid blank line in multiline mode',
                              self.cur_token)
                    return None

                if self.returned_newline:  # skip
                    return None

            return self.cur_token

        elif self.token_kind == Kind.Right:
            if self.token_type not in (Id.Right_Subshell, Id.Right_ShFunction,
                                       Id.Right_CasePat, Id.Right_Initializer):
                raise AssertionError(self.cur_token)

            self._SetNext(lex_mode)
            return self.cur_token

        elif self.token_kind in (Kind.Ignored, Kind.WS):
            self._SetNext(lex_mode)
            return None

        else:
            assert self.token_kind in (Kind.VSub, Kind.Lit, Kind.History,
                                       Kind.Left, Kind.KW, Kind.ControlFlow,
                                       Kind.BoolUnary, Kind.BoolBinary,
                                       Kind.ExtGlob,
                                       Kind.BashRegex), 'Unhandled token kind'

            if (word_mode == lex_mode_e.ShCommandFakeBrack and
                    self.parse_opts.parse_bracket() and
                    self.token_type == Id.Lit_LBracket):
                # Change [ from Kind.Lit -> Kind.Op, so the CommandParser can
                # treat
                #   assert [42 === x]
                # like
                #   json write (x)
                bracket_word = self.cur_token
                bracket_word.id = Id.Op_LBracket

                self._SetNext(lex_mode)
                return bracket_word

            # We're beginning a word.  If we see Id.Lit_Pound, change to
            # lex_mode_e.Comment and read until end of line.
            if self.token_type == Id.Lit_Pound:
                self._SetNext(lex_mode_e.Comment)
                self._GetToken()

                # NOTE: The # could be the last character in the file.  It
                # can't be Eof_{RParen,Backtick} because #) and #` are
                # comments.
                assert self.token_type in (Id.Ignored_Comment, Id.Eof_Real), \
                    self.cur_token

                # The next iteration will go into Kind.Ignored and set lex
                # state to lex_mode_e.ShCommand/etc.
                return None  # tell ReadWord() to try again after comment

            elif self.token_type == Id.Lit_TPound:  ### doc comment
                self._SetNext(lex_mode_e.Comment)
                self._GetToken()

                if self.token_type == Id.Ignored_Comment and self.emit_doc_token:
                    return self.cur_token

                return None  # tell ReadWord() to try again after comment

            else:
                # r'' u'' b''
                if (self.token_type == Id.Lit_Chars and
                        self.lexer.LookAheadOne(
                            lex_mode_e.ShCommand) == Id.Left_SingleQuote):

                    # When shopt -s parse_raw_string:
                    #   echo r'hi' is like echo 'hi'
                    #
                    #   echo u'\u{3bc}' b'\yff' works

                    tok = self.cur_token
                    if self.parse_opts.parse_ysh_string():
                        if lexer.TokenEquals(tok, 'r'):
                            left_id = Id.Left_RSingleQuote
                        elif lexer.TokenEquals(tok, 'u'):
                            left_id = Id.Left_USingleQuote
                        elif lexer.TokenEquals(tok, 'b'):
                            left_id = Id.Left_BSingleQuote
                        else:
                            left_id = Id.Undefined_Tok

                        if left_id != Id.Undefined_Tok:
                            # skip the r, and then 'foo' will be read as normal
                            self._SetNext(lex_mode_e.ShCommand)

                            self._GetToken()
                            assert self.token_type == Id.Left_SingleQuote, self.token_type

                            # Read the word in a different lexer mode
                            return self._ReadYshSingleQuoted(left_id)

                return self._ReadCompoundWord(lex_mode)

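    # Protocol note: returning None means "no word yet, call again".  That
    # covers whitespace, skipped newlines, and comments.  E.g. for
    # 'echo hi  # bye', the '#' switches to lex_mode_e.Comment, the comment
    # text comes back as Kind.Ignored on the next call, and only then does a
    # real token get returned.
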
    def ParseVarRef(self):
        # type: () -> BracedVarSub
        """DYNAMIC parsing of what's inside ${!ref}

        # Same as VarOf production
        VarRefExpr = VarOf EOF
        """
        self._SetNext(lex_mode_e.VSub_1)

        self._GetToken()
        if self.token_kind != Kind.VSub:
            p_die('Expected var name', self.cur_token)

        part = self._ParseVarOf()
        # NOTE: no ${ } means no part.left and part.right
        part.left = part.name_tok  # cheat to make test pass
        part.right = part.name_tok

        self._GetToken()
        if self.token_type != Id.Eof_Real:
            p_die('Expected end of var ref expression', self.cur_token)
        return part

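    # Illustrative: for ref='a[0]', evaluating ${!ref} re-parses the *value*
    # 'a[0]' through this entry point at runtime; the left/right "cheat" above
    # exists because there are no real ${ } tokens to point at.
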
    def LookPastSpace(self):
        # type: () -> Id_t
        """Look ahead to the next token.

        For the CommandParser to recognize
           array= (1 2 3)
           YSH for ( versus bash for ((
           YSH if ( versus if test
           YSH while ( versus while test
           YSH bare assignment 'grep =' versus 'grep foo'
        """
        assert self.token_type != Id.Undefined_Tok
        if self.cur_token.id == Id.WS_Space:
            id_ = self.lexer.LookPastSpace(lex_mode_e.ShCommand)
        else:
            id_ = self.cur_token.id
        return id_

    def LookAheadFuncParens(self):
        # type: () -> bool
        """Special lookahead for f( ) { echo hi; } to check for ( )"""
        assert self.token_type != Id.Undefined_Tok

        # We have to handle 2 cases because we buffer a token
        if self.cur_token.id == Id.Op_LParen:  # saw funcname(
            return self.lexer.LookAheadFuncParens(1)  # go back one char

        elif self.cur_token.id == Id.WS_Space:  # saw funcname WHITESPACE
            return self.lexer.LookAheadFuncParens(0)

        else:
            return False

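    # Covers both 'f( ) { echo hi; }' and 'f ( ) { echo hi; }': the branches
    # above distinguish whether '(' or a space was the buffered token after
    # the function name.
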
    def ReadWord(self, word_mode):
        # type: (lex_mode_t) -> word_t
        """Read the next word, using the given lexer mode.

        This is a stateful wrapper for the stateless _ReadWord function.
        """
        assert word_mode in (lex_mode_e.ShCommand,
                             lex_mode_e.ShCommandFakeBrack,
                             lex_mode_e.DBracket, lex_mode_e.BashRegex)

        if self.buffered_word:  # For integration with pgen2
            w = self.buffered_word
            self.buffered_word = None
        else:
            while True:
                w = self._ReadWord(word_mode)
                if w is not None:
                    break

        self.returned_newline = (word_.CommandId(w) == Id.Op_Newline)
        return w

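    # Typical driver loop (hypothetical sketch; 'w_parser' is an assumed name):
    #
    #   while True:
    #       w = w_parser.ReadWord(lex_mode_e.ShCommand)
    #       if word_.CommandId(w) == Id.Eof_Real:
    #           break
    #       ...  # feed w to the CommandParser
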
    def ReadArithWord(self):
        # type: () -> word_t
        while True:
            w = self._ReadArithWord()
            if w is not None:
                break
        return w

    def ReadHereDocBody(self, parts):
        # type: (List[word_part_t]) -> None
        """
        A here doc is like a double quoted context, except " isn't special.
        """
        self._ReadLikeDQ(None, False, parts)
        # Returns nothing

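    # Illustrative here doc: the " stays literal, but $x still expands.
    #
    #   cat <<EOF
    #   say "$x"
    #   EOF
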
    def ReadForPlugin(self):
        # type: () -> CompoundWord
        """For $PS1, $PS4, etc.

        This is just like reading a here doc line.  "\n" is allowed, as
        well as the typical substitutions ${x} $(echo hi) $((1 + 2)).
        """
        w = CompoundWord([])
        self._ReadLikeDQ(None, False, w.parts)
        return w

    def EmitDocToken(self, b):
        # type: (bool) -> None
        self.emit_doc_token = b

    def Multiline(self, b):
        # type: (bool) -> None
        self.multiline = b


if 0:
    import collections
    WORD_HIST = collections.Counter()

# vim: sw=4