# Copyright 2016 Andy Chu. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
"""
word_parse.py - Parse the shell word language.

Hairy example:

  hi$((1 + 2))"$(echo hi)"${var:-__"$(echo default)"__}

Substitutions can be nested, but which inner subs are allowed depends on the
outer sub.  Notes:

lex_mode_e.ShCommand (_ReadUnquotedLeftParts)
  All subs and quotes are allowed:
  $v ${v} $() `` $(()) '' "" $'' $"" <() >()

lex_mode_e.DQ (_ReadDoubleQuotedLeftParts)
  Var, Command, and Arith subs, but no quotes:
  $v ${v} $() `` $(())
  No process substitution.

lex_mode_e.Arith
  Similar to DQ: Var, Command, and Arith sub, but no process sub.  bash
  doesn't allow quotes, but OSH does.  We allow ALL FOUR kinds of quotes,
  because we need them for associative array indexing.

lex_mode_e.VSub_ArgUnquoted
  Like ShCommand, everything is allowed (even process substitutions), but we
  stop at }, and space is SIGNIFICANT.

  Example: ${a:- b }

  ${X:-$v} ${X:-${v}} ${X:-$(echo hi)} ${X:-`echo hi`} ${X:-$((1+2))}
  ${X:-'single'} ${X:-"double"} ${X:-$'\n'} ${X:-<(echo hi)}

lex_mode_e.VSub_ArgDQ
  In contrast to DQ, VSub_ArgDQ accepts nested "" and $'' and $"", e.g.
  "${x:-"default"}".

  In contrast, VSub_ArgUnquoted respects single quotes and process
  substitution.

  It's weird that double quotes are allowed.  Space is also significant here,
  e.g. "${x:-a "b"}".
"""

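# A rough sketch of how the "hairy example" above nests -- an illustration
# based on the mode descriptions, not an exhaustive trace:
#
#   hi$((1 + 2))"$(echo hi)"${var:-__"$(echo default)"__}
#   |  |        |           |
#   |  |        |           +-- ${} starts in lex_mode_e.VSub_1; the :- arg
#   |  |        |               uses VSub_ArgUnquoted, with a DQ part inside
#   |  |        +-- lex_mode_e.DQ, with a command sub inside
#   |  +-- lex_mode_e.Arith
#   +-- the whole word is read in lex_mode_e.ShCommand
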
from _devbuild.gen import grammar_nt
from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str, Kind
from _devbuild.gen.types_asdl import (lex_mode_t, lex_mode_e)
from _devbuild.gen.syntax_asdl import (
    BoolParamBox,
    Token,
    SimpleVarSub,
    loc,
    source,
    DoubleQuoted,
    SingleQuoted,
    BracedVarSub,
    CommandSub,
    ShArrayLiteral,
    AssocPair,
    bracket_op,
    bracket_op_t,
    suffix_op,
    suffix_op_t,
    rhs_word,
    rhs_word_e,
    rhs_word_t,
    word_e,
    word_t,
    CompoundWord,
    word_part,
    word_part_t,
    y_lhs_e,
    arith_expr_t,
    command,
    expr,
    expr_e,
    expr_t,
    pat_t,
    ArgList,
    Proc,
    Func,
    Subscript,
    Attribute,
    arith_expr,
)
from core import alloc
from core.error import p_die
from mycpp.mylib import log
from core import pyutil
from display import ui
from frontend import consts
from frontend import lexer
from frontend import reader
from osh import tdop
from osh import arith_parse
from osh import braces
from osh import word_
from osh import word_compile
from mycpp.mylib import tagswitch

from typing import List, Optional, Tuple, cast
from typing import TYPE_CHECKING
if TYPE_CHECKING:
    from frontend.lexer import Lexer
    from frontend.parse_lib import ParseContext
    from frontend.reader import _Reader
    from osh.cmd_parse import VarChecker

unused1 = log
unused2 = Id_str

KINDS_THAT_END_WORDS = [Kind.Eof, Kind.WS, Kind.Op, Kind.Right]


class WordEmitter(object):
    """Common interface for [ and [["""

    def __init__(self):
        # type: () -> None
        """Empty constructor for mycpp."""
        pass

    def ReadWord(self, lex_mode):
        # type: (lex_mode_t) -> word_t
        raise NotImplementedError()


class WordParser(WordEmitter):

    def __init__(self, parse_ctx, lexer, line_reader):
        # type: (ParseContext, Lexer, _Reader) -> None
        self.parse_ctx = parse_ctx
        self.lexer = lexer
        self.line_reader = line_reader
        self.arena = line_reader.arena

        self.parse_opts = parse_ctx.parse_opts
        self.a_parser = tdop.TdopParser(arith_parse.Spec(), self,
                                        self.parse_opts)
        self.Reset()

    def Init(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Used to parse arithmetic, see ParseContext."""
        self.next_lex_mode = lex_mode

    def Reset(self):
        # type: () -> None
        """Called by interactive loop."""
        # For _GetToken()
        self.cur_token = None  # type: Token
        self.token_kind = Kind.Undefined
        self.token_type = Id.Undefined_Tok

        self.next_lex_mode = lex_mode_e.ShCommand

        # Boolean mutated by CommandParser via word_.ctx_EmitDocToken.  For
        # ### doc comments.
        self.emit_doc_token = False
        # Boolean mutated by CommandParser via word_.ctx_Multiline.  '...'
        # starts multiline mode.
        self.multiline = False

        # For detecting invalid \n\n in multiline mode.  Counts what we got
        # directly from the lexer.
        self.newline_state = 0
        # For consolidating \n\n -> \n for the CALLER.  This simplifies the
        # parsers that consume words.
        self.returned_newline = False

        # For integration with pgen2
        self.buffered_word = None  # type: word_t

    def _GetToken(self):
        # type: () -> None
        """Call this when you need to make a decision based on any of:

        self.token_type
        self.token_kind
        self.cur_token
        """
        if self.next_lex_mode == lex_mode_e.Undefined:
            return  # _SetNext() not called, so do nothing

        is_fake = self.next_lex_mode == lex_mode_e.BashRegexFakeInner
        real_mode = (lex_mode_e.BashRegex if is_fake else self.next_lex_mode)

        self.cur_token = self.lexer.Read(real_mode)

        # MUTATE TOKEN for fake lexer mode.
        # This is for crazy stuff bash allows, like [[ s =~ (< >) ]]
        if (is_fake and self.cur_token.id
                in (Id.WS_Space, Id.BashRegex_AllowedInParens)):
            self.cur_token.id = Id.Lit_Chars

        self.token_type = self.cur_token.id
        self.token_kind = consts.GetKind(self.token_type)

        # number of consecutive newlines, ignoring whitespace
        if self.token_type == Id.Op_Newline:
            self.newline_state += 1
        elif self.token_kind != Kind.WS:
            self.newline_state = 0

        self.parse_ctx.trail.AppendToken(self.cur_token)  # For completion
        self.next_lex_mode = lex_mode_e.Undefined

    def _SetNext(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Set the next lex state, but don't actually read a token.

        We need this for proper interactive parsing.
        """
        self.next_lex_mode = lex_mode

    def _ReadVarOpArg(self, arg_lex_mode):
        # type: (lex_mode_t) -> rhs_word_t

        # NOTE: Operators like | and < are not treated as special, so
        # ${a:- | >} is valid, even when unquoted.
        self._SetNext(arg_lex_mode)
        self._GetToken()

        w = self._ReadVarOpArg2(arg_lex_mode, Id.Undefined_Tok,
                                True)  # empty_ok

        # If the Compound has no parts, and we're in a double-quoted VarSub
        # arg, and empty_ok, then return Empty.  This is so it can evaluate
        # to the empty string and not get elided.
        #
        # Examples:
        # - "${s:-}", "${s/%pat/}"
        # It's similar to LooksLikeShAssignment, where we turn x= into x=''.
        # And it has the same potential problem of not having Token location
        # info.
        #
        # NOTE: empty_ok is False only for the PatSub pattern, which means
        # we'll return a Compound with no parts, which is explicitly checked
        # with a custom error message.
        if len(w.parts) == 0 and arg_lex_mode == lex_mode_e.VSub_ArgDQ:
            return rhs_word.Empty

        return w

    def _ReadVarOpArg2(self, arg_lex_mode, eof_type, empty_ok):
        # type: (lex_mode_t, Id_t, bool) -> CompoundWord
        """Return a CompoundWord.

        Helper function for _ReadVarOpArg, and used directly by
        _ReadPatSubVarOp.
        """
        w = self._ReadCompoundWord3(arg_lex_mode, eof_type, empty_ok)
        #log('w %s', w)
        tilde = word_.TildeDetect(w)
        if tilde:
            w = tilde
        return w

    def _ReadSliceVarOp(self):
        # type: () -> suffix_op.Slice
        """
        Looking at the token after the first ':'

        ArithExpr? (':' ArithExpr? )? '}'
        """
        self._NextNonSpace()

        cur_id = self.token_type

        if cur_id in (Id.Arith_RBrace, Id.Arith_Colon):  # ${a:} or ${a::}
            begin = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            begin = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()  # advance

        if cur_id == Id.Arith_RBrace:  # ${a:1} or ${@:1}
            # No length specified, so it's N
            no_length = None  # type: Optional[arith_expr_t]
            return suffix_op.Slice(begin, no_length)

        elif cur_id == Id.Arith_Colon:  # ${a:1:} or ${@:1:}
            colon_tok = self.cur_token
            self._NextNonSpace()

            if self.token_type == Id.Arith_RBrace:
                # Quirky bash behavior:
                #   ${a:1:} or ${a::} means length ZERO
                #   but ${a:1} or ${a:} means length N
                if self.parse_opts.strict_parse_slice():
                    p_die(
                        "Slice length: Add explicit zero, or omit : for N (strict_parse_slice)",
                        colon_tok)

                length = arith_expr.EmptyZero  # type: arith_expr_t
            else:
                length = self._ReadArithExpr(Id.Arith_RBrace)

            return suffix_op.Slice(begin, length)

        else:
            p_die("Expected : or } in slice", self.cur_token)

        raise AssertionError()  # for MyPy

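    # Examples the slice parser above accepts (an illustrative sketch):
    #
    #   ${a:1}      begin = 1, length = None, i.e. to the end
    #   ${a:1:2}    begin = 1, length = 2
    #   ${a::2}     begin = EmptyZero, length = 2
    #   ${a:1:}     begin = 1, length = EmptyZero -- the bash quirk that
    #               strict_parse_slice turns into a parse error
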
    def _ReadPatSubVarOp(self):
        # type: () -> suffix_op.PatSub
        """Looking at the first '/' after VarOf:

        VarSub = ...
               | VarOf '/' Match ( '/' WORD? )?
        Match  = '/' WORD   # can't be empty
               | '#' WORD?  # may be empty
               | '%' WORD?
        """
        slash_tok = self.cur_token  # location info
        replace_mode = Id.Undefined_Tok  # bizarre syntax / # %

        self._SetNext(lex_mode_e.VSub_ArgUnquoted)  # advance past /

        self._GetToken()
        if self.token_type == Id.Right_DollarBrace:
            pat = CompoundWord([])
            return suffix_op.PatSub(pat, rhs_word.Empty, replace_mode,
                                    slash_tok)

        if self.token_type in (Id.Lit_Slash, Id.Lit_Pound, Id.Lit_Percent):
            replace_mode = self.token_type
            self._SetNext(lex_mode_e.VSub_ArgUnquoted)

        # Bash quirk:
        #   echo ${x/#/replace} has an empty pattern
        #   echo ${x////replace} is non-empty; it means echo ${x//'/'/replace}
        empty_ok = replace_mode != Id.Lit_Slash
        pat = self._ReadVarOpArg2(lex_mode_e.VSub_ArgUnquoted, Id.Lit_Slash,
                                  empty_ok)
        #log('pat 1 %r', pat)

        if self.token_type == Id.Lit_Slash:
            # read until }
            replace = self._ReadVarOpArg(
                lex_mode_e.VSub_ArgUnquoted)  # type: rhs_word_t
            #log('r 1 %r', replace)
        else:
            # e.g. ${v/a} is the same as ${v/a/} -- empty replacement string
            replace = rhs_word.Empty

        self._GetToken()
        if self.token_type != Id.Right_DollarBrace:
            # This happens on invalid code
            p_die(
                "Expected } after replacement string, got %s" %
                ui.PrettyId(self.token_type), self.cur_token)

        return suffix_op.PatSub(pat, replace, replace_mode, slash_tok)

    def _ReadSubscript(self):
        # type: () -> bracket_op_t
        """ Subscript = '[' ('@' | '*' | ArithExpr) ']' """
        # Lookahead to see if we get @ or *.  Otherwise read a full
        # arithmetic expression.
        next_id = self.lexer.LookPastSpace(lex_mode_e.Arith)
        if next_id in (Id.Lit_At, Id.Arith_Star):
            op = bracket_op.WholeArray(next_id)  # type: bracket_op_t

            self._SetNext(lex_mode_e.Arith)  # skip past [
            self._GetToken()
            self._SetNext(lex_mode_e.Arith)  # skip past @
            self._GetToken()
        else:
            self._SetNext(lex_mode_e.Arith)  # skip past [
            anode = self._ReadArithExpr(Id.Arith_RBracket)
            op = bracket_op.ArrayIndex(anode)

        if self.token_type != Id.Arith_RBracket:  # Should be looking at ]
            p_die('Expected ] to close subscript', self.cur_token)

        self._SetNext(lex_mode_e.VSub_2)  # skip past ]
        self._GetToken()  # Needed to be in the same spot as no subscript

        return op

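    # Illustrative subscript parses (sketch):
    #
    #   ${a[@]}    bracket_op.WholeArray(Id.Lit_At)
    #   ${a[*]}    bracket_op.WholeArray(Id.Arith_Star)
    #   ${a[i+1]}  bracket_op.ArrayIndex(<arith expr for i+1>)
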
    def _ParseVarOf(self):
        # type: () -> BracedVarSub
        """
        VarOf = NAME Subscript?
              | NUMBER      # no subscript allowed, none of these are arrays
                            # ${@[1]} doesn't work, even though slicing does
              | VarSymbol
        """
        self._GetToken()
        name_token = self.cur_token
        self._SetNext(lex_mode_e.VSub_2)

        self._GetToken()  # Check for []
        if self.token_type == Id.VOp2_LBracket:
            bracket_op = self._ReadSubscript()
        else:
            bracket_op = None

        part = BracedVarSub.CreateNull()
        part.token = name_token
        part.var_name = lexer.TokenVal(name_token)
        part.bracket_op = bracket_op
        return part

    def _ParseVarExpr(self, arg_lex_mode, allow_query=False):
        # type: (lex_mode_t, bool) -> BracedVarSub
        """Start parsing at the op -- we already skipped past the name."""
        part = self._ParseVarOf()

        self._GetToken()
        if self.token_type == Id.Right_DollarBrace:
            return part  # no ops

        op_kind = self.token_kind

        if op_kind == Kind.VTest:
            tok = self.cur_token
            arg_word = self._ReadVarOpArg(arg_lex_mode)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            part.suffix_op = suffix_op.Unary(tok, arg_word)

        elif op_kind == Kind.VOpYsh:
            tok = self.cur_token
            arg_word = self._ReadVarOpArg(arg_lex_mode)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            UP_arg_word = arg_word
            with tagswitch(arg_word) as case:
                if case(rhs_word_e.Empty):
                    pass
                elif case(rhs_word_e.Compound):
                    arg_word = cast(CompoundWord, UP_arg_word)
                    # This handles ${x|html} and ${x %.3f} now.
                    # However I think ${x %.3f} should be statically parsed?
                    # It can enter the printf lexer modes.
                    ok, arg, quoted = word_.StaticEval(arg_word)
                    if not ok or quoted:
                        p_die('Expected a constant argument',
                              loc.Word(arg_word))

            part.suffix_op = suffix_op.Static(tok, arg)

        elif op_kind == Kind.VOp0:
            part.suffix_op = self.cur_token  # Nullary
            self._SetNext(lex_mode_e.VSub_2)  # Expecting }
            self._GetToken()

        elif op_kind == Kind.VOp1:  # % %% # ## etc.
            tok = self.cur_token
            # Weird exception that all shells have: these operators take a
            # glob pattern, so they're lexed as VSub_ArgUnquoted, not
            # VSub_ArgDQ
            arg_word = self._ReadVarOpArg(lex_mode_e.VSub_ArgUnquoted)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            part.suffix_op = suffix_op.Unary(tok, arg_word)

        elif op_kind == Kind.VOp2:  # / : [ ]
            if self.token_type == Id.VOp2_Slash:
                patsub_op = self._ReadPatSubVarOp()  # type: suffix_op_t
                part.suffix_op = patsub_op

                # Checked by the method above
                assert self.token_type == Id.Right_DollarBrace, self.cur_token

            elif self.token_type == Id.VOp2_Colon:
                part.suffix_op = self._ReadSliceVarOp()
                # NOTE: } in arithmetic mode.
                if self.token_type != Id.Arith_RBrace:
                    # Token seems off; doesn't point to X in ${a:1:2 X
                    p_die('Expected } to close ${', self.cur_token)

            else:
                # TODO: Does this ever happen?
                p_die('Unexpected token in ${} (%s)' % 'VOp2', self.cur_token)

        elif op_kind == Kind.VOp3:  # ${prefix@} etc.
            if allow_query:
                part.suffix_op = self.cur_token  # Nullary
                self._SetNext(lex_mode_e.VSub_2)  # Expecting }
                self._GetToken()
            else:
                p_die("Unexpected token in ${} (%s)" % 'VOp3', self.cur_token)

        # NOTE: Arith_RBrace is for slicing, because it reads } in arithmetic
        # mode.  It's redundantly checked above.
        if self.token_type not in (Id.Right_DollarBrace, Id.Arith_RBrace):
            # ${a.} or ${!a.}
            p_die('Expected } to close ${', self.cur_token)

        # Now look for ops
        return part

    def _ReadZshVarSub(self, left_token):
        # type: (Token) -> word_part.ZshVarSub

        self._SetNext(lex_mode_e.VSub_Zsh)  # Move past ${(foo)

        # Can be empty
        w = self._ReadCompoundWord3(lex_mode_e.VSub_Zsh,
                                    Id.Right_DollarBrace, True)
        self._GetToken()
        return word_part.ZshVarSub(left_token, w, self.cur_token)

    def ReadBracedVarSub(self, left_token):
        # type: (Token) -> Tuple[BracedVarSub, Token]
        """For YSH expressions like var x = ${x:-"default"}."""
        part = self._ReadBracedVarSub(left_token, d_quoted=False)
        last_token = self.cur_token
        return part, last_token

    def _ReadBracedVarSub(self, left_token, d_quoted):
        # type: (Token, bool) -> BracedVarSub
        """For the ${} expression language.

        NAME       = [a-zA-Z_][a-zA-Z0-9_]*
        NUMBER     = [0-9]+                  # ${10}, ${11}, ...

        Subscript  = '[' ('@' | '*' | ArithExpr) ']'
        VarSymbol  = '!' | '@' | '#' | ...
        VarOf      = NAME Subscript?
                   | NUMBER    # no subscript allowed, none of these are
                               # arrays; ${@[1]} doesn't work, even though
                               # slicing does
                   | VarSymbol

        NULLARY_OP = '@Q' | '@E' | '@P' | '@A' | '@a'  # VOp0

        TEST_OP    = '-' | ':-' | '=' | ':=' | '+' | ':+' | '?' | ':?'
        STRIP_OP   = '#' | '##' | '%' | '%%'
        CASE_OP    = ',' | ',,' | '^' | '^^'
        UnaryOp    = TEST_OP | STRIP_OP | CASE_OP

        YSH_UNARY  = '|' | ' '   # ${x|html} and ${x %.3f}
                                 # SPACE is the operator, not %
        Match      = ('/' | '#' | '%') WORD   # match all / prefix / suffix
        VarExpr    = VarOf
                   | VarOf NULLARY_OP
                   | VarOf UnaryOp WORD
                   | VarOf YSH_UNARY STATIC_WORD
                   | VarOf ':' ArithExpr (':' ArithExpr )?
                   | VarOf '/' Match '/' WORD

        LengthExpr = '#' VarOf   # can't apply operators after length

        RefOrKeys  = '!' VarExpr   # CAN apply operators after a named ref
                                   # ${!ref[0]} vs ${!keys[@]} resolved later

        PrefixQuery = '!' NAME ('*' | '@')   # list variable names with a
                                             # prefix

        BuiltinSub = '.' WORD+   # ${.myproc 'builtin' $sub}

        VarSub = LengthExpr
               | RefOrKeys
               | PrefixQuery
               | VarExpr
               | BuiltinSub

        NOTES:
        - Arithmetic expressions are used twice, inside subscripts ${a[x+1]}
          and slicing ${a:x+1:y+2}
        - ${#} and ${!} need LL(2) lookahead (considering how my tokenizer
          works)
        - @ and * are technically arithmetic expressions in this
          implementation
        - We don't account for bash 4.4: ${param@operator} -- Q E P A a.
          Note that it's also vectorized.

        Strictness over bash:
        - echo ${a[0][0]} doesn't do anything useful, so we disallow it from
          the grammar
        - ! and # prefixes can't be composed, even though named refs can be
          composed with other operators
        - '#' means 4 different things: length prefix, VarSymbol, UnaryOp to
          strip a prefix, and it can also be a literal part of WORD.

        From the parser's point of view, the prefix # can't be combined with
        UnaryOp/slicing/matching, and the ! can.  However:

        - ${a[@]:1:2} is not allowed
        - ${#a[@]:1:2} is allowed, but gives the wrong answer
        """
        if d_quoted:
            arg_lex_mode = lex_mode_e.VSub_ArgDQ
        else:
            arg_lex_mode = lex_mode_e.VSub_ArgUnquoted

        self._SetNext(lex_mode_e.VSub_1)
        self._GetToken()

        ty = self.token_type
        first_tok = self.cur_token

        if ty == Id.VSub_Pound:
            # Disambiguate
            next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
            if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
                # e.g. a name, so '#' is the prefix
                self._SetNext(lex_mode_e.VSub_1)
                part = self._ParseVarOf()

                self._GetToken()
                if self.token_type != Id.Right_DollarBrace:
                    p_die('Expected } after length expression',
                          self.cur_token)

                part.prefix_op = first_tok

            else:  # not a prefix, '#' is the variable
                part = self._ParseVarExpr(arg_lex_mode)

        elif ty == Id.VSub_Bang:
            next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
            if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
                # e.g. a name, so '!' is the prefix
                # ${!a}    -- this is a ref
                # ${!3}    -- this is a ref
                # ${!a[1]} -- this is a ref
                # ${!a[@]} -- this is a keys
                # No lookahead -- do it in a second step, or at runtime
                self._SetNext(lex_mode_e.VSub_1)
                part = self._ParseVarExpr(arg_lex_mode, allow_query=True)

                part.prefix_op = first_tok

            else:  # not a prefix, '!' is the variable
                part = self._ParseVarExpr(arg_lex_mode)

        elif ty == Id.VSub_Dot:
            # Note: this will become a new builtin_sub type, so this method
            # must return word_part_t rather than BracedVarSub.  I don't
            # think that should cause problems.
            p_die('TODO: ${.myproc builtin sub}', self.cur_token)

        # VS_NAME, VS_NUMBER, symbol that isn't # or !
        elif self.token_kind == Kind.VSub:
            part = self._ParseVarExpr(arg_lex_mode)

        else:
            # e.g. ${^}
            p_die('Unexpected token in ${}', self.cur_token)

        part.left = left_token  # attach the argument
        part.right = self.cur_token
        return part

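    # Illustrative parses for the grammar above (sketch):
    #
    #   ${#foo}     LengthExpr -- prefix_op is the # token
    #   ${!ref}     RefOrKeys  -- prefix_op is the ! token
    #   ${foo:-x}   VarExpr with suffix_op.Unary (Kind.VTest)
    #   ${foo%%.c}  VarExpr with suffix_op.Unary (Kind.VOp1, glob arg)
    #   ${foo:1:2}  VarExpr with suffix_op.Slice
    #   ${foo/a/b}  VarExpr with suffix_op.PatSub
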
    def _ReadSingleQuoted(self, left_token, lex_mode):
        # type: (Token, lex_mode_t) -> SingleQuoted
        """Internal method to read a word_part."""
        tokens = []  # type: List[Token]
        # In command mode, we never disallow backslashes like '\'
        right_quote = self.ReadSingleQuoted(lex_mode, left_token, tokens,
                                            False)
        sval = word_compile.EvalSingleQuoted(left_token.id, tokens)
        node = SingleQuoted(left_token, sval, right_quote)
        return node

    def ReadSingleQuoted(self, lex_mode, left_token, out_tokens, is_ysh_expr):
        # type: (lex_mode_t, Token, List[Token], bool) -> Token
        """Appends to out_tokens; returns the last token.

        Used by expr_parse.py
        """
        # TODO: Remove and use out_tokens
        tokens = []  # type: List[Token]

        # echo '\' is allowed, but x = '\' is invalid, in favor of x = r'\'
        no_backslashes = is_ysh_expr and left_token.id == Id.Left_SingleQuote

        expected_end_tokens = 3 if left_token.id in (
            Id.Left_TSingleQuote, Id.Left_RTSingleQuote,
            Id.Left_UTSingleQuote, Id.Left_BTSingleQuote) else 1
        num_end_tokens = 0

        while num_end_tokens < expected_end_tokens:
            self._SetNext(lex_mode)
            self._GetToken()

            # Kind.Char is emitted in lex_mode_e.SQ_C
            if self.token_kind in (Kind.Lit, Kind.Char):
                tok = self.cur_token
                # Happens in lex_mode_e.SQ: 'one\two' is ambiguous; it should
                # be r'one\two' or c'one\\two'
                if no_backslashes and lexer.TokenContains(tok, '\\'):
                    p_die(
                        r"Strings with backslashes should look like r'\n' or u'\n' or b'\n'",
                        tok)

                if is_ysh_expr:
                    # Disallow var x = $'\001'.  Arguably we don't need these
                    # checks because u'\u{1}' is the way to write it.
                    if self.token_type == Id.Char_Octal3:
                        p_die(
                            r"Use \xhh or \u{...} instead of octal escapes in YSH strings",
                            tok)

                    if self.token_type == Id.Char_Hex and self.cur_token.length != 4:
                        # disallow \xH
                        p_die(
                            r'Invalid hex escape in YSH string (must be \xHH)',
                            tok)

                tokens.append(tok)

            elif self.token_kind == Kind.Unknown:
                tok = self.cur_token
                assert tok.id == Id.Unknown_Backslash, tok

                # x = $'\z' is disallowed; ditto for echo $'\z' if
                # shopt -u parse_backslash
                if is_ysh_expr or not self.parse_opts.parse_backslash():
                    p_die(
                        "Invalid char escape in C-style string literal (OILS-ERR-11)",
                        tok)

                tokens.append(tok)

            elif self.token_kind == Kind.Eof:
                p_die(
                    'Unexpected EOF in single-quoted string that began here',
                    left_token)

            elif self.token_kind == Kind.Right:
                # assume Id.Right_SingleQuote
                num_end_tokens += 1
                tokens.append(self.cur_token)

            else:
                raise AssertionError(self.cur_token)

            if self.token_kind != Kind.Right:
                num_end_tokens = 0  # we need three in a ROW

        if expected_end_tokens == 1:
            tokens.pop()
        elif expected_end_tokens == 3:  # Get rid of spurious end tokens
            tokens.pop()
            tokens.pop()
            tokens.pop()

        # Remove space from ''' r''' $''' in both expression mode and command
        # mode
        if left_token.id in (Id.Left_TSingleQuote, Id.Left_RTSingleQuote,
                             Id.Left_UTSingleQuote, Id.Left_BTSingleQuote):
            word_compile.RemoveLeadingSpaceSQ(tokens)

        # Validation after lexing -- same 2 checks as in j8.LexerDecoder
        is_u_string = left_token.id in (Id.Left_USingleQuote,
                                        Id.Left_UTSingleQuote)

        for tok in tokens:
            # u'\yff' is not valid, but b'\yff' is
            if is_u_string and tok.id == Id.Char_YHex:
                p_die(
                    r"%s escapes not allowed in u'' strings" %
                    lexer.TokenVal(tok), tok)

        out_tokens.extend(tokens)
        return self.cur_token

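    # How the end-token counting above plays out (sketch):
    #
    #   'abc'      expected_end_tokens = 1; the single ' ends the string
    #   '''        for triple-quoted strings, expected_end_tokens = 3, and
    #   abc        num_end_tokens resets to 0 whenever a non-Right token
    #   '''        appears, so only three CONSECUTIVE quotes terminate
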
    def _ReadDoubleQuotedLeftParts(self):
        # type: () -> word_part_t
        """Read substitution parts in a double quoted context."""
        if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick):
            return self._ReadCommandSub(self.token_type, d_quoted=True)

        if self.token_type == Id.Left_DollarBrace:
            return self._ReadBracedVarSub(self.cur_token, d_quoted=True)

        if self.token_type == Id.Left_DollarDParen:
            return self._ReadArithSub()

        if self.token_type == Id.Left_DollarBracket:
            return self._ReadExprSub(lex_mode_e.DQ)

        raise AssertionError(self.cur_token)

    def _ReadYshSingleQuoted(self, left_id):
        # type: (Id_t) -> CompoundWord
        """Read YSH style strings:

        r'' u'' b''
        r''' ''' u''' ''' b''' '''
        """
        #log('BEF self.cur_token %s', self.cur_token)
        if left_id == Id.Left_RSingleQuote:
            lexer_mode = lex_mode_e.SQ_Raw
            triple_left_id = Id.Left_RTSingleQuote
        elif left_id == Id.Left_USingleQuote:
            lexer_mode = lex_mode_e.J8_Str
            triple_left_id = Id.Left_UTSingleQuote
        elif left_id == Id.Left_BSingleQuote:
            lexer_mode = lex_mode_e.J8_Str
            triple_left_id = Id.Left_BTSingleQuote
        else:
            raise AssertionError(left_id)

        # Needed for syntax checks
        left_tok = self.cur_token
        left_tok.id = left_id

        sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)

        if (len(sq_part.sval) == 0 and self.lexer.ByteLookAhead() == "'"):
            self._SetNext(lex_mode_e.ShCommand)
            self._GetToken()

            assert self.token_type == Id.Left_SingleQuote
            # HACK: magically transform the third ' in u''' to
            # Id.Left_UTSingleQuote, so that ''' is the terminator
            left_tok = self.cur_token
            left_tok.id = triple_left_id

            # Handles stripping leading whitespace
            sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)

        # Advance and validate
        self._SetNext(lex_mode_e.ShCommand)

        self._GetToken()
        if self.token_kind not in KINDS_THAT_END_WORDS:
            p_die('Unexpected token after YSH single-quoted string',
                  self.cur_token)

        return CompoundWord([sq_part])

    def _ReadUnquotedLeftParts(self, triple_out):
        # type: (Optional[BoolParamBox]) -> word_part_t
        """Read substitutions and quoted strings (for lex_mode_e.ShCommand).

        If triple_out is set, then we try parsing triple-quoted strings,
        and set its value to True if we got one.
        """
        if self.token_type in (Id.Left_DoubleQuote,
                               Id.Left_DollarDoubleQuote):
            # Note: $"" is a synonym for "".  It might make sense if it added
            # \n \0 \x00 \u{123} etc.  But that's not what bash does!
            dq_part = self._ReadDoubleQuoted(self.cur_token)
            # Got empty word "" and there's a " after
            if (triple_out and len(dq_part.parts) == 0 and
                    self.lexer.ByteLookAhead() == '"'):

                self._SetNext(lex_mode_e.ShCommand)
                self._GetToken()
                # HACK: magically transform the third " in """ to
                # Id.Left_TDoubleQuote, so that """ is the terminator
                left_dq_token = self.cur_token
                left_dq_token.id = Id.Left_TDoubleQuote
                triple_out.b = True  # let caller know we got it
                return self._ReadDoubleQuoted(left_dq_token)

            return dq_part

        if self.token_type in (Id.Left_SingleQuote, Id.Left_RSingleQuote,
                               Id.Left_DollarSingleQuote):
            if self.token_type == Id.Left_SingleQuote:
                lexer_mode = lex_mode_e.SQ_Raw
                triple_left_id = Id.Left_TSingleQuote
            elif self.token_type == Id.Left_RSingleQuote:
                lexer_mode = lex_mode_e.SQ_Raw
                triple_left_id = Id.Left_RTSingleQuote
            else:
                lexer_mode = lex_mode_e.SQ_C
                # there is no such thing as $'''
                triple_left_id = Id.Undefined_Tok

            sq_part = self._ReadSingleQuoted(self.cur_token, lexer_mode)

            # Got empty '' or r'' and there's a ' after
            # u'' and b'' are handled in _ReadYshSingleQuoted
            if (triple_left_id != Id.Undefined_Tok and
                    triple_out is not None and len(sq_part.sval) == 0 and
                    self.lexer.ByteLookAhead() == "'"):

                self._SetNext(lex_mode_e.ShCommand)
                self._GetToken()

                # HACK: magically transform the third ' in ''' to
                # Id.Left_TSingleQuote, so that ''' is the terminator
                left_sq_token = self.cur_token
                left_sq_token.id = triple_left_id

                triple_out.b = True  # let caller know we got it
                return self._ReadSingleQuoted(left_sq_token, lexer_mode)

            return sq_part

        if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick,
                               Id.Left_ProcSubIn, Id.Left_ProcSubOut):
            return self._ReadCommandSub(self.token_type, d_quoted=False)

        if self.token_type == Id.Left_DollarBrace:
            return self._ReadBracedVarSub(self.cur_token, d_quoted=False)

        if self.token_type == Id.Left_DollarDParen:
            return self._ReadArithSub()

        if self.token_type == Id.Left_DollarBracket:
            return self._ReadExprSub(lex_mode_e.ShCommand)

        if self.token_type == Id.Left_DollarBraceZsh:
            return self._ReadZshVarSub(self.cur_token)

        raise AssertionError(self.cur_token)

    def _ReadExtGlob(self):
        # type: () -> word_part.ExtGlob
        """
        Grammar:
          Item    = CompoundWord | EPSILON  # important: @(foo|) is allowed
          LEFT    = '@(' | '*(' | '+(' | '?(' | '!('
          RIGHT   = ')'
          ExtGlob = LEFT (Item '|')* Item RIGHT  # ITEM may be empty
          Compound includes ExtGlob
        """
        left_token = self.cur_token
        right_token = None  # type: Token
        arms = []  # type: List[CompoundWord]

        self.lexer.PushHint(Id.Op_RParen, Id.Right_ExtGlob)
        self._SetNext(lex_mode_e.ExtGlob)  # advance past LEFT

        read_word = False  # did we just read a word?  To handle @(||).

        while True:
            self._GetToken()

            if self.token_type == Id.Right_ExtGlob:
                if not read_word:
                    arms.append(CompoundWord([]))
                right_token = self.cur_token
                break

            elif self.token_type == Id.Op_Pipe:
                if not read_word:
                    arms.append(CompoundWord([]))
                read_word = False
                self._SetNext(lex_mode_e.ExtGlob)

            # lex_mode_e.ExtGlob should only produce these 4 kinds of tokens
            elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub,
                                     Kind.ExtGlob):
                w = self._ReadCompoundWord(lex_mode_e.ExtGlob)
                arms.append(w)
                read_word = True

            elif self.token_kind == Kind.Eof:
                p_die('Unexpected EOF reading extended glob that began here',
                      left_token)

            else:
                raise AssertionError(self.cur_token)

        return word_part.ExtGlob(left_token, arms, right_token)

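    # Arm counting in the loop above (sketch):
    #
    #   @(a|b)  -> arms [a, b]
    #   @(a|)   -> arms [a, <empty>]                    # trailing empty Item
    #   @(||)   -> arms [<empty>, <empty>, <empty>]
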
    def _ReadBashRegexGroup(self):
        # type: () -> word_part.BashRegexGroup
        """
        Grammar:
          BashRegexGroup = '(' WORD? ')'
        """
        left_token = self.cur_token
        assert left_token.id == Id.BashRegex_LParen, left_token

        right_token = None  # type: Token
        arms = []  # type: List[CompoundWord]

        self.lexer.PushHint(Id.Op_RParen, Id.Right_BashRegexGroup)
        self._SetNext(lex_mode_e.BashRegexFakeInner)  # advance past LEFT

        self._GetToken()
        if self.token_type == Id.Right_BashRegexGroup:  # empty ()
            return word_part.BashRegexGroup(left_token, None, self.cur_token)

        # lex_mode_e.BashRegex should only produce these 4 kinds of tokens
        if self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub,
                               Kind.BashRegex):
            # Fake lexer mode that translates Id.WS_Space to Id.Lit_Chars,
            # to allow bash style [[ s =~ (a b) ]]
            w = self._ReadCompoundWord(lex_mode_e.BashRegexFakeInner)
            arms.append(w)

            self._GetToken()
            if self.token_type != Id.Right_BashRegexGroup:
                p_die('Expected ) to close bash regex group', self.cur_token)

            return word_part.BashRegexGroup(left_token, w, self.cur_token)

        p_die('Expected word after ( opening bash regex group',
              self.cur_token)

    def _ReadLikeDQ(self, left_token, is_ysh_expr, out_parts):
        # type: (Optional[Token], bool, List[word_part_t]) -> None
        """
        Args:
          left_token: A token if we are reading a double quoted part, or None
            if we're reading a here doc.
          is_ysh_expr: Whether to disallow backticks and invalid char escapes
          out_parts: list of word_part to append to
        """
        if left_token:
            if left_token.id in (Id.Left_TDoubleQuote,
                                 Id.Left_DollarTDoubleQuote):
                expected_end_tokens = 3
            else:
                expected_end_tokens = 1
        else:
            expected_end_tokens = 1000  # here doc will break

        num_end_tokens = 0
        while num_end_tokens < expected_end_tokens:
            self._SetNext(lex_mode_e.DQ)
            self._GetToken()

            if self.token_kind == Kind.Lit:
                if self.token_type == Id.Lit_EscapedChar:
                    tok = self.cur_token
                    ch = lexer.TokenSliceLeft(tok, 1)
                    part = word_part.EscapedLiteral(tok,
                                                    ch)  # type: word_part_t
                else:
                    if self.token_type == Id.Lit_BadBackslash:
                        # echo "\z" is OK in shell, but x = "\z" is a syntax
                        # error in YSH.
                        # Slight hole: We don't catch x = ${undef:-"\z"}
                        # because of the recursion (unless parse_backslash)
                        if (is_ysh_expr or
                                not self.parse_opts.parse_backslash()):
                            p_die(
                                "Invalid char escape in double quoted string (OILS-ERR-12)",
                                self.cur_token)
                    elif self.token_type == Id.Lit_Dollar:
                        if is_ysh_expr or not self.parse_opts.parse_dollar():
                            p_die("Literal $ should be quoted like \$",
                                  self.cur_token)

                    part = self.cur_token
                out_parts.append(part)

            elif self.token_kind == Kind.Left:
                if self.token_type == Id.Left_Backtick and is_ysh_expr:
                    p_die(
                        "Invalid backtick: use $(cmd) or \\` in YSH strings",
                        self.cur_token)

                part = self._ReadDoubleQuotedLeftParts()
                out_parts.append(part)

            elif self.token_kind == Kind.VSub:
                tok = self.cur_token
                part = SimpleVarSub(tok)
                out_parts.append(part)
                # NOTE: parsing "$f(x)" would BREAK CODE.  Could add a mode
                # for it later.

            elif self.token_kind == Kind.Right:
                assert self.token_type == Id.Right_DoubleQuote, self.token_type
                if left_token:
                    num_end_tokens += 1

                # In a here doc, the right quote is literal!
                out_parts.append(self.cur_token)

            elif self.token_kind == Kind.Eof:
                if left_token:
                    p_die(
                        'Unexpected EOF reading double-quoted string that began here',
                        left_token)
                else:  # here docs will have an EOF in their token stream
                    break

            else:
                raise AssertionError(self.cur_token)

            if self.token_kind != Kind.Right:
                num_end_tokens = 0  # """ must be CONSECUTIVE

        if expected_end_tokens == 1:
            out_parts.pop()
        elif expected_end_tokens == 3:
            out_parts.pop()
            out_parts.pop()
            out_parts.pop()

        # Remove space from """ in both expression mode and command mode
        if (left_token and left_token.id
                in (Id.Left_TDoubleQuote, Id.Left_DollarTDoubleQuote)):
            word_compile.RemoveLeadingSpaceDQ(out_parts)

        # Return nothing, since we appended to 'out_parts'

    def _ReadDoubleQuoted(self, left_token):
        # type: (Token) -> DoubleQuoted
        """Helper function for reading "hello $name".

        Args:
          left_token: the opening quote token

        Also used for ${foo%%a b c} -- the argument is treated as double
        quoted until you hit the closing brace.
        """
        parts = []  # type: List[word_part_t]
        self._ReadLikeDQ(left_token, False, parts)

        right_quote = self.cur_token
        return DoubleQuoted(left_token, parts, right_quote)

    def ReadDoubleQuoted(self, left_token, parts):
        # type: (Token, List[word_part_t]) -> Token
        """For expression mode.

        Read var x = "${dir:-}/$name"; etc.
        """
        self._ReadLikeDQ(left_token, True, parts)
        return self.cur_token

    def _ReadCommandSub(self, left_id, d_quoted=False):
        # type: (Id_t, bool) -> CommandSub
        """
        NOTE: This is not in the grammar, because word parts aren't in the
        grammar!

        command_sub = '$(' command_list ')'
                    | '@(' command_list ')'
                    | '<(' command_list ')'
                    | '>(' command_list ')'
                    | ` command_list `
        """
        left_token = self.cur_token

        # Set the lexer in a state so ) becomes the EOF token.
        if left_id in (Id.Left_DollarParen, Id.Left_AtParen,
                       Id.Left_ProcSubIn, Id.Left_ProcSubOut):
            self._SetNext(lex_mode_e.ShCommand)  # advance past $( etc.

            right_id = Id.Eof_RParen
            self.lexer.PushHint(Id.Op_RParen, right_id)
            c_parser = self.parse_ctx.MakeParserForCommandSub(
                self.line_reader, self.lexer, right_id)
            # NOTE: This doesn't use something like main_loop because we
            # don't want to interleave parsing and execution!  Unlike
            # 'source' and 'eval'.
            node = c_parser.ParseCommandSub()

            right_token = c_parser.w_parser.cur_token

        elif left_id == Id.Left_Backtick and self.parse_ctx.do_lossless:
            # NOTE: This is an APPROXIMATE solution for translation ONLY.
            # See test/osh2oil.

            right_id = Id.Eof_Backtick
            self.lexer.PushHint(Id.Left_Backtick, right_id)
            c_parser = self.parse_ctx.MakeParserForCommandSub(
                self.line_reader, self.lexer, right_id)
            node = c_parser.ParseCommandSub()
            right_token = c_parser.w_parser.cur_token

        elif left_id == Id.Left_Backtick:
            if not self.parse_opts.parse_backticks():
                p_die('Use $(cmd) instead of backticks (parse_backticks)',
                      left_token)

            self._SetNext(lex_mode_e.Backtick)  # advance past `

            parts = []  # type: List[str]
            while True:
                self._GetToken()
                #log("TOK %s", self.cur_token)

                if self.token_type == Id.Backtick_Quoted:
                    # Remove leading \
                    parts.append(lexer.TokenSliceLeft(self.cur_token, 1))

                elif self.token_type == Id.Backtick_DoubleQuote:
                    # Compatibility: If backticks are double quoted, then
                    # double quotes within them have to be \".
                    # Shells aren't smart enough to match nested " and `
                    # quotes (but OSH is)
                    if d_quoted:
                        # Remove leading \
                        parts.append(lexer.TokenSliceLeft(self.cur_token, 1))
                    else:
                        parts.append(lexer.TokenVal(self.cur_token))

                elif self.token_type == Id.Backtick_Other:
                    parts.append(lexer.TokenVal(self.cur_token))

                elif self.token_type == Id.Backtick_Right:
                    break

                elif self.token_type == Id.Eof_Real:
                    # Note: this parse error is in the ORIGINAL context.  No
                    # code_str yet.
                    p_die(
                        'Unexpected EOF while looking for closing backtick',
                        left_token)

                else:
                    raise AssertionError(self.cur_token)

                self._SetNext(lex_mode_e.Backtick)

            # Calculate right SPID on CommandSub BEFORE re-parsing.
            right_token = self.cur_token

            code_str = ''.join(parts)
            #log('code %r', code_str)

            # NOTE: This is similar to how we parse aliases in
            # osh/cmd_parse.py.  It won't have the same location info as
            # MakeParserForCommandSub(), because the lexer is different.
            arena = self.parse_ctx.arena
            #arena = alloc.Arena()
            line_reader = reader.StringLineReader(code_str, arena)
            c_parser = self.parse_ctx.MakeOshParser(line_reader)
            src = source.Reparsed('backticks', left_token, right_token)
            with alloc.ctx_SourceCode(arena, src):
                node = c_parser.ParseCommandSub()

        else:
            raise AssertionError(left_id)

        return CommandSub(left_token, node, right_token)

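    # What the backtick branch above reparses (sketch):
    #
    #   `echo hi`       -> code_str 'echo hi'
    #   `echo \`date\`` -> \` becomes ` (Id.Backtick_Quoted)
    #   "`echo \"hi\"`" -> \" becomes " because d_quoted is True
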
    def _ReadExprSub(self, lex_mode):
        # type: (lex_mode_t) -> word_part.ExprSub
        """$[d->key] $[obj.method()] etc."""
        left_token = self.cur_token

        self._SetNext(lex_mode_e.Expr)
        enode, right_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.ysh_expr_sub)

        self._SetNext(lex_mode)  # Move past ]
        return word_part.ExprSub(left_token, enode, right_token)

    def ParseVarDecl(self, kw_token):
        # type: (Token) -> command.VarDecl
        """
        oil_var_decl: name_type_list '=' testlist end_stmt

        Note that assignments must end with \n ; } or EOF.  Unlike shell
        assignments, we disallow:

        var x = 42 | wc -l
        var x = 42 && echo hi
        """
        self._SetNext(lex_mode_e.Expr)
        enode, last_token = self.parse_ctx.ParseVarDecl(kw_token, self.lexer)
        # Hack to move } from what the Expr lexer modes gives to what
        # CommandParser wants
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace

        # Let the CommandParser see the Op_Semi or Op_Newline.
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)  # always back to this
        return enode

    def ParseMutation(self, kw_token, var_checker):
        # type: (Token, VarChecker) -> command.Mutation
        """
        setvar i = 42
        setvar i += 1
        setvar a[i] = 42
        setvar a[i] += 1
        setvar d.key = 42
        setvar d.key += 1
        """
        self._SetNext(lex_mode_e.Expr)
        enode, last_token = self.parse_ctx.ParseMutation(kw_token, self.lexer)
        # Hack to move } from what the Expr lexer modes gives to what
        # CommandParser wants
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace

        for lhs in enode.lhs:
            UP_lhs = lhs
            with tagswitch(lhs) as case:
                if case(y_lhs_e.Var):
                    lhs = cast(Token, UP_lhs)
                    var_checker.Check(kw_token.id, lexer.LazyStr(lhs), lhs)

                # Note: this does not cover cases like
                #   setvar (a[0])[1] = v
                #   setvar (d.key).other = v
                # This leaks into catching all typos statically, which may be
                # possible if 'use' makes all names explicit.
                elif case(y_lhs_e.Subscript):
                    lhs = cast(Subscript, UP_lhs)
                    if lhs.obj.tag() == expr_e.Var:
                        v = cast(expr.Var, lhs.obj)
                        var_checker.Check(kw_token.id, v.name, v.left)

                elif case(y_lhs_e.Attribute):
                    lhs = cast(Attribute, UP_lhs)
                    if lhs.obj.tag() == expr_e.Var:
                        v = cast(expr.Var, lhs.obj)
                        var_checker.Check(kw_token.id, v.name, v.left)

        # Let the CommandParser see the Op_Semi or Op_Newline.
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)  # always back to this
        return enode

    def ParseBareDecl(self):
        # type: () -> expr_t
        """
        x = {name: val}
        """
        self._SetNext(lex_mode_e.Expr)
        self._GetToken()
        enode, last_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.command_expr)
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)
        return enode

    def ParseYshExprForCommand(self):
        # type: () -> expr_t

        # Fudge for this case:
        #  for x in(y) {
        # versus
        #  for x in (y) {
        #
        # In the former case, ReadWord on 'in' puts the lexer past (.
        # Also see LookPastSpace in CommandParser.
        # A simpler solution would be nicer.

        if self.token_type == Id.Op_LParen:
            self.lexer.MaybeUnreadOne()

        enode, _ = self.parse_ctx.ParseYshExpr(self.lexer,
                                               grammar_nt.ysh_expr)

        self._SetNext(lex_mode_e.ShCommand)
        return enode

    def ParseCommandExpr(self):
        # type: () -> expr_t
        """
        = 1+2
        """
        enode, last_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.command_expr)

        # In some cases, such as the case statement, we expect *the lexer*
        # to be pointing at the token right after the expression.  But the
        # expression parser must have read to the `last_token`.  Unreading
        # places the lexer back in the expected state.  I.e.:
        #
        # case (x) {                           case (x) {
        #   (else) { = x }                       (else) { = x }
        #                ^ The lexer is here                  ^ Unread to here
        # }                                    }
        assert last_token.id in (Id.Op_Newline, Id.Eof_Real, Id.Op_Semi,
                                 Id.Op_RBrace), last_token
        if last_token.id != Id.Eof_Real:
            # Eof_Real is the only token we cannot unread
            self.lexer.MaybeUnreadOne()

        return enode

    def ParseProc(self, node):
        # type: (Proc) -> None

        # proc name-with-hyphens() must be accepted
        self._SetNext(lex_mode_e.ShCommand)
        self._GetToken()
        # example: 'proc f[' gets you Lit_ArrayLhsOpen
        if self.token_type != Id.Lit_Chars:
            p_die('Invalid proc name %s' % ui.PrettyToken(self.cur_token),
                  self.cur_token)

        # TODO: validate this more.  Disallow proc 123 { }, which isn't
        # disallowed for shell functions.  Similar to IsValidVarName().
        node.name = self.cur_token

        last_token = self.parse_ctx.ParseProc(self.lexer, node)

        # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
        assert last_token.id == Id.Op_LBrace
        last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        self._SetNext(lex_mode_e.ShCommand)

    def ParseFunc(self, node):
        # type: (Func) -> None
        last_token = self.parse_ctx.ParseFunc(self.lexer, node)

        # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
        assert last_token.id == Id.Op_LBrace
        last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        self._SetNext(lex_mode_e.ShCommand)

    def ParseYshCasePattern(self):
        # type: () -> Tuple[pat_t, Token]
        pat, left_tok, last_token = self.parse_ctx.ParseYshCasePattern(
            self.lexer)

        if last_token.id == Id.Op_LBrace:
            last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        return pat, left_tok

    def NewlineOkForYshCase(self):
        # type: () -> Id_t
        """Check for an optional newline and consume it.

        This is a special case of `_NewlineOk` which fixes some "off-by-one"
        issues that crop up while parsing YSH case arms.  For more details,
        see #oil-dev > Progress On YSH Case Grammar on Zulip.

        Returns a token id which is filled with the choice of

        word   { echo word }
        (3)    { echo expr }
        /e/    { echo eggex }
        }      # right brace
        """
        while True:
            next_id = self.lexer.LookAheadOne(lex_mode_e.Expr)

            # Cannot lookahead past lines
            if next_id == Id.Unknown_Tok:
                if not self.lexer.MoveToNextLine():  # Try to move to next line
                    break  # EOF
                continue

            next_kind = consts.GetKind(next_id)
            if next_id != Id.Op_Newline and next_kind != Kind.Ignored:
                break

            self.lexer.Read(lex_mode_e.Expr)

        if next_id in (Id.Op_RBrace, Id.Op_LParen, Id.Arith_Slash):
            self._SetNext(lex_mode_e.Expr)  # Continue in expression mode
        else:
            # Consume the trailing Op_Newline
            self._SetNext(lex_mode_e.ShCommand)
            self._GetToken()

        return next_id

    def _ReadArithExpr(self, end_id):
        # type: (Id_t) -> arith_expr_t
        """Read and parse an arithmetic expression in various contexts.

        $(( 1+2 ))
        (( a=1+2 ))
        ${a[ 1+2 ]}
        ${a : 1+2 : 1+2}

        See tests/arith-context.test.sh for ambiguous cases.

        ${a[a[0]]} is valid  # VS_RBRACKET vs Id.Arith_RBracket

        ${s : a<b?0:1 : 1}   # VS_COLON vs Id.Arith_Colon

        See the assertion in ArithParser.Parse() -- unexpected extra input.
        """
        # calls self.ReadWord(lex_mode_e.Arith)
        anode = self.a_parser.Parse()
        cur_id = self.a_parser.CurrentId()
        if end_id != Id.Undefined_Tok and cur_id != end_id:
            p_die(
                'Unexpected token after arithmetic expression (%s != %s)' %
                (ui.PrettyId(cur_id), ui.PrettyId(end_id)),
                loc.Word(self.a_parser.cur_word))
        return anode

    def _ReadArithSub(self):
        # type: () -> word_part.ArithSub
        """Read an arith substitution, which contains an arith expression,
        e.g. $((a + 1)).
        """
        left_tok = self.cur_token

        # The second ) needs to be disambiguated in stuff like:
        #   $(echo $(( 1+2 )) )
        self.lexer.PushHint(Id.Op_RParen, Id.Right_DollarDParen)

        # NOTE: To disambiguate $(( as arith sub vs. command sub and
        # subshell, we could save the lexer/reader state here, and retry if
        # the arithmetic parse fails.  But we can almost always catch this at
        # parse time.  There could be some exceptions like:
        #   $((echo * foo))  # looks like multiplication
        #   $((echo / foo))  # looks like division

        # $(( )) is valid
        anode = arith_expr.EmptyZero  # type: arith_expr_t

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            anode = self._ReadArithExpr(Id.Arith_RParen)

        self._SetNext(lex_mode_e.ShCommand)

        # Ensure we get closing )
        self._GetToken()
        if self.token_type != Id.Right_DollarDParen:
            p_die('Expected second ) to end arith sub', self.cur_token)

        right_tok = self.cur_token
        return word_part.ArithSub(left_tok, anode, right_tok)

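    # Sketch of the hint above: in $(echo $(( 1+2 )) ), the PushHint makes
    # the ) that closes the arith sub lex as Id.Right_DollarDParen, so the
    # final ) still terminates the enclosing command sub.
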
    def ReadDParen(self):
        # type: () -> Tuple[arith_expr_t, Token]
        """Read ((1+ 2)) -- command context.

        We're using the word parser because it's very similar to
        _ReadArithExpr above.

        This also returns the terminating Id.Op_DRightParen token for
        location info.
        """
        # (( )) is valid
        anode = arith_expr.EmptyZero  # type: arith_expr_t

        self.lexer.PushHint(Id.Op_RParen, Id.Op_DRightParen)

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            anode = self._ReadArithExpr(Id.Arith_RParen)

        self._SetNext(lex_mode_e.ShCommand)

        # Ensure we get the second )
        self._GetToken()
        right = self.cur_token
        if right.id != Id.Op_DRightParen:
            p_die('Expected second ) to end arith statement', right)

        self._SetNext(lex_mode_e.ShCommand)

        return anode, right

    def _NextNonSpace(self):
        # type: () -> None
        """Advance in lex_mode_e.Arith until a non-space token.

        Same logic as _ReadWord, but used in
           $(( ))
           (( ))
           for (( ))

        You can read self.token_type after this, without calling _GetToken.
        """
        while True:
            self._SetNext(lex_mode_e.Arith)
            self._GetToken()
            if self.token_kind not in (Kind.Ignored, Kind.WS):
                break

    def ReadForExpression(self):
        # type: () -> command.ForExpr
        """Read ((i=0; i<5; ++i)) -- part of command context."""
        self._NextNonSpace()  # skip over ((
        cur_id = self.token_type  # for end of arith expressions

        if cur_id == Id.Arith_Semi:  # for (( ; i < 10; i++ ))
            init_node = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            init_node = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()
        self._NextNonSpace()

        # It's odd to keep track of both cur_id and self.token_type in this
        # function, but it works, and is tested in 'test/parse_error.sh
        # arith-integration'
        if cur_id != Id.Arith_Semi:  # for (( x=0 b; ... ))
            p_die("Expected ; here", loc.Word(self.a_parser.cur_word))

        self._GetToken()
        cur_id = self.token_type

        if cur_id == Id.Arith_Semi:  # for (( ; ; i++ ))
            # empty condition is TRUE
            cond_node = arith_expr.EmptyOne  # type: arith_expr_t
        else:
            cond_node = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()

        if cur_id != Id.Arith_Semi:  # for (( x=0; x<5 b ))
            p_die("Expected ; here", loc.Word(self.a_parser.cur_word))

        self._NextNonSpace()
        if self.token_type == Id.Arith_RParen:  # for (( ; ; ))
            update_node = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            update_node = self._ReadArithExpr(Id.Arith_RParen)

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            p_die('Expected ) to end for loop expression', self.cur_token)
        self._SetNext(lex_mode_e.ShCommand)

        # redirects is None, will be assigned in CommandEvaluator
        node = command.ForExpr.CreateNull()
        node.init = init_node
        node.cond = cond_node
        node.update = update_node
        return node

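    # Defaults for omitted sections in the loop header above (sketch):
    #
    #   for (( ; i < 3; ))   init = EmptyZero, update = EmptyZero
    #   for (( ; ; ))        cond = EmptyOne, i.e. an infinite loop
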
    def _ReadArrayLiteral(self):
        # type: () -> word_part_t
        """a=(1 2 3)

        TODO: See osh/cmd_parse.py:164 for Id.Lit_ArrayLhsOpen, for a[x++]=1

        We want:

        A=(['x']=1 ["x"]=2 [$x$y]=3)

        Maybe allow this as a literal string?  Because I think I've seen it
        before?  Or maybe force people to patch to learn the rule.

        A=([x]=4)

        Starts with Lit_Other '[', and then it has Lit_ArrayLhsClose.
        Maybe enforce that ALL have keys or NONE of them have keys.
        """
        self._SetNext(lex_mode_e.ShCommand)  # advance past (
        self._GetToken()
        if self.cur_token.id != Id.Op_LParen:
            p_die('Expected ( after =', self.cur_token)
        left_token = self.cur_token
        right_token = None  # type: Token

        # MUST use a new word parser (with same lexer).
        w_parser = self.parse_ctx.MakeWordParser(self.lexer,
                                                 self.line_reader)
        words = []  # type: List[CompoundWord]
        done = False
        while not done:
            w = w_parser.ReadWord(lex_mode_e.ShCommand)
            with tagswitch(w) as case:
                if case(word_e.Operator):
                    tok = cast(Token, w)
                    if tok.id == Id.Right_ShArrayLiteral:
                        right_token = tok
                        done = True  # can't use break here
                    # Unlike command parsing, array parsing allows embedded
                    # \n.
                    elif tok.id == Id.Op_Newline:
                        continue
                    else:
                        p_die('Unexpected token in array literal',
                              loc.Word(w))

                elif case(word_e.Compound):
                    words.append(cast(CompoundWord, w))

                else:
                    raise AssertionError()

        if len(words) == 0:  # a=() is empty indexed array
            # Needed for type safety, doh
            no_words = []  # type: List[word_t]
            node = ShArrayLiteral(left_token, no_words, right_token)
            return node

        pairs = []  # type: List[AssocPair]
        # If the first one is a key/value pair, then the rest are assumed to
        # be.
        pair = word_.DetectAssocPair(words[0])
        if pair:
            pairs.append(pair)

            n = len(words)
            for i in xrange(1, n):
                w2 = words[i]
                pair = word_.DetectAssocPair(w2)
                if not pair:
                    p_die("Expected associative array pair", loc.Word(w2))

                pairs.append(pair)

            # invariant List?
            return word_part.BashAssocLiteral(left_token, pairs, right_token)

        # Brace detection for arrays but NOT associative arrays
        words2 = braces.BraceDetectAll(words)
        words3 = word_.TildeDetectAll(words2)
        return ShArrayLiteral(left_token, words3, right_token)

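    # Sketch of the three outcomes above:
    #
    #   a=()             ShArrayLiteral with no words
    #   a=(1 2 {3,4})    ShArrayLiteral, after brace and tilde detection
    #   A=([k]=v [j]=w)  BashAssocLiteral; if the first word is a key/value
    #                    pair, every later word must be one too
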
1684 def ParseProcCallArgs(self, start_symbol):
1685 # type: (int) -> ArgList
1686 """ json write (x) """
1687 self.lexer.MaybeUnreadOne()
1688
1689 arg_list = ArgList.CreateNull(alloc_lists=True)
1690 arg_list.left = self.cur_token
1691 self.parse_ctx.ParseProcCallArgs(self.lexer, arg_list, start_symbol)
1692 return arg_list
1693
1694 def _MaybeReadWordPart(self, is_first, lex_mode, parts):
1695 # type: (bool, lex_mode_t, List[word_part_t]) -> bool
1696 """Helper for _ReadCompoundWord3."""
1697 done = False
1698
1699 if self.token_type == Id.Lit_EscapedChar:
1700 tok = self.cur_token
1701 assert tok.length == 2
1702 ch = lexer.TokenSliceLeft(tok, 1)
1703 if not self.parse_opts.parse_backslash():
1704 if not pyutil.IsValidCharEscape(ch):
1705 p_die('Invalid char escape in unquoted word (OILS-ERR-13)',
1706 self.cur_token)
1707
1708 part = word_part.EscapedLiteral(self.cur_token,
1709 ch) # type: word_part_t
1710 else:
1711 part = self.cur_token
1712
1713 if is_first and self.token_type == Id.Lit_VarLike: # foo=
1714 parts.append(part)
1715 # Unfortunately it's awkward to pull the check for a=(1 2) up to
1716 # _ReadWord.
1717 next_id = self.lexer.LookPastSpace(lex_mode)
1718 if next_id == Id.Op_LParen:
1719 self.lexer.PushHint(Id.Op_RParen, Id.Right_ShArrayLiteral)
1720 part2 = self._ReadArrayLiteral()
1721 parts.append(part2)
1722
1723 # Array literal must be the last part of the word.
1724 self._SetNext(lex_mode)
1725 self._GetToken()
1726 # EOF, whitespace, newline, Right_Subshell
1727 if self.token_kind not in KINDS_THAT_END_WORDS:
1728 p_die('Unexpected token after array literal',
1729 self.cur_token)
1730 done = True
1731
1732 elif (is_first and self.parse_opts.parse_at() and
1733 self.token_type == Id.Lit_Splice):
1734
1735 splice_tok = self.cur_token
1736 part2 = word_part.Splice(splice_tok,
1737 lexer.TokenSliceLeft(splice_tok, 1))
1738
1739 parts.append(part2)
1740
1741 # @words must be the last part of the word
1742 self._SetNext(lex_mode)
1743 self._GetToken()
1744 # EOF, whitespace, newline, Right_Subshell
1745 if self.token_kind not in KINDS_THAT_END_WORDS:
1746 p_die('Unexpected token after array splice', self.cur_token)
1747 done = True
1748
1749 elif (is_first and self.parse_opts.parse_at() and
1750 self.token_type == Id.Lit_AtLBracket): # @[split(x)]
1751 part2 = self._ReadExprSub(lex_mode_e.DQ)
1752 parts.append(part2)
1753
1754 # @[split(x)]
1755 self._SetNext(lex_mode)
1756 self._GetToken()
1757 # EOF, whitespace, newline, Right_Subshell
1758 if self.token_kind not in KINDS_THAT_END_WORDS:
1759 p_die('Unexpected token after Expr splice', self.cur_token)
1760 done = True
1761
1762 elif (is_first and self.parse_opts.parse_at() and
1763 self.token_type == Id.Lit_AtLBraceDot):
1764 p_die('TODO: @{.myproc builtin sub}', self.cur_token)
1765
1766 elif (is_first and self.parse_opts.parse_at_all() and
1767 self.token_type == Id.Lit_At):
1768            # Because $[x], ${x}, and perhaps $/x/ are reserved, it makes
1769            # sense for @ at the beginning of a word to be reserved too.
1770
1771            # Although, should we relax 'echo @'?  I'm tempted to have a
1772            # shortcut for @_argv.
1773 p_die('Literal @ starting a word must be quoted (parse_at_all)',
1774 self.cur_token)
1775
1776 else:
1777 # not a literal with lookahead; append it
1778 parts.append(part)
1779
1780 return done
1781
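    # Rough consequences of the "must be the last part" checks above,
    # assuming parse_at is on for the @ forms:
    #
    #   a=(1 2)         OK: the array literal ends the word
    #   a=(1 2)x        error: 'Unexpected token after array literal'
    #   echo @words     OK: the splice is the whole word
    #   echo @words.x   error: 'Unexpected token after array splice'
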
1782 def _ReadCompoundWord(self, lex_mode):
1783 # type: (lex_mode_t) -> CompoundWord
1784 return self._ReadCompoundWord3(lex_mode, Id.Undefined_Tok, True)
1785
1786 def _ReadCompoundWord3(self, lex_mode, eof_type, empty_ok):
1787 # type: (lex_mode_t, Id_t, bool) -> CompoundWord
1788 """
1789 Precondition: Looking at the first token of the first word part
1790 Postcondition: Looking at the token after, e.g. space or operator
1791
1792 NOTE: eof_type is necessary because / is a literal, i.e. Lit_Slash, but it
1793 could be an operator delimiting a compound word. Can we change lexer modes
1794 and remove this special case?
1795 """
1796 w = CompoundWord([])
1797 num_parts = 0
1798 brace_count = 0
1799 done = False
1800 is_triple_quoted = None # type: Optional[BoolParamBox]
1801
1802 while not done:
1803 self._GetToken()
1804
1805 allow_done = empty_ok or num_parts != 0
1806 if allow_done and self.token_type == eof_type:
1807 done = True # e.g. for ${foo//pat/replace}
1808
1809 # Keywords like "for" are treated like literals
1810 elif self.token_kind in (Kind.Lit, Kind.History, Kind.KW,
1811 Kind.ControlFlow, Kind.BoolUnary,
1812 Kind.BoolBinary):
1813
1814 # Syntax error for { and }
1815 if self.token_type == Id.Lit_LBrace:
1816 brace_count += 1
1817 elif self.token_type == Id.Lit_RBrace:
1818 brace_count -= 1
1819 elif self.token_type == Id.Lit_Dollar:
1820 if not self.parse_opts.parse_dollar():
1821 if num_parts == 0 and lex_mode == lex_mode_e.ShCommand:
1822 next_byte = self.lexer.ByteLookAhead()
1823 # TODO: switch lexer modes and parse $/d+/. But not ${a:-$/d+/}
1824 if next_byte == '/':
1825 #log('next_byte %r', next_byte)
1826 pass
1827
1828 p_die('Literal $ should be quoted like \$',
1829 self.cur_token)
1830
1831 done = self._MaybeReadWordPart(num_parts == 0, lex_mode,
1832 w.parts)
1833
1834 elif self.token_kind == Kind.VSub:
1835 vsub_token = self.cur_token
1836
1837 part = SimpleVarSub(vsub_token) # type: word_part_t
1838 w.parts.append(part)
1839
1840 elif self.token_kind == Kind.ExtGlob:
1841 # If parse_at, we can take over @( to start @(seq 3)
1842                # Users can also use ,(*.py|*.sh)
1843 if (self.parse_opts.parse_at() and
1844 self.token_type == Id.ExtGlob_At and num_parts == 0):
1845 cs_part = self._ReadCommandSub(Id.Left_AtParen,
1846 d_quoted=False)
1847 # RARE mutation of tok.id!
1848 cs_part.left_token.id = Id.Left_AtParen
1849 part = cs_part # for type safety
1850
1851 # Same check as _MaybeReadWordPart. @(seq 3)x is illegal, just like
1852 # a=(one two)x and @arrayfunc(3)x.
1853 self._GetToken()
1854 if self.token_kind not in KINDS_THAT_END_WORDS:
1855 p_die('Unexpected token after @()', self.cur_token)
1856 done = True
1857
1858 else:
1859 part = self._ReadExtGlob()
1860 w.parts.append(part)
1861
1862 elif self.token_kind == Kind.BashRegex:
1863 if self.token_type == Id.BashRegex_LParen: # Opening (
1864 part = self._ReadBashRegexGroup()
1865 w.parts.append(part)
1866 else:
1867 assert self.token_type == Id.BashRegex_AllowedInParens
1868 p_die('Invalid token in bash regex', self.cur_token)
1869
1870 elif self.token_kind == Kind.Left:
1871 try_triple_quote = (self.parse_opts.parse_triple_quote() and
1872 lex_mode == lex_mode_e.ShCommand and
1873 num_parts == 0)
1874
1875 # Save allocation
1876 if try_triple_quote:
1877 is_triple_quoted = BoolParamBox(False)
1878
1879 part = self._ReadUnquotedLeftParts(is_triple_quoted)
1880 w.parts.append(part)
1881
1882 # NOT done yet, will advance below
1883 elif self.token_kind == Kind.Right:
1884 # Still part of the word; will be done on the next iter.
1885 if self.token_type == Id.Right_DoubleQuote:
1886 pass
1887 # Never happens, no PushHint for this case.
1888 #elif self.token_type == Id.Right_DollarParen:
1889 # pass
1890 elif self.token_type == Id.Right_Subshell:
1891 # LEXER HACK for (case x in x) ;; esac )
1892 # Rewind before it's used
1893 assert self.next_lex_mode == lex_mode_e.Undefined
1894 if self.lexer.MaybeUnreadOne():
1895 self.lexer.PushHint(Id.Op_RParen, Id.Right_Subshell)
1896 self._SetNext(lex_mode)
1897 done = True
1898 else:
1899 done = True
1900
1901 elif self.token_kind == Kind.Ignored:
1902 done = True
1903
1904 else:
1905                # LEXER HACK for an unbalanced case clause. 'case foo in esac' is
1906                # valid, so while testing for ESAC we may read ) before getting a
1907                # chance to PushHint(Id.Op_RParen, Id.Right_CasePat). So here we
1908                # unread one token and do it again.
1909
1910 # We get Id.Op_RParen at top level: case x in x) ;; esac
1911 # We get Id.Eof_RParen inside ComSub: $(case x in x) ;; esac )
1912 if self.token_type in (Id.Op_RParen, Id.Eof_RParen):
1913 # Rewind before it's used
1914 assert self.next_lex_mode == lex_mode_e.Undefined
1915 if self.lexer.MaybeUnreadOne():
1916 if self.token_type == Id.Eof_RParen:
1917 # Redo translation
1918 self.lexer.PushHint(Id.Op_RParen, Id.Eof_RParen)
1919 self._SetNext(lex_mode)
1920
1921 done = True # anything we don't recognize means we're done
1922
1923 if not done:
1924 self._SetNext(lex_mode)
1925 num_parts += 1
1926
1927 if (self.parse_opts.parse_brace() and num_parts > 1 and
1928 brace_count != 0):
1929 # accept { and }, but not foo{
1930 p_die(
1931 'Word has unbalanced { }. Maybe add a space or quote it like \{',
1932 loc.Word(w))
1933
1934 if is_triple_quoted and is_triple_quoted.b and num_parts > 1:
1935 p_die('Unexpected parts after triple quoted string',
1936 loc.WordPart(w.parts[-1]))
1937
1938 if 0:
1939 from _devbuild.gen.syntax_asdl import word_part_str
1940 word_key = ' '.join(word_part_str(p.tag()) for p in w.parts)
1941 WORD_HIST[word_key] += 1
1942 return w
1943
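    # For example, the input  foo${x}"bar"  becomes one CompoundWord with
    # three parts: a Lit_Chars token, a BracedVarSub, and a DoubleQuoted.
    # And eof_type handles cases like ${foo//pat/replace}, where Lit_Slash
    # must terminate the 'pat' word instead of joining it.
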
1944 def _ReadArithWord(self):
1945 # type: () -> Optional[word_t]
1946        """Helper for ReadArithWord()."""
1947 self._GetToken()
1948
1949 if self.token_kind == Kind.Unknown:
1950            # e.g. this happened during dynamic parsing of unset 'a[$foo]' in gherkin
1951 p_die(
1952 'Unexpected token while parsing arithmetic: %r' %
1953 lexer.TokenVal(self.cur_token), self.cur_token)
1954
1955 elif self.token_kind == Kind.Eof:
1956 return self.cur_token
1957
1958 elif self.token_kind == Kind.Ignored:
1959 # Space should be ignored.
1960 self._SetNext(lex_mode_e.Arith)
1961 return None
1962
1963 elif self.token_kind in (Kind.Arith, Kind.Right):
1964 # Id.Right_DollarDParen IS just a normal token, handled by ArithParser
1965 self._SetNext(lex_mode_e.Arith)
1966 return self.cur_token
1967
1968 elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub):
1969 return self._ReadCompoundWord(lex_mode_e.Arith)
1970
1971 else:
1972 raise AssertionError(self.cur_token)
1973
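    # So for the input  $(( 1 + x )) , roughly: the spaces are Kind.Ignored
    # and yield None (the caller retries), '+' is Kind.Arith and is returned
    # directly, while '1' and 'x' start compound words read in
    # lex_mode_e.Arith.
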
1974 def _ReadWord(self, word_mode):
1975 # type: (lex_mode_t) -> Optional[word_t]
1976 """Helper function for ReadWord()."""
1977
1978 # Change the pseudo lexer mode to a real lexer mode
1979 if word_mode == lex_mode_e.ShCommandFakeBrack:
1980 lex_mode = lex_mode_e.ShCommand
1981 else:
1982 lex_mode = word_mode
1983
1984 self._GetToken()
1985
1986 if self.token_kind == Kind.Eof:
1987 # No advance
1988 return self.cur_token
1989
1990 # Allow Arith for ) at end of for loop?
1991 elif self.token_kind in (Kind.Op, Kind.Redir, Kind.Arith):
1992 self._SetNext(lex_mode)
1993
1994 # Newlines are complicated. See 3x2 matrix in the comment about
1995 # self.multiline and self.newline_state above.
1996 if self.token_type == Id.Op_Newline:
1997 if self.multiline:
1998 if self.newline_state > 1:
1999 # This points at a blank line, but at least it gives the line number
2000 p_die('Invalid blank line in multiline mode',
2001 self.cur_token)
2002 return None
2003
2004 if self.returned_newline: # skip
2005 return None
2006
2007 return self.cur_token
2008
2009 elif self.token_kind == Kind.Right:
2010 if self.token_type not in (Id.Right_Subshell, Id.Right_ShFunction,
2011 Id.Right_CasePat,
2012 Id.Right_ShArrayLiteral):
2013 raise AssertionError(self.cur_token)
2014
2015 self._SetNext(lex_mode)
2016 return self.cur_token
2017
2018 elif self.token_kind in (Kind.Ignored, Kind.WS):
2019 self._SetNext(lex_mode)
2020 return None
2021
2022 else:
2023 assert self.token_kind in (Kind.VSub, Kind.Lit, Kind.History,
2024 Kind.Left, Kind.KW, Kind.ControlFlow,
2025 Kind.BoolUnary, Kind.BoolBinary,
2026 Kind.ExtGlob,
2027 Kind.BashRegex), 'Unhandled token kind'
2028
2029 if (word_mode == lex_mode_e.ShCommandFakeBrack and
2030 self.parse_opts.parse_bracket() and
2031 self.token_type == Id.Lit_LBracket):
2032 # Change [ from Kind.Lit -> Kind.Op
2033 # So CommandParser can treat
2034 # assert [42 === x]
2035 # like
2036 # json write (x)
2037 bracket_word = self.cur_token
2038 bracket_word.id = Id.Op_LBracket
2039
2040 self._SetNext(lex_mode)
2041 return bracket_word
2042
2043 # We're beginning a word. If we see Id.Lit_Pound, change to
2044 # lex_mode_e.Comment and read until end of line.
2045 if self.token_type == Id.Lit_Pound:
2046 self._SetNext(lex_mode_e.Comment)
2047 self._GetToken()
2048
2049 # NOTE: The # could be the last character in the file. It can't be
2050 # Eof_{RParen,Backtick} because #) and #` are comments.
2051 assert self.token_type in (Id.Ignored_Comment, Id.Eof_Real), \
2052 self.cur_token
2053
2054 # The next iteration will go into Kind.Ignored and set lex state to
2055 # lex_mode_e.ShCommand/etc.
2056 return None # tell ReadWord() to try again after comment
2057
2058 elif self.token_type == Id.Lit_TPound: ### doc comment
2059 self._SetNext(lex_mode_e.Comment)
2060 self._GetToken()
2061
2062 if self.token_type == Id.Ignored_Comment and self.emit_doc_token:
2063 return self.cur_token
2064
2065 return None # tell ReadWord() to try again after comment
2066
2067 else:
2068 # r'' u'' b''
2069 if (self.token_type == Id.Lit_Chars and
2070 self.lexer.LookAheadOne(
2071 lex_mode_e.ShCommand) == Id.Left_SingleQuote):
2072
2073                # When shopt -s parse_ysh_string:
2074 # echo r'hi' is like echo 'hi'
2075 #
2076 # echo u'\u{3bc}' b'\yff' works
2077
2078 tok = self.cur_token
2079 if self.parse_opts.parse_ysh_string():
2080 if lexer.TokenEquals(tok, 'r'):
2081 left_id = Id.Left_RSingleQuote
2082 elif lexer.TokenEquals(tok, 'u'):
2083 left_id = Id.Left_USingleQuote
2084 elif lexer.TokenEquals(tok, 'b'):
2085 left_id = Id.Left_BSingleQuote
2086 else:
2087 left_id = Id.Undefined_Tok
2088
2089 if left_id != Id.Undefined_Tok:
2090 # skip the r, and then 'foo' will be read as normal
2091 self._SetNext(lex_mode_e.ShCommand)
2092
2093 self._GetToken()
2094 assert self.token_type == Id.Left_SingleQuote, self.token_type
2095
2096 # Read the word in a different lexer mode
2097 return self._ReadYshSingleQuoted(left_id)
2098
2099 return self._ReadCompoundWord(lex_mode)
2100
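    # The r/u/b lookahead above enables these YSH forms (examples assume
    # parse_ysh_string is on, as it is in YSH itself):
    #
    #   echo r'c:\dir'    # raw: backslashes are literal
    #   echo u'\u{3bc}'   # unicode escape
    #   echo b'\yff'      # byte escape
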
2101 def ParseVarRef(self):
2102 # type: () -> BracedVarSub
2103 """DYNAMIC parsing of what's inside ${!ref}
2104
2105 # Same as VarOf production
2106 VarRefExpr = VarOf EOF
2107 """
2108 self._SetNext(lex_mode_e.VSub_1)
2109
2110 self._GetToken()
2111 if self.token_kind != Kind.VSub:
2112 p_die('Expected var name', self.cur_token)
2113
2114 part = self._ParseVarOf()
2115 # NOTE: no ${ } means no part.left and part.right
2116 part.left = part.token # cheat to make test pass
2117 part.right = part.token
2118
2119 self._GetToken()
2120 if self.token_type != Id.Eof_Real:
2121 p_die('Expected end of var ref expression', self.cur_token)
2122 return part
2123
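    # This is the dynamic half of bash-style ${!ref}: the *value* of ref is
    # re-parsed as a variable reference at runtime:
    #
    #   x=foo; foo=bar
    #   echo ${!x}    # => bar
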
2124 def LookPastSpace(self):
2125 # type: () -> Id_t
2126 """Look ahead to the next token.
2127
2128 For the CommandParser to recognize
2129 array= (1 2 3)
2130 YSH for ( versus bash for ((
2131 YSH if ( versus if test
2132 YSH while ( versus while test
2133 YSH bare assignment 'grep =' versus 'grep foo'
2134 """
2135 assert self.token_type != Id.Undefined_Tok
2136 if self.cur_token.id == Id.WS_Space:
2137 id_ = self.lexer.LookPastSpace(lex_mode_e.ShCommand)
2138 else:
2139 id_ = self.cur_token.id
2140 return id_
2141
2142 def LookAheadFuncParens(self):
2143 # type: () -> bool
2144 """Special lookahead for f( ) { echo hi; } to check for ( )"""
2145 assert self.token_type != Id.Undefined_Tok
2146
2147 # We have to handle 2 cases because we buffer a token
2148 if self.cur_token.id == Id.Op_LParen: # saw funcname(
2149 return self.lexer.LookAheadFuncParens(1) # go back one char
2150
2151 elif self.cur_token.id == Id.WS_Space: # saw funcname WHITESPACE
2152 return self.lexer.LookAheadFuncParens(0)
2153
2154 else:
2155 return False
2156
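    # Both of these define a shell function, which is why '(' alone is not
    # enough and we also look for the matching ')':
    #
    #   f() { echo hi; }
    #   f ( ) { echo hi; }    # whitespace around the parens is allowed
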
2157 def ReadWord(self, word_mode):
2158 # type: (lex_mode_t) -> word_t
2159 """Read the next word, using the given lexer mode.
2160
2161 This is a stateful wrapper for the stateless _ReadWord function.
2162 """
2163 assert word_mode in (lex_mode_e.ShCommand,
2164 lex_mode_e.ShCommandFakeBrack,
2165 lex_mode_e.DBracket, lex_mode_e.BashRegex)
2166
2167 if self.buffered_word: # For integration with pgen2
2168 w = self.buffered_word
2169 self.buffered_word = None
2170 else:
2171 while True:
2172 w = self._ReadWord(word_mode)
2173 if w is not None:
2174 break
2175
2176 self.returned_newline = (word_.CommandId(w) == Id.Op_Newline)
2177 return w
2178
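    # Hypothetical caller loop (w_parser and the EOF test are assumptions
    # for illustration, not code from this file):
    #
    #   while True:
    #       w = w_parser.ReadWord(lex_mode_e.ShCommand)
    #       if word_.CommandId(w) == Id.Eof_Real:
    #           break
    #       # ... consume one word ...
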
2179 def ReadArithWord(self):
2180 # type: () -> word_t
2181 while True:
2182 w = self._ReadArithWord()
2183 if w is not None:
2184 break
2185 return w
2186
2187 def ReadHereDocBody(self, parts):
2188 # type: (List[word_part_t]) -> None
2189 """
2190 A here doc is like a double quoted context, except " isn't special.
2191 """
2192 self._ReadLikeDQ(None, False, parts)
2193 # Returns nothing
2194
2195 def ReadForPlugin(self):
2196 # type: () -> CompoundWord
2197 """For $PS1, $PS4, etc.
2198
2199 This is just like reading a here doc line. "\n" is allowed, as
2200 well as the typical substitutions ${x} $(echo hi) $((1 + 2)).
2201 """
2202 w = CompoundWord([])
2203 self._ReadLikeDQ(None, False, w.parts)
2204 return w
2205
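    # So a prompt like this parses here, with each substitution becoming a
    # word part (the git command is illustrative only):
    #
    #   PS1='\w $(git branch 2>/dev/null) \$ '
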
2206 def EmitDocToken(self, b):
2207 # type: (bool) -> None
2208 self.emit_doc_token = b
2209
2210 def Multiline(self, b):
2211 # type: (bool) -> None
2212 self.multiline = b
2213
2214
2215if 0:
2216 import collections
2217 WORD_HIST = collections.Counter()
2218
2219# vim: sw=4