OILS / osh / word_parse.py

# Copyright 2016 Andy Chu. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
"""
word_parse.py - Parse the shell word language.

Hairy example:

    hi$((1 + 2))"$(echo hi)"${var:-__"$(echo default)"__}

Substitutions can be nested, but which inner subs are allowed depends on the
outer sub.  Notes:

lex_mode_e.ShCommand (_ReadUnquotedLeftParts)
  All subs and quotes are allowed:
    $v ${v}   $() ``   $(())   '' ""   $'' $""   <() >()

lex_mode_e.DQ (_ReadDoubleQuotedLeftParts)
  Var, Command, Arith, but no quotes:
    $v ${v}   $() ``   $(())
  No process substitution.

lex_mode_e.Arith
  Similar to DQ: Var, Command, and Arith sub, but no process sub.  bash
  doesn't allow quotes, but OSH does.  We allow ALL FOUR kinds of quotes,
  because we need those for associative array indexing.

lex_mode_e.VSub_ArgUnquoted
  Like ShCommand, everything is allowed (even process substitutions), but we
  stop at }, and space is SIGNIFICANT.

  Example: ${a:- b }

  ${X:-$v}  ${X:-${v}}  ${X:-$(echo hi)}  ${X:-`echo hi`}  ${X:-$((1+2))}
  ${X:-'single'}  ${X:-"double"}  ${X:-$'\n'}  ${X:-<(echo hi)}

lex_mode_e.VSub_ArgDQ
  In contrast to DQ, VS_ARG_DQ accepts nested "" and $'' and $"", e.g.
  "${x:-"default"}".

  In contrast, VSub_ArgUnquoted respects single quotes and process
  substitution.

  It's weird that double quotes are allowed.  Space is also significant here,
  e.g. "${x:-a "b"}".
"""

from _devbuild.gen import grammar_nt
from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str, Kind
from _devbuild.gen.types_asdl import (lex_mode_t, lex_mode_e)
from _devbuild.gen.syntax_asdl import (
    BoolParamBox,
    Token,
    SimpleVarSub,
    loc,
    source,
    DoubleQuoted,
    SingleQuoted,
    BracedVarSub,
    CommandSub,
    ShArrayLiteral,
    AssocPair,
    bracket_op,
    bracket_op_t,
    suffix_op,
    suffix_op_t,
    rhs_word,
    rhs_word_e,
    rhs_word_t,
    word_e,
    word_t,
    CompoundWord,
    word_part,
    word_part_t,
    y_lhs_e,
    arith_expr_t,
    command,
    expr,
    expr_e,
    expr_t,
    pat_t,
    ArgList,
    Proc,
    Func,
    Subscript,
    Attribute,
    arith_expr,
)
from core import alloc
from core.error import p_die
from mycpp.mylib import log
from core import pyutil
from display import ui
from frontend import consts
from frontend import lexer
from frontend import reader
from osh import tdop
from osh import arith_parse
from osh import braces
from osh import word_
from osh import word_compile
from mycpp.mylib import tagswitch

from libc import HAVE_FNM_EXTMATCH

from typing import List, Optional, Tuple, cast
from typing import TYPE_CHECKING
if TYPE_CHECKING:
    from frontend.lexer import Lexer
    from frontend.parse_lib import ParseContext
    from frontend.reader import _Reader
    from osh.cmd_parse import VarChecker

unused1 = log
unused2 = Id_str

KINDS_THAT_END_WORDS = [Kind.Eof, Kind.WS, Kind.Op, Kind.Right]


class WordEmitter(object):
    """Common interface for [ and [["""

    def __init__(self):
        # type: () -> None
        """Empty constructor for mycpp."""
        pass

    def ReadWord(self, lex_mode):
        # type: (lex_mode_t) -> word_t
        raise NotImplementedError()


class WordParser(WordEmitter):

    def __init__(self, parse_ctx, lexer, line_reader):
        # type: (ParseContext, Lexer, _Reader) -> None
        self.parse_ctx = parse_ctx
        self.lexer = lexer
        self.line_reader = line_reader
        self.arena = line_reader.arena

        self.parse_opts = parse_ctx.parse_opts
        self.a_parser = tdop.TdopParser(arith_parse.Spec(), self,
                                        self.parse_opts)
        self.Reset()

    def Init(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Used to parse arithmetic, see ParseContext."""
        self.next_lex_mode = lex_mode

    def Reset(self):
        # type: () -> None
        """Called by the interactive loop."""
        # For _GetToken()
        self.cur_token = None  # type: Token
        self.token_kind = Kind.Undefined
        self.token_type = Id.Undefined_Tok

        self.next_lex_mode = lex_mode_e.ShCommand

        # Boolean mutated by CommandParser via word_.ctx_EmitDocToken, for
        # ### doc comments
        self.emit_doc_token = False
        # Boolean mutated by CommandParser via word_.ctx_Multiline.  '...'
        # starts multiline mode.
        self.multiline = False

        # For detecting invalid \n\n in multiline mode.  Counts what we got
        # directly from the lexer.
        self.newline_state = 0
        # For consolidating \n\n -> \n for the CALLER.  This simplifies the
        # parsers that consume words.
        self.returned_newline = False

        # For integration with pgen2
        self.buffered_word = None  # type: word_t

    def _GetToken(self):
        # type: () -> None
        """Call this when you need to make a decision based on any of:

        self.token_type
        self.token_kind
        self.cur_token
        """
        if self.next_lex_mode == lex_mode_e.Undefined:
            return  # _SetNext() not called, so do nothing

        is_fake = self.next_lex_mode == lex_mode_e.BashRegexFakeInner
        real_mode = (lex_mode_e.BashRegex if is_fake else self.next_lex_mode)

        self.cur_token = self.lexer.Read(real_mode)

        # MUTATE TOKEN for fake lexer mode.
        # This is for crazy stuff bash allows, like [[ s =~ (< >) ]]
        if (is_fake and self.cur_token.id
                in (Id.WS_Space, Id.BashRegex_AllowedInParens)):
            self.cur_token.id = Id.Lit_Chars

        self.token_type = self.cur_token.id
        self.token_kind = consts.GetKind(self.token_type)

        # Number of consecutive newlines, ignoring whitespace
        if self.token_type == Id.Op_Newline:
            self.newline_state += 1
        elif self.token_kind != Kind.WS:
            self.newline_state = 0

        self.parse_ctx.trail.AppendToken(self.cur_token)  # For completion
        self.next_lex_mode = lex_mode_e.Undefined

    def _SetNext(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Set the next lex state, but don't actually read a token.

        We need this for proper interactive parsing.
        """
        self.next_lex_mode = lex_mode
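
    # A note on the protocol above (added commentary, not from the original
    # source): _SetNext() only records the desired lexer mode, and _GetToken()
    # performs the read lazily.  A typical caller does:
    #
    #     self._SetNext(lex_mode_e.Arith)   # decide the mode cheaply
    #     self._GetToken()                  # read only when a decision is needed
    #
    # Deferring the read is what lets the interactive parser stop at the end
    # of a line without pulling in the next line prematurely.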

    def _ReadVarOpArg(self, arg_lex_mode):
        # type: (lex_mode_t) -> rhs_word_t

        # NOTE: Operators like | and < are not treated as special, so
        # ${a:- | >} is valid, even when unquoted.
        self._SetNext(arg_lex_mode)
        self._GetToken()

        w = self._ReadVarOpArg2(arg_lex_mode, Id.Undefined_Tok,
                                True)  # empty_ok

        # If the Compound has no parts, and we're in a double-quoted VarSub
        # arg, and empty_ok, then return Empty.  This is so it can evaluate to
        # the empty string and not get elided.
        #
        # Examples:
        # - "${s:-}", "${s/%pat/}"
        # It's similar to LooksLikeShAssignment where we turn x= into x=''.
        # And it has the same potential problem of not having Token location
        # info.
        #
        # NOTE: empty_ok is False only for the PatSub pattern, which means
        # we'll return a Compound with no parts, which is explicitly checked
        # with a custom error message.
        if len(w.parts) == 0 and arg_lex_mode == lex_mode_e.VSub_ArgDQ:
            return rhs_word.Empty

        return w

    def _ReadVarOpArg2(self, arg_lex_mode, eof_type, empty_ok):
        # type: (lex_mode_t, Id_t, bool) -> CompoundWord
        """Return a CompoundWord.

        Helper function for _ReadVarOpArg, and used directly by
        _ReadPatSubVarOp.
        """
        w = self._ReadCompoundWord3(arg_lex_mode, eof_type, empty_ok)
        #log('w %s', w)
        tilde = word_.TildeDetect(w)
        if tilde:
            w = tilde
        return w

    def _ReadSliceVarOp(self):
        # type: () -> suffix_op.Slice
        """
        Looking at the token after the first ':'

        ArithExpr? (':' ArithExpr? )? '}'
        """
        self._NextNonSpace()

        cur_id = self.token_type

        if cur_id in (Id.Arith_RBrace, Id.Arith_Colon):  # ${a:} or ${a::}
            begin = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            begin = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()  # advance

        if cur_id == Id.Arith_RBrace:  # ${a:1} or ${@:1}
            # No length specified, so it's N
            no_length = None  # type: Optional[arith_expr_t]
            return suffix_op.Slice(begin, no_length)

        elif cur_id == Id.Arith_Colon:  # ${a:1:} or ${@:1:}
            colon_tok = self.cur_token
            self._NextNonSpace()

            if self.token_type == Id.Arith_RBrace:
                # Quirky bash behavior:
                #   ${a:1:} or ${a::} means length ZERO
                #   but ${a:1} or ${a:} means length N
                if self.parse_opts.strict_parse_slice():
                    p_die(
                        "Slice length: Add explicit zero, or omit : for N (strict_parse_slice)",
                        colon_tok)

                length = arith_expr.EmptyZero  # type: arith_expr_t
            else:
                length = self._ReadArithExpr(Id.Arith_RBrace)

            return suffix_op.Slice(begin, length)

        else:
            p_die("Expected : or } in slice", self.cur_token)

        raise AssertionError()  # for MyPy
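
    # Worked examples of the slice quirk handled above (added commentary,
    # derived from the comments in _ReadSliceVarOp; behavior follows bash):
    #
    #     s=abcdef
    #     echo ${s:1}      # => bcdef   (no length: take the rest)
    #     echo ${s:1:2}    # => bc
    #     echo ${s:1:}     # => (empty) (explicit trailing : means length zero)
    #
    # With strict_parse_slice, the last form is a parse error instead.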

    def _ReadPatSubVarOp(self):
        # type: () -> suffix_op.PatSub
        """Looking at the first '/' after VarOf:

        VarSub = ...
               | VarOf '/' Match ( '/' WORD? )?
        Match  = '/' WORD   # can't be empty
               | '#' WORD?  # may be empty
               | '%' WORD?
        """
        slash_tok = self.cur_token  # location info
        replace_mode = Id.Undefined_Tok  # bizarre syntax / # %

        self._SetNext(lex_mode_e.VSub_ArgUnquoted)  # advance past /

        self._GetToken()
        if self.token_type == Id.Right_DollarBrace:
            pat = CompoundWord([])
            return suffix_op.PatSub(pat, rhs_word.Empty, replace_mode,
                                    slash_tok)

        if self.token_type in (Id.Lit_Slash, Id.Lit_Pound, Id.Lit_Percent):
            replace_mode = self.token_type
            self._SetNext(lex_mode_e.VSub_ArgUnquoted)

        # Bash quirk:
        #   echo ${x/#/replace}  has an empty pattern
        #   echo ${x////replace} is non-empty; it means echo ${x//'/'/replace}
        empty_ok = replace_mode != Id.Lit_Slash
        pat = self._ReadVarOpArg2(lex_mode_e.VSub_ArgUnquoted, Id.Lit_Slash,
                                  empty_ok)
        #log('pat 1 %r', pat)

        if self.token_type == Id.Lit_Slash:
            # read until }
            replace = self._ReadVarOpArg(
                lex_mode_e.VSub_ArgUnquoted)  # type: rhs_word_t
            #log('r 1 %r', replace)
        else:
            # e.g. ${v/a} is the same as ${v/a/} -- empty replacement string
            replace = rhs_word.Empty

        self._GetToken()
        if self.token_type != Id.Right_DollarBrace:
            # This happens on invalid code
            p_die(
                "Expected } after replacement string, got %s" %
                ui.PrettyId(self.token_type), self.cur_token)

        return suffix_op.PatSub(pat, replace, replace_mode, slash_tok)
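
    # Pattern-substitution forms accepted above (added commentary, collected
    # from the grammar and the bash-quirk comments in _ReadPatSubVarOp):
    #
    #     x=ababab
    #     echo ${x/ab/X}    # => Xabab  (first match)
    #     echo ${x//ab/X}   # => XXX    (all matches; '/' replace_mode)
    #     echo ${x/#ab/X}   # => Xabab  (anchored at prefix)
    #     echo ${x/%ab/X}   # => ababX  (anchored at suffix)
    #     echo ${x/ab}      # => abab   (missing replacement means empty)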

    def _ReadSubscript(self):
        # type: () -> bracket_op_t
        """ Subscript = '[' ('@' | '*' | ArithExpr) ']' """
        # Lookahead to see if we get @ or *.  Otherwise read a full arithmetic
        # expression.
        next_id = self.lexer.LookPastSpace(lex_mode_e.Arith)
        if next_id in (Id.Lit_At, Id.Arith_Star):
            op = bracket_op.WholeArray(next_id)  # type: bracket_op_t

            self._SetNext(lex_mode_e.Arith)  # skip past [
            self._GetToken()
            self._SetNext(lex_mode_e.Arith)  # skip past @
            self._GetToken()
        else:
            self._SetNext(lex_mode_e.Arith)  # skip past [
            anode = self._ReadArithExpr(Id.Arith_RBracket)
            op = bracket_op.ArrayIndex(anode)

        if self.token_type != Id.Arith_RBracket:  # should be looking at ]
            p_die('Expected ] to close subscript', self.cur_token)

        self._SetNext(lex_mode_e.VSub_2)  # skip past ]
        self._GetToken()  # needed to be in the same spot as no subscript

        return op
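
    # Subscript forms the lookahead above distinguishes (added commentary,
    # based on the grammar in _ReadSubscript):
    #
    #     echo ${a[1+2]}   # ArithExpr index -> bracket_op.ArrayIndex
    #     echo ${a[@]}     # whole array     -> bracket_op.WholeArray
    #     echo ${a[*]}     # whole array, joined when quoted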

    def _ParseVarOf(self):
        # type: () -> BracedVarSub
        """
        VarOf = NAME Subscript?
              | NUMBER     # no subscript allowed, none of these are arrays
                           # ${@[1]} doesn't work, even though slicing does
              | VarSymbol
        """
        self._GetToken()
        name_token = self.cur_token
        self._SetNext(lex_mode_e.VSub_2)

        self._GetToken()  # Check for []
        if self.token_type == Id.VOp2_LBracket:
            bracket_op = self._ReadSubscript()
        else:
            bracket_op = None

        part = BracedVarSub.CreateNull()
        part.name_tok = name_token
        part.var_name = lexer.TokenVal(name_token)
        part.bracket_op = bracket_op
        return part

    def _ParseVarExpr(self, arg_lex_mode, allow_query=False):
        # type: (lex_mode_t, bool) -> BracedVarSub
        """Start parsing at the op -- we already skipped past the name."""
        part = self._ParseVarOf()

        self._GetToken()
        if self.token_type == Id.Right_DollarBrace:
            return part  # no ops

        op_kind = self.token_kind

        if op_kind == Kind.VTest:
            tok = self.cur_token
            arg_word = self._ReadVarOpArg(arg_lex_mode)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            part.suffix_op = suffix_op.Unary(tok, arg_word)

        elif op_kind == Kind.VOpYsh:
            tok = self.cur_token
            arg_word = self._ReadVarOpArg(arg_lex_mode)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            UP_arg_word = arg_word
            with tagswitch(arg_word) as case:
                if case(rhs_word_e.Empty):
                    pass
                elif case(rhs_word_e.Compound):
                    arg_word = cast(CompoundWord, UP_arg_word)
                    # This handles ${x|html} and ${x %.3f} now.
                    # However I think ${x %.3f} should be statically parsed?
                    # It can enter the printf lexer modes.
                    ok, arg, quoted = word_.StaticEval(arg_word)
                    if not ok or quoted:
                        p_die('Expected a constant argument',
                              loc.Word(arg_word))

            part.suffix_op = suffix_op.Static(tok, arg)

        elif op_kind == Kind.VOp0:
            part.suffix_op = self.cur_token  # Nullary
            self._SetNext(lex_mode_e.VSub_2)  # Expecting }
            self._GetToken()

        elif op_kind == Kind.VOp1:  # % %% # ## etc.
            tok = self.cur_token
            # Weird exception that all shells have: these operators take a
            # glob pattern, so they're lexed as VSub_ArgUnquoted, not
            # VSub_ArgDQ
            arg_word = self._ReadVarOpArg(lex_mode_e.VSub_ArgUnquoted)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            part.suffix_op = suffix_op.Unary(tok, arg_word)

        elif op_kind == Kind.VOp2:  # / : [ ]
            if self.token_type == Id.VOp2_Slash:
                patsub_op = self._ReadPatSubVarOp()  # type: suffix_op_t
                part.suffix_op = patsub_op

                # Checked by the method above
                assert self.token_type == Id.Right_DollarBrace, self.cur_token

            elif self.token_type == Id.VOp2_Colon:
                part.suffix_op = self._ReadSliceVarOp()
                # NOTE: } in arithmetic mode.
                if self.token_type != Id.Arith_RBrace:
                    # Token seems off; doesn't point to X in ${a:1:2 X
                    p_die('Expected } to close ${', self.cur_token)

            else:
                # TODO: Does this ever happen?
                p_die('Unexpected token in ${} (%s)' % 'VOp2', self.cur_token)

        elif op_kind == Kind.VOp3:  # ${prefix@} etc.
            if allow_query:
                part.suffix_op = self.cur_token  # Nullary
                self._SetNext(lex_mode_e.VSub_2)  # Expecting }
                self._GetToken()
            else:
                p_die("Unexpected token in ${} (%s)" % 'VOp3', self.cur_token)

        # NOTE: Arith_RBrace is for slicing, because it reads } in arithmetic
        # mode.  It's redundantly checked above.
        if self.token_type not in (Id.Right_DollarBrace, Id.Arith_RBrace):
            # ${a.} or ${!a.}
            p_die('Expected } to close ${', self.cur_token)

        # Now look for ops
        return part

    def _ReadZshVarSub(self, left_token):
        # type: (Token) -> word_part.ZshVarSub

        self._SetNext(lex_mode_e.VSub_Zsh)  # Move past ${(foo)

        # Can be empty
        w = self._ReadCompoundWord3(lex_mode_e.VSub_Zsh, Id.Right_DollarBrace,
                                    True)
        self._GetToken()
        return word_part.ZshVarSub(left_token, w, self.cur_token)

    def ReadBracedVarSub(self, left_token):
        # type: (Token) -> Tuple[BracedVarSub, Token]
        """For YSH expressions like var x = ${x:-"default"}."""
        part = self._ReadBracedVarSub(left_token, d_quoted=False)
        last_token = self.cur_token
        return part, last_token

    def _ReadBracedVarSub(self, left_token, d_quoted):
        # type: (Token, bool) -> BracedVarSub
        """For the ${} expression language.

        NAME       = [a-zA-Z_][a-zA-Z0-9_]*
        NUMBER     = [0-9]+                 # ${10}, ${11}, ...

        Subscript  = '[' ('@' | '*' | ArithExpr) ']'
        VarSymbol  = '!' | '@' | '#' | ...
        VarOf      = NAME Subscript?
                   | NUMBER     # no subscript allowed, none of these are
                                # arrays; ${@[1]} doesn't work, even though
                                # slicing does
                   | VarSymbol

        NULLARY_OP = '@Q' | '@E' | '@P' | '@A' | '@a'  # VOp0

        TEST_OP    = '-' | ':-' | '=' | ':=' | '+' | ':+' | '?' | ':?'
        STRIP_OP   = '#' | '##' | '%' | '%%'
        CASE_OP    = ',' | ',,' | '^' | '^^'
        UnaryOp    = TEST_OP | STRIP_OP | CASE_OP

        YSH_UNARY  = '|' | ' '              # ${x|html} and ${x %.3f};
                                            # SPACE is an operator, not %
        Match      = ('/' | '#' | '%') WORD # match all / prefix / suffix
        VarExpr    = VarOf
                   | VarOf NULLARY_OP
                   | VarOf UnaryOp WORD
                   | VarOf YSH_UNARY STATIC_WORD
                   | VarOf ':' ArithExpr (':' ArithExpr )?
                   | VarOf '/' Match '/' WORD

        LengthExpr = '#' VarOf   # can't apply operators after length

        RefOrKeys  = '!' VarExpr # CAN apply operators after a named ref
                                 # ${!ref[0]} vs ${!keys[@]} resolved later

        PrefixQuery = '!' NAME ('*' | '@')  # list variable names with a prefix

        BuiltinSub = '.' WORD+   # ${.myproc 'builtin' $sub}

        VarSub = LengthExpr
               | RefOrKeys
               | PrefixQuery
               | VarExpr
               | BuiltinSub

        NOTES:
        - Arithmetic expressions are used twice, inside subscripts ${a[x+1]}
          and slicing ${a:x+1:y+2}
        - ${#} and ${!} need LL(2) lookahead (considering how my tokenizer
          works)
        - @ and * are technically arithmetic expressions in this implementation
        - We don't account for bash 4.4: ${param@operator} -- Q E P A a.  Note
          that it's also vectorized.

        Strictness over bash:
        - echo ${a[0][0]} doesn't do anything useful, so we disallow it from
          the grammar
        - ! and # prefixes can't be composed, even though named refs can be
          composed with other operators
        - '#' means 4 different things: length prefix, VarSymbol, UnaryOp to
          strip a prefix, and it can also be a literal part of WORD.

        From the parser's point of view, the prefix # can't be combined with
        UnaryOp/slicing/matching, but the ! can.  However:

        - ${a[@]:1:2} is not allowed
        - ${#a[@]:1:2} is allowed, but gives the wrong answer
        """
        if d_quoted:
            arg_lex_mode = lex_mode_e.VSub_ArgDQ
        else:
            arg_lex_mode = lex_mode_e.VSub_ArgUnquoted

        self._SetNext(lex_mode_e.VSub_1)
        self._GetToken()

        ty = self.token_type
        first_tok = self.cur_token

        if ty == Id.VSub_Pound:
            # Disambiguate
            next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
            if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
                # e.g. a name, so '#' is the prefix
                self._SetNext(lex_mode_e.VSub_1)
                part = self._ParseVarOf()

                self._GetToken()
                if self.token_type != Id.Right_DollarBrace:
                    p_die('Expected } after length expression',
                          self.cur_token)

                part.prefix_op = first_tok

            else:  # not a prefix, '#' is the variable
                part = self._ParseVarExpr(arg_lex_mode)

        elif ty == Id.VSub_Bang:
            next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
            if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
                # e.g. a name, so '!' is the prefix
                # ${!a}    -- this is a ref
                # ${!3}    -- this is a ref
                # ${!a[1]} -- this is a ref
                # ${!a[@]} -- this is a keys
                # No lookahead -- do it in a second step, or at runtime
                self._SetNext(lex_mode_e.VSub_1)
                part = self._ParseVarExpr(arg_lex_mode, allow_query=True)

                part.prefix_op = first_tok

            else:  # not a prefix, '!' is the variable
                part = self._ParseVarExpr(arg_lex_mode)

        elif ty == Id.VSub_Dot:
            # Note: this will become a new builtin_sub type, so this method
            # must return word_part_t rather than BracedVarSub.  I don't think
            # that should cause problems.
            p_die('TODO: ${.myproc builtin sub}', self.cur_token)

        # VS_NAME, VS_NUMBER, symbol that isn't # or !
        elif self.token_kind == Kind.VSub:
            part = self._ParseVarExpr(arg_lex_mode)

        else:
            # e.g. ${^}
            p_die('Unexpected token in ${}', self.cur_token)

        part.left = left_token  # attach the argument
        part.right = self.cur_token
        return part
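
    # How the '#' and '!' prefixes disambiguate (added commentary, summarizing
    # the LookPastSpace logic above):
    #
    #     echo ${#}       # '#' is the variable itself
    #     echo ${#s}      # '#' is the length prefix      -> LengthExpr
    #     echo ${!ref}    # '!' is the named-ref prefix   -> RefOrKeys
    #     echo ${!a[@]}   # keys of a; resolved in a later step, not here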

    def _ReadSingleQuoted(self, left_token, lex_mode):
        # type: (Token, lex_mode_t) -> SingleQuoted
        """Internal method to read a word_part."""
        tokens = []  # type: List[Token]
        # In command mode, we never disallow backslashes like '\'
        right_quote = self.ReadSingleQuoted(lex_mode, left_token, tokens,
                                            False)
        sval = word_compile.EvalSingleQuoted(left_token.id, tokens)
        node = SingleQuoted(left_token, sval, right_quote)
        return node

    def ReadSingleQuoted(self, lex_mode, left_token, out_tokens, is_ysh_expr):
        # type: (lex_mode_t, Token, List[Token], bool) -> Token
        """Appends to out_tokens; returns the last token.

        Used by expr_parse.py
        """
        # TODO: Remove and use out_tokens
        tokens = []  # type: List[Token]

        # echo '\' is allowed, but x = '\' is invalid, in favor of x = r'\'
        no_backslashes = is_ysh_expr and left_token.id == Id.Left_SingleQuote

        expected_end_tokens = 3 if left_token.id in (
            Id.Left_TSingleQuote, Id.Left_RTSingleQuote, Id.Left_UTSingleQuote,
            Id.Left_BTSingleQuote) else 1
        num_end_tokens = 0

        while num_end_tokens < expected_end_tokens:
            self._SetNext(lex_mode)
            self._GetToken()

            # Kind.Char is emitted in lex_mode.SQ_C
            if self.token_kind in (Kind.Lit, Kind.Char):
                tok = self.cur_token
                # Happens in lex_mode_e.SQ: 'one\two' is ambiguous; it should
                # be r'one\two' or c'one\\two'
                if no_backslashes and lexer.TokenContains(tok, '\\'):
                    p_die(
                        r"Strings with backslashes should look like r'\n' or u'\n' or b'\n'",
                        tok)

                if is_ysh_expr:
                    # Disallow var x = $'\001'.  Arguably we don't need these
                    # checks because u'\u{1}' is the way to write it.
                    if self.token_type == Id.Char_Octal3:
                        p_die(
                            r"Use \xhh or \u{...} instead of octal escapes in YSH strings",
                            tok)

                    if self.token_type == Id.Char_Hex and self.cur_token.length != 4:
                        # disallow \xH
                        p_die(
                            r'Invalid hex escape in YSH string (must be \xHH)',
                            tok)

                tokens.append(tok)

            elif self.token_kind == Kind.Unknown:
                tok = self.cur_token
                assert tok.id == Id.Unknown_Backslash, tok

                # x = $'\z' is disallowed; ditto for echo $'\z' if
                # shopt -u parse_backslash
                if is_ysh_expr or not self.parse_opts.parse_backslash():
                    p_die(
                        "Invalid char escape in C-style string literal (OILS-ERR-11)",
                        tok)

                tokens.append(tok)

            elif self.token_kind == Kind.Eof:
                p_die('Unexpected EOF in single-quoted string that began here',
                      left_token)

            elif self.token_kind == Kind.Right:
                # assume Id.Right_SingleQuote
                num_end_tokens += 1
                tokens.append(self.cur_token)

            else:
                raise AssertionError(self.cur_token)

            if self.token_kind != Kind.Right:
                num_end_tokens = 0  # we need three in a ROW

        if expected_end_tokens == 1:
            tokens.pop()
        elif expected_end_tokens == 3:  # get rid of spurious end tokens
            tokens.pop()
            tokens.pop()
            tokens.pop()

        # Remove space from ''' r''' $''' in both expression mode and command
        # mode
        if left_token.id in (Id.Left_TSingleQuote, Id.Left_RTSingleQuote,
                             Id.Left_UTSingleQuote, Id.Left_BTSingleQuote):
            word_compile.RemoveLeadingSpaceSQ(tokens)

        # Validation after lexing - same 2 checks in j8.LexerDecoder
        is_u_string = left_token.id in (Id.Left_USingleQuote,
                                        Id.Left_UTSingleQuote)

        for tok in tokens:
            # u'\yff' is not valid, but b'\yff' is
            if is_u_string and tok.id == Id.Char_YHex:
                p_die(
                    r"%s escapes not allowed in u'' strings" %
                    lexer.TokenVal(tok), tok)

        out_tokens.extend(tokens)
        return self.cur_token
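
    # End-token counting for triple-quoted strings (added commentary,
    # illustrating the num_end_tokens loop above):
    #
    #     ''' a ' b '' c '''
    #
    # The lone ' and the '' in the middle each bump num_end_tokens, but any
    # following non-Right token resets it to 0; only three consecutive
    # Right_SingleQuote tokens terminate the string, and those three spurious
    # end tokens are then popped off.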

    def _ReadDoubleQuotedLeftParts(self):
        # type: () -> word_part_t
        """Read substitution parts in a double quoted context."""
        if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick):
            return self._ReadCommandSub(self.token_type, d_quoted=True)

        if self.token_type == Id.Left_DollarBrace:
            return self._ReadBracedVarSub(self.cur_token, d_quoted=True)

        if self.token_type == Id.Left_DollarDParen:
            return self._ReadArithSub()

        if self.token_type == Id.Left_DollarBracket:
            return self._ReadExprSub(lex_mode_e.DQ)

        raise AssertionError(self.cur_token)

    def _ReadYshSingleQuoted(self, left_id):
        # type: (Id_t) -> CompoundWord
        """Read YSH-style strings:

        r''  u''  b''
        r''' '''  u''' '''  b''' '''
        """
        #log('BEF self.cur_token %s', self.cur_token)
        if left_id == Id.Left_RSingleQuote:
            lexer_mode = lex_mode_e.SQ_Raw
            triple_left_id = Id.Left_RTSingleQuote
        elif left_id == Id.Left_USingleQuote:
            lexer_mode = lex_mode_e.J8_Str
            triple_left_id = Id.Left_UTSingleQuote
        elif left_id == Id.Left_BSingleQuote:
            lexer_mode = lex_mode_e.J8_Str
            triple_left_id = Id.Left_BTSingleQuote
        else:
            raise AssertionError(left_id)

        # Needed for syntax checks
        left_tok = self.cur_token
        left_tok.id = left_id

        sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)

        if (len(sq_part.sval) == 0 and self.lexer.ByteLookAhead() == "'"):
            self._SetNext(lex_mode_e.ShCommand)
            self._GetToken()

            assert self.token_type == Id.Left_SingleQuote
            # HACK: magically transform the third ' in u''' to
            # Id.Left_UTSingleQuote, so that ''' is the terminator
            left_tok = self.cur_token
            left_tok.id = triple_left_id

            # Handles stripping leading whitespace
            sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)

        # Advance and validate
        self._SetNext(lex_mode_e.ShCommand)

        self._GetToken()
        if self.token_kind not in KINDS_THAT_END_WORDS:
            p_die('Unexpected token after YSH single-quoted string',
                  self.cur_token)

        return CompoundWord([sq_part])

    def _ReadUnquotedLeftParts(self, triple_out):
        # type: (Optional[BoolParamBox]) -> word_part_t
        """Read substitutions and quoted strings (for lex_mode_e.ShCommand).

        If triple_out is set, then we try parsing triple-quoted strings, and
        set its value to True if we got one.
        """
        if self.token_type in (Id.Left_DoubleQuote, Id.Left_DollarDoubleQuote):
            # Note: $"" is a synonym for "".  It might make sense if it added
            # \n \0 \x00 \u{123} etc.  But that's not what bash does!
            dq_part = self._ReadDoubleQuoted(self.cur_token)
            # Got empty word "" and there's a " after
            if (triple_out and len(dq_part.parts) == 0 and
                    self.lexer.ByteLookAhead() == '"'):

                self._SetNext(lex_mode_e.ShCommand)
                self._GetToken()
                # HACK: magically transform the third " in """ to
                # Id.Left_TDoubleQuote, so that """ is the terminator
                left_dq_token = self.cur_token
                left_dq_token.id = Id.Left_TDoubleQuote
                triple_out.b = True  # let caller know we got it
                return self._ReadDoubleQuoted(left_dq_token)

            return dq_part

        if self.token_type in (Id.Left_SingleQuote, Id.Left_RSingleQuote,
                               Id.Left_DollarSingleQuote):
            if self.token_type == Id.Left_SingleQuote:
                lexer_mode = lex_mode_e.SQ_Raw
                triple_left_id = Id.Left_TSingleQuote
            elif self.token_type == Id.Left_RSingleQuote:
                lexer_mode = lex_mode_e.SQ_Raw
                triple_left_id = Id.Left_RTSingleQuote
            else:
                lexer_mode = lex_mode_e.SQ_C
                # there is no such thing as $'''
                triple_left_id = Id.Undefined_Tok

            sq_part = self._ReadSingleQuoted(self.cur_token, lexer_mode)

            # Got empty '' or r'' and there's a ' after
            # u'' and b'' are handled in _ReadYshSingleQuoted
            if (triple_left_id != Id.Undefined_Tok and
                    triple_out is not None and len(sq_part.sval) == 0 and
                    self.lexer.ByteLookAhead() == "'"):

                self._SetNext(lex_mode_e.ShCommand)
                self._GetToken()

                # HACK: magically transform the third ' in ''' to
                # Id.Left_TSingleQuote, so that ''' is the terminator
                left_sq_token = self.cur_token
                left_sq_token.id = triple_left_id

                triple_out.b = True  # let caller know we got it
                return self._ReadSingleQuoted(left_sq_token, lexer_mode)

            return sq_part

        if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick,
                               Id.Left_ProcSubIn, Id.Left_ProcSubOut):
            return self._ReadCommandSub(self.token_type, d_quoted=False)

        if self.token_type == Id.Left_DollarBrace:
            return self._ReadBracedVarSub(self.cur_token, d_quoted=False)

        if self.token_type == Id.Left_DollarDParen:
            return self._ReadArithSub()

        if self.token_type == Id.Left_DollarBracket:
            return self._ReadExprSub(lex_mode_e.ShCommand)

        if self.token_type == Id.Left_DollarBraceZsh:
            return self._ReadZshVarSub(self.cur_token)

        raise AssertionError(self.cur_token)
    def _ReadExtGlob(self):
        # type: () -> word_part.ExtGlob
        """
        Grammar:
          Item    = CompoundWord | EPSILON  # important: @(foo|) is allowed
          LEFT    = '@(' | '*(' | '+(' | '?(' | '!('
          RIGHT   = ')'
          ExtGlob = LEFT (Item '|')* Item RIGHT  # ITEM may be empty
          Compound includes ExtGlob
        """
        left_token = self.cur_token
        right_token = None  # type: Token
        arms = []  # type: List[CompoundWord]

        self.lexer.PushHint(Id.Op_RParen, Id.Right_ExtGlob)
        self._SetNext(lex_mode_e.ExtGlob)  # advance past LEFT

        read_word = False  # did we just read a word?  To handle @(||).

        while True:
            self._GetToken()

            if self.token_type == Id.Right_ExtGlob:
                if not read_word:
                    arms.append(CompoundWord([]))
                right_token = self.cur_token
                break

            elif self.token_type == Id.Op_Pipe:
                if not read_word:
                    arms.append(CompoundWord([]))
                read_word = False
                self._SetNext(lex_mode_e.ExtGlob)

            # lex_mode_e.ExtGlob should only produce these 4 kinds of tokens
            elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub,
                                     Kind.ExtGlob):
                w = self._ReadCompoundWord(lex_mode_e.ExtGlob)
                arms.append(w)
                read_word = True

            elif self.token_kind == Kind.Eof:
                p_die('Unexpected EOF reading extended glob that began here',
                      left_token)

            else:
                raise AssertionError(self.cur_token)

        return word_part.ExtGlob(left_token, arms, right_token)
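
    # Extended glob arms, including empty ones (added commentary, per the
    # EPSILON rule in the grammar above):
    #
    #     @(foo|bar)   # arms: 'foo', 'bar'
    #     @(foo|)      # arms: 'foo', ''  -- matches 'foo' or empty
    #     @(||)        # arms: '', '', '' -- the read_word flag handles this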

    def _ReadBashRegexGroup(self):
        # type: () -> word_part.BashRegexGroup
        """
        Grammar:
          BashRegexGroup = '(' WORD? ')'
        """
        left_token = self.cur_token
        assert left_token.id == Id.BashRegex_LParen, left_token

        arms = []  # type: List[CompoundWord]

        self.lexer.PushHint(Id.Op_RParen, Id.Right_BashRegexGroup)
        self._SetNext(lex_mode_e.BashRegexFakeInner)  # advance past LEFT

        self._GetToken()
        if self.token_type == Id.Right_BashRegexGroup:  # empty ()
            return word_part.BashRegexGroup(left_token, None, self.cur_token)

        # lex_mode_e.BashRegex should only produce these 4 kinds of tokens
        if self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub, Kind.BashRegex):
            # Fake lexer mode that translates Id.WS_Space to Id.Lit_Chars, to
            # allow bash-style [[ s =~ (a b) ]]
            w = self._ReadCompoundWord(lex_mode_e.BashRegexFakeInner)
            arms.append(w)

            self._GetToken()
            if self.token_type != Id.Right_BashRegexGroup:
                p_die('Expected ) to close bash regex group', self.cur_token)

            return word_part.BashRegexGroup(left_token, w, self.cur_token)

        p_die('Expected word after ( opening bash regex group', self.cur_token)

    def _ReadLikeDQ(self, left_token, is_ysh_expr, out_parts):
        # type: (Optional[Token], bool, List[word_part_t]) -> None
        """
        Args:
          left_token: A token if we are reading a double quoted part, or None
            if we're reading a here doc.
          is_ysh_expr: Whether to disallow backticks and invalid char escapes
          out_parts: list of word_part to append to
        """
        if left_token:
            if left_token.id in (Id.Left_TDoubleQuote,
                                 Id.Left_DollarTDoubleQuote):
                expected_end_tokens = 3
            else:
                expected_end_tokens = 1
        else:
            expected_end_tokens = 1000  # here doc will break

        num_end_tokens = 0
        while num_end_tokens < expected_end_tokens:
            self._SetNext(lex_mode_e.DQ)
            self._GetToken()

            if self.token_kind == Kind.Lit:
                if self.token_type == Id.Lit_EscapedChar:
                    tok = self.cur_token
                    ch = lexer.TokenSliceLeft(tok, 1)
                    part = word_part.EscapedLiteral(tok,
                                                    ch)  # type: word_part_t
                else:
                    if self.token_type == Id.Lit_BadBackslash:
                        # echo "\z" is OK in shell, but 'x = "\z"' is a syntax
                        # error in YSH.
                        # Slight hole: we don't catch 'x = ${undef:-"\z"}'
                        # because of the recursion (unless parse_backslash)
                        if (is_ysh_expr or
                                not self.parse_opts.parse_backslash()):
                            p_die(
                                "Invalid char escape in double quoted string (OILS-ERR-12)",
                                self.cur_token)
                    elif self.token_type == Id.Lit_Dollar:
                        if is_ysh_expr or not self.parse_opts.parse_dollar():
                            p_die("Literal $ should be quoted like \$",
                                  self.cur_token)

                    part = self.cur_token
                out_parts.append(part)

            elif self.token_kind == Kind.Left:
                if self.token_type == Id.Left_Backtick and is_ysh_expr:
                    p_die("Invalid backtick: use $(cmd) or \\` in YSH strings",
                          self.cur_token)

                part = self._ReadDoubleQuotedLeftParts()
                out_parts.append(part)

            elif self.token_kind == Kind.VSub:
                tok = self.cur_token
                part = SimpleVarSub(tok)
                out_parts.append(part)
                # NOTE: parsing "$f(x)" would BREAK CODE.  Could add a mode
                # for it later.

            elif self.token_kind == Kind.Right:
                assert self.token_type == Id.Right_DoubleQuote, self.token_type
                if left_token:
                    num_end_tokens += 1

                # In a here doc, the right quote is literal!
                out_parts.append(self.cur_token)

            elif self.token_kind == Kind.Eof:
                if left_token:
                    p_die(
                        'Unexpected EOF reading double-quoted string that began here',
                        left_token)
                else:  # here docs will have an EOF in their token stream
                    break

            else:
                raise AssertionError(self.cur_token)

            if self.token_kind != Kind.Right:
                num_end_tokens = 0  # """ must be CONSECUTIVE

        if expected_end_tokens == 1:
            out_parts.pop()
        elif expected_end_tokens == 3:
            out_parts.pop()
            out_parts.pop()
            out_parts.pop()

        # Remove space from """ in both expression mode and command mode
        if (left_token and left_token.id
                in (Id.Left_TDoubleQuote, Id.Left_DollarTDoubleQuote)):
            word_compile.RemoveLeadingSpaceDQ(out_parts)

        # Return nothing, since we appended to 'out_parts'

    def _ReadDoubleQuoted(self, left_token):
        # type: (Token) -> DoubleQuoted
        """Helper function for reading "hello $name".

        Args:
          eof_type: for stopping at }, Id.Lit_RBrace
          here_doc: Whether we are reading in a here doc context

        Also ${foo%%a b c}  # treat this as double quoted, until you hit }
        """
        parts = []  # type: List[word_part_t]
        self._ReadLikeDQ(left_token, False, parts)

        right_quote = self.cur_token
        return DoubleQuoted(left_token, parts, right_quote)

    def ReadDoubleQuoted(self, left_token, parts):
        # type: (Token, List[word_part_t]) -> Token
        """For expression mode.

        Read var x = "${dir:-}/$name"; etc.
        """
        self._ReadLikeDQ(left_token, True, parts)
        return self.cur_token

    def _ReadCommandSub(self, left_id, d_quoted=False):
        # type: (Id_t, bool) -> CommandSub
        """
        NOTE: This is not in the grammar, because word parts aren't in the
        grammar!

        command_sub = '$(' command_list ')'
                    | '@(' command_list ')'
                    | '<(' command_list ')'
                    | '>(' command_list ')'
                    | ` command_list `
        """
        left_token = self.cur_token

        # Set the lexer in a state so ) becomes the EOF token.
        if left_id in (Id.Left_DollarParen, Id.Left_AtParen, Id.Left_ProcSubIn,
                       Id.Left_ProcSubOut):
            self._SetNext(lex_mode_e.ShCommand)  # advance past $( etc.

            right_id = Id.Eof_RParen
            self.lexer.PushHint(Id.Op_RParen, right_id)
            c_parser = self.parse_ctx.MakeParserForCommandSub(
                self.line_reader, self.lexer, right_id)
            # NOTE: This doesn't use something like main_loop because we don't
            # want to interleave parsing and execution!  Unlike 'source' and
            # 'eval'.
            node = c_parser.ParseCommandSub()

            right_token = c_parser.w_parser.cur_token

        elif left_id == Id.Left_Backtick and self.parse_ctx.do_lossless:
            # NOTE: This is an APPROXIMATE solution for translation ONLY.  See
            # test/osh2oil.

            right_id = Id.Eof_Backtick
            self.lexer.PushHint(Id.Left_Backtick, right_id)
            c_parser = self.parse_ctx.MakeParserForCommandSub(
                self.line_reader, self.lexer, right_id)
            node = c_parser.ParseCommandSub()
            right_token = c_parser.w_parser.cur_token

        elif left_id == Id.Left_Backtick:
            if not self.parse_opts.parse_backticks():
                p_die('Use $(cmd) instead of backticks (parse_backticks)',
                      left_token)

            self._SetNext(lex_mode_e.Backtick)  # advance past `

            parts = []  # type: List[str]
            while True:
                self._GetToken()
                #log("TOK %s", self.cur_token)

                if self.token_type == Id.Backtick_Quoted:
                    # Remove leading \
                    parts.append(lexer.TokenSliceLeft(self.cur_token, 1))

                elif self.token_type == Id.Backtick_DoubleQuote:
                    # Compatibility: If backticks are double quoted, then
                    # double quotes within them have to be \".
                    # Shells aren't smart enough to match nested " and `
                    # quotes (but OSH is)
                    if d_quoted:
                        # Remove leading \
                        parts.append(lexer.TokenSliceLeft(self.cur_token, 1))
                    else:
                        parts.append(lexer.TokenVal(self.cur_token))

                elif self.token_type == Id.Backtick_Other:
                    parts.append(lexer.TokenVal(self.cur_token))

                elif self.token_type == Id.Backtick_Right:
                    break

                elif self.token_type == Id.Eof_Real:
                    # Note: this parse error is in the ORIGINAL context.  No
                    # code_str yet.
                    p_die('Unexpected EOF while looking for closing backtick',
                          left_token)

                else:
                    raise AssertionError(self.cur_token)

                self._SetNext(lex_mode_e.Backtick)

            # Calculate right SPID on CommandSub BEFORE re-parsing.
            right_token = self.cur_token

            code_str = ''.join(parts)
            #log('code %r', code_str)

            # NOTE: This is similar to how we parse aliases in
            # osh/cmd_parse.py.  It won't have the same location info as
            # MakeParserForCommandSub(), because the lexer is different.
            arena = self.parse_ctx.arena
            #arena = alloc.Arena()
            line_reader = reader.StringLineReader(code_str, arena)
            c_parser = self.parse_ctx.MakeOshParser(line_reader)
            src = source.Reparsed('backticks', left_token, right_token)
            with alloc.ctx_SourceCode(arena, src):
                node = c_parser.ParseCommandSub()

        else:
            raise AssertionError(left_id)

        return CommandSub(left_token, node, right_token)
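
    # Why backticks are re-parsed from a string (added commentary, based on
    # the NOTEs above): inside `...`, the lexer can't treat ` as a nesting
    # delimiter the way $( ) nests, so the \-escaped body is collected into
    # code_str and re-parsed.  For example:
    #
    #     echo `echo \`echo hi\``    # nested backticks need escaping
    #     echo $(echo $(echo hi))    # $() nests without escaping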

    def _ReadExprSub(self, lex_mode):
        # type: (lex_mode_t) -> word_part.ExprSub
        """$[d->key] $[obj.method()] etc."""
        left_token = self.cur_token

        self._SetNext(lex_mode_e.Expr)
        enode, right_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.ysh_expr_sub)

        self._SetNext(lex_mode)  # Move past ]
        return word_part.ExprSub(left_token, enode, right_token)

    def ParseVarDecl(self, kw_token):
        # type: (Token) -> command.VarDecl
        """
        oil_var_decl: name_type_list '=' testlist end_stmt

        Note that assignments must end with \n ; } or EOF.  Unlike shell
        assignments, we disallow:

        var x = 42 | wc -l
        var x = 42 && echo hi
        """
        self._SetNext(lex_mode_e.Expr)
        enode, last_token = self.parse_ctx.ParseVarDecl(kw_token, self.lexer)
        # Hack to move } from what the Expr lexer modes gives to what
        # CommandParser wants
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace

        # Let the CommandParser see the Op_Semi or Op_Newline.
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)  # always back to this
        return enode

    def ParseMutation(self, kw_token, var_checker):
        # type: (Token, VarChecker) -> command.Mutation
        """
        setvar i = 42
        setvar i += 1
        setvar a[i] = 42
        setvar a[i] += 1
        setvar d.key = 42
        setvar d.key += 1
        """
        self._SetNext(lex_mode_e.Expr)
        enode, last_token = self.parse_ctx.ParseMutation(kw_token, self.lexer)
        # Hack to move } from what the Expr lexer modes gives to what
        # CommandParser wants
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace

        for lhs in enode.lhs:
            UP_lhs = lhs
            with tagswitch(lhs) as case:
                if case(y_lhs_e.Var):
                    lhs = cast(Token, UP_lhs)
                    var_checker.Check(kw_token.id, lexer.LazyStr(lhs), lhs)

                # Note: this does not cover cases like
                #   setvar (a[0])[1] = v
                #   setvar (d.key).other = v
                # This leaks into catching all typos statically, which may be
                # possible if 'use' makes all names explicit.
                elif case(y_lhs_e.Subscript):
                    lhs = cast(Subscript, UP_lhs)
                    if lhs.obj.tag() == expr_e.Var:
                        v = cast(expr.Var, lhs.obj)
                        var_checker.Check(kw_token.id, v.name, v.left)

                elif case(y_lhs_e.Attribute):
                    lhs = cast(Attribute, UP_lhs)
                    if lhs.obj.tag() == expr_e.Var:
                        v = cast(expr.Var, lhs.obj)
                        var_checker.Check(kw_token.id, v.name, v.left)

        # Let the CommandParser see the Op_Semi or Op_Newline.
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)  # always back to this
        return enode

    def ParseBareDecl(self):
        # type: () -> expr_t
        """
        x = {name: val}
        """
        self._SetNext(lex_mode_e.Expr)
        self._GetToken()
        enode, last_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.command_expr)
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)
        return enode

    def ParseYshExprForCommand(self):
        # type: () -> expr_t

        # Fudge for this case:
        #  for x in(y) {
        # versus
        #  for x in (y) {
        #
        # In the former case, ReadWord on 'in' puts the lexer past (.
        # Also see LookPastSpace in CommandParser.
        # A simpler solution would be nicer.

        if self.token_type == Id.Op_LParen:
            self.lexer.MaybeUnreadOne()

        enode, _ = self.parse_ctx.ParseYshExpr(self.lexer, grammar_nt.ysh_expr)

        self._SetNext(lex_mode_e.ShCommand)
        return enode

    def ParseCommandExpr(self):
        # type: () -> expr_t
        """
        = 1+2
        """
        enode, last_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.command_expr)

        # In some cases, such as the case statement, we expect *the lexer* to
        # be pointing at the token right after the expression.  But the
        # expression parser must have read to the `last_token`.  Unreading
        # places the lexer back in the expected state.  I.e.:
        #
        # case (x) {                       case (x) {
        #   (else) { = x }                   (else) { = x }
        #          ^ The lexer is here              ^ Unread to here
        # }                                }
        assert last_token.id in (Id.Op_Newline, Id.Eof_Real, Id.Op_Semi,
                                 Id.Op_RBrace), last_token
        if last_token.id != Id.Eof_Real:
            # Eof_Real is the only token we cannot unread
            self.lexer.MaybeUnreadOne()

        return enode

    def ParseProc(self, node):
        # type: (Proc) -> None

        # proc name-with-hyphens() must be accepted
        self._SetNext(lex_mode_e.ShCommand)
        self._GetToken()
        # example: 'proc f[' gets you Lit_ArrayLhsOpen
        if self.token_type != Id.Lit_Chars:
            p_die('Invalid proc name %s' % ui.PrettyToken(self.cur_token),
                  self.cur_token)

        # TODO: validate this more.  Disallow proc 123 { }, which isn't
        # disallowed for shell functions.  Similar to IsValidVarName().
        node.name = self.cur_token

        last_token = self.parse_ctx.ParseProc(self.lexer, node)

        # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
        assert last_token.id == Id.Op_LBrace
        last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        self._SetNext(lex_mode_e.ShCommand)

    def ParseFunc(self, node):
        # type: (Func) -> None
        last_token = self.parse_ctx.ParseFunc(self.lexer, node)

        # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
        assert last_token.id == Id.Op_LBrace
        last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        self._SetNext(lex_mode_e.ShCommand)

    def ParseYshCasePattern(self):
        # type: () -> Tuple[pat_t, Token]
        pat, left_tok, last_token = self.parse_ctx.ParseYshCasePattern(
            self.lexer)

        if last_token.id == Id.Op_LBrace:
            last_token.id = Id.Lit_LBrace
            self.buffered_word = last_token

        return pat, left_tok

    def NewlineOkForYshCase(self):
        # type: () -> Id_t
        """Check for an optional newline and consume it.

        This is a special case of _NewlineOk which fixes some "off-by-one"
        issues that crop up while parsing YSH case arms.  For more details,
        see #oil-dev > Progress On YSH Case Grammar on Zulip.

        Returns a token id which is filled with the choice of

          word  { echo word }
          (3)   { echo expr }
          /e/   { echo eggex }
          }     # right brace
        """
        while True:
            next_id = self.lexer.LookAheadOne(lex_mode_e.Expr)

            # Cannot lookahead past lines
            if next_id == Id.Unknown_Tok:
                if not self.lexer.MoveToNextLine():  # Try to move to next line
                    break  # EOF
                continue

            next_kind = consts.GetKind(next_id)
            if next_id != Id.Op_Newline and next_kind != Kind.Ignored:
                break

            self.lexer.Read(lex_mode_e.Expr)

        if next_id in (Id.Op_RBrace, Id.Op_LParen, Id.Arith_Slash):
            self._SetNext(lex_mode_e.Expr)  # Continue in expression mode
        else:
            # Consume the trailing Op_Newline
            self._SetNext(lex_mode_e.ShCommand)
            self._GetToken()

        return next_id

    def _ReadArithExpr(self, end_id):
        # type: (Id_t) -> arith_expr_t
        """Read and parse an arithmetic expression in various contexts.

        $(( 1+2 ))
        (( a=1+2 ))
        ${a[ 1+2 ]}
        ${a : 1+2 : 1+2}

        See tests/arith-context.test.sh for ambiguous cases.

        ${a[a[0]]} is valid  # VS_RBRACKET vs Id.Arith_RBracket

        ${s : a<b?0:1 : 1}   # VS_COLON vs Id.Arith_Colon

        See the assertion in ArithParser.Parse() -- unexpected extra input.
        """
        # calls self.ReadWord(lex_mode_e.Arith)
        anode = self.a_parser.Parse()
        cur_id = self.a_parser.CurrentId()
        if end_id != Id.Undefined_Tok and cur_id != end_id:
            p_die(
                'Unexpected token after arithmetic expression (%s != %s)' %
                (ui.PrettyId(cur_id), ui.PrettyId(end_id)),
                loc.Word(self.a_parser.cur_word))
        return anode

    def _ReadArithSub(self):
        # type: () -> word_part.ArithSub
        """Read an arith substitution, which contains an arith expression,
        e.g. $((a + 1)).
        """
        left_tok = self.cur_token

        # The second ) needs to be disambiguated in stuff like:
        #   $(echo $(( 1+2 )) )
        self.lexer.PushHint(Id.Op_RParen, Id.Right_DollarDParen)

        # NOTE: To disambiguate $(( as arith sub vs. command sub and subshell,
        # we could save the lexer/reader state here, and retry if the
        # arithmetic parse fails.  But we can almost always catch this at
        # parse time.  There could be some exceptions like:
        #   $((echo * foo))  # looks like multiplication
        #   $((echo / foo))  # looks like division

        # $(( )) is valid
        anode = arith_expr.EmptyZero  # type: arith_expr_t

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            anode = self._ReadArithExpr(Id.Arith_RParen)

        self._SetNext(lex_mode_e.ShCommand)

        # Ensure we get closing )
        self._GetToken()
        if self.token_type != Id.Right_DollarDParen:
            p_die('Expected second ) to end arith sub', self.cur_token)

        right_tok = self.cur_token
        return word_part.ArithSub(left_tok, anode, right_tok)

    def ReadDParen(self):
        # type: () -> Tuple[arith_expr_t, Token]
        """Read ((1+ 2)) -- command context.

        We're using the word parser because it's very similar to
        _ReadArithExpr above.

        This also returns the terminating Id.Op_DRightParen token for location
        info.
        """
        # (( )) is valid
        anode = arith_expr.EmptyZero  # type: arith_expr_t

        self.lexer.PushHint(Id.Op_RParen, Id.Op_DRightParen)

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            anode = self._ReadArithExpr(Id.Arith_RParen)

        self._SetNext(lex_mode_e.ShCommand)

        # Ensure we get the second )
        self._GetToken()
        right = self.cur_token
        if right.id != Id.Op_DRightParen:
            p_die('Expected second ) to end arith statement', right)

        self._SetNext(lex_mode_e.ShCommand)

        return anode, right
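
    # Why PushHint is needed here (added commentary): the lexer alone can't
    # tell which construct a ) closes, so the parser registers the expected
    # translation before reading.  For example:
    #
    #     $(echo $(( 1+2 )) )   # inner )) must become Right_DollarDParen,
    #                           # while the outer ) stays Eof_RParen
    #     (( x = 1+2 ))         # here )) becomes Op_DRightParen instead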

    def _NextNonSpace(self):
        # type: () -> None
        """Advance in lex_mode_e.Arith until a non-space token.

        Same logic as _ReadWord, but used in
          $(( ))
          (( ))
          for (( ))

        You can read self.token_type after this, without calling _GetToken.
        """
        while True:
            self._SetNext(lex_mode_e.Arith)
            self._GetToken()
            if self.token_kind not in (Kind.Ignored, Kind.WS):
                break

    def ReadForExpression(self):
        # type: () -> command.ForExpr
        """Read ((i=0; i<5; ++i)) -- part of command context."""
        self._NextNonSpace()  # skip over ((
        cur_id = self.token_type  # for end of arith expressions

        if cur_id == Id.Arith_Semi:  # for (( ; i < 10; i++ ))
            init_node = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            init_node = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()
        self._NextNonSpace()

        # It's odd to keep track of both cur_id and self.token_type in this
        # function, but it works, and is tested in 'test/parse_error.sh
        # arith-integration'
        if cur_id != Id.Arith_Semi:  # for (( x=0 b; ... ))
            p_die("Expected ; here", loc.Word(self.a_parser.cur_word))

        self._GetToken()
        cur_id = self.token_type

        if cur_id == Id.Arith_Semi:  # for (( ; ; i++ ))
            # empty condition is TRUE
            cond_node = arith_expr.EmptyOne  # type: arith_expr_t
        else:
            cond_node = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()

        if cur_id != Id.Arith_Semi:  # for (( x=0; x<5 b ))
            p_die("Expected ; here", loc.Word(self.a_parser.cur_word))

        self._NextNonSpace()
        if self.token_type == Id.Arith_RParen:  # for (( ; ; ))
            update_node = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            update_node = self._ReadArithExpr(Id.Arith_RParen)

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            p_die('Expected ) to end for loop expression', self.cur_token)
        self._SetNext(lex_mode_e.ShCommand)

        # redirects is None, will be assigned in CommandEvaluator
        node = command.ForExpr.CreateNull()
        node.init = init_node
        node.cond = cond_node
        node.update = update_node
        return node
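
    # Empty sections default sensibly (added commentary, per the branches
    # above):
    #
    #     for (( ; ; )); do ... done      # init=0, cond=1 (infinite), update=0
    #     for (( i=0; ; ++i )); do ...    # empty condition is TRUE (EmptyOne)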

    def _ReadArrayLiteral(self):
        # type: () -> word_part_t
        """a=(1 2 3)

        TODO: See osh/cmd_parse.py:164 for Id.Lit_ArrayLhsOpen, for a[x++]=1

        We want:

        A=(['x']=1 ["x"]=2 [$x$y]=3)

        Maybe allow this as a literal string?  Because I think I've seen it
        before?  Or maybe force people to patch to learn the rule.

        A=([x]=4)

        Starts with Lit_Other '[', and then it has Lit_ArrayLhsClose.
        Maybe enforce that ALL have keys or NONE of them have keys.
        """
        self._SetNext(lex_mode_e.ShCommand)  # advance past (
        self._GetToken()
        if self.cur_token.id != Id.Op_LParen:
            p_die('Expected ( after =', self.cur_token)
        left_token = self.cur_token
        right_token = None  # type: Token

        # MUST use a new word parser (with the same lexer).
        w_parser = self.parse_ctx.MakeWordParser(self.lexer, self.line_reader)
        words = []  # type: List[CompoundWord]
        done = False
        while not done:
            w = w_parser.ReadWord(lex_mode_e.ShCommand)
            with tagswitch(w) as case:
                if case(word_e.Operator):
                    tok = cast(Token, w)
                    if tok.id == Id.Right_ShArrayLiteral:
                        right_token = tok
                        done = True  # can't use break here
                    # Unlike command parsing, array parsing allows embedded \n.
                    elif tok.id == Id.Op_Newline:
                        continue
                    else:
                        p_die('Unexpected token in array literal',
                              loc.Word(w))

                elif case(word_e.Compound):
                    words.append(cast(CompoundWord, w))

                else:
                    raise AssertionError()

        if len(words) == 0:  # a=() is an empty indexed array
            # Needed for type safety, doh
            no_words = []  # type: List[word_t]
            node = ShArrayLiteral(left_token, no_words, right_token)
            return node

        pairs = []  # type: List[AssocPair]
        # If the first one is a key/value pair, then the rest are assumed to
        # be.
        pair = word_.DetectAssocPair(words[0])
        if pair:
            pairs.append(pair)

            n = len(words)
            for i in xrange(1, n):
                w2 = words[i]
                pair = word_.DetectAssocPair(w2)
                if not pair:
                    p_die("Expected associative array pair", loc.Word(w2))

                pairs.append(pair)

            # invariant List?
            return word_part.BashAssocLiteral(left_token, pairs, right_token)

        # Brace detection for arrays but NOT associative arrays
        words2 = braces.BraceDetectAll(words)
        words3 = word_.TildeDetectAll(words2)
        return ShArrayLiteral(left_token, words3, right_token)
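
    # How the first word decides the literal's type (added commentary, per
    # DetectAssocPair above):
    #
    #     a=(1 2 3)          # ShArrayLiteral: indexed array
    #     A=([x]=1 [y]=2)    # BashAssocLiteral: every word must be [k]=v
    #     B=([x]=1 2)        # parse error: "Expected associative array pair"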
1684
1685 def ParseProcCallArgs(self, start_symbol):
1686 # type: (int) -> ArgList
1687 """ json write (x) """
1688 self.lexer.MaybeUnreadOne()
1689
1690 arg_list = ArgList.CreateNull(alloc_lists=True)
1691 arg_list.left = self.cur_token
1692 self.parse_ctx.ParseProcCallArgs(self.lexer, arg_list, start_symbol)
1693 return arg_list
1694
1695 def _MaybeReadWordPart(self, is_first, lex_mode, parts):
1696 # type: (bool, lex_mode_t, List[word_part_t]) -> bool
1697 """Helper for _ReadCompoundWord3."""
1698 done = False
1699
1700 if self.token_type == Id.Lit_EscapedChar:
1701 tok = self.cur_token
1702 assert tok.length == 2
1703 ch = lexer.TokenSliceLeft(tok, 1)
1704 if not self.parse_opts.parse_backslash():
1705 if not pyutil.IsValidCharEscape(ch):
1706 p_die('Invalid char escape in unquoted word (OILS-ERR-13)',
1707 self.cur_token)
1708
1709 part = word_part.EscapedLiteral(self.cur_token,
1710 ch) # type: word_part_t
1711 else:
1712 part = self.cur_token
1713
1714 if is_first and self.token_type == Id.Lit_VarLike: # foo=
1715 parts.append(part)
1716 # Unfortunately it's awkward to pull the check for a=(1 2) up to
1717 # _ReadWord.
1718 next_id = self.lexer.LookPastSpace(lex_mode)
1719 if next_id == Id.Op_LParen:
1720 self.lexer.PushHint(Id.Op_RParen, Id.Right_ShArrayLiteral)
1721 part2 = self._ReadArrayLiteral()
1722 parts.append(part2)
1723
1724 # Array literal must be the last part of the word.
1725 self._SetNext(lex_mode)
1726 self._GetToken()
1727 # EOF, whitespace, newline, Right_Subshell
1728 if self.token_kind not in KINDS_THAT_END_WORDS:
1729 p_die('Unexpected token after array literal',
1730 self.cur_token)
1731 done = True
1732
1733 elif (is_first and self.parse_opts.parse_at() and
1734 self.token_type == Id.Lit_Splice):
1735
1736 splice_tok = self.cur_token
1737 part2 = word_part.Splice(splice_tok,
1738 lexer.TokenSliceLeft(splice_tok, 1))
1739
1740 parts.append(part2)
1741
1742 # @words must be the last part of the word
1743 self._SetNext(lex_mode)
1744 self._GetToken()
1745 # EOF, whitespace, newline, Right_Subshell
1746 if self.token_kind not in KINDS_THAT_END_WORDS:
1747 p_die('Unexpected token after array splice', self.cur_token)
1748 done = True
1749
1750 elif (is_first and self.parse_opts.parse_at() and
1751 self.token_type == Id.Lit_AtLBracket): # @[split(x)]
1752 part2 = self._ReadExprSub(lex_mode_e.DQ)
1753 parts.append(part2)
1754
1755 # @[split(x)]
1756 self._SetNext(lex_mode)
1757 self._GetToken()
1758 # EOF, whitespace, newline, Right_Subshell
1759 if self.token_kind not in KINDS_THAT_END_WORDS:
1760 p_die('Unexpected token after Expr splice', self.cur_token)
1761 done = True
1762
1763 elif (is_first and self.parse_opts.parse_at() and
1764 self.token_type == Id.Lit_AtLBraceDot):
1765 p_die('TODO: @{.myproc builtin sub}', self.cur_token)
1766
1767 elif (is_first and self.parse_opts.parse_at_all() and
1768 self.token_type == Id.Lit_At):
1769 # Because $[x] ${x} and perhaps $/x/ are reserved, it makes sense for @
1770 # at the beginning of a word to be reserved.
1771
1772             # Although, should we relax 'echo @'?  I'm tempted to have a
1773             # shortcut for @_argv ...
1774 p_die('Literal @ starting a word must be quoted (parse_at_all)',
1775 self.cur_token)
1776
1777 else:
1778 # not a literal with lookahead; append it
1779 parts.append(part)
1780
1781 return done
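    # The "must be the last part" checks above mean, for example (with
    # parse_at on):
    #
    #   a=(1 2)x        -> p_die: Unexpected token after array literal
    #   echo @words.    -> p_die: Unexpected token after array splice
    #   echo @[f(x)]y   -> p_die: Unexpected token after Expr splice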
1782
1783 def _ReadCompoundWord(self, lex_mode):
1784 # type: (lex_mode_t) -> CompoundWord
1785 return self._ReadCompoundWord3(lex_mode, Id.Undefined_Tok, True)
1786
1787 def _ReadCompoundWord3(self, lex_mode, eof_type, empty_ok):
1788 # type: (lex_mode_t, Id_t, bool) -> CompoundWord
1789 """
1790 Precondition: Looking at the first token of the first word part
1791 Postcondition: Looking at the token after, e.g. space or operator
1792
1793 NOTE: eof_type is necessary because / is a literal, i.e. Lit_Slash, but it
1794 could be an operator delimiting a compound word. Can we change lexer modes
1795 and remove this special case?
1796 """
1797 w = CompoundWord([])
1798 num_parts = 0
1799 brace_count = 0
1800 done = False
1801 is_triple_quoted = None # type: Optional[BoolParamBox]
1802
1803 while not done:
1804 self._GetToken()
1805
1806 allow_done = empty_ok or num_parts != 0
1807 if allow_done and self.token_type == eof_type:
1808 done = True # e.g. for ${foo//pat/replace}
1809
1810 # Keywords like "for" are treated like literals
1811 elif self.token_kind in (Kind.Lit, Kind.History, Kind.KW,
1812 Kind.ControlFlow, Kind.BoolUnary,
1813 Kind.BoolBinary):
1814
1815                 # Count { and } here; unbalanced braces are reported below
1816 if self.token_type == Id.Lit_LBrace:
1817 brace_count += 1
1818 elif self.token_type == Id.Lit_RBrace:
1819 brace_count -= 1
1820 elif self.token_type == Id.Lit_Dollar:
1821 if not self.parse_opts.parse_dollar():
1822 if num_parts == 0 and lex_mode == lex_mode_e.ShCommand:
1823 next_byte = self.lexer.ByteLookAhead()
1824 # TODO: switch lexer modes and parse $/d+/. But not ${a:-$/d+/}
1825 if next_byte == '/':
1826 #log('next_byte %r', next_byte)
1827 pass
1828
1829 p_die('Literal $ should be quoted like \$',
1830 self.cur_token)
1831
1832 done = self._MaybeReadWordPart(num_parts == 0, lex_mode,
1833 w.parts)
1834
1835 elif self.token_kind == Kind.VSub:
1836 vsub_token = self.cur_token
1837
1838 part = SimpleVarSub(vsub_token) # type: word_part_t
1839 w.parts.append(part)
1840
1841 elif self.token_kind == Kind.ExtGlob:
1842 # If parse_at, we can take over @( to start @(seq 3)
1843                 # Users can also use ,(*.py|*.sh)
1844 if (self.parse_opts.parse_at() and
1845 self.token_type == Id.ExtGlob_At and num_parts == 0):
1846 cs_part = self._ReadCommandSub(Id.Left_AtParen,
1847 d_quoted=False)
1848 # RARE mutation of tok.id!
1849 cs_part.left_token.id = Id.Left_AtParen
1850 part = cs_part # for type safety
1851
1852 # Same check as _MaybeReadWordPart. @(seq 3)x is illegal, just like
1853 # a=(one two)x and @arrayfunc(3)x.
1854 self._GetToken()
1855 if self.token_kind not in KINDS_THAT_END_WORDS:
1856 p_die('Unexpected token after @()', self.cur_token)
1857 done = True
1858
1859 else:
1860 if HAVE_FNM_EXTMATCH == 0:
1861 p_die(
1862 "Extended glob won't work without FNM_EXTMATCH support in libc",
1863 self.cur_token)
1864 part = self._ReadExtGlob()
1865 w.parts.append(part)
1866
1867 elif self.token_kind == Kind.BashRegex:
1868 if self.token_type == Id.BashRegex_LParen: # Opening (
1869 part = self._ReadBashRegexGroup()
1870 w.parts.append(part)
1871 else:
1872 assert self.token_type == Id.BashRegex_AllowedInParens
1873 p_die('Invalid token in bash regex', self.cur_token)
1874
1875 elif self.token_kind == Kind.Left:
1876 try_triple_quote = (self.parse_opts.parse_triple_quote() and
1877 lex_mode == lex_mode_e.ShCommand and
1878 num_parts == 0)
1879
1880                 # Save an allocation: only create the box when we may need it
1881 if try_triple_quote:
1882 is_triple_quoted = BoolParamBox(False)
1883
1884 part = self._ReadUnquotedLeftParts(is_triple_quoted)
1885 w.parts.append(part)
1886
1887 # NOT done yet, will advance below
1888 elif self.token_kind == Kind.Right:
1889 # Still part of the word; will be done on the next iter.
1890 if self.token_type == Id.Right_DoubleQuote:
1891 pass
1892 # Never happens, no PushHint for this case.
1893 #elif self.token_type == Id.Right_DollarParen:
1894 # pass
1895 elif self.token_type == Id.Right_Subshell:
1896 # LEXER HACK for (case x in x) ;; esac )
1897 # Rewind before it's used
1898 assert self.next_lex_mode == lex_mode_e.Undefined
1899 if self.lexer.MaybeUnreadOne():
1900 self.lexer.PushHint(Id.Op_RParen, Id.Right_Subshell)
1901 self._SetNext(lex_mode)
1902 done = True
1903 else:
1904 done = True
1905
1906 elif self.token_kind == Kind.Ignored:
1907 done = True
1908
1909 else:
1910                 # LEXER HACK for an unbalanced case clause: 'case foo in esac' is
1911                 # valid, so while testing for ESAC we may read ) before getting a
1912                 # chance to PushHint(Id.Op_RParen, Id.Right_CasePat). So here we
1913                 # unread one token and read it again.
1914
1915 # We get Id.Op_RParen at top level: case x in x) ;; esac
1916 # We get Id.Eof_RParen inside ComSub: $(case x in x) ;; esac )
1917 if self.token_type in (Id.Op_RParen, Id.Eof_RParen):
1918 # Rewind before it's used
1919 assert self.next_lex_mode == lex_mode_e.Undefined
1920 if self.lexer.MaybeUnreadOne():
1921 if self.token_type == Id.Eof_RParen:
1922 # Redo translation
1923 self.lexer.PushHint(Id.Op_RParen, Id.Eof_RParen)
1924 self._SetNext(lex_mode)
1925
1926 done = True # anything we don't recognize means we're done
1927
1928 if not done:
1929 self._SetNext(lex_mode)
1930 num_parts += 1
1931
1932 if (self.parse_opts.parse_brace() and num_parts > 1 and
1933 brace_count != 0):
1934             # { and } may appear alone, but not inside a word like foo{
1935 p_die(
1936 'Word has unbalanced { }. Maybe add a space or quote it like \{',
1937 loc.Word(w))
1938
1939 if is_triple_quoted and is_triple_quoted.b and num_parts > 1:
1940 p_die('Unexpected parts after triple quoted string',
1941 loc.WordPart(w.parts[-1]))
1942
1943 if 0:
1944 from _devbuild.gen.syntax_asdl import word_part_str
1945 word_key = ' '.join(word_part_str(p.tag()) for p in w.parts)
1946 WORD_HIST[word_key] += 1
1947 return w
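    # For instance, hi$x"y" is a single CompoundWord with three parts:
    # Lit_Chars 'hi', SimpleVarSub $x, and a DoubleQuoted part.  The loop
    # above exits on whitespace, an operator, or EOF.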
1948
1949 def _ReadArithWord(self):
1950 # type: () -> Optional[word_t]
1951 """ Helper for ReadArithWord() """
1952 self._GetToken()
1953
1954 if self.token_kind == Kind.Unknown:
1955 # e.g. happened during dynamic parsing of unset 'a[$foo]' in gherkin
1956 p_die(
1957 'Unexpected token while parsing arithmetic: %r' %
1958 lexer.TokenVal(self.cur_token), self.cur_token)
1959
1960 elif self.token_kind == Kind.Eof:
1961 return self.cur_token
1962
1963 elif self.token_kind == Kind.Ignored:
1964 # Space should be ignored.
1965 self._SetNext(lex_mode_e.Arith)
1966 return None
1967
1968 elif self.token_kind in (Kind.Arith, Kind.Right):
1969 # Id.Right_DollarDParen IS just a normal token, handled by ArithParser
1970 self._SetNext(lex_mode_e.Arith)
1971 return self.cur_token
1972
1973 elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub):
1974 return self._ReadCompoundWord(lex_mode_e.Arith)
1975
1976 else:
1977 raise AssertionError(self.cur_token)
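    # In $(( 1 + x )), successive calls yield the word '1' (Kind.Lit, via
    # _ReadCompoundWord), the token '+' (Kind.Arith), then 'x'; spaces are
    # skipped via the None return, and the closing )) comes back as an
    # ordinary Kind.Right token for the arith parser.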
1978
1979 def _ReadWord(self, word_mode):
1980 # type: (lex_mode_t) -> Optional[word_t]
1981 """Helper function for ReadWord()."""
1982
1983 # Change the pseudo lexer mode to a real lexer mode
1984 if word_mode == lex_mode_e.ShCommandFakeBrack:
1985 lex_mode = lex_mode_e.ShCommand
1986 else:
1987 lex_mode = word_mode
1988
1989 self._GetToken()
1990
1991 if self.token_kind == Kind.Eof:
1992 # No advance
1993 return self.cur_token
1994
1995 # Allow Arith for ) at end of for loop?
1996 elif self.token_kind in (Kind.Op, Kind.Redir, Kind.Arith):
1997 self._SetNext(lex_mode)
1998
1999 # Newlines are complicated. See 3x2 matrix in the comment about
2000 # self.multiline and self.newline_state above.
2001 if self.token_type == Id.Op_Newline:
2002 if self.multiline:
2003 if self.newline_state > 1:
2004 # This points at a blank line, but at least it gives the line number
2005 p_die('Invalid blank line in multiline mode',
2006 self.cur_token)
2007 return None
2008
2009 if self.returned_newline: # skip
2010 return None
2011
2012 return self.cur_token
2013
2014 elif self.token_kind == Kind.Right:
2015 if self.token_type not in (Id.Right_Subshell, Id.Right_ShFunction,
2016 Id.Right_CasePat,
2017 Id.Right_ShArrayLiteral):
2018 raise AssertionError(self.cur_token)
2019
2020 self._SetNext(lex_mode)
2021 return self.cur_token
2022
2023 elif self.token_kind in (Kind.Ignored, Kind.WS):
2024 self._SetNext(lex_mode)
2025 return None
2026
2027 else:
2028 assert self.token_kind in (Kind.VSub, Kind.Lit, Kind.History,
2029 Kind.Left, Kind.KW, Kind.ControlFlow,
2030 Kind.BoolUnary, Kind.BoolBinary,
2031 Kind.ExtGlob,
2032 Kind.BashRegex), 'Unhandled token kind'
2033
2034 if (word_mode == lex_mode_e.ShCommandFakeBrack and
2035 self.parse_opts.parse_bracket() and
2036 self.token_type == Id.Lit_LBracket):
2037 # Change [ from Kind.Lit -> Kind.Op
2038 # So CommandParser can treat
2039 # assert [42 === x]
2040 # like
2041 # json write (x)
2042 bracket_word = self.cur_token
2043 bracket_word.id = Id.Op_LBracket
2044
2045 self._SetNext(lex_mode)
2046 return bracket_word
2047
2048 # We're beginning a word. If we see Id.Lit_Pound, change to
2049 # lex_mode_e.Comment and read until end of line.
2050 if self.token_type == Id.Lit_Pound:
2051 self._SetNext(lex_mode_e.Comment)
2052 self._GetToken()
2053
2054 # NOTE: The # could be the last character in the file. It can't be
2055 # Eof_{RParen,Backtick} because #) and #` are comments.
2056 assert self.token_type in (Id.Ignored_Comment, Id.Eof_Real), \
2057 self.cur_token
2058
2059 # The next iteration will go into Kind.Ignored and set lex state to
2060 # lex_mode_e.ShCommand/etc.
2061 return None # tell ReadWord() to try again after comment
2062
2063 elif self.token_type == Id.Lit_TPound: ### doc comment
2064 self._SetNext(lex_mode_e.Comment)
2065 self._GetToken()
2066
2067 if self.token_type == Id.Ignored_Comment and self.emit_doc_token:
2068 return self.cur_token
2069
2070 return None # tell ReadWord() to try again after comment
2071
2072 else:
2073 # r'' u'' b''
2074 if (self.token_type == Id.Lit_Chars and
2075 self.lexer.LookAheadOne(
2076 lex_mode_e.ShCommand) == Id.Left_SingleQuote):
2077
2078                 # When shopt -s parse_ysh_string (the option tested below):
2079                 #     echo r'hi' is like echo 'hi'
2080                 #
2081                 #     echo u'\u{3bc}' b'\yff' also work
2082
2083 tok = self.cur_token
2084 if self.parse_opts.parse_ysh_string():
2085 if lexer.TokenEquals(tok, 'r'):
2086 left_id = Id.Left_RSingleQuote
2087 elif lexer.TokenEquals(tok, 'u'):
2088 left_id = Id.Left_USingleQuote
2089 elif lexer.TokenEquals(tok, 'b'):
2090 left_id = Id.Left_BSingleQuote
2091 else:
2092 left_id = Id.Undefined_Tok
2093
2094 if left_id != Id.Undefined_Tok:
2095 # skip the r, and then 'foo' will be read as normal
2096 self._SetNext(lex_mode_e.ShCommand)
2097
2098 self._GetToken()
2099 assert self.token_type == Id.Left_SingleQuote, self.token_type
2100
2101 # Read the word in a different lexer mode
2102 return self._ReadYshSingleQuoted(left_id)
2103
2104 return self._ReadCompoundWord(lex_mode)
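    # A sketch of the lookahead above: for r'foo', the Lit_Chars token 'r'
    # followed by Left_SingleQuote selects Id.Left_RSingleQuote, so r'foo'
    # reads like 'foo'; u'' and b'' get their own left token IDs.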
2105
2106 def ParseVarRef(self):
2107 # type: () -> BracedVarSub
2108 """DYNAMIC parsing of what's inside ${!ref}
2109
2110 # Same as VarOf production
2111 VarRefExpr = VarOf EOF
2112 """
2113 self._SetNext(lex_mode_e.VSub_1)
2114
2115 self._GetToken()
2116 if self.token_kind != Kind.VSub:
2117 p_die('Expected var name', self.cur_token)
2118
2119 part = self._ParseVarOf()
2120 # NOTE: no ${ } means no part.left and part.right
2121 part.left = part.name_tok # cheat to make test pass
2122 part.right = part.name_tok
2123
2124 self._GetToken()
2125 if self.token_type != Id.Eof_Real:
2126 p_die('Expected end of var ref expression', self.cur_token)
2127 return part
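    # e.g. if ref='x[0]', then ${!ref} re-parses the VALUE 'x[0]' here at
    # runtime, yielding the same BracedVarSub shape as ${x[0]}.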
2128
2129 def LookPastSpace(self):
2130 # type: () -> Id_t
2131 """Look ahead to the next token.
2132
2133 For the CommandParser to recognize
2134 array= (1 2 3)
2135 YSH for ( versus bash for ((
2136 YSH if ( versus if test
2137 YSH while ( versus while test
2138 YSH bare assignment 'grep =' versus 'grep foo'
2139 """
2140 assert self.token_type != Id.Undefined_Tok
2141 if self.cur_token.id == Id.WS_Space:
2142 id_ = self.lexer.LookPastSpace(lex_mode_e.ShCommand)
2143 else:
2144 id_ = self.cur_token.id
2145 return id_
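    # e.g. given 'array= (1 2 3)' from the docstring: after the word
    # 'array=', the current token is WS_Space, so we look past it and
    # return Id.Op_LParen.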
2146
2147 def LookAheadFuncParens(self):
2148 # type: () -> bool
2149 """Special lookahead for f( ) { echo hi; } to check for ( )"""
2150 assert self.token_type != Id.Undefined_Tok
2151
2152 # We have to handle 2 cases because we buffer a token
2153 if self.cur_token.id == Id.Op_LParen: # saw funcname(
2154 return self.lexer.LookAheadFuncParens(1) # go back one char
2155
2156 elif self.cur_token.id == Id.WS_Space: # saw funcname WHITESPACE
2157 return self.lexer.LookAheadFuncParens(0)
2158
2159 else:
2160 return False
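    # e.g. for 'f( ) { echo hi; }': after the name 'f', the buffered token
    # is either '(' (look back one char) or whitespace (look from here);
    # anything else means this is not a shell function definition.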
2161
2162 def ReadWord(self, word_mode):
2163 # type: (lex_mode_t) -> word_t
2164 """Read the next word, using the given lexer mode.
2165
2166 This is a stateful wrapper for the stateless _ReadWord function.
2167 """
2168 assert word_mode in (lex_mode_e.ShCommand,
2169 lex_mode_e.ShCommandFakeBrack,
2170 lex_mode_e.DBracket, lex_mode_e.BashRegex)
2171
2172 if self.buffered_word: # For integration with pgen2
2173 w = self.buffered_word
2174 self.buffered_word = None
2175 else:
2176 while True:
2177 w = self._ReadWord(word_mode)
2178 if w is not None:
2179 break
2180
2181 self.returned_newline = (word_.CommandId(w) == Id.Op_Newline)
2182 return w
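    # A typical call site (a sketch):
    #
    #   w = w_parser.ReadWord(lex_mode_e.ShCommand)
    #   if word_.CommandId(w) == Id.Op_Newline:
    #       ...  # end of line; read another word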
2183
2184 def ReadArithWord(self):
2185 # type: () -> word_t
2186 while True:
2187 w = self._ReadArithWord()
2188 if w is not None:
2189 break
2190 return w
2191
2192 def ReadHereDocBody(self, parts):
2193 # type: (List[word_part_t]) -> None
2194 """
2195 A here doc is like a double quoted context, except " isn't special.
2196 """
2197 self._ReadLikeDQ(None, False, parts)
2198 # Returns nothing
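    # e.g. in
    #
    #   cat <<EOF
    #   hi $name $(date)
    #   EOF
    #
    # the body lines parse like a double-quoted string, except that a bare
    # " stays literal.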
2199
2200 def ReadForPlugin(self):
2201 # type: () -> CompoundWord
2202 """For $PS1, $PS4, etc.
2203
2204 This is just like reading a here doc line. "\n" is allowed, as
2205 well as the typical substitutions ${x} $(echo hi) $((1 + 2)).
2206 """
2207 w = CompoundWord([])
2208 self._ReadLikeDQ(None, False, w.parts)
2209 return w
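    # e.g. PS1='$USER@$(hostname) $ ' takes this path; ${x}, $(echo hi),
    # and $((1 + 2)) all work inside prompt strings.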
2210
2211 def EmitDocToken(self, b):
2212 # type: (bool) -> None
2213 self.emit_doc_token = b
2214
2215 def Multiline(self, b):
2216 # type: (bool) -> None
2217 self.multiline = b
2218
2219
2220if 0:
2221 import collections
2222 WORD_HIST = collections.Counter()
2223
2224# vim: sw=4