# Copyright 2016 Andy Chu. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
"""
word_parse.py - Parse the shell word language.

Hairy example:

    hi$((1 + 2))"$(echo hi)"${var:-__"$(echo default)"__}

Substitutions can be nested, but which inner subs are allowed depends on the
outer sub.  Notes:

lex_mode_e.ShCommand (_ReadUnquotedLeftParts)
  All subs and quotes are allowed:
    $v ${v}  $() ``  $(())  '' ""  $'' $""  <() >()

lex_mode_e.DQ (_ReadDoubleQuotedLeftParts)
  Var, Command, Arith, but no quotes.
    $v ${v}  $() ``  $(())
  No process substitution.

lex_mode_e.Arith
  Similar to DQ: Var, Command, and Arith sub, but no process sub.  bash
  doesn't allow quotes, but OSH does.  We allow ALL FOUR kinds of quotes,
  because we need those for associative array indexing.

lex_mode_e.VSub_ArgUnquoted
  Like ShCommand, everything is allowed (even process substitutions), but we
  stop at }, and space is SIGNIFICANT.

  Example: ${a:- b }

  ${X:-$v}  ${X:-${v}}  ${X:-$(echo hi)}  ${X:-`echo hi`}  ${X:-$((1+2))}
  ${X:-'single'}  ${X:-"double"}  ${X:-$'\n'}  ${X:-<(echo hi)}

lex_mode_e.VSub_ArgDQ
  In contrast to DQ, VSub_ArgDQ accepts nested "" and $'' and $"", e.g.
  "${x:-"default"}".

  In contrast, VSub_ArgUnquoted respects single quotes and process
  substitution.

  It's weird that double quotes are allowed.  Space is also significant here,
  e.g. "${x:-a "b"}".
"""

from _devbuild.gen import grammar_nt
from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str, Kind
from _devbuild.gen.types_asdl import (lex_mode_t, lex_mode_e)
from _devbuild.gen.syntax_asdl import (
    BoolParamBox,
    Token,
    SimpleVarSub,
    loc,
    source,
    DoubleQuoted,
    SingleQuoted,
    BracedVarSub,
    CommandSub,
    InitializerWord,
    InitializerWord_t,
    bracket_op,
    bracket_op_t,
    suffix_op,
    suffix_op_t,
    rhs_word,
    rhs_word_e,
    rhs_word_t,
    word_e,
    word_t,
    CompoundWord,
    word_part,
    word_part_t,
    y_lhs_e,
    arith_expr_t,
    command,
    expr,
    expr_e,
    expr_t,
    pat_t,
    ArgList,
    Proc,
    Func,
    Subscript,
    Attribute,
    arith_expr,
    VarDecl,
    Mutation,
    word_part_e,
)
from core import alloc
from core.error import p_die
from mycpp.mylib import log
from core import pyutil
from display import ui
from frontend import consts
from frontend import lexer
from frontend import reader
from osh import tdop
from osh import arith_parse
from osh import braces
from osh import word_
from osh import word_compile
from mycpp.mylib import tagswitch

from libc import HAVE_FNM_EXTMATCH

from typing import List, Optional, Tuple, cast
from typing import TYPE_CHECKING
if TYPE_CHECKING:
    from frontend.lexer import Lexer
    from frontend.parse_lib import ParseContext
    from frontend.reader import _Reader
    from osh.cmd_parse import VarChecker

unused1 = log
unused2 = Id_str

KINDS_THAT_END_WORDS = [Kind.Eof, Kind.WS, Kind.Op, Kind.Right]


def _CheckYshWord(w):
    # type: (CompoundWord) -> bool
    """YSH word restriction

    Allowed:
      'foo'  r'foo'  --flag r'foo'
      --flag='foo'
      --flag="foo"
    Not allowed:
      --flag=r'bar'  NAME=u'value'  # ambiguous
      --flag=b''' multi '''
    """
    parts = w.parts
    n = len(parts)
    ok = True
    if n >= 2:
        for part in parts:
            if part.tag() in (word_part_e.SingleQuoted,
                              word_part_e.DoubleQuoted):
                ok = False

        # Allow special cases:
        #   --flag='val'  NAME='bar'
        # But NOT
        #   --flag=r'val'  NAME=r'val'
        if not ok:
            if (n == 2 and word_.LiteralId(parts[0]) == Id.Lit_VarLike):
                ok = True
            elif (n == 3 and word_.LiteralId(parts[0]) == Id.Lit_Chars and
                  word_.LiteralId(parts[1]) == Id.Lit_Equals):
                ok = True

    return ok


class WordEmitter(object):
    """Common interface for [ and [["""

    def __init__(self):
        # type: () -> None
        """Empty constructor for mycpp."""
        pass

    def ReadWord(self, lex_mode):
        # type: (lex_mode_t) -> word_t
        raise NotImplementedError()


class WordParser(WordEmitter):

    def __init__(self, parse_ctx, lexer, line_reader):
        # type: (ParseContext, Lexer, _Reader) -> None
        self.parse_ctx = parse_ctx
        self.lexer = lexer
        self.line_reader = line_reader
        self.arena = line_reader.arena

        self.parse_opts = parse_ctx.parse_opts
        self.a_parser = tdop.TdopParser(arith_parse.Spec(), self,
                                        self.parse_opts)
        self.Reset()

    def Init(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Used to parse arithmetic, see ParseContext."""
        self.next_lex_mode = lex_mode

    def Reset(self):
        # type: () -> None
        """Called by interactive loop."""
        # For _GetToken()
        self.cur_token = None  # type: Token
        self.token_kind = Kind.Undefined
        self.token_type = Id.Undefined_Tok

        self.next_lex_mode = lex_mode_e.ShCommand

        # Boolean mutated by CommandParser via word_.ctx_EmitDocToken.  For
        # ### doc comments
        self.emit_doc_token = False
        # Boolean mutated by CommandParser via word_.ctx_Multiline.  '...'
        # starts multiline mode.
        self.multiline = False

        # For detecting invalid \n\n in multiline mode.  Counts what we got
        # directly from the lexer.
        self.newline_state = 0
        # For consolidating \n\n -> \n for the CALLER.  This simplifies the
        # parsers that consume words.
        self.returned_newline = False

        # For integration with pgen2
        self.buffered_word = None  # type: word_t

    def _GetToken(self):
        # type: () -> None
        """Call this when you need to make a decision based on any of:

        self.token_type
        self.token_kind
        self.cur_token
        """
        if self.next_lex_mode == lex_mode_e.Undefined:
            return  # _SetNext() not called, so do nothing

        is_fake = self.next_lex_mode == lex_mode_e.BashRegexFakeInner
        real_mode = (lex_mode_e.BashRegex if is_fake else self.next_lex_mode)

        self.cur_token = self.lexer.Read(real_mode)

        # MUTATE TOKEN for fake lexer mode.
        # This is for crazy stuff bash allows, like [[ s =~ (< >) ]]
        if (is_fake and self.cur_token.id
                in (Id.WS_Space, Id.BashRegex_AllowedInParens)):
            self.cur_token.id = Id.Lit_Chars

        self.token_type = self.cur_token.id
        self.token_kind = consts.GetKind(self.token_type)

        # number of consecutive newlines, ignoring whitespace
        if self.token_type == Id.Op_Newline:
            self.newline_state += 1
        elif self.token_kind != Kind.WS:
            self.newline_state = 0

        self.parse_ctx.trail.AppendToken(self.cur_token)  # For completion
        self.next_lex_mode = lex_mode_e.Undefined

    def _SetNext(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Set the next lex state, but don't actually read a token.

        We need this for proper interactive parsing.
        """
        self.next_lex_mode = lex_mode

    def _ReadVarOpArg(self, arg_lex_mode):
        # type: (lex_mode_t) -> rhs_word_t

        # NOTE: Operators like | and < are not treated as special, so
        # ${a:- | >} is valid, even when unquoted.
        self._SetNext(arg_lex_mode)
        self._GetToken()

        w = self._ReadVarOpArg2(arg_lex_mode, Id.Undefined_Tok,
                                True)  # empty_ok

        # If the Compound has no parts, and we're in a double-quoted VarSub
        # arg, and empty_ok, then return Empty.  This is so it can evaluate
        # to the empty string and not get elided.
        #
        # Examples:
        # - "${s:-}", "${s/%pat/}"
        # It's similar to LooksLikeShAssignment where we turn x= into x=''.
        # And it has the same potential problem of not having Token location
        # info.
        #
        # NOTE: empty_ok is False only for the PatSub pattern, which means
        # we'll return a Compound with no parts, which is explicitly checked
        # with a custom error message.
        if len(w.parts) == 0 and arg_lex_mode == lex_mode_e.VSub_ArgDQ:
            return rhs_word.Empty

        return w

    def _ReadVarOpArg2(self, arg_lex_mode, eof_type, empty_ok):
        # type: (lex_mode_t, Id_t, bool) -> CompoundWord
        """Return a CompoundWord.

        Helper function for _ReadVarOpArg and used directly by
        _ReadPatSubVarOp.
        """
        w = self._ReadCompoundWord3(arg_lex_mode, eof_type, empty_ok)
        #log('w %s', w)
        tilde = word_.TildeDetect(w)
        if tilde:
            w = tilde
        return w

    def _ReadSliceVarOp(self):
        # type: () -> suffix_op.Slice
        """
        Looking at the token after the first ':'

        ArithExpr? (':' ArithExpr? )? '}'
        """
        self._NextNonSpace()

        cur_id = self.token_type

        if cur_id in (Id.Arith_RBrace, Id.Arith_Colon):  # ${a:} or ${a::}
            begin = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            begin = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()  # advance

        if cur_id == Id.Arith_RBrace:  # ${a:1} or ${@:1}
            # No length specified, so it's N
            no_length = None  # type: Optional[arith_expr_t]
            return suffix_op.Slice(begin, no_length)

        elif cur_id == Id.Arith_Colon:  # ${a:1:} or ${@:1:}
            colon_tok = self.cur_token
            self._NextNonSpace()

            if self.token_type == Id.Arith_RBrace:
                # quirky bash behavior:
                # ${a:1:} or ${a::} means length ZERO
                # but ${a:1} or ${a:} means length N
                if self.parse_opts.strict_parse_slice():
                    p_die(
                        "Slice length: Add explicit zero, or omit : for N (strict_parse_slice)",
                        colon_tok)

                length = arith_expr.EmptyZero  # type: arith_expr_t
            else:
                length = self._ReadArithExpr(Id.Arith_RBrace)

            return suffix_op.Slice(begin, length)

        else:
            p_die("Expected : or } in slice", self.cur_token)

        raise AssertionError()  # for MyPy

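    # Illustrative sketch of the quirk above (standard bash semantics):
    #   $ s=abcde
    #   $ echo ${s:1:2}   # -> bc
    #   $ echo ${s:1}     # -> bcde   no second ':' means length N
    #   $ echo ${s:1:}    # -> ''     trailing ':' means length ZERO
    # With strict_parse_slice, the last form is a parse error instead.
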
    def _ReadPatSubVarOp(self):
        # type: () -> suffix_op.PatSub
        """Looking at the first '/' after VarOf:

        VarSub = ...
               | VarOf '/' Match ( '/' WORD? )?
        Match  = '/' WORD   # can't be empty
               | '#' WORD?  # may be empty
               | '%' WORD?
        """
        slash_tok = self.cur_token  # location info
        replace_mode = Id.Undefined_Tok  # bizarre syntax / # %

        self._SetNext(lex_mode_e.VSub_ArgUnquoted)  # advance past /

        self._GetToken()
        if self.token_type == Id.Right_DollarBrace:
            pat = CompoundWord([])
            return suffix_op.PatSub(pat, rhs_word.Empty, replace_mode,
                                    slash_tok)

        if self.token_type in (Id.Lit_Slash, Id.Lit_Pound, Id.Lit_Percent):
            replace_mode = self.token_type
            self._SetNext(lex_mode_e.VSub_ArgUnquoted)

        # Bash quirk:
        # echo ${x/#/replace} has an empty pattern
        # echo ${x////replace} is non-empty; it means echo ${x//'/'/replace}
        empty_ok = replace_mode != Id.Lit_Slash
        pat = self._ReadVarOpArg2(lex_mode_e.VSub_ArgUnquoted, Id.Lit_Slash,
                                  empty_ok)
        #log('pat 1 %r', pat)

        if self.token_type == Id.Lit_Slash:
            # read until }
            replace = self._ReadVarOpArg(
                lex_mode_e.VSub_ArgUnquoted)  # type: rhs_word_t
            #log('r 1 %r', replace)
        else:
            # e.g. ${v/a} is the same as ${v/a/} -- empty replacement string
            replace = rhs_word.Empty

        self._GetToken()
        if self.token_type != Id.Right_DollarBrace:
            # This happens on invalid code
            p_die(
                "Expected } after replacement string, got %s" %
                ui.PrettyId(self.token_type), self.cur_token)

        return suffix_op.PatSub(pat, replace, replace_mode, slash_tok)

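    # Sketch of the forms parsed above (standard bash/OSH semantics):
    #   ${x/pat/repl}    replace the first match
    #   ${x//pat/repl}   replace all matches   (replace_mode Lit_Slash)
    #   ${x/#pat/repl}   anchor at the start   (replace_mode Lit_Pound)
    #   ${x/%pat/repl}   anchor at the end     (replace_mode Lit_Percent)
    #   ${x/pat}         empty replacement, same as ${x/pat/}
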
    def _ReadSubscript(self):
        # type: () -> bracket_op_t
        """ Subscript = '[' ('@' | '*' | ArithExpr) ']' """
        # Lookahead to see if we get @ or *.  Otherwise read a full
        # arithmetic expression.
        next_id = self.lexer.LookPastSpace(lex_mode_e.Arith)
        if next_id in (Id.Lit_At, Id.Arith_Star):
            op = bracket_op.WholeArray(next_id)  # type: bracket_op_t

            self._SetNext(lex_mode_e.Arith)  # skip past [
            self._GetToken()
            self._SetNext(lex_mode_e.Arith)  # skip past @
            self._GetToken()
        else:
            self._SetNext(lex_mode_e.Arith)  # skip past [
            anode = self._ReadArithExpr(Id.Arith_RBracket)
            op = bracket_op.ArrayIndex(anode)

        if self.token_type != Id.Arith_RBracket:  # Should be looking at ]
            p_die('Expected ] to close subscript', self.cur_token)

        self._SetNext(lex_mode_e.VSub_2)  # skip past ]
        self._GetToken()  # Needed to be in the same spot as no subscript

        return op

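    # Sketch of the subscript forms accepted above:
    #   ${a[1+2]}   bracket_op.ArrayIndex with a full arithmetic expression
    #   ${a[@]}     bracket_op.WholeArray, all elements
    #   ${a[*]}     bracket_op.WholeArray, elements joined when quoted
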
    def _ParseVarOf(self):
        # type: () -> BracedVarSub
        """
        VarOf = NAME Subscript?
              | NUMBER     # no subscript allowed, none of these are arrays
                           # ${@[1]} doesn't work, even though slicing does
              | VarSymbol
        """
        self._GetToken()
        name_token = self.cur_token
        self._SetNext(lex_mode_e.VSub_2)

        self._GetToken()  # Check for []
        if self.token_type == Id.VOp2_LBracket:
            bracket_op = self._ReadSubscript()
        else:
            bracket_op = None

        part = BracedVarSub.CreateNull()
        part.name_tok = name_token
        part.var_name = lexer.TokenVal(name_token)
        part.bracket_op = bracket_op
        return part

    def _ParseVarExpr(self, arg_lex_mode, allow_query=False):
        # type: (lex_mode_t, bool) -> BracedVarSub
        """Start parsing at the op -- we already skipped past the name."""
        part = self._ParseVarOf()

        self._GetToken()
        if self.token_type == Id.Right_DollarBrace:
            return part  # no ops

        op_kind = self.token_kind

        if op_kind == Kind.VTest:
            tok = self.cur_token
            arg_word = self._ReadVarOpArg(arg_lex_mode)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            part.suffix_op = suffix_op.Unary(tok, arg_word)

        elif op_kind == Kind.VOpYsh:
            tok = self.cur_token
            arg_word = self._ReadVarOpArg(arg_lex_mode)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            UP_arg_word = arg_word
            with tagswitch(arg_word) as case:
                if case(rhs_word_e.Empty):
                    pass
                elif case(rhs_word_e.Compound):
                    arg_word = cast(CompoundWord, UP_arg_word)
                    # This handles ${x|html} and ${x %.3f} now
                    # However I think ${x %.3f} should be statically parsed?
                    # It can enter the printf lexer modes.
                    ok, arg, quoted = word_.StaticEval(arg_word)
                    if not ok or quoted:
                        p_die('Expected a constant argument',
                              loc.Word(arg_word))

            part.suffix_op = suffix_op.Static(tok, arg)

        elif op_kind == Kind.VOp0:
            part.suffix_op = self.cur_token  # Nullary
            self._SetNext(lex_mode_e.VSub_2)  # Expecting }
            self._GetToken()

        elif op_kind == Kind.VOp1:  # % %% # ## etc.
            tok = self.cur_token
            # Weird exception that all shells have: these operators take a
            # glob pattern, so they're lexed as VSub_ArgUnquoted, not
            # VSub_ArgDQ
            arg_word = self._ReadVarOpArg(lex_mode_e.VSub_ArgUnquoted)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            part.suffix_op = suffix_op.Unary(tok, arg_word)

        elif op_kind == Kind.VOp2:  # / : [ ]
            if self.token_type == Id.VOp2_Slash:
                patsub_op = self._ReadPatSubVarOp()  # type: suffix_op_t
                part.suffix_op = patsub_op

                # Checked by the method above
                assert self.token_type == Id.Right_DollarBrace, self.cur_token

            elif self.token_type == Id.VOp2_Colon:
                part.suffix_op = self._ReadSliceVarOp()
                # NOTE: } in arithmetic mode.
                if self.token_type != Id.Arith_RBrace:
                    # Token seems off; doesn't point to X in ${a:1:2 X
                    p_die('Expected } to close ${', self.cur_token)

            else:
                # TODO: Does this ever happen?
                p_die('Unexpected token in ${} (%s)' % 'VOp2', self.cur_token)

        elif op_kind == Kind.VOp3:  # ${prefix@} etc.
            if allow_query:
                part.suffix_op = self.cur_token  # Nullary
                self._SetNext(lex_mode_e.VSub_2)  # Expecting }
                self._GetToken()
            else:
                p_die("Unexpected token in ${} (%s)" % 'VOp3', self.cur_token)

        # NOTE: Arith_RBrace is for slicing, because it reads } in arithmetic
        # mode.  It's redundantly checked above.
        if self.token_type not in (Id.Right_DollarBrace, Id.Arith_RBrace):
            # ${a.} or ${!a.}
            p_die('Expected } to close ${', self.cur_token)

        # Now look for ops
        return part

    def _ReadZshVarSub(self, left_token):
        # type: (Token) -> word_part.ZshVarSub

        self._SetNext(lex_mode_e.VSub_Zsh)  # Move past ${(foo)

        # Can be empty
        w = self._ReadCompoundWord3(lex_mode_e.VSub_Zsh,
                                    Id.Right_DollarBrace, True)
        self._GetToken()
        return word_part.ZshVarSub(left_token, w, self.cur_token)

    def ReadBracedVarSub(self, left_token):
        # type: (Token) -> Tuple[BracedVarSub, Token]
        """For YSH expressions like var x = ${x:-"default"}."""
        part = self._ReadBracedVarSub(left_token, d_quoted=False)
        last_token = self.cur_token
        return part, last_token

    def _ReadBracedVarSub(self, left_token, d_quoted):
        # type: (Token, bool) -> BracedVarSub
        """For the ${} expression language.

        NAME       = [a-zA-Z_][a-zA-Z0-9_]*
        NUMBER     = [0-9]+                  # ${10}, ${11}, ...

        Subscript  = '[' ('@' | '*' | ArithExpr) ']'
        VarSymbol  = '!' | '@' | '#' | ...
        VarOf      = NAME Subscript?
                   | NUMBER   # no subscript allowed, none of these are
                              # arrays; ${@[1]} doesn't work, even though
                              # slicing does
                   | VarSymbol

        NULLARY_OP = '@Q' | '@E' | '@P' | '@A' | '@a'  # VOp0

        TEST_OP    = '-' | ':-' | '=' | ':=' | '+' | ':+' | '?' | ':?'
        STRIP_OP   = '#' | '##' | '%' | '%%'
        CASE_OP    = ',' | ',,' | '^' | '^^'
        UnaryOp    = TEST_OP | STRIP_OP | CASE_OP

        YSH_UNARY  = '|' | ' '   # ${x|html} and ${x %.3f}.
                                 # SPACE is operator not %
        Match      = ('/' | '#' | '%') WORD  # match all / prefix / suffix
        VarExpr    = VarOf
                   | VarOf NULLARY_OP
                   | VarOf UnaryOp WORD
                   | VarOf YSH_UNARY STATIC_WORD
                   | VarOf ':' ArithExpr (':' ArithExpr )?
                   | VarOf '/' Match '/' WORD

        LengthExpr = '#' VarOf   # can't apply operators after length

        RefOrKeys  = '!' VarExpr  # CAN apply operators after a named ref
                                  # ${!ref[0]} vs ${!keys[@]} resolved later

        PrefixQuery = '!' NAME ('*' | '@')  # list variable names with a
                                            # prefix

        BuiltinSub = '.' WORD+   # ${.myproc 'builtin' $sub}

        VarSub = LengthExpr
               | RefOrKeys
               | PrefixQuery
               | VarExpr
               | BuiltinSub

        NOTES:
        - Arithmetic expressions are used twice, inside subscripts ${a[x+1]}
          and slicing ${a:x+1:y+2}
        - ${#} and ${!} need LL(2) lookahead (considering how my tokenizer
          works)
        - @ and * are technically arithmetic expressions in this
          implementation
        - We don't account for bash 4.4: ${param@operator} -- Q E P A a.
          Note that it's also vectorized.

        Strictness over bash:
        - echo ${a[0][0]} doesn't do anything useful, so we disallow it from
          the grammar
        - ! and # prefixes can't be composed, even though named refs can be
          composed with other operators
        - '#' means 4 different things: length prefix, VarSymbol, UnaryOp to
          strip a prefix, and it can also be a literal part of WORD.

        From the parser's point of view, the prefix # can't be combined with
        UnaryOp/slicing/matching, and the ! can.  However:

        - ${a[@]:1:2} is not allowed
        - ${#a[@]:1:2} is allowed, but gives the wrong answer
        """
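        # Sketch: concrete instances of the productions above.
        #   ${#s}      LengthExpr
        #   ${!ref}    RefOrKeys
        #   ${!pre*}   PrefixQuery
        #   ${s:-x}    VarOf TEST_OP WORD
        #   ${s%%.c}   VarOf STRIP_OP WORD
        #   ${s:1:2}   VarOf ':' ArithExpr ':' ArithExpr
        #   ${s/a/b}   VarOf '/' Match '/' WORD
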
        if d_quoted:
            arg_lex_mode = lex_mode_e.VSub_ArgDQ
        else:
            arg_lex_mode = lex_mode_e.VSub_ArgUnquoted

        self._SetNext(lex_mode_e.VSub_1)
        self._GetToken()

        ty = self.token_type
        first_tok = self.cur_token

        if ty == Id.VSub_Pound:
            # Disambiguate
            next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
            if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
                # e.g. a name, '#' is the prefix
                self._SetNext(lex_mode_e.VSub_1)
                part = self._ParseVarOf()

                self._GetToken()
                if self.token_type != Id.Right_DollarBrace:
                    p_die('Expected } after length expression',
                          self.cur_token)

                part.prefix_op = first_tok

            else:  # not a prefix, '#' is the variable
                part = self._ParseVarExpr(arg_lex_mode)

        elif ty == Id.VSub_Bang:
            next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
            if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
                # e.g. a name, '!' is the prefix
                # ${!a} -- this is a ref
                # ${!3} -- this is ref
                # ${!a[1]} -- this is a ref
                # ${!a[@]} -- this is a keys
                # No lookahead -- do it in a second step, or at runtime
                self._SetNext(lex_mode_e.VSub_1)
                part = self._ParseVarExpr(arg_lex_mode, allow_query=True)

                part.prefix_op = first_tok

            else:  # not a prefix, '!' is the variable
                part = self._ParseVarExpr(arg_lex_mode)

        elif ty == Id.VSub_Dot:
            # Note: this will become a new builtin_sub type, so this method
            # must return word_part_t rather than BracedVarSub.  I don't
            # think that should cause problems.
            p_die('TODO: ${.myproc builtin sub}', self.cur_token)

        # VS_NAME, VS_NUMBER, symbol that isn't # or !
        elif self.token_kind == Kind.VSub:
            part = self._ParseVarExpr(arg_lex_mode)

        else:
            # e.g. ${^}
            p_die('Unexpected token in ${}', self.cur_token)

        part.left = left_token  # attach the argument
        part.right = self.cur_token
        return part

    def _ReadSingleQuoted(self, left_token, lex_mode):
        # type: (Token, lex_mode_t) -> SingleQuoted
        """Internal method to read a word_part."""
        tokens = []  # type: List[Token]
        # In command mode, we never disallow backslashes like '\'
        right_quote = self.ReadSingleQuoted(lex_mode, left_token, tokens,
                                            False)
        sval = word_compile.EvalSingleQuoted(left_token.id, tokens)
        node = SingleQuoted(left_token, sval, right_quote)
        return node

    def ReadSingleQuoted(self, lex_mode, left_token, out_tokens, is_ysh_expr):
        # type: (lex_mode_t, Token, List[Token], bool) -> Token
        """Appends to out_tokens; returns last token

        Used by expr_parse.py
        """
        # TODO: Remove and use out_tokens
        tokens = []  # type: List[Token]

        # echo '\' is allowed, but x = '\' is invalid, in favor of x = r'\'
        no_backslashes = is_ysh_expr and left_token.id == Id.Left_SingleQuote

        expected_end_tokens = 3 if left_token.id in (
            Id.Left_TSingleQuote, Id.Left_RTSingleQuote,
            Id.Left_UTSingleQuote, Id.Left_BTSingleQuote) else 1
        num_end_tokens = 0

        while num_end_tokens < expected_end_tokens:
            self._SetNext(lex_mode)
            self._GetToken()

            # Kind.Char emitted in lex_mode.SQ_C
            if self.token_kind in (Kind.Lit, Kind.Char):
                tok = self.cur_token
                # Happens in lex_mode_e.SQ: 'one\two' is ambiguous, should be
                # r'one\two' or c'one\\two'
                if no_backslashes and lexer.TokenContains(tok, '\\'):
                    p_die(
                        r"Strings with backslashes should look like r'\n' or u'\n' or b'\n'",
                        tok)

                if is_ysh_expr:
                    # Disallow var x = $'\001'.  Arguably we don't need these
                    # checks because u'\u{1}' is the way to write it.
                    if self.token_type == Id.Char_Octal3:
                        p_die(
                            r"Use \xhh or \u{...} instead of octal escapes in YSH strings",
                            tok)

                    if self.token_type == Id.Char_Hex and self.cur_token.length != 4:
                        # disallow \xH
                        p_die(
                            r'Invalid hex escape in YSH string (must be \xHH)',
                            tok)

                tokens.append(tok)

            elif self.token_kind == Kind.Unknown:
                tok = self.cur_token
                assert tok.id == Id.Unknown_Backslash, tok

                # x = $'\z' is disallowed; ditto for echo $'\z' if
                # shopt -u parse_backslash
                if is_ysh_expr or not self.parse_opts.parse_backslash():
                    p_die(
                        "Invalid char escape in C-style string literal (OILS-ERR-11)",
                        tok)

                tokens.append(tok)

            elif self.token_kind == Kind.Eof:
                p_die(
                    'Unexpected EOF in single-quoted string that began here',
                    left_token)

            elif self.token_kind == Kind.Right:
                # assume Id.Right_SingleQuote
                num_end_tokens += 1
                tokens.append(self.cur_token)

            else:
                raise AssertionError(self.cur_token)

            if self.token_kind != Kind.Right:
                num_end_tokens = 0  # we need three in a ROW

        if expected_end_tokens == 1:
            tokens.pop()
        elif expected_end_tokens == 3:  # Get rid of spurious end tokens
            tokens.pop()
            tokens.pop()
            tokens.pop()

        # Remove space from ''' r''' $''' in both expression mode and command
        # mode
        if left_token.id in (Id.Left_TSingleQuote, Id.Left_RTSingleQuote,
                             Id.Left_UTSingleQuote, Id.Left_BTSingleQuote):
            word_compile.RemoveLeadingSpaceSQ(tokens)

        # Validation after lexing - same 2 checks in j8.LexerDecoder
        is_u_string = left_token.id in (Id.Left_USingleQuote,
                                        Id.Left_UTSingleQuote)

        for tok in tokens:
            # u'\yff' is not valid, but b'\yff' is
            if is_u_string and tok.id == Id.Char_YHex:
                p_die(
                    r"%s escapes not allowed in u'' strings" %
                    lexer.TokenVal(tok), tok)

        out_tokens.extend(tokens)
        return self.cur_token

    def _ReadDoubleQuotedLeftParts(self):
        # type: () -> word_part_t
        """Read substitution parts in a double quoted context."""
        if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick):
            return self._ReadCommandSub(self.token_type, d_quoted=True)

        if self.token_type == Id.Left_DollarBrace:
            return self._ReadBracedVarSub(self.cur_token, d_quoted=True)

        if self.token_type == Id.Left_DollarDParen:
            return self._ReadArithSub()

        if self.token_type == Id.Left_DollarBracket:
            return self._ReadExprSub(lex_mode_e.DQ)

        if self.token_type == Id.Left_DollarBraceZsh:
            return self._ReadZshVarSub(self.cur_token)

        raise AssertionError(self.cur_token)

    def _ReadYshSingleQuoted(self, left_id):
        # type: (Id_t) -> CompoundWord
        """Read YSH style strings

        r'' u'' b''
        r''' ''' u''' ''' b''' '''
        """
        #log('BEF self.cur_token %s', self.cur_token)
        if left_id == Id.Left_RSingleQuote:
            lexer_mode = lex_mode_e.SQ_Raw
            triple_left_id = Id.Left_RTSingleQuote
        elif left_id == Id.Left_USingleQuote:
            lexer_mode = lex_mode_e.J8_Str
            triple_left_id = Id.Left_UTSingleQuote
        elif left_id == Id.Left_BSingleQuote:
            lexer_mode = lex_mode_e.J8_Str
            triple_left_id = Id.Left_BTSingleQuote
        else:
            raise AssertionError(left_id)

        # Needed for syntax checks
        left_tok = self.cur_token
        left_tok.id = left_id

        sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)

        if (len(sq_part.sval) == 0 and self.lexer.ByteLookAhead() == "'"):
            self._SetNext(lex_mode_e.ShCommand)
            self._GetToken()

            assert self.token_type == Id.Left_SingleQuote
            # HACK: magically transform the third ' in u''' to
            # Id.Left_UTSingleQuote, so that ''' is the terminator
            left_tok = self.cur_token
            left_tok.id = triple_left_id

            # Handles stripping leading whitespace
            sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)

        # Advance and validate
        self._SetNext(lex_mode_e.ShCommand)

        self._GetToken()
        if self.token_kind not in KINDS_THAT_END_WORDS:
            p_die('Unexpected token after YSH single-quoted string',
                  self.cur_token)

        return CompoundWord([sq_part])

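    # Sketch of the YSH string forms read by this method:
    #   r'C:\dir'       raw mode (SQ_Raw): backslashes are literal
    #   u'\u{3bc}'      J8 mode (J8_Str): \u{...} escapes
    #   b'\yff'         like u'', but \y byte escapes are also allowed
    #   r''' ... '''    triple-quoted variants strip leading whitespace
    # There is no $''' -- only the forms above get a triple-quoted variant.
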
    def _ReadUnquotedLeftParts(self, triple_out):
        # type: (Optional[BoolParamBox]) -> word_part_t
        """Read substitutions and quoted strings (for lex_mode_e.ShCommand).

        If triple_out is set, then we try parsing triple quoted strings,
        and set its value to True if we got one.
        """
        if self.token_type in (Id.Left_DoubleQuote,
                               Id.Left_DollarDoubleQuote):
            # Note: $"" is a synonym for "".  It might make sense if it added
            # \n \0 \x00 \u{123} etc.  But that's not what bash does!
            dq_part = self._ReadDoubleQuoted(self.cur_token)
            # Got empty word "" and there's a " after
            if (triple_out and len(dq_part.parts) == 0 and
                    self.lexer.ByteLookAhead() == '"'):

                self._SetNext(lex_mode_e.ShCommand)
                self._GetToken()
                # HACK: magically transform the third " in """ to
                # Id.Left_TDoubleQuote, so that """ is the terminator
                left_dq_token = self.cur_token
                left_dq_token.id = Id.Left_TDoubleQuote
                triple_out.b = True  # let caller know we got it
                return self._ReadDoubleQuoted(left_dq_token)

            return dq_part

        if self.token_type in (Id.Left_SingleQuote, Id.Left_RSingleQuote,
                               Id.Left_DollarSingleQuote):
            if self.token_type == Id.Left_SingleQuote:
                lexer_mode = lex_mode_e.SQ_Raw
                triple_left_id = Id.Left_TSingleQuote
            elif self.token_type == Id.Left_RSingleQuote:
                lexer_mode = lex_mode_e.SQ_Raw
                triple_left_id = Id.Left_RTSingleQuote
            else:
                lexer_mode = lex_mode_e.SQ_C
                # there is no such thing as $'''
                triple_left_id = Id.Undefined_Tok

            sq_part = self._ReadSingleQuoted(self.cur_token, lexer_mode)

            # Got empty '' or r'' and there's a ' after
            # u'' and b'' are handled in _ReadYshSingleQuoted
            if (triple_left_id != Id.Undefined_Tok and
                    triple_out is not None and len(sq_part.sval) == 0 and
                    self.lexer.ByteLookAhead() == "'"):

                self._SetNext(lex_mode_e.ShCommand)
                self._GetToken()

                # HACK: magically transform the third ' in ''' to
                # Id.Left_TSingleQuote, so that ''' is the terminator
                left_sq_token = self.cur_token
                left_sq_token.id = triple_left_id

                triple_out.b = True  # let caller know we got it
                return self._ReadSingleQuoted(left_sq_token, lexer_mode)

            return sq_part

        if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick,
                               Id.Left_ProcSubIn, Id.Left_ProcSubOut):
            return self._ReadCommandSub(self.token_type, d_quoted=False)

        if self.token_type == Id.Left_DollarBrace:
            return self._ReadBracedVarSub(self.cur_token, d_quoted=False)

        if self.token_type == Id.Left_DollarDParen:
            return self._ReadArithSub()

        if self.token_type == Id.Left_DollarBracket:
            return self._ReadExprSub(lex_mode_e.ShCommand)

        if self.token_type == Id.Left_DollarBraceZsh:
            return self._ReadZshVarSub(self.cur_token)

        raise AssertionError(self.cur_token)

    def _ReadExtGlob(self):
        # type: () -> word_part.ExtGlob
        """
        Grammar:
          Item    = CompoundWord | EPSILON  # important: @(foo|) is allowed
          LEFT    = '@(' | '*(' | '+(' | '?(' | '!('
          RIGHT   = ')'
          ExtGlob = LEFT (Item '|')* Item RIGHT  # ITEM may be empty
          Compound includes ExtGlob
        """
        left_token = self.cur_token
        right_token = None  # type: Token
        arms = []  # type: List[CompoundWord]

        self.lexer.PushHint(Id.Op_RParen, Id.Right_ExtGlob)
        self._SetNext(lex_mode_e.ExtGlob)  # advance past LEFT

        read_word = False  # did we just read a word?  To handle @(||).

        while True:
            self._GetToken()

            if self.token_type == Id.Right_ExtGlob:
                if not read_word:
                    arms.append(CompoundWord([]))
                right_token = self.cur_token
                break

            elif self.token_type == Id.Op_Pipe:
                if not read_word:
                    arms.append(CompoundWord([]))
                read_word = False
                self._SetNext(lex_mode_e.ExtGlob)

            # lex_mode_e.ExtGlob should only produce these 4 kinds of tokens
            elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub,
                                     Kind.ExtGlob):
                w = self._ReadCompoundWord(lex_mode_e.ExtGlob)
                arms.append(w)
                read_word = True

            elif self.token_kind == Kind.Eof:
                p_die('Unexpected EOF reading extended glob that began here',
                      left_token)

            else:
                raise AssertionError(self.cur_token)

        return word_part.ExtGlob(left_token, arms, right_token)

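    # Sketch of the extended glob forms produced by this grammar (standard
    # bash extglob semantics):
    #   @(foo|bar)   exactly one of the arms
    #   *(foo)       zero or more occurrences
    #   +(foo)       one or more occurrences
    #   ?(foo)       zero or one occurrence
    #   !(foo)       anything except
    #   @(foo|)      the second Item is EPSILON, i.e. an empty CompoundWord
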
    def _ReadBashRegexGroup(self):
        # type: () -> word_part.BashRegexGroup
        """
        Grammar:
        BashRegexGroup = '(' WORD? ')'
        """
        left_token = self.cur_token
        assert left_token.id == Id.BashRegex_LParen, left_token

        arms = []  # type: List[CompoundWord]

        self.lexer.PushHint(Id.Op_RParen, Id.Right_BashRegexGroup)
        self._SetNext(lex_mode_e.BashRegexFakeInner)  # advance past LEFT

        self._GetToken()
        if self.token_type == Id.Right_BashRegexGroup:  # empty ()
            return word_part.BashRegexGroup(left_token, None, self.cur_token)

        # lex_mode_e.BashRegex should only produce these 4 kinds of tokens
        if self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub,
                               Kind.BashRegex):
            # Fake lexer mode that translates Id.WS_Space to Id.Lit_Chars
            # To allow bash style [[ s =~ (a b) ]]
            w = self._ReadCompoundWord(lex_mode_e.BashRegexFakeInner)
            arms.append(w)

            self._GetToken()
            if self.token_type != Id.Right_BashRegexGroup:
                p_die('Expected ) to close bash regex group', self.cur_token)

            return word_part.BashRegexGroup(left_token, w, self.cur_token)

        p_die('Expected word after ( opening bash regex group',
              self.cur_token)

    def _ReadLikeDQ(self, left_token, is_ysh_expr, out_parts):
        # type: (Optional[Token], bool, List[word_part_t]) -> None
        """
        Args:
          left_token: A token if we are reading a double quoted part, or
            None if we're reading a here doc.
          is_ysh_expr: Whether to disallow backticks and invalid char escapes
          out_parts: list of word_part to append to
        """
        if left_token:
            if left_token.id in (Id.Left_TDoubleQuote,
                                 Id.Left_DollarTDoubleQuote):
                expected_end_tokens = 3
            else:
                expected_end_tokens = 1
        else:
            expected_end_tokens = 1000  # here doc will break

        num_end_tokens = 0
        while num_end_tokens < expected_end_tokens:
            self._SetNext(lex_mode_e.DQ)
            self._GetToken()

            if self.token_kind == Kind.Lit:
                if self.token_type == Id.Lit_EscapedChar:
                    tok = self.cur_token
                    ch = lexer.TokenSliceLeft(tok, 1)
                    part = word_part.EscapedLiteral(tok,
                                                    ch)  # type: word_part_t
                else:
                    if self.token_type == Id.Lit_BadBackslash:
                        # echo "\z" is OK in shell, but 'x = "\z"' is a
                        # syntax error in YSH.
                        # Slight hole: We don't catch 'x = ${undef:-"\z"}'
                        # because of the recursion (unless parse_backslash)
                        if (is_ysh_expr or
                                not self.parse_opts.parse_backslash()):
                            p_die(
                                "Invalid char escape in double quoted string (OILS-ERR-12)",
                                self.cur_token)
                    elif self.token_type == Id.Lit_Dollar:
                        if is_ysh_expr or not self.parse_opts.parse_dollar():
                            p_die("Literal $ should be quoted like \$",
                                  self.cur_token)

                    part = self.cur_token
                out_parts.append(part)

            elif self.token_kind == Kind.Left:
                if self.token_type == Id.Left_Backtick and is_ysh_expr:
                    p_die(
                        "Invalid backtick: use $(cmd) or \\` in YSH strings",
                        self.cur_token)

                part = self._ReadDoubleQuotedLeftParts()
                out_parts.append(part)

            elif self.token_kind == Kind.VSub:
                tok = self.cur_token
                part = SimpleVarSub(tok)
                out_parts.append(part)
                # NOTE: parsing "$f(x)" would BREAK CODE.  Could add more
                # support for it later.

            elif self.token_kind == Kind.Right:
                assert self.token_type == Id.Right_DoubleQuote, self.token_type
                if left_token:
                    num_end_tokens += 1

                # In a here doc, the right quote is literal!
                out_parts.append(self.cur_token)

            elif self.token_kind == Kind.Eof:
                if left_token:
                    p_die(
                        'Unexpected EOF reading double-quoted string that began here',
                        left_token)
                else:  # here docs will have an EOF in their token stream
                    break

            else:
                raise AssertionError(self.cur_token)

            if self.token_kind != Kind.Right:
                num_end_tokens = 0  # """ must be CONSECUTIVE

        if expected_end_tokens == 1:
            out_parts.pop()
        elif expected_end_tokens == 3:
            out_parts.pop()
            out_parts.pop()
            out_parts.pop()

        # Remove space from """ in both expression mode and command mode
        if (left_token and left_token.id
                in (Id.Left_TDoubleQuote, Id.Left_DollarTDoubleQuote)):
            word_compile.RemoveLeadingSpaceDQ(out_parts)

        # Return nothing, since we appended to 'out_parts'

    def _ReadDoubleQuoted(self, left_token):
        # type: (Token) -> DoubleQuoted
        """Helper function for reading "hello $name".

        Here docs are handled by _ReadLikeDQ directly, with left_token=None.

        Also ${foo%%a b c} -- treat the argument as double quoted until you
        hit }.
        """
        parts = []  # type: List[word_part_t]
        self._ReadLikeDQ(left_token, False, parts)

        right_quote = self.cur_token
        return DoubleQuoted(left_token, parts, right_quote)

    def ReadDoubleQuoted(self, left_token, parts):
        # type: (Token, List[word_part_t]) -> Token
        """For expression mode.

        Read var x = "${dir:-}/$name"; etc.
        """
        self._ReadLikeDQ(left_token, True, parts)
        return self.cur_token

    def _ReadCommandSub(self, left_id, d_quoted=False):
        # type: (Id_t, bool) -> CommandSub
        """
        NOTE: This is not in the grammar, because word parts aren't in the
        grammar!

        command_sub = '$(' command_list ')'
                    | '@(' command_list ')'
                    | '<(' command_list ')'
                    | '>(' command_list ')'
                    | ` command_list `
        """
        left_token = self.cur_token

        # Set the lexer in a state so ) becomes the EOF token.
        if left_id in (Id.Left_DollarParen, Id.Left_AtParen,
                       Id.Left_ProcSubIn, Id.Left_ProcSubOut):
            self._SetNext(lex_mode_e.ShCommand)  # advance past $( etc.

            right_id = Id.Eof_RParen
            self.lexer.PushHint(Id.Op_RParen, right_id)
            c_parser = self.parse_ctx.MakeParserForCommandSub(
                self.line_reader, self.lexer, right_id)
            # NOTE: This doesn't use something like main_loop because we
            # don't want to interleave parsing and execution!  Unlike
            # 'source' and 'eval'.
            node = c_parser.ParseCommandSub()

            right_token = c_parser.w_parser.cur_token

        elif left_id == Id.Left_Backtick and self.parse_ctx.do_lossless:
            # NOTE: This is an APPROXIMATE solution for translation ONLY.
            # See test/osh2oil.

            right_id = Id.Eof_Backtick
            self.lexer.PushHint(Id.Left_Backtick, right_id)
            c_parser = self.parse_ctx.MakeParserForCommandSub(
                self.line_reader, self.lexer, right_id)
            node = c_parser.ParseCommandSub()
            right_token = c_parser.w_parser.cur_token

        elif left_id == Id.Left_Backtick:
            if not self.parse_opts.parse_backticks():
                p_die('Use $(cmd) instead of backticks (parse_backticks)',
                      left_token)

            self._SetNext(lex_mode_e.Backtick)  # advance past `

            parts = []  # type: List[str]
            while True:
                self._GetToken()
                #log("TOK %s", self.cur_token)

                if self.token_type == Id.Backtick_Quoted:
                    # Remove leading \
                    parts.append(lexer.TokenSliceLeft(self.cur_token, 1))

                elif self.token_type == Id.Backtick_DoubleQuote:
                    # Compatibility: If backticks are double quoted, then
                    # double quotes within them have to be \"
                    # Shells aren't smart enough to match nested " and `
                    # quotes (but OSH is)
                    if d_quoted:
                        # Remove leading \
                        parts.append(lexer.TokenSliceLeft(self.cur_token, 1))
                    else:
                        parts.append(lexer.TokenVal(self.cur_token))

                elif self.token_type == Id.Backtick_Other:
                    parts.append(lexer.TokenVal(self.cur_token))

                elif self.token_type == Id.Backtick_Right:
                    break

                elif self.token_type == Id.Eof_Real:
                    # Note: this parse error is in the ORIGINAL context.
                    # No code_str yet.
                    p_die('Unexpected EOF while looking for closing backtick',
                          left_token)

                else:
                    raise AssertionError(self.cur_token)

                self._SetNext(lex_mode_e.Backtick)

            # Calculate right SPID on CommandSub BEFORE re-parsing.
            right_token = self.cur_token

            code_str = ''.join(parts)
            #log('code %r', code_str)

            # NOTE: This is similar to how we parse aliases in
            # osh/cmd_parse.py.  It won't have the same location info as
            # MakeParserForCommandSub(), because the lexer is different.
            arena = self.parse_ctx.arena
            #arena = alloc.Arena()
            line_reader = reader.StringLineReader(code_str, arena)
            c_parser = self.parse_ctx.MakeOshParser(line_reader)
            src = source.Reparsed('backticks', left_token, right_token)
            with alloc.ctx_SourceCode(arena, src):
                node = c_parser.ParseCommandSub()

        else:
            raise AssertionError(left_id)

        return CommandSub(left_token, node, right_token)

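    # Sketch of the backtick re-parsing above (standard shell semantics):
    #     echo "`echo \"hi\"`"
    # The lexer yields Backtick_DoubleQuote tokens for \", the leading
    # backslashes are stripped, and the collected code_str 'echo "hi"' is
    # parsed again.  $(echo "hi") avoids the extra quoting level entirely.
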
    def _ReadExprSub(self, lex_mode):
        # type: (lex_mode_t) -> word_part.ExprSub
        """$[d->key]  $[obj.method()]  etc."""
        left_token = self.cur_token

        self._SetNext(lex_mode_e.Expr)
        enode, right_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.ysh_expr_sub)

        self._SetNext(lex_mode)  # Move past ]
        return word_part.ExprSub(left_token, enode, right_token)

    def ParseVarDecl(self, kw_token):
        # type: (Token) -> VarDecl
        """
        oil_var_decl: name_type_list '=' testlist end_stmt

        Note that assignments must end with \n  ;  }  or EOF.  Unlike shell
        assignments, we disallow:

        var x = 42 | wc -l
        var x = 42 && echo hi
        """
        self._SetNext(lex_mode_e.Expr)
        enode, last_token = self.parse_ctx.ParseVarDecl(kw_token, self.lexer)
        # Hack to move } from what the Expr lexer modes gives to what
        # CommandParser wants
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace

        # Let the CommandParser see the Op_Semi or Op_Newline.
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)  # always back to this
        return enode

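    # Sketch of the token hand-off above: after 'var x = 42; echo hi', the
    # expression parser consumes up to and including ';'.  That Op_Semi is
    # stashed in self.buffered_word, so the CommandParser still sees the
    # statement terminator it expects.
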
    def ParseMutation(self, kw_token, var_checker):
        # type: (Token, VarChecker) -> Mutation
        """
        setvar i = 42
        setvar i += 1
        setvar a[i] = 42
        setvar a[i] += 1
        setvar d.key = 42
        setvar d.key += 1
        """
        self._SetNext(lex_mode_e.Expr)
        enode, last_token = self.parse_ctx.ParseMutation(kw_token, self.lexer)
        # Hack to move } from what the Expr lexer modes gives to what
        # CommandParser wants
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace

        for lhs in enode.lhs:
            UP_lhs = lhs
            with tagswitch(lhs) as case:
                if case(y_lhs_e.Var):
                    lhs = cast(Token, UP_lhs)
                    var_checker.Check(kw_token.id, lexer.LazyStr(lhs), lhs)

                # Note: this does not cover cases like
                #   setvar (a[0])[1] = v
                #   setvar (d.key).other = v
                # This leaks into catching all typos statically, which may
                # be possible if 'use' makes all names explicit.
                elif case(y_lhs_e.Subscript):
                    lhs = cast(Subscript, UP_lhs)
                    if lhs.obj.tag() == expr_e.Var:
                        v = cast(expr.Var, lhs.obj)
                        var_checker.Check(kw_token.id, v.name, v.left)

                elif case(y_lhs_e.Attribute):
                    lhs = cast(Attribute, UP_lhs)
                    if lhs.obj.tag() == expr_e.Var:
                        v = cast(expr.Var, lhs.obj)
                        var_checker.Check(kw_token.id, v.name, v.left)

        # Let the CommandParser see the Op_Semi or Op_Newline.
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)  # always back to this
        return enode

    def ParseBareDecl(self):
        # type: () -> expr_t
        """
        x = {name: val}
        """
        self._SetNext(lex_mode_e.Expr)
        self._GetToken()
        enode, last_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.command_expr)
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)
        return enode

    def ParseYshExprForCommand(self):
        # type: () -> expr_t

        # Fudge for this case
        #  for x in(y) {
        # versus
        #  for x in (y) {
        #
        # In the former case, ReadWord on 'in' puts the lexer past (.
        # Also see LookPastSpace in CommandParser.
        # A simpler solution would be nicer.

        if self.token_type == Id.Op_LParen:
            self.lexer.MaybeUnreadOne()

        enode, _ = self.parse_ctx.ParseYshExpr(self.lexer,
                                               grammar_nt.ysh_expr)

        self._SetNext(lex_mode_e.ShCommand)
        return enode

    def ParseCommandExpr(self):
        # type: () -> expr_t
        """
        = 1+2
        """
        enode, last_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.command_expr)

        # In some cases, such as the case statement, we expect *the lexer*
        # to be pointing at the token right after the expression.  But the
        # expression parser must have read to the `last_token`.  Unreading
        # places the lexer back in the expected state.  I.e.:
        #
        # case (x) {                       case (x) {
        #   (else) { = x }                   (else) { = x }
        #          ^ The lexer is here              ^ Unread to here
        # }                                }
        assert last_token.id in (Id.Op_Newline, Id.Eof_Real, Id.Op_Semi,
                                 Id.Op_RBrace), last_token
        if last_token.id != Id.Eof_Real:
            # Eof_Real is the only token we cannot unread
            self.lexer.MaybeUnreadOne()

        return enode

    def ParseProc(self, node):
        # type: (Proc) -> None

        # proc name-with-hyphens() must be accepted
        self._SetNext(lex_mode_e.ShCommand)
        self._GetToken()
        # example: 'proc f[' gets you Lit_ArrayLhsOpen
        if self.token_type != Id.Lit_Chars:
            p_die('Invalid proc name %s' % ui.PrettyToken(self.cur_token),
                  self.cur_token)

        # TODO: validate this more.  Disallow proc 123 { }, which isn't
        # disallowed for shell functions.  Similar to IsValidVarName().
        node.name = self.cur_token

        last_token = self.parse_ctx.ParseProc(self.lexer, node)

        # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
        assert last_token.id == Id.Op_LBrace
        last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        self._SetNext(lex_mode_e.ShCommand)

    def ParseFunc(self, node):
        # type: (Func) -> None
        last_token = self.parse_ctx.ParseFunc(self.lexer, node)

        # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
        assert last_token.id == Id.Op_LBrace
        last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        self._SetNext(lex_mode_e.ShCommand)

    def ParseYshCasePattern(self):
        # type: () -> Tuple[pat_t, Token]
        pat, left_tok, last_token = self.parse_ctx.ParseYshCasePattern(
            self.lexer)

        if last_token.id == Id.Op_LBrace:
            last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        return pat, left_tok

    def NewlineOkForYshCase(self):
        # type: () -> Id_t
        """Check for optional newline and consume it.

        This is a special case of `_NewlineOk` which fixes some "off-by-one"
        issues that crop up while parsing YSH case arms.  For more details,
        see #oil-dev > Progress On YSH Case Grammar on zulip.

        Returns a token id which is filled with the choice of

            word     { echo word }
            (3)      { echo expr }
            /e/      { echo eggex }
            }        # right brace
        """
        while True:
            next_id = self.lexer.LookAheadOne(lex_mode_e.Expr)

            # Cannot lookahead past lines
            if next_id == Id.Unknown_Tok:
                if not self.lexer.MoveToNextLine():  # Try to move to next line
                    break  # EOF
                continue

            next_kind = consts.GetKind(next_id)
            if next_id != Id.Op_Newline and next_kind != Kind.Ignored:
                break

            self.lexer.Read(lex_mode_e.Expr)

        if next_id in (Id.Op_RBrace, Id.Op_LParen, Id.Arith_Slash):
            self._SetNext(lex_mode_e.Expr)  # Continue in expression mode
        else:
            # Consume the trailing Op_Newline
            self._SetNext(lex_mode_e.ShCommand)
            self._GetToken()

        return next_id

    def _ReadArithExpr(self, end_id):
        # type: (Id_t) -> arith_expr_t
        """Read and parse an arithmetic expression in various contexts.

        $(( 1+2 ))
        (( a=1+2 ))
        ${a[ 1+2 ]}
        ${a : 1+2 : 1+2}

        See tests/arith-context.test.sh for ambiguous cases.

        ${a[a[0]]} is valid  # VS_RBRACKET vs Id.Arith_RBracket

        ${s : a<b?0:1 : 1}  # VS_COLON vs Id.Arith_Colon

        See the assertion in ArithParser.Parse() -- unexpected extra input.
        """
        # calls self.ReadWord(lex_mode_e.Arith)
        anode = self.a_parser.Parse()
        cur_id = self.a_parser.CurrentId()
        if end_id != Id.Undefined_Tok and cur_id != end_id:
            p_die(
                'Unexpected token after arithmetic expression (%s != %s)' %
                (ui.PrettyId(cur_id), ui.PrettyId(end_id)),
                loc.Word(self.a_parser.cur_word))
        return anode

    def _ReadArithSub(self):
        # type: () -> word_part.ArithSub
        """Read an arith substitution, which contains an arith expression,
        e.g. $((a + 1)).
        """
        left_tok = self.cur_token

        # The second one needs to be disambiguated in cases like:
        #   $(echo $(( 1+2 )) )
        self.lexer.PushHint(Id.Op_RParen, Id.Right_DollarDParen)

        # NOTE: To disambiguate $(( as arith sub vs. command sub and
        # subshell, we could save the lexer/reader state here, and retry if
        # the arithmetic parse fails.  But we can almost always catch this
        # at parse time.  There could be some exceptions like:
        #   $((echo * foo))  # looks like multiplication
        #   $((echo / foo))  # looks like division

        # $(( )) is valid
        anode = arith_expr.EmptyZero  # type: arith_expr_t

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            anode = self._ReadArithExpr(Id.Arith_RParen)

        self._SetNext(lex_mode_e.ShCommand)

        # Ensure we get closing )
        self._GetToken()
        if self.token_type != Id.Right_DollarDParen:
            p_die('Expected second ) to end arith sub', self.cur_token)

        right_tok = self.cur_token
        return word_part.ArithSub(left_tok, anode, right_tok)

    def ReadDParen(self):
        # type: () -> Tuple[arith_expr_t, Token]
        """Read ((1+ 2))  -- command context.

        We're using the word parser because it's very similar to
        _ReadArithExpr above.

        This also returns the terminating Id.Op_DRightParen token for
        location info.
        """
        # (( )) is valid
        anode = arith_expr.EmptyZero  # type: arith_expr_t

        self.lexer.PushHint(Id.Op_RParen, Id.Op_DRightParen)

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            anode = self._ReadArithExpr(Id.Arith_RParen)

        self._SetNext(lex_mode_e.ShCommand)

        # Ensure we get the second )
        self._GetToken()
        right = self.cur_token
        if right.id != Id.Op_DRightParen:
            p_die('Expected second ) to end arith statement', right)

        self._SetNext(lex_mode_e.ShCommand)

        return anode, right

    def _NextNonSpace(self):
        # type: () -> None
        """Advance in lex_mode_e.Arith until non-space token.

        Same logic as _ReadWord, but used in
           $(( ))
           (( ))
           for (( ))

        You can read self.token_type after this, without calling _GetToken.
        """
        while True:
            self._SetNext(lex_mode_e.Arith)
            self._GetToken()
            if self.token_kind not in (Kind.Ignored, Kind.WS):
                break

    def ReadForExpression(self):
        # type: () -> command.ForExpr
        """Read ((i=0; i<5; ++i)) -- part of command context."""
        self._NextNonSpace()  # skip over ((
        cur_id = self.token_type  # for end of arith expressions

        if cur_id == Id.Arith_Semi:  # for (( ; i < 10; i++ ))
            init_node = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            init_node = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()
        self._NextNonSpace()

        # It's odd to keep track of both cur_id and self.token_type in this
        # function, but it works, and is tested in 'test/parse_error.sh
        # arith-integration'
        if cur_id != Id.Arith_Semi:  # for (( x=0 b; ... ))
            p_die("Expected ; here", loc.Word(self.a_parser.cur_word))

        self._GetToken()
        cur_id = self.token_type

        if cur_id == Id.Arith_Semi:  # for (( ; ; i++ ))
            # empty condition is TRUE
            cond_node = arith_expr.EmptyOne  # type: arith_expr_t
        else:
            cond_node = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()

        if cur_id != Id.Arith_Semi:  # for (( x=0; x<5 b ))
            p_die("Expected ; here", loc.Word(self.a_parser.cur_word))

        self._NextNonSpace()
        if self.token_type == Id.Arith_RParen:  # for (( ; ; ))
            update_node = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            update_node = self._ReadArithExpr(Id.Arith_RParen)

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            p_die('Expected ) to end for loop expression', self.cur_token)
        self._SetNext(lex_mode_e.ShCommand)

        # redirects is None, will be assigned in CommandEvaluator
        node = command.ForExpr.CreateNull()
        node.init = init_node
        node.cond = cond_node
        node.update = update_node
        return node

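    # Forms accepted by ReadForExpression (sketch):
    #   for ((i=0; i<5; ++i)); do echo $i; done
    #   for ((; i<5; ++i)); ...   # empty init is EmptyZero
    #   for ((i=0; ; ++i)); ...   # empty condition is EmptyOne, i.e. TRUE
    #   for ((;;)); ...           # infinite loop
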
    def _ReadArrayLiteral(self):
        # type: () -> word_part_t
        """a=(1 2 3)

        TODO: See osh/cmd_parse.py:164 for Id.Lit_ArrayLhsOpen, for a[x++]=1

        We want:

        A=(['x']=1 ["x"]=2 [$x$y]=3)

        Maybe allow this as a literal string?  Because I think I've seen it
        before?  Or maybe force people to patch to learn the rule.

        A=([x]=4)

        Starts with Lit_Other '[', and then it has Lit_ArrayLhsClose
        Maybe enforce that ALL have keys or NONE have keys.
        """
        self._SetNext(lex_mode_e.ShCommand)  # advance past (
        self._GetToken()
        if self.cur_token.id != Id.Op_LParen:
            p_die('Expected ( after =', self.cur_token)
        left_token = self.cur_token
        right_token = None  # type: Token

        # MUST use a new word parser (with same lexer).
        w_parser = self.parse_ctx.MakeWordParser(self.lexer,
                                                 self.line_reader)
        words = []  # type: List[CompoundWord]
        done = False
        while not done:
            w = w_parser.ReadWord(lex_mode_e.ShCommand)
            with tagswitch(w) as case:
                if case(word_e.Operator):
                    tok = cast(Token, w)
                    if tok.id == Id.Right_Initializer:
                        right_token = tok
                        done = True  # can't use break here
                    # Unlike command parsing, array parsing allows embedded
                    # \n.
                    elif tok.id == Id.Op_Newline:
                        continue
                    else:
                        p_die('Unexpected token in array literal',
                              loc.Word(w))

                elif case(word_e.Compound):
                    words.append(cast(CompoundWord, w))

                else:
                    raise AssertionError()

        initializer_words = []  # type: List[InitializerWord_t]
        for w in words:
            pair = word_.DetectAssocPair(w)
            if pair is not None:
                word_.TildeDetectAssign(pair.value)  # pair.value is modified
                initializer_words.append(pair)
            else:
                w2 = braces.BraceDetect(w)  # type: word_t
                if w2 is None:
                    w2 = w
                w3 = word_.TildeDetect(w2)  # type: word_t
                if w3 is None:
                    w3 = w2
                initializer_words.append(InitializerWord.ArrayWord(w3))

        # invariant List?
        return word_part.InitializerLiteral(left_token, initializer_words,
                                            right_token)

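    # Initializer forms recognized above (sketch):
    #   a=(1 2 3)            plain words
    #   a=(x
    #      y)                embedded newlines are allowed
    #   A=([k]=1 [k2]=2)     assoc pairs, via word_.DetectAssocPair
    #   a=(~/dir {b,c})      tilde and brace detection also apply
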
1716 def ParseProcCallArgs(self, start_symbol):
1717 # type: (int) -> ArgList
1718 """ json write (x) """
1719 self.lexer.MaybeUnreadOne()
1720
1721 arg_list = ArgList.CreateNull(alloc_lists=True)
1722 arg_list.left = self.cur_token
1723 self.parse_ctx.ParseProcCallArgs(self.lexer, arg_list, start_symbol)
1724 return arg_list
1725
1726 def _MaybeReadWordPart(self, is_first, lex_mode, parts):
1727 # type: (bool, lex_mode_t, List[word_part_t]) -> bool
1728 """Helper for _ReadCompoundWord3."""
1729 done = False
1730
1731 if self.token_type == Id.Lit_EscapedChar:
1732 tok = self.cur_token
1733 assert tok.length == 2
1734 ch = lexer.TokenSliceLeft(tok, 1)
1735 if not self.parse_opts.parse_backslash():
1736 if not pyutil.IsValidCharEscape(ch):
1737 p_die('Invalid char escape in unquoted word (OILS-ERR-13)',
1738 self.cur_token)
1739
1740 part = word_part.EscapedLiteral(self.cur_token,
1741 ch) # type: word_part_t
1742 else:
1743 part = self.cur_token
1744
1745 if is_first and self.token_type == Id.Lit_VarLike: # foo=
1746 parts.append(part)
1747 # Unfortunately it's awkward to pull the check for a=(1 2) up to
1748 # _ReadWord.
1749 next_id = self.lexer.LookPastSpace(lex_mode)
1750 if next_id == Id.Op_LParen:
1751 self.lexer.PushHint(Id.Op_RParen, Id.Right_Initializer)
1752 part2 = self._ReadArrayLiteral()
1753 parts.append(part2)
1754
1755 # Array literal must be the last part of the word.
1756 self._SetNext(lex_mode)
1757 self._GetToken()
1758 # EOF, whitespace, newline, Right_Subshell
1759 if self.token_kind not in KINDS_THAT_END_WORDS:
1760 p_die('Unexpected token after array literal',
1761 self.cur_token)
1762 done = True
1763
1764 elif (is_first and self.parse_opts.parse_at() and
1765 self.token_type == Id.Lit_Splice):
1766
1767 splice_tok = self.cur_token
1768 part2 = word_part.Splice(splice_tok,
1769 lexer.TokenSliceLeft(splice_tok, 1))
1770
1771 parts.append(part2)
1772
1773 # @words must be the last part of the word
1774 self._SetNext(lex_mode)
1775 self._GetToken()
1776 # EOF, whitespace, newline, Right_Subshell
1777 if self.token_kind not in KINDS_THAT_END_WORDS:
1778 p_die('Unexpected token after array splice', self.cur_token)
1779 done = True
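# (illustrative) echo @myarray splices the array; echo @myarray'x'
# hits the p_die above, since @words must end the word.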
1780
1781 elif (is_first and self.parse_opts.parse_at() and
1782 self.token_type == Id.Lit_AtLBracket): # @[split(x)]
1783 part2 = self._ReadExprSub(lex_mode_e.DQ)
1784 parts.append(part2)
1785
1786 # @[split(x)]
1787 self._SetNext(lex_mode)
1788 self._GetToken()
1789 # EOF, whitespace, newline, Right_Subshell
1790 if self.token_kind not in KINDS_THAT_END_WORDS:
1791 p_die('Unexpected token after Expr splice', self.cur_token)
1792 done = True
1793
1794 elif (is_first and self.parse_opts.parse_at() and
1795 self.token_type == Id.Lit_AtLBraceDot):
1796 p_die('TODO: @{.myproc builtin sub}', self.cur_token)
1797
1798 elif (is_first and self.parse_opts.parse_at_all() and
1799 self.token_type == Id.Lit_At):
1800 # Because $[x] ${x} and perhaps $/x/ are reserved, it makes sense for @
1801 # at the beginning of a word to be reserved.
1802
1803 # Although should we relax 'echo @'?  I'm tempted to have a shortcut
1804 # for @_argv.
1805 p_die('Literal @ starting a word must be quoted (parse_at_all)',
1806 self.cur_token)
1807
1808 else:
1809 # not a literal with lookahead; append it
1810 parts.append(part)
1811
1812 return done
1813
1814 def _ReadCompoundWord(self, lex_mode):
1815 # type: (lex_mode_t) -> CompoundWord
1816 return self._ReadCompoundWord3(lex_mode, Id.Undefined_Tok, True)
1817
1818 def _ReadCompoundWord3(self, lex_mode, eof_type, empty_ok):
1819 # type: (lex_mode_t, Id_t, bool) -> CompoundWord
1820 """
1821 Precondition: Looking at the first token of the first word part
1822 Postcondition: Looking at the token after, e.g. space or operator
1823
1824 NOTE: eof_type is necessary because / is a literal, i.e. Lit_Slash, but it
1825 could be an operator delimiting a compound word. Can we change lexer modes
1826 and remove this special case?
1827 """
1828 w = CompoundWord([])
1829 num_parts = 0
1830 brace_count = 0
1831 done = False
1832 is_triple_quoted = None # type: Optional[BoolParamBox]
1833
1834 while not done:
1835 self._GetToken()
1836
1837 allow_done = empty_ok or num_parts != 0
1838 if allow_done and self.token_type == eof_type:
1839 done = True # e.g. for ${foo//pat/replace}
1840
1841 # Keywords like "for" are treated like literals
1842 elif self.token_kind in (Kind.Lit, Kind.History, Kind.KW,
1843 Kind.ControlFlow, Kind.BoolUnary,
1844 Kind.BoolBinary):
1845
1846 # Count { and } for the unbalanced-brace syntax error below
1847 if self.token_type == Id.Lit_LBrace:
1848 brace_count += 1
1849 elif self.token_type == Id.Lit_RBrace:
1850 brace_count -= 1
1851 elif self.token_type == Id.Lit_Dollar:
1852 if not self.parse_opts.parse_dollar():
1853 if num_parts == 0 and lex_mode == lex_mode_e.ShCommand:
1854 next_byte = self.lexer.ByteLookAhead()
1855 # TODO: switch lexer modes and parse $/d+/. But not ${a:-$/d+/}
1856 if next_byte == '/':
1857 #log('next_byte %r', next_byte)
1858 pass
1859
1860 p_die('Literal $ should be quoted like \$',
1861 self.cur_token)
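# (illustrative) when the check above fires, echo $ is rejected;
# echo \$ and echo '$' are the quoted alternatives.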
1862
1863 done = self._MaybeReadWordPart(num_parts == 0, lex_mode,
1864 w.parts)
1865
1866 elif self.token_kind == Kind.VSub:
1867 vsub_token = self.cur_token
1868
1869 part = SimpleVarSub(vsub_token) # type: word_part_t
1870 w.parts.append(part)
1871
1872 elif self.token_kind == Kind.ExtGlob:
1873 # If parse_at, we can take over @( to start @(seq 3)
1874 # Users can also use ,(*.py|*.sh)
1875 if (self.parse_opts.parse_at() and
1876 self.token_type == Id.ExtGlob_At and num_parts == 0):
1877 cs_part = self._ReadCommandSub(Id.Left_AtParen,
1878 d_quoted=False)
1879 # RARE mutation of tok.id!
1880 cs_part.left_token.id = Id.Left_AtParen
1881 part = cs_part # for type safety
1882
1883 # Same check as _MaybeReadWordPart. @(seq 3)x is illegal, just like
1884 # a=(one two)x and @arrayfunc(3)x.
1885 self._GetToken()
1886 if self.token_kind not in KINDS_THAT_END_WORDS:
1887 p_die('Unexpected token after @()', self.cur_token)
1888 done = True
1889
1890 else:
1891 if HAVE_FNM_EXTMATCH == 0:
1892 p_die(
1893 "Extended glob won't work without FNM_EXTMATCH support in libc",
1894 self.cur_token)
1895 part = self._ReadExtGlob()
1896 w.parts.append(part)
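# (illustrative) extended globs reaching _ReadExtGlob() include
# !(x) ?(x) *(x) +(x), as well as ,(*.py|*.sh) -- the spelling that
# avoids the @( conflict noted above.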
1897
1898 elif self.token_kind == Kind.BashRegex:
1899 if self.token_type == Id.BashRegex_LParen: # Opening (
1900 part = self._ReadBashRegexGroup()
1901 w.parts.append(part)
1902 else:
1903 assert self.token_type == Id.BashRegex_AllowedInParens
1904 p_die('Invalid token in bash regex', self.cur_token)
1905
1906 elif self.token_kind == Kind.Left:
1907 try_triple_quote = (self.parse_opts.parse_triple_quote() and
1908 lex_mode == lex_mode_e.ShCommand and
1909 num_parts == 0)
1910
1911 # Save allocation
1912 if try_triple_quote:
1913 is_triple_quoted = BoolParamBox(False)
1914
1915 part = self._ReadUnquotedLeftParts(is_triple_quoted)
1916 w.parts.append(part)
1917
1918 # NOT done yet, will advance below
1919 elif self.token_kind == Kind.Right:
1920 # Still part of the word; will be done on the next iter.
1921 if self.token_type == Id.Right_DoubleQuote:
1922 pass
1923 # Never happens, no PushHint for this case.
1924 #elif self.token_type == Id.Right_DollarParen:
1925 # pass
1926 elif self.token_type == Id.Right_Subshell:
1927 # LEXER HACK for (case x in x) ;; esac )
1928 # Rewind before it's used
1929 assert self.next_lex_mode == lex_mode_e.Undefined
1930 if self.lexer.MaybeUnreadOne():
1931 self.lexer.PushHint(Id.Op_RParen, Id.Right_Subshell)
1932 self._SetNext(lex_mode)
1933 done = True
1934 else:
1935 done = True
1936
1937 elif self.token_kind == Kind.Ignored:
1938 done = True
1939
1940 else:
1941 # LEXER HACK for unbalanced case clause. 'case foo in esac' is valid,
1942 # so when testing for ESAC, we may read ) before getting a chance to
1943 # PushHint(Id.Op_RParen, Id.Right_CasePat). So here we unread one
1944 # token and do it again.
1945
1946 # We get Id.Op_RParen at top level: case x in x) ;; esac
1947 # We get Id.Eof_RParen inside ComSub: $(case x in x) ;; esac )
1948 if self.token_type in (Id.Op_RParen, Id.Eof_RParen):
1949 # Rewind before it's used
1950 assert self.next_lex_mode == lex_mode_e.Undefined
1951 if self.lexer.MaybeUnreadOne():
1952 if self.token_type == Id.Eof_RParen:
1953 # Redo translation
1954 self.lexer.PushHint(Id.Op_RParen, Id.Eof_RParen)
1955 self._SetNext(lex_mode)
1956
1957 done = True # anything we don't recognize means we're done
1958
1959 if not done:
1960 self._SetNext(lex_mode)
1961 num_parts += 1
1962
1963 if (self.parse_opts.parse_brace() and num_parts > 1 and
1964 brace_count != 0):
1965 # accept { and }, but not foo{
1966 p_die(
1967 'Word has unbalanced { }. Maybe add a space or quote it like \{',
1968 loc.Word(w))
1969
1970 if is_triple_quoted and is_triple_quoted.b and num_parts > 1:
1971 p_die('Unexpected parts after triple quoted string',
1972 loc.WordPart(w.parts[-1]))
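# (illustrative) the two word-level checks above reject, e.g.:
#   echo foo{        -> unbalanced { } (parse_brace)
#   echo '''x'''y    -> parts after a triple quoted string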
1973
1974 if 0:
1975 from _devbuild.gen.syntax_asdl import word_part_str
1976 word_key = ' '.join(word_part_str(p.tag()) for p in w.parts)
1977 WORD_HIST[word_key] += 1
1978
1979 # YSH word restriction
1980 # (r'' u'' b'' are stripped on shopt -s parse_ysh_string)
1981 if self.parse_opts.parse_ysh_string() and not _CheckYshWord(w):
1982 p_die("Invalid quoted word part in YSH (OILS-ERR-17)",
1983 loc.WordPart(part))
1984
1985 return w
1986
1987 def _ReadArithWord(self):
1988 # type: () -> Optional[word_t]
1989 """ Helper for ReadArithWord() """
1990 self._GetToken()
1991
1992 if self.token_kind == Kind.Unknown:
1993 # e.g. happened during dynamic parsing of unset 'a[$foo]' in gherkin
1994 p_die(
1995 'Unexpected token while parsing arithmetic: %r' %
1996 lexer.TokenVal(self.cur_token), self.cur_token)
1997
1998 elif self.token_kind == Kind.Eof:
1999 return self.cur_token
2000
2001 elif self.token_kind == Kind.Ignored:
2002 # Space should be ignored.
2003 self._SetNext(lex_mode_e.Arith)
2004 return None
2005
2006 elif self.token_kind in (Kind.Arith, Kind.Right):
2007 # Id.Right_DollarDParen IS just a normal token, handled by ArithParser
2008 self._SetNext(lex_mode_e.Arith)
2009 return self.cur_token
2010
2011 elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub):
2012 return self._ReadCompoundWord(lex_mode_e.Arith)
2013
2014 else:
2015 raise AssertionError(self.cur_token)
2016
2017 def _ReadWord(self, word_mode):
2018 # type: (lex_mode_t) -> Optional[word_t]
2019 """Helper function for ReadWord()."""
2020
2021 # Change the pseudo lexer mode to a real lexer mode
2022 if word_mode == lex_mode_e.ShCommandFakeBrack:
2023 lex_mode = lex_mode_e.ShCommand
2024 else:
2025 lex_mode = word_mode
2026
2027 self._GetToken()
2028
2029 if self.token_kind == Kind.Eof:
2030 # No advance
2031 return self.cur_token
2032
2033 # Allow Arith for ) at end of for loop?
2034 elif self.token_kind in (Kind.Op, Kind.Redir, Kind.Arith):
2035 self._SetNext(lex_mode)
2036
2037 # Newlines are complicated. See 3x2 matrix in the comment about
2038 # self.multiline and self.newline_state above.
2039 if self.token_type == Id.Op_Newline:
2040 if self.multiline:
2041 if self.newline_state > 1:
2042 # This points at a blank line, but at least it gives the line number
2043 p_die('Invalid blank line in multiline mode',
2044 self.cur_token)
2045 return None
2046
2047 if self.returned_newline: # skip
2048 return None
2049
2050 return self.cur_token
2051
2052 elif self.token_kind == Kind.Right:
2053 if self.token_type not in (Id.Right_Subshell, Id.Right_ShFunction,
2054 Id.Right_CasePat, Id.Right_Initializer):
2055 raise AssertionError(self.cur_token)
2056
2057 self._SetNext(lex_mode)
2058 return self.cur_token
2059
2060 elif self.token_kind in (Kind.Ignored, Kind.WS):
2061 self._SetNext(lex_mode)
2062 return None
2063
2064 else:
2065 assert self.token_kind in (Kind.VSub, Kind.Lit, Kind.History,
2066 Kind.Left, Kind.KW, Kind.ControlFlow,
2067 Kind.BoolUnary, Kind.BoolBinary,
2068 Kind.ExtGlob,
2069 Kind.BashRegex), 'Unhandled token kind'
2070
2071 if (word_mode == lex_mode_e.ShCommandFakeBrack and
2072 self.parse_opts.parse_bracket() and
2073 self.token_type == Id.Lit_LBracket):
2074 # Change [ from Kind.Lit -> Kind.Op
2075 # So CommandParser can treat
2076 # assert [42 === x]
2077 # like
2078 # json write (x)
2079 bracket_word = self.cur_token
2080 bracket_word.id = Id.Op_LBracket
2081
2082 self._SetNext(lex_mode)
2083 return bracket_word
2084
2085 # We're beginning a word. If we see Id.Lit_Pound, change to
2086 # lex_mode_e.Comment and read until end of line.
2087 if self.token_type == Id.Lit_Pound:
2088 self._SetNext(lex_mode_e.Comment)
2089 self._GetToken()
2090
2091 # NOTE: The # could be the last character in the file. It can't be
2092 # Eof_{RParen,Backtick} because #) and #` are comments.
2093 assert self.token_type in (Id.Ignored_Comment, Id.Eof_Real), \
2094 self.cur_token
2095
2096 # The next iteration will go into Kind.Ignored and set lex state to
2097 # lex_mode_e.ShCommand/etc.
2098 return None # tell ReadWord() to try again after comment
2099
2100 elif self.token_type == Id.Lit_TPound: ### doc comment
2101 self._SetNext(lex_mode_e.Comment)
2102 self._GetToken()
2103
2104 if self.token_type == Id.Ignored_Comment and self.emit_doc_token:
2105 return self.cur_token
2106
2107 return None # tell ReadWord() to try again after comment
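# (illustrative) so '# comment' is always skipped and the word re-read,
# while '### doc comment' surfaces as a token only when emit_doc_token
# is on.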
2108
2109 else:
2110 # r'' u'' b'' at the beginning of a word
2111 if (self.token_type == Id.Lit_Chars and
2112 self.lexer.LookAheadOne(
2113 lex_mode_e.ShCommand) == Id.Left_SingleQuote):
2114
2115 # When shopt -s parse_ysh_string:
2116 # echo r'hi' is like echo 'hi'
2117 #
2118 # echo u'\u{3bc}' b'\yff' works
2119
2120 tok = self.cur_token
2121 if self.parse_opts.parse_ysh_string():
2122 if lexer.TokenEquals(tok, 'r'):
2123 left_id = Id.Left_RSingleQuote
2124 elif lexer.TokenEquals(tok, 'u'):
2125 left_id = Id.Left_USingleQuote
2126 elif lexer.TokenEquals(tok, 'b'):
2127 left_id = Id.Left_BSingleQuote
2128 else:
2129 left_id = Id.Undefined_Tok
2130
2131 if left_id != Id.Undefined_Tok:
2132 # skip the r, and then 'foo' will be read as normal
2133 self._SetNext(lex_mode_e.ShCommand)
2134
2135 self._GetToken()
2136 assert self.token_type == Id.Left_SingleQuote, self.token_type
2137
2138 # Read the word in a different lexer mode
2139 return self._ReadYshSingleQuoted(left_id)
2140
2141 return self._ReadCompoundWord(lex_mode)
2142
2143 def ParseVarRef(self):
2144 # type: () -> BracedVarSub
2145 """DYNAMIC parsing of what's inside ${!ref}
2146
2147 # Same as VarOf production
2148 VarRefExpr = VarOf EOF
2149 """
2150 self._SetNext(lex_mode_e.VSub_1)
2151
2152 self._GetToken()
2153 if self.token_kind != Kind.VSub:
2154 p_die('Expected var name', self.cur_token)
2155
2156 part = self._ParseVarOf()
2157 # NOTE: no ${ } means no part.left and part.right
2158 part.left = part.name_tok # cheat to make test pass
2159 part.right = part.name_tok
2160
2161 self._GetToken()
2162 if self.token_type != Id.Eof_Real:
2163 p_die('Expected end of var ref expression', self.cur_token)
2164 return part
2165
2166 def LookPastSpace(self):
2167 # type: () -> Id_t
2168 """Look ahead to the next token.
2169
2170 For the CommandParser to recognize
2171 array= (1 2 3)
2172 YSH for ( versus bash for ((
2173 YSH if ( versus if test
2174 YSH while ( versus while test
2175 YSH bare assignment 'grep =' versus 'grep foo'
2176 """
2177 assert self.token_type != Id.Undefined_Tok
2178 if self.cur_token.id == Id.WS_Space:
2179 id_ = self.lexer.LookPastSpace(lex_mode_e.ShCommand)
2180 else:
2181 id_ = self.cur_token.id
2182 return id_
2183
2184 def LookAheadFuncParens(self):
2185 # type: () -> bool
2186 """Special lookahead for f( ) { echo hi; } to check for ( )"""
2187 assert self.token_type != Id.Undefined_Tok
2188
2189 # We have to handle 2 cases because we buffer a token
2190 if self.cur_token.id == Id.Op_LParen: # saw funcname(
2191 return self.lexer.LookAheadFuncParens(1) # go back one char
2192
2193 elif self.cur_token.id == Id.WS_Space: # saw funcname WHITESPACE
2194 return self.lexer.LookAheadFuncParens(0)
2195
2196 else:
2197 return False
2198
2199 def ReadWord(self, word_mode):
2200 # type: (lex_mode_t) -> word_t
2201 """Read the next word, using the given lexer mode.
2202
2203 This is a stateful, retrying wrapper around the single-step _ReadWord().
2204 """
2205 assert word_mode in (lex_mode_e.ShCommand,
2206 lex_mode_e.ShCommandFakeBrack,
2207 lex_mode_e.DBracket, lex_mode_e.BashRegex)
2208
2209 if self.buffered_word: # For integration with pgen2
2210 w = self.buffered_word
2211 self.buffered_word = None
2212 else:
2213 while True:
2214 w = self._ReadWord(word_mode)
2215 if w is not None:
2216 break
2217
2218 self.returned_newline = (word_.CommandId(w) == Id.Op_Newline)
2219 return w
2220
2221 def ReadArithWord(self):
2222 # type: () -> word_t
2223 while True:
2224 w = self._ReadArithWord()
2225 if w is not None:
2226 break
2227 return w
2228
2229 def ReadHereDocBody(self, parts):
2230 # type: (List[word_part_t]) -> None
2231 """
2232 A here doc is like a double quoted context, except " isn't special.
2233 """
2234 self._ReadLikeDQ(None, False, parts)
2235 # Returns nothing
2236
2237 def ReadForPlugin(self):
2238 # type: () -> CompoundWord
2239 """For $PS1, $PS4, etc.
2240
2241 This is just like reading a here doc line. "\n" is allowed, as
2242 well as the typical substitutions ${x} $(echo hi) $((1 + 2)).
2243 """
2244 w = CompoundWord([])
2245 self._ReadLikeDQ(None, False, w.parts)
2246 return w
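# (illustrative) rendering PS1='$(date) > ' goes through ReadForPlugin();
# substitutions are parsed here, while backslash escapes like \h are
# assumed to be handled later by the prompt renderer.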
2247
2248 def EmitDocToken(self, b):
2249 # type: (bool) -> None
2250 self.emit_doc_token = b
2251
2252 def Multiline(self, b):
2253 # type: (bool) -> None
2254 self.multiline = b
2255
2256
2257if 0:
2258 import collections
2259 WORD_HIST = collections.Counter()
2260
2261# vim: sw=4