osh/word_parse.py

OILS / osh / word_parse.py View on Github | oils.pub

2381 lines, 1272 significant

1	# Copyright 2016 Andy Chu. All rights reserved.
2	# Licensed under the Apache License, Version 2.0 (the "License");
3	# you may not use this file except in compliance with the License.
4	# You may obtain a copy of the License at
5	#
6	# http://www.apache.org/licenses/LICENSE-2.0
7	"""
8	word_parse.py - Parse the shell word language.
9
10	Hairy example:
11
12	hi$((1 + 2))"$(echo hi)"${var:-__"$(echo default)"__}
13
14	Substitutions can be nested, but which inner subs are allowed depends on the
15	outer sub. Notes:
16
17	lex_mode_e.ShCommand (_ReadUnquotedLeftParts)
18	All subs and quotes are allowed:
19	$v ${v} $() `` $(()) '' "" $'' $"" <() >()
20
21	lex_mode_e.DQ (_ReadDoubleQuotedLeftParts)
22	Var, Command, Arith, but no quotes.
23	$v ${v} $() `` $(())
24	No process substitution.
25
26	lex_mode_e.Arith
27	Similar to DQ: Var, Command, and Arith sub, but no process sub. bash doesn't
28	allow quotes, but OSH does. We allow ALL FOUR kinds of quotes, because we
29	need those for associative array indexing.
30
31	lex_mode_e.VSub_ArgUnquoted
32	Like ShCommand, everything is allowed (even process substitutions), but we
33	stop at }, and space is SIGNIFICANT.
34
35	Example: ${a:- b }
36
37	${X:-$v} ${X:-${v}} ${X:-$(echo hi)} ${X:-`echo hi`} ${X:-$((1+2))}
38	${X:-'single'} ${X:-"double"} ${X:-$'\n'} ${X:-<(echo hi)}
39
40	lex_mode_e.VSub_ArgDQ
41	In contrast to DQ, VSub_ArgDQ accepts nested "" and $'' and $"", e.g.
42	"${x:-"default"}".
43
44	In contrast, VSub_ArgUnquoted respects single quotes and process
45	substitution.
46
47	It's weird that double quotes are allowed. Space is also significant here,
48	e.g. "${x:-a "b"}".
49	"""
50
51	from _devbuild.gen import grammar_nt
52	from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str, Kind
53	from _devbuild.gen.types_asdl import (lex_mode_t, lex_mode_e)
54	from _devbuild.gen.syntax_asdl import (
55	ExprSub,
56	BoolParamBox,
57	Token,
58	SimpleVarSub,
59	loc,
60	source,
61	word,
62	DoubleQuoted,
63	SingleQuoted,
64	BracedVarSub,
65	CommandSub,
66	InitializerWord,
67	InitializerWord_t,
68	bracket_op,
69	bracket_op_t,
70	suffix_op,
71	suffix_op_t,
72	rhs_word,
73	rhs_word_e,
74	rhs_word_t,
75	word_e,
76	word_t,
77	CompoundWord,
78	word_part,
79	word_part_t,
80	y_lhs_e,
81	arith_expr_t,
82	command,
83	expr,
84	expr_e,
85	expr_t,
86	pat_t,
87	ArgList,
88	Proc,
89	Func,
90	Subscript,
91	Attribute,
92	arith_expr,
93	VarDecl,
94	Mutation,
95	word_part_e,
96	)
97	from core import alloc
98	from core.error import p_die
99	from mycpp.mylib import log
100	from core import pyutil
101	from display import ui
102	from frontend import consts
103	from frontend import lexer
104	from frontend import reader
105	from osh import tdop
106	from osh import arith_parse
107	from osh import braces
108	from osh import word_
109	from osh import word_compile
110	from mycpp.mylib import tagswitch
111
112	from libc import HAVE_FNM_EXTMATCH
113
114	from typing import List, Optional, Tuple, cast
115	from typing import TYPE_CHECKING
116	if TYPE_CHECKING:
117	from frontend.lexer import Lexer
118	from frontend.parse_lib import ParseContext
119	from frontend.reader import _Reader
120	from osh.cmd_parse import VarChecker
121
# Bind the imported names `log` and `Id_str` to throwaway globals so the
# imports are referenced somewhere in this module (presumably to silence
# unused-import warnings -- TODO confirm).
unused1 = log
unused2 = Id_str

# Token kinds that may legally follow a complete word.  Used, e.g., by
# WordParser._ReadYshSingleQuoted to reject a token glued onto the end of a
# YSH single-quoted string.
KINDS_THAT_END_WORDS = [Kind.Eof, Kind.WS, Kind.Op, Kind.Right]
126
127
128	def _IsValidYshWord(w):
129	# type: (CompoundWord) -> bool
130	"""YSH word restriction
131
132	Allowed:
133	'foo' r'foo' --flag r'foo'
134	--flag='foo'
135	--flag="foo"
136	Not allowed:
137	--flag=r'bar' NAME=u'value' # ambiguous
138	--flag=b''' multi '''
139	"""
140	parts = w.parts
141	n = len(parts)
142
143	if n != 0 and word_.LiteralId(parts[0]) == Id.Lit_Tilde:
144	# ~bob/src/'dir with spaces' is allowed
145	# ~bob/src/u'dir with spaces' is ambiguous, but allowed for simplicity
146	return True # early return
147
148	ok = True
149	if n >= 2:
150	# spec/ysh-TODO-deprecate - allow ''/usr/* workaround!
151	# note: ""/usr/* not allowed
152	part0 = parts[0]
153	if part0.tag() == word_part_e.SingleQuoted:
154	sq = cast(SingleQuoted, part0)
155	# Make sure $''' is still disallowed
156	if (sq.left.id == Id.Left_SingleQuote and len(sq.sval) == 0):
157	return True
158
159	for part in parts:
160	if part.tag() in (word_part_e.SingleQuoted,
161	word_part_e.DoubleQuoted):
162	ok = False
163
164	# Allow special cases:
165	# --flag='val' NAME='bar'
166	# But NOT
167	# --flag=r'val' NAME=r'val'
168	if not ok:
169	if (n == 2 and word_.LiteralId(parts[0]) == Id.Lit_VarLike):
170	ok = True
171	elif (n == 3 and word_.LiteralId(parts[0]) == Id.Lit_Chars and
172	word_.LiteralId(parts[1]) == Id.Lit_Equals):
173	ok = True
174
175	return ok
176
177
class WordEmitter(object):
    """Abstract source of words; the common interface for [ and [[."""

    def __init__(self):
        # type: () -> None
        """Do nothing; an explicit empty constructor is needed for mycpp."""
        pass

    def ReadWord(self, lex_mode):
        # type: (lex_mode_t) -> word_t
        """Return the next word read in lex_mode.

        Subclasses must override this; the base version always raises
        NotImplementedError.
        """
        raise NotImplementedError()
189
190
191	class WordParser(WordEmitter):
192
def __init__(self, parse_ctx, lexer, line_reader):
    # type: (ParseContext, Lexer, _Reader) -> None
    """Wire the parser to its context, lexer and line reader.

    Also builds the arithmetic sub-parser, which reads its words from this
    object, and then puts all mutable state in its initial configuration
    via Reset().
    """
    self.parse_ctx = parse_ctx
    self.parse_opts = parse_ctx.parse_opts

    self.lexer = lexer
    self.line_reader = line_reader
    self.arena = line_reader.arena

    arith_spec = arith_parse.Spec()
    self.a_parser = tdop.TdopParser(arith_spec, self, self.parse_opts)

    self.Reset()
204
def Init(self, lex_mode):
    # type: (lex_mode_t) -> None
    """Choose the lexer mode for the next token read.

    Used to parse arithmetic, see ParseContext.
    """
    self.next_lex_mode = lex_mode
209
def Reset(self):
    # type: () -> None
    """Put all mutable parser state back to its initial values.

    Called by the interactive loop.
    """
    # The next token is read in command mode.
    self.next_lex_mode = lex_mode_e.ShCommand

    # Current-token state, maintained by _GetToken()
    self.cur_token = None  # type: Token
    self.token_type = Id.Undefined_Tok
    self.token_kind = Kind.Undefined

    # Boolean mutated by CommandParser via word_.ctx_Multiline.  '...' starts
    # multiline mode.
    self.multiline = False
    # Boolean mutated by CommandParser via word_.ctx_EmitDocToken.  For ###
    # doc comments
    self.emit_doc_token = False

    # For consolidating \n\n -> \n for the CALLER.  This simplifies the
    # parsers that consume words.
    self.returned_newline = False
    # For detecting invalid \n\n in multiline mode.  Counts what we got
    # directly from the lexer.
    self.newline_state = 0

    # For integration with pgen2
    self.buffered_word = None  # type: word_t
236
def _GetToken(self):
    # type: () -> None
    """Read the pending token, if any, and refresh the token state.

    Call this before making a decision based on any of:

        self.token_type
        self.token_kind
        self.cur_token

    If _SetNext() has not been called since the last read, this is a no-op.
    """
    pending_mode = self.next_lex_mode
    if pending_mode == lex_mode_e.Undefined:
        return  # _SetNext() not called, so do nothing

    if pending_mode == lex_mode_e.BashRegexFakeInner:
        # The fake mode is lexed as BashRegex, and then the TOKEN IS MUTATED.
        # This is for crazy stuff bash allows, like [[ s =~ (< >) ]]
        tok = self.lexer.Read(lex_mode_e.BashRegex)
        if tok.id in (Id.WS_Space, Id.BashRegex_AllowedInParens):
            tok.id = Id.Lit_Chars
    else:
        tok = self.lexer.Read(pending_mode)

    self.cur_token = tok
    self.token_type = tok.id
    self.token_kind = consts.GetKind(self.token_type)

    # Track the number of consecutive newlines, ignoring whitespace
    if self.token_type == Id.Op_Newline:
        self.newline_state += 1
    elif self.token_kind != Kind.WS:
        self.newline_state = 0

    self.parse_ctx.trail.AppendToken(tok)  # For completion

    # Consumed: nothing more is read until the next _SetNext()
    self.next_lex_mode = lex_mode_e.Undefined
270
271	def _SetNext(self, lex_mode):
272	# type: (lex_mode_t) -> None
273	"""Set the next lex state, but don't actually read a token.
274
275	We need this for proper interactive parsing.
276	"""
277	self.next_lex_mode = lex_mode
278
def _ReadVarOpArg(self, arg_lex_mode):
    # type: (lex_mode_t) -> rhs_word_t
    """Read the argument word of a ${} operator, up to the closing }.

    Returns rhs_word.Empty for an empty argument in VSub_ArgDQ mode, and the
    compound word otherwise.
    """
    # NOTE: Operators like | and < are not treated as special, so ${a:- | >}
    # is valid, even when unquoted.
    self._SetNext(arg_lex_mode)
    self._GetToken()

    arg = self._ReadVarOpArg2(arg_lex_mode, Id.Undefined_Tok,
                              True)  # empty_ok

    # A word with no parts inside a double-quoted VarSub arg becomes Empty,
    # so that it can evaluate to the empty string and not get elided.
    #
    # Examples:
    # - "${s:-}", "${s/%pat/}"
    #
    # It's similar to LooksLikeShAssignment where we turn x= into x=''.  And
    # it has the same potential problem of not having Token location info.
    #
    # NOTE: empty_ok is False only for the PatSub pattern, which means we'll
    # return a Compound with no parts, which is explicitly checked with a
    # custom error message.
    if arg_lex_mode == lex_mode_e.VSub_ArgDQ and len(arg.parts) == 0:
        return rhs_word.Empty

    return arg
306
def _ReadVarOpArg2(self, arg_lex_mode, eof_type, empty_ok):
    # type: (lex_mode_t, Id_t, bool) -> CompoundWord
    """Read a compound word and apply tilde detection to it.

    Helper function for _ReadVarOpArg and _ReadPatSubVarOp.  Returns the
    result of word_.TildeDetect() when that is truthy, else the word as read.
    """
    compound = self._ReadCompoundWord3(arg_lex_mode, eof_type, empty_ok)
    with_tilde = word_.TildeDetect(compound)
    if with_tilde:
        return with_tilde
    return compound
315
def _ReadSliceVarOp(self):
    # type: () -> suffix_op.Slice
    """Parse the slice operator of ${a:begin:length}.

    Called when looking at the token after the first ':'

        ArithExpr? (':' ArithExpr? )? '}'

    Returns a suffix_op.Slice; its length is None when there is no second ':'.
    """
    self._NextNonSpace()
    first_id = self.token_type

    if first_id == Id.Arith_RBrace or first_id == Id.Arith_Colon:
        # ${a:} or ${a::} -- the begin expression is omitted
        begin = arith_expr.EmptyZero  # type: arith_expr_t
        sep_id = first_id
    else:
        begin = self.a_parser.Parse()
        sep_id = self.a_parser.CurrentId()  # advance

    if sep_id == Id.Arith_RBrace:  # ${a:1} or ${@:1}
        # No length specified, so it's N
        no_length = None  # type: Optional[arith_expr_t]
        return suffix_op.Slice(begin, no_length)

    if sep_id != Id.Arith_Colon:
        p_die("Expected : or } in slice", self.cur_token)
        raise AssertionError()  # for MyPy

    # ${a:1:} or ${@:1:}
    colon_tok = self.cur_token
    self._NextNonSpace()

    if self.token_type != Id.Arith_RBrace:
        length = self._ReadArithExpr(Id.Arith_RBrace)  # type: arith_expr_t
        return suffix_op.Slice(begin, length)

    # quirky bash behavior:
    # ${a:1:} or ${a::} means length ZERO
    # but ${a:1} or ${a:} means length N
    if self.parse_opts.strict_parse_slice():
        p_die(
            "Slice length: Add explicit zero, or omit : for N (strict_parse_slice)",
            colon_tok)

    return suffix_op.Slice(begin, arith_expr.EmptyZero)
361
def _ReadPatSubVarOp(self):
    # type: () -> suffix_op.PatSub
    """Parse a pattern substitution; called looking at the first '/' after
    VarOf:

        VarSub = ...
               | VarOf '/' Match ( '/' WORD? )?
        Match  = '/' WORD   # can't be empty
               | '#' WORD?  # may be empty
               | '%' WORD?

    On return the current token is the closing }.
    """
    slash_tok = self.cur_token  # location info

    self._SetNext(lex_mode_e.VSub_ArgUnquoted)  # advance past /
    self._GetToken()

    if self.token_type == Id.Right_DollarBrace:
        # ${v/} -- empty pattern, empty replacement, no mode
        return suffix_op.PatSub(CompoundWord([]), rhs_word.Empty,
                                Id.Undefined_Tok, slash_tok)

    # bizarre syntax: an optional / # % right after the first slash
    replace_mode = Id.Undefined_Tok
    if self.token_type in (Id.Lit_Slash, Id.Lit_Pound, Id.Lit_Percent):
        replace_mode = self.token_type
        self._SetNext(lex_mode_e.VSub_ArgUnquoted)

    # Bash quirk:
    #   echo ${x/#/replace} has an empty pattern
    #   echo ${x////replace} is non-empty; it means echo ${x//'/'/replace}
    empty_ok = replace_mode != Id.Lit_Slash
    pat = self._ReadVarOpArg2(lex_mode_e.VSub_ArgUnquoted, Id.Lit_Slash,
                              empty_ok)

    # e.g. ${v/a} is the same as ${v/a/} -- empty replacement string
    replace = rhs_word.Empty  # type: rhs_word_t
    if self.token_type == Id.Lit_Slash:
        # read until }
        replace = self._ReadVarOpArg(lex_mode_e.VSub_ArgUnquoted)

    self._GetToken()
    if self.token_type != Id.Right_DollarBrace:
        # This happens on invalid code
        p_die(
            "Expected } after replacement string, got %s" %
            ui.PrettyId(self.token_type), self.cur_token)

    return suffix_op.PatSub(pat, replace, replace_mode, slash_tok)
412
def _ReadSubscript(self):
    # type: () -> bracket_op_t
    """Parse an array subscript.

        Subscript = '[' ('@' | '*' | ArithExpr) ']'

    Returns bracket_op.WholeArray for [@] and [*], bracket_op.ArrayIndex
    otherwise.  On return, the token after ] has been read in VSub_2 mode.
    """
    # Lookahead to see if we get @ or *.  Otherwise read a full arithmetic
    # expression.
    lookahead_id = self.lexer.LookPastSpace(lex_mode_e.Arith)

    self._SetNext(lex_mode_e.Arith)  # skip past [

    if lookahead_id == Id.Lit_At or lookahead_id == Id.Arith_Star:
        self._GetToken()
        self._SetNext(lex_mode_e.Arith)  # skip past @
        self._GetToken()
        result = bracket_op.WholeArray(lookahead_id)  # type: bracket_op_t
    else:
        index_expr = self._ReadArithExpr(Id.Arith_RBracket)
        result = bracket_op.ArrayIndex(index_expr)

    if self.token_type != Id.Arith_RBracket:  # Should be looking at ]
        p_die('Expected ] to close subscript', self.cur_token)

    self._SetNext(lex_mode_e.VSub_2)  # skip past ]
    self._GetToken()  # Needed to be in the same spot as no subscript

    return result
438
def _ParseVarOf(self):
    # type: () -> BracedVarSub
    """Parse the variable reference at the start of a ${} expression.

        VarOf = NAME Subscript?
              | NUMBER  # no subscript allowed, none of these are arrays
                        # ${@[1]} doesn't work, even though slicing does
              | VarSymbol

    Returns a new BracedVarSub with name_tok, var_name and bracket_op set.
    """
    self._GetToken()
    name_tok = self.cur_token

    var_sub = BracedVarSub.CreateNull()
    var_sub.name_tok = name_tok
    var_sub.var_name = lexer.TokenVal(name_tok)

    self._SetNext(lex_mode_e.VSub_2)
    self._GetToken()  # Check for []
    if self.token_type == Id.VOp2_LBracket:
        var_sub.bracket_op = self._ReadSubscript()
    else:
        var_sub.bracket_op = None

    return var_sub
462
def _ParseVarExpr(self, arg_lex_mode, allow_query=False):
    # type: (lex_mode_t, bool) -> BracedVarSub
    """Parse VarOf followed by an optional suffix operator, up to the }.

    Args:
      arg_lex_mode: lexer mode for operator arguments (VSub_ArgDQ or
        VSub_ArgUnquoted, chosen by the caller).
      allow_query: whether Kind.VOp3 operators, e.g. ${prefix@}, are
        accepted; otherwise they are a parse error.

    Returns:
      The BracedVarSub from _ParseVarOf(), with suffix_op filled in when an
      operator is present.  The current token is then the closing } (as
      Right_DollarBrace, or Arith_RBrace after a slice).
    """
    part = self._ParseVarOf()

    self._GetToken()
    if self.token_type == Id.Right_DollarBrace:
        return part  # no ops

    op_kind = self.token_kind

    if op_kind == Kind.VTest:
        tok = self.cur_token
        arg_word = self._ReadVarOpArg(arg_lex_mode)
        if self.token_type != Id.Right_DollarBrace:
            p_die('Expected } to close ${', self.cur_token)

        part.suffix_op = suffix_op.Unary(tok, arg_word)

    elif op_kind == Kind.VOpYsh:
        tok = self.cur_token
        arg_word = self._ReadVarOpArg(arg_lex_mode)
        if self.token_type != Id.Right_DollarBrace:
            p_die('Expected } to close ${', self.cur_token)

        UP_arg_word = arg_word
        with tagswitch(arg_word) as case:
            if case(rhs_word_e.Empty):
                # _ReadVarOpArg() returns Empty for an empty argument in
                # VSub_ArgDQ mode, where it stands for the empty string.
                # Bind 'arg' so the use below never reads an unbound local.
                arg = ''
            elif case(rhs_word_e.Compound):
                arg_word = cast(CompoundWord, UP_arg_word)
                # This handles ${x|html} and ${x %.3f} now
                # However I think ${x %.3f} should be statically parsed?  It
                # can enter the printf lexer modes.
                ok, arg, quoted = word_.StaticEval(arg_word)
                if not ok or quoted:
                    p_die('Expected a constant argument',
                          loc.Word(arg_word))

        part.suffix_op = suffix_op.Static(tok, arg)

    elif op_kind == Kind.VOp0:
        part.suffix_op = self.cur_token  # Nullary
        self._SetNext(lex_mode_e.VSub_2)  # Expecting }
        self._GetToken()

    elif op_kind == Kind.VOp1:  # % %% # ## etc.
        tok = self.cur_token
        # Weird exception that all shells have: these operators take a glob
        # pattern, so they're lexed as VSub_ArgUnquoted, not VSub_ArgDQ
        arg_word = self._ReadVarOpArg(lex_mode_e.VSub_ArgUnquoted)
        if self.token_type != Id.Right_DollarBrace:
            p_die('Expected } to close ${', self.cur_token)

        part.suffix_op = suffix_op.Unary(tok, arg_word)

    elif op_kind == Kind.VOp2:  # / : [ ]
        if self.token_type == Id.VOp2_Slash:
            patsub_op = self._ReadPatSubVarOp()  # type: suffix_op_t
            part.suffix_op = patsub_op

            # Checked by the method above
            assert self.token_type == Id.Right_DollarBrace, self.cur_token

        elif self.token_type == Id.VOp2_Colon:
            part.suffix_op = self._ReadSliceVarOp()
            # NOTE: } in arithmetic mode.
            if self.token_type != Id.Arith_RBrace:
                # Token seems off; doesn't point to X in # ${a:1:2 X
                p_die('Expected } to close ${', self.cur_token)

        else:
            # TODO: Does this ever happen?
            p_die('Unexpected token in ${} (%s)' % 'VOp2', self.cur_token)

    elif op_kind == Kind.VOp3:  # ${prefix@} etc.
        if allow_query:
            part.suffix_op = self.cur_token  # Nullary
            self._SetNext(lex_mode_e.VSub_2)  # Expecting }
            self._GetToken()
        else:
            p_die("Unexpected token in ${} (%s)" % 'VOp3', self.cur_token)

    # NOTE: Arith_RBrace is for slicing, because it reads } in arithmetic
    # mode.  It's redundantly checked above.
    if self.token_type not in (Id.Right_DollarBrace, Id.Arith_RBrace):
        # ${a.} or ${!a.}
        p_die('Expected } to close ${', self.cur_token)

    return part
554
def _ReadZshVarSub(self, left_token):
    # type: (Token) -> word_part.ZshVarSub
    """Read the body of a ${(foo)...} substitution up to the closing }.

    Returns a word_part.ZshVarSub spanning left_token to the current token.
    """
    self._SetNext(lex_mode_e.VSub_Zsh)  # Move past ${(foo)

    # The body can be empty, hence empty_ok=True
    body = self._ReadCompoundWord3(lex_mode_e.VSub_Zsh,
                                   Id.Right_DollarBrace, True)
    self._GetToken()
    right_token = self.cur_token
    return word_part.ZshVarSub(left_token, body, right_token)
565
def ReadBracedVarSub(self, left_token):
    # type: (Token) -> Tuple[BracedVarSub, Token]
    """Parse an unquoted ${} and also return the last token read.

    For YSH expressions like var x = ${x:-"default"}.
    """
    var_sub = self._ReadBracedVarSub(left_token, d_quoted=False)
    return var_sub, self.cur_token
572
def _ReadBracedVarSub(self, left_token, d_quoted):
    # type: (Token, bool) -> BracedVarSub
    """Parse the ${} expression language, starting after left_token.

    d_quoted selects the lexer mode for operator arguments: VSub_ArgDQ when
    True, VSub_ArgUnquoted otherwise.

    Grammar:

      NAME        = [a-zA-Z_][a-zA-Z0-9_]*
      NUMBER      = [0-9]+                    # ${10}, ${11}, ...

      Subscript   = '[' ('@' | '*' | ArithExpr) ']'
      VarSymbol   = '!' | '@' | '#' | ...
      VarOf       = NAME Subscript?
                  | NUMBER      # no subscript allowed, none of these are arrays
                                # ${@[1]} doesn't work, even though slicing does
                  | VarSymbol

      NULLARY_OP  = '@Q' | '@E' | '@P' | '@A' | '@a'  # VOp0

      TEST_OP     = '-' | ':-' | '=' | ':=' | '+' | ':+' | '?' | ':?'
      STRIP_OP    = '#' | '##' | '%' | '%%'
      CASE_OP     = ',' | ',,' | '^' | '^^'
      UnaryOp     = TEST_OP | STRIP_OP | CASE_OP

      YSH_UNARY   = '|' | ' '                 # ${x|html} and ${x %.3f}.
                                              # SPACE is operator not %
      Match       = ('/' | '#' | '%') WORD    # match all / prefix / suffix
      VarExpr     = VarOf
                  | VarOf NULLARY_OP
                  | VarOf UnaryOp WORD
                  | VarOf YSH_UNARY STATIC_WORD
                  | VarOf ':' ArithExpr (':' ArithExpr )?
                  | VarOf '/' Match '/' WORD

      LengthExpr  = '#' VarOf    # can't apply operators after length

      RefOrKeys   = '!' VarExpr  # CAN apply operators after a named ref
                                 # ${!ref[0]} vs ${!keys[@]} resolved later

      PrefixQuery = '!' NAME ('*' | '@')  # list variable names with a prefix

      BuiltinSub  = '.' WORD+    # ${.myproc 'builtin' $sub}

      VarSub      = LengthExpr
                  | RefOrKeys
                  | PrefixQuery
                  | VarExpr
                  | BuiltinSub

    NOTES:
    - Arithmetic expressions are used twice, inside subscripts ${a[x+1]} and
      slicing ${a:x+1:y+2}
    - ${#} and ${!} need LL(2) lookahead (considering how my tokenizer works)
    - @ and * are technically arithmetic expressions in this implementation
    - We don't account for bash 4.4: ${param@operator} -- Q E P A a.  Note
      that it's also vectorized.

    Strictness over bash:
    - echo ${a[0][0]} doesn't do anything useful, so we disallow it from the
      grammar
    - ! and # prefixes can't be composed, even though named refs can be
      composed with other operators
    - '#' means 4 different things: length prefix, VarSymbol, UnaryOp to
      strip a prefix, and it can also be a literal part of WORD.

    From the parser's point of view, the prefix # can't be combined with
    UnaryOp/slicing/matching, and the ! can.  However

    - ${a[@]:1:2} is not allowed
    - ${#a[@]:1:2} is allowed, but gives the wrong answer
    """
    arg_lex_mode = (lex_mode_e.VSub_ArgDQ
                    if d_quoted else lex_mode_e.VSub_ArgUnquoted)

    self._SetNext(lex_mode_e.VSub_1)
    self._GetToken()

    first_id = self.token_type
    first_tok = self.cur_token

    if first_id == Id.VSub_Pound:
        # Disambiguate: is '#' the length prefix, or the variable itself?
        lookahead_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
        if lookahead_id in (Id.Unknown_Tok, Id.Right_DollarBrace):
            # not a prefix, '#' is the variable
            part = self._ParseVarExpr(arg_lex_mode)
        else:
            # e.g. a name follows, so '#' is the prefix
            self._SetNext(lex_mode_e.VSub_1)
            part = self._ParseVarOf()

            self._GetToken()
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } after length expression', self.cur_token)

            part.prefix_op = first_tok

    elif first_id == Id.VSub_Bang:
        lookahead_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
        if lookahead_id in (Id.Unknown_Tok, Id.Right_DollarBrace):
            # not a prefix, '!' is the variable
            part = self._ParseVarExpr(arg_lex_mode)
        else:
            # e.g. a name follows, so '!' is the prefix
            #   ${!a} -- this is a ref
            #   ${!3} -- this is ref
            #   ${!a[1]} -- this is a ref
            #   ${!a[@]} -- this is a keys
            # No lookahead -- do it in a second step, or at runtime
            self._SetNext(lex_mode_e.VSub_1)
            part = self._ParseVarExpr(arg_lex_mode, allow_query=True)

            part.prefix_op = first_tok

    elif first_id == Id.VSub_Dot:
        # Note: this will become a new builtin_sub type, so this method must
        # return word_part_t rather than BracedVarSub.  I don't think that
        # should cause problems.
        p_die('TODO: ${.myproc builtin sub}', self.cur_token)

    elif self.token_kind == Kind.VSub:
        # VS_NAME, VS_NUMBER, symbol that isn't # or !
        part = self._ParseVarExpr(arg_lex_mode)

    else:
        # e.g. ${^}
        p_die('Unexpected token in ${}', self.cur_token)

    # Attach the delimiters: the argument, and the token we stopped on
    part.left = left_token
    part.right = self.cur_token
    return part
703
def _ReadSingleQuoted(self, left_token, lex_mode):
    # type: (Token, lex_mode_t) -> SingleQuoted
    """Internal method to read a single-quoted word_part.

    Collects the string's tokens with ReadSingleQuoted(), evaluates them to
    a string value, and wraps the result in a SingleQuoted node.
    """
    body_tokens = []  # type: List[Token]

    # In command mode, we never disallow backslashes like '\', so
    # is_ysh_expr is False
    right_token = self.ReadSingleQuoted(lex_mode, left_token, body_tokens,
                                        False)

    value = word_compile.EvalSingleQuoted(left_token.id, body_tokens)
    return SingleQuoted(left_token, value, right_token)
714
def ReadSingleQuoted(self, lex_mode, left_token, out_tokens, is_ysh_expr):
    # type: (lex_mode_t, Token, List[Token], bool) -> Token
    """Read the tokens of a single-quoted string, up to its closing quote(s).

    Appends the body tokens (without the closing quotes) to out_tokens, and
    returns the last token read.  Triple-quoted strings need three closing
    quote tokens in a row.  is_ysh_expr enables the stricter checks for
    strings in YSH expressions.

    Used by expr_parse.py
    """
    left_id = left_token.id

    if (left_id == Id.Left_DollarSingleQuote and
            self.parse_opts.no_parse_osh()):
        p_die("Instead of $'', use J8 strings like b'' (no_parse_osh)",
              left_token)

    is_triple = left_id in (Id.Left_TSingleQuote, Id.Left_RTSingleQuote,
                            Id.Left_UTSingleQuote, Id.Left_BTSingleQuote)
    quotes_needed = 3 if is_triple else 1

    # echo '\' is allowed, but x = '\' is invalid, in favor of x = r'\'
    # enforce for triple-quoted strings: ''' \u ''' requires r''' \u '''
    no_backslashes = is_ysh_expr and left_id in (Id.Left_SingleQuote,
                                                 Id.Left_TSingleQuote)

    # TODO: could we directly append to out_tokens?
    body = []  # type: List[Token]
    quotes_seen = 0
    while quotes_seen < quotes_needed:
        self._SetNext(lex_mode)
        self._GetToken()

        tok = self.cur_token
        kind = self.token_kind

        if kind == Kind.Right:
            # assume Id.Right_SingleQuote
            quotes_seen += 1
            body.append(tok)
            continue

        # Anything else breaks the run: we need three in a ROW
        quotes_seen = 0

        # Kind.Char emitted in lex_mode.SQ_C
        if kind == Kind.Lit or kind == Kind.Char:
            # Happens in lex_mode_e.SQ: 'one\two' is ambiguous, should be
            # r'one\two' or c'one\\two'
            if no_backslashes and lexer.TokenContains(tok, '\\'):
                p_die(
                    "Ambiguous backslash: add explicit r'' or u'' prefix (OILS-ERR-20)",
                    tok)

            if is_ysh_expr:
                # Disallow var x = $'\001'.  Arguably we don't need these
                # checks because u'\u{1}' is the way to write it.
                if self.token_type == Id.Char_Octal3:
                    p_die(
                        r"Use \xhh or \u{...} instead of octal escapes in YSH strings",
                        tok)

                if self.token_type == Id.Char_Hex and tok.length != 4:
                    # disallow \xH
                    p_die(
                        r'Invalid hex escape in YSH string (must be \xHH)',
                        tok)

            body.append(tok)

        elif kind == Kind.Unknown:
            assert tok.id == Id.Unknown_Backslash, tok

            # x = $'\z' is disallowed; ditto for echo $'\z' if shopt --set
            # no_parse_backslash
            if is_ysh_expr or self.parse_opts.no_parse_backslash():
                p_die(
                    "Invalid char escape in C-style string literal (OILS-ERR-11)",
                    tok)

            body.append(tok)

        elif kind == Kind.Eof:
            p_die('Unexpected EOF in single-quoted string that began here',
                  left_token)

        else:
            raise AssertionError(tok)

    # Get rid of the closing quote tokens: one, or three for triple quotes
    body.pop()
    if is_triple:
        body.pop()
        body.pop()

        # Remove space from ''' r''' $''' in both expression mode and
        # command mode
        word_compile.RemoveLeadingSpaceSQ(body)

    # Validation after lexing - same 2 checks in j8.LexerDecoder
    if left_id in (Id.Left_USingleQuote, Id.Left_UTSingleQuote):
        for body_tok in body:
            # u'\yff' is not valid, but b'\yff' is
            if body_tok.id == Id.Char_YHex:
                p_die(
                    r"%s escapes not allowed in u'' strings" %
                    lexer.TokenVal(body_tok), body_tok)

    out_tokens.extend(body)
    return self.cur_token
820
def _ReadDoubleQuotedLeftParts(self):
    # type: () -> word_part_t
    """Read one substitution part in a double quoted context.

    Dispatches on the current token, which must be one of the Left_* ids
    handled below; anything else is an AssertionError.
    """
    left_id = self.token_type

    if left_id == Id.Left_DollarParen or left_id == Id.Left_Backtick:
        return self._ReadCommandSub(left_id, d_quoted=True)

    if left_id == Id.Left_DollarBrace:
        return self._ReadBracedVarSub(self.cur_token, d_quoted=True)

    if left_id == Id.Left_DollarDParen:
        # TODO: Uncomment this after another regtest/aports run
        # if (self.LookAheadDParens(shift_back=1)):
        return self._ReadArithSub()
        # else:
        #     Mutate token - we treat this '$((' as '$( ('
        #     self.cur_token.id = Id.Left_DollarParen
        #     return self._ReadCommandSub(Id.Left_DollarParen, d_quoted=True)

    if left_id == Id.Left_DollarBracket:
        # $[ is a YSH expression sub when the option is on; otherwise it is
        # read as an arithmetic sub that ends at ]
        if not self.parse_opts.parse_ysh_expr_sub():
            return self._ReadArithSub(end_id=Id.Arith_RBracket)
        return self._ReadExprSub(lex_mode_e.DQ)

    if left_id == Id.Left_DollarBraceZsh:
        return self._ReadZshVarSub(self.cur_token)

    raise AssertionError(self.cur_token)
850
def _ReadYshSingleQuoted(self, left_id):
    # type: (Id_t) -> CompoundWord
    """Read YSH style strings, returning a one-part CompoundWord.

        r''   u''   b''
        r''' '''   u''' '''   b''' '''

    left_id must be Left_RSingleQuote, Left_USingleQuote or
    Left_BSingleQuote.  The token after the string must end the word.
    """
    if left_id == Id.Left_RSingleQuote:
        lexer_mode = lex_mode_e.SQ_Raw
        triple_left_id = Id.Left_RTSingleQuote
    elif left_id in (Id.Left_USingleQuote, Id.Left_BSingleQuote):
        lexer_mode = lex_mode_e.J8_Str
        if left_id == Id.Left_USingleQuote:
            triple_left_id = Id.Left_UTSingleQuote
        else:
            triple_left_id = Id.Left_BTSingleQuote
    else:
        raise AssertionError(left_id)

    # Retag the current token with left_id; needed for syntax checks
    opening_tok = self.cur_token
    opening_tok.id = left_id

    sq_part = self._ReadSingleQuoted(opening_tok, lexer_mode)

    # An empty string immediately followed by ' is the start of a
    # triple-quoted string
    if len(sq_part.sval) == 0 and self.lexer.ByteLookAhead() == "'":
        self._SetNext(lex_mode_e.ShCommand)
        self._GetToken()

        assert self.token_type == Id.Left_SingleQuote
        # HACK: magically transform the third ' in u''' to
        # Id.Left_UTSingleQuote, so that ''' is the terminator
        triple_tok = self.cur_token
        triple_tok.id = triple_left_id

        # Handles stripping leading whitespace
        sq_part = self._ReadSingleQuoted(triple_tok, lexer_mode)

    # Advance and validate
    self._SetNext(lex_mode_e.ShCommand)
    self._GetToken()
    if self.token_kind not in KINDS_THAT_END_WORDS:
        p_die('Unexpected token after YSH single-quoted string',
              self.cur_token)

    return CompoundWord([sq_part])
899
def _ReadUnquotedLeftParts(self, triple_out):
    # type: (Optional[BoolParamBox]) -> word_part_t
    """Read substitutions and quoted strings (for lex_mode_e.ShCommand).

    If triple_out is set, then we try parsing triple quoted strings, and set
    its value (triple_out.b) to True if we got one.
    """
    left_id = self.token_type

    if left_id == Id.Left_DoubleQuote or left_id == Id.Left_DollarDoubleQuote:
        # Note: $"" is a synonym for "".  It might make sense if it added
        # \n \0 \x00 \u{123} etc.  But that's not what bash does!
        dq_part = self._ReadDoubleQuoted(self.cur_token)

        # Got empty word "" and there's a " after
        if (triple_out and len(dq_part.parts) == 0 and
                self.lexer.ByteLookAhead() == '"'):

            self._SetNext(lex_mode_e.ShCommand)
            self._GetToken()
            # HACK: magically transform the third " in """ to
            # Id.Left_TDoubleQuote, so that """ is the terminator
            triple_dq_tok = self.cur_token
            triple_dq_tok.id = Id.Left_TDoubleQuote
            triple_out.b = True  # let caller know we got it
            return self._ReadDoubleQuoted(triple_dq_tok)

        return dq_part

    if left_id in (Id.Left_SingleQuote, Id.Left_RSingleQuote,
                   Id.Left_DollarSingleQuote):
        lexer_mode = lex_mode_e.SQ_Raw
        if left_id == Id.Left_SingleQuote:
            triple_left_id = Id.Left_TSingleQuote
        elif left_id == Id.Left_RSingleQuote:
            triple_left_id = Id.Left_RTSingleQuote
        else:
            lexer_mode = lex_mode_e.SQ_C
            # there is no such thing as $'''
            triple_left_id = Id.Undefined_Tok

        sq_part = self._ReadSingleQuoted(self.cur_token, lexer_mode)

        # Got empty '' or r'' and there's a ' after
        # u'' and b'' are handled in _ReadYshSingleQuoted
        if (triple_left_id != Id.Undefined_Tok and
                triple_out is not None and len(sq_part.sval) == 0 and
                self.lexer.ByteLookAhead() == "'"):

            self._SetNext(lex_mode_e.ShCommand)
            self._GetToken()

            # HACK: magically transform the third ' in ''' to
            # Id.Left_TSingleQuote, so that ''' is the terminator
            triple_sq_tok = self.cur_token
            triple_sq_tok.id = triple_left_id

            triple_out.b = True  # let caller know we got it
            return self._ReadSingleQuoted(triple_sq_tok, lexer_mode)

        return sq_part

    if left_id in (Id.Left_DollarParen, Id.Left_Backtick, Id.Left_ProcSubIn,
                   Id.Left_ProcSubOut):
        return self._ReadCommandSub(left_id, d_quoted=False)

    if left_id == Id.Left_DollarBrace:
        return self._ReadBracedVarSub(self.cur_token, d_quoted=False)

    if left_id == Id.Left_DollarDParen:
        # TODO: Uncomment this after another regtest/aports run
        # if (self.LookAheadDParens(shift_back=1)):
        return self._ReadArithSub()
        # else:
        #     Mutate token - we treat this '$((' as '$( ('
        #     self.cur_token.id = Id.Left_DollarParen
        #     return self._ReadCommandSub(Id.Left_DollarParen, d_quoted=True)

    if left_id == Id.Left_DollarBracket:
        # $[ is a YSH expression sub when the option is on; otherwise it is
        # read as an arithmetic sub that ends at ]
        if not self.parse_opts.parse_ysh_expr_sub():
            return self._ReadArithSub(end_id=Id.Arith_RBracket)
        return self._ReadExprSub(lex_mode_e.ShCommand)

    if left_id == Id.Left_DollarBraceZsh:
        return self._ReadZshVarSub(self.cur_token)

    raise AssertionError(self.cur_token)
986
def _ReadExtGlob(self):
    # type: () -> word_part.ExtGlob
    """Read an extended glob; called looking at its LEFT token.

    Grammar:
      Item    = CompoundWord | EPSILON  # important: @(foo|) is allowed
      LEFT    = '@(' | '*(' | '+(' | '?(' | '!('
      RIGHT   = ')'
      ExtGlob = LEFT (Item '|')* Item RIGHT  # ITEM may be empty
      Compound includes ExtGlob

    Returns a word_part.ExtGlob with one arm per Item; an empty Item becomes
    a CompoundWord with no parts.
    """
    left_token = self.cur_token
    arms = []  # type: List[CompoundWord]

    # The loop below expects the closing ) to arrive as Right_ExtGlob
    self.lexer.PushHint(Id.Op_RParen, Id.Right_ExtGlob)
    self._SetNext(lex_mode_e.ExtGlob)  # advance past LEFT

    # Did we just read a word?  To handle @(||).
    have_arm = False

    while True:
        self._GetToken()
        tok_id = self.token_type

        if tok_id == Id.Right_ExtGlob or tok_id == Id.Op_Pipe:
            # An arm ends here; if no word came before it, it's empty
            if not have_arm:
                arms.append(CompoundWord([]))

            if tok_id == Id.Right_ExtGlob:
                return word_part.ExtGlob(left_token, arms, self.cur_token)

            have_arm = False
            self._SetNext(lex_mode_e.ExtGlob)
            continue

        # lex_mode_e.ExtGlob should only produce these 4 kinds of tokens
        if self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub, Kind.ExtGlob):
            arms.append(self._ReadCompoundWord(lex_mode_e.ExtGlob))
            have_arm = True

        elif self.token_kind == Kind.Eof:
            p_die('Unexpected EOF reading extended glob that began here',
                  left_token)

        else:
            raise AssertionError(self.cur_token)
1036
def _ReadBashRegexGroup(self):
    # type: () -> word_part.BashRegexGroup
    """Read a parenthesized group in a bash regex, e.g. [[ s =~ (a b) ]].

    Grammar:
      BashRegexGroup = '(' WORD? ')'

    Precondition: the current token is Id.BashRegex_LParen.

    Returns:
      word_part.BashRegexGroup with the left token, the inner word (None
      for an empty group), and the closing token.

    Dies with a parse error if the group isn't closed by ), or if the
    token after ( can't start a word.
    """
    left_token = self.cur_token
    assert left_token.id == Id.BashRegex_LParen, left_token

    self.lexer.PushHint(Id.Op_RParen, Id.Right_BashRegexGroup)
    self._SetNext(lex_mode_e.BashRegexFakeInner)  # advance past LEFT

    self._GetToken()
    if self.token_type == Id.Right_BashRegexGroup:  # empty ()
        return word_part.BashRegexGroup(left_token, None, self.cur_token)

    # Only these 4 kinds of tokens can start the word inside the group
    if self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub, Kind.BashRegex):
        # Fake lexer mode that translates Id.WS_Space to Id.Lit_Chars
        # To allow bash style [[ s =~ (a b) ]]
        w = self._ReadCompoundWord(lex_mode_e.BashRegexFakeInner)

        self._GetToken()
        if self.token_type != Id.Right_BashRegexGroup:
            p_die('Expected ) to close bash regex group', self.cur_token)

        return word_part.BashRegexGroup(left_token, w, self.cur_token)

    p_die('Expected word after ( opening bash regex group', self.cur_token)
1069
def _ReadLikeDQ(self,
                left_token,
                is_ysh_expr,
                out_parts,
                is_here_doc=False):
    # type: (Optional[Token], bool, List[word_part_t], bool) -> None
    """Read word parts in lex_mode_e.DQ and append them to out_parts.

    Shared by double quoted strings, triple double quoted strings, and
    here doc bodies.

    Args:
      left_token: A token if we are reading a double quoted part, or None
        if we're reading a here doc.
      is_ysh_expr: Whether to disallow backticks, literal $, and invalid
        char escapes
      out_parts: list of word_part to append to
      is_here_doc: Not read by this method.  The here doc case is
        recognized by left_token being None.

    Returns nothing; results are appended to out_parts.  The closing
    quote token(s) are not left in out_parts.
    """
    if left_token:
        if left_token.id in (Id.Left_TDoubleQuote,
                             Id.Left_DollarTDoubleQuote):
            expected_end_tokens = 3
        else:
            expected_end_tokens = 1
    else:
        expected_end_tokens = 1000  # here doc will break

    num_end_tokens = 0
    while num_end_tokens < expected_end_tokens:
        self._SetNext(lex_mode_e.DQ)
        self._GetToken()

        if self.token_kind == Kind.Lit:
            if self.token_type == Id.Lit_EscapedChar:
                tok = self.cur_token
                ch = lexer.TokenSliceLeft(tok, 1)
                part = word_part.EscapedLiteral(tok,
                                                ch)  # type: word_part_t

            elif self.token_type == Id.Lit_EscapedDoubleQuote:
                # Bind tok for BOTH branches below.  Previously the quoted
                # branch read 'tok' without assigning it, so it was either
                # unbound or a stale token from an earlier iteration.
                tok = self.cur_token
                if left_token:
                    part = word_part.EscapedLiteral(tok, "\"")
                else:
                    # in here docs \" should not be escaped, staying as
                    # literal characters
                    part = Token(Id.Lit_Chars, tok.length, tok.col,
                                 tok.line, tok.tval)

            else:
                if self.token_type == Id.Lit_BadBackslash:
                    # echo "\z" is OK in shell, but 'x = "\z" is a syntax
                    # error in YSH.
                    # Slight hole: We don't catch 'x = ${undef:-"\z"}
                    # because of the recursion (unless no_parse_backslash)
                    if (is_ysh_expr or
                            self.parse_opts.no_parse_backslash()):
                        p_die(
                            "Invalid char escape in double quoted string (OILS-ERR-12)",
                            self.cur_token)
                elif self.token_type == Id.Lit_Dollar:
                    if is_ysh_expr or self.parse_opts.no_parse_dollar():
                        p_die("Literal $ should be quoted like \$",
                              self.cur_token)

                part = self.cur_token

            out_parts.append(part)

        elif self.token_kind == Kind.Left:
            if self.token_type == Id.Left_Backtick and is_ysh_expr:
                p_die("Backtick should be $(cmd) or \\` (OILS-ERR-18)",
                      self.cur_token)

            part = self._ReadDoubleQuotedLeftParts()
            out_parts.append(part)

        elif self.token_kind == Kind.VSub:
            tok = self.cur_token
            part = SimpleVarSub(tok)
            out_parts.append(part)
            # NOTE: parsing "$f(x)" would BREAK CODE.  Could add a mode for
            # it later.

        elif self.token_kind == Kind.Right:
            assert self.token_type == Id.Right_DoubleQuote, self.token_type
            if left_token:
                num_end_tokens += 1

            # In a here doc, the right quote is literal!  In a quoted
            # string, the closing quote(s) are popped after the loop.
            out_parts.append(self.cur_token)

        elif self.token_kind == Kind.Eof:
            if left_token:
                p_die(
                    'Unexpected EOF reading double-quoted string that began here',
                    left_token)
            else:  # here docs will have an EOF in their token stream
                break

        else:
            raise AssertionError(self.cur_token)

        if self.token_kind != Kind.Right:
            num_end_tokens = 0  # """ must be CONSECUTIVE

    # Drop the closing quote token(s) appended by the Kind.Right branch.
    # Nothing is dropped for here docs (expected_end_tokens == 1000).
    if expected_end_tokens == 1:
        out_parts.pop()
    elif expected_end_tokens == 3:
        out_parts.pop()
        out_parts.pop()
        out_parts.pop()

    # Remove space from """ in both expression mode and command mode
    if (left_token and left_token.id
            in (Id.Left_TDoubleQuote, Id.Left_DollarTDoubleQuote)):
        word_compile.RemoveLeadingSpaceDQ(out_parts)

    # Return nothing, since we appended to 'out_parts'
1184
def _ReadDoubleQuoted(self, left_token):
    # type: (Token) -> DoubleQuoted
    """Read a double quoted word part such as "hello $name".

    Args:
      left_token: the opening quote token

    Returns:
      DoubleQuoted built from left_token, the parts read by _ReadLikeDQ
      (with is_ysh_expr=False), and the token that is current when
      reading stops.
    """
    dq_parts = []  # type: List[word_part_t]
    self._ReadLikeDQ(left_token, False, dq_parts)
    return DoubleQuoted(left_token, dq_parts, self.cur_token)
1200
def ReadDoubleQuoted(self, left_token, parts):
    # type: (Token, List[word_part_t]) -> Token
    """Expression mode entry point for double quoted strings.

    Example: var x = "${dir:-}/$name"

    Appends the string's parts to 'parts' (read with is_ysh_expr=True) and
    returns the token that is current when reading stops.
    """
    in_ysh_expr = True
    self._ReadLikeDQ(left_token, in_ysh_expr, parts)
    closing_tok = self.cur_token
    return closing_tok
1209
def _ReadCommandSub(self, left_id, d_quoted=False):
    # type: (Id_t, bool) -> CommandSub
    """Read a command substitution or process substitution.

    NOTE: This is not in the grammar, because word parts aren't in the
    grammar!

    command_sub = '$(' command_list ')'
                | '@(' command_list ')'
                | '<(' command_list ')'
                | '>(' command_list ')'
                | ` command_list `

    Args:
      left_id: Id of the construct to read.  This is passed separately from
        self.cur_token, whose id may differ (e.g. the @( case).
      d_quoted: Only consulted in the backtick re-parsing path: if True,
        the leading backslash of a Backtick_DoubleQuote token is removed.

    Returns:
      CommandSub with the left token, the parsed command, and the right
      token.
    """
    left_token = self.cur_token

    # Set the lexer in a state so ) becomes the EOF token.
    if left_id in (Id.Left_DollarParen, Id.Left_AtParen, Id.Left_ProcSubIn,
                   Id.Left_ProcSubOut):
        self._SetNext(lex_mode_e.ShCommand)  # advance past $( etc.

        right_id = Id.Eof_RParen
        self.lexer.PushHint(Id.Op_RParen, right_id)
        c_parser = self.parse_ctx.MakeParserForCommandSub(
            self.line_reader, self.lexer, right_id)
        # NOTE: This doesn't use something like main_loop because we don't
        # want to interleave parsing and execution!  Unlike 'source' and
        # 'eval'.
        node = c_parser.ParseCommandSub()

        # The nested parser shares our lexer; its current token is the one
        # that ended the substitution.
        right_token = c_parser.w_parser.cur_token

    elif left_id == Id.Left_Backtick and self.parse_ctx.do_lossless:
        # NOTE: This is an APPROXIMATE solution for translation ONLY.  See
        # test/osh2oil.

        # Parse directly from the shared lexer, with the next backtick
        # hinted to be the EOF token.
        right_id = Id.Eof_Backtick
        self.lexer.PushHint(Id.Left_Backtick, right_id)
        c_parser = self.parse_ctx.MakeParserForCommandSub(
            self.line_reader, self.lexer, right_id)
        node = c_parser.ParseCommandSub()
        right_token = c_parser.w_parser.cur_token

    elif left_id == Id.Left_Backtick:
        if self.parse_opts.no_parse_backticks():
            p_die(
                'Backtick should be $(cmd) or \\` (no_parse_backticks, OILS-ERR-18)',
                left_token)

        self._SetNext(lex_mode_e.Backtick)  # advance past `

        # Collect the text between the backticks, then re-parse it below
        # as a separate piece of source code.
        parts = []  # type: List[str]
        while True:
            self._GetToken()
            #log("TOK %s", self.cur_token)

            if self.token_type == Id.Backtick_Quoted:
                # Remove leading \
                parts.append(lexer.TokenSliceLeft(self.cur_token, 1))

            elif self.token_type == Id.Backtick_DoubleQuote:
                # Compatibility: If backticks are double quoted, then double
                # quotes within them have to be \"
                # Shells aren't smart enough to match nested " and ` quotes
                # (but OSH is)
                if d_quoted:
                    # Remove leading \
                    parts.append(lexer.TokenSliceLeft(self.cur_token, 1))
                else:
                    parts.append(lexer.TokenVal(self.cur_token))

            elif self.token_type == Id.Backtick_Other:
                parts.append(lexer.TokenVal(self.cur_token))

            elif self.token_type == Id.Backtick_Right:
                break

            elif self.token_type == Id.Eof_Real:
                # Note: this parse error is in the ORIGINAL context.  No
                # code_str yet.
                p_die('Unexpected EOF while looking for closing backtick',
                      left_token)

            else:
                raise AssertionError(self.cur_token)

            self._SetNext(lex_mode_e.Backtick)

        # Calculate right SPID on CommandSub BEFORE re-parsing.
        right_token = self.cur_token

        code_str = ''.join(parts)
        #log('code %r', code_str)

        # Save lines into a new, temporary arena, so SnipCodeBlock() isn't
        # messed up.  Note: This is similar to how we parse aliases in
        # osh/cmd_parse.py.  It won't have the same location info as
        # MakeParserForCommandSub(), because the reader is different.
        arena = alloc.Arena()
        # TODO: arena.PushSource()?

        line_reader = reader.StringLineReader(code_str, arena)
        c_parser = self.parse_ctx.MakeOshParser(line_reader)
        src = source.Reparsed('backticks', left_token, right_token)
        with alloc.ctx_SourceCode(arena, src):
            node = c_parser.ParseCommandSub()

    else:
        raise AssertionError(left_id)

    return CommandSub(left_token, node, right_token)
1316
def _ReadExprSub(self, lex_mode):
    # type: (lex_mode_t) -> ExprSub
    """Read an expression sub like $[d->key] or $[obj.method()].

    Args:
      lex_mode: the lexer mode to switch to once the closing ] has been
        read
    """
    opening = self.cur_token

    # The body is parsed by the YSH expression parser, in Expr mode
    self._SetNext(lex_mode_e.Expr)
    expr_node, closing = self.parse_ctx.ParseYshExpr(
        self.lexer, grammar_nt.ysh_expr_sub)

    self._SetNext(lex_mode)  # step over ]
    return ExprSub(opening, expr_node, closing)
1328
def ParseVarDecl(self, kw_token):
    # type: (Token) -> VarDecl
    """Parse a YSH variable declaration that follows kw_token.

    oil_var_decl: name_type_list '=' testlist end_stmt

    Declarations must end with \\n ; } or EOF.  Unlike shell assignments,
    these are disallowed:

      var x = 42 | wc -l
      var x = 42 && echo hi
    """
    self._SetNext(lex_mode_e.Expr)
    decl, end_tok = self.parse_ctx.ParseVarDecl(kw_token, self.lexer)

    # The Expr lexer mode yields Op_RBrace for }, but CommandParser wants
    # Lit_RBrace, so rewrite the token in place.
    if end_tok.id == Id.Op_RBrace:
        end_tok.id = Id.Lit_RBrace

    # Hand the terminator (e.g. Op_Semi or Op_Newline) to the
    # CommandParser as the next word.
    self.buffered_word = end_tok
    self._SetNext(lex_mode_e.ShCommand)  # always return to command mode
    return decl
1351
def ParseMutation(self, kw_token, var_checker):
    # type: (Token, VarChecker) -> Mutation
    """Parse a YSH mutation that follows kw_token, and check its names.

    Examples:
      setvar i = 42
      setvar i += 1
      setvar a[i] = 42
      setvar a[i] += 1
      setvar d.key = 42
      setvar d.key += 1

    Args:
      kw_token: the keyword token; its id is passed to var_checker.Check()
      var_checker: its Check() method is called for each left-hand side
        that is a plain name, or a subscript / attribute of a plain name
    """
    self._SetNext(lex_mode_e.Expr)
    enode, last_token = self.parse_ctx.ParseMutation(kw_token, self.lexer)
    # Hack to move } from what the Expr lexer modes gives to what
    # CommandParser wants
    if last_token.id == Id.Op_RBrace:
        last_token.id = Id.Lit_RBrace

    for lhs in enode.lhs:
        UP_lhs = lhs
        with tagswitch(lhs) as case:
            if case(y_lhs_e.Var):
                lhs = cast(Token, UP_lhs)
                var_checker.Check(kw_token.id, lexer.LazyStr(lhs), lhs)

            # Note: this does not cover cases like
            # setvar (a[0])[1] = v
            # setvar (d.key).other = v
            # This leaks into catching all typos statically, which may be
            # possible if 'use' makes all names explicit.
            elif case(y_lhs_e.Subscript):
                lhs = cast(Subscript, UP_lhs)
                # Only checked when the subscripted object is a bare name
                if lhs.obj.tag() == expr_e.Var:
                    v = cast(expr.Var, lhs.obj)
                    var_checker.Check(kw_token.id, v.name, v.left)

            elif case(y_lhs_e.Attribute):
                lhs = cast(Attribute, UP_lhs)
                # Only checked when the object is a bare name
                if lhs.obj.tag() == expr_e.Var:
                    v = cast(expr.Var, lhs.obj)
                    var_checker.Check(kw_token.id, v.name, v.left)

    # Let the CommandParser see the Op_Semi or Op_Newline.
    self.buffered_word = last_token
    self._SetNext(lex_mode_e.ShCommand)  # always back to this
    return enode
1397
def ParseBareDecl(self):
    # type: () -> expr_t
    """Parse a bare declaration, e.g.

      x = {name: val}

    The terminating token is buffered as the next word; a closing } is
    first rewritten from Op_RBrace to Lit_RBrace.
    """
    self._SetNext(lex_mode_e.Expr)
    self._GetToken()

    rhs, end_tok = self.parse_ctx.ParseYshExpr(self.lexer,
                                               grammar_nt.command_expr)

    # Translate } from the Expr lexer mode's id to the command mode id
    if end_tok.id == Id.Op_RBrace:
        end_tok.id = Id.Lit_RBrace

    self.buffered_word = end_tok
    self._SetNext(lex_mode_e.ShCommand)
    return rhs
1412
def ParseYshExprForCommand(self):
    # type: () -> expr_t
    """Parse a YSH expression that appears inside a command.

    Fudge for telling these apart:

      for x in(y) {
      for x in (y) {

    In the first form, ReadWord on 'in' has already moved the lexer past (,
    so that token is un-read before parsing.  Also see LookPastSpace in
    CommandParser.  A simpler solution would be nicer.
    """
    already_past_lparen = (self.token_type == Id.Op_LParen)
    if already_past_lparen:
        self.lexer.MaybeUnreadOne()

    # The last token is not needed here
    expr_node, _ = self.parse_ctx.ParseYshExpr(self.lexer,
                                               grammar_nt.ysh_expr)

    self._SetNext(lex_mode_e.ShCommand)
    return expr_node
1432
def ParseCommandExpr(self):
    # type: () -> expr_t
    """Parse the expression of an '=' command, e.g.

      = 1+2
    """
    expr_node, end_tok = self.parse_ctx.ParseYshExpr(
        self.lexer, grammar_nt.command_expr)

    # The expression parser has read up to and including end_tok, but some
    # callers (such as the case statement) expect the lexer to point at the
    # token right after the expression.  Un-reading restores that state:
    #
    #   case (x) {                       case (x) {
    #     (else) { = x }                   (else) { = x }
    #                   ^ lexer is here                ^ un-read to here
    #   }                                }
    assert end_tok.id in (Id.Op_Newline, Id.Eof_Real, Id.Op_Semi,
                          Id.Op_RBrace), end_tok

    # Eof_Real is the one token that cannot be un-read
    can_unread = (end_tok.id != Id.Eof_Real)
    if can_unread:
        self.lexer.MaybeUnreadOne()

    return expr_node
1457
def ParseProc(self, node):
    # type: (Proc) -> None
    """Parse a proc's name and signature into 'node'.

    Sets node.name, lets parse_ctx.ParseProc() fill in the rest, and
    buffers the opening { as the next word for the CommandParser.
    """
    # Read the name in command mode, so that names with hyphens like
    # 'proc name-with-hyphens()' are accepted
    self._SetNext(lex_mode_e.ShCommand)
    self._GetToken()

    # e.g. 'proc f[' yields Lit_ArrayLhsOpen, which is rejected here
    if self.token_type != Id.Lit_Chars:
        p_die('Invalid proc name %s' % ui.PrettyToken(self.cur_token),
              self.cur_token)

    # TODO: validate this more.  Disallow proc 123 { }, which isn't
    # disallowed for shell functions.  Similar to IsValidVarName().
    name_tok = self.cur_token
    node.name = name_tok

    brace_tok = self.parse_ctx.ParseProc(self.lexer, node)

    # The { came from lex_mode_e.Expr; give it the ShCommand id that
    # CommandParser expects
    assert brace_tok.id == Id.Op_LBrace
    brace_tok.id = Id.Lit_LBrace
    self.buffered_word = brace_tok

    self._SetNext(lex_mode_e.ShCommand)
1481
def ParseFunc(self, node):
    # type: (Func) -> None
    """Parse a func signature into 'node', up to its opening {.

    The { is buffered as the next word for the CommandParser.
    """
    brace_tok = self.parse_ctx.ParseFunc(self.lexer, node)

    # The { came from lex_mode_e.Expr; give it the ShCommand id that
    # CommandParser expects
    assert brace_tok.id == Id.Op_LBrace
    brace_tok.id = Id.Lit_LBrace
    self.buffered_word = brace_tok

    self._SetNext(lex_mode_e.ShCommand)
1492
def ParseYshCasePattern(self):
    # type: () -> Tuple[pat_t, Token]
    """Parse the pattern of a YSH case arm.

    Returns:
      (pattern, left token).  The last token read is buffered as the next
      word; if it is {, its id is first changed from Op_LBrace to
      Lit_LBrace.
    """
    pattern, left_tok, end_tok = self.parse_ctx.ParseYshCasePattern(
        self.lexer)

    if end_tok.id == Id.Op_LBrace:
        end_tok.id = Id.Lit_LBrace
    self.buffered_word = end_tok

    return pattern, left_tok
1503
def NewlineOkForYshCase(self):
    # type: () -> Id_t
    """Skip optional newlines before a YSH case arm, and peek at what follows.

    This is a special case of `_NewlineOk` which fixed some "off-by-one"
    issues which crop up while parsing YSH case arms.  For more details,
    see #oil-dev > Progress On YSH Case Grammar on zulip.

    Returns the id of the first token that is neither a newline nor
    ignored, which selects among:

      word { echo word }
      (3) { echo expr }
      /e/ { echo eggex }
      }  # right brace
    """
    while True:
        peeked_id = self.lexer.LookAheadOne(lex_mode_e.Expr)

        # Lookahead can't see past the end of a line
        if peeked_id == Id.Unknown_Tok:
            if self.lexer.MoveToNextLine():
                continue
            break  # EOF

        peeked_kind = consts.GetKind(peeked_id)
        is_skippable = (peeked_id == Id.Op_Newline or
                        peeked_kind == Kind.Ignored)
        if not is_skippable:
            break

        # Consume the newline / ignored token and look again
        self.lexer.Read(lex_mode_e.Expr)

    if peeked_id in (Id.Op_RBrace, Id.Op_LParen, Id.Arith_Slash):
        # Stay in expression mode
        self._SetNext(lex_mode_e.Expr)
    else:
        # Consume the trailing Op_Newline
        self._SetNext(lex_mode_e.ShCommand)
        self._GetToken()

    return peeked_id
1542
def _ReadArithExpr(self, end_id):
    # type: (Id_t) -> arith_expr_t
    """Parse an arithmetic expression, in any of these contexts:

      $(( 1+2 ))
      (( a=1+2 ))
      ${a[ 1+2 ]}
      ${a : 1+2 : 1+2}

    See tests/arith-context.test.sh for ambiguous cases, such as:

      ${a[a[0]]}          # VS_RBRACKET vs Id.Arith_RBracket
      ${s : a<b?0:1 : 1}  # VS_COLON vs Id.Arith_Colon

    Also see the assertion in ArithParser.Parse() about unexpected extra
    input.

    Args:
      end_id: the Id that must follow the expression.  Id.Undefined_Tok
        disables the check.
    """
    # This calls back into self.ReadWord(lex_mode_e.Arith)
    parsed = self.a_parser.Parse()
    stop_id = self.a_parser.CurrentId()

    if end_id != Id.Undefined_Tok:
        if stop_id != end_id:
            p_die(
                'Unexpected token after arithmetic expression (%s != %s)' %
                (ui.PrettyId(stop_id), ui.PrettyId(end_id)),
                loc.Word(self.a_parser.cur_word))

    return parsed
1569
def _ReadArithSub(self, end_id=Id.Arith_RParen):
    # type: (Id_t) -> word_part.ArithSub
    """Read an arith substitution, which contains an arith expression, e.g.

      $((a + 1))

    Args:
      end_id: The Id that closes the expression: Id.Arith_RParen for
        $(( )), or Id.Arith_RBracket for the $[ ] form.

    Returns:
      word_part.ArithSub with the left token, the expression
      (arith_expr.EmptyZero if there is none), and the right token.
    """
    assert end_id in (Id.Arith_RParen, Id.Arith_RBracket)

    left_tok = self.cur_token

    # The second one needs to be disambiguated in stuff like:
    # $(echo $(( 1+2 )) )
    if end_id == Id.Arith_RParen:
        self.lexer.PushHint(Id.Op_RParen, Id.Right_DollarDParen)

    # NOTE: To disambiguate $(( as arith sub vs. command sub and subshell,
    # we could save the lexer/reader state here, and retry if the
    # arithmetic parse fails.  But we can almost always catch this at parse
    # time.  There could be some exceptions like:
    # $((echo * foo))  # looks like multiplication
    # $((echo / foo))  # looks like division

    # $(( )) is valid
    anode = arith_expr.EmptyZero  # type: arith_expr_t

    self._NextNonSpace()
    # Compare against the closer for THIS form.  The check used to be
    # hard-coded to Id.Arith_RParen, so with end_id=Id.Arith_RBracket a
    # stray ) was accepted as an empty substitution.
    if self.token_type != end_id:
        anode = self._ReadArithExpr(end_id)

    self._SetNext(lex_mode_e.ShCommand)

    if end_id == Id.Arith_RParen:
        # Ensure we get closing ) if we are looking for double ))
        # (In backwards compat mode, ] can also be the closing bracket, and
        # it would already be the current token, no need to skip further)
        self._GetToken()
        if self.token_type != Id.Right_DollarDParen:
            p_die('Expected second ) to end arith sub', self.cur_token)

    right_tok = self.cur_token
    return word_part.ArithSub(left_tok, anode, right_tok)
1611
def ReadDParen(self):
    # type: () -> Tuple[arith_expr_t, Token]
    """Read ((1+ 2)) in command context.

    The word parser does this because it's so close to _ReadArithExpr.

    Returns:
      (expression, terminating Id.Op_DRightParen token).  The token is
      returned for location info.  An empty (( )) yields
      arith_expr.EmptyZero.
    """
    self.lexer.PushHint(Id.Op_RParen, Id.Op_DRightParen)

    self._NextNonSpace()
    if self.token_type == Id.Arith_RParen:
        # (( )) is valid
        expr_node = arith_expr.EmptyZero  # type: arith_expr_t
    else:
        expr_node = self._ReadArithExpr(Id.Arith_RParen)

    self._SetNext(lex_mode_e.ShCommand)

    # The second ) must come next
    self._GetToken()
    closing = self.cur_token
    if closing.id != Id.Op_DRightParen:
        p_die('Expected second ) to end arith statement', closing)

    self._SetNext(lex_mode_e.ShCommand)

    return expr_node, closing
1642
def _NextNonSpace(self):
    # type: () -> None
    """Advance in lex_mode_e.Arith to the next token that isn't space.

    Same logic as _ReadWord, but used in
      $(( ))
      (( ))
      for (( ))

    Afterward, self.token_type can be read without calling _GetToken.
    """
    while True:
        self._SetNext(lex_mode_e.Arith)
        self._GetToken()
        if self.token_kind in (Kind.Ignored, Kind.WS):
            continue  # skip it
        return
1659
def ReadForExpression(self):
    # type: () -> command.ForExpr
    """Read ((i=0; i<5; ++i)) -- part of command context.

    Each of the three clauses may be empty.  An empty init or update
    becomes arith_expr.EmptyZero, and an empty condition becomes
    arith_expr.EmptyOne.

    Returns:
      A command.ForExpr with init, cond, and update set.
    """
    self._NextNonSpace()  # skip over ((
    cur_id = self.token_type  # for end of arith expressions

    if cur_id == Id.Arith_Semi:  # for (( ; i < 10; i++ ))
        init_node = arith_expr.EmptyZero  # type: arith_expr_t
    else:
        init_node = self.a_parser.Parse()
        # From here cur_id is the arith parser's stopping token
        cur_id = self.a_parser.CurrentId()
    self._NextNonSpace()

    # It's odd to keep track of both cur_id and self.token_type in this
    # function, but it works, and is tested in 'test/parse_error.sh
    # arith-integration'
    if cur_id != Id.Arith_Semi:  # for (( x=0 b; ... ))
        p_die("Expected ; here", loc.Word(self.a_parser.cur_word))

    self._GetToken()
    cur_id = self.token_type

    if cur_id == Id.Arith_Semi:  # for (( ; ; i++ ))
        # empty condition is TRUE
        cond_node = arith_expr.EmptyOne  # type: arith_expr_t
    else:
        cond_node = self.a_parser.Parse()
        cur_id = self.a_parser.CurrentId()

    if cur_id != Id.Arith_Semi:  # for (( x=0; x<5 b ))
        p_die("Expected ; here", loc.Word(self.a_parser.cur_word))

    self._NextNonSpace()
    if self.token_type == Id.Arith_RParen:  # for (( ; ; ))
        update_node = arith_expr.EmptyZero  # type: arith_expr_t
    else:
        update_node = self._ReadArithExpr(Id.Arith_RParen)

    # Advance once more and require another ) to end the header
    self._NextNonSpace()
    if self.token_type != Id.Arith_RParen:
        p_die('Expected ) to end for loop expression', self.cur_token)
    self._SetNext(lex_mode_e.ShCommand)

    # redirects is None, will be assigned in CommandEvaluator
    node = command.ForExpr.CreateNull()
    node.init = init_node
    node.cond = cond_node
    node.update = update_node
    return node
1709
def _ReadArrayLiteral(self):
    # type: () -> word_part_t
    """Read the right-hand side of a=(1 2 3)

    TODO: See osh/cmd_parse.py:164 for Id.Lit_ArrayLhsOpen, for a[x++]=1

    We want:

      A=(['x']=1 ["x"]=2 [$x$y]=3)

    Maybe allow this as a literal string?  Because I think I've seen it
    before?  Or maybe force people to patch to learn the rule.

      A=([x]=4)

    Starts with Lit_Other '[', and then it has Lit_ArrayLhsClose
    Maybe enforce that ALL have keys or NONE of have keys.

    Returns:
      word_part.InitializerLiteral with the ( token, one initializer word
      per word read, and the closing token.
    """
    self._SetNext(lex_mode_e.ShCommand)  # step over (
    self._GetToken()
    if self.cur_token.id != Id.Op_LParen:
        p_die('Expected ( after =', self.cur_token)
    open_tok = self.cur_token
    close_tok = None  # type: Token

    # A NEW word parser is required here; it shares our lexer.
    sub_parser = self.parse_ctx.MakeWordParser(self.lexer, self.line_reader)
    raw_words = []  # type: List[CompoundWord]
    finished = False
    while not finished:
        item = sub_parser.ReadWord(lex_mode_e.ShCommand)
        with tagswitch(item) as case:
            if case(word_e.Operator):
                op_tok = cast(Token, item)
                if op_tok.id == Id.Right_Initializer:
                    close_tok = op_tok
                    finished = True  # 'break' can't be used here
                # Embedded \n is allowed in arrays, unlike in commands
                elif op_tok.id == Id.Op_Newline:
                    continue
                else:
                    p_die('Unexpected token in array literal',
                          loc.Word(item))

            elif case(word_e.Compound):
                raw_words.append(cast(CompoundWord, item))

            else:
                raise AssertionError()

    initializer_words = []  # type: List[InitializerWord_t]
    for cw in raw_words:
        pair = word_.DetectAssocPair(cw)
        if pair is not None:
            # [key]=value: tilde detection mutates pair.value
            word_.TildeDetectAssign(pair.value)
            initializer_words.append(pair)
            continue

        # Plain element: try brace detection, then tilde detection.  Each
        # one returns None when it doesn't apply.
        braced = braces.BraceDetect(cw)  # type: word_t
        if braced is None:
            braced = cw
        tilded = word_.TildeDetect(braced)  # type: word_t
        if tilded is None:
            tilded = braced
        initializer_words.append(InitializerWord.ArrayWord(tilded))

    # invariant List?
    return word_part.InitializerLiteral(open_tok, initializer_words,
                                        close_tok)
1777
def ParseProcCallArgs(self, start_symbol):
    # type: (int) -> ArgList
    """Parse typed args to a proc call, e.g. json write (x)

    Args:
      start_symbol: passed through to parse_ctx.ParseProcCallArgs()

    Returns:
      ArgList whose 'left' is the token that was current on entry.
    """
    self.lexer.MaybeUnreadOne()

    call_args = ArgList.CreateNull(alloc_lists=True)
    call_args.left = self.cur_token
    self.parse_ctx.ParseProcCallArgs(self.lexer, call_args, start_symbol)
    return call_args
1787
def _MaybeReadWordPart(self, is_first, lex_mode, parts):
    # type: (bool, lex_mode_t, List[word_part_t]) -> bool
    """Helper for _ReadCompoundOrRedir3: handle one literal-like token.

    Appends the part(s) for the current token to 'parts'.  When the token
    is the first of a word, it may start a construct that must also END
    the word: a=(array literal), @splice, or @[expr splice].

    Args:
      is_first: whether the current token is the first part of the word
      lex_mode: the mode to continue lexing in after a construct that
        needs lookahead
      parts: list of word_part to append to

    Returns:
      True if the word is finished, i.e. one of the constructs above was
      read.  False otherwise.
    """
    done = False

    if self.token_type == Id.Lit_EscapedChar:
        tok = self.cur_token
        assert tok.length == 2
        ch = lexer.TokenSliceLeft(tok, 1)
        if self.parse_opts.no_parse_backslash():
            if not pyutil.IsValidCharEscape(ch):
                p_die('Invalid char escape in unquoted word (OILS-ERR-13)',
                      self.cur_token)

        part = word_part.EscapedLiteral(self.cur_token,
                                        ch)  # type: word_part_t
    else:
        part = self.cur_token

    if is_first and self.token_type == Id.Lit_VarLike:  # foo=
        parts.append(part)
        # Unfortunately it's awkward to pull the check for a=(1 2) up to
        # _ReadWord.
        next_id = self.lexer.LookPastSpace(lex_mode)
        if next_id == Id.Op_LParen:
            # Make the matching ) lex as Right_Initializer
            self.lexer.PushHint(Id.Op_RParen, Id.Right_Initializer)
            part2 = self._ReadArrayLiteral()
            parts.append(part2)

            # Array literal must be the last part of the word.
            self._SetNext(lex_mode)
            self._GetToken()
            # EOF, whitespace, newline, Right_Subshell
            if self.token_kind not in KINDS_THAT_END_WORDS:
                p_die('Unexpected token after array literal',
                      self.cur_token)
            done = True

    elif (is_first and self.parse_opts.parse_at() and
          self.token_type == Id.Lit_Splice):

        splice_tok = self.cur_token
        # The splice's name is the token text without its first character
        part2 = word_part.Splice(splice_tok,
                                 lexer.TokenSliceLeft(splice_tok, 1))

        parts.append(part2)

        # @words must be the last part of the word
        self._SetNext(lex_mode)
        self._GetToken()
        # EOF, whitespace, newline, Right_Subshell
        if self.token_kind not in KINDS_THAT_END_WORDS:
            p_die('Unexpected token after array splice', self.cur_token)
        done = True

    elif (is_first and self.parse_opts.parse_at() and
          self.token_type == Id.Lit_AtLBracket):  # @[split(x)]
        part2 = self._ReadExprSub(lex_mode_e.DQ)
        parts.append(part2)

        # @[split(x)] must be the last part of the word
        self._SetNext(lex_mode)
        self._GetToken()
        # EOF, whitespace, newline, Right_Subshell
        if self.token_kind not in KINDS_THAT_END_WORDS:
            p_die('Unexpected token after Expr splice', self.cur_token)
        done = True

    elif (is_first and self.parse_opts.parse_at() and
          self.token_type == Id.Lit_AtLBraceDot):
        p_die('TODO: @{.myproc builtin sub}', self.cur_token)

    elif (is_first and self.parse_opts.parse_at_all() and
          self.token_type == Id.Lit_At):
        # Because $[x] ${x} and perhaps $/x/ are reserved, it makes sense
        # for @ at the beginning of a word to be reserved.

        # Although should we relax 'echo @' ?  I'm tempted to have a
        # shortcut for @_argv and
        p_die('Literal @ starting a word must be quoted (parse_at_all)',
              self.cur_token)

    else:
        # not a literal with lookahead; append it
        parts.append(part)

    return done
1875
def _ReadCompoundWord(self, lex_mode):
    # type: (lex_mode_t) -> CompoundWord
    """Read a word in a mode other than ShCommand; it must be Compound."""

    # Only lex_mode_e.ShCommand can produce word.Redir, so it's excluded
    assert lex_mode != lex_mode_e.ShCommand, lex_mode

    result = self._ReadCompoundOrRedir(lex_mode)
    assert result.tag() == word_e.Compound, result
    return cast(CompoundWord, result)
1885
def _ReadCompoundWord3(self, lex_mode, eof_type, empty_ok):
    # type: (lex_mode_t, Id_t, bool) -> CompoundWord
    """Like _ReadCompoundWord, with eof_type and empty_ok passed through.

    Both are forwarded unchanged to _ReadCompoundOrRedir3.
    """

    # Only lex_mode_e.ShCommand can produce word.Redir, so it's excluded
    assert lex_mode != lex_mode_e.ShCommand, lex_mode

    result = self._ReadCompoundOrRedir3(lex_mode, eof_type, empty_ok)
    assert result.tag() == word_e.Compound, result
    return cast(CompoundWord, result)
1895
def _ReadCompoundOrRedir(self, lex_mode):
    # type: (lex_mode_t) -> word_t
    """Read either a word.Compound or a word.Redir.

    Shorthand for _ReadCompoundOrRedir3 with eof_type=Id.Undefined_Tok
    and empty_ok=True.
    """
    no_eof_type = Id.Undefined_Tok
    return self._ReadCompoundOrRedir3(lex_mode, no_eof_type, True)
1900
def _ReadCompoundOrRedir3(self, lex_mode, eof_type, empty_ok):
    # type: (lex_mode_t, Id_t, bool) -> word_t
    """Read word parts in a loop until a token that ends the word is seen.

    Precondition: Looking at the first token of the first word part
    Postcondition: Looking at the token after, e.g. space or operator

    NOTE: eof_type is necessary because / is a literal, i.e. Lit_Slash, but it
    could be an operator delimiting a compound word.  Can we change lexer modes
    and remove this special case?

    Args:
      lex_mode: lexer mode handed to _SetNext() after each part and to
        _MaybeReadWordPart().
      eof_type: token Id that ends the word, when ending is allowed.
      empty_ok: if True, eof_type may end the word before any part was read;
        otherwise it is only honored once num_parts != 0.

    Returns either word.Compound or word.Redir
    """
    w = CompoundWord([])
    num_parts = 0  # incremented once per loop iteration that isn't the last
    brace_count = 0  # +1 for Lit_LBrace, -1 for Lit_RBrace; checked after loop
    done = False
    is_triple_quoted = None  # type: Optional[BoolParamBox]
    # Set when a Lit_Number or Lit_RedirVarName token is seen; together with
    # num_parts == 1 it lets the Kind.Redir branch build a word.Redir.
    saw_redir_left_tok = False

    while not done:
        self._GetToken()

        allow_done = empty_ok or num_parts != 0
        if allow_done and self.token_type == eof_type:
            done = True  # e.g. for ${foo//pat/replace}

        # Keywords like "for" are treated like literals
        elif self.token_kind in (Kind.Lit, Kind.History, Kind.KW,
                                 Kind.ControlFlow, Kind.BoolUnary,
                                 Kind.BoolBinary):

            # Syntax error for { and }
            if self.token_type == Id.Lit_LBrace:
                brace_count += 1
            elif self.token_type == Id.Lit_RBrace:
                brace_count -= 1
            elif self.token_type == Id.Lit_Dollar:
                if self.parse_opts.no_parse_dollar():
                    if num_parts == 0 and lex_mode == lex_mode_e.ShCommand:
                        # The lookahead result is currently unused; see TODO.
                        next_byte = self.lexer.ByteLookAhead()
                        # TODO: switch lexer modes and parse $/d+/. But not ${a:-$/d+/}
                        if next_byte == '/':
                            #log('next_byte %r', next_byte)
                            pass

                    # Under no_parse_dollar, a literal $ is always an error.
                    p_die(
                        'Literal $ should be quoted like \$ (no_parse_dollar)',
                        self.cur_token)
            elif self.token_type in (Id.Lit_Number, Id.Lit_RedirVarName):
                saw_redir_left_tok = True

            # The helper receives w.parts and tells us whether the word is
            # done.
            done = self._MaybeReadWordPart(num_parts == 0, lex_mode,
                                           w.parts)

        elif self.token_kind == Kind.VSub:
            vsub_token = self.cur_token

            # Wrap the current token in a SimpleVarSub part.
            part = SimpleVarSub(vsub_token)  # type: word_part_t
            w.parts.append(part)

        elif self.token_kind == Kind.ExtGlob:
            # If parse_at, we can take over @( to start @(seq 3)
            # Users can also use look at ,(.py\|.sh)
            if (self.parse_opts.parse_at() and
                    self.token_type == Id.ExtGlob_At and num_parts == 0):
                cs_part = self._ReadCommandSub(Id.Left_AtParen,
                                               d_quoted=False)
                # RARE mutation of tok.id!
                cs_part.left_token.id = Id.Left_AtParen
                part = cs_part  # for type safety

                # Same check as _MaybeReadWordPart. @(seq 3)x is illegal, just like
                # a=(one two)x and @arrayfunc(3)x.
                self._GetToken()
                if self.token_kind not in KINDS_THAT_END_WORDS:
                    p_die('Unexpected token after @()', self.cur_token)
                done = True

            else:
                if HAVE_FNM_EXTMATCH == 0:
                    p_die(
                        "Extended glob won't work without FNM_EXTMATCH support in libc",
                        self.cur_token)
                part = self._ReadExtGlob()
            # Both the @() command sub and the extended glob are appended.
            w.parts.append(part)

        elif self.token_kind == Kind.BashRegex:
            if self.token_type == Id.BashRegex_LParen:  # Opening (
                part = self._ReadBashRegexGroup()
                w.parts.append(part)
            else:
                assert self.token_type == Id.BashRegex_AllowedInParens
                p_die('Invalid token in bash regex', self.cur_token)

        elif self.token_kind == Kind.Left:
            # Triple quotes are only tried for the first part of a word in
            # ShCommand mode, when parse_triple_quote is on.
            try_triple_quote = (self.parse_opts.parse_triple_quote() and
                                lex_mode == lex_mode_e.ShCommand and
                                num_parts == 0)

            # Save allocation
            if try_triple_quote:
                is_triple_quoted = BoolParamBox(False)

            part = self._ReadUnquotedLeftParts(is_triple_quoted)
            w.parts.append(part)

            # NOT done yet, will advance below

        elif self.token_kind == Kind.Right:
            # Still part of the word; will be done on the next iter.
            if self.token_type == Id.Right_DoubleQuote:
                pass
            # Never happens, no PushHint for this case.
            #elif self.token_type == Id.Right_DollarParen:
            #  pass
            elif self.token_type == Id.Right_Subshell:
                # LEXER HACK for (case x in x) ;; esac )
                # Rewind before it's used
                assert self.next_lex_mode == lex_mode_e.Undefined
                if self.lexer.MaybeUnreadOne():
                    self.lexer.PushHint(Id.Op_RParen, Id.Right_Subshell)
                    self._SetNext(lex_mode)
                done = True
            else:
                done = True

        elif self.token_kind == Kind.Redir:
            # Check if the previous token was a possible left_tok to a
            # redirect operator, attach it to the word.Redir. And return
            # it instead of the CompoundWord.

            # &> and &>> don't have a leading descriptor (2 is implied)
            if (saw_redir_left_tok and num_parts == 1 and self.token_type
                    not in (Id.Redir_AndGreat, Id.Redir_AndDGreat)):

                self._SetNext(lex_mode)
                # The single part read so far becomes the redirect's left
                # token.
                left_tok = cast(Token, w.parts.pop())
                r = word.Redir(left_tok, self.cur_token)
                return r  # EARLY RETURN

            done = True

        elif self.token_kind == Kind.Ignored:
            done = True

        else:
            # LEXER HACK for unbalanced case clause. 'case foo in esac' is valid,
            # so to test for ESAC, we can read ) before getting a chance to
            # PushHint(Id.Op_RParen, Id.Right_CasePat). So here we unread one
            # token and do it again.

            # We get Id.Op_RParen at top level: case x in x) ;; esac
            # We get Id.Eof_RParen inside ComSub: $(case x in x) ;; esac )
            if self.token_type in (Id.Op_RParen, Id.Eof_RParen):
                # Rewind before it's used
                assert self.next_lex_mode == lex_mode_e.Undefined
                if self.lexer.MaybeUnreadOne():
                    if self.token_type == Id.Eof_RParen:
                        # Redo translation
                        self.lexer.PushHint(Id.Op_RParen, Id.Eof_RParen)
                    self._SetNext(lex_mode)

            done = True  # anything we don't recognize means we're done

        if not done:
            # Advance the lexer and count the part just handled.
            self._SetNext(lex_mode)
            num_parts += 1

    if (self.parse_opts.parse_brace() and num_parts > 1 and
            brace_count != 0):
        # accept { and }, but not foo{
        p_die(
            'Word has unbalanced { }. Maybe add a space or quote it like \{',
            loc.Word(w))

    if is_triple_quoted and is_triple_quoted.b and num_parts > 1:
        p_die('Unexpected parts after triple quoted string',
              loc.WordPart(w.parts[-1]))

    # Disabled debugging aid: histogram of part-tag sequences per word.
    if 0:
        from _devbuild.gen.syntax_asdl import word_part_str
        word_key = ' '.join(word_part_str(p.tag()) for p in w.parts)
        WORD_HIST[word_key] += 1

    # YSH word restriction
    # (r'' u'' b'' are stripped on shopt -s parse_ysh_string)
    # NOTE(review): `part` is whichever non-literal part was assigned last in
    # the loop, not necessarily the offending one.  Presumably
    # _IsValidYshWord() only rejects words that contain a quoted part, so
    # `part` is bound whenever this fires -- TODO confirm.
    if self.parse_opts.no_parse_word_join() and not _IsValidYshWord(w):
        p_die("Invalid quoted word part in YSH (OILS-ERR-17)",
              loc.WordPart(part))

    return w
2091
def _ReadArithWord(self):
    # type: () -> Optional[word_t]
    """Read one token or compound word in arithmetic mode.

    Helper for ReadArithWord().  Returns None when an ignored token was
    consumed, so that the caller tries again.
    """
    self._GetToken()
    kind = self.token_kind

    if kind == Kind.Eof:
        return self.cur_token

    if kind == Kind.Ignored:
        # Space should be ignored.
        self._SetNext(lex_mode_e.Arith)
        return None

    if kind in (Kind.Arith, Kind.Right):
        # Id.Right_DollarDParen IS just a normal token, handled by ArithParser
        self._SetNext(lex_mode_e.Arith)
        return self.cur_token

    if kind in (Kind.Lit, Kind.Left, Kind.VSub):
        return self._ReadCompoundWord(lex_mode_e.Arith)

    if kind != Kind.Unknown:
        raise AssertionError(self.cur_token)

    # e.g. happened during dynamic parsing of unset 'a[$foo]' in gherkin
    p_die(
        'Unexpected token while parsing arithmetic: %r' %
        lexer.TokenVal(self.cur_token), self.cur_token)
2121
def _ReadWord(self, word_mode):
    # type: (lex_mode_t) -> Optional[word_t]
    """Helper function for ReadWord().

    Reads one token and dispatches on its kind.

    Args:
      word_mode: lexer mode requested by the caller.  The pseudo mode
        ShCommandFakeBrack is lexed as ShCommand, but additionally allows
        [ to be turned into Id.Op_LBracket (when parse_bracket is on).

    Returns:
      The current token, a word.Redir, or the result of
      _ReadYshSingleQuoted() / _ReadCompoundOrRedir().  None means nothing
      was produced (whitespace, a comment, a skipped newline) and ReadWord()
      should call again.
    """

    # Change the pseudo lexer mode to a real lexer mode
    if word_mode == lex_mode_e.ShCommandFakeBrack:
        lex_mode = lex_mode_e.ShCommand
    else:
        lex_mode = word_mode

    self._GetToken()

    if self.token_kind == Kind.Eof:
        # No advance
        return self.cur_token

    elif self.token_kind == Kind.Redir:
        self._SetNext(lex_mode)
        # This is >out -- 3>out is handled below
        return word.Redir(None, self.cur_token)

    # Allow Arith for ) at end of for loop?
    elif self.token_kind in (Kind.Op, Kind.Arith):
        self._SetNext(lex_mode)

        # Newlines are complicated. See 3x2 matrix in the comment about
        # self.multiline and self.newline_state above.
        if self.token_type == Id.Op_Newline:
            if self.multiline:
                if self.newline_state > 1:
                    # This points at a blank line, but at least it gives the line number
                    p_die('Invalid blank line in multiline mode',
                          self.cur_token)
                # In multiline mode, newlines are never returned.
                return None

            if self.returned_newline:  # skip
                return None

        return self.cur_token

    elif self.token_kind == Kind.Right:
        # Only these four Right tokens are expected here.
        if self.token_type not in (Id.Right_Subshell, Id.Right_ShFunction,
                                   Id.Right_CasePat, Id.Right_Initializer):
            raise AssertionError(self.cur_token)

        self._SetNext(lex_mode)
        return self.cur_token

    elif self.token_kind in (Kind.Ignored, Kind.WS):
        self._SetNext(lex_mode)
        return None

    else:
        assert self.token_kind in (Kind.VSub, Kind.Lit, Kind.History,
                                   Kind.Left, Kind.KW, Kind.ControlFlow,
                                   Kind.BoolUnary, Kind.BoolBinary,
                                   Kind.ExtGlob,
                                   Kind.BashRegex), 'Unhandled token kind'

        if (word_mode == lex_mode_e.ShCommandFakeBrack and
                self.parse_opts.parse_bracket() and
                self.token_type == Id.Lit_LBracket):
            # Change [ from Kind.Lit -> Kind.Op
            # So CommandParser can treat
            #   assert [42 === x]
            # like
            #   json write (x)
            bracket_word = self.cur_token
            bracket_word.id = Id.Op_LBracket  # mutates the token in place

            self._SetNext(lex_mode)
            return bracket_word

        # We're beginning a word. If we see Id.Lit_Pound, change to
        # lex_mode_e.Comment and read until end of line.
        if self.token_type == Id.Lit_Pound:
            self._SetNext(lex_mode_e.Comment)
            self._GetToken()

            # NOTE: The # could be the last character in the file. It can't be
            # Eof_{RParen,Backtick} because #) and #` are comments.
            assert self.token_type in (Id.Ignored_Comment, Id.Eof_Real), \
                    self.cur_token

            # The next iteration will go into Kind.Ignored and set lex state to
            # lex_mode_e.ShCommand/etc.
            return None  # tell ReadWord() to try again after comment

        elif self.token_type == Id.Lit_TPound:  # ### doc comment
            self._SetNext(lex_mode_e.Comment)
            self._GetToken()

            # The comment token is only surfaced when emit_doc_token is set.
            if self.token_type == Id.Ignored_Comment and self.emit_doc_token:
                return self.cur_token

            return None  # tell ReadWord() to try again after comment

        else:
            # r'' u'' b'' at the beginning of a word
            if (self.token_type == Id.Lit_Chars and
                    self.lexer.LookAheadOne(
                        lex_mode_e.ShCommand) == Id.Left_SingleQuote):

                # When shopt -s parse_ysh_string:
                #   echo r'hi' is like echo 'hi'
                #
                #   echo u'\u{3bc}' b'\yff' works

                tok = self.cur_token
                if self.parse_opts.parse_ysh_string():
                    if lexer.TokenEquals(tok, 'r'):
                        left_id = Id.Left_RSingleQuote
                    elif lexer.TokenEquals(tok, 'u'):
                        left_id = Id.Left_USingleQuote
                    elif lexer.TokenEquals(tok, 'b'):
                        left_id = Id.Left_BSingleQuote
                    else:
                        # Some other literal before the quote: no prefix.
                        left_id = Id.Undefined_Tok

                    if left_id != Id.Undefined_Tok:
                        # skip the r, and then 'foo' will be read as normal
                        self._SetNext(lex_mode_e.ShCommand)

                        self._GetToken()
                        assert self.token_type == Id.Left_SingleQuote, self.token_type

                        # Read the word in a different lexer mode
                        return self._ReadYshSingleQuoted(left_id)

            # Everything else is an ordinary compound word, or a redirect
            # with a left token.
            return self._ReadCompoundOrRedir(lex_mode)
2252
def ParseVarRef(self):
    # type: () -> BracedVarSub
    """Parse, at runtime, the text that ${!ref} refers to.

      # Same as VarOf production
      VarRefExpr = VarOf EOF
    """
    self._SetNext(lex_mode_e.VSub_1)
    self._GetToken()
    if self.token_kind != Kind.VSub:
        p_die('Expected var name', self.cur_token)

    var_sub = self._ParseVarOf()

    # NOTE: there is no surrounding ${ }, so the name token stands in for
    # both delimiters (cheat to make test pass).
    name_tok = var_sub.name_tok
    var_sub.left = name_tok
    var_sub.right = name_tok

    # Nothing may follow the VarOf production.
    self._GetToken()
    if self.token_type != Id.Eof_Real:
        p_die('Expected end of var ref expression', self.cur_token)
    return var_sub
2275
def LookPastSpace(self):
    # type: () -> Id_t
    """Return the Id of the next token that isn't a space.

    If the current token is not WS_Space, its own Id is returned; otherwise
    the lexer looks past the space in ShCommand mode.

    For the CommandParser to recognize
      array= (1 2 3)
      YSH for (     versus  bash for ((
      YSH if (      versus  if test
      YSH while (   versus  while test
      YSH bare assignment 'grep ='  versus 'grep foo'
    """
    assert self.token_type != Id.Undefined_Tok

    cur_id = self.cur_token.id
    if cur_id != Id.WS_Space:
        return cur_id
    return self.lexer.LookPastSpace(lex_mode_e.ShCommand)
2293
def LookAheadDParens(self, shift_back=0):
    # type: (int) -> bool
    """Check that (( ... )) really is an arithmetic expression.

    Special lookahead: the closing parens must be a single token, not
    separated by anything.  Must be called while looking at Op_DLeftParen or
    Left_DollarDParen; the work is delegated to the lexer.
    """
    tok_id = self.token_type
    assert tok_id == Id.Op_DLeftParen or tok_id == Id.Left_DollarDParen

    found = self.lexer.LookAheadDParens(shift_back)
    return found
2303
def LookAheadFuncParens(self):
    # type: () -> bool
    """Special lookahead for f( ) { echo hi; } to check for ( ).

    Returns False right away unless the current token is ( or a space.
    """
    assert self.token_type != Id.Undefined_Tok

    # We have to handle 2 cases because we buffer a token
    cur_id = self.cur_token.id
    if cur_id == Id.Op_LParen:  # saw funcname(
        unread = 1  # go back one char
    elif cur_id == Id.WS_Space:  # saw funcname WHITESPACE
        unread = 0
    else:
        return False

    return self.lexer.LookAheadFuncParens(unread)
2318
def ReadWord(self, word_mode):
    # type: (lex_mode_t) -> word_t
    """Return the next word, lexing in the given mode.

    Stateful wrapper around _ReadWord(): a buffered word is handed back
    first; otherwise _ReadWord() is retried until it yields a word.  Also
    records in self.returned_newline whether the result is a newline.
    """
    assert word_mode in (lex_mode_e.ShCommand,
                         lex_mode_e.ShCommandFakeBrack,
                         lex_mode_e.DBracket, lex_mode_e.BashRegex)

    if self.buffered_word:  # For integration with pgen2
        w = self.buffered_word
        self.buffered_word = None
    else:
        # None means "nothing yet" (space, comment, ...), so keep reading.
        w = self._ReadWord(word_mode)
        while w is None:
            w = self._ReadWord(word_mode)

    is_newline = word_.CommandId(w) == Id.Op_Newline
    self.returned_newline = is_newline
    return w
2340
def ReadArithWord(self):
    # type: () -> word_t
    """Return the next arithmetic word, retrying while the helper yields None."""
    w = self._ReadArithWord()
    while w is None:
        w = self._ReadArithWord()
    return w
2348
def ReadHereDocBody(self, parts):
    # type: (List[word_part_t]) -> None
    """Read the body of a here doc.

    A here doc is like a double quoted context, except " and \" aren't special.

    Args:
      parts: list handed to _ReadLikeDQ(); presumably the word parts that
        are read get appended to it -- TODO confirm against _ReadLikeDQ().
    """
    # Delegates to _ReadLikeDQ() with is_here_doc=True.
    self._ReadLikeDQ(None, False, parts, is_here_doc=True)
    # Returns nothing
2356
def ReadForPlugin(self):
    # type: () -> CompoundWord
    """Read a word for $PS1, $PS4, etc.

    This is just like reading a here doc line. "\n" is allowed, as
    well as the typical substitutions ${x} $(echo hi) $((1 + 2)).

    Returns a new CompoundWord whose parts list was handed to _ReadLikeDQ().
    """
    plugin_word = CompoundWord([])
    self._ReadLikeDQ(None, False, plugin_word.parts)
    return plugin_word
2367
def EmitDocToken(self, b):
    # type: (bool) -> None
    """Set self.emit_doc_token.

    When it is true, _ReadWord() returns the Ignored_Comment token that
    follows a ### doc comment marker instead of skipping it.
    """
    self.emit_doc_token = b
2371
def Multiline(self, b):
    # type: (bool) -> None
    """Set self.multiline.

    In multiline mode _ReadWord() does not return Op_Newline tokens, and
    dies with 'Invalid blank line in multiline mode' when
    self.newline_state > 1.
    """
    self.multiline = b
2375
2376
if 0:
    # Disabled debugging aid: a histogram keyed by the space-joined part tags
    # of each word.  It is filled in by the matching `if 0:` block at the end
    # of _ReadCompoundOrRedir3(); enable both blocks together.
    import collections
    WORD_HIST = collections.Counter()
2380
2381	# vim: sw=4