# Copyright 2016 Andy Chu. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
"""
word_parse.py - Parse the shell word language.

Hairy example:

    hi$((1 + 2))"$(echo hi)"${var:-__"$(echo default)"__}

Substitutions can be nested, but which inner subs are allowed depends on the
outer sub.  Notes:

lex_mode_e.ShCommand (_ReadUnquotedLeftParts)
  All subs and quotes are allowed:
  $v ${v}   $() ``   $(())   '' ""   $'' $""   <() >()

lex_mode_e.DQ (_ReadDoubleQuotedLeftParts)
  Var, Command, Arith, but no quotes.
  $v ${v}   $() ``   $(())
  No process substitution.

lex_mode_e.Arith
  Similar to DQ: Var, Command, and Arith sub, but no process sub.  bash
  doesn't allow quotes, but OSH does.  We allow ALL FOUR kinds of quotes,
  because we need those for associative array indexing.

lex_mode_e.VSub_ArgUnquoted
  Like ShCommand, everything is allowed (even process substitutions), but we
  stop at }, and space is SIGNIFICANT.

  Example: ${a:- b }

  ${X:-$v}  ${X:-${v}}  ${X:-$(echo hi)}  ${X:-`echo hi`}  ${X:-$((1+2))}
  ${X:-'single'}  ${X:-"double"}  ${X:-$'\n'}  ${X:-<(echo hi)}

lex_mode_e.VSub_ArgDQ
  In contrast to DQ, VS_ARG_DQ accepts nested "" and $'' and $"", e.g.
  "${x:-"default"}".

  In contrast, VSub_ArgUnquoted respects single quotes and process
  substitution.

  It's weird that double quotes are allowed.  Space is also significant here,
  e.g. "${x:-a "b"}".
"""

from _devbuild.gen import grammar_nt
from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str, Kind
from _devbuild.gen.types_asdl import (lex_mode_t, lex_mode_e)
from _devbuild.gen.syntax_asdl import (
    ExprSub,
    BoolParamBox,
    Token,
    SimpleVarSub,
    loc,
    source,
    word,
    DoubleQuoted,
    SingleQuoted,
    BracedVarSub,
    CommandSub,
    InitializerWord,
    InitializerWord_t,
    bracket_op,
    bracket_op_t,
    suffix_op,
    suffix_op_t,
    rhs_word,
    rhs_word_e,
    rhs_word_t,
    word_e,
    word_t,
    CompoundWord,
    word_part,
    word_part_t,
    y_lhs_e,
    arith_expr_t,
    command,
    expr,
    expr_e,
    expr_t,
    pat_t,
    ArgList,
    Proc,
    Func,
    Subscript,
    Attribute,
    arith_expr,
    VarDecl,
    Mutation,
    word_part_e,
)
from core import alloc
from core.error import p_die
from mycpp.mylib import log
from core import pyutil
from display import ui
from frontend import consts
from frontend import lexer
from frontend import reader
from osh import tdop
from osh import arith_parse
from osh import braces
from osh import word_
from osh import word_compile
from mycpp.mylib import tagswitch

from libc import HAVE_FNM_EXTMATCH

from typing import List, Optional, Tuple, cast
from typing import TYPE_CHECKING
if TYPE_CHECKING:
    from frontend.lexer import Lexer
    from frontend.parse_lib import ParseContext
    from frontend.reader import _Reader
    from osh.cmd_parse import VarChecker

unused1 = log
unused2 = Id_str

KINDS_THAT_END_WORDS = [Kind.Eof, Kind.WS, Kind.Op, Kind.Right]
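# e.g. a word ends at whitespace or a newline (Kind.WS), an operator like ;
# or | (Kind.Op), a closing token like ) (Kind.Right), or end of input
# (Kind.Eof)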


def _IsValidYshWord(w):
    # type: (CompoundWord) -> bool
    """YSH word restriction

    Allowed:
      'foo'  r'foo'       --flag r'foo'
      --flag='foo'
      --flag="foo"
    Not allowed:
      --flag=r'bar'  NAME=u'value'   # ambiguous
      --flag=b''' multi '''
    """
    parts = w.parts
    n = len(parts)

    if n != 0 and word_.LiteralId(parts[0]) == Id.Lit_Tilde:
        # ~bob/src/'dir with spaces' is allowed
        # ~bob/src/u'dir with spaces' is ambiguous, but allowed for simplicity
        return True  # early return

    ok = True
    if n >= 2:
        # spec/ysh-TODO-deprecate - allow ''/usr/* workaround!
        # note: ""/usr/* not allowed
        part0 = parts[0]
        if part0.tag() == word_part_e.SingleQuoted:
            sq = cast(SingleQuoted, part0)
            # Make sure $''' is still disallowed
            if (sq.left.id == Id.Left_SingleQuote and len(sq.sval) == 0):
                return True

    for part in parts:
        if part.tag() in (word_part_e.SingleQuoted,
                          word_part_e.DoubleQuoted):
            ok = False

    # Allow special cases:
    #   --flag='val'  NAME='bar'
    # But NOT
    #   --flag=r'val'  NAME=r'val'
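    # (From the checks below: the n == 2 case is the NAME='bar' shape, since
    # Lit_VarLike is a NAME= token, and the n == 3 case is the --flag='val'
    # shape: a chars part, an = part, then the quoted part.)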
    if not ok:
        if (n == 2 and word_.LiteralId(parts[0]) == Id.Lit_VarLike):
            ok = True
        elif (n == 3 and word_.LiteralId(parts[0]) == Id.Lit_Chars and
              word_.LiteralId(parts[1]) == Id.Lit_Equals):
            ok = True

    return ok


class WordEmitter(object):
    """Common interface for [ and [["""

    def __init__(self):
        # type: () -> None
        """Empty constructor for mycpp."""
        pass

    def ReadWord(self, lex_mode):
        # type: (lex_mode_t) -> word_t
        raise NotImplementedError()


class WordParser(WordEmitter):

    def __init__(self, parse_ctx, lexer, line_reader):
        # type: (ParseContext, Lexer, _Reader) -> None
        self.parse_ctx = parse_ctx
        self.lexer = lexer
        self.line_reader = line_reader
        self.arena = line_reader.arena

        self.parse_opts = parse_ctx.parse_opts
        self.a_parser = tdop.TdopParser(arith_parse.Spec(), self,
                                        self.parse_opts)
        self.Reset()

    def Init(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Used to parse arithmetic, see ParseContext."""
        self.next_lex_mode = lex_mode

    def Reset(self):
        # type: () -> None
        """Called by interactive loop."""
        # For _GetToken()
        self.cur_token = None  # type: Token
        self.token_kind = Kind.Undefined
        self.token_type = Id.Undefined_Tok

        self.next_lex_mode = lex_mode_e.ShCommand

        # Boolean mutated by CommandParser via word_.ctx_EmitDocToken.  For
        # ### doc comments
        self.emit_doc_token = False
        # Boolean mutated by CommandParser via word_.ctx_Multiline.  '...'
        # starts multiline mode.
        self.multiline = False

        # For detecting invalid \n\n in multiline mode.  Counts what we got
        # directly from the lexer.
        self.newline_state = 0
        # For consolidating \n\n -> \n for the CALLER.  This simplifies the
        # parsers that consume words.
        self.returned_newline = False

        # For integration with pgen2
        self.buffered_word = None  # type: word_t

    def _GetToken(self):
        # type: () -> None
        """Call this when you need to make a decision based on any of:

        self.token_type
        self.token_kind
        self.cur_token
        """
        if self.next_lex_mode == lex_mode_e.Undefined:
            return  # _SetNext() not called, so do nothing

        is_fake = self.next_lex_mode == lex_mode_e.BashRegexFakeInner
        real_mode = (lex_mode_e.BashRegex if is_fake else self.next_lex_mode)

        self.cur_token = self.lexer.Read(real_mode)

        # MUTATE TOKEN for fake lexer mode.
        # This is for crazy stuff bash allows, like [[ s =~ (< >) ]]
        if (is_fake and self.cur_token.id
                in (Id.WS_Space, Id.BashRegex_AllowedInParens)):
            self.cur_token.id = Id.Lit_Chars

        self.token_type = self.cur_token.id
        self.token_kind = consts.GetKind(self.token_type)

        # number of consecutive newlines, ignoring whitespace
        if self.token_type == Id.Op_Newline:
            self.newline_state += 1
        elif self.token_kind != Kind.WS:
            self.newline_state = 0

        self.parse_ctx.trail.AppendToken(self.cur_token)  # For completion
        self.next_lex_mode = lex_mode_e.Undefined

    def _SetNext(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Set the next lex state, but don't actually read a token.

        We need this for proper interactive parsing.
        """
        self.next_lex_mode = lex_mode
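
        # A sketch of the two-step pattern used throughout this file:
        #
        #   self._SetNext(lex_mode_e.VSub_1)  # 1. decide the next lex mode
        #   self._GetToken()                  # 2. actually read the token
        #
        # Deferring the read is what lets the interactive loop stop at a line
        # boundary without consuming a token that isn't available yet.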

    def _ReadVarOpArg(self, arg_lex_mode):
        # type: (lex_mode_t) -> rhs_word_t

        # NOTE: Operators like | and < are not treated as special, so
        # ${a:- | >} is valid, even when unquoted.
        self._SetNext(arg_lex_mode)
        self._GetToken()

        w = self._ReadVarOpArg2(arg_lex_mode, Id.Undefined_Tok,
                                True)  # empty_ok

        # If the Compound has no parts, and we're in a double-quoted VarSub
        # arg, and empty_ok, then return Empty.  This is so it can evaluate to
        # the empty string and not get elided.
        #
        # Examples:
        # - "${s:-}", "${s/%pat/}"
        # It's similar to LooksLikeShAssignment where we turn x= into x=''.
        # And it has the same potential problem of not having Token location
        # info.
        #
        # NOTE: empty_ok is False only for the PatSub pattern, which means
        # we'll return a Compound with no parts, which is explicitly checked
        # with a custom error message.
        if len(w.parts) == 0 and arg_lex_mode == lex_mode_e.VSub_ArgDQ:
            return rhs_word.Empty

        return w

    def _ReadVarOpArg2(self, arg_lex_mode, eof_type, empty_ok):
        # type: (lex_mode_t, Id_t, bool) -> CompoundWord
        """Helper function for _ReadVarOpArg and _ReadPatSubVarOp"""
        w = self._ReadCompoundWord3(arg_lex_mode, eof_type, empty_ok)
        tilde = word_.TildeDetect(w)
        if tilde:
            w = tilde
        return w

    def _ReadSliceVarOp(self):
        # type: () -> suffix_op.Slice
        """
        Looking at token after first ':'

        ArithExpr? (':' ArithExpr? )? '}'
        """
        self._NextNonSpace()

        cur_id = self.token_type

        if cur_id in (Id.Arith_RBrace, Id.Arith_Colon):  # ${a:} or ${a::}
            begin = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            begin = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()  # advance

        if cur_id == Id.Arith_RBrace:  # ${a:1} or ${@:1}
            # No length specified, so it's N
            no_length = None  # type: Optional[arith_expr_t]
            return suffix_op.Slice(begin, no_length)

        elif cur_id == Id.Arith_Colon:  # ${a:1:} or ${@:1:}
            colon_tok = self.cur_token
            self._NextNonSpace()

            if self.token_type == Id.Arith_RBrace:
                # quirky bash behavior:
                # ${a:1:} or ${a::} means length ZERO
                # but ${a:1} or ${a:} means length N
                if self.parse_opts.strict_parse_slice():
                    p_die(
                        "Slice length: Add explicit zero, or omit : for N (strict_parse_slice)",
                        colon_tok)

                length = arith_expr.EmptyZero  # type: arith_expr_t
            else:
                length = self._ReadArithExpr(Id.Arith_RBrace)

            return suffix_op.Slice(begin, length)

        else:
            p_die("Expected : or } in slice", self.cur_token)

        raise AssertionError()  # for MyPy

    def _ReadPatSubVarOp(self):
        # type: () -> suffix_op.PatSub
        """Looking at the first '/' after VarOf:

        VarSub = ...
               | VarOf '/' Match ( '/' WORD? )?
        Match  = '/' WORD   # can't be empty
               | '#' WORD?  # may be empty
               | '%' WORD?
        """
        slash_tok = self.cur_token  # location info
        replace_mode = Id.Undefined_Tok  # bizarre syntax / # %

        self._SetNext(lex_mode_e.VSub_ArgUnquoted)  # advance past /

        self._GetToken()
        if self.token_type == Id.Right_DollarBrace:
            pat = CompoundWord([])
            return suffix_op.PatSub(pat, rhs_word.Empty, replace_mode,
                                    slash_tok)

        if self.token_type in (Id.Lit_Slash, Id.Lit_Pound, Id.Lit_Percent):
            replace_mode = self.token_type
            self._SetNext(lex_mode_e.VSub_ArgUnquoted)

        # Bash quirk:
        # echo ${x/#/replace} has an empty pattern
        # echo ${x////replace} is non-empty; it means echo ${x//'/'/replace}
        empty_ok = replace_mode != Id.Lit_Slash
        pat = self._ReadVarOpArg2(lex_mode_e.VSub_ArgUnquoted, Id.Lit_Slash,
                                  empty_ok)
        #log('pat 1 %r', pat)

        if self.token_type == Id.Lit_Slash:
            # read until }
            replace = self._ReadVarOpArg(
                lex_mode_e.VSub_ArgUnquoted)  # type: rhs_word_t
            #log('r 1 %r', replace)
        else:
            # e.g. ${v/a} is the same as ${v/a/} -- empty replacement string
            replace = rhs_word.Empty

        self._GetToken()
        if self.token_type != Id.Right_DollarBrace:
            # This happens on invalid code
            p_die(
                "Expected } after replacement string, got %s" %
                ui.PrettyId(self.token_type), self.cur_token)

        return suffix_op.PatSub(pat, replace, replace_mode, slash_tok)

    def _ReadSubscript(self):
        # type: () -> bracket_op_t
        """ Subscript = '[' ('@' | '*' | ArithExpr) ']' """
        # Lookahead to see if we get @ or *.  Otherwise read a full arithmetic
        # expression.
        next_id = self.lexer.LookPastSpace(lex_mode_e.Arith)
        if next_id in (Id.Lit_At, Id.Arith_Star):
            op = bracket_op.WholeArray(next_id)  # type: bracket_op_t

            self._SetNext(lex_mode_e.Arith)  # skip past [
            self._GetToken()
            self._SetNext(lex_mode_e.Arith)  # skip past @
            self._GetToken()
        else:
            self._SetNext(lex_mode_e.Arith)  # skip past [
            anode = self._ReadArithExpr(Id.Arith_RBracket)
            op = bracket_op.ArrayIndex(anode)

        if self.token_type != Id.Arith_RBracket:  # Should be looking at ]
            p_die('Expected ] to close subscript', self.cur_token)

        self._SetNext(lex_mode_e.VSub_2)  # skip past ]
        self._GetToken()  # Needed to be in the same spot as no subscript

        return op

    def _ParseVarOf(self):
        # type: () -> BracedVarSub
        """
        VarOf = NAME Subscript?
              | NUMBER      # no subscript allowed, none of these are arrays
                            # ${@[1]} doesn't work, even though slicing does
              | VarSymbol
        """
        self._GetToken()
        name_token = self.cur_token
        self._SetNext(lex_mode_e.VSub_2)

        self._GetToken()  # Check for []
        if self.token_type == Id.VOp2_LBracket:
            bracket_op = self._ReadSubscript()
        else:
            bracket_op = None

        part = BracedVarSub.CreateNull()
        part.name_tok = name_token
        part.var_name = lexer.TokenVal(name_token)
        part.bracket_op = bracket_op
        return part

    def _ParseVarExpr(self, arg_lex_mode, allow_query=False):
        # type: (lex_mode_t, bool) -> BracedVarSub
        """Start parsing at the op -- we already skipped past the name."""
        part = self._ParseVarOf()

        self._GetToken()
        if self.token_type == Id.Right_DollarBrace:
            return part  # no ops

        op_kind = self.token_kind

        if op_kind == Kind.VTest:
            tok = self.cur_token
            arg_word = self._ReadVarOpArg(arg_lex_mode)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            part.suffix_op = suffix_op.Unary(tok, arg_word)

        elif op_kind == Kind.VOpYsh:
            tok = self.cur_token
            arg_word = self._ReadVarOpArg(arg_lex_mode)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            UP_arg_word = arg_word
            with tagswitch(arg_word) as case:
                if case(rhs_word_e.Empty):
                    pass
                elif case(rhs_word_e.Compound):
                    arg_word = cast(CompoundWord, UP_arg_word)
                    # This handles ${x|html} and ${x %.3f} now
                    # However I think ${x %.3f} should be statically parsed?
                    # It can enter the printf lexer modes.
                    ok, arg, quoted = word_.StaticEval(arg_word)
                    if not ok or quoted:
                        p_die('Expected a constant argument',
                              loc.Word(arg_word))

            part.suffix_op = suffix_op.Static(tok, arg)

        elif op_kind == Kind.VOp0:
            part.suffix_op = self.cur_token  # Nullary
            self._SetNext(lex_mode_e.VSub_2)  # Expecting }
            self._GetToken()

        elif op_kind == Kind.VOp1:  # % %% # ## etc.
            tok = self.cur_token
            # Weird exception that all shells have: these operators take a
            # glob pattern, so they're lexed as VSub_ArgUnquoted, not
            # VSub_ArgDQ
            arg_word = self._ReadVarOpArg(lex_mode_e.VSub_ArgUnquoted)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            part.suffix_op = suffix_op.Unary(tok, arg_word)

        elif op_kind == Kind.VOp2:  # / : [ ]
            if self.token_type == Id.VOp2_Slash:
                patsub_op = self._ReadPatSubVarOp()  # type: suffix_op_t
                part.suffix_op = patsub_op

                # Checked by the method above
                assert self.token_type == Id.Right_DollarBrace, self.cur_token

            elif self.token_type == Id.VOp2_Colon:
                part.suffix_op = self._ReadSliceVarOp()
                # NOTE: } in arithmetic mode.
                if self.token_type != Id.Arith_RBrace:
                    # Token seems off; doesn't point to X in ${a:1:2 X
                    p_die('Expected } to close ${', self.cur_token)

            else:
                # TODO: Does this ever happen?
                p_die('Unexpected token in ${} (%s)' % 'VOp2', self.cur_token)

        elif op_kind == Kind.VOp3:  # ${prefix@} etc.
            if allow_query:
                part.suffix_op = self.cur_token  # Nullary
                self._SetNext(lex_mode_e.VSub_2)  # Expecting }
                self._GetToken()
            else:
                p_die("Unexpected token in ${} (%s)" % 'VOp3', self.cur_token)

        # NOTE: Arith_RBrace is for slicing, because it reads } in arithmetic
        # mode.  It's redundantly checked above.
        if self.token_type not in (Id.Right_DollarBrace, Id.Arith_RBrace):
            # ${a.} or ${!a.}
            p_die('Expected } to close ${', self.cur_token)

        # Now look for ops
        return part

    def _ReadZshVarSub(self, left_token):
        # type: (Token) -> word_part.ZshVarSub

        self._SetNext(lex_mode_e.VSub_Zsh)  # Move past ${(foo)

        # Can be empty
        w = self._ReadCompoundWord3(lex_mode_e.VSub_Zsh, Id.Right_DollarBrace,
                                    True)
        self._GetToken()
        return word_part.ZshVarSub(left_token, w, self.cur_token)

    def ReadBracedVarSub(self, left_token):
        # type: (Token) -> Tuple[BracedVarSub, Token]
        """ For YSH expressions like var x = ${x:-"default"}. """
        part = self._ReadBracedVarSub(left_token, d_quoted=False)
        last_token = self.cur_token
        return part, last_token

    def _ReadBracedVarSub(self, left_token, d_quoted):
        # type: (Token, bool) -> BracedVarSub
        """For the ${} expression language.

        NAME = [a-zA-Z_][a-zA-Z0-9_]*
        NUMBER = [0-9]+                    # ${10}, ${11}, ...

        Subscript = '[' ('@' | '*' | ArithExpr) ']'
        VarSymbol = '!' | '@' | '#' | ...
        VarOf = NAME Subscript?
              | NUMBER      # no subscript allowed, none of these are arrays
                            # ${@[1]} doesn't work, even though slicing does
              | VarSymbol

        NULLARY_OP = '@Q' | '@E' | '@P' | '@A' | '@a'  # VOp0

        TEST_OP  = '-' | ':-' | '=' | ':=' | '+' | ':+' | '?' | ':?'
        STRIP_OP = '#' | '##' | '%' | '%%'
        CASE_OP  = ',' | ',,' | '^' | '^^'
        UnaryOp  = TEST_OP | STRIP_OP | CASE_OP

        YSH_UNARY = '|' | ' '  # ${x|html} and ${x %.3f}.
                               # SPACE is operator not %
        Match     = ('/' | '#' | '%') WORD  # match all / prefix / suffix
        VarExpr   = VarOf
                  | VarOf NULLARY_OP
                  | VarOf UnaryOp WORD
                  | VarOf YSH_UNARY STATIC_WORD
                  | VarOf ':' ArithExpr (':' ArithExpr )?
                  | VarOf '/' Match '/' WORD

        LengthExpr = '#' VarOf    # can't apply operators after length

        RefOrKeys  = '!' VarExpr  # CAN apply operators after a named ref
                                  # ${!ref[0]} vs ${!keys[@]} resolved later

        PrefixQuery = '!' NAME ('*' | '@')  # list variable names with a prefix

        BuiltinSub = '.' WORD+  # ${.myproc 'builtin' $sub}

        VarSub = LengthExpr
               | RefOrKeys
               | PrefixQuery
               | VarExpr
               | BuiltinSub

        NOTES:
        - Arithmetic expressions are used twice, inside subscripts ${a[x+1]}
          and slicing ${a:x+1:y+2}
        - ${#} and ${!} need LL(2) lookahead (considering how my tokenizer
          works)
        - @ and * are technically arithmetic expressions in this implementation
        - We don't account for bash 4.4: ${param@operator} -- Q E P A a.  Note
          that it's also vectorized.

        Strictness over bash:
        - echo ${a[0][0]} doesn't do anything useful, so we disallow it from
          the grammar
        - ! and # prefixes can't be composed, even though named refs can be
          composed with other operators
        - '#' means 4 different things: length prefix, VarSymbol, UnaryOp to
          strip a prefix, and it can also be a literal part of WORD.

        From the parser's point of view, the prefix # can't be combined with
        UnaryOp/slicing/matching, and the ! can.  However

        - ${a[@]:1:2} is not allowed
        - ${#a[@]:1:2} is allowed, but gives the wrong answer
        """
        if d_quoted:
            arg_lex_mode = lex_mode_e.VSub_ArgDQ
        else:
            arg_lex_mode = lex_mode_e.VSub_ArgUnquoted

        self._SetNext(lex_mode_e.VSub_1)
        self._GetToken()

        ty = self.token_type
        first_tok = self.cur_token

        if ty == Id.VSub_Pound:
            # Disambiguate
            next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
            if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
                # e.g. a name, '#' is the prefix
                self._SetNext(lex_mode_e.VSub_1)
                part = self._ParseVarOf()

                self._GetToken()
                if self.token_type != Id.Right_DollarBrace:
                    p_die('Expected } after length expression',
                          self.cur_token)

                part.prefix_op = first_tok

            else:  # not a prefix, '#' is the variable
                part = self._ParseVarExpr(arg_lex_mode)

        elif ty == Id.VSub_Bang:
            next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
            if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
                # e.g. a name, '!' is the prefix
                # ${!a} -- this is a ref
                # ${!3} -- this is ref
                # ${!a[1]} -- this is a ref
                # ${!a[@]} -- this is a keys
                # No lookahead -- do it in a second step, or at runtime
                self._SetNext(lex_mode_e.VSub_1)
                part = self._ParseVarExpr(arg_lex_mode, allow_query=True)

                part.prefix_op = first_tok

            else:  # not a prefix, '!' is the variable
                part = self._ParseVarExpr(arg_lex_mode)

        elif ty == Id.VSub_Dot:
            # Note: this will become a new builtin_sub type, so this method
            # must return word_part_t rather than BracedVarSub.  I don't
            # think that should cause problems.
            p_die('TODO: ${.myproc builtin sub}', self.cur_token)

        # VS_NAME, VS_NUMBER, symbol that isn't # or !
        elif self.token_kind == Kind.VSub:
            part = self._ParseVarExpr(arg_lex_mode)

        else:
            # e.g. ${^}
            p_die('Unexpected token in ${}', self.cur_token)

        part.left = left_token  # attach the argument
        part.right = self.cur_token
        return part

    def _ReadSingleQuoted(self, left_token, lex_mode):
        # type: (Token, lex_mode_t) -> SingleQuoted
        """Internal method to read a word_part."""
        tokens = []  # type: List[Token]
        # In command mode, we never disallow backslashes like '\'
        right_quote = self.ReadSingleQuoted(lex_mode, left_token, tokens,
                                            False)
        sval = word_compile.EvalSingleQuoted(left_token.id, tokens)
        node = SingleQuoted(left_token, sval, right_quote)
        return node

    def ReadSingleQuoted(self, lex_mode, left_token, out_tokens, is_ysh_expr):
        # type: (lex_mode_t, Token, List[Token], bool) -> Token
        """Appends to out_tokens; returns last token

        Used by expr_parse.py
        """
        if (left_token.id == Id.Left_DollarSingleQuote and
                self.parse_opts.no_parse_osh()):
            p_die("Instead of $'', use J8 strings like b'' (no_parse_osh)",
                  left_token)

        # echo '\' is allowed, but x = '\' is invalid, in favor of x = r'\'
        # enforce for triple-quoted strings: ''' \u ''' requires r''' \u '''
        no_backslashes = is_ysh_expr and left_token.id in (
            Id.Left_SingleQuote, Id.Left_TSingleQuote)

        expected_end_tokens = 3 if left_token.id in (
            Id.Left_TSingleQuote, Id.Left_RTSingleQuote,
            Id.Left_UTSingleQuote, Id.Left_BTSingleQuote) else 1
        num_end_tokens = 0
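        # e.g. a plain '' closes after one Right token, while r''' ... '''
        # closes only after three CONSECUTIVE Right tokens (see the reset at
        # the bottom of the loop below)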

        # TODO: could we directly append to out_tokens?
        tokens = []  # type: List[Token]
        while num_end_tokens < expected_end_tokens:
            self._SetNext(lex_mode)
            self._GetToken()

            # Kind.Char emitted in lex_mode.SQ_C
            if self.token_kind in (Kind.Lit, Kind.Char):
                tok = self.cur_token
                # Happens in lex_mode_e.SQ: 'one\two' is ambiguous, should be
                # r'one\two' or c'one\\two'
                if no_backslashes and lexer.TokenContains(tok, '\\'):
                    p_die(
                        "Ambiguous backslash: add explicit r'' or u'' prefix (OILS-ERR-20)",
                        tok)

                if is_ysh_expr:
                    # Disallow var x = $'\001'.  Arguably we don't need these
                    # checks because u'\u{1}' is the way to write it.
                    if self.token_type == Id.Char_Octal3:
                        p_die(
                            r"Use \xhh or \u{...} instead of octal escapes in YSH strings",
                            tok)

                    if (self.token_type == Id.Char_Hex and
                            self.cur_token.length != 4):
                        # disallow \xH
                        p_die(
                            r'Invalid hex escape in YSH string (must be \xHH)',
                            tok)

                tokens.append(tok)

            elif self.token_kind == Kind.Unknown:
                tok = self.cur_token
                assert tok.id == Id.Unknown_Backslash, tok

                # x = $'\z' is disallowed; ditto for echo $'\z' if
                # shopt --set no_parse_backslash
                if is_ysh_expr or self.parse_opts.no_parse_backslash():
                    p_die(
                        "Invalid char escape in C-style string literal (OILS-ERR-11)",
                        tok)

                tokens.append(tok)

            elif self.token_kind == Kind.Eof:
                p_die('Unexpected EOF in single-quoted string that began here',
                      left_token)

            elif self.token_kind == Kind.Right:
                # assume Id.Right_SingleQuote
                num_end_tokens += 1
                tokens.append(self.cur_token)

            else:
                raise AssertionError(self.cur_token)

            if self.token_kind != Kind.Right:
                num_end_tokens = 0  # we need three in a ROW

        if expected_end_tokens == 1:
            tokens.pop()
        elif expected_end_tokens == 3:  # Get rid of spurious end tokens
            tokens.pop()
            tokens.pop()
            tokens.pop()

        # Remove space from ''' r''' $''' in both expression mode and command
        # mode
        if left_token.id in (Id.Left_TSingleQuote, Id.Left_RTSingleQuote,
                             Id.Left_UTSingleQuote, Id.Left_BTSingleQuote):
            word_compile.RemoveLeadingSpaceSQ(tokens)

        # Validation after lexing - same 2 checks in j8.LexerDecoder
        is_u_string = left_token.id in (Id.Left_USingleQuote,
                                        Id.Left_UTSingleQuote)

        for tok in tokens:
            # u'\yff' is not valid, but b'\yff' is
            if is_u_string and tok.id == Id.Char_YHex:
                p_die(
                    r"%s escapes not allowed in u'' strings" %
                    lexer.TokenVal(tok), tok)

        out_tokens.extend(tokens)
        return self.cur_token

    def _ReadDoubleQuotedLeftParts(self):
        # type: () -> word_part_t
        """Read substitution parts in a double quoted context."""
        if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick):
            return self._ReadCommandSub(self.token_type, d_quoted=True)

        if self.token_type == Id.Left_DollarBrace:
            return self._ReadBracedVarSub(self.cur_token, d_quoted=True)

        if self.token_type == Id.Left_DollarDParen:
            # TODO: Uncomment this after another regtest/aports run
            # if (self.LookAheadDParens(shift_back=1)):
            return self._ReadArithSub()
            # else:
            #     Mutate token - we treat this '$((' as '$( ('
            #     self.cur_token.id = Id.Left_DollarParen
            #     return self._ReadCommandSub(Id.Left_DollarParen, d_quoted=True)

        if self.token_type == Id.Left_DollarBracket:
            return self._ReadExprSub(lex_mode_e.DQ)

        if self.token_type == Id.Left_DollarBraceZsh:
            return self._ReadZshVarSub(self.cur_token)

        raise AssertionError(self.cur_token)

    def _ReadYshSingleQuoted(self, left_id):
        # type: (Id_t) -> CompoundWord
        """Read YSH style strings

        r'' u'' b''
        r''' ''' u''' ''' b''' '''
        """
        #log('BEF self.cur_token %s', self.cur_token)
        if left_id == Id.Left_RSingleQuote:
            lexer_mode = lex_mode_e.SQ_Raw
            triple_left_id = Id.Left_RTSingleQuote
        elif left_id == Id.Left_USingleQuote:
            lexer_mode = lex_mode_e.J8_Str
            triple_left_id = Id.Left_UTSingleQuote
        elif left_id == Id.Left_BSingleQuote:
            lexer_mode = lex_mode_e.J8_Str
            triple_left_id = Id.Left_BTSingleQuote
        else:
            raise AssertionError(left_id)

        # Needed for syntax checks
        left_tok = self.cur_token
        left_tok.id = left_id

        sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)

        if (len(sq_part.sval) == 0 and self.lexer.ByteLookAhead() == "'"):
            self._SetNext(lex_mode_e.ShCommand)
            self._GetToken()

            assert self.token_type == Id.Left_SingleQuote
            # HACK: magically transform the third ' in u''' to
            # Id.Left_UTSingleQuote, so that ''' is the terminator
            left_tok = self.cur_token
            left_tok.id = triple_left_id

            # Handles stripping leading whitespace
            sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)

        # Advance and validate
        self._SetNext(lex_mode_e.ShCommand)

        self._GetToken()
        if self.token_kind not in KINDS_THAT_END_WORDS:
            p_die('Unexpected token after YSH single-quoted string',
                  self.cur_token)

        return CompoundWord([sq_part])

    def _ReadUnquotedLeftParts(self, triple_out):
        # type: (Optional[BoolParamBox]) -> word_part_t
        """Read substitutions and quoted strings (for lex_mode_e.ShCommand).

        If triple_out is set, then we try parsing triple quoted strings,
        and set its value to True if we got one.
        """
        if self.token_type in (Id.Left_DoubleQuote,
                               Id.Left_DollarDoubleQuote):
            # Note: $"" is a synonym for "".  It might make sense if it added
            # \n \0 \x00 \u{123} etc.  But that's not what bash does!
            dq_part = self._ReadDoubleQuoted(self.cur_token)
            # Got empty word "" and there's a " after
            if (triple_out and len(dq_part.parts) == 0 and
                    self.lexer.ByteLookAhead() == '"'):

                self._SetNext(lex_mode_e.ShCommand)
                self._GetToken()
                # HACK: magically transform the third " in """ to
                # Id.Left_TDoubleQuote, so that """ is the terminator
                left_dq_token = self.cur_token
                left_dq_token.id = Id.Left_TDoubleQuote
                triple_out.b = True  # let caller know we got it
                return self._ReadDoubleQuoted(left_dq_token)

            return dq_part

        if self.token_type in (Id.Left_SingleQuote, Id.Left_RSingleQuote,
                               Id.Left_DollarSingleQuote):
            if self.token_type == Id.Left_SingleQuote:
                lexer_mode = lex_mode_e.SQ_Raw
                triple_left_id = Id.Left_TSingleQuote
            elif self.token_type == Id.Left_RSingleQuote:
                lexer_mode = lex_mode_e.SQ_Raw
                triple_left_id = Id.Left_RTSingleQuote
            else:
                lexer_mode = lex_mode_e.SQ_C
                # there is no such thing as $'''
                triple_left_id = Id.Undefined_Tok

            sq_part = self._ReadSingleQuoted(self.cur_token, lexer_mode)

            # Got empty '' or r'' and there's a ' after
            # u'' and b'' are handled in _ReadYshSingleQuoted
            if (triple_left_id != Id.Undefined_Tok and
                    triple_out is not None and len(sq_part.sval) == 0 and
                    self.lexer.ByteLookAhead() == "'"):

                self._SetNext(lex_mode_e.ShCommand)
                self._GetToken()

                # HACK: magically transform the third ' in ''' to
                # Id.Left_TSingleQuote, so that ''' is the terminator
                left_sq_token = self.cur_token
                left_sq_token.id = triple_left_id

                triple_out.b = True  # let caller know we got it
                return self._ReadSingleQuoted(left_sq_token, lexer_mode)

            return sq_part

        if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick,
                               Id.Left_ProcSubIn, Id.Left_ProcSubOut):
            return self._ReadCommandSub(self.token_type, d_quoted=False)

        if self.token_type == Id.Left_DollarBrace:
            return self._ReadBracedVarSub(self.cur_token, d_quoted=False)

        if self.token_type == Id.Left_DollarDParen:
            # TODO: Uncomment this after another regtest/aports run
            # if (self.LookAheadDParens(shift_back=1)):
            return self._ReadArithSub()
            # else:
            #     Mutate token - we treat this '$((' as '$( ('
            #     self.cur_token.id = Id.Left_DollarParen
            #     return self._ReadCommandSub(Id.Left_DollarParen, d_quoted=True)

        if self.token_type == Id.Left_DollarBracket:
            return self._ReadExprSub(lex_mode_e.ShCommand)

        if self.token_type == Id.Left_DollarBraceZsh:
            return self._ReadZshVarSub(self.cur_token)

        raise AssertionError(self.cur_token)

    def _ReadExtGlob(self):
        # type: () -> word_part.ExtGlob
        """
        Grammar:
          Item    = CompoundWord | EPSILON  # important: @(foo|) is allowed
          LEFT    = '@(' | '*(' | '+(' | '?(' | '!('
          RIGHT   = ')'
          ExtGlob = LEFT (Item '|')* Item RIGHT  # ITEM may be empty
          Compound includes ExtGlob
        """
        left_token = self.cur_token
        right_token = None  # type: Token
        arms = []  # type: List[CompoundWord]

        self.lexer.PushHint(Id.Op_RParen, Id.Right_ExtGlob)
        self._SetNext(lex_mode_e.ExtGlob)  # advance past LEFT

        read_word = False  # did we just read a word?  To handle @(||).
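        # e.g. @(foo|) yields the arms [foo, empty] and @(||) yields three
        # empty arms; the read_word flag is what lets empty arms through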

        while True:
            self._GetToken()

            if self.token_type == Id.Right_ExtGlob:
                if not read_word:
                    arms.append(CompoundWord([]))
                right_token = self.cur_token
                break

            elif self.token_type == Id.Op_Pipe:
                if not read_word:
                    arms.append(CompoundWord([]))
                read_word = False
                self._SetNext(lex_mode_e.ExtGlob)

            # lex_mode_e.ExtGlob should only produce these 4 kinds of tokens
            elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub,
                                     Kind.ExtGlob):
                w = self._ReadCompoundWord(lex_mode_e.ExtGlob)
                arms.append(w)
                read_word = True

            elif self.token_kind == Kind.Eof:
                p_die('Unexpected EOF reading extended glob that began here',
                      left_token)

            else:
                raise AssertionError(self.cur_token)

        return word_part.ExtGlob(left_token, arms, right_token)

    def _ReadBashRegexGroup(self):
        # type: () -> word_part.BashRegexGroup
        """
        Grammar:
          BashRegexGroup = '(' WORD? ')'
        """
        left_token = self.cur_token
        assert left_token.id == Id.BashRegex_LParen, left_token

        arms = []  # type: List[CompoundWord]

        self.lexer.PushHint(Id.Op_RParen, Id.Right_BashRegexGroup)
        self._SetNext(lex_mode_e.BashRegexFakeInner)  # advance past LEFT

        self._GetToken()
        if self.token_type == Id.Right_BashRegexGroup:  # empty ()
            return word_part.BashRegexGroup(left_token, None, self.cur_token)

        # lex_mode_e.BashRegex should only produce these 4 kinds of tokens
        if self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub, Kind.BashRegex):
            # Fake lexer mode that translates Id.WS_Space to Id.Lit_Chars
            # To allow bash style [[ s =~ (a b) ]]
            w = self._ReadCompoundWord(lex_mode_e.BashRegexFakeInner)
            arms.append(w)

            self._GetToken()
            if self.token_type != Id.Right_BashRegexGroup:
                p_die('Expected ) to close bash regex group', self.cur_token)

            return word_part.BashRegexGroup(left_token, w, self.cur_token)

        p_die('Expected word after ( opening bash regex group', self.cur_token)

    def _ReadLikeDQ(self, left_token, is_ysh_expr, out_parts):
        # type: (Optional[Token], bool, List[word_part_t]) -> None
        """
        Args:
          left_token: A token if we are reading a double quoted part, or None
            if we're reading a here doc.
          is_ysh_expr: Whether to disallow backticks and invalid char escapes
          out_parts: list of word_part to append to
        """
        if left_token:
            if left_token.id in (Id.Left_TDoubleQuote,
                                 Id.Left_DollarTDoubleQuote):
                expected_end_tokens = 3
            else:
                expected_end_tokens = 1
        else:
            expected_end_tokens = 1000  # here doc will break

        num_end_tokens = 0
        while num_end_tokens < expected_end_tokens:
            self._SetNext(lex_mode_e.DQ)
            self._GetToken()

            if self.token_kind == Kind.Lit:
                if self.token_type == Id.Lit_EscapedChar:
                    tok = self.cur_token
                    ch = lexer.TokenSliceLeft(tok, 1)
                    part = word_part.EscapedLiteral(tok,
                                                    ch)  # type: word_part_t
                else:
                    if self.token_type == Id.Lit_BadBackslash:
                        # echo "\z" is OK in shell, but x = "\z" is a syntax
                        # error in YSH.
                        # Slight hole: We don't catch x = ${undef:-"\z"}
                        # because of the recursion (unless no_parse_backslash)
                        if (is_ysh_expr or
                                self.parse_opts.no_parse_backslash()):
                            p_die(
                                "Invalid char escape in double quoted string (OILS-ERR-12)",
                                self.cur_token)
                    elif self.token_type == Id.Lit_Dollar:
                        if is_ysh_expr or self.parse_opts.no_parse_dollar():
                            p_die("Literal $ should be quoted like \$",
                                  self.cur_token)

                    part = self.cur_token
                out_parts.append(part)

            elif self.token_kind == Kind.Left:
                if self.token_type == Id.Left_Backtick and is_ysh_expr:
                    p_die("Backtick should be $(cmd) or \\` (OILS-ERR-18)",
                          self.cur_token)

                part = self._ReadDoubleQuotedLeftParts()
                out_parts.append(part)

            elif self.token_kind == Kind.VSub:
                tok = self.cur_token
                part = SimpleVarSub(tok)
                out_parts.append(part)
                # NOTE: parsing "$f(x)" would BREAK CODE.  Could add a mode
                # for it later.

            elif self.token_kind == Kind.Right:
                assert self.token_type == Id.Right_DoubleQuote, self.token_type
                if left_token:
                    num_end_tokens += 1

                # In a here doc, the right quote is literal!
                out_parts.append(self.cur_token)

            elif self.token_kind == Kind.Eof:
                if left_token:
                    p_die(
                        'Unexpected EOF reading double-quoted string that began here',
                        left_token)
                else:  # here docs will have an EOF in their token stream
                    break

            else:
                raise AssertionError(self.cur_token)

            if self.token_kind != Kind.Right:
                num_end_tokens = 0  # """ must be CONSECUTIVE

        if expected_end_tokens == 1:
            out_parts.pop()
        elif expected_end_tokens == 3:
            out_parts.pop()
            out_parts.pop()
            out_parts.pop()

        # Remove space from """ in both expression mode and command mode
        if (left_token and left_token.id
                in (Id.Left_TDoubleQuote, Id.Left_DollarTDoubleQuote)):
            word_compile.RemoveLeadingSpaceDQ(out_parts)

        # Return nothing, since we appended to 'out_parts'

    def _ReadDoubleQuoted(self, left_token):
        # type: (Token) -> DoubleQuoted
        """Helper function for "hello $name".

        Args:
          eof_type: for stopping at }, Id.Lit_RBrace
          here_doc: Whether we are reading in a here doc context

        Also ${foo%%a b c}  # treat this as double quoted.  until you hit
        """
        parts = []  # type: List[word_part_t]
        self._ReadLikeDQ(left_token, False, parts)

        right_quote = self.cur_token
        return DoubleQuoted(left_token, parts, right_quote)

    def ReadDoubleQuoted(self, left_token, parts):
        # type: (Token, List[word_part_t]) -> Token
        """For expression mode.

        Read var x = "${dir:-}/$name"; etc.
        """
        self._ReadLikeDQ(left_token, True, parts)
        return self.cur_token

    def _ReadCommandSub(self, left_id, d_quoted=False):
        # type: (Id_t, bool) -> CommandSub
        """
        NOTE: This is not in the grammar, because word parts aren't in the
        grammar!

        command_sub = '$(' command_list ')'
                    | '@(' command_list ')'
                    | '<(' command_list ')'
                    | '>(' command_list ')'
                    | ` command_list `
        """
        left_token = self.cur_token

        # Set the lexer in a state so ) becomes the EOF token.
        if left_id in (Id.Left_DollarParen, Id.Left_AtParen,
                       Id.Left_ProcSubIn, Id.Left_ProcSubOut):
            self._SetNext(lex_mode_e.ShCommand)  # advance past $( etc.

            right_id = Id.Eof_RParen
            self.lexer.PushHint(Id.Op_RParen, right_id)
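            # With this hint, the ) that balances $( comes back as Eof_RParen,
            # so the sub-parser below stops exactly there (the same PushHint
            # pattern is used for extended globs and arith subs in this file)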
            c_parser = self.parse_ctx.MakeParserForCommandSub(
                self.line_reader, self.lexer, right_id)
            # NOTE: This doesn't use something like main_loop because we don't
            # want to interleave parsing and execution!  Unlike 'source' and
            # 'eval'.
            node = c_parser.ParseCommandSub()

            right_token = c_parser.w_parser.cur_token

        elif left_id == Id.Left_Backtick and self.parse_ctx.do_lossless:
            # NOTE: This is an APPROXIMATE solution for translation ONLY.  See
            # test/osh2oil.

            right_id = Id.Eof_Backtick
            self.lexer.PushHint(Id.Left_Backtick, right_id)
            c_parser = self.parse_ctx.MakeParserForCommandSub(
                self.line_reader, self.lexer, right_id)
            node = c_parser.ParseCommandSub()
            right_token = c_parser.w_parser.cur_token

        elif left_id == Id.Left_Backtick:
            if self.parse_opts.no_parse_backticks():
                p_die(
                    'Backtick should be $(cmd) or \\` (no_parse_backticks, OILS-ERR-18)',
                    left_token)

            self._SetNext(lex_mode_e.Backtick)  # advance past `

            parts = []  # type: List[str]
            while True:
                self._GetToken()
                #log("TOK %s", self.cur_token)

                if self.token_type == Id.Backtick_Quoted:
                    # Remove leading \
                    parts.append(lexer.TokenSliceLeft(self.cur_token, 1))

                elif self.token_type == Id.Backtick_DoubleQuote:
                    # Compatibility: If backticks are double quoted, then
                    # double quotes within them have to be \"
                    # Shells aren't smart enough to match nested " and `
                    # quotes (but OSH is)
                    if d_quoted:
                        # Remove leading \
                        parts.append(lexer.TokenSliceLeft(self.cur_token, 1))
                    else:
                        parts.append(lexer.TokenVal(self.cur_token))

                elif self.token_type == Id.Backtick_Other:
                    parts.append(lexer.TokenVal(self.cur_token))

                elif self.token_type == Id.Backtick_Right:
                    break

                elif self.token_type == Id.Eof_Real:
                    # Note: this parse error is in the ORIGINAL context.  No
                    # code_str yet.
                    p_die('Unexpected EOF while looking for closing backtick',
                          left_token)

                else:
                    raise AssertionError(self.cur_token)

                self._SetNext(lex_mode_e.Backtick)

            # Calculate right SPID on CommandSub BEFORE re-parsing.
            right_token = self.cur_token

            code_str = ''.join(parts)
            #log('code %r', code_str)

            # Save lines into a new, temporary arena, so SnipCodeBlock() isn't
            # messed up.  Note: This is similar to how we parse aliases in
            # osh/cmd_parse.py.  It won't have the same location info as
            # MakeParserForCommandSub(), because the reader is different.
            arena = alloc.Arena()
            # TODO: arena.PushSource()?

            line_reader = reader.StringLineReader(code_str, arena)
            c_parser = self.parse_ctx.MakeOshParser(line_reader)
            src = source.Reparsed('backticks', left_token, right_token)
            with alloc.ctx_SourceCode(arena, src):
                node = c_parser.ParseCommandSub()

        else:
            raise AssertionError(left_id)

        return CommandSub(left_token, node, right_token)

    def _ReadExprSub(self, lex_mode):
        # type: (lex_mode_t) -> ExprSub
        """$[d->key] $[obj.method()] etc."""
        left_token = self.cur_token

        self._SetNext(lex_mode_e.Expr)
        enode, right_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.ysh_expr_sub)

        self._SetNext(lex_mode)  # Move past ]
        return ExprSub(left_token, enode, right_token)

    def ParseVarDecl(self, kw_token):
        # type: (Token) -> VarDecl
        """
        oil_var_decl: name_type_list '=' testlist end_stmt

        Note that assignments must end with \n ; } or EOF.  Unlike shell
        assignments, we disallow:

        var x = 42 | wc -l
        var x = 42 && echo hi
        """
        self._SetNext(lex_mode_e.Expr)
        enode, last_token = self.parse_ctx.ParseVarDecl(kw_token, self.lexer)
        # Hack to move } from what the Expr lexer modes gives to what
        # CommandParser wants
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace

        # Let the CommandParser see the Op_Semi or Op_Newline.
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)  # always back to this
        return enode

    def ParseMutation(self, kw_token, var_checker):
        # type: (Token, VarChecker) -> Mutation
        """
        setvar i = 42
        setvar i += 1
        setvar a[i] = 42
        setvar a[i] += 1
        setvar d.key = 42
        setvar d.key += 1
        """
        self._SetNext(lex_mode_e.Expr)
        enode, last_token = self.parse_ctx.ParseMutation(kw_token, self.lexer)
        # Hack to move } from what the Expr lexer modes gives to what
        # CommandParser wants
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace

        for lhs in enode.lhs:
            UP_lhs = lhs
            with tagswitch(lhs) as case:
                if case(y_lhs_e.Var):
                    lhs = cast(Token, UP_lhs)
                    var_checker.Check(kw_token.id, lexer.LazyStr(lhs), lhs)

                # Note: this does not cover cases like
                #   setvar (a[0])[1] = v
                #   setvar (d.key).other = v
                # This leaks into catching all typos statically, which may be
                # possible if 'use' makes all names explicit.
                elif case(y_lhs_e.Subscript):
                    lhs = cast(Subscript, UP_lhs)
                    if lhs.obj.tag() == expr_e.Var:
                        v = cast(expr.Var, lhs.obj)
                        var_checker.Check(kw_token.id, v.name, v.left)

                elif case(y_lhs_e.Attribute):
                    lhs = cast(Attribute, UP_lhs)
                    if lhs.obj.tag() == expr_e.Var:
                        v = cast(expr.Var, lhs.obj)
                        var_checker.Check(kw_token.id, v.name, v.left)

        # Let the CommandParser see the Op_Semi or Op_Newline.
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)  # always back to this
        return enode

    def ParseBareDecl(self):
        # type: () -> expr_t
        """
        x = {name: val}
        """
        self._SetNext(lex_mode_e.Expr)
        self._GetToken()
        enode, last_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.command_expr)
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)
        return enode

    def ParseYshExprForCommand(self):
        # type: () -> expr_t

        # Fudge for this case
        #  for x in(y) {
        # versus
        #  for x in (y) {
        #
        # In the former case, ReadWord on 'in' puts the lexer past (.
        # Also see LookPastSpace in CommandParser.
        # A simpler solution would be nicer.

        if self.token_type == Id.Op_LParen:
            self.lexer.MaybeUnreadOne()

        enode, _ = self.parse_ctx.ParseYshExpr(self.lexer, grammar_nt.ysh_expr)

        self._SetNext(lex_mode_e.ShCommand)
        return enode

    def ParseCommandExpr(self):
        # type: () -> expr_t
        """
        = 1+2
        """
        enode, last_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.command_expr)

        # In some cases, such as the case statement, we expect *the lexer* to
        # be pointing at the token right after the expression.  But the
        # expression parser must have read to the `last_token`.  Unreading
        # places the lexer back in the expected state.  Ie:
        #
        # case (x) {                       case (x) {
        #   (else) { = x }                   (else) { = x }
        #          ^ The lexer is here              ^ Unread to here
        # }                                }
        assert last_token.id in (Id.Op_Newline, Id.Eof_Real, Id.Op_Semi,
                                 Id.Op_RBrace), last_token
        if last_token.id != Id.Eof_Real:
            # Eof_Real is the only token we cannot unread
            self.lexer.MaybeUnreadOne()

        return enode

    def ParseProc(self, node):
        # type: (Proc) -> None

        # proc name-with-hyphens() must be accepted
        self._SetNext(lex_mode_e.ShCommand)
        self._GetToken()
        # example: 'proc f[' gets you Lit_ArrayLhsOpen
        if self.token_type != Id.Lit_Chars:
            p_die('Invalid proc name %s' % ui.PrettyToken(self.cur_token),
                  self.cur_token)

        # TODO: validate this more.  Disallow proc 123 { }, which isn't
        # disallowed for shell functions.  Similar to IsValidVarName().
        node.name = self.cur_token

        last_token = self.parse_ctx.ParseProc(self.lexer, node)

        # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
        assert last_token.id == Id.Op_LBrace
        last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        self._SetNext(lex_mode_e.ShCommand)

    def ParseFunc(self, node):
        # type: (Func) -> None
        last_token = self.parse_ctx.ParseFunc(self.lexer, node)

        # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
        assert last_token.id == Id.Op_LBrace
        last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        self._SetNext(lex_mode_e.ShCommand)

    def ParseYshCasePattern(self):
        # type: () -> Tuple[pat_t, Token]
        pat, left_tok, last_token = self.parse_ctx.ParseYshCasePattern(
            self.lexer)

        if last_token.id == Id.Op_LBrace:
            last_token.id = Id.Lit_LBrace
            self.buffered_word = last_token

        return pat, left_tok

    def NewlineOkForYshCase(self):
        # type: () -> Id_t
        """Check for optional newline and consume it.

        This is a special case of `_NewlineOk` which fixes some "off-by-one"
        issues that crop up while parsing Ysh Case Arms.  For more details,
        see #oil-dev > Progress On YSH Case Grammar on zulip.

        Returns a token id which is filled with the choice of

            word  { echo word }
            (3)   { echo expr }
            /e/   { echo eggex }
        }         # right brace
        """
        while True:
            next_id = self.lexer.LookAheadOne(lex_mode_e.Expr)

            # Cannot lookahead past lines
            if next_id == Id.Unknown_Tok:
                if not self.lexer.MoveToNextLine():  # Try to move to next line
                    break  # EOF
                continue

            next_kind = consts.GetKind(next_id)
            if next_id != Id.Op_Newline and next_kind != Kind.Ignored:
                break

            self.lexer.Read(lex_mode_e.Expr)

        if next_id in (Id.Op_RBrace, Id.Op_LParen, Id.Arith_Slash):
            self._SetNext(lex_mode_e.Expr)  # Continue in expression mode
        else:
            # Consume the trailing Op_Newline
            self._SetNext(lex_mode_e.ShCommand)
            self._GetToken()

        return next_id

    def _ReadArithExpr(self, end_id):
        # type: (Id_t) -> arith_expr_t
        """Read and parse an arithmetic expression in various contexts.

        $(( 1+2 ))
        (( a=1+2 ))
        ${a[ 1+2 ]}
        ${a : 1+2 : 1+2}

        See tests/arith-context.test.sh for ambiguous cases.

        ${a[a[0]]} is valid  # VS_RBRACKET vs Id.Arith_RBracket

        ${s : a<b?0:1 : 1}  # VS_COLON vs Id.Arith_Colon

        See the assertion in ArithParser.Parse() -- unexpected extra input.
        """
        # calls self.ReadWord(lex_mode_e.Arith)
        anode = self.a_parser.Parse()
        cur_id = self.a_parser.CurrentId()
        if end_id != Id.Undefined_Tok and cur_id != end_id:
            p_die(
                'Unexpected token after arithmetic expression (%s != %s)' %
                (ui.PrettyId(cur_id), ui.PrettyId(end_id)),
                loc.Word(self.a_parser.cur_word))
        return anode

    def _ReadArithSub(self):
        # type: () -> word_part.ArithSub
        """Read an arith substitution, which contains an arith expression,
        e.g. $((a + 1)).
        """
        left_tok = self.cur_token

        # The second one needs to be disambiguated in stuff like:
        #   $(echo $(( 1+2 )) )
        self.lexer.PushHint(Id.Op_RParen, Id.Right_DollarDParen)

        # NOTE: To disambiguate $(( as arith sub vs. command sub and subshell,
        # we could save the lexer/reader state here, and retry if the
        # arithmetic parse fails.  But we can almost always catch this at
        # parse time.  There could be some exceptions like:
        #   $((echo * foo))  # looks like multiplication
        #   $((echo / foo))  # looks like division

        # $(( )) is valid
        anode = arith_expr.EmptyZero  # type: arith_expr_t

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            anode = self._ReadArithExpr(Id.Arith_RParen)

        self._SetNext(lex_mode_e.ShCommand)

        # Ensure we get closing )
        self._GetToken()
        if self.token_type != Id.Right_DollarDParen:
            p_die('Expected second ) to end arith sub', self.cur_token)

        right_tok = self.cur_token
        return word_part.ArithSub(left_tok, anode, right_tok)

    def ReadDParen(self):
        # type: () -> Tuple[arith_expr_t, Token]
        """Read ((1+ 2))  -- command context.

        We're using the word parser because it's very similar to
        _ReadArithExpr above.

        This also returns the terminating Id.Op_DRightParen token for location
        info.
        """
        # (( )) is valid
        anode = arith_expr.EmptyZero  # type: arith_expr_t

        self.lexer.PushHint(Id.Op_RParen, Id.Op_DRightParen)

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            anode = self._ReadArithExpr(Id.Arith_RParen)

        self._SetNext(lex_mode_e.ShCommand)

        # Ensure we get the second )
        self._GetToken()
        right = self.cur_token
        if right.id != Id.Op_DRightParen:
            p_die('Expected second ) to end arith statement', right)

        self._SetNext(lex_mode_e.ShCommand)

        return anode, right

    def _NextNonSpace(self):
        # type: () -> None
        """Advance in lex_mode_e.Arith until non-space token.

        Same logic as _ReadWord, but used in
           $(( ))
           (( ))
           for (( ))

        You can read self.token_type after this, without calling _GetToken.
        """
        while True:
            self._SetNext(lex_mode_e.Arith)
            self._GetToken()
            if self.token_kind not in (Kind.Ignored, Kind.WS):
                break

    def ReadForExpression(self):
        # type: () -> command.ForExpr
        """Read ((i=0; i<5; ++i)) -- part of command context."""
        self._NextNonSpace()  # skip over ((
        cur_id = self.token_type  # for end of arith expressions

        if cur_id == Id.Arith_Semi:  # for (( ; i < 10; i++ ))
            init_node = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            init_node = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()
        self._NextNonSpace()

        # It's odd to keep track of both cur_id and self.token_type in this
        # function, but it works, and is tested in 'test/parse_error.sh
        # arith-integration'
        if cur_id != Id.Arith_Semi:  # for (( x=0 b; ... ))
            p_die("Expected ; here", loc.Word(self.a_parser.cur_word))

        self._GetToken()
        cur_id = self.token_type

        if cur_id == Id.Arith_Semi:  # for (( ; ; i++ ))
            # empty condition is TRUE
            cond_node = arith_expr.EmptyOne  # type: arith_expr_t
        else:
            cond_node = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()

        if cur_id != Id.Arith_Semi:  # for (( x=0; x<5 b ))
            p_die("Expected ; here", loc.Word(self.a_parser.cur_word))

        self._NextNonSpace()
        if self.token_type == Id.Arith_RParen:  # for (( ; ; ))
            update_node = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            update_node = self._ReadArithExpr(Id.Arith_RParen)

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            p_die('Expected ) to end for loop expression', self.cur_token)
        self._SetNext(lex_mode_e.ShCommand)

        # redirects is None, will be assigned in CommandEvaluator
        node = command.ForExpr.CreateNull()
        node.init = init_node
        node.cond = cond_node
        node.update = update_node
        return node

    def _ReadArrayLiteral(self):
        # type: () -> word_part_t
        """a=(1 2 3)

        TODO: See osh/cmd_parse.py:164 for Id.Lit_ArrayLhsOpen, for a[x++]=1

        We want:

        A=(['x']=1 ["x"]=2 [$x$y]=3)

        Maybe allow this as a literal string?  Because I think I've seen it
        before?  Or maybe force people to patch to learn the rule.

        A=([x]=4)

        Starts with Lit_Other '[', and then it has Lit_ArrayLhsClose
        Maybe enforce that ALL have keys or NONE have keys.
        """
        self._SetNext(lex_mode_e.ShCommand)  # advance past (
        self._GetToken()
        if self.cur_token.id != Id.Op_LParen:
            p_die('Expected ( after =', self.cur_token)
        left_token = self.cur_token
        right_token = None  # type: Token

        # MUST use a new word parser (with same lexer).
        w_parser = self.parse_ctx.MakeWordParser(self.lexer, self.line_reader)
        words = []  # type: List[CompoundWord]
        done = False
        while not done:
            w = w_parser.ReadWord(lex_mode_e.ShCommand)
            with tagswitch(w) as case:
                if case(word_e.Operator):
                    tok = cast(Token, w)
                    if tok.id == Id.Right_Initializer:
                        right_token = tok
                        done = True  # can't use break here
                    # Unlike command parsing, array parsing allows embedded
                    # \n.
                    elif tok.id == Id.Op_Newline:
                        continue
                    else:
                        p_die('Unexpected token in array literal',
                              loc.Word(w))

                elif case(word_e.Compound):
                    words.append(cast(CompoundWord, w))

                else:
                    raise AssertionError()

        initializer_words = []  # type: List[InitializerWord_t]
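        # e.g. for a=(['k']=v ~/src {a,b}), the first word becomes an assoc
        # pair below, and the plain words get brace and tilde detection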
1731 for w in words:
1732 pair = word_.DetectAssocPair(w)
1733 if pair is not None:
1734 word_.TildeDetectAssign(pair.value) # pair.value is modified
1735 initializer_words.append(pair)
1736 else:
1737 w2 = braces.BraceDetect(w) # type: word_t
1738 if w2 is None:
1739 w2 = w
1740 w3 = word_.TildeDetect(w2) # type: word_t
1741 if w3 is None:
1742 w3 = w2
1743 initializer_words.append(InitializerWord.ArrayWord(w3))
1744
1745 # invariant List?
1746 return word_part.InitializerLiteral(left_token, initializer_words,
1747 right_token)
1748
1749 def ParseProcCallArgs(self, start_symbol):
1750 # type: (int) -> ArgList
1751 """ json write (x) """
1752 self.lexer.MaybeUnreadOne()
1753
1754 arg_list = ArgList.CreateNull(alloc_lists=True)
1755 arg_list.left = self.cur_token
1756 self.parse_ctx.ParseProcCallArgs(self.lexer, arg_list, start_symbol)
1757 return arg_list
1758
1759 def _MaybeReadWordPart(self, is_first, lex_mode, parts):
1760 # type: (bool, lex_mode_t, List[word_part_t]) -> bool
1761 """Helper for _ReadCompoundWord3."""
1762 done = False
1763
1764 if self.token_type == Id.Lit_EscapedChar:
1765 tok = self.cur_token
1766 assert tok.length == 2
1767 ch = lexer.TokenSliceLeft(tok, 1)
1768 if self.parse_opts.no_parse_backslash():
1769 if not pyutil.IsValidCharEscape(ch):
1770 p_die('Invalid char escape in unquoted word (OILS-ERR-13)',
1771 self.cur_token)
1772
1773 part = word_part.EscapedLiteral(self.cur_token,
1774 ch) # type: word_part_t
1775 else:
1776 part = self.cur_token
1777
1778 if is_first and self.token_type == Id.Lit_VarLike: # foo=
1779 parts.append(part)
1780 # Unfortunately it's awkward to pull the check for a=(1 2) up to
1781 # _ReadWord.
1782 next_id = self.lexer.LookPastSpace(lex_mode)
1783 if next_id == Id.Op_LParen:
1784 self.lexer.PushHint(Id.Op_RParen, Id.Right_Initializer)
1785 part2 = self._ReadArrayLiteral()
1786 parts.append(part2)
1787
1788 # Array literal must be the last part of the word.
1789 self._SetNext(lex_mode)
1790 self._GetToken()
1791 # EOF, whitespace, newline, Right_Subshell
1792 if self.token_kind not in KINDS_THAT_END_WORDS:
1793 p_die('Unexpected token after array literal',
1794 self.cur_token)
1795 done = True
1796
1797 elif (is_first and self.parse_opts.parse_at() and
1798 self.token_type == Id.Lit_Splice):
1799
1800 splice_tok = self.cur_token
1801 part2 = word_part.Splice(splice_tok,
1802 lexer.TokenSliceLeft(splice_tok, 1))
1803
1804 parts.append(part2)
1805
1806 # @words must be the last part of the word
1807 self._SetNext(lex_mode)
1808 self._GetToken()
1809 # EOF, whitespace, newline, Right_Subshell
1810 if self.token_kind not in KINDS_THAT_END_WORDS:
1811 p_die('Unexpected token after array splice', self.cur_token)
1812 done = True
1813
1814 elif (is_first and self.parse_opts.parse_at() and
1815 self.token_type == Id.Lit_AtLBracket): # @[split(x)]
1816 part2 = self._ReadExprSub(lex_mode_e.DQ)
1817 parts.append(part2)
1818
1819 # @[split(x)]
1820 self._SetNext(lex_mode)
1821 self._GetToken()
1822 # EOF, whitespace, newline, Right_Subshell
1823 if self.token_kind not in KINDS_THAT_END_WORDS:
1824 p_die('Unexpected token after Expr splice', self.cur_token)
1825 done = True
1826
1827 elif (is_first and self.parse_opts.parse_at() and
1828 self.token_type == Id.Lit_AtLBraceDot):
1829 p_die('TODO: @{.myproc builtin sub}', self.cur_token)
1830
1831 elif (is_first and self.parse_opts.parse_at_all() and
1832 self.token_type == Id.Lit_At):
1833 # Because $[x] ${x} and perhaps $/x/ are reserved, it makes sense for @
1834 # at the beginning of a word to be reserved.
1835
1836 # Although should we relax 'echo @' ? I'm tempted to have a shortcut for
1837 # @_argv and the like
1838 p_die('Literal @ starting a word must be quoted (parse_at_all)',
1839 self.cur_token)
1840
1841 else:
1842 # not a literal with lookahead; append it
1843 parts.append(part)
1844
1845 return done
1846
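# --- Hedged standalone sketch of the "must end the word" rule ---
# Array literals, @splices, and @[expr] subs have to be the LAST part
# of a word: the next token's kind must be in KINDS_THAT_END_WORDS.
# Kind names are plain strings here, for illustration only.
def ends_word(token_kind):
    # type: (str) -> bool
    return token_kind in ('Eof', 'WS', 'Op', 'Right')

assert ends_word('WS')       # a=(1 2) followed by a space: OK
assert not ends_word('Lit')  # a=(1 2)x: 'Unexpected token after array literal'
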
1847 def _ReadCompoundWord(self, lex_mode):
1848 # type: (lex_mode_t) -> CompoundWord
1849
1850 # ShCommand is the ONLY lexer mode that can return word.Redir, so it's excluded here
1851 assert lex_mode != lex_mode_e.ShCommand, lex_mode
1852
1853 w = self._ReadCompoundOrRedir(lex_mode)
1854 assert w.tag() == word_e.Compound, w
1855 return cast(CompoundWord, w)
1856
1857 def _ReadCompoundWord3(self, lex_mode, eof_type, empty_ok):
1858 # type: (lex_mode_t, Id_t, bool) -> CompoundWord
1859
1860 # ShCommand is the ONLY lexer mode that can return word.Redir, so it's excluded here
1861 assert lex_mode != lex_mode_e.ShCommand, lex_mode
1862
1863 w = self._ReadCompoundOrRedir3(lex_mode, eof_type, empty_ok)
1864 assert w.tag() == word_e.Compound, w
1865 return cast(CompoundWord, w)
1866
1867 def _ReadCompoundOrRedir(self, lex_mode):
1868 # type: (lex_mode_t) -> word_t
1869 """Returns either word.Compound or word.Redir"""
1870 return self._ReadCompoundOrRedir3(lex_mode, Id.Undefined_Tok, True)
1871
1872 def _ReadCompoundOrRedir3(self, lex_mode, eof_type, empty_ok):
1873 # type: (lex_mode_t, Id_t, bool) -> word_t
1874 """
1875 Precondition: Looking at the first token of the first word part
1876 Postcondition: Looking at the token after, e.g. space or operator
1877
1878 NOTE: eof_type is necessary because / is a literal, i.e. Lit_Slash, but it
1879 could be an operator delimiting a compound word. Can we change lexer modes
1880 and remove this special case?
1881
1882 Returns either word.Compound or word.Redir
1883 """
1884 w = CompoundWord([])
1885 num_parts = 0
1886 brace_count = 0
1887 done = False
1888 is_triple_quoted = None # type: Optional[BoolParamBox]
1889 saw_redir_left_tok = False
1890
1891 while not done:
1892 self._GetToken()
1893
1894 allow_done = empty_ok or num_parts != 0
1895 if allow_done and self.token_type == eof_type:
1896 done = True # e.g. for ${foo//pat/replace}
1897
1898 # Keywords like "for" are treated like literals
1899 elif self.token_kind in (Kind.Lit, Kind.History, Kind.KW,
1900 Kind.ControlFlow, Kind.BoolUnary,
1901 Kind.BoolBinary):
1902
1903 # Syntax error for { and }
1904 if self.token_type == Id.Lit_LBrace:
1905 brace_count += 1
1906 elif self.token_type == Id.Lit_RBrace:
1907 brace_count -= 1
1908 elif self.token_type == Id.Lit_Dollar:
1909 if self.parse_opts.no_parse_dollar():
1910 if num_parts == 0 and lex_mode == lex_mode_e.ShCommand:
1911 next_byte = self.lexer.ByteLookAhead()
1912 # TODO: switch lexer modes and parse $/d+/. But not ${a:-$/d+/}
1913 if next_byte == '/':
1914 #log('next_byte %r', next_byte)
1915 pass
1916
1917 p_die(
1918 'Literal $ should be quoted like \$ (no_parse_dollar)',
1919 self.cur_token)
1920 elif self.token_type in (Id.Lit_Number, Id.Lit_RedirVarName):
1921 saw_redir_left_tok = True
1922
1923 done = self._MaybeReadWordPart(num_parts == 0, lex_mode,
1924 w.parts)
1925
1926 elif self.token_kind == Kind.VSub:
1927 vsub_token = self.cur_token
1928
1929 part = SimpleVarSub(vsub_token) # type: word_part_t
1930 w.parts.append(part)
1931
1932 elif self.token_kind == Kind.ExtGlob:
1933 # If parse_at, we can take over @( to start @(seq 3)
1934 # Users can also use ,(*.py|*.sh)
1935 if (self.parse_opts.parse_at() and
1936 self.token_type == Id.ExtGlob_At and num_parts == 0):
1937 cs_part = self._ReadCommandSub(Id.Left_AtParen,
1938 d_quoted=False)
1939 # RARE mutation of tok.id!
1940 cs_part.left_token.id = Id.Left_AtParen
1941 part = cs_part # for type safety
1942
1943 # Same check as _MaybeReadWordPart. @(seq 3)x is illegal, just like
1944 # a=(one two)x and @arrayfunc(3)x.
1945 self._GetToken()
1946 if self.token_kind not in KINDS_THAT_END_WORDS:
1947 p_die('Unexpected token after @()', self.cur_token)
1948 done = True
1949
1950 else:
1951 if HAVE_FNM_EXTMATCH == 0:
1952 p_die(
1953 "Extended glob won't work without FNM_EXTMATCH support in libc",
1954 self.cur_token)
1955 part = self._ReadExtGlob()
1956 w.parts.append(part)
1957
1958 elif self.token_kind == Kind.BashRegex:
1959 if self.token_type == Id.BashRegex_LParen: # Opening (
1960 part = self._ReadBashRegexGroup()
1961 w.parts.append(part)
1962 else:
1963 assert self.token_type == Id.BashRegex_AllowedInParens
1964 p_die('Invalid token in bash regex', self.cur_token)
1965
1966 elif self.token_kind == Kind.Left:
1967 try_triple_quote = (self.parse_opts.parse_triple_quote() and
1968 lex_mode == lex_mode_e.ShCommand and
1969 num_parts == 0)
1970
1971 # Save allocation
1972 if try_triple_quote:
1973 is_triple_quoted = BoolParamBox(False)
1974
1975 part = self._ReadUnquotedLeftParts(is_triple_quoted)
1976 w.parts.append(part)
1977
1978 # NOT done yet, will advance below
1979 elif self.token_kind == Kind.Right:
1980 # Still part of the word; will be done on the next iter.
1981 if self.token_type == Id.Right_DoubleQuote:
1982 pass
1983 # Never happens, no PushHint for this case.
1984 #elif self.token_type == Id.Right_DollarParen:
1985 # pass
1986 elif self.token_type == Id.Right_Subshell:
1987 # LEXER HACK for (case x in x) ;; esac )
1988 # Rewind before it's used
1989 assert self.next_lex_mode == lex_mode_e.Undefined
1990 if self.lexer.MaybeUnreadOne():
1991 self.lexer.PushHint(Id.Op_RParen, Id.Right_Subshell)
1992 self._SetNext(lex_mode)
1993 done = True
1994 else:
1995 done = True
1996
1997 elif self.token_kind == Kind.Redir:
1998 # If the previous token could be the left_tok of a redirect
1999 # operator, attach it to a word.Redir and return that instead
2000 # of the CompoundWord.
2001
2002 # &> and &>> don't have a leading descriptor (2 is implied)
2003 if (saw_redir_left_tok and num_parts == 1 and self.token_type
2004 not in (Id.Redir_AndGreat, Id.Redir_AndDGreat)):
2005
2006 self._SetNext(lex_mode)
2007 left_tok = cast(Token, w.parts.pop())
2008 r = word.Redir(left_tok, self.cur_token)
2009 return r # EARLY RETURN
2010
2011 done = True
2012
2013 elif self.token_kind == Kind.Ignored:
2014 done = True
2015
2016 else:
2017 # LEXER HACK for unbalanced case clause. 'case foo in esac' is valid,
2018 # so while testing for ESAC, we may read ) before getting a chance to
2019 # PushHint(Id.Op_RParen, Id.Right_CasePat). So here we unread one
2020 # token and do it again.
2021
2022 # We get Id.Op_RParen at top level: case x in x) ;; esac
2023 # We get Id.Eof_RParen inside ComSub: $(case x in x) ;; esac )
2024 if self.token_type in (Id.Op_RParen, Id.Eof_RParen):
2025 # Rewind before it's used
2026 assert self.next_lex_mode == lex_mode_e.Undefined
2027 if self.lexer.MaybeUnreadOne():
2028 if self.token_type == Id.Eof_RParen:
2029 # Redo translation
2030 self.lexer.PushHint(Id.Op_RParen, Id.Eof_RParen)
2031 self._SetNext(lex_mode)
2032
2033 done = True # anything we don't recognize means we're done
2034
2035 if not done:
2036 self._SetNext(lex_mode)
2037 num_parts += 1
2038
2039 if (self.parse_opts.parse_brace() and num_parts > 1 and
2040 brace_count != 0):
2041 # accept { and }, but not foo{
2042 p_die(
2043 'Word has unbalanced { }. Maybe add a space or quote it like \{',
2044 loc.Word(w))
2045
2046 if is_triple_quoted and is_triple_quoted.b and num_parts > 1:
2047 p_die('Unexpected parts after triple quoted string',
2048 loc.WordPart(w.parts[-1]))
2049
2050 if 0:
2051 from _devbuild.gen.syntax_asdl import word_part_str
2052 word_key = ' '.join(word_part_str(p.tag()) for p in w.parts)
2053 WORD_HIST[word_key] += 1
2054
2055 # YSH word restriction
2056 # (r'' u'' b'' are stripped on shopt -s parse_ysh_string)
2057 if self.parse_opts.no_parse_word_join() and not _IsValidYshWord(w):
2058 p_die("Invalid quoted word part in YSH (OILS-ERR-17)",
2059 loc.WordPart(part))
2060
2061 return w
2062
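# --- Hedged standalone sketch of the early-return Redir case ---
# A word of exactly one part, where that part was a plausible left
# token (Lit_Number or Lit_RedirVarName), attaches to the redirect
# operator: 3>out or {fd}>out. The operators &> and &>> never take a
# prefix, since their descriptors are implied. Plain strings stand in
# for tokens.
def attaches_to_redir(num_parts, saw_left_tok, op):
    # type: (int, bool, str) -> bool
    return saw_left_tok and num_parts == 1 and op not in ('&>', '&>>')

assert attaches_to_redir(1, True, '>')       # 3>out -> word.Redir
assert not attaches_to_redir(1, True, '&>')  # &>out keeps implied fds
assert not attaches_to_redir(2, True, '>')   # multi-part word stays Compound
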
2063 def _ReadArithWord(self):
2064 # type: () -> Optional[word_t]
2065 """ Helper for ReadArithWord() """
2066 self._GetToken()
2067
2068 if self.token_kind == Kind.Unknown:
2069 # e.g. happened during dynamic parsing of unset 'a[$foo]' in gherkin
2070 p_die(
2071 'Unexpected token while parsing arithmetic: %r' %
2072 lexer.TokenVal(self.cur_token), self.cur_token)
2073
2074 elif self.token_kind == Kind.Eof:
2075 return self.cur_token
2076
2077 elif self.token_kind == Kind.Ignored:
2078 # Space should be ignored.
2079 self._SetNext(lex_mode_e.Arith)
2080 return None
2081
2082 elif self.token_kind in (Kind.Arith, Kind.Right):
2083 # Id.Right_DollarDParen IS just a normal token, handled by ArithParser
2084 self._SetNext(lex_mode_e.Arith)
2085 return self.cur_token
2086
2087 elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub):
2088 return self._ReadCompoundWord(lex_mode_e.Arith)
2089
2090 else:
2091 raise AssertionError(self.cur_token)
2092
2093 def _ReadWord(self, word_mode):
2094 # type: (lex_mode_t) -> Optional[word_t]
2095 """Helper function for ReadWord()."""
2096
2097 # Change the pseudo lexer mode to a real lexer mode
2098 if word_mode == lex_mode_e.ShCommandFakeBrack:
2099 lex_mode = lex_mode_e.ShCommand
2100 else:
2101 lex_mode = word_mode
2102
2103 self._GetToken()
2104
2105 if self.token_kind == Kind.Eof:
2106 # No advance
2107 return self.cur_token
2108
2109 elif self.token_kind == Kind.Redir:
2110 self._SetNext(lex_mode)
2111 # This is >out -- 3>out is handled below
2112 return word.Redir(None, self.cur_token)
2113
2114 # Allow Arith for ) at end of for loop?
2115 elif self.token_kind in (Kind.Op, Kind.Arith):
2116 self._SetNext(lex_mode)
2117
2118 # Newlines are complicated. See 3x2 matrix in the comment about
2119 # self.multiline and self.newline_state above.
2120 if self.token_type == Id.Op_Newline:
2121 if self.multiline:
2122 if self.newline_state > 1:
2123 # This points at a blank line, but at least it gives the line number
2124 p_die('Invalid blank line in multiline mode',
2125 self.cur_token)
2126 return None
2127
2128 if self.returned_newline: # skip
2129 return None
2130
2131 return self.cur_token
2132
2133 elif self.token_kind == Kind.Right:
2134 if self.token_type not in (Id.Right_Subshell, Id.Right_ShFunction,
2135 Id.Right_CasePat, Id.Right_Initializer):
2136 raise AssertionError(self.cur_token)
2137
2138 self._SetNext(lex_mode)
2139 return self.cur_token
2140
2141 elif self.token_kind in (Kind.Ignored, Kind.WS):
2142 self._SetNext(lex_mode)
2143 return None
2144
2145 else:
2146 assert self.token_kind in (Kind.VSub, Kind.Lit, Kind.History,
2147 Kind.Left, Kind.KW, Kind.ControlFlow,
2148 Kind.BoolUnary, Kind.BoolBinary,
2149 Kind.ExtGlob,
2150 Kind.BashRegex), 'Unhandled token kind'
2151
2152 if (word_mode == lex_mode_e.ShCommandFakeBrack and
2153 self.parse_opts.parse_bracket() and
2154 self.token_type == Id.Lit_LBracket):
2155 # Change [ from Kind.Lit -> Kind.Op
2156 # So CommandParser can treat
2157 # assert [42 === x]
2158 # like
2159 # json write (x)
2160 bracket_word = self.cur_token
2161 bracket_word.id = Id.Op_LBracket
2162
2163 self._SetNext(lex_mode)
2164 return bracket_word
2165
2166 # We're beginning a word. If we see Id.Lit_Pound, change to
2167 # lex_mode_e.Comment and read until end of line.
2168 if self.token_type == Id.Lit_Pound:
2169 self._SetNext(lex_mode_e.Comment)
2170 self._GetToken()
2171
2172 # NOTE: The # could be the last character in the file. It can't be
2173 # Eof_{RParen,Backtick} because #) and #` are comments.
2174 assert self.token_type in (Id.Ignored_Comment, Id.Eof_Real), \
2175 self.cur_token
2176
2177 # The next iteration will go into Kind.Ignored and set lex state to
2178 # lex_mode_e.ShCommand/etc.
2179 return None # tell ReadWord() to try again after comment
2180
2181 elif self.token_type == Id.Lit_TPound: ### doc comment
2182 self._SetNext(lex_mode_e.Comment)
2183 self._GetToken()
2184
2185 if self.token_type == Id.Ignored_Comment and self.emit_doc_token:
2186 return self.cur_token
2187
2188 return None # tell ReadWord() to try again after comment
2189
2190 else:
2191 # r'' u'' b'' at the beginning of a word
2192 if (self.token_type == Id.Lit_Chars and
2193 self.lexer.LookAheadOne(
2194 lex_mode_e.ShCommand) == Id.Left_SingleQuote):
2195
2196 # When shopt -s parse_ysh_string:
2197 # echo r'hi' is like echo 'hi'
2198 #
2199 # echo u'\u{3bc}' b'\yff' works
2200
2201 tok = self.cur_token
2202 if self.parse_opts.parse_ysh_string():
2203 if lexer.TokenEquals(tok, 'r'):
2204 left_id = Id.Left_RSingleQuote
2205 elif lexer.TokenEquals(tok, 'u'):
2206 left_id = Id.Left_USingleQuote
2207 elif lexer.TokenEquals(tok, 'b'):
2208 left_id = Id.Left_BSingleQuote
2209 else:
2210 left_id = Id.Undefined_Tok
2211
2212 if left_id != Id.Undefined_Tok:
2213 # skip the r, and then 'foo' will be read as normal
2214 self._SetNext(lex_mode_e.ShCommand)
2215
2216 self._GetToken()
2217 assert self.token_type == Id.Left_SingleQuote, self.token_type
2218
2219 # Read the word in a different lexer mode
2220 return self._ReadYshSingleQuoted(left_id)
2221
2222 return self._ReadCompoundOrRedir(lex_mode)
2223
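# --- Hedged standalone sketch of the r'' u'' b'' dispatch ---
# With shopt -s parse_ysh_string, a Lit_Chars token of r, u, or b that
# is immediately followed by a single quote selects a YSH string mode.
# The dict stands in for the if/elif chain; values name the real Ids.
YSH_STRING_PREFIX = {
    'r': 'Id.Left_RSingleQuote',  # r'hi' is like 'hi'
    'u': 'Id.Left_USingleQuote',  # u'\u{3bc}'
    'b': 'Id.Left_BSingleQuote',  # b'\yff'
}

def ysh_left_id(prefix):
    # type: (str) -> str
    return YSH_STRING_PREFIX.get(prefix, 'Id.Undefined_Tok')

assert ysh_left_id('r') == 'Id.Left_RSingleQuote'
assert ysh_left_id('z') == 'Id.Undefined_Tok'  # not a string prefix
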
2224 def ParseVarRef(self):
2225 # type: () -> BracedVarSub
2226 """DYNAMIC parsing of what's inside ${!ref}
2227
2228 # Same as VarOf production
2229 VarRefExpr = VarOf EOF
2230 """
2231 self._SetNext(lex_mode_e.VSub_1)
2232
2233 self._GetToken()
2234 if self.token_kind != Kind.VSub:
2235 p_die('Expected var name', self.cur_token)
2236
2237 part = self._ParseVarOf()
2238 # NOTE: no ${ } means no part.left and part.right
2239 part.left = part.name_tok # cheat to make test pass
2240 part.right = part.name_tok
2241
2242 self._GetToken()
2243 if self.token_type != Id.Eof_Real:
2244 p_die('Expected end of var ref expression', self.cur_token)
2245 return part
2246
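# Hedged usage note for ParseVarRef: bash-style ${!ref} takes the VALUE
# of ref and re-parses it as a variable reference at runtime, e.g.
#   ref='x[1]'; echo ${!ref}   # behaves like echo ${x[1]}
# which is why this is DYNAMIC parsing: one VarOf production, then
# Eof_Real must follow.
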
2247 def LookPastSpace(self):
2248 # type: () -> Id_t
2249 """Look ahead to the next token.
2250
2251 For the CommandParser to recognize
2252 array= (1 2 3)
2253 YSH for ( versus bash for ((
2254 YSH if ( versus if test
2255 YSH while ( versus while test
2256 YSH bare assignment 'grep =' versus 'grep foo'
2257 """
2258 assert self.token_type != Id.Undefined_Tok
2259 if self.cur_token.id == Id.WS_Space:
2260 id_ = self.lexer.LookPastSpace(lex_mode_e.ShCommand)
2261 else:
2262 id_ = self.cur_token.id
2263 return id_
2264
2265 def LookAheadDParens(self, shift_back=0):
2266 # type: (int) -> bool
2267 """Special lookahead for (( )), to make sure it's an arithmetic
2268 expression (i.e. that the closing parens are a single token, not
2269 separated by anything).
2270 """
2271 assert self.token_type in (Id.Op_DLeftParen, Id.Left_DollarDParen)
2272
2273 return self.lexer.LookAheadDParens(shift_back)
2274
2275 def LookAheadFuncParens(self):
2276 # type: () -> bool
2277 """Special lookahead for f( ) { echo hi; } to check for ( )"""
2278 assert self.token_type != Id.Undefined_Tok
2279
2280 # We have to handle 2 cases because we buffer a token
2281 if self.cur_token.id == Id.Op_LParen: # saw funcname(
2282 return self.lexer.LookAheadFuncParens(1) # go back one char
2283
2284 elif self.cur_token.id == Id.WS_Space: # saw funcname WHITESPACE
2285 return self.lexer.LookAheadFuncParens(0)
2286
2287 else:
2288 return False
2289
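# --- Hedged standalone sketch of the two buffered-token cases ---
# The parser always holds one token, so lookahead for `f( ) {` starts
# either one char back (the '(' was already consumed) or at the current
# position (we stopped on whitespace). Offsets are illustrative.
def func_parens_offset(buffered_id):
    # type: (str) -> int
    if buffered_id == 'Op_LParen':  # saw funcname(
        return 1  # go back one char
    if buffered_id == 'WS_Space':  # saw funcname SPACE
        return 0
    return -1  # not a shell function definition

assert func_parens_offset('Op_LParen') == 1
assert func_parens_offset('WS_Space') == 0
assert func_parens_offset('Lit_Chars') == -1
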
2290 def ReadWord(self, word_mode):
2291 # type: (lex_mode_t) -> word_t
2292 """Read the next word, using the given lexer mode.
2293
2294 This is a stateful wrapper for the stateless _ReadWord function.
2295 """
2296 assert word_mode in (lex_mode_e.ShCommand,
2297 lex_mode_e.ShCommandFakeBrack,
2298 lex_mode_e.DBracket, lex_mode_e.BashRegex)
2299
2300 if self.buffered_word: # For integration with pgen2
2301 w = self.buffered_word
2302 self.buffered_word = None
2303 else:
2304 while True:
2305 w = self._ReadWord(word_mode)
2306 if w is not None:
2307 break
2308
2309 self.returned_newline = (word_.CommandId(w) == Id.Op_Newline)
2310 return w
2311
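# --- Hedged standalone sketch of the retry loop pattern ---
# _ReadWord and _ReadArithWord return None for tokens that yield no
# word (comments, ignored whitespace, skipped newlines), so the public
# wrappers spin until a real word comes back. Generic illustration:
def read_until_word(read_one):
    # type: (...) -> object
    while True:
        w = read_one()
        if w is not None:
            return w

toks = iter([None, None, 'echo'])
assert read_until_word(lambda: next(toks)) == 'echo'
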
2312 def ReadArithWord(self):
2313 # type: () -> word_t
2314 while True:
2315 w = self._ReadArithWord()
2316 if w is not None:
2317 break
2318 return w
2319
2320 def ReadHereDocBody(self, parts):
2321 # type: (List[word_part_t]) -> None
2322 """
2323 A here doc is like a double quoted context, except " isn't special.
2324 """
2325 self._ReadLikeDQ(None, False, parts)
2326 # Returns nothing
2327
2328 def ReadForPlugin(self):
2329 # type: () -> CompoundWord
2330 """For $PS1, $PS4, etc.
2331
2332 This is just like reading a here doc line. "\n" is allowed, as
2333 well as the typical substitutions ${x} $(echo hi) $((1 + 2)).
2334 """
2335 w = CompoundWord([])
2336 self._ReadLikeDQ(None, False, w.parts)
2337 return w
2338
2339 def EmitDocToken(self, b):
2340 # type: (bool) -> None
2341 self.emit_doc_token = b
2342
2343 def Multiline(self, b):
2344 # type: (bool) -> None
2345 self.multiline = b
2346
2347
2348if 0:
2349 import collections
2350 WORD_HIST = collections.Counter()
2351
2352# vim: sw=4