OILS / osh / word_parse.py

# Copyright 2016 Andy Chu. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
"""
word_parse.py - Parse the shell word language.

Hairy example:

    hi$((1 + 2))"$(echo hi)"${var:-__"$(echo default)"__}

Substitutions can be nested, but which inner subs are allowed depends on the
outer sub.  Notes:

lex_mode_e.ShCommand (_ReadUnquotedLeftParts)
  All subs and quotes are allowed:
    $v ${v}   $() ``   $(())   '' ""   $'' $""   <() >()

lex_mode_e.DQ (_ReadDoubleQuotedLeftParts)
  Var, Command, Arith, but no quotes:
    $v ${v}   $() ``   $(())
  No process substitution.

lex_mode_e.Arith
  Similar to DQ: Var, Command, and Arith sub, but no process sub.  bash
  doesn't allow quotes, but OSH does.  We allow ALL FOUR kinds of quotes,
  because we need those for associative array indexing.

lex_mode_e.VSub_ArgUnquoted
  Like ShCommand, everything is allowed (even process substitutions), but we
  stop at }, and space is SIGNIFICANT.

  Example: ${a:- b }

  ${X:-$v}  ${X:-${v}}  ${X:-$(echo hi)}  ${X:-`echo hi`}  ${X:-$((1+2))}
  ${X:-'single'}  ${X:-"double"}  ${X:-$'\n'}  ${X:-<(echo hi)}

lex_mode_e.VSub_ArgDQ
  In contrast to DQ, VS_ARG_DQ accepts nested "" and $'' and $"", e.g.
  "${x:-"default"}".

  In contrast, VSub_ArgUnquoted respects single quotes and process
  substitution.

  It's weird that double quotes are allowed.  Space is also significant here,
  e.g. "${x:-a "b"}".
"""

from _devbuild.gen import grammar_nt
from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str, Kind
from _devbuild.gen.types_asdl import (lex_mode_t, lex_mode_e)
from _devbuild.gen.syntax_asdl import (
    BoolParamBox,
    Token,
    SimpleVarSub,
    loc,
    source,
    DoubleQuoted,
    SingleQuoted,
    BracedVarSub,
    CommandSub,
    ShArrayLiteral,
    AssocPair,
    bracket_op,
    bracket_op_t,
    suffix_op,
    suffix_op_t,
    rhs_word,
    rhs_word_e,
    rhs_word_t,
    word_e,
    word_t,
    CompoundWord,
    word_part,
    word_part_t,
    y_lhs_e,
    arith_expr_t,
    command,
    expr,
    expr_e,
    expr_t,
    pat_t,
    ArgList,
    Proc,
    Func,
    Subscript,
    Attribute,
    arith_expr,
)
from core import alloc
from core.error import p_die
from mycpp.mylib import log
from core import pyutil
from display import ui
from frontend import consts
from frontend import lexer
from frontend import reader
from osh import tdop
from osh import arith_parse
from osh import braces
from osh import word_
from osh import word_compile
from mycpp.mylib import tagswitch

from libc import HAVE_FNM_EXTMATCH

from typing import List, Optional, Tuple, cast
from typing import TYPE_CHECKING
if TYPE_CHECKING:
    from frontend.lexer import Lexer
    from frontend.parse_lib import ParseContext
    from frontend.reader import _Reader
    from osh.cmd_parse import VarChecker

unused1 = log
unused2 = Id_str

KINDS_THAT_END_WORDS = [Kind.Eof, Kind.WS, Kind.Op, Kind.Right]


class WordEmitter(object):
    """Common interface for [ and [["""

    def __init__(self):
        # type: () -> None
        """Empty constructor for mycpp."""
        pass

    def ReadWord(self, lex_mode):
        # type: (lex_mode_t) -> word_t
        raise NotImplementedError()


class WordParser(WordEmitter):

    def __init__(self, parse_ctx, lexer, line_reader):
        # type: (ParseContext, Lexer, _Reader) -> None
        self.parse_ctx = parse_ctx
        self.lexer = lexer
        self.line_reader = line_reader
        self.arena = line_reader.arena

        self.parse_opts = parse_ctx.parse_opts
        self.a_parser = tdop.TdopParser(arith_parse.Spec(), self,
                                        self.parse_opts)
        self.Reset()

    def Init(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Used to parse arithmetic, see ParseContext."""
        self.next_lex_mode = lex_mode

    def Reset(self):
        # type: () -> None
        """Called by the interactive loop."""
        # For _GetToken()
        self.cur_token = None  # type: Token
        self.token_kind = Kind.Undefined
        self.token_type = Id.Undefined_Tok

        self.next_lex_mode = lex_mode_e.ShCommand

        # Boolean mutated by CommandParser via word_.ctx_EmitDocToken, for
        # ### doc comments
        self.emit_doc_token = False
        # Boolean mutated by CommandParser via word_.ctx_Multiline.  '...'
        # starts multiline mode.
        self.multiline = False

        # For detecting invalid \n\n in multiline mode.  Counts what we got
        # directly from the lexer.
        self.newline_state = 0
        # For consolidating \n\n -> \n for the CALLER.  This simplifies the
        # parsers that consume words.
        self.returned_newline = False

        # For integration with pgen2
        self.buffered_word = None  # type: word_t

    def _GetToken(self):
        # type: () -> None
        """Call this when you need to make a decision based on any of:

        self.token_type
        self.token_kind
        self.cur_token
        """
        if self.next_lex_mode == lex_mode_e.Undefined:
            return  # _SetNext() not called, so do nothing

        is_fake = self.next_lex_mode == lex_mode_e.BashRegexFakeInner
        real_mode = (lex_mode_e.BashRegex if is_fake else self.next_lex_mode)

        self.cur_token = self.lexer.Read(real_mode)

        # MUTATE TOKEN for fake lexer mode.
        # This is for crazy stuff bash allows, like [[ s =~ (< >) ]]
        if (is_fake and self.cur_token.id
                in (Id.WS_Space, Id.BashRegex_AllowedInParens)):
            self.cur_token.id = Id.Lit_Chars

        self.token_type = self.cur_token.id
        self.token_kind = consts.GetKind(self.token_type)

        # Number of consecutive newlines, ignoring whitespace
        if self.token_type == Id.Op_Newline:
            self.newline_state += 1
        elif self.token_kind != Kind.WS:
            self.newline_state = 0

        self.parse_ctx.trail.AppendToken(self.cur_token)  # For completion
        self.next_lex_mode = lex_mode_e.Undefined

    def _SetNext(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Set the next lex state, but don't actually read a token.

        We need this for proper interactive parsing.
        """
        self.next_lex_mode = lex_mode
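
    # A note on the protocol above (added commentary, not from the original
    # source): _SetNext() only records the desired lexer mode, and _GetToken()
    # performs the read lazily.  A typical caller does:
    #
    #     self._SetNext(lex_mode_e.Arith)   # decide the mode cheaply
    #     self._GetToken()                  # read only when a decision is needed
    #
    # Deferring the read is what lets the interactive parser stop at the end
    # of a line without pulling in the next line prematurely.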

    def _ReadVarOpArg(self, arg_lex_mode):
        # type: (lex_mode_t) -> rhs_word_t

        # NOTE: Operators like | and < are not treated as special, so
        # ${a:- | >} is valid, even when unquoted.
        self._SetNext(arg_lex_mode)
        self._GetToken()

        w = self._ReadVarOpArg2(arg_lex_mode, Id.Undefined_Tok,
                                True)  # empty_ok

        # If the Compound has no parts, and we're in a double-quoted VarSub
        # arg, and empty_ok, then return Empty.  This is so it can evaluate to
        # the empty string and not get elided.
        #
        # Examples:
        # - "${s:-}", "${s/%pat/}"
        # It's similar to LooksLikeShAssignment where we turn x= into x=''.
        # And it has the same potential problem of not having Token location
        # info.
        #
        # NOTE: empty_ok is False only for the PatSub pattern, which means
        # we'll return a Compound with no parts, which is explicitly checked
        # with a custom error message.
        if len(w.parts) == 0 and arg_lex_mode == lex_mode_e.VSub_ArgDQ:
            return rhs_word.Empty

        return w

    def _ReadVarOpArg2(self, arg_lex_mode, eof_type, empty_ok):
        # type: (lex_mode_t, Id_t, bool) -> CompoundWord
        """Return a CompoundWord.

        Helper function for _ReadVarOpArg, and used directly by
        _ReadPatSubVarOp.
        """
        w = self._ReadCompoundWord3(arg_lex_mode, eof_type, empty_ok)
        #log('w %s', w)
        tilde = word_.TildeDetect(w)
        if tilde:
            w = tilde
        return w

    def _ReadSliceVarOp(self):
        # type: () -> suffix_op.Slice
        """
        Looking at the token after the first ':'

        ArithExpr? (':' ArithExpr? )? '}'
        """
        self._NextNonSpace()

        cur_id = self.token_type

        if cur_id in (Id.Arith_RBrace, Id.Arith_Colon):  # ${a:} or ${a::}
            begin = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            begin = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()  # advance

        if cur_id == Id.Arith_RBrace:  # ${a:1} or ${@:1}
            # No length specified, so it's N
            no_length = None  # type: Optional[arith_expr_t]
            return suffix_op.Slice(begin, no_length)

        elif cur_id == Id.Arith_Colon:  # ${a:1:} or ${@:1:}
            colon_tok = self.cur_token
            self._NextNonSpace()

            if self.token_type == Id.Arith_RBrace:
                # Quirky bash behavior:
                #   ${a:1:} or ${a::} means length ZERO
                #   but ${a:1} or ${a:} means length N
                if self.parse_opts.strict_parse_slice():
                    p_die(
                        "Slice length: Add explicit zero, or omit : for N (strict_parse_slice)",
                        colon_tok)

                length = arith_expr.EmptyZero  # type: arith_expr_t
            else:
                length = self._ReadArithExpr(Id.Arith_RBrace)

            return suffix_op.Slice(begin, length)

        else:
            p_die("Expected : or } in slice", self.cur_token)

        raise AssertionError()  # for MyPy
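
    # Worked examples of the slice quirk handled above (added commentary,
    # derived from the comments in _ReadSliceVarOp; behavior follows bash):
    #
    #     s=abcdef
    #     echo ${s:1}      # => bcdef   (no length: take the rest)
    #     echo ${s:1:2}    # => bc
    #     echo ${s:1:}     # => (empty) (explicit trailing : means length zero)
    #
    # With strict_parse_slice, the last form is a parse error instead.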

    def _ReadPatSubVarOp(self):
        # type: () -> suffix_op.PatSub
        """Looking at the first '/' after VarOf:

        VarSub = ...
               | VarOf '/' Match ( '/' WORD? )?
        Match  = '/' WORD   # can't be empty
               | '#' WORD?  # may be empty
               | '%' WORD?
        """
        slash_tok = self.cur_token  # location info
        replace_mode = Id.Undefined_Tok  # bizarre syntax / # %

        self._SetNext(lex_mode_e.VSub_ArgUnquoted)  # advance past /

        self._GetToken()
        if self.token_type == Id.Right_DollarBrace:
            pat = CompoundWord([])
            return suffix_op.PatSub(pat, rhs_word.Empty, replace_mode,
                                    slash_tok)

        if self.token_type in (Id.Lit_Slash, Id.Lit_Pound, Id.Lit_Percent):
            replace_mode = self.token_type
            self._SetNext(lex_mode_e.VSub_ArgUnquoted)

        # Bash quirk:
        #   echo ${x/#/replace}  has an empty pattern
        #   echo ${x////replace} is non-empty; it means echo ${x//'/'/replace}
        empty_ok = replace_mode != Id.Lit_Slash
        pat = self._ReadVarOpArg2(lex_mode_e.VSub_ArgUnquoted, Id.Lit_Slash,
                                  empty_ok)
        #log('pat 1 %r', pat)

        if self.token_type == Id.Lit_Slash:
            # read until }
            replace = self._ReadVarOpArg(
                lex_mode_e.VSub_ArgUnquoted)  # type: rhs_word_t
            #log('r 1 %r', replace)
        else:
            # e.g. ${v/a} is the same as ${v/a/} -- empty replacement string
            replace = rhs_word.Empty

        self._GetToken()
        if self.token_type != Id.Right_DollarBrace:
            # This happens on invalid code
            p_die(
                "Expected } after replacement string, got %s" %
                ui.PrettyId(self.token_type), self.cur_token)

        return suffix_op.PatSub(pat, replace, replace_mode, slash_tok)
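
    # Pattern-substitution forms accepted above (added commentary, collected
    # from the grammar and the bash-quirk comments in _ReadPatSubVarOp):
    #
    #     x=ababab
    #     echo ${x/ab/X}    # => Xabab  (first match)
    #     echo ${x//ab/X}   # => XXX    (all matches; '/' replace_mode)
    #     echo ${x/#ab/X}   # => Xabab  (anchored at prefix)
    #     echo ${x/%ab/X}   # => ababX  (anchored at suffix)
    #     echo ${x/ab}      # => abab   (missing replacement means empty)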

    def _ReadSubscript(self):
        # type: () -> bracket_op_t
        """ Subscript = '[' ('@' | '*' | ArithExpr) ']' """
        # Lookahead to see if we get @ or *.  Otherwise read a full arithmetic
        # expression.
        next_id = self.lexer.LookPastSpace(lex_mode_e.Arith)
        if next_id in (Id.Lit_At, Id.Arith_Star):
            op = bracket_op.WholeArray(next_id)  # type: bracket_op_t

            self._SetNext(lex_mode_e.Arith)  # skip past [
            self._GetToken()
            self._SetNext(lex_mode_e.Arith)  # skip past @
            self._GetToken()
        else:
            self._SetNext(lex_mode_e.Arith)  # skip past [
            anode = self._ReadArithExpr(Id.Arith_RBracket)
            op = bracket_op.ArrayIndex(anode)

        if self.token_type != Id.Arith_RBracket:  # should be looking at ]
            p_die('Expected ] to close subscript', self.cur_token)

        self._SetNext(lex_mode_e.VSub_2)  # skip past ]
        self._GetToken()  # needed to be in the same spot as no subscript

        return op
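
    # Subscript forms the lookahead above distinguishes (added commentary,
    # based on the grammar in _ReadSubscript):
    #
    #     echo ${a[1+2]}   # ArithExpr index -> bracket_op.ArrayIndex
    #     echo ${a[@]}     # whole array     -> bracket_op.WholeArray
    #     echo ${a[*]}     # whole array, joined when quoted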

    def _ParseVarOf(self):
        # type: () -> BracedVarSub
        """
        VarOf = NAME Subscript?
              | NUMBER     # no subscript allowed, none of these are arrays
                           # ${@[1]} doesn't work, even though slicing does
              | VarSymbol
        """
        self._GetToken()
        name_token = self.cur_token
        self._SetNext(lex_mode_e.VSub_2)

        self._GetToken()  # Check for []
        if self.token_type == Id.VOp2_LBracket:
            bracket_op = self._ReadSubscript()
        else:
            bracket_op = None

        part = BracedVarSub.CreateNull()
        part.name_tok = name_token
        part.var_name = lexer.TokenVal(name_token)
        part.bracket_op = bracket_op
        return part

    def _ParseVarExpr(self, arg_lex_mode, allow_query=False):
        # type: (lex_mode_t, bool) -> BracedVarSub
        """Start parsing at the op -- we already skipped past the name."""
        part = self._ParseVarOf()

        self._GetToken()
        if self.token_type == Id.Right_DollarBrace:
            return part  # no ops

        op_kind = self.token_kind

        if op_kind == Kind.VTest:
            tok = self.cur_token
            arg_word = self._ReadVarOpArg(arg_lex_mode)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            part.suffix_op = suffix_op.Unary(tok, arg_word)

        elif op_kind == Kind.VOpYsh:
            tok = self.cur_token
            arg_word = self._ReadVarOpArg(arg_lex_mode)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            UP_arg_word = arg_word
            with tagswitch(arg_word) as case:
                if case(rhs_word_e.Empty):
                    pass
                elif case(rhs_word_e.Compound):
                    arg_word = cast(CompoundWord, UP_arg_word)
                    # This handles ${x|html} and ${x %.3f} now.
                    # However I think ${x %.3f} should be statically parsed?
                    # It can enter the printf lexer modes.
                    ok, arg, quoted = word_.StaticEval(arg_word)
                    if not ok or quoted:
                        p_die('Expected a constant argument',
                              loc.Word(arg_word))

            part.suffix_op = suffix_op.Static(tok, arg)

        elif op_kind == Kind.VOp0:
            part.suffix_op = self.cur_token  # Nullary
            self._SetNext(lex_mode_e.VSub_2)  # Expecting }
            self._GetToken()

        elif op_kind == Kind.VOp1:  # % %% # ## etc.
            tok = self.cur_token
            # Weird exception that all shells have: these operators take a
            # glob pattern, so they're lexed as VSub_ArgUnquoted, not
            # VSub_ArgDQ
            arg_word = self._ReadVarOpArg(lex_mode_e.VSub_ArgUnquoted)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            part.suffix_op = suffix_op.Unary(tok, arg_word)

        elif op_kind == Kind.VOp2:  # / : [ ]
            if self.token_type == Id.VOp2_Slash:
                patsub_op = self._ReadPatSubVarOp()  # type: suffix_op_t
                part.suffix_op = patsub_op

                # Checked by the method above
                assert self.token_type == Id.Right_DollarBrace, self.cur_token

            elif self.token_type == Id.VOp2_Colon:
                part.suffix_op = self._ReadSliceVarOp()
                # NOTE: } in arithmetic mode.
                if self.token_type != Id.Arith_RBrace:
                    # Token seems off; doesn't point to X in ${a:1:2 X
                    p_die('Expected } to close ${', self.cur_token)

            else:
                # TODO: Does this ever happen?
                p_die('Unexpected token in ${} (%s)' % 'VOp2', self.cur_token)

        elif op_kind == Kind.VOp3:  # ${prefix@} etc.
            if allow_query:
                part.suffix_op = self.cur_token  # Nullary
                self._SetNext(lex_mode_e.VSub_2)  # Expecting }
                self._GetToken()
            else:
                p_die("Unexpected token in ${} (%s)" % 'VOp3', self.cur_token)

        # NOTE: Arith_RBrace is for slicing, because it reads } in arithmetic
        # mode.  It's redundantly checked above.
        if self.token_type not in (Id.Right_DollarBrace, Id.Arith_RBrace):
            # ${a.} or ${!a.}
            p_die('Expected } to close ${', self.cur_token)

        # Now look for ops
        return part

    def _ReadZshVarSub(self, left_token):
        # type: (Token) -> word_part.ZshVarSub

        self._SetNext(lex_mode_e.VSub_Zsh)  # Move past ${(foo)

        # Can be empty
        w = self._ReadCompoundWord3(lex_mode_e.VSub_Zsh, Id.Right_DollarBrace,
                                    True)
        self._GetToken()
        return word_part.ZshVarSub(left_token, w, self.cur_token)

    def ReadBracedVarSub(self, left_token):
        # type: (Token) -> Tuple[BracedVarSub, Token]
        """For YSH expressions like var x = ${x:-"default"}."""
        part = self._ReadBracedVarSub(left_token, d_quoted=False)
        last_token = self.cur_token
        return part, last_token

    def _ReadBracedVarSub(self, left_token, d_quoted):
        # type: (Token, bool) -> BracedVarSub
        """For the ${} expression language.

        NAME       = [a-zA-Z_][a-zA-Z0-9_]*
        NUMBER     = [0-9]+                 # ${10}, ${11}, ...

        Subscript  = '[' ('@' | '*' | ArithExpr) ']'
        VarSymbol  = '!' | '@' | '#' | ...
        VarOf      = NAME Subscript?
                   | NUMBER     # no subscript allowed, none of these are
                                # arrays; ${@[1]} doesn't work, even though
                                # slicing does
                   | VarSymbol

        NULLARY_OP = '@Q' | '@E' | '@P' | '@A' | '@a'  # VOp0

        TEST_OP    = '-' | ':-' | '=' | ':=' | '+' | ':+' | '?' | ':?'
        STRIP_OP   = '#' | '##' | '%' | '%%'
        CASE_OP    = ',' | ',,' | '^' | '^^'
        UnaryOp    = TEST_OP | STRIP_OP | CASE_OP

        YSH_UNARY  = '|' | ' '              # ${x|html} and ${x %.3f};
                                            # SPACE is an operator, not %
        Match      = ('/' | '#' | '%') WORD # match all / prefix / suffix
        VarExpr    = VarOf
                   | VarOf NULLARY_OP
                   | VarOf UnaryOp WORD
                   | VarOf YSH_UNARY STATIC_WORD
                   | VarOf ':' ArithExpr (':' ArithExpr )?
                   | VarOf '/' Match '/' WORD

        LengthExpr = '#' VarOf   # can't apply operators after length

        RefOrKeys  = '!' VarExpr # CAN apply operators after a named ref
                                 # ${!ref[0]} vs ${!keys[@]} resolved later

        PrefixQuery = '!' NAME ('*' | '@')  # list variable names with a prefix

        BuiltinSub = '.' WORD+   # ${.myproc 'builtin' $sub}

        VarSub = LengthExpr
               | RefOrKeys
               | PrefixQuery
               | VarExpr
               | BuiltinSub

        NOTES:
        - Arithmetic expressions are used twice, inside subscripts ${a[x+1]}
          and slicing ${a:x+1:y+2}
        - ${#} and ${!} need LL(2) lookahead (considering how my tokenizer
          works)
        - @ and * are technically arithmetic expressions in this implementation
        - We don't account for bash 4.4: ${param@operator} -- Q E P A a.  Note
          that it's also vectorized.

        Strictness over bash:
        - echo ${a[0][0]} doesn't do anything useful, so we disallow it from
          the grammar
        - ! and # prefixes can't be composed, even though named refs can be
          composed with other operators
        - '#' means 4 different things: length prefix, VarSymbol, UnaryOp to
          strip a prefix, and it can also be a literal part of WORD.

        From the parser's point of view, the prefix # can't be combined with
        UnaryOp/slicing/matching, but the ! can.  However:

        - ${a[@]:1:2} is not allowed
        - ${#a[@]:1:2} is allowed, but gives the wrong answer
        """
        if d_quoted:
            arg_lex_mode = lex_mode_e.VSub_ArgDQ
        else:
            arg_lex_mode = lex_mode_e.VSub_ArgUnquoted

        self._SetNext(lex_mode_e.VSub_1)
        self._GetToken()

        ty = self.token_type
        first_tok = self.cur_token

        if ty == Id.VSub_Pound:
            # Disambiguate
            next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
            if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
                # e.g. a name, so '#' is the prefix
                self._SetNext(lex_mode_e.VSub_1)
                part = self._ParseVarOf()

                self._GetToken()
                if self.token_type != Id.Right_DollarBrace:
                    p_die('Expected } after length expression',
                          self.cur_token)

                part.prefix_op = first_tok

            else:  # not a prefix, '#' is the variable
                part = self._ParseVarExpr(arg_lex_mode)

        elif ty == Id.VSub_Bang:
            next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
            if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
                # e.g. a name, so '!' is the prefix
                # ${!a}    -- this is a ref
                # ${!3}    -- this is a ref
                # ${!a[1]} -- this is a ref
                # ${!a[@]} -- this is a keys
                # No lookahead -- do it in a second step, or at runtime
                self._SetNext(lex_mode_e.VSub_1)
                part = self._ParseVarExpr(arg_lex_mode, allow_query=True)

                part.prefix_op = first_tok

            else:  # not a prefix, '!' is the variable
                part = self._ParseVarExpr(arg_lex_mode)

        elif ty == Id.VSub_Dot:
            # Note: this will become a new builtin_sub type, so this method
            # must return word_part_t rather than BracedVarSub.  I don't think
            # that should cause problems.
            p_die('TODO: ${.myproc builtin sub}', self.cur_token)

        # VS_NAME, VS_NUMBER, symbol that isn't # or !
        elif self.token_kind == Kind.VSub:
            part = self._ParseVarExpr(arg_lex_mode)

        else:
            # e.g. ${^}
            p_die('Unexpected token in ${}', self.cur_token)

        part.left = left_token  # attach the argument
        part.right = self.cur_token
        return part
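
    # How the '#' and '!' prefixes disambiguate (added commentary, summarizing
    # the LookPastSpace logic above):
    #
    #     echo ${#}       # '#' is the variable itself
    #     echo ${#s}      # '#' is the length prefix      -> LengthExpr
    #     echo ${!ref}    # '!' is the named-ref prefix   -> RefOrKeys
    #     echo ${!a[@]}   # keys of a; resolved in a later step, not here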

    def _ReadSingleQuoted(self, left_token, lex_mode):
        # type: (Token, lex_mode_t) -> SingleQuoted
        """Internal method to read a word_part."""
        tokens = []  # type: List[Token]
        # In command mode, we never disallow backslashes like '\'
        right_quote = self.ReadSingleQuoted(lex_mode, left_token, tokens,
                                            False)
        sval = word_compile.EvalSingleQuoted(left_token.id, tokens)
        node = SingleQuoted(left_token, sval, right_quote)
        return node

    def ReadSingleQuoted(self, lex_mode, left_token, out_tokens, is_ysh_expr):
        # type: (lex_mode_t, Token, List[Token], bool) -> Token
        """Appends to out_tokens; returns the last token.

        Used by expr_parse.py
        """
        # TODO: Remove and use out_tokens
        tokens = []  # type: List[Token]

        # echo '\' is allowed, but x = '\' is invalid, in favor of x = r'\'
        no_backslashes = is_ysh_expr and left_token.id == Id.Left_SingleQuote

        expected_end_tokens = 3 if left_token.id in (
            Id.Left_TSingleQuote, Id.Left_RTSingleQuote, Id.Left_UTSingleQuote,
            Id.Left_BTSingleQuote) else 1
        num_end_tokens = 0

        while num_end_tokens < expected_end_tokens:
            self._SetNext(lex_mode)
            self._GetToken()

            # Kind.Char is emitted in lex_mode.SQ_C
            if self.token_kind in (Kind.Lit, Kind.Char):
                tok = self.cur_token
                # Happens in lex_mode_e.SQ: 'one\two' is ambiguous; it should
                # be r'one\two' or c'one\\two'
                if no_backslashes and lexer.TokenContains(tok, '\\'):
                    p_die(
                        r"Strings with backslashes should look like r'\n' or u'\n' or b'\n'",
                        tok)

                if is_ysh_expr:
                    # Disallow var x = $'\001'.  Arguably we don't need these
                    # checks because u'\u{1}' is the way to write it.
                    if self.token_type == Id.Char_Octal3:
                        p_die(
                            r"Use \xhh or \u{...} instead of octal escapes in YSH strings",
                            tok)

                    if self.token_type == Id.Char_Hex and self.cur_token.length != 4:
                        # disallow \xH
                        p_die(
                            r'Invalid hex escape in YSH string (must be \xHH)',
                            tok)

                tokens.append(tok)

            elif self.token_kind == Kind.Unknown:
                tok = self.cur_token
                assert tok.id == Id.Unknown_Backslash, tok

                # x = $'\z' is disallowed; ditto for echo $'\z' if
                # shopt -u parse_backslash
                if is_ysh_expr or not self.parse_opts.parse_backslash():
                    p_die(
                        "Invalid char escape in C-style string literal (OILS-ERR-11)",
                        tok)

                tokens.append(tok)

            elif self.token_kind == Kind.Eof:
                p_die('Unexpected EOF in single-quoted string that began here',
                      left_token)

            elif self.token_kind == Kind.Right:
                # assume Id.Right_SingleQuote
                num_end_tokens += 1
                tokens.append(self.cur_token)

            else:
                raise AssertionError(self.cur_token)

            if self.token_kind != Kind.Right:
                num_end_tokens = 0  # we need three in a ROW

        if expected_end_tokens == 1:
            tokens.pop()
        elif expected_end_tokens == 3:  # get rid of spurious end tokens
            tokens.pop()
            tokens.pop()
            tokens.pop()

        # Remove space from ''' r''' $''' in both expression mode and command
        # mode
        if left_token.id in (Id.Left_TSingleQuote, Id.Left_RTSingleQuote,
                             Id.Left_UTSingleQuote, Id.Left_BTSingleQuote):
            word_compile.RemoveLeadingSpaceSQ(tokens)

        # Validation after lexing - same 2 checks in j8.LexerDecoder
        is_u_string = left_token.id in (Id.Left_USingleQuote,
                                        Id.Left_UTSingleQuote)

        for tok in tokens:
            # u'\yff' is not valid, but b'\yff' is
            if is_u_string and tok.id == Id.Char_YHex:
                p_die(
                    r"%s escapes not allowed in u'' strings" %
                    lexer.TokenVal(tok), tok)

        out_tokens.extend(tokens)
        return self.cur_token
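
    # End-token counting for triple-quoted strings (added commentary,
    # illustrating the num_end_tokens loop above):
    #
    #     ''' a ' b '' c '''
    #
    # The lone ' and the '' in the middle each bump num_end_tokens, but any
    # following non-Right token resets it to 0; only three consecutive
    # Right_SingleQuote tokens terminate the string, and those three spurious
    # end tokens are then popped off.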

    def _ReadDoubleQuotedLeftParts(self):
        # type: () -> word_part_t
        """Read substitution parts in a double quoted context."""
        if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick):
            return self._ReadCommandSub(self.token_type, d_quoted=True)

        if self.token_type == Id.Left_DollarBrace:
            return self._ReadBracedVarSub(self.cur_token, d_quoted=True)

        if self.token_type == Id.Left_DollarDParen:
            return self._ReadArithSub()

        if self.token_type == Id.Left_DollarBracket:
            return self._ReadExprSub(lex_mode_e.DQ)

        raise AssertionError(self.cur_token)

    def _ReadYshSingleQuoted(self, left_id):
        # type: (Id_t) -> CompoundWord
        """Read YSH-style strings:

        r''  u''  b''
        r''' '''  u''' '''  b''' '''
        """
        #log('BEF self.cur_token %s', self.cur_token)
        if left_id == Id.Left_RSingleQuote:
            lexer_mode = lex_mode_e.SQ_Raw
            triple_left_id = Id.Left_RTSingleQuote
        elif left_id == Id.Left_USingleQuote:
            lexer_mode = lex_mode_e.J8_Str
            triple_left_id = Id.Left_UTSingleQuote
        elif left_id == Id.Left_BSingleQuote:
            lexer_mode = lex_mode_e.J8_Str
            triple_left_id = Id.Left_BTSingleQuote
        else:
            raise AssertionError(left_id)

        # Needed for syntax checks
        left_tok = self.cur_token
        left_tok.id = left_id

        sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)

        if (len(sq_part.sval) == 0 and self.lexer.ByteLookAhead() == "'"):
            self._SetNext(lex_mode_e.ShCommand)
            self._GetToken()

            assert self.token_type == Id.Left_SingleQuote
            # HACK: magically transform the third ' in u''' to
            # Id.Left_UTSingleQuote, so that ''' is the terminator
            left_tok = self.cur_token
            left_tok.id = triple_left_id

            # Handles stripping leading whitespace
            sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)

        # Advance and validate
        self._SetNext(lex_mode_e.ShCommand)

        self._GetToken()
        if self.token_kind not in KINDS_THAT_END_WORDS:
            p_die('Unexpected token after YSH single-quoted string',
                  self.cur_token)

        return CompoundWord([sq_part])

    def _ReadUnquotedLeftParts(self, triple_out):
        # type: (Optional[BoolParamBox]) -> word_part_t
        """Read substitutions and quoted strings (for lex_mode_e.ShCommand).

        If triple_out is set, then we try parsing triple-quoted strings, and
        set its value to True if we got one.
        """
        if self.token_type in (Id.Left_DoubleQuote, Id.Left_DollarDoubleQuote):
            # Note: $"" is a synonym for "".  It might make sense if it added
            # \n \0 \x00 \u{123} etc.  But that's not what bash does!
            dq_part = self._ReadDoubleQuoted(self.cur_token)
            # Got empty word "" and there's a " after
            if (triple_out and len(dq_part.parts) == 0 and
                    self.lexer.ByteLookAhead() == '"'):

                self._SetNext(lex_mode_e.ShCommand)
                self._GetToken()
                # HACK: magically transform the third " in """ to
                # Id.Left_TDoubleQuote, so that """ is the terminator
                left_dq_token = self.cur_token
                left_dq_token.id = Id.Left_TDoubleQuote
                triple_out.b = True  # let caller know we got it
                return self._ReadDoubleQuoted(left_dq_token)

            return dq_part

        if self.token_type in (Id.Left_SingleQuote, Id.Left_RSingleQuote,
                               Id.Left_DollarSingleQuote):
            if self.token_type == Id.Left_SingleQuote:
                lexer_mode = lex_mode_e.SQ_Raw
                triple_left_id = Id.Left_TSingleQuote
            elif self.token_type == Id.Left_RSingleQuote:
                lexer_mode = lex_mode_e.SQ_Raw
                triple_left_id = Id.Left_RTSingleQuote
            else:
                lexer_mode = lex_mode_e.SQ_C
                # there is no such thing as $'''
                triple_left_id = Id.Undefined_Tok

            sq_part = self._ReadSingleQuoted(self.cur_token, lexer_mode)

            # Got empty '' or r'' and there's a ' after
            # u'' and b'' are handled in _ReadYshSingleQuoted
            if (triple_left_id != Id.Undefined_Tok and
                    triple_out is not None and len(sq_part.sval) == 0 and
                    self.lexer.ByteLookAhead() == "'"):

                self._SetNext(lex_mode_e.ShCommand)
                self._GetToken()

                # HACK: magically transform the third ' in ''' to
                # Id.Left_TSingleQuote, so that ''' is the terminator
                left_sq_token = self.cur_token
                left_sq_token.id = triple_left_id

                triple_out.b = True  # let caller know we got it
                return self._ReadSingleQuoted(left_sq_token, lexer_mode)

            return sq_part

        if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick,
                               Id.Left_ProcSubIn, Id.Left_ProcSubOut):
            return self._ReadCommandSub(self.token_type, d_quoted=False)

        if self.token_type == Id.Left_DollarBrace:
            return self._ReadBracedVarSub(self.cur_token, d_quoted=False)

        if self.token_type == Id.Left_DollarDParen:
            return self._ReadArithSub()

        if self.token_type == Id.Left_DollarBracket:
            return self._ReadExprSub(lex_mode_e.ShCommand)

        if self.token_type == Id.Left_DollarBraceZsh:
            return self._ReadZshVarSub(self.cur_token)

        raise AssertionError(self.cur_token)
    def _ReadExtGlob(self):
        # type: () -> word_part.ExtGlob
        """
        Grammar:
          Item    = CompoundWord | EPSILON  # important: @(foo|) is allowed
          LEFT    = '@(' | '*(' | '+(' | '?(' | '!('
          RIGHT   = ')'
          ExtGlob = LEFT (Item '|')* Item RIGHT  # ITEM may be empty
          Compound includes ExtGlob
        """
        left_token = self.cur_token
        right_token = None  # type: Token
        arms = []  # type: List[CompoundWord]

        self.lexer.PushHint(Id.Op_RParen, Id.Right_ExtGlob)
        self._SetNext(lex_mode_e.ExtGlob)  # advance past LEFT

        read_word = False  # did we just read a word?  To handle @(||).

        while True:
            self._GetToken()

            if self.token_type == Id.Right_ExtGlob:
                if not read_word:
                    arms.append(CompoundWord([]))
                right_token = self.cur_token
                break

            elif self.token_type == Id.Op_Pipe:
                if not read_word:
                    arms.append(CompoundWord([]))
                read_word = False
                self._SetNext(lex_mode_e.ExtGlob)

            # lex_mode_e.ExtGlob should only produce these 4 kinds of tokens
            elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub,
                                     Kind.ExtGlob):
                w = self._ReadCompoundWord(lex_mode_e.ExtGlob)
                arms.append(w)
                read_word = True

            elif self.token_kind == Kind.Eof:
                p_die('Unexpected EOF reading extended glob that began here',
                      left_token)

            else:
                raise AssertionError(self.cur_token)

        return word_part.ExtGlob(left_token, arms, right_token)
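
    # Extended glob arms, including empty ones (added commentary, per the
    # EPSILON rule in the grammar above):
    #
    #     @(foo|bar)   # arms: 'foo', 'bar'
    #     @(foo|)      # arms: 'foo', ''  -- matches 'foo' or empty
    #     @(||)        # arms: '', '', '' -- the read_word flag handles this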

    def _ReadBashRegexGroup(self):
        # type: () -> word_part.BashRegexGroup
        """
        Grammar:
          BashRegexGroup = '(' WORD? ')'
        """
        left_token = self.cur_token
        assert left_token.id == Id.BashRegex_LParen, left_token

        arms = []  # type: List[CompoundWord]

        self.lexer.PushHint(Id.Op_RParen, Id.Right_BashRegexGroup)
        self._SetNext(lex_mode_e.BashRegexFakeInner)  # advance past LEFT

        self._GetToken()
        if self.token_type == Id.Right_BashRegexGroup:  # empty ()
            return word_part.BashRegexGroup(left_token, None, self.cur_token)

        # lex_mode_e.BashRegex should only produce these 4 kinds of tokens
        if self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub, Kind.BashRegex):
            # Fake lexer mode that translates Id.WS_Space to Id.Lit_Chars, to
            # allow bash-style [[ s =~ (a b) ]]
            w = self._ReadCompoundWord(lex_mode_e.BashRegexFakeInner)
            arms.append(w)

            self._GetToken()
            if self.token_type != Id.Right_BashRegexGroup:
                p_die('Expected ) to close bash regex group', self.cur_token)

            return word_part.BashRegexGroup(left_token, w, self.cur_token)

        p_die('Expected word after ( opening bash regex group', self.cur_token)

    def _ReadLikeDQ(self, left_token, is_ysh_expr, out_parts):
        # type: (Optional[Token], bool, List[word_part_t]) -> None
        """
        Args:
          left_token: A token if we are reading a double quoted part, or None
            if we're reading a here doc.
          is_ysh_expr: Whether to disallow backticks and invalid char escapes
          out_parts: list of word_part to append to
        """
        if left_token:
            if left_token.id in (Id.Left_TDoubleQuote,
                                 Id.Left_DollarTDoubleQuote):
                expected_end_tokens = 3
            else:
                expected_end_tokens = 1
        else:
            expected_end_tokens = 1000  # here doc will break

        num_end_tokens = 0
        while num_end_tokens < expected_end_tokens:
            self._SetNext(lex_mode_e.DQ)
            self._GetToken()

            if self.token_kind == Kind.Lit:
                if self.token_type == Id.Lit_EscapedChar:
                    tok = self.cur_token
                    ch = lexer.TokenSliceLeft(tok, 1)
                    part = word_part.EscapedLiteral(tok,
                                                    ch)  # type: word_part_t
                else:
                    if self.token_type == Id.Lit_BadBackslash:
                        # echo "\z" is OK in shell, but 'x = "\z"' is a syntax
                        # error in YSH.
                        # Slight hole: we don't catch 'x = ${undef:-"\z"}'
                        # because of the recursion (unless parse_backslash)
                        if (is_ysh_expr or
                                not self.parse_opts.parse_backslash()):
                            p_die(
                                "Invalid char escape in double quoted string (OILS-ERR-12)",
                                self.cur_token)
                    elif self.token_type == Id.Lit_Dollar:
                        if is_ysh_expr or not self.parse_opts.parse_dollar():
                            p_die("Literal $ should be quoted like \$",
                                  self.cur_token)

                    part = self.cur_token
                out_parts.append(part)

            elif self.token_kind == Kind.Left:
                if self.token_type == Id.Left_Backtick and is_ysh_expr:
                    p_die("Invalid backtick: use $(cmd) or \\` in YSH strings",
                          self.cur_token)

                part = self._ReadDoubleQuotedLeftParts()
                out_parts.append(part)

            elif self.token_kind == Kind.VSub:
                tok = self.cur_token
                part = SimpleVarSub(tok)
                out_parts.append(part)
                # NOTE: parsing "$f(x)" would BREAK CODE.  Could add a mode
                # for it later.

            elif self.token_kind == Kind.Right:
                assert self.token_type == Id.Right_DoubleQuote, self.token_type
                if left_token:
                    num_end_tokens += 1

                # In a here doc, the right quote is literal!
                out_parts.append(self.cur_token)

            elif self.token_kind == Kind.Eof:
                if left_token:
                    p_die(
                        'Unexpected EOF reading double-quoted string that began here',
                        left_token)
                else:  # here docs will have an EOF in their token stream
                    break

            else:
                raise AssertionError(self.cur_token)

            if self.token_kind != Kind.Right:
                num_end_tokens = 0  # """ must be CONSECUTIVE

        if expected_end_tokens == 1:
            out_parts.pop()
        elif expected_end_tokens == 3:
            out_parts.pop()
            out_parts.pop()
            out_parts.pop()

        # Remove space from """ in both expression mode and command mode
        if (left_token and left_token.id
                in (Id.Left_TDoubleQuote, Id.Left_DollarTDoubleQuote)):
            word_compile.RemoveLeadingSpaceDQ(out_parts)

        # Return nothing, since we appended to 'out_parts'

    def _ReadDoubleQuoted(self, left_token):
        # type: (Token) -> DoubleQuoted
        """Helper function for reading "hello $name".

        Args:
          eof_type: for stopping at }, Id.Lit_RBrace
          here_doc: Whether we are reading in a here doc context

        Also ${foo%%a b c}  # treat this as double quoted, until you hit }
        """
        parts = []  # type: List[word_part_t]
        self._ReadLikeDQ(left_token, False, parts)

        right_quote = self.cur_token
        return DoubleQuoted(left_token, parts, right_quote)

    def ReadDoubleQuoted(self, left_token, parts):
        # type: (Token, List[word_part_t]) -> Token
        """For expression mode.

        Read var x = "${dir:-}/$name"; etc.
        """
        self._ReadLikeDQ(left_token, True, parts)
        return self.cur_token

    def _ReadCommandSub(self, left_id, d_quoted=False):
        # type: (Id_t, bool) -> CommandSub
        """
        NOTE: This is not in the grammar, because word parts aren't in the
        grammar!

        command_sub = '$(' command_list ')'
                    | '@(' command_list ')'
                    | '<(' command_list ')'
                    | '>(' command_list ')'
                    | ` command_list `
        """
        left_token = self.cur_token

        # Set the lexer in a state so ) becomes the EOF token.
        if left_id in (Id.Left_DollarParen, Id.Left_AtParen, Id.Left_ProcSubIn,
                       Id.Left_ProcSubOut):
            self._SetNext(lex_mode_e.ShCommand)  # advance past $( etc.

            right_id = Id.Eof_RParen
            self.lexer.PushHint(Id.Op_RParen, right_id)
            c_parser = self.parse_ctx.MakeParserForCommandSub(
                self.line_reader, self.lexer, right_id)
            # NOTE: This doesn't use something like main_loop because we don't
            # want to interleave parsing and execution!  Unlike 'source' and
            # 'eval'.
            node = c_parser.ParseCommandSub()

            right_token = c_parser.w_parser.cur_token

        elif left_id == Id.Left_Backtick and self.parse_ctx.do_lossless:
            # NOTE: This is an APPROXIMATE solution for translation ONLY.  See
            # test/osh2oil.

            right_id = Id.Eof_Backtick
            self.lexer.PushHint(Id.Left_Backtick, right_id)
            c_parser = self.parse_ctx.MakeParserForCommandSub(
                self.line_reader, self.lexer, right_id)
            node = c_parser.ParseCommandSub()
            right_token = c_parser.w_parser.cur_token

        elif left_id == Id.Left_Backtick:
            if not self.parse_opts.parse_backticks():
                p_die('Use $(cmd) instead of backticks (parse_backticks)',
                      left_token)

            self._SetNext(lex_mode_e.Backtick)  # advance past `

            parts = []  # type: List[str]
            while True:
                self._GetToken()
                #log("TOK %s", self.cur_token)

                if self.token_type == Id.Backtick_Quoted:
                    # Remove leading \
                    parts.append(lexer.TokenSliceLeft(self.cur_token, 1))

                elif self.token_type == Id.Backtick_DoubleQuote:
                    # Compatibility: If backticks are double quoted, then
                    # double quotes within them have to be \".
                    # Shells aren't smart enough to match nested " and `
                    # quotes (but OSH is)
                    if d_quoted:
                        # Remove leading \
                        parts.append(lexer.TokenSliceLeft(self.cur_token, 1))
                    else:
                        parts.append(lexer.TokenVal(self.cur_token))

                elif self.token_type == Id.Backtick_Other:
                    parts.append(lexer.TokenVal(self.cur_token))

                elif self.token_type == Id.Backtick_Right:
                    break

                elif self.token_type == Id.Eof_Real:
                    # Note: this parse error is in the ORIGINAL context.  No
                    # code_str yet.
                    p_die('Unexpected EOF while looking for closing backtick',
                          left_token)

                else:
                    raise AssertionError(self.cur_token)

                self._SetNext(lex_mode_e.Backtick)

            # Calculate right SPID on CommandSub BEFORE re-parsing.
            right_token = self.cur_token

            code_str = ''.join(parts)
            #log('code %r', code_str)

            # NOTE: This is similar to how we parse aliases in
            # osh/cmd_parse.py.  It won't have the same location info as
            # MakeParserForCommandSub(), because the lexer is different.
            arena = self.parse_ctx.arena
            #arena = alloc.Arena()
            line_reader = reader.StringLineReader(code_str, arena)
            c_parser = self.parse_ctx.MakeOshParser(line_reader)
            src = source.Reparsed('backticks', left_token, right_token)
            with alloc.ctx_SourceCode(arena, src):
                node = c_parser.ParseCommandSub()

        else:
            raise AssertionError(left_id)

        return CommandSub(left_token, node, right_token)
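
    # Why backticks are re-parsed from a string (added commentary, based on
    # the NOTEs above): inside `...`, the lexer can't treat ` as a nesting
    # delimiter the way $( ) nests, so the \-escaped body is collected into
    # code_str and re-parsed.  For example:
    #
    #     echo `echo \`echo hi\``    # nested backticks need escaping
    #     echo $(echo $(echo hi))    # $() nests without escaping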

    def _ReadExprSub(self, lex_mode):
        # type: (lex_mode_t) -> word_part.ExprSub
        """$[d->key] $[obj.method()] etc."""
        left_token = self.cur_token

        self._SetNext(lex_mode_e.Expr)
        enode, right_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.ysh_expr_sub)

        self._SetNext(lex_mode)  # Move past ]
        return word_part.ExprSub(left_token, enode, right_token)

    def ParseVarDecl(self, kw_token):
        # type: (Token) -> command.VarDecl
        """
        oil_var_decl: name_type_list '=' testlist end_stmt

        Note that assignments must end with \n ; } or EOF.  Unlike shell
        assignments, we disallow:

        var x = 42 | wc -l
        var x = 42 && echo hi
        """
        self._SetNext(lex_mode_e.Expr)
        enode, last_token = self.parse_ctx.ParseVarDecl(kw_token, self.lexer)
        # Hack to move } from what the Expr lexer modes gives to what
        # CommandParser wants
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace

        # Let the CommandParser see the Op_Semi or Op_Newline.
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)  # always back to this
        return enode

    def ParseMutation(self, kw_token, var_checker):
        # type: (Token, VarChecker) -> command.Mutation
        """
        setvar i = 42
        setvar i += 1
        setvar a[i] = 42
        setvar a[i] += 1
        setvar d.key = 42
        setvar d.key += 1
        """
        self._SetNext(lex_mode_e.Expr)
        enode, last_token = self.parse_ctx.ParseMutation(kw_token, self.lexer)
        # Hack to move } from what the Expr lexer modes gives to what
        # CommandParser wants
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace

        for lhs in enode.lhs:
            UP_lhs = lhs
            with tagswitch(lhs) as case:
                if case(y_lhs_e.Var):
                    lhs = cast(Token, UP_lhs)
                    var_checker.Check(kw_token.id, lexer.LazyStr(lhs), lhs)

                # Note: this does not cover cases like
                #   setvar (a[0])[1] = v
                #   setvar (d.key).other = v
                # This leaks into catching all typos statically, which may be
                # possible if 'use' makes all names explicit.
                elif case(y_lhs_e.Subscript):
                    lhs = cast(Subscript, UP_lhs)
                    if lhs.obj.tag() == expr_e.Var:
                        v = cast(expr.Var, lhs.obj)
                        var_checker.Check(kw_token.id, v.name, v.left)

                elif case(y_lhs_e.Attribute):
                    lhs = cast(Attribute, UP_lhs)
                    if lhs.obj.tag() == expr_e.Var:
                        v = cast(expr.Var, lhs.obj)
                        var_checker.Check(kw_token.id, v.name, v.left)

        # Let the CommandParser see the Op_Semi or Op_Newline.
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)  # always back to this
        return enode

    def ParseBareDecl(self):
        # type: () -> expr_t
        """
        x = {name: val}
        """
        self._SetNext(lex_mode_e.Expr)
        self._GetToken()
        enode, last_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.command_expr)
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)
        return enode

    def ParseYshExprForCommand(self):
        # type: () -> expr_t

        # Fudge for this case:
        #  for x in(y) {
        # versus
        #  for x in (y) {
        #
        # In the former case, ReadWord on 'in' puts the lexer past (.
        # Also see LookPastSpace in CommandParser.
        # A simpler solution would be nicer.

        if self.token_type == Id.Op_LParen:
            self.lexer.MaybeUnreadOne()

        enode, _ = self.parse_ctx.ParseYshExpr(self.lexer, grammar_nt.ysh_expr)

        self._SetNext(lex_mode_e.ShCommand)
        return enode

    def ParseCommandExpr(self):
        # type: () -> expr_t
        """
        = 1+2
        """
        enode, last_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.command_expr)

        # In some cases, such as the case statement, we expect *the lexer* to
        # be pointing at the token right after the expression.  But the
        # expression parser must have read to the `last_token`.  Unreading
        # places the lexer back in the expected state.  I.e.:
        #
        # case (x) {                       case (x) {
        #   (else) { = x }                   (else) { = x }
        #          ^ The lexer is here              ^ Unread to here
        # }                                }
        assert last_token.id in (Id.Op_Newline, Id.Eof_Real, Id.Op_Semi,
                                 Id.Op_RBrace), last_token
        if last_token.id != Id.Eof_Real:
            # Eof_Real is the only token we cannot unread
            self.lexer.MaybeUnreadOne()

        return enode

    def ParseProc(self, node):
        # type: (Proc) -> None

        # proc name-with-hyphens() must be accepted
        self._SetNext(lex_mode_e.ShCommand)
        self._GetToken()
        # example: 'proc f[' gets you Lit_ArrayLhsOpen
        if self.token_type != Id.Lit_Chars:
            p_die('Invalid proc name %s' % ui.PrettyToken(self.cur_token),
                  self.cur_token)

        # TODO: validate this more.  Disallow proc 123 { }, which isn't
        # disallowed for shell functions.  Similar to IsValidVarName().
        node.name = self.cur_token

        last_token = self.parse_ctx.ParseProc(self.lexer, node)

        # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
        assert last_token.id == Id.Op_LBrace
        last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        self._SetNext(lex_mode_e.ShCommand)

    def ParseFunc(self, node):
        # type: (Func) -> None
        last_token = self.parse_ctx.ParseFunc(self.lexer, node)

        # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
        assert last_token.id == Id.Op_LBrace
        last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        self._SetNext(lex_mode_e.ShCommand)

    def ParseYshCasePattern(self):
        # type: () -> Tuple[pat_t, Token]
        pat, left_tok, last_token = self.parse_ctx.ParseYshCasePattern(
            self.lexer)

        if last_token.id == Id.Op_LBrace:
            last_token.id = Id.Lit_LBrace
            self.buffered_word = last_token

        return pat, left_tok

    def NewlineOkForYshCase(self):
        # type: () -> Id_t
        """Check for an optional newline and consume it.

        This is a special case of _NewlineOk which fixes some "off-by-one"
        issues that crop up while parsing YSH case arms.  For more details,
        see #oil-dev > Progress On YSH Case Grammar on Zulip.

        Returns a token id which is filled with the choice of

          word  { echo word }
          (3)   { echo expr }
          /e/   { echo eggex }
          }     # right brace
        """
        while True:
            next_id = self.lexer.LookAheadOne(lex_mode_e.Expr)

            # Cannot lookahead past lines
            if next_id == Id.Unknown_Tok:
                if not self.lexer.MoveToNextLine():  # Try to move to next line
                    break  # EOF
                continue

            next_kind = consts.GetKind(next_id)
            if next_id != Id.Op_Newline and next_kind != Kind.Ignored:
                break

            self.lexer.Read(lex_mode_e.Expr)

        if next_id in (Id.Op_RBrace, Id.Op_LParen, Id.Arith_Slash):
            self._SetNext(lex_mode_e.Expr)  # Continue in expression mode
        else:
            # Consume the trailing Op_Newline
            self._SetNext(lex_mode_e.ShCommand)
            self._GetToken()

        return next_id

    def _ReadArithExpr(self, end_id):
        # type: (Id_t) -> arith_expr_t
        """Read and parse an arithmetic expression in various contexts.

        $(( 1+2 ))
        (( a=1+2 ))
        ${a[ 1+2 ]}
        ${a : 1+2 : 1+2}

        See tests/arith-context.test.sh for ambiguous cases.

        ${a[a[0]]} is valid  # VS_RBRACKET vs Id.Arith_RBracket

        ${s : a<b?0:1 : 1}   # VS_COLON vs Id.Arith_Colon

        See the assertion in ArithParser.Parse() -- unexpected extra input.
        """
        # calls self.ReadWord(lex_mode_e.Arith)
        anode = self.a_parser.Parse()
        cur_id = self.a_parser.CurrentId()
        if end_id != Id.Undefined_Tok and cur_id != end_id:
            p_die(
                'Unexpected token after arithmetic expression (%s != %s)' %
                (ui.PrettyId(cur_id), ui.PrettyId(end_id)),
                loc.Word(self.a_parser.cur_word))
        return anode

    def _ReadArithSub(self):
        # type: () -> word_part.ArithSub
        """Read an arith substitution, which contains an arith expression,
        e.g. $((a + 1)).
        """
        left_tok = self.cur_token

        # The second ) needs to be disambiguated in stuff like:
        #   $(echo $(( 1+2 )) )
        self.lexer.PushHint(Id.Op_RParen, Id.Right_DollarDParen)

        # NOTE: To disambiguate $(( as arith sub vs. command sub and subshell,
        # we could save the lexer/reader state here, and retry if the
        # arithmetic parse fails.  But we can almost always catch this at
        # parse time.  There could be some exceptions like:
        #   $((echo * foo))  # looks like multiplication
        #   $((echo / foo))  # looks like division

        # $(( )) is valid
        anode = arith_expr.EmptyZero  # type: arith_expr_t

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            anode = self._ReadArithExpr(Id.Arith_RParen)

        self._SetNext(lex_mode_e.ShCommand)

        # Ensure we get closing )
        self._GetToken()
        if self.token_type != Id.Right_DollarDParen:
            p_die('Expected second ) to end arith sub', self.cur_token)

        right_tok = self.cur_token
        return word_part.ArithSub(left_tok, anode, right_tok)

    def ReadDParen(self):
        # type: () -> Tuple[arith_expr_t, Token]
        """Read ((1+ 2)) -- command context.

        We're using the word parser because it's very similar to
        _ReadArithExpr above.

        This also returns the terminating Id.Op_DRightParen token for location
        info.
        """
        # (( )) is valid
        anode = arith_expr.EmptyZero  # type: arith_expr_t

        self.lexer.PushHint(Id.Op_RParen, Id.Op_DRightParen)

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            anode = self._ReadArithExpr(Id.Arith_RParen)

        self._SetNext(lex_mode_e.ShCommand)

        # Ensure we get the second )
        self._GetToken()
        right = self.cur_token
        if right.id != Id.Op_DRightParen:
            p_die('Expected second ) to end arith statement', right)

        self._SetNext(lex_mode_e.ShCommand)

        return anode, right
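
    # Why PushHint is needed here (added commentary): the lexer alone can't
    # tell which construct a ) closes, so the parser registers the expected
    # translation before reading.  For example:
    #
    #     $(echo $(( 1+2 )) )   # inner )) must become Right_DollarDParen,
    #                           # while the outer ) stays Eof_RParen
    #     (( x = 1+2 ))         # here )) becomes Op_DRightParen instead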

    def _NextNonSpace(self):
        # type: () -> None
        """Advance in lex_mode_e.Arith until a non-space token.

        Same logic as _ReadWord, but used in
          $(( ))
          (( ))
          for (( ))

        You can read self.token_type after this, without calling _GetToken.
        """
        while True:
            self._SetNext(lex_mode_e.Arith)
            self._GetToken()
            if self.token_kind not in (Kind.Ignored, Kind.WS):
                break

    def ReadForExpression(self):
        # type: () -> command.ForExpr
        """Read ((i=0; i<5; ++i)) -- part of command context."""
        self._NextNonSpace()  # skip over ((
        cur_id = self.token_type  # for end of arith expressions

        if cur_id == Id.Arith_Semi:  # for (( ; i < 10; i++ ))
            init_node = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            init_node = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()
        self._NextNonSpace()

        # It's odd to keep track of both cur_id and self.token_type in this
        # function, but it works, and is tested in 'test/parse_error.sh
        # arith-integration'
        if cur_id != Id.Arith_Semi:  # for (( x=0 b; ... ))
            p_die("Expected ; here", loc.Word(self.a_parser.cur_word))

        self._GetToken()
        cur_id = self.token_type

        if cur_id == Id.Arith_Semi:  # for (( ; ; i++ ))
            # empty condition is TRUE
            cond_node = arith_expr.EmptyOne  # type: arith_expr_t
        else:
            cond_node = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()

        if cur_id != Id.Arith_Semi:  # for (( x=0; x<5 b ))
            p_die("Expected ; here", loc.Word(self.a_parser.cur_word))

        self._NextNonSpace()
        if self.token_type == Id.Arith_RParen:  # for (( ; ; ))
            update_node = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            update_node = self._ReadArithExpr(Id.Arith_RParen)

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            p_die('Expected ) to end for loop expression', self.cur_token)
        self._SetNext(lex_mode_e.ShCommand)

        # redirects is None, will be assigned in CommandEvaluator
        node = command.ForExpr.CreateNull()
        node.init = init_node
        node.cond = cond_node
        node.update = update_node
        return node
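
    # Empty sections default sensibly (added commentary, per the branches
    # above):
    #
    #     for (( ; ; )); do ... done      # init=0, cond=1 (infinite), update=0
    #     for (( i=0; ; ++i )); do ...    # empty condition is TRUE (EmptyOne)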

    def _ReadArrayLiteral(self):
        # type: () -> word_part_t
        """a=(1 2 3)

        TODO: See osh/cmd_parse.py:164 for Id.Lit_ArrayLhsOpen, for a[x++]=1

        We want:

        A=(['x']=1 ["x"]=2 [$x$y]=3)

        Maybe allow this as a literal string?  Because I think I've seen it
        before?  Or maybe force people to patch to learn the rule.

        A=([x]=4)

        Starts with Lit_Other '[', and then it has Lit_ArrayLhsClose.
        Maybe enforce that ALL have keys or NONE of them have keys.
        """
        self._SetNext(lex_mode_e.ShCommand)  # advance past (
        self._GetToken()
        if self.cur_token.id != Id.Op_LParen:
            p_die('Expected ( after =', self.cur_token)
        left_token = self.cur_token
        right_token = None  # type: Token

        # MUST use a new word parser (with the same lexer).
        w_parser = self.parse_ctx.MakeWordParser(self.lexer, self.line_reader)
        words = []  # type: List[CompoundWord]
        done = False
        while not done:
            w = w_parser.ReadWord(lex_mode_e.ShCommand)
            with tagswitch(w) as case:
                if case(word_e.Operator):
                    tok = cast(Token, w)
                    if tok.id == Id.Right_ShArrayLiteral:
                        right_token = tok
                        done = True  # can't use break here
                    # Unlike command parsing, array parsing allows embedded \n.
                    elif tok.id == Id.Op_Newline:
                        continue
                    else:
                        p_die('Unexpected token in array literal',
                              loc.Word(w))

                elif case(word_e.Compound):
                    words.append(cast(CompoundWord, w))

                else:
                    raise AssertionError()

        if len(words) == 0:  # a=() is an empty indexed array
            # Needed for type safety, doh
            no_words = []  # type: List[word_t]
            node = ShArrayLiteral(left_token, no_words, right_token)
            return node

        pairs = []  # type: List[AssocPair]
        # If the first one is a key/value pair, then the rest are assumed to
        # be.
        pair = word_.DetectAssocPair(words[0])
        if pair:
            pairs.append(pair)

            n = len(words)
            for i in xrange(1, n):
                w2 = words[i]
                pair = word_.DetectAssocPair(w2)
                if not pair:
                    p_die("Expected associative array pair", loc.Word(w2))

                pairs.append(pair)

            # invariant List?
            return word_part.BashAssocLiteral(left_token, pairs, right_token)

        # Brace detection for arrays but NOT associative arrays
        words2 = braces.BraceDetectAll(words)
        words3 = word_.TildeDetectAll(words2)
        return ShArrayLiteral(left_token, words3, right_token)
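
    # How the first word decides the literal's type (added commentary, per
    # DetectAssocPair above):
    #
    #     a=(1 2 3)          # ShArrayLiteral: indexed array
    #     A=([x]=1 [y]=2)    # BashAssocLiteral: every word must be [k]=v
    #     B=([x]=1 2)        # parse error: "Expected associative array pair"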
1684
1685 def ParseProcCallArgs(self, start_symbol):
1686 # type: (int) -> ArgList
1687 """ json write (x) """
1688 self.lexer.MaybeUnreadOne()
1689
1690 arg_list = ArgList.CreateNull(alloc_lists=True)
1691 arg_list.left = self.cur_token
1692 self.parse_ctx.ParseProcCallArgs(self.lexer, arg_list, start_symbol)
1693 return arg_list
1694
1695 def _MaybeReadWordPart(self, is_first, lex_mode, parts):
1696 # type: (bool, lex_mode_t, List[word_part_t]) -> bool
1697 """Helper for _ReadCompoundWord3."""
1698 done = False
1699
1700 if self.token_type == Id.Lit_EscapedChar:
1701 tok = self.cur_token
1702 assert tok.length == 2
1703 ch = lexer.TokenSliceLeft(tok, 1)
1704 if not self.parse_opts.parse_backslash():
1705 if not pyutil.IsValidCharEscape(ch):
1706 p_die('Invalid char escape in unquoted word (OILS-ERR-13)',
1707 self.cur_token)
1708
1709 part = word_part.EscapedLiteral(self.cur_token,
1710 ch) # type: word_part_t
1711 else:
1712 part = self.cur_token
1713
1714 if is_first and self.token_type == Id.Lit_VarLike: # foo=
1715 parts.append(part)
1716 # Unfortunately it's awkward to pull the check for a=(1 2) up to
1717 # _ReadWord.
1718 next_id = self.lexer.LookPastSpace(lex_mode)
1719 if next_id == Id.Op_LParen:
1720 self.lexer.PushHint(Id.Op_RParen, Id.Right_ShArrayLiteral)
1721 part2 = self._ReadArrayLiteral()
1722 parts.append(part2)
1723
1724 # Array literal must be the last part of the word.
1725 self._SetNext(lex_mode)
1726 self._GetToken()
1727 # EOF, whitespace, newline, Right_Subshell
1728 if self.token_kind not in KINDS_THAT_END_WORDS:
1729 p_die('Unexpected token after array literal',
1730 self.cur_token)
1731 done = True
1732
1733 elif (is_first and self.parse_opts.parse_at() and
1734 self.token_type == Id.Lit_Splice):
1735
1736 splice_tok = self.cur_token
1737 part2 = word_part.Splice(splice_tok,
1738 lexer.TokenSliceLeft(splice_tok, 1))
1739
1740 parts.append(part2)
1741
1742 # @words must be the last part of the word
1743 self._SetNext(lex_mode)
1744 self._GetToken()
1745 # EOF, whitespace, newline, Right_Subshell
1746 if self.token_kind not in KINDS_THAT_END_WORDS:
1747 p_die('Unexpected token after array splice', self.cur_token)
1748 done = True
1749
1750 elif (is_first and self.parse_opts.parse_at() and
1751 self.token_type == Id.Lit_AtLBracket): # @[split(x)]
1752 part2 = self._ReadExprSub(lex_mode_e.DQ)
1753 parts.append(part2)
1754
1755 # @[split(x)]
1756 self._SetNext(lex_mode)
1757 self._GetToken()
1758 # EOF, whitespace, newline, Right_Subshell
1759 if self.token_kind not in KINDS_THAT_END_WORDS:
1760 p_die('Unexpected token after Expr splice', self.cur_token)
1761 done = True
1762
1763 elif (is_first and self.parse_opts.parse_at() and
1764 self.token_type == Id.Lit_AtLBraceDot):
1765 p_die('TODO: @{.myproc builtin sub}', self.cur_token)
1766
1767 elif (is_first and self.parse_opts.parse_at_all() and
1768 self.token_type == Id.Lit_At):
1769 # Because $[x] ${x} and perhaps $/x/ are reserved, it makes sense for @
1770 # at the beginning of a word to be reserved.
1771
1772             # Although, should we relax 'echo @'?  I'm tempted to have a
1773             # shortcut for @_argv ...
1774 p_die('Literal @ starting a word must be quoted (parse_at_all)',
1775 self.cur_token)
1776
1777 else:
1778 # not a literal with lookahead; append it
1779 parts.append(part)
1780
1781 return done
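    # The "must be the last part" checks above mean, for example (with
    # parse_at on):
    #
    #   a=(1 2)x        -> p_die: Unexpected token after array literal
    #   echo @words.    -> p_die: Unexpected token after array splice
    #   echo @[f(x)]y   -> p_die: Unexpected token after Expr splice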
1782
1783 def _ReadCompoundWord(self, lex_mode):
1784 # type: (lex_mode_t) -> CompoundWord
1785 return self._ReadCompoundWord3(lex_mode, Id.Undefined_Tok, True)
1786
1787 def _ReadCompoundWord3(self, lex_mode, eof_type, empty_ok):
1788 # type: (lex_mode_t, Id_t, bool) -> CompoundWord
1789 """
1790 Precondition: Looking at the first token of the first word part
1791 Postcondition: Looking at the token after, e.g. space or operator
1792
1793 NOTE: eof_type is necessary because / is a literal, i.e. Lit_Slash, but it
1794 could be an operator delimiting a compound word. Can we change lexer modes
1795 and remove this special case?
1796 """
1797 w = CompoundWord([])
1798 num_parts = 0
1799 brace_count = 0
1800 done = False
1801 is_triple_quoted = None # type: Optional[BoolParamBox]
1802
1803 while not done:
1804 self._GetToken()
1805
1806 allow_done = empty_ok or num_parts != 0
1807 if allow_done and self.token_type == eof_type:
1808 done = True # e.g. for ${foo//pat/replace}
1809
1810 # Keywords like "for" are treated like literals
1811 elif self.token_kind in (Kind.Lit, Kind.History, Kind.KW,
1812 Kind.ControlFlow, Kind.BoolUnary,
1813 Kind.BoolBinary):
1814
1815                 # Count { and } here; unbalanced braces are reported below
1816 if self.token_type == Id.Lit_LBrace:
1817 brace_count += 1
1818 elif self.token_type == Id.Lit_RBrace:
1819 brace_count -= 1
1820 elif self.token_type == Id.Lit_Dollar:
1821 if not self.parse_opts.parse_dollar():
1822 if num_parts == 0 and lex_mode == lex_mode_e.ShCommand:
1823 next_byte = self.lexer.ByteLookAhead()
1824 # TODO: switch lexer modes and parse $/d+/. But not ${a:-$/d+/}
1825 if next_byte == '/':
1826 #log('next_byte %r', next_byte)
1827 pass
1828
1829 p_die('Literal $ should be quoted like \$',
1830 self.cur_token)
1831
1832 done = self._MaybeReadWordPart(num_parts == 0, lex_mode,
1833 w.parts)
1834
1835 elif self.token_kind == Kind.VSub:
1836 vsub_token = self.cur_token
1837
1838 part = SimpleVarSub(vsub_token) # type: word_part_t
1839 w.parts.append(part)
1840
1841 elif self.token_kind == Kind.ExtGlob:
1842 # If parse_at, we can take over @( to start @(seq 3)
1843                 # Users can also use ,(*.py|*.sh)
1844 if (self.parse_opts.parse_at() and
1845 self.token_type == Id.ExtGlob_At and num_parts == 0):
1846 cs_part = self._ReadCommandSub(Id.Left_AtParen,
1847 d_quoted=False)
1848 # RARE mutation of tok.id!
1849 cs_part.left_token.id = Id.Left_AtParen
1850 part = cs_part # for type safety
1851
1852 # Same check as _MaybeReadWordPart. @(seq 3)x is illegal, just like
1853 # a=(one two)x and @arrayfunc(3)x.
1854 self._GetToken()
1855 if self.token_kind not in KINDS_THAT_END_WORDS:
1856 p_die('Unexpected token after @()', self.cur_token)
1857 done = True
1858
1859 else:
1860 if HAVE_FNM_EXTMATCH == 0:
1861 p_die(
1862 "Extended glob won't work without FNM_EXTMATCH support in libc",
1863 self.cur_token)
1864 part = self._ReadExtGlob()
1865 w.parts.append(part)
1866
1867 elif self.token_kind == Kind.BashRegex:
1868 if self.token_type == Id.BashRegex_LParen: # Opening (
1869 part = self._ReadBashRegexGroup()
1870 w.parts.append(part)
1871 else:
1872 assert self.token_type == Id.BashRegex_AllowedInParens
1873 p_die('Invalid token in bash regex', self.cur_token)
1874
1875 elif self.token_kind == Kind.Left:
1876 try_triple_quote = (self.parse_opts.parse_triple_quote() and
1877 lex_mode == lex_mode_e.ShCommand and
1878 num_parts == 0)
1879
1880                 # Save an allocation: only create the box when we may need it
1881 if try_triple_quote:
1882 is_triple_quoted = BoolParamBox(False)
1883
1884 part = self._ReadUnquotedLeftParts(is_triple_quoted)
1885 w.parts.append(part)
1886
1887 # NOT done yet, will advance below
1888 elif self.token_kind == Kind.Right:
1889 # Still part of the word; will be done on the next iter.
1890 if self.token_type == Id.Right_DoubleQuote:
1891 pass
1892 # Never happens, no PushHint for this case.
1893 #elif self.token_type == Id.Right_DollarParen:
1894 # pass
1895 elif self.token_type == Id.Right_Subshell:
1896 # LEXER HACK for (case x in x) ;; esac )
1897 # Rewind before it's used
1898 assert self.next_lex_mode == lex_mode_e.Undefined
1899 if self.lexer.MaybeUnreadOne():
1900 self.lexer.PushHint(Id.Op_RParen, Id.Right_Subshell)
1901 self._SetNext(lex_mode)
1902 done = True
1903 else:
1904 done = True
1905
1906 elif self.token_kind == Kind.Ignored:
1907 done = True
1908
1909 else:
1910                 # LEXER HACK for an unbalanced case clause: 'case foo in esac' is
1911                 # valid, so while testing for ESAC we may read ) before getting a
1912                 # chance to PushHint(Id.Op_RParen, Id.Right_CasePat). So here we
1913                 # unread one token and read it again.
1914
1915 # We get Id.Op_RParen at top level: case x in x) ;; esac
1916 # We get Id.Eof_RParen inside ComSub: $(case x in x) ;; esac )
1917 if self.token_type in (Id.Op_RParen, Id.Eof_RParen):
1918 # Rewind before it's used
1919 assert self.next_lex_mode == lex_mode_e.Undefined
1920 if self.lexer.MaybeUnreadOne():
1921 if self.token_type == Id.Eof_RParen:
1922 # Redo translation
1923 self.lexer.PushHint(Id.Op_RParen, Id.Eof_RParen)
1924 self._SetNext(lex_mode)
1925
1926 done = True # anything we don't recognize means we're done
1927
1928 if not done:
1929 self._SetNext(lex_mode)
1930 num_parts += 1
1931
1932 if (self.parse_opts.parse_brace() and num_parts > 1 and
1933 brace_count != 0):
1934             # { and } may appear alone, but not inside a word like foo{
1935 p_die(
1936 'Word has unbalanced { }. Maybe add a space or quote it like \{',
1937 loc.Word(w))
1938
1939 if is_triple_quoted and is_triple_quoted.b and num_parts > 1:
1940 p_die('Unexpected parts after triple quoted string',
1941 loc.WordPart(w.parts[-1]))
1942
1943 if 0:
1944 from _devbuild.gen.syntax_asdl import word_part_str
1945 word_key = ' '.join(word_part_str(p.tag()) for p in w.parts)
1946 WORD_HIST[word_key] += 1
1947 return w
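    # For instance, hi$x"y" is a single CompoundWord with three parts:
    # Lit_Chars 'hi', SimpleVarSub $x, and a DoubleQuoted part.  The loop
    # above exits on whitespace, an operator, or EOF.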
1948
1949 def _ReadArithWord(self):
1950 # type: () -> Optional[word_t]
1951 """ Helper for ReadArithWord() """
1952 self._GetToken()
1953
1954 if self.token_kind == Kind.Unknown:
1955 # e.g. happened during dynamic parsing of unset 'a[$foo]' in gherkin
1956 p_die(
1957 'Unexpected token while parsing arithmetic: %r' %
1958 lexer.TokenVal(self.cur_token), self.cur_token)
1959
1960 elif self.token_kind == Kind.Eof:
1961 return self.cur_token
1962
1963 elif self.token_kind == Kind.Ignored:
1964 # Space should be ignored.
1965 self._SetNext(lex_mode_e.Arith)
1966 return None
1967
1968 elif self.token_kind in (Kind.Arith, Kind.Right):
1969 # Id.Right_DollarDParen IS just a normal token, handled by ArithParser
1970 self._SetNext(lex_mode_e.Arith)
1971 return self.cur_token
1972
1973 elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub):
1974 return self._ReadCompoundWord(lex_mode_e.Arith)
1975
1976 else:
1977 raise AssertionError(self.cur_token)
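    # In $(( 1 + x )), successive calls yield the word '1' (Kind.Lit, via
    # _ReadCompoundWord), the token '+' (Kind.Arith), then 'x'; spaces are
    # skipped via the None return, and the closing )) comes back as an
    # ordinary Kind.Right token for the arith parser.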
1978
1979 def _ReadWord(self, word_mode):
1980 # type: (lex_mode_t) -> Optional[word_t]
1981 """Helper function for ReadWord()."""
1982
1983 # Change the pseudo lexer mode to a real lexer mode
1984 if word_mode == lex_mode_e.ShCommandFakeBrack:
1985 lex_mode = lex_mode_e.ShCommand
1986 else:
1987 lex_mode = word_mode
1988
1989 self._GetToken()
1990
1991 if self.token_kind == Kind.Eof:
1992 # No advance
1993 return self.cur_token
1994
1995 # Allow Arith for ) at end of for loop?
1996 elif self.token_kind in (Kind.Op, Kind.Redir, Kind.Arith):
1997 self._SetNext(lex_mode)
1998
1999 # Newlines are complicated. See 3x2 matrix in the comment about
2000 # self.multiline and self.newline_state above.
2001 if self.token_type == Id.Op_Newline:
2002 if self.multiline:
2003 if self.newline_state > 1:
2004 # This points at a blank line, but at least it gives the line number
2005 p_die('Invalid blank line in multiline mode',
2006 self.cur_token)
2007 return None
2008
2009 if self.returned_newline: # skip
2010 return None
2011
2012 return self.cur_token
2013
2014 elif self.token_kind == Kind.Right:
2015 if self.token_type not in (Id.Right_Subshell, Id.Right_ShFunction,
2016 Id.Right_CasePat,
2017 Id.Right_ShArrayLiteral):
2018 raise AssertionError(self.cur_token)
2019
2020 self._SetNext(lex_mode)
2021 return self.cur_token
2022
2023 elif self.token_kind in (Kind.Ignored, Kind.WS):
2024 self._SetNext(lex_mode)
2025 return None
2026
2027 else:
2028 assert self.token_kind in (Kind.VSub, Kind.Lit, Kind.History,
2029 Kind.Left, Kind.KW, Kind.ControlFlow,
2030 Kind.BoolUnary, Kind.BoolBinary,
2031 Kind.ExtGlob,
2032 Kind.BashRegex), 'Unhandled token kind'
2033
2034 if (word_mode == lex_mode_e.ShCommandFakeBrack and
2035 self.parse_opts.parse_bracket() and
2036 self.token_type == Id.Lit_LBracket):
2037 # Change [ from Kind.Lit -> Kind.Op
2038 # So CommandParser can treat
2039 # assert [42 === x]
2040 # like
2041 # json write (x)
2042 bracket_word = self.cur_token
2043 bracket_word.id = Id.Op_LBracket
2044
2045 self._SetNext(lex_mode)
2046 return bracket_word
2047
2048 # We're beginning a word. If we see Id.Lit_Pound, change to
2049 # lex_mode_e.Comment and read until end of line.
2050 if self.token_type == Id.Lit_Pound:
2051 self._SetNext(lex_mode_e.Comment)
2052 self._GetToken()
2053
2054 # NOTE: The # could be the last character in the file. It can't be
2055 # Eof_{RParen,Backtick} because #) and #` are comments.
2056 assert self.token_type in (Id.Ignored_Comment, Id.Eof_Real), \
2057 self.cur_token
2058
2059 # The next iteration will go into Kind.Ignored and set lex state to
2060 # lex_mode_e.ShCommand/etc.
2061 return None # tell ReadWord() to try again after comment
2062
2063 elif self.token_type == Id.Lit_TPound: ### doc comment
2064 self._SetNext(lex_mode_e.Comment)
2065 self._GetToken()
2066
2067 if self.token_type == Id.Ignored_Comment and self.emit_doc_token:
2068 return self.cur_token
2069
2070 return None # tell ReadWord() to try again after comment
2071
2072 else:
2073 # r'' u'' b''
2074 if (self.token_type == Id.Lit_Chars and
2075 self.lexer.LookAheadOne(
2076 lex_mode_e.ShCommand) == Id.Left_SingleQuote):
2077
2078                 # When shopt -s parse_ysh_string (the option tested below):
2079                 #     echo r'hi' is like echo 'hi'
2080                 #
2081                 #     echo u'\u{3bc}' b'\yff' also work
2082
2083 tok = self.cur_token
2084 if self.parse_opts.parse_ysh_string():
2085 if lexer.TokenEquals(tok, 'r'):
2086 left_id = Id.Left_RSingleQuote
2087 elif lexer.TokenEquals(tok, 'u'):
2088 left_id = Id.Left_USingleQuote
2089 elif lexer.TokenEquals(tok, 'b'):
2090 left_id = Id.Left_BSingleQuote
2091 else:
2092 left_id = Id.Undefined_Tok
2093
2094 if left_id != Id.Undefined_Tok:
2095 # skip the r, and then 'foo' will be read as normal
2096 self._SetNext(lex_mode_e.ShCommand)
2097
2098 self._GetToken()
2099 assert self.token_type == Id.Left_SingleQuote, self.token_type
2100
2101 # Read the word in a different lexer mode
2102 return self._ReadYshSingleQuoted(left_id)
2103
2104 return self._ReadCompoundWord(lex_mode)
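    # A sketch of the lookahead above: for r'foo', the Lit_Chars token 'r'
    # followed by Left_SingleQuote selects Id.Left_RSingleQuote, so r'foo'
    # reads like 'foo'; u'' and b'' get their own left token IDs.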
2105
2106 def ParseVarRef(self):
2107 # type: () -> BracedVarSub
2108 """DYNAMIC parsing of what's inside ${!ref}
2109
2110 # Same as VarOf production
2111 VarRefExpr = VarOf EOF
2112 """
2113 self._SetNext(lex_mode_e.VSub_1)
2114
2115 self._GetToken()
2116 if self.token_kind != Kind.VSub:
2117 p_die('Expected var name', self.cur_token)
2118
2119 part = self._ParseVarOf()
2120 # NOTE: no ${ } means no part.left and part.right
2121 part.left = part.name_tok # cheat to make test pass
2122 part.right = part.name_tok
2123
2124 self._GetToken()
2125 if self.token_type != Id.Eof_Real:
2126 p_die('Expected end of var ref expression', self.cur_token)
2127 return part
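    # e.g. if ref='x[0]', then ${!ref} re-parses the VALUE 'x[0]' here at
    # runtime, yielding the same BracedVarSub shape as ${x[0]}.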
2128
2129 def LookPastSpace(self):
2130 # type: () -> Id_t
2131 """Look ahead to the next token.
2132
2133 For the CommandParser to recognize
2134 array= (1 2 3)
2135 YSH for ( versus bash for ((
2136 YSH if ( versus if test
2137 YSH while ( versus while test
2138 YSH bare assignment 'grep =' versus 'grep foo'
2139 """
2140 assert self.token_type != Id.Undefined_Tok
2141 if self.cur_token.id == Id.WS_Space:
2142 id_ = self.lexer.LookPastSpace(lex_mode_e.ShCommand)
2143 else:
2144 id_ = self.cur_token.id
2145 return id_
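    # e.g. given 'array= (1 2 3)' from the docstring: after the word
    # 'array=', the current token is WS_Space, so we look past it and
    # return Id.Op_LParen.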
2146
2147 def LookAheadFuncParens(self):
2148 # type: () -> bool
2149 """Special lookahead for f( ) { echo hi; } to check for ( )"""
2150 assert self.token_type != Id.Undefined_Tok
2151
2152 # We have to handle 2 cases because we buffer a token
2153 if self.cur_token.id == Id.Op_LParen: # saw funcname(
2154 return self.lexer.LookAheadFuncParens(1) # go back one char
2155
2156 elif self.cur_token.id == Id.WS_Space: # saw funcname WHITESPACE
2157 return self.lexer.LookAheadFuncParens(0)
2158
2159 else:
2160 return False
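    # e.g. for 'f( ) { echo hi; }': after the name 'f', the buffered token
    # is either '(' (look back one char) or whitespace (look from here);
    # anything else means this is not a shell function definition.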
2161
2162 def ReadWord(self, word_mode):
2163 # type: (lex_mode_t) -> word_t
2164 """Read the next word, using the given lexer mode.
2165
2166 This is a stateful wrapper for the stateless _ReadWord function.
2167 """
2168 assert word_mode in (lex_mode_e.ShCommand,
2169 lex_mode_e.ShCommandFakeBrack,
2170 lex_mode_e.DBracket, lex_mode_e.BashRegex)
2171
2172 if self.buffered_word: # For integration with pgen2
2173 w = self.buffered_word
2174 self.buffered_word = None
2175 else:
2176 while True:
2177 w = self._ReadWord(word_mode)
2178 if w is not None:
2179 break
2180
2181 self.returned_newline = (word_.CommandId(w) == Id.Op_Newline)
2182 return w
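    # A typical call site (a sketch):
    #
    #   w = w_parser.ReadWord(lex_mode_e.ShCommand)
    #   if word_.CommandId(w) == Id.Op_Newline:
    #       ...  # end of line; read another word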
2183
2184 def ReadArithWord(self):
2185 # type: () -> word_t
2186 while True:
2187 w = self._ReadArithWord()
2188 if w is not None:
2189 break
2190 return w
2191
2192 def ReadHereDocBody(self, parts):
2193 # type: (List[word_part_t]) -> None
2194 """
2195 A here doc is like a double quoted context, except " isn't special.
2196 """
2197 self._ReadLikeDQ(None, False, parts)
2198 # Returns nothing
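    # e.g. in
    #
    #   cat <<EOF
    #   hi $name $(date)
    #   EOF
    #
    # the body lines parse like a double-quoted string, except that a bare
    # " stays literal.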
2199
2200 def ReadForPlugin(self):
2201 # type: () -> CompoundWord
2202 """For $PS1, $PS4, etc.
2203
2204 This is just like reading a here doc line. "\n" is allowed, as
2205 well as the typical substitutions ${x} $(echo hi) $((1 + 2)).
2206 """
2207 w = CompoundWord([])
2208 self._ReadLikeDQ(None, False, w.parts)
2209 return w
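    # e.g. PS1='$USER@$(hostname) $ ' takes this path; ${x}, $(echo hi),
    # and $((1 + 2)) all work inside prompt strings.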
2210
2211 def EmitDocToken(self, b):
2212 # type: (bool) -> None
2213 self.emit_doc_token = b
2214
2215 def Multiline(self, b):
2216 # type: (bool) -> None
2217 self.multiline = b
2218
2219
2220if 0:
2221 import collections
2222 WORD_HIST = collections.Counter()
2223
2224# vim: sw=4