# Copyright 2016 Andy Chu. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
"""
word_parse.py - Parse the shell word language.

Hairy example:

    hi$((1 + 2))"$(echo hi)"${var:-__"$(echo default)"__}

Substitutions can be nested, but which inner subs are allowed depends on the
outer sub.  Notes:

lex_mode_e.ShCommand (_ReadUnquotedLeftParts)
  All subs and quotes are allowed:
  $v ${v} $() `` $(()) '' "" $'' $"" <() >()

lex_mode_e.DQ (_ReadDoubleQuotedLeftParts)
  Var, Command, Arith, but no quotes.
  $v ${v} $() `` $(())
  No process substitution.

lex_mode_e.Arith
  Similar to DQ: Var, Command, and Arith sub, but no process sub.  bash
  doesn't allow quotes, but OSH does.  We allow ALL FOUR kinds of quotes,
  because we need those for associative array indexing.

lex_mode_e.VSub_ArgUnquoted
  Like ShCommand, everything is allowed (even process substitutions), but we
  stop at }, and space is SIGNIFICANT.

  Example: ${a:- b }

  ${X:-$v} ${X:-${v}} ${X:-$(echo hi)} ${X:-`echo hi`} ${X:-$((1+2))}
  ${X:-'single'} ${X:-"double"} ${X:-$'\n'} ${X:-<(echo hi)}

lex_mode_e.VSub_ArgDQ
  In contrast to DQ, VS_ARG_DQ accepts nested "" and $'' and $"", e.g.
  "${x:-"default"}".

  In contrast, VSub_ArgUnquoted respects single quotes and process
  substitution.

  It's weird that double quotes are allowed.  Space is also significant
  here, e.g. "${x:-a "b"}".
"""

from _devbuild.gen import grammar_nt
from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str, Kind
from _devbuild.gen.types_asdl import (lex_mode_t, lex_mode_e)
from _devbuild.gen.syntax_asdl import (
    BoolParamBox,
    Token,
    SimpleVarSub,
    loc,
    source,
    DoubleQuoted,
    SingleQuoted,
    BracedVarSub,
    CommandSub,
    ShArrayLiteral,
    AssocPair,
    bracket_op,
    bracket_op_t,
    suffix_op,
    suffix_op_t,
    rhs_word,
    rhs_word_e,
    rhs_word_t,
    word_e,
    word_t,
    CompoundWord,
    word_part,
    word_part_t,
    y_lhs_e,
    arith_expr_t,
    command,
    expr,
    expr_e,
    expr_t,
    pat_t,
    ArgList,
    Proc,
    Func,
    Subscript,
    Attribute,
    arith_expr,
)
from core import alloc
from core.error import p_die
from mycpp.mylib import log
from core import pyutil
from display import ui
from frontend import consts
from frontend import lexer
from frontend import reader
from osh import tdop
from osh import arith_parse
from osh import braces
from osh import word_
from osh import word_compile
from mycpp.mylib import tagswitch

from libc import HAVE_FNM_EXTMATCH

from typing import List, Optional, Tuple, cast
from typing import TYPE_CHECKING
if TYPE_CHECKING:
    from frontend.lexer import Lexer
    from frontend.parse_lib import ParseContext
    from frontend.reader import _Reader
    from osh.cmd_parse import VarChecker

unused1 = log
unused2 = Id_str

KINDS_THAT_END_WORDS = [Kind.Eof, Kind.WS, Kind.Op, Kind.Right]
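# Illustrative example: in 'echo hi; ls', the word 'echo' ends at whitespace
# (Kind.WS), 'hi' ends at the ; operator (Kind.Op), and 'ls' ends at EOF
# (Kind.Eof).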


class WordEmitter(object):
    """Common interface for [ and [["""

    def __init__(self):
        # type: () -> None
        """Empty constructor for mycpp."""
        pass

    def ReadWord(self, lex_mode):
        # type: (lex_mode_t) -> word_t
        raise NotImplementedError()


class WordParser(WordEmitter):

    def __init__(self, parse_ctx, lexer, line_reader):
        # type: (ParseContext, Lexer, _Reader) -> None
        self.parse_ctx = parse_ctx
        self.lexer = lexer
        self.line_reader = line_reader
        self.arena = line_reader.arena

        self.parse_opts = parse_ctx.parse_opts
        self.a_parser = tdop.TdopParser(arith_parse.Spec(), self,
                                        self.parse_opts)
        self.Reset()

    def Init(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Used to parse arithmetic, see ParseContext."""
        self.next_lex_mode = lex_mode

    def Reset(self):
        # type: () -> None
        """Called by interactive loop."""
        # For _GetToken()
        self.cur_token = None  # type: Token
        self.token_kind = Kind.Undefined
        self.token_type = Id.Undefined_Tok

        self.next_lex_mode = lex_mode_e.ShCommand

        # Boolean mutated by CommandParser via word_.ctx_EmitDocToken.  For
        # ### doc comments
        self.emit_doc_token = False
        # Boolean mutated by CommandParser via word_.ctx_Multiline.  '...'
        # starts multiline mode.
        self.multiline = False

        # For detecting invalid \n\n in multiline mode.  Counts what we got
        # directly from the lexer.
        self.newline_state = 0
        # For consolidating \n\n -> \n for the CALLER.  This simplifies the
        # parsers that consume words.
        self.returned_newline = False

        # For integration with pgen2
        self.buffered_word = None  # type: word_t

    def _GetToken(self):
        # type: () -> None
        """Call this when you need to make a decision based on any of:

        self.token_type
        self.token_kind
        self.cur_token
        """
        if self.next_lex_mode == lex_mode_e.Undefined:
            return  # _SetNext() not called, so do nothing

        is_fake = self.next_lex_mode == lex_mode_e.BashRegexFakeInner
        real_mode = (lex_mode_e.BashRegex if is_fake else self.next_lex_mode)

        self.cur_token = self.lexer.Read(real_mode)

        # MUTATE TOKEN for fake lexer mode.
        # This is for crazy stuff bash allows, like [[ s =~ (< >) ]]
        if (is_fake and self.cur_token.id
                in (Id.WS_Space, Id.BashRegex_AllowedInParens)):
            self.cur_token.id = Id.Lit_Chars

        self.token_type = self.cur_token.id
        self.token_kind = consts.GetKind(self.token_type)

        # number of consecutive newlines, ignoring whitespace
        if self.token_type == Id.Op_Newline:
            self.newline_state += 1
        elif self.token_kind != Kind.WS:
            self.newline_state = 0

        self.parse_ctx.trail.AppendToken(self.cur_token)  # For completion
        self.next_lex_mode = lex_mode_e.Undefined

    def _SetNext(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Set the next lex state, but don't actually read a token.

        We need this for proper interactive parsing.
        """
        self.next_lex_mode = lex_mode

    def _ReadVarOpArg(self, arg_lex_mode):
        # type: (lex_mode_t) -> rhs_word_t

        # NOTE: Operators like | and < are not treated as special, so
        # ${a:- | >} is valid, even when unquoted.
        self._SetNext(arg_lex_mode)
        self._GetToken()

        w = self._ReadVarOpArg2(arg_lex_mode, Id.Undefined_Tok,
                                True)  # empty_ok

        # If the Compound has no parts, and we're in a double-quoted VarSub
        # arg, and empty_ok, then return Empty.  This is so it can evaluate
        # to the empty string and not get elided.
        #
        # Examples:
        # - "${s:-}", "${s/%pat/}"
        # It's similar to LooksLikeShAssignment where we turn x= into x=''.
        # And it has the same potential problem of not having Token location
        # info.
        #
        # NOTE: empty_ok is False only for the PatSub pattern, which means
        # we'll return a Compound with no parts, which is explicitly checked
        # with a custom error message.
        if len(w.parts) == 0 and arg_lex_mode == lex_mode_e.VSub_ArgDQ:
            return rhs_word.Empty

        return w

    def _ReadVarOpArg2(self, arg_lex_mode, eof_type, empty_ok):
        # type: (lex_mode_t, Id_t, bool) -> CompoundWord
        """Return a CompoundWord.

        Helper function for _ReadVarOpArg and used directly by
        _ReadPatSubVarOp.
        """
        w = self._ReadCompoundWord3(arg_lex_mode, eof_type, empty_ok)
        #log('w %s', w)
        tilde = word_.TildeDetect(w)
        if tilde:
            w = tilde
        return w

    def _ReadSliceVarOp(self):
        # type: () -> suffix_op.Slice
        """
        Looking at the token after the first ':'

        ArithExpr? (':' ArithExpr? )? '}'
        """
        self._NextNonSpace()

        cur_id = self.token_type

        if cur_id in (Id.Arith_RBrace, Id.Arith_Colon):  # ${a:} or ${a::}
            begin = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            begin = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()  # advance

        if cur_id == Id.Arith_RBrace:  # ${a:1} or ${@:1}
            # No length specified, so it's N
            no_length = None  # type: Optional[arith_expr_t]
            return suffix_op.Slice(begin, no_length)

        elif cur_id == Id.Arith_Colon:  # ${a:1:} or ${@:1:}
            colon_tok = self.cur_token
            self._NextNonSpace()

            if self.token_type == Id.Arith_RBrace:
                # quirky bash behavior:
                # ${a:1:} or ${a::} means length ZERO
                # but ${a:1} or ${a:} means length N
                if self.parse_opts.strict_parse_slice():
                    p_die(
                        "Slice length: Add explicit zero, or omit : for N (strict_parse_slice)",
                        colon_tok)

                length = arith_expr.EmptyZero  # type: arith_expr_t
            else:
                length = self._ReadArithExpr(Id.Arith_RBrace)

            return suffix_op.Slice(begin, length)

        else:
            p_die("Expected : or } in slice", self.cur_token)

        raise AssertionError()  # for MyPy
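
    # Illustrative cases for the quirk handled above, with a=abcde:
    #   ${a:1}    => bcde  (no length, take the rest)
    #   ${a:1:}   => ''    (empty length means zero)
    #   ${a:1:2}  => bc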

    def _ReadPatSubVarOp(self):
        # type: () -> suffix_op.PatSub
        """Looking at the first '/' after VarOf:

        VarSub = ...
               | VarOf '/' Match ( '/' WORD? )?
        Match  = '/' WORD   # can't be empty
               | '#' WORD?  # may be empty
               | '%' WORD?
        """
        slash_tok = self.cur_token  # location info
        replace_mode = Id.Undefined_Tok  # bizarre syntax / # %

        self._SetNext(lex_mode_e.VSub_ArgUnquoted)  # advance past /

        self._GetToken()
        if self.token_type == Id.Right_DollarBrace:
            pat = CompoundWord([])
            return suffix_op.PatSub(pat, rhs_word.Empty, replace_mode,
                                    slash_tok)

        if self.token_type in (Id.Lit_Slash, Id.Lit_Pound, Id.Lit_Percent):
            replace_mode = self.token_type
            self._SetNext(lex_mode_e.VSub_ArgUnquoted)

        # Bash quirk:
        # echo ${x/#/replace} has an empty pattern
        # echo ${x////replace} is non-empty; it means echo ${x//'/'/replace}
        empty_ok = replace_mode != Id.Lit_Slash
        pat = self._ReadVarOpArg2(lex_mode_e.VSub_ArgUnquoted, Id.Lit_Slash,
                                  empty_ok)
        #log('pat 1 %r', pat)

        if self.token_type == Id.Lit_Slash:
            # read until }
            replace = self._ReadVarOpArg(
                lex_mode_e.VSub_ArgUnquoted)  # type: rhs_word_t
            #log('r 1 %r', replace)
        else:
            # e.g. ${v/a} is the same as ${v/a/} -- empty replacement string
            replace = rhs_word.Empty

        self._GetToken()
        if self.token_type != Id.Right_DollarBrace:
            # This happens on invalid code
            p_die(
                "Expected } after replacement string, got %s" %
                ui.PrettyId(self.token_type), self.cur_token)

        return suffix_op.PatSub(pat, replace, replace_mode, slash_tok)
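
    # Examples handled above:
    #   ${x/pat/repl}   normal substitution
    #   ${x//pat/repl}  replace_mode is Lit_Slash: replace all matches
    #   ${x/#pat/repl}  replace_mode is Lit_Pound: anchor at the start
    #   ${x/%pat/repl}  replace_mode is Lit_Percent: anchor at the end
    #   ${x/pat}        same as ${x/pat/} -- empty replacement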

    def _ReadSubscript(self):
        # type: () -> bracket_op_t
        """ Subscript = '[' ('@' | '*' | ArithExpr) ']' """
        # Lookahead to see if we get @ or *.  Otherwise read a full
        # arithmetic expression.
        next_id = self.lexer.LookPastSpace(lex_mode_e.Arith)
        if next_id in (Id.Lit_At, Id.Arith_Star):
            op = bracket_op.WholeArray(next_id)  # type: bracket_op_t

            self._SetNext(lex_mode_e.Arith)  # skip past [
            self._GetToken()
            self._SetNext(lex_mode_e.Arith)  # skip past @
            self._GetToken()
        else:
            self._SetNext(lex_mode_e.Arith)  # skip past [
            anode = self._ReadArithExpr(Id.Arith_RBracket)
            op = bracket_op.ArrayIndex(anode)

        if self.token_type != Id.Arith_RBracket:  # Should be looking at ]
            p_die('Expected ] to close subscript', self.cur_token)

        self._SetNext(lex_mode_e.VSub_2)  # skip past ]
        self._GetToken()  # Needed to be in the same spot as no subscript

        return op
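
    # Examples: ${a[i+1]} reads an arithmetic index, while ${a[@]} and
    # ${a[*]} are detected by lookahead as whole-array operations.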

    def _ParseVarOf(self):
        # type: () -> BracedVarSub
        """
        VarOf = NAME Subscript?
              | NUMBER     # no subscript allowed, none of these are arrays
                           # ${@[1]} doesn't work, even though slicing does
              | VarSymbol
        """
        self._GetToken()
        name_token = self.cur_token
        self._SetNext(lex_mode_e.VSub_2)

        self._GetToken()  # Check for []
        if self.token_type == Id.VOp2_LBracket:
            bracket_op = self._ReadSubscript()
        else:
            bracket_op = None

        part = BracedVarSub.CreateNull()
        part.name_tok = name_token
        part.var_name = lexer.TokenVal(name_token)
        part.bracket_op = bracket_op
        return part

    def _ParseVarExpr(self, arg_lex_mode, allow_query=False):
        # type: (lex_mode_t, bool) -> BracedVarSub
        """Start parsing at the op -- we already skipped past the name."""
        part = self._ParseVarOf()

        self._GetToken()
        if self.token_type == Id.Right_DollarBrace:
            return part  # no ops

        op_kind = self.token_kind

        if op_kind == Kind.VTest:
            tok = self.cur_token
            arg_word = self._ReadVarOpArg(arg_lex_mode)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            part.suffix_op = suffix_op.Unary(tok, arg_word)

        elif op_kind == Kind.VOpYsh:
            tok = self.cur_token
            arg_word = self._ReadVarOpArg(arg_lex_mode)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            UP_arg_word = arg_word
            with tagswitch(arg_word) as case:
                if case(rhs_word_e.Empty):
                    pass
                elif case(rhs_word_e.Compound):
                    arg_word = cast(CompoundWord, UP_arg_word)
                    # This handles ${x|html} and ${x %.3f} now
                    # However I think ${x %.3f} should be statically parsed?
                    # It can enter the printf lexer modes.
                    ok, arg, quoted = word_.StaticEval(arg_word)
                    if not ok or quoted:
                        p_die('Expected a constant argument',
                              loc.Word(arg_word))

            part.suffix_op = suffix_op.Static(tok, arg)

        elif op_kind == Kind.VOp0:
            part.suffix_op = self.cur_token  # Nullary
            self._SetNext(lex_mode_e.VSub_2)  # Expecting }
            self._GetToken()

        elif op_kind == Kind.VOp1:  # % %% # ## etc.
            tok = self.cur_token
            # Weird exception that all shells have: these operators take a
            # glob pattern, so they're lexed as VSub_ArgUnquoted, not
            # VSub_ArgDQ
            arg_word = self._ReadVarOpArg(lex_mode_e.VSub_ArgUnquoted)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            part.suffix_op = suffix_op.Unary(tok, arg_word)

        elif op_kind == Kind.VOp2:  # / : [ ]
            if self.token_type == Id.VOp2_Slash:
                patsub_op = self._ReadPatSubVarOp()  # type: suffix_op_t
                part.suffix_op = patsub_op

                # Checked by the method above
                assert self.token_type == Id.Right_DollarBrace, self.cur_token

            elif self.token_type == Id.VOp2_Colon:
                part.suffix_op = self._ReadSliceVarOp()
                # NOTE: } in arithmetic mode.
                if self.token_type != Id.Arith_RBrace:
                    # Token seems off; doesn't point to X in ${a:1:2 X
                    p_die('Expected } to close ${', self.cur_token)

            else:
                # TODO: Does this ever happen?
                p_die('Unexpected token in ${} (%s)' % 'VOp2',
                      self.cur_token)

        elif op_kind == Kind.VOp3:  # ${prefix@} etc.
            if allow_query:
                part.suffix_op = self.cur_token  # Nullary
                self._SetNext(lex_mode_e.VSub_2)  # Expecting }
                self._GetToken()
            else:
                p_die("Unexpected token in ${} (%s)" % 'VOp3',
                      self.cur_token)

        # NOTE: Arith_RBrace is for slicing, because it reads } in
        # arithmetic mode.  It's redundantly checked above.
        if self.token_type not in (Id.Right_DollarBrace, Id.Arith_RBrace):
            # ${a.} or ${!a.}
            p_die('Expected } to close ${', self.cur_token)

        # Now look for ops
        return part

    def _ReadZshVarSub(self, left_token):
        # type: (Token) -> word_part.ZshVarSub

        self._SetNext(lex_mode_e.VSub_Zsh)  # Move past ${(foo)

        # Can be empty
        w = self._ReadCompoundWord3(lex_mode_e.VSub_Zsh,
                                    Id.Right_DollarBrace, True)
        self._GetToken()
        return word_part.ZshVarSub(left_token, w, self.cur_token)

    def ReadBracedVarSub(self, left_token):
        # type: (Token) -> Tuple[BracedVarSub, Token]
        """ For YSH expressions like var x = ${x:-"default"}. """
        part = self._ReadBracedVarSub(left_token, d_quoted=False)
        last_token = self.cur_token
        return part, last_token

    def _ReadBracedVarSub(self, left_token, d_quoted):
        # type: (Token, bool) -> BracedVarSub
        """For the ${} expression language.

        NAME       = [a-zA-Z_][a-zA-Z0-9_]*
        NUMBER     = [0-9]+                 # ${10}, ${11}, ...

        Subscript  = '[' ('@' | '*' | ArithExpr) ']'
        VarSymbol  = '!' | '@' | '#' | ...
        VarOf      = NAME Subscript?
                   | NUMBER     # no subscript allowed, none of these are
                                # arrays; ${@[1]} doesn't work, even though
                                # slicing does
                   | VarSymbol

        NULLARY_OP = '@Q' | '@E' | '@P' | '@A' | '@a'  # VOp0

        TEST_OP    = '-' | ':-' | '=' | ':=' | '+' | ':+' | '?' | ':?'
        STRIP_OP   = '#' | '##' | '%' | '%%'
        CASE_OP    = ',' | ',,' | '^' | '^^'
        UnaryOp    = TEST_OP | STRIP_OP | CASE_OP

        YSH_UNARY  = '|' | ' '  # ${x|html} and ${x %.3f}.
                                # SPACE is the operator, not %
        Match      = ('/' | '#' | '%') WORD  # match all / prefix / suffix
        VarExpr    = VarOf
                   | VarOf NULLARY_OP
                   | VarOf UnaryOp WORD
                   | VarOf YSH_UNARY STATIC_WORD
                   | VarOf ':' ArithExpr (':' ArithExpr )?
                   | VarOf '/' Match '/' WORD

        LengthExpr = '#' VarOf  # can't apply operators after length

        RefOrKeys  = '!' VarExpr  # CAN apply operators after a named ref
                                  # ${!ref[0]} vs ${!keys[@]} resolved later

        PrefixQuery = '!' NAME ('*' | '@')  # list variable names with a
                                            # prefix

        BuiltinSub = '.' WORD+  # ${.myproc 'builtin' $sub}

        VarSub     = LengthExpr
                   | RefOrKeys
                   | PrefixQuery
                   | VarExpr
                   | BuiltinSub

        NOTES:
        - Arithmetic expressions are used twice, inside subscripts ${a[x+1]}
          and slicing ${a:x+1:y+2}
        - ${#} and ${!} need LL(2) lookahead (considering how my tokenizer
          works)
        - @ and * are technically arithmetic expressions in this
          implementation
        - We don't account for bash 4.4: ${param@operator} -- Q E P A a.
          Note that it's also vectorized.

        Strictness over bash:
        - echo ${a[0][0]} doesn't do anything useful, so we disallow it from
          the grammar
        - ! and # prefixes can't be composed, even though named refs can be
          composed with other operators
        - '#' means 4 different things: length prefix, VarSymbol, UnaryOp to
          strip a prefix, and it can also be a literal part of WORD.

        From the parser's point of view, the prefix # can't be combined with
        UnaryOp/slicing/matching, and the ! can.  However

        - ${a[@]:1:2} is not allowed
        - ${#a[@]:1:2} is allowed, but gives the wrong answer
        """
        if d_quoted:
            arg_lex_mode = lex_mode_e.VSub_ArgDQ
        else:
            arg_lex_mode = lex_mode_e.VSub_ArgUnquoted

        self._SetNext(lex_mode_e.VSub_1)
        self._GetToken()

        ty = self.token_type
        first_tok = self.cur_token

        if ty == Id.VSub_Pound:
            # Disambiguate
            next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
            if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
                # e.g. a name, '#' is the prefix
                self._SetNext(lex_mode_e.VSub_1)
                part = self._ParseVarOf()

                self._GetToken()
                if self.token_type != Id.Right_DollarBrace:
                    p_die('Expected } after length expression',
                          self.cur_token)

                part.prefix_op = first_tok

            else:  # not a prefix, '#' is the variable
                part = self._ParseVarExpr(arg_lex_mode)

        elif ty == Id.VSub_Bang:
            next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
            if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
                # e.g. a name, '!' is the prefix
                # ${!a} -- this is a ref
                # ${!3} -- this is a ref
                # ${!a[1]} -- this is a ref
                # ${!a[@]} -- this is a keys
                # No lookahead -- do it in a second step, or at runtime
                self._SetNext(lex_mode_e.VSub_1)
                part = self._ParseVarExpr(arg_lex_mode, allow_query=True)

                part.prefix_op = first_tok

            else:  # not a prefix, '!' is the variable
                part = self._ParseVarExpr(arg_lex_mode)

        elif ty == Id.VSub_Dot:
            # Note: this will become a new builtin_sub type, so this method
            # must return word_part_t rather than BracedVarSub.  I don't
            # think that should cause problems.
            p_die('TODO: ${.myproc builtin sub}', self.cur_token)

        # VS_NAME, VS_NUMBER, symbol that isn't # or !
        elif self.token_kind == Kind.VSub:
            part = self._ParseVarExpr(arg_lex_mode)

        else:
            # e.g. ${^}
            p_die('Unexpected token in ${}', self.cur_token)

        part.left = left_token  # attach the argument
        part.right = self.cur_token
        return part
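
    # Disambiguation examples for the # and ! prefixes:
    #   ${#}      -> number of args; '#' is the variable itself
    #   ${#a}     -> length of a; '#' is the length prefix
    #   ${!a}     -> named reference to whatever a contains
    #   ${!a[@]}  -> the keys of array a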

    def _ReadSingleQuoted(self, left_token, lex_mode):
        # type: (Token, lex_mode_t) -> SingleQuoted
        """Internal method to read a word_part."""
        tokens = []  # type: List[Token]
        # In command mode, we never disallow backslashes like '\'
        right_quote = self.ReadSingleQuoted(lex_mode, left_token, tokens,
                                            False)
        sval = word_compile.EvalSingleQuoted(left_token.id, tokens)
        node = SingleQuoted(left_token, sval, right_quote)
        return node

    def ReadSingleQuoted(self, lex_mode, left_token, out_tokens,
                         is_ysh_expr):
        # type: (lex_mode_t, Token, List[Token], bool) -> Token
        """Appends to out_tokens; returns last token

        Used by expr_parse.py
        """
        # TODO: Remove and use out_tokens
        tokens = []  # type: List[Token]

        # echo '\' is allowed, but x = '\' is invalid, in favor of x = r'\'
        no_backslashes = is_ysh_expr and left_token.id == Id.Left_SingleQuote

        expected_end_tokens = 3 if left_token.id in (
            Id.Left_TSingleQuote, Id.Left_RTSingleQuote,
            Id.Left_UTSingleQuote, Id.Left_BTSingleQuote) else 1
        num_end_tokens = 0

        while num_end_tokens < expected_end_tokens:
            self._SetNext(lex_mode)
            self._GetToken()

            # Kind.Char emitted in lex_mode.SQ_C
            if self.token_kind in (Kind.Lit, Kind.Char):
                tok = self.cur_token
                # Happens in lex_mode_e.SQ: 'one\two' is ambiguous, should
                # be r'one\two' or c'one\\two'
                if no_backslashes and lexer.TokenContains(tok, '\\'):
                    p_die(
                        r"Strings with backslashes should look like r'\n' or u'\n' or b'\n'",
                        tok)

                if is_ysh_expr:
                    # Disallow var x = $'\001'.  Arguably we don't need
                    # these checks because u'\u{1}' is the way to write it.
                    if self.token_type == Id.Char_Octal3:
                        p_die(
                            r"Use \xhh or \u{...} instead of octal escapes in YSH strings",
                            tok)

                    if self.token_type == Id.Char_Hex and self.cur_token.length != 4:
                        # disallow \xH
                        p_die(
                            r'Invalid hex escape in YSH string (must be \xHH)',
                            tok)

                tokens.append(tok)

            elif self.token_kind == Kind.Unknown:
                tok = self.cur_token
                assert tok.id == Id.Unknown_Backslash, tok

                # x = $'\z' is disallowed; ditto for echo $'\z' if
                # shopt -u parse_backslash
                if is_ysh_expr or not self.parse_opts.parse_backslash():
                    p_die(
                        "Invalid char escape in C-style string literal (OILS-ERR-11)",
                        tok)

                tokens.append(tok)

            elif self.token_kind == Kind.Eof:
                p_die('Unexpected EOF in single-quoted string that began here',
                      left_token)

            elif self.token_kind == Kind.Right:
                # assume Id.Right_SingleQuote
                num_end_tokens += 1
                tokens.append(self.cur_token)

            else:
                raise AssertionError(self.cur_token)

            if self.token_kind != Kind.Right:
                num_end_tokens = 0  # we need three in a ROW

        if expected_end_tokens == 1:
            tokens.pop()
        elif expected_end_tokens == 3:  # Get rid of spurious end tokens
            tokens.pop()
            tokens.pop()
            tokens.pop()

        # Remove space from ''' r''' $''' in both expression mode and
        # command mode
        if left_token.id in (Id.Left_TSingleQuote, Id.Left_RTSingleQuote,
                             Id.Left_UTSingleQuote, Id.Left_BTSingleQuote):
            word_compile.RemoveLeadingSpaceSQ(tokens)

        # Validation after lexing - same 2 checks in j8.LexerDecoder
        is_u_string = left_token.id in (Id.Left_USingleQuote,
                                        Id.Left_UTSingleQuote)

        for tok in tokens:
            # u'\yff' is not valid, but b'\yff' is
            if is_u_string and tok.id == Id.Char_YHex:
                p_die(
                    r"%s escapes not allowed in u'' strings" %
                    lexer.TokenVal(tok), tok)

        out_tokens.extend(tokens)
        return self.cur_token
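
    # Note on the counting above: a triple-quoted string ends only at three
    # consecutive quote tokens, so in ''' a'b ''' the quote after 'a' bumps
    # num_end_tokens to 1, and the following literal resets it to 0.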

    def _ReadDoubleQuotedLeftParts(self):
        # type: () -> word_part_t
        """Read substitution parts in a double quoted context."""
        if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick):
            return self._ReadCommandSub(self.token_type, d_quoted=True)

        if self.token_type == Id.Left_DollarBrace:
            return self._ReadBracedVarSub(self.cur_token, d_quoted=True)

        if self.token_type == Id.Left_DollarDParen:
            return self._ReadArithSub()

        if self.token_type == Id.Left_DollarBracket:
            return self._ReadExprSub(lex_mode_e.DQ)

        if self.token_type == Id.Left_DollarBraceZsh:
            return self._ReadZshVarSub(self.cur_token)

        raise AssertionError(self.cur_token)

    def _ReadYshSingleQuoted(self, left_id):
        # type: (Id_t) -> CompoundWord
        """Read YSH style strings

        r''  u''  b''
        r''' '''  u''' '''  b''' '''
        """
        #log('BEF self.cur_token %s', self.cur_token)
        if left_id == Id.Left_RSingleQuote:
            lexer_mode = lex_mode_e.SQ_Raw
            triple_left_id = Id.Left_RTSingleQuote
        elif left_id == Id.Left_USingleQuote:
            lexer_mode = lex_mode_e.J8_Str
            triple_left_id = Id.Left_UTSingleQuote
        elif left_id == Id.Left_BSingleQuote:
            lexer_mode = lex_mode_e.J8_Str
            triple_left_id = Id.Left_BTSingleQuote
        else:
            raise AssertionError(left_id)

        # Needed for syntax checks
        left_tok = self.cur_token
        left_tok.id = left_id

        sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)

        if (len(sq_part.sval) == 0 and self.lexer.ByteLookAhead() == "'"):
            self._SetNext(lex_mode_e.ShCommand)
            self._GetToken()

            assert self.token_type == Id.Left_SingleQuote
            # HACK: magically transform the third ' in u''' to
            # Id.Left_UTSingleQuote, so that ''' is the terminator
            left_tok = self.cur_token
            left_tok.id = triple_left_id

            # Handles stripping leading whitespace
            sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)

        # Advance and validate
        self._SetNext(lex_mode_e.ShCommand)

        self._GetToken()
        if self.token_kind not in KINDS_THAT_END_WORDS:
            p_die('Unexpected token after YSH single-quoted string',
                  self.cur_token)

        return CompoundWord([sq_part])

    def _ReadUnquotedLeftParts(self, triple_out):
        # type: (Optional[BoolParamBox]) -> word_part_t
        """Read substitutions and quoted strings (for lex_mode_e.ShCommand).

        If triple_out is set, then we try parsing triple quoted strings,
        and set its value to True if we got one.
        """
        if self.token_type in (Id.Left_DoubleQuote,
                               Id.Left_DollarDoubleQuote):
            # Note: $"" is a synonym for "".  It might make sense if it
            # added \n \0 \x00 \u{123} etc.  But that's not what bash does!
            dq_part = self._ReadDoubleQuoted(self.cur_token)
            # Got empty word "" and there's a " after
            if (triple_out and len(dq_part.parts) == 0 and
                    self.lexer.ByteLookAhead() == '"'):

                self._SetNext(lex_mode_e.ShCommand)
                self._GetToken()
                # HACK: magically transform the third " in """ to
                # Id.Left_TDoubleQuote, so that """ is the terminator
                left_dq_token = self.cur_token
                left_dq_token.id = Id.Left_TDoubleQuote
                triple_out.b = True  # let caller know we got it
                return self._ReadDoubleQuoted(left_dq_token)

            return dq_part

        if self.token_type in (Id.Left_SingleQuote, Id.Left_RSingleQuote,
                               Id.Left_DollarSingleQuote):
            if self.token_type == Id.Left_SingleQuote:
                lexer_mode = lex_mode_e.SQ_Raw
                triple_left_id = Id.Left_TSingleQuote
            elif self.token_type == Id.Left_RSingleQuote:
                lexer_mode = lex_mode_e.SQ_Raw
                triple_left_id = Id.Left_RTSingleQuote
            else:
                lexer_mode = lex_mode_e.SQ_C
                # there is no such thing as $'''
                triple_left_id = Id.Undefined_Tok

            sq_part = self._ReadSingleQuoted(self.cur_token, lexer_mode)

            # Got empty '' or r'' and there's a ' after
            # u'' and b'' are handled in _ReadYshSingleQuoted
            if (triple_left_id != Id.Undefined_Tok and
                    triple_out is not None and len(sq_part.sval) == 0 and
                    self.lexer.ByteLookAhead() == "'"):

                self._SetNext(lex_mode_e.ShCommand)
                self._GetToken()

                # HACK: magically transform the third ' in ''' to
                # Id.Left_TSingleQuote, so that ''' is the terminator
                left_sq_token = self.cur_token
                left_sq_token.id = triple_left_id

                triple_out.b = True  # let caller know we got it
                return self._ReadSingleQuoted(left_sq_token, lexer_mode)

            return sq_part

        if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick,
                               Id.Left_ProcSubIn, Id.Left_ProcSubOut):
            return self._ReadCommandSub(self.token_type, d_quoted=False)

        if self.token_type == Id.Left_DollarBrace:
            return self._ReadBracedVarSub(self.cur_token, d_quoted=False)

        if self.token_type == Id.Left_DollarDParen:
            return self._ReadArithSub()

        if self.token_type == Id.Left_DollarBracket:
            return self._ReadExprSub(lex_mode_e.ShCommand)

        if self.token_type == Id.Left_DollarBraceZsh:
            return self._ReadZshVarSub(self.cur_token)

        raise AssertionError(self.cur_token)

    def _ReadExtGlob(self):
        # type: () -> word_part.ExtGlob
        """
        Grammar:
          Item    = CompoundWord | EPSILON  # important: @(foo|) is allowed
          LEFT    = '@(' | '*(' | '+(' | '?(' | '!('
          RIGHT   = ')'
          ExtGlob = LEFT (Item '|')* Item RIGHT  # ITEM may be empty
          Compound includes ExtGlob
        """
        left_token = self.cur_token
        right_token = None  # type: Token
        arms = []  # type: List[CompoundWord]

        self.lexer.PushHint(Id.Op_RParen, Id.Right_ExtGlob)
        self._SetNext(lex_mode_e.ExtGlob)  # advance past LEFT

        read_word = False  # did we just read a word?  To handle @(||).

        while True:
            self._GetToken()

            if self.token_type == Id.Right_ExtGlob:
                if not read_word:
                    arms.append(CompoundWord([]))
                right_token = self.cur_token
                break

            elif self.token_type == Id.Op_Pipe:
                if not read_word:
                    arms.append(CompoundWord([]))
                read_word = False
                self._SetNext(lex_mode_e.ExtGlob)

            # lex_mode_e.ExtGlob should only produce these 4 kinds of tokens
            elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub,
                                     Kind.ExtGlob):
                w = self._ReadCompoundWord(lex_mode_e.ExtGlob)
                arms.append(w)
                read_word = True

            elif self.token_kind == Kind.Eof:
                p_die('Unexpected EOF reading extended glob that began here',
                      left_token)

            else:
                raise AssertionError(self.cur_token)

        return word_part.ExtGlob(left_token, arms, right_token)
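
    # Examples: @(foo|bar) has two arms; @(foo|) and @(||) have empty arms,
    # which is why read_word tracks whether a word preceded each | or ).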

    def _ReadBashRegexGroup(self):
        # type: () -> word_part.BashRegexGroup
        """
        Grammar:
          BashRegexGroup = '(' WORD? ')'
        """
        left_token = self.cur_token
        assert left_token.id == Id.BashRegex_LParen, left_token

        arms = []  # type: List[CompoundWord]

        self.lexer.PushHint(Id.Op_RParen, Id.Right_BashRegexGroup)
        self._SetNext(lex_mode_e.BashRegexFakeInner)  # advance past LEFT

        self._GetToken()
        if self.token_type == Id.Right_BashRegexGroup:  # empty ()
            return word_part.BashRegexGroup(left_token, None, self.cur_token)

        # lex_mode_e.BashRegex should only produce these 4 kinds of tokens
        if self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub,
                               Kind.BashRegex):
            # Fake lexer mode that translates Id.WS_Space to Id.Lit_Chars
            # To allow bash style [[ s =~ (a b) ]]
            w = self._ReadCompoundWord(lex_mode_e.BashRegexFakeInner)
            arms.append(w)

            self._GetToken()
            if self.token_type != Id.Right_BashRegexGroup:
                p_die('Expected ) to close bash regex group', self.cur_token)

            return word_part.BashRegexGroup(left_token, w, self.cur_token)

        p_die('Expected word after ( opening bash regex group',
              self.cur_token)

    def _ReadLikeDQ(self, left_token, is_ysh_expr, out_parts):
        # type: (Optional[Token], bool, List[word_part_t]) -> None
        """
        Args:
          left_token: A token if we are reading a double quoted part, or
            None if we're reading a here doc.
          is_ysh_expr: Whether to disallow backticks and invalid char
            escapes
          out_parts: list of word_part to append to
        """
        if left_token:
            if left_token.id in (Id.Left_TDoubleQuote,
                                 Id.Left_DollarTDoubleQuote):
                expected_end_tokens = 3
            else:
                expected_end_tokens = 1
        else:
            expected_end_tokens = 1000  # here doc will break

        num_end_tokens = 0
        while num_end_tokens < expected_end_tokens:
            self._SetNext(lex_mode_e.DQ)
            self._GetToken()

            if self.token_kind == Kind.Lit:
                if self.token_type == Id.Lit_EscapedChar:
                    tok = self.cur_token
                    ch = lexer.TokenSliceLeft(tok, 1)
                    part = word_part.EscapedLiteral(tok,
                                                    ch)  # type: word_part_t
                else:
                    if self.token_type == Id.Lit_BadBackslash:
                        # echo "\z" is OK in shell, but x = "\z" is a syntax
                        # error in YSH.
                        # Slight hole: We don't catch x = ${undef:-"\z"}
                        # because of the recursion (unless parse_backslash)
                        if (is_ysh_expr or
                                not self.parse_opts.parse_backslash()):
                            p_die(
                                "Invalid char escape in double quoted string (OILS-ERR-12)",
                                self.cur_token)
                    elif self.token_type == Id.Lit_Dollar:
                        if is_ysh_expr or not self.parse_opts.parse_dollar():
                            p_die("Literal $ should be quoted like \$",
                                  self.cur_token)

                    part = self.cur_token
                out_parts.append(part)

            elif self.token_kind == Kind.Left:
                if self.token_type == Id.Left_Backtick and is_ysh_expr:
                    p_die("Invalid backtick: use $(cmd) or \\` in YSH strings",
                          self.cur_token)

                part = self._ReadDoubleQuotedLeftParts()
                out_parts.append(part)

            elif self.token_kind == Kind.VSub:
                tok = self.cur_token
                part = SimpleVarSub(tok)
                out_parts.append(part)
                # NOTE: parsing "$f(x)" would BREAK CODE.  Could add support
                # for it later.

            elif self.token_kind == Kind.Right:
                assert self.token_type == Id.Right_DoubleQuote, self.token_type
                if left_token:
                    num_end_tokens += 1

                # In a here doc, the right quote is literal!
                out_parts.append(self.cur_token)

            elif self.token_kind == Kind.Eof:
                if left_token:
                    p_die(
                        'Unexpected EOF reading double-quoted string that began here',
                        left_token)
                else:  # here docs will have an EOF in their token stream
                    break

            else:
                raise AssertionError(self.cur_token)

            if self.token_kind != Kind.Right:
                num_end_tokens = 0  # """ must be CONSECUTIVE

        if expected_end_tokens == 1:
            out_parts.pop()
        elif expected_end_tokens == 3:
            out_parts.pop()
            out_parts.pop()
            out_parts.pop()

        # Remove space from """ in both expression mode and command mode
        if (left_token and left_token.id
                in (Id.Left_TDoubleQuote, Id.Left_DollarTDoubleQuote)):
            word_compile.RemoveLeadingSpaceDQ(out_parts)

        # Return nothing, since we appended to 'out_parts'
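
    # The triple-quoted case above mirrors ReadSingleQuoted: """ ends only
    # at three consecutive " tokens, then RemoveLeadingSpaceDQ strips the
    # leading whitespace, like the ''' handling for single quotes.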

    def _ReadDoubleQuoted(self, left_token):
        # type: (Token) -> DoubleQuoted
        """Helper function for "hello $name".

        Args:
          left_token: the opening quote token

        Also ${foo%%a b c}  # treated as double quoted, until you hit }
        """
        parts = []  # type: List[word_part_t]
        self._ReadLikeDQ(left_token, False, parts)

        right_quote = self.cur_token
        return DoubleQuoted(left_token, parts, right_quote)

    def ReadDoubleQuoted(self, left_token, parts):
        # type: (Token, List[word_part_t]) -> Token
        """For expression mode.

        Read var x = "${dir:-}/$name"; etc.
        """
        self._ReadLikeDQ(left_token, True, parts)
        return self.cur_token

    def _ReadCommandSub(self, left_id, d_quoted=False):
        # type: (Id_t, bool) -> CommandSub
        """
        NOTE: This is not in the grammar, because word parts aren't in the
        grammar!

        command_sub = '$(' command_list ')'
                    | '@(' command_list ')'
                    | '<(' command_list ')'
                    | '>(' command_list ')'
                    | ` command_list `
        """
        left_token = self.cur_token

        # Set the lexer in a state so ) becomes the EOF token.
        if left_id in (Id.Left_DollarParen, Id.Left_AtParen,
                       Id.Left_ProcSubIn, Id.Left_ProcSubOut):
            self._SetNext(lex_mode_e.ShCommand)  # advance past $( etc.

            right_id = Id.Eof_RParen
            self.lexer.PushHint(Id.Op_RParen, right_id)
            c_parser = self.parse_ctx.MakeParserForCommandSub(
                self.line_reader, self.lexer, right_id)
            # NOTE: This doesn't use something like main_loop because we
            # don't want to interleave parsing and execution!  Unlike
            # 'source' and 'eval'.
            node = c_parser.ParseCommandSub()

            right_token = c_parser.w_parser.cur_token

        elif left_id == Id.Left_Backtick and self.parse_ctx.do_lossless:
            # NOTE: This is an APPROXIMATE solution for translation ONLY.
            # See test/osh2oil.

            right_id = Id.Eof_Backtick
            self.lexer.PushHint(Id.Left_Backtick, right_id)
            c_parser = self.parse_ctx.MakeParserForCommandSub(
                self.line_reader, self.lexer, right_id)
            node = c_parser.ParseCommandSub()
            right_token = c_parser.w_parser.cur_token

        elif left_id == Id.Left_Backtick:
            if not self.parse_opts.parse_backticks():
                p_die('Use $(cmd) instead of backticks (parse_backticks)',
                      left_token)

            self._SetNext(lex_mode_e.Backtick)  # advance past `

            parts = []  # type: List[str]
            while True:
                self._GetToken()
                #log("TOK %s", self.cur_token)

                if self.token_type == Id.Backtick_Quoted:
                    # Remove leading \
                    parts.append(lexer.TokenSliceLeft(self.cur_token, 1))

                elif self.token_type == Id.Backtick_DoubleQuote:
                    # Compatibility: If backticks are double quoted, then
                    # double quotes within them have to be \"
                    # Shells aren't smart enough to match nested " and `
                    # quotes (but OSH is)
                    if d_quoted:
                        # Remove leading \
                        parts.append(lexer.TokenSliceLeft(self.cur_token, 1))
                    else:
                        parts.append(lexer.TokenVal(self.cur_token))

                elif self.token_type == Id.Backtick_Other:
                    parts.append(lexer.TokenVal(self.cur_token))

                elif self.token_type == Id.Backtick_Right:
                    break

                elif self.token_type == Id.Eof_Real:
                    # Note: this parse error is in the ORIGINAL context.  No
                    # code_str yet.
                    p_die('Unexpected EOF while looking for closing backtick',
                          left_token)

                else:
                    raise AssertionError(self.cur_token)

                self._SetNext(lex_mode_e.Backtick)

            # Calculate right SPID on CommandSub BEFORE re-parsing.
            right_token = self.cur_token

            code_str = ''.join(parts)
            #log('code %r', code_str)

            # NOTE: This is similar to how we parse aliases in
            # osh/cmd_parse.py.  It won't have the same location info as
            # MakeParserForCommandSub(), because the lexer is different.
            arena = self.parse_ctx.arena
            #arena = alloc.Arena()
            line_reader = reader.StringLineReader(code_str, arena)
            c_parser = self.parse_ctx.MakeOshParser(line_reader)
            src = source.Reparsed('backticks', left_token, right_token)
            with alloc.ctx_SourceCode(arena, src):
                node = c_parser.ParseCommandSub()

        else:
            raise AssertionError(left_id)

        return CommandSub(left_token, node, right_token)
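
    # Backtick examples handled above:
    #   `echo hi`         reparsed like the command sub $(echo hi)
    #   "`echo \"x\"`"    inside "", inner double quotes must be \"
    #   `echo \`date\``   nesting requires backslash-escaped backticks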

    def _ReadExprSub(self, lex_mode):
        # type: (lex_mode_t) -> word_part.ExprSub
        """$[d->key]  $[obj.method()]  etc."""
        left_token = self.cur_token

        self._SetNext(lex_mode_e.Expr)
        enode, right_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.ysh_expr_sub)

        self._SetNext(lex_mode)  # Move past ]
        return word_part.ExprSub(left_token, enode, right_token)

    def ParseVarDecl(self, kw_token):
        # type: (Token) -> command.VarDecl
        """
        oil_var_decl: name_type_list '=' testlist end_stmt

        Note that assignments must end with \n ; } or EOF.  Unlike shell
        assignments, we disallow:

        var x = 42 | wc -l
        var x = 42 && echo hi
        """
        self._SetNext(lex_mode_e.Expr)
        enode, last_token = self.parse_ctx.ParseVarDecl(kw_token, self.lexer)
        # Hack to move } from what the Expr lexer mode gives to what
        # CommandParser wants
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace

        # Let the CommandParser see the Op_Semi or Op_Newline.
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)  # always back to this
        return enode

    def ParseMutation(self, kw_token, var_checker):
        # type: (Token, VarChecker) -> command.Mutation
        """
        setvar i = 42
        setvar i += 1
        setvar a[i] = 42
        setvar a[i] += 1
        setvar d.key = 42
        setvar d.key += 1
        """
        self._SetNext(lex_mode_e.Expr)
        enode, last_token = self.parse_ctx.ParseMutation(kw_token,
                                                         self.lexer)
        # Hack to move } from what the Expr lexer mode gives to what
        # CommandParser wants
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace

        for lhs in enode.lhs:
            UP_lhs = lhs
            with tagswitch(lhs) as case:
                if case(y_lhs_e.Var):
                    lhs = cast(Token, UP_lhs)
                    var_checker.Check(kw_token.id, lexer.LazyStr(lhs), lhs)

                # Note: this does not cover cases like
                #   setvar (a[0])[1] = v
                #   setvar (d.key).other = v
                # This leaks into catching all typos statically, which may
                # be possible if 'use' makes all names explicit.
                elif case(y_lhs_e.Subscript):
                    lhs = cast(Subscript, UP_lhs)
                    if lhs.obj.tag() == expr_e.Var:
                        v = cast(expr.Var, lhs.obj)
                        var_checker.Check(kw_token.id, v.name, v.left)

                elif case(y_lhs_e.Attribute):
                    lhs = cast(Attribute, UP_lhs)
                    if lhs.obj.tag() == expr_e.Var:
                        v = cast(expr.Var, lhs.obj)
                        var_checker.Check(kw_token.id, v.name, v.left)

        # Let the CommandParser see the Op_Semi or Op_Newline.
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)  # always back to this
        return enode

    def ParseBareDecl(self):
        # type: () -> expr_t
        """
        x = {name: val}
        """
        self._SetNext(lex_mode_e.Expr)
        self._GetToken()
        enode, last_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.command_expr)
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)
        return enode

    def ParseYshExprForCommand(self):
        # type: () -> expr_t

        # Fudge for this case
        #  for x in(y) {
        # versus
        #  for x in (y) {
        #
        # In the former case, ReadWord on 'in' puts the lexer past (.
        # Also see LookPastSpace in CommandParser.
        # A simpler solution would be nicer.

        if self.token_type == Id.Op_LParen:
            self.lexer.MaybeUnreadOne()

        enode, _ = self.parse_ctx.ParseYshExpr(self.lexer,
                                               grammar_nt.ysh_expr)

        self._SetNext(lex_mode_e.ShCommand)
        return enode

    def ParseCommandExpr(self):
        # type: () -> expr_t
        """
        = 1+2
        """
        enode, last_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.command_expr)

        # In some cases, such as the case statement, we expect *the lexer*
        # to be pointing at the token right after the expression.  But the
        # expression parser must have read to the `last_token`.  Unreading
        # places the lexer back in the expected state.  I.e.:
        #
        # case (x) {                       case (x) {
        #   (else) { = x }                   (else) { = x }
        #          ^ The lexer is here              ^ Unread to here
        # }                                }
        assert last_token.id in (Id.Op_Newline, Id.Eof_Real, Id.Op_Semi,
                                 Id.Op_RBrace), last_token
        if last_token.id != Id.Eof_Real:
            # Eof_Real is the only token we cannot unread
            self.lexer.MaybeUnreadOne()

        return enode

    def ParseProc(self, node):
        # type: (Proc) -> None

        # proc name-with-hyphens() must be accepted
        self._SetNext(lex_mode_e.ShCommand)
        self._GetToken()
        # example: 'proc f[' gets you Lit_ArrayLhsOpen
        if self.token_type != Id.Lit_Chars:
            p_die('Invalid proc name %s' % ui.PrettyToken(self.cur_token),
                  self.cur_token)

        # TODO: validate this more.  Disallow proc 123 { }, which isn't
        # disallowed for shell functions.  Similar to IsValidVarName().
        node.name = self.cur_token

        last_token = self.parse_ctx.ParseProc(self.lexer, node)

        # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
        assert last_token.id == Id.Op_LBrace
        last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        self._SetNext(lex_mode_e.ShCommand)

    def ParseFunc(self, node):
        # type: (Func) -> None
        last_token = self.parse_ctx.ParseFunc(self.lexer, node)

        # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
        assert last_token.id == Id.Op_LBrace
        last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        self._SetNext(lex_mode_e.ShCommand)

    def ParseYshCasePattern(self):
        # type: () -> Tuple[pat_t, Token]
        pat, left_tok, last_token = self.parse_ctx.ParseYshCasePattern(
            self.lexer)

        if last_token.id == Id.Op_LBrace:
            last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        return pat, left_tok

    def NewlineOkForYshCase(self):
        # type: () -> Id_t
        """Check for optional newline and consume it.

        This is a special case of `_NewlineOk` which fixes some "off-by-one"
        issues that crop up while parsing YSH case arms.  For more details,
        see #oil-dev > Progress On YSH Case Grammar on zulip.

        Returns a token id which is filled with the choice of

          word   { echo word }
          (3)    { echo expr }
          /e/    { echo eggex }
          }      # right brace
        """
        while True:
            next_id = self.lexer.LookAheadOne(lex_mode_e.Expr)

            # Cannot lookahead past lines
            if next_id == Id.Unknown_Tok:
                if not self.lexer.MoveToNextLine():  # Try to move to next line
                    break  # EOF
                continue

            next_kind = consts.GetKind(next_id)
            if next_id != Id.Op_Newline and next_kind != Kind.Ignored:
                break

            self.lexer.Read(lex_mode_e.Expr)

        if next_id in (Id.Op_RBrace, Id.Op_LParen, Id.Arith_Slash):
            self._SetNext(lex_mode_e.Expr)  # Continue in expression mode
        else:
            # Consume the trailing Op_Newline
            self._SetNext(lex_mode_e.ShCommand)
            self._GetToken()

        return next_id

    def _ReadArithExpr(self, end_id):
        # type: (Id_t) -> arith_expr_t
        """Read and parse an arithmetic expression in various contexts.

        $(( 1+2 ))
        (( a=1+2 ))
        ${a[ 1+2 ]}
        ${a : 1+2 : 1+2}

        See tests/arith-context.test.sh for ambiguous cases.

        ${a[a[0]]} is valid  # VS_RBRACKET vs Id.Arith_RBracket

        ${s : a<b?0:1 : 1}  # VS_COLON vs Id.Arith_Colon

        See the assertion in ArithParser.Parse() -- unexpected extra input.
        """
        # calls self.ReadWord(lex_mode_e.Arith)
        anode = self.a_parser.Parse()
        cur_id = self.a_parser.CurrentId()
        if end_id != Id.Undefined_Tok and cur_id != end_id:
            p_die(
                'Unexpected token after arithmetic expression (%s != %s)' %
                (ui.PrettyId(cur_id), ui.PrettyId(end_id)),
                loc.Word(self.a_parser.cur_word))
        return anode

    def _ReadArithSub(self):
        # type: () -> word_part.ArithSub
        """Read an arith substitution, which contains an arith expression,
        e.g. $((a + 1)).
        """
        left_tok = self.cur_token

        # The second ) needs to be disambiguated in stuff like:
        #   $(echo $(( 1+2 )) )
        self.lexer.PushHint(Id.Op_RParen, Id.Right_DollarDParen)

        # NOTE: To disambiguate $(( as arith sub vs. command sub and
        # subshell, we could save the lexer/reader state here, and retry if
        # the arithmetic parse fails.  But we can almost always catch this
        # at parse time.  There could be some exceptions like:
        #   $((echo * foo))  # looks like multiplication
        #   $((echo / foo))  # looks like division

        # $(( )) is valid
        anode = arith_expr.EmptyZero  # type: arith_expr_t

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            anode = self._ReadArithExpr(Id.Arith_RParen)

        self._SetNext(lex_mode_e.ShCommand)

        # Ensure we get closing )
        self._GetToken()
        if self.token_type != Id.Right_DollarDParen:
            p_die('Expected second ) to end arith sub', self.cur_token)

        right_tok = self.cur_token
        return word_part.ArithSub(left_tok, anode, right_tok)

    def ReadDParen(self):
        # type: () -> Tuple[arith_expr_t, Token]
        """Read ((1+ 2)) -- command context.

        We're using the word parser because it's very similar to
        _ReadArithExpr above.

        This also returns the terminating Id.Op_DRightParen token for
        location info.
        """
        # (( )) is valid
        anode = arith_expr.EmptyZero  # type: arith_expr_t

        self.lexer.PushHint(Id.Op_RParen, Id.Op_DRightParen)

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            anode = self._ReadArithExpr(Id.Arith_RParen)

        self._SetNext(lex_mode_e.ShCommand)

        # Ensure we get the second )
        self._GetToken()
        right = self.cur_token
        if right.id != Id.Op_DRightParen:
            p_die('Expected second ) to end arith statement', right)

        self._SetNext(lex_mode_e.ShCommand)

        return anode, right

    def _NextNonSpace(self):
        # type: () -> None
        """Advance in lex_mode_e.Arith until non-space token.

        Same logic as _ReadWord, but used in
          $(( ))
          (( ))
          for (( ))

        You can read self.token_type after this, without calling _GetToken.
        """
        while True:
            self._SetNext(lex_mode_e.Arith)
            self._GetToken()
            if self.token_kind not in (Kind.Ignored, Kind.WS):
                break

    def ReadForExpression(self):
        # type: () -> command.ForExpr
        """Read ((i=0; i<5; ++i)) -- part of command context."""
        self._NextNonSpace()  # skip over ((
        cur_id = self.token_type  # for end of arith expressions

        if cur_id == Id.Arith_Semi:  # for (( ; i < 10; i++ ))
            init_node = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            init_node = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()
        self._NextNonSpace()

        # It's odd to keep track of both cur_id and self.token_type in this
        # function, but it works, and is tested in 'test/parse_error.sh
        # arith-integration'
        if cur_id != Id.Arith_Semi:  # for (( x=0 b; ... ))
            p_die("Expected ; here", loc.Word(self.a_parser.cur_word))

        self._GetToken()
        cur_id = self.token_type

        if cur_id == Id.Arith_Semi:  # for (( ; ; i++ ))
            # empty condition is TRUE
            cond_node = arith_expr.EmptyOne  # type: arith_expr_t
        else:
            cond_node = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()

        if cur_id != Id.Arith_Semi:  # for (( x=0; x<5 b ))
            p_die("Expected ; here", loc.Word(self.a_parser.cur_word))

        self._NextNonSpace()
        if self.token_type == Id.Arith_RParen:  # for (( ; ; ))
            update_node = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            update_node = self._ReadArithExpr(Id.Arith_RParen)

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            p_die('Expected ) to end for loop expression', self.cur_token)
        self._SetNext(lex_mode_e.ShCommand)

        # redirects is None, will be assigned in CommandEvaluator
        node = command.ForExpr.CreateNull()
        node.init = init_node
        node.cond = cond_node
        node.update = update_node
        return node
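
    # Examples of the empty sections handled above:
    #   for (( ; i < 3; ++i ))  init defaults to EmptyZero
    #   for (( ; ; ))           empty condition defaults to EmptyOne (true)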

    def _ReadArrayLiteral(self):
        # type: () -> word_part_t
        """a=(1 2 3)

        TODO: See osh/cmd_parse.py:164 for Id.Lit_ArrayLhsOpen, for a[x++]=1

        We want:

        A=(['x']=1 ["x"]=2 [$x$y]=3)

        Maybe allow this as a literal string?  Because I think I've seen it
        before?  Or maybe force people to patch to learn the rule.

        A=([x]=4)

        Starts with Lit_Other '[', and then it has Lit_ArrayLhsClose
        Maybe enforce that ALL have keys or NONE of them have keys.
        """
        self._SetNext(lex_mode_e.ShCommand)  # advance past (
        self._GetToken()
        if self.cur_token.id != Id.Op_LParen:
            p_die('Expected ( after =', self.cur_token)
        left_token = self.cur_token
        right_token = None  # type: Token

        # MUST use a new word parser (with same lexer).
        w_parser = self.parse_ctx.MakeWordParser(self.lexer,
                                                 self.line_reader)
        words = []  # type: List[CompoundWord]
        done = False
        while not done:
            w = w_parser.ReadWord(lex_mode_e.ShCommand)
            with tagswitch(w) as case:
                if case(word_e.Operator):
                    tok = cast(Token, w)
                    if tok.id == Id.Right_ShArrayLiteral:
                        right_token = tok
                        done = True  # can't use break here
                    # Unlike command parsing, array parsing allows embedded
                    # \n.
                    elif tok.id == Id.Op_Newline:
                        continue
                    else:
                        p_die('Unexpected token in array literal',
                              loc.Word(w))

                elif case(word_e.Compound):
                    words.append(cast(CompoundWord, w))

                else:
                    raise AssertionError()

        if len(words) == 0:  # a=() is empty indexed array
            # Needed for type safety, doh
            no_words = []  # type: List[word_t]
            node = ShArrayLiteral(left_token, no_words, right_token)
            return node

        pairs = []  # type: List[AssocPair]
        # If the first one is a key/value pair, then the rest are assumed
        # to be.
        pair = word_.DetectAssocPair(words[0])
        if pair:
            pairs.append(pair)

            n = len(words)
            for i in xrange(1, n):
                w2 = words[i]
                pair = word_.DetectAssocPair(w2)
                if not pair:
                    p_die("Expected associative array pair", loc.Word(w2))

                pairs.append(pair)

            # invariant List?
            return word_part.BashAssocLiteral(left_token, pairs,
                                              right_token)

        # Brace detection for arrays but NOT associative arrays
        words2 = braces.BraceDetectAll(words)
        words3 = word_.TildeDetectAll(words2)
        return ShArrayLiteral(left_token, words3, right_token)
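
    # Examples: a=(1 2 3) yields an indexed ShArrayLiteral, while
    # A=([k]=1 [k2]=2) becomes a BashAssocLiteral because the first word
    # looks like a key/value pair.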
1687
1688 def ParseProcCallArgs(self, start_symbol):
1689 # type: (int) -> ArgList
1690 """ json write (x) """
1691 self.lexer.MaybeUnreadOne()
1692
1693 arg_list = ArgList.CreateNull(alloc_lists=True)
1694 arg_list.left = self.cur_token
1695 self.parse_ctx.ParseProcCallArgs(self.lexer, arg_list, start_symbol)
1696 return arg_list
1697
1698 def _MaybeReadWordPart(self, is_first, lex_mode, parts):
1699 # type: (bool, lex_mode_t, List[word_part_t]) -> bool
1700 """Helper for _ReadCompoundWord3."""
1701 done = False
1702
1703 if self.token_type == Id.Lit_EscapedChar:
1704 tok = self.cur_token
1705 assert tok.length == 2
1706 ch = lexer.TokenSliceLeft(tok, 1)
1707 if not self.parse_opts.parse_backslash():
1708 if not pyutil.IsValidCharEscape(ch):
1709 p_die('Invalid char escape in unquoted word (OILS-ERR-13)',
1710 self.cur_token)
1711
1712 part = word_part.EscapedLiteral(self.cur_token,
1713 ch) # type: word_part_t
1714 else:
1715 part = self.cur_token
1716
1717 if is_first and self.token_type == Id.Lit_VarLike: # foo=
1718 parts.append(part)
1719 # Unfortunately it's awkward to pull the check for a=(1 2) up to
1720 # _ReadWord.
1721 next_id = self.lexer.LookPastSpace(lex_mode)
1722 if next_id == Id.Op_LParen:
1723 self.lexer.PushHint(Id.Op_RParen, Id.Right_ShArrayLiteral)
1724 part2 = self._ReadArrayLiteral()
1725 parts.append(part2)
1726
1727 # Array literal must be the last part of the word.
1728 self._SetNext(lex_mode)
1729 self._GetToken()
1730 # EOF, whitespace, newline, Right_Subshell
1731 if self.token_kind not in KINDS_THAT_END_WORDS:
1732 p_die('Unexpected token after array literal',
1733 self.cur_token)
1734 done = True
1735
1736 elif (is_first and self.parse_opts.parse_at() and
1737 self.token_type == Id.Lit_Splice):
1738
1739 splice_tok = self.cur_token
1740 part2 = word_part.Splice(splice_tok,
1741 lexer.TokenSliceLeft(splice_tok, 1))
1742
1743 parts.append(part2)
1744
1745 # @words must be the last part of the word
1746 self._SetNext(lex_mode)
1747 self._GetToken()
1748 # EOF, whitespace, newline, Right_Subshell
1749 if self.token_kind not in KINDS_THAT_END_WORDS:
1750 p_die('Unexpected token after array splice', self.cur_token)
1751 done = True
1752
1753 elif (is_first and self.parse_opts.parse_at() and
1754 self.token_type == Id.Lit_AtLBracket): # @[split(x)]
1755 part2 = self._ReadExprSub(lex_mode_e.DQ)
1756 parts.append(part2)
1757
1758 # @[split(x)]
1759 self._SetNext(lex_mode)
1760 self._GetToken()
1761 # EOF, whitespace, newline, Right_Subshell
1762 if self.token_kind not in KINDS_THAT_END_WORDS:
1763 p_die('Unexpected token after Expr splice', self.cur_token)
1764 done = True
1765
1766 elif (is_first and self.parse_opts.parse_at() and
1767 self.token_type == Id.Lit_AtLBraceDot):
1768 p_die('TODO: @{.myproc builtin sub}', self.cur_token)
1769
1770 elif (is_first and self.parse_opts.parse_at_all() and
1771 self.token_type == Id.Lit_At):
1772 # Because $[x] ${x} and perhaps $/x/ are reserved, it makes sense for @
1773 # at the beginning of a word to be reserved.
1774
1775 # Although should we relax 'echo @'?  I'm tempted to have a shortcut for
1776 # @_argv.
1777 p_die('Literal @ starting a word must be quoted (parse_at_all)',
1778 self.cur_token)
1779
1780 else:
1781 # not a literal with lookahead; append it
1782 parts.append(part)
1783
1784 return done
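# Examples of the "must end the word" rule enforced above (illustrative,
# with parse_at on for the splice cases):
#
#   a=(1 2)x      -> 'Unexpected token after array literal'
#   echo @words   -> OK; the space after @words ends the word
#   echo @words/x -> 'Unexpected token after array splice'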
1785
1786 def _ReadCompoundWord(self, lex_mode):
1787 # type: (lex_mode_t) -> CompoundWord
1788 return self._ReadCompoundWord3(lex_mode, Id.Undefined_Tok, True)
1789
1790 def _ReadCompoundWord3(self, lex_mode, eof_type, empty_ok):
1791 # type: (lex_mode_t, Id_t, bool) -> CompoundWord
1792 """
1793 Precondition: Looking at the first token of the first word part
1794 Postcondition: Looking at the token after, e.g. space or operator
1795
1796 NOTE: eof_type is necessary because / is a literal, i.e. Lit_Slash, but it
1797 could be an operator delimiting a compound word. Can we change lexer modes
1798 and remove this special case?
1799 """
1800 w = CompoundWord([])
1801 num_parts = 0
1802 brace_count = 0
1803 done = False
1804 is_triple_quoted = None # type: Optional[BoolParamBox]
1805
1806 while not done:
1807 self._GetToken()
1808
1809 allow_done = empty_ok or num_parts != 0
1810 if allow_done and self.token_type == eof_type:
1811 done = True # e.g. for ${foo//pat/replace}
1812
1813 # Keywords like "for" are treated like literals
1814 elif self.token_kind in (Kind.Lit, Kind.History, Kind.KW,
1815 Kind.ControlFlow, Kind.BoolUnary,
1816 Kind.BoolBinary):
1817
1818 # Count { and } here; unbalanced braces are a syntax error, checked below
1819 if self.token_type == Id.Lit_LBrace:
1820 brace_count += 1
1821 elif self.token_type == Id.Lit_RBrace:
1822 brace_count -= 1
1823 elif self.token_type == Id.Lit_Dollar:
1824 if not self.parse_opts.parse_dollar():
1825 if num_parts == 0 and lex_mode == lex_mode_e.ShCommand:
1826 next_byte = self.lexer.ByteLookAhead()
1827 # TODO: switch lexer modes and parse $/d+/. But not ${a:-$/d+/}
1828 if next_byte == '/':
1829 #log('next_byte %r', next_byte)
1830 pass
1831
1832 p_die('Literal $ should be quoted like \$',
1833 self.cur_token)
1834
1835 done = self._MaybeReadWordPart(num_parts == 0, lex_mode,
1836 w.parts)
1837
1838 elif self.token_kind == Kind.VSub:
1839 vsub_token = self.cur_token
1840
1841 part = SimpleVarSub(vsub_token) # type: word_part_t
1842 w.parts.append(part)
1843
1844 elif self.token_kind == Kind.ExtGlob:
1845 # If parse_at, we can take over @( to start @(seq 3)
1846 # Users can also use ,(*.py|*.sh)
1847 if (self.parse_opts.parse_at() and
1848 self.token_type == Id.ExtGlob_At and num_parts == 0):
1849 cs_part = self._ReadCommandSub(Id.Left_AtParen,
1850 d_quoted=False)
1851 # RARE mutation of tok.id!
1852 cs_part.left_token.id = Id.Left_AtParen
1853 part = cs_part # for type safety
1854
1855 # Same check as _MaybeReadWordPart. @(seq 3)x is illegal, just like
1856 # a=(one two)x and @arrayfunc(3)x.
1857 self._GetToken()
1858 if self.token_kind not in KINDS_THAT_END_WORDS:
1859 p_die('Unexpected token after @()', self.cur_token)
1860 done = True
1861
1862 else:
1863 if HAVE_FNM_EXTMATCH == 0:
1864 p_die(
1865 "Extended glob won't work without FNM_EXTMATCH support in libc",
1866 self.cur_token)
1867 part = self._ReadExtGlob()
1868 w.parts.append(part)
1869
1870 elif self.token_kind == Kind.BashRegex:
1871 if self.token_type == Id.BashRegex_LParen: # Opening (
1872 part = self._ReadBashRegexGroup()
1873 w.parts.append(part)
1874 else:
1875 assert self.token_type == Id.BashRegex_AllowedInParens
1876 p_die('Invalid token in bash regex', self.cur_token)
1877
1878 elif self.token_kind == Kind.Left:
1879 try_triple_quote = (self.parse_opts.parse_triple_quote() and
1880 lex_mode == lex_mode_e.ShCommand and
1881 num_parts == 0)
1882
1883 # Save allocation
1884 if try_triple_quote:
1885 is_triple_quoted = BoolParamBox(False)
1886
1887 part = self._ReadUnquotedLeftParts(is_triple_quoted)
1888 w.parts.append(part)
1889
1890 # NOT done yet, will advance below
1891 elif self.token_kind == Kind.Right:
1892 # Still part of the word; will be done on the next iter.
1893 if self.token_type == Id.Right_DoubleQuote:
1894 pass
1895 # Never happens, no PushHint for this case.
1896 #elif self.token_type == Id.Right_DollarParen:
1897 # pass
1898 elif self.token_type == Id.Right_Subshell:
1899 # LEXER HACK for (case x in x) ;; esac )
1900 # Rewind before it's used
1901 assert self.next_lex_mode == lex_mode_e.Undefined
1902 if self.lexer.MaybeUnreadOne():
1903 self.lexer.PushHint(Id.Op_RParen, Id.Right_Subshell)
1904 self._SetNext(lex_mode)
1905 done = True
1906 else:
1907 done = True
1908
1909 elif self.token_kind == Kind.Ignored:
1910 done = True
1911
1912 else:
1913 # LEXER HACK for unbalanced case clause. 'case foo in esac' is valid,
1914 # so to test for ESAC, we may read ) before getting a chance to
1915 # PushHint(Id.Op_RParen, Id.Right_CasePat). So here we unread one
1916 # token and read it again.
1917
1918 # We get Id.Op_RParen at top level: case x in x) ;; esac
1919 # We get Id.Eof_RParen inside ComSub: $(case x in x) ;; esac )
1920 if self.token_type in (Id.Op_RParen, Id.Eof_RParen):
1921 # Rewind before it's used
1922 assert self.next_lex_mode == lex_mode_e.Undefined
1923 if self.lexer.MaybeUnreadOne():
1924 if self.token_type == Id.Eof_RParen:
1925 # Redo translation
1926 self.lexer.PushHint(Id.Op_RParen, Id.Eof_RParen)
1927 self._SetNext(lex_mode)
1928
1929 done = True # anything we don't recognize means we're done
1930
1931 if not done:
1932 self._SetNext(lex_mode)
1933 num_parts += 1
1934
1935 if (self.parse_opts.parse_brace() and num_parts > 1 and
1936 brace_count != 0):
1937 # accept { and }, but not foo{
1938 p_die(
1939 'Word has unbalanced { }. Maybe add a space or quote it like \{',
1940 loc.Word(w))
1941
1942 if is_triple_quoted and is_triple_quoted.b and num_parts > 1:
1943 p_die('Unexpected parts after triple quoted string',
1944 loc.WordPart(w.parts[-1]))
1945
1946 if 0:
1947 from _devbuild.gen.syntax_asdl import word_part_str
1948 word_key = ' '.join(word_part_str(p.tag()) for p in w.parts)
1949 WORD_HIST[word_key] += 1
1950 return w
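# Walkthrough (illustrative): parsing foo${x}bar in lex_mode_e.ShCommand
# yields one CompoundWord with three parts:
#
#   [Lit_Chars 'foo', BracedVarSub 'x', Lit_Chars 'bar']
#
# while foo{ leaves brace_count at 1 across two parts, so with parse_brace
# it dies with 'Word has unbalanced { }'; {a,b} balances back to 0 and passes.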
1951
1952 def _ReadArithWord(self):
1953 # type: () -> Optional[word_t]
1954 """ Helper for ReadArithWord() """
1955 self._GetToken()
1956
1957 if self.token_kind == Kind.Unknown:
1958 # e.g. happened during dynamic parsing of unset 'a[$foo]' in gherkin
1959 p_die(
1960 'Unexpected token while parsing arithmetic: %r' %
1961 lexer.TokenVal(self.cur_token), self.cur_token)
1962
1963 elif self.token_kind == Kind.Eof:
1964 return self.cur_token
1965
1966 elif self.token_kind == Kind.Ignored:
1967 # Space should be ignored.
1968 self._SetNext(lex_mode_e.Arith)
1969 return None
1970
1971 elif self.token_kind in (Kind.Arith, Kind.Right):
1972 # Id.Right_DollarDParen IS just a normal token, handled by ArithParser
1973 self._SetNext(lex_mode_e.Arith)
1974 return self.cur_token
1975
1976 elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub):
1977 return self._ReadCompoundWord(lex_mode_e.Arith)
1978
1979 else:
1980 raise AssertionError(self.cur_token)
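# Illustrative: while parsing $(( x + 1 )), 'x' is Kind.Lit and becomes a
# CompoundWord, '+' is Kind.Arith and is returned as a bare token, and the
# spaces are Kind.Ignored, so this returns None and ReadArithWord() retries.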
1981
1982 def _ReadWord(self, word_mode):
1983 # type: (lex_mode_t) -> Optional[word_t]
1984 """Helper function for ReadWord()."""
1985
1986 # Change the pseudo lexer mode to a real lexer mode
1987 if word_mode == lex_mode_e.ShCommandFakeBrack:
1988 lex_mode = lex_mode_e.ShCommand
1989 else:
1990 lex_mode = word_mode
1991
1992 self._GetToken()
1993
1994 if self.token_kind == Kind.Eof:
1995 # No advance
1996 return self.cur_token
1997
1998 # Allow Arith for ) at end of for loop?
1999 elif self.token_kind in (Kind.Op, Kind.Redir, Kind.Arith):
2000 self._SetNext(lex_mode)
2001
2002 # Newlines are complicated. See 3x2 matrix in the comment about
2003 # self.multiline and self.newline_state above.
2004 if self.token_type == Id.Op_Newline:
2005 if self.multiline:
2006 if self.newline_state > 1:
2007 # This points at a blank line, but at least it gives the line number
2008 p_die('Invalid blank line in multiline mode',
2009 self.cur_token)
2010 return None
2011
2012 if self.returned_newline: # skip
2013 return None
2014
2015 return self.cur_token
2016
2017 elif self.token_kind == Kind.Right:
2018 if self.token_type not in (Id.Right_Subshell, Id.Right_ShFunction,
2019 Id.Right_CasePat,
2020 Id.Right_ShArrayLiteral):
2021 raise AssertionError(self.cur_token)
2022
2023 self._SetNext(lex_mode)
2024 return self.cur_token
2025
2026 elif self.token_kind in (Kind.Ignored, Kind.WS):
2027 self._SetNext(lex_mode)
2028 return None
2029
2030 else:
2031 assert self.token_kind in (Kind.VSub, Kind.Lit, Kind.History,
2032 Kind.Left, Kind.KW, Kind.ControlFlow,
2033 Kind.BoolUnary, Kind.BoolBinary,
2034 Kind.ExtGlob,
2035 Kind.BashRegex), 'Unhandled token kind'
2036
2037 if (word_mode == lex_mode_e.ShCommandFakeBrack and
2038 self.parse_opts.parse_bracket() and
2039 self.token_type == Id.Lit_LBracket):
2040 # Change [ from Kind.Lit -> Kind.Op
2041 # So CommandParser can treat
2042 # assert [42 === x]
2043 # like
2044 # json write (x)
2045 bracket_word = self.cur_token
2046 bracket_word.id = Id.Op_LBracket
2047
2048 self._SetNext(lex_mode)
2049 return bracket_word
2050
2051 # We're beginning a word. If we see Id.Lit_Pound, change to
2052 # lex_mode_e.Comment and read until end of line.
2053 if self.token_type == Id.Lit_Pound:
2054 self._SetNext(lex_mode_e.Comment)
2055 self._GetToken()
2056
2057 # NOTE: The # could be the last character in the file. It can't be
2058 # Eof_{RParen,Backtick} because #) and #` are comments.
2059 assert self.token_type in (Id.Ignored_Comment, Id.Eof_Real), \
2060 self.cur_token
2061
2062 # The next iteration will go into Kind.Ignored and set lex state to
2063 # lex_mode_e.ShCommand/etc.
2064 return None # tell ReadWord() to try again after comment
2065
2066 elif self.token_type == Id.Lit_TPound: ### doc comment
2067 self._SetNext(lex_mode_e.Comment)
2068 self._GetToken()
2069
2070 if self.token_type == Id.Ignored_Comment and self.emit_doc_token:
2071 return self.cur_token
2072
2073 return None # tell ReadWord() to try again after comment
2074
2075 else:
2076 # r'' u'' b''
2077 if (self.token_type == Id.Lit_Chars and
2078 self.lexer.LookAheadOne(
2079 lex_mode_e.ShCommand) == Id.Left_SingleQuote):
2080
2081 # When shopt -s parse_ysh_string:
2082 # echo r'hi' is like echo 'hi'
2083 #
2084 # echo u'\u{3bc}' b'\yff' works
2085
2086 tok = self.cur_token
2087 if self.parse_opts.parse_ysh_string():
2088 if lexer.TokenEquals(tok, 'r'):
2089 left_id = Id.Left_RSingleQuote
2090 elif lexer.TokenEquals(tok, 'u'):
2091 left_id = Id.Left_USingleQuote
2092 elif lexer.TokenEquals(tok, 'b'):
2093 left_id = Id.Left_BSingleQuote
2094 else:
2095 left_id = Id.Undefined_Tok
2096
2097 if left_id != Id.Undefined_Tok:
2098 # skip the r, and then 'foo' will be read as normal
2099 self._SetNext(lex_mode_e.ShCommand)
2100
2101 self._GetToken()
2102 assert self.token_type == Id.Left_SingleQuote, self.token_type
2103
2104 # Read the word in a different lexer mode
2105 return self._ReadYshSingleQuoted(left_id)
2106
2107 return self._ReadCompoundWord(lex_mode)
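# Illustrative: for echo r'hi', the lexer yields Lit_Chars 'r' with
# Left_SingleQuote right behind it, so with parse_ysh_string the 'r' is
# skipped and _ReadYshSingleQuoted(Id.Left_RSingleQuote) reads a raw string.
# A bare 'r' followed by anything else falls through to _ReadCompoundWord.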
2108
2109 def ParseVarRef(self):
2110 # type: () -> BracedVarSub
2111 """DYNAMIC parsing of what's inside ${!ref}
2112
2113 # Same as VarOf production
2114 VarRefExpr = VarOf EOF
2115 """
2116 self._SetNext(lex_mode_e.VSub_1)
2117
2118 self._GetToken()
2119 if self.token_kind != Kind.VSub:
2120 p_die('Expected var name', self.cur_token)
2121
2122 part = self._ParseVarOf()
2123 # NOTE: no ${ } means no part.left and part.right
2124 part.left = part.name_tok # cheat to make test pass
2125 part.right = part.name_tok
2126
2127 self._GetToken()
2128 if self.token_type != Id.Eof_Real:
2129 p_die('Expected end of var ref expression', self.cur_token)
2130 return part
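# Illustrative: for ref='myarray[0]'; echo ${!ref}, the string 'myarray[0]'
# is re-parsed here at runtime, producing a BracedVarSub with a bracket_op,
# as if the user had written ${myarray[0]}.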
2131
2132 def LookPastSpace(self):
2133 # type: () -> Id_t
2134 """Look ahead to the next token.
2135
2136 For the CommandParser to recognize
2137 array= (1 2 3)
2138 YSH for ( versus bash for ((
2139 YSH if ( versus if test
2140 YSH while ( versus while test
2141 YSH bare assignment 'grep =' versus 'grep foo'
2142 """
2143 assert self.token_type != Id.Undefined_Tok
2144 if self.cur_token.id == Id.WS_Space:
2145 id_ = self.lexer.LookPastSpace(lex_mode_e.ShCommand)
2146 else:
2147 id_ = self.cur_token.id
2148 return id_
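# Illustrative: in 'array= (1 2 3)', the buffered token after 'array=' is
# WS_Space, so we peek past it with lexer.LookPastSpace(); the Op_LParen
# that comes back tells the CommandParser this looks like a misplaced
# array literal rather than an ordinary word.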
2149
2150 def LookAheadFuncParens(self):
2151 # type: () -> bool
2152 """Special lookahead for f( ) { echo hi; } to check for ( )"""
2153 assert self.token_type != Id.Undefined_Tok
2154
2155 # We have to handle 2 cases because we buffer a token
2156 if self.cur_token.id == Id.Op_LParen: # saw funcname(
2157 return self.lexer.LookAheadFuncParens(1) # go back one char
2158
2159 elif self.cur_token.id == Id.WS_Space: # saw funcname WHITESPACE
2160 return self.lexer.LookAheadFuncParens(0)
2161
2162 else:
2163 return False
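# Illustrative: in 'f() { echo hi; }' the buffered token is Op_LParen, so we
# look back one char; in 'f () { echo hi; }' it's WS_Space, so we look from
# the current position. Both cases should find the '( )' pair.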
2164
2165 def ReadWord(self, word_mode):
2166 # type: (lex_mode_t) -> word_t
2167 """Read the next word, using the given lexer mode.
2168
2169 This is a stateful wrapper for the stateless _ReadWord function.
2170 """
2171 assert word_mode in (lex_mode_e.ShCommand,
2172 lex_mode_e.ShCommandFakeBrack,
2173 lex_mode_e.DBracket, lex_mode_e.BashRegex)
2174
2175 if self.buffered_word: # For integration with pgen2
2176 w = self.buffered_word
2177 self.buffered_word = None
2178 else:
2179 while True:
2180 w = self._ReadWord(word_mode)
2181 if w is not None:
2182 break
2183
2184 self.returned_newline = (word_.CommandId(w) == Id.Op_Newline)
2185 return w
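# A minimal usage sketch, not part of the module: it assumes a WordParser
# instance 'w_parser' built elsewhere (e.g. by ParseContext), and shows the
# read-until-EOF loop a caller like the CommandParser performs.
if 0:
    while True:
        w = w_parser.ReadWord(lex_mode_e.ShCommand)
        if word_.CommandId(w) == Id.Eof_Real:
            break
        # ... each word is fed to the caller's state machine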
2186
2187 def ReadArithWord(self):
2188 # type: () -> word_t
2189 while True:
2190 w = self._ReadArithWord()
2191 if w is not None:
2192 break
2193 return w
2194
2195 def ReadHereDocBody(self, parts):
2196 # type: (List[word_part_t]) -> None
2197 """
2198 A here doc is like a double quoted context, except " isn't special.
2199 """
2200 self._ReadLikeDQ(None, False, parts)
2201 # Returns nothing
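# Illustrative: in a here doc like
#   cat <<EOF
#   $x is expanded, "but quotes are literal"
#   EOF
# $x becomes a SimpleVarSub part, while the double quotes stay plain text.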
2202
2203 def ReadForPlugin(self):
2204 # type: () -> CompoundWord
2205 """For $PS1, $PS4, etc.
2206
2207 This is just like reading a here doc line. "\n" is allowed, as
2208 well as the typical substitutions ${x} $(echo hi) $((1 + 2)).
2209 """
2210 w = CompoundWord([])
2211 self._ReadLikeDQ(None, False, w.parts)
2212 return w
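# Illustrative: a $PS1 value like '${USER} $(hostname) ' parses into one
# CompoundWord whose parts include a BracedVarSub and a CommandSub; literal
# newlines are allowed, per _ReadLikeDQ.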
2213
2214 def EmitDocToken(self, b):
2215 # type: (bool) -> None
2216 self.emit_doc_token = b
2217
2218 def Multiline(self, b):
2219 # type: (bool) -> None
2220 self.multiline = b
2221
2222
2223if 0:
2224 import collections
2225 WORD_HIST = collections.Counter()
2226
2227# vim: sw=4