# Copyright 2016 Andy Chu. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
"""
word_parse.py - Parse the shell word language.

Hairy example:

    hi$((1 + 2))"$(echo hi)"${var:-__"$(echo default)"__}

Substitutions can be nested, but which inner subs are allowed depends on the
outer sub.  Notes:

lex_mode_e.ShCommand (_ReadUnquotedLeftParts)
  All subs and quotes are allowed:
    $v ${v}   $() ``   $(())   '' ""   $'' $""   <() >()

lex_mode_e.DQ (_ReadDoubleQuotedLeftParts)
  Var, Command, Arith, but no quotes.
    $v ${v}   $() ``   $(())
  No process substitution.

lex_mode_e.Arith
  Similar to DQ: Var, Command, and Arith sub, but no process sub.  bash
  doesn't allow quotes, but OSH does.  We allow ALL FOUR kinds of quotes,
  because we need those for associative array indexing.

lex_mode_e.VSub_ArgUnquoted
  Like ShCommand, everything is allowed (even process substitutions), but we
  stop at }, and space is SIGNIFICANT.

  Example: ${a:-  b   }

  ${X:-$v}  ${X:-${v}}  ${X:-$(echo hi)}  ${X:-`echo hi`}  ${X:-$((1+2))}
  ${X:-'single'}  ${X:-"double"}  ${X:-$'\n'}  ${X:-<(echo hi)}

lex_mode_e.VSub_ArgDQ
  In contrast to DQ, VS_ARG_DQ accepts nested "" and $'' and $"", e.g.
  "${x:-"default"}".

  In contrast, VSub_ArgUnquoted respects single quotes and process
  substitution.

  It's weird that double quotes are allowed.  Space is also significant here,
  e.g. "${x:-a  "b"}".
"""

from _devbuild.gen import grammar_nt
from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str, Kind
from _devbuild.gen.types_asdl import (lex_mode_t, lex_mode_e)
from _devbuild.gen.syntax_asdl import (
    BoolParamBox,
    Token,
    SimpleVarSub,
    loc,
    source,
    DoubleQuoted,
    SingleQuoted,
    BracedVarSub,
    CommandSub,
    InitializerWord,
    InitializerWord_t,
    bracket_op,
    bracket_op_t,
    suffix_op,
    suffix_op_t,
    rhs_word,
    rhs_word_e,
    rhs_word_t,
    word_e,
    word_t,
    CompoundWord,
    word_part,
    word_part_t,
    y_lhs_e,
    arith_expr_t,
    command,
    expr,
    expr_e,
    expr_t,
    pat_t,
    ArgList,
    Proc,
    Func,
    Subscript,
    Attribute,
    arith_expr,
)
from core import alloc
from core.error import p_die
from mycpp.mylib import log
from core import pyutil
from display import ui
from frontend import consts
from frontend import lexer
from frontend import reader
from osh import tdop
from osh import arith_parse
from osh import braces
from osh import word_
from osh import word_compile
from mycpp.mylib import tagswitch

from libc import HAVE_FNM_EXTMATCH

from typing import List, Optional, Tuple, cast
from typing import TYPE_CHECKING
if TYPE_CHECKING:
    from frontend.lexer import Lexer
    from frontend.parse_lib import ParseContext
    from frontend.reader import _Reader
    from osh.cmd_parse import VarChecker

unused1 = log
unused2 = Id_str

KINDS_THAT_END_WORDS = [Kind.Eof, Kind.WS, Kind.Op, Kind.Right]


class WordEmitter(object):
    """Common interface for [ and [["""

    def __init__(self):
        # type: () -> None
        """Empty constructor for mycpp."""
        pass

    def ReadWord(self, lex_mode):
        # type: (lex_mode_t) -> word_t
        raise NotImplementedError()


class WordParser(WordEmitter):

    def __init__(self, parse_ctx, lexer, line_reader):
        # type: (ParseContext, Lexer, _Reader) -> None
        self.parse_ctx = parse_ctx
        self.lexer = lexer
        self.line_reader = line_reader
        self.arena = line_reader.arena

        self.parse_opts = parse_ctx.parse_opts
        self.a_parser = tdop.TdopParser(arith_parse.Spec(), self,
                                        self.parse_opts)
        self.Reset()

    def Init(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Used to parse arithmetic, see ParseContext."""
        self.next_lex_mode = lex_mode

    def Reset(self):
        # type: () -> None
        """Called by interactive loop."""
        # For _GetToken()
        self.cur_token = None  # type: Token
        self.token_kind = Kind.Undefined
        self.token_type = Id.Undefined_Tok

        self.next_lex_mode = lex_mode_e.ShCommand

        # Boolean mutated by CommandParser via word_.ctx_EmitDocToken.  For
        # ### doc comments
        self.emit_doc_token = False
        # Boolean mutated by CommandParser via word_.ctx_Multiline.  '...'
        # starts multiline mode.
        self.multiline = False

        # For detecting invalid \n\n in multiline mode.  Counts what we got
        # directly from the lexer.
        self.newline_state = 0
        # For consolidating \n\n -> \n for the CALLER.  This simplifies the
        # parsers that consume words.
        self.returned_newline = False

        # For integration with pgen2
        self.buffered_word = None  # type: word_t

    def _GetToken(self):
        # type: () -> None
        """Call this when you need to make a decision based on any of:

        self.token_type
        self.token_kind
        self.cur_token
        """
        if self.next_lex_mode == lex_mode_e.Undefined:
            return  # _SetNext() not called, so do nothing

        is_fake = self.next_lex_mode == lex_mode_e.BashRegexFakeInner
        real_mode = (lex_mode_e.BashRegex if is_fake else self.next_lex_mode)

        self.cur_token = self.lexer.Read(real_mode)

        # MUTATE TOKEN for fake lexer mode.
        # This is for crazy stuff bash allows, like [[ s =~ (< >) ]]
        if (is_fake and self.cur_token.id
                in (Id.WS_Space, Id.BashRegex_AllowedInParens)):
            self.cur_token.id = Id.Lit_Chars

        self.token_type = self.cur_token.id
        self.token_kind = consts.GetKind(self.token_type)

        # number of consecutive newlines, ignoring whitespace
        if self.token_type == Id.Op_Newline:
            self.newline_state += 1
        elif self.token_kind != Kind.WS:
            self.newline_state = 0

        self.parse_ctx.trail.AppendToken(self.cur_token)  # For completion
        self.next_lex_mode = lex_mode_e.Undefined

    def _SetNext(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Set the next lex state, but don't actually read a token.

        We need this for proper interactive parsing.
        """
        self.next_lex_mode = lex_mode

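    # NOTE on the two-phase token read: _SetNext() only records the desired
    # lexer mode; _GetToken() performs at most one lexer.Read() and then resets
    # next_lex_mode to Undefined, so calling _GetToken() again is a no-op until
    # the next _SetNext().  A typical call site:
    #
    #   self._SetNext(lex_mode_e.VSub_1)  # choose the mode
    #   self._GetToken()                  # consume exactly one token
    #   if self.token_type == Id.VSub_Pound:
    #       ...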
    def _ReadVarOpArg(self, arg_lex_mode):
        # type: (lex_mode_t) -> rhs_word_t

        # NOTE: Operators like | and < are not treated as special, so
        # ${a:- | >} is valid, even when unquoted.
        self._SetNext(arg_lex_mode)
        self._GetToken()

        w = self._ReadVarOpArg2(arg_lex_mode, Id.Undefined_Tok,
                                True)  # empty_ok

        # If the Compound has no parts, and we're in a double-quoted VarSub
        # arg, and empty_ok, then return Empty.  This is so it can evaluate to
        # the empty string and not get elided.
        #
        # Examples:
        # - "${s:-}", "${s/%pat/}"
        # It's similar to LooksLikeShAssignment where we turn x= into x=''.
        # And it has the same potential problem of not having Token location
        # info.
        #
        # NOTE: empty_ok is False only for the PatSub pattern, which means
        # we'll return a Compound with no parts, which is explicitly checked
        # with a custom error message.
        if len(w.parts) == 0 and arg_lex_mode == lex_mode_e.VSub_ArgDQ:
            return rhs_word.Empty

        return w

    def _ReadVarOpArg2(self, arg_lex_mode, eof_type, empty_ok):
        # type: (lex_mode_t, Id_t, bool) -> CompoundWord
        """Return a CompoundWord.

        Helper function for _ReadVarOpArg and used directly by
        _ReadPatSubVarOp.
        """
        w = self._ReadCompoundWord3(arg_lex_mode, eof_type, empty_ok)
        #log('w %s', w)
        tilde = word_.TildeDetect(w)
        if tilde:
            w = tilde
        return w

    def _ReadSliceVarOp(self):
        # type: () -> suffix_op.Slice
        """
        Looking at the token after the first ':'

        ArithExpr? (':' ArithExpr? )? '}'
        """
        self._NextNonSpace()

        cur_id = self.token_type

        if cur_id in (Id.Arith_RBrace, Id.Arith_Colon):  # ${a:} or ${a::}
            begin = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            begin = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()  # advance

        if cur_id == Id.Arith_RBrace:  # ${a:1} or ${@:1}
            # No length specified, so it's N
            no_length = None  # type: Optional[arith_expr_t]
            return suffix_op.Slice(begin, no_length)

        elif cur_id == Id.Arith_Colon:  # ${a:1:} or ${@:1:}
            colon_tok = self.cur_token
            self._NextNonSpace()

            if self.token_type == Id.Arith_RBrace:
                # quirky bash behavior:
                # ${a:1:} or ${a::} means length ZERO
                # but ${a:1} or ${a:} means length N
                if self.parse_opts.strict_parse_slice():
                    p_die(
                        "Slice length: Add explicit zero, or omit : for N (strict_parse_slice)",
                        colon_tok)

                length = arith_expr.EmptyZero  # type: arith_expr_t
            else:
                length = self._ReadArithExpr(Id.Arith_RBrace)

            return suffix_op.Slice(begin, length)

        else:
            p_die("Expected : or } in slice", self.cur_token)

        raise AssertionError()  # for MyPy

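    # Summary of the slice forms handled above:
    #
    #   ${a:1:2}  -> Slice(begin=1, length=2)
    #   ${a:1}    -> Slice(begin=1, length=None), i.e. to the end
    #   ${a::}    -> begin and length both default to EmptyZero
    #   ${a:1:}   -> length ZERO; a parse error under strict_parse_slice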
    def _ReadPatSubVarOp(self):
        # type: () -> suffix_op.PatSub
        """Looking at the first '/' after VarOf:

        VarSub = ...
               | VarOf '/' Match ( '/' WORD? )?
        Match  = '/' WORD   # can't be empty
               | '#' WORD?  # may be empty
               | '%' WORD?
        """
        slash_tok = self.cur_token  # location info
        replace_mode = Id.Undefined_Tok  # bizarre syntax / # %

        self._SetNext(lex_mode_e.VSub_ArgUnquoted)  # advance past /

        self._GetToken()
        if self.token_type == Id.Right_DollarBrace:
            pat = CompoundWord([])
            return suffix_op.PatSub(pat, rhs_word.Empty, replace_mode,
                                    slash_tok)

        if self.token_type in (Id.Lit_Slash, Id.Lit_Pound, Id.Lit_Percent):
            replace_mode = self.token_type
            self._SetNext(lex_mode_e.VSub_ArgUnquoted)

        # Bash quirk:
        # echo ${x/#/replace} has an empty pattern
        # echo ${x////replace} is non-empty; it means echo ${x//'/'/replace}
        empty_ok = replace_mode != Id.Lit_Slash
        pat = self._ReadVarOpArg2(lex_mode_e.VSub_ArgUnquoted, Id.Lit_Slash,
                                  empty_ok)
        #log('pat 1 %r', pat)

        if self.token_type == Id.Lit_Slash:
            # read until }
            replace = self._ReadVarOpArg(
                lex_mode_e.VSub_ArgUnquoted)  # type: rhs_word_t
            #log('r 1 %r', replace)
        else:
            # e.g. ${v/a} is the same as ${v/a/} -- empty replacement string
            replace = rhs_word.Empty

        self._GetToken()
        if self.token_type != Id.Right_DollarBrace:
            # This happens on invalid code
            p_die(
                "Expected } after replacement string, got %s" %
                ui.PrettyId(self.token_type), self.cur_token)

        return suffix_op.PatSub(pat, replace, replace_mode, slash_tok)

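    # For example, following the branches above:
    #
    #   ${x/pat/repl}   -> replace_mode == Id.Undefined_Tok (first match)
    #   ${x//pat/repl}  -> replace_mode == Id.Lit_Slash (all matches)
    #   ${x/#pat/repl}  -> replace_mode == Id.Lit_Pound (anchored at start)
    #   ${x/%pat/repl}  -> replace_mode == Id.Lit_Percent (anchored at end)
    #   ${x/pat}        -> empty replacement, same as ${x/pat/}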
    def _ReadSubscript(self):
        # type: () -> bracket_op_t
        """ Subscript = '[' ('@' | '*' | ArithExpr) ']' """
        # Lookahead to see if we get @ or *.  Otherwise read a full arithmetic
        # expression.
        next_id = self.lexer.LookPastSpace(lex_mode_e.Arith)
        if next_id in (Id.Lit_At, Id.Arith_Star):
            op = bracket_op.WholeArray(next_id)  # type: bracket_op_t

            self._SetNext(lex_mode_e.Arith)  # skip past [
            self._GetToken()
            self._SetNext(lex_mode_e.Arith)  # skip past @
            self._GetToken()
        else:
            self._SetNext(lex_mode_e.Arith)  # skip past [
            anode = self._ReadArithExpr(Id.Arith_RBracket)
            op = bracket_op.ArrayIndex(anode)

        if self.token_type != Id.Arith_RBracket:  # Should be looking at ]
            p_die('Expected ] to close subscript', self.cur_token)

        self._SetNext(lex_mode_e.VSub_2)  # skip past ]
        self._GetToken()  # Needed to be in the same spot as no subscript

        return op

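    # e.g. ${a[@]} and ${a[*]} become bracket_op.WholeArray, while ${a[i+1]}
    # becomes bracket_op.ArrayIndex with a full arith expression; both paths
    # leave the parser positioned past the closing ].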
    def _ParseVarOf(self):
        # type: () -> BracedVarSub
        """
        VarOf = NAME Subscript?
              | NUMBER     # no subscript allowed, none of these are arrays
                           # ${@[1]} doesn't work, even though slicing does
              | VarSymbol
        """
        self._GetToken()
        name_token = self.cur_token
        self._SetNext(lex_mode_e.VSub_2)

        self._GetToken()  # Check for []
        if self.token_type == Id.VOp2_LBracket:
            bracket_op = self._ReadSubscript()
        else:
            bracket_op = None

        part = BracedVarSub.CreateNull()
        part.name_tok = name_token
        part.var_name = lexer.TokenVal(name_token)
        part.bracket_op = bracket_op
        return part

    def _ParseVarExpr(self, arg_lex_mode, allow_query=False):
        # type: (lex_mode_t, bool) -> BracedVarSub
        """Start parsing at the op -- we already skipped past the name."""
        part = self._ParseVarOf()

        self._GetToken()
        if self.token_type == Id.Right_DollarBrace:
            return part  # no ops

        op_kind = self.token_kind

        if op_kind == Kind.VTest:
            tok = self.cur_token
            arg_word = self._ReadVarOpArg(arg_lex_mode)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            part.suffix_op = suffix_op.Unary(tok, arg_word)

        elif op_kind == Kind.VOpYsh:
            tok = self.cur_token
            arg_word = self._ReadVarOpArg(arg_lex_mode)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            UP_arg_word = arg_word
            with tagswitch(arg_word) as case:
                if case(rhs_word_e.Empty):
                    pass
                elif case(rhs_word_e.Compound):
                    arg_word = cast(CompoundWord, UP_arg_word)
                    # This handles ${x|html} and ${x %.3f} now
                    # However I think ${x %.3f} should be statically parsed?
                    # It can enter the printf lexer modes.
                    ok, arg, quoted = word_.StaticEval(arg_word)
                    if not ok or quoted:
                        p_die('Expected a constant argument',
                              loc.Word(arg_word))

            part.suffix_op = suffix_op.Static(tok, arg)

        elif op_kind == Kind.VOp0:
            part.suffix_op = self.cur_token  # Nullary
            self._SetNext(lex_mode_e.VSub_2)  # Expecting }
            self._GetToken()

        elif op_kind == Kind.VOp1:  # % %% # ## etc.
            tok = self.cur_token
            # Weird exception that all shells have: these operators take a
            # glob pattern, so they're lexed as VSub_ArgUnquoted, not
            # VSub_ArgDQ
            arg_word = self._ReadVarOpArg(lex_mode_e.VSub_ArgUnquoted)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            part.suffix_op = suffix_op.Unary(tok, arg_word)

        elif op_kind == Kind.VOp2:  # / : [ ]
            if self.token_type == Id.VOp2_Slash:
                patsub_op = self._ReadPatSubVarOp()  # type: suffix_op_t
                part.suffix_op = patsub_op

                # Checked by the method above
                assert self.token_type == Id.Right_DollarBrace, self.cur_token

            elif self.token_type == Id.VOp2_Colon:
                part.suffix_op = self._ReadSliceVarOp()
                # NOTE: } in arithmetic mode.
                if self.token_type != Id.Arith_RBrace:
                    # Token seems off; doesn't point to X in ${a:1:2 X
                    p_die('Expected } to close ${', self.cur_token)

            else:
                # TODO: Does this ever happen?
                p_die('Unexpected token in ${} (%s)' % 'VOp2', self.cur_token)

        elif op_kind == Kind.VOp3:  # ${prefix@} etc.
            if allow_query:
                part.suffix_op = self.cur_token  # Nullary
                self._SetNext(lex_mode_e.VSub_2)  # Expecting }
                self._GetToken()
            else:
                p_die("Unexpected token in ${} (%s)" % 'VOp3', self.cur_token)

        # NOTE: Arith_RBrace is for slicing, because it reads } in arithmetic
        # mode.  It's redundantly checked above.
        if self.token_type not in (Id.Right_DollarBrace, Id.Arith_RBrace):
            # ${a.} or ${!a.}
            p_die('Expected } to close ${', self.cur_token)

        # Now look for ops
        return part

    def _ReadZshVarSub(self, left_token):
        # type: (Token) -> word_part.ZshVarSub

        self._SetNext(lex_mode_e.VSub_Zsh)  # Move past ${(foo)

        # Can be empty
        w = self._ReadCompoundWord3(lex_mode_e.VSub_Zsh, Id.Right_DollarBrace,
                                    True)
        self._GetToken()
        return word_part.ZshVarSub(left_token, w, self.cur_token)

    def ReadBracedVarSub(self, left_token):
        # type: (Token) -> Tuple[BracedVarSub, Token]
        """ For YSH expressions like var x = ${x:-"default"}. """
        part = self._ReadBracedVarSub(left_token, d_quoted=False)
        last_token = self.cur_token
        return part, last_token

    def _ReadBracedVarSub(self, left_token, d_quoted):
        # type: (Token, bool) -> BracedVarSub
        """For the ${} expression language.

        NAME       = [a-zA-Z_][a-zA-Z0-9_]*
        NUMBER     = [0-9]+                  # ${10}, ${11}, ...

        Subscript  = '[' ('@' | '*' | ArithExpr) ']'
        VarSymbol  = '!' | '@' | '#' | ...
        VarOf      = NAME Subscript?
                   | NUMBER     # no subscript allowed, none of these are
                                # arrays; ${@[1]} doesn't work, even though
                                # slicing does
                   | VarSymbol

        NULLARY_OP = '@Q' | '@E' | '@P' | '@A' | '@a'  # VOp0

        TEST_OP    = '-' | ':-' | '=' | ':=' | '+' | ':+' | '?' | ':?'
        STRIP_OP   = '#' | '##' | '%' | '%%'
        CASE_OP    = ',' | ',,' | '^' | '^^'
        UnaryOp    = TEST_OP | STRIP_OP | CASE_OP

        YSH_UNARY  = '|' | ' '               # ${x|html} and ${x %.3f}.
                                             # SPACE is operator not %
        Match      = ('/' | '#' | '%') WORD  # match all / prefix / suffix
        VarExpr    = VarOf
                   | VarOf NULLARY_OP
                   | VarOf UnaryOp WORD
                   | VarOf YSH_UNARY STATIC_WORD
                   | VarOf ':' ArithExpr (':' ArithExpr )?
                   | VarOf '/' Match '/' WORD

        LengthExpr = '#' VarOf    # can't apply operators after length

        RefOrKeys  = '!' VarExpr  # CAN apply operators after a named ref
                                  # ${!ref[0]} vs ${!keys[@]} resolved later

        PrefixQuery = '!' NAME ('*' | '@')  # list variable names with a prefix

        BuiltinSub = '.' WORD+  # ${.myproc 'builtin' $sub}

        VarSub = LengthExpr
               | RefOrKeys
               | PrefixQuery
               | VarExpr
               | BuiltinSub

        NOTES:
        - Arithmetic expressions are used twice, inside subscripts ${a[x+1]}
          and slicing ${a:x+1:y+2}
        - ${#} and ${!} need LL(2) lookahead (considering how my tokenizer
          works)
        - @ and * are technically arithmetic expressions in this implementation
        - We don't account for bash 4.4: ${param@operator} -- Q E P A a.  Note
          that it's also vectorized.

        Strictness over bash:
        - echo ${a[0][0]} doesn't do anything useful, so we disallow it from
          the grammar
        - ! and # prefixes can't be composed, even though named refs can be
          composed with other operators
        - '#' means 4 different things: length prefix, VarSymbol, UnaryOp to
          strip a prefix, and it can also be a literal part of WORD.

        From the parser's point of view, the prefix # can't be combined with
        UnaryOp/slicing/matching, and the ! can.  However

        - ${a[@]:1:2} is not allowed
        - ${#a[@]:1:2} is allowed, but gives the wrong answer
        """
        if d_quoted:
            arg_lex_mode = lex_mode_e.VSub_ArgDQ
        else:
            arg_lex_mode = lex_mode_e.VSub_ArgUnquoted

        self._SetNext(lex_mode_e.VSub_1)
        self._GetToken()

        ty = self.token_type
        first_tok = self.cur_token

        if ty == Id.VSub_Pound:
            # Disambiguate
            next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
            if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
                # e.g. a name, '#' is the prefix
                self._SetNext(lex_mode_e.VSub_1)
                part = self._ParseVarOf()

                self._GetToken()
                if self.token_type != Id.Right_DollarBrace:
                    p_die('Expected } after length expression',
                          self.cur_token)

                part.prefix_op = first_tok

            else:  # not a prefix, '#' is the variable
                part = self._ParseVarExpr(arg_lex_mode)

        elif ty == Id.VSub_Bang:
            next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
            if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
                # e.g. a name, '!' is the prefix
                # ${!a} -- this is a ref
                # ${!3} -- this is ref
                # ${!a[1]} -- this is a ref
                # ${!a[@]} -- this is a keys
                # No lookahead -- do it in a second step, or at runtime
                self._SetNext(lex_mode_e.VSub_1)
                part = self._ParseVarExpr(arg_lex_mode, allow_query=True)

                part.prefix_op = first_tok

            else:  # not a prefix, '!' is the variable
                part = self._ParseVarExpr(arg_lex_mode)

        elif ty == Id.VSub_Dot:
            # Note: this will become a new builtin_sub type, so this method
            # must return word_part_t rather than BracedVarSub.  I don't think
            # that should cause problems.
            p_die('TODO: ${.myproc builtin sub}', self.cur_token)

        # VS_NAME, VS_NUMBER, symbol that isn't # or !
        elif self.token_kind == Kind.VSub:
            part = self._ParseVarExpr(arg_lex_mode)

        else:
            # e.g. ${^}
            p_die('Unexpected token in ${}', self.cur_token)

        part.left = left_token  # attach the argument
        part.right = self.cur_token
        return part

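    # Lookahead summary for the two ambiguous prefixes handled above:
    #
    #   ${#}     -> '#' is the variable itself
    #   ${#a}    -> '#' is the length prefix: length of $a
    #   ${!x}    -> '!' is the prefix: a named ref through $x
    #   ${!a[@]} -> keys of a; ref vs. keys is resolved in a later stage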
    def _ReadSingleQuoted(self, left_token, lex_mode):
        # type: (Token, lex_mode_t) -> SingleQuoted
        """Internal method to read a word_part."""
        tokens = []  # type: List[Token]
        # In command mode, we never disallow backslashes like '\'
        right_quote = self.ReadSingleQuoted(lex_mode, left_token, tokens,
                                            False)
        sval = word_compile.EvalSingleQuoted(left_token.id, tokens)
        node = SingleQuoted(left_token, sval, right_quote)
        return node

    def ReadSingleQuoted(self, lex_mode, left_token, out_tokens, is_ysh_expr):
        # type: (lex_mode_t, Token, List[Token], bool) -> Token
        """Appends to out_tokens; returns last token

        Used by expr_parse.py
        """
        # TODO: Remove and use out_tokens
        tokens = []  # type: List[Token]

        # echo '\' is allowed, but x = '\' is invalid, in favor of x = r'\'
        no_backslashes = is_ysh_expr and left_token.id == Id.Left_SingleQuote

        expected_end_tokens = 3 if left_token.id in (
            Id.Left_TSingleQuote, Id.Left_RTSingleQuote,
            Id.Left_UTSingleQuote, Id.Left_BTSingleQuote) else 1
        num_end_tokens = 0

        while num_end_tokens < expected_end_tokens:
            self._SetNext(lex_mode)
            self._GetToken()

            # Kind.Char emitted in lex_mode.SQ_C
            if self.token_kind in (Kind.Lit, Kind.Char):
                tok = self.cur_token
                # Happens in lex_mode_e.SQ: 'one\two' is ambiguous, should be
                # r'one\two' or c'one\\two'
                if no_backslashes and lexer.TokenContains(tok, '\\'):
                    p_die(
                        r"Strings with backslashes should look like r'\n' or u'\n' or b'\n'",
                        tok)

                if is_ysh_expr:
                    # Disallow var x = $'\001'.  Arguably we don't need these
                    # checks because u'\u{1}' is the way to write it.
                    if self.token_type == Id.Char_Octal3:
                        p_die(
                            r"Use \xhh or \u{...} instead of octal escapes in YSH strings",
                            tok)

                    if self.token_type == Id.Char_Hex and self.cur_token.length != 4:
                        # disallow \xH
                        p_die(
                            r'Invalid hex escape in YSH string (must be \xHH)',
                            tok)

                tokens.append(tok)

            elif self.token_kind == Kind.Unknown:
                tok = self.cur_token
                assert tok.id == Id.Unknown_Backslash, tok

                # x = $'\z' is disallowed; ditto for echo $'\z' if
                # shopt -u parse_backslash
                if is_ysh_expr or not self.parse_opts.parse_backslash():
                    p_die(
                        "Invalid char escape in C-style string literal (OILS-ERR-11)",
                        tok)

                tokens.append(tok)

            elif self.token_kind == Kind.Eof:
                p_die('Unexpected EOF in single-quoted string that began here',
                      left_token)

            elif self.token_kind == Kind.Right:
                # assume Id.Right_SingleQuote
                num_end_tokens += 1
                tokens.append(self.cur_token)

            else:
                raise AssertionError(self.cur_token)

            if self.token_kind != Kind.Right:
                num_end_tokens = 0  # we need three in a ROW

        if expected_end_tokens == 1:
            tokens.pop()
        elif expected_end_tokens == 3:  # Get rid of spurious end tokens
            tokens.pop()
            tokens.pop()
            tokens.pop()

        # Remove space from '''  r'''  $''' in both expression mode and
        # command mode
        if left_token.id in (Id.Left_TSingleQuote, Id.Left_RTSingleQuote,
                             Id.Left_UTSingleQuote, Id.Left_BTSingleQuote):
            word_compile.RemoveLeadingSpaceSQ(tokens)

        # Validation after lexing - same 2 checks in j8.LexerDecoder
        is_u_string = left_token.id in (Id.Left_USingleQuote,
                                        Id.Left_UTSingleQuote)

        for tok in tokens:
            # u'\yff' is not valid, but b'\yff' is
            if is_u_string and tok.id == Id.Char_YHex:
                p_die(
                    r"%s escapes not allowed in u'' strings" %
                    lexer.TokenVal(tok), tok)

        out_tokens.extend(tokens)
        return self.cur_token

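    # Note on the end-token counting above: a triple-quoted string only
    # terminates after THREE consecutive Right_SingleQuote tokens, so any
    # other token kind resets num_end_tokens to 0.  The closing quote tokens
    # are then popped off, since they aren't part of the string value.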
    def _ReadDoubleQuotedLeftParts(self):
        # type: () -> word_part_t
        """Read substitution parts in a double quoted context."""
        if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick):
            return self._ReadCommandSub(self.token_type, d_quoted=True)

        if self.token_type == Id.Left_DollarBrace:
            return self._ReadBracedVarSub(self.cur_token, d_quoted=True)

        if self.token_type == Id.Left_DollarDParen:
            return self._ReadArithSub()

        if self.token_type == Id.Left_DollarBracket:
            return self._ReadExprSub(lex_mode_e.DQ)

        if self.token_type == Id.Left_DollarBraceZsh:
            return self._ReadZshVarSub(self.cur_token)

        raise AssertionError(self.cur_token)

    def _ReadYshSingleQuoted(self, left_id):
        # type: (Id_t) -> CompoundWord
        """Read YSH style strings

        r'' u'' b''
        r''' ''' u''' ''' b''' '''
        """
        #log('BEF self.cur_token %s', self.cur_token)
        if left_id == Id.Left_RSingleQuote:
            lexer_mode = lex_mode_e.SQ_Raw
            triple_left_id = Id.Left_RTSingleQuote
        elif left_id == Id.Left_USingleQuote:
            lexer_mode = lex_mode_e.J8_Str
            triple_left_id = Id.Left_UTSingleQuote
        elif left_id == Id.Left_BSingleQuote:
            lexer_mode = lex_mode_e.J8_Str
            triple_left_id = Id.Left_BTSingleQuote
        else:
            raise AssertionError(left_id)

        # Needed for syntax checks
        left_tok = self.cur_token
        left_tok.id = left_id

        sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)

        if (len(sq_part.sval) == 0 and self.lexer.ByteLookAhead() == "'"):
            self._SetNext(lex_mode_e.ShCommand)
            self._GetToken()

            assert self.token_type == Id.Left_SingleQuote
            # HACK: magically transform the third ' in u''' to
            # Id.Left_UTSingleQuote, so that ''' is the terminator
            left_tok = self.cur_token
            left_tok.id = triple_left_id

            # Handles stripping leading whitespace
            sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)

        # Advance and validate
        self._SetNext(lex_mode_e.ShCommand)

        self._GetToken()
        if self.token_kind not in KINDS_THAT_END_WORDS:
            p_die('Unexpected token after YSH single-quoted string',
                  self.cur_token)

        return CompoundWord([sq_part])

    def _ReadUnquotedLeftParts(self, triple_out):
        # type: (Optional[BoolParamBox]) -> word_part_t
        """Read substitutions and quoted strings (for lex_mode_e.ShCommand).

        If triple_out is set, then we try parsing triple quoted strings,
        and set its value to True if we got one.
        """
        if self.token_type in (Id.Left_DoubleQuote, Id.Left_DollarDoubleQuote):
            # Note: $"" is a synonym for "".  It might make sense if it added
            # \n \0 \x00 \u{123} etc.  But that's not what bash does!
            dq_part = self._ReadDoubleQuoted(self.cur_token)
            # Got empty word "" and there's a " after
            if (triple_out and len(dq_part.parts) == 0 and
                    self.lexer.ByteLookAhead() == '"'):

                self._SetNext(lex_mode_e.ShCommand)
                self._GetToken()
                # HACK: magically transform the third " in """ to
                # Id.Left_TDoubleQuote, so that """ is the terminator
                left_dq_token = self.cur_token
                left_dq_token.id = Id.Left_TDoubleQuote
                triple_out.b = True  # let caller know we got it
                return self._ReadDoubleQuoted(left_dq_token)

            return dq_part

        if self.token_type in (Id.Left_SingleQuote, Id.Left_RSingleQuote,
                               Id.Left_DollarSingleQuote):
            if self.token_type == Id.Left_SingleQuote:
                lexer_mode = lex_mode_e.SQ_Raw
                triple_left_id = Id.Left_TSingleQuote
            elif self.token_type == Id.Left_RSingleQuote:
                lexer_mode = lex_mode_e.SQ_Raw
                triple_left_id = Id.Left_RTSingleQuote
            else:
                lexer_mode = lex_mode_e.SQ_C
                # there is no such thing as $'''
                triple_left_id = Id.Undefined_Tok

            sq_part = self._ReadSingleQuoted(self.cur_token, lexer_mode)

            # Got empty '' or r'' and there's a ' after
            # u'' and b'' are handled in _ReadYshSingleQuoted
            if (triple_left_id != Id.Undefined_Tok and
                    triple_out is not None and len(sq_part.sval) == 0 and
                    self.lexer.ByteLookAhead() == "'"):

                self._SetNext(lex_mode_e.ShCommand)
                self._GetToken()

                # HACK: magically transform the third ' in ''' to
                # Id.Left_TSingleQuote, so that ''' is the terminator
                left_sq_token = self.cur_token
                left_sq_token.id = triple_left_id

                triple_out.b = True  # let caller know we got it
                return self._ReadSingleQuoted(left_sq_token, lexer_mode)

            return sq_part

        if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick,
                               Id.Left_ProcSubIn, Id.Left_ProcSubOut):
            return self._ReadCommandSub(self.token_type, d_quoted=False)

        if self.token_type == Id.Left_DollarBrace:
            return self._ReadBracedVarSub(self.cur_token, d_quoted=False)

        if self.token_type == Id.Left_DollarDParen:
            return self._ReadArithSub()

        if self.token_type == Id.Left_DollarBracket:
            return self._ReadExprSub(lex_mode_e.ShCommand)

        if self.token_type == Id.Left_DollarBraceZsh:
            return self._ReadZshVarSub(self.cur_token)

        raise AssertionError(self.cur_token)

    def _ReadExtGlob(self):
        # type: () -> word_part.ExtGlob
        """
        Grammar:
          Item    = CompoundWord | EPSILON  # important: @(foo|) is allowed
          LEFT    = '@(' | '*(' | '+(' | '?(' | '!('
          RIGHT   = ')'
          ExtGlob = LEFT (Item '|')* Item RIGHT  # ITEM may be empty
          Compound includes ExtGlob
        """
        left_token = self.cur_token
        right_token = None  # type: Token
        arms = []  # type: List[CompoundWord]

        self.lexer.PushHint(Id.Op_RParen, Id.Right_ExtGlob)
        self._SetNext(lex_mode_e.ExtGlob)  # advance past LEFT

        read_word = False  # did we just read a word?  To handle @(||).

        while True:
            self._GetToken()

            if self.token_type == Id.Right_ExtGlob:
                if not read_word:
                    arms.append(CompoundWord([]))
                right_token = self.cur_token
                break

            elif self.token_type == Id.Op_Pipe:
                if not read_word:
                    arms.append(CompoundWord([]))
                read_word = False
                self._SetNext(lex_mode_e.ExtGlob)

            # lex_mode_e.ExtGlob should only produce these 4 kinds of tokens
            elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub,
                                     Kind.ExtGlob):
                w = self._ReadCompoundWord(lex_mode_e.ExtGlob)
                arms.append(w)
                read_word = True

            elif self.token_kind == Kind.Eof:
                p_die('Unexpected EOF reading extended glob that began here',
                      left_token)

            else:
                raise AssertionError(self.cur_token)

        return word_part.ExtGlob(left_token, arms, right_token)

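    # e.g. @(foo|bar) parses to two arms, and @(foo|) also to two arms where
    # the second is an empty CompoundWord -- that's what read_word tracks.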
    def _ReadBashRegexGroup(self):
        # type: () -> word_part.BashRegexGroup
        """
        Grammar:
          BashRegexGroup = '(' WORD? ')'
        """
        left_token = self.cur_token
        assert left_token.id == Id.BashRegex_LParen, left_token

        arms = []  # type: List[CompoundWord]

        self.lexer.PushHint(Id.Op_RParen, Id.Right_BashRegexGroup)
        self._SetNext(lex_mode_e.BashRegexFakeInner)  # advance past LEFT

        self._GetToken()
        if self.token_type == Id.Right_BashRegexGroup:  # empty ()
            return word_part.BashRegexGroup(left_token, None, self.cur_token)

        # lex_mode_e.BashRegex should only produce these 4 kinds of tokens
        if self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub, Kind.BashRegex):
            # Fake lexer mode that translates Id.WS_Space to Id.Lit_Chars
            # To allow bash style [[ s =~ (a b) ]]
            w = self._ReadCompoundWord(lex_mode_e.BashRegexFakeInner)
            arms.append(w)

            self._GetToken()
            if self.token_type != Id.Right_BashRegexGroup:
                p_die('Expected ) to close bash regex group', self.cur_token)

            return word_part.BashRegexGroup(left_token, w, self.cur_token)

        p_die('Expected word after ( opening bash regex group', self.cur_token)

    def _ReadLikeDQ(self, left_token, is_ysh_expr, out_parts):
        # type: (Optional[Token], bool, List[word_part_t]) -> None
        """
        Args:
          left_token: A token if we are reading a double quoted part, or None
            if we're reading a here doc.
          is_ysh_expr: Whether to disallow backticks and invalid char escapes
          out_parts: list of word_part to append to
        """
        if left_token:
            if left_token.id in (Id.Left_TDoubleQuote,
                                 Id.Left_DollarTDoubleQuote):
                expected_end_tokens = 3
            else:
                expected_end_tokens = 1
        else:
            expected_end_tokens = 1000  # here doc will break

        num_end_tokens = 0
        while num_end_tokens < expected_end_tokens:
            self._SetNext(lex_mode_e.DQ)
            self._GetToken()

            if self.token_kind == Kind.Lit:
                if self.token_type == Id.Lit_EscapedChar:
                    tok = self.cur_token
                    ch = lexer.TokenSliceLeft(tok, 1)
                    part = word_part.EscapedLiteral(tok,
                                                    ch)  # type: word_part_t
                else:
                    if self.token_type == Id.Lit_BadBackslash:
                        # echo "\z" is OK in shell, but x = "\z" is a syntax
                        # error in YSH.
                        # Slight hole: We don't catch x = ${undef:-"\z"}
                        # because of the recursion (unless parse_backslash)
                        if (is_ysh_expr or
                                not self.parse_opts.parse_backslash()):
                            p_die(
                                "Invalid char escape in double quoted string (OILS-ERR-12)",
                                self.cur_token)
                    elif self.token_type == Id.Lit_Dollar:
                        if is_ysh_expr or not self.parse_opts.parse_dollar():
                            p_die("Literal $ should be quoted like \$",
                                  self.cur_token)

                    part = self.cur_token
                out_parts.append(part)

            elif self.token_kind == Kind.Left:
                if self.token_type == Id.Left_Backtick and is_ysh_expr:
                    p_die("Invalid backtick: use $(cmd) or \\` in YSH strings",
                          self.cur_token)

                part = self._ReadDoubleQuotedLeftParts()
                out_parts.append(part)

            elif self.token_kind == Kind.VSub:
                tok = self.cur_token
                part = SimpleVarSub(tok)
                out_parts.append(part)
                # NOTE: parsing "$f(x)" would BREAK CODE.  Could add support
                # for it later.

            elif self.token_kind == Kind.Right:
                assert self.token_type == Id.Right_DoubleQuote, self.token_type
                if left_token:
                    num_end_tokens += 1

                # In a here doc, the right quote is literal!
                out_parts.append(self.cur_token)

            elif self.token_kind == Kind.Eof:
                if left_token:
                    p_die(
                        'Unexpected EOF reading double-quoted string that began here',
                        left_token)
                else:  # here docs will have an EOF in their token stream
                    break

            else:
                raise AssertionError(self.cur_token)

            if self.token_kind != Kind.Right:
                num_end_tokens = 0  # """ must be CONSECUTIVE

        if expected_end_tokens == 1:
            out_parts.pop()
        elif expected_end_tokens == 3:
            out_parts.pop()
            out_parts.pop()
            out_parts.pop()

        # Remove space from """ in both expression mode and command mode
        if (left_token and left_token.id
                in (Id.Left_TDoubleQuote, Id.Left_DollarTDoubleQuote)):
            word_compile.RemoveLeadingSpaceDQ(out_parts)

        # Return nothing, since we appended to 'out_parts'

    def _ReadDoubleQuoted(self, left_token):
        # type: (Token) -> DoubleQuoted
        """Helper function for "hello $name".

        Also used for ${foo%%a b c} -- the operator argument is treated as
        double quoted until you hit }.
        """
        parts = []  # type: List[word_part_t]
        self._ReadLikeDQ(left_token, False, parts)

        right_quote = self.cur_token
        return DoubleQuoted(left_token, parts, right_quote)

    def ReadDoubleQuoted(self, left_token, parts):
        # type: (Token, List[word_part_t]) -> Token
        """For expression mode.

        Read var x = "${dir:-}/$name"; etc.
        """
        self._ReadLikeDQ(left_token, True, parts)
        return self.cur_token

    def _ReadCommandSub(self, left_id, d_quoted=False):
        # type: (Id_t, bool) -> CommandSub
        """
        NOTE: This is not in the grammar, because word parts aren't in the
        grammar!

        command_sub = '$(' command_list ')'
                    | '@(' command_list ')'
                    | '<(' command_list ')'
                    | '>(' command_list ')'
                    | ` command_list `
        """
        left_token = self.cur_token

        # Set the lexer in a state so ) becomes the EOF token.
        if left_id in (Id.Left_DollarParen, Id.Left_AtParen,
                       Id.Left_ProcSubIn, Id.Left_ProcSubOut):
            self._SetNext(lex_mode_e.ShCommand)  # advance past $( etc.

            right_id = Id.Eof_RParen
            self.lexer.PushHint(Id.Op_RParen, right_id)
            c_parser = self.parse_ctx.MakeParserForCommandSub(
                self.line_reader, self.lexer, right_id)
            # NOTE: This doesn't use something like main_loop because we don't
            # want to interleave parsing and execution!  Unlike 'source' and
            # 'eval'.
            node = c_parser.ParseCommandSub()

            right_token = c_parser.w_parser.cur_token

        elif left_id == Id.Left_Backtick and self.parse_ctx.do_lossless:
            # NOTE: This is an APPROXIMATE solution for translation ONLY.  See
            # test/osh2oil.

            right_id = Id.Eof_Backtick
            self.lexer.PushHint(Id.Left_Backtick, right_id)
            c_parser = self.parse_ctx.MakeParserForCommandSub(
                self.line_reader, self.lexer, right_id)
            node = c_parser.ParseCommandSub()
            right_token = c_parser.w_parser.cur_token

        elif left_id == Id.Left_Backtick:
            if not self.parse_opts.parse_backticks():
                p_die('Use $(cmd) instead of backticks (parse_backticks)',
                      left_token)

            self._SetNext(lex_mode_e.Backtick)  # advance past `

            parts = []  # type: List[str]
            while True:
                self._GetToken()
                #log("TOK %s", self.cur_token)

                if self.token_type == Id.Backtick_Quoted:
                    # Remove leading \
                    parts.append(lexer.TokenSliceLeft(self.cur_token, 1))

                elif self.token_type == Id.Backtick_DoubleQuote:
                    # Compatibility: If backticks are double quoted, then
                    # double quotes within them have to be \"
                    # Shells aren't smart enough to match nested " and `
                    # quotes (but OSH is)
                    if d_quoted:
                        # Remove leading \
                        parts.append(lexer.TokenSliceLeft(self.cur_token, 1))
                    else:
                        parts.append(lexer.TokenVal(self.cur_token))

                elif self.token_type == Id.Backtick_Other:
                    parts.append(lexer.TokenVal(self.cur_token))

                elif self.token_type == Id.Backtick_Right:
                    break

                elif self.token_type == Id.Eof_Real:
                    # Note: this parse error is in the ORIGINAL context.  No
                    # code_str yet.
                    p_die('Unexpected EOF while looking for closing backtick',
                          left_token)

                else:
                    raise AssertionError(self.cur_token)

                self._SetNext(lex_mode_e.Backtick)

            # Calculate right SPID on CommandSub BEFORE re-parsing.
            right_token = self.cur_token

            code_str = ''.join(parts)
            #log('code %r', code_str)

            # NOTE: This is similar to how we parse aliases in
            # osh/cmd_parse.py.  It won't have the same location info as
            # MakeParserForCommandSub(), because the lexer is different.
            arena = self.parse_ctx.arena
            #arena = alloc.Arena()
            line_reader = reader.StringLineReader(code_str, arena)
            c_parser = self.parse_ctx.MakeOshParser(line_reader)
            src = source.Reparsed('backticks', left_token, right_token)
            with alloc.ctx_SourceCode(arena, src):
                node = c_parser.ParseCommandSub()

        else:
            raise AssertionError(left_id)

        return CommandSub(left_token, node, right_token)

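    # Note the two strategies above: $( ... ) is parsed in place by a
    # sub-parser that treats the closing ) as EOF, while legacy ` ... ` is
    # first collected into a plain string (unescaping \` and \") and then
    # re-parsed, which is why its locations point at
    # source.Reparsed('backticks', ...).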
    def _ReadExprSub(self, lex_mode):
        # type: (lex_mode_t) -> word_part.ExprSub
        """$[d->key]  $[obj.method()]  etc."""
        left_token = self.cur_token

        self._SetNext(lex_mode_e.Expr)
        enode, right_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.ysh_expr_sub)

        self._SetNext(lex_mode)  # Move past ]
        return word_part.ExprSub(left_token, enode, right_token)

    def ParseVarDecl(self, kw_token):
        # type: (Token) -> command.VarDecl
        """
        oil_var_decl: name_type_list '=' testlist end_stmt

        Note that assignments must end with \n  ;  }  or EOF.  Unlike shell
        assignments, we disallow:

        var x = 42 | wc -l
        var x = 42 && echo hi
        """
        self._SetNext(lex_mode_e.Expr)
        enode, last_token = self.parse_ctx.ParseVarDecl(kw_token, self.lexer)
        # Hack to move } from what the Expr lexer modes gives to what
        # CommandParser wants
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace

        # Let the CommandParser see the Op_Semi or Op_Newline.
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)  # always back to this
        return enode

    def ParseMutation(self, kw_token, var_checker):
        # type: (Token, VarChecker) -> command.Mutation
        """
        setvar i = 42
        setvar i += 1
        setvar a[i] = 42
        setvar a[i] += 1
        setvar d.key = 42
        setvar d.key += 1
        """
        self._SetNext(lex_mode_e.Expr)
        enode, last_token = self.parse_ctx.ParseMutation(kw_token, self.lexer)
        # Hack to move } from what the Expr lexer modes gives to what
        # CommandParser wants
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace

        for lhs in enode.lhs:
            UP_lhs = lhs
            with tagswitch(lhs) as case:
                if case(y_lhs_e.Var):
                    lhs = cast(Token, UP_lhs)
                    var_checker.Check(kw_token.id, lexer.LazyStr(lhs), lhs)

                # Note: this does not cover cases like
                #   setvar (a[0])[1] = v
                #   setvar (d.key).other = v
                # This leaks into catching all typos statically, which may be
                # possible if 'use' makes all names explicit.
                elif case(y_lhs_e.Subscript):
                    lhs = cast(Subscript, UP_lhs)
                    if lhs.obj.tag() == expr_e.Var:
                        v = cast(expr.Var, lhs.obj)
                        var_checker.Check(kw_token.id, v.name, v.left)

                elif case(y_lhs_e.Attribute):
                    lhs = cast(Attribute, UP_lhs)
                    if lhs.obj.tag() == expr_e.Var:
                        v = cast(expr.Var, lhs.obj)
                        var_checker.Check(kw_token.id, v.name, v.left)

        # Let the CommandParser see the Op_Semi or Op_Newline.
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)  # always back to this
        return enode

    def ParseBareDecl(self):
        # type: () -> expr_t
        """
        x = {name: val}
        """
        self._SetNext(lex_mode_e.Expr)
        self._GetToken()
        enode, last_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.command_expr)
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)
        return enode

    def ParseYshExprForCommand(self):
        # type: () -> expr_t

        # Fudge for this case
        #  for x in(y) {
        # versus
        #  for x in (y) {
        #
        # In the former case, ReadWord on 'in' puts the lexer past (.
        # Also see LookPastSpace in CommandParser.
        # A simpler solution would be nicer.

        if self.token_type == Id.Op_LParen:
            self.lexer.MaybeUnreadOne()

        enode, _ = self.parse_ctx.ParseYshExpr(self.lexer, grammar_nt.ysh_expr)

        self._SetNext(lex_mode_e.ShCommand)
        return enode

    def ParseCommandExpr(self):
        # type: () -> expr_t
        """
        = 1+2
        """
        enode, last_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.command_expr)

        # In some cases, such as the case statement, we expect *the lexer* to
        # be pointing at the token right after the expression.  But the
        # expression parser must have read to the `last_token`.  Unreading
        # places the lexer back in the expected state.  Ie:
        #
        # case (x) {                           case (x) {
        #   (else) { = x }                       (else) { = x }
        #                ^ The lexer is here            ^ Unread to here
        # }                                    }
        assert last_token.id in (Id.Op_Newline, Id.Eof_Real, Id.Op_Semi,
                                 Id.Op_RBrace), last_token
        if last_token.id != Id.Eof_Real:
            # Eof_Real is the only token we cannot unread
            self.lexer.MaybeUnreadOne()

        return enode

    def ParseProc(self, node):
        # type: (Proc) -> None

        # proc name-with-hyphens() must be accepted
        self._SetNext(lex_mode_e.ShCommand)
        self._GetToken()
        # example: 'proc f[' gets you Lit_ArrayLhsOpen
        if self.token_type != Id.Lit_Chars:
            p_die('Invalid proc name %s' % ui.PrettyToken(self.cur_token),
                  self.cur_token)

        # TODO: validate this more.  Disallow proc 123 { }, which isn't
        # disallowed for shell functions.  Similar to IsValidVarName().
        node.name = self.cur_token

        last_token = self.parse_ctx.ParseProc(self.lexer, node)

        # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
        assert last_token.id == Id.Op_LBrace
        last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        self._SetNext(lex_mode_e.ShCommand)

    def ParseFunc(self, node):
        # type: (Func) -> None
        last_token = self.parse_ctx.ParseFunc(self.lexer, node)

        # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
        assert last_token.id == Id.Op_LBrace
        last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        self._SetNext(lex_mode_e.ShCommand)

    def ParseYshCasePattern(self):
        # type: () -> Tuple[pat_t, Token]
        pat, left_tok, last_token = self.parse_ctx.ParseYshCasePattern(
            self.lexer)

        if last_token.id == Id.Op_LBrace:
            last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        return pat, left_tok

    def NewlineOkForYshCase(self):
        # type: () -> Id_t
        """Check for optional newline and consume it.

        This is a special case of `_NewlineOk` which fixes some "off-by-one"
        issues that crop up while parsing YSH case arms.  For more details,
        see #oil-dev > Progress On YSH Case Grammar on zulip.

        Returns a token id which is filled with the choice of

            word { echo word }
            (3)  { echo expr }
            /e/  { echo eggex }
            }    # right brace
        """
        while True:
            next_id = self.lexer.LookAheadOne(lex_mode_e.Expr)

            # Cannot lookahead past lines
            if next_id == Id.Unknown_Tok:
                if not self.lexer.MoveToNextLine():  # Try to move to next line
                    break  # EOF
                continue

            next_kind = consts.GetKind(next_id)
            if next_id != Id.Op_Newline and next_kind != Kind.Ignored:
                break

            self.lexer.Read(lex_mode_e.Expr)

        if next_id in (Id.Op_RBrace, Id.Op_LParen, Id.Arith_Slash):
            self._SetNext(lex_mode_e.Expr)  # Continue in expression mode
        else:
            # Consume the trailing Op_Newline
            self._SetNext(lex_mode_e.ShCommand)
            self._GetToken()

        return next_id

    def _ReadArithExpr(self, end_id):
        # type: (Id_t) -> arith_expr_t
        """Read and parse an arithmetic expression in various contexts.

        $(( 1+2 ))
        (( a=1+2 ))
        ${a[ 1+2 ]}
        ${a : 1+2 : 1+2}

        See tests/arith-context.test.sh for ambiguous cases.

        ${a[a[0]]} is valid  # VS_RBRACKET vs Id.Arith_RBracket

        ${s : a<b?0:1 : 1}  # VS_COLON vs Id.Arith_Colon

        See the assertion in ArithParser.Parse() -- unexpected extra input.
        """
        # calls self.ReadWord(lex_mode_e.Arith)
        anode = self.a_parser.Parse()
        cur_id = self.a_parser.CurrentId()
        if end_id != Id.Undefined_Tok and cur_id != end_id:
            p_die(
                'Unexpected token after arithmetic expression (%s != %s)' %
                (ui.PrettyId(cur_id), ui.PrettyId(end_id)),
                loc.Word(self.a_parser.cur_word))
        return anode

    def _ReadArithSub(self):
        # type: () -> word_part.ArithSub
        """Read an arith substitution, which contains an arith expression, e.g.

        $((a + 1)).
        """
        left_tok = self.cur_token

        # The second ) needs to be disambiguated in stuff like:
        #   $(echo $(( 1+2 )) )
        self.lexer.PushHint(Id.Op_RParen, Id.Right_DollarDParen)

        # NOTE: To disambiguate $(( as arith sub vs. command sub and subshell,
        # we could save the lexer/reader state here, and retry if the
        # arithmetic parse fails.  But we can almost always catch this at
        # parse time.  There could be some exceptions like:
        #   $((echo * foo))  # looks like multiplication
        #   $((echo / foo))  # looks like division

        # $(( )) is valid
        anode = arith_expr.EmptyZero  # type: arith_expr_t

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            anode = self._ReadArithExpr(Id.Arith_RParen)

        self._SetNext(lex_mode_e.ShCommand)

        # Ensure we get closing )
        self._GetToken()
        if self.token_type != Id.Right_DollarDParen:
            p_die('Expected second ) to end arith sub', self.cur_token)

        right_tok = self.cur_token
        return word_part.ArithSub(left_tok, anode, right_tok)

    def ReadDParen(self):
        # type: () -> Tuple[arith_expr_t, Token]
        """Read ((1+ 2))  -- command context.

        We're using the word parser because it's very similar to
        _ReadArithExpr above.

        This also returns the terminating Id.Op_DRightParen token for location
        info.
        """
        # (( )) is valid
        anode = arith_expr.EmptyZero  # type: arith_expr_t

        self.lexer.PushHint(Id.Op_RParen, Id.Op_DRightParen)

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            anode = self._ReadArithExpr(Id.Arith_RParen)

        self._SetNext(lex_mode_e.ShCommand)

        # Ensure we get the second )
        self._GetToken()
        right = self.cur_token
        if right.id != Id.Op_DRightParen:
            p_die('Expected second ) to end arith statement', right)

        self._SetNext(lex_mode_e.ShCommand)

        return anode, right

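    # In both $(( )) and (( )), the FIRST ) is read in arith mode as
    # Arith_RParen, and lexer.PushHint() makes the SECOND ) lex as the closing
    # token (Right_DollarDParen or Op_DRightParen) instead of plain Op_RParen.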
    def _NextNonSpace(self):
        # type: () -> None
        """Advance in lex_mode_e.Arith until non-space token.

        Same logic as _ReadWord, but used in
           $(( ))
           (( ))
           for (( ))

        You can read self.token_type after this, without calling _GetToken.
        """
        while True:
            self._SetNext(lex_mode_e.Arith)
            self._GetToken()
            if self.token_kind not in (Kind.Ignored, Kind.WS):
                break

    def ReadForExpression(self):
        # type: () -> command.ForExpr
        """Read ((i=0; i<5; ++i)) -- part of command context."""
        self._NextNonSpace()  # skip over ((
        cur_id = self.token_type  # for end of arith expressions

        if cur_id == Id.Arith_Semi:  # for (( ; i < 10; i++ ))
            init_node = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            init_node = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()
        self._NextNonSpace()

        # It's odd to keep track of both cur_id and self.token_type in this
        # function, but it works, and is tested in 'test/parse_error.sh
        # arith-integration'
        if cur_id != Id.Arith_Semi:  # for (( x=0 b; ... ))
            p_die("Expected ; here", loc.Word(self.a_parser.cur_word))

        self._GetToken()
        cur_id = self.token_type

        if cur_id == Id.Arith_Semi:  # for (( ; ; i++ ))
            # empty condition is TRUE
            cond_node = arith_expr.EmptyOne  # type: arith_expr_t
        else:
            cond_node = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()

        if cur_id != Id.Arith_Semi:  # for (( x=0; x<5 b ))
            p_die("Expected ; here", loc.Word(self.a_parser.cur_word))

        self._NextNonSpace()
        if self.token_type == Id.Arith_RParen:  # for (( ; ; ))
            update_node = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            update_node = self._ReadArithExpr(Id.Arith_RParen)

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            p_die('Expected ) to end for loop expression', self.cur_token)
        self._SetNext(lex_mode_e.ShCommand)

        # redirects is None, will be assigned in CommandEvaluator
        node = command.ForExpr.CreateNull()
        node.init = init_node
        node.cond = cond_node
        node.update = update_node
        return node

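    # e.g. for (( i=0; i<5; ++i )) fills init/cond/update.  Each section may
    # be empty: an empty condition defaults to EmptyOne (i.e. true), while
    # empty init/update sections default to EmptyZero.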
    def _ReadArrayLiteral(self):
        # type: () -> word_part_t
        """a=(1 2 3)

        TODO: See osh/cmd_parse.py:164 for Id.Lit_ArrayLhsOpen, for a[x++]=1

        We want:

        A=(['x']=1 ["x"]=2 [$x$y]=3)

        Maybe allow this as a literal string?  Because I think I've seen it
        before?  Or maybe force people to patch to learn the rule.

        A=([x]=4)

        Starts with Lit_Other '[', and then it has Lit_ArrayLhsClose
        Maybe enforce that ALL have keys or NONE have keys.
        """
        self._SetNext(lex_mode_e.ShCommand)  # advance past (
        self._GetToken()
        if self.cur_token.id != Id.Op_LParen:
            p_die('Expected ( after =', self.cur_token)
        left_token = self.cur_token
        right_token = None  # type: Token

        # MUST use a new word parser (with same lexer).
        w_parser = self.parse_ctx.MakeWordParser(self.lexer, self.line_reader)
        words = []  # type: List[CompoundWord]
        done = False
        while not done:
            w = w_parser.ReadWord(lex_mode_e.ShCommand)
            with tagswitch(w) as case:
                if case(word_e.Operator):
                    tok = cast(Token, w)
                    if tok.id == Id.Right_Initializer:
                        right_token = tok
                        done = True  # can't use break here
                    # Unlike command parsing, array parsing allows embedded \n.
                    elif tok.id == Id.Op_Newline:
                        continue
                    else:
                        p_die('Unexpected token in array literal', loc.Word(w))

                elif case(word_e.Compound):
                    words.append(cast(CompoundWord, w))

                else:
                    raise AssertionError()

        initializer_words = []  # type: List[InitializerWord_t]
        for w in words:
            pair = word_.DetectAssocPair(w)
            if pair is not None:
                word_.TildeDetectAssign(pair.value)  # pair.value is modified
                initializer_words.append(pair)
            else:
                w2 = braces.BraceDetect(w)  # type: word_t
                if w2 is None:
                    w2 = w
                w3 = word_.TildeDetect(w2)  # type: word_t
                if w3 is None:
                    w3 = w2
                initializer_words.append(InitializerWord.ArrayWord(w3))

        # invariant List?
        return word_part.InitializerLiteral(left_token, initializer_words,
                                            right_token)

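    # e.g. a=(1 2 3) yields three ArrayWord items, while A=([x]=1 [y]=2)
    # yields assoc pairs detected by word_.DetectAssocPair; brace and tilde
    # detection are applied to each word without a key.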
    def ParseProcCallArgs(self, start_symbol):
        # type: (int) -> ArgList
        """ json write (x) """
        self.lexer.MaybeUnreadOne()

        arg_list = ArgList.CreateNull(alloc_lists=True)
        arg_list.left = self.cur_token
        self.parse_ctx.ParseProcCallArgs(self.lexer, arg_list, start_symbol)
        return arg_list

    def _MaybeReadWordPart(self, is_first, lex_mode, parts):
        # type: (bool, lex_mode_t, List[word_part_t]) -> bool
        """Helper for _ReadCompoundWord3."""
        done = False

        if self.token_type == Id.Lit_EscapedChar:
            tok = self.cur_token
            assert tok.length == 2
            ch = lexer.TokenSliceLeft(tok, 1)
            if not self.parse_opts.parse_backslash():
                if not pyutil.IsValidCharEscape(ch):
                    p_die('Invalid char escape in unquoted word (OILS-ERR-13)',
                          self.cur_token)

            part = word_part.EscapedLiteral(self.cur_token,
                                            ch)  # type: word_part_t
        else:
            part = self.cur_token

        if is_first and self.token_type == Id.Lit_VarLike:  # foo=
            parts.append(part)
            # Unfortunately it's awkward to pull the check for a=(1 2) up to
            # _ReadWord.
            next_id = self.lexer.LookPastSpace(lex_mode)
            if next_id == Id.Op_LParen:
                self.lexer.PushHint(Id.Op_RParen, Id.Right_Initializer)
                part2 = self._ReadArrayLiteral()
                parts.append(part2)

                # An array literal must be the last part of the word.
                self._SetNext(lex_mode)
                self._GetToken()
                # EOF, whitespace, newline, Right_Subshell
                if self.token_kind not in KINDS_THAT_END_WORDS:
                    p_die('Unexpected token after array literal',
                          self.cur_token)
                done = True

        elif (is_first and self.parse_opts.parse_at() and
              self.token_type == Id.Lit_Splice):

            splice_tok = self.cur_token
            part2 = word_part.Splice(splice_tok,
                                     lexer.TokenSliceLeft(splice_tok, 1))

            parts.append(part2)

            # @words must be the last part of the word
            self._SetNext(lex_mode)
            self._GetToken()
            # EOF, whitespace, newline, Right_Subshell
            if self.token_kind not in KINDS_THAT_END_WORDS:
                p_die('Unexpected token after array splice', self.cur_token)
            done = True

        elif (is_first and self.parse_opts.parse_at() and
              self.token_type == Id.Lit_AtLBracket):  # @[split(x)]
            part2 = self._ReadExprSub(lex_mode_e.DQ)
            parts.append(part2)

            # @[split(x)] must also end the word
            self._SetNext(lex_mode)
            self._GetToken()
            # EOF, whitespace, newline, Right_Subshell
            if self.token_kind not in KINDS_THAT_END_WORDS:
                p_die('Unexpected token after Expr splice', self.cur_token)
            done = True

        elif (is_first and self.parse_opts.parse_at() and
              self.token_type == Id.Lit_AtLBraceDot):
            p_die('TODO: @{.myproc builtin sub}', self.cur_token)

        elif (is_first and self.parse_opts.parse_at_all() and
              self.token_type == Id.Lit_At):
            # Because $[x] ${x} and perhaps $/x/ are reserved, it makes sense
            # for @ at the beginning of a word to be reserved.

            # Although should we relax 'echo @'?  I'm tempted to have a
            # shortcut for @_argv.
            p_die('Literal @ starting a word must be quoted (parse_at_all)',
                  self.cur_token)

        else:
            # not a literal with lookahead; append it
            parts.append(part)

        return done

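    # First-token special cases handled above (illustrative):
    #
    #   a=(1 2 3)      # Lit_VarLike then ( starts an array literal
    #   @words         # Lit_Splice: splice an array variable (parse_at)
    #   @[split(x)]    # Lit_AtLBracket: expression splice (parse_at)
    #
    # Each must end the word: 'a=(1 2)x' and '@words.x' are parse errors.
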
    def _ReadCompoundWord(self, lex_mode):
        # type: (lex_mode_t) -> CompoundWord
        return self._ReadCompoundWord3(lex_mode, Id.Undefined_Tok, True)

    def _ReadCompoundWord3(self, lex_mode, eof_type, empty_ok):
        # type: (lex_mode_t, Id_t, bool) -> CompoundWord
        """
        Precondition: Looking at the first token of the first word part
        Postcondition: Looking at the token after, e.g. space or operator

        NOTE: eof_type is necessary because / is a literal, i.e. Lit_Slash,
        but it could be an operator delimiting a compound word.  Can we change
        lexer modes and remove this special case?
        """
        w = CompoundWord([])
        num_parts = 0
        brace_count = 0
        done = False
        is_triple_quoted = None  # type: Optional[BoolParamBox]

        while not done:
            self._GetToken()

            allow_done = empty_ok or num_parts != 0
            if allow_done and self.token_type == eof_type:
                done = True  # e.g. for ${foo//pat/replace}

            # Keywords like "for" are treated like literals
            elif self.token_kind in (Kind.Lit, Kind.History, Kind.KW,
                                     Kind.ControlFlow, Kind.BoolUnary,
                                     Kind.BoolBinary):

                # Count { and } so the unbalanced-brace check below can fire
                if self.token_type == Id.Lit_LBrace:
                    brace_count += 1
                elif self.token_type == Id.Lit_RBrace:
                    brace_count -= 1
                elif self.token_type == Id.Lit_Dollar:
                    if not self.parse_opts.parse_dollar():
                        if num_parts == 0 and lex_mode == lex_mode_e.ShCommand:
                            next_byte = self.lexer.ByteLookAhead()
                            # TODO: switch lexer modes and parse $/d+/.  But not ${a:-$/d+/}
                            if next_byte == '/':
                                #log('next_byte %r', next_byte)
                                pass

                        p_die('Literal $ should be quoted like \$',
                              self.cur_token)

                done = self._MaybeReadWordPart(num_parts == 0, lex_mode,
                                               w.parts)

            elif self.token_kind == Kind.VSub:
                vsub_token = self.cur_token

                part = SimpleVarSub(vsub_token)  # type: word_part_t
                w.parts.append(part)

            elif self.token_kind == Kind.ExtGlob:
                # If parse_at, we can take over @( to start @(seq 3)
                # Users can also use ,(*.py|*.sh)
                if (self.parse_opts.parse_at() and
                        self.token_type == Id.ExtGlob_At and num_parts == 0):
                    cs_part = self._ReadCommandSub(Id.Left_AtParen,
                                                   d_quoted=False)
                    # RARE mutation of tok.id!
                    cs_part.left_token.id = Id.Left_AtParen
                    part = cs_part  # for type safety

                    # Same check as _MaybeReadWordPart.  @(seq 3)x is illegal,
                    # just like a=(one two)x and @arrayfunc(3)x.
                    self._GetToken()
                    if self.token_kind not in KINDS_THAT_END_WORDS:
                        p_die('Unexpected token after @()', self.cur_token)
                    done = True

                else:
                    if HAVE_FNM_EXTMATCH == 0:
                        p_die(
                            "Extended glob won't work without FNM_EXTMATCH support in libc",
                            self.cur_token)
                    part = self._ReadExtGlob()
                w.parts.append(part)

            elif self.token_kind == Kind.BashRegex:
                if self.token_type == Id.BashRegex_LParen:  # Opening (
                    part = self._ReadBashRegexGroup()
                    w.parts.append(part)
                else:
                    assert self.token_type == Id.BashRegex_AllowedInParens
                    p_die('Invalid token in bash regex', self.cur_token)

            elif self.token_kind == Kind.Left:
                try_triple_quote = (self.parse_opts.parse_triple_quote() and
                                    lex_mode == lex_mode_e.ShCommand and
                                    num_parts == 0)

                # Save an allocation
                if try_triple_quote:
                    is_triple_quoted = BoolParamBox(False)

                part = self._ReadUnquotedLeftParts(is_triple_quoted)
                w.parts.append(part)

            # NOT done yet, will advance below
            elif self.token_kind == Kind.Right:
                # Still part of the word; will be done on the next iter.
                if self.token_type == Id.Right_DoubleQuote:
                    pass
                # Never happens, no PushHint for this case.
                #elif self.token_type == Id.Right_DollarParen:
                #    pass
                elif self.token_type == Id.Right_Subshell:
                    # LEXER HACK for (case x in x) ;; esac )
                    # Rewind before it's used
                    assert self.next_lex_mode == lex_mode_e.Undefined
                    if self.lexer.MaybeUnreadOne():
                        self.lexer.PushHint(Id.Op_RParen, Id.Right_Subshell)
                        self._SetNext(lex_mode)
                    done = True
                else:
                    done = True

            elif self.token_kind == Kind.Ignored:
                done = True

            else:
                # LEXER HACK for unbalanced case clause.  'case foo in esac'
                # is valid, so to test for esac, we can read ) before getting
                # a chance to PushHint(Id.Op_RParen, Id.Right_CasePat).  So
                # here we unread one token and do it again.

                # We get Id.Op_RParen at top level: case x in x) ;; esac
                # We get Id.Eof_RParen inside ComSub: $(case x in x) ;; esac )
                if self.token_type in (Id.Op_RParen, Id.Eof_RParen):
                    # Rewind before it's used
                    assert self.next_lex_mode == lex_mode_e.Undefined
                    if self.lexer.MaybeUnreadOne():
                        if self.token_type == Id.Eof_RParen:
                            # Redo translation
                            self.lexer.PushHint(Id.Op_RParen, Id.Eof_RParen)
                        self._SetNext(lex_mode)

                done = True  # anything we don't recognize means we're done

            if not done:
                self._SetNext(lex_mode)
                num_parts += 1

        if (self.parse_opts.parse_brace() and num_parts > 1 and
                brace_count != 0):
            # accept { and }, but not foo{
            p_die(
                'Word has unbalanced { }. Maybe add a space or quote it like \{',
                loc.Word(w))

        if is_triple_quoted and is_triple_quoted.b and num_parts > 1:
            p_die('Unexpected parts after triple quoted string',
                  loc.WordPart(w.parts[-1]))

        if 0:
            from _devbuild.gen.syntax_asdl import word_part_str
            word_key = ' '.join(word_part_str(p.tag()) for p in w.parts)
            WORD_HIST[word_key] += 1
        return w

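    # Illustrative compound words (a sketch):
    #
    #   pre"mid"${x}   # three parts: Lit_Chars, DoubleQuoted, BracedVarSub
    #   foo{a,b}       # balanced { } are fine; bare 'foo{' fails parse_brace
    #
    # The Op_RParen/Eof_RParen rewind above is what lets 'case x in x) ;; esac'
    # parse without a dedicated lexer mode for case patterns.
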
    def _ReadArithWord(self):
        # type: () -> Optional[word_t]
        """ Helper for ReadArithWord() """
        self._GetToken()

        if self.token_kind == Kind.Unknown:
            # e.g. happened during dynamic parsing of unset 'a[$foo]' in gherkin
            p_die(
                'Unexpected token while parsing arithmetic: %r' %
                lexer.TokenVal(self.cur_token), self.cur_token)

        elif self.token_kind == Kind.Eof:
            return self.cur_token

        elif self.token_kind == Kind.Ignored:
            # Space should be ignored.
            self._SetNext(lex_mode_e.Arith)
            return None

        elif self.token_kind in (Kind.Arith, Kind.Right):
            # Id.Right_DollarDParen IS just a normal token, handled by ArithParser
            self._SetNext(lex_mode_e.Arith)
            return self.cur_token

        elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub):
            return self._ReadCompoundWord(lex_mode_e.Arith)

        else:
            raise AssertionError(self.cur_token)

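    # E.g. in $(( x + 1 )): 'x' comes back as a compound word via Kind.Lit,
    # '+' is returned directly as a Kind.Arith token, and the spaces return
    # None so the caller loops.
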
    def _ReadWord(self, word_mode):
        # type: (lex_mode_t) -> Optional[word_t]
        """Helper function for ReadWord()."""

        # Change the pseudo lexer mode to a real lexer mode
        if word_mode == lex_mode_e.ShCommandFakeBrack:
            lex_mode = lex_mode_e.ShCommand
        else:
            lex_mode = word_mode

        self._GetToken()

        if self.token_kind == Kind.Eof:
            # No advance
            return self.cur_token

        # Allow Arith for ) at end of for loop?
        elif self.token_kind in (Kind.Op, Kind.Redir, Kind.Arith):
            self._SetNext(lex_mode)

            # Newlines are complicated.  See the 3x2 matrix in the comment
            # about self.multiline and self.newline_state above.
            if self.token_type == Id.Op_Newline:
                if self.multiline:
                    if self.newline_state > 1:
                        # This points at a blank line, but at least it gives
                        # the line number
                        p_die('Invalid blank line in multiline mode',
                              self.cur_token)
                    return None

                if self.returned_newline:  # skip
                    return None

            return self.cur_token

        elif self.token_kind == Kind.Right:
            if self.token_type not in (Id.Right_Subshell, Id.Right_ShFunction,
                                       Id.Right_CasePat, Id.Right_Initializer):
                raise AssertionError(self.cur_token)

            self._SetNext(lex_mode)
            return self.cur_token

        elif self.token_kind in (Kind.Ignored, Kind.WS):
            self._SetNext(lex_mode)
            return None

        else:
            assert self.token_kind in (Kind.VSub, Kind.Lit, Kind.History,
                                       Kind.Left, Kind.KW, Kind.ControlFlow,
                                       Kind.BoolUnary, Kind.BoolBinary,
                                       Kind.ExtGlob,
                                       Kind.BashRegex), 'Unhandled token kind'

            if (word_mode == lex_mode_e.ShCommandFakeBrack and
                    self.parse_opts.parse_bracket() and
                    self.token_type == Id.Lit_LBracket):
                # Change [ from Kind.Lit -> Kind.Op, so the CommandParser can
                # treat
                #   assert [42 === x]
                # like
                #   json write (x)
                bracket_word = self.cur_token
                bracket_word.id = Id.Op_LBracket

                self._SetNext(lex_mode)
                return bracket_word

            # We're beginning a word.  If we see Id.Lit_Pound, change to
            # lex_mode_e.Comment and read until end of line.
            if self.token_type == Id.Lit_Pound:
                self._SetNext(lex_mode_e.Comment)
                self._GetToken()

                # NOTE: The # could be the last character in the file.  It
                # can't be Eof_{RParen,Backtick} because #) and #` are
                # comments.
                assert self.token_type in (Id.Ignored_Comment, Id.Eof_Real), \
                    self.cur_token

                # The next iteration will go into Kind.Ignored and set lex
                # state to lex_mode_e.ShCommand/etc.
                return None  # tell ReadWord() to try again after comment

            elif self.token_type == Id.Lit_TPound:  ### doc comment
                self._SetNext(lex_mode_e.Comment)
                self._GetToken()

                if self.token_type == Id.Ignored_Comment and self.emit_doc_token:
                    return self.cur_token

                return None  # tell ReadWord() to try again after comment

            else:
                # r'' u'' b''
                if (self.token_type == Id.Lit_Chars and
                        self.lexer.LookAheadOne(
                            lex_mode_e.ShCommand) == Id.Left_SingleQuote):

                    # When shopt -s parse_raw_string:
                    #   echo r'hi' is like echo 'hi'
                    #
                    #   echo u'\u{3bc}' b'\yff' works

                    tok = self.cur_token
                    if self.parse_opts.parse_ysh_string():
                        if lexer.TokenEquals(tok, 'r'):
                            left_id = Id.Left_RSingleQuote
                        elif lexer.TokenEquals(tok, 'u'):
                            left_id = Id.Left_USingleQuote
                        elif lexer.TokenEquals(tok, 'b'):
                            left_id = Id.Left_BSingleQuote
                        else:
                            left_id = Id.Undefined_Tok

                        if left_id != Id.Undefined_Tok:
                            # skip the r, and then 'foo' will be read as normal
                            self._SetNext(lex_mode_e.ShCommand)

                            self._GetToken()
                            assert self.token_type == Id.Left_SingleQuote, self.token_type

                            # Read the word in a different lexer mode
                            return self._ReadYshSingleQuoted(left_id)

                return self._ReadCompoundWord(lex_mode)

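    # Protocol note: returning None means "no word yet, call again".  That
    # covers whitespace, skipped newlines, and comments.  E.g. for
    # 'echo hi  # bye', the '#' switches to lex_mode_e.Comment, the comment
    # text comes back as Kind.Ignored on the next call, and only then does a
    # real token get returned.
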
    def ParseVarRef(self):
        # type: () -> BracedVarSub
        """DYNAMIC parsing of what's inside ${!ref}

        # Same as VarOf production
        VarRefExpr = VarOf EOF
        """
        self._SetNext(lex_mode_e.VSub_1)

        self._GetToken()
        if self.token_kind != Kind.VSub:
            p_die('Expected var name', self.cur_token)

        part = self._ParseVarOf()
        # NOTE: no ${ } means no part.left and part.right
        part.left = part.name_tok  # cheat to make test pass
        part.right = part.name_tok

        self._GetToken()
        if self.token_type != Id.Eof_Real:
            p_die('Expected end of var ref expression', self.cur_token)
        return part

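    # Illustrative: for ref='a[0]', evaluating ${!ref} re-parses the *value*
    # 'a[0]' through this entry point at runtime; the left/right "cheat" above
    # exists because there are no real ${ } tokens to point at.
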
    def LookPastSpace(self):
        # type: () -> Id_t
        """Look ahead to the next token.

        For the CommandParser to recognize
           array= (1 2 3)
           YSH for ( versus bash for ((
           YSH if ( versus if test
           YSH while ( versus while test
           YSH bare assignment 'grep =' versus 'grep foo'
        """
        assert self.token_type != Id.Undefined_Tok
        if self.cur_token.id == Id.WS_Space:
            id_ = self.lexer.LookPastSpace(lex_mode_e.ShCommand)
        else:
            id_ = self.cur_token.id
        return id_

    def LookAheadFuncParens(self):
        # type: () -> bool
        """Special lookahead for f( ) { echo hi; } to check for ( )"""
        assert self.token_type != Id.Undefined_Tok

        # We have to handle 2 cases because we buffer a token
        if self.cur_token.id == Id.Op_LParen:  # saw funcname(
            return self.lexer.LookAheadFuncParens(1)  # go back one char

        elif self.cur_token.id == Id.WS_Space:  # saw funcname WHITESPACE
            return self.lexer.LookAheadFuncParens(0)

        else:
            return False

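    # Covers both 'f( ) { echo hi; }' and 'f ( ) { echo hi; }': the branches
    # above distinguish whether '(' or a space was the buffered token after
    # the function name.
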
    def ReadWord(self, word_mode):
        # type: (lex_mode_t) -> word_t
        """Read the next word, using the given lexer mode.

        This is a stateful wrapper for the stateless _ReadWord function.
        """
        assert word_mode in (lex_mode_e.ShCommand,
                             lex_mode_e.ShCommandFakeBrack,
                             lex_mode_e.DBracket, lex_mode_e.BashRegex)

        if self.buffered_word:  # For integration with pgen2
            w = self.buffered_word
            self.buffered_word = None
        else:
            while True:
                w = self._ReadWord(word_mode)
                if w is not None:
                    break

        self.returned_newline = (word_.CommandId(w) == Id.Op_Newline)
        return w

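    # Typical driver loop (hypothetical sketch; 'w_parser' is an assumed name):
    #
    #   while True:
    #       w = w_parser.ReadWord(lex_mode_e.ShCommand)
    #       if word_.CommandId(w) == Id.Eof_Real:
    #           break
    #       ...  # feed w to the CommandParser
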
    def ReadArithWord(self):
        # type: () -> word_t
        while True:
            w = self._ReadArithWord()
            if w is not None:
                break
        return w

    def ReadHereDocBody(self, parts):
        # type: (List[word_part_t]) -> None
        """
        A here doc is like a double quoted context, except " isn't special.
        """
        self._ReadLikeDQ(None, False, parts)
        # Returns nothing

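    # Illustrative here doc: the " stays literal, but $x still expands.
    #
    #   cat <<EOF
    #   say "$x"
    #   EOF
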
    def ReadForPlugin(self):
        # type: () -> CompoundWord
        """For $PS1, $PS4, etc.

        This is just like reading a here doc line.  "\n" is allowed, as
        well as the typical substitutions ${x} $(echo hi) $((1 + 2)).
        """
        w = CompoundWord([])
        self._ReadLikeDQ(None, False, w.parts)
        return w

    def EmitDocToken(self, b):
        # type: (bool) -> None
        self.emit_doc_token = b

    def Multiline(self, b):
        # type: (bool) -> None
        self.multiline = b


if 0:
    import collections
    WORD_HIST = collections.Counter()

# vim: sw=4