frontend/lexer

OILS / frontend / lexer_def.py View on Github | oilshell.org

1139 lines, 579 significant

1	"""
2	lexer_def.py - Lexing for OSH, YSH, and J8 Notation.
3
4	The OSH/YSH lexer has lexer modes, each with a regex -> Id mapping.
5
6	After changing this file, run:
7
8	build/py.sh all
9
10	or at least:
11
12	build/py.sh fastlex
13
14	Input Handling
15	--------------
16
17	Every line is NUL terminated:
18
19	'one\n\0' 'last line\0'
20
21	which means that no regexes below should match \0.
22
23	For example, use [^'\0]+ instead of [^']+ .
24
25	If this rule isn't followed, we would read uninitialized memory past the
26	sentinel. Python's regex engine knows where the end of the input string is, so
27	it doesn't need a sentinel like \0.
28
29	The frontend/lexer_gen.py generator adds a pattern mapping \0 to Id.Eol_Tok.
30	"""
31
32	from _devbuild.gen.id_kind_asdl import Id, Id_t, Kind
33	from _devbuild.gen.types_asdl import lex_mode_e
34
35	from frontend import id_kind_def
36
37	from typing import Tuple
38
39	# Initialize spec that the lexer depends on.
40	ID_SPEC = id_kind_def.IdSpec({}, {})
41
42	id_kind_def.AddKinds(ID_SPEC)
43	id_kind_def.AddBoolKinds(ID_SPEC) # must come second
44	id_kind_def.SetupTestBuiltin(ID_SPEC, {}, {}, {})
45
46
def C(pat, tok_type):
    # type: (str, Id_t) -> Tuple[bool, str, Id_t]
    """Lexer rule with a constant string, e.g. C('$*', VSub_Star).

    Returns an (is_regex, pattern, token type) triple.  The first element is
    False, which distinguishes constant rules from the regex rules built by
    R().
    """
    return (False, pat, tok_type)
51
52
def R(pat, tok_type):
    # type: (str, Id_t) -> Tuple[bool, str, Id_t]
    r"""Lexer rule with a regex string, e.g. R('\$[0-9]', VSub_Number).

    Returns an (is_regex, pattern, token type) triple.  The first element is
    True, which distinguishes regex rules from the constant rules built by
    C().
    """
    return (True, pat, tok_type)
57
58
# See unit tests in frontend/match_test.py.
# We need the [^\0]* because the re2c translation assumes it's anchored like $.
# Shape: '#!', any run of non-NUL bytes, 'sh', one whitespace char, then the
# rest of the input (again excluding NUL).
SHOULD_HIJACK_RE = r'#![^\0]*sh[ \t\r\n][^\0]*'
62
63	# Separates words (\r is not whitespace here)
64	_SIGNIFICANT_SPACE = R(r'[ \t]+', Id.WS_Space)
65
66	_BACKSLASH = [
67	# To be conservative, we could deny a set of chars similar to
68	# _LITERAL_WHITELIST_REGEX, rather than allowing all the operator characters
69	# like \( and \;.
70	#
71	# strict_backslash makes this stricter.
72	R(r'\\[^\n\0]', Id.Lit_EscapedChar),
73	C('\\\n', Id.Ignored_LineCont),
74	]
75
76	# Only 4 characters are backslash escaped inside "".
77	# https://www.gnu.org/software/bash/manual/bash.html#Double-Quotes
78	_DQ_BACKSLASH = [
79	R(r'\\[$`"\\]', Id.Lit_EscapedChar),
80	C('\\', Id.Lit_BadBackslash), # syntax error in YSH, but NOT in OSH
81	]
82
83	VAR_NAME_RE = r'[a-zA-Z_][a-zA-Z0-9_]*'
84
85	# All Kind.VSub
86	_VARS = [
87	# Unbraced variables
88	R(r'\$' + VAR_NAME_RE, Id.VSub_DollarName),
89	R(r'\$[0-9]', Id.VSub_Number),
90	C(r'$!', Id.VSub_Bang),
91	C(r'$@', Id.VSub_At),
92	C(r'$#', Id.VSub_Pound),
93	C(r'$$', Id.VSub_Dollar),
94	C(r'$*', Id.VSub_Star),
95	C(r'$-', Id.VSub_Hyphen),
96	C(r'$?', Id.VSub_QMark),
97	]
98
99	# Kind.Left that are valid in double-quoted modes.
100
_LEFT_SUBS = [
    C('`', Id.Left_Backtick),
    C('$(', Id.Left_DollarParen),
    C('${', Id.Left_DollarBrace),
    # Parse zsh syntax, but don't execute it.
    # The examples we've seen so far are like ${(%):-} and ${(m)
    # The pattern is: literal '${', then a parenthesized group of one or more
    # chars that are neither ')' nor NUL.
    R(r'\$\{\([^)\0]+\)', Id.Left_DollarBraceZsh),
    C('$((', Id.Left_DollarDParen),
    C('$[', Id.Left_DollarBracket),
]
111
112	# Additional Kind.Left that are valid in unquoted modes.
113	_LEFT_UNQUOTED = [
114	C('"', Id.Left_DoubleQuote),
115	C("'", Id.Left_SingleQuote),
116	C('$"', Id.Left_DollarDoubleQuote),
117	C("$'", Id.Left_DollarSingleQuote),
118	]
119
120	_LEFT_PROCSUB = [
121	C('<(', Id.Left_ProcSubIn),
122	C('>(', Id.Left_ProcSubOut),
123	]
124
125	# The regexes below are in Python syntax, but are translated to re2c syntax by
126	# frontend/lexer_gen.py.
127	#
128	# http://re2c.org/manual/syntax/syntax.html
129	# https://docs.python.org/2/library/re.html
130	#
131	# We use a limited set of constructs:
132	# - + and * for repetition
133	# - Character classes [] with simple ranges and negation
134	# - Escapes like \n \0
135
136	LEXER_DEF = {} # TODO: Should be a list so we enforce order.
137
138	# Anything until the end of the line is a comment. Does not match the newline
139	# itself. We want to switch modes and possibly process Op_Newline for here
140	# docs, etc.
141	LEXER_DEF[lex_mode_e.Comment] = [R(r'[^\n\0]*', Id.Ignored_Comment)]
142
143	# A whitelist to make bigger Lit_Chars tokens. We don't want one byte at a time.
144	#
145	# The shell language says that "any other byte" is a literal character --
146	# for example, unquoted $ \ ! are literal, not a syntax error.
147	#
148	# That is, a literal is defined NEGATIVELY, for a single character. But here
149	# we define a SUBSET of literal chars POSITIVELY.
150
151	# The range \x80-\xff makes sure that UTF-8 sequences are a single token.
152	_LITERAL_WHITELIST_REGEX = r'[\x80-\xffa-zA-Z0-9_.\-]+'
153
# Rules shared by the unquoted modes: both lex_mode_e.ShCommand and
# lex_mode_e.DBracket append this list.  The final catch-all rule must stay
# last.
_UNQUOTED = _BACKSLASH + _LEFT_SUBS + _LEFT_UNQUOTED + _LEFT_PROCSUB + _VARS + [
    # NOTE: We could add anything 128 and above to this character class?  So
    # utf-8 characters don't get split?
    R(_LITERAL_WHITELIST_REGEX, Id.Lit_Chars),
    C('~', Id.Lit_Tilde),  # for tilde sub
    C('/', Id.Lit_Slash),  # also for tilde sub
    C(':', Id.Lit_Colon),  # for special PATH=a:~foo tilde detection
    C('$', Id.Lit_Dollar),  # shopt -u parse_dollar
    C('#', Id.Lit_Pound),  # For comments
    _SIGNIFICANT_SPACE,
    C('\n', Id.Op_Newline),
    C('&', Id.Op_Amp),
    # These are constant (non-regex) rules, so the pipe must NOT be
    # backslash-escaped.
    C('|', Id.Op_Pipe),
    C('|&', Id.Op_PipeAmp),
    C('&&', Id.Op_DAmp),
    C('||', Id.Op_DPipe),
    C(';', Id.Op_Semi),
    # Case terminators
    C(';;', Id.Op_DSemi),
    C(';&', Id.Op_SemiAmp),
    C(';;&', Id.Op_DSemiAmp),
    C('(', Id.Op_LParen),
    C(')', Id.Op_RParen),
    R(r'[^\0]', Id.Lit_Other),  # any other single char is a literal
]
179
180	# In ShCommand and DBracket states.
181	_EXTGLOB_BEGIN = [
182	C(',(', Id.ExtGlob_Comma), # YSH synonym for @(...)
183	C('@(', Id.ExtGlob_At),
184	C('*(', Id.ExtGlob_Star),
185	C('+(', Id.ExtGlob_Plus),
186	C('?(', Id.ExtGlob_QMark),
187	C('!(', Id.ExtGlob_Bang),
188	]
189
190	KEYWORDS = [
191	# NOTE: { is matched elsewhere
192	C('[[', Id.KW_DLeftBracket),
193	C('!', Id.KW_Bang),
194	C('for', Id.KW_For),
195	C('while', Id.KW_While),
196	C('until', Id.KW_Until),
197	C('do', Id.KW_Do),
198	C('done', Id.KW_Done),
199	C('in', Id.KW_In),
200	C('case', Id.KW_Case),
201	C('esac', Id.KW_Esac),
202	C('if', Id.KW_If),
203	C('fi', Id.KW_Fi),
204	C('then', Id.KW_Then),
205	C('else', Id.KW_Else),
206	C('elif', Id.KW_Elif),
207	C('function', Id.KW_Function),
208	C('time', Id.KW_Time),
209
210	# YSH
211	C('const', Id.KW_Const), # maybe remove this
212	C('var', Id.KW_Var),
213	C('setvar', Id.KW_SetVar),
214	C('setglobal', Id.KW_SetGlobal),
215	C('call', Id.KW_Call),
216	C('proc', Id.KW_Proc),
217	C('typed', Id.KW_Typed),
218	C('func', Id.KW_Func),
219	]
220
221	# These are treated like builtins in bash, but keywords in OSH. However, we
222	# maintain compatibility with bash for the 'type' builtin.
223	CONTROL_FLOW = [
224	C('break', Id.ControlFlow_Break),
225	C('continue', Id.ControlFlow_Continue),
226	C('return', Id.ControlFlow_Return),
227	C('exit', Id.ControlFlow_Exit),
228	]
229
230	# Used by ysh/grammar_gen.py too
231	EXPR_WORDS = [
232	C('null', Id.Expr_Null),
233	C('true', Id.Expr_True),
234	C('false', Id.Expr_False),
235	C('and', Id.Expr_And),
236	C('or', Id.Expr_Or),
237	C('not', Id.Expr_Not),
238	C('for', Id.Expr_For),
239	C('is', Id.Expr_Is),
240	C('in', Id.Expr_In),
241	C('if', Id.Expr_If),
242	C('else', Id.Expr_Else),
243
244	# Unused: could be for function literals, although we also have
245	# |x| x+1 lambdas
246	C('func', Id.Expr_Func),
247
248	# / <capture d+/
249	C('capture', Id.Expr_Capture),
250	# / <capture d+ as date> /
251	C('as', Id.Expr_As),
252	]
253
254	FD_VAR_NAME = r'\{' + VAR_NAME_RE + r'\}'
255
256	# file descriptors can only have two digits, like mksh
257	# dash/zsh/etc. can have one
258	FD_NUM = r'[0-9]?[0-9]?'
259
# These rules must be recognized in the ShCommand state, but can't be nested
# within [[.
# Keywords have to be checked before _UNQUOTED so we get <KW_If "if"> instead
# of <Lit_Chars "if">.
LEXER_DEF[lex_mode_e.ShCommand] = [
    # These four are not allowed within [[, so they are in ShCommand but not
    # _UNQUOTED.

    # e.g. beginning of NAME=val, which will always be longer than
    # _LITERAL_WHITELIST_REGEX.
    R(VAR_NAME_RE + r'\+?=', Id.Lit_VarLike),
    R(VAR_NAME_RE + r'\[', Id.Lit_ArrayLhsOpen),
    R(r'\]\+?=', Id.Lit_ArrayLhsClose),
    C('((', Id.Op_DLeftParen),

    # For static globbing, and [] for array literals
    C('[', Id.Lit_LBracket),  # e.g. A=(['x']=1)
    C(']', Id.Lit_RBracket),  # e.g. *.[ch]
    # NOTE: Glob_Star and Glob_QMark are for dynamic parsing
    C('*', Id.Lit_Star),
    C('?', Id.Lit_QMark),
    C('###', Id.Lit_TPound),  # like Lit_Pound, for doc comments
    C('...', Id.Lit_TDot),  # ... for multiline commands

    # For brace expansion {a,b}
    C('{', Id.Lit_LBrace),
    C('}', Id.Lit_RBrace),  # Also for var sub ${a}
    C(',', Id.Lit_Comma),
    C('=', Id.Lit_Equals),  # for = f(x) and x = 1+2*3
    C('@', Id.Lit_At),  # for detecting @[, @' etc. shopt -s parse_at_all

    # @array and @func(1, c)
    R('@' + VAR_NAME_RE, Id.Lit_Splice),  # for YSH splicing
    C('@[', Id.Lit_AtLBracket),  # @[split(x)]
    C('@{.', Id.Lit_AtLBraceDot),  # for split builtin sub @{.myproc arg1}

    # Redirects with an optional numeric descriptor prefix (FD_NUM)
    R(FD_NUM + r'<', Id.Redir_Less),
    R(FD_NUM + r'>', Id.Redir_Great),
    R(FD_NUM + r'<<', Id.Redir_DLess),
    R(FD_NUM + r'<<<', Id.Redir_TLess),
    R(FD_NUM + r'>>', Id.Redir_DGreat),
    R(FD_NUM + r'<<-', Id.Redir_DLessDash),
    R(FD_NUM + r'>&', Id.Redir_GreatAnd),
    R(FD_NUM + r'<&', Id.Redir_LessAnd),
    R(FD_NUM + r'<>', Id.Redir_LessGreat),
    # This is a regex rule, so the pipe is escaped exactly once: '>' followed
    # by a literal '|'.
    R(FD_NUM + r'>\|', Id.Redir_Clobber),

    # Redirects with a {varname} descriptor prefix (FD_VAR_NAME)
    R(FD_VAR_NAME + r'<', Id.Redir_Less),
    R(FD_VAR_NAME + r'>', Id.Redir_Great),
    R(FD_VAR_NAME + r'<<', Id.Redir_DLess),
    R(FD_VAR_NAME + r'<<<', Id.Redir_TLess),
    R(FD_VAR_NAME + r'>>', Id.Redir_DGreat),
    R(FD_VAR_NAME + r'<<-', Id.Redir_DLessDash),
    R(FD_VAR_NAME + r'>&', Id.Redir_GreatAnd),
    R(FD_VAR_NAME + r'<&', Id.Redir_LessAnd),
    R(FD_VAR_NAME + r'<>', Id.Redir_LessGreat),
    R(FD_VAR_NAME + r'>\|', Id.Redir_Clobber),

    # No leading descriptor (2 is implied)
    C(r'&>', Id.Redir_AndGreat),
    C(r'&>>', Id.Redir_AndDGreat),
] + KEYWORDS + CONTROL_FLOW + _UNQUOTED + _EXTGLOB_BEGIN
320
321	# Preprocessing before ShCommand
322	LEXER_DEF[lex_mode_e.Backtick] = [
323	C(r'`', Id.Backtick_Right),
324	# A backslash, and then $ or ` or \
325	R(r'\\[$`\\]', Id.Backtick_Quoted),
326	# \" treated specially, depending on whether backticks are double-quoted!
327	R(r'\\"', Id.Backtick_DoubleQuote),
328	R(r'[^`\\\0]+', Id.Backtick_Other), # contiguous run of literals
329	R(r'[^\0]', Id.Backtick_Other), # anything else
330	]
331
332	# DBRACKET: can be like ShCommand, except:
333	# - Don't really need redirects either... Redir_Less could be Op_Less
334	# - Id.Op_DLeftParen can't be nested inside.
335	LEXER_DEF[lex_mode_e.DBracket] = [
336	C(']]', Id.Lit_DRightBracket),
337	# Must be KW and not Op, because we can have stuff like [[ $foo == !* ]]
338	# in addition to [[ ! a && b ]]
339	C('!', Id.KW_Bang),
340	C('<', Id.Op_Less),
341	C('>', Id.Op_Great),
342	] + ID_SPEC.LexerPairs(Kind.BoolUnary) + \
343	ID_SPEC.LexerPairs(Kind.BoolBinary) + \
344	_UNQUOTED + _EXTGLOB_BEGIN
345
# Inside an extended glob, most characters are literals, including spaces and
# punctuation.  We also accept \, $var, ${var}, "", etc.  They can also be
# nested, so _EXTGLOB_BEGIN appears here.
#
# Example: echo @(<> <>|&&|'foo'|$bar)
LEXER_DEF[lex_mode_e.ExtGlob] = \
    _BACKSLASH + _LEFT_SUBS + _LEFT_UNQUOTED + _VARS + _EXTGLOB_BEGIN + [
        # A run of chars that can't start any of the other rules in this mode
        R(r'[^\\$`"\'|)@*+!?\0]+', Id.Lit_Chars),
        # Constant rule: the pipe is not backslash-escaped
        C('|', Id.Op_Pipe),
        C(')', Id.Op_RParen),  # may be translated to Id.ExtGlob_RParen
        R(r'[^\0]', Id.Lit_Other),  # everything else is literal
    ]
358
359	# Notes on BASH_REGEX states
360	#
361	# From bash manual:
362	#
363	# - Any part of the pattern may be quoted to force the quoted portion to be
364	# matched as a string.
365	# - Bracket expressions in regular expressions must be treated carefully, since
366	# normal quoting characters lose their meanings between brackets.
367	# - If the pattern is stored in a shell variable, quoting the variable
368	# expansion forces the entire pattern to be matched as a string.
369	#
370	# Is there a re.escape function? It's just like EscapeGlob and UnescapeGlob.
371	#
372	# TODO: For testing, write a script to extract and save regexes... and compile
373	# them with regcomp. I've only seen constant regexes.
374	#
375	# bash code: ( | ) are special
376
377	LEXER_DEF[lex_mode_e.BashRegex] = _LEFT_SUBS + _LEFT_UNQUOTED + _VARS + [
378	# Like lex_mode_e.ShCommand
379	R(_LITERAL_WHITELIST_REGEX, Id.Lit_Chars),
380
381	# Tokens for Tilde sub. bash weirdness: RHS of [[ x =~ ~ ]] is expanded
382	C('~', Id.Lit_Tilde),
383	C('/', Id.Lit_Slash),
384
385	# Id.WS_Space delimits words. In lex_mode_e.BashRegexFakeInner, we
386	# translate them to Id.Lit_Chars.
387	_SIGNIFICANT_SPACE,
388
389	# Analogous to Id.ExtGlob_* - we need to change lexer modes when we hit this
390	C('(', Id.BashRegex_LParen),
391
392	# Not special, this is like lex_mode_e.Outer
393	C(')', Id.Op_RParen),
394
395	# Copied and adapted from _UNQUOTED
396	# \n & ; < > are parse errors OUTSIDE a group [[ s =~ ; ]]
397	# but become allowed INSIDE a group [[ s =~ (;) ]]
398	C('\n', Id.BashRegex_AllowedInParens),
399	C('&', Id.BashRegex_AllowedInParens),
400	C(';', Id.BashRegex_AllowedInParens),
401	C('>', Id.BashRegex_AllowedInParens),
402	C('<', Id.BashRegex_AllowedInParens),
403
404	# e.g. | is Id.Lit_Other, not pipe operator
405	R(r'[^\0]', Id.Lit_Other), # like _UNQUOTED, any other byte is literal
406	] + _BACKSLASH # These have to come after RegexMeta
407
408	LEXER_DEF[lex_mode_e.DQ] = _DQ_BACKSLASH + [
409	C('\\\n', Id.Ignored_LineCont),
410	] + _LEFT_SUBS + _VARS + [
411	R(r'[^$`"\0\\]+', Id.Lit_Chars), # matches a line at most
412	C('$', Id.Lit_Dollar), # completion of var names relies on this
413	# NOTE: When parsing here doc line, this token doesn't end it.
414	C('"', Id.Right_DoubleQuote),
415	]
416
417	_VS_ARG_COMMON = [
418	C('/', Id.Lit_Slash), # for patsub (not Id.VOp2_Slash)
419	C('#', Id.Lit_Pound), # for patsub prefix (not Id.VOp1_Pound)
420	C('%', Id.Lit_Percent), # for patsub suffix (not Id.VOp1_Percent)
421	C('}', Id.Right_DollarBrace), # For var sub "${a}"
422	C('$', Id.Lit_Dollar), # completion of var names relies on this
423	]
424
425	# We don't execute zsh var subs, but to find the closing } properly, we need to
426	# recognize \} and '}' and "}" $'}' etc.
427	LEXER_DEF[lex_mode_e.VSub_Zsh] = \
428	_BACKSLASH + _LEFT_SUBS + _LEFT_UNQUOTED + _LEFT_PROCSUB + \
429	[
430	C('}', Id.Right_DollarBrace), # For var sub "${a}"
431	R(r'[^\0]', Id.Lit_Other), # e.g. "$", must be last
432	]
433
434	# Kind.{Lit,Ignored,VSub,Left,Right,Eof}
435	LEXER_DEF[lex_mode_e.VSub_ArgUnquoted] = \
436	_BACKSLASH + _VS_ARG_COMMON + _LEFT_SUBS + _LEFT_UNQUOTED + _LEFT_PROCSUB + \
437	_VARS + _EXTGLOB_BEGIN + [
438
439	# Token for Tilde sub
440	C('~', Id.Lit_Tilde),
441
442	# - doesn't match ~ for tilde sub
443	# - doesn't match < and > so it doesn't eat <()
444	# - doesn't match @ ! ? + * so it doesn't eat _EXTGLOB_BEGIN -- ( alone is
445	# not enough
446	R(r'[^$`~/}"\'\0\\#%<>@!?+*]+', Id.Lit_Chars),
447	R(r'[^\0]', Id.Lit_Other), # e.g. "$", must be last
448	]
449
450	# Kind.{Lit,Ignored,VSub,Left,Right,Eof}
451	LEXER_DEF[lex_mode_e.VSub_ArgDQ] = \
452	_DQ_BACKSLASH + _VS_ARG_COMMON + _LEFT_SUBS + _VARS + [
453
454	C(r'\}', Id.Lit_EscapedChar), # For "${var-\}}"
455
456	R(r'[^$`/}"\0\\#%]+', Id.Lit_Chars), # matches a line at most
457
458	# Weird wart: even in double quoted state, double quotes are allowed
459	C('"', Id.Left_DoubleQuote),
460
461	# Another weird wart of bash/mksh: $'' is recognized but NOT ''!
462	C("$'", Id.Left_DollarSingleQuote),
463	]
464
465	# NOTE: Id.Ignored_LineCont is NOT supported in SQ state, as opposed to DQ
466	# state.
467	LEXER_DEF[lex_mode_e.SQ_Raw] = [
468	R(r"[^'\0]+", Id.Lit_Chars), # matches a line at most
469	C("'", Id.Right_SingleQuote),
470	]
471
472	# The main purpose for EXPR_CHARS is in regex literals, e.g. [a-z \t \n].
473	#
474	# In YSH expressions, Chars are code point integers, so \u{1234} is the same as
475	# 0x1234. And \0 is 0x0.
476
477	# In Python:
478	# chr(0x00012345) == u'\U00012345'
479	#
480	# In YSH:
481	# 0x00012345 == \u{12345}
482	# chr(0x00012345) == chr(\u{12345}) == $'\u{012345}'
483
484	_U_BRACED_CHAR = R(r'\\[uU]\{[0-9a-fA-F]{1,6}\}', Id.Char_UBraced)
485
486	_X_CHAR_LOOSE = R(r'\\x[0-9a-fA-F]{1,2}', Id.Char_Hex) # bash
487	_CHAR_YHEX = R(r'\\y[0-9a-fA-F]{2}', Id.Char_YHex) # \yff - J8 only
488
489	_U4_CHAR_LOOSE = R(r'\\u[0-9a-fA-F]{1,4}', Id.Char_Unicode4) # bash
490
491	_U4_CHAR_STRICT = R(r'\\u[0-9a-fA-F]{4}', Id.Char_Unicode4) # JSON-only
492
493	#_JSON_ONE_CHAR = R(r'\\[\\"/bfnrt]', Id.Char_OneChar)
494	EXPR_CHARS = [
495	# Allow same backslash escapes as J8 strings, except;
496	# - legacy \b \f
497	# - unnecessary \/
498	#
499	# Note that \0 should be written \y00.
500	R(r'''\\[\\"'nrt]''', Id.Char_OneChar),
501	_CHAR_YHEX,
502
503	# Eggex. This is a LITERAL translation to \xff in ERE? So it's not \yff
504	# It doesn't have semantics; it's just syntax.
505	R(r'\\x[0-9a-fA-F]{2}', Id.Char_Hex),
506	_U_BRACED_CHAR,
507	]
508
509	# Shared between echo -e and $''.
510	_C_STRING_COMMON = [
511
512	# \x6 is valid in bash
513	_X_CHAR_LOOSE,
514	_U4_CHAR_LOOSE,
515	R(r'\\U[0-9a-fA-F]{1,8}', Id.Char_Unicode8),
516	R(r'\\[0abeEfrtnv\\]', Id.Char_OneChar),
517
518	# e.g. \A is not an escape, and \x doesn't match a hex escape. We allow it,
519	# but a lint tool could warn about it.
520	C('\\', Id.Unknown_Backslash),
521	]
522
523	ECHO_E_DEF = _C_STRING_COMMON + [
524	# Note: tokens above \0377 can either be truncated or be flagged a syntax
525	# error in strict mode.
526	R(r'\\0[0-7]{1,3}', Id.Char_Octal4),
527	C(r'\c', Id.Char_Stop),
528
529	# e.g. 'foo', anything that's not a backslash escape
530	R(r'[^\\\0]+', Id.Lit_Chars),
531	]
532
533	# https://json.org/
534
# Note that [0-9] has to come second, because Python chooses the first match.
# The | below is regex alternation: either a multi-digit number without a
# leading zero, or a single digit.
_JSON_INT = r'-?([1-9][0-9]*|[0-9])'  # Numbers can't start with leading 0
_JSON_FRACTION = r'(\.[0-9]+)?'  # optional fractional part
_JSON_EXP = r'([eE][-+]?[0-9]+)?'  # optional exponent
539
540	# R5RS extended alphabetic characters
541	# https://groups.csail.mit.edu/mac/ftpdir/scheme-reports/r5rs-html/r5rs_4.html
542	#
543	# ! $ % & * + - . / : < = > ? @ ^ _ ~
544
545	# Description from Guile Scheme - https://www.gnu.org/software/guile/manual/html_node/Symbol-Read-Syntax.html
546	#
547	# "The read syntax for a symbol is a sequence of letters, digits, and extended
548	# alphabetic characters, beginning with a character that cannot begin a
549	# number. In addition, the special cases of +, -, and ... are read as symbols
550	# even though numbers can begin with +, - or ."
551	#
552	# (They should have used regular languages!)
553
554	# We take out $ and @ for our splicing syntax, i.e. $unquote and
555	# @unquote-splicing. And : for now because we use it for name:value.
556
557	# Also note Scheme allows |a b| for symbols with funny chars, and Guile scheme
558	# allows #{a b}#. We could use `a b` or (symbol "a b").
559
560	J8_SYMBOL_CHARS = r'!%&*+./<=>?^_~-' # - is last for regex char class
561
562	# yapf: disable
563	J8_SYMBOL_RE = (
564	r'[a-zA-Z' + J8_SYMBOL_CHARS + ']' +
565	r'[a-zA-Z0-9' + J8_SYMBOL_CHARS + ']*')
566	# yapf: enable
567
568	_J8_LEFT = [
569	C('"', Id.Left_DoubleQuote), # JSON string
570	C('j"', Id.Left_JDoubleQuote), # JSON string with explicit J8 prefix
571	# Three left quotes that are J8 only
572	C("u'", Id.Left_USingleQuote), # unicode string
573	C("'", Id.Left_USingleQuote), # '' is alias for u'' in data, not in code
574	C("b'", Id.Left_BSingleQuote), # byte string
575	]
576
577	J8_DEF = _J8_LEFT + [
578	C('[', Id.J8_LBracket),
579	C(']', Id.J8_RBracket),
580	C('{', Id.J8_LBrace),
581	C('}', Id.J8_RBrace),
582	C('(', Id.J8_LParen), # NIL8 only
583	C(')', Id.J8_RParen), # NIL8 only
584	C(',', Id.J8_Comma),
585	C(':', Id.J8_Colon),
586	C('null', Id.J8_Null),
587	C('true', Id.J8_Bool),
588	C('false', Id.J8_Bool),
589	R(_JSON_INT, Id.J8_Int),
590	R(_JSON_INT + _JSON_FRACTION + _JSON_EXP, Id.J8_Float),
591
592	# Identifier names come AFTER null true false.
593	# - Happens to be the same as shell identifier # names.
594	# - Note that JS allows $ as an identifier, but we don't.
595	# - Used for dict keys / NIL8 field names.
596	R(VAR_NAME_RE, Id.J8_Identifier),
597
598	# Symbol is a SUPERSET of Identifier. The first word in NIL8 can be
599	# either Symbol or plain Identifier, but field names can only be
600	# Identifier. JSON8 only has Identifier.
601	#R(J8_SYMBOL_RE, Id.J8_Symbol), # NIL8 only
602	R(r'[~!@$%^&*+=\|;./<>?-]+', Id.J8_Operator), # NIL8 only
603	R(r'[ \r\t]+', Id.Ignored_Space),
604	# A separate token, to count lines for error messages
605	C('\n', Id.Ignored_Newline),
606	# comment is # until end of line
607	# // comments are JavaScript style, but right now we might want them as
608	# symbols?
609	R(r'#[^\n\0]*', Id.Ignored_Comment), # J8 only (JSON8, NIL8)
610
611	# This will reject ASCII control chars
612	R(r'[^\0]', Id.Unknown_Tok),
613	]
614
615	# Exclude control characters 0x00-0x1f, aka 0-31 in J8 data only (not YSH code)
616	_ASCII_CONTROL = R(r'[\x01-\x1F]', Id.Char_AsciiControl)
617
618	J8_LINES_DEF = _J8_LEFT + [
619	# not sure if we want \r here - same with lex_mode_e.Expr
620	R(r'[ \r\t]+', Id.WS_Space),
621	R(r'[\n]', Id.J8_Newline),
622
623	# doesn't match \t, which means tabs are allowed in the middle of unquoted
624	# lines
625	_ASCII_CONTROL,
626
627	# not space or ' or " or ASCII control or EOF
628	R(r'''[^ \t\r\n'"\x00-\x1F]+''', Id.Lit_Chars),
629	]
630
631	# https://json.org list of chars, plus '
632	_JSON_ONE_CHAR = R(r'\\[\\"/bfnrt]', Id.Char_OneChar)
633
634	# b'' u'' strings - what's common between code and data.
635	_J8_STR_COMMON = [
636	C("'", Id.Right_SingleQuote), # end for J8
637	_JSON_ONE_CHAR,
638	C("\\'", Id.Char_OneChar), # since ' ends, allow \'
639	_CHAR_YHEX,
640	_U_BRACED_CHAR, # \u{123456} - J8 only
641
642	# osh/word_parse.py relies on this. It has to be consistent with $''
643	# lexing, which uses _C_STRING_COMMON
644	C('\\', Id.Unknown_Backslash),
645	]
646
647	# Lexer for J8 strings in CODE.
648	LEXER_DEF[lex_mode_e.J8_Str] = _J8_STR_COMMON + [
649	# Don't produce Char_AsciiControl tokens - that's only for data
650
651	# will match invalid UTF-8 - we have a separate validation step
652	R(r"[^\\'\0]+", Id.Lit_Chars),
653	]
654
655	# Lexer for J8 string data.
656	# ASCII control characters are disallowed in DATA, but not CODE!
657	J8_STR_DEF = _J8_STR_COMMON + [
658	_ASCII_CONTROL,
659	# will match invalid UTF-8 - we have a separate validation step
660	R(r"[^\\'\x00-\x1F]+", Id.Lit_Chars),
661	]
662
663	# Lexer for JSON string data - e.g. "json \" \u1234"
664	JSON_STR_DEF = [
665	C('"', Id.Right_DoubleQuote), # end for JSON
666	_JSON_ONE_CHAR,
667	_U4_CHAR_STRICT, # \u1234 - JSON only
668
669	# High surrogate [\uD800, \uDC00)
670	# Low surrogate [\uDC00, \uE000)
671	# This pattern makes it easier to decode. Unpaired surrogates become Id.Char_Unicode4.
672	R(
673	r'\\u[dD][89aAbB][0-9a-fA-F][0-9a-fA-F]\\u[dD][cCdDeEfF][0-9a-fA-F][0-9a-fA-F]',
674	Id.Char_SurrogatePair),
675	C('\\', Id.Unknown_Backslash), # e.g. the \ before bad \z
676	_ASCII_CONTROL,
677
678	# Note: This will match INVALID UTF-8. UTF-8 validation is another step.
679	R(r'[^\\"\x00-\x1F]+', Id.Lit_Chars),
680	]
681
682	_WHITESPACE = r'[ \t\r\n]*' # ASCII whitespace doesn't have legacy \f \v
683
684	SH_NUMBER_DEF = [
685	R('-?0', Id.ShNumber_Dec), # not octal, may be negative
686	R(r'-?[1-9][0-9]*', Id.ShNumber_Dec), # may be negative
687	R(r'-?0[0-7]+', Id.ShNumber_Oct), # may be negative
688	# these may NOT be negative!
689	R(r'0x[0-9A-Fa-f]+', Id.ShNumber_Hex),
690	R(r'[1-9][0-9]*#[0-9a-zA-Z@_]+', Id.ShNumber_BaseN),
691	R(r'[^\0]', Id.Unknown_Tok), # any other char
692	]
693
694	OCTAL3_RE = r'\\[0-7]{1,3}'
695
696	# https://www.gnu.org/software/bash/manual/html_node/Controlling-the-PromptEvaluator.html#Controlling-the-PromptEvaluator
697	PS1_DEF = [
698	R(OCTAL3_RE, Id.PS_Octal3),
699	R(r'\\[adehHjlnrstT@AuvVwW!#$\\]', Id.PS_Subst),
700	# \D{%H:%M} strftime format
701	R(r'\\D\{[^}\0]*\}', Id.PS_Subst),
702	C(r'\[', Id.PS_LBrace), # non-printing
703	C(r'\]', Id.PS_RBrace),
704	R(r'[^\\\0]+', Id.PS_Literals),
705	# e.g. \x is not a valid escape.
706	C('\\', Id.PS_BadBackslash),
707	]
708
709	# NOTE: Id.Ignored_LineCont is also not supported here, even though the whole
710	# point of it is that supports other backslash escapes like \n! It just
711	# becomes a regular backslash.
712	LEXER_DEF[lex_mode_e.SQ_C] = _C_STRING_COMMON + [
713	# Weird special case matching bash: backslash that ends a line. We emit
714	# this token literally in OSH, but disable it in YSH.
715	C('\\\n', Id.Unknown_Backslash),
716
717	# Silly difference! In echo -e, the syntax is \0377, but here it's $'\377',
718	# with no leading 0.
719	R(OCTAL3_RE, Id.Char_Octal3),
720
721	# ' and " are escaped in $'' mode, but not echo -e.
722	C(r"\'", Id.Char_OneChar),
723	C(r'\"', Id.Char_OneChar),
724
725	# e.g. 'foo', anything that's not a backslash escape or '
726	R(r"[^\\'\0]+", Id.Lit_Chars),
727	C("'", Id.Right_SingleQuote),
728	]
729
730	LEXER_DEF[lex_mode_e.PrintfOuter] = _C_STRING_COMMON + [
731	R(OCTAL3_RE, Id.Char_Octal3),
732	R(r"[^%\\\0]+", Id.Lit_Chars),
733	C('%%', Id.Format_EscapedPercent),
734	C('%', Id.Format_Percent),
735	]
736
# Maybe: bash also supports %(strftime)T
LEXER_DEF[lex_mode_e.PrintfPercent] = [
    # Flags
    R('[- +#]', Id.Format_Flag),
    C('0', Id.Format_Zero),
    R('[1-9][0-9]*', Id.Format_Num),
    C('*', Id.Format_Star),
    C('.', Id.Format_Dot),
    # We support dsq.  The others we parse to display an error message.
    R('[disqbcouxXeEfFgG]', Id.Format_Type),
    # (strftime)T: a parenthesized run of chars that are not parens or NUL,
    # followed by T.  Raw string, so \( \) and \0 reach the regex parser
    # as escapes.
    R(r'\([^()\0]*\)T', Id.Format_Time),
    R(r'[^\0]', Id.Unknown_Tok),  # any other char
]
750
751	LEXER_DEF[lex_mode_e.VSub_1] = [
752	R(VAR_NAME_RE, Id.VSub_Name),
753	# ${11} is valid, compared to $11 which is $1 and then literal 1.
754	R(r'[0-9]+', Id.VSub_Number),
755	C('!', Id.VSub_Bang),
756	C('@', Id.VSub_At),
757	C('#', Id.VSub_Pound),
758	C('$', Id.VSub_Dollar),
759	C('*', Id.VSub_Star),
760	C('-', Id.VSub_Hyphen),
761	C('?', Id.VSub_QMark),
762	C('.', Id.VSub_Dot), # ${.myproc builtin sub}
763	C('}', Id.Right_DollarBrace),
764	C('\\\n', Id.Ignored_LineCont),
765	C('\n', Id.Unknown_Tok), # newline not allowed inside ${}
766	R(r'[^\0]', Id.Unknown_Tok), # any char except newline
767	]
768
769	LEXER_DEF[lex_mode_e.VSub_2] = \
770	ID_SPEC.LexerPairs(Kind.VTest) + \
771	ID_SPEC.LexerPairs(Kind.VOp0) + \
772	ID_SPEC.LexerPairs(Kind.VOpYsh) + \
773	ID_SPEC.LexerPairs(Kind.VOp1) + \
774	ID_SPEC.LexerPairs(Kind.VOp2) + \
775	ID_SPEC.LexerPairs(Kind.VOp3) + [
776	C('}', Id.Right_DollarBrace),
777
778	C('\\\n', Id.Ignored_LineCont),
779	C('\n', Id.Unknown_Tok), # newline not allowed inside ${}
780	R(r'[^\0]', Id.Unknown_Tok), # any char except newline
781	]
782
783	_EXPR_ARITH_SHARED = [
784	C('\\\n', Id.Ignored_LineCont),
785	R(r'[^\0]', Id.Unknown_Tok) # any char. This should be a syntax error.
786	]
787
788	# https://www.gnu.org/software/bash/manual/html_node/Shell-Arithmetic.html#Shell-Arithmetic
789	LEXER_DEF[lex_mode_e.Arith] = \
790	_LEFT_SUBS + _VARS + _LEFT_UNQUOTED + [
791
792	# Arithmetic expressions can cross newlines.
793	R(r'[ \t\r\n]+', Id.Ignored_Space),
794
795	# Examples of arith constants:
796	# 64#azAZ
797	# 0xabc 0xABC
798	# 0123
799	# A separate digits token makes this easier to parse STATICALLY. But this
800	# doesn't help with DYNAMIC parsing.
801	R(VAR_NAME_RE, Id.Lit_ArithVarLike), # for variable names or 64#_
802	R(r'[0-9]+', Id.Lit_Digits),
803	C('@', Id.Lit_At), # for 64#@ or ${a[@]}
804	C('#', Id.Lit_Pound), # for 64#a
805
806	# TODO: 64#@ interferes with VS_AT. Hm.
807	] + ID_SPEC.LexerPairs(Kind.Arith) + _EXPR_ARITH_SHARED
808
809	# A lexer for the parser that converts globs to extended regexes. Since we're
810	# only parsing character classes ([^[:space:][:alpha:]]) as opaque blobs, we
811	# don't need lexer modes here.
812	GLOB_DEF = [
813	# These could be operators in the glob, or just literals in a char class,
814	# e.g. touch '?'; echo [?].
815	C('*', Id.Glob_Star),
816	C('?', Id.Glob_QMark),
817
818	# For negation. Treated as operators inside [], but literals outside.
819	C('!', Id.Glob_Bang),
820	C('^', Id.Glob_Caret),
821
822	# Character classes.
823	C('[', Id.Glob_LBracket),
824	C(']', Id.Glob_RBracket),
825
826	# There is no whitelist of characters; backslashes are unconditionally
827	# removed. With libc.fnmatch(), the pattern r'\f' matches 'f' but not '\\f'.
828	# See libc_test.py.
829	R(r'\\[^\0]', Id.Glob_EscapedChar),
830	C('\\', Id.Glob_BadBackslash), # Trailing single backslash
831
832	# For efficiency, combine other characters into a single token, e.g. 'py' in
833	# '*.py' or 'alpha' in '[[:alpha:]]'.
834	R(r'[a-zA-Z0-9_]+', Id.Glob_CleanLiterals), # no regex escaping
835	R(r'[^\0]', Id.Glob_OtherLiteral), # anything else -- examine the char
836	]
837
838	# History expansion. We're doing this as "pre-lexing" since that's what bash
839	# and zsh seem to do. Example:
840	#
841	# $ foo=x
842	# $ echo $
843	# $ !!foo # expands to echo $foo and prints x
844	#
845	# We can also reuse this in the RootCompleter to expand history interactively.
846	#
847	# bash note: handled in lib/readline/histexpand.c. Quite messy and handles
848	# quotes AGAIN.
849	#
850	# Note: \! gets expanded to literal \! for the real lexer, but no history
851	# expansion occurs.
852
853	HISTORY_DEF = [
854	# Common operators.
855	R(r'![!*^$]', Id.History_Op),
856
857	# By command number.
858	R(r'!-?[0-9]+', Id.History_Num),
859
860	# Search by prefix or substring (optional '?').
861	# NOTE: there are no numbers allowed here! Bash doesn't seem to support it.
862	# No hyphen since it conflicts with $-1 too.
863	#
864	# Required trailing whitespace is there to avoid conflict with [!charclass]
865	# and ${!indirect}. This is a simpler hack than the one bash has. See
866	# frontend/lex_test.py.
867	R(r'!\??[a-zA-Z_/.][0-9a-zA-Z_/.]+[ \t\r\n]', Id.History_Search),
868
869	# Comment is until end of line
870	R(r"#[^\0]*", Id.History_Other),
871
872	# Single quoted, e.g. 'a' or $'\n'. Terminated by another single quote or
873	# end of string.
874	R(r"'[^'\0]*'?", Id.History_Other),
875
876	# Runs of chars that are definitely not special
877	R(r"[^!\\'#\0]+", Id.History_Other),
878
879	# Escaped characters. \! disables history
880	R(r'\\[^\0]', Id.History_Other),
881	# Other single chars, like a trailing \ or !
882	R(r'[^\0]', Id.History_Other),
883	]
884
885	BRACE_RANGE_DEF = [
886	R(r'-?[0-9]+', Id.Range_Int),
887	R(r'[a-zA-Z]', Id.Range_Char), # just a single character
888	R(r'\.\.', Id.Range_Dots),
889	R(r'[^\0]', Id.Range_Other), # invalid
890	]
891
892	#
893	# YSH lexing
894	#
895
896	# Valid in lex_mode_e.{Expr,DQ}
897	# Used by ysh/grammar_gen.py
898	YSH_LEFT_SUBS = [
899	C('$(', Id.Left_DollarParen),
900	C('${', Id.Left_DollarBrace),
901	C('$[', Id.Left_DollarBracket), # TODO: Implement $[x]
902	]
903
904	# Valid in lex_mode_e.Expr, but not valid in DQ
905	# Used by ysh/grammar_gen.py
906
YSH_LEFT_UNQUOTED = [
    # Double quoted
    C('"', Id.Left_DoubleQuote),
    C('$"', Id.Left_DollarDoubleQuote),  # $"" is synonym for ""
    C('j"', Id.Left_JDoubleQuote),  # for printing ERROR
    # Single quoted
    C("'", Id.Left_SingleQuote),
    C("r'", Id.Left_RSingleQuote),
    C("u'", Id.Left_USingleQuote),
    C("b'", Id.Left_BSingleQuote),
    C("$'", Id.Left_DollarSingleQuote),  # legacy
    C('^"', Id.Left_CaretDoubleQuote),
    C('"""', Id.Left_TDoubleQuote),
    C('$"""', Id.Left_DollarTDoubleQuote),
    # In expression mode, we add the r'' and c'' prefixes for '' and $''.
    C("'''", Id.Left_TSingleQuote),
    C("r'''", Id.Left_RTSingleQuote),
    C("u'''", Id.Left_UTSingleQuote),
    C("b'''", Id.Left_BTSingleQuote),
    C('@(', Id.Left_AtParen),  # Split Command Sub
    C('^(', Id.Left_CaretParen),  # Block literals in expression mode
    C('^[', Id.Left_CaretBracket),  # Expr literals
    C('^{', Id.Left_CaretBrace),  # Unused
    # Constant rule: the two characters ':' and '|', with no backslash
    C(':|', Id.Left_ColonPipe),  # shell-like word arrays.
    C('%(', Id.Left_PercentParen),  # old syntax for shell-like word arrays.
    C('%[', Id.Expr_Reserved),  # Maybe: like %() without unquoted [], {}
    C('%{', Id.Expr_Reserved),  # Table literals
    # t = %{
    #    name:Str  age:Int
    #    'andy c'  10
    # }
    # Significant newlines.  No unquoted [], {}

    # Not sure if we'll use these
    C('@{', Id.Expr_Reserved),
    C('@[', Id.Expr_Reserved),

    # Idea: Set literals are #{a, b} like Clojure
]
946
947	# Used by ysh/grammar_gen.py
EXPR_OPS = [
    C(';', Id.Op_Semi),  # terminator

    C('(', Id.Op_LParen),
    C(')', Id.Op_RParen),

    # NOTE: brackets also show up in type expressions, which are
    # expressions too, e.g. Dict[Str, Int]
    C('[', Id.Op_LBracket),
    C(']', Id.Op_RBracket),

    C('{', Id.Op_LBrace),
    C('}', Id.Op_RBrace),
]
959
960	# Newline is significant, but sometimes elided by expr_parse.py.
_EXPR_NEWLINE_COMMENT = [
    C('\n', Id.Op_Newline),
    # A comment stops before the newline, so the newline is still a token
    R(r'#[^\n\0]*', Id.Ignored_Comment),
    # \r counts as whitespace even when no \n follows, like lex_mode_e.Arith
    R(r'[ \t\r]+', Id.Ignored_Space),
]
967
# Note: even if match.LooksLikeInteger(s) succeeds, mops.FromStr(s) may still
# fail.  You should call BOTH anyway, because we don't want to rely on the
# underlying strtoll() to define the language accepted.
LOOKS_LIKE_INTEGER = _WHITESPACE + r'-?[0-9]+' + _WHITESPACE
972
973	# TODO: use for YSH comparison operators > >= < <=
974	#
975	# Python allows 0 to be written 00 or 0_0_0, which is weird. But let's be
976	# consistent, and avoid '00' turning into a float!
# One digit, then any number of digits, each optionally preceded by a single
# underscore.
_YSH_DECIMAL_INT_RE = r'[0-9](_?[0-9])*'

# Same digits, with an optional minus sign, wrapped in _WHITESPACE.
LOOKS_LIKE_YSH_INT = (_WHITESPACE + '-?' + _YSH_DECIMAL_INT_RE +
                      _WHITESPACE)
980
# Decimal integer part, then an optional fraction and an optional exponent.
_YSH_FLOAT_RE = (
    _YSH_DECIMAL_INT_RE +
    # Unlike Python, exponent can't be like 42e5_000.  There's no use because
    # 1e309 is already inf.  Let's keep our code simple.
    #
    # Both fragments are raw strings, so \. and \- are passed through as regex
    # escapes rather than being (invalid) Python string escapes.
    r'(\.' + _YSH_DECIMAL_INT_RE + r')?([eE][+\-]?[0-9]+)?')

# Ditto, used for YSH comparison operators
# An optional leading -? is accepted
# Example: -3_000_000.000_001e12
LOOKS_LIKE_YSH_FLOAT = _WHITESPACE + '-?' + _YSH_FLOAT_RE + _WHITESPACE
991
992	# Python 3 float literals:
993
994	# digitpart ::= digit (["_"] digit)*
995	# fraction ::= "." digitpart
# exponent ::= ("e" | "E") ["+" | "-"] digitpart
# pointfloat ::= [digitpart] fraction | digitpart "."
# exponentfloat ::= (digitpart | pointfloat) exponent
# floatnumber ::= pointfloat | exponentfloat
1000
1001	# NOTE: Borrowing tokens from Arith (i.e. $(( )) ), but not using LexerPairs().
LEXER_DEF[lex_mode_e.Expr] = \
    _VARS + YSH_LEFT_SUBS + YSH_LEFT_UNQUOTED + EXPR_OPS + EXPR_WORDS + \
    EXPR_CHARS + [

    # https://docs.python.org/3/reference/lexical_analysis.html#integer-literals
    #
    # integer      ::=  decinteger | bininteger | octinteger | hexinteger
    # decinteger   ::=  nonzerodigit (["_"] digit)* | "0"+ (["_"] "0")*
    # bininteger   ::=  "0" ("b" | "B") (["_"] bindigit)+
    # octinteger   ::=  "0" ("o" | "O") (["_"] octdigit)+
    # hexinteger   ::=  "0" ("x" | "X") (["_"] hexdigit)+
    # nonzerodigit ::=  "1"..."9"
    # digit        ::=  "0"..."9"
    # bindigit     ::=  "0" | "1"
    # octdigit     ::=  "0"..."7"
    # hexdigit     ::=  digit | "a"..."f" | "A"..."F"

    R(_YSH_DECIMAL_INT_RE, Id.Expr_DecInt),

    R(r'0[bB](_?[01])+', Id.Expr_BinInt),
    R(r'0[oO](_?[0-7])+', Id.Expr_OctInt),
    R(r'0[xX](_?[0-9a-fA-F])+', Id.Expr_HexInt),

    R(_YSH_FLOAT_RE, Id.Expr_Float),

    # These can be looked up as keywords separately, so you enforce that they have
    # space around them?
    R(VAR_NAME_RE, Id.Expr_Name),

    R('%' + VAR_NAME_RE, Id.Expr_Symbol),

    #
    # Arith
    #

    C(',', Id.Arith_Comma),
    C(':', Id.Arith_Colon),  # for slicing a[1:2], and mylist:pop()

    C('?', Id.Arith_QMark),  # regex postfix

    C('+', Id.Arith_Plus),  # arith infix, regex postfix
    C('-', Id.Arith_Minus),  # arith infix, regex postfix
    C('*', Id.Arith_Star),
    C('^', Id.Arith_Caret),  # xor
    C('/', Id.Arith_Slash),
    C('%', Id.Arith_Percent),

    C('**', Id.Arith_DStar),  # exponentiation
    C('++', Id.Arith_DPlus),  # Option for string/list concatenation

    C('<', Id.Arith_Less),
    C('>', Id.Arith_Great),
    C('<=', Id.Arith_LessEqual),
    C('>=', Id.Arith_GreatEqual),
    C('===', Id.Expr_TEqual),
    C('!==', Id.Expr_NotDEqual),

    C('==', Id.Unknown_DEqual),  # user must choose === or ~==

    # C() patterns are constant strings, so the pipe operators below are
    # spelled with bare '|' characters (no backslash).
    C('&&', Id.Unknown_DAmp),
    C('||', Id.Unknown_DPipe),

    # Bitwise operators
    C('&', Id.Arith_Amp),
    C('|', Id.Arith_Pipe),
    C('>>', Id.Arith_DGreat),
    C('<<', Id.Arith_DLess),  # Doesn't Java also have <<< ?

    # Bitwise complement, as well as infix pattern matching
    C('~', Id.Arith_Tilde),
    C('!~', Id.Expr_NotTilde),
    C('~~', Id.Expr_DTilde),
    C('!~~', Id.Expr_NotDTilde),

    # Left out for now:
    # ++ --       -- needed for loops, awk?
    # ! && ||     -- needed for find dialect
    # = += etc.

    C('=', Id.Arith_Equal),

    C('+=', Id.Arith_PlusEqual),
    C('-=', Id.Arith_MinusEqual),
    C('*=', Id.Arith_StarEqual),
    C('/=', Id.Arith_SlashEqual),
    C('%=', Id.Arith_PercentEqual),

    C('>>=', Id.Arith_DGreatEqual),
    C('<<=', Id.Arith_DLessEqual),
    C('&=', Id.Arith_AmpEqual),
    C('|=', Id.Arith_PipeEqual),
    # NOTE(review): '^' is marked as xor above, so this is presumably
    # xor-assignment rather than exponentiation -- confirm.
    C('^=', Id.Arith_CaretEqual),

    # Augmented assignment that YSH has, but sh and OSH don't have
    C('**=', Id.Expr_DStarEqual),
    C('//=', Id.Expr_DSlashEqual),

    #
    # Expr
    #

    C('!', Id.Expr_Bang),  # For eggex negation

    C('//', Id.Expr_DSlash),  # For YSH integer division
    C('~==', Id.Expr_TildeDEqual),  # approximate equality

    C('.', Id.Expr_Dot),  # d.key is alias for d['key']
    C('..', Id.Unknown_DDot),  # legacy half-open range 1..5
    C('..<', Id.Expr_DDotLessThan),  # half-open range 1..<5
    C('..=', Id.Expr_DDotEqual),  # closed range 1..=5
    C('->', Id.Expr_RArrow),  # s->startswith()
    C('$', Id.Expr_Dollar),  # legacy regex end: /d+ $/ (better written /d+ >/)

    # Reserved this.  Go uses it for channels, etc.
    # I guess it conflicts with -4<-3, but that's OK -- spaces suffices.
    C('<-', Id.Expr_Reserved),
    C('=>', Id.Expr_RDArrow),  # for df => filter(age > 10)
    # and match (x) { 1 => "one" }
    # note: other languages use |>
    # R/dplyr uses %>%

    C('...', Id.Expr_Ellipsis),  # f(...args) and maybe a[:, ...]

    # For multiline regex literals?
    C('///', Id.Expr_Reserved),

    # Splat operators
    C('@', Id.Expr_At),
    # NOTE: Unused
    C('@@', Id.Expr_DoubleAt),
] + _EXPR_NEWLINE_COMMENT + _EXPR_ARITH_SHARED
1133
LEXER_DEF[lex_mode_e.FuncParens] = [
    # () with spaces: optional spaces/tabs, a literal '(', optional
    # spaces/tabs, then a literal ')'
    R(r'[ \t]*\([ \t]*\)', Id.LookAhead_FuncParens),
    # anything else
    R(r'[^\0]', Id.Unknown_Tok)
]