frontend/lexer

OILS / frontend / lexer_def.py View on Github | oils.pub

1140 lines, 586 significant

1	"""
2	lexer_def.py - Lexer for OSH, YSH, and J8 Notation.
3
4	This lexer has lexer MODES, each with a regex -> Id mapping.
5
6	After changing this file, run:
7
8	build/py.sh all
9
10	or at least:
11
12	build/py.sh fastlex
13
14	Input Handling
15	--------------
16
17	Every line is NUL terminated:
18
19	'one\n\0' 'last line\0'
20
21	which means that no regexes below should match \0.
22
23	For example, use [^'\0]+ instead of [^']+ .
24
25	If this rule isn't followed, we would read uninitialized memory past the
26	sentinel. Python's regex engine knows where the end of the input string is, so
27	it doesn't need a sentinel like \0.
28
29	The generator frontend/lexer_gen.py adds a pattern mapping \0 to Id.Eol_Tok.
30	"""
31
32	from _devbuild.gen.id_kind_asdl import Id, Id_t, Kind
33	from _devbuild.gen.types_asdl import lex_mode_e
34
35	from frontend import id_kind_def
36
37	from typing import Tuple
38
39	# Initialize spec that the lexer depends on.
40	ID_SPEC = id_kind_def.IdSpec({}, {})
41
42	id_kind_def.AddKinds(ID_SPEC)
43	id_kind_def.AddBoolKinds(ID_SPEC) # must come second
44	id_kind_def.SetupTestBuiltin(ID_SPEC, {}, {}, {})
45
46
def C(pat, tok_type):
    # type: (str, Id_t) -> Tuple[bool, str, Id_t]
    """Build a lexer rule whose pattern is a constant string.

    Example: C('$*', VSub_Star).

    The returned triple is (is_regex, pattern, token id); is_regex is always
    False here, in contrast to R().
    """
    is_regex = False
    rule = (is_regex, pat, tok_type)
    return rule
51
52
def R(pat, tok_type):
    # type: (str, Id_t) -> Tuple[bool, str, Id_t]
    r"""Build a lexer rule whose pattern is a regex string.

    Example: R(r'\$[0-9]', VSub_Number).

    The returned triple is (is_regex, pattern, token id); is_regex is always
    True here, in contrast to C().
    """
    # The docstring is a raw string so that the backslash in the example is not
    # an invalid escape sequence.
    return (True, pat, tok_type)
57
58
59	# utf8, utf-8, UTF8, UTF-8, etc.
60	IS_UTF8_CODESET_RE = r'[uU][tT][fF]-?8'
61
# Matches a shebang line that names a shell, e.g. '#!/bin/sh\n' or
# '#!/usr/bin/env bash\n': '#!', anything, 'sh', then one whitespace char.
#
# See unit tests in frontend/match_test.py.
# We need the [^\0]* because the re2c translation assumes it's anchored like $.
SHOULD_HIJACK_RE = r'#![^\0]*sh[ \t\r\n][^\0]*'
65
66	# Separates words (\r is not whitespace here)
67	_SIGNIFICANT_SPACE = R(r'[ \t]+', Id.WS_Space)
68
69	_BACKSLASH = [
70	# To be conservative, we could deny a set of chars similar to
71	# _LITERAL_WHITELIST_REGEX, rather than allowing all the operator characters
72	# like \( and \;.
73	#
74	# strict_backslash makes this stricter.
75	R(r'\\[^\n\0]', Id.Lit_EscapedChar),
76	C('\\\n', Id.Ignored_LineCont),
77	]
78
79	# Only 4 characters are backslash escaped inside "".
80	# https://www.gnu.org/software/bash/manual/bash.html#Double-Quotes
81	_DQ_ESCAPED_CHAR = R(r'\\[$`"\\]', Id.Lit_EscapedChar)
82
83	VAR_NAME_RE = r'[a-zA-Z_][a-zA-Z0-9_]*'
84
85	# All Kind.VSub
86	_VARS = [
87	# Unbraced variables
88	R(r'\$' + VAR_NAME_RE, Id.VSub_DollarName),
89	R(r'\$[0-9]', Id.VSub_Number),
90	C(r'$!', Id.VSub_Bang),
91	C(r'$@', Id.VSub_At),
92	C(r'$#', Id.VSub_Pound),
93	C(r'$$', Id.VSub_Dollar),
94	C(r'$*', Id.VSub_Star),
95	C(r'$-', Id.VSub_Hyphen),
96	C(r'$?', Id.VSub_QMark),
97	]
98
99	# Kind.Left that are valid in double-quoted modes.
100
# Tokens that open a substitution: backticks, $( ${ $(( $[ and the zsh-style
# ${(flags) prefix.
_LEFT_SUBS = [
    C('`', Id.Left_Backtick),
    C('$(', Id.Left_DollarParen),
    C('${', Id.Left_DollarBrace),
    # Parse zsh syntax, but don't execute it.
    # The examples we've seen so far are like ${(%):-} and ${(m)
    #
    # The pattern is: literal '${(' , one or more chars that are not ')' or
    # NUL, then a literal ')'.
    R(r'\$\{\([^)\0]+\)', Id.Left_DollarBraceZsh),
    C('$((', Id.Left_DollarDParen),
    C('$[', Id.Left_DollarBracket),
]
111
112	# Additional Kind.Left that are valid in unquoted modes.
113	_LEFT_UNQUOTED = [
114	C('"', Id.Left_DoubleQuote),
115	C("'", Id.Left_SingleQuote),
116	C('$"', Id.Left_DollarDoubleQuote),
117	C("$'", Id.Left_DollarSingleQuote),
118	]
119
120	_LEFT_PROCSUB = [
121	C('<(', Id.Left_ProcSubIn),
122	C('>(', Id.Left_ProcSubOut),
123	]
124
125	# The regexes below are in Python syntax, but are translated to re2c syntax by
126	# frontend/lexer_gen.py.
127	#
128	# http://re2c.org/manual/syntax/syntax.html
129	# https://docs.python.org/2/library/re.html
130	#
131	# We use a limited set of constructs:
132	# - + and * for repetition
133	# - Character classes [] with simple ranges and negation
134	# - Escapes like \n \0
135
136	LEXER_DEF = {}
137
138	# Anything until the end of the line is a comment. Does not match the newline
139	# itself. We want to switch modes and possibly process Op_Newline for here
140	# docs, etc.
141	LEXER_DEF[lex_mode_e.Comment] = [R(r'[^\n\0]*', Id.Ignored_Comment)]
142
143	# A whitelist to make bigger Lit_Chars tokens. We don't want one byte at a time.
144	#
145	# The shell language says that "any other byte" is a literal character --
146	# for example, unquoted $ \ ! are literal, not a syntax error.
147	#
148	# That is, a literal is defined NEGATIVELY, for a single character. But here
149	# we define a SUBSET of literal chars POSITIVELY.
150
151	# The range \x80-\xff makes sure that UTF-8 sequences are a single token.
152	_LITERAL_WHITELIST_REGEX = r'[\x80-\xffa-zA-Z0-9_.\-]+'
153
# Rules for unquoted shell words: backslash escapes, substitutions, quotes,
# process subs and variables, followed by literals and operators.  This list
# is concatenated into the ShCommand and DBracket modes.
_UNQUOTED = _BACKSLASH + _LEFT_SUBS + _LEFT_UNQUOTED + _LEFT_PROCSUB + _VARS + [
    # NOTE: We could add anything 128 and above to this character class?  So
    # utf-8 characters don't get split?
    R(_LITERAL_WHITELIST_REGEX, Id.Lit_Chars),
    C('~', Id.Lit_Tilde),  # for tilde sub
    C('/', Id.Lit_Slash),  # also for tilde sub
    C(':', Id.Lit_Colon),  # for special PATH=a:~foo tilde detection
    C('$', Id.Lit_Dollar),  # shopt --set no_parse_dollar
    C('#', Id.Lit_Pound),  # For comments
    _SIGNIFICANT_SPACE,
    C('\n', Id.Op_Newline),
    C('&', Id.Op_Amp),
    # These are C() constants, not regexes, so the pipe is written bare.
    C('|', Id.Op_Pipe),
    C('|&', Id.Op_PipeAmp),
    C('&&', Id.Op_DAmp),
    C('||', Id.Op_DPipe),
    C(';', Id.Op_Semi),
    # Case terminators
    C(';;', Id.Op_DSemi),
    C(';&', Id.Op_SemiAmp),
    C(';;&', Id.Op_DSemiAmp),
    C('(', Id.Op_LParen),
    C(')', Id.Op_RParen),
    R(r'[^\0]', Id.Lit_Other),  # any other single char is a literal
]
179
180	# In lex_mode_e.{ShCommand,DBracket}
181	_EXTGLOB_BEGIN = [
182	C(',(', Id.ExtGlob_Comma), # YSH synonym for @(...)
183	C('@(', Id.ExtGlob_At),
184	C('*(', Id.ExtGlob_Star),
185	C('+(', Id.ExtGlob_Plus),
186	C('?(', Id.ExtGlob_QMark),
187	C('!(', Id.ExtGlob_Bang),
188	]
189
190	KEYWORDS = [
191	# NOTE: { is matched elsewhere
192	C('[[', Id.KW_DLeftBracket),
193	C('!', Id.KW_Bang),
194	C('for', Id.KW_For),
195	C('while', Id.KW_While),
196	C('until', Id.KW_Until),
197	C('do', Id.KW_Do),
198	C('done', Id.KW_Done),
199	C('in', Id.KW_In),
200	C('case', Id.KW_Case),
201	C('esac', Id.KW_Esac),
202	C('if', Id.KW_If),
203	C('fi', Id.KW_Fi),
204	C('then', Id.KW_Then),
205	C('else', Id.KW_Else),
206	C('elif', Id.KW_Elif),
207	C('function', Id.KW_Function),
208	C('time', Id.KW_Time),
209
210	# YSH
211	C('const', Id.KW_Const), # maybe remove this
212	C('var', Id.KW_Var),
213	C('setvar', Id.KW_SetVar),
214	C('setglobal', Id.KW_SetGlobal),
215	C('call', Id.KW_Call),
216	C('proc', Id.KW_Proc),
217	C('typed', Id.KW_Typed),
218	C('func', Id.KW_Func),
219	]
220
221	# These are treated like builtins in bash, but keywords in OSH. However, we
222	# maintain compatibility with bash for the 'type' builtin.
223	CONTROL_FLOW = [
224	C('break', Id.ControlFlow_Break),
225	C('continue', Id.ControlFlow_Continue),
226	C('return', Id.ControlFlow_Return),
227	C('exit', Id.ControlFlow_Exit),
228	]
229
230	# Used by ysh/grammar_gen.py too
231	EXPR_WORDS = [
232	C('null', Id.Expr_Null),
233	C('true', Id.Expr_True),
234	C('false', Id.Expr_False),
235	C('and', Id.Expr_And),
236	C('or', Id.Expr_Or),
237	C('not', Id.Expr_Not),
238	C('for', Id.Expr_For),
239	C('is', Id.Expr_Is),
240	C('in', Id.Expr_In),
241	C('if', Id.Expr_If),
242	C('else', Id.Expr_Else),
243
244	# Unused: could be for func and proc literals
245	#
246	# Note: we also have lambda literals |x| x+1
247	# I don't think we need them now, but the difference vs func is that the
248	# body is an expression. Note: JavaScript uses (x, y) => x + y which
249	# causes parsing problems.
250	C('func', Id.Expr_Func),
251	C('proc', Id.Expr_Proc),
252
253	# / <capture d+/
254	C('capture', Id.Expr_Capture),
255	# / <capture d+ as date> /
256	C('as', Id.Expr_As),
257	]
258
259	FD_VAR_NAME = r'\{' + VAR_NAME_RE + r'\}'
260
261	# file descriptors can only have two digits, like mksh
262	# dash/zsh/etc. can have one
263	FD_NUM = r'[0-9]?[0-9]?'
264
# These rules must be recognized in ShCommand mode, but can't be nested
# within [[.
# Keywords have to be checked before _UNQUOTED so we get <KW_If "if"> instead
# of <Lit_Chars "if">.
LEXER_DEF[lex_mode_e.ShCommand] = [
    # These four are not allowed within [[, so they are in ShCommand but not
    # _UNQUOTED.

    # e.g. beginning of NAME=val, which will always be longer than
    # _LITERAL_WHITELIST_REGEX.
    # (Raw strings: \+ and \[ are regex escapes, not Python escapes.)
    R(VAR_NAME_RE + r'\+?=', Id.Lit_VarLike),
    R(VAR_NAME_RE + r'\[', Id.Lit_ArrayLhsOpen),
    R(r'\]\+?=', Id.Lit_ArrayLhsClose),
    C('((', Id.Op_DLeftParen),

    # For static globbing, and [] for array literals
    C('[', Id.Lit_LBracket),  # e.g. A=(['x']=1)
    C(']', Id.Lit_RBracket),  # e.g. *.[ch]
    # NOTE: Glob_Star and Glob_QMark are for dynamic parsing
    C('*', Id.Lit_Star),
    C('?', Id.Lit_QMark),
    C('###', Id.Lit_TPound),  # like Lit_Pound, for doc comments
    C('...', Id.Lit_TDot),  # ... for multiline commands

    # For brace expansion {a,b}
    C('{', Id.Lit_LBrace),
    C('}', Id.Lit_RBrace),  # Also for var sub ${a}
    C(',', Id.Lit_Comma),
    C('=', Id.Lit_Equals),  # for = f(x) and x = 1+2*3
    C('@', Id.Lit_At),  # for detecting @[, @' etc. shopt -s parse_at_all
    R(FD_VAR_NAME, Id.Lit_RedirVarName),
    R(FD_NUM, Id.Lit_Number),

    # @array and @func(1, c)
    R('@' + VAR_NAME_RE, Id.Lit_Splice),  # for YSH splicing
    C('@[', Id.Lit_AtLBracket),  # @[split(x)]
    C('@{.', Id.Lit_AtLBraceDot),  # for split builtin sub @{.myproc arg1}
    R(r'<', Id.Redir_Less),
    R(r'>', Id.Redir_Great),
    R(r'<<', Id.Redir_DLess),
    R(r'<<<', Id.Redir_TLess),
    R(r'>>', Id.Redir_DGreat),
    R(r'<<-', Id.Redir_DLessDash),
    R(r'>&', Id.Redir_GreatAnd),
    R(r'<&', Id.Redir_LessAnd),
    R(r'<>', Id.Redir_LessGreat),
    # This is a regex rule, so the pipe is escaped to mean a literal '|'
    # rather than alternation: it matches the two characters >|
    R(r'>\|', Id.Redir_Clobber),
    C(r'&>', Id.Redir_AndGreat),
    C(r'&>>', Id.Redir_AndDGreat),
] + KEYWORDS + CONTROL_FLOW + _UNQUOTED + _EXTGLOB_BEGIN
315
316	# Preprocessing before ShCommand
317	LEXER_DEF[lex_mode_e.Backtick] = [
318	C(r'`', Id.Backtick_Right),
319	# A backslash, and then $ or ` or \
320	R(r'\\[$`\\]', Id.Backtick_Quoted),
321	# \" treated specially, depending on whether backticks are double-quoted!
322	R(r'\\"', Id.Backtick_DoubleQuote),
323	R(r'[^`\\\0]+', Id.Backtick_Other), # contiguous run of literals
324	R(r'[^\0]', Id.Backtick_Other), # anything else
325	]
326
327	# DBracket: can be like ShCommand, except:
328	# - Don't really need redirects either... Redir_Less could be Op_Less
329	# - Id.Op_DLeftParen can't be nested inside.
330	LEXER_DEF[lex_mode_e.DBracket] = [
331	C(']]', Id.Lit_DRightBracket),
332	# Must be KW and not Op, because we can have stuff like [[ $foo == !* ]]
333	# in addition to [[ ! a && b ]]
334	C('!', Id.KW_Bang),
335	C('<', Id.Op_Less),
336	C('>', Id.Op_Great),
337	] + ID_SPEC.LexerPairs(Kind.BoolUnary) + \
338	ID_SPEC.LexerPairs(Kind.BoolBinary) + \
339	_UNQUOTED + _EXTGLOB_BEGIN
340
# Inside an extended glob, most characters are literals, including spaces and
# punctuation.  We also accept \, $var, ${var}, "", etc.  They can also be
# nested, so _EXTGLOB_BEGIN appears here.
#
# Example: echo @(<> <>|&&|'foo'|$bar)
LEXER_DEF[lex_mode_e.ExtGlob] = \
    _BACKSLASH + _LEFT_SUBS + _LEFT_UNQUOTED + _VARS + _EXTGLOB_BEGIN + [
        # A run of chars that don't start any of the other rules in this mode
        R(r'[^\\$`"\'|)@*+!?\0]+', Id.Lit_Chars),
        C('|', Id.Op_Pipe),  # C() is a constant string, so the pipe is bare
        C(')', Id.Op_RParen),  # maybe be translated to Id.ExtGlob_RParen
        R(r'[^\0]', Id.Lit_Other),  # everything else is literal
    ]
353
354	# Notes on BASH_REGEX states
355	#
356	# From bash manual:
357	#
358	# - Any part of the pattern may be quoted to force the quoted portion to be
359	# matched as a string.
360	# - Bracket expressions in regular expressions must be treated carefully, since
361	# normal quoting characters lose their meanings between brackets.
362	# - If the pattern is stored in a shell variable, quoting the variable
363	# expansion forces the entire pattern to be matched as a string.
364	#
365	# Is there a re.escape function? It's just like EscapeGlob and UnescapeGlob.
366	#
367	# bash code: ( | ) are special
368
369	LEXER_DEF[lex_mode_e.BashRegex] = _LEFT_SUBS + _LEFT_UNQUOTED + _VARS + [
370	# Like lex_mode_e.ShCommand
371	R(_LITERAL_WHITELIST_REGEX, Id.Lit_Chars),
372
373	# Tokens for Tilde sub. bash weirdness: RHS of [[ x =~ ~ ]] is expanded
374	C('~', Id.Lit_Tilde),
375	C('/', Id.Lit_Slash),
376
377	# Id.WS_Space delimits words. In lex_mode_e.BashRegexFakeInner, we
378	# translate them to Id.Lit_Chars.
379	_SIGNIFICANT_SPACE,
380
381	# Analogous to Id.ExtGlob_* - we need to change lexer modes when we hit this
382	C('(', Id.BashRegex_LParen),
383
384	# Not special, this is like lex_mode_e.ShCommand
385	C(')', Id.Op_RParen),
386
387	# Copied and adapted from _UNQUOTED
388	# \n & ; < > are parse errors OUTSIDE a group [[ s =~ ; ]]
389	# but become allowed INSIDE a group [[ s =~ (;) ]]
390	C('\n', Id.BashRegex_AllowedInParens),
391	C('&', Id.BashRegex_AllowedInParens),
392	C(';', Id.BashRegex_AllowedInParens),
393	C('>', Id.BashRegex_AllowedInParens),
394	C('<', Id.BashRegex_AllowedInParens),
395
396	# e.g. | is Id.Lit_Other, not pipe operator
397	R(r'[^\0]', Id.Lit_Other), # like _UNQUOTED, any other byte is literal
398	] + _BACKSLASH # These have to come after RegexMeta
399
400	LEXER_DEF[lex_mode_e.DQ] = [
401	_DQ_ESCAPED_CHAR,
402	C('\\\n', Id.Ignored_LineCont),
403	C('\\', Id.Lit_BadBackslash), # syntax error in YSH, but NOT in OSH
404	] + _LEFT_SUBS + _VARS + [
405	R(r'[^$`"\0\\]+', Id.Lit_Chars), # matches a line at most
406	C('$', Id.Lit_Dollar), # completion of var names relies on this
407	C('"', Id.Right_DoubleQuote),
408	]
409
410	LEXER_DEF[lex_mode_e.HereDoc] = [
411	R(r'\\[$`\\]', Id.Lit_EscapedChar), # \" is not an escaped char in here documents
412	C('\\\n', Id.Ignored_LineCont),
413	C('\\', Id.Lit_BadBackslash), # syntax error in YSH, but NOT in OSH
414	] + _LEFT_SUBS + _VARS + [
415	# remember [^] is set exclusion for regex
416	R(r'[^$`\0\\]+', Id.Lit_Chars), # matches a line at most
417	C('$', Id.Lit_Dollar), # completion of var names relies on this
418	]
419
420	_VS_ARG_COMMON = [
421	C('/', Id.Lit_Slash), # for patsub (not Id.VOp2_Slash)
422	C('#', Id.Lit_Pound), # for patsub prefix (not Id.VOp1_Pound)
423	C('%', Id.Lit_Percent), # for patsdub suffix (not Id.VOp1_Percent)
424	C('}', Id.Right_DollarBrace), # For var sub "${a}"
425	C('$', Id.Lit_Dollar), # completion of var names relies on this
426	]
427
428	# We don't execute zsh var subs, but to find the closing } properly, we need to
429	# recognize \} and '}' and "}" $'}' etc.
430	LEXER_DEF[lex_mode_e.VSub_Zsh] = \
431	_BACKSLASH + _LEFT_SUBS + _LEFT_UNQUOTED + _LEFT_PROCSUB + \
432	[
433	C('}', Id.Right_DollarBrace), # For var sub "${a}"
434	R(r'[^\0]', Id.Lit_Other), # e.g. "$", must be last
435	]
436
437	# Kind.{Lit,Ignored,VSub,Left,Right,Eof}
438	LEXER_DEF[lex_mode_e.VSub_ArgUnquoted] = \
439	_BACKSLASH + _VS_ARG_COMMON + _LEFT_SUBS + _LEFT_UNQUOTED + _LEFT_PROCSUB + \
440	_VARS + _EXTGLOB_BEGIN + [
441
442	# Token for Tilde sub
443	C('~', Id.Lit_Tilde),
444
445	# - doesn't match ~ for tilde sub
446	# - doesn't match < and > so it doesn't eat <()
447	# - doesn't match @ ! ? + * so it doesn't eat _EXTGLOB_BEGIN -- ( alone is
448	# not enough
449	R(r'[^$`~/}"\'\0\\#%<>@!?+*]+', Id.Lit_Chars),
450	R(r'[^\0]', Id.Lit_Other), # e.g. "$", must be last
451	]
452
453	# Kind.{Lit,Ignored,VSub,Left,Right,Eof}
454	LEXER_DEF[lex_mode_e.VSub_ArgDQ] = [
455	_DQ_ESCAPED_CHAR,
456	C(r'\}', Id.Lit_EscapedChar), # For "${var-\}}"
457	C('\\\n', Id.Ignored_LineCont),
458	C('\\', Id.Lit_BadBackslash), # syntax error in YSH, but NOT in OSH
459	] + _VS_ARG_COMMON + _LEFT_SUBS + _VARS + [
460	R(r'[^$`/}"\0\\#%]+', Id.Lit_Chars), # matches a line at most
461
462	# Weird wart: even in double quoted state, double quotes are allowed
463	C('"', Id.Left_DoubleQuote),
464
465	# Another weird wart of bash/mksh: $'' is recognized but NOT ''!
466	C("$'", Id.Left_DollarSingleQuote),
467	]
468
469	# NOTE: Id.Ignored_LineCont is NOT supported in SQ state, as opposed to DQ
470	# state.
471	LEXER_DEF[lex_mode_e.SQ_Raw] = [
472	R(r"[^'\0]+", Id.Lit_Chars), # matches a line at most
473	C("'", Id.Right_SingleQuote),
474	]
475
476	# The main purpose for EXPR_CHARS is in regex literals, e.g. [a-z \t \n].
477	#
478	# In YSH expressions, Chars are code point integers, so \u{1234} is the same as
479	# 0x1234. And \0 is 0x0.
480
481	# In Python:
482	# chr(0x00012345) == u'\U00012345'
483	#
484	# In YSH:
485	# 0x00012345 == \u{12345}
486	# chr(0x00012345) == chr(\u{12345}) == $'\u{012345}'
487
488	_U_BRACED_CHAR = R(r'\\[uU]\{[0-9a-fA-F]{1,6}\}', Id.Char_UBraced)
489
490	_X_CHAR_LOOSE = R(r'\\x[0-9a-fA-F]{1,2}', Id.Char_Hex) # bash
491	_CHAR_YHEX = R(r'\\y[0-9a-fA-F]{2}', Id.Char_YHex) # \yff - J8 only
492
493	_U4_CHAR_LOOSE = R(r'\\u[0-9a-fA-F]{1,4}', Id.Char_Unicode4) # bash
494
495	_U4_CHAR_STRICT = R(r'\\u[0-9a-fA-F]{4}', Id.Char_Unicode4) # JSON-only
496
497	EXPR_CHARS = [
498	# Allow same backslash escapes as J8 strings, except;
499	# - legacy \b \f
500	# - unnecessary \/
501	#
502	# Note that \0 should be written \y00.
503	R(r'''\\[\\"'nrt]''', Id.Char_OneChar),
504	_CHAR_YHEX,
505
506	# LEGACY Eggex. This is a LITERAL translation to 0xff in ERE?
507	# This is a syntax error in a YSH expression - it doesn't handle the token
508	R(r'\\x[0-9a-fA-F]{2}', Id.Char_Hex),
509	_U_BRACED_CHAR,
510	]
511
512	# Shared between echo -e and $''.
513	_C_STRING_COMMON = [
514
515	# \x6 is valid in bash
516	_X_CHAR_LOOSE,
517	_U4_CHAR_LOOSE,
518	R(r'\\U[0-9a-fA-F]{1,8}', Id.Char_Unicode8),
519	R(r'\\[0abeEfrtnv\\]', Id.Char_OneChar),
520
521	# e.g. \A is not an escape, and \x doesn't match a hex escape. We allow it,
522	# but a lint tool could warn about it.
523	C('\\', Id.Unknown_Backslash),
524	]
525
526	ECHO_E_DEF = _C_STRING_COMMON + [
527	# Note: tokens above \0377 can either be truncated or be flagged a syntax
528	# error in strict mode.
529	R(r'\\0[0-7]{1,3}', Id.Char_Octal4),
530	C(r'\c', Id.Char_Stop),
531
532	# e.g. 'foo', anything that's not a backslash escape
533	R(r'[^\\\0]+', Id.Lit_Chars),
534	]
535
536	OCTAL3_RE = r'\\[0-7]{1,3}'
537
538	PRINTF_B_DEF = ECHO_E_DEF + [
539	# \123 octal form is accepted as an extension
540	R(OCTAL3_RE, Id.Char_Octal3),
541	]
542
543	# https://json.org/
544
# Building blocks for JSON numbers.  _JSON_INT alone is an integer; the J8
# float rule concatenates all three.
#
# Note that [0-9] has to come second, because Python chooses the first match.
# The | is regex alternation: either a non-zero digit followed by more digits,
# or a single digit.
_JSON_INT = r'-?([1-9][0-9]*|[0-9])'  # Numbers can't start with leading 0
_JSON_FRACTION = r'(\.[0-9]+)?'  # optional .digits
_JSON_EXP = r'([eE][-+]?[0-9]+)?'  # optional exponent, e.g. e+3
549
550	# R5RS extended alphabetic characters
551	# https://groups.csail.mit.edu/mac/ftpdir/scheme-reports/r5rs-html/r5rs_4.html
552	#
553	# ! $ % & * + - . / : < = > ? @ ^ _ ~
554
555	# Description from Guile Scheme - https://www.gnu.org/software/guile/manual/html_node/Symbol-Read-Syntax.html
556	#
557	# "The read syntax for a symbol is a sequence of letters, digits, and extended
558	# alphabetic characters, beginning with a character that cannot begin a
559	# number. In addition, the special cases of +, -, and ... are read as symbols
560	# even though numbers can begin with +, - or ."
561	#
562	# (They should have used regular languages!)
563
564	# We take out $ and @ for our splicing syntax, i.e. $unquote and
565	# @unquote-splicing. And : for now because we use it for name:value.
566
567	# Also note Scheme allows |a b| for symbols with funny chars, and Guile scheme
568	# allows #{a b}#. We could use `a b` or (symbol "a b").
569
# Punctuation allowed in a J8 symbol.  The hyphen is kept last so that it is
# a literal, not a range, when spliced into a regex character class.
J8_SYMBOL_CHARS = r'!%&*+./<=>?^_~-'

# First char: a letter or symbol punctuation.  Remaining chars may also be
# digits.
J8_SYMBOL_RE = '[a-zA-Z%s][a-zA-Z0-9%s]*' % (
    J8_SYMBOL_CHARS, J8_SYMBOL_CHARS)
577
578	_J8_LEFT = [
579	C('"', Id.Left_DoubleQuote), # JSON string
580	C('j"', Id.Left_JDoubleQuote), # JSON string with explicit J8 prefix
581	# Three left quotes that are J8 only
582	C("u'", Id.Left_USingleQuote), # unicode string
583	C("'", Id.Left_USingleQuote), # '' is alias for u'' in data, not in code
584	C("b'", Id.Left_BSingleQuote), # byte string
585	]
586
587	J8_DEF = _J8_LEFT + [
588	C('[', Id.J8_LBracket),
589	C(']', Id.J8_RBracket),
590	C('{', Id.J8_LBrace),
591	C('}', Id.J8_RBrace),
592	C('(', Id.J8_LParen), # NIL8 only
593	C(')', Id.J8_RParen), # NIL8 only
594	C(',', Id.J8_Comma),
595	C(':', Id.J8_Colon),
596	C('null', Id.J8_Null),
597	C('true', Id.J8_Bool),
598	C('false', Id.J8_Bool),
599	R(_JSON_INT, Id.J8_Int),
600	R(_JSON_INT + _JSON_FRACTION + _JSON_EXP, Id.J8_Float),
601
602	# Identifier names come AFTER null true false.
603	# - Happens to be the same as shell identifier # names.
604	# - Note that JS allows $ as an identifier, but we don't.
605	# - Used for dict keys / NIL8 field names.
606	R(VAR_NAME_RE, Id.J8_Identifier),
607
608	# Symbol is a SUPERSET of Identifier. The first word in NIL8 can be
609	# either Symbol or plain Identifier, but field names can only be
610	# Identifier. JSON8 only has Identifier.
611	#R(J8_SYMBOL_RE, Id.J8_Symbol), # NIL8 only
612	R(r'[~!@$%^&*+=\|;./<>?-]+', Id.J8_Operator), # NIL8 only
613	R(r'[ \r\t]+', Id.Ignored_Space),
614	# A separate token, to count lines for error messages
615	C('\n', Id.Ignored_Newline),
616	# comment is # until end of line
617	# // comments are JavaScript style, but right now we might want them as
618	# symbols?
619	R(r'#[^\n\0]*', Id.Ignored_Comment), # J8 only (JSON8, NIL8)
620
621	# This will reject ASCII control chars
622	R(r'[^\0]', Id.Unknown_Tok),
623	]
624
625	# Exclude control characters 0x00-0x1f, aka 0-31 in J8 data only (not YSH code)
626	_ASCII_CONTROL = R(r'[\x01-\x1F]', Id.Char_AsciiControl)
627
628	J8_LINES_DEF = _J8_LEFT + [
629	# not sure if we want \r here - same with lex_mode_e.Expr
630	R(r'[ \r\t]+', Id.WS_Space),
631	R(r'[\n]', Id.J8_Newline),
632
633	# doesn't match \t, which means tabs are allowed in the middle of unquoted
634	# lines
635	_ASCII_CONTROL,
636
637	# not space or ' or " or ASCII control or EOF
638	R(r'''[^ \t\r\n'"\x00-\x1F]+''', Id.Lit_Chars),
639	]
640
641	# https://json.org list of chars
642	_JSON_ONE_CHAR = R(r'\\[\\"/bfnrt]', Id.Char_OneChar)
643
644	# b'' u'' strings - what's common between code and data.
645	_J8_STR_COMMON = [
646	C("'", Id.Right_SingleQuote), # end for J8
647	_JSON_ONE_CHAR,
648	C("\\'", Id.Char_OneChar), # since ' ends, allow \'
649	_CHAR_YHEX,
650	_U_BRACED_CHAR, # \u{123456} - J8 only
651
652	# osh/word_parse.py relies on this. It has to be consistent with $''
653	# lexing, which uses _C_STRING_COMMON
654	C('\\', Id.Unknown_Backslash),
655	]
656
657	# Lexer for J8 strings in CODE.
658	LEXER_DEF[lex_mode_e.J8_Str] = _J8_STR_COMMON + [
659	# Don't produce Char_AsciiControl tokens - that's only for data
660
661	# will match invalid UTF-8 - we have a separate validation step
662	R(r"[^\\'\0]+", Id.Lit_Chars),
663	]
664
665	# Lexer for J8 string data.
666	# ASCII control characters are disallowed in DATA, but not CODE!
667	J8_STR_DEF = _J8_STR_COMMON + [
668	_ASCII_CONTROL,
669	# will match invalid UTF-8 - we have a separate validation step
670	R(r"[^\\'\x00-\x1F]+", Id.Lit_Chars),
671	]
672
673	# Lexer for JSON string data - e.g. "json \" \u1234"
674	JSON_STR_DEF = [
675	C('"', Id.Right_DoubleQuote), # end for JSON
676	_JSON_ONE_CHAR,
677	_U4_CHAR_STRICT, # \u1234 - JSON only
678
679	# High surrogate [\uD800, \uDC00)
680	# Low surrogate [\uDC00, \uE000)
681	# This pattern makes it easier to decode. Unpaired surrogates become Id.Char_Unicode4.
682	R(
683	r'\\u[dD][89aAbB][0-9a-fA-F][0-9a-fA-F]\\u[dD][cCdDeEfF][0-9a-fA-F][0-9a-fA-F]',
684	Id.Char_SurrogatePair),
685	C('\\', Id.Unknown_Backslash), # e.g. the \ before bad \z
686	_ASCII_CONTROL,
687
688	# Note: This will match INVALID UTF-8. UTF-8 validation is another step.
689	R(r'[^\\"\x00-\x1F]+', Id.Lit_Chars),
690	]
691
692	_WHITESPACE = r'[ \t\r\n]*' # ASCII whitespace doesn't have legacy \f \v
693
694	SH_NUMBER_DEF = [
695	R('0', Id.ShNumber_Dec),
696	R(r'[1-9][0-9]*', Id.ShNumber_Dec),
697	R(r'0[0-7]+', Id.ShNumber_Oct),
698	R(r'0[xX][0-9A-Fa-f]+', Id.ShNumber_Hex),
699	R(r'[1-9][0-9]*#[0-9a-zA-Z@_]+', Id.ShNumber_BaseN),
700	R(r'[^\0]', Id.Unknown_Tok), # any other char
701	]
702
703	# https://www.gnu.org/software/bash/manual/html_node/Controlling-the-PromptEvaluator.html#Controlling-the-PromptEvaluator
704	PS1_DEF = [
705	R(OCTAL3_RE, Id.PS_Octal3),
706	R(r'\\[adehHjlnrstT@AuvVwW!#$\\]', Id.PS_Subst),
707	# \D{%H:%M} strftime format
708	R(r'\\D\{[^}\0]*\}', Id.PS_Subst),
709	C(r'\[', Id.PS_LBrace), # non-printing
710	C(r'\]', Id.PS_RBrace),
711	R(r'[^\\\0]+', Id.PS_Literals),
712	# e.g. \x is not a valid escape.
713	C('\\', Id.PS_BadBackslash),
714	]
715
716	# NOTE: Id.Ignored_LineCont is also not supported here, even though the whole
717	# point of it is that supports other backslash escapes like \n! It just
718	# becomes a regular backslash.
719	LEXER_DEF[lex_mode_e.SQ_C] = _C_STRING_COMMON + [
720	# Weird special case matching bash: backslash that ends a line. We emit
721	# this token literally in OSH, but disable it in YSH.
722	C('\\\n', Id.Unknown_Backslash),
723
724	# Silly difference! In echo -e, the syntax is \0377, but here it's $'\377',
725	# with no leading 0.
726	R(OCTAL3_RE, Id.Char_Octal3),
727
728	# ' and " are escaped in $'' mode, but not echo -e.
729	C(r"\'", Id.Char_OneChar),
730	C(r'\"', Id.Char_OneChar),
731
732	# e.g. 'foo', anything that's not a backslash escape or '
733	R(r"[^\\'\0]+", Id.Lit_Chars),
734	C("'", Id.Right_SingleQuote),
735	]
736
737	LEXER_DEF[lex_mode_e.PrintfOuter] = _C_STRING_COMMON + [
738	R(OCTAL3_RE, Id.Char_Octal3),
739	R(r"[^%\\\0]+", Id.Lit_Chars),
740	C('%%', Id.Format_EscapedPercent),
741	C('%', Id.Format_Percent),
742	]
743
# Rules for the part of a printf format spec that follows '%': flags, width,
# precision and the conversion type.
#
# Maybe: bash also supports %(strftime)T
LEXER_DEF[lex_mode_e.PrintfPercent] = [
    # Flags
    R('[- +#]', Id.Format_Flag),
    C('0', Id.Format_Zero),
    R('[1-9][0-9]*', Id.Format_Num),
    C('*', Id.Format_Star),
    C('.', Id.Format_Dot),
    # We support dsq.  The others we parse to display an error message.
    R('[disqbcouxXeEfFgG]', Id.Format_Type),
    # Literal '(' , any chars other than parens or NUL, literal ')' , then 'T'
    R(r'\([^()\0]*\)T', Id.Format_Time),
    R(r'[^\0]', Id.Unknown_Tok),  # any other char
]
757
758	LEXER_DEF[lex_mode_e.VSub_1] = [
759	R(VAR_NAME_RE, Id.VSub_Name),
760	# ${11} is valid, compared to $11 which is $1 and then literal 1.
761	R(r'[0-9]+', Id.VSub_Number),
762	C('!', Id.VSub_Bang),
763	C('@', Id.VSub_At),
764	C('#', Id.VSub_Pound),
765	C('$', Id.VSub_Dollar),
766	C('*', Id.VSub_Star),
767	C('-', Id.VSub_Hyphen),
768	C('?', Id.VSub_QMark),
769	C('.', Id.VSub_Dot), # ${.myproc builtin sub}
770	C('}', Id.Right_DollarBrace),
771	C('\\\n', Id.Ignored_LineCont),
772	C('\n', Id.Unknown_Tok), # newline not allowed inside ${}
773	R(r'[^\0]', Id.Unknown_Tok), # any char except newline
774	]
775
776	LEXER_DEF[lex_mode_e.VSub_2] = \
777	ID_SPEC.LexerPairs(Kind.VTest) + \
778	ID_SPEC.LexerPairs(Kind.VOp0) + \
779	ID_SPEC.LexerPairs(Kind.VOpYsh) + \
780	ID_SPEC.LexerPairs(Kind.VOp1) + \
781	ID_SPEC.LexerPairs(Kind.VOp2) + \
782	ID_SPEC.LexerPairs(Kind.VOp3) + [
783	C('}', Id.Right_DollarBrace),
784
785	C('\\\n', Id.Ignored_LineCont),
786	C('\n', Id.Unknown_Tok), # newline not allowed inside ${}
787	R(r'[^\0]', Id.Unknown_Tok), # any char except newline
788	]
789
790	_EXPR_ARITH_SHARED = [
791	C('\\\n', Id.Ignored_LineCont),
792	R(r'[^\0]', Id.Unknown_Tok) # any char. This should be a syntax error.
793	]
794
795	# https://www.gnu.org/software/bash/manual/html_node/Shell-Arithmetic.html#Shell-Arithmetic
796	LEXER_DEF[lex_mode_e.Arith] = \
797	_LEFT_SUBS + _VARS + _LEFT_UNQUOTED + [
798
799	# Arithmetic expressions can cross newlines.
800	R(r'[ \t\r\n]+', Id.Ignored_Space),
801
802	# Examples of arith constants:
803	# 64#azAZ
804	# 0xabc 0xABC
805	# 0123
806	# A separate digits token makes this easier to parse STATICALLY. But this
807	# doesn't help with DYNAMIC parsing.
808	R(VAR_NAME_RE, Id.Lit_ArithVarLike), # for variable names or 64#_
809	R(r'[0-9]+', Id.Lit_Digits),
810	C('@', Id.Lit_At), # for 64#@ or ${a[@]}
811	C('#', Id.Lit_Pound), # for 64#a
812
813	# TODO: 64#@ interferes with VS_AT. Hm.
814	] + ID_SPEC.LexerPairs(Kind.Arith) + _EXPR_ARITH_SHARED
815
816	# A lexer for the parser that converts globs to extended regexes. Since we're
817	# only parsing character classes ([^[:space:][:alpha:]]) as opaque blobs, we
818	# don't need lexer modes here.
819	GLOB_DEF = [
820	# These could be operators in the glob, or just literals in a char class,
821	# e.g. touch '?'; echo [?].
822	C('*', Id.Glob_Star),
823	C('?', Id.Glob_QMark),
824
825	# For negation. Treated as operators inside [], but literals outside.
826	C('!', Id.Glob_Bang),
827	C('^', Id.Glob_Caret),
828
829	# Character classes.
830	C('[', Id.Glob_LBracket),
831	C(']', Id.Glob_RBracket),
832
833	# There is no whitelist of characters; backslashes are unconditionally
834	# removed. With libc.fnmatch(), the pattern r'\f' matches 'f' but not '\\f'.
835	# See libc_test.py.
836	R(r'\\[^\0]', Id.Glob_EscapedChar),
837	C('\\', Id.Glob_BadBackslash), # Trailing single backslash
838
839	# For efficiency, combine other characters into a single token, e.g. 'py' in
840	# '*.py' or 'alpha' in '[[:alpha:]]'.
841	R(r'[a-zA-Z0-9_]+', Id.Glob_CleanLiterals), # no regex escaping
842	R(r'[^\0]', Id.Glob_OtherLiteral), # anything else -- examine the char
843	]
844
845	# History expansion. We're doing this as "pre-lexing" since that's what bash
846	# and zsh seem to do. Example:
847	#
848	# $ foo=x
849	# $ echo $
850	# $ !!foo # expands to echo $foo and prints x
851	#
852	# We can also reuse this in the RootCompleter to expand history interactively.
853	#
854	# bash note: handled in lib/readline/histexpand.c. Quite messy and handles
855	# quotes AGAIN.
856	#
857	# Note: \! gets expanded to literal \! for the real lexer, but no history
858	# expansion occurs.
859
860	HISTORY_DEF = [
861	# Common operators.
862	R(r'![!*^$]', Id.History_Op),
863
864	# By command number.
865	R(r'!-?[0-9]+', Id.History_Num),
866
867	# Search by prefix of substring (optional '?').
868	# NOTE: there are no numbers allowed here! Bash doesn't seem to support it.
869	# No hyphen since it conflicts with $-1 too.
870	#
871	# Required trailing whitespace is there to avoid conflict with [!charclass]
872	# and ${!indirect}. This is a simpler hack than the one bash has. See
873	# frontend/lex_test.py.
874	R(r'!\??[a-zA-Z_/.][0-9a-zA-Z_/.]+[ \t\r\n]', Id.History_Search),
875
876	# Comment is until end of line
877	R(r"#[^\0]*", Id.History_Other),
878
879	# Single quoted, e.g. 'a' or $'\n'. Terminated by another single quote or
880	# end of string.
881	R(r"'[^'\0]*'?", Id.History_Other),
882
883	# Runs of chars that are definitely not special
884	R(r"[^!\\'#\0]+", Id.History_Other),
885
886	# Escaped characters. \! disables history
887	R(r'\\[^\0]', Id.History_Other),
888	# Other single chars, like a trailing \ or !
889	R(r'[^\0]', Id.History_Other),
890	]
891
892	BRACE_RANGE_DEF = [
893	R(r'-?[0-9]+', Id.Range_Int),
894	R(r'[a-zA-Z]', Id.Range_Char), # just a single character
895	R(r'\.\.', Id.Range_Dots),
896	R(r'[^\0]', Id.Range_Other), # invalid
897	]
898
899	#
900	# YSH lexing
901	#
902
903	# Valid in lex_mode_e.{Expr,DQ}
904	# Used by ysh/grammar_gen.py
905	YSH_LEFT_SUBS = [
906	C('$(', Id.Left_DollarParen),
907	C('${', Id.Left_DollarBrace),
908	C('$[', Id.Left_DollarBracket),
909	]
910
911	# Valid in lex_mode_e.Expr, but not valid in DQ
912	# Used by ysh/grammar_gen.py
913
# Tokens that open a quoted string or a sub-expression.  Every entry is a
# C() constant string, so no regex escaping is needed.
YSH_LEFT_UNQUOTED = [
    # Double quoted
    C('"', Id.Left_DoubleQuote),
    C('$"', Id.Left_DollarDoubleQuote),  # $"" is synonym for ""
    C('j"', Id.Left_JDoubleQuote),  # for printing ERROR
    # Single quoted
    C("'", Id.Left_SingleQuote),
    C("r'", Id.Left_RSingleQuote),
    C("u'", Id.Left_USingleQuote),
    C("b'", Id.Left_BSingleQuote),
    C("$'", Id.Left_DollarSingleQuote),  # legacy
    C('^"', Id.Left_CaretDoubleQuote),
    C('"""', Id.Left_TDoubleQuote),
    C('$"""', Id.Left_DollarTDoubleQuote),
    # In expression mode, we add the r'' and c'' prefixes for '' and $''.
    C("'''", Id.Left_TSingleQuote),
    C("r'''", Id.Left_RTSingleQuote),
    C("u'''", Id.Left_UTSingleQuote),
    C("b'''", Id.Left_BTSingleQuote),
    C('@(', Id.Left_AtParen),  # Split Command Sub
    C('@[', Id.Left_AtBracket),  # Array splice in expression mode
    C('^(', Id.Left_CaretParen),  # Block literals in expression mode
    C('^[', Id.Left_CaretBracket),  # Expr literals
    C('^{', Id.Left_CaretBrace),  # Unused
    C(':|', Id.Left_ColonPipe),  # shell-like word arrays.

    # DEPRECATED syntax for :| sh array |
    C('%(', Id.Left_PercentParen),
    # May not use these
    C('%[', Id.Expr_Reserved),
    C('%{', Id.Expr_Reserved),  # Table literals?  Vertical dict?
    C('@{', Id.Expr_Reserved),
]
947
# Used by ysh/grammar_gen.py
#
# Statement terminator plus the three kinds of grouping brackets.
EXPR_OPS = [
    # Terminator
    C(';', Id.Op_Semi),
    C('(', Id.Op_LParen),
    C(')', Id.Op_RParen),
    # Note: type expressions are expressions, e.g. Dict[Str, Int]
    C('[', Id.Op_LBracket),
    C(']', Id.Op_RBracket),
    C('{', Id.Op_LBrace),
    C('}', Id.Op_RBrace),
]
960
# Newline is significant, but sometimes elided by expr_parse.py.
# Comments run up to (not including) the newline, so the newline is still
# emitted as its own Id.Op_Newline token.
_EXPR_NEWLINE_COMMENT = [
    C('\n', Id.Op_Newline),
    R(r'#[^\n\0]*', Id.Ignored_Comment),
    # Like lex_mode_e.Arith, \r is whitespace even without \n
    R(r'[ \t\r]+', Id.Ignored_Space),
]
968
# Note: if you call match.LooksLikeInteger(s), mops.FromStr(s) may still
# fail.  However you should call BOTH, because we don't want to rely on
# the underlying strtoll() to define the language accepted.
#
# Pattern: an optional '-' and one or more decimal digits, with _WHITESPACE
# allowed on both sides.
LOOKS_LIKE_INTEGER = _WHITESPACE + '-?[0-9]+' + _WHITESPACE
973
# TODO: use for YSH comparison operators > >= < <=
#
# Python allows 0 to be written 00 or 0_0_0, which is weird.  But let's be
# consistent, and avoid '00' turning into a float!
#
# Digits may be separated by single underscores (1_000); a leading, trailing
# or doubled underscore does not match.
_YSH_DECIMAL_INT_RE = r'[0-9](_?[0-9])*'

# Same as above with an optional leading '-' and surrounding _WHITESPACE.
LOOKS_LIKE_YSH_INT = _WHITESPACE + '-?' + _YSH_DECIMAL_INT_RE + _WHITESPACE
981
# Decimal integer part, then an optional fraction and an optional exponent.
# Both pieces are raw strings so that '\.' and '\-' reach the regex engine
# unchanged and never trigger Python's invalid-escape-sequence warning.
_YSH_FLOAT_RE = (
    _YSH_DECIMAL_INT_RE +
    # Unlike Python, exponent can't be like 42e5_000.  There's no use because
    # 1e309 is already inf.  Let's keep our code simple.
    r'(\.' + _YSH_DECIMAL_INT_RE + r')?([eE][+\-]?[0-9]+)?')

# Ditto, used for YSH comparison operators.
# Accepts an optional leading '-'.
# Example: -3_000_000.000_001e12
LOOKS_LIKE_YSH_FLOAT = _WHITESPACE + '-?' + _YSH_FLOAT_RE + _WHITESPACE

# Python 3 float literals:

# digitpart     ::=  digit (["_"] digit)*
# fraction      ::=  "." digitpart
# exponent      ::=  ("e" | "E") ["+" | "-"] digitpart
# pointfloat    ::=  [digitpart] fraction | digitpart "."
# exponentfloat ::=  (digitpart | pointfloat) exponent
# floatnumber   ::=  pointfloat | exponentfloat
1001
# NOTE: Borrowing tokens from Arith (i.e. $(( )) ), but not using LexerPairs().
#
# The rule list for expression mode is the concatenation of the shared tables
# named below, the literal/operator rules written inline, and finally the
# newline/comment and arith-shared tables.
LEXER_DEF[lex_mode_e.Expr] = \
    _VARS + YSH_LEFT_SUBS + YSH_LEFT_UNQUOTED + EXPR_OPS + EXPR_WORDS + \
    EXPR_CHARS + [

    # https://docs.python.org/3/reference/lexical_analysis.html#integer-literals
    #
    # integer      ::=  decinteger | bininteger | octinteger | hexinteger
    # decinteger   ::=  nonzerodigit (["_"] digit)* | "0"+ (["_"] "0")*
    # bininteger   ::=  "0" ("b" | "B") (["_"] bindigit)+
    # octinteger   ::=  "0" ("o" | "O") (["_"] octdigit)+
    # hexinteger   ::=  "0" ("x" | "X") (["_"] hexdigit)+
    # nonzerodigit ::=  "1"..."9"
    # digit        ::=  "0"..."9"
    # bindigit     ::=  "0" | "1"
    # octdigit     ::=  "0"..."7"
    # hexdigit     ::=  digit | "a"..."f" | "A"..."F"

    R(_YSH_DECIMAL_INT_RE, Id.Expr_DecInt),

    R(r'0[bB](_?[01])+', Id.Expr_BinInt),
    R(r'0[oO](_?[0-7])+', Id.Expr_OctInt),
    R(r'0[xX](_?[0-9a-fA-F])+', Id.Expr_HexInt),

    R(_YSH_FLOAT_RE, Id.Expr_Float),

    # These can be looked up as keywords separately, so you enforce that they have
    # space around them?
    R(VAR_NAME_RE, Id.Expr_Name),

    R('%' + VAR_NAME_RE, Id.Expr_Symbol),

    #
    # Arith
    #

    C(',', Id.Arith_Comma),
    C(':', Id.Arith_Colon),  # for slicing a[1:2], and mylist:pop()

    C('?', Id.Arith_QMark),  # regex postfix

    C('+', Id.Arith_Plus),  # arith infix, regex postfix
    C('-', Id.Arith_Minus),  # arith infix, regex postfix
    C('*', Id.Arith_Star),
    C('^', Id.Arith_Caret),  # xor
    C('/', Id.Arith_Slash),
    C('%', Id.Arith_Percent),

    C('**', Id.Arith_DStar),  # exponentiation
    C('++', Id.Arith_DPlus),  # Option for string/list concatenation

    C('<', Id.Arith_Less),
    C('>', Id.Arith_Great),
    C('<=', Id.Arith_LessEqual),
    C('>=', Id.Arith_GreatEqual),
    C('===', Id.Expr_TEqual),
    C('!==', Id.Expr_NotDEqual),

    C('==', Id.Unknown_DEqual),  # user must choose === or ~==

    C('&&', Id.Unknown_DAmp),
    # C() patterns are literal strings, so the pipe must not be
    # backslash-escaped here or in the two pipe rules further down.
    C('||', Id.Unknown_DPipe),

    # Bitwise operators
    C('&', Id.Arith_Amp),
    C('|', Id.Arith_Pipe),
    C('>>', Id.Arith_DGreat),
    C('<<', Id.Arith_DLess),  # Doesn't Java also have <<< ?

    # Bitwise complement, as well as infix pattern matching
    C('~', Id.Arith_Tilde),
    C('!~', Id.Expr_NotTilde),
    C('~~', Id.Expr_DTilde),
    C('!~~', Id.Expr_NotDTilde),

    # Left out for now:
    # ++ --       -- needed for loops, awk?
    # ! && ||     -- needed for find dialect
    # = += etc.

    C('=', Id.Arith_Equal),

    C('+=', Id.Arith_PlusEqual),
    C('-=', Id.Arith_MinusEqual),
    C('*=', Id.Arith_StarEqual),
    C('/=', Id.Arith_SlashEqual),
    C('%=', Id.Arith_PercentEqual),

    C('>>=', Id.Arith_DGreatEqual),
    C('<<=', Id.Arith_DLessEqual),
    C('&=', Id.Arith_AmpEqual),
    C('|=', Id.Arith_PipeEqual),
    # NOTE(review): this was labelled "Exponentiation", but '^' is marked xor
    # above and '**' / '**=' are the exponentiation forms -- confirm intent.
    C('^=', Id.Arith_CaretEqual),

    # Augmented assignment that YSH has, but sh and OSH don't have
    C('**=', Id.Expr_DStarEqual),
    C('//=', Id.Expr_DSlashEqual),

    #
    # Expr
    #

    C('!', Id.Expr_Bang),  # For eggex negation

    C('//', Id.Expr_DSlash),  # For YSH integer division
    C('~==', Id.Expr_TildeDEqual),  # approximate equality

    C('.', Id.Expr_Dot),  # d.key is alias for d['key']
    C('..', Id.Unknown_DDot),  # legacy half-open range 1..5
    C('..<', Id.Expr_DDotLessThan),  # half-open range 1..<5
    C('..=', Id.Expr_DDotEqual),  # closed range 1..=5
    C('->', Id.Expr_RArrow),  # s->startswith()
    C('$', Id.Expr_Dollar),  # legacy regex end: /d+ $/ (better written /d+ >/)

    # Reserved this.  Go uses it for channels, etc.
    # I guess it conflicts with -4<-3, but that's OK -- spaces suffices.
    C('<-', Id.Expr_Reserved),
    C('=>', Id.Expr_RDArrow),  # for df => filter(age > 10)
    # and match (x) { 1 => "one" }
    # note: other languages use |>
    # R/dplyr uses %>%

    C('...', Id.Expr_Ellipsis),  # f(...args) and maybe a[:, ...]

    # For multiline regex literals?
    C('///', Id.Expr_Reserved),

    # Splat operators
    C('@', Id.Expr_At),
    # NOTE: Unused
    C('@@', Id.Expr_DoubleAt),
] + _EXPR_NEWLINE_COMMENT + _EXPR_ARITH_SHARED
1134
# Lookahead mode: is the next thing an empty pair of parentheses?
LEXER_DEF[lex_mode_e.FuncParens] = [
    # () with spaces: optional blanks before '(' and between '(' and ')'.
    # Both parens are backslash-escaped because this is a regex rule (R).
    R(r'[ \t]*\([ \t]*\)', Id.LookAhead_FuncParens),
    # anything else
    R(r'[^\0]', Id.Unknown_Tok)
]