frontend/lexer

OILS / frontend / lexer_def.py View on Github | oils.pub

1131 lines, 578 significant

1	"""
2	lexer_def.py - Lexer for OSH, YSH, and J8 Notation.
3
4	This lexer has lexer MODES, each with a regex -> Id mapping.
5
6	After changing this file, run:
7
8	build/py.sh all
9
10	or at least:
11
12	build/py.sh fastlex
13
14	Input Handling
15	--------------
16
17	Every line is NUL terminated:
18
19	'one\n\0' 'last line\0'
20
21	which means that no regexes below should match \0.
22
23	For example, use [^'\0]+ instead of [^']+ .
24
25	If this rule isn't followed, we would read uninitialized memory past the
26	sentinel. Python's regex engine knows where the end of the input string is, so
27	it doesn't need a sentinel like \0.
28
29	The generator frontend/lexer_gen.py adds a pattern mapping \0 to Id.Eol_Tok.
30	"""
31
32	from _devbuild.gen.id_kind_asdl import Id, Id_t, Kind
33	from _devbuild.gen.types_asdl import lex_mode_e
34
35	from frontend import id_kind_def
36
37	from typing import Tuple
38
39	# Initialize spec that the lexer depends on.
40	ID_SPEC = id_kind_def.IdSpec({}, {})
41
42	id_kind_def.AddKinds(ID_SPEC)
43	id_kind_def.AddBoolKinds(ID_SPEC) # must come second
44	id_kind_def.SetupTestBuiltin(ID_SPEC, {}, {}, {})
45
46
def C(pat, tok_type):
    # type: (str, Id_t) -> Tuple[bool, str, Id_t]
    """Lexer rule with a constant string, e.g. C('$*', VSub_Star).

    Returns a (is_regex, pattern, token type) triple; is_regex is False, so
    `pat` is meant to be matched literally rather than as a regex.
    """
    return (False, pat, tok_type)
51
52
def R(pat, tok_type):
    # type: (str, Id_t) -> Tuple[bool, str, Id_t]
    r"""Lexer rule with a regex string, e.g. R(r'\$[0-9]', VSub_Number).

    Returns a (is_regex, pattern, token type) triple; is_regex is True, so
    `pat` is meant to be interpreted as a regular expression.
    """
    # The docstring is raw because it contains the backslash escape \$,
    # which is not a valid escape in an ordinary string literal.
    return (True, pat, tok_type)
57
58
59	# utf8, utf-8, UTF8, UTF-8, etc.
60	IS_UTF8_CODESET_RE = r'[uU][tT][fF]-?8'
61
# See unit tests in frontend/match_test.py.
#
# Matches a '#!' line in which 'sh' is followed by a whitespace character,
# e.g. '#!/bin/sh\n' or '#!/usr/bin/env bash\n'.
#
# The first [^\0]* skips whatever precedes 'sh' (e.g. '/bin/' or
# '/usr/bin/env ba').  We need the trailing [^\0]* because the re2c
# translation assumes the pattern is anchored like $.
SHOULD_HIJACK_RE = r'#![^\0]*sh[ \t\r\n][^\0]*'
65
66	# Separates words (\r is not whitespace here)
67	_SIGNIFICANT_SPACE = R(r'[ \t]+', Id.WS_Space)
68
69	_BACKSLASH = [
70	# To be conservative, we could deny a set of chars similar to
71	# _LITERAL_WHITELIST_REGEX, rather than allowing all the operator characters
72	# like \( and \;.
73	#
74	# strict_backslash makes this stricter.
75	R(r'\\[^\n\0]', Id.Lit_EscapedChar),
76	C('\\\n', Id.Ignored_LineCont),
77	]
78
79	# Only 4 characters are backslash escaped inside "".
80	# https://www.gnu.org/software/bash/manual/bash.html#Double-Quotes
81	_DQ_ESCAPED_CHAR = R(r'\\[$`"\\]', Id.Lit_EscapedChar)
82
83	VAR_NAME_RE = r'[a-zA-Z_][a-zA-Z0-9_]*'
84
85	# All Kind.VSub
86	_VARS = [
87	# Unbraced variables
88	R(r'\$' + VAR_NAME_RE, Id.VSub_DollarName),
89	R(r'\$[0-9]', Id.VSub_Number),
90	C(r'$!', Id.VSub_Bang),
91	C(r'$@', Id.VSub_At),
92	C(r'$#', Id.VSub_Pound),
93	C(r'$$', Id.VSub_Dollar),
94	C(r'$*', Id.VSub_Star),
95	C(r'$-', Id.VSub_Hyphen),
96	C(r'$?', Id.VSub_QMark),
97	]
98
# Kind.Left that are valid in double-quoted modes.

_LEFT_SUBS = [
    C('`', Id.Left_Backtick),
    C('$(', Id.Left_DollarParen),
    C('${', Id.Left_DollarBrace),
    # Parse zsh syntax, but don't execute it.
    # The examples we've seen so far are like ${(%):-} and ${(m)
    #
    # The regex is: literal '${', literal '(', one or more chars that are
    # neither ')' nor NUL, then literal ')'.
    R(r'\$\{\([^)\0]+\)', Id.Left_DollarBraceZsh),
    C('$((', Id.Left_DollarDParen),
    C('$[', Id.Left_DollarBracket),
]
111
112	# Additional Kind.Left that are valid in unquoted modes.
113	_LEFT_UNQUOTED = [
114	C('"', Id.Left_DoubleQuote),
115	C("'", Id.Left_SingleQuote),
116	C('$"', Id.Left_DollarDoubleQuote),
117	C("$'", Id.Left_DollarSingleQuote),
118	]
119
120	_LEFT_PROCSUB = [
121	C('<(', Id.Left_ProcSubIn),
122	C('>(', Id.Left_ProcSubOut),
123	]
124
125	# The regexes below are in Python syntax, but are translated to re2c syntax by
126	# frontend/lexer_gen.py.
127	#
128	# http://re2c.org/manual/syntax/syntax.html
129	# https://docs.python.org/2/library/re.html
130	#
131	# We use a limited set of constructs:
132	# - + and * for repetition
133	# - Character classes [] with simple ranges and negation
134	# - Escapes like \n \0
135
136	LEXER_DEF = {}
137
138	# Anything until the end of the line is a comment. Does not match the newline
139	# itself. We want to switch modes and possibly process Op_Newline for here
140	# docs, etc.
141	LEXER_DEF[lex_mode_e.Comment] = [R(r'[^\n\0]*', Id.Ignored_Comment)]
142
143	# A whitelist to make bigger Lit_Chars tokens. We don't want one byte at a time.
144	#
145	# The shell language says that "any other byte" is a literal character --
146	# for example, unquoted $ \ ! are literal, not a syntax error.
147	#
148	# That is, a literal is defined NEGATIVELY, for single characters. But here
149	# we define a SUBSET of literal chars POSITIVELY.
150
151	# The range \x80-\xff makes sure that UTF-8 sequences are a single token.
152	_LITERAL_WHITELIST_REGEX = r'[\x80-\xffa-zA-Z0-9_.\-]+'
153
# Rules shared by the unquoted modes: backslash escapes, all Kind.Left
# openers, variable subs, then literals and operators.  The final catch-all
# turns any other non-NUL byte into Lit_Other.
_UNQUOTED = _BACKSLASH + _LEFT_SUBS + _LEFT_UNQUOTED + _LEFT_PROCSUB + _VARS + [
    # NOTE: We could add anything 128 and above to this character class?  So
    # utf-8 characters don't get split?
    R(_LITERAL_WHITELIST_REGEX, Id.Lit_Chars),
    C('~', Id.Lit_Tilde),  # for tilde sub
    C('/', Id.Lit_Slash),  # also for tilde sub
    C(':', Id.Lit_Colon),  # for special PATH=a:~foo tilde detection
    C('$', Id.Lit_Dollar),  # shopt --set no_parse_dollar
    C('#', Id.Lit_Pound),  # For comments
    _SIGNIFICANT_SPACE,
    C('\n', Id.Op_Newline),
    C('&', Id.Op_Amp),
    # C() rules are constant strings, not regexes, so the pipe must NOT be
    # backslash-escaped here.
    C('|', Id.Op_Pipe),
    C('|&', Id.Op_PipeAmp),
    C('&&', Id.Op_DAmp),
    C('||', Id.Op_DPipe),
    C(';', Id.Op_Semi),
    # Case terminators
    C(';;', Id.Op_DSemi),
    C(';&', Id.Op_SemiAmp),
    C(';;&', Id.Op_DSemiAmp),
    C('(', Id.Op_LParen),
    C(')', Id.Op_RParen),
    R(r'[^\0]', Id.Lit_Other),  # any other single char is a literal
]
179
180	# In lex_mode_e.{ShCommand,DBracket}
181	_EXTGLOB_BEGIN = [
182	C(',(', Id.ExtGlob_Comma), # YSH synonym for @(...)
183	C('@(', Id.ExtGlob_At),
184	C('*(', Id.ExtGlob_Star),
185	C('+(', Id.ExtGlob_Plus),
186	C('?(', Id.ExtGlob_QMark),
187	C('!(', Id.ExtGlob_Bang),
188	]
189
190	KEYWORDS = [
191	# NOTE: { is matched elsewhere
192	C('[[', Id.KW_DLeftBracket),
193	C('!', Id.KW_Bang),
194	C('for', Id.KW_For),
195	C('while', Id.KW_While),
196	C('until', Id.KW_Until),
197	C('do', Id.KW_Do),
198	C('done', Id.KW_Done),
199	C('in', Id.KW_In),
200	C('case', Id.KW_Case),
201	C('esac', Id.KW_Esac),
202	C('if', Id.KW_If),
203	C('fi', Id.KW_Fi),
204	C('then', Id.KW_Then),
205	C('else', Id.KW_Else),
206	C('elif', Id.KW_Elif),
207	C('function', Id.KW_Function),
208	C('time', Id.KW_Time),
209
210	# YSH
211	C('const', Id.KW_Const), # maybe remove this
212	C('var', Id.KW_Var),
213	C('setvar', Id.KW_SetVar),
214	C('setglobal', Id.KW_SetGlobal),
215	C('call', Id.KW_Call),
216	C('proc', Id.KW_Proc),
217	C('typed', Id.KW_Typed),
218	C('func', Id.KW_Func),
219	]
220
221	# These are treated like builtins in bash, but keywords in OSH. However, we
222	# maintain compatibility with bash for the 'type' builtin.
223	CONTROL_FLOW = [
224	C('break', Id.ControlFlow_Break),
225	C('continue', Id.ControlFlow_Continue),
226	C('return', Id.ControlFlow_Return),
227	C('exit', Id.ControlFlow_Exit),
228	]
229
230	# Used by ysh/grammar_gen.py too
231	EXPR_WORDS = [
232	C('null', Id.Expr_Null),
233	C('true', Id.Expr_True),
234	C('false', Id.Expr_False),
235	C('and', Id.Expr_And),
236	C('or', Id.Expr_Or),
237	C('not', Id.Expr_Not),
238	C('for', Id.Expr_For),
239	C('is', Id.Expr_Is),
240	C('in', Id.Expr_In),
241	C('if', Id.Expr_If),
242	C('else', Id.Expr_Else),
243
244	# Unused: could be for func and proc literals
245	#
246	# Note: we also have lambda literals |x| x+1
247	# I don't think we need them now, but the difference vs func is that the
248	# body is an expression. Note: JavaScript uses (x, y) => x + y which
249	# causes parsing problems.
250	C('func', Id.Expr_Func),
251	C('proc', Id.Expr_Proc),
252
253	# / <capture d+/
254	C('capture', Id.Expr_Capture),
255	# / <capture d+ as date> /
256	C('as', Id.Expr_As),
257	]
258
259	FD_VAR_NAME = r'\{' + VAR_NAME_RE + r'\}'
260
261	# file descriptors can only have two digits, like mksh
262	# dash/zsh/etc. can have one
263	FD_NUM = r'[0-9]?[0-9]?'
264
# These rules must be recognized in ShCommand mode, but can't be nested
# within [[.
# Keywords have to be checked before _UNQUOTED so we get <KW_If "if"> instead
# of <Lit_Chars "if">.
LEXER_DEF[lex_mode_e.ShCommand] = [
    # These four are not allowed within [[, so they are in ShCommand but not
    # _UNQUOTED.

    # e.g. beginning of NAME=val, which will always be longer than
    # _LITERAL_WHITELIST_REGEX.
    R(VAR_NAME_RE + r'\+?=', Id.Lit_VarLike),
    R(VAR_NAME_RE + r'\[', Id.Lit_ArrayLhsOpen),
    R(r'\]\+?=', Id.Lit_ArrayLhsClose),
    C('((', Id.Op_DLeftParen),

    # For static globbing, and [] for array literals
    C('[', Id.Lit_LBracket),  # e.g. A=(['x']=1)
    C(']', Id.Lit_RBracket),  # e.g. *.[ch]
    # NOTE: Glob_Star and Glob_QMark are for dynamic parsing
    C('*', Id.Lit_Star),
    C('?', Id.Lit_QMark),
    C('###', Id.Lit_TPound),  # like Lit_Pound, for doc comments
    C('...', Id.Lit_TDot),  # ... for multiline commands

    # For brace expansion {a,b}
    C('{', Id.Lit_LBrace),
    C('}', Id.Lit_RBrace),  # Also for var sub ${a}
    C(',', Id.Lit_Comma),
    C('=', Id.Lit_Equals),  # for = f(x) and x = 1+2*3
    C('@', Id.Lit_At),  # for detecting @[, @' etc. shopt -s parse_at_all
    R(FD_VAR_NAME, Id.Lit_RedirVarName),
    R(FD_NUM, Id.Lit_Number),

    # @array and @func(1, c)
    R('@' + VAR_NAME_RE, Id.Lit_Splice),  # for YSH splicing
    C('@[', Id.Lit_AtLBracket),  # @[split(x)]
    C('@{.', Id.Lit_AtLBraceDot),  # for split builtin sub @{.myproc arg1}
    R(r'<', Id.Redir_Less),
    R(r'>', Id.Redir_Great),
    R(r'<<', Id.Redir_DLess),
    R(r'<<<', Id.Redir_TLess),
    R(r'>>', Id.Redir_DGreat),
    R(r'<<-', Id.Redir_DLessDash),
    R(r'>&', Id.Redir_GreatAnd),
    R(r'<&', Id.Redir_LessAnd),
    R(r'<>', Id.Redir_LessGreat),
    # This is a regex rule, so the pipe is escaped exactly once: the pattern
    # matches the two characters '>|'.
    R(r'>\|', Id.Redir_Clobber),
    C(r'&>', Id.Redir_AndGreat),
    C(r'&>>', Id.Redir_AndDGreat),
] + KEYWORDS + CONTROL_FLOW + _UNQUOTED + _EXTGLOB_BEGIN
315
316	# Preprocessing before ShCommand
317	LEXER_DEF[lex_mode_e.Backtick] = [
318	C(r'`', Id.Backtick_Right),
319	# A backslash, and then $ or ` or \
320	R(r'\\[$`\\]', Id.Backtick_Quoted),
321	# \" treated specially, depending on whether backticks are double-quoted!
322	R(r'\\"', Id.Backtick_DoubleQuote),
323	R(r'[^`\\\0]+', Id.Backtick_Other), # contiguous run of literals
324	R(r'[^\0]', Id.Backtick_Other), # anything else
325	]
326
327	# DBracket: can be like ShCommand, except:
328	# - Don't really need redirects either... Redir_Less could be Op_Less
329	# - Id.Op_DLeftParen can't be nested inside.
330	LEXER_DEF[lex_mode_e.DBracket] = [
331	C(']]', Id.Lit_DRightBracket),
332	# Must be KW and not Op, because we can have stuff like [[ $foo == !* ]]
333	# in addition to [[ ! a && b ]]
334	C('!', Id.KW_Bang),
335	C('<', Id.Op_Less),
336	C('>', Id.Op_Great),
337	] + ID_SPEC.LexerPairs(Kind.BoolUnary) + \
338	ID_SPEC.LexerPairs(Kind.BoolBinary) + \
339	_UNQUOTED + _EXTGLOB_BEGIN
340
# Inside an extended glob, most characters are literals, including spaces and
# punctuation.  We also accept \, $var, ${var}, "", etc.  They can also be
# nested, so _EXTGLOB_BEGIN appears here.
#
# Example: echo @(<> <>|&&|'foo'|$bar)
LEXER_DEF[lex_mode_e.ExtGlob] = \
    _BACKSLASH + _LEFT_SUBS + _LEFT_UNQUOTED + _VARS + _EXTGLOB_BEGIN + [
        R(r'[^\\$`"\'|)@*+!?\0]+', Id.Lit_Chars),
        # Constant-string rule: a bare pipe, with no backslash.
        C('|', Id.Op_Pipe),
        C(')', Id.Op_RParen),  # may be translated to Id.ExtGlob_RParen
        R(r'[^\0]', Id.Lit_Other),  # everything else is literal
    ]
353
354	# Notes on BASH_REGEX states
355	#
356	# From bash manual:
357	#
358	# - Any part of the pattern may be quoted to force the quoted portion to be
359	# matched as a string.
360	# - Bracket expressions in regular expressions must be treated carefully, since
361	# normal quoting characters lose their meanings between brackets.
362	# - If the pattern is stored in a shell variable, quoting the variable
363	# expansion forces the entire pattern to be matched as a string.
364	#
365	# Is there a re.escape function? It's just like EscapeGlob and UnescapeGlob.
366	#
367	# bash code: ( | ) are special
368
369	LEXER_DEF[lex_mode_e.BashRegex] = _LEFT_SUBS + _LEFT_UNQUOTED + _VARS + [
370	# Like lex_mode_e.ShCommand
371	R(_LITERAL_WHITELIST_REGEX, Id.Lit_Chars),
372
373	# Tokens for Tilde sub. bash weirdness: RHS of [[ x =~ ~ ]] is expanded
374	C('~', Id.Lit_Tilde),
375	C('/', Id.Lit_Slash),
376
377	# Id.WS_Space delimits words. In lex_mode_e.BashRegexFakeInner, we
378	# translate them to Id.Lit_Chars.
379	_SIGNIFICANT_SPACE,
380
381	# Analogous to Id.ExtGlob_* - we need to change lexer modes when we hit this
382	C('(', Id.BashRegex_LParen),
383
384	# Not special, this is like lex_mode_e.ShCommand
385	C(')', Id.Op_RParen),
386
387	# Copied and adapted from _UNQUOTED
388	# \n & ; < > are parse errors OUTSIDE a group [[ s =~ ; ]]
389	# but become allowed INSIDE a group [[ s =~ (;) ]]
390	C('\n', Id.BashRegex_AllowedInParens),
391	C('&', Id.BashRegex_AllowedInParens),
392	C(';', Id.BashRegex_AllowedInParens),
393	C('>', Id.BashRegex_AllowedInParens),
394	C('<', Id.BashRegex_AllowedInParens),
395
396	# e.g. | is Id.Lit_Other, not pipe operator
397	R(r'[^\0]', Id.Lit_Other), # like _UNQUOTED, any other byte is literal
398	] + _BACKSLASH # These have to come after RegexMeta
399
400	LEXER_DEF[lex_mode_e.DQ] = [
401	_DQ_ESCAPED_CHAR,
402	C('\\\n', Id.Ignored_LineCont),
403	C('\\', Id.Lit_BadBackslash), # syntax error in YSH, but NOT in OSH
404	] + _LEFT_SUBS + _VARS + [
405	R(r'[^$`"\0\\]+', Id.Lit_Chars), # matches a line at most
406	C('$', Id.Lit_Dollar), # completion of var names relies on this
407	# NOTE: When parsing here doc line, this token doesn't end it.
408	C('"', Id.Right_DoubleQuote),
409	]
410
411	_VS_ARG_COMMON = [
412	C('/', Id.Lit_Slash), # for patsub (not Id.VOp2_Slash)
413	C('#', Id.Lit_Pound), # for patsub prefix (not Id.VOp1_Pound)
414	C('%', Id.Lit_Percent), # for patsdub suffix (not Id.VOp1_Percent)
415	C('}', Id.Right_DollarBrace), # For var sub "${a}"
416	C('$', Id.Lit_Dollar), # completion of var names relies on this
417	]
418
419	# We don't execute zsh var subs, but to find the closing } properly, we need to
420	# recognize \} and '}' and "}" $'}' etc.
421	LEXER_DEF[lex_mode_e.VSub_Zsh] = \
422	_BACKSLASH + _LEFT_SUBS + _LEFT_UNQUOTED + _LEFT_PROCSUB + \
423	[
424	C('}', Id.Right_DollarBrace), # For var sub "${a}"
425	R(r'[^\0]', Id.Lit_Other), # e.g. "$", must be last
426	]
427
428	# Kind.{Lit,Ignored,VSub,Left,Right,Eof}
429	LEXER_DEF[lex_mode_e.VSub_ArgUnquoted] = \
430	_BACKSLASH + _VS_ARG_COMMON + _LEFT_SUBS + _LEFT_UNQUOTED + _LEFT_PROCSUB + \
431	_VARS + _EXTGLOB_BEGIN + [
432
433	# Token for Tilde sub
434	C('~', Id.Lit_Tilde),
435
436	# - doesn't match ~ for tilde sub
437	# - doesn't match < and > so it doesn't eat <()
438	# - doesn't match @ ! ? + * so it doesn't eat _EXTGLOB_BEGIN -- ( alone is
439	# not enough
440	R(r'[^$`~/}"\'\0\\#%<>@!?+*]+', Id.Lit_Chars),
441	R(r'[^\0]', Id.Lit_Other), # e.g. "$", must be last
442	]
443
444	# Kind.{Lit,Ignored,VSub,Left,Right,Eof}
445	LEXER_DEF[lex_mode_e.VSub_ArgDQ] = [
446	_DQ_ESCAPED_CHAR,
447	C(r'\}', Id.Lit_EscapedChar), # For "${var-\}}"
448	C('\\\n', Id.Ignored_LineCont),
449	C('\\', Id.Lit_BadBackslash), # syntax error in YSH, but NOT in OSH
450	] + _VS_ARG_COMMON + _LEFT_SUBS + _VARS + [
451	R(r'[^$`/}"\0\\#%]+', Id.Lit_Chars), # matches a line at most
452
453	# Weird wart: even in double quoted state, double quotes are allowed
454	C('"', Id.Left_DoubleQuote),
455
456	# Another weird wart of bash/mksh: $'' is recognized but NOT ''!
457	C("$'", Id.Left_DollarSingleQuote),
458	]
459
460	# NOTE: Id.Ignored_LineCont is NOT supported in SQ state, as opposed to DQ
461	# state.
462	LEXER_DEF[lex_mode_e.SQ_Raw] = [
463	R(r"[^'\0]+", Id.Lit_Chars), # matches a line at most
464	C("'", Id.Right_SingleQuote),
465	]
466
467	# The main purpose for EXPR_CHARS is in regex literals, e.g. [a-z \t \n].
468	#
469	# In YSH expressions, Chars are code point integers, so \u{1234} is the same as
470	# 0x1234. And \0 is 0x0.
471
472	# In Python:
473	# chr(0x00012345) == u'\U00012345'
474	#
475	# In YSH:
476	# 0x00012345 == \u{12345}
477	# chr(0x00012345) == chr(\u{12345}) == $'\u{012345}'
478
479	_U_BRACED_CHAR = R(r'\\[uU]\{[0-9a-fA-F]{1,6}\}', Id.Char_UBraced)
480
481	_X_CHAR_LOOSE = R(r'\\x[0-9a-fA-F]{1,2}', Id.Char_Hex) # bash
482	_CHAR_YHEX = R(r'\\y[0-9a-fA-F]{2}', Id.Char_YHex) # \yff - J8 only
483
484	_U4_CHAR_LOOSE = R(r'\\u[0-9a-fA-F]{1,4}', Id.Char_Unicode4) # bash
485
486	_U4_CHAR_STRICT = R(r'\\u[0-9a-fA-F]{4}', Id.Char_Unicode4) # JSON-only
487
488	EXPR_CHARS = [
489	# Allow same backslash escapes as J8 strings, except;
490	# - legacy \b \f
491	# - unnecessary \/
492	#
493	# Note that \0 should be written \y00.
494	R(r'''\\[\\"'nrt]''', Id.Char_OneChar),
495	_CHAR_YHEX,
496
497	# LEGACY Eggex. This is a LITERAL translation to 0xff in ERE?
498	# This is a syntax error in a YSH expression - it doesn't handle the token
499	R(r'\\x[0-9a-fA-F]{2}', Id.Char_Hex),
500	_U_BRACED_CHAR,
501	]
502
503	# Shared between echo -e and $''.
504	_C_STRING_COMMON = [
505
506	# \x6 is valid in bash
507	_X_CHAR_LOOSE,
508	_U4_CHAR_LOOSE,
509	R(r'\\U[0-9a-fA-F]{1,8}', Id.Char_Unicode8),
510	R(r'\\[0abeEfrtnv\\]', Id.Char_OneChar),
511
512	# e.g. \A is not an escape, and \x doesn't match a hex escape. We allow it,
513	# but a lint tool could warn about it.
514	C('\\', Id.Unknown_Backslash),
515	]
516
517	ECHO_E_DEF = _C_STRING_COMMON + [
518	# Note: tokens above \0377 can either be truncated or be flagged a syntax
519	# error in strict mode.
520	R(r'\\0[0-7]{1,3}', Id.Char_Octal4),
521	C(r'\c', Id.Char_Stop),
522
523	# e.g. 'foo', anything that's not a backslash escape
524	R(r'[^\\\0]+', Id.Lit_Chars),
525	]
526
527	OCTAL3_RE = r'\\[0-7]{1,3}'
528
529	PRINTF_B_DEF = ECHO_E_DEF + [
530	# \123 octal form is accepted as an extension
531	R(OCTAL3_RE, Id.Char_Octal3),
532	]
533
534	# https://json.org/
535
# Note that [0-9] has to come second, because Python chooses the first match.
# The two alternatives are: a multi-digit number starting with 1-9, or a
# single digit (which is how a lone 0 is accepted).
_JSON_INT = r'-?([1-9][0-9]*|[0-9])'  # Numbers can't start with leading 0
_JSON_FRACTION = r'(\.[0-9]+)?'
_JSON_EXP = r'([eE][-+]?[0-9]+)?'
540
541	# R5RS extended alphabetic characters
542	# https://groups.csail.mit.edu/mac/ftpdir/scheme-reports/r5rs-html/r5rs_4.html
543	#
544	# ! $ % & * + - . / : < = > ? @ ^ _ ~
545
546	# Description from Guile Scheme - https://www.gnu.org/software/guile/manual/html_node/Symbol-Read-Syntax.html
547	#
548	# "The read syntax for a symbol is a sequence of letters, digits, and extended
549	# alphabetic characters, beginning with a character that cannot begin a
550	# number. In addition, the special cases of +, -, and ... are read as symbols
551	# even though numbers can begin with +, - or ."
552	#
553	# (They should have used regular languages!)
554
555	# We take out $ and @ for our splicing syntax, i.e. $unquote and
556	# @unquote-splicing. And : for now because we use it for name:value.
557
558	# Also note Scheme allows |a b| for symbols with funny chars, and Guile scheme
559	# allows #{a b}#. We could use `a b` or (symbol "a b").
560
561	J8_SYMBOL_CHARS = r'!%&*+./<=>?^_~-' # - is last for regex char class
562
563	# yapf: disable
564	J8_SYMBOL_RE = (
565	r'[a-zA-Z' + J8_SYMBOL_CHARS + ']' +
566	r'[a-zA-Z0-9' + J8_SYMBOL_CHARS + ']*')
567	# yapf: enable
568
569	_J8_LEFT = [
570	C('"', Id.Left_DoubleQuote), # JSON string
571	C('j"', Id.Left_JDoubleQuote), # JSON string with explicit J8 prefix
572	# Three left quotes that are J8 only
573	C("u'", Id.Left_USingleQuote), # unicode string
574	C("'", Id.Left_USingleQuote), # '' is alias for u'' in data, not in code
575	C("b'", Id.Left_BSingleQuote), # byte string
576	]
577
578	J8_DEF = _J8_LEFT + [
579	C('[', Id.J8_LBracket),
580	C(']', Id.J8_RBracket),
581	C('{', Id.J8_LBrace),
582	C('}', Id.J8_RBrace),
583	C('(', Id.J8_LParen), # NIL8 only
584	C(')', Id.J8_RParen), # NIL8 only
585	C(',', Id.J8_Comma),
586	C(':', Id.J8_Colon),
587	C('null', Id.J8_Null),
588	C('true', Id.J8_Bool),
589	C('false', Id.J8_Bool),
590	R(_JSON_INT, Id.J8_Int),
591	R(_JSON_INT + _JSON_FRACTION + _JSON_EXP, Id.J8_Float),
592
593	# Identifier names come AFTER null true false.
594	# - Happens to be the same as shell identifier # names.
595	# - Note that JS allows $ as an identifier, but we don't.
596	# - Used for dict keys / NIL8 field names.
597	R(VAR_NAME_RE, Id.J8_Identifier),
598
599	# Symbol is a SUPERSET of Identifier. The first word in NIL8 can be
600	# either Symbol or plain Identifier, but field names can only be
601	# Identifier. JSON8 only has Identifier.
602	#R(J8_SYMBOL_RE, Id.J8_Symbol), # NIL8 only
603	R(r'[~!@$%^&*+=\|;./<>?-]+', Id.J8_Operator), # NIL8 only
604	R(r'[ \r\t]+', Id.Ignored_Space),
605	# A separate token, to count lines for error messages
606	C('\n', Id.Ignored_Newline),
607	# comment is # until end of line
608	# // comments are JavaScript style, but right now we might want them as
609	# symbols?
610	R(r'#[^\n\0]*', Id.Ignored_Comment), # J8 only (JSON8, NIL8)
611
612	# This will reject ASCII control chars
613	R(r'[^\0]', Id.Unknown_Tok),
614	]
615
616	# Exclude control characters 0x00-0x1f, aka 0-31 in J8 data only (not YSH code)
617	_ASCII_CONTROL = R(r'[\x01-\x1F]', Id.Char_AsciiControl)
618
619	J8_LINES_DEF = _J8_LEFT + [
620	# not sure if we want \r here - same with lex_mode_e.Expr
621	R(r'[ \r\t]+', Id.WS_Space),
622	R(r'[\n]', Id.J8_Newline),
623
624	# doesn't match \t, which means tabs are allowed in the middle of unquoted
625	# lines
626	_ASCII_CONTROL,
627
628	# not space or ' or " or ASCII control or EOF
629	R(r'''[^ \t\r\n'"\x00-\x1F]+''', Id.Lit_Chars),
630	]
631
632	# https://json.org list of chars
633	_JSON_ONE_CHAR = R(r'\\[\\"/bfnrt]', Id.Char_OneChar)
634
635	# b'' u'' strings - what's common between code and data.
636	_J8_STR_COMMON = [
637	C("'", Id.Right_SingleQuote), # end for J8
638	_JSON_ONE_CHAR,
639	C("\\'", Id.Char_OneChar), # since ' ends, allow \'
640	_CHAR_YHEX,
641	_U_BRACED_CHAR, # \u{123456} - J8 only
642
643	# osh/word_parse.py relies on this. It has to be consistent with $''
644	# lexing, which uses _C_STRING_COMMON
645	C('\\', Id.Unknown_Backslash),
646	]
647
648	# Lexer for J8 strings in CODE.
649	LEXER_DEF[lex_mode_e.J8_Str] = _J8_STR_COMMON + [
650	# Don't produce Char_AsciiControl tokens - that's only for data
651
652	# will match invalid UTF-8 - we have a separate validation step
653	R(r"[^\\'\0]+", Id.Lit_Chars),
654	]
655
656	# Lexer for J8 string data.
657	# ASCII control characters are disallowed in DATA, but not CODE!
658	J8_STR_DEF = _J8_STR_COMMON + [
659	_ASCII_CONTROL,
660	# will match invalid UTF-8 - we have a separate validation step
661	R(r"[^\\'\x00-\x1F]+", Id.Lit_Chars),
662	]
663
664	# Lexer for JSON string data - e.g. "json \" \u1234"
665	JSON_STR_DEF = [
666	C('"', Id.Right_DoubleQuote), # end for JSON
667	_JSON_ONE_CHAR,
668	_U4_CHAR_STRICT, # \u1234 - JSON only
669
670	# High surrogate [\uD800, \uDC00)
671	# Low surrogate [\uDC00, \uE000)
672	# This pattern makes it easier to decode. Unpaired surrogates become Id.Char_Unicode4.
673	R(
674	r'\\u[dD][89aAbB][0-9a-fA-F][0-9a-fA-F]\\u[dD][cCdDeEfF][0-9a-fA-F][0-9a-fA-F]',
675	Id.Char_SurrogatePair),
676	C('\\', Id.Unknown_Backslash), # e.g. the \ before bad \z
677	_ASCII_CONTROL,
678
679	# Note: This will match INVALID UTF-8. UTF-8 validation is another step.
680	R(r'[^\\"\x00-\x1F]+', Id.Lit_Chars),
681	]
682
683	_WHITESPACE = r'[ \t\r\n]*' # ASCII whitespace doesn't have legacy \f \v
684
685	SH_NUMBER_DEF = [
686	R('0', Id.ShNumber_Dec),
687	R(r'[1-9][0-9]*', Id.ShNumber_Dec),
688	R(r'0[0-7]+', Id.ShNumber_Oct),
689	R(r'0[xX][0-9A-Fa-f]+', Id.ShNumber_Hex),
690	R(r'[1-9][0-9]*#[0-9a-zA-Z@_]+', Id.ShNumber_BaseN),
691	R(r'[^\0]', Id.Unknown_Tok), # any other char
692	]
693
694	# https://www.gnu.org/software/bash/manual/html_node/Controlling-the-PromptEvaluator.html#Controlling-the-PromptEvaluator
695	PS1_DEF = [
696	R(OCTAL3_RE, Id.PS_Octal3),
697	R(r'\\[adehHjlnrstT@AuvVwW!#$\\]', Id.PS_Subst),
698	# \D{%H:%M} strftime format
699	R(r'\\D\{[^}\0]*\}', Id.PS_Subst),
700	C(r'\[', Id.PS_LBrace), # non-printing
701	C(r'\]', Id.PS_RBrace),
702	R(r'[^\\\0]+', Id.PS_Literals),
703	# e.g. \x is not a valid escape.
704	C('\\', Id.PS_BadBackslash),
705	]
706
707	# NOTE: Id.Ignored_LineCont is also not supported here, even though the whole
708	# point of it is that it supports other backslash escapes like \n! It just
709	# becomes a regular backslash.
710	LEXER_DEF[lex_mode_e.SQ_C] = _C_STRING_COMMON + [
711	# Weird special case matching bash: backslash that ends a line. We emit
712	# this token literally in OSH, but disable it in YSH.
713	C('\\\n', Id.Unknown_Backslash),
714
715	# Silly difference! In echo -e, the syntax is \0377, but here it's $'\377',
716	# with no leading 0.
717	R(OCTAL3_RE, Id.Char_Octal3),
718
719	# ' and " are escaped in $'' mode, but not echo -e.
720	C(r"\'", Id.Char_OneChar),
721	C(r'\"', Id.Char_OneChar),
722
723	# e.g. 'foo', anything that's not a backslash escape or '
724	R(r"[^\\'\0]+", Id.Lit_Chars),
725	C("'", Id.Right_SingleQuote),
726	]
727
728	LEXER_DEF[lex_mode_e.PrintfOuter] = _C_STRING_COMMON + [
729	R(OCTAL3_RE, Id.Char_Octal3),
730	R(r"[^%\\\0]+", Id.Lit_Chars),
731	C('%%', Id.Format_EscapedPercent),
732	C('%', Id.Format_Percent),
733	]
734
# Maybe: bash also supports %(strftime)T
LEXER_DEF[lex_mode_e.PrintfPercent] = [
    # Flags
    R('[- +#]', Id.Format_Flag),
    C('0', Id.Format_Zero),
    R('[1-9][0-9]*', Id.Format_Num),
    C('*', Id.Format_Star),
    C('.', Id.Format_Dot),
    # We support dsq.  The others we parse to display an error message.
    R('[disqbcouxXeEfFgG]', Id.Format_Type),
    # Literal '(', any run of chars that are not parens or NUL, literal ')',
    # then 'T' -- i.e. the (strftime)T form.
    R(r'\([^()\0]*\)T', Id.Format_Time),
    R(r'[^\0]', Id.Unknown_Tok),  # any other char
]
748
749	LEXER_DEF[lex_mode_e.VSub_1] = [
750	R(VAR_NAME_RE, Id.VSub_Name),
751	# ${11} is valid, compared to $11 which is $1 and then literal 1.
752	R(r'[0-9]+', Id.VSub_Number),
753	C('!', Id.VSub_Bang),
754	C('@', Id.VSub_At),
755	C('#', Id.VSub_Pound),
756	C('$', Id.VSub_Dollar),
757	C('*', Id.VSub_Star),
758	C('-', Id.VSub_Hyphen),
759	C('?', Id.VSub_QMark),
760	C('.', Id.VSub_Dot), # ${.myproc builtin sub}
761	C('}', Id.Right_DollarBrace),
762	C('\\\n', Id.Ignored_LineCont),
763	C('\n', Id.Unknown_Tok), # newline not allowed inside ${}
764	R(r'[^\0]', Id.Unknown_Tok), # any char except newline
765	]
766
767	LEXER_DEF[lex_mode_e.VSub_2] = \
768	ID_SPEC.LexerPairs(Kind.VTest) + \
769	ID_SPEC.LexerPairs(Kind.VOp0) + \
770	ID_SPEC.LexerPairs(Kind.VOpYsh) + \
771	ID_SPEC.LexerPairs(Kind.VOp1) + \
772	ID_SPEC.LexerPairs(Kind.VOp2) + \
773	ID_SPEC.LexerPairs(Kind.VOp3) + [
774	C('}', Id.Right_DollarBrace),
775
776	C('\\\n', Id.Ignored_LineCont),
777	C('\n', Id.Unknown_Tok), # newline not allowed inside ${}
778	R(r'[^\0]', Id.Unknown_Tok), # any char except newline
779	]
780
781	_EXPR_ARITH_SHARED = [
782	C('\\\n', Id.Ignored_LineCont),
783	R(r'[^\0]', Id.Unknown_Tok) # any char. This should be a syntax error.
784	]
785
786	# https://www.gnu.org/software/bash/manual/html_node/Shell-Arithmetic.html#Shell-Arithmetic
787	LEXER_DEF[lex_mode_e.Arith] = \
788	_LEFT_SUBS + _VARS + _LEFT_UNQUOTED + [
789
790	# Arithmetic expressions can cross newlines.
791	R(r'[ \t\r\n]+', Id.Ignored_Space),
792
793	# Examples of arith constants:
794	# 64#azAZ
795	# 0xabc 0xABC
796	# 0123
797	# A separate digits token makes this easier to parse STATICALLY. But this
798	# doesn't help with DYNAMIC parsing.
799	R(VAR_NAME_RE, Id.Lit_ArithVarLike), # for variable names or 64#_
800	R(r'[0-9]+', Id.Lit_Digits),
801	C('@', Id.Lit_At), # for 64#@ or ${a[@]}
802	C('#', Id.Lit_Pound), # for 64#a
803
804	# TODO: 64#@ interferes with VS_AT. Hm.
805	] + ID_SPEC.LexerPairs(Kind.Arith) + _EXPR_ARITH_SHARED
806
807	# A lexer for the parser that converts globs to extended regexes. Since we're
808	# only parsing character classes ([^[:space:][:alpha:]]) as opaque blobs, we
809	# don't need lexer modes here.
810	GLOB_DEF = [
811	# These could be operators in the glob, or just literals in a char class,
812	# e.g. touch '?'; echo [?].
813	C('*', Id.Glob_Star),
814	C('?', Id.Glob_QMark),
815
816	# For negation. Treated as operators inside [], but literals outside.
817	C('!', Id.Glob_Bang),
818	C('^', Id.Glob_Caret),
819
820	# Character classes.
821	C('[', Id.Glob_LBracket),
822	C(']', Id.Glob_RBracket),
823
824	# There is no whitelist of characters; backslashes are unconditionally
825	# removed. With libc.fnmatch(), the pattern r'\f' matches 'f' but not '\\f'.
826	# See libc_test.py.
827	R(r'\\[^\0]', Id.Glob_EscapedChar),
828	C('\\', Id.Glob_BadBackslash), # Trailing single backslash
829
830	# For efficiency, combine other characters into a single token, e.g. 'py' in
831	# '*.py' or 'alpha' in '[[:alpha:]]'.
832	R(r'[a-zA-Z0-9_]+', Id.Glob_CleanLiterals), # no regex escaping
833	R(r'[^\0]', Id.Glob_OtherLiteral), # anything else -- examine the char
834	]
835
836	# History expansion. We're doing this as "pre-lexing" since that's what bash
837	# and zsh seem to do. Example:
838	#
839	# $ foo=x
840	# $ echo $
841	# $ !!foo # expands to echo $foo and prints x
842	#
843	# We can also reuse this in the RootCompleter to expand history interactively.
844	#
845	# bash note: handled in lib/readline/histexpand.c. Quite messy and handles
846	# quotes AGAIN.
847	#
848	# Note: \! gets expanded to literal \! for the real lexer, but no history
849	# expansion occurs.
850
851	HISTORY_DEF = [
852	# Common operators.
853	R(r'![!*^$]', Id.History_Op),
854
855	# By command number.
856	R(r'!-?[0-9]+', Id.History_Num),
857
858	# Search by prefix of substring (optional '?').
859	# NOTE: there are no numbers allowed here! Bash doesn't seem to support it.
860	# No hyphen since it conflicts with $-1 too.
861	#
862	# Required trailing whitespace is there to avoid conflict with [!charclass]
863	# and ${!indirect}. This is a simpler hack than the one bash has. See
864	# frontend/lex_test.py.
865	R(r'!\??[a-zA-Z_/.][0-9a-zA-Z_/.]+[ \t\r\n]', Id.History_Search),
866
867	# Comment is until end of line
868	R(r"#[^\0]*", Id.History_Other),
869
870	# Single quoted, e.g. 'a' or $'\n'. Terminated by another single quote or
871	# end of string.
872	R(r"'[^'\0]*'?", Id.History_Other),
873
874	# Runs of chars that are definitely not special
875	R(r"[^!\\'#\0]+", Id.History_Other),
876
877	# Escaped characters. \! disables history
878	R(r'\\[^\0]', Id.History_Other),
879	# Other single chars, like a trailing \ or !
880	R(r'[^\0]', Id.History_Other),
881	]
882
883	BRACE_RANGE_DEF = [
884	R(r'-?[0-9]+', Id.Range_Int),
885	R(r'[a-zA-Z]', Id.Range_Char), # just a single character
886	R(r'\.\.', Id.Range_Dots),
887	R(r'[^\0]', Id.Range_Other), # invalid
888	]
889
890	#
891	# YSH lexing
892	#
893
894	# Valid in lex_mode_e.{Expr,DQ}
895	# Used by ysh/grammar_gen.py
896	YSH_LEFT_SUBS = [
897	C('$(', Id.Left_DollarParen),
898	C('${', Id.Left_DollarBrace),
899	C('$[', Id.Left_DollarBracket),
900	]
901
902	# Valid in lex_mode_e.Expr, but not valid in DQ
903	# Used by ysh/grammar_gen.py
904
# Kind.Left tokens that open quoted strings, substitutions and literals.
# All entries are constant-string rules.
YSH_LEFT_UNQUOTED = [
    # Double quoted
    C('"', Id.Left_DoubleQuote),
    C('$"', Id.Left_DollarDoubleQuote),  # $"" is synonym for ""
    C('j"', Id.Left_JDoubleQuote),  # for printing ERROR
    # Single quoted
    C("'", Id.Left_SingleQuote),
    C("r'", Id.Left_RSingleQuote),
    C("u'", Id.Left_USingleQuote),
    C("b'", Id.Left_BSingleQuote),
    C("$'", Id.Left_DollarSingleQuote),  # legacy
    C('^"', Id.Left_CaretDoubleQuote),
    C('"""', Id.Left_TDoubleQuote),
    C('$"""', Id.Left_DollarTDoubleQuote),
    # In expression mode, we add the r'' and c'' prefixes for '' and $''.
    C("'''", Id.Left_TSingleQuote),
    C("r'''", Id.Left_RTSingleQuote),
    C("u'''", Id.Left_UTSingleQuote),
    C("b'''", Id.Left_BTSingleQuote),
    C('@(', Id.Left_AtParen),  # Split Command Sub
    C('@[', Id.Left_AtBracket),  # Array splice in expression mode
    C('^(', Id.Left_CaretParen),  # Block literals in expression mode
    C('^[', Id.Left_CaretBracket),  # Expr literals
    C('^{', Id.Left_CaretBrace),  # Unused
    # Constant-string rule: colon followed by a bare pipe, no backslash.
    C(':|', Id.Left_ColonPipe),  # shell-like word arrays.

    # DEPRECATED syntax for :| sh array |
    C('%(', Id.Left_PercentParen),
    # May not use these
    C('%[', Id.Expr_Reserved),
    C('%{', Id.Expr_Reserved),  # Table literals?  Vertical dict?
    C('@{', Id.Expr_Reserved),
]
938
# Statement terminator and the three kinds of paired delimiters.
# Used by ysh/grammar_gen.py
EXPR_OPS = [
    # Terminator
    C(';', Id.Op_Semi),

    # Parentheses
    C('(', Id.Op_LParen),
    C(')', Id.Op_RParen),

    # Square brackets.  Note: type expressions are expressions too, e.g.
    # Dict[Str, Int]
    C('[', Id.Op_LBracket),
    C(']', Id.Op_RBracket),

    # Curly braces
    C('{', Id.Op_LBrace),
    C('}', Id.Op_RBrace),
]
951
# Newlines, comments, and whitespace in expressions.  The newline token is
# significant, but sometimes elided by expr_parse.py.
_EXPR_NEWLINE_COMMENT = [
    C('\n', Id.Op_Newline),

    # A comment runs up to, but does not include, the newline (or the NUL
    # sentinel)
    R(r'#[^\n\0]*', Id.Ignored_Comment),

    # Like lex_mode_e.Arith, \r is whitespace even without \n
    R(r'[ \t\r]+', Id.Ignored_Space),
]
959
# Note: even when match.LooksLikeInteger(s) succeeds, mops.FromStr(s) may
# still fail.  You should call BOTH anyway, because we don't want to rely on
# the underlying strtoll() to define the language accepted.
#
# Shape: optional surrounding whitespace, optional minus sign, decimal digits.
LOOKS_LIKE_INTEGER = ''.join([_WHITESPACE, '-?[0-9]+', _WHITESPACE])
964
# TODO: use for YSH comparison operators > >= < <=
#
# Decimal digits, where a single underscore may separate two digits.
#
# Python lets you spell 0 as 00 or 0_0_0, which is weird.  But let's be
# consistent, and avoid '00' turning into a float!
_YSH_DECIMAL_INT_RE = r'[0-9](_?[0-9])*'

# Whitespace-padded YSH integer with an optional leading minus sign.
LOOKS_LIKE_YSH_INT = ''.join(
    [_WHITESPACE, '-?', _YSH_DECIMAL_INT_RE, _WHITESPACE])
972
# YSH float: integer part, then an optional fraction, then an optional
# exponent.
_YSH_FLOAT_RE = ''.join([
    # Integer part is required
    _YSH_DECIMAL_INT_RE,
    # Optional fraction: a dot followed by digits
    r'(\.',
    _YSH_DECIMAL_INT_RE,
    # Optional exponent.  Unlike Python, the exponent can't contain
    # underscores, e.g. 42e5_000.  There's no use for that because 1e309 is
    # already inf.  Let's keep our code simple.
    ')?([eE][+\-]?[0-9]+)?',
])

# Ditto, used for YSH comparison operators.
# An optional leading -? is accepted.
# Example: -3_000_000.000_001e12
LOOKS_LIKE_YSH_FLOAT = ''.join(
    [_WHITESPACE, '-?', _YSH_FLOAT_RE, _WHITESPACE])

# For reference, Python 3 float literals:
#
# digitpart     ::= digit (["_"] digit)*
# fraction      ::= "." digitpart
# exponent      ::= ("e" | "E") ["+" | "-"] digitpart
# pointfloat    ::= [digitpart] fraction | digitpart "."
# exponentfloat ::= (digitpart | pointfloat) exponent
# floatnumber   ::= pointfloat | exponentfloat
992
# Rules for YSH expression mode.
#
# The table is the concatenation of the shared rule lists (_VARS, left tokens,
# EXPR_OPS, EXPR_WORDS, EXPR_CHARS), the literal and operator rules below, and
# the shared newline/comment and arith rules.
#
# All C() rules are CONSTANT strings, not regexes: operators such as | and ||
# are written verbatim, with no backslash escapes.
#
# NOTE: Borrowing tokens from Arith (i.e. $(( )) ), but not using LexerPairs().
LEXER_DEF[lex_mode_e.Expr] = \
    _VARS + YSH_LEFT_SUBS + YSH_LEFT_UNQUOTED + EXPR_OPS + EXPR_WORDS + \
    EXPR_CHARS + [

    # https://docs.python.org/3/reference/lexical_analysis.html#integer-literals
    #
    # integer      ::= decinteger | bininteger | octinteger | hexinteger
    # decinteger   ::= nonzerodigit (["_"] digit)* | "0"+ (["_"] "0")*
    # bininteger   ::= "0" ("b" | "B") (["_"] bindigit)+
    # octinteger   ::= "0" ("o" | "O") (["_"] octdigit)+
    # hexinteger   ::= "0" ("x" | "X") (["_"] hexdigit)+
    # nonzerodigit ::= "1"..."9"
    # digit        ::= "0"..."9"
    # bindigit     ::= "0" | "1"
    # octdigit     ::= "0"..."7"
    # hexdigit     ::= digit | "a"..."f" | "A"..."F"

    R(_YSH_DECIMAL_INT_RE, Id.Expr_DecInt),

    R(r'0[bB](_?[01])+', Id.Expr_BinInt),
    R(r'0[oO](_?[0-7])+', Id.Expr_OctInt),
    R(r'0[xX](_?[0-9a-fA-F])+', Id.Expr_HexInt),

    R(_YSH_FLOAT_RE, Id.Expr_Float),

    # These can be looked up as keywords separately, so you enforce that they
    # have space around them?
    R(VAR_NAME_RE, Id.Expr_Name),

    R('%' + VAR_NAME_RE, Id.Expr_Symbol),

    #
    # Arith
    #

    C(',', Id.Arith_Comma),
    C(':', Id.Arith_Colon),  # for slicing a[1:2], and mylist:pop()

    C('?', Id.Arith_QMark),  # regex postfix

    C('+', Id.Arith_Plus),  # arith infix, regex postfix
    C('-', Id.Arith_Minus),  # arith infix, regex postfix
    C('*', Id.Arith_Star),
    C('^', Id.Arith_Caret),  # xor
    C('/', Id.Arith_Slash),
    C('%', Id.Arith_Percent),

    C('**', Id.Arith_DStar),  # exponentiation
    C('++', Id.Arith_DPlus),  # Option for string/list concatenation

    C('<', Id.Arith_Less),
    C('>', Id.Arith_Great),
    C('<=', Id.Arith_LessEqual),
    C('>=', Id.Arith_GreatEqual),
    C('===', Id.Expr_TEqual),
    C('!==', Id.Expr_NotDEqual),

    C('==', Id.Unknown_DEqual),  # user must choose === or ~==

    C('&&', Id.Unknown_DAmp),
    # Two literal pipe characters; a backslash here would become part of the
    # constant and the rule would never match ||.
    C('||', Id.Unknown_DPipe),

    # Bitwise operators
    C('&', Id.Arith_Amp),
    C('|', Id.Arith_Pipe),
    C('>>', Id.Arith_DGreat),
    C('<<', Id.Arith_DLess),  # Doesn't Java also have <<< ?

    # Bitwise complement, as well as infix pattern matching
    C('~', Id.Arith_Tilde),
    C('!~', Id.Expr_NotTilde),
    C('~~', Id.Expr_DTilde),
    C('!~~', Id.Expr_NotDTilde),

    # Left out for now:
    # ++ --      -- needed for loops, awk?
    # ! && ||    -- needed for find dialect
    # = += etc.

    C('=', Id.Arith_Equal),

    C('+=', Id.Arith_PlusEqual),
    C('-=', Id.Arith_MinusEqual),
    C('*=', Id.Arith_StarEqual),
    C('/=', Id.Arith_SlashEqual),
    C('%=', Id.Arith_PercentEqual),

    C('>>=', Id.Arith_DGreatEqual),
    C('<<=', Id.Arith_DLessEqual),
    C('&=', Id.Arith_AmpEqual),
    C('|=', Id.Arith_PipeEqual),
    # Original comment: "Exponentiation".
    # NOTE(review): '^' is documented as xor above, and '**=' below is the
    # exponentiation form -- this comment looks stale; confirm.
    C('^=', Id.Arith_CaretEqual),

    # Augmented assignment that YSH has, but sh and OSH don't have
    C('**=', Id.Expr_DStarEqual),
    C('//=', Id.Expr_DSlashEqual),

    #
    # Expr
    #

    C('!', Id.Expr_Bang),  # For eggex negation

    C('//', Id.Expr_DSlash),  # For YSH integer division
    C('~==', Id.Expr_TildeDEqual),  # approximate equality

    C('.', Id.Expr_Dot),  # d.key is alias for d['key']
    C('..', Id.Unknown_DDot),  # legacy half-open range 1..5
    C('..<', Id.Expr_DDotLessThan),  # half-open range 1..<5
    C('..=', Id.Expr_DDotEqual),  # closed range 1..=5
    C('->', Id.Expr_RArrow),  # s->startswith()
    C('$', Id.Expr_Dollar),  # legacy regex end: /d+ $/ (better written /d+ >/)

    # Reserved this.  Go uses it for channels, etc.
    # I guess it conflicts with -4<-3, but that's OK -- spaces suffices.
    C('<-', Id.Expr_Reserved),
    C('=>', Id.Expr_RDArrow),  # for df => filter(age > 10)
    # and match (x) { 1 => "one" }
    # note: other languages use |>
    # R/dplyr uses %>%

    C('...', Id.Expr_Ellipsis),  # f(...args) and maybe a[:, ...]

    # For multiline regex literals?
    C('///', Id.Expr_Reserved),

    # Splat operators
    C('@', Id.Expr_At),
    # NOTE: Unused
    C('@@', Id.Expr_DoubleAt),
] + _EXPR_NEWLINE_COMMENT + _EXPR_ARITH_SHARED
1125
# Lookahead mode with two rules: does the input start with "()"?
LEXER_DEF[lex_mode_e.FuncParens] = [
    # () with spaces: optional blanks, a literal '(', optional blanks, and a
    # literal ')'.  The parens must be backslash-escaped because this is an
    # R() regex rule, and the blanks are '*' (zero or more) so that a bare
    # "()" matches too.
    R(r'[ \t]*\([ \t]*\)', Id.LookAhead_FuncParens),
    # anything else
    R(r'[^\0]', Id.Unknown_Tok)
]