OILS / ysh / grammar.pgen2 View on Github | oils.pub

553 lines, 190 significant
1# Grammar for YSH.
2# Adapted from the Python 3.7 expression grammar, with several changes!
3#
4# TODO:
5# - List comprehensions
6# - There's also chaining => and maybe implicit vectorization ==>
7# - But list comprehensions are more familiar, and they are concise
8# - Generator expressions?
9# - Do we need lambdas?
10
11# Note: trailing commas are allowed:
12# {k: mydict,}
13# [mylist,]
14# mytuple,
15# f(args,)
16# func f(params,)
17#
18# Kinds used:
19# VSub, Left, Right, Expr, Op, Arith, Char, Eof, Unknown
20
21# Augmented-assignment operator tokens (referenced by ysh_mutation). YSH patch: removed @=
22augassign: (
23 '+=' | '-=' | '*=' | '/=' |
24 '**=' | '//=' | '%=' |
25 '&=' | '|=' | '^=' | '<<=' | '>>='
26)
27
28test: or_test ['if' or_test 'else' test] | lambdef # optional 'if ... else ...' tail whose else-branch recurses into test, or a lambda
29
30# Lambdas follow the same rules as Python:
31#
32# |x| 1, 2 == (|x| 1), 2
33# |x| x if True else 42 == |x| (x if True else 42)
34#
35# Python also had a test_nocond production like this: We don't need it because
36# we can't have multiple ifs.
37# [x for x in range(3) if lambda x: x if 1]
38#
39# The zero arg syntax like || 1 annoys me -- but this also works:
40# func() { return 1 }
41#
42# We used name_type_list rather than param_group because a default value like
43# x|y (bitwise or) conflicts with the | delimiter!
44#
45# TODO: consider this syntax:
46# fn (x) x # expression
47# fn (x) ^( echo hi ) # statement
48
49lambdef: '|' [name_type_list] '|' test # the parameter list is optional, so the zero-arg form || BODY matches
50
51or_test: and_test ('or' and_test)* # 'or' binds loosest of the boolean operators
52and_test: not_test ('and' not_test)* # 'and' binds tighter than 'or'
53not_test: 'not' not_test | comparison # prefix 'not' may be stacked; binds tighter than 'and'
54comparison: range_expr (comp_op range_expr)* # zero or more comp_op links, so a chain like a < b < c matches this one rule
55
56# A bare expr, or two exprs joined by '..<' or '..='. Unlike slice, beginning and end are required
57range_expr: (
58 expr ['..<' expr] |
59 expr ['..=' expr]
60)
61
62# YSH patch: remove legacy <>, add === and more
63comp_op: (
64 '<'|'>'|'==='|'>='|'<='|'!=='|'in'|'not' 'in'|'is'|'is' 'not'| # 'not' 'in' and 'is' 'not' are two-token operators
65 '~' | '!~' | '~~' | '!~~' | '~==' # tilde-family operators; presumably the "and more" added by YSH -- confirm
66)
67
68# For lists and dicts. Note: In Python this was star_expr *foo
69splat_expr: '...' expr # NOTE(review): in the grammar shown here only testlist_comp references this rule; 'dict' has no splat form
70
71expr: xor_expr ('|' xor_expr)* # '|' level: loosest-binding operator below range/comparison
72xor_expr: and_expr ('^' and_expr)*
73and_expr: shift_expr ('&' shift_expr)*
74shift_expr: arith_expr (('<<'|'>>') arith_expr)*
75# YSH: add concatenation ++ with same precedence as +
76arith_expr: term (('+'|'-'|'++') term)*
77# YSH: removed '@' matrix mul
78term: factor (('*'|'/'|'//'|'%') factor)*
79factor: ('+'|'-'|'~') factor | power # prefix operators may be stacked
80# YSH: removed Python 3 'await'
81power: atom trailer* ['**' factor] # right operand of ** is 'factor', so a ** -b matches and ** nests to the right
82
83# Note: I think splat_expr is not for list comprehensions, it's only for
84# literals like [42, *x] in Python, or [42, ...x] in YSH. This is new in
85# Python 3.
86# I think splat_expr is expressed awkwardly because of pgen limitations.
87testlist_comp: (test|splat_expr) ( comp_for | (',' (test|splat_expr))* [','] ) # one item followed by comp_for, or a comma-separated list with optional trailing comma
88
89atom: (
90 '(' [testlist_comp] ')' # empty tuple/list, or parenthesized expression
91 | '[' [testlist_comp] ']' # empty list or list comprehension
92 | '^[' testlist ']' # expression literal
93 # note: ^[x for x in y] is invalid
94 # but ^[[x for x in y]] is a list comprehension
95
96 # Note: newlines are significant inside {}, unlike inside () and []
97 | '{' [Op_Newline] [dict] '}'
98 | '&' Expr_Name place_trailer* # '&' + name, then only the restricted place_trailer forms
99
100 # NOTE: These atoms are allowed in typed array literals
101 | Expr_Name | Expr_Null | Expr_True | Expr_False
102
103 # Allow suffixes on floats and decimals
104 # e.g. 100 M is a function M which multiplies by 1_000_000
105 # e.g. 100 Mi is a function Mi which multiplies by 1024 * 1024
106 | Expr_Float [Expr_Name]
107 | Expr_DecInt [Expr_Name]
108
109 | Expr_BinInt | Expr_OctInt | Expr_HexInt # unlike Float/DecInt, no optional suffix name
110
111 | Char_OneChar # char literal \n \\ etc.
112 | Char_YHex
113 | Char_UBraced # char literal \u{3bc}
114
115 | dq_string | sq_string
116 # Expr_Symbol could be %mykey
117
118 | eggex
119
120 # $foo is disallowed, but $? is allowed. Should be "$foo" to indicate a
121 # string, or ${foo:-}
122 | simple_var_sub # NOTE(review): simple_var_sub lists VSub_DollarName, so this grammar itself accepts $foo -- the comment above may be stale
123 | sh_command_sub | braced_var_sub
124 | sh_array_literal
125 | old_sh_array_literal
126 | ysh_expr_sub_2
127)
128
129place_trailer: ( # restricted trailer set used after '&' Expr_Name in atom: no call, '->' or '=>' forms
130 '[' subscriptlist ']'
131 | '.' Expr_Name
132)
133
134# var f = f(x)
135trailer: (
136 '(' [arglist] ')' # call; uses arglist (not arglist3), so at most one ';' inside the parens
137 | '[' subscriptlist ']' # index or slice, see subscript
138
139 # Is a trailing {} useful for anything? It's not in Python or JS
140
141 | '.' Expr_Name
142 | '->' Expr_Name
143 | '=>' Expr_Name
144)
145
146# YSH patch: this is 'expr' instead of 'test'
147# - 1:(3<4) doesn't make sense.
148# - TODO: could we revert this? I think it might have been because we wanted
149# first class slices like var x = 1:n, but we have ranges var x = 1 ..< n instead.
150# - There was also the colon conflict for :symbol
151
152subscriptlist: subscript (',' subscript)* [','] # one or more subscripts; trailing comma allowed
153
154# TODO: Add => as low precedence operator, for Func[Str, Int => Str]
155subscript: expr | [expr] ':' [expr] # plain index, or a slice where either bound may be omitted (a bare ':' matches)
156
157# TODO: => should be even lower precedence here too
158testlist: test (',' test)* [','] # one or more tests; trailing comma allowed
159
160# Dict syntax resembles JavaScript
161# https://stackoverflow.com/questions/38948306/what-is-javascript-shorthand-property
162#
163# Examples:
164# {age: 20} is like {'age': 20}
165#
166# x = 'age'
167# d = %{[x]: 20} # Evaluate x as a variable
168# d = %{["foo$x"]: 20} # Another expression
169# d = %{[x, y]: 20} # Tuple key
170# d = %{key1, key1: 123}
171# Notes:
172# - Value is optional when the key is a name, because it can be taken from the
173# environment.
174# - We don't have:
175# - dict comprehensions. Maybe wait until LR parsing?
176# - Splatting with **
177
178dict_pair: (
179 Expr_Name [':' test] # bare-name key; the value is optional only in this form
180 | '[' testlist ']' ':' test # bracketed key is a testlist, so [x, y] matches
181 | sq_string ':' test
182 | dq_string ':' test
183)
184
185comma_newline: ',' [Op_Newline] | Op_Newline # separator: a comma (optionally followed by a newline) or a bare newline
186
187dict: dict_pair (comma_newline dict_pair)* [comma_newline] # trailing separator allowed
188
189# This is how Python implemented dict comprehensions. We can probably do the
190# same.
191#
192# dictorsetmaker: ( ((test ':' test | '**' expr)
193# (comp_for | (',' (test ':' test | '**' expr))* [','])) |
194# ((test | splat_expr)
195# (comp_for | (',' (test | splat_expr))* [','])) )
196
197# The reason that keywords are test nodes instead of NAME is that using NAME
198# results in an ambiguity. ast.c makes sure it's a NAME.
199# "test '=' test" is really "keyword '=' test", but we have no such token.
200# These need to be in a single rule to avoid grammar that is ambiguous
201# to our LL(1) parser. Even though 'test' includes '*expr' in splat_expr,
202# we explicitly match '*' here, too, to give it proper precedence.
203# Illegal combinations and orderings are blocked in ast.c:
204# multiple (test comp_for) arguments are blocked; keyword unpackings
205# that precede iterable unpackings are blocked; etc.
206
207argument: (
208 test [comp_for] # positional arg, optionally followed by a comp_for clause
209 # named arg
210 | test '=' test # left side is 'test' rather than a name token; see the LL(1) note above
211 # splat. The ... goes before, not after, to be consistent with Python, JS,
212 # and the prefix @ operator.
213 | '...' test
214)
215
216# The grammar at call sites is less restrictive than at declaration sites.
217# ... can appear anywhere. Keyword args can appear anywhere too.
218arg_group: argument (',' argument)* [','] # one or more arguments; trailing comma allowed
219arglist: ( # up to two ';'-separated groups; referenced by trailer and ysh_lazy_arglist
220 [arg_group]
221 [';' [arg_group]]
222)
223arglist3: ( # up to three ';'-separated sections; referenced by ysh_eager_arglist
224 [arg_group]
225 [';' [arg_group]]
226 [';' [argument]] # procs have an extra block argument
227)
228
229
230# YSH patch: test_nocond -> or_test. I believe this was trying to prevent the
231# "double if" ambiguity here:
232# #
233# [x for x in range(3) if lambda x: x if 1]
234#
235# but YSH doesn't support "nested loops", so we don't have this problem.
236comp_for: 'for' name_type_list 'in' or_test ['if' or_test] # a single 'for' clause with at most one 'if' filter; no nested comp_for
237
238
239#
240# Expressions that are New in YSH
241#
242
243# Notes:
244# - Most of these occur in 'atom' above
245# - You can write $mystr but not mystr. It has to be (mystr)
246array_item: ( # NOTE(review): no rule in the grammar shown here references array_item -- confirm it is still used
247 Expr_Null | Expr_True | Expr_False
248 | Expr_Float | Expr_DecInt | Expr_BinInt | Expr_OctInt | Expr_HexInt
249 | dq_string | sq_string
250 | sh_command_sub | braced_var_sub | simple_var_sub
251 | '(' test ')'
252)
253sh_array_literal: ':|' Expr_CastedDummy Op_Pipe # NOTE(review): Expr_CastedDummy appears to stand in for content parsed outside this grammar -- confirm
254
255# TODO: remove old array
256old_sh_array_literal: '%(' Expr_CastedDummy Right_Initializer
257sh_command_sub: ( '$(' | '@(' | '^(' ) Expr_CastedDummy Eof_RParen # three openers share one closing token
258
259# " $" """ $""" ^"
260dq_string: (
261 Left_DoubleQuote | Left_DollarDoubleQuote |
262 Left_TDoubleQuote | Left_DollarTDoubleQuote |
263 Left_CaretDoubleQuote
264 ) Expr_CastedDummy Right_DoubleQuote # any of the five openers above, one shared closing token
265
266# ' ''' r' r'''
267# $' for "refactoring" property
268# u' u''' b' b'''
269sq_string: (
270 Left_SingleQuote | Left_TSingleQuote
271 | Left_RSingleQuote | Left_RTSingleQuote
272 | Left_DollarSingleQuote
273 | Left_USingleQuote | Left_UTSingleQuote
274 | Left_BSingleQuote | Left_BTSingleQuote
275) Expr_CastedDummy Right_SingleQuote # any of the nine openers above, one shared closing token
276
277braced_var_sub: '${' Expr_CastedDummy Right_DollarBrace # same opener / Expr_CastedDummy / closer shape as the string and command-sub rules
278
279# $[expr] or @[expr_splice] inside expressions. The 'ysh_expr_sub' rule below is for commands.
280ysh_expr_sub_2: ('$[' | '@[') testlist ']' # the opener is matched here, whereas ysh_expr_sub starts after it
281
282simple_var_sub: ( # one unbraced variable-substitution token; referenced by atom and array_item
283 # This is everything in Kind.VSub except VSub_Name, which is braced: ${foo}
284 #
285 # Note: we could allow $foo and $0, but disallow the rest in favor of ${@}
286 # and ${-}? Meh it's too inconsistent.
287 VSub_DollarName | VSub_Number
288 | VSub_Bang | VSub_At | VSub_Pound | VSub_Dollar | VSub_Star | VSub_Hyphen
289 | VSub_QMark
290 # NOTE: $? should be STATUS because it's an integer.
291)
292
293#
294# Assignment / Type Variables
295#
296# Several differences vs. Python:
297#
298# - no yield expression on RHS
299# - no star expressions on either side (Python 3) *x, y = 2, *b
300# - no multiple assignments like: var x = y = 3
301# - type annotation syntax is more restrictive # a: (1+2) = 3 is OK in python
302# - We're validating the lvalue here, instead of doing it in the "transformer".
303# We have the 'var' prefix which helps.
304
305# name_type use cases:
306# var x Int, y Int = 3, 5
307# / <capture d+ as date: int> /
308#
309# for x Int, y Int
310# [x for x Int, y Int in ...]
311#
312# func(x Int, y Int) - this is separate
313
314# Optional colon because we want both
315
316# var x: Int = 42 # colon looks nicer
317# proc p (; x Int, y Int; z Int) { echo hi } # colon gets in the way of ;
318
319name_type: Expr_Name [':'] [type_expr] # colon and type are independently optional: x, x Int, x: Int all match (as does 'x:' with no type)
320name_type_list: name_type (',' name_type)* # no trailing comma
321
322type_expr: Expr_Name [ '[' type_expr (',' type_expr)* ']' ] # Name, or Name[T, ...] with nested parameters; no trailing comma inside []
323
324# NOTE: Eof_RParen and Eof_Backtick aren't allowed because we don't want 'var'
325# in command subs.
326end_stmt: '}' | ';' | Op_Newline | Eof_Real
327
328# TODO: allow -> to denote aliasing/mutation
329ysh_var_decl: name_type_list ['=' testlist] end_stmt # the initializer is optional
330
331# Note: this is a more precise way of writing ysh_mutation, but it's ambiguous :(
332# ysh_mutation: lhs augassign testlist end_stmt
333# | lhs_list '=' testlist end_stmt
334
335# Note: for YSH (not Tea), we could accept [':'] expr for setvar :out = 'foo'
336lhs_list: expr (',' expr)* # no trailing comma
337
338# TODO: allow -> to denote aliasing/mutation
339ysh_mutation: lhs_list (augassign | '=') testlist end_stmt # as written, augassign is also accepted after a multi-item lhs_list (see the note above)
340
341# proc arg lists, like:
342# json write (x, indent=1)
343# cd /tmp ( ; ; ^(echo hi))
344#
345# What about:
346# myproc /tmp [ ; ; ^(echo hi)] - I guess this doesn't make sense?
347ysh_eager_arglist: '(' [arglist3] ')' # arglist3: up to three ';'-separated sections
348ysh_lazy_arglist: '[' [arglist] ']' # arglist: up to two ';'-separated sections
349
350#
351# Other Entry Points
352#
353
354# if (x > 0) etc.
355ysh_expr: '(' testlist ')' # both parens are part of this rule
356
357# = 42 + a[i]
358# call f(x)
359command_expr: testlist end_stmt
360
361# $[d->key] etc.
362# See also ysh_expr_sub_2
363ysh_expr_sub: testlist ']' # no opening token here, only the closing ']'
364
365# Signatures for proc and func.
366
367# Note: 'proc name-with-hyphens' is allowed, so we can't parse the name in
368# expression mode.
369ysh_proc: (
370 [ '(' # the parenthesized signature as a whole is optional
371 [ param_group ] # word params, with defaults
372 [ ';' [ param_group ] ] # positional typed params, with defaults
373 [ ';' [ param_group ] ] # named params, with defaults
374 [ ';' [ param_group ] ] # optional block param, with no type or default (not enforced here: the rule is a full param_group)
375
376 # This causes a pgen2 error? It doesn't know which branch to take
377 # So we have the extra {block} syntax
378 #[ ';' Expr_Name ] # optional block param, with no type or default
379 ')'
380 ]
381 '{' # opening { for pgen2
382)
383
384ysh_func: (
385 Expr_Name '(' [param_group] [';' param_group] ')' ['=>' type_expr] '{' # NOTE(review): ysh_proc writes [';' [param_group]]; here the group after ';' is not bracketed -- confirm that is intended
386)
387
388param: Expr_Name [type_expr] ['=' expr] # no ':' before the type (contrast name_type); the default is 'expr', not 'test'
389
390# This is an awkward way of writing that '...' has to come last.
391param_group: ( # both parts are optional, so this rule can match nothing
392 (param ',')*
393 [ (param | '...' Expr_Name) [','] ]
394)
395
396#
397# Regex Sublanguage
398#
399
400char_literal: ( # the three char tokens also listed in 'atom', plus Char_Hex
401 Char_OneChar | Char_UBraced | Char_YHex
402| Char_Hex # LEGACY \x01; the right thing is \u{1} or \y00
403)
404
405# we allow a-z A-Z 0-9 as ranges, but otherwise they have to be quoted
406# The parser enforces that they are single strings
407range_char: Expr_Name | Expr_DecInt | sq_string | char_literal
408
409# digit or a-z
410# We have to do further validation of ranges later.
411class_literal_term: (
412 # NOTE: range_char has sq_string
413 range_char ['-' range_char ] # a single item, or a range such as a-z
414 # splice a literal set of characters
415 | '@' Expr_Name
416 | '!' Expr_Name # presumably a negated named class, by analogy with '!' in re_atom -- confirm
417 # Reserved for [[.collating sequences.]] (Unicode)
418 | '.' Expr_Name
419 # Reserved for [[=character equivalents=]] (Unicode)
420 | '=' Expr_Name
421 # TODO: Do these char classes actually work in bash/awk/egrep/sed/etc.?
422
423)
424class_literal: '[' class_literal_term+ ']' # at least one term is required, so [] does not match
425
426# NOTE: Here is an example of where you can put ^ in the middle of a pattern in
427# Python, and it matters!
428# >>> r = re.compile('.f[a-z]*', re.DOTALL|re.MULTILINE)
429# >>> r.findall('z\nfoo\nbeef\nfood\n')
430# ['\nfoo', 'ef', '\nfood']
431# >>> r = re.compile('.^f[a-z]*', re.DOTALL|re.MULTILINE)
432# r.findall('z\nfoo\nbeef\nfood\n')
433# ['\nfoo', '\nfood']
434
435re_atom: (
436 char_literal
437 # builtin regex like 'digit' or a regex reference like 'D'
438 | Expr_Name
439 # %start or %end
440 | Expr_Symbol
441 | class_literal
442 # !digit or ![a-f]. Note ! %boundary could be \B in Python, but ERE
443 # doesn't have anything like that
444 | '!' (Expr_Name | class_literal)
445
446 # syntactic space for Perl-style backtracking
447 # !!REF 1 !!REF name
448 # !!AHEAD(d+) !!BEHIND(d+) !!NOT_AHEAD(d+) !!NOT_BEHIND(d+)
449 #
450 # Note: !! conflicts with history
451 | '!' '!' Expr_Name (Expr_Name | Expr_DecInt | '(' regex ')')
452
453 # Splice another expression
454 | '@' Expr_Name
455 # any %start %end are preferred
456 | '.' | '^' | '$'
457 # In a language-independent spec, backslashes are disallowed within 'sq'.
458 # Write it with char literals outside strings: 'foo' \\ 'bar' \n
459 #
460 # No double-quoted strings because you can write "x = $x" with 'x = ' @x
461 | sq_string
462
463 # grouping (non-capturing in Perl; capturing in ERE although < > is preferred)
464 | '(' regex ')'
465
466 # Capturing group, with optional name and conversion function
467 # <capture d+ as date>
468 # <capture d+ as date: int>
469 # <capture d+ : int>
470 | '<' 'capture' regex ['as' Expr_Name] [':' Expr_Name] '>' # 'as NAME' and ': NAME' are independently optional
471
472 # Might want this obscure conditional construct. Can't use C-style ternary
473 # because '?' is a regex operator.
474 #| '{' regex 'if' regex 'else' regex '}'
475
476 # Others:
477 # PCRE has (?R ) for recursion? That could be !RECURSE()
478 # Note: .NET has && in character classes, making it a recursive language
479)
480
481# e.g. a{3} a{3,4} a{3,} a{,4} but not a{,}
482repeat_range: (
483 Expr_DecInt [','] # a{3} or a{3,}
484 | ',' Expr_DecInt # a{,4}
485 | Expr_DecInt ',' Expr_DecInt # a{3,4}
486)
487
488repeat_op: (
489 '+' | '*' | '?'
490 # In PCRE, ?? *? +? {}? is lazy/nongreedy and ?+ *+ ++ {}+ is "possessive"
491 # We use N and P modifiers within {}.
492 # a{N +} a{P ?} a{P 3,4} a{P ,4}
493 | '{' [Expr_Name] ('+' | '*' | '?' | repeat_range) '}' # the modifier is any Expr_Name here; the grammar does not restrict it to N or P
494)
495
496re_alt: (re_atom [repeat_op])+ # a sequence of atoms, each with at most one repeat_op
497
498regex: [re_alt] (('|'|'or') re_alt)* # the first re_alt is optional, so an empty regex and a leading '|' both match
499
500# e.g. /digit+ ; multiline !ignorecase/
501#
502# This can express translation preferences:
503#
504# / d+ ; ; ERE / is '[[:digit:]]+'
505# / d+ ; ; PCRE / is '\d+'
506# / d+ ; ignorecase ; python / is '(?i)\d+'
507
508# Python has the syntax
509# (?i:myre) to set a flag
510# (?-i:myre) to remove a flag
511#
512# They can apply to portions of the expression, which we don't have here.
513re_flag: ['!'] Expr_Name # a flag name with optional '!' prefix, e.g. multiline or !ignorecase
514eggex: '/' regex [';' re_flag* [';' Expr_Name] ] '/' # the flags section and the trailing name after the second ';' are both optional
515
516# Patterns are the start of a case arm. I.e.,
517#
518# case (foo) {
519# (40 + 2) | (0) { echo number }
520# ^^^^^^^^^^^^^^-- This is pattern
521# }
522#
523# Due to limitations created from pgen2/cmd_parser interactions, we also parse
524# the leading '{' token of the case arm body in pgen2. We do this to help pgen2
525# figure out when to transfer control back to the cmd_parser. For more details
526# see #oil-dev > Dev Friction / Smells.
527#
528# case (foo) {
529# (40 + 2) | (0) { echo number }
530# ^-- End of pattern/beginning of case arm body
531# }
532
533ysh_case_pat: (
534 '(' (pat_else | pat_exprs) # the matching ')' is consumed inside pat_else / pat_exprs
535 | eggex
536) [Op_Newline] '{' # the leading '{' of the arm body is matched here (see the comment above)
537
538pat_else: 'else' ')'
539pat_exprs: expr ')' [Op_Newline] ('|' [Op_Newline] '(' expr ')' [Op_Newline])* # each pattern is an 'expr', not a 'test'; alternatives are joined with '|'
540
541
542# Syntax reserved for PCRE/Python, but that's not in ERE:
543#
544# non-greedy a{N *}
545# non-capturing ( digit+ )
546# backtracking !!REF 1 !!AHEAD(d+)
547#
548# Legacy syntax:
549#
550# ^ and $ instead of %start and %end
551# < and > instead of %start_word and %end_word
552# . instead of dot
553# | instead of 'or'