1 | # Grammar for YSH.
|
2 | # Adapted from the Python 3.7 expression grammar, with several changes!
|
3 | #
|
4 | # TODO:
|
5 | # - List comprehensions
|
6 | # - There's also chaining => and maybe implicit vectorization ==>
|
7 | # - But list comprehensions are more familiar, and they are concise
|
8 | # - Generator expressions?
|
9 | # - Do we need lambdas?
|
10 |
|
11 | # Note: trailing commas are allowed:
|
12 | # {k: mydict,}
|
13 | # [mylist,]
|
14 | # mytuple,
|
15 | # f(args,)
|
16 | # func f(params,)
|
17 | #
|
18 | # Kinds used:
|
19 | # VSub, Left, Right, Expr, Op, Arith, Char, Eof, Unknown
|
20 |
|
21 | # YSH patch: removed @=
|
22 | augassign: (
|
23 | '+=' | '-=' | '*=' | '/=' |
|
24 | '**=' | '//=' | '%=' |
|
25 | '&=' | '|=' | '^=' | '<<=' | '>>='
|
26 | )
|
27 |
|
28 | test: or_test ['if' or_test 'else' test] | lambdef
|
29 |
|
30 | # Lambdas follow the same rules as Python:
|
31 | #
|
32 | # |x| 1, 2 == (|x| 1), 2
|
33 | # |x| x if True else 42 == |x| (x if True else 42)
|
34 | #
|
35 | # Python also had a test_nocond production like this: We don't need it because
|
36 | # we can't have multiple ifs.
|
37 | # [x for x in range(3) if lambda x: x if 1]
|
38 | #
|
39 | # The zero arg syntax like || 1 annoys me -- but this also works:
|
40 | # func() { return 1 }
|
41 | #
|
42 | # We used name_type_list rather than param_group because a default value like
|
43 | # x|y (bitwise or) conflicts with the | delimiter!
|
44 | #
|
45 | # TODO: consider this syntax:
|
46 | # fn (x) x # expression
|
47 | # fn (x) ^( echo hi ) # statement
|
48 |
|
49 | lambdef: '|' [name_type_list] '|' test
|
50 |
|
51 | or_test: and_test ('or' and_test)*
|
52 | and_test: not_test ('and' not_test)*
|
53 | not_test: 'not' not_test | comparison
|
54 | comparison: range_expr (comp_op range_expr)*
|
55 |
|
56 | # Unlike slice, beginning and end are required
|
57 | range_expr: (
|
58 | expr ['..<' expr] |
|
59 | expr ['..=' expr]
|
60 | )
|
61 |
|
62 | # YSH patch: remove legacy <>, add === and more
|
63 | comp_op: (
|
64 | '<'|'>'|'==='|'>='|'<='|'!=='|'in'|'not' 'in'|'is'|'is' 'not'|
|
65 | '~' | '!~' | '~~' | '!~~' | '~=='
|
66 | )
|
67 |
|
68 | # For lists and dicts. Note: In Python this was star_expr *foo
|
69 | splat_expr: '...' expr
|
70 |
|
71 | expr: xor_expr ('|' xor_expr)*
|
72 | xor_expr: and_expr ('^' and_expr)*
|
73 | and_expr: shift_expr ('&' shift_expr)*
|
74 | shift_expr: arith_expr (('<<'|'>>') arith_expr)*
|
75 | # YSH: add concatenation ++ with same precedence as +
|
76 | arith_expr: term (('+'|'-'|'++') term)*
|
77 | # YSH: removed '@' matrix mul
|
78 | term: factor (('*'|'/'|'//'|'%') factor)*
|
79 | factor: ('+'|'-'|'~') factor | power
|
80 | # YSH: removed Python 3 'await'
|
81 | power: atom trailer* ['**' factor]
|
82 |
|
83 | testlist_comp: (test|splat_expr) ( comp_for | (',' (test|splat_expr))* [','] )
|
84 |
|
85 | atom: (
|
86 | '(' [testlist_comp] ')' # empty tuple/list, or parenthesized expression
|
87 | | '[' [testlist_comp] ']' # empty list or list comprehension
|
88 | | '^[' testlist ']' # expression literal
|
89 | # note: ^[x for x in y] is invalid
|
90 | # but ^[[x for x in y]] is a list comprehension
|
91 |
|
92 | # Note: newlines are significant inside {}, unlike inside () and []
|
93 | | '{' [Op_Newline] [dict] '}'
|
94 | | '&' Expr_Name place_trailer*
|
95 |
|
96 | # NOTE: These atoms are allowed in typed array literals
|
97 | | Expr_Name | Expr_Null | Expr_True | Expr_False
|
98 |
|
99 | # Allow suffixes on floats and decimals
|
100 | # e.g. 100 M is a function M which multiplies by 1_000_000
|
101 | # e.g. 100 Mi is a function Mi which multiplies by 1024 * 1024
|
102 | | Expr_Float [Expr_Name]
|
103 | | Expr_DecInt [Expr_Name]
|
104 |
|
105 | | Expr_BinInt | Expr_OctInt | Expr_HexInt
|
106 |
|
107 | | Char_OneChar # char literal \n \\ etc.
|
108 | | Char_YHex
|
109 | | Char_UBraced # char literal \u{3bc}
|
110 |
|
111 | | dq_string | sq_string
|
112 | # Expr_Symbol could be %mykey
|
113 |
|
114 | | eggex
|
115 |
|
116 | # $foo is disallowed, but $? is allowed. Should be "$foo" to indicate a
|
117 | # string, or ${foo:-}
|
118 | | simple_var_sub
|
119 | | sh_command_sub | braced_var_sub
|
120 | | sh_array_literal
|
121 | | old_sh_array_literal
|
122 | )
|
123 |
|
124 | place_trailer: (
|
125 | '[' subscriptlist ']'
|
126 | | '.' Expr_Name
|
127 | )
|
128 |
|
129 | # var f = f(x)
|
130 | trailer: (
|
131 | '(' [arglist] ')'
|
132 | | '[' subscriptlist ']'
|
133 |
|
134 | # Is a {} trailing useful for anything? It's not in Python or JS
|
135 |
|
136 | | '.' Expr_Name
|
137 | | '->' Expr_Name
|
138 | | '=>' Expr_Name
|
139 | )
|
140 |
|
141 | # YSH patch: this is 'expr' instead of 'test'
|
142 | # - 1:(3<4) doesn't make sense.
|
143 | # - TODO: could we revert this? I think it might have been because we wanted
|
144 | # first class slices like var x = 1:n, but we have ranges var x = 1 .. n instead.
|
145 | # - There was also the colon conflict for :symbol
|
146 |
|
147 | subscriptlist: subscript (',' subscript)* [',']
|
148 |
|
149 | # TODO: Add => as low precedence operator, for Func[Str, Int => Str]
|
150 | subscript: expr | [expr] ':' [expr]
|
151 |
|
152 | # TODO: => should be even lower precedence here too
|
153 | testlist: test (',' test)* [',']
|
154 |
|
155 | # Dict syntax resembles JavaScript
|
156 | # https://stackoverflow.com/questions/38948306/what-is-javascript-shorthand-property
|
157 | #
|
158 | # Examples:
|
159 | # {age: 20} is like {'age': 20}
|
160 | #
|
161 | # x = 'age'
|
162 | # d = %{[x]: 20} # Evaluate x as a variable
|
163 | # d = %{["foo$x"]: 20} # Another expression
|
164 | # d = %{[x, y]: 20} # Tuple key
|
165 | # d = %{key1, key1: 123}
|
166 | # Notes:
|
167 | # - Value is optional when the key is a name, because it can be taken from the
|
168 | # environment.
|
169 | # - We don't have:
|
170 | # - dict comprehensions. Maybe wait until LR parsing?
|
171 | # - Splatting with **
|
172 |
|
173 | dict_pair: (
|
174 | Expr_Name [':' test]
|
175 | | '[' testlist ']' ':' test
|
176 | | sq_string ':' test
|
177 | | dq_string ':' test
|
178 | )
|
179 |
|
180 | comma_newline: ',' [Op_Newline] | Op_Newline
|
181 |
|
182 | dict: dict_pair (comma_newline dict_pair)* [comma_newline]
|
183 |
|
184 | # This is how Python implemented dict comprehensions. We can probably do the
|
185 | # same.
|
186 | #
|
187 | # dictorsetmaker: ( ((test ':' test | '**' expr)
|
188 | # (comp_for | (',' (test ':' test | '**' expr))* [','])) |
|
189 | # ((test | splat_expr)
|
190 | # (comp_for | (',' (test | splat_expr))* [','])) )
|
191 |
|
192 | # The reason that keywords are test nodes instead of NAME is that using NAME
|
193 | # results in an ambiguity. ast.c makes sure it's a NAME.
|
194 | # "test '=' test" is really "keyword '=' test", but we have no such token.
|
195 | # These need to be in a single rule to avoid grammar that is ambiguous
|
196 | # to our LL(1) parser. Even though 'test' includes '*expr' in splat_expr,
|
197 | # we explicitly match '*' here, too, to give it proper precedence.
|
198 | # Illegal combinations and orderings are blocked in ast.c:
|
199 | # multiple (test comp_for) arguments are blocked; keyword unpackings
|
200 | # that precede iterable unpackings are blocked; etc.
|
201 |
|
202 | argument: (
|
203 | test [comp_for]
|
204 | # named arg
|
205 | | test '=' test
|
206 | # splat. The ... goes before, not after, to be consistent with Python, JS,
|
207 | # and the prefix @ operator.
|
208 | | '...' test
|
209 | )
|
210 |
|
211 | # The grammar at call sites is less restrictive than at declaration sites.
|
212 | # ... can appear anywhere. Keyword args can appear anywhere too.
|
213 | arg_group: argument (',' argument)* [',']
|
214 | arglist: (
|
215 | [arg_group]
|
216 | [';' [arg_group]]
|
217 | )
|
218 | arglist3: (
|
219 | [arg_group]
|
220 | [';' [arg_group]]
|
221 | [';' [argument]] # procs have an extra block argument
|
222 | )
|
223 |
|
224 |
|
225 | # YSH patch: test_nocond -> or_test. I believe this was trying to prevent the
|
226 | # "double if" ambiguity here:
|
227 | # #
|
228 | # [x for x in range(3) if lambda x: x if 1]
|
229 | #
|
230 | # but YSH doesn't support "nested loops", so we don't have this problem.
|
231 | comp_for: 'for' name_type_list 'in' or_test ['if' or_test]
|
232 |
|
233 |
|
234 | #
|
235 | # Expressions that are New in YSH
|
236 | #
|
237 |
|
238 | # Notes:
|
239 | # - Most of these occur in 'atom' above
|
240 | # - You can write $mystr but not mystr. It has to be (mystr)
|
241 | array_item: (
|
242 | Expr_Null | Expr_True | Expr_False
|
243 | | Expr_Float | Expr_DecInt | Expr_BinInt | Expr_OctInt | Expr_HexInt
|
244 | | dq_string | sq_string
|
245 | | sh_command_sub | braced_var_sub | simple_var_sub
|
246 | | '(' test ')'
|
247 | )
|
248 | sh_array_literal: ':|' Expr_CastedDummy Op_Pipe
|
249 |
|
250 | # TODO: remove old array
|
251 | old_sh_array_literal: '%(' Expr_CastedDummy Right_ShArrayLiteral
|
252 | sh_command_sub: ( '$(' | '@(' | '^(' ) Expr_CastedDummy Eof_RParen
|
253 |
|
254 | # " $" """ $""" ^"
|
255 | dq_string: (
|
256 | Left_DoubleQuote | Left_DollarDoubleQuote |
|
257 | Left_TDoubleQuote | Left_DollarTDoubleQuote |
|
258 | Left_CaretDoubleQuote
|
259 | ) Expr_CastedDummy Right_DoubleQuote
|
260 |
|
261 | # ' ''' r' r'''
|
262 | # $' for "refactoring" property
|
263 | # u' u''' b' b'''
|
264 | sq_string: (
|
265 | Left_SingleQuote | Left_TSingleQuote
|
266 | | Left_RSingleQuote | Left_RTSingleQuote
|
267 | | Left_DollarSingleQuote
|
268 | | Left_USingleQuote | Left_UTSingleQuote
|
269 | | Left_BSingleQuote | Left_BTSingleQuote
|
270 | ) Expr_CastedDummy Right_SingleQuote
|
271 |
|
272 | braced_var_sub: '${' Expr_CastedDummy Right_DollarBrace
|
273 |
|
274 | simple_var_sub: (
|
275 | # This is everything in Kind.VSub except VSub_Name, which is braced: ${foo}
|
276 | #
|
277 | # Note: we could allow $foo and $0, but disallow the rest in favor of ${@}
|
278 | # and ${-}? Meh it's too inconsistent.
|
279 | VSub_DollarName | VSub_Number
|
280 | | VSub_Bang | VSub_At | VSub_Pound | VSub_Dollar | VSub_Star | VSub_Hyphen
|
281 | | VSub_QMark
|
282 | # NOTE: $? should be STATUS because it's an integer.
|
283 | )
|
284 |
|
285 | #
|
286 | # Assignment / Type Variables
|
287 | #
|
288 | # Several differences vs. Python:
|
289 | #
|
290 | # - no yield expression on RHS
|
291 | # - no star expressions on either side (Python 3) *x, y = 2, *b
|
292 | # - no multiple assignments like: var x = y = 3
|
293 | # - type annotation syntax is more restrictive # a: (1+2) = 3 is OK in python
|
294 | # - We're validating the lvalue here, instead of doing it in the "transformer".
|
295 | # We have the 'var' prefix which helps.
|
296 |
|
297 | # name_type use cases:
|
298 | # var x Int, y Int = 3, 5
|
299 | # / <capture d+ as date: int> /
|
300 | #
|
301 | # for x Int, y Int
|
302 | # [x for x Int, y Int in ...]
|
303 | #
|
304 | # func(x Int, y Int) - this is separate
|
305 |
|
306 | # Optional colon because we want both
|
307 |
|
308 | # var x: Int = 42 # colon looks nicer
|
309 | # proc p (; x Int, y Int; z Int) { echo hi } # colon gets in the way of ;
|
310 |
|
311 | name_type: Expr_Name [':'] [type_expr]
|
312 | name_type_list: name_type (',' name_type)*
|
313 |
|
314 | type_expr: Expr_Name [ '[' type_expr (',' type_expr)* ']' ]
|
315 |
|
316 | # NOTE: Eof_RParen and Eof_Backtick aren't allowed because we don't want 'var'
|
317 | # in command subs.
|
318 | end_stmt: '}' | ';' | Op_Newline | Eof_Real
|
319 |
|
320 | # TODO: allow -> to denote aliasing/mutation
|
321 | ysh_var_decl: name_type_list ['=' testlist] end_stmt
|
322 |
|
323 | # Note: this is a more precise way of writing ysh_mutation, but it's ambiguous :(
|
324 | # ysh_mutation: lhs augassign testlist end_stmt
|
325 | # | lhs_list '=' testlist end_stmt
|
326 |
|
327 | # Note: for YSH (not Tea), we could accept [':'] expr for setvar :out = 'foo'
|
328 | lhs_list: expr (',' expr)*
|
329 |
|
330 | # TODO: allow -> to denote aliasing/mutation
|
331 | ysh_mutation: lhs_list (augassign | '=') testlist end_stmt
|
332 |
|
333 | # proc arg lists, like:
|
334 | # json write (x, indent=1)
|
335 | # cd /tmp ( ; ; ^(echo hi))
|
336 | #
|
337 | # What about:
|
338 | # myproc /tmp [ ; ; ^(echo hi)] - I guess this doesn't make sense?
|
339 | ysh_eager_arglist: '(' [arglist3] ')'
|
340 | ysh_lazy_arglist: '[' [arglist] ']'
|
341 |
|
342 | #
|
343 | # Other Entry Points
|
344 | #
|
345 |
|
346 | # if (x > 0) etc.
|
347 | ysh_expr: '(' testlist ')'
|
348 |
|
349 | # = 42 + a[i]
|
350 | # call f(x)
|
351 | command_expr: testlist end_stmt
|
352 |
|
353 | # $[d->key] etc.
|
354 | ysh_expr_sub: testlist ']'
|
355 |
|
356 | # Signatures for proc and func.
|
357 |
|
358 | # Note: 'proc name-with-hyphens' is allowed, so we can't parse the name in
|
359 | # expression mode.
|
360 | ysh_proc: (
|
361 | [ '('
|
362 | [ param_group ] # word params, with defaults
|
363 | [ ';' [ param_group ] ] # positional typed params, with defaults
|
364 | [ ';' [ param_group ] ] # named params, with defaults
|
365 | [ ';' [ param_group ] ] # optional block param, with no type or default
|
366 |
|
367 | # This causes a pgen2 error? It doesn't know which branch to take
|
368 | # So we have the extra {block} syntax
|
369 | #[ ';' Expr_Name ] # optional block param, with no type or default
|
370 | ')'
|
371 | ]
|
372 | '{' # opening { for pgen2
|
373 | )
|
374 |
|
375 | ysh_func: (
|
376 | Expr_Name '(' [param_group] [';' param_group] ')' ['=>' type_expr] '{'
|
377 | )
|
378 |
|
379 | param: Expr_Name [type_expr] ['=' expr]
|
380 |
|
381 | # This is an awkward way of writing that '...' has to come last.
|
382 | param_group: (
|
383 | (param ',')*
|
384 | [ (param | '...' Expr_Name) [','] ]
|
385 | )
|
386 |
|
387 | #
|
388 | # Regex Sublanguage
|
389 | #
|
390 |
|
391 | char_literal: Char_OneChar | Char_Hex | Char_UBraced
|
392 |
|
393 | # we allow a-z A-Z 0-9 as ranges, but otherwise they have to be quoted
|
394 | # The parser enforces that they are single strings
|
395 | range_char: Expr_Name | Expr_DecInt | sq_string | char_literal
|
396 |
|
397 | # digit or a-z
|
398 | # We have to do further validation of ranges later.
|
399 | class_literal_term: (
|
400 | # NOTE: range_char has sq_string
|
401 | range_char ['-' range_char ]
|
402 | # splice a literal set of characters
|
403 | | '@' Expr_Name
|
404 | | '!' Expr_Name
|
405 | # Reserved for [[.collating sequences.]] (Unicode)
|
406 | | '.' Expr_Name
|
407 | # Reserved for [[=character equivalents=]] (Unicode)
|
408 | | '=' Expr_Name
|
409 | # TODO: Do these char classes actually work in bash/awk/egrep/sed/etc.?
|
410 |
|
411 | )
|
412 | class_literal: '[' class_literal_term+ ']'
|
413 |
|
414 | # NOTE: Here is an example of where you can put ^ in the middle of a pattern in
|
415 | # Python, and it matters!
|
416 | # >>> r = re.compile('.f[a-z]*', re.DOTALL|re.MULTILINE)
|
417 | # >>> r.findall('z\nfoo\nbeef\nfood\n')
|
418 | # ['\nfoo', 'ef', '\nfood']
|
419 | # >>> r = re.compile('.^f[a-z]*', re.DOTALL|re.MULTILINE)
|
420 | # >>> r.findall('z\nfoo\nbeef\nfood\n')
|
421 | # ['\nfoo', '\nfood']
|
422 |
|
423 | re_atom: (
|
424 | char_literal
|
425 | # builtin regex like 'digit' or a regex reference like 'D'
|
426 | | Expr_Name
|
427 | # %begin or %end
|
428 | | Expr_Symbol
|
429 | | class_literal
|
430 | # !digit or ![a-f]. Note ! %boundary could be \B in Python, but ERE
|
431 | # doesn't have anything like that
|
432 | | '!' (Expr_Name | class_literal)
|
433 |
|
434 | # syntactic space for Perl-style backtracking
|
435 | # !!REF 1 !!REF name
|
436 | # !!AHEAD(d+) !!BEHIND(d+) !!NOT_AHEAD(d+) !!NOT_BEHIND(d+)
|
437 | #
|
438 | # Note: !! conflicts with history
|
439 | | '!' '!' Expr_Name (Expr_Name | Expr_DecInt | '(' regex ')')
|
440 |
|
441 | # Splice another expression
|
442 | | '@' Expr_Name
|
443 | # any %start %end are preferred
|
444 | | '.' | '^' | '$'
|
445 | # In a language-independent spec, backslashes are disallowed within 'sq'.
|
446 | # Write it with char literals outside strings: 'foo' \\ 'bar' \n
|
447 | #
|
448 | # No double-quoted strings because you can write "x = $x" with 'x = ' @x
|
449 | | sq_string
|
450 |
|
451 | # grouping (non-capturing in Perl; capturing in ERE although < > is preferred)
|
452 | | '(' regex ')'
|
453 |
|
454 | # Capturing group, with optional name and conversion function
|
455 | # <capture d+ as date>
|
456 | # <capture d+ as date: int>
|
457 | # <capture d+ : int>
|
458 | | '<' 'capture' regex ['as' Expr_Name] [':' Expr_Name] '>'
|
459 |
|
460 | # Might want this obscure conditional construct. Can't use C-style ternary
|
461 | # because '?' is a regex operator.
|
462 | #| '{' regex 'if' regex 'else' regex '}'
|
463 |
|
464 | # Others:
|
465 | # PCRE has (?R ) for recursion? That could be !RECURSE()
|
466 | # Note: .NET has && in character classes, making it a recursive language
|
467 | )
|
468 |
|
469 | # e.g. a{3} a{3,4} a{3,} a{,4} but not a{,}
|
470 | repeat_range: (
|
471 | Expr_DecInt [',']
|
472 | | ',' Expr_DecInt
|
473 | | Expr_DecInt ',' Expr_DecInt
|
474 | )
|
475 |
|
476 | repeat_op: (
|
477 | '+' | '*' | '?'
|
478 | # In PCRE, ?? *? +? {}? is lazy/nongreedy and ?+ *+ ++ {}+ is "possessive"
|
479 | # We use N (non-greedy) and P (possessive) modifiers within {}.
|
480 | # a{N +} a{P ?} a{P 3,4} a{P ,4}
|
481 | | '{' [Expr_Name] ('+' | '*' | '?' | repeat_range) '}'
|
482 | )
|
483 |
|
484 | re_alt: (re_atom [repeat_op])+
|
485 |
|
486 | regex: [re_alt] (('|'|'or') re_alt)*
|
487 |
|
488 | # e.g. /digit+ ; multiline !ignorecase/
|
489 | #
|
490 | # This can express translation preferences:
|
491 | #
|
492 | # / d+ ; ; ERE / is '[[:digit:]]+'
|
493 | # / d+ ; ; PCRE / is '\d+'
|
494 | # / d+ ; ignorecase ; python / is '(?i)\d+'
|
495 |
|
496 | # Python has the syntax
|
497 | # (?i:myre) to set a flag
|
498 | # (?-i:myre) to remove a flag
|
499 | #
|
500 | # They can apply to portions of the expression, which we don't have here.
|
501 | re_flag: ['!'] Expr_Name
|
502 | eggex: '/' regex [';' re_flag* [';' Expr_Name] ] '/'
|
503 |
|
504 | # Patterns are the start of a case arm, i.e.,
|
505 | #
|
506 | # case (foo) {
|
507 | # (40 + 2) | (0) { echo number }
|
508 | # ^^^^^^^^^^^^^^-- This is pattern
|
509 | # }
|
510 | #
|
511 | # Due to limitations created from pgen2/cmd_parser interactions, we also parse
|
512 | # the leading '{' token of the case arm body in pgen2. We do this to help pgen2
|
513 | # figure out when to transfer control back to the cmd_parser. For more details
|
514 | # see #oil-dev > Dev Friction / Smells.
|
515 | #
|
516 | # case (foo) {
|
517 | # (40 + 2) | (0) { echo number }
|
518 | # ^-- End of pattern/beginning of case arm body
|
519 | # }
|
520 |
|
521 | ysh_case_pat: (
|
522 | '(' (pat_else | pat_exprs)
|
523 | | eggex
|
524 | ) [Op_Newline] '{'
|
525 |
|
526 | pat_else: 'else' ')'
|
527 | pat_exprs: expr ')' [Op_Newline] ('|' [Op_Newline] '(' expr ')' [Op_Newline])*
|
528 |
|
529 |
|
530 | # Syntax reserved for PCRE/Python, but that's not in ERE:
|
531 | #
|
532 | # non-greedy a{N *}
|
533 | # non-capturing ( digit+ )
|
534 | # backtracking !!REF 1 !!AHEAD(d+)
|
535 | #
|
536 | # Legacy syntax:
|
537 | #
|
538 | # ^ and $ instead of %start and %end
|
539 | # < and > instead of %start_word and %end_word
|
540 | # . instead of dot
|
541 | # | instead of 'or'
|