# OILS / ysh / expr_to_ast.py -- from oilshell.org
# (1713 lines, 1032 significant)
1"""expr_to_ast.py."""
2from __future__ import print_function
3
4from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str, Kind
5from _devbuild.gen.syntax_asdl import (
6 Token,
7 SimpleVarSub,
8 loc,
9 loc_t,
10 DoubleQuoted,
11 SingleQuoted,
12 BracedVarSub,
13 CommandSub,
14 ShArrayLiteral,
15 command,
16 expr,
17 expr_e,
18 expr_t,
19 expr_context_e,
20 re,
21 re_t,
22 re_repeat,
23 re_repeat_t,
24 class_literal_term,
25 class_literal_term_t,
26 PosixClass,
27 PerlClass,
28 NameType,
29 y_lhs_t,
30 Comprehension,
31 Subscript,
32 Attribute,
33 proc_sig,
34 proc_sig_t,
35 Param,
36 RestParam,
37 ParamGroup,
38 NamedArg,
39 ArgList,
40 pat,
41 pat_t,
42 TypeExpr,
43 Func,
44 Eggex,
45 EggexFlag,
46 CharCode,
47 CharRange,
48)
49from _devbuild.gen.value_asdl import value, value_t
50from _devbuild.gen import grammar_nt
51from core.error import p_die
52from data_lang import j8
53from frontend import consts
54from frontend import lexer
55from frontend import location
56from mycpp import mops
57from mycpp import mylib
58from mycpp.mylib import log, tagswitch
59from osh import word_compile
60from ysh import expr_parse
61from ysh import regex_translate
62
63from typing import TYPE_CHECKING, Dict, List, Tuple, Optional, cast
64if TYPE_CHECKING:
65 from pgen2.grammar import Grammar
66 from pgen2.pnode import PNode
67
68_ = log
69
70PERL_CLASSES = {
71 'd': 'd',
72 'w': 'w',
73 'word': 'w',
74 's': 's',
75}
76# https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html
77POSIX_CLASSES = [
78 'alnum',
79 'cntrl',
80 'lower',
81 'space',
82 'alpha',
83 'digit',
84 'print',
85 'upper',
86 'blank',
87 'graph',
88 'punct',
89 'xdigit',
90]
91# NOTE: There are also things like \p{Greek} that we could put in the
92# "non-sigil" namespace.
93
94RANGE_POINT_TOO_LONG = "Range start/end shouldn't have more than one character"
95
96POS_ARG_MISPLACED = "Positional arg can't appear in group of named args"
97
98# Copied from pgen2/token.py to avoid dependency.
99NT_OFFSET = 256
100
if mylib.PYTHON:

    def MakeGrammarNames(ysh_grammar):
        # type: (Grammar) -> Dict[int, str]
        """Build a combined {number: name} map for tokens and nonterminals.

        Used only for debug printing of raw parse trees.
        """

        # TODO: Break this dependency
        from frontend import lexer_def

        names = {}

        # Token IDs first.  A few exceed NT_OFFSET and are skipped so they
        # don't collide with nonterminal numbers.
        for id_name, id_int in lexer_def.ID_SPEC.id_str2int.items():
            # Hm some are out of range
            #assert k < 256, (k, id_name)
            if id_int < NT_OFFSET:
                names[id_int] = id_name

        # Then nonterminal symbols, which always start at NT_OFFSET.
        for num, sym in ysh_grammar.number2symbol.items():
            assert num >= NT_OFFSET, (num, sym)
            names[num] = sym

        return names
124
125
126class Transformer(object):
127 """Homogeneous parse tree -> heterogeneous AST ("lossless syntax tree")
128
129 pgen2 (Python's LL parser generator) doesn't have semantic actions like yacc,
130 so this "transformer" is the equivalent.
131
132 Files to refer to when modifying this function:
133
134 ysh/grammar.pgen2 (generates _devbuild/gen/grammar_nt.py)
135 frontend/syntax.asdl (generates _devbuild/gen/syntax_asdl.py)
136
137 Related examples:
138
139 opy/compiler2/transformer.py (Python's parse tree -> AST, ~1500 lines)
140 Python-2.7.13/Python/ast.c (the "real" CPython version, ~3600 lines)
141
142 Other:
143 frontend/parse_lib.py (turn on print_parse_tree)
144
145 Public methods:
146 Expr, VarDecl
147 atom, trailer, etc. are private, named after productions in grammar.pgen2.
148 """
149
150 def __init__(self, gr):
151 # type: (Grammar) -> None
152 self.number2symbol = gr.number2symbol
153 if mylib.PYTHON:
154 names = MakeGrammarNames(gr)
155 # print raw nodes
156 self.p_printer = expr_parse.ParseTreePrinter(names)
157
158 def _LeftAssoc(self, p_node):
159 # type: (PNode) -> expr_t
160 """For an associative binary operation.
161
162 Examples:
163 xor_expr: and_expr ('xor' and_expr)*
164 term: factor (('*'|'/'|'%'|'div') factor)*
165
166 3 - 1 - 2 must be grouped as ((3 - 1) - 2).
167 """
168 # Note: Compare the iteractive com_binary() method in
169 # opy/compiler2/transformer.py.
170
171 # Examples:
172 # - The PNode for '3 - 1' will have 3 children
173 # - The PNode for '3 - 1 - 2' will have 5 children
174
175 #self.p_printer.Print(p_node)
176
177 i = 1 # index of the operator
178 n = p_node.NumChildren()
179
180 left = self.Expr(p_node.GetChild(0))
181 while i < n:
182 op = p_node.GetChild(i)
183 right = self.Expr(p_node.GetChild(i + 1))
184
185 # create a new left node
186 left = expr.Binary(op.tok, left, right)
187 i += 2
188
189 return left
190
191 def _Trailer(self, base, p_trailer):
192 # type: (expr_t, PNode) -> expr_t
193 """
194 trailer: ( '(' [arglist] ')' | '[' subscriptlist ']'
195 | '.' NAME | '->' NAME | '::' NAME
196 )
197 """
198 tok0 = p_trailer.GetChild(0).tok
199 typ0 = p_trailer.GetChild(0).typ
200
201 if typ0 == Id.Op_LParen:
202 lparen = tok0
203 rparen = p_trailer.GetChild(-1).tok
204 arglist = ArgList(lparen, [], None, [], None, None, rparen)
205 if p_trailer.NumChildren() == 2: # ()
206 return expr.FuncCall(base, arglist)
207
208 p = p_trailer.GetChild(1) # the X in ( X )
209 assert p.typ == grammar_nt.arglist # f(x, y)
210 self._ArgList(p, arglist)
211 return expr.FuncCall(base, arglist)
212
213 if typ0 == Id.Op_LBracket:
214 p_args = p_trailer.GetChild(1)
215 assert p_args.typ == grammar_nt.subscriptlist
216 n = p_args.NumChildren()
217 if n > 1:
218 p_die("Only 1 subscript is accepted", p_args.GetChild(1).tok)
219
220 a = p_args.GetChild(0)
221 return Subscript(tok0, base, self._Subscript(a))
222
223 if typ0 in (Id.Expr_Dot, Id.Expr_RArrow, Id.Expr_RDArrow):
224 attr = p_trailer.GetChild(1).tok # will be Id.Expr_Name
225 return Attribute(base, tok0, attr, lexer.TokenVal(attr),
226 expr_context_e.Store)
227
228 raise AssertionError(typ0)
229
230 def _DictPair(self, p_node):
231 # type: (PNode) -> Tuple[expr_t, expr_t]
232 """
233 dict_pair: ( Expr_Name [':' test]
234 | '[' testlist ']' ':' test )
235 | sq_string ':' test
236 | dq_string ':' test )
237 """
238 assert p_node.typ == grammar_nt.dict_pair
239
240 typ = p_node.GetChild(0).typ
241
242 if typ in (grammar_nt.sq_string, grammar_nt.dq_string):
243 key = self.Expr(p_node.GetChild(0)) # type: expr_t
244 val = self.Expr(p_node.GetChild(2))
245 return key, val
246
247 tok0 = p_node.GetChild(0).tok
248 id_ = tok0.id
249
250 if id_ == Id.Expr_Name:
251 key_str = value.Str(lexer.TokenVal(tok0))
252 key = expr.Const(tok0, key_str)
253 if p_node.NumChildren() >= 3:
254 val = self.Expr(p_node.GetChild(2))
255 else:
256 val = expr.Implicit
257
258 if id_ == Id.Op_LBracket: # {[x+y]: 'val'}
259 key = self.Expr(p_node.GetChild(1))
260 val = self.Expr(p_node.GetChild(4))
261 return key, val
262
263 return key, val
264
265 def _Dict(self, parent, p_node):
266 # type: (PNode, PNode) -> expr.Dict
267 """
268 dict: dict_pair (comma_newline dict_pair)* [comma_newline]
269 """
270 if p_node.typ == Id.Op_RBrace: # {}
271 return expr.Dict(parent.tok, [], [])
272
273 assert p_node.typ == grammar_nt.dict
274
275 keys = [] # type: List[expr_t]
276 values = [] # type: List[expr_t]
277
278 n = p_node.NumChildren()
279 for i in xrange(0, n, 2):
280 key, val = self._DictPair(p_node.GetChild(i))
281 keys.append(key)
282 values.append(val)
283
284 return expr.Dict(parent.tok, keys, values)
285
286 def _Tuple(self, parent):
287 # type: (PNode) -> expr_t
288
289 n = parent.NumChildren()
290
291 # (x) -- not a tuple
292 if n == 1:
293 return self.Expr(parent.GetChild(0))
294
295 # x, and (x,) aren't allowed
296 if n == 2:
297 p_die('Invalid trailing comma', parent.GetChild(1).tok)
298
299 elts = [] # type: List[expr_t]
300 for i in xrange(0, n, 2): # skip commas
301 p_node = parent.GetChild(i)
302 elts.append(self.Expr(p_node))
303
304 return expr.Tuple(parent.tok, elts,
305 expr_context_e.Store) # unused expr_context_e
306
307 def _TestlistComp(self, parent, p_node, id0):
308 # type: (PNode, PNode, Id_t) -> expr_t
309 """
310 testlist_comp:
311 (test|star_expr) ( comp_for | (',' (test|star_expr))* [','] )
312 """
313 assert p_node.typ == grammar_nt.testlist_comp
314
315 n = p_node.NumChildren()
316 if n > 1 and p_node.GetChild(1).typ == grammar_nt.comp_for:
317 elt = self.Expr(p_node.GetChild(0))
318 comp = self._CompFor(p_node.GetChild(1))
319 if id0 == Id.Op_LParen: # (x+1 for x in y)
320 return expr.GeneratorExp(elt, [comp])
321 if id0 == Id.Op_LBracket: # [x+1 for x in y]
322 return expr.ListComp(parent.tok, elt, [comp])
323 raise AssertionError()
324
325 if id0 == Id.Op_LParen:
326 # Parenthesized expression like (x+1) or (x)
327 if n == 1:
328 return self.Expr(p_node.GetChild(0))
329
330 # Tuples (1,) (1, 2) etc. - TODO: should be a list literal?
331 if p_node.GetChild(1).typ == Id.Arith_Comma:
332 return self._Tuple(p_node)
333
334 raise AssertionError()
335
336 if id0 == Id.Op_LBracket: # List [1,2,3]
337 elts = [] # type: List[expr_t]
338 for i in xrange(0, n, 2): # skip commas
339 elts.append(self.Expr(p_node.GetChild(i)))
340
341 return expr.List(parent.tok, elts,
342 expr_context_e.Store) # unused expr_context_e
343
344 raise AssertionError(Id_str(id0))
345
346 def _Atom(self, parent):
347 # type: (PNode) -> expr_t
348 """Handle alternatives of 'atom' where there's more than one child."""
349
350 tok = parent.GetChild(0).tok
351 id_ = tok.id
352 n = parent.NumChildren()
353
354 if id_ == Id.Op_LParen:
355 # atom: '(' [yield_expr|testlist_comp] ')' | ...
356 if n == 2: # () is a tuple
357 assert (
358 parent.GetChild(1).typ == Id.Op_RParen), parent.GetChild(1)
359 return expr.Tuple(tok, [], expr_context_e.Store)
360
361 return self._TestlistComp(parent, parent.GetChild(1), id_)
362
363 if id_ == Id.Op_LBracket:
364 # atom: ... | '[' [testlist_comp] ']' | ...
365
366 if n == 2: # []
367 assert (parent.GetChild(1).typ == Id.Op_RBracket
368 ), parent.GetChild(1)
369 return expr.List(tok, [],
370 expr_context_e.Store) # unused expr_context_e
371
372 return self._TestlistComp(parent, parent.GetChild(1), id_)
373
374 if id_ == Id.Left_CaretBracket: # ^[42 + x]
375 child = self.Expr(parent.GetChild(1))
376 return expr.Literal(child)
377
378 if id_ == Id.Op_LBrace:
379 # atom: ... | '{' [Op_Newline] [dict] '}'
380 i = 1
381 if parent.GetChild(i).typ == Id.Op_Newline:
382 i += 1
383 return self._Dict(parent, parent.GetChild(i))
384
385 if id_ == Id.Arith_Amp:
386 n = parent.NumChildren()
387 if n >= 3:
388 p_die("Places in containers not implemented yet",
389 parent.GetChild(2).tok)
390
391 name_tok = parent.GetChild(1).tok
392 return expr.Place(name_tok, lexer.TokenVal(name_tok), [])
393
394 if id_ == Id.Expr_Func:
395 # STUB. This should really be a Func, not Lambda.
396 return expr.Lambda([], expr.Implicit)
397
398 # 100 M
399 # Ignoring the suffix for now
400 if id_ == Id.Expr_DecInt:
401 assert n > 1
402 p_die("Units suffix not implemented", parent.GetChild(1).tok)
403 #return self.Expr(parent.GetChild(0))
404
405 # 100.5 M
406 # Ignoring the suffix for now
407 if id_ == Id.Expr_Float:
408 assert n > 1
409 p_die("unix suffix implemented", parent.GetChild(1).tok)
410 #return self.Expr(parent.GetChild(0))
411
412 raise AssertionError(Id_str(id_))
413
414 def _NameType(self, p_node):
415 # type: (PNode) -> NameType
416 """ name_type: Expr_Name [':'] [type_expr] """
417 name_tok = p_node.GetChild(0).tok
418 typ = None # type: Optional[TypeExpr]
419
420 n = p_node.NumChildren()
421 if n == 2:
422 typ = self._TypeExpr(p_node.GetChild(1))
423 if n == 3:
424 typ = self._TypeExpr(p_node.GetChild(2))
425
426 return NameType(name_tok, lexer.TokenVal(name_tok), typ)
427
428 def _NameTypeList(self, p_node):
429 # type: (PNode) -> List[NameType]
430 """ name_type_list: name_type (',' name_type)* """
431 assert p_node.typ == grammar_nt.name_type_list
432 results = [] # type: List[NameType]
433
434 n = p_node.NumChildren()
435 for i in xrange(0, n, 2): # was children[::2]
436 results.append(self._NameType(p_node.GetChild(i)))
437 return results
438
439 def _CompFor(self, p_node):
440 # type: (PNode) -> Comprehension
441 """comp_for: 'for' exprlist 'in' or_test ['if' or_test]"""
442 lhs = self._NameTypeList(p_node.GetChild(1))
443 iterable = self.Expr(p_node.GetChild(3))
444
445 if p_node.NumChildren() >= 6:
446 cond = self.Expr(p_node.GetChild(5))
447 else:
448 cond = None
449
450 return Comprehension(lhs, iterable, cond)
451
    def _CompareChain(self, parent):
        # type: (PNode) -> expr_t
        """comparison: expr (comp_op expr)*

        Chained comparisons like 1 < x <= 3 produce a single expr.Compare
        with parallel lists of operator tokens and comparand expressions.
        """
        cmp_ops = []  # type: List[Token]
        comparators = []  # type: List[expr_t]
        left = self.Expr(parent.GetChild(0))

        i = 1
        n = parent.NumChildren()
        while i < n:
            p = parent.GetChild(i)
            op = p.GetChild(0).tok
            if p.NumChildren() == 2:
                # Two-word operator ('not in' / 'is not'):
                # Blame the first token, and change its type
                if op.id == Id.Expr_Not:  # not in
                    op.id = Id.Node_NotIn
                elif op.id == Id.Expr_Is:  # is not
                    op.id = Id.Node_IsNot
                else:
                    raise AssertionError()
            else:
                # is, <, ==, etc.
                pass

            cmp_ops.append(op)
            i += 1  # advance past comp_op to the right-hand operand
            comparators.append(self.Expr(parent.GetChild(i)))
            i += 1
        return expr.Compare(left, cmp_ops, comparators)
481
482 def _Subscript(self, parent):
483 # type: (PNode) -> expr_t
484 """subscript: expr | [expr] ':' [expr]"""
485 typ0 = parent.GetChild(0).typ
486
487 n = parent.NumChildren()
488
489 if typ0 == grammar_nt.expr:
490 if n == 3: # a[1:2]
491 lower = self.Expr(parent.GetChild(0))
492 op_tok = parent.GetChild(1).tok
493 upper = self.Expr(parent.GetChild(2))
494
495 elif n == 2: # a[1:]
496 lower = self.Expr(parent.GetChild(0))
497 op_tok = parent.GetChild(1).tok
498 upper = None
499 else: # a[1]
500 return self.Expr(parent.GetChild(0))
501 else:
502 assert typ0 == Id.Arith_Colon
503 lower = None
504 if n == 1: # a[:]
505 op_tok = parent.GetChild(0).tok
506 upper = None
507 else: # a[:3]
508 op_tok = parent.GetChild(0).tok
509 upper = self.Expr(parent.GetChild(1))
510
511 return expr.Slice(lower, op_tok, upper)
512
    def Expr(self, pnode):
        # type: (PNode) -> expr_t
        """Transform expressions (as opposed to statements)

        Dispatches on pnode.typ: first nonterminals (grammar_nt.*), then
        terminal tokens, which become expr.Var or expr.Const.
        """
        typ = pnode.typ

        #
        # YSH Entry Points / Additions
        #

        if typ == grammar_nt.ysh_expr:  # for if/while
            # ysh_expr: '(' testlist ')'
            return self.Expr(pnode.GetChild(1))

        if typ == grammar_nt.command_expr:
            # return_expr: testlist end_stmt
            return self.Expr(pnode.GetChild(0))

        #
        # Python-like Expressions / Operators
        #

        if typ == grammar_nt.atom:
            if pnode.NumChildren() == 1:
                return self.Expr(pnode.GetChild(0))
            return self._Atom(pnode)

        if typ == grammar_nt.testlist:
            # testlist: test (',' test)* [',']
            return self._Tuple(pnode)

        if typ == grammar_nt.test:
            # test: or_test ['if' or_test 'else' test] | lambdef
            if pnode.NumChildren() == 1:
                return self.Expr(pnode.GetChild(0))

            # TODO: Handle lambdef

            # Ternary: children are body 'if' test 'else' orelse
            test = self.Expr(pnode.GetChild(2))
            body = self.Expr(pnode.GetChild(0))
            orelse = self.Expr(pnode.GetChild(4))
            return expr.IfExp(test, body, orelse)

        if typ == grammar_nt.lambdef:
            # lambdef: '|' [name_type_list] '|' test

            n = pnode.NumChildren()
            if n == 4:
                params = self._NameTypeList(pnode.GetChild(1))
            else:
                params = []

            body = self.Expr(pnode.GetChild(n - 1))
            return expr.Lambda(params, body)

        #
        # Operators with Precedence
        #

        if typ == grammar_nt.or_test:
            # or_test: and_test ('or' and_test)*
            return self._LeftAssoc(pnode)

        if typ == grammar_nt.and_test:
            # and_test: not_test ('and' not_test)*
            return self._LeftAssoc(pnode)

        if typ == grammar_nt.not_test:
            # not_test: 'not' not_test | comparison
            if pnode.NumChildren() == 1:
                return self.Expr(pnode.GetChild(0))

            op_tok = pnode.GetChild(0).tok  # not
            return expr.Unary(op_tok, self.Expr(pnode.GetChild(1)))

        elif typ == grammar_nt.comparison:
            if pnode.NumChildren() == 1:
                return self.Expr(pnode.GetChild(0))

            return self._CompareChain(pnode)

        elif typ == grammar_nt.range_expr:
            n = pnode.NumChildren()
            if n == 1:
                return self.Expr(pnode.GetChild(0))

            if n == 3:
                return expr.Range(self.Expr(pnode.GetChild(0)),
                                  pnode.GetChild(1).tok,
                                  self.Expr(pnode.GetChild(2)))

            raise AssertionError(n)

        elif typ == grammar_nt.expr:
            # expr: xor_expr ('|' xor_expr)*
            return self._LeftAssoc(pnode)

        if typ == grammar_nt.xor_expr:
            # xor_expr: and_expr ('xor' and_expr)*
            return self._LeftAssoc(pnode)

        if typ == grammar_nt.and_expr:  # a & b
            # and_expr: shift_expr ('&' shift_expr)*
            return self._LeftAssoc(pnode)

        elif typ == grammar_nt.shift_expr:
            # shift_expr: arith_expr (('<<'|'>>') arith_expr)*
            return self._LeftAssoc(pnode)

        elif typ == grammar_nt.arith_expr:
            # arith_expr: term (('+'|'-') term)*
            return self._LeftAssoc(pnode)

        elif typ == grammar_nt.term:
            # term: factor (('*'|'/'|'div'|'mod') factor)*
            return self._LeftAssoc(pnode)

        elif typ == grammar_nt.factor:
            # factor: ('+'|'-'|'~') factor | power
            # the power would have already been reduced
            if pnode.NumChildren() == 1:
                return self.Expr(pnode.GetChild(0))

            assert pnode.NumChildren() == 2
            op = pnode.GetChild(0)
            e = pnode.GetChild(1)

            assert isinstance(op.tok, Token)
            return expr.Unary(op.tok, self.Expr(e))

        elif typ == grammar_nt.power:
            # power: atom trailer* ['**' factor]

            node = self.Expr(pnode.GetChild(0))
            if pnode.NumChildren() == 1:  # No trailers
                return node

            # Support a->startswith(b) and mydict.key
            n = pnode.NumChildren()
            i = 1
            while i < n and pnode.GetChild(i).typ == grammar_nt.trailer:
                node = self._Trailer(node, pnode.GetChild(i))
                i += 1

            if i != n:  # ['**' factor]
                op_tok = pnode.GetChild(i).tok
                assert op_tok.id == Id.Arith_DStar, op_tok
                factor = self.Expr(pnode.GetChild(i + 1))
                node = expr.Binary(op_tok, node, factor)

            return node

        elif typ == grammar_nt.eggex:
            return self._Eggex(pnode)

        elif typ == grammar_nt.ysh_expr_sub:
            return self.Expr(pnode.GetChild(0))

        #
        # YSH Lexer Modes
        #
        # These nonterminals wrap a single token whose payload is already a
        # parsed AST node from another lexer mode, so we just cast it.

        elif typ == grammar_nt.sh_array_literal:
            return cast(ShArrayLiteral, pnode.GetChild(1).tok)

        elif typ == grammar_nt.old_sh_array_literal:
            return cast(ShArrayLiteral, pnode.GetChild(1).tok)

        elif typ == grammar_nt.sh_command_sub:
            return cast(CommandSub, pnode.GetChild(1).tok)

        elif typ == grammar_nt.braced_var_sub:
            return cast(BracedVarSub, pnode.GetChild(1).tok)

        elif typ == grammar_nt.dq_string:
            dq = cast(DoubleQuoted, pnode.GetChild(1).tok)
            # sugar: ^"..." is short for ^["..."]
            if pnode.GetChild(0).typ == Id.Left_CaretDoubleQuote:
                return expr.Literal(dq)
            return dq

        elif typ == grammar_nt.sq_string:
            return cast(SingleQuoted, pnode.GetChild(1).tok)

        elif typ == grammar_nt.simple_var_sub:
            tok = pnode.GetChild(0).tok

            if tok.id == Id.VSub_DollarName:  # $foo is disallowed
                bare = lexer.TokenSliceLeft(tok, 1)
                p_die(
                    'In expressions, remove $ and use `%s`, or sometimes "$%s"'
                    % (bare, bare), tok)

            # $? is allowed
            return SimpleVarSub(tok)

        #
        # Terminals
        #

        tok = pnode.tok
        if typ == Id.Expr_Name:
            return expr.Var(tok, lexer.TokenVal(tok))

        # Everything else is an expr.Const
        tok_str = lexer.TokenVal(tok)
        # Remove underscores from 1_000_000.  The lexer is responsible for
        # validation.
        c_under = tok_str.replace('_', '')

        if typ == Id.Expr_DecInt:
            try:
                cval = value.Int(mops.FromStr(c_under))  # type: value_t
            except ValueError:
                p_die('Decimal int constant is too large', tok)

        elif typ == Id.Expr_BinInt:
            assert c_under[:2] in ('0b', '0B'), c_under
            try:
                cval = value.Int(mops.FromStr(c_under[2:], 2))
            except ValueError:
                p_die('Binary int constant is too large', tok)

        elif typ == Id.Expr_OctInt:
            assert c_under[:2] in ('0o', '0O'), c_under
            try:
                cval = value.Int(mops.FromStr(c_under[2:], 8))
            except ValueError:
                p_die('Octal int constant is too large', tok)

        elif typ == Id.Expr_HexInt:
            assert c_under[:2] in ('0x', '0X'), c_under
            try:
                cval = value.Int(mops.FromStr(c_under[2:], 16))
            except ValueError:
                p_die('Hex int constant is too large', tok)

        elif typ == Id.Expr_Float:
            # Note: float() in mycpp/gc_builtins.cc currently uses strtod
            # I think this never raises ValueError, because the lexer
            # should only accept strings that strtod() does?
            cval = value.Float(float(c_under))

        elif typ == Id.Expr_Null:
            cval = value.Null

        elif typ == Id.Expr_True:
            cval = value.Bool(True)

        elif typ == Id.Expr_False:
            cval = value.Bool(False)

        elif typ == Id.Char_OneChar:  # \n
            assert len(tok_str) == 2, tok_str
            s = consts.LookupCharC(lexer.TokenSliceLeft(tok, 1))
            cval = value.Str(s)

        elif typ == Id.Char_YHex:  # \yff
            assert len(tok_str) == 4, tok_str
            hex_str = lexer.TokenSliceLeft(tok, 2)
            s = chr(int(hex_str, 16))
            cval = value.Str(s)

        elif typ == Id.Char_UBraced:  # \u{123}
            hex_str = lexer.TokenSlice(tok, 3, -1)
            code_point = int(hex_str, 16)
            s = j8.Utf8Encode(code_point)
            cval = value.Str(s)

        else:
            raise AssertionError(typ)

        return expr.Const(tok, cval)
785
    def _CheckLhs(self, lhs):
        # type: (expr_t) -> None
        # Validate an assignment target: recursing through Subscript /
        # Attribute chains like a.b.c[i], the innermost object must be a
        # plain Var.  Dies with a parse error otherwise.

        UP_lhs = lhs
        with tagswitch(lhs) as case:
            if case(expr_e.Var):
                # OK - e.g. setvar a.b.c[i] = 42
                pass

            elif case(expr_e.Subscript):
                lhs = cast(Subscript, UP_lhs)
                self._CheckLhs(lhs.obj)  # recurse on LHS

            elif case(expr_e.Attribute):
                lhs = cast(Attribute, UP_lhs)
                self._CheckLhs(lhs.obj)  # recurse on LHS

            else:
                # Illegal - e.g. setglobal {}["key"] = 42
                p_die("Subscript/Attribute not allowed on this LHS expression",
                      location.TokenForExpr(lhs))
807
    def _LhsExprList(self, p_node):
        # type: (PNode) -> List[y_lhs_t]
        """lhs_list: expr (',' expr)*

        Transform the target list of a mutation (e.g. setvar x, y = ...).
        Only Var, Subscript, and dot-Attribute expressions are assignable.
        """
        assert p_node.typ == grammar_nt.lhs_list

        lhs_list = []  # type: List[y_lhs_t]
        n = p_node.NumChildren()
        for i in xrange(0, n, 2):
            p = p_node.GetChild(i)
            #self.p_printer.Print(p)

            e = self.Expr(p)
            UP_e = e
            with tagswitch(e) as case:
                if case(expr_e.Var):
                    e = cast(expr.Var, UP_e)
                    lhs_list.append(e.left)

                elif case(expr_e.Subscript):
                    e = cast(Subscript, UP_e)
                    self._CheckLhs(e)
                    lhs_list.append(e)

                elif case(expr_e.Attribute):
                    e = cast(Attribute, UP_e)
                    self._CheckLhs(e)
                    if e.op.id != Id.Expr_Dot:
                        # e.g. setvar obj->method is not valid
                        p_die("Can't assign to this attribute expr", e.op)
                    lhs_list.append(e)

                else:
                    pass  # work around mycpp bug

                    # TODO: could blame arbitrary expr_t, but this works most
                    # of the time
                    if p.tok:
                        blame = p.tok  # type: loc_t
                    else:
                        blame = loc.Missing
                    p_die("Can't assign to this expression", blame)

        return lhs_list
851
852 def MakeVarDecl(self, p_node):
853 # type: (PNode) -> command.VarDecl
854 """
855 ysh_var_decl: name_type_list ['=' testlist] end_stmt
856 """
857 assert p_node.typ == grammar_nt.ysh_var_decl
858
859 lhs = self._NameTypeList(p_node.GetChild(0)) # could be a tuple
860
861 # This syntax is confusing, and different than JavaScript
862 # var x, y = 1, 2
863 # But this is useful:
864 # var flag, i = parseArgs(spec, argv)
865
866 n = p_node.NumChildren()
867 if n >= 3:
868 rhs = self.Expr(p_node.GetChild(2))
869 else:
870 rhs = None
871
872 # The caller should fill in the keyword token.
873 return command.VarDecl(None, lhs, rhs)
874
875 def MakeMutation(self, p_node):
876 # type: (PNode) -> command.Mutation
877 """
878 ysh_mutation: lhs_list (augassign | '=') testlist end_stmt
879 """
880 typ = p_node.typ
881 assert typ == grammar_nt.ysh_mutation
882
883 lhs_list = self._LhsExprList(p_node.GetChild(0)) # could be a tuple
884 op_tok = p_node.GetChild(1).tok
885 if len(lhs_list) > 1 and op_tok.id != Id.Arith_Equal:
886 p_die('Multiple assignment must use =', op_tok)
887 rhs = self.Expr(p_node.GetChild(2))
888 return command.Mutation(None, lhs_list, op_tok, rhs)
889
890 def _EggexFlag(self, p_node):
891 # type: (PNode) -> EggexFlag
892 n = p_node.NumChildren()
893 if n == 1:
894 return EggexFlag(False, p_node.GetChild(0).tok)
895 elif n == 2:
896 return EggexFlag(True, p_node.GetChild(1).tok)
897 else:
898 raise AssertionError()
899
    def _Eggex(self, p_node):
        # type: (PNode) -> Eggex
        """
        eggex: '/' regex [';' re_flag* [';' Expr_Name] ] '/'
        """
        left = p_node.GetChild(0).tok
        regex = self._Regex(p_node.GetChild(1))

        flags = []  # type: List[EggexFlag]
        trans_pref = None  # type: Optional[Token]

        # Walk the optional [';' re_flag* [';' Expr_Name]] suffix with a
        # cursor; child i is ';', a re_flag, the translation name, or '/'.
        i = 2
        current = p_node.GetChild(i)
        if current.typ == Id.Op_Semi:
            i += 1
            while True:
                current = p_node.GetChild(i)
                if current.typ != grammar_nt.re_flag:
                    break
                flags.append(self._EggexFlag(current))
                i += 1

            # A second ';' introduces the translation preference name
            if current.typ == Id.Op_Semi:
                i += 1
                trans_pref = p_node.GetChild(i).tok

        # Canonicalize and validate flags for ERE only. Default is ERE.
        if trans_pref is None or lexer.TokenVal(trans_pref) == 'ERE':
            canonical_flags = regex_translate.CanonicalFlags(flags)
        else:
            canonical_flags = None

        return Eggex(left, regex, flags, trans_pref, canonical_flags)
933
934 def YshCasePattern(self, pnode):
935 # type: (PNode) -> pat_t
936 assert pnode.typ == grammar_nt.ysh_case_pat, pnode
937
938 pattern = pnode.GetChild(0)
939 typ = pattern.typ
940 if typ == Id.Op_LParen:
941 # pat_expr or pat_else
942 pattern = pnode.GetChild(1)
943 typ = pattern.typ
944
945 if typ == grammar_nt.pat_else:
946 return pat.Else
947
948 if typ == grammar_nt.pat_exprs:
949 exprs = [] # type: List[expr_t]
950 for i in xrange(pattern.NumChildren()):
951 child = pattern.GetChild(i)
952 if child.typ == grammar_nt.expr:
953 expr = self.Expr(child)
954 exprs.append(expr)
955 return pat.YshExprs(exprs)
956
957 if typ == grammar_nt.eggex:
958 return self._Eggex(pattern)
959
960 raise AssertionError()
961
962 def _BlockArg(self, p_node):
963 # type: (PNode) -> expr_t
964
965 n = p_node.NumChildren()
966 if n == 1:
967 child = p_node.GetChild(0)
968 return self.Expr(child)
969
970 # It can only be an expression, not a=42, or ...expr
971 p_die('Invalid block expression argument', p_node.tok)
972
    def _Argument(self, p_node, after_semi, arglist):
        # type: (PNode, bool, ArgList) -> None
        """Transform one call argument, appending it to arglist in place.

        argument: (
          test [comp_for]
        | test '=' test  # named arg
        | '...' test  # var args
        )

        after_semi: True when in the group after ';', where only named args
        (and named spreads) are allowed.
        """
        pos_args = arglist.pos_args
        named_args = arglist.named_args

        assert p_node.typ == grammar_nt.argument, p_node
        n = p_node.NumChildren()
        if n == 1:
            # Plain positional arg, e.g. f(x)
            child = p_node.GetChild(0)
            if after_semi:
                p_die(POS_ARG_MISPLACED, child.tok)
            arg = self.Expr(child)
            pos_args.append(arg)
            return

        if n == 2:
            # Note: We allow multiple spreads, just like Julia. They are
            # concatenated as in lists and dicts.
            tok0 = p_node.GetChild(0).tok
            if tok0.id == Id.Expr_Ellipsis:
                spread_expr = expr.Spread(tok0, self.Expr(p_node.GetChild(1)))
                if after_semi:  # f(; ... named)
                    named_args.append(NamedArg(None, spread_expr))
                else:  # f(...named)
                    pos_args.append(spread_expr)
                return

            # Note: generator expression not implemented
            if p_node.GetChild(1).typ == grammar_nt.comp_for:
                child = p_node.GetChild(0)
                if after_semi:
                    p_die(POS_ARG_MISPLACED, child.tok)

                elt = self.Expr(child)
                comp = self._CompFor(p_node.GetChild(1))
                arg = expr.GeneratorExp(elt, [comp])
                pos_args.append(arg)
                return

            raise AssertionError()

        if n == 3:  # named args can come before or after the semicolon
            n1 = NamedArg(
                p_node.GetChild(0).tok, self.Expr(p_node.GetChild(2)))
            named_args.append(n1)
            return

        raise AssertionError()
1028
1029 def _ArgGroup(self, p_node, after_semi, arglist):
1030 # type: (PNode, bool, ArgList) -> None
1031 """
1032 arg_group: argument (',' argument)* [',']
1033 """
1034 for i in xrange(p_node.NumChildren()):
1035 p_child = p_node.GetChild(i)
1036 if p_child.typ == grammar_nt.argument:
1037 self._Argument(p_child, after_semi, arglist)
1038
1039 def _ArgList(self, p_node, arglist):
1040 # type: (PNode, ArgList) -> None
1041 """For both funcs and procs
1042
1043 arglist: (
1044 [arg_group]
1045 [';' [arg_group]]
1046 )
1047
1048 arglist3: ...
1049 """
1050 n = p_node.NumChildren()
1051 if n == 0:
1052 return
1053
1054 i = 0
1055
1056 if i >= n:
1057 return
1058 child = p_node.GetChild(i)
1059 if child.typ == grammar_nt.arg_group:
1060 self._ArgGroup(child, False, arglist)
1061 i += 1
1062
1063 if i >= n:
1064 return
1065 child = p_node.GetChild(i)
1066 if child.typ == Id.Op_Semi:
1067 arglist.semi_tok = child.tok
1068 i += 1
1069
1070 # Named args after first semi-colon
1071 if i >= n:
1072 return
1073 child = p_node.GetChild(i)
1074 if child.typ == grammar_nt.arg_group:
1075 self._ArgGroup(child, True, arglist)
1076 i += 1
1077
1078 #
1079 # Special third group may have block expression - only for arglist3,
1080 # used for procs!
1081 #
1082
1083 if i >= n:
1084 return
1085 assert p_node.typ == grammar_nt.arglist3, p_node
1086
1087 child = p_node.GetChild(i)
1088 if child.typ == Id.Op_Semi:
1089 arglist.semi_tok2 = child.tok
1090 i += 1
1091
1092 if i >= n:
1093 return
1094 child = p_node.GetChild(i)
1095 if child.typ == grammar_nt.argument:
1096 arglist.block_expr = self._BlockArg(child)
1097 i += 1
1098
1099 def ProcCallArgs(self, pnode, arglist):
1100 # type: (PNode, ArgList) -> None
1101 """
1102 ysh_eager_arglist: '(' [arglist3] ')'
1103 ysh_lazy_arglist: '[' [arglist] ']'
1104 """
1105 n = pnode.NumChildren()
1106 if n == 2: # f()
1107 return
1108
1109 if n == 3:
1110 child1 = pnode.GetChild(1) # the X in '( X )'
1111
1112 self._ArgList(child1, arglist)
1113 return
1114
1115 raise AssertionError()
1116
1117 def _TypeExpr(self, pnode):
1118 # type: (PNode) -> TypeExpr
1119 """
1120 type_expr: Expr_Name [ '[' type_expr (',' type_expr)* ']' ]
1121 """
1122 assert pnode.typ == grammar_nt.type_expr, pnode.typ
1123
1124 ty = TypeExpr.CreateNull() # don't allocate children
1125
1126 ty.tok = pnode.GetChild(0).tok
1127 ty.name = lexer.TokenVal(ty.tok)
1128
1129 n = pnode.NumChildren()
1130 if n == 1:
1131 return ty
1132
1133 ty.params = []
1134 i = 2
1135 while i < n:
1136 p = self._TypeExpr(pnode.GetChild(i))
1137 ty.params.append(p)
1138 i += 2 # skip comma
1139
1140 return ty
1141
1142 def _Param(self, pnode):
1143 # type: (PNode) -> Param
1144 """
1145 param: Expr_Name [type_expr] ['=' expr]
1146 """
1147 assert pnode.typ == grammar_nt.param
1148
1149 name_tok = pnode.GetChild(0).tok
1150 n = pnode.NumChildren()
1151
1152 assert name_tok.id == Id.Expr_Name, name_tok
1153
1154 default_val = None # type: expr_t
1155 type_ = None # type: TypeExpr
1156
1157 if n == 1:
1158 # proc p(a)
1159 pass
1160
1161 elif n == 2:
1162 # proc p(a Int)
1163 type_ = self._TypeExpr(pnode.GetChild(1))
1164
1165 elif n == 3:
1166 # proc p(a = 3)
1167 default_val = self.Expr(pnode.GetChild(2))
1168
1169 elif n == 4:
1170 # proc p(a Int = 3)
1171 type_ = self._TypeExpr(pnode.GetChild(1))
1172 default_val = self.Expr(pnode.GetChild(3))
1173
1174 return Param(name_tok, lexer.TokenVal(name_tok), type_, default_val)
1175
1176 def _ParamGroup(self, p_node):
1177 # type: (PNode) -> ParamGroup
1178 """
1179 param_group:
1180 (param ',')*
1181 [ (param | '...' Expr_Name) [,] ]
1182 """
1183 assert p_node.typ == grammar_nt.param_group, p_node
1184
1185 params = [] # type: List[Param]
1186 rest_of = None # type: Optional[RestParam]
1187
1188 n = p_node.NumChildren()
1189 i = 0
1190 while i < n:
1191 child = p_node.GetChild(i)
1192 if child.typ == grammar_nt.param:
1193 params.append(self._Param(child))
1194
1195 elif child.typ == Id.Expr_Ellipsis:
1196 tok = p_node.GetChild(i + 1).tok
1197 rest_of = RestParam(tok, lexer.TokenVal(tok))
1198
1199 i += 2
1200
1201 return ParamGroup(params, rest_of)
1202
1203 def Proc(self, p_node):
1204 # type: (PNode) -> proc_sig_t
1205 """
1206 ysh_proc: (
1207 [ '('
1208 [ param_group ] # word params, with defaults
1209 [ ';' [ param_group ] ] # positional typed params, with defaults
1210 [ ';' [ param_group ] ] # named params, with defaults
1211 [ ';' Expr_Name ] # optional block param, with no type or default
1212 ')'
1213 ]
1214 '{' # opening { for pgen2
1215 )
1216 """
1217 typ = p_node.typ
1218 assert typ == grammar_nt.ysh_proc
1219
1220 n = p_node.NumChildren()
1221 if n == 1: # proc f {
1222 return proc_sig.Open
1223
1224 if n == 3: # proc f () {
1225 sig = proc_sig.Closed.CreateNull(alloc_lists=True) # no params
1226
1227 # proc f( three param groups, and block group )
1228 sig = proc_sig.Closed.CreateNull(alloc_lists=True) # no params
1229
1230 # Word args
1231 i = 1
1232 child = p_node.GetChild(i)
1233 if child.typ == grammar_nt.param_group:
1234 sig.word = self._ParamGroup(p_node.GetChild(i))
1235
1236 # Validate word args
1237 for word in sig.word.params:
1238 if word.type:
1239 if word.type.name not in ('Str', 'Ref'):
1240 p_die('Word params may only have type Str or Ref',
1241 word.type.tok)
1242 if word.type.params is not None:
1243 p_die('Unexpected type parameters', word.type.tok)
1244
1245 i += 2
1246 else:
1247 i += 1
1248
1249 #log('i %d n %d', i, n)
1250 if i >= n:
1251 return sig
1252
1253 # Positional args
1254 child = p_node.GetChild(i)
1255 if child.typ == grammar_nt.param_group:
1256 sig.positional = self._ParamGroup(p_node.GetChild(i))
1257 i += 2
1258 else:
1259 i += 1
1260
1261 #log('i %d n %d', i, n)
1262 if i >= n:
1263 return sig
1264
1265 # Keyword args
1266 child = p_node.GetChild(i)
1267 if child.typ == grammar_nt.param_group:
1268 sig.named = self._ParamGroup(p_node.GetChild(i))
1269 i += 2
1270 else:
1271 i += 1
1272
1273 #log('i %d n %d', i, n)
1274 if i >= n:
1275 return sig
1276
1277 child = p_node.GetChild(i)
1278 if child.typ == grammar_nt.param_group:
1279 group = self._ParamGroup(p_node.GetChild(i))
1280 params = group.params
1281 if len(params) > 1:
1282 p_die('Only 1 block param is allowed', params[1].blame_tok)
1283 if group.rest_of:
1284 p_die("Rest param isn't allowed for blocks",
1285 group.rest_of.blame_tok)
1286
1287 if len(params) == 1:
1288 if params[0].type:
1289 if params[0].type.name != 'Command':
1290 p_die('Block param must have type Command',
1291 params[0].type.tok)
1292 if params[0].type.params is not None:
1293 p_die('Unexpected type parameters', params[0].type.tok)
1294
1295 sig.block_param = params[0]
1296
1297 return sig
1298
1299 def YshFunc(self, p_node, out):
1300 # type: (PNode, Func) -> None
1301 """
1302 ysh_func: Expr_Name '(' [param_group] [';' param_group] ')'
1303 """
1304 assert p_node.typ == grammar_nt.ysh_func
1305
1306 #self.p_printer.Print(p_node)
1307
1308 out.name = p_node.GetChild(0).tok
1309
1310 n = p_node.NumChildren()
1311 i = 2 # after (
1312
1313 child = p_node.GetChild(i)
1314 if child.typ == grammar_nt.param_group:
1315 out.positional = self._ParamGroup(child)
1316 i += 2 # skip past ;
1317 else:
1318 i += 1
1319
1320 if i >= n:
1321 return
1322
1323 child = p_node.GetChild(i)
1324 if child.typ == grammar_nt.param_group:
1325 out.named = self._ParamGroup(child)
1326
1327 #
1328 # Eggex Language
1329 #
1330
1331 def _RangeCharSingleQuoted(self, p_node):
1332 # type: (PNode) -> Optional[CharCode]
1333
1334 assert p_node.typ == grammar_nt.range_char, p_node
1335
1336 # 'a' in 'a'-'b'
1337
1338 child0 = p_node.GetChild(0)
1339 if child0.typ == grammar_nt.sq_string:
1340 sq_part = cast(SingleQuoted, child0.GetChild(1).tok)
1341 n = len(sq_part.sval)
1342 if n == 0:
1343 p_die("Quoted range char can't be empty",
1344 loc.WordPart(sq_part))
1345 elif n == 1:
1346 return CharCode(sq_part.left, ord(sq_part.sval[0]), False)
1347 else:
1348 p_die(RANGE_POINT_TOO_LONG, loc.WordPart(sq_part))
1349 return None
1350
1351 def _OtherRangeToken(self, p_node):
1352 # type: (PNode) -> Token
1353 """An endpoint of a range (single char)
1354
1355 range_char: Expr_Name | Expr_DecInt | sq_string | char_literal
1356 a-z 0-9 'a'-'z' \x00-\xff
1357 """
1358 assert p_node.typ == grammar_nt.range_char, p_node
1359
1360 child0 = p_node.GetChild(0)
1361 if child0.typ == grammar_nt.char_literal:
1362 # \x00 in /[\x00 - \x20]/
1363 tok = child0.GetChild(0).tok
1364 return tok
1365
1366 tok = p_node.tok
1367 # a in a-z is Expr_Name
1368 # 0 in 0-9 is Expr_DecInt
1369 assert tok.id in (Id.Expr_Name, Id.Expr_DecInt), tok
1370
1371 if tok.length != 1:
1372 p_die(RANGE_POINT_TOO_LONG, tok)
1373 return tok
1374
1375 def _NonRangeChars(self, p_node):
1376 # type: (PNode) -> class_literal_term_t
1377 """
1378 \" \u1234 '#'
1379 """
1380 assert p_node.typ == grammar_nt.range_char, p_node
1381
1382 child0 = p_node.GetChild(0)
1383 typ0 = p_node.GetChild(0).typ
1384
1385 if typ0 == grammar_nt.sq_string:
1386 return cast(SingleQuoted, child0.GetChild(1).tok)
1387
1388 if typ0 == grammar_nt.char_literal:
1389 return word_compile.EvalCharLiteralForRegex(child0.tok)
1390
1391 if typ0 == Id.Expr_Name:
1392 # Look up PerlClass and PosixClass
1393 return self._NameInClass(None, child0.tok)
1394
1395 raise AssertionError()
1396
    def _ClassLiteralTerm(self, p_node):
        # type: (PNode) -> class_literal_term_t
        """Parse one term inside a class literal [...].

        class_literal_term:
            range_char ['-' range_char ]
          | '@' Expr_Name  # splice
          | '!' Expr_Name  # negate char class
          ...
        """
        assert p_node.typ == grammar_nt.class_literal_term, p_node

        typ0 = p_node.GetChild(0).typ

        if typ0 == grammar_nt.range_char:
            n = p_node.NumChildren()

            if n == 1:
                # Single char / class name, e.g. \" or digit
                return self._NonRangeChars(p_node.GetChild(0))

            # 'a'-'z' etc.
            if n == 3:
                assert p_node.GetChild(1).typ == Id.Arith_Minus, p_node

                left = p_node.GetChild(0)
                right = p_node.GetChild(2)

                # Each endpoint is either a single-quoted char (handled by
                # _RangeCharSingleQuoted, returns None otherwise) or a bare
                # name / digit / char literal token.
                code1 = self._RangeCharSingleQuoted(left)
                if code1 is None:
                    tok1 = self._OtherRangeToken(left)
                    code1 = word_compile.EvalCharLiteralForRegex(tok1)

                code2 = self._RangeCharSingleQuoted(right)
                if code2 is None:
                    tok2 = self._OtherRangeToken(right)
                    code2 = word_compile.EvalCharLiteralForRegex(tok2)
                return CharRange(code1, code2)

            raise AssertionError()

        if typ0 == Id.Expr_At:
            # @name splices in a previously defined set of terms
            tok1 = p_node.GetChild(1).tok
            return class_literal_term.Splice(tok1, lexer.TokenVal(tok1))

        if typ0 == Id.Expr_Bang:
            # !digit etc. -- negated class name
            return self._NameInClass(
                p_node.GetChild(0).tok,
                p_node.GetChild(1).tok)

        p_die("This kind of class literal term isn't implemented",
              p_node.GetChild(0).tok)
1447
1448 def _ClassLiteral(self, p_node):
1449 # type: (PNode) -> List[class_literal_term_t]
1450 """class_literal: '[' class_literal_term+ ']'."""
1451 assert p_node.typ == grammar_nt.class_literal
1452 # skip [ and ]
1453 terms = [] # type: List[class_literal_term_t]
1454 for i in xrange(1, p_node.NumChildren() - 1):
1455 terms.append(self._ClassLiteralTerm(p_node.GetChild(i)))
1456
1457 return terms
1458
1459 def _NameInRegex(self, negated_tok, tok):
1460 # type: (Token, Token) -> re_t
1461 tok_str = lexer.TokenVal(tok)
1462 if tok_str == 'dot':
1463 if negated_tok:
1464 p_die("Can't negate this symbol", tok)
1465 return re.Primitive(tok, Id.Eggex_Dot)
1466
1467 if tok_str in POSIX_CLASSES:
1468 return PosixClass(negated_tok, tok_str)
1469
1470 perl = PERL_CLASSES.get(tok_str)
1471 if perl is not None:
1472 return PerlClass(negated_tok, perl)
1473
1474 if tok_str[0].isupper(): # e.g. HexDigit
1475 return re.Splice(tok, lexer.TokenVal(tok))
1476
1477 p_die("%r isn't a character class" % tok_str, tok)
1478
1479 def _NameInClass(self, negated_tok, tok):
1480 # type: (Token, Token) -> class_literal_term_t
1481 """Like the above, but 'dot' and 'd' don't mean anything within []"""
1482 tok_str = lexer.TokenVal(tok)
1483
1484 # A bare, unquoted character literal. In the grammar, this is expressed as
1485 # range_char without an ending.
1486
1487 # d is NOT 'digit', it's a literal 'd'!
1488 if len(tok_str) == 1:
1489 # Expr_Name matches VAR_NAME_RE, which starts with [a-zA-Z_]
1490 assert tok.id in (Id.Expr_Name, Id.Expr_DecInt)
1491
1492 if negated_tok: # [~d] is not allowed, only [~digit]
1493 p_die("Can't negate this symbol", tok)
1494 return word_compile.EvalCharLiteralForRegex(tok)
1495
1496 # digit, word, but not d, w, etc.
1497 if tok_str in POSIX_CLASSES:
1498 return PosixClass(negated_tok, tok_str)
1499
1500 perl = PERL_CLASSES.get(tok_str)
1501 if perl is not None:
1502 return PerlClass(negated_tok, perl)
1503 p_die("%r isn't a character class" % tok_str, tok)
1504
    def _ReAtom(self, p_atom):
        # type: (PNode) -> re_t
        """Parse a single regex atom: literal, class, primitive, group, capture.

        re_atom: ( char_literal | ...
        """
        assert p_atom.typ == grammar_nt.re_atom, p_atom.typ

        child0 = p_atom.GetChild(0)

        typ0 = p_atom.GetChild(0).typ
        tok0 = p_atom.GetChild(0).tok

        # Non-terminals

        if typ0 == grammar_nt.class_literal:
            # [a-z], not negated
            return re.CharClassLiteral(False, self._ClassLiteral(child0))

        if typ0 == grammar_nt.sq_string:
            # The SingleQuoted part is stored in the token slot of child 1
            return cast(SingleQuoted, child0.GetChild(1).tok)

        if typ0 == grammar_nt.char_literal:
            # Note: ERE doesn't seem to support escapes like Python
            # https://docs.python.org/3/library/re.html
            # We might want to do a translation like this;
            #
            # \u{03bc} -> \u03bc
            # \x00 -> \x00
            # \n -> \n

            # Must be Id.Char_{OneChar,Hex,UBraced}
            assert consts.GetKind(tok0.id) == Kind.Char
            s = word_compile.EvalCStringToken(tok0.id, lexer.TokenVal(tok0))
            return re.LiteralChars(tok0, s)

        # Special punctuation
        if typ0 == Id.Expr_Dot:  # .
            return re.Primitive(tok0, Id.Eggex_Dot)

        if typ0 == Id.Arith_Caret:  # ^
            return re.Primitive(tok0, Id.Eggex_Start)

        if typ0 == Id.Expr_Dollar:  # $
            return re.Primitive(tok0, Id.Eggex_End)

        if typ0 == Id.Expr_Name:
            # d digit -> PosixClass PerlClass etc.
            return self._NameInRegex(None, tok0)

        if typ0 == Id.Expr_Symbol:
            # Validate symbols here, like we validate PerlClass, etc.
            tok_str = lexer.TokenVal(tok0)
            if tok_str == '%start':
                return re.Primitive(tok0, Id.Eggex_Start)
            if tok_str == '%end':
                return re.Primitive(tok0, Id.Eggex_End)
            p_die("Unexpected token %r in regex" % tok_str, tok0)

        if typ0 == Id.Expr_At:
            # | '@' Expr_Name -- splice in a previously defined Eggex
            tok1 = p_atom.GetChild(1).tok
            return re.Splice(tok0, lexer.TokenVal(tok1))

        if typ0 == Id.Expr_Bang:
            # | '!' (Expr_Name | class_literal)
            # | '!' '!' Expr_Name (Expr_Name | Expr_DecInt | '(' regex ')')
            n = p_atom.NumChildren()
            if n == 2:
                child1 = p_atom.GetChild(1)
                if child1.typ == grammar_nt.class_literal:
                    # ![a-z] -- negated class literal
                    return re.CharClassLiteral(True,
                                               self._ClassLiteral(child1))
                else:
                    # !digit -- negated class name
                    return self._NameInRegex(tok0, p_atom.GetChild(1).tok)
            else:
                # Note: !! conflicts with shell history
                p_die(
                    "Backtracking with !! isn't implemented (requires Python/PCRE)",
                    p_atom.GetChild(1).tok)

        if typ0 == Id.Op_LParen:
            # | '(' regex ')'

            # Note: in ERE (d+) is the same as <d+>. That is, Group becomes
            # Capture.
            return re.Group(self._Regex(p_atom.GetChild(1)))

        if typ0 == Id.Arith_Less:
            # | '<' 'capture' regex ['as' Expr_Name] [':' Expr_Name] '>'

            # 4, 6, or 8 children depending on which optional parts appear
            n = p_atom.NumChildren()
            assert n == 4 or n == 6 or n == 8, n

            # < capture d+ >
            regex = self._Regex(p_atom.GetChild(2))

            as_name = None  # type: Optional[Token]
            func_name = None  # type: Optional[Token]

            i = 3  # points at any of > as :

            typ = p_atom.GetChild(i).typ
            if typ == Id.Expr_As:
                as_name = p_atom.GetChild(i + 1).tok
                i += 2

            typ = p_atom.GetChild(i).typ
            if typ == Id.Arith_Colon:
                func_name = p_atom.GetChild(i + 1).tok

            return re.Capture(regex, as_name, func_name)

        raise AssertionError(typ0)
1617
    def _RepeatOp(self, p_repeat):
        # type: (PNode) -> re_repeat_t
        """Parse the repetition operator that follows a regex atom.

        repeat_op: '+' | '*' | '?'
                 | '{' [Expr_Name] ('+' | '*' | '?' | repeat_range) '}'
        """
        assert p_repeat.typ == grammar_nt.repeat_op, p_repeat

        tok = p_repeat.GetChild(0).tok
        id_ = tok.id

        if id_ in (Id.Arith_Plus, Id.Arith_Star, Id.Arith_QMark):
            return tok  # a+ a* a?

        if id_ == Id.Op_LBrace:
            child1 = p_repeat.GetChild(1)
            if child1.typ != grammar_nt.repeat_range:
                # e.g. dot{N *} is .*?
                p_die("Perl-style repetition isn't implemented with libc",
                      child1.tok)

            # repeat_range: (
            #     Expr_DecInt [',']
            #   | ',' Expr_DecInt
            #   | Expr_DecInt ',' Expr_DecInt
            # )

            n = child1.NumChildren()
            if n == 1:  # {3}
                tok = child1.GetChild(0).tok
                return tok  # different operator than + * ?

            if n == 2:
                # NOTE: earlier comments here had {3,} and {,3} swapped.
                if child1.GetChild(0).typ == Id.Expr_DecInt:  # {3,}
                    # lower bound only
                    left = child1.GetChild(0).tok
                    return re_repeat.Range(left, lexer.TokenVal(left), '',
                                           None)
                else:  # {,3}
                    # upper bound only
                    right = child1.GetChild(1).tok
                    return re_repeat.Range(None, '', lexer.TokenVal(right),
                                           right)

            if n == 3:  # {1,3}
                left = child1.GetChild(0).tok
                right = child1.GetChild(2).tok
                return re_repeat.Range(left, lexer.TokenVal(left),
                                       lexer.TokenVal(right), right)

            raise AssertionError(n)

        raise AssertionError(id_)
1669
1670 def _ReAlt(self, p_node):
1671 # type: (PNode) -> re_t
1672 """
1673 re_alt: (re_atom [repeat_op])+
1674 """
1675 assert p_node.typ == grammar_nt.re_alt
1676
1677 i = 0
1678 n = p_node.NumChildren()
1679 seq = [] # type: List[re_t]
1680 while i < n:
1681 r = self._ReAtom(p_node.GetChild(i))
1682 i += 1
1683 if i < n and p_node.GetChild(i).typ == grammar_nt.repeat_op:
1684 repeat_op = self._RepeatOp(p_node.GetChild(i))
1685 r = re.Repeat(r, repeat_op)
1686 i += 1
1687 seq.append(r)
1688
1689 if len(seq) == 1:
1690 return seq[0]
1691 else:
1692 return re.Seq(seq)
1693
1694 def _Regex(self, p_node):
1695 # type: (PNode) -> re_t
1696 """
1697 regex: [re_alt] (('|'|'or') re_alt)*
1698 """
1699 assert p_node.typ == grammar_nt.regex
1700
1701 n = p_node.NumChildren()
1702 alts = [] # type: List[re_t]
1703 for i in xrange(0, n, 2): # was children[::2]
1704 c = p_node.GetChild(i)
1705 alts.append(self._ReAlt(c))
1706
1707 if len(alts) == 1:
1708 return alts[0]
1709 else:
1710 return re.Alt(alts)
1711
1712
1713# vim: sw=4