OILS / ysh / expr_to_ast.py

1723 lines, 1037 significant
1"""expr_to_ast.py."""
2from __future__ import print_function
3
4from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str, Kind
5from _devbuild.gen.syntax_asdl import (
6 Token,
7 SimpleVarSub,
8 loc,
9 loc_t,
10 DoubleQuoted,
11 SingleQuoted,
12 BracedVarSub,
13 CommandSub,
14 ShArrayLiteral,
15 command,
16 expr,
17 expr_e,
18 expr_t,
19 expr_context_e,
20 re,
21 re_t,
22 re_repeat,
23 re_repeat_t,
24 class_literal_term,
25 class_literal_term_t,
26 PosixClass,
27 PerlClass,
28 NameType,
29 y_lhs_t,
30 Comprehension,
31 Subscript,
32 Attribute,
33 proc_sig,
34 proc_sig_t,
35 Param,
36 RestParam,
37 ParamGroup,
38 NamedArg,
39 ArgList,
40 pat,
41 pat_t,
42 TypeExpr,
43 Func,
44 Eggex,
45 EggexFlag,
46 CharCode,
47 CharRange,
48)
49from _devbuild.gen.value_asdl import value, value_t
50from _devbuild.gen import grammar_nt
51from core.error import p_die
52from data_lang import j8
53from frontend import consts
54from frontend import lexer
55from frontend import location
56from mycpp import mops
57from mycpp import mylib
58from mycpp.mylib import log, tagswitch
59from osh import word_compile
60from ysh import expr_parse
61from ysh import regex_translate
62
63from typing import TYPE_CHECKING, Dict, List, Tuple, Optional, cast
64if TYPE_CHECKING:
65 from pgen2.grammar import Grammar
66 from pgen2.pnode import PNode
67
68_ = log
69
70PERL_CLASSES = {
71 'd': 'd',
72 'w': 'w',
73 'word': 'w',
74 's': 's',
75}
76# https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html
77POSIX_CLASSES = [
78 'alnum',
79 'cntrl',
80 'lower',
81 'space',
82 'alpha',
83 'digit',
84 'print',
85 'upper',
86 'blank',
87 'graph',
88 'punct',
89 'xdigit',
90]
91# NOTE: There are also things like \p{Greek} that we could put in the
92# "non-sigil" namespace.
93
94RANGE_POINT_TOO_LONG = "Range start/end shouldn't have more than one character"
95
96POS_ARG_MISPLACED = "Positional arg can't appear in group of named args"
97
98# Copied from pgen2/token.py to avoid dependency.
99NT_OFFSET = 256
100
101if mylib.PYTHON:
102
103 def MakeGrammarNames(ysh_grammar):
104 # type: (Grammar) -> Dict[int, str]
105
106 # TODO: Break this dependency
107 from frontend import lexer_def
108
109 names = {}
110
111 for id_name, k in lexer_def.ID_SPEC.id_str2int.items():
112 # Hm some are out of range
113 #assert k < 256, (k, id_name)
114
115 # TODO: Some tokens have values greater than NT_OFFSET
116 if k < NT_OFFSET:
117 names[k] = id_name
118
119 for k, v in ysh_grammar.number2symbol.items():
120 assert k >= NT_OFFSET, (k, v)
121 names[k] = v
122
123 return names
124
125
126class Transformer(object):
127 """Homogeneous parse tree -> heterogeneous AST ("lossless syntax tree")
128
129 pgen2 (Python's LL parser generator) doesn't have semantic actions like yacc,
130 so this "transformer" is the equivalent.
131
132 Files to refer to when modifying this class:
133
134 ysh/grammar.pgen2 (generates _devbuild/gen/grammar_nt.py)
135 frontend/syntax.asdl (generates _devbuild/gen/syntax_asdl.py)
136
137 Related examples:
138
139 opy/compiler2/transformer.py (Python's parse tree -> AST, ~1500 lines)
140 Python-2.7.13/Python/ast.c (the "real" CPython version, ~3600 lines)
141
142 Other:
143 frontend/parse_lib.py (turn on print_parse_tree)
144
145 Public methods:
146 Expr, MakeVarDecl, MakeMutation, Proc, etc.
147 atom, trailer, etc. are private, named after productions in grammar.pgen2.
148 """
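# Rough usage sketch (caller names like ysh_grammar are illustrative, not from
# this file): the expression parser builds a pgen2 PNode tree, then hands its
# root to this class:
#
#   tr = Transformer(ysh_grammar)
#   node = tr.Expr(pnode)             # PNode -> expr_t
#   decl = tr.MakeVarDecl(pnode)      # PNode -> command.VarDecl
#
# Every method below dispatches on pnode.typ, which is either a token Id
# (terminal) or a grammar_nt.* integer (nonterminal), and recurses via
# pnode.GetChild(i).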
149
150 def __init__(self, gr):
151 # type: (Grammar) -> None
152 self.number2symbol = gr.number2symbol
153 if mylib.PYTHON:
154 names = MakeGrammarNames(gr)
155 # print raw nodes
156 self.p_printer = expr_parse.ParseTreePrinter(names)
157
158 def _LeftAssoc(self, p_node):
159 # type: (PNode) -> expr_t
160 """For a left-associative binary operation.
161
162 Examples:
163 xor_expr: and_expr ('xor' and_expr)*
164 term: factor (('*'|'/'|'%'|'div') factor)*
165
166 3 - 1 - 2 must be grouped as ((3 - 1) - 2).
167 """
168 # Note: Compare the iterative com_binary() method in
169 # opy/compiler2/transformer.py.
170
171 # Examples:
172 # - The PNode for '3 - 1' will have 3 children
173 # - The PNode for '3 - 1 - 2' will have 5 children
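# Sketch of the fold for '3 - 1 - 2' (children roughly: 3, '-', 1, '-', 2):
#   left = Const(3)
#   left = expr.Binary('-', left, Const(1))   # after the first iteration
#   left = expr.Binary('-', left, Const(2))   # => ((3 - 1) - 2)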
174
175 #self.p_printer.Print(p_node)
176
177 i = 1 # index of the operator
178 n = p_node.NumChildren()
179
180 left = self.Expr(p_node.GetChild(0))
181 while i < n:
182 op = p_node.GetChild(i)
183 right = self.Expr(p_node.GetChild(i + 1))
184
185 # create a new left node
186 left = expr.Binary(op.tok, left, right)
187 i += 2
188
189 return left
190
191 def _Trailer(self, base, p_trailer):
192 # type: (expr_t, PNode) -> expr_t
193 """
194 trailer: ( '(' [arglist] ')' | '[' subscriptlist ']'
195 | '.' NAME | '->' NAME | '::' NAME
196 )
197 """
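# Roughly, for a base expression b:
#   b(x, y)    -> expr.FuncCall(b, ArgList(...))
#   b[i]       -> Subscript('[', b, i)
#   b[1:2, :]  -> Subscript with an expr.Tuple of Slices
#   b.name / b->name / b::name -> Attribute(b, op, name, ...)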
198 tok0 = p_trailer.GetChild(0).tok
199 typ0 = p_trailer.GetChild(0).typ
200
201 if typ0 == Id.Op_LParen:
202 lparen = tok0
203 rparen = p_trailer.GetChild(-1).tok
204 arglist = ArgList(lparen, [], None, [], None, None, rparen)
205 if p_trailer.NumChildren() == 2: # ()
206 return expr.FuncCall(base, arglist)
207
208 p = p_trailer.GetChild(1) # the X in ( X )
209 assert p.typ == grammar_nt.arglist # f(x, y)
210 self._ArgList(p, arglist)
211 return expr.FuncCall(base, arglist)
212
213 if typ0 == Id.Op_LBracket:
214 p_args = p_trailer.GetChild(1)
215 assert p_args.typ == grammar_nt.subscriptlist
216
217 n = p_args.NumChildren()
218 if n == 1: # a[1] a[1:2] a[:] etc.
219 subscript = self._Subscript(p_args.GetChild(0))
220 else: # a[1, 2] a[1:2, :]
221 slices = []
222 for i in xrange(0, n, 2):
223 slices.append(self._Subscript(p_args.GetChild(i)))
224 # expr.Tuple evaluates to List in YSH.
225 #
226 # Note that syntactically, a[1:2, 3:4] is the only way to
227 # get a List[Slice]. [1:2, 3:4] by itself is not allowed.
228 comma_tok = p_args.GetChild(1).tok
229 subscript = expr.Tuple(comma_tok, slices, expr_context_e.Store)
230
231 return Subscript(tok0, base, subscript)
232
233 if typ0 in (Id.Expr_Dot, Id.Expr_RArrow, Id.Expr_RDArrow):
234 attr = p_trailer.GetChild(1).tok # will be Id.Expr_Name
235 return Attribute(base, tok0, attr, lexer.TokenVal(attr),
236 expr_context_e.Store)
237
238 raise AssertionError(typ0)
239
240 def _DictPair(self, p_node):
241 # type: (PNode) -> Tuple[expr_t, expr_t]
242 """
243 dict_pair: ( Expr_Name [':' test]
244 | '[' testlist ']' ':' test
245 | sq_string ':' test
246 | dq_string ':' test )
247 """
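# Examples of the pair forms handled below (sketch):
#   {key: 42}     -> Const('key'), Expr(42)       # bare name becomes a Str key
#   {key}         -> Const('key'), expr.Implicit  # value omitted
#   {'k': 42}     -> sq_string key (dq_string works the same way)
#   {[x + y]: 42} -> computed key from the bracketed expression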
248 assert p_node.typ == grammar_nt.dict_pair
249
250 typ = p_node.GetChild(0).typ
251
252 if typ in (grammar_nt.sq_string, grammar_nt.dq_string):
253 key = self.Expr(p_node.GetChild(0)) # type: expr_t
254 val = self.Expr(p_node.GetChild(2))
255 return key, val
256
257 tok0 = p_node.GetChild(0).tok
258 id_ = tok0.id
259
260 if id_ == Id.Expr_Name:
261 key_str = value.Str(lexer.TokenVal(tok0))
262 key = expr.Const(tok0, key_str)
263 if p_node.NumChildren() >= 3:
264 val = self.Expr(p_node.GetChild(2))
265 else:
266 val = expr.Implicit
267
268 if id_ == Id.Op_LBracket: # {[x+y]: 'val'}
269 key = self.Expr(p_node.GetChild(1))
270 val = self.Expr(p_node.GetChild(4))
271 return key, val
272
273 return key, val
274
275 def _Dict(self, parent, p_node):
276 # type: (PNode, PNode) -> expr.Dict
277 """
278 dict: dict_pair (comma_newline dict_pair)* [comma_newline]
279 """
280 if p_node.typ == Id.Op_RBrace: # {}
281 return expr.Dict(parent.tok, [], [])
282
283 assert p_node.typ == grammar_nt.dict
284
285 keys = [] # type: List[expr_t]
286 values = [] # type: List[expr_t]
287
288 n = p_node.NumChildren()
289 for i in xrange(0, n, 2):
290 key, val = self._DictPair(p_node.GetChild(i))
291 keys.append(key)
292 values.append(val)
293
294 return expr.Dict(parent.tok, keys, values)
295
296 def _Tuple(self, parent):
297 # type: (PNode) -> expr_t
298
299 n = parent.NumChildren()
300
301 # (x) -- not a tuple
302 if n == 1:
303 return self.Expr(parent.GetChild(0))
304
305 # x, and (x,) aren't allowed
306 if n == 2:
307 p_die('Invalid trailing comma', parent.GetChild(1).tok)
308
309 elts = [] # type: List[expr_t]
310 for i in xrange(0, n, 2): # skip commas
311 p_node = parent.GetChild(i)
312 elts.append(self.Expr(p_node))
313
314 return expr.Tuple(parent.tok, elts,
315 expr_context_e.Store) # unused expr_context_e
316
317 def _TestlistComp(self, parent, p_node, id0):
318 # type: (PNode, PNode, Id_t) -> expr_t
319 """
320 testlist_comp:
321 (test|star_expr) ( comp_for | (',' (test|star_expr))* [','] )
322 """
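# Sketch of the cases, keyed on the opening bracket id0:
#   (x+1 for x in y) -> expr.GeneratorExp
#   [x+1 for x in y] -> expr.ListComp
#   (x)              -> just the inner expression
#   (1, 2)           -> expr.Tuple (via _Tuple)
#   [1, 2, 3]        -> expr.List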
323 assert p_node.typ == grammar_nt.testlist_comp
324
325 n = p_node.NumChildren()
326 if n > 1 and p_node.GetChild(1).typ == grammar_nt.comp_for:
327 elt = self.Expr(p_node.GetChild(0))
328 comp = self._CompFor(p_node.GetChild(1))
329 if id0 == Id.Op_LParen: # (x+1 for x in y)
330 return expr.GeneratorExp(elt, [comp])
331 if id0 == Id.Op_LBracket: # [x+1 for x in y]
332 return expr.ListComp(parent.tok, elt, [comp])
333 raise AssertionError()
334
335 if id0 == Id.Op_LParen:
336 # Parenthesized expression like (x+1) or (x)
337 if n == 1:
338 return self.Expr(p_node.GetChild(0))
339
340 # Tuples (1,) (1, 2) etc. - TODO: should be a list literal?
341 if p_node.GetChild(1).typ == Id.Arith_Comma:
342 return self._Tuple(p_node)
343
344 raise AssertionError()
345
346 if id0 == Id.Op_LBracket: # List [1,2,3]
347 elts = [] # type: List[expr_t]
348 for i in xrange(0, n, 2): # skip commas
349 elts.append(self.Expr(p_node.GetChild(i)))
350
351 return expr.List(parent.tok, elts,
352 expr_context_e.Store) # unused expr_context_e
353
354 raise AssertionError(Id_str(id0))
355
356 def _Atom(self, parent):
357 # type: (PNode) -> expr_t
358 """Handle alternatives of 'atom' where there's more than one child."""
359
360 tok = parent.GetChild(0).tok
361 id_ = tok.id
362 n = parent.NumChildren()
363
364 if id_ == Id.Op_LParen:
365 # atom: '(' [yield_expr|testlist_comp] ')' | ...
366 if n == 2: # () is a tuple
367 assert (
368 parent.GetChild(1).typ == Id.Op_RParen), parent.GetChild(1)
369 return expr.Tuple(tok, [], expr_context_e.Store)
370
371 return self._TestlistComp(parent, parent.GetChild(1), id_)
372
373 if id_ == Id.Op_LBracket:
374 # atom: ... | '[' [testlist_comp] ']' | ...
375
376 if n == 2: # []
377 assert (parent.GetChild(1).typ == Id.Op_RBracket
378 ), parent.GetChild(1)
379 return expr.List(tok, [],
380 expr_context_e.Store) # unused expr_context_e
381
382 return self._TestlistComp(parent, parent.GetChild(1), id_)
383
384 if id_ == Id.Left_CaretBracket: # ^[42 + x]
385 child = self.Expr(parent.GetChild(1))
386 return expr.Literal(child)
387
388 if id_ == Id.Op_LBrace:
389 # atom: ... | '{' [Op_Newline] [dict] '}'
390 i = 1
391 if parent.GetChild(i).typ == Id.Op_Newline:
392 i += 1
393 return self._Dict(parent, parent.GetChild(i))
394
395 if id_ == Id.Arith_Amp:
396 n = parent.NumChildren()
397 if n >= 3:
398 p_die("Places in containers not implemented yet",
399 parent.GetChild(2).tok)
400
401 name_tok = parent.GetChild(1).tok
402 return expr.Place(name_tok, lexer.TokenVal(name_tok), [])
403
404 if id_ == Id.Expr_Func:
405 # STUB. This should really be a Func, not Lambda.
406 return expr.Lambda([], expr.Implicit)
407
408 # 100 M
409 # Ignoring the suffix for now
410 if id_ == Id.Expr_DecInt:
411 assert n > 1
412 p_die("Units suffix not implemented", parent.GetChild(1).tok)
413 #return self.Expr(parent.GetChild(0))
414
415 # 100.5 M
416 # Ignoring the suffix for now
417 if id_ == Id.Expr_Float:
418 assert n > 1
419 p_die("Units suffix not implemented", parent.GetChild(1).tok)
420 #return self.Expr(parent.GetChild(0))
421
422 raise AssertionError(Id_str(id_))
423
424 def _NameType(self, p_node):
425 # type: (PNode) -> NameType
426 """ name_type: Expr_Name [':'] [type_expr] """
427 name_tok = p_node.GetChild(0).tok
428 typ = None # type: Optional[TypeExpr]
429
430 n = p_node.NumChildren()
431 if n == 2:
432 typ = self._TypeExpr(p_node.GetChild(1))
433 if n == 3:
434 typ = self._TypeExpr(p_node.GetChild(2))
435
436 return NameType(name_tok, lexer.TokenVal(name_tok), typ)
437
438 def _NameTypeList(self, p_node):
439 # type: (PNode) -> List[NameType]
440 """ name_type_list: name_type (',' name_type)* """
441 assert p_node.typ == grammar_nt.name_type_list
442 results = [] # type: List[NameType]
443
444 n = p_node.NumChildren()
445 for i in xrange(0, n, 2): # was children[::2]
446 results.append(self._NameType(p_node.GetChild(i)))
447 return results
448
449 def _CompFor(self, p_node):
450 # type: (PNode) -> Comprehension
451 """comp_for: 'for' exprlist 'in' or_test ['if' or_test]"""
452 lhs = self._NameTypeList(p_node.GetChild(1))
453 iterable = self.Expr(p_node.GetChild(3))
454
455 if p_node.NumChildren() >= 6:
456 cond = self.Expr(p_node.GetChild(5))
457 else:
458 cond = None
459
460 return Comprehension(lhs, iterable, cond)
461
462 def _CompareChain(self, parent):
463 # type: (PNode) -> expr_t
464 """comparison: expr (comp_op expr)*"""
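# e.g. '1 < x <= 3' becomes, roughly:
#   expr.Compare(Const(1), cmp_ops=[<, <=], comparators=[Var(x), Const(3)])
# Two-token operators are collapsed by retyping the first token:
#   'not in' -> Id.Node_NotIn, 'is not' -> Id.Node_IsNot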
465 cmp_ops = [] # type: List[Token]
466 comparators = [] # type: List[expr_t]
467 left = self.Expr(parent.GetChild(0))
468
469 i = 1
470 n = parent.NumChildren()
471 while i < n:
472 p = parent.GetChild(i)
473 op = p.GetChild(0).tok
474 if p.NumChildren() == 2:
475 # Blame the first token, and change its type
476 if op.id == Id.Expr_Not: # not in
477 op.id = Id.Node_NotIn
478 elif op.id == Id.Expr_Is: # is not
479 op.id = Id.Node_IsNot
480 else:
481 raise AssertionError()
482 else:
483 # is, <, ==, etc.
484 pass
485
486 cmp_ops.append(op)
487 i += 1
488 comparators.append(self.Expr(parent.GetChild(i)))
489 i += 1
490 return expr.Compare(left, cmp_ops, comparators)
491
492 def _Subscript(self, parent):
493 # type: (PNode) -> expr_t
494 """subscript: expr | [expr] ':' [expr]"""
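# Sketch of how each form is handled below:
#   a[1]   -> the expr itself (no Slice)
#   a[1:2] -> expr.Slice(lower=1, op=':', upper=2)
#   a[1:]  -> expr.Slice(lower=1, op=':', upper=None)
#   a[:3]  -> expr.Slice(None, ':', 3)
#   a[:]   -> expr.Slice(None, ':', None)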
495 typ0 = parent.GetChild(0).typ
496
497 n = parent.NumChildren()
498
499 if typ0 == grammar_nt.expr:
500 if n == 3: # a[1:2]
501 lower = self.Expr(parent.GetChild(0))
502 op_tok = parent.GetChild(1).tok
503 upper = self.Expr(parent.GetChild(2))
504
505 elif n == 2: # a[1:]
506 lower = self.Expr(parent.GetChild(0))
507 op_tok = parent.GetChild(1).tok
508 upper = None
509 else: # a[1]
510 return self.Expr(parent.GetChild(0))
511 else:
512 assert typ0 == Id.Arith_Colon
513 lower = None
514 if n == 1: # a[:]
515 op_tok = parent.GetChild(0).tok
516 upper = None
517 else: # a[:3]
518 op_tok = parent.GetChild(0).tok
519 upper = self.Expr(parent.GetChild(1))
520
521 return expr.Slice(lower, op_tok, upper)
522
523 def Expr(self, pnode):
524 # type: (PNode) -> expr_t
525 """Transform expressions (as opposed to statements)"""
526 typ = pnode.typ
527
528 #
529 # YSH Entry Points / Additions
530 #
531
532 if typ == grammar_nt.ysh_expr: # for if/while
533 # ysh_expr: '(' testlist ')'
534 return self.Expr(pnode.GetChild(1))
535
536 if typ == grammar_nt.command_expr:
537 # command_expr: testlist end_stmt
538 return self.Expr(pnode.GetChild(0))
539
540 #
541 # Python-like Expressions / Operators
542 #
543
544 if typ == grammar_nt.atom:
545 if pnode.NumChildren() == 1:
546 return self.Expr(pnode.GetChild(0))
547 return self._Atom(pnode)
548
549 if typ == grammar_nt.testlist:
550 # testlist: test (',' test)* [',']
551 return self._Tuple(pnode)
552
553 if typ == grammar_nt.test:
554 # test: or_test ['if' or_test 'else' test] | lambdef
555 if pnode.NumChildren() == 1:
556 return self.Expr(pnode.GetChild(0))
557
558 # TODO: Handle lambdef
559
560 test = self.Expr(pnode.GetChild(2))
561 body = self.Expr(pnode.GetChild(0))
562 orelse = self.Expr(pnode.GetChild(4))
563 return expr.IfExp(test, body, orelse)
564
565 if typ == grammar_nt.lambdef:
566 # lambdef: '|' [name_type_list] '|' test
567
568 n = pnode.NumChildren()
569 if n == 4:
570 params = self._NameTypeList(pnode.GetChild(1))
571 else:
572 params = []
573
574 body = self.Expr(pnode.GetChild(n - 1))
575 return expr.Lambda(params, body)
576
577 #
578 # Operators with Precedence
579 #
580
581 if typ == grammar_nt.or_test:
582 # or_test: and_test ('or' and_test)*
583 return self._LeftAssoc(pnode)
584
585 if typ == grammar_nt.and_test:
586 # and_test: not_test ('and' not_test)*
587 return self._LeftAssoc(pnode)
588
589 if typ == grammar_nt.not_test:
590 # not_test: 'not' not_test | comparison
591 if pnode.NumChildren() == 1:
592 return self.Expr(pnode.GetChild(0))
593
594 op_tok = pnode.GetChild(0).tok # not
595 return expr.Unary(op_tok, self.Expr(pnode.GetChild(1)))
596
597 elif typ == grammar_nt.comparison:
598 if pnode.NumChildren() == 1:
599 return self.Expr(pnode.GetChild(0))
600
601 return self._CompareChain(pnode)
602
603 elif typ == grammar_nt.range_expr:
604 n = pnode.NumChildren()
605 if n == 1:
606 return self.Expr(pnode.GetChild(0))
607
608 if n == 3:
609 return expr.Range(self.Expr(pnode.GetChild(0)),
610 pnode.GetChild(1).tok,
611 self.Expr(pnode.GetChild(2)))
612
613 raise AssertionError(n)
614
615 elif typ == grammar_nt.expr:
616 # expr: xor_expr ('|' xor_expr)*
617 return self._LeftAssoc(pnode)
618
619 if typ == grammar_nt.xor_expr:
620 # xor_expr: and_expr ('xor' and_expr)*
621 return self._LeftAssoc(pnode)
622
623 if typ == grammar_nt.and_expr: # a & b
624 # and_expr: shift_expr ('&' shift_expr)*
625 return self._LeftAssoc(pnode)
626
627 elif typ == grammar_nt.shift_expr:
628 # shift_expr: arith_expr (('<<'|'>>') arith_expr)*
629 return self._LeftAssoc(pnode)
630
631 elif typ == grammar_nt.arith_expr:
632 # arith_expr: term (('+'|'-') term)*
633 return self._LeftAssoc(pnode)
634
635 elif typ == grammar_nt.term:
636 # term: factor (('*'|'/'|'div'|'mod') factor)*
637 return self._LeftAssoc(pnode)
638
639 elif typ == grammar_nt.factor:
640 # factor: ('+'|'-'|'~') factor | power
641 # the power would have already been reduced
642 if pnode.NumChildren() == 1:
643 return self.Expr(pnode.GetChild(0))
644
645 assert pnode.NumChildren() == 2
646 op = pnode.GetChild(0)
647 e = pnode.GetChild(1)
648
649 assert isinstance(op.tok, Token)
650 return expr.Unary(op.tok, self.Expr(e))
651
652 elif typ == grammar_nt.power:
653 # power: atom trailer* ['**' factor]
654
655 node = self.Expr(pnode.GetChild(0))
656 if pnode.NumChildren() == 1: # No trailers
657 return node
658
659 # Support a->startswith(b) and mydict.key
660 n = pnode.NumChildren()
661 i = 1
662 while i < n and pnode.GetChild(i).typ == grammar_nt.trailer:
663 node = self._Trailer(node, pnode.GetChild(i))
664 i += 1
665
666 if i != n: # ['**' factor]
667 op_tok = pnode.GetChild(i).tok
668 assert op_tok.id == Id.Arith_DStar, op_tok
669 factor = self.Expr(pnode.GetChild(i + 1))
670 node = expr.Binary(op_tok, node, factor)
671
672 return node
673
674 elif typ == grammar_nt.eggex:
675 return self._Eggex(pnode)
676
677 elif typ == grammar_nt.ysh_expr_sub:
678 return self.Expr(pnode.GetChild(0))
679
680 #
681 # YSH Lexer Modes
682 #
683
684 elif typ == grammar_nt.sh_array_literal:
685 return cast(ShArrayLiteral, pnode.GetChild(1).tok)
686
687 elif typ == grammar_nt.old_sh_array_literal:
688 return cast(ShArrayLiteral, pnode.GetChild(1).tok)
689
690 elif typ == grammar_nt.sh_command_sub:
691 return cast(CommandSub, pnode.GetChild(1).tok)
692
693 elif typ == grammar_nt.braced_var_sub:
694 return cast(BracedVarSub, pnode.GetChild(1).tok)
695
696 elif typ == grammar_nt.dq_string:
697 dq = cast(DoubleQuoted, pnode.GetChild(1).tok)
698 # sugar: ^"..." is short for ^["..."]
699 if pnode.GetChild(0).typ == Id.Left_CaretDoubleQuote:
700 return expr.Literal(dq)
701 return dq
702
703 elif typ == grammar_nt.sq_string:
704 return cast(SingleQuoted, pnode.GetChild(1).tok)
705
706 elif typ == grammar_nt.simple_var_sub:
707 tok = pnode.GetChild(0).tok
708
709 if tok.id == Id.VSub_DollarName: # $foo is disallowed
710 bare = lexer.TokenSliceLeft(tok, 1)
711 p_die(
712 'In expressions, remove $ and use `%s`, or sometimes "$%s"'
713 % (bare, bare), tok)
714
715 # $? is allowed
716 return SimpleVarSub(tok)
717
718 #
719 # Terminals
720 #
721
722 tok = pnode.tok
723 if typ == Id.Expr_Name:
724 return expr.Var(tok, lexer.TokenVal(tok))
725
726 # Everything else is an expr.Const
727 tok_str = lexer.TokenVal(tok)
728 # Remove underscores from 1_000_000. The lexer is responsible for
729 # validation.
730 c_under = tok_str.replace('_', '')
731
732 if typ == Id.Expr_DecInt:
733 try:
734 cval = value.Int(mops.FromStr(c_under)) # type: value_t
735 except ValueError:
736 p_die('Decimal int constant is too large', tok)
737
738 elif typ == Id.Expr_BinInt:
739 assert c_under[:2] in ('0b', '0B'), c_under
740 try:
741 cval = value.Int(mops.FromStr(c_under[2:], 2))
742 except ValueError:
743 p_die('Binary int constant is too large', tok)
744
745 elif typ == Id.Expr_OctInt:
746 assert c_under[:2] in ('0o', '0O'), c_under
747 try:
748 cval = value.Int(mops.FromStr(c_under[2:], 8))
749 except ValueError:
750 p_die('Octal int constant is too large', tok)
751
752 elif typ == Id.Expr_HexInt:
753 assert c_under[:2] in ('0x', '0X'), c_under
754 try:
755 cval = value.Int(mops.FromStr(c_under[2:], 16))
756 except ValueError:
757 p_die('Hex int constant is too large', tok)
758
759 elif typ == Id.Expr_Float:
760 # Note: float() in mycpp/gc_builtins.cc currently uses strtod
761 # I think this never raises ValueError, because the lexer
762 # should only accept strings that strtod() does?
763 cval = value.Float(float(c_under))
764
765 elif typ == Id.Expr_Null:
766 cval = value.Null
767
768 elif typ == Id.Expr_True:
769 cval = value.Bool(True)
770
771 elif typ == Id.Expr_False:
772 cval = value.Bool(False)
773
774 elif typ == Id.Char_OneChar: # \n
775 assert len(tok_str) == 2, tok_str
776 s = consts.LookupCharC(lexer.TokenSliceLeft(tok, 1))
777 cval = value.Str(s)
778
779 elif typ == Id.Char_YHex: # \yff
780 assert len(tok_str) == 4, tok_str
781 hex_str = lexer.TokenSliceLeft(tok, 2)
782 s = chr(int(hex_str, 16))
783 cval = value.Str(s)
784
785 elif typ == Id.Char_UBraced: # \u{123}
786 hex_str = lexer.TokenSlice(tok, 3, -1)
787 code_point = int(hex_str, 16)
788 s = j8.Utf8Encode(code_point)
789 cval = value.Str(s)
790
791 else:
792 raise AssertionError(typ)
793
794 return expr.Const(tok, cval)
795
796 def _CheckLhs(self, lhs):
797 # type: (expr_t) -> None
798
799 UP_lhs = lhs
800 with tagswitch(lhs) as case:
801 if case(expr_e.Var):
802 # OK - e.g. setvar a.b.c[i] = 42
803 pass
804
805 elif case(expr_e.Subscript):
806 lhs = cast(Subscript, UP_lhs)
807 self._CheckLhs(lhs.obj) # recurse on LHS
808
809 elif case(expr_e.Attribute):
810 lhs = cast(Attribute, UP_lhs)
811 self._CheckLhs(lhs.obj) # recurse on LHS
812
813 else:
814 # Illegal - e.g. setglobal {}["key"] = 42
815 p_die("Subscript/Attribute not allowed on this LHS expression",
816 location.TokenForExpr(lhs))
817
818 def _LhsExprList(self, p_node):
819 # type: (PNode) -> List[y_lhs_t]
820 """lhs_list: expr (',' expr)*"""
821 assert p_node.typ == grammar_nt.lhs_list
822
823 lhs_list = [] # type: List[y_lhs_t]
824 n = p_node.NumChildren()
825 for i in xrange(0, n, 2):
826 p = p_node.GetChild(i)
827 #self.p_printer.Print(p)
828
829 e = self.Expr(p)
830 UP_e = e
831 with tagswitch(e) as case:
832 if case(expr_e.Var):
833 e = cast(expr.Var, UP_e)
834 lhs_list.append(e.left)
835
836 elif case(expr_e.Subscript):
837 e = cast(Subscript, UP_e)
838 self._CheckLhs(e)
839 lhs_list.append(e)
840
841 elif case(expr_e.Attribute):
842 e = cast(Attribute, UP_e)
843 self._CheckLhs(e)
844 if e.op.id != Id.Expr_Dot:
845 # e.g. setvar obj->method is not valid
846 p_die("Can't assign to this attribute expr", e.op)
847 lhs_list.append(e)
848
849 else:
850 pass # work around mycpp bug
851
852 # TODO: could blame arbitrary expr_t, but this works most of
853 # the time
854 if p.tok:
855 blame = p.tok # type: loc_t
856 else:
857 blame = loc.Missing
858 p_die("Can't assign to this expression", blame)
859
860 return lhs_list
861
862 def MakeVarDecl(self, p_node):
863 # type: (PNode) -> command.VarDecl
864 """
865 ysh_var_decl: name_type_list ['=' testlist] end_stmt
866 """
867 assert p_node.typ == grammar_nt.ysh_var_decl
868
869 lhs = self._NameTypeList(p_node.GetChild(0)) # could be a tuple
870
871 # This syntax is confusing, and different from JavaScript
872 # var x, y = 1, 2
873 # But this is useful:
874 # var flag, i = parseArgs(spec, argv)
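# e.g. 'var x = 42' produces, roughly:
#   command.VarDecl(keyword=None, lhs=[NameType('x', ...)], rhs=Const(42))
# (the 'var'/'const' keyword token is filled in by the caller, per below)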
875
876 n = p_node.NumChildren()
877 if n >= 3:
878 rhs = self.Expr(p_node.GetChild(2))
879 else:
880 rhs = None
881
882 # The caller should fill in the keyword token.
883 return command.VarDecl(None, lhs, rhs)
884
885 def MakeMutation(self, p_node):
886 # type: (PNode) -> command.Mutation
887 """
888 ysh_mutation: lhs_list (augassign | '=') testlist end_stmt
889 """
890 typ = p_node.typ
891 assert typ == grammar_nt.ysh_mutation
892
893 lhs_list = self._LhsExprList(p_node.GetChild(0)) # could be a tuple
894 op_tok = p_node.GetChild(1).tok
895 if len(lhs_list) > 1 and op_tok.id != Id.Arith_Equal:
896 p_die('Multiple assignment must use =', op_tok)
897 rhs = self.Expr(p_node.GetChild(2))
898 return command.Mutation(None, lhs_list, op_tok, rhs)
899
900 def _EggexFlag(self, p_node):
901 # type: (PNode) -> EggexFlag
902 n = p_node.NumChildren()
903 if n == 1:
904 return EggexFlag(False, p_node.GetChild(0).tok)
905 elif n == 2:
906 return EggexFlag(True, p_node.GetChild(1).tok)
907 else:
908 raise AssertionError()
909
910 def _Eggex(self, p_node):
911 # type: (PNode) -> Eggex
912 """
913 eggex: '/' regex [';' re_flag* [';' Expr_Name] ] '/'
914 """
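# Rough example (the flag name is illustrative): '/ d+ ; reg_icase /' yields
#   Eggex(left, regex=<d+>, flags=[EggexFlag(...)], trans_pref=None,
#         canonical_flags=...)
# and a second ';' names a translation preference, e.g. '/ d+ ; ; ERE /'.
# Flags are canonicalized only for ERE, the default target.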
915 left = p_node.GetChild(0).tok
916 regex = self._Regex(p_node.GetChild(1))
917
918 flags = [] # type: List[EggexFlag]
919 trans_pref = None # type: Optional[Token]
920
921 i = 2
922 current = p_node.GetChild(i)
923 if current.typ == Id.Op_Semi:
924 i += 1
925 while True:
926 current = p_node.GetChild(i)
927 if current.typ != grammar_nt.re_flag:
928 break
929 flags.append(self._EggexFlag(current))
930 i += 1
931
932 if current.typ == Id.Op_Semi:
933 i += 1
934 trans_pref = p_node.GetChild(i).tok
935
936 # Canonicalize and validate flags for ERE only. Default is ERE.
937 if trans_pref is None or lexer.TokenVal(trans_pref) == 'ERE':
938 canonical_flags = regex_translate.CanonicalFlags(flags)
939 else:
940 canonical_flags = None
941
942 return Eggex(left, regex, flags, trans_pref, canonical_flags)
943
944 def YshCasePattern(self, pnode):
945 # type: (PNode) -> pat_t
946 assert pnode.typ == grammar_nt.ysh_case_pat, pnode
947
948 pattern = pnode.GetChild(0)
949 typ = pattern.typ
950 if typ == Id.Op_LParen:
951 # pat_expr or pat_else
952 pattern = pnode.GetChild(1)
953 typ = pattern.typ
954
955 if typ == grammar_nt.pat_else:
956 return pat.Else
957
958 if typ == grammar_nt.pat_exprs:
959 exprs = [] # type: List[expr_t]
960 for i in xrange(pattern.NumChildren()):
961 child = pattern.GetChild(i)
962 if child.typ == grammar_nt.expr:
963 expr = self.Expr(child)
964 exprs.append(expr)
965 return pat.YshExprs(exprs)
966
967 if typ == grammar_nt.eggex:
968 return self._Eggex(pattern)
969
970 raise AssertionError()
971
972 def _BlockArg(self, p_node):
973 # type: (PNode) -> expr_t
974
975 n = p_node.NumChildren()
976 if n == 1:
977 child = p_node.GetChild(0)
978 return self.Expr(child)
979
980 # It can only be an expression, not a=42, or ...expr
981 p_die('Invalid block expression argument', p_node.tok)
982
983 def _Argument(self, p_node, after_semi, arglist):
984 # type: (PNode, bool, ArgList) -> None
985 """
986 argument: (
987 test [comp_for]
988 | test '=' test # named arg
989 | '...' test # var args
990 )
991 """
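# Sketch of where each argument form lands in the ArgList:
#   f(x)            -> x appended to pos_args
#   f(...xs)        -> expr.Spread in pos_args (or named_args after ';')
#   f(n=42)         -> NamedArg in named_args
#   f(x for x in y) -> expr.GeneratorExp in pos_args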
992 pos_args = arglist.pos_args
993 named_args = arglist.named_args
994
995 assert p_node.typ == grammar_nt.argument, p_node
996 n = p_node.NumChildren()
997 if n == 1:
998 child = p_node.GetChild(0)
999 if after_semi:
1000 p_die(POS_ARG_MISPLACED, child.tok)
1001 arg = self.Expr(child)
1002 pos_args.append(arg)
1003 return
1004
1005 if n == 2:
1006 # Note: We allow multiple spreads, just like Julia. They are
1007 # concatenated as in lists and dicts.
1008 tok0 = p_node.GetChild(0).tok
1009 if tok0.id == Id.Expr_Ellipsis:
1010 spread_expr = expr.Spread(tok0, self.Expr(p_node.GetChild(1)))
1011 if after_semi: # f(; ... named)
1012 named_args.append(NamedArg(None, spread_expr))
1013 else: # f(...named)
1014 pos_args.append(spread_expr)
1015 return
1016
1017 # Note: generator expression not implemented
1018 if p_node.GetChild(1).typ == grammar_nt.comp_for:
1019 child = p_node.GetChild(0)
1020 if after_semi:
1021 p_die(POS_ARG_MISPLACED, child.tok)
1022
1023 elt = self.Expr(child)
1024 comp = self._CompFor(p_node.GetChild(1))
1025 arg = expr.GeneratorExp(elt, [comp])
1026 pos_args.append(arg)
1027 return
1028
1029 raise AssertionError()
1030
1031 if n == 3: # named args can come before or after the semicolon
1032 n1 = NamedArg(
1033 p_node.GetChild(0).tok, self.Expr(p_node.GetChild(2)))
1034 named_args.append(n1)
1035 return
1036
1037 raise AssertionError()
1038
1039 def _ArgGroup(self, p_node, after_semi, arglist):
1040 # type: (PNode, bool, ArgList) -> None
1041 """
1042 arg_group: argument (',' argument)* [',']
1043 """
1044 for i in xrange(p_node.NumChildren()):
1045 p_child = p_node.GetChild(i)
1046 if p_child.typ == grammar_nt.argument:
1047 self._Argument(p_child, after_semi, arglist)
1048
1049 def _ArgList(self, p_node, arglist):
1050 # type: (PNode, ArgList) -> None
1051 """For both funcs and procs
1052
1053 arglist: (
1054 [arg_group]
1055 [';' [arg_group]]
1056 )
1057
1058 arglist3: ...
1059 """
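# e.g. 'f(1, 2; n=3)' fills arglist roughly as:
#   pos_args=[1, 2], semi_tok=';', named_args=[NamedArg(n, 3)]
# For arglist3 (procs), a second ';' may introduce a block expression.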
1060 n = p_node.NumChildren()
1061 if n == 0:
1062 return
1063
1064 i = 0
1065
1066 if i >= n:
1067 return
1068 child = p_node.GetChild(i)
1069 if child.typ == grammar_nt.arg_group:
1070 self._ArgGroup(child, False, arglist)
1071 i += 1
1072
1073 if i >= n:
1074 return
1075 child = p_node.GetChild(i)
1076 if child.typ == Id.Op_Semi:
1077 arglist.semi_tok = child.tok
1078 i += 1
1079
1080 # Named args after first semi-colon
1081 if i >= n:
1082 return
1083 child = p_node.GetChild(i)
1084 if child.typ == grammar_nt.arg_group:
1085 self._ArgGroup(child, True, arglist)
1086 i += 1
1087
1088 #
1089 # Special third group may have block expression - only for arglist3,
1090 # used for procs!
1091 #
1092
1093 if i >= n:
1094 return
1095 assert p_node.typ == grammar_nt.arglist3, p_node
1096
1097 child = p_node.GetChild(i)
1098 if child.typ == Id.Op_Semi:
1099 arglist.semi_tok2 = child.tok
1100 i += 1
1101
1102 if i >= n:
1103 return
1104 child = p_node.GetChild(i)
1105 if child.typ == grammar_nt.argument:
1106 arglist.block_expr = self._BlockArg(child)
1107 i += 1
1108
1109 def ProcCallArgs(self, pnode, arglist):
1110 # type: (PNode, ArgList) -> None
1111 """
1112 ysh_eager_arglist: '(' [arglist3] ')'
1113 ysh_lazy_arglist: '[' [arglist] ']'
1114 """
1115 n = pnode.NumChildren()
1116 if n == 2: # f()
1117 return
1118
1119 if n == 3:
1120 child1 = pnode.GetChild(1) # the X in '( X )'
1121
1122 self._ArgList(child1, arglist)
1123 return
1124
1125 raise AssertionError()
1126
1127 def _TypeExpr(self, pnode):
1128 # type: (PNode) -> TypeExpr
1129 """
1130 type_expr: Expr_Name [ '[' type_expr (',' type_expr)* ']' ]
1131 """
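# e.g. 'Dict[Str, Int]' becomes, roughly:
#   TypeExpr(name='Dict', params=[TypeExpr('Str'), TypeExpr('Int')])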
1132 assert pnode.typ == grammar_nt.type_expr, pnode.typ
1133
1134 ty = TypeExpr.CreateNull() # don't allocate children
1135
1136 ty.tok = pnode.GetChild(0).tok
1137 ty.name = lexer.TokenVal(ty.tok)
1138
1139 n = pnode.NumChildren()
1140 if n == 1:
1141 return ty
1142
1143 ty.params = []
1144 i = 2
1145 while i < n:
1146 p = self._TypeExpr(pnode.GetChild(i))
1147 ty.params.append(p)
1148 i += 2 # skip comma
1149
1150 return ty
1151
1152 def _Param(self, pnode):
1153 # type: (PNode) -> Param
1154 """
1155 param: Expr_Name [type_expr] ['=' expr]
1156 """
1157 assert pnode.typ == grammar_nt.param
1158
1159 name_tok = pnode.GetChild(0).tok
1160 n = pnode.NumChildren()
1161
1162 assert name_tok.id == Id.Expr_Name, name_tok
1163
1164 default_val = None # type: expr_t
1165 type_ = None # type: TypeExpr
1166
1167 if n == 1:
1168 # proc p(a)
1169 pass
1170
1171 elif n == 2:
1172 # proc p(a Int)
1173 type_ = self._TypeExpr(pnode.GetChild(1))
1174
1175 elif n == 3:
1176 # proc p(a = 3)
1177 default_val = self.Expr(pnode.GetChild(2))
1178
1179 elif n == 4:
1180 # proc p(a Int = 3)
1181 type_ = self._TypeExpr(pnode.GetChild(1))
1182 default_val = self.Expr(pnode.GetChild(3))
1183
1184 return Param(name_tok, lexer.TokenVal(name_tok), type_, default_val)
1185
1186 def _ParamGroup(self, p_node):
1187 # type: (PNode) -> ParamGroup
1188 """
1189 param_group:
1190 (param ',')*
1191 [ (param | '...' Expr_Name) [,] ]
1192 """
1193 assert p_node.typ == grammar_nt.param_group, p_node
1194
1195 params = [] # type: List[Param]
1196 rest_of = None # type: Optional[RestParam]
1197
1198 n = p_node.NumChildren()
1199 i = 0
1200 while i < n:
1201 child = p_node.GetChild(i)
1202 if child.typ == grammar_nt.param:
1203 params.append(self._Param(child))
1204
1205 elif child.typ == Id.Expr_Ellipsis:
1206 tok = p_node.GetChild(i + 1).tok
1207 rest_of = RestParam(tok, lexer.TokenVal(tok))
1208
1209 i += 2
1210
1211 return ParamGroup(params, rest_of)
1212
1213 def Proc(self, p_node):
1214 # type: (PNode) -> proc_sig_t
1215 """
1216 ysh_proc: (
1217 [ '('
1218 [ param_group ] # word params, with defaults
1219 [ ';' [ param_group ] ] # positional typed params, with defaults
1220 [ ';' [ param_group ] ] # named params, with defaults
1221 [ ';' Expr_Name ] # optional block param, with no type or default
1222 ')'
1223 ]
1224 '{' # opening { for pgen2
1225 )
1226 """
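# Sketch: 'proc p (w1, w2; t1; n1; block) {' fills proc_sig.Closed roughly as
#   word=ParamGroup([w1, w2]), positional=ParamGroup([t1]),
#   named=ParamGroup([n1]), block_param=Param('block')
# and bare 'proc p {' is proc_sig.Open.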
1227 typ = p_node.typ
1228 assert typ == grammar_nt.ysh_proc
1229
1230 n = p_node.NumChildren()
1231 if n == 1: # proc f {
1232 return proc_sig.Open
1233
1234 if n == 3: # proc f () {
1235 return proc_sig.Closed.CreateNull(alloc_lists=True) # no params
1236
1237 # proc f( three param groups, and block group )
1238 sig = proc_sig.Closed.CreateNull(alloc_lists=True) # no params
1239
1240 # Word args
1241 i = 1
1242 child = p_node.GetChild(i)
1243 if child.typ == grammar_nt.param_group:
1244 sig.word = self._ParamGroup(p_node.GetChild(i))
1245
1246 # Validate word args
1247 for word in sig.word.params:
1248 if word.type:
1249 if word.type.name not in ('Str', 'Ref'):
1250 p_die('Word params may only have type Str or Ref',
1251 word.type.tok)
1252 if word.type.params is not None:
1253 p_die('Unexpected type parameters', word.type.tok)
1254
1255 i += 2
1256 else:
1257 i += 1
1258
1259 #log('i %d n %d', i, n)
1260 if i >= n:
1261 return sig
1262
1263 # Positional args
1264 child = p_node.GetChild(i)
1265 if child.typ == grammar_nt.param_group:
1266 sig.positional = self._ParamGroup(p_node.GetChild(i))
1267 i += 2
1268 else:
1269 i += 1
1270
1271 #log('i %d n %d', i, n)
1272 if i >= n:
1273 return sig
1274
1275 # Keyword args
1276 child = p_node.GetChild(i)
1277 if child.typ == grammar_nt.param_group:
1278 sig.named = self._ParamGroup(p_node.GetChild(i))
1279 i += 2
1280 else:
1281 i += 1
1282
1283 #log('i %d n %d', i, n)
1284 if i >= n:
1285 return sig
1286
1287 child = p_node.GetChild(i)
1288 if child.typ == grammar_nt.param_group:
1289 group = self._ParamGroup(p_node.GetChild(i))
1290 params = group.params
1291 if len(params) > 1:
1292 p_die('Only 1 block param is allowed', params[1].blame_tok)
1293 if group.rest_of:
1294 p_die("Rest param isn't allowed for blocks",
1295 group.rest_of.blame_tok)
1296
1297 if len(params) == 1:
1298 if params[0].type:
1299 if params[0].type.name != 'Command':
1300 p_die('Block param must have type Command',
1301 params[0].type.tok)
1302 if params[0].type.params is not None:
1303 p_die('Unexpected type parameters', params[0].type.tok)
1304
1305 sig.block_param = params[0]
1306
1307 return sig
1308
1309 def YshFunc(self, p_node, out):
1310 # type: (PNode, Func) -> None
1311 """
1312 ysh_func: Expr_Name '(' [param_group] [';' param_group] ')'
1313 """
1314 assert p_node.typ == grammar_nt.ysh_func
1315
1316 #self.p_printer.Print(p_node)
1317
1318 out.name = p_node.GetChild(0).tok
1319
1320 n = p_node.NumChildren()
1321 i = 2 # after (
1322
1323 child = p_node.GetChild(i)
1324 if child.typ == grammar_nt.param_group:
1325 out.positional = self._ParamGroup(child)
1326 i += 2 # skip past ;
1327 else:
1328 i += 1
1329
1330 if i >= n:
1331 return
1332
1333 child = p_node.GetChild(i)
1334 if child.typ == grammar_nt.param_group:
1335 out.named = self._ParamGroup(child)
1336
1337 #
1338 # Eggex Language
1339 #
1340
1341 def _RangeCharSingleQuoted(self, p_node):
1342 # type: (PNode) -> Optional[CharCode]
1343
1344 assert p_node.typ == grammar_nt.range_char, p_node
1345
1346 # 'a' in 'a'-'b'
1347
1348 child0 = p_node.GetChild(0)
1349 if child0.typ == grammar_nt.sq_string:
1350 sq_part = cast(SingleQuoted, child0.GetChild(1).tok)
1351 n = len(sq_part.sval)
1352 if n == 0:
1353 p_die("Quoted range char can't be empty",
1354 loc.WordPart(sq_part))
1355 elif n == 1:
1356 return CharCode(sq_part.left, ord(sq_part.sval[0]), False)
1357 else:
1358 p_die(RANGE_POINT_TOO_LONG, loc.WordPart(sq_part))
1359 return None
1360
1361 def _OtherRangeToken(self, p_node):
1362 # type: (PNode) -> Token
1363 """An endpoint of a range (single char)
1364
1365 range_char: Expr_Name | Expr_DecInt | sq_string | char_literal
1366 a-z 0-9 'a'-'z' \x00-\xff
1367 """
1368 assert p_node.typ == grammar_nt.range_char, p_node
1369
1370 child0 = p_node.GetChild(0)
1371 if child0.typ == grammar_nt.char_literal:
1372 # \x00 in /[\x00 - \x20]/
1373 tok = child0.GetChild(0).tok
1374 return tok
1375
1376 tok = p_node.tok
1377 # a in a-z is Expr_Name
1378 # 0 in 0-9 is Expr_DecInt
1379 assert tok.id in (Id.Expr_Name, Id.Expr_DecInt), tok
1380
1381 if tok.length != 1:
1382 p_die(RANGE_POINT_TOO_LONG, tok)
1383 return tok
1384
1385 def _NonRangeChars(self, p_node):
1386 # type: (PNode) -> class_literal_term_t
1387 """
1388 \" \u1234 '#'
1389 """
1390 assert p_node.typ == grammar_nt.range_char, p_node
1391
1392 child0 = p_node.GetChild(0)
1393 typ0 = p_node.GetChild(0).typ
1394
1395 if typ0 == grammar_nt.sq_string:
1396 return cast(SingleQuoted, child0.GetChild(1).tok)
1397
1398 if typ0 == grammar_nt.char_literal:
1399 return word_compile.EvalCharLiteralForRegex(child0.tok)
1400
1401 if typ0 == Id.Expr_Name:
1402 # Look up PerlClass and PosixClass
1403 return self._NameInClass(None, child0.tok)
1404
1405 raise AssertionError()
1406
1407 def _ClassLiteralTerm(self, p_node):
1408 # type: (PNode) -> class_literal_term_t
1409 """
1410 class_literal_term:
1411 range_char ['-' range_char ]
1412 | '@' Expr_Name # splice
1413 | '!' Expr_Name # negate char class
1414 ...
1415 """
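# Sketch of the alternatives handled below:
#   a-z, 'a'-'z', \x00-\xff -> CharRange(CharCode, CharCode)
#   'x', \n, \u{3bc}        -> single characters / CharCode
#   @myclass                -> class_literal_term.Splice
#   !digit                  -> negated PosixClass/PerlClass via _NameInClass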
1416 assert p_node.typ == grammar_nt.class_literal_term, p_node
1417
1418 typ0 = p_node.GetChild(0).typ
1419
1420 if typ0 == grammar_nt.range_char:
1421 n = p_node.NumChildren()
1422
1423 if n == 1:
1424 return self._NonRangeChars(p_node.GetChild(0))
1425
1426 # 'a'-'z' etc.
1427 if n == 3:
1428 assert p_node.GetChild(1).typ == Id.Arith_Minus, p_node
1429
1430 left = p_node.GetChild(0)
1431 right = p_node.GetChild(2)
1432
1433 code1 = self._RangeCharSingleQuoted(left)
1434 if code1 is None:
1435 tok1 = self._OtherRangeToken(left)
1436 code1 = word_compile.EvalCharLiteralForRegex(tok1)
1437
1438 code2 = self._RangeCharSingleQuoted(right)
1439 if code2 is None:
1440 tok2 = self._OtherRangeToken(right)
1441 code2 = word_compile.EvalCharLiteralForRegex(tok2)
1442 return CharRange(code1, code2)
1443
1444 raise AssertionError()
1445
1446 if typ0 == Id.Expr_At:
1447 tok1 = p_node.GetChild(1).tok
1448 return class_literal_term.Splice(tok1, lexer.TokenVal(tok1))
1449
1450 if typ0 == Id.Expr_Bang:
1451 return self._NameInClass(
1452 p_node.GetChild(0).tok,
1453 p_node.GetChild(1).tok)
1454
1455 p_die("This kind of class literal term isn't implemented",
1456 p_node.GetChild(0).tok)
1457
1458 def _ClassLiteral(self, p_node):
1459 # type: (PNode) -> List[class_literal_term_t]
1460 """class_literal: '[' class_literal_term+ ']'."""
1461 assert p_node.typ == grammar_nt.class_literal
1462 # skip [ and ]
1463 terms = [] # type: List[class_literal_term_t]
1464 for i in xrange(1, p_node.NumChildren() - 1):
1465 terms.append(self._ClassLiteralTerm(p_node.GetChild(i)))
1466
1467 return terms
1468
1469 def _NameInRegex(self, negated_tok, tok):
1470 # type: (Token, Token) -> re_t
1471 tok_str = lexer.TokenVal(tok)
1472 if tok_str == 'dot':
1473 if negated_tok:
1474 p_die("Can't negate this symbol", tok)
1475 return re.Primitive(tok, Id.Eggex_Dot)
1476
1477 if tok_str in POSIX_CLASSES:
1478 return PosixClass(negated_tok, tok_str)
1479
1480 perl = PERL_CLASSES.get(tok_str)
1481 if perl is not None:
1482 return PerlClass(negated_tok, perl)
1483
1484 if tok_str[0].isupper(): # e.g. HexDigit
1485 return re.Splice(tok, lexer.TokenVal(tok))
1486
1487 p_die("%r isn't a character class" % tok_str, tok)
1488
1489 def _NameInClass(self, negated_tok, tok):
1490 # type: (Token, Token) -> class_literal_term_t
1491 """Like the above, but 'dot' and 'd' don't mean anything within []"""
1492 tok_str = lexer.TokenVal(tok)
1493
1494 # A bare, unquoted character literal. In the grammar, this is expressed as
1495 # range_char without an ending.
1496
1497 # d is NOT 'digit', it's a literal 'd'!
1498 if len(tok_str) == 1:
1499 # Expr_Name matches VAR_NAME_RE, which starts with [a-zA-Z_]
1500 assert tok.id in (Id.Expr_Name, Id.Expr_DecInt)
1501
1502 if negated_tok: # [!d] is not allowed, only [!digit]
1503 p_die("Can't negate this symbol", tok)
1504 return word_compile.EvalCharLiteralForRegex(tok)
1505
1506 # digit, word, but not d, w, etc.
1507 if tok_str in POSIX_CLASSES:
1508 return PosixClass(negated_tok, tok_str)
1509
1510 perl = PERL_CLASSES.get(tok_str)
1511 if perl is not None:
1512 return PerlClass(negated_tok, perl)
1513 p_die("%r isn't a character class" % tok_str, tok)
1514
1515 def _ReAtom(self, p_atom):
1516 # type: (PNode) -> re_t
1517 """
1518 re_atom: ( char_literal | ...
1519 """
1520 assert p_atom.typ == grammar_nt.re_atom, p_atom.typ
1521
1522 child0 = p_atom.GetChild(0)
1523
1524 typ0 = p_atom.GetChild(0).typ
1525 tok0 = p_atom.GetChild(0).tok
1526
1527 # Non-terminals
1528
1529 if typ0 == grammar_nt.class_literal:
1530 return re.CharClassLiteral(False, self._ClassLiteral(child0))
1531
1532 if typ0 == grammar_nt.sq_string:
1533 return cast(SingleQuoted, child0.GetChild(1).tok)
1534
1535 if typ0 == grammar_nt.char_literal:
1536 # Note: ERE doesn't seem to support escapes like Python
1537 # https://docs.python.org/3/library/re.html
1538 # We might want to do a translation like this:
1539 #
1540 # \u{03bc} -> \u03bc
1541 # \x00 -> \x00
1542 # \n -> \n
1543
1544 # Must be Id.Char_{OneChar,Hex,UBraced}
1545 assert consts.GetKind(tok0.id) == Kind.Char
1546 s = word_compile.EvalCStringToken(tok0.id, lexer.TokenVal(tok0))
1547 return re.LiteralChars(tok0, s)
1548
1549 # Special punctuation
1550 if typ0 == Id.Expr_Dot: # .
1551 return re.Primitive(tok0, Id.Eggex_Dot)
1552
1553 if typ0 == Id.Arith_Caret: # ^
1554 return re.Primitive(tok0, Id.Eggex_Start)
1555
1556 if typ0 == Id.Expr_Dollar: # $
1557 return re.Primitive(tok0, Id.Eggex_End)
1558
1559 if typ0 == Id.Expr_Name:
1560 # d digit -> PosixClass PerlClass etc.
1561 return self._NameInRegex(None, tok0)
1562
1563 if typ0 == Id.Expr_Symbol:
1564 # Validate symbols here, like we validate PerlClass, etc.
1565 tok_str = lexer.TokenVal(tok0)
1566 if tok_str == '%start':
1567 return re.Primitive(tok0, Id.Eggex_Start)
1568 if tok_str == '%end':
1569 return re.Primitive(tok0, Id.Eggex_End)
1570 p_die("Unexpected token %r in regex" % tok_str, tok0)
1571
1572 if typ0 == Id.Expr_At:
1573 # | '@' Expr_Name
1574 tok1 = p_atom.GetChild(1).tok
1575 return re.Splice(tok0, lexer.TokenVal(tok1))
1576
1577 if typ0 == Id.Expr_Bang:
1578 # | '!' (Expr_Name | class_literal)
1579 # | '!' '!' Expr_Name (Expr_Name | Expr_DecInt | '(' regex ')')
1580 n = p_atom.NumChildren()
1581 if n == 2:
1582 child1 = p_atom.GetChild(1)
1583 if child1.typ == grammar_nt.class_literal:
1584 return re.CharClassLiteral(True,
1585 self._ClassLiteral(child1))
1586 else:
1587 return self._NameInRegex(tok0, p_atom.GetChild(1).tok)
1588 else:
1589 # Note: !! conflicts with shell history
1590 p_die(
1591 "Backtracking with !! isn't implemented (requires Python/PCRE)",
1592 p_atom.GetChild(1).tok)
1593
1594 if typ0 == Id.Op_LParen:
1595 # | '(' regex ')'
1596
1597 # Note: in ERE (d+) is the same as <d+>. That is, Group becomes
1598 # Capture.
1599 return re.Group(self._Regex(p_atom.GetChild(1)))
1600
1601 if typ0 == Id.Arith_Less:
1602 # | '<' 'capture' regex ['as' Expr_Name] [':' Expr_Name] '>'
1603
1604 n = p_atom.NumChildren()
1605 assert n == 4 or n == 6 or n == 8, n
1606
1607 # < capture d+ >
1608 regex = self._Regex(p_atom.GetChild(2))
1609
1610 as_name = None # type: Optional[Token]
1611 func_name = None # type: Optional[Token]
1612
1613 i = 3 # points at any of > as :
1614
1615 typ = p_atom.GetChild(i).typ
1616 if typ == Id.Expr_As:
1617 as_name = p_atom.GetChild(i + 1).tok
1618 i += 2
1619
1620 typ = p_atom.GetChild(i).typ
1621 if typ == Id.Arith_Colon:
1622 func_name = p_atom.GetChild(i + 1).tok
1623
1624 return re.Capture(regex, as_name, func_name)
1625
1626 raise AssertionError(typ0)
1627
1628 def _RepeatOp(self, p_repeat):
1629 # type: (PNode) -> re_repeat_t
1630 """
1631 repeat_op: '+' | '*' | '?'
1632 | '{' [Expr_Name] ('+' | '*' | '?' | repeat_range) '}'
1633 """
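# Sketch of results:
#   a+ a* a?    -> the operator Token itself
#   a{3}        -> the Expr_DecInt Token (exact count)
#   a{1,3}      -> re_repeat.Range(lower='1', upper='3')
#   a{1,} a{,3} -> Range with the missing bound stored as ''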
1634 assert p_repeat.typ == grammar_nt.repeat_op, p_repeat
1635
1636 tok = p_repeat.GetChild(0).tok
1637 id_ = tok.id
1638
1639 if id_ in (Id.Arith_Plus, Id.Arith_Star, Id.Arith_QMark):
1640 return tok # a+ a* a?
1641
1642 if id_ == Id.Op_LBrace:
1643 child1 = p_repeat.GetChild(1)
1644 if child1.typ != grammar_nt.repeat_range:
1645 # e.g. dot{N *} is .*?
1646 p_die("Perl-style repetition isn't implemented with libc",
1647 child1.tok)
1648
1649 # repeat_range: (
1650 # Expr_DecInt [',']
1651 # | ',' Expr_DecInt
1652 # | Expr_DecInt ',' Expr_DecInt
1653 # )
1654
1655 n = child1.NumChildren()
1656 if n == 1: # {3}
1657 tok = child1.GetChild(0).tok
1658 return tok # different operator than + * ?
1659
1660 if n == 2:
1661 if child1.GetChild(0).typ == Id.Expr_DecInt: # {1,}
1662 left = child1.GetChild(0).tok
1663 return re_repeat.Range(left, lexer.TokenVal(left), '',
1664 None)
1665 else: # {,3}
1666 right = child1.GetChild(1).tok
1667 return re_repeat.Range(None, '', lexer.TokenVal(right),
1668 right)
1669
1670 if n == 3: # {1,3}
1671 left = child1.GetChild(0).tok
1672 right = child1.GetChild(2).tok
1673 return re_repeat.Range(left, lexer.TokenVal(left),
1674 lexer.TokenVal(right), right)
1675
1676 raise AssertionError(n)
1677
1678 raise AssertionError(id_)
1679
1680 def _ReAlt(self, p_node):
1681 # type: (PNode) -> re_t
1682 """
1683 re_alt: (re_atom [repeat_op])+
1684 """
1685 assert p_node.typ == grammar_nt.re_alt
1686
1687 i = 0
1688 n = p_node.NumChildren()
1689 seq = [] # type: List[re_t]
1690 while i < n:
1691 r = self._ReAtom(p_node.GetChild(i))
1692 i += 1
1693 if i < n and p_node.GetChild(i).typ == grammar_nt.repeat_op:
1694 repeat_op = self._RepeatOp(p_node.GetChild(i))
1695 r = re.Repeat(r, repeat_op)
1696 i += 1
1697 seq.append(r)
1698
1699 if len(seq) == 1:
1700 return seq[0]
1701 else:
1702 return re.Seq(seq)
1703
1704 def _Regex(self, p_node):
1705 # type: (PNode) -> re_t
1706 """
1707 regex: [re_alt] (('|'|'or') re_alt)*
1708 """
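# e.g. 'd+ s | w' is roughly re.Alt([re.Seq([Repeat(d, +), s]), w]);
# a single alternative collapses to the re_t itself (see _ReAlt above).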
1709 assert p_node.typ == grammar_nt.regex
1710
1711 n = p_node.NumChildren()
1712 alts = [] # type: List[re_t]
1713 for i in xrange(0, n, 2): # was children[::2]
1714 c = p_node.GetChild(i)
1715 alts.append(self._ReAlt(c))
1716
1717 if len(alts) == 1:
1718 return alts[0]
1719 else:
1720 return re.Alt(alts)
1721
1722
1723# vim: sw=4