OILS / ysh / expr_to_ast.py

1721 lines, 1035 significant
1"""expr_to_ast.py."""
2from __future__ import print_function
3
4from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str, Kind
5from _devbuild.gen.syntax_asdl import (
6 Token,
7 SimpleVarSub,
8 loc,
9 loc_t,
10 DoubleQuoted,
11 SingleQuoted,
12 BracedVarSub,
13 CommandSub,
14 ShArrayLiteral,
15 command,
16 expr,
17 expr_e,
18 expr_t,
19 expr_context_e,
20 re,
21 re_t,
22 re_repeat,
23 re_repeat_t,
24 class_literal_term,
25 class_literal_term_t,
26 PosixClass,
27 PerlClass,
28 NameType,
29 y_lhs_t,
30 Comprehension,
31 Subscript,
32 Attribute,
33 proc_sig,
34 proc_sig_t,
35 Param,
36 RestParam,
37 ParamGroup,
38 NamedArg,
39 ArgList,
40 pat,
41 pat_t,
42 TypeExpr,
43 Func,
44 Eggex,
45 EggexFlag,
46 CharCode,
47 CharRange,
48)
49from _devbuild.gen.value_asdl import value, value_t
50from _devbuild.gen import grammar_nt
51from core.error import p_die
52from data_lang import j8
53from frontend import consts
54from frontend import lexer
55from frontend import location
56from mycpp import mops
57from mycpp import mylib
58from mycpp.mylib import log, tagswitch
59from osh import word_compile
60from ysh import expr_parse
61from ysh import regex_translate
62
63from typing import TYPE_CHECKING, Dict, List, Tuple, Optional, cast
64if TYPE_CHECKING:
65 from pgen2.grammar import Grammar
66 from pgen2.pnode import PNode
67
68_ = log
69
70PERL_CLASSES = {
71 'd': 'd',
72 'w': 'w',
73 'word': 'w',
74 's': 's',
75}
76# https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html
77POSIX_CLASSES = [
78 'alnum',
79 'cntrl',
80 'lower',
81 'space',
82 'alpha',
83 'digit',
84 'print',
85 'upper',
86 'blank',
87 'graph',
88 'punct',
89 'xdigit',
90]
91# NOTE: There are also things like \p{Greek} that we could put in the
92# "non-sigil" namespace.
93
94RANGE_POINT_TOO_LONG = "Range start/end shouldn't have more than one character"
95
96POS_ARG_MISPLACED = "Positional arg can't appear in group of named args"
97
98# Copied from pgen2/token.py to avoid dependency.
99NT_OFFSET = 256
100
101if mylib.PYTHON:
102
103 def MakeGrammarNames(ysh_grammar):
104 # type: (Grammar) -> Dict[int, str]
105
106 # TODO: Break this dependency
107 from frontend import lexer_def
108
109 names = {}
110
111 for id_name, k in lexer_def.ID_SPEC.id_str2int.items():
112 # Hm some are out of range
113 #assert k < 256, (k, id_name)
114
115 # TODO: Some tokens have values greater than NT_OFFSET
116 if k < NT_OFFSET:
117 names[k] = id_name
118
119 for k, v in ysh_grammar.number2symbol.items():
120 assert k >= NT_OFFSET, (k, v)
121 names[k] = v
122
123 return names
124
125
126class Transformer(object):
127 """Homogeneous parse tree -> heterogeneous AST ("lossless syntax tree")
128
129 pgen2 (Python's LL parser generator) doesn't have semantic actions like yacc,
130 so this "transformer" is the equivalent.
131
132 Files to refer to when modifying this function:
133
134 ysh/grammar.pgen2 (generates _devbuild/gen/grammar_nt.py)
135 frontend/syntax.asdl (generates _devbuild/gen/syntax_asdl.py)
136
137 Related examples:
138
139 opy/compiler2/transformer.py (Python's parse tree -> AST, ~1500 lines)
140 Python-2.7.13/Python/ast.c (the "real" CPython version, ~3600 lines)
141
142 Other:
143 frontend/parse_lib.py (turn on print_parse_tree)
144
145 Public methods:
146 Expr, VarDecl
147 atom, trailer, etc. are private, named after productions in grammar.pgen2.
148 """
149
150 def __init__(self, gr):
151 # type: (Grammar) -> None
152 self.number2symbol = gr.number2symbol
153 if mylib.PYTHON:
154 names = MakeGrammarNames(gr)
155 # print raw nodes
156 self.p_printer = expr_parse.ParseTreePrinter(names)
157
158 def _LeftAssoc(self, p_node):
159 # type: (PNode) -> expr_t
160 """For an associative binary operation.
161
162 Examples:
163 xor_expr: and_expr ('xor' and_expr)*
164 term: factor (('*'|'/'|'%'|'div') factor)*
165
166 3 - 1 - 2 must be grouped as ((3 - 1) - 2).
167 """
168 # Note: Compare the iterative com_binary() method in
169 # opy/compiler2/transformer.py.
170
171 # Examples:
172 # - The PNode for '3 - 1' will have 3 children
173 # - The PNode for '3 - 1 - 2' will have 5 children
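# --- Editor's worked example (constructor shapes approximate) ---
# For '3 - 1 - 2' the 5 children are [3, -, 1, -, 2], and the loop folds
# them left to right:
#
#     left = Const(3)
#     left = expr.Binary('-', left, Const(1))   # after first iteration
#     left = expr.Binary('-', left, Const(2))   # after second iteration
#
# which is the ((3 - 1) - 2) grouping required for left associativity.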
174
175 #self.p_printer.Print(p_node)
176
177 i = 1 # index of the operator
178 n = p_node.NumChildren()
179
180 left = self.Expr(p_node.GetChild(0))
181 while i < n:
182 op = p_node.GetChild(i)
183 right = self.Expr(p_node.GetChild(i + 1))
184
185 # create a new left node
186 left = expr.Binary(op.tok, left, right)
187 i += 2
188
189 return left
190
191 def _Trailer(self, base, p_trailer):
192 # type: (expr_t, PNode) -> expr_t
193 """
194 trailer: ( '(' [arglist] ')' | '[' subscriptlist ']'
195 | '.' NAME | '->' NAME | '::' NAME
196 )
197 """
198 tok0 = p_trailer.GetChild(0).tok
199 typ0 = p_trailer.GetChild(0).typ
200
201 if typ0 == Id.Op_LParen:
202 lparen = tok0
203 rparen = p_trailer.GetChild(-1).tok
204 arglist = ArgList(lparen, [], None, [], None, None, rparen)
205 if p_trailer.NumChildren() == 2: # ()
206 return expr.FuncCall(base, arglist)
207
208 p = p_trailer.GetChild(1) # the X in ( X )
209 assert p.typ == grammar_nt.arglist # f(x, y)
210 self._ArgList(p, arglist)
211 return expr.FuncCall(base, arglist)
212
213 if typ0 == Id.Op_LBracket:
214 p_args = p_trailer.GetChild(1)
215 assert p_args.typ == grammar_nt.subscriptlist
216
217 n = p_args.NumChildren()
218 if n == 1: # a[1] a[1:2] a[:] etc.
219 subscript = self._Subscript(p_args.GetChild(0))
220 else: # a[1, 2] a[1:2, :]
221 slices = [] # type: List[expr_t]
222 for i in xrange(0, n, 2):
223 slices.append(self._Subscript(p_args.GetChild(i)))
224 # expr.Tuple evaluates to List in YSH.
225 #
226 # Note that syntactically, a[1:2, 3:4] is the only way to
227 # get a List[Slice]. [1:2, 3:4] by itself is not allowed.
228 comma_tok = p_args.GetChild(1).tok
229 subscript = expr.Tuple(comma_tok, slices, expr_context_e.Store)
230
231 return Subscript(tok0, base, subscript)
232
233 if typ0 in (Id.Expr_Dot, Id.Expr_RArrow, Id.Expr_RDArrow):
234 attr = p_trailer.GetChild(1).tok # will be Id.Expr_Name
235 return Attribute(base, tok0, attr, lexer.TokenVal(attr),
236 expr_context_e.Store)
237
238 raise AssertionError(typ0)
239
240 def _DictPair(self, p_node):
241 # type: (PNode) -> Tuple[expr_t, expr_t]
242 """
243 dict_pair: ( Expr_Name [':' test]
244 | '[' testlist ']' ':' test
245 | sq_string ':' test
246 | dq_string ':' test )
247 """
248 assert p_node.typ == grammar_nt.dict_pair
249
250 typ = p_node.GetChild(0).typ
251
252 if typ in (grammar_nt.sq_string, grammar_nt.dq_string):
253 key = self.Expr(p_node.GetChild(0)) # type: expr_t
254 val = self.Expr(p_node.GetChild(2))
255 return key, val
256
257 tok0 = p_node.GetChild(0).tok
258 id_ = tok0.id
259
260 if id_ == Id.Expr_Name:
261 key_str = value.Str(lexer.TokenVal(tok0))
262 key = expr.Const(tok0, key_str)
263 if p_node.NumChildren() >= 3:
264 val = self.Expr(p_node.GetChild(2))
265 else:
266 val = expr.Implicit
267
268 if id_ == Id.Op_LBracket: # {[x+y]: 'val'}
269 key = self.Expr(p_node.GetChild(1))
270 val = self.Expr(p_node.GetChild(4))
271 return key, val
272
273 return key, val
274
275 def _Dict(self, parent, p_node):
276 # type: (PNode, PNode) -> expr.Dict
277 """
278 dict: dict_pair (comma_newline dict_pair)* [comma_newline]
279 """
280 if p_node.typ == Id.Op_RBrace: # {}
281 return expr.Dict(parent.tok, [], [])
282
283 assert p_node.typ == grammar_nt.dict
284
285 keys = [] # type: List[expr_t]
286 values = [] # type: List[expr_t]
287
288 n = p_node.NumChildren()
289 for i in xrange(0, n, 2):
290 key, val = self._DictPair(p_node.GetChild(i))
291 keys.append(key)
292 values.append(val)
293
294 return expr.Dict(parent.tok, keys, values)
295
296 def _Tuple(self, parent):
297 # type: (PNode) -> expr_t
298
299 n = parent.NumChildren()
300
301 # (x) -- not a tuple
302 if n == 1:
303 return self.Expr(parent.GetChild(0))
304
305 # x, and (x,) aren't allowed
306 if n == 2:
307 p_die('Invalid trailing comma', parent.GetChild(1).tok)
308
309 elts = [] # type: List[expr_t]
310 for i in xrange(0, n, 2): # skip commas
311 p_node = parent.GetChild(i)
312 elts.append(self.Expr(p_node))
313
314 return expr.Tuple(parent.tok, elts,
315 expr_context_e.Store) # unused expr_context_e
316
317 def _TestlistComp(self, parent, p_node, id0):
318 # type: (PNode, PNode, Id_t) -> expr_t
319 """
320 testlist_comp:
321 (test|star_expr) ( comp_for | (',' (test|star_expr))* [','] )
322 """
323 assert p_node.typ == grammar_nt.testlist_comp
324
325 n = p_node.NumChildren()
326 if n > 1 and p_node.GetChild(1).typ == grammar_nt.comp_for:
327 elt = self.Expr(p_node.GetChild(0))
328 comp = self._CompFor(p_node.GetChild(1))
329 if id0 == Id.Op_LParen: # (x+1 for x in y)
330 return expr.GeneratorExp(elt, [comp])
331 if id0 == Id.Op_LBracket: # [x+1 for x in y]
332 return expr.ListComp(parent.tok, elt, [comp])
333 raise AssertionError()
334
335 if id0 == Id.Op_LParen:
336 # Parenthesized expression like (x+1) or (x)
337 if n == 1:
338 return self.Expr(p_node.GetChild(0))
339
340 # Tuples (1,) (1, 2) etc. - TODO: should be a list literal?
341 if p_node.GetChild(1).typ == Id.Arith_Comma:
342 return self._Tuple(p_node)
343
344 raise AssertionError()
345
346 if id0 == Id.Op_LBracket: # List [1,2,3]
347 elts = [] # type: List[expr_t]
348 for i in xrange(0, n, 2): # skip commas
349 elts.append(self.Expr(p_node.GetChild(i)))
350
351 return expr.List(parent.tok, elts,
352 expr_context_e.Store) # unused expr_context_e
353
354 raise AssertionError(Id_str(id0))
355
356 def _Atom(self, parent):
357 # type: (PNode) -> expr_t
358 """Handle alternatives of 'atom' where there's more than one child."""
359
360 tok = parent.GetChild(0).tok
361 id_ = tok.id
362 n = parent.NumChildren()
363
364 if id_ == Id.Op_LParen:
365 # atom: '(' [yield_expr|testlist_comp] ')' | ...
366 if n == 2: # () is a tuple
367 assert (
368 parent.GetChild(1).typ == Id.Op_RParen), parent.GetChild(1)
369 return expr.Tuple(tok, [], expr_context_e.Store)
370
371 return self._TestlistComp(parent, parent.GetChild(1), id_)
372
373 if id_ == Id.Op_LBracket:
374 # atom: ... | '[' [testlist_comp] ']' | ...
375
376 if n == 2: # []
377 assert (parent.GetChild(1).typ == Id.Op_RBracket
378 ), parent.GetChild(1)
379 return expr.List(tok, [],
380 expr_context_e.Store) # unused expr_context_e
381
382 return self._TestlistComp(parent, parent.GetChild(1), id_)
383
384 if id_ == Id.Left_CaretBracket: # ^[42 + x]
385 child = self.Expr(parent.GetChild(1))
386 return expr.Literal(child)
387
388 if id_ == Id.Op_LBrace:
389 # atom: ... | '{' [Op_Newline] [dict] '}'
390 i = 1
391 if parent.GetChild(i).typ == Id.Op_Newline:
392 i += 1
393 return self._Dict(parent, parent.GetChild(i))
394
395 if id_ == Id.Arith_Amp:
396 n = parent.NumChildren()
397 if n >= 3:
398 p_die("Places in containers not implemented yet",
399 parent.GetChild(2).tok)
400
401 name_tok = parent.GetChild(1).tok
402 return expr.Place(name_tok, lexer.TokenVal(name_tok), [])
403
404 if id_ == Id.Expr_Func:
405 # STUB. This should really be a Func, not Lambda.
406 return expr.Lambda([], expr.Implicit)
407
408 # 100 M
409 # Ignoring the suffix for now
410 if id_ == Id.Expr_DecInt:
411 assert n > 1
412 p_die("Units suffix not implemented", parent.GetChild(1).tok)
413 #return self.Expr(parent.GetChild(0))
414
415 # 100.5 M
416 # Ignoring the suffix for now
417 if id_ == Id.Expr_Float:
418 assert n > 1
419 p_die("unix suffix implemented", parent.GetChild(1).tok)
420 #return self.Expr(parent.GetChild(0))
421
422 raise AssertionError(Id_str(id_))
423
424 def _NameType(self, p_node):
425 # type: (PNode) -> NameType
426 """ name_type: Expr_Name [':'] [type_expr] """
427 name_tok = p_node.GetChild(0).tok
428 typ = None # type: Optional[TypeExpr]
429
430 n = p_node.NumChildren()
431 if n == 2:
432 typ = self._TypeExpr(p_node.GetChild(1))
433 if n == 3:
434 typ = self._TypeExpr(p_node.GetChild(2))
435
436 return NameType(name_tok, lexer.TokenVal(name_tok), typ)
437
438 def _NameTypeList(self, p_node):
439 # type: (PNode) -> List[NameType]
440 """ name_type_list: name_type (',' name_type)* """
441 assert p_node.typ == grammar_nt.name_type_list
442 results = [] # type: List[NameType]
443
444 n = p_node.NumChildren()
445 for i in xrange(0, n, 2): # was children[::2]
446 results.append(self._NameType(p_node.GetChild(i)))
447 return results
448
449 def _CompFor(self, p_node):
450 # type: (PNode) -> Comprehension
451 """comp_for: 'for' exprlist 'in' or_test ['if' or_test]"""
452 lhs = self._NameTypeList(p_node.GetChild(1))
453 iterable = self.Expr(p_node.GetChild(3))
454
455 if p_node.NumChildren() >= 6:
456 cond = self.Expr(p_node.GetChild(5))
457 else:
458 cond = None
459
460 return Comprehension(lhs, iterable, cond)
461
462 def _CompareChain(self, parent):
463 # type: (PNode) -> expr_t
464 """comparison: expr (comp_op expr)*"""
465 cmp_ops = [] # type: List[Token]
466 comparators = [] # type: List[expr_t]
467 left = self.Expr(parent.GetChild(0))
468
469 i = 1
470 n = parent.NumChildren()
471 while i < n:
472 p = parent.GetChild(i)
473 op = p.GetChild(0).tok
474 if p.NumChildren() == 2:
475 # Blame the first token, and change its type
476 if op.id == Id.Expr_Not: # not in
477 op.id = Id.Node_NotIn
478 elif op.id == Id.Expr_Is: # is not
479 op.id = Id.Node_IsNot
480 else:
481 raise AssertionError()
482 else:
483 # is, <, ==, etc.
484 pass
485
486 cmp_ops.append(op)
487 i += 1
488 comparators.append(self.Expr(parent.GetChild(i)))
489 i += 1
490 return expr.Compare(left, cmp_ops, comparators)
491
492 def _Subscript(self, parent):
493 # type: (PNode) -> expr_t
494 """subscript: expr | [expr] ':' [expr]"""
495 typ0 = parent.GetChild(0).typ
496
497 n = parent.NumChildren()
498
499 if typ0 == grammar_nt.expr:
500 if n == 3: # a[1:2]
501 lower = self.Expr(parent.GetChild(0))
502 op_tok = parent.GetChild(1).tok
503 upper = self.Expr(parent.GetChild(2))
504
505 elif n == 2: # a[1:]
506 lower = self.Expr(parent.GetChild(0))
507 op_tok = parent.GetChild(1).tok
508 upper = None
509 else: # a[1]
510 return self.Expr(parent.GetChild(0))
511 else:
512 assert typ0 == Id.Arith_Colon
513 lower = None
514 if n == 1: # a[:]
515 op_tok = parent.GetChild(0).tok
516 upper = None
517 else: # a[:3]
518 op_tok = parent.GetChild(0).tok
519 upper = self.Expr(parent.GetChild(1))
520
521 return expr.Slice(lower, op_tok, upper)
522
523 def Expr(self, pnode):
524 # type: (PNode) -> expr_t
525 """Transform expressions (as opposed to statements)"""
526 typ = pnode.typ
527
528 #
529 # YSH Entry Points / Additions
530 #
531
532 if typ == grammar_nt.ysh_expr: # for if/while
533 # ysh_expr: '(' testlist ')'
534 return self.Expr(pnode.GetChild(1))
535
536 if typ == grammar_nt.command_expr:
537 # return_expr: testlist end_stmt
538 return self.Expr(pnode.GetChild(0))
539
540 #
541 # Python-like Expressions / Operators
542 #
543
544 if typ == grammar_nt.atom:
545 if pnode.NumChildren() == 1:
546 return self.Expr(pnode.GetChild(0))
547 return self._Atom(pnode)
548
549 if typ == grammar_nt.testlist:
550 # testlist: test (',' test)* [',']
551 return self._Tuple(pnode)
552
553 if typ == grammar_nt.test:
554 # test: or_test ['if' or_test 'else' test] | lambdef
555 if pnode.NumChildren() == 1:
556 return self.Expr(pnode.GetChild(0))
557
558 # TODO: Handle lambdef
559
560 test = self.Expr(pnode.GetChild(2))
561 body = self.Expr(pnode.GetChild(0))
562 orelse = self.Expr(pnode.GetChild(4))
563 return expr.IfExp(test, body, orelse)
564
565 if typ == grammar_nt.lambdef:
566 # lambdef: '|' [name_type_list] '|' test
567
568 n = pnode.NumChildren()
569 if n == 4:
570 params = self._NameTypeList(pnode.GetChild(1))
571 else:
572 params = []
573
574 body = self.Expr(pnode.GetChild(n - 1))
575 return expr.Lambda(params, body)
576
577 #
578 # Operators with Precedence
579 #
580
581 if typ == grammar_nt.or_test:
582 # or_test: and_test ('or' and_test)*
583 return self._LeftAssoc(pnode)
584
585 if typ == grammar_nt.and_test:
586 # and_test: not_test ('and' not_test)*
587 return self._LeftAssoc(pnode)
588
589 if typ == grammar_nt.not_test:
590 # not_test: 'not' not_test | comparison
591 if pnode.NumChildren() == 1:
592 return self.Expr(pnode.GetChild(0))
593
594 op_tok = pnode.GetChild(0).tok # not
595 return expr.Unary(op_tok, self.Expr(pnode.GetChild(1)))
596
597 elif typ == grammar_nt.comparison:
598 if pnode.NumChildren() == 1:
599 return self.Expr(pnode.GetChild(0))
600
601 return self._CompareChain(pnode)
602
603 elif typ == grammar_nt.range_expr:
604 n = pnode.NumChildren()
605 if n == 1:
606 return self.Expr(pnode.GetChild(0))
607
608 if n == 3:
609 return expr.Range(self.Expr(pnode.GetChild(0)),
610 pnode.GetChild(1).tok,
611 self.Expr(pnode.GetChild(2)))
612
613 raise AssertionError(n)
614
615 elif typ == grammar_nt.expr:
616 # expr: xor_expr ('|' xor_expr)*
617 return self._LeftAssoc(pnode)
618
619 if typ == grammar_nt.xor_expr:
620 # xor_expr: and_expr ('xor' and_expr)*
621 return self._LeftAssoc(pnode)
622
623 if typ == grammar_nt.and_expr: # a & b
624 # and_expr: shift_expr ('&' shift_expr)*
625 return self._LeftAssoc(pnode)
626
627 elif typ == grammar_nt.shift_expr:
628 # shift_expr: arith_expr (('<<'|'>>') arith_expr)*
629 return self._LeftAssoc(pnode)
630
631 elif typ == grammar_nt.arith_expr:
632 # arith_expr: term (('+'|'-') term)*
633 return self._LeftAssoc(pnode)
634
635 elif typ == grammar_nt.term:
636 # term: factor (('*'|'/'|'div'|'mod') factor)*
637 return self._LeftAssoc(pnode)
638
639 elif typ == grammar_nt.factor:
640 # factor: ('+'|'-'|'~') factor | power
641 # the power would have already been reduced
642 if pnode.NumChildren() == 1:
643 return self.Expr(pnode.GetChild(0))
644
645 assert pnode.NumChildren() == 2
646 op = pnode.GetChild(0)
647 e = pnode.GetChild(1)
648
649 assert isinstance(op.tok, Token)
650 return expr.Unary(op.tok, self.Expr(e))
651
652 elif typ == grammar_nt.power:
653 # power: atom trailer* ['**' factor]
654
655 node = self.Expr(pnode.GetChild(0))
656 if pnode.NumChildren() == 1: # No trailers
657 return node
658
659 # Support a->startswith(b) and mydict.key
660 n = pnode.NumChildren()
661 i = 1
662 while i < n and pnode.GetChild(i).typ == grammar_nt.trailer:
663 node = self._Trailer(node, pnode.GetChild(i))
664 i += 1
665
666 if i != n: # ['**' factor]
667 op_tok = pnode.GetChild(i).tok
668 assert op_tok.id == Id.Arith_DStar, op_tok
669 factor = self.Expr(pnode.GetChild(i + 1))
670 node = expr.Binary(op_tok, node, factor)
671
672 return node
673
674 elif typ == grammar_nt.eggex:
675 return self._Eggex(pnode)
676
677 elif typ == grammar_nt.ysh_expr_sub:
678 return self.Expr(pnode.GetChild(0))
679
680 #
681 # YSH Lexer Modes
682 #
683
684 elif typ == grammar_nt.sh_array_literal:
685 return cast(ShArrayLiteral, pnode.GetChild(1).tok)
686
687 elif typ == grammar_nt.old_sh_array_literal:
688 return cast(ShArrayLiteral, pnode.GetChild(1).tok)
689
690 elif typ == grammar_nt.sh_command_sub:
691 return cast(CommandSub, pnode.GetChild(1).tok)
692
693 elif typ == grammar_nt.braced_var_sub:
694 return cast(BracedVarSub, pnode.GetChild(1).tok)
695
696 elif typ == grammar_nt.dq_string:
697 dq = cast(DoubleQuoted, pnode.GetChild(1).tok)
698 # sugar: ^"..." is short for ^["..."]
699 if pnode.GetChild(0).typ == Id.Left_CaretDoubleQuote:
700 return expr.Literal(dq)
701 return dq
702
703 elif typ == grammar_nt.sq_string:
704 return cast(SingleQuoted, pnode.GetChild(1).tok)
705
706 elif typ == grammar_nt.simple_var_sub:
707 tok = pnode.GetChild(0).tok
708
709 if tok.id == Id.VSub_DollarName: # $foo is disallowed
710 bare = lexer.TokenSliceLeft(tok, 1)
711 p_die(
712 'In expressions, remove $ and use `%s`, or sometimes "$%s"'
713 % (bare, bare), tok)
714
715 # $? is allowed
716 return SimpleVarSub(tok)
717
718 #
719 # Terminals
720 #
721
722 tok = pnode.tok
723 if typ == Id.Expr_Name:
724 return expr.Var(tok, lexer.TokenVal(tok))
725
726 # Everything else is an expr.Const
727 tok_str = lexer.TokenVal(tok)
728 # Remove underscores from 1_000_000. The lexer is responsible for
729 # validation.
730 c_under = tok_str.replace('_', '')
731
732 if typ == Id.Expr_DecInt:
733 try:
734 cval = value.Int(mops.FromStr(c_under)) # type: value_t
735 except ValueError:
736 p_die('Decimal int constant is too large', tok)
737
738 elif typ == Id.Expr_BinInt:
739 assert c_under[:2] in ('0b', '0B'), c_under
740 try:
741 cval = value.Int(mops.FromStr(c_under[2:], 2))
742 except ValueError:
743 p_die('Binary int constant is too large', tok)
744
745 elif typ == Id.Expr_OctInt:
746 assert c_under[:2] in ('0o', '0O'), c_under
747 try:
748 cval = value.Int(mops.FromStr(c_under[2:], 8))
749 except ValueError:
750 p_die('Octal int constant is too large', tok)
751
752 elif typ == Id.Expr_HexInt:
753 assert c_under[:2] in ('0x', '0X'), c_under
754 try:
755 cval = value.Int(mops.FromStr(c_under[2:], 16))
756 except ValueError:
757 p_die('Hex int constant is too large', tok)
758
759 elif typ == Id.Expr_Float:
760 # Note: float() in mycpp/gc_builtins.cc currently uses strtod
761 # I think this never raises ValueError, because the lexer
762 # should only accept strings that strtod() does?
763 cval = value.Float(float(c_under))
764
765 elif typ == Id.Expr_Null:
766 cval = value.Null
767
768 elif typ == Id.Expr_True:
769 cval = value.Bool(True)
770
771 elif typ == Id.Expr_False:
772 cval = value.Bool(False)
773
774 elif typ == Id.Char_OneChar: # \n
775 assert len(tok_str) == 2, tok_str
776 s = consts.LookupCharC(lexer.TokenSliceLeft(tok, 1))
777 cval = value.Str(s)
778
779 elif typ == Id.Char_YHex: # \yff
780 assert len(tok_str) == 4, tok_str
781 hex_str = lexer.TokenSliceLeft(tok, 2)
782 s = chr(int(hex_str, 16))
783 cval = value.Str(s)
784
785 elif typ == Id.Char_UBraced: # \u{123}
786 hex_str = lexer.TokenSlice(tok, 3, -1)
787 code_point = int(hex_str, 16)
788 s = j8.Utf8Encode(code_point)
789 cval = value.Str(s)
790
791 else:
792 raise AssertionError(typ)
793
794 return expr.Const(tok, cval)
795
796 def _CheckLhs(self, lhs):
797 # type: (expr_t) -> None
798
799 UP_lhs = lhs
800 with tagswitch(lhs) as case:
801 if case(expr_e.Var):
802 # OK - e.g. setvar a.b.c[i] = 42
803 pass
804
805 elif case(expr_e.Subscript):
806 lhs = cast(Subscript, UP_lhs)
807 self._CheckLhs(lhs.obj) # recurse on LHS
808
809 elif case(expr_e.Attribute):
810 lhs = cast(Attribute, UP_lhs)
811 self._CheckLhs(lhs.obj) # recurse on LHS
812
813 else:
814 # Illegal - e.g. setglobal {}["key"] = 42
815 p_die("Subscript/Attribute not allowed on this LHS expression",
816 location.TokenForExpr(lhs))
817
818 def _LhsExprList(self, p_node):
819 # type: (PNode) -> List[y_lhs_t]
820 """lhs_list: expr (',' expr)*"""
821 assert p_node.typ == grammar_nt.lhs_list
822
823 lhs_list = [] # type: List[y_lhs_t]
824 n = p_node.NumChildren()
825 for i in xrange(0, n, 2):
826 p = p_node.GetChild(i)
827 #self.p_printer.Print(p)
828
829 e = self.Expr(p)
830 UP_e = e
831 with tagswitch(e) as case:
832 if case(expr_e.Var):
833 e = cast(expr.Var, UP_e)
834 lhs_list.append(e.left)
835
836 elif case(expr_e.Subscript):
837 e = cast(Subscript, UP_e)
838 self._CheckLhs(e)
839 lhs_list.append(e)
840
841 elif case(expr_e.Attribute):
842 e = cast(Attribute, UP_e)
843 self._CheckLhs(e)
844 if e.op.id != Id.Expr_Dot:
845 # e.g. setvar obj->method is not valid
846 p_die("Can't assign to this attribute expr", e.op)
847 lhs_list.append(e)
848
849 else:
850 pass # work around mycpp bug
851
852 # TODO: could blame arbitrary expr_t, but this works most of
853 # the time
854 if p.tok:
855 blame = p.tok # type: loc_t
856 else:
857 blame = loc.Missing
858 p_die("Can't assign to this expression", blame)
859
860 return lhs_list
861
862 def MakeVarDecl(self, p_node):
863 # type: (PNode) -> command.VarDecl
864 """
865 ysh_var_decl: name_type_list ['=' testlist] end_stmt
866 """
867 assert p_node.typ == grammar_nt.ysh_var_decl
868
869 lhs = self._NameTypeList(p_node.GetChild(0)) # could be a tuple
870
871 # This syntax is confusing, and different from JavaScript
872 # var x, y = 1, 2
873 # But this is useful:
874 # var flag, i = parseArgs(spec, argv)
875
876 n = p_node.NumChildren()
877 if n >= 3:
878 rhs = self.Expr(p_node.GetChild(2))
879 else:
880 rhs = None
881
882 # The caller should fill in the keyword token.
883 return command.VarDecl(None, lhs, rhs)
884
885 def MakeMutation(self, p_node):
886 # type: (PNode) -> command.Mutation
887 """
888 ysh_mutation: lhs_list (augassign | '=') testlist end_stmt
889 """
890 assert p_node.typ == grammar_nt.ysh_mutation
891
892 lhs_list = self._LhsExprList(p_node.GetChild(0)) # could be a tuple
893 op_tok = p_node.GetChild(1).tok
894 if len(lhs_list) > 1 and op_tok.id != Id.Arith_Equal:
895 p_die('Multiple assignment must use =', op_tok)
896 rhs = self.Expr(p_node.GetChild(2))
897 return command.Mutation(None, lhs_list, op_tok, rhs)
898
899 def _EggexFlag(self, p_node):
900 # type: (PNode) -> EggexFlag
901 n = p_node.NumChildren()
902 if n == 1:
903 return EggexFlag(False, p_node.GetChild(0).tok)
904 elif n == 2:
905 return EggexFlag(True, p_node.GetChild(1).tok)
906 else:
907 raise AssertionError()
908
909 def _Eggex(self, p_node):
910 # type: (PNode) -> Eggex
911 """
912 eggex: '/' regex [';' re_flag* [';' Expr_Name] ] '/'
913 """
914 left = p_node.GetChild(0).tok
915 regex = self._Regex(p_node.GetChild(1))
916
917 flags = [] # type: List[EggexFlag]
918 trans_pref = None # type: Optional[Token]
919
920 i = 2
921 current = p_node.GetChild(i)
922 if current.typ == Id.Op_Semi:
923 i += 1
924 while True:
925 current = p_node.GetChild(i)
926 if current.typ != grammar_nt.re_flag:
927 break
928 flags.append(self._EggexFlag(current))
929 i += 1
930
931 if current.typ == Id.Op_Semi:
932 i += 1
933 trans_pref = p_node.GetChild(i).tok
934
935 # Canonicalize and validate flags for ERE only. Default is ERE.
936 if trans_pref is None or lexer.TokenVal(trans_pref) == 'ERE':
937 canonical_flags = regex_translate.CanonicalFlags(flags)
938 else:
939 canonical_flags = None
940
941 return Eggex(left, regex, flags, trans_pref, canonical_flags)
942
943 def YshCasePattern(self, pnode):
944 # type: (PNode) -> pat_t
945 assert pnode.typ == grammar_nt.ysh_case_pat, pnode
946
947 pattern = pnode.GetChild(0)
948 typ = pattern.typ
949 if typ == Id.Op_LParen:
950 # pat_expr or pat_else
951 pattern = pnode.GetChild(1)
952 typ = pattern.typ
953
954 if typ == grammar_nt.pat_else:
955 return pat.Else
956
957 if typ == grammar_nt.pat_exprs:
958 exprs = [] # type: List[expr_t]
959 for i in xrange(pattern.NumChildren()):
960 child = pattern.GetChild(i)
961 if child.typ == grammar_nt.expr:
962 expr = self.Expr(child)
963 exprs.append(expr)
964 return pat.YshExprs(exprs)
965
966 if typ == grammar_nt.eggex:
967 return self._Eggex(pattern)
968
969 raise AssertionError()
970
971 def _BlockArg(self, p_node):
972 # type: (PNode) -> expr_t
973
974 n = p_node.NumChildren()
975 if n == 1:
976 child = p_node.GetChild(0)
977 return self.Expr(child)
978
979 # It can only be an expression, not a=42, or ...expr
980 p_die('Invalid block expression argument', p_node.tok)
981
982 def _Argument(self, p_node, after_semi, arglist):
983 # type: (PNode, bool, ArgList) -> None
984 """
985 argument: (
986 test [comp_for]
987 | test '=' test # named arg
988 | '...' test # var args
989 )
990 """
991 pos_args = arglist.pos_args
992 named_args = arglist.named_args
993
994 assert p_node.typ == grammar_nt.argument, p_node
995 n = p_node.NumChildren()
996 if n == 1:
997 child = p_node.GetChild(0)
998 if after_semi:
999 p_die(POS_ARG_MISPLACED, child.tok)
1000 arg = self.Expr(child)
1001 pos_args.append(arg)
1002 return
1003
1004 if n == 2:
1005 # Note: We allow multiple spreads, just like Julia. They are
1006 # concatenated as in lists and dicts.
1007 tok0 = p_node.GetChild(0).tok
1008 if tok0.id == Id.Expr_Ellipsis:
1009 spread_expr = expr.Spread(tok0, self.Expr(p_node.GetChild(1)))
1010 if after_semi: # f(; ... named)
1011 named_args.append(NamedArg(None, spread_expr))
1012 else: # f(...named)
1013 pos_args.append(spread_expr)
1014 return
1015
1016 # Note: generator expression not implemented
1017 if p_node.GetChild(1).typ == grammar_nt.comp_for:
1018 child = p_node.GetChild(0)
1019 if after_semi:
1020 p_die(POS_ARG_MISPLACED, child.tok)
1021
1022 elt = self.Expr(child)
1023 comp = self._CompFor(p_node.GetChild(1))
1024 arg = expr.GeneratorExp(elt, [comp])
1025 pos_args.append(arg)
1026 return
1027
1028 raise AssertionError()
1029
1030 if n == 3: # named args can come before or after the semicolon
1031 n1 = NamedArg(
1032 p_node.GetChild(0).tok, self.Expr(p_node.GetChild(2)))
1033 named_args.append(n1)
1034 return
1035
1036 raise AssertionError()
1037
1038 def _ArgGroup(self, p_node, after_semi, arglist):
1039 # type: (PNode, bool, ArgList) -> None
1040 """
1041 arg_group: argument (',' argument)* [',']
1042 """
1043 for i in xrange(p_node.NumChildren()):
1044 p_child = p_node.GetChild(i)
1045 if p_child.typ == grammar_nt.argument:
1046 self._Argument(p_child, after_semi, arglist)
1047
1048 def _ArgList(self, p_node, arglist):
1049 # type: (PNode, ArgList) -> None
1050 """For both funcs and procs
1051
1052 arglist: (
1053 [arg_group]
1054 [';' [arg_group]]
1055 )
1056
1057 arglist3: ...
1058 """
1059 n = p_node.NumChildren()
1060 if n == 0:
1061 return
1062
1063 i = 0
1064
1065 if i >= n:
1066 return
1067 child = p_node.GetChild(i)
1068 if child.typ == grammar_nt.arg_group:
1069 self._ArgGroup(child, False, arglist)
1070 i += 1
1071
1072 if i >= n:
1073 return
1074 child = p_node.GetChild(i)
1075 if child.typ == Id.Op_Semi:
1076 arglist.semi_tok = child.tok
1077 i += 1
1078
1079 # Named args after first semi-colon
1080 if i >= n:
1081 return
1082 child = p_node.GetChild(i)
1083 if child.typ == grammar_nt.arg_group:
1084 self._ArgGroup(child, True, arglist)
1085 i += 1
1086
1087 #
1088 # Special third group may have block expression - only for arglist3,
1089 # used for procs!
1090 #
1091
1092 if i >= n:
1093 return
1094 assert p_node.typ == grammar_nt.arglist3, p_node
1095
1096 child = p_node.GetChild(i)
1097 if child.typ == Id.Op_Semi:
1098 arglist.semi_tok2 = child.tok
1099 i += 1
1100
1101 if i >= n:
1102 return
1103 child = p_node.GetChild(i)
1104 if child.typ == grammar_nt.argument:
1105 arglist.block_expr = self._BlockArg(child)
1106 i += 1
1107
1108 def ProcCallArgs(self, pnode, arglist):
1109 # type: (PNode, ArgList) -> None
1110 """
1111 ysh_eager_arglist: '(' [arglist3] ')'
1112 ysh_lazy_arglist: '[' [arglist] ']'
1113 """
1114 n = pnode.NumChildren()
1115 if n == 2: # f()
1116 return
1117
1118 if n == 3:
1119 child1 = pnode.GetChild(1) # the X in '( X )'
1120
1121 self._ArgList(child1, arglist)
1122 return
1123
1124 raise AssertionError()
1125
1126 def _TypeExpr(self, pnode):
1127 # type: (PNode) -> TypeExpr
1128 """
1129 type_expr: Expr_Name [ '[' type_expr (',' type_expr)* ']' ]
1130 """
1131 assert pnode.typ == grammar_nt.type_expr, pnode.typ
1132
1133 ty = TypeExpr.CreateNull() # don't allocate children
1134
1135 ty.tok = pnode.GetChild(0).tok
1136 ty.name = lexer.TokenVal(ty.tok)
1137
1138 n = pnode.NumChildren()
1139 if n == 1:
1140 return ty
1141
1142 ty.params = []
1143 i = 2
1144 while i < n:
1145 p = self._TypeExpr(pnode.GetChild(i))
1146 ty.params.append(p)
1147 i += 2 # skip comma
1148
1149 return ty
1150
1151 def _Param(self, pnode):
1152 # type: (PNode) -> Param
1153 """
1154 param: Expr_Name [type_expr] ['=' expr]
1155 """
1156 assert pnode.typ == grammar_nt.param
1157
1158 name_tok = pnode.GetChild(0).tok
1159 n = pnode.NumChildren()
1160
1161 assert name_tok.id == Id.Expr_Name, name_tok
1162
1163 default_val = None # type: expr_t
1164 type_ = None # type: TypeExpr
1165
1166 if n == 1:
1167 # proc p(a)
1168 pass
1169
1170 elif n == 2:
1171 # proc p(a Int)
1172 type_ = self._TypeExpr(pnode.GetChild(1))
1173
1174 elif n == 3:
1175 # proc p(a = 3)
1176 default_val = self.Expr(pnode.GetChild(2))
1177
1178 elif n == 4:
1179 # proc p(a Int = 3)
1180 type_ = self._TypeExpr(pnode.GetChild(1))
1181 default_val = self.Expr(pnode.GetChild(3))
1182
1183 return Param(name_tok, lexer.TokenVal(name_tok), type_, default_val)
1184
1185 def _ParamGroup(self, p_node):
1186 # type: (PNode) -> ParamGroup
1187 """
1188 param_group:
1189 (param ',')*
1190 [ (param | '...' Expr_Name) [,] ]
1191 """
1192 assert p_node.typ == grammar_nt.param_group, p_node
1193
1194 params = [] # type: List[Param]
1195 rest_of = None # type: Optional[RestParam]
1196
1197 n = p_node.NumChildren()
1198 i = 0
1199 while i < n:
1200 child = p_node.GetChild(i)
1201 if child.typ == grammar_nt.param:
1202 params.append(self._Param(child))
1203
1204 elif child.typ == Id.Expr_Ellipsis:
1205 tok = p_node.GetChild(i + 1).tok
1206 rest_of = RestParam(tok, lexer.TokenVal(tok))
1207
1208 i += 2
1209
1210 return ParamGroup(params, rest_of)
1211
1212 def Proc(self, p_node):
1213 # type: (PNode) -> proc_sig_t
1214 """
1215 ysh_proc: (
1216 [ '('
1217 [ param_group ] # word params, with defaults
1218 [ ';' [ param_group ] ] # positional typed params, with defaults
1219 [ ';' [ param_group ] ] # named params, with defaults
1220 [ ';' Expr_Name ] # optional block param, with no type or default
1221 ')'
1222 ]
1223 '{' # opening { for pgen2
1224 )
1225 """
1226 assert p_node.typ == grammar_nt.ysh_proc
1227
1228 n = p_node.NumChildren()
1229 if n == 1: # proc f {
1230 return proc_sig.Open
1231
1232 if n == 3: # proc f () {
1233 return proc_sig.Closed.CreateNull(alloc_lists=True) # no params
1234
1235 # proc f( three param groups, and block group )
1236 sig = proc_sig.Closed.CreateNull(alloc_lists=True) # no params
1237
1238 # Word args
1239 i = 1
1240 child = p_node.GetChild(i)
1241 if child.typ == grammar_nt.param_group:
1242 sig.word = self._ParamGroup(p_node.GetChild(i))
1243
1244 # Validate word args
1245 for word in sig.word.params:
1246 if word.type:
1247 if word.type.name not in ('Str', 'Ref'):
1248 p_die('Word params may only have type Str or Ref',
1249 word.type.tok)
1250 if word.type.params is not None:
1251 p_die('Unexpected type parameters', word.type.tok)
1252
1253 i += 2
1254 else:
1255 i += 1
1256
1257 #log('i %d n %d', i, n)
1258 if i >= n:
1259 return sig
1260
1261 # Positional args
1262 child = p_node.GetChild(i)
1263 if child.typ == grammar_nt.param_group:
1264 sig.positional = self._ParamGroup(p_node.GetChild(i))
1265 i += 2
1266 else:
1267 i += 1
1268
1269 #log('i %d n %d', i, n)
1270 if i >= n:
1271 return sig
1272
1273 # Keyword args
1274 child = p_node.GetChild(i)
1275 if child.typ == grammar_nt.param_group:
1276 sig.named = self._ParamGroup(p_node.GetChild(i))
1277 i += 2
1278 else:
1279 i += 1
1280
1281 #log('i %d n %d', i, n)
1282 if i >= n:
1283 return sig
1284
1285 child = p_node.GetChild(i)
1286 if child.typ == grammar_nt.param_group:
1287 group = self._ParamGroup(p_node.GetChild(i))
1288 params = group.params
1289 if len(params) > 1:
1290 p_die('Only 1 block param is allowed', params[1].blame_tok)
1291 if group.rest_of:
1292 p_die("Rest param isn't allowed for blocks",
1293 group.rest_of.blame_tok)
1294
1295 if len(params) == 1:
1296 if params[0].type:
1297 if params[0].type.name != 'Command':
1298 p_die('Block param must have type Command',
1299 params[0].type.tok)
1300 if params[0].type.params is not None:
1301 p_die('Unexpected type parameters', params[0].type.tok)
1302
1303 sig.block_param = params[0]
1304
1305 return sig
1306
1307 def YshFunc(self, p_node, out):
1308 # type: (PNode, Func) -> None
1309 """
1310 ysh_func: Expr_Name '(' [param_group] [';' param_group] ')'
1311 """
1312 assert p_node.typ == grammar_nt.ysh_func
1313
1314 #self.p_printer.Print(p_node)
1315
1316 out.name = p_node.GetChild(0).tok
1317
1318 n = p_node.NumChildren()
1319 i = 2 # after (
1320
1321 child = p_node.GetChild(i)
1322 if child.typ == grammar_nt.param_group:
1323 out.positional = self._ParamGroup(child)
1324 i += 2 # skip past ;
1325 else:
1326 i += 1
1327
1328 if i >= n:
1329 return
1330
1331 child = p_node.GetChild(i)
1332 if child.typ == grammar_nt.param_group:
1333 out.named = self._ParamGroup(child)
1334
1335 #
1336 # Eggex Language
1337 #
1338
1339 def _RangeCharSingleQuoted(self, p_node):
1340 # type: (PNode) -> Optional[CharCode]
1341
1342 assert p_node.typ == grammar_nt.range_char, p_node
1343
1344 # 'a' in 'a'-'b'
1345
1346 child0 = p_node.GetChild(0)
1347 if child0.typ == grammar_nt.sq_string:
1348 sq_part = cast(SingleQuoted, child0.GetChild(1).tok)
1349 n = len(sq_part.sval)
1350 if n == 0:
1351 p_die("Quoted range char can't be empty",
1352 loc.WordPart(sq_part))
1353 elif n == 1:
1354 return CharCode(sq_part.left, ord(sq_part.sval[0]), False)
1355 else:
1356 p_die(RANGE_POINT_TOO_LONG, loc.WordPart(sq_part))
1357 return None
1358
1359 def _OtherRangeToken(self, p_node):
1360 # type: (PNode) -> Token
1361 """An endpoint of a range (single char)
1362
1363 range_char: Expr_Name | Expr_DecInt | sq_string | char_literal
1364 a-z 0-9 'a'-'z' \x00-\xff
1365 """
1366 assert p_node.typ == grammar_nt.range_char, p_node
1367
1368 child0 = p_node.GetChild(0)
1369 if child0.typ == grammar_nt.char_literal:
1370 # \x00 in /[\x00 - \x20]/
1371 tok = child0.GetChild(0).tok
1372 return tok
1373
1374 tok = p_node.tok
1375 # a in a-z is Expr_Name
1376 # 0 in 0-9 is Expr_DecInt
1377 assert tok.id in (Id.Expr_Name, Id.Expr_DecInt), tok
1378
1379 if tok.length != 1:
1380 p_die(RANGE_POINT_TOO_LONG, tok)
1381 return tok
1382
1383 def _NonRangeChars(self, p_node):
1384 # type: (PNode) -> class_literal_term_t
1385 """
1386 \" \u1234 '#'
1387 """
1388 assert p_node.typ == grammar_nt.range_char, p_node
1389
1390 child0 = p_node.GetChild(0)
1391 typ0 = p_node.GetChild(0).typ
1392
1393 if typ0 == grammar_nt.sq_string:
1394 return cast(SingleQuoted, child0.GetChild(1).tok)
1395
1396 if typ0 == grammar_nt.char_literal:
1397 return word_compile.EvalCharLiteralForRegex(child0.tok)
1398
1399 if typ0 == Id.Expr_Name:
1400 # Look up PerlClass and PosixClass
1401 return self._NameInClass(None, child0.tok)
1402
1403 raise AssertionError()
1404
1405 def _ClassLiteralTerm(self, p_node):
1406 # type: (PNode) -> class_literal_term_t
1407 """
1408 class_literal_term:
1409 range_char ['-' range_char ]
1410 | '@' Expr_Name # splice
1411 | '!' Expr_Name # negate char class
1412 ...
1413 """
1414 assert p_node.typ == grammar_nt.class_literal_term, p_node
1415
1416 typ0 = p_node.GetChild(0).typ
1417
1418 if typ0 == grammar_nt.range_char:
1419 n = p_node.NumChildren()
1420
1421 if n == 1:
1422 return self._NonRangeChars(p_node.GetChild(0))
1423
1424 # 'a'-'z' etc.
1425 if n == 3:
1426 assert p_node.GetChild(1).typ == Id.Arith_Minus, p_node
1427
1428 left = p_node.GetChild(0)
1429 right = p_node.GetChild(2)
1430
1431 code1 = self._RangeCharSingleQuoted(left)
1432 if code1 is None:
1433 tok1 = self._OtherRangeToken(left)
1434 code1 = word_compile.EvalCharLiteralForRegex(tok1)
1435
1436 code2 = self._RangeCharSingleQuoted(right)
1437 if code2 is None:
1438 tok2 = self._OtherRangeToken(right)
1439 code2 = word_compile.EvalCharLiteralForRegex(tok2)
1440 return CharRange(code1, code2)
1441
1442 raise AssertionError()
1443
1444 if typ0 == Id.Expr_At:
1445 tok1 = p_node.GetChild(1).tok
1446 return class_literal_term.Splice(tok1, lexer.TokenVal(tok1))
1447
1448 if typ0 == Id.Expr_Bang:
1449 return self._NameInClass(
1450 p_node.GetChild(0).tok,
1451 p_node.GetChild(1).tok)
1452
1453 p_die("This kind of class literal term isn't implemented",
1454 p_node.GetChild(0).tok)
1455
1456 def _ClassLiteral(self, p_node):
1457 # type: (PNode) -> List[class_literal_term_t]
1458 """class_literal: '[' class_literal_term+ ']'."""
1459 assert p_node.typ == grammar_nt.class_literal
1460 # skip [ and ]
1461 terms = [] # type: List[class_literal_term_t]
1462 for i in xrange(1, p_node.NumChildren() - 1):
1463 terms.append(self._ClassLiteralTerm(p_node.GetChild(i)))
1464
1465 return terms
1466
1467 def _NameInRegex(self, negated_tok, tok):
1468 # type: (Token, Token) -> re_t
1469 tok_str = lexer.TokenVal(tok)
1470 if tok_str == 'dot':
1471 if negated_tok:
1472 p_die("Can't negate this symbol", tok)
1473 return re.Primitive(tok, Id.Eggex_Dot)
1474
1475 if tok_str in POSIX_CLASSES:
1476 return PosixClass(negated_tok, tok_str)
1477
1478 perl = PERL_CLASSES.get(tok_str)
1479 if perl is not None:
1480 return PerlClass(negated_tok, perl)
1481
1482 if tok_str[0].isupper(): # e.g. HexDigit
1483 return re.Splice(tok, lexer.TokenVal(tok))
1484
1485 p_die("%r isn't a character class" % tok_str, tok)
1486
1487 def _NameInClass(self, negated_tok, tok):
1488 # type: (Token, Token) -> class_literal_term_t
1489 """Like the above, but 'dot' and 'd' don't mean anything within []"""
1490 tok_str = lexer.TokenVal(tok)
1491
1492 # A bare, unquoted character literal. In the grammar, this is expressed as
1493 # range_char without an ending.
1494
1495 # d is NOT 'digit', it's a literal 'd'!
1496 if len(tok_str) == 1:
1497 # Expr_Name matches VAR_NAME_RE, which starts with [a-zA-Z_]
1498 assert tok.id in (Id.Expr_Name, Id.Expr_DecInt)
1499
1500 if negated_tok: # [~d] is not allowed, only [~digit]
1501 p_die("Can't negate this symbol", tok)
1502 return word_compile.EvalCharLiteralForRegex(tok)
1503
1504 # digit, word, but not d, w, etc.
1505 if tok_str in POSIX_CLASSES:
1506 return PosixClass(negated_tok, tok_str)
1507
1508 perl = PERL_CLASSES.get(tok_str)
1509 if perl is not None:
1510 return PerlClass(negated_tok, perl)
1511 p_die("%r isn't a character class" % tok_str, tok)
1512
1513 def _ReAtom(self, p_atom):
1514 # type: (PNode) -> re_t
1515 """
1516 re_atom: ( char_literal | ...
1517 """
1518 assert p_atom.typ == grammar_nt.re_atom, p_atom.typ
1519
1520 child0 = p_atom.GetChild(0)
1521
1522 typ0 = p_atom.GetChild(0).typ
1523 tok0 = p_atom.GetChild(0).tok
1524
1525 # Non-terminals
1526
1527 if typ0 == grammar_nt.class_literal:
1528 return re.CharClassLiteral(False, self._ClassLiteral(child0))
1529
1530 if typ0 == grammar_nt.sq_string:
1531 return cast(SingleQuoted, child0.GetChild(1).tok)
1532
1533 if typ0 == grammar_nt.char_literal:
1534 # Note: ERE doesn't seem to support escapes like Python
1535 # https://docs.python.org/3/library/re.html
1536 # We might want to do a translation like this:
1537 #
1538 # \u{03bc} -> \u03bc
1539 # \x00 -> \x00
1540 # \n -> \n
1541
1542 # Must be Id.Char_{OneChar,Hex,UBraced}
1543 assert consts.GetKind(tok0.id) == Kind.Char
1544 s = word_compile.EvalCStringToken(tok0.id, lexer.TokenVal(tok0))
1545 return re.LiteralChars(tok0, s)
1546
1547 # Special punctuation
1548 if typ0 == Id.Expr_Dot: # .
1549 return re.Primitive(tok0, Id.Eggex_Dot)
1550
1551 if typ0 == Id.Arith_Caret: # ^
1552 return re.Primitive(tok0, Id.Eggex_Start)
1553
1554 if typ0 == Id.Expr_Dollar: # $
1555 return re.Primitive(tok0, Id.Eggex_End)
1556
1557 if typ0 == Id.Expr_Name:
1558 # d digit -> PosixClass PerlClass etc.
1559 return self._NameInRegex(None, tok0)
1560
1561 if typ0 == Id.Expr_Symbol:
1562 # Validate symbols here, like we validate PerlClass, etc.
1563 tok_str = lexer.TokenVal(tok0)
1564 if tok_str == '%start':
1565 return re.Primitive(tok0, Id.Eggex_Start)
1566 if tok_str == '%end':
1567 return re.Primitive(tok0, Id.Eggex_End)
1568 p_die("Unexpected token %r in regex" % tok_str, tok0)
1569
1570 if typ0 == Id.Expr_At:
1571 # | '@' Expr_Name
1572 tok1 = p_atom.GetChild(1).tok
1573 return re.Splice(tok0, lexer.TokenVal(tok1))
1574
1575 if typ0 == Id.Expr_Bang:
1576 # | '!' (Expr_Name | class_literal)
1577 # | '!' '!' Expr_Name (Expr_Name | Expr_DecInt | '(' regex ')')
1578 n = p_atom.NumChildren()
1579 if n == 2:
1580 child1 = p_atom.GetChild(1)
1581 if child1.typ == grammar_nt.class_literal:
1582 return re.CharClassLiteral(True,
1583 self._ClassLiteral(child1))
1584 else:
1585 return self._NameInRegex(tok0, p_atom.GetChild(1).tok)
1586 else:
1587 # Note: !! conflicts with shell history
1588 p_die(
1589 "Backtracking with !! isn't implemented (requires Python/PCRE)",
1590 p_atom.GetChild(1).tok)
1591
1592 if typ0 == Id.Op_LParen:
1593 # | '(' regex ')'
1594
1595 # Note: in ERE (d+) is the same as <d+>. That is, Group becomes
1596 # Capture.
1597 return re.Group(self._Regex(p_atom.GetChild(1)))
1598
1599 if typ0 == Id.Arith_Less:
1600 # | '<' 'capture' regex ['as' Expr_Name] [':' Expr_Name] '>'
1601
1602 n = p_atom.NumChildren()
1603 assert n == 4 or n == 6 or n == 8, n
1604
1605 # < capture d+ >
1606 regex = self._Regex(p_atom.GetChild(2))
1607
1608 as_name = None # type: Optional[Token]
1609 func_name = None # type: Optional[Token]
1610
1611 i = 3 # points at any of > as :
1612
1613 typ = p_atom.GetChild(i).typ
1614 if typ == Id.Expr_As:
1615 as_name = p_atom.GetChild(i + 1).tok
1616 i += 2
1617
1618 typ = p_atom.GetChild(i).typ
1619 if typ == Id.Arith_Colon:
1620 func_name = p_atom.GetChild(i + 1).tok
1621
1622 return re.Capture(regex, as_name, func_name)
1623
1624 raise AssertionError(typ0)
1625
1626 def _RepeatOp(self, p_repeat):
1627 # type: (PNode) -> re_repeat_t
1628 """
1629 repeat_op: '+' | '*' | '?'
1630 | '{' [Expr_Name] ('+' | '*' | '?' | repeat_range) '}'
1631 """
1632 assert p_repeat.typ == grammar_nt.repeat_op, p_repeat
1633
1634 tok = p_repeat.GetChild(0).tok
1635 id_ = tok.id
1636
1637 if id_ in (Id.Arith_Plus, Id.Arith_Star, Id.Arith_QMark):
1638 return tok # a+ a* a?
1639
1640 if id_ == Id.Op_LBrace:
1641 child1 = p_repeat.GetChild(1)
1642 if child1.typ != grammar_nt.repeat_range:
1643 # e.g. dot{N *} is .*?
1644 p_die("Perl-style repetition isn't implemented with libc",
1645 child1.tok)
1646
1647 # repeat_range: (
1648 # Expr_DecInt [',']
1649 # | ',' Expr_DecInt
1650 # | Expr_DecInt ',' Expr_DecInt
1651 # )
1652
1653 n = child1.NumChildren()
1654 if n == 1: # {3}
1655 tok = child1.GetChild(0).tok
1656 return tok # different operator than + * ?
1657
1658 if n == 2:
1659 if child1.GetChild(0).typ == Id.Expr_DecInt: # {3,}
1660 left = child1.GetChild(0).tok
1661 return re_repeat.Range(left, lexer.TokenVal(left), '',
1662 None)
1663 else: # {,3}
1664 right = child1.GetChild(1).tok
1665 return re_repeat.Range(None, '', lexer.TokenVal(right),
1666 right)
1667
1668 if n == 3: # {1,3}
1669 left = child1.GetChild(0).tok
1670 right = child1.GetChild(2).tok
1671 return re_repeat.Range(left, lexer.TokenVal(left),
1672 lexer.TokenVal(right), right)
1673
1674 raise AssertionError(n)
1675
1676 raise AssertionError(id_)
1677
1678 def _ReAlt(self, p_node):
1679 # type: (PNode) -> re_t
1680 """
1681 re_alt: (re_atom [repeat_op])+
1682 """
1683 assert p_node.typ == grammar_nt.re_alt
1684
1685 i = 0
1686 n = p_node.NumChildren()
1687 seq = [] # type: List[re_t]
1688 while i < n:
1689 r = self._ReAtom(p_node.GetChild(i))
1690 i += 1
1691 if i < n and p_node.GetChild(i).typ == grammar_nt.repeat_op:
1692 repeat_op = self._RepeatOp(p_node.GetChild(i))
1693 r = re.Repeat(r, repeat_op)
1694 i += 1
1695 seq.append(r)
1696
1697 if len(seq) == 1:
1698 return seq[0]
1699 else:
1700 return re.Seq(seq)
1701
1702 def _Regex(self, p_node):
1703 # type: (PNode) -> re_t
1704 """
1705 regex: [re_alt] (('|'|'or') re_alt)*
1706 """
1707 assert p_node.typ == grammar_nt.regex
1708
1709 n = p_node.NumChildren()
1710 alts = [] # type: List[re_t]
1711 for i in xrange(0, n, 2): # was children[::2]
1712 c = p_node.GetChild(i)
1713 alts.append(self._ReAlt(c))
1714
1715 if len(alts) == 1:
1716 return alts[0]
1717 else:
1718 return re.Alt(alts)
1719
1720
1721# vim: sw=4