OILS / opy / _regtest / src / opy / pgen2 / grammar.py View on Github | oilshell.org

222 lines, 71 significant
1# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
2# Licensed to PSF under a Contributor Agreement.
3
4"""This module defines the data structures used to represent a grammar.
5
6These are a bit arcane because they are derived from the data
7structures used by Python's 'pgen' parser generator.
8
9There's also a table here mapping operators to their names in the
10token module; the Python tokenize module reports all operators as the
11fallback token code OP, but the parser needs the actual token code.
12
13"""
14
15# Python imports
16import cStringIO
17import collections
18import pickle
19import marshal
20
21# Local imports
22from . import token, tokenize
23
24
class Grammar(object):
    """Pgen parsing tables conversion class.

    Once initialized, this class supplies the grammar tables for the
    parsing engine implemented by parse.py.  The parsing engine
    accesses the instance variables directly.  The class here does not
    provide initialization of the tables; several subclasses exist to
    do this (see the conv and pgen modules).

    The load() method reads the tables from a pickle file, which is
    much faster than the other ways offered by subclasses.  The pickle
    file is written by calling dump() (after loading the grammar
    tables using a subclass).  The report() method prints a readable
    representation of the tables to stdout, for debugging.

    The instance variables are as follows:

    symbol2number -- a dict mapping symbol names to numbers.  Symbol
                     numbers are always 256 or higher, to distinguish
                     them from token numbers, which are between 0 and
                     255 (inclusive).

    number2symbol -- a dict mapping numbers to symbol names;
                     these two are each other's inverse.

    states        -- a list of DFAs, where each DFA is a list of
                     states, each state is a list of arcs, and each
                     arc is a (i, j) pair where i is a label and j is
                     a state number.  The DFA number is the index into
                     this list.  (This name is slightly confusing.)
                     Final states are represented by a special arc of
                     the form (0, j) where j is its own state number.

    dfas          -- a dict mapping symbol numbers to (DFA, first)
                     pairs, where DFA is an item from the states list
                     above, and first is a set of tokens that can
                     begin this grammar rule (represented by a dict
                     whose values are always 1).

    labels        -- a list of (x, y) pairs where x is either a token
                     number or a symbol number, and y is either None
                     or a string; the strings are keywords.  The label
                     number is the index in this list; label numbers
                     are used to mark state transitions (arcs) in the
                     DFAs.

    start         -- the number of the grammar's start symbol.

    keywords      -- a dict mapping keyword strings to arc labels.

    tokens        -- a dict mapping token numbers to arc labels.

    """

    def __init__(self):
        self.symbol2number = {}
        self.number2symbol = {}
        self.states = []
        self.dfas = {}
        # Label 0 is reserved; real labels are appended after it.
        self.labels = [(0, "EMPTY")]
        self.keywords = {}
        self.tokens = {}
        self.symbol2label = {}
        # 256 is the conventional number of the first (start) symbol.
        self.start = 256

    def dump(self, filename):
        """Dump the grammar tables to a pickle file.

        dump() recursively changes all dict to OrderedDict, so the pickled file
        is not exactly the same as what was passed in to dump(). load() uses the
        pickled file to create the tables, but only changes OrderedDict to dict
        at the top level; it does not recursively change OrderedDict to dict.
        So, the loaded tables are different from the original tables that were
        passed to load() in that some of the OrderedDict (from the pickled file)
        are not changed back to dict. For parsing, this has no effect on
        performance because OrderedDict uses dict's __getitem__ with nothing in
        between.
        """
        with open(filename, "wb") as f:
            # Sort all dicts first so the pickled bytes are reproducible
            # across runs (plain dict iteration order isn't, historically).
            d = _make_deterministic(self.__dict__)
            pickle.dump(d, f, 2)  # protocol 2: binary, works on Python 2 and 3

    def load(self, f):
        """Load the grammar tables from an open binary file object *f*.

        The file must contain a pickled dict as written by dump().
        """
        d = pickle.load(f)
        self.__dict__.update(d)

    def copy(self):
        """Return a copy of this grammar.

        The table containers are copied one level deep (new dicts/lists
        holding the same inner objects); `start` is shared by value.
        """
        new = self.__class__()
        for dict_attr in ("symbol2number", "number2symbol", "dfas", "keywords",
                          "tokens", "symbol2label"):
            setattr(new, dict_attr, getattr(self, dict_attr).copy())
        new.labels = self.labels[:]
        new.states = self.states[:]
        new.start = self.start
        return new

    def report(self):
        """Dump the grammar tables to standard output, for debugging."""
        from pprint import pprint
        print("s2n")
        pprint(self.symbol2number)
        print("n2s")
        pprint(self.number2symbol)
        print("states")
        pprint(self.states)
        print("dfas")
        pprint(self.dfas)
        print("labels")
        pprint(self.labels)
        print("start", self.start)
154
155def _make_deterministic(top):
156 if isinstance(top, dict):
157 return collections.OrderedDict(
158 sorted(((k, _make_deterministic(v)) for k, v in top.items())))
159 if isinstance(top, list):
160 return [_make_deterministic(e) for e in top]
161 if isinstance(top, tuple):
162 return tuple(_make_deterministic(e) for e in top)
163 return top
164
165
# Map from operator to number (since tokenize doesn't do this)

opmap_raw = """
( LPAR
) RPAR
[ LSQB
] RSQB
: COLON
, COMMA
; SEMI
+ PLUS
- MINUS
* STAR
/ SLASH
| VBAR
& AMPER
< LESS
> GREATER
= EQUAL
. DOT
% PERCENT
` BACKQUOTE
{ LBRACE
} RBRACE
@ AT
@= ATEQUAL
== EQEQUAL
!= NOTEQUAL
<> NOTEQUAL
<= LESSEQUAL
>= GREATEREQUAL
~ TILDE
^ CIRCUMFLEX
<< LEFTSHIFT
>> RIGHTSHIFT
** DOUBLESTAR
+= PLUSEQUAL
-= MINEQUAL
*= STAREQUAL
/= SLASHEQUAL
%= PERCENTEQUAL
&= AMPEREQUAL
|= VBAREQUAL
^= CIRCUMFLEXEQUAL
<<= LEFTSHIFTEQUAL
>>= RIGHTSHIFTEQUAL
**= DOUBLESTAREQUAL
// DOUBLESLASH
//= DOUBLESLASHEQUAL
-> RARROW
"""

# Build operator-string -> token-number from the two-column listing above;
# the right-hand names are looked up as attributes of the token module.
opmap = {}
for entry in opmap_raw.splitlines():
    if not entry:
        continue  # skip the blank first/last lines of the triple-quoted block
    op, name = entry.split()
    opmap[op] = getattr(token, name)