OILS / pgen2 / grammar.py View on Github | oils.pub

303 lines, 142 significant
1# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
2# Licensed to PSF under a Contributor Agreement.
3
4"""This module defines the data structures used to represent a grammar.
5
6These are a bit arcane because they are derived from the data
7structures used by Python's 'pgen' parser generator.
8
9There's also a table here mapping operators to their names in the
10token module; the Python tokenize module reports all operators as the
11fallback token code OP, but the parser needs the actual token code.
12
13"""
14
15import marshal
16
17from mycpp.mylib import log
18from mycpp import mylib
19
20from typing import TYPE_CHECKING
21
if TYPE_CHECKING:
    # These names exist only for the type checker; nothing below is
    # evaluated at runtime.
    from typing import IO, Dict, List, Tuple

    # Type aliases

    # "first" set of a rule, stored as a dict keyed by int.
    first_t = Dict[int, int]

    # One arc: a (label, state number) pair.
    arc_t = Tuple[int, int]

    # One DFA: a list of states, each state being a list of arcs.
    states_t = List[List[arc_t]]

    # A DFA paired with its "first" set.
    dfa_t = Tuple[states_t, first_t]
30
31
class Grammar(object):
    """Pgen parsing tables conversion class.

    Once initialized, this class supplies the grammar tables for the
    parsing engine implemented by parse.py.  The parsing engine
    accesses the instance variables directly.  The class here does not
    provide initialization of the tables; several subclasses exist to
    do this (see the conv and pgen modules).

    The loads() method reads the tables from a marshal-encoded string,
    which is much faster than the other ways offered by subclasses.
    That data is written by calling dump() (after loading the grammar
    tables using a subclass).  The report() method logs a summary of
    the tables, for debugging.

    The instance variables are as follows:

    symbol2number -- a dict mapping symbol names to numbers.  Symbol
                     numbers are always NT_OFFSET or higher, to distinguish
                     them from token numbers, which are between 0 and 255
                     (inclusive).

    number2symbol -- a dict mapping numbers to symbol names;
                     these two are each other's inverse.

    states        -- a list of DFAs, where each DFA is a list of
                     states, each state is a list of arcs, and each
                     arc is a (i, j) pair where i is a label and j is
                     a state number.  The DFA number is the index into
                     this list.  (This name is slightly confusing.)
                     Final states are represented by a special arc of
                     the form (0, j) where j is its own state number.

    dfas          -- a dict mapping symbol numbers to (DFA, first)
                     pairs, where DFA is an item from the states list
                     above, and first is a set of tokens that can
                     begin this grammar rule (represented by a dict
                     whose values are always 1).

    labels        -- a list of (x, y) pairs where x is either a token
                     number or a symbol number, and y is either None
                     or a string; the strings are keywords.  The label
                     number is the index in this list; label numbers
                     are used to mark state transitions (arcs) in the
                     DFAs.

                     Oils patch: this became List[int] where int is the
                     token/symbol number.

    start         -- the number of the grammar's start symbol.

    keywords      -- a dict mapping keyword strings to arc labels.

    tokens        -- a dict mapping token numbers to arc labels.

    symbol2label  -- a dict mapping symbol names to ints (presumably arc
                     labels, like keywords and tokens; it is filled in
                     outside this class).

    """

    def __init__(self):
        # type: () -> None
        """Create empty tables; a subclass or loads() fills them in."""
        self.symbol2number = {}  # type: Dict[str, int]
        self.number2symbol = {}  # type: Dict[int, str]

        # TODO: See MakeGrammar in pgen2/pgen.py
        # To see the type
        # states: List[List[arcs]]
        # arc: (int, int)
        # dfas = Dict[int, Tuple[states, ...]]

        self.states = []  # type: states_t
        self.dfas = {}  # type: Dict[int, dfa_t]
        # Oils patch: used to be [(0, "EMPTY")].  I suppose 0 is a special
        # value?  Or is it ENDMARKER?
        self.labels = [0]  # type: List[int]
        self.keywords = {}  # type: Dict[str, int]
        self.tokens = {}  # type: Dict[int, int]
        self.symbol2label = {}  # type: Dict[str, int]
        self.start = 256  # TODO: move to pgen.NT_OFFSET, breaks OVM tarball

    if mylib.PYTHON:
        # First element of the tuple written by dump(); loads() rejects any
        # payload that does not start with it.
        MARSHAL_HEADER = 'PGEN2\n'  # arbitrary header

        def dump(self, f):
            # type: (IO[str]) -> None
            """Dump the grammar tables to a marshal file.

            Oils patch: changed pickle to marshal.

            The payload is one flat tuple: MARSHAL_HEADER followed by the
            tables, in exactly the order that loads() unpacks them.

            Args:
              f: open file object that marshal.dump() writes to.
            """
            # Hack to get rid of Id_t: coerce label and token ids to plain
            # ints before marshalling.
            labels = [int(i) for i in self.labels]
            tokens = {int(k): v for k, v in self.tokens.items()}

            payload = (
                self.MARSHAL_HEADER,
                self.symbol2number,
                self.number2symbol,
                self.states,
                self.dfas,
                labels,
                self.keywords,
                tokens,
                self.symbol2label,
                self.start,
            )  # tuple
            marshal.dump(payload, f)  # version 2 is latest

        def dump_nonterminals_py(self, f):
            # type: (IO[str]) -> None
            """Write a Python module with nonterminals.

            Analogous to the 'symbol' module in Python.  One 'NAME = number'
            assignment is written per nonterminal, in ascending numeric
            order.
            """
            f.write('# This code is generated by pgen2/grammar.py\n\n')
            for num, name in sorted(self.number2symbol.items()):
                f.write('%s = %d\n' % (name, num))

        def dump_nonterminals_cpp(self, f):
            # type: (IO[str]) -> None
            """Write C++ source declaring the nonterminals.

            The C++ analogue of dump_nonterminals_py(): one 'const int' per
            nonterminal inside namespace grammar_nt, in ascending numeric
            order.
            """
            f.write("""\
// This code is generated by pgen2/grammar.py

namespace grammar_nt {
""")
            for num, name in sorted(self.number2symbol.items()):
                f.write(' const int %s = %d;\n' % (name, num))
            f.write("""\

} // namespace grammar_nt
""")

        def dump_cpp(self, src_f):
            # type: (IO[str]) -> None
            """Dump a C++ implementation of the grammar tables.

            Emits the body of the Grammar::Grammar() constructor, which
            allocates every table and fills it entry by entry.  Dict entries
            are written in each dict's own iteration order.

            Args:
              src_f: open file object that receives the C++ source text.
            """
            src_f.write("""\
// This code is generated by pgen2/grammar.py
# include "cpp/pgen2.h"
# include "mycpp/runtime.h"

namespace grammar {

""")

            # Constructor prologue: allocate the empty containers.
            src_f.write('Grammar::Grammar() {\n')
            src_f.write(' symbol2number = Alloc<Dict<BigStr*, int>>();\n')
            src_f.write(' number2symbol = Alloc<Dict<int, BigStr*>>();\n')
            src_f.write(' dfas = Alloc<Dict<int, dfa_t*>>();\n')
            src_f.write(' keywords = Alloc<Dict<BigStr*, int>>();\n')
            src_f.write(' tokens = Alloc<Dict<int, int>>();\n')
            src_f.write(' symbol2label = Alloc<Dict<BigStr*, int>>();\n')
            src_f.write(' start = %d;\n' % self.start)

            # One braced C++ scope per DFA, so the local names 'st', 'first'
            # and 'dfa' can be reused for every entry.
            src_f.write('\n')
            for i, (states, first) in self.dfas.items():
                src_f.write(' {\n')
                src_f.write(' states_t* st = Alloc<states_t>();\n')
                for s in states:
                    # Each state becomes a list of (label, state) arc tuples.
                    src_f.write(' st->append(NewList<arc_t*>(\n')
                    src_f.write(' std::initializer_list<arc_t*>{\n')
                    for a, b in s:
                        src_f.write(' Alloc<Tuple2<int, int>>(%d, %d),\n' %
                                    (a, b))

                    src_f.write(' })\n')
                    src_f.write(' );\n')

                src_f.write(' first_t* first = Alloc<first_t>();\n')
                for k, v in first.items():
                    src_f.write(' first->set(%d, %d);\n' % (k, v))

                src_f.write(' dfa_t* dfa = Alloc<dfa_t>(st, first);\n')
                src_f.write(' dfas->set(%d, dfa);\n' % i)
                src_f.write(' \n')
                src_f.write(' }\n')
                src_f.write('\n')

            for symbol, num in self.symbol2number.items():
                src_f.write(' symbol2number->set(StrFromC("%s"), %d);\n' %
                            (symbol, num))

            src_f.write('\n')
            for num, symbol in self.number2symbol.items():
                src_f.write(' number2symbol->set(%d, StrFromC("%s"));\n' %
                            (num, symbol))

            src_f.write('\n')
            for symbol, label in self.symbol2label.items():
                src_f.write(' symbol2label->set(StrFromC("%s"), %d);\n' %
                            (symbol, label))

            src_f.write('\n')
            for ks, ki in self.keywords.items():
                src_f.write(' keywords->set(StrFromC("%s"), %d);\n' % (ks, ki))

            src_f.write('\n')
            for k, v in self.tokens.items():
                src_f.write(' tokens->set(%d, %d);\n' % (k, v))

            # The labels list is emitted as a single initializer list, one
            # label per line.
            src_f.write('\n')
            src_f.write('\n')
            src_f.write(' labels = NewList<int>(\n')
            src_f.write(' std::initializer_list<int>{\n')
            src_f.write(' ')
            src_f.write(',\n '.join(str(label) for label in self.labels))
            src_f.write('\n')
            src_f.write(' }\n')
            src_f.write(' );\n')

            src_f.write('\n')
            src_f.write('};\n')

            src_f.write("""\

} // namespace grammar
""")

        def loads(self, s):
            # type: (str) -> None
            """Load the grammar from a string.

            We have to use a string rather than a file because the marshal
            module doesn't "fake" support zipimport files.

            Args:
              s: marshal data as written by dump().

            Raises:
              RuntimeError: if the data does not start with MARSHAL_HEADER.
            """
            # NOTE: marshal is not safe against malicious input; only load
            # data that this program (or its build) produced.
            payload = marshal.loads(s)

            # Guard the header lookup so that a payload that isn't a
            # non-empty tuple is reported as a bad header instead of
            # failing with an unrelated TypeError/IndexError.
            header = None
            if isinstance(payload, tuple) and payload:
                header = payload[0]
            if header != self.MARSHAL_HEADER:
                raise RuntimeError('Invalid header %r' % (header,))

            (
                _,
                self.symbol2number,
                self.number2symbol,
                self.states,
                self.dfas,
                self.labels,
                self.keywords,
                self.tokens,
                self.symbol2label,
                self.start,
            ) = payload

            assert isinstance(self.symbol2number, dict), self.symbol2number
            assert isinstance(self.number2symbol, dict), self.number2symbol

        def report(self, verbose=False):
            # type: (bool) -> None
            """Report on the grammar tables, for debugging.

            Always logs the number of entries in the largest tables.

            Args:
              verbose: if true, also pretty-print the labels, keywords,
                tokens and symbol2label tables and the start symbol to
                standard output.  Defaults to False, which matches the
                summary-only behavior of calling report() with no arguments.
            """
            log("symbol2number: %d entries", len(self.symbol2number))
            log("number2symbol: %d entries", len(self.number2symbol))
            log("states: %d entries", len(self.states))
            log("dfas: %d entries", len(self.dfas))
            if not verbose:
                return

            from pprint import pprint
            print("labels")
            pprint(self.labels)
            print("keywords")
            pprint(self.keywords)
            print("tokens")
            pprint(self.tokens)
            print("symbol2label")
            pprint(self.symbol2label)
            print("start", self.start)