OILS / pgen2 / pgen.py View on Github | oils.pub

430 lines, 315 significant
1# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
2# Licensed to PSF under a Contributor Agreement.
3
4# Pgen imports
5#import grammar, token, tokenize
6# NOTE: Need these special versions of token/tokenize for BACKQUOTE and such.
7from . import grammar, token, tokenize
8from mycpp.mylib import log
9
10_ = log
11
12
13class PythonTokDef(object):
14
15 def GetTerminalNum(self, label):
16 """ e.g. NAME -> 1 """
17 itoken = getattr(token, label, None)
18 assert isinstance(itoken, int), label
19 assert itoken in token.tok_name, label
20 return itoken
21
22 def GetKeywordNum(self, value):
23 """
24 e.g 'xor' -> Id.Expr_Xor
25
26 Python doesn't have this, but Oil does. Returns None if not found.
27 """
28 return None
29
30 def GetOpNum(self, op_str):
31 """
32 e.g '(' -> LPAR
33
34 Raises an exception if it's not found.
35 """
36 return token.opmap[op_str]
37
38
39class ParserGenerator(object):
40
41 def __init__(self, lexer):
42 self.lexer = lexer
43
44 def parse(self):
45 self.gettoken() # Initialize lookahead
46
47 dfas = {}
48 startsymbol = None
49 # MSTART: (NEWLINE | RULE)* ENDMARKER
50 while self.type != token.ENDMARKER:
51 while self.type == token.NEWLINE:
52 self.gettoken()
53 # RULE: NAME ':' RHS NEWLINE
54 name = self.expect(token.NAME)
55 self.expect(token.OP, ":")
56 a, z = self.parse_rhs()
57 self.expect(token.NEWLINE)
58 #self.dump_nfa(name, a, z)
59 dfa = self.make_dfa(a, z)
60 #self.dump_dfa(name, dfa)
61 oldlen = len(dfa)
62 self.simplify_dfa(dfa)
63 newlen = len(dfa)
64 dfas[name] = dfa
65 #print name, oldlen, newlen
66 if startsymbol is None:
67 startsymbol = name
68 return dfas, startsymbol
69
70 def make_dfa(self, start, finish):
71 # To turn an NFA into a DFA, we define the states of the DFA
72 # to correspond to *sets* of states of the NFA. Then do some
73 # state reduction. Let's represent sets as dicts with 1 for
74 # values.
75 assert isinstance(start, NFAState)
76 assert isinstance(finish, NFAState)
77 def closure(state):
78 base = {}
79 addclosure(state, base)
80 return base
81 def addclosure(state, base):
82 assert isinstance(state, NFAState)
83 if state in base:
84 return
85 base[state] = 1
86 for label, next in state.arcs:
87 if label is None:
88 addclosure(next, base)
89 states = [DFAState(closure(start), finish)]
90 for state in states: # NB states grows while we're iterating
91 arcs = {}
92 for nfastate in state.nfaset:
93 for label, next in nfastate.arcs:
94 if label is not None:
95 addclosure(next, arcs.setdefault(label, {}))
96 for label, nfaset in sorted(arcs.items()):
97 for st in states:
98 if st.nfaset == nfaset:
99 break
100 else:
101 st = DFAState(nfaset, finish)
102 states.append(st)
103 state.addarc(st, label)
104 return states # List of DFAState instances; first one is start
105
106 def dump_nfa(self, name, start, finish):
107 print("Dump of NFA for", name)
108 todo = [start]
109 for i, state in enumerate(todo):
110 print(" State", i, state is finish and "(final)" or "")
111 for label, next in state.arcs:
112 if next in todo:
113 j = todo.index(next)
114 else:
115 j = len(todo)
116 todo.append(next)
117 if label is None:
118 print(" -> %d" % j)
119 else:
120 print(" %s -> %d" % (label, j))
121
122 def dump_dfa(self, name, dfa):
123 print("Dump of DFA for", name)
124 for i, state in enumerate(dfa):
125 print(" State", i, state.isfinal and "(final)" or "")
126 for label, next in sorted(state.arcs.items()):
127 print(" %s -> %d" % (label, dfa.index(next)))
128
129 def simplify_dfa(self, dfa):
130 # This is not theoretically optimal, but works well enough.
131 # Algorithm: repeatedly look for two states that have the same
132 # set of arcs (same labels pointing to the same nodes) and
133 # unify them, until things stop changing.
134
135 # dfa is a list of DFAState instances
136 changes = True
137 while changes:
138 changes = False
139 for i, state_i in enumerate(dfa):
140 for j in range(i+1, len(dfa)):
141 state_j = dfa[j]
142 if state_i == state_j:
143 #print " unify", i, j
144 del dfa[j]
145 for state in dfa:
146 state.unifystate(state_j, state_i)
147 changes = True
148 break
149
150 def parse_rhs(self):
151 # RHS: ALT ('|' ALT)*
152 a, z = self.parse_alt()
153 if self.value != "|":
154 return a, z
155 else:
156 aa = NFAState()
157 zz = NFAState()
158 aa.addarc(a)
159 z.addarc(zz)
160 while self.value == "|":
161 self.gettoken()
162 a, z = self.parse_alt()
163 aa.addarc(a)
164 z.addarc(zz)
165 return aa, zz
166
167 def parse_alt(self):
168 # ALT: ITEM+
169 a, b = self.parse_item()
170 while (self.value in ("(", "[") or
171 self.type in (token.NAME, token.STRING)):
172 c, d = self.parse_item()
173 b.addarc(c)
174 b = d
175 return a, b
176
177 def parse_item(self):
178 # ITEM: '[' RHS ']' | ATOM ['+' | '*']
179 if self.value == "[":
180 self.gettoken()
181 a, z = self.parse_rhs()
182 self.expect(token.OP, "]")
183 a.addarc(z)
184 return a, z
185 else:
186 a, z = self.parse_atom()
187 value = self.value
188 if value not in ("+", "*"):
189 return a, z
190 self.gettoken()
191 z.addarc(a)
192 if value == "+":
193 return a, z
194 else:
195 return a, a
196
197 def parse_atom(self):
198 # ATOM: '(' RHS ')' | NAME | STRING
199 if self.value == "(":
200 self.gettoken()
201 a, z = self.parse_rhs()
202 self.expect(token.OP, ")")
203 return a, z
204 elif self.type in (token.NAME, token.STRING):
205 a = NFAState()
206 z = NFAState()
207 a.addarc(z, self.value)
208 self.gettoken()
209 return a, z
210 else:
211 self.raise_error("expected (...) or NAME or STRING, got %s/%s",
212 self.type, self.value)
213
214 def expect(self, type, value=None):
215 if self.type != type or (value is not None and self.value != value):
216 self.raise_error("expected %s/%s, got %s/%s",
217 type, value, self.type, self.value)
218 value = self.value
219 self.gettoken()
220 return value
221
222 def gettoken(self):
223 tup = next(self.lexer)
224 while tup[0] in (tokenize.COMMENT, tokenize.NL):
225 tup = next(self.lexer)
226 self.type, self.value, self.begin, self.end, self.line = tup
227 #print token.tok_name[self.type], repr(self.value)
228
229 def raise_error(self, msg, *args):
230 if args:
231 msg = msg % args
232 raise SyntaxError(msg, ('<grammar>', self.end[0],
233 self.end[1], self.line))
234
235class NFAState(object):
236
237 def __init__(self):
238 self.arcs = [] # list of (label, NFAState) pairs
239
240 def addarc(self, next, label=None):
241 assert label is None or isinstance(label, str)
242 assert isinstance(next, NFAState)
243 self.arcs.append((label, next))
244
245class DFAState(object):
246
247 def __init__(self, nfaset, final):
248 assert isinstance(nfaset, dict)
249 assert isinstance(next(iter(nfaset)), NFAState)
250 assert isinstance(final, NFAState)
251 self.nfaset = nfaset
252 self.isfinal = final in nfaset
253 self.arcs = {} # map from label to DFAState
254
255 def addarc(self, next, label):
256 assert isinstance(label, str)
257 assert label not in self.arcs
258 assert isinstance(next, DFAState)
259 self.arcs[label] = next
260
261 def unifystate(self, old, new):
262 for label, next in self.arcs.items():
263 if next is old:
264 self.arcs[label] = new
265
266 def __eq__(self, other):
267 # Equality test -- ignore the nfaset instance variable
268 assert isinstance(other, DFAState)
269 if self.isfinal != other.isfinal:
270 return False
271 # Can't just return self.arcs == other.arcs, because that
272 # would invoke this method recursively, with cycles...
273 if len(self.arcs) != len(other.arcs):
274 return False
275 for label, next in self.arcs.items():
276 if next is not other.arcs.get(label):
277 return False
278 return True
279
280 __hash__ = None # For Py3 compatibility.
281
282
283def calcfirst(dfas, first, name):
284 """Recursive function that mutates first."""
285 dfa = dfas[name]
286 first[name] = None # dummy to detect left recursion
287 state = dfa[0]
288 totalset = {}
289 overlapcheck = {}
290 for label, _ in state.arcs.items():
291 if label in dfas:
292 if label in first:
293 fset = first[label]
294 if fset is None:
295 raise ValueError("recursion for rule %r" % name)
296 else:
297 calcfirst(dfas, first, label)
298 fset = first[label]
299 totalset.update(fset)
300 overlapcheck[label] = fset
301 else:
302 totalset[label] = 1
303 overlapcheck[label] = {label: 1}
304 inverse = {}
305 for label, itsfirst in overlapcheck.items():
306 for symbol in itsfirst:
307 if symbol in inverse:
308 raise ValueError("rule %s is ambiguous; %s is in the"
309 " first sets of %s as well as %s" %
310 (name, symbol, label, inverse[symbol]))
311 inverse[symbol] = label
312 first[name] = totalset
313
314
315def make_label(tok_def, gr, label):
316 """Given a grammar item, return a unique integer representing it.
317
318 It could be:
319 1. or_test - a non-terminal
320 2. Expr_Name - a terminal
321 3. 'for' - keyword (quotes)
322 4. '>=' - operator (quotes)
323
324 Oil addition
325 5. Op_RBracket -- anything with _ is assumed to be in the Id namespace.
326 """
327 #log('make_label %r', label)
328 # XXX Maybe this should be a method on a subclass of converter?
329 ilabel = len(gr.labels)
330 if label[0].isalpha():
331 if label in gr.symbol2number: # NON-TERMINAL
332 if label in gr.symbol2label:
333 return gr.symbol2label[label]
334 else:
335 gr.labels.append(gr.symbol2number[label])
336 gr.symbol2label[label] = ilabel
337 return ilabel
338 else: # TERMINAL like (NAME, NUMBER, STRING)
339 itoken = tok_def.GetTerminalNum(label)
340 if itoken in gr.tokens:
341 return gr.tokens[itoken]
342 else:
343 gr.labels.append(itoken)
344 #log('%s %d -> %s', token.tok_name[itoken], itoken, ilabel)
345 gr.tokens[itoken] = ilabel
346 return ilabel
347 else:
348 # Either a keyword or an operator
349 assert label[0] in ('"', "'"), label
350 value = eval(label)
351
352 # Treat 'xor' just like '^'. TODO: I think this code can be
353 # simplified.
354 n = tok_def.GetKeywordNum(value) # int or None
355
356 if value[0].isalpha() and n is None: # A word like 'for', 'xor'
357 # Then look in the keywords automatically extracted from the
358 # grammar.
359 if value in gr.keywords:
360 return gr.keywords[value]
361 else:
362 gr.labels.append(token.NAME) # arbitrary number < NT_OFFSET
363 gr.keywords[value] = ilabel
364 return ilabel
365
366 else: # An operator e.g. '>='
367 if n is None:
368 itoken = tok_def.GetOpNum(value)
369 else:
370 itoken = n
371
372 if itoken in gr.tokens:
373 return gr.tokens[itoken]
374 else:
375 gr.labels.append(itoken)
376 gr.tokens[itoken] = ilabel
377 return ilabel
378
379
380def make_first(tok_def, rawfirst, gr):
381 first = {}
382 for label in sorted(rawfirst):
383 ilabel = make_label(tok_def, gr, label)
384 ##assert ilabel not in first # XXX failed on <> ... !=
385 first[ilabel] = 1
386 return first
387
388
389def MakeGrammar(f, tok_def=None):
390 """Construct a Grammar object from a file."""
391
392 lexer = tokenize.generate_tokens(f.readline)
393 p = ParserGenerator(lexer)
394 dfas, startsymbol = p.parse()
395
396 first = {} # map from symbol name to set of tokens
397 for name in sorted(dfas):
398 if name not in first:
399 calcfirst(dfas, first, name)
400 #print name, self.first[name].keys()
401
402 # TODO: startsymbol support could be removed. The call to p.setup() in
403 # PushTokens() can always specify it explicitly.
404 names = sorted(dfas)
405 names.remove(startsymbol)
406 names.insert(0, startsymbol)
407
408 gr = grammar.Grammar()
409 for name in names:
410 i = token.NT_OFFSET + len(gr.symbol2number)
411 gr.symbol2number[name] = i
412 gr.number2symbol[i] = name
413
414 tok_def = tok_def or PythonTokDef()
415 for name in names:
416 dfa = dfas[name]
417 states = []
418 for state in dfa:
419 arcs = []
420 for label, next_ in sorted(state.arcs.items()):
421 arcs.append((make_label(tok_def, gr, label), dfa.index(next_)))
422 if state.isfinal:
423 arcs.append((0, dfa.index(state)))
424 states.append(arcs)
425 gr.states.append(states)
426 fi = make_first(tok_def, first[name], gr)
427 gr.dfas[gr.symbol2number[name]] = (states, fi)
428
429 gr.start = gr.symbol2number[startsymbol]
430 return gr