1 | # Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
|
2 | # Licensed to PSF under a Contributor Agreement.
|
3 |
|
4 | """This module defines the data structures used to represent a grammar.
|
5 |
|
6 | These are a bit arcane because they are derived from the data
|
7 | structures used by Python's 'pgen' parser generator.
|
8 |
|
9 | There's also a table here mapping operators to their names in the
|
10 | token module; the Python tokenize module reports all operators as the
|
11 | fallback token code OP, but the parser needs the actual token code.
|
12 |
|
13 | """
|
14 |
|
15 | # Python imports
|
16 | import cStringIO
|
17 | import collections
|
18 | import pickle
|
19 | import marshal
|
20 |
|
21 | # Local imports
|
22 | from . import token, tokenize
|
23 |
|
24 |
|
class Grammar(object):
    """Pgen parsing tables conversion class.

    Once initialized, this class supplies the grammar tables for the
    parsing engine implemented by parse.py.  The parsing engine
    accesses the instance variables directly.  The class here does not
    provide initialization of the tables; several subclasses exist to
    do this (see the conv and pgen modules).

    The load() method reads the tables from a pickle file, which is
    much faster than the other ways offered by subclasses.  The pickle
    file is written by calling dump() (after loading the grammar
    tables using a subclass).  The report() method prints a readable
    representation of the tables to stdout, for debugging.

    The instance variables are as follows:

    symbol2number -- a dict mapping symbol names to numbers.  Symbol
                     numbers are always 256 or higher, to distinguish
                     them from token numbers, which are between 0 and
                     255 (inclusive).

    number2symbol -- a dict mapping numbers to symbol names;
                     these two are each other's inverse.

    states        -- a list of DFAs, where each DFA is a list of
                     states, each state is a list of arcs, and each
                     arc is a (i, j) pair where i is a label and j is
                     a state number.  The DFA number is the index into
                     this list.  (This name is slightly confusing.)
                     Final states are represented by a special arc of
                     the form (0, j) where j is its own state number.

    dfas          -- a dict mapping symbol numbers to (DFA, first)
                     pairs, where DFA is an item from the states list
                     above, and first is a set of tokens that can
                     begin this grammar rule (represented by a dict
                     whose values are always 1).

    labels        -- a list of (x, y) pairs where x is either a token
                     number or a symbol number, and y is either None
                     or a string; the strings are keywords.  The label
                     number is the index in this list; label numbers
                     are used to mark state transitions (arcs) in the
                     DFAs.

    start         -- the number of the grammar's start symbol.

    keywords      -- a dict mapping keyword strings to arc labels.

    tokens        -- a dict mapping token numbers to arc labels.

    """

    def __init__(self):
        self.symbol2number = {}
        self.number2symbol = {}
        self.states = []
        self.dfas = {}
        # Label 0 is reserved for the empty transition.
        self.labels = [(0, "EMPTY")]
        self.keywords = {}
        self.tokens = {}
        self.symbol2label = {}
        # 256 is the lowest possible symbol number (tokens are 0-255).
        self.start = 256

    def dump(self, filename):
        """Dump the grammar tables to a pickle file.

        dump() recursively changes all dict to OrderedDict, so the pickled file
        is not exactly the same as what was passed in to dump(). load() uses the
        pickled file to create the tables, but only changes OrderedDict to dict
        at the top level; it does not recursively change OrderedDict to dict.
        So, the loaded tables are different from the original tables that were
        passed to dump() in that some of the OrderedDict (from the pickled file)
        are not changed back to dict. For parsing, this has no effect on
        performance because OrderedDict uses dict's __getitem__ with nothing in
        between.
        """
        with open(filename, "wb") as f:
            # Sort all dicts first so the pickle output is deterministic
            # (reproducible builds).
            d = _make_deterministic(self.__dict__)
            pickle.dump(d, f, 2)  # protocol 2: readable by both Python 2 and 3

    def load(self, f):
        """Load the grammar tables from an open pickle file object *f*."""
        d = pickle.load(f)
        self.__dict__.update(d)

    def copy(self):
        """
        Copy the grammar.

        Returns a new instance of the same class with shallow copies of
        the table attributes; `start` is copied by value.
        """
        new = self.__class__()
        for dict_attr in ("symbol2number", "number2symbol", "dfas", "keywords",
                          "tokens", "symbol2label"):
            setattr(new, dict_attr, getattr(self, dict_attr).copy())
        new.labels = self.labels[:]
        new.states = self.states[:]
        new.start = self.start
        return new

    def report(self):
        """Dump the grammar tables to standard output, for debugging."""
        from pprint import pprint
        print("s2n")
        pprint(self.symbol2number)
        print("n2s")
        pprint(self.number2symbol)
        print("states")
        pprint(self.states)
        print("dfas")
        pprint(self.dfas)
        print("labels")
        pprint(self.labels)
        print("start", self.start)
|
153 |
|
154 |
|
155 | def _make_deterministic(top):
|
156 | if isinstance(top, dict):
|
157 | return collections.OrderedDict(
|
158 | sorted(((k, _make_deterministic(v)) for k, v in top.items())))
|
159 | if isinstance(top, list):
|
160 | return [_make_deterministic(e) for e in top]
|
161 | if isinstance(top, tuple):
|
162 | return tuple(_make_deterministic(e) for e in top)
|
163 | return top
|
164 |
|
165 |
|
166 | # Map from operator to number (since tokenize doesn't do this)
|
167 |
|
opmap_raw = """
( LPAR
) RPAR
[ LSQB
] RSQB
: COLON
, COMMA
; SEMI
+ PLUS
- MINUS
* STAR
/ SLASH
| VBAR
& AMPER
< LESS
> GREATER
= EQUAL
. DOT
% PERCENT
` BACKQUOTE
{ LBRACE
} RBRACE
@ AT
@= ATEQUAL
== EQEQUAL
!= NOTEQUAL
<> NOTEQUAL
<= LESSEQUAL
>= GREATEREQUAL
~ TILDE
^ CIRCUMFLEX
<< LEFTSHIFT
>> RIGHTSHIFT
** DOUBLESTAR
+= PLUSEQUAL
-= MINEQUAL
*= STAREQUAL
/= SLASHEQUAL
%= PERCENTEQUAL
&= AMPEREQUAL
|= VBAREQUAL
^= CIRCUMFLEXEQUAL
<<= LEFTSHIFTEQUAL
>>= RIGHTSHIFTEQUAL
**= DOUBLESTAREQUAL
// DOUBLESLASH
//= DOUBLESLASHEQUAL
-> RARROW
"""

# Resolve each "<operator> <NAME>" entry above against the token module,
# giving the operator -> token-number mapping the parser needs.
opmap = {
    op: getattr(token, name)
    for op, name in (entry.split() for entry in opmap_raw.splitlines() if entry)
}
|