OILS / build / cpython_defs.py View on Github | oils.pub

520 lines, 338 significant
1#!/usr/bin/env python2
2"""
cpython_defs.py - lex, parse, and filter PyMethodDef tables from CPython C source.
4"""
5from __future__ import print_function
6
7import errno
8import os
9import re
10import sys
11
12from mycpp.mylib import log
13# TODO: Could move these to a place where they don't depend on Oil
14from frontend.lexer_def import C, R
15
16
# Token table for the PyMethodDef lexer.  Entries are tried top to bottom and
# the first one that matches wins (see Lexer.Tokens), so the ORDER IS
# SIGNIFICANT: the catch-all 'Opaque' rule has to stay last.
#
# R(...) entries are regular expressions.  C(...) entries are presumably
# literal strings -- _CompileAll() passes non-regex patterns through
# re.escape(), which is what makes 'PyDoc_STR(' and ')' usable here.
#
# When a regex has a capture group, the token value is group 1 rather than
# the whole match (see Lexer.Tokens).
C_DEF = [
    R(r'#.*', 'Comment'),
    R(r'[ \t\n]+', 'Whitespace'),

    # Start of a method table; group 1 captures the array name.
    # This could be more space-insensitive.
    R(r'static.*PyMethodDef (.*)\[\]\s*=\s*', 'BeginDef'),

    # Punctuation
    C(r'{', 'LBrace'),
    C(r'}', 'RBrace'),
    C(r',', 'Comma'),
    C(r';', 'Semi'),

    # String literal; group 1 is the text between the quotes.
    R(r'"([^"]*)"', 'Str'),

    # Marker that introduces a per-file header (see Parser.ParseHeader).
    C(r'FILE', 'FILE'),

    # Docstring wrapper macro
    C(r'PyDoc_STR(', 'LDocStr'),
    C(r')', 'RDocStr'),

    # Anything else up to the next comma, closing brace, or newline.
    R(r'[^,}\n]+', 'Opaque'),
]
33
34
35# NOTE: This is copied from osh/match.py because we don't have 're' there.
def _CompileAll(pat_list):
    """Compile a token table into a list of (compiled_regex, token_id) pairs.

    Args:
      pat_list: iterable of (is_regex, pat, token_id) triples.  When is_regex
        is false, pat is a literal string and is escaped before compiling
        (e.g. $ becomes \\$).

    Returns:
      A list of (compiled pattern, token_id) tuples, in input order.
    """
    return [
        (re.compile(pat if is_regex else re.escape(pat)), token_id)
        for is_regex, pat, token_id in pat_list
    ]
43
44
class Lexer(object):
    """First-match regex lexer driven by a token table.

    Args:
      pat_list: iterable of (is_regex, pat, token_id) triples, compiled with
        _CompileAll().
    """

    def __init__(self, pat_list):
        self.pat_list = _CompileAll(pat_list)

    def Tokens(self, s):
        """Generate (token_id, value, pos) triples for the string s.

        Patterns are tried in table order at the current position and the
        first one that consumes at least one character wins.  If the winning
        pattern has a capture group, value is the text of group 1; otherwise
        it is the whole match.  pos is the offset just PAST the match.

        'Whitespace' tokens are dropped.  A final ('EOF', '', -1) triple is
        always produced.

        Raises:
          AssertionError: if no pattern matches at some position.
        """
        pos = 0
        n = len(s)
        while pos < n:
            for pat, id_ in self.pat_list:
                # FIRST MATCH
                m = pat.match(s, pos)
                # An empty match consumes nothing, so accepting it would
                # leave pos unchanged and spin forever.  Treat it as "no
                # match" and let a later pattern (or the error below) handle
                # this position.
                if m is None or m.end() == pos:
                    continue
                start, end = m.span(1 if m.groups() else 0)
                pos = m.end()
                break  # found a match
            else:
                raise AssertionError(
                    'no token matched at position %r: %r' % (pos, s[pos]))

            if id_ != 'Whitespace':
                yield id_, s[start:end], pos
        yield 'EOF', '', -1
70
71
class Parser(object):
    """Recursive-descent parser for C PyMethodDef initializer lists.

    The input is an iterator of (tok_id, tok_val, pos) triples that ends with
    an 'EOF' token, e.g. the generator returned by Lexer.Tokens().  One token
    of lookahead is kept in self.tok_id / self.tok_val / self.pos.

    Syntax errors are reported as RuntimeError.
    """

    def __init__(self, tokens):
        self.tokens = tokens
        self.Next()  # initialize

    def Next(self):
        """Advance to the next token, skipping 'Comment' and 'Whitespace'."""
        while True:
            # The next() builtin works on any iterator, on Python 2.6+ and 3;
            # the .next() method only exists on Python 2 iterators.
            self.tok_id, self.tok_val, self.pos = next(self.tokens)
            if self.tok_id not in ('Comment', 'Whitespace'):
                break
        if 0:  # flip to 1 to trace tokens while debugging
            log('%s %r', self.tok_id, self.tok_val)

    def Eat(self, tok_id):
        """Check that the current token has the given ID, then advance.

        Raises:
          RuntimeError: if the current token has a different ID.
        """
        if self.tok_id != tok_id:
            raise RuntimeError(
                'Expected %r, got %r %r (byte offset %d)' %
                (tok_id, self.tok_id, self.tok_val, self.pos))

        self.Next()

    def ParseName(self):
        """
        Name = Str | Opaque('NULL') | Opaque('0')

        Returns the string value, or None for the NULL / 0 sentinel.
        """
        if self.tok_id == 'Str':
            name = self.tok_val
        elif self.tok_id == 'Opaque':
            # Raise rather than assert: this validates input, must survive
            # python -O, and should be reported like every other parse error.
            if self.tok_val not in ('NULL', '0'):
                raise RuntimeError(
                    'Expected NULL or 0 as a name, got %r (byte offset %d)' %
                    (self.tok_val, self.pos))
            name = None
        else:
            raise RuntimeError('Unexpected token %r' % self.tok_id)
        self.Next()
        return name

    def ParseVal(self):
        """
        Val = Str
            | Opaque
            | LDocStr Str+ RDocStr  # string concatenation happens

        Returns the token text; adjacent strings inside a docstring macro are
        concatenated into one value.
        """
        if self.tok_id == 'LDocStr':
            self.Next()

            val = self.tok_val
            self.Eat('Str')
            while self.tok_id == 'Str':
                val += self.tok_val
                self.Next()

            self.Eat('RDocStr')

        elif self.tok_id in ('Opaque', 'Str'):
            val = self.tok_val
            self.Next()

        else:
            raise RuntimeError('Unexpected token %r' % self.tok_id)

        return val

    def ParseItem(self):
        """
        Item = '{' Name (',' Val)* '}' ','?

        Returns a (name, vals) pair, where vals is a list of strings.
        """
        self.Eat('LBrace')
        name = self.ParseName()

        vals = []
        while self.tok_id == 'Comma':
            self.Next()
            vals.append(self.ParseVal())

        self.Eat('RBrace')

        if self.tok_id == 'Comma':  # Optional
            self.Next()

        return name, vals

    def ParseDef(self):
        """
        Def = BeginDef '{' Item* '}' ';'

        Returns a (def_name, items) pair.
        """
        def_name = self.tok_val  # only meaningful if Eat() below succeeds
        self.Eat('BeginDef')
        self.Eat('LBrace')

        items = []
        while self.tok_id != 'RBrace':
            items.append(self.ParseItem())

        self.Next()  # skip the RBrace that ended the loop
        self.Eat('Semi')

        return (def_name, items)

    def ParseHeader(self):
        """
        Header = FILE Opaque

        Returns the Opaque token's text, which is the file path.
        """
        self.Eat('FILE')
        path = self.tok_val
        self.Eat('Opaque')
        return path

    def ParseFile(self):
        """
        File = Header Def*

        Returns a (path, defs) pair.
        """
        path = self.ParseHeader()
        defs = []
        while self.tok_id not in ('FILE', 'EOF'):
            defs.append(self.ParseDef())

        return path, defs

    def ParseStream(self):
        """
        Stream = File*

        Returns a list of (path, defs) pairs, one per FILE header.
        """
        files = []
        while self.tok_id != 'EOF':
            files.append(self.ParseFile())

        return files
197
198
def PrettyPrint(rel_path, def_name, entries, predicate, f, stats):
    """Write one PyMethodDef table to f, keeping only the wanted entries.

    Args:
      rel_path: path of the C file the table came from; passed to predicate.
      def_name: name of the PyMethodDef array.
      entries: iterable of (entry_name, vals) pairs.  entry_name None marks
        a null initializer.  For other entries vals[0] is the C function and
        vals[1] the flags; any further values (the docstring) are omitted.
      predicate: callable(rel_path, def_name, entry_name); an entry is
        printed only if it returns a true value.
      f: file-like object to print to.
      stats: dict whose existing 'num_methods' and 'num_filtered' counters
        are incremented in place.
    """
    def emit(text):
        print(text, file=f, end='')

    emit('static PyMethodDef %s[] = {\n' % (def_name,))

    for entry_name, vals in entries:
        if entry_name is None:
            # Null initializers are always kept and never counted.
            emit(' {0},\n')
            continue

        stats['num_methods'] += 1

        if predicate(rel_path, def_name, entry_name):
            # Reprint the definition, but omit the docstring.  The values
            # are emitted verbatim, never used as format strings.
            emit(' {"%s", ' % (entry_name,))
            emit(vals[0])  # The C function
            emit(', ')
            emit(vals[1])  # The flags
            emit('},\n')
        else:
            stats['num_filtered'] += 1

    emit('};\n')
223
224
# Basenames of the C files whose method tables get the generic filter in
# OilMethodFilter: a method is dropped unless its name is in py_names.
# The 'filtered' action prints this list in order, one name per line.
MODULES_TO_FILTER = [
    # My Own
    'libc.c',
    'fastlex.c',
    'line_input.c',

    'import.c',
    'marshal.c',  # additional filters below
    # 'zipimport.c' cannot be filtered because find_module is called from C!

    # Types for Builtins
    'enumobject.c',
    'rangeobject.c',

    # Interpreter types
    'descrobject.c',
    'exceptions.c',
    'structseq.c',
    '_warnings.c',

    # Control flow
    'frameobject.c',
    'genobject.c',
    'iterobject.c',

    # GC
    '_weakref.c',
    'weakrefobject.c',
    'gcmodule.c',

    # "Data types"
    # ('boolobject.c' and 'moduleobject.c' are left out: no defs)
    'cStringIO.c',
    'dictobject.c',
    'fileobject.c',
    'floatobject.c',
    'intobject.c',
    'listobject.c',
    'longobject.c',
    'setobject.c',
    'stringobject.c',
    'tupleobject.c',
    'sliceobject.c',
    'typeobject.c',

    # Builtins
    'bltinmodule.c',  # additional filters below
    # 'sysmodule.c' is not listed here; it is filtered below

    # Libraries
    'errnomodule.c',  # has no methods, but include it for completeness
    'fcntlmodule.c',
    'posixmodule.c',
    'pwdmodule.c',
    'readline.c',
    'resource.c',
    'signalmodule.c',
    'timemodule.c',
    'termios.c',
    'mathmodule.c',
]
287
288
class OilMethodFilter(object):
    """Predicate that decides which PyMethodDef entries are kept.

    Calling an instance with (rel_path, def_name, method_name) returns True
    to keep the entry and False to drop it.  Only the basename of rel_path
    is looked at; def_name is not used.

    Args:
      py_names: container of names, used only for membership tests.
    """

    # Dropped no matter which file defines them.
    _DROP_EVERYWHERE = (
        'count',  # False positive for {str,list,tuple}.count()
        'collect',  # False positive: pyannotate and gcmodule.c
    )

    # Kept no matter which file defines them (checked after the drops above).
    # Notes:
    # - __reduce__ and __setstate__ are for pickle.  And I think
    #   __getnewargs__.
    # - Do we need __sizeof__?  Is that for sys.getsizeof()?
    _KEEP_EVERYWHERE = (
        # enter/exit needed for 'with open'.  __length_hint__ is an
        # optimization.
        '__enter__', '__exit__', '__length_hint__',
        # 5/2022: avoid regression?  Not sure why this was getting deleted
        '__getitem__',
    )

    # basename -> methods that are always dropped from that file.
    _DROP_BY_FILE = {
        # NOTE: LoadYshGrammar needs marshal.loads().
        # False positive for yajl.dumps() and load()
        'marshal.c': ('dump', 'dumps', 'load'),
        # Auto-filtering gave false-positives here.
        # We don't need top-level next().  The method should be good enough.
        # iter is a field name
        'bltinmodule.c': ('compile', 'format', 'next', 'vars', 'iter', 'eval',
                          'bin'),
        '_warnings.c': ('warn',),
        'dictobject.c': ('iterkeys', 'itervalues', 'copy', 'fromkeys',
                         'popitem', 'setdefault'),
        'tupleobject.c': ('index',),
        # false positive from arg.translate
        'stringobject.c': ('translate',),
        # clear/remove: we're using dict.clear() and list.remove()
        'setobject.c': ('pop', 'copy', 'clear', 'remove'),
        'frozensetobject.c': ('copy',),
        'sliceobject.c': ('indices',),
        # Shadowed by fanos.send(), posix.close(), etc.
        'genobject.c': ('send', 'close'),
        # Shadowed: we're using list.remove()
        'posixmodule.c': ('remove',),
        # Name collisions
        'mathmodule.c': ('exp', 'log'),
    }

    # basename -> methods that are always kept from that file.  No method
    # appears in both tables for the same file.
    _KEEP_BY_FILE = {
        # Get "bootstrapping error" without this.
        'bltinmodule.c': ('__import__',),
        # This one is called from C.
        'signalmodule.c': ('default_int_handler',),
        # segfault without this
        'typeobject.c': ('__new__',),
    }

    def __init__(self, py_names):
        self.py_names = py_names

    def __call__(self, rel_path, def_name, method_name):
        if method_name in self._DROP_EVERYWHERE:
            return False
        if method_name in self._KEEP_EVERYWHERE:
            return True

        basename = os.path.basename(rel_path)

        if method_name in self._DROP_BY_FILE.get(basename, ()):
            return False
        if method_name in self._KEEP_BY_FILE.get(basename, ()):
            return True

        # Do custom filtering here.
        if basename == 'sysmodule.c':
            # displayhook and excepthook can't be removed or they cause
            # assertions!
            unused = method_name not in self.py_names
            if unused and method_name not in ('displayhook', 'excepthook'):
                return False

        if basename == 'descrobject.c':
            # 'keys' is apparently used for dir() on class namespace, as in
            # dir(Id).  Everything else in this file goes.
            return method_name == 'keys'

        # Try just filtering {time,pwd,posix}module.c, etc.
        if basename in MODULES_TO_FILTER and method_name not in self.py_names:
            return False

        # Methods from all other files are kept, even when the name never
        # appears in the .py source.  (Dropping those too was considered: if
        # a name doesn't appear in the .py source it can't be used, unless C
        # source does dynamic lookup -- but that check is not enabled.)
        return True
405
406
def main(argv):
    """Command-line entry point.

    argv[1] is the action:
      lex       print every token lexed from stdin (for debugging)
      audit     print the methods that survive filtering (for debugging)
      filter    write one filtered DEF_NAME.def file per method table, in a
                directory per C file under the output directory argv[3]
      tsv       print one tab-separated row per method, with a T/F column
                saying whether the filter keeps it
      filtered  print MODULES_TO_FILTER, one name per line; stdin is not read

    argv[2], if given, is the path of a file with one name per line.  Those
    names are handed to OilMethodFilter, which audit, filter and tsv use.

    Raises:
      RuntimeError: for a missing or invalid action, a missing output
        directory, or when a method has to be filtered but no names file was
        given.  Parse errors from Parser are RuntimeError as well.
    """
    if len(argv) < 2:
        raise RuntimeError(
            'Usage: cpython_defs.py ACTION [PY_NAMES_PATH [OUT_DIR]]')
    action = argv[1]

    # Reject a bad action up front, before blocking on stdin.
    if action not in ('lex', 'audit', 'filter', 'tsv', 'filtered'):
        raise RuntimeError('Invalid action %r' % action)

    if len(argv) > 2:
        with open(argv[2]) as f:
            py_names = set(line.strip() for line in f)
        method_filter = OilMethodFilter(py_names)
    else:
        # Fail with a readable message, and only if a method actually needs
        # to be filtered, instead of "'NoneType' object is not callable".
        def method_filter(rel_path, def_name, method_name):
            raise RuntimeError(
                'Action %r needs a PY_NAMES_PATH argument to filter %r' %
                (action, method_name))

    if action == 'filtered':
        tokens = None
    else:
        tokens = Lexer(C_DEF).Tokens(sys.stdin.read())

    if action == 'lex':  # for debugging
        # The token generator ends right after its EOF token.
        for id_, value, _ in tokens:
            print('%s\t%r' % (id_, value))

    elif action == 'audit':  # show after filtering, for debugging
        p = Parser(tokens)
        files = p.ParseStream()
        for rel_path, defs in files:
            print(rel_path)
            for def_name, entries in defs:
                print('\t' + def_name)
                for method_name, vals in entries:
                    if method_name is None:
                        continue
                    if not method_filter(rel_path, def_name, method_name):
                        continue
                    print('\t\t%s %s' % (method_name, vals))

    elif action == 'filter':  # for slimming the build down
        try:
            out_dir = argv[3]
        except IndexError:
            raise RuntimeError(
                "Action 'filter' requires an output directory argument")

        p = Parser(tokens)
        files = p.ParseStream()

        # Print to files.

        stats = {'num_methods': 0, 'num_defs': 0, 'num_filtered': 0}
        for rel_path, defs in files:
            # Make a directory for each .c file!  Each file is a def.
            c_dir = os.path.join(out_dir, rel_path)
            try:
                os.makedirs(c_dir)
            except OSError as e:
                if e.errno != errno.EEXIST:  # an existing directory is fine
                    raise

            for def_name, entries in defs:
                out_path = os.path.join(c_dir, '%s.def' % def_name)

                # TODO: Write a separate file here for each one.  We have to
                # include a different file at each definition.

                with open(out_path, 'w') as f:
                    print('// %s' % rel_path, file=f)
                    print('', file=f)
                    PrettyPrint(rel_path, def_name, entries, method_filter, f,
                                stats)

                stats['num_defs'] += 1
                log('Wrote %s', out_path)

        stats['num_left'] = stats['num_methods'] - stats['num_filtered']
        log('cpython_defs.py: Filtered %(num_filtered)d of %(num_methods)d methods, '
            'leaving %(num_left)d (from %(num_defs)d definitions)' % stats)

    elif action == 'tsv':
        p = Parser(tokens)
        files = p.ParseStream()
        header = [
            'file', 'def_name', 'py_method_name', 'c_symbol_name', 'flags',
            'used'
        ]
        print('\t'.join(header))
        for rel_path, defs in files:
            for def_name, entries in defs:
                for method_name, vals in entries:
                    if method_name is None:
                        continue
                    b = method_filter(rel_path, def_name, method_name)
                    used = 'T' if b else 'F'

                    # TODO: The c_symbol_name could be parsed better.  It
                    # sometimes has "(PyCFunction)" on the front of it.

                    row = [rel_path, def_name, method_name, vals[0], vals[1],
                           used]
                    print('\t'.join(row))

    elif action == 'filtered':
        for name in MODULES_TO_FILTER:
            print(name)
513
514
if __name__ == '__main__':
    # RuntimeError is the expected failure mode of main(): report it as a
    # one-line message on stderr and exit with status 1, without a traceback.
    # Any other exception propagates.
    try:
        main(sys.argv)
    except RuntimeError as err:
        print('FATAL: %s' % err, file=sys.stderr)
        sys.exit(1)