1 | #!/usr/bin/env python2
|
2 | """
|
3 | parse_cpython.py
|
4 | """
|
5 | from __future__ import print_function
|
6 |
|
7 | import errno
|
8 | import os
|
9 | import re
|
10 | import sys
|
11 |
|
12 | from mycpp.mylib import log
|
13 | # TODO: Could move these to a place where they don't depend on Oil
|
14 | from frontend.lexer_def import C, R
|
15 |
|
16 |
|
# Token table for the C source fed to this script.  Lexer.Tokens tries the
# entries top to bottom and takes the first one that matches, so the specific
# patterns must stay ahead of the catch-all 'Opaque' entry.  When a pattern
# has a capture group, the token's value is that group rather than the whole
# match.
C_DEF = [
    # Trivia: '#' through the end of the line, and runs of blanks/newlines.
    R(r'#.*', 'Comment'),
    R(r'[ \t\n]+', 'Whitespace'),

    # Opening of a method table; the captured group is the table's name.
    # This could be more space-insensitive.
    R(r'static.*PyMethodDef (.*)\[\]\s*=\s*', 'BeginDef'),

    # Punctuation.
    C(r'{', 'LBrace'),
    C(r'}', 'RBrace'),
    C(r',', 'Comma'),
    C(r';', 'Semi'),

    # String literal; the captured group leaves out the surrounding quotes.
    R(r'"([^"]*)"', 'Str'),

    # Marker that precedes the path of each input file (see
    # Parser.ParseHeader).
    C(r'FILE', 'FILE'),

    # Docstring wrapper: PyDoc_STR( ... )
    C(r'PyDoc_STR(', 'LDocStr'),
    C(r')', 'RDocStr'),

    # Catch-all: anything up to the next comma, closing brace or newline.
    R(r'[^,}\n]+', 'Opaque'),
]
|
33 |
|
34 |
|
35 | # NOTE: This is copied from osh/match.py because we don't have 're' there.
|
36 | def _CompileAll(pat_list):
|
37 | result = []
|
38 | for is_regex, pat, token_id in pat_list:
|
39 | if not is_regex:
|
40 | pat = re.escape(pat) # turn $ into \$
|
41 | result.append((re.compile(pat), token_id))
|
42 | return result
|
43 |
|
44 |
|
class Lexer(object):
    """First-match lexer driven by an ordered pattern table."""

    def __init__(self, pat_list):
        """
        Args:
          pat_list: (is_regex, pattern, token_id) entries in priority order;
            compiled once here with _CompileAll.
        """
        self.pat_list = _CompileAll(pat_list)

    def Tokens(self, s):
        """Yield (token_id, value, end_pos) for each token in s, then EOF.

        Patterns are tried in table order at the current position and the
        first one that consumes input wins.  If the winning pattern has a
        capture group, value is the text of group 1; otherwise it is the
        whole match.  end_pos is the offset just past the whole match.
        'Whitespace' tokens are dropped.  The final token is always
        ('EOF', '', -1).

        Raises:
          AssertionError: if no pattern consumes input at some position.
        """
        pos = 0
        n = len(s)
        while pos < n:
            for pat, id_ in self.pat_list:
                m = pat.match(s, pos)
                # A zero-width match cannot advance pos, so accepting it
                # would make this loop spin forever.  Treat it as "no match"
                # and let a later pattern (or the error below) handle it.
                if m is None or m.end() == pos:
                    continue

                # FIRST MATCH wins.
                start, end = m.span(1 if m.groups() else 0)
                pos = m.end()
                break
            else:
                raise AssertionError(
                    'no token matched at position %r: %r' % (pos, s[pos]))

            if id_ != 'Whitespace':
                yield id_, s[start:end], pos
        yield 'EOF', '', -1
|
70 |
|
71 |
|
class Parser(object):
    """Recursive-descent parser for C PyMethodDef initializer lists.

    Consumes an iterator of (token_id, token_value, position) triples and
    keeps one token of lookahead in self.tok_id / self.tok_val / self.pos.
    Malformed input is reported with RuntimeError.
    """

    def __init__(self, tokens):
        """
        Args:
          tokens: iterator of (token_id, token_value, position) triples that
            ends with an 'EOF' token.
        """
        self.tokens = tokens
        self.Next()  # prime the lookahead

    def Next(self):
        """Advance the lookahead to the next non-comment, non-blank token."""
        while True:
            # The next() builtin works on both Python 2 and Python 3
            # iterators; the .next() method exists only on Python 2.
            self.tok_id, self.tok_val, self.pos = next(self.tokens)
            if self.tok_id not in ('Comment', 'Whitespace'):
                break
        if 0:  # flip to 1 to trace tokens while debugging
            log('%s %r', self.tok_id, self.tok_val)

    def Eat(self, tok_id):
        """Check that the current token is tok_id, then advance past it.

        Raises:
          RuntimeError: if the current token has a different id.
        """
        if self.tok_id != tok_id:
            raise RuntimeError(
                'Expected %r, got %r %r (byte offset %d)' %
                (tok_id, self.tok_id, self.tok_val, self.pos))

        self.Next()

    def ParseName(self):
        """
        Name = Str | Opaque('NULL') | Opaque('0')

        Returns the string's value, or None for the NULL / 0 sentinel.
        """
        if self.tok_id == 'Str':
            name = self.tok_val
        elif self.tok_id == 'Opaque':
            # Raise instead of assert: this validates input, must survive
            # python -O, and should be reported like every other parse error.
            if self.tok_val not in ('NULL', '0'):
                raise RuntimeError(
                    'Expected NULL or 0 as a name, got %r (byte offset %d)' %
                    (self.tok_val, self.pos))
            name = None
        else:
            raise RuntimeError('Unexpected token %r' % self.tok_id)
        self.Next()
        return name

    def ParseVal(self):
        """
        Val = Str
            | Opaque
            | LDocStr Str+ RDocStr  # string concatenation happens

        Returns the token's value; for a docstring, the concatenation of all
        its string pieces.
        """
        if self.tok_id == 'LDocStr':
            self.Next()

            val = self.tok_val
            self.Eat('Str')  # at least one piece is required
            while self.tok_id == 'Str':
                val += self.tok_val
                self.Next()

            self.Eat('RDocStr')

        elif self.tok_id in ('Opaque', 'Str'):
            val = self.tok_val
            self.Next()

        else:
            raise RuntimeError('Unexpected token %r' % self.tok_id)

        return val

    def ParseItem(self):
        """
        Item = '{' Name (',' Val)* '}' ','?

        Returns (name, vals), where vals is the list of values that follow
        the name.  The list is empty for a bare sentinel such as {NULL}.
        """
        self.Eat('LBrace')
        name = self.ParseName()

        vals = []
        while self.tok_id == 'Comma':
            self.Next()
            vals.append(self.ParseVal())

        self.Eat('RBrace')

        if self.tok_id == 'Comma':  # Optional
            self.Next()

        return name, vals

    def ParseDef(self):
        """
        Def = BeginDef '{' Item* '}' ';'

        Returns (def_name, items), where items is a list of ParseItem()
        results.
        """
        def_name = self.tok_val
        self.Eat('BeginDef')
        self.Eat('LBrace')

        items = []
        while self.tok_id != 'RBrace':
            items.append(self.ParseItem())

        self.Next()  # the closing brace that ended the loop
        self.Eat('Semi')

        return (def_name, items)

    def ParseHeader(self):
        """
        Header = FILE Opaque

        Returns the Opaque token's value, i.e. the file's path.
        """
        self.Eat('FILE')
        path = self.tok_val
        self.Eat('Opaque')
        return path

    def ParseFile(self):
        """
        File = Header Def*

        Returns (path, defs), where defs is a list of ParseDef() results.
        """
        path = self.ParseHeader()
        defs = []
        while self.tok_id not in ('FILE', 'EOF'):
            defs.append(self.ParseDef())

        return path, defs

    def ParseStream(self):
        """
        Stream = File*

        Returns a list of ParseFile() results, one per FILE header.
        """
        files = []
        while self.tok_id != 'EOF':
            files.append(self.ParseFile())

        return files
|
197 |
|
198 |
|
def PrettyPrint(rel_path, def_name, entries, predicate, f, stats):
    """Write one PyMethodDef table to f, keeping only the accepted entries.

    Args:
      rel_path: path of the C file the table came from; passed to predicate.
      def_name: name of the table; printed and passed to predicate.
      entries: iterable of (entry_name, vals) pairs.  An entry_name of None
        is a null initializer and is always printed as {0}.
      predicate: callable(rel_path, def_name, entry_name); a false result
        drops the entry.
      f: file-like object to write to.
      stats: dict whose 'num_methods' and 'num_filtered' counters are
        incremented in place.
    """
    def emit(text, *fmt_args):
        # Only apply % formatting when arguments were given, so that literal
        # text passed on its own is written out untouched.
        if fmt_args:
            text = text % fmt_args
        print(text, file=f, end='')

    emit('static PyMethodDef %s[] = {\n', def_name)
    for entry_name, vals in entries:
        if entry_name is None:
            emit(' {0},\n')  # null initializer
        else:
            stats['num_methods'] += 1
            if predicate(rel_path, def_name, entry_name):
                # Reprint name, C function and flags; the docstring and
                # anything after it are left out.
                emit(' {"%s", ', entry_name)
                emit(vals[0])
                emit(', ')
                emit(vals[1])
                emit('},\n')
            else:
                stats['num_filtered'] += 1
    emit('};\n')
|
223 |
|
224 |
|
# Basenames of the C files whose method tables are filtered against the set
# of names used from Python (see OilMethodFilter).  The order is the order
# printed by the 'filtered' action.
MODULES_TO_FILTER = [
    # My Own
    'libc.c', 'fastlex.c', 'line_input.c',

    # marshal.c has additional filters in OilMethodFilter.
    # zipimport.c cannot be filtered because find_module is called from C!
    'import.c', 'marshal.c',

    # Types for Builtins
    'enumobject.c', 'rangeobject.c',

    # Interpreter types
    'descrobject.c', 'exceptions.c', 'structseq.c', '_warnings.c',

    # Control flow
    'frameobject.c', 'genobject.c', 'iterobject.c',

    # GC
    '_weakref.c', 'weakrefobject.c', 'gcmodule.c',

    # "Data types"  (boolobject.c and moduleobject.c have no defs)
    'cStringIO.c',
    'dictobject.c',
    'fileobject.c',
    'floatobject.c',
    'intobject.c',
    'listobject.c',
    'longobject.c',
    'setobject.c',
    'stringobject.c',
    'tupleobject.c',
    'sliceobject.c',
    'typeobject.c',

    # Builtins.  bltinmodule.c has additional filters in OilMethodFilter;
    # sysmodule.c is not listed because it is filtered there separately.
    'bltinmodule.c',

    # Libraries
    'errnomodule.c',  # has no methods, but include it for completeness
    'fcntlmodule.c',
    'posixmodule.c',
    'pwdmodule.c',
    'readline.c',
    'resource.c',
    'signalmodule.c',
    'timemodule.c',
    'termios.c',
    'mathmodule.c',
]
|
287 |
|
288 |
|
class OilMethodFilter(object):
    """Predicate that decides which C-level methods stay in the build.

    Calling an instance with (rel_path, def_name, method_name) returns True
    to keep the method and False to drop it.  Hand-written exceptions are
    consulted first; after that, methods from the files listed in
    MODULES_TO_FILTER are dropped unless their name is in py_names.
    """

    # Dropped whatever file they come from:
    #   count:   false positive for {str,list,tuple}.count()
    #   collect: false positive: pyannotate and gcmodule.c
    _NEVER_KEEP = ('count', 'collect')

    # Kept whatever file they come from:
    #   __enter__ / __exit__: needed for 'with open'
    #   __length_hint__: an optimization
    #   __getitem__: 5/2022: avoid regression?  Not sure why this was getting
    #     deleted
    # Notes:
    # - __reduce__ and __setstate__ are for pickle.  And I think
    #   __getnewargs__.
    # - Do we need __sizeof__?  Is that for sys.getsizeof()?
    _ALWAYS_KEEP = ('__enter__', '__exit__', '__length_hint__', '__getitem__')

    # (file basename, method names, verdict).  Scanned top to bottom; the
    # first row matching both the file and the method decides.
    _FILE_RULES = (
        # NOTE: LoadYshGrammar needs marshal.loads().
        # False positive for yajl.dumps() and load()
        ('marshal.c', ('dump', 'dumps', 'load'), False),

        # Auto-filtering gave false-positives here.
        # We don't need top-level next().  The method should be good enough.
        # iter is a field name
        ('bltinmodule.c',
         ('compile', 'format', 'next', 'vars', 'iter', 'eval', 'bin'), False),
        # Get "bootstrapping error" without this.
        ('bltinmodule.c', ('__import__',), True),

        ('_warnings.c', ('warn',), False),
        ('dictobject.c',
         ('iterkeys', 'itervalues', 'copy', 'fromkeys', 'popitem',
          'setdefault'), False),
        ('tupleobject.c', ('index',), False),
        # false positive from arg.translate
        ('stringobject.c', ('translate',), False),
        ('setobject.c', ('pop', 'copy'), False),
        ('frozensetobject.c', ('copy',), False),
        ('sliceobject.c', ('indices',), False),
        # Shadowed by fanos.send(), posix.close(), etc.
        ('genobject.c', ('send', 'close'), False),
        # Shadowed: we're using list.remove()
        ('posixmodule.c', ('remove',), False),
        # We're using dict.clear() and list.remove()
        ('setobject.c', ('clear', 'remove'), False),
        # This one is called from C.
        ('signalmodule.c', ('default_int_handler',), True),
        # Name collisions
        ('mathmodule.c', ('exp', 'log'), False),
        # segfault without this
        ('typeobject.c', ('__new__',), True),
    )

    # sysmodule.c methods that can't be removed or they cause assertions!
    _SYS_REQUIRED = ('displayhook', 'excepthook')

    def __init__(self, py_names):
        """
        Args:
          py_names: container of names, tested with 'in'; methods from the
            filtered files are kept only if their name is in it.
        """
        self.py_names = py_names

    def __call__(self, rel_path, def_name, method_name):
        """Return True to keep method_name, False to drop it.

        Only the basename of rel_path is consulted; def_name is not used.
        """
        basename = os.path.basename(rel_path)

        if method_name in self._NEVER_KEEP:
            return False
        if method_name in self._ALWAYS_KEEP:
            return True

        for c_file, names, verdict in self._FILE_RULES:
            if basename == c_file and method_name in names:
                return verdict

        # Do custom filtering here: sysmodule.c is not in MODULES_TO_FILTER.
        if basename == 'sysmodule.c':
            unused = method_name not in self.py_names
            if unused and method_name not in self._SYS_REQUIRED:
                return False

        if basename == 'descrobject.c':
            # 'keys' is apparently used for dir() on class namespace, as in
            # dir(Id).  Everything else in this file goes.
            return method_name == 'keys'

        # Try just filtering {time,pwd,posix}module.c, etc.
        if basename in MODULES_TO_FILTER and method_name not in self.py_names:
            return False

        # NOTE: methods from files outside MODULES_TO_FILTER are kept even
        # when their name is absent from py_names.  If a name doesn't appear
        # in the .py source, it can't be used.  (Exception: it could be used
        # in C source with dynamic lookup?  But I don't think CPython does
        # that.)
        return True
|
405 |
|
406 |
|
def main(argv):
    """Command-line entry point.

    Usage:
      <script> ACTION [PY_NAMES_PATH [OUT_DIR]]

    Every action except 'filtered' reads the C source to process from stdin.

    Actions:
      lex       print each token as ID<TAB>repr(value)  (for debugging)
      audit     print the methods that survive filtering  (for debugging)
      filter    write one <def_name>.def file per method table under
                OUT_DIR/<rel_path>/, with the filtered methods left out
      tsv       print one row per method, including whether it is used
      filtered  print the names in MODULES_TO_FILTER

    PY_NAMES_PATH is a file with one name per line; it is required by audit,
    filter and tsv.  OUT_DIR is required by filter.

    Raises:
      RuntimeError: on a missing or invalid argument, or on a parse error.
    """
    actions = ('lex', 'audit', 'filter', 'tsv', 'filtered')

    # Validate the command line up front, so that a typo is reported
    # immediately instead of after all of stdin has been read.
    if len(argv) < 2:
        raise RuntimeError('Expected an action: %s' % ', '.join(actions))
    action = argv[1]
    if action not in actions:
        raise RuntimeError('Invalid action %r' % action)
    if action in ('audit', 'filter', 'tsv') and len(argv) < 3:
        raise RuntimeError(
            'Action %r requires the path of a file of Python names' % action)
    if action == 'filter' and len(argv) < 4:
        raise RuntimeError("Action 'filter' requires an output directory")

    if action == 'filtered':
        for name in MODULES_TO_FILTER:
            print(name)
        return

    method_filter = None
    if len(argv) > 2:
        with open(argv[2]) as f:
            py_names = set(line.strip() for line in f)
        method_filter = OilMethodFilter(py_names)

    tokens = Lexer(C_DEF).Tokens(sys.stdin.read())

    if action == 'lex':  # for debugging
        # The token stream ends right after its EOF token, so plain
        # iteration prints everything up to and including EOF.
        for id_, value, _ in tokens:
            print('%s\t%r' % (id_, value))
        return

    files = Parser(tokens).ParseStream()

    if action == 'audit':  # show after filtering, for debugging
        for rel_path, defs in files:
            print(rel_path)
            for def_name, entries in defs:
                print('\t' + def_name)
                for method_name, vals in entries:
                    if method_name is None:
                        continue
                    if not method_filter(rel_path, def_name, method_name):
                        continue
                    print('\t\t%s %s' % (method_name, vals))

    elif action == 'filter':  # for slimming the build down
        out_dir = argv[3]

        stats = {'num_methods': 0, 'num_defs': 0, 'num_filtered': 0}
        for rel_path, defs in files:
            # Make a directory for each .c file!  Each file is a def.
            c_dir = os.path.join(out_dir, rel_path)
            try:
                os.makedirs(c_dir)
            except OSError as e:
                if e.errno != errno.EEXIST:  # already existing is fine
                    raise

            for def_name, entries in defs:
                out_path = os.path.join(c_dir, '%s.def' % def_name)

                # TODO: Write a separate file here for each one.  We have to
                # include a different file at each definition.

                with open(out_path, 'w') as f:
                    print('// %s' % rel_path, file=f)
                    print('', file=f)
                    PrettyPrint(rel_path, def_name, entries, method_filter, f,
                                stats)

                stats['num_defs'] += 1
                log('Wrote %s', out_path)

        stats['num_left'] = stats['num_methods'] - stats['num_filtered']
        log('cpython_defs.py: Filtered %(num_filtered)d of %(num_methods)d methods, '
            'leaving %(num_left)d (from %(num_defs)d definitions)' % stats)

    else:  # 'tsv'
        header = [
            'file', 'def_name', 'py_method_name', 'c_symbol_name', 'flags',
            'used'
        ]
        print('\t'.join(header))
        for rel_path, defs in files:
            for def_name, entries in defs:
                for method_name, vals in entries:
                    if method_name is None:
                        continue
                    b = method_filter(rel_path, def_name, method_name)
                    used = 'T' if b else 'F'

                    # TODO: The c_symbol_name could be parsed better.  It
                    # sometimes has "(PyCFunction)" on the front of it.

                    row = [rel_path, def_name, method_name, vals[0], vals[1],
                           used]
                    print('\t'.join(row))
|
513 |
|
514 |
|
if __name__ == '__main__':
    # RuntimeError is how this script signals an expected failure (bad
    # arguments, unparseable input): report it in one line on stderr and exit
    # with status 1.  Any other exception propagates with its traceback.
    try:
        main(sys.argv)
    except RuntimeError as err:
        print('FATAL: %s' % err, file=sys.stderr)
        raise SystemExit(1)
|