1 | """
|
2 | pass_state.py
|
3 | """
|
4 | from __future__ import print_function
|
5 |
|
6 | import os
|
7 | import re
|
8 | import subprocess
|
9 | from collections import defaultdict
|
10 |
|
11 | from mypy.types import Type
|
12 | from mypy.nodes import Expression
|
13 |
|
14 | from mycpp.util import join_name, log, split_py_name, SymbolPath
|
15 |
|
16 | from typing import Optional
|
17 |
|
18 | _ = log
|
19 |
|
20 |
|
21 | class ModuleMember(object):
|
22 | """
|
23 | A member of a Python module.
|
24 |
|
25 | e.g. core.state.Mem => core::state::Mem
|
26 | """
|
27 |
|
28 | def __init__(self, module_path: SymbolPath, member: str) -> None:
|
29 | self.module_path = module_path
|
30 | self.member = member
|
31 |
|
32 |
|
33 | class StaticObjectMember(object):
|
34 | """
|
35 | A static member of an object. Usually a a method like an alternative constructor.
|
36 |
|
37 | e.g. runtime_asdl.Cell.CreateNull() => runtime_asdl::Cell::CreateNull()
|
38 | """
|
39 |
|
40 | def __init__(self, base_type_name: SymbolPath, member: str) -> None:
|
41 | self.base_type_name = base_type_name
|
42 | self.member = member
|
43 |
|
44 |
|
45 | class HeapObjectMember(object):
|
46 | """
|
47 | A member of a heap-allocated object.
|
48 |
|
49 | e.g foo.empty() => foo->empty()
|
50 | """
|
51 |
|
52 | def __init__(self, object_expr: Expression, object_type: Type,
|
53 | member: str) -> None:
|
54 | self.object_expr = object_expr
|
55 | self.object_type = object_type
|
56 | self.member = member
|
57 |
|
58 |
|
59 | class StackObjectMember(object):
|
60 | """
|
61 | A member of a stack-allocated object.
|
62 |
|
63 | e.g foo.empty() => foo.empty()
|
64 | """
|
65 |
|
66 | def __init__(self, object_expr: Expression, object_type: Type,
|
67 | member: str) -> None:
|
68 | self.ojbect_expr = object_expr
|
69 | self.object_type = object_type
|
70 | self.member = member
|
71 |
|
72 |
|
73 | class Virtual(object):
|
74 | """Calculate which C++ methods need the virtual keyword.
|
75 |
|
76 | See unit test for example usage.
|
77 | """
|
78 |
|
79 | def __init__(self) -> None:
|
80 | self.methods: dict[SymbolPath, list[str]] = defaultdict(list)
|
81 | self.subclasses: dict[SymbolPath, list[tuple[str]]] = defaultdict(list)
|
82 | self.virtuals: dict[tuple[SymbolPath, str], Optional[tuple[SymbolPath,
|
83 | str]]] = {}
|
84 | self.has_vtable: dict[SymbolPath, bool] = {}
|
85 | self.can_reorder_fields: dict[SymbolPath, bool] = {}
|
86 |
|
87 | # _Executor -> vm::_Executor
|
88 | self.base_class_unique: dict[str, SymbolPath] = {}
|
89 |
|
90 | # These are called on the Forward Declare pass
|
91 | def OnMethod(self, class_name: SymbolPath, method_name: str) -> None:
|
92 | #log('OnMethod %s %s', class_name, method_name)
|
93 |
|
94 | # __init__ and so forth don't count
|
95 | if method_name.startswith('__') and method_name.endswith('__'):
|
96 | return
|
97 |
|
98 | self.methods[class_name].append(method_name)
|
99 |
|
100 | def OnSubclass(self, base_class: SymbolPath, subclass: SymbolPath) -> None:
|
101 | if len(base_class) > 1:
|
102 | # Hack for
|
103 | #
|
104 | # class _Executor: pass
|
105 | # versus
|
106 | # class MyExecutor(vm._Executor): pass
|
107 | base_key = base_class[-1]
|
108 |
|
109 | # Fail if we have two base classes in different namespaces with the same
|
110 | # name.
|
111 | if base_key in self.base_class_unique:
|
112 | # Make sure we don't have collisions
|
113 | assert (self.base_class_unique[base_key] == base_class or
|
114 | base_class
|
115 | in self.subclasses[self.base_class_unique[base_key]]
|
116 | ), base_class
|
117 | else:
|
118 | self.base_class_unique[base_key] = base_class
|
119 |
|
120 | else:
|
121 | base_key = base_class
|
122 |
|
123 | self.subclasses[base_class].append(subclass)
|
124 |
|
125 | def Calculate(self) -> None:
|
126 | """Call this after the forward declare pass."""
|
127 | for base_class, subclasses in self.subclasses.items():
|
128 | self.can_reorder_fields[base_class] = False
|
129 |
|
130 | for subclass in subclasses:
|
131 | self.can_reorder_fields[subclass] = False
|
132 |
|
133 | b_methods = self.methods[base_class]
|
134 | s_methods = self.methods[subclass]
|
135 | overlapping = set(b_methods) & set(s_methods)
|
136 | for method in overlapping:
|
137 | self.virtuals[(base_class, method)] = None
|
138 | self.virtuals[(subclass, method)] = (base_class, method)
|
139 | if overlapping:
|
140 | self.has_vtable[base_class] = True
|
141 | self.has_vtable[subclass] = True
|
142 |
|
143 | # These is called on the Decl pass
|
144 | def IsVirtual(self, class_name: SymbolPath, method_name: str) -> bool:
|
145 | return (class_name, method_name) in self.virtuals
|
146 |
|
147 | def HasVTable(self, class_name: SymbolPath) -> bool:
|
148 | return class_name in self.has_vtable
|
149 |
|
150 | def CanReorderFields(self, class_name: SymbolPath) -> bool:
|
151 | if class_name in self.can_reorder_fields:
|
152 | return self.can_reorder_fields[class_name]
|
153 | else:
|
154 | return True # by default they can be reordered
|
155 |
|
156 |
|
157 | def SymbolPathToReference(func: str, p: SymbolPath) -> str:
|
158 | if len(p) > 1:
|
159 | return '$ObjectMember({}, {})'.format(join_name(p[:-1], delim='.'),
|
160 | p[-1])
|
161 |
|
162 | return '$LocalVariable({}, {})'.format(func, p[0])
|
163 |
|
164 |
|
165 | class Fact(object):
|
166 | """
|
167 | An abstract fact. These can be used to build up datalog programs.
|
168 | """
|
169 |
|
170 | def __init__(self) -> None:
|
171 | pass
|
172 |
|
173 | @staticmethod
|
174 | def name() -> str:
|
175 | raise NotImplementedError()
|
176 |
|
177 | def Generate(self, func: str, statement: int) -> str:
|
178 | raise NotImplementedError()
|
179 |
|
180 |
|
181 | class FunctionCall(Fact):
|
182 |
|
183 | def __init__(self, callee: str) -> None:
|
184 | self.callee = callee
|
185 |
|
186 | @staticmethod
|
187 | def name() -> str:
|
188 | return 'call'
|
189 |
|
190 | def Generate(self, func: str, statement: int) -> str:
|
191 | return '{}\t{}\t{}\n'.format(func, statement, self.callee)
|
192 |
|
193 |
|
194 | class Definition(Fact):
|
195 | """
|
196 | The definition of a variable. This corresponds to an allocation.
|
197 | """
|
198 |
|
199 | def __init__(self, ref: SymbolPath, obj: str) -> None:
|
200 | self.ref = ref
|
201 | self.obj = obj
|
202 |
|
203 | @staticmethod
|
204 | def name() -> str:
|
205 | return 'assign'
|
206 |
|
207 | def Generate(self, func: str, statement: int) -> str:
|
208 | return '{}\t{}\t{}\t{}\n'.format(func, statement,
|
209 | SymbolPathToReference(func, self.ref),
|
210 | self.obj)
|
211 |
|
212 |
|
213 | class Assignment(Fact):
|
214 | """
|
215 | The assignment of one variable or object member to another.
|
216 | """
|
217 |
|
218 | def __init__(self, lhs: SymbolPath, rhs: SymbolPath) -> None:
|
219 | self.lhs = lhs
|
220 | self.rhs = rhs
|
221 |
|
222 | @staticmethod
|
223 | def name() -> str:
|
224 | return 'assign'
|
225 |
|
226 | def Generate(self, func: str, statement: int) -> str:
|
227 | return '{}\t{}\t{}\t$Ref({})\n'.format(
|
228 | func, statement, SymbolPathToReference(func, self.lhs),
|
229 | SymbolPathToReference(func, self.rhs))
|
230 |
|
231 |
|
232 | class Use(Fact):
|
233 | """
|
234 | The use of a reference.
|
235 |
|
236 | In the last assignment below, we would emit Use(foo) and Use(x). We would,
|
237 | however, not emit Use(foo.a) since it is an lvalue and would instead be
|
238 | covered by the Assign fact. Similarly, the first two assignments do not
|
239 | generate Use facts.
|
240 |
|
241 | foo = Foo()
|
242 | x = Bar()
|
243 | foo.a = x
|
244 |
|
245 | Any time a reference appears in an expression (or expression-statement) it
|
246 | will be considered used.
|
247 |
|
248 | some_function(a) => Use(a)
|
249 | a + b => Use(a), Use(b)
|
250 | print(thing.dict[key]) => Use(thing), Use(thing.dict), Use(key)
|
251 | obj.func() => Use(obj)
|
252 | """
|
253 |
|
254 | def __init__(self, ref: SymbolPath) -> None:
|
255 | self.ref = ref
|
256 |
|
257 | @staticmethod
|
258 | def name() -> str:
|
259 | return 'use'
|
260 |
|
261 | def Generate(self, func: str, statement: int) -> str:
|
262 | return '{}\t{}\t{}\n'.format(func, statement,
|
263 | SymbolPathToReference(func, self.ref))
|
264 |
|
265 |
|
266 | class Bind(Fact):
|
267 | """
|
268 | Binding a reference to a positional function parameter.
|
269 | """
|
270 |
|
271 | def __init__(self, ref: SymbolPath, callee: SymbolPath,
|
272 | arg_pos: int) -> None:
|
273 | self.ref = ref
|
274 | self.callee = callee
|
275 | self.arg_pos = arg_pos
|
276 |
|
277 | @staticmethod
|
278 | def name() -> str:
|
279 | return 'bind'
|
280 |
|
281 | def Generate(self, func: str, statement: int) -> str:
|
282 | return '{}\t{}\t{}\t{}\t{}\n'.format(
|
283 | func, statement, SymbolPathToReference(func, self.ref),
|
284 | join_name(self.callee, delim='.'), self.arg_pos)
|
285 |
|
286 |
|
287 | class ControlFlowGraph(object):
|
288 | """
|
289 | A simple control-flow graph.
|
290 |
|
291 | Every statement in the program is represented as a node in a graph with
|
292 | unique a numeric ID. Control flow is represented as directed edges through
|
293 | the graph. Loops can introduce back-edges. Every node in the graph will
|
294 | satisfy at least one of the following conditions:
|
295 |
|
296 | - Its indegree is at least one.
|
297 |
|
298 | - Its outdegree is at least one.
|
299 |
|
300 | For simple linear graphs all you need is the AddStatement method. For more
|
301 | complex flows there is a set of context managers below to help simplify
|
302 | construction.
|
303 |
|
304 | - For branches-like statements (e.g. if- and try- statements) use
|
305 | CfgBranchContext. It will take care of the details associated with
|
306 | stitching the different branches to statements in the next statement.
|
307 |
|
308 | - For loops, use CfgLoopContext. It will take care of adding back-edges
|
309 | and connecting break statements to any statements that proceed the
|
310 | loop.
|
311 |
|
312 | - CfgBlockContext can be used for simple cases where you just want to
|
313 | track the beginning and end of a sequence of statements.
|
314 |
|
315 | Statements can carry annotations called facts, which are used as inputs to
|
316 | datalog programs to perform dataflow diffrent kinds of dataflow analyses.
|
317 | To annotate a statement, use the AddFact method with any object that
|
318 | implements the Fact interface.
|
319 |
|
320 | See the unit tests in pass_state_test.py and the mycpp phase in
|
321 | control_flow_pass.py for detailed examples of usage.
|
322 | """
|
323 |
|
324 | def __init__(self) -> None:
|
325 | self.statement_counter: int = 0
|
326 | self.edges: set[tuple[int, int]] = set({})
|
327 | self.block_stack: list[int] = []
|
328 | self.predecessors: set[int] = set({})
|
329 | self.deadends: set[int] = set({})
|
330 |
|
331 | # order doesn't actually matter here, but sets require elements to be
|
332 | # hashable
|
333 | self.facts: dict[int, list[Fact]] = defaultdict(list)
|
334 |
|
335 | def AddEdge(self, pred: int, succ: int) -> None:
|
336 | """
|
337 | Add a directed edge from pred to succ. If pred is a deadend, its
|
338 | non-deadends will be used instead.
|
339 | """
|
340 | if pred in self.deadends:
|
341 | for w in [u for (u, v) in self.edges if v == pred]:
|
342 | self.AddEdge(w, succ)
|
343 | else:
|
344 | self.edges.add((pred, succ))
|
345 |
|
346 | def AddDeadend(self, statement: int):
|
347 | """
|
348 | Mark a statement as a dead-end (e.g. return or continue).
|
349 | """
|
350 | self.deadends.add(statement)
|
351 |
|
352 | def AddStatement(self) -> int:
|
353 | """
|
354 | Add a new statement and return its ID.
|
355 | """
|
356 | if len(self.predecessors) == 0:
|
357 | if len(self.block_stack):
|
358 | self.predecessors.add(self.block_stack[-1])
|
359 | else:
|
360 | self.predecessors.add(self.statement_counter)
|
361 |
|
362 | self.statement_counter += 1
|
363 | for pred in self.predecessors:
|
364 | self.AddEdge(pred, self.statement_counter)
|
365 |
|
366 | self.predecessors = set({})
|
367 |
|
368 | if len(self.block_stack):
|
369 | self.block_stack[-1] = self.statement_counter
|
370 |
|
371 | return self.statement_counter
|
372 |
|
373 | def AddFact(self, statement: int, fact: Fact) -> None:
|
374 | """
|
375 | Annotate a statement with a fact.
|
376 | """
|
377 | self.facts[statement].append(fact)
|
378 |
|
379 | def _PushBlock(self, begin: Optional[int] = None) -> int:
|
380 | """
|
381 | Start a block at the given statement ID. If a beginning statement isn't
|
382 | provided one will be created and its ID will be returend.
|
383 |
|
384 | Direct use of this function is discouraged. Consider using one of the
|
385 | block context managers below instead.
|
386 | """
|
387 | if begin is None:
|
388 | begin = self.AddStatement()
|
389 | else:
|
390 | self.predecessors.add(begin)
|
391 |
|
392 | self.block_stack.append(begin)
|
393 | return begin
|
394 |
|
395 | def _PopBlock(self) -> int:
|
396 | """
|
397 | Pop a block from the top of the stack and return the ID of the block's
|
398 | last statement.
|
399 |
|
400 | Direct use of this function is discouraged. Consider using one of the
|
401 | block context managers below instead.
|
402 | """
|
403 | assert len(self.block_stack)
|
404 | last = self.block_stack.pop()
|
405 | if len(self.block_stack) and last not in self.deadends:
|
406 | self.block_stack[-1] = last
|
407 |
|
408 | return last
|
409 |
|
410 |
|
411 | class CfgBlockContext(object):
|
412 | """
|
413 | Context manager to make dealing with things like with-statements easier.
|
414 | """
|
415 |
|
416 | def __init__(self,
|
417 | cfg: ControlFlowGraph,
|
418 | begin: Optional[int] = None) -> None:
|
419 | self.cfg = cfg
|
420 | if cfg is None:
|
421 | return
|
422 |
|
423 | self.entry = self.cfg._PushBlock(begin)
|
424 | self.exit = self.entry
|
425 |
|
426 | def __enter__(self) -> None:
|
427 | return self if self.cfg else None
|
428 |
|
429 | def __exit__(self, *args) -> None:
|
430 | if not self.cfg:
|
431 | return
|
432 |
|
433 | self.exit = self.cfg._PopBlock()
|
434 |
|
435 |
|
436 | class CfgBranchContext(object):
|
437 | """
|
438 | Context manager to make dealing with if-else blocks easier.
|
439 | """
|
440 |
|
441 | def __init__(self, cfg: ControlFlowGraph, branch_point: int) -> None:
|
442 | self.cfg = cfg
|
443 | self.entry = branch_point
|
444 | self.exit = self.entry
|
445 | if cfg is None:
|
446 | return
|
447 |
|
448 | self.arms = []
|
449 | self.pushed = False
|
450 |
|
451 | def AddBranch(self, entry: Optional[int] = None):
|
452 | if not self.cfg:
|
453 | return CfgBranchContext(None, None)
|
454 |
|
455 | self.arms.append(CfgBranchContext(self.cfg, entry or self.entry))
|
456 | self.cfg._PushBlock(self.arms[-1].entry)
|
457 | self.arms[-1].pushed = True
|
458 | return self.arms[-1]
|
459 |
|
460 | def __enter__(self) -> None:
|
461 | return self
|
462 |
|
463 | def __exit__(self, *args) -> None:
|
464 | if not self.cfg:
|
465 | return
|
466 |
|
467 | if self.pushed:
|
468 | self.exit = self.cfg._PopBlock()
|
469 |
|
470 | for arm in self.arms:
|
471 | if arm.exit not in self.cfg.deadends:
|
472 | self.cfg.predecessors.add(arm.exit)
|
473 |
|
474 |
|
475 | class CfgLoopContext(object):
|
476 | """
|
477 | Context manager to make dealing with loops easier.
|
478 | """
|
479 |
|
480 | def __init__(self,
|
481 | cfg: ControlFlowGraph,
|
482 | entry: Optional[int] = None) -> None:
|
483 | self.cfg = cfg
|
484 | self.breaks = set({})
|
485 | if cfg is None:
|
486 | return
|
487 |
|
488 | self.entry = self.cfg._PushBlock(entry)
|
489 | self.exit = self.entry
|
490 |
|
491 | def AddBreak(self, statement: int) -> None:
|
492 | assert self.cfg
|
493 | self.breaks.add(statement)
|
494 | self.cfg.AddDeadend(statement)
|
495 |
|
496 | def AddContinue(self, statement: int) -> None:
|
497 | self.cfg.AddEdge(statement, self.entry)
|
498 | self.cfg.AddDeadend(statement)
|
499 |
|
500 | def __enter__(self) -> None:
|
501 | return self if self.cfg else None
|
502 |
|
503 | def __exit__(self, *args) -> None:
|
504 | if not self.cfg:
|
505 | return
|
506 |
|
507 | self.exit = self.cfg._PopBlock()
|
508 | self.cfg.AddEdge(self.exit, self.entry)
|
509 | for pred in self.cfg.predecessors:
|
510 | self.cfg.AddEdge(pred, self.entry)
|
511 |
|
512 | # If we had any breaks, arm the predecessor set with the current
|
513 | # statement and the break statements.
|
514 | if len(self.breaks):
|
515 | if len(self.cfg.block_stack):
|
516 | self.cfg.predecessors.add(self.cfg.block_stack[-1])
|
517 | else:
|
518 | self.cfg.predecessors.add(self.cfg.statement_counter)
|
519 |
|
520 | for b in self.breaks:
|
521 | self.cfg.deadends.remove(b)
|
522 | self.cfg.predecessors.add(b)
|
523 |
|
524 |
|
525 | class StackRoots(object):
|
526 | """
|
527 | Output of the souffle stack roots solver.
|
528 | """
|
529 |
|
530 | def __init__(self, tuples: set[tuple[SymbolPath, SymbolPath]]) -> None:
|
531 | self.root_tuples = tuples
|
532 |
|
533 | def needs_root(self, func: SymbolPath, reference: SymbolPath) -> bool:
|
534 | """
|
535 | Returns true if the given reference should have a stack root.
|
536 | """
|
537 | return (func, reference) in self.root_tuples
|
538 |
|
539 |
|
540 | def DumpControlFlowGraphs(cfgs: dict[str, ControlFlowGraph],
|
541 | facts_dir='_tmp/mycpp-facts') -> None:
|
542 | """
|
543 | Dump the given control flow graphs and associated facts into the given
|
544 | directory as text files that can be consumed by datalog.
|
545 | """
|
546 | edge_facts = '{}/cf_edge.facts'.format(facts_dir)
|
547 |
|
548 | os.makedirs(facts_dir, exist_ok=True)
|
549 | # Open files for all facts that we might emit even if we don't end up having
|
550 | # anything to write to them. Souffle will complain if it can't find the file
|
551 | # for anything marked as an input.
|
552 | fact_files = {
|
553 | fact_type.name():
|
554 | open('{}/{}.facts'.format(facts_dir, fact_type.name()), 'w')
|
555 | for fact_type in Fact.__subclasses__()
|
556 | }
|
557 | with open(edge_facts, 'w') as cfg_f:
|
558 | for func, cfg in sorted(cfgs.items()):
|
559 | joined = join_name(func, delim='.')
|
560 | for (u, v) in sorted(cfg.edges):
|
561 | cfg_f.write('{}\t{}\t{}\n'.format(joined, u, v))
|
562 |
|
563 | for statement, facts in sorted(cfg.facts.items()):
|
564 | for fact in facts: # already sorted temporally
|
565 | fact_f = fact_files[fact.name()]
|
566 | fact_f.write(fact.Generate(joined, statement))
|
567 |
|
568 | for f in fact_files.values():
|
569 | f.close()
|
570 |
|
571 |
|
572 | def ComputeMinimalStackRoots(cfgs: dict[str, ControlFlowGraph],
|
573 | souffle_dir: str = '_tmp') -> StackRoots:
|
574 | """
|
575 | Run the the souffle stack roots solver and translate its output in a format
|
576 | that can be queried by cppgen_pass.
|
577 | """
|
578 | facts_dir = '{}/facts'.format(souffle_dir)
|
579 | os.makedirs(facts_dir)
|
580 | output_dir = '{}/outputs'.format(souffle_dir)
|
581 | os.makedirs(output_dir)
|
582 | DumpControlFlowGraphs(cfgs, facts_dir=facts_dir)
|
583 |
|
584 | subprocess.check_call([
|
585 | '_bin/datalog/dataflow',
|
586 | '-F',
|
587 | facts_dir,
|
588 | '-D',
|
589 | output_dir,
|
590 | ])
|
591 |
|
592 | tuples: set[tuple[SymbolPath, SymbolPath]] = set({})
|
593 | with open('{}/stack_root_vars.tsv'.format(output_dir), 'r') as roots_f:
|
594 | pat = re.compile(r'\$(.*)\((.*), (.*)\)')
|
595 | for line in roots_f:
|
596 | function, ref = line.split('\t')
|
597 | m = pat.match(ref)
|
598 | assert m.group(1) in ('LocalVariable', 'ObjectMember')
|
599 | if m.group(1) == 'LocalVariable':
|
600 | _, ref_func, var_name = m.groups()
|
601 | assert ref_func == function
|
602 | tuples.add((split_py_name(function), (var_name, )))
|
603 |
|
604 | if m.group(1) == 'ObjectMember':
|
605 | _, base_obj, member_name = m.groups()
|
606 | tuples.add((split_py_name(function),
|
607 | split_py_name(base_obj) + (member_name, )))
|
608 |
|
609 | return StackRoots(tuples)
|