Skip to content

Commit ac468c8

Browse files
yoff and Copilot committed
Python: extend new SSA with ESSA-shaped adapter + baseline comparison test
Phase 0.5 - Adapter API on top of the shared SSA: Adds the legacy-ESSA-shaped class hierarchy that the dataflow library consumes, layered on the shared 'Ssa::Make' instantiation: * EssaDefinition / EssaNodeDefinition: the latter exposes 'getDefiningNode()' (the CFG node at the def's index in its BB) and 'getVariable()' / 'getScope()'. * AssignmentDefinition: matches Assign, AnnAssign with value, AssignExpr and AugAssign target Names. Exposes 'getValue()' pointing at the RHS' CFG node. * ParameterDefinition: matches when the defining Name is in parameter context. * WithDefinition: matches 'with ... as x:' bindings. * ScopeEntryDefinition: implicit entry defs at synthetic position '-1' of the scope's entry basic block (non-local / global / builtin / captured reads). * PhiFunction (alias for PhiNode). * EssaVariable adapter wrapping a 'Ssa::Definition' with 'getAUse()', 'getDefinition()', 'getAnUltimateDefinition()', and 'getName()'. * AdjacentUses module with 'firstUse' and 'adjacentUseUse' predicates bridging to 'Ssa::firstUse' / 'Ssa::adjacentUseUse'. This is the minimum API the new dataflow's internals call into. The richer legacy ESSA (refinement nodes, attribute refinements, edge refinements) stays in 'semmle.python.essa.Essa' for legacy code. Phase 0.6 - Comparison test: Adds 'dataflow-new-ssa-vs-legacy/CmpTest.ql' that snapshots the difference between definitions produced by new SSA vs legacy ESSA on the same Python source. Baseline output records the current 'def-only-old' mismatches, grouped by category: * function/class/global definitions with no in-scope read (intentional; SSA is liveness-pruned) * captured / closure variables (real gap in new SSA - no closure-capture handling yet) * module variables __name__ / __package__ / $ (legacy ESSA implicit bindings) * exception 'as' bindings (depend on raise modelling) Zero 'def-only-new' mismatches: the new SSA never produces a spurious definition compared to legacy ESSA on this corpus. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 8790f63 commit ac468c8

4 files changed

Lines changed: 326 additions & 0 deletions

File tree

python/ql/lib/semmle/python/dataflow/new/internal/SsaImpl.qll

Lines changed: 194 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,3 +177,197 @@ final class WriteDefinition = Ssa::WriteDefinition;
177177
final class UncertainWriteDefinition = Ssa::UncertainWriteDefinition;
178178

179179
final class PhiNode = Ssa::PhiNode;
180+
181+
// ===========================================================================
182+
// ESSA-shaped adapter layer
183+
//
184+
// The dataflow library (`python/ql/lib/semmle/python/dataflow/new/`) and
185+
// related modules (`ApiGraphs.qll`, etc.) consume the legacy ESSA API
186+
// (`EssaVariable`, `EssaDefinition`, `AssignmentDefinition`,
187+
// `ScopeEntryDefinition`, `ParameterDefinition`, `WithDefinition`,
188+
// `PhiFunction`, plus the `AdjacentUses` module). To migrate them off
189+
// the legacy CFG, we expose the same API surface on top of the
190+
// shared SSA built above.
191+
//
192+
// This adapter is intentionally narrow: it covers only the predicates
193+
// that new dataflow consumes. The richer legacy ESSA — refinement
194+
// nodes, attribute refinements, edge refinements — stays available
195+
// via `semmle.python.essa.Essa` for points-to / legacy code.
196+
// ===========================================================================
197+
/**
 * Gets the CFG node sitting at the position where the write definition
 * `def` is recorded.
 *
 * `def` is located at some `(basic block, index)` pair; the result is the
 * `Cfg::ControlFlowNode` found at that index of that block. Only
 * `Ssa::WriteDefinition`s are accepted, so phi definitions never reach
 * this predicate. There is no result when the block has no node at the
 * definition's index.
 */
private Cfg::ControlFlowNode writeDefNode(Ssa::WriteDefinition def) {
  exists(CfgImpl::BasicBlock block, int index | def.definesAt(_, block, index) |
    block.getNode(index) = result
  )
}
210+
211+
/**
212+
* A write definition whose binding has a corresponding CFG node — i.e.
213+
* everything that's not a phi node. Mirrors legacy ESSA's
214+
* `EssaNodeDefinition`.
215+
*/
216+
class EssaNodeDefinition extends Ssa::WriteDefinition {
217+
/** Gets the CFG node where this definition's binding takes place. */
218+
Cfg::ControlFlowNode getDefiningNode() { result = writeDefNode(this) }
219+
220+
/** Gets the variable defined here (legacy name). */
221+
SsaSourceVariable getVariable() { result = this.getSourceVariable() }
222+
223+
/** Gets the enclosing scope. */
224+
Py::Scope getScope() {
225+
exists(Cfg::ControlFlowNode n | n = this.getDefiningNode() | result = n.getScope())
226+
}
227+
}
228+
229+
/**
230+
* An assignment definition `x = e`. The defining node is `x`'s CFG
231+
* node; the value is `e`'s CFG node.
232+
*/
233+
class AssignmentDefinition extends EssaNodeDefinition {
234+
AssignmentDefinition() {
235+
exists(Cfg::NameNode n | n = this.getDefiningNode() |
236+
exists(Py::Assign a | a.getATarget() = n.getNode())
237+
or
238+
exists(Py::AnnAssign a | a.getTarget() = n.getNode() and exists(a.getValue()))
239+
or
240+
exists(Py::AssignExpr a | a.getTarget() = n.getNode())
241+
or
242+
exists(Py::AugAssign a | a.getTarget() = n.getNode())
243+
)
244+
}
245+
246+
/** Gets the CFG node for the value being assigned, if statically known. */
247+
Cfg::ControlFlowNode getValue() {
248+
exists(Cfg::NameNode target | target = this.getDefiningNode() |
249+
exists(Py::Assign a |
250+
a.getATarget() = target.getNode() and
251+
result.getNode() = a.getValue()
252+
)
253+
or
254+
exists(Py::AnnAssign a |
255+
a.getTarget() = target.getNode() and
256+
result.getNode() = a.getValue()
257+
)
258+
or
259+
exists(Py::AssignExpr a |
260+
a.getTarget() = target.getNode() and
261+
result.getNode() = a.getValue()
262+
)
263+
)
264+
}
265+
}
266+
267+
/**
268+
* A parameter definition — the binding of a parameter name in a
269+
* function's scope.
270+
*/
271+
class ParameterDefinition extends EssaNodeDefinition {
272+
ParameterDefinition() { this.getDefiningNode().isParameter() }
273+
274+
/** Gets the AST `Parameter` (a `Py::Name` in param context). */
275+
Py::Name getParameter() { result = this.getDefiningNode().getNode() }
276+
}
277+
278+
/**
279+
* A definition introduced by a `with ... as x:` clause.
280+
*/
281+
class WithDefinition extends EssaNodeDefinition {
282+
WithDefinition() {
283+
exists(Cfg::NameNode n, Py::With w |
284+
n = this.getDefiningNode() and
285+
w.getOptionalVars() = n.getNode()
286+
)
287+
}
288+
}
289+
290+
/**
291+
* An implicit entry definition for a non-local / captured / global /
292+
* builtin variable read in a scope but not defined there.
293+
*/
294+
class ScopeEntryDefinition extends Ssa::Definition {
295+
ScopeEntryDefinition() {
296+
exists(CfgImpl::BasicBlock bb |
297+
this.definesAt(_, bb, -1) and
298+
bb instanceof CfgImpl::Cfg::EntryBasicBlock
299+
)
300+
}
301+
302+
/** Gets the variable being entered. */
303+
SsaSourceVariable getVariable() { result = this.getSourceVariable() }
304+
305+
/** Gets the enclosing scope. */
306+
Py::Scope getScope() {
307+
exists(CfgImpl::BasicBlock bb |
308+
this.definesAt(_, bb, -1) and
309+
result = this.getSourceVariable().getVariable().getScope()
310+
)
311+
}
312+
}
313+
314+
/**
 * A phi node. This is an alias of `PhiNode`, exposed under the name
 * `PhiFunction` to match legacy naming.
 */
315+
class PhiFunction = PhiNode;
316+
317+
/**
 * The legacy-shaped name for an ESSA definition. This is an alias of
 * `Ssa::Definition`, so it covers every definition of the shared SSA.
 */
318+
class EssaDefinition = Ssa::Definition;
319+
320+
/**
321+
* An adapter representing a single SSA-defined "variable" — wrapping
322+
* one `Ssa::Definition`. Mirrors legacy `EssaVariable` API.
323+
*/
324+
class EssaVariable extends Ssa::Definition {
325+
/** Gets the underlying SSA definition (legacy name). */
326+
Ssa::Definition getDefinition() { result = this }
327+
328+
/** Gets a CFG node where this definition is used. */
329+
Cfg::NameNode getAUse() {
330+
exists(CfgImpl::BasicBlock bb, int i |
331+
Ssa::ssaDefReachesRead(this.getSourceVariable(), this, bb, i) and
332+
bb.getNode(i) = result
333+
)
334+
}
335+
336+
/** Gets the (textual) name of the underlying variable. */
337+
string getName() { result = this.getSourceVariable().getVariable().getId() }
338+
339+
/** Gets an ultimate non-phi ancestor of this definition. */
340+
EssaVariable getAnUltimateDefinition() {
341+
if this instanceof PhiNode
342+
then
343+
exists(Ssa::Definition input |
344+
Ssa::phiHasInputFromBlock(this, input, _) and
345+
result = input.(EssaVariable).getAnUltimateDefinition()
346+
)
347+
else result = this
348+
}
349+
}
350+
351+
/**
352+
* Adjacent use-use and def-use relations exposed by the shared SSA
353+
* library. Provides the same interface as legacy
354+
* `semmle.python.essa.SsaCompute::AdjacentUses`.
355+
*/
356+
module AdjacentUses {
357+
/** Holds if `nodeFrom` and `nodeTo` are adjacent uses of the same SSA variable. */
358+
predicate adjacentUseUse(Cfg::NameNode nodeFrom, Cfg::NameNode nodeTo) {
359+
exists(SsaSourceVariable v, CfgImpl::BasicBlock bb1, int i1, CfgImpl::BasicBlock bb2, int i2 |
360+
Ssa::adjacentUseUse(bb1, i1, bb2, i2, v, _) and
361+
nodeFrom = bb1.getNode(i1) and
362+
nodeTo = bb2.getNode(i2)
363+
)
364+
}
365+
366+
/** Holds if `use` is a first use of definition `def`. */
367+
predicate firstUse(Ssa::Definition def, Cfg::NameNode use) {
368+
exists(CfgImpl::BasicBlock bb, int i |
369+
Ssa::firstUse(def, bb, i, _) and
370+
use = bb.getNode(i)
371+
)
372+
}
373+
}
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
| def-only-old | $:0:0 |
2+
| def-only-old | GLOBAL:49:1 |
3+
| def-only-old | GLOBAL:52:1 |
4+
| def-only-old | __name__:0:0 |
5+
| def-only-old | __package__:0:0 |
6+
| def-only-old | closure:31:5 |
7+
| def-only-old | e:37:1 |
8+
| def-only-old | e:40:25 |
9+
| def-only-old | exception_binding:37:5 |
10+
| def-only-old | if_else_branch:12:5 |
11+
| def-only-old | kwargs:27:32 |
12+
| def-only-old | loop:20:5 |
13+
| def-only-old | parameter:27:5 |
14+
| def-only-old | read_global:52:5 |
15+
| def-only-old | reassignment:6:5 |
16+
| def-only-old | simple_assign:1:5 |
17+
| def-only-old | with_binding:44:5 |
18+
| def-only-old | x:20:1 |
19+
| def-only-old | x:31:13 |
20+
| def-only-old | x:32:5 |
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
/**
2+
* Compares the new-CFG SSA against the legacy ESSA on the same Python
3+
* sources. Reports definitions present in one implementation but not
4+
* the other, identified by variable name + source position.
5+
*
6+
* The `.expected` file records the current diff as a snapshot: as the
7+
* new SSA matures (closing the captured-variable gap, exception bindings,
8+
* etc.) and tracks more variables, the snapshot should monotonically
9+
* shrink.
10+
*
11+
* Known categories of `def-only-old` mismatches:
12+
* - Function / class / global definitions with no in-scope read
13+
* (intentional: SSA is liveness-pruned, write-only variables are
14+
* not tracked).
15+
* - Captured / closure variables (gap: new SSA does not yet model
16+
* closure captures).
17+
* - Module variables `__name__`, `__package__`, `$` (legacy ESSA
18+
* adds implicit bindings the new SSA does not).
19+
* - Exception-handler `as` bindings (depend on raise modelling).
20+
*
21+
* `def-only-new` mismatches would indicate the new SSA produces spurious
22+
* definitions; currently none are expected.
23+
*/
24+
25+
import python
26+
import semmle.python.dataflow.new.internal.SsaImpl as NewSsa
27+
import semmle.python.controlflow.internal.Cfg as Cfg
28+
import semmle.python.essa.Essa
29+
30+
/**
 * Gets the signature `name:line:column` of the new-SSA definition `def`,
 * built from the identifier of its variable and the start position of
 * its defining CFG node.
 */
string newDefSig(NewSsa::EssaNodeDefinition def) {
  exists(Cfg::ControlFlowNode node, string name, int line, int column |
    node = def.getDefiningNode() and
    name = def.getVariable().getVariable().getId() and
    line = node.getLocation().getStartLine() and
    column = node.getLocation().getStartColumn()
  |
    result = name + ":" + line + ":" + column
  )
}
37+
38+
/**
 * Gets the signature `name:line:column` of the legacy ESSA definition
 * `def`, built from the name of its source variable and the start
 * position of its defining CFG node.
 */
string legacyDefSig(EssaNodeDefinition def) {
  exists(ControlFlowNode node, string name, int line, int column |
    node = def.getDefiningNode() and
    name = def.getSourceVariable().getName() and
    line = node.getLocation().getStartLine() and
    column = node.getLocation().getStartColumn()
  |
    result = name + ":" + line + ":" + column
  )
}
45+
46+
// Report every signature produced by exactly one of the two implementations,
// tagged with the side that produced it.
from string kind, string sig
where
  (
    kind = "def-only-new" and
    sig = newDefSig(_) and
    not sig = legacyDefSig(_)
  )
  or
  (
    kind = "def-only-old" and
    sig = legacyDefSig(_) and
    not sig = newDefSig(_)
  )
select kind, sig
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
def simple_assign():  # Fixture: one write to `x` followed by one read.
2+
x = 1
3+
return x
4+
5+
6+
def reassignment():  # Fixture: two consecutive writes to `x`, then one read.
7+
x = 1
8+
x = 2
9+
return x
10+
11+
12+
def if_else_branch(cond):  # Fixture: `x` is written in both branches and read after them.
13+
if cond:
14+
x = 1
15+
else:
16+
x = 2
17+
return x
18+
19+
20+
def loop(xs):  # Fixture: `x` is bound by the `for` target; `total` is rewritten in the loop body.
21+
total = 0
22+
for x in xs:
23+
total = total + x
24+
return total
25+
26+
27+
def parameter(a, b=2, *args, **kwargs):  # Fixture: `a`, `b` and `args` are read; `kwargs` is never read.
28+
return a + b + sum(args)
29+
30+
31+
def closure(x):  # Fixture: parameter `x` is read only inside the nested function `inner`.
32+
def inner():
33+
return x
34+
return inner
35+
36+
37+
def exception_binding():  # Fixture: `e` is bound by `except ... as` and read in the handler.
38+
try:
39+
compute()
40+
except Exception as e:
41+
return e
42+
43+
44+
def with_binding():  # Fixture: `f` is bound by `with ... as` and read in the body.
45+
with open("file") as f:
46+
return f.read()
47+
48+
49+
GLOBAL = 1  # Fixture: module-level write; the name is read inside read_global().
50+
51+
52+
def read_global():  # Fixture: reads the module-level name `GLOBAL` without binding it locally.
53+
return GLOBAL

0 commit comments

Comments
 (0)