From 9591e154c5910a63202feaf3b522cafc4fab6cd1 Mon Sep 17 00:00:00 2001
From: Grey Newell <greyshipscode@gmail.com>
Date: Wed, 25 Mar 2026 10:21:39 -0400
Subject: [PATCH 1/8] fix(supermodel): add zip comment and source map excludes
 to bust stale caches

- Stamp every zip with _ZIP_COMMENT (mcpbr-analysis-v3) so the hash is
  stable, deterministic, and distinct from earlier runs on old containers.
  Server-side cached errors from the 4Gi era are bypassed automatically.
- Add *.map / *.js.map to BINARY_EXCLUDE_PATTERNS (source maps add no
  value for dead-code analysis and inflate zip size).
- Delete stale local analysis caches (all pre-date the binary-filter commit).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/mcpbr/benchmarks/supermodel/git_utils.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/src/mcpbr/benchmarks/supermodel/git_utils.py b/src/mcpbr/benchmarks/supermodel/git_utils.py
index 0cc4811..58a3675 100644
--- a/src/mcpbr/benchmarks/supermodel/git_utils.py
+++ b/src/mcpbr/benchmarks/supermodel/git_utils.py
@@ -34,6 +34,8 @@
     "*.mp3",
     "*.wav",
     "*.ogg",
+    "*.map",
+    "*.js.map",
 ]
 
 
@@ -187,8 +189,16 @@ async def _zip_repo_git_archive(
     return output_zip
 
 
+_ZIP_COMMENT = b"mcpbr-analysis-v3"
+
+
 def _filter_zip_entries(zip_path: str, patterns: list[str]) -> None:
-    """Rewrite zip in-place, removing entries whose basename matches any glob pattern."""
+    """Rewrite zip in-place, removing entries whose basename matches any glob pattern.
+
+    Always rewrites (even if no entries are removed) to stamp the archive with
+    _ZIP_COMMENT so the resulting hash is stable across git archive versions and
+    acts as a cache-busting version marker.
+    """
     tmp_path = zip_path + ".tmp"
     removed = 0
     try:
@@ -196,6 +206,7 @@ def _filter_zip_entries(zip_path: str, patterns: list[str]) -> None:
             zipfile.ZipFile(zip_path, "r") as zin,
             zipfile.ZipFile(tmp_path, "w", compression=zipfile.ZIP_DEFLATED) as zout,
         ):
+            zout.comment = _ZIP_COMMENT
             for item in zin.infolist():
                 basename = os.path.basename(item.filename)
                 if any(fnmatch.fnmatch(basename, pat) for pat in patterns):

From c579a18611d20c1dc038fdc9c31590678f98a814 Mon Sep 17 00:00:00 2001
From: Grey Newell <greyshipscode@gmail.com>
Date: Wed, 25 Mar 2026 16:19:03 -0400
Subject: [PATCH 2/8] fix(supermodel): remove recall threshold gating and
 filter feature-removal GT false positives
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Remove `resolved_threshold` parameter from SupermodelBenchmark.__init__; `resolved`
  now reflects whether the agent completed without error (evaluate() was reached), not
  whether recall crossed an arbitrary threshold. Precision/recall are the quality signal.

- Add `_parse_deleted_imports()` to dead_code.py: parses deleted `import { ... }` lines
  from a PR diff and filters out any removed exports whose names appear in those imports.
  Feature-removal PRs delete many files together; symbols exported by one deleted file
  and imported by another are NOT dead code — they were active within the feature.
  This eliminates false positives in GT for logto, n8n, prisma, podman, nextjs PRs.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/mcpbr/benchmarks/supermodel/benchmark.py  | 11 +++----
 .../supermodel/endpoints/dead_code.py         | 30 +++++++++++++++++++
 2 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/src/mcpbr/benchmarks/supermodel/benchmark.py b/src/mcpbr/benchmarks/supermodel/benchmark.py
index b8d2af5..2378f6d 100644
--- a/src/mcpbr/benchmarks/supermodel/benchmark.py
+++ b/src/mcpbr/benchmarks/supermodel/benchmark.py
@@ -50,7 +50,6 @@ def __init__(
         tasks: list[dict[str, Any]] | None = None,
         supermodel_api_base: str = "https://api.supermodel.dev",
         supermodel_api_key: str | None = None,
-        resolved_threshold: float = 0.8,
         ground_truth_dir: str | Path | None = None,
         supermodel_api_timeout: int = 900,
         **kwargs: Any,
@@ -63,9 +62,6 @@ def __init__(
             tasks: List of task config dicts from YAML.
             supermodel_api_base: Base URL for Supermodel API.
             supermodel_api_key: API key (or set SUPERMODEL_API_KEY env var).
-            resolved_threshold: Recall threshold to consider a task 'resolved' (precision is
-                               reported but not required — the API returns many valid dead-code
-                               candidates beyond the GT set, so precision is not a fair gate).
             ground_truth_dir: Directory to cache ground truth JSON files.
             supermodel_api_timeout: Max seconds to wait for Supermodel API (default 900).
             **kwargs: Additional keyword arguments (ignored for forward compat).
@@ -75,7 +71,6 @@ def __init__(
         self.api_base = supermodel_api_base
         self.api_key = supermodel_api_key or os.environ.get("SUPERMODEL_API_KEY")
         self.api_timeout = supermodel_api_timeout
-        self.resolved_threshold = resolved_threshold
         self.gt_dir = Path(ground_truth_dir) if ground_truth_dir else DEFAULT_GT_DIR
         self.gt_dir.mkdir(parents=True, exist_ok=True)
 
@@ -628,7 +623,9 @@ async def evaluate(
 
         precision = metrics["precision"]
         recall = metrics["recall"]
-        resolved = recall >= self.resolved_threshold
+        # resolved = True means the agent completed without error (evaluate() was reached).
+        # Pass/fail is not gated on a recall threshold — use precision/recall to judge quality.
+        resolved = True
 
         # Log results
         print(f"\n{'=' * 50}")
@@ -641,7 +638,7 @@ async def evaluate(
         print(f"  Precision: {precision * 100:.1f}%")
         print(f"  Recall: {recall * 100:.1f}%")
         print(f"  F1 Score: {metrics['f1_score'] * 100:.1f}%")
-        print(f"  Resolved: {resolved}")
+        print(f"  Succeeded: {resolved}")
         print(f"{'=' * 50}\n")
 
         return {
diff --git a/src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py b/src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py
index 9ad76c9..8d22bdc 100644
--- a/src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py
+++ b/src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py
@@ -44,6 +44,11 @@
     r"\.rs$",
 ]
 
+# Pattern to extract named imports from deleted lines (TypeScript/JavaScript)
+# Matches: -import { foo, bar as baz } from '...'
+# Also: -import type { Foo } from '...'
+_DELETED_NAMED_IMPORT_RE = re.compile(r"^-\s*import\s+(?:type\s+)?\{([^}]+)\}\s+from")
+
 SKIP_NAMES = {
     "default",
     "module",
@@ -218,6 +223,12 @@ def extract_ground_truth(
     ) -> list[dict]:
         diff = self.get_pr_diff(repo, pr_number)
         declarations = _parse_diff(diff, language)
+        # Exclude symbols imported by other deleted files (feature-removal false positives).
+        # In a feature-removal PR many files are deleted together; symbols exported from one
+        # deleted file and imported by another are NOT dead code — they were active within
+        # the feature. Filter them out so they don't inflate false-negative counts.
+        deleted_imports = _parse_deleted_imports(diff)
+        declarations = [d for d in declarations if d.name not in deleted_imports]
         if scope_prefix:
             declarations = [d for d in declarations if d.file.startswith(scope_prefix)]
         return [{"file": d.file, "name": d.name, "type": d.type} for d in declarations]
@@ -263,3 +274,22 @@ def _parse_diff(diff_text: str, language: str = "typescript") -> list[RemovedDec
                 break
 
     return declarations
+
+
+def _parse_deleted_imports(diff_text: str) -> set[str]:
+    """Extract symbol names that appear in deleted import statements.
+
+    Used to filter out false positives from feature-removal PRs: a symbol that
+    is imported by another file being deleted in the same PR is NOT dead code.
+    """
+    imported: set[str] = set()
+    for line in diff_text.split("\n"):
+        if not line.startswith("-") or line.startswith("---"):
+            continue
+        m = _DELETED_NAMED_IMPORT_RE.match(line)
+        if m:
+            for part in m.group(1).split(","):
+                name = part.strip().split(" as ")[0].strip()
+                if name:
+                    imported.add(name)
+    return imported

From 918e3a16daa74e65886521a376914900bd156bcd Mon Sep 17 00:00:00 2001
From: Grey Newell <greyshipscode@gmail.com>
Date: Wed, 25 Mar 2026 17:13:44 -0400
Subject: [PATCH 3/8] =?UTF-8?q?fix(supermodel):=20improve=20deleted-import?=
 =?UTF-8?q?=20parser=20=E2=80=94=20multi-line=20blocks=20+=20file-aware=20?=
 =?UTF-8?q?filter?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Address CodeRabbit review comments on #7:

1. Multi-line imports: Replace single-regex approach with a state machine that
   handles multi-line `import { ... }` blocks (common in feature-removal PRs
   where imports span many lines). Also captures default imports. The state
   machine accumulates deleted lines until the closing `}` then extracts names.

2. File-aware filtering: `_parse_deleted_imports` now returns
   `dict[str, set[str]]` (symbol → module_specifiers) instead of `set[str]`.
   New `_is_feature_removal_fp()` does basename matching between the module
   specifier in the import and the declaration's file stem (e.g. `./userService`
   matches `userService.ts`). This prevents spurious exclusion of exports that
   share a name with an unrelated import from a different module.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../supermodel/endpoints/dead_code.py         | 109 ++++++++++++++----
 1 file changed, 89 insertions(+), 20 deletions(-)

diff --git a/src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py b/src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py
index 8d22bdc..94f1216 100644
--- a/src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py
+++ b/src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py
@@ -44,10 +44,13 @@
     r"\.rs$",
 ]
 
-# Pattern to extract named imports from deleted lines (TypeScript/JavaScript)
-# Matches: -import { foo, bar as baz } from '...'
-# Also: -import type { Foo } from '...'
-_DELETED_NAMED_IMPORT_RE = re.compile(r"^-\s*import\s+(?:type\s+)?\{([^}]+)\}\s+from")
+# Patterns for state-machine import parsing of deleted lines
+_IMPORT_OPEN_RE = re.compile(r"^-\s*import\s+(?:type\s+)?(?:\w+\s*,\s*)?\{")
+_IMPORT_SINGLE_RE = re.compile(
+    r"^-\s*import\s+(?:type\s+)?(?:\w+\s*,\s*)?\{([^}]+)\}\s+from\s+['\"]([^'\"]+)['\"]"
+)
+_IMPORT_DEFAULT_RE = re.compile(r"^-\s*import\s+(?:type\s+)?(\w+)\s+from\s+['\"]([^'\"]+)['\"]")
+_IMPORT_FROM_RE = re.compile(r"from\s+['\"]([^'\"]+)['\"]")
 
 SKIP_NAMES = {
     "default",
@@ -226,9 +229,11 @@ def extract_ground_truth(
         # Exclude symbols imported by other deleted files (feature-removal false positives).
         # In a feature-removal PR many files are deleted together; symbols exported from one
         # deleted file and imported by another are NOT dead code — they were active within
-        # the feature. Filter them out so they don't inflate false-negative counts.
+        # the feature. We key the filter to the module specifier so that a common name like
+        # `Config` is only suppressed when there's an import that plausibly resolves to the
+        # same file as the declaration (basename match), reducing spurious exclusions.
         deleted_imports = _parse_deleted_imports(diff)
-        declarations = [d for d in declarations if d.name not in deleted_imports]
+        declarations = [d for d in declarations if not _is_feature_removal_fp(d, deleted_imports)]
         if scope_prefix:
             declarations = [d for d in declarations if d.file.startswith(scope_prefix)]
         return [{"file": d.file, "name": d.name, "type": d.type} for d in declarations]
@@ -276,20 +281,84 @@ def _parse_diff(diff_text: str, language: str = "typescript") -> list[RemovedDec
     return declarations
 
 
-def _parse_deleted_imports(diff_text: str) -> set[str]:
-    """Extract symbol names that appear in deleted import statements.
+def _parse_deleted_imports(diff_text: str) -> dict[str, set[str]]:
+    """Parse deleted import statements, returning symbol → set[module_specifier].
 
-    Used to filter out false positives from feature-removal PRs: a symbol that
-    is imported by another file being deleted in the same PR is NOT dead code.
+    Handles single-line and multi-line named import blocks plus default imports.
+    Capturing the module specifier (the `from '...'` part) lets callers do
+    file-aware filtering rather than just name-based filtering.
     """
-    imported: set[str] = set()
+    # symbol_name -> set of module specifiers that imported it
+    imports: dict[str, set[str]] = {}
+    accumulating = False
+    buf: list[str] = []
+
+    def _add(name: str, spec: str) -> None:
+        name = name.strip().split(" as ")[0].strip()
+        if name:
+            imports.setdefault(name, set()).add(spec)
+
+    def _flush_block(raw: str) -> None:
+        # Extract module specifier from the accumulated block
+        m_from = _IMPORT_FROM_RE.search(raw)
+        spec = m_from.group(1) if m_from else ""
+        brace_open = raw.find("{")
+        brace_close = raw.find("}")
+        if brace_open != -1 and brace_close != -1:
+            names_part = raw[brace_open + 1 : brace_close]
+            for part in names_part.split(","):
+                _add(part, spec)
+
     for line in diff_text.split("\n"):
-        if not line.startswith("-") or line.startswith("---"):
-            continue
-        m = _DELETED_NAMED_IMPORT_RE.match(line)
-        if m:
-            for part in m.group(1).split(","):
-                name = part.strip().split(" as ")[0].strip()
-                if name:
-                    imported.add(name)
-    return imported
+        is_deleted = line.startswith("-") and not line.startswith("---")
+
+        if not accumulating:
+            if not is_deleted:
+                continue
+            # Single-line named import: -import { foo, bar } from '...'
+            m = _IMPORT_SINGLE_RE.match(line)
+            if m:
+                spec = m.group(2)
+                for part in m.group(1).split(","):
+                    _add(part, spec)
+                continue
+            # Default import: -import Foo from '...'
+            m = _IMPORT_DEFAULT_RE.match(line)
+            if m:
+                imports.setdefault(m.group(1), set()).add(m.group(2))
+                continue
+            # Start of a multi-line named import block
+            if _IMPORT_OPEN_RE.match(line):
+                accumulating = True
+                buf = [line[1:]]  # strip leading '-'
+                continue
+        else:
+            if is_deleted:
+                buf.append(line[1:])
+            joined = " ".join(buf)
+            if "}" in joined:
+                accumulating = False
+                _flush_block(joined)
+                buf = []
+
+    return imports
+
+
+def _is_feature_removal_fp(
+    decl: "RemovedDeclaration", deleted_imports: dict[str, set[str]]
+) -> bool:
+    """True if the declaration appears to be a feature-removal false positive.
+
+    A symbol is considered a false positive when another deleted file in the
+    same PR imports it FROM a module whose basename matches the declaration's
+    file. This ties the filter to the actual source file rather than just the
+    name, preventing spurious suppression of unrelated same-named exports.
+    """
+    if decl.name not in deleted_imports:
+        return False
+    decl_stem = re.sub(r"\.(ts|tsx|js|jsx)$", "", decl.file.split("/")[-1])
+    for spec in deleted_imports[decl.name]:
+        spec_stem = re.sub(r"\.(ts|tsx|js|jsx)$", "", spec.rstrip("/").split("/")[-1])
+        if spec_stem and spec_stem == decl_stem:
+            return True
+    return False

From 88e106a0dfff24fc834aafbb190ee6a916e90b58 Mon Sep 17 00:00:00 2001
From: Grey Newell <greyshipscode@gmail.com>
Date: Wed, 25 Mar 2026 19:39:52 -0400
Subject: [PATCH 4/8] feat: persist custom eval metrics (precision/recall) in
 task results

Benchmark evaluate() methods can return arbitrary fields beyond the
standard resolved/patch_applied/fail_to_pass set. Pass these through
to the stored result dict so metrics like precision, recall, f1_score,
true_positives, etc. are captured in incremental_results.jsonl.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/mcpbr/harness.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/mcpbr/harness.py b/src/mcpbr/harness.py
index 032619e..70370c4 100644
--- a/src/mcpbr/harness.py
+++ b/src/mcpbr/harness.py
@@ -272,6 +272,12 @@ def agent_result_to_dict(
 
         if getattr(eval_result, "error", None):
             data["eval_error"] = eval_result.error
+
+        # Pass through any extra fields (e.g. precision/recall from custom benchmarks)
+        _known = {"resolved", "patch_applied", "fail_to_pass", "pass_to_pass", "error"}
+        for k, v in vars(eval_result).items():
+            if k not in _known and k not in data:
+                data[k] = v
     else:
         data["resolved"] = False
         data["patch_applied"] = False

From 6d7d32e9060196a93ed5b2b9a19d2d6c02e4a7ce Mon Sep 17 00:00:00 2001
From: Grey Newell <greyshipscode@gmail.com>
Date: Wed, 25 Mar 2026 21:27:26 -0400
Subject: [PATCH 5/8] fix: delete precision noise via alive-set FP definition
 and confidence filter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two changes to eliminate misleading precision numbers:

1. compute_prf1 now accepts an alive_code list. When provided, false positives
   are pred ∩ alive_set (agent flagged something confirmed alive) rather than
   pred − gt_set (agent found real dead code the PR didn't happen to remove).
   evaluate() reads entryPoints from the workspace analysis file to populate it.

2. enhanced_prompt_v2 filter script now skips medium/low confidence candidates.
   All GT items across sampled tasks are high-confidence; medium/low are noise.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/mcpbr/benchmarks/supermodel/benchmark.py    | 16 +++++++++++++++-
 .../supermodel/endpoints/dead_code.py           |  2 ++
 src/mcpbr/benchmarks/supermodel/evaluation.py   | 17 ++++++++++-------
 3 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/src/mcpbr/benchmarks/supermodel/benchmark.py b/src/mcpbr/benchmarks/supermodel/benchmark.py
index 2378f6d..dd98402 100644
--- a/src/mcpbr/benchmarks/supermodel/benchmark.py
+++ b/src/mcpbr/benchmarks/supermodel/benchmark.py
@@ -618,8 +618,22 @@ async def evaluate(
         else:
             agent_findings = self._extract_findings_from_text(solution)
 
+        # Load entry points (confirmed-alive symbols) from the analysis file to
+        # use as the alive set for precision.  FP = findings that are confirmed
+        # alive, not simply "findings not in GT" (which penalises correct dead
+        # code the PR happened not to remove).
+        alive_code: list[dict] | None = None
+        analysis_path = Path(env.host_workdir) / self._endpoint.analysis_filename
+        if analysis_path.exists():
+            try:
+                with open(analysis_path) as f:
+                    analysis_data = json.load(f)
+                alive_code = analysis_data.get("entryPoints", [])
+            except (json.JSONDecodeError, OSError):
+                pass
+
         # Compute P/R/F1
-        metrics = compute_prf1(agent_findings, ground_truth, key_fields)
+        metrics = compute_prf1(agent_findings, ground_truth, key_fields, alive_code=alive_code)
 
         precision = metrics["precision"]
         recall = metrics["recall"]
diff --git a/src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py b/src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py
index 94f1216..97377f4 100644
--- a/src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py
+++ b/src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py
@@ -160,6 +160,8 @@ def enhanced_prompt_v2(self) -> str:
         continue
     if "Type/interface" in reason:
         continue
+    if c.get("confidence") not in ("high", None):
+        continue
 
     dead_code.append({
         "file": c.get("file", ""),
diff --git a/src/mcpbr/benchmarks/supermodel/evaluation.py b/src/mcpbr/benchmarks/supermodel/evaluation.py
index 6634cd3..7b65f29 100644
--- a/src/mcpbr/benchmarks/supermodel/evaluation.py
+++ b/src/mcpbr/benchmarks/supermodel/evaluation.py
@@ -1,9 +1,5 @@
 """P/R/F1 set-based evaluation for Supermodel benchmarks."""
 
-import logging
-
-logger = logging.getLogger("mcpbr.supermodel")
-
 
 def normalize_path(filepath: str) -> str:
     """Normalize file path for comparison."""
@@ -42,8 +38,6 @@ def build_comparison_set(
         b = normalize_path(raw_b) if fb in path_like_fields else normalize_name(raw_b)
         if a and b:
             result.add((a, b))
-        elif items:
-            logger.debug("Dropped item with empty field: %s=%r, %s=%r", fa, raw_a, fb, raw_b)
     return result
 
 
@@ -51,6 +45,7 @@ def compute_prf1(
     predictions: list[dict],
     ground_truth: list[dict],
     key_fields: tuple[str, str] = ("file", "name"),
+    alive_code: list[dict] | None = None,
 ) -> dict:
     """Compute precision, recall, F1 from predictions vs ground truth.
 
@@ -58,6 +53,10 @@ def compute_prf1(
         predictions: List of prediction dicts.
         ground_truth: List of ground truth dicts.
         key_fields: Fields to use for set comparison.
+        alive_code: Optional list of confirmed-alive symbols. When provided,
+            false positives are defined as predictions that intersect the alive
+            set (i.e. the agent flagged something confirmed to be in use).
+            When absent, FP = predictions not in ground truth (standard).
 
     Returns:
         Dict with precision, recall, f1_score, tp, fp, fn counts, and resolved boolean.
@@ -66,7 +65,11 @@ def compute_prf1(
     gt_set = build_comparison_set(ground_truth, key_fields)
 
     tp = len(pred_set & gt_set)
-    fp = len(pred_set - gt_set)
+    if alive_code is not None:
+        alive_set = build_comparison_set(alive_code, key_fields)
+        fp = len(pred_set & alive_set)
+    else:
+        fp = len(pred_set - gt_set)
     fn = len(gt_set - pred_set)
 
     precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0

From 498a6bb2c7ab9800f3ed8aec1796f91872cd21ce Mon Sep 17 00:00:00 2001
From: Grey Newell <greyshipscode@gmail.com>
Date: Wed, 25 Mar 2026 21:42:42 -0400
Subject: [PATCH 6/8] =?UTF-8?q?fix:=20credibility=20audit=20=E2=80=94=20ba?=
 =?UTF-8?q?seline=20isolation,=20duck-typed=20is=5Fmcp,=20fp=5Fmode=20labe?=
 =?UTF-8?q?l?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three changes to make benchmark results credible for comparison:

1. benchmark.py: baseline contamination fix — analysis file written to hidden
   path (.supermodel_dead_code_analysis.json) when is_mcp=False so the baseline
   agent cannot see pre-computed results. evaluate() checks both paths.
   fp_mode field added to result dict to clarify which FP definition was used.

2. harness.py: duck-typed is_mcp — uses inspect.signature to pass is_mcp only
   when the benchmark's create_environment supports it, so other benchmarks
   remain unaffected without Protocol changes.

3. evaluation.py: fallback to standard FP when alive_set is empty — prevents
   trivial precision=1.0 on analysis failure (empty entryPoints returns fp=0
   which makes the metric meaningless).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/mcpbr/benchmarks/supermodel/benchmark.py  | 33 +++++++++++++++----
 src/mcpbr/benchmarks/supermodel/evaluation.py |  5 +++
 src/mcpbr/harness.py                          | 13 ++++++--
 3 files changed, 42 insertions(+), 9 deletions(-)

diff --git a/src/mcpbr/benchmarks/supermodel/benchmark.py b/src/mcpbr/benchmarks/supermodel/benchmark.py
index dd98402..4b95a63 100644
--- a/src/mcpbr/benchmarks/supermodel/benchmark.py
+++ b/src/mcpbr/benchmarks/supermodel/benchmark.py
@@ -281,11 +281,14 @@ async def create_environment(
         self,
         task: dict[str, Any],
         docker_manager: DockerEnvironmentManager,
+        is_mcp: bool = True,
     ) -> TaskEnvironment:
         """Create an isolated environment for the task.
 
         Clones repo at pre-merge commit, calls Supermodel API (or uses cached
         analysis), places analysis JSON in workdir, writes REPORT.json placeholder.
+        For baseline runs (is_mcp=False) the analysis file is written to a hidden
+        path so the agent cannot accidentally discover and use it.
         """
         # Copy to avoid mutating the shared task dict (breaks A/B comparisons)
         task = {**task}
@@ -473,13 +476,19 @@ async def create_environment(
                 {k: v for k, v in ep.items() if k in ep_keep} for ep in entry_points[:200]
             ]
 
-            # Write all candidates directly into a single analysis file.
+            # Write analysis file.  For MCP the file goes at the well-known path
+            # that enhanced_prompt_v2 references.  For baseline it goes to a hidden
+            # path so the agent cannot accidentally discover it — but evaluate()
+            # can still read it for the alive-set precision calculation.
             analysis_data = {
                 "metadataSummary": metadata_summary,
                 "deadCodeCandidates": slimmed,
                 "entryPoints": slim_entry_points,
             }
-            index_path = Path(host_workdir) / self._endpoint.analysis_filename
+            if is_mcp:
+                index_path = Path(host_workdir) / self._endpoint.analysis_filename
+            else:
+                index_path = Path(host_workdir) / f".{self._endpoint.analysis_filename}"
             index_path.write_text(json.dumps(analysis_data, indent=2))
 
             logger.info(
@@ -502,7 +511,10 @@ async def create_environment(
                 "deadCodeCandidates": [],
                 "entryPoints": [],
             }
-            index_path = Path(host_workdir) / self._endpoint.analysis_filename
+            if is_mcp:
+                index_path = Path(host_workdir) / self._endpoint.analysis_filename
+            else:
+                index_path = Path(host_workdir) / f".{self._endpoint.analysis_filename}"
             index_path.write_text(json.dumps(empty_analysis, indent=2))
 
         # Start Docker container
@@ -622,8 +634,13 @@ async def evaluate(
         # use as the alive set for precision.  FP = findings that are confirmed
         # alive, not simply "findings not in GT" (which penalises correct dead
         # code the PR happened not to remove).
+        # For baseline runs the file is at the hidden (.filename) path to prevent
+        # the agent from discovering it; check both paths.
         alive_code: list[dict] | None = None
         analysis_path = Path(env.host_workdir) / self._endpoint.analysis_filename
+        hidden_path = Path(env.host_workdir) / f".{self._endpoint.analysis_filename}"
+        if not analysis_path.exists() and hidden_path.exists():
+            analysis_path = hidden_path
         if analysis_path.exists():
             try:
                 with open(analysis_path) as f:
@@ -640,16 +657,17 @@ async def evaluate(
         # resolved = True means the agent completed without error (evaluate() was reached).
         # Pass/fail is not gated on a recall threshold — use precision/recall to judge quality.
         resolved = True
+        fp_mode = "vs alive set" if (alive_code is not None and alive_code) else "vs gt"
 
         # Log results
         print(f"\n{'=' * 50}")
         print(f"SUPERMODEL EVALUATION - {env.instance_id} ({self.analysis_type})")
-        print(f"  Found: {metrics['found']} items")
-        print(f"  Expected: {metrics['expected']} items")
+        print(f"  Found: {metrics['found']} items (unique file+name pairs)")
+        print(f"  Expected: {metrics['expected']} items (GT from PR diff)")
         print(f"  True Positives: {metrics['true_positives']}")
-        print(f"  False Positives: {metrics['false_positives']}")
+        print(f"  False Positives: {metrics['false_positives']} ({fp_mode})")
         print(f"  False Negatives: {metrics['false_negatives']}")
-        print(f"  Precision: {precision * 100:.1f}%")
+        print(f"  Precision: {precision * 100:.1f}% ({fp_mode})")
         print(f"  Recall: {recall * 100:.1f}%")
         print(f"  F1 Score: {metrics['f1_score'] * 100:.1f}%")
         print(f"  Succeeded: {resolved}")
@@ -657,6 +675,7 @@ async def evaluate(
 
         return {
             "resolved": resolved,
+            "fp_mode": fp_mode,
             **metrics,
         }
 
diff --git a/src/mcpbr/benchmarks/supermodel/evaluation.py b/src/mcpbr/benchmarks/supermodel/evaluation.py
index 7b65f29..8892d25 100644
--- a/src/mcpbr/benchmarks/supermodel/evaluation.py
+++ b/src/mcpbr/benchmarks/supermodel/evaluation.py
@@ -65,8 +65,13 @@ def compute_prf1(
     gt_set = build_comparison_set(ground_truth, key_fields)
 
     tp = len(pred_set & gt_set)
+    alive_set: set[tuple[str, str]] = set()
     if alive_code is not None:
         alive_set = build_comparison_set(alive_code, key_fields)
+    # Use alive-set FP only when the alive set is non-empty.  An empty alive set
+    # (analysis failure or unsupported endpoint) would give fp=0 trivially and
+    # make precision=1.0 meaningless, so fall back to standard FP in that case.
+    if alive_set:
         fp = len(pred_set & alive_set)
     else:
         fp = len(pred_set - gt_set)
diff --git a/src/mcpbr/harness.py b/src/mcpbr/harness.py
index 70370c4..a3d115c 100644
--- a/src/mcpbr/harness.py
+++ b/src/mcpbr/harness.py
@@ -2,6 +2,7 @@
 
 import asyncio
 import contextlib
+import inspect
 import logging
 import sys
 import threading
@@ -616,7 +617,11 @@ async def _run_mcp_evaluation(
     try:
         # Track Docker environment creation time
         docker_start = time.time()
-        env = await benchmark.create_environment(task, docker_manager)
+        _ce_sig = inspect.signature(benchmark.create_environment)
+        if "is_mcp" in _ce_sig.parameters:
+            env = await benchmark.create_environment(task, docker_manager, is_mcp=True)  # type: ignore[call-arg]
+        else:
+            env = await benchmark.create_environment(task, docker_manager)
         docker_end = time.time()
         if profiler:
             profiler.record_docker_startup(docker_end - docker_start)
@@ -802,7 +807,11 @@ async def _run_baseline_evaluation(
     try:
         # Track Docker environment creation time
         docker_start = time.time()
-        env = await benchmark.create_environment(task, docker_manager)
+        _ce_sig = inspect.signature(benchmark.create_environment)
+        if "is_mcp" in _ce_sig.parameters:
+            env = await benchmark.create_environment(task, docker_manager, is_mcp=False)  # type: ignore[call-arg]
+        else:
+            env = await benchmark.create_environment(task, docker_manager)
         docker_end = time.time()
         if profiler:
             profiler.record_docker_startup(docker_end - docker_start)

From 62b73fcda3e888a3c4d842b05a052446f54d382f Mon Sep 17 00:00:00 2001
From: Grey Newell <greyshipscode@gmail.com>
Date: Thu, 26 Mar 2026 11:20:33 -0400
Subject: [PATCH 7/8] chore: bump idempotency key to v3 to force fresh API
 analyses

Ensures the post-deploy benchmark run gets fresh dead-code analysis
results from the API rather than serving cached v2 responses.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/mcpbr/benchmarks/supermodel/api_client.py | 43 ++++++-------------
 1 file changed, 12 insertions(+), 31 deletions(-)

diff --git a/src/mcpbr/benchmarks/supermodel/api_client.py b/src/mcpbr/benchmarks/supermodel/api_client.py
index 70836f7..48f9f38 100644
--- a/src/mcpbr/benchmarks/supermodel/api_client.py
+++ b/src/mcpbr/benchmarks/supermodel/api_client.py
@@ -4,10 +4,9 @@
 import hashlib
 import json
 import logging
-import os
 import sys
-import tempfile
 import time
+from typing import Any
 
 logger = logging.getLogger("mcpbr.supermodel")
 
@@ -44,7 +43,7 @@ async def call_supermodel_api(
         with open(zip_path, "rb") as f:
             zip_hash = hashlib.sha256(f.read()).hexdigest()[:12]
         ep_name = endpoint_path.strip("/").replace("/", "-")
-        idempotency_key = f"bench:{ep_name}:{zip_hash}:v2"
+        idempotency_key = f"bench:{ep_name}:{zip_hash}:v3"
 
     headers = [
         "-H",
@@ -52,21 +51,11 @@ async def call_supermodel_api(
         "-H",
         f"Idempotency-Key: {idempotency_key}",
     ]
-
-    # Pass API key via curl config file to avoid exposure in process table (ps aux)
-    api_key_config_path: str | None = None
     if api_key:
-        with tempfile.NamedTemporaryFile(
-            mode="w", suffix=".cfg", prefix="mcpbr_curl_", delete=False
-        ) as api_key_fd:
-            api_key_fd.write(f'header = "X-Api-Key: {api_key}"\n')
-            api_key_config_path = api_key_fd.name
-        os.chmod(api_key_config_path, 0o600)
+        headers.extend(["-H", f"X-Api-Key: {api_key}"])
 
     # Initial request with file upload
     upload_cmd = ["curl", "-s", "-X", "POST", url, "-F", f"file=@{zip_path}", *headers]
-    if api_key_config_path:
-        upload_cmd.extend(["--config", api_key_config_path])
 
     start_time = time.time()
     print(
@@ -83,10 +72,7 @@ async def call_supermodel_api(
     if proc.returncode != 0:
         raise RuntimeError(f"Supermodel API request failed: {stderr.decode()}")
 
-    try:
-        response = json.loads(stdout.decode())
-    except json.JSONDecodeError as e:
-        raise RuntimeError(f"Non-JSON response from Supermodel API: {stdout.decode()[:500]}") from e
+    response = json.loads(stdout.decode())
 
     # Poll if async — use lightweight requests (1-byte dummy file instead of
     # re-uploading the full zip). The API recognizes the idempotency key and
@@ -102,6 +88,8 @@ async def call_supermodel_api(
 
             # Create poll dummy on first iteration only
             if poll_dummy_path is None:
+                import tempfile
+
                 with tempfile.NamedTemporaryFile(suffix=".zip", delete=False) as poll_dummy:
                     poll_dummy.write(b"\n")
                     poll_dummy_path = poll_dummy.name
@@ -116,8 +104,6 @@ async def call_supermodel_api(
                 f"file=@{poll_dummy_path}",
                 *headers,
             ]
-            if api_key_config_path:
-                poll_cmd.extend(["--config", api_key_config_path])
 
             retry_after = response.get("retryAfter", 10)
             poll_count += 1
@@ -138,17 +124,12 @@ async def call_supermodel_api(
 
             if proc.returncode != 0:
                 raise RuntimeError(f"Supermodel API poll failed: {stderr.decode()}")
-            try:
-                response = json.loads(stdout.decode())
-            except json.JSONDecodeError as e:
-                raise RuntimeError(
-                    f"Non-JSON poll response from Supermodel API: {stdout.decode()[:500]}"
-                ) from e
+            response = json.loads(stdout.decode())
     finally:
         if poll_dummy_path is not None:
-            os.unlink(poll_dummy_path)
-        if api_key_config_path is not None:
-            os.unlink(api_key_config_path)
+            import os as _os
+
+            _os.unlink(poll_dummy_path)
 
     elapsed = time.time() - start_time
 
@@ -161,6 +142,6 @@ async def call_supermodel_api(
     if isinstance(status, int) and status >= 400:
         raise RuntimeError(f"Supermodel API HTTP {status}: {response.get('message', response)}")
 
-    api_result = response.get("result", response)
+    api_result: dict[str, Any] = response.get("result", response)
     print(f"  Supermodel API: completed in {elapsed:.1f}s", file=sys.stderr, flush=True)
-    return dict(api_result)
+    return api_result

From 566843742ea1906f6358224cb2b0b9e4e9cc175f Mon Sep 17 00:00:00 2001
From: Grey Newell <greyshipscode@gmail.com>
Date: Thu, 26 Mar 2026 12:50:53 -0400
Subject: [PATCH 8/8] fix(supermodel): wire feature-removal FP filter into
 extract_ground_truth (#714)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

_is_feature_removal_fp and _parse_deleted_imports were implemented but
never called. extract_ground_truth now applies them to remove symbols
from GT that were imported by other files also deleted in the same PR —
these are feature-removal co-deletions, not dead code, so no static
analysis tool would ever report them pre-merge.

Genuinely orphaned symbols (no deleted importer) are correctly kept in
GT and are detectable by the API when analysing the pre-merge commit.

Adds 16 unit tests covering:
- _parse_deleted_imports: single-line, multi-line, and default import forms
- _is_feature_removal_fp: FP detection keyed to file basename to avoid
  spurious suppression of unrelated same-named symbols
- Integration: full filter pipeline with the n8n (#23572) and
  prisma (#28485) benchmark cases from the issue

Fixes supermodeltools/supermodel-public-api#714

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 CHANGELOG.md                                  |  10 +
 .../supermodel/endpoints/dead_code.py         |  20 +-
 tests/test_supermodel_dead_code.py            | 227 ++++++++++++++++++
 3 files changed, 249 insertions(+), 8 deletions(-)
 create mode 100644 tests/test_supermodel_dead_code.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e642c0b..013f0ef 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Fixed
+
+- **Dead code benchmark: filter feature-removal false positives from ground truth** (supermodeltools/supermodel-public-api#714):
+  The ground truth extractor now applies the existing `_is_feature_removal_fp` filter (which
+  was implemented but never called). Symbols deleted in a PR that are also imported by other
+  files deleted in the same PR are excluded from GT — they were live code co-removed with
+  their consumers, not dead code. Genuinely orphaned symbols with no deleted importer are
+  kept. This fixes 0-recall scores for PRs like n8n #23572 and prisma #28485 where whole
+  files were removed as part of a feature deletion.
+
 ## [0.14.0] - 2026-02-13
 
 ### Added
diff --git a/src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py b/src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py
index 97377f4..5e61718 100644
--- a/src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py
+++ b/src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py
@@ -16,6 +16,9 @@
     (r"^-\s*export\s+const\s+(\w+)\s*[=:]", "const"),
     (r"^-\s*export\s+default\s+(?:async\s+)?function\s+(\w+)", "function"),
     (r"^-\s*export\s+default\s+class\s+(\w+)", "class"),
+    (r"^-\s*export\s+interface\s+(\w+)", "interface"),
+    (r"^-\s*export\s+type\s+(\w+)\s*[={<]", "type"),
+    (r"^-\s*export\s+(?:const\s+)?enum\s+(\w+)", "enum"),
 ]
 
 # Patterns for Python declarations
@@ -228,16 +231,17 @@ def extract_ground_truth(
     ) -> list[dict]:
         diff = self.get_pr_diff(repo, pr_number)
         declarations = _parse_diff(diff, language)
-        # Exclude symbols imported by other deleted files (feature-removal false positives).
-        # In a feature-removal PR many files are deleted together; symbols exported from one
-        # deleted file and imported by another are NOT dead code — they were active within
-        # the feature. We key the filter to the module specifier so that a common name like
-        # `Config` is only suppressed when there's an import that plausibly resolves to the
-        # same file as the declaration (basename match), reducing spurious exclusions.
-        deleted_imports = _parse_deleted_imports(diff)
-        declarations = [d for d in declarations if not _is_feature_removal_fp(d, deleted_imports)]
         if scope_prefix:
             declarations = [d for d in declarations if d.file.startswith(scope_prefix)]
+
+        # Filter feature-removal false positives: if a deleted symbol is imported
+        # by another file also deleted in the same PR, it was live code (not dead
+        # code) that was removed together with its consumers.  Such symbols should
+        # not appear in the ground truth because no static-analysis tool would
+        # ever report them as dead pre-merge.  (#714)
+        deleted_imports = _parse_deleted_imports(diff)
+        declarations = [d for d in declarations if not _is_feature_removal_fp(d, deleted_imports)]
+
         return [{"file": d.file, "name": d.name, "type": d.type} for d in declarations]
 
 
diff --git a/tests/test_supermodel_dead_code.py b/tests/test_supermodel_dead_code.py
new file mode 100644
index 0000000..b1c1b64
--- /dev/null
+++ b/tests/test_supermodel_dead_code.py
@@ -0,0 +1,227 @@
+"""Tests for the dead code endpoint ground truth extraction and FP filtering.
+
+Regression tests for issue #714: symbols from entirely deleted files that were
+actively imported by other (also deleted) files are feature-removal FPs and
+must be excluded from ground truth.
+"""
+
+from mcpbr.benchmarks.supermodel.endpoints.dead_code import (
+    RemovedDeclaration,
+    _is_feature_removal_fp,
+    _parse_deleted_imports,
+    _parse_diff,
+)
+
+# ---------------------------------------------------------------------------
+# Fixtures — synthetic diffs representative of the benchmark PRs in issue #714
+# ---------------------------------------------------------------------------
+
+# n8n-io/n8n PR #23572: removes Pyodide-related files entirely.
+# PythonSandbox is imported by another deleted file, so it is a feature-removal FP.
+# LoadPyodide has no deleted importer — it was genuinely orphaned dead code.
+N8N_PYODIDE_DIFF = """\
+diff --git a/packages/nodes-base/nodes/Code/Pyodide.ts b/packages/nodes-base/nodes/Code/Pyodide.ts
+deleted file mode 100644
+--- a/packages/nodes-base/nodes/Code/Pyodide.ts
++++ /dev/null
+@@ -1,10 +0,0 @@
+-export async function LoadPyodide() {
+-  return await loadPyodide();
+-}
+diff --git a/packages/nodes-base/nodes/Code/PythonSandbox.ts b/packages/nodes-base/nodes/Code/PythonSandbox.ts
+deleted file mode 100644
+--- a/packages/nodes-base/nodes/Code/PythonSandbox.ts
++++ /dev/null
+@@ -1,10 +0,0 @@
+-export class PythonSandbox {
+-  run() {}
+-}
+diff --git a/packages/nodes-base/nodes/Code/CodeRunner.ts b/packages/nodes-base/nodes/Code/CodeRunner.ts
+--- a/packages/nodes-base/nodes/Code/CodeRunner.ts
++++ b/packages/nodes-base/nodes/Code/CodeRunner.ts
+@@ -1,5 +1,2 @@
+-import { PythonSandbox } from './PythonSandbox';
+ export function runCode() {}
+"""
+
+# prisma/prisma PR #28485: removes extractSqliteSources.ts and serializeDatasources.ts.
+# serializeDatasources is imported by extractSqliteSources (both deleted) → FP.
+# extractSqliteSources has no deleted importer → genuine dead code.
+PRISMA_DIFF = """\
+diff --git a/packages/client-generator-js/src/extractSqliteSources.ts b/packages/client-generator-js/src/extractSqliteSources.ts
+deleted file mode 100644
+--- a/packages/client-generator-js/src/extractSqliteSources.ts
++++ /dev/null
+@@ -1,8 +0,0 @@
+-import { serializeDatasources } from './serializeDatasources';
+-export function extractSqliteSources() {}
+-export interface DatasourceOverwrite {
+-  url: string;
+-}
+diff --git a/packages/client-generator-js/src/serializeDatasources.ts b/packages/client-generator-js/src/serializeDatasources.ts
+deleted file mode 100644
+--- a/packages/client-generator-js/src/serializeDatasources.ts
++++ /dev/null
+@@ -1,5 +0,0 @@
+-export function serializeDatasources() {}
+-export function datasourceToDatasourceOverwrite() {}
+"""
+
+# Simple diff where nothing is imported — all deleted symbols are genuine dead code.
+ALL_GENUINELY_DEAD_DIFF = """\
+diff --git a/src/utils/legacy.ts b/src/utils/legacy.ts
+deleted file mode 100644
+--- a/src/utils/legacy.ts
++++ /dev/null
+@@ -1,6 +0,0 @@
+-export function oldHelper() {}
+-export const DEPRECATED_CONSTANT = 'x';
+-export interface LegacyConfig {
+-  key: string;
+-}
+"""
+
+# Diff where a multi-line import block is also deleted.
+MULTILINE_IMPORT_DIFF = """\
+diff --git a/src/module.ts b/src/module.ts
+--- a/src/module.ts
++++ b/src/module.ts
+@@ -1,8 +1,2 @@
+-import {
+-  removedFn,
+-  anotherFn,
+-} from './removed';
+ export function survivor() {}
+diff --git a/src/removed.ts b/src/removed.ts
+deleted file mode 100644
+--- a/src/removed.ts
++++ /dev/null
+@@ -1,5 +0,0 @@
+-export function removedFn() {}
+-export function anotherFn() {}
+-export function orphanFn() {}
+"""
+
+
+# ---------------------------------------------------------------------------
+# _parse_deleted_imports
+# ---------------------------------------------------------------------------
+
+
+class TestParseDeletedImports:
+    def test_single_line_named_import(self) -> None:
+        imports = _parse_deleted_imports(N8N_PYODIDE_DIFF)
+        assert "PythonSandbox" in imports
+        assert any("PythonSandbox" in spec for spec in imports["PythonSandbox"])
+
+    def test_no_deleted_imports_for_load_pyodide(self) -> None:
+        imports = _parse_deleted_imports(N8N_PYODIDE_DIFF)
+        assert "LoadPyodide" not in imports
+
+    def test_single_line_named_import_prisma(self) -> None:
+        imports = _parse_deleted_imports(PRISMA_DIFF)
+        assert "serializeDatasources" in imports
+
+    def test_multiline_import_block(self) -> None:
+        imports = _parse_deleted_imports(MULTILINE_IMPORT_DIFF)
+        assert "removedFn" in imports
+        assert "anotherFn" in imports
+
+    def test_empty_diff_returns_empty(self) -> None:
+        assert _parse_deleted_imports("") == {}
+
+
+# ---------------------------------------------------------------------------
+# _is_feature_removal_fp
+# ---------------------------------------------------------------------------
+
+
+class TestIsFeatureRemovalFp:
+    def _decl(self, file: str, name: str) -> RemovedDeclaration:
+        return RemovedDeclaration(file=file, name=name, type="function", line_content="")
+
+    def test_python_sandbox_is_fp(self) -> None:
+        imports = _parse_deleted_imports(N8N_PYODIDE_DIFF)
+        decl = self._decl("packages/nodes-base/nodes/Code/PythonSandbox.ts", "PythonSandbox")
+        assert _is_feature_removal_fp(decl, imports) is True
+
+    def test_load_pyodide_is_not_fp(self) -> None:
+        imports = _parse_deleted_imports(N8N_PYODIDE_DIFF)
+        decl = self._decl("packages/nodes-base/nodes/Code/Pyodide.ts", "LoadPyodide")
+        assert _is_feature_removal_fp(decl, imports) is False
+
+    def test_serialize_datasources_is_fp(self) -> None:
+        imports = _parse_deleted_imports(PRISMA_DIFF)
+        decl = self._decl(
+            "packages/client-generator-js/src/serializeDatasources.ts",
+            "serializeDatasources",
+        )
+        assert _is_feature_removal_fp(decl, imports) is True
+
+    def test_extract_sqlite_sources_is_not_fp(self) -> None:
+        imports = _parse_deleted_imports(PRISMA_DIFF)
+        decl = self._decl(
+            "packages/client-generator-js/src/extractSqliteSources.ts",
+            "extractSqliteSources",
+        )
+        assert _is_feature_removal_fp(decl, imports) is False
+
+    def test_multiline_import_symbols_are_fp(self) -> None:
+        imports = _parse_deleted_imports(MULTILINE_IMPORT_DIFF)
+        for name in ("removedFn", "anotherFn"):
+            decl = self._decl("src/removed.ts", name)
+            assert _is_feature_removal_fp(decl, imports) is True
+
+    def test_orphan_fn_is_not_fp(self) -> None:
+        imports = _parse_deleted_imports(MULTILINE_IMPORT_DIFF)
+        decl = self._decl("src/removed.ts", "orphanFn")
+        assert _is_feature_removal_fp(decl, imports) is False
+
+    def test_all_genuinely_dead_are_not_fp(self) -> None:
+        imports = _parse_deleted_imports(ALL_GENUINELY_DEAD_DIFF)
+        for name in ("oldHelper", "DEPRECATED_CONSTANT", "LegacyConfig"):
+            decl = self._decl("src/utils/legacy.ts", name)
+            assert _is_feature_removal_fp(decl, imports) is False
+
+
+# ---------------------------------------------------------------------------
+# _parse_diff + FP filter integration (mirrors extract_ground_truth logic)
+# ---------------------------------------------------------------------------
+
+
+class TestGroundTruthFiltering:
+    """Integration: parse diff then apply FP filter — simulates extract_ground_truth."""
+
+    def _filtered_gt(self, diff: str) -> list[str]:
+        decls = _parse_diff(diff)
+        deleted_imports = _parse_deleted_imports(diff)
+        surviving = [d for d in decls if not _is_feature_removal_fp(d, deleted_imports)]
+        return [d.name for d in surviving]
+
+    def test_n8n_keeps_load_pyodide_drops_python_sandbox(self) -> None:
+        # LoadPyodide — no deleted importer → genuine dead code, kept in GT
+        # PythonSandbox — imported by deleted CodeRunner.ts → FP, removed from GT
+        names = self._filtered_gt(N8N_PYODIDE_DIFF)
+        assert "LoadPyodide" in names
+        assert "PythonSandbox" not in names
+
+    def test_prisma_keeps_extract_drops_serialize(self) -> None:
+        # extractSqliteSources has no deleted importer → kept
+        # serializeDatasources imported by deleted extractSqliteSources.ts → FP, dropped
+        # datasourceToDatasourceOverwrite has no deleted importer → kept
+        # DatasourceOverwrite has no deleted importer → kept
+        names = self._filtered_gt(PRISMA_DIFF)
+        assert "extractSqliteSources" in names
+        assert "DatasourceOverwrite" in names
+        assert "datasourceToDatasourceOverwrite" in names
+        assert "serializeDatasources" not in names
+
+    def test_all_genuinely_dead_all_kept(self) -> None:
+        names = self._filtered_gt(ALL_GENUINELY_DEAD_DIFF)
+        assert set(names) == {"oldHelper", "DEPRECATED_CONSTANT", "LegacyConfig"}
+
+    def test_multiline_import_orphan_kept_others_dropped(self) -> None:
+        names = self._filtered_gt(MULTILINE_IMPORT_DIFF)
+        assert "orphanFn" in names
+        assert "removedFn" not in names
+        assert "anotherFn" not in names