From 9591e154c5910a63202feaf3b522cafc4fab6cd1 Mon Sep 17 00:00:00 2001
From: Grey Newell <greyshipscode@gmail.com>
Date: Wed, 25 Mar 2026 10:21:39 -0400
Subject: [PATCH 1/3] fix(supermodel): add zip comment and source map excludes
 to bust stale caches

- Stamp every zip with _ZIP_COMMENT (mcpbr-analysis-v3) so the hash is
  stable, deterministic, and distinct from earlier runs on old containers.
  Server-side cached errors from the 4Gi era are bypassed automatically.
- Add *.map / *.js.map to BINARY_EXCLUDE_PATTERNS (source maps add no
  value for dead-code analysis and inflate zip size).
- Delete stale local analysis caches (all pre-date the binary-filter commit).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/mcpbr/benchmarks/supermodel/git_utils.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/src/mcpbr/benchmarks/supermodel/git_utils.py b/src/mcpbr/benchmarks/supermodel/git_utils.py
index 0cc4811..58a3675 100644
--- a/src/mcpbr/benchmarks/supermodel/git_utils.py
+++ b/src/mcpbr/benchmarks/supermodel/git_utils.py
@@ -34,6 +34,8 @@
     "*.mp3",
     "*.wav",
     "*.ogg",
+    "*.map",
+    "*.js.map",
 ]
 
 
@@ -187,8 +189,16 @@ async def _zip_repo_git_archive(
     return output_zip
 
 
+_ZIP_COMMENT = b"mcpbr-analysis-v3"
+
+
 def _filter_zip_entries(zip_path: str, patterns: list[str]) -> None:
-    """Rewrite zip in-place, removing entries whose basename matches any glob pattern."""
+    """Rewrite zip in-place, removing entries whose basename matches any glob pattern.
+
+    Always rewrites (even if no entries are removed) to stamp the archive with
+    _ZIP_COMMENT so the resulting hash is stable across git archive versions and
+    acts as a cache-busting version marker.
+    """
     tmp_path = zip_path + ".tmp"
     removed = 0
     try:
@@ -196,6 +206,7 @@ def _filter_zip_entries(zip_path: str, patterns: list[str]) -> None:
             zipfile.ZipFile(zip_path, "r") as zin,
             zipfile.ZipFile(tmp_path, "w", compression=zipfile.ZIP_DEFLATED) as zout,
         ):
+            zout.comment = _ZIP_COMMENT
             for item in zin.infolist():
                 basename = os.path.basename(item.filename)
                 if any(fnmatch.fnmatch(basename, pat) for pat in patterns):

From c579a18611d20c1dc038fdc9c31590678f98a814 Mon Sep 17 00:00:00 2001
From: Grey Newell <greyshipscode@gmail.com>
Date: Wed, 25 Mar 2026 16:19:03 -0400
Subject: [PATCH 2/3] fix(supermodel): remove recall threshold gating and
 filter feature-removal GT false positives
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Remove `resolved_threshold` parameter from SupermodelBenchmark.__init__; `resolved`
  now reflects whether the agent completed without error (evaluate() was reached), not
  whether recall crossed an arbitrary threshold. Precision/recall are the quality signal.

- Add `_parse_deleted_imports()` to dead_code.py: parses deleted `import { ... }` lines
  from a PR diff and filters out any removed exports whose names appear in those imports.
  Feature-removal PRs delete many files together; symbols exported by one deleted file
  and imported by another are NOT dead code — they were active within the feature.
  This eliminates false positives in GT for logto, n8n, prisma, podman, nextjs PRs.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/mcpbr/benchmarks/supermodel/benchmark.py  | 11 +++----
 .../supermodel/endpoints/dead_code.py         | 30 +++++++++++++++++++
 2 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/src/mcpbr/benchmarks/supermodel/benchmark.py b/src/mcpbr/benchmarks/supermodel/benchmark.py
index b8d2af5..2378f6d 100644
--- a/src/mcpbr/benchmarks/supermodel/benchmark.py
+++ b/src/mcpbr/benchmarks/supermodel/benchmark.py
@@ -50,7 +50,6 @@ def __init__(
         tasks: list[dict[str, Any]] | None = None,
         supermodel_api_base: str = "https://api.supermodel.dev",
         supermodel_api_key: str | None = None,
-        resolved_threshold: float = 0.8,
         ground_truth_dir: str | Path | None = None,
         supermodel_api_timeout: int = 900,
         **kwargs: Any,
@@ -63,9 +62,6 @@ def __init__(
             tasks: List of task config dicts from YAML.
             supermodel_api_base: Base URL for Supermodel API.
             supermodel_api_key: API key (or set SUPERMODEL_API_KEY env var).
-            resolved_threshold: Recall threshold to consider a task 'resolved' (precision is
-                               reported but not required — the API returns many valid dead-code
-                               candidates beyond the GT set, so precision is not a fair gate).
             ground_truth_dir: Directory to cache ground truth JSON files.
             supermodel_api_timeout: Max seconds to wait for Supermodel API (default 900).
             **kwargs: Additional keyword arguments (ignored for forward compat).
@@ -75,7 +71,6 @@ def __init__(
         self.api_base = supermodel_api_base
         self.api_key = supermodel_api_key or os.environ.get("SUPERMODEL_API_KEY")
         self.api_timeout = supermodel_api_timeout
-        self.resolved_threshold = resolved_threshold
         self.gt_dir = Path(ground_truth_dir) if ground_truth_dir else DEFAULT_GT_DIR
         self.gt_dir.mkdir(parents=True, exist_ok=True)
 
@@ -628,7 +623,9 @@ async def evaluate(
 
         precision = metrics["precision"]
         recall = metrics["recall"]
-        resolved = recall >= self.resolved_threshold
+        # resolved = True means the agent completed without error (evaluate() was reached).
+        # Pass/fail is not gated on a recall threshold — use precision/recall to judge quality.
+        resolved = True
 
         # Log results
         print(f"\n{'=' * 50}")
@@ -641,7 +638,7 @@ async def evaluate(
         print(f"  Precision: {precision * 100:.1f}%")
         print(f"  Recall: {recall * 100:.1f}%")
         print(f"  F1 Score: {metrics['f1_score'] * 100:.1f}%")
-        print(f"  Resolved: {resolved}")
+        print(f"  Succeeded: {resolved}")
         print(f"{'=' * 50}\n")
 
         return {
diff --git a/src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py b/src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py
index 9ad76c9..8d22bdc 100644
--- a/src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py
+++ b/src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py
@@ -44,6 +44,11 @@
     r"\.rs$",
 ]
 
+# Pattern to extract named imports from deleted lines (TypeScript/JavaScript)
+# Matches: -import { foo, bar as baz } from '...'
+# Also: -import type { Foo } from '...'
+_DELETED_NAMED_IMPORT_RE = re.compile(r"^-\s*import\s+(?:type\s+)?\{([^}]+)\}\s+from")
+
 SKIP_NAMES = {
     "default",
     "module",
@@ -218,6 +223,12 @@ def extract_ground_truth(
     ) -> list[dict]:
         diff = self.get_pr_diff(repo, pr_number)
         declarations = _parse_diff(diff, language)
+        # Exclude symbols imported by other deleted files (feature-removal false positives).
+        # In a feature-removal PR many files are deleted together; symbols exported from one
+        # deleted file and imported by another are NOT dead code — they were active within
+        # the feature. Filter them out so they don't inflate false-negative counts.
+        deleted_imports = _parse_deleted_imports(diff)
+        declarations = [d for d in declarations if d.name not in deleted_imports]
         if scope_prefix:
             declarations = [d for d in declarations if d.file.startswith(scope_prefix)]
         return [{"file": d.file, "name": d.name, "type": d.type} for d in declarations]
@@ -263,3 +274,22 @@ def _parse_diff(diff_text: str, language: str = "typescript") -> list[RemovedDec
                 break
 
     return declarations
+
+
+def _parse_deleted_imports(diff_text: str) -> set[str]:
+    """Extract symbol names that appear in deleted import statements.
+
+    Used to filter out false positives from feature-removal PRs: a symbol that
+    is imported by another file being deleted in the same PR is NOT dead code.
+    """
+    imported: set[str] = set()
+    for line in diff_text.split("\n"):
+        if not line.startswith("-") or line.startswith("---"):
+            continue
+        m = _DELETED_NAMED_IMPORT_RE.match(line)
+        if m:
+            for part in m.group(1).split(","):
+                name = part.strip().split(" as ")[0].strip()
+                if name:
+                    imported.add(name)
+    return imported

From 918e3a16daa74e65886521a376914900bd156bcd Mon Sep 17 00:00:00 2001
From: Grey Newell <greyshipscode@gmail.com>
Date: Wed, 25 Mar 2026 17:13:44 -0400
Subject: [PATCH 3/3] =?UTF-8?q?fix(supermodel):=20improve=20deleted-import?=
 =?UTF-8?q?=20parser=20=E2=80=94=20multi-line=20blocks=20+=20file-aware=20?=
 =?UTF-8?q?filter?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Address CodeRabbit review comments on #7:

1. Multi-line imports: Replace single-regex approach with a state machine that
   handles multi-line `import { ... }` blocks (common in feature-removal PRs
   where imports span many lines). Also captures default imports. The state
   machine accumulates deleted lines until the closing `}` then extracts names.

2. File-aware filtering: `_parse_deleted_imports` now returns
   `dict[str, set[str]]` (symbol → module_specifiers) instead of `set[str]`.
   New `_is_feature_removal_fp()` does basename matching between the module
   specifier in the import and the declaration's file stem (e.g. `./userService`
   matches `userService.ts`). This prevents spurious exclusion of exports that
   share a name with an unrelated import from a different module.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../supermodel/endpoints/dead_code.py         | 109 ++++++++++++++----
 1 file changed, 89 insertions(+), 20 deletions(-)

diff --git a/src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py b/src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py
index 8d22bdc..94f1216 100644
--- a/src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py
+++ b/src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py
@@ -44,10 +44,13 @@
     r"\.rs$",
 ]
 
-# Pattern to extract named imports from deleted lines (TypeScript/JavaScript)
-# Matches: -import { foo, bar as baz } from '...'
-# Also: -import type { Foo } from '...'
-_DELETED_NAMED_IMPORT_RE = re.compile(r"^-\s*import\s+(?:type\s+)?\{([^}]+)\}\s+from")
+# Patterns for state-machine import parsing of deleted lines
+_IMPORT_OPEN_RE = re.compile(r"^-\s*import\s+(?:type\s+)?(?:\w+\s*,\s*)?\{")
+_IMPORT_SINGLE_RE = re.compile(
+    r"^-\s*import\s+(?:type\s+)?(?:\w+\s*,\s*)?\{([^}]+)\}\s+from\s+['\"]([^'\"]+)['\"]"
+)
+_IMPORT_DEFAULT_RE = re.compile(r"^-\s*import\s+(?:type\s+)?(\w+)\s+from\s+['\"]([^'\"]+)['\"]")
+_IMPORT_FROM_RE = re.compile(r"from\s+['\"]([^'\"]+)['\"]")
 
 SKIP_NAMES = {
     "default",
@@ -226,9 +229,11 @@ def extract_ground_truth(
         # Exclude symbols imported by other deleted files (feature-removal false positives).
         # In a feature-removal PR many files are deleted together; symbols exported from one
         # deleted file and imported by another are NOT dead code — they were active within
-        # the feature. Filter them out so they don't inflate false-negative counts.
+        # the feature. We key the filter to the module specifier so that a common name like
+        # `Config` is only suppressed when there's an import that plausibly resolves to the
+        # same file as the declaration (basename match), reducing spurious exclusions.
         deleted_imports = _parse_deleted_imports(diff)
-        declarations = [d for d in declarations if d.name not in deleted_imports]
+        declarations = [d for d in declarations if not _is_feature_removal_fp(d, deleted_imports)]
         if scope_prefix:
             declarations = [d for d in declarations if d.file.startswith(scope_prefix)]
         return [{"file": d.file, "name": d.name, "type": d.type} for d in declarations]
@@ -276,20 +281,84 @@ def _parse_diff(diff_text: str, language: str = "typescript") -> list[RemovedDec
     return declarations
 
 
-def _parse_deleted_imports(diff_text: str) -> set[str]:
-    """Extract symbol names that appear in deleted import statements.
+def _parse_deleted_imports(diff_text: str) -> dict[str, set[str]]:
+    """Parse deleted import statements, returning symbol → set[module_specifier].
 
-    Used to filter out false positives from feature-removal PRs: a symbol that
-    is imported by another file being deleted in the same PR is NOT dead code.
+    Handles single-line and multi-line named import blocks plus default imports.
+    Capturing the module specifier (the `from '...'` part) lets callers do
+    file-aware filtering rather than just name-based filtering.
     """
-    imported: set[str] = set()
+    # symbol_name -> set of module specifiers that imported it
+    imports: dict[str, set[str]] = {}
+    accumulating = False
+    buf: list[str] = []
+
+    def _add(name: str, spec: str) -> None:
+        name = name.strip().split(" as ")[0].strip()
+        if name:
+            imports.setdefault(name, set()).add(spec)
+
+    def _flush_block(raw: str) -> None:
+        # Extract module specifier from the accumulated block
+        m_from = _IMPORT_FROM_RE.search(raw)
+        spec = m_from.group(1) if m_from else ""
+        brace_open = raw.find("{")
+        brace_close = raw.find("}")
+        if brace_open != -1 and brace_close != -1:
+            names_part = raw[brace_open + 1 : brace_close]
+            for part in names_part.split(","):
+                _add(part, spec)
+
     for line in diff_text.split("\n"):
-        if not line.startswith("-") or line.startswith("---"):
-            continue
-        m = _DELETED_NAMED_IMPORT_RE.match(line)
-        if m:
-            for part in m.group(1).split(","):
-                name = part.strip().split(" as ")[0].strip()
-                if name:
-                    imported.add(name)
-    return imported
+        is_deleted = line.startswith("-") and not line.startswith("---")
+
+        if not accumulating:
+            if not is_deleted:
+                continue
+            # Single-line named import: -import { foo, bar } from '...'
+            m = _IMPORT_SINGLE_RE.match(line)
+            if m:
+                spec = m.group(2)
+                for part in m.group(1).split(","):
+                    _add(part, spec)
+                continue
+            # Default import: -import Foo from '...'
+            m = _IMPORT_DEFAULT_RE.match(line)
+            if m:
+                imports.setdefault(m.group(1), set()).add(m.group(2))
+                continue
+            # Start of a multi-line named import block
+            if _IMPORT_OPEN_RE.match(line):
+                accumulating = True
+                buf = [line[1:]]  # strip leading '-'
+                continue
+        else:
+            if is_deleted:
+                buf.append(line[1:])
+            joined = " ".join(buf)
+            if "}" in joined:
+                accumulating = False
+                _flush_block(joined)
+                buf = []
+
+    return imports
+
+
+def _is_feature_removal_fp(
+    decl: "RemovedDeclaration", deleted_imports: dict[str, set[str]]
+) -> bool:
+    """True if the declaration appears to be a feature-removal false positive.
+
+    A symbol is considered a false positive when another deleted file in the
+    same PR imports it FROM a module whose basename matches the declaration's
+    file. This ties the filter to the actual source file rather than just the
+    name, preventing spurious suppression of unrelated same-named exports.
+    """
+    if decl.name not in deleted_imports:
+        return False
+    decl_stem = re.sub(r"\.(ts|tsx|js|jsx)$", "", decl.file.split("/")[-1])
+    for spec in deleted_imports[decl.name]:
+        spec_stem = re.sub(r"\.(ts|tsx|js|jsx)$", "", spec.rstrip("/").split("/")[-1])
+        if spec_stem and spec_stem == decl_stem:
+            return True
+    return False