Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 4 additions & 7 deletions src/mcpbr/benchmarks/supermodel/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,6 @@ def __init__(
tasks: list[dict[str, Any]] | None = None,
supermodel_api_base: str = "https://api.supermodel.dev",
supermodel_api_key: str | None = None,
resolved_threshold: float = 0.8,
ground_truth_dir: str | Path | None = None,
supermodel_api_timeout: int = 900,
**kwargs: Any,
Expand All @@ -63,9 +62,6 @@ def __init__(
tasks: List of task config dicts from YAML.
supermodel_api_base: Base URL for Supermodel API.
supermodel_api_key: API key (or set SUPERMODEL_API_KEY env var).
resolved_threshold: Recall threshold to consider a task 'resolved' (precision is
reported but not required — the API returns many valid dead-code
candidates beyond the GT set, so precision is not a fair gate).
ground_truth_dir: Directory to cache ground truth JSON files.
supermodel_api_timeout: Max seconds to wait for Supermodel API (default 900).
**kwargs: Additional keyword arguments (ignored for forward compat).
Expand All @@ -75,7 +71,6 @@ def __init__(
self.api_base = supermodel_api_base
self.api_key = supermodel_api_key or os.environ.get("SUPERMODEL_API_KEY")
self.api_timeout = supermodel_api_timeout
self.resolved_threshold = resolved_threshold
self.gt_dir = Path(ground_truth_dir) if ground_truth_dir else DEFAULT_GT_DIR
self.gt_dir.mkdir(parents=True, exist_ok=True)

Expand Down Expand Up @@ -628,7 +623,9 @@ async def evaluate(

precision = metrics["precision"]
recall = metrics["recall"]
resolved = recall >= self.resolved_threshold
# resolved = True means the agent completed without error (evaluate() was reached).
# Pass/fail is not gated on a recall threshold — use precision/recall to judge quality.
resolved = True
Comment on lines +626 to +628
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Don't repurpose resolved into "evaluate() was reached".

With Line 628 set unconditionally, a task that writes REPORT.json but gets 0% recall still reports resolved=True. src/mcpbr/harness.py still uses resolved for PASS/FAIL logging, resolution rates, comparison deltas, and notifications, so downstream summaries can now look perfect on completely wrong answers. Please emit a separate succeeded flag here, or migrate those consumers in the same PR.

Also applies to: 641-646

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@src/mcpbr/benchmarks/supermodel/benchmark.py` around lines 626 - 628, The
code incorrectly repurposes the existing resolved flag to mean "evaluate() was
reached"; instead, leave resolved representing task completion without error and
add a new boolean succeeded that is set when evaluate() is actually reached
(and/or the evaluation completed successfully) — e.g., after evaluate() returns
and REPORT.json is produced set succeeded = True; do NOT set resolved = True
unconditionally. Apply the same change to the other related blocks around the
evaluate() call (the section referenced at lines ~641-646) so downstream
consumers can still use resolved for PASS/FAIL and use succeeded when emitting
evaluation-reached/resolution metrics or notifications.


# Log results
print(f"\n{'=' * 50}")
Expand All @@ -641,7 +638,7 @@ async def evaluate(
print(f" Precision: {precision * 100:.1f}%")
print(f" Recall: {recall * 100:.1f}%")
print(f" F1 Score: {metrics['f1_score'] * 100:.1f}%")
print(f" Resolved: {resolved}")
print(f" Succeeded: {resolved}")
print(f"{'=' * 50}\n")

return {
Expand Down
99 changes: 99 additions & 0 deletions src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,14 @@
r"\.rs$",
]

# Patterns for state-machine import parsing of deleted lines (lines prefixed
# with '-' in a unified diff). Each pattern tolerates an optional `type`
# keyword and an optional leading default import (`import Foo, { ... }`).
# Opening line of a (possibly multi-line) named import: `-import {`
_IMPORT_OPEN_RE = re.compile(r"^-\s*import\s+(?:type\s+)?(?:\w+\s*,\s*)?\{")
# Complete single-line named import; group(1) = names inside the braces,
# group(2) = module specifier from the `from '...'` clause.
_IMPORT_SINGLE_RE = re.compile(
    r"^-\s*import\s+(?:type\s+)?(?:\w+\s*,\s*)?\{([^}]+)\}\s+from\s+['\"]([^'\"]+)['\"]"
)
# Default import: `-import Foo from '...'`; group(1) = name, group(2) = specifier.
_IMPORT_DEFAULT_RE = re.compile(r"^-\s*import\s+(?:type\s+)?(\w+)\s+from\s+['\"]([^'\"]+)['\"]")
# Pulls the `from '...'` module specifier out of an accumulated multi-line block.
_IMPORT_FROM_RE = re.compile(r"from\s+['\"]([^'\"]+)['\"]")

SKIP_NAMES = {
"default",
"module",
Expand Down Expand Up @@ -218,6 +226,14 @@ def extract_ground_truth(
) -> list[dict]:
diff = self.get_pr_diff(repo, pr_number)
declarations = _parse_diff(diff, language)
# Exclude symbols imported by other deleted files (feature-removal false positives).
# In a feature-removal PR many files are deleted together; symbols exported from one
# deleted file and imported by another are NOT dead code — they were active within
# the feature. We key the filter to the module specifier so that a common name like
# `Config` is only suppressed when there's an import that plausibly resolves to the
# same file as the declaration (basename match), reducing spurious exclusions.
deleted_imports = _parse_deleted_imports(diff)
declarations = [d for d in declarations if not _is_feature_removal_fp(d, deleted_imports)]
if scope_prefix:
declarations = [d for d in declarations if d.file.startswith(scope_prefix)]
return [{"file": d.file, "name": d.name, "type": d.type} for d in declarations]
Expand Down Expand Up @@ -263,3 +279,86 @@ def _parse_diff(diff_text: str, language: str = "typescript") -> list[RemovedDec
break

return declarations


def _parse_deleted_imports(diff_text: str) -> dict[str, set[str]]:
    """Parse deleted import statements, returning symbol → set[module_specifier].

    Handles single-line and multi-line named import blocks plus default imports.
    Capturing the module specifier (the `from '...'` part) lets callers do
    file-aware filtering rather than just name-based filtering.

    A multi-line import block is only recorded when every one of its lines is
    deleted: a context/added line interrupting the block means the import is
    still live, so the partial buffer is discarded (previously the state
    machine kept accumulating and could merge an unrelated later deleted line
    containing ``}`` into the block, attributing bogus symbol names).
    """
    # symbol_name -> set of module specifiers that imported it
    imports: dict[str, set[str]] = {}
    accumulating = False
    buf: list[str] = []

    def _add(name: str, spec: str) -> None:
        # Drop any `as` alias and surrounding whitespace; ignore empty fragments.
        name = name.strip().split(" as ")[0].strip()
        if name:
            imports.setdefault(name, set()).add(spec)

    def _flush_block(raw: str) -> None:
        # Extract the module specifier, then the comma-separated names
        # between the first brace pair of the accumulated block.
        m_from = _IMPORT_FROM_RE.search(raw)
        spec = m_from.group(1) if m_from else ""
        brace_open = raw.find("{")
        brace_close = raw.find("}")
        if brace_open != -1 and brace_close != -1:
            for part in raw[brace_open + 1 : brace_close].split(","):
                _add(part, spec)

    for line in diff_text.split("\n"):
        # A deleted diff line starts with '-' but not '---' (file header).
        is_deleted = line.startswith("-") and not line.startswith("---")

        if accumulating:
            if not is_deleted:
                # Import block only partially deleted — it is still live.
                # Abort instead of merging unrelated later deletions.
                accumulating = False
                buf = []
                continue
            buf.append(line[1:])
            joined = " ".join(buf)
            if "}" in joined:
                accumulating = False
                _flush_block(joined)
                buf = []
            continue

        if not is_deleted:
            continue
        # Single-line named import: -import { foo, bar } from '...'
        m = _IMPORT_SINGLE_RE.match(line)
        if m:
            spec = m.group(2)
            for part in m.group(1).split(","):
                _add(part, spec)
            continue
        # Default import: -import Foo from '...'
        m = _IMPORT_DEFAULT_RE.match(line)
        if m:
            _add(m.group(1), m.group(2))
            continue
        # Start of a multi-line named import block
        if _IMPORT_OPEN_RE.match(line):
            accumulating = True
            buf = [line[1:]]  # strip leading '-'

    return imports


def _is_feature_removal_fp(
decl: "RemovedDeclaration", deleted_imports: dict[str, set[str]]
) -> bool:
"""True if the declaration appears to be a feature-removal false positive.

A symbol is considered a false positive when another deleted file in the
same PR imports it FROM a module whose basename matches the declaration's
file. This ties the filter to the actual source file rather than just the
name, preventing spurious suppression of unrelated same-named exports.
"""
if decl.name not in deleted_imports:
return False
decl_stem = re.sub(r"\.(ts|tsx|js|jsx)$", "", decl.file.split("/")[-1])
for spec in deleted_imports[decl.name]:
spec_stem = re.sub(r"\.(ts|tsx|js|jsx)$", "", spec.rstrip("/").split("/")[-1])
if spec_stem and spec_stem == decl_stem:
return True
return False
13 changes: 12 additions & 1 deletion src/mcpbr/benchmarks/supermodel/git_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@
"*.mp3",
"*.wav",
"*.ogg",
"*.map",
"*.js.map",
]


Expand Down Expand Up @@ -187,15 +189,24 @@ async def _zip_repo_git_archive(
return output_zip


_ZIP_COMMENT = b"mcpbr-analysis-v3"


def _filter_zip_entries(zip_path: str, patterns: list[str]) -> None:
"""Rewrite zip in-place, removing entries whose basename matches any glob pattern."""
"""Rewrite zip in-place, removing entries whose basename matches any glob pattern.

Always rewrites (even if no entries are removed) to stamp the archive with
_ZIP_COMMENT so the resulting hash is stable across git archive versions and
acts as a cache-busting version marker.
"""
tmp_path = zip_path + ".tmp"
removed = 0
try:
with (
zipfile.ZipFile(zip_path, "r") as zin,
zipfile.ZipFile(tmp_path, "w", compression=zipfile.ZIP_DEFLATED) as zout,
):
zout.comment = _ZIP_COMMENT
for item in zin.infolist():
basename = os.path.basename(item.filename)
if any(fnmatch.fnmatch(basename, pat) for pat in patterns):
Expand Down
Loading