diff --git a/src/mcpbr/benchmarks/supermodel/benchmark.py b/src/mcpbr/benchmarks/supermodel/benchmark.py index b8d2af5..2378f6d 100644 --- a/src/mcpbr/benchmarks/supermodel/benchmark.py +++ b/src/mcpbr/benchmarks/supermodel/benchmark.py @@ -50,7 +50,6 @@ def __init__( tasks: list[dict[str, Any]] | None = None, supermodel_api_base: str = "https://api.supermodel.dev", supermodel_api_key: str | None = None, - resolved_threshold: float = 0.8, ground_truth_dir: str | Path | None = None, supermodel_api_timeout: int = 900, **kwargs: Any, @@ -63,9 +62,6 @@ def __init__( tasks: List of task config dicts from YAML. supermodel_api_base: Base URL for Supermodel API. supermodel_api_key: API key (or set SUPERMODEL_API_KEY env var). - resolved_threshold: Recall threshold to consider a task 'resolved' (precision is - reported but not required — the API returns many valid dead-code - candidates beyond the GT set, so precision is not a fair gate). ground_truth_dir: Directory to cache ground truth JSON files. supermodel_api_timeout: Max seconds to wait for Supermodel API (default 900). **kwargs: Additional keyword arguments (ignored for forward compat). @@ -75,7 +71,6 @@ def __init__( self.api_base = supermodel_api_base self.api_key = supermodel_api_key or os.environ.get("SUPERMODEL_API_KEY") self.api_timeout = supermodel_api_timeout - self.resolved_threshold = resolved_threshold self.gt_dir = Path(ground_truth_dir) if ground_truth_dir else DEFAULT_GT_DIR self.gt_dir.mkdir(parents=True, exist_ok=True) @@ -628,7 +623,9 @@ async def evaluate( precision = metrics["precision"] recall = metrics["recall"] - resolved = recall >= self.resolved_threshold + # resolved = True means the agent completed without error (evaluate() was reached). + # Pass/fail is not gated on a recall threshold — use precision/recall to judge quality. + resolved = True # Log results print(f"\n{'=' * 50}") @@ -641,7 +638,7 @@ async def evaluate( print(f" Precision: {precision * 100:.1f}%") print(f" Recall: {recall * 100:.1f}%") print(f" F1 Score: {metrics['f1_score'] * 100:.1f}%") - print(f" Resolved: {resolved}") + print(f" Succeeded: {resolved}") print(f"{'=' * 50}\n") return { diff --git a/src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py b/src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py index 9ad76c9..94f1216 100644 --- a/src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py +++ b/src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py @@ -44,6 +44,14 @@ r"\.rs$", ] +# Patterns for state-machine import parsing of deleted lines +_IMPORT_OPEN_RE = re.compile(r"^-\s*import\s+(?:type\s+)?(?:\w+\s*,\s*)?\{") +_IMPORT_SINGLE_RE = re.compile( + r"^-\s*import\s+(?:type\s+)?(?:\w+\s*,\s*)?\{([^}]+)\}\s+from\s+['\"]([^'\"]+)['\"]" +) +_IMPORT_DEFAULT_RE = re.compile(r"^-\s*import\s+(?:type\s+)?(\w+)\s+from\s+['\"]([^'\"]+)['\"]") +_IMPORT_FROM_RE = re.compile(r"from\s+['\"]([^'\"]+)['\"]") + SKIP_NAMES = { "default", "module", @@ -218,6 +226,14 @@ def extract_ground_truth( ) -> list[dict]: diff = self.get_pr_diff(repo, pr_number) declarations = _parse_diff(diff, language) + # Exclude symbols imported by other deleted files (feature-removal false positives). + # In a feature-removal PR many files are deleted together; symbols exported from one + # deleted file and imported by another are NOT dead code — they were active within + # the feature. We key the filter to the module specifier so that a common name like + # `Config` is only suppressed when there's an import that plausibly resolves to the + # same file as the declaration (basename match), reducing spurious exclusions. + deleted_imports = _parse_deleted_imports(diff) + declarations = [d for d in declarations if not _is_feature_removal_fp(d, deleted_imports)] if scope_prefix: declarations = [d for d in declarations if d.file.startswith(scope_prefix)] return [{"file": d.file, "name": d.name, "type": d.type} for d in declarations] @@ -263,3 +279,86 @@ def _parse_diff(diff_text: str, language: str = "typescript") -> list[RemovedDec break return declarations + + +def _parse_deleted_imports(diff_text: str) -> dict[str, set[str]]: + """Parse deleted import statements, returning symbol → set[module_specifier]. + + Handles single-line and multi-line named import blocks plus default imports. + Capturing the module specifier (the `from '...'` part) lets callers do + file-aware filtering rather than just name-based filtering. + """ + # symbol_name -> set of module specifiers that imported it + imports: dict[str, set[str]] = {} + accumulating = False + buf: list[str] = [] + + def _add(name: str, spec: str) -> None: + name = name.strip().split(" as ")[0].strip() + if name: + imports.setdefault(name, set()).add(spec) + + def _flush_block(raw: str) -> None: + # Extract module specifier from the accumulated block + m_from = _IMPORT_FROM_RE.search(raw) + spec = m_from.group(1) if m_from else "" + brace_open = raw.find("{") + brace_close = raw.find("}") + if brace_open != -1 and brace_close != -1: + names_part = raw[brace_open + 1 : brace_close] + for part in names_part.split(","): + _add(part, spec) + + for line in diff_text.split("\n"): + is_deleted = line.startswith("-") and not line.startswith("---") + + if not accumulating: + if not is_deleted: + continue + # Single-line named import: -import { foo, bar } from '...' + m = _IMPORT_SINGLE_RE.match(line) + if m: + spec = m.group(2) + for part in m.group(1).split(","): + _add(part, spec) + continue + # Default import: -import Foo from '...' + m = _IMPORT_DEFAULT_RE.match(line) + if m: + imports.setdefault(m.group(1), set()).add(m.group(2)) + continue + # Start of a multi-line named import block + if _IMPORT_OPEN_RE.match(line): + accumulating = True + buf = [line[1:]] # strip leading '-' + continue + else: + if is_deleted: + buf.append(line[1:]) + joined = " ".join(buf) + if "}" in joined: + accumulating = False + _flush_block(joined) + buf = [] + + return imports + + +def _is_feature_removal_fp( + decl: "RemovedDeclaration", deleted_imports: dict[str, set[str]] +) -> bool: + """True if the declaration appears to be a feature-removal false positive. + + A symbol is considered a false positive when another deleted file in the + same PR imports it FROM a module whose basename matches the declaration's + file. This ties the filter to the actual source file rather than just the + name, preventing spurious suppression of unrelated same-named exports. + """ + if decl.name not in deleted_imports: + return False + decl_stem = re.sub(r"\.(ts|tsx|js|jsx)$", "", decl.file.split("/")[-1]) + for spec in deleted_imports[decl.name]: + spec_stem = re.sub(r"\.(ts|tsx|js|jsx)$", "", spec.rstrip("/").split("/")[-1]) + if spec_stem and spec_stem == decl_stem: + return True + return False diff --git a/src/mcpbr/benchmarks/supermodel/git_utils.py b/src/mcpbr/benchmarks/supermodel/git_utils.py index 0cc4811..58a3675 100644 --- a/src/mcpbr/benchmarks/supermodel/git_utils.py +++ b/src/mcpbr/benchmarks/supermodel/git_utils.py @@ -34,6 +34,8 @@ "*.mp3", "*.wav", "*.ogg", + "*.map", + "*.js.map", ] @@ -187,8 +189,16 @@ async def _zip_repo_git_archive( return output_zip +_ZIP_COMMENT = b"mcpbr-analysis-v3" + + def _filter_zip_entries(zip_path: str, patterns: list[str]) -> None: - """Rewrite zip in-place, removing entries whose basename matches any glob pattern.""" + """Rewrite zip in-place, removing entries whose basename matches any glob pattern. + + Always rewrites (even if no entries are removed) to stamp the archive with + _ZIP_COMMENT so the resulting hash is stable across git archive versions and + acts as a cache-busting version marker. + """ tmp_path = zip_path + ".tmp" removed = 0 try: @@ -196,6 +206,7 @@ def _filter_zip_entries(zip_path: str, patterns: list[str]) -> None: zipfile.ZipFile(zip_path, "r") as zin, zipfile.ZipFile(tmp_path, "w", compression=zipfile.ZIP_DEFLATED) as zout, ): + zout.comment = _ZIP_COMMENT for item in zin.infolist(): basename = os.path.basename(item.filename) if any(fnmatch.fnmatch(basename, pat) for pat in patterns):