From 9591e154c5910a63202feaf3b522cafc4fab6cd1 Mon Sep 17 00:00:00 2001 From: Grey Newell Date: Wed, 25 Mar 2026 10:21:39 -0400 Subject: [PATCH 1/8] fix(supermodel): add zip comment and source map excludes to bust stale caches - Stamp every zip with _ZIP_COMMENT (mcpbr-analysis-v3) so the hash is stable, deterministic, and distinct from earlier runs on old containers. Server-side cached errors from the 4Gi era are bypassed automatically. - Add *.map / *.js.map to BINARY_EXCLUDE_PATTERNS (source maps add no value for dead-code analysis and inflate zip size). - Delete stale local analysis caches (all pre-date the binary-filter commit). Co-Authored-By: Claude Sonnet 4.6 --- src/mcpbr/benchmarks/supermodel/git_utils.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/mcpbr/benchmarks/supermodel/git_utils.py b/src/mcpbr/benchmarks/supermodel/git_utils.py index 0cc4811..58a3675 100644 --- a/src/mcpbr/benchmarks/supermodel/git_utils.py +++ b/src/mcpbr/benchmarks/supermodel/git_utils.py @@ -34,6 +34,8 @@ "*.mp3", "*.wav", "*.ogg", + "*.map", + "*.js.map", ] @@ -187,8 +189,16 @@ async def _zip_repo_git_archive( return output_zip +_ZIP_COMMENT = b"mcpbr-analysis-v3" + + def _filter_zip_entries(zip_path: str, patterns: list[str]) -> None: - """Rewrite zip in-place, removing entries whose basename matches any glob pattern.""" + """Rewrite zip in-place, removing entries whose basename matches any glob pattern. + + Always rewrites (even if no entries are removed) to stamp the archive with + _ZIP_COMMENT so the resulting hash is stable across git archive versions and + acts as a cache-busting version marker. + """ tmp_path = zip_path + ".tmp" removed = 0 try: @@ -196,6 +206,7 @@ def _filter_zip_entries(zip_path: str, patterns: list[str]) -> None: zipfile.ZipFile(zip_path, "r") as zin, zipfile.ZipFile(tmp_path, "w", compression=zipfile.ZIP_DEFLATED) as zout, ): + zout.comment = _ZIP_COMMENT for item in zin.infolist(): basename = os.path.basename(item.filename) if any(fnmatch.fnmatch(basename, pat) for pat in patterns): From c579a18611d20c1dc038fdc9c31590678f98a814 Mon Sep 17 00:00:00 2001 From: Grey Newell Date: Wed, 25 Mar 2026 16:19:03 -0400 Subject: [PATCH 2/8] fix(supermodel): remove recall threshold gating and filter feature-removal GT false positives MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove `resolved_threshold` parameter from SupermodelBenchmark.__init__; `resolved` now reflects whether the agent completed without error (evaluate() was reached), not whether recall crossed an arbitrary threshold. Precision/recall are the quality signal. - Add `_parse_deleted_imports()` to dead_code.py: parses deleted `import { ... }` lines from a PR diff and filters out any removed exports whose names appear in those imports. Feature-removal PRs delete many files together; symbols exported by one deleted file and imported by another are NOT dead code — they were active within the feature. This eliminates false positives in GT for logto, n8n, prisma, podman, nextjs PRs. Co-Authored-By: Claude Sonnet 4.6 --- src/mcpbr/benchmarks/supermodel/benchmark.py | 11 +++---- .../supermodel/endpoints/dead_code.py | 30 +++++++++++++++++++ 2 files changed, 34 insertions(+), 7 deletions(-) diff --git a/src/mcpbr/benchmarks/supermodel/benchmark.py b/src/mcpbr/benchmarks/supermodel/benchmark.py index b8d2af5..2378f6d 100644 --- a/src/mcpbr/benchmarks/supermodel/benchmark.py +++ b/src/mcpbr/benchmarks/supermodel/benchmark.py @@ -50,7 +50,6 @@ def __init__( tasks: list[dict[str, Any]] | None = None, supermodel_api_base: str = "https://api.supermodel.dev", supermodel_api_key: str | None = None, - resolved_threshold: float = 0.8, ground_truth_dir: str | Path | None = None, supermodel_api_timeout: int = 900, **kwargs: Any, @@ -63,9 +62,6 @@ def __init__( tasks: List of task config dicts from YAML. supermodel_api_base: Base URL for Supermodel API. supermodel_api_key: API key (or set SUPERMODEL_API_KEY env var). - resolved_threshold: Recall threshold to consider a task 'resolved' (precision is - reported but not required — the API returns many valid dead-code - candidates beyond the GT set, so precision is not a fair gate). ground_truth_dir: Directory to cache ground truth JSON files. supermodel_api_timeout: Max seconds to wait for Supermodel API (default 900). **kwargs: Additional keyword arguments (ignored for forward compat). @@ -75,7 +71,6 @@ def __init__( self.api_base = supermodel_api_base self.api_key = supermodel_api_key or os.environ.get("SUPERMODEL_API_KEY") self.api_timeout = supermodel_api_timeout - self.resolved_threshold = resolved_threshold self.gt_dir = Path(ground_truth_dir) if ground_truth_dir else DEFAULT_GT_DIR self.gt_dir.mkdir(parents=True, exist_ok=True) @@ -628,7 +623,9 @@ async def evaluate( precision = metrics["precision"] recall = metrics["recall"] - resolved = recall >= self.resolved_threshold + # resolved = True means the agent completed without error (evaluate() was reached). + # Pass/fail is not gated on a recall threshold — use precision/recall to judge quality. + resolved = True # Log results print(f"\n{'=' * 50}") @@ -641,7 +638,7 @@ async def evaluate( print(f" Precision: {precision * 100:.1f}%") print(f" Recall: {recall * 100:.1f}%") print(f" F1 Score: {metrics['f1_score'] * 100:.1f}%") - print(f" Resolved: {resolved}") + print(f" Succeeded: {resolved}") print(f"{'=' * 50}\n") return { diff --git a/src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py b/src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py index 9ad76c9..8d22bdc 100644 --- a/src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py +++ b/src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py @@ -44,6 +44,11 @@ r"\.rs$", ] +# Pattern to extract named imports from deleted lines (TypeScript/JavaScript) +# Matches: -import { foo, bar as baz } from '...' +# Also: -import type { Foo } from '...' +_DELETED_NAMED_IMPORT_RE = re.compile(r"^-\s*import\s+(?:type\s+)?\{([^}]+)\}\s+from") + SKIP_NAMES = { "default", "module", @@ -218,6 +223,12 @@ def extract_ground_truth( ) -> list[dict]: diff = self.get_pr_diff(repo, pr_number) declarations = _parse_diff(diff, language) + # Exclude symbols imported by other deleted files (feature-removal false positives). + # In a feature-removal PR many files are deleted together; symbols exported from one + # deleted file and imported by another are NOT dead code — they were active within + # the feature. Filter them out so they don't inflate false-negative counts. + deleted_imports = _parse_deleted_imports(diff) + declarations = [d for d in declarations if d.name not in deleted_imports] if scope_prefix: declarations = [d for d in declarations if d.file.startswith(scope_prefix)] return [{"file": d.file, "name": d.name, "type": d.type} for d in declarations] @@ -263,3 +274,22 @@ def _parse_diff(diff_text: str, language: str = "typescript") -> list[RemovedDec break return declarations + + +def _parse_deleted_imports(diff_text: str) -> set[str]: + """Extract symbol names that appear in deleted import statements. + + Used to filter out false positives from feature-removal PRs: a symbol that + is imported by another file being deleted in the same PR is NOT dead code. + """ + imported: set[str] = set() + for line in diff_text.split("\n"): + if not line.startswith("-") or line.startswith("---"): + continue + m = _DELETED_NAMED_IMPORT_RE.match(line) + if m: + for part in m.group(1).split(","): + name = part.strip().split(" as ")[0].strip() + if name: + imported.add(name) + return imported From 918e3a16daa74e65886521a376914900bd156bcd Mon Sep 17 00:00:00 2001 From: Grey Newell Date: Wed, 25 Mar 2026 17:13:44 -0400 Subject: [PATCH 3/8] =?UTF-8?q?fix(supermodel):=20improve=20deleted-import?= =?UTF-8?q?=20parser=20=E2=80=94=20multi-line=20blocks=20+=20file-aware=20?= =?UTF-8?q?filter?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address CodeRabbit review comments on #7: 1. Multi-line imports: Replace single-regex approach with a state machine that handles multi-line `import { ... }` blocks (common in feature-removal PRs where imports span many lines). Also captures default imports. The state machine accumulates deleted lines until the closing `}` then extracts names. 2. File-aware filtering: `_parse_deleted_imports` now returns `dict[str, set[str]]` (symbol → module_specifiers) instead of `set[str]`. New `_is_feature_removal_fp()` does basename matching between the module specifier in the import and the declaration's file stem (e.g. `./userService` matches `userService.ts`). This prevents spurious exclusion of exports that share a name with an unrelated import from a different module. Co-Authored-By: Claude Sonnet 4.6 --- .../supermodel/endpoints/dead_code.py | 109 ++++++++++++++---- 1 file changed, 89 insertions(+), 20 deletions(-) diff --git a/src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py b/src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py index 8d22bdc..94f1216 100644 --- a/src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py +++ b/src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py @@ -44,10 +44,13 @@ r"\.rs$", ] -# Pattern to extract named imports from deleted lines (TypeScript/JavaScript) -# Matches: -import { foo, bar as baz } from '...' -# Also: -import type { Foo } from '...' -_DELETED_NAMED_IMPORT_RE = re.compile(r"^-\s*import\s+(?:type\s+)?\{([^}]+)\}\s+from") +# Patterns for state-machine import parsing of deleted lines +_IMPORT_OPEN_RE = re.compile(r"^-\s*import\s+(?:type\s+)?(?:\w+\s*,\s*)?\{") +_IMPORT_SINGLE_RE = re.compile( + r"^-\s*import\s+(?:type\s+)?(?:\w+\s*,\s*)?\{([^}]+)\}\s+from\s+['\"]([^'\"]+)['\"]" +) +_IMPORT_DEFAULT_RE = re.compile(r"^-\s*import\s+(?:type\s+)?(\w+)\s+from\s+['\"]([^'\"]+)['\"]") +_IMPORT_FROM_RE = re.compile(r"from\s+['\"]([^'\"]+)['\"]") SKIP_NAMES = { "default", @@ -226,9 +229,11 @@ def extract_ground_truth( # Exclude symbols imported by other deleted files (feature-removal false positives). # In a feature-removal PR many files are deleted together; symbols exported from one # deleted file and imported by another are NOT dead code — they were active within - # the feature. Filter them out so they don't inflate false-negative counts. + # the feature. We key the filter to the module specifier so that a common name like + # `Config` is only suppressed when there's an import that plausibly resolves to the + # same file as the declaration (basename match), reducing spurious exclusions. deleted_imports = _parse_deleted_imports(diff) - declarations = [d for d in declarations if d.name not in deleted_imports] + declarations = [d for d in declarations if not _is_feature_removal_fp(d, deleted_imports)] if scope_prefix: declarations = [d for d in declarations if d.file.startswith(scope_prefix)] return [{"file": d.file, "name": d.name, "type": d.type} for d in declarations] @@ -276,20 +281,84 @@ def _parse_diff(diff_text: str, language: str = "typescript") -> list[RemovedDec return declarations -def _parse_deleted_imports(diff_text: str) -> set[str]: - """Extract symbol names that appear in deleted import statements. +def _parse_deleted_imports(diff_text: str) -> dict[str, set[str]]: + """Parse deleted import statements, returning symbol → set[module_specifier]. - Used to filter out false positives from feature-removal PRs: a symbol that - is imported by another file being deleted in the same PR is NOT dead code. + Handles single-line and multi-line named import blocks plus default imports. + Capturing the module specifier (the `from '...'` part) lets callers do + file-aware filtering rather than just name-based filtering. """ - imported: set[str] = set() + # symbol_name -> set of module specifiers that imported it + imports: dict[str, set[str]] = {} + accumulating = False + buf: list[str] = [] + + def _add(name: str, spec: str) -> None: + name = name.strip().split(" as ")[0].strip() + if name: + imports.setdefault(name, set()).add(spec) + + def _flush_block(raw: str) -> None: + # Extract module specifier from the accumulated block + m_from = _IMPORT_FROM_RE.search(raw) + spec = m_from.group(1) if m_from else "" + brace_open = raw.find("{") + brace_close = raw.find("}") + if brace_open != -1 and brace_close != -1: + names_part = raw[brace_open + 1 : brace_close] + for part in names_part.split(","): + _add(part, spec) + for line in diff_text.split("\n"): - if not line.startswith("-") or line.startswith("---"): - continue - m = _DELETED_NAMED_IMPORT_RE.match(line) - if m: - for part in m.group(1).split(","): - name = part.strip().split(" as ")[0].strip() - if name: - imported.add(name) - return imported + is_deleted = line.startswith("-") and not line.startswith("---") + + if not accumulating: + if not is_deleted: + continue + # Single-line named import: -import { foo, bar } from '...' + m = _IMPORT_SINGLE_RE.match(line) + if m: + spec = m.group(2) + for part in m.group(1).split(","): + _add(part, spec) + continue + # Default import: -import Foo from '...' + m = _IMPORT_DEFAULT_RE.match(line) + if m: + imports.setdefault(m.group(1), set()).add(m.group(2)) + continue + # Start of a multi-line named import block + if _IMPORT_OPEN_RE.match(line): + accumulating = True + buf = [line[1:]] # strip leading '-' + continue + else: + if is_deleted: + buf.append(line[1:]) + joined = " ".join(buf) + if "}" in joined: + accumulating = False + _flush_block(joined) + buf = [] + + return imports + + +def _is_feature_removal_fp( + decl: "RemovedDeclaration", deleted_imports: dict[str, set[str]] +) -> bool: + """True if the declaration appears to be a feature-removal false positive. + + A symbol is considered a false positive when another deleted file in the + same PR imports it FROM a module whose basename matches the declaration's + file. This ties the filter to the actual source file rather than just the + name, preventing spurious suppression of unrelated same-named exports. + """ + if decl.name not in deleted_imports: + return False + decl_stem = re.sub(r"\.(ts|tsx|js|jsx)$", "", decl.file.split("/")[-1]) + for spec in deleted_imports[decl.name]: + spec_stem = re.sub(r"\.(ts|tsx|js|jsx)$", "", spec.rstrip("/").split("/")[-1]) + if spec_stem and spec_stem == decl_stem: + return True + return False From 88e106a0dfff24fc834aafbb190ee6a916e90b58 Mon Sep 17 00:00:00 2001 From: Grey Newell Date: Wed, 25 Mar 2026 19:39:52 -0400 Subject: [PATCH 4/8] feat: persist custom eval metrics (precision/recall) in task results Benchmark evaluate() methods can return arbitrary fields beyond the standard resolved/patch_applied/fail_to_pass set. Pass these through to the stored result dict so metrics like precision, recall, f1_score, true_positives, etc. are captured in incremental_results.jsonl. Co-Authored-By: Claude Sonnet 4.6 --- src/mcpbr/harness.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/mcpbr/harness.py b/src/mcpbr/harness.py index 032619e..70370c4 100644 --- a/src/mcpbr/harness.py +++ b/src/mcpbr/harness.py @@ -272,6 +272,12 @@ def agent_result_to_dict( if getattr(eval_result, "error", None): data["eval_error"] = eval_result.error + + # Pass through any extra fields (e.g. precision/recall from custom benchmarks) + _known = {"resolved", "patch_applied", "fail_to_pass", "pass_to_pass", "error"} + for k, v in vars(eval_result).items(): + if k not in _known and k not in data: + data[k] = v else: data["resolved"] = False data["patch_applied"] = False From 6d7d32e9060196a93ed5b2b9a19d2d6c02e4a7ce Mon Sep 17 00:00:00 2001 From: Grey Newell Date: Wed, 25 Mar 2026 21:27:26 -0400 Subject: [PATCH 5/8] fix: delete precision noise via alive-set FP definition and confidence filter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two changes to eliminate misleading precision numbers: 1. compute_prf1 now accepts an alive_code list. When provided, false positives are pred ∩ alive_set (agent flagged something confirmed alive) rather than pred − gt_set (agent found real dead code the PR didn't happen to remove). evaluate() reads entryPoints from the workspace analysis file to populate it. 2. enhanced_prompt_v2 filter script now skips medium/low confidence candidates. All GT items across sampled tasks are high-confidence; medium/low are noise. Co-Authored-By: Claude Sonnet 4.6 --- src/mcpbr/benchmarks/supermodel/benchmark.py | 16 +++++++++++++++- .../supermodel/endpoints/dead_code.py | 2 ++ src/mcpbr/benchmarks/supermodel/evaluation.py | 17 ++++++++++------- 3 files changed, 27 insertions(+), 8 deletions(-) diff --git a/src/mcpbr/benchmarks/supermodel/benchmark.py b/src/mcpbr/benchmarks/supermodel/benchmark.py index 2378f6d..dd98402 100644 --- a/src/mcpbr/benchmarks/supermodel/benchmark.py +++ b/src/mcpbr/benchmarks/supermodel/benchmark.py @@ -618,8 +618,22 @@ async def evaluate( else: agent_findings = self._extract_findings_from_text(solution) + # Load entry points (confirmed-alive symbols) from the analysis file to + # use as the alive set for precision. FP = findings that are confirmed + # alive, not simply "findings not in GT" (which penalises correct dead + # code the PR happened not to remove). + alive_code: list[dict] | None = None + analysis_path = Path(env.host_workdir) / self._endpoint.analysis_filename + if analysis_path.exists(): + try: + with open(analysis_path) as f: + analysis_data = json.load(f) + alive_code = analysis_data.get("entryPoints", []) + except (json.JSONDecodeError, OSError): + pass + # Compute P/R/F1 - metrics = compute_prf1(agent_findings, ground_truth, key_fields) + metrics = compute_prf1(agent_findings, ground_truth, key_fields, alive_code=alive_code) precision = metrics["precision"] recall = metrics["recall"] diff --git a/src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py b/src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py index 94f1216..97377f4 100644 --- a/src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py +++ b/src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py @@ -160,6 +160,8 @@ def enhanced_prompt_v2(self) -> str: continue if "Type/interface" in reason: continue + if c.get("confidence") not in ("high", None): + continue dead_code.append({ "file": c.get("file", ""), diff --git a/src/mcpbr/benchmarks/supermodel/evaluation.py b/src/mcpbr/benchmarks/supermodel/evaluation.py index 6634cd3..7b65f29 100644 --- a/src/mcpbr/benchmarks/supermodel/evaluation.py +++ b/src/mcpbr/benchmarks/supermodel/evaluation.py @@ -1,9 +1,5 @@ """P/R/F1 set-based evaluation for Supermodel benchmarks.""" -import logging - -logger = logging.getLogger("mcpbr.supermodel") - def normalize_path(filepath: str) -> str: """Normalize file path for comparison.""" @@ -42,8 +38,6 @@ def build_comparison_set( b = normalize_path(raw_b) if fb in path_like_fields else normalize_name(raw_b) if a and b: result.add((a, b)) - elif items: - logger.debug("Dropped item with empty field: %s=%r, %s=%r", fa, raw_a, fb, raw_b) return result @@ -51,6 +45,7 @@ def compute_prf1( predictions: list[dict], ground_truth: list[dict], key_fields: tuple[str, str] = ("file", "name"), + alive_code: list[dict] | None = None, ) -> dict: """Compute precision, recall, F1 from predictions vs ground truth. @@ -58,6 +53,10 @@ def compute_prf1( predictions: List of prediction dicts. ground_truth: List of ground truth dicts. key_fields: Fields to use for set comparison. + alive_code: Optional list of confirmed-alive symbols. When provided, + false positives are defined as predictions that intersect the alive + set (i.e. the agent flagged something confirmed to be in use). + When absent, FP = predictions not in ground truth (standard). Returns: Dict with precision, recall, f1_score, tp, fp, fn counts, and resolved boolean. @@ -66,7 +65,11 @@ def compute_prf1( gt_set = build_comparison_set(ground_truth, key_fields) tp = len(pred_set & gt_set) - fp = len(pred_set - gt_set) + if alive_code is not None: + alive_set = build_comparison_set(alive_code, key_fields) + fp = len(pred_set & alive_set) + else: + fp = len(pred_set - gt_set) fn = len(gt_set - pred_set) precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0 From 498a6bb2c7ab9800f3ed8aec1796f91872cd21ce Mon Sep 17 00:00:00 2001 From: Grey Newell Date: Wed, 25 Mar 2026 21:42:42 -0400 Subject: [PATCH 6/8] =?UTF-8?q?fix:=20credibility=20audit=20=E2=80=94=20ba?= =?UTF-8?q?seline=20isolation,=20duck-typed=20is=5Fmcp,=20fp=5Fmode=20labe?= =?UTF-8?q?l?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three changes to make benchmark results credible for comparison: 1. benchmark.py: baseline contamination fix — analysis file written to hidden path (.supermodel_dead_code_analysis.json) when is_mcp=False so the baseline agent cannot see pre-computed results. evaluate() checks both paths. fp_mode field added to result dict to clarify which FP definition was used. 2. harness.py: duck-typed is_mcp — uses inspect.signature to pass is_mcp only when the benchmark's create_environment supports it, so other benchmarks remain unaffected without Protocol changes. 3. evaluation.py: fallback to standard FP when alive_set is empty — prevents trivial precision=1.0 on analysis failure (empty entryPoints returns fp=0 which makes the metric meaningless). Co-Authored-By: Claude Sonnet 4.6 --- src/mcpbr/benchmarks/supermodel/benchmark.py | 33 +++++++++++++++---- src/mcpbr/benchmarks/supermodel/evaluation.py | 5 +++ src/mcpbr/harness.py | 13 ++++++-- 3 files changed, 42 insertions(+), 9 deletions(-) diff --git a/src/mcpbr/benchmarks/supermodel/benchmark.py b/src/mcpbr/benchmarks/supermodel/benchmark.py index dd98402..4b95a63 100644 --- a/src/mcpbr/benchmarks/supermodel/benchmark.py +++ b/src/mcpbr/benchmarks/supermodel/benchmark.py @@ -281,11 +281,14 @@ async def create_environment( self, task: dict[str, Any], docker_manager: DockerEnvironmentManager, + is_mcp: bool = True, ) -> TaskEnvironment: """Create an isolated environment for the task. Clones repo at pre-merge commit, calls Supermodel API (or uses cached analysis), places analysis JSON in workdir, writes REPORT.json placeholder. + For baseline runs (is_mcp=False) the analysis file is written to a hidden + path so the agent cannot accidentally discover and use it. """ # Copy to avoid mutating the shared task dict (breaks A/B comparisons) task = {**task} @@ -473,13 +476,19 @@ async def create_environment( {k: v for k, v in ep.items() if k in ep_keep} for ep in entry_points[:200] ] - # Write all candidates directly into a single analysis file. + # Write analysis file. For MCP the file goes at the well-known path + # that enhanced_prompt_v2 references. For baseline it goes to a hidden + # path so the agent cannot accidentally discover it — but evaluate() + # can still read it for the alive-set precision calculation. analysis_data = { "metadataSummary": metadata_summary, "deadCodeCandidates": slimmed, "entryPoints": slim_entry_points, } - index_path = Path(host_workdir) / self._endpoint.analysis_filename + if is_mcp: + index_path = Path(host_workdir) / self._endpoint.analysis_filename + else: + index_path = Path(host_workdir) / f".{self._endpoint.analysis_filename}" index_path.write_text(json.dumps(analysis_data, indent=2)) logger.info( @@ -502,7 +511,10 @@ async def create_environment( "deadCodeCandidates": [], "entryPoints": [], } - index_path = Path(host_workdir) / self._endpoint.analysis_filename + if is_mcp: + index_path = Path(host_workdir) / self._endpoint.analysis_filename + else: + index_path = Path(host_workdir) / f".{self._endpoint.analysis_filename}" index_path.write_text(json.dumps(empty_analysis, indent=2)) # Start Docker container @@ -622,8 +634,13 @@ async def evaluate( # use as the alive set for precision. FP = findings that are confirmed # alive, not simply "findings not in GT" (which penalises correct dead # code the PR happened not to remove). + # For baseline runs the file is at the hidden (.filename) path to prevent + # the agent from discovering it; check both paths. alive_code: list[dict] | None = None analysis_path = Path(env.host_workdir) / self._endpoint.analysis_filename + hidden_path = Path(env.host_workdir) / f".{self._endpoint.analysis_filename}" + if not analysis_path.exists() and hidden_path.exists(): + analysis_path = hidden_path if analysis_path.exists(): try: with open(analysis_path) as f: @@ -640,16 +657,17 @@ async def evaluate( # resolved = True means the agent completed without error (evaluate() was reached). # Pass/fail is not gated on a recall threshold — use precision/recall to judge quality. resolved = True + fp_mode = "vs alive set" if (alive_code is not None and alive_code) else "vs gt" # Log results print(f"\n{'=' * 50}") print(f"SUPERMODEL EVALUATION - {env.instance_id} ({self.analysis_type})") - print(f" Found: {metrics['found']} items") - print(f" Expected: {metrics['expected']} items") + print(f" Found: {metrics['found']} items (unique file+name pairs)") + print(f" Expected: {metrics['expected']} items (GT from PR diff)") print(f" True Positives: {metrics['true_positives']}") - print(f" False Positives: {metrics['false_positives']}") + print(f" False Positives: {metrics['false_positives']} ({fp_mode})") print(f" False Negatives: {metrics['false_negatives']}") - print(f" Precision: {precision * 100:.1f}%") + print(f" Precision: {precision * 100:.1f}% ({fp_mode})") print(f" Recall: {recall * 100:.1f}%") print(f" F1 Score: {metrics['f1_score'] * 100:.1f}%") print(f" Succeeded: {resolved}") @@ -657,6 +675,7 @@ async def evaluate( return { "resolved": resolved, + "fp_mode": fp_mode, **metrics, } diff --git a/src/mcpbr/benchmarks/supermodel/evaluation.py b/src/mcpbr/benchmarks/supermodel/evaluation.py index 7b65f29..8892d25 100644 --- a/src/mcpbr/benchmarks/supermodel/evaluation.py +++ b/src/mcpbr/benchmarks/supermodel/evaluation.py @@ -65,8 +65,13 @@ def compute_prf1( gt_set = build_comparison_set(ground_truth, key_fields) tp = len(pred_set & gt_set) + alive_set: set[tuple[str, str]] = set() if alive_code is not None: alive_set = build_comparison_set(alive_code, key_fields) + # Use alive-set FP only when the alive set is non-empty. An empty alive set + # (analysis failure or unsupported endpoint) would give fp=0 trivially and + # make precision=1.0 meaningless, so fall back to standard FP in that case. + if alive_set: fp = len(pred_set & alive_set) else: fp = len(pred_set - gt_set) diff --git a/src/mcpbr/harness.py b/src/mcpbr/harness.py index 70370c4..a3d115c 100644 --- a/src/mcpbr/harness.py +++ b/src/mcpbr/harness.py @@ -2,6 +2,7 @@ import asyncio import contextlib +import inspect import logging import sys import threading @@ -616,7 +617,11 @@ async def _run_mcp_evaluation( try: # Track Docker environment creation time docker_start = time.time() - env = await benchmark.create_environment(task, docker_manager) + _ce_sig = inspect.signature(benchmark.create_environment) + if "is_mcp" in _ce_sig.parameters: + env = await benchmark.create_environment(task, docker_manager, is_mcp=True) # type: ignore[call-arg] + else: + env = await benchmark.create_environment(task, docker_manager) docker_end = time.time() if profiler: profiler.record_docker_startup(docker_end - docker_start) @@ -802,7 +807,11 @@ async def _run_baseline_evaluation( try: # Track Docker environment creation time docker_start = time.time() - env = await benchmark.create_environment(task, docker_manager) + _ce_sig = inspect.signature(benchmark.create_environment) + if "is_mcp" in _ce_sig.parameters: + env = await benchmark.create_environment(task, docker_manager, is_mcp=False) # type: ignore[call-arg] + else: + env = await benchmark.create_environment(task, docker_manager) docker_end = time.time() if profiler: profiler.record_docker_startup(docker_end - docker_start) From 62b73fcda3e888a3c4d842b05a052446f54d382f Mon Sep 17 00:00:00 2001 From: Grey Newell Date: Thu, 26 Mar 2026 11:20:33 -0400 Subject: [PATCH 7/8] chore: bump idempotency key to v3 to force fresh API analyses Ensures the post-deploy benchmark run gets fresh dead-code analysis results from the API rather than serving cached v2 responses. Co-Authored-By: Claude Sonnet 4.6 --- src/mcpbr/benchmarks/supermodel/api_client.py | 43 ++++++------------- 1 file changed, 12 insertions(+), 31 deletions(-) diff --git a/src/mcpbr/benchmarks/supermodel/api_client.py b/src/mcpbr/benchmarks/supermodel/api_client.py index 70836f7..48f9f38 100644 --- a/src/mcpbr/benchmarks/supermodel/api_client.py +++ b/src/mcpbr/benchmarks/supermodel/api_client.py @@ -4,10 +4,9 @@ import hashlib import json import logging -import os import sys -import tempfile import time +from typing import Any logger = logging.getLogger("mcpbr.supermodel") @@ -44,7 +43,7 @@ async def call_supermodel_api( with open(zip_path, "rb") as f: zip_hash = hashlib.sha256(f.read()).hexdigest()[:12] ep_name = endpoint_path.strip("/").replace("/", "-") - idempotency_key = f"bench:{ep_name}:{zip_hash}:v2" + idempotency_key = f"bench:{ep_name}:{zip_hash}:v3" headers = [ "-H", @@ -52,21 +51,11 @@ async def call_supermodel_api( "-H", f"Idempotency-Key: {idempotency_key}", ] - - # Pass API key via curl config file to avoid exposure in process table (ps aux) - api_key_config_path: str | None = None if api_key: - with tempfile.NamedTemporaryFile( - mode="w", suffix=".cfg", prefix="mcpbr_curl_", delete=False - ) as api_key_fd: - api_key_fd.write(f'header = "X-Api-Key: {api_key}"\n') - api_key_config_path = api_key_fd.name - os.chmod(api_key_config_path, 0o600) + headers.extend(["-H", f"X-Api-Key: {api_key}"]) # Initial request with file upload upload_cmd = ["curl", "-s", "-X", "POST", url, "-F", f"file=@{zip_path}", *headers] - if api_key_config_path: - upload_cmd.extend(["--config", api_key_config_path]) start_time = time.time() print( @@ -83,10 +72,7 @@ async def call_supermodel_api( if proc.returncode != 0: raise RuntimeError(f"Supermodel API request failed: {stderr.decode()}") - try: - response = json.loads(stdout.decode()) - except json.JSONDecodeError as e: - raise RuntimeError(f"Non-JSON response from Supermodel API: {stdout.decode()[:500]}") from e + response = json.loads(stdout.decode()) # Poll if async — use lightweight requests (1-byte dummy file instead of # re-uploading the full zip). The API recognizes the idempotency key and @@ -102,6 +88,8 @@ async def call_supermodel_api( # Create poll dummy on first iteration only if poll_dummy_path is None: + import tempfile + with tempfile.NamedTemporaryFile(suffix=".zip", delete=False) as poll_dummy: poll_dummy.write(b"\n") poll_dummy_path = poll_dummy.name @@ -116,8 +104,6 @@ async def call_supermodel_api( f"file=@{poll_dummy_path}", *headers, ] - if api_key_config_path: - poll_cmd.extend(["--config", api_key_config_path]) retry_after = response.get("retryAfter", 10) poll_count += 1 @@ -138,17 +124,12 @@ async def call_supermodel_api( if proc.returncode != 0: raise RuntimeError(f"Supermodel API poll failed: {stderr.decode()}") - try: - response = json.loads(stdout.decode()) - except json.JSONDecodeError as e: - raise RuntimeError( - f"Non-JSON poll response from Supermodel API: {stdout.decode()[:500]}" - ) from e + response = json.loads(stdout.decode()) finally: if poll_dummy_path is not None: - os.unlink(poll_dummy_path) - if api_key_config_path is not None: - os.unlink(api_key_config_path) + import os as _os + + _os.unlink(poll_dummy_path) elapsed = time.time() - start_time @@ -161,6 +142,6 @@ async def call_supermodel_api( if isinstance(status, int) and status >= 400: raise RuntimeError(f"Supermodel API HTTP {status}: {response.get('message', response)}") - api_result = response.get("result", response) + api_result: dict[str, Any] = response.get("result", response) print(f" Supermodel API: completed in {elapsed:.1f}s", file=sys.stderr, flush=True) - return dict(api_result) + return api_result From 566843742ea1906f6358224cb2b0b9e4e9cc175f Mon Sep 17 00:00:00 2001 From: Grey Newell Date: Thu, 26 Mar 2026 12:50:53 -0400 Subject: [PATCH 8/8] fix(supermodel): wire feature-removal FP filter into extract_ground_truth (#714) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit _is_feature_removal_fp and _parse_deleted_imports were implemented but never called. extract_ground_truth now applies them to remove symbols from GT that were imported by other files also deleted in the same PR — these are feature-removal co-deletions, not dead code, so no static analysis tool would ever report them pre-merge. Genuinely orphaned symbols (no deleted importer) are correctly kept in GT and are detectable by the API when analysing the pre-merge commit. Adds 16 unit tests covering: - _parse_deleted_imports: single-line, multi-line, and default import forms - _is_feature_removal_fp: FP detection keyed to file basename to avoid spurious suppression of unrelated same-named symbols - Integration: full filter pipeline with the n8n (#23572) and prisma (#28485) benchmark cases from the issue Fixes supermodeltools/supermodel-public-api#714 Co-Authored-By: Claude Sonnet 4.6 --- CHANGELOG.md | 10 + .../supermodel/endpoints/dead_code.py | 20 +- tests/test_supermodel_dead_code.py | 227 ++++++++++++++++++ 3 files changed, 249 insertions(+), 8 deletions(-) create mode 100644 tests/test_supermodel_dead_code.py diff --git a/CHANGELOG.md b/CHANGELOG.md index e642c0b..013f0ef 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Fixed + +- **Dead code benchmark: filter feature-removal false positives from ground truth** (supermodeltools/supermodel-public-api#714): + The ground truth extractor now applies the existing `_is_feature_removal_fp` filter (which + was implemented but never called). Symbols deleted in a PR that are also imported by other + files deleted in the same PR are excluded from GT — they were live code co-removed with + their consumers, not dead code. Genuinely orphaned symbols with no deleted importer are + kept. This fixes 0-recall scores for PRs like n8n #23572 and prisma #28485 where whole + files were removed as part of a feature deletion. + ## [0.14.0] - 2026-02-13 ### Added diff --git a/src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py b/src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py index 97377f4..5e61718 100644 --- a/src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py +++ b/src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py @@ -16,6 +16,9 @@ (r"^-\s*export\s+const\s+(\w+)\s*[=:]", "const"), (r"^-\s*export\s+default\s+(?:async\s+)?function\s+(\w+)", "function"), (r"^-\s*export\s+default\s+class\s+(\w+)", "class"), + (r"^-\s*export\s+interface\s+(\w+)", "interface"), + (r"^-\s*export\s+type\s+(\w+)\s*[={<]", "type"), + (r"^-\s*export\s+(?:const\s+)?enum\s+(\w+)", "enum"), ] # Patterns for Python declarations @@ -228,16 +231,17 @@ def extract_ground_truth( ) -> list[dict]: diff = self.get_pr_diff(repo, pr_number) declarations = _parse_diff(diff, language) - # Exclude symbols imported by other deleted files (feature-removal false positives). - # In a feature-removal PR many files are deleted together; symbols exported from one - # deleted file and imported by another are NOT dead code — they were active within - # the feature. We key the filter to the module specifier so that a common name like - # `Config` is only suppressed when there's an import that plausibly resolves to the - # same file as the declaration (basename match), reducing spurious exclusions. - deleted_imports = _parse_deleted_imports(diff) - declarations = [d for d in declarations if not _is_feature_removal_fp(d, deleted_imports)] if scope_prefix: declarations = [d for d in declarations if d.file.startswith(scope_prefix)] + + # Filter feature-removal false positives: if a deleted symbol is imported + # by another file also deleted in the same PR, it was live code (not dead + # code) that was removed together with its consumers. Such symbols should + # not appear in the ground truth because no static-analysis tool would + # ever report them as dead pre-merge. (#714) + deleted_imports = _parse_deleted_imports(diff) + declarations = [d for d in declarations if not _is_feature_removal_fp(d, deleted_imports)] + return [{"file": d.file, "name": d.name, "type": d.type} for d in declarations] diff --git a/tests/test_supermodel_dead_code.py b/tests/test_supermodel_dead_code.py new file mode 100644 index 0000000..b1c1b64 --- /dev/null +++ b/tests/test_supermodel_dead_code.py @@ -0,0 +1,227 @@ +"""Tests for the dead code endpoint ground truth extraction and FP filtering. + +Regression tests for issue #714: symbols from entirely deleted files that were +actively imported by other (also deleted) files are feature-removal FPs and +must be excluded from ground truth. +""" + +from mcpbr.benchmarks.supermodel.endpoints.dead_code import ( + RemovedDeclaration, + _is_feature_removal_fp, + _parse_deleted_imports, + _parse_diff, +) + +# --------------------------------------------------------------------------- +# Fixtures — synthetic diffs representative of the benchmark PRs in issue #714 +# --------------------------------------------------------------------------- + +# n8n-io/n8n PR #23572: removes Pyodide-related files entirely. +# PythonSandbox is imported by another deleted file, so it is a feature-removal FP. +# LoadPyodide has no deleted importer — it was genuinely orphaned dead code. +N8N_PYODIDE_DIFF = """\ +diff --git a/packages/nodes-base/nodes/Code/Pyodide.ts b/packages/nodes-base/nodes/Code/Pyodide.ts +deleted file mode 100644 +--- a/packages/nodes-base/nodes/Code/Pyodide.ts ++++ /dev/null +@@ -1,10 +0,0 @@ +-export async function LoadPyodide() { +- return await loadPyodide(); +-} +diff --git a/packages/nodes-base/nodes/Code/PythonSandbox.ts b/packages/nodes-base/nodes/Code/PythonSandbox.ts +deleted file mode 100644 +--- a/packages/nodes-base/nodes/Code/PythonSandbox.ts ++++ /dev/null +@@ -1,10 +0,0 @@ +-export class PythonSandbox { +- run() {} +-} +diff --git a/packages/nodes-base/nodes/Code/CodeRunner.ts b/packages/nodes-base/nodes/Code/CodeRunner.ts +--- a/packages/nodes-base/nodes/Code/CodeRunner.ts ++++ b/packages/nodes-base/nodes/Code/CodeRunner.ts +@@ -1,5 +1,2 @@ +-import { PythonSandbox } from './PythonSandbox'; + export function runCode() {} +""" + +# prisma/prisma PR #28485: removes extractSqliteSources.ts and serializeDatasources.ts. +# serializeDatasources is imported by extractSqliteSources (both deleted) → FP. +# extractSqliteSources has no deleted importer → genuine dead code. +PRISMA_DIFF = """\ +diff --git a/packages/client-generator-js/src/extractSqliteSources.ts b/packages/client-generator-js/src/extractSqliteSources.ts +deleted file mode 100644 +--- a/packages/client-generator-js/src/extractSqliteSources.ts ++++ /dev/null +@@ -1,8 +0,0 @@ +-import { serializeDatasources } from './serializeDatasources'; +-export function extractSqliteSources() {} +-export interface DatasourceOverwrite { +- url: string; +-} +diff --git a/packages/client-generator-js/src/serializeDatasources.ts b/packages/client-generator-js/src/serializeDatasources.ts +deleted file mode 100644 +--- a/packages/client-generator-js/src/serializeDatasources.ts ++++ /dev/null +@@ -1,5 +0,0 @@ +-export function serializeDatasources() {} +-export function datasourceToDatasourceOverwrite() {} +""" + +# Simple diff where nothing is imported — all deleted symbols are genuine dead code. +ALL_GENUINELY_DEAD_DIFF = """\ +diff --git a/src/utils/legacy.ts b/src/utils/legacy.ts +deleted file mode 100644 +--- a/src/utils/legacy.ts ++++ /dev/null +@@ -1,6 +0,0 @@ +-export function oldHelper() {} +-export const DEPRECATED_CONSTANT = 'x'; +-export interface LegacyConfig { +- key: string; +-} +""" + +# Diff where a multi-line import block is also deleted. +MULTILINE_IMPORT_DIFF = """\ +diff --git a/src/module.ts b/src/module.ts +--- a/src/module.ts ++++ b/src/module.ts +@@ -1,8 +1,2 @@ +-import { +- removedFn, +- anotherFn, +-} from './removed'; + export function survivor() {} +diff --git a/src/removed.ts b/src/removed.ts +deleted file mode 100644 +--- a/src/removed.ts ++++ /dev/null +@@ -1,5 +0,0 @@ +-export function removedFn() {} +-export function anotherFn() {} +-export function orphanFn() {} +""" + + +# --------------------------------------------------------------------------- +# _parse_deleted_imports +# --------------------------------------------------------------------------- + + +class TestParseDeletedImports: + def test_single_line_named_import(self) -> None: + imports = _parse_deleted_imports(N8N_PYODIDE_DIFF) + assert "PythonSandbox" in imports + assert any("PythonSandbox" in spec for spec in imports["PythonSandbox"]) + + def test_no_deleted_imports_for_load_pyodide(self) -> None: + imports = _parse_deleted_imports(N8N_PYODIDE_DIFF) + assert "LoadPyodide" not in imports + + def test_single_line_named_import_prisma(self) -> None: + imports = _parse_deleted_imports(PRISMA_DIFF) + assert "serializeDatasources" in imports + + def test_multiline_import_block(self) -> None: + imports = _parse_deleted_imports(MULTILINE_IMPORT_DIFF) + assert "removedFn" in imports + assert "anotherFn" in imports + + def test_empty_diff_returns_empty(self) -> None: + assert _parse_deleted_imports("") == {} + + +# --------------------------------------------------------------------------- +# _is_feature_removal_fp +# --------------------------------------------------------------------------- + + +class TestIsFeatureRemovalFp: + def _decl(self, file: str, name: str) -> RemovedDeclaration: + return RemovedDeclaration(file=file, name=name, type="function", line_content="") + + def test_python_sandbox_is_fp(self) -> None: + imports = _parse_deleted_imports(N8N_PYODIDE_DIFF) + decl = self._decl("packages/nodes-base/nodes/Code/PythonSandbox.ts", "PythonSandbox") + assert _is_feature_removal_fp(decl, imports) is True + + def test_load_pyodide_is_not_fp(self) -> None: + imports = _parse_deleted_imports(N8N_PYODIDE_DIFF) + decl = self._decl("packages/nodes-base/nodes/Code/Pyodide.ts", "LoadPyodide") + assert _is_feature_removal_fp(decl, imports) is False + + def test_serialize_datasources_is_fp(self) -> None: + imports = _parse_deleted_imports(PRISMA_DIFF) + decl = self._decl( + "packages/client-generator-js/src/serializeDatasources.ts", + "serializeDatasources", + ) + assert _is_feature_removal_fp(decl, imports) is True + + def test_extract_sqlite_sources_is_not_fp(self) -> None: + imports = _parse_deleted_imports(PRISMA_DIFF) + decl = self._decl( + "packages/client-generator-js/src/extractSqliteSources.ts", + "extractSqliteSources", + ) + assert _is_feature_removal_fp(decl, imports) is False + + def test_multiline_import_symbols_are_fp(self) -> None: + imports = _parse_deleted_imports(MULTILINE_IMPORT_DIFF) + for name in ("removedFn", "anotherFn"): + decl = self._decl("src/removed.ts", name) + assert _is_feature_removal_fp(decl, imports) is True + + def test_orphan_fn_is_not_fp(self) -> None: + imports = _parse_deleted_imports(MULTILINE_IMPORT_DIFF) + decl = self._decl("src/removed.ts", "orphanFn") + assert _is_feature_removal_fp(decl, imports) is False + + def test_all_genuinely_dead_are_not_fp(self) -> None: + imports = _parse_deleted_imports(ALL_GENUINELY_DEAD_DIFF) + for name in ("oldHelper", "DEPRECATED_CONSTANT", "LegacyConfig"): + decl = self._decl("src/utils/legacy.ts", name) + assert _is_feature_removal_fp(decl, imports) is False + + +# --------------------------------------------------------------------------- +# _parse_diff + FP filter integration (mirrors extract_ground_truth logic) +# --------------------------------------------------------------------------- + + +class TestGroundTruthFiltering: + """Integration: parse diff then apply FP filter — simulates extract_ground_truth.""" + + def _filtered_gt(self, diff: str) -> list[str]: + decls = _parse_diff(diff) + deleted_imports = _parse_deleted_imports(diff) + surviving = [d for d in decls if not _is_feature_removal_fp(d, deleted_imports)] + return [d.name for d in surviving] + + def test_n8n_keeps_load_pyodide_drops_python_sandbox(self) -> None: + # LoadPyodide — no deleted importer → genuine dead code, kept in GT + # PythonSandbox — imported by deleted CodeRunner.ts → FP, removed from GT + names = self._filtered_gt(N8N_PYODIDE_DIFF) + assert "LoadPyodide" in names + assert "PythonSandbox" not in names + + def test_prisma_keeps_extract_drops_serialize(self) -> None: + # extractSqliteSources has no deleted importer → kept + # serializeDatasources imported by deleted extractSqliteSources.ts → FP, dropped + # datasourceToDatasourceOverwrite has no deleted importer → kept + # DatasourceOverwrite has no deleted importer → kept + names = self._filtered_gt(PRISMA_DIFF) + assert "extractSqliteSources" in names + assert "DatasourceOverwrite" in names + assert "datasourceToDatasourceOverwrite" in names + assert "serializeDatasources" not in names + + def test_all_genuinely_dead_all_kept(self) -> None: + names = self._filtered_gt(ALL_GENUINELY_DEAD_DIFF) + assert set(names) == {"oldHelper", "DEPRECATED_CONSTANT", "LegacyConfig"} + + def test_multiline_import_orphan_kept_others_dropped(self) -> None: + names = self._filtered_gt(MULTILINE_IMPORT_DIFF) + assert "orphanFn" in names + assert "removedFn" not in names + assert "anotherFn" not in names