From 9591e154c5910a63202feaf3b522cafc4fab6cd1 Mon Sep 17 00:00:00 2001 From: Grey Newell Date: Wed, 25 Mar 2026 10:21:39 -0400 Subject: [PATCH 1/3] fix(supermodel): add zip comment and source map excludes to bust stale caches - Stamp every zip with _ZIP_COMMENT (mcpbr-analysis-v3) so the hash is stable, deterministic, and distinct from earlier runs on old containers. Server-side cached errors from the 4Gi era are bypassed automatically. - Add *.map / *.js.map to BINARY_EXCLUDE_PATTERNS (source maps add no value for dead-code analysis and inflate zip size). - Delete stale local analysis caches (all pre-date the binary-filter commit). Co-Authored-By: Claude Sonnet 4.6 --- src/mcpbr/benchmarks/supermodel/git_utils.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/mcpbr/benchmarks/supermodel/git_utils.py b/src/mcpbr/benchmarks/supermodel/git_utils.py index 0cc4811..58a3675 100644 --- a/src/mcpbr/benchmarks/supermodel/git_utils.py +++ b/src/mcpbr/benchmarks/supermodel/git_utils.py @@ -34,6 +34,8 @@ "*.mp3", "*.wav", "*.ogg", + "*.map", + "*.js.map", ] @@ -187,8 +189,16 @@ async def _zip_repo_git_archive( return output_zip +_ZIP_COMMENT = b"mcpbr-analysis-v3" + + def _filter_zip_entries(zip_path: str, patterns: list[str]) -> None: - """Rewrite zip in-place, removing entries whose basename matches any glob pattern.""" + """Rewrite zip in-place, removing entries whose basename matches any glob pattern. + + Always rewrites (even if no entries are removed) to stamp the archive with + _ZIP_COMMENT so the resulting hash is stable across git archive versions and + acts as a cache-busting version marker. + """ tmp_path = zip_path + ".tmp" removed = 0 try: @@ -196,6 +206,7 @@ def _filter_zip_entries(zip_path: str, patterns: list[str]) -> None: zipfile.ZipFile(zip_path, "r") as zin, zipfile.ZipFile(tmp_path, "w", compression=zipfile.ZIP_DEFLATED) as zout, ): + zout.comment = _ZIP_COMMENT for item in zin.infolist(): basename = os.path.basename(item.filename) if any(fnmatch.fnmatch(basename, pat) for pat in patterns): From c579a18611d20c1dc038fdc9c31590678f98a814 Mon Sep 17 00:00:00 2001 From: Grey Newell Date: Wed, 25 Mar 2026 16:19:03 -0400 Subject: [PATCH 2/3] fix(supermodel): remove recall threshold gating and filter feature-removal GT false positives MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove `resolved_threshold` parameter from SupermodelBenchmark.__init__; `resolved` now reflects whether the agent completed without error (evaluate() was reached), not whether recall crossed an arbitrary threshold. Precision/recall are the quality signal. - Add `_parse_deleted_imports()` to dead_code.py: parses deleted `import { ... }` lines from a PR diff and filters out any removed exports whose names appear in those imports. Feature-removal PRs delete many files together; symbols exported by one deleted file and imported by another are NOT dead code — they were active within the feature. This eliminates false positives in GT for logto, n8n, prisma, podman, nextjs PRs. Co-Authored-By: Claude Sonnet 4.6 --- src/mcpbr/benchmarks/supermodel/benchmark.py | 11 +++---- .../supermodel/endpoints/dead_code.py | 30 +++++++++++++++++++ 2 files changed, 34 insertions(+), 7 deletions(-) diff --git a/src/mcpbr/benchmarks/supermodel/benchmark.py b/src/mcpbr/benchmarks/supermodel/benchmark.py index b8d2af5..2378f6d 100644 --- a/src/mcpbr/benchmarks/supermodel/benchmark.py +++ b/src/mcpbr/benchmarks/supermodel/benchmark.py @@ -50,7 +50,6 @@ def __init__( tasks: list[dict[str, Any]] | None = None, supermodel_api_base: str = "https://api.supermodel.dev", supermodel_api_key: str | None = None, - resolved_threshold: float = 0.8, ground_truth_dir: str | Path | None = None, supermodel_api_timeout: int = 900, **kwargs: Any, @@ -63,9 +62,6 @@ def __init__( tasks: List of task config dicts from YAML. supermodel_api_base: Base URL for Supermodel API. supermodel_api_key: API key (or set SUPERMODEL_API_KEY env var). - resolved_threshold: Recall threshold to consider a task 'resolved' (precision is - reported but not required — the API returns many valid dead-code - candidates beyond the GT set, so precision is not a fair gate). ground_truth_dir: Directory to cache ground truth JSON files. supermodel_api_timeout: Max seconds to wait for Supermodel API (default 900). **kwargs: Additional keyword arguments (ignored for forward compat). @@ -75,7 +71,6 @@ def __init__( self.api_base = supermodel_api_base self.api_key = supermodel_api_key or os.environ.get("SUPERMODEL_API_KEY") self.api_timeout = supermodel_api_timeout - self.resolved_threshold = resolved_threshold self.gt_dir = Path(ground_truth_dir) if ground_truth_dir else DEFAULT_GT_DIR self.gt_dir.mkdir(parents=True, exist_ok=True) @@ -628,7 +623,9 @@ async def evaluate( precision = metrics["precision"] recall = metrics["recall"] - resolved = recall >= self.resolved_threshold + # resolved = True means the agent completed without error (evaluate() was reached). + # Pass/fail is not gated on a recall threshold — use precision/recall to judge quality. + resolved = True # Log results print(f"\n{'=' * 50}") @@ -641,7 +638,7 @@ async def evaluate( print(f" Precision: {precision * 100:.1f}%") print(f" Recall: {recall * 100:.1f}%") print(f" F1 Score: {metrics['f1_score'] * 100:.1f}%") - print(f" Resolved: {resolved}") + print(f" Succeeded: {resolved}") print(f"{'=' * 50}\n") return { diff --git a/src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py b/src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py index 9ad76c9..8d22bdc 100644 --- a/src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py +++ b/src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py @@ -44,6 +44,11 @@ r"\.rs$", ] +# Pattern to extract named imports from deleted lines (TypeScript/JavaScript) +# Matches: -import { foo, bar as baz } from '...' +# Also: -import type { Foo } from '...' +_DELETED_NAMED_IMPORT_RE = re.compile(r"^-\s*import\s+(?:type\s+)?\{([^}]+)\}\s+from") + SKIP_NAMES = { "default", "module", @@ -218,6 +223,12 @@ def extract_ground_truth( ) -> list[dict]: diff = self.get_pr_diff(repo, pr_number) declarations = _parse_diff(diff, language) + # Exclude symbols imported by other deleted files (feature-removal false positives). + # In a feature-removal PR many files are deleted together; symbols exported from one + # deleted file and imported by another are NOT dead code — they were active within + # the feature. Filter them out so they don't inflate false-negative counts. + deleted_imports = _parse_deleted_imports(diff) + declarations = [d for d in declarations if d.name not in deleted_imports] if scope_prefix: declarations = [d for d in declarations if d.file.startswith(scope_prefix)] return [{"file": d.file, "name": d.name, "type": d.type} for d in declarations] @@ -263,3 +274,22 @@ def _parse_diff(diff_text: str, language: str = "typescript") -> list[RemovedDec break return declarations + + +def _parse_deleted_imports(diff_text: str) -> set[str]: + """Extract symbol names that appear in deleted import statements. + + Used to filter out false positives from feature-removal PRs: a symbol that + is imported by another file being deleted in the same PR is NOT dead code. + """ + imported: set[str] = set() + for line in diff_text.split("\n"): + if not line.startswith("-") or line.startswith("---"): + continue + m = _DELETED_NAMED_IMPORT_RE.match(line) + if m: + for part in m.group(1).split(","): + name = part.strip().split(" as ")[0].strip() + if name: + imported.add(name) + return imported From 918e3a16daa74e65886521a376914900bd156bcd Mon Sep 17 00:00:00 2001 From: Grey Newell Date: Wed, 25 Mar 2026 17:13:44 -0400 Subject: [PATCH 3/3] =?UTF-8?q?fix(supermodel):=20improve=20deleted-import?= =?UTF-8?q?=20parser=20=E2=80=94=20multi-line=20blocks=20+=20file-aware=20?= =?UTF-8?q?filter?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address CodeRabbit review comments on #7: 1. Multi-line imports: Replace single-regex approach with a state machine that handles multi-line `import { ... }` blocks (common in feature-removal PRs where imports span many lines). Also captures default imports. The state machine accumulates deleted lines until the closing `}` then extracts names. 2. File-aware filtering: `_parse_deleted_imports` now returns `dict[str, set[str]]` (symbol → module_specifiers) instead of `set[str]`. New `_is_feature_removal_fp()` does basename matching between the module specifier in the import and the declaration's file stem (e.g. `./userService` matches `userService.ts`). This prevents spurious exclusion of exports that share a name with an unrelated import from a different module. Co-Authored-By: Claude Sonnet 4.6 --- .../supermodel/endpoints/dead_code.py | 109 ++++++++++++++---- 1 file changed, 89 insertions(+), 20 deletions(-) diff --git a/src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py b/src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py index 8d22bdc..94f1216 100644 --- a/src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py +++ b/src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py @@ -44,10 +44,13 @@ r"\.rs$", ] -# Pattern to extract named imports from deleted lines (TypeScript/JavaScript) -# Matches: -import { foo, bar as baz } from '...' -# Also: -import type { Foo } from '...' -_DELETED_NAMED_IMPORT_RE = re.compile(r"^-\s*import\s+(?:type\s+)?\{([^}]+)\}\s+from") +# Patterns for state-machine import parsing of deleted lines +_IMPORT_OPEN_RE = re.compile(r"^-\s*import\s+(?:type\s+)?(?:\w+\s*,\s*)?\{") +_IMPORT_SINGLE_RE = re.compile( + r"^-\s*import\s+(?:type\s+)?(?:\w+\s*,\s*)?\{([^}]+)\}\s+from\s+['\"]([^'\"]+)['\"]" +) +_IMPORT_DEFAULT_RE = re.compile(r"^-\s*import\s+(?:type\s+)?(\w+)\s+from\s+['\"]([^'\"]+)['\"]") +_IMPORT_FROM_RE = re.compile(r"from\s+['\"]([^'\"]+)['\"]") SKIP_NAMES = { "default", @@ -226,9 +229,11 @@ def extract_ground_truth( # Exclude symbols imported by other deleted files (feature-removal false positives). # In a feature-removal PR many files are deleted together; symbols exported from one # deleted file and imported by another are NOT dead code — they were active within - # the feature. Filter them out so they don't inflate false-negative counts. + # the feature. We key the filter to the module specifier so that a common name like + # `Config` is only suppressed when there's an import that plausibly resolves to the + # same file as the declaration (basename match), reducing spurious exclusions. deleted_imports = _parse_deleted_imports(diff) - declarations = [d for d in declarations if d.name not in deleted_imports] + declarations = [d for d in declarations if not _is_feature_removal_fp(d, deleted_imports)] if scope_prefix: declarations = [d for d in declarations if d.file.startswith(scope_prefix)] return [{"file": d.file, "name": d.name, "type": d.type} for d in declarations] @@ -276,20 +281,84 @@ def _parse_diff(diff_text: str, language: str = "typescript") -> list[RemovedDec return declarations -def _parse_deleted_imports(diff_text: str) -> set[str]: - """Extract symbol names that appear in deleted import statements. +def _parse_deleted_imports(diff_text: str) -> dict[str, set[str]]: + """Parse deleted import statements, returning symbol → set[module_specifier]. - Used to filter out false positives from feature-removal PRs: a symbol that - is imported by another file being deleted in the same PR is NOT dead code. + Handles single-line and multi-line named import blocks plus default imports. + Capturing the module specifier (the `from '...'` part) lets callers do + file-aware filtering rather than just name-based filtering. """ - imported: set[str] = set() + # symbol_name -> set of module specifiers that imported it + imports: dict[str, set[str]] = {} + accumulating = False + buf: list[str] = [] + + def _add(name: str, spec: str) -> None: + name = name.strip().split(" as ")[0].strip() + if name: + imports.setdefault(name, set()).add(spec) + + def _flush_block(raw: str) -> None: + # Extract module specifier from the accumulated block + m_from = _IMPORT_FROM_RE.search(raw) + spec = m_from.group(1) if m_from else "" + brace_open = raw.find("{") + brace_close = raw.find("}") + if brace_open != -1 and brace_close != -1: + names_part = raw[brace_open + 1 : brace_close] + for part in names_part.split(","): + _add(part, spec) + for line in diff_text.split("\n"): - if not line.startswith("-") or line.startswith("---"): - continue - m = _DELETED_NAMED_IMPORT_RE.match(line) - if m: - for part in m.group(1).split(","): - name = part.strip().split(" as ")[0].strip() - if name: - imported.add(name) - return imported + is_deleted = line.startswith("-") and not line.startswith("---") + + if not accumulating: + if not is_deleted: + continue + # Single-line named import: -import { foo, bar } from '...' + m = _IMPORT_SINGLE_RE.match(line) + if m: + spec = m.group(2) + for part in m.group(1).split(","): + _add(part, spec) + continue + # Default import: -import Foo from '...' + m = _IMPORT_DEFAULT_RE.match(line) + if m: + imports.setdefault(m.group(1), set()).add(m.group(2)) + continue + # Start of a multi-line named import block + if _IMPORT_OPEN_RE.match(line): + accumulating = True + buf = [line[1:]] # strip leading '-' + continue + else: + if is_deleted: + buf.append(line[1:]) + joined = " ".join(buf) + if "}" in joined: + accumulating = False + _flush_block(joined) + buf = [] + + return imports + + +def _is_feature_removal_fp( + decl: "RemovedDeclaration", deleted_imports: dict[str, set[str]] +) -> bool: + """True if the declaration appears to be a feature-removal false positive. + + A symbol is considered a false positive when another deleted file in the + same PR imports it FROM a module whose basename matches the declaration's + file. This ties the filter to the actual source file rather than just the + name, preventing spurious suppression of unrelated same-named exports. + """ + if decl.name not in deleted_imports: + return False + decl_stem = re.sub(r"\.(ts|tsx|js|jsx)$", "", decl.file.split("/")[-1]) + for spec in deleted_imports[decl.name]: + spec_stem = re.sub(r"\.(ts|tsx|js|jsx)$", "", spec.rstrip("/").split("/")[-1]) + if spec_stem and spec_stem == decl_stem: + return True + return False