Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 4 additions & 7 deletions src/mcpbr/benchmarks/supermodel/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,6 @@ def __init__(
tasks: list[dict[str, Any]] | None = None,
supermodel_api_base: str = "https://api.supermodel.dev",
supermodel_api_key: str | None = None,
resolved_threshold: float = 0.8,
ground_truth_dir: str | Path | None = None,
supermodel_api_timeout: int = 900,
**kwargs: Any,
Expand All @@ -63,9 +62,6 @@ def __init__(
tasks: List of task config dicts from YAML.
supermodel_api_base: Base URL for Supermodel API.
supermodel_api_key: API key (or set SUPERMODEL_API_KEY env var).
resolved_threshold: Recall threshold to consider a task 'resolved' (precision is
reported but not required — the API returns many valid dead-code
candidates beyond the GT set, so precision is not a fair gate).
ground_truth_dir: Directory to cache ground truth JSON files.
supermodel_api_timeout: Max seconds to wait for Supermodel API (default 900).
**kwargs: Additional keyword arguments (ignored for forward compat).
Expand All @@ -75,7 +71,6 @@ def __init__(
self.api_base = supermodel_api_base
self.api_key = supermodel_api_key or os.environ.get("SUPERMODEL_API_KEY")
self.api_timeout = supermodel_api_timeout
self.resolved_threshold = resolved_threshold
self.gt_dir = Path(ground_truth_dir) if ground_truth_dir else DEFAULT_GT_DIR
self.gt_dir.mkdir(parents=True, exist_ok=True)

Expand Down Expand Up @@ -628,7 +623,9 @@ async def evaluate(

precision = metrics["precision"]
recall = metrics["recall"]
resolved = recall >= self.resolved_threshold
# resolved = True means the agent completed without error (evaluate() was reached).
# Pass/fail is not gated on a recall threshold — use precision/recall to judge quality.
resolved = True
Comment on lines +626 to +628
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Don't repurpose resolved into "evaluate() was reached".

With Line 628 set unconditionally, a task that writes REPORT.json but gets 0% recall still reports resolved=True. src/mcpbr/harness.py still uses resolved for PASS/FAIL logging, resolution rates, comparison deltas, and notifications, so downstream summaries can now look perfect on completely wrong answers. Please emit a separate succeeded flag here, or migrate those consumers in the same PR.

Also applies to: 641-646

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@src/mcpbr/benchmarks/supermodel/benchmark.py` around lines 626 - 628, The
code incorrectly repurposes the existing resolved flag to mean "evaluate() was
reached"; instead, leave resolved representing task completion without error and
add a new boolean succeeded that is set when evaluate() is actually reached
(and/or the evaluation completed successfully) — e.g., after evaluate() returns
and REPORT.json is produced set succeeded = True; do NOT set resolved = True
unconditionally. Apply the same change to the other related blocks around the
evaluate() call (the section referenced at lines ~641-646) so downstream
consumers can still use resolved for PASS/FAIL and use succeeded when emitting
evaluation-reached/resolution metrics or notifications.


# Log results
print(f"\n{'=' * 50}")
Expand All @@ -641,7 +638,7 @@ async def evaluate(
print(f" Precision: {precision * 100:.1f}%")
print(f" Recall: {recall * 100:.1f}%")
print(f" F1 Score: {metrics['f1_score'] * 100:.1f}%")
print(f" Resolved: {resolved}")
print(f" Succeeded: {resolved}")
print(f"{'=' * 50}\n")

return {
Expand Down
99 changes: 99 additions & 0 deletions src/mcpbr/benchmarks/supermodel/endpoints/dead_code.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,14 @@
r"\.rs$",
]

# Patterns for state-machine import parsing of deleted lines (lines prefixed
# with '-' in a unified diff). Each pattern tolerates an optional `type`
# keyword and an optional leading default import (`import Foo, { ... }`).
# Opening line of a (possibly multi-line) named import: `-import {`
_IMPORT_OPEN_RE = re.compile(r"^-\s*import\s+(?:type\s+)?(?:\w+\s*,\s*)?\{")
# Complete single-line named import; group(1) = names inside the braces,
# group(2) = module specifier from the `from '...'` clause.
_IMPORT_SINGLE_RE = re.compile(
    r"^-\s*import\s+(?:type\s+)?(?:\w+\s*,\s*)?\{([^}]+)\}\s+from\s+['\"]([^'\"]+)['\"]"
)
# Default import: `-import Foo from '...'`; group(1) = name, group(2) = specifier.
_IMPORT_DEFAULT_RE = re.compile(r"^-\s*import\s+(?:type\s+)?(\w+)\s+from\s+['\"]([^'\"]+)['\"]")
# Pulls the `from '...'` module specifier out of an accumulated multi-line block.
_IMPORT_FROM_RE = re.compile(r"from\s+['\"]([^'\"]+)['\"]")

SKIP_NAMES = {
"default",
"module",
Expand Down Expand Up @@ -218,6 +226,14 @@ def extract_ground_truth(
) -> list[dict]:
diff = self.get_pr_diff(repo, pr_number)
declarations = _parse_diff(diff, language)
# Exclude symbols imported by other deleted files (feature-removal false positives).
# In a feature-removal PR many files are deleted together; symbols exported from one
# deleted file and imported by another are NOT dead code — they were active within
# the feature. We key the filter to the module specifier so that a common name like
# `Config` is only suppressed when there's an import that plausibly resolves to the
# same file as the declaration (basename match), reducing spurious exclusions.
deleted_imports = _parse_deleted_imports(diff)
declarations = [d for d in declarations if not _is_feature_removal_fp(d, deleted_imports)]
if scope_prefix:
declarations = [d for d in declarations if d.file.startswith(scope_prefix)]
return [{"file": d.file, "name": d.name, "type": d.type} for d in declarations]
Expand Down Expand Up @@ -263,3 +279,86 @@ def _parse_diff(diff_text: str, language: str = "typescript") -> list[RemovedDec
break

return declarations


def _parse_deleted_imports(diff_text: str) -> dict[str, set[str]]:
    """Parse deleted import statements, returning symbol → set[module_specifier].

    Handles single-line and multi-line named import blocks plus default imports.
    Capturing the module specifier (the `from '...'` part) lets callers do
    file-aware filtering rather than just name-based filtering.

    A multi-line import block is only recorded when every one of its lines is
    deleted: a context/added line interrupting the block means the import is
    still live, so the partial buffer is discarded (previously the state
    machine kept accumulating and could merge an unrelated later deleted line
    containing ``}`` into the block, attributing bogus symbol names).
    """
    # symbol_name -> set of module specifiers that imported it
    imports: dict[str, set[str]] = {}
    accumulating = False
    buf: list[str] = []

    def _add(name: str, spec: str) -> None:
        # Drop any `as` alias and surrounding whitespace; ignore empty fragments.
        name = name.strip().split(" as ")[0].strip()
        if name:
            imports.setdefault(name, set()).add(spec)

    def _flush_block(raw: str) -> None:
        # Extract the module specifier, then the comma-separated names
        # between the first brace pair of the accumulated block.
        m_from = _IMPORT_FROM_RE.search(raw)
        spec = m_from.group(1) if m_from else ""
        brace_open = raw.find("{")
        brace_close = raw.find("}")
        if brace_open != -1 and brace_close != -1:
            for part in raw[brace_open + 1 : brace_close].split(","):
                _add(part, spec)

    for line in diff_text.split("\n"):
        # A deleted diff line starts with '-' but not '---' (file header).
        is_deleted = line.startswith("-") and not line.startswith("---")

        if accumulating:
            if not is_deleted:
                # Import block only partially deleted — it is still live.
                # Abort instead of merging unrelated later deletions.
                accumulating = False
                buf = []
                continue
            buf.append(line[1:])
            joined = " ".join(buf)
            if "}" in joined:
                accumulating = False
                _flush_block(joined)
                buf = []
            continue

        if not is_deleted:
            continue
        # Single-line named import: -import { foo, bar } from '...'
        m = _IMPORT_SINGLE_RE.match(line)
        if m:
            spec = m.group(2)
            for part in m.group(1).split(","):
                _add(part, spec)
            continue
        # Default import: -import Foo from '...'
        m = _IMPORT_DEFAULT_RE.match(line)
        if m:
            _add(m.group(1), m.group(2))
            continue
        # Start of a multi-line named import block
        if _IMPORT_OPEN_RE.match(line):
            accumulating = True
            buf = [line[1:]]  # strip leading '-'

    return imports


def _is_feature_removal_fp(
decl: "RemovedDeclaration", deleted_imports: dict[str, set[str]]
) -> bool:
"""True if the declaration appears to be a feature-removal false positive.

A symbol is considered a false positive when another deleted file in the
same PR imports it FROM a module whose basename matches the declaration's
file. This ties the filter to the actual source file rather than just the
name, preventing spurious suppression of unrelated same-named exports.
"""
if decl.name not in deleted_imports:
return False
decl_stem = re.sub(r"\.(ts|tsx|js|jsx)$", "", decl.file.split("/")[-1])
for spec in deleted_imports[decl.name]:
spec_stem = re.sub(r"\.(ts|tsx|js|jsx)$", "", spec.rstrip("/").split("/")[-1])
if spec_stem and spec_stem == decl_stem:
return True
return False
13 changes: 12 additions & 1 deletion src/mcpbr/benchmarks/supermodel/git_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@
"*.mp3",
"*.wav",
"*.ogg",
"*.map",
"*.js.map",
]


Expand Down Expand Up @@ -187,15 +189,24 @@ async def _zip_repo_git_archive(
return output_zip


_ZIP_COMMENT = b"mcpbr-analysis-v3"


def _filter_zip_entries(zip_path: str, patterns: list[str]) -> None:
"""Rewrite zip in-place, removing entries whose basename matches any glob pattern."""
"""Rewrite zip in-place, removing entries whose basename matches any glob pattern.

Always rewrites (even if no entries are removed) to stamp the archive with
_ZIP_COMMENT so the resulting hash is stable across git archive versions and
acts as a cache-busting version marker.
"""
tmp_path = zip_path + ".tmp"
removed = 0
try:
with (
zipfile.ZipFile(zip_path, "r") as zin,
zipfile.ZipFile(tmp_path, "w", compression=zipfile.ZIP_DEFLATED) as zout,
):
zout.comment = _ZIP_COMMENT
for item in zin.infolist():
basename = os.path.basename(item.filename)
if any(fnmatch.fnmatch(basename, pat) for pat in patterns):
Expand Down
Loading