diff --git a/benchmarks/terminal_bench/.gitignore b/benchmarks/terminal_bench/.gitignore new file mode 100644 index 00000000..7a60b85e --- /dev/null +++ b/benchmarks/terminal_bench/.gitignore @@ -0,0 +1,2 @@ +__pycache__/ +*.pyc diff --git a/benchmarks/terminal_bench/README.md b/benchmarks/terminal_bench/README.md new file mode 100644 index 00000000..d1da4de8 --- /dev/null +++ b/benchmarks/terminal_bench/README.md @@ -0,0 +1,52 @@ +# codegraff on Terminal-Bench 2.0 + +A [Harbor](https://tbench.ai) agent adapter that runs the `graff` CLI against +[Terminal-Bench 2.0](https://tbench.ai/leaderboard) tasks. + +`codegraff_agent.py` is a `BaseInstalledAgent` that, inside each task container: + +1. installs system deps and the released `graff` binary (`install.sh`, linux + x86_64 / aarch64); +2. injects the host's `~/.forge` provider credentials so `graff` is + authenticated; +3. runs `graff --prompt "<instruction>"` headlessly and tees output to the + trial's agent log. + +## Prerequisites + +- A container runtime exposing a Docker-compatible socket (Docker Desktop, + Podman machine, OrbStack, or Colima — running). +- `harbor` installed (`uv tool install harbor` or `pipx install harbor`). +- `graff` authenticated **on the host**: `graff provider login`. The adapter + copies `~/.forge/.credentials.json` + `~/.forge/.forge.toml` into the + container; the active model is whatever `[session]` in the toml selects. + +## Run + +```bash +# from the codegraff repo root +./benchmarks/terminal_bench/run.sh # 1 task, smoke test +./benchmarks/terminal_bench/run.sh -l 32 -n 8 # 32 tasks, 8 concurrent +./benchmarks/terminal_bench/run.sh -t hello-world # a single named task +``` + +Or invoke Harbor directly: + +```bash +PYTHONPATH="$(pwd)/benchmarks/terminal_bench" \ +harbor run \ + -d terminal-bench/terminal-bench-2 \ + --agent-import-path codegraff_agent:CodeGraff \ + -l 1 -n 1 +``` + +Results land in `./jobs/<job-name>/`. 
Per-trial `graff` output is at +`.../agent/codegraff.txt`; the verifier reward is at `.../verifier/reward.json`. + +## Notes / known gaps + +- Token accounting is not yet parsed from `graff`, so `populate_context_post_run` + leaves the counters at their defaults rather than reporting fabricated numbers. + The reward is unaffected (it comes from the verifier's filesystem checks). +- The container must reach the provider endpoint configured on the host. Tasks + marked network-isolated will not be able to call the model. diff --git a/benchmarks/terminal_bench/codegraff_agent.py b/benchmarks/terminal_bench/codegraff_agent.py new file mode 100644 index 00000000..0ea7cc47 --- /dev/null +++ b/benchmarks/terminal_bench/codegraff_agent.py @@ -0,0 +1,147 @@ +"""Terminal-Bench 2.0 (Harbor) agent adapter for codegraff's ``graff`` CLI. + +Harbor drives an agent inside a fresh task container. This adapter: + +1. ``install`` — installs system deps, then the released ``graff`` binary via the + official installer into ``~/.local/bin`` (supports linux x86_64 + aarch64). +2. ``run`` — injects the host's provider credentials so ``graff`` is + authenticated, then runs ``graff --prompt <instruction>`` headlessly and tees + the output to the trial's agent log. + +Run it from the repo root: + + PYTHONPATH="$(pwd)/benchmarks/terminal_bench" \ + harbor run -d terminal-bench/terminal-bench-2 \ + --agent-import-path codegraff_agent:CodeGraff \ + -t <task-name> -n 1 + +The model is whatever the host ``~/.forge/.forge.toml`` ``[session]`` selects; +authenticate once on the host with ``graff provider login`` before running. 
+""" + +import os +import re +import shlex +from pathlib import Path + +from harbor.agents.installed.base import BaseInstalledAgent, with_prompt_template +from harbor.environments.base import BaseEnvironment +from harbor.models.agent.context import AgentContext +from harbor.models.trial.paths import EnvironmentPaths + +CODEGRAFF_INSTALL_URL = ( + "https://github.com/justrach/codegraff/releases/latest/download/install.sh" +) +_HOST_FORGE_DIR = Path(os.path.expanduser("~/.forge")) + + +class CodeGraff(BaseInstalledAgent): + """Runs the ``graff`` CLI as a Terminal-Bench agent.""" + + # graff does not yet emit Harbor's ATIF trajectory format. + SUPPORTS_ATIF: bool = False + + @staticmethod + def name() -> str: + return "codegraff" + + def get_version_command(self) -> str | None: + return 'export PATH="$HOME/.local/bin:$PATH"; graff --version' + + def parse_version(self, stdout: str) -> str: + match = re.search(r"(\d+\.\d+\.\d+)", stdout) + return match.group(1) if match else stdout.strip() + + async def install(self, environment: BaseEnvironment) -> None: + # System dependencies (root): the installer needs curl + certs; git is + # commonly required by the tasks themselves. + await self.exec_as_root( + environment, + command=( + "if command -v apt-get >/dev/null 2>&1; then" + " apt-get update && apt-get install -y curl ca-certificates bash git;" + " elif command -v apk >/dev/null 2>&1; then" + " apk add --no-cache curl ca-certificates bash git libgcc;" + " elif command -v dnf >/dev/null 2>&1; then" + " dnf install -y curl ca-certificates bash git;" + " elif command -v yum >/dev/null 2>&1; then" + " yum install -y curl ca-certificates bash git;" + " else echo 'no known package manager; assuming curl present' >&2; fi" + ), + env={"DEBIAN_FRONTEND": "noninteractive"}, + timeout_sec=600, + ) + # Install graff (agent user) into ~/.local/bin via the official installer. 
+ await self.exec_as_agent( + environment, + command=( + "set -eu; " + f"curl -fsSL {shlex.quote(CODEGRAFF_INSTALL_URL)} | sh && " + "echo 'export PATH=\"$HOME/.local/bin:$PATH\"' >> ~/.bashrc && " + 'export PATH="$HOME/.local/bin:$PATH" && graff --version' + ), + timeout_sec=600, + ) + + async def _inject_credentials(self, environment: BaseEnvironment) -> None: + """Copy the host's graff provider credentials into the container. + + graff reads providers from ``~/.forge/.credentials.json`` and its active + model from ``~/.forge/.forge.toml``. There is no env-var key path, so we + upload the host files. Auto-update is disabled so graff does not try to + self-update on first run inside the container. + """ + creds = _HOST_FORGE_DIR / ".credentials.json" + toml = _HOST_FORGE_DIR / ".forge.toml" + if not creds.exists(): + raise RuntimeError( + f"No graff credentials at {creds}. " + "Authenticate on the host first: `graff provider login`." + ) + + await self.exec_as_agent(environment, command="mkdir -p $HOME/.forge") + + await environment.upload_file(creds, "/tmp/cg_credentials.json") + await self.exec_as_agent( + environment, + command=( + "cp /tmp/cg_credentials.json $HOME/.forge/.credentials.json && " + "chmod 600 $HOME/.forge/.credentials.json" + ), + ) + + if toml.exists(): + await environment.upload_file(toml, "/tmp/cg_forge.toml") + await self.exec_as_agent( + environment, + command=( + "cp /tmp/cg_forge.toml $HOME/.forge/.forge.toml && " + "sed -i 's/^auto_update = true/auto_update = false/' " + "$HOME/.forge/.forge.toml 2>/dev/null || true" + ), + ) + + @with_prompt_template + async def run( + self, instruction: str, environment: BaseEnvironment, context: AgentContext + ) -> None: + await self._inject_credentials(environment) + + escaped_instruction = shlex.quote(instruction) + log_path = (EnvironmentPaths.agent_dir / "codegraff.txt").as_posix() + + await self.exec_as_agent( + environment, + command=( + 'export PATH="$HOME/.local/bin:$PATH"; ' + f"graff 
--prompt {escaped_instruction} " + f"2>&1 None: + # graff does not yet expose per-run token accounting in a parseable form; + # leave the counters at their defaults rather than reporting fabricated + # numbers. The reward still comes from the verifier's filesystem checks. + return diff --git a/benchmarks/terminal_bench/run.sh b/benchmarks/terminal_bench/run.sh new file mode 100755 index 00000000..6bdc672d --- /dev/null +++ b/benchmarks/terminal_bench/run.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash +# Run codegraff's `graff` CLI on Terminal-Bench 2.0 via Harbor. +# Extra args are passed through to `harbor run` (e.g. -l, -n, -t, -k). +set -euo pipefail + +REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" +ADAPTER_DIR="$REPO_ROOT/benchmarks/terminal_bench" + +# Default: one task, one trial (smoke test). Override by passing -l/-n/-t. +# The defaults are installed with `set --` instead of an (often empty) array: +# expanding an empty array under `set -u` aborts on bash < 4.4 (e.g. macOS 3.2). +if [ "$#" -eq 0 ]; then + set -- -l 1 -n 1 +fi + +PYTHONPATH="$ADAPTER_DIR${PYTHONPATH:+:$PYTHONPATH}" \ +harbor run \ + -d terminal-bench/terminal-bench-2 \ + --agent-import-path codegraff_agent:CodeGraff \ + -o "$REPO_ROOT/jobs" \ + "$@" diff --git a/crates/forge_services/src/context_engine.rs b/crates/forge_services/src/context_engine.rs index f65ae6d4..7782daf1 100644 --- a/crates/forge_services/src/context_engine.rs +++ b/crates/forge_services/src/context_engine.rs @@ -4,7 +4,9 @@ use std::sync::Arc; use anyhow::{Context, Result}; use async_trait::async_trait; -use forge_app::{CommandInfra, EnvironmentInfra, FileReaderInfra, WalkerInfra, WorkspaceService}; +use forge_app::{ + CommandInfra, EnvironmentInfra, FileReaderInfra, KVStore, WalkerInfra, WorkspaceService, +}; use forge_domain::{ AuthCredential, AuthDetails, ProviderId, ProviderRepository, SyncProgress, UserId, WorkspaceId, WorkspaceIndexRepository, @@ -16,6 +18,55 @@ use tracing::info; use crate::fd::FileDiscovery; use crate::sync::{WorkspaceSyncEngine, canonicalize_path}; +/// Content-addressed cache key for 
semantic-search results. +/// +/// The key folds in the workspace's index version (`node_count` + +/// `last_updated`), so re-indexing the workspace changes the key and a stale +/// result is never served. This is the exact, content-addressed half of memory +/// guarding the fuzzy (semantic) recall it memoizes. +#[derive(Hash)] +struct SemSearchCacheKey<'a> { + namespace: &'static str, + workspace_id: &'a WorkspaceId, + query: &'a str, + use_case: &'a str, + limit: Option, + top_k: Option, + starts_with: Option<&'a str>, + ends_with: Option<&'a [String]>, + node_count: Option, + last_updated_ms: Option, +} + +impl<'a> SemSearchCacheKey<'a> { + fn new( + workspace: &'a forge_domain::WorkspaceInfo, + params: &'a forge_domain::SearchParams<'a>, + ) -> Self { + Self { + namespace: "codegraff.sem_search.v1", + workspace_id: &workspace.workspace_id, + query: params.query, + use_case: params.use_case.as_str(), + limit: params.limit, + top_k: params.top_k, + starts_with: params.starts_with.as_deref(), + ends_with: params.ends_with.as_deref(), + node_count: workspace.node_count, + last_updated_ms: workspace.last_updated.map(|t| t.timestamp_millis()), + } + } + + /// Deterministic content address for this key: every `DefaultHasher::new()` starts from the + /// same state, so one build hashes alike in any process. NOTE(review): not stable across Rust releases. + fn stable_key(&self) -> u64 { + use std::hash::{Hash, Hasher}; + let mut hasher = std::collections::hash_map::DefaultHasher::new(); + self.hash(&mut hasher); + hasher.finish() + } +} + /// Service for indexing workspaces and performing semantic search. /// /// `F` provides infrastructure capabilities (file I/O, environment, etc.) and @@ -222,6 +273,7 @@ impl< + EnvironmentInfra + CommandInfra + WalkerInfra + + KVStore + 'static, D: FileDiscovery + 'static, > WorkspaceService for ForgeWorkspaceService @@ -263,6 +315,18 @@ impl< .await? 
.ok_or(forge_domain::Error::WorkspaceNotFound)?; + // Memoize the semantic (fuzzy) recall under a content-addressed key whose + // hash folds in the workspace index version, so a re-indexed workspace + // busts the cache instead of serving a stale result. + let cache_key = SemSearchCacheKey::new(&workspace, &params).stable_key(); + if let Ok(Some(cached)) = self + .infra + .cache_get::>(&cache_key) + .await + { + return Ok(cached); + } + let search_query = forge_domain::CodeBase::new(user_id, workspace.workspace_id.clone(), params); @@ -272,6 +336,9 @@ impl< .await .context("Failed to search")?; + // Best-effort memoize; a cache failure must never break search. + let _ = self.infra.cache_set(&cache_key, &results).await; + Ok(results) } @@ -421,3 +488,59 @@ impl< } } } + +#[cfg(test)] +mod tests { + use forge_domain::WorkspaceId; + use pretty_assertions::{assert_eq, assert_ne}; + + use super::SemSearchCacheKey; + + fn key<'a>( + ws: &'a WorkspaceId, + query: &'a str, + node_count: Option, + last_updated_ms: Option, + ) -> SemSearchCacheKey<'a> { + SemSearchCacheKey { + namespace: "codegraff.sem_search.v1", + workspace_id: ws, + query, + use_case: "search", + limit: Some(10), + top_k: Some(5), + starts_with: None, + ends_with: None, + node_count, + last_updated_ms, + } + } + + #[test] + fn test_sem_cache_key_is_deterministic() { + let ws = WorkspaceId::generate(); + let actual = key(&ws, "find the parser", Some(100), Some(1_000)).stable_key(); + let expected = key(&ws, "find the parser", Some(100), Some(1_000)).stable_key(); + assert_eq!(actual, expected); + } + + #[test] + fn test_sem_cache_key_busts_when_index_reindexed() { + let ws = WorkspaceId::generate(); + let before = key(&ws, "find the parser", Some(100), Some(1_000)).stable_key(); + // Workspace re-indexed: last_updated advances -> different key. + let after_touch = key(&ws, "find the parser", Some(100), Some(2_000)).stable_key(); + // Workspace grew: node_count changes -> different key. 
+ let after_grow = key(&ws, "find the parser", Some(101), Some(1_000)).stable_key(); + assert_ne!(before, after_touch); + assert_ne!(before, after_grow); + } + + #[test] + fn test_sem_cache_key_differs_by_query() { + let ws = WorkspaceId::generate(); + let actual = key(&ws, "find the parser", Some(100), Some(1_000)).stable_key(); + let other = key(&ws, "find the lexer", Some(100), Some(1_000)).stable_key(); + assert_ne!(actual, other); + } +} diff --git a/crates/forge_services/src/tool_services/fs_patch.rs b/crates/forge_services/src/tool_services/fs_patch.rs index 31471bb4..80beb267 100644 --- a/crates/forge_services/src/tool_services/fs_patch.rs +++ b/crates/forge_services/src/tool_services/fs_patch.rs @@ -141,6 +141,10 @@ enum Error { NoMatch(String), #[error("Could not find swap target text: {0}")] NoSwapTarget(String), + #[error( + "Replacement is a no-op: the new text is identical to the matched text '{0}', so the patch would change nothing. Revise the patch (or read the file again if you expected a difference)." + )] + NoOpReplace(String), #[error( "Multiple matches found for search text: '{0}'. Either provide a more specific search pattern or use replace_all to replace all occurrences." )] @@ -219,6 +223,17 @@ fn apply_replacement( .get(patch.start..patch.end()) .ok_or_else(|| Error::RangeOutOfBounds(patch.start, patch.end(), haystack.len()))?; + // A replace whose new text equals the matched text produces no effect. + // Fail honestly instead of writing identical content and reporting + // success, so the caller does not consume a turn on a phantom change. 
+ if matches!( + operation, + PatchOperation::Replace | PatchOperation::ReplaceAll + ) && needle == normalized_content + { + return Err(Error::NoOpReplace(needle.to_string())); + } + // Apply the operation based on its type match operation { // Prepend content before the matched text @@ -569,6 +584,38 @@ mod tests { assert_eq!(actual, expected); } + #[test] + fn test_apply_replacement_rejects_noop_replace() { + let haystack = "alpha beta gamma".to_string(); + let range = super::Range::find_exact(&haystack, "beta"); + + let actual = super::apply_replacement(haystack, range, &PatchOperation::Replace, "beta"); + + assert!(matches!(actual, Err(super::Error::NoOpReplace(_)))); + } + + #[test] + fn test_apply_replacement_rejects_noop_replace_all() { + let haystack = "foo foo foo".to_string(); + let range = super::Range::find_exact(&haystack, "foo"); + + let actual = super::apply_replacement(haystack, range, &PatchOperation::ReplaceAll, "foo"); + + assert!(matches!(actual, Err(super::Error::NoOpReplace(_)))); + } + + #[test] + fn test_apply_replacement_allows_real_replace() { + let haystack = "alpha beta gamma".to_string(); + let range = super::Range::find_exact(&haystack, "beta"); + + let actual = + super::apply_replacement(haystack, range, &PatchOperation::Replace, "BETA").unwrap(); + let expected = "alpha BETA gamma"; + + assert_eq!(actual, expected); + } + #[test] fn test_range_from_search_match_multi_line() { let source = "line1\nline2\nline3\nline4";