Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions benchmarks/terminal_bench/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
__pycache__/
*.pyc
52 changes: 52 additions & 0 deletions benchmarks/terminal_bench/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# codegraff on Terminal-Bench 2.0

A [Harbor](https://tbench.ai) agent adapter that runs the `graff` CLI against
[Terminal-Bench 2.0](https://tbench.ai/leaderboard) tasks.

`codegraff_agent.py` is a `BaseInstalledAgent` that, inside each task container:

1. installs system deps and the released `graff` binary (`install.sh`, Linux
x86_64 / aarch64);
2. injects the host's `~/.forge` provider credentials so `graff` is
authenticated;
3. runs `graff --prompt "<task instruction>"` headlessly and tees output to the
trial's agent log.

## Prerequisites

- A container runtime exposing a Docker-compatible socket (Docker Desktop,
Podman machine, OrbStack, or Colima — running).
- `harbor` installed (`uv tool install harbor` or `pipx install harbor`).
- `graff` authenticated **on the host**: `graff provider login`. The adapter
copies `~/.forge/.credentials.json` + `~/.forge/.forge.toml` into the
container; the active model is whatever `[session]` in the toml selects.

## Run

```bash
# from the codegraff repo root
./benchmarks/terminal_bench/run.sh # 1 task, smoke test
./benchmarks/terminal_bench/run.sh -l 32 -n 8 # 32 tasks, 8 concurrent
./benchmarks/terminal_bench/run.sh -t hello-world # a single named task
```

Or invoke Harbor directly:

```bash
PYTHONPATH="$(pwd)/benchmarks/terminal_bench" \
harbor run \
-d terminal-bench/terminal-bench-2 \
--agent-import-path codegraff_agent:CodeGraff \
-l 1 -n 1
```

Results land in `./jobs/<timestamp>/`. Per-trial `graff` output is at
`.../agent/codegraff.txt`; the verifier reward is at `.../verifier/reward.json`.

## Notes / known gaps

- Token accounting is not yet parsed from `graff`, so `populate_context_post_run`
leaves the counters at their defaults rather than reporting fabricated numbers.
The reward is unaffected (it comes from the verifier's filesystem checks).
- The container must reach the provider endpoint configured on the host. Tasks
marked network-isolated will not be able to call the model.
147 changes: 147 additions & 0 deletions benchmarks/terminal_bench/codegraff_agent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
"""Terminal-Bench 2.0 (Harbor) agent adapter for codegraff's ``graff`` CLI.

Harbor drives an agent inside a fresh task container. This adapter:

1. ``install`` — installs system deps, then the released ``graff`` binary via the
official installer into ``~/.local/bin`` (supports linux x86_64 + aarch64).
2. ``run`` — injects the host's provider credentials so ``graff`` is
authenticated, then runs ``graff --prompt <instruction>`` headlessly and tees
the output to the trial's agent log.

Run it from the repo root:

PYTHONPATH="$(pwd)/benchmarks/terminal_bench" \
harbor run -d terminal-bench/terminal-bench-2 \
--agent-import-path codegraff_agent:CodeGraff \
-t <task> -n 1

The model is whatever the host ``~/.forge/.forge.toml`` ``[session]`` selects;
authenticate once on the host with ``graff provider login`` before running.
"""

import os
import re
import shlex
from pathlib import Path

from harbor.agents.installed.base import BaseInstalledAgent, with_prompt_template
from harbor.environments.base import BaseEnvironment
from harbor.models.agent.context import AgentContext
from harbor.models.trial.paths import EnvironmentPaths

CODEGRAFF_INSTALL_URL = (
"https://github.com/justrach/codegraff/releases/latest/download/install.sh"
)
_HOST_FORGE_DIR = Path(os.path.expanduser("~/.forge"))


class CodeGraff(BaseInstalledAgent):
    """Runs the ``graff`` CLI as a Terminal-Bench agent.

    ``install`` provisions the task container (system packages plus the
    released ``graff`` binary); ``run`` uploads the host's graff credentials
    and then invokes ``graff --prompt`` with the task instruction.
    """

    # graff does not yet emit Harbor's ATIF trajectory format.
    SUPPORTS_ATIF: bool = False

    @staticmethod
    def name() -> str:
        """Return the agent identifier, ``"codegraff"``."""
        return "codegraff"

    def get_version_command(self) -> str | None:
        """Return the shell command that prints the installed graff version.

        ``~/.local/bin`` is prepended to ``PATH`` because that is where
        ``install`` places the binary.
        """
        return 'export PATH="$HOME/.local/bin:$PATH"; graff --version'

    def parse_version(self, stdout: str) -> str:
        """Extract the first ``X.Y.Z`` token from ``stdout``.

        Falls back to the whitespace-stripped output when no such token is
        present.
        """
        match = re.search(r"(\d+\.\d+\.\d+)", stdout)
        return match.group(1) if match else stdout.strip()

    async def install(self, environment: BaseEnvironment) -> None:
        """Install system dependencies and the ``graff`` binary in the container."""
        # System dependencies (root): the installer needs curl + certs; git is
        # commonly required by the tasks themselves. The first package manager
        # found (apt-get, apk, dnf, yum) is used; if none is found we only warn.
        await self.exec_as_root(
            environment,
            command=(
                "if command -v apt-get >/dev/null 2>&1; then"
                " apt-get update && apt-get install -y curl ca-certificates bash git;"
                " elif command -v apk >/dev/null 2>&1; then"
                " apk add --no-cache curl ca-certificates bash git libgcc;"
                " elif command -v dnf >/dev/null 2>&1; then"
                " dnf install -y curl ca-certificates bash git;"
                " elif command -v yum >/dev/null 2>&1; then"
                " yum install -y curl ca-certificates bash git;"
                " else echo 'no known package manager; assuming curl present' >&2; fi"
            ),
            env={"DEBIAN_FRONTEND": "noninteractive"},
            timeout_sec=600,
        )
        # Install graff (agent user) into ~/.local/bin via the official installer.
        # The trailing `graff --version` makes the command fail if the binary
        # did not end up on PATH.
        await self.exec_as_agent(
            environment,
            command=(
                "set -eu; "
                f"curl -fsSL {shlex.quote(CODEGRAFF_INSTALL_URL)} | sh && "
                "echo 'export PATH=\"$HOME/.local/bin:$PATH\"' >> ~/.bashrc && "
                'export PATH="$HOME/.local/bin:$PATH" && graff --version'
            ),
            timeout_sec=600,
        )

    async def _inject_credentials(self, environment: BaseEnvironment) -> None:
        """Copy the host's graff provider credentials into the container.

        graff reads providers from ``~/.forge/.credentials.json`` and its active
        model from ``~/.forge/.forge.toml``. There is no env-var key path, so we
        upload the host files. Auto-update is disabled so graff does not try to
        self-update on first run inside the container.

        Raises:
            RuntimeError: if the host has no ``.credentials.json``.
        """
        creds = _HOST_FORGE_DIR / ".credentials.json"
        toml = _HOST_FORGE_DIR / ".forge.toml"
        if not creds.exists():
            raise RuntimeError(
                f"No graff credentials at {creds}. "
                "Authenticate on the host first: `graff provider login`."
            )

        await self.exec_as_agent(environment, command="mkdir -p $HOME/.forge")

        # Upload to /tmp first, then copy as the agent user so the final file
        # lives in the agent's home with owner-only permissions.
        await environment.upload_file(creds, "/tmp/cg_credentials.json")
        await self.exec_as_agent(
            environment,
            command=(
                "cp /tmp/cg_credentials.json $HOME/.forge/.credentials.json && "
                "chmod 600 $HOME/.forge/.credentials.json"
            ),
        )

        # The config file is optional on the host; only copy it when present.
        if toml.exists():
            await environment.upload_file(toml, "/tmp/cg_forge.toml")
            await self.exec_as_agent(
                environment,
                command=(
                    "cp /tmp/cg_forge.toml $HOME/.forge/.forge.toml && "
                    # Only the sed is best-effort. The braces keep `|| true`
                    # from also masking a failed `cp`, which an ungrouped
                    # `cp && sed || true` would do.
                    "{ sed -i 's/^auto_update = true/auto_update = false/' "
                    "$HOME/.forge/.forge.toml 2>/dev/null || true; }"
                ),
            )

    @with_prompt_template
    async def run(
        self, instruction: str, environment: BaseEnvironment, context: AgentContext
    ) -> None:
        """Run ``graff --prompt <instruction>`` headlessly inside the container.

        Credentials are injected first. graff's combined stdout/stderr is
        tee'd to ``codegraff.txt`` under the trial's agent directory, and stdin
        is redirected from ``/dev/null`` so graff cannot wait for input.
        """
        await self._inject_credentials(environment)

        escaped_instruction = shlex.quote(instruction)
        log_path = (EnvironmentPaths.agent_dir / "codegraff.txt").as_posix()

        # NOTE(review): the command does not enable `pipefail`, so unless the
        # executing shell does, the pipeline's exit status is tee's rather than
        # graff's — confirm this is the intended behavior.
        await self.exec_as_agent(
            environment,
            command=(
                'export PATH="$HOME/.local/bin:$PATH"; '
                f"graff --prompt {escaped_instruction} "
                f"2>&1 </dev/null | tee {shlex.quote(log_path)}"
            ),
            env={"CODEGRAFF_DISABLE_UPDATE": "1"},
        )

    def populate_context_post_run(self, context: AgentContext) -> None:
        """Intentionally leave ``context`` untouched after the run."""
        # graff does not yet expose per-run token accounting in a parseable form;
        # leave the counters at their defaults rather than reporting fabricated
        # numbers. The reward still comes from the verifier's filesystem checks.
        return
21 changes: 21 additions & 0 deletions benchmarks/terminal_bench/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/usr/bin/env bash
# Run codegraff's `graff` CLI on Terminal-Bench 2.0 via Harbor.
# Extra args are passed through to `harbor run` (e.g. -l, -n, -t, -k).
set -euo pipefail

# Resolve the repo root from this script's location so it works from any cwd.
REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
ADAPTER_DIR="$REPO_ROOT/benchmarks/terminal_bench"

# Default: one task, concurrency 1 (smoke test). Override by passing -l/-n/-t.
# The defaults are installed as positional parameters instead of being kept in
# a possibly-empty array: under `set -u`, Bash releases before 4.4 (including
# the 3.2 shipped with macOS) abort with "unbound variable" when an empty
# array is expanded as "${arr[@]}".
if [ "$#" -eq 0 ]; then
  set -- -l 1 -n 1
fi

# Put the adapter directory first on PYTHONPATH so Harbor can import
# `codegraff_agent`, preserving any PYTHONPATH the caller already set.
PYTHONPATH="$ADAPTER_DIR${PYTHONPATH:+:$PYTHONPATH}" \
  harbor run \
    -d terminal-bench/terminal-bench-2 \
    --agent-import-path codegraff_agent:CodeGraff \
    -o "$REPO_ROOT/jobs" \
    "$@"
125 changes: 124 additions & 1 deletion crates/forge_services/src/context_engine.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@ use std::sync::Arc;

use anyhow::{Context, Result};
use async_trait::async_trait;
use forge_app::{CommandInfra, EnvironmentInfra, FileReaderInfra, WalkerInfra, WorkspaceService};
use forge_app::{
CommandInfra, EnvironmentInfra, FileReaderInfra, KVStore, WalkerInfra, WorkspaceService,
};
use forge_domain::{
AuthCredential, AuthDetails, ProviderId, ProviderRepository, SyncProgress, UserId, WorkspaceId,
WorkspaceIndexRepository,
Expand All @@ -16,6 +18,55 @@ use tracing::info;
use crate::fd::FileDiscovery;
use crate::sync::{WorkspaceSyncEngine, canonicalize_path};

/// Content-addressed cache key for semantic-search results.
///
/// The key folds in the workspace's index version (`node_count` +
/// `last_updated`), so a re-index that changes either value produces a
/// different key and previously cached results are no longer looked up. This
/// is the exact, content-addressed half of memory guarding the fuzzy
/// (semantic) recall it memoizes.
///
/// Field order matters: `#[derive(Hash)]` feeds the fields to the hasher in
/// declaration order, so reordering or inserting fields changes every key.
#[derive(Hash)]
struct SemSearchCacheKey<'a> {
    /// Fixed namespace/version tag mixed into the hash.
    namespace: &'static str,
    /// Workspace the search runs against.
    workspace_id: &'a WorkspaceId,
    /// Query text, taken from `SearchParams::query`.
    query: &'a str,
    /// Use case as a string, taken from `SearchParams::use_case`.
    use_case: &'a str,
    /// Taken from `SearchParams::limit`.
    limit: Option<usize>,
    /// Taken from `SearchParams::top_k`.
    top_k: Option<u32>,
    /// Taken from `SearchParams::starts_with`.
    starts_with: Option<&'a str>,
    /// Taken from `SearchParams::ends_with`.
    ends_with: Option<&'a [String]>,
    /// Index-version component: the workspace's `node_count`.
    node_count: Option<u64>,
    /// Index-version component: the workspace's `last_updated` in milliseconds.
    last_updated_ms: Option<i64>,
}

impl<'a> SemSearchCacheKey<'a> {
    /// Builds the key for searching `workspace` with `params`.
    ///
    /// Captures the search parameters together with the workspace's
    /// `node_count` and `last_updated` (converted to milliseconds), which act
    /// as the index version.
    fn new(
        workspace: &'a forge_domain::WorkspaceInfo,
        params: &'a forge_domain::SearchParams<'a>,
    ) -> Self {
        Self {
            namespace: "codegraff.sem_search.v1",
            workspace_id: &workspace.workspace_id,
            query: params.query,
            use_case: params.use_case.as_str(),
            limit: params.limit,
            top_k: params.top_k,
            starts_with: params.starts_with.as_deref(),
            ends_with: params.ends_with.as_deref(),
            node_count: workspace.node_count,
            last_updated_ms: workspace.last_updated.map(|t| t.timestamp_millis()),
        }
    }

    /// Deterministic content address for this key (stable across processes, so a
    /// disk-backed cache resolves the same query+index to the same entry).
    ///
    /// Uses a fixed FNV-1a hasher rather than
    /// `std::collections::hash_map::DefaultHasher`: the standard library leaves
    /// `DefaultHasher`'s algorithm unspecified and allows it to change between
    /// Rust releases, which would orphan persisted entries after a toolchain
    /// upgrade.
    ///
    /// NOTE(review): the byte stream still comes from the fields' `Hash`
    /// implementations, whose exact output std does not formally guarantee
    /// either — bump the `namespace` tag if key layout ever needs to change.
    fn stable_key(&self) -> u64 {
        use std::hash::{Hash, Hasher};
        let mut hasher = SemSearchKeyHasher::new();
        self.hash(&mut hasher);
        hasher.finish()
    }
}

/// 64-bit FNV-1a hasher with a fixed algorithm, used only for
/// [`SemSearchCacheKey::stable_key`].
///
/// Integer writes are overridden to emit little-endian bytes, and pointer-sized
/// integers are widened to 64 bits, so the result does not depend on the
/// target's endianness or pointer width.
struct SemSearchKeyHasher(u64);

impl SemSearchKeyHasher {
    /// Standard FNV-1a 64-bit offset basis.
    const OFFSET_BASIS: u64 = 0xcbf2_9ce4_8422_2325;
    /// Standard FNV-1a 64-bit prime.
    const PRIME: u64 = 0x0000_0100_0000_01b3;

    /// Creates a hasher in the FNV-1a initial state.
    fn new() -> Self {
        Self(Self::OFFSET_BASIS)
    }
}

impl std::hash::Hasher for SemSearchKeyHasher {
    fn finish(&self) -> u64 {
        self.0
    }

    fn write(&mut self, bytes: &[u8]) {
        // FNV-1a: xor the byte in, then multiply by the prime.
        self.0 = bytes.iter().fold(self.0, |state, &byte| {
            (state ^ u64::from(byte)).wrapping_mul(Self::PRIME)
        });
    }

    fn write_u16(&mut self, i: u16) {
        self.write(&i.to_le_bytes());
    }

    fn write_u32(&mut self, i: u32) {
        self.write(&i.to_le_bytes());
    }

    fn write_u64(&mut self, i: u64) {
        self.write(&i.to_le_bytes());
    }

    fn write_u128(&mut self, i: u128) {
        self.write(&i.to_le_bytes());
    }

    fn write_usize(&mut self, i: usize) {
        // Lossless widening: `usize` is at most 64 bits on supported targets.
        self.write_u64(i as u64);
    }

    fn write_i16(&mut self, i: i16) {
        self.write(&i.to_le_bytes());
    }

    fn write_i32(&mut self, i: i32) {
        self.write(&i.to_le_bytes());
    }

    fn write_i64(&mut self, i: i64) {
        self.write(&i.to_le_bytes());
    }

    fn write_i128(&mut self, i: i128) {
        self.write(&i.to_le_bytes());
    }

    fn write_isize(&mut self, i: isize) {
        // Sign-extending widening keeps 32- and 64-bit targets in agreement.
        self.write_i64(i as i64);
    }
}

/// Service for indexing workspaces and performing semantic search.
///
/// `F` provides infrastructure capabilities (file I/O, environment, etc.) and
Expand Down Expand Up @@ -222,6 +273,7 @@ impl<
+ EnvironmentInfra<Config = forge_config::ForgeConfig>
+ CommandInfra
+ WalkerInfra
+ KVStore
+ 'static,
D: FileDiscovery + 'static,
> WorkspaceService for ForgeWorkspaceService<F, D>
Expand Down Expand Up @@ -263,6 +315,18 @@ impl<
.await?
.ok_or(forge_domain::Error::WorkspaceNotFound)?;

// Memoize the semantic (fuzzy) recall under a content-addressed key whose
// hash folds in the workspace index version, so a re-indexed workspace
// busts the cache instead of serving a stale result.
let cache_key = SemSearchCacheKey::new(&workspace, &params).stable_key();
if let Ok(Some(cached)) = self
.infra
.cache_get::<u64, Vec<forge_domain::Node>>(&cache_key)
.await
{
return Ok(cached);
}

let search_query =
forge_domain::CodeBase::new(user_id, workspace.workspace_id.clone(), params);

Expand All @@ -272,6 +336,9 @@ impl<
.await
.context("Failed to search")?;

// Best-effort memoize; a cache failure must never break search.
let _ = self.infra.cache_set(&cache_key, &results).await;

Ok(results)
}

Expand Down Expand Up @@ -421,3 +488,59 @@ impl<
}
}
}

#[cfg(test)]
mod tests {
    use forge_domain::WorkspaceId;
    use pretty_assertions::{assert_eq, assert_ne};

    use super::SemSearchCacheKey;

    /// Builds a key whose search parameters are fixed, leaving only the
    /// workspace, query text and index-version fields to the caller.
    fn build_key<'a>(
        workspace_id: &'a WorkspaceId,
        query: &'a str,
        node_count: Option<u64>,
        last_updated_ms: Option<i64>,
    ) -> SemSearchCacheKey<'a> {
        let namespace = "codegraff.sem_search.v1";
        let use_case = "search";
        SemSearchCacheKey {
            namespace,
            workspace_id,
            query,
            use_case,
            limit: Some(10),
            top_k: Some(5),
            starts_with: None,
            ends_with: None,
            node_count,
            last_updated_ms,
        }
    }

    /// Identical inputs must hash to the same key.
    #[test]
    fn test_sem_cache_key_is_deterministic() {
        let workspace_id = WorkspaceId::generate();
        let first = build_key(&workspace_id, "find the parser", Some(100), Some(1_000));
        let second = build_key(&workspace_id, "find the parser", Some(100), Some(1_000));

        let actual = first.stable_key();
        let expected = second.stable_key();

        assert_eq!(actual, expected);
    }

    /// Changing either index-version field must change the key.
    #[test]
    fn test_sem_cache_key_busts_when_index_reindexed() {
        let workspace_id = WorkspaceId::generate();
        let hash_for = |node_count: u64, last_updated_ms: i64| {
            build_key(
                &workspace_id,
                "find the parser",
                Some(node_count),
                Some(last_updated_ms),
            )
            .stable_key()
        };

        let baseline = hash_for(100, 1_000);
        // Re-indexed workspace: `last_updated` moves forward.
        let touched = hash_for(100, 2_000);
        // Grown workspace: `node_count` increases.
        let grown = hash_for(101, 1_000);

        assert_ne!(baseline, touched);
        assert_ne!(baseline, grown);
    }

    /// Different query text must produce a different key.
    #[test]
    fn test_sem_cache_key_differs_by_query() {
        let workspace_id = WorkspaceId::generate();
        let hash_for =
            |query: &str| build_key(&workspace_id, query, Some(100), Some(1_000)).stable_key();

        let actual = hash_for("find the parser");
        let other = hash_for("find the lexer");

        assert_ne!(actual, other);
    }
}
Loading