justrach · lekt9 · May 30, 2026 · May 30, 2026 · May 30, 2026
diff --git a/benchmarks/terminal_bench/.gitignore b/benchmarks/terminal_bench/.gitignore
@@ -0,0 +1,2 @@
+__pycache__/
+*.pyc
diff --git a/benchmarks/terminal_bench/README.md b/benchmarks/terminal_bench/README.md
@@ -0,0 +1,52 @@
+# codegraff on Terminal-Bench 2.0
+
+A [Harbor](https://tbench.ai) agent adapter that runs the `graff` CLI against
+[Terminal-Bench 2.0](https://tbench.ai/leaderboard) tasks.
+
+`codegraff_agent.py` is a `BaseInstalledAgent` that, inside each task container:
+
+1. installs system deps and the released `graff` binary (`install.sh`, linux
+   x86_64 / aarch64);
+2. injects the host's `~/.forge` provider credentials so `graff` is
+   authenticated;
+3. runs `graff --prompt "<task instruction>"` headlessly and tees output to the
+   trial's agent log.
+
+## Prerequisites
+
+- A container runtime exposing a Docker-compatible socket (Docker Desktop,
+  Podman machine, OrbStack, or Colima — running).
+- `harbor` installed (`uv tool install harbor` or `pipx install harbor`).
+- `graff` authenticated **on the host**: `graff provider login`. The adapter
+  copies `~/.forge/.credentials.json` + `~/.forge/.forge.toml` into the
+  container; the active model is whatever `[session]` in the toml selects.
+
+## Run
+
+```bash
+# from the codegraff repo root
+./benchmarks/terminal_bench/run.sh                 # 1 task, smoke test
+./benchmarks/terminal_bench/run.sh -l 32 -n 8      # 32 tasks, 8 concurrent
+./benchmarks/terminal_bench/run.sh -t hello-world  # a single named task
+```
+
+Or invoke Harbor directly:
+
+```bash
+PYTHONPATH="$(pwd)/benchmarks/terminal_bench" \
+harbor run \
+  -d terminal-bench/terminal-bench-2 \
+  --agent-import-path codegraff_agent:CodeGraff \
+  -l 1 -n 1
+```
+
+Results land in `./jobs/<timestamp>/`. Per-trial `graff` output is at
+`.../agent/codegraff.txt`; the verifier reward is at `.../verifier/reward.json`.
+
+## Notes / known gaps
+
+- Token accounting is not yet parsed from `graff`, so `populate_context_post_run`
+  leaves the counters at their defaults rather than reporting fabricated numbers.
+  The reward is unaffected (it comes from the verifier's filesystem checks).
+- The container must reach the provider endpoint configured on the host. Tasks
+  marked network-isolated will not be able to call the model.
diff --git a/benchmarks/terminal_bench/codegraff_agent.py b/benchmarks/terminal_bench/codegraff_agent.py
@@ -0,0 +1,147 @@
+"""Terminal-Bench 2.0 (Harbor) agent adapter for codegraff's ``graff`` CLI.
+
+Harbor drives an agent inside a fresh task container. This adapter:
+
+1. ``install`` — installs system deps, then the released ``graff`` binary via the
+   official installer into ``~/.local/bin`` (supports linux x86_64 + aarch64).
+2. ``run`` — injects the host's provider credentials so ``graff`` is
+   authenticated, then runs ``graff --prompt <instruction>`` headlessly and tees
+   the output to the trial's agent log.
+
+Run it from the repo root:
+
+    PYTHONPATH="$(pwd)/benchmarks/terminal_bench" \
+    harbor run -d terminal-bench/terminal-bench-2 \
+      --agent-import-path codegraff_agent:CodeGraff \
+      -t <task> -n 1
+
+The model is whatever the host ``~/.forge/.forge.toml`` ``[session]`` selects;
+authenticate once on the host with ``graff provider login`` before running.
+"""
+
+import os
+import re
+import shlex
+from pathlib import Path
+
+from harbor.agents.installed.base import BaseInstalledAgent, with_prompt_template
+from harbor.environments.base import BaseEnvironment
+from harbor.models.agent.context import AgentContext
+from harbor.models.trial.paths import EnvironmentPaths
+
+# Installer script attached to the latest GitHub release of codegraff; it is
+# downloaded and piped into ``sh`` inside the task container by ``install``.
+CODEGRAFF_INSTALL_URL = (
+    "https://github.com/justrach/codegraff/releases/latest/download/install.sh"
+)
+# Host-side graff config directory (``~/.forge``), resolved once at import time.
+# ``_inject_credentials`` reads ``.credentials.json`` and ``.forge.toml`` from it.
+_HOST_FORGE_DIR = Path(os.path.expanduser("~/.forge"))
+
+
+class CodeGraff(BaseInstalledAgent):
+    """Runs the ``graff`` CLI as a Terminal-Bench agent."""
+
+    # graff does not yet emit Harbor's ATIF trajectory format.
+    SUPPORTS_ATIF: bool = False
+
+    @staticmethod
+    def name() -> str:
+        return "codegraff"
+
+    def get_version_command(self) -> str | None:
+        return 'export PATH="$HOME/.local/bin:$PATH"; graff --version'
+
+    def parse_version(self, stdout: str) -> str:
+        match = re.search(r"(\d+\.\d+\.\d+)", stdout)
+        return match.group(1) if match else stdout.strip()
+
+    async def install(self, environment: BaseEnvironment) -> None:
+        # System dependencies (root): the installer needs curl + certs; git is
+        # commonly required by the tasks themselves.
+        await self.exec_as_root(
+            environment,
+            command=(
+                "if command -v apt-get >/dev/null 2>&1; then"
+                "  apt-get update && apt-get install -y curl ca-certificates bash git;"
+                " elif command -v apk >/dev/null 2>&1; then"
+                "  apk add --no-cache curl ca-certificates bash git libgcc;"
+                " elif command -v dnf >/dev/null 2>&1; then"
+                "  dnf install -y curl ca-certificates bash git;"
+                " elif command -v yum >/dev/null 2>&1; then"
+                "  yum install -y curl ca-certificates bash git;"
+                " else echo 'no known package manager; assuming curl present' >&2; fi"
+            ),
+            env={"DEBIAN_FRONTEND": "noninteractive"},
+            timeout_sec=600,
+        )
+        # Install graff (agent user) into ~/.local/bin via the official installer.
+        await self.exec_as_agent(
+            environment,
+            command=(
+                "set -eu; "
+                f"curl -fsSL {shlex.quote(CODEGRAFF_INSTALL_URL)} | sh && "
+                "echo 'export PATH=\"$HOME/.local/bin:$PATH\"' >> ~/.bashrc && "
+                'export PATH="$HOME/.local/bin:$PATH" && graff --version'
+            ),
+            timeout_sec=600,
+        )
+
+    async def _inject_credentials(self, environment: BaseEnvironment) -> None:
+        """Copy the host's graff provider credentials into the container.
+
+        graff reads providers from ``~/.forge/.credentials.json`` and its active
+        model from ``~/.forge/.forge.toml``. There is no env-var key path, so we
+        upload the host files. Auto-update is disabled so graff does not try to
+        self-update on first run inside the container.
+        """
+        creds = _HOST_FORGE_DIR / ".credentials.json"
+        toml = _HOST_FORGE_DIR / ".forge.toml"
+        if not creds.exists():
+            raise RuntimeError(
+                f"No graff credentials at {creds}. "
+                "Authenticate on the host first: `graff provider login`."
+            )
+
+        await self.exec_as_agent(environment, command="mkdir -p $HOME/.forge")
+
+        await environment.upload_file(creds, "/tmp/cg_credentials.json")
+        await self.exec_as_agent(
+            environment,
+            command=(
+                "cp /tmp/cg_credentials.json $HOME/.forge/.credentials.json && "
+                "chmod 600 $HOME/.forge/.credentials.json"
+            ),
+        )
+
+        if toml.exists():
+            await environment.upload_file(toml, "/tmp/cg_forge.toml")
+            await self.exec_as_agent(
+                environment,
+                command=(
+                    "cp /tmp/cg_forge.toml $HOME/.forge/.forge.toml && "
+                    "sed -i 's/^auto_update = true/auto_update = false/' "
+                    "$HOME/.forge/.forge.toml 2>/dev/null || true"
+                ),
+            )
+
+    @with_prompt_template
+    async def run(
+        self, instruction: str, environment: BaseEnvironment, context: AgentContext
+    ) -> None:
+        await self._inject_credentials(environment)
+
+        escaped_instruction = shlex.quote(instruction)
+        log_path = (EnvironmentPaths.agent_dir / "codegraff.txt").as_posix()
+
+        await self.exec_as_agent(
+            environment,
+            command=(
+                'export PATH="$HOME/.local/bin:$PATH"; '
+                f"graff --prompt {escaped_instruction} "
+                f"2>&1 </dev/null | tee {shlex.quote(log_path)}"
+            ),
+            env={"CODEGRAFF_DISABLE_UPDATE": "1"},
+        )
+
+    def populate_context_post_run(self, context: AgentContext) -> None:
+        # graff does not yet expose per-run token accounting in a parseable form;
+        # leave the counters at their defaults rather than reporting fabricated
+        # numbers. The reward still comes from the verifier's filesystem checks.
+        return
diff --git a/benchmarks/terminal_bench/run.sh b/benchmarks/terminal_bench/run.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+# Run codegraff's `graff` CLI on Terminal-Bench 2.0 via Harbor.
+# Extra args are passed through to `harbor run` (e.g. -l, -n, -t, -k).
+set -euo pipefail
+
+REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
+ADAPTER_DIR="$REPO_ROOT/benchmarks/terminal_bench"
+
+# Default: one task, concurrency 1 (smoke test). Passing any argument replaces
+# the defaults entirely. The defaults are installed as the positional
+# parameters rather than kept in a possibly-empty array: under `set -u`, bash
+# older than 4.4 (e.g. the 3.2 shipped with macOS) aborts with "unbound
+# variable" when an empty array is expanded as "${ARR[@]}".
+if [ "$#" -eq 0 ]; then
+  set -- -l 1 -n 1
+fi
+
+# Prepend the adapter directory so `codegraff_agent` is importable by Harbor,
+# keeping any PYTHONPATH the caller already exported.
+PYTHONPATH="$ADAPTER_DIR${PYTHONPATH:+:$PYTHONPATH}" \
+harbor run \
+  -d terminal-bench/terminal-bench-2 \
+  --agent-import-path codegraff_agent:CodeGraff \
+  -o "$REPO_ROOT/jobs" \
+  "$@"
diff --git a/crates/forge_services/src/context_engine.rs b/crates/forge_services/src/context_engine.rs
@@ -4,7 +4,9 @@ use std::sync::Arc;
 
 use anyhow::{Context, Result};
 use async_trait::async_trait;
-use forge_app::{CommandInfra, EnvironmentInfra, FileReaderInfra, WalkerInfra, WorkspaceService};
+use forge_app::{
+    CommandInfra, EnvironmentInfra, FileReaderInfra, KVStore, WalkerInfra, WorkspaceService,
+};
 use forge_domain::{
     AuthCredential, AuthDetails, ProviderId, ProviderRepository, SyncProgress, UserId, WorkspaceId,
     WorkspaceIndexRepository,
@@ -16,6 +18,55 @@ use tracing::info;
 use crate::fd::FileDiscovery;
 use crate::sync::{WorkspaceSyncEngine, canonicalize_path};
 
+/// Content-addressed cache key for semantic-search results.
+///
+/// The key folds in the workspace's index version (`node_count` +
+/// `last_updated`), so a re-index that changes either value changes the key and
+/// the previously cached result is no longer looked up. This exact,
+/// content-addressed key is what guards the fuzzy (semantic) recall it
+/// memoizes.
+///
+/// NOTE(review): both version fields are `Option`s. If a workspace reports
+/// `None` for both, or a re-index leaves both unchanged, the key does not
+/// change and an older result can still be returned - confirm the backend
+/// always advances at least one of them.
+///
+/// The derived `Hash` feeds every field into the hasher, so changing the set of
+/// fields (or, presumably, their order) changes every key produced.
+#[derive(Hash)]
+struct SemSearchCacheKey<'a> {
+    /// Fixed tag separating these keys from other users of the same cache.
+    namespace: &'static str,
+    /// Workspace the search runs against.
+    workspace_id: &'a WorkspaceId,
+    /// Search query text.
+    query: &'a str,
+    /// String form of the search's use case.
+    use_case: &'a str,
+    /// Optional `limit` search parameter, hashed as given.
+    limit: Option<usize>,
+    /// Optional `top_k` search parameter, hashed as given.
+    top_k: Option<u32>,
+    /// Optional `starts_with` search parameter.
+    starts_with: Option<&'a str>,
+    /// Optional `ends_with` search parameter.
+    ends_with: Option<&'a [String]>,
+    /// Index version, part 1: the workspace's reported node count.
+    node_count: Option<u64>,
+    /// Index version, part 2: the workspace's last-update time in milliseconds.
+    last_updated_ms: Option<i64>,
+}
+
+impl<'a> SemSearchCacheKey<'a> {
+    /// Builds the key for `params` evaluated against `workspace`.
+    ///
+    /// Search parameters are borrowed as-is. The index version comes from the
+    /// workspace's `node_count` and from `last_updated`, which is converted
+    /// with `timestamp_millis()`.
+    fn new(
+        workspace: &'a forge_domain::WorkspaceInfo,
+        params: &'a forge_domain::SearchParams<'a>,
+    ) -> Self {
+        let last_updated_ms = workspace.last_updated.map(|stamp| stamp.timestamp_millis());
+        let starts_with = params.starts_with.as_deref();
+        let ends_with = params.ends_with.as_deref();
+
+        Self {
+            namespace: "codegraff.sem_search.v1",
+            workspace_id: &workspace.workspace_id,
+            query: params.query,
+            use_case: params.use_case.as_str(),
+            limit: params.limit,
+            top_k: params.top_k,
+            starts_with,
+            ends_with,
+            node_count: workspace.node_count,
+            last_updated_ms,
+        }
+    }
+
+    /// Reduces the key to the 64-bit value used as the cache key.
+    ///
+    /// All fields are hashed with a freshly constructed std `DefaultHasher`, so
+    /// equal fields always yield the same value within one build of the
+    /// program.
+    ///
+    /// NOTE(review): the std documentation leaves `DefaultHasher`'s algorithm
+    /// unspecified and subject to change between Rust releases, so entries
+    /// persisted by a disk-backed cache may stop matching after a toolchain
+    /// upgrade (a cache miss, not an error) - confirm that is acceptable.
+    fn stable_key(&self) -> u64 {
+        use std::collections::hash_map::DefaultHasher;
+        use std::hash::{Hash, Hasher};
+
+        let mut state = DefaultHasher::new();
+        Hash::hash(self, &mut state);
+        state.finish()
+    }
+}
+
 /// Service for indexing workspaces and performing semantic search.
 ///
 /// `F` provides infrastructure capabilities (file I/O, environment, etc.) and
@@ -222,6 +273,7 @@ impl<
         + EnvironmentInfra<Config = forge_config::ForgeConfig>
         + CommandInfra
         + WalkerInfra
+        + KVStore
         + 'static,
     D: FileDiscovery + 'static,
 > WorkspaceService for ForgeWorkspaceService<F, D>
@@ -263,6 +315,18 @@ impl<
             .await?
             .ok_or(forge_domain::Error::WorkspaceNotFound)?;
 
+        // Memoize the semantic (fuzzy) recall under a content-addressed key whose
+        // hash folds in the workspace index version, so a re-indexed workspace
+        // busts the cache instead of serving a stale result.
+        let cache_key = SemSearchCacheKey::new(&workspace, &params).stable_key();
+        if let Ok(Some(cached)) = self
+            .infra
+            .cache_get::<u64, Vec<forge_domain::Node>>(&cache_key)
+            .await
+        {
+            return Ok(cached);
+        }
+
         let search_query =
             forge_domain::CodeBase::new(user_id, workspace.workspace_id.clone(), params);
 
@@ -272,6 +336,9 @@ impl<
             .await
             .context("Failed to search")?;
 
+        // Best-effort memoize; a cache failure must never break search.
+        let _ = self.infra.cache_set(&cache_key, &results).await;
+
         Ok(results)
     }
 
@@ -421,3 +488,59 @@ impl<
         }
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use forge_domain::WorkspaceId;
+    use pretty_assertions::{assert_eq, assert_ne};
+
+    use super::SemSearchCacheKey;
+
+    /// Query shared by the fixtures that do not vary the query text.
+    const QUERY: &str = "find the parser";
+
+    /// Hashes a fixture key in which only the workspace, the query and the
+    /// index version (`node_count`, `last_updated_ms`) vary.
+    fn fixture_hash(
+        ws: &WorkspaceId,
+        query: &str,
+        node_count: Option<u64>,
+        last_updated_ms: Option<i64>,
+    ) -> u64 {
+        let fixture = SemSearchCacheKey {
+            namespace: "codegraff.sem_search.v1",
+            workspace_id: ws,
+            query,
+            use_case: "search",
+            limit: Some(10),
+            top_k: Some(5),
+            starts_with: None,
+            ends_with: None,
+            node_count,
+            last_updated_ms,
+        };
+        fixture.stable_key()
+    }
+
+    /// Identical inputs must hash to the same key.
+    #[test]
+    fn test_sem_cache_key_is_deterministic() {
+        let ws = WorkspaceId::generate();
+        let actual = fixture_hash(&ws, QUERY, Some(100), Some(1_000));
+        let expected = fixture_hash(&ws, QUERY, Some(100), Some(1_000));
+        assert_eq!(actual, expected);
+    }
+
+    /// A change to either half of the index version must change the key.
+    #[test]
+    fn test_sem_cache_key_busts_when_index_reindexed() {
+        let ws = WorkspaceId::generate();
+        let baseline = fixture_hash(&ws, QUERY, Some(100), Some(1_000));
+        // Workspace re-indexed: last_updated advances -> different key.
+        let touched = fixture_hash(&ws, QUERY, Some(100), Some(2_000));
+        // Workspace grew: node_count changes -> different key.
+        let grown = fixture_hash(&ws, QUERY, Some(101), Some(1_000));
+        assert_ne!(baseline, touched);
+        assert_ne!(baseline, grown);
+    }
+
+    /// Different query text must not share a key.
+    #[test]
+    fn test_sem_cache_key_differs_by_query() {
+        let ws = WorkspaceId::generate();
+        let actual = fixture_hash(&ws, QUERY, Some(100), Some(1_000));
+        let other = fixture_hash(&ws, "find the lexer", Some(100), Some(1_000));
+        assert_ne!(actual, other);
+    }
+}