diff --git a/.github/workflows/remote-evals.yml b/.github/workflows/remote-evals.yml
new file mode 100644
index 0000000000..f16f0e6b87
--- /dev/null
+++ b/.github/workflows/remote-evals.yml
@@ -0,0 +1,212 @@
+name: Remote Evaluations (SDK)
+
+# This workflow runs Codebuff evaluations using the public SDK exclusively.
+# It creates a containerized backend environment and runs evaluations via CodebuffClient.
+# Trigger: Add [remote-eval] to commit message or use workflow_dispatch
+# Matrix mode: Add [remote-eval-all] to commit message for parallel evaluations
+
+on:
+  push:
+    branches: ['**']
+  workflow_dispatch:
+    inputs:
+      eval_file:
+        description: 'Eval file to run (e.g., eval-codebuff.json)'
+        required: false
+        default: 'eval-codebuff.json'
+        type: string
+      commit_index:
+        description: 'Commit index to evaluate (0-based)'
+        required: false
+        default: '0'
+        type: string
+      mode:
+        description: 'Auth mode (seed or bypass)'
+        required: false
+        default: 'bypass'
+        type: choice
+        options:
+          - 'bypass'
+          - 'seed'
+
+jobs:
+  remote-evals:
+    runs-on: ubuntu-latest
+    timeout-minutes: 60
+    
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      # Gate for the rest of the job: every later step checks the
+      # `should_run_evals` output written here. It is 'true' for manual
+      # (workflow_dispatch) runs and for pushes whose head commit message
+      # contains "[remote-eval]" (matched case-insensitively via nocasematch).
+      - name: Check commit message
+        id: check_commit
+        env:
+          # Passed through the environment so the commit text stays out of the
+          # script source and is only ever read as a quoted shell variable.
+          COMMIT_MESSAGE: ${{ github.event.head_commit.message }}
+        run: |
+          shopt -s nocasematch
+          if [[ "$COMMIT_MESSAGE" == *"[remote-eval]"* ]] || [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
+            echo "should_run_evals=true" >> $GITHUB_OUTPUT
+            echo "Will run remote evaluations"
+          else
+            echo "should_run_evals=false" >> $GITHUB_OUTPUT
+            echo "Skipping remote evaluations (add [remote-eval] to commit message to trigger)"
+          fi
+
+      - name: Set up Bun
+        if: steps.check_commit.outputs.should_run_evals == 'true'
+        uses: oven-sh/setup-bun@v2
+        with:
+          bun-version: '1.2.12'
+
+      - name: Install dependencies
+        if: steps.check_commit.outputs.should_run_evals == 'true'
+        run: bun install --frozen-lockfile
+
+      - name: Validate environment for SDK evaluation
+        if: steps.check_commit.outputs.should_run_evals == 'true'
+        run: |
+          echo "🔍 Validating SDK evaluation environment..."
+          echo "  Checking for required files..."
+          test -f evals/scripts/run-remote-parameterized.sh || { echo "❌ Missing run-remote-parameterized.sh"; exit 1; }
+          test -f evals/git-evals/run-single-eval.ts || { echo "❌ Missing run-single-eval.ts"; exit 1; }
+          test -f evals/docker-compose.evals.yml || { echo "❌ Missing docker-compose.evals.yml"; exit 1; }
+          echo "  Checking SDK package..."
+          bun --version
+          echo "✅ Environment validation passed"
+
+      - name: Run remote evaluation
+        if: steps.check_commit.outputs.should_run_evals == 'true'
+        env:
+          EVAL_FILE: ${{ inputs.eval_file || 'eval-codebuff.json' }}
+          COMMIT_INDEX: ${{ inputs.commit_index || '0' }}
+          MODE: ${{ inputs.mode || 'bypass' }}
+          CODEBUFF_WEBSOCKET_URL: "ws://127.0.0.1:4242/ws"
+          CODEBUFF_SKIP_BINARY_CHECK: "1"
+        run: |
+          echo "🚀 Remote Evaluation Starting (SDK Mode)"
+          echo "📋 GitHub Actions Environment:"
+          echo "  Runner: ${{ runner.os }}"
+          echo "  SHA: ${{ github.sha }}"
+          echo "  Ref: ${{ github.ref }}"
+          echo "  Event: ${{ github.event_name }}"
+          echo "  Eval File: $EVAL_FILE"
+          echo "  Commit Index: $COMMIT_INDEX"
+          echo "  Mode: $MODE"
+          echo "🐳 Docker Info:"
+          docker --version
+          docker compose version
+          echo "💾 Disk Space:"
+          df -h
+          echo "🔧 Starting SDK-based evaluation..."
+          bash evals/scripts/run-remote-parameterized.sh "$MODE" "$EVAL_FILE" "$COMMIT_INDEX"
+
+      # Runs only after an earlier step failed. The docker/df/free/ls commands
+      # are best-effort (`|| true`) so diagnostics never mask the real failure.
+      # Step-level `env` from "Run remote evaluation" is not inherited by this
+      # step, so the variables below print their fallback unless they are also
+      # defined at job or workflow level.
+      - name: Dump logs on failure
+        if: failure() && steps.check_commit.outputs.should_run_evals == 'true'
+        run: |
+          echo "❌ SDK Evaluation failed - dumping diagnostic information"
+          echo "🔧 SDK Environment:"
+          echo "  CODEBUFF_WEBSOCKET_URL: ${CODEBUFF_WEBSOCKET_URL:-not set}"
+          echo "  CODEBUFF_SKIP_BINARY_CHECK: ${CODEBUFF_SKIP_BINARY_CHECK:-not set}"
+          # Report only whether the key is present. `${VAR:-default}` expands to
+          # the value itself when VAR is set, so it must not be used for secrets.
+          if [ -n "${CODEBUFF_API_KEY:-}" ]; then
+            echo "  CODEBUFF_API_KEY: [SET]"
+          else
+            echo "  CODEBUFF_API_KEY: [NOT SET]"
+          fi
+          echo "🐳 Docker containers status:"
+          docker ps -a || true
+          echo "📋 Backend container logs:"
+          docker compose -f evals/docker-compose.evals.yml logs backend --tail=200 || true
+          echo "📋 Database container logs:"
+          docker compose -f evals/docker-compose.evals.yml logs db --tail=100 || true
+          echo "💾 Disk usage:"
+          df -h || true
+          echo "🧠 Memory usage:"
+          free -h || true
+          echo "📁 Evaluation files:"
+          ls -la evals/git-evals/ || true
+          ls -la evals/scripts/ || true
+
+      - name: Upload evaluation logs
+        if: always() && steps.check_commit.outputs.should_run_evals == 'true'
+        uses: actions/upload-artifact@v4
+        with:
+          name: remote-eval-logs-${{ github.sha }}
+          path: |
+            evals/test-repos/
+            debug/
+            ~/.cache/bun/
+          retention-days: 7
+
+      - name: Cleanup containers
+        if: always() && steps.check_commit.outputs.should_run_evals == 'true'
+        run: |
+          echo "🧹 Final cleanup - removing all containers and volumes..."
+          docker compose -f evals/docker-compose.evals.yml down -v || true
+          docker system prune -f || true
+          echo "✅ Cleanup completed"
+
+  # Optional: Matrix job to run multiple evaluations in parallel
+  remote-evals-matrix:
+    runs-on: ubuntu-latest
+    timeout-minutes: 90
+    if: contains(github.event.head_commit.message, '[remote-eval-all]')
+    
+    strategy:
+      fail-fast: false
+      matrix:
+        eval:
+          - { file: 'eval-codebuff.json', index: '0' }
+          - { file: 'eval-codebuff.json', index: '1' }
+          - { file: 'eval-manifold.json', index: '0' }
+        
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up Bun
+        uses: oven-sh/setup-bun@v2
+        with:
+          bun-version: '1.2.12'
+
+      - name: Install dependencies
+        run: bun install --frozen-lockfile
+
+      - name: Validate environment for SDK evaluation
+        run: |
+          echo "🔍 Validating SDK evaluation environment for matrix job..."
+          test -f evals/scripts/run-remote-parameterized.sh || { echo "❌ Missing run-remote-parameterized.sh"; exit 1; }
+          test -f evals/git-evals/run-single-eval.ts || { echo "❌ Missing run-single-eval.ts"; exit 1; }
+          test -f evals/docker-compose.evals.yml || { echo "❌ Missing docker-compose.evals.yml"; exit 1; }
+          echo "✅ Matrix environment validation passed"
+
+      - name: Run evaluation matrix
+        env:
+          EVAL_FILE: ${{ matrix.eval.file }}
+          COMMIT_INDEX: ${{ matrix.eval.index }}
+          CODEBUFF_WEBSOCKET_URL: "ws://127.0.0.1:4242/ws"
+          CODEBUFF_SKIP_BINARY_CHECK: "1"
+        run: |
+          echo "🚀 Running matrix evaluation (SDK Mode)..."
+          bash evals/scripts/run-remote-parameterized.sh "bypass" "$EVAL_FILE" "$COMMIT_INDEX"
+
+      # Diagnostics for a failed matrix entry; docker commands are best-effort.
+      - name: Dump matrix logs on failure
+        if: failure()
+        # Step-level env is not shared between steps, so the matrix values are
+        # declared again here; without this $EVAL_FILE and $COMMIT_INDEX would
+        # expand to empty strings in the message below.
+        env:
+          EVAL_FILE: ${{ matrix.eval.file }}
+          COMMIT_INDEX: ${{ matrix.eval.index }}
+        run: |
+          echo "❌ Matrix SDK Evaluation failed - dumping diagnostic information"
+          echo "🔧 Matrix job details: File=$EVAL_FILE, Index=$COMMIT_INDEX"
+          echo "🐳 Docker containers status:"
+          docker ps -a || true
+          echo "📋 Container logs:"
+          docker compose -f evals/docker-compose.evals.yml logs --tail=100 || true
+
+      - name: Upload matrix evaluation results
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: remote-eval-matrix-${{ matrix.eval.file }}-${{ matrix.eval.index }}-${{ github.sha }}
+          path: |
+            evals/test-repos/
+            debug/
+          retention-days: 7
+
+      - name: Cleanup containers
+        if: always()
+        run: |
+          docker compose -f evals/docker-compose.evals.yml down -v || true
+          docker system prune -f || true
\ No newline at end of file
diff --git a/backend/src/index.ts b/backend/src/index.ts
index bb038db89b..d20742cfd8 100644
--- a/backend/src/index.ts
+++ b/backend/src/index.ts
@@ -19,6 +19,7 @@ import {
   sendRequestReconnect,
   waitForAllClientsDisconnected,
   listen as webSocketListen,
+  isWebSocketReady,
 } from './websockets/server'
 
 const app = express()
@@ -31,7 +32,11 @@ app.get('/', (req, res) => {
 })
 
 app.get('/healthz', (req, res) => {
-  res.send('ok')
+  if (isWebSocketReady()) {
+    res.send('ok')
+  } else {
+    res.status(503).send('starting')
+  }
 })
 
 app.post('/api/usage', usageHandler)
diff --git a/backend/src/websockets/auth.ts b/backend/src/websockets/auth.ts
index 927c56d43f..11b0df5edf 100644
--- a/backend/src/websockets/auth.ts
+++ b/backend/src/websockets/auth.ts
@@ -11,6 +11,12 @@ export interface UserInfo {
 export async function getUserIdFromAuthToken(
   authToken: string,
 ): Promise<string | undefined> {
+  // Test-only auth bypass
+  const bypass = process.env.CODEBUFF_TEST_AUTH_TOKEN
+  if (process.env.NODE_ENV === 'test' && bypass && authToken === bypass) {
+    return 'test-user'
+  }
+
   const user = await db
     .select({ id: schema.user.id })
     .from(schema.user)
@@ -25,6 +31,12 @@ export async function getUserIdFromAuthToken(
 export async function getUserInfoFromAuthToken(
   authToken: string,
 ): Promise<UserInfo | undefined> {
+  // Test-only auth bypass
+  const bypass = process.env.CODEBUFF_TEST_AUTH_TOKEN
+  if (process.env.NODE_ENV === 'test' && bypass && authToken === bypass) {
+    return { id: 'test-user', email: 'evals@test.local', discord_id: null }
+  }
+
   const user = await db
     .select({
       id: schema.user.id,
diff --git a/backend/src/websockets/server.ts b/backend/src/websockets/server.ts
index 2f91d488f0..fd7177dba6 100644
--- a/backend/src/websockets/server.ts
+++ b/backend/src/websockets/server.ts
@@ -18,6 +18,8 @@ export const SWITCHBOARD = new Switchboard()
 // if a connection doesn't ping for this long, we assume the other side is toast
 const CONNECTION_TIMEOUT_MS = 60 * 1000
 
+let wsReady = false
+
 export class MessageParseError extends Error {
   details?: unknown
   constructor(message: string, details?: unknown) {
@@ -87,6 +89,7 @@ export function listen(server: HttpServer, path: string) {
   let deadConnectionCleaner: NodeJS.Timeout | undefined
   wss.on('listening', () => {
     logger.info(`Web socket server listening on ${path}.`)
+    wsReady = true
     deadConnectionCleaner = setInterval(function ping() {
       const now = Date.now()
       try {
@@ -175,3 +178,7 @@ export function sendRequestReconnect() {
 export function waitForAllClientsDisconnected() {
   return SWITCHBOARD.waitForAllClientsDisconnected()
 }
+
+/**
+ * Reports whether the WebSocket server has emitted its 'listening' event.
+ *
+ * The flag starts as false and is set to true inside the 'listening' handler
+ * registered by `listen`. Nothing in the visible code resets it afterwards.
+ */
+export function isWebSocketReady() {
+  return wsReady
+}
diff --git a/codebuff.json b/codebuff.json
index 4fa5aa1592..334ca10b17 100644
--- a/codebuff.json
+++ b/codebuff.json
@@ -57,7 +57,7 @@
     },
     {
       "name": "prettier-format",
-      "command": "git diff --name-only --diff-filter=ACMR | grep -E '\\.(ts|tsx|json|md)$' | xargs -r npx prettier --write",
+      "command": "set -o pipefail && CHANGED=\"$(git diff --name-only --diff-filter=ACMR | grep -E '\\.(ts|tsx|json|md)$' | xargs -r npx prettier --list-different || true)\"; [ -n \"$CHANGED\" ] && echo \"$CHANGED\" | xargs -r npx prettier --write --log-level=warn && printf '%s\\n' \"$CHANGED\" || true",
       "filePattern": "**/*.{ts,tsx,json,md}"
     },
     {
@@ -70,6 +70,11 @@
       "command": "bun run typecheck",
       "cwd": ".agents",
       "filePattern": ".agents/**/*.ts"
+    },
+    {
+      "name": "eslint-fix-imports",
+      "command": "set -o pipefail && git diff --name-only --diff-filter=ACMR | grep -E '\\.(ts|tsx|js|jsx)$' | xargs -r npx eslint --fix --quiet",
+      "filePattern": "**/*.{ts,tsx,js,jsx}"
     }
   ]
 }
diff --git a/evals/README.md b/evals/README.md
new file mode 100644
index 0000000000..2abbd46802
--- /dev/null
+++ b/evals/README.md
@@ -0,0 +1,136 @@
+# Remote Evaluation Infrastructure
+
+This directory contains the infrastructure for running Codebuff evaluations in containerized environments (Docker Compose) for CI/CD and local testing.
+
+## Quick Start
+
+### Option 1: Using Drizzle Seed (Recommended)
+```bash
+bash evals/scripts/run-remote.sh seed
+```
+
+### Option 2: Using Test Auth Bypass (Faster)
+```bash
+bash evals/scripts/run-remote.sh bypass
+```
+
+## Prerequisites
+
+- Docker and Docker Compose
+- Bun runtime
+- Optional: `npm install -g codebuff` (or set `CODEBUFF_SKIP_BINARY_CHECK=1`)
+
+## Architecture
+
+- **evals/docker-compose.evals.yml**: Orchestrates PostgreSQL database and backend services
+- **evals/backend.Dockerfile**: Backend container definition
+- **evals/seeds/seed-evals.ts**: Drizzle-based database seeding for test users/sessions
+- **evals/scripts/run-remote.sh**: Main runner script with teardown
+- **evals/scripts/wait-for-healthz.sh**: Health check waiting utility
+
+## Key Features
+
+### SDK Enhancements
+- **Binary Check Skip**: Set `CODEBUFF_SKIP_BINARY_CHECK=1` to skip codebuff CLI requirement
+- **WebSocket URL Override**: Set `CODEBUFF_WEBSOCKET_URL=ws://127.0.0.1:4242/ws` to target ephemeral backend
+
+### Backend Enhancements
+- **Test Auth Bypass**: Set `CODEBUFF_TEST_AUTH_TOKEN` + `NODE_ENV=test` for quick auth
+- **WebSocket-Ready Health Check**: `/healthz` returns 503 until WebSocket server is accepting connections
+
+### Container Strategy
+- **Loopback Binding**: Backend bound to `127.0.0.1:4242` only (no public exposure)
+- **Optimized PostgreSQL**: Fast settings for CI (fsync=off, etc.)
+- **Build Context**: Uses repo root with Dockerfile in evals/ for clean separation
+
+## Environment Variables
+
+- `CODEBUFF_WEBSOCKET_URL`: Override WebSocket URL (e.g., `ws://127.0.0.1:4242/ws`)
+- `CODEBUFF_SKIP_BINARY_CHECK=1`: Skip SDK binary presence check
+- `CODEBUFF_TEST_AUTH_TOKEN`: Enable test-only auth bypass (when NODE_ENV=test)
+- `CODEBUFF_API_KEY`: API key for SDK authentication (set by scripts)
+
+## GitHub Actions Integration
+
+### Automatic Trigger
+Add `[remote-eval]` to your commit message to trigger remote evaluations:
+```bash
+git commit -m "fix: terminal CWD handling [remote-eval]"
+```
+
+### Manual Trigger
+Go to Actions → Remote Evaluations (SDK) → Run workflow:
+- **Eval file**: `eval-codebuff.json` (default)
+- **Commit index**: `0` (default) 
+- **Mode**: `bypass` or `seed`
+
+### Matrix Evaluations
+Add `[remote-eval-all]` to run multiple evaluations in parallel:
+```bash
+git commit -m "major: refactor terminal logic [remote-eval-all]"
+```
+
+### Workflow Files
+- `.github/workflows/remote-evals.yml` - Main remote evaluation workflow
+- Uses our containerized infrastructure with Docker Compose
+- Uploads artifacts and logs automatically
+- Handles cleanup and error reporting
+
+### Usage in CI
+
+```yaml
+# Single evaluation
+- name: Run remote eval (bypass mode)
+  run: bash evals/scripts/run-remote-parameterized.sh bypass eval-codebuff.json 0
+
+# With database seeding  
+- name: Run remote eval (seed mode)
+  run: bash evals/scripts/run-remote-parameterized.sh seed eval-manifold.json 1
+```
+
+## Manual Usage
+
+1. Start services:
+   ```bash
+   docker compose -f evals/docker-compose.evals.yml up -d --build db backend
+   ```
+
+2. Wait for readiness:
+   ```bash
+   evals/scripts/wait-for-healthz.sh http://127.0.0.1:4242/healthz 90
+   ```
+
+3. Seed database and capture API key:
+   ```bash
+   KEY_LINE=$(docker compose -f evals/docker-compose.evals.yml run --rm seeder | tail -n1)
+   export CODEBUFF_API_KEY="${KEY_LINE#CODEBUFF_API_KEY=}"
+   ```
+
+4. Run evaluation:
+   ```bash
+   export CODEBUFF_WEBSOCKET_URL=ws://127.0.0.1:4242/ws
+   export CODEBUFF_SKIP_BINARY_CHECK=1
+   bun evals/git-evals/run-single-eval.ts --prompt "Your test prompt"
+   ```
+
+5. Cleanup:
+   ```bash
+   docker compose -f evals/docker-compose.evals.yml down -v
+   ```
+
+## Troubleshooting
+
+- **Connection Issues**: Check that `CODEBUFF_WEBSOCKET_URL=ws://127.0.0.1:4242/ws` is set
+- **Auth Failures**: Verify `CODEBUFF_API_KEY` is properly captured from seeder output
+- **Backend Not Ready**: Ensure `/healthz` returns 200 before proceeding
+- **Port Conflicts**: Backend binds to `127.0.0.1:4242` - ensure port is available
+
+## Implementation Details
+
+Based on the remote-eval-infra-plan.md specification:
+- Monorepo + Bun compatible
+- Docker-agnostic backend (Dockerfile lives in evals/)
+- Idempotent Drizzle seeding with deterministic IDs
+- WS readiness validation in health checks
+- Test-only auth bypass for fast smoke tests
+- Comprehensive error logging and cleanup
\ No newline at end of file
diff --git a/evals/backend.Dockerfile b/evals/backend.Dockerfile
new file mode 100644
index 0000000000..c8a0cfed6c
--- /dev/null
+++ b/evals/backend.Dockerfile
@@ -0,0 +1,6 @@
+# Backend image for the eval stack. The build context is the repository root
+# (see `context: ..` in evals/docker-compose.evals.yml), so the whole monorepo
+# is copied in before dependencies are installed.
+# NOTE(review): the CI workflow installs with bun 1.2.12 while this image pins
+# 1.1.34 - confirm the committed lockfile is accepted by both versions.
+FROM oven/bun:1.1.34 AS base
+WORKDIR /app
+COPY . .
+RUN bun install --frozen-lockfile
+# Port the backend listens on (PORT=4242 in the compose file).
+EXPOSE 4242
+# Runs the `dev` script with `backend` as the working directory.
+CMD ["bun", "--cwd", "backend", "dev"]
\ No newline at end of file
diff --git a/evals/docker-compose.evals.yml b/evals/docker-compose.evals.yml
new file mode 100644
index 0000000000..81bfc05b63
--- /dev/null
+++ b/evals/docker-compose.evals.yml
@@ -0,0 +1,101 @@
+services:
+  db:
+    image: postgres:16-alpine
+    environment:
+      POSTGRES_USER: codebuff
+      POSTGRES_PASSWORD: codebuff
+      POSTGRES_DB: codebuff
+    command: [
+      "postgres",
+      "-c", "fsync=off",
+      "-c", "synchronous_commit=off",
+      "-c", "full_page_writes=off",
+      "-c", "log_statement=all",
+      "-c", "log_destination=stderr",
+      "-c", "logging_collector=off"
+    ]
+    healthcheck:
+      test: ["CMD-SHELL", "pg_isready -U codebuff -d codebuff"]
+      interval: 5s
+      timeout: 3s
+      retries: 20
+    logging:
+      driver: "json-file"
+      options:
+        max-size: "10m"
+        max-file: "3"
+
+  backend:
+    build:
+      context: ..                 # project root
+      dockerfile: ./evals/backend.Dockerfile
+    environment:
+      # Database
+      DATABASE_URL: postgresql://codebuff:codebuff@db:5432/codebuff
+      NODE_ENV: test
+      PORT: 4242
+      
+      # Required API keys (dummy values for testing)
+      ANTHROPIC_API_KEY: test-key
+      ANTHROPIC_API_KEY2: test-key
+      HELICONE_API_KEY: test-key
+      OPEN_AI_KEY: test-key
+      GEMINI_API_KEY: test-key
+      GOOGLE_GENERATIVE_AI_API_KEY: test-key
+      DEEPSEEK_API_KEY: test-key
+      OPEN_ROUTER_API_KEY: test-key
+      RELACE_API_KEY: test-key
+      LINKUP_API_KEY: test-key
+      GOOGLE_CLOUD_PROJECT_ID: test-project
+      
+      # Auth/Web variables
+      CODEBUFF_GITHUB_ID: test-id
+      CODEBUFF_GITHUB_SECRET: test-secret
+      NEXTAUTH_SECRET: test-secret-32-chars-long-minimum
+      STRIPE_SECRET_KEY: sk_test_dummy
+      STRIPE_WEBHOOK_SECRET_KEY: whsec_dummy
+      STRIPE_USAGE_PRICE_ID: price_dummy
+      STRIPE_TEAM_FEE_PRICE_ID: price_dummy
+      LOOPS_API_KEY: test-key
+      DISCORD_PUBLIC_KEY: test-key
+      DISCORD_BOT_TOKEN: test-token
+      DISCORD_APPLICATION_ID: test-id
+      API_KEY_ENCRYPTION_SECRET: 1234567890123456789012345678901a
+      
+      # Public variables
+      NEXT_PUBLIC_CB_ENVIRONMENT: test
+      NEXT_PUBLIC_APP_URL: http://localhost:3000
+      NEXT_PUBLIC_BACKEND_URL: http://localhost:4242
+      NEXT_PUBLIC_SUPPORT_EMAIL: test@example.com
+      NEXT_PUBLIC_STRIPE_PUBLISHABLE_KEY: pk_test_dummy
+      NEXT_PUBLIC_STRIPE_CUSTOMER_PORTAL: https://dummy.stripe.com
+      
+      # Optional test-only bypass. The `:-` default makes an unset host variable
+      # expand to an empty string explicitly instead of triggering Compose's
+      # "variable is not set" warning; an empty token leaves the bypass disabled.
+      CODEBUFF_TEST_AUTH_TOKEN: ${CODEBUFF_TEST_AUTH_TOKEN:-}
+    depends_on:
+      db:
+        condition: service_healthy
+    ports:
+      - "127.0.0.1:4242:4242"     # loopback only
+    healthcheck:
+      # Probe /healthz with bun, which the backend image is built on, instead
+      # of curl, which evals/backend.Dockerfile never installs. The check exits
+      # 0 only for a 2xx response; a 503 "starting" reply or a connection
+      # error exits 1.
+      test:
+        - "CMD"
+        - "bun"
+        - "-e"
+        - "fetch('http://localhost:4242/healthz').then((r) => process.exit(r.ok ? 0 : 1)).catch(() => process.exit(1))"
+      interval: 5s
+      timeout: 3s
+      retries: 30
+    logging:
+      driver: "json-file"
+      options:
+        max-size: "50m"
+        max-file: "3"
+
+  seeder:
+    image: oven/bun:1.1.34
+    working_dir: /app
+    volumes:
+      - ..:/app:ro
+    environment:
+      DATABASE_URL: postgresql://codebuff:codebuff@db:5432/codebuff
+    entrypoint: ["bun", "run", "evals/seeds/seed-evals.ts"]
+    depends_on:
+      db:
+        condition: service_healthy
\ No newline at end of file
diff --git a/evals/git-evals/run-git-evals-legacy.ts b/evals/git-evals/run-git-evals-legacy.ts
new file mode 100644
index 0000000000..94a713c110
--- /dev/null
+++ b/evals/git-evals/run-git-evals-legacy.ts
@@ -0,0 +1,580 @@
+import { execFileSync, execSync, fork } from 'child_process'
+import fs from 'fs'
+import path from 'path'
+
+import { disableLiveUserInputCheck } from '@codebuff/backend/live-user-inputs'
+import { promptAiSdkStructured } from '@codebuff/backend/llm-apis/vercel-ai-sdk/ai-sdk'
+import { models } from '@codebuff/common/constants'
+import { getDefaultConfig } from '@codebuff/common/json-config/default'
+import { AgentTemplateTypes } from '@codebuff/common/types/session-state'
+import { withTimeout } from '@codebuff/common/util/promise'
+import { generateCompactId } from '@codebuff/common/util/string'
+import pLimit from 'p-limit'
+
+import {
+  createFileReadingMock,
+  loopMainPrompt,
+  resetRepoToCommit,
+} from '../scaffolding'
+import { createInitialSessionState } from '../test-setup'
+import { judgeEvalRun } from './judge-git-eval'
+import { extractRepoNameFromUrl, setupTestRepo } from './setup-test-repo'
+import { AgentDecisionSchema } from './types'
+
+import type { AgentStep } from '../scaffolding'
+import type {
+  AgentDecision,
+  CodebuffTrace,
+  EvalCommit,
+  EvalRunJudged,
+  EvalRunLog,
+  FileState,
+  FullEvalLog,
+  EvalData,
+} from './types'
+
+disableLiveUserInputCheck()
+
+// Try Gemini!
+const AGENT_TYPE = AgentTemplateTypes.base
+
+const EDIT_FILE_TOOL_NAMES = ['write_file', 'str_replace'] as const
+
+/**
+ * Runs one evaluation: a prompting model repeatedly decides what to ask
+ * Codebuff in order to implement `evalCommit.spec`, and the run is then judged.
+ *
+ * The repository at `projectPath` is first reset to the parent of
+ * `evalCommit.sha`. Up to MAX_ATTEMPTS decision rounds are performed; the loop
+ * stops as soon as the prompting model answers anything other than 'continue'.
+ *
+ * Errors raised while driving the agent are not rethrown: they are recorded in
+ * the returned `error` field and the run is still handed to the judge.
+ *
+ * @param evalCommit - Commit under evaluation (its `sha` and `spec` are used).
+ * @param projectPath - Working copy of the test repository.
+ * @param clientSessionId - Forwarded to the prompting-model call.
+ * @param fingerprintId - Forwarded to the prompting-model call.
+ * @param agentType - Agent template handed to `loopMainPrompt`.
+ * @returns The run log plus judging results; all-zero scores are returned when
+ *   the judging step itself throws.
+ */
+export async function runSingleEval(
+  evalCommit: EvalCommit,
+  projectPath: string,
+  clientSessionId: string,
+  fingerprintId: string,
+  agentType: string = AGENT_TYPE,
+): Promise<EvalRunJudged> {
+  const startTime = new Date()
+  const trace: CodebuffTrace[] = []
+  let error: string | undefined
+
+  // Process-level failures are captured here and surfaced at the top of the
+  // next loop iteration (or after the loop if no error was recorded).
+  let processError: string | undefined
+
+  const uncaughtHandler = (err: Error) => {
+    console.error('Uncaught exception during eval:', err)
+    processError = `Uncaught exception: ${err.message}\n${err.stack}`
+  }
+
+  const unhandledHandler = (reason: unknown) => {
+    console.error('Unhandled rejection during eval:', reason)
+    // Format Error instances explicitly: interpolating an object literal into
+    // a template string would only produce "[object Object]".
+    const detail =
+      reason instanceof Error
+        ? `${reason.message}\n${reason.stack}`
+        : String(reason)
+    processError = `Unhandled rejection: ${detail}`
+  }
+
+  process.on('uncaughtException', uncaughtHandler)
+  process.on('unhandledRejection', unhandledHandler)
+
+  // Renders one agent step as fenced sections for the prompting model.
+  function renderAgentStep(step: AgentStep): string {
+    const { response, toolCalls, toolResults } = step
+    return [
+      `\`\`\`text_response\n${response}\n\`\`\``,
+      `\`\`\`tool_calls\n${JSON.stringify(toolCalls, null, 2)}\n\`\`\``,
+      `\`\`\`tool_results\n${JSON.stringify(toolResults, null, 2)}\n\`\`\``,
+    ].join('\n\n')
+  }
+
+  try {
+    // Reset to the commit before the target commit
+    resetRepoToCommit(projectPath, `${evalCommit.sha}^`)
+
+    // Initialize agent state
+    createFileReadingMock(projectPath)
+    let sessionState = await createInitialSessionState(projectPath)
+
+    let currentDecision: AgentDecision = 'continue'
+    let attempts = 0
+    const MAX_ATTEMPTS = 5
+
+    while (currentDecision === 'continue' && attempts < MAX_ATTEMPTS) {
+      // Check for process-level errors
+      if (processError) {
+        throw new Error(processError)
+      }
+
+      const renderedTrace = trace
+        .map(
+          ({ prompt, steps }) =>
+            `You: ${prompt}\n\nCodebuff:${steps.map(renderAgentStep).join('\n\n')}`,
+        )
+        .join('\n\n')
+
+      // Get next prompt from Sonnet agent with timeout
+      let agentResponse: any
+      try {
+        agentResponse = await promptAiSdkStructured({
+          messages: [
+            {
+              role: 'user',
+              content: `You are an expert software engineer tasked with implementing a specification using CodeBuff, an AI coding assistant. Your goal is to prompt CodeBuff to implement the spec correctly. You are in a conversation with this coding agent.
+
+Current spec to implement:
+<spec>${evalCommit.spec}</spec>
+
+Your conversation with Codebuff so far:
+<conversation>${renderedTrace}</conversation>
+
+Note that files can only be changed with tools. If no tools are called, no files were changed.
+
+You must decide whether to:
+1. 'continue' - Generate a follow-up prompt for Codebuff
+2. 'complete' - The implementation is done and satisfies the spec
+3. 'halt' - The implementation is off track and unlikely to be completed within ${MAX_ATTEMPTS - attempts} more attempts
+
+If deciding to continue, include a clear, focused prompt for Codebuff in next_prompt.
+Explain your reasoning in detail.`,
+            },
+          ],
+          schema: AgentDecisionSchema,
+          model: models.gemini2_5_flash,
+          clientSessionId,
+          fingerprintId,
+          userInputId: generateCompactId(),
+          userId: undefined,
+          timeout: 5 * 60_000, // 5 minute timeout
+        })
+      } catch (agentError) {
+        throw new Error(
+          `Agent decision failed: ${agentError instanceof Error ? agentError.message : String(agentError)}`,
+        )
+      }
+
+      console.log('Agent decision:', agentResponse.decision)
+      console.log('Agent reasoning:', agentResponse.reasoning)
+
+      // A 'continue' decision without a prompt falls back to a bare "continue".
+      if (agentResponse.decision === 'continue' && !agentResponse.next_prompt) {
+        agentResponse.next_prompt = 'continue'
+      }
+
+      // If continuing, run CodeBuff with the agent's prompt
+      if (agentResponse.decision === 'continue') {
+        const prompt = agentResponse.next_prompt!
+
+        // Use loopMainPrompt with timeout wrapper
+        const codeBuffResult = await withTimeout(
+          loopMainPrompt({
+            sessionState,
+            prompt,
+            projectPath,
+            maxIterations: 20,
+            agentType: agentType as any,
+          }),
+          // Timeout after 30 minutes
+          60_000 * 30,
+        )
+
+        // Carry the agent state forward and refill its step budget.
+        sessionState.mainAgentState = codeBuffResult.agentState
+        sessionState.mainAgentState.stepsRemaining =
+          getDefaultConfig().maxAgentSteps
+        trace.push({ prompt, steps: codeBuffResult.steps })
+      }
+
+      currentDecision = agentResponse.decision
+      attempts++
+    }
+  } catch (e) {
+    console.error('Error in runSingleEval:', e)
+    error =
+      e instanceof Error
+        ? `${e.message}\n${e.stack}`
+        : `Unknown error: ${String(e)}`
+  } finally {
+    // Detach only the listeners registered above. Listeners that existed
+    // before this call were never removed, so re-adding them here would
+    // register a duplicate of each one on every eval.
+    process.removeListener('uncaughtException', uncaughtHandler)
+    process.removeListener('unhandledRejection', unhandledHandler)
+  }
+
+  // If we caught a process-level error, use that
+  if (processError && !error) {
+    error = processError
+  }
+
+  const endTime = new Date()
+  const durationMs = endTime.getTime() - startTime.getTime()
+
+  const fileStates = getCodebuffFileStates(trace, evalCommit.sha, projectPath)
+
+  const evalRun: EvalRunLog = {
+    eval_commit: evalCommit,
+    trace,
+    error,
+    fileStates,
+    durationMs,
+  }
+
+  // Add judging results even for failed runs
+  try {
+    const judgingResults = await judgeEvalRun(evalRun)
+    console.log('Judging results:', judgingResults)
+    return {
+      ...evalRun,
+      judging_results: judgingResults,
+    }
+  } catch (judgingError) {
+    console.error('Error in judging:', judgingError)
+    // Fall back to placeholder results with zero scores if judging fails
+    return {
+      ...evalRun,
+      judging_results: {
+        analysis: 'Judging failed due to error',
+        strengths: [],
+        weaknesses: ['Judging process encountered an error'],
+        metrics: {
+          completionScore: 0,
+          efficiencyScore: 0,
+          codeQualityScore: 0,
+          overallScore: 0,
+        },
+      },
+    }
+  }
+}
+
+/**
+ * Collects before/after contents for every file Codebuff edited during a run.
+ *
+ * A file counts as edited when a tool call whose name is listed in
+ * EDIT_FILE_TOOL_NAMES carries a truthy `path` input. The "after" content is
+ * read from the working tree under `projectPath`; the "before" content comes
+ * from `git show <sha>^:<path>`. Placeholder strings are stored when either
+ * side cannot be read.
+ *
+ * @param trace - Prompts and agent steps recorded for the run.
+ * @param evalCommitSha - Target commit; its parent supplies the "before" state.
+ * @param projectPath - Root of the repository the agent worked in.
+ * @returns One entry per distinct edited path.
+ */
+function getCodebuffFileStates(
+  trace: CodebuffTrace[],
+  evalCommitSha: string,
+  projectPath: string,
+): FileState[] {
+  const codebuffWrittenFilePaths = new Set<string>()
+  if (trace) {
+    // trace might be undefined or empty if error occurred very early
+    for (const traceEntry of trace) {
+      for (const step of traceEntry.steps) {
+        if (step.toolCalls) {
+          for (const toolCall of step.toolCalls) {
+            if (
+              EDIT_FILE_TOOL_NAMES.includes(toolCall.toolName as any) &&
+              'path' in toolCall.input &&
+              toolCall.input.path
+            ) {
+              codebuffWrittenFilePaths.add(toolCall.input.path as string)
+            }
+          }
+        }
+      }
+    }
+  }
+
+  const fileStates: FileState[] = []
+
+  for (const filePath of codebuffWrittenFilePaths) {
+    // Capture "after" state
+    const fullPath = path.join(projectPath, filePath)
+    let postContent: string
+    try {
+      postContent = fs.existsSync(fullPath)
+        ? fs.readFileSync(fullPath, 'utf-8')
+        : '[FILE_NOT_FOUND_POST_RUN]'
+    } catch (e) {
+      console.error(`Error reading file ${fullPath} for after state:`, e)
+      postContent = '[ERROR_READING_AFTER_STATE]'
+    }
+
+    // Capture "before" state. The path originates from model-generated tool
+    // calls, so git is invoked with an argument array (no shell) to keep
+    // quotes, `$`, or backticks in the path from being interpreted.
+    let preContent: string
+    try {
+      preContent = execFileSync(
+        'git',
+        ['show', `${evalCommitSha}^:${filePath}`],
+        {
+          cwd: projectPath,
+          stdio: ['ignore', 'pipe', 'ignore'],
+        },
+      ).toString()
+    } catch (e) {
+      // git fails when the file is absent from the parent commit
+      preContent = '[FILE_DID_NOT_EXIST_PRIOR_TO_CODEBUFF_CHANGES]'
+    }
+
+    fileStates.push({ path: filePath, preContent, postContent })
+  }
+  return fileStates
+}
+
+/**
+ * Loads a previously saved eval log from disk instead of running evals.
+ *
+ * @param path - Location of a JSON file holding a serialized FullEvalLog.
+ * @returns The parsed log; the JSON is cast, not validated.
+ */
+export function mockRunGitEvals(path: string) {
+  const serializedLog = fs.readFileSync(path, 'utf-8')
+  return JSON.parse(serializedLog) as FullEvalLog
+}
+
+// Global concurrency limiter that can be shared across multiple repository evaluations.
+// Stays null until setGlobalConcurrencyLimit is called; runGitEvals then falls
+// back to its own pLimit(20).
+let globalConcurrencyLimiter: ReturnType<typeof pLimit> | null = null
+
+/**
+ * Installs the shared limiter used by subsequent runGitEvals calls.
+ * Each call replaces the previous limiter with a fresh one.
+ *
+ * @param limit - Concurrency value passed straight to pLimit.
+ */
+export function setGlobalConcurrencyLimit(limit: number) {
+  globalConcurrencyLimiter = pLimit(limit)
+}
+
+/**
+ * Runs the eval commits from `evalDataPath`, each in its own forked child
+ * process, and writes the aggregated log to `outputDir`. Evals that fail are
+ * logged and left out of the returned log rather than thrown.
+ *
+ * @param limit If truthy, only the first `limit` commits are evaluated.
+ * @param logToStdout Send child output to stdout instead of per-eval log files.
+ */
+export async function runGitEvals(
+  evalDataPath: string,
+  outputDir: string,
+  agentType: string = AGENT_TYPE,
+  limit?: number,
+  logToStdout: boolean = false,
+): Promise<FullEvalLog> {
+  console.log(`Loading eval data from: ${evalDataPath}`)
+  const evalData = JSON.parse(
+    fs.readFileSync(evalDataPath, 'utf-8'),
+  ) as EvalData
+
+  console.log(
+    `Loaded ${evalData.evalCommits.length} eval commits from ${evalDataPath}`,
+  )
+
+  const { repoUrl } = evalData
+
+  // Prefer the configured testRepoName; otherwise derive it from the repo URL
+  const testRepoName = evalData.testRepoName || extractRepoNameFromUrl(repoUrl)
+
+  const clientSessionId = generateCompactId()
+  const fingerprintId = generateCompactId()
+
+  // Generate unique trace ID for this run
+  const traceId = generateCompactId()
+  console.log(`Starting eval run with trace ID: ${traceId}`)
+
+  // The recursive mkdir also creates outputDir itself when it is missing
+  const logsDir = path.join(outputDir, 'logs', `${testRepoName}-${traceId}`)
+  fs.mkdirSync(logsDir, { recursive: true })
+
+  const commitsToRun = limit
+    ? evalData.evalCommits.slice(0, limit)
+    : evalData.evalCommits
+
+  console.log(
+    `Running ${commitsToRun.length} evaluations out of ${evalData.evalCommits.length} total commits...`,
+  )
+  console.log(
+    `Using concurrency limit: ${globalConcurrencyLimiter ? 'global limiter' : 'local limiter (20)'}`,
+  )
+
+  // Use global limiter if available, otherwise create a local one
+  const limitConcurrency = globalConcurrencyLimiter || pLimit(20)
+
+  const evalPromises = commitsToRun.map((evalCommit, index) => {
+    return limitConcurrency(
+      () =>
+        new Promise<EvalRunJudged>(async (resolve, reject) => {
+          try {
+            console.log(
+              `Setting up test repository for commit ${evalCommit.sha}...`,
+            )
+            const projectPath = await setupTestRepo(
+              repoUrl,
+              testRepoName,
+              evalCommit.sha,
+            )
+
+            console.log(
+              `Starting ${testRepoName} eval ${index + 1}/${commitsToRun.length} for commit ${evalCommit.spec.split('\n')[0]}...`,
+            )
+
+            const safeMessage = evalCommit.spec
+              .split('\n')[0]
+              .replace(/[^a-zA-Z0-9]/g, '_')
+              .slice(0, 30)
+            const logFilename = `${safeMessage}-${evalCommit.sha.slice(0, 7)}.log`
+            const logPath = path.join(logsDir, logFilename)
+            const logStream = logToStdout
+              ? process.stdout
+              : fs.createWriteStream(logPath)
+
+            // Write evalCommit to temporary file to avoid long command line arguments
+            const tempEvalCommitPath = path.join(
+              logsDir,
+              `eval-commit-${evalCommit.sha.slice(0, 7)}.json`,
+            )
+            fs.writeFileSync(tempEvalCommitPath, JSON.stringify(evalCommit))
+
+            const child = fork(
+              path.resolve(__dirname, 'run-single-eval-process.ts'),
+              [
+                tempEvalCommitPath,
+                projectPath,
+                clientSessionId,
+                fingerprintId,
+                agentType,
+              ],
+              { stdio: ['pipe', 'pipe', 'pipe', 'ipc'] },
+            )
+
+            // stdout and stderr share one sink, so neither pipe may end it: the
+            // first stream to finish would close the log under the other one.
+            child.stdout?.pipe(logStream, { end: false })
+            child.stderr?.pipe(logStream, { end: false })
+
+            child.on(
+              'message',
+              (message: {
+                type: string
+                result?: EvalRunJudged
+                error?: any
+              }) => {
+                // Clean up temp file
+                try {
+                  fs.unlinkSync(tempEvalCommitPath)
+                } catch (e) {
+                  console.warn(
+                    `Failed to clean up temp file ${tempEvalCommitPath}:`,
+                    e,
+                  )
+                }
+                if (message.type === 'result' && message.result) {
+                  console.log(
+                    `Completed eval for commit ${testRepoName} - ${evalCommit.spec.split('\n')[0]}`,
+                  )
+                  if (!logToStdout) {
+                    console.log(`${JSON.stringify(message.result, null, 2)}`)
+                  }
+                  resolve(message.result)
+                } else if (message.type === 'error') {
+                  // `error` is optional, so never dereference it unguarded here
+                  console.error(
+                    `Received error while running eval: ${message.error?.stack}\n`,
+                    { message },
+                  )
+                  reject(new Error(message.error?.message ?? 'Unknown error'))
+                }
+              },
+            )
+
+            child.on('exit', (code) => {
+              if (code !== 0) {
+                console.error(
+                  `Eval process for ${evalCommit.sha} exited with code ${code}. See logs at ${logPath}`,
+                )
+                reject(
+                  new Error(
+                    `Eval process for ${evalCommit.sha} exited with code ${code}`,
+                  ),
+                )
+              }
+            })
+
+            // 'close' fires once the child's stdio streams are done, so nothing
+            // is written after end(). process.stdout is shared: never end it.
+            child.on('close', () => {
+              if (!logToStdout) logStream.end()
+            })
+          } catch (error) {
+            console.error(
+              `Error while running git eval for ${testRepoName} commit ${evalCommit.sha}`,
+              { error },
+            )
+            reject(error)
+          }
+        }),
+    )
+  })
+
+  const results = await Promise.allSettled(evalPromises)
+
+  console.log(
+    `Promise.allSettled completed. Results: ${results.length} total, ${results.filter((r) => r.status === 'fulfilled').length} fulfilled, ${results.filter((r) => r.status === 'rejected').length} rejected`,
+  )
+
+  // Log rejected promises for debugging
+  results.forEach((result, index) => {
+    if (result.status === 'rejected') {
+      console.error(
+        `❌ Eval ${index + 1}/${commitsToRun.length} (${commitsToRun[index].sha}) was rejected:`,
+        result.reason,
+      )
+    }
+  })
+
+  const evalRuns = results
+    .filter((result) => result.status === 'fulfilled')
+    .map((result) => result.value)
+
+  const result: FullEvalLog = {
+    test_repo_name: testRepoName,
+    generation_date: new Date().toISOString(),
+    eval_runs: evalRuns,
+    overall_metrics: calculateOverallMetrics(evalRuns),
+  }
+
+  // Write the final log; the trace ID in the name keeps runs from colliding
+  const finalOutputPath = path.join(
+    outputDir,
+    `eval-result-${testRepoName}-${traceId}.json`,
+  )
+  fs.writeFileSync(finalOutputPath, JSON.stringify(result, null, 2))
+
+  console.log('All evals complete!')
+  console.log(`Final results written to ${finalOutputPath}`)
+
+  return result
+}
+
+/**
+ * Averages judge scores and duration across `evalRuns` and counts failures.
+ * Averages are 0 for an empty list instead of NaN from dividing by zero.
+ */
+function calculateOverallMetrics(evalRuns: EvalRunJudged[]) {
+  const totalRuns = evalRuns.length
+  // Mean of one numeric field per run; a missing or zero value adds nothing.
+  const mean = (pick: (run: EvalRunJudged) => number | undefined) =>
+    totalRuns === 0
+      ? 0
+      : evalRuns.reduce((sum, run) => sum + (pick(run) || 0), 0) / totalRuns
+
+  return {
+    average_completion: mean(
+      (run) => run.judging_results.metrics.completionScore,
+    ),
+    average_efficiency: mean(
+      (run) => run.judging_results.metrics.efficiencyScore,
+    ),
+    average_code_quality: mean(
+      (run) => run.judging_results.metrics.codeQualityScore,
+    ),
+    average_overall: mean((run) => run.judging_results.metrics.overallScore),
+    average_duration_ms: mean((run) => run.durationMs),
+    total_runs: totalRuns,
+    successful_runs: evalRuns.filter((run) => !run.error).length,
+    failed_runs: evalRuns.filter((run) => run.error).length,
+  }
+}
+
+// CLI entry point: runs only when this file is executed directly
+if (require.main === module) {
+  console.info(
+    'Usage: bun run run-git-eval [eval-data-path] [output-dir] [agent-type]',
+  )
+
+  const [dataPathArg, outputDirArg, agentTypeArg] = process.argv.slice(2)
+  const evalDataPath = dataPathArg || 'git-evals/git-evals.json'
+  const outputDir = outputDirArg || 'git-evals'
+  const agentType = agentTypeArg || AGENT_TYPE
+
+  runGitEvals(evalDataPath, outputDir, agentType)
+    .then(() => {
+      console.log('Done!')
+      process.exit(0)
+    })
+    .catch((err) => {
+      console.error('Error running evals:', err)
+      process.exit(1)
+    })
+}
diff --git a/evals/git-evals/run-git-evals.ts b/evals/git-evals/run-git-evals.ts
index 94a713c110..2f3859444e 100644
--- a/evals/git-evals/run-git-evals.ts
+++ b/evals/git-evals/run-git-evals.ts
@@ -2,26 +2,20 @@ import { execSync, fork } from 'child_process'
 import fs from 'fs'
 import path from 'path'
 
-import { disableLiveUserInputCheck } from '@codebuff/backend/live-user-inputs'
 import { promptAiSdkStructured } from '@codebuff/backend/llm-apis/vercel-ai-sdk/ai-sdk'
 import { models } from '@codebuff/common/constants'
-import { getDefaultConfig } from '@codebuff/common/json-config/default'
-import { AgentTemplateTypes } from '@codebuff/common/types/session-state'
-import { withTimeout } from '@codebuff/common/util/promise'
 import { generateCompactId } from '@codebuff/common/util/string'
+import { withTimeout } from '@codebuff/common/util/promise'
+import { CodebuffClient } from '../../sdk/src/client'
 import pLimit from 'p-limit'
 
 import {
-  createFileReadingMock,
-  loopMainPrompt,
   resetRepoToCommit,
 } from '../scaffolding'
-import { createInitialSessionState } from '../test-setup'
-import { judgeEvalRun } from './judge-git-eval'
 import { extractRepoNameFromUrl, setupTestRepo } from './setup-test-repo'
+import { judgeEvalRun } from './judge-git-eval'
 import { AgentDecisionSchema } from './types'
 
-import type { AgentStep } from '../scaffolding'
 import type {
   AgentDecision,
   CodebuffTrace,
@@ -33,11 +27,6 @@ import type {
   EvalData,
 } from './types'
 
-disableLiveUserInputCheck()
-
-// Try Gemini!
-const AGENT_TYPE = AgentTemplateTypes.base
-
 const EDIT_FILE_TOOL_NAMES = ['write_file', 'str_replace'] as const
 
 export async function runSingleEval(
@@ -45,7 +34,7 @@ export async function runSingleEval(
   projectPath: string,
   clientSessionId: string,
   fingerprintId: string,
-  agentType: string = AGENT_TYPE,
+  agentType: string = 'base',
 ): Promise<EvalRunJudged> {
   const startTime = new Date()
   const trace: CodebuffTrace[] = []
@@ -70,17 +59,24 @@ export async function runSingleEval(
   process.on('uncaughtException', uncaughtHandler)
   process.on('unhandledRejection', unhandledHandler)
 
+  let client: CodebuffClient | undefined
+
   try {
     // Reset to the commit before the target commit
     resetRepoToCommit(projectPath, `${evalCommit.sha}^`)
 
-    // Initialize agent state
-    createFileReadingMock(projectPath)
-    let sessionState = await createInitialSessionState(projectPath)
+    // Initialize SDK client
+    client = new CodebuffClient({
+      cwd: projectPath,
+      onError: (error) => {
+        console.error('SDK error:', error.message)
+      },
+    })
 
     let currentDecision: AgentDecision = 'continue'
     let attempts = 0
     const MAX_ATTEMPTS = 5
+    let previousRun: any = undefined
 
     while (currentDecision === 'continue' && attempts < MAX_ATTEMPTS) {
       // Check for process-level errors
@@ -88,7 +84,7 @@ export async function runSingleEval(
         throw new Error(processError)
       }
 
-      function renderAgentStep(step: AgentStep): string {
+      function renderAgentStep(step: any): string {
         const { response, toolCalls, toolResults } = step
         return [
           `\`\`\`text_response\n${response}\n\`\`\``,
@@ -96,6 +92,7 @@ export async function runSingleEval(
           `\`\`\`tool_results\n${JSON.stringify(toolResults, null, 2)}\n\`\`\``,
         ].join('\n\n')
       }
+
       const renderedTrace = trace
         .map(
           ({ prompt, steps }) =>
@@ -143,9 +140,17 @@ Explain your reasoning in detail.`,
         )
       }
 
+      console.log('Agent response:', JSON.stringify(agentResponse, null, 2))
       console.log('Agent decision:', agentResponse.decision)
       console.log('Agent reasoning:', agentResponse.reasoning)
 
+      // Handle undefined decision
+      if (!agentResponse.decision) {
+        console.warn('Agent decision is undefined, defaulting to halt')
+        agentResponse.decision = 'halt'
+        agentResponse.reasoning = 'Agent failed to provide a decision'
+      }
+
       if (agentResponse.decision === 'continue' && !agentResponse.next_prompt) {
         agentResponse.next_prompt = 'continue'
       }
@@ -154,35 +159,77 @@ Explain your reasoning in detail.`,
       if (agentResponse.decision === 'continue') {
         const prompt = agentResponse.next_prompt!
 
-        // Use loopMainPrompt with timeout wrapper
+        // Use SDK client with timeout wrapper
         const codeBuffResult = await withTimeout(
-          loopMainPrompt({
-            sessionState,
+          client.run({
+            agent: agentType,
             prompt,
-            projectPath,
-            maxIterations: 20,
-            agentType: agentType as any,
+            previousRun,
           }),
           // Timeout after 30 minutes
           60_000 * 30,
         )
 
-        sessionState.mainAgentState = codeBuffResult.agentState
-        sessionState.mainAgentState.stepsRemaining =
-          getDefaultConfig().maxAgentSteps
-        trace.push({ prompt, steps: codeBuffResult.steps })
+        // Convert SDK results to expected trace format
+        const toolResults = codeBuffResult.toolResults || []
+        const steps = []
+        
+        // Group tool results by response chunks if available
+        if (toolResults.length > 0) {
+          let currentResponse = ''
+          let currentToolCalls = []
+          let currentToolResults = []
+          
+          for (const result of toolResults) {
+            if (result.toolCall) {
+              currentToolCalls.push(result.toolCall)
+            }
+            currentToolResults.push(result)
+            if (result.output?.value) {
+              currentResponse += result.output.value
+            }
+          }
+          
+          steps.push({
+            response: currentResponse || prompt, // Fallback to prompt if no response
+            toolCalls: currentToolCalls,
+            toolResults: currentToolResults
+          })
+        } else {
+          // No tool results, likely just a text response
+          steps.push({
+            response: 'Processing completed',
+            toolCalls: [],
+            toolResults: []
+          })
+        }
+        
+        trace.push({ prompt, steps })
+
+        // Update previousRun for next iteration
+        previousRun = codeBuffResult
       }
 
       currentDecision = agentResponse.decision
       attempts++
     }
   } catch (e) {
-    console.error('Error in runSingleEval:', e)
+    console.error('Error in runSingleEvalSDK:', e)
     error =
       e instanceof Error
         ? `${e.message}\n${e.stack}`
         : `Unknown error: ${String(e)}`
   } finally {
+    // Close SDK client connection safely
+    if (client) {
+      try {
+        client.closeConnection()
+      } catch (closeError) {
+        // WebSocket might not be connected yet, so just log and continue
+        console.debug('Note: SDK client close error (likely not connected):', closeError)
+      }
+    }
+
     // Clean up process-level error handlers
     process.removeListener('uncaughtException', uncaughtHandler)
     process.removeListener('unhandledRejection', unhandledHandler)
@@ -304,12 +351,6 @@ function getCodebuffFileStates(
   return fileStates
 }
 
-export function mockRunGitEvals(path: string) {
-  const result = JSON.parse(fs.readFileSync(path, 'utf-8')) as FullEvalLog
-
-  return result
-}
-
 // Global concurrency limiter that can be shared across multiple repository evaluations
 let globalConcurrencyLimiter: ReturnType<typeof pLimit> | null = null
 
@@ -320,7 +361,7 @@ export function setGlobalConcurrencyLimit(limit: number) {
 export async function runGitEvals(
   evalDataPath: string,
   outputDir: string,
-  agentType: string = AGENT_TYPE,
+  agentType: string = 'base',
   limit?: number,
   logToStdout: boolean = false,
 ): Promise<FullEvalLog> {
@@ -561,12 +602,12 @@ function calculateOverallMetrics(evalRuns: EvalRunJudged[]) {
 if (require.main === module) {
   const args = process.argv.slice(2)
   console.info(
-    'Usage: bun run run-git-eval [eval-data-path] [output-dir] [agent-type]',
+    'Usage: bun run run-git-eval-sdk [eval-data-path] [output-dir] [agent-type]',
   )
 
   const evalDataPath = args[0] || 'git-evals/git-evals.json'
   const outputDir = args[1] || 'git-evals'
-  const agentType = args[2] || AGENT_TYPE
+  const agentType = args[2] || 'base'
 
   runGitEvals(evalDataPath, outputDir, agentType)
     .then(() => {
@@ -577,4 +618,4 @@ if (require.main === module) {
       console.error('Error running evals:', err)
       process.exit(1)
     })
-}
+}
\ No newline at end of file
diff --git a/evals/git-evals/run-single-eval-legacy.ts b/evals/git-evals/run-single-eval-legacy.ts
new file mode 100644
index 0000000000..d9feeea62e
--- /dev/null
+++ b/evals/git-evals/run-single-eval-legacy.ts
@@ -0,0 +1,238 @@
+#!/usr/bin/env bun
+
+import fs from 'fs'
+
+import { generateCompactId } from '@codebuff/common/util/string'
+import {
+  setProjectRoot,
+  setWorkingDirectory,
+} from '@codebuff/npm-app/project-files'
+import { recreateShell } from '@codebuff/npm-app/terminal/run-command'
+import { Command, Flags } from '@oclif/core'
+
+import { createFileReadingMock } from '../scaffolding'
+import { setupTestEnvironmentVariables } from '../test-setup'
+import { runSingleEval } from './run-git-evals'
+import { extractRepoNameFromUrl, setupTestRepo } from './setup-test-repo'
+
+import type { EvalCommit, EvalData, ModelConfig } from './types'
+
+/** oclif command: validates the commit selector flags and runs one eval. */
+class RunSingleEvalCommand extends Command {
+  static description = 'Run a single git evaluation task'
+
+  static examples = [
+    '$ bun run-single-eval --eval-file eval-codebuff.json --commit-index 0',
+    '$ bun run-single-eval --eval-file eval-manifold.json --commit-sha abc123',
+    '$ bun run-single-eval --eval-file eval-codebuff.json --commit-index 5 --output results.json',
+  ]
+
+  // Exactly one of --commit-index / --commit-sha must be given; run() checks it
+  static flags = {
+    'eval-file': Flags.string({
+      char: 'f',
+      description: 'Path to the eval JSON file (e.g., eval-codebuff.json)',
+      required: true,
+    }),
+    'commit-index': Flags.integer({
+      char: 'i',
+      description: 'Index of the commit to evaluate (0-based)',
+    }),
+    'commit-sha': Flags.string({
+      char: 's',
+      description: 'SHA of the specific commit to evaluate',
+    }),
+    output: Flags.string({
+      char: 'o',
+      description: 'Output file path for results (optional)',
+    }),
+    'model-config': Flags.string({
+      char: 'm',
+      description: 'JSON string with model configuration (optional)',
+      default: '{}',
+    }),
+    help: Flags.help({ char: 'h' }),
+  }
+
+  /** Parses the flags, checks the commit selector, then runs the eval task. */
+  async run(): Promise<void> {
+    const { flags } = await this.parse(RunSingleEvalCommand)
+    const hasIndex = flags['commit-index'] !== undefined
+    const hasSha = Boolean(flags['commit-sha'])
+
+    // Index 0 is a valid selection, so test for presence rather than truthiness
+    if (!hasIndex && !hasSha) {
+      this.error('Either --commit-index or --commit-sha must be provided')
+    }
+    if (hasIndex && hasSha) {
+      this.error('Cannot specify both --commit-index and --commit-sha')
+    }
+
+    await runSingleEvalTask(flags)
+  }
+}
+
+async function runSingleEvalTask(options: {
+  'eval-file': string
+  'commit-index'?: number
+  'commit-sha'?: string
+  output?: string
+  'model-config': string
+}): Promise<void> {
+  const {
+    'eval-file': evalFile,
+    'commit-index': commitIndex,
+    'commit-sha': commitSha,
+    output: outputFile,
+    'model-config': modelConfigStr,
+  } = options
+  // Runs one eval commit end to end, then exits: code 0 on success, 1 on failure
+  console.log('🚀 Starting single git eval...')
+  console.log(`Eval file: ${evalFile}`)
+
+  // Load eval data; fail with a clear message when the file does not exist
+  if (!fs.existsSync(evalFile)) {
+    throw new Error(`Eval file not found: ${evalFile}`)
+  }
+
+  const evalData = JSON.parse(fs.readFileSync(evalFile, 'utf-8')) as EvalData
+  console.log(`Repository: ${evalData.repoUrl}`)
+  console.log(`Total commits available: ${evalData.evalCommits.length}`)
+
+  // Select the commit: by SHA prefix when given, otherwise by 0-based index
+  let evalCommit: EvalCommit
+  if (commitSha) {
+    const found = evalData.evalCommits.find((commit) =>
+      commit.sha.startsWith(commitSha),
+    )
+    if (!found) {
+      throw new Error(`Commit with SHA ${commitSha} not found in eval data`)
+    }
+    evalCommit = found
+    console.log(`Selected commit by SHA: ${commitSha}`)
+  } else if (commitIndex !== undefined) {
+    if (commitIndex < 0 || commitIndex >= evalData.evalCommits.length) {
+      throw new Error(
+        `Commit index ${commitIndex} is out of range (0-${evalData.evalCommits.length - 1})`,
+      )
+    }
+    evalCommit = evalData.evalCommits[commitIndex]
+    console.log(`Selected commit by index: ${commitIndex}`)
+  } else {
+    throw new Error('No commit specified')
+  }
+
+  console.log(
+    `Commit: ${evalCommit.sha.slice(0, 8)} - ${evalCommit.spec.split('\n')[0]}`,
+  )
+
+  // Parse model config (this validates the JSON; the value is not used below)
+  let modelConfig: ModelConfig
+  try {
+    modelConfig = JSON.parse(modelConfigStr)
+  } catch (error) {
+    throw new Error(`Invalid model config JSON: ${error}`)
+  }
+
+  // Setup test environment variables before touching the repository
+  console.log('🔧 Setting up test environment...')
+  setupTestEnvironmentVariables()
+
+  // Setup test repository; its name comes from the eval file or the repo URL
+  const testRepoName =
+    evalData.testRepoName || extractRepoNameFromUrl(evalData.repoUrl)
+  console.log(`📁 Setting up test repository: ${testRepoName}`)
+
+  const projectPath = await setupTestRepo(
+    evalData.repoUrl,
+    testRepoName,
+    evalCommit.sha,
+  )
+  console.log(`Repository cloned to: ${projectPath}`)
+
+  // Setup project context: point every project-level helper at projectPath
+  setProjectRoot(projectPath)
+  createFileReadingMock(projectPath)
+  recreateShell(projectPath)
+  setWorkingDirectory(projectPath)
+
+  // Generate fresh session identifiers for this run
+  const clientSessionId = generateCompactId()
+  const fingerprintId = generateCompactId()
+
+  console.log('🤖 Running evaluation...')
+  console.log(
+    `Spec: ${evalCommit.spec.slice(0, 100)}${evalCommit.spec.length > 100 ? '...' : ''}`,
+  )
+
+  const startTime = Date.now()
+
+  try {
+    // Run the evaluation (no agentType is passed, so runSingleEval's default applies)
+    const result = await runSingleEval(
+      evalCommit,
+      projectPath,
+      clientSessionId,
+      fingerprintId,
+    )
+
+    const duration = Date.now() - startTime
+    console.log(`✅ Evaluation completed in ${(duration / 1000).toFixed(1)}s`)
+
+    // Display results: the recorded error if there is one, otherwise the scores
+    if (result.error) {
+      console.log(`❌ Error occurred: ${result.error}`)
+    } else {
+      console.log('📊 Results:')
+      if (result.judging_results) {
+        const metrics = result.judging_results.metrics
+        console.log(`  Overall Score: ${metrics.overallScore.toFixed(2)}/10`)
+        console.log(`  Completion: ${metrics.completionScore.toFixed(2)}/10`)
+        console.log(`  Efficiency: ${metrics.efficiencyScore.toFixed(2)}/10`)
+        console.log(`  Code Quality: ${metrics.codeQualityScore.toFixed(2)}/10`)
+
+        if (result.judging_results.strengths.length > 0) {
+          console.log('  Strengths:')
+          result.judging_results.strengths.forEach((strength) => {
+            console.log(`    • ${strength}`)
+          })
+        }
+
+        if (result.judging_results.weaknesses.length > 0) {
+          console.log('  Weaknesses:')
+          result.judging_results.weaknesses.forEach((weakness) => {
+            console.log(`    • ${weakness}`)
+          })
+        }
+      }
+
+      console.log(`  Files modified: ${result.fileStates.length}`)
+      console.log(`  Conversation turns: ${result.trace.length}`)
+    }
+
+    // Save results if output file specified (also done when result.error is set)
+    if (outputFile) {
+      fs.writeFileSync(outputFile, JSON.stringify(result, null, 2))
+      console.log(`💾 Results saved to: ${outputFile}`)
+    }
+
+    process.exit(0)
+  } catch (error) {
+    const duration = Date.now() - startTime
+    console.error(
+      `❌ Evaluation failed after ${(duration / 1000).toFixed(1)}s:`,
+      error,
+    )
+    process.exit(1)
+  }
+}
+
+// CLI entry point: run the command only when this file is executed directly
+if (require.main === module) {
+  RunSingleEvalCommand.run().catch((err) => {
+    console.error('Error running single eval:', err)
+    process.exit(1)
+  })
+}
+
+export { RunSingleEvalCommand, runSingleEvalTask }
diff --git a/evals/git-evals/run-single-eval-process-legacy.ts b/evals/git-evals/run-single-eval-process-legacy.ts
new file mode 100644
index 0000000000..25148d3e1b
--- /dev/null
+++ b/evals/git-evals/run-single-eval-process-legacy.ts
@@ -0,0 +1,80 @@
+import fs from 'fs'
+
+import {
+  setProjectRoot,
+  setWorkingDirectory,
+} from '@codebuff/npm-app/project-files'
+import { recreateShell } from '@codebuff/npm-app/terminal/run-command'
+
+import { createFileReadingMock } from '../scaffolding'
+import { setupTestEnvironmentVariables } from '../test-setup'
+import { runSingleEval } from './run-git-evals'
+
+import type { EvalCommit } from './types'
+
+/**
+ * Child-process entry point: runs the eval commit stored in the JSON file named
+ * by the first argument and reports the outcome to the parent over IPC.
+ */
+async function main() {
+  const [
+    evalCommitFilePath,
+    projectPath,
+    clientSessionId,
+    fingerprintId,
+    agentType,
+  ] = process.argv.slice(2)
+
+  if (
+    !evalCommitFilePath ||
+    !projectPath ||
+    !clientSessionId ||
+    !fingerprintId ||
+    !agentType
+  ) {
+    console.error('Missing required arguments for single eval process')
+    process.exit(1)
+  }
+
+  let evalCommit: EvalCommit
+  try {
+    evalCommit = JSON.parse(fs.readFileSync(evalCommitFilePath, 'utf-8'))
+  } catch (error) {
+    console.error('Failed to read evalCommit from file:', error)
+    process.exit(1)
+  }
+
+  try {
+    // Setup environment for this process
+    setProjectRoot(projectPath)
+    setupTestEnvironmentVariables()
+    createFileReadingMock(projectPath)
+    recreateShell(projectPath)
+    setWorkingDirectory(projectPath)
+
+    const result = await runSingleEval(
+      evalCommit,
+      projectPath,
+      clientSessionId,
+      fingerprintId,
+      agentType,
+    )
+    console.log('Final result:', { result })
+    process.send?.({ type: 'result', result })
+  } catch (error) {
+    // Log locally too: without an IPC channel (process.send is undefined when
+    // not forked) the failure would otherwise vanish without a trace.
+    console.error('Error in single eval process:', error)
+    const payload =
+      error instanceof Error
+        ? { message: error.message, stack: error.stack }
+        : { message: String(error) }
+    process.send?.({ type: 'error', error: payload })
+  } finally {
+    // Always exits 0 after 2s; failures travel in the 'error' message above.
+    // NOTE(review): the delay presumably lets pending IPC writes flush; confirm.
+    setTimeout(() => process.exit(0), 2000)
+  }
+}
+
+main()
diff --git a/evals/git-evals/run-single-eval-process.ts b/evals/git-evals/run-single-eval-process.ts
index 25148d3e1b..f7d373db64 100644
--- a/evals/git-evals/run-single-eval-process.ts
+++ b/evals/git-evals/run-single-eval-process.ts
@@ -1,80 +1,41 @@
-import fs from 'fs'
-
-import {
-  setProjectRoot,
-  setWorkingDirectory,
-} from '@codebuff/npm-app/project-files'
-import { recreateShell } from '@codebuff/npm-app/terminal/run-command'
+#!/usr/bin/env bun
 
-import { createFileReadingMock } from '../scaffolding'
-import { setupTestEnvironmentVariables } from '../test-setup'
+import fs from 'fs'
 import { runSingleEval } from './run-git-evals'
-
 import type { EvalCommit } from './types'
 
-async function main() {
-  const [
-    evalCommitFilePath,
-    projectPath,
-    clientSessionId,
-    fingerprintId,
-    agentType,
-  ] = process.argv.slice(2)
-
-  if (
-    !evalCommitFilePath ||
-    !projectPath ||
-    !clientSessionId ||
-    !fingerprintId ||
-    !agentType
-  ) {
-    console.error('Missing required arguments for single eval process')
-    process.exit(1)
-  }
-
-  let evalCommit: EvalCommit
-  try {
-    const evalCommitStr = fs.readFileSync(evalCommitFilePath, 'utf-8')
-    evalCommit = JSON.parse(evalCommitStr)
-  } catch (error) {
-    console.error('Failed to read evalCommit from file:', error)
-    process.exit(1)
-  }
+process.on('message', () => {})
 
+async function main() {
   try {
-    // Setup environment for this process
-    setProjectRoot(projectPath)
-    setupTestEnvironmentVariables()
-    createFileReadingMock(projectPath)
-    recreateShell(projectPath)
-    setWorkingDirectory(projectPath)
+    const [tempEvalCommitPath, projectPath, clientSessionId, fingerprintId, agentType] = process.argv.slice(2)
+    
+    if (!tempEvalCommitPath || !projectPath || !clientSessionId || !fingerprintId) {
+      throw new Error('Missing required arguments: tempEvalCommitPath, projectPath, clientSessionId, fingerprintId')
+    }
 
+    // Load eval commit from temp file
+    const evalCommit = JSON.parse(fs.readFileSync(tempEvalCommitPath, 'utf-8')) as EvalCommit
+    
     const result = await runSingleEval(
       evalCommit,
       projectPath,
       clientSessionId,
       fingerprintId,
-      agentType,
+      agentType || 'base'
     )
-    console.log('Final result:', { result })
+
+    // Send result back to parent process
     if (process.send) {
       process.send({ type: 'result', result })
     }
   } catch (error) {
+    console.error('Error in run-single-eval-process-sdk:', error)
     if (process.send) {
-      process.send({
-        type: 'error',
-        error:
-          error instanceof Error
-            ? { message: error.message, stack: error.stack }
-            : { message: String(error) },
-      })
+      process.send({ type: 'error', error: { message: (error as Error).message, stack: (error as Error).stack } })
     }
-  } finally {
-    setTimeout(() => {
-      process.exit(0)
-    }, 2000)
+    process.exit(1)
   }
 }
 
-main()
+main()
\ No newline at end of file
diff --git a/evals/git-evals/run-single-eval-simple-sdk.ts b/evals/git-evals/run-single-eval-simple-sdk.ts
new file mode 100755
index 0000000000..5345291da7
--- /dev/null
+++ b/evals/git-evals/run-single-eval-simple-sdk.ts
@@ -0,0 +1,256 @@
+#!/usr/bin/env bun
+
+import fs from 'fs'
+import { execFileSync, execSync } from 'child_process'
+import path from 'path'
+
+import { generateCompactId } from '@codebuff/common/util/string'
+import { CodebuffClient } from '../../sdk/src/client'
+import { Command, Flags } from '@oclif/core'
+
+import { extractRepoNameFromUrl, setupTestRepo } from './setup-test-repo'
+import { resetRepoToCommit } from '../scaffolding'
+
+import type { EvalCommit, EvalData, FileState } from './types'
+
+/** oclif command: runs a single git-eval commit through the Codebuff SDK (simplified flow). */
+class RunSingleEvalSimpleSDKCommand extends Command {
+  static description = 'Run a single git evaluation task using the Codebuff SDK (simplified version)'
+
+  static examples = [
+    '$ bun run-single-eval-simple-sdk --eval-file eval-codebuff.json --commit-index 0',
+    '$ bun run-single-eval-simple-sdk --eval-file eval-manifold.json --commit-sha abc123',
+  ]
+
+  static flags = {
+    'eval-file': Flags.string({
+      char: 'f',
+      description: 'Path to the eval JSON file (e.g., eval-codebuff.json)',
+      required: true,
+    }),
+    'commit-index': Flags.integer({
+      char: 'i',
+      description: 'Index of the commit to evaluate (0-based)',
+    }),
+    'commit-sha': Flags.string({
+      char: 's',
+      description: 'SHA of the specific commit to evaluate',
+    }),
+    output: Flags.string({
+      char: 'o',
+      description: 'Output file path for results (optional)',
+    }),
+    help: Flags.help({ char: 'h' }),
+  }
+
+  /** Checks that exactly one commit selector was given, then runs the eval task. */
+  async run(): Promise<void> {
+    const { flags: parsed } = await this.parse(RunSingleEvalSimpleSDKCommand)
+    const index = parsed['commit-index']
+    const sha = parsed['commit-sha']
+
+    // Index 0 is falsy but valid, hence the explicit `!== 0` comparison
+    const indexMissing = !index && index !== 0
+    if (indexMissing && !sha) {
+      this.error('Either --commit-index or --commit-sha must be provided')
+    }
+    if (index !== undefined && sha) {
+      this.error('Cannot specify both --commit-index and --commit-sha')
+    }
+
+    await runSingleEvalTaskSimpleSDK(parsed)
+  }
+}
+
+/**
+ * Runs one eval commit through the Codebuff SDK and captures before/after
+ * contents of every file the agent wrote. Throws if the eval file or commit
+ * cannot be resolved; otherwise exits the process (1 on run error, else 0).
+ */
+async function runSingleEvalTaskSimpleSDK(options: {
+  'eval-file': string
+  'commit-index'?: number
+  'commit-sha'?: string
+  output?: string
+}): Promise<void> {
+  const {
+    'eval-file': evalFile,
+    'commit-index': commitIndex,
+    'commit-sha': commitSha,
+    output: outputFile,
+  } = options
+
+  console.log('🚀 Starting single git eval (Simple SDK mode)...')
+  console.log(`Eval file: ${evalFile}`)
+
+  // Load eval data
+  if (!fs.existsSync(evalFile)) {
+    throw new Error(`Eval file not found: ${evalFile}`)
+  }
+
+  const evalData = JSON.parse(fs.readFileSync(evalFile, 'utf-8')) as EvalData
+  console.log(`Repository: ${evalData.repoUrl}`)
+  console.log(`Total commits available: ${evalData.evalCommits.length}`)
+
+  // Find the specific commit to evaluate
+  let evalCommit: EvalCommit
+  if (commitSha) {
+    const found = evalData.evalCommits.find((commit) =>
+      commit.sha.startsWith(commitSha),
+    )
+    if (!found) {
+      throw new Error(`Commit with SHA ${commitSha} not found in eval data`)
+    }
+    evalCommit = found
+    console.log(`Selected commit by SHA: ${commitSha}`)
+  } else if (commitIndex !== undefined) {
+    if (commitIndex < 0 || commitIndex >= evalData.evalCommits.length) {
+      throw new Error(
+        `Commit index ${commitIndex} is out of range (0-${evalData.evalCommits.length - 1})`,
+      )
+    }
+    evalCommit = evalData.evalCommits[commitIndex]
+    console.log(`Selected commit by index: ${commitIndex}`)
+  } else {
+    throw new Error('No commit specified')
+  }
+
+  console.log(
+    `Commit: ${evalCommit.sha.slice(0, 8)} - ${evalCommit.spec.split('\n')[0]}`,
+  )
+
+  // Setup test repository
+  const testRepoName =
+    evalData.testRepoName || extractRepoNameFromUrl(evalData.repoUrl)
+  console.log(`📁 Setting up test repository: ${testRepoName}`)
+
+  const projectPath = await setupTestRepo(
+    evalData.repoUrl,
+    testRepoName,
+    evalCommit.sha,
+  )
+  console.log(`Repository cloned to: ${projectPath}`)
+
+  console.log('🤖 Running evaluation with SDK...')
+  console.log(
+    `Spec: ${evalCommit.spec.slice(0, 100)}${evalCommit.spec.length > 100 ? '...' : ''}`,
+  )
+
+  const startTime = Date.now()
+  let error: string | undefined
+  let fileStates: FileState[] = []
+  let client: CodebuffClient | undefined // hoisted so `finally` can close it on every exit path
+
+  try {
+    // Reset to the commit before the target commit
+    resetRepoToCommit(projectPath, `${evalCommit.sha}^`)
+
+    // Initialize SDK client
+    client = new CodebuffClient({
+      cwd: projectPath,
+      onError: (error) => {
+        console.error('SDK error:', error.message)
+      },
+    })
+
+    // Run CodeBuff directly with the specification
+    console.log('Running Codebuff with the specification...')
+    const result = await client.run({
+      agent: 'base',
+      prompt: evalCommit.spec,
+    })
+
+    console.log('SDK run completed successfully')
+    console.log(`Tool results: ${result.toolResults?.length || 0}`)
+
+    // Extract file changes from tool results
+    const changedFiles = new Set<string>()
+    for (const toolResult of result.toolResults ?? []) {
+      if (toolResult.toolCall &&
+          (toolResult.toolCall.toolName === 'write_file' || toolResult.toolCall.toolName === 'str_replace') &&
+          'path' in toolResult.toolCall.input) {
+        changedFiles.add(toolResult.toolCall.input.path as string)
+      }
+    }
+
+    // Capture file states
+    fileStates = Array.from(changedFiles).map(filePath => {
+      // Capture "after" state
+      const fullPath = path.join(projectPath, filePath)
+      let postContent: string
+      try {
+        postContent = fs.existsSync(fullPath)
+          ? fs.readFileSync(fullPath, 'utf-8')
+          : '[FILE_NOT_FOUND_POST_RUN]'
+      } catch (e) {
+        console.error(`Error reading file ${fullPath} for after state:`, e)
+        postContent = '[ERROR_READING_AFTER_STATE]'
+      }
+
+      // Capture "before" state; argv form keeps quotes/`$` in agent-supplied paths away from a shell
+      let preContent: string
+      try {
+        preContent = execFileSync('git', ['show', `${evalCommit.sha}^:${filePath}`], {
+          cwd: projectPath,
+          stdio: ['ignore', 'pipe', 'ignore'],
+        }).toString()
+      } catch (e) {
+        preContent = '[FILE_DID_NOT_EXIST_PRIOR_TO_CODEBUFF_CHANGES]'
+      }
+
+      return { path: filePath, preContent, postContent }
+    })
+  } catch (e) {
+    console.error('Error in evaluation:', e)
+    error = e instanceof Error ? `${e.message}\n${e.stack}` : `Unknown error: ${String(e)}`
+  } finally {
+    // Close connection safely, including when the run above threw
+    try {
+      client?.closeConnection()
+    } catch (closeError) {
+      console.debug('Note: SDK client close error (likely not connected):', closeError)
+    }
+  }
+
+  const duration = Date.now() - startTime
+  console.log(`✅ Evaluation completed in ${(duration / 1000).toFixed(1)}s`)
+
+  // Create simple result structure (without judging for now)
+  const result = {
+    eval_commit: evalCommit,
+    error,
+    fileStates,
+    durationMs: duration,
+    simplified: true, // Flag to indicate this is the simplified SDK version
+  }
+
+  // Display results
+  if (error) {
+    console.log(`❌ Error occurred: ${error}`)
+  } else {
+    console.log('📊 Results:')
+    console.log(`  Files modified: ${fileStates.length}`)
+    if (fileStates.length > 0) {
+      console.log('  Modified files:')
+      for (const file of fileStates) console.log(`    • ${file.path}`)
+    }
+  }
+
+  // Save results if output file specified
+  if (outputFile) {
+    fs.writeFileSync(outputFile, JSON.stringify(result, null, 2))
+    console.log(`💾 Results saved to: ${outputFile}`)
+  }
+
+  process.exit(error ? 1 : 0)
+}
+
+// Run the oclif command only when this file is executed directly, not when it is imported
+if (require.main === module) {
+  RunSingleEvalSimpleSDKCommand.run().catch((err) => {
+    console.error('Error running simple SDK eval:', err)
+    process.exit(1)
+  })
+}
+
+export { RunSingleEvalSimpleSDKCommand, runSingleEvalTaskSimpleSDK }
\ No newline at end of file
diff --git a/evals/git-evals/run-single-eval.ts b/evals/git-evals/run-single-eval.ts
old mode 100644
new mode 100755
index d9feeea62e..a9811e0ff5
--- a/evals/git-evals/run-single-eval.ts
+++ b/evals/git-evals/run-single-eval.ts
@@ -3,22 +3,16 @@
 import fs from 'fs'
 
 import { generateCompactId } from '@codebuff/common/util/string'
-import {
-  setProjectRoot,
-  setWorkingDirectory,
-} from '@codebuff/npm-app/project-files'
-import { recreateShell } from '@codebuff/npm-app/terminal/run-command'
+import { CodebuffClient } from '../../sdk/src/client'
 import { Command, Flags } from '@oclif/core'
 
-import { createFileReadingMock } from '../scaffolding'
-import { setupTestEnvironmentVariables } from '../test-setup'
-import { runSingleEval } from './run-git-evals'
 import { extractRepoNameFromUrl, setupTestRepo } from './setup-test-repo'
+import { runSingleEval } from './run-git-evals'
 
 import type { EvalCommit, EvalData, ModelConfig } from './types'
 
 class RunSingleEvalCommand extends Command {
-  static description = 'Run a single git evaluation task'
+  static description = 'Run a single git evaluation task using the Codebuff SDK'
 
   static examples = [
     '$ bun run-single-eval --eval-file eval-codebuff.json --commit-index 0',
@@ -134,10 +128,6 @@ async function runSingleEvalTask(options: {
     throw new Error(`Invalid model config JSON: ${error}`)
   }
 
-  // Setup test environment
-  console.log('🔧 Setting up test environment...')
-  setupTestEnvironmentVariables()
-
   // Setup test repository
   const testRepoName =
     evalData.testRepoName || extractRepoNameFromUrl(evalData.repoUrl)
@@ -150,12 +140,6 @@ async function runSingleEvalTask(options: {
   )
   console.log(`Repository cloned to: ${projectPath}`)
 
-  // Setup project context
-  setProjectRoot(projectPath)
-  createFileReadingMock(projectPath)
-  recreateShell(projectPath)
-  setWorkingDirectory(projectPath)
-
   // Generate session identifiers
   const clientSessionId = generateCompactId()
   const fingerprintId = generateCompactId()
@@ -168,7 +152,7 @@ async function runSingleEvalTask(options: {
   const startTime = Date.now()
 
   try {
-    // Run the evaluation
+    // Run the evaluation using SDK
     const result = await runSingleEval(
       evalCommit,
       projectPath,
@@ -235,4 +219,4 @@ if (require.main === module) {
   })
 }
 
-export { RunSingleEvalCommand, runSingleEvalTask }
+export { RunSingleEvalCommand, runSingleEvalTask }
\ No newline at end of file
diff --git a/evals/scripts/run-remote-parameterized.sh b/evals/scripts/run-remote-parameterized.sh
new file mode 100755
index 0000000000..78fb9e6b36
--- /dev/null
+++ b/evals/scripts/run-remote-parameterized.sh
@@ -0,0 +1,107 @@
+#!/bin/bash
+set -euo pipefail
+
+# Runs one remote evaluation end to end: starts the Docker db + backend, waits
+# for the backend health check, sets up auth, runs the SDK eval, tears down.
+# Usage: run-remote-parameterized.sh [MODE] [EVAL_FILE] [COMMIT_INDEX]
+
+# Logging function with timestamps
+log() {
+  echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"
+}
+
+# Parameters
+MODE="${1:-seed}"              # 'seed' (Drizzle) or 'bypass'
+EVAL_FILE="${2:-eval-codebuff.json}"    # eval file name
+COMMIT_INDEX="${3:-0}"         # commit index
+
+SCRIPT_DIR="$(dirname "$0")"
+COMPOSE_FILE="$SCRIPT_DIR/../docker-compose.evals.yml"
+
+log "🚀 Remote Evaluation Infrastructure Starting (SDK Mode)"
+log "📋 Parameters:"
+log "  Mode: $MODE"
+log "  Eval File: $EVAL_FILE"
+log "  Commit Index: $COMMIT_INDEX"
+log "  Working Directory: $(pwd)"
+log "  Script Directory: $SCRIPT_DIR"
+
+export CODEBUFF_WEBSOCKET_URL="ws://127.0.0.1:4242/ws"
+export CODEBUFF_SKIP_BINARY_CHECK=1
+
+# Bypass auth: generate the token BEFORE `docker compose up`. The backend
+# container is given CODEBUFF_TEST_AUTH_TOKEN from this environment when it is
+# created, so a token exported afterwards would never reach it.
+if [ "$MODE" = "bypass" ]; then
+  log "🔐 Setting up bypass authentication..."
+  # Assign and export separately so a failing openssl aborts under `set -e`
+  CODEBUFF_TEST_AUTH_TOKEN="$(openssl rand -hex 16)"
+  export CODEBUFF_TEST_AUTH_TOKEN
+  export CODEBUFF_API_KEY="$CODEBUFF_TEST_AUTH_TOKEN"
+  log "  Generated test auth token: ${CODEBUFF_TEST_AUTH_TOKEN:0:8}..."
+fi
+
+# Start services
+log "📦 Starting Docker services..."
+log "  Compose file: $COMPOSE_FILE"
+docker compose -f "$COMPOSE_FILE" up -d --build db backend
+
+# Wait for backend to be ready
+log "⏳ Waiting for backend to be ready..."
+START_TIME=$(date +%s)
+"$SCRIPT_DIR/wait-for-healthz.sh" "http://127.0.0.1:4242/healthz" 90 || {
+  log '❌ Health check failed; dumping logs...'
+  log '📋 Backend logs:'
+  docker compose -f "$COMPOSE_FILE" logs backend --tail=200 || true
+  log '📋 Database logs:'
+  docker compose -f "$COMPOSE_FILE" logs db --tail=50 || true
+  exit 1
+}
+READY_TIME=$(date +%s)
+log "✅ Backend ready in $((READY_TIME - START_TIME)) seconds"
+
+# Seed auth runs the seeder container against the started database, so it stays after the health check
+if [ "$MODE" != "bypass" ]; then
+  log "🌱 Setting up database seed authentication..."
+  log "  Running seeder container..."
+  SEED_START=$(date +%s)
+  KEY_LINE=$(docker compose -f "$COMPOSE_FILE" run --rm seeder | tail -n1)
+  export CODEBUFF_API_KEY="${KEY_LINE#CODEBUFF_API_KEY=}"
+  SEED_END=$(date +%s)
+  log "  Seeding completed in $((SEED_END - SEED_START)) seconds"
+  log "  Extracted API key: ${CODEBUFF_API_KEY:0:8}..."
+fi
+
+# Run evaluation (SDK mode only)
+log "🤖 Starting evaluation (SDK mode)..."
+log "  File: evals/git-evals/$EVAL_FILE"
+log "  Commit Index: $COMMIT_INDEX"
+log "  Using: CodebuffClient from SDK"
+log "  Environment: CODEBUFF_WEBSOCKET_URL=$CODEBUFF_WEBSOCKET_URL"
+log "  This may take 10-30 minutes depending on task complexity..."
+
+EVAL_START=$(date +%s)
+# Under `set -e` a failing eval would abort the script right here and skip the
+# log dump and cleanup below, so record the exit code with `||` instead of $?.
+EVAL_EXIT_CODE=0
+bun evals/git-evals/run-single-eval.ts \
+  --eval-file="evals/git-evals/$EVAL_FILE" \
+  --commit-index="$COMMIT_INDEX" || EVAL_EXIT_CODE=$?
+EVAL_END=$(date +%s)
+EVAL_DURATION=$((EVAL_END - EVAL_START))
+
+if [ $EVAL_EXIT_CODE -eq 0 ]; then
+  log "✅ Evaluation completed successfully in ${EVAL_DURATION} seconds!"
+else
+  log "❌ Evaluation failed with exit code $EVAL_EXIT_CODE after ${EVAL_DURATION} seconds"
+  log "📋 Final backend logs:"
+  docker compose -f "$COMPOSE_FILE" logs backend --tail=100 || true
+fi
+
+# Cleanup
+log "🧹 Cleaning up Docker containers..."
+docker compose -f "$COMPOSE_FILE" down -v
+
+TOTAL_DURATION=$((EVAL_END - START_TIME))
+log "🏁 Remote evaluation finished in ${TOTAL_DURATION} total seconds (exit code: $EVAL_EXIT_CODE)"
+exit $EVAL_EXIT_CODE
\ No newline at end of file
diff --git a/evals/scripts/run-remote.sh b/evals/scripts/run-remote.sh
new file mode 100755
index 0000000000..6954d9559f
--- /dev/null
+++ b/evals/scripts/run-remote.sh
@@ -0,0 +1,84 @@
+#!/bin/bash
+set -euo pipefail
+
+# Runs the default remote evaluation (eval-codebuff.json, commit index 0):
+# starts the Docker db + backend, waits for the backend health check, sets up
+# auth, runs the SDK eval, then tears the containers down.
+# Usage: run-remote.sh [MODE]   (MODE is 'seed' or 'bypass'; default 'seed')
+
+# Logging function
+log() {
+  echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"
+}
+
+MODE="${1:-seed}"  # 'seed' (Drizzle) or 'bypass'
+SCRIPT_DIR="$(dirname "$0")"
+COMPOSE_FILE="$SCRIPT_DIR/../docker-compose.evals.yml"
+
+log "🚀 Starting remote evaluation infrastructure (SDK mode)"
+log "Mode: $MODE"
+
+export CODEBUFF_WEBSOCKET_URL="ws://127.0.0.1:4242/ws"
+export CODEBUFF_SKIP_BINARY_CHECK=1
+log "Environment variables set:"
+log "  CODEBUFF_WEBSOCKET_URL=$CODEBUFF_WEBSOCKET_URL"
+log "  CODEBUFF_SKIP_BINARY_CHECK=$CODEBUFF_SKIP_BINARY_CHECK"
+
+# Bypass auth: generate the token BEFORE `docker compose up`. The backend
+# container is given CODEBUFF_TEST_AUTH_TOKEN from this environment when it is
+# created, so a token exported afterwards would never reach it.
+if [ "$MODE" = "bypass" ]; then
+  log "🔐 Setting up bypass authentication..."
+  # Assign and export separately so a failing openssl aborts under `set -e`
+  CODEBUFF_TEST_AUTH_TOKEN="$(openssl rand -hex 16)"
+  export CODEBUFF_TEST_AUTH_TOKEN
+  export CODEBUFF_API_KEY="$CODEBUFF_TEST_AUTH_TOKEN"
+  log "  Generated test auth token: ${CODEBUFF_TEST_AUTH_TOKEN:0:8}..."
+fi
+
+# Start services
+log "📦 Starting Docker services (db + backend)..."
+docker compose -f "$COMPOSE_FILE" up -d --build db backend
+
+log "⏳ Waiting for backend health check..."
+"$SCRIPT_DIR/wait-for-healthz.sh" "http://127.0.0.1:4242/healthz" 90 || {
+  log '❌ Health check failed; dumping backend logs...'
+  docker compose -f "$COMPOSE_FILE" logs backend --tail=200 || true
+  log '❌ Dumping database logs...'
+  docker compose -f "$COMPOSE_FILE" logs db --tail=50 || true
+  exit 1
+}
+
+# Seed auth runs the seeder container against the started database, so it stays after the health check
+if [ "$MODE" != "bypass" ]; then
+  log "🌱 Setting up database seed authentication..."
+  log "  Running seeder container..."
+  KEY_LINE=$(docker compose -f "$COMPOSE_FILE" run --rm seeder | tail -n1)
+  export CODEBUFF_API_KEY="${KEY_LINE#CODEBUFF_API_KEY=}"
+  log "  Extracted API key: ${CODEBUFF_API_KEY:0:8}..."
+fi
+
+log "🤖 Starting evaluation (SDK mode)..."
+log "  Eval file: evals/git-evals/eval-codebuff.json"
+log "  Commit index: 0"
+log "  Using: CodebuffClient from SDK"
+log "  This may take 10-30 minutes depending on task complexity..."
+
+# Under `set -e` a failing eval would abort the script right here and skip the
+# cleanup below, so record the exit code with `||` instead of reading $? later.
+EVAL_EXIT_CODE=0
+bun evals/git-evals/run-single-eval.ts \
+  --eval-file="evals/git-evals/eval-codebuff.json" \
+  --commit-index=0 || EVAL_EXIT_CODE=$?
+
+if [ $EVAL_EXIT_CODE -eq 0 ]; then
+  log "✅ Evaluation completed successfully!"
+else
+  log "❌ Evaluation failed with exit code $EVAL_EXIT_CODE"
+fi
+
+log "🧹 Cleaning up Docker containers..."
+docker compose -f "$COMPOSE_FILE" down -v
+
+log "🏁 Remote evaluation finished (exit code: $EVAL_EXIT_CODE)"
+exit $EVAL_EXIT_CODE
\ No newline at end of file
diff --git a/evals/scripts/wait-for-healthz.sh b/evals/scripts/wait-for-healthz.sh
new file mode 100755
index 0000000000..6538dcf4a8
--- /dev/null
+++ b/evals/scripts/wait-for-healthz.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+set -euo pipefail
+
+# Polls URL with curl, sleeping one second between attempts, until it succeeds.
+# Usage: wait-for-healthz.sh URL [TIMEOUT]   (TIMEOUT = max attempts, default 60)
+# Exits 0 once the endpoint answers, 1 if it never does, 2 when URL is missing.
+
+# Logging function
+log() {
+  echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"
+}
+
+if [ $# -lt 1 ]; then
+  echo "Usage: $0 URL [TIMEOUT]" >&2
+  exit 2
+fi
+
+URL="$1"; TIMEOUT="${2:-60}"
+log "🏥 Health check starting"
+log "  URL: $URL"
+log "  Timeout: ${TIMEOUT}s"
+
+for i in $(seq 1 "$TIMEOUT"); do
+  # --max-time bounds each probe so a backend that accepts the connection but
+  # never answers cannot stall the loop indefinitely
+  if curl -fsS --max-time 5 "$URL" >/dev/null 2>&1; then
+    log "✅ Backend is healthy and ready!"
+    exit 0
+  fi
+
+  # Log the first 5 attempts, then every 10th, to avoid spam
+  if [ $((i % 10)) -eq 0 ] || [ "$i" -le 5 ]; then
+    log "⏳ Waiting for backend... (${i}s / ${TIMEOUT}s)"
+  fi
+
+  sleep 1
+done
+
+log "❌ Backend health check failed after $TIMEOUT seconds" >&2
+log "🔍 Final health check attempt..."
+RESPONSE=$(curl -s --max-time 5 -w "HTTP_CODE:%{http_code}" "$URL" 2>/dev/null || echo "CURL_FAILED")
+log "  Response: $RESPONSE"
+exit 1
\ No newline at end of file
diff --git a/evals/seeds/seed-evals.ts b/evals/seeds/seed-evals.ts
new file mode 100644
index 0000000000..54d92d057f
--- /dev/null
+++ b/evals/seeds/seed-evals.ts
@@ -0,0 +1,85 @@
+import 'dotenv/config'
+import { drizzle } from 'drizzle-orm/node-postgres'
+import { Client } from 'pg'
+import crypto from 'crypto'
+import {
+  user,
+  session,
+} from '../../common/src/db/schema'
+
+// Timestamped logger; writes to stderr so stdout carries only the CODEBUFF_API_KEY line
+function log(message: string) {
+  const prefix = `[${new Date().toISOString()}]`
+  console.error(`${prefix} ${message}`)
+}
+
+/**
+ * Seeds a test user plus a fresh 24-hour session token, then prints
+ * `CODEBUFF_API_KEY=<token>` on stdout for the runner script to capture.
+ */
+async function main() {
+  log('🌱 Starting database seeding for evaluations')
+
+  const DATABASE_URL = process.env.DATABASE_URL
+  if (!DATABASE_URL) throw new Error('DATABASE_URL environment variable is required')
+  log(`📊 Connecting to database: ${DATABASE_URL.replace(/\/\/.*@/, '//***@')}`)
+  const client = new Client({ connectionString: DATABASE_URL })
+  const startTime = Date.now()
+
+  try {
+    await client.connect()
+    log('✅ Database connection established')
+    const db = drizzle(client)
+
+    // Fixed user ID/email keep the user insert idempotent; the token is new on every run
+    const userId = 'test-user'
+    const email = 'evals@test.local'
+    const token = crypto.randomUUID()
+
+    log('👤 Creating test user...')
+    log(`  User ID: ${userId}`)
+    log(`  Email: ${email}`)
+
+    // Insert the user; an existing row is left untouched
+    await db
+      .insert(user)
+      .values({
+        id: userId,
+        email,
+        name: 'Test User',
+        created_at: new Date(),
+      })
+      .onConflictDoNothing()
+    log('✅ Test user created/updated')
+
+    log('🔑 Creating session token...')
+    log(`  Token: ${token.substring(0, 8)}...`)
+    // Insert the session row whose token is handed out as the API key
+    await db
+      .insert(session)
+      .values({
+        sessionToken: token,
+        userId,
+        expires: new Date(Date.now() + 24 * 60 * 60 * 1000), // 24 hours
+      })
+      .onConflictDoNothing()
+
+    log('✅ Session token created/updated')
+    const duration = Date.now() - startTime
+    log(`🏁 Database seeding completed in ${duration}ms`)
+
+    // Output the API key for the runner script to capture
+    console.log(`CODEBUFF_API_KEY=${token}`)
+  } catch (error) {
+    log(`❌ Database seeding failed: ${error}`)
+    throw error
+  } finally {
+    await client.end()
+    log('🔌 Database connection closed')
+  }
+}
+
+main().catch((err) => {
+  console.error(err)
+  process.exit(1)
+})
\ No newline at end of file
diff --git a/remote-eval-infra-plan.md b/remote-eval-infra-plan.md
new file mode 100644
index 0000000000..283c2bd33d
--- /dev/null
+++ b/remote-eval-infra-plan.md
@@ -0,0 +1,386 @@
+Briefing (Read First)
+- Monorepo + Bun basics
+  - Monorepo with TypeScript + Bun everywhere. Local runs often use `.bin/bun`, which can inject dev env (`NEXT_PUBLIC_CB_ENVIRONMENT=dev`). Prefer plain `bun` in CI to avoid unintended dev defaults.
+  - The SDK runner is at `evals/git-evals/run-single-eval.ts` and imports from `../../sdk/src`. It streams conversation/events to console.
+
+- SDK connectivity + auth
+  - By default the SDK requires the `codebuff` CLI in PATH (the constructor checks with `which`/`where`). Install it with `npm i -g codebuff` OR set the skip flag below.
+  - ✅ IMPLEMENTED: Optional skip flag: when `CODEBUFF_SKIP_BINARY_CHECK=1` is set, the SDK constructor skips the CLI presence check.
+  - ✅ IMPLEMENTED: Default WS URL depends on env. In CI/remote, explicitly set `CODEBUFF_WEBSOCKET_URL=ws://127.0.0.1:4242/ws` so the SDK connects to your ephemeral backend (and not prod/dev defaults).
+  - ✅ IMPLEMENTED: Provide an API key as `CODEBUFF_API_KEY`. In seed mode, this comes from Drizzle seed output. In bypass mode, reuse `CODEBUFF_TEST_AUTH_TOKEN`.
+  - ✅ NEW: SDK-based evaluation scripts created: `run-single-eval-sdk.ts`, `run-single-eval-simple-sdk.ts`, and `run-git-evals-sdk.ts`
+
+- Docker containment (backend stays Docker‑agnostic)
+  - All infra (Compose, Dockerfile, scripts, seeding) lives under `evals/`. The backend does not reference Docker.
+  - Compose binds backend to loopback only (`127.0.0.1:4242`), so nothing is publicly exposed in CI.
+
+- Readiness + flake control
+  - Don’t just wait for HTTP bind—wait for `/healthz` to return 200 AND ensure WS is accepting connections. Use a curl loop with a strict timeout (60–90s) or enhance `/healthz` to signal WS readiness.
+  - Stream backend logs on failure to diagnose quickly.
+
+- Seeding strategy: Drizzle (preferred)
+  - Seed lives in `evals/seeds/seed-evals.ts` and imports tables from `common/src/db/schema`.
+  - Use deterministic IDs + `onConflictDoNothing()` for idempotency.
+  - Print exactly one line on stdout: `CODEBUFF_API_KEY=...`. The runner parses the last stdout line, so send all other logging to stderr.
+  - Align with the backend’s token model: confirm whether API tokens live in `session` or a dedicated `api_keys` table, and include required fields (e.g., `expiresAt`, `createdAt`).
+
+- Test‑only auth bypass (fastest fallback)
+  - If `CODEBUFF_TEST_AUTH_TOKEN` is set AND `NODE_ENV=test`, accept that token in WS auth and attach a minimal user context. Skip DB lookups; great for smoke tests.
+
+- CI specifics
+  - Use Docker Compose under `evals/` for parity and a one‑liner.
+  - Install `codebuff` globally in the runner (or set `CODEBUFF_SKIP_BINARY_CHECK=1`, as the runner scripts do).
+  - Set `CODEBUFF_WEBSOCKET_URL` + `CODEBUFF_API_KEY` explicitly; mask secrets; tear down with `docker compose down -v`.
+  - Concurrency: separate Compose project names or only use internal networking.
+
+- Common pitfalls
+  - `.bin/bun` locally can set dev defaults and point SDK at localhost. In CI, always set `CODEBUFF_WEBSOCKET_URL`.
+  - `/healthz` returning 200 before WS is ready → flakiness. Gate readiness on WS availability.
+  - Seed failures: wrong import path or missing required columns. Inspect `common/src/db/schema` and insert minimum viable fields.
+  - Token mismatch: ensure seeded token matches WS auth expectations. If unsure, use bypass first.
+  - No `codebuff` in PATH → SDK throws. Install it or set `CODEBUFF_SKIP_BINARY_CHECK=1`.
+
+- Quick execution checklist
+  - ✅ IMPLEMENTED: `npm i -g codebuff` (or set `CODEBUFF_SKIP_BINARY_CHECK=1` to skip the binary check)
+  - ✅ IMPLEMENTED: `docker compose -f evals/docker-compose.evals.yml up -d --build db backend`
+  - ✅ IMPLEMENTED: Wait for `http://127.0.0.1:4242/healthz` OK (WS-ready semantics)
+  - ✅ IMPLEMENTED: Seed (Drizzle) → capture `CODEBUFF_API_KEY` OR set bypass envs
+  - ✅ UPDATED: SDK-only: `bash evals/scripts/run-remote-parameterized.sh bypass eval-codebuff.json 0`
+  - ✅ IMPLEMENTED: `docker compose -f evals/docker-compose.evals.yml down -v`
+
+## SDK-Only Evaluation Infrastructure
+
+The evaluation infrastructure now uses the public Codebuff SDK exclusively:
+
+### SDK Mode (Only Option)
+- Uses public `CodebuffClient` from `@codebuff/sdk`
+- Clean separation from internal backend APIs
+- Reliable and consistent for CI/CD environments
+- Usage: `bash evals/scripts/run-remote-parameterized.sh bypass eval-codebuff.json 0`
+
+### Available Scripts:
+- `evals/git-evals/run-single-eval.ts` - Main SDK evaluation command
+- `evals/git-evals/run-git-evals.ts` - Batch SDK evaluations  
+- `evals/git-evals/run-single-eval-simple-sdk.ts` - Simplified SDK evaluation (direct execution)
+- `evals/scripts/run-remote.sh` - Basic remote evaluation script
+- `evals/scripts/run-remote-parameterized.sh` - Parameterized remote evaluation script
+
+### Legacy Files (Preserved for Reference):
+- `evals/git-evals/run-single-eval-legacy.ts` - Original internal API version
+- `evals/git-evals/run-git-evals-legacy.ts` - Original internal API version
+- `evals/git-evals/run-single-eval-process-legacy.ts` - Original process wrapper
+
+### GitHub Actions Support:
+- Simplified workflow using SDK-only approach
+- No mode selection needed (always uses SDK)
+- Matrix jobs use SDK consistently
+
+---
+
+New Tweaks and TODOs (from review)
+- Implement SDK skip flag (env guard) and WS URL override:
+```
+// sdk/src/client.ts (skip flag pseudo-patch)
+const SKIP = process.env.CODEBUFF_SKIP_BINARY_CHECK === '1'
+if (!SKIP) {
+  const isWindows = process.platform === 'win32'
+  if (
+    execFileSync(isWindows ? 'where' : 'which', [CODEBUFF_BINARY])
+      .toString()
+      .trim() === ''
+  ) {
+    throw new Error('Missing codebuff binary ...')
+  }
+}
+```
+```
+// sdk/src/constants.ts (WS override pseudo-patch)
+const WS_FROM_ENV = process.env.CODEBUFF_WEBSOCKET_URL || process.env.CB_WS_URL
+export const WEBSOCKET_URL = WS_FROM_ENV ?? (
+  IS_PROD ? 'wss://manicode-backend.onrender.com/ws' : 'ws://localhost:4242/ws'
+)
+```
+
+- Health readiness contract: ensure /healthz implies WS is ready. If needed, add a WS-ready flag in server startup before returning 200:
+```
+// backend readiness (pseudo-code)
+let wsReady = false
+startWebsocketServer(() => { wsReady = true })
+app.get('/healthz', (req, res) => {
+  return wsReady ? res.status(200).send('ok') : res.status(503).send('starting')
+})
+```
+
+- Backend start command: confirm the backend has a script that starts the WS server on 4242; otherwise define one and call it from the Dockerfile:
+```
+// package.json (backend) pseudo-snippet
+{
+  "scripts": {
+    "start:ws": "bun run dev" // or explicit entry that starts WS on 4242
+  }
+}
+```
+```
+# evals/backend.Dockerfile (if needed)
+CMD ["bun", "--cwd", "backend", "start:ws"]
+```
+
+- Drizzle seed alignment: verify exact token table/columns and adjust seed accordingly (examples):
+```
+// evals/seeds/seed-evals.ts (pseudo)
+await db.insert(session).values({
+  id: token,
+  userId,
+  expiresAt: new Date(Date.now() + 24*60*60*1000),
+  createdAt: new Date(),
+  // any other required columns
+}).onConflictDoNothing()
+```
+
+- Container path sanity: ensure import path is correct from inside the seeder container:
+```
+// from evals/seeds/seed-evals.ts
+import { user, session } from '../../backend/db/schema'
+```
+
+- Debugging playbook additions:
+```
+# On failure dump logs
+docker compose -f evals/docker-compose.evals.yml logs backend --tail=200 || true
+# If healthz flaps, add a longer timeout
+bash evals/scripts/wait-for-healthz.sh http://127.0.0.1:4242/healthz 120
+```
+
+---
+
+1) Directory layout (all infra under evals/; backend stays Docker-agnostic)
+- Place all Dockerfiles, compose files, seed scripts, and run scripts in `evals/`
+- Build backend image using project root as build context while specifying the Dockerfile inside `evals/`
+
+```
+repo-root/
+  evals/
+    docker-compose.evals.yml
+    backend.Dockerfile
+    scripts/
+      run-remote.sh
+      wait-for-healthz.sh
+    seeds/
+      seed-evals.ts     # Drizzle seed script (preferred)
+    README.md
+  backend/
+    db/
+      schema.ts        # existing drizzle schema (used by seed)
+      # ...migrations, drizzle config; unchanged
+  # other packages unchanged
+```
+
+2) Compose file (db + backend), healthchecks, no public exposure
+- Build backend from repo root as build context, using Dockerfile at evals/backend.Dockerfile
+- Bind backend to localhost only; SDK connects via ws://127.0.0.1:4242/ws
+- Use a separate seeder service (or run seeding via `docker compose run --rm seeder`)
+
+```
+# evals/docker-compose.evals.yml
+version: '3.9'
+services:
+  db:
+    image: postgres:16-alpine
+    environment:
+      POSTGRES_USER: codebuff
+      POSTGRES_PASSWORD: codebuff
+      POSTGRES_DB: codebuff
+    command: [
+      "postgres",
+      "-c", "fsync=off",
+      "-c", "synchronous_commit=off",
+      "-c", "full_page_writes=off"
+    ]
+    healthcheck:
+      test: ["CMD-SHELL", "pg_isready -U codebuff -d codebuff"]
+      interval: 5s
+      timeout: 3s
+      retries: 20
+
+  backend:
+    build:
+      context: ..                 # project root
+      dockerfile: ./evals/backend.Dockerfile
+    environment:
+      DATABASE_URL: postgresql://codebuff:codebuff@db:5432/codebuff
+      NODE_ENV: test
+      # Optional test-only bypass (see Section 5)
+      CODEBUFF_TEST_AUTH_TOKEN: ${CODEBUFF_TEST_AUTH_TOKEN}
+    depends_on:
+      db:
+        condition: service_healthy
+    ports:
+      - "127.0.0.1:4242:4242"     # loopback only
+    healthcheck:
+      test: ["CMD", "curl", "-fsS", "http://localhost:4242/healthz"]
+      interval: 5s
+      timeout: 3s
+      retries: 30
+
+  seeder:
+    image: oven/bun:1.1.34
+    working_dir: /app
+    volumes:
+      - ..:/app:ro
+    environment:
+      DATABASE_URL: postgresql://codebuff:codebuff@db:5432/codebuff
+    entrypoint: ["bun", "run", "evals/seeds/seed-evals.ts"]
+    depends_on:
+      db:
+        condition: service_healthy
+```
+
+3) Backend image build (Dockerfile living in evals/)
+- Keep backend unaware of Docker by placing the Dockerfile in evals; reference backend code via build context
+
+```
+# evals/backend.Dockerfile
+FROM oven/bun:1.1.34 as base
+WORKDIR /app
+COPY . .
+RUN bun install --frozen-lockfile
+EXPOSE 4242
+CMD ["bun", "--cwd", "backend", "dev"]
+```
+
+4) SDK URL override (prefer explicit WS URL; no prod/dev confusion)
+- Allow CODEBUFF_WEBSOCKET_URL to override default; keeps CI/local targeting explicit
+
+```
+// sdk/src/constants.ts (pseudo-patch)
+const WS_FROM_ENV = process.env.CODEBUFF_WEBSOCKET_URL || process.env.CB_WS_URL
+export const WEBSOCKET_URL = WS_FROM_ENV ?? (
+  IS_PROD ? 'wss://manicode-backend.onrender.com/ws' : 'ws://localhost:4242/ws'
+)
+```
+
+5) Test-only auth bypass (optional, fastest)
+- Minimal code change in backend auth path (no Docker coupling). If `CODEBUFF_TEST_AUTH_TOKEN` is set and matches incoming token, accept it and attach minimal user context.
+
+```
+// backend/src/websockets/auth.ts (pseudo-code)
+export function getUserInfoFromAuthToken(authToken: string): UserInfo | null {
+  const bypass = process.env.CODEBUFF_TEST_AUTH_TOKEN
+  if (process.env.NODE_ENV === 'test' && bypass && authToken === bypass) {
+    return { userId: 'test-user', email: 'evals@test.local', isAdmin: false }
+  }
+  // ...existing lookup against sessions/users...
+}
+```
+
+6) Drizzle seed (preferred over raw SQL)
+- Seed a minimal user/org/session/API key using Drizzle ORM, talking directly to Postgres in Compose
+- Keep seed entirely under evals/; import schema from backend/db/schema.ts for type safety
+- Print a single line: `CODEBUFF_API_KEY=...` for the runner to capture
+
+```
+// evals/seeds/seed-evals.ts (pseudo-code with Drizzle)
+import 'dotenv/config'
+import { drizzle } from 'drizzle-orm/node-postgres'
+import { Client } from 'pg'
+import {
+  user, session, org, /* other tables as needed */
+} from '../../backend/db/schema'  // adjust import path if needed
+
+async function main() {
+  const DATABASE_URL = process.env.DATABASE_URL!
+  const client = new Client({ connectionString: DATABASE_URL })
+  await client.connect()
+  const db = drizzle(client)
+
+  // deterministic IDs for idempotency
+  const userId = 'test-user'
+  const email = 'evals@test.local'
+  const token = crypto.randomUUID() // or deterministic for replay
+
+  // upsert user
+  await db
+    .insert(user)
+    .values({ id: userId, email /* ...other required fields */ })
+    .onConflictDoNothing()
+
+  // upsert org (optional; link user as owner)
+  // await db.insert(org).values({ ... }).onConflictDoNothing()
+
+  // upsert session / api token row
+  await db
+    .insert(session)
+    .values({ id: token, userId, /* expiresAt, createdAt, etc. */ })
+    .onConflictDoNothing()
+
+  console.log(`CODEBUFF_API_KEY=${token}`)
+  await client.end()
+}
+
+main().catch((err) => { console.error(err); process.exit(1) })
+```
+
+7) One-liner runner script (spin up, wait, seed with Drizzle, run, tear down)
+- Lives entirely in evals/scripts; wires envs and points SDK to ephemeral WS
+- Supports: bypass mode OR real seeding mode via Drizzle
+
+```
+# evals/scripts/run-remote.sh
+set -euo pipefail
+MODE="${1:-seed}"  # 'seed' (Drizzle) or 'bypass'
+export CODEBUFF_WEBSOCKET_URL="ws://127.0.0.1:4242/ws"
+export CODEBUFF_SKIP_BINARY_CHECK=1  # after skip flag is added
+
+# Start services
+docker compose -f "$(dirname "$0")/../docker-compose.evals.yml" up -d --build db backend
+"$(dirname "$0")/wait-for-healthz.sh" "http://127.0.0.1:4242/healthz" 90 || {
+  echo 'Healthz failed; dumping backend logs...'
+  docker compose -f "$(dirname "$0")/../docker-compose.evals.yml" logs backend --tail=200 || true
+  exit 1
+}
+
+if [ "$MODE" = "bypass" ]; then
+  export CODEBUFF_TEST_AUTH_TOKEN="$(openssl rand -hex 16)"
+  export CODEBUFF_API_KEY="$CODEBUFF_TEST_AUTH_TOKEN"
+else
+  # Drizzle seed via compose for network access to db
+  KEY_LINE=$(docker compose -f "$(dirname "$0")/../docker-compose.evals.yml" run --rm seeder | tail -n1)
+  export CODEBUFF_API_KEY="${KEY_LINE#CODEBUFF_API_KEY=}"
+fi
+
+bun scripts/git-evals/run-single-eval.ts \
+  --prompt "Say hi and print the working directory" \
+  --max-steps 10
+
+docker compose -f "$(dirname "$0")/../docker-compose.evals.yml" down -v
+```
+
+```
+# evals/scripts/wait-for-healthz.sh
+set -euo pipefail
+URL="$1"; TIMEOUT="${2:-60}"
+for i in $(seq 1 "$TIMEOUT"); do
+  if curl -fsS "$URL" >/dev/null 2>&1; then exit 0; fi
+  sleep 1
+  echo "waiting for backend... ($i s)"
+done
+echo "backend healthz did not become ready in $TIMEOUT seconds" >&2
+exit 1
+```
+
+8) GitHub Actions sketch (contained orchestration)
+- The workflow calls the one-liner; Drizzle seed by default
+
+```
+# .github/workflows/remote-evals.yml (pseudo-snippet)
+jobs:
+  remote-evals:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: oven-sh/setup-bun@v1
+      - name: Install codebuff CLI (SDK binary check)
+        run: npm i -g codebuff
+      - name: Run remote eval (Drizzle seed)
+        run: bash evals/scripts/run-remote.sh seed
+```
diff --git a/sdk/src/client.ts b/sdk/src/client.ts
index 3a7cf42278..3aea09a38e 100644
--- a/sdk/src/client.ts
+++ b/sdk/src/client.ts
@@ -1,3 +1,5 @@
+import { execFileSync } from 'child_process'
+
 import { initialSessionState, type RunState } from './run-state'
 import { changeFile } from './tools/change-file'
 import { getFiles } from './tools/read-files'
@@ -11,6 +13,8 @@ import { API_KEY_ENV_VAR } from '../../common/src/constants'
 import { DEFAULT_MAX_AGENT_STEPS } from '../../common/src/json-config/constants'
 import { toolNames } from '../../common/src/tools/constants'
 
+import { CODEBUFF_BINARY } from './constants'
+
 import type { CustomToolDefinition } from './custom-tool'
 import type { AgentDefinition } from '../../common/src/templates/initial-agents-dir/types/agent-definition'
 import type { ToolName } from '../../common/src/tools/constants'
@@ -68,6 +72,22 @@ export class CodebuffClient {
       )
     }
 
+    // Verify the codebuff CLI is on PATH unless CODEBUFF_SKIP_BINARY_CHECK=1.
+    if (process.env.CODEBUFF_SKIP_BINARY_CHECK !== '1') {
+      const locator = process.platform === 'win32' ? 'where' : 'which'
+      let found = ''
+      try {
+        // stderr is ignored so a failed lookup does not leak the locator's own output.
+        found = execFileSync(locator, [CODEBUFF_BINARY], { stdio: ['ignore', 'pipe', 'ignore'] })
+          .toString().trim()
+      } catch {
+        // Non-zero exit (or missing locator) means "not found"; reported below.
+      }
+      if (found === '') {
+        throw new Error(`Missing codebuff binary in PATH. Please install with 'npm install -g codebuff' or set CODEBUFF_SKIP_BINARY_CHECK=1 to skip this check.`)
+      }
+    }
+
     this.cwd = cwd
     this.overrideTools = overrideTools ?? {}
     this.websocketHandler = new WebSocketHandler({
diff --git a/sdk/src/constants.ts b/sdk/src/constants.ts
index 9c829db1bd..2054144b6d 100644
--- a/sdk/src/constants.ts
+++ b/sdk/src/constants.ts
@@ -4,9 +4,10 @@ export const IS_DEV = process.env.NEXT_PUBLIC_CB_ENVIRONMENT === 'dev'
 export const IS_TEST = process.env.NEXT_PUBLIC_CB_ENVIRONMENT === 'test'
 export const IS_PROD = !IS_DEV && !IS_TEST
 
-export const WEBSOCKET_URL = IS_PROD
-  ? 'wss://manicode-backend.onrender.com/ws'
-  : 'ws://localhost:4242/ws'
+// Env override wins; `||` (not `??`) so an unset OR empty variable falls back to the default URL.
+const WS_FROM_ENV = process.env.CODEBUFF_WEBSOCKET_URL || process.env.CB_WS_URL
+export const WEBSOCKET_URL =
+  WS_FROM_ENV || (IS_PROD ? 'wss://manicode-backend.onrender.com/ws' : 'ws://localhost:4242/ws')
 export const WEBSITE_URL = IS_PROD
   ? 'https://codebuff.com'
   : 'http://localhost:3000'