diff --git a/.github/workflows/remote-evals.yml b/.github/workflows/remote-evals.yml new file mode 100644 index 0000000000..f16f0e6b87 --- /dev/null +++ b/.github/workflows/remote-evals.yml @@ -0,0 +1,212 @@ +name: Remote Evaluations (SDK) + +# This workflow runs Codebuff evaluations using the public SDK exclusively. +# It creates a containerized backend environment and runs evaluations via CodebuffClient. +# Trigger: Add [remote-eval] to commit message or use workflow_dispatch +# Matrix mode: Add [remote-eval-all] to commit message for parallel evaluations + +on: + push: + branches: ['**'] + workflow_dispatch: + inputs: + eval_file: + description: 'Eval file to run (e.g., eval-codebuff.json)' + required: false + default: 'eval-codebuff.json' + type: string + commit_index: + description: 'Commit index to evaluate (0-based)' + required: false + default: '0' + type: string + mode: + description: 'Auth mode (seed or bypass)' + required: false + default: 'bypass' + type: choice + options: + - 'bypass' + - 'seed' + +jobs: + remote-evals: + runs-on: ubuntu-latest + timeout-minutes: 60 + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Check commit message + id: check_commit + env: + COMMIT_MESSAGE: ${{ github.event.head_commit.message }} + run: | + shopt -s nocasematch + if [[ "$COMMIT_MESSAGE" == *"[remote-eval]"* ]] || [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then + echo "should_run_evals=true" >> $GITHUB_OUTPUT + echo "Will run remote evaluations" + else + echo "should_run_evals=false" >> $GITHUB_OUTPUT + echo "Skipping remote evaluations (add [remote-eval] to commit message to trigger)" + fi + + - name: Set up Bun + if: steps.check_commit.outputs.should_run_evals == 'true' + uses: oven-sh/setup-bun@v2 + with: + bun-version: '1.2.12' + + - name: Install dependencies + if: steps.check_commit.outputs.should_run_evals == 'true' + run: bun install --frozen-lockfile + + - name: Validate environment for SDK evaluation + if: 
steps.check_commit.outputs.should_run_evals == 'true' + run: | + echo "πŸ” Validating SDK evaluation environment..." + echo " Checking for required files..." + test -f evals/scripts/run-remote-parameterized.sh || { echo "❌ Missing run-remote-parameterized.sh"; exit 1; } + test -f evals/git-evals/run-single-eval.ts || { echo "❌ Missing run-single-eval.ts"; exit 1; } + test -f evals/docker-compose.evals.yml || { echo "❌ Missing docker-compose.evals.yml"; exit 1; } + echo " Checking SDK package..." + bun --version + echo "βœ… Environment validation passed" + + - name: Run remote evaluation + if: steps.check_commit.outputs.should_run_evals == 'true' + env: + EVAL_FILE: ${{ inputs.eval_file || 'eval-codebuff.json' }} + COMMIT_INDEX: ${{ inputs.commit_index || '0' }} + MODE: ${{ inputs.mode || 'bypass' }} + CODEBUFF_WEBSOCKET_URL: "ws://127.0.0.1:4242/ws" + CODEBUFF_SKIP_BINARY_CHECK: "1" + run: | + echo "πŸš€ Remote Evaluation Starting (SDK Mode)" + echo "πŸ“‹ GitHub Actions Environment:" + echo " Runner: ${{ runner.os }}" + echo " SHA: ${{ github.sha }}" + echo " Ref: ${{ github.ref }}" + echo " Event: ${{ github.event_name }}" + echo " Eval File: $EVAL_FILE" + echo " Commit Index: $COMMIT_INDEX" + echo " Mode: $MODE" + echo "🐳 Docker Info:" + docker --version + docker compose version + echo "πŸ’Ύ Disk Space:" + df -h + echo "πŸ”§ Starting SDK-based evaluation..." 
+ bash evals/scripts/run-remote-parameterized.sh "$MODE" "$EVAL_FILE" "$COMMIT_INDEX" + + - name: Dump logs on failure + if: failure() && steps.check_commit.outputs.should_run_evals == 'true' + run: | + echo "❌ SDK Evaluation failed - dumping diagnostic information" + echo "πŸ”§ SDK Environment:" + echo " CODEBUFF_WEBSOCKET_URL: ${CODEBUFF_WEBSOCKET_URL:-not set}" + echo " CODEBUFF_SKIP_BINARY_CHECK: ${CODEBUFF_SKIP_BINARY_CHECK:-not set}" + key_status="${CODEBUFF_API_KEY:+[SET]}"; echo " CODEBUFF_API_KEY: ${key_status:-[NOT SET]}" # presence only - never echo the key value into CI logs + echo "🐳 Docker containers status:" + docker ps -a || true + echo "πŸ“‹ Backend container logs:" + docker compose -f evals/docker-compose.evals.yml logs backend --tail=200 || true + echo "πŸ“‹ Database container logs:" + docker compose -f evals/docker-compose.evals.yml logs db --tail=100 || true + echo "πŸ’Ύ Disk usage:" + df -h || true + echo "🧠 Memory usage:" + free -h || true + echo "πŸ“ Evaluation files:" + ls -la evals/git-evals/ || true + ls -la evals/scripts/ || true + + - name: Upload evaluation logs + if: always() && steps.check_commit.outputs.should_run_evals == 'true' + uses: actions/upload-artifact@v4 + with: + name: remote-eval-logs-${{ github.sha }} + path: | + evals/test-repos/ + debug/ + ~/.cache/bun/ + retention-days: 7 + + - name: Cleanup containers + if: always() && steps.check_commit.outputs.should_run_evals == 'true' + run: | + echo "🧹 Final cleanup - removing all containers and volumes..." 
+ docker compose -f evals/docker-compose.evals.yml down -v || true + docker system prune -f || true + echo "βœ… Cleanup completed" + + # Optional: Matrix job to run multiple evaluations in parallel + remote-evals-matrix: + runs-on: ubuntu-latest + timeout-minutes: 90 + if: contains(github.event.head_commit.message, '[remote-eval-all]') + + strategy: + fail-fast: false + matrix: + eval: + - { file: 'eval-codebuff.json', index: '0' } + - { file: 'eval-codebuff.json', index: '1' } + - { file: 'eval-manifold.json', index: '0' } + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Bun + uses: oven-sh/setup-bun@v2 + with: + bun-version: '1.2.12' + + - name: Install dependencies + run: bun install --frozen-lockfile + + - name: Validate environment for SDK evaluation + run: | + echo "πŸ” Validating SDK evaluation environment for matrix job..." + test -f evals/scripts/run-remote-parameterized.sh || { echo "❌ Missing run-remote-parameterized.sh"; exit 1; } + test -f evals/git-evals/run-single-eval.ts || { echo "❌ Missing run-single-eval.ts"; exit 1; } + test -f evals/docker-compose.evals.yml || { echo "❌ Missing docker-compose.evals.yml"; exit 1; } + echo "βœ… Matrix environment validation passed" + + - name: Run evaluation matrix + env: + EVAL_FILE: ${{ matrix.eval.file }} + COMMIT_INDEX: ${{ matrix.eval.index }} + CODEBUFF_WEBSOCKET_URL: "ws://127.0.0.1:4242/ws" + CODEBUFF_SKIP_BINARY_CHECK: "1" + run: | + echo "πŸš€ Running matrix evaluation (SDK Mode)..." 
+ bash evals/scripts/run-remote-parameterized.sh "bypass" "$EVAL_FILE" "$COMMIT_INDEX" + + - name: Dump matrix logs on failure + if: failure() + run: | + echo "❌ Matrix SDK Evaluation failed - dumping diagnostic information" + echo "πŸ”§ Matrix job details: File=${{ matrix.eval.file }}, Index=${{ matrix.eval.index }}" # this step defines no env block, so read the matrix context directly + echo "🐳 Docker containers status:" + docker ps -a || true + echo "πŸ“‹ Container logs:" + docker compose -f evals/docker-compose.evals.yml logs --tail=100 || true + + - name: Upload matrix evaluation results + if: always() + uses: actions/upload-artifact@v4 + with: + name: remote-eval-matrix-${{ matrix.eval.file }}-${{ matrix.eval.index }}-${{ github.sha }} + path: | + evals/test-repos/ + debug/ + retention-days: 7 + + - name: Cleanup containers + if: always() + run: | + docker compose -f evals/docker-compose.evals.yml down -v || true + docker system prune -f || true \ No newline at end of file diff --git a/backend/src/index.ts b/backend/src/index.ts index bb038db89b..d20742cfd8 100644 --- a/backend/src/index.ts +++ b/backend/src/index.ts @@ -19,6 +19,7 @@ import { sendRequestReconnect, waitForAllClientsDisconnected, listen as webSocketListen, + isWebSocketReady, } from './websockets/server' const app = express() @@ -31,7 +32,11 @@ app.get('/', (req, res) => { }) app.get('/healthz', (req, res) => { - res.send('ok') + if (isWebSocketReady()) { + res.send('ok') + } else { + res.status(503).send('starting') + } }) app.post('/api/usage', usageHandler) diff --git a/backend/src/websockets/auth.ts b/backend/src/websockets/auth.ts index 927c56d43f..11b0df5edf 100644 --- a/backend/src/websockets/auth.ts +++ b/backend/src/websockets/auth.ts @@ -11,6 +11,12 @@ export interface UserInfo { export async function getUserIdFromAuthToken( authToken: string, ): Promise { + // Test-only auth bypass + const bypass = process.env.CODEBUFF_TEST_AUTH_TOKEN + if (process.env.NODE_ENV === 'test' && bypass && authToken === bypass) { + return 'test-user' + } + const user = await db .select({ id: 
schema.user.id }) .from(schema.user) @@ -25,6 +31,12 @@ export async function getUserIdFromAuthToken( export async function getUserInfoFromAuthToken( authToken: string, ): Promise { + // Test-only auth bypass + const bypass = process.env.CODEBUFF_TEST_AUTH_TOKEN + if (process.env.NODE_ENV === 'test' && bypass && authToken === bypass) { + return { id: 'test-user', email: 'evals@test.local', discord_id: null } + } + const user = await db .select({ id: schema.user.id, diff --git a/backend/src/websockets/server.ts b/backend/src/websockets/server.ts index 2f91d488f0..fd7177dba6 100644 --- a/backend/src/websockets/server.ts +++ b/backend/src/websockets/server.ts @@ -18,6 +18,8 @@ export const SWITCHBOARD = new Switchboard() // if a connection doesn't ping for this long, we assume the other side is toast const CONNECTION_TIMEOUT_MS = 60 * 1000 +let wsReady = false + export class MessageParseError extends Error { details?: unknown constructor(message: string, details?: unknown) { @@ -87,6 +89,7 @@ export function listen(server: HttpServer, path: string) { let deadConnectionCleaner: NodeJS.Timeout | undefined wss.on('listening', () => { logger.info(`Web socket server listening on ${path}.`) + wsReady = true deadConnectionCleaner = setInterval(function ping() { const now = Date.now() try { @@ -175,3 +178,7 @@ export function sendRequestReconnect() { export function waitForAllClientsDisconnected() { return SWITCHBOARD.waitForAllClientsDisconnected() } + +export function isWebSocketReady() { + return wsReady +} diff --git a/codebuff.json b/codebuff.json index 4fa5aa1592..334ca10b17 100644 --- a/codebuff.json +++ b/codebuff.json @@ -57,7 +57,7 @@ }, { "name": "prettier-format", - "command": "git diff --name-only --diff-filter=ACMR | grep -E '\\.(ts|tsx|json|md)$' | xargs -r npx prettier --write", + "command": "set -o pipefail && CHANGED=\"$(git diff --name-only --diff-filter=ACMR | grep -E '\\.(ts|tsx|json|md)$' | xargs -r npx prettier --list-different || true)\"; [ -n 
\"$CHANGED\" ] && echo \"$CHANGED\" | xargs -r npx prettier --write --log-level=warn && printf '%s\\n' \"$CHANGED\" || true", "filePattern": "**/*.{ts,tsx,json,md}" }, { @@ -70,6 +70,11 @@ "command": "bun run typecheck", "cwd": ".agents", "filePattern": ".agents/**/*.ts" + }, + { + "name": "eslint-fix-imports", + "command": "set -o pipefail && git diff --name-only --diff-filter=ACMR | grep -E '\\.(ts|tsx|js|jsx)$' | xargs -r npx eslint --fix --quiet", + "filePattern": "**/*.{ts,tsx,js,jsx}" } ] } diff --git a/evals/README.md b/evals/README.md new file mode 100644 index 0000000000..2abbd46802 --- /dev/null +++ b/evals/README.md @@ -0,0 +1,136 @@ +# Remote Evaluation Infrastructure + +This directory contains the infrastructure for running Codebuff evaluations in containerized environments (Docker Compose) for CI/CD and local testing. + +## Quick Start + +### Option 1: Using Drizzle Seed (Recommended) +```bash +bash evals/scripts/run-remote.sh seed +``` + +### Option 2: Using Test Auth Bypass (Faster) +```bash +bash evals/scripts/run-remote.sh bypass +``` + +## Prerequisites + +- Docker and Docker Compose +- Bun runtime +- Optional: `npm install -g codebuff` (or set `CODEBUFF_SKIP_BINARY_CHECK=1`) + +## Architecture + +- **evals/docker-compose.evals.yml**: Orchestrates PostgreSQL database and backend services +- **evals/backend.Dockerfile**: Backend container definition +- **evals/seeds/seed-evals.ts**: Drizzle-based database seeding for test users/sessions +- **evals/scripts/run-remote.sh**: Main runner script with teardown +- **evals/scripts/wait-for-healthz.sh**: Health check waiting utility + +## Key Features + +### SDK Enhancements +- **Binary Check Skip**: Set `CODEBUFF_SKIP_BINARY_CHECK=1` to skip codebuff CLI requirement +- **WebSocket URL Override**: Set `CODEBUFF_WEBSOCKET_URL=ws://127.0.0.1:4242/ws` to target ephemeral backend + +### Backend Enhancements +- **Test Auth Bypass**: Set `CODEBUFF_TEST_AUTH_TOKEN` + `NODE_ENV=test` for quick auth +- 
**WebSocket-Ready Health Check**: `/healthz` returns 503 until WebSocket server is accepting connections + +### Container Strategy +- **Loopback Binding**: Backend bound to `127.0.0.1:4242` only (no public exposure) +- **Optimized PostgreSQL**: Fast settings for CI (fsync=off, etc.) +- **Build Context**: Uses repo root with Dockerfile in evals/ for clean separation + +## Environment Variables + +- `CODEBUFF_WEBSOCKET_URL`: Override WebSocket URL (e.g., `ws://127.0.0.1:4242/ws`) +- `CODEBUFF_SKIP_BINARY_CHECK=1`: Skip SDK binary presence check +- `CODEBUFF_TEST_AUTH_TOKEN`: Enable test-only auth bypass (when NODE_ENV=test) +- `CODEBUFF_API_KEY`: API key for SDK authentication (set by scripts) + +## GitHub Actions Integration + +### Automatic Trigger +Add `[remote-eval]` to your commit message to trigger remote evaluations: +```bash +git commit -m "fix: terminal CWD handling [remote-eval]" +``` + +### Manual Trigger +Go to Actions → Remote Evaluations (SDK) → Run workflow: +- **Eval file**: `eval-codebuff.json` (default) +- **Commit index**: `0` (default) +- **Mode**: `bypass` or `seed` + +### Matrix Evaluations +Add `[remote-eval-all]` to run multiple evaluations in parallel: +```bash +git commit -m "major: refactor terminal logic [remote-eval-all]" +``` + +### Workflow Files +- `.github/workflows/remote-evals.yml` - Main remote evaluation workflow +- Uses our containerized infrastructure with Docker Compose +- Uploads artifacts and logs automatically +- Handles cleanup and error reporting + +### Usage in CI + +```yaml +# Single evaluation +- name: Run remote eval (bypass mode) + run: bash evals/scripts/run-remote-parameterized.sh bypass eval-codebuff.json 0 + +# With database seeding +- name: Run remote eval (seed mode) + run: bash evals/scripts/run-remote-parameterized.sh seed eval-manifold.json 1 +``` + +## Manual Usage + +1. Start services: + ```bash + docker compose -f evals/docker-compose.evals.yml up -d --build db backend + ``` + +2. 
Wait for readiness: + ```bash + bash evals/scripts/wait-for-healthz.sh http://127.0.0.1:4242/healthz 90 + ``` + +3. Seed database and capture API key: + ```bash + KEY_LINE=$(docker compose -f evals/docker-compose.evals.yml run --rm seeder | tail -n1) + export CODEBUFF_API_KEY="${KEY_LINE#CODEBUFF_API_KEY=}" + ``` + +4. Run evaluation: + ```bash + export CODEBUFF_WEBSOCKET_URL=ws://127.0.0.1:4242/ws + export CODEBUFF_SKIP_BINARY_CHECK=1 + bun evals/git-evals/run-single-eval.ts --prompt "Your test prompt" + ``` + +5. Cleanup: + ```bash + docker compose -f evals/docker-compose.evals.yml down -v + ``` + +## Troubleshooting + +- **Connection Issues**: Check that `CODEBUFF_WEBSOCKET_URL=ws://127.0.0.1:4242/ws` is set +- **Auth Failures**: Verify `CODEBUFF_API_KEY` is properly captured from seeder output +- **Backend Not Ready**: Ensure `/healthz` returns 200 before proceeding +- **Port Conflicts**: Backend binds to `127.0.0.1:4242` - ensure port is available + +## Implementation Details + +Based on the remote-eval-infra-plan.md specification: +- Monorepo + Bun compatible +- Docker-agnostic backend (Dockerfile lives in evals/) +- Idempotent Drizzle seeding with deterministic IDs +- WS readiness validation in health checks +- Test-only auth bypass for fast smoke tests +- Comprehensive error logging and cleanup \ No newline at end of file diff --git a/evals/backend.Dockerfile b/evals/backend.Dockerfile new file mode 100644 index 0000000000..c8a0cfed6c --- /dev/null +++ b/evals/backend.Dockerfile @@ -0,0 +1,6 @@ +FROM oven/bun:1.1.34 as base +WORKDIR /app +COPY . . 
+RUN bun install --frozen-lockfile +EXPOSE 4242 +CMD ["bun", "--cwd", "backend", "dev"] \ No newline at end of file diff --git a/evals/docker-compose.evals.yml b/evals/docker-compose.evals.yml new file mode 100644 index 0000000000..81bfc05b63 --- /dev/null +++ b/evals/docker-compose.evals.yml @@ -0,0 +1,101 @@ +services: + db: + image: postgres:16-alpine + environment: + POSTGRES_USER: codebuff + POSTGRES_PASSWORD: codebuff + POSTGRES_DB: codebuff + command: [ + "postgres", + "-c", "fsync=off", + "-c", "synchronous_commit=off", + "-c", "full_page_writes=off", + "-c", "log_statement=all", + "-c", "log_destination=stderr", + "-c", "logging_collector=off" + ] + healthcheck: + test: ["CMD-SHELL", "pg_isready -U codebuff -d codebuff"] + interval: 5s + timeout: 3s + retries: 20 + logging: + driver: "json-file" + options: + max-size: "10m" + max-file: "3" + + backend: + build: + context: .. # project root + dockerfile: ./evals/backend.Dockerfile + environment: + # Database + DATABASE_URL: postgresql://codebuff:codebuff@db:5432/codebuff + NODE_ENV: test + PORT: 4242 + + # Required API keys (dummy values for testing) + ANTHROPIC_API_KEY: test-key + ANTHROPIC_API_KEY2: test-key + HELICONE_API_KEY: test-key + OPEN_AI_KEY: test-key + GEMINI_API_KEY: test-key + GOOGLE_GENERATIVE_AI_API_KEY: test-key + DEEPSEEK_API_KEY: test-key + OPEN_ROUTER_API_KEY: test-key + RELACE_API_KEY: test-key + LINKUP_API_KEY: test-key + GOOGLE_CLOUD_PROJECT_ID: test-project + + # Auth/Web variables + CODEBUFF_GITHUB_ID: test-id + CODEBUFF_GITHUB_SECRET: test-secret + NEXTAUTH_SECRET: test-secret-32-chars-long-minimum + STRIPE_SECRET_KEY: sk_test_dummy + STRIPE_WEBHOOK_SECRET_KEY: whsec_dummy + STRIPE_USAGE_PRICE_ID: price_dummy + STRIPE_TEAM_FEE_PRICE_ID: price_dummy + LOOPS_API_KEY: test-key + DISCORD_PUBLIC_KEY: test-key + DISCORD_BOT_TOKEN: test-token + DISCORD_APPLICATION_ID: test-id + API_KEY_ENCRYPTION_SECRET: 1234567890123456789012345678901a + + # Public variables + NEXT_PUBLIC_CB_ENVIRONMENT: 
test + NEXT_PUBLIC_APP_URL: http://localhost:3000 + NEXT_PUBLIC_BACKEND_URL: http://localhost:4242 + NEXT_PUBLIC_SUPPORT_EMAIL: test@example.com + NEXT_PUBLIC_STRIPE_PUBLISHABLE_KEY: pk_test_dummy + NEXT_PUBLIC_STRIPE_CUSTOMER_PORTAL: https://dummy.stripe.com + + # Optional test-only bypass + CODEBUFF_TEST_AUTH_TOKEN: ${CODEBUFF_TEST_AUTH_TOKEN} + depends_on: + db: + condition: service_healthy + ports: + - "127.0.0.1:4242:4242" # loopback only + healthcheck: + test: ["CMD", "curl", "-fsS", "http://localhost:4242/healthz"] + interval: 5s + timeout: 3s + retries: 30 + logging: + driver: "json-file" + options: + max-size: "50m" + max-file: "3" + + seeder: + image: oven/bun:1.1.34 + working_dir: /app + volumes: + - ..:/app:ro + environment: + DATABASE_URL: postgresql://codebuff:codebuff@db:5432/codebuff + entrypoint: ["bun", "run", "evals/seeds/seed-evals.ts"] + depends_on: + db: + condition: service_healthy \ No newline at end of file diff --git a/evals/git-evals/run-git-evals-legacy.ts b/evals/git-evals/run-git-evals-legacy.ts new file mode 100644 index 0000000000..94a713c110 --- /dev/null +++ b/evals/git-evals/run-git-evals-legacy.ts @@ -0,0 +1,580 @@ +import { execSync, fork } from 'child_process' +import fs from 'fs' +import path from 'path' + +import { disableLiveUserInputCheck } from '@codebuff/backend/live-user-inputs' +import { promptAiSdkStructured } from '@codebuff/backend/llm-apis/vercel-ai-sdk/ai-sdk' +import { models } from '@codebuff/common/constants' +import { getDefaultConfig } from '@codebuff/common/json-config/default' +import { AgentTemplateTypes } from '@codebuff/common/types/session-state' +import { withTimeout } from '@codebuff/common/util/promise' +import { generateCompactId } from '@codebuff/common/util/string' +import pLimit from 'p-limit' + +import { + createFileReadingMock, + loopMainPrompt, + resetRepoToCommit, +} from '../scaffolding' +import { createInitialSessionState } from '../test-setup' +import { judgeEvalRun } from 
'./judge-git-eval' +import { extractRepoNameFromUrl, setupTestRepo } from './setup-test-repo' +import { AgentDecisionSchema } from './types' + +import type { AgentStep } from '../scaffolding' +import type { + AgentDecision, + CodebuffTrace, + EvalCommit, + EvalRunJudged, + EvalRunLog, + FileState, + FullEvalLog, + EvalData, +} from './types' + +disableLiveUserInputCheck() + +// Try Gemini! +const AGENT_TYPE = AgentTemplateTypes.base + +const EDIT_FILE_TOOL_NAMES = ['write_file', 'str_replace'] as const + +export async function runSingleEval( + evalCommit: EvalCommit, + projectPath: string, + clientSessionId: string, + fingerprintId: string, + agentType: string = AGENT_TYPE, +): Promise { + const startTime = new Date() + const trace: CodebuffTrace[] = [] + let error: string | undefined + + // Add process-level error handlers for this eval + const originalUncaughtHandler = process.listeners('uncaughtException') + const originalUnhandledHandler = process.listeners('unhandledRejection') + + let processError: string | undefined + + const uncaughtHandler = (err: Error) => { + console.error('Uncaught exception during eval:', err) + processError = `Uncaught exception: ${err.message}\n${err.stack}` + } + + const unhandledHandler = (reason: any, promise: Promise) => { + console.error('Unhandled rejection during eval:', reason) + processError = `Unhandled rejection: ${reason instanceof Error ? 
`${reason.message}\n${reason.stack}` : String(reason)}` // format like uncaughtHandler; an object literal here would render as "[object Object]" + } + + process.on('uncaughtException', uncaughtHandler) + process.on('unhandledRejection', unhandledHandler) + + try { + // Reset to the commit before the target commit + resetRepoToCommit(projectPath, `${evalCommit.sha}^`) + + // Initialize agent state + createFileReadingMock(projectPath) + let sessionState = await createInitialSessionState(projectPath) + + let currentDecision: AgentDecision = 'continue' + let attempts = 0 + const MAX_ATTEMPTS = 5 + + while (currentDecision === 'continue' && attempts < MAX_ATTEMPTS) { + // Check for process-level errors + if (processError) { + throw new Error(processError) + } + + function renderAgentStep(step: AgentStep): string { + const { response, toolCalls, toolResults } = step + return [ + `\`\`\`text_response\n${response}\n\`\`\``, + `\`\`\`tool_calls\n${JSON.stringify(toolCalls, null, 2)}\n\`\`\``, + `\`\`\`tool_results\n${JSON.stringify(toolResults, null, 2)}\n\`\`\``, + ].join('\n\n') + } + const renderedTrace = trace + .map( + ({ prompt, steps }) => + `You: ${prompt}\n\nCodebuff:${steps.map(renderAgentStep).join('\n\n')}`, + ) + .join('\n\n') + + // Get next prompt from Sonnet agent with timeout + let agentResponse: any + try { + agentResponse = await promptAiSdkStructured({ + messages: [ + { + role: 'user', + content: `You are an expert software engineer tasked with implementing a specification using CodeBuff, an AI coding assistant. Your goal is to prompt CodeBuff to implement the spec correctly. You are in a conversation with this coding agent. + +Current spec to implement: +${evalCommit.spec} + +Your conversation with Codebuff so far: +${renderedTrace} + +Note that files can only be changed with tools. If no tools are called, no files were changed. + +You must decide whether to: +1. 'continue' - Generate a follow-up prompt for Codebuff +2. 'complete' - The implementation is done and satisfies the spec +3. 
'halt' - The implementation is off track and unlikely to be completed within ${MAX_ATTEMPTS - attempts} more attempts + +If deciding to continue, include a clear, focused prompt for Codebuff in next_prompt. +Explain your reasoning in detail.`, + }, + ], + schema: AgentDecisionSchema, + model: models.gemini2_5_flash, + clientSessionId, + fingerprintId, + userInputId: generateCompactId(), + userId: undefined, + timeout: 5 * 60_000, // 5 minute timeout + }) + } catch (agentError) { + throw new Error( + `Agent decision failed: ${agentError instanceof Error ? agentError.message : String(agentError)}`, + ) + } + + console.log('Agent decision:', agentResponse.decision) + console.log('Agent reasoning:', agentResponse.reasoning) + + if (agentResponse.decision === 'continue' && !agentResponse.next_prompt) { + agentResponse.next_prompt = 'continue' + } + + // If continuing, run CodeBuff with the agent's prompt + if (agentResponse.decision === 'continue') { + const prompt = agentResponse.next_prompt! + + // Use loopMainPrompt with timeout wrapper + const codeBuffResult = await withTimeout( + loopMainPrompt({ + sessionState, + prompt, + projectPath, + maxIterations: 20, + agentType: agentType as any, + }), + // Timeout after 30 minutes + 60_000 * 30, + ) + + sessionState.mainAgentState = codeBuffResult.agentState + sessionState.mainAgentState.stepsRemaining = + getDefaultConfig().maxAgentSteps + trace.push({ prompt, steps: codeBuffResult.steps }) + } + + currentDecision = agentResponse.decision + attempts++ + } + } catch (e) { + console.error('Error in runSingleEval:', e) + error = + e instanceof Error + ? 
`${e.message}\n${e.stack}` + : `Unknown error: ${String(e)}` + } finally { + // Clean up process-level error handlers + process.removeListener('uncaughtException', uncaughtHandler) + process.removeListener('unhandledRejection', unhandledHandler) + + // Restore original handlers + originalUncaughtHandler.forEach((handler) => { + if (typeof handler === 'function') { + process.on('uncaughtException', handler) + } + }) + originalUnhandledHandler.forEach((handler) => { + if (typeof handler === 'function') { + process.on('unhandledRejection', handler) + } + }) + } + + // If we caught a process-level error, use that + if (processError && !error) { + error = processError + } + + const endTime = new Date() + const durationMs = endTime.getTime() - startTime.getTime() + + const fileStates = getCodebuffFileStates(trace, evalCommit.sha, projectPath) + + const evalRun: EvalRunLog = { + eval_commit: evalCommit, + trace, + error, + fileStates, + durationMs, + } + + // Add judging results even for failed runs + try { + const judgingResults = await judgeEvalRun(evalRun) + console.log('Judging results:', judgingResults) + return { + ...evalRun, + judging_results: judgingResults, + } + } catch (judgingError) { + console.error('Error in judging:', judgingError) + // Return without judging results if judging fails + return { + ...evalRun, + judging_results: { + analysis: 'Judging failed due to error', + strengths: [], + weaknesses: ['Judging process encountered an error'], + metrics: { + completionScore: 0, + efficiencyScore: 0, + codeQualityScore: 0, + overallScore: 0, + }, + }, + } + } +} + +function getCodebuffFileStates( + trace: CodebuffTrace[], + evalCommitSha: string, + projectPath: string, +): FileState[] { + const codebuffWrittenFilePaths = new Set() + if (trace) { + // trace might be undefined or empty if error occurred very early + for (const traceEntry of trace) { + for (const step of traceEntry.steps) { + if (step.toolCalls) { + for (const toolCall of step.toolCalls) { + if 
( + EDIT_FILE_TOOL_NAMES.includes(toolCall.toolName as any) && + 'path' in toolCall.input && + toolCall.input.path + ) { + codebuffWrittenFilePaths.add(toolCall.input.path as string) + } + } + } + } + } + } + + const fileStates: FileState[] = [] + + if (codebuffWrittenFilePaths.size > 0) { + for (const filePath of codebuffWrittenFilePaths) { + // Capture "after" state + const fullPath = path.join(projectPath, filePath) + let postContent: string + try { + postContent = fs.existsSync(fullPath) + ? fs.readFileSync(fullPath, 'utf-8') + : '[FILE_NOT_FOUND_POST_RUN]' + } catch (e) { + console.error(`Error reading file ${fullPath} for after state:`, e) + postContent = '[ERROR_READING_AFTER_STATE]' + } + + // Capture "before" state + let preContent: string + try { + preContent = execSync(`git show ${evalCommitSha}^:"${filePath}"`, { + cwd: projectPath, + stdio: ['ignore', 'pipe', 'ignore'], + }).toString() + } catch (e) { + preContent = '[FILE_DID_NOT_EXIST_PRIOR_TO_CODEBUFF_CHANGES]' + } + + fileStates.push({ path: filePath, preContent, postContent }) + } + } + return fileStates +} + +export function mockRunGitEvals(path: string) { + const result = JSON.parse(fs.readFileSync(path, 'utf-8')) as FullEvalLog + + return result +} + +// Global concurrency limiter that can be shared across multiple repository evaluations +let globalConcurrencyLimiter: ReturnType | null = null + +export function setGlobalConcurrencyLimit(limit: number) { + globalConcurrencyLimiter = pLimit(limit) +} + +export async function runGitEvals( + evalDataPath: string, + outputDir: string, + agentType: string = AGENT_TYPE, + limit?: number, + logToStdout: boolean = false, +): Promise { + console.log(`Loading eval data from: ${evalDataPath}`) + const evalData = JSON.parse( + fs.readFileSync(evalDataPath, 'utf-8'), + ) as EvalData + + console.log( + `Loaded ${evalData.evalCommits.length} eval commits from ${evalDataPath}`, + ) + + const { repoUrl } = evalData + + // Extract repo name from URL or use 
provided testRepoName as fallback + const testRepoName = evalData.testRepoName || extractRepoNameFromUrl(repoUrl) + + const clientSessionId = generateCompactId() + const fingerprintId = generateCompactId() + + // Generate unique trace ID for this run + const traceId = generateCompactId() + console.log(`Starting eval run with trace ID: ${traceId}`) + + // Ensure output directory exists + if (!fs.existsSync(outputDir)) { + fs.mkdirSync(outputDir, { recursive: true }) + } + + const logsDir = path.join(outputDir, 'logs', `${testRepoName}-${traceId}`) + fs.mkdirSync(logsDir, { recursive: true }) + + // Generate filenames with trace ID (single file that gets overwritten) + const partialOutputPath = path.join( + outputDir, + `eval-partial-${testRepoName}-${traceId}.json`, + ) + + const commitsToRun = limit + ? evalData.evalCommits.slice(0, limit) + : evalData.evalCommits + + console.log( + `Running ${commitsToRun.length} evaluations out of ${evalData.evalCommits.length} total commits...`, + ) + console.log( + `Using concurrency limit: ${globalConcurrencyLimiter ? 'global limiter' : 'local limiter (20)'}`, + ) + + // Use global limiter if available, otherwise create a local one + const limitConcurrency = globalConcurrencyLimiter || pLimit(20) + + const evalPromises = commitsToRun.map((evalCommit, index) => { + return limitConcurrency( + () => + new Promise(async (resolve, reject) => { + try { + console.log( + `Setting up test repository for commit ${evalCommit.sha}...`, + ) + const projectPath = await setupTestRepo( + repoUrl, + testRepoName, + evalCommit.sha, + ) + + console.log( + `Starting ${testRepoName} eval ${index + 1}/${commitsToRun.length} for commit ${evalCommit.spec.split('\n')[0]}...`, + ) + + const safeMessage = evalCommit.spec + .split('\n')[0] + .replace(/[^a-zA-Z0-9]/g, '_') + .slice(0, 30) + const logFilename = `${safeMessage}-${evalCommit.sha.slice(0, 7)}.log` + const logPath = path.join(logsDir, logFilename) + const logStream = logToStdout + ? 
process.stdout + : fs.createWriteStream(logPath) + + // Write evalCommit to temporary file to avoid long command line arguments + const tempEvalCommitPath = path.join( + logsDir, + `eval-commit-${evalCommit.sha.slice(0, 7)}.json`, + ) + fs.writeFileSync(tempEvalCommitPath, JSON.stringify(evalCommit)) + + const child = fork( + path.resolve(__dirname, 'run-single-eval-process.ts'), + [ + tempEvalCommitPath, + projectPath, + clientSessionId, + fingerprintId, + agentType, + ], + { stdio: ['pipe', 'pipe', 'pipe', 'ipc'] }, + ) + + child.stdout?.pipe(logStream) + child.stderr?.pipe(logStream) + + child.on( + 'message', + (message: { + type: string + result?: EvalRunJudged + error?: any + }) => { + // Clean up temp file + try { + fs.unlinkSync(tempEvalCommitPath) + } catch (e) { + console.warn( + `Failed to clean up temp file ${tempEvalCommitPath}:`, + e, + ) + } + if (message.type === 'result' && message.result) { + console.log( + `Completed eval for commit ${testRepoName} - ${evalCommit.spec.split('\n')[0]}`, + ) + if (!logToStdout) { + console.log(`${JSON.stringify(message.result, null, 2)}`) + } + resolve(message.result) + } else if (message.type === 'error') { + console.error( + `Received error while running eval: ${message.error.stack}\n`, + { message }, + ) + const err = new Error(message.error.message) + reject(err) + } + }, + ) + + child.on('exit', (code) => { + logStream.end() + if (code !== 0) { + console.error( + `Eval process for ${evalCommit.sha} exited with code ${code}. See logs at ${logPath}`, + ) + reject( + new Error( + `Eval process for ${evalCommit.sha} exited with code ${code}`, + ), + ) + } + }) + } catch (error) { + console.error( + `Error while running git eval for ${testRepoName} commit ${evalCommit.sha}`, + { error }, + ) + reject(error) + } + }), + ) + }) + + const results = await Promise.allSettled(evalPromises) + + console.log( + `Promise.allSettled completed. 
Results: ${results.length} total, ${results.filter((r) => r.status === 'fulfilled').length} fulfilled, ${results.filter((r) => r.status === 'rejected').length} rejected`, + ) + + // Log rejected promises for debugging + results.forEach((result, index) => { + if (result.status === 'rejected') { + console.error( + `❌ Eval ${index + 1}/${commitsToRun.length} (${commitsToRun[index].sha}) was rejected:`, + result.reason, + ) + } + }) + + const evalRuns = results + .filter((result) => result.status === 'fulfilled') + .map((result) => result.value) + + // Calculate final overall metrics + const overallMetrics = calculateOverallMetrics(evalRuns) + + const result: FullEvalLog = { + test_repo_name: testRepoName, + generation_date: new Date().toISOString(), + eval_runs: evalRuns, + overall_metrics: overallMetrics, + } + + // Create final filename with trace ID + const finalOutputPath = path.join( + outputDir, + `eval-result-${testRepoName}-${traceId}.json`, + ) + + // Write final results to file + fs.writeFileSync(finalOutputPath, JSON.stringify(result, null, 2)) + + console.log('All evals complete!') + console.log(`Final results written to ${finalOutputPath}`) + + return result +} + +function calculateOverallMetrics(evalRuns: EvalRunJudged[]) { + return { + average_completion: + evalRuns.reduce( + (sum, run) => sum + (run.judging_results.metrics.completionScore || 0), + 0, + ) / evalRuns.length, + average_efficiency: + evalRuns.reduce( + (sum, run) => sum + (run.judging_results.metrics.efficiencyScore || 0), + 0, + ) / evalRuns.length, + average_code_quality: + evalRuns.reduce( + (sum, run) => sum + (run.judging_results.metrics.codeQualityScore || 0), + 0, + ) / evalRuns.length, + average_overall: + evalRuns.reduce( + (sum, run) => sum + (run.judging_results.metrics.overallScore || 0), + 0, + ) / evalRuns.length, + average_duration_ms: + evalRuns.reduce((sum, run) => sum + run.durationMs, 0) / evalRuns.length, + total_runs: evalRuns.length, + successful_runs: 
evalRuns.filter((run) => !run.error).length, + failed_runs: evalRuns.filter((run) => run.error).length, + } +} + +// CLI handling +if (require.main === module) { + const args = process.argv.slice(2) + console.info( + 'Usage: bun run run-git-eval [eval-data-path] [output-dir] [agent-type]', + ) + + const evalDataPath = args[0] || 'git-evals/git-evals.json' + const outputDir = args[1] || 'git-evals' + const agentType = args[2] || AGENT_TYPE + + runGitEvals(evalDataPath, outputDir, agentType) + .then(() => { + console.log('Done!') + process.exit(0) + }) + .catch((err) => { + console.error('Error running evals:', err) + process.exit(1) + }) +} diff --git a/evals/git-evals/run-git-evals.ts b/evals/git-evals/run-git-evals.ts index 94a713c110..2f3859444e 100644 --- a/evals/git-evals/run-git-evals.ts +++ b/evals/git-evals/run-git-evals.ts @@ -2,26 +2,20 @@ import { execSync, fork } from 'child_process' import fs from 'fs' import path from 'path' -import { disableLiveUserInputCheck } from '@codebuff/backend/live-user-inputs' import { promptAiSdkStructured } from '@codebuff/backend/llm-apis/vercel-ai-sdk/ai-sdk' import { models } from '@codebuff/common/constants' -import { getDefaultConfig } from '@codebuff/common/json-config/default' -import { AgentTemplateTypes } from '@codebuff/common/types/session-state' -import { withTimeout } from '@codebuff/common/util/promise' import { generateCompactId } from '@codebuff/common/util/string' +import { withTimeout } from '@codebuff/common/util/promise' +import { CodebuffClient } from '../../sdk/src/client' import pLimit from 'p-limit' import { - createFileReadingMock, - loopMainPrompt, resetRepoToCommit, } from '../scaffolding' -import { createInitialSessionState } from '../test-setup' -import { judgeEvalRun } from './judge-git-eval' import { extractRepoNameFromUrl, setupTestRepo } from './setup-test-repo' +import { judgeEvalRun } from './judge-git-eval' import { AgentDecisionSchema } from './types' -import type { AgentStep } from 
'../scaffolding' import type { AgentDecision, CodebuffTrace, @@ -33,11 +27,6 @@ import type { EvalData, } from './types' -disableLiveUserInputCheck() - -// Try Gemini! -const AGENT_TYPE = AgentTemplateTypes.base - const EDIT_FILE_TOOL_NAMES = ['write_file', 'str_replace'] as const export async function runSingleEval( @@ -45,7 +34,7 @@ export async function runSingleEval( projectPath: string, clientSessionId: string, fingerprintId: string, - agentType: string = AGENT_TYPE, + agentType: string = 'base', ): Promise { const startTime = new Date() const trace: CodebuffTrace[] = [] @@ -70,17 +59,24 @@ export async function runSingleEval( process.on('uncaughtException', uncaughtHandler) process.on('unhandledRejection', unhandledHandler) + let client: CodebuffClient | undefined + try { // Reset to the commit before the target commit resetRepoToCommit(projectPath, `${evalCommit.sha}^`) - // Initialize agent state - createFileReadingMock(projectPath) - let sessionState = await createInitialSessionState(projectPath) + // Initialize SDK client + client = new CodebuffClient({ + cwd: projectPath, + onError: (error) => { + console.error('SDK error:', error.message) + }, + }) let currentDecision: AgentDecision = 'continue' let attempts = 0 const MAX_ATTEMPTS = 5 + let previousRun: any = undefined while (currentDecision === 'continue' && attempts < MAX_ATTEMPTS) { // Check for process-level errors @@ -88,7 +84,7 @@ export async function runSingleEval( throw new Error(processError) } - function renderAgentStep(step: AgentStep): string { + function renderAgentStep(step: any): string { const { response, toolCalls, toolResults } = step return [ `\`\`\`text_response\n${response}\n\`\`\``, @@ -96,6 +92,7 @@ export async function runSingleEval( `\`\`\`tool_results\n${JSON.stringify(toolResults, null, 2)}\n\`\`\``, ].join('\n\n') } + const renderedTrace = trace .map( ({ prompt, steps }) => @@ -143,9 +140,17 @@ Explain your reasoning in detail.`, ) } + console.log('Agent response:', 
JSON.stringify(agentResponse, null, 2)) console.log('Agent decision:', agentResponse.decision) console.log('Agent reasoning:', agentResponse.reasoning) + // Handle undefined decision + if (!agentResponse.decision) { + console.warn('Agent decision is undefined, defaulting to halt') + agentResponse.decision = 'halt' + agentResponse.reasoning = 'Agent failed to provide a decision' + } + if (agentResponse.decision === 'continue' && !agentResponse.next_prompt) { agentResponse.next_prompt = 'continue' } @@ -154,35 +159,77 @@ Explain your reasoning in detail.`, if (agentResponse.decision === 'continue') { const prompt = agentResponse.next_prompt! - // Use loopMainPrompt with timeout wrapper + // Use SDK client with timeout wrapper const codeBuffResult = await withTimeout( - loopMainPrompt({ - sessionState, + client.run({ + agent: agentType, prompt, - projectPath, - maxIterations: 20, - agentType: agentType as any, + previousRun, }), // Timeout after 30 minutes 60_000 * 30, ) - sessionState.mainAgentState = codeBuffResult.agentState - sessionState.mainAgentState.stepsRemaining = - getDefaultConfig().maxAgentSteps - trace.push({ prompt, steps: codeBuffResult.steps }) + // Convert SDK results to expected trace format + const toolResults = codeBuffResult.toolResults || [] + const steps = [] + + // Group tool results by response chunks if available + if (toolResults.length > 0) { + let currentResponse = '' + let currentToolCalls = [] + let currentToolResults = [] + + for (const result of toolResults) { + if (result.toolCall) { + currentToolCalls.push(result.toolCall) + } + currentToolResults.push(result) + if (result.output?.value) { + currentResponse += result.output.value + } + } + + steps.push({ + response: currentResponse || prompt, // Fallback to prompt if no response + toolCalls: currentToolCalls, + toolResults: currentToolResults + }) + } else { + // No tool results, likely just a text response + steps.push({ + response: 'Processing completed', + toolCalls: [], + 
toolResults: [] + }) + } + + trace.push({ prompt, steps }) + + // Update previousRun for next iteration + previousRun = codeBuffResult } currentDecision = agentResponse.decision attempts++ } } catch (e) { - console.error('Error in runSingleEval:', e) + console.error('Error in runSingleEvalSDK:', e) error = e instanceof Error ? `${e.message}\n${e.stack}` : `Unknown error: ${String(e)}` } finally { + // Close SDK client connection safely + if (client) { + try { + client.closeConnection() + } catch (closeError) { + // WebSocket might not be connected yet, so just log and continue + console.debug('Note: SDK client close error (likely not connected):', closeError) + } + } + // Clean up process-level error handlers process.removeListener('uncaughtException', uncaughtHandler) process.removeListener('unhandledRejection', unhandledHandler) @@ -304,12 +351,6 @@ function getCodebuffFileStates( return fileStates } -export function mockRunGitEvals(path: string) { - const result = JSON.parse(fs.readFileSync(path, 'utf-8')) as FullEvalLog - - return result -} - // Global concurrency limiter that can be shared across multiple repository evaluations let globalConcurrencyLimiter: ReturnType | null = null @@ -320,7 +361,7 @@ export function setGlobalConcurrencyLimit(limit: number) { export async function runGitEvals( evalDataPath: string, outputDir: string, - agentType: string = AGENT_TYPE, + agentType: string = 'base', limit?: number, logToStdout: boolean = false, ): Promise { @@ -561,12 +602,12 @@ function calculateOverallMetrics(evalRuns: EvalRunJudged[]) { if (require.main === module) { const args = process.argv.slice(2) console.info( - 'Usage: bun run run-git-eval [eval-data-path] [output-dir] [agent-type]', + 'Usage: bun run run-git-eval-sdk [eval-data-path] [output-dir] [agent-type]', ) const evalDataPath = args[0] || 'git-evals/git-evals.json' const outputDir = args[1] || 'git-evals' - const agentType = args[2] || AGENT_TYPE + const agentType = args[2] || 'base' 
runGitEvals(evalDataPath, outputDir, agentType) .then(() => { @@ -577,4 +618,4 @@ if (require.main === module) { console.error('Error running evals:', err) process.exit(1) }) -} +} \ No newline at end of file diff --git a/evals/git-evals/run-single-eval-legacy.ts b/evals/git-evals/run-single-eval-legacy.ts new file mode 100644 index 0000000000..d9feeea62e --- /dev/null +++ b/evals/git-evals/run-single-eval-legacy.ts @@ -0,0 +1,238 @@ +#!/usr/bin/env bun + +import fs from 'fs' + +import { generateCompactId } from '@codebuff/common/util/string' +import { + setProjectRoot, + setWorkingDirectory, +} from '@codebuff/npm-app/project-files' +import { recreateShell } from '@codebuff/npm-app/terminal/run-command' +import { Command, Flags } from '@oclif/core' + +import { createFileReadingMock } from '../scaffolding' +import { setupTestEnvironmentVariables } from '../test-setup' +import { runSingleEval } from './run-git-evals' +import { extractRepoNameFromUrl, setupTestRepo } from './setup-test-repo' + +import type { EvalCommit, EvalData, ModelConfig } from './types' + +class RunSingleEvalCommand extends Command { + static description = 'Run a single git evaluation task' + + static examples = [ + '$ bun run-single-eval --eval-file eval-codebuff.json --commit-index 0', + '$ bun run-single-eval --eval-file eval-manifold.json --commit-sha abc123', + '$ bun run-single-eval --eval-file eval-codebuff.json --commit-index 5 --output results.json', + ] + + static flags = { + 'eval-file': Flags.string({ + char: 'f', + description: 'Path to the eval JSON file (e.g., eval-codebuff.json)', + required: true, + }), + 'commit-index': Flags.integer({ + char: 'i', + description: 'Index of the commit to evaluate (0-based)', + }), + 'commit-sha': Flags.string({ + char: 's', + description: 'SHA of the specific commit to evaluate', + }), + output: Flags.string({ + char: 'o', + description: 'Output file path for results (optional)', + }), + 'model-config': Flags.string({ + char: 'm', + description: 
'JSON string with model configuration (optional)', + default: '{}', + }), + help: Flags.help({ char: 'h' }), + } + + async run(): Promise { + const { flags } = await this.parse(RunSingleEvalCommand) + + // Validate that either commit-index or commit-sha is provided + if ( + !flags['commit-index'] && + flags['commit-index'] !== 0 && + !flags['commit-sha'] + ) { + this.error('Either --commit-index or --commit-sha must be provided') + } + + if (flags['commit-index'] !== undefined && flags['commit-sha']) { + this.error('Cannot specify both --commit-index and --commit-sha') + } + + await runSingleEvalTask(flags) + } +} + +async function runSingleEvalTask(options: { + 'eval-file': string + 'commit-index'?: number + 'commit-sha'?: string + output?: string + 'model-config': string +}): Promise { + const { + 'eval-file': evalFile, + 'commit-index': commitIndex, + 'commit-sha': commitSha, + output: outputFile, + 'model-config': modelConfigStr, + } = options + + console.log('πŸš€ Starting single git eval...') + console.log(`Eval file: ${evalFile}`) + + // Load eval data + if (!fs.existsSync(evalFile)) { + throw new Error(`Eval file not found: ${evalFile}`) + } + + const evalData = JSON.parse(fs.readFileSync(evalFile, 'utf-8')) as EvalData + console.log(`Repository: ${evalData.repoUrl}`) + console.log(`Total commits available: ${evalData.evalCommits.length}`) + + // Find the specific commit to evaluate + let evalCommit: EvalCommit + if (commitSha) { + const found = evalData.evalCommits.find((commit) => + commit.sha.startsWith(commitSha), + ) + if (!found) { + throw new Error(`Commit with SHA ${commitSha} not found in eval data`) + } + evalCommit = found + console.log(`Selected commit by SHA: ${commitSha}`) + } else if (commitIndex !== undefined) { + if (commitIndex < 0 || commitIndex >= evalData.evalCommits.length) { + throw new Error( + `Commit index ${commitIndex} is out of range (0-${evalData.evalCommits.length - 1})`, + ) + } + evalCommit = 
evalData.evalCommits[commitIndex] + console.log(`Selected commit by index: ${commitIndex}`) + } else { + throw new Error('No commit specified') + } + + console.log( + `Commit: ${evalCommit.sha.slice(0, 8)} - ${evalCommit.spec.split('\n')[0]}`, + ) + + // Parse model config + let modelConfig: ModelConfig + try { + modelConfig = JSON.parse(modelConfigStr) + } catch (error) { + throw new Error(`Invalid model config JSON: ${error}`) + } + + // Setup test environment + console.log('πŸ”§ Setting up test environment...') + setupTestEnvironmentVariables() + + // Setup test repository + const testRepoName = + evalData.testRepoName || extractRepoNameFromUrl(evalData.repoUrl) + console.log(`πŸ“ Setting up test repository: ${testRepoName}`) + + const projectPath = await setupTestRepo( + evalData.repoUrl, + testRepoName, + evalCommit.sha, + ) + console.log(`Repository cloned to: ${projectPath}`) + + // Setup project context + setProjectRoot(projectPath) + createFileReadingMock(projectPath) + recreateShell(projectPath) + setWorkingDirectory(projectPath) + + // Generate session identifiers + const clientSessionId = generateCompactId() + const fingerprintId = generateCompactId() + + console.log('πŸ€– Running evaluation...') + console.log( + `Spec: ${evalCommit.spec.slice(0, 100)}${evalCommit.spec.length > 100 ? '...' 
: ''}`, + ) + + const startTime = Date.now() + + try { + // Run the evaluation + const result = await runSingleEval( + evalCommit, + projectPath, + clientSessionId, + fingerprintId, + ) + + const duration = Date.now() - startTime + console.log(`βœ… Evaluation completed in ${(duration / 1000).toFixed(1)}s`) + + // Display results + if (result.error) { + console.log(`❌ Error occurred: ${result.error}`) + } else { + console.log('πŸ“Š Results:') + if (result.judging_results) { + const metrics = result.judging_results.metrics + console.log(` Overall Score: ${metrics.overallScore.toFixed(2)}/10`) + console.log(` Completion: ${metrics.completionScore.toFixed(2)}/10`) + console.log(` Efficiency: ${metrics.efficiencyScore.toFixed(2)}/10`) + console.log(` Code Quality: ${metrics.codeQualityScore.toFixed(2)}/10`) + + if (result.judging_results.strengths.length > 0) { + console.log(' Strengths:') + result.judging_results.strengths.forEach((strength) => { + console.log(` β€’ ${strength}`) + }) + } + + if (result.judging_results.weaknesses.length > 0) { + console.log(' Weaknesses:') + result.judging_results.weaknesses.forEach((weakness) => { + console.log(` β€’ ${weakness}`) + }) + } + } + + console.log(` Files modified: ${result.fileStates.length}`) + console.log(` Conversation turns: ${result.trace.length}`) + } + + // Save results if output file specified + if (outputFile) { + fs.writeFileSync(outputFile, JSON.stringify(result, null, 2)) + console.log(`πŸ’Ύ Results saved to: ${outputFile}`) + } + + process.exit(0) + } catch (error) { + const duration = Date.now() - startTime + console.error( + `❌ Evaluation failed after ${(duration / 1000).toFixed(1)}s:`, + error, + ) + process.exit(1) + } +} + +// CLI handling +if (require.main === module) { + RunSingleEvalCommand.run().catch((err) => { + console.error('Error running single eval:', err) + process.exit(1) + }) +} + +export { RunSingleEvalCommand, runSingleEvalTask } diff --git 
a/evals/git-evals/run-single-eval-process-legacy.ts b/evals/git-evals/run-single-eval-process-legacy.ts new file mode 100644 index 0000000000..25148d3e1b --- /dev/null +++ b/evals/git-evals/run-single-eval-process-legacy.ts @@ -0,0 +1,80 @@ +import fs from 'fs' + +import { + setProjectRoot, + setWorkingDirectory, +} from '@codebuff/npm-app/project-files' +import { recreateShell } from '@codebuff/npm-app/terminal/run-command' + +import { createFileReadingMock } from '../scaffolding' +import { setupTestEnvironmentVariables } from '../test-setup' +import { runSingleEval } from './run-git-evals' + +import type { EvalCommit } from './types' + +async function main() { + const [ + evalCommitFilePath, + projectPath, + clientSessionId, + fingerprintId, + agentType, + ] = process.argv.slice(2) + + if ( + !evalCommitFilePath || + !projectPath || + !clientSessionId || + !fingerprintId || + !agentType + ) { + console.error('Missing required arguments for single eval process') + process.exit(1) + } + + let evalCommit: EvalCommit + try { + const evalCommitStr = fs.readFileSync(evalCommitFilePath, 'utf-8') + evalCommit = JSON.parse(evalCommitStr) + } catch (error) { + console.error('Failed to read evalCommit from file:', error) + process.exit(1) + } + + try { + // Setup environment for this process + setProjectRoot(projectPath) + setupTestEnvironmentVariables() + createFileReadingMock(projectPath) + recreateShell(projectPath) + setWorkingDirectory(projectPath) + + const result = await runSingleEval( + evalCommit, + projectPath, + clientSessionId, + fingerprintId, + agentType, + ) + console.log('Final result:', { result }) + if (process.send) { + process.send({ type: 'result', result }) + } + } catch (error) { + if (process.send) { + process.send({ + type: 'error', + error: + error instanceof Error + ? 
{ message: error.message, stack: error.stack } + : { message: String(error) }, + }) + } + } finally { + setTimeout(() => { + process.exit(0) + }, 2000) + } +} + +main() diff --git a/evals/git-evals/run-single-eval-process.ts b/evals/git-evals/run-single-eval-process.ts index 25148d3e1b..f7d373db64 100644 --- a/evals/git-evals/run-single-eval-process.ts +++ b/evals/git-evals/run-single-eval-process.ts @@ -1,80 +1,41 @@ -import fs from 'fs' - -import { - setProjectRoot, - setWorkingDirectory, -} from '@codebuff/npm-app/project-files' -import { recreateShell } from '@codebuff/npm-app/terminal/run-command' +#!/usr/bin/env bun -import { createFileReadingMock } from '../scaffolding' -import { setupTestEnvironmentVariables } from '../test-setup' +import fs from 'fs' import { runSingleEval } from './run-git-evals' - import type { EvalCommit } from './types' -async function main() { - const [ - evalCommitFilePath, - projectPath, - clientSessionId, - fingerprintId, - agentType, - ] = process.argv.slice(2) - - if ( - !evalCommitFilePath || - !projectPath || - !clientSessionId || - !fingerprintId || - !agentType - ) { - console.error('Missing required arguments for single eval process') - process.exit(1) - } - - let evalCommit: EvalCommit - try { - const evalCommitStr = fs.readFileSync(evalCommitFilePath, 'utf-8') - evalCommit = JSON.parse(evalCommitStr) - } catch (error) { - console.error('Failed to read evalCommit from file:', error) - process.exit(1) - } +process.on('message', () => {}) +async function main() { try { - // Setup environment for this process - setProjectRoot(projectPath) - setupTestEnvironmentVariables() - createFileReadingMock(projectPath) - recreateShell(projectPath) - setWorkingDirectory(projectPath) + const [tempEvalCommitPath, projectPath, clientSessionId, fingerprintId, agentType] = process.argv.slice(2) + + if (!tempEvalCommitPath || !projectPath || !clientSessionId || !fingerprintId) { + throw new Error('Missing required arguments: tempEvalCommitPath, 
projectPath, clientSessionId, fingerprintId') + } + // Load eval commit from temp file + const evalCommit = JSON.parse(fs.readFileSync(tempEvalCommitPath, 'utf-8')) as EvalCommit + const result = await runSingleEval( evalCommit, projectPath, clientSessionId, fingerprintId, - agentType, + agentType || 'base' ) - console.log('Final result:', { result }) + + // Send result back to parent process if (process.send) { process.send({ type: 'result', result }) } } catch (error) { + console.error('Error in run-single-eval-process-sdk:', error) if (process.send) { - process.send({ - type: 'error', - error: - error instanceof Error - ? { message: error.message, stack: error.stack } - : { message: String(error) }, - }) + process.send({ type: 'error', error: { message: (error as Error).message, stack: (error as Error).stack } }) } - } finally { - setTimeout(() => { - process.exit(0) - }, 2000) + process.exit(1) } } -main() +main() \ No newline at end of file diff --git a/evals/git-evals/run-single-eval-simple-sdk.ts b/evals/git-evals/run-single-eval-simple-sdk.ts new file mode 100755 index 0000000000..5345291da7 --- /dev/null +++ b/evals/git-evals/run-single-eval-simple-sdk.ts @@ -0,0 +1,256 @@ +#!/usr/bin/env bun + +import fs from 'fs' +import { execSync } from 'child_process' +import path from 'path' + +import { generateCompactId } from '@codebuff/common/util/string' +import { CodebuffClient } from '../../sdk/src/client' +import { Command, Flags } from '@oclif/core' + +import { extractRepoNameFromUrl, setupTestRepo } from './setup-test-repo' +import { resetRepoToCommit } from '../scaffolding' + +import type { EvalCommit, EvalData, FileState } from './types' + +class RunSingleEvalSimpleSDKCommand extends Command { + static description = 'Run a single git evaluation task using the Codebuff SDK (simplified version)' + + static examples = [ + '$ bun run-single-eval-simple-sdk --eval-file eval-codebuff.json --commit-index 0', + '$ bun run-single-eval-simple-sdk --eval-file 
eval-manifold.json --commit-sha abc123', + ] + + static flags = { + 'eval-file': Flags.string({ + char: 'f', + description: 'Path to the eval JSON file (e.g., eval-codebuff.json)', + required: true, + }), + 'commit-index': Flags.integer({ + char: 'i', + description: 'Index of the commit to evaluate (0-based)', + }), + 'commit-sha': Flags.string({ + char: 's', + description: 'SHA of the specific commit to evaluate', + }), + output: Flags.string({ + char: 'o', + description: 'Output file path for results (optional)', + }), + help: Flags.help({ char: 'h' }), + } + + async run(): Promise { + const { flags } = await this.parse(RunSingleEvalSimpleSDKCommand) + + // Validate that either commit-index or commit-sha is provided + if ( + !flags['commit-index'] && + flags['commit-index'] !== 0 && + !flags['commit-sha'] + ) { + this.error('Either --commit-index or --commit-sha must be provided') + } + + if (flags['commit-index'] !== undefined && flags['commit-sha']) { + this.error('Cannot specify both --commit-index and --commit-sha') + } + + await runSingleEvalTaskSimpleSDK(flags) + } +} + +async function runSingleEvalTaskSimpleSDK(options: { + 'eval-file': string + 'commit-index'?: number + 'commit-sha'?: string + output?: string +}): Promise { + const { + 'eval-file': evalFile, + 'commit-index': commitIndex, + 'commit-sha': commitSha, + output: outputFile, + } = options + + console.log('πŸš€ Starting single git eval (Simple SDK mode)...') + console.log(`Eval file: ${evalFile}`) + + // Load eval data + if (!fs.existsSync(evalFile)) { + throw new Error(`Eval file not found: ${evalFile}`) + } + + const evalData = JSON.parse(fs.readFileSync(evalFile, 'utf-8')) as EvalData + console.log(`Repository: ${evalData.repoUrl}`) + console.log(`Total commits available: ${evalData.evalCommits.length}`) + + // Find the specific commit to evaluate + let evalCommit: EvalCommit + if (commitSha) { + const found = evalData.evalCommits.find((commit) => + commit.sha.startsWith(commitSha), + ) + if 
(!found) { + throw new Error(`Commit with SHA ${commitSha} not found in eval data`) + } + evalCommit = found + console.log(`Selected commit by SHA: ${commitSha}`) + } else if (commitIndex !== undefined) { + if (commitIndex < 0 || commitIndex >= evalData.evalCommits.length) { + throw new Error( + `Commit index ${commitIndex} is out of range (0-${evalData.evalCommits.length - 1})`, + ) + } + evalCommit = evalData.evalCommits[commitIndex] + console.log(`Selected commit by index: ${commitIndex}`) + } else { + throw new Error('No commit specified') + } + + console.log( + `Commit: ${evalCommit.sha.slice(0, 8)} - ${evalCommit.spec.split('\n')[0]}`, + ) + + // Setup test repository + const testRepoName = + evalData.testRepoName || extractRepoNameFromUrl(evalData.repoUrl) + console.log(`πŸ“ Setting up test repository: ${testRepoName}`) + + const projectPath = await setupTestRepo( + evalData.repoUrl, + testRepoName, + evalCommit.sha, + ) + console.log(`Repository cloned to: ${projectPath}`) + + console.log('πŸ€– Running evaluation with SDK...') + console.log( + `Spec: ${evalCommit.spec.slice(0, 100)}${evalCommit.spec.length > 100 ? '...' 
: ''}`, + ) + + const startTime = Date.now() + let error: string | undefined + let fileStates: FileState[] = [] + + try { + // Reset to the commit before the target commit + resetRepoToCommit(projectPath, `${evalCommit.sha}^`) + + // Initialize SDK client + const client = new CodebuffClient({ + cwd: projectPath, + onError: (error) => { + console.error('SDK error:', error.message) + }, + }) + + console.log('Running Codebuff with the specification...') + + // Run CodeBuff directly with the specification + const result = await client.run({ + agent: 'base', + prompt: evalCommit.spec, + }) + + console.log('SDK run completed successfully') + console.log(`Tool results: ${result.toolResults?.length || 0}`) + + // Extract file changes from tool results + const changedFiles = new Set() + if (result.toolResults) { + for (const toolResult of result.toolResults) { + if (toolResult.toolCall && + (toolResult.toolCall.toolName === 'write_file' || toolResult.toolCall.toolName === 'str_replace') && + 'path' in toolResult.toolCall.input) { + changedFiles.add(toolResult.toolCall.input.path as string) + } + } + } + + // Capture file states + fileStates = Array.from(changedFiles).map(filePath => { + // Capture "after" state + const fullPath = path.join(projectPath, filePath) + let postContent: string + try { + postContent = fs.existsSync(fullPath) + ? 
fs.readFileSync(fullPath, 'utf-8') + : '[FILE_NOT_FOUND_POST_RUN]' + } catch (e) { + console.error(`Error reading file ${fullPath} for after state:`, e) + postContent = '[ERROR_READING_AFTER_STATE]' + } + + // Capture "before" state + let preContent: string + try { + preContent = execSync(`git show ${evalCommit.sha}^:"${filePath}"`, { + cwd: projectPath, + stdio: ['ignore', 'pipe', 'ignore'], + }).toString() + } catch (e) { + preContent = '[FILE_DID_NOT_EXIST_PRIOR_TO_CODEBUFF_CHANGES]' + } + + return { path: filePath, preContent, postContent } + }) + + // Close connection safely + try { + client.closeConnection() + } catch (closeError) { + console.debug('Note: SDK client close error (likely not connected):', closeError) + } + + } catch (e) { + console.error('Error in evaluation:', e) + error = e instanceof Error ? `${e.message}\n${e.stack}` : `Unknown error: ${String(e)}` + } + + const duration = Date.now() - startTime + console.log(`βœ… Evaluation completed in ${(duration / 1000).toFixed(1)}s`) + + // Create simple result structure (without judging for now) + const result = { + eval_commit: evalCommit, + error, + fileStates, + durationMs: duration, + simplified: true, // Flag to indicate this is the simplified SDK version + } + + // Display results + if (error) { + console.log(`❌ Error occurred: ${error}`) + } else { + console.log('πŸ“Š Results:') + console.log(` Files modified: ${fileStates.length}`) + if (fileStates.length > 0) { + console.log(' Modified files:') + fileStates.forEach(file => { + console.log(` β€’ ${file.path}`) + }) + } + } + + // Save results if output file specified + if (outputFile) { + fs.writeFileSync(outputFile, JSON.stringify(result, null, 2)) + console.log(`πŸ’Ύ Results saved to: ${outputFile}`) + } + + process.exit(error ? 
1 : 0) +} + +// CLI handling +if (require.main === module) { + RunSingleEvalSimpleSDKCommand.run().catch((err) => { + console.error('Error running simple SDK eval:', err) + process.exit(1) + }) +} + +export { RunSingleEvalSimpleSDKCommand, runSingleEvalTaskSimpleSDK } \ No newline at end of file diff --git a/evals/git-evals/run-single-eval.ts b/evals/git-evals/run-single-eval.ts old mode 100644 new mode 100755 index d9feeea62e..a9811e0ff5 --- a/evals/git-evals/run-single-eval.ts +++ b/evals/git-evals/run-single-eval.ts @@ -3,22 +3,16 @@ import fs from 'fs' import { generateCompactId } from '@codebuff/common/util/string' -import { - setProjectRoot, - setWorkingDirectory, -} from '@codebuff/npm-app/project-files' -import { recreateShell } from '@codebuff/npm-app/terminal/run-command' +import { CodebuffClient } from '../../sdk/src/client' import { Command, Flags } from '@oclif/core' -import { createFileReadingMock } from '../scaffolding' -import { setupTestEnvironmentVariables } from '../test-setup' -import { runSingleEval } from './run-git-evals' import { extractRepoNameFromUrl, setupTestRepo } from './setup-test-repo' +import { runSingleEval } from './run-git-evals' import type { EvalCommit, EvalData, ModelConfig } from './types' class RunSingleEvalCommand extends Command { - static description = 'Run a single git evaluation task' + static description = 'Run a single git evaluation task using the Codebuff SDK' static examples = [ '$ bun run-single-eval --eval-file eval-codebuff.json --commit-index 0', @@ -134,10 +128,6 @@ async function runSingleEvalTask(options: { throw new Error(`Invalid model config JSON: ${error}`) } - // Setup test environment - console.log('πŸ”§ Setting up test environment...') - setupTestEnvironmentVariables() - // Setup test repository const testRepoName = evalData.testRepoName || extractRepoNameFromUrl(evalData.repoUrl) @@ -150,12 +140,6 @@ async function runSingleEvalTask(options: { ) console.log(`Repository cloned to: ${projectPath}`) - 
// Setup project context - setProjectRoot(projectPath) - createFileReadingMock(projectPath) - recreateShell(projectPath) - setWorkingDirectory(projectPath) - // Generate session identifiers const clientSessionId = generateCompactId() const fingerprintId = generateCompactId() @@ -168,7 +152,7 @@ async function runSingleEvalTask(options: { const startTime = Date.now() try { - // Run the evaluation + // Run the evaluation using SDK const result = await runSingleEval( evalCommit, projectPath, @@ -235,4 +219,4 @@ if (require.main === module) { }) } -export { RunSingleEvalCommand, runSingleEvalTask } +export { RunSingleEvalCommand, runSingleEvalTask } \ No newline at end of file diff --git a/evals/scripts/run-remote-parameterized.sh b/evals/scripts/run-remote-parameterized.sh new file mode 100755 index 0000000000..78fb9e6b36 --- /dev/null +++ b/evals/scripts/run-remote-parameterized.sh @@ -0,0 +1,92 @@ +#!/bin/bash +set -euo pipefail + +# Logging function with timestamps +log() { + echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" +} + +# Parameters +MODE="${1:-seed}" # 'seed' (Drizzle) or 'bypass' +EVAL_FILE="${2:-eval-codebuff.json}" # eval file name +COMMIT_INDEX="${3:-0}" # commit index + +log "πŸš€ Remote Evaluation Infrastructure Starting (SDK Mode)" +log "πŸ“‹ Parameters:" +log " Mode: $MODE" +log " Eval File: $EVAL_FILE" +log " Commit Index: $COMMIT_INDEX" +log " Working Directory: $(pwd)" +log " Script Directory: $(dirname "$0")" + +export CODEBUFF_WEBSOCKET_URL="ws://127.0.0.1:4242/ws" +export CODEBUFF_SKIP_BINARY_CHECK=1 + +# Start services +log "πŸ“¦ Starting Docker services..." +log " Compose file: $(dirname "$0")/../docker-compose.evals.yml" +docker compose -f "$(dirname "$0")/../docker-compose.evals.yml" up -d --build db backend + +# Wait for backend to be ready +log "⏳ Waiting for backend to be ready..." +START_TIME=$(date +%s) +"$(dirname "$0")/wait-for-healthz.sh" "http://127.0.0.1:4242/healthz" 90 || { + log '❌ Health check failed; dumping logs...' 
+ log 'πŸ“‹ Backend logs:' + docker compose -f "$(dirname "$0")/../docker-compose.evals.yml" logs backend --tail=200 || true + log 'πŸ“‹ Database logs:' + docker compose -f "$(dirname "$0")/../docker-compose.evals.yml" logs db --tail=50 || true + exit 1 +} +READY_TIME=$(date +%s) +log "βœ… Backend ready in $((READY_TIME - START_TIME)) seconds" + +# Set up authentication +if [ "$MODE" = "bypass" ]; then + log "πŸ” Setting up bypass authentication..." + export CODEBUFF_TEST_AUTH_TOKEN="$(openssl rand -hex 16)" + export CODEBUFF_API_KEY="$CODEBUFF_TEST_AUTH_TOKEN" + log " Generated test auth token: ${CODEBUFF_TEST_AUTH_TOKEN:0:8}..." +else + log "🌱 Setting up database seed authentication..." + log " Running seeder container..." + SEED_START=$(date +%s) + KEY_LINE=$(docker compose -f "$(dirname "$0")/../docker-compose.evals.yml" run --rm seeder | tail -n1) + export CODEBUFF_API_KEY="${KEY_LINE#CODEBUFF_API_KEY=}" + SEED_END=$(date +%s) + log " Seeding completed in $((SEED_END - SEED_START)) seconds" + log " Extracted API key: ${CODEBUFF_API_KEY:0:8}..." +fi + +# Run evaluation (SDK mode only) +log "πŸ€– Starting evaluation (SDK mode)..." +log " File: evals/git-evals/$EVAL_FILE" +log " Commit Index: $COMMIT_INDEX" +log " Using: CodebuffClient from SDK" +log " Environment: CODEBUFF_WEBSOCKET_URL=$CODEBUFF_WEBSOCKET_URL" +log " This may take 10-30 minutes depending on task complexity..." + +EVAL_START=$(date +%s) +bun evals/git-evals/run-single-eval.ts \ + --eval-file="evals/git-evals/$EVAL_FILE" \ + --commit-index="$COMMIT_INDEX" + +EVAL_EXIT_CODE=$? +EVAL_END=$(date +%s) +EVAL_DURATION=$((EVAL_END - EVAL_START)) + +if [ $EVAL_EXIT_CODE -eq 0 ]; then + log "βœ… Evaluation completed successfully in ${EVAL_DURATION} seconds!" 
+else + log "❌ Evaluation failed with exit code $EVAL_EXIT_CODE after ${EVAL_DURATION} seconds" + log "πŸ“‹ Final backend logs:" + docker compose -f "$(dirname "$0")/../docker-compose.evals.yml" logs backend --tail=100 || true +fi + +# Cleanup +log "🧹 Cleaning up Docker containers..." +docker compose -f "$(dirname "$0")/../docker-compose.evals.yml" down -v + +TOTAL_DURATION=$((EVAL_END - START_TIME)) +log "🏁 Remote evaluation finished in ${TOTAL_DURATION} total seconds (exit code: $EVAL_EXIT_CODE)" +exit $EVAL_EXIT_CODE \ No newline at end of file diff --git a/evals/scripts/run-remote.sh b/evals/scripts/run-remote.sh new file mode 100755 index 0000000000..6954d9559f --- /dev/null +++ b/evals/scripts/run-remote.sh @@ -0,0 +1,66 @@ +#!/bin/bash +set -euo pipefail + +# Logging function +log() { + echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" +} + +MODE="${1:-seed}" # 'seed' (Drizzle) or 'bypass' +log "πŸš€ Starting remote evaluation infrastructure (SDK mode)" +log "Mode: $MODE" + +export CODEBUFF_WEBSOCKET_URL="ws://127.0.0.1:4242/ws" +export CODEBUFF_SKIP_BINARY_CHECK=1 +log "Environment variables set:" +log " CODEBUFF_WEBSOCKET_URL=$CODEBUFF_WEBSOCKET_URL" +log " CODEBUFF_SKIP_BINARY_CHECK=$CODEBUFF_SKIP_BINARY_CHECK" + +# Start services +log "πŸ“¦ Starting Docker services (db + backend)..." +docker compose -f "$(dirname "$0")/../docker-compose.evals.yml" up -d --build db backend + +log "⏳ Waiting for backend health check..." +"$(dirname "$0")/wait-for-healthz.sh" "http://127.0.0.1:4242/healthz" 90 || { + log '❌ Health check failed; dumping backend logs...' + docker compose -f "$(dirname "$0")/../docker-compose.evals.yml" logs backend --tail=200 || true + log '❌ Dumping database logs...' + docker compose -f "$(dirname "$0")/../docker-compose.evals.yml" logs db --tail=50 || true + exit 1 +} + +if [ "$MODE" = "bypass" ]; then + log "πŸ” Setting up bypass authentication..." 
+ export CODEBUFF_TEST_AUTH_TOKEN="$(openssl rand -hex 16)" + export CODEBUFF_API_KEY="$CODEBUFF_TEST_AUTH_TOKEN" + log " Generated test auth token: ${CODEBUFF_TEST_AUTH_TOKEN:0:8}..." +else + log "🌱 Setting up database seed authentication..." + log " Running seeder container..." + KEY_LINE=$(docker compose -f "$(dirname "$0")/../docker-compose.evals.yml" run --rm seeder | tail -n1) + export CODEBUFF_API_KEY="${KEY_LINE#CODEBUFF_API_KEY=}" + log " Extracted API key: ${CODEBUFF_API_KEY:0:8}..." +fi + +log "πŸ€– Starting evaluation (SDK mode)..." +log " Eval file: evals/git-evals/eval-codebuff.json" +log " Commit index: 0" +log " Using: CodebuffClient from SDK" +log " This may take 10-30 minutes depending on task complexity..." + +bun evals/git-evals/run-single-eval.ts \ + --eval-file="evals/git-evals/eval-codebuff.json" \ + --commit-index=0 + +EVAL_EXIT_CODE=$? +if [ $EVAL_EXIT_CODE -eq 0 ]; then + log "βœ… Evaluation completed successfully!" +else + log "❌ Evaluation failed with exit code $EVAL_EXIT_CODE" +fi + +log "🧹 Cleaning up Docker containers..." +docker compose -f "$(dirname "$0")/../docker-compose.evals.yml" down -v + +log "🏁 Remote evaluation finished (exit code: $EVAL_EXIT_CODE)" +exit $EVAL_EXIT_CODE \ No newline at end of file diff --git a/evals/scripts/wait-for-healthz.sh b/evals/scripts/wait-for-healthz.sh new file mode 100755 index 0000000000..6538dcf4a8 --- /dev/null +++ b/evals/scripts/wait-for-healthz.sh @@ -0,0 +1,32 @@ +#!/bin/bash +set -euo pipefail + +# Logging function +log() { + echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" +} + +URL="$1"; TIMEOUT="${2:-60}" +log "πŸ₯ Health check starting" +log " URL: $URL" +log " Timeout: ${TIMEOUT}s" + +for i in $(seq 1 "$TIMEOUT"); do + if curl -fsS "$URL" >/dev/null 2>&1; then + log "βœ… Backend is healthy and ready!" + exit 0 + fi + + # Log every 10 seconds to avoid spam + if [ $((i % 10)) -eq 0 ] || [ $i -le 5 ]; then + log "⏳ Waiting for backend... 
(${i}s / ${TIMEOUT}s)" + fi + + sleep 1 +done + +log "❌ Backend health check failed after $TIMEOUT seconds" >&2 +log "πŸ” Final health check attempt..." +RESPONSE=$(curl -s -w "HTTP_CODE:%{http_code}" "$URL" 2>/dev/null || echo "CURL_FAILED") +log " Response: $RESPONSE" +exit 1 \ No newline at end of file diff --git a/evals/seeds/seed-evals.ts b/evals/seeds/seed-evals.ts new file mode 100644 index 0000000000..54d92d057f --- /dev/null +++ b/evals/seeds/seed-evals.ts @@ -0,0 +1,85 @@ +import 'dotenv/config' +import { drizzle } from 'drizzle-orm/node-postgres' +import { Client } from 'pg' +import crypto from 'crypto' +import { + user, + session, +} from '../../common/src/db/schema' + +// Logging function +function log(message: string) { + const timestamp = new Date().toISOString() + console.error(`[${timestamp}] ${message}`) +} + +async function main() { + log('🌱 Starting database seeding for evaluations') + + const DATABASE_URL = process.env.DATABASE_URL! + log(`πŸ“Š Connecting to database: ${DATABASE_URL.replace(/\/\/.*@/, '//***@')}`) + + const client = new Client({ connectionString: DATABASE_URL }) + const startTime = Date.now() + + try { + await client.connect() + log('βœ… Database connection established') + + const db = drizzle(client) + + // deterministic IDs for idempotency + const userId = 'test-user' + const email = 'evals@test.local' + const token = crypto.randomUUID() + + log('πŸ‘€ Creating test user...') + log(` User ID: ${userId}`) + log(` Email: ${email}`) + + // upsert user + await db + .insert(user) + .values({ + id: userId, + email, + name: 'Test User', + created_at: new Date(), + }) + .onConflictDoNothing() + + log('βœ… Test user created/updated') + + log('πŸ”‘ Creating session token...') + log(` Token: ${token.substring(0, 8)}...`) + + // upsert session / api token row + await db + .insert(session) + .values({ + sessionToken: token, + userId, + expires: new Date(Date.now() + 24 * 60 * 60 * 1000), // 24 hours + }) + .onConflictDoNothing() + + 
log('βœ… Session token created/updated') + const duration = Date.now() - startTime + log(`🏁 Database seeding completed in ${duration}ms`) + + // Output the API key for the runner script to capture + console.log(`CODEBUFF_API_KEY=${token}`) + + } catch (error) { + log(`❌ Database seeding failed: ${error}`) + throw error + } finally { + await client.end() + log('πŸ”Œ Database connection closed') + } +} + +main().catch((err) => { + console.error(err) + process.exit(1) +}) \ No newline at end of file diff --git a/remote-eval-infra-plan.md b/remote-eval-infra-plan.md new file mode 100644 index 0000000000..283c2bd33d --- /dev/null +++ b/remote-eval-infra-plan.md @@ -0,0 +1,386 @@ +Briefing (Read First) +- Monorepo + Bun basics + - Monorepo with TypeScript + Bun everywhere. Local runs often use `.bin/bun`, which can inject dev env (`NEXT_PUBLIC_CB_ENVIRONMENT=dev`). Prefer plain `bun` in CI to avoid unintended dev defaults. + - The SDK runner is at `scripts/git-evals/run-single-eval.ts` and imports from `../../sdk/src`. It streams conversation/events to console. + +- SDK connectivity + auth + - Today the SDK hard-requires the `codebuff` CLI in PATH (constructor checks with `which/where`). Install with `npm i -g codebuff` OR implement the skip flag below. + - βœ… IMPLEMENTED: Add an optional skip flag (recommended): if `CODEBUFF_SKIP_BINARY_CHECK=1`, skip the CLI presence check. + - βœ… IMPLEMENTED: Default WS URL depends on env. In CI/remote, explicitly set `CODEBUFF_WEBSOCKET_URL=ws://127.0.0.1:4242/ws` so the SDK connects to your ephemeral backend (and not prod/dev defaults). + - βœ… IMPLEMENTED: Provide an API key as `CODEBUFF_API_KEY`. In seed mode, this comes from Drizzle seed output. In bypass mode, reuse `CODEBUFF_TEST_AUTH_TOKEN`. 
+ - ✅ NEW: SDK-based evaluation scripts created: `run-single-eval-sdk.ts`, `run-single-eval-simple-sdk.ts`, and `run-git-evals-sdk.ts` + +- Docker containment (backend stays Docker-agnostic) + - All infra (Compose, Dockerfile, scripts, seeding) lives under `evals/`. The backend does not reference Docker. + - Compose binds backend to loopback only (`127.0.0.1:4242`), so nothing is publicly exposed in CI. + +- Readiness + flake control + - Don’t just wait for HTTP bind—wait for `/healthz` to return 200 AND ensure WS is accepting connections. Use a curl loop with a strict timeout (60–90s) or enhance `/healthz` to signal WS readiness. + - Stream backend logs on failure to diagnose quickly. + +- Seeding strategy: Drizzle (preferred) + - Seed lives in `evals/seeds/seed-evals.ts` and imports tables from `backend/db/schema.ts`. + - Use deterministic IDs + `onConflictDoNothing()` for idempotency. + - Print exactly one line: `CODEBUFF_API_KEY=...`. The runner parses this—avoid extra logs. + - Align with the backend’s token model: confirm whether API tokens live in `session` or a dedicated `api_keys` table, and include required fields (e.g., `expiresAt`, `createdAt`). + +- Test-only auth bypass (fastest fallback) + - If `CODEBUFF_TEST_AUTH_TOKEN` is set AND `NODE_ENV=test`, accept that token in WS auth and attach a minimal user context. Skip DB lookups; great for smoke tests. + +- CI specifics + - Use Docker Compose under `evals/` for parity and a one-liner. + - Install `codebuff` globally in the runner (or use the skip flag after we add it). + - Set `CODEBUFF_WEBSOCKET_URL` + `CODEBUFF_API_KEY` explicitly; mask secrets; tear down with `docker compose down -v`. + - Concurrency: separate Compose project names or only use internal networking. + +- Common pitfalls + - `.bin/bun` locally can set dev defaults and point SDK at localhost. In CI, always set `CODEBUFF_WEBSOCKET_URL`. + - `/healthz` returning 200 before WS is ready → flakiness.
Gate readiness on WS availability. + - Seed failures: wrong import path or missing required columns. Inspect `backend/db/schema.ts` and insert minimum viable fields. + - Token mismatch: ensure seeded token matches WS auth expectations. If unsure, use bypass first. + - No `codebuff` in PATH → SDK throws. Install it or use the skip flag once implemented. + +- Quick execution checklist + - ✅ IMPLEMENTED: `npm i -g codebuff` (or set `CODEBUFF_SKIP_BINARY_CHECK=1` after we add it) + - ✅ IMPLEMENTED: `docker compose -f evals/docker-compose.evals.yml up -d --build db backend` + - ✅ IMPLEMENTED: Wait for `http://127.0.0.1:4242/healthz` OK (WS-ready semantics) + - ✅ IMPLEMENTED: Seed (Drizzle) → capture `CODEBUFF_API_KEY` OR set bypass envs + - ✅ UPDATED: SDK-only: `bash evals/scripts/run-remote-parameterized.sh bypass eval-codebuff.json 0` + - ✅ IMPLEMENTED: `docker compose -f evals/docker-compose.evals.yml down -v` + +## SDK-Only Evaluation Infrastructure + +The evaluation infrastructure now uses the public Codebuff SDK exclusively: + +### SDK Mode (Only Option) +- Uses public `CodebuffClient` from `@codebuff/sdk` +- Clean separation from internal backend APIs +- Reliable and consistent for CI/CD environments +- Usage: `bash evals/scripts/run-remote-parameterized.sh bypass eval-codebuff.json 0` + +### Available Scripts: +- `evals/git-evals/run-single-eval.ts` - Main SDK evaluation command +- `evals/git-evals/run-git-evals.ts` - Batch SDK evaluations +- `evals/git-evals/run-single-eval-simple-sdk.ts` - Simplified SDK evaluation (direct execution) +- `evals/scripts/run-remote.sh` - Basic remote evaluation script +- `evals/scripts/run-remote-parameterized.sh` - Parameterized remote evaluation script + +### Legacy Files (Preserved for Reference): +- `evals/git-evals/run-single-eval-legacy.ts` - Original internal API version +- `evals/git-evals/run-git-evals-legacy.ts` - Original internal API version +- `evals/git-evals/run-single-eval-process-legacy.ts` - 
Original process wrapper + +### GitHub Actions Support: +- Simplified workflow using SDK-only approach +- No mode selection needed (always uses SDK) +- Matrix jobs use SDK consistently + +--- + +New Tweaks and TODOs (from review) +- Implement SDK skip flag (env guard) and WS URL override: +``` +// sdk/src/client.ts (skip flag pseudo-patch) +const SKIP = process.env.CODEBUFF_SKIP_BINARY_CHECK === '1' +if (!SKIP) { + const isWindows = process.platform === 'win32' + if ( + execFileSync(isWindows ? 'where' : 'which', [CODEBUFF_BINARY]) + .toString() + .trim() === '' + ) { + throw new Error('Missing codebuff binary ...') + } +} +``` +``` +// sdk/src/constants.ts (WS override pseudo-patch) +const WS_FROM_ENV = process.env.CODEBUFF_WEBSOCKET_URL || process.env.CB_WS_URL +export const WEBSOCKET_URL = WS_FROM_ENV ?? ( + IS_PROD ? 'wss://manicode-backend.onrender.com/ws' : 'ws://localhost:4242/ws' +) +``` + +- Health readiness contract: ensure /healthz implies WS is ready. If needed, add a WS-ready flag in server startup before returning 200: +``` +// backend readiness (pseudo-code) +let wsReady = false +startWebsocketServer(() => { wsReady = true }) +app.get('/healthz', (req, res) => { + return wsReady ? 
res.status(200).send('ok') : res.status(503).send('starting') +}) +``` + +- Backend start command: confirm the backend has a script that starts the WS server on 4242; otherwise define one and call it from the Dockerfile: +``` +// package.json (backend) pseudo-snippet +{ + "scripts": { + "start:ws": "bun run dev" // or explicit entry that starts WS on 4242 + } +} +``` +``` +# evals/backend.Dockerfile (if needed) +CMD ["bun", "--cwd", "backend", "start:ws"] +``` + +- Drizzle seed alignment: verify exact token table/columns and adjust seed accordingly (examples): +``` +// evals/seeds/seed-evals.ts (pseudo) +await db.insert(session).values({ + id: token, + userId, + expiresAt: new Date(Date.now() + 24*60*60*1000), + createdAt: new Date(), + // any other required columns +}).onConflictDoNothing() +``` + +- Container path sanity: ensure import path is correct from inside the seeder container: +``` +// from evals/seeds/seed-evals.ts +import { user, session } from '../../backend/db/schema' +``` + +- Debugging playbook additions: +``` +# On failure dump logs +docker compose -f evals/docker-compose.evals.yml logs backend --tail=200 || true +# If healthz flaps, add a longer timeout +bash evals/scripts/wait-for-healthz.sh http://127.0.0.1:4242/healthz 120 +``` + +--- + +1) Directory layout (all infra under evals/; backend stays Docker-agnostic) +- Place all Dockerfiles, compose files, seed scripts, and run scripts in `evals/` +- Build backend image using project root as build context while specifying the Dockerfile inside `evals/` + +``` +repo-root/ + evals/ + docker-compose.evals.yml + backend.Dockerfile + scripts/ + run-remote.sh + wait-for-healthz.sh + seeds/ + seed-evals.ts # Drizzle seed script (preferred) + README.md + backend/ + db/ + schema.ts # existing drizzle schema (used by seed) + # ...migrations, drizzle config; unchanged + # other packages unchanged +``` + +2) Compose file (db + backend), healthchecks, no public exposure +- Build backend from repo root as build 
context, using Dockerfile at evals/backend.Dockerfile +- Bind backend to localhost only; SDK connects via ws://127.0.0.1:4242/ws +- Use a separate seeder service (or run seeding via `docker compose run --rm seeder`) + +``` +# evals/docker-compose.evals.yml +version: '3.9' +services: + db: + image: postgres:16-alpine + environment: + POSTGRES_USER: codebuff + POSTGRES_PASSWORD: codebuff + POSTGRES_DB: codebuff + command: [ + "postgres", + "-c", "fsync=off", + "-c", "synchronous_commit=off", + "-c", "full_page_writes=off" + ] + healthcheck: + test: ["CMD-SHELL", "pg_isready -U codebuff -d codebuff"] + interval: 5s + timeout: 3s + retries: 20 + + backend: + build: + context: .. # project root + dockerfile: ./evals/backend.Dockerfile + environment: + DATABASE_URL: postgresql://codebuff:codebuff@db:5432/codebuff + NODE_ENV: test + # Optional test-only bypass (see Section 5) + CODEBUFF_TEST_AUTH_TOKEN: ${CODEBUFF_TEST_AUTH_TOKEN} + depends_on: + db: + condition: service_healthy + ports: + - "127.0.0.1:4242:4242" # loopback only + healthcheck: + test: ["CMD", "curl", "-fsS", "http://localhost:4242/healthz"] + interval: 5s + timeout: 3s + retries: 30 + + seeder: + image: oven/bun:1.1.34 + working_dir: /app + volumes: + - ..:/app:ro + environment: + DATABASE_URL: postgresql://codebuff:codebuff@db:5432/codebuff + entrypoint: ["bun", "run", "evals/seeds/seed-evals.ts"] + depends_on: + db: + condition: service_healthy +``` + +3) Backend image build (Dockerfile living in evals/) +- Keep backend unaware of Docker by placing the Dockerfile in evals; reference backend code via build context + +``` +# evals/backend.Dockerfile +FROM oven/bun:1.1.34 as base +WORKDIR /app +COPY . . 
+RUN bun install --frozen-lockfile +EXPOSE 4242 +CMD ["bun", "--cwd", "backend", "dev"] +``` + +4) SDK URL override (prefer explicit WS URL; no prod/dev confusion) +- Allow CODEBUFF_WEBSOCKET_URL to override default; keeps CI/local targeting explicit + +``` +// sdk/src/constants.ts (pseudo-patch) +const WS_FROM_ENV = process.env.CODEBUFF_WEBSOCKET_URL || process.env.CB_WS_URL +export const WEBSOCKET_URL = WS_FROM_ENV ?? ( + IS_PROD ? 'wss://manicode-backend.onrender.com/ws' : 'ws://localhost:4242/ws' +) +``` + +5) Test-only auth bypass (optional, fastest) +- Minimal code change in backend auth path (no Docker coupling). If `CODEBUFF_TEST_AUTH_TOKEN` is set and matches incoming token, accept it and attach minimal user context. + +``` +// backend/src/websockets/auth.ts (pseudo-code) +export function getUserInfoFromAuthToken(authToken: string): UserInfo | null { + const bypass = process.env.CODEBUFF_TEST_AUTH_TOKEN + if (process.env.NODE_ENV === 'test' && bypass && authToken === bypass) { + return { userId: 'test-user', email: 'evals@test.local', isAdmin: false } + } + // ...existing lookup against sessions/users... +} +``` + +6) Drizzle seed (preferred over raw SQL) +- Seed a minimal user/org/session/API key using Drizzle ORM, talking directly to Postgres in Compose +- Keep seed entirely under evals/; import schema from backend/db/schema.ts for type safety +- Print a single line: `CODEBUFF_API_KEY=...` for the runner to capture + +``` +// evals/seeds/seed-evals.ts (pseudo-code with Drizzle) +import 'dotenv/config' +import { drizzle } from 'drizzle-orm/node-postgres' +import { Client } from 'pg' +import { + user, session, org, /* other tables as needed */ +} from '../../backend/db/schema' // adjust import path if needed + +async function main() { + const DATABASE_URL = process.env.DATABASE_URL! 
+ const client = new Client({ connectionString: DATABASE_URL }) + await client.connect() + const db = drizzle(client) + + // deterministic IDs for idempotency + const userId = 'test-user' + const email = 'evals@test.local' + const token = crypto.randomUUID() // or deterministic for replay + + // upsert user + await db + .insert(user) + .values({ id: userId, email /* ...other required fields */ }) + .onConflictDoNothing() + + // upsert org (optional; link user as owner) + // await db.insert(org).values({ ... }).onConflictDoNothing() + + // upsert session / api token row + await db + .insert(session) + .values({ id: token, userId, /* expiresAt, createdAt, etc. */ }) + .onConflictDoNothing() + + console.log(`CODEBUFF_API_KEY=${token}`) + await client.end() +} + +main().catch((err) => { console.error(err); process.exit(1) }) +``` + +7) One-liner runner script (spin up, wait, seed with Drizzle, run, tear down) +- Lives entirely in evals/scripts; wires envs and points SDK to ephemeral WS +- Supports: bypass mode OR real seeding mode via Drizzle + +``` +# evals/scripts/run-remote.sh +set -euo pipefail +MODE="${1:-seed}" # 'seed' (Drizzle) or 'bypass' +export CODEBUFF_WEBSOCKET_URL="ws://127.0.0.1:4242/ws" +export CODEBUFF_SKIP_BINARY_CHECK=1 # after skip flag is added + +# Start services +docker compose -f "$(dirname "$0")/../docker-compose.evals.yml" up -d --build db backend +"$(dirname "$0")/wait-for-healthz.sh" "http://127.0.0.1:4242/healthz" 90 || { + echo 'Healthz failed; dumping backend logs...' 
+ docker compose -f "$(dirname "$0")/../docker-compose.evals.yml" logs backend --tail=200 || true + exit 1 +} + +if [ "$MODE" = "bypass" ]; then + export CODEBUFF_TEST_AUTH_TOKEN="$(openssl rand -hex 16)" + export CODEBUFF_API_KEY="$CODEBUFF_TEST_AUTH_TOKEN" +else + # Drizzle seed via compose for network access to db + KEY_LINE=$(docker compose -f "$(dirname "$0")/../docker-compose.evals.yml" run --rm seeder | tail -n1) + export CODEBUFF_API_KEY="${KEY_LINE#CODEBUFF_API_KEY=}" +fi + +bun scripts/git-evals/run-single-eval.ts \ + --prompt "Say hi and print the working directory" \ + --max-steps 10 + +docker compose -f "$(dirname "$0")/../docker-compose.evals.yml" down -v +``` + +``` +# evals/scripts/wait-for-healthz.sh +set -euo pipefail +URL="$1"; TIMEOUT="${2:-60}" +for i in $(seq 1 "$TIMEOUT"); do + if curl -fsS "$URL" >/dev/null 2>&1; then exit 0; fi + sleep 1 + echo "waiting for backend... ($i s)" +done +echo "backend healthz did not become ready in $TIMEOUT seconds" >&2 +exit 1 +``` + +8) GitHub Actions sketch (contained orchestration) +- The workflow calls the one-liner; Drizzle seed by default + +``` +# .github/workflows/remote-evals.yml (pseudo-snippet) +jobs: + remote-evals: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: oven-sh/setup-bun@v1 + - name: Install codebuff CLI (SDK binary check) + run: npm i -g codebuff + - name: Run remote eval (Drizzle seed) + run: bash evals/scripts/run-remote.sh seed +``` diff --git a/sdk/src/client.ts b/sdk/src/client.ts index 3a7cf42278..3aea09a38e 100644 --- a/sdk/src/client.ts +++ b/sdk/src/client.ts @@ -1,3 +1,5 @@ +import { execFileSync } from 'child_process' + import { initialSessionState, type RunState } from './run-state' import { changeFile } from './tools/change-file' import { getFiles } from './tools/read-files' @@ -11,6 +13,8 @@ import { API_KEY_ENV_VAR } from '../../common/src/constants' import { DEFAULT_MAX_AGENT_STEPS } from '../../common/src/json-config/constants' import { 
toolNames } from '../../common/src/tools/constants' +import { CODEBUFF_BINARY } from './constants' + import type { CustomToolDefinition } from './custom-tool' import type { AgentDefinition } from '../../common/src/templates/initial-agents-dir/types/agent-definition' import type { ToolName } from '../../common/src/tools/constants' @@ -68,6 +72,22 @@ export class CodebuffClient { ) } + // Check for codebuff binary unless skip flag is set + const SKIP = process.env.CODEBUFF_SKIP_BINARY_CHECK === '1' + if (!SKIP) { + try { + const isWindows = process.platform === 'win32' + const result = execFileSync(isWindows ? 'where' : 'which', [CODEBUFF_BINARY]) + .toString() + .trim() + if (result === '') { + throw new Error(`Missing codebuff binary in PATH. Please install with 'npm install -g codebuff' or set CODEBUFF_SKIP_BINARY_CHECK=1 to skip this check.`) + } + } catch (error) { + throw new Error(`Missing codebuff binary in PATH. Please install with 'npm install -g codebuff' or set CODEBUFF_SKIP_BINARY_CHECK=1 to skip this check.`) + } + } + this.cwd = cwd this.overrideTools = overrideTools ?? {} this.websocketHandler = new WebSocketHandler({ diff --git a/sdk/src/constants.ts b/sdk/src/constants.ts index 9c829db1bd..2054144b6d 100644 --- a/sdk/src/constants.ts +++ b/sdk/src/constants.ts @@ -4,9 +4,10 @@ export const IS_DEV = process.env.NEXT_PUBLIC_CB_ENVIRONMENT === 'dev' export const IS_TEST = process.env.NEXT_PUBLIC_CB_ENVIRONMENT === 'test' export const IS_PROD = !IS_DEV && !IS_TEST -export const WEBSOCKET_URL = IS_PROD - ? 'wss://manicode-backend.onrender.com/ws' - : 'ws://localhost:4242/ws' +const WS_FROM_ENV = process.env.CODEBUFF_WEBSOCKET_URL || process.env.CB_WS_URL +export const WEBSOCKET_URL = WS_FROM_ENV ?? ( + IS_PROD ? 'wss://manicode-backend.onrender.com/ws' : 'ws://localhost:4242/ws' +) export const WEBSITE_URL = IS_PROD ? 'https://codebuff.com' : 'http://localhost:3000'