CodebuffAI · brandonkachen · Aug 15, 2025 · Aug 19, 2025 · Aug 19, 2025 · Aug 19, 2025
diff --git a/.github/workflows/remote-evals.yml b/.github/workflows/remote-evals.yml
@@ -0,0 +1,212 @@
+name: Remote Evaluations (SDK)
+
+# This workflow runs Codebuff evaluations using the public SDK exclusively.
+# It creates a containerized backend environment and runs evaluations via CodebuffClient.
+# Trigger: Add [remote-eval] to commit message or use workflow_dispatch
+# Matrix mode: Add [remote-eval-all] to commit message for parallel evaluations
+
+on:
+  push:
+    branches: ['**']  # every branch; the jobs inspect the commit message to decide whether to do any work
+  workflow_dispatch:
+    inputs:
+      eval_file:
+        type: string
+        description: 'Eval file to run (e.g., eval-codebuff.json)'
+        required: false
+        default: 'eval-codebuff.json'
+      commit_index:
+        type: string  # quoted string, not a number
+        description: 'Commit index to evaluate (0-based)'
+        required: false
+        default: '0'
+      mode:
+        type: choice
+        description: 'Auth mode (seed or bypass)'
+        required: false
+        default: 'bypass'
+        options:
+          - 'bypass'
+          - 'seed'
+
+jobs:
+  remote-evals:  # One evaluation run; every step after the gate is conditional on its should_run_evals output.
+    runs-on: ubuntu-latest
+    timeout-minutes: 60
+    env:  # Job-level so the failure-diagnostics step sees the same values as the evaluation step.
+      EVAL_FILE: ${{ inputs.eval_file || 'eval-codebuff.json' }}  # fallbacks apply when no dispatch input is given
+      COMMIT_INDEX: ${{ inputs.commit_index || '0' }}
+      MODE: ${{ inputs.mode || 'bypass' }}
+      CODEBUFF_WEBSOCKET_URL: 'ws://127.0.0.1:4242/ws'
+      CODEBUFF_SKIP_BINARY_CHECK: '1'
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Check commit message  # Gate: publishes should_run_evals for the steps below.
+        id: check_commit
+        env:
+          COMMIT_MESSAGE: ${{ github.event.head_commit.message }}  # via env so the message is never parsed as script
+        run: |
+          shopt -s nocasematch  # the [remote-eval] tag is matched case-insensitively
+          if [[ "$COMMIT_MESSAGE" == *"[remote-eval]"* ]] || [[ "$GITHUB_EVENT_NAME" == "workflow_dispatch" ]]; then
+            echo "should_run_evals=true" >> "$GITHUB_OUTPUT"
+            echo "Will run remote evaluations"
+          else
+            echo "should_run_evals=false" >> "$GITHUB_OUTPUT"
+            echo "Skipping remote evaluations (add [remote-eval] to commit message to trigger)"
+          fi
+
+      - name: Set up Bun
+        if: steps.check_commit.outputs.should_run_evals == 'true'
+        uses: oven-sh/setup-bun@v2
+        with:
+          bun-version: '1.2.12'
+
+      - name: Install dependencies
+        if: steps.check_commit.outputs.should_run_evals == 'true'
+        run: bun install --frozen-lockfile
+
+      - name: Validate environment for SDK evaluation  # fail fast when an eval entry point is missing
+        if: steps.check_commit.outputs.should_run_evals == 'true'
+        run: |
+          echo "🔍 Validating SDK evaluation environment..."
+          echo "  Checking for required files..."
+          test -f evals/scripts/run-remote-parameterized.sh || { echo "❌ Missing run-remote-parameterized.sh"; exit 1; }
+          test -f evals/git-evals/run-single-eval.ts || { echo "❌ Missing run-single-eval.ts"; exit 1; }
+          test -f evals/docker-compose.evals.yml || { echo "❌ Missing docker-compose.evals.yml"; exit 1; }
+          echo "  Checking SDK package..."
+          bun --version
+          echo "✅ Environment validation passed"
+
+      - name: Run remote evaluation
+        if: steps.check_commit.outputs.should_run_evals == 'true'
+        run: |
+          echo "🚀 Remote Evaluation Starting (SDK Mode)"
+          echo "📋 GitHub Actions Environment:"
+          echo "  Runner: $RUNNER_OS"  # default runner variables, so no workflow expression is inlined into the script
+          echo "  SHA: $GITHUB_SHA"
+          echo "  Ref: $GITHUB_REF"
+          echo "  Event: $GITHUB_EVENT_NAME"
+          echo "  Eval File: $EVAL_FILE"
+          echo "  Commit Index: $COMMIT_INDEX"
+          echo "  Mode: $MODE"
+          echo "🐳 Docker Info:"
+          docker --version
+          docker compose version
+          echo "💾 Disk Space:"
+          df -h
+          echo "🔧 Starting SDK-based evaluation..."
+          bash evals/scripts/run-remote-parameterized.sh "$MODE" "$EVAL_FILE" "$COMMIT_INDEX"
+
+      - name: Dump logs on failure  # best-effort diagnostics; each command tolerates its own failure
+        if: failure() && steps.check_commit.outputs.should_run_evals == 'true'
+        run: |
+          echo "❌ SDK Evaluation failed - dumping diagnostic information"
+          echo "🔧 SDK Environment:"
+          echo "  CODEBUFF_WEBSOCKET_URL: ${CODEBUFF_WEBSOCKET_URL:-not set}"
+          echo "  CODEBUFF_SKIP_BINARY_CHECK: ${CODEBUFF_SKIP_BINARY_CHECK:-not set}"
+          echo "  CODEBUFF_API_KEY: $([ -n "${CODEBUFF_API_KEY:-}" ] && echo '[SET]' || echo '[NOT SET]')"  # presence only, never the value
+          echo "🐳 Docker containers status:"
+          docker ps -a || true
+          echo "📋 Backend container logs:"
+          docker compose -f evals/docker-compose.evals.yml logs backend --tail=200 || true
+          echo "📋 Database container logs:"
+          docker compose -f evals/docker-compose.evals.yml logs db --tail=100 || true
+          echo "💾 Disk usage:"
+          df -h || true
+          echo "🧠 Memory usage:"
+          free -h || true
+          echo "📁 Evaluation files:"
+          ls -la evals/git-evals/ || true
+          ls -la evals/scripts/ || true
+
+      - name: Upload evaluation logs
+        if: always() && steps.check_commit.outputs.should_run_evals == 'true'
+        uses: actions/upload-artifact@v4
+        with:
+          name: remote-eval-logs-${{ github.sha }}
+          path: |
+            evals/test-repos/
+            debug/
+            ~/.cache/bun/
+          retention-days: 7
+
+      - name: Cleanup containers  # always() so containers and volumes are removed even after a failure
+        if: always() && steps.check_commit.outputs.should_run_evals == 'true'
+        run: |
+          echo "🧹 Final cleanup - removing all containers and volumes..."
+          docker compose -f evals/docker-compose.evals.yml down -v || true
+          docker system prune -f || true
+          echo "✅ Cleanup completed"
+
+  # Optional: Matrix job to run multiple evaluations in parallel
+  remote-evals-matrix:  # Runs one evaluation per matrix entry when the head commit message contains [remote-eval-all].
+    runs-on: ubuntu-latest
+    timeout-minutes: 90
+    if: contains(github.event.head_commit.message, '[remote-eval-all]')
+    env:  # Job-level so the failure-diagnostics step can report which matrix entry failed.
+      EVAL_FILE: ${{ matrix.eval.file }}
+      COMMIT_INDEX: ${{ matrix.eval.index }}
+      CODEBUFF_WEBSOCKET_URL: 'ws://127.0.0.1:4242/ws'
+      CODEBUFF_SKIP_BINARY_CHECK: '1'
+
+    strategy:
+      fail-fast: false  # let the remaining entries finish when one of them fails
+      matrix:
+        eval:
+          - { file: 'eval-codebuff.json', index: '0' }
+          - { file: 'eval-codebuff.json', index: '1' }
+          - { file: 'eval-manifold.json', index: '0' }
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up Bun
+        uses: oven-sh/setup-bun@v2
+        with:
+          bun-version: '1.2.12'
+
+      - name: Install dependencies
+        run: bun install --frozen-lockfile
+
+      - name: Validate environment for SDK evaluation  # fail fast when an eval entry point is missing
+        run: |
+          echo "🔍 Validating SDK evaluation environment for matrix job..."
+          test -f evals/scripts/run-remote-parameterized.sh || { echo "❌ Missing run-remote-parameterized.sh"; exit 1; }
+          test -f evals/git-evals/run-single-eval.ts || { echo "❌ Missing run-single-eval.ts"; exit 1; }
+          test -f evals/docker-compose.evals.yml || { echo "❌ Missing docker-compose.evals.yml"; exit 1; }
+          echo "✅ Matrix environment validation passed"
+
+      - name: Run evaluation matrix  # the matrix job always uses the "bypass" auth mode
+        run: |
+          echo "🚀 Running matrix evaluation (SDK Mode)..."
+          bash evals/scripts/run-remote-parameterized.sh "bypass" "$EVAL_FILE" "$COMMIT_INDEX"
+
+      - name: Dump matrix logs on failure  # best-effort diagnostics; each command tolerates its own failure
+        if: failure()
+        run: |
+          echo "❌ Matrix SDK Evaluation failed - dumping diagnostic information"
+          echo "🔧 Matrix job details: File=$EVAL_FILE, Index=$COMMIT_INDEX"
+          echo "🐳 Docker containers status:"
+          docker ps -a || true
+          echo "📋 Container logs:"
+          docker compose -f evals/docker-compose.evals.yml logs --tail=100 || true
+
+      - name: Upload matrix evaluation results
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: remote-eval-matrix-${{ matrix.eval.file }}-${{ matrix.eval.index }}-${{ github.sha }}
+          path: |
+            evals/test-repos/
+            debug/
+          retention-days: 7
+
+      - name: Cleanup containers  # always() so containers and volumes are removed even after a failure
+        if: always()
+        run: |
+          docker compose -f evals/docker-compose.evals.yml down -v || true
+          docker system prune -f || true
diff --git a/backend/src/index.ts b/backend/src/index.ts
@@ -19,6 +19,7 @@ import {
   sendRequestReconnect,
   waitForAllClientsDisconnected,
   listen as webSocketListen,
+  isWebSocketReady,
 } from './websockets/server'
 
 const app = express()
@@ -31,7 +32,11 @@ app.get('/', (req, res) => {
 })
 
 app.get('/healthz', (req, res) => {
-  res.send('ok')
+  // Readiness probe: 200 'ok' when isWebSocketReady() is true, otherwise 503 'starting'.
+  const ready = isWebSocketReady()
+  const status = ready ? 200 : 503
+  const body = ready ? 'ok' : 'starting'
+  res.status(status).send(body)
 })
 
 app.post('/api/usage', usageHandler)

diff --git a/backend/src/websockets/auth.ts b/backend/src/websockets/auth.ts
@@ -11,6 +11,12 @@ export interface UserInfo {
 export async function getUserIdFromAuthToken(
   authToken: string,
 ): Promise<string | undefined> {
+  // Test-only auth bypass
+  const bypass = process.env.CODEBUFF_TEST_AUTH_TOKEN
+  if (process.env.NODE_ENV === 'test' && bypass && authToken === bypass) {
+    return 'test-user'
+  }
+
   const user = await db
     .select({ id: schema.user.id })
     .from(schema.user)
@@ -25,6 +31,12 @@ export async function getUserIdFromAuthToken(
 export async function getUserInfoFromAuthToken(
   authToken: string,
 ): Promise<UserInfo | undefined> {
+  // Test-only auth bypass
+  const bypass = process.env.CODEBUFF_TEST_AUTH_TOKEN
+  if (process.env.NODE_ENV === 'test' && bypass && authToken === bypass) {
+    return { id: 'test-user', email: 'evals@test.local', discord_id: null }
+  }
+
   const user = await db
     .select({
       id: schema.user.id,

diff --git a/backend/src/websockets/server.ts b/backend/src/websockets/server.ts
@@ -18,6 +18,8 @@ export const SWITCHBOARD = new Switchboard()
 // if a connection doesn't ping for this long, we assume the other side is toast
 const CONNECTION_TIMEOUT_MS = 60 * 1000
 
+let wsReady = false // set to true by the 'listening' handler in listen(); read through isWebSocketReady()
+
 export class MessageParseError extends Error {
   details?: unknown
   constructor(message: string, details?: unknown) {
@@ -87,6 +89,7 @@ export function listen(server: HttpServer, path: string) {
   let deadConnectionCleaner: NodeJS.Timeout | undefined
   wss.on('listening', () => {
     logger.info(`Web socket server listening on ${path}.`)
+    wsReady = true
     deadConnectionCleaner = setInterval(function ping() {
       const now = Date.now()
       try {
@@ -175,3 +178,7 @@ export function sendRequestReconnect() {
 export function waitForAllClientsDisconnected() {
   return SWITCHBOARD.waitForAllClientsDisconnected()
 }
+
+export function isWebSocketReady() {
+  return wsReady // module-level flag; true once the 'listening' handler registered in listen() has run
+}
diff --git a/codebuff.json b/codebuff.json
@@ -57,7 +57,7 @@
     },
     {
       "name": "prettier-format",
-      "command": "git diff --name-only --diff-filter=ACMR | grep -E '\\.(ts|tsx|json|md)$' | xargs -r npx prettier --write",
+      "command": "set -o pipefail && CHANGED=\"$(git diff --name-only --diff-filter=ACMR | grep -E '\\.(ts|tsx|json|md)$' | xargs -r npx prettier --list-different || true)\"; [ -n \"$CHANGED\" ] && echo \"$CHANGED\" | xargs -r npx prettier --write --log-level=warn && printf '%s\\n' \"$CHANGED\" || true",
       "filePattern": "**/*.{ts,tsx,json,md}"
     },
     {
@@ -70,6 +70,11 @@
       "command": "bun run typecheck",
       "cwd": ".agents",
       "filePattern": ".agents/**/*.ts"
+    },
+    {
+      "name": "eslint-fix-imports",
+      "command": "set -o pipefail && FILES=\"$(git diff --name-only --diff-filter=ACMR | grep -E '\\.(ts|tsx|js|jsx)$' || true)\" && { [ -z \"$FILES\" ] || printf '%s\\n' \"$FILES\" | xargs -r npx eslint --fix --quiet; }",
+      "filePattern": "**/*.{ts,tsx,js,jsx}"
     }
   ]
 }