Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
212 changes: 212 additions & 0 deletions .github/workflows/remote-evals.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,212 @@
name: Remote Evaluations (SDK)

# This workflow runs Codebuff evaluations using the public SDK exclusively.
# It creates a containerized backend environment and runs evaluations via CodebuffClient.
# Trigger: Add [remote-eval] to commit message or use workflow_dispatch
# Matrix mode: Add [remote-eval-all] to commit message for parallel evaluations

on:
  # Every push on every branch starts the workflow; the jobs themselves decide
  # (from the commit message) whether any evaluation work is actually done.
  push:
    branches:
      - '**'

  # Manual runs: all inputs are optional and fall back to the defaults below.
  workflow_dispatch:
    inputs:
      eval_file:
        type: string
        required: false
        default: 'eval-codebuff.json'
        description: 'Eval file to run (e.g., eval-codebuff.json)'
      commit_index:
        # Kept as a string so the value is passed through to the script verbatim.
        type: string
        required: false
        default: '0'
        description: 'Commit index to evaluate (0-based)'
      mode:
        type: choice
        required: false
        default: 'bypass'
        description: 'Auth mode (seed or bypass)'
        options:
          - 'bypass'
          - 'seed'

jobs:
remote-evals:
  # Single-evaluation job. Runs for manual dispatches and for pushes whose
  # head commit message contains [remote-eval] (case-insensitive).
  runs-on: ubuntu-latest
  timeout-minutes: 60

  steps:
    - name: Checkout repository
      uses: actions/checkout@v4

    # Gate step: publishes should_run_evals, which every later step tests in
    # its own `if:` so that a non-matching push finishes quickly and green.
    - name: Check commit message
      id: check_commit
      env:
        # The commit message is untrusted text; it reaches bash only as an
        # environment value, never as part of the script source.
        COMMIT_MESSAGE: ${{ github.event.head_commit.message }}
      run: |
        shopt -s nocasematch
        # GITHUB_EVENT_NAME is the runner-provided equivalent of github.event_name.
        if [[ "$COMMIT_MESSAGE" == *"[remote-eval]"* ]] || [[ "$GITHUB_EVENT_NAME" == "workflow_dispatch" ]]; then
          echo "should_run_evals=true" >> "$GITHUB_OUTPUT"
          echo "Will run remote evaluations"
        else
          echo "should_run_evals=false" >> "$GITHUB_OUTPUT"
          echo "Skipping remote evaluations (add [remote-eval] to commit message to trigger)"
        fi

    - name: Set up Bun
      if: steps.check_commit.outputs.should_run_evals == 'true'
      uses: oven-sh/setup-bun@v2
      with:
        bun-version: '1.2.12'

    - name: Install dependencies
      if: steps.check_commit.outputs.should_run_evals == 'true'
      run: bun install --frozen-lockfile

    # Fail early, with a readable message, if any file the eval script needs is missing.
    - name: Validate environment for SDK evaluation
      if: steps.check_commit.outputs.should_run_evals == 'true'
      run: |
        echo "🔍 Validating SDK evaluation environment..."
        echo " Checking for required files..."
        test -f evals/scripts/run-remote-parameterized.sh || { echo "❌ Missing run-remote-parameterized.sh"; exit 1; }
        test -f evals/git-evals/run-single-eval.ts || { echo "❌ Missing run-single-eval.ts"; exit 1; }
        test -f evals/docker-compose.evals.yml || { echo "❌ Missing docker-compose.evals.yml"; exit 1; }
        echo " Checking SDK package..."
        bun --version
        echo "✅ Environment validation passed"

    - name: Run remote evaluation
      if: steps.check_commit.outputs.should_run_evals == 'true'
      env:
        # `inputs.*` is empty on push events, hence the fallbacks.
        EVAL_FILE: ${{ inputs.eval_file || 'eval-codebuff.json' }}
        COMMIT_INDEX: ${{ inputs.commit_index || '0' }}
        MODE: ${{ inputs.mode || 'bypass' }}
        CODEBUFF_WEBSOCKET_URL: "ws://127.0.0.1:4242/ws"
        CODEBUFF_SKIP_BINARY_CHECK: "1"
      run: |
        echo "🚀 Remote Evaluation Starting (SDK Mode)"
        echo "📋 GitHub Actions Environment:"
        # Runner-provided variables are used instead of ${{ ... }} expressions:
        # expressions are pasted into the script before bash parses it, so a
        # crafted ref name could otherwise be executed as shell.
        echo " Runner: $RUNNER_OS"
        echo " SHA: $GITHUB_SHA"
        echo " Ref: $GITHUB_REF"
        echo " Event: $GITHUB_EVENT_NAME"
        echo " Eval File: $EVAL_FILE"
        echo " Commit Index: $COMMIT_INDEX"
        echo " Mode: $MODE"
        echo "🐳 Docker Info:"
        docker --version
        docker compose version
        echo "💾 Disk Space:"
        df -h
        echo "🔧 Starting SDK-based evaluation..."
        bash evals/scripts/run-remote-parameterized.sh "$MODE" "$EVAL_FILE" "$COMMIT_INDEX"

    # Best-effort diagnostics; every probe is `|| true` so one failing command
    # does not hide the rest of the dump.
    - name: Dump logs on failure
      if: failure() && steps.check_commit.outputs.should_run_evals == 'true'
      env:
        # Step-level env is not inherited by later steps, so the values used by
        # "Run remote evaluation" are repeated here for the report below.
        CODEBUFF_WEBSOCKET_URL: "ws://127.0.0.1:4242/ws"
        CODEBUFF_SKIP_BINARY_CHECK: "1"
      run: |
        echo "❌ SDK Evaluation failed - dumping diagnostic information"
        echo "🔧 SDK Environment:"
        echo " CODEBUFF_WEBSOCKET_URL: ${CODEBUFF_WEBSOCKET_URL:-not set}"
        echo " CODEBUFF_SKIP_BINARY_CHECK: ${CODEBUFF_SKIP_BINARY_CHECK:-not set}"
        # Report presence only. `${VAR:-fallback}` expands to the value itself
        # when VAR is set, which would write the key into the job log.
        if [[ -n "${CODEBUFF_API_KEY:-}" ]]; then
          echo " CODEBUFF_API_KEY: [SET]"
        else
          echo " CODEBUFF_API_KEY: [NOT SET]"
        fi
        echo "🐳 Docker containers status:"
        docker ps -a || true
        echo "📋 Backend container logs:"
        docker compose -f evals/docker-compose.evals.yml logs backend --tail=200 || true
        echo "📋 Database container logs:"
        docker compose -f evals/docker-compose.evals.yml logs db --tail=100 || true
        echo "💾 Disk usage:"
        df -h || true
        echo "🧠 Memory usage:"
        free -h || true
        echo "📁 Evaluation files:"
        ls -la evals/git-evals/ || true
        ls -la evals/scripts/ || true

    # `always()` keeps the upload running after a failed evaluation.
    - name: Upload evaluation logs
      if: always() && steps.check_commit.outputs.should_run_evals == 'true'
      uses: actions/upload-artifact@v4
      with:
        name: remote-eval-logs-${{ github.sha }}
        path: |
          evals/test-repos/
          debug/
          ~/.cache/bun/
        retention-days: 7

    - name: Cleanup containers
      if: always() && steps.check_commit.outputs.should_run_evals == 'true'
      run: |
        echo "🧹 Final cleanup - removing all containers and volumes..."
        docker compose -f evals/docker-compose.evals.yml down -v || true
        docker system prune -f || true
        echo "✅ Cleanup completed"

# Optional: Matrix job to run multiple evaluations in parallel
remote-evals-matrix:
  runs-on: ubuntu-latest
  timeout-minutes: 90
  # Only pushes whose head commit message contains [remote-eval-all] run this job.
  if: contains(github.event.head_commit.message, '[remote-eval-all]')

  strategy:
    # Let the remaining evaluations finish even if one of them fails.
    fail-fast: false
    matrix:
      eval:
        # Indexes stay quoted so they are handed to the script as strings.
        - file: 'eval-codebuff.json'
          index: '0'
        - file: 'eval-codebuff.json'
          index: '1'
        - file: 'eval-manifold.json'
          index: '0'

  steps:
    - name: Checkout repository
      uses: actions/checkout@v4

    - name: Set up Bun
      uses: oven-sh/setup-bun@v2
      with:
        bun-version: '1.2.12'

    - name: Install dependencies
      run: bun install --frozen-lockfile

    # Fail early if any file the eval script needs is missing.
    - name: Validate environment for SDK evaluation
      run: |
        echo "🔍 Validating SDK evaluation environment for matrix job..."
        test -f evals/scripts/run-remote-parameterized.sh || { echo "❌ Missing run-remote-parameterized.sh"; exit 1; }
        test -f evals/git-evals/run-single-eval.ts || { echo "❌ Missing run-single-eval.ts"; exit 1; }
        test -f evals/docker-compose.evals.yml || { echo "❌ Missing docker-compose.evals.yml"; exit 1; }
        echo "✅ Matrix environment validation passed"

    - name: Run evaluation matrix
      env:
        EVAL_FILE: ${{ matrix.eval.file }}
        COMMIT_INDEX: ${{ matrix.eval.index }}
        CODEBUFF_WEBSOCKET_URL: "ws://127.0.0.1:4242/ws"
        CODEBUFF_SKIP_BINARY_CHECK: "1"
      run: |
        echo "🚀 Running matrix evaluation (SDK Mode)..."
        bash evals/scripts/run-remote-parameterized.sh "bypass" "$EVAL_FILE" "$COMMIT_INDEX"

    # Best-effort diagnostics for a failed matrix cell.
    - name: Dump matrix logs on failure
      if: failure()
      env:
        # Step-level env is not inherited from "Run evaluation matrix", so the
        # matrix values are bound again here; otherwise the line below would
        # print empty File/Index fields.
        EVAL_FILE: ${{ matrix.eval.file }}
        COMMIT_INDEX: ${{ matrix.eval.index }}
      run: |
        echo "❌ Matrix SDK Evaluation failed - dumping diagnostic information"
        echo "🔧 Matrix job details: File=$EVAL_FILE, Index=$COMMIT_INDEX"
        echo "🐳 Docker containers status:"
        docker ps -a || true
        echo "📋 Container logs:"
        docker compose -f evals/docker-compose.evals.yml logs --tail=100 || true

    # The artifact name includes file and index so parallel cells do not collide.
    - name: Upload matrix evaluation results
      if: always()
      uses: actions/upload-artifact@v4
      with:
        name: remote-eval-matrix-${{ matrix.eval.file }}-${{ matrix.eval.index }}-${{ github.sha }}
        path: |
          evals/test-repos/
          debug/
        retention-days: 7

    - name: Cleanup containers
      if: always()
      run: |
        docker compose -f evals/docker-compose.evals.yml down -v || true
        docker system prune -f || true
7 changes: 6 additions & 1 deletion backend/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import {
sendRequestReconnect,
waitForAllClientsDisconnected,
listen as webSocketListen,
isWebSocketReady,
} from './websockets/server'

const app = express()
Expand All @@ -31,7 +32,11 @@ app.get('/', (req, res) => {
})

// Health probe: answers 200 "ok" once isWebSocketReady() reports true, and
// 503 "starting" before that. Exactly one response is sent per request.
app.get('/healthz', (req, res) => {
  if (!isWebSocketReady()) {
    res.status(503).send('starting')
    return
  }
  res.send('ok')
})

app.post('/api/usage', usageHandler)
Expand Down
12 changes: 12 additions & 0 deletions backend/src/websockets/auth.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,12 @@ export interface UserInfo {
export async function getUserIdFromAuthToken(
authToken: string,
): Promise<string | undefined> {
// Test-only auth bypass
const bypass = process.env.CODEBUFF_TEST_AUTH_TOKEN
if (process.env.NODE_ENV === 'test' && bypass && authToken === bypass) {
return 'test-user'
}

const user = await db
.select({ id: schema.user.id })
.from(schema.user)
Expand All @@ -25,6 +31,12 @@ export async function getUserIdFromAuthToken(
export async function getUserInfoFromAuthToken(
authToken: string,
): Promise<UserInfo | undefined> {
// Test-only auth bypass
const bypass = process.env.CODEBUFF_TEST_AUTH_TOKEN
if (process.env.NODE_ENV === 'test' && bypass && authToken === bypass) {
return { id: 'test-user', email: 'evals@test.local', discord_id: null }
}

const user = await db
.select({
id: schema.user.id,
Expand Down
7 changes: 7 additions & 0 deletions backend/src/websockets/server.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ export const SWITCHBOARD = new Switchboard()
// if a connection doesn't ping for this long, we assume the other side is toast
const CONNECTION_TIMEOUT_MS = 60 * 1000

let wsReady = false

export class MessageParseError extends Error {
details?: unknown
constructor(message: string, details?: unknown) {
Expand Down Expand Up @@ -87,6 +89,7 @@ export function listen(server: HttpServer, path: string) {
let deadConnectionCleaner: NodeJS.Timeout | undefined
wss.on('listening', () => {
logger.info(`Web socket server listening on ${path}.`)
wsReady = true
deadConnectionCleaner = setInterval(function ping() {
const now = Date.now()
try {
Expand Down Expand Up @@ -175,3 +178,7 @@ export function sendRequestReconnect() {
/**
 * Thin module-level wrapper: forwards to
 * SWITCHBOARD.waitForAllClientsDisconnected() and hands back whatever it returns.
 */
export function waitForAllClientsDisconnected() {
  const pendingDisconnects = SWITCHBOARD.waitForAllClientsDisconnected()
  return pendingDisconnects
}

/**
 * Returns the current value of the module-level `wsReady` flag, which starts
 * out false and is set to true in the WebSocket server's 'listening' handler.
 */
export function isWebSocketReady(): boolean {
  return wsReady
}
7 changes: 6 additions & 1 deletion codebuff.json
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@
},
{
  "name": "prettier-format",
  "command": "set -o pipefail && CHANGED=\"$(git diff --name-only --diff-filter=ACMR | grep -E '\\.(ts|tsx|json|md)$' | xargs -r npx prettier --list-different || true)\"; [ -n \"$CHANGED\" ] && echo \"$CHANGED\" | xargs -r npx prettier --write --log-level=warn && printf '%s\\n' \"$CHANGED\" || true",
  "filePattern": "**/*.{ts,tsx,json,md}"
},
{
Expand All @@ -70,6 +70,11 @@
"command": "bun run typecheck",
"cwd": ".agents",
"filePattern": ".agents/**/*.ts"
},
{
  "name": "eslint-fix-imports",
  "command": "set -o pipefail && git diff --name-only --diff-filter=ACMR | { grep -E '\\.(ts|tsx|js|jsx)$' || [ $? -eq 1 ]; } | xargs -r npx eslint --fix --quiet",
  "filePattern": "**/*.{ts,tsx,js,jsx}"
}
]
}
Loading
Loading