Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
215 changes: 215 additions & 0 deletions .github/workflows/fs-benchmarks.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
name: Filesystem Benchmark

on:
  pull_request:
    paths:
      - 'src/fs/**'
      - 'src/sandbox/**'
      - 'src/util/**'
      - 'src/run.ts'
      - 'src/merge-results.ts'
      - 'package.json'
  schedule:
    - cron: '0 3 * * *' # Daily at 03:00 UTC
  workflow_dispatch:
    inputs:
      iterations:
        description: 'Iterations per provider'
        required: false
        default: '100'
      file_size_mb:
        description: 'Large file size in MB'
        required: false
        default: '64'
      small_files:
        description: 'Number of small files in workload'
        required: false
        default: '1000'

# Scope the concurrency group per ref so a PR run cannot cancel the nightly
# scheduled run (or another PR's run); repeated pushes to the same ref still
# supersede each other.
concurrency:
  group: fs-benchmarks-${{ github.ref }}
  cancel-in-progress: true

permissions:
  contents: write        # collect job pushes updated results on schedule/dispatch
  pull-requests: write   # collect job upserts the results comment on PRs

jobs:
  bench:
    name: Bench ${{ matrix.provider }}
    runs-on: namespace-profile-default
    timeout-minutes: 60
    strategy:
      fail-fast: false   # one provider failing must not cancel the others
      matrix:
        provider:
          - archil
          - blaxel
          - cloudflare
          - codesandbox
          - daytona
          - declaw
          - e2b
          - hopx
          - modal
          - namespace
          - runloop
          - upstash
          - vercel
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-node@v4
        with:
          node-version: '24'
          cache: 'npm'
      # Nightly runs refresh dependencies (the collect job commits the updated
      # lockfile); all other runs install the pinned lockfile reproducibly.
      - name: Install dependencies
        run: |
          if [ "${{ github.event_name }}" = "schedule" ]; then
            npm update
          else
            npm ci
          fi
      # Committed results from the checkout would otherwise be merged in with
      # this run's fresh output.
      - name: Clear stale results from checkout
        run: rm -rf results/fs/
      - name: Run filesystem benchmark
        env:
          COMPUTESDK_API_KEY: ${{ secrets.COMPUTESDK_API_KEY }}
          ARCHIL_API_KEY: ${{ secrets.ARCHIL_API_KEY }}
          ARCHIL_REGION: ${{ secrets.ARCHIL_REGION }}
          ARCHIL_DISK_ID: ${{ secrets.ARCHIL_DISK_ID }}
          BL_API_KEY: ${{ secrets.BL_API_KEY }}
          BL_WORKSPACE: ${{ secrets.BL_WORKSPACE }}
          CLOUDFLARE_SANDBOX_URL: ${{ secrets.CLOUDFLARE_SANDBOX_URL }}
          CLOUDFLARE_SANDBOX_SECRET: ${{ secrets.CLOUDFLARE_SANDBOX_SECRET }}
          CSB_API_KEY: ${{ secrets.CSB_API_KEY }}
          DAYTONA_API_KEY: ${{ secrets.DAYTONA_API_KEY }}
          DECLAW_API_KEY: ${{ secrets.DECLAW_API_KEY }}
          E2B_API_KEY: ${{ secrets.E2B_API_KEY }}
          HOPX_API_KEY: ${{ secrets.HOPX_API_KEY }}
          MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
          MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
          NSC_TOKEN: ${{ secrets.NSC_TOKEN }}
          RUNLOOP_API_KEY: ${{ secrets.RUNLOOP_API_KEY }}
          UPSTASH_BOX_API_KEY: ${{ secrets.UPSTASH_BOX_API_KEY }}
          VERCEL_TOKEN: ${{ secrets.VERCEL_TOKEN }}
          VERCEL_TEAM_ID: ${{ secrets.VERCEL_TEAM_ID }}
          VERCEL_PROJECT_ID: ${{ secrets.VERCEL_PROJECT_ID }}
        # PRs run a 5-iteration smoke test; schedule falls through to the
        # '100' default (workflow_dispatch inputs are empty outside dispatch).
        run: |
          npm run bench -- \
            --mode fs \
            --provider ${{ matrix.provider }} \
            --fs-file-size-mb ${{ github.event.inputs.file_size_mb || '64' }} \
            --fs-small-files ${{ github.event.inputs.small_files || '1000' }} \
            --iterations ${{ github.event_name == 'pull_request' && '5' || github.event.inputs.iterations || '100' }}
      # always() so partial results survive a benchmark failure; ignore the
      # no-files case so a provider that produced nothing doesn't fail the job.
      - name: Upload results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: fs-results-${{ matrix.provider }}
          path: results/fs/
          if-no-files-found: ignore
          retention-days: 7

  collect:
    name: Collect Results
    runs-on: namespace-profile-default
    needs: bench
    if: always()   # collect whatever succeeded even when some providers failed
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-node@v4
        with:
          node-version: '24'
          cache: 'npm'
      - name: Install dependencies
        run: |
          if [ "${{ github.event_name }}" = "schedule" ]; then
            npm update
          else
            npm ci
          fi
      - name: Download all artifacts
        uses: actions/download-artifact@v4
        with:
          path: artifacts/
          pattern: fs-results-*
      - name: Merge results
        run: npx tsx src/merge-results.ts --input artifacts --mode fs
      # Upsert a single sticky comment on the PR, keyed on the heading marker.
      - name: Post results to PR
        if: github.event_name == 'pull_request'
        continue-on-error: true
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs');
            const path = require('path');

            const runUrl = `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`;
            const latestPath = path.join('results', 'fs', 'latest.json');

            let body = '## Filesystem Benchmark Results\n\n';

            if (!fs.existsSync(latestPath)) {
              body += '> No filesystem benchmark results were generated.\n\n';
            } else {
              const data = JSON.parse(fs.readFileSync(latestPath, 'utf-8'));
              const results = data.results
                .filter(r => !r.skipped)
                .sort((a, b) => (b.compositeScore || 0) - (a.compositeScore || 0));

              if (results.length === 0) {
                body += '> No filesystem benchmark results were generated.\n\n';
              } else {
                body += '| # | Provider | Score | Read | Write | Small Files | Metadata | Status |\n';
                body += '|---|----------|-------|------|-------|-------------|----------|--------|\n';

                results.forEach((r, i) => {
                  const score = r.compositeScore !== undefined ? r.compositeScore.toFixed(1) : '--';
                  const read = (r.summary.readMs.median / 1000).toFixed(2) + 's';
                  const write = (r.summary.writeMs.median / 1000).toFixed(2) + 's';
                  const small = (r.summary.smallFileOpsMs.median / 1000).toFixed(2) + 's';
                  const meta = (r.summary.metadataOpsMs.median / 1000).toFixed(2) + 's';
                  const ok = r.iterations.filter(it => !it.error).length;
                  const total = r.iterations.length;
                  body += `| ${i + 1} | ${r.provider} | ${score} | ${read} | ${write} | ${small} | ${meta} | ${ok}/${total} |\n`;
                });

                body += '\n';
              }
            }

            body += `---\n*[View full run](${runUrl})*`;

            const marker = '## Filesystem Benchmark Results';
            const { data: comments } = await github.rest.issues.listComments({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: context.issue.number,
            });

            // Guard with ?. — a comment with a null/undefined body (e.g.
            // minimized) would otherwise throw and skip the upsert entirely.
            const existing = comments.find(c => c.body?.startsWith(marker));

            if (existing) {
              await github.rest.issues.updateComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                comment_id: existing.id,
                body,
              });
            } else {
              await github.rest.issues.createComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                issue_number: context.issue.number,
                body,
              });
            }
      # Schedule/dispatch only: persist merged results (and the refreshed
      # lockfile from `npm update`) back to the default branch.
      - name: Commit and push
        if: github.event_name != 'pull_request'
        run: |
          git config user.name "github-actions[bot]"
          git config user.email "github-actions[bot]@users.noreply.github.com"
          git add package.json package-lock.json results/fs/
          git diff --cached --quiet && echo "No changes to commit" && exit 0
          git commit -m "chore: update fs benchmark results [skip ci]"
          git push
33 changes: 32 additions & 1 deletion METHODOLOGY.md
Original file line number Diff line number Diff line change
Expand Up @@ -133,16 +133,47 @@ Each sandbox still measures its own individual TTI. We also capture:

**Why burst matters:** AI agents and orchestration tools often spin up many sandboxes at once. Burst testing reveals how providers handle sudden spikes — provisioning queue depth, rate limiting, and failure rates under peak demand.

### Filesystem (FS)

FS benchmarks run inside a freshly created sandbox to measure local workspace disk performance after startup. This mode is separate from TTI and object storage tests.

```bash
npm run bench:fs
```

| Parameter | Default |
|-----------|---------|
| Iterations per provider | 100 |
| Large file size | 64MB |
| Small files count | 1000 |
| Timeout per iteration | 120 seconds |

Each successful iteration runs four workload blocks in sequence:

| Workload | Description |
|----------|-------------|
| **Large-file write** | Write a fixed-size buffer to disk and measure elapsed time |
| **Large-file read** | Read the same file back and verify byte length |
| **Small-file ops** | Create, read, and delete many small files |
| **Metadata ops** | Repeated `stat` + `rename` operations to stress metadata paths |

From these timings we derive:
- Read and write latency stats (median, p95, p99)
- Small-file and metadata latency stats (median, p95, p99)
- Read and write throughput (Mbps)
- Success rate and a reliability-weighted composite score

### Running All Tests

By default, `npm run bench` runs all three tests in sequence:
By default, `npm run bench` runs the three TTI tests in sequence:

```bash
npm run bench # Runs sequential → staggered → burst
npm run bench -- --provider e2b # All 3 tests, single provider
npm run bench:sequential # Sequential only
npm run bench:staggered # Staggered only
npm run bench:burst # Burst only
npm run bench:fs # Filesystem only
```

## Test Configuration
Expand Down
6 changes: 4 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,15 +48,17 @@ Each benchmark creates a fresh sandbox, runs `node -v`, and records wall-clock t

## Methodology

Each benchmark creates a fresh sandbox, runs `node -v`, and records wall-clock time. We run three test modes daily:
Each benchmark creates a fresh sandbox, runs `node -v`, and records wall-clock time. We run three TTI test modes daily:

**Sequential** — Sandboxes are created one at a time. Each is created, tested, and destroyed before the next begins. 100 iterations per provider. This is the baseline — isolated cold-start performance with no contention.

**Staggered** — 100 sandboxes are launched per provider with a 200ms delay between each, gradually ramping up concurrent load. Reveals how TTI degrades under increasing pressure, queue depth effects, and rate limiting behavior.

**Burst** — 100 sandboxes are created simultaneously with no delay between launches. Tests how providers handle sudden spikes — provisioning queue depth, rate limiting, and failure rates under peak demand.

For each provider we report min, max, median, P95, P99, and average TTI, plus a **composite score** (0–100) that combines weighted timing metrics with success rate. Providers must be both fast *and* reliable to score well.
**Filesystem (FS)** — In-sandbox disk benchmarks that measure large-file read/write latency and throughput, plus many small-file and metadata-heavy operations. This captures local workspace IO performance after the sandbox is interactive.

For each provider we report min, max, median, P95, P99, and average TTI, plus a **composite score** (0–100) that combines weighted timing metrics with success rate. Providers must be both fast *and* reliable to score well. FS mode uses the same reliability-weighted score approach across read/write and file-op metrics.

### Composite Score

Expand Down
3 changes: 3 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@
"bench:storage:4mb": "tsx src/run.ts --mode storage --file-size 4MB",
"bench:storage:10mb": "tsx src/run.ts --mode storage --file-size 10MB",
"bench:storage:16mb": "tsx src/run.ts --mode storage --file-size 16MB",
"bench:fs": "tsx src/run.ts --mode fs",
"bench:fs:64mb": "tsx src/run.ts --mode fs --fs-file-size-mb 64",
"bench:fs:e2b": "tsx src/run.ts --mode fs --provider e2b",
"update-readme": "tsx src/update-readme.ts",
"generate-svg": "tsx src/sandbox/generate-svg.ts",
"generate-svg:sequential": "tsx src/sandbox/generate-svg.ts --mode sequential",
Expand Down
Loading
Loading