From a7b66094d07ef02691aad5c0be924f332521e436 Mon Sep 17 00:00:00 2001 From: isaacbmiller Date: Sat, 28 Feb 2026 17:51:17 -0500 Subject: [PATCH 1/4] perf: defer heavy init to ASGI lifespan for fast port binding Move module discovery, LM creation, and route registration into the ASGI lifespan context so the server socket binds immediately. This eliminates false 'not listening' warnings from Fly.io / Kubernetes during the ~10-15s boot window. Health endpoints (/health/live, /health/ready) respond as soon as the port is open. /health/ready returns 503 until initialization completes. Also: rewrite fly-e2e-test skill to remove tmux dependency, fix shell redirection in fly ssh commands, and fix API key prompt handling. Co-authored-by: factory-droid[bot] <138933559+factory-droid[bot]@users.noreply.github.com> --- .claude/skills/fly-e2e-test/SKILL.md | 226 ++++++++-------- src/dspy_cli/server/app.py | 376 +++++++++++++++------------ src/dspy_cli/server/runner.py | 70 ++--- tests/gateway/test_gateway_routes.py | 12 +- tests/test_serve_integration.py | 106 ++++---- 5 files changed, 410 insertions(+), 380 deletions(-) diff --git a/.claude/skills/fly-e2e-test/SKILL.md b/.claude/skills/fly-e2e-test/SKILL.md index f7eade0..5b2d4d9 100644 --- a/.claude/skills/fly-e2e-test/SKILL.md +++ b/.claude/skills/fly-e2e-test/SKILL.md @@ -9,11 +9,12 @@ allowed-tools: Deploy a fresh dspy-cli project to Fly.io using your local code changes, run full integration tests (health, auth, LLM execution), and **guarantee cleanup** regardless of success or failure. -## ⚠️ CRITICAL RULES +## CRITICAL RULES 1. **NEVER commit directly to main** - Always create a side branch first, even for small changes 2. **ALWAYS clean up** - Destroy Fly apps and delete temp branches, even if tests fail 3. **Use temp branches** - Name them `e2e-test/{timestamp}-{random}` for easy identification +4. **Run cleanup in a trap** - Use bash trap or always-run-cleanup pattern ## Prerequisites @@ -24,80 +25,76 @@ Deploy a fresh dspy-cli project to Fly.io using your local code changes, run ful ## Quick Start -Run each phase in a tmux session to enable output capture and cleanup tracking. +All commands run directly in the shell (no tmux required). Use environment variables to pass state between steps. 
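Rule 4 above asks for trap-based cleanup; a minimal sketch of that pattern, assuming the Phase 9 cleanup commands are wrapped in a `cleanup` function (the wrapper itself is illustrative, not part of the skill):

```bash
# Register cleanup to run on any exit (success, failure, or Ctrl-C).
cleanup() {
  fly apps destroy "$FLY_APP_NAME" --yes || true
  git -C "$DSPY_CLI_DIR" push origin --delete "$TEMP_BRANCH" || true
  git -C "$DSPY_CLI_DIR" checkout main && git -C "$DSPY_CLI_DIR" branch -D "$TEMP_BRANCH" || true
  rm -rf "$TEST_DIR"
}
trap cleanup EXIT
```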
### Phase 1: Setup Environment ```bash -# Create tmux session -tmux new-session -d -s e2e-fly -c /Users/isaac/projects/dspy-cli - -# Set variables -tmux send-keys -t e2e-fly 'export DSPY_CLI_DIR="/Users/isaac/projects/dspy-cli"' C-m -tmux send-keys -t e2e-fly 'export TIMESTAMP=$(date +%s)' C-m -tmux send-keys -t e2e-fly 'export RANDOM_SUFFIX=$(head -c 4 /dev/urandom | xxd -p)' C-m -tmux send-keys -t e2e-fly 'export FLY_APP_NAME="dspy-e2e-${RANDOM_SUFFIX}"' C-m -tmux send-keys -t e2e-fly 'export TEMP_BRANCH="e2e-test/${TIMESTAMP}-${RANDOM_SUFFIX}"' C-m +export DSPY_CLI_DIR="/Users/isaac/projects/dspy-cli" +export TIMESTAMP=$(date +%s) +export RANDOM_SUFFIX=$(head -c 4 /dev/urandom | xxd -p) +export FLY_APP_NAME="dspy-e2e-${RANDOM_SUFFIX}" +export TEMP_BRANCH="e2e-test/${TIMESTAMP}-${RANDOM_SUFFIX}" +export DSPY_API_KEY_VALUE="test-e2e-$(head -c 8 /dev/urandom | xxd -p)" # Source .env for OPENAI_API_KEY -tmux send-keys -t e2e-fly 'set -a && source .env && set +a' C-m +set -a && source "$DSPY_CLI_DIR/.env" && set +a # Verify setup -tmux send-keys -t e2e-fly 'echo "App: $FLY_APP_NAME Branch: $TEMP_BRANCH"' C-m +echo "App: $FLY_APP_NAME Branch: $TEMP_BRANCH" ``` ### Phase 2: Pre-flight Checks ```bash -# Verify fly CLI -tmux send-keys -t e2e-fly 'fly version && fly auth whoami' C-m - -# Check for uncommitted changes (stash if needed) -tmux send-keys -t e2e-fly 'git status --porcelain' C-m +fly version && fly auth whoami +git -C "$DSPY_CLI_DIR" status --porcelain # Clean up any orphaned e2e resources -tmux send-keys -t e2e-fly 'fly apps list 2>/dev/null | grep "dspy-e2e" || echo "No orphaned apps"' C-m +fly apps list 2>/dev/null | grep "dspy-e2e" || echo "No orphaned apps" ``` ### Phase 3: Create and Push Temp Branch ```bash -tmux send-keys -t e2e-fly 'git checkout -b "$TEMP_BRANCH"' C-m -tmux send-keys -t e2e-fly 'git push -u origin "$TEMP_BRANCH"' C-m +git -C "$DSPY_CLI_DIR" checkout -b "$TEMP_BRANCH" +git -C "$DSPY_CLI_DIR" push -u origin "$TEMP_BRANCH" ``` ### Phase 4: Create Test Project ```bash -# Create temp directory -tmux send-keys -t e2e-fly 'export TEST_DIR=$(mktemp -d) && echo "TEST_DIR=$TEST_DIR"' C-m - -# Create project (will prompt for API key confirmation - send Y) -tmux send-keys -t e2e-fly 'uv run --directory "$DSPY_CLI_DIR" dspy-cli new fly-e2e-test --program-name qa_module --signature "question:str -> answer:str" --module-type Predict --model openai/gpt-4o-mini' C-m +export TEST_DIR=$(mktemp -d) && echo "TEST_DIR=$TEST_DIR" -# When prompted "Proceed with this API key? 
[Y/n]:", send: -tmux send-keys -t e2e-fly 'Y' C-m +# Pipe "Y" to accept the API key confirmation prompt +echo "Y" | uv run --directory "$DSPY_CLI_DIR" dspy-cli new fly-e2e-test \ + --program-name qa_module \ + --signature "question:str -> answer:str" \ + --module-type Predict \ + --model openai/gpt-4o-mini -# Move project to temp dir (dspy-cli creates in current dir) -tmux send-keys -t e2e-fly 'mv "$DSPY_CLI_DIR/fly-e2e-test" "$TEST_DIR/" && cd "$TEST_DIR/fly-e2e-test"' C-m +# Move project to temp dir (dspy-cli new creates in current dir) +mv "$DSPY_CLI_DIR/fly-e2e-test" "$TEST_DIR/" +cd "$TEST_DIR/fly-e2e-test" ``` ### Phase 5: Modify for Git-Based dspy-cli ```bash -# Update pyproject.toml to install dspy-cli from temp branch -tmux send-keys -t e2e-fly 'sed -i.bak "s|\"dspy-cli\"|\"dspy-cli @ git+https://github.com/cmpnd-ai/dspy-cli.git@$TEMP_BRANCH\"|" pyproject.toml' C-m +cd "$TEST_DIR/fly-e2e-test" + +# Update pyproject.toml to install dspy-cli from temp branch (use double quotes for variable expansion) +sed -i.bak "s|\"dspy-cli\"|\"dspy-cli @ git+https://github.com/cmpnd-ai/dspy-cli.git@$TEMP_BRANCH\"|" pyproject.toml -# IMPORTANT: Update Dockerfile to include git (required for git-based deps) -# NOTE: This is an example dockerfile. There may be specific changes in a newer version of dspy-cli. Check the current Dockerfile and add the Git install line -tmux send-keys -t e2e-fly 'cat > Dockerfile << '"'"'EOF'"'"' +# Update Dockerfile: add git (required for git-based deps) +# NOTE: Check the current Dockerfile.template for the latest CMD format and update accordingly +cat > Dockerfile << 'EOF' FROM python:3.11-slim ENV PYTHONDONTWRITEBYTECODE=1 ENV PYTHONUNBUFFERED=1 ENV XDG_CACHE_HOME=/tmp/.cache -# Install git for fetching dspy-cli from git URL RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/* COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv @@ -108,16 +105,17 @@ RUN uv sync --no-dev EXPOSE 8000 CMD ["uv", "run", "dspy-cli", "serve", "--host", "0.0.0.0", "--port", "8000", "--auth", "--no-reload"] -EOF' C-m +EOF ``` ### Phase 6: Create fly.toml and Deploy ```bash -# Create fly.toml -tmux send-keys -t e2e-fly 'cat > fly.toml << EOF -app = '"'"'$FLY_APP_NAME'"'"' -primary_region = '"'"'ewr'"'"' +cd "$TEST_DIR/fly-e2e-test" + +cat > fly.toml << EOF +app = '$FLY_APP_NAME' +primary_region = 'ewr' [build] @@ -127,97 +125,120 @@ primary_region = '"'"'ewr'"'"' auto_stop_machines = true auto_start_machines = true min_machines_running = 0 - processes = ['"'"'app'"'"'] + processes = ['app'] -[[vm]] - memory = '"'"'512mb'"'"' - cpu_kind = '"'"'shared'"'"' - cpus = 1 -EOF' C-m +[deploy] + ha = false -# Create app -tmux send-keys -t e2e-fly 'fly apps create "$FLY_APP_NAME" --org personal' C-m +[checks] + [checks.health] + port = 8000 + type = "http" + interval = "10s" + timeout = "5s" + grace_period = "30s" + method = "GET" + path = "/health/live" -# Generate a random API key for testing -tmux send-keys -t e2e-fly 'export DSPY_API_KEY_VALUE="test-e2e-$(head -c 8 /dev/urandom | xxd -p)"' C-m +[[vm]] + memory = '512mb' + cpu_kind = 'shared' + cpus = 1 +EOF -# Set secrets using fly secrets (required env vars for your app) -# Add any additional env vars your project needs here -tmux send-keys -t e2e-fly 'fly secrets set OPENAI_API_KEY="$OPENAI_API_KEY" DSPY_API_KEY="$DSPY_API_KEY_VALUE" --app "$FLY_APP_NAME"' C-m +# Create app and set secrets +fly apps create "$FLY_APP_NAME" --org personal +fly secrets set OPENAI_API_KEY="$OPENAI_API_KEY" 
DSPY_API_KEY="$DSPY_API_KEY_VALUE" --app "$FLY_APP_NAME" # Deploy (takes ~2-3 minutes) -tmux send-keys -t e2e-fly 'fly deploy --app "$FLY_APP_NAME" --wait-timeout 300' C-m +fly deploy --app "$FLY_APP_NAME" --wait-timeout 300 ``` ### Phase 7: Run Integration Tests ```bash -tmux send-keys -t e2e-fly 'export FLY_APP_URL="https://$FLY_APP_NAME.fly.dev"' C-m +export FLY_APP_URL="https://$FLY_APP_NAME.fly.dev" + +# Wait for app to be ready (poll /health/ready) +for i in $(seq 1 30); do + STATUS=$(curl -s -o /dev/null -w "%{http_code}" "$FLY_APP_URL/health/ready") + if [ "$STATUS" = "200" ]; then echo "App ready after ${i}s"; break; fi + sleep 1 +done + +# Test 1: Health endpoints (no auth required) +echo "=== Test 1: Liveness ===" && curl -s "$FLY_APP_URL/health/live" +echo "=== Test 2: Readiness ===" && curl -s "$FLY_APP_URL/health/ready" +echo "=== Test 3: Legacy health ===" && curl -s "$FLY_APP_URL/health" + +# Test 4: Auth redirect (unauthenticated) +echo "=== Test 4: Auth Redirect ===" && curl -s -o /dev/null -w "HTTP: %{http_code}\n" "$FLY_APP_URL/programs" + +# Test 5: Auth success (authenticated) +echo "=== Test 5: Auth Success ===" && curl -s -H "Authorization: Bearer $DSPY_API_KEY_VALUE" "$FLY_APP_URL/programs" + +# Test 6: LLM Module Execution +echo "=== Test 6: LLM Execution ===" && curl -s -X POST \ + -H "Authorization: Bearer $DSPY_API_KEY_VALUE" \ + -H "Content-Type: application/json" \ + -d '{"question": "What is 2+2? Reply with just the number."}' \ + "$FLY_APP_URL/QaModulePredict" +``` -# Test 1: Health Check -tmux send-keys -t e2e-fly 'echo "=== Test 1: Health Check ===" && curl -s "$FLY_APP_URL/health"' C-m -# Expected: {"status":"ok"} +### Phase 8: SSH Inspection (optional) -# Test 2: Auth Redirect (unauthenticated) -tmux send-keys -t e2e-fly 'echo "=== Test 2: Auth Redirect ===" && curl -s -o /dev/null -w "HTTP: %{http_code}\n" "$FLY_APP_URL/programs"' C-m -# Expected: HTTP: 303 +With `ha = false`, there's only one machine so SSH always targets it. 
+Shell redirects like `2>/dev/null` don't work in `-C` commands -- wrap in `sh -c`: -# Test 3: Auth Success (authenticated) -tmux send-keys -t e2e-fly 'echo "=== Test 3: Auth Success ===" && curl -s -H "Authorization: Bearer $DSPY_API_KEY_VALUE" "$FLY_APP_URL/programs"' C-m -# Expected: {"programs":[{"name":"QaModulePredict",...}]} +```bash +# Inspect the machine filesystem +fly ssh console --app "$FLY_APP_NAME" -C "sh -c 'find /root -name \"*.log\" 2>/dev/null'" -# Test 4: LLM Module Execution -tmux send-keys -t e2e-fly 'echo "=== Test 4: LLM Execution ===" && curl -s -X POST -H "Authorization: Bearer $DSPY_API_KEY_VALUE" -H "Content-Type: application/json" -d '"'"'{"question": "What is 2+2?"}'"'"' "$FLY_APP_URL/QaModulePredict"' C-m -# Expected: {"answer":"4"} (or similar) +# Check inference logs +fly ssh console --app "$FLY_APP_NAME" -C "cat /logs/QaModulePredict.log" ``` -### Phase 8: Guaranteed Cleanup +### Phase 9: Guaranteed Cleanup **ALWAYS run cleanup, even if tests fail:** ```bash # Destroy Fly app -tmux send-keys -t e2e-fly 'fly apps destroy "$FLY_APP_NAME" --yes' C-m +fly apps destroy "$FLY_APP_NAME" --yes # Delete remote branch -tmux send-keys -t e2e-fly 'git -C "$DSPY_CLI_DIR" push origin --delete "$TEMP_BRANCH"' C-m +git -C "$DSPY_CLI_DIR" push origin --delete "$TEMP_BRANCH" # Return to main and delete local branch -tmux send-keys -t e2e-fly 'git -C "$DSPY_CLI_DIR" checkout main' C-m -tmux send-keys -t e2e-fly 'git -C "$DSPY_CLI_DIR" branch -D "$TEMP_BRANCH"' C-m +git -C "$DSPY_CLI_DIR" checkout main +git -C "$DSPY_CLI_DIR" branch -D "$TEMP_BRANCH" # Remove temp directory -tmux send-keys -t e2e-fly 'rm -rf "$TEST_DIR"' C-m - -# Kill tmux session -tmux kill-session -t e2e-fly +rm -rf "$TEST_DIR" ``` ## Verification Checklist | Test | Expected Result | |------|-----------------| -| Health Check | `{"status":"ok"}` | +| `/health/live` (no auth) | `{"status":"alive"}` | +| `/health/ready` (no auth) | `{"status":"ready","programs":1}` | +| `/health` (no auth) | `{"status":"ok"}` | | Auth Redirect (no auth) | HTTP 303 | | Auth Success (Bearer token) | JSON with `QaModulePredict` | | LLM Execution | JSON with `"answer"` field | ## Cleanup Verification -After running cleanup, verify: - ```bash -# No orphaned Fly apps -fly apps list | grep "dspy-e2e" || echo "Clean" - -# No orphaned branches -git branch -r | grep "e2e-test/" || echo "Clean" +fly apps list | grep "dspy-e2e" || echo "No orphaned apps" +git branch -r | grep "e2e-test/" || echo "No orphaned branches" ``` ## Troubleshooting ### Deploy fails with "Git executable not found" -The Dockerfile must include git installation. Ensure the Dockerfile has: +The Dockerfile must include git installation: ```dockerfile RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/* ``` @@ -230,13 +251,19 @@ sed -i.bak 's|...|...|' pyproject.toml # Won't expand $TEMP_BRANCH ``` ### Project created in wrong directory -`dspy-cli new` creates projects relative to the current working directory, not where it's run from. Move the project after creation: +`dspy-cli new` creates projects relative to the current working directory. 
Move the project after creation: ```bash mv "$DSPY_CLI_DIR/fly-e2e-test" "$TEST_DIR/" ``` +### Shell redirects fail in fly ssh -C +Wrap the remote command in `sh -c`: +```bash +fly ssh console --app "$FLY_APP_NAME" -C "sh -c 'find / -name \"*dspy*\" 2>/dev/null'" +``` + ### Cleanup fails -If any cleanup step fails, run them individually: +Run each step individually: ```bash fly apps destroy "dspy-e2e-XXXX" --yes git push origin --delete "e2e-test/XXXX" @@ -245,27 +272,18 @@ git branch -D "e2e-test/XXXX" ``` ### App crashes due to missing environment variables -Use `fly secrets` to set any required env vars. Check the app logs to see which vars are missing: ```bash -# View logs to find missing env vars fly logs --app "$FLY_APP_NAME" --no-tail - -# Set additional secrets as needed -fly secrets set VAR_NAME="value" ANOTHER_VAR="value" --app "$FLY_APP_NAME" - -# List current secrets +fly secrets set VAR_NAME="value" --app "$FLY_APP_NAME" fly secrets list --app "$FLY_APP_NAME" ``` -Common env vars that might be needed: +Common env vars: - `OPENAI_API_KEY` - Required for OpenAI models - `DSPY_API_KEY` - Required when `--auth` is enabled -- Project-specific vars (check your gateway's `setup()` method) - -## Multi-Layer Cleanup Protection -1. **Unique naming**: `dspy-e2e-{random}` prevents conflicts -2. **Pre-test orphan cleanup**: Removes stale resources before starting -3. **tmux session**: Enables output capture and manual recovery -4. **Explicit cleanup phase**: Always runs after tests -5. **Verification commands**: Confirm cleanup succeeded +### Per-machine cache fragmentation (multi-machine deployments) +The `ha = false` setting in fly.toml keeps E2E tests on a single machine, +avoiding this issue. For production deployments with multiple machines, the +LM response cache (`.dspy_cache`) is local to each VM, so requests hitting +different machines may miss the cache. diff --git a/src/dspy_cli/server/app.py b/src/dspy_cli/server/app.py index 471a562..6496d22 100644 --- a/src/dspy_cli/server/app.py +++ b/src/dspy_cli/server/app.py @@ -1,17 +1,24 @@ -"""FastAPI application factory.""" +"""FastAPI application factory with deferred initialization. + +The app shell (health endpoints, CORS, auth) is created immediately so the +server port binds fast. Heavy work (module discovery, LM creation, route +registration) is deferred to the ASGI lifespan startup, which runs *after* +the socket is listening. This eliminates false "not listening" warnings from +container orchestrators like Fly.io and Kubernetes. 
+""" import logging import os from contextlib import asynccontextmanager +from dataclasses import dataclass from pathlib import Path -from typing import Dict, List, Union +from typing import Dict, List, Optional, Union + +from fastapi import FastAPI from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import JSONResponse - import dspy -from fastapi import FastAPI - from dspy_cli.config import get_model_config, get_program_model from dspy_cli.discovery import discover_modules from dspy_cli.discovery.gateway_finder import get_gateways_for_module, is_cron_gateway @@ -26,6 +33,17 @@ logger = logging.getLogger(__name__) +@dataclass +class _AppInitParams: + """Parameters stored on app.state for deferred initialization.""" + config: Dict + package_path: Path + package_name: str + logs_dir: Path + enable_auth: bool + sync_workers: Optional[int] + + def create_app( config: Dict, package_path: Path, @@ -37,6 +55,10 @@ def create_app( ) -> FastAPI: """Create and configure the FastAPI application. + Returns a lightweight app shell with health endpoints. Heavy initialization + (module discovery, LM creation, route registration) is deferred to the + ASGI lifespan so the server port binds quickly. + Args: config: Loaded configuration dictionary package_path: Path to the modules package @@ -49,30 +71,80 @@ def create_app( Returns: Configured FastAPI application """ - # Setup logging setup_logging() - # Initialize bounded executor for sync module execution - worker_count = sync_workers or config.get("server", {}).get("sync_worker_threads") or DEFAULT_SYNC_WORKERS - init_executor(max_workers=worker_count) + # Configure default DSPy model synchronously (before async lifespan). + # dspy.settings.configure() must be called outside async tasks due to DSPy's + # ownership guard, and it's lightweight (no I/O). 
+ default_model_alias = config["models"]["default"] + default_model_config = get_model_config(config, default_model_alias) + _configure_dspy_model(default_model_config) + logger.info(f"Configured default model: {default_model_alias}") + + init_params = _AppInitParams( + config=config, + package_path=package_path, + package_name=package_name, + logs_dir=logs_dir, + enable_auth=enable_auth, + sync_workers=sync_workers, + ) - # Create FastAPI app app = FastAPI( title="DSPy API", description="Automatically generated API for DSPy programs", version="0.1.0", - lifespan=lifespan, + lifespan=_make_lifespan(init_params), ) - # Configure CORS if enabled (env var takes precedence over config file) + # Pre-init state + app.state._ready = False + app.state.modules = [] + app.state.program_lms = {} + app.state.logs_dir = logs_dir + app.state.metrics_cache = {} + + # CORS (lightweight, no deferred init needed) + _setup_cors(app, config) + + # Health endpoints (available as soon as the port binds) + @app.get("/health/live") + async def liveness(): + """Liveness probe -- returns 200 if the process is running.""" + return {"status": "alive"} + + @app.get("/health/ready") + async def readiness(): + """Readiness probe -- returns 200 when all LM instances are initialized.""" + if not app.state._ready: + return JSONResponse(status_code=503, content={"status": "starting"}) + modules = app.state.modules + if not modules: + return JSONResponse(status_code=503, content={"status": "not_ready", "reason": "no modules discovered"}) + missing = [m.name for m in modules if m.name not in app.state.program_lms] + if missing: + return JSONResponse(status_code=503, content={"status": "not_ready", "reason": f"LMs not initialized: {missing}"}) + return {"status": "ready", "programs": len(modules)} + + # Auth middleware (early setup with mutable open_paths so lifespan can add gateway paths) + if enable_auth: + _setup_auth_middleware(app) + + return app + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _setup_cors(app: FastAPI, config: Dict): + """Configure CORS middleware (lightweight, runs immediately).""" cors_origins: Union[str, List[str], None] = os.environ.get("DSPY_CORS_ORIGINS") if cors_origins is None: cors_origins = config.get("server", {}).get("cors_origins") if cors_origins: - if cors_origins == "*" or cors_origins == ["*"]: - # Wildcard mode - no credentials allowed app.add_middleware( CORSMiddleware, allow_origins=["*"], @@ -82,7 +154,6 @@ def create_app( ) logger.info("CORS enabled for all origins (wildcard mode)") else: - # Specific origins - allow credentials origins = ( cors_origins if isinstance(cors_origins, list) @@ -97,13 +168,102 @@ def create_app( ) logger.info(f"CORS enabled for origins: {origins}") - # Store logs directory and metrics cache in app state - app.state.logs_dir = logs_dir - app.state.metrics_cache = {} + +def _setup_auth_middleware(app: FastAPI): + """Set up auth middleware with default open paths. + + The open_paths set is mutable so the lifespan can add gateway public + paths after module discovery. + """ + from dspy_cli.server.auth import ( + DEFAULT_OPEN_PATHS, + AuthMiddleware, + create_auth_routes, + generate_token, + get_api_token, + ) + + token = get_api_token() + if not token: + token = generate_token() + os.environ["DSPY_API_KEY"] = token + logger.warning("=" * 60) + logger.warning("DSPY_API_KEY not set. 
Generated temporary token:") + logger.warning(f" {token}") + logger.warning("Set DSPY_API_KEY as an environment secret for a persistent token.") + logger.warning("=" * 60) + + auth_router = create_auth_routes(token) + app.include_router(auth_router) + + # Mutable set -- _deferred_init will add gateway public paths later + open_paths = set(DEFAULT_OPEN_PATHS) + app.state._auth_open_paths = open_paths + + app.add_middleware(AuthMiddleware, token=token, open_paths=open_paths) + logger.info("Authentication enabled") + + +# --------------------------------------------------------------------------- +# Lifespan & deferred initialization +# --------------------------------------------------------------------------- + +def _make_lifespan(params: _AppInitParams): + """Create the ASGI lifespan context manager. + + Heavy initialization runs here, *after* the server socket is bound. + """ + @asynccontextmanager + async def lifespan(app: FastAPI): + # --- Startup (port is already listening at this point) --- + _deferred_init(app, params) + + # Run any callbacks registered by runner.py (MCP, OpenAPI save, etc.) + for cb in getattr(app.state, "_on_ready_callbacks", []): + try: + cb() + except Exception as e: + logger.warning(f"on_ready callback error: {e}") + + scheduler = getattr(app.state, "scheduler", None) + if scheduler and scheduler.job_count > 0: + scheduler.start() + + yield + + # --- Shutdown --- + if scheduler and scheduler.job_count > 0: + scheduler.shutdown() + + for shutdown_fn in getattr(app.state, "_gateway_shutdowns", []): + try: + shutdown_fn() + except Exception as e: + logger.warning(f"Gateway shutdown error: {e}") + + shutdown_executor() + + return lifespan + + +def _deferred_init(app: FastAPI, params: _AppInitParams): + """Heavy initialization: module discovery, LM creation, route registration. + + Called from the lifespan startup, after the server port is already bound. 
+ """ + config = params.config + + # Initialize bounded executor for sync module execution + worker_count = ( + params.sync_workers + or config.get("server", {}).get("sync_worker_threads") + or DEFAULT_SYNC_WORKERS + ) + init_executor(max_workers=worker_count) # Discover modules - logger.info(f"Discovering modules in {package_path}") - modules = discover_modules(package_path, package_name) + logger.info(f"Discovering modules in {params.package_path}") + modules = discover_modules(params.package_path, params.package_name) if not modules: logger.warning("No DSPy modules discovered!") @@ -115,49 +275,32 @@ def create_app( duplicate_set = set(duplicates) error_msg = f"Error: Duplicate module names found: {', '.join(sorted(duplicate_set))}" logger.error(error_msg) - logger.error("Each module must have a unique class name.") raise ValueError(error_msg) - # Configure default model - default_model_alias = config["models"]["default"] - default_model_config = get_model_config(config, default_model_alias) - _configure_dspy_model(default_model_config) - - logger.info(f"Configured default model: {default_model_alias}") - - # Create LM instances for each program and store them - app.state.program_lms = {} + # Create LM instances for each program (dspy.settings already configured in create_app) for module in modules: - # Get model for this program (could be overridden) model_alias = get_program_model(config, module.name) model_config = get_model_config(config, model_alias) - - # Create LM instance for this program lm = _create_lm_instance(model_config) app.state.program_lms[module.name] = lm - logger.info(f"Created LM for program: {module.name} (model: {model_alias})") # Initialize scheduler for cron gateways - scheduler = GatewayScheduler(logs_dir) + scheduler = GatewayScheduler(params.logs_dir) app.state.scheduler = scheduler # Track registered API paths to detect conflicts - registered_paths: Dict[str, str] = {} # path -> "module.gateway" for error messages + registered_paths: Dict[str, str] = {} # Create routes for each discovered module for module in modules: - # Get the LM instance for this program lm = app.state.program_lms[module.name] model_alias = get_program_model(config, module.name) model_config = get_model_config(config, model_alias) - - # Get all gateways for this module and route by type gateways = get_gateways_for_module(module) for gateway in gateways: if is_cron_gateway(gateway): - # Register with scheduler instead of creating HTTP route scheduler.register_cron_gateway( module=module, gateway=gateway, @@ -166,7 +309,6 @@ def create_app( ) logger.info(f"Registered cron gateway: {module.name} ({gateway.__class__.__name__}, schedule: {gateway.schedule})") elif isinstance(gateway, APIGateway): - # Calculate the route path (same logic as routes.py) if gateway.path: route_path = gateway.path elif isinstance(gateway, IdentityGateway): @@ -174,7 +316,6 @@ def create_app( else: route_path = f"/{module.name}/{gateway.__class__.__name__}" - # Check for path conflicts gateway_id = f"{module.name}.{gateway.__class__.__name__}" if route_path in registered_paths: existing = registered_paths[route_path] @@ -192,22 +333,6 @@ def create_app( else: logger.warning(f"Unknown gateway type for {module.name}: {type(gateway)}") - # Health check endpoints - @app.get("/health/live") - async def liveness(): - """Liveness probe -- returns 200 if the process is running.""" - return {"status": "alive"} - - @app.get("/health/ready") - async def readiness(): - """Readiness probe -- returns 200 when all LM instances 
are initialized.""" - if not modules: - return JSONResponse(status_code=503, content={"status": "not_ready", "reason": "no modules discovered"}) - missing = [m.name for m in modules if m.name not in app.state.program_lms] - if missing: - return JSONResponse(status_code=503, content={"status": "not_ready", "reason": f"LMs not initialized: {missing}"}) - return {"status": "ready", "programs": len(modules)} - # Add programs list endpoint @app.get("/programs") async def list_programs(): @@ -215,29 +340,20 @@ async def list_programs(): programs = [] for module in modules: model_alias = get_program_model(config, module.name) - - program_info = { + programs.append({ "name": module.name, "model": model_alias, "endpoint": f"/{module.name}", - } - - programs.append(program_info) - + }) return {"programs": programs} # Add metrics endpoints @app.get("/api/metrics") async def list_metrics(sort_by: str = "calls", order: str = "desc"): - """Get aggregated metrics for all programs. - - Args: - sort_by: Sort key (name, calls, latency, cost, tokens, last_call) - order: Sort order (asc, desc) - """ + """Get aggregated metrics for all programs.""" program_names = [m.name for m in modules] metrics_list = get_all_metrics( - logs_dir, + params.logs_dir, program_names, app.state.metrics_cache, sort_by=sort_by, @@ -253,40 +369,28 @@ async def program_metrics(program_name: str): raise HTTPException(status_code=404, detail=f"Program '{program_name}' not found") metrics = get_program_metrics_cached( - logs_dir, + params.logs_dir, program_name, app.state.metrics_cache, ) return {"metrics": metrics.to_dict()} - # Store modules in app state for access by routes + # Store modules and config in app state app.state.modules = modules app.state.config = config - # Enhance OpenAPI metadata with DSPy-specific information + # Enhance OpenAPI metadata app_id = config.get("app_id", "DSPy API") app_description = config.get("description", "Automatically generated API for DSPy programs") - - # Create program-to-model mapping program_models = {module.name: get_program_model(config, module.name) for module in modules} - - # Create DSPy extensions extensions = create_openapi_extensions(config, modules, program_models) - - enhance_openapi_metadata( - app, - title=app_id, - description=app_description, - extensions=extensions - ) - + enhance_openapi_metadata(app, title=app_id, description=app_description, extensions=extensions) logger.info("Enhanced OpenAPI metadata with DSPy configuration") - # Register UI routes (always enabled) + # Register UI routes from fastapi.staticfiles import StaticFiles from dspy_cli.server.ui import create_ui_routes - # Mount static files static_dir = Path(__file__).parent.parent / "templates" / "ui" / "static" if static_dir.exists(): app.mount("/static", StaticFiles(directory=str(static_dir)), name="static") @@ -294,81 +398,27 @@ async def program_metrics(program_name: str): else: logger.warning(f"Static directory not found: {static_dir}") - # Create UI routes - create_ui_routes(app, modules, config, logs_dir, auth_enabled=enable_auth) + create_ui_routes(app, modules, config, params.logs_dir, auth_enabled=params.enable_auth) logger.info("UI routes registered") - # Setup authentication if enabled - if enable_auth: - from dspy_cli.server.auth import ( - DEFAULT_OPEN_PATHS, - AuthMiddleware, - create_auth_routes, - generate_token, - get_api_token, - ) - - token = get_api_token() - if not token: - # Auto-generate a token and log it (Jupyter-style) - token = generate_token() - import os as os_module - 
os_module.environ["DSPY_API_KEY"] = token - logger.warning("=" * 60) - logger.warning("DSPY_API_KEY not set. Generated temporary token:") - logger.warning(f" {token}") - logger.warning("Set DSPY_API_KEY as an environment secret for a persistent token.") - logger.warning("=" * 60) - - # Add auth routes (login/logout) - auth_router = create_auth_routes(token) - app.include_router(auth_router) - - # Combine default open paths with gateway public paths (requires_auth=False) - open_paths = set(DEFAULT_OPEN_PATHS) + # Update auth open paths with gateway public paths discovered during route creation + if params.enable_auth and hasattr(app.state, "_auth_open_paths"): if hasattr(app.state, "public_paths"): - open_paths.update(app.state.public_paths) + app.state._auth_open_paths.update(app.state.public_paths) - # Add auth middleware (must be added after routes) - app.add_middleware(AuthMiddleware, token=token, open_paths=open_paths) - logger.info("Authentication enabled") + # Reset cached OpenAPI schema since routes were added dynamically + app.openapi_schema = None - return app + app.state._ready = True + logger.info("Application initialization complete") -@asynccontextmanager -async def lifespan(app: FastAPI): - """Lifespan context manager for startup/shutdown events.""" - # Startup - scheduler = getattr(app.state, "scheduler", None) - if scheduler and scheduler.job_count > 0: - scheduler.start() - - yield - - # Shutdown - if scheduler and scheduler.job_count > 0: - scheduler.shutdown() - - for shutdown_fn in getattr(app.state, "_gateway_shutdowns", []): - try: - shutdown_fn() - except Exception as e: - logger.warning(f"Gateway shutdown error: {e}") - - shutdown_executor() - +# --------------------------------------------------------------------------- +# LM helpers (unchanged) +# --------------------------------------------------------------------------- def _create_lm_instance(model_config: Dict) -> dspy.LM: - """Create a DSPy LM instance from configuration. - - Args: - model_config: Model configuration dictionary - - Returns: - Configured LM instance - """ - # Extract configuration + """Create a DSPy LM instance from configuration.""" model = model_config.get("model") model_type = model_config.get("model_type", "chat") temperature = model_config.get("temperature") @@ -377,7 +427,6 @@ def _create_lm_instance(model_config: Dict) -> dspy.LM: api_base = model_config.get("api_base") cache = model_config.get("cache") - # Build kwargs kwargs = {} if temperature is not None: kwargs["temperature"] = temperature @@ -390,25 +439,12 @@ def _create_lm_instance(model_config: Dict) -> dspy.LM: if cache is not None: kwargs["cache"] = cache - # Create and return LM instance - return dspy.LM( - model=model, - model_type=model_type, - **kwargs - ) + return dspy.LM(model=model, model_type=model_type, **kwargs) def _configure_dspy_model(model_config: Dict): - """Configure DSPy with a language model. - - Args: - model_config: Model configuration dictionary - """ - # Create LM instance + """Configure DSPy with a language model.""" lm = _create_lm_instance(model_config) - - # Disable global history: it's an unprotected plain list that races under - # concurrent async/threaded requests. Inference logs capture everything we need. 
dspy.settings.configure(lm=lm, disable_history=True) model = model_config.get("model") diff --git a/src/dspy_cli/server/runner.py b/src/dspy_cli/server/runner.py index c6aac83..867ed7e 100644 --- a/src/dspy_cli/server/runner.py +++ b/src/dspy_cli/server/runner.py @@ -124,8 +124,11 @@ def create_app_instance(): sync_workers=sync_workers, ) - # Mount MCP if enabled - _maybe_mount_mcp(app, enable_mcp) + # Register post-init callbacks (run by the lifespan after module discovery) + def _on_ready(): + _maybe_mount_mcp(app, enable_mcp) + + app.state._on_ready_callbacks = [_on_ready] return app @@ -199,61 +202,26 @@ def main( enable_auth=auth, sync_workers=sync_workers, ) - - # Mount MCP if enabled - def notify_cli(msg: str, level: str = "info"): - color = "green" if level == "info" else "yellow" - click.echo(click.style(msg, fg=color)) - - _maybe_mount_mcp(app, mcp, notify=notify_cli) - except Exception as e: click.echo(click.style(f"Error creating application: {e}", fg="red")) raise click.Abort() - click.echo() - click.echo(click.style("Discovered Programs:", fg="cyan", bold=True)) - click.echo() - - if hasattr(app.state, "modules") and app.state.modules: - for module in app.state.modules: - click.echo(f" • {module.name}") - click.echo(f" POST /{module.name}") - else: - click.echo(click.style(" No programs discovered", fg="yellow")) - click.echo() - click.echo("Make sure your DSPy modules:") - click.echo(" 1. Are in src//modules/") - click.echo(" 2. Subclass dspy.Module") - click.echo(" 3. Are not named with a leading underscore") - click.echo(" 4. If you are using external dependencies:") - from dspy_cli.utils.venv import venv_activate_command - click.echo(f" - Ensure your venv is activated ({venv_activate_command()})") - click.echo(" - Make sure you have dspy-cli as a local dependency") - click.echo(" - Install them using pip install -e .") + # Register post-init callbacks (run by the lifespan after module discovery) + def _on_ready(): + _maybe_mount_mcp(app, mcp) + if save_openapi: + try: + spec = generate_openapi_spec(app) + spec_filename = f"openapi.{openapi_format}" + spec_path = Path.cwd() / spec_filename + save_openapi_spec(spec, spec_path, format=openapi_format) + logger.info("OpenAPI spec saved: %s", spec_filename) + except Exception as e: + logger.warning("Could not save OpenAPI spec: %s", e) + + app.state._on_ready_callbacks = [_on_ready] click.echo() - click.echo(click.style("Additional Endpoints:", fg="cyan", bold=True)) - click.echo() - click.echo(" GET /programs - List all programs and their schemas") - click.echo(" GET /openapi.json - OpenAPI specification") - click.echo(" GET / - Web UI for interactive testing") - if mcp: - click.echo(" POST /mcp - Model Context Protocol server") - click.echo() - - # Generate and save OpenAPI spec if requested - if save_openapi: - try: - spec = generate_openapi_spec(app) - spec_filename = f"openapi.{openapi_format}" - spec_path = Path.cwd() / spec_filename - save_openapi_spec(spec, spec_path, format=openapi_format) - click.echo(click.style(f"✓ OpenAPI spec saved: {spec_filename}", fg="green")) - click.echo() - except Exception as e: - click.echo(click.style(f"Warning: Could not save OpenAPI spec: {e}", fg="yellow")) - click.echo() host_string = "localhost" if host == "0.0.0.0" else host click.echo(click.style("=" * 60, fg="cyan")) diff --git a/tests/gateway/test_gateway_routes.py b/tests/gateway/test_gateway_routes.py index e47ad03..e4b4c9a 100644 --- a/tests/gateway/test_gateway_routes.py +++ b/tests/gateway/test_gateway_routes.py @@ -184,9 
+184,10 @@ def test_custom_gateway_path(self, gateway_project, test_config): enable_ui=False ) - routes = [r.path for r in app.routes if hasattr(r, "path")] - assert "/webhooks/process" in routes - assert "/WebhookProcessor" not in routes + with TestClient(app): + routes = [r.path for r in app.routes if hasattr(r, "path")] + assert "/webhooks/process" in routes + assert "/WebhookProcessor" not in routes def test_custom_gateway_input_transform(self, gateway_project, test_config): """Gateway should transform webhook payload to pipeline inputs.""" @@ -246,8 +247,9 @@ def test_no_gateway_uses_module_name_path(self, identity_gateway_project, test_c enable_ui=False ) - routes = [r.path for r in app.routes if hasattr(r, "path")] - assert "/Echo" in routes + with TestClient(app): + routes = [r.path for r in app.routes if hasattr(r, "path")] + assert "/Echo" in routes def test_identity_gateway_passthrough(self, identity_gateway_project, test_config): """IdentityGateway should pass inputs/outputs unchanged.""" diff --git a/tests/test_serve_integration.py b/tests/test_serve_integration.py index 27e4955..3937779 100644 --- a/tests/test_serve_integration.py +++ b/tests/test_serve_integration.py @@ -103,21 +103,22 @@ def test_create_app_discovers_modules(temp_project, test_config): enable_ui=False ) - # Verify Echo module was discovered - assert hasattr(app.state, "modules") - module_names = [m.name for m in app.state.modules] - assert "Echo" in module_names - - # Verify POST /Echo route exists - routes = [r for r in app.routes] - echo_route = None - for route in routes: - if hasattr(route, "path") and route.path == "/Echo": - echo_route = route - break - - assert echo_route is not None, "POST /Echo route not found" - assert "POST" in echo_route.methods + with TestClient(app): + # Verify Echo module was discovered + assert hasattr(app.state, "modules") + module_names = [m.name for m in app.state.modules] + assert "Echo" in module_names + + # Verify POST /Echo route exists + routes = [r for r in app.routes] + echo_route = None + for route in routes: + if hasattr(route, "path") and route.path == "/Echo": + echo_route = route + break + + assert echo_route is not None, "POST /Echo route not found" + assert "POST" in echo_route.methods def test_openapi_spec_generation(temp_project, test_config): @@ -130,17 +131,18 @@ def test_openapi_spec_generation(temp_project, test_config): enable_ui=False ) - # Generate spec - spec = generate_openapi_spec(app) - - # Verify basic structure - assert "openapi" in spec - assert "paths" in spec - assert "/Echo" in spec["paths"] - assert "post" in spec["paths"]["/Echo"] - - # Verify other standard endpoints - assert "/programs" in spec["paths"] + with TestClient(app): + # Generate spec (routes registered during lifespan) + spec = generate_openapi_spec(app) + + # Verify basic structure + assert "openapi" in spec + assert "paths" in spec + assert "/Echo" in spec["paths"] + assert "post" in spec["paths"]["/Echo"] + + # Verify other standard endpoints + assert "/programs" in spec["paths"] def test_save_openapi_spec_json(temp_project, test_config): @@ -153,17 +155,17 @@ def test_save_openapi_spec_json(temp_project, test_config): enable_ui=False ) - spec = generate_openapi_spec(app) - output_path = temp_project["root"] / "openapi.json" - - save_openapi_spec(spec, output_path, format="json") - - assert output_path.exists() - - # Verify it's valid JSON - import json - content = json.loads(output_path.read_text()) - assert "/Echo" in content["paths"] + with TestClient(app): + spec = 
generate_openapi_spec(app) + output_path = temp_project["root"] / "openapi.json" + + save_openapi_spec(spec, output_path, format="json") + + assert output_path.exists() + + import json + content = json.loads(output_path.read_text()) + assert "/Echo" in content["paths"] def test_save_openapi_spec_yaml(temp_project, test_config): @@ -176,17 +178,17 @@ def test_save_openapi_spec_yaml(temp_project, test_config): enable_ui=False ) - spec = generate_openapi_spec(app) - output_path = temp_project["root"] / "openapi.yaml" - - save_openapi_spec(spec, output_path, format="yaml") - - assert output_path.exists() - - # Verify it's valid YAML - import yaml - content = yaml.safe_load(output_path.read_text()) - assert "/Echo" in content["paths"] + with TestClient(app): + spec = generate_openapi_spec(app) + output_path = temp_project["root"] / "openapi.yaml" + + save_openapi_spec(spec, output_path, format="yaml") + + assert output_path.exists() + + import yaml + content = yaml.safe_load(output_path.read_text()) + assert "/Echo" in content["paths"] def test_runner_main_no_reload(temp_project, test_config, monkeypatch): @@ -196,7 +198,7 @@ def test_runner_main_no_reload(temp_project, test_config, monkeypatch): # Mock load_config to return test config monkeypatch.setattr("dspy_cli.server.runner.load_config", lambda: test_config) - # Mock uvicorn.run to avoid starting server + # Mock uvicorn.run to trigger the lifespan (via TestClient) so _on_ready runs calls = [] def fake_run(app_or_str, **kw): calls.append({ @@ -206,6 +208,10 @@ def fake_run(app_or_str, **kw): "reload": kw.get("reload", False), "factory": kw.get("factory", False) }) + # Simulate startup so _on_ready callbacks fire (e.g. OpenAPI save) + if not isinstance(app_or_str, str): + with TestClient(app_or_str): + pass monkeypatch.setattr("uvicorn.run", fake_run) @@ -226,7 +232,7 @@ def fake_run(app_or_str, **kw): assert calls[0]["port"] == 1234 assert calls[0]["reload"] is False - # Verify OpenAPI was saved + # Verify OpenAPI was saved (by _on_ready callback during lifespan) assert (temp_project["root"] / "openapi.json").exists() From d331fa80e5a16bb71104a7db4508abbae1538e3f Mon Sep 17 00:00:00 2001 From: isaacbmiller Date: Sun, 1 Mar 2026 16:52:59 -0500 Subject: [PATCH 2/4] feat: add fly.io load test skill for production scaling validation Co-authored-by: factory-droid[bot] <138933559+factory-droid[bot]@users.noreply.github.com> --- .claude/skills/fly-load-test/SKILL.md | 459 ++++++++++++++++++++++++++ 1 file changed, 459 insertions(+) create mode 100644 .claude/skills/fly-load-test/SKILL.md diff --git a/.claude/skills/fly-load-test/SKILL.md b/.claude/skills/fly-load-test/SKILL.md new file mode 100644 index 0000000..934c1f8 --- /dev/null +++ b/.claude/skills/fly-load-test/SKILL.md @@ -0,0 +1,459 @@ +--- +name: fly-load-test +description: Load test dspy-cli on Fly.io with synthetic delay module (zero LLM cost). Finds per-machine concurrency ceiling, tests autoscaling, produces sizing guide. (project) +allowed-tools: + - Bash +--- + +# Fly.io Load Test Skill + +Deploy dspy-cli with a **SleepModule** (zero LLM cost) to Fly.io, use `hey` to find the per-machine concurrency ceiling, test multi-machine autoscaling, and produce a production sizing guide. + +## CRITICAL RULES + +1. **NEVER commit directly to main** - Always create a side branch +2. **ALWAYS clean up** - Destroy Fly apps and delete temp branches, even if tests fail +3. **No real LLM calls** - The SleepModule simulates latency with `time.sleep()` +4. 
**Record all results** - Print `hey` output and memory stats for every phase + +## Prerequisites + +1. **fly CLI**: Installed and authenticated (`fly auth whoami`) +2. **hey**: Load testing tool (`brew install hey`) +3. **Git**: Clean working directory (stash uncommitted changes first) +4. **Git push access**: Ability to push to origin + +## Quick Start + +### Phase 1: Setup Environment + +```bash +export DSPY_CLI_DIR="/Users/isaac/projects/dspy-cli" +export TIMESTAMP=$(date +%s) +export RANDOM_SUFFIX=$(head -c 4 /dev/urandom | xxd -p) +export FLY_APP_NAME="dspy-load-${RANDOM_SUFFIX}" +export TEMP_BRANCH="load-test/${TIMESTAMP}-${RANDOM_SUFFIX}" +export DSPY_API_KEY_VALUE="load-test-$(head -c 8 /dev/urandom | xxd -p)" + +echo "App: $FLY_APP_NAME Branch: $TEMP_BRANCH" +``` + +### Phase 2: Pre-flight Checks + +```bash +fly version && fly auth whoami +which hey || echo "INSTALL hey: brew install hey" +git -C "$DSPY_CLI_DIR" status --porcelain + +# Clean up any orphaned load test resources +fly apps list 2>/dev/null | grep "dspy-load" || echo "No orphaned apps" +``` + +### Phase 3: Create and Push Temp Branch + +```bash +git -C "$DSPY_CLI_DIR" checkout -b "$TEMP_BRANCH" +git -C "$DSPY_CLI_DIR" push -u origin "$TEMP_BRANCH" +``` + +### Phase 4: Create Test Project with SleepModule + +```bash +export TEST_DIR=$(mktemp -d) && echo "TEST_DIR=$TEST_DIR" + +# Create project (pipe "Y" to accept the API key prompt) +echo "Y" | uv run --directory "$DSPY_CLI_DIR" dspy-cli new load-test-app \ + --program-name sleep_module \ + --signature "delay_seconds:float -> result:str" \ + --module-type Predict \ + --model openai/gpt-4o-mini + +mv "$DSPY_CLI_DIR/load-test-app" "$TEST_DIR/" +cd "$TEST_DIR/load-test-app" +``` + +Now replace the generated module with SleepModule (which never calls an LM): + +```bash +cd "$TEST_DIR/load-test-app" + +# Find the generated module file and replace it +MODULE_FILE=$(find src/*/modules/ -name "*.py" ! -name "__init__.py" | head -1) +echo "Replacing module: $MODULE_FILE" + +cat > "$MODULE_FILE" << 'PYEOF' +import time +import dspy + + +class SleepModule(dspy.Module): + """Synthetic delay module for load testing. Never calls an LLM.""" + + def forward(self, delay_seconds: float = 1.0) -> str: + time.sleep(delay_seconds) + return f"slept {delay_seconds}s" +PYEOF +``` + +### Phase 5: Modify for Git-Based dspy-cli + +```bash +cd "$TEST_DIR/load-test-app" + +# Install dspy-cli from temp branch +sed -i.bak "s|\"dspy-cli\"|\"dspy-cli @ git+https://github.com/cmpnd-ai/dspy-cli.git@$TEMP_BRANCH\"|" pyproject.toml + +# Custom Dockerfile with git support +cat > Dockerfile << 'EOF' +FROM python:3.11-slim + +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 +ENV XDG_CACHE_HOME=/tmp/.cache + +RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/* + +COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv + +COPY . . 
+RUN uv sync --no-dev + +EXPOSE 8000 + +CMD ["uv", "run", "dspy-cli", "serve", "--host", "0.0.0.0", "--port", "8000", "--auth", "--no-reload", "--sync-workers", "64"] +EOF +``` + +**IMPORTANT**: The `--sync-workers` value in the CMD changes per phase: +- Phase A: `--sync-workers 64` +- Phase B: `--sync-workers 128` +- Phase C: `--sync-workers 256` + +To change it between phases, edit the Dockerfile CMD and redeploy: +```bash +sed -i.bak 's/--sync-workers [0-9]*/--sync-workers 128/' Dockerfile +fly deploy --app "$FLY_APP_NAME" --wait-timeout 300 +``` + +### Phase 6: Create fly.toml and Deploy + +```bash +cd "$TEST_DIR/load-test-app" + +cat > fly.toml << EOF +app = '$FLY_APP_NAME' +primary_region = 'ewr' + +[build] + +[deploy] + ha = false + +[http_service] + internal_port = 8000 + force_https = true + auto_stop_machines = 'stop' + auto_start_machines = true + min_machines_running = 1 + processes = ['app'] + + [http_service.concurrency] + type = 'requests' + soft_limit = 100 + hard_limit = 128 + +[checks] + [checks.health] + port = 8000 + type = "http" + interval = "10s" + timeout = "5s" + grace_period = "30s" + method = "GET" + path = "/health/live" + +[[vm]] + memory = '1gb' + cpu_kind = 'shared' + cpus = 2 +EOF + +fly apps create "$FLY_APP_NAME" --org personal + +# Dummy OpenAI key -- SleepModule never calls an LM, but dspy.LM() init needs it +fly secrets set OPENAI_API_KEY="sk-dummy-not-used" DSPY_API_KEY="$DSPY_API_KEY_VALUE" --app "$FLY_APP_NAME" + +fly deploy --app "$FLY_APP_NAME" --wait-timeout 300 +``` + +### Phase 7: Wait for Ready + +```bash +export FLY_APP_URL="https://$FLY_APP_NAME.fly.dev" + +for i in $(seq 1 60); do + STATUS=$(curl -s -o /dev/null -w "%{http_code}" "$FLY_APP_URL/health/ready") + if [ "$STATUS" = "200" ]; then echo "App ready after ${i}s"; break; fi + echo "Waiting... ($STATUS)" + sleep 2 +done + +# Confirm SleepModule is discovered +curl -s -H "Authorization: Bearer $DSPY_API_KEY_VALUE" "$FLY_APP_URL/programs" +``` + +### Phase 8: Single-Machine Load Tests + +Run `hey` sweeps at increasing concurrency. Each run sends 200 requests with a 1s sleep delay. Theoretical max throughput = `min(concurrency, sync_workers)` rps. 
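A quick way to sanity-check each run against that formula (a back-of-envelope sketch; `c` and `w` are placeholders for the concurrency and worker count actually tested):

```bash
# With a pure 1s sleep per request and no other contention:
#   expected rps ≈ min(c, w)       -- throughput saturates at the worker count
#   expected p50 ≈ max(1, c / w) s -- requests queue once c exceeds w
c=200; w=128
rps=$(( c < w ? c : w ))
p50=$(awk -v c="$c" -v w="$w" 'BEGIN { p = c / w; if (p < 1) p = 1; printf "%.1f", p }')
echo "c=$c workers=$w -> expect ~${rps} req/s, p50 ~${p50}s"
```

If a run falls well short of these numbers, check memory and error rates as described in "Interpreting Results" below.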
+ +**Phase A: --sync-workers 64** + +```bash +export URL="$FLY_APP_URL/SleepModule" +export AUTH="Authorization: Bearer $DSPY_API_KEY_VALUE" +export BODY='{"delay_seconds": 1.0}' + +echo "=== Phase A: 64 workers, c=10 ===" +hey -n 200 -c 10 -t 30 -m POST -H "$AUTH" -H "Content-Type: application/json" -d "$BODY" "$URL" + +echo "=== Phase A: 64 workers, c=32 ===" +hey -n 200 -c 32 -t 30 -m POST -H "$AUTH" -H "Content-Type: application/json" -d "$BODY" "$URL" + +echo "=== Phase A: 64 workers, c=64 ===" +hey -n 200 -c 64 -t 30 -m POST -H "$AUTH" -H "Content-Type: application/json" -d "$BODY" "$URL" + +echo "=== Phase A: 64 workers, c=100 ===" +hey -n 200 -c 100 -t 30 -m POST -H "$AUTH" -H "Content-Type: application/json" -d "$BODY" "$URL" + +echo "=== Phase A: 64 workers, c=128 ===" +hey -n 200 -c 128 -t 30 -m POST -H "$AUTH" -H "Content-Type: application/json" -d "$BODY" "$URL" + +# Check memory after heavy load +fly ssh console --app "$FLY_APP_NAME" -C "cat /proc/meminfo | head -5" +``` + +**Phase B: --sync-workers 128** (redeploy first) + +```bash +cd "$TEST_DIR/load-test-app" +sed -i.bak 's/--sync-workers [0-9]*/--sync-workers 128/' Dockerfile +fly deploy --app "$FLY_APP_NAME" --wait-timeout 300 + +# Wait for ready +for i in $(seq 1 60); do + STATUS=$(curl -s -o /dev/null -w "%{http_code}" "$FLY_APP_URL/health/ready") + if [ "$STATUS" = "200" ]; then echo "Ready after ${i}s"; break; fi + sleep 2 +done + +echo "=== Phase B: 128 workers, c=64 ===" +hey -n 200 -c 64 -t 30 -m POST -H "$AUTH" -H "Content-Type: application/json" -d "$BODY" "$URL" + +echo "=== Phase B: 128 workers, c=128 ===" +hey -n 200 -c 128 -t 30 -m POST -H "$AUTH" -H "Content-Type: application/json" -d "$BODY" "$URL" + +echo "=== Phase B: 128 workers, c=200 ===" +hey -n 300 -c 200 -t 30 -m POST -H "$AUTH" -H "Content-Type: application/json" -d "$BODY" "$URL" + +echo "=== Phase B: 128 workers, c=256 ===" +hey -n 300 -c 256 -t 30 -m POST -H "$AUTH" -H "Content-Type: application/json" -d "$BODY" "$URL" + +fly ssh console --app "$FLY_APP_NAME" -C "cat /proc/meminfo | head -5" +``` + +**Phase C: --sync-workers 256** (redeploy, only if Phase B didn't OOM) + +```bash +cd "$TEST_DIR/load-test-app" +sed -i.bak 's/--sync-workers [0-9]*/--sync-workers 256/' Dockerfile +fly deploy --app "$FLY_APP_NAME" --wait-timeout 300 + +for i in $(seq 1 60); do + STATUS=$(curl -s -o /dev/null -w "%{http_code}" "$FLY_APP_URL/health/ready") + if [ "$STATUS" = "200" ]; then echo "Ready after ${i}s"; break; fi + sleep 2 +done + +echo "=== Phase C: 256 workers, c=128 ===" +hey -n 200 -c 128 -t 30 -m POST -H "$AUTH" -H "Content-Type: application/json" -d "$BODY" "$URL" + +echo "=== Phase C: 256 workers, c=256 ===" +hey -n 300 -c 256 -t 30 -m POST -H "$AUTH" -H "Content-Type: application/json" -d "$BODY" "$URL" + +echo "=== Phase C: 256 workers, c=300 ===" +hey -n 300 -c 300 -t 30 -m POST -H "$AUTH" -H "Content-Type: application/json" -d "$BODY" "$URL" + +fly ssh console --app "$FLY_APP_NAME" -C "cat /proc/meminfo | head -5" +``` + +### Phase 9: Multi-Machine Autoscaling Test + +Use the best `--sync-workers` from Phases A-C. Remove `ha = false` and scale to 3 machines: + +```bash +cd "$TEST_DIR/load-test-app" + +# Update fly.toml: remove ha = false +sed -i.bak '/ha = false/d' fly.toml + +# Set concurrency limits based on findings (adjust these!) 
+# soft_limit = ~80% of sync_workers, hard_limit = sync_workers +# Example for 128 workers: +sed -i.bak 's/soft_limit = [0-9]*/soft_limit = 100/' fly.toml +sed -i.bak 's/hard_limit = [0-9]*/hard_limit = 128/' fly.toml + +fly deploy --app "$FLY_APP_NAME" --wait-timeout 300 + +# Scale to 3 machines +fly scale count 3 --app "$FLY_APP_NAME" + +# Wait for all machines to be ready +sleep 30 +fly machines list --app "$FLY_APP_NAME" + +for i in $(seq 1 60); do + STATUS=$(curl -s -o /dev/null -w "%{http_code}" "$FLY_APP_URL/health/ready") + if [ "$STATUS" = "200" ]; then echo "Ready after ${i}s"; break; fi + sleep 2 +done +``` + +Now blast at concurrency levels that should trigger multi-machine distribution: + +```bash +# Should spread across machines (3 x 128 = 384 slots) +echo "=== Autoscale: c=100 (fits in 1 machine) ===" +hey -n 300 -c 100 -t 30 -m POST -H "$AUTH" -H "Content-Type: application/json" -d "$BODY" "$URL" + +echo "=== Autoscale: c=200 (needs 2 machines) ===" +hey -n 400 -c 200 -t 30 -m POST -H "$AUTH" -H "Content-Type: application/json" -d "$BODY" "$URL" + +echo "=== Autoscale: c=300 (needs 3 machines) ===" +hey -n 600 -c 300 -t 30 -m POST -H "$AUTH" -H "Content-Type: application/json" -d "$BODY" "$URL" + +# Check machine status (which ones are started/stopped) +fly machines list --app "$FLY_APP_NAME" +``` + +**Test auto-stop/auto-start:** + +```bash +# Wait for idle machines to stop (~5 min) +echo "Waiting 5 minutes for auto-stop..." +sleep 300 +fly machines list --app "$FLY_APP_NAME" + +# Hit the endpoint -- should auto-start a machine +echo "=== Cold start test ===" +time curl -s -X POST -H "$AUTH" -H "Content-Type: application/json" -d "$BODY" "$URL" + +# Check: how long did the cold start take? +fly machines list --app "$FLY_APP_NAME" +``` + +### Phase 10: Guaranteed Cleanup + +**ALWAYS run cleanup, even if tests fail:** + +```bash +fly apps destroy "$FLY_APP_NAME" --yes + +git -C "$DSPY_CLI_DIR" push origin --delete "$TEMP_BRANCH" +git -C "$DSPY_CLI_DIR" checkout main +git -C "$DSPY_CLI_DIR" branch -D "$TEMP_BRANCH" + +rm -rf "$TEST_DIR" +``` + +## Interpreting Results + +### hey Output Key Metrics + +``` +Summary: + Total: X.XXX secs ← wall clock time + Requests/sec: XX.XX ← throughput (target: min(concurrency, sync_workers)) + +Latency distribution: + 50% in X.XXX secs ← should be ~1s (the sleep duration) when not queuing + 95% in X.XXX secs ← spikes here = queuing + 99% in X.XXX secs ← worst case + +Status code distribution: + [200] XXX responses ← success + [503] XXX responses ← server overloaded (hit hard_limit or OOM) +``` + +### What Good Looks Like + +| Concurrency | Expected RPS (128 workers) | Expected p50 | Sign of trouble | +|-------------|---------------------------|-------------|-----------------| +| c <= workers | ~c rps | ~1.0s | - | +| c = workers | ~workers rps | ~1.0s | Perfect saturation | +| c = 1.5x workers | ~workers rps | ~1.5s | Queuing (expected) | +| c = 2x workers | ~workers rps | ~2.0s | Deep queue | +| Any | < expected | > 3s | OOM, CPU thrash, or errors | + +### Memory Check + +```bash +fly ssh console --app "$FLY_APP_NAME" -C "cat /proc/meminfo | head -5" +``` + +If `MemAvailable` drops below ~100MB under load, you've found the memory wall. Reduce `--sync-workers` or increase VM memory. + +## Production Sizing Guide + +*Fill in after running tests. 
Template:* + +| Target Concurrent | VM | `--sync-workers` | `soft_limit` | `hard_limit` | Machines | +|-------------------|-----|-----------------|-------------|-------------|----------| +| 50 | shared-cpu-2x 1gb | ? | ? | ? | 1 | +| 100 | shared-cpu-2x 1gb | ? | ? | ? | 1 | +| 200 | shared-cpu-2x 1gb | ? | ? | ? | 2 | +| 500 | shared-cpu-2x 1gb | ? | ? | ? | 4-5 | + +**Rules:** +- `hard_limit = sync_workers` (the thread pool ceiling; no more concurrent work is possible) +- `soft_limit = ~80% of sync_workers` (gives fly ~seconds to wake another machine) +- Machines = `ceil(target_concurrent / hard_limit)` + +## Cleanup Verification + +```bash +fly apps list | grep "dspy-load" || echo "No orphaned apps" +git branch -r | grep "load-test/" || echo "No orphaned branches" +``` + +## Troubleshooting + +### SleepModule not discovered +Check that the module file is in `src//modules/` and the class inherits from `dspy.Module`. Verify with: +```bash +fly ssh console --app "$FLY_APP_NAME" -C "sh -c 'find /src -name \"*.py\" | head -20'" +``` + +### "No module named dspy" during build +The `uv sync --no-dev` in Dockerfile should install dspy via the dspy-cli dependency. Check `pyproject.toml` has the git URL correctly. + +### OOM kills during load test +Reduce `--sync-workers`, or increase VM memory. Check: +```bash +fly logs --app "$FLY_APP_NAME" --no-tail | grep -i "oom\|kill\|memory" +``` + +### hey: "socket: too many open files" +On macOS, increase ulimit before running hey: +```bash +ulimit -n 10240 +``` + +### Autoscaling doesn't trigger +Verify concurrency limits in fly.toml match what's deployed: +```bash +fly config show --app "$FLY_APP_NAME" | grep -A5 concurrency +``` +Fly only wakes stopped machines when `soft_limit` is exceeded. If all machines are already running, no new ones start (fly doesn't create machines, only starts/stops existing ones). + +### Machines don't auto-stop +`auto_stop_machines = 'stop'` only stops machines with zero connections. If hey keeps connections alive, wait for them to close. Default idle timeout is ~5 minutes. From e62f269150a8ca9140e395595003bd47f1a2364e Mon Sep 17 00:00:00 2001 From: isaacbmiller Date: Sun, 1 Mar 2026 17:00:45 -0500 Subject: [PATCH 3/4] refactor: move skill changes to separate PR Revert fly-e2e-test SKILL.md to main version and remove fly-load-test skill. These will be submitted in a dedicated skills PR. Co-authored-by: factory-droid[bot] <138933559+factory-droid[bot]@users.noreply.github.com> --- .claude/skills/fly-e2e-test/SKILL.md | 226 ++++++------- .claude/skills/fly-load-test/SKILL.md | 459 -------------------------- 2 files changed, 104 insertions(+), 581 deletions(-) delete mode 100644 .claude/skills/fly-load-test/SKILL.md diff --git a/.claude/skills/fly-e2e-test/SKILL.md b/.claude/skills/fly-e2e-test/SKILL.md index 5b2d4d9..f7eade0 100644 --- a/.claude/skills/fly-e2e-test/SKILL.md +++ b/.claude/skills/fly-e2e-test/SKILL.md @@ -9,12 +9,11 @@ allowed-tools: Deploy a fresh dspy-cli project to Fly.io using your local code changes, run full integration tests (health, auth, LLM execution), and **guarantee cleanup** regardless of success or failure. -## CRITICAL RULES +## ⚠️ CRITICAL RULES 1. **NEVER commit directly to main** - Always create a side branch first, even for small changes 2. **ALWAYS clean up** - Destroy Fly apps and delete temp branches, even if tests fail 3. **Use temp branches** - Name them `e2e-test/{timestamp}-{random}` for easy identification -4. 
**Run cleanup in a trap** - Use bash trap or always-run-cleanup pattern ## Prerequisites @@ -25,76 +24,80 @@ Deploy a fresh dspy-cli project to Fly.io using your local code changes, run ful ## Quick Start -All commands run directly in the shell (no tmux required). Use environment variables to pass state between steps. +Run each phase in a tmux session to enable output capture and cleanup tracking. ### Phase 1: Setup Environment ```bash -export DSPY_CLI_DIR="/Users/isaac/projects/dspy-cli" -export TIMESTAMP=$(date +%s) -export RANDOM_SUFFIX=$(head -c 4 /dev/urandom | xxd -p) -export FLY_APP_NAME="dspy-e2e-${RANDOM_SUFFIX}" -export TEMP_BRANCH="e2e-test/${TIMESTAMP}-${RANDOM_SUFFIX}" -export DSPY_API_KEY_VALUE="test-e2e-$(head -c 8 /dev/urandom | xxd -p)" +# Create tmux session +tmux new-session -d -s e2e-fly -c /Users/isaac/projects/dspy-cli + +# Set variables +tmux send-keys -t e2e-fly 'export DSPY_CLI_DIR="/Users/isaac/projects/dspy-cli"' C-m +tmux send-keys -t e2e-fly 'export TIMESTAMP=$(date +%s)' C-m +tmux send-keys -t e2e-fly 'export RANDOM_SUFFIX=$(head -c 4 /dev/urandom | xxd -p)' C-m +tmux send-keys -t e2e-fly 'export FLY_APP_NAME="dspy-e2e-${RANDOM_SUFFIX}"' C-m +tmux send-keys -t e2e-fly 'export TEMP_BRANCH="e2e-test/${TIMESTAMP}-${RANDOM_SUFFIX}"' C-m # Source .env for OPENAI_API_KEY -set -a && source "$DSPY_CLI_DIR/.env" && set +a +tmux send-keys -t e2e-fly 'set -a && source .env && set +a' C-m # Verify setup -echo "App: $FLY_APP_NAME Branch: $TEMP_BRANCH" +tmux send-keys -t e2e-fly 'echo "App: $FLY_APP_NAME Branch: $TEMP_BRANCH"' C-m ``` ### Phase 2: Pre-flight Checks ```bash -fly version && fly auth whoami -git -C "$DSPY_CLI_DIR" status --porcelain +# Verify fly CLI +tmux send-keys -t e2e-fly 'fly version && fly auth whoami' C-m + +# Check for uncommitted changes (stash if needed) +tmux send-keys -t e2e-fly 'git status --porcelain' C-m # Clean up any orphaned e2e resources -fly apps list 2>/dev/null | grep "dspy-e2e" || echo "No orphaned apps" +tmux send-keys -t e2e-fly 'fly apps list 2>/dev/null | grep "dspy-e2e" || echo "No orphaned apps"' C-m ``` ### Phase 3: Create and Push Temp Branch ```bash -git -C "$DSPY_CLI_DIR" checkout -b "$TEMP_BRANCH" -git -C "$DSPY_CLI_DIR" push -u origin "$TEMP_BRANCH" +tmux send-keys -t e2e-fly 'git checkout -b "$TEMP_BRANCH"' C-m +tmux send-keys -t e2e-fly 'git push -u origin "$TEMP_BRANCH"' C-m ``` ### Phase 4: Create Test Project ```bash -export TEST_DIR=$(mktemp -d) && echo "TEST_DIR=$TEST_DIR" +# Create temp directory +tmux send-keys -t e2e-fly 'export TEST_DIR=$(mktemp -d) && echo "TEST_DIR=$TEST_DIR"' C-m -# Pipe "Y" to accept the API key confirmation prompt -echo "Y" | uv run --directory "$DSPY_CLI_DIR" dspy-cli new fly-e2e-test \ - --program-name qa_module \ - --signature "question:str -> answer:str" \ - --module-type Predict \ - --model openai/gpt-4o-mini +# Create project (will prompt for API key confirmation - send Y) +tmux send-keys -t e2e-fly 'uv run --directory "$DSPY_CLI_DIR" dspy-cli new fly-e2e-test --program-name qa_module --signature "question:str -> answer:str" --module-type Predict --model openai/gpt-4o-mini' C-m -# Move project to temp dir (dspy-cli new creates in current dir) -mv "$DSPY_CLI_DIR/fly-e2e-test" "$TEST_DIR/" -cd "$TEST_DIR/fly-e2e-test" +# When prompted "Proceed with this API key? 
[Y/n]:", send: +tmux send-keys -t e2e-fly 'Y' C-m + +# Move project to temp dir (dspy-cli creates in current dir) +tmux send-keys -t e2e-fly 'mv "$DSPY_CLI_DIR/fly-e2e-test" "$TEST_DIR/" && cd "$TEST_DIR/fly-e2e-test"' C-m ``` ### Phase 5: Modify for Git-Based dspy-cli ```bash -cd "$TEST_DIR/fly-e2e-test" - -# Update pyproject.toml to install dspy-cli from temp branch (use double quotes for variable expansion) -sed -i.bak "s|\"dspy-cli\"|\"dspy-cli @ git+https://github.com/cmpnd-ai/dspy-cli.git@$TEMP_BRANCH\"|" pyproject.toml +# Update pyproject.toml to install dspy-cli from temp branch +tmux send-keys -t e2e-fly 'sed -i.bak "s|\"dspy-cli\"|\"dspy-cli @ git+https://github.com/cmpnd-ai/dspy-cli.git@$TEMP_BRANCH\"|" pyproject.toml' C-m -# Update Dockerfile: add git (required for git-based deps) -# NOTE: Check the current Dockerfile.template for the latest CMD format and update accordingly -cat > Dockerfile << 'EOF' +# IMPORTANT: Update Dockerfile to include git (required for git-based deps) +# NOTE: This is an example dockerfile. There may be specific changes in a newer version of dspy-cli. Check the current Dockerfile and add the Git install line +tmux send-keys -t e2e-fly 'cat > Dockerfile << '"'"'EOF'"'"' FROM python:3.11-slim ENV PYTHONDONTWRITEBYTECODE=1 ENV PYTHONUNBUFFERED=1 ENV XDG_CACHE_HOME=/tmp/.cache +# Install git for fetching dspy-cli from git URL RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/* COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv @@ -105,17 +108,16 @@ RUN uv sync --no-dev EXPOSE 8000 CMD ["uv", "run", "dspy-cli", "serve", "--host", "0.0.0.0", "--port", "8000", "--auth", "--no-reload"] -EOF +EOF' C-m ``` ### Phase 6: Create fly.toml and Deploy ```bash -cd "$TEST_DIR/fly-e2e-test" - -cat > fly.toml << EOF -app = '$FLY_APP_NAME' -primary_region = 'ewr' +# Create fly.toml +tmux send-keys -t e2e-fly 'cat > fly.toml << EOF +app = '"'"'$FLY_APP_NAME'"'"' +primary_region = '"'"'ewr'"'"' [build] @@ -125,120 +127,97 @@ primary_region = 'ewr' auto_stop_machines = true auto_start_machines = true min_machines_running = 0 - processes = ['app'] - -[deploy] - ha = false - -[checks] - [checks.health] - port = 8000 - type = "http" - interval = "10s" - timeout = "5s" - grace_period = "30s" - method = "GET" - path = "/health/live" + processes = ['"'"'app'"'"'] [[vm]] - memory = '512mb' - cpu_kind = 'shared' + memory = '"'"'512mb'"'"' + cpu_kind = '"'"'shared'"'"' cpus = 1 -EOF +EOF' C-m + +# Create app +tmux send-keys -t e2e-fly 'fly apps create "$FLY_APP_NAME" --org personal' C-m -# Create app and set secrets -fly apps create "$FLY_APP_NAME" --org personal -fly secrets set OPENAI_API_KEY="$OPENAI_API_KEY" DSPY_API_KEY="$DSPY_API_KEY_VALUE" --app "$FLY_APP_NAME" +# Generate a random API key for testing +tmux send-keys -t e2e-fly 'export DSPY_API_KEY_VALUE="test-e2e-$(head -c 8 /dev/urandom | xxd -p)"' C-m + +# Set secrets using fly secrets (required env vars for your app) +# Add any additional env vars your project needs here +tmux send-keys -t e2e-fly 'fly secrets set OPENAI_API_KEY="$OPENAI_API_KEY" DSPY_API_KEY="$DSPY_API_KEY_VALUE" --app "$FLY_APP_NAME"' C-m # Deploy (takes ~2-3 minutes) -fly deploy --app "$FLY_APP_NAME" --wait-timeout 300 +tmux send-keys -t e2e-fly 'fly deploy --app "$FLY_APP_NAME" --wait-timeout 300' C-m ``` ### Phase 7: Run Integration Tests ```bash -export FLY_APP_URL="https://$FLY_APP_NAME.fly.dev" - -# Wait for app to be ready (poll /health/ready) -for i in $(seq 1 30); do - STATUS=$(curl -s -o /dev/null -w 
"%{http_code}" "$FLY_APP_URL/health/ready") - if [ "$STATUS" = "200" ]; then echo "App ready after ${i}s"; break; fi - sleep 1 -done - -# Test 1: Health endpoints (no auth required) -echo "=== Test 1: Liveness ===" && curl -s "$FLY_APP_URL/health/live" -echo "=== Test 2: Readiness ===" && curl -s "$FLY_APP_URL/health/ready" -echo "=== Test 3: Legacy health ===" && curl -s "$FLY_APP_URL/health" - -# Test 4: Auth redirect (unauthenticated) -echo "=== Test 4: Auth Redirect ===" && curl -s -o /dev/null -w "HTTP: %{http_code}\n" "$FLY_APP_URL/programs" - -# Test 5: Auth success (authenticated) -echo "=== Test 5: Auth Success ===" && curl -s -H "Authorization: Bearer $DSPY_API_KEY_VALUE" "$FLY_APP_URL/programs" - -# Test 6: LLM Module Execution -echo "=== Test 6: LLM Execution ===" && curl -s -X POST \ - -H "Authorization: Bearer $DSPY_API_KEY_VALUE" \ - -H "Content-Type: application/json" \ - -d '{"question": "What is 2+2? Reply with just the number."}' \ - "$FLY_APP_URL/QaModulePredict" -``` +tmux send-keys -t e2e-fly 'export FLY_APP_URL="https://$FLY_APP_NAME.fly.dev"' C-m -### Phase 8: SSH Inspection (optional) +# Test 1: Health Check +tmux send-keys -t e2e-fly 'echo "=== Test 1: Health Check ===" && curl -s "$FLY_APP_URL/health"' C-m +# Expected: {"status":"ok"} -With `ha = false`, there's only one machine so SSH always targets it. -Shell redirects like `2>/dev/null` don't work in `-C` commands -- wrap in `sh -c`: +# Test 2: Auth Redirect (unauthenticated) +tmux send-keys -t e2e-fly 'echo "=== Test 2: Auth Redirect ===" && curl -s -o /dev/null -w "HTTP: %{http_code}\n" "$FLY_APP_URL/programs"' C-m +# Expected: HTTP: 303 -```bash -# Inspect the machine filesystem -fly ssh console --app "$FLY_APP_NAME" -C "sh -c 'find /root -name \"*.log\" 2>/dev/null'" +# Test 3: Auth Success (authenticated) +tmux send-keys -t e2e-fly 'echo "=== Test 3: Auth Success ===" && curl -s -H "Authorization: Bearer $DSPY_API_KEY_VALUE" "$FLY_APP_URL/programs"' C-m +# Expected: {"programs":[{"name":"QaModulePredict",...}]} -# Check inference logs -fly ssh console --app "$FLY_APP_NAME" -C "cat /logs/QaModulePredict.log" +# Test 4: LLM Module Execution +tmux send-keys -t e2e-fly 'echo "=== Test 4: LLM Execution ===" && curl -s -X POST -H "Authorization: Bearer $DSPY_API_KEY_VALUE" -H "Content-Type: application/json" -d '"'"'{"question": "What is 2+2?"}'"'"' "$FLY_APP_URL/QaModulePredict"' C-m +# Expected: {"answer":"4"} (or similar) ``` -### Phase 9: Guaranteed Cleanup +### Phase 8: Guaranteed Cleanup **ALWAYS run cleanup, even if tests fail:** ```bash # Destroy Fly app -fly apps destroy "$FLY_APP_NAME" --yes +tmux send-keys -t e2e-fly 'fly apps destroy "$FLY_APP_NAME" --yes' C-m # Delete remote branch -git -C "$DSPY_CLI_DIR" push origin --delete "$TEMP_BRANCH" +tmux send-keys -t e2e-fly 'git -C "$DSPY_CLI_DIR" push origin --delete "$TEMP_BRANCH"' C-m # Return to main and delete local branch -git -C "$DSPY_CLI_DIR" checkout main -git -C "$DSPY_CLI_DIR" branch -D "$TEMP_BRANCH" +tmux send-keys -t e2e-fly 'git -C "$DSPY_CLI_DIR" checkout main' C-m +tmux send-keys -t e2e-fly 'git -C "$DSPY_CLI_DIR" branch -D "$TEMP_BRANCH"' C-m # Remove temp directory -rm -rf "$TEST_DIR" +tmux send-keys -t e2e-fly 'rm -rf "$TEST_DIR"' C-m + +# Kill tmux session +tmux kill-session -t e2e-fly ``` ## Verification Checklist | Test | Expected Result | |------|-----------------| -| `/health/live` (no auth) | `{"status":"alive"}` | -| `/health/ready` (no auth) | `{"status":"ready","programs":1}` | -| `/health` (no auth) | `{"status":"ok"}` | +| 
Health Check | `{"status":"ok"}` | | Auth Redirect (no auth) | HTTP 303 | | Auth Success (Bearer token) | JSON with `QaModulePredict` | | LLM Execution | JSON with `"answer"` field | ## Cleanup Verification +After running cleanup, verify: + ```bash -fly apps list | grep "dspy-e2e" || echo "No orphaned apps" -git branch -r | grep "e2e-test/" || echo "No orphaned branches" +# No orphaned Fly apps +fly apps list | grep "dspy-e2e" || echo "Clean" + +# No orphaned branches +git branch -r | grep "e2e-test/" || echo "Clean" ``` ## Troubleshooting ### Deploy fails with "Git executable not found" -The Dockerfile must include git installation: +The Dockerfile must include git installation. Ensure the Dockerfile has: ```dockerfile RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/* ``` @@ -251,19 +230,13 @@ sed -i.bak 's|...|...|' pyproject.toml # Won't expand $TEMP_BRANCH ``` ### Project created in wrong directory -`dspy-cli new` creates projects relative to the current working directory. Move the project after creation: +`dspy-cli new` creates projects relative to the current working directory, not where it's run from. Move the project after creation: ```bash mv "$DSPY_CLI_DIR/fly-e2e-test" "$TEST_DIR/" ``` -### Shell redirects fail in fly ssh -C -Wrap the remote command in `sh -c`: -```bash -fly ssh console --app "$FLY_APP_NAME" -C "sh -c 'find / -name \"*dspy*\" 2>/dev/null'" -``` - ### Cleanup fails -Run each step individually: +If any cleanup step fails, run them individually: ```bash fly apps destroy "dspy-e2e-XXXX" --yes git push origin --delete "e2e-test/XXXX" @@ -272,18 +245,27 @@ git branch -D "e2e-test/XXXX" ``` ### App crashes due to missing environment variables +Use `fly secrets` to set any required env vars. Check the app logs to see which vars are missing: ```bash +# View logs to find missing env vars fly logs --app "$FLY_APP_NAME" --no-tail -fly secrets set VAR_NAME="value" --app "$FLY_APP_NAME" + +# Set additional secrets as needed +fly secrets set VAR_NAME="value" ANOTHER_VAR="value" --app "$FLY_APP_NAME" + +# List current secrets fly secrets list --app "$FLY_APP_NAME" ``` -Common env vars: +Common env vars that might be needed: - `OPENAI_API_KEY` - Required for OpenAI models - `DSPY_API_KEY` - Required when `--auth` is enabled +- Project-specific vars (check your gateway's `setup()` method) + +## Multi-Layer Cleanup Protection -### Per-machine cache fragmentation (multi-machine deployments) -The `ha = false` setting in fly.toml keeps E2E tests on a single machine, -avoiding this issue. For production deployments with multiple machines, the -LM response cache (`.dspy_cache`) is local to each VM, so requests hitting -different machines may miss the cache. +1. **Unique naming**: `dspy-e2e-{random}` prevents conflicts +2. **Pre-test orphan cleanup**: Removes stale resources before starting +3. **tmux session**: Enables output capture and manual recovery +4. **Explicit cleanup phase**: Always runs after tests +5. **Verification commands**: Confirm cleanup succeeded diff --git a/.claude/skills/fly-load-test/SKILL.md b/.claude/skills/fly-load-test/SKILL.md deleted file mode 100644 index 934c1f8..0000000 --- a/.claude/skills/fly-load-test/SKILL.md +++ /dev/null @@ -1,459 +0,0 @@ ---- -name: fly-load-test -description: Load test dspy-cli on Fly.io with synthetic delay module (zero LLM cost). Finds per-machine concurrency ceiling, tests autoscaling, produces sizing guide. 
(project) -allowed-tools: - - Bash ---- - -# Fly.io Load Test Skill - -Deploy dspy-cli with a **SleepModule** (zero LLM cost) to Fly.io, use `hey` to find the per-machine concurrency ceiling, test multi-machine autoscaling, and produce a production sizing guide. - -## CRITICAL RULES - -1. **NEVER commit directly to main** - Always create a side branch -2. **ALWAYS clean up** - Destroy Fly apps and delete temp branches, even if tests fail -3. **No real LLM calls** - The SleepModule simulates latency with `time.sleep()` -4. **Record all results** - Print `hey` output and memory stats for every phase - -## Prerequisites - -1. **fly CLI**: Installed and authenticated (`fly auth whoami`) -2. **hey**: Load testing tool (`brew install hey`) -3. **Git**: Clean working directory (stash uncommitted changes first) -4. **Git push access**: Ability to push to origin - -## Quick Start - -### Phase 1: Setup Environment - -```bash -export DSPY_CLI_DIR="/Users/isaac/projects/dspy-cli" -export TIMESTAMP=$(date +%s) -export RANDOM_SUFFIX=$(head -c 4 /dev/urandom | xxd -p) -export FLY_APP_NAME="dspy-load-${RANDOM_SUFFIX}" -export TEMP_BRANCH="load-test/${TIMESTAMP}-${RANDOM_SUFFIX}" -export DSPY_API_KEY_VALUE="load-test-$(head -c 8 /dev/urandom | xxd -p)" - -echo "App: $FLY_APP_NAME Branch: $TEMP_BRANCH" -``` - -### Phase 2: Pre-flight Checks - -```bash -fly version && fly auth whoami -which hey || echo "INSTALL hey: brew install hey" -git -C "$DSPY_CLI_DIR" status --porcelain - -# Clean up any orphaned load test resources -fly apps list 2>/dev/null | grep "dspy-load" || echo "No orphaned apps" -``` - -### Phase 3: Create and Push Temp Branch - -```bash -git -C "$DSPY_CLI_DIR" checkout -b "$TEMP_BRANCH" -git -C "$DSPY_CLI_DIR" push -u origin "$TEMP_BRANCH" -``` - -### Phase 4: Create Test Project with SleepModule - -```bash -export TEST_DIR=$(mktemp -d) && echo "TEST_DIR=$TEST_DIR" - -# Create project (pipe "Y" to accept the API key prompt) -echo "Y" | uv run --directory "$DSPY_CLI_DIR" dspy-cli new load-test-app \ - --program-name sleep_module \ - --signature "delay_seconds:float -> result:str" \ - --module-type Predict \ - --model openai/gpt-4o-mini - -mv "$DSPY_CLI_DIR/load-test-app" "$TEST_DIR/" -cd "$TEST_DIR/load-test-app" -``` - -Now replace the generated module with SleepModule (which never calls an LM): - -```bash -cd "$TEST_DIR/load-test-app" - -# Find the generated module file and replace it -MODULE_FILE=$(find src/*/modules/ -name "*.py" ! -name "__init__.py" | head -1) -echo "Replacing module: $MODULE_FILE" - -cat > "$MODULE_FILE" << 'PYEOF' -import time -import dspy - - -class SleepModule(dspy.Module): - """Synthetic delay module for load testing. Never calls an LLM.""" - - def forward(self, delay_seconds: float = 1.0) -> str: - time.sleep(delay_seconds) - return f"slept {delay_seconds}s" -PYEOF -``` - -### Phase 5: Modify for Git-Based dspy-cli - -```bash -cd "$TEST_DIR/load-test-app" - -# Install dspy-cli from temp branch -sed -i.bak "s|\"dspy-cli\"|\"dspy-cli @ git+https://github.com/cmpnd-ai/dspy-cli.git@$TEMP_BRANCH\"|" pyproject.toml - -# Custom Dockerfile with git support -cat > Dockerfile << 'EOF' -FROM python:3.11-slim - -ENV PYTHONDONTWRITEBYTECODE=1 -ENV PYTHONUNBUFFERED=1 -ENV XDG_CACHE_HOME=/tmp/.cache - -RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/* - -COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv - -COPY . . 
-RUN uv sync --no-dev - -EXPOSE 8000 - -CMD ["uv", "run", "dspy-cli", "serve", "--host", "0.0.0.0", "--port", "8000", "--auth", "--no-reload", "--sync-workers", "64"] -EOF -``` - -**IMPORTANT**: The `--sync-workers` value in the CMD changes per phase: -- Phase A: `--sync-workers 64` -- Phase B: `--sync-workers 128` -- Phase C: `--sync-workers 256` - -To change it between phases, edit the Dockerfile CMD and redeploy: -```bash -sed -i.bak 's/--sync-workers [0-9]*/--sync-workers 128/' Dockerfile -fly deploy --app "$FLY_APP_NAME" --wait-timeout 300 -``` - -### Phase 6: Create fly.toml and Deploy - -```bash -cd "$TEST_DIR/load-test-app" - -cat > fly.toml << EOF -app = '$FLY_APP_NAME' -primary_region = 'ewr' - -[build] - -[deploy] - ha = false - -[http_service] - internal_port = 8000 - force_https = true - auto_stop_machines = 'stop' - auto_start_machines = true - min_machines_running = 1 - processes = ['app'] - - [http_service.concurrency] - type = 'requests' - soft_limit = 100 - hard_limit = 128 - -[checks] - [checks.health] - port = 8000 - type = "http" - interval = "10s" - timeout = "5s" - grace_period = "30s" - method = "GET" - path = "/health/live" - -[[vm]] - memory = '1gb' - cpu_kind = 'shared' - cpus = 2 -EOF - -fly apps create "$FLY_APP_NAME" --org personal - -# Dummy OpenAI key -- SleepModule never calls an LM, but dspy.LM() init needs it -fly secrets set OPENAI_API_KEY="sk-dummy-not-used" DSPY_API_KEY="$DSPY_API_KEY_VALUE" --app "$FLY_APP_NAME" - -fly deploy --app "$FLY_APP_NAME" --wait-timeout 300 -``` - -### Phase 7: Wait for Ready - -```bash -export FLY_APP_URL="https://$FLY_APP_NAME.fly.dev" - -for i in $(seq 1 60); do - STATUS=$(curl -s -o /dev/null -w "%{http_code}" "$FLY_APP_URL/health/ready") - if [ "$STATUS" = "200" ]; then echo "App ready after ${i}s"; break; fi - echo "Waiting... ($STATUS)" - sleep 2 -done - -# Confirm SleepModule is discovered -curl -s -H "Authorization: Bearer $DSPY_API_KEY_VALUE" "$FLY_APP_URL/programs" -``` - -### Phase 8: Single-Machine Load Tests - -Run `hey` sweeps at increasing concurrency. Each run sends 200 requests with a 1s sleep delay. Theoretical max throughput = `min(concurrency, sync_workers)` rps. 
- -**Phase A: --sync-workers 64** - -```bash -export URL="$FLY_APP_URL/SleepModule" -export AUTH="Authorization: Bearer $DSPY_API_KEY_VALUE" -export BODY='{"delay_seconds": 1.0}' - -echo "=== Phase A: 64 workers, c=10 ===" -hey -n 200 -c 10 -t 30 -m POST -H "$AUTH" -H "Content-Type: application/json" -d "$BODY" "$URL" - -echo "=== Phase A: 64 workers, c=32 ===" -hey -n 200 -c 32 -t 30 -m POST -H "$AUTH" -H "Content-Type: application/json" -d "$BODY" "$URL" - -echo "=== Phase A: 64 workers, c=64 ===" -hey -n 200 -c 64 -t 30 -m POST -H "$AUTH" -H "Content-Type: application/json" -d "$BODY" "$URL" - -echo "=== Phase A: 64 workers, c=100 ===" -hey -n 200 -c 100 -t 30 -m POST -H "$AUTH" -H "Content-Type: application/json" -d "$BODY" "$URL" - -echo "=== Phase A: 64 workers, c=128 ===" -hey -n 200 -c 128 -t 30 -m POST -H "$AUTH" -H "Content-Type: application/json" -d "$BODY" "$URL" - -# Check memory after heavy load -fly ssh console --app "$FLY_APP_NAME" -C "cat /proc/meminfo | head -5" -``` - -**Phase B: --sync-workers 128** (redeploy first) - -```bash -cd "$TEST_DIR/load-test-app" -sed -i.bak 's/--sync-workers [0-9]*/--sync-workers 128/' Dockerfile -fly deploy --app "$FLY_APP_NAME" --wait-timeout 300 - -# Wait for ready -for i in $(seq 1 60); do - STATUS=$(curl -s -o /dev/null -w "%{http_code}" "$FLY_APP_URL/health/ready") - if [ "$STATUS" = "200" ]; then echo "Ready after ${i}s"; break; fi - sleep 2 -done - -echo "=== Phase B: 128 workers, c=64 ===" -hey -n 200 -c 64 -t 30 -m POST -H "$AUTH" -H "Content-Type: application/json" -d "$BODY" "$URL" - -echo "=== Phase B: 128 workers, c=128 ===" -hey -n 200 -c 128 -t 30 -m POST -H "$AUTH" -H "Content-Type: application/json" -d "$BODY" "$URL" - -echo "=== Phase B: 128 workers, c=200 ===" -hey -n 300 -c 200 -t 30 -m POST -H "$AUTH" -H "Content-Type: application/json" -d "$BODY" "$URL" - -echo "=== Phase B: 128 workers, c=256 ===" -hey -n 300 -c 256 -t 30 -m POST -H "$AUTH" -H "Content-Type: application/json" -d "$BODY" "$URL" - -fly ssh console --app "$FLY_APP_NAME" -C "cat /proc/meminfo | head -5" -``` - -**Phase C: --sync-workers 256** (redeploy, only if Phase B didn't OOM) - -```bash -cd "$TEST_DIR/load-test-app" -sed -i.bak 's/--sync-workers [0-9]*/--sync-workers 256/' Dockerfile -fly deploy --app "$FLY_APP_NAME" --wait-timeout 300 - -for i in $(seq 1 60); do - STATUS=$(curl -s -o /dev/null -w "%{http_code}" "$FLY_APP_URL/health/ready") - if [ "$STATUS" = "200" ]; then echo "Ready after ${i}s"; break; fi - sleep 2 -done - -echo "=== Phase C: 256 workers, c=128 ===" -hey -n 200 -c 128 -t 30 -m POST -H "$AUTH" -H "Content-Type: application/json" -d "$BODY" "$URL" - -echo "=== Phase C: 256 workers, c=256 ===" -hey -n 300 -c 256 -t 30 -m POST -H "$AUTH" -H "Content-Type: application/json" -d "$BODY" "$URL" - -echo "=== Phase C: 256 workers, c=300 ===" -hey -n 300 -c 300 -t 30 -m POST -H "$AUTH" -H "Content-Type: application/json" -d "$BODY" "$URL" - -fly ssh console --app "$FLY_APP_NAME" -C "cat /proc/meminfo | head -5" -``` - -### Phase 9: Multi-Machine Autoscaling Test - -Use the best `--sync-workers` from Phases A-C. Remove `ha = false` and scale to 3 machines: - -```bash -cd "$TEST_DIR/load-test-app" - -# Update fly.toml: remove ha = false -sed -i.bak '/ha = false/d' fly.toml - -# Set concurrency limits based on findings (adjust these!) 
-# soft_limit = ~80% of sync_workers, hard_limit = sync_workers -# Example for 128 workers: -sed -i.bak 's/soft_limit = [0-9]*/soft_limit = 100/' fly.toml -sed -i.bak 's/hard_limit = [0-9]*/hard_limit = 128/' fly.toml - -fly deploy --app "$FLY_APP_NAME" --wait-timeout 300 - -# Scale to 3 machines -fly scale count 3 --app "$FLY_APP_NAME" - -# Wait for all machines to be ready -sleep 30 -fly machines list --app "$FLY_APP_NAME" - -for i in $(seq 1 60); do - STATUS=$(curl -s -o /dev/null -w "%{http_code}" "$FLY_APP_URL/health/ready") - if [ "$STATUS" = "200" ]; then echo "Ready after ${i}s"; break; fi - sleep 2 -done -``` - -Now blast at concurrency levels that should trigger multi-machine distribution: - -```bash -# Should spread across machines (3 x 128 = 384 slots) -echo "=== Autoscale: c=100 (fits in 1 machine) ===" -hey -n 300 -c 100 -t 30 -m POST -H "$AUTH" -H "Content-Type: application/json" -d "$BODY" "$URL" - -echo "=== Autoscale: c=200 (needs 2 machines) ===" -hey -n 400 -c 200 -t 30 -m POST -H "$AUTH" -H "Content-Type: application/json" -d "$BODY" "$URL" - -echo "=== Autoscale: c=300 (needs 3 machines) ===" -hey -n 600 -c 300 -t 30 -m POST -H "$AUTH" -H "Content-Type: application/json" -d "$BODY" "$URL" - -# Check machine status (which ones are started/stopped) -fly machines list --app "$FLY_APP_NAME" -``` - -**Test auto-stop/auto-start:** - -```bash -# Wait for idle machines to stop (~5 min) -echo "Waiting 5 minutes for auto-stop..." -sleep 300 -fly machines list --app "$FLY_APP_NAME" - -# Hit the endpoint -- should auto-start a machine -echo "=== Cold start test ===" -time curl -s -X POST -H "$AUTH" -H "Content-Type: application/json" -d "$BODY" "$URL" - -# Check: how long did the cold start take? -fly machines list --app "$FLY_APP_NAME" -``` - -### Phase 10: Guaranteed Cleanup - -**ALWAYS run cleanup, even if tests fail:** - -```bash -fly apps destroy "$FLY_APP_NAME" --yes - -git -C "$DSPY_CLI_DIR" push origin --delete "$TEMP_BRANCH" -git -C "$DSPY_CLI_DIR" checkout main -git -C "$DSPY_CLI_DIR" branch -D "$TEMP_BRANCH" - -rm -rf "$TEST_DIR" -``` - -## Interpreting Results - -### hey Output Key Metrics - -``` -Summary: - Total: X.XXX secs ← wall clock time - Requests/sec: XX.XX ← throughput (target: min(concurrency, sync_workers)) - -Latency distribution: - 50% in X.XXX secs ← should be ~1s (the sleep duration) when not queuing - 95% in X.XXX secs ← spikes here = queuing - 99% in X.XXX secs ← worst case - -Status code distribution: - [200] XXX responses ← success - [503] XXX responses ← server overloaded (hit hard_limit or OOM) -``` - -### What Good Looks Like - -| Concurrency | Expected RPS (128 workers) | Expected p50 | Sign of trouble | -|-------------|---------------------------|-------------|-----------------| -| c <= workers | ~c rps | ~1.0s | - | -| c = workers | ~workers rps | ~1.0s | Perfect saturation | -| c = 1.5x workers | ~workers rps | ~1.5s | Queuing (expected) | -| c = 2x workers | ~workers rps | ~2.0s | Deep queue | -| Any | < expected | > 3s | OOM, CPU thrash, or errors | - -### Memory Check - -```bash -fly ssh console --app "$FLY_APP_NAME" -C "cat /proc/meminfo | head -5" -``` - -If `MemAvailable` drops below ~100MB under load, you've found the memory wall. Reduce `--sync-workers` or increase VM memory. - -## Production Sizing Guide - -*Fill in after running tests. 
Template:* - -| Target Concurrent | VM | `--sync-workers` | `soft_limit` | `hard_limit` | Machines | -|-------------------|-----|-----------------|-------------|-------------|----------| -| 50 | shared-cpu-2x 1gb | ? | ? | ? | 1 | -| 100 | shared-cpu-2x 1gb | ? | ? | ? | 1 | -| 200 | shared-cpu-2x 1gb | ? | ? | ? | 2 | -| 500 | shared-cpu-2x 1gb | ? | ? | ? | 4-5 | - -**Rules:** -- `hard_limit = sync_workers` (the thread pool ceiling; no more concurrent work is possible) -- `soft_limit = ~80% of sync_workers` (gives fly ~seconds to wake another machine) -- Machines = `ceil(target_concurrent / hard_limit)` - -## Cleanup Verification - -```bash -fly apps list | grep "dspy-load" || echo "No orphaned apps" -git branch -r | grep "load-test/" || echo "No orphaned branches" -``` - -## Troubleshooting - -### SleepModule not discovered -Check that the module file is in `src//modules/` and the class inherits from `dspy.Module`. Verify with: -```bash -fly ssh console --app "$FLY_APP_NAME" -C "sh -c 'find /src -name \"*.py\" | head -20'" -``` - -### "No module named dspy" during build -The `uv sync --no-dev` in Dockerfile should install dspy via the dspy-cli dependency. Check `pyproject.toml` has the git URL correctly. - -### OOM kills during load test -Reduce `--sync-workers`, or increase VM memory. Check: -```bash -fly logs --app "$FLY_APP_NAME" --no-tail | grep -i "oom\|kill\|memory" -``` - -### hey: "socket: too many open files" -On macOS, increase ulimit before running hey: -```bash -ulimit -n 10240 -``` - -### Autoscaling doesn't trigger -Verify concurrency limits in fly.toml match what's deployed: -```bash -fly config show --app "$FLY_APP_NAME" | grep -A5 concurrency -``` -Fly only wakes stopped machines when `soft_limit` is exceeded. If all machines are already running, no new ones start (fly doesn't create machines, only starts/stops existing ones). - -### Machines don't auto-stop -`auto_stop_machines = 'stop'` only stops machines with zero connections. If hey keeps connections alive, wait for them to close. Default idle timeout is ~5 minutes. From d491e4452c95632c3836190d4f40176f17d765d8 Mon Sep 17 00:00:00 2001 From: isaacbmiller Date: Mon, 2 Mar 2026 10:31:58 -0500 Subject: [PATCH 4/4] fix: pass save_openapi/openapi_format through reload factory path create_app_instance() (used by uvicorn in reload mode) was missing the save_openapi and openapi_format parameters, so openapi.json was never written when --reload was enabled (the default). Forward both values through environment variables like the other reload-mode settings. 
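For reference, the round trip looks roughly like this (constant names match the
diff below; the values shown are just the defaults, and the surrounding CLI
plumbing is elided):

    import os

    ENV_SAVE_OPENAPI = "DSPY_CLI_SAVE_OPENAPI"
    ENV_OPENAPI_FORMAT = "DSPY_CLI_OPENAPI_FORMAT"

    # Parent process (the `dspy-cli serve` command), before uvicorn spawns
    # the reload worker: forward the flags as environment variables.
    os.environ[ENV_SAVE_OPENAPI] = "true"
    os.environ[ENV_OPENAPI_FORMAT] = "json"

    # create_app_instance(), re-imported inside the reload worker: read them
    # back, falling back to the same defaults as the non-reload path.
    save_openapi = os.environ.get(ENV_SAVE_OPENAPI, "true").lower() == "true"
    openapi_format = os.environ.get(ENV_OPENAPI_FORMAT, "json")
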
Co-authored-by: factory-droid[bot] <138933559+factory-droid[bot]@users.noreply.github.com> --- src/dspy_cli/server/runner.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/dspy_cli/server/runner.py b/src/dspy_cli/server/runner.py index 867ed7e..2b5ba17 100644 --- a/src/dspy_cli/server/runner.py +++ b/src/dspy_cli/server/runner.py @@ -21,6 +21,8 @@ ENV_LOGS_DIR = "DSPY_CLI_LOGS_DIR" ENV_AUTH_ENABLED = "DSPY_CLI_AUTH_ENABLED" ENV_SYNC_WORKERS = "DSPY_CLI_SYNC_WORKERS" +ENV_SAVE_OPENAPI = "DSPY_CLI_SAVE_OPENAPI" +ENV_OPENAPI_FORMAT = "DSPY_CLI_OPENAPI_FORMAT" def _maybe_mount_mcp(app, enable: bool, *, path: str = MCP_DEFAULT_PATH, notify=None) -> bool: @@ -89,6 +91,8 @@ def create_app_instance(): enable_auth = os.environ.get(ENV_AUTH_ENABLED, "false").lower() == "true" sync_workers_str = os.environ.get(ENV_SYNC_WORKERS) sync_workers = int(sync_workers_str) if sync_workers_str else None + save_openapi = os.environ.get(ENV_SAVE_OPENAPI, "true").lower() == "true" + openapi_format = os.environ.get(ENV_OPENAPI_FORMAT, "json") # Validate project structure if not validate_project_structure(): @@ -127,6 +131,15 @@ def create_app_instance(): # Register post-init callbacks (run by the lifespan after module discovery) def _on_ready(): _maybe_mount_mcp(app, enable_mcp) + if save_openapi: + try: + spec = generate_openapi_spec(app) + spec_filename = f"openapi.{openapi_format}" + spec_path = Path.cwd() / spec_filename + save_openapi_spec(spec, spec_path, format=openapi_format) + logger.info("OpenAPI spec saved: %s", spec_filename) + except Exception as e: + logger.warning("Could not save OpenAPI spec: %s", e) app.state._on_ready_callbacks = [_on_ready] @@ -250,6 +263,8 @@ def _on_ready(): os.environ[ENV_LOGS_DIR] = str(logs_path) os.environ[ENV_ENABLE_MCP] = str(mcp).lower() os.environ[ENV_AUTH_ENABLED] = str(auth).lower() + os.environ[ENV_SAVE_OPENAPI] = str(save_openapi).lower() + os.environ[ENV_OPENAPI_FORMAT] = openapi_format if sync_workers is not None: os.environ[ENV_SYNC_WORKERS] = str(sync_workers)