hookdeck · leggetter · Apr 24, 2026 · Apr 22, 2026 · Apr 23, 2026 · Apr 23, 2026
diff --git a/.env.example b/.env.example
@@ -0,0 +1,12 @@
+# Agent scenario tester + Claude Code — copy to `.env` in this repo root and fill in values.
+# See tools/agent-scenario-tester/README.md
+
+# Needed only for the optional LLM-as-judge (`--judge` / RUN_LLM_JUDGE); often also used by the Claude CLI for API access.
+ANTHROPIC_API_KEY=
+
+# Optional: set to 1 / true / yes to enable the LLM judge without passing `--judge` each time.
+# RUN_LLM_JUDGE=0
+
+# Optional: override the judge model with JUDGE_MODEL or EVAL_SCORE_MODEL (default: claude-sonnet-4-20250514).
+# JUDGE_MODEL=
+# EVAL_SCORE_MODEL=
diff --git a/.github/workflows/test-examples.yml b/.github/workflows/test-examples.yml
@@ -70,10 +70,29 @@ jobs:
         working-directory: skills/event-gateway/examples/fastapi
         run: pytest test_webhook.py -v
 
+  test-outpost-saas:  # CI job: runs the Outpost example tests via scripts/test-examples.sh
+    name: Outpost SaaS examples (nextjs-saas + fastapi-saas)
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Setup Node.js  # Node for the nextjs-saas example (npm test)
+        uses: actions/setup-node@v4
+        with:
+          node-version: "20"
+
+      - name: Setup Python  # Python for the fastapi-saas backend (pytest)
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"  # quoted so YAML does not parse the version as a float
+
+      - name: Run test-examples.sh outpost
+        run: ./scripts/test-examples.sh outpost  # same command contributors are asked to run before opening a PR
+
   summary:
     name: Test Summary
     runs-on: ubuntu-latest
-    needs: [test-express, test-nextjs, test-fastapi]
+    needs: [test-express, test-nextjs, test-fastapi, test-outpost-saas]
     if: always()
     steps:
       - name: Check test results
@@ -83,7 +102,8 @@ jobs:
 
           if [ "${{ needs.test-express.result }}" == "failure" ] || \
              [ "${{ needs.test-nextjs.result }}" == "failure" ] || \
-             [ "${{ needs.test-fastapi.result }}" == "failure" ]; then
+             [ "${{ needs.test-fastapi.result }}" == "failure" ] || \
+             [ "${{ needs.test-outpost-saas.result }}" == "failure" ]; then
             echo "**Result:** Some tests failed" >> $GITHUB_STEP_SUMMARY
             exit 1
           fi

diff --git a/.gitignore b/.gitignore
@@ -20,6 +20,7 @@ build/
 *.egg-info/
 .next/
 out/
+.turbo/
 
 # IDE and editor files
 .idea/
@@ -45,6 +46,9 @@ htmlcov/
 # Package lock files (not tracked for example projects)
 package-lock.json
 yarn.lock
+pnpm-lock.yaml
+uv.lock
+bun.lock
 
 # Agent scenario tester
 test-results/

diff --git a/AGENTS.md b/AGENTS.md
@@ -34,7 +34,7 @@ All skills MUST conform to the [Agent Skills specification](https://agentskills.
 
 - **Hookdeck** = the company brand, carried by the repo name (`hookdeck/agent-skills`)
 - **Event Gateway** ("Hookdeck Event Gateway") = the inbound product. Receives, routes, processes, and delivers webhooks/events. [Docs](https://hookdeck.com/docs/).
-- **Outpost** ("Hookdeck Outpost") = the outbound product. Open-source infrastructure for sending webhooks and events to user-preferred destinations (HTTP, SQS, RabbitMQ, Pub/Sub, EventBridge, Kafka). [Docs](https://outpost.hookdeck.com/docs/). [GitHub](https://github.com/hookdeck/outpost).
+- **Outpost** ("Hookdeck Outpost") = the outbound product. Open-source infrastructure for sending webhooks and events to user-preferred destinations (HTTP, SQS, RabbitMQ, Pub/Sub, and more—see docs for the current matrix). [Docs](https://hookdeck.com/docs/outpost). [GitHub](https://github.com/hookdeck/outpost).
 
 Skills are prefixed by product name: `event-gateway` or `outpost`. The company brand is not repeated in skill names because it's carried by the repo.
 
@@ -93,6 +93,15 @@ hookdeck/agent-skills/
         fastapi/                       # Hookdeck signature verification handler
     outpost/                           # Outpost skill (separate product)
       SKILL.md
+      references/
+        outpost-quickstarts.md         # Canonical quickstart + llms.txt links
+        outpost-scope.md               # Scope ladder, topic reconciliation, BFF pointers
+        outpost-verify.md              # Trimmed “before you stop” checklist
+        nextjs-saas-integration-map.md
+        fastapi-saas-integration-map.md
+      examples/
+        nextjs-saas/
+        fastapi-saas/
   AGENTS.md                            # This file
   CLAUDE.md
   CONTRIBUTING.md
@@ -265,6 +274,7 @@ Rules provide concise, always-on guidance. Keep them minimal; link to skills for
 
 - **Cursor first:** Lead with Cursor plugin install (`/add-plugin hookdeck`). What the plugin does.
 - **Retain generic skills:** Keep `npx skills add hookdeck/agent-skills` and full Agent Skills usage for Claude, ChatGPT, and other agents. Do not remove the generic install path.
+- **Human-facing doc links in README.md:** Do not link to Hookdeck documentation URLs that end in `.md` (for example `https://hookdeck.com/docs/cli/mcp.md`). Those `.md` endpoints exist mainly for agents and tools fetching markdown. In the repo root README, use pages without `.md` (for example [MCP & Skills](https://hookdeck.com/docs/mcp), [CLI](https://hookdeck.com/docs/cli)) and describe where to find a subsection (e.g. MCP in the CLI doc’s Event Gateway operations table) when there is no dedicated HTML path.
 
 ### Plugin description and keywords
 

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -31,8 +31,16 @@ cd agent-skills
 cd skills/event-gateway/examples/express && npm install && npm test
 cd skills/event-gateway/examples/nextjs && npm install && npm test
 cd skills/event-gateway/examples/fastapi && pip install -r requirements.txt && pytest test_webhook.py -v
+
+# Outpost SaaS reference (Next.js + Outpost SDK; large tree — npm install may take a minute)
+cd skills/outpost/examples/nextjs-saas && npm install && npm test
+
+# Outpost FastAPI SaaS (backend only — pip install in backend/)
+cd skills/outpost/examples/fastapi-saas/backend && python3 -m venv venv && source venv/bin/activate && pip install pytest httpx 'fastapi>=0.114' && pytest test_outpost_wire.py -q
 ```
 
+When you bump dependencies in **Outpost** SaaS examples (`skills/outpost/examples/nextjs-saas` or `skills/outpost/examples/fastapi-saas`), update the **Example stack snapshot** table in `skills/outpost/SKILL.md` to match the new pins, and run `./scripts/test-examples.sh outpost` before opening a PR.
+
 **Agent scenario tests** (end-to-end: install skills, run Claude, score report): see [TESTING.md](TESTING.md#agent-scenario-testing-two-layers). From repo root: `./scripts/test-agent-scenario.sh run receive-webhooks express` or `./scripts/test-agent-scenario.sh list`.
 
 ## Repository Structure

diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
 # Hookdeck Agent Skills
 
-Equip your AI coding agent with webhook and event-driven architecture expertise. Receive, queue, route, and deliver webhooks with [Hookdeck Event Gateway](https://hookdeck.com), test webhooks locally with the Hookdeck CLI, and build outbound webhook delivery with [Outpost](https://outpost.hookdeck.com).
+Equip your AI coding agent with webhook and event-driven architecture expertise. Receive, queue, route, and deliver webhooks with [Hookdeck Event Gateway](https://hookdeck.com), test webhooks locally with the Hookdeck CLI, and build outbound webhook delivery with [Hookdeck Outpost](https://hookdeck.com/docs/outpost).
 
 <!-- Cursor plugin pending marketplace acceptance
 ## Install (Cursor)
@@ -52,23 +52,24 @@ The `event-gateway` skill includes a staged integration workflow:
 3. **Listen** -- Start `hookdeck listen`, trigger test events
 4. **Iterate** -- Debug failures, fix code, replay events
 
-## Roadmap
+## Hookdeck CLI MCP (reference)
 
-### Hookdeck MCP Server (coming soon)
+The [Hookdeck CLI](https://hookdeck.com/docs/cli) includes a **beta** MCP server (`hookdeck gateway mcp`) that exposes Event Gateway data as MCP tools over stdio. In practice it is **operational support**: query, inspect, and diagnose connections, requests, events, attempts, issues, and metrics from your agent or IDE—without replacing the full CLI or API for creating resources, retries, or local tunnel workflows.
 
-The Hookdeck CLI will host an MCP server (`hookdeck mcp`), giving your coding agent direct access to the full CLI toolchain. Beyond resource management, this enables a real-time development loop:
+For setup, client configuration, available tools, and what MCP can and cannot do, see the official docs:
 
-- **Live event tools:** List, inspect, and retry events and deliveries without leaving your editor
-- **Webhook tunnel in the agent loop:** The MCP runs `hookdeck listen` in-process. When webhook events arrive (or deliveries fail), the MCP pushes updates to the agent. Your agent writes the webhook handler, receives a live event, sees the result, and iterates on the code—all in a single session with no context switching.
-- **Both skill repos as MCP resources:** Agent-skills (Hookdeck product knowledge) and webhook-skills (provider-specific webhook knowledge for Stripe, Shopify, GitHub, etc.) served as `hookdeck://` and `webhooks://` resources.
+- [MCP & Skills](https://hookdeck.com/docs/mcp) — overview and when to pair MCP with agent skills
+- [Hookdeck CLI](https://hookdeck.com/docs/cli) — MCP is linked from the Event Gateway operations table (command usage, auth, client configuration)
 
-This turns the plugin from a knowledge layer into an interactive development environment for webhook integrations.
+The skills in this repository remain the primary way to guide an agent through **product workflows** (setup, handlers, `hookdeck listen`, Outpost, API usage). Use CLI MCP when you want **read-heavy inspection and analysis** against your live Hookdeck project.
 
 ## Documentation
 
 - [Event Gateway docs](https://hookdeck.com/docs/)
-- [Outpost docs](https://outpost.hookdeck.com/docs/)
-- [API reference](https://hookdeck.com/docs/api)
+- [MCP & Skills](https://hookdeck.com/docs/mcp); [Hookdeck CLI](https://hookdeck.com/docs/cli) (MCP under Event Gateway operations)
+- [Outpost docs](https://hookdeck.com/docs/outpost)
+- [Outpost API reference](https://hookdeck.com/docs/outpost/api)
+- [Hookdeck REST API](https://hookdeck.com/docs/api)
 
 ## Testing
 

diff --git a/TESTING.md b/TESTING.md
@@ -1,6 +1,6 @@
 # Testing Hookdeck Agent Skills
 
-This document covers automated testing for code examples in the `event-gateway` skill. The example tests follow the same patterns as [hookdeck/webhook-skills](https://github.com/hookdeck/webhook-skills/blob/main/TESTING.md).
+This document covers automated testing for code examples in the **`event-gateway`** and **`outpost`** skills. The example tests follow the same patterns as [hookdeck/webhook-skills](https://github.com/hookdeck/webhook-skills/blob/main/TESTING.md).
 
 Hookdeck tests its agent skills at three levels: **code example tests** (unit/integration tests for the example applications shipped with each skill), **static quality checks** (linting and scoring skill files), and **agent scenario testing** (giving real agents tasks and scoring whether they succeed).
 
@@ -56,8 +56,16 @@ Use the test runner script to discover and run all examples:
 
 # Specific skill
 ./scripts/test-examples.sh event-gateway
+./scripts/test-examples.sh outpost
 ```
 
+The **outpost** skill includes:
+
+- [examples/nextjs-saas/](skills/outpost/examples/nextjs-saas/) — `npm test` (Vitest). For a full `next build` without Stripe, use `STRIPE_MOCK=1` (see that example’s README).
+- [examples/fastapi-saas/](skills/outpost/examples/fastapi-saas/) — in `backend/`: `pytest test_outpost_wire.py` with minimal deps (no full app install; no live Outpost).
+
+Both are full apps; installs can take longer than the small event-gateway demos.
+
 ### CI Pipeline
 
 Tests run automatically on PR and push to main via GitHub Actions. See `.github/workflows/test-examples.yml`.
@@ -124,7 +132,7 @@ Baseline: run `npm run skill:review` periodically and record scores; use them to
 
 ### Layer 2: Agent Scenarios (Evals)
 
-This is where testing becomes evaluation. The scenario tester installs skills, runs Claude Code with a scenario prompt, and writes a scored report. It answers: can an agent actually follow the staged workflow to accomplish a real task?
+This is where testing becomes evaluation. The scenario tester installs **one** skill per scenario (see `skillUnderTest` in `scenarios.yaml`; default is `event-gateway`), runs Claude Code with a scenario prompt, and writes a scored report. It answers: can an agent actually follow the workflow to accomplish a real task?
 
 **Prerequisites:** [Claude Code CLI](https://claude.ai/download) installed and logged in (`ANTHROPIC_API_KEY` or `claude login`). The tool runs a preflight that sends a short prompt to the CLI; if you see "Claude CLI did not respond within 15s", the CLI may be blocked (e.g. in a restricted sandbox). Run with a full environment or ensure the CLI can reach the API.
 
@@ -133,27 +141,34 @@ This is where testing becomes evaluation. The scenario tester installs skills, r
 ```bash
 # From repo root (recommended)
 ./scripts/test-agent-scenario.sh run receive-webhooks express
+./scripts/test-agent-scenario.sh run receive-webhooks express --judge   # optional LLM rubric (same ANTHROPIC_API_KEY)
+./scripts/test-agent-scenario.sh run outpost-managed-quickstart express
 ./scripts/test-agent-scenario.sh run receive-provider-webhooks nextjs --provider stripe
 ./scripts/test-agent-scenario.sh list
-./scripts/test-agent-scenario.sh assess <resultDir>   # re-run assessor on existing result, update report.md
+./scripts/test-agent-scenario.sh assess <resultDir>   # re-run heuristic assessor, update report.md
+./scripts/test-agent-scenario.sh assess <resultDir> --judge   # also run LLM judge
 
 # Or via npx from repo root
 npx tsx tools/agent-scenario-tester/src/index.ts run receive-webhooks express
 ```
 
-**Options:** `--dry-run`, `--verbose`, `--timeout <seconds>` (default 300).
+**Options:** `--dry-run`, `--verbose`, `--timeout <seconds>` (default 300), `--judge` (optional LLM-as-judge after heuristics).
+
+**LLM judge:** Pass `--judge` or set `RUN_LLM_JUDGE=1`. Requires `ANTHROPIC_API_KEY`. Optional `JUDGE_MODEL` / `EVAL_SCORE_MODEL` override the default scoring model. The rubric comes from per-scenario `successCriteriaMarkdown` in YAML when set; otherwise it is derived from heuristic `evaluation.checks`. Writes `llm-score.json` and appends a **## LLM judge** section to `report.md` (pattern aligned with [Outpost docs eval](https://github.com/hookdeck/outpost/blob/main/docs/agent-evaluation/src/llm-judge.ts)). The judge only reads files — `run.log`, generated text files in the result directory (handler, `package.json`, etc.), then `README.md` — and does not re-run the agent’s shell commands or HTTP requests.
 
-**Scenarios:** Defined in `scenarios.yaml`. Three scenarios test increasingly interesting agent behaviors:
+**Scenarios:** Defined in `scenarios.yaml`. Examples:
 
 - **receive-webhooks** — Setup Hookdeck, build handler with signature verification, run `hookdeck listen`, document inspect/retry workflow. Tests stages 01–04 (iterate is documentation-only: agent documents how to list request → event → attempt and retry; no live traffic required).
-- **receive-provider-webhooks** — Same plus a provider (e.g. Stripe). Use `--provider stripe`. Only the event-gateway skill is pre-installed; the agent is expected to discover and use the provider skill from webhook-skills (e.g. `npx skills add hookdeck/webhook-skills --skill stripe-webhooks -y -g`) and use the provider SDK in the handler. Tests composition and the provider-webhooks checklist.
+- **receive-provider-webhooks** — Same plus a provider (e.g. Stripe). Use `--provider stripe`. Only `event-gateway` is copied in for this scenario (`skillUnderTest`); the agent is expected to discover and use the provider skill from webhook-skills (e.g. `npx skills add hookdeck/webhook-skills --skill stripe-webhooks -y -g`) and use the provider SDK in the handler. Tests composition and the provider-webhooks checklist.
 - **investigate-delivery-health** — Documentation-only: assume the user has had webhooks for a week and wants to understand delivery performance (success vs failure, backlog, latency). The prompt does **not** mention "metrics" or "hookdeck gateway metrics"; the assessor checks whether the agent used metrics CLI commands. Use to verify that agents discover and use metrics from the skill when the task implies it.
+- **outpost-managed-quickstart** — `skillUnderTest: outpost`. Smallest end-to-end path on managed Outpost (tenant, webhook destination, publish, verify). Prompt assumes `OUTPOST_API_KEY` is set.
 
 | Scenario | Tests | Key question |
 |----------|-------|-------------|
 | `receive-webhooks` | Core skill usage | Can the agent follow the skill to set up webhook receiving? |
 | `receive-provider-webhooks` | Composition | Does the agent discover and install a Stripe-specific skill on its own? |
 | `investigate-delivery-health` | Discovery | Does the agent find diagnostic tools (CLI metrics, MCP) when they aren't mentioned in the prompt? |
+| `outpost-managed-quickstart` | Outpost skill | Does the agent use Outpost docs and API/curl for tenant, destination, and publish? |
 
 ### Scenario run checklist
 
@@ -166,8 +181,9 @@ Run these and evaluate results; iterate on skills or prompts as needed.
 | 3 | receive-webhooks | FastAPI | `./scripts/test-agent-scenario.sh run receive-webhooks fastapi` | Done |
 | 4 | receive-provider-webhooks | Express | `./scripts/test-agent-scenario.sh run receive-provider-webhooks express --provider stripe` | Done |
 | 5 | investigate-delivery-health | Express | `./scripts/test-agent-scenario.sh run investigate-delivery-health express` | — |
+| 6 | outpost-managed-quickstart | Express | `./scripts/test-agent-scenario.sh run outpost-managed-quickstart express` | — |
 
-**Output:** `test-results/<scenario>-<framework>-<provider?>-<timestamp>/` containing `report.md` (checklist + automated score), `run.log` (full Claude output), and generated project files. To re-run only the assessor (e.g. after fixing the tool): `./scripts/test-agent-scenario.sh assess <resultDir>`.
+**Output:** `test-results/<scenario>-<framework>-<provider?>-<timestamp>/` containing `report.md` (heuristic checklist + score), optional `llm-score.json` and **LLM judge** section when `--judge` / `RUN_LLM_JUDGE=1`, `run.log` (full Claude output), and generated project files. To re-run only the assessor: `./scripts/test-agent-scenario.sh assess <resultDir>` (add `--judge` to include the LLM pass).
 
 ### Iterative Improvement Workflow