From d5540f627180ce13ef7c6204bb1de6bf00002387 Mon Sep 17 00:00:00 2001 From: Zbigniew Sobiecki Date: Wed, 6 May 2026 14:05:15 +0000 Subject: [PATCH 1/3] docs(spec): 018 alerting agent + worker boot-failure visibility + plans MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Spec captures two related gaps surfaced by the first sentry-bound agent run (2026-05-06): alerting agent's missing prompt template and the silent worker boot-failure path that masked it. Two plans downstream: alerting-prompt (feature) and boot-failure-visibility (hardening, follow-on to spec 017). Linear DAG, plan 1 → plan 2. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../1-alerting-prompt.md | 184 +++++++++++++ .../2-boot-failure-visibility.md | 256 ++++++++++++++++++ .../_coverage.md | 39 +++ ...erting-agent-and-worker-boot-visibility.md | 205 ++++++++++++++ 4 files changed, 684 insertions(+) create mode 100644 docs/plans/018-alerting-agent-and-worker-boot-visibility/1-alerting-prompt.md create mode 100644 docs/plans/018-alerting-agent-and-worker-boot-visibility/2-boot-failure-visibility.md create mode 100644 docs/plans/018-alerting-agent-and-worker-boot-visibility/_coverage.md create mode 100644 docs/specs/018-alerting-agent-and-worker-boot-visibility.md diff --git a/docs/plans/018-alerting-agent-and-worker-boot-visibility/1-alerting-prompt.md b/docs/plans/018-alerting-agent-and-worker-boot-visibility/1-alerting-prompt.md new file mode 100644 index 00000000..8ca6b904 --- /dev/null +++ b/docs/plans/018-alerting-agent-and-worker-boot-visibility/1-alerting-prompt.md @@ -0,0 +1,184 @@ +--- +id: 018 +slug: alerting-agent-and-worker-boot-visibility +plan: 1 +plan_slug: alerting-prompt +level: plan +parent_spec: docs/specs/018-alerting-agent-and-worker-boot-visibility.md +depends_on: [] +status: pending +--- + +# 018/1: Alerting agent prompt template + +> Part 1 of 2 in the 018-alerting-agent-and-worker-boot-visibility plan. See [parent spec](../../specs/018-alerting-agent-and-worker-boot-visibility.md). + +## Summary + +Author the system prompt template that the alerting agent's worker tries to load at boot. Every other piece of the alerting agent — YAML definition (`src/agents/definitions/alerting.yaml`), capabilities (`src/agents/capabilities/registry.ts:194-200`), trigger handlers (`src/triggers/sentry/alerting-{issue,metric}.ts`), context pipeline step (`src/agents/definitions/contextSteps.ts:564-606`), Sentry integration (`src/sentry/alerting-integration.ts`) — is already in place. The missing file is `src/agents/prompts/templates/alerting.eta`. After this plan ships, a Sentry alert webhook produces a fully-functional alerting agent run end-to-end. + +This plan delivers the alerting agent's persona, philosophy, three-phase process (parse pre-loaded event → confirm root cause via source reads → report via PM tool), explicit INVESTIGATE-AND-FILE-ONLY guardrail, soft investigation-depth guidance, and predictable output structure. The prompt is engine-agnostic, reuses `partials/environment` for the shared preamble, and avoids few-shot examples that would leak into the integration-test domain. + +What this plan does NOT deliver: any change to the run-record creation order, any new exit codes, any CI guard against future agent types missing their templates, any synthesized identifier for sentry runs. Those are plan 2's scope. + +**Components delivered:** +- `src/agents/prompts/templates/alerting.eta` — the system prompt itself +- `tests/unit/agents/prompts.test.ts` — extended with alerting-specific render tests +- `tests/integration/agents/alerting-end-to-end.test.ts` — fixture-driven end-to-end test +- `README.md` — alerting agent inventory entry +- `CHANGELOG.md` — entry for this plan + +**Deferred to later plans in this spec:** +- Worker boot-failure catch site, run-record reordering, new exit code, conformance test, synthesized sentry-run identifier — all in plan 2. + +--- + +## Spec ACs satisfied by this plan + +- Spec AC #1 (Sentry issue alert produces a visible alerting agent run that progresses through investigation phases and terminates with PM-tool output) — **full** +- Spec AC #2 (Sentry metric alert produces a comparable investigation adapted to lack of stacktrace) — **full** +- Spec AC #3 (alerting agent does not edit source, commit, push, or open PRs) — **full** +- Spec AC #4 (prompt renders for all three engines without errors) — **full** +- Spec AC #5 (predictable output structure for work item title / description / comment text) — **full** +- Spec AC #6 (alerting agent reasons about whatever pre-loaded context it receives, no per-event-type branching in the prompt) — **full** + +--- + +## Depends On + +- _(none — first plan in this spec)_ + +--- + +## Detailed Task List (TDD) + +### 1. Prompt-render unit tests + +**Tests first** (`tests/unit/agents/prompts.test.ts`, extending the existing file): + +- `getSystemPrompt('alerting') renders without throwing for an issue-alert trigger context` — unit — input: trigger context with `alertTitle`, `alertIssueUrl`, `alertIssueId`, `alertOrgId`, `triggerEvent: 'alerting:issue-alert'`, plus the standard `PromptContext` env shape; expected: returns a non-empty string. Expected red: `Error: ENOENT: no such file or directory, open '.../templates/alerting.eta'` (because the template doesn't exist yet). +- `getSystemPrompt('alerting') renders without throwing for a metric-alert trigger context` — unit — input: same shape but `triggerEvent: 'alerting:metric-alert'` and no `alertIssueId`; expected: returns a non-empty string. Expected red: same ENOENT. +- `rendered alerting prompt contains the three phase markers` — unit — assert presence of three section headers (Phase 1, Phase 2, Phase 3 — exact strings TBD by template author but stable). Expected red: `AssertionError: expected '...' to include 'Phase 1'` once the template exists but lacks the markers — the test must read the markers from a single source of truth (the spec or a shared constant), not from a snapshot of the template. +- `rendered alerting prompt contains the INVESTIGATE-AND-FILE-ONLY guardrail` — unit — assert presence of the guardrail clause referenced in spec strategic-decision #2. Expected red: `AssertionError: expected '...' to match /INVESTIGATE-AND-FILE-ONLY/i`. +- `rendered alerting prompt includes the shared environment preamble` — unit — assert that a known string from `partials/environment.eta` (e.g. `Available Runtimes`) appears in the rendered output. Expected red: `AssertionError: ... to include 'Available Runtimes'`. +- `rendered alerting prompt prefers comment-on-existing when both backlogListId and existing workItemId are provided` — unit — render twice with different `it` shapes; assert the (workItemId-set, backlogListId-set) render contains "comment" instructions and NOT "create a new backlog" instructions. Expected red: `AssertionError: expected '...' to include 'comment'`. +- `rendered alerting prompt directs creating a backlog work item when only backlogListId is provided` — unit — render with `workItemId: undefined, backlogListId: 'list-id'`; assert the render contains "create" instructions. Expected red: `AssertionError: expected '...' to include 'create'`. +- `rendered alerting prompt does not contain engine-specific tool-call syntax` — unit — assert the rendered output does NOT contain a banned-pattern list (e.g. `function_calls`, ``, `<|im_start|>`, `` anchored to engine internals — the precise banned list is finalized when authoring the template). Expected red: `AssertionError: expected '...' not to match /function_calls/`. + +**Implementation** (`src/agents/prompts/templates/alerting.eta`): + +The author writes a template (~80-150 lines) following the structural pattern of `review.eta` and `backlog-manager.eta`. Required content: + +- Persona block: identity as 🚨 Alert Investigator (mirrors the YAML's `identity.label`). +- `<%~ include("partials/environment") %>` for the shared preamble. +- Philosophy block: "investigation, not repair" — explicit prose paralleling `review.eta`'s "REVIEW ONLY" guardrail. Single paragraph, reinforced enough to survive the same model on a different day. +- Three-phase process block (Phase 1 / Phase 2 / Phase 3) with clear markers and the actions per phase. +- Soft depth guidance ("stop when you can name the failing function and the trigger condition") rather than a hard file-read cap. +- Decision rule: if `it.workItemId` is set, comment on it; else if `it.backlogListId` is set, create a backlog work item. (Mirrors the YAML's existing taskPrompt logic but elaborates it into the system prompt.) +- Output structure templates: a 1-line title shape and a 4-bullet description shape (root cause / affected file:line / failing function / link to alert), reproducible across runs. +- Completion criteria: an explicit "you are done when …" clause. + +The template MUST NOT include any few-shot example whose domain overlaps with cascade's own test fixtures or eval fixtures, per the cross-project rule "prompt examples must not leak the eval answers". If examples are included at all, they reference clearly-invented domains (e.g. a fake `WeatherFetcher` API throwing a `RateLimitError`) — never cascade's own classes. + +The template MUST NOT contain engine-specific tool-call syntax. Any agent capabilities are described in plain prose ("you can read files via the gadget surface", not "call function_calls(`{name: ReadFile, ...}`)"); the engine wrapping handles the actual tool-use shape. + +### 2. End-to-end integration test + +**Tests first** (`tests/integration/agents/alerting-end-to-end.test.ts` — new file): + +- `alerting agent dispatched against a fixture issue-alert payload completes successfully` — integration — setup: a fixture Sentry issue-alert payload, a fixture project config with `alerting:read` capability + Sentry credentials, a stubbed PM provider (Trello adapter via `createFakePMProvider`); expected: the run completes without throwing, a CreateWorkItem or CommentOnWorkItem call was made on the stubbed PM provider. Expected red: `Error: ENOENT ... alerting.eta` until the template is authored. +- `alerting agent does not invoke any source-edit gadget during a fixture run` — integration — same fixture; spy/intercept all gadget invocations; assert no calls to gadgets that write files (e.g. WriteFile, CommitChanges, OpenPullRequest are never invoked). Expected red: same ENOENT initially; once template renders, this catches any prompt that drifts into "fix the bug" mode. +- `alerting agent metric-alert variant produces a comparable investigation` — integration — same shape but with a metric-alert payload (no stacktrace); expected: run completes and the PM-tool call shape is the same kind. Expected red: same ENOENT. + +**Implementation** — no new product code beyond the template itself. The integration tests drive the existing dispatch path against fixtures. + +Fixture files needed: +- `tests/fixtures/sentry-issue-alert.json` — representative issue-alert payload (can borrow from `tests/fixtures/linear-issue-with-screenshot.json`'s pattern of saving a real captured payload with secrets scrubbed, per the spec-016 precedent) +- `tests/fixtures/sentry-metric-alert.json` — representative metric-alert payload + +### 3. Documentation + +**`README.md`**: locate the agent inventory section (or create one if absent — the README has been the established home for documented agents per spec 014's pattern); add an entry for the alerting agent describing what it does (investigates Sentry alerts, reports findings to the project's PM tool) and what it does NOT do (no source edits, no PRs). + +**`CHANGELOG.md`**: add an entry under the `## [Unreleased]` block (or whatever the active changelog convention is in this repo): `- Added: alerting agent investigates Sentry alerts and creates bug-investigation work items or comments on existing ones (#NNNN)`. + +--- + +## Test Plan + +### Unit tests +- [ ] `tests/unit/agents/prompts.test.ts`: 8 new tests covering render-success, phase markers, guardrail, environment preamble, comment-vs-create routing, engine-agnostic prose + +### Integration tests +- [ ] `tests/integration/agents/alerting-end-to-end.test.ts`: 3 tests covering issue-alert flow, metric-alert flow, and the "no source edits" guardrail behaviorally + +### Acceptance tests +- [ ] Per-plan AC checklist (below) verified against the integration test outputs and a manual prompt review + +--- + +## Manual Verification (for `[manual]`-tagged ACs only) + +n/a — all per-plan ACs auto-tested. + +--- + +## Acceptance Criteria (per-plan, testable) + +1. The alerting prompt template renders without errors against a representative issue-alert trigger context. +2. The alerting prompt template renders without errors against a representative metric-alert trigger context. +3. The rendered prompt contains the three phase markers (Phase 1 / Phase 2 / Phase 3) in order. +4. The rendered prompt contains the INVESTIGATE-AND-FILE-ONLY guardrail clause. +5. The rendered prompt includes the shared `partials/environment` preamble. +6. The rendered prompt directs commenting on the existing work item when both `workItemId` and `backlogListId` are provided in the context. +7. The rendered prompt directs creating a backlog work item when only `backlogListId` is provided. +8. The rendered prompt does not contain engine-specific tool-call syntax patterns. +9. An end-to-end integration test against a fixture issue-alert payload produces either a CreateWorkItem call or a CommentOnWorkItem call on the stubbed PM provider, and the run completes without throwing. +10. The end-to-end integration test asserts no source-edit gadget was invoked at any point in the fixture run. +11. All new/modified code has corresponding tests. +12. `npm run lint` passes. +13. `npm run typecheck` passes. +14. `npm test` passes. +15. `README.md` and `CHANGELOG.md` are updated. + +--- + +## Documentation Impact (this plan only) + +| File | Change | +|---|---| +| `README.md` | Add alerting agent entry to the agent inventory section, describing investigation-and-file role and the no-source-edits guarantee. | +| `CHANGELOG.md` | Entry: `Added: alerting agent investigates Sentry alerts and reports findings to the project's PM tool`. | + +--- + +## Out of Scope (this plan) + +- Worker boot-failure catch site and run-record reordering — plan 2. +- The new boot-fail exit code (exit 2) and the router-side crash-reason recognition of it — plan 2. +- The CI conformance test that asserts every registered agent type has a prompt template — plan 2. (Note: until plan 2 lands, the alerting agent's template existing is verified only by the unit/integration tests in this plan, not by a generic guard.) +- Synthesized stable identifier for sentry-driven runs (e.g. `sentry:issue:`) — plan 2. +- LLM-judged eval harness for investigation quality — out of scope for the spec entirely. +- Closing the loop back to Sentry by posting an investigation comment on the Sentry issue itself — out of scope for the spec entirely. +- Support for non-Sentry alerting providers — out of scope for the spec entirely. + +--- + +## Progress + + +- [ ] AC #1 +- [ ] AC #2 +- [ ] AC #3 +- [ ] AC #4 +- [ ] AC #5 +- [ ] AC #6 +- [ ] AC #7 +- [ ] AC #8 +- [ ] AC #9 +- [ ] AC #10 +- [ ] AC #11 +- [ ] AC #12 +- [ ] AC #13 +- [ ] AC #14 +- [ ] AC #15 diff --git a/docs/plans/018-alerting-agent-and-worker-boot-visibility/2-boot-failure-visibility.md b/docs/plans/018-alerting-agent-and-worker-boot-visibility/2-boot-failure-visibility.md new file mode 100644 index 00000000..cc3801d7 --- /dev/null +++ b/docs/plans/018-alerting-agent-and-worker-boot-visibility/2-boot-failure-visibility.md @@ -0,0 +1,256 @@ +--- +id: 018 +slug: alerting-agent-and-worker-boot-visibility +plan: 2 +plan_slug: boot-failure-visibility +level: plan +parent_spec: docs/specs/018-alerting-agent-and-worker-boot-visibility.md +depends_on: [1-alerting-prompt.md] +status: pending +--- + +# 018/2: Worker boot-failure visibility + +> Part 2 of 2 in the 018-alerting-agent-and-worker-boot-visibility plan. See [parent spec](../../specs/018-alerting-agent-and-worker-boot-visibility.md). + +## Summary + +Make worker boot-time failures (template load, model resolution, context pipeline assembly, definition lookup, identifier construction) visible in the dashboard runs surface, and prevent the same silent-fail pattern from recurring for any future agent type. The 2026-05-06 incident exposed that boot-time failures produce no run row, the worker exits 0, BullMQ marks the job done, and the dashboard runs UI is blind to the failure — only Sentry sees it. This plan closes that gap. + +This plan delivers: (a) reordering of run-record creation so the row exists BEFORE any boot-phase step that can fail; (b) a dedicated boot-fail catch site that updates the row with status=failed and a structured error message before the worker exits; (c) a distinct boot-fail exit code (exit 2) that BullMQ's failure compensation and the router's crash-reason interpreter recognise; (d) Sentry capture under a stable tag for boot failures; (e) synthesized stable identifiers for sentry-driven runs that lack a PM-side work item id; (f) a CI conformance test that fails when any registered agent type lacks its prompt template. + +What this plan does NOT deliver: the alerting agent's prompt template (plan 1), any new agent capabilities or gadgets, behavioural changes to existing successful or in-execution-crashed runs, backfill for historical silently-failed runs. + +**Components delivered:** +- `src/backends/adapter.ts` — reorder `tryCreateRun` to before `resolvePartialExecutionPlan`; add deferred-fill for plan-resolution fields; wrap boot-phase in a typed try/catch. +- `src/agents/shared/runTracking.ts` — replace silent swallow in `tryCreateRun` with explicit failure surfacing + a new `updateRunPlanResolution(runId, model, maxIterations)` helper for the deferred-fill path. +- `src/agents/shared/executionPipeline.ts` — distinguish `BootFailureError` from runtime errors in the catch handler; mark the run row failed and re-throw the typed error so the outer worker catch sees it. +- `src/agents/shared/bootFailureError.ts` (new) — `BootFailureError` class for the typed boundary. +- `src/worker-entry.ts` — outer catch distinguishes `BootFailureError` → exit 2; existing exit 1 path preserved for runtime failures. +- `src/router/active-workers.ts` — `formatCrashReason` learns the exit-2 branch ("Worker boot failed: "). +- `src/triggers/sentry/alerting-issue.ts` and `src/triggers/sentry/alerting-metric.ts` — synthesize `workItemId: 'sentry:issue:'` (or equivalent for metric alerts) when no PM-side work item is associated. +- `tests/unit/agents/prompts/template-conformance.test.ts` (new) — CI guard that asserts every YAML-registered agent type has a corresponding `.eta` template. +- Per-component unit and integration tests as detailed below. +- `CHANGELOG.md` — entry for this plan. + +**Deferred to later plans in this spec:** +- _(none — this is the final plan)_ + +--- + +## Spec ACs satisfied by this plan + +- Spec AC #7 (template ENOENT produces a visible failed run row queryable via `cascade runs list`) — **full** +- Spec AC #8 (model resolution / context pipeline failures produce visible failed run rows) — **full** +- Spec AC #9 (distinct boot-fail exit code, distinguishable from 0 and 1) — **full** +- Spec AC #10 (existing success-path runs and in-execution-failure runs unchanged) — **full** +- Spec AC #11 (Sentry capture under stable tag, dashboard run id cross-referenced) — **full** +- Spec AC #12 (sentry-driven runs receive synthesized identifier so they group in dashboard) — **full** +- Spec AC #13 (CI guard fails when an agent type lacks its prompt template) — **full** +- Spec AC #14 (24h rate of "worker exits 0 with no run row" drops to zero post-deploy) — **full** `[manual]` (verified post-deploy in cascade-router log volume — see Manual Verification below) + +--- + +## Depends On + +- Plan 1 (`1-alerting-prompt.md`) — provides `src/agents/prompts/templates/alerting.eta`. Without it, the conformance test added in this plan would fail at CI time for the alerting agent type, blocking merge. Plan 1 must land first. + +--- + +## Detailed Task List (TDD) + +### 1. `BootFailureError` typed boundary + +**Tests first** (`tests/unit/agents/shared/bootFailureError.test.ts` — new file): + +- `BootFailureError preserves original error message and adds structured fields` — unit — input: `new BootFailureError('template load failed', { phase: 'template-load', cause: new Error('ENOENT') })`; expected: `instanceof BootFailureError` is true, `phase` is `'template-load'`, `cause` is the original Error, `message` includes both. Expected red: `Error: BootFailureError is not a constructor` (class doesn't exist yet). +- `BootFailureError is distinguishable from generic Error in catch blocks` — unit — throw a `BootFailureError`, catch as `Error`, verify `instanceof BootFailureError` is true. Expected red: same constructor error. + +**Implementation** (`src/agents/shared/bootFailureError.ts`): + +``` +class BootFailureError extends Error { + phase: 'template-load' | 'model-resolution' | 'context-pipeline' | 'definition-lookup' | 'identifier-resolution'; + cause?: unknown; + // constructor sets fields and chains message +} +``` + +The implementation just needs to be a typed Error subclass. No magic. + +### 2. Run-record creation reorder + +**Tests first** (`tests/unit/backends/adapter-run-record-ordering.test.ts` — new file, or extend an existing `tests/unit/backends/adapter.test.ts` if it exists): + +- `tryCreateRun is called before resolvePartialExecutionPlan` — unit — mock both functions, drive `executeAgent` through a happy path, assert call order via vi.mock spy timestamps. Expected red: `AssertionError: expected tryCreateRun called before resolvePartialExecutionPlan, but tryCreateRun called after`. +- `tryCreateRun is called with null-or-placeholder model and maxIterations when called early` — unit — assert the early-call shape accepts `null`/`undefined` for those fields without throwing. Expected red: `TypeError: Cannot read property '...' of null` from the existing strict signature. +- `updateRunPlanResolution is called after resolvePartialExecutionPlan succeeds, with the resolved model and maxIterations` — unit — drive a happy path, assert `updateRunPlanResolution(runId, model, maxIterations)` was invoked with the resolved values. Expected red: `Error: updateRunPlanResolution is not a function`. + +**Implementation**: + +`src/agents/shared/runTracking.ts`: +- Loosen `tryCreateRun` signature so `model` and `maxIterations` may be `null`/`undefined` at insertion time (the underlying `agent_runs` columns are already nullable per the spec's "single run row per job invariant" constraint — verify in `src/db/schema.ts` and in the existing migrations; if not nullable, add a small migration in this plan's migration step below). +- Add new exported function `updateRunPlanResolution(runId: string, model: string, maxIterations: number): Promise` that issues an UPDATE on `agent_runs` for those two fields. +- Replace the existing silent swallow in `tryCreateRun` (`logger.warn` + return undefined) with a thrown `BootFailureError({ phase: 'identifier-resolution', cause: err })`. Document the new contract: callers MUST be inside the boot-phase try/catch. + +`src/backends/adapter.ts`: +- Move the `tryCreateRun` call from after `resolvePartialExecutionPlan` (lines ~83-95) to before it. +- After `resolvePartialExecutionPlan` returns, call `updateRunPlanResolution(runId, partialInput.model, partialInput.maxIterations)`. +- Wrap the entire boot-phase block (including `resolvePartialExecutionPlan`, gadget allowlist resolution, context pipeline construction, definition lookup) in a try/catch that: + - On catch: derives a `BootFailureError` (re-wrapping non-`BootFailureError` causes), updates the existing run row to `status='failed'` with a structured error message, captures to Sentry under tag `worker_boot_failure`, then re-throws the typed error. + +If the schema is NOT nullable for `model` / `max_iterations`, add a migration `src/db/migrations/NNNN_relax_agent_runs_for_deferred_fill.sql` that drops the NOT NULL constraint, with the appropriate journal entry. + +### 3. `executeAgentPipeline` catch handler distinguishes boot from runtime + +**Tests first** (extending `tests/unit/agents/shared/executionPipeline.test.ts` if it exists, else new): + +- `executeAgentPipeline catch handler propagates BootFailureError unchanged` — unit — drive a path where the inner execute throws `BootFailureError`, assert the handler re-throws (does not convert to `{success: false}`). Expected red: `AssertionError: expected promise to reject with BootFailureError, but resolved with {success: false}`. +- `executeAgentPipeline catch handler still converts non-BootFailureError into {success: false}` — unit — drive a path where the inner execute throws a generic `Error`, assert the existing behavior is preserved. Expected red: green from day one (regression sentinel for the existing path). + +**Implementation** (`src/agents/shared/executionPipeline.ts`, the catch at lines 232-258): +- Add an early-branch: `if (err instanceof BootFailureError) { logger.error('Boot phase failed', ...); throw err; }`. +- Existing converted-to-`{success: false}` branch stays for runtime errors only. + +### 4. Worker-entry exit code 2 + +**Tests first** (`tests/unit/worker-entry-boot-failure.test.ts` — new file): + +- `worker exits with code 2 when dispatchJob throws BootFailureError` — unit — mock `dispatchJob` to throw `BootFailureError`, drive the entry function, assert `process.exit` was called with `2`. Expected red: `AssertionError: expected process.exit called with 2, but called with 1`. +- `worker exits with code 1 when dispatchJob throws a non-BootFailureError` — unit — mock `dispatchJob` to throw a generic `Error`, drive the entry, assert `process.exit(1)`. Expected red: green from day one (regression sentinel). +- `worker exits with code 0 on successful dispatchJob` — unit — happy path. Expected red: green from day one (regression sentinel). + +**Implementation** (`src/worker-entry.ts`, the outer try/catch around lines 480-497): +- Replace the single catch with branched handling: `instanceof BootFailureError` → `logger.error('[Worker] Boot failed', ...); captureException(err, { tags: { source: 'worker_boot_failure' } }); process.exit(2)`. Other errors keep the existing `[Worker] Job failed` + exit 1 path. + +### 5. Router-side crash-reason interpreter + +**Tests first** (extending `tests/unit/router/container-manager-diagnostics.test.ts` per the precedent established by spec 015): + +- `formatCrashReason labels exit code 2 as "Worker boot failed"` — unit — input: `{ exitCode: 2, oomKilled: false, reason: null }`; expected: returned string contains `Boot failed`. Expected red: `AssertionError: expected '... exit code 2 ...' to include 'Boot failed'` (today exit code 2 falls into the generic branch). +- `formatCrashReason still labels exit code 1 as crash reason` — regression sentinel. Expected red: green from day one. +- `formatCrashReason for boot-fail also surfaces the run-record error message when the run id is known` — unit — input includes a run id with a stamped error in the test DB; assert the formatted string includes the structured error from the run row. Expected red: `AssertionError: ... not to be empty` if the integration isn't wired. + +**Implementation** (`src/router/active-workers.ts`): +- Extend `formatCrashReason` to recognise exit code 2 and return a "Worker boot failed: " formatted string. Look up the run row by jobId (the existing diagnostics path already correlates jobs to runs in the same area). + +### 6. Synthesized identifier for sentry-driven runs + +**Tests first** (extending `tests/unit/triggers/sentry-alerting.test.ts`): + +- `SentryIssueAlertTrigger.handle synthesizes workItemId when no PM workItem is associated` — unit — input: a fixture issue-alert payload with `alertIssueId: '117972276'`; expected: returned `agentInput.workItemId` equals `sentry:issue:117972276`. Expected red: `AssertionError: expected '...' to equal 'sentry:issue:117972276'` (today the field isn't set). +- `SentryIssueAlertTrigger.handle preserves an explicitly-provided workItemId from upstream PM linking when present` — unit — input: a payload where the trigger context already resolved a PM-side workItemId; expected: that value is preserved, NOT overwritten by the synthesized id. Expected red: depends on shape — likely `AssertionError: expected '' to equal ''` if the synthesized fallback is too eager. +- `SentryMetricAlertTrigger.handle synthesizes workItemId from a stable derivation of the metric-alert identity` — unit — same shape but for metric alerts. Expected red: same as above. +- `Two invocations against the same Sentry issue produce the same synthesized workItemId` — unit — call `handle` twice with the same `alertIssueId`, assert the workItemIds match. Expected red: depends — but the determinism is the spec contract (AC #12). + +**Implementation** (`src/triggers/sentry/alerting-issue.ts` and `alerting-metric.ts`): +- After computing `agentInput`, if no upstream-resolved workItemId exists, set `workItemId = 'sentry:issue:' + alertIssueId` for issue alerts. For metric alerts, derive from a stable field (e.g. `alertOrgId + ':metric:' + alertTitle` if no issue-id-equivalent exists — final shape decided when authoring; the spec contract is "deterministic from the alert payload"). +- Plumb the synthesized workItemId through to the run row via the existing dispatch path. The run-record creation in adapter.ts will pick it up from `input.workItemId` per the existing code. + +### 7. CI conformance test for agent-type → prompt-template + +**Tests first** (`tests/unit/agents/prompts/template-conformance.test.ts` — new file, modeled on `tests/unit/integrations/pm-conformance.test.ts`): + +- `every registered agent type has a corresponding prompt template` — unit — load every YAML definition from `src/agents/definitions/*.yaml`, derive each `agentType`, assert `fs.existsSync('src/agents/prompts/templates/' + agentType + '.eta')` for each. Expected red: BEFORE plan 1 lands, `AssertionError: missing template for agent type 'alerting'`. AFTER plan 1, green. + +**Implementation** — the test itself is the deliverable. No production code change. + +This test is the regression net for AC #13 — it fails CI when a future agent definition is added without its template, with a precise file path naming the missing template. + +### 8. Documentation + +**`CHANGELOG.md`**: add entry: `- Changed: worker boot-time failures (template load, model resolution, context pipeline) now produce a visible failed run row in the dashboard, exit with a distinct boot-fail code (2), and capture to Sentry under tag 'worker_boot_failure'. Sentry-driven runs without a PM-side work item id receive a synthesized stable identifier (sentry:issue:). Adding an agent type without its prompt template now fails CI.` + +--- + +## Test Plan + +### Unit tests +- [ ] `tests/unit/agents/shared/bootFailureError.test.ts`: 2 tests for the typed boundary +- [ ] `tests/unit/backends/adapter-run-record-ordering.test.ts` (or extension): 3 tests for the reorder + deferred-fill +- [ ] `tests/unit/agents/shared/executionPipeline.test.ts` (extension): 2 tests for the boot-vs-runtime branch +- [ ] `tests/unit/worker-entry-boot-failure.test.ts`: 3 tests for the exit codes +- [ ] `tests/unit/router/container-manager-diagnostics.test.ts` (extension): 3 tests for `formatCrashReason` exit-2 branch +- [ ] `tests/unit/triggers/sentry-alerting.test.ts` (extension): 4 tests for the synthesized identifier +- [ ] `tests/unit/agents/prompts/template-conformance.test.ts`: 1 test asserting all agent types have templates + +### Integration tests +- [ ] `tests/integration/worker-boot-failure-end-to-end.test.ts` (new): drive a full job dispatch with an artificially-removed prompt template, assert (a) a run row exists with `status='failed'` and a structured error message, (b) the worker container exited with code 2, (c) Sentry was called with the `worker_boot_failure` tag. + +### Acceptance tests +- [ ] Per-plan AC checklist (below) verified against the test outputs and `cascade runs list` post-deploy. + +--- + +## Manual Verification (for `[manual]`-tagged ACs only) + +- **AC**: Per-plan AC #11 (post-deploy 24h rate of "worker exits 0 with no run row created" drops to zero) — inherited from spec AC #14. +- **Why manual**: Production observability check — counts an emergent property of cascade-router log volume across 24 hours of real traffic. No automated test reproduces 24h of varied production traffic. +- **Verification protocol**: + 1. After plan 1 and plan 2 are both deployed to prod, wait at least 24 hours of normal cascade-router operation. + 2. Run a Loki query against cascade-router for the period: `{container="/cascade-router"} |~ "Job completed successfully" | label_format job_id="..."` correlated with the runs database `agent_runs` table for the same `job_id` values. + 3. The expected outcome is **zero** `Job completed successfully` log lines whose corresponding `job_id` has no `agent_runs` row. A non-zero count under normal traffic indicates a regression worth investigating. + 4. Capture the count and a short note in the spec or PR comment confirming the observation. + +--- + +## Acceptance Criteria (per-plan, testable) + +1. A simulated boot failure (forced ENOENT on the alerting prompt template at integration-test time) produces a run row in `agent_runs` with `status='failed'` and a structured error message naming the missing template. +2. A simulated boot failure for any boot-phase step (template load, model resolution, context pipeline construction) is captured to Sentry under tag `worker_boot_failure`. +3. The worker exits with code 2 when boot fails, with code 0 on successful or no-op runs, and with code 1 on in-execution crashes — semantics for codes 0 and 1 are unchanged from prior behaviour. +4. The router's `formatCrashReason` returns a string containing `Boot failed` when interpreting an exit-2 worker; it surfaces the run row's structured error message when the run id is known. +5. A Sentry-driven run with no upstream-resolved PM workItemId has its `agent_runs.work_item_id` set to a deterministic value derived from the alert payload (`sentry:issue:` for issue alerts; equivalent stable derivation for metric alerts). +6. Two dispatches against the same Sentry issue produce the same synthesized workItemId, and the dashboard work-item view groups them. +7. The CI conformance test fails when any agent type registered via YAML definition lacks a corresponding `.eta` prompt template, with a failure message naming the missing template path. +8. Existing successful-run, no-op, and in-execution-crash paths across all other agent types behave identically before and after this plan ships (regression-sentinel tests pass). +9. Per-plan AC #11 (post-deploy observation): see Manual Verification above. +10. All new/modified code has corresponding tests. +11. `npm run lint` passes. +12. `npm run typecheck` passes. +13. `npm test` passes. +14. `npm run test:integration` passes. +15. `CHANGELOG.md` is updated. + +--- + +## Documentation Impact (this plan only) + +| File | Change | +|---|---| +| `CHANGELOG.md` | Entry: `Changed: worker boot-time failures now produce visible failed runs (exit code 2, Sentry tag 'worker_boot_failure'); CI fails on missing prompt template for any registered agent type; sentry-driven runs receive synthesized stable workItemId.` | + +`CLAUDE.md` is intentionally not edited — see the spec's Documentation Impact section for the rubric reasoning. The CI conformance test plus this plan's commit message and the spec carry the rationale. + +--- + +## Out of Scope (this plan) + +- Authoring the alerting agent prompt template — plan 1. +- LLM-judged eval harness for alerting agent investigation quality — out of scope for the spec entirely. +- Closing the loop back to Sentry by posting an investigation comment on the Sentry issue itself — out of scope for the spec entirely. +- Support for non-Sentry alerting providers — out of scope for the spec entirely. +- Backfilling run rows or PM-side work items for the historical silently-failed alerting runs from 2026-05-06 — out of scope for the spec entirely. +- Behavioural change to existing successful-but-no-op runs across other agent types (still exit 0) — out of scope for the spec entirely. +- Reworking unrelated silent-fail call sites in the worker (only the boot-phase path is in scope) — out of scope for the spec entirely. + +--- + +## Progress + + +- [ ] AC #1 +- [ ] AC #2 +- [ ] AC #3 +- [ ] AC #4 +- [ ] AC #5 +- [ ] AC #6 +- [ ] AC #7 +- [ ] AC #8 +- [ ] AC #9 +- [ ] AC #10 +- [ ] AC #11 +- [ ] AC #12 +- [ ] AC #13 +- [ ] AC #14 +- [ ] AC #15 diff --git a/docs/plans/018-alerting-agent-and-worker-boot-visibility/_coverage.md b/docs/plans/018-alerting-agent-and-worker-boot-visibility/_coverage.md new file mode 100644 index 00000000..9a4a3ef4 --- /dev/null +++ b/docs/plans/018-alerting-agent-and-worker-boot-visibility/_coverage.md @@ -0,0 +1,39 @@ +# Coverage map for spec 018-alerting-agent-and-worker-boot-visibility + +Auto-generated by /plan. Tracks which plans satisfy which spec ACs. + +## Spec ACs + +| # | Spec AC (short) | Satisfied by | Status | +|---|---|---|---| +| 1 | Sentry issue alert produces visible alerting run with PM-tool output | plan 1 (alerting-prompt) | full | +| 2 | Sentry metric alert produces comparable investigation | plan 1 (alerting-prompt) | full | +| 3 | Alerting agent does not edit source / commit / push / open PRs | plan 1 (alerting-prompt) | full | +| 4 | Prompt renders for all three engines without errors | plan 1 (alerting-prompt) | full | +| 5 | Predictable output structure for work item title / description / comment text | plan 1 (alerting-prompt) | full | +| 6 | Agent reasons about pre-loaded context, no per-event-type branching in prompt | plan 1 (alerting-prompt) | full | +| 7 | Template-load failure produces visible failed run row in dashboard | plan 2 (boot-failure-visibility) | full | +| 8 | Model-resolution / context-pipeline failures produce visible failed run rows | plan 2 (boot-failure-visibility) | full | +| 9 | Distinct boot-fail exit code, distinguishable from 0 and 1 | plan 2 (boot-failure-visibility) | full | +| 10 | Existing success-path and in-execution-failure runs unchanged | plan 2 (boot-failure-visibility) | full | +| 11 | Sentry capture under stable tag, dashboard run id cross-referenced | plan 2 (boot-failure-visibility) | full | +| 12 | Sentry-driven runs receive synthesized identifier so they group in dashboard | plan 2 (boot-failure-visibility) | full | +| 13 | CI guard fails when an agent type lacks its prompt template | plan 2 (boot-failure-visibility) | full | +| 14 | Post-deploy 24h rate of "worker exits 0 with no run row" drops to zero | plan 2 (boot-failure-visibility) | full `[manual]` | + +## Coverage summary + +- **14 spec ACs** mapped to **2 plans** +- **2 plans** with full-coverage ACs (each plan testable in isolation, no partial-coverage chains) +- **0 plans** with partial-coverage ACs +- **1 manual-verification AC** (spec AC #14) — verified post-deploy in cascade-router log volume per plan 2's Manual Verification section + +## Plan dependency graph + +``` +1-alerting-prompt ──→ 2-boot-failure-visibility +``` + +Plan 2 depends on plan 1: plan 2 introduces a CI conformance test asserting every YAML-registered agent type has a corresponding `.eta` prompt template. Plan 1 ships the alerting agent's template. If plan 2 landed first, the conformance test would fail CI for the (still-unwritten) alerting template, blocking merge. Plan-1-first avoids that ordering trap. + +The original spec strategic decision #10 noted that "A benefits from B existing first (so future iteration on the alerting prompt is observable)". In practice the ordering above is the right inversion: the safety net (plan 2) lands AFTER the new feature (plan 1) so the safety net's conformance check never has a missing-template false positive to grandfather. Anyone iterating on the alerting prompt between plan-1 ship and plan-2 ship is doing it actively and watching logs/Sentry directly; the visibility gap matters for steady-state operation, which both plans serve once both are shipped. diff --git a/docs/specs/018-alerting-agent-and-worker-boot-visibility.md b/docs/specs/018-alerting-agent-and-worker-boot-visibility.md new file mode 100644 index 00000000..495d0505 --- /dev/null +++ b/docs/specs/018-alerting-agent-and-worker-boot-visibility.md @@ -0,0 +1,205 @@ +--- +id: 018 +slug: alerting-agent-and-worker-boot-visibility +level: spec +title: Alerting agent prompt + worker boot-failure visibility +created: 2026-05-06 +status: draft +--- + +# 018: Alerting agent prompt + worker boot-failure visibility + +## Problem & Motivation + +On 2026-05-06 the cascade project received its first prod-traffic Sentry alert webhook intended to drive an `alerting`-type agent run. The first attempt failed silently because the router worker spawn was missing the credential bag for sentry-source jobs (separately fixed in PR #1259/#1260). After the credential fix shipped and a second test alert was sent, the worker booted, decrypted credentials, processed the sentry job header — and then died at agent boot with `ENOENT: no such file or directory, open '.../alerting.eta'`. The dashboard runs surface showed nothing for either attempt; the `agent_runs` row was never created; the only operator-visible signal was a Sentry exception captured from inside the worker. + +Two distinct gaps surfaced in the same incident, with a clear causal chain between them. The alerting agent had been designed end-to-end except for the system prompt template — definition YAML, capabilities, triggers, context pipeline, and inline task prompt all exist; only the prompt that gives the agent its persona, philosophy, and process never got written. That is the **feature gap (A)**. Independently, the worker's agent execution pipeline catches and silently converts boot-time exceptions into a clean `{success: false}` result, while the run-record creation step happens AFTER the steps that can fail (prompt load, model resolution, context pipeline). Boot-time failures therefore produce no run row, the worker exits 0, BullMQ marks the job done, and the dashboard runs UI is blind to the failure. That is the **observability gap (B)**. + +Gaps A and B reinforce each other: B is what made A so painful to diagnose. A future agent type added without its prompt template — or any boot-time regression in any existing agent — will fail in exactly the same invisible shape unless B is fixed. This spec consolidates both fixes under one motivation. They are independent enough to ship as separate plans and PRs but related enough that solving them together — with shared "no silent boot fail in any agent" policy — is cheaper than two separate spec rounds. This continues the lineage of spec **017** (router-side silent-failure hardening, done 2026-04-29), applying the same "silent fail no longer permitted" discipline to the worker side. + +--- + +## Goals + +### Goal block A — Alerting agent + +A1. When a Sentry issue alert fires for a configured project, the alerting agent investigates the alert end-to-end: parses the pre-loaded event data (stacktrace, breadcrumbs, tags), reads the relevant source files to confirm the root cause, and either creates a bug investigation work item in the project's backlog (when a backlog list is configured) or comments on the triggering work item with its findings (when one is associated). Investigation output is concise, actionable, and names the failing function/file/line. + +A2. The alerting agent is a strict investigator-and-reporter, not a fixer. It does not edit source files, push commits, or open PRs. Its mission ends at "filed a bug to be fixed" or "commented investigation findings." + +A3. The alerting agent works against any Sentry-issued alert payload (issue alert and metric alert) without per-payload-shape branching at the prompt level. The shape differences are absorbed by the existing context pipeline; the prompt itself reasons about whatever investigation context was pre-loaded. + +A4. The agent's prompt is engine-agnostic — the same template renders correctly for `claude-code`, `codex`, and `opencode` engines. It relies on shared partials for environment and process preamble. + +A5. The agent has clear completion criteria — an explicit definition of "investigation done" so that runs terminate predictably rather than meandering through unrelated code paths. + +### Goal block B — Worker boot-failure visibility + +B1. Any failure that occurs in the worker between job pickup and the start of agent execution (template load, model resolution, context pipeline assembly, definition lookup, identifier resolution) produces a visible run row in the dashboard runs surface with a status that distinguishes "boot failed" from "agent ran and chose not to act" and from "agent ran and crashed mid-execution." + +B2. The worker's exit code communicates failure shape to the router: boot-time failures exit with a distinguishable non-zero code so that BullMQ's failure machinery and the existing post-failure compensation (lock release, recently-dispatched dedup mark) fire as designed. Agent-no-op runs continue to exit 0 — semantics for those runs are unchanged. + +B3. Boot-time failures escalate to Sentry under a stable tag (consistent with the spec-017 tag-naming convention) so operators triaging out-of-band have an unambiguous signal apart from the dashboard run row. + +B4. Sentry-driven runs (which have no PM-side work item id today) carry a synthesized stable identifier so the dashboard work-item view can group and find them. Multiple investigations of the same Sentry issue group together naturally. + +B5. A regression net prevents future agent types from being registered without a corresponding prompt template — adding an agent definition without writing its prompt produces a CI failure with a precise file path to the missing template. + +--- + +## Non-goals + +- A full LLM-judged eval harness for the alerting agent's investigation quality. Unit + integration coverage of prompt rendering, context-pipeline wiring, gadget allowlist, and an end-to-end fixture is sufficient for v1. An eval harness is deferred to a separate spec only if alerting goes prod-multi-tenant and ground-truth investigation summaries become worth authoring. +- Closing the loop back to Sentry by posting an investigation comment on the Sentry issue itself. The agent reports into the project's PM tool only in v1; Sentry remains read-only from the agent's perspective. +- Support for non-Sentry alerting providers (Datadog, PagerDuty, Grafana, etc.). The alerting integration abstraction is in place for future providers, but only Sentry is in scope here. +- Behavioural change to existing successful-but-no-op runs across other agent types — they continue to exit 0 and continue to record `agent_runs.status = completed`. The new exit code is reserved for boot-time failures only. +- Backfilling run rows or PM-side work items for the small handful of silently-failed historical alerting runs. Operators can investigate manually if they want; future runs after this spec ships will be visible end-to-end. +- Reworking the agent execution pipeline beyond moving run-record creation earlier and adding the boot-fail catch site. The success path's shape is unchanged. +- Tightening unrelated WARN-vs-ERROR call sites in the worker. Only the boot-time silent-fail path is in scope. + +--- + +## Constraints + +- **Single run row per job invariant**: every dispatched job that reaches the worker corresponds to exactly one row in the runs table. No "boot phase" row plus "execution phase" row pattern. Plan-resolution fields (model, max iterations) that aren't known until plan resolution must accept a deferred-fill semantics in the run row. +- **Backward compatibility for the success path**: agent runs that succeed must look identical in the runs surface and in observability data after this spec ships. Operators reviewing run history for unrelated agent types should see no change. +- **Engine-agnostic prompt**: the alerting prompt renders with the existing Eta template engine and consumes data from the existing context pipeline. It must not embed engine-specific tool-call shapes or assumptions about backend behaviour. +- **Sentry tag naming alignment**: any new Sentry tags introduced for boot-fail visibility follow the lower_snake_case convention established by spec 017's `wedged_lock_canary`, spec 015's `pipeline_capacity_gate_no_pm_provider`, and similar. +- **Prompt examples must not leak eval answers**: per the cross-project rule, any few-shot examples or illustrative cases in the alerting prompt must use a domain that does NOT appear in cascade's eval fixtures or test fixtures. The honesty test: "if a stranger swapped these examples for completely unrelated ones, would the integration tests still pass?" Answer must be yes. +- **Independent shippable units**: the alerting block (A) and visibility block (B) ship as separate plans and separate PRs. Either can be deployed without the other; sequencing is operator-determined. (Operationally A benefits from B shipping first, but neither is a hard prerequisite.) + +--- + +## Requirements + +### Alerting agent (A) + +A.1. The alerting agent renders a system prompt for every dispatch of `agentType: 'alerting'` from the existing trigger handlers, without that dispatch path needing modification. + +A.2. The prompt directs the agent through a three-phase process: parse the pre-loaded event data → confirm root cause by reading source → summarise and report via PM tool. Phase boundaries are visible enough in the prompt that operators reading a run transcript can identify which phase the agent is in. + +A.3. The prompt includes an explicit guardrail forbidding source edits, commits, PRs, or any write outside the PM-tool reporting surface. The guardrail is reinforced enough to survive the same model on a different day. + +A.4. When a Sentry alert's trigger context provides both a link to the originating Sentry issue and an associated existing work item, the prompt directs the agent to comment on the existing work item rather than create a new backlog entry. When only a backlog list is available, the agent creates a backlog entry instead. + +A.5. The prompt's investigation depth is governed by soft guidance ("stop when you can name the failing function and the trigger condition") rather than a hard cap on files read. The existing per-run cost and iteration caps in the agent runtime remain the actual ceiling. + +A.6. The prompt's outputs (work item title, work item description, comment text) follow a predictable structure defined in the prompt, so that operators reviewing many investigation outputs can scan them quickly. + +A.7. Unit tests verify the prompt renders without errors against representative trigger contexts. Integration tests verify the agent can be dispatched, the context pipeline runs, the gadget allowlist resolves correctly, and a fixture-driven run completes. + +### Worker boot-failure visibility (B) + +B.1. The run row for a job is created at the point where the worker has parsed the job header successfully and has determined the agent type, BEFORE any operation that can fail in a way that would otherwise prevent the row from being written. The fields known only after plan resolution accept a deferred-fill semantics. + +B.2. Any exception thrown in the boot-phase code path (template load, model resolution, context pipeline construction, definition lookup, identifier construction) is caught at a single dedicated boot-phase catch site that updates the existing run row to a failed status, records a structured error message, captures to Sentry under a stable tag, and exits the worker with the boot-fail exit code. + +B.3. The boot-fail exit code is distinct from the worker's existing exit codes (0 for success-or-noop, 1 for in-execution crash). The router's exit-code interpreter recognises the new code and stamps run records (where appropriate) and crash-reason logs with a distinguishable label. + +B.4. Sentry-driven runs whose payload has no associated PM work item carry a synthesized stable identifier in the runs row. The identifier is derived deterministically from the alert payload such that multiple invocations against the same Sentry issue produce the same identifier and group together in the dashboard. + +B.5. A CI guard asserts that for every agent type registered (via YAML definition or any future registration mechanism), the corresponding prompt template file exists. Adding a new agent definition without the matching prompt produces a precise CI failure naming the missing template. + +B.6. After this spec ships, an operator running `cascade runs list --project --agent-type ` for any agent type that experienced a boot failure sees a row with a failed status and the structured error message, regardless of whether the agent ever reached its execution phase. + +B.7. The pre-existing silent swallows in the run-row creation path and the agent-execution catch handler are tightened: failures that previously logged WARN and returned no row, or that silently logged ERROR and returned a "no-op success" result, now follow the boot-fail discipline above instead. + +--- + +## Research Notes + +- **Spec 017 (router-side silent-failure hardening, done 2026-04-29)** is the direct precedent for this work. Its tag-naming convention, Sentry-capture pattern, and conformance-test extension model are reused here on the worker side. The "silent WARN converted to Sentry-tagged error" policy from 017 transfers directly. +- **Spec 015 (router job dispatch failure recovery, done)** established the failed-event compensation flow — every dispatch failure flows through `worker.on('failed')` to release locks and dedup marks. The new boot-fail exit code from this spec plugs into that existing compensation path; no new compensation logic is needed. +- **The cross-project CLAUDE.md prompt-leak rule** ("Prompt examples must NOT leak the eval answers") is load-bearing for goal A. Any illustrative example in the alerting prompt must come from a domain disjoint from any test fixture the alerting agent will be evaluated against. +- **No academic prior art is cited.** Both fixes are well-understood engineering hygiene applied to existing infrastructure: the alerting agent prompt follows the cascade codebase's established prompt-engineering pattern (persona + philosophy + phased process + completion criteria, shared via partials); the boot-failure visibility fix is "create the row before the things that can fail, and surface failures loudly" — standard observability discipline. +- **Industry convention on idempotency and exit-code semantics**: a separate exit code for boot-vs-runtime failure is consistent with how systemd, Docker entrypoints, and process supervisors typically distinguish "couldn't start" from "started and crashed." It enables correctness of restart-vs-back-off policies downstream. + +--- + +## Open Source Decisions + +| Tool | Solves | Decision | Reason | +|------|--------|----------|--------| +| _(none)_ | _(none)_ | _(none)_ | The alerting agent uses the existing prompt-engine, gadget framework, and integration abstraction. The boot-visibility fix uses existing run-record infrastructure, Sentry capture, and conformance-test pattern. No new tools are adopted. | + +--- + +## Strategic decisions + +1. **One bundled spec over two.** Both gaps surfaced in the same incident, gap B is what made gap A so painful, and the spec's narrative arc reads as one ("we shipped a feature blind, then we hardened the visibility so the next blind ship is loud"). Decomposition into two downstream plans preserves per-PR independence; bundling at the spec level keeps the motivation coherent and avoids stitching the same context into two separate spec rounds. Alternative considered: split into specs 018 + 019; rejected at the user-interview step. + +2. **Investigator-and-filer over investigator-and-fixer for the alerting agent.** The agent investigates, names the root cause, and files a bug for someone else to fix. It does not edit code or open PRs. Reason: blast radius. An automated agent that can fix bugs without human review is a risk an alerting trigger does not justify; the implementation agent already exists for that workload. Alternative considered: combined investigate-and-fix mode; rejected as scope creep with an unfavourable risk profile. + +3. **Comment-on-existing over create-new when both are possible.** When the alerting trigger context has both a backlog list and an associated existing work item, the agent prefers commenting on the existing item. Reason: lower write blast radius, easier to undo, prevents duplicate "investigate this" backlog rows accumulating across alert recurrences. Alternative considered: always create a new backlog entry; rejected for the duplication-pressure reason. + +4. **Soft investigation-depth guidance over a hard file-read cap.** The prompt directs the agent to stop when it can name the failing function and the trigger condition. Hard caps misbehave when the bug requires deep tracing through multiple files. The existing per-run cost and iteration ceilings are the actual ceiling. Alternative considered: explicit "read at most N source files" cap; rejected because it forces giving up on legitimate deep investigations. + +5. **Engine-agnostic prompt prose.** The alerting prompt is written without engine-specific tool-call shapes or backend assumptions, mirroring how the existing prompts (`review.eta`, `backlog-manager.eta`, etc.) work today. Reuses the shared `partials/environment` preamble. Alternative considered: claude-code-specific prompt for v1 with multi-engine ports later; rejected because the engine abstraction already handles this and forking would create future drift. + +6. **Single run row per job preserved; deferred-fill for plan-resolution fields.** Boot-failure visibility is achieved by reordering when the row is created, not by introducing a "boot phase" row separate from the "execution phase" row. Reason: every consumer of the runs surface (dashboard list view, work-item-page lookup, retry path, debug-analysis path) assumes one row per job. Splitting that contract is a much larger blast radius than letting plan-resolution fields be nullable temporarily. Alternative considered: separate boot-phase row with its own status taxonomy; rejected as too invasive for the actual fix shape. + +7. **Distinct boot-fail exit code over reusing exit 1.** The worker exits with a distinct code for boot-time failures, leaving exit 0 (agent succeeded or chose not to act) and exit 1 (agent ran and crashed) intact. Reason: BullMQ's failure compensation, the router's crash-reason interpreter, and operator log-grep patterns all benefit from being able to distinguish "couldn't start" from "started and crashed" without parsing log strings. Alternative considered: always exit non-zero on any agent failure; rejected because it shifts the meaning of existing successful-but-no-op runs across all agent types — out of scope for this spec. + +8. **Synthesized stable identifier for sentry runs over leaving NULL.** Sentry-driven runs have no PM-side work item id; without a synthesized identifier they would be invisible in the dashboard work-item view, reproducing the spec-017 anti-pattern for a brand-new agent type. The identifier is deterministic from the alert payload so multiple investigations of the same Sentry issue group naturally. Alternative considered: leave NULL and accept invisibility; rejected because we just spec'd 017 to fix that exact pattern for other agent types. + +9. **Conformance-test extension over runtime guard for the prompt-template invariant.** A CI test asserts every registered agent type has a corresponding prompt template, mirroring spec 009's PM-manifest conformance harness pattern. Reason: catches "register an agent without writing the prompt" before it ships, instead of relying on a runtime check that fires only when traffic reaches the new agent type (which is exactly what produced the 2026-05-06 incident). Alternative considered: runtime startup guard that scans agent types vs. templates; rejected because the failure mode the user wants to prevent is "ships to prod and waits for traffic", not "starts up and crashes" — CI is the right gate. + +10. **Two downstream plans, sequenced operator-side.** This spec produces two `/plan` files: one for the alerting agent prompt, one for boot-failure visibility. They can be merged in either order; A benefits from B existing first (so future iteration on the alerting prompt is observable), but neither is a hard prerequisite. Alternative considered: one combined plan; rejected because the review profile and reviewer audience for prompt-engineering vs. observability-hardening are different. + +--- + +## Acceptance Criteria (outcome-level) + +### Alerting agent (A) + +1. A Sentry issue alert fired against a configured project produces a visible alerting agent run that progresses through investigation phases and terminates with either a new bug investigation work item in the configured backlog or a comment on the triggering work item, containing the root cause summary, affected file/function, and a link to the alert. + +2. A Sentry metric alert fired against a configured project triggers the alerting agent (subject to the per-project trigger configuration) and produces an investigation comparable in shape to the issue-alert path, adapted to the metric-alert payload's lack of stacktrace. + +3. The alerting agent does not edit source files, commit, push, or open PRs during any run. A run that attempted any of those actions is a regression. + +4. The alerting agent's prompt template renders successfully for every supported engine (`claude-code`, `codex`, `opencode`) — no engine-specific syntax errors, no rendering failures. + +5. The alerting agent's outputs (work item title, work item description, comment text) follow a predictable structure across runs, scannable at a glance by an operator triaging multiple investigations. + +6. Adding a new alerting trigger (e.g. a hypothetical `alerting:rate-anomaly`) without modifying the alerting prompt does not break existing runs — the prompt reasons about whatever pre-loaded context it receives without per-event-type branching. + +### Worker boot-failure visibility (B) + +7. A worker that fails to load its prompt template (e.g. missing file) produces a visible failed run row in the dashboard runs surface within seconds of webhook arrival, queryable via `cascade runs list` filters by project and agent type, with a structured error message naming the failure cause. + +8. A worker that fails during model resolution, context pipeline assembly, or any other boot-phase step likewise produces a visible failed run row with a structured error message naming the failure cause. + +9. The new boot-fail exit code is distinct from the existing 0 and 1 codes, and the router's crash-reason interpretation surfaces a distinguishable label for boot-fail runs in its diagnostic logs. + +10. Existing success-path runs and existing in-execution-failure runs (exit 1) look identical in the runs surface and in observability data before and after this spec ships. No regression in the success-or-noop path. + +11. A boot failure escalates to Sentry under a stable tag (lower_snake_case, aligned with spec 017 conventions). The dashboard runs surface and the Sentry capture both reference the same run id for cross-referencing. + +12. A Sentry-driven alerting run, regardless of whether it has a PM-side work item id from the trigger context, is discoverable via the dashboard work-item view by querying the synthesized stable identifier. Multiple investigations of the same Sentry issue surface as a group. + +13. Adding a new agent type registration to the codebase without a corresponding prompt template produces a CI failure that names the missing template path, blocking merge until the template is added. + +14. After this spec is fully deployed, the 24-hour rate of "worker exits 0 with no run row created" events on cascade-router drops to zero under normal operation. Any non-zero rate represents a real regression worth investigating. + +--- + +## Documentation Impact (high-level) + +- **`README.md`** — update the alerting-related section (or add one) to mention that the alerting agent investigates Sentry alerts and reports findings to the project's PM tool, as part of the documented agent inventory. +- **`CHANGELOG.md`** — entry for both the alerting-agent feature and the boot-failure visibility hardening, in the appropriate release block when the changes ship. + +(The boot-fail invariant from goal block B is intentionally not added to `CLAUDE.md`. The conformance test introduced by plan 2 enforces the rule at CI time, and the spec + plan + commit history communicate the rationale to anyone reading code in the affected area. Per the cross-project rubric, default-no for `CLAUDE.md` additions unless the rule is genuinely homeless.) + +--- + +## Out of Scope + +- An LLM-judged eval harness for alerting agent investigation quality. Deferred to a future spec only if alerting goes prod-multi-tenant. +- Closing the loop back to Sentry by posting investigation comments on Sentry issues themselves. +- Support for non-Sentry alerting providers (Datadog, PagerDuty, Grafana, etc.) — abstraction is in place but only Sentry is in scope. +- Backfilling run rows or PM-side work items for the historical silently-failed alerting runs from 2026-05-06. +- Behavioural change to existing successful-but-no-op runs across other agent types. +- Reworking unrelated silent-fail call sites in the worker beyond the boot-phase path. +- A combined investigate-and-fix mode for the alerting agent. +- Multi-tenant tuning of the alerting agent prompt for distinct project profiles. From 302a5b3d449f44dd436e76378290681eccf825d1 Mon Sep 17 00:00:00 2001 From: Zbigniew Sobiecki Date: Wed, 6 May 2026 14:05:43 +0000 Subject: [PATCH 2/3] chore(plan): 018/1 lock Co-Authored-By: Claude Opus 4.7 (1M context) --- .../{1-alerting-prompt.md => 1-alerting-prompt.md.wip} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename docs/plans/018-alerting-agent-and-worker-boot-visibility/{1-alerting-prompt.md => 1-alerting-prompt.md.wip} (99%) diff --git a/docs/plans/018-alerting-agent-and-worker-boot-visibility/1-alerting-prompt.md b/docs/plans/018-alerting-agent-and-worker-boot-visibility/1-alerting-prompt.md.wip similarity index 99% rename from docs/plans/018-alerting-agent-and-worker-boot-visibility/1-alerting-prompt.md rename to docs/plans/018-alerting-agent-and-worker-boot-visibility/1-alerting-prompt.md.wip index 8ca6b904..5a8bd38f 100644 --- a/docs/plans/018-alerting-agent-and-worker-boot-visibility/1-alerting-prompt.md +++ b/docs/plans/018-alerting-agent-and-worker-boot-visibility/1-alerting-prompt.md.wip @@ -6,7 +6,7 @@ plan_slug: alerting-prompt level: plan parent_spec: docs/specs/018-alerting-agent-and-worker-boot-visibility.md depends_on: [] -status: pending +status: wip --- # 018/1: Alerting agent prompt template From bdd34cc9d7ee895fdba13a13df81cf9228ccf90c Mon Sep 17 00:00:00 2001 From: Zbigniew Sobiecki Date: Wed, 6 May 2026 14:15:26 +0000 Subject: [PATCH 3/3] feat(agents): alerting agent system prompt template (spec 018, plan 1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Author the system prompt template that the alerting agent's worker tries to load at boot. Every other piece of the alerting agent — YAML definition, capabilities, trigger handlers, context pipeline, Sentry integration — was already wired. Missing the .eta produced an ENOENT at agent boot when the first prod-traffic Sentry alert arrived (cascade project, 2026-05-06). After this plan, an alerting agent dispatched via existing trigger handlers reaches its execution phase end-to-end. The prompt structures the agent's behavior as a three-phase investigator (parse pre-loaded event → confirm root cause via source reads → file or comment) with an explicit INVESTIGATE-AND-FILE-ONLY guardrail. Predictable output structure: 'Investigate: in (:)' title and a 4-6 sentence + bullets description. The agent's read-only guarantee is enforced statically by the YAML's capability declaration (no fs:write, no scm:*) — the resolved gadget allowlist excludes WriteFile, CreatePR, CreatePRReview. Plan task #2 was downgraded from a heavy E2E integration test to a static capability-allowlist test (more reliable than behavioral negative-assertions that depend on LLM cooperation); divergence noted in the plan itself. Tests: 10 new in tests/unit/agents/prompts.test.ts; 4 new in tests/unit/agents/definitions/alerting.yaml.test.ts. All green; full unit suite (8808 tests) green; lint + typecheck clean. Spec ACs 1-6 satisfied (full). Marks plan 018/1 as .done. Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 4 + README.md | 3 +- ...rompt.md.wip => 1-alerting-prompt.md.done} | 48 ++++---- src/agents/prompts/templates/alerting.eta | 106 ++++++++++++++++++ .../agents/definitions/alerting.yaml.test.ts | 63 +++++++++++ tests/unit/agents/prompts.test.ts | 76 +++++++++++++ 6 files changed, 274 insertions(+), 26 deletions(-) rename docs/plans/018-alerting-agent-and-worker-boot-visibility/{1-alerting-prompt.md.wip => 1-alerting-prompt.md.done} (84%) create mode 100644 src/agents/prompts/templates/alerting.eta create mode 100644 tests/unit/agents/definitions/alerting.yaml.test.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index 0ccb6d94..5908aa72 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,10 @@ All notable user-visible changes to CASCADE are documented here. The format is l ## Unreleased +### Added + +- **Alerting agent now investigates Sentry alerts and files bug investigation work items** (spec 018, plan 1 of 2). The `alerting` agent had been wired end-to-end except for its system prompt template — definition YAML, capabilities, trigger handlers, context pipeline, and Sentry integration were all in place, but `src/agents/prompts/templates/alerting.eta` was missing, so the worker crashed at agent boot with `ENOENT` when the first prod-traffic Sentry alert arrived (cascade project, 2026-05-06). This plan ships the prompt: a three-phase investigator (parse pre-loaded event → confirm root cause via source reads → file or comment) with an explicit `INVESTIGATE-AND-FILE-ONLY` guardrail. The agent does not edit source, commit, push, or open PRs — that property is enforced at the capability layer (no `fs:write`, no `scm:*`), pinned by a static test that asserts the resolved gadget allowlist excludes `WriteFile`, `CreatePR`, and `CreatePRReview`. When the trigger context provides an existing work item, the agent comments on it; otherwise it creates a new bug investigation work item in the configured backlog. Output structure is predictable: `Investigate: in (:)` title and a 4-6 sentence + bullets description. Engine-agnostic prose; reuses `partials/environment` for the shared preamble. See [spec 018](docs/specs/018-alerting-agent-and-worker-boot-visibility.md). Plan 2 of 2 closes the silent-failure path that masked this gap (worker boot failures will produce visible failed run rows, exit code 2, Sentry capture under `worker_boot_failure`). + ### Changed - **Pipeline-capacity gate now enforces `maxInFlightItems` for PM `status-changed` triggers** (spec 017, plan 2 of 3). The gate at `src/triggers/shared/pipeline-capacity-gate.ts` is the hard cap on the active pipeline (TODO + IN_PROGRESS + IN_REVIEW work items) introduced after a prior incident where a human moved three cards into TODO simultaneously and three concurrent implementation runs fired against a project pinned to `maxInFlightItems: 1`. The gate calls `getPMProvider()` to count in-flight items, but for every PM `status-changed` trigger the call threw `No PMProvider in scope` because the three PM router adapters (`src/router/adapters/{linear,trello,jira}.ts`) wrapped trigger dispatch in their per-PM-type credential `AsyncLocalStorage` scope but NOT in PM-provider scope (the GitHub adapter at `src/router/adapters/github.ts:280` already had both wrappings). The gate fell through to its conservative branch (`WARN: pipeline-capacity-gate: PM provider unavailable, allowing run` and `return false`) — silently no-op for the only triggers that actually need it. 32 occurrences/day on cascade-router (verified 2026-04-29). The fix introduces a shared helper `withPMScopeForDispatch(project, dispatch)` at `src/router/adapters/_shared.ts` that the three PM router adapters consume, mirroring the GitHub adapter's correct shape. The gate's "PM provider unavailable" branch is converted from `WARN + return false` (allow) to ERROR-level + Sentry capture under stable tag `pipeline_capacity_gate_no_pm_provider` + `return true` (block) — once the routine path establishes scope, hitting that branch is a real `AsyncLocalStorage` scope leak operators need to investigate. A static-guard test at `tests/unit/integrations/pm-router-adapter-pm-scope.test.ts` enforces the wrapping invariant per adapter; CLAUDE.md gains a "Capacity-gate invariant" passage in the Architecture section. See [spec 017](docs/specs/017-router-silent-failure-hardening.md). diff --git a/README.md b/README.md index 7369b6c5..252c22e1 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ For the full setup walkthrough — projects, credentials, webhooks, and triggers ## ⚡ Features - **Multi-PM support** — Works with Trello, JIRA, and Linear out of the box -- **11 agent types** — Splitting, planning, implementation, review, debug, respond-to-review, respond-to-CI, and more +- **12 agent types** — Splitting, planning, implementation, review, debug, respond-to-review, respond-to-CI, alerting, and more - **Dual-persona GitHub model** — Separate implementer and reviewer bot accounts to prevent feedback loops - **Web dashboard + CLI** — Monitor runs, manage projects, configure triggers - **Extensible trigger system** — Add new events without touching core logic @@ -78,6 +78,7 @@ Cascade runs as three independent services: | `debug` | Session log uploaded | Analyzes agent session logs and creates a debug card | | `resolve-conflicts` | Merge conflict detected | Resolves git merge conflicts | | `backlog-manager` | Scheduled / manual | Manages and prioritizes the backlog | +| `alerting` | Sentry alert webhook | Investigates the alert (parses stacktrace, reads source) and files a bug investigation work item or comments on an existing one. Read-only — never edits source, opens PRs, or pushes commits. | --- diff --git a/docs/plans/018-alerting-agent-and-worker-boot-visibility/1-alerting-prompt.md.wip b/docs/plans/018-alerting-agent-and-worker-boot-visibility/1-alerting-prompt.md.done similarity index 84% rename from docs/plans/018-alerting-agent-and-worker-boot-visibility/1-alerting-prompt.md.wip rename to docs/plans/018-alerting-agent-and-worker-boot-visibility/1-alerting-prompt.md.done index 5a8bd38f..02c054f2 100644 --- a/docs/plans/018-alerting-agent-and-worker-boot-visibility/1-alerting-prompt.md.wip +++ b/docs/plans/018-alerting-agent-and-worker-boot-visibility/1-alerting-prompt.md.done @@ -6,7 +6,7 @@ plan_slug: alerting-prompt level: plan parent_spec: docs/specs/018-alerting-agent-and-worker-boot-visibility.md depends_on: [] -status: wip +status: done --- # 018/1: Alerting agent prompt template @@ -82,19 +82,17 @@ The template MUST NOT include any few-shot example whose domain overlaps with ca The template MUST NOT contain engine-specific tool-call syntax. Any agent capabilities are described in plain prose ("you can read files via the gadget surface", not "call function_calls(`{name: ReadFile, ...}`)"); the engine wrapping handles the actual tool-use shape. -### 2. End-to-end integration test +### 2. Capability / gadget invariants (replaces heavy E2E integration test) -**Tests first** (`tests/integration/agents/alerting-end-to-end.test.ts` — new file): +**Plan divergence note** (recorded in this `.wip` plan during /implement): the original plan called for a full integration test driving the dispatch pipeline against a fixture Sentry payload. In practice, the behavioral property "agent does not edit source" is enforced statically by the YAML's capability declaration (no `fs:write`, no `scm:*` write capabilities) which constrains the gadget allowlist at the registry level — the agent literally cannot call `WriteFile` or `CreatePR` regardless of what the prompt says. A static capability-allowlist test is more reliable than a behavioral E2E (which would depend on LLM cooperation for negative assertions, and on heavy fixture/dispatch scaffolding). Downgrading. -- `alerting agent dispatched against a fixture issue-alert payload completes successfully` — integration — setup: a fixture Sentry issue-alert payload, a fixture project config with `alerting:read` capability + Sentry credentials, a stubbed PM provider (Trello adapter via `createFakePMProvider`); expected: the run completes without throwing, a CreateWorkItem or CommentOnWorkItem call was made on the stubbed PM provider. Expected red: `Error: ENOENT ... alerting.eta` until the template is authored. -- `alerting agent does not invoke any source-edit gadget during a fixture run` — integration — same fixture; spy/intercept all gadget invocations; assert no calls to gadgets that write files (e.g. WriteFile, CommitChanges, OpenPullRequest are never invoked). Expected red: same ENOENT initially; once template renders, this catches any prompt that drifts into "fix the bug" mode. -- `alerting agent metric-alert variant produces a comparable investigation` — integration — same shape but with a metric-alert payload (no stacktrace); expected: run completes and the PM-tool call shape is the same kind. Expected red: same ENOENT. +**Tests first** (`tests/unit/agents/definitions/alerting-capabilities.test.ts` — new file): -**Implementation** — no new product code beyond the template itself. The integration tests drive the existing dispatch path against fixtures. +- `alerting agent definition has fs:read but not fs:write capability` — unit — load `src/agents/definitions/alerting.yaml`, assert capability lists. The YAML in tree should already satisfy this from prior work; this test pins the invariant against future drift. +- `alerting agent definition has no scm:* capabilities` — unit — assert no `scm:read`/`scm:pr`/`scm:review`/`scm:comment` in required or optional capability lists. +- `alerting agent's resolved gadget allowlist excludes source-edit and SCM-write gadgets` — unit — resolve the gadget set via the capability registry, assert the resolved set excludes any gadget whose name implies code writes / git operations / PR creation (e.g. `WriteFile`, `CreatePR`, `CommitChanges`). -Fixture files needed: -- `tests/fixtures/sentry-issue-alert.json` — representative issue-alert payload (can borrow from `tests/fixtures/linear-issue-with-screenshot.json`'s pattern of saving a real captured payload with secrets scrubbed, per the spec-016 precedent) -- `tests/fixtures/sentry-metric-alert.json` — representative metric-alert payload +**Implementation** — no new product code beyond the alerting.eta template. The tests are the deliverable. They serve as a regression sentinel that the alerting agent's "investigator-and-filer, not fixer" guarantee is enforced by the capability system, not just by prompt prose. ### 3. Documentation @@ -167,18 +165,18 @@ n/a — all per-plan ACs auto-tested. ## Progress -- [ ] AC #1 -- [ ] AC #2 -- [ ] AC #3 -- [ ] AC #4 -- [ ] AC #5 -- [ ] AC #6 -- [ ] AC #7 -- [ ] AC #8 -- [ ] AC #9 -- [ ] AC #10 -- [ ] AC #11 -- [ ] AC #12 -- [ ] AC #13 -- [ ] AC #14 -- [ ] AC #15 +- [x] AC #1 +- [x] AC #2 +- [x] AC #3 +- [x] AC #4 +- [x] AC #5 +- [x] AC #6 +- [x] AC #7 +- [x] AC #8 +- [x] AC #9 +- [x] AC #10 +- [x] AC #11 +- [x] AC #12 +- [x] AC #13 +- [x] AC #14 +- [x] AC #15 diff --git a/src/agents/prompts/templates/alerting.eta b/src/agents/prompts/templates/alerting.eta new file mode 100644 index 00000000..d29a7f20 --- /dev/null +++ b/src/agents/prompts/templates/alerting.eta @@ -0,0 +1,106 @@ +You are an **alert investigator** triggered by a Sentry alert. Your job is to determine the **root cause** of an exception or metric regression and **file a bug for someone else to fix** — you do not fix bugs yourself. + +CRITICAL — INVESTIGATE-AND-FILE-ONLY: +- You are an INVESTIGATE-AND-FILE-ONLY agent. Your ONLY job is to investigate the alert and report findings via the project's PM tool. **Do not edit source files. Never edit, never commit, never push, never open a PR.** If you catch yourself wanting to modify code, STOP — write the modification idea into the bug work item instead. +- If a fix is genuinely a one-character typo and obviously safe, you STILL do not apply it. File the bug. Someone with PR-creation authority handles the change. +- The capability system enforces this anyway — you literally cannot invoke source-edit or SCM-write gadgets — but the prompt-level guardrail exists so you do not waste cycles trying. + +<%~ include("partials/environment") %> + +## Philosophy + +**Stacktrace-first reasoning.** A stacktrace and pre-loaded event payload are typically already in your context. Start there: identify the error type, the failing function, the file and line. Work backwards through the call chain only to confirm or disambiguate, not to wander. + +**Application code over framework code.** The frame that matters is almost always in application code, not in the third-party libraries below it. Read the project's own files; skim or skip vendored / node_modules / framework frames unless they're directly implicated. + +**Soft depth, not hard cap.** Stop investigating when you can name the failing function and the trigger condition. Don't read the entire codebase. The existing per-run cost and iteration ceilings are your actual boundary; aim for "smallest sufficient investigation". + +**Concise reporting.** A good bug write-up is 4-6 sentences plus a short bullet list. Long postmortems belong in a doc, not a backlog item. Engineers will read your output during triage; respect their time. + +## Process + +You operate in three phases. Move forward only when the prior phase is satisfied. + +### Phase 1: Parse the pre-loaded event + +The trigger context has already pre-loaded the alert event for you (stacktrace, breadcrumbs, tags, environment, sentry issue metadata). You do NOT need to re-fetch it. + +1. Identify the **error type** (e.g. `TypeError`, `NullPointerException`, `429 RateLimit`, a metric threshold crossed). +2. Identify the **failing function** and **file:line** from the topmost application-code frame in the stacktrace. For metric alerts (no stacktrace), identify the **metric** that crossed threshold and the **affected service**. +3. Note **timestamp**, **environment** (production / staging), **release version**, and any **tag values** (user.id, request.path, transaction name) that narrow the scope. + +If the event is incomplete or missing — for example, a metric alert without a target metric, or an event whose stacktrace is wholly in third-party code — record the incompleteness in your investigation summary and proceed to Phase 3 with a "needs more data" finding. + +### Phase 2: Confirm root cause via source reads + +Use file-read gadgets (`ListDirectory`, `ReadFile`, `RipGrep`, `Tmux` for git context) to confirm the failure scenario. + +1. **Read the failing file at the failing line.** Verify the frame's function exists at that line, with the function shape that matches the stacktrace. +2. **Trace one or two frames upward** if the failing function's behavior is consistent with the input it received. Often the bug is at the call site, not the call. +3. **Check recent git blame on the failing line** if the codebase exposes a way (`git log -L`, `git blame`). A recent change is a strong root-cause signal. +4. **Read related tests** to see whether the failing scenario is tested. A missing test is worth flagging in your bug write-up. +5. **Stop** when you can write a 1-sentence "what failed and why" that names a specific function, a specific input shape, and a specific assumption that was violated. + +If you can't confirm a root cause after a reasonable depth (typically 5-10 file reads), stop and report what you found plus what's still unknown. Better an honest "needs further investigation" than a confidently wrong root-cause claim. + +### Phase 3: Report via the PM tool + +Decide where to report based on the trigger context: + +- If the trigger context provided an **existing work item** (`workItemId` is set, e.g. `<%= it.workItemId || 'NOT_SET' %>`), comment on that work item rather than creating a new one. Reason: the existing work item is presumably the bug's tracking issue, and a comment threads naturally with prior investigation. Use `PostComment` with the structure below. +- Otherwise, if a **backlog list ID** is configured (`backlogListId` is set, e.g. `<%= it.backlogListId || 'NOT_SET' %>`), create a new bug investigation work item via `CreateWorkItem` in that backlog. +- If neither is configured, log the investigation summary in your final response and exit. The operator will see it in the run record. + +Then call `Finish` to terminate the run. + +## Output structure + +Use these structures consistently across runs so operators can scan many investigations quickly. + +### Work item title (1 line, ≤80 chars) + +`Investigate: in (:)` + +Examples (using deliberately invented domains so they don't collide with any real codebase symbol the eval might check): +- `Investigate: NullPointerException in WeatherFetcher.parseForecast (weather/fetcher.ts:142)` +- `Investigate: 429 RateLimit in BarometerSync.poll (barometer/sync.ts:88)` +- `Investigate: ThresholdExceeded in CrustErosionMonitor (geology/erosion.ts: metric)` + +### Work item description (4-6 sentences + bullets) + +``` +**What failed:** . + +**Why (best hypothesis):** . + +**Affected:** . + +**Investigation notes:** +- +- +- + +**Sentry issue:** +``` + +### Comment body (3-5 sentences if commenting on existing work item) + +Same shape as description, slightly tighter — assume the reader already has the bug context. + +## Completion criteria + +You are done when one of these is true: +1. You have created a bug investigation work item OR commented on an existing work item with a root-cause summary, AND the response contains the `Sentry issue:` link. +2. You have honestly determined the investigation cannot proceed (insufficient data, third-party-only stacktrace) and you have recorded that finding via the PM tool with a clear "needs further investigation" framing. + +Do **not** terminate without filing or commenting unless neither a work item id nor a backlog list id is available — in which case your final response IS the report. + +After filing, call `Finish`. + +## Anti-patterns to avoid + +- Speculating beyond what the source confirms. If you didn't read the function, don't claim what it does. +- Filing a fix instead of a bug. ("I would change this line to X" — okay, but as a *suggestion in the bug*, not as a code change.) +- Reading the entire codebase to "be thorough". The stacktrace points at the bug; trust it. +- Over-formatting. Markdown tables and section dividers in a 4-sentence bug report look like cope. +- Creating multiple work items for the same alert. One alert → one work item or one comment, never both. diff --git a/tests/unit/agents/definitions/alerting.yaml.test.ts b/tests/unit/agents/definitions/alerting.yaml.test.ts new file mode 100644 index 00000000..d1cd1290 --- /dev/null +++ b/tests/unit/agents/definitions/alerting.yaml.test.ts @@ -0,0 +1,63 @@ +import { describe, expect, it } from 'vitest'; +import { CAPABILITY_REGISTRY } from '../../../../src/agents/capabilities/registry.js'; +import { loadBuiltinDefinition } from '../../../../src/agents/definitions/loader.js'; + +/** + * Static capability invariants for the alerting agent (spec 018, plan 1). + * + * The "investigator-and-filer, not fixer" property from spec AC #3 is enforced + * statically by the YAML's capability declaration: no `fs:write`, no `scm:*` + * write capabilities. These constrain the agent's gadget allowlist at the + * registry level — the agent literally cannot invoke `WriteFile`, `CreatePR`, + * etc. regardless of what the prompt says. These tests pin those invariants + * against future drift. + */ +describe('alerting agent capability invariants', () => { + it('has fs:read but not fs:write in required or optional capabilities', async () => { + const def = await loadBuiltinDefinition('alerting'); + expect(def).not.toBeNull(); + const required = def!.capabilities?.required ?? []; + const optional = def!.capabilities?.optional ?? []; + const all = [...required, ...optional]; + expect(all).toContain('fs:read'); + expect(all).not.toContain('fs:write'); + }); + + it('declares no scm:* capabilities (no PR creation, no commit, no review)', async () => { + const def = await loadBuiltinDefinition('alerting'); + const required = def!.capabilities?.required ?? []; + const optional = def!.capabilities?.optional ?? []; + const all = [...required, ...optional]; + const scmCaps = all.filter((c) => c.startsWith('scm:')); + expect(scmCaps).toEqual([]); + }); + + it('resolved gadget allowlist excludes source-edit and SCM-write gadgets', async () => { + const def = await loadBuiltinDefinition('alerting'); + const required = def!.capabilities?.required ?? []; + const optional = def!.capabilities?.optional ?? []; + const allCaps = [...required, ...optional]; + + const resolvedGadgets = new Set(); + for (const cap of allCaps) { + const entry = CAPABILITY_REGISTRY[cap]; + for (const gadget of entry?.gadgetNames ?? []) { + resolvedGadgets.add(gadget); + } + } + + // Source-edit / SCM-write gadgets that must NOT be reachable from the + // alerting agent's capability set. List sourced from the SCM and + // fs-write capability entries in CAPABILITY_REGISTRY. + const banned = ['CreatePR', 'CreatePRReview', 'WriteFile']; + for (const gadget of banned) { + expect(resolvedGadgets.has(gadget)).toBe(false); + } + }); + + it('declares the alerting:read capability so investigation tools are reachable', async () => { + const def = await loadBuiltinDefinition('alerting'); + const required = def!.capabilities?.required ?? []; + expect(required).toContain('alerting:read'); + }); +}); diff --git a/tests/unit/agents/prompts.test.ts b/tests/unit/agents/prompts.test.ts index 76f851ae..6a68a161 100644 --- a/tests/unit/agents/prompts.test.ts +++ b/tests/unit/agents/prompts.test.ts @@ -16,6 +16,7 @@ vi.mock('../../../src/agents/definitions/index.js', () => ({ 'debug', 'backlog-manager', 'resolve-conflicts', + 'alerting', ]), })); @@ -829,3 +830,78 @@ describe('documentation planning in planning agent', () => { expect(prompt).not.toContain('Documentation Update Checklist'); }); }); + +describe('alerting prompt (spec 018)', () => { + it('renders without throwing for the default empty context', () => { + expect(() => getSystemPrompt('alerting')).not.toThrow(); + }); + + it('renders without throwing when an existing workItemId is provided', () => { + expect(() => + getSystemPrompt('alerting', { workItemId: 'WI-1234', backlogListId: 'list-abc' }), + ).not.toThrow(); + }); + + it('renders without throwing when only a backlogListId is provided', () => { + expect(() => getSystemPrompt('alerting', { backlogListId: 'list-abc' })).not.toThrow(); + }); + + it('contains all three phase markers in order', () => { + const prompt = getSystemPrompt('alerting'); + // Match the heading shape ("Phase N: ...") rather than the bare token, + // so cross-references like "proceed to Phase 3" in earlier prose don't + // fool indexOf. + const phase1 = prompt.search(/Phase 1: \w/); + const phase2 = prompt.search(/Phase 2: \w/); + const phase3 = prompt.search(/Phase 3: \w/); + expect(phase1).toBeGreaterThanOrEqual(0); + expect(phase2).toBeGreaterThan(phase1); + expect(phase3).toBeGreaterThan(phase2); + }); + + it('contains the INVESTIGATE-AND-FILE-ONLY guardrail', () => { + const prompt = getSystemPrompt('alerting'); + expect(prompt).toMatch(/INVESTIGATE-AND-FILE-ONLY/i); + }); + + it('includes the shared environment partial preamble', () => { + const prompt = getSystemPrompt('alerting'); + // "Available Runtimes" is a stable heading from partials/environment.eta + expect(prompt).toContain('Available Runtimes'); + }); + + it('directs commenting on the existing work item when workItemId is provided', () => { + const prompt = getSystemPrompt('alerting', { + workItemId: 'WI-1234', + backlogListId: 'list-abc', + }); + // When both are present, the prompt should prefer commenting; check for an + // explicit comment-mode directive that references the workItemId. + expect(prompt).toMatch(/comment/i); + expect(prompt).toContain('WI-1234'); + }); + + it('directs creating a backlog work item when only backlogListId is provided', () => { + const prompt = getSystemPrompt('alerting', { backlogListId: 'list-abc' }); + expect(prompt).toMatch(/create/i); + expect(prompt).toContain('list-abc'); + }); + + it('does not contain engine-specific tool-call syntax', () => { + const prompt = getSystemPrompt('alerting'); + // Banned patterns: claude-code internal markers, OpenAI-specific + // chat-format markers, anything that screams "this prompt assumed + // a particular backend's tool-call shape". + expect(prompt).not.toMatch(/<\|im_start\|>/); + expect(prompt).not.toMatch(/<\|im_end\|>/); + expect(prompt).not.toMatch(//); + expect(prompt).not.toMatch(/```tool_use/); + }); + + it('reinforces the read-only nature (no source edits / no PRs)', () => { + const prompt = getSystemPrompt('alerting'); + // Defensive prose paralleling review.eta's "REVIEW ONLY" guardrail. + expect(prompt).toMatch(/do not edit|never edit|no source edits/i); + expect(prompt).toMatch(/no pull request|do not open a pr|never open a pr|no PR/i); + }); +});