From 40d9b1c27569a417085f57d92a6e96f2c552b671 Mon Sep 17 00:00:00 2001 From: Alex Date: Wed, 10 Jun 2026 08:08:20 -0400 Subject: [PATCH] =?UTF-8?q?feat:=20loop-age=20features=20=E2=80=94=20guard?= =?UTF-8?q?s.protect,=20SubagentStop,=20check=20--against,=20progress-awar?= =?UTF-8?q?e=20bounces?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Four features that put donegate inside agentic fan-out workflows instead of only at the session's terminal stop: - guards.protect + no_protected_edits: pin the files the checks *mean* (package.json scripts, lint/test configs). Hashed into the baseline like the donefile; changed, deleted, or newly shadowing files are findings. Falls back to the git diff when there is no baseline, so it works in CI. Closes the '"test": "exit 0"' indirection hole. - SubagentStop adapter (Claude Code): donegate install claude now wires `donegate hook claude --subagent` — a guards-only tamper scan at every subagent boundary. No checks run, so fan-outs are gated per node at git-diff cost; findings bounce the subagent while it still has the context to undo them. Subagent bounces keep their own ledger so a noisy fan-out can't burn the terminal gate's budget. - donegate check --against : judge mode. Evaluates checks + guards against an explicit ref, ignoring the session baseline — grade each worktree against its fork point from a workflow script, pin CI to the PR base, or re-derive a verdict past a re-blessed baseline. Receipts record kind "explicit"; a nonexistent ref is a config error (exit 2), never a silent pass. - Progress-aware bounce budget: gate.max_bounces now counts consecutive bounces without new progress. A stop attempt with strictly fewer failing checks + tripped guards than the session's best refreshes the budget (and says so in the reason). 
Best-ever is the bar, so oscillating failure sets can't farm refreshes and total bounces stay bounded — loop-until-done semantics without the hostage situation. Docs: new docs/agent-loops.md (terminal gate / per-node scan / judge mode, worktree behavior), spec + hooks + threat-model + README updated. An empty guards.protect adds no receipt noise for existing repos. Co-Authored-By: Claude Fable 5 --- CHANGELOG.md | 23 +++++++++ README.md | 25 +++++++--- docs/agent-loops.md | 90 +++++++++++++++++++++++++++++++++++ docs/hooks.md | 33 +++++++++---- docs/spec.md | 32 +++++++++++-- docs/threat-model.md | 18 ++++--- src/baseline.ts | 18 +++++++ src/check.ts | 22 +++++++-- src/cli.ts | 17 +++++-- src/donefile.ts | 11 ++++- src/guards.ts | 61 +++++++++++++++++++++++- src/hooks.ts | 79 +++++++++++++++++++++++++------ src/install.ts | 20 ++++++-- src/types.ts | 15 +++++- test/donefile.test.ts | 10 ++++ test/guards.test.ts | 106 +++++++++++++++++++++++++++++++++++++++++- test/helpers.ts | 4 ++ test/hooks.test.ts | 79 +++++++++++++++++++++++++++++++ test/install.test.ts | 26 +++++++++++ 19 files changed, 635 insertions(+), 54 deletions(-) create mode 100644 docs/agent-loops.md diff --git a/CHANGELOG.md b/CHANGELOG.md index 61668e0..e911fb1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,29 @@ ## Unreleased +- **`guards.protect` + `no_protected_edits`** — pin the files your checks + *mean* (package.json scripts, lint/test/build configs). They're hashed into + the baseline like the donefile itself; any change, deletion, or new + shadowing file is a finding. Closes the `"test": "exit 0"` hole. Falls back + to the git diff when there's no baseline, so it works in CI too. +- **`SubagentStop` adapter (Claude Code)** — `donegate install claude` now + also wires a guards-only tamper scan at every subagent boundary + (`donegate hook claude --subagent`). No checks run, so fan-out workflows are + gated per node at git-diff cost; subagent bounces use their own ledger. 
+- **`donegate check --against `** — judge mode: evaluate checks + guards + against an explicit git ref, ignoring the session baseline. Makes donegate + scriptable as the deterministic judge in fan-out workflows (grade each + worktree against its fork point) and re-derives verdicts from git history + alone. Receipts record the comparison as `explicit`; a nonexistent ref is a + config error, not a silent pass. +- **Progress-aware bounce budget** — `gate.max_bounces` now counts + *consecutive bounces without new progress*: a stop attempt with strictly + fewer failing checks + tripped guards than the session's best refreshes the + budget (and says so). An agent steadily fixing a long list is never cut off + mid-fix; "best ever" as the bar keeps total bounces bounded. +- `docs/agent-loops.md` — where donegate sits in agentic loops and dynamic + workflows: terminal gate, per-subagent guard scan, judge mode, worktree + behavior. - **The donefile can no longer be deleted or broken out of the way.** The stop hook used to treat a missing DONE.md as "not my repo" and an unparseable one as a config typo — both fail-open, both one `rm` or one bad edit away from diff --git a/README.md b/README.md index d23f256..f74c063 100644 --- a/README.md +++ b/README.md @@ -139,9 +139,12 @@ guards: no_done_edits: true # this file edited mid-session → fail no_new_todos: warn no_debug_artifacts: warn + protect: # files that define what the checks MEAN + - package.json # ("test": "exit 0" is not a fix) + - eslint.config.js gate: - max_bounces: 3 # re-prompts per session before giving up + max_bounces: 3 # no-progress re-prompts before giving up ``` ```` @@ -162,6 +165,7 @@ tries to finish, it diffs reality against that baseline: | `no_deleted_tests` | deleted test files, per-file test counts dropping | fail | | `no_disabled_lint` | `eslint-disable` `biome-ignore` `@ts-ignore` `# noqa` `# type: ignore` `//nolint` `#[allow(...)]` `@SuppressWarnings` `rubocop:disable` — added anywhere | fail | | 
`no_done_edits` | DONE.md modified or deleted mid-session | fail | +| `no_protected_edits` | files listed in `guards.protect` (package.json, lint/test configs — the files that define what the checks *mean*) changed, deleted, or shadowed | fail | | `no_new_todos` | `TODO` / `FIXME` / `HACK` introduced in code | warn | | `no_debug_artifacts` | `console.log` `debugger` `breakpoint()` `pdb.set_trace` `binding.pry` `dbg!` left in non-test code | warn | @@ -176,16 +180,20 @@ followed, so moving a test file is never "deleting" it. Guards are a **ratchet, not a sandbox**: they make the cheap, common shortcuts loud and expensive, with receipts. An agent with shell access can still find -quieter moves — weakening assertions, redefining what `npm test` means in -package.json, re-blessing the baseline itself. What the gate catches, what it -deliberately doesn't, and why CI is the copy of the gate an agent can't touch: -[docs/threat-model.md](docs/threat-model.md). +quieter moves — weakening assertions, re-blessing the baseline itself. What +the gate catches, what it deliberately doesn't, and why CI is the copy of the +gate an agent can't touch: [docs/threat-model.md](docs/threat-model.md). + +Running fan-out workflows with subagents and worktrees? donegate gates those +boundaries too — a guards-only scan at every `SubagentStop`, and +`check --against ` as the deterministic judge over any diff: +[docs/agent-loops.md](docs/agent-loops.md). 
## Works with | | command | mechanism | |---|---|---| -| **Claude Code** | `donegate install claude` | `Stop` hook — blocks the stop, feeds failures back | +| **Claude Code** | `donegate install claude` | `Stop` hook — blocks the stop, feeds failures back · `SubagentStop` — guards-only scan per subagent | | **Codex CLI** | `donegate install codex` | `Stop` hook (`.codex/hooks.json`) | | **Cursor** | `donegate install cursor` | `stop` hook → `followup_message` | | **GitHub Actions** | `donegate install ci` | gates PRs, posts the receipt as a comment | @@ -239,6 +247,11 @@ is not the agent's to edit. **Won't it delete the failing test?** That trips `no_deleted_tests` — file deletions *and* per-file test-count drops. +**Won't it just change what `npm test` means in package.json?** List the +files your checks depend on in `guards.protect` and that trips +`no_protected_edits` — they're hashed into the baseline like the donefile +itself. + **Does this replace CI?** No — it runs *before* the agent declares victory, while it still has context to fix things. CI stays as the backstop (and `donegate install ci` makes CI speak DONE.md too). diff --git a/docs/agent-loops.md b/docs/agent-loops.md new file mode 100644 index 0000000..b6a5383 --- /dev/null +++ b/docs/agent-loops.md @@ -0,0 +1,90 @@ +# donegate in agent loops + +Coding agents run a loop: gather context → take action → verify → repeat. +Increasingly that loop fans out — orchestrators spawn subagents, subagents get +their own worktrees, workflow scripts coordinate the lot. donegate has a +specific seat at three points of that topology, and this page maps them. 
+ +## The three seats + +| where | mechanism | what runs | cost | +|---|---|---|---| +| **terminal stop** | `Stop` hook | full gate: checks + guards | your test suite | +| **subagent boundary** | `SubagentStop` hook (`hook claude --subagent`) | guards only | git diffs + regexes — fast | +| **judge in a fan-out** | `donegate check --against --json` | checks + guards vs an explicit ref | your call (use `--only` to scope) | + +### Terminal stop — the gate on the loop's exit + +The classic donegate role: the agent tries to finish, the gate runs the +repo's definition of done, failure bounces the agent back with the report in +its context. This is the **deterministic verifier** in the loop's +verify-work phase — exit codes and diffs, no LLM judging anything, which also +means it can't share an LLM judge's self-preference for the code that was +just written. + +### Subagent boundary — tamper scan per node + +A full test suite per subagent would be brutal; a tamper scan isn't. The +`SubagentStop` hook (installed automatically by `donegate install claude`) +runs **guards only**: did this subagent skip or delete tests, silence the +linter, touch a protected file, edit the donefile? Findings block the +subagent's completion the same way the stop hook blocks the session — the +finding lands while the subagent still has the context to undo it, instead of +surfacing at the terminal stop after its output was already absorbed. + +Read-only subagents (searchers, reviewers) change nothing, trip nothing, and +pay one git diff. Subagent bounces are tracked in their own ledger +(`:subagent`), so a noisy fan-out can't burn the bounce budget the +terminal gate relies on. + +### Judge mode — `--against` in workflow scripts + +Fan-out patterns end with verification: N agents produced N diffs, something +deterministic should grade them before anything merges. 
`--against` pins the +comparison to an explicit ref — the worktree's fork point, the PR base — +instead of whatever baseline/merge-base resolution would guess. With `--json` +the receipt is machine-readable; the exit code is the verdict +(0 done / 1 checks failed / 2 config error, e.g. a nonexistent ref / 3 bar was lowered). + +```js +// inside a workflow script: judge each worktree before accepting it +const verdict = await bash( + `cd ${worktree} && npx -y donegate check --against ${forkPoint} --json --quiet`, +); +// exit 0 → accept; exit 3 → the diff "passes" because the bar moved — reject loudly +``` + +`--against` deliberately **ignores the session baseline** — judge mode judges +a diff, not a session. That also makes it the answer to a re-blessed +baseline: `donegate check --against origin/main` re-derives the verdict from +git history alone. + +## Worktree behavior + +Linked worktrees get their own `.donegate/` (it's per-root and gitignored). +Inside a fresh worktree there is usually **no session baseline**, so guards +fall back to git comparisons — added-line scans against HEAD or merge-base +still work; baseline-only detections (count drops in untouched files, +protected-file hashes) degrade gracefully. For full-strength guards in a +worktree, record a baseline when it's created (`donegate baseline`) or judge +it from outside with `--against <ref>`. + +## Loop-until-done, bounded + +A fixed bounce cap fights the loop: an agent steadily fixing a long failure +list gets cut off mid-fix. donegate's budget counts **consecutive bounces +without new progress** — when a stop attempt's failure count (failing checks + +tripped guards) drops below the session's best, the budget refreshes and the +agent is told so. "Best ever" is the bar, not "better than last time," so +oscillating between two failure sets can't farm refreshes; total bounces stay +bounded and a wedged session still exits with a red receipt. 
+ +## What this does not change + +The loop's failure modes that donegate addresses are the *mechanical* ones: +declaring done early (agentic laziness that trips a check), lowering the bar +to get green (guards), drifting past the definition of done (DONE.md is +re-read from disk every stop — compaction can't summarize it away). The +*semantic* failure modes — weakened assertions, vacuous tests, an agent +grading its own homework — still need a clean-context reviewer or a human; +see [threat-model.md](threat-model.md) for the honest boundary. diff --git a/docs/hooks.md b/docs/hooks.md index 05bd7da..bc711f2 100644 --- a/docs/hooks.md +++ b/docs/hooks.md @@ -1,12 +1,18 @@ # Agent integrations -`donegate install ` wires the gate into an agent's lifecycle. Two hooks -get installed per agent: +`donegate install ` wires the gate into an agent's lifecycle: - **session start** → `donegate baseline --if-missing --quiet` — snapshots - test files and DONE.md so the tamper guards have something to diff against. + test files, protected files, and DONE.md so the tamper guards have something + to diff against. - **stop** → `donegate hook ` — runs the full gate when the agent tries to finish, and blocks the stop (with the failure report) if the verdict is red. +- **subagent stop** (Claude Code only) → `donegate hook claude --subagent` — + a **guards-only** tamper scan at every subagent boundary. No checks run, so + it's cheap enough to pay per subagent; a subagent that skipped tests or + touched a protected file is bounced while it still has the context to undo + it. Subagent bounces use their own ledger so a noisy fan-out can't burn the + terminal gate's budget. Project-level installs are the default and are **shareable** — commit the config and every teammate's agent is gated too. Add `--global` to install at the user @@ -29,6 +35,9 @@ budget — keep their sum under the stop timeout. 
"Stop": [ { "hooks": [{ "type": "command", "command": "npx -y donegate hook claude" }] } ], + "SubagentStop": [ + { "hooks": [{ "type": "command", "command": "npx -y donegate hook claude --subagent" }] } + ], "SessionStart": [ { "hooks": [{ "type": "command", "command": "npx -y donegate baseline --if-missing --quiet" }] } ] @@ -37,7 +46,9 @@ budget — keep their sum under the stop timeout. ``` On a red verdict the hook prints `{"decision": "block", "reason": ""}` -— Claude Code keeps the session going and feeds the report to the model. +— Claude Code keeps the session going and feeds the report to the model. The +`SubagentStop` entry speaks the same contract but runs guards only (see +[agent-loops.md](agent-loops.md)). ## Codex CLI @@ -85,10 +96,16 @@ because tests were deleted" is visible right in the review. A stop hook that can block forever is a hostage situation, so every block increments a per-session bounce counter (`.donegate/state.json`, pruned after -24h). After `gate.max_bounces` (default 3) the gate stops blocking and lets the -stop through with a loud warning — but it keeps verifying, so the receipt -always tells the truth. Sessions that recover reset their counter on the first -green run. +24h). After `gate.max_bounces` (default 3) **consecutive attempts without new +progress** the gate stops blocking and lets the stop through with a loud +warning — but it keeps verifying, so the receipt always tells the truth. + +Progress refreshes the budget: when a stop attempt's failure count (failing +checks + tripped guards) drops strictly below the session's best so far, the +counter resets and the agent is told so — a session steadily fixing a long +list is never cut off mid-fix. "Best ever" is the bar rather than "better +than last time", so oscillating between failure sets can't farm refreshes and +total bounces stay bounded. Sessions that go green reset entirely. 
## When the gate itself is the target diff --git a/docs/spec.md b/docs/spec.md index aab16be..d38cade 100644 --- a/docs/spec.md +++ b/docs/spec.md @@ -40,14 +40,24 @@ guards: # optional — tamper detection levels no_disabled_lint: true # eslint-disable/noqa/@ts-ignore/nolint added no_new_todos: warn # TODO/FIXME/HACK introduced no_debug_artifacts: warn # console.log/debugger/pdb.set_trace left behind + no_protected_edits: true # files matching `protect` changed mid-session test_globs: # optional — what counts as a test file ["**/*.test.*", "**/*.spec.*", "**/test_*.py", "**/*_test.go", "..."] exclude: [] # optional — files exempt from guard analysis # (for code that legitimately CONTAINS the # patterns: lint configs, scanners, donegate itself) + protect: [] # optional — globs for files the verdict depends on + # but the gate doesn't run: the files that define + # what the check commands MEAN (package.json, + # eslint/jest/pytest/tsconfig configs). Hashed into + # the baseline; any change, deletion, or new + # shadowing file trips no_protected_edits. gate: # optional - max_bounces: 3 # stop-hook re-prompts per session before giving up (1-20) + max_bounces: 3 # consecutive no-progress stop-hook re-prompts per + # session before giving up (1-20); progress — a strictly + # lower failing-check + tripped-guard count than the + # session's best — refreshes the budget ``` Guard levels: `true` (findings fail the gate), `"warn"` (findings are reported @@ -73,10 +83,17 @@ pass?"* Guards ask ***"was the bar lowered so it would pass?"*** They compare the current tree against a **baseline**: 1. a **session baseline** recorded when an agent session starts (test-file - hashes, test/skip counts, the DONE.md hash, and the git HEAD at that moment), or + hashes, test/skip counts, hashes of `guards.protect` files, the DONE.md + hash, and the git HEAD at that moment), or 2. **HEAD**, when there's uncommitted work and no session baseline, or 3. 
the **merge-base with the default branch**, for clean trees (the CI case). +An **explicit ref** (`donegate check --against <ref>`) overrides all three, +including the session baseline: judge mode evaluates a diff, not a session. +The verdict is then derivable from git history alone — useful for grading +fan-out worktrees from a workflow script, pinning CI to the PR base, or +re-deriving a verdict past a re-blessed baseline. + All guard findings are deterministic, diff-based, and cite `file:line` evidence. Guards never call a model and never make network requests. @@ -110,9 +127,14 @@ tries to finish: output tails, guard findings with file:line) is fed back to the agent, which keeps working. Each block increments a per-session **bounce counter**. - **pass** → the stop proceeds; the bounce counter resets; the receipt is green. -- **bounces exhausted** (`gate.max_bounces`) → the gate stops *blocking* but - never stops *verifying*: the stop is allowed with a loud warning and a red - receipt. The gate must not be able to trap an agent in an infinite loop. +- **progress** → a stop attempt whose failure count (failing checks + tripped + guards) is strictly below the session's best **refreshes the bounce budget**: + an agent steadily working down a list is never cut off mid-fix. Best-ever is + the bar, so alternating between failure sets cannot farm refreshes. +- **bounces exhausted** (`gate.max_bounces` consecutive attempts without new + progress) → the gate stops *blocking* but never stops *verifying*: the stop + is allowed with a loud warning and a red receipt. The gate must not be able + to trap an agent in an infinite loop. - a repo **without** a DONE.md → the hook is a silent no-op. A **broken** DONE.md → warn and allow (a config typo must never wedge an agent). - user-initiated aborts are never blocked. 
diff --git a/docs/threat-model.md b/docs/threat-model.md index 5ec6930..5434f35 100644 --- a/docs/threat-model.md +++ b/docs/threat-model.md @@ -20,6 +20,8 @@ Deterministic, diff-based, with `file:line` receipts: | DONE.md edited mid-session | `no_done_edits` → exit 3 | | DONE.md **deleted** mid-session | stop bounced — the baseline remembers it existed | | DONE.md **broken** mid-session (no longer parses) | stop bounced — its hash no longer matches the baseline | +| a `guards.protect` file changed/deleted/shadowed (package.json, lint config) | `no_protected_edits` → exit 3 | +| a subagent tampering inside a fan-out | `SubagentStop` guard scan bounces it at its own boundary | Exit 3 — *"checks pass but the bar was lowered to get there"* — is deliberately a distinct, alarmable signal. @@ -35,12 +37,16 @@ that rather than guess it: count stays flat), updating snapshots to bake a bug in, or writing both the bug and the test that blesses it. Catching these means judging *meaning*; a diff and a regex can't, and donegate refuses to guess. -- **Command indirection.** DONE.md is hash-guarded, but `run: npm test` - resolves through package.json — and package.json, jest.config, pytest.ini, - tsconfig, and the linter config are all agent-editable. `"test": "exit 0"` - is green. Mitigations: prefer direct invocations in your checks - (`npx vitest run` rather than `npm test`) and treat tooling-config diffs as - review-worthy, because they are. +- **Command indirection — unless you pin it.** DONE.md is hash-guarded, but + `run: npm test` resolves through package.json — and package.json, + jest.config, pytest.ini, tsconfig, and the linter config are all + agent-editable. `"test": "exit 0"` is green. The fix is `guards.protect`: + list those files and they're hashed into the baseline; any change, deletion, + or new shadowing config trips `no_protected_edits`. 
What protect *can't* do + is know which files you forgot to list — prefer direct invocations in your + checks (`npx vitest run` rather than `npm test`), protect the configs those + tools read, and treat tooling-config diffs as review-worthy, because they + are. - **Attacks on donegate's own state.** An agent with shell access can run `donegate baseline` to bless its own donefile edit, hand-edit `.donegate/state.json` to burn the bounce budget, or patch diff --git a/src/baseline.ts b/src/baseline.ts index 0525743..e5f7b84 100644 --- a/src/baseline.ts +++ b/src/baseline.ts @@ -203,6 +203,24 @@ export async function createBaseline(config: DoneConfig): Promise { donefile_path: path.relative(config.root, config.sourcePath).split(path.sep).join('/'), test_files: entries, }; + + // Pin the files the verdict depends on but the gate doesn't run — the ones + // that define what the check commands *mean* (package.json scripts, + // lint/test configs). No size cap: a lockfile is large and is exactly the + // kind of file worth pinning. 
+ if (config.guards.protect.length > 0) { + const protectedEntries: Record = {}; + for (const rel of walk(config.root, makeTestFileMatcher(config.guards.protect))) { + try { + const key = rel.split(path.sep).join('/'); + protectedEntries[key] = { sha: sha256(fs.readFileSync(path.join(config.root, rel))) }; + } catch { + // unreadable — skip + } + } + baseline.protected_files = protectedEntries; + } + return baseline; } diff --git a/src/check.ts b/src/check.ts index 658a669..b290e82 100644 --- a/src/check.ts +++ b/src/check.ts @@ -1,5 +1,6 @@ import type { CheckRunSummary, DoneConfig, Receipt } from './types.js'; -import { loadConfig } from './donefile.js'; +import { DonefileError, loadConfig } from './donefile.js'; +import { refExists } from './git.js'; import { resolveComparison, runGuards } from './guards.js'; import { runChecks } from './runner.js'; import { buildReceipt, writeReceipt } from './receipt.js'; @@ -11,6 +12,13 @@ export interface CheckOptions { only?: string[]; /** Skip tamper guards entirely. */ noGuards?: boolean; + /** Skip checks entirely — guards only (the subagent-boundary fast path). */ + noChecks?: boolean; + /** + * Compare against this git ref instead of the session baseline / HEAD / + * merge-base (the CLI's `--against`). Judge mode: evaluates a diff. + */ + comparisonRef?: string; via?: Receipt['via']; onCheckResult?: (result: CheckResult, index: number) => void; /** Pre-loaded config (skips discovery). 
*/ @@ -32,9 +40,17 @@ export async function verify(options: CheckOptions = {}): Promise run a subset (comma-separated) + --against judge the diff vs an explicit git ref --no-guards skip tamper guards --json print the receipt as JSON --quiet verdict only install: --global install to ~/.claude, ~/.codex, or ~/.cursor baseline: --if-missing only record when no baseline exists + hook: --subagent guards-only gate (SubagentStop boundaries) all: -h, --help, -V, --version ${bold('EXIT CODES')} @@ -133,10 +135,11 @@ async function cmdInit(argv: string[]): Promise { } async function cmdCheck(argv: string[]): Promise { - const flags = parseFlags(argv, ['only']); + const flags = parseFlags(argv, ['only', 'against']); const json = flags.bool.has('json'); const quiet = flags.bool.has('quiet'); const only = flags.values.get('only')?.split(',').map((s) => s.trim()).filter(Boolean); + const against = flags.values.get('against'); const config = loadConfig(process.cwd()); if (only) { @@ -157,6 +160,7 @@ async function cmdCheck(argv: string[]): Promise { config, only, noGuards: flags.bool.has('no-guards'), + comparisonRef: against, via: 'cli', onCheckResult: (result) => { if (!json && !quiet) process.stdout.write(renderCheckLine(result) + '\n'); @@ -283,12 +287,17 @@ async function cmdReceipt(argv: string[]): Promise { } async function cmdHook(argv: string[]): Promise { - const agent = argv[0] as HookAgent | undefined; + const flags = parseFlags(argv); + const agent = flags.positional[0] as HookAgent | undefined; if (!agent || !['claude', 'codex', 'cursor'].includes(agent)) { - fail('usage: donegate hook '); + fail('usage: donegate hook [--subagent]'); + } + const subagent = flags.bool.has('subagent'); + if (subagent && agent !== 'claude') { + fail('--subagent is only supported for claude (SubagentStop hooks)'); } const stdin = await readStdin(); - const outcome = await runStopHook(agent, stdin); + const outcome = await runStopHook(agent, stdin, { subagent }); if (outcome.stdout) 
process.stdout.write(outcome.stdout + '\n'); if (outcome.stderr) process.stderr.write(outcome.stderr + '\n'); return outcome.exitCode; diff --git a/src/donefile.ts b/src/donefile.ts index c8db980..592b961 100644 --- a/src/donefile.ts +++ b/src/donefile.ts @@ -36,8 +36,10 @@ const DEFAULT_GUARDS: GuardsConfig = { no_disabled_lint: true, no_new_todos: 'warn', no_debug_artifacts: 'warn', + no_protected_edits: true, test_globs: DEFAULT_TEST_GLOBS, exclude: [], + protect: [], }; /** Bounce budget used when there is no (readable) donefile to say otherwise. */ @@ -156,7 +158,12 @@ export function parseDonefileSource(source: string, sourcePath: string, root: st checks.push({ name, run, timeout }); } - const guards: GuardsConfig = { ...DEFAULT_GUARDS, test_globs: [...DEFAULT_TEST_GLOBS], exclude: [] }; + const guards: GuardsConfig = { + ...DEFAULT_GUARDS, + test_globs: [...DEFAULT_TEST_GLOBS], + exclude: [], + protect: [], + }; if (data.guards !== undefined) { if (!isRecord(data.guards)) throw new DonefileError('"guards" must be a map'); for (const [key, value] of Object.entries(data.guards)) { @@ -167,10 +174,12 @@ export function parseDonefileSource(source: string, sourcePath: string, root: st case 'no_disabled_lint': case 'no_new_todos': case 'no_debug_artifacts': + case 'no_protected_edits': guards[key] = asGuardLevel(value, key); break; case 'test_globs': case 'exclude': + case 'protect': if (!Array.isArray(value) || value.some((v) => typeof v !== 'string')) { throw new DonefileError(`guards.${key} must be a list of glob strings`); } diff --git a/src/guards.ts b/src/guards.ts index 02abab9..78ff8ac 100644 --- a/src/guards.ts +++ b/src/guards.ts @@ -83,9 +83,18 @@ function snippet(text: string): string { * 1. a session baseline recorded by `donegate baseline` (hooks do this automatically) * 2. HEAD, when there is uncommitted work * 3. 
merge-base with the default branch, when the tree is clean + * + * An `explicitRef` (the CLI's `--against`) overrides all of that, including + * the session baseline: judge mode judges a diff, not a session. The caller + * is responsible for validating that the ref exists. */ -export async function resolveComparison(config: DoneConfig): Promise { +export async function resolveComparison(config: DoneConfig, explicitRef?: string): Promise { const root = config.root; + + if (explicitRef) { + return { kind: 'explicit', ref: explicitRef, baseline: null }; + } + const inGit = await isGitRepo(root); const baseline = loadBaseline(root); @@ -176,9 +185,13 @@ function skippedAll(config: DoneConfig, note: string): GuardResult[] { 'no_disabled_lint', 'no_new_todos', 'no_debug_artifacts', + 'no_protected_edits', ] as const; return names .filter((n) => config.guards[n] !== false) + // An empty guards.protect means the guard is unconfigured, not skipped — + // don't add noise to every receipt that never opted in. + .filter((n) => n !== 'no_protected_edits' || config.guards.protect.length > 0) .map((name) => ({ name, status: 'skipped' as const, findings: [], note })); } @@ -224,6 +237,52 @@ export async function runGuards(config: DoneConfig, comparison: ComparisonContex results.push(makeResult('no_done_edits', config.guards.no_done_edits, findings)); } + // ── no_protected_edits ───────────────────────────────────────────────────── + // DONE.md says `run: npm test`, but what `npm test` *means* lives in files + // the agent can edit (package.json, lint/test configs). guards.protect pins + // them: changed, deleted, or newly shadowed → finding. 
+ if (config.guards.protect.length > 0) { + const findings: GuardFinding[] = []; + const isProtected = makeTestFileMatcher(config.guards.protect); + const blessHint = 'if this change is the human\'s, bless it with `donegate baseline`'; + if (baseline?.protected_files) { + const seen = new Set(Object.keys(baseline.protected_files)); + for (const [file, entry] of Object.entries(baseline.protected_files)) { + try { + const current = sha256(fs.readFileSync(path.join(config.root, file))); + if (current !== entry.sha) { + findings.push({ + file, + detail: `protected file modified since the baseline — it defines what the checks mean (${blessHint})`, + }); + } + } catch { + findings.push({ file, detail: 'protected file is missing — it existed when the baseline was taken' }); + } + } + // A *new* file matching protect globs can shadow an existing config + // (e.g. a more-local eslint config) — that's a change in meaning too. + for (const file of inputs.added.keys()) { + if (isProtected(file) && !seen.has(file)) { + findings.push({ file, detail: `new file matches guards.protect (${blessHint})` }); + } + } + } else { + // No baseline (CI, --against, plain diffs): fall back to the git diff. 
+ const flagged = new Set(); + for (const file of inputs.added.keys()) { + if (isProtected(file)) flagged.add(file); + } + for (const file of [...inputs.modifiedPaths, ...inputs.deletedPaths]) { + if (isProtected(file)) flagged.add(file); + } + for (const file of flagged) { + findings.push({ file, detail: 'protected file changed in this diff — it defines what the checks mean' }); + } + } + results.push(makeResult('no_protected_edits', config.guards.no_protected_edits, findings)); + } + // ── no_deleted_tests ─────────────────────────────────────────────────────── { const findings: GuardFinding[] = []; diff --git a/src/hooks.ts b/src/hooks.ts index 4a0c404..b9043d2 100644 --- a/src/hooks.ts +++ b/src/hooks.ts @@ -20,7 +20,7 @@ interface HookPayload { } interface BounceState { - sessions: Record; + sessions: Record; } function statePath(root: string): string { @@ -168,26 +168,58 @@ function findOrphanedBaseline(cwd: string): { root: string; baseline: Baseline } } } -/** Block the stop (incrementing the session's bounce count), or give up loudly once the budget is spent. */ +/** + * Block the stop (incrementing the session's bounce count), or give up loudly + * once the budget is spent. + * + * The budget counts *consecutive bounces without new progress*. When `score` + * is provided (failing checks + tripped guards), a score strictly below the + * session's best refreshes the budget: an agent steadily fixing a long list + * shouldn't be cut off mid-fix. Best-ever (not last-attempt) is the bar, so + * oscillating between two failure sets can't farm refreshes — total bounces + * stay bounded by max_bounces × (initial score + 1). + */ function bounceOrGiveUp(options: { agent: HookAgent; root: string; sessionId: string; maxBounces: number; + score?: number; reason: (attempt: number) => string; giveUp: (bounces: number) => string; }): HookOutcome { const state = loadState(options.root); - const bounces = state.sessions[options.sessionId]?.bounces ?? 
0; + const entry = state.sessions[options.sessionId]; + let bounces = entry?.bounces ?? 0; + let best = entry?.best; + let refreshed = false; + + if (typeof options.score === 'number') { + if (typeof best !== 'number') { + best = options.score; // first scored attempt sets the bar + } else if (options.score < best) { + best = options.score; + refreshed = true; + bounces = 0; + } + } if (bounces >= options.maxBounces) { return { stdout: null, stderr: options.giveUp(bounces), exitCode: 0 }; } const attempt = bounces + 1; - state.sessions[options.sessionId] = { bounces: attempt, updated_at: new Date().toISOString() }; + state.sessions[options.sessionId] = { + bounces: attempt, + updated_at: new Date().toISOString(), + ...(typeof best === 'number' ? { best } : {}), + }; saveState(options.root, state); - const reason = options.reason(attempt); + + let reason = options.reason(attempt); + if (refreshed) { + reason += '\n\n(donegate noticed progress since the last attempt — the bounce budget was refreshed.)'; + } if (options.agent === 'cursor') { return { stdout: JSON.stringify({ followup_message: reason }), stderr: null, exitCode: 0 }; @@ -196,7 +228,11 @@ function bounceOrGiveUp(options: { return { stdout: JSON.stringify({ decision: 'block', reason }), stderr: null, exitCode: 0 }; } -export async function runStopHook(agent: HookAgent, rawStdin: string): Promise<HookOutcome> { +export async function runStopHook( + agent: HookAgent, + rawStdin: string, + mode: { subagent?: boolean } = {}, +): Promise<HookOutcome> { const payload = parsePayload(rawStdin); const cwd = resolveCwd(payload); @@ -206,6 +242,9 @@ export async function runStopHook(agent: HookAgent, rawStdin: string): Promise @@ -253,7 +292,7 @@ export async function runStopHook(agent: HookAgent, rawStdin: string): Promise @@ -273,18 +312,27 @@ export async function runStopHook(agent: HookAgent, rawStdin: string): Promise buildReason(summary, attempt, config.gate.max_bounces), giveUp: (bounces) => `donegate: ✗ still NOT DONE after 
${bounces} bounce${bounces > 1 ? 's' : ''} — giving up and allowing the stop. The receipt is red: ${path.join(DONEGATE_DIR, 'receipts', 'latest.json')}`, diff --git a/src/install.ts b/src/install.ts index dbb2c90..ea0886c 100644 --- a/src/install.ts +++ b/src/install.ts @@ -4,8 +4,17 @@ import path from 'node:path'; export type InstallTarget = 'claude' | 'codex' | 'cursor' | 'ci'; -export const HOOK_COMMANDS: Record<Exclude<InstallTarget, 'ci'>, { stop: string; baseline: string }> = { - claude: { stop: 'npx -y donegate hook claude', baseline: 'npx -y donegate baseline --if-missing --quiet' }, +export const HOOK_COMMANDS: Record< + Exclude<InstallTarget, 'ci'>, + { stop: string; baseline: string; subagentStop?: string } +> = { + claude: { + stop: 'npx -y donegate hook claude', + baseline: 'npx -y donegate baseline --if-missing --quiet', + // Guards-only tamper scan at every subagent boundary — fast (git diffs, + // no checks), so fan-out workflows are gated per node, not just at the end. + subagentStop: 'npx -y donegate hook claude --subagent', + }, codex: { stop: 'npx -y donegate hook codex', baseline: 'npx -y donegate baseline --if-missing --quiet' }, cursor: { stop: 'npx -y donegate hook cursor', baseline: 'npx -y donegate baseline --if-missing --quiet' }, }; @@ -17,6 +26,8 @@ export const HOOK_COMMANDS: Record<Exclude<InstallTarget, 'ci'>, { stop: string; */ const STOP_TIMEOUT_SECONDS = 1800; const BASELINE_TIMEOUT_SECONDS = 120; +/** Guards only — no checks run — but big-repo git diffs and a cold npx need headroom. 
*/ +const SUBAGENT_TIMEOUT_SECONDS = 300; export interface InstallResult { target: InstallTarget; @@ -140,6 +151,9 @@ export function installAgent( } else { changed = mergeNestedHooks(config, 'Stop', commands.stop, STOP_TIMEOUT_SECONDS) || changed; changed = mergeNestedHooks(config, 'SessionStart', commands.baseline, BASELINE_TIMEOUT_SECONDS) || changed; + if (commands.subagentStop) { + changed = mergeNestedHooks(config, 'SubagentStop', commands.subagentStop, SUBAGENT_TIMEOUT_SECONDS) || changed; + } } if (!changed) return { target, file, action: 'already-installed' }; @@ -170,7 +184,7 @@ export function uninstallAgent( } } } else { - for (const event of ['Stop', 'SessionStart']) { + for (const event of ['Stop', 'SessionStart', 'SubagentStop']) { if (removeNestedHooks(config, event)) changed = true; } } diff --git a/src/types.ts b/src/types.ts index 4e697ff..af8785d 100644 --- a/src/types.ts +++ b/src/types.ts @@ -21,6 +21,8 @@ export interface GuardsConfig { no_disabled_lint: GuardLevel; no_new_todos: GuardLevel; no_debug_artifacts: GuardLevel; + /** Findings when files matching `protect` change mid-session. */ + no_protected_edits: GuardLevel; /** Glob patterns that identify test files. */ test_globs: string[]; /** @@ -28,6 +30,13 @@ export interface GuardsConfig { * skip/suppression patterns (lint configs, pattern scanners, donegate itself). */ exclude: string[]; + /** + * Globs for files the verdict depends on but the gate doesn't run — the + * files that define what the check commands *mean* (package.json, lint/ + * test/build configs). Hashed into the baseline; changes trip + * `no_protected_edits`. 
+ */ + protect: string[]; } export interface GateConfig { @@ -74,7 +83,7 @@ export interface GuardResult { note?: string; } -export type BaselineKind = 'session' | 'head' | 'merge-base' | 'none'; +export type BaselineKind = 'session' | 'head' | 'merge-base' | 'explicit' | 'none'; export interface BaselineFileEntry { sha: string; @@ -90,6 +99,8 @@ export interface Baseline { donefile_sha: string; donefile_path: string; test_files: Record; + /** Hashes of files matching guards.protect (absent when protect is empty). */ + protected_files?: Record; } export interface ComparisonContext { @@ -128,7 +139,7 @@ export interface Receipt { checks: CheckResult[]; guards: GuardResult[]; /** Which surface produced the receipt. */ - via: 'cli' | 'claude' | 'codex' | 'cursor' | 'run'; + via: 'cli' | 'claude' | 'codex' | 'cursor' | 'run' | 'subagent'; /** sha256 of the receipt body (excluding this field). */ receipt_sha: string; } diff --git a/test/donefile.test.ts b/test/donefile.test.ts index dc1c863..a3afc32 100644 --- a/test/donefile.test.ts +++ b/test/donefile.test.ts @@ -135,3 +135,13 @@ test('findDonefile walks upward and prefers DONE.md', () => { cleanup(root); } }); + +test('parses guards.protect and no_protected_edits', () => { + const config = parseDonefileSource( + 'checks:\n - name: a\n run: x\nguards:\n no_protected_edits: warn\n protect:\n - package.json\n - "*.config.js"\n', + '/repo/done.yml', + '/repo', + ); + assert.deepEqual(config.guards.protect, ['package.json', '*.config.js']); + assert.equal(config.guards.no_protected_edits, 'warn'); +}); diff --git a/test/guards.test.ts b/test/guards.test.ts index 914aa14..a0f8fd7 100644 --- a/test/guards.test.ts +++ b/test/guards.test.ts @@ -2,9 +2,10 @@ import { test } from 'node:test'; import assert from 'node:assert/strict'; import { loadConfig } from '../src/donefile.js'; import { writeBaseline } from '../src/baseline.js'; +import { verify } from '../src/check.js'; import { resolveComparison, runGuards } from 
'../src/guards.js'; import type { GuardResult } from '../src/types.js'; -import { BASIC_DONEFILE, cleanup, gitCommitAll, gitInit, read, rm, tmpdir, write } from './helpers.js'; +import { BASIC_DONEFILE, cleanup, gitCommitAll, gitHead, gitInit, read, rm, tmpdir, write } from './helpers.js'; const TEST_FILE = `import { test } from 'node:test'; @@ -296,3 +297,106 @@ test('outside git WITH baseline: snapshot comparisons still work', async () => { cleanup(root); } }); + +const PROTECT_DONEFILE = `# DoD +\`\`\`yaml +checks: + - name: ok + run: node -e "process.exit(0)" +guards: + protect: + - package.json + - "*.config.js" +\`\`\` +`; + +const PKG_JSON = '{ "scripts": { "test": "node run-tests.js" } }\n'; + +async function setupProtectRepo(): Promise<string> { + const root = tmpdir(); + gitInit(root); + write(root, 'DONE.md', PROTECT_DONEFILE); + write(root, 'package.json', PKG_JSON); + gitCommitAll(root, 'base'); + return root; +} + +test('no_protected_edits: pinned files cannot be changed, deleted, or shadowed quietly', async () => { + const root = await setupProtectRepo(); + try { + const config = loadConfig(root); + await writeBaseline(config); + + // modified — redefining what `npm test` means + write(root, 'package.json', '{ "scripts": { "test": "exit 0" } }\n'); + let g = guard(await runGuards(config, await resolveComparison(config)), 'no_protected_edits'); + assert.equal(g.status, 'fail'); + assert.match(g.findings[0]!.detail, /modified since the baseline/); + + // deleted + rm(root, 'package.json'); + g = guard(await runGuards(config, await resolveComparison(config)), 'no_protected_edits'); + assert.equal(g.status, 'fail'); + assert.match(g.findings[0]!.detail, /missing/); + + // restored byte-for-byte → clean again; a NEW file matching protect globs is not + write(root, 'package.json', PKG_JSON); + g = guard(await runGuards(config, await resolveComparison(config)), 'no_protected_edits'); + assert.equal(g.status, 'pass'); + write(root, 'extra.config.js', 'module.exports 
= {};\n'); + g = guard(await runGuards(config, await resolveComparison(config)), 'no_protected_edits'); + assert.equal(g.status, 'fail'); + assert.match(g.findings[0]!.detail, /new file matches/); + } finally { + cleanup(root); + } +}); + +test('no_protected_edits: falls back to the git diff when there is no baseline (CI mode)', async () => { + const root = await setupProtectRepo(); + try { + const config = loadConfig(root); + write(root, 'package.json', '{ "scripts": { "test": "exit 0" } }\n'); + const g = guard(await runGuards(config, await resolveComparison(config)), 'no_protected_edits'); + assert.equal(g.status, 'fail'); + assert.match(g.findings[0]!.detail, /changed in this diff/); + } finally { + cleanup(root); + } +}); + +test('check --against judges an explicit ref — even past a re-blessed baseline', async () => { + const root = await setupRepo(); + try { + const base = gitHead(root); + await writeBaseline(loadConfig(root)); + + // skip a test, commit it, and re-bless the baseline: a session comparison + // is now blind to the skip. The explicit ref is not. 
+ write(root, 'test/app.test.ts', TEST_FILE.replace("test('two'", "test.skip('two'")); + gitCommitAll(root, 'sneaky'); + await writeBaseline(loadConfig(root)); + + const blessed = await verify({ cwd: root, config: loadConfig(root), via: 'cli' }); + assert.equal(blessed.exitCode, 0); + + const judged = await verify({ cwd: root, config: loadConfig(root), comparisonRef: base, via: 'cli' }); + assert.equal(judged.receipt.baseline.kind, 'explicit'); + assert.equal(judged.exitCode, 3); + assert.ok(judged.receipt.guards.some((g) => g.name === 'no_new_skips' && g.status === 'fail')); + } finally { + cleanup(root); + } +}); + +test('check --against refuses a ref that does not exist', async () => { + const root = await setupRepo(); + try { + await assert.rejects( + verify({ cwd: root, config: loadConfig(root), comparisonRef: 'not-a-ref', via: 'cli' }), + /not a commit/, + ); + } finally { + cleanup(root); + } +}); diff --git a/test/helpers.ts b/test/helpers.ts index 548f714..693d82e 100644 --- a/test/helpers.ts +++ b/test/helpers.ts @@ -35,6 +35,10 @@ export function gitCommitAll(root: string, message = 'commit'): void { execFileSync('git', ['commit', '-q', '-m', message], { cwd: root, stdio: 'pipe' }); } +export function gitHead(root: string): string { + return execFileSync('git', ['rev-parse', 'HEAD'], { cwd: root, stdio: 'pipe' }).toString().trim(); +} + export const BASIC_DONEFILE = `# Definition of Done \`\`\`yaml diff --git a/test/hooks.test.ts b/test/hooks.test.ts index c0c8927..bd8597b 100644 --- a/test/hooks.test.ts +++ b/test/hooks.test.ts @@ -227,3 +227,82 @@ test('cursor: aborted turns are not gated even when the donefile is gone', async cleanup(root); } }); + +// Assembled at runtime so the repo's own no_new_skips guard never sees the +// literal marker in this (non-excluded) test file. 
+const SKIP_CALL = ['test', 'skip'].join('.'); + +test('subagent boundary: guards-only — failing checks do not block, tampering does', async () => { + const root = await setup(FAILING_DONEFILE); + try { + write(root, 'test/app.test.ts', "import { test } from 'node:test';\ntest('one', () => {});\ntest('two', () => {});\n"); + gitCommitAll(root); + await runBaselineHook({ ifMissing: false, quiet: true, cwd: root }); + + // the donefile's check always fails, but the boundary doesn't run checks + const clean = await runStopHook('claude', payload(root), { subagent: true }); + assert.equal(clean.stdout, null); + assert.match(clean.stderr ?? '', /subagent boundary clean/); + + // tamper at the boundary → blocked with the guard finding + write(root, 'test/app.test.ts', `import { test } from 'node:test';\ntest('one', () => {});\n${SKIP_CALL}('two', () => {});\n`); + const tampered = await runStopHook('claude', payload(root), { subagent: true }); + assert.ok(tampered.stdout, 'expected a block'); + const response = JSON.parse(tampered.stdout) as { decision: string; reason: string }; + assert.equal(response.decision, 'block'); + assert.match(response.reason, /no_new_skips/); + + // subagent bounces live in their own ledger — the terminal gate still starts fresh + const main = await runStopHook('claude', payload(root)); + assert.match(JSON.parse(main.stdout!).reason as string, /attempt 1\/2/); + } finally { + cleanup(root); + } +}); + +const PROGRESS_DONEFILE = `# DoD +\`\`\`yaml +checks: + - name: c1 + run: node -e "process.exit(require('fs').existsSync('fix1.txt') ? 0 : 1)" + - name: c2 + run: node -e "process.exit(require('fs').existsSync('fix2.txt') ? 
0 : 1)" +gate: + max_bounces: 2 +\`\`\` +`; + +test('progress refreshes the bounce budget; stalling exhausts it', async () => { + const root = await setup(PROGRESS_DONEFILE); + try { + const block = async () => { + const outcome = await runStopHook('claude', payload(root)); + assert.ok(outcome.stdout, 'expected a block'); + return JSON.parse(outcome.stdout) as { decision: string; reason: string }; + }; + + // two failing checks, no movement: the budget counts down + assert.match((await block()).reason, /attempt 1\/2/); + assert.match((await block()).reason, /attempt 2\/2/); + + // fixing one check is progress → budget refreshed, loudly + write(root, 'fix1.txt', 'fixed\n'); + const refreshed = await block(); + assert.match(refreshed.reason, /attempt 1\/2/); + assert.match(refreshed.reason, /bounce budget was refreshed/); + + // stalling at the new best exhausts the refreshed budget + assert.match((await block()).reason, /attempt 2\/2/); + const spent = await runStopHook('claude', payload(root)); + assert.equal(spent.stdout, null); + assert.match(spent.stderr ?? '', /giving up/); + + // finishing the job still works and clears the session + write(root, 'fix2.txt', 'fixed\n'); + const done = await runStopHook('claude', payload(root)); + assert.equal(done.stdout, null); + assert.match(done.stderr ?? 
'', /✓ DONE/); + } finally { + cleanup(root); + } +}); diff --git a/test/install.test.ts b/test/install.test.ts index 4fd2ae4..8972ebd 100644 --- a/test/install.test.ts +++ b/test/install.test.ts @@ -171,3 +171,29 @@ test('ensureGitignore appends once', () => { cleanup(root); } }); + +test('claude install wires the subagent boundary; uninstall removes it', () => { + const root = tmpdir(); + try { + installAgent('claude', root); + const config = JSON.parse(read(root, '.claude/settings.json')); + assert.match(config.hooks.SubagentStop[0].hooks[0].command, /donegate hook claude --subagent/); + + uninstallAgent('claude', root); + const after = JSON.parse(read(root, '.claude/settings.json')); + assert.equal(after.hooks.SubagentStop, undefined); + } finally { + cleanup(root); + } +}); + +test('codex and cursor get no subagent hook (no such event)', () => { + const root = tmpdir(); + try { + installAgent('codex', root); + const config = JSON.parse(read(root, '.codex/hooks.json')); + assert.equal(config.hooks.SubagentStop, undefined); + } finally { + cleanup(root); + } +});