diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index 04862c1c..52de8ec5 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -12,7 +12,7 @@ "name": "PACT", "source": "./pact-plugin", "description": "Orchestration harness that turns Claude Code into a coordinated team of specialist AI agents", - "version": "3.20.4", + "version": "3.21.0", "author": { "name": "Synaptic-Labs-AI" }, diff --git a/README.md b/README.md index dfa1700b..87d619aa 100644 --- a/README.md +++ b/README.md @@ -471,7 +471,7 @@ When installed as a plugin, PACT lives in your plugin cache: │ └── cache/ │ └── pact-plugin/ │ └── PACT/ -│ └── 3.20.4/ # Plugin version +│ └── 3.21.0/ # Plugin version │ ├── agents/ │ ├── commands/ │ ├── skills/ diff --git a/pact-plugin/.claude-plugin/plugin.json b/pact-plugin/.claude-plugin/plugin.json index ae9256ab..8127798e 100644 --- a/pact-plugin/.claude-plugin/plugin.json +++ b/pact-plugin/.claude-plugin/plugin.json @@ -1,6 +1,6 @@ { "name": "PACT", - "version": "3.20.4", + "version": "3.21.0", "description": "Orchestration harness that turns Claude Code into a coordinated team of specialist AI agents", "author": { "name": "Synaptic-Labs-AI", diff --git a/pact-plugin/README.md b/pact-plugin/README.md index 83b86716..f64b658a 100644 --- a/pact-plugin/README.md +++ b/pact-plugin/README.md @@ -1,6 +1,6 @@ # PACT — Orchestration Harness for Claude Code -> **Version**: 3.20.4 +> **Version**: 3.21.0 Turn a single Claude Code session into a managed team of specialist AI agents that prepare, design, build, and test your code systematically. diff --git a/pact-plugin/commands/unwatch-inbox.md b/pact-plugin/commands/unwatch-inbox.md new file mode 100644 index 00000000..7c6be52e --- /dev/null +++ b/pact-plugin/commands/unwatch-inbox.md @@ -0,0 +1,89 @@ +--- +description: Tear down the lead's inbox-watch Monitor — stop Monitor task, unlink STATE_FILE atomically. Hook-invoked on last active teammate task transition; user-invoked manually to silence Monitor noise mid-session. +--- +# Unwatch Inbox + +Tear down the lead-side wake mechanism armed by [`/PACT:watch-inbox`](watch-inbox.md): stop the Monitor task and unlink the registry sidecar. + +## Overview + +Best-effort cleanup. Tolerates a Monitor that died silently mid-session — the wake mechanism is opportunistic by design (no in-session watchdog; see [`/PACT:watch-inbox`](watch-inbox.md#failure-modes)), so `TaskStop` on a stale `monitor_task_id` is the expected path under silent-death conditions. + +## When to Invoke + +| Trigger | Site | +|---|---| +| Last active teammate task reaches terminal status (`completed` or `deleted`; PostToolUse hook detects 1→0 transition) | `wake_lifecycle_emitter.py` `additionalContext` directive | +| Session-end safety net (count already 0; redundant-but-correct hook-silent-fail catch) | `/wrap-up` command body Skill invocation | +| User-typed manual invocation (silence Monitor noise mid-session, e.g., during long-running solo work) | `/PACT:unwatch-inbox` slash invocation | + +## Operation + +Single procedure — the command IS the operation. Order is load-bearing: stop the live Monitor before unlinking the registry sidecar. + +0. **Lead-session guard** (see `## Lead-Session Guard` below). If the current session is not the team-lead session, refuse and return — do NOT proceed to step 1. +1. Read STATE_FILE at `~/.claude/teams/{team_name}/inbox-wake-state.json`; if absent or invalid (malformed JSON / `v ≠ 1`), skip steps 2-4 — nothing to stop. +2. Validate `STATE_FILE.monitor_task_id` against the Claude Code task-id allowlist regex `^[a-z0-9]{6,}\Z`. If invalid, skip step 4 — still proceed to step 5 (STATE_FILE unlink is independently safe and clears the corrupt registry entry). +3. Validate `STATE_FILE.armed_by_session_id` against the current session_id read from `pact-session-context.json`. If mismatch, skip step 4 — still proceed to step 5 (planted/cross-session STATE_FILE gets cleaned without weaponizing TaskStop). +4. `TaskStop(STATE_FILE.monitor_task_id)` — **ignoring not-found errors** (the Monitor may have died silently mid-session). +5. Unlink STATE_FILE — `Path.unlink(missing_ok=True)`. + +Ordering rationale: the inverse ordering would leave a brief window where a STATE_FILE-less Monitor still runs but the next [`/PACT:watch-inbox`](watch-inbox.md) sees no STATE_FILE and re-arms — creating an orphan. + +## Lead-Session Guard + +Refuse to execute when invoked from a teammate session. Teardown is lead-only: a teammate process calling `TaskStop` on the lead's Monitor task ID is a cross-session operation that, even if the substrate permitted it, would silently kill the lead's wake mechanism without the lead's knowledge. + +```python +team_name = pact_session_context["team_name"] +session_id = pact_session_context["session_id"] +team_config = json.loads( + (Path.home() / ".claude" / "teams" / team_name / "config.json").read_text() +) +if session_id != team_config.get("leadSessionId"): + refuse( + "This command only runs in the team-lead session. " + "Teammates do not arm or tear down the lead's Monitor." + ) + return +``` + +**Audit**: signal source is `session_id == team_config.leadSessionId`, NOT a hypothetical `agent_type` field on `pact-session-context.json`. The session-context schema is `{team_name, session_id, project_dir, plugin_root, started_at}` by design; the team config is the single source of truth for team membership and lead identity. An editing LLM tempted to "just add agent_type to session-context" should stop — replicating that signal into session-context creates two-source-of-truth drift. The guard runs at command-invoke time; the paired arm command's directive-emit sites in `wake_lifecycle_emitter.py` and `session_init.py` are lead-side already, so this guard's purpose is to defend against user-typed `/PACT:unwatch-inbox` from a teammate session. This guard is foot-gun protection (typo / wrong-window / cross-session-LLM speculation), not a security boundary against same-user adversaries. `leadSessionId` is read from `team_config.json` which has no integrity check; same-user write authority can spoof it. The user-local-trust assumption bounds the residual exposure — same-user attacker has equivalent capability via direct os tooling anyway. + +## Teardown Block + +Best-effort sequence. Tolerates a Monitor that died silently mid-session. + +``` +1. Read STATE_FILE; if absent or invalid (malformed JSON / v ≠ 1), skip steps 2-4. +2. Validate STATE_FILE.monitor_task_id against ^[a-z0-9]{6,}\Z. If invalid, skip step 4. +3. Validate STATE_FILE.armed_by_session_id == current pact_session_context["session_id"]. If mismatch, skip step 4. +4. TaskStop(STATE_FILE.monitor_task_id) — ignoring not-found errors. +5. Path.unlink(STATE_FILE, missing_ok=True). # TOCTOU window between resolve() and unlink() is bounded by user-local-trust assumption — same-user attacker has equivalent capability via direct os.unlink. +``` + +`TaskStop` will return a `tool_use_error` if the Monitor died silently. Tolerate not-found and continue to step 5. Do not abort teardown on `TaskStop` failure; an undeleted STATE_FILE is worse than a failed `TaskStop` because it leaves a phantom registry entry that confuses the next [`/PACT:watch-inbox`](watch-inbox.md) invocation. + +**Audit**: F6 tolerance phrasing (**"ignoring not-found errors"**) is the load-bearing fragment. An editing LLM "tightening up error handling" by removing the phrase silently restores crash-on-stale-ID. The principle anchor — teardown is best-effort because a torn-down session may have already lost its Monitor — tells the editing LLM why the phrase exists. The wake mechanism is opportunistic; teardown must be tolerant. The `^[a-z0-9]{6,}\Z` regex is the Claude Code task-id allowlist (most task IDs are short alphanumeric, e.g. `bu4hxc2bh`, `b3w334skp`); a STATE_FILE that fails this check is corrupt and must not flow into `TaskStop` as an unsanitized argument. The `\Z` end-anchor is deliberate: Python's `$` matches before a trailing newline (so `validid\nrm -rf ~` would pass `^[a-z0-9]{6,}$`), while `\Z` rejects trailing newlines. This matches the `_UUID_PATTERN` precedent in `pact-plugin/hooks/session_end.py` for the same reason; an editing LLM tempted to "simplify" back to `$` re-opens the trailing-newline bypass. Skipping `TaskStop` on validation failure is correct — the unlink that follows clears the corrupt registry entry, and the next `/PACT:watch-inbox` cold-starts a fresh Monitor with a fresh ID. **Same-session validation prevents cross-session TaskStop weaponization. A concurrent same-user session that planted STATE_FILE will have its `armed_by_session_id` mismatched against the current Teardown session's `session_id`; Teardown refuses TaskStop but proceeds to unlink so the planted file doesn't accumulate.** Validation ordering is layered defense: task-id-regex (cheap shape check) gates first, then same-session integrity check; both fail-open to unlink so a planted or corrupt STATE_FILE is always cleaned. An editing LLM tempted to drop the integrity check "because the regex already validates" misses the failure mode entirely — regex-valid task IDs from the attacker's session can still point at the lead's active work; only the session_id match prevents cross-session TaskStop. + +## Failure Modes + +### Monitor died silently mid-session + +`TaskStop(STATE_FILE.monitor_task_id)` returns `tool_use_error: No task found ...`. This is the EXPECTED path under silent-death conditions, not a defect. Tolerate the error and continue to the STATE_FILE unlink; the teardown's purpose is to clean the registry sidecar regardless of the Monitor's actual state. + +### STATE_FILE absent + +If STATE_FILE does not exist, the wake was never armed (or was already torn down). Skip steps 2-3; this is a no-op success. + +## Verification + +Confirm teardown: + +1. STATE_FILE absent at `~/.claude/teams/{team_name}/inbox-wake-state.json`. +2. `STATE_FILE.monitor_task_id` (read before teardown) no longer resolves to a live Monitor task — either successfully stopped or already dead. + +## References + +- [`/PACT:watch-inbox`](watch-inbox.md) — paired arm command. +- [Communication Charter Part I §Wake Mechanism](../protocols/pact-communication-charter.md#wake-mechanism) — protocol contract surface. diff --git a/pact-plugin/commands/watch-inbox.md b/pact-plugin/commands/watch-inbox.md new file mode 100644 index 00000000..991a0c46 --- /dev/null +++ b/pact-plugin/commands/watch-inbox.md @@ -0,0 +1,182 @@ +--- +description: Arm the lead's inbox-watch Monitor — spawn a Monitor on the team-lead inbox, write STATE_FILE atomically. Hook-invoked on first active teammate task; user-invoked manually for debug or recovery. +--- +# Watch Inbox + +Arm the lead-side wake mechanism: a single `Monitor` task watches `inboxes/team-lead.json` via `wc -c` byte-grow and fires a turn at the next between-tool-call boundary, with quiet-window coalescing to bound emit count under bursty traffic. + +## Overview + +> **Monitor is an alarm clock, not a mailbox.** On `INBOX_GREW`, end the turn and return to idle without emitting acknowledgment text or narrating the wake event — the platform's idle-delivery is the channel-of-record for content. Never read the inbox file or parse the wake's stdout payload yourself. +> +> **Wake surfaces between tool calls within a turn, not mid-tool.** Monitor's `INBOX_GREW` emit cannot interrupt a single in-flight tool call. The platform queues `INBOX_GREW` events that fire during a long-running tool and delivers them when the tool returns, bundled with the tool's result. The wake mechanism's promise is "messages surface between tool calls within a turn," NOT "instant interrupt anywhere." For multi-tool turns the wake reliably opens the poller-gate between tools; for single long tools (e.g., a 90-second blocking sleep) the lead is effectively unwakeable until the tool returns. + +Problem this solves: during long-running operations, the platform's `useInboxPoller` only delivers queued `SendMessage` between tool calls; long blocking tool calls leave inbound messages stuck until the next idle boundary. See [Communication Charter Part I — Delivery Model](../protocols/pact-communication-charter.md#delivery-model). The Monitor's stdout emit forces a turn at the next between-tool-call boundary, bounding latency by the poll interval (30 s) rather than by the next opportunistic idle. + +Single-Monitor model, no in-session watchdog. Lifetime is scoped to the period during which the lead holds assigned, uncompleted teammate tasks. Inbox path is a single JSON file (`inboxes/team-lead.json`), not a directory. + +**Audit**: both alarm-clock paragraphs are non-negotiable. The first prevents two failure modes: (a) an editing LLM writing "parse the wake stdout to extract content" — wake is signal, not content; and (b) the woken lead emitting acknowledgment text like "(Alarm.)" or "(Idle ping.)" instead of returning to silent idle — empirically observed even with the wait-in-silence feedback memory loaded, so the no-narration clause must be explicit in the principle anchor itself. The second prevents an editing LLM from inferring mid-tool interrupt from "wake on inbox grow" — the substrate's actual capability is between-tool, not anywhere. Removing either paragraph silently overpromises the mechanism. + +## When to Invoke + +| Trigger | Site | +|---|---| +| First active teammate task created (PostToolUse hook detects 0→1 transition) | `wake_lifecycle_emitter.py` `additionalContext` directive | +| Resume into a session with active tasks already on disk | `session_init.py` `additionalContext` directive | +| User-typed manual invocation (debug, recovery from silent Monitor death) | `/PACT:watch-inbox` slash invocation | + +Arm is idempotent. Re-invoking when STATE_FILE is already present and valid is a no-op — cheap on every directive re-fire. + +## Operation + +Single procedure — the command IS the operation. No Arm/Teardown sub-section. + +0. **Lead-session guard** (see `## Lead-Session Guard` below). If the current session is not the team-lead session, refuse and return — do NOT proceed to step 1. +1. Read STATE_FILE at `~/.claude/teams/{team_name}/inbox-wake-state.json`. +2. If STATE_FILE is present and parses with `v=1`: no-op (already armed; cheap on every re-invocation). +3. Otherwise cold-start: + 1. Spawn the Monitor (see `## Monitor Block`). + 2. Capture the returned `monitor_task_id`. + 3. Write the STATE_FILE atomically (see `## WriteStateFile Block`). + +**Audit**: idempotency lives in this command (STATE_FILE-presence check), NOT in the directive that invokes it. An editing LLM tempted to add an "if not already armed" guard at the directive site would re-introduce LLM-self-diagnosis as the gate, which is the failure mode the unconditional-emit discipline closes. + +## Lead-Session Guard + +Refuse to execute when invoked from a teammate session. The wake mechanism is lead-only: arming Monitor in a teammate session would watch the lead's inbox file from the wrong process, fire the wake in the teammate's session, and the lead would silently miss every signal. + +```python +team_name = pact_session_context["team_name"] +session_id = pact_session_context["session_id"] +team_config = json.loads( + (Path.home() / ".claude" / "teams" / team_name / "config.json").read_text() +) +if session_id != team_config.get("leadSessionId"): + refuse( + "This command only runs in the team-lead session. " + "Teammates use their own inbox at " + "~/.claude/teams/{team_name}/inboxes/{teammate-name}.json — " + "no Monitor is required for teammates; the platform's idle-delivery " + "covers their inbox-read path." + ) + return +``` + +**Audit**: signal source is `session_id == team_config.leadSessionId`, NOT a hypothetical `agent_type` field on `pact-session-context.json`. The session-context schema is `{team_name, session_id, project_dir, plugin_root, started_at}` by design — adding an `agent_type` field would couple every hook's session-init to a teammate-vs-lead discriminator that already exists at canonical depth in the team config (`leadSessionId`). An editing LLM tempted to "just add agent_type to session-context" should stop: the team config is the single source of truth for team membership and lead identity; replicating that signal into session-context creates two-source-of-truth drift. The guard runs at command-invoke time, NOT at directive-emit time — directives emitted by `wake_lifecycle_emitter.py` and `session_init.py` already only fire in lead sessions (the lifecycle emitter's pre/post derivation runs in the lead's PostToolUse path; SessionStart fires per session and the directive is teammate-harmless because this guard refuses cold-start). The guard's purpose is to defend against user-typed `/PACT:watch-inbox` invocation from a teammate session, which is the only invocation path the directive-side filtering does not cover. This guard is foot-gun protection (typo / wrong-window / cross-session-LLM speculation), not a security boundary against same-user adversaries. `leadSessionId` is read from `team_config.json` which has no integrity check; same-user write authority can spoof it. The user-local-trust assumption bounds the residual exposure — same-user attacker has equivalent capability via direct os tooling anyway. + +## Monitor Block + +Canonical Monitor `cmd` body. The arming agent interpolates `{team_name}` at Arm time. Inbox path is fixed to `inboxes/team-lead.json` (lead-only scope). + +```bash +INBOX="$HOME/.claude/teams/{team_name}/inboxes/team-lead.json" +PREV=0 +STATE="PENDING" +QUIET_START=0 +BURST_START=0 +POLL=30 +QUIET_REQUIRED=60 +MAX_DELAY=120 +while true; do + NOW=$(date +%s) + if [ -f "$INBOX" ]; then + SIZE=$(wc -c < "$INBOX" 2>/dev/null | tr -d ' ') + if [ "$SIZE" -gt "$PREV" ] 2>/dev/null; then + if [ "$STATE" = "PENDING" ]; then + echo "INBOX_GREW size=$SIZE ts=$(date -u +%Y-%m-%dT%H:%M:%SZ) edge=FIRST_GROW" + STATE="GROWING" + BURST_START=$NOW + elif [ $((NOW - BURST_START)) -ge "$MAX_DELAY" ]; then + echo "INBOX_GREW size=$SIZE ts=$(date -u +%Y-%m-%dT%H:%M:%SZ) edge=MAX_DELAY" + BURST_START=$NOW + fi + PREV=$SIZE + QUIET_START=$NOW + elif [ "$STATE" = "GROWING" ] 2>/dev/null; then + if [ $((NOW - QUIET_START)) -ge "$QUIET_REQUIRED" ]; then + echo "INBOX_GREW size=$SIZE ts=$(date -u +%Y-%m-%dT%H:%M:%SZ) edge=LAST_GROW" + STATE="PENDING" + fi + fi + fi + sleep "$POLL" +done +``` + +State-machine semantics (POLL=30, QUIET_REQUIRED=60, MAX_DELAY=120): + +- **PENDING**: idle; no recent growth. On observed grow → emit `edge=FIRST_GROW`, transition to GROWING, record `BURST_START` and `QUIET_START` at now. +- **GROWING**: at least one grow has fired in this burst. On further grow → no emit by default; refresh `QUIET_START` at now (resets the quiet timer). If the burst has been growing continuously past `MAX_DELAY` (120 s) since `BURST_START` → emit `edge=MAX_DELAY`, reset `BURST_START` to now (sustained-traffic ceiling). On no grow for `QUIET_REQUIRED` consecutive seconds (60 s, two consecutive quiet poll cycles) → emit `edge=LAST_GROW`, transition back to PENDING. + +Discipline: +- **Single-file inbox.** The inbox is `inboxes/team-lead.json` — a single JSON file, NOT a directory. Byte-grow detection via `wc -c`, NOT directory inotify. +- **Stdout discipline.** Each stdout line fires a turn. Emit ONLY `INBOX_GREW size=… ts=… edge=…` lines on real state transitions (FIRST_GROW, MAX_DELAY ceiling, LAST_GROW). Diagnostic and lifecycle output goes to `>&2` (stderr does not turn-fire). +- **Transient-error suppression.** `wc -c 2>/dev/null` swallows transient read errors so a momentary missing-file does not crash the loop. + +Spawn via `Monitor(persistent=true, cmd=)`; the returned task ID is captured for STATE_FILE write. + +**Audit**: the design target is ONE emit per discrete burst (FIRST_GROW + LAST_GROW bracket the burst), with MAX_DELAY as the hard ceiling for sustained traffic that never reaches `QUIET_REQUIRED` quiet seconds. POLL=30 sets the granularity floor (no point checking quiet-time at finer resolution than the poll); QUIET_REQUIRED=60 means TWO consecutive quiet poll cycles must pass before LAST_GROW fires; MAX_DELAY=120 caps unbounded suppression. Relationship: `QUIET_REQUIRED ≥ POLL` is required (otherwise quiet-detection is sub-resolution) and `MAX_DELAY ≥ 2*POLL` is required (otherwise the ceiling fires before the natural LAST_GROW would). Two design-choice ratios are pinned beyond the bare minimum constraints. (a) `QUIET_REQUIRED = 2*POLL` — a deliberate tightening over the minimum `QUIET_REQUIRED = POLL` (one-poll confirmation) to capture multi-message reviewer bursts with internal sub-pauses as ONE logical burst, not multiple ping-pong FIRST_GROW + LAST_GROW pairs. (b) `MAX_DELAY = 2*QUIET_REQUIRED` (equivalently `MAX_DELAY = 4*POLL`) — sets the sustained-traffic ceiling at roughly two coalesced bursts so it caps unbounded suppression without re-fragmenting natural burst boundaries. An editing LLM tempted to shrink QUIET_REQUIRED back to `= POLL` (one quiet poll cycle) must understand: that ratio fires LAST_GROW after a single quiet poll, breaking burst coalescence under reviewer-style traffic patterns where 30-50 s sub-pauses are normal. An editing LLM tempted to set `MAX_DELAY = QUIET_REQUIRED` (or smaller) refragments coalesced bursts — the ceiling would fire mid-burst, defeating the quiet-window's purpose. An editing LLM tempted to shrink POLL "for snappier wake" must adjust both QUIET_REQUIRED and MAX_DELAY to preserve all three relationships (≥ POLL, = 2*POLL for QUIET, = 2*QUIET for MAX_DELAY); an editing LLM tempted to drop the state machine "to simplify" reintroduces wake-fire inflation under bursty traffic. F1 (single-file inbox) is the load-bearing wake trigger; an editing LLM who confuses inbox-as-directory will silently break wake delivery — the inbox path must remain `inboxes/team-lead.json`. + +## WriteStateFile Block + +Atomic-rename JSON write. STATE_FILE path is `~/.claude/teams/{team_name}/inbox-wake-state.json`. + +```python +state_path = Path.home() / ".claude" / "teams" / team_name / "inbox-wake-state.json" +payload = { + "v": 1, + "monitor_task_id": , + "armed_at": datetime.now(timezone.utc).isoformat(timespec="seconds").replace("+00:00", "Z"), + "armed_by_session_id": pact_session_context["session_id"], +} +tmp = state_path.with_suffix(".json.tmp") +tmp.write_text(json.dumps(payload), encoding="utf-8") +os.replace(tmp, state_path) # atomic rename +``` + +Schema is intentionally minimal — exactly 4 fields: `v`, `monitor_task_id`, `armed_at`, `armed_by_session_id`. `armed_by_session_id` is the lead's session_id at Arm time, read from `pact-session-context.json`; the paired [`/PACT:unwatch-inbox`](unwatch-inbox.md) command validates it on Teardown to prevent cross-session TaskStop weaponization (see that command's `## Teardown Block` audit anchor). There is no watchdog, so no watchdog-job-id and no liveness-pulse fields are written or read. + +**Audit**: STATE_FILE has exactly 4 fields after the cycle-7 security extension; no further fields. The watchdog-style family of fields (liveness pulses, cron-driven job identifiers, periodic-emit tokens) remains explicitly excluded — adding any of them re-introduces the cron-fire-kills-Monitor self-defeating loop documented in `## Failure Modes` on silent Monitor death. `armed_by_session_id` is the single security-driven exception, not a precedent that opens the gate. An editing LLM reasoning by analogy with prior watchdog-augmented designs must NOT re-add a watchdog job id. Lead-only scope means no per-agent suffix either — the filename is fixed (`inbox-wake-state.json`), and there is one Monitor per session. + +## Failure Modes + +### Malformed STATE_FILE + +If the STATE_FILE exists but fails to parse as JSON, this command treats it as not-armed and cold-starts. The pre-existing file is overwritten by the atomic-rename write. + +### Silent Monitor death + +> **Silent Monitor death**: this design has no in-session watchdog. If the Monitor task dies silently mid-session (process crash, OOM, harness GC, undisclosed lifetime cap), the wake mechanism degrades to no-wake until the next Arm fire (next first-active-task transition or next SessionStart-with-active-tasks). Detection is not possible from inside the lead's session; a manual `/PACT:watch-inbox` invocation will re-arm if the lead notices missed messages by other means (e.g., a teammate complaining about no response). This is a deliberate trade against the cron-fire-kills-Monitor self-defeating loop: a cron-driven watchdog firing in the same session as a watched Monitor kills the watched Monitor as a side effect of cron-fire harness behavior, while an isolated Monitor in the same session survives. Future work tracked separately if/when the platform offers a watchdog primitive that does not kill the watched task. + +**Audit**: this paragraph is the principle anchor for the central trade-off (no watchdog). An editing LLM later proposing "let's add a heartbeat + cron watchdog after all" must understand the rule: the cron+Monitor combination kills its own Monitor. Adding a cron-driven watchdog re-introduces the self-defeating loop this design closes; do not. + +### Long single-tool calls block wake delivery + +> **Long single-tool calls block wake delivery**: Monitor's `INBOX_GREW` stdout emit fires when the inbox grows, but events that fire during a long-running single tool call (e.g., a 90-second blocking `sleep`) are queued and delivered to the lead only when that tool returns, bundled with the tool's result. The wake mechanism does NOT interrupt a tool mid-call. For multi-tool turns the wake surfaces between tool calls; for single long tools the lead is effectively unwakeable until the tool returns. The substrate behavior is: `INBOX_GREW` fires correctly during the in-flight tool, but the platform queues those events behind the tool's result; teammate-message content arrives in the *next* turn via standard idle-delivery, not mid-tool. + +**Audit**: this paragraph is the principle anchor for the scope claim ("between tool calls, not mid-tool"). An editing LLM reading this command and seeing "wake on inbox grow" will reasonably infer mid-tool interrupt unless explicitly told otherwise. That inference is wrong. The §Overview's second alarm-clock paragraph + this entry together correctly scope the substrate's actual capability — the wake is a between-tool-call signal. The mechanism: `INBOX_GREW` fires correctly during a long-running tool but its event delivery is gated behind the tool's return, so the wake is observably reproducible (run a long blocking tool in parallel with a delayed peer reply; observe `INBOX_GREW` in Monitor stdout during the tool, observe content delivery only after tool return). + +### Wake-fire inflation under bursty traffic + +> **Wake-fire inflation**: a naive Monitor that emits `INBOX_GREW` on every poll-observed growth would fire one turn per poll cycle during a burst (e.g., five queued teammate replies arriving within seconds → five separate turns, each consuming a tool-result roundtrip's token cost). The 30/60/120 quiet-window state machine coalesces a discrete burst into ONE turn at FIRST_GROW + ONE turn at LAST_GROW (after `QUIET_REQUIRED=60` consecutive quiet seconds, two consecutive quiet poll cycles). For traffic that never reaches the quiet window — sustained writes from a long-running peer — `MAX_DELAY=120` re-emits at most every 120 s so the lead does not silently accumulate an unbounded inbox-grow window. The design target is the smallest emit count that keeps wake timeliness within a 120-second worst-case ceiling. + +**Audit**: an editing LLM tempted to "simplify" by removing the quiet-window state machine reverts to one emit per poll cycle, which is the original wake-fire inflation problem this design closes. The relationship `QUIET_REQUIRED ≥ POLL` and `MAX_DELAY ≥ 2*POLL` must be preserved on any timing tune. An editing LLM tempted to drop MAX_DELAY "because LAST_GROW will fire eventually" misses the sustained-traffic case where the inbox never goes quiet for `QUIET_REQUIRED` seconds — without MAX_DELAY, sustained traffic produces zero LAST_GROW emits and the lead never wakes during the burst. + +### Concurrent re-arm + +If two Arm directives race (rare; first-active-transition firing concurrently with a SessionStart-resume Arm), both may attempt cold-start. Atomic-rename write makes STATE_FILE corruption impossible; the second write wins, so STATE_FILE points only at the winner. The loser's Monitor task is orphaned and has no registry pointer for any subsequent `/PACT:unwatch-inbox` to find — `TaskStop(STATE_FILE.monitor_task_id)` reaches the winner only. Orphan accumulation is bounded by the rarity of the race; force-termination cleanup via `cleanup_wake_registry` covers the registry sidecar regardless. + +## Verification + +Confirm Monitor armed: + +1. STATE_FILE exists at `~/.claude/teams/{team_name}/inbox-wake-state.json` and parses with `v=1`. +2. `STATE_FILE.monitor_task_id` resolves to a live Monitor task (visible in TaskList, status running). + +See dogfood runbook `pact-plugin/tests/runbooks/591-inbox-wake.md` for end-to-end verification (fresh-session arm via first-active-task transition, inbox-grow wake under quiet-window coalescing, teardown via last-active-task transition, force-termination cleanup). + +## References + +- [`/PACT:unwatch-inbox`](unwatch-inbox.md) — paired teardown command. +- [Communication Charter Part I §Wake Mechanism](../protocols/pact-communication-charter.md#wake-mechanism) — protocol contract surface. +- [Communication Charter Part I §Delivery Model](../protocols/pact-communication-charter.md#delivery-model) — async-at-idle-boundary delivery model. diff --git a/pact-plugin/commands/wrap-up.md b/pact-plugin/commands/wrap-up.md index 96092f98..d10188f8 100644 --- a/pact-plugin/commands/wrap-up.md +++ b/pact-plugin/commands/wrap-up.md @@ -95,6 +95,8 @@ The `session_consolidated` write fires under the `true` branch regardless of whe ## 6. Worktree Cleanup +Invoke `Skill("PACT:unwatch-inbox")` before worktree cleanup. This stops the lead's Monitor task and unlinks the STATE_FILE; it is the hook-silent-fail safety-net path that runs alongside the PostToolUse 1→0 last-active Teardown directive. Idempotent and best-effort — see [unwatch-inbox §Operation](unwatch-inbox.md#operation). + Check for open PRs associated with the current worktree branch: - **PR merged or no PR**: Invoke `/PACT:worktree-cleanup` to remove the worktree cleanly. - **PR still open**: Skip worktree cleanup. Write a `session_paused` event to the journal (see the `session_paused` field table in [pause.md step 5](pause.md#5-write-paused-state-to-session-journal) for the event schema — wrap-up writes only the `session_paused` event here; the `session_consolidated` event was already emitted in step 5 above). Set `consolidation_completed: true` because wrap-up steps 1-4 already performed memory consolidation. Report: "Worktree preserved — PR still open. Use `/PACT:pause` to consolidate and pause, or `/PACT:peer-review` to continue review." diff --git a/pact-plugin/hooks/hooks.json b/pact-plugin/hooks/hooks.json index 93e192e2..33cf5e50 100644 --- a/pact-plugin/hooks/hooks.json +++ b/pact-plugin/hooks/hooks.json @@ -191,6 +191,15 @@ "command": "python3 \"${CLAUDE_PLUGIN_ROOT}/hooks/auditor_reminder.py\"" } ] + }, + { + "matcher": "TaskCreate|TaskUpdate", + "hooks": [ + { + "type": "command", + "command": "python3 \"${CLAUDE_PLUGIN_ROOT}/hooks/wake_lifecycle_emitter.py\"" + } + ] } ] } diff --git a/pact-plugin/hooks/session_end.py b/pact-plugin/hooks/session_end.py index c9be8027..f2e2a79b 100644 --- a/pact-plugin/hooks/session_end.py +++ b/pact-plugin/hooks/session_end.py @@ -459,6 +459,52 @@ def _dir_max_child_mtime(entry: Path, glob: str = "*.json") -> float | None: return None +def cleanup_wake_registry(team_name: str) -> None: + """ + Best-effort removal of the inbox-wake STATE_FILE sidecar for the + given team. + + Belt-and-suspenders for force-termination edge cases (SIGKILL, + crash, hard process tree teardown) where the primary skill-driven + Teardown path did not run. Cannot stop the orphaned Monitor task — + that is an agent-runtime tool unreachable from this hook context. + Sidecar removal lets the next session's Arm cold-start cleanly + instead of seeing a STATE_FILE pointing at a long-dead Monitor. + + Lead-only scope: a single fixed-name STATE_FILE at + ~/.claude/teams/{team_name}/inbox-wake-state.json. No glob, no + per-agent suffix. + + Path-traversal discipline: + - team_name validated via is_safe_path_component (positive + allowlist); rejects ``, `..`, `/`, controls, separators. + - teams_root resolved once; team_dir resolved and asserted under + teams_root via relative_to (symlink-escape defense). + - Path.unlink wrapped in try/except OSError; missing_ok=True + suppresses FileNotFoundError, other OSError subtypes are + swallowed per the module-wide fail-open posture. + + Pure side-effect; never raises. + """ + if not team_name or not is_safe_path_component(team_name): + return + try: + teams_root = (Path.home() / ".claude" / "teams").resolve() + except OSError: + return + team_dir = teams_root / team_name + try: + resolved_team_dir = team_dir.resolve() + resolved_team_dir.relative_to(teams_root) + except (OSError, ValueError): + return # symlink escape or unreadable path + state_file = resolved_team_dir / "inbox-wake-state.json" + try: + state_file.unlink(missing_ok=True) + except OSError: + pass # fail-open per module convention + + def cleanup_old_teams( current_team_name: str, teams_base_dir: str | None = None, @@ -801,6 +847,15 @@ def main(): # Callsite short-circuit on empty team_name is the belt-and-suspenders # layer around the internal fail-closed guard. current_team_name = get_team_name() + + # Wake-registry cleanup (#591). Belt-and-suspenders for force- + # termination paths where the primary skill-driven Teardown did + # not run. Cannot reach TaskStop from hook context; only the + # registry sidecar is removable here. Single-file unlink + # (lead-only — fixed STATE_FILE name, no glob). + if current_team_name: + cleanup_wake_registry(current_team_name) + teams_r, teams_s = 0, 0 tasks_r, tasks_s = 0, 0 teams_reaper_ran = False diff --git a/pact-plugin/hooks/session_init.py b/pact-plugin/hooks/session_init.py index 0725feb1..cbbb91cc 100755 --- a/pact-plugin/hooks/session_init.py +++ b/pact-plugin/hooks/session_init.py @@ -80,6 +80,7 @@ from shared.session_journal import append_event, make_event from shared.failure_log import append_failure from shared.plugin_manifest import format_plugin_banner +from shared.wake_lifecycle import count_active_tasks # Import extracted modules (decomposed for maintainability per M5 audit finding). from shared.symlinks import setup_plugin_symlinks @@ -711,6 +712,39 @@ def main(): # 5. Remind orchestrator to create session-unique PACT team (or reuse on resume) team_name = generate_team_name(input_data) + # 5_pre. Wake-arm directive (resume-with-active-tasks gap closure). + # PostToolUse on Task-mutating tools handles 0->1 transitions + # within a session, but a session resuming with tasks already in + # flight has no such transition to observe. Hook-side + # count_active_tasks check + unconditional Arm emit when the + # team has any active teammate work on disk. The hook does the + # diagnosis; the directive itself is unconditional per the + # hook-emitted-directives discipline (#444 unconditional > + # conditional). Tier-0 additionalContext via context_parts + # append; resume / startup / clear / compact all reach this + # branch identically — Arm is idempotent in the skill, so + # redundant emission no-ops cheaply. + # + # count_active_tasks honors the pure-never-raises contract + # (shared.wake_lifecycle module-wide pin); call it directly, + # no try/except wrapper required. + active_count = count_active_tasks(team_name) + if active_count > 0: + # Audit anchor (editing-LLM warning): the directive prose + # below is UNCONDITIONAL by design. Do not introduce + # LLM-self-diagnosis here ("only emit if X"). Diagnostic + # logic belongs in the hook (Python side, this function); + # the directive (LLM side) carries no conditional wording. + # Tier-0 additionalContext is architecturally binding; + # conditional wording silently regresses to the + # LLM-self-diagnosis failure mode #444 forbids. + context_parts.append( + 'Active teammate tasks detected on session start. ' + 'Invoke Skill("PACT:watch-inbox") before any further ' + 'teammate dispatch. Idempotent — no-op if a valid ' + 'STATE_FILE is already on disk.' + ) + # 5a. Write session context file FIRST so get_session_dir() works for # subsequent journal writes. write_context() populates the _cache # immediately, enabling append_event() to derive the journal path. diff --git a/pact-plugin/hooks/shared/task_utils.py b/pact-plugin/hooks/shared/task_utils.py index bdf7bc97..0f1d43a0 100644 --- a/pact-plugin/hooks/shared/task_utils.py +++ b/pact-plugin/hooks/shared/task_utils.py @@ -1,7 +1,7 @@ """ Location: pact-plugin/hooks/shared/task_utils.py Summary: Shared Task system integration utilities for PACT hooks. -Used by: session_init.py, agent_handoff_emitter.py +Used by: session_init.py, agent_handoff_emitter.py, wake_lifecycle.py This module provides common functions for reading and analyzing Tasks from the Claude Task system. Tasks are stored at ~/.claude/tasks/{sessionId}/*.json @@ -15,6 +15,7 @@ find_active_agents: Find all active agent tasks find_blockers: Find blocker/algedonic tasks build_post_compaction_checkpoint: Build [POST-COMPACTION CHECKPOINT] message from Task state + iter_team_task_jsons: Yield parsed task JSONs from a team dir (path-traversal safe) read_task_json: Read the raw task JSON by id + team_name (path-traversal safe) """ @@ -22,9 +23,10 @@ import os import re from pathlib import Path -from typing import Any +from typing import Any, Iterator from shared.pact_context import get_session_id +from shared.session_state import is_safe_path_component def get_task_list() -> list[dict[str, Any]] | None: @@ -265,6 +267,70 @@ def build_post_compaction_checkpoint( return "\n".join(lines) +def iter_team_task_jsons(team_name: str) -> Iterator[dict]: + """ + Yield parsed task JSON dicts from ~/.claude/tasks/{team_name}/*.json. + + Path-traversal defense (single source of truth for per-team task + iteration): + - team_name validated via is_safe_path_component (positive + allowlist; rejects empty, non-string, traversal fragments, + separators, controls, whitespace). + - tasks_root resolved once; team_dir resolved and asserted under + tasks_root via relative_to (symlink-escape defense). + + Pure generator; never raises. Yields nothing on any error + (unsafe team_name, missing dir, symlink escape, IO error). Individual + unreadable / unparseable task files are skipped silently. Callers + treat "no tasks" as fail-open — wake mechanism degrades to baseline + idle-poll rather than crashing the hook. + + Yields: + dict per successfully-parsed task JSON file. + """ + if not is_safe_path_component(team_name): + return + try: + tasks_root = (Path.home() / ".claude" / "tasks").resolve() + except OSError: + return + team_dir = tasks_root / team_name + try: + resolved_team_dir = team_dir.resolve() + resolved_team_dir.relative_to(tasks_root) + except (OSError, ValueError): + return # symlink escape or unreadable path + if not resolved_team_dir.exists(): + return + try: + for task_file in resolved_team_dir.glob("*.json"): + # Exclude dotfile-prefixed JSON files: pathlib glob includes + # them, but the platform's task system never writes them. + # An attacker dropping one into the team's tasks dir could + # otherwise inflate the active-task count. + if task_file.name.startswith("."): + continue + # Per-file symlink defense: glob() returns symlink entries + # as their raw paths. Even though resolved_team_dir is + # asserted under tasks_root, a symlink inside it could + # resolve outside. Skip symlinks silently — the platform + # task system writes only regular files. + try: + if task_file.is_symlink(): + continue + except OSError: + continue + try: + content = task_file.read_text(encoding="utf-8") + task = json.loads(content) + except (IOError, OSError, json.JSONDecodeError): + continue + if isinstance(task, dict): + yield task + except (IOError, OSError): + return + + def read_task_json( task_id: str, team_name: str | None, diff --git a/pact-plugin/hooks/shared/wake_lifecycle.py b/pact-plugin/hooks/shared/wake_lifecycle.py new file mode 100644 index 00000000..b6d2853a --- /dev/null +++ b/pact-plugin/hooks/shared/wake_lifecycle.py @@ -0,0 +1,119 @@ +""" +Location: pact-plugin/hooks/shared/wake_lifecycle.py +Summary: Shared helper for inbox-wake lifecycle hooks. Counts active teammate + tasks under a team, applying carve-out filters that match the lead-only + completion-authority model (signal-tasks and self-complete-exempt + agents do not count toward the wake-mechanism's "any active work" + signal). +Used by: pact-plugin/hooks/wake_lifecycle_emitter.py (PostToolUse hook on + TaskCreate / TaskUpdate), and + pact-plugin/hooks/session_init.py (resume-with-active-tasks Arm + directive emission). + +Public surface: +- count_active_tasks(team_name) -> int + Counts tasks in ~/.claude/tasks/{team_name}/*.json where + _lifecycle_relevant returns True. +- _lifecycle_relevant(task) -> bool + Predicate. True iff the task counts toward the active-work tally that + arms/tears down the wake mechanism. + +Contract: pure functions; never raise. Filesystem or JSON parse errors +fail-open as "no active tasks" (returns 0 / False), matching the +fail-open module-wide posture of hook code. The wake mechanism is +opportunistic: a transient failure to read tasks degrades to "no Arm +emit," which falls back to baseline idle-poll delivery — strictly +better than crashing the hook. + +Carve-out rules (match shared.intentional_wait.is_self_complete_exempt +and the inline-literal signal-task pattern at agent_handoff_emitter.py): +1. Signal-tasks: metadata.completion_type == "signal" AND + metadata.type in {"blocker", "algedonic"}. These self-complete + without team-lead-as-completion-gate; they do not represent + teammate work the wake mechanism needs to surface. +2. Self-complete-exempt agents: task.owner in + SELF_COMPLETE_EXEMPT_AGENTS (currently {secretary, pact-secretary}). + These also self-complete; they do not require the lead's wake. +""" + +from typing import Any + +from shared.intentional_wait import SELF_COMPLETE_EXEMPT_AGENTS +from shared.task_utils import iter_team_task_jsons + +# Signal-task types — inline literal mirrors the convention at +# agent_handoff_emitter.py:78 and task_utils.find_blockers. The carve-out +# applies identically pre/post status transitions, so this constant is +# stable across the lifecycle pre/post derivation. +_SIGNAL_TASK_TYPES = ("blocker", "algedonic") + +# Statuses that count as "active teammate work" — the wake mechanism's +# trigger condition. `completed` and `deleted` do not count; nothing else +# is expected in a healthy task list, but unknown statuses are excluded +# by the positive allowlist (conservative: only count statuses we know +# represent in-flight work). +_ACTIVE_STATUSES = ("pending", "in_progress") + + +def _lifecycle_relevant(task: Any) -> bool: + """ + Return True iff this task counts toward the active-work tally that + arms/tears down the wake mechanism. + + Returns False on any malformed input (non-dict task, non-dict + metadata) — conservative: missing fields cannot exempt a real + active task, but we cannot positively count an unparseable record. + + Status check: task.status must be in {"pending", "in_progress"}. + Other statuses (completed, deleted, blocked) do not count. + + Carve-outs (apply only on top of a passing status check): + - Self-complete-exempt owner: owner in SELF_COMPLETE_EXEMPT_AGENTS. + Evaluated before the metadata-shape check so that owner-based + exemption holds even on corrupted metadata. + - Signal-task pattern: metadata.completion_type == "signal" AND + metadata.type in {"blocker", "algedonic"}. + """ + if not isinstance(task, dict): + return False + + if task.get("status") not in _ACTIVE_STATUSES: + return False + + # Self-complete-exempt agent carve-out. Hoisted above the metadata + # shape check so that a secretary-owned task with corrupted metadata + # is still exempt — symmetric with shared.intentional_wait. + # is_self_complete_exempt, which fail-closes on owner alone. + owner = task.get("owner") + if isinstance(owner, str) and owner in SELF_COMPLETE_EXEMPT_AGENTS: + return False + + metadata = task.get("metadata") or {} + if not isinstance(metadata, dict): + # Malformed metadata — conservative: do not silently exempt a real + # active task on a parse-failed metadata field. Count it. + return True + + # Signal-task carve-out (inline-literal mirror). + if metadata.get("completion_type") == "signal": + if metadata.get("type") in _SIGNAL_TASK_TYPES: + return False + + return True + + +def count_active_tasks(team_name: str) -> int: + """ + Count lifecycle-relevant tasks under ~/.claude/tasks/{team_name}/. + + Iteration + path-traversal defense (allowlist + symlink-escape) is + delegated to task_utils.iter_team_task_jsons, which is the single + source of truth for per-team task-file iteration. Individual + unreadable / unparseable task files are skipped silently; the count + reflects only successfully-parsed lifecycle-relevant tasks. + + Pure function; never raises. Fail-open as "no active tasks" — the + wake mechanism degrades to baseline idle-poll on read failure rather + than crashing the calling hook (livelock-safety > observability). + """ + return sum(1 for task in iter_team_task_jsons(team_name) if _lifecycle_relevant(task)) diff --git a/pact-plugin/hooks/wake_lifecycle_emitter.py b/pact-plugin/hooks/wake_lifecycle_emitter.py new file mode 100644 index 00000000..ce5dec5c --- /dev/null +++ b/pact-plugin/hooks/wake_lifecycle_emitter.py @@ -0,0 +1,315 @@ +#!/usr/bin/env python3 +""" +Location: pact-plugin/hooks/wake_lifecycle_emitter.py +Summary: PostToolUse hook that emits watch-inbox/unwatch-inbox directives + for the inbox-wake command pair on first/last active-task transitions. +Used by: hooks.json PostToolUse hook with matcher + `TaskCreate|TaskUpdate`. + +Lifecycle automation: +- On TaskCreate that transitions the team's active-task count from 0 to + 1, emit a watch-inbox directive instructing the lead to invoke + Skill("PACT:watch-inbox"). +- On TaskUpdate(status in {completed, deleted}) that transitions the + team's active-task count from 1 to 0, emit an unwatch-inbox directive + instructing the lead to invoke Skill("PACT:unwatch-inbox"). Both + terminal statuses end active work and are treated symmetrically. +- On any other tool fire (TaskUpdate without a terminal-status + transition, TaskCreate at non-zero pre-state, terminal-status + TaskUpdate leaving residual active tasks): no directive emitted. + +Transition detection (post-only): +- post = count_active_tasks(team_name) — the count AFTER the tool's + effect is on disk. count_active_tasks already filters out signal- + tasks and self-complete-exempt agents, so post is the lifecycle- + relevant count. +- TaskCreate + post == 1 → emit Arm (a TaskCreate that lifts the count + to 1 must have been the first lifecycle-relevant task; carve-out + creations leave post unchanged at 0). +- TaskUpdate(status in {completed, deleted}) + post == 0 → emit + Teardown. Skill's Teardown is idempotent (no-op if STATE_FILE absent), + so over-eager emission on edge cases (terminal-status update of a + never-counted signal-task while post==0) is benign. +- Any other tool fire (non-status TaskUpdate, TaskCreate at post != 1, + terminal-status TaskUpdate at post > 0): no-op. + +Output schema (load-bearing): +{ + "hookSpecificOutput": { + "hookEventName": "PostToolUse", # REQUIRED — silent rejection without + "additionalContext": "" + } +} +The `hookEventName` field is REQUIRED on PostToolUse outputs. +Empirically verified during the Phase 0 routing probe: missing +`hookEventName` triggers silent schema rejection at the platform layer. + +Fail-open invariant: every code path exits 0 with `suppressOutput` +sentinel on parse errors, missing fields, or unexpected exceptions. +The wake mechanism is opportunistic — a crashed lifecycle hook degrades +to "no Arm/Teardown emit," which falls back to baseline idle-poll +delivery. Livelock-safety > observability for hook code emitting on +every Task-tool fire. + +Input: JSON from stdin with tool_name, tool_input, tool_response. +Output: hookSpecificOutput with additionalContext on transitions; + suppressOutput sentinel on no-op paths. +""" + +import json +import sys +from pathlib import Path +from typing import Any + +# Ensure shared package import resolves under the hooks directory. +_hooks_dir = Path(__file__).parent +if str(_hooks_dir) not in sys.path: + sys.path.insert(0, str(_hooks_dir)) + +import shared.pact_context as pact_context +from shared.pact_context import get_team_name +from shared.session_state import is_safe_path_component +from shared.wake_lifecycle import count_active_tasks + +# Suppress the false "hook error" UI surface on bare exit paths. +_SUPPRESS_OUTPUT = json.dumps({"suppressOutput": True}) + +# Maximum stdin payload size in bytes. PostToolUse payloads carry +# tool_input + tool_response JSON; even verbose Task-tool fires fit +# comfortably under 1MB. A larger payload is a defense-in-depth +# rejection signal (parser amplification / memory-exhaustion vector). +_MAX_PAYLOAD_BYTES = 1024 * 1024 + +# Directive prose — verbatim text emitted via additionalContext on +# transitions. Imperative voice; references the canonical command-pair +# slugs `PACT:watch-inbox` (Arm role) and `PACT:unwatch-inbox` (Teardown +# role); idempotency / best-effort clauses prevent the lead from adding +# their own conditional self-diagnosis (#444 unconditional discipline). +_ARM_DIRECTIVE = ( + 'First active teammate task created. ' + 'Invoke Skill("PACT:watch-inbox") before any further teammate ' + 'dispatch. Idempotent — no-op if a valid STATE_FILE is already on disk.' +) + +_TEARDOWN_DIRECTIVE = ( + 'Last active teammate task completed. ' + 'Invoke Skill("PACT:unwatch-inbox") to stop the Monitor and unlink ' + 'the STATE_FILE. Best-effort — tolerates a Monitor that died ' + 'silently mid-session.' +) + +# Tools accepted by _decide_directive. The hooks.json matcher prunes +# other tools at the platform layer; this in-hook check is +# belt-and-suspenders against future matcher widening. +_TASK_MUTATING_TOOLS = ("TaskCreate", "TaskUpdate") + + +def _is_lead_session(input_data: dict[str, Any], team_name: str) -> bool: + """ + Return True iff the current session is the team's lead session. + + Reads `session_id` from the PostToolUse stdin payload and + `leadSessionId` from ~/.claude/teams/{team_name}/config.json. + Mirrors the Lead-Session Guard pattern used by the watch-inbox / + unwatch-inbox skill bodies; team_config is the single source of + truth for lead identity. + + Pure function; never raises. Returns False on missing/empty + session_id, missing/unsafe team_name, missing config.json, malformed + JSON, or filesystem error. The teammate session is the expected + non-lead path; silent fail-open avoids UI noise on every + teammate Task-tool fire. + """ + raw_session_id = input_data.get("session_id") + if not isinstance(raw_session_id, str) or not raw_session_id: + return False + if not team_name or not is_safe_path_component(team_name): + return False + try: + config_path = ( + Path.home() / ".claude" / "teams" / team_name / "config.json" + ) + if not config_path.exists(): + return False + data = json.loads(config_path.read_text(encoding="utf-8")) + except (OSError, json.JSONDecodeError): + return False + if not isinstance(data, dict): + return False + lead_session_id = data.get("leadSessionId") + if not isinstance(lead_session_id, str) or not lead_session_id: + return False + return raw_session_id == lead_session_id + + +def _extract_task_id(input_data: dict[str, Any]) -> str | None: + """ + Pull the task_id out of the PostToolUse payload. + + PostToolUse stdin shape carries the original tool_input under + "tool_input" and the tool's response under "tool_response". Both + TaskCreate and TaskUpdate accept/return a task with an `id` field. + Defensively probe both paths; return None if neither yields a + string id. + """ + tool_input = input_data.get("tool_input") or {} + if isinstance(tool_input, dict): + tid = tool_input.get("taskId") or tool_input.get("task_id") + if isinstance(tid, str) and tid: + return tid + + tool_response = input_data.get("tool_response") or {} + if isinstance(tool_response, dict): + tid = tool_response.get("id") or tool_response.get("taskId") or tool_response.get("task_id") + if isinstance(tid, str) and tid: + return tid + + return None + + +_TERMINAL_STATUSES = ("completed", "deleted") + + +def _is_terminal_status_update(input_data: dict[str, Any]) -> bool: + """ + Return True iff this TaskUpdate fired with a status transition into + a terminal state (completed or deleted). Both terminate active work + and should trigger a Teardown emit on a 1->0 transition. + + Probes the tool_input.status field (the request) primarily; falls + back to tool_response.statusChange.to or tool_response.status if + the platform's response shape carries the new status. Conservative: + returns False on any ambiguity, so a non-status TaskUpdate cannot + accidentally trigger a Teardown emit. + """ + tool_input = input_data.get("tool_input") or {} + if isinstance(tool_input, dict): + if tool_input.get("status") in _TERMINAL_STATUSES: + return True + + tool_response = input_data.get("tool_response") or {} + if isinstance(tool_response, dict): + status_change = tool_response.get("statusChange") + if isinstance(status_change, dict) and status_change.get("to") in _TERMINAL_STATUSES: + return True + if tool_response.get("status") in _TERMINAL_STATUSES: + return True + + return False + + +def _emit_directive(prose: str) -> None: + """ + Print the additionalContext output payload with the required + `hookEventName` field. Caller is responsible for sys.exit(0). + """ + output = { + "hookSpecificOutput": { + "hookEventName": "PostToolUse", + "additionalContext": prose, + } + } + print(json.dumps(output)) + + +def _decide_directive(input_data: dict[str, Any], team_name: str) -> str | None: + """ + Return the directive prose to emit, or None for no-op. + + Post-only transition detection: + - TaskCreate + post == 1 → Arm. + - TaskUpdate(status in {completed, deleted}) + post == 0 → Teardown. + + count_active_tasks already filters carve-outs (signal-tasks, + self-complete-exempt owners), so post == 1 after a TaskCreate + means the create added the first lifecycle-relevant task, and + post == 0 after a terminal-status TaskUpdate means the team + has no remaining lifecycle-relevant work. The skill's Arm and + Teardown are both idempotent, so any over-eager emit on edge + cases is benign. + """ + if not _is_lead_session(input_data, team_name): + return None + + tool_name = input_data.get("tool_name") + if tool_name not in _TASK_MUTATING_TOOLS: + return None + + if not _extract_task_id(input_data): + return None + + # Defer count_active_tasks (filesystem glob+parse) until after the + # cheap predicates have selected an actually-relevant tool fire. + # Metadata-only TaskUpdates (teachback_submit, intentional_wait, + # handoff, progress, memory_saved) outnumber terminal-status + # transitions on a typical task lifecycle; gating the I/O on the + # terminal-status check eliminates ~9k wasted reads/session. + if tool_name == "TaskCreate": + if count_active_tasks(team_name) == 1: + return _ARM_DIRECTIVE + return None + + # tool_name == "TaskUpdate" + if not _is_terminal_status_update(input_data): + return None + if count_active_tasks(team_name) == 0: + return _TEARDOWN_DIRECTIVE + return None + + +def main() -> None: + # Outer catch-all preserves the exit-0 fail-open contract against + # any unexpected exception (malformed task.json, filesystem race, + # import-time error). The wake mechanism is opportunistic; a crash + # here would surface as a "hook-error" UI on every Task-tool call, + # which is the livelock-capable failure shape the categorical + # standard forbids for any TaskCompleted/TeammateIdle/Stop-class + # hook. + try: + # Bounded read: cap stdin at _MAX_PAYLOAD_BYTES + 1 so we can + # distinguish "fits under cap" from "exceeds cap" without + # allocating an unbounded buffer. Reject (suppressOutput) on + # over-cap input — defense-in-depth against parser amplification. + try: + buffer = sys.stdin.read(_MAX_PAYLOAD_BYTES + 1) + except (IOError, OSError): + print(_SUPPRESS_OUTPUT) + sys.exit(0) + if len(buffer) > _MAX_PAYLOAD_BYTES: + print(_SUPPRESS_OUTPUT) + sys.exit(0) + try: + input_data = json.loads(buffer) + except json.JSONDecodeError: + print(_SUPPRESS_OUTPUT) + sys.exit(0) + + if not isinstance(input_data, dict): + print(_SUPPRESS_OUTPUT) + sys.exit(0) + + pact_context.init(input_data) + team_name = get_team_name() + if not team_name: + print(_SUPPRESS_OUTPUT) + sys.exit(0) + + directive = _decide_directive(input_data, team_name) + if directive is None: + print(_SUPPRESS_OUTPUT) + sys.exit(0) + + _emit_directive(directive) + sys.exit(0) + except SystemExit: + # Re-raise — explicit sys.exit(0) calls above are expected + # control-flow, not errors. Swallowing them would skip the + # _SUPPRESS_OUTPUT print on no-op paths. + raise + except Exception: + print(_SUPPRESS_OUTPUT) + sys.exit(0) + + +if __name__ == "__main__": + main() diff --git a/pact-plugin/protocols/pact-communication-charter.md b/pact-plugin/protocols/pact-communication-charter.md index 73e2348e..e61e0b02 100644 --- a/pact-plugin/protocols/pact-communication-charter.md +++ b/pact-plugin/protocols/pact-communication-charter.md @@ -20,6 +20,18 @@ The rules below govern how messages delivered via this tool actually behave. - The only mid-turn interrupt mechanism is user-side (Escape). Agent-to-agent SendMessage has no equivalent. - `SendMessage` requires a specific `to=` recipient name. There is no broadcast addressing mode; reaching multiple teammates means iterating and sending one message per recipient (see [Lead-Side HALT Fan-Out](../skills/orchestration/SKILL.md#team-lead-side-halt-fan-out) for the canonical pattern). +### Wake Mechanism + +The platform's `useInboxPoller` only delivers queued `SendMessage` between tool calls; long-running tool calls leave inbound messages stuck until the next idle boundary. The lead-side wake mechanism (lead-only — teammates rely on the standard idle-delivery channel) closes this gap with a `Monitor` that watches the lead's inbox file and emits a turn at the next between-tool-call boundary. + +Implementation: a command-pair — [Skill("PACT:watch-inbox")](../commands/watch-inbox.md) (Arm) and [Skill("PACT:unwatch-inbox")](../commands/unwatch-inbox.md) (Teardown). The watch-inbox command's [§Overview](../commands/watch-inbox.md#overview) and [§Failure Modes](../commands/watch-inbox.md#failure-modes) anchor the full contract; the rules below summarize the surface that callers and authors of related protocols need: + +- **Lead-only.** Exactly one Monitor per session, scoped to the period during which the lead holds assigned, uncompleted teammate tasks. No teammate-side wake. +- **[Between-tool-call, not mid-tool](../commands/watch-inbox.md#long-single-tool-calls-block-wake-delivery).** The wake surfaces queued messages between tool calls within a turn; it does NOT interrupt a tool mid-call. +- **[Signal, not content; no-narration on wake](../commands/watch-inbox.md#overview).** The Monitor's stdout emit is an alarm clock that ends the turn — content still arrives via the standard inbox channel — and the lead returns to silent idle without acknowledgment text. +- **Arm and Teardown trigger sites.** [Arm](../commands/watch-inbox.md#when-to-invoke) on first-active-task transition and on session-resume with active tasks. [Teardown](../commands/unwatch-inbox.md#when-to-invoke) on last-active-task transition, on `/wrap-up` (which invokes [Skill("PACT:unwatch-inbox")](../commands/unwatch-inbox.md) as a hook-silent-fail safety net for the all-tasks-completed exit), and on `session_end` registry cleanup. +- **No watchdog.** The mechanism degrades to no-wake on silent Monitor death until the next Arm fire — no in-session detection. The trade-off is documented in [watch-inbox §Failure Modes — Silent Monitor death](../commands/watch-inbox.md#silent-monitor-death). + ### Lead-Side Discipline — Verify Before Dispatching #### Verify State Before Correction diff --git a/pact-plugin/tests/runbooks/591-inbox-wake.md b/pact-plugin/tests/runbooks/591-inbox-wake.md new file mode 100644 index 00000000..7d90a5da --- /dev/null +++ b/pact-plugin/tests/runbooks/591-inbox-wake.md @@ -0,0 +1,199 @@ +# Inbox-Wake Runbook (#591) + +End-to-end operator runbook for the lead-side inbox-wake mechanism. Use this to verify a fresh-session arm/teardown cycle, observe 30/60/120 quiet-window coalescing, diagnose failure modes, and inspect runtime state. + +Implementation references: +- Arm command: [pact-plugin/commands/watch-inbox.md](../../commands/watch-inbox.md) +- Teardown command: [pact-plugin/commands/unwatch-inbox.md](../../commands/unwatch-inbox.md) +- Charter contract surface: [pact-plugin/protocols/pact-communication-charter.md §Wake Mechanism](../../protocols/pact-communication-charter.md#wake-mechanism) +- Lifecycle hook: `pact-plugin/hooks/wake_lifecycle_emitter.py` (PostToolUse, matcher `TaskCreate|TaskUpdate`) +- Resume-arm hook: `pact-plugin/hooks/session_init.py` (Option-C resume gap closure) +- Registry cleanup hook: `pact-plugin/hooks/session_end.py` (`cleanup_wake_registry`) + +The hook registration takes effect at the **next fresh session** after these files are merged. Hooks loaded in the session that authors them do not fire — verify in a fresh session. + +--- + +## 1. Lifecycle: When Arm and Teardown Fire + +| Event | Trigger | Mechanism | +|---|---|---| +| **Arm — first active task** | PostToolUse fires after `TaskCreate` and the team transitions 0→1 active teammate task | `wake_lifecycle_emitter.py` emits `additionalContext` directive: *"Invoke Skill('PACT:watch-inbox') before continuing."* The lead invokes the command on its next turn. | +| **Arm — session resume** | `SessionStart` fires and the team's task list already has active teammate tasks (resumed session) | `session_init.py` Option-C path: hook reads `~/.claude/tasks/{team}/` filtered by `_lifecycle_relevant`; if count ≥ 1, emits unconditional Arm directive via `additionalContext`. | +| **Teardown — last active task** | PostToolUse fires after `TaskUpdate` with terminal status (`completed` or `deleted`) and the team transitions 1→0 active teammate tasks | `wake_lifecycle_emitter.py` emits Teardown directive: *"Invoke Skill('PACT:unwatch-inbox') — no remaining teammate work."* | +| **Teardown — operator command** | Lead invokes `/wrap-up` (after all teammate tasks have completed) | Command body contains an explicit `Skill("PACT:unwatch-inbox")` invocation as a hook-silent-fail safety net. Active-task count is naturally 0 at this point, so the Teardown is harmless and useful as a catch for the rare case where the PostToolUse 1→0 directive was missed. | +| **Manual user invocation** | User types `/PACT:watch-inbox` or `/PACT:unwatch-inbox` in chat | Debug/recovery surface. Idempotent on watch-inbox (STATE_FILE-present check no-ops); best-effort on unwatch-inbox (tolerates already-stopped Monitor). Useful for re-arming after silent Monitor death or silencing Monitor noise mid-session. | +| **Registry cleanup** | `SessionEnd` fires (any session-termination path including force-termination) | `session_end.py::cleanup_wake_registry(team_name)` unlinks `inbox-wake-state.json` if present. Hook cannot reach `TaskStop` — Monitor process dies with the session. | + +The PostToolUse hook is the **primary** lifecycle mechanism. The `/wrap-up` command body is the **only command-file callsite** — a hook-silent-fail safety net for the all-tasks-completed exit. `/pause` and `/imPACT` deliberately do NOT invoke Teardown: those commands run with active teammate tasks remaining, and the lead's Monitor must stay armed for them. The watch-inbox command's idempotent cold-start check + unwatch-inbox's best-effort Teardown make the /wrap-up safety-net fire harmless even when the PostToolUse 1→0 directive already ran. + +--- + +## 2. 30/60/120 Behavioral Verification + +The Monitor's state machine has three timing constants: + +| Constant | Value | Meaning | +|---|---|---| +| `POLL` | 30 s | Polling cadence; granularity floor for grow detection | +| `QUIET_REQUIRED` | 60 s | Required consecutive quiet seconds before LAST_GROW emits | +| `MAX_DELAY` | 120 s | Sustained-traffic ceiling; emits MAX_DELAY edge if a burst never reaches QUIET_REQUIRED | + +The state machine emits `INBOX_GREW size=… ts=… edge=…` lines on three edges: `FIRST_GROW`, `LAST_GROW`, `MAX_DELAY`. Each emit ends the turn (stdout discipline). + +### 2.1 Verify FIRST_GROW + LAST_GROW (single discrete burst) + +In a fresh session with at least one active teammate task (Monitor armed): + +1. Send a single message to the lead's inbox from a teammate (e.g., a peer ping). +2. Observe the lead wakes within ~30–60 s with one `INBOX_GREW … edge=FIRST_GROW` emit. +3. Wait 60+ s with no further inbox writes. +4. Observe the lead wakes again with one `INBOX_GREW … edge=LAST_GROW` emit. + +Two emits per discrete burst is the design target. Three or more emits within a quiet window indicates a state-machine regression. + +### 2.2 Verify MAX_DELAY ceiling (sustained traffic) + +1. Have a peer write to the lead's inbox repeatedly at intervals < 30 s for at least 130 s (preventing the burst from reaching QUIET_REQUIRED). +2. Observe `INBOX_GREW … edge=FIRST_GROW` at the start of the burst. +3. Observe `INBOX_GREW … edge=MAX_DELAY` after ~120 s of sustained traffic. +4. After traffic stops and 60 s of quiet pass, observe `INBOX_GREW … edge=LAST_GROW`. + +MAX_DELAY caps unbounded suppression — without it, sustained traffic produces zero LAST_GROW emits and the lead never wakes during the burst. + +### 2.3 Smoke-check the Monitor process + +``` +ps -ef | grep '[i]nbox-wake' | head -5 +``` + +A persistent `Monitor`-spawned shell holding the `while true; sleep 30` loop should appear once the lead is armed. Process count > 1 for the same team indicates a concurrent re-arm orphan (rare; cleaned up by next Teardown). + +--- + +## 3. Failure-Mode Diagnosis + +### 3.1 Silent Monitor death + +Symptom: lead is in a session with active tasks but never wakes on inbox grow. Manual SendMessage from a peer does not surface until lead idles for unrelated reasons. + +Diagnosis: + +``` +cat ~/.claude/teams/{team_name}/inbox-wake-state.json +``` + +If the file shows `v=1` and a `monitor_task_id`, the lead believes it is armed but the Monitor process has died. This is the **silent death** failure mode — undetectable in-session. + +Recovery: invoke `Skill("PACT:watch-inbox")` manually on the lead. watch-inbox sees STATE_FILE present and is a no-op by default; force re-arm by first deleting the STATE_FILE: + +``` +rm ~/.claude/teams/{team_name}/inbox-wake-state.json +``` + +then invoke Arm again. The next cold-start writes a fresh STATE_FILE and spawns a new Monitor. + +### 3.2 Long single-tool calls block wake delivery + +Symptom: lead is mid-turn in a single long-running tool call (e.g., a 90-second blocking `sleep` or a long `Bash` command); peer messages do not surface until the tool returns. + +Diagnosis: this is **expected behavior, not a failure**. The wake mechanism's promise is "messages surface between tool calls within a turn," not "instant interrupt anywhere." Verified empirically 2026-04-30T00:00–00:02Z (skill body §Failure Modes). + +Mitigation: prefer multiple short tool calls over one long blocking call when wake responsiveness matters. For unavoidable long calls, accept the bounded latency. + +### 3.3 Malformed STATE_FILE + +Symptom: STATE_FILE exists but parse fails; Arm cold-starts unexpectedly on every Arm directive. + +Diagnosis: + +``` +cat ~/.claude/teams/{team_name}/inbox-wake-state.json +python3 -c 'import json,sys; print(json.load(open(sys.argv[1])))' \ + ~/.claude/teams/{team_name}/inbox-wake-state.json +``` + +If the file content is not valid JSON or `v != 1`, Arm correctly treats it as not-armed and cold-starts (overwriting on atomic-rename write). This is **fail-safe, not fail-broken** — the next cold-start corrects state. + +Recovery: none required. The next Arm directive cold-starts cleanly; the malformed file is overwritten. + +### 3.4 Schema-rejection of hook output + +Symptom: PostToolUse fires (visible in `~/.claude/sessions/{sid}/transcript.jsonl` if available) but no Arm/Teardown directive lands in the lead's next turn. + +Diagnosis: check `wake_lifecycle_emitter.py` JSON output schema. Required field is `hookSpecificOutput.hookEventName: "PostToolUse"`. Missing → silent schema-validation rejection by the platform. Verified empirically in PREPARE probe round 2. + +Recovery: not an operator concern — this is a code regression. File a bug; the structural tests in `pact-plugin/tests/test_inbox_wake_lifecycle_emitter.py` (emit JSON shape) and `pact-plugin/tests/test_inbox_wake_lifecycle_helper.py` (helper logic) should catch this before merge. + +### 3.5 Concurrent re-arm + +Symptom: two Monitor processes for the same team (`ps -ef | grep '[i]nbox-wake'` shows multiple). + +Diagnosis: rare race between PostToolUse first-active-transition Arm and SessionStart-resume Arm. Atomic-rename STATE_FILE write makes corruption impossible; the second cold-start wins, leaving the first Monitor's `monitor_task_id` orphaned in the harness task list. + +Recovery: next Teardown's `TaskStop(STATE_FILE.monitor_task_id)` only stops the winner. Orphan dies with session-process termination. `cleanup_wake_registry` covers the registry sidecar regardless. + +--- + +## 4. STATE_FILE Inspection + +The STATE_FILE lives at `~/.claude/teams/{team_name}/inbox-wake-state.json` with exactly 3 fields: + +```json +{ + "v": 1, + "monitor_task_id": "", + "armed_at": "2026-04-30T05:42:17Z" +} +``` + +### 4.1 Quick inspection commands + +Show armed state: + +``` +cat ~/.claude/teams/{team_name}/inbox-wake-state.json +``` + +Pretty-print + extract the Monitor task id: + +``` +python3 -c 'import json; d=json.load(open("'"$HOME"'/.claude/teams/{team_name}/inbox-wake-state.json")); print(d.get("monitor_task_id"))' +``` + +Verify the inbox file the Monitor watches: + +``` +ls -la ~/.claude/teams/{team_name}/inboxes/team-lead.json +wc -c < ~/.claude/teams/{team_name}/inboxes/team-lead.json +``` + +The `wc -c` byte size is exactly what the Monitor compares against on each poll. A growing byte count between polls is the wake trigger. + +### 4.2 Force-termination cleanup verification + +After force-terminating a session (kill -9 the parent, system crash, etc.), the STATE_FILE may be left behind. On next session start, `cleanup_wake_registry(team_name)` runs in `session_end.py` and unlinks the stale STATE_FILE. Verify: + +``` +ls ~/.claude/teams/{team_name}/inbox-wake-state.json 2>&1 +``` + +Should return `No such file or directory` after a clean session-end pass. + +--- + +## 5. End-to-End Runbook (Fresh Session) + +The minimum cycle to confirm the mechanism works in a fresh session after merge: + +1. Start a fresh session in a project with the plugin installed at the merged version. +2. Run `/PACT:orchestrate` or `/PACT:comPACT` with any small feature; observe a teammate task is created. +3. Verify `wake_lifecycle_emitter.py` fired Arm: check the lead's next turn for `additionalContext` text "Invoke Skill('PACT:watch-inbox')". +4. Confirm STATE_FILE exists: `ls ~/.claude/teams/{team_name}/inbox-wake-state.json`. +5. Have the teammate idle, then send a peer ping that grows the lead's inbox. +6. Observe the lead wakes with `INBOX_GREW … edge=FIRST_GROW`. +7. After 60 s quiet, observe `INBOX_GREW … edge=LAST_GROW`. +8. Complete all teammate tasks; verify Teardown directive lands and STATE_FILE is unlinked. +9. Run `/PACT:wrap-up`; verify (idempotent) the hook-silent-fail safety-net Teardown invocation completes without error against an already-torn-down state. + +A successful run hits all 9 steps. Step failures map to the failure modes in §3. diff --git a/pact-plugin/tests/test_inbox_wake_callsites.py b/pact-plugin/tests/test_inbox_wake_callsites.py new file mode 100644 index 00000000..1000d52a --- /dev/null +++ b/pact-plugin/tests/test_inbox_wake_callsites.py @@ -0,0 +1,139 @@ +""" +Cross-file consistency invariants for the inbox-wake feature. + +Cycle 4 migrated the inbox-wake skill into a command-pair: +- watch-inbox (Arm role) at pact-plugin/commands/watch-inbox.md +- unwatch-inbox (Teardown role) at pact-plugin/commands/unwatch-inbox.md + +Pin the Skill("PACT:unwatch-inbox") invocation in wrap-up.md (the only +command body that retains the callsite post-cycle-2; runs after all +tasks complete, so count is already 0 and the callsite is both correct +and useful as a hook-silent-fail safety net). + +Negative assertions for imPACT.md and pause.md: their callsites were +removed in F3-followup (Option C) because their unconditional Teardown +destroyed still-needed Monitors when other teammates remained active. +The PostToolUse 1→0 hook handles the lifecycle automatically. The +negative assertions here forbid all THREE slugs (legacy inbox-wake + +new watch-inbox + unwatch-inbox) as defense-in-depth against future +re-introduction. + +Charter §Wake Mechanism cross-ref pins ensure command bodies and +charter remain aligned on lead-only scope, F7 between-tool-calls +scope, and signal-not-content corollary. +""" + +from pathlib import Path + + +ROOT = Path(__file__).resolve().parent.parent +COMMANDS_DIR = ROOT / "commands" +PROTOCOLS_DIR = ROOT / "protocols" + +_FORBIDDEN_SLUGS = ( + 'Skill("PACT:inbox-wake")', + 'Skill("PACT:watch-inbox")', + 'Skill("PACT:unwatch-inbox")', +) + + +def _read(path: Path) -> str: + return path.read_text(encoding="utf-8") + + +# ---------- Teardown invocation in wrap-up.md ---------- + +def test_wrap_up_invokes_unwatch_inbox(): + """wrap-up.md retains the unwatch-inbox callsite — it fires after + all tasks are completed, so the count is already 0 and the callsite + is both correct and useful as a hook-silent-fail safety net.""" + text = _read(COMMANDS_DIR / "wrap-up.md") + assert 'Skill("PACT:unwatch-inbox")' in text, ( + "wrap-up.md missing unwatch-inbox slug invocation" + ) + + +# ---------- Negative assertions for imPACT.md and pause.md ---------- + +def test_impact_md_does_not_invoke_any_inbox_slug(): + """F3-followup (Option C) + cycle 4: imPACT.md's force-termination + Teardown was removed because it destroyed still-needed Monitors + when other teammates remained active. The PostToolUse 1→0 hook + handles the lifecycle automatically. Negative assertion forbids + ALL three slugs (legacy inbox-wake + watch-inbox + unwatch-inbox) + as defense-in-depth against future re-introduction.""" + text = _read(COMMANDS_DIR / "imPACT.md") + for slug in _FORBIDDEN_SLUGS: + assert slug not in text, ( + f"imPACT.md must NOT invoke {slug} — unconditional callsite " + f"destroys/re-arms Monitors needed by remaining teammates. " + f"PostToolUse hook handles lifecycle automatically." + ) + + +def test_pause_md_does_not_invoke_any_inbox_slug(): + """F3-followup (Option C) + cycle 4: pause.md's pre-shutdown + callsite was removed for the same reason as imPACT.md. + Defense-in-depth — forbid all three slugs.""" + text = _read(COMMANDS_DIR / "pause.md") + for slug in _FORBIDDEN_SLUGS: + assert slug not in text, ( + f"pause.md must NOT invoke {slug} — unconditional callsite " + f"destroys/re-arms Monitors needed by remaining teammates." + ) + + +# ---------- Charter cross-reference ---------- + +def test_charter_has_wake_mechanism_section(): + text = _read(PROTOCOLS_DIR / "pact-communication-charter.md") + assert "## Part I — Message Delivery Mechanics" in text + assert "### Wake Mechanism" in text + + +def test_charter_links_to_command_pair_via_relative_paths(): + """Slug-link form (relative-path markdown link), NOT @~/ pattern. + Cycle 4: charter cross-ref now points at the command pair, not the + legacy single skill body.""" + text = _read(PROTOCOLS_DIR / "pact-communication-charter.md") + section_start = text.find("### Wake Mechanism") + assert section_start >= 0 + section = text[section_start:section_start + 5000] + # Charter must reference both new commands (or at minimum the + # primary watch-inbox command — robust fallback if charter author + # links only the primary). + assert ( + "../commands/watch-inbox.md" in section + or "../commands/unwatch-inbox.md" in section + ) + # And must NOT use the deprecated @~/ pattern, nor link to the + # legacy skill body that no longer exists. + assert "@~/" not in section + assert "../skills/inbox-wake/SKILL.md" not in section + + +def test_charter_echoes_lead_only_scope(): + text = _read(PROTOCOLS_DIR / "pact-communication-charter.md") + section_start = text.find("### Wake Mechanism") + section = text[section_start:section_start + 5000] + # Lead-only narrowing must be explicit in the charter rule list. + assert "Lead-only" in section or "lead-only" in section.lower() + + +def test_charter_echoes_f7_between_tool_calls_not_mid_tool(): + """F7 cross-file consistency: charter must carry the same scope + claim as SKILL.md §Overview/§Failure Modes.""" + text = _read(PROTOCOLS_DIR / "pact-communication-charter.md") + section_start = text.find("### Wake Mechanism") + section = text[section_start:section_start + 5000] + assert "Between-tool-call" in section or "between tool calls" in section.lower() + assert "not mid-tool" in section or "mid-call" in section.lower() + + +def test_charter_echoes_signal_not_content(): + """F7 stdout-discipline corollary at the charter surface — the wake + is a signal, not a content channel.""" + text = _read(PROTOCOLS_DIR / "pact-communication-charter.md") + section_start = text.find("### Wake Mechanism") + section = text[section_start:section_start + 5000] + assert "Signal" in section or "signal" in section diff --git a/pact-plugin/tests/test_inbox_wake_lifecycle_emitter.py b/pact-plugin/tests/test_inbox_wake_lifecycle_emitter.py new file mode 100644 index 00000000..36464eb2 --- /dev/null +++ b/pact-plugin/tests/test_inbox_wake_lifecycle_emitter.py @@ -0,0 +1,673 @@ +""" +Behavioral invariants for pact-plugin/hooks/wake_lifecycle_emitter.py. + +Pipes synthesized stdin payloads through the emitter via subprocess and +asserts: +- hookEventName="PostToolUse" present on all directive emits (REQUIRED; + silent platform rejection without it). +- 0->1 active-task transition emits Arm; 1->0 emits Teardown. +- Non-status TaskUpdate, Task/Agent spawn, and TaskCreate at non-zero + pre-state are no-ops. +- Fail-open exit-0 + suppressOutput sentinel on malformed stdin / + missing team_name / unexpected exception. +- hooks.json registers the emitter under PostToolUse with matcher + TaskCreate|TaskUpdate. +""" + +import json +import os +import subprocess +import sys +from pathlib import Path + +import pytest + +HOOK_DIR = Path(__file__).resolve().parent.parent / "hooks" +EMITTER = HOOK_DIR / "wake_lifecycle_emitter.py" +HOOKS_JSON = HOOK_DIR / "hooks.json" + + +def _run_emitter(stdin_payload: str | bytes, env_extra: dict | None = None) -> tuple[int, str, str]: + # Start from a clean env so the harness's CLAUDE_PROJECT_DIR doesn't + # leak into the synthesized context resolution. + env = {k: v for k, v in os.environ.items() if not k.startswith("CLAUDE_")} + if env_extra: + env.update(env_extra) + proc = subprocess.run( + [sys.executable, str(EMITTER)], + input=stdin_payload if isinstance(stdin_payload, bytes) else stdin_payload.encode("utf-8"), + capture_output=True, + env=env, + timeout=10, + ) + return proc.returncode, proc.stdout.decode("utf-8"), proc.stderr.decode("utf-8") + + +def _pact_session_env(tmp_path: Path, team_name: str) -> dict: + """ + Build env vars + on-disk pact-session-context so the emitter's + pact_context.init() resolves the team_name from the synthesized + session-context file. The emitter calls pact_context.init(input_data) + which reads session_id/project_dir from input_data and locates the + context file under ~/.claude/pact-sessions///. + """ + home = tmp_path / "home" + home.mkdir() + return {"HOME": str(home)} + + +def _write_session_context(home: Path, session_id: str, project_dir: str, team_name: str, *, lead_session_id: str | None = None) -> None: + # Match the resolution path used by pact_context._resolve_context_path: + # ~/.claude/pact-sessions///pact-session-context.json + slug = Path(project_dir).name + sess_dir = home / ".claude" / "pact-sessions" / slug / session_id + sess_dir.mkdir(parents=True, exist_ok=True) + (sess_dir / "pact-session-context.json").write_text( + json.dumps({ + "team_name": team_name, + "session_id": session_id, + "project_dir": project_dir, + "plugin_root": "", + "started_at": "2026-04-30T00:00:00Z", + }), + encoding="utf-8", + ) + # Team config drives the emitter's _is_lead_session guard. Default + # behavior: caller's session_id IS the lead (the standard test + # framing for these tests, which exercise lead-side behavior). Pass + # `lead_session_id="some-other-id"` to simulate a teammate session. + team_dir = home / ".claude" / "teams" / team_name + team_dir.mkdir(parents=True, exist_ok=True) + effective_lead = lead_session_id if lead_session_id is not None else session_id + (team_dir / "config.json").write_text( + json.dumps({"leadSessionId": effective_lead}), + encoding="utf-8", + ) + + +def _write_task(home: Path, team_name: str, task_id: str, **fields) -> None: + tasks_dir = home / ".claude" / "tasks" / team_name + tasks_dir.mkdir(parents=True, exist_ok=True) + payload = {"id": task_id, **fields} + (tasks_dir / f"{task_id}.json").write_text(json.dumps(payload), encoding="utf-8") + + +# ---------- hooks.json registration ---------- + +def test_hooks_json_registers_emitter_under_post_tool_use(): + cfg = json.loads(HOOKS_JSON.read_text(encoding="utf-8")) + posts = cfg["hooks"]["PostToolUse"] + found = [ + entry for entry in posts + if entry.get("matcher") == "TaskCreate|TaskUpdate" + ] + assert found, "PostToolUse with required matcher not registered" + cmds = [] + for entry in found: + for h in entry.get("hooks", []): + cmds.append(h.get("command", "")) + assert any("wake_lifecycle_emitter.py" in c for c in cmds), ( + "wake_lifecycle_emitter.py not wired to the matcher" + ) + + +# ---------- Fail-open paths ---------- + +def test_malformed_stdin_exits_zero_with_suppress(tmp_path): + rc, out, _ = _run_emitter(b"\x00not-json\xff", env_extra=_pact_session_env(tmp_path, "t")) + assert rc == 0 + assert json.loads(out) == {"suppressOutput": True} + + +def test_non_dict_stdin_exits_zero_with_suppress(tmp_path): + rc, out, _ = _run_emitter("[]", env_extra=_pact_session_env(tmp_path, "t")) + assert rc == 0 + assert json.loads(out) == {"suppressOutput": True} + + +def test_missing_team_name_exits_zero_with_suppress(tmp_path): + # No session context file written → get_team_name() returns "". + payload = json.dumps({ + "tool_name": "TaskCreate", + "session_id": "abc", + "cwd": "/tmp/x", + "tool_input": {"taskId": "1"}, + "tool_response": {"id": "1"}, + }) + rc, out, _ = _run_emitter(payload, env_extra=_pact_session_env(tmp_path, "t")) + assert rc == 0 + assert json.loads(out) == {"suppressOutput": True} + + +def test_unrelated_tool_no_op(tmp_path): + home = tmp_path / "home"; home.mkdir() + sid = "session-1"; pdir = "/tmp/proj" + _write_session_context(home, sid, pdir, "team-a") + payload = json.dumps({ + "tool_name": "Read", + "session_id": sid, + "cwd": pdir, + "tool_input": {}, + "tool_response": {}, + }) + rc, out, _ = _run_emitter(payload, env_extra={"HOME": str(home)}) + assert rc == 0 + assert json.loads(out) == {"suppressOutput": True} + + +def test_task_spawn_tool_no_op(tmp_path): + """Task and Agent are spawn-tool internal names; they don't change + active-task count, so they fall through to the no-op path.""" + home = tmp_path / "home"; home.mkdir() + sid = "session-1"; pdir = "/tmp/proj" + _write_session_context(home, sid, pdir, "team-a") + for spawn_tool in ("Task", "Agent"): + payload = json.dumps({ + "tool_name": spawn_tool, + "session_id": sid, + "cwd": pdir, + "tool_input": {"description": "x"}, + "tool_response": {}, + }) + rc, out, _ = _run_emitter(payload, env_extra={"HOME": str(home)}) + assert rc == 0 + assert json.loads(out) == {"suppressOutput": True}, ( + f"{spawn_tool} should be no-op" + ) + + +# ---------- Arm directive (0 -> 1) ---------- + +def _emit_output(payload: dict, home: Path) -> dict: + rc, out, err = _run_emitter( + json.dumps(payload), + env_extra={"HOME": str(home), "CLAUDE_PROJECT_DIR": payload.get("cwd", "")}, + ) + assert rc == 0, f"non-zero exit; stderr={err}" + return json.loads(out) + + +def test_arm_emitted_on_first_task_create(tmp_path): + home = tmp_path / "home"; home.mkdir() + sid = "session-1"; pdir = "/tmp/proj"; team = "team-a" + _write_session_context(home, sid, pdir, team) + # Just-created task is on disk and active. + _write_task(home, team, "task-1", status="in_progress", owner="backend-coder") + payload = { + "tool_name": "TaskCreate", + "session_id": sid, + "cwd": pdir, + "tool_input": {"taskId": "task-1"}, + "tool_response": {"id": "task-1"}, + } + out = _emit_output(payload, home) + hso = out["hookSpecificOutput"] + assert hso["hookEventName"] == "PostToolUse" + assert "Skill(\"PACT:watch-inbox\")" in hso["additionalContext"] + + +def test_arm_includes_idempotency_clause(tmp_path): + home = tmp_path / "home"; home.mkdir() + sid = "s"; pdir = "/tmp/p"; team = "t" + _write_session_context(home, sid, pdir, team) + _write_task(home, team, "1", status="pending", owner="x") + out = _emit_output({ + "tool_name": "TaskCreate", "session_id": sid, "cwd": pdir, + "tool_input": {"taskId": "1"}, "tool_response": {"id": "1"}, + }, home) + additional = out["hookSpecificOutput"]["additionalContext"] + # Case-insensitive — directive prose capitalizes 'Idempotent' but + # the substring 'idempotent' must still appear somewhere. + assert "idempotent" in additional.lower() + # Pin the no-op clause as well — anchors the WHY of idempotency. + assert "no-op if a valid STATE_FILE" in additional + + +def test_teardown_includes_best_effort_clause(tmp_path): + """Symmetric to Arm idempotency: pin the best-effort clause + the + 'tolerates Monitor that died silently' rationale.""" + home = tmp_path / "home"; home.mkdir() + sid = "s"; pdir = "/tmp/p"; team = "t" + _write_session_context(home, sid, pdir, team) + _write_task(home, team, "1", status="completed", owner="x") + out = _emit_output({ + "tool_name": "TaskUpdate", "session_id": sid, "cwd": pdir, + "tool_input": {"taskId": "1", "status": "completed"}, + "tool_response": {"id": "1", "status": "completed"}, + }, home) + additional = out["hookSpecificOutput"]["additionalContext"] + assert "best-effort" in additional.lower() + assert "tolerates a Monitor that died silently" in additional + + +def test_arm_directive_contains_precondition_phrase(tmp_path): + """Pin the canonical Arm precondition prose so an editing LLM + stripping it for terseness loses the directive's WHY-context. + Symmetric with session_init's pinned 'Active teammate tasks + detected on session start' phrase.""" + home = tmp_path / "home"; home.mkdir() + sid = "s"; pdir = "/tmp/p"; team = "t" + _write_session_context(home, sid, pdir, team) + _write_task(home, team, "1", status="pending", owner="x") + out = _emit_output({ + "tool_name": "TaskCreate", "session_id": sid, "cwd": pdir, + "tool_input": {"taskId": "1"}, "tool_response": {"id": "1"}, + }, home) + assert "First active teammate task created" in out["hookSpecificOutput"]["additionalContext"] + + +def test_teardown_directive_contains_precondition_phrase(tmp_path): + """Pin the canonical Teardown precondition prose.""" + home = tmp_path / "home"; home.mkdir() + sid = "s"; pdir = "/tmp/p"; team = "t" + _write_session_context(home, sid, pdir, team) + _write_task(home, team, "1", status="completed", owner="x") + out = _emit_output({ + "tool_name": "TaskUpdate", "session_id": sid, "cwd": pdir, + "tool_input": {"taskId": "1", "status": "completed"}, + "tool_response": {"id": "1", "status": "completed"}, + }, home) + assert "Last active teammate task completed" in out["hookSpecificOutput"]["additionalContext"] + + +def test_no_op_on_second_active_task_create(tmp_path): + home = tmp_path / "home"; home.mkdir() + sid = "s"; pdir = "/tmp/p"; team = "t" + _write_session_context(home, sid, pdir, team) + # Pre-existing active task + just-created task → 1->2 transition. + _write_task(home, team, "existing", status="in_progress", owner="x") + _write_task(home, team, "new", status="in_progress", owner="y") + out = _emit_output({ + "tool_name": "TaskCreate", "session_id": sid, "cwd": pdir, + "tool_input": {"taskId": "new"}, "tool_response": {"id": "new"}, + }, home) + assert out == {"suppressOutput": True} + + +def test_no_op_on_create_of_signal_task(tmp_path): + """Signal-tasks don't count toward lifecycle-relevant tally.""" + home = tmp_path / "home"; home.mkdir() + sid = "s"; pdir = "/tmp/p"; team = "t" + _write_session_context(home, sid, pdir, team) + _write_task( + home, team, "sig-1", + status="in_progress", + owner="x", + metadata={"completion_type": "signal", "type": "blocker"}, + ) + out = _emit_output({ + "tool_name": "TaskCreate", "session_id": sid, "cwd": pdir, + "tool_input": {"taskId": "sig-1"}, "tool_response": {"id": "sig-1"}, + }, home) + assert out == {"suppressOutput": True} + + +def test_no_op_on_create_owned_by_exempt_agent(tmp_path): + home = tmp_path / "home"; home.mkdir() + sid = "s"; pdir = "/tmp/p"; team = "t" + _write_session_context(home, sid, pdir, team) + _write_task(home, team, "sec-1", status="in_progress", owner="secretary") + out = _emit_output({ + "tool_name": "TaskCreate", "session_id": sid, "cwd": pdir, + "tool_input": {"taskId": "sec-1"}, "tool_response": {"id": "sec-1"}, + }, home) + assert out == {"suppressOutput": True} + + +# ---------- Teardown directive (1 -> 0) ---------- + +def test_teardown_emitted_on_last_active_completion(tmp_path): + home = tmp_path / "home"; home.mkdir() + sid = "s"; pdir = "/tmp/p"; team = "t" + _write_session_context(home, sid, pdir, team) + # Task on disk is now completed (post-state); pre-state was active. + _write_task(home, team, "1", status="completed", owner="x") + out = _emit_output({ + "tool_name": "TaskUpdate", "session_id": sid, "cwd": pdir, + "tool_input": {"taskId": "1", "status": "completed"}, + "tool_response": {"id": "1", "status": "completed"}, + }, home) + hso = out["hookSpecificOutput"] + assert hso["hookEventName"] == "PostToolUse" + assert "Skill(\"PACT:unwatch-inbox\")" in hso["additionalContext"] + + +def test_no_teardown_when_other_active_remains(tmp_path): + home = tmp_path / "home"; home.mkdir() + sid = "s"; pdir = "/tmp/p"; team = "t" + _write_session_context(home, sid, pdir, team) + _write_task(home, team, "1", status="completed", owner="x") + _write_task(home, team, "2", status="in_progress", owner="y") + out = _emit_output({ + "tool_name": "TaskUpdate", "session_id": sid, "cwd": pdir, + "tool_input": {"taskId": "1", "status": "completed"}, + "tool_response": {"id": "1", "status": "completed"}, + }, home) + assert out == {"suppressOutput": True} + + +def test_no_teardown_on_non_status_taskupdate(tmp_path): + """TaskUpdate that changes only owner/metadata/etc. must not Teardown.""" + home = tmp_path / "home"; home.mkdir() + sid = "s"; pdir = "/tmp/p"; team = "t" + _write_session_context(home, sid, pdir, team) + _write_task(home, team, "1", status="in_progress", owner="x") + out = _emit_output({ + "tool_name": "TaskUpdate", "session_id": sid, "cwd": pdir, + "tool_input": {"taskId": "1", "owner": "y"}, + "tool_response": {"id": "1"}, + }, home) + assert out == {"suppressOutput": True} + + +def test_teardown_emits_on_signal_task_completion_at_post_zero(tmp_path): + """A1 simplification (see emitter docstring): post-only transition + detector emits Teardown on any status=completed TaskUpdate when + post==0, including the signal-task completion case where the task + never contributed to the active count. Skill's Teardown is + idempotent (no-op if STATE_FILE absent), so this over-eager emit + is benign by design — replaces the prior hypothetical_pre filter.""" + home = tmp_path / "home"; home.mkdir() + sid = "s"; pdir = "/tmp/p"; team = "t" + _write_session_context(home, sid, pdir, team) + _write_task( + home, team, "sig", + status="completed", + owner="x", + metadata={"completion_type": "signal", "type": "algedonic"}, + ) + out = _emit_output({ + "tool_name": "TaskUpdate", "session_id": sid, "cwd": pdir, + "tool_input": {"taskId": "sig", "status": "completed"}, + "tool_response": {"id": "sig", "status": "completed"}, + }, home) + hso = out["hookSpecificOutput"] + assert hso["hookEventName"] == "PostToolUse" + assert "Skill(\"PACT:unwatch-inbox\")" in hso["additionalContext"] + + +# ---------- _decide_directive direct unit coverage ---------- + +def test_decide_directive_module_importable(): + """Import the emitter directly and exercise _decide_directive + with synthetic inputs to lock its transition table without + subprocess overhead.""" + sys.path.insert(0, str(HOOK_DIR)) + import wake_lifecycle_emitter as emitter + + # Unrelated tool → None + assert emitter._decide_directive({"tool_name": "Read"}, "team") is None + + # TaskCreate without task id → None + assert emitter._decide_directive( + {"tool_name": "TaskCreate", "tool_input": {}, "tool_response": {}}, + "team", + ) is None + + +# ---------- Terminal-status detection covers deleted (be-F2) ---------- + +def test_teardown_emitted_on_status_deleted_at_post_zero(tmp_path): + """Both terminal statuses — `completed` and `deleted` — must trigger + Teardown when the team's active count drops to zero. A status=deleted + transition is structurally equivalent to status=completed for the + wake-lifecycle (the task is no longer active work). Without this, + a deleted-task TaskUpdate at post-zero leaves a phantom Monitor + armed against an inbox no one is watching.""" + home = tmp_path / "home"; home.mkdir() + sid = "s"; pdir = "/tmp/p"; team = "t" + _write_session_context(home, sid, pdir, team) + # Task on disk is deleted (post-state); pre-state was active. + _write_task(home, team, "1", status="deleted", owner="x") + out = _emit_output({ + "tool_name": "TaskUpdate", "session_id": sid, "cwd": pdir, + "tool_input": {"taskId": "1", "status": "deleted"}, + "tool_response": {"id": "1", "status": "deleted"}, + }, home) + hso = out.get("hookSpecificOutput") + assert hso is not None, ( + "Expected Teardown directive on status=deleted at post-zero " + f"(be-F2). Actual emit: {out!r}" + ) + assert hso["hookEventName"] == "PostToolUse" + assert "Skill(\"PACT:unwatch-inbox\")" in hso["additionalContext"] + + +def test_is_terminal_status_update_matches_completed_and_deleted(tmp_path): + """Direct unit test on the terminal-status predicate. The behavioral + contract is "task transitioned to a terminal status" — both + `completed` and `deleted` are terminal.""" + sys.path.insert(0, str(HOOK_DIR)) + import wake_lifecycle_emitter as emitter + assert emitter._is_terminal_status_update({ + "tool_input": {"status": "deleted"}, + "tool_response": {}, + }) is True + assert emitter._is_terminal_status_update({ + "tool_input": {"status": "completed"}, + "tool_response": {}, + }) is True + # Sanity: an unrelated status remains False. + assert emitter._is_terminal_status_update({ + "tool_input": {"status": "in_progress"}, + "tool_response": {}, + }) is False + + +# ---------- Lead-session guard (sec-M5 / te-MED-1) ---------- + +def test_emitter_guards_on_lead_session_id_structural(): + """Source-level structural pin: the emitter must call + `_is_lead_session` (or equivalent guard against + team_config.leadSessionId) before any directive emit. Without this + structural anchor, the emitter would fire Arm/Teardown directives + in teammate sessions (where they're inert at best, attacker- + weaponizable at worst — a teammate session that arms a Monitor + would watch the lead's inbox file from the wrong process).""" + src = (HOOK_DIR / "wake_lifecycle_emitter.py").read_text(encoding="utf-8") + assert "_is_lead_session" in src + assert "leadSessionId" in src + + +def test_no_emit_when_session_id_does_not_match_lead(tmp_path): + """Behavioral pin: a teammate-session TaskCreate that would + otherwise emit Arm must be suppressed by the lead-session guard. + Synthesize a session_id distinct from the team config's + leadSessionId; verify the emit is suppressOutput.""" + home = tmp_path / "home"; home.mkdir() + teammate_sid = "teammate-session-id" + lead_sid = "lead-session-id" + pdir = "/tmp/proj" + team = "team-guard" + # Session context: caller is the teammate; team config: lead is OTHER. + _write_session_context(home, teammate_sid, pdir, team, lead_session_id=lead_sid) + _write_task(home, team, "task-x", status="in_progress", owner="x") + payload = { + "tool_name": "TaskCreate", + "session_id": teammate_sid, + "cwd": pdir, + "tool_input": {"taskId": "task-x"}, + "tool_response": {"id": "task-x"}, + } + out = _emit_output(payload, home) + assert out == {"suppressOutput": True}, ( + f"Teammate-session TaskCreate must be suppressed; got {out!r}" + ) + + +def test_no_emit_when_team_config_missing(tmp_path): + """If team config.json is missing, _is_lead_session fail-closes — + no emit. Documenting the fail-closed behavior so an editing LLM + cannot 'simplify' the guard into fail-open during refactor.""" + home = tmp_path / "home"; home.mkdir() + sid = "s"; pdir = "/tmp/p"; team = "team-no-config" + # Write only the session-context, NOT the team config. + slug = Path(pdir).name + sess_dir = home / ".claude" / "pact-sessions" / slug / sid + sess_dir.mkdir(parents=True) + (sess_dir / "pact-session-context.json").write_text( + json.dumps({ + "team_name": team, "session_id": sid, "project_dir": pdir, + "plugin_root": "", "started_at": "2026-04-30T00:00:00Z", + }), encoding="utf-8", + ) + _write_task(home, team, "task-x", status="in_progress", owner="x") + payload = { + "tool_name": "TaskCreate", "session_id": sid, "cwd": pdir, + "tool_input": {"taskId": "task-x"}, "tool_response": {"id": "task-x"}, + } + out = _emit_output(payload, home) + assert out == {"suppressOutput": True} + + +# ---------- Perf reorder (arch2-M2) ---------- + +def test_count_active_tasks_not_called_on_metadata_only_taskupdate(): + """Performance invariant: count_active_tasks (filesystem glob+parse) + must NOT run on a metadata-only TaskUpdate (no status change). The + count is gated behind the cheap _is_terminal_status_update check + so the typical task lifecycle (lots of metadata writes + teachback_submit, intentional_wait, handoff, progress, memory_saved + + a single status=completed transition) doesn't pay an O(N) team + scan per metadata write.""" + sys.path.insert(0, str(HOOK_DIR)) + import wake_lifecycle_emitter as emitter + + # Patch the imported reference at the call site (NOT shared.wake_lifecycle). + # `from shared.wake_lifecycle import count_active_tasks` binds the name + # `count_active_tasks` into emitter's module globals, so patching the + # source module reference would miss this binding (phantom-green trap). + from unittest.mock import patch + with patch.object(emitter, "count_active_tasks") as mock_count: + # Metadata-only TaskUpdate: no status field, just owner change. + result = emitter._decide_directive({ + "tool_name": "TaskUpdate", + "session_id": "sid", "cwd": "/tmp/p", + "tool_input": {"taskId": "1", "owner": "y"}, + "tool_response": {"id": "1"}, + }, "team-x") + # Without a lead-session guard pass, _decide_directive returns + # early; we want to specifically exercise the post-tool-name + + # post-task-id + non-terminal path. Bypass _is_lead_session by + # patching it to True for this perf-invariant probe. + # (The lead-session guard's correctness is covered separately + # above; here we isolate the count_active_tasks ordering.) + + # Re-run with lead-guard bypassed to isolate the perf ordering invariant. + with patch.object(emitter, "_is_lead_session", return_value=True), \ + patch.object(emitter, "count_active_tasks") as mock_count: + result = emitter._decide_directive({ + "tool_name": "TaskUpdate", + "session_id": "sid", "cwd": "/tmp/p", + "tool_input": {"taskId": "1", "owner": "y"}, + "tool_response": {"id": "1"}, + }, "team-x") + assert result is None + assert mock_count.call_count == 0, ( + f"count_active_tasks should NOT run on metadata-only TaskUpdate; " + f"called {mock_count.call_count} time(s)" + ) + + +def test_count_active_tasks_called_on_terminal_status_taskupdate(): + """Sanity-paired with the perf invariant: terminal-status + TaskUpdates DO call count_active_tasks (otherwise the gate would be + completely bypassed and Teardown would never fire).""" + sys.path.insert(0, str(HOOK_DIR)) + import wake_lifecycle_emitter as emitter + + from unittest.mock import patch + with patch.object(emitter, "_is_lead_session", return_value=True), \ + patch.object(emitter, "count_active_tasks", return_value=0) as mock_count: + result = emitter._decide_directive({ + "tool_name": "TaskUpdate", + "session_id": "sid", "cwd": "/tmp/p", + "tool_input": {"taskId": "1", "status": "completed"}, + "tool_response": {"id": "1", "status": "completed"}, + }, "team-x") + assert mock_count.call_count >= 1, ( + "Expected count_active_tasks to run on terminal-status TaskUpdate" + ) + + +def test_count_active_tasks_called_on_taskcreate(): + """TaskCreate path also requires the count (to detect 0->1 + transition). Mirror of the terminal-status sanity test.""" + sys.path.insert(0, str(HOOK_DIR)) + import wake_lifecycle_emitter as emitter + + from unittest.mock import patch + with patch.object(emitter, "_is_lead_session", return_value=True), \ + patch.object(emitter, "_extract_task_id", return_value="1"), \ + patch.object(emitter, "count_active_tasks", return_value=1) as mock_count: + result = emitter._decide_directive({ + "tool_name": "TaskCreate", + "session_id": "sid", "cwd": "/tmp/p", + "tool_input": {"taskId": "1"}, + "tool_response": {"id": "1"}, + }, "team-x") + assert mock_count.call_count >= 1 + + +# ---------- Stdin payload size limit (sec-F1) ---------- + +def test_oversized_stdin_payload_fails_open_with_suppress(tmp_path): + """Defense-in-depth size cap (sec-F1): a stdin payload exceeding + _MAX_PAYLOAD_BYTES must be rejected with suppressOutput / exit 0, + NOT parsed and not OOM. Synthesize a 2MB payload to comfortably + exceed any reasonable 1MB threshold; assert fail-open behavior. + + The setup matches the WOULD-BE-ARM case (valid team_name, valid + session context, would-be 0->1 transition) so the suppress signal + can ONLY come from the size cap, not from any downstream guard like + missing team_name or non-lead session_id. + + Counter-test-by-revert: removing the bounded read + size guard at + main() entry produces an Arm directive instead of suppressOutput + (the underlying path would otherwise emit Arm).""" + home = tmp_path / "home"; home.mkdir() + sid = "session-cap"; pdir = "/tmp/proj-cap"; team = "team-cap" + # Set up a valid lead-session context + active task on disk so the + # downstream path would emit Arm if the payload were processed. + _write_session_context(home, sid, pdir, team) + _write_task(home, team, "task-cap", status="in_progress", owner="x") + # Build a payload structurally valid but bloated — 2MB of filler + # padding. The structural shape is JSON so a no-cap path would + # attempt a full parse and (with valid context) emit Arm. + filler = "A" * (2 * 1024 * 1024) + payload_dict = { + "tool_name": "TaskCreate", + "session_id": sid, + "cwd": pdir, + "tool_input": {"taskId": "task-cap", "filler": filler}, + "tool_response": {"id": "task-cap"}, + } + payload_bytes = json.dumps(payload_dict).encode("utf-8") + assert len(payload_bytes) > 1024 * 1024, ( + "Test setup: payload must exceed 1MB cap" + ) + rc, out, _ = _run_emitter(payload_bytes, env_extra={"HOME": str(home), "CLAUDE_PROJECT_DIR": pdir}) + assert rc == 0, "Expected fail-open exit-0 on oversized payload" + parsed = json.loads(out) + # Discriminator: with the cap in place, suppressOutput. Without the + # cap, the would-be-Arm path emits hookSpecificOutput.additionalContext. + assert parsed == {"suppressOutput": True}, ( + f"Expected size-cap rejection (suppressOutput), got {parsed!r}. " + f"If parsed has hookSpecificOutput, the cap was bypassed and the " + f"would-be-Arm path emitted instead." + ) + + +def test_emitter_documents_payload_size_cap_constant(): + """Source-level structural pin: the size cap constant must exist + as a module-global so an editing LLM cannot 'simplify' the bounded + read into an unbounded read by removing the cap inline.""" + src = (HOOK_DIR / "wake_lifecycle_emitter.py").read_text(encoding="utf-8") + assert "_MAX_PAYLOAD_BYTES" in src + # Cap must be a reasonable size (between 64KB and 16MB). Pin a + # range rather than a literal so a future tune (e.g., 2MB) doesn't + # require updating this test. + sys.path.insert(0, str(HOOK_DIR)) + import wake_lifecycle_emitter as emitter + assert isinstance(emitter._MAX_PAYLOAD_BYTES, int) + assert 64 * 1024 <= emitter._MAX_PAYLOAD_BYTES <= 16 * 1024 * 1024 diff --git a/pact-plugin/tests/test_inbox_wake_lifecycle_helper.py b/pact-plugin/tests/test_inbox_wake_lifecycle_helper.py new file mode 100644 index 00000000..78551f7d --- /dev/null +++ b/pact-plugin/tests/test_inbox_wake_lifecycle_helper.py @@ -0,0 +1,353 @@ +""" +Behavioral invariants for pact-plugin/hooks/shared/wake_lifecycle.py. + +Direct-import tests of count_active_tasks() and _lifecycle_relevant(). +Pin the carve-out semantics (signal-tasks, exempt agents) to the +SELF_COMPLETE_EXEMPT_AGENTS frozenset reuse from shared.intentional_wait +(no duplicate literal). Pure-never-raises property pins the contract so +the redundant try/except in session_init.py:728-730 can be removed in +future cleanup. +""" + +import json +import sys +from pathlib import Path + +import pytest + +# Hooks dir is added to sys.path by conftest. +import shared.wake_lifecycle as wl +from shared.intentional_wait import SELF_COMPLETE_EXEMPT_AGENTS + + +# ---------- Source-level structural invariants ---------- + +def test_helper_imports_exempt_set_from_intentional_wait(): + """No duplicate frozenset literal — the helper must reuse the + canonical set from shared.intentional_wait.""" + src = ( + Path(__file__).resolve().parent.parent + / "hooks" / "shared" / "wake_lifecycle.py" + ).read_text(encoding="utf-8") + assert "from shared.intentional_wait import SELF_COMPLETE_EXEMPT_AGENTS" in src + # No re-declaration: the literal must not appear with a curly-brace + # set syntax in the helper itself. + assert "frozenset({\"secretary\"" not in src + assert "frozenset({'secretary'" not in src + + +def test_helper_documented_pure_never_raises(): + """Pin the docstring contract — pure functions, never raise. This is + the structural anchor that lets future cleanup remove the redundant + try/except wrapping count_active_tasks at session_init.py:728-730.""" + docs = (wl.count_active_tasks.__doc__ or "") + (wl.__doc__ or "") + assert "never raise" in docs.lower() or "never raises" in docs.lower() + + +# ---------- _lifecycle_relevant predicate ---------- + +@pytest.mark.parametrize("task,expected", [ + ({"status": "in_progress", "owner": "x"}, True), + ({"status": "pending", "owner": "x"}, True), + ({"status": "completed", "owner": "x"}, False), + ({"status": "deleted", "owner": "x"}, False), + ({"status": "blocked", "owner": "x"}, False), + ({"status": "in_progress"}, True), # missing owner is fine + ({}, False), # missing status fails the active-status gate +]) +def test_lifecycle_relevant_status_gate(task, expected): + assert wl._lifecycle_relevant(task) is expected + + +@pytest.mark.parametrize("agent", sorted(SELF_COMPLETE_EXEMPT_AGENTS)) +def test_lifecycle_relevant_excludes_exempt_agents(agent): + task = {"status": "in_progress", "owner": agent} + assert wl._lifecycle_relevant(task) is False + + +@pytest.mark.parametrize("metadata,expected", [ + ({"completion_type": "signal", "type": "blocker"}, False), + ({"completion_type": "signal", "type": "algedonic"}, False), + # Wrong type → not a signal carve-out, still counts. + ({"completion_type": "signal", "type": "regular"}, True), + # Missing completion_type → counts. + ({"type": "blocker"}, True), + # Empty metadata → counts. + ({}, True), +]) +def test_lifecycle_relevant_signal_task_carveout(metadata, expected): + task = {"status": "in_progress", "owner": "x", "metadata": metadata} + assert wl._lifecycle_relevant(task) is expected + + +def test_lifecycle_relevant_handles_non_dict_input(): + for bad in (None, [], 42, "string", True): + assert wl._lifecycle_relevant(bad) is False + + +def test_lifecycle_relevant_counts_under_malformed_metadata(): + """Malformed metadata (non-dict) is conservative-counted: cannot + silently exempt a real active task on a parse failure.""" + task = {"status": "in_progress", "owner": "x", "metadata": "not-a-dict"} + assert wl._lifecycle_relevant(task) is True + + +@pytest.mark.parametrize("agent", sorted(SELF_COMPLETE_EXEMPT_AGENTS)) +def test_lifecycle_relevant_exempt_owner_with_corrupted_metadata(agent): + """Owner-check hoist (be-M1): a self-complete-exempt agent (e.g. + secretary) with non-dict metadata must STILL be exempt — return + False. Pre-hoist behavior was True because the metadata-shape gate + short-circuited to conservative-count BEFORE checking the owner + carve-out, so corrupted metadata accidentally promoted exempt agents + to lifecycle-relevant tasks. This is the inverse asymmetry of the + sibling test_lifecycle_relevant_counts_under_malformed_metadata: a + NON-exempt owner with corrupted metadata stays True (count it + conservatively); an EXEMPT owner with corrupted metadata flips to + False (the carve-out is owner-shape, not metadata-shape). + + Counter-test-by-revert: reverting the owner carve-out below the + metadata-shape gate would flip this test RED.""" + task = {"status": "in_progress", "owner": agent, "metadata": "not-a-dict"} + assert wl._lifecycle_relevant(task) is False, ( + f"Exempt owner {agent!r} with corrupted metadata must remain " + f"exempt; pre-hoist behavior was True." + ) + + +# ---------- count_active_tasks ---------- + +def test_count_active_tasks_returns_zero_on_empty_team_name(tmp_path, monkeypatch): + monkeypatch.setattr(Path, "home", lambda: tmp_path) + assert wl.count_active_tasks("") == 0 + assert wl.count_active_tasks(None) == 0 # type: ignore[arg-type] + + +def test_count_active_tasks_returns_zero_when_dir_missing(tmp_path, monkeypatch): + monkeypatch.setattr(Path, "home", lambda: tmp_path) + assert wl.count_active_tasks("ghost-team") == 0 + + +def _stage_task(tmp_path: Path, team: str, task_id: str, **fields) -> None: + d = tmp_path / ".claude" / "tasks" / team + d.mkdir(parents=True, exist_ok=True) + (d / f"{task_id}.json").write_text( + json.dumps({"id": task_id, **fields}), encoding="utf-8" + ) + + +def test_count_active_tasks_counts_pending_and_in_progress(tmp_path, monkeypatch): + monkeypatch.setattr(Path, "home", lambda: tmp_path) + team = "team-counts" + _stage_task(tmp_path, team, "1", status="pending", owner="x") + _stage_task(tmp_path, team, "2", status="in_progress", owner="y") + _stage_task(tmp_path, team, "3", status="completed", owner="z") + _stage_task(tmp_path, team, "4", status="deleted", owner="w") + assert wl.count_active_tasks(team) == 2 + + +def test_count_active_tasks_skips_signal_and_exempt(tmp_path, monkeypatch): + monkeypatch.setattr(Path, "home", lambda: tmp_path) + team = "team-carveouts" + _stage_task(tmp_path, team, "real", status="in_progress", owner="x") + _stage_task( + tmp_path, team, "sig", + status="in_progress", owner="y", + metadata={"completion_type": "signal", "type": "blocker"}, + ) + _stage_task(tmp_path, team, "sec", status="in_progress", owner="secretary") + assert wl.count_active_tasks(team) == 1 + + +def test_count_active_tasks_skips_unparseable_files(tmp_path, monkeypatch): + monkeypatch.setattr(Path, "home", lambda: tmp_path) + team = "team-malformed" + d = tmp_path / ".claude" / "tasks" / team + d.mkdir(parents=True) + _stage_task(tmp_path, team, "ok", status="in_progress", owner="x") + (d / "garbage.json").write_text("not valid json {{{", encoding="utf-8") + assert wl.count_active_tasks(team) == 1 + + +# ---------- Pure-never-raises property ---------- + +@pytest.mark.parametrize("bad_input", [ + None, + "", + "/etc", + "team\x00with-null", + "../../../escape", + 42, +]) +def test_count_active_tasks_never_raises_on_bad_team_name(bad_input, tmp_path, monkeypatch): + """Pure-function contract: any input shape exits with a count, never + raises. Pinning this lets the redundant try/except at + session_init.py:728-730 be removed in future cleanup.""" + monkeypatch.setattr(Path, "home", lambda: tmp_path) + # Should not raise for any bad input. + result = wl.count_active_tasks(bad_input) # type: ignore[arg-type] + assert isinstance(result, int) + assert result >= 0 + + +def test_lifecycle_relevant_never_raises(): + for bad in [None, [], {}, {"status": None}, {"metadata": None}, + {"status": "in_progress", "owner": None}, + {"status": "in_progress", "metadata": []}]: + try: + result = wl._lifecycle_relevant(bad) + except Exception as exc: # pragma: no cover + pytest.fail(f"_lifecycle_relevant raised on {bad!r}: {exc}") + assert isinstance(result, bool) + + +# Adversarial-shape sweep across the (status, owner, metadata) cartesian +# product. Pins pure-never-raises for the predicate against arbitrary +# task shapes — required to gate the future try/except cleanup at +# session_init.py:728-730 (the gate depends on the WHOLE call chain +# being raise-free, not just the count_active_tasks entry point). +_BAD_STATUSES = [ + None, "", "kaboom", 42, 3.14, [], {}, True, b"bytes", +] +_BAD_OWNERS = [ + None, "", 42, [], {}, ["secretary"], {"name": "x"}, True, +] +_BAD_METADATAS = [ + "string", 42, [], True, + {"completion_type": 42}, # wrong type for completion_type + {"completion_type": "signal", "type": []}, # wrong type for type + {"completion_type": "signal", "type": "blocker", "extra": object()}, + {"nested": {"deep": {"very": "deep"}}}, +] + + +@pytest.mark.parametrize("status", _BAD_STATUSES) +def test_lifecycle_relevant_never_raises_on_adversarial_status(status): + task = {"status": status, "owner": "x", "metadata": {}} + try: + result = wl._lifecycle_relevant(task) + except Exception as exc: # pragma: no cover + pytest.fail(f"_lifecycle_relevant raised on status={status!r}: {exc}") + assert isinstance(result, bool) + + +@pytest.mark.parametrize("owner", _BAD_OWNERS) +def test_lifecycle_relevant_never_raises_on_adversarial_owner(owner): + task = {"status": "in_progress", "owner": owner, "metadata": {}} + try: + result = wl._lifecycle_relevant(task) + except Exception as exc: # pragma: no cover + pytest.fail(f"_lifecycle_relevant raised on owner={owner!r}: {exc}") + assert isinstance(result, bool) + + +@pytest.mark.parametrize("metadata", _BAD_METADATAS) +def test_lifecycle_relevant_never_raises_on_adversarial_metadata(metadata): + task = {"status": "in_progress", "owner": "x", "metadata": metadata} + try: + result = wl._lifecycle_relevant(task) + except Exception as exc: # pragma: no cover + pytest.fail(f"_lifecycle_relevant raised on metadata={metadata!r}: {exc}") + assert isinstance(result, bool) + + +@pytest.mark.parametrize("task", [ + {"status": "in_progress", "owner": ["secretary"], "metadata": []}, + {"status": [], "owner": {}, "metadata": "string"}, + {"status": None, "owner": None, "metadata": None}, + {"status": "pending", "owner": "kaboom", "metadata": {"completion_type": []}}, + {"status": 42, "owner": 99, "metadata": {"type": []}}, +]) +def test_lifecycle_relevant_never_raises_on_combined_adversarial_shapes(task): + """Cross-field adversarial combinations — catches interactions + between the status gate, owner-membership check, and metadata + parse paths.""" + try: + result = wl._lifecycle_relevant(task) + except Exception as exc: # pragma: no cover + pytest.fail(f"_lifecycle_relevant raised on {task!r}: {exc}") + assert isinstance(result, bool) + + +# ---------- Dotfile exclusion (te-M2) ---------- + +def test_count_active_tasks_excludes_dotfile_prefixed_json(tmp_path, monkeypatch): + """Dotfile-prefixed `.fake_task.json` files planted in the team + directory must not influence the count. (Path.glob('*.json') matches + dotfiles on POSIX, contra a common assumption — the explicit + `name.startswith('.')` guard in iter_team_task_jsons is what excludes + them.) Without that guard, an attacker who can write a single + dotfile into the team's tasks dir could inflate the active-tasks + count and suppress Teardown emit.""" + monkeypatch.setattr(Path, "home", lambda: tmp_path) + team = "team-dotfile" + d = tmp_path / ".claude" / "tasks" / team + d.mkdir(parents=True) + # One legitimate active task. + _stage_task(tmp_path, team, "real", status="in_progress", owner="x") + # Dotfile-prefixed shape that would be active if matched. + (d / ".fake_task.json").write_text( + json.dumps({"id": "fake", "status": "in_progress", "owner": "y"}), + encoding="utf-8", + ) + # Dotfile-only file (pure leading-dot). + (d / ".hidden.json").write_text( + json.dumps({"id": "hidden", "status": "in_progress", "owner": "z"}), + encoding="utf-8", + ) + assert wl.count_active_tasks(team) == 1 + + +# ---------- Symlink-escape defense (be-B1) ---------- + +def test_count_active_tasks_returns_zero_when_team_dir_symlink_escapes_root(tmp_path, monkeypatch): + """Symlink-escape defense: even if `team_name` passes the safe-path + allowlist, a symlink at ~/.claude/tasks/{team_name} pointing outside + tasks_root must be detected via resolve()+relative_to and counted + as 0. Mirrors session_end.py::cleanup_wake_registry's defense.""" + monkeypatch.setattr(Path, "home", lambda: tmp_path) + tasks_root = tmp_path / ".claude" / "tasks" + tasks_root.mkdir(parents=True) + # Outside tasks_root: a directory with a real active task. + outside = tmp_path / "elsewhere" + outside.mkdir() + (outside / "real.json").write_text( + json.dumps({"id": "real", "status": "in_progress", "owner": "x"}), + encoding="utf-8", + ) + # team_name is allowlist-safe, but the team_dir symlinks outside. + team = "team-sym" + (tasks_root / team).symlink_to(outside, target_is_directory=True) + # Without symlink-escape defense the count would be 1; with it, 0. + assert wl.count_active_tasks(team) == 0 + + +# ---------- Per-file symlink defense (be-F1) ---------- + +def test_count_active_tasks_skips_symlinked_task_files(tmp_path, monkeypatch): + """Per-file symlink defense (be-F1): even when the team_dir is + legitimate, individual task-file entries that are symlinks must be + skipped. The team_dir-level resolve()+relative_to defense catches + a malicious team_dir, but a regular team_dir with a planted symlink + inside (e.g., `~/.claude/tasks/team-x/task-1.json -> /etc/passwd`) + would otherwise be read by iter_team_task_jsons. Skip silently — the + platform task system writes only regular files. + + Counter-test-by-revert: removing the per-file is_symlink guard + would let a planted symlink contribute to the count.""" + monkeypatch.setattr(Path, "home", lambda: tmp_path) + team = "team-symfile" + d = tmp_path / ".claude" / "tasks" / team + d.mkdir(parents=True) + # One legitimate active task as a regular file. + _stage_task(tmp_path, team, "real", status="in_progress", owner="x") + # External payload that the symlink will point at (active-shaped). + elsewhere = tmp_path / "elsewhere" + elsewhere.mkdir() + target = elsewhere / "planted.json" + target.write_text( + json.dumps({"id": "planted", "status": "in_progress", "owner": "y"}), + encoding="utf-8", + ) + # Symlinked task file inside the team dir — must be skipped. + (d / "planted.json").symlink_to(target) + assert wl.count_active_tasks(team) == 1 diff --git a/pact-plugin/tests/test_inbox_wake_session_end_cleanup.py b/pact-plugin/tests/test_inbox_wake_session_end_cleanup.py new file mode 100644 index 00000000..38949f86 --- /dev/null +++ b/pact-plugin/tests/test_inbox_wake_session_end_cleanup.py @@ -0,0 +1,239 @@ +""" +Behavioral invariants for session_end.cleanup_wake_registry. + +Pins the lead-only single-file unlink contract (NOT glob), the +path-traversal defense layers (is_safe_path_component + +resolve+relative_to), and Path.unlink missing_ok=True wrapped in +try/except OSError. Asserts the helper never raises across 9 negative +inputs (closes #602 reviewer-flagged coverage gap). +""" + +import os +from pathlib import Path + +import pytest + +# Hooks dir is on sys.path via conftest. +import session_end +from session_end import cleanup_wake_registry + +SESSION_END_PATH = ( + Path(__file__).resolve().parent.parent / "hooks" / "session_end.py" +) + + +@pytest.fixture(scope="module") +def src() -> str: + return SESSION_END_PATH.read_text(encoding="utf-8") + + +# ---------- Source-level structural invariants ---------- + +def test_cleanup_wake_registry_targets_single_fixed_filename(src): + """Lead-only scope: NO glob, single fixed-name STATE_FILE.""" + # Locate the function body to check just its scope. + start = src.find("def cleanup_wake_registry(") + assert start >= 0 + end = src.find("\ndef ", start + 1) + body = src[start:end if end > 0 else len(src)] + assert "inbox-wake-state.json" in body + # The function must NOT use glob — single-file unlink only. + assert ".glob(" not in body + + +def test_cleanup_wake_registry_uses_safe_path_component(src): + start = src.find("def cleanup_wake_registry(") + end = src.find("\ndef ", start + 1) + body = src[start:end if end > 0 else len(src)] + assert "is_safe_path_component" in body + + +def test_cleanup_wake_registry_uses_resolve_and_relative_to(src): + """Symlink-escape defense layer.""" + start = src.find("def cleanup_wake_registry(") + end = src.find("\ndef ", start + 1) + body = src[start:end if end > 0 else len(src)] + assert ".resolve()" in body + assert ".relative_to(" in body + + +def test_cleanup_wake_registry_uses_missing_ok(src): + start = src.find("def cleanup_wake_registry(") + end = src.find("\ndef ", start + 1) + body = src[start:end if end > 0 else len(src)] + assert "missing_ok=True" in body + + +def test_cleanup_wake_registry_wraps_unlink_in_try_except_oserror(src): + start = src.find("def cleanup_wake_registry(") + end = src.find("\ndef ", start + 1) + body = src[start:end if end > 0 else len(src)] + assert "except OSError" in body + + +# ---------- Behavioral: never-raises across negative inputs ---------- + +@pytest.fixture +def home_root(tmp_path, monkeypatch): + """Redirect Path.home() and shadow the resolved teams root.""" + monkeypatch.setattr(Path, "home", lambda: tmp_path) + return tmp_path + + +@pytest.mark.parametrize("bad_team", [ + "", + "..", + "../escape", + "team/with-slash", + "team\\with-backslash", + ".", + "team\x00null", + "team\nwith-newline", + "team with space", +]) +def test_cleanup_wake_registry_rejects_bad_team_name(bad_team, home_root): + """Each invalid team_name must early-return without touching disk + and without raising.""" + cleanup_wake_registry(bad_team) # returns None on rejection + + +def test_cleanup_wake_registry_handles_missing_team_dir(home_root): + """team_name passes the safe-component gate but the team dir does + not exist on disk — should be a quiet no-op.""" + cleanup_wake_registry("ghost-team") # never raises + + +def test_cleanup_wake_registry_handles_present_state_file(home_root): + team = "real-team" + team_dir = home_root / ".claude" / "teams" / team + team_dir.mkdir(parents=True) + state_file = team_dir / "inbox-wake-state.json" + state_file.write_text('{"v":1,"monitor_task_id":"abc"}', encoding="utf-8") + assert state_file.exists() + + cleanup_wake_registry(team) + + assert not state_file.exists() + + +def test_cleanup_wake_registry_handles_state_file_already_gone(home_root): + team = "team-no-state" + (home_root / ".claude" / "teams" / team).mkdir(parents=True) + cleanup_wake_registry(team) # missing_ok=True + + +def test_cleanup_wake_registry_does_not_touch_unrelated_files(home_root): + """Selective unlink — only inbox-wake-state.json.""" + team = "selective" + team_dir = home_root / ".claude" / "teams" / team + team_dir.mkdir(parents=True) + other = team_dir / "config.json" + other.write_text("{}", encoding="utf-8") + state = team_dir / "inbox-wake-state.json" + state.write_text("{}", encoding="utf-8") + + cleanup_wake_registry(team) + + assert other.exists() + assert not state.exists() + + +def test_cleanup_wake_registry_resists_symlink_escape(home_root): + """If team_dir resolves outside teams_root (symlink escape), the + function must not unlink anything.""" + teams_root = home_root / ".claude" / "teams" + teams_root.mkdir(parents=True) + # Create a real "victim" directory outside the teams root that holds + # a state file we don't want touched. + victim_dir = home_root / "outside" + victim_dir.mkdir() + victim_state = victim_dir / "inbox-wake-state.json" + victim_state.write_text("{}", encoding="utf-8") + + # Make a symlinked "team" that points at the victim dir. + link = teams_root / "evil" + try: + os.symlink(victim_dir, link) + except (OSError, NotImplementedError): + pytest.skip("symlinks unavailable on this filesystem") + + cleanup_wake_registry("evil") + + # The escape must have been refused — the victim file is intact. + assert victim_state.exists() + + +def test_cleanup_wake_registry_call_site_in_session_end(src): + """Belt-and-suspenders force-termination cleanup path: confirm + cleanup_wake_registry is wired into the main session_end flow with + the current team_name.""" + assert "cleanup_wake_registry(current_team_name)" in src + + +# ---------- Behavioral: cleanup_wake_registry actually fires from session_end.main ---------- + +import subprocess # noqa: E402 — used by the behavioral test below +import sys # noqa: E402 + +SESSION_END_HOOK = ( + Path(__file__).resolve().parent.parent / "hooks" / "session_end.py" +) + + +def test_session_end_main_unlinks_state_file_via_callsite(tmp_path): + """Behavioral pin (B4): structurally pinning the callsite is + false-RED-prone on benign refactor (rename of the helper, swap to + keyword-arg call, etc.). This pipes synthesized stdin through + session_end.main() in a subprocess, with a real STATE_FILE staged + on disk under a synthesized HOME. After the hook returns, the + state file must be gone — proves the callsite reaches the actual + cleanup helper, not just that the source line is present.""" + home = tmp_path / "home" + home.mkdir() + # Build a complete pact-session-context so session_end resolves + # team_name without falling through to the empty-team_name branch + # that skips the callsite. + sid = "test-session-id-1234" + pdir = "/tmp/test-end-flow" + team = "pact-cleanup-target" + slug = Path(pdir).name + sess_dir = home / ".claude" / "pact-sessions" / slug / sid + sess_dir.mkdir(parents=True) + (sess_dir / "pact-session-context.json").write_text( + '{{"team_name":"{0}","session_id":"{1}","project_dir":"{2}",' + '"plugin_root":"","started_at":"2026-04-30T00:00:00Z"}}'.format( + team, sid, pdir + ), + encoding="utf-8", + ) + # Stage the team dir with a STATE_FILE that the callsite must remove. + team_dir = home / ".claude" / "teams" / team + team_dir.mkdir(parents=True) + state_file = team_dir / "inbox-wake-state.json" + state_file.write_text( + '{"v":1,"monitor_task_id":"x","armed_at":"2026-04-30T00:00:00Z"}', + encoding="utf-8", + ) + assert state_file.exists() + + payload = '{"session_id":"' + sid + '","cwd":"' + pdir + '"}' + env = {k: v for k, v in os.environ.items() if not k.startswith("CLAUDE_")} + env.update({"HOME": str(home), "CLAUDE_PROJECT_DIR": pdir}) + proc = subprocess.run( + [sys.executable, str(SESSION_END_HOOK)], + input=payload.encode("utf-8"), + capture_output=True, + env=env, + timeout=30, + ) + # session_end may print warnings or journal-failure notes on stderr; + # exit code is the contract. Best-effort cleanup means non-zero is + # not expected under our happy-path setup. + assert proc.returncode == 0, ( + f"session_end exited {proc.returncode}; stderr={proc.stderr!r}" + ) + # The behavioral assertion: state file must be gone. + assert not state_file.exists(), ( + "session_end.main did not unlink the STATE_FILE — callsite " + "may be unreachable or guarded out" + ) diff --git a/pact-plugin/tests/test_inbox_wake_session_init.py b/pact-plugin/tests/test_inbox_wake_session_init.py new file mode 100644 index 00000000..0de25e80 --- /dev/null +++ b/pact-plugin/tests/test_inbox_wake_session_init.py @@ -0,0 +1,161 @@ +""" +Structural invariants for session_init.py's resume-with-active-tasks +Arm directive (Option-C gap closure for #591). + +Source-grep tests: pin the directive prose, the count_active_tasks +call site, and the unconditional-emission discipline. We don't run +session_init.py end-to-end — these are file-parsing fences. +""" + +from pathlib import Path + +import pytest + +SESSION_INIT_PATH = ( + Path(__file__).resolve().parent.parent / "hooks" / "session_init.py" +) + + +@pytest.fixture(scope="module") +def src() -> str: + return SESSION_INIT_PATH.read_text(encoding="utf-8") + + +def test_imports_count_active_tasks_from_wake_lifecycle(src): + assert "from shared.wake_lifecycle import count_active_tasks" in src + + +def test_calls_count_active_tasks(src): + # Single call site at the resume-Arm branch. + assert src.count("count_active_tasks(team_name)") >= 1 + + +def test_directive_references_watch_inbox_command_slug(src): + assert 'Skill("PACT:watch-inbox")' in src + + +def test_directive_includes_idempotency_clause(src): + # Cycle 4 directive prose: "Idempotent — no-op if a valid + # STATE_FILE is already on disk." Source is split across two + # quoted strings via Python implicit-concat, so substring matches + # must accommodate the line break — pin shorter fragments. + assert "idempotent" in src.lower() + assert "no-op if a valid" in src + assert "STATE_FILE is already on disk" in src + + +def test_directive_includes_active_task_trigger_phrase(src): + """The Tier-0 directive must declare the precondition (active tasks + on disk) so an LLM reader cannot misread it as unconditional Arm + on every session start.""" + assert "Active teammate tasks detected" in src + + +def test_directive_emitted_only_when_count_positive(src): + """Guard the emission with a positive-count check. The directive + must NOT fire on sessions with zero active teammate tasks.""" + assert "if active_count > 0:" in src + + +def test_directive_appended_to_context_parts(src): + """The directive flows through Tier-0 additionalContext via the + context_parts append channel, not via a separate emission path.""" + # Source contains a `context_parts.append(` near the Arm directive. + assert "context_parts.append(" in src + # And the directive prose lives in that block. + assert ( + "Active teammate tasks detected on session start." in src + ) + + +# ---------- Behavioral: session_init Arm-emit gate fires only when count>0 ---------- + +import json # noqa: E402 +import os # noqa: E402 +import subprocess # noqa: E402 +import sys # noqa: E402 + +SESSION_INIT_HOOK = SESSION_INIT_PATH + + +_ARM_DIRECTIVE_PHRASE = "Active teammate tasks detected on session start." + + +def _stage_pact_session(home: Path, team: str, sid: str, pdir: str) -> None: + slug = Path(pdir).name + sess_dir = home / ".claude" / "pact-sessions" / slug / sid + sess_dir.mkdir(parents=True, exist_ok=True) + (sess_dir / "pact-session-context.json").write_text( + json.dumps({ + "team_name": team, + "session_id": sid, + "project_dir": pdir, + "plugin_root": "", + "started_at": "2026-04-30T00:00:00Z", + }), + encoding="utf-8", + ) + + +def _stage_active_task(home: Path, team: str) -> None: + tasks_dir = home / ".claude" / "tasks" / team + tasks_dir.mkdir(parents=True, exist_ok=True) + (tasks_dir / "1.json").write_text( + json.dumps({"id": "1", "status": "in_progress", "owner": "backend-coder"}), + encoding="utf-8", + ) + + +def _run_session_init(home: Path, sid: str, pdir: str, source: str = "resume") -> dict: + payload = json.dumps({"session_id": sid, "cwd": pdir, "source": source}) + env = {k: v for k, v in os.environ.items() if not k.startswith("CLAUDE_")} + env.update({"HOME": str(home), "CLAUDE_PROJECT_DIR": pdir}) + proc = subprocess.run( + [sys.executable, str(SESSION_INIT_HOOK)], + input=payload.encode("utf-8"), + capture_output=True, + env=env, + timeout=30, + ) + assert proc.returncode == 0, f"session_init exited {proc.returncode}; stderr={proc.stderr!r}" + return json.loads(proc.stdout.decode("utf-8") or "{}") + + +def test_session_init_omits_arm_directive_when_no_active_tasks(tmp_path): + """Behavioral pin (B4): Arm-emit gate must fire only when + count_active_tasks > 0. Pure-structural source-grep is false-RED-prone + on benign refactor (e.g., extracting a helper); subprocess execution + confirms the gate's actual emit semantics. With zero active tasks + on disk, the directive prose must NOT appear in additionalContext.""" + home = tmp_path / "home"; home.mkdir() + # session_id[:8] filters to [a-f0-9-]; use a pure-hex session_id so + # generate_team_name returns a predictable team name. + sid = "abcdef01-no-tasks-here" + pdir = "/tmp/pi-empty" + team = "pact-abcdef01" + _stage_pact_session(home, team, sid, pdir) + # Stage the team's tasks dir but leave it empty. + (home / ".claude" / "tasks" / team).mkdir(parents=True) + out = _run_session_init(home, sid, pdir) + additional = out.get("hookSpecificOutput", {}).get("additionalContext", "") + assert _ARM_DIRECTIVE_PHRASE not in additional, ( + "Arm directive emitted with zero active tasks — gate is broken" + ) + + +def test_session_init_emits_arm_directive_when_active_tasks_present(tmp_path): + """Symmetric behavioral pin: with one active task on disk, + additionalContext must carry the Arm directive's precondition phrase.""" + home = tmp_path / "home"; home.mkdir() + sid = "deadbeef-active-task-present" + pdir = "/tmp/pi-active" + team = "pact-deadbeef" + _stage_pact_session(home, team, sid, pdir) + _stage_active_task(home, team) + out = _run_session_init(home, sid, pdir) + additional = out.get("hookSpecificOutput", {}).get("additionalContext", "") + assert _ARM_DIRECTIVE_PHRASE in additional, ( + "Arm directive missing despite active task on disk — gate is broken" + ) + # And the directive references the canonical command slug. + assert 'Skill("PACT:watch-inbox")' in additional diff --git a/pact-plugin/tests/test_inbox_wake_version_bump.py b/pact-plugin/tests/test_inbox_wake_version_bump.py new file mode 100644 index 00000000..f6b633cd --- /dev/null +++ b/pact-plugin/tests/test_inbox_wake_version_bump.py @@ -0,0 +1,68 @@ +""" +Version-bump consistency invariants for the 3.21.0 release. + +The plugin version is tracked in 4 files; all four must carry the same +version literal, with zero stale references to the prior 3.20.4. +""" + +import json +from pathlib import Path + +import pytest + +REPO_ROOT = Path(__file__).resolve().parent.parent.parent +TARGET_VERSION = "3.21.0" +PRIOR_VERSION = "3.20.4" + + +# ---------- 4-file version invariants ---------- + +def test_plugin_json_version(): + p = REPO_ROOT / "pact-plugin" / ".claude-plugin" / "plugin.json" + data = json.loads(p.read_text(encoding="utf-8")) + assert data.get("version") == TARGET_VERSION + + +def test_marketplace_json_version(): + p = REPO_ROOT / ".claude-plugin" / "marketplace.json" + data = json.loads(p.read_text(encoding="utf-8")) + plugins = data.get("plugins", []) + assert plugins, "marketplace.json missing plugins array" + versions = {plugin.get("version") for plugin in plugins} + assert TARGET_VERSION in versions, ( + f"marketplace.json: no plugin entry with version {TARGET_VERSION}; " + f"saw {versions}" + ) + + +def test_root_readme_version(): + p = REPO_ROOT / "README.md" + text = p.read_text(encoding="utf-8") + assert TARGET_VERSION in text, ( + f"root README.md missing target version literal {TARGET_VERSION}" + ) + + +def test_pact_plugin_readme_version(): + p = REPO_ROOT / "pact-plugin" / "README.md" + text = p.read_text(encoding="utf-8") + assert TARGET_VERSION in text, ( + f"pact-plugin/README.md missing version literal {TARGET_VERSION}" + ) + + +# ---------- Stale-version sweep ---------- + +@pytest.mark.parametrize("path", [ + Path("pact-plugin") / ".claude-plugin" / "plugin.json", + Path(".claude-plugin") / "marketplace.json", + Path("README.md"), + Path("pact-plugin") / "README.md", +]) +def test_no_stale_prior_version_token(path): + """No file in the 4-file set may carry the immediate-prior version + string. Catches half-applied bumps.""" + text = (REPO_ROOT / path).read_text(encoding="utf-8") + assert PRIOR_VERSION not in text, ( + f"{path}: stale prior version {PRIOR_VERSION!r} still present" + ) diff --git a/pact-plugin/tests/test_unwatch_inbox_command_structure.py b/pact-plugin/tests/test_unwatch_inbox_command_structure.py new file mode 100644 index 00000000..e97219a7 --- /dev/null +++ b/pact-plugin/tests/test_unwatch_inbox_command_structure.py @@ -0,0 +1,330 @@ +""" +Structural invariants for pact-plugin/commands/unwatch-inbox.md (Teardown role). + +File-parsing assertions only — no command execution. Pin section +presence, F6 'ignoring not-found errors' literal, best-effort framing, +Teardown ordering load-bearing, missing_ok=True usage, and the +cross-link to watch-inbox. + +Cycle 4 audit allocation per Task #52: +- F6 (TaskStop tolerates not-found) → unwatch-inbox +- best-effort framing → unwatch-inbox +- 'wake mechanism is opportunistic' brief mention → unwatch-inbox +""" + +from pathlib import Path + +import pytest + +CMD_PATH = ( + Path(__file__).resolve().parent.parent + / "commands" + / "unwatch-inbox.md" +) + + +@pytest.fixture(scope="module") +def cmd_text() -> str: + return CMD_PATH.read_text(encoding="utf-8") + + +def _section_body(text: str, header: str) -> str: + """Return the body between `header` and the next same-or-higher level header.""" + lines = text.splitlines() + level = len(header) - len(header.lstrip("#")) + start = None + for i, line in enumerate(lines): + if line.strip() == header: + start = i + 1 + break + if start is None: + return "" + end = len(lines) + for j in range(start, len(lines)): + line = lines[j].strip() + if line.startswith("#"): + this_level = len(line) - len(line.lstrip("#")) + if this_level <= level: + end = j + break + return "\n".join(lines[start:end]) + + +# ---------- Frontmatter / file-level invariants ---------- + +def test_command_file_exists(): + assert CMD_PATH.exists() + + +def test_frontmatter_has_description(cmd_text): + """Slash-commands use filename as identity (no `name:` field). + Frontmatter must carry at least a description.""" + assert cmd_text.startswith("---\n") + head, _, _ = cmd_text[4:].partition("\n---\n") + assert "description:" in head + assert "inbox-watch" in head or "unwatch-inbox" in head or "Tear down" in head + + +def test_command_body_under_compaction_budget(cmd_text): + # ~50L target per Task #52; 100-line ceiling for headroom. + line_count = len(cmd_text.splitlines()) + assert line_count <= 100, ( + f"unwatch-inbox.md has {line_count} lines, exceeds 100 cap" + ) + + +# ---------- Section-presence invariants ---------- + +REQUIRED_SECTIONS = [ + "## Overview", + "## When to Invoke", + "## Operation", + "## Teardown Block", + "## Failure Modes", + "## Verification", + "## References", +] + + +@pytest.mark.parametrize("section", REQUIRED_SECTIONS) +def test_required_section_present(cmd_text, section): + assert any(line.strip() == section for line in cmd_text.splitlines()), ( + f"missing required section header: {section}" + ) + + +def test_no_arm_or_recovery_subsection(cmd_text): + """Cycle 4: command IS the operation, no Arm/Recovery sub-sections. + unwatch-inbox is the Teardown command; Arm logic lives in + watch-inbox.md.""" + body = _section_body(cmd_text, "## Operation") + assert "### Arm" not in body + assert "### Recovery" not in body + + +# ---------- F6: TaskStop tolerance literal ---------- + +def test_f6_teardown_block_contains_ignoring_not_found(cmd_text): + """F6 invariant: TaskStop tolerates not-found errors. The literal + phrase 'ignoring not-found errors' (or the close substitute + 'tolerate not-found') must appear in the Teardown Block.""" + teardown = _section_body(cmd_text, "## Teardown Block") + assert ( + "ignoring not-found" in teardown + or "tolerate not-found" in teardown + ), ( + "Teardown Block must carry the F6 tolerance phrase. Without it, " + "an editing LLM 'tightening up error handling' will silently " + "restore crash-on-stale-ID." + ) + + +# ---------- Best-effort framing ---------- + +def test_best_effort_framing_in_overview_or_teardown(cmd_text): + """The 'best-effort' framing prevents an editing LLM from converting + Teardown into a strict-required-success operation.""" + full = cmd_text.lower() + assert "best-effort" in full + + +def test_opportunistic_wake_rationale_present(cmd_text): + """Brief 'wake mechanism is opportunistic' rationale anchors why + Teardown failures are tolerable. Per Task #52: brief mention here; + main treatment lives in watch-inbox.""" + full = cmd_text.lower() + assert ( + "opportunistic" in full + or "no harm done" in full + or "tolerable" in full + ) + + +# ---------- Teardown ordering load-bearing ---------- + +def test_teardown_block_orders_taskstop_before_unlink(cmd_text): + """Order is load-bearing: TaskStop must precede STATE_FILE unlink. + Inverse ordering would leave a brief window where a STATE_FILE-less + Monitor still runs but Arm sees no STATE_FILE and re-arms — orphan.""" + teardown = _section_body(cmd_text, "## Teardown Block") + stop_idx = teardown.find("TaskStop") + # Match either "Unlink STATE_FILE" prose or the canonical + # `Path.unlink(STATE_FILE, missing_ok=True)` form. + unlink_idx = teardown.find("Path.unlink") + if unlink_idx == -1: + unlink_idx = teardown.find("Unlink STATE_FILE") + assert stop_idx >= 0, "TaskStop reference missing from §Teardown Block" + assert unlink_idx >= 0, "STATE_FILE unlink reference missing from §Teardown Block" + assert stop_idx < unlink_idx, ( + "Teardown ordering inverted — TaskStop must precede unlink" + ) + + +def test_teardown_uses_missing_ok(cmd_text): + """Path.unlink missing_ok=True is the load-bearing flag — without + it, the unlink call raises FileNotFoundError when STATE_FILE was + already removed by an earlier Teardown.""" + teardown = _section_body(cmd_text, "## Teardown Block") + assert "missing_ok=True" in teardown + + +# ---------- Failure Modes coverage ---------- + +def test_failure_modes_mentions_monitor_died_silently(cmd_text): + """The Monitor-died-mid-session concept must appear in §Failure Modes + with the specific phrasing — `"Monitor"` alone is too weak (passes on + any prose mentioning Monitor); the F6-anchored phrase pins the + failure-mode entry concretely.""" + fm = _section_body(cmd_text, "## Failure Modes") + assert "Monitor died silently" in fm + + +# ---------- Cross-link to watch-inbox ---------- + +def test_references_section_links_to_watch_inbox(cmd_text): + refs = _section_body(cmd_text, "## References") + assert "watch-inbox" in refs + + +# ---------- Lead-Session Guard (arch-F1) ---------- + +def test_lead_session_guard_section_has_body(cmd_text): + """Teardown must refuse to execute from a teammate session AND the + section must have a non-empty body (not just inline cross-ref). + Phantom-green guard: the prior `'## Lead-Session Guard' in cmd_text` + form would pass on inline references like 'see `## Lead-Session + Guard` below'; this stricter form requires the actual H2 section to + exist and contain content.""" + body = _section_body(cmd_text, "## Lead-Session Guard") + assert body.strip(), ( + "Lead-Session Guard section is missing or empty — phantom-green " + "guard: only an inline cross-ref to the section, not the section " + "itself." + ) + + +def test_lead_session_guard_compares_session_id_to_team_config_lead(cmd_text): + """Same as arm command: signal source MUST be `session_id` against + `team_config.leadSessionId`. Two-source-of-truth defense.""" + guard = _section_body(cmd_text, "## Lead-Session Guard") + assert "leadSessionId" in guard + assert "session_id" in guard + assert "refuse" in guard.lower() + + +# ---------- task_id allowlist validation (sec-M1) ---------- + +def test_teardown_validates_monitor_task_id_against_allowlist_regex(cmd_text): + """The Teardown sequence MUST validate STATE_FILE.monitor_task_id + against an allowlist regex BEFORE calling TaskStop. A poisoned + STATE_FILE could otherwise inject arbitrary strings into a tool-call + argument. The allowlist `^[a-z0-9]{6,}\\Z` matches Claude Code's + task-id format and refuses anything else. + + Anchor must be `\\Z` (absolute end-of-string), NOT `$`. Python + regex's `$` matches before a trailing newline by default, which + means a planted task_id like `bu4hxc2bh\\n; rm -rf ~` would pass + `re.match(r"^[a-z0-9]{6,}$", task_id)` despite the embedded + newline. `\\Z` matches only the absolute end of the string.""" + teardown = _section_body(cmd_text, "## Teardown Block") + assert "^[a-z0-9]{6,}\\Z" in teardown + + +def test_operation_section_validates_monitor_task_id_before_taskstop(cmd_text): + """Independent pin in the Operation section — the validation must be + in the load-bearing procedure list, not just the supplementary + Teardown Block. An editing LLM removing the regex from Operation + would leave the supplementary block as the only mention; this test + pins both surfaces.""" + operation = _section_body(cmd_text, "## Operation") + assert "^[a-z0-9]{6,}\\Z" in operation + + +def test_task_id_regex_uses_absolute_end_anchor_not_dollar(cmd_text): + """Pin the `\\Z` (absolute-end) anchor across both regex surfaces. + `$` matches before a trailing newline in Python's default re mode, + creating a smuggling vector: `bu4hxc2bh\\n; rm -rf` passes + `^[a-z0-9]{6,}$` because `$` accepts the prefix-then-newline + shape. `\\Z` rejects any string with a trailing newline. + + Restricts the search to the procedure-step lines (not audit prose + that may quote the old `$` form as a negative example). The + procedure step is the load-bearing surface — the audit prose can + legitimately reference both forms in a 'use this not that' + explanation.""" + for section_name in ("## Operation", "## Teardown Block"): + body = _section_body(cmd_text, section_name) + # Filter to only the procedure-step lines: lines that begin with + # a digit + dot (numbered procedure step) AND mention the regex. + proc_lines = [ + line for line in body.splitlines() + if line.strip() + and (line.lstrip()[:1].isdigit() or line.lstrip().startswith("- ")) + and "[a-z0-9]" in line + ] + assert proc_lines, ( + f"§{section_name} should contain a procedure step that names " + f"the task-id regex" + ) + for line in proc_lines: + assert "\\Z" in line, ( + f"§{section_name} procedure step uses unanchored `$` " + f"(newline-smuggling vector). Line: {line!r}" + ) + + +# ---------- TOCTOU audit comment (sec-M2) ---------- + +def test_teardown_documents_toctou_window_audit(cmd_text): + """The Teardown Block must document the TOCTOU window between + resolve() and unlink() so an editing LLM understands why the window + is acceptable (same-user-trust assumption) and does not over-engineer + a defense that would not improve the security posture.""" + teardown = _section_body(cmd_text, "## Teardown Block") + assert "TOCTOU" in teardown + assert "same-user" in teardown.lower() + + +# ---------- armed_by_session_id integrity validation (sec-M1) ---------- + +def test_unwatch_inbox_validates_armed_by_session_id_before_taskstop(cmd_text): + """The Teardown sequence must validate STATE_FILE.armed_by_session_id + against the current session_id BEFORE calling TaskStop. Without this + check, a planted/cross-session STATE_FILE would let a different + same-user session weaponize TaskStop against the lead's active + Monitor or other tasks. The check fail-opens to unlink so the + planted file gets cleaned without invoking TaskStop on it.""" + teardown = _section_body(cmd_text, "## Teardown Block") + assert "armed_by_session_id" in teardown + + +def test_operation_validates_armed_by_session_id_before_taskstop(cmd_text): + """Independent pin in the Operation section — the validation must + appear in the load-bearing procedure list, not just the + supplementary Teardown Block.""" + operation = _section_body(cmd_text, "## Operation") + assert "armed_by_session_id" in operation + + +def test_teardown_audit_explains_cross_session_taskstop_threat(cmd_text): + """Audit anchor must name the threat model so an editing LLM + 'tightening' the procedure does not silently strip the integrity + check by reasoning 'the regex already validates'. The audit must + explicitly name the cross-session TaskStop weaponization shape.""" + teardown = _section_body(cmd_text, "## Teardown Block") + assert "cross-session" in teardown.lower() + + +# ---------- arch2-M1: terminal-status doc wording ---------- + +def test_when_to_invoke_uses_terminal_status_wording(cmd_text): + """The When-to-Invoke trigger row must reflect that BOTH `completed` + AND `deleted` terminal statuses fire the Teardown — not just + `completed`. After be-F2 (cycle 6), status=deleted is also a + terminal transition; the doc must match the implementation's + behavior or an editing LLM reading the doc will silently + misrepresent the contract.""" + when = _section_body(cmd_text, "## When to Invoke") + assert "terminal status" in when + assert "completed" in when + assert "deleted" in when diff --git a/pact-plugin/tests/test_watch_inbox_command_structure.py b/pact-plugin/tests/test_watch_inbox_command_structure.py new file mode 100644 index 00000000..03bf1b6c --- /dev/null +++ b/pact-plugin/tests/test_watch_inbox_command_structure.py @@ -0,0 +1,364 @@ +""" +Structural invariants for pact-plugin/commands/watch-inbox.md (Arm role). + +File-parsing assertions only — no command execution. Pin section +presence, alarm-clock framing, F1/F7 invariants, 30/60/120 timing +constants and dual-ratio audit anchors, state-machine edge tokens, +WriteStateFile schema, lead-only narrowing, and negative invariants +(no Cron, no Recovery, no symmetric per-agent tokens). + +Cycle 4 audit allocation per Task #52: +- alarm-clock framing → watch-inbox (governs Monitor's behavior) +- 30/60/120 design ratios + dual-ratio audit → watch-inbox +- F1 (single-file inbox), F7 (stdout discipline) → watch-inbox +- bidirectional drift warnings → watch-inbox +- F6 (TaskStop tolerates not-found) → unwatch-inbox (separate file) +""" + +from pathlib import Path + +import pytest + +CMD_PATH = ( + Path(__file__).resolve().parent.parent + / "commands" + / "watch-inbox.md" +) + + +@pytest.fixture(scope="module") +def cmd_text() -> str: + return CMD_PATH.read_text(encoding="utf-8") + + +def _section_body(text: str, header: str) -> str: + """Return the body between `header` and the next same-or-higher level header.""" + lines = text.splitlines() + level = len(header) - len(header.lstrip("#")) + start = None + for i, line in enumerate(lines): + if line.strip() == header: + start = i + 1 + break + if start is None: + return "" + end = len(lines) + for j in range(start, len(lines)): + line = lines[j].strip() + if line.startswith("#"): + this_level = len(line) - len(line.lstrip("#")) + if this_level <= level: + end = j + break + return "\n".join(lines[start:end]) + + +# ---------- Frontmatter / file-level invariants ---------- + +def test_command_file_exists(): + assert CMD_PATH.exists() + + +def test_frontmatter_has_description(cmd_text): + """Slash-commands use filename as identity (no `name:` field). + Frontmatter must carry at least a description.""" + assert cmd_text.startswith("---\n") + head, _, _ = cmd_text[4:].partition("\n---\n") + assert "description:" in head + assert "inbox-watch" in head or "watch-inbox" in head + + +def test_command_body_under_compaction_budget(cmd_text): + # ~120L target per Task #52; 200-line ceiling allows headroom for + # future revisions while preventing unbounded growth. + line_count = len(cmd_text.splitlines()) + assert line_count <= 200, ( + f"watch-inbox.md has {line_count} lines, exceeds 200 cap" + ) + + +# ---------- Section-presence invariants ---------- + +REQUIRED_SECTIONS = [ + "## Overview", + "## When to Invoke", + "## Operation", + "## Monitor Block", + "## WriteStateFile Block", + "## Failure Modes", + "## Verification", + "## References", +] + + +@pytest.mark.parametrize("section", REQUIRED_SECTIONS) +def test_required_section_present(cmd_text, section): + assert any(line.strip() == section for line in cmd_text.splitlines()), ( + f"missing required section header: {section}" + ) + + +def test_no_separate_arm_or_teardown_subsection(cmd_text): + """Cycle 4: command IS the operation, no Arm/Teardown sub-headers + inside ## Operation. Per Task #52: 'no `## Operations` sub-section + listing Arm/Teardown'.""" + body = _section_body(cmd_text, "## Operation") + assert "### Arm" not in body + assert "### Teardown" not in body + # Recovery never reintroduced — D1 fence. + assert "### Recovery" not in body + + +# ---------- Negative invariants (D1 fence) ---------- + +@pytest.mark.parametrize("forbidden", [ + "## Cron Block", + "## Wake-State-Check Algorithm", + "## Per-Branch Action Sequences", + "## Recovery", + "_WAKE_ARM_TEMPLATE", + "{agent_name}", +]) +def test_forbidden_token_absent(cmd_text, forbidden): + assert forbidden not in cmd_text, ( + f"forbidden token reintroduced: {forbidden}" + ) + + +# ---------- F1: single-file inbox + wc -c byte-grow ---------- + +def test_f1_single_file_inbox_path_hardcoded(cmd_text): + monitor = _section_body(cmd_text, "## Monitor Block") + assert "inboxes/team-lead.json" in monitor + assert "wc -c" in monitor + + +def test_f1_single_file_inbox_phrase(cmd_text): + """Narrative anchor must mark the inbox as a single JSON file (not + a directory).""" + assert "single JSON file" in cmd_text + overview = _section_body(cmd_text, "## Overview") + assert "team-lead.json" in overview + + +# ---------- F7: stdout discipline + between-tool-call scope ---------- + +def test_f7_between_tool_calls_not_mid_tool(cmd_text): + overview = _section_body(cmd_text, "## Overview") + assert "between tool calls" in overview + assert "not mid-tool" in overview or "instant interrupt anywhere" in overview + + +def test_f7_stdout_discipline_in_monitor_block(cmd_text): + monitor = _section_body(cmd_text, "## Monitor Block") + assert "Stdout discipline" in monitor + assert "INBOX_GREW" in monitor + assert ">&2" in monitor + + +def test_long_single_tool_failure_mode_documented(cmd_text): + fm = _section_body(cmd_text, "## Failure Modes") + assert "Long single-tool calls block wake delivery" in fm + # Pin the present-tense scope rule so the entry cannot be silently + # edited into a vague claim. Both phrases together establish the + # between-tool-call boundary. + assert "between tool calls" in fm + assert "not mid-tool" in fm + + +# ---------- Alarm-clock framing (audit-anchor invariant) ---------- + +def test_alarm_clock_paragraph_present(cmd_text): + overview = _section_body(cmd_text, "## Overview") + assert "Monitor is an alarm clock, not a mailbox" in overview + + +def test_alarm_clock_no_narration_clause(cmd_text): + overview = _section_body(cmd_text, "## Overview") + assert ( + "without emitting acknowledgment text" in overview + or "no narration" in overview.lower() + or "return to idle" in overview + ) + + +# ---------- 30/60/120 timing fences ---------- + +@pytest.mark.parametrize("token", [ + "POLL=30", + "QUIET_REQUIRED=60", + "MAX_DELAY=120", +]) +def test_timing_constant_present(cmd_text, token): + monitor = _section_body(cmd_text, "## Monitor Block") + assert token in monitor + + +def test_audit_documents_quiet_equals_two_times_poll(cmd_text): + """Pin the QUIET = 2*POLL design rationale anchor in the Monitor + Block audit annotation. Robust substring fallback set.""" + monitor = _section_body(cmd_text, "## Monitor Block") + candidates = [ + "two consecutive quiet poll cycles", + "two quiet poll cycles", + "QUIET = 2*POLL", + "QUIET = 2 * POLL", + "QUIET_REQUIRED = 2*POLL", + "QUIET_REQUIRED = 2 * POLL", + "2 × POLL", + ] + assert any(c in monitor for c in candidates), ( + "Monitor Block audit must anchor QUIET = 2*POLL design choice. " + f"None of {candidates} found in §Monitor Block." + ) + + +def test_audit_documents_max_delay_ratio_to_quiet(cmd_text): + """Pin the MAX_DELAY = 2*QUIET (= 4*POLL) design-choice anchor. + Robust substring fallback set.""" + monitor = _section_body(cmd_text, "## Monitor Block") + candidates = [ + "MAX_DELAY = 2*QUIET", + "MAX_DELAY = 2 * QUIET", + "MAX_DELAY = 4*POLL", + "MAX_DELAY = 4 * POLL", + "twice QUIET", + "2 × QUIET", + "4 × POLL", + "two QUIET", + ] + assert any(c in monitor for c in candidates), ( + "Monitor Block audit must anchor MAX_DELAY = 2*QUIET (= 4*POLL) " + f"design choice. None of {candidates} found in §Monitor Block." + ) + + +# ---------- State machine edge tokens ---------- + +@pytest.mark.parametrize("edge", ["FIRST_GROW", "LAST_GROW", "MAX_DELAY"]) +def test_state_machine_edge_token_present(cmd_text, edge): + monitor = _section_body(cmd_text, "## Monitor Block") + assert edge in monitor + + +def test_state_machine_states_present(cmd_text): + monitor = _section_body(cmd_text, "## Monitor Block") + assert "PENDING" in monitor + assert "GROWING" in monitor + + +# ---------- WriteStateFile schema (lead-only fence) ---------- + +def test_writestate_block_has_four_fields(cmd_text): + """STATE_FILE schema: 4 fields. `armed_by_session_id` was added in + cycle 7 to enable cross-session TaskStop weaponization defense at + Teardown time (see unwatch-inbox §Teardown Block audit anchor).""" + block = _section_body(cmd_text, "## WriteStateFile Block") + for field in ("v", "monitor_task_id", "armed_at", "armed_by_session_id"): + assert f'"{field}"' in block, f"missing schema field: {field}" + + +def test_writestate_block_no_watchdog_tokens(cmd_text): + block = _section_body(cmd_text, "## WriteStateFile Block") + assert "cron_job_id" not in block + assert "heartbeat" not in block + + +def test_writestate_path_is_lead_only_fixed(cmd_text): + block = _section_body(cmd_text, "## WriteStateFile Block") + assert "inbox-wake-state.json" in block + assert "{agent-name}" not in block + + +# ---------- Lead-only narrowing ---------- + +def test_lead_only_scope_in_overview(cmd_text): + overview = _section_body(cmd_text, "## Overview") + assert "Single-Monitor model" in overview + assert "Lead-only" in cmd_text or "lead-only" in cmd_text.lower() + + +# ---------- Failure Modes coverage ---------- + +@pytest.mark.parametrize("entry", [ + "Silent Monitor death", + "Long single-tool calls block wake delivery", + "Wake-fire inflation under bursty traffic", + "Malformed STATE_FILE", +]) +def test_failure_modes_entry_present(cmd_text, entry): + fm = _section_body(cmd_text, "## Failure Modes") + assert entry in fm + + +# ---------- Cross-link to unwatch-inbox ---------- + +def test_references_section_links_to_unwatch_inbox(cmd_text): + refs = _section_body(cmd_text, "## References") + assert "unwatch-inbox" in refs + + +# ---------- Lead-Session Guard (arch-F1) ---------- + +def test_lead_session_guard_section_has_body(cmd_text): + """Arm command must refuse to execute from a teammate session AND + must have a non-empty section body (not just an inline cross-ref). + Phantom-green guard: the prior `'## Lead-Session Guard' in cmd_text` + form passes on inline references like 'see `## Lead-Session Guard` + below'; this stricter form requires the actual H2 section to exist + and contain content.""" + body = _section_body(cmd_text, "## Lead-Session Guard") + assert body.strip(), ( + "Lead-Session Guard section is missing or empty — phantom-green " + "guard: only an inline cross-ref to the section, not the section " + "itself." + ) + + +def test_lead_session_guard_compares_session_id_to_team_config_lead(cmd_text): + """The guard's signal source MUST be `session_id` against + `team_config.leadSessionId` — NOT a hypothetical `agent_type` field + on pact-session-context.json. The team config is the canonical + source of truth for lead identity; replicating that into + session-context creates two-source-of-truth drift.""" + guard = _section_body(cmd_text, "## Lead-Session Guard") + assert "leadSessionId" in guard + assert "session_id" in guard + assert "refuse" in guard.lower() + + +# ---------- Monitor bash shell-injection-vector absence (sec-F1) ---------- + +def test_monitor_bash_has_no_shell_injection_vectors(cmd_text): + """Coarse but stable invariant: the Monitor bash block must contain + no shell-injection vectors (positional args, eval, command- + substitution against external commands, backticks). The actual + security property is "no user-controlled string flows into shell + interpretation"; a positive allowlist of safe-vars is brittle + across future Monitor changes, so we negative-allowlist the dangerous + primitives instead.""" + monitor = _section_body(cmd_text, "## Monitor Block") + # Extract the bash fence body. The block starts with ```bash. + if "```bash" not in monitor: + pytest.fail("Monitor Block must contain a ```bash fenced block") + bash_body = monitor.split("```bash", 1)[1].split("```", 1)[0] + + forbidden_tokens = [ + "$1", "$2", "$3", "$@", "$*", # positional args (untrusted caller input) + "eval ", # arbitrary string execution + "`", # backticks (legacy command substitution) + "$(curl", # network command-substitution + "$(wget", + "$(nc ", + "$(bash", + "$(sh ", + "$(eval", + "$(rm", # destructive shell-out + "$(python", # interpreter substitution (host filesystem read) + "<(", # process substitution (read FD from subshell) + ">(", # process substitution (write FD to subshell) + ] + for tok in forbidden_tokens: + assert tok not in bash_body, ( + f"Monitor bash contains shell-injection vector: {tok!r}" + )