From bf3657bc8845557d538d91bc6c3d7ea7d520394f Mon Sep 17 00:00:00 2001 From: Andrew Jon Przybilla Date: Tue, 12 May 2026 21:25:45 -0700 Subject: [PATCH] Trace MITM scoping + hybrid busy signal + stuck-handle safety net MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Scoped MITM (LLM-host allowlist) - Add `runtime/trace/known-hosts.ts` exporting `isKnownLlmHost` and `KNOWN_LLM_HOST_PATTERNS`. Seeded with Anthropic, OpenAI, and Azure OpenAI subdomain patterns. Single source of truth the proxy and the decoder both consult. - Proxy now takes a `shouldMitm?: (host) => boolean` predicate. `host.ts` wires `isKnownLlmHost`. Hosts off the allowlist fall through to the existing raw TCP tunnel — the agent's TLS client talks straight to the real upstream cert, system trust applies, we never see plaintext. Hosts on the allowlist get the MITM treatment as before. Predicate omission preserves legacy "decrypt everything" for tests. - `anthropic.ts` re-imports the regex from `known-hosts.ts` plus a module-load sanity check so the decoder and allowlist can't drift silently. - Practical effect: curl / git / python / wget invocations from an agent (or from the bundled tools an agent calls) now succeed against any HTTPS host without needing `SSL_CERT_FILE` / `REQUESTS_CA_BUNDLE` / `CURL_CA_BUNDLE` / `GIT_SSL_CAINFO` env-var injection. Those vars are deliberately NOT set — under scoped MITM they would replace system trust with our single-CA pem and break every non-allowlisted call. - Honest privacy claim: ac7 only decrypts traffic to LLM-provider hosts on a maintained allowlist. Codex `CODEX_CA_CERTIFICATE` stays (Rust/reqwest still needs it for the MITM'd OpenAI hosts). - Docs: `tracing.mdx` reframed (new "Host allowlist" section, updated overview + security posture); `concepts/activity-and- traces.mdx` limitations corrected. Hybrid "agent is working" signal - `BusySignal` gains per-source counters keyed by a `BusySource` union (`'llm_inflight' | 'tool_inflight'`). `start(source?)` defaults to `llm_inflight` for backwards-compat with existing MITM call sites. Public surface stays a single `busy: boolean` so the UI never has to merge state. New `getSourceCounts()` surfaces which feeder is wedged when things go wrong. - Codex: new `agents/codex/busy-sniff.ts` attaches to the existing JSON-RPC client. Bumps `tool_inflight` on `item/started` for `commandExecution` / `fileChange` / `mcpToolCall`, drains on matching `item/completed`, and sweeps via `turn/completed` plus teardown `drain()`. Zero subprocess overhead — we already proxy the JSON-RPC stream, just inspecting notifications inline. - Claude Code: new `runtime/trace/hook-server.ts` binds a loopback HTTP endpoint at TraceHost startup; new `prepareClaudeSettings` writes `.claude/settings.json` with `type: "http"` hook entries for `PreToolUse` / `PostToolUse` / `PostToolUseFailure` pointing at it. Same backup-then-restore discipline as `prepareMcpConfig`, with `x_ac7_busy_feeder: true` marker so stale entries from a prior crash get auto-purged. - Net result: the indicator now lights up during model-in-flight AND during tool execution windows (bash, file edits, MCP tools) that the LLM-call bump alone wouldn't cover. Stuck-handle safety net - Reported failure: TUI interrupt (Ctrl+C in claude-code) could leave a handle stuck in `pendingHandles` because the keep-alive socket survives and `onSessionEnd` never fires. With the count stuck > 0, the reporter heartbeats `busy:true` to the broker every 10s indefinitely. 
- Fix 1: each `start()` returns a handle with an auto-finish timer. Defaults per source — 5 min `llm_inflight`, 15 min `tool_inflight`. Caller can pass `maxAgeMs` to override or `Infinity` to opt out. Timer is `unref()`'d so it can't keep the process alive. Auto-finish logs a diagnostic with source + age for follow-up investigation. - Fix 2: new `BusySignal.forceFinishAll()` drains every live handle and emits one busy→idle transition. `TraceHost.close()` calls it after proxy + hook server shutdown, with a pre-drain `getSourceCounts()` snapshot in the diagnostic log so leaks tell us which source they came from. Silent on clean teardowns. - SIGKILL is still bounded by the existing server-side 30s TTL — nothing the runner can do post `kill -9`. Tests - 13 new tests for known-hosts dispatch + proxy gating (MITM on accepted hosts produces plaintext; rejected hosts produce ciphertext-only chunks with `mitm: false` session metadata). - 9 new tests for the hook server (PreToolUse / Post lifecycle, duplicate / unknown ids, malformed bodies, drain-on-close). - 7 new tests for the codex sniff driving real JSON-RPC notifications through real `attachCodexBusySniff` (tool types bump, non-tool types don't, sweep on turn/completed, explicit drain). - 6 new tests for `prepareClaudeSettings` (create, merge, stale- entry purge, corrupt-JSON refusal, idempotent restore, `.claude/` preservation when other files live there). - 16 new tests for `BusySignal` upgrades (per-source counters, max-age timers, default-source fallback, forceFinishAll diagnostics, TraceHost teardown integration). - Test count: 220 → 258 (+38). Signed-off-by: Andrew Jon Przybilla --- docs/concepts/activity-and-traces.mdx | 10 +- docs/tracing.mdx | 99 +++++-- packages/cli/src/commands/claude-code.ts | 37 +++ packages/cli/src/commands/codex.ts | 4 + .../cli/src/runtime/agents/claude-code.ts | 249 +++++++++++++++++ .../cli/src/runtime/agents/codex/adapter.ts | 22 ++ .../src/runtime/agents/codex/busy-sniff.ts | 111 ++++++++ packages/cli/src/runtime/trace/anthropic.ts | 12 + packages/cli/src/runtime/trace/busy.ts | 250 +++++++++++++++--- packages/cli/src/runtime/trace/hook-server.ts | 192 ++++++++++++++ packages/cli/src/runtime/trace/host.ts | 91 ++++++- packages/cli/src/runtime/trace/known-hosts.ts | 59 +++++ packages/cli/src/runtime/trace/proxy.ts | 42 ++- packages/cli/test/runtime/busy.test.ts | 225 +++++++++++++++- .../test/runtime/claude-code-adapter.test.ts | 158 ++++++++++- .../cli/test/runtime/codex/busy-sniff.test.ts | 215 +++++++++++++++ .../test/runtime/trace-hook-server.test.ts | 164 ++++++++++++ packages/cli/test/runtime/trace-host.test.ts | 61 +++++ .../test/runtime/trace-known-hosts.test.ts | 66 +++++ packages/cli/test/runtime/trace-proxy.test.ts | 217 +++++++++++++++ 20 files changed, 2216 insertions(+), 68 deletions(-) create mode 100644 packages/cli/src/runtime/agents/codex/busy-sniff.ts create mode 100644 packages/cli/src/runtime/trace/hook-server.ts create mode 100644 packages/cli/src/runtime/trace/known-hosts.ts create mode 100644 packages/cli/test/runtime/codex/busy-sniff.test.ts create mode 100644 packages/cli/test/runtime/trace-hook-server.test.ts create mode 100644 packages/cli/test/runtime/trace-known-hosts.test.ts diff --git a/docs/concepts/activity-and-traces.mdx b/docs/concepts/activity-and-traces.mdx index 5e50588..56ba37e 100644 --- a/docs/concepts/activity-and-traces.mdx +++ b/docs/concepts/activity-and-traces.mdx @@ -198,9 +198,15 @@ WAL, online prune doesn't block live writes for long. 
no `llm_exchange` entries — the proxy doesn't speak HPACK yet. In practice the Anthropic SDK defaults to HTTP/1.1 for `/v1/messages`, so this is rarely hit. -- **Anthropic parser only.** OpenAI / Gemini / Mistral land as +- **Anthropic parser only.** OpenAI / Azure OpenAI traffic IS + captured (those hosts are on the MITM allowlist) but lands as `opaque_http`. Codex traces today fall in this bucket — adding - typed parsers is a follow-up. + typed parsers for OpenAI's Chat Completions / Responses APIs + is a follow-up. Gemini / Mistral / Bedrock are not currently + on the allowlist — their traffic passes through unmodified + and produces no activity events. See + [tracing.mdx → Host allowlist](/docs/tracing#host-allowlist) + for how to extend. - **Uploader queue cap.** The uploader caps in-flight at 1000 events / 1 MB and evicts oldest-first under sustained broker unreachability. Events dropped here won't appear later. diff --git a/docs/tracing.mdx b/docs/tracing.mdx index 88cfffb..78a0bdc 100644 --- a/docs/tracing.mdx +++ b/docs/tracing.mdx @@ -15,13 +15,30 @@ without embedding observability hooks into the agent itself. The runner runs **upstream** of the agent process and intercepts its network traffic at the TLS layer via a **loopback MITM TLS -proxy** with a per-session local CA. Every HTTPS request the -agent makes is transparently decrypted by the proxy, observed as -plaintext, re-encrypted toward the real upstream, and passed -through. From the upstream's point of view we are a normal TLS -client doing standard SNI + cert validation — it can't tell us -apart from any other user-agent, which means OAuth flows, token -refreshes, streaming responses, and SSE all work identically. +proxy** with a per-session local CA. HTTPS requests to **known +LLM-provider hosts** (Anthropic, OpenAI, Azure OpenAI — see the +allowlist below) are transparently decrypted by the proxy, +observed as plaintext, re-encrypted toward the real upstream, +and passed through. From the upstream's point of view we are a +normal TLS client doing standard SNI + cert validation — it can't +tell us apart from any other user-agent, which means OAuth flows, +token refreshes, streaming responses, and SSE all work +identically. + +Traffic to any **other** host (GitHub, npm, package registries, +agent-spawned `curl`/`git`/`python` calls, telemetry endpoints, +etc.) is **not** decrypted. The proxy still routes those +connections — it has to, because `HTTPS_PROXY` is set on the +agent child — but it passes the TCP bytes through as a raw +tunnel. The agent's TLS client negotiates directly with the real +upstream cert and the agent's system trust store is what +validates it; the runner never sees plaintext. + +This is the honest privacy claim: **ac7 only decrypts traffic to +LLM provider hosts on a maintained allowlist.** Adding a host +requires editing +[`known-hosts.ts`](https://github.com/anthropics/agentc7) (and +ideally a parser for whatever shape lives behind it). Zero external tools. No tshark. No pcap. No SSLKEYLOGFILE shenanigans. Just Node's built-in `crypto` + `tls` + a small @@ -124,6 +141,39 @@ Node-only and would confuse reqwest. For codex specifics see [runners/codex](/docs/runners/codex). +## The host allowlist + +The proxy is host-agnostic: every CONNECT lands in the same code +path. The decision of "decrypt this session" vs "pass it through +unmodified" is gated by a single predicate, `isKnownLlmHost`, +exported from `runtime/trace/known-hosts.ts`. 
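For a feel for the predicate's contract, here is a minimal sketch (the hostnames are illustrative; the predicate sees the bare CONNECT hostname, no scheme or port):

```ts
import { isKnownLlmHost } from './known-hosts.js'; // illustrative path; the module lives at runtime/trace/known-hosts.ts

console.log(isKnownLlmHost('api.anthropic.com'));      // true  → session is MITM-decrypted
console.log(isKnownLlmHost('myorg.openai.azure.com')); // true  → Azure OpenAI subdomain pattern
console.log(isKnownLlmHost('github.com'));             // false → raw TCP tunnel, ciphertext only
```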
Production wires +that predicate into the proxy at startup; tests can substitute +any predicate they want to drive both code paths. + +Current allowlist patterns (regex, case-insensitive, +anchored against the apex): + +``` +(?:^|\.)anthropic\.com$ # api.anthropic.com, console.anthropic.com, auth.anthropic.com +(?:^|\.)openai\.com$ # api.openai.com, auth.openai.com +(?:^|\.)openai\.azure\.com$ # .openai.azure.com +``` + +The bar for adding a host: the agent (or its bundled tools) +makes inference-related calls to it — model invocations, token +refresh against the same provider, provider-specific telemetry +that the trace pipeline knows how to parse. Hosts we just +"happen to see" (a package registry the agent installs from, a +GitHub API call from an MCP tool) do not belong here. + +Non-allowlisted CONNECTs produce **no activity events** today. +At the proxy layer the runner still observes session metadata +(host, port, byte counts, duration) via the `onSessionEnd` +callback, but this data isn't currently surfaced into the +activity stream — promoting it to a new `network_session` event +kind is a candidate follow-up if presence visibility for non-LLM +hosts becomes valuable. + ## How the MITM works When the agent issues `CONNECT api.anthropic.com:443` through the @@ -250,23 +300,32 @@ the work. ac7 mitigates this with defense in depth: only to `127.0.0.1` on a random ephemeral port. The CA is generated fresh per runner process; its cert is written with `0o600`; its private key never touches disk. -2. **Redaction at parse time.** Secrets are replaced with +2. **Scoped to an LLM-host allowlist.** TLS termination only + fires for hosts on the `known-hosts.ts` allowlist. Everything + else passes through as a raw TCP tunnel — the agent's TLS + client validates the real upstream cert against the agent's + system trust store and the runner never observes plaintext. + This is also why setting `SSL_CERT_FILE` / `REQUESTS_CA_BUNDLE` + / `CURL_CA_BUNDLE` / `GIT_SSL_CAINFO` is deliberately *not* + done: they would replace the system trust store with our + single-CA pem and break every non-allowlisted HTTPS call. +3. **Redaction at parse time.** Secrets are replaced with `[REDACTED]` before entries leave the runner. The server never sees the plaintext token. -3. **Permission-gated view.** Only members with `activity.read` +4. **Permission-gated view.** Only members with `activity.read` (or the captured member themselves) can read the activity stream. Watchers, originators, and assignees of OTHER members' objectives all get 403 on the GET endpoint. -4. **CA cert deleted on runner exit.** The cert PEM is unlinked +5. **CA cert deleted on runner exit.** The cert PEM is unlinked on every exit path (normal, SIGINT, SIGTERM, uncaughtException). -5. **`.mcp.json` restored on every exit** (claude-code only) — +6. **`.mcp.json` restored on every exit** (claude-code only) — the original is backed up and restored idempotently. -6. **Ephemeral CODEX_HOME removed on exit** (codex only) — the +7. **Ephemeral CODEX_HOME removed on exit** (codex only) — the entire temp directory is `rm -rf`'d, including the symlink to the user's `~/.codex/auth.json` (the symlink is removed; the real file isn't). -7. **Upload is best-effort.** If the upload fails, the runner +8. **Upload is best-effort.** If the upload fails, the runner logs and moves on. It does NOT retry past the queue cap, and it does NOT persist the trace to disk. @@ -344,11 +403,15 @@ audit requirements. ALPN) produce no `llm_exchange` events. 
Adding an HPACK-aware parser is a follow-up. In practice the Anthropic SDK defaults to HTTP/1.1 for `/v1/messages`, so this is rarely hit. -- **Anthropic parser only.** Other LLM providers (OpenAI, - Gemini, Mistral) land as `opaque_http`. **Codex traces today - fall in this bucket** — adding a typed OpenAI parser so codex - traces render the same way claude-code traces do is a - follow-up. +- **Anthropic parser only.** OpenAI / Azure OpenAI traffic IS + decrypted (those hosts are on the allowlist) but lands as + `opaque_http` because there's no structured parser for the + Chat Completions / Responses API shapes yet. **Codex traces + today fall in this bucket** — adding a typed OpenAI parser so + codex traces render the same way claude-code traces do is a + follow-up. Other providers (Gemini, Mistral, Bedrock) are + passed through unmodified (not on the allowlist); adding them + is a code change to `known-hosts.ts` plus, ideally, a parser. - **Uploader queue cap.** The uploader caps in-flight at 1000 events / 1 MB and evicts oldest-first under sustained broker unreachability. Events dropped here won't appear in the UI. diff --git a/packages/cli/src/commands/claude-code.ts b/packages/cli/src/commands/claude-code.ts index 1812e21..98dae34 100644 --- a/packages/cli/src/commands/claude-code.ts +++ b/packages/cli/src/commands/claude-code.ts @@ -39,8 +39,10 @@ import { resolve } from 'node:path'; import { DEFAULT_PORT, ENV } from '@agentc7/sdk/protocol'; import { ClaudeCodeAdapterError, + type ClaudeSettingsHandle, findClaudeBinary, type McpConfigHandle, + prepareClaudeSettings, prepareMcpConfig, } from '../runtime/agents/claude-code.js'; import { HUD_HEIGHT, startHud } from '../runtime/hud.js'; @@ -226,6 +228,7 @@ export async function runClaudeCodeCommand(input: ClaudeCodeCommandInput): Promi // here tears down the runner before propagating so we don't leave // an orphaned IPC socket. let mcpHandle: McpConfigHandle; + let settingsHandle: ClaudeSettingsHandle | null = null; // Auto-detect the bridge command from the currently-running cli // process. `process.execPath` is the node binary; `process.argv[1]` // is the absolute path to the cli's entry script (dist/index.js in @@ -277,6 +280,26 @@ export async function runClaudeCodeCommand(input: ClaudeCodeCommandInput): Promi } log('claude-code: .mcp.json prepared', { path: mcpHandle.path }); + // 3b. If tracing is enabled, write a `.claude/settings.json` hook + // config so PreToolUse / PostToolUse events drive the busy + // signal. Skipped under `--no-trace` since there's no hook + // endpoint to point at. Failures here are non-fatal — they + // only degrade the busy-signal accuracy during tool runs, not + // correctness of the agent itself. + if (runner.traceHost) { + try { + settingsHandle = prepareClaudeSettings({ + cwd, + hookUrl: runner.traceHost.hookEndpointUrl, + }); + log('claude-code: .claude/settings.json prepared', { path: settingsHandle.path }); + } catch (err) { + log('claude-code: .claude/settings.json prepare failed (busy hooks disabled)', { + error: err instanceof Error ? err.message : String(err), + }); + } + } + // 4. Spawn claude. In interactive sessions we route through a // node-pty relay so we can (a) reserve the bottom `HUD_HEIGHT` // rows for the ac7 status strip and (b) own the stream for @@ -302,6 +325,15 @@ export async function runClaudeCodeCommand(input: ClaudeCodeCommandInput): Promi error: err instanceof Error ? 
err.message : String(err), }); } + if (settingsHandle) { + try { + settingsHandle.restore(); + } catch (err) { + log('claude-code: settings.json restore threw', { + error: err instanceof Error ? err.message : String(err), + }); + } + } await runner.shutdown(reason).catch((err) => { log('claude-code: runner shutdown threw', { error: err instanceof Error ? err.message : String(err), @@ -435,6 +467,11 @@ export async function runClaudeCodeCommand(input: ClaudeCodeCommandInput): Promi } catch { /* ignore */ } + try { + settingsHandle?.restore(); + } catch { + /* ignore */ + } }; process.on('uncaughtException', onUncaught); process.on('unhandledRejection', onUncaught); diff --git a/packages/cli/src/commands/codex.ts b/packages/cli/src/commands/codex.ts index 51d2039..f14ac31 100644 --- a/packages/cli/src/commands/codex.ts +++ b/packages/cli/src/commands/codex.ts @@ -184,6 +184,10 @@ export async function runCodexCommand(input: CodexCommandInput): Promise cwd, model: input.model, presence, + // Share the trace host's busy signal so codex tool-lifecycle + // notifications and MITM-derived LLM bumps both feed one + // observable. Null when --no-trace. + busy: runner.traceHost?.busy, log, }); } catch (err) { diff --git a/packages/cli/src/runtime/agents/claude-code.ts b/packages/cli/src/runtime/agents/claude-code.ts index 74f1361..a7f627c 100644 --- a/packages/cli/src/runtime/agents/claude-code.ts +++ b/packages/cli/src/runtime/agents/claude-code.ts @@ -41,6 +41,7 @@ import { existsSync, constants as FS, fsyncSync, + mkdirSync, mkdtempSync, openSync, readFileSync, @@ -299,6 +300,254 @@ export function prepareMcpConfig(options: PrepareMcpConfigOptions): McpConfigHan return { path: mcpConfigPath, restore }; } +/** + * Shape of `.claude/settings.json`. Only `hooks` is modeled; everything + * else is preserved verbatim during merge/restore. + * + * Claude Code accepts `hooks` as a map from event name → array of hook + * matchers. For the busy-signal feeder we use `type: "http"` so each + * event becomes a localhost POST rather than a process fork. + */ +interface ClaudeSettingsConfig { + hooks?: Record; + [k: string]: unknown; +} + +interface ClaudeHookMatcher { + matcher?: string; + hooks: ClaudeHookEntry[]; +} + +interface ClaudeHookEntry { + type: 'command' | 'http'; + command?: string; + url?: string; + [k: string]: unknown; +} + +export interface PrepareClaudeSettingsOptions { + /** Directory containing `.claude/settings.json`. Usually the project cwd. */ + cwd: string; + /** + * Full URL the Claude Code harness should POST to for each hook + * event. Comes from `TraceHost.hookEndpointUrl`. The same URL handles + * PreToolUse / PostToolUse / PostToolUseFailure — the runner routes by + * `hook_event_name` in the payload. + */ + hookUrl: string; +} + +export interface ClaudeSettingsHandle { + readonly path: string; + /** + * Restore `.claude/settings.json` to its pre-run state. If the file + * didn't exist before we touched it, delete it (and remove the + * `.claude/` dir if we created it). Idempotent. + */ + restore(): void; +} + +/** + * Marker key we add under each ac7-managed hook entry so a later + * restore (or stale state from a previous crash) can identify our + * entries unambiguously even if the individual contributor later edits + * the file. + */ +const AC7_HOOK_MARKER = 'x_ac7_busy_feeder'; + +/** + * Merge our HTTP hook entries into `.claude/settings.json`, backing up + * the existing file first. Returns a handle whose `.restore()` undoes + * the modification. 
+ * + * The hook config writes one entry per relevant lifecycle event + * (PreToolUse, PostToolUse, PostToolUseFailure) pointing at the + * loopback URL. Each entry is tagged with `x_ac7_busy_feeder: true` + * so we don't accidentally drop unrelated hooks the user has + * configured. + * + * Failure modes that leave the existing file UNTOUCHED: + * - file exists but is not valid JSON + * - backup write fails + * - staging temp file write fails (before rename) + */ +export function prepareClaudeSettings(options: PrepareClaudeSettingsOptions): ClaudeSettingsHandle { + const claudeDir = resolve(options.cwd, '.claude'); + const settingsPath = join(claudeDir, 'settings.json'); + const dirExistedBefore = existsSync(claudeDir); + const existedBefore = existsSync(settingsPath); + + // Parse before we touch anything. Invalid JSON → throw with a clear + // message rather than overwriting the user's file. + let originalBytes: string | null = null; + let existingConfig: ClaudeSettingsConfig = {}; + if (existedBefore) { + originalBytes = readFileSync(settingsPath, 'utf8'); + try { + const parsed = JSON.parse(originalBytes); + if (parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed)) { + existingConfig = parsed as ClaudeSettingsConfig; + } else { + throw new Error('top-level value is not an object'); + } + } catch (err) { + throw new ClaudeCodeAdapterError( + `refusing to modify ${settingsPath}: existing file is not a valid JSON object ` + + `(${err instanceof Error ? err.message : String(err)}). ` + + `Fix or delete the file, then re-run.`, + ); + } + } + + // Backup BEFORE writing the target — same invariant as prepareMcpConfig. + const backupDir = mkdtempSync(join(tmpdir(), 'ac7-runner-')); + const backupPath = join(backupDir, 'claude-settings.json.bak'); + let backupWritten = false; + if (existedBefore && originalBytes !== null) { + try { + atomicWrite(backupPath, originalBytes); + backupWritten = true; + } catch (err) { + try { + rmdirSync(backupDir); + } catch { + /* ignore */ + } + throw new ClaudeCodeAdapterError( + `failed to write backup of ${settingsPath}: ${ + err instanceof Error ? err.message : String(err) + }`, + ); + } + } + + // Build the merged config. Preserve all existing keys, including any + // hooks the user already configured for events we don't touch. + const existingHooks: Record = + existingConfig.hooks && typeof existingConfig.hooks === 'object' + ? { ...existingConfig.hooks } + : {}; + + const ac7Entry: ClaudeHookEntry = { + type: 'http', + url: options.hookUrl, + [AC7_HOOK_MARKER]: true, + }; + + for (const event of ['PreToolUse', 'PostToolUse', 'PostToolUseFailure']) { + const existing = Array.isArray(existingHooks[event]) ? existingHooks[event] : []; + // Drop any prior ac7 entries (e.g. from a previous crash that + // didn't restore cleanly) so we don't accumulate duplicates. + const cleaned = existing.map((matcher) => ({ + ...matcher, + hooks: (matcher.hooks ?? []).filter( + (h) => !(typeof h === 'object' && h !== null && AC7_HOOK_MARKER in h), + ), + })); + // Match-all matcher carrying just our hook entry. + cleaned.push({ matcher: '*', hooks: [ac7Entry] }); + existingHooks[event] = cleaned; + } + + const mergedConfig: ClaudeSettingsConfig = { + ...existingConfig, + hooks: existingHooks, + }; + + // Ensure the .claude directory exists before writing. atomicWrite + // does a same-directory rename, so the dir has to be in place first. 
+ if (!dirExistedBefore) { + try { + mkdirSync(claudeDir, { recursive: true }); + } catch (err) { + if (backupWritten) { + try { + unlinkSync(backupPath); + } catch { + /* ignore */ + } + } + try { + rmdirSync(backupDir); + } catch { + /* ignore */ + } + throw new ClaudeCodeAdapterError( + `failed to create ${claudeDir}: ${err instanceof Error ? err.message : String(err)}`, + ); + } + } + + try { + atomicWrite(settingsPath, `${JSON.stringify(mergedConfig, null, 2)}\n`); + } catch (err) { + if (backupWritten) { + try { + unlinkSync(backupPath); + } catch { + /* ignore */ + } + } + try { + rmdirSync(backupDir); + } catch { + /* ignore */ + } + throw new ClaudeCodeAdapterError( + `failed to write ${settingsPath}: ${err instanceof Error ? err.message : String(err)}`, + ); + } + + let restored = false; + const restore = (): void => { + if (restored) return; + restored = true; + try { + if (existedBefore && originalBytes !== null) { + atomicWrite(settingsPath, originalBytes); + } else { + try { + unlinkSync(settingsPath); + } catch (err) { + const code = (err as NodeJS.ErrnoException).code; + if (code !== 'ENOENT') throw err; + } + // If we created the .claude/ dir for our own settings file + // and it's still empty, clean it up. If the individual + // contributor added unrelated files we leave it alone. + if (!dirExistedBefore) { + try { + rmdirSync(claudeDir); + } catch { + // Directory not empty or some other error — leave it. + } + } + } + } catch (err) { + process.stderr.write( + `ac7: warning: failed to restore ${settingsPath} from backup ${backupPath}: ${ + err instanceof Error ? err.message : String(err) + }\n` + ` The backup file is still at ${backupPath} — you can copy it back manually.\n`, + ); + return; + } + if (backupWritten) { + try { + unlinkSync(backupPath); + } catch { + /* ignore */ + } + } + try { + rmdirSync(backupDir); + } catch { + /* ignore */ + } + }; + + return { path: settingsPath, restore }; +} + /** * Atomically write `body` to `path`. Same pattern as the server's * slot-config writer: open a temp file in the same directory with diff --git a/packages/cli/src/runtime/agents/codex/adapter.ts b/packages/cli/src/runtime/agents/codex/adapter.ts index b133cf8..4d41989 100644 --- a/packages/cli/src/runtime/agents/codex/adapter.ts +++ b/packages/cli/src/runtime/agents/codex/adapter.ts @@ -31,7 +31,9 @@ import { existsSync } from 'node:fs'; import type { BriefingResponse } from '@agentc7/sdk/types'; import { CLI_VERSION } from '../../../version.js'; import type { Presence } from '../../presence.js'; +import type { BusySignal } from '../../trace/busy.js'; import type { TraceHost } from '../../trace/host.js'; +import { attachCodexBusySniff, type CodexBusySniff } from './busy-sniff.js'; import type { CodexChannelSink } from './channel-sink.js'; import { createCodexChannelSink } from './channel-sink.js'; import { setupCodexHome } from './codex-home.js'; @@ -108,6 +110,15 @@ export interface CodexSpawnOptions { model?: string; /** Presence signal — flipped by status notifications. */ presence: Presence; + /** + * Busy signal — bumped on each tool-execution `item/started` and + * decremented on the matching `item/completed`. Optional: when + * absent (e.g. `--no-trace`), tool-lifecycle events still get + * logged but don't drive the indicator. Pass the same signal the + * trace host uses so LLM-call and tool-execution bumps share one + * 0↔busy transition contract. + */ + busy?: BusySignal; /** Logger, structured JSON to stderr by default. 
*/ log: (msg: string, ctx?: Record) => void; } @@ -311,6 +322,13 @@ export async function spawnCodex(opts: CodexSpawnOptions): Promise { opts.log('codex: error notification', params as Record); }); @@ -344,6 +362,10 @@ export async function spawnCodex(opts: CodexSpawnOptions): Promise = new Set([ + 'commandExecution', + 'fileChange', + 'mcpToolCall', +]); + +export interface CodexBusySniffOptions { + rpc: JsonRpcClient; + busy: BusySignal; + log?: (msg: string, ctx?: Record) => void; +} + +export interface CodexBusySniff { + /** + * Drain any handles still in flight. Safe to call multiple times — + * second call is a no-op since the underlying map is empty after + * the first. + */ + drain(): void; + /** + * Outstanding tool handle count. Useful for assertions. + */ + readonly inFlight: number; +} + +export function attachCodexBusySniff(options: CodexBusySniffOptions): CodexBusySniff { + const { rpc, busy } = options; + const log = options.log ?? (() => {}); + + // Per-item busy handles, keyed by item.id. Codex's `item/completed` + // notifications are normally reliable, but a turn interrupt or + // transport error can drop them; the turnCompleted handler below + // sweeps any leftovers so busy can't stay wedged. + const toolHandles = new Map void }>(); + + const drainAll = (reason: string): void => { + if (toolHandles.size === 0) return; + log('codex-busy-sniff: draining tool handles', { count: toolHandles.size, reason }); + for (const handle of toolHandles.values()) handle.finish(); + toolHandles.clear(); + }; + + rpc.onNotification(NOTIFICATIONS.itemStarted, (params) => { + const p = params as ItemStartedNotification; + if (!p?.item?.type || !p.item.id) return; + if (!TOOL_ITEM_TYPES.has(p.item.type)) return; + // Duplicate item/started for the same id is a no-op — preserve + // the first handle so the matching item/completed still drains + // exactly one. + if (!toolHandles.has(p.item.id)) { + toolHandles.set(p.item.id, busy.start('tool_inflight')); + } + }); + + rpc.onNotification(NOTIFICATIONS.itemCompleted, (params) => { + const p = params as ItemCompletedNotification; + if (!p?.item?.id) return; + const handle = toolHandles.get(p.item.id); + if (handle) { + handle.finish(); + toolHandles.delete(p.item.id); + } + }); + + rpc.onNotification(NOTIFICATIONS.turnCompleted, () => { + drainAll('turn-completed'); + }); + + return { + drain(): void { + drainAll('explicit-drain'); + }, + get inFlight() { + return toolHandles.size; + }, + }; +} diff --git a/packages/cli/src/runtime/trace/anthropic.ts b/packages/cli/src/runtime/trace/anthropic.ts index 55cb766..4c9e23a 100644 --- a/packages/cli/src/runtime/trace/anthropic.ts +++ b/packages/cli/src/runtime/trace/anthropic.ts @@ -25,6 +25,7 @@ * a crashed upload. */ +import { isKnownLlmHost } from './known-hosts.js'; import { redactHeaders, redactJson } from './redact.js'; export interface HttpRequestRecord { @@ -113,6 +114,17 @@ export interface OpaqueHttpEntry { const ANTHROPIC_HOST_RE = /(?:^|\.)anthropic\.com$/i; const MESSAGES_PATH_RE = /\/v1\/messages(?:\?|$)/; + +// Sanity check: the structured decoder targets Anthropic's apex, which +// must appear on the trace MITM allowlist. If a future edit drops +// anthropic.com from `known-hosts.ts`, the decoder would silently stop +// firing — catch that at module load instead. 
+if (!isKnownLlmHost('api.anthropic.com')) { + throw new Error( + 'trace/anthropic: anthropic.com is missing from KNOWN_LLM_HOST_PATTERNS — ' + + 'the structured /v1/messages decoder can only run on MITM-decrypted traffic.', + ); +} const OPAQUE_BODY_PREVIEW_BYTES = 4096; /** diff --git a/packages/cli/src/runtime/trace/busy.ts b/packages/cli/src/runtime/trace/busy.ts index 31c8328..8f562c4 100644 --- a/packages/cli/src/runtime/trace/busy.ts +++ b/packages/cli/src/runtime/trace/busy.ts @@ -1,39 +1,159 @@ /** * "Agent is working" signal for the runner. * - * Tracks an integer count of in-flight upstream HTTP requests captured - * by the MITM proxy. The runner reports `busy = count > 0` to the - * broker, which surfaces it on `/roster` so the web UI can render a - * spinner next to the agent's name. + * Tracks in-flight work the agent is doing across multiple independent + * sources. The runner reports `busy = anyCount > 0` to the broker, + * which surfaces it on `/roster` so the web UI can render a spinner + * next to the agent's name. * - * Why a count rather than a bool: many concurrent calls are normal - * (parallel tool fan-out), and using a count means we don't accidentally - * flip "not busy" mid-burst when one of N calls finishes. Subscribers - * still see only the boolean transitions, so the UI never thrashes. + * The signal is multi-sourced because no single observation point sees + * everything an agent does: * - * Listeners are notified on every transition between 0 and >0 (and - * vice versa). Subsequent increments while already busy don't fire — - * the public observable is the boolean, not the count. + * - `llm_inflight` — bumped by the MITM trace pipeline whenever a + * request to an allowlisted LLM-provider host is in flight. Lights + * up while the model is generating between tool calls. This is the + * historical bumper and the one that still fires today. + * - `tool_inflight` — bumped by per-runner integrations watching + * tool lifecycle events (claude-code hooks, codex app-server + * `item/started`/`item/completed` notifications). Lights up during + * bash, file-edit, MCP-tool, and other tool execution windows that + * the LLM bump alone wouldn't cover. + * + * Why per-source counters: any single feeder can stall (a misbehaving + * hook that never decrements, a JSON-RPC stream that drops a + * notification). With separate counters, a stuck source can't poison + * the others — and `getSourceCounts()` lets diagnostics tell us which + * one is wedged. The public observable stays a single boolean so the + * UI never has to merge state. + * + * Why a count rather than a bool per source: many concurrent in-flight + * units are normal (parallel tool fan-out, streaming LLM requests). + * Using a count means we don't accidentally flip "not busy" mid-burst + * when one of N completes. Subscribers still see only the boolean + * transitions, so the UI never thrashes. + * + * Listeners are notified on every transition between "any source has + * work" and "no source has work". Subsequent increments while already + * busy don't fire. + * + * Defense in depth — handles that never see their `finish()` call: + * + * - Each handle has an auto-finish timer (see `DEFAULT_MAX_AGE_MS`). + * If `finish()` hasn't run by then we force it. A keep-alive socket + * surviving a TUI interrupt, a dropped tool-lifecycle notification, + * or a parser failing to complete an exchange would all otherwise + * wedge the indicator. 
We'd rather flicker idle after a long
+ *   legitimate operation than show busy forever.
+ * - `forceFinishAll()` drains every live handle and is called from
+ *   `TraceHost.close()` after sub-systems shut down — a teardown-time
+ *   safety net for any handle a sub-system's own cleanup missed.
+ */
+export type BusySource = 'llm_inflight' | 'tool_inflight';
+
+const ALL_SOURCES: readonly BusySource[] = ['llm_inflight', 'tool_inflight'];
+
+/**
+ * Per-source upper bound on how long a single handle is allowed to
+ * stay open before the safety net force-finishes it.
+ *
+ * - `llm_inflight` — 5 minutes. Extended-thinking turns and long
+ *   batch generations can take ~1-2 minutes legitimately; 5 minutes
+ *   leaves plenty of headroom while still bounding the stuck case.
+ * - `tool_inflight` — 15 minutes. Tool calls can include long bash
+ *   commands (npm install, docker build, large checkouts). Beyond
+ *   15 minutes we'd rather risk flickering than show stuck-busy
+ *   forever.
+ *
+ * Callers with a legitimate need for a different cap (or no cap) can
+ * pass `maxAgeMs` explicitly to `start()`. Use `Infinity` to disable
+ * the timer entirely; non-positive / non-finite values fall back to
+ * the source default.
+ */
+export const DEFAULT_MAX_AGE_MS: Readonly<Record<BusySource, number>> = {
+  llm_inflight: 5 * 60_000,
+  tool_inflight: 15 * 60_000,
+};
+
+export interface BusySignalOptions {
+  /**
+   * Optional logger for non-routine events: handle auto-finished by
+   * the max-age timer, handles drained via `forceFinishAll()`, etc.
+   * The signal is normally silent on the happy path.
+   */
+  log?: (msg: string, ctx?: Record<string, unknown>) => void;
+}
+
+export interface BusyStartOptions {
+  /**
+   * Hard cap on the handle's lifetime in milliseconds. If `finish()`
+   * isn't called by then we force-finish, log a warning, and drop the
+   * count. Defaults to `DEFAULT_MAX_AGE_MS[source]`. Pass `Infinity`
+   * to disable the safety net for this handle.
+   */
+  maxAgeMs?: number;
+}
+
+export interface BusyHandle {
+  finish(): void;
+}
+
 export interface BusySignal {
-  /** Current count of in-flight requests. */
+  /** Sum of in-flight counts across all sources. */
   readonly count: number;
-  /** Whether at least one request is in flight (`count > 0`). */
+  /** Whether at least one source has work in flight. */
   readonly busy: boolean;
-  /** Mark a new request as started. Returns a handle that decrements on `finish()`. */
-  start(): { finish: () => void };
+  /**
+   * Mark a new unit of work as started. Returns a handle that
+   * decrements on `finish()`. Defaults to `llm_inflight` so existing
+   * MITM bumper code that doesn't pass a source remains correct.
+   *
+   * Each handle auto-finishes after `maxAgeMs` if `finish()` hasn't
+   * been called — see the file-level comment on defense in depth.
+   */
+  start(source?: BusySource, options?: BusyStartOptions): BusyHandle;
   /**
    * Subscribe to busy-state changes. Listener fires immediately with
    * the current state, then on every transition. Returns an unsubscribe
    * function.
    */
   subscribe(listener: (busy: boolean) => void): () => void;
+  /**
+   * Diagnostics: read the live per-source counts. Useful when a
+   * subscriber suspects one source is stuck — see which counter
+   * refuses to drain.
+   */
+  getSourceCounts(): Readonly<Record<BusySource, number>>;
+  /**
+   * Force every outstanding handle to finish. Returns the number of
+   * handles that were drained (zero when no work was in flight). Emits
+   * a single busy→idle transition if at least one handle was open.
+   *
+   * The trace host calls this from its `close()` path after the proxy
+   * and hook server have shut down, as a final safety net for handles
+   * a sub-system's own cleanup missed. Tests can also use it to
+   * scrub state between cases.
+   */
+  forceFinishAll(): number;
 }

-export function createBusySignal(): BusySignal {
-  let count = 0;
+interface InternalHandle {
+  source: BusySource;
+  finish: (reason: 'normal' | 'timeout' | 'force') => void;
+}
+
+export function createBusySignal(options: BusySignalOptions = {}): BusySignal {
+  const log = options.log ?? (() => {});
+  const counts = new Map<BusySource, number>();
+  for (const source of ALL_SOURCES) counts.set(source, 0);
   const listeners = new Set<(busy: boolean) => void>();
+  const liveHandles = new Set<InternalHandle>();
+
+  const totalCount = (): number => {
+    let total = 0;
+    for (const v of counts.values()) total += v;
+    return total;
+  };

   const emit = (busy: boolean): void => {
     for (const listener of listeners) {
@@ -45,36 +165,98 @@ export function createBusySignal(): BusySignal {
     }
   };

-  const start = (): { finish: () => void } => {
-    const wasBusy = count > 0;
-    count += 1;
+  const resolveMaxAge = (source: BusySource, requested: number | undefined): number => {
+    if (requested === undefined) return DEFAULT_MAX_AGE_MS[source];
+    if (typeof requested !== 'number') return DEFAULT_MAX_AGE_MS[source];
+    if (Number.isNaN(requested)) return DEFAULT_MAX_AGE_MS[source];
+    // Positive Infinity is the documented opt-out signal — let it
+    // through so the timer setup below sees a non-finite value and
+    // skips scheduling.
+    if (requested === Number.POSITIVE_INFINITY) return requested;
+    if (requested <= 0) return DEFAULT_MAX_AGE_MS[source];
+    return requested;
+  };
+
+  const start = (
+    source: BusySource = 'llm_inflight',
+    startOpts: BusyStartOptions = {},
+  ): BusyHandle => {
+    const wasBusy = totalCount() > 0;
+    counts.set(source, (counts.get(source) ?? 0) + 1);
     if (!wasBusy) emit(true);
+    let finished = false;
+    let timer: ReturnType<typeof setTimeout> | null = null;
+    const startedAt = Date.now();
+
+    const finish = (reason: 'normal' | 'timeout' | 'force'): void => {
+      // Idempotent — a callback wired to two completion paths
+      // (e.g., onExchange + closeSession) shouldn't double-decrement.
+      if (finished) return;
+      finished = true;
+      if (timer !== null) {
+        clearTimeout(timer);
+        timer = null;
+      }
+      liveHandles.delete(handleEntry);
+      const next = Math.max(0, (counts.get(source) ?? 0) - 1);
+      counts.set(source, next);
+      if (totalCount() === 0) emit(false);
+      if (reason !== 'normal') {
+        log('busy: handle auto-finished', {
+          source,
+          reason,
+          ageMs: Date.now() - startedAt,
+        });
+      }
+    };
+
+    const handleEntry: InternalHandle = {
+      source,
+      finish,
+    };
+    liveHandles.add(handleEntry);
+
+    const maxAgeMs = resolveMaxAge(source, startOpts.maxAgeMs);
+    if (Number.isFinite(maxAgeMs) && maxAgeMs > 0) {
+      timer = setTimeout(() => finish('timeout'), maxAgeMs);
+      // Don't keep the runner process alive just to fire this watchdog —
+      // if the runner is exiting and this is the only live timer, we
+      // want the loop to drain so close() can proceed.
+      if (typeof timer === 'object' && 'unref' in timer) {
+        (timer as { unref: () => void }).unref();
+      }
+    }
+
     return {
-      finish: () => {
-        // Idempotent — a callback wired to two completion paths
-        // (e.g., onExchange + closeSession) shouldn't double-decrement.
-        if (finished) return;
-        finished = true;
-        count = Math.max(0, count - 1);
-        if (count === 0) emit(false);
-      },
+      finish: () => finish('normal'),
     };
   };

+  const forceFinishAll = (): number => {
+    if (liveHandles.size === 0) return 0;
+    const drained = liveHandles.size;
+    // Snapshot before iterating since each finish() mutates the set.
+    for (const entry of [...liveHandles]) {
+      entry.finish('force');
+    }
+    log('busy: force-finished outstanding handles', { drained });
+    return drained;
+  };
+
   return {
     get count() {
-      return count;
+      return totalCount();
     },
     get busy() {
-      return count > 0;
+      return totalCount() > 0;
     },
     start,
     subscribe(listener) {
       listeners.add(listener);
       // Late subscribers see the current state.
       try {
-        listener(count > 0);
+        listener(totalCount() > 0);
       } catch {
         /* ignore */
       }
@@ -82,5 +264,11 @@ export function createBusySignal(): BusySignal {
         listeners.delete(listener);
       };
     },
+    getSourceCounts() {
+      const out = { llm_inflight: 0, tool_inflight: 0 } as Record<BusySource, number>;
+      for (const [k, v] of counts) out[k] = v;
+      return out;
+    },
+    forceFinishAll,
   };
 }
diff --git a/packages/cli/src/runtime/trace/hook-server.ts b/packages/cli/src/runtime/trace/hook-server.ts
new file mode 100644
index 0000000..92e2edd
--- /dev/null
+++ b/packages/cli/src/runtime/trace/hook-server.ts
@@ -0,0 +1,192 @@
+/**
+ * Loopback HTTP endpoint for Claude Code hook events.
+ *
+ * Claude Code's hook system fires lifecycle callbacks at points in the
+ * agent loop the MITM proxy can't see (tool execution windows that
+ * don't generate LLM calls). We bind a small HTTP server here, write
+ * its URL into `.claude/settings.json` as a `type: "http"` hook target,
+ * and let Claude Code POST to us on PreToolUse / PostToolUse /
+ * PostToolUseFailure.
+ *
+ * Why HTTP and not `type: "command"`:
+ * - Each `type: "command"` hook forks a process per event. With ~50
+ *   tool calls per turn over a session, that's hundreds of Node
+ *   startups for what should be a counter bump.
+ * - HTTP hooks are single localhost round-trips — sub-millisecond on
+ *   loopback.
+ * - We already bind two listeners (the MITM proxy, the runner IPC
+ *   socket); a third is cheap.
+ *
+ * The server is single-purpose: bumps `busy('tool_inflight')` on
+ * PreToolUse, decrements on PostToolUse / PostToolUseFailure. It
+ * keeps a per-`tool_use_id` map of busy handles so out-of-order
+ * matching (e.g., a hook event arrives twice or out of sequence)
+ * stays correct: PreToolUse for an id we already have is a no-op;
+ * PostToolUse for an id we don't have is a no-op; double Post
+ * decrements at most once.
+ *
+ * On close, all outstanding handles are drained so a torn-down runner
+ * can't leave the indicator wedged at "busy".
+ */
+
+import { createServer, type Server } from 'node:http';
+import type { BusySignal } from './busy.js';
+
+export type ClaudeHookEventName =
+  | 'PreToolUse'
+  | 'PostToolUse'
+  | 'PostToolUseFailure'
+  | 'PostToolBatch';
+
+interface HookRequestBody {
+  hook_event_name?: string;
+  tool_use_id?: string;
+  tool_name?: string;
+}
+
+export interface HookServer {
+  /** The full URL that goes into the `type: "http"` hook config. */
+  readonly url: string;
+  /** Live count of outstanding tool handles. Useful for diagnostics. */
+  readonly inFlight: number;
+  /** Tear down: drain any remaining handles, close the listener. */
+  close(): Promise<void>;
+}
+
+export interface HookServerOptions {
+  busy: BusySignal;
+  log?: (msg: string, ctx?: Record<string, unknown>) => void;
+}
+
+export async function startHookServer(options: HookServerOptions): Promise<HookServer> {
+  const log =
+    options.log ??
+    ((msg: string, ctx: Record<string, unknown> = {}): void => {
+      const record = { ts: new Date().toISOString(), component: 'hook-server', msg, ...ctx };
+      process.stderr.write(`${JSON.stringify(record)}\n`);
+    });
+
+  // Per-tool-use-id handles. The same id appears in PreToolUse and
+  // PostToolUse, so the matching is exact when Claude Code is
+  // well-behaved. If we get an unexpected duplicate or out-of-order
+  // event we err on the side of "do nothing surprising" rather than
+  // double-bump or under-decrement.
+  const handles = new Map<string, { finish: () => void }>();
+
+  const readBody = (req: NodeJS.ReadableStream): Promise<string> =>
+    new Promise((resolve, reject) => {
+      const parts: Buffer[] = [];
+      let total = 0;
+      // 64 KB is far more than any hook payload should be; cap to
+      // defang slow-loris / oversized-body adversaries even on
+      // loopback. A real claude payload is ~1-2 KB.
+      const cap = 64 * 1024;
+      req.on('data', (chunk: Buffer) => {
+        total += chunk.length;
+        if (total > cap) {
+          req.removeAllListeners('data');
+          req.removeAllListeners('end');
+          reject(new Error('hook payload exceeded 64 KB cap'));
+          return;
+        }
+        parts.push(chunk);
+      });
+      req.on('end', () => resolve(Buffer.concat(parts).toString('utf8')));
+      req.on('error', reject);
+    });

+  const server: Server = createServer(async (req, res) => {
+    // Liveness only — anything except POST /hook/tool-event gets a 404
+    // so misconfiguration is loud.
+    if (req.method !== 'POST' || req.url !== '/hook/tool-event') {
+      res.writeHead(404, { 'Content-Type': 'text/plain' });
+      res.end('not found');
+      return;
+    }
+    let body: HookRequestBody;
+    try {
+      const raw = await readBody(req);
+      const parsed = raw.length === 0 ? {} : JSON.parse(raw);
+      if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
+        throw new Error('hook body is not a JSON object');
+      }
+      body = parsed as HookRequestBody;
+    } catch (err) {
+      log('hook-server: bad request', {
+        error: err instanceof Error ? err.message : String(err),
+      });
+      res.writeHead(400, { 'Content-Type': 'application/json' });
+      res.end(JSON.stringify({ error: 'bad request' }));
+      return;
+    }
+
+    const event = body.hook_event_name;
+    const toolUseId = body.tool_use_id;
+    if (typeof event !== 'string' || typeof toolUseId !== 'string' || toolUseId.length === 0) {
+      // Missing essentials. Don't 4xx — Claude Code might keep
+      // retrying. 2xx with no-op semantics is safer.
+      res.writeHead(200, { 'Content-Type': 'application/json' });
+      res.end(JSON.stringify({ accepted: false, reason: 'missing fields' }));
+      return;
+    }
+
+    if (event === 'PreToolUse') {
+      // Duplicate PreToolUse for the same id is a no-op — keep the
+      // first handle so the matching Post still finds something.
+      if (!handles.has(toolUseId)) {
+        handles.set(toolUseId, options.busy.start('tool_inflight'));
+      }
+    } else if (
+      event === 'PostToolUse' ||
+      event === 'PostToolUseFailure' ||
+      event === 'PostToolBatch'
+    ) {
+      const handle = handles.get(toolUseId);
+      if (handle) {
+        handle.finish();
+        handles.delete(toolUseId);
+      }
+      // Note: PostToolBatch may carry a synthetic batch id rather than
+      // a real tool_use_id; we still try to drain the matching handle
+      // in case Claude Code uses the same id space. Missing matches
+      // are silent (no-op).
+    }
+    // Any other event (SessionStart, Stop, etc.)
is accepted but + // doesn't drive busy. We ignore them politely so the user can + // share one hook config block across multiple events without + // worrying about which ones we care about. + res.writeHead(200, { 'Content-Type': 'application/json' }); + res.end(JSON.stringify({ accepted: true })); + }); + + await new Promise((resolve, reject) => { + server.once('listening', () => resolve()); + server.once('error', (err) => reject(err)); + server.listen(0, '127.0.0.1'); + }); + + const address = server.address(); + if (!address || typeof address === 'string') { + server.close(); + throw new Error('hook-server: server.address() returned non-TCP binding'); + } + const url = `http://127.0.0.1:${address.port}/hook/tool-event`; + log('hook-server: listening', { url }); + + return { + url, + get inFlight() { + return handles.size; + }, + async close(): Promise { + if (handles.size > 0) { + log('hook-server: draining handles at close', { count: handles.size }); + for (const handle of handles.values()) handle.finish(); + handles.clear(); + } + await new Promise((resolve) => { + server.close(() => resolve()); + }); + }, + }; +} diff --git a/packages/cli/src/runtime/trace/host.ts b/packages/cli/src/runtime/trace/host.ts index af69de4..f265ab6 100644 --- a/packages/cli/src/runtime/trace/host.ts +++ b/packages/cli/src/runtime/trace/host.ts @@ -5,10 +5,13 @@ * The runner constructs a `TraceHost` at startup when tracing is * enabled, bakes its `envVars()` into the agent child's environment, * and calls `noteObjective{Open,Close}()` from the objectives - * tracker when SSE objective events arrive. Every HTTPS flow the - * agent makes is decrypted transparently via the MITM proxy; - * completed HTTP/1.1 exchanges stream up to the broker via the - * activity uploader in real time, not at span close. + * tracker when SSE objective events arrive. HTTPS flows to known LLM- + * provider hosts (see `known-hosts.ts`) are MITM-decrypted and the + * resulting HTTP/1.1 exchanges stream up to the broker via the + * activity uploader in real time. Traffic to non-allowlisted hosts + * passes through the proxy as a raw TCP tunnel — the agent's TLS + * client talks to the real upstream cert end-to-end, system trust + * applies, no plaintext is observed. * * There's no TraceBuffer, no span boundary, no per-objective * copying. The agent's activity log is the source of truth; per- @@ -28,7 +31,9 @@ import type { ActivityEvent, TraceEntry } from '@agentc7/sdk/types'; import { ActivityUploader } from './activity-uploader.js'; import { extractEntries, type HttpExchange } from './anthropic.js'; import { type BusySignal, createBusySignal } from './busy.js'; +import { type HookServer, startHookServer } from './hook-server.js'; import { type Http1Exchange, Http1Reassembler } from './http1-reassembler.js'; +import { isKnownLlmHost } from './known-hosts.js'; import { type CertPool, createCertPool, createTraceCa, type TraceCa } from './mitm/ca.js'; import { type ProxyRelay, startProxyRelay } from './proxy.js'; import { looksLikeSseStream, reassembleAnthropicSse } from './sse.js'; @@ -74,6 +79,19 @@ export interface TraceHost { * can render a spinner next to the agent's name. */ readonly busy: BusySignal; + /** + * Loopback HTTP endpoint URL that Claude Code hooks should POST to. + * Used by the `claude-code` adapter to write a `type: "http"` hook + * config into `.claude/settings.json` so PreToolUse / PostToolUse + * events drive `busy('tool_inflight')`. 
+ * + * Co-located with the trace host because the lifecycles are identical + * (started together, torn down together, shares the busy signal). + * Null is impossible here — when tracing is enabled, hooks are + * available — but the field exists so callers can plumb without + * a separate option. + */ + readonly hookEndpointUrl: string; /** * Env vars to merge into the agent child's environment (see the * comment on the implementation for the full list). Returns a @@ -122,7 +140,10 @@ export async function startTraceHost(options: TraceHostOptions): Promise void }>(); const handleKey = (sessionId: number, startedAt: number): string => `${sessionId}:${startedAt}`; @@ -133,7 +154,7 @@ export async function startTraceHost(options: TraceHostOptions): Promise { - pendingHandles.set(handleKey(sessionId, startedAt), busy.start()); + pendingHandles.set(handleKey(sessionId, startedAt), busy.start('llm_inflight')); }, onExchange: (exchange) => { const handle = pendingHandles.get(handleKey(exchange.sessionId, exchange.startedAt)); @@ -149,12 +170,27 @@ export async function startTraceHost(options: TraceHostOptions): Promise reassembler.ingest(chunk), onSessionEnd: (session) => reassembler.closeSession(session.id), }); + // Loopback HTTP endpoint for Claude Code hook events. The runner + // writes its URL into `.claude/settings.json` so PreToolUse / + // PostToolUse callbacks bump the same `busy` signal the MITM uses. + // For codex this is unused — codex feeds busy via JSON-RPC + // notifications on the app-server stream. + const hookServer: HookServer = await startHookServer({ busy, log }); + log('trace-host: started', { proxyUrl: proxy.proxyUrl, + hookUrl: hookServer.url, caCertPath, name: options.name, }); @@ -167,6 +203,24 @@ export async function startTraceHost(options: TraceHostOptions): Promise { const existingNoProxy = existingEnv.NO_PROXY ?? existingEnv.no_proxy ?? ''; const noProxyHosts = ['localhost', '127.0.0.1', '::1']; @@ -218,6 +272,31 @@ export async function startTraceHost(options: TraceHostOptions): Promise { + log('trace-host: hook server close failed', { + error: err instanceof Error ? err.message : String(err), + }); + }); + // Final safety net for the busy signal. Sub-system closes above + // (reassembler flush, hook server drain, codex sniff drain at + // its own teardown) should have drained every handle they own. + // If anything slipped through — a keep-alive socket that never + // emitted onSessionEnd, a dropped item/completed notification, + // a hook event that never fired — this guarantees the indicator + // goes idle before the runner exits rather than waiting on the + // 30s server-side TTL. + // + // Snapshot per-source counts BEFORE the drain so the diagnostic + // log tells us which source leaked (the counts are all zero + // after forceFinishAll, which would be useless on its own). + const leakedCounts = busy.getSourceCounts(); + const drained = busy.forceFinishAll(); + if (drained > 0) { + log('trace-host: force-drained leaked busy handles at teardown', { + drained, + sourceCounts: leakedCounts, + }); + } try { await fs.unlink(caCertPath); } catch (err) { diff --git a/packages/cli/src/runtime/trace/known-hosts.ts b/packages/cli/src/runtime/trace/known-hosts.ts new file mode 100644 index 0000000..bc4b835 --- /dev/null +++ b/packages/cli/src/runtime/trace/known-hosts.ts @@ -0,0 +1,59 @@ +/** + * Allowlist of hosts whose HTTPS traffic the trace proxy decrypts. 
+ * + * The proxy itself is host-agnostic: it accepts `CONNECT host:port`, + * dials upstream, and pipes bytes. The decision of "decrypt this + * session" vs "pass it through unmodified" is gated by the predicate + * exported here. Hosts that match get MITM'd — the proxy terminates + * TLS on both sides, captures plaintext, and runs the request/response + * pair through the reassembler + decoders. Hosts that don't match get + * a raw TCP tunnel: the agent's TLS client talks to the real upstream + * cert end-to-end, system trust applies, no plaintext is observed. + * + * Why an allowlist instead of decrypting everything: + * - The honest privacy claim is "ac7 decrypts traffic to known LLM + * providers." That's only defensible if non-LLM hosts genuinely + * bypass our TLS termination. + * - Non-LLM HTTPS calls (git fetch, package installs, telemetry, + * arbitrary curl/wget from agents) "just work" with system trust + * and don't require us to ship CA-bundle env vars per-tool. + * - The activity feed only shows traffic we actually want to inspect. + * + * The bar for adding a host: + * The agent (or its bundled tools) makes inference-related calls to + * it — model invocations, token refresh against the same provider, + * provider-specific telemetry that the trace pipeline knows how to + * parse. Hosts we just "happen to see" because the agent shells out + * to them do NOT belong here. + * + * Patterns match the CONNECT target hostname (no port, no scheme). + * Use `(?:^|\.)domain$` to match the apex and any subdomain. + */ + +export const KNOWN_LLM_HOST_PATTERNS: readonly RegExp[] = [ + // Anthropic — `api.anthropic.com` for /v1/messages, plus auth/console + // subdomains used during token refresh. + /(?:^|\.)anthropic\.com$/i, + // OpenAI — `api.openai.com` for chat/completions, `auth.openai.com` + // for codex token refresh. The wildcard covers both without listing + // each subdomain. + /(?:^|\.)openai\.com$/i, + // Azure OpenAI — customer-specific subdomains under + // `*.openai.azure.com`. + /(?:^|\.)openai\.azure\.com$/i, +]; + +/** + * True if `host` is on the LLM allowlist and should be MITM-decrypted. + * + * Accepts the raw hostname from a CONNECT line (no port, no scheme). + * Falls back to literal compare so callers don't have to worry about + * regex specials in `host`. + */ +export function isKnownLlmHost(host: string): boolean { + if (host.length === 0) return false; + for (const pattern of KNOWN_LLM_HOST_PATTERNS) { + if (pattern.test(host)) return true; + } + return false; +} diff --git a/packages/cli/src/runtime/trace/proxy.ts b/packages/cli/src/runtime/trace/proxy.ts index a79005a..a4fc895 100644 --- a/packages/cli/src/runtime/trace/proxy.ts +++ b/packages/cli/src/runtime/trace/proxy.ts @@ -95,10 +95,23 @@ export interface ProxyRelayOptions { log?: (msg: string, ctx?: Record) => void; /** * Per-hostname leaf cert issuer. When provided, CONNECT targets - * get the MITM treatment; captured chunks are plaintext. Omit - * for pure TCP relay behavior. + * eligible per `shouldMitm` get the MITM treatment; captured + * chunks are plaintext. Omit for pure TCP relay behavior on every + * session. */ certPool?: CertPool; + /** + * Predicate consulted on each CONNECT to decide MITM vs raw tunnel. + * Only consulted when `certPool` is also set. When omitted, every + * session with a `certPool` is MITM'd — the legacy "decrypt + * everything" behavior, useful for tests and one-off debugging. 
+ * + * Production callers pass `isKnownLlmHost` here so only allowlisted + * LLM-provider traffic gets decrypted; all other HTTPS passes + * through unmodified end-to-end (agent's system trust applies, we + * never see plaintext). + */ + shouldMitm?: (host: string) => boolean; /** * Extra options merged into the upstream `tls.connect()` call * during MITM. Production uses this for nothing (the defaults @@ -136,6 +149,7 @@ export async function startProxyRelay(options: ProxyRelayOptions = {}): Promise< client, sessionId, certPool: options.certPool, + shouldMitm: options.shouldMitm, upstreamTlsOptions: options.upstreamTlsOptions, onChunk: options.onChunk, onSessionEnd: options.onSessionEnd, @@ -175,6 +189,7 @@ interface SessionContext { client: Socket; sessionId: number; certPool: CertPool | undefined; + shouldMitm: ((host: string) => boolean) | undefined; upstreamTlsOptions: ConnectionOptions | undefined; onChunk: ((chunk: ProxyChunk) => void) | undefined; onSessionEnd: ((session: ProxySession) => void) | undefined; @@ -259,14 +274,25 @@ function handleSession(ctx: SessionContext): void { upstreamPort = parsed.port; phase = 'connecting'; - // MITM whenever a CertPool is configured, regardless of port. - // In practice agents only CONNECT to HTTPS targets (443 or - // vanity ports like 8443); the TLS handshake would fail - // cleanly on any non-TLS traffic. Keeping the code path - // uniform makes testing easier and avoids silent misbehavior. - if (ctx.certPool) { + // MITM only when (a) we have a CertPool, and (b) the host is + // on the allowlist (or no allowlist is configured — legacy + // mode used by tests). Everything else gets a raw TCP tunnel: + // the agent's TLS client talks straight to the real upstream + // cert, system trust applies, we never see plaintext. We still + // produce a ProxySession with byte counts + host:port so the + // activity layer retains "agent talked to X" metadata. + const wantMitm = + ctx.certPool !== undefined && + (ctx.shouldMitm === undefined || ctx.shouldMitm(upstreamHost)); + if (wantMitm) { startMitmBridge(); } else { + log('proxy: passthrough (no MITM)', { + sessionId, + host: upstreamHost, + port: upstreamPort, + reason: ctx.certPool === undefined ? 'no-cert-pool' : 'host-not-allowlisted', + }); startRawBridge(); } return; diff --git a/packages/cli/test/runtime/busy.test.ts b/packages/cli/test/runtime/busy.test.ts index bfce033..69c0019 100644 --- a/packages/cli/test/runtime/busy.test.ts +++ b/packages/cli/test/runtime/busy.test.ts @@ -7,8 +7,8 @@ * /presence/busy traffic to thrash on parallel tool fan-outs. 
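A minimal sketch of that production wiring, as the commit message describes it. This is an illustrative fragment rather than the verbatim host.ts diff: `certPool`, `reassembler`, and the surrounding module scope are assumed to exist as they do in host.ts.

    import { startProxyRelay } from './proxy.js';
    import { isKnownLlmHost } from './known-hosts.js';

    // Allowlisted LLM hosts get MITM'd; every other CONNECT target falls
    // through to the raw TCP tunnel with system trust intact.
    const proxy = await startProxyRelay({
      certPool,                     // assumed: built earlier in host.ts
      shouldMitm: isKnownLlmHost,   // the scoped-MITM predicate
      onChunk: (chunk) => reassembler.ingest(chunk),
      onSessionEnd: (session) => reassembler.closeSession(session.id),
    });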
*/ -import { describe, expect, it, vi } from 'vitest'; -import { createBusySignal } from '../../src/runtime/trace/busy.js'; +import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; +import { createBusySignal, DEFAULT_MAX_AGE_MS } from '../../src/runtime/trace/busy.js'; describe('createBusySignal', () => { it('starts idle (count=0, busy=false)', () => { @@ -89,4 +89,225 @@ describe('createBusySignal', () => { b.start(); expect(listener).not.toHaveBeenCalled(); }); + + it('tracks per-source counts independently', () => { + const b = createBusySignal(); + const llm = b.start('llm_inflight'); + const tool1 = b.start('tool_inflight'); + const tool2 = b.start('tool_inflight'); + expect(b.getSourceCounts()).toEqual({ llm_inflight: 1, tool_inflight: 2 }); + expect(b.count).toBe(3); + expect(b.busy).toBe(true); + + llm.finish(); + expect(b.getSourceCounts()).toEqual({ llm_inflight: 0, tool_inflight: 2 }); + // Still busy — tool_inflight is non-zero. + expect(b.busy).toBe(true); + + tool1.finish(); + tool2.finish(); + expect(b.getSourceCounts()).toEqual({ llm_inflight: 0, tool_inflight: 0 }); + expect(b.busy).toBe(false); + }); + + it('emits only one 0→busy transition across mixed sources', () => { + const b = createBusySignal(); + const listener = vi.fn(); + b.subscribe(listener); + listener.mockClear(); + + const llm = b.start('llm_inflight'); + const tool = b.start('tool_inflight'); + expect(listener).toHaveBeenCalledTimes(1); + expect(listener).toHaveBeenLastCalledWith(true); + + llm.finish(); + // tool_inflight still in flight — no transition. + expect(listener).toHaveBeenCalledTimes(1); + + tool.finish(); + // Now all sources drained — transition fires. + expect(listener).toHaveBeenCalledTimes(2); + expect(listener).toHaveBeenLastCalledWith(false); + }); + + it("defaults to 'llm_inflight' when start() is called without a source", () => { + // Backwards compatibility — existing MITM call sites that don't + // pass a source must keep behaving as they did. + const b = createBusySignal(); + const h = b.start(); + expect(b.getSourceCounts()).toEqual({ llm_inflight: 1, tool_inflight: 0 }); + h.finish(); + expect(b.getSourceCounts()).toEqual({ llm_inflight: 0, tool_inflight: 0 }); + }); + + it('a wedged source does not poison drain of the other', () => { + // Regression: if one feeder forgets to decrement, the other + // should still be able to report idle on its own track. The + // overall `busy` stays true (correct — there IS in-flight work + // by the contract) but `getSourceCounts()` makes the culprit + // diagnosable instead of leaving us guessing. 
+ const b = createBusySignal(); + b.start('llm_inflight', { maxAgeMs: Infinity }); // intentionally not finished, safety disabled + const tool = b.start('tool_inflight'); + tool.finish(); + expect(b.busy).toBe(true); + expect(b.getSourceCounts()).toEqual({ llm_inflight: 1, tool_inflight: 0 }); + }); +}); + +describe('createBusySignal — handle max-age safety net', () => { + beforeEach(() => { + vi.useFakeTimers(); + }); + afterEach(() => { + vi.useRealTimers(); + }); + + it('auto-finishes a handle that exceeds its max-age and emits the idle transition', () => { + const log = vi.fn(); + const b = createBusySignal({ log }); + const listener = vi.fn(); + b.subscribe(listener); + listener.mockClear(); + log.mockClear(); + + b.start('llm_inflight', { maxAgeMs: 1000 }); + expect(b.busy).toBe(true); + expect(listener).toHaveBeenCalledWith(true); + + vi.advanceTimersByTime(999); + expect(b.busy).toBe(true); // not yet + + vi.advanceTimersByTime(2); + expect(b.busy).toBe(false); + expect(listener).toHaveBeenLastCalledWith(false); + // Log carries the diagnostic context. + expect(log).toHaveBeenCalledWith( + 'busy: handle auto-finished', + expect.objectContaining({ source: 'llm_inflight', reason: 'timeout' }), + ); + }); + + it('respects a custom maxAgeMs even when the default is much larger', () => { + const b = createBusySignal(); + b.start('tool_inflight', { maxAgeMs: 50 }); + expect(b.busy).toBe(true); + vi.advanceTimersByTime(60); + expect(b.busy).toBe(false); + }); + + it('Infinity maxAgeMs disables the safety net entirely', () => { + const b = createBusySignal(); + b.start('llm_inflight', { maxAgeMs: Number.POSITIVE_INFINITY }); + expect(b.busy).toBe(true); + // Crank well past any default — handle should still be live. + vi.advanceTimersByTime(DEFAULT_MAX_AGE_MS.llm_inflight * 10); + expect(b.busy).toBe(true); + }); + + it('finish() before the deadline cancels the timer (no double-finish)', () => { + const log = vi.fn(); + const b = createBusySignal({ log }); + const h = b.start('llm_inflight', { maxAgeMs: 1000 }); + expect(b.getSourceCounts().llm_inflight).toBe(1); + + h.finish(); + expect(b.getSourceCounts().llm_inflight).toBe(0); + + // Advance well past the original deadline — the timer should + // have been cleared, so no auto-finish fires (which would + // otherwise underflow the count or fire a spurious idle + // transition). + vi.advanceTimersByTime(5000); + expect(b.getSourceCounts().llm_inflight).toBe(0); + expect(log).not.toHaveBeenCalledWith('busy: handle auto-finished', expect.anything()); + }); + + it('applies the per-source default when maxAgeMs is omitted', () => { + const b = createBusySignal(); + b.start('tool_inflight'); + // 14:59 — not yet. + vi.advanceTimersByTime(DEFAULT_MAX_AGE_MS.tool_inflight - 1000); + expect(b.busy).toBe(true); + // Past 15min. + vi.advanceTimersByTime(2000); + expect(b.busy).toBe(false); + }); + + it('falls back to the default when maxAgeMs is non-positive or NaN', () => { + const b = createBusySignal(); + b.start('llm_inflight', { maxAgeMs: 0 }); + b.start('llm_inflight', { maxAgeMs: -100 }); + b.start('llm_inflight', { maxAgeMs: Number.NaN }); + expect(b.getSourceCounts().llm_inflight).toBe(3); + + // None of those should auto-finish before the source default. + vi.advanceTimersByTime(DEFAULT_MAX_AGE_MS.llm_inflight - 1); + expect(b.getSourceCounts().llm_inflight).toBe(3); + // Past the default. 
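Taken together, the busy tests in this file pin a public surface roughly like the following. This is a reconstruction from the assertions, not a copy of busy.ts; anything not exercised by a test is an assumption.

    type BusySource = 'llm_inflight' | 'tool_inflight';

    interface BusyHandle {
      finish(): void;              // idempotent; also cancels the handle's max-age timer
    }

    interface BusySignalSketch {
      readonly busy: boolean;      // true while any source has in-flight work
      readonly count: number;      // total live handles across sources
      start(source?: BusySource, opts?: { maxAgeMs?: number }): BusyHandle;
      getSourceCounts(): Record<BusySource, number>;
      forceFinishAll(): number;    // drains every live handle, returns how many
      subscribe(listener: (busy: boolean) => void): void;
    }

    // Per-source auto-finish defaults (5 min for llm_inflight, 15 min for
    // tool_inflight), overridable per start(); Infinity disables the timer.
    // createBusySignal also accepts an optional { log } used for the
    // auto-finish and force-drain diagnostics asserted in these tests.
    declare const DEFAULT_MAX_AGE_MS: Record<BusySource, number>;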
+ vi.advanceTimersByTime(2); + expect(b.getSourceCounts().llm_inflight).toBe(0); + }); +}); + +describe('createBusySignal — forceFinishAll', () => { + it('returns 0 and emits nothing when no work is in flight', () => { + const b = createBusySignal(); + const listener = vi.fn(); + b.subscribe(listener); + listener.mockClear(); + const drained = b.forceFinishAll(); + expect(drained).toBe(0); + expect(listener).not.toHaveBeenCalled(); + }); + + it('drains every live handle across both sources with a single idle transition', () => { + const b = createBusySignal(); + const listener = vi.fn(); + b.subscribe(listener); + listener.mockClear(); + + b.start('llm_inflight'); + b.start('llm_inflight'); + b.start('tool_inflight'); + expect(b.getSourceCounts()).toEqual({ llm_inflight: 2, tool_inflight: 1 }); + expect(listener).toHaveBeenCalledTimes(1); // 0→busy + + const drained = b.forceFinishAll(); + expect(drained).toBe(3); + expect(b.getSourceCounts()).toEqual({ llm_inflight: 0, tool_inflight: 0 }); + expect(b.busy).toBe(false); + // One additional listener fire — the busy→idle transition. No + // intermediate per-handle transitions because the counters stayed + // > 0 until the last one drained. + expect(listener).toHaveBeenCalledTimes(2); + expect(listener).toHaveBeenLastCalledWith(false); + }); + + it('subsequent finish() on a force-finished handle is a no-op', () => { + const b = createBusySignal(); + const h = b.start('llm_inflight'); + expect(b.getSourceCounts().llm_inflight).toBe(1); + + b.forceFinishAll(); + expect(b.getSourceCounts().llm_inflight).toBe(0); + + // The caller's handle.finish() shouldn't underflow. + h.finish(); + expect(b.getSourceCounts().llm_inflight).toBe(0); + expect(b.busy).toBe(false); + }); + + it('logs the drain count for diagnostics', () => { + const log = vi.fn(); + const b = createBusySignal({ log }); + b.start('llm_inflight'); + b.start('tool_inflight'); + b.forceFinishAll(); + expect(log).toHaveBeenCalledWith( + 'busy: force-finished outstanding handles', + expect.objectContaining({ drained: 2 }), + ); + }); }); diff --git a/packages/cli/test/runtime/claude-code-adapter.test.ts b/packages/cli/test/runtime/claude-code-adapter.test.ts index 0b1bf98..e49b26d 100644 --- a/packages/cli/test/runtime/claude-code-adapter.test.ts +++ b/packages/cli/test/runtime/claude-code-adapter.test.ts @@ -21,7 +21,11 @@ import { existsSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'no import { tmpdir } from 'node:os'; import { join } from 'node:path'; import { afterEach, beforeEach, describe, expect, it } from 'vitest'; -import { ClaudeCodeAdapterError, prepareMcpConfig } from '../../src/runtime/agents/claude-code.js'; +import { + ClaudeCodeAdapterError, + prepareClaudeSettings, + prepareMcpConfig, +} from '../../src/runtime/agents/claude-code.js'; describe('prepareMcpConfig', () => { let cwd: string; @@ -182,3 +186,155 @@ describe('prepareMcpConfig', () => { handle.restore(); }); }); + +describe('prepareClaudeSettings', () => { + let cwd: string; + const hookUrl = 'http://127.0.0.1:55555/hook/tool-event'; + + beforeEach(() => { + cwd = mkdtempSync(join(tmpdir(), 'ac7-claude-settings-test-')); + }); + + afterEach(() => { + rmSync(cwd, { recursive: true, force: true }); + }); + + it('creates .claude/settings.json when neither dir nor file existed, restore removes both', () => { + const dirPath = join(cwd, '.claude'); + const settingsPath = join(dirPath, 'settings.json'); + expect(existsSync(dirPath)).toBe(false); + + const handle = prepareClaudeSettings({ cwd, hookUrl 
}); + + expect(handle.path).toBe(settingsPath); + expect(existsSync(settingsPath)).toBe(true); + const written = JSON.parse(readFileSync(settingsPath, 'utf8')); + for (const event of ['PreToolUse', 'PostToolUse', 'PostToolUseFailure']) { + const matchers = written.hooks[event]; + expect(Array.isArray(matchers)).toBe(true); + const ac7 = matchers + .flatMap((m: { hooks: unknown[] }) => m.hooks) + .find( + (h: Record) => + h.type === 'http' && h.url === hookUrl && h.x_ac7_busy_feeder === true, + ); + expect(ac7).toBeTruthy(); + } + + handle.restore(); + expect(existsSync(settingsPath)).toBe(false); + expect(existsSync(dirPath)).toBe(false); + }); + + it('merges into existing settings.json while preserving other keys + other hooks', () => { + const dirPath = join(cwd, '.claude'); + const settingsPath = join(dirPath, 'settings.json'); + // Pre-existing user config: a Stop hook AND an unrelated top-level key. + require('node:fs').mkdirSync(dirPath, { recursive: true }); + writeFileSync( + settingsPath, + JSON.stringify({ + permissions: { allow: ['Bash'] }, + hooks: { + Stop: [{ matcher: '*', hooks: [{ type: 'command', command: 'notify-send done' }] }], + }, + }), + ); + + const handle = prepareClaudeSettings({ cwd, hookUrl }); + const merged = JSON.parse(readFileSync(settingsPath, 'utf8')); + + // Unrelated top-level key preserved. + expect(merged.permissions).toEqual({ allow: ['Bash'] }); + // User's Stop hook preserved verbatim. + expect(merged.hooks.Stop).toEqual([ + { matcher: '*', hooks: [{ type: 'command', command: 'notify-send done' }] }, + ]); + // Our PreToolUse hook injected. + expect(merged.hooks.PreToolUse).toBeTruthy(); + const preToolEntries = merged.hooks.PreToolUse.flatMap((m: { hooks: unknown[] }) => m.hooks); + expect(preToolEntries).toContainEqual({ + type: 'http', + url: hookUrl, + x_ac7_busy_feeder: true, + }); + + handle.restore(); + // Restore writes the original bytes back — Stop hook still there, + // our PreToolUse gone. + const restored = JSON.parse(readFileSync(settingsPath, 'utf8')); + expect(restored.hooks.PreToolUse).toBeUndefined(); + expect(restored.hooks.Stop).toEqual([ + { matcher: '*', hooks: [{ type: 'command', command: 'notify-send done' }] }, + ]); + expect(restored.permissions).toEqual({ allow: ['Bash'] }); + }); + + it('drops a stale ac7 hook entry from a previous crash before injecting fresh ones', () => { + const dirPath = join(cwd, '.claude'); + const settingsPath = join(dirPath, 'settings.json'); + require('node:fs').mkdirSync(dirPath, { recursive: true }); + // Simulate a previous run that crashed mid-restore, leaving a + // stale ac7 entry behind. Our prepare should NOT duplicate it. + writeFileSync( + settingsPath, + JSON.stringify({ + hooks: { + PreToolUse: [ + { + matcher: '*', + hooks: [ + { type: 'http', url: 'http://stale.local/hook', x_ac7_busy_feeder: true }, + { type: 'command', command: 'audit-log' }, + ], + }, + ], + }, + }), + ); + + const handle = prepareClaudeSettings({ cwd, hookUrl }); + const merged = JSON.parse(readFileSync(settingsPath, 'utf8')); + const entries = merged.hooks.PreToolUse.flatMap( + (m: { hooks: Record[] }) => m.hooks, + ); + // Stale ac7 entry gone. + expect( + entries.filter((e: Record) => e.url === 'http://stale.local/hook'), + ).toHaveLength(0); + // User's unrelated audit-log hook preserved. + expect(entries.find((e: Record) => e.command === 'audit-log')).toBeTruthy(); + // Fresh ac7 entry pointing at the current hook URL. 
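For orientation, the assertions in this describe block imply a written settings payload roughly like this. It is a reconstruction: the `matcher` value and any fields beyond the ones asserted are assumptions.

    // Approximate shape of the hook entries prepareClaudeSettings injects.
    const exampleSettings = {
      hooks: {
        PreToolUse: [
          {
            matcher: '*', // assumed; the tests only pin the hook entry fields
            hooks: [
              {
                type: 'http',
                url: 'http://127.0.0.1:55555/hook/tool-event',
                x_ac7_busy_feeder: true, // marker that lets stale entries be purged
              },
            ],
          },
        ],
        // PostToolUse and PostToolUseFailure carry the same entry shape.
      },
    };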
+ expect(entries.find((e: Record) => e.url === hookUrl)).toBeTruthy(); + handle.restore(); + }); + + it('refuses to modify when existing settings.json is not valid JSON', () => { + const dirPath = join(cwd, '.claude'); + const settingsPath = join(dirPath, 'settings.json'); + require('node:fs').mkdirSync(dirPath, { recursive: true }); + writeFileSync(settingsPath, 'not-json'); + expect(() => prepareClaudeSettings({ cwd, hookUrl })).toThrow(ClaudeCodeAdapterError); + // Original file untouched. + expect(readFileSync(settingsPath, 'utf8')).toBe('not-json'); + }); + + it('restore() is idempotent — second call is a no-op', () => { + const handle = prepareClaudeSettings({ cwd, hookUrl }); + handle.restore(); + expect(() => handle.restore()).not.toThrow(); + }); + + it('preserves the .claude/ dir on restore if other files live there', () => { + const dirPath = join(cwd, '.claude'); + require('node:fs').mkdirSync(dirPath, { recursive: true }); + // User had something else in .claude/ but no settings.json yet. + writeFileSync(join(dirPath, 'agents.md'), '# my agents'); + const handle = prepareClaudeSettings({ cwd, hookUrl }); + handle.restore(); + // Settings file gone, but the .claude/ dir + agents.md stay because + // the dir existed before our prepare touched it. + expect(existsSync(join(dirPath, 'settings.json'))).toBe(false); + expect(existsSync(join(dirPath, 'agents.md'))).toBe(true); + }); +}); diff --git a/packages/cli/test/runtime/codex/busy-sniff.test.ts b/packages/cli/test/runtime/codex/busy-sniff.test.ts new file mode 100644 index 0000000..7705ac4 --- /dev/null +++ b/packages/cli/test/runtime/codex/busy-sniff.test.ts @@ -0,0 +1,215 @@ +/** + * Tests for the codex tool-busy sniff layer. + * + * Drives the real `attachCodexBusySniff` helper against the real + * JSON-RPC client over a pair of in-memory streams — same wiring the + * adapter uses in production, no subprocess required. + * + * Pins the contract: + * - `item/started` with a TOOL_ITEM_TYPES type bumps tool_inflight + * - `item/started` with a non-tool type (agentMessage, reasoning, + * userMessage) is a no-op + * - matching `item/completed` decrements + * - missing `item/completed` is swept by `turn/completed` + * - explicit `drain()` finishes anything left over + */ + +import { PassThrough } from 'node:stream'; +import { afterEach, describe, expect, it } from 'vitest'; +import { attachCodexBusySniff } from '../../../src/runtime/agents/codex/busy-sniff.js'; +import { createJsonRpcClient } from '../../../src/runtime/agents/codex/json-rpc.js'; +import { createBusySignal } from '../../../src/runtime/trace/busy.js'; + +function pair(): { + client: ReturnType; + send: (notification: unknown) => void; + cleanup: () => void; +} { + const serverOut = new PassThrough(); + const serverIn = new PassThrough(); + const client = createJsonRpcClient(serverOut, serverIn); + // The notification stream is server→client; we WRITE to serverOut + // and the client reads it. Each message is one JSON line. + const send = (notification: unknown): void => { + serverOut.write(`${JSON.stringify(notification)}\n`); + }; + return { + client, + send, + cleanup: () => { + client.close('test-end'); + serverOut.destroy(); + serverIn.destroy(); + }, + }; +} + +// Vitest waits for events to drain between writes; in practice the +// notifications post within microtask order, but a small delay makes +// the test resilient to scheduler quirks. 
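A minimal sketch of the notification handling that contract implies, reconstructed from the tests rather than copied from busy-sniff.ts; how the JSON-RPC client surfaces notifications is abstracted behind `onNotification` here.

    const TOOL_ITEM_TYPES = new Set(['commandExecution', 'fileChange', 'mcpToolCall']);

    // Live tool handles keyed by item id: duplicate item/started stays a no-op,
    // and turn/completed can sweep anything that never saw item/completed.
    const live = new Map<string, { finish(): void }>();

    function onNotification(
      method: string,
      params: { item?: { id?: string; type?: string } } | undefined,
      busy: { start(source: 'tool_inflight'): { finish(): void } },
    ): void {
      const item = params?.item;
      if (method === 'item/started' && item?.id && item.type && TOOL_ITEM_TYPES.has(item.type)) {
        if (!live.has(item.id)) live.set(item.id, busy.start('tool_inflight'));
      } else if (method === 'item/completed' && item?.id) {
        live.get(item.id)?.finish();
        live.delete(item.id);
      } else if (method === 'turn/completed') {
        for (const handle of live.values()) handle.finish();
        live.clear();
      }
    }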
+const tick = (): Promise => new Promise((r) => setImmediate(r)); + +describe('attachCodexBusySniff', () => { + const teardowns: Array<() => void> = []; + + afterEach(() => { + while (teardowns.length > 0) teardowns.pop()?.(); + }); + + it('bumps tool_inflight on commandExecution start and drains on completion', async () => { + const { client, send, cleanup } = pair(); + teardowns.push(cleanup); + const busy = createBusySignal(); + attachCodexBusySniff({ rpc: client, busy }); + + send({ + method: 'item/started', + params: { + threadId: 't1', + turnId: 'turn-1', + item: { type: 'commandExecution', id: 'item-1', command: 'ls' }, + }, + }); + await tick(); + expect(busy.busy).toBe(true); + expect(busy.getSourceCounts().tool_inflight).toBe(1); + + send({ + method: 'item/completed', + params: { + threadId: 't1', + turnId: 'turn-1', + item: { type: 'commandExecution', id: 'item-1' }, + }, + }); + await tick(); + expect(busy.busy).toBe(false); + expect(busy.getSourceCounts().tool_inflight).toBe(0); + }); + + it('bumps for fileChange and mcpToolCall types too', async () => { + const { client, send, cleanup } = pair(); + teardowns.push(cleanup); + const busy = createBusySignal(); + attachCodexBusySniff({ rpc: client, busy }); + + for (const [type, id] of [ + ['fileChange', 'fc-1'], + ['mcpToolCall', 'mcp-1'], + ]) { + send({ + method: 'item/started', + params: { threadId: 't1', turnId: 'turn-1', item: { type, id } }, + }); + } + await tick(); + expect(busy.getSourceCounts().tool_inflight).toBe(2); + }); + + it('does NOT bump for non-tool item types (agentMessage, reasoning, userMessage)', async () => { + const { client, send, cleanup } = pair(); + teardowns.push(cleanup); + const busy = createBusySignal(); + attachCodexBusySniff({ rpc: client, busy }); + + for (const type of ['agentMessage', 'reasoning', 'userMessage', 'turnPlan']) { + send({ + method: 'item/started', + params: { threadId: 't1', turnId: 'turn-1', item: { type, id: `${type}-1` } }, + }); + } + await tick(); + expect(busy.busy).toBe(false); + expect(busy.getSourceCounts().tool_inflight).toBe(0); + }); + + it('ignores duplicate item/started for the same id', async () => { + const { client, send, cleanup } = pair(); + teardowns.push(cleanup); + const busy = createBusySignal(); + attachCodexBusySniff({ rpc: client, busy }); + + for (let i = 0; i < 3; i++) { + send({ + method: 'item/started', + params: { + threadId: 't1', + turnId: 'turn-1', + item: { type: 'commandExecution', id: 'dup' }, + }, + }); + } + await tick(); + expect(busy.getSourceCounts().tool_inflight).toBe(1); + + send({ + method: 'item/completed', + params: { threadId: 't1', turnId: 'turn-1', item: { type: 'commandExecution', id: 'dup' } }, + }); + await tick(); + expect(busy.getSourceCounts().tool_inflight).toBe(0); + }); + + it('turn/completed sweeps tool handles that never got matching item/completed', async () => { + const { client, send, cleanup } = pair(); + teardowns.push(cleanup); + const busy = createBusySignal(); + attachCodexBusySniff({ rpc: client, busy }); + + // Two tool starts, no completions. 
+ send({ + method: 'item/started', + params: { threadId: 't1', turnId: 'turn-1', item: { type: 'commandExecution', id: 'a' } }, + }); + send({ + method: 'item/started', + params: { threadId: 't1', turnId: 'turn-1', item: { type: 'commandExecution', id: 'b' } }, + }); + await tick(); + expect(busy.getSourceCounts().tool_inflight).toBe(2); + + send({ + method: 'turn/completed', + params: { threadId: 't1', turn: { id: 'turn-1', status: 'completed' } }, + }); + await tick(); + expect(busy.busy).toBe(false); + expect(busy.getSourceCounts().tool_inflight).toBe(0); + }); + + it('explicit drain() finishes anything still in flight', async () => { + const { client, send, cleanup } = pair(); + teardowns.push(cleanup); + const busy = createBusySignal(); + const sniff = attachCodexBusySniff({ rpc: client, busy }); + + send({ + method: 'item/started', + params: { + threadId: 't1', + turnId: 'turn-1', + item: { type: 'commandExecution', id: 'orphan' }, + }, + }); + await tick(); + expect(sniff.inFlight).toBe(1); + + sniff.drain(); + expect(sniff.inFlight).toBe(0); + expect(busy.busy).toBe(false); + }); + + it('items lacking an id are skipped (cannot key the handle)', async () => { + const { client, send, cleanup } = pair(); + teardowns.push(cleanup); + const busy = createBusySignal(); + attachCodexBusySniff({ rpc: client, busy }); + + send({ + method: 'item/started', + params: { threadId: 't1', turnId: 'turn-1', item: { type: 'commandExecution' } }, + }); + await tick(); + expect(busy.busy).toBe(false); + }); +}); diff --git a/packages/cli/test/runtime/trace-hook-server.test.ts b/packages/cli/test/runtime/trace-hook-server.test.ts new file mode 100644 index 0000000..d3e9735 --- /dev/null +++ b/packages/cli/test/runtime/trace-hook-server.test.ts @@ -0,0 +1,164 @@ +/** + * Hook server tests. + * + * Pins the busy-signal contract the Claude Code hook endpoint + * implements: + * + * - PreToolUse bumps tool_inflight by tool_use_id + * - PostToolUse / PostToolUseFailure decrement the matching handle + * - Mismatched events (Post without a prior Pre, duplicate Pre, etc.) + * do not corrupt the count + * - Bad bodies are rejected with 4xx + * - close() drains every outstanding handle so a torn-down runner + * can't leave the indicator wedged + * + * The HTTP server binds on 127.0.0.1:0 (random ephemeral port) so the + * tests are hermetic and don't collide with anything else listening. 
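A sketch of the event-to-busy mapping that contract describes, reconstructed from the tests rather than copied from hook-server.ts; HTTP plumbing, routing, and any response fields beyond `accepted` are elided.

    // Tool handles keyed by tool_use_id: duplicate PreToolUse is a no-op and a
    // PostToolUse for an unknown id cannot underflow the counter.
    const pending = new Map<string, { finish(): void }>();

    function handleHookEvent(
      body: { hook_event_name?: string; tool_use_id?: string },
      busy: { start(source: 'tool_inflight'): { finish(): void } },
    ): { accepted: boolean } {
      const { hook_event_name: event, tool_use_id: id } = body;
      if (event === 'PreToolUse' && id) {
        if (!pending.has(id)) pending.set(id, busy.start('tool_inflight'));
        return { accepted: true };
      }
      if ((event === 'PostToolUse' || event === 'PostToolUseFailure') && id) {
        pending.get(id)?.finish();
        pending.delete(id);
        return { accepted: true };
      }
      // Missing fields and non-tool events are acknowledged with 200 so Claude
      // Code never retries; whether `accepted` is true for non-tool events is
      // not pinned by the tests.
      return { accepted: false };
    }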
+ */ + +import { afterEach, describe, expect, it } from 'vitest'; +import { createBusySignal } from '../../src/runtime/trace/busy.js'; +import { type HookServer, startHookServer } from '../../src/runtime/trace/hook-server.js'; + +async function postJson(url: string, body: unknown): Promise<{ status: number; text: string }> { + const res = await fetch(url, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify(body), + }); + return { status: res.status, text: await res.text() }; +} + +describe('hook server', () => { + let server: HookServer | null = null; + + afterEach(async () => { + if (server) { + await server.close().catch(() => {}); + server = null; + } + }); + + it('PreToolUse bumps tool_inflight; PostToolUse drains it', async () => { + const busy = createBusySignal(); + server = await startHookServer({ busy, log: () => {} }); + + expect(busy.getSourceCounts().tool_inflight).toBe(0); + + const r1 = await postJson(server.url, { + hook_event_name: 'PreToolUse', + tool_use_id: 'tool-1', + tool_name: 'Bash', + }); + expect(r1.status).toBe(200); + expect(busy.busy).toBe(true); + expect(busy.getSourceCounts().tool_inflight).toBe(1); + + const r2 = await postJson(server.url, { + hook_event_name: 'PostToolUse', + tool_use_id: 'tool-1', + tool_name: 'Bash', + }); + expect(r2.status).toBe(200); + expect(busy.busy).toBe(false); + expect(busy.getSourceCounts().tool_inflight).toBe(0); + }); + + it('counts overlapping tool calls correctly', async () => { + const busy = createBusySignal(); + server = await startHookServer({ busy, log: () => {} }); + + await postJson(server.url, { hook_event_name: 'PreToolUse', tool_use_id: 'a' }); + await postJson(server.url, { hook_event_name: 'PreToolUse', tool_use_id: 'b' }); + await postJson(server.url, { hook_event_name: 'PreToolUse', tool_use_id: 'c' }); + expect(busy.getSourceCounts().tool_inflight).toBe(3); + + await postJson(server.url, { hook_event_name: 'PostToolUse', tool_use_id: 'b' }); + expect(busy.getSourceCounts().tool_inflight).toBe(2); + expect(busy.busy).toBe(true); + + await postJson(server.url, { hook_event_name: 'PostToolUse', tool_use_id: 'a' }); + await postJson(server.url, { hook_event_name: 'PostToolUseFailure', tool_use_id: 'c' }); + expect(busy.getSourceCounts().tool_inflight).toBe(0); + expect(busy.busy).toBe(false); + }); + + it('duplicate PreToolUse for the same id is a no-op', async () => { + const busy = createBusySignal(); + server = await startHookServer({ busy, log: () => {} }); + + await postJson(server.url, { hook_event_name: 'PreToolUse', tool_use_id: 'dup' }); + await postJson(server.url, { hook_event_name: 'PreToolUse', tool_use_id: 'dup' }); + expect(busy.getSourceCounts().tool_inflight).toBe(1); + + await postJson(server.url, { hook_event_name: 'PostToolUse', tool_use_id: 'dup' }); + expect(busy.getSourceCounts().tool_inflight).toBe(0); + }); + + it('PostToolUse for an unknown id is silently ignored (no underflow)', async () => { + const busy = createBusySignal(); + server = await startHookServer({ busy, log: () => {} }); + + const res = await postJson(server.url, { + hook_event_name: 'PostToolUse', + tool_use_id: 'never-saw-this', + }); + expect(res.status).toBe(200); + expect(busy.getSourceCounts().tool_inflight).toBe(0); + }); + + it('non-tool events (SessionStart, Stop, etc.) 
are accepted without bumping', async () => { + const busy = createBusySignal(); + server = await startHookServer({ busy, log: () => {} }); + + await postJson(server.url, { hook_event_name: 'SessionStart' }); + await postJson(server.url, { hook_event_name: 'Stop' }); + await postJson(server.url, { hook_event_name: 'UserPromptSubmit' }); + expect(busy.busy).toBe(false); + expect(busy.getSourceCounts().tool_inflight).toBe(0); + }); + + it('rejects malformed bodies with 400', async () => { + const busy = createBusySignal(); + server = await startHookServer({ busy, log: () => {} }); + + const res = await fetch(server.url, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: 'not-json', + }); + expect(res.status).toBe(400); + expect(busy.getSourceCounts().tool_inflight).toBe(0); + }); + + it('returns 200 with accepted=false when fields are missing (avoid retry storms)', async () => { + const busy = createBusySignal(); + server = await startHookServer({ busy, log: () => {} }); + + const res = await postJson(server.url, { hook_event_name: 'PreToolUse' }); + expect(res.status).toBe(200); + const body = JSON.parse(res.text); + expect(body.accepted).toBe(false); + }); + + it('non-matching routes return 404', async () => { + const busy = createBusySignal(); + server = await startHookServer({ busy, log: () => {} }); + + const res = await fetch(server.url.replace('/hook/tool-event', '/something-else')); + expect(res.status).toBe(404); + }); + + it('close() drains outstanding handles so busy unwedges', async () => { + const busy = createBusySignal(); + server = await startHookServer({ busy, log: () => {} }); + + await postJson(server.url, { hook_event_name: 'PreToolUse', tool_use_id: 'left-dangling' }); + expect(busy.busy).toBe(true); + + await server.close(); + server = null; + expect(busy.busy).toBe(false); + expect(busy.getSourceCounts().tool_inflight).toBe(0); + }); +}); diff --git a/packages/cli/test/runtime/trace-host.test.ts b/packages/cli/test/runtime/trace-host.test.ts index bf0189e..f01204c 100644 --- a/packages/cli/test/runtime/trace-host.test.ts +++ b/packages/cli/test/runtime/trace-host.test.ts @@ -160,4 +160,65 @@ describe('TraceHost', () => { await host.close(); } }); + + it('close() force-drains leaked busy handles as the final teardown safety net', async () => { + // Simulates the production failure mode: a sub-system (proxy + // reassembler, hook server, codex sniff) failed to drain a + // handle before its own close. Without this safety net, the + // runner's busy reporter would keep heartbeating `busy: true` + // until its AbortController fires, then post a final + // `busy: false`. The forceFinishAll() inside TraceHost.close() + // makes the indicator drop earlier and protects against bugs + // where the reassembler's flush path silently misses a handle. + const logged: Array<{ msg: string; ctx?: Record }> = []; + const host = await startTraceHost({ + log: (msg, ctx) => logged.push({ msg, ctx }), + caCertPath: join(tmpDir, 'leak.pem'), + brokerClient: stubBrokerClient(), + name: 'TEST', + }); + + // Leak a handle as if a reassembler bug forgot to call finish(). + // Disable the max-age timer so this test isn't racing the 5min + // default — we want close() to do the draining, not the timer. 
+ host.busy.start('llm_inflight', { maxAgeMs: Number.POSITIVE_INFINITY }); + host.busy.start('tool_inflight', { maxAgeMs: Number.POSITIVE_INFINITY }); + expect(host.busy.busy).toBe(true); + expect(host.busy.getSourceCounts()).toEqual({ llm_inflight: 1, tool_inflight: 1 }); + + await host.close(); + + expect(host.busy.busy).toBe(false); + expect(host.busy.getSourceCounts()).toEqual({ llm_inflight: 0, tool_inflight: 0 }); + // The diagnostic log fires with the pre-drain counts so + // operators can tell which source leaked. + const drainLog = logged.find( + (l) => l.msg === 'trace-host: force-drained leaked busy handles at teardown', + ); + expect(drainLog).toBeTruthy(); + expect(drainLog?.ctx?.drained).toBe(2); + expect(drainLog?.ctx?.sourceCounts).toEqual({ + llm_inflight: 1, + tool_inflight: 1, + }); + }); + + it('close() does not log the drain message when no handles leaked', async () => { + // Happy path: every sub-system drained its handles correctly, + // so forceFinishAll() returns 0 and stays quiet. We don't want + // the diagnostic log to fire on normal shutdowns since it would + // be noise. + const logged: Array<{ msg: string }> = []; + const host = await startTraceHost({ + log: (msg) => logged.push({ msg }), + caCertPath: join(tmpDir, 'clean.pem'), + brokerClient: stubBrokerClient(), + name: 'TEST', + }); + await host.close(); + const drainLog = logged.find( + (l) => l.msg === 'trace-host: force-drained leaked busy handles at teardown', + ); + expect(drainLog).toBeUndefined(); + }); }); diff --git a/packages/cli/test/runtime/trace-known-hosts.test.ts b/packages/cli/test/runtime/trace-known-hosts.test.ts new file mode 100644 index 0000000..a2d9994 --- /dev/null +++ b/packages/cli/test/runtime/trace-known-hosts.test.ts @@ -0,0 +1,66 @@ +/** + * Allowlist predicate tests — pin the hostnames the trace proxy is + * willing to MITM-decrypt. The proxy itself takes the predicate by + * injection, but the production wiring (`host.ts`) uses this exact + * function; if the predicate's behavior changes silently, agents stop + * generating structured LLM traces or, worse, the proxy starts + * decrypting hosts we promised it wouldn't. 
+ */ + +import { describe, expect, it } from 'vitest'; +import { isKnownLlmHost, KNOWN_LLM_HOST_PATTERNS } from '../../src/runtime/trace/known-hosts.js'; + +describe('isKnownLlmHost', () => { + it('matches the Anthropic apex and its subdomains', () => { + expect(isKnownLlmHost('anthropic.com')).toBe(true); + expect(isKnownLlmHost('api.anthropic.com')).toBe(true); + expect(isKnownLlmHost('console.anthropic.com')).toBe(true); + expect(isKnownLlmHost('auth.anthropic.com')).toBe(true); + }); + + it('matches the OpenAI apex and its subdomains', () => { + expect(isKnownLlmHost('openai.com')).toBe(true); + expect(isKnownLlmHost('api.openai.com')).toBe(true); + expect(isKnownLlmHost('auth.openai.com')).toBe(true); + }); + + it('matches Azure OpenAI customer subdomains', () => { + expect(isKnownLlmHost('example.openai.azure.com')).toBe(true); + expect(isKnownLlmHost('my-deployment.openai.azure.com')).toBe(true); + }); + + it('is case-insensitive', () => { + expect(isKnownLlmHost('API.ANTHROPIC.COM')).toBe(true); + expect(isKnownLlmHost('Api.Openai.Com')).toBe(true); + }); + + it('rejects hosts not on the allowlist', () => { + expect(isKnownLlmHost('github.com')).toBe(false); + expect(isKnownLlmHost('api.github.com')).toBe(false); + expect(isKnownLlmHost('example.com')).toBe(false); + expect(isKnownLlmHost('telemetry.example.com')).toBe(false); + expect(isKnownLlmHost('registry.npmjs.org')).toBe(false); + }); + + it('does NOT match suffix tricks (anthropic.com-as-prefix attacks)', () => { + // `anthropic.com.evil.com` ends with `.com`, not `anthropic.com`. + // The regex anchors the apex on the right and requires either + // line-start or a `.` separator on the left. + expect(isKnownLlmHost('anthropic.com.evil.com')).toBe(false); + expect(isKnownLlmHost('evil-anthropic.com')).toBe(false); + expect(isKnownLlmHost('xanthropic.com')).toBe(false); + expect(isKnownLlmHost('openai.com.attacker.net')).toBe(false); + }); + + it('rejects empty / malformed input safely', () => { + expect(isKnownLlmHost('')).toBe(false); + }); + + it('exports the underlying patterns so callers can introspect', () => { + expect(KNOWN_LLM_HOST_PATTERNS.length).toBeGreaterThan(0); + // Sanity: each pattern is a RegExp. + for (const p of KNOWN_LLM_HOST_PATTERNS) { + expect(p).toBeInstanceOf(RegExp); + } + }); +}); diff --git a/packages/cli/test/runtime/trace-proxy.test.ts b/packages/cli/test/runtime/trace-proxy.test.ts index 3831422..2891899 100644 --- a/packages/cli/test/runtime/trace-proxy.test.ts +++ b/packages/cli/test/runtime/trace-proxy.test.ts @@ -334,3 +334,220 @@ describe('proxy raw TCP fallback path', () => { expect(proxy.proxyUrl.startsWith('http://')).toBe(true); }); }); + +describe('proxy shouldMitm gating', () => { + const cleanups: Array<() => Promise> = []; + + afterEach(async () => { + while (cleanups.length > 0) { + const c = cleanups.pop(); + if (c) await c().catch(() => {}); + } + }); + + it('MITMs hosts the predicate accepts (mitm: true, plaintext captured)', async () => { + const upstream = await startHttpsEchoServer(); + cleanups.push(() => new Promise((r) => upstream.server.close(() => r()))); + + const chunks: ProxyChunk[] = []; + const sessions: ProxySession[] = []; + const ca = createTraceCa(); + const certPool = createCertPool(ca); + const proxy = await startProxyRelay({ + log: () => {}, + certPool, + // Predicate always accepts — equivalent to legacy behavior. 
+ shouldMitm: () => true, + upstreamTlsOptions: { rejectUnauthorized: false }, + onChunk: (c) => chunks.push({ ...c, bytes: Buffer.from(c.bytes) }), + onSessionEnd: (s) => sessions.push(s), + }); + cleanups.push(() => proxy.close()); + + // CONNECT + TLS-trust-our-CA handshake (mirrors what a real Node + // agent does under HTTPS_PROXY + NODE_EXTRA_CA_CERTS). + const tcp = tcpConnect({ host: proxy.host, port: proxy.port }); + await new Promise((r, j) => { + tcp.once('connect', () => r()); + tcp.once('error', j); + }); + tcp.write( + `CONNECT localhost:${upstream.port} HTTP/1.1\r\nHost: localhost:${upstream.port}\r\n\r\n`, + ); + await new Promise((resolve) => { + let buf = ''; + const onData = (d: Buffer): void => { + buf += d.toString('utf8'); + if (buf.includes('\r\n\r\n')) { + tcp.off('data', onData); + resolve(); + } + }; + tcp.on('data', onData); + }); + const client = tlsConnect({ + socket: tcp, + servername: 'localhost', + ca: [ca.caCertPem], + minVersion: 'TLSv1.2', + }); + await new Promise((r, j) => { + client.once('secureConnect', () => r()); + client.once('error', j); + }); + client.write(`POST /v1/messages HTTP/1.1\r\nHost: localhost\r\nContent-Length: 5\r\n\r\nhello`); + await new Promise((resolve) => { + const parts: Buffer[] = []; + client.on('data', (d: Buffer) => parts.push(d)); + client.on('end', () => resolve(Buffer.concat(parts))); + }); + await new Promise((r) => setTimeout(r, 50)); + + const clientText = Buffer.concat( + chunks.filter((c) => c.direction === 'client_to_upstream').map((c) => c.bytes), + ).toString('utf8'); + expect(clientText).toContain('POST /v1/messages'); + expect(sessions).toHaveLength(1); + expect(sessions[0]?.mitm).toBe(true); + }, 15_000); + + it('raw-tunnels hosts the predicate rejects (mitm: false, ciphertext only)', async () => { + const upstream = await startHttpsEchoServer(); + cleanups.push(() => new Promise((r) => upstream.server.close(() => r()))); + + const chunks: ProxyChunk[] = []; + const sessions: ProxySession[] = []; + const ca = createTraceCa(); + const certPool = createCertPool(ca); + const proxy = await startProxyRelay({ + log: () => {}, + certPool, + // Predicate rejects — proxy must fall back to raw TCP tunnel + // even though a CertPool is configured. The client's TLS + // handshake completes against the upstream's real cert. + shouldMitm: () => false, + onChunk: (c) => chunks.push({ ...c, bytes: Buffer.from(c.bytes) }), + onSessionEnd: (s) => sessions.push(s), + }); + cleanups.push(() => proxy.close()); + + const tcp = tcpConnect({ host: proxy.host, port: proxy.port }); + await new Promise((r, j) => { + tcp.once('connect', () => r()); + tcp.once('error', j); + }); + tcp.write( + `CONNECT localhost:${upstream.port} HTTP/1.1\r\nHost: localhost:${upstream.port}\r\n\r\n`, + ); + await new Promise((resolve) => { + let buf = ''; + const onData = (d: Buffer): void => { + buf += d.toString('utf8'); + if (buf.includes('\r\n\r\n')) { + tcp.off('data', onData); + resolve(); + } + }; + tcp.on('data', onData); + }); + // Important: trust the upstream cert directly (no MITM cert + // involved), exactly as a real agent would when system trust + // applies end-to-end. + const client = tlsConnect({ + socket: tcp, + servername: 'localhost', + ca: [upstream.cert], + // The test upstream cert is self-signed for 'localhost'; we + // still want a real handshake to validate the raw-tunnel + // works at the application layer. 
+ rejectUnauthorized: true, + minVersion: 'TLSv1.2', + }); + await new Promise((r, j) => { + client.once('secureConnect', () => r()); + client.once('error', j); + }); + client.write(`POST /v1/messages HTTP/1.1\r\nHost: localhost\r\nContent-Length: 5\r\n\r\nhello`); + await new Promise((resolve) => { + const parts: Buffer[] = []; + client.on('data', (d: Buffer) => parts.push(d)); + client.on('end', () => resolve(Buffer.concat(parts))); + }); + await new Promise((r) => setTimeout(r, 50)); + + // Captured bytes are encrypted TLS records — the plaintext HTTP + // request never appears in the chunks. This is the privacy + // promise of scoped MITM: hosts off the allowlist are not + // decrypted, even though we routed their traffic. + const everything = Buffer.concat(chunks.map((c) => c.bytes)).toString('utf8'); + expect(everything).not.toContain('POST /v1/messages'); + expect(everything).not.toContain('hello'); + // Session metadata is still produced — we know the agent talked + // to this host, just not what was said. + expect(sessions).toHaveLength(1); + expect(sessions[0]?.mitm).toBe(false); + expect(sessions[0]?.upstream.host).toBe('localhost'); + expect(sessions[0]?.upstream.port).toBe(upstream.port); + expect(sessions[0]?.bytesOut).toBeGreaterThan(0); + expect(sessions[0]?.bytesIn).toBeGreaterThan(0); + }, 15_000); + + it('omitted shouldMitm preserves legacy "MITM whenever certPool is set" behavior', async () => { + // This guards the migration path: existing callers that didn't + // pass shouldMitm should still get MITM behavior. + const upstream = await startHttpsEchoServer(); + cleanups.push(() => new Promise((r) => upstream.server.close(() => r()))); + + const sessions: ProxySession[] = []; + const ca = createTraceCa(); + const certPool = createCertPool(ca); + const proxy = await startProxyRelay({ + log: () => {}, + certPool, + // shouldMitm intentionally omitted. + upstreamTlsOptions: { rejectUnauthorized: false }, + onSessionEnd: (s) => sessions.push(s), + }); + cleanups.push(() => proxy.close()); + + const tcp = tcpConnect({ host: proxy.host, port: proxy.port }); + await new Promise((r, j) => { + tcp.once('connect', () => r()); + tcp.once('error', j); + }); + tcp.write( + `CONNECT localhost:${upstream.port} HTTP/1.1\r\nHost: localhost:${upstream.port}\r\n\r\n`, + ); + await new Promise((resolve) => { + let buf = ''; + const onData = (d: Buffer): void => { + buf += d.toString('utf8'); + if (buf.includes('\r\n\r\n')) { + tcp.off('data', onData); + resolve(); + } + }; + tcp.on('data', onData); + }); + const client = tlsConnect({ + socket: tcp, + servername: 'localhost', + ca: [ca.caCertPem], + minVersion: 'TLSv1.2', + }); + await new Promise((r, j) => { + client.once('secureConnect', () => r()); + client.once('error', j); + }); + client.write(`GET / HTTP/1.1\r\nHost: localhost\r\n\r\n`); + await new Promise((resolve) => { + const parts: Buffer[] = []; + client.on('data', (d: Buffer) => parts.push(d)); + client.on('end', () => resolve(Buffer.concat(parts))); + }); + await new Promise((r) => setTimeout(r, 50)); + + expect(sessions).toHaveLength(1); + expect(sessions[0]?.mitm).toBe(true); + }, 15_000); +});
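For reference, the session metadata asserted on in this file, together with the host.ts wiring earlier in the patch, implies a shape roughly like the following; the real ProxySession type in proxy.ts may carry more fields.

    // Only the fields the gating tests and host.ts rely on; not the full definition.
    interface ProxySessionSummary {
      id: number;                               // session identifier (host.ts uses it to close reassembler sessions)
      mitm: boolean;                            // true when the CONNECT target was allowlisted and decrypted
      upstream: { host: string; port: number }; // CONNECT target
      bytesOut: number;                         // client-to-upstream bytes (ciphertext when mitm is false)
      bytesIn: number;                          // upstream-to-client bytes
    }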