From 9ba49b9853c4bf61adf495da96a94446df616519 Mon Sep 17 00:00:00 2001 From: ojowwalker77 Date: Fri, 5 Jun 2026 17:08:55 -0300 Subject: [PATCH] feat: skills-first delivery, contract-trace directive, SPLUS.md (v0.9.2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The protocol no longer depends on agents reading MCP tool descriptions: install.sh now installs the review skills into every detected agent (Claude Code ~/.claude/skills, Codex ~/.codex/prompts, OpenCode ~/.config/opencode/command), with the canonical copy at ~/.splus/skills, refreshed on every `splus update`. Release tarballs ship skills/. Bench-proven review lessons land in the agent path: - `review` returns a deterministic CHANGED SYMBOLS block (engine exports ∩ diff hunks, via new shared changedExportedSymbols/diffText) - the directive gains a TRACE CONTRACTS stage (return shape on every path → open every caller → report each mismatch) and an anti-checklist clause banning generic hardening nits - the same disciplines are encoded in skills/review (contract-drift lens) Also: - SPLUS.md replaces splus.md as the review contract (no fallback) - bench triage pipeline: second contract-tracing discovery lens, merged + deduped; claude -p client fails closed on is_error and unparseable forced-tool JSON; signal budget 3 → 4 per file - @splus/shared tests wired into pnpm -r test (had no test script) - drop unused @splus/triage dep from @splus/mcp (bench-only by design) - bump all versions to 0.9.2 --- .github/workflows/release.yml | 1 + CHANGELOG.md | 34 ++++++ Cargo.lock | 2 +- Cargo.toml | 2 +- README.md | 12 +- splus.md => SPLUS.md | 0 bench/martian/run.mjs | 2 +- docs/TOOLS.md | 16 +-- install.sh | 69 ++++++++++- package.json | 2 +- packages/mcp/package.json | 3 +- packages/mcp/src/index.ts | 82 ++++++++----- packages/shared/package.json | 5 +- packages/shared/src/index.ts | 82 ++++++++++++- packages/shared/src/splusMd.test.ts | 14 ++- packages/shared/src/splusMd.ts | Bin 4240 -> 4240 bytes packages/suppression/package.json | 2 +- packages/triage/package.json | 2 +- packages/triage/src/claude-cli.ts | 13 ++- packages/triage/src/index.test.ts | 16 +-- packages/triage/src/index.ts | 148 ++++++++++++++++-------- pnpm-lock.yaml | 3 - skills/prefs/SKILL.md | 16 +-- skills/review/SKILL.md | 21 +++- skills/review/references/dispatch.md | 6 +- skills/review/references/investigate.md | 21 +++- skills/review/references/lenses.md | 26 ++++- skills/review/references/verify.md | 2 +- 28 files changed, 459 insertions(+), 143 deletions(-) rename splus.md => SPLUS.md (100%) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 766b1da..1b3fb99 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -68,6 +68,7 @@ jobs: mkdir -p staging cp "target/${{ matrix.target }}/release/splus-engine" staging/ cp dist-release/mcp.cjs staging/ + cp -R skills staging/skills tar -czf "splus-${{ matrix.name }}.tar.gz" -C staging . - uses: actions/upload-artifact@v7 diff --git a/CHANGELOG.md b/CHANGELOG.md index 7a151bc..f3e264b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,40 @@ this project uses [semantic versioning](https://semver.org) (pre-1.0: minor vers ## [Unreleased] +## [0.9.2] — 2026-06-05 + +### Changed +- **`SPLUS.md` replaces `splus.md`.** The per-repo review contract is now + uppercase (repo root and `~/.splus/SPLUS.md`). No fallback: rename your file. +- **Skills are installed, not just shipped.** `install.sh` now places the review + protocol into every detected agent — Claude Code (`~/.claude/skills/splus-review`, + `splus-prefs`), Codex (`~/.codex/prompts/splus-review.md`), OpenCode + (`~/.config/opencode/command/splus-review.md`) — with the canonical copy at + `~/.splus/skills`. Skills refresh on `splus update` even when MCP wiring is + skipped. The protocol no longer depends on agents reading MCP tool descriptions. +- **The review directive teaches contract tracing.** `review` output now includes + a deterministic CHANGED SYMBOLS block (engine exports ∩ diff hunks) and a + TRACE CONTRACTS stage: enumerate each changed function's return/throw shape on + every path, open every caller, report every assumption mismatch — the + most-missed real-bug class on the Martian bench. A new anti-checklist clause + bans generic hardening nits unless the diff introduces the flaw (the #1 noise + source). The same disciplines are encoded in the review skill's lenses. + +### Added +- `@splus/shared`: `diffText`, `changedLineRanges`, `changedExportedSymbols` — + deterministic change-surface extraction reused by the MCP server and the + benchmark pipeline. +- Benchmark triage pipeline: a second contract-tracing discovery lens (merged + + deduped with the sweep lens) aimed by the CHANGED SYMBOLS block. + +### Fixed +- `claude -p` CLI client now fails closed: an `is_error` result or unparseable + forced-tool JSON throws instead of being recorded as a zero-finding review. +- `@splus/shared` tests are now actually wired into `pnpm -r test` (the package + had no test script). +- Removed the unused `@splus/triage` dependency from `@splus/mcp` — the MCP + path is agent-led by design; the triage pipeline serves the benchmark only. + ## [0.9.1] — 2026-06-05 ### Fixed diff --git a/Cargo.lock b/Cargo.lock index 1fd1f6e..e4c654e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -453,7 +453,7 @@ checksum = "f8fadd59c855ef2080decdef8ff161eb6661b86933c9d82e5ba29dc602a55aba" [[package]] name = "splus-engine" -version = "0.9.1" +version = "0.9.2" dependencies = [ "anyhow", "clap", diff --git a/Cargo.toml b/Cargo.toml index 0895f0a..340780a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,7 +6,7 @@ resolver = "2" members = ["crates/splus-engine"] [workspace.package] -version = "0.9.1" +version = "0.9.2" edition = "2021" license = "MIT" repository = "https://github.com/kiwi-init/splus" diff --git a/README.md b/README.md index 56e3e6f..269ef73 100644 --- a/README.md +++ b/README.md @@ -83,10 +83,10 @@ Your agent connects to the local server and calls these: | Tool | What it does | | ------------- | --------------------------------------------------------------------------- | -| `review` | Read `splus.md`, return the deterministic floor + a directive, drive the review. | +| `review` | Read `SPLUS.md`, return the deterministic floor + a directive, drive the review. | | `inspect` | The engine **on tap**: `definition` · `callers` · `blast_radius` · `complexity` · `exports` · `imports` — investigate on demand. | | `floor` | Re-ground on the deterministic finding floor for a scope (no directive). | -| `preferences` | Show the merged `splus.md` contract (repo + `~/.splus`). | +| `preferences` | Show the merged `SPLUS.md` contract (repo + `~/.splus`). | | `recall` | Surface past confirmed findings / conventions relevant to a hunk. | | `note` | Remember a repo convention you discovered (→ `recall`). | | `dismiss` | Teach Splus a finding is noise — it generalizes to close variants. | @@ -96,15 +96,15 @@ Your agent connects to the local server and calls these: | `report` | Render the review as a standalone offline HTML report. | | `index` | Build a SCIP index locally for the precise (compiler-grade) blast-radius tier. | -Agent-led, one flow: `review` injects the repo's `splus.md` contract and returns the grounded +Agent-led, one flow: `review` injects the repo's `SPLUS.md` contract and returns the grounded deterministic floor; **you** drive the review — pull signal on demand with `inspect`, verify before posting, then `report` and teach. No API key, ever — the model already in your editor does the reasoning. Learnings stay per-repo in `.splus-cache/` (suppressions in `learnings.json`, memory in `memory.json`) — they never leave your checkout. -### `splus.md` — the repo's review contract +### `SPLUS.md` — the repo's review contract -Drop a `splus.md` at the repo root (layered over your personal `~/.splus/splus.md`). Splus reads it +Drop a `SPLUS.md` at the repo root (layered over your personal `~/.splus/SPLUS.md`). Splus reads it **first** on every review: prose preferences/nits guide the reviewer, and binding `mute: ` / `skip: ` lines drop matching findings (and say so — never silently). The `prefs` skill scaffolds one. @@ -112,7 +112,7 @@ Drop a `splus.md` at the repo root (layered over your personal `~/.splus/splus.m The `skills/` bundle drives the agent-led flow: `review` (fans out **fresh, unbiased sub-agents** per unit — finder ≠ verifier — and degrades to a sequential pass where sub-agents aren't available) and -`prefs` (author `splus.md`). +`prefs` (author `SPLUS.md`). **Full reference: [`docs/TOOLS.md`](docs/TOOLS.md)** — every tool, parameter, and return shape. diff --git a/splus.md b/SPLUS.md similarity index 100% rename from splus.md rename to SPLUS.md diff --git a/bench/martian/run.mjs b/bench/martian/run.mjs index 4ca52ef..5c644d7 100644 --- a/bench/martian/run.mjs +++ b/bench/martian/run.mjs @@ -239,7 +239,7 @@ async function main() { ); } catch (e) { // Not persisted → retried on the next run (this is how we survive rate limits). - process.stderr.write(` ⊘ ${pr.source_repo}: ${String(e.message || e).slice(0, 90)}\n`); + process.stderr.write(` ⊘ ${pr.source_repo}: ${String(e.message || e).slice(0, 500)}\n`); } finally { rmSync(dir, { recursive: true, force: true }); } diff --git a/docs/TOOLS.md b/docs/TOOLS.md index ebb8fd7..df2fa99 100644 --- a/docs/TOOLS.md +++ b/docs/TOOLS.md @@ -6,10 +6,10 @@ API key and no cloud step; the coding agent connected over stdio is the reviewer | Tool | Mutates? | What it's for | |---|---|---| -| [`review`](#review) | no | Read `splus.md`, return the floor, drive the agent's review. | +| [`review`](#review) | no | Read `SPLUS.md`, return the floor, drive the agent's review. | | [`inspect`](#inspect) | no | The engine **on tap** — one code-intelligence question on demand. | | [`floor`](#floor) | no | Re-ground on the deterministic finding floor for a scope. | -| [`preferences`](#preferences) | no | Show the merged `splus.md` contract. | +| [`preferences`](#preferences) | no | Show the merged `SPLUS.md` contract. | | [`recall`](#recall) | no | Surface confirmed findings / conventions for a hunk. | | [`note`](#note) | yes | Remember a discovered repo convention (→ `recall`). | | [`dismiss`](#dismiss) | yes | Teach Splus a finding is **noise** (suppresses close variants). | @@ -48,7 +48,7 @@ rule id, severity, confidence, a deterministic provenance **anchor**, an optiona fix, and cross-file **blast radius**. Learned suppressions are applied first. There is **one flow** and you are the driver: the response begins with the repo's -[`splus.md`](#preferences) contract (preferences injected, binding `mute:`/`skip:` +[`SPLUS.md`](#preferences) contract (preferences injected, binding `mute:`/`skip:` rules already enforced) and ends with a **discovery directive** that drives *you* (the agent) through the full protocol (triage → investigate → verify) over the changed files. No API key — Splus grounds you with precise anchors and a toolbelt @@ -112,7 +112,7 @@ return an honest empty answer. ## `floor` Return the engine's deterministic finding **floor** for a scope as JSON — the same -grounded set `review` starts from, without the directive. The repo's `splus.md` +grounded set `review` starts from, without the directive. The repo's `SPLUS.md` binding rules are applied; learned suppression is not. Use it to re-check a scope mid-investigation. @@ -126,11 +126,11 @@ mid-investigation. ## `preferences` -Return the merged [`splus.md`](#) review contract for this repo (`./splus.md` -layered over `~/.splus/splus.md`), including its binding `mute:`/`skip:` rules. +Return the merged [`SPLUS.md`](#) review contract for this repo (`./SPLUS.md` +layered over `~/.splus/SPLUS.md`), including its binding `mute:`/`skip:` rules. `review` already injects it; call this to read it directly. -`splus.md` is the repo's review contract, read **first** on every review: prose +`SPLUS.md` is the repo's review contract, read **first** on every review: prose preferences/nits guide the reviewer; `mute: ` and `skip: ` lines are **binding** (matching findings are dropped and reported, never silently). The `prefs` skill scaffolds one. @@ -159,7 +159,7 @@ compounds across sessions. Semantic (embedding) match over `.splus-cache/memory. Remember a repo convention you discovered while reviewing (e.g. "this module uses `Result`, never throws") so future reviews `recall` it. Complements `accept`. -Written to `.splus-cache/memory.json`; promotable into `splus.md` for a binding rule. +Written to `.splus-cache/memory.json`; promotable into `SPLUS.md` for a binding rule. | Param | Type | Description | |---|---|---| diff --git a/install.sh b/install.sh index 7c3e3ac..051d3ef 100755 --- a/install.sh +++ b/install.sh @@ -11,7 +11,7 @@ # SPLUS_VERSION pin a release tag (e.g. v0.3.0); default: latest # SPLUS_INSTALL_DIR install prefix; default: $HOME/.splus # SPLUS_LOCAL_DIST install from a local dir of built artifacts instead of -# downloading (expects splus-engine, mcp.cjs) — +# downloading (expects splus-engine, mcp.cjs, optionally skills/) — # used for local testing of this script # SPLUS_NO_MODIFY_PATH=1 don't touch shell rc files # SPLUS_NO_WIRE=1 don't auto-wire coding agents @@ -94,6 +94,7 @@ if [ -n "${SPLUS_LOCAL_DIST:-}" ]; then detail "using local dist: $SPLUS_LOCAL_DIST" cp "$SPLUS_LOCAL_DIST/splus-engine" "$tmp/" || die "missing splus-engine in SPLUS_LOCAL_DIST" cp "$SPLUS_LOCAL_DIST/mcp.cjs" "$tmp/" || die "missing mcp.cjs in SPLUS_LOCAL_DIST" + [ -d "$SPLUS_LOCAL_DIST/skills" ] && cp -R "$SPLUS_LOCAL_DIST/skills" "$tmp/skills" version="local" else if command -v curl >/dev/null 2>&1; then dl() { curl -fsSL "$1" -o "$2"; } @@ -167,6 +168,16 @@ esac EOF chmod 0755 "$BIN_DIR/splus" printf '%s\n' "$version" > "$INSTALL_DIR/version" + +# The review-protocol skills — the canonical copy lives in ~/.splus/skills; the +# agent wiring below copies/points each coding agent at it. The protocol is a +# first-class artifact: agents load it explicitly instead of depending on MCP +# tool descriptions being read. +if [ -d "$tmp/skills" ]; then + rm -rf "$INSTALL_DIR/skills" + cp -R "$tmp/skills" "$INSTALL_DIR/skills" +fi + if [ "$updating" -eq 1 ]; then ok "core updated" else @@ -301,11 +312,65 @@ EOF [ "$wired" = 1 ] || warn "no coding agent detected — register \`$MCP_BIN\` as an MCP server manually" fi +# --- install the review protocol as agent skills ---------------------------- +# The skills ARE the product's review protocol — installing them per agent makes +# the protocol explicit and user-invocable instead of depending on the agent +# happening to read MCP tool descriptions. Unlike MCP wiring, this also runs on +# updates: a refreshed protocol is half the point of `splus update`. +if [ -z "${SPLUS_NO_WIRE:-}" ] && [ -d "$INSTALL_DIR/skills" ]; then + detail "installing agent skills" + + # The SKILL.md body without its Claude-specific YAML frontmatter. + skill_body() { awk 'f>1 {print} /^---$/ {f++}' "$1"; } + # The one-line description from the frontmatter (for OpenCode's command header). + skill_desc() { awk '/^description: /{sub(/^description: /,""); print; exit}' "$1"; } + # Point non-Claude agents at the canonical per-stage reference files. + skill_refs() { + [ -d "$INSTALL_DIR/skills/$1/references" ] || return 0 + printf '\n> Stage protocols — read each file in %s/skills/%s/references/ as you reach that stage.\n' "$INSTALL_DIR" "$1" + } + + # Claude Code — native skills (auto-triggered by name + user-invocable). + if command -v claude >/dev/null 2>&1 || [ -d "$HOME/.claude" ]; then + mkdir -p "$HOME/.claude/skills" + for s in review prefs; do + [ -d "$INSTALL_DIR/skills/$s" ] || continue + rm -rf "$HOME/.claude/skills/splus-$s" + cp -R "$INSTALL_DIR/skills/$s" "$HOME/.claude/skills/splus-$s" + done + ok "Claude Code skills (splus-review, splus-prefs)" + fi + + # Codex — custom prompts, slash-invocable (/splus-review). + if command -v codex >/dev/null 2>&1 || [ -d "$HOME/.codex" ]; then + mkdir -p "$HOME/.codex/prompts" + for s in review prefs; do + [ -f "$INSTALL_DIR/skills/$s/SKILL.md" ] || continue + { skill_body "$INSTALL_DIR/skills/$s/SKILL.md"; skill_refs "$s"; } > "$HOME/.codex/prompts/splus-$s.md" + done + ok "Codex prompts (/splus-review, /splus-prefs)" + fi + + # OpenCode — commands, slash-invocable (/splus-review). + if command -v opencode >/dev/null 2>&1 || [ -d "$HOME/.config/opencode" ]; then + mkdir -p "$HOME/.config/opencode/command" + for s in review prefs; do + [ -f "$INSTALL_DIR/skills/$s/SKILL.md" ] || continue + { + printf -- '---\ndescription: %s\n---\n' "$(skill_desc "$INSTALL_DIR/skills/$s/SKILL.md")" + skill_body "$INSTALL_DIR/skills/$s/SKILL.md" + skill_refs "$s" + } > "$HOME/.config/opencode/command/splus-$s.md" + done + ok "OpenCode commands (/splus-review, /splus-prefs)" + fi +fi + # --- done ------------------------------------------------------------------ if [ "$updating" -eq 1 ]; then printf '\n%b\n' "${c_grn}${c_b}Splus is up to date.${c_0}" else printf '\n%b\n' "${c_grn}${c_b}Splus is installed.${c_0}" - printf '%b\n' " ${c_dim}then, in your agent:${c_0} \"review my staged changes with splus\"" + printf '%b\n' " ${c_dim}then, in your agent:${c_0} /splus-review ${c_dim}(or \"review my staged changes with splus\")${c_0}" printf '%b\n' " ${c_dim}update:${c_0} splus update" fi diff --git a/package.json b/package.json index 401130f..170cb12 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "splus", - "version": "0.9.1", + "version": "0.9.2", "private": true, "description": "Splus — the precision-first, local-first code reviewer. A deterministic Rust engine your coding agent (Claude Code, Codex, OpenCode) calls over MCP. Open source, runs entirely on your machine.", "license": "MIT", diff --git a/packages/mcp/package.json b/packages/mcp/package.json index 7df7b3d..b825705 100644 --- a/packages/mcp/package.json +++ b/packages/mcp/package.json @@ -1,6 +1,6 @@ { "name": "@splus/mcp", - "version": "0.9.1", + "version": "0.9.2", "type": "module", "description": "Splus MCP server (local) — a stdio MCP server your coding agent (Claude Code, Codex, OpenCode) connects to. Runs the deterministic review engine on your local checkout and applies this repo's learned suppressions. No account, no token, nothing leaves your machine.", "bin": { @@ -18,7 +18,6 @@ "@modelcontextprotocol/sdk": "^1.29.0", "@splus/shared": "workspace:*", "@splus/suppression": "workspace:*", - "@splus/triage": "workspace:*", "zod": "^3.23.8" }, "devDependencies": { diff --git a/packages/mcp/src/index.ts b/packages/mcp/src/index.ts index 50b1167..f014587 100644 --- a/packages/mcp/src/index.ts +++ b/packages/mcp/src/index.ts @@ -21,10 +21,10 @@ * SPLUS_ENGINE path to the splus-engine binary (else auto-resolved / PATH) * * Tools: - * review — review staged / working / base..HEAD / whole-repo changes (reads splus.md) + * review — review staged / working / base..HEAD / whole-repo changes (reads SPLUS.md) * inspect — the engine on tap: definition / callers / blast_radius / complexity / exports / imports * floor — re-ground on the deterministic finding floor for a scope (no directive) - * preferences — show the merged splus.md contract (repo + ~/.splus) + * preferences — show the merged SPLUS.md contract (repo + ~/.splus) * report — render the review as a standalone offline HTML report (final step) * dismiss — teach Splus a finding is noise (generalizes semantically) * accept — teach Splus a finding was real (reinforces + stores recallable memory) @@ -43,6 +43,8 @@ import type { CallToolResult } from "@modelcontextprotocol/sdk/types.js"; import { z } from "zod"; import { applyPolicy, + changedExportedSymbols, + diffText, inspect as engineInspect, listChangedFiles, loadSplusConfig, @@ -80,10 +82,10 @@ const VERSION = typeof __SPLUS_VERSION__ !== "undefined" ? __SPLUS_VERSION__ : " const SERVER_INSTRUCTIONS = `Splus turns the coding agent already in this session into a disciplined, precision-first code reviewer. You are the reviewer in the chair — the engine is yours to interrogate, not a list to relay. There is no API key and no clock: curiosity and verification are the job, and a wrong comment costs more than a slow review. When the user asks you to review code: -1. \`review\` (mode: working/staged/base/all; precise:true for compiler-grade blast radius). It reads the repo's \`splus.md\` contract (its preferences + binding nits — honor them; they come first), returns the deterministic FLOOR of findings, and hands you a directive. Do NOT ask about a "deterministic-only" mode or an ANTHROPIC_API_KEY — neither exists here. +1. \`review\` (mode: working/staged/base/all; precise:true for compiler-grade blast radius). It reads the repo's \`SPLUS.md\` contract (its preferences + binding nits — honor them; they come first), returns the deterministic FLOOR of findings, and hands you a directive. Do NOT ask about a "deterministic-only" mode or an ANTHROPIC_API_KEY — neither exists here. 2. INVESTIGATE, don't triage a list. The floor is grounding; the review is what you find. Pull deterministic signal on demand with \`inspect\` (kind: definition | callers | blast_radius | complexity | exports | imports) — when an export looks risky, open its callers and confirm the blast radius; recurse when something smells off. Use \`floor\` to re-ground a file subset. Read the changed code for what determinism can't see: logic, security, intent, concurrency. 3. VERIFY every finding by trying to refute it against the cited line. Drop any you can't defend. Then REPORT survivors as must-fix / concern / nit with file:line and a concrete fix. -4. TEACH the repo: \`dismiss \` for noise, \`accept \` for real, \`note\` to record a convention. \`preferences\` shows the active \`splus.md\`; \`recall\` surfaces what was learned here before. +4. TEACH the repo: \`dismiss \` for noise, \`accept \` for real, \`note\` to record a convention. \`preferences\` shows the active \`SPLUS.md\`; \`recall\` surfaces what was learned here before. Other tools: \`report\` renders the offline HTML deliverable, \`mute\` silences a rule, \`learnings\` lists what's been taught, \`index\` builds a SCIP index for precise blast radius.`; @@ -199,15 +201,15 @@ function toAgentReport(report: Report, suppressed: SuppressedFinding[]) { } /** - * The `splus.md` contract block, prepended to the review so the agent reads the + * The `SPLUS.md` contract block, prepended to the review so the agent reads the * repo's standing preferences BEFORE planning — and an honest record of any * finding the binding `mute:`/`skip:` rules dropped (never silent). */ function prefsBlock(cfg: SplusConfig, dropped: { ruleId: string; file: string; reason: string }[]): string { - const lines: string[] = ["=== splus.md · the repo's review contract (read first) ==="]; + const lines: string[] = ["=== SPLUS.md · the repo's review contract (read first) ==="]; if (cfg.source === "none") { lines.push( - "No splus.md found. Reviewing with engine defaults. If the user has standing preferences or", + "No SPLUS.md found. Reviewing with engine defaults. If the user has standing preferences or", "repo nits, offer to scaffold one (the `prefs` skill) — it makes the next review serve their taste.", ); } else { @@ -220,11 +222,11 @@ function prefsBlock(cfg: SplusConfig, dropped: { ruleId: string; file: string; r if (dropped.length) { lines.push( "", - `Dropped by splus.md policy (${dropped.length}): ` + + `Dropped by SPLUS.md policy (${dropped.length}): ` + dropped.map((d) => `${d.ruleId}@${d.file} (${d.reason})`).join("; "), ); } - lines.push("=== end splus.md ==="); + lines.push("=== end SPLUS.md ==="); return lines.join("\n"); } @@ -241,30 +243,42 @@ function summaryLine(report: Report, suppressedCount: number): string { * stages (triage → discover → verify → report → teach) so the review is run, not * relayed. No API key — the frontier model already in the session does the work. */ -function discoveryDirective(files: string[]): string { +function discoveryDirective(files: string[], changedSymbols: string[] = []): string { const shown = files.slice(0, 40); const more = files.length - shown.length; const list = (shown.map((f) => ` - ${f}`).join("\n") || " (no changed files)") + (more > 0 ? `\n …and ${more} more` : ""); + const symbolBlock = changedSymbols.length + ? [ + "", + "Changed exported symbols (deterministic — the contracts this change touches):", + ...changedSymbols.map((s) => ` - ${s}`), + ] + : []; return [ "=== Splus · you are the reviewer (one flow — no API key, no clock) ===", "The findings above are the DETERMINISTIC FLOOR — high-precision, each anchored to a pattern, metric, or cross-file graph edge. They are NOT the review. You are the senior reviewer in the chair, seeing this code for the first time: run the full protocol over the changed code yourself. Take the time it takes — curiosity is the job.", "", "Changed files:", list, + ...symbolBlock, "", "1. TRIAGE — for each finding above, decide keep vs suppress. Optimize for signal: suppress test fixtures, idiomatic patterns for the file's role, and pure style; keep what a senior reviewer would genuinely want fixed before merge. A noisy comment costs more than a missed nit.", - "2. DISCOVER — read the changed code and find what determinism cannot. Don't guess — INTERROGATE THE ENGINE with `inspect` (kind: definition | callers | blast_radius | complexity | exports | imports); when a hunk smells off, open its callers and confirm the blast radius before you move on. Each finding must be grounded in a line that exists:", - " • correctness — off-by-one, missing await / unhandled error path, wrong condition, null/undefined deref, resource leak, broken invariant", + "2. TRACE CONTRACTS — for EACH changed exported symbol above (or any changed function if the list is empty), run this discipline; it catches the single most-missed real-bug class (return-shape drift):", + " a. enumerate what it returns/throws on EVERY path after this change — success, error, missing/invalid input, each early return — with the exact shape (object keys, wrapper types like {success,data,error}, Response vs parsed body, promise vs value);", + " b. `inspect callers` / `inspect blast_radius`, then OPEN each call site and state what shape that caller assumes (property accesses, destructuring, truthiness checks);", + " c. report every mismatch. One changed function often breaks several callers — finding one mismatch means checking the remaining call sites, not stopping.", + "3. DISCOVER — read the changed code and find what determinism cannot. Don't guess — INTERROGATE THE ENGINE with `inspect` (kind: definition | callers | blast_radius | complexity | exports | imports); when a hunk smells off, open its callers and confirm the blast radius before you move on. Each finding must be grounded in a line that exists:", + " • correctness — off-by-one, missing await / unhandled error path, wrong condition, case-sensitive comparison where input case varies, null/undefined deref, resource leak, broken invariant", " • security — injection / path-traversal / SSRF reachable from input, authz/IDOR gaps, unsafe deserialization, secret & credential handling, command or eval", " • intent — does the code do what its name, comments, and the change claim? dead, contradictory, or silently fail-open logic", - " • failure & concurrency — races, partial writes, retries, fail-open where it must fail-closed", - " • blast radius — for any changed export, `inspect callers` / `inspect blast_radius` and open each call site to confirm it still holds", - "3. VERIFY — before posting anything, re-read each candidate's cited line and try to REFUTE it. Drop any you can't defend (already handled nearby, speculative, the line doesn't actually demonstrate it). A wrong comment costs more than a missed nit.", - "4. REPORT — the survivors as must-fix / concern / nit with file:line and a concrete fix. Never invent a finding; every claim cites a real line.", - "5. TEACH — `dismiss ` when the user agrees something is noise, `accept ` when they act on a real one — Splus learns this repo both ways.", - "6. RENDER — the deliverable that ends the review: call the `report` tool, fill the returned HTML template with the verdict + your verified survivors + the file-level impact graph, and write `splus-report.html`. One self-contained, offline file — the artifact a dev keeps next to the diff.", + " • failure & concurrency — races, partial writes, retries, fail-open where it must fail-closed, concurrent mutation of shared state on request paths", + " Spend your comments on the CHANGE's own logic. Do NOT pad the review with generic best-practice concerns (timing-safe comparison, rate limiting, header normalization, hardening on trusted paths) unless the diff clearly introduces the flaw — that padding is the #1 source of review noise.", + "4. VERIFY — before posting anything, re-read each candidate's cited line and try to REFUTE it. Drop any you can't defend (already handled nearby, speculative, the line doesn't actually demonstrate it). A wrong comment costs more than a missed nit.", + "5. REPORT — the survivors as must-fix / concern / nit with file:line and a concrete fix. Never invent a finding; every claim cites a real line.", + "6. TEACH — `dismiss ` when the user agrees something is noise, `accept ` when they act on a real one — Splus learns this repo both ways.", + "7. RENDER — the deliverable that ends the review: call the `report` tool, fill the returned HTML template with the verdict + your verified survivors + the file-level impact graph, and write `splus-report.html`. One self-contained, offline file — the artifact a dev keeps next to the diff.", ].join("\n"); } @@ -386,7 +400,7 @@ server.registerTool( ); } - // The repo contract (`splus.md`): inject its prose, enforce its binding + // The repo contract (`SPLUS.md`): inject its prose, enforce its binding // `mute:`/`skip:` rules. This runs BEFORE learned suppression so a stated // preference always wins over the engine's defaults. const cfg = loadSplusConfig(repo); @@ -416,12 +430,22 @@ server.registerTool( const payload = toAgentReport(report, suppressed); const body = `${summaryLine(report, suppressed.length)}\n\n${JSON.stringify(payload, null, 2)}${reinforcedNote}${preciseNote}`; + // Deterministic AIM for the contract-trace stage: which exported symbols the + // diff actually touches (engine exports ∩ hunks). Best-effort and capped — + // an engine hiccup or a huge change surface must never block the review. + const changedFiles = listChangedFiles(repo, dmode); + let changedSymbols: string[] = []; + try { + changedSymbols = await changedExportedSymbols(repo, changedFiles.slice(0, 20), diffText(repo, dmode)); + } catch { + /* aim is enrichment, never load-bearing */ + } // The handoff, and the whole product: read the repo contract first, ground the // agent with the deterministic floor, then drive it through the protocol. The // directive is ALWAYS appended — there is no other path, no key, no headless // mode to choose between. return ok( - `${prefsBlock(cfg, policy.dropped)}\n\n${body}\n\n${discoveryDirective(listChangedFiles(repo, dmode))}`, + `${prefsBlock(cfg, policy.dropped)}\n\n${body}\n\n${discoveryDirective(changedFiles, changedSymbols)}`, ); }, ); @@ -476,7 +500,7 @@ server.registerTool( description: "Return the engine's deterministic finding FLOOR for a scope as JSON — the same grounded set " + "`review` starts from, but without the directive. Use it to re-check a file subset or a " + - "different scope mid-investigation. The repo's `splus.md` binding rules are applied; learned " + + "different scope mid-investigation. The repo's `SPLUS.md` binding rules are applied; learned " + "suppression is not (this is the raw floor).", inputSchema: { root: z.string().optional().describe("Repo root (default: server CWD)."), @@ -504,15 +528,15 @@ server.registerTool( }, ); -// --- preferences (the splus.md contract) ----------------------------------- +// --- preferences (the SPLUS.md contract) ----------------------------------- server.registerTool( "preferences", { - title: "Show the active splus.md contract", + title: "Show the active SPLUS.md contract", description: - "Return the merged `splus.md` review contract for this repo (repo `./splus.md` layered over " + - "`~/.splus/splus.md`), including its binding `mute:`/`skip:` rules. This is the repo's standing " + + "Return the merged `SPLUS.md` review contract for this repo (repo `./SPLUS.md` layered over " + + "`~/.splus/SPLUS.md`), including its binding `mute:`/`skip:` rules. This is the repo's standing " + "preferences + nits — `review` already injects it, but call this to read it directly.", inputSchema: { root: z.string().optional().describe("Repo root (default: server CWD)."), @@ -523,12 +547,12 @@ server.registerTool( const cfg = loadSplusConfig(rootOf(root)); if (cfg.source === "none") { return ok( - "No splus.md found (repo root or ~/.splus/splus.md). Reviewing with engine defaults — the " + + "No SPLUS.md found (repo root or ~/.splus/SPLUS.md). Reviewing with engine defaults — the " + "`prefs` skill scaffolds one so the next review serves the repo's taste.", ); } return ok( - `splus.md (${cfg.source})\nmuted rules: ${cfg.mutedRules.join(", ") || "—"}\n` + + `SPLUS.md (${cfg.source})\nmuted rules: ${cfg.mutedRules.join(", ") || "—"}\n` + `skip paths: ${cfg.skipPaths.join(", ") || "—"}\n\n${cfg.raw.trim()}`, ); }, @@ -764,7 +788,7 @@ server.registerTool( "Remember a repo convention or context you discovered while reviewing (e.g. 'this module uses " + "Result, never throws' or 'auth/ requires every handler to call requireSession first'), so " + "future reviews `recall` it. Complements `accept` (which remembers confirmed findings). Written " + - "to .splus-cache/memory.json; promotable into splus.md for a binding rule.", + "to .splus-cache/memory.json; promotable into SPLUS.md for a binding rule.", inputSchema: { root: z.string().optional().describe("Repo root (default: server CWD)."), text: z.string().describe("The convention/context to remember, in one sentence."), @@ -775,7 +799,7 @@ server.registerTool( async ({ root, text, file }) => { const store = new FileMemoryStore(memoryPath(rootOf(root))); await store.remember({ kind: "note", text, file }); - return ok(`Noted. Splus will recall this on future reviews of this repo. (Promote it into splus.md to make it a binding rule.)`); + return ok(`Noted. Splus will recall this on future reviews of this repo. (Promote it into SPLUS.md to make it a binding rule.)`); }, ); diff --git a/packages/shared/package.json b/packages/shared/package.json index d6bd3ff..3072fe0 100644 --- a/packages/shared/package.json +++ b/packages/shared/package.json @@ -1,6 +1,6 @@ { "name": "@splus/shared", - "version": "0.9.1", + "version": "0.9.2", "type": "module", "main": "./dist/index.js", "types": "./dist/index.d.ts", @@ -12,7 +12,8 @@ }, "scripts": { "build": "tsc -p tsconfig.json", - "typecheck": "tsc -p tsconfig.json --noEmit" + "typecheck": "tsc -p tsconfig.json --noEmit", + "test": "node --test dist/*.test.js" }, "dependencies": { "zod": "^3.23.8" diff --git a/packages/shared/src/index.ts b/packages/shared/src/index.ts index cb72def..d6f9d92 100644 --- a/packages/shared/src/index.ts +++ b/packages/shared/src/index.ts @@ -290,6 +290,86 @@ export function listChangedFiles(root: string, mode: DiffMode): string[] { .filter(Boolean); } +/** + * The unified diff text for a mode — the same change surface `listChangedFiles` + * names, as hunks. Mode `all` has no diff (the whole repo is "the change"). + * Returns "" on any git failure: callers treat the diff as enrichment, never load-bearing. + */ +export function diffText(root: string, mode: DiffMode): string { + if (mode.kind === "all") return ""; + if (mode.kind === "base") assertSafeRef(mode.ref); + const args = + mode.kind === "staged" + ? ["diff", "--cached"] + : mode.kind === "base" + ? ["diff", `${mode.ref}...HEAD`, "--"] + : ["diff", "HEAD"]; + const r = spawnSync("git", args, { + cwd: resolve(root), + encoding: "utf8", + maxBuffer: 64 * 1024 * 1024, + }); + return r.status === 0 ? (r.stdout ?? "") : ""; +} + +/** New-side line ranges per file from a unified diff's hunk headers. */ +export function changedLineRanges(diff: string): Map> { + const map = new Map>(); + let file = ""; + for (const line of diff.split("\n")) { + const f = line.match(/^\+\+\+ b\/(.+)/); + if (f) { + file = f[1] ?? ""; + continue; + } + const h = line.match(/^@@ -\d+(?:,\d+)? \+(\d+)(?:,(\d+))? @@/); + if (h && file) { + const start = Number(h[1]); + const count = h[2] !== undefined ? Number(h[2]) : 1; + const arr = map.get(file) ?? []; + arr.push([start, start + Math.max(count, 1) - 1]); + map.set(file, arr); + } + } + return map; +} + +/** + * The diff's change surface as exported-symbol names: for each changed file, ask + * the engine for its exports (tree-sitter) and keep the ones whose body overlaps + * a diff hunk. One line per file: `path: symbolA, symbolB`. This is deterministic + * AIM for a reviewer — "these are the contracts the change touches; trace each + * into its callers." Best-effort: any engine failure just drops that file. + */ +export async function changedExportedSymbols(root: string, files: string[], diff: string): Promise { + const ranges = changedLineRanges(diff); + const out: string[] = []; + for (const file of files) { + const fileRanges = ranges.get(file); + if (!fileRanges?.length) continue; + let exports: Array<{ name: string; line: number; kind: string }>; + try { + const r = (await inspect({ root, kind: "exports", target: file })) as { + exports?: Array<{ name: string; line: number; kind: string }>; + }; + exports = (r.exports ?? []).filter((e) => e.line > 0).sort((a, b) => a.line - b.line); + } catch { + continue; + } + const touched: string[] = []; + for (let i = 0; i < exports.length; i++) { + const cur = exports[i]; + if (!cur) continue; + // Approximate body span: from this export's line to just before the next. + const start = cur.line; + const end = exports[i + 1] ? exports[i + 1]!.line - 1 : Number.MAX_SAFE_INTEGER; + if (fileRanges.some(([a, b]) => b >= start && a <= end)) touched.push(cur.name); + } + if (touched.length) out.push(`${file}: ${touched.join(", ")}`); + } + return out; +} + /** Map our severities to a numeric rank (mirrors the engine). */ export function severityRank(s: Severity): number { return { critical: 4, high: 3, medium: 2, low: 1, info: 0 }[s]; @@ -301,5 +381,5 @@ export function exceedsThreshold(report: Report, failOn: Severity): boolean { return report.findings.some((f) => severityRank(f.severity) >= t); } -// The per-repo review contract (`splus.md`): loader + binding policy. +// The per-repo review contract (`SPLUS.md`): loader + binding policy. export * from "./splusMd.js"; diff --git a/packages/shared/src/splusMd.test.ts b/packages/shared/src/splusMd.test.ts index fe19ab6..4c58f71 100644 --- a/packages/shared/src/splusMd.test.ts +++ b/packages/shared/src/splusMd.test.ts @@ -8,7 +8,7 @@ import type { Finding } from "./index.js"; function tmpRepo(splusMd?: string): string { const dir = mkdtempSync(join(tmpdir(), "splus-md-")); - if (splusMd !== undefined) writeFileSync(join(dir, "splus.md"), splusMd); + if (splusMd !== undefined) writeFileSync(join(dir, "SPLUS.md"), splusMd); return dir; } @@ -34,7 +34,7 @@ function finding(over: Partial): Finding { test("loads repo contract and parses mute/skip directives", () => { const repo = tmpRepo( [ - "# splus.md", + "# SPLUS.md", "## nits", "- console.log is fine in scripts.", "- mute: hygiene.console-log", @@ -49,6 +49,16 @@ test("loads repo contract and parses mute/skip directives", () => { assert.deepEqual(cfg.skipPaths, ["generated/**"]); }); +test("user-level contract loads from ~/.splus", () => { + const repo = tmpRepo(); + const home = mkdtempSync(join(tmpdir(), "splus-home-")); + mkdirSync(join(home, ".splus"), { recursive: true }); + writeFileSync(join(home, ".splus", "SPLUS.md"), "skip: vendored/**"); + const cfg = loadSplusConfig(repo, home); + assert.equal(cfg.source, "user"); + assert.deepEqual(cfg.skipPaths, ["vendored/**"]); +}); + test("missing file yields an empty, non-throwing contract", () => { const repo = tmpRepo(); const cfg = loadSplusConfig(repo, repo); diff --git a/packages/shared/src/splusMd.ts b/packages/shared/src/splusMd.ts index 170fcc2922cba5cd5ec808ede01412cf0e9eaccc..03614a2d52a1949d9eb63920318a3228ff6ed48a 100644 GIT binary patch delta 83 zcmbQBI6;w1UtdunAvnM%G }; + const res = JSON.parse(out) as { result?: string; is_error?: boolean; usage?: Record }; const text = res.result ?? ""; + if (res.is_error) throw new Error(`claude -p errored: ${text.slice(0, 300)}`); const parsed = tool ? parseJsonLoose(text) : null; + // The pipeline always forces tool choice, so a reply without parseable tool + // JSON is an error — throw (fail closed) rather than hand back a text block + // the caller would read as "zero findings" and record as a real score. + if (tool && !parsed) { + throw new Error(`claude -p returned unparseable "${tool.name}" input: ${text.slice(0, 300)}`); + } return { content: tool && parsed ? [{ type: "tool_use", name: tool.name, input: parsed }] : [{ type: "text", text }], diff --git a/packages/triage/src/index.test.ts b/packages/triage/src/index.test.ts index 602acca..575f43a 100644 --- a/packages/triage/src/index.test.ts +++ b/packages/triage/src/index.test.ts @@ -211,8 +211,8 @@ test("signal budget caps low/medium discoveries per file to the most-confident f writeFileSync(join(dir, "src", "a.ts"), Array.from({ length: 12 }, (_, i) => `line${i + 1}`).join("\n") + "\n"); // Five low-severity discoveries on ONE file — an over-firing pass. Verify - // affirms all (so the cut is the budget, not verification). Only the 3 - // most-confident may surface; the other 2 are demoted, not deleted. + // affirms all (so the cut is the budget, not verification). Only the 4 + // most-confident may surface; the other 1 is demoted, not deleted. const confidences: Record = { 1: 0.9, 2: 0.5, 3: 0.8, 4: 0.6, 5: 0.7 }; const client: LLMClient = { messages: { @@ -244,16 +244,16 @@ test("signal budget caps low/medium discoveries per file to the most-confident f const out = await triage(rep, { root: dir, client, thorough: true, verify: true, changedFiles: ["src/a.ts"] }); assert.equal(out.llm.discovered, 5, "five discovered"); - assert.equal(out.llm.budgeted, 2, "two demoted by the per-file budget"); - assert.equal(out.findings.length, 3, "only the 3 most-confident surface"); - assert.equal(out.summary.findings_total, 3, "summary reflects the final kept set"); + assert.equal(out.llm.budgeted, 1, "one demoted by the per-file budget"); + assert.equal(out.findings.length, 4, "only the 4 most-confident surface"); + assert.equal(out.summary.findings_total, 4, "summary reflects the final kept set"); assert.equal(out.summary.must_fix, 0); assert.equal(out.summary.concern, 0); - assert.equal(out.summary.nit, 3); + assert.equal(out.summary.nit, 4); const surfaced = out.findings.map((f) => f.title).sort(); - assert.deepEqual(surfaced, ["nit-1", "nit-3", "nit-5"], "kept the top-3 by confidence (0.9/0.8/0.7)"); + assert.deepEqual(surfaced, ["nit-1", "nit-3", "nit-4", "nit-5"], "kept the top-4 by confidence (0.9/0.8/0.7/0.6)"); const demoted = out.suppressed.filter((f) => /signal budget/.test(f.rationale ?? "")); - assert.equal(demoted.length, 2, "demoted ones are visible in suppressed, not deleted"); + assert.equal(demoted.length, 1, "demoted one is visible in suppressed, not deleted"); }); test("verify is fail-closed for low-severity speculation, fail-open for medium+", async () => { diff --git a/packages/triage/src/index.ts b/packages/triage/src/index.ts index 3bb8678..e067d44 100644 --- a/packages/triage/src/index.ts +++ b/packages/triage/src/index.ts @@ -13,7 +13,7 @@ import Anthropic from "@anthropic-ai/sdk"; import { readFileSync } from "node:fs"; import { join } from "node:path"; import type { Finding, Report, Severity } from "@splus/shared"; -import { Severity as SeverityEnum, Category as CategoryEnum } from "@splus/shared"; +import { Severity as SeverityEnum, Category as CategoryEnum, changedExportedSymbols } from "@splus/shared"; export const TRIAGE_MODEL = "claude-haiku-4-5"; export const DISCOVERY_MODEL = "claude-opus-4-8"; @@ -417,51 +417,24 @@ function selectDiscoverySurface( return { files: surface.slice(0, DISCOVERY_FILE_CAP), notes }; } -async function discover( - client: LLMClient, - model: string, - root: string, - files: string[], - diff: string | undefined, - accUsage: (r: LLMResponse) => void, -): Promise { - const blocks = files - .map((f) => { - const src = readFileSafe(join(root, f)); - if (!src) return null; - return `### ${f}\n${numberLines(src)}`; - }) - .filter(Boolean) - .join("\n\n"); - if (!blocks && !diff) return []; - - // The DIFF is the focus (bugs live in changed lines); the full files are - // context for reasoning. Lead with the diff so the model reviews the change, - // not the whole file blind — the single biggest recall lever. - const userContent = diff - ? `Review this DIFF. The added/changed lines (\`+\`) are where bugs are most likely; cite line numbers from the FULL FILES below.\n\n=== DIFF (the change under review) ===\n${diff.slice(0, 60000)}\n\n=== FULL FILES (numbered, for context + line numbers) ===\n${blocks}` - : `Changed files:\n\n${blocks}`; - - const res = await client.messages.create({ - model, - max_tokens: 4096, - system: [ - { - type: "text", - text: - "You are Splus in deep-review mode, reviewing the NEW/changed code in a diff. Find EVERY plausible bug the change introduces: logic errors, off-by-one, missing await/unhandled error paths, null/undefined, broken invariants, resource leaks, race conditions, security (injection, SSRF, path-traversal, authz/IDOR, unsafe deserialization, secrets), breaking API/signature changes, and intent/spec mismatches (code that doesn't do what its name/comment/PR claims). " + - "Prioritize RECALL: a separate verification pass removes false positives, so err toward flagging anything a careful senior reviewer might raise — but only about the CHANGED code, and each finding MUST cite a real line number from the provided files. Don't flag pre-existing code the diff didn't touch.", - cache_control: { type: "ephemeral" }, - }, - ], - tools: [DISCOVERY_TOOL], - tool_choice: { type: "tool", name: "report_findings" }, - messages: [{ role: "user", content: userContent }], - }); - accUsage(res); - +/** Lens A — broad sweep over the change. */ +const DISCOVER_SWEEP_SYSTEM = + "You are Splus in deep-review mode, reviewing the NEW/changed code in a diff. Find EVERY plausible bug the change introduces, in this priority order: (1) the change's own logic — wrong variable, inverted/incomplete condition, off-by-one, case-sensitive comparison where input case varies, broken invariants, intent/spec mismatches (code that doesn't do what its name/comment/PR claims); (2) error and edge paths — missing await, unhandled rejection/exception, null/undefined, resource leaks, concurrent mutation of shared state; (3) breaking API/signature changes; (4) security the diff itself introduces (injection, SSRF, path-traversal, authz/IDOR, unsafe deserialization, secrets). " + + "Do NOT pad the review with generic best-practice concerns (timing-safe comparison, rate limiting, header normalization, input sanitization on trusted paths) unless the diff clearly introduces the flaw. " + + "Prioritize RECALL on the changed logic: a separate verification pass removes false positives, so err toward flagging anything a careful senior reviewer might raise — but only about the CHANGED code, and each finding MUST cite a real line number from the provided files. Don't flag pre-existing code the diff didn't touch."; + +/** Lens B — trace each changed contract into its callers. */ +const DISCOVER_CONTRACT_SYSTEM = + "You are Splus in contract-tracing mode. The diff changes the functions listed under CHANGED SYMBOLS (extracted deterministically). For EACH one, trace the data flow with discipline:\n" + + "1. RETURN SHAPE — enumerate what it returns/throws on every path after this change: success, error, missing/invalid input, each early return. Include the exact shape (object keys, wrapper types like {success,data,error}, Response vs parsed body, promise vs value).\n" + + "2. CALLERS — find every call site in the provided files (search the full files for the symbol). For each, state what shape the caller assumes (property accesses, destructuring, truthiness checks).\n" + + "3. MISMATCH — report every place a caller's assumption no longer matches the new behavior. This bug class (return-shape drift, sentinel/hardcoded fallback values, Response-vs-data confusion) is the reason this pass exists; one changed function often breaks several callers.\n" + + "Also check each changed comparison for case/type strictness against the data it actually receives, and each shared structure mutated on a request path for concurrent access.\n" + + "Report ONLY issues the change introduces, each citing a real line number from the provided files. Skip style, naming, and generic hardening concerns entirely."; + +/** Parse one discovery response into findings (shared by both lenses). */ +function parseDiscoveries(res: LLMResponse, fileSet: Set): TriagedFinding[] { const out: TriagedFinding[] = []; - const fileSet = new Set(files); for (const d of parseTool<{ findings: DiscoveryItem[] }>(res)?.findings ?? []) { if (!fileSet.has(d.file)) continue; // must cite a provided file (anti-hallucination) // The model is tool-constrained but not schema-validated on the client side, @@ -493,6 +466,89 @@ async function discover( return out; } +async function discover( + client: LLMClient, + model: string, + root: string, + files: string[], + diff: string | undefined, + accUsage: (r: LLMResponse) => void, +): Promise { + const blocks = files + .map((f) => { + const src = readFileSafe(join(root, f)); + if (!src) return null; + return `### ${f}\n${numberLines(src)}`; + }) + .filter(Boolean) + .join("\n\n"); + if (!blocks && !diff) return []; + + // The DIFF is the focus (bugs live in changed lines); the full files are + // context for reasoning. Lead with the diff so the model reviews the change, + // not the whole file blind — the single biggest recall lever. + const userContent = diff + ? `Review this DIFF. The added/changed lines (\`+\`) are where bugs are most likely; cite line numbers from the FULL FILES below.\n\n=== DIFF (the change under review) ===\n${diff.slice(0, 60000)}\n\n=== FULL FILES (numbered, for context + line numbers) ===\n${blocks}` + : `Changed files:\n\n${blocks}`; + + const fileSet = new Set(files); + const call = (system: string, content: string) => + client.messages + .create({ + model, + max_tokens: 4096, + system: [{ type: "text", text: system, cache_control: { type: "ephemeral" } }], + tools: [DISCOVERY_TOOL], + tool_choice: { type: "tool", name: "report_findings" }, + messages: [{ role: "user", content }], + }) + .then((res) => { + accUsage(res); + return parseDiscoveries(res, fileSet); + }); + + // Lens B only exists when there is a diff to anchor the contract trace. Its + // CHANGED SYMBOLS block comes from the engine (tree-sitter exports ∩ diff + // hunks) — deterministic aim for the data-flow class the sweep keeps missing. + // Engine signal is best-effort: an empty block still leaves a useful lens. + const lenses = [call(DISCOVER_SWEEP_SYSTEM, userContent)]; + if (diff) { + const symbols = await changedExportedSymbols(root, files, diff); + const symbolBlock = symbols.length + ? `=== CHANGED SYMBOLS (deterministic — exported symbols whose body the diff touches) ===\n${symbols.join("\n")}\n\n` + : ""; + lenses.push(call(DISCOVER_CONTRACT_SYSTEM, symbolBlock + userContent)); + } + + const settled = await Promise.allSettled(lenses); + const found = settled.filter((s) => s.status === "fulfilled").map((s) => (s as PromiseFulfilledResult).value); + // One lens failing (CLI hiccup) must not kill the whole discovery pass — but + // BOTH failing means no discovery happened: surface that instead of a silent + // empty review (the benchmark would record it as a legit zero-finding score). + if (found.length === 0) { + const first = settled[0] as PromiseRejectedResult; + throw first.reason instanceof Error ? first.reason : new Error(String(first.reason)); + } + return dedupeDiscoveries(found.flat()); +} + +/** file:line + normalized-title dedup across lenses (first occurrence wins). */ +function dedupeDiscoveries(all: TriagedFinding[]): TriagedFinding[] { + const out: TriagedFinding[] = []; + const seenLoc = new Set(); + const seenTitle = new Set(); + for (const f of all) { + const loc = `${f.file}:${f.region.start_line}`; + const title = f.title.toLowerCase().replace(/[^a-z0-9]+/g, " ").trim().slice(0, 64); + if (seenLoc.has(loc) || seenTitle.has(title)) continue; + seenLoc.add(loc); + seenTitle.add(title); + out.push(f); + } + return out; +} + + interface DiscoveryItem { file: string; line: number; @@ -633,7 +689,7 @@ function severityRank(s: Severity): number { } /** Most low/medium UNGROUNDED findings surfaced per changed file. */ -const UNGROUNDED_PER_FILE = 3; +const UNGROUNDED_PER_FILE = 4; /** * Cap the low/medium UNGROUNDED (LLM-discovered) findings surfaced per file to diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 53a56ca..bf40c5d 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -23,9 +23,6 @@ importers: '@splus/suppression': specifier: workspace:* version: link:../suppression - '@splus/triage': - specifier: workspace:* - version: link:../triage zod: specifier: ^3.23.8 version: 3.25.76 diff --git a/skills/prefs/SKILL.md b/skills/prefs/SKILL.md index 68e6303..a1ee334 100644 --- a/skills/prefs/SKILL.md +++ b/skills/prefs/SKILL.md @@ -1,6 +1,6 @@ --- name: Splus Preferences -description: This skill should be used when the user wants to set up or change how Splus reviews this repo — "add a splus nit", "splus should never flag X", "set up splus.md", "tell splus to skip generated files", "splus review preferences". +description: This skill should be used when the user wants to set up or change how Splus reviews this repo — "add a splus nit", "splus should never flag X", "set up SPLUS.md", "tell splus to skip generated files", "splus review preferences". user-invocable: true allowed-tools: - mcp__splus__preferences @@ -12,11 +12,11 @@ allowed-tools: - Grep --- -# Splus Preferences — author `splus.md` +# Splus Preferences — author `SPLUS.md` -`splus.md` is the repo's review contract. Splus reads it **first** on every +`SPLUS.md` is the repo's review contract. Splus reads it **first** on every review: the prose guides the reviewer; a small structured subset is **binding**. -Repo `./splus.md` layers over the user's `~/.splus/splus.md`. +Repo `./SPLUS.md` layers over the user's `~/.splus/SPLUS.md`. ## What goes in it - **Prose** (injected into the reviewer, soft-binding): policy, conventions, nits, @@ -29,17 +29,17 @@ Repo `./splus.md` layers over the user's `~/.splus/splus.md`. segment), e.g. `skip: generated/**`, `skip: examples/keys.sample.env`. ## To create one -1. Check for an existing `./splus.md` (Read it) and run `preferences` to see what's +1. Check for an existing `./SPLUS.md` (Read it) and run `preferences` to see what's active. Also glance at `learnings` — patterns already dismissed/muted are good candidates to promote into a durable `mute:`. 2. Infer the repo's conventions from its code (test layout, error style, generated dirs) and from what the user tells you. -3. Write `./splus.md` using the template below. Confirm the binding rules with the +3. Write `./SPLUS.md` using the template below. Confirm the binding rules with the user before adding them — they silently drop findings. ## Template ```markdown -# splus.md — how this repo wants to be reviewed +# SPLUS.md — how this repo wants to be reviewed ## policy - signal budget: keep it tight; lead with the worst thing. @@ -61,4 +61,4 @@ terse. no praise padding. When you add a `mute:`/`skip:`, tell the user exactly what will stop being flagged. To make a one-off correction instead, prefer `dismiss`/`mute` from the review flow; -use `splus.md` for the standing contract. +use `SPLUS.md` for the standing contract. diff --git a/skills/review/SKILL.md b/skills/review/SKILL.md index 8a7e1ee..68f4273 100644 --- a/skills/review/SKILL.md +++ b/skills/review/SKILL.md @@ -31,7 +31,7 @@ slow review. ## The flow ### 0. Read the contract — FIRST, always -Before anything else, read `splus.md` (the repo's review contract). `review` +Before anything else, read `SPLUS.md` (the repo's review contract). `review` injects it and `preferences` returns it, but treat it as step zero: it encodes the repo's standing preferences, nits, and binding `mute:`/`skip:` rules. **It overrides the engine's defaults and your own taste.** If there is none, you may @@ -49,7 +49,7 @@ the worst judge of it. So review in **fresh sub-agents** that see the diff cold: - Partition the changed files into a few coherent **units** (by directory / subsystem). For a small diff, one unit is fine. - For each unit, spawn a **fresh `Task` sub-agent** with the `references/investigate.md` - protocol. Hand it only: the unit's files, the `splus.md` contract, the floor for + protocol. Hand it only: the unit's files, the `SPLUS.md` contract, the floor for those files (`floor`), and the stated intent (PR title / commit message) — **not** this session's implementation narrative. - Each sub-agent INVESTIGATES with the toolbelt instead of guessing: `inspect` @@ -79,10 +79,19 @@ the diff. - `mute ` when a whole class is unwanted here. ## Lenses -Within a unit, a thorough reviewer applies every lens — correctness, security, -intent, failure/concurrency, blast-radius. See `references/lenses.md`. For a large -unit, fan out one sub-agent per lens (each blind to the others) so different -failure modes get found instead of one pass. +Within a unit, a thorough reviewer applies every lens — contract-drift, +correctness, security, intent, failure/concurrency, blast-radius. See +`references/lenses.md`. For a large unit, fan out one sub-agent per lens (each +blind to the others) so different failure modes get found instead of one pass. + +Two disciplines outrank the rest (they decide real-world precision and recall): +- **Trace every changed contract.** `review` lists the changed exported symbols + deterministically. For each: enumerate the return/throw shape on every path, + open every caller, report every assumption that no longer holds. Return-shape + drift is the most-missed real-bug class. +- **No checklist padding.** Generic hardening concerns (timing-safe compares, + rate limiting, header casing) are noise unless the diff itself introduces the + flaw — they crowd out the comments that matter. ## The standard you're held to Coverage, not speed. For every changed export, you should have `inspect`ed its diff --git a/skills/review/references/dispatch.md b/skills/review/references/dispatch.md index 2cfe0f1..5e764e1 100644 --- a/skills/review/references/dispatch.md +++ b/skills/review/references/dispatch.md @@ -1,7 +1,7 @@ # Dispatch — fan out fresh reviewers (and degrade gracefully) Fresh context per reviewer is the point: no author bias, no session clutter, small -windows. The orchestrator (you) holds only `splus.md` + the unit list + each +windows. The orchestrator (you) holds only `SPLUS.md` + the unit list + each sub-agent's compact result — never every file at once. This is what lets a huge diff get a careful review. @@ -11,7 +11,7 @@ diff get a careful review. 2. For each unit, spawn a fresh `Task` with the `investigate.md` protocol. Give it ONLY: - the unit's file list, - - the `splus.md` contract, + - the `SPLUS.md` contract, - the floor for those files (call `floor` with the scope, or pass the relevant slice of the `review` floor), - the stated intent (PR title / commit message). @@ -21,7 +21,7 @@ diff get a careful review. dumps). 4. Spawn an **independent** verifier `Task` (the `verify.md` refuter) over the candidates. The verifier must be a different agent than any finder. -5. You synthesize: dedup, signal-budget against `splus.md`, assign tiers, then +5. You synthesize: dedup, signal-budget against `SPLUS.md`, assign tiers, then `report` + teach. For a large unit, fan out again **by lens** (one sub-agent per lens in diff --git a/skills/review/references/investigate.md b/skills/review/references/investigate.md index 58e1f8e..9821fba 100644 --- a/skills/review/references/investigate.md +++ b/skills/review/references/investigate.md @@ -3,14 +3,14 @@ You are a **fresh reviewer** seeing this code for the first time. You have no attachment to it and no memory of why it was written — that is your advantage. Judge the code against its *stated contract* (its names, comments, the change's -intent, and `splus.md`), not against any author's rationalization. +intent, and `SPLUS.md`), not against any author's rationalization. There is no clock. Your reputation is precision. Curiosity is the job: when a hunk smells off, open every call site before you move on. ## Your inputs - The unit's changed files (read them). -- The `splus.md` contract — binding preferences and nits. They come first. +- The `SPLUS.md` contract — binding preferences and nits. They come first. - The deterministic floor for these files (from `floor`). - The stated intent (PR title / commit message). @@ -22,7 +22,15 @@ smells off, open every call site before you move on. 3. **Triage the floor** — for each engine finding, keep or suppress. Optimize for signal: suppress fixtures, role-idiomatic patterns, pure style; keep what a senior reviewer genuinely wants fixed before merge. -4. **Investigate — don't guess.** Use the engine on tap: +4. **Trace contracts first.** The `review` output lists the changed exported + symbols (deterministic) — start there. For each one: enumerate what it + returns/throws on EVERY path after the change (the exact shape — object keys, + wrappers like `{success,data,error}`, `Response` vs parsed body, promise vs + value), `inspect callers`, open each call site, and report every place a + caller's assumption no longer holds. Return-shape drift is the most-missed + real-bug class; one changed function often breaks several callers — finding + one mismatch means checking the rest, not stopping. +5. **Investigate — don't guess.** Use the engine on tap: - `inspect callers ` / `inspect blast_radius ` — for every changed export, find who depends on it and **open those call sites** to confirm the change still holds for them. @@ -30,8 +38,11 @@ smells off, open every call site before you move on. - `inspect complexity ` — where the risk concentrates. - `inspect exports|imports ` — the surface and dependencies of a file. - Recurse: if a call site looks wrong, inspect *its* callers. Follow the smell. -5. **Apply every lens** (see `lenses.md`): correctness, security, intent, - failure/concurrency, blast-radius. +6. **Apply every lens** (see `lenses.md`): contract-drift, correctness, security, + intent, failure/concurrency, blast-radius. Spend the comment budget on the + change's own logic — generic best-practice padding (timing-safe compares, + rate limiting, header casing) is noise unless the diff itself introduces the + flaw. ## What to return A list of **candidate** findings. Each one MUST have: diff --git a/skills/review/references/lenses.md b/skills/review/references/lenses.md index 222977c..b32389e 100644 --- a/skills/review/references/lenses.md +++ b/skills/review/references/lenses.md @@ -7,11 +7,31 @@ reviewer looking only for races finds races a generalist skims past. Every finding, in every lens, must cite a real changed line and survive a refutation. Ground it with `inspect` rather than asserting it. +Spend the comment budget on the **change's own logic**. Generic best-practice +padding — timing-safe comparison, rate limiting, header normalization, hardening +on trusted paths — is the #1 source of review noise; raise it only when the diff +itself clearly introduces the flaw. + +## Contract drift — the most-missed real-bug class +For EVERY changed function (the `review` output lists the changed exported +symbols deterministically — start there): +1. **Enumerate what it returns/throws on every path** after the change — success, + error, missing/invalid input, each early return — with the *exact shape*: + object keys, wrapper types (`{success,data,error}`), `Response` vs parsed + body, promise vs value, sentinel/fallback values. +2. **Open every call site** (`inspect callers`, then read each one) and state + what shape that caller assumes — property accesses, destructuring, + truthiness checks. +3. **Report every mismatch.** One changed function often breaks several callers; + finding one mismatch means checking the remaining call sites, not stopping. + ## Correctness Off-by-one and boundary errors; missing `await` / unhandled rejection; a swallowed -or mis-handled error path; wrong condition or inverted boolean; null/undefined -deref; resource leak (unclosed handle, unbounded growth); a broken invariant the -surrounding code relies on; an early return that skips required cleanup. +or mis-handled error path; wrong condition or inverted boolean; a case-sensitive +comparison where the input's case varies; null/undefined deref; resource leak +(unclosed handle, unbounded growth); a broken invariant the surrounding code +relies on; an early return that skips required cleanup; validation that diverges +between the read path and the write path. ## Security Injection (SQL / command / template) reachable from input; path traversal; SSRF; diff --git a/skills/review/references/verify.md b/skills/review/references/verify.md index 646f2ea..7794d0d 100644 --- a/skills/review/references/verify.md +++ b/skills/review/references/verify.md @@ -19,7 +19,7 @@ looking for reasons it might be right; you are looking for the reason it's noise claim, or is it speculation about code that isn't there? - **Role-appropriate** — is this idiomatic and fine for this file's role (test, fixture, script, generated)? - - **Contract says so** — does `splus.md` explicitly accept this? + - **Contract says so** — does `SPLUS.md` explicitly accept this? - Use `inspect` to check: e.g. `inspect callers` to see whether a "breaking" change actually has any callers; `inspect definition` to see what a symbol truly is.