From 8b57803c9f70b59989c91404a5f14a4c4952b7d7 Mon Sep 17 00:00:00 2001 From: Pengfei Hu Date: Fri, 22 May 2026 23:14:58 -0700 Subject: [PATCH] Productize provenance kind triage --- .agents/skills/agents-shipgate/SKILL.md | 1 + .claude/commands/shipgate.md | 2 +- .cursor/rules/agents-shipgate.mdc | 6 + AGENTS.md | 3 +- STABILITY.md | 3 +- docs/agent-contract-current.md | 14 +- docs/report-reading-for-agents.md | 11 + docs/target-repo-agent-snippets.md | 6 + llms-full.txt | 17 +- .../simple_crewai_agent/expected/report.md | 14 + .../simple_langchain_agent/expected/report.md | 14 + .../expected/report.md | 14 + .../support_refund_agent/expected/packet.html | 2 +- .../support_refund_agent/expected/packet.json | 3 +- .../support_refund_agent/expected/packet.md | 1 + .../support_refund_agent/expected/report.md | 14 + skills/agents-shipgate/SKILL.md | 2 +- .../renderers/claude_code_skill.py | 8 +- .../renderers/codex_skill.py | 7 +- .../agent_instructions/renderers/cursor.py | 6 + src/agents_shipgate/cli/findings.py | 270 ++++++++++++++++++ src/agents_shipgate/cli/main.py | 8 + src/agents_shipgate/core/findings/__init__.py | 3 + .../core/findings/provenance.py | 34 +++ src/agents_shipgate/packet/builder.py | 29 ++ src/agents_shipgate/report/markdown.py | 35 ++- src/agents_shipgate/schemas/contract.py | 4 + tests/test_agent_instructions_renderers.py | 4 +- tests/test_cli.py | 129 +++++++++ tests/test_evidence_packet.py | 48 ++++ tests/test_provenance_kind.py | 50 +++- tests/test_public_surface_contract.py | 10 + 32 files changed, 756 insertions(+), 16 deletions(-) create mode 100644 src/agents_shipgate/cli/findings.py create mode 100644 src/agents_shipgate/core/findings/provenance.py diff --git a/.agents/skills/agents-shipgate/SKILL.md b/.agents/skills/agents-shipgate/SKILL.md index 983089b3..d2fe4f41 100644 --- a/.agents/skills/agents-shipgate/SKILL.md +++ b/.agents/skills/agents-shipgate/SKILL.md @@ -27,6 +27,7 @@ Do not use it for general linting, runtime monitoring, 
evals, model-output quali - Existing manifest: run `agents-shipgate scan -c shipgate.yaml --suggest-patches --format json`. - First GitHub CI: copy `assets/advisory-pr-comment.yml` to `.github/workflows/agents-shipgate.yml`. - Explain one finding: run `agents-shipgate explain-finding --from agents-shipgate-reports/report.json --json`. +- Triage heuristic findings: run `agents-shipgate findings --from agents-shipgate-reports/report.json --provenance-kind keyword_heuristic,regex_heuristic --json`. ## Boundaries diff --git a/.claude/commands/shipgate.md b/.claude/commands/shipgate.md index a2cd8b7d..cc62a729 100644 --- a/.claude/commands/shipgate.md +++ b/.claude/commands/shipgate.md @@ -18,7 +18,7 @@ Required behavior (do not skip): 1. Set `AGENTS_SHIPGATE_AGENT_MODE=1` for every CLI call so errors emit a `next_action` JSON line on stderr. 2. Run `agents-shipgate contract --json` when available and use it to verify the installed CLI's schema versions and gating signal. 3. Confirm with the user before running `agents-shipgate init --workspace . --write` (it writes `shipgate.yaml` to the workspace). -4. Parse `agents-shipgate-reports/report.json` directly — do not scrape the markdown. **For release gating, read `release_decision.decision` first** (`"blocked" | "review_required" | "insufficient_evidence" | "passed"`; baseline-aware, v0.8+; `insufficient_evidence` added v0.14) along with `release_decision.{reason, blockers, review_items, fail_policy.would_fail_ci}`. Other stable fields: `findings[].{check_id, severity, tool_name, recommendation}`. `summary.{critical_count, high_count, medium_count, status}` is legacy and baseline-blind — kept for v0.7 callers, do not lead with it. The Release Evidence Packet is at `agents-shipgate-reports/packet.{md,json,html}`. Full contract: [`docs/agent-contract-current.md`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/agent-contract-current.md). +4. 
Parse `agents-shipgate-reports/report.json` directly — do not scrape the markdown. **For release gating, read `release_decision.decision` first** (`"blocked" | "review_required" | "insufficient_evidence" | "passed"`; baseline-aware, v0.8+; `insufficient_evidence` added v0.14) along with `release_decision.{reason, blockers, review_items, fail_policy.would_fail_ci}`. Other stable fields: `findings[].{check_id, severity, tool_name, recommendation}`. For reviewer triage by source reliability, run `agents-shipgate findings --from agents-shipgate-reports/report.json --provenance-kind keyword_heuristic,regex_heuristic --json`; `findings[].provenance_kind` is not a gate input. `summary.{critical_count, high_count, medium_count, status}` is legacy and baseline-blind — kept for v0.7 callers, do not lead with it. The Release Evidence Packet is at `agents-shipgate-reports/packet.{md,json,html}`. Full contract: [`docs/agent-contract-current.md`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/agent-contract-current.md). 5. Add `agents-shipgate-reports/` to `.gitignore` if it is not already. 6. Do **not** run `agents-shipgate baseline save` in this flow — baselining is a separate decision. diff --git a/.cursor/rules/agents-shipgate.mdc b/.cursor/rules/agents-shipgate.mdc index 03162ee4..17de9878 100644 --- a/.cursor/rules/agents-shipgate.mdc +++ b/.cursor/rules/agents-shipgate.mdc @@ -51,6 +51,12 @@ auto_apply, propose_patch_for_review, escalate_to_human, suppress_with_reason, informational. Do not synthesize an action from the underlying flags when the enum is present. +For reviewer triage by source reliability, run +`agents-shipgate findings --from agents-shipgate-reports/report.json +--provenance-kind keyword_heuristic,regex_heuristic --json`. The +underlying `findings[].provenance_kind` field is a filter signal only, +not a gate input. 
+ To translate a single finding into user-facing prose, run: agents-shipgate explain-finding \ diff --git a/AGENTS.md b/AGENTS.md index 5913ed01..fb096ce3 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -253,7 +253,7 @@ Other stable top-level fields: - `baseline.{matched_count, new_count, resolved_count}` - `tool_inventory[]` - `codex_plugin_surface` (v0.13+, static Codex plugin package/marketplace facts) -- `findings[].provenance_kind` (v0.15+, per-finding rule provenance — `static_declaration | ast_extraction | keyword_heuristic | regex_heuristic | policy_pack`; independent of `confidence`, useful for filtering heuristic-only findings) +- `findings[].provenance_kind` (v0.15+, per-finding rule provenance — `static_declaration | ast_extraction | keyword_heuristic | regex_heuristic | policy_pack`; independent of `confidence`, useful for reviewer filtering via `agents-shipgate findings`; never a release-gate input) - `findings[].blocks_release` (v0.16+, explicit release-policy blockers from Action Surface Diff policies) - `action_surface_facts` / `action_surface_diff` (v0.16+, deterministic action snapshot and base/head action delta) - `release_decision.contribution_rules[]` (v0.17+, per-finding audit of how each finding contributed to the decision; one row per `report.findings` entry, with `category` ∈ `{blocker, review_item, excluded}` and `rule` ∈ `{policy_block_new, severity_block_new, policy_baseline_accepted, severity_baseline_accepted, review_required, sub_threshold, suppressed}`) @@ -406,6 +406,7 @@ Promised to not break in `0.x` minor versions. 
See [STABILITY.md](STABILITY.md) | `agents-shipgate contract` | `--json` | | `agents-shipgate explain` | ``, `--no-plugins`, `--json` | | `agents-shipgate explain-finding` | ``, `--from`, `--no-plugins`, `--json` | +| `agents-shipgate findings` | `--from`, `--provenance-kind`, `--include-suppressed`, `--json` | | `agents-shipgate bootstrap` | `--workspace`, `--confidence`, `--no-ci`, `--no-apply`, `--json` | | `agents-shipgate list-checks` | `--json`, `--no-plugins` | | `agents-shipgate baseline save` | `-c`, `--out` | diff --git a/STABILITY.md b/STABILITY.md index c54b0ad6..eaf9ab86 100644 --- a/STABILITY.md +++ b/STABILITY.md @@ -22,6 +22,7 @@ These commands and flags are stable across all `0.x.y` releases. They will only | `agents-shipgate contract` | `--json` | | `agents-shipgate explain` | ``, `--no-plugins`, `--json` | | `agents-shipgate explain-finding` (v0.12+) | ``, `--from`, `--no-plugins`, `--json` | +| `agents-shipgate findings` (v0.20+) | `--from` (default: `agents-shipgate-reports/report.json`), `--provenance-kind`, `--include-suppressed`, `--json` | | `agents-shipgate bootstrap` | `--workspace`, `--confidence`, `--no-ci`, `--no-apply`, `--json` | | `agents-shipgate list-checks` | `--json`, `--no-plugins` | | `agents-shipgate baseline save` | `-c`, `--config`, `--out` | @@ -94,7 +95,7 @@ In `agents-shipgate-reports/report.json`, the following are guaranteed: - `findings[].agent_action` (v0.12+) — deterministic projection of `patches`, `autofix_safe`, and `requires_human_review`. Enum: `auto_apply | propose_patch_for_review | escalate_to_human | suppress_with_reason | informational`. The first four cover the actionable cases; `informational` covers suppressed findings or non-actionable advisories. `suppress_with_reason` is reserved for future check classes that explicitly mark themselves as suppressible — the v0.12 deterministic projection does not emit it. New consumers should read `agent_action` first and treat the underlying flags as advisory. 
- `agent_summary.{verdict, headline, blocker_count, review_item_count, auto_appliable_patches, needs_human_review, first_recommended_action}` (v0.12+) — top-level deterministic projection of `release_decision` + per-finding `agent_action`. Lets a coding agent read one block instead of traversing arrays. `first_recommended_action` is `{kind: "command" | "info", command: string | null, why: string}`; the `command` form carries an actual CLI invocation, the `info` form is a "surface this to the user" hint. Same inputs always produce the same output; this block cannot disagree with the underlying `release_decision` and `findings[].agent_action`. - `codex_plugin_surface.{plugins, marketplaces, skills, apps, mcp_server_stubs, hook_stubs, mcp_inventory_files, component_path_issues, warnings}` (v0.13+) — static Codex plugin package and marketplace facts. Only explicit MCP inventory tools enter `tool_inventory[]`; apps, hooks, skills, and MCP server declarations stay in this surface block. -- `findings[].provenance_kind` (v0.15+) — records *how a finding was produced*; independent of `confidence`, which records how *sure* we are. Enum: `static_declaration | ast_extraction | keyword_heuristic | regex_heuristic | policy_pack`. `static_declaration` covers manifest, MCP, OpenAPI schema facts, and declarative framework inputs like ADK YAML agent configs or LangChain/CrewAI inventory JSON files — high-trust structural data. `ast_extraction` covers findings against Tools parsed from user Python source by a framework extractor (LangChain function/structured tools, CrewAI function/class tools, ADK Python toolsets); these are subject to extraction error and agents that distrust AST quality can filter them as a class. Framework checks that fire against both AST-extracted and declaratively loaded tools (ADK's per-tool checks) pick the label per tool from `tool.source_type`. 
`keyword_heuristic` covers token-list matches (broad scope, read-only prompts, free-text parameter names); `regex_heuristic` covers regex matches (secrets, prompt injection); `policy_pack` covers findings emitted by externally loaded policy packs. Built-in checks set the value via the required kwarg on the `tool_finding`/`agent_finding` helpers; third-party plugin checks that construct `Finding(...)` directly and omit the field are coerced to `static_declaration` by `annotate_remediation` so the wire schema stays satisfied. Required + non-nullable on the wire; the field is Python-Optional only so older v0.12/v0.13 reports loaded by `explain-finding` and minimal synthetic test fixtures keep working. +- `findings[].provenance_kind` (v0.15+) — records *how a finding was produced*; independent of `confidence`, which records how *sure* we are. It is a reviewer triage/filter signal only: it never changes `release_decision`, severity, fingerprints, baselines, or CI exit behavior. Use `agents-shipgate findings --from agents-shipgate-reports/report.json --provenance-kind keyword_heuristic,regex_heuristic --json` to filter active findings by provenance class. Enum: `static_declaration | ast_extraction | keyword_heuristic | regex_heuristic | policy_pack`. `static_declaration` covers manifest, MCP, OpenAPI schema facts, and declarative framework inputs like ADK YAML agent configs or LangChain/CrewAI inventory JSON files — high-trust structural data. `ast_extraction` covers findings against Tools parsed from user Python source by a framework extractor (LangChain function/structured tools, CrewAI function/class tools, ADK Python toolsets); these are subject to extraction error and agents that distrust AST quality can filter them as a class. Framework checks that fire against both AST-extracted and declaratively loaded tools (ADK's per-tool checks) pick the label per tool from `tool.source_type`. 
`keyword_heuristic` covers token-list matches (broad scope, read-only prompts, free-text parameter names); `regex_heuristic` covers regex matches (secrets, prompt injection); `policy_pack` covers findings emitted by externally loaded policy packs. Built-in checks set the value via the required kwarg on the `tool_finding`/`agent_finding` helpers; third-party plugin checks that construct `Finding(...)` directly and omit the field are coerced to `static_declaration` by `annotate_remediation` so the wire schema stays satisfied. Required + non-nullable on the wire; the field is Python-Optional only so older v0.12/v0.13 reports loaded by `explain-finding` and minimal synthetic test fixtures keep working. - `findings[].blocks_release` (v0.16+) — explicit release-policy blocking bit. Built-in and user-defined Action Surface Diff policies, plus declarative policy-pack rules with `block: true`, set it for findings that must block release when active and unbaselined; ordinary severity-based gating still works for existing checks. - `action_surface_facts.actions[]` (v0.16+) — deterministic current action snapshot: action id, operation, effect, normalized risk tags, scopes, approval policy, safeguards, evidence, input fields, and stable hashes. - `action_surface_diff.{enabled, base, summary, added, removed, modified, notes}` (v0.16+) — reviewer-facing delta for what the agent can do vs. a prior report or v0.4 baseline. Policy findings derived from this diff can set `findings[].blocks_release=true` and affect `release_decision.decision` and strict-mode exit behavior. 
diff --git a/docs/agent-contract-current.md b/docs/agent-contract-current.md index 05ee6dfd..c06070ac 100644 --- a/docs/agent-contract-current.md +++ b/docs/agent-contract-current.md @@ -36,7 +36,9 @@ The action exposes these as outputs `decision`, `blocker_count`, `review_item_co `agents-shipgate contract --json` exposes `manual_review_signals[]` as the installed CLI's stable list of report/packet fields to inspect for human review -work. +work. `findings[].provenance_kind` is included there as a filter/review signal +only; it never changes the release decision, severity, fingerprints, baselines, +or CI exit behavior. The capability/intent diff fields (v0.9+), used by reviewers to spot misalignment between declared agent intent and actual tool surface: @@ -96,6 +98,16 @@ Per-finding `provenance_kind` enum (v0.15+), additive classification — read th Provenance generally follows the rule's own trigger (e.g., a rule that checks for a declared manifest field is `static_declaration` even when the underlying Tool was AST-extracted). For framework checks that fire across both AST and declarative tool sources (ADK's per-tool checks against `google_adk_function` AND `google_adk_config` tools), the label tracks the underlying tool's source. Third-party plugin checks that don't yet set the field land at `static_declaration` by default — pre-v0.15 plugins continue to validate against the v0.15 wire schema. Use `findings[].source.type` for the precise underlying tool source. +To filter operationally, use: + +```bash +agents-shipgate findings --from agents-shipgate-reports/report.json \ + --provenance-kind keyword_heuristic,regex_heuristic --json +``` + +The command reads active findings by default; add `--include-suppressed` when a +reviewer needs suppressed entries in the same provenance summary. + For reviewer-shaped output, also read the **Release Evidence Packet** at `agents-shipgate-reports/packet.{md,json,html}` (and `packet.pdf` when the `[pdf]` extras are installed). 
Packet outputs are redacted by the same default privacy layer as the report. The packet has fixed reviewer sections governed by [`docs/packet-schema.v0.6.json`](packet-schema.v0.6.json) — see [STABILITY.md §Release Evidence Packet](../STABILITY.md#release-evidence-packet-v06). Packet schema `0.6` preserves the v0.5 `action_surface_diff` section and adds two independent additive extensions: diff --git a/docs/report-reading-for-agents.md b/docs/report-reading-for-agents.md index a3c99c9f..eec2d987 100644 --- a/docs/report-reading-for-agents.md +++ b/docs/report-reading-for-agents.md @@ -66,6 +66,17 @@ Per-finding stable fields (see [`AGENTS.md`](../AGENTS.md) Task 2 for the full l Group by `severity` to summarize; cite `check_id` so the user can run `agents-shipgate explain ` for rationale. +For reviewer triage by source reliability, filter on +`findings[].provenance_kind` with the dedicated command: + +```bash +agents-shipgate findings --from agents-shipgate-reports/report.json \ + --provenance-kind keyword_heuristic,regex_heuristic --json +``` + +This is not a gate signal. It does not change severity, release decisions, +fingerprints, baselines, or CI exit codes. + ### Step 4 · Per-finding autofix fields (v0.7+) For every active finding, inspect: diff --git a/docs/target-repo-agent-snippets.md b/docs/target-repo-agent-snippets.md index 19ff4473..50600256 100644 --- a/docs/target-repo-agent-snippets.md +++ b/docs/target-repo-agent-snippets.md @@ -198,6 +198,12 @@ auto_apply, propose_patch_for_review, escalate_to_human, suppress_with_reason, informational. Do not synthesize an action from the underlying flags when the enum is present. +For reviewer triage by source reliability, run +`agents-shipgate findings --from agents-shipgate-reports/report.json +--provenance-kind keyword_heuristic,regex_heuristic --json`. The +underlying `findings[].provenance_kind` field is a filter signal only, +not a gate input. 
+ To translate a single finding into user-facing prose, run: agents-shipgate explain-finding \ diff --git a/llms-full.txt b/llms-full.txt index 85b6e87c..f4d83de5 100644 --- a/llms-full.txt +++ b/llms-full.txt @@ -278,7 +278,7 @@ Other stable top-level fields: - `baseline.{matched_count, new_count, resolved_count}` - `tool_inventory[]` - `codex_plugin_surface` (v0.13+, static Codex plugin package/marketplace facts) -- `findings[].provenance_kind` (v0.15+, per-finding rule provenance — `static_declaration | ast_extraction | keyword_heuristic | regex_heuristic | policy_pack`; independent of `confidence`, useful for filtering heuristic-only findings) +- `findings[].provenance_kind` (v0.15+, per-finding rule provenance — `static_declaration | ast_extraction | keyword_heuristic | regex_heuristic | policy_pack`; independent of `confidence`, useful for reviewer filtering via `agents-shipgate findings`; never a release-gate input) - `findings[].blocks_release` (v0.16+, explicit release-policy blockers from Action Surface Diff policies) - `action_surface_facts` / `action_surface_diff` (v0.16+, deterministic action snapshot and base/head action delta) - `release_decision.contribution_rules[]` (v0.17+, per-finding audit of how each finding contributed to the decision; one row per `report.findings` entry, with `category` ∈ `{blocker, review_item, excluded}` and `rule` ∈ `{policy_block_new, severity_block_new, policy_baseline_accepted, severity_baseline_accepted, review_required, sub_threshold, suppressed}`) @@ -431,6 +431,7 @@ Promised to not break in `0.x` minor versions. 
See [STABILITY.md](STABILITY.md) | `agents-shipgate contract` | `--json` | | `agents-shipgate explain` | ``, `--no-plugins`, `--json` | | `agents-shipgate explain-finding` | ``, `--from`, `--no-plugins`, `--json` | +| `agents-shipgate findings` | `--from`, `--provenance-kind`, `--include-suppressed`, `--json` | | `agents-shipgate bootstrap` | `--workspace`, `--confidence`, `--no-ci`, `--no-apply`, `--json` | | `agents-shipgate list-checks` | `--json`, `--no-plugins` | | `agents-shipgate baseline save` | `-c`, `--out` | @@ -856,7 +857,9 @@ The action exposes these as outputs `decision`, `blocker_count`, `review_item_co `agents-shipgate contract --json` exposes `manual_review_signals[]` as the installed CLI's stable list of report/packet fields to inspect for human review -work. +work. `findings[].provenance_kind` is included there as a filter/review signal +only; it never changes the release decision, severity, fingerprints, baselines, +or CI exit behavior. The capability/intent diff fields (v0.9+), used by reviewers to spot misalignment between declared agent intent and actual tool surface: @@ -916,6 +919,16 @@ Per-finding `provenance_kind` enum (v0.15+), additive classification — read th Provenance generally follows the rule's own trigger (e.g., a rule that checks for a declared manifest field is `static_declaration` even when the underlying Tool was AST-extracted). For framework checks that fire across both AST and declarative tool sources (ADK's per-tool checks against `google_adk_function` AND `google_adk_config` tools), the label tracks the underlying tool's source. Third-party plugin checks that don't yet set the field land at `static_declaration` by default — pre-v0.15 plugins continue to validate against the v0.15 wire schema. Use `findings[].source.type` for the precise underlying tool source. 
+To filter operationally, use: + +```bash +agents-shipgate findings --from agents-shipgate-reports/report.json \ + --provenance-kind keyword_heuristic,regex_heuristic --json +``` + +The command reads active findings by default; add `--include-suppressed` when a +reviewer needs suppressed entries in the same provenance summary. + For reviewer-shaped output, also read the **Release Evidence Packet** at `agents-shipgate-reports/packet.{md,json,html}` (and `packet.pdf` when the `[pdf]` extras are installed). Packet outputs are redacted by the same default privacy layer as the report. The packet has fixed reviewer sections governed by [`docs/packet-schema.v0.6.json`](packet-schema.v0.6.json) — see [STABILITY.md §Release Evidence Packet](../STABILITY.md#release-evidence-packet-v06). Packet schema `0.6` preserves the v0.5 `action_surface_diff` section and adds two independent additive extensions: diff --git a/samples/simple_crewai_agent/expected/report.md b/samples/simple_crewai_agent/expected/report.md index f34422e3..54067c28 100644 --- a/samples/simple_crewai_agent/expected/report.md +++ b/samples/simple_crewai_agent/expected/report.md @@ -32,6 +32,20 @@ Fail policy: ci_mode=advisory, fail_on=[none], new_findings_only=false, would_fa No critical or high findings. +## Finding Provenance + +Reviewer triage signal only. Provenance kind does not change severity, release decision, fingerprints, baselines, or CI exit codes. + +| Provenance kind | Active findings | +| --- | ---: | +| `static_declaration` | 0 | +| `ast_extraction` | 0 | +| `keyword_heuristic` | 0 | +| `regex_heuristic` | 0 | +| `policy_pack` | 0 | + +Suppressed findings excluded: 0 + ## Capability <-> Intent Diff No capability/intent misalignments detected from static evidence. 
diff --git a/samples/simple_langchain_agent/expected/report.md b/samples/simple_langchain_agent/expected/report.md index b62ccccf..529e84c9 100644 --- a/samples/simple_langchain_agent/expected/report.md +++ b/samples/simple_langchain_agent/expected/report.md @@ -32,6 +32,20 @@ Fail policy: ci_mode=advisory, fail_on=[none], new_findings_only=false, would_fa No critical or high findings. +## Finding Provenance + +Reviewer triage signal only. Provenance kind does not change severity, release decision, fingerprints, baselines, or CI exit codes. + +| Provenance kind | Active findings | +| --- | ---: | +| `static_declaration` | 0 | +| `ast_extraction` | 0 | +| `keyword_heuristic` | 0 | +| `regex_heuristic` | 0 | +| `policy_pack` | 0 | + +Suppressed findings excluded: 0 + ## Capability <-> Intent Diff No capability/intent misalignments detected from static evidence. diff --git a/samples/simple_openai_api_agent/expected/report.md b/samples/simple_openai_api_agent/expected/report.md index a90253d2..cf3cc7dd 100644 --- a/samples/simple_openai_api_agent/expected/report.md +++ b/samples/simple_openai_api_agent/expected/report.md @@ -70,6 +70,20 @@ Fail policy: ci_mode=advisory, fail_on=[none], new_findings_only=false, would_fa Evidence: retry\_policy=\{'max\_attempts': 2\}; risk\_tags=\['customer\_communication', 'external\_write', 'write'\] Recommendation: Add idempotency evidence for send\_customer\_email or avoid retrying this side effect. +## Finding Provenance + +Reviewer triage signal only. Provenance kind does not change severity, release decision, fingerprints, baselines, or CI exit codes. 
+ +| Provenance kind | Active findings | +| --- | ---: | +| `static_declaration` | 14 | +| `ast_extraction` | 0 | +| `keyword_heuristic` | 6 | +| `regex_heuristic` | 0 | +| `policy_pack` | 0 | + +Suppressed findings excluded: 0 + ## Capability <-> Intent Diff Agent intent: diff --git a/samples/support_refund_agent/expected/packet.html b/samples/support_refund_agent/expected/packet.html index 19d9eef7..35b2f947 100644 --- a/samples/support_refund_agent/expected/packet.html +++ b/samples/support_refund_agent/expected/packet.html @@ -26,4 +26,4 @@ .status-missing { color: #7f1d1d; } .status-informational { color: #555; } .meta { color: #555; font-size: 0.92rem; } -

Release Evidence Packet

Project: support-refund-agent · Agent: refund-assistant · Environment: production_like
Run id: agents_shipgate_ebb71d7248235cc3 · Generated at: 2026-01-01T00:00:00+00:00 · Packet schema: 0.6

This packet is a reviewer-shaped synthesis of a static Agents Shipgate scan. See §10 for what the packet does not prove.

§1 Release decision — BLOCKED

  • Decision: blocked
  • Reason: 2 active findings block release.
  • Blockers: 2
  • Review items: 16

CI gate behavior (informational)

  • ci_mode: advisory, would_fail_ci: false, exit code: 0
  • Note: CI behavior is metadata about the run gate, not the verdict. The verdict above derives from release_decision.decision.

Blockers

  • SHIP-POLICY-APPROVAL-MISSING (critical): stripe.create_refund lacks a declared approval policy
  • SHIP-SIDEFX-IDEMPOTENCY-MISSING (critical): stripe.create_refund lacks idempotency evidence

Review items

  • SHIP-INVENTORY-WILDCARD-TOOLS (high): Wildcard tool exposure declared
  • SHIP-SCHEMA-MISSING-BOUNDS (high): stripe.create_refund.amount has no maximum bound
  • SHIP-SCHEMA-BROAD-FREE-TEXT (high): zendesk.update_ticket accepts broad free-form action input
  • SHIP-SCHEMA-BROAD-FREE-TEXT (high): gmail.send_customer_email accepts broad free-form action input
  • SHIP-SCHEMA-FREEFORM-OUTPUT (medium): send_email_preview returns free-form text output
  • SHIP-AUTH-MANIFEST-BROAD-SCOPE (high): Manifest declares broad permission scopes
  • SHIP-AUTH-SCOPE-COVERAGE-MISSING (high): shopify.cancel_order requires scopes not declared in the manifest
  • SHIP-AUTH-SCOPE-COVERAGE-MISSING (high): support.search_kb requires scopes not declared in the manifest
  • SHIP-AUTH-SCOPE-COVERAGE-MISSING (high): gmail.send_customer_email requires scopes not declared in the manifest
  • SHIP-SCOPE-PROHIBITED-TOOL-PRESENT (high): stripe.create_refund appears to overlap with a prohibited action
  • SHIP-SCOPE-PROHIBITED-TOOL-PRESENT (high): gmail.send_customer_email appears to overlap with a prohibited action
  • SHIP-POLICY-CONFIRMATION-MISSING (high): stripe.create_refund lacks a declared confirmation policy
  • SHIP-POLICY-CONFIRMATION-MISSING (high): gmail.send_customer_email lacks a declared confirmation policy
  • SHIP-SIDEFX-IDEMPOTENCY-MISSING (high): gmail.send_customer_email lacks idempotency evidence
  • SHIP-MANIFEST-HIGH-RISK-OWNER-MISSING (high): shopify.cancel_order is high-risk but has no owner
  • SHIP-MANIFEST-UNUSED-SCOPE (medium): Manifest declares unused permission scope zendesk:tickets:read

§1A Evidence matrix — compact review summary

  • Evidence Matrix Light is derived from public report.json only. Release decisions, CI exit behavior, and baseline semantics remain owned by release_decision. Domain rows intentionally overlap; a single finding can appear in multiple rows when it is relevant to each review lens.
| Domain | Evidence present | Evidence source | Confidence | Missing controls | Blocking findings | Review items |
| Inventory | partial | tool_inventory; tool_surface; +2 more | high | SHIP-INVENTORY-WILDCARD-TOOLS on wildcard_mcp_tools.*: Wildcard tool exposure declared | | SHIP-INVENTORY-WILDCARD-TOOLS (high) |
| Schema | partial | tool_surface_facts.tools[].hashes; findings[] | mixed | SHIP-SCHEMA-MISSING-BOUNDS on stripe.create_refund: stripe.create_refund.amount has no maximum bound; SHIP-SCHEMA-BROAD-FREE-TEXT on zendesk.update_ticket: zendesk.update_ticket accepts broad free-form action input; +2 more | | SHIP-SCHEMA-MISSING-BOUNDS (high); SHIP-SCHEMA-BROAD-FREE-TEXT (high); +2 more |
| Auth | partial | tool_surface_facts.scopes; tool_inventory[].auth_scopes; +1 more | mixed | SHIP-AUTH-MANIFEST-BROAD-SCOPE: Manifest declares broad permission scopes; SHIP-AUTH-SCOPE-COVERAGE-MISSING on shopify.cancel_order: shopify.cancel_order requires scopes not declared in the manifest; +3 more | | SHIP-AUTH-MANIFEST-BROAD-SCOPE (high); SHIP-AUTH-SCOPE-COVERAGE-MISSING (high); +3 more |
| Approval | partial | tool_surface_facts.controls[kind=approval_policy]; findings[] | high | SHIP-POLICY-APPROVAL-MISSING on stripe.create_refund: stripe.create_refund lacks a declared approval policy | SHIP-POLICY-APPROVAL-MISSING (critical) | |
| Confirmation | partial | tool_surface_facts.controls[kind=confirmation_policy]; findings[] | high | SHIP-POLICY-CONFIRMATION-MISSING on stripe.create_refund: stripe.create_refund lacks a declared confirmation policy; SHIP-POLICY-CONFIRMATION-MISSING on gmail.send_customer_email: gmail.send_customer_email lacks a declared confirmation policy | | SHIP-POLICY-CONFIRMATION-MISSING (high); SHIP-POLICY-CONFIRMATION-MISSING (high) |
| Idempotency | partial | tool_surface_facts.controls[kind=idempotency_evidence]; action_surface_facts.actions[].safeguards.idempotency; +1 more | mixed | SHIP-SIDEFX-IDEMPOTENCY-MISSING on stripe.create_refund: stripe.create_refund lacks idempotency evidence; SHIP-SIDEFX-IDEMPOTENCY-MISSING on gmail.send_customer_email: gmail.send_customer_email lacks idempotency evidence | SHIP-SIDEFX-IDEMPOTENCY-MISSING (critical) | SHIP-SIDEFX-IDEMPOTENCY-MISSING (high) |
| Side effects | partial | tool_inventory[].risk_tags; action_surface_facts.actions[].effect; +1 more | mixed | SHIP-SCHEMA-BROAD-FREE-TEXT on zendesk.update_ticket: zendesk.update_ticket accepts broad free-form action input; SHIP-SCHEMA-BROAD-FREE-TEXT on gmail.send_customer_email: gmail.send_customer_email accepts broad free-form action input; +5 more | SHIP-POLICY-APPROVAL-MISSING (critical); SHIP-SIDEFX-IDEMPOTENCY-MISSING (critical) | SHIP-SCHEMA-BROAD-FREE-TEXT (high); SHIP-SCHEMA-BROAD-FREE-TEXT (high); +3 more |
| Memory isolation | not_declared | | unknown | | | |
| Human-in-the-loop evidence | not_declared | | unknown | | | |
| Prompt/scope alignment | partial | declared_intentions; misalignments; +2 more | medium | SHIP-SCOPE-PROHIBITED-TOOL-PRESENT on stripe.create_refund: stripe.create_refund appears to overlap with a prohibited action; SHIP-SCOPE-PROHIBITED-TOOL-PRESENT on gmail.send_customer_email: gmail.send_customer_email appears to overlap with a prohibited action | | SHIP-SCOPE-PROHIBITED-TOOL-PRESENT (high); SHIP-SCOPE-PROHIBITED-TOOL-PRESENT (high) |
| Retry/timeout | not_declared | | unknown | | | |
| Baseline debt | informational | | unknown | | | |
| Action-surface policy | covered | action_surface_facts.actions | medium | | | |

§2 Capability ↔ Intent diff — missing

Declared

  • Purpose: answer refund policy questions
  • Purpose: prepare refund requests for human review
  • Purpose: update support ticket notes
  • Prohibited: issue refund without approval
  • Prohibited: cancel order without explicit confirmation
  • Prohibited: send external email without preview

Observed tools

  • gmail.send_customer_email
  • refund_status_lookup
  • send_email_preview
  • shopify.cancel_order
  • stripe.create_refund
  • support.search_kb
  • wildcard_mcp_tools.*
  • zendesk.update_ticket

Divergences

  • SHIP-SCOPE-PROHIBITED-TOOL-PRESENT: stripe.create_refund appears to overlap with a prohibited action
  • SHIP-SCOPE-PROHIBITED-TOOL-PRESENT: gmail.send_customer_email appears to overlap with a prohibited action

§3 High-risk tool surface — partial

Total tools: 8 · High-risk: 3

| Tool | Source | Risk tags | Approval | Idempotency |
| --- | --- | --- | --- | --- |
| gmail.send_customer_email | mcp | customer_communication, external_write | no | no |
| shopify.cancel_order | openapi | destructive, write | yes | yes |
| stripe.create_refund | openapi | external_write, financial_action, write | no | no |

§3A Tool-surface diff — not declared

Status: disabled — No --diff-from report or v0.3 baseline snapshot was provided.
Base: none

§3B Action-surface diff — not declared

Status: disabled — No action-surface comparison source was provided.
Base: none

§4 Approval policy coverage — partial

| Tool | Declared | Source | Gap finding(s) |
| --- | --- | --- | --- |
| shopify.cancel_order | yes | policies | |
| stripe.create_refund | no | | fp_f092940f62fbb012 |

Gap findings

  • SHIP-POLICY-APPROVAL-MISSING (critical): stripe.create_refund lacks a declared approval policy

§5 Idempotency / retry risk — partial

Retry policy: not declared

| Tool | Declared | Source | Gap finding(s) |
| --- | --- | --- | --- |
| gmail.send_customer_email | no | | fp_0f8aaa912d589cf0 |
| shopify.cancel_order | yes | policies | |
| stripe.create_refund | no | | fp_dac8011e14c53777 |

Gap findings

  • SHIP-SIDEFX-IDEMPOTENCY-MISSING (critical): stripe.create_refund lacks idempotency evidence
  • SHIP-SIDEFX-IDEMPOTENCY-MISSING (high): gmail.send_customer_email lacks idempotency evidence

§6 Scope coverage — missing

Declared scopes

  • zendesk:tickets:read
  • zendesk:tickets:write
  • stripe:*

| Scope | Declared | Used by tools |
| --- | --- | --- |
| gmail:send | no | gmail.send_customer_email |
| shopify:orders:write | no | shopify.cancel_order |
| stripe:* | yes | |
| stripe:refunds:write | yes | stripe.create_refund |
| support:kb:read | no | support.search_kb |
| zendesk:tickets:read | yes | |
| zendesk:tickets:write | yes | zendesk.update_ticket |

Unused declared scopes

  • zendesk:tickets:read

Used by tools but not declared

  • gmail:send
  • shopify:orders:write
  • support:kb:read

Gap findings

  • SHIP-AUTH-SCOPE-COVERAGE-MISSING (high): shopify.cancel_order requires scopes not declared in the manifest
  • SHIP-AUTH-SCOPE-COVERAGE-MISSING (high): support.search_kb requires scopes not declared in the manifest
  • SHIP-AUTH-SCOPE-COVERAGE-MISSING (high): gmail.send_customer_email requires scopes not declared in the manifest
  • SHIP-MANIFEST-UNUSED-SCOPE (medium): Manifest declares unused permission scope zendesk:tickets:read

§7 Memory isolation — not declared

Manifest does not declare a memory isolation policy. The current manifest schema (v0.1) has no agent.memory field. See §10 for the residual review item.

§8 Human-in-the-loop evidence — covered

  • Configured: yes
  • Human review recommended: yes
  • Provenance mode: fresh_scan
  • HITL evidence is local review evidence only. Missing local evidence does not prove a runtime control is absent, and present local evidence does not certify runtime enforcement.

Approval-required tools

  • shopify.cancel_order

Confirmation-required tools

  • shopify.cancel_order

§9 Required dynamic scenarios — partial

  • Manual review for SHIP-AUTH-MANIFEST-BROAD-SCOPE — Replace broad manifest permission scopes with the narrowest scopes needed for this release.
    Related finding(s): fp_d27325cbdbbf5483
  • Manual review for SHIP-AUTH-SCOPE-COVERAGE-MISSING — Add the required scopes for shopify.cancel_order to permissions.scopes or narrow the tool's declared auth requirements.
    Related finding(s): fp_1f6cfd6b7daa9b7c, fp_83852fbd6b440524, fp_d8e6d1865dae97cc
  • Manual review for SHIP-INVENTORY-WILDCARD-TOOLS — Replace wildcard tool exposure with an explicit tool allowlist before release review.
    Related finding(s): fp_fc02d8ecd30f2578
  • Manual review for SHIP-MANIFEST-HIGH-RISK-OWNER-MISSING — Declare an owner for each high-risk production tool in risk_overrides.tools.
    Related finding(s): fp_fd2577850cef1f87
  • Manual review for SHIP-MANIFEST-UNUSED-SCOPE — Remove unused manifest scopes or add tool metadata showing why they are required.
    Related finding(s): fp_39b9ae878f343d1b
  • Manual review for SHIP-POLICY-APPROVAL-MISSING — Declare an approval policy for stripe.create_refund or remove this tool from the release.
    Related finding(s): fp_f092940f62fbb012
  • Manual review for SHIP-POLICY-CONFIRMATION-MISSING — Declare a user confirmation policy for stripe.create_refund or remove this action from the release.
    Related finding(s): fp_8e08a4fe6b0917f6, fp_a62ca2fd9a68a1d1
  • Manual review for SHIP-SCHEMA-BROAD-FREE-TEXT — Constrain zendesk.update_ticket.updates with an enum, structured schema, or narrower field-specific parameters.
    Related finding(s): fp_acd63b899d49aa1c, fp_ff2f028953d1c220
  • Manual review for SHIP-SCHEMA-FREEFORM-OUTPUT — Prefer a structured output schema for send_email_preview, especially when output is later passed back into model context.
    Related finding(s): fp_85f8513ad72cd9ea
  • Manual review for SHIP-SCHEMA-MISSING-BOUNDS — Add a maximum bound to stripe.create_refund.amount or document an equivalent limit in the tool policy.
    Related finding(s): fp_ab60b01cb53cfcbe
  • Manual review for SHIP-SCOPE-PROHIBITED-TOOL-PRESENT — Remove stripe.create_refund, narrow its policy, or revise prohibited_actions so the manifest and tool surface do not contradict each other.
    Related finding(s): fp_12985c36a06026de, fp_e090c62e390e70ab
  • Manual review for SHIP-SIDEFX-IDEMPOTENCY-MISSING — Add an idempotency key, idempotent annotation, or declared idempotency policy for stripe.create_refund.
    Related finding(s): fp_0f8aaa912d589cf0, fp_dac8011e14c53777
  • Re-run scan after resolving source warnings — Source loaders emitted warnings; some tool surfaces may have been parsed with reduced confidence.
  • Verify low-confidence tool extractions — One or more tools were extracted with low confidence; confirm against the upstream source before release.

§10 What this packet did NOT prove

Agents Shipgate is an advisory, local-first, static Tool-Use Readiness release gate for AI agent tool surfaces. The packet below is derived from a scan; it does not, by itself, prove the following properties:

  • Prompt robustness. Whether the agent's prompt holds up under jailbreaks, persona drift, indirect prompt injection, or adversarial inputs.
  • Runtime behavior. Whether the agent actually invokes only the declared tools, respects approval gates at runtime, or follows policy under load. Static config is not runtime evidence.
  • Model correctness. Whether the underlying model produces correct outputs, calls the right tools, or stays within the declared scope. The packet does not benchmark the model.
  • Adversarial resistance. Whether the agent withstands red-team or penetration testing. The packet does not run scenarios; it organizes evidence.

Per-run residuals

  • Source warnings:
    • MCP source declares wildcard tool exposure
  • Low-confidence tool extractions: none
  • Suppressed findings in effect: none
  • Memory isolation is not modeled by the v0.1 manifest schema; no static evidence is available.
+

Release Evidence Packet

Project: support-refund-agent · Agent: refund-assistant · Environment: production_like
Run id: agents_shipgate_ebb71d7248235cc3 · Generated at: 2026-01-01T00:00:00+00:00 · Packet schema: 0.6

This packet is a reviewer-shaped synthesis of a static Agents Shipgate scan. See §10 for what the packet does not prove.

§1 Release decision — BLOCKED

  • Decision: blocked
  • Reason: 2 active findings block release.
  • Blockers: 2
  • Review items: 16

CI gate behavior (informational)

  • ci_mode: advisory, would_fail_ci: false, exit code: 0
  • Note: CI behavior is metadata about the run gate, not the verdict. The verdict above derives from release_decision.decision.

Blockers

  • SHIP-POLICY-APPROVAL-MISSING (critical): stripe.create_refund lacks a declared approval policy
  • SHIP-SIDEFX-IDEMPOTENCY-MISSING (critical): stripe.create_refund lacks idempotency evidence

Review items

  • SHIP-INVENTORY-WILDCARD-TOOLS (high): Wildcard tool exposure declared
  • SHIP-SCHEMA-MISSING-BOUNDS (high): stripe.create_refund.amount has no maximum bound
  • SHIP-SCHEMA-BROAD-FREE-TEXT (high): zendesk.update_ticket accepts broad free-form action input
  • SHIP-SCHEMA-BROAD-FREE-TEXT (high): gmail.send_customer_email accepts broad free-form action input
  • SHIP-SCHEMA-FREEFORM-OUTPUT (medium): send_email_preview returns free-form text output
  • SHIP-AUTH-MANIFEST-BROAD-SCOPE (high): Manifest declares broad permission scopes
  • SHIP-AUTH-SCOPE-COVERAGE-MISSING (high): shopify.cancel_order requires scopes not declared in the manifest
  • SHIP-AUTH-SCOPE-COVERAGE-MISSING (high): support.search_kb requires scopes not declared in the manifest
  • SHIP-AUTH-SCOPE-COVERAGE-MISSING (high): gmail.send_customer_email requires scopes not declared in the manifest
  • SHIP-SCOPE-PROHIBITED-TOOL-PRESENT (high): stripe.create_refund appears to overlap with a prohibited action
  • SHIP-SCOPE-PROHIBITED-TOOL-PRESENT (high): gmail.send_customer_email appears to overlap with a prohibited action
  • SHIP-POLICY-CONFIRMATION-MISSING (high): stripe.create_refund lacks a declared confirmation policy
  • SHIP-POLICY-CONFIRMATION-MISSING (high): gmail.send_customer_email lacks a declared confirmation policy
  • SHIP-SIDEFX-IDEMPOTENCY-MISSING (high): gmail.send_customer_email lacks idempotency evidence
  • SHIP-MANIFEST-HIGH-RISK-OWNER-MISSING (high): shopify.cancel_order is high-risk but has no owner
  • SHIP-MANIFEST-UNUSED-SCOPE (medium): Manifest declares unused permission scope zendesk:tickets:read

§1A Evidence matrix — compact review summary

  • Evidence Matrix Light is derived from public report.json only. Release decisions, CI exit behavior, and baseline semantics remain owned by release_decision. Domain rows intentionally overlap; a single finding can appear in multiple rows when it is relevant to each review lens.

| Domain | Evidence present | Evidence source | Confidence | Missing controls | Blocking findings | Review items |
| --- | --- | --- | --- | --- | --- | --- |
| Inventory | partial | tool_inventory; tool_surface; +2 more | high | SHIP-INVENTORY-WILDCARD-TOOLS on wildcard_mcp_tools.*: Wildcard tool exposure declared | | SHIP-INVENTORY-WILDCARD-TOOLS (high) |
| Schema | partial | tool_surface_facts.tools[].hashes; findings[] | mixed | SHIP-SCHEMA-MISSING-BOUNDS on stripe.create_refund: stripe.create_refund.amount has no maximum bound; SHIP-SCHEMA-BROAD-FREE-TEXT on zendesk.update_ticket: zendesk.update_ticket accepts broad free-form action input; +2 more | | SHIP-SCHEMA-MISSING-BOUNDS (high); SHIP-SCHEMA-BROAD-FREE-TEXT (high); +2 more |
| Auth | partial | tool_surface_facts.scopes; tool_inventory[].auth_scopes; +1 more | mixed | SHIP-AUTH-MANIFEST-BROAD-SCOPE: Manifest declares broad permission scopes; SHIP-AUTH-SCOPE-COVERAGE-MISSING on shopify.cancel_order: shopify.cancel_order requires scopes not declared in the manifest; +3 more | | SHIP-AUTH-MANIFEST-BROAD-SCOPE (high); SHIP-AUTH-SCOPE-COVERAGE-MISSING (high); +3 more |
| Approval | partial | tool_surface_facts.controls[kind=approval_policy]; findings[] | high | SHIP-POLICY-APPROVAL-MISSING on stripe.create_refund: stripe.create_refund lacks a declared approval policy | SHIP-POLICY-APPROVAL-MISSING (critical) | |
| Confirmation | partial | tool_surface_facts.controls[kind=confirmation_policy]; findings[] | high | SHIP-POLICY-CONFIRMATION-MISSING on stripe.create_refund: stripe.create_refund lacks a declared confirmation policy; SHIP-POLICY-CONFIRMATION-MISSING on gmail.send_customer_email: gmail.send_customer_email lacks a declared confirmation policy | | SHIP-POLICY-CONFIRMATION-MISSING (high); SHIP-POLICY-CONFIRMATION-MISSING (high) |
| Idempotency | partial | tool_surface_facts.controls[kind=idempotency_evidence]; action_surface_facts.actions[].safeguards.idempotency; +1 more | mixed | SHIP-SIDEFX-IDEMPOTENCY-MISSING on stripe.create_refund: stripe.create_refund lacks idempotency evidence; SHIP-SIDEFX-IDEMPOTENCY-MISSING on gmail.send_customer_email: gmail.send_customer_email lacks idempotency evidence | SHIP-SIDEFX-IDEMPOTENCY-MISSING (critical) | SHIP-SIDEFX-IDEMPOTENCY-MISSING (high) |
| Side effects | partial | tool_inventory[].risk_tags; action_surface_facts.actions[].effect; +1 more | mixed | SHIP-SCHEMA-BROAD-FREE-TEXT on zendesk.update_ticket: zendesk.update_ticket accepts broad free-form action input; SHIP-SCHEMA-BROAD-FREE-TEXT on gmail.send_customer_email: gmail.send_customer_email accepts broad free-form action input; +5 more | SHIP-POLICY-APPROVAL-MISSING (critical); SHIP-SIDEFX-IDEMPOTENCY-MISSING (critical) | SHIP-SCHEMA-BROAD-FREE-TEXT (high); SHIP-SCHEMA-BROAD-FREE-TEXT (high); +3 more |
| Memory isolation | not_declared | | unknown | | | |
| Human-in-the-loop evidence | not_declared | | unknown | | | |
| Prompt/scope alignment | partial | declared_intentions; misalignments; +2 more | medium | SHIP-SCOPE-PROHIBITED-TOOL-PRESENT on stripe.create_refund: stripe.create_refund appears to overlap with a prohibited action; SHIP-SCOPE-PROHIBITED-TOOL-PRESENT on gmail.send_customer_email: gmail.send_customer_email appears to overlap with a prohibited action | | SHIP-SCOPE-PROHIBITED-TOOL-PRESENT (high); SHIP-SCOPE-PROHIBITED-TOOL-PRESENT (high) |
| Retry/timeout | not_declared | | unknown | | | |
| Baseline debt | informational | | unknown | | | |
| Action-surface policy | covered | action_surface_facts.actions | medium | | | |

§2 Capability ↔ Intent diff — missing

Declared

  • Purpose: answer refund policy questions
  • Purpose: prepare refund requests for human review
  • Purpose: update support ticket notes
  • Prohibited: issue refund without approval
  • Prohibited: cancel order without explicit confirmation
  • Prohibited: send external email without preview

Observed tools

  • gmail.send_customer_email
  • refund_status_lookup
  • send_email_preview
  • shopify.cancel_order
  • stripe.create_refund
  • support.search_kb
  • wildcard_mcp_tools.*
  • zendesk.update_ticket

Divergences

  • SHIP-SCOPE-PROHIBITED-TOOL-PRESENT: stripe.create_refund appears to overlap with a prohibited action
  • SHIP-SCOPE-PROHIBITED-TOOL-PRESENT: gmail.send_customer_email appears to overlap with a prohibited action

§3 High-risk tool surface — partial

Total tools: 8 · High-risk: 3

| Tool | Source | Risk tags | Approval | Idempotency |
| --- | --- | --- | --- | --- |
| gmail.send_customer_email | mcp | customer_communication, external_write | no | no |
| shopify.cancel_order | openapi | destructive, write | yes | yes |
| stripe.create_refund | openapi | external_write, financial_action, write | no | no |

§3A Tool-surface diff — not declared

Status: disabled — No --diff-from report or v0.3 baseline snapshot was provided.
Base: none

§3B Action-surface diff — not declared

Status: disabled — No action-surface comparison source was provided.
Base: none

§4 Approval policy coverage — partial

| Tool | Declared | Source | Gap finding(s) |
| --- | --- | --- | --- |
| shopify.cancel_order | yes | policies | |
| stripe.create_refund | no | | fp_f092940f62fbb012 |

Gap findings

  • SHIP-POLICY-APPROVAL-MISSING (critical): stripe.create_refund lacks a declared approval policy

§5 Idempotency / retry risk — partial

Retry policy: not declared

| Tool | Declared | Source | Gap finding(s) |
| --- | --- | --- | --- |
| gmail.send_customer_email | no | | fp_0f8aaa912d589cf0 |
| shopify.cancel_order | yes | policies | |
| stripe.create_refund | no | | fp_dac8011e14c53777 |

Gap findings

  • SHIP-SIDEFX-IDEMPOTENCY-MISSING (critical): stripe.create_refund lacks idempotency evidence
  • SHIP-SIDEFX-IDEMPOTENCY-MISSING (high): gmail.send_customer_email lacks idempotency evidence

§6 Scope coverage — missing

Declared scopes

  • zendesk:tickets:read
  • zendesk:tickets:write
  • stripe:*

| Scope | Declared | Used by tools |
| --- | --- | --- |
| gmail:send | no | gmail.send_customer_email |
| shopify:orders:write | no | shopify.cancel_order |
| stripe:* | yes | |
| stripe:refunds:write | yes | stripe.create_refund |
| support:kb:read | no | support.search_kb |
| zendesk:tickets:read | yes | |
| zendesk:tickets:write | yes | zendesk.update_ticket |

Unused declared scopes

  • zendesk:tickets:read

Used by tools but not declared

  • gmail:send
  • shopify:orders:write
  • support:kb:read

Gap findings

  • SHIP-AUTH-SCOPE-COVERAGE-MISSING (high): shopify.cancel_order requires scopes not declared in the manifest
  • SHIP-AUTH-SCOPE-COVERAGE-MISSING (high): support.search_kb requires scopes not declared in the manifest
  • SHIP-AUTH-SCOPE-COVERAGE-MISSING (high): gmail.send_customer_email requires scopes not declared in the manifest
  • SHIP-MANIFEST-UNUSED-SCOPE (medium): Manifest declares unused permission scope zendesk:tickets:read

§7 Memory isolation — not declared

Manifest does not declare a memory isolation policy. The current manifest schema (v0.1) has no agent.memory field. See §10 for the residual review item.

§8 Human-in-the-loop evidence — covered

  • Configured: yes
  • Human review recommended: yes
  • Provenance mode: fresh_scan
  • HITL evidence is local review evidence only. Missing local evidence does not prove a runtime control is absent, and present local evidence does not certify runtime enforcement.

Approval-required tools

  • shopify.cancel_order

Confirmation-required tools

  • shopify.cancel_order

§9 Required dynamic scenarios — partial

  • Manual review for SHIP-AUTH-MANIFEST-BROAD-SCOPE — Replace broad manifest permission scopes with the narrowest scopes needed for this release.
    Related finding(s): fp_d27325cbdbbf5483
  • Manual review for SHIP-AUTH-SCOPE-COVERAGE-MISSING — Add the required scopes for shopify.cancel_order to permissions.scopes or narrow the tool's declared auth requirements.
    Related finding(s): fp_1f6cfd6b7daa9b7c, fp_83852fbd6b440524, fp_d8e6d1865dae97cc
  • Manual review for SHIP-INVENTORY-WILDCARD-TOOLS — Replace wildcard tool exposure with an explicit tool allowlist before release review.
    Related finding(s): fp_fc02d8ecd30f2578
  • Manual review for SHIP-MANIFEST-HIGH-RISK-OWNER-MISSING — Declare an owner for each high-risk production tool in risk_overrides.tools.
    Related finding(s): fp_fd2577850cef1f87
  • Manual review for SHIP-MANIFEST-UNUSED-SCOPE — Remove unused manifest scopes or add tool metadata showing why they are required.
    Related finding(s): fp_39b9ae878f343d1b
  • Manual review for SHIP-POLICY-APPROVAL-MISSING — Declare an approval policy for stripe.create_refund or remove this tool from the release.
    Related finding(s): fp_f092940f62fbb012
  • Manual review for SHIP-POLICY-CONFIRMATION-MISSING — Declare a user confirmation policy for stripe.create_refund or remove this action from the release.
    Related finding(s): fp_8e08a4fe6b0917f6, fp_a62ca2fd9a68a1d1
  • Manual review for SHIP-SCHEMA-BROAD-FREE-TEXT — Constrain zendesk.update_ticket.updates with an enum, structured schema, or narrower field-specific parameters.
    Related finding(s): fp_acd63b899d49aa1c, fp_ff2f028953d1c220
  • Manual review for SHIP-SCHEMA-FREEFORM-OUTPUT — Prefer a structured output schema for send_email_preview, especially when output is later passed back into model context.
    Related finding(s): fp_85f8513ad72cd9ea
  • Manual review for SHIP-SCHEMA-MISSING-BOUNDS — Add a maximum bound to stripe.create_refund.amount or document an equivalent limit in the tool policy.
    Related finding(s): fp_ab60b01cb53cfcbe
  • Manual review for SHIP-SCOPE-PROHIBITED-TOOL-PRESENT — Remove stripe.create_refund, narrow its policy, or revise prohibited_actions so the manifest and tool surface do not contradict each other.
    Related finding(s): fp_12985c36a06026de, fp_e090c62e390e70ab
  • Manual review for SHIP-SIDEFX-IDEMPOTENCY-MISSING — Add an idempotency key, idempotent annotation, or declared idempotency policy for stripe.create_refund.
    Related finding(s): fp_0f8aaa912d589cf0, fp_dac8011e14c53777
  • Re-run scan after resolving source warnings — Source loaders emitted warnings; some tool surfaces may have been parsed with reduced confidence.
  • Verify low-confidence tool extractions — One or more tools were extracted with low confidence; confirm against the upstream source before release.

§10 What this packet did NOT prove

Agents Shipgate is an advisory, local-first, static Tool-Use Readiness release gate for AI agent tool surfaces. The packet below is derived from a scan; it does not, by itself, prove the following properties:

  • Prompt robustness. Whether the agent's prompt holds up under jailbreaks, persona drift, indirect prompt injection, or adversarial inputs.
  • Runtime behavior. Whether the agent actually invokes only the declared tools, respects approval gates at runtime, or follows policy under load. Static config is not runtime evidence.
  • Model correctness. Whether the underlying model produces correct outputs, calls the right tools, or stays within the declared scope. The packet does not benchmark the model.
  • Adversarial resistance. Whether the agent withstands red-team or penetration testing. The packet does not run scenarios; it organizes evidence.

Per-run residuals

  • Source warnings:
    • MCP source declares wildcard tool exposure
  • Low-confidence tool extractions: none
  • Suppressed findings in effect: none
  • Memory isolation is not modeled by the v0.1 manifest schema; no static evidence is available.
  • 6 active finding(s) came from heuristic provenance (keyword_heuristic=6, regex_heuristic=0); review the finding evidence before acting.
diff --git a/samples/support_refund_agent/expected/packet.json b/samples/support_refund_agent/expected/packet.json index 471379a3..94c50adc 100644 --- a/samples/support_refund_agent/expected/packet.json +++ b/samples/support_refund_agent/expected/packet.json @@ -1289,7 +1289,8 @@ }, "not_proven": { "additional_residuals": [ - "Memory isolation is not modeled by the v0.1 manifest schema; no static evidence is available." + "Memory isolation is not modeled by the v0.1 manifest schema; no static evidence is available.", + "6 active finding(s) came from heuristic provenance (keyword_heuristic=6, regex_heuristic=0); review the finding evidence before acting." ], "headline": "Agents Shipgate is an advisory, local-first, static Tool-Use Readiness release gate for AI agent tool surfaces. The packet below is derived from a scan; it does not, by itself, prove the following properties:", "low_confidence_tools": [], diff --git a/samples/support_refund_agent/expected/packet.md b/samples/support_refund_agent/expected/packet.md index 3a55105d..46d10dc7 100644 --- a/samples/support_refund_agent/expected/packet.md +++ b/samples/support_refund_agent/expected/packet.md @@ -237,3 +237,4 @@ Agents Shipgate is an advisory, local-first, static Tool-Use Readiness release g - Low-confidence tool extractions: none - Suppressed findings in effect: none - Memory isolation is not modeled by the v0.1 manifest schema; no static evidence is available. +- 6 active finding\(s\) came from heuristic provenance \(keyword\_heuristic=6, regex\_heuristic=0\); review the finding evidence before acting. 
diff --git a/samples/support_refund_agent/expected/report.md b/samples/support_refund_agent/expected/report.md index 81e5ee19..7bd41f71 100644 --- a/samples/support_refund_agent/expected/report.md +++ b/samples/support_refund_agent/expected/report.md @@ -68,6 +68,20 @@ Fail policy: ci_mode=advisory, fail_on=[none], new_findings_only=false, would_fa Evidence: tool\_scopes=\['support:kb:read'\]; manifest\_scopes=\['zendesk:tickets:read', 'zendesk:tickets:write', 'stripe:\*'\]; missing\_scopes=\['support:kb:read'\] Recommendation: Add the required scopes for support.search\_kb to permissions.scopes or narrow the tool's declared auth requirements. +## Finding Provenance + +Reviewer triage signal only. Provenance kind does not change severity, release decision, fingerprints, baselines, or CI exit codes. + +| Provenance kind | Active findings | +| --- | ---: | +| `static_declaration` | 12 | +| `ast_extraction` | 0 | +| `keyword_heuristic` | 6 | +| `regex_heuristic` | 0 | +| `policy_pack` | 0 | + +Suppressed findings excluded: 0 + ## Capability <-> Intent Diff Agent intent: diff --git a/skills/agents-shipgate/SKILL.md b/skills/agents-shipgate/SKILL.md index d7dfd759..01fd401d 100644 --- a/skills/agents-shipgate/SKILL.md +++ b/skills/agents-shipgate/SKILL.md @@ -63,7 +63,7 @@ For non-GitHub CI (GitLab, CircleCI, Jenkins, Azure Pipelines, Buildkite, Bitbuc - **CLI surface** is frozen for `0.x` — see https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/STABILITY.md. - **Installed CLI contract**: when available, run `agents-shipgate contract --json` to verify local schema versions, `release_decision.decision`, and manual-review signal fields. Older installs should use [`docs/agent-contract-current.md`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/agent-contract-current.md) or upgrade before automating against the local contract command. -- **Report JSON**: `report_schema_version: "0.20"`. 
Read `release_decision.decision` (`"blocked" | "review_required" | "insufficient_evidence" | "passed"`) **first** for release gating — it is baseline-aware. `insufficient_evidence` (added v0.14) fires when evidence coverage is degraded past threshold (at least half of scanned tools low-confidence — `ceil(N × 0.5)` with a minimum of 1 — or 4+ source warnings); switch on the enum with a `review_required` fallback for unknown values. For privacy audit read `privacy_audit` (v0.18+) to confirm default redaction ran before public artifacts were written; `redacted_paths[]` contains structural paths and counts only. For severity-override audit read the top-level `policy_audit.severity_overrides_applied[]` block (v0.17+) — every manifest-driven severity change carries `{check_id, default_severity, applied_severity, manifest_path, reason, tier_crossed, direction, expires}`. For per-finding decision audit read `release_decision.contribution_rules[]` (v0.17+) — one row per `report.findings` entry with `category` ∈ `{blocker, review_item, excluded}` and `rule` ∈ `{policy_block_new, severity_block_new, policy_baseline_accepted, severity_baseline_accepted, review_required, sub_threshold, suppressed}`. For Action Surface Diff read `action_surface_facts`, `action_surface_diff`, and `findings[].blocks_release` (v0.16+) to understand added/removed/modified external actions and explicit release-policy blockers. For one-fetch summarization read the top-level `agent_summary` block (v0.12+) — `{verdict, headline, blocker_count, review_item_count, auto_appliable_patches, needs_human_review, first_recommended_action}`. For per-finding routing read `findings[].agent_action` (v0.12+; `auto_apply | propose_patch_for_review | escalate_to_human | suppress_with_reason | informational`) instead of synthesizing one from `autofix_safe`/`requires_human_review`/`suggested_patch_kind`. 
To filter findings by source reliability read `findings[].provenance_kind` (v0.15+; `static_declaration | ast_extraction | keyword_heuristic | regex_heuristic | policy_pack`) — independent of `confidence`. Codex plugin facts, when present, live under `codex_plugin_surface` (v0.13+). Do not gate on `summary.status` for new consumers; it is preserved for v0.7 callers and is baseline-blind. The full field list lives in [`docs/agent-contract-current.md`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/agent-contract-current.md#read-these-first-for-release-gating); this skill links there instead of restating it. v0.11 adds optional `findings[].source.{path, start_line, end_line, start_column, pointer}` provenance keys (kept in v0.19). v0.19 adds the optional `Finding.policy_evidence_source` and `ReleaseDecisionItem.{source, policy_evidence_source}` fields for reviewer-grade dual-source provenance — high-risk findings that fire because of a missing manifest mitigation can carry both the tool location AND the manifest-pointer line. Reports validate against [`docs/report-schema.v0.20.json`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/report-schema.v0.20.json) (active emitted version). 
Frozen-reference older schemas (kept for legacy/pre-v0.19 reports): [`v0.18`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/report-schema.v0.18.json), [`v0.17`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/report-schema.v0.17.json), [`v0.16`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/report-schema.v0.16.json), [`v0.15`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/report-schema.v0.15.json), [`v0.14`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/report-schema.v0.14.json), [`v0.13`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/report-schema.v0.13.json), [`v0.12`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/report-schema.v0.12.json), [`v0.11` (frozen)](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/report-schema.v0.11.json), [`v0.10` (frozen)](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/report-schema.v0.10.json), [`v0.9` (frozen)](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/report-schema.v0.9.json), [`v0.8` (frozen)](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/report-schema.v0.8.json), and [`v0.7` (frozen)](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/report-schema.v0.7.json). +- **Report JSON**: `report_schema_version: "0.20"`. Read `release_decision.decision` first for release gating; use `agent_summary` / `findings[].agent_action` for agent routing and `reviewer_summary` for the human-review entry point. To filter findings by source reliability, use `agents-shipgate findings --from agents-shipgate-reports/report.json --provenance-kind keyword_heuristic,regex_heuristic --json`; it reads `findings[].provenance_kind` (v0.15+) as a reviewer triage signal only, independent of `confidence` and never as a gate input. Do not gate on `summary.status`; it is legacy and baseline-blind. 
The full field list lives in [`docs/agent-contract-current.md`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/agent-contract-current.md#read-these-first-for-release-gating), and reports validate against [`docs/report-schema.v0.20.json`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/report-schema.v0.20.json). - **Release Evidence Packet**: `agents-shipgate-reports/packet.{md,json,html}` (and `packet.pdf` with the `[pdf]` extras) is emitted alongside the report by default. The packet has fixed reviewer sections governed by [`docs/packet-schema.v0.6.json`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/packet-schema.v0.6.json) (latest; v0.6 adds the top-level `evidence_matrix` compact review section AND `ReleaseDecisionItem.{source, policy_evidence_source}` for reviewer-grade dual-source provenance over the v0.5 baseline). See [STABILITY.md §Release Evidence Packet](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/STABILITY.md#release-evidence-packet-v06). Use the packet for reviewer-shaped output; use the report for finding details. - **Single source of truth for the contract**: [`docs/agent-contract-current.md`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/agent-contract-current.md). When the schema bumps, that file updates first. - **Exit codes**: `0` pass, `2` config error, `3` parse error, `4` other error, `20` strict-mode gate failure. diff --git a/src/agents_shipgate/cli/discovery/agent_instructions/renderers/claude_code_skill.py b/src/agents_shipgate/cli/discovery/agent_instructions/renderers/claude_code_skill.py index def66886..469e2550 100644 --- a/src/agents_shipgate/cli/discovery/agent_instructions/renderers/claude_code_skill.py +++ b/src/agents_shipgate/cli/discovery/agent_instructions/renderers/claude_code_skill.py @@ -41,7 +41,11 @@ def render_bundle_text() -> str: # `init --agent-instructions=claude-code-skill --write` can safely migrate # v(N-1) files. 
Leave the dict empty while there is no prior shipped Claude # Code skill bundle. -PRIOR_RENDER_SHA256: dict[str, tuple[str, ...]] = {} +PRIOR_RENDER_SHA256: dict[str, tuple[str, ...]] = { + ".claude/skills/agents-shipgate/SKILL.md": ( + "b17c53d9905f46b196be38e98cf71e53da6779e3a4f426ecff14f2b0f238aba9", + ), +} _ACTION_VERSION = __version__ @@ -112,7 +116,7 @@ def render_bundle_text() -> str: - **CLI surface** is frozen for `0.x` — see https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/STABILITY.md. - **Installed CLI contract**: when available, run `agents-shipgate contract --json` to verify local schema versions, `release_decision.decision`, and manual-review signal fields. Older installs should use [`docs/agent-contract-current.md`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/agent-contract-current.md) or upgrade before automating against the local contract command. -- **Report JSON**: `report_schema_version: "0.20"`. Read `release_decision.decision` (`"blocked" | "review_required" | "insufficient_evidence" | "passed"`) **first** for release gating — it is baseline-aware. `insufficient_evidence` (added v0.14) fires when evidence coverage is degraded past threshold (at least half of scanned tools low-confidence — `ceil(N × 0.5)` with a minimum of 1 — or 4+ source warnings); switch on the enum with a `review_required` fallback for unknown values. For privacy audit read `privacy_audit` (v0.18+) to confirm default redaction ran before public artifacts were written; `redacted_paths[]` contains structural paths and counts only. For severity-override audit read the top-level `policy_audit.severity_overrides_applied[]` block (v0.17+) — every manifest-driven severity change carries `{check_id, default_severity, applied_severity, manifest_path, reason, tier_crossed, direction, expires}`. 
For per-finding decision audit read `release_decision.contribution_rules[]` (v0.17+) — one row per `report.findings` entry with `category` ∈ `{blocker, review_item, excluded}` and `rule` ∈ `{policy_block_new, severity_block_new, policy_baseline_accepted, severity_baseline_accepted, review_required, sub_threshold, suppressed}`. For Action Surface Diff read `action_surface_facts`, `action_surface_diff`, and `findings[].blocks_release` (v0.16+) to understand added/removed/modified external actions and explicit release-policy blockers. For one-fetch summarization read the top-level `agent_summary` block (v0.12+) — `{verdict, headline, blocker_count, review_item_count, auto_appliable_patches, needs_human_review, first_recommended_action}`. For per-finding routing read `findings[].agent_action` (v0.12+; `auto_apply | propose_patch_for_review | escalate_to_human | suppress_with_reason | informational`) instead of synthesizing one from `autofix_safe`/`requires_human_review`/`suggested_patch_kind`. To filter findings by source reliability read `findings[].provenance_kind` (v0.15+; `static_declaration | ast_extraction | keyword_heuristic | regex_heuristic | policy_pack`) — independent of `confidence`. Codex plugin facts, when present, live under `codex_plugin_surface` (v0.13+). Do not gate on `summary.status` for new consumers; it is preserved for v0.7 callers and is baseline-blind. The full field list lives in [`docs/agent-contract-current.md`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/agent-contract-current.md#read-these-first-for-release-gating); this skill links there instead of restating it. v0.11 adds optional `findings[].source.{path, start_line, end_line, start_column, pointer}` provenance keys (kept in v0.19). 
v0.19 adds the optional `Finding.policy_evidence_source` and `ReleaseDecisionItem.{source, policy_evidence_source}` fields for reviewer-grade dual-source provenance — high-risk findings that fire because of a missing manifest mitigation can carry both the tool location AND the manifest-pointer line. Reports validate against [`docs/report-schema.v0.20.json`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/report-schema.v0.20.json) (active emitted version). Frozen-reference older schemas (kept for legacy/pre-v0.19 reports): [`v0.18`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/report-schema.v0.18.json), [`v0.17`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/report-schema.v0.17.json), [`v0.16`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/report-schema.v0.16.json), [`v0.15`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/report-schema.v0.15.json), [`v0.14`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/report-schema.v0.14.json), [`v0.13`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/report-schema.v0.13.json), [`v0.12`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/report-schema.v0.12.json), [`v0.11` (frozen)](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/report-schema.v0.11.json), [`v0.10` (frozen)](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/report-schema.v0.10.json), [`v0.9` (frozen)](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/report-schema.v0.9.json), [`v0.8` (frozen)](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/report-schema.v0.8.json), and [`v0.7` (frozen)](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/report-schema.v0.7.json). +- **Report JSON**: `report_schema_version: "0.20"`. 
Read `release_decision.decision` first for release gating; use `agent_summary` / `findings[].agent_action` for agent routing and `reviewer_summary` for the human-review entry point. To filter findings by source reliability, use `agents-shipgate findings --from agents-shipgate-reports/report.json --provenance-kind keyword_heuristic,regex_heuristic --json`; it reads `findings[].provenance_kind` (v0.15+) as a reviewer triage signal only, independent of `confidence` and never as a gate input. Do not gate on `summary.status`; it is legacy and baseline-blind. The full field list lives in [`docs/agent-contract-current.md`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/agent-contract-current.md#read-these-first-for-release-gating), and reports validate against [`docs/report-schema.v0.20.json`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/report-schema.v0.20.json). - **Release Evidence Packet**: `agents-shipgate-reports/packet.{md,json,html}` (and `packet.pdf` with the `[pdf]` extras) is emitted alongside the report by default. The packet has fixed reviewer sections governed by [`docs/packet-schema.v0.6.json`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/packet-schema.v0.6.json) (latest; v0.6 adds the top-level `evidence_matrix` compact review section AND `ReleaseDecisionItem.{source, policy_evidence_source}` for reviewer-grade dual-source provenance over the v0.5 baseline). See [STABILITY.md §Release Evidence Packet](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/STABILITY.md#release-evidence-packet-v06). Use the packet for reviewer-shaped output; use the report for finding details. - **Single source of truth for the contract**: [`docs/agent-contract-current.md`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/agent-contract-current.md). When the schema bumps, that file updates first. 
- **Exit codes**: `0` pass, `2` config error, `3` parse error, `4` other error, `20` strict-mode gate failure. diff --git a/src/agents_shipgate/cli/discovery/agent_instructions/renderers/codex_skill.py b/src/agents_shipgate/cli/discovery/agent_instructions/renderers/codex_skill.py index ad17c2b1..f4a1aa65 100644 --- a/src/agents_shipgate/cli/discovery/agent_instructions/renderers/codex_skill.py +++ b/src/agents_shipgate/cli/discovery/agent_instructions/renderers/codex_skill.py @@ -35,7 +35,11 @@ def render_bundle_text() -> str: # move that file's previous current-render hash into this dict so `init # --agent-instructions=codex-skill --write` can safely migrate v(N-1) files. # Leave the dict empty while there is no prior shipped Codex skill bundle. -PRIOR_RENDER_SHA256: dict[str, tuple[str, ...]] = {} +PRIOR_RENDER_SHA256: dict[str, tuple[str, ...]] = { + ".agents/skills/agents-shipgate/SKILL.md": ( + "59ec0a31f9747acf569f731561236ff4ef6d8734b614edfa04ea6ff10043f21a", + ), +} _ACTION_VERSION = __version__ @@ -69,6 +73,7 @@ def render_bundle_text() -> str: - Existing manifest: run `agents-shipgate scan -c shipgate.yaml --suggest-patches --format json`. - First GitHub CI: copy `assets/advisory-pr-comment.yml` to `.github/workflows/agents-shipgate.yml`. - Explain one finding: run `agents-shipgate explain-finding --from agents-shipgate-reports/report.json --json`. +- Triage heuristic findings: run `agents-shipgate findings --from agents-shipgate-reports/report.json --provenance-kind keyword_heuristic,regex_heuristic --json`. ## Boundaries diff --git a/src/agents_shipgate/cli/discovery/agent_instructions/renderers/cursor.py b/src/agents_shipgate/cli/discovery/agent_instructions/renderers/cursor.py index f464533f..56720a10 100644 --- a/src/agents_shipgate/cli/discovery/agent_instructions/renderers/cursor.py +++ b/src/agents_shipgate/cli/discovery/agent_instructions/renderers/cursor.py @@ -67,6 +67,12 @@ def render_file() -> str: suppress_with_reason, informational. 
Do not synthesize an action from the underlying flags when the enum is present. +For reviewer triage by source reliability, run +`agents-shipgate findings --from agents-shipgate-reports/report.json +--provenance-kind keyword_heuristic,regex_heuristic --json`. The +underlying `findings[].provenance_kind` field is a filter signal only, +not a gate input. + To translate a single finding into user-facing prose, run: agents-shipgate explain-finding \\ diff --git a/src/agents_shipgate/cli/findings.py b/src/agents_shipgate/cli/findings.py new file mode 100644 index 00000000..9c25d293 --- /dev/null +++ b/src/agents_shipgate/cli/findings.py @@ -0,0 +1,270 @@ +"""``shipgate findings`` — filter report findings for reviewer triage.""" + +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any + +import typer +from pydantic import ValidationError + +from agents_shipgate.cli.agent_mode import emit_agent_mode_error +from agents_shipgate.core.findings import ( + PROVENANCE_KIND_ORDER, + provenance_kind_counts, +) +from agents_shipgate.core.privacy import sanitize_report_payload +from agents_shipgate.schemas.common import ProvenanceKind, parse_provenance_kind +from agents_shipgate.schemas.diagnostics import NextAction +from agents_shipgate.schemas.report import Finding, ReadinessReport + +_MIN_SUPPORTED_SCHEMA = "0.15" + + +def _version_tuple(value: str) -> tuple[int, ...]: + try: + return tuple(int(part) for part in value.split(".")) + except (AttributeError, ValueError) as exc: + raise ValueError( + f"invalid report_schema_version: {value!r}" + ) from exc + + +def _load_report(path: Path) -> ReadinessReport: + if not path.is_file(): + raise ValueError(f"report file not found: {path}") + try: + text = path.read_text(encoding="utf-8") + except OSError as exc: + raise ValueError(f"cannot read report at {path}: {exc}") from exc + try: + payload = json.loads(text) + except json.JSONDecodeError as exc: + raise ValueError(f"report is not valid 
JSON: {exc}") from exc + if not isinstance(payload, dict): + raise ValueError("report JSON must be an object") + + version = payload.get("report_schema_version") + if not isinstance(version, str): + raise ValueError( + "input must be an agents-shipgate report.json with a " + "string `report_schema_version`." + ) + if _version_tuple(version) < _version_tuple(_MIN_SUPPORTED_SCHEMA): + raise ValueError( + f"findings provenance filtering requires report_schema_version " + f">= {_MIN_SUPPORTED_SCHEMA} (got {version!r}). The v0.15 " + "`provenance_kind` field is required for this command. " + "Re-scan with the current CLI: " + "`agents-shipgate scan -c shipgate.yaml --format json`." + ) + + payload = sanitize_report_payload(payload) + try: + report = ReadinessReport.model_validate(payload) + except ValidationError as exc: + raise ValueError(f"report.json failed validation: {exc}") from exc + + missing = [ + finding.id or finding.fingerprint or finding.check_id + for finding in report.findings + if finding.provenance_kind is None + ] + if missing: + preview = ", ".join(missing[:3]) + suffix = f", … (+{len(missing) - 3})" if len(missing) > 3 else "" + raise ValueError( + "report.json contains finding(s) without `provenance_kind`: " + f"{preview}{suffix}. Re-scan with the current CLI: " + "`agents-shipgate scan -c shipgate.yaml --format json`." 
+ ) + return report + + +def _parse_provenance_filter(value: str | None) -> list[ProvenanceKind]: + if value is None or value.strip() == "": + return list(PROVENANCE_KIND_ORDER) + parsed: list[ProvenanceKind] = [] + for raw in value.split(","): + token = raw.strip() + if not token: + continue + try: + kind = parse_provenance_kind(token) + except ValueError as exc: + allowed = ", ".join(PROVENANCE_KIND_ORDER) + raise ValueError( + f"unsupported --provenance-kind value {token!r}; " + f"expected one of: {allowed}" + ) from exc + if kind not in parsed: + parsed.append(kind) + if not parsed: + allowed = ", ".join(PROVENANCE_KIND_ORDER) + raise ValueError( + "--provenance-kind must include at least one value; " + f"expected one of: {allowed}" + ) + return parsed + + +def _counts_payload(counts: dict[ProvenanceKind, int]) -> dict[str, int]: + return {kind: counts[kind] for kind in PROVENANCE_KIND_ORDER} + + +def _finding_payload(finding: Finding) -> dict[str, Any]: + return { + "id": finding.id, + "fingerprint": finding.fingerprint, + "check_id": finding.check_id, + "severity": finding.severity, + "title": finding.title, + "tool_name": finding.tool_name, + "confidence": finding.confidence, + "provenance_kind": finding.provenance_kind, + "agent_action": finding.agent_action, + "suppressed": finding.suppressed, + "source": ( + finding.source.model_dump(mode="json", exclude_none=True) + if finding.source is not None + else None + ), + } + + +def findings_payload( + *, + report_path: Path, + provenance_kind_filter: str | None, + include_suppressed: bool, +) -> dict[str, Any]: + report = _load_report(report_path) + selected = _parse_provenance_filter(provenance_kind_filter) + included = [ + finding + for finding in report.findings + if include_suppressed or not finding.suppressed + ] + matched = [ + finding + for finding in included + if finding.provenance_kind in selected + ] + return { + "filters": { + "source_report": str(report_path.resolve()), + "provenance_kind": 
list(selected), + "include_suppressed": include_suppressed, + }, + "summary": { + "total_findings": len(report.findings), + "included_findings": len(included), + "matched_findings": len(matched), + "suppressed_omitted": 0 + if include_suppressed + else sum(1 for finding in report.findings if finding.suppressed), + "by_provenance_kind": _counts_payload( + provenance_kind_counts( + report.findings, + include_suppressed=include_suppressed, + ) + ), + "matched_by_provenance_kind": _counts_payload( + provenance_kind_counts(matched, include_suppressed=True) + ), + }, + "findings": [_finding_payload(finding) for finding in matched], + } + + +def findings( + source: Path = typer.Option( + Path("agents-shipgate-reports/report.json"), + "--from", + help=( + "Path to the scan's `report.json`. Default mirrors the " + "canonical reports directory." + ), + ), + provenance_kind: str | None = typer.Option( + None, + "--provenance-kind", + help=( + "Comma-separated provenance kinds to include. Defaults to all " + "kinds. Values: static_declaration, ast_extraction, " + "keyword_heuristic, regex_heuristic, policy_pack." + ), + ), + include_suppressed: bool = typer.Option( + False, + "--include-suppressed", + help="Include suppressed findings. 
Defaults to active findings only.", + ), + json_output: bool = typer.Option( + False, + "--json", + help="Emit JSON instead of text.", + ), +) -> None: + """Filter report findings by provenance kind for reviewer triage.""" + + try: + payload = findings_payload( + report_path=source, + provenance_kind_filter=provenance_kind, + include_suppressed=include_suppressed, + ) + except ValueError as exc: + typer.echo(f"findings: {exc}", err=True) + emit_agent_mode_error( + "input_parse_error", + message=str(exc), + source_report=str(source), + next_action="agents-shipgate scan -c shipgate.yaml --format json", + next_actions=[ + NextAction( + kind="command", + command=( + "agents-shipgate scan -c shipgate.yaml --format json" + ), + why=( + f"Could not load or filter {source}. Generate a " + "fresh report.json with the current CLI." + ), + expects=( + "agents-shipgate-reports/report.json on disk, " + "validatable against report schema v0.15 or newer." + ), + ).model_dump(mode="json") + ], + ) + raise typer.Exit(3) from exc + + if json_output: + typer.echo(json.dumps(payload, indent=2, sort_keys=True)) + return + + summary = payload["summary"] + typer.echo(f"Source report: {payload['filters']['source_report']}") + typer.echo( + "Scope: " + + ("all findings" if include_suppressed else "active findings only") + ) + typer.echo(f"Matched findings: {summary['matched_findings']}") + typer.echo("Provenance counts:") + for kind in PROVENANCE_KIND_ORDER: + typer.echo(f" {kind}: {summary['by_provenance_kind'][kind]}") + typer.echo("") + if not payload["findings"]: + typer.echo("No findings matched.") + return + for finding in payload["findings"]: + target = f" [{finding['tool_name']}]" if finding["tool_name"] else "" + suppressed = " (suppressed)" if finding["suppressed"] else "" + typer.echo( + f"- {finding['severity'].upper()}: {finding['check_id']}" + f"{target}{suppressed} " + f"({finding['provenance_kind']}, {finding['confidence']}) - " + f"{finding['title']}" + ) diff --git 
a/src/agents_shipgate/cli/main.py b/src/agents_shipgate/cli/main.py index 071dd3e4..1f3bdb42 100644 --- a/src/agents_shipgate/cli/main.py +++ b/src/agents_shipgate/cli/main.py @@ -19,6 +19,7 @@ from agents_shipgate.cli.detect import detect as _detect_command from agents_shipgate.cli.evidence_packet import evidence_packet as _evidence_packet_command from agents_shipgate.cli.explain_finding import explain_finding as _explain_finding_command +from agents_shipgate.cli.findings import findings as _findings_command from agents_shipgate.cli.fixture import fixture_app from agents_shipgate.cli.scenario import scenario_app from agents_shipgate.cli.self_check import self_check @@ -68,6 +69,13 @@ "`." ), )(_explain_finding_command) +app.command( + "findings", + help=( + "Filter findings from a `report.json` by provenance kind for " + "reviewer triage." + ), +)(_findings_command) _register_scan.register(app) _register_list_checks.register(app) _register_contract.register(app) diff --git a/src/agents_shipgate/core/findings/__init__.py b/src/agents_shipgate/core/findings/__init__.py index ad946ebf..14ff7e13 100644 --- a/src/agents_shipgate/core/findings/__init__.py +++ b/src/agents_shipgate/core/findings/__init__.py @@ -19,6 +19,7 @@ apply_severity_overrides, apply_suppressions, ) +from .provenance import PROVENANCE_KIND_ORDER, provenance_kind_counts from .remediation import ( _REMEDIATION_FALLBACK, _derive_from_patches, @@ -49,6 +50,7 @@ __all__ = [ "FINGERPRINT_EXCLUDED_EVIDENCE_KEYS", + "PROVENANCE_KIND_ORDER", "SEVERITY_ORDER", "_REMEDIATION_FALLBACK", "_action_surface_changes", @@ -79,6 +81,7 @@ "dedupe_findings", "derive_agent_action", "finding_fingerprint", + "provenance_kind_counts", "recommended_actions", "summarize_findings", "summarize_tool_surface", diff --git a/src/agents_shipgate/core/findings/provenance.py b/src/agents_shipgate/core/findings/provenance.py new file mode 100644 index 00000000..aff22bd4 --- /dev/null +++ 
b/src/agents_shipgate/core/findings/provenance.py @@ -0,0 +1,34 @@ +from __future__ import annotations + +from agents_shipgate.schemas.common import ProvenanceKind +from agents_shipgate.schemas.report import Finding + +PROVENANCE_KIND_ORDER: tuple[ProvenanceKind, ...] = ( + "static_declaration", + "ast_extraction", + "keyword_heuristic", + "regex_heuristic", + "policy_pack", +) + + +def provenance_kind_counts( + findings: list[Finding], + *, + include_suppressed: bool = False, +) -> dict[ProvenanceKind, int]: + """Count findings by provenance kind in the public enum order. + + This is a reviewer triage helper only. It must not feed release + gating, severity, fingerprints, baselines, or CI exit behavior. + """ + + counts = {kind: 0 for kind in PROVENANCE_KIND_ORDER} + for finding in findings: + if finding.suppressed and not include_suppressed: + continue + # Legacy/plugin compatibility: CLI report filtering pre-validates + # provenance_kind, while renderers coerce older in-memory findings. + kind = finding.provenance_kind or "static_declaration" + counts[kind] += 1 + return counts diff --git a/src/agents_shipgate/packet/builder.py b/src/agents_shipgate/packet/builder.py index 91023f46..c52712f0 100644 --- a/src/agents_shipgate/packet/builder.py +++ b/src/agents_shipgate/packet/builder.py @@ -24,6 +24,7 @@ Tool, ToolRiskHint, ) +from agents_shipgate.core.findings import provenance_kind_counts from agents_shipgate.core.risk_hints import is_high_risk_tool, risk_tags from agents_shipgate.packet.disclaimer import ( PACKET_NON_PROOF, @@ -1085,6 +1086,34 @@ def _build_not_proven( "Memory isolation is not modeled by the v0.1 manifest schema; " "no static evidence is available." 
] + provenance_counts = provenance_kind_counts( + findings, + include_suppressed=False, + ) + heuristic_count = ( + provenance_counts["keyword_heuristic"] + + provenance_counts["regex_heuristic"] + ) + if heuristic_count: + additional.append( + f"{heuristic_count} active finding(s) came from heuristic " + "provenance " + f"(keyword_heuristic={provenance_counts['keyword_heuristic']}, " + f"regex_heuristic={provenance_counts['regex_heuristic']}); " + "review the finding evidence before acting." + ) + if provenance_counts["ast_extraction"]: + additional.append( + f"{provenance_counts['ast_extraction']} active finding(s) came " + "from AST-extracted tool surfaces; confirm against the source " + "code when extraction precision matters." + ) + if provenance_counts["policy_pack"]: + additional.append( + f"{provenance_counts['policy_pack']} active finding(s) came " + "from external policy packs; review the pack rule and its " + "declared confidence before acting." + ) return NotProvenSection( headline=PACKET_NON_PROOF_HEADLINE, unconditional=[ diff --git a/src/agents_shipgate/report/markdown.py b/src/agents_shipgate/report/markdown.py index e08b5c57..ce2e33a8 100644 --- a/src/agents_shipgate/report/markdown.py +++ b/src/agents_shipgate/report/markdown.py @@ -5,7 +5,11 @@ from pathlib import Path from agents_shipgate.core.disclaimers import HITL_RUNTIME_CONTROL_DISCLAIMER -from agents_shipgate.core.findings.constants import SEVERITY_ORDER +from agents_shipgate.core.findings import ( + PROVENANCE_KIND_ORDER, + SEVERITY_ORDER, + provenance_kind_counts, +) from agents_shipgate.core.privacy import sanitize_report from agents_shipgate.schemas.report import ( DeclaredIntention, @@ -98,6 +102,7 @@ def render_markdown_report(report: ReadinessReport) -> str: ] ) _append_top_findings(lines, report.findings) + _append_finding_provenance(lines, report.findings) _append_capability_intent_diff(lines, report) _append_baseline(lines, report) _append_recommended_actions(lines, 
report.recommended_actions) @@ -254,6 +259,34 @@ def _append_top_findings(lines: list[str], findings: list[Finding]) -> None: lines.append("") +def _append_finding_provenance(lines: list[str], findings: list[Finding]) -> None: + counts = provenance_kind_counts(findings) + suppressed = sum(1 for finding in findings if finding.suppressed) + lines.extend( + [ + "## Finding Provenance", + "", + ( + "Reviewer triage signal only. Provenance kind does not " + "change severity, release decision, fingerprints, baselines, " + "or CI exit codes." + ), + "", + "| Provenance kind | Active findings |", + "| --- | ---: |", + ] + ) + for kind in PROVENANCE_KIND_ORDER: + lines.append(f"| `{kind}` | {counts[kind]} |") + lines.extend( + [ + "", + f"Suppressed findings excluded: {suppressed}", + "", + ] + ) + + def _append_capability_intent_diff( lines: list[str], report: ReadinessReport, diff --git a/src/agents_shipgate/schemas/contract.py b/src/agents_shipgate/schemas/contract.py index 0428f857..46e2d9db 100644 --- a/src/agents_shipgate/schemas/contract.py +++ b/src/agents_shipgate/schemas/contract.py @@ -42,6 +42,10 @@ "release_decision.contribution_rules", "findings[].requires_human_review", "findings[].blocks_release", + # v0.15: provenance is a reviewer triage/filter axis only. It + # never changes release_decision, severity, fingerprints, + # baselines, or CI exit behavior. 
+ "findings[].provenance_kind", "summary.human_review_recommended", "action_surface_diff", "codex_plugin_surface", diff --git a/tests/test_agent_instructions_renderers.py b/tests/test_agent_instructions_renderers.py index eb57993c..48133c94 100644 --- a/tests/test_agent_instructions_renderers.py +++ b/tests/test_agent_instructions_renderers.py @@ -39,7 +39,7 @@ REPO_ROOT = Path(__file__).resolve().parent.parent EXPECTED_CLAUDE_CODE_SKILL_RENDER_SHA256 = { ".claude/skills/agents-shipgate/SKILL.md": ( - "b17c53d9905f46b196be38e98cf71e53da6779e3a4f426ecff14f2b0f238aba9" + "7fa56f4fd9668e2136ea540986733f99bdf75c7c72a2e5c3fcb9fdeb73e143b2" ), ".claude/skills/agents-shipgate/prompts/add-shipgate-to-repo.md": ( "1ea69b1d3d418080c76540fff3b20044f70ed6787418eb5e4d3d39e036b34014" @@ -71,7 +71,7 @@ } EXPECTED_CODEX_SKILL_RENDER_SHA256 = { ".agents/skills/agents-shipgate/SKILL.md": ( - "59ec0a31f9747acf569f731561236ff4ef6d8734b614edfa04ea6ff10043f21a" + "920b60dcfeacb5eac55936d82f31796eb9a88bcec0e910fa56c278018c597772" ), ".agents/skills/agents-shipgate/references/recipes.md": ( "df5110bfa05eeabd9b918d8902b5c054fa547d1155be61ef6e7d7d63378bf210" diff --git a/tests/test_cli.py b/tests/test_cli.py index 8074d43a..baba5cf2 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -13,6 +13,7 @@ from agents_shipgate.checks import registry from agents_shipgate.cli._helpers import _safe_output_name from agents_shipgate.cli.main import app +from agents_shipgate.cli.scan import run_scan from agents_shipgate.schemas.contract import ( CONTRACT_VERSION, GATING_SIGNAL, @@ -371,6 +372,134 @@ def test_cli_explain_json_returns_full_metadata(): assert key in payload +def test_cli_findings_json_filters_by_provenance_kind(tmp_path): + run_scan( + config_path=Path("samples/support_refund_agent/shipgate.yaml"), + output_dir=tmp_path, + formats=["json"], + ci_mode="advisory", + packet_enabled=False, + ) + + result = runner.invoke( + app, + [ + "findings", + "--from", + str(tmp_path / 
"report.json"), + "--provenance-kind", + "keyword_heuristic,regex_heuristic", + "--json", + ], + ) + + assert result.exit_code == 0, result.output + payload = json.loads(result.output) + assert payload["filters"]["provenance_kind"] == [ + "keyword_heuristic", + "regex_heuristic", + ] + assert payload["filters"]["include_suppressed"] is False + assert payload["summary"]["matched_findings"] > 0 + assert "suppressed_omitted" in payload["summary"] + assert "suppressed_excluded" not in payload["summary"] + assert payload["summary"]["by_provenance_kind"]["keyword_heuristic"] > 0 + assert { + finding["provenance_kind"] for finding in payload["findings"] + } <= {"keyword_heuristic", "regex_heuristic"} + for finding in payload["findings"]: + assert { + "id", + "fingerprint", + "check_id", + "severity", + "title", + "tool_name", + "confidence", + "provenance_kind", + "agent_action", + "suppressed", + "source", + } <= set(finding) + + +def test_cli_findings_text_outputs_summary(tmp_path): + run_scan( + config_path=Path("samples/support_refund_agent/shipgate.yaml"), + output_dir=tmp_path, + formats=["json"], + ci_mode="advisory", + packet_enabled=False, + ) + + result = runner.invoke( + app, + [ + "findings", + "--from", + str(tmp_path / "report.json"), + "--provenance-kind", + "keyword_heuristic", + ], + ) + + assert result.exit_code == 0, result.output + assert "Scope: active findings only" in result.output + assert "Provenance counts:" in result.output + assert "keyword_heuristic:" in result.output + assert "SHIP-" in result.output + + +def test_cli_findings_invalid_provenance_kind_exits_three(tmp_path): + run_scan( + config_path=Path("samples/support_refund_agent/shipgate.yaml"), + output_dir=tmp_path, + formats=["json"], + ci_mode="advisory", + packet_enabled=False, + ) + + result = runner.invoke( + app, + [ + "findings", + "--from", + str(tmp_path / "report.json"), + "--provenance-kind", + "made_up", + ], + ) + + assert result.exit_code == 3 + assert "unsupported 
--provenance-kind value" in result.output + + +def test_cli_findings_pre_v15_report_agent_mode_error(tmp_path, monkeypatch): + monkeypatch.setenv("AGENTS_SHIPGATE_AGENT_MODE", "1") + old_report = tmp_path / "report.json" + old_report.write_text( + json.dumps({"report_schema_version": "0.14", "findings": []}), + encoding="utf-8", + ) + + result = runner.invoke( + app, + ["findings", "--from", str(old_report), "--json"], + ) + + assert result.exit_code == 3 + assert ">= 0.15" in result.output + json_lines = [ + line for line in (result.output or "").splitlines() if line.startswith("{") + ] + assert json_lines + payload = json.loads(json_lines[-1]) + assert payload["error"] == "input_parse_error" + assert payload["next_actions"][0]["command"] == ( + "agents-shipgate scan -c shipgate.yaml --format json" + ) + + def test_cli_agent_mode_emits_structured_error_on_missing_config(tmp_path, monkeypatch): monkeypatch.setenv("AGENTS_SHIPGATE_AGENT_MODE", "1") result = runner.invoke( diff --git a/tests/test_evidence_packet.py b/tests/test_evidence_packet.py index f4477e85..9a6113c3 100644 --- a/tests/test_evidence_packet.py +++ b/tests/test_evidence_packet.py @@ -32,11 +32,13 @@ render_packet_markdown, serialize_packet_json, ) +from agents_shipgate.packet.builder import _build_not_proven from agents_shipgate.packet.disclaimer import ( PACKET_NON_PROOF, PACKET_NON_PROOF_HEADLINE, ) from agents_shipgate.packet.evidence_matrix import build_evidence_matrix +from agents_shipgate.schemas.report import Finding SAMPLE_CONFIG = Path("samples/support_refund_agent/shipgate.yaml") EXPECTED_DIR = Path("samples/support_refund_agent/expected") @@ -134,6 +136,52 @@ def test_evidence_matrix_covers_expected_domains_and_renders(tmp_path): assert "§1A Evidence matrix — compact review summary" in html +def test_not_proven_residuals_include_non_static_provenance(): + findings = [ + Finding( + check_id="SHIP-KEYWORD", + title="keyword", + severity="medium", + category="test", + recommendation="r", + 
provenance_kind="keyword_heuristic", + ), + Finding( + check_id="SHIP-REGEX", + title="regex", + severity="medium", + category="test", + recommendation="r", + provenance_kind="regex_heuristic", + ), + Finding( + check_id="SHIP-AST", + title="ast", + severity="medium", + category="test", + recommendation="r", + provenance_kind="ast_extraction", + ), + Finding( + check_id="SHIP-POLICY-PACK", + title="policy pack", + severity="medium", + category="test", + recommendation="r", + provenance_kind="policy_pack", + ), + ] + + section = _build_not_proven(findings, source_warnings=[], tools=[]) + residuals = "\n".join(section.additional_residuals) + + assert "heuristic provenance" in residuals + assert "keyword_heuristic=1" in residuals + assert "regex_heuristic=1" in residuals + assert "AST-extracted tool surfaces" in residuals + assert "external policy packs" in residuals + + def test_evidence_matrix_uses_release_decision_only_for_blocking_and_review(): payload = { "release_decision": { diff --git a/tests/test_provenance_kind.py b/tests/test_provenance_kind.py index 05d7a919..833f03fd 100644 --- a/tests/test_provenance_kind.py +++ b/tests/test_provenance_kind.py @@ -14,7 +14,11 @@ from jsonschema import ValidationError, validate from agents_shipgate.cli.scan import run_scan -from agents_shipgate.core.findings import finding_fingerprint +from agents_shipgate.core.findings import ( + PROVENANCE_KIND_ORDER, + finding_fingerprint, + provenance_kind_counts, +) from agents_shipgate.report.json_report import report_json_payload from agents_shipgate.schemas.common import ProvenanceKind from agents_shipgate.schemas.report import Finding @@ -100,6 +104,50 @@ def test_each_provenance_value_constructs_cleanly(): assert finding.provenance_kind == value +def test_provenance_kind_counts_cover_all_values_and_suppression(): + findings = [ + Finding( + check_id=f"SHIP-TEST-{index}", + title=f"t{index}", + severity="info", + category="test", + recommendation="r", + provenance_kind=kind, + 
suppressed=index == 2, + ) + for index, kind in enumerate(PROVENANCE_KIND_ORDER) + ] + + active_counts = provenance_kind_counts(findings) + all_counts = provenance_kind_counts(findings, include_suppressed=True) + + assert set(active_counts) == set(PROVENANCE_KIND_ORDER) + assert active_counts["keyword_heuristic"] == 0 + assert all_counts["keyword_heuristic"] == 1 + for kind in ( + "static_declaration", + "ast_extraction", + "regex_heuristic", + "policy_pack", + ): + assert active_counts[kind] == 1 + assert all_counts[kind] == 1 + + +def test_provenance_kind_counts_treat_legacy_none_as_static(): + finding = Finding( + check_id="SHIP-TEST", + title="t", + severity="info", + category="test", + recommendation="r", + ) + + counts = provenance_kind_counts([finding]) + + assert counts["static_declaration"] == 1 + + def test_provenance_kind_not_in_fingerprint(): """Fingerprint stability invariant. Two findings that differ only by provenance_kind MUST share a fingerprint — otherwise every diff --git a/tests/test_public_surface_contract.py b/tests/test_public_surface_contract.py index 8709ddb8..fcd4a809 100644 --- a/tests/test_public_surface_contract.py +++ b/tests/test_public_surface_contract.py @@ -32,6 +32,7 @@ from agents_shipgate.schemas.contract import ( CONTRACT_VERSION, GATING_SIGNAL, + MANUAL_REVIEW_SIGNALS, SUPPORTED_INPUTS, build_contract_payload, ) @@ -383,6 +384,15 @@ def test_agent_contract_current_doc_is_canonical(): "docs/agent-contract-current.md must mention the local contract's " "manual_review_signals[] field." ) + assert "findings[].provenance_kind" in MANUAL_REVIEW_SIGNALS + assert "agents-shipgate findings" in text, ( + "docs/agent-contract-current.md must make provenance_kind operational " + "via the findings filter command." + ) + assert "never changes the release decision" in text, ( + "docs/agent-contract-current.md must state that provenance_kind is " + "reviewer triage only, not a gate input." 
+ ) assert CURRENT_PACKET_SCHEMA in text, ( "docs/agent-contract-current.md must reference the current packet " f"schema (v{CURRENT_PACKET_SCHEMA_VERSION}) so coding agents know "