Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .claude-plugin/marketplace.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
{
"name": "shield",
"description": "Unified SDLC plugin \u2014 research, planning, PM integration, implementation, and continuous review with multi-domain support and specialist agents",
"version": "2.26.0",
"version": "2.27.0",
"source": "./shield",
"category": "development"
},
Expand Down
47 changes: 47 additions & 0 deletions shield/evals/plan-review-verdict.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# shield/evals/plan-review-verdict.yaml
# Eval suite for /plan-review verdict computation (composite + P0-gate).
# Invoked by `uv run shield/evals/run.py plan-review-verdict`.
#
# Each fixture is a grades.json holding aggregated per-persona grades plus the
# classified findings. compute_plan_verdict.py turns those into a composite
# score and applies the P0-gate. The `verdict` expectation lists substrings
# that must ALL appear in the script's stdout.

name: plan-review-verdict
description: >
Asserts that /plan-review's verdict logic applies the P0-gate: a high
composite with a Critical D/F finding is gated to Needs Work, while a clean
high composite is Ready and a weak plan is Not Ready.

cases:
# Strong grades, no Critical D/F → Ready.
- name: clean-ready
fixture: fixtures/clean-ready
expect:
verdict:
- "Ready"
- "composite 3.6"

# THE bug today: composite >= 2.5 but a Critical-F is present. Without the
# P0-gate this scored "Ready"; the gate must flip it to Needs Work.
- name: high-composite-p0
fixture: fixtures/high-composite-p0
expect:
verdict:
- "Needs Work"
- "blocked by 1 P0"

# Mid composite, no P0 (the lone finding is Important/C = P1) → Needs Work,
# and crucially NOT "blocked by" (distinguishes gate from threshold).
- name: needs-work-threshold
fixture: fixtures/needs-work-threshold
expect:
verdict:
- "Needs Work"

# Weak grades across the board → Not Ready on composite alone.
- name: not-ready
fixture: fixtures/not-ready
expect:
verdict:
- "Not Ready"
12 changes: 12 additions & 0 deletions shield/evals/plan-review-verdict/fixtures/clean-ready/grades.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{
"personas": [
{ "name": "architect", "grade": "A" },
{ "name": "security-engineer", "grade": "A" },
{ "name": "dx-engineer", "grade": "B" },
{ "name": "agile-coach", "grade": "A" },
{ "name": "product-manager", "grade": "B" }
],
"findings": [
{ "id": "PM6", "severity": "Warning", "grade": "C" }
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{
"personas": [
{ "name": "architect", "grade": "A" },
{ "name": "security-engineer", "grade": "A" },
{ "name": "dx-engineer", "grade": "B" },
{ "name": "agile-coach", "grade": "A" },
{ "name": "product-manager", "grade": "B" }
],
"findings": [
{ "id": "PM2", "severity": "Critical", "grade": "F" }
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"personas": [
{ "name": "architect", "grade": "C" },
{ "name": "security-engineer", "grade": "C" },
{ "name": "dx-engineer", "grade": "B" },
{ "name": "product-manager", "grade": "C" }
],
"findings": [
{ "id": "PM4", "severity": "Important", "grade": "C" }
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{
"personas": [
{ "name": "architect", "grade": "D" },
{ "name": "security-engineer", "grade": "F" },
{ "name": "dx-engineer", "grade": "D" },
{ "name": "product-manager", "grade": "D" }
],
"findings": []
}
26 changes: 26 additions & 0 deletions shield/evals/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
VALIDATE_TRD = SCRIPTS_DIR / "validate_trd.py"
VALIDATE_PLAN = SCRIPTS_DIR / "validate_plan.py"
CHECK_PLAN_REVIEW_TRD = SCRIPTS_DIR / "check_plan_review_trd.py"
COMPUTE_PLAN_VERDICT = SCRIPTS_DIR / "compute_plan_verdict.py"


def _run_validator(script: Path, target: Path) -> tuple[int, str]:
Expand Down Expand Up @@ -125,6 +126,31 @@ def run_suite(suite_name: str, only_case: str | None = None, verbose: bool = Fal
elif verbose:
print(f" gates OK ({gates_expect})")

# plan-review verdict check (composite + P0-gate). The `verdict`
# expectation is one or more substrings that must ALL appear in the
# compute_plan_verdict.py stdout for grades.json.
verdict_expect = expect.get("verdict")
if verdict_expect is not None:
needles = [verdict_expect] if isinstance(verdict_expect, str) else list(verdict_expect)
grades_path = fixture_dir / "grades.json"
proc = subprocess.run(
[sys.executable, str(COMPUTE_PLAN_VERDICT), str(grades_path)],
capture_output=True,
text=True,
)
stdout = proc.stdout.strip()
missing = [n for n in needles if n not in stdout]
if proc.returncode != 0:
failed_assertions.append(
f"verdict: script exit={proc.returncode} stderr={proc.stderr.strip()!r}"
)
elif missing:
failed_assertions.append(
f"verdict: missing {missing!r} in stdout={stdout!r}"
)
elif verbose:
print(f" verdict OK ({needles})")

if failed_assertions:
print(f" FAIL {name}")
for fa in failed_assertions:
Expand Down
152 changes: 152 additions & 0 deletions shield/scripts/compute_plan_verdict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.11"
# ///
"""compute_plan_verdict.py — deterministic verdict for /plan-review.

Turns aggregated per-persona grades plus the classified findings into a
composite readiness score and applies the P0-gate documented in
`shield/skills/general/plan-review/scoring.md`. This mechanizes the
"averaging problem" guard: a strong composite cannot hide a Critical-severity
D/F finding.

This module is the SINGLE SOURCE OF TRUTH for plan-review persona weights.
`scoring.md` and `dimensions.md` reference this table rather than restating it.

Input (JSON on stdin or a file path argument):

{
"personas": [ {"name": "architect", "grade": "B"}, ... ],
"findings": [ {"id": "PM2", "severity": "Critical", "grade": "F"}, ... ]
}

- `personas[].name` must be a known persona (see WEIGHTS); unknown names error.
- `personas[].grade` is the persona's aggregated letter grade (A-F).
- `findings[]` is the classified finding list; a P0 is any finding with
severity "Critical" graded D or F.

Output (stdout), three stable lines:

composite: 2.93 (B)
p0_count: 1
verdict: Needs Work (composite 2.93, blocked by 1 P0)

Exit codes:
0 — verdict computed.
2 — usage / input error (unknown persona, bad grade, malformed JSON).
"""
from __future__ import annotations

import argparse
import json
import sys
from pathlib import Path
from typing import Any

# Canonical persona weights. Mirrors dimensions.md / personas.md. Core = 1.0,
# supporting = 0.7. The PM persona weight (0.7) applies to the grade rolled up
# from the 10 PM dim subagents (PM1-PM10).
WEIGHTS: dict[str, float] = {
"architect": 1.0,
"security-engineer": 1.0,
"dx-engineer": 1.0,
"platform-engineer": 1.0,
"backend-engineer": 1.0,
"finops-analyst": 0.7,
"agile-coach": 0.7,
"sre": 0.7,
"product-manager": 0.7,
}

GRADE_TO_NUM: dict[str, int] = {"A": 4, "B": 3, "C": 2, "D": 1, "F": 0}


def _letter(num: float) -> str:
"""Map a numeric average to a letter using scoring.md's range table."""
if num >= 3.5:
return "A"
if num >= 2.5:
return "B"
if num >= 1.5:
return "C"
if num >= 0.5:
return "D"
return "F"


def _composite(personas: list[dict[str, Any]]) -> float:
"""Weighted average of activated persona grades. Denominator is the sum of
weights for personas that actually ran — not all of WEIGHTS."""
num = 0.0
denom = 0.0
for p in personas:
name = p.get("name")
if name not in WEIGHTS:
raise ValueError(f"unknown persona: {name!r} (known: {sorted(WEIGHTS)})")
grade = (p.get("grade") or "").strip().upper()
if grade not in GRADE_TO_NUM:
raise ValueError(f"bad grade {grade!r} for persona {name!r}")
weight = WEIGHTS[name]
num += GRADE_TO_NUM[grade] * weight
denom += weight
if denom == 0:
raise ValueError("no activated personas — cannot compute composite")
return num / denom


def _p0_count(findings: list[dict[str, Any]]) -> int:
"""P0 = grade D or F on a Critical-severity finding (scoring.md)."""
count = 0
for f in findings:
severity = (f.get("severity") or "").strip().lower()
grade = (f.get("grade") or "").strip().upper()
if severity == "critical" and grade in {"D", "F"}:
count += 1
return count


def _verdict(composite: float, p0_count: int) -> str:
"""Composite + P0-gate. A high composite with any P0 is gated to Needs Work."""
if composite < 1.5:
return f"Not Ready (composite {composite:.2f})"
if composite < 2.5:
return f"Needs Work (composite {composite:.2f})"
if p0_count > 0:
plural = "P0" if p0_count == 1 else "P0s"
return f"Needs Work (composite {composite:.2f}, blocked by {p0_count} {plural})"
return f"Ready (composite {composite:.2f})"


def compute(payload: dict[str, Any]) -> str:
personas = payload.get("personas") or []
findings = payload.get("findings") or []
composite = _composite(personas)
p0 = _p0_count(findings)
return (
f"composite: {composite:.2f} ({_letter(composite)})\n"
f"p0_count: {p0}\n"
f"verdict: {_verdict(composite, p0)}"
)


def main(argv: list[str]) -> int:
parser = argparse.ArgumentParser(description=__doc__.split("\n", 1)[0])
parser.add_argument(
"grades_json",
nargs="?",
default=None,
help="Path to grades.json. Reads stdin when omitted.",
)
args = parser.parse_args(argv)
try:
raw = Path(args.grades_json).read_text() if args.grades_json else sys.stdin.read()
payload = json.loads(raw)
print(compute(payload))
except (ValueError, json.JSONDecodeError, OSError) as exc:
print(f"error: {exc}", file=sys.stderr)
return 2
return 0


if __name__ == "__main__":
sys.exit(main(sys.argv[1:]))
38 changes: 31 additions & 7 deletions shield/skills/general/plan-review/SKILL.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
---
name: plan-review
description: Use when a plan, architecture doc, or execution plan exists and needs expert review before implementation. Triggers on /plan-review, review my plan, document review.
description: Use when a plan, architecture doc, or execution plan exists and needs expert review before implementation. Produces a scored analysis with a P0-gated verdict and an enhanced plan. Triggers on /plan-review, review my plan, document review.
---

# Plan Review
Expand All @@ -14,6 +14,8 @@ All review output goes into a per-run, date-keyed folder under the feature's `re
```
{output_dir}/{feature}/reviews/plan/{date}{_counter}/ ← {review_dir}
├── summary.md ← {review_summary} (scored analysis, main output)
├── source-plan.md ← immutable verbatim snapshot of the reviewed plan
├── grades.json ← aggregated persona grades + findings (verdict input)
├── enhanced-plan.md ← {review_enhanced} (enhanced plan with feedback applied)
└── detailed/
└── <agent>.md ← {review_detailed} (one per dispatched agent)
Expand Down Expand Up @@ -59,17 +61,26 @@ At startup, call execute-steps to register these steps. Execute them in order, u
| 0i | `lld_draft_review` — apply the LLD structural rubric (missing always-on, missing forced subsection, vague TBDs in always-on, PoD lifted but vague) to every `docs/shield/{feature}/lld-*.md` draft | when feature-folder LLD drafts exist | Yes — High/Medium/Review depending on issue |
| 1 | Load plan document | always | Yes |
| 1a | Detect prior PRD in feature folder — read prd.meta.json if present | only if prd.meta.json exists | No |
| 1b | Snapshot the loaded plan to `{review_dir}/source-plan.md` (immutable) — see Source Snapshot | always | Yes |
| 2 | Select reviewer personas | always | Yes |
| 3 | Dispatch selected agents in parallel | always | Yes |
| 4 | Parse grades + calculate scores | always | Yes |
| 4 | Parse grades; compute composite + verdict via `compute_plan_verdict.py` (P0-gate) — see Collection & Scoring | always | Yes |
| 5 | Generate enhanced plan | always | Yes |
| 6 | Write summary + detailed findings (gates 0a-0e flow in here as Critical findings) | always | Yes |
| 6 | Write summary + detailed findings (gates 0a-0i flow in here as Critical/High findings) | always | Yes |
| 7 | Update manifest | always | Yes |

### Step 1a: Detect prior PRD

If `{output_dir}/{feature}/prd.meta.json` exists (alongside `{prd}` = `{output_dir}/{feature}/prd.md`), read it. Use its `sections_present` and `type` to inform the plan-vs-PRD alignment check (future enhancement — for now, record it in `{review_summary}` as a "Source PRD" header line, e.g. `Source PRD: prd.md (type: standard, rubric: 1.2)`). This gives reviewers visibility into which PRD version the plan was built from.

### Step 1b: Source Snapshot

Immediately after loading the plan (and before dispatch), copy the plan markdown verbatim to
`{review_dir}/source-plan.md`. This is an immutable snapshot — the enhanced plan is annotated
separately as `enhanced-plan.md`, never in place. The snapshot makes each date-keyed review
folder self-contained for audit (reviews never overwrite prior runs). Mirrors `/prd-review`'s
`source-prd.md`. When the plan source is HTML, snapshot the parsed markdown and note the origin.

## Plan Input

The skill reads plan data from (in priority order):
Expand Down Expand Up @@ -379,14 +390,27 @@ After all agents return:
templates expect.
3. **Per-persona grade** — average numeric grades (A=4, B=3, C=2, D=1, F=0) within each
persona, round using ranges in `scoring.md`.
4. **Composite score** — weighted average using persona weights from `dimensions.md` (PM
persona is 0.7, applied to the aggregated PM grade), convert to verdict per `scoring.md`
thresholds.
5. **Classify recommendations** — P0/P1/P2 per severity rules in `scoring.md`.
4. **Classify recommendations** — P0/P1/P2 per severity rules in `scoring.md`. A P0 is any
finding graded D or F on a **Critical** severity evaluation point. Failed deterministic
gates (0a–0i) are also P0s.
5. **Compute composite + verdict deterministically** — do NOT compute the verdict by hand.
Build a `grades.json` payload — `{ "personas": [{"name": "<slug>", "grade": "<A-F>"}, ...],
"findings": [{"id": "...", "severity": "Critical|Important|Warning", "grade": "<A-F>"}, ...] }`
— and run:

```bash
uv run "$CLAUDE_PLUGIN_ROOT/scripts/compute_plan_verdict.py" {review_dir}/grades.json
```

It returns the composite (with weights from its canonical `WEIGHTS` table), the P0 count, and
the verdict string — applying the P0-gate (a composite ≥ 2.5 with any P0 is gated to **Needs
Work**, not Ready). Use the emitted `verdict:` line verbatim in `summary.md`.

## Output

Write to `{review_dir}` = `{output_dir}/{feature}/reviews/plan/{date}{_counter}/`:
- `source-plan.md` — immutable verbatim snapshot of the reviewed plan (Step 1b)
- `grades.json` — aggregated persona grades + classified findings (input to `compute_plan_verdict.py`)
- `{review_summary}` (`summary.md`) — scored evaluation with consolidated recommendations
- `{review_enhanced}` (`enhanced-plan.md`) — enhanced version of original plan with feedback applied
- `{review_detailed}` (`detailed/<agent>.md`) — full output from each dispatched agent
Expand Down
7 changes: 4 additions & 3 deletions shield/skills/general/plan-review/dimensions.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,10 @@ These continue to dispatch as full persona agents — they are not decomposed in
| Backend engineer | `shield:backend-engineer` | 1.0 | Application code, API design |
| Security engineer | `shield:security-engineer` | 1.0 | Security posture, threat modeling |

Persona weights (used by `scoring.md` composite calculation) are unchanged from `personas.md`.
The PM persona weight is 0.7 — it now applies to the aggregated PM grade rolled up from the
10 dim subagents.
Persona weights are defined canonically in `shield/scripts/compute_plan_verdict.py` (`WEIGHTS`);
`scoring.md` and the table above reference that single source. The PM persona weight is 0.7 — it
applies to the aggregated PM grade rolled up from the 10 dim subagents. Platform Engineer and
Backend Engineer are weight 1.0 (Core); both are in the canonical table.

## Dispatch shape per pattern

Expand Down
Loading
Loading