infraspecdev · ashwinimanoj · Jun 4, 2026 · Jun 4, 2026
diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json
@@ -9,7 +9,7 @@
     {
       "name": "shield",
       "description": "Unified SDLC plugin \u2014 research, planning, PM integration, implementation, and continuous review with multi-domain support and specialist agents",
-      "version": "2.26.0",
+      "version": "2.27.0",
       "source": "./shield",
       "category": "development"
     },

diff --git a/shield/evals/plan-review-verdict.yaml b/shield/evals/plan-review-verdict.yaml
@@ -0,0 +1,47 @@
+# shield/evals/plan-review-verdict.yaml
+# Eval suite for /plan-review verdict computation (composite + P0-gate).
+# Invoked by `uv run shield/evals/run.py plan-review-verdict`.
+#
+# Each fixture is a grades.json holding aggregated per-persona grades plus the
+# classified findings. compute_plan_verdict.py turns those into a composite
+# score and applies the P0-gate. The `verdict` expectation lists substrings
+# that must ALL appear in the script's stdout.
+
+name: plan-review-verdict
+description: >
+  Asserts that /plan-review's verdict logic applies the P0-gate: a high
+  composite with a Critical D/F finding is gated to Needs Work, while a clean
+  high composite is Ready and a weak plan is Not Ready.
+
+cases:
+  # Strong grades, no Critical D/F → Ready.
+  - name: clean-ready
+    fixture: fixtures/clean-ready
+    expect:
+      verdict:
+        - "Ready"
+        - "composite 3.6"
+
+  # THE bug today: composite >= 2.5 but a Critical-F is present. Without the
+  # P0-gate this scored "Ready"; the gate must flip it to Needs Work.
+  - name: high-composite-p0
+    fixture: fixtures/high-composite-p0
+    expect:
+      verdict:
+        - "Needs Work"
+        - "blocked by 1 P0"
+
+  # Mid composite, no P0 (the lone finding is Important/C = P1) → Needs Work,
+  # and crucially NOT "blocked by" (distinguishes gate from threshold). The
+  # runner can only assert that substrings are present, so the needle pins the
+  # closing paren right after the score: a gated verdict would read
+  # "Needs Work (composite 2.27, blocked by ...)" and fail to match.
+  - name: needs-work-threshold
+    fixture: fixtures/needs-work-threshold
+    expect:
+      verdict:
+        - "Needs Work (composite 2.27)"
+
+  # Weak grades across the board → Not Ready on composite alone.
+  - name: not-ready
+    fixture: fixtures/not-ready
+    expect:
+      verdict:
+        - "Not Ready"
diff --git a/shield/evals/plan-review-verdict/fixtures/clean-ready/grades.json b/shield/evals/plan-review-verdict/fixtures/clean-ready/grades.json
@@ -0,0 +1,12 @@
+{
+  "personas": [
+    { "name": "architect", "grade": "A" },
+    { "name": "security-engineer", "grade": "A" },
+    { "name": "dx-engineer", "grade": "B" },
+    { "name": "agile-coach", "grade": "A" },
+    { "name": "product-manager", "grade": "B" }
+  ],
+  "findings": [
+    { "id": "PM6", "severity": "Warning", "grade": "C" }
+  ]
+}
diff --git a/shield/evals/plan-review-verdict/fixtures/high-composite-p0/grades.json b/shield/evals/plan-review-verdict/fixtures/high-composite-p0/grades.json
@@ -0,0 +1,12 @@
+{
+  "personas": [
+    { "name": "architect", "grade": "A" },
+    { "name": "security-engineer", "grade": "A" },
+    { "name": "dx-engineer", "grade": "B" },
+    { "name": "agile-coach", "grade": "A" },
+    { "name": "product-manager", "grade": "B" }
+  ],
+  "findings": [
+    { "id": "PM2", "severity": "Critical", "grade": "F" }
+  ]
+}
diff --git a/shield/evals/plan-review-verdict/fixtures/needs-work-threshold/grades.json b/shield/evals/plan-review-verdict/fixtures/needs-work-threshold/grades.json
@@ -0,0 +1,11 @@
+{
+  "personas": [
+    { "name": "architect", "grade": "C" },
+    { "name": "security-engineer", "grade": "C" },
+    { "name": "dx-engineer", "grade": "B" },
+    { "name": "product-manager", "grade": "C" }
+  ],
+  "findings": [
+    { "id": "PM4", "severity": "Important", "grade": "C" }
+  ]
+}
diff --git a/shield/evals/plan-review-verdict/fixtures/not-ready/grades.json b/shield/evals/plan-review-verdict/fixtures/not-ready/grades.json
@@ -0,0 +1,9 @@
+{
+  "personas": [
+    { "name": "architect", "grade": "D" },
+    { "name": "security-engineer", "grade": "F" },
+    { "name": "dx-engineer", "grade": "D" },
+    { "name": "product-manager", "grade": "D" }
+  ],
+  "findings": []
+}
diff --git a/shield/evals/run.py b/shield/evals/run.py
@@ -36,6 +36,7 @@
 VALIDATE_TRD = SCRIPTS_DIR / "validate_trd.py"
 VALIDATE_PLAN = SCRIPTS_DIR / "validate_plan.py"
 CHECK_PLAN_REVIEW_TRD = SCRIPTS_DIR / "check_plan_review_trd.py"
+COMPUTE_PLAN_VERDICT = SCRIPTS_DIR / "compute_plan_verdict.py"
 
 
 def _run_validator(script: Path, target: Path) -> tuple[int, str]:
@@ -125,6 +126,31 @@ def run_suite(suite_name: str, only_case: str | None = None, verbose: bool = Fal
             elif verbose:
                 print(f"    gates OK ({gates_expect})")
 
+        # plan-review verdict check (composite + P0-gate). The `verdict`
+        # expectation is one or more substrings that must ALL appear in the
+        # compute_plan_verdict.py stdout for grades.json.
+        verdict_expect = expect.get("verdict")
+        if verdict_expect is not None:
+            needles = [verdict_expect] if isinstance(verdict_expect, str) else list(verdict_expect)
+            grades_path = fixture_dir / "grades.json"
+            proc = subprocess.run(
+                [sys.executable, str(COMPUTE_PLAN_VERDICT), str(grades_path)],
+                capture_output=True,
+                text=True,
+            )
+            stdout = proc.stdout.strip()
+            missing = [n for n in needles if n not in stdout]
+            if proc.returncode != 0:
+                failed_assertions.append(
+                    f"verdict: script exit={proc.returncode} stderr={proc.stderr.strip()!r}"
+                )
+            elif missing:
+                failed_assertions.append(
+                    f"verdict: missing {missing!r} in stdout={stdout!r}"
+                )
+            elif verbose:
+                print(f"    verdict OK ({needles})")
+
         if failed_assertions:
             print(f"  FAIL {name}")
             for fa in failed_assertions:

diff --git a/shield/scripts/compute_plan_verdict.py b/shield/scripts/compute_plan_verdict.py
@@ -0,0 +1,152 @@
+#!/usr/bin/env python3
+# /// script
+# requires-python = ">=3.11"
+# ///
+"""compute_plan_verdict.py — deterministic verdict for /plan-review.
+
+Turns aggregated per-persona grades plus the classified findings into a
+composite readiness score and applies the P0-gate documented in
+`shield/skills/general/plan-review/scoring.md`. This mechanizes the
+"averaging problem" guard: a strong composite cannot hide a Critical-severity
+D/F finding.
+
+This module is the SINGLE SOURCE OF TRUTH for plan-review persona weights.
+`scoring.md` and `dimensions.md` reference this table rather than restating it.
+
+Input (JSON on stdin or a file path argument):
+
+    {
+      "personas": [ {"name": "architect", "grade": "B"}, ... ],
+      "findings":  [ {"id": "PM2", "severity": "Critical", "grade": "F"}, ... ]
+    }
+
+- `personas[].name` must be a known persona (see WEIGHTS); unknown names error.
+- `personas[].grade` is the persona's aggregated letter grade (A-F).
+- `findings[]` is the classified finding list; a P0 is any finding with
+  severity "Critical" graded D or F.
+
+Output (stdout), three stable lines:
+
+    composite: 2.93 (B)
+    p0_count: 1
+    verdict: Needs Work (composite 2.93, blocked by 1 P0)
+
+Exit codes:
+  0  — verdict computed.
+  2  — usage / input error (unknown persona, bad grade, malformed JSON).
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+from typing import Any
+
+# Canonical persona weights: scoring.md and dimensions.md reference this table
+# rather than restating it. Core = 1.0, supporting = 0.7. The PM persona weight
+# (0.7) applies to the grade rolled up from the 10 PM dim subagents (PM1-PM10).
+# NOTE(review): earlier wording said this table mirrors personas.md — confirm
+# that file does not keep a divergent copy of the weights.
+WEIGHTS: dict[str, float] = {
+    "architect": 1.0,
+    "security-engineer": 1.0,
+    "dx-engineer": 1.0,
+    "platform-engineer": 1.0,
+    "backend-engineer": 1.0,
+    "finops-analyst": 0.7,
+    "agile-coach": 0.7,
+    "sre": 0.7,
+    "product-manager": 0.7,
+}
+
+# Letter grade -> numeric points used when averaging (A=4 ... F=0).
+GRADE_TO_NUM: dict[str, int] = {"A": 4, "B": 3, "C": 2, "D": 1, "F": 0}
+
+
+def _letter(num: float) -> str:
+    """Map a numeric average to a letter using scoring.md's range table."""
+    if num >= 3.5:
+        return "A"
+    if num >= 2.5:
+        return "B"
+    if num >= 1.5:
+        return "C"
+    if num >= 0.5:
+        return "D"
+    return "F"
+
+
+def _composite(personas: list[dict[str, Any]]) -> float:
+    """Weighted average of activated persona grades. Denominator is the sum of
+    weights for personas that actually ran — not all of WEIGHTS."""
+    num = 0.0
+    denom = 0.0
+    for p in personas:
+        name = p.get("name")
+        if name not in WEIGHTS:
+            raise ValueError(f"unknown persona: {name!r} (known: {sorted(WEIGHTS)})")
+        grade = (p.get("grade") or "").strip().upper()
+        if grade not in GRADE_TO_NUM:
+            raise ValueError(f"bad grade {grade!r} for persona {name!r}")
+        weight = WEIGHTS[name]
+        num += GRADE_TO_NUM[grade] * weight
+        denom += weight
+    if denom == 0:
+        raise ValueError("no activated personas — cannot compute composite")
+    return num / denom
+
+
+def _p0_count(findings: list[dict[str, Any]]) -> int:
+    """P0 = grade D or F on a Critical-severity finding (scoring.md)."""
+    count = 0
+    for f in findings:
+        severity = (f.get("severity") or "").strip().lower()
+        grade = (f.get("grade") or "").strip().upper()
+        if severity == "critical" and grade in {"D", "F"}:
+            count += 1
+    return count
+
+
+def _verdict(composite: float, p0_count: int) -> str:
+    """Composite + P0-gate. A high composite with any P0 is gated to Needs Work."""
+    if composite < 1.5:
+        return f"Not Ready (composite {composite:.2f})"
+    if composite < 2.5:
+        return f"Needs Work (composite {composite:.2f})"
+    if p0_count > 0:
+        plural = "P0" if p0_count == 1 else "P0s"
+        return f"Needs Work (composite {composite:.2f}, blocked by {p0_count} {plural})"
+    return f"Ready (composite {composite:.2f})"
+
+
+def compute(payload: dict[str, Any]) -> str:
+    personas = payload.get("personas") or []
+    findings = payload.get("findings") or []
+    composite = _composite(personas)
+    p0 = _p0_count(findings)
+    return (
+        f"composite: {composite:.2f} ({_letter(composite)})\n"
+        f"p0_count: {p0}\n"
+        f"verdict: {_verdict(composite, p0)}"
+    )
+
+
+def main(argv: list[str]) -> int:
+    parser = argparse.ArgumentParser(description=__doc__.split("\n", 1)[0])
+    parser.add_argument(
+        "grades_json",
+        nargs="?",
+        default=None,
+        help="Path to grades.json. Reads stdin when omitted.",
+    )
+    args = parser.parse_args(argv)
+    try:
+        raw = Path(args.grades_json).read_text() if args.grades_json else sys.stdin.read()
+        payload = json.loads(raw)
+        print(compute(payload))
+    except (ValueError, json.JSONDecodeError, OSError) as exc:
+        print(f"error: {exc}", file=sys.stderr)
+        return 2
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main(sys.argv[1:]))
diff --git a/shield/skills/general/plan-review/SKILL.md b/shield/skills/general/plan-review/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: plan-review
-description: Use when a plan, architecture doc, or execution plan exists and needs expert review before implementation. Triggers on /plan-review, review my plan, document review.
+description: Use when a plan, architecture doc, or execution plan exists and needs expert review before implementation. Produces a scored analysis with a P0-gated verdict and an enhanced plan. Triggers on /plan-review, review my plan, document review.
 ---
 
 # Plan Review
@@ -14,6 +14,8 @@ All review output goes into a per-run, date-keyed folder under the feature's `re
 ```
 {output_dir}/{feature}/reviews/plan/{date}{_counter}/   ← {review_dir}
 ├── summary.md                        ← {review_summary}  (scored analysis, main output)
+├── source-plan.md                    ← immutable verbatim snapshot of the reviewed plan
+├── grades.json                       ← aggregated persona grades + findings (verdict input)
 ├── enhanced-plan.md                  ← {review_enhanced} (enhanced plan with feedback applied)
 └── detailed/
     └── <agent>.md                    ← {review_detailed} (one per dispatched agent)
@@ -59,17 +61,26 @@ At startup, call execute-steps to register these steps. Execute them in order, u
 | 0i | `lld_draft_review` — apply the LLD structural rubric (missing always-on, missing forced subsection, vague TBDs in always-on, PoD lifted but vague) to every `docs/shield/{feature}/lld-*.md` draft | when feature-folder LLD drafts exist | Yes — High/Medium/Review depending on issue |
 | 1 | Load plan document | always | Yes |
 | 1a | Detect prior PRD in feature folder — read prd.meta.json if present | only if prd.meta.json exists | No |
+| 1b | Snapshot the loaded plan to `{review_dir}/source-plan.md` (immutable) — see Source Snapshot | always | Yes |
 | 2 | Select reviewer personas | always | Yes |
 | 3 | Dispatch selected agents in parallel | always | Yes |
-| 4 | Parse grades + calculate scores | always | Yes |
+| 4 | Parse grades; compute composite + verdict via `compute_plan_verdict.py` (P0-gate) — see Collection & Scoring | always | Yes |
 | 5 | Generate enhanced plan | always | Yes |
-| 6 | Write summary + detailed findings (gates 0a-0e flow in here as Critical findings) | always | Yes |
+| 6 | Write summary + detailed findings (gates 0a-0i flow in here as Critical/High findings) | always | Yes |
 | 7 | Update manifest | always | Yes |
 
 ### Step 1a: Detect prior PRD
 
 If `{output_dir}/{feature}/prd.meta.json` exists (alongside `{prd}` = `{output_dir}/{feature}/prd.md`), read it. Use its `sections_present` and `type` to inform the plan-vs-PRD alignment check (future enhancement — for now, record it in `{review_summary}` as a "Source PRD" header line, e.g. `Source PRD: prd.md (type: standard, rubric: 1.2)`). This gives reviewers visibility into which PRD version the plan was built from.
 
+### Step 1b: Source Snapshot
+
+Immediately after loading the plan (and before dispatch), copy the plan markdown verbatim to
+`{review_dir}/source-plan.md`. This is an immutable snapshot — the enhanced plan is annotated
+separately as `enhanced-plan.md`, never in place. The snapshot makes each date-keyed review
+folder self-contained for audit (reviews never overwrite prior runs). Mirrors `/prd-review`'s
+`source-prd.md`. When the plan source is HTML, snapshot the parsed markdown and note the origin.
+
 ## Plan Input
 
 The skill reads plan data from (in priority order):
@@ -379,14 +390,27 @@ After all agents return:
    templates expect.
 3. **Per-persona grade** — average numeric grades (A=4, B=3, C=2, D=1, F=0) within each
    persona, round using ranges in `scoring.md`.
-4. **Composite score** — weighted average using persona weights from `dimensions.md` (PM
-   persona is 0.7, applied to the aggregated PM grade), convert to verdict per `scoring.md`
-   thresholds.
-5. **Classify recommendations** — P0/P1/P2 per severity rules in `scoring.md`.
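+4. **Classify recommendations** — P0/P1/P2 per severity rules in `scoring.md`. A P0 is any
+   finding graded D or F on a **Critical** severity evaluation point. Failed deterministic
+   gates (0a–0i) are also P0s. Note that `compute_plan_verdict.py` counts only `findings`
+   entries with severity `Critical` and grade D or F, so a failed gate must be recorded in
+   `grades.json` in that form or it will not trigger the P0-gate.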
+5. **Compute composite + verdict deterministically** — do NOT compute the verdict by hand.
+   Build a `grades.json` payload — `{ "personas": [{"name": "<slug>", "grade": "<A-F>"}, ...],
+   "findings": [{"id": "...", "severity": "Critical|Important|Warning", "grade": "<A-F>"}, ...] }`
+   — and run:
+
+   ```bash
+   uv run "$CLAUDE_PLUGIN_ROOT/scripts/compute_plan_verdict.py" {review_dir}/grades.json
+   ```
+
+   It returns the composite (with weights from its canonical `WEIGHTS` table), the P0 count, and
+   the verdict string — applying the P0-gate (a composite ≥ 2.5 with any P0 is gated to **Needs
+   Work**, not Ready). Use the emitted `verdict:` line verbatim in `summary.md`.
 
 ## Output
 
 Write to `{review_dir}` = `{output_dir}/{feature}/reviews/plan/{date}{_counter}/`:
+- `source-plan.md` — immutable verbatim snapshot of the reviewed plan (Step 1b)
+- `grades.json` — aggregated persona grades + classified findings (input to `compute_plan_verdict.py`)
 - `{review_summary}` (`summary.md`) — scored evaluation with consolidated recommendations
 - `{review_enhanced}` (`enhanced-plan.md`) — enhanced version of original plan with feedback applied
 - `{review_detailed}` (`detailed/<agent>.md`) — full output from each dispatched agent

diff --git a/shield/skills/general/plan-review/dimensions.md b/shield/skills/general/plan-review/dimensions.md
@@ -48,9 +48,10 @@ These continue to dispatch as full persona agents — they are not decomposed in
 | Backend engineer | `shield:backend-engineer` | 1.0 | Application code, API design |
 | Security engineer | `shield:security-engineer` | 1.0 | Security posture, threat modeling |
 
-Persona weights (used by `scoring.md` composite calculation) are unchanged from `personas.md`.
-The PM persona weight is 0.7 — it now applies to the aggregated PM grade rolled up from the
-10 dim subagents.
+Persona weights are defined canonically in `shield/scripts/compute_plan_verdict.py` (`WEIGHTS`);
+`scoring.md` and the table above reference that single source. The PM persona weight is 0.7 — it
+applies to the aggregated PM grade rolled up from the 10 dim subagents. Platform Engineer and
+Backend Engineer are weight 1.0 (Core); both are in the canonical table.
 
 ## Dispatch shape per pattern