From 32d02f485f9f82bc6a89f7165893cd7974ab3741 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 17 May 2026 11:30:57 +0000 Subject: [PATCH 01/45] feat(experiments): rule engine POC for OODA Decide quadrant Terminal-only TypeScript POC of the "LLM extracts, rules decide" pattern from the AI fact-checking community: the LLM is constrained to producing structured flags from raw signals, and a deterministic rule engine maps those flags to a verdict tier with a fully replayable audit trail. Lives under experiments/rule-engine-poc/ as a sandbox (not formal Stage 1-7) and demonstrates the pattern against the repo's own quality framework. Each rule encodes a Definition of Done item from docs/quality-framework.md. What's included: - src/ - hand-rolled engine (~250 LOC): types, hash, engine, loader, cli, html-report. Pure functions; severity-first verdict; canonical JSON + SHA-256 provenance hashes for replay. - rules/quality-gates.yaml - DoD-as-rules example set. - fixtures/*.json - 5 mock Orient-quadrant extractions covering ready, blocked, and needs-attention verdicts. - test/ - 24 passing tests (vitest), including dedicated reproducibility suite (strategist-recommended North Star: byte-identical replay). - HTML reporter - self-contained, inline CSS, no JS, no external assets. - docs/ - architecture, DSL reference, audit trail + EU AI Act mapping, extension guide, OODA integration. - research/ - five-angle research wave (technical landscape, regulatory auditability, positioning/JTBD, design alternatives, risks/critique). 
--- experiments/rule-engine-poc/.gitignore | 5 + experiments/rule-engine-poc/README.md | 115 + experiments/rule-engine-poc/docs/README.md | 20 + .../rule-engine-poc/docs/architecture.md | 93 + .../rule-engine-poc/docs/audit-trail.md | 113 + .../rule-engine-poc/docs/dsl-reference.md | 191 ++ experiments/rule-engine-poc/docs/extending.md | 113 + .../rule-engine-poc/docs/ooda-integration.md | 105 + .../fixtures/blocked-missing-ears.json | 14 + .../fixtures/blocked-s1-finding.json | 21 + .../needs-attention-design-risks.json | 17 + .../rule-engine-poc/fixtures/ready-idea.json | 14 + .../fixtures/ready-implementation.json | 23 + experiments/rule-engine-poc/package-lock.json | 1994 +++++++++++++++++ experiments/rule-engine-poc/package.json | 30 + .../research/01-technical-landscape.md | 77 + .../research/02-regulatory-auditability.md | 109 + .../research/03-positioning-jtbd.md | 86 + .../research/04-technical-design.md | 313 +++ .../research/05-risks-critique.md | 122 + .../rule-engine-poc/rules/quality-gates.yaml | 404 ++++ .../scripts/run-all-fixtures.mjs | 23 + .../rule-engine-poc/scripts/run-all-html.mjs | 29 + experiments/rule-engine-poc/src/cli.ts | 140 ++ experiments/rule-engine-poc/src/engine.ts | 176 ++ experiments/rule-engine-poc/src/hash.ts | 24 + .../rule-engine-poc/src/html-report.ts | 278 +++ experiments/rule-engine-poc/src/loader.ts | 80 + experiments/rule-engine-poc/src/types.ts | 82 + .../rule-engine-poc/test/engine.test.ts | 243 ++ experiments/rule-engine-poc/test/hash.test.ts | 26 + .../rule-engine-poc/test/loader.test.ts | 98 + experiments/rule-engine-poc/tsconfig.json | 19 + experiments/rule-engine-poc/vitest.config.ts | 8 + 34 files changed, 5205 insertions(+) create mode 100644 experiments/rule-engine-poc/.gitignore create mode 100644 experiments/rule-engine-poc/README.md create mode 100644 experiments/rule-engine-poc/docs/README.md create mode 100644 experiments/rule-engine-poc/docs/architecture.md create mode 100644 
experiments/rule-engine-poc/docs/audit-trail.md create mode 100644 experiments/rule-engine-poc/docs/dsl-reference.md create mode 100644 experiments/rule-engine-poc/docs/extending.md create mode 100644 experiments/rule-engine-poc/docs/ooda-integration.md create mode 100644 experiments/rule-engine-poc/fixtures/blocked-missing-ears.json create mode 100644 experiments/rule-engine-poc/fixtures/blocked-s1-finding.json create mode 100644 experiments/rule-engine-poc/fixtures/needs-attention-design-risks.json create mode 100644 experiments/rule-engine-poc/fixtures/ready-idea.json create mode 100644 experiments/rule-engine-poc/fixtures/ready-implementation.json create mode 100644 experiments/rule-engine-poc/package-lock.json create mode 100644 experiments/rule-engine-poc/package.json create mode 100644 experiments/rule-engine-poc/research/01-technical-landscape.md create mode 100644 experiments/rule-engine-poc/research/02-regulatory-auditability.md create mode 100644 experiments/rule-engine-poc/research/03-positioning-jtbd.md create mode 100644 experiments/rule-engine-poc/research/04-technical-design.md create mode 100644 experiments/rule-engine-poc/research/05-risks-critique.md create mode 100644 experiments/rule-engine-poc/rules/quality-gates.yaml create mode 100644 experiments/rule-engine-poc/scripts/run-all-fixtures.mjs create mode 100644 experiments/rule-engine-poc/scripts/run-all-html.mjs create mode 100644 experiments/rule-engine-poc/src/cli.ts create mode 100644 experiments/rule-engine-poc/src/engine.ts create mode 100644 experiments/rule-engine-poc/src/hash.ts create mode 100644 experiments/rule-engine-poc/src/html-report.ts create mode 100644 experiments/rule-engine-poc/src/loader.ts create mode 100644 experiments/rule-engine-poc/src/types.ts create mode 100644 experiments/rule-engine-poc/test/engine.test.ts create mode 100644 experiments/rule-engine-poc/test/hash.test.ts create mode 100644 experiments/rule-engine-poc/test/loader.test.ts create mode 100644 
experiments/rule-engine-poc/tsconfig.json create mode 100644 experiments/rule-engine-poc/vitest.config.ts diff --git a/experiments/rule-engine-poc/.gitignore b/experiments/rule-engine-poc/.gitignore new file mode 100644 index 000000000..edf96cefc --- /dev/null +++ b/experiments/rule-engine-poc/.gitignore @@ -0,0 +1,5 @@ +node_modules/ +dist/ +reports/ +*.log +.DS_Store diff --git a/experiments/rule-engine-poc/README.md b/experiments/rule-engine-poc/README.md new file mode 100644 index 000000000..642042886 --- /dev/null +++ b/experiments/rule-engine-poc/README.md @@ -0,0 +1,115 @@ +--- +title: Rule Engine POC +folder: experiments/rule-engine-poc +description: Terminal-only TypeScript proof-of-concept of a deterministic rule engine that sits on top of LLM-extracted structured flags. Demonstrates the "LLM extracts, rules decide" pattern applied to the repo's own quality framework. +entry_point: true +--- + +# Rule Engine POC + +Terminal-only TypeScript proof-of-concept of a deterministic rule engine that sits on top of LLM-extracted structured flags. The engine consumes flags and emits a verdict (`blocked` / `needs-attention` / `ready-to-progress`) with a fully replayable audit trail. + +## Why + +The concept comes from a [Reddit thread on AI fact-checking](https://www.reddit.com/r/artificial/) where the author argues the LLM should **never produce verdicts**, only **structured extractions** that a deterministic layer then scores: + +> "The LLM in our pipeline never produces a numeric score, never produces a true/false verdict... The LLM extracts structured factual flags from source material. A deterministic Python scoring layer turns those flags into a verdict tier." 
+ +That maps perfectly onto our [OODA orchestrator concept](../../docs/backlog/502-idea-ooda-loop-plugin-observe-orient-decide-act.md): + +| OODA quadrant | Role | Determinism | +|---|---|---| +| Observe | Raw signal collection (git, CI, files) | Deterministic | +| Orient | **LLM extracts structured flags** | Stochastic (constrained to extraction only) | +| Decide | **Rule engine emits verdict + suggested actions** | **Deterministic — this POC** | +| Act | Execute approved actions | Deterministic | + +## Domain: the repo's own quality framework + +The example rule set encodes the Definition of Done from [`docs/quality-framework.md`](../../docs/quality-framework.md) as machine-checkable rules — one rule per DoD bullet, per stage. A feature folder's verdict becomes a function of named flags and named rules; nothing about the answer depends on which way the wind was blowing when the LLM ran. + +## Run it + +```bash +cd experiments/rule-engine-poc +npm install +npm run demo # ready-implementation fixture +npm run demo:blocked-ears # requirements stage with EARS coverage 0.6 +npm run demo:blocked-s1 # any-S1-finding cross-cutting block +npm run demo:needs-attention # design stage with S2 findings +npm run demo:ready-idea # idea stage, DoD satisfied +npm run demo:all # walk every fixture in turn +npm test # unit + reproducibility tests +``` + +Three output modes: + +```bash +# default: human-readable text report +npx tsx src/cli.ts rules/quality-gates.yaml fixtures/ready-implementation.json + +# machine-readable JSON (the full VerdictResult) +npx tsx src/cli.ts rules/quality-gates.yaml fixtures/ready-implementation.json --json + +# self-contained HTML report (no external assets) +npx tsx src/cli.ts rules/quality-gates.yaml fixtures/ready-implementation.json --html reports/out.html + +# generate one HTML report per fixture into reports/ +npm run demo:html:all +``` + +The CLI exits **1** on `blocked`, **0** otherwise — usable as a CI gate. 
+ +## File map + +| Path | Role | +|---|---| +| `src/types.ts` | Data contracts (`ExtractionFlags`, `Rule`, `VerdictResult`, ...) | +| `src/hash.ts` | Canonical JSON + SHA-256 — the foundation of replayability | +| `src/engine.ts` | Deterministic evaluation: severity-first verdict, weighted tally, audit trail | +| `src/loader.ts` | YAML rule file loader + schema validation + per-rule content hash | +| `src/cli.ts` | Terminal renderer + JSON / HTML modes + verdict-as-exit-code | +| `src/html-report.ts` | Self-contained HTML renderer (inline CSS, no JS) | +| `rules/quality-gates.yaml` | Example rules — Definition of Done per stage | +| `fixtures/*.json` | Mock Orient-quadrant outputs (the LLM's structured extraction) | +| `test/*.test.ts` | Unit, reproducibility, and operator coverage | +| `docs/*.md` | Architecture, DSL reference, audit trail, extension guide, OODA integration | +| `research/*.md` | Research wave that informed the design (5 angles) | + +## Documentation + +Detailed documentation lives under [`docs/`](docs/README.md): + +- [`docs/architecture.md`](docs/architecture.md) — how the engine works, severity-first conflict resolution, determinism strategy +- [`docs/dsl-reference.md`](docs/dsl-reference.md) — full YAML grammar with every operator +- [`docs/audit-trail.md`](docs/audit-trail.md) — what's captured, how to replay, EU AI Act mapping +- [`docs/extending.md`](docs/extending.md) — adding rules, flags, fixtures; pointing the engine at a new domain +- [`docs/ooda-integration.md`](docs/ooda-integration.md) — how this POC slots into the OODA orchestrator + +## Design choices and why + +- **Severity-first verdict.** A `blocked` rule beats any number of `ready-to-progress` rules, regardless of weight. Verdicts are categorical tiers — exactly as the source pattern argues. 
+- **Determinism by construction.** Rules are sorted `[priority desc, id asc]` at load time; flags are serialised through `canonicalJson` (sorted keys) before hashing; no `Date.now`, no `Math.random`, no async, no `Object.entries` in the hot path. +- **Content-hash provenance.** Each rule carries a SHA-256 of its content; the result carries `rulesetHash` and `flagsHash`. An auditor can replay a verdict from those three artifacts alone. +- **Missing flags are not silent.** A rule that references an absent flag fails with reason `"flag missing in extraction"` — surfaced in the audit trail rather than swallowed. +- **YAML, with strict load-time validation.** Picked for diff-ability; the loader rejects malformed rules with a helpful error before any evaluation. + +## What this is not + +- **Not a production rule engine.** It is intentionally minimal (~250 LOC engine + loader). The technical-landscape research recommends [`json-rules-engine`](https://github.com/CacheControl/json-rules-engine) behind a thin adapter when this graduates from POC; see `research/01-technical-landscape.md` for the comparison. +- **Not opinionated about extraction.** The LLM half of the pipeline is mocked by the fixture JSON files. Wiring a real Claude / GPT extraction call is a separate POC. +- **Not yet calibrated.** Weights and severity ordering are placeholders. The critic flags this as the top risk; see `research/05-risks-critique.md` for the proposed Riskiest Assumption Tests. + +## Research artifacts + +Each of the five research angles produced a standalone brief under `research/`: + +1. [`01-technical-landscape.md`](research/01-technical-landscape.md) — library comparison; build-vs-buy recommendation +2. [`02-regulatory-auditability.md`](research/02-regulatory-auditability.md) — EU AI Act / ISO 42001 / NIST RMF audit-trail checklist +3. [`03-positioning-jtbd.md`](research/03-positioning-jtbd.md) — JTBD, North Star, competitive positioning +4. 
[`04-technical-design.md`](research/04-technical-design.md) — alternative architecture sketch (deeper than the POC implementation) +5. [`05-risks-critique.md`](research/05-risks-critique.md) — failure modes, blindspots, 3 RATs to falsify first + +## North Star + +From the positioning research: **verdict reproducibility rate** — the percentage of `(flags, rule set) -> verdict` pairs that match byte-for-byte across two runs on the same input. The reproducibility test suite (`test/engine.test.ts` -> `describe("reproducibility")`) exercises this directly; in this POC the rate is 100% by construction. diff --git a/experiments/rule-engine-poc/docs/README.md b/experiments/rule-engine-poc/docs/README.md new file mode 100644 index 000000000..2ebe0e21e --- /dev/null +++ b/experiments/rule-engine-poc/docs/README.md @@ -0,0 +1,20 @@ +--- +title: Rule engine POC documentation +folder: experiments/rule-engine-poc/docs +description: Index of POC documentation — architecture, DSL reference, audit trail, extension guide, OODA integration. +entry_point: true +--- + +# Rule engine POC documentation + +Detailed documentation for the POC. Start with the project [README](../README.md) for context, then dive into whichever doc matches your question. + +| Doc | Read when | +|---|---| +| [`architecture.md`](architecture.md) | You want to understand how the engine is built and why it produces the same answer every time. | +| [`dsl-reference.md`](dsl-reference.md) | You're writing or reading a rule file and need the full YAML grammar — every operator, every grouping construct. | +| [`audit-trail.md`](audit-trail.md) | You need to replay a verdict, diff two verdicts, or map the audit trail to EU AI Act / ISO 42001 requirements. | +| [`extending.md`](extending.md) | You want to add a rule, add a flag, point the engine at a new domain, or run the tests. 
| +| [`ooda-integration.md`](ooda-integration.md) | You want to understand how this POC slots into the OODA orchestrator and what a production wiring would look like. | + +For the research that informed the design, see [`../research/`](../research/) — five briefs covering technical landscape, regulatory / auditability, positioning, design alternatives, and risks. diff --git a/experiments/rule-engine-poc/docs/architecture.md b/experiments/rule-engine-poc/docs/architecture.md new file mode 100644 index 000000000..abcd41302 --- /dev/null +++ b/experiments/rule-engine-poc/docs/architecture.md @@ -0,0 +1,93 @@ +--- +title: Architecture +folder: experiments/rule-engine-poc/docs +description: Component layout, evaluation algorithm, and determinism strategy for the rule engine POC. +entry_point: false +--- + +# Architecture + +How the engine is organised and why it produces the same answer every time. + +## Module layout + +``` +src/ + types.ts Pure data contracts. No logic, no I/O. + hash.ts canonicalJson() + sha256(). The foundation of replayability. + engine.ts evaluate(rules, flags) -> VerdictResult. Pure function. + loader.ts YAML -> LoadedRule[] with content hash + schema validation. + html-report.ts VerdictResult -> self-contained HTML string. Pure function. + cli.ts Side-effectful shell: parse argv, read files, write outputs. +``` + +Three layers, one direction: + +``` + loader -> engine -> renderer (text | json | html) + \ / + types --- +``` + +The engine itself is a [pure function](https://en.wikipedia.org/wiki/Pure_function): same inputs, same output, no I/O, no clock, no randomness. All side effects live in `cli.ts`. + +## Evaluation algorithm + +``` +1. Load rules (already validated and content-hashed by the loader). +2. Sort by [priority desc, id asc]. This is the only ordering that matters. +3. For each rule, evaluate its `when` clause against the flags: + - `all`: every condition must match (AND). + - `any`: at least one condition must match (OR). 
+ - `not`: inverts the inner condition's match result. +4. If matched, add (verdict, weight) to a per-verdict tally. + Append the rule's actions to a deduplicated set. +5. After all rules: pick the verdict tier by severity, not weight. + Severity order: blocked > needs-attention > ready-to-progress > unknown. + Within a tier, weight only informs action prioritisation, not verdict. +6. Sort actions alphabetically. Compute rulesetHash and flagsHash. +7. Return VerdictResult — the full audit trail + verdict + provenance. +``` + +### Why severity-first, not weighted sum? + +A weighted sum is what most fact-checking systems use: each piece of evidence adds or subtracts from a score, and a threshold picks the verdict. That works when **all evidence is commensurable** — e.g., "this source supports the claim" is comparable in kind to "this source contradicts the claim". + +In our domain, evidence is **categorical**: missing EARS notation is a *gate*, not a *point deduction*. A `blocked` rule expresses "this must be fixed before progressing"; weighting it against a `ready-to-progress` rule would let positive signals drown out a real blocker. Severity-first preserves the semantics of categorical gates. + +Within a tier, weight still matters: it sorts which actions are surfaced first. + +## Determinism strategy + +Determinism is **engineered**, not assumed. 
Specific hazards we guard against: + +| Hazard | Mitigation | +|---|---| +| Object key ordering in JSON | `canonicalJson` sorts keys recursively before serialising | +| `Date.now()` / wall clock | Not used inside the engine; only the CLI's HTML report includes a timestamp | +| `Math.random()` / process IDs | Not used at all | +| `async` / event-loop interleaving | Engine is fully synchronous | +| `Set` iteration order | Actions are collected in a `Set` then explicitly `.sort()`-ed | +| Regex `lastIndex` | A fresh `new RegExp(...)` is constructed per evaluation; no `/g` flag | +| Loaded-file path differences | `rulesetHash` is computed from rule content, not file paths | + +The output of `evaluate()` is byte-identical for identical inputs. This is exercised directly by the `describe("reproducibility")` block in `test/engine.test.ts`. + +## Provenance + +Three hashes anchor a verdict to its inputs: + +- **`rulesetHash`** — `sha256(canonicalJson([{id, hash}, ...]))` over the sorted, loaded rules. Changes if any rule's content changes or if a rule is added or removed; because it is computed over the sorted rules, the order in which rules appear in the source file does not affect it. +- **Per-rule `hash`** — `sha256(canonicalJson({id, priority, when, then}))`. Description is intentionally excluded so documentation edits don't invalidate the hash. +- **`flagsHash`** — `sha256(canonicalJson(flags))`. Key order in the source JSON is irrelevant. + +Together with the `engineVersion`, these three fields make any verdict replayable: given the same engine version, same rules, and same flags, you reach the same answer. + +## What's deliberately missing + +- **No rule chaining** (`Rete`-style derived facts). At < 200 rules with a single fact set per LLM extraction, the partial-match cache is pure overhead. +- **No hot reload**. Rules are loaded once per CLI invocation. +- **No DSL macros**, **no rule inheritance**. Each rule stands on its own; copy-paste is honest. +- **No persistence**. The engine returns a value; storing it is the caller's concern.
+ +See [`research/04-technical-design.md`](../research/04-technical-design.md) for the architect's proposal of how to evolve these toward a production rule layer. diff --git a/experiments/rule-engine-poc/docs/audit-trail.md b/experiments/rule-engine-poc/docs/audit-trail.md new file mode 100644 index 000000000..746d5e919 --- /dev/null +++ b/experiments/rule-engine-poc/docs/audit-trail.md @@ -0,0 +1,113 @@ +--- +title: Audit trail and replay +folder: experiments/rule-engine-poc/docs +description: What the rule engine captures per verdict, how to replay a decision, and how this maps to EU AI Act explainability requirements. +entry_point: false +--- + +# Audit trail and replay + +Every call to `evaluate()` returns a fully self-describing `VerdictResult`. The article's core promise — "anyone can audit which sources contributed how much to a given verdict" — is met by what's in this object. + +## What gets captured + +```ts +interface VerdictResult { + verdict: Verdict; // categorical tier (severity-first) + weightedTally: WeightedTally; // weight contributed per tier + actions: string[]; // deduplicated, alphabetically sorted + evaluations: RuleEvaluation[]; // one per rule, in deterministic order + rulesetHash: string; // sha256 of (sorted rule ids + their hashes) + flagsHash: string; // sha256 of the canonical-JSON flags + engineVersion: string; // pinned at the top of engine.ts +} + +interface RuleEvaluation { + rule: LoadedRule; // including its content hash + source file + matched: boolean; + conditions: ConditionResult[]; // one entry per condition, in declaration order + contribution?: { // only present when matched + verdict: Verdict; + weight: number; + actions: string[]; + }; +} + +interface ConditionResult { + condition: Condition; // the original condition shape + matched: boolean; + observed: FlagValue | undefined; // what the engine actually saw + reason?: string; // e.g., "flag missing in extraction" +} +``` + +For any rule in `evaluations`, you can 
answer: + +- Did it match? (`matched`) +- What did each condition see? (`conditions[*].observed`) +- Why didn't it match if it should have? (`conditions[*].reason`) +- What did it contribute to the verdict? (`contribution`) +- What version of the rule did we evaluate? (`rule.hash`) +- Where is the rule defined? (`rule.sourceFile`, `rule.sourceIndex`) + +## Replay + +To replay a verdict you need three things, all identified by fields in the result: + +1. The **engine version** (`engineVersion`). +2. The **rule set** — pinnable via `rulesetHash` and the individual rule hashes. +3. The **flags** — pinnable via `flagsHash`. + +Replay procedure: + +```bash +# 1. Pin the engine at the recorded version (git tag or commit). +git checkout v0.1.0 + +# 2. Re-run the engine against the same rule file and same flags. +npx tsx src/cli.ts rules/quality-gates.yaml fixtures/ready-implementation.json --json > replay.json + +# 3. Compare hashes. +jq '.rulesetHash, .flagsHash, .verdict' replay.json +# Must match the recorded values. +``` + +Because the engine is a pure synchronous function over canonicalised inputs, the replay verdict and audit trail will be byte-identical (modulo the HTML report's timestamp footer). + +## Diffing two verdicts + +A change in a verdict between two runs has one of three causes; the fourth row of the table is the case that should never occur: + +| `engineVersion` differs | `rulesetHash` differs | `flagsHash` differs | Diagnosis | +|---|---|---|---| +| yes | — | — | Engine behaviour change. Read the engine changelog. | +| no | yes | — | Rule edit. Compare per-rule hashes to find which rules changed. | +| no | no | yes | Extraction changed. The LLM saw different inputs or produced different flags. | +| no | no | no | The verdict cannot have changed; this is a bug. | + +The per-rule `hash` lets you bisect a rule-edit diff without comparing the YAML text.
+ +## Mapping to EU AI Act explainability + +From `research/02-regulatory-auditability.md`, an auditable AI-derived decision under EU AI Act Articles 11–14 needs a per-decision dossier with the following parts. Our `VerdictResult` covers them as follows: + +| Required artifact | Covered by | Status | +|---|---|---| +| Decision envelope (input + output) | `flagsHash` + `verdict` | yes | +| Input hash | `flagsHash` | yes | +| Model invocation log | LLM extraction layer | **out of scope for the POC** — needs upstream Orient capture | +| Flags with source spans | LLM extraction layer | **out of scope for the POC** — needs spans in extraction schema | +| Ruleset version + hash | `rulesetHash` + per-rule `hash` | yes | +| Ordered rule trace | `evaluations` in deterministic order | yes | +| Verdict + rationale | `verdict` + matched `conditions` | yes | +| Confidence | n/a (verdicts are categorical) | by design | +| Human-oversight record | Caller's responsibility (Act-phase gate) | **upstream** | +| Replay manifest | `engineVersion` + the two hashes | yes | + +The pattern handles the **decide** half of the audit chain; the **observe / orient** half (provenance of the raw signals and the LLM extraction) is upstream and must be captured separately. + +## What this does not give you + +- **No fairness audit.** Severity ordering and weights encode normative choices. They are visible (in YAML) but not yet reviewed. +- **No calibration data.** Whether a verdict tier actually predicts the outcome the rule intends to predict is an open empirical question. See `research/05-risks-critique.md` for proposed Riskiest Assumption Tests. +- **No tamper evidence at rest.** The audit trail is a return value; persisting it durably (append-only log, hash chain) is the caller's job. 
diff --git a/experiments/rule-engine-poc/docs/dsl-reference.md b/experiments/rule-engine-poc/docs/dsl-reference.md new file mode 100644 index 000000000..e85fe95b8 --- /dev/null +++ b/experiments/rule-engine-poc/docs/dsl-reference.md @@ -0,0 +1,191 @@ +--- +title: Rule DSL reference +folder: experiments/rule-engine-poc/docs +description: YAML grammar for declarative rules — every condition operator and grouping construct, with examples. +entry_point: false +--- + +# Rule DSL reference + +Rules are declared as a YAML list. Each rule has a stable id, a priority, a `when` clause, and a `then` clause. + +## Top-level shape + +```yaml +- id: <string> # required, string, unique + description: <string> # required, informational only (not hashed) + stage: <string> # optional, informational tag + priority: <number> # required, higher fires earlier + when: # required, at least one of: all | any | not + all: [<condition>, ...] + any: [<condition>, ...] + not: [<condition>, ...] + then: # required + verdict: <verdict> # blocked | needs-attention | ready-to-progress | unknown + weight: <number> # informs action prioritisation within a tier + actions: [<string>, ...] # action identifiers surfaced when the rule matches + tags: [<string>, ...] # optional, informational +``` + +The `description` and `tags` fields are intentionally excluded from the rule's content hash so that documentation edits do not invalidate replay. + +## Condition shape + +A condition references one flag and applies one or more operators. If multiple operators are present on a single condition, all must match. + +```yaml +- flag: <flag-name> + eq: <value> # deep equality + ne: <value> # deep inequality + gt: <number> # observed > value (numeric only) + lt: <number> # observed < value (numeric only) + in: [<value>, ...] # observed is deep-equal to one of the listed values + regex: "<pattern>" # observed is a string that matches (no flags, fresh instance) + exists: <boolean> # observed flag is present (true) or absent (false) +``` + +## Operators by example + +### `eq` and `ne` + +```yaml +- id: ci-must-be-green + description: PR may progress when CI is passing and the build target is not legacy.
+ priority: 100 + when: + all: + - flag: ci_passing + eq: true + - flag: build_target + ne: legacy + then: + verdict: ready-to-progress + weight: 100 + actions: [advance] +``` + +`eq` works on booleans, numbers, strings, and arrays (deep equality). + +### `gt` and `lt` + +```yaml +- id: stale-after-7-days + description: PR has not been touched in over 7 days. + priority: 50 + when: + all: + - flag: days_since_last_push + gt: 7 + then: + verdict: needs-attention + weight: 25 + actions: [ping-author] +``` + +`gt` and `lt` only match when the observed value is a `number`. A string `"8"` will not match `gt: 7`. + +### `in` + +```yaml +- id: req-stage-or-later + description: Applies once we've reached requirements stage. + priority: 80 + when: + all: + - flag: current_stage + in: [requirements, design, specification, implementation, testing, review] + then: + verdict: needs-attention + weight: 10 + actions: [check-stage] +``` + +`in` does a deep-equality check against each listed value. + +### `regex` + +```yaml +- id: claude-branch-policy + description: Branches under claude/ get an extra review pass. + priority: 40 + when: + all: + - flag: branch + regex: "^claude/" + then: + verdict: needs-attention + weight: 10 + actions: [request-secondary-review] +``` + +A fresh `RegExp` is constructed each time the condition is evaluated; no shared `lastIndex`. + +### `exists` + +```yaml +- id: requires-extraction-version + description: Extraction must declare which LLM version it came from. + priority: 100 + when: + all: + - flag: extracted_by_llm_version + exists: true + then: + verdict: needs-attention + weight: 10 + actions: [require-version-tag] +``` + +`exists: false` matches when the flag key is **not** present on the extraction object. This is distinct from a flag being present with value `null`. + +## Grouping: `all`, `any`, `not` + +Combine grouping constructs in a single `when` to express compound logic. 
+ +```yaml +when: + all: # every listed condition must match + - flag: ci_passing + eq: true + - flag: has_merge_conflicts + eq: false + any: # at least one listed condition must match + - flag: approvals_count + gt: 1 + - flag: is_emergency + eq: true + not: # the inner condition must NOT match + - flag: is_blocked_label + eq: true +``` + +When multiple groups appear in one `when`, **all** groups must be satisfied (logical AND across `all` / `any` / `not`). + +## Verdicts + +The engine recognises four verdict tiers, ordered by severity: + +| Tier | Meaning | Exit code | +|---|---|---| +| `blocked` | A gate condition has failed; do not progress. | `1` | +| `needs-attention` | Soft warning; progression allowed with note. | `0` | +| `ready-to-progress` | Stage Definition of Done satisfied. | `0` | +| `unknown` | No rule matched; default fallback. | `0` | + +`blocked` always wins over any number of `ready-to-progress` rules. + +## Missing flags + +A rule that references a flag not present in the extraction fails with reason `"flag missing in extraction"`. The condition appears in the audit trail with `matched=false` and a `reason` field — surfacing the gap rather than swallowing it. + +`exists: false` is the one operator that explicitly tolerates absence: it only matches when the flag is absent. + +## Schema validation + +The loader rejects malformed rules at load time with a helpful error: + +- Missing `id`, `priority`, `when`, `then`, `then.verdict`, `then.weight`, `then.actions` +- `when` with no `all`, `any`, or `not` group +- Non-numeric `priority` or `then.weight` + +A malformed rule file is a load-time failure, not a runtime warning. 
diff --git a/experiments/rule-engine-poc/docs/extending.md b/experiments/rule-engine-poc/docs/extending.md new file mode 100644 index 000000000..3910e4dc4 --- /dev/null +++ b/experiments/rule-engine-poc/docs/extending.md @@ -0,0 +1,113 @@ +--- +title: Extending the POC +folder: experiments/rule-engine-poc/docs +description: How to add rules, flags, and fixtures; how to point the engine at a new domain; how to run the tests. +entry_point: false +--- + +# Extending the POC + +The engine is domain-agnostic. The example rule set encodes our quality framework; nothing prevents you from pointing it at a different concern. + +## Add a new rule + +1. Open `rules/quality-gates.yaml` (or create a new file under `rules/`). +2. Append a YAML entry. The minimum required fields: + + ```yaml + - id: my-new-rule + description: One sentence saying when it fires. + priority: 50 + when: + all: + - flag: my_new_flag + eq: true + then: + verdict: needs-attention + weight: 20 + actions: [my-new-action] + ``` + +3. Choose a `priority` so it lands where you want in the audit trail (`priority desc, id asc`). Use round numbers (10, 20, 50, 80, 100, 200) to leave room for later insertion. +4. Choose a `weight` thoughtfully: weight does **not** decide the verdict (severity does), but it influences action ordering within a tier. +5. Run the demo against an existing fixture to verify the rule fires when expected: + + ```bash + npx tsx src/cli.ts rules/quality-gates.yaml fixtures/<fixture-name>.json + ``` + + Look for your rule id in the audit trail. The condition rows show the observed values, so it's obvious why a rule did or didn't match. + +## Add a new flag + +Flags are arbitrary key/value pairs on the input JSON. To use one: + +1. Choose a snake_case flag name. Document the type explicitly — booleans should always be booleans, never strings like `"true"`. +2. Add the flag to the fixtures that should exercise it.
If a rule references a flag that is missing from a fixture, the condition will fail with reason `"flag missing in extraction"` — this is intentional and surfaces gaps in coverage. +3. Add a rule that references the flag. +4. Update [`docs/dsl-reference.md`](dsl-reference.md) only if you've added a new condition **operator** (not a new flag). + +## Add a new fixture + +A fixture is a single JSON object representing one Orient-quadrant extraction: + +```json +{ + "feature_slug": "my-feature", + "current_stage": "implementation", + "implementation_lint_clean": true +} +``` + +Save under `fixtures/<name>.json`. The `run-all-fixtures.mjs` and `run-all-html.mjs` scripts pick up every `*.json` file automatically. + +If the scenario should appear in `npm run demo:<name>`, add an entry to `package.json` under `scripts`. + +## Point the engine at a different domain + +The engine has no domain coupling. To use it for, say, PR readiness instead of quality gates: + +1. Create `rules/pr-readiness.yaml` with PR-domain rules (flags like `ci_passing`, `approvals_count`, etc.). +2. Create `fixtures/pr-*.json` with PR-domain extractions. +3. Invoke the CLI with the new paths: + + ```bash + npx tsx src/cli.ts rules/pr-readiness.yaml fixtures/pr-ready.json + ``` + +The verdict tiers (`blocked` / `needs-attention` / `ready-to-progress` / `unknown`) are general enough to fit most decide-quadrant use cases. If your domain genuinely needs different tiers, edit the `Verdict` union in `src/types.ts` and the `SEVERITY_ORDER` constant in `src/engine.ts`. + +## Run the tests + +```bash +npm test # one-shot +npm run test:watch +``` + +The suite covers three things in roughly equal measure: + +- **Engine semantics** — every operator, severity-first verdict, missing-flag behaviour. +- **Reproducibility** — the North Star metric; identical inputs produce identical outputs, including across JSON key reordering. +- **Loader validation** — malformed rules are caught at load time, not runtime. 
+ +When adding a new rule, prefer adding a test fixture that exercises it over adding a new unit test — fixtures double as regression cases for the audit trail's textual format. + +## Generate reports for sharing + +```bash +npm run demo:html # one-fixture HTML report under reports/ +npm run demo:html:all # one HTML report per fixture +``` + +HTML reports are self-contained (inline CSS, no external assets). Drop one in a Slack message, attach to a PR, or pipe to a static-site directory. The `reports/` folder is gitignored — these are derived artifacts. + +## When the engine itself needs to change + +Bumping `ENGINE_VERSION` in `src/engine.ts` is mandatory whenever you change: + +- The set of recognised operators +- The conflict-resolution rule (severity-first vs anything else) +- The shape of `VerdictResult`, `RuleEvaluation`, or `ConditionResult` +- The canonical-JSON algorithm or the hash algorithm + +A version bump is the auditor's signal that prior verdicts may not replay exactly. Without it, the replay guarantee in [`audit-trail.md`](audit-trail.md) silently breaks. diff --git a/experiments/rule-engine-poc/docs/ooda-integration.md b/experiments/rule-engine-poc/docs/ooda-integration.md new file mode 100644 index 000000000..2521655f8 --- /dev/null +++ b/experiments/rule-engine-poc/docs/ooda-integration.md @@ -0,0 +1,105 @@ +--- +title: OODA integration +folder: experiments/rule-engine-poc/docs +description: How this rule engine slots into the Observe / Orient / Decide / Act loop and what a production wiring would look like. +entry_point: false +--- + +# OODA integration + +This POC implements the **Decide** quadrant of the OODA orchestrator concept described in [`docs/backlog/502-idea-ooda-loop-plugin-observe-orient-decide-act.md`](../../../docs/backlog/502-idea-ooda-loop-plugin-observe-orient-decide-act.md). The other three quadrants are mocked by the fixture JSON files. 
+ +## The loop, with explicit determinism boundaries + +``` ++-----------+ +-----------+ +-----------+ +-----------+ +| OBSERVE | -> | ORIENT | -> | DECIDE | -> | ACT | ++-----------+ +-----------+ +-----------+ +-----------+ +| raw | | LLM | | rule | | execute | +| signals | | extracts | | engine | | approved | +| | | structured| | | | actions | +| det. | | flags | | det. | | det. | +| | | (stoch.) | | (this POC)| | | ++-----------+ +-----------+ +-----------+ +-----------+ + ^ ^ + | | + the only everything from + stochastic here onward is + quadrant reproducible +``` + +The article's core insight — *"LLM extracts, rules decide"* — translates to a strict rule about where stochasticity is allowed. **The LLM is confined to Orient.** Observe is mechanical. Decide is deterministic. Act is mechanical. Only Orient may "reason"; even there it may only emit structured flags, not opinions. + +## The IO contract at each boundary + +### Observe -> Orient + +A blob of raw signals collected from concrete sources. For a feature folder this might be: + +```json +{ + "feature_slug": "rule-engine-poc", + "workflow_state": "...", + "requirements_md": "...", + "open_issues": [...], + "ci_runs": [...] +} +``` + +The Observe quadrant is responsible for **provenance**: every signal should carry a citation pointer (file path + git SHA, issue URL + comment id, CI run URL) so that downstream auditing can replay the *input* to Orient, not just the input to Decide. + +### Orient -> Decide + +This POC's input. A flat, string-keyed `Record` where every key is documented and every value is a boolean, number, string, string array, or null. The LLM's job is constrained to: + +``` +Given the raw signals above, set each flag below to the value that the +signals support. If the signals are insufficient to decide a flag, leave +the flag out — do not guess. +``` + +Any flag the LLM cannot decide is **absent**, not `false`. 
A rule that requires the flag will fail at evaluation with reason `"flag missing in extraction"` and surface in the audit trail. This is the article's "extraction schema misses an edge case" failure mode made visible. + +### Decide -> Act + +The `VerdictResult` returned by `evaluate()`. The Act quadrant consumes: + +- `verdict` — gates whether any Act runs at all (often `blocked` halts the loop). +- `actions` — the proposed list of side effects, alphabetically sorted. +- `evaluations` — the audit trail to attach to whatever Act produces. + +Crucially, the Act quadrant **does not re-derive actions** from `flags` — only from `actions`. This keeps the decide-from-act separation honest. + +## What a production wiring would look like + +A real OODA loop using this engine has four extra concerns: + +1. **Observe-quadrant persistence.** The raw signals + their provenance need to be hashed and stored so `flagsHash` traces back to evidence. Suggested format: an [append-only `ndjson` log](https://github.com/ndjson/ndjson-spec) indexed by `(loop_id, observed_at)`. + +2. **Orient-quadrant prompt versioning.** The LLM prompt that constrains extraction is itself a piece of the audit chain. Pin it with a content hash and include the hash in the extraction output: + + ```json + { "extracted_by_prompt_version": "sha256:abc...", "..._flags...": "..." } + ``` + +3. **Rule lifecycle governance.** Adding, removing, or re-weighting a rule should be a reviewed change with an issue, a PR, and a changelog entry. The bumped `rulesetHash` should appear in release notes. + +4. **Act-quadrant human gate.** For a first deployment, every action should require explicit human approval before execution. Auto-act on `ready-to-progress` should be opt-in per action class and only after empirical evidence that the rule for that class has the false-positive rate we tolerate. 
+ +## Where this fits in the Specorator workflow + +The engine is well-suited to be invoked from two places in the Specorator lifecycle: + +- **Inside `/quality:status` and `/spec:review`** — replaces the "judgment call" portion of a Stage-9 review with a transparent, reproducible verdict. The qualitative reviewer agent still runs; its job becomes corroborating or contesting the rule-engine verdict, not generating one from scratch. +- **Between stage handoffs** — the orchestrator can poll the engine when the user asks "what's next?". If `verdict == ready-to-progress`, advance. If `blocked`, surface the matched rules and proposed actions in the conversation. + +Both of these wirings are downstream of this POC. The artifact this POC produces — a stable `VerdictResult` contract with three replay anchors — is the integration surface. + +## Reading the research + +The research wave in [`research/`](../research/) has four artifacts directly relevant to integration: + +- [`research/03-positioning-jtbd.md`](../research/03-positioning-jtbd.md) — the strategist's framing of where the rule engine sharpens Specorator's value prop. +- [`research/04-technical-design.md`](../research/04-technical-design.md) — the architect's deeper sketch including production evolution concerns (priority queue, hot reload, calibration harness). +- [`research/02-regulatory-auditability.md`](../research/02-regulatory-auditability.md) — what the audit trail must capture end-to-end (not just in Decide) to be EU-AI-Act-defensible. +- [`research/05-risks-critique.md`](../research/05-risks-critique.md) — the critic's three Riskiest Assumption Tests that should pass before production wiring is attempted. 
diff --git a/experiments/rule-engine-poc/fixtures/blocked-missing-ears.json b/experiments/rule-engine-poc/fixtures/blocked-missing-ears.json new file mode 100644 index 000000000..dff54e527 --- /dev/null +++ b/experiments/rule-engine-poc/fixtures/blocked-missing-ears.json @@ -0,0 +1,14 @@ +{ + "feature_slug": "auth-refresh", + "current_stage": "requirements", + + "requirements_have_stable_ids": true, + "requirements_ears_coverage": 0.6, + "requirements_acceptance_criteria_testable": true, + + "open_clarifications_count": 0, + "blockers_count": 0, + "s1_findings_count": 0, + "s2_findings_count": 0, + "s3_findings_count": 1 +} diff --git a/experiments/rule-engine-poc/fixtures/blocked-s1-finding.json b/experiments/rule-engine-poc/fixtures/blocked-s1-finding.json new file mode 100644 index 000000000..0e26ab168 --- /dev/null +++ b/experiments/rule-engine-poc/fixtures/blocked-s1-finding.json @@ -0,0 +1,21 @@ +{ + "feature_slug": "payment-flow", + "current_stage": "testing", + + "requirements_have_stable_ids": true, + "requirements_ears_coverage": 1.0, + "spec_each_item_traces_to_requirement": true, + + "implementation_lint_clean": true, + "implementation_types_clean": true, + "implementation_unit_tests_pass": true, + + "testing_ears_test_coverage": 0.95, + "testing_critical_paths_covered": true, + + "open_clarifications_count": 0, + "blockers_count": 0, + "s1_findings_count": 1, + "s2_findings_count": 0, + "s3_findings_count": 0 +} diff --git a/experiments/rule-engine-poc/fixtures/needs-attention-design-risks.json b/experiments/rule-engine-poc/fixtures/needs-attention-design-risks.json new file mode 100644 index 000000000..c3c39f7cc --- /dev/null +++ b/experiments/rule-engine-poc/fixtures/needs-attention-design-risks.json @@ -0,0 +1,17 @@ +{ + "feature_slug": "search-relevance", + "current_stage": "design", + + "requirements_have_stable_ids": true, + "requirements_ears_coverage": 1.0, + "requirements_acceptance_criteria_testable": true, + + 
"design_irreversible_have_adrs": true, + "design_risks_have_mitigations": false, + + "open_clarifications_count": 0, + "blockers_count": 0, + "s1_findings_count": 0, + "s2_findings_count": 2, + "s3_findings_count": 3 +} diff --git a/experiments/rule-engine-poc/fixtures/ready-idea.json b/experiments/rule-engine-poc/fixtures/ready-idea.json new file mode 100644 index 000000000..a221bfcd3 --- /dev/null +++ b/experiments/rule-engine-poc/fixtures/ready-idea.json @@ -0,0 +1,14 @@ +{ + "feature_slug": "telemetry-opt-in", + "current_stage": "idea", + + "idea_problem_statement_present": true, + "idea_target_users_named": true, + "idea_scope_bounded": true, + + "open_clarifications_count": 0, + "blockers_count": 0, + "s1_findings_count": 0, + "s2_findings_count": 0, + "s3_findings_count": 0 +} diff --git a/experiments/rule-engine-poc/fixtures/ready-implementation.json b/experiments/rule-engine-poc/fixtures/ready-implementation.json new file mode 100644 index 000000000..2b18b0be8 --- /dev/null +++ b/experiments/rule-engine-poc/fixtures/ready-implementation.json @@ -0,0 +1,23 @@ +{ + "feature_slug": "rule-engine-poc", + "current_stage": "implementation", + + "requirements_have_stable_ids": true, + "requirements_ears_coverage": 1.0, + "requirements_acceptance_criteria_testable": true, + + "design_irreversible_have_adrs": true, + "design_risks_have_mitigations": true, + + "spec_each_item_traces_to_requirement": true, + + "implementation_lint_clean": true, + "implementation_types_clean": true, + "implementation_unit_tests_pass": true, + + "open_clarifications_count": 0, + "blockers_count": 0, + "s1_findings_count": 0, + "s2_findings_count": 0, + "s3_findings_count": 0 +} diff --git a/experiments/rule-engine-poc/package-lock.json b/experiments/rule-engine-poc/package-lock.json new file mode 100644 index 000000000..8482a41f0 --- /dev/null +++ b/experiments/rule-engine-poc/package-lock.json @@ -0,0 +1,1994 @@ +{ + "name": "@experiments/rule-engine-poc", + "version": "0.0.1", + 
"lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "@experiments/rule-engine-poc", + "version": "0.0.1", + "dependencies": { + "js-yaml": "^4.1.0" + }, + "devDependencies": { + "@types/js-yaml": "^4.0.9", + "@types/node": "^22.0.0", + "tsx": "^4.19.0", + "typescript": "^5.6.0", + "vitest": "^2.1.0" + } + }, + "node_modules/@esbuild/aix-ppc64": { + "version": "0.28.0", + "resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.28.0.tgz", + "integrity": "sha512-lhRUCeuOyJQURhTxl4WkpFTjIsbDayJHih5kZC1giwE+MhIzAb7mEsQMqMf18rHLsrb5qI1tafG20mLxEWcWlA==", + "cpu": [ + "ppc64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "aix" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/android-arm": { + "version": "0.28.0", + "resolved": "https://registry.npmjs.org/@esbuild/android-arm/-/android-arm-0.28.0.tgz", + "integrity": "sha512-wqh0ByljabXLKHeWXYLqoJ5jKC4XBaw6Hk08OfMrCRd2nP2ZQ5eleDZC41XHyCNgktBGYMbqnrJKq/K/lzPMSQ==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/android-arm64": { + "version": "0.28.0", + "resolved": "https://registry.npmjs.org/@esbuild/android-arm64/-/android-arm64-0.28.0.tgz", + "integrity": "sha512-+WzIXQOSaGs33tLEgYPYe/yQHf0WTU0X42Jca3y8NWMbUVhp7rUnw+vAsRC/QiDrdD31IszMrZy+qwPOPjd+rw==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/android-x64": { + "version": "0.28.0", + "resolved": "https://registry.npmjs.org/@esbuild/android-x64/-/android-x64-0.28.0.tgz", + "integrity": "sha512-+VJggoaKhk2VNNqVL7f6S189UzShHC/mR9EE8rDdSkdpN0KflSwWY/gWjDrNxxisg8Fp1ZCD9jLMo4m0OUfeUA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=18" + } + }, + 
"node_modules/@esbuild/darwin-arm64": { + "version": "0.28.0", + "resolved": "https://registry.npmjs.org/@esbuild/darwin-arm64/-/darwin-arm64-0.28.0.tgz", + "integrity": "sha512-0T+A9WZm+bZ84nZBtk1ckYsOvyA3x7e2Acj1KdVfV4/2tdG4fzUp91YHx+GArWLtwqp77pBXVCPn2We7Letr0Q==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/darwin-x64": { + "version": "0.28.0", + "resolved": "https://registry.npmjs.org/@esbuild/darwin-x64/-/darwin-x64-0.28.0.tgz", + "integrity": "sha512-fyzLm/DLDl/84OCfp2f/XQ4flmORsjU7VKt8HLjvIXChJoFFOIL6pLJPH4Yhd1n1gGFF9mPwtlN5Wf82DZs+LQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/freebsd-arm64": { + "version": "0.28.0", + "resolved": "https://registry.npmjs.org/@esbuild/freebsd-arm64/-/freebsd-arm64-0.28.0.tgz", + "integrity": "sha512-l9GeW5UZBT9k9brBYI+0WDffcRxgHQD8ShN2Ur4xWq/NFzUKm3k5lsH4PdaRgb2w7mI9u61nr2gI2mLI27Nh3Q==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/freebsd-x64": { + "version": "0.28.0", + "resolved": "https://registry.npmjs.org/@esbuild/freebsd-x64/-/freebsd-x64-0.28.0.tgz", + "integrity": "sha512-BXoQai/A0wPO6Es3yFJ7APCiKGc1tdAEOgeTNy3SsB491S3aHn4S4r3e976eUnPdU+NbdtmBuLncYir2tMU9Nw==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-arm": { + "version": "0.28.0", + "resolved": "https://registry.npmjs.org/@esbuild/linux-arm/-/linux-arm-0.28.0.tgz", + "integrity": "sha512-CjaaREJagqJp7iTaNQjjidaNbCKYcd4IDkzbwwxtSvjI7NZm79qiHc8HqciMddQ6CKvJT6aBd8lO9kN/ZudLlw==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + 
"optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-arm64": { + "version": "0.28.0", + "resolved": "https://registry.npmjs.org/@esbuild/linux-arm64/-/linux-arm64-0.28.0.tgz", + "integrity": "sha512-RVyzfb3FWsGA55n6WY0MEIEPURL1FcbhFE6BffZEMEekfCzCIMtB5yyDcFnVbTnwk+CLAgTujmV/Lgvih56W+A==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-ia32": { + "version": "0.28.0", + "resolved": "https://registry.npmjs.org/@esbuild/linux-ia32/-/linux-ia32-0.28.0.tgz", + "integrity": "sha512-KBnSTt1kxl9x70q+ydterVdl+Cn0H18ngRMRCEQfrbqdUuntQQ0LoMZv47uB97NljZFzY6HcfqEZ2SAyIUTQBQ==", + "cpu": [ + "ia32" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-loong64": { + "version": "0.28.0", + "resolved": "https://registry.npmjs.org/@esbuild/linux-loong64/-/linux-loong64-0.28.0.tgz", + "integrity": "sha512-zpSlUce1mnxzgBADvxKXX5sl8aYQHo2ezvMNI8I0lbblJtp8V4odlm3Yzlj7gPyt3T8ReksE6bK+pT3WD+aJRg==", + "cpu": [ + "loong64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-mips64el": { + "version": "0.28.0", + "resolved": "https://registry.npmjs.org/@esbuild/linux-mips64el/-/linux-mips64el-0.28.0.tgz", + "integrity": "sha512-2jIfP6mmjkdmeTlsX/9vmdmhBmKADrWqN7zcdtHIeNSCH1SqIoNI63cYsjQR8J+wGa4Y5izRcSHSm8K3QWmk3w==", + "cpu": [ + "mips64el" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-ppc64": { + "version": "0.28.0", + "resolved": "https://registry.npmjs.org/@esbuild/linux-ppc64/-/linux-ppc64-0.28.0.tgz", + "integrity": 
"sha512-bc0FE9wWeC0WBm49IQMPSPILRocGTQt3j5KPCA8os6VprfuJ7KD+5PzESSrJ6GmPIPJK965ZJHTUlSA6GNYEhg==", + "cpu": [ + "ppc64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-riscv64": { + "version": "0.28.0", + "resolved": "https://registry.npmjs.org/@esbuild/linux-riscv64/-/linux-riscv64-0.28.0.tgz", + "integrity": "sha512-SQPZOwoTTT/HXFXQJG/vBX8sOFagGqvZyXcgLA3NhIqcBv1BJU1d46c0rGcrij2B56Z2rNiSLaZOYW5cUk7yLQ==", + "cpu": [ + "riscv64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-s390x": { + "version": "0.28.0", + "resolved": "https://registry.npmjs.org/@esbuild/linux-s390x/-/linux-s390x-0.28.0.tgz", + "integrity": "sha512-SCfR0HN8CEEjnYnySJTd2cw0k9OHB/YFzt5zgJEwa+wL/T/raGWYMBqwDNAC6dqFKmJYZoQBRfHjgwLHGSrn3Q==", + "cpu": [ + "s390x" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-x64": { + "version": "0.28.0", + "resolved": "https://registry.npmjs.org/@esbuild/linux-x64/-/linux-x64-0.28.0.tgz", + "integrity": "sha512-us0dSb9iFxIi8srnpl931Nvs65it/Jd2a2K3qs7fz2WfGPHqzfzZTfec7oxZJRNPXPnNYZtanmRc4AL/JwVzHQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/netbsd-arm64": { + "version": "0.28.0", + "resolved": "https://registry.npmjs.org/@esbuild/netbsd-arm64/-/netbsd-arm64-0.28.0.tgz", + "integrity": "sha512-CR/RYotgtCKwtftMwJlUU7xCVNg3lMYZ0RzTmAHSfLCXw3NtZtNpswLEj/Kkf6kEL3Gw+BpOekRX0BYCtklhUw==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "netbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/netbsd-x64": { + "version": "0.28.0", + "resolved": 
"https://registry.npmjs.org/@esbuild/netbsd-x64/-/netbsd-x64-0.28.0.tgz", + "integrity": "sha512-nU1yhmYutL+fQ71Kxnhg8uEOdC0pwEW9entHykTgEbna2pw2dkbFSMeqjjyHZoCmt8SBkOSvV+yNmm94aUrrqw==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "netbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/openbsd-arm64": { + "version": "0.28.0", + "resolved": "https://registry.npmjs.org/@esbuild/openbsd-arm64/-/openbsd-arm64-0.28.0.tgz", + "integrity": "sha512-cXb5vApOsRsxsEl4mcZ1XY3D4DzcoMxR/nnc4IyqYs0rTI8ZKmW6kyyg+11Z8yvgMfAEldKzP7AdP64HnSC/6g==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/openbsd-x64": { + "version": "0.28.0", + "resolved": "https://registry.npmjs.org/@esbuild/openbsd-x64/-/openbsd-x64-0.28.0.tgz", + "integrity": "sha512-8wZM2qqtv9UP3mzy7HiGYNH/zjTA355mpeuA+859TyR+e+Tc08IHYpLJuMsfpDJwoLo1ikIJI8jC3GFjnRClzA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/openharmony-arm64": { + "version": "0.28.0", + "resolved": "https://registry.npmjs.org/@esbuild/openharmony-arm64/-/openharmony-arm64-0.28.0.tgz", + "integrity": "sha512-FLGfyizszcef5C3YtoyQDACyg95+dndv79i2EekILBofh5wpCa1KuBqOWKrEHZg3zrL3t5ouE5jgr94vA+Wb2w==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openharmony" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/sunos-x64": { + "version": "0.28.0", + "resolved": "https://registry.npmjs.org/@esbuild/sunos-x64/-/sunos-x64-0.28.0.tgz", + "integrity": "sha512-1ZgjUoEdHZZl/YlV76TSCz9Hqj9h9YmMGAgAPYd+q4SicWNX3G5GCyx9uhQWSLcbvPW8Ni7lj4gDa1T40akdlw==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "sunos" + ], + "engines": { + 
"node": ">=18" + } + }, + "node_modules/@esbuild/win32-arm64": { + "version": "0.28.0", + "resolved": "https://registry.npmjs.org/@esbuild/win32-arm64/-/win32-arm64-0.28.0.tgz", + "integrity": "sha512-Q9StnDmQ/enxnpxCCLSg0oo4+34B9TdXpuyPeTedN/6+iXBJ4J+zwfQI28u/Jl40nOYAxGoNi7mFP40RUtkmUA==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/win32-ia32": { + "version": "0.28.0", + "resolved": "https://registry.npmjs.org/@esbuild/win32-ia32/-/win32-ia32-0.28.0.tgz", + "integrity": "sha512-zF3ag/gfiCe6U2iczcRzSYJKH1DCI+ByzSENHlM2FcDbEeo5Zd2C86Aq0tKUYAJJ1obRP84ymxIAksZUcdztHA==", + "cpu": [ + "ia32" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/win32-x64": { + "version": "0.28.0", + "resolved": "https://registry.npmjs.org/@esbuild/win32-x64/-/win32-x64-0.28.0.tgz", + "integrity": "sha512-pEl1bO9mfAmIC+tW5btTmrKaujg3zGtUmWNdCw/xs70FBjwAL3o9OEKNHvNmnyylD6ubxUERiEhdsL0xBQ9efw==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@jridgewell/sourcemap-codec": { + "version": "1.5.5", + "resolved": "https://registry.npmjs.org/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.5.5.tgz", + "integrity": "sha512-cYQ9310grqxueWbl+WuIUIaiUaDcj7WOq5fVhEljNVgRfOUhY9fy2zTvfoqWsnebh8Sl70VScFbICvJnLKB0Og==", + "dev": true, + "license": "MIT" + }, + "node_modules/@rollup/rollup-android-arm-eabi": { + "version": "4.60.4", + "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.60.4.tgz", + "integrity": "sha512-F5QXMSiFebS9hKZj02XhWLLnRpJ3B3AROP0tWbFBSj+6kCbg5m9j5JoHKd4mmSVy5mS/IMQloYgYxCuJC0fxEQ==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ] + }, + 
"node_modules/@rollup/rollup-android-arm64": { + "version": "4.60.4", + "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm64/-/rollup-android-arm64-4.60.4.tgz", + "integrity": "sha512-GxxTKApUpzRhof7poWvCJHRF51C67u1R7D6DiluBE8wKU1u5GWE8t+v81JvJYtbawoBFX1hLv5Ei4eVjkWokaw==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ] + }, + "node_modules/@rollup/rollup-darwin-arm64": { + "version": "4.60.4", + "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-arm64/-/rollup-darwin-arm64-4.60.4.tgz", + "integrity": "sha512-tua0TaJxMOB1R0V0RS1jFZ/RpURFDJIOR2A6jWwQeawuFyS4gBW+rntLRaQd0EQ4bd6Vp44Z2rXW+YYDBsj6IA==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ] + }, + "node_modules/@rollup/rollup-darwin-x64": { + "version": "4.60.4", + "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-x64/-/rollup-darwin-x64-4.60.4.tgz", + "integrity": "sha512-CSKq7MsP+5PFIcydhAiR1K0UhEI1A2jWXVKHPCBZ151yOutENwvnPocgVHkivu2kviURtCEB6zUQw0vs8RrhMg==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ] + }, + "node_modules/@rollup/rollup-freebsd-arm64": { + "version": "4.60.4", + "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-arm64/-/rollup-freebsd-arm64-4.60.4.tgz", + "integrity": "sha512-+O8OkVdyvXMtJEciu2wS/pzm1IxntEEQx3z5TAVy4l32G0etZn+RsA48ARRrFm6Ri8fvqPQfgrvNxSjKAbnd3g==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ] + }, + "node_modules/@rollup/rollup-freebsd-x64": { + "version": "4.60.4", + "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-x64/-/rollup-freebsd-x64-4.60.4.tgz", + "integrity": "sha512-Iw3oMskH3AfNuhU0MSN7vNbdi4me/NiYo2azqPz/Le16zHSa+3RRmliCMWWQmh4lcndccU40xcJuTYJZxNo/lw==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ] + }, 
+ "node_modules/@rollup/rollup-linux-arm-gnueabihf": { + "version": "4.60.4", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-gnueabihf/-/rollup-linux-arm-gnueabihf-4.60.4.tgz", + "integrity": "sha512-EIPRXTVQpHyF8WOo219AD2yEltPehLTcTMz2fn6JsatLYSzQf00hj3rulF+yauOlF9/FtM2WpkT/hJh/KJFGhA==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-arm-musleabihf": { + "version": "4.60.4", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-musleabihf/-/rollup-linux-arm-musleabihf-4.60.4.tgz", + "integrity": "sha512-J3Yh9PzzF1Ovah2At+lHiGQdsYgArxBbXv/zHfSyaiFQEqvNv7DcW98pCrmdjCZBrqBiKrKKe2V+aaSGWuBe/w==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-arm64-gnu": { + "version": "4.60.4", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-gnu/-/rollup-linux-arm64-gnu-4.60.4.tgz", + "integrity": "sha512-BFDEZMYfUvLn37ONE1yMBojPxnMlTFsdyNoqncT0qFq1mAfllL+ATMMJd8TeuVMiX84s1KbcxcZbXInmcO2mRg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-arm64-musl": { + "version": "4.60.4", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-musl/-/rollup-linux-arm64-musl-4.60.4.tgz", + "integrity": "sha512-pc9EYOSlOgdQ2uPl1o9PF6/kLSgaUosia7gOuS8mB69IxJvlclko1MECXysjs5ryez1/5zjYqx3+xYU0TU6R1A==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-loong64-gnu": { + "version": "4.60.4", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-loong64-gnu/-/rollup-linux-loong64-gnu-4.60.4.tgz", + "integrity": "sha512-NxnomyxYerDh5n4iLrNa+sH+Z+U4BMEE46V2PgQ/hoB909i8gV1M5wPojWg9fk1jWpO3IQnOs20K4wyZuFLEFQ==", + "cpu": [ + "loong64" + ], + 
"dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-loong64-musl": { + "version": "4.60.4", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-loong64-musl/-/rollup-linux-loong64-musl-4.60.4.tgz", + "integrity": "sha512-nbJnQ8a3z1mtmrwImCYhc6BGpThAyYVRQxw9uKSKG4wR6aAYno9sVjJ0zaZcW9BPJX1GbrDPf+SvdWjgTuDmnw==", + "cpu": [ + "loong64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-ppc64-gnu": { + "version": "4.60.4", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-ppc64-gnu/-/rollup-linux-ppc64-gnu-4.60.4.tgz", + "integrity": "sha512-2EU6acNrQLd8tYvo/LXW535wupT3m6fo7HKo6lr7ktQoItxTyOL1ZCR/GfGCuXl2vR+zmfI6eRXkSemafv+iVg==", + "cpu": [ + "ppc64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-ppc64-musl": { + "version": "4.60.4", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-ppc64-musl/-/rollup-linux-ppc64-musl-4.60.4.tgz", + "integrity": "sha512-WeBtoMuaMxiiIrO2IYP3xs6GMWkJP2C0EoT8beTLkUPmzV1i/UcOSVw1d5r9KBODtHKilG5yFxsGRnBbK3wJ4A==", + "cpu": [ + "ppc64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-riscv64-gnu": { + "version": "4.60.4", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-gnu/-/rollup-linux-riscv64-gnu-4.60.4.tgz", + "integrity": "sha512-FJHFfqpKUI3A10WrWKiFbBZ7yVbGT4q4B5o1qKFFojqpaYoh9LrQgqWCmmcxQzVSXYtyB5bzkXrYzlHTs21MYA==", + "cpu": [ + "riscv64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-riscv64-musl": { + "version": "4.60.4", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-musl/-/rollup-linux-riscv64-musl-4.60.4.tgz", + "integrity": 
"sha512-mcEl6CUT5IAUmQf1m9FYSmVqCJlpQ8r8eyftFUHG8i9OhY7BkBXSUdnLH5DOf0wCOjcP9v/QO93zpmF1SptCCw==", + "cpu": [ + "riscv64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-s390x-gnu": { + "version": "4.60.4", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-s390x-gnu/-/rollup-linux-s390x-gnu-4.60.4.tgz", + "integrity": "sha512-ynt3JxVd2w2buzoKDWIyiV1pJW93xlQic1THVLXilz429oijRpSHivZAgp65KBu+cMcgf1eVVjdnTLvPxgCuoQ==", + "cpu": [ + "s390x" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-x64-gnu": { + "version": "4.60.4", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-gnu/-/rollup-linux-x64-gnu-4.60.4.tgz", + "integrity": "sha512-Boiz5+MsaROEWDf+GGEwF8VMHGhlUoQMtIPjOgA5fv4osupqTVnJteQNKJwUcnUog2G55jYXH7KZFFiJe0TEzQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-x64-musl": { + "version": "4.60.4", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-musl/-/rollup-linux-x64-musl-4.60.4.tgz", + "integrity": "sha512-+qfSY27qIrFfI/Hom04KYFw3GKZSGU4lXus51wsb5EuySfFlWRwjkKWoE9emgRw/ukoT4Udsj4W/+xxG8VbPKg==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-openbsd-x64": { + "version": "4.60.4", + "resolved": "https://registry.npmjs.org/@rollup/rollup-openbsd-x64/-/rollup-openbsd-x64-4.60.4.tgz", + "integrity": "sha512-VpTfOPHgVXEBeeR8hZ2O0F3aSso+JDWqTWmTmzcQKted54IAdUVbxE+j/MVxUsKa8L20HJhv3vUezVPoquqWjA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openbsd" + ] + }, + "node_modules/@rollup/rollup-openharmony-arm64": { + "version": "4.60.4", + "resolved": 
"https://registry.npmjs.org/@rollup/rollup-openharmony-arm64/-/rollup-openharmony-arm64-4.60.4.tgz", + "integrity": "sha512-IPOsh5aRYuLv/nkU51X10Bf75Bsf6+gZdx1X+QP5QM6lIJFHHqbHLG0uJn/hWthzo13UAc2umiUorqZy3axoZg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openharmony" + ] + }, + "node_modules/@rollup/rollup-win32-arm64-msvc": { + "version": "4.60.4", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-arm64-msvc/-/rollup-win32-arm64-msvc-4.60.4.tgz", + "integrity": "sha512-4QzE9E81OohJ/HKzHhsqU+zcYYojVOXlFMs1DdyMT6qXl/niOH7AVElmmEdUNHHS/oRkc++d5k6Vy85zFs0DEw==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@rollup/rollup-win32-ia32-msvc": { + "version": "4.60.4", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-ia32-msvc/-/rollup-win32-ia32-msvc-4.60.4.tgz", + "integrity": "sha512-zTPgT1YuHHcd+Tmx7h8aml0FWFVelV5N54oHow9SLj+GfoDy/huQ+UV396N/C7KpMDMiPspRktzM1/0r1usYEA==", + "cpu": [ + "ia32" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@rollup/rollup-win32-x64-gnu": { + "version": "4.60.4", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-x64-gnu/-/rollup-win32-x64-gnu-4.60.4.tgz", + "integrity": "sha512-DRS4G7mi9lJxqEDezIkKCaUIKCrLUUDCUaCsTPCi/rtqaC6D/jjwslMQyiDU50Ka0JKpeXeRBFBAXwArY52vBw==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@rollup/rollup-win32-x64-msvc": { + "version": "4.60.4", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-x64-msvc/-/rollup-win32-x64-msvc-4.60.4.tgz", + "integrity": "sha512-QVTUovf40zgTqlFVrKA1uXMVvU2QWEFWfAH8Wdc48IxLvrJMQVMBRjuQyUpzZCDkakImib9eVazbWlC6ksWtJw==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@types/estree": { + 
"version": "1.0.9", + "resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.9.tgz", + "integrity": "sha512-GhdPgy1el4/ImP05X05Uw4cw2/M93BCUmnEvWZNStlCzEKME4Fkk+YpoA5OiHNQmoS7Cafb8Xa3Pya8m1Qrzeg==", + "dev": true, + "license": "MIT" + }, + "node_modules/@types/js-yaml": { + "version": "4.0.9", + "resolved": "https://registry.npmjs.org/@types/js-yaml/-/js-yaml-4.0.9.tgz", + "integrity": "sha512-k4MGaQl5TGo/iipqb2UDG2UwjXziSWkh0uysQelTlJpX1qGlpUZYm8PnO4DxG1qBomtJUdYJ6qR6xdIah10JLg==", + "dev": true, + "license": "MIT" + }, + "node_modules/@types/node": { + "version": "22.19.19", + "resolved": "https://registry.npmjs.org/@types/node/-/node-22.19.19.tgz", + "integrity": "sha512-dyh/xO2Fh5bYrfWaaqGrRQQGkNdmYw6AmaAUvYeUMNTWQtvb796ikLdmTchRmOlOiIJ1TDXfWgVx1QkUlQ6Hew==", + "dev": true, + "license": "MIT", + "dependencies": { + "undici-types": "~6.21.0" + } + }, + "node_modules/@vitest/expect": { + "version": "2.1.9", + "resolved": "https://registry.npmjs.org/@vitest/expect/-/expect-2.1.9.tgz", + "integrity": "sha512-UJCIkTBenHeKT1TTlKMJWy1laZewsRIzYighyYiJKZreqtdxSos/S1t+ktRMQWu2CKqaarrkeszJx1cgC5tGZw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@vitest/spy": "2.1.9", + "@vitest/utils": "2.1.9", + "chai": "^5.1.2", + "tinyrainbow": "^1.2.0" + }, + "funding": { + "url": "https://opencollective.com/vitest" + } + }, + "node_modules/@vitest/mocker": { + "version": "2.1.9", + "resolved": "https://registry.npmjs.org/@vitest/mocker/-/mocker-2.1.9.tgz", + "integrity": "sha512-tVL6uJgoUdi6icpxmdrn5YNo3g3Dxv+IHJBr0GXHaEdTcw3F+cPKnsXFhli6nO+f/6SDKPHEK1UN+k+TQv0Ehg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@vitest/spy": "2.1.9", + "estree-walker": "^3.0.3", + "magic-string": "^0.30.12" + }, + "funding": { + "url": "https://opencollective.com/vitest" + }, + "peerDependencies": { + "msw": "^2.4.9", + "vite": "^5.0.0" + }, + "peerDependenciesMeta": { + "msw": { + "optional": true + }, + "vite": { + "optional": true + } + } + }, + 
"node_modules/@vitest/pretty-format": { + "version": "2.1.9", + "resolved": "https://registry.npmjs.org/@vitest/pretty-format/-/pretty-format-2.1.9.tgz", + "integrity": "sha512-KhRIdGV2U9HOUzxfiHmY8IFHTdqtOhIzCpd8WRdJiE7D/HUcZVD0EgQCVjm+Q9gkUXWgBvMmTtZgIG48wq7sOQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "tinyrainbow": "^1.2.0" + }, + "funding": { + "url": "https://opencollective.com/vitest" + } + }, + "node_modules/@vitest/runner": { + "version": "2.1.9", + "resolved": "https://registry.npmjs.org/@vitest/runner/-/runner-2.1.9.tgz", + "integrity": "sha512-ZXSSqTFIrzduD63btIfEyOmNcBmQvgOVsPNPe0jYtESiXkhd8u2erDLnMxmGrDCwHCCHE7hxwRDCT3pt0esT4g==", + "dev": true, + "license": "MIT", + "dependencies": { + "@vitest/utils": "2.1.9", + "pathe": "^1.1.2" + }, + "funding": { + "url": "https://opencollective.com/vitest" + } + }, + "node_modules/@vitest/snapshot": { + "version": "2.1.9", + "resolved": "https://registry.npmjs.org/@vitest/snapshot/-/snapshot-2.1.9.tgz", + "integrity": "sha512-oBO82rEjsxLNJincVhLhaxxZdEtV0EFHMK5Kmx5sJ6H9L183dHECjiefOAdnqpIgT5eZwT04PoggUnW88vOBNQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@vitest/pretty-format": "2.1.9", + "magic-string": "^0.30.12", + "pathe": "^1.1.2" + }, + "funding": { + "url": "https://opencollective.com/vitest" + } + }, + "node_modules/@vitest/spy": { + "version": "2.1.9", + "resolved": "https://registry.npmjs.org/@vitest/spy/-/spy-2.1.9.tgz", + "integrity": "sha512-E1B35FwzXXTs9FHNK6bDszs7mtydNi5MIfUWpceJ8Xbfb1gBMscAnwLbEu+B44ed6W3XjL9/ehLPHR1fkf1KLQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "tinyspy": "^3.0.2" + }, + "funding": { + "url": "https://opencollective.com/vitest" + } + }, + "node_modules/@vitest/utils": { + "version": "2.1.9", + "resolved": "https://registry.npmjs.org/@vitest/utils/-/utils-2.1.9.tgz", + "integrity": "sha512-v0psaMSkNJ3A2NMrUEHFRzJtDPFn+/VWZ5WxImB21T9fjucJRmS7xCS3ppEnARb9y11OAzaD+P2Ps+b+BGX5iQ==", + "dev": true, + "license": "MIT", + 
"dependencies": { + "@vitest/pretty-format": "2.1.9", + "loupe": "^3.1.2", + "tinyrainbow": "^1.2.0" + }, + "funding": { + "url": "https://opencollective.com/vitest" + } + }, + "node_modules/argparse": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/argparse/-/argparse-2.0.1.tgz", + "integrity": "sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q==", + "license": "Python-2.0" + }, + "node_modules/assertion-error": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/assertion-error/-/assertion-error-2.0.1.tgz", + "integrity": "sha512-Izi8RQcffqCeNVgFigKli1ssklIbpHnCYc6AknXGYoB6grJqyeby7jv12JUQgmTAnIDnbck1uxksT4dzN3PWBA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=12" + } + }, + "node_modules/cac": { + "version": "6.7.14", + "resolved": "https://registry.npmjs.org/cac/-/cac-6.7.14.tgz", + "integrity": "sha512-b6Ilus+c3RrdDk+JhLKUAQfzzgLEPy6wcXqS7f/xe1EETvsDP6GORG7SFuOs6cID5YkqchW/LXZbX5bc8j7ZcQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=8" + } + }, + "node_modules/chai": { + "version": "5.3.3", + "resolved": "https://registry.npmjs.org/chai/-/chai-5.3.3.tgz", + "integrity": "sha512-4zNhdJD/iOjSH0A05ea+Ke6MU5mmpQcbQsSOkgdaUMJ9zTlDTD/GYlwohmIE2u0gaxHYiVHEn1Fw9mZ/ktJWgw==", + "dev": true, + "license": "MIT", + "dependencies": { + "assertion-error": "^2.0.1", + "check-error": "^2.1.1", + "deep-eql": "^5.0.1", + "loupe": "^3.1.0", + "pathval": "^2.0.0" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/check-error": { + "version": "2.1.3", + "resolved": "https://registry.npmjs.org/check-error/-/check-error-2.1.3.tgz", + "integrity": "sha512-PAJdDJusoxnwm1VwW07VWwUN1sl7smmC3OKggvndJFadxxDRyFJBX/ggnu/KE4kQAB7a3Dp8f/YXC1FlUprWmA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 16" + } + }, + "node_modules/debug": { + "version": "4.4.3", + "resolved": "https://registry.npmjs.org/debug/-/debug-4.4.3.tgz", + "integrity": 
"sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA==", + "dev": true, + "license": "MIT", + "dependencies": { + "ms": "^2.1.3" + }, + "engines": { + "node": ">=6.0" + }, + "peerDependenciesMeta": { + "supports-color": { + "optional": true + } + } + }, + "node_modules/deep-eql": { + "version": "5.0.2", + "resolved": "https://registry.npmjs.org/deep-eql/-/deep-eql-5.0.2.tgz", + "integrity": "sha512-h5k/5U50IJJFpzfL6nO9jaaumfjO/f2NjK/oYB2Djzm4p9L+3T9qWpZqZ2hAbLPuuYq9wrU08WQyBTL5GbPk5Q==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6" + } + }, + "node_modules/es-module-lexer": { + "version": "1.7.0", + "resolved": "https://registry.npmjs.org/es-module-lexer/-/es-module-lexer-1.7.0.tgz", + "integrity": "sha512-jEQoCwk8hyb2AZziIOLhDqpm5+2ww5uIE6lkO/6jcOCusfk6LhMHpXXfBLXTZ7Ydyt0j4VoUQv6uGNYbdW+kBA==", + "dev": true, + "license": "MIT" + }, + "node_modules/esbuild": { + "version": "0.28.0", + "resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.28.0.tgz", + "integrity": "sha512-sNR9MHpXSUV/XB4zmsFKN+QgVG82Cc7+/aaxJ8Adi8hyOac+EXptIp45QBPaVyX3N70664wRbTcLTOemCAnyqw==", + "dev": true, + "hasInstallScript": true, + "license": "MIT", + "bin": { + "esbuild": "bin/esbuild" + }, + "engines": { + "node": ">=18" + }, + "optionalDependencies": { + "@esbuild/aix-ppc64": "0.28.0", + "@esbuild/android-arm": "0.28.0", + "@esbuild/android-arm64": "0.28.0", + "@esbuild/android-x64": "0.28.0", + "@esbuild/darwin-arm64": "0.28.0", + "@esbuild/darwin-x64": "0.28.0", + "@esbuild/freebsd-arm64": "0.28.0", + "@esbuild/freebsd-x64": "0.28.0", + "@esbuild/linux-arm": "0.28.0", + "@esbuild/linux-arm64": "0.28.0", + "@esbuild/linux-ia32": "0.28.0", + "@esbuild/linux-loong64": "0.28.0", + "@esbuild/linux-mips64el": "0.28.0", + "@esbuild/linux-ppc64": "0.28.0", + "@esbuild/linux-riscv64": "0.28.0", + "@esbuild/linux-s390x": "0.28.0", + "@esbuild/linux-x64": "0.28.0", + "@esbuild/netbsd-arm64": "0.28.0", + "@esbuild/netbsd-x64": 
"0.28.0", + "@esbuild/openbsd-arm64": "0.28.0", + "@esbuild/openbsd-x64": "0.28.0", + "@esbuild/openharmony-arm64": "0.28.0", + "@esbuild/sunos-x64": "0.28.0", + "@esbuild/win32-arm64": "0.28.0", + "@esbuild/win32-ia32": "0.28.0", + "@esbuild/win32-x64": "0.28.0" + } + }, + "node_modules/estree-walker": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/estree-walker/-/estree-walker-3.0.3.tgz", + "integrity": "sha512-7RUKfXgSMMkzt6ZuXmqapOurLGPPfgj6l9uRZ7lRGolvk0y2yocc35LdcxKC5PQZdn2DMqioAQ2NoWcrTKmm6g==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/estree": "^1.0.0" + } + }, + "node_modules/expect-type": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/expect-type/-/expect-type-1.3.0.tgz", + "integrity": "sha512-knvyeauYhqjOYvQ66MznSMs83wmHrCycNEN6Ao+2AeYEfxUIkuiVxdEa1qlGEPK+We3n0THiDciYSsCcgW/DoA==", + "dev": true, + "license": "Apache-2.0", + "engines": { + "node": ">=12.0.0" + } + }, + "node_modules/fsevents": { + "version": "2.3.3", + "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz", + "integrity": "sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==", + "dev": true, + "hasInstallScript": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": "^8.16.0 || ^10.6.0 || >=11.0.0" + } + }, + "node_modules/js-yaml": { + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-4.1.1.tgz", + "integrity": "sha512-qQKT4zQxXl8lLwBtHMWwaTcGfFOZviOJet3Oy/xmGk2gZH677CJM9EvtfdSkgWcATZhj/55JZ0rmy3myCT5lsA==", + "license": "MIT", + "dependencies": { + "argparse": "^2.0.1" + }, + "bin": { + "js-yaml": "bin/js-yaml.js" + } + }, + "node_modules/loupe": { + "version": "3.2.1", + "resolved": "https://registry.npmjs.org/loupe/-/loupe-3.2.1.tgz", + "integrity": "sha512-CdzqowRJCeLU72bHvWqwRBBlLcMEtIvGrlvef74kMnV2AolS9Y8xUv1I0U/MNAWMhBlKIoyuEgoJ0t/bbwHbLQ==", + "dev": true, + "license": "MIT" + }, + 
"node_modules/magic-string": { + "version": "0.30.21", + "resolved": "https://registry.npmjs.org/magic-string/-/magic-string-0.30.21.tgz", + "integrity": "sha512-vd2F4YUyEXKGcLHoq+TEyCjxueSeHnFxyyjNp80yg0XV4vUhnDer/lvvlqM/arB5bXQN5K2/3oinyCRyx8T2CQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@jridgewell/sourcemap-codec": "^1.5.5" + } + }, + "node_modules/ms": { + "version": "2.1.3", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", + "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==", + "dev": true, + "license": "MIT" + }, + "node_modules/nanoid": { + "version": "3.3.12", + "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.12.tgz", + "integrity": "sha512-ZB9RH/39qpq5Vu6Y+NmUaFhQR6pp+M2Xt76XBnEwDaGcVAqhlvxrl3B2bKS5D3NH3QR76v3aSrKaF/Kiy7lEtQ==", + "dev": true, + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "bin": { + "nanoid": "bin/nanoid.cjs" + }, + "engines": { + "node": "^10 || ^12 || ^13.7 || ^14 || >=15.0.1" + } + }, + "node_modules/pathe": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/pathe/-/pathe-1.1.2.tgz", + "integrity": "sha512-whLdWMYL2TwI08hn8/ZqAbrVemu0LNaNNJZX73O6qaIdCTfXutsLhMkjdENX0qhsQ9uIimo4/aQOmXkoon2nDQ==", + "dev": true, + "license": "MIT" + }, + "node_modules/pathval": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/pathval/-/pathval-2.0.1.tgz", + "integrity": "sha512-//nshmD55c46FuFw26xV/xFAaB5HF9Xdap7HJBBnrKdAd6/GxDBaNA1870O79+9ueg61cZLSVc+OaFlfmObYVQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 14.16" + } + }, + "node_modules/picocolors": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.1.1.tgz", + "integrity": "sha512-xceH2snhtb5M9liqDsmEw56le376mTZkEX/jEb/RxNFyegNul7eNslCXP9FDj/Lcu0X8KEyMceP2ntpaHrDEVA==", + "dev": true, + "license": "ISC" + }, + "node_modules/postcss": { + 
"version": "8.5.14", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-8.5.14.tgz", + "integrity": "sha512-SoSL4+OSEtR99LHFZQiJLkT59C5B1amGO1NzTwj7TT1qCUgUO6hxOvzkOYxD+vMrXBM3XJIKzokoERdqQq/Zmg==", + "dev": true, + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/postcss/" + }, + { + "type": "tidelift", + "url": "https://tidelift.com/funding/github/npm/postcss" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "dependencies": { + "nanoid": "^3.3.11", + "picocolors": "^1.1.1", + "source-map-js": "^1.2.1" + }, + "engines": { + "node": "^10 || ^12 || >=14" + } + }, + "node_modules/rollup": { + "version": "4.60.4", + "resolved": "https://registry.npmjs.org/rollup/-/rollup-4.60.4.tgz", + "integrity": "sha512-WHeFSbZYsPu3+bLoNRUuAO+wavNlocOPf3wSHTP7hcFKVnJeWsYlCDbr3mTS14FCizf9ccIxXA8sGL8zKeQN3g==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/estree": "1.0.8" + }, + "bin": { + "rollup": "dist/bin/rollup" + }, + "engines": { + "node": ">=18.0.0", + "npm": ">=8.0.0" + }, + "optionalDependencies": { + "@rollup/rollup-android-arm-eabi": "4.60.4", + "@rollup/rollup-android-arm64": "4.60.4", + "@rollup/rollup-darwin-arm64": "4.60.4", + "@rollup/rollup-darwin-x64": "4.60.4", + "@rollup/rollup-freebsd-arm64": "4.60.4", + "@rollup/rollup-freebsd-x64": "4.60.4", + "@rollup/rollup-linux-arm-gnueabihf": "4.60.4", + "@rollup/rollup-linux-arm-musleabihf": "4.60.4", + "@rollup/rollup-linux-arm64-gnu": "4.60.4", + "@rollup/rollup-linux-arm64-musl": "4.60.4", + "@rollup/rollup-linux-loong64-gnu": "4.60.4", + "@rollup/rollup-linux-loong64-musl": "4.60.4", + "@rollup/rollup-linux-ppc64-gnu": "4.60.4", + "@rollup/rollup-linux-ppc64-musl": "4.60.4", + "@rollup/rollup-linux-riscv64-gnu": "4.60.4", + "@rollup/rollup-linux-riscv64-musl": "4.60.4", + "@rollup/rollup-linux-s390x-gnu": "4.60.4", + "@rollup/rollup-linux-x64-gnu": "4.60.4", + "@rollup/rollup-linux-x64-musl": "4.60.4", + 
"@rollup/rollup-openbsd-x64": "4.60.4", + "@rollup/rollup-openharmony-arm64": "4.60.4", + "@rollup/rollup-win32-arm64-msvc": "4.60.4", + "@rollup/rollup-win32-ia32-msvc": "4.60.4", + "@rollup/rollup-win32-x64-gnu": "4.60.4", + "@rollup/rollup-win32-x64-msvc": "4.60.4", + "fsevents": "~2.3.2" + } + }, + "node_modules/rollup/node_modules/@types/estree": { + "version": "1.0.8", + "resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.8.tgz", + "integrity": "sha512-dWHzHa2WqEXI/O1E9OjrocMTKJl2mSrEolh1Iomrv6U+JuNwaHXsXx9bLu5gG7BUWFIN0skIQJQ/L1rIex4X6w==", + "dev": true, + "license": "MIT" + }, + "node_modules/siginfo": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/siginfo/-/siginfo-2.0.0.tgz", + "integrity": "sha512-ybx0WO1/8bSBLEWXZvEd7gMW3Sn3JFlW3TvX1nREbDLRNQNaeNN8WK0meBwPdAaOI7TtRRRJn/Es1zhrrCHu7g==", + "dev": true, + "license": "ISC" + }, + "node_modules/source-map-js": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/source-map-js/-/source-map-js-1.2.1.tgz", + "integrity": "sha512-UXWMKhLOwVKb728IUtQPXxfYU+usdybtUrK/8uGE8CQMvrhOpwvzDBwj0QhSL7MQc7vIsISBG8VQ8+IDQxpfQA==", + "dev": true, + "license": "BSD-3-Clause", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/stackback": { + "version": "0.0.2", + "resolved": "https://registry.npmjs.org/stackback/-/stackback-0.0.2.tgz", + "integrity": "sha512-1XMJE5fQo1jGH6Y/7ebnwPOBEkIEnT4QF32d5R1+VXdXveM0IBMJt8zfaxX1P3QhVwrYe+576+jkANtSS2mBbw==", + "dev": true, + "license": "MIT" + }, + "node_modules/std-env": { + "version": "3.10.0", + "resolved": "https://registry.npmjs.org/std-env/-/std-env-3.10.0.tgz", + "integrity": "sha512-5GS12FdOZNliM5mAOxFRg7Ir0pWz8MdpYm6AY6VPkGpbA7ZzmbzNcBJQ0GPvvyWgcY7QAhCgf9Uy89I03faLkg==", + "dev": true, + "license": "MIT" + }, + "node_modules/tinybench": { + "version": "2.9.0", + "resolved": "https://registry.npmjs.org/tinybench/-/tinybench-2.9.0.tgz", + "integrity": 
"sha512-0+DUvqWMValLmha6lr4kD8iAMK1HzV0/aKnCtWb9v9641TnP/MFb7Pc2bxoxQjTXAErryXVgUOfv2YqNllqGeg==", + "dev": true, + "license": "MIT" + }, + "node_modules/tinyexec": { + "version": "0.3.2", + "resolved": "https://registry.npmjs.org/tinyexec/-/tinyexec-0.3.2.tgz", + "integrity": "sha512-KQQR9yN7R5+OSwaK0XQoj22pwHoTlgYqmUscPYoknOoWCWfj/5/ABTMRi69FrKU5ffPVh5QcFikpWJI/P1ocHA==", + "dev": true, + "license": "MIT" + }, + "node_modules/tinypool": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/tinypool/-/tinypool-1.1.1.tgz", + "integrity": "sha512-Zba82s87IFq9A9XmjiX5uZA/ARWDrB03OHlq+Vw1fSdt0I+4/Kutwy8BP4Y/y/aORMo61FQ0vIb5j44vSo5Pkg==", + "dev": true, + "license": "MIT", + "engines": { + "node": "^18.0.0 || >=20.0.0" + } + }, + "node_modules/tinyrainbow": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/tinyrainbow/-/tinyrainbow-1.2.0.tgz", + "integrity": "sha512-weEDEq7Z5eTHPDh4xjX789+fHfF+P8boiFB+0vbWzpbnbsEr/GRaohi/uMKxg8RZMXnl1ItAi/IUHWMsjDV7kQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/tinyspy": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/tinyspy/-/tinyspy-3.0.2.tgz", + "integrity": "sha512-n1cw8k1k0x4pgA2+9XrOkFydTerNcJ1zWCO5Nn9scWHTD+5tp8dghT2x1uduQePZTZgd3Tupf+x9BxJjeJi77Q==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/tsx": { + "version": "4.22.1", + "resolved": "https://registry.npmjs.org/tsx/-/tsx-4.22.1.tgz", + "integrity": "sha512-TvncJykhxAzFCk0VQZKBTClall4Pm7qXDSodb6uxi8QFa8X8mT6ABjxxsQ2opDRYxG7AzcRWXaFtruz5HJKuWg==", + "dev": true, + "license": "MIT", + "dependencies": { + "esbuild": "~0.28.0" + }, + "bin": { + "tsx": "dist/cli.mjs" + }, + "engines": { + "node": ">=18.0.0" + }, + "optionalDependencies": { + "fsevents": "~2.3.3" + } + }, + "node_modules/typescript": { + "version": "5.9.3", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.9.3.tgz", + "integrity": 
"sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==", + "dev": true, + "license": "Apache-2.0", + "bin": { + "tsc": "bin/tsc", + "tsserver": "bin/tsserver" + }, + "engines": { + "node": ">=14.17" + } + }, + "node_modules/undici-types": { + "version": "6.21.0", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.21.0.tgz", + "integrity": "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==", + "dev": true, + "license": "MIT" + }, + "node_modules/vite": { + "version": "5.4.21", + "resolved": "https://registry.npmjs.org/vite/-/vite-5.4.21.tgz", + "integrity": "sha512-o5a9xKjbtuhY6Bi5S3+HvbRERmouabWbyUcpXXUA1u+GNUKoROi9byOJ8M0nHbHYHkYICiMlqxkg1KkYmm25Sw==", + "dev": true, + "license": "MIT", + "dependencies": { + "esbuild": "^0.21.3", + "postcss": "^8.4.43", + "rollup": "^4.20.0" + }, + "bin": { + "vite": "bin/vite.js" + }, + "engines": { + "node": "^18.0.0 || >=20.0.0" + }, + "funding": { + "url": "https://github.com/vitejs/vite?sponsor=1" + }, + "optionalDependencies": { + "fsevents": "~2.3.3" + }, + "peerDependencies": { + "@types/node": "^18.0.0 || >=20.0.0", + "less": "*", + "lightningcss": "^1.21.0", + "sass": "*", + "sass-embedded": "*", + "stylus": "*", + "sugarss": "*", + "terser": "^5.4.0" + }, + "peerDependenciesMeta": { + "@types/node": { + "optional": true + }, + "less": { + "optional": true + }, + "lightningcss": { + "optional": true + }, + "sass": { + "optional": true + }, + "sass-embedded": { + "optional": true + }, + "stylus": { + "optional": true + }, + "sugarss": { + "optional": true + }, + "terser": { + "optional": true + } + } + }, + "node_modules/vite-node": { + "version": "2.1.9", + "resolved": "https://registry.npmjs.org/vite-node/-/vite-node-2.1.9.tgz", + "integrity": "sha512-AM9aQ/IPrW/6ENLQg3AGY4K1N2TGZdR5e4gu/MmmR2xR3Ll1+dib+nook92g4TV3PXVyeyxdWwtaCAiUL0hMxA==", + "dev": true, + "license": "MIT", + "dependencies": { + "cac": 
"^6.7.14", + "debug": "^4.3.7", + "es-module-lexer": "^1.5.4", + "pathe": "^1.1.2", + "vite": "^5.0.0" + }, + "bin": { + "vite-node": "vite-node.mjs" + }, + "engines": { + "node": "^18.0.0 || >=20.0.0" + }, + "funding": { + "url": "https://opencollective.com/vitest" + } + }, + "node_modules/vite/node_modules/@esbuild/aix-ppc64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.21.5.tgz", + "integrity": "sha512-1SDgH6ZSPTlggy1yI6+Dbkiz8xzpHJEVAlF/AM1tHPLsf5STom9rwtjE4hKAF20FfXXNTFqEYXyJNWh1GiZedQ==", + "cpu": [ + "ppc64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "aix" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/vite/node_modules/@esbuild/android-arm": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/android-arm/-/android-arm-0.21.5.tgz", + "integrity": "sha512-vCPvzSjpPHEi1siZdlvAlsPxXl7WbOVUBBAowWug4rJHb68Ox8KualB+1ocNvT5fjv6wpkX6o/iEpbDrf68zcg==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/vite/node_modules/@esbuild/android-arm64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/android-arm64/-/android-arm64-0.21.5.tgz", + "integrity": "sha512-c0uX9VAUBQ7dTDCjq+wdyGLowMdtR/GoC2U5IYk/7D1H1JYC0qseD7+11iMP2mRLN9RcCMRcjC4YMclCzGwS/A==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/vite/node_modules/@esbuild/android-x64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/android-x64/-/android-x64-0.21.5.tgz", + "integrity": "sha512-D7aPRUUNHRBwHxzxRvp856rjUHRFW1SdQATKXH2hqA0kAZb1hKmi02OpYRacl0TxIGz/ZmXWlbZgjwWYaCakTA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=12" 
+ } + }, + "node_modules/vite/node_modules/@esbuild/darwin-arm64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/darwin-arm64/-/darwin-arm64-0.21.5.tgz", + "integrity": "sha512-DwqXqZyuk5AiWWf3UfLiRDJ5EDd49zg6O9wclZ7kUMv2WRFr4HKjXp/5t8JZ11QbQfUS6/cRCKGwYhtNAY88kQ==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/vite/node_modules/@esbuild/darwin-x64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/darwin-x64/-/darwin-x64-0.21.5.tgz", + "integrity": "sha512-se/JjF8NlmKVG4kNIuyWMV/22ZaerB+qaSi5MdrXtd6R08kvs2qCN4C09miupktDitvh8jRFflwGFBQcxZRjbw==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/vite/node_modules/@esbuild/freebsd-arm64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/freebsd-arm64/-/freebsd-arm64-0.21.5.tgz", + "integrity": "sha512-5JcRxxRDUJLX8JXp/wcBCy3pENnCgBR9bN6JsY4OmhfUtIHe3ZW0mawA7+RDAcMLrMIZaf03NlQiX9DGyB8h4g==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/vite/node_modules/@esbuild/freebsd-x64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/freebsd-x64/-/freebsd-x64-0.21.5.tgz", + "integrity": "sha512-J95kNBj1zkbMXtHVH29bBriQygMXqoVQOQYA+ISs0/2l3T9/kj42ow2mpqerRBxDJnmkUDCaQT/dfNXWX/ZZCQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/vite/node_modules/@esbuild/linux-arm": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-arm/-/linux-arm-0.21.5.tgz", + "integrity": 
"sha512-bPb5AHZtbeNGjCKVZ9UGqGwo8EUu4cLq68E95A53KlxAPRmUyYv2D6F0uUI65XisGOL1hBP5mTronbgo+0bFcA==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/vite/node_modules/@esbuild/linux-arm64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-arm64/-/linux-arm64-0.21.5.tgz", + "integrity": "sha512-ibKvmyYzKsBeX8d8I7MH/TMfWDXBF3db4qM6sy+7re0YXya+K1cem3on9XgdT2EQGMu4hQyZhan7TeQ8XkGp4Q==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/vite/node_modules/@esbuild/linux-ia32": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-ia32/-/linux-ia32-0.21.5.tgz", + "integrity": "sha512-YvjXDqLRqPDl2dvRODYmmhz4rPeVKYvppfGYKSNGdyZkA01046pLWyRKKI3ax8fbJoK5QbxblURkwK/MWY18Tg==", + "cpu": [ + "ia32" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/vite/node_modules/@esbuild/linux-loong64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-loong64/-/linux-loong64-0.21.5.tgz", + "integrity": "sha512-uHf1BmMG8qEvzdrzAqg2SIG/02+4/DHB6a9Kbya0XDvwDEKCoC8ZRWI5JJvNdUjtciBGFQ5PuBlpEOXQj+JQSg==", + "cpu": [ + "loong64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/vite/node_modules/@esbuild/linux-mips64el": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-mips64el/-/linux-mips64el-0.21.5.tgz", + "integrity": "sha512-IajOmO+KJK23bj52dFSNCMsz1QP1DqM6cwLUv3W1QwyxkyIWecfafnI555fvSGqEKwjMXVLokcV5ygHW5b3Jbg==", + "cpu": [ + "mips64el" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + 
"node_modules/vite/node_modules/@esbuild/linux-ppc64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-ppc64/-/linux-ppc64-0.21.5.tgz", + "integrity": "sha512-1hHV/Z4OEfMwpLO8rp7CvlhBDnjsC3CttJXIhBi+5Aj5r+MBvy4egg7wCbe//hSsT+RvDAG7s81tAvpL2XAE4w==", + "cpu": [ + "ppc64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/vite/node_modules/@esbuild/linux-riscv64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-riscv64/-/linux-riscv64-0.21.5.tgz", + "integrity": "sha512-2HdXDMd9GMgTGrPWnJzP2ALSokE/0O5HhTUvWIbD3YdjME8JwvSCnNGBnTThKGEB91OZhzrJ4qIIxk/SBmyDDA==", + "cpu": [ + "riscv64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/vite/node_modules/@esbuild/linux-s390x": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-s390x/-/linux-s390x-0.21.5.tgz", + "integrity": "sha512-zus5sxzqBJD3eXxwvjN1yQkRepANgxE9lgOW2qLnmr8ikMTphkjgXu1HR01K4FJg8h1kEEDAqDcZQtbrRnB41A==", + "cpu": [ + "s390x" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/vite/node_modules/@esbuild/linux-x64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-x64/-/linux-x64-0.21.5.tgz", + "integrity": "sha512-1rYdTpyv03iycF1+BhzrzQJCdOuAOtaqHTWJZCWvijKD2N5Xu0TtVC8/+1faWqcP9iBCWOmjmhoH94dH82BxPQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/vite/node_modules/@esbuild/netbsd-x64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/netbsd-x64/-/netbsd-x64-0.21.5.tgz", + "integrity": 
"sha512-Woi2MXzXjMULccIwMnLciyZH4nCIMpWQAs049KEeMvOcNADVxo0UBIQPfSmxB3CWKedngg7sWZdLvLczpe0tLg==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "netbsd" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/vite/node_modules/@esbuild/openbsd-x64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/openbsd-x64/-/openbsd-x64-0.21.5.tgz", + "integrity": "sha512-HLNNw99xsvx12lFBUwoT8EVCsSvRNDVxNpjZ7bPn947b8gJPzeHWyNVhFsaerc0n3TsbOINvRP2byTZ5LKezow==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openbsd" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/vite/node_modules/@esbuild/sunos-x64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/sunos-x64/-/sunos-x64-0.21.5.tgz", + "integrity": "sha512-6+gjmFpfy0BHU5Tpptkuh8+uw3mnrvgs+dSPQXQOv3ekbordwnzTVEb4qnIvQcYXq6gzkyTnoZ9dZG+D4garKg==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "sunos" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/vite/node_modules/@esbuild/win32-arm64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/win32-arm64/-/win32-arm64-0.21.5.tgz", + "integrity": "sha512-Z0gOTd75VvXqyq7nsl93zwahcTROgqvuAcYDUr+vOv8uHhNSKROyU961kgtCD1e95IqPKSQKH7tBTslnS3tA8A==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/vite/node_modules/@esbuild/win32-ia32": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/win32-ia32/-/win32-ia32-0.21.5.tgz", + "integrity": "sha512-SWXFF1CL2RVNMaVs+BBClwtfZSvDgtL//G/smwAc5oVK/UPu2Gu9tIaRgFmYFFKrmg3SyAjSrElf0TiJ1v8fYA==", + "cpu": [ + "ia32" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=12" + } + }, + 
"node_modules/vite/node_modules/@esbuild/win32-x64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/win32-x64/-/win32-x64-0.21.5.tgz", + "integrity": "sha512-tQd/1efJuzPC6rCFwEvLtci/xNFcTZknmXs98FYDfGE4wP9ClFV98nyKrzJKVPMhdDnjzLhdUyMX4PsQAPjwIw==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/vite/node_modules/esbuild": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.21.5.tgz", + "integrity": "sha512-mg3OPMV4hXywwpoDxu3Qda5xCKQi+vCTZq8S9J/EpkhB2HzKXq4SNFZE3+NK93JYxc8VMSep+lOUSC/RVKaBqw==", + "dev": true, + "hasInstallScript": true, + "license": "MIT", + "bin": { + "esbuild": "bin/esbuild" + }, + "engines": { + "node": ">=12" + }, + "optionalDependencies": { + "@esbuild/aix-ppc64": "0.21.5", + "@esbuild/android-arm": "0.21.5", + "@esbuild/android-arm64": "0.21.5", + "@esbuild/android-x64": "0.21.5", + "@esbuild/darwin-arm64": "0.21.5", + "@esbuild/darwin-x64": "0.21.5", + "@esbuild/freebsd-arm64": "0.21.5", + "@esbuild/freebsd-x64": "0.21.5", + "@esbuild/linux-arm": "0.21.5", + "@esbuild/linux-arm64": "0.21.5", + "@esbuild/linux-ia32": "0.21.5", + "@esbuild/linux-loong64": "0.21.5", + "@esbuild/linux-mips64el": "0.21.5", + "@esbuild/linux-ppc64": "0.21.5", + "@esbuild/linux-riscv64": "0.21.5", + "@esbuild/linux-s390x": "0.21.5", + "@esbuild/linux-x64": "0.21.5", + "@esbuild/netbsd-x64": "0.21.5", + "@esbuild/openbsd-x64": "0.21.5", + "@esbuild/sunos-x64": "0.21.5", + "@esbuild/win32-arm64": "0.21.5", + "@esbuild/win32-ia32": "0.21.5", + "@esbuild/win32-x64": "0.21.5" + } + }, + "node_modules/vitest": { + "version": "2.1.9", + "resolved": "https://registry.npmjs.org/vitest/-/vitest-2.1.9.tgz", + "integrity": "sha512-MSmPM9REYqDGBI8439mA4mWhV5sKmDlBKWIYbA3lRb2PTHACE0mgKwA8yQ2xq9vxDTuk4iPrECBAEW2aoFXY0Q==", + "dev": true, + "license": "MIT", + "dependencies": { + "@vitest/expect": 
"2.1.9", + "@vitest/mocker": "2.1.9", + "@vitest/pretty-format": "^2.1.9", + "@vitest/runner": "2.1.9", + "@vitest/snapshot": "2.1.9", + "@vitest/spy": "2.1.9", + "@vitest/utils": "2.1.9", + "chai": "^5.1.2", + "debug": "^4.3.7", + "expect-type": "^1.1.0", + "magic-string": "^0.30.12", + "pathe": "^1.1.2", + "std-env": "^3.8.0", + "tinybench": "^2.9.0", + "tinyexec": "^0.3.1", + "tinypool": "^1.0.1", + "tinyrainbow": "^1.2.0", + "vite": "^5.0.0", + "vite-node": "2.1.9", + "why-is-node-running": "^2.3.0" + }, + "bin": { + "vitest": "vitest.mjs" + }, + "engines": { + "node": "^18.0.0 || >=20.0.0" + }, + "funding": { + "url": "https://opencollective.com/vitest" + }, + "peerDependencies": { + "@edge-runtime/vm": "*", + "@types/node": "^18.0.0 || >=20.0.0", + "@vitest/browser": "2.1.9", + "@vitest/ui": "2.1.9", + "happy-dom": "*", + "jsdom": "*" + }, + "peerDependenciesMeta": { + "@edge-runtime/vm": { + "optional": true + }, + "@types/node": { + "optional": true + }, + "@vitest/browser": { + "optional": true + }, + "@vitest/ui": { + "optional": true + }, + "happy-dom": { + "optional": true + }, + "jsdom": { + "optional": true + } + } + }, + "node_modules/why-is-node-running": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/why-is-node-running/-/why-is-node-running-2.3.0.tgz", + "integrity": "sha512-hUrmaWBdVDcxvYqnyh09zunKzROWjbZTiNy8dBEjkS7ehEDQibXJ7XvlmtbwuTclUiIyN+CyXQD4Vmko8fNm8w==", + "dev": true, + "license": "MIT", + "dependencies": { + "siginfo": "^2.0.0", + "stackback": "0.0.2" + }, + "bin": { + "why-is-node-running": "cli.js" + }, + "engines": { + "node": ">=8" + } + } + } +} diff --git a/experiments/rule-engine-poc/package.json b/experiments/rule-engine-poc/package.json new file mode 100644 index 000000000..13fa0d76d --- /dev/null +++ b/experiments/rule-engine-poc/package.json @@ -0,0 +1,30 @@ +{ + "name": "@experiments/rule-engine-poc", + "version": "0.0.1", + "private": true, + "type": "module", + "description": "Proof-of-concept rule 
engine for the Specorator OODA orchestrator: LLM extracts, rules decide. Applied to the repo's own quality framework.", + "scripts": { + "build": "tsc -p .", + "demo": "tsx src/cli.ts rules/quality-gates.yaml fixtures/ready-implementation.json", + "demo:blocked-ears": "tsx src/cli.ts rules/quality-gates.yaml fixtures/blocked-missing-ears.json", + "demo:blocked-s1": "tsx src/cli.ts rules/quality-gates.yaml fixtures/blocked-s1-finding.json", + "demo:needs-attention": "tsx src/cli.ts rules/quality-gates.yaml fixtures/needs-attention-design-risks.json", + "demo:ready-idea": "tsx src/cli.ts rules/quality-gates.yaml fixtures/ready-idea.json", + "demo:html": "tsx src/cli.ts rules/quality-gates.yaml fixtures/ready-implementation.json --html reports/ready-implementation.html --quiet", + "demo:all": "node scripts/run-all-fixtures.mjs", + "demo:html:all": "node scripts/run-all-html.mjs", + "test": "vitest run", + "test:watch": "vitest" + }, + "dependencies": { + "js-yaml": "^4.1.0" + }, + "devDependencies": { + "@types/js-yaml": "^4.0.9", + "@types/node": "^22.0.0", + "tsx": "^4.19.0", + "typescript": "^5.6.0", + "vitest": "^2.1.0" + } +} diff --git a/experiments/rule-engine-poc/research/01-technical-landscape.md b/experiments/rule-engine-poc/research/01-technical-landscape.md new file mode 100644 index 000000000..aafddc1a1 --- /dev/null +++ b/experiments/rule-engine-poc/research/01-technical-landscape.md @@ -0,0 +1,77 @@ +# 01 — Technical Landscape: TS/JS Rule Engines for the Decide Quadrant + +**Context.** POC at `experiments/rule-engine-poc/` for the "LLM extracts, rules decide" pattern: an LLM emits structured factual flags (booleans / categoricals), a deterministic TypeScript layer turns those flags into a verdict tier. Terminal-only. Expected scale: well under 200 rules. + +## Candidate libraries + +### 1. 
`json-rules-engine` (CacheControl) + +- Most-installed JS rule engine: ~350k weekly downloads, 189 dependents ([npm](https://www.npmjs.com/package/json-rules-engine)). +- Latest release `6.5.0`, Nov 2023 ([GitHub](https://github.com/CacheControl/json-rules-engine)) — about 30 months stale at time of writing; npm summary calls maintenance "low attention" but the codebase is stable and widely depended on. +- Built-in TypeScript types under `types/` ([d.ts source](https://github.com/CacheControl/json-rules-engine/blob/master/types/index.d.ts)); the old `@types/json-rules-engine` is obsolete. +- DSL: JSON rules (`all` / `any` / `not` boolean trees over fact paths) plus pluggable `Almanac` facts for dynamic lookups. +- **Audit trail story is the strongest of the candidates.** `RuleResult` exposes the full evaluated condition tree with per-condition `result` and `factResult` fields, and the engine emits success/failure events with the Almanac attached, so you can serialize the exact reasoning for each run ([rules docs](https://github.com/CacheControl/json-rules-engine/blob/master/docs/rules.md), [engine docs](https://github.com/cachecontrol/json-rules-engine/blob/master/docs/engine.md)). + +### 2. `@gorules/zen-engine` (GoRules ZEN / JDM) + +- Rust core with NodeJS native bindings (and a WASM build), latest `0.54.0`, ~60k monthly downloads ([npm](https://www.npmjs.com/package/@gorules/zen-engine), [docs](https://docs.gorules.io/reference/nodejs)). +- DSL: JSON Decision Model (JDM) — decision graphs of tables, expressions, and switch nodes with hit policies, authored visually in the open-source JDM Editor ([docs](https://docs.gorules.io/reference/json-decision-model-jdm)). +- Audit trail: `EvaluationOptions.trace=true` returns a structured `Trace` plus a `Performance` field per evaluation ([DeepWiki](https://deepwiki.com/gorules/zen/2.1-zen-engine)). +- TS support via package typings; native binary per platform adds install-time weight. 
+- Strong fit if business users will eventually edit rules in a GUI; heavier than needed for a CLI POC. + +### 3. Open Policy Agent (Rego) via `@open-policy-agent/opa` or `opa-wasm` + +- Production-grade policy engine; TS SDK gives typed input/result, WASM path runs in-process ([npm SDK](https://www.npmjs.com/package/@open-policy-agent/opa), [opa-wasm](https://github.com/open-policy-agent/npm-opa-wasm)). +- DSL: Rego — a separate language with a real learning curve, optimized for authorization, not flag-to-verdict scoring. +- Decision logs exist but are infra-level (OPA server / sidecar), not a Node-library-call return value. Overkill for terminal POC. + +### 4. `json-logic-js` + +- Tiny, portable JSON expression evaluator; inactive (`2.0.5`, 2 years old) but stable ([npm](https://www.npmjs.com/package/json-logic-js)). +- Pure expressions — no rule loop, priorities, salience, or trace structure. Would have to be wrapped to act as an engine, at which point you've built your own. + +### 5. `nools` + +- Rete-based, JS-native. Unmaintained for ~9 years; maintainers explicitly looking for a successor ([GitHub](https://github.com/noolsjs/nools)). Disqualified. + +### 6. `hyperjump/*` + +- This is a JSON Schema validation stack, not a rule engine ([npm](https://www.npmjs.com/package/@hyperjump/json-schema)). Not in scope here. + +## Rete vs top-down evaluation + +Rete amortises pattern matching across many rules and fact updates by caching partial matches; the trade is memory and graph-build overhead. Multiple sources are explicit that for small rule sets the overhead exceeds the gain and simpler algorithms (TREAT or brute-force sequential evaluation) win ([Wikipedia](https://en.wikipedia.org/wiki/Rete_algorithm), [HandWiki](https://handwiki.org/wiki/Rete_algorithm)). Our shape — <200 rules, one fact set per LLM extraction, no rule chaining or fact derivation — is the textbook anti-case for Rete. Linear top-down with short-circuit boolean trees is correct here. 
+ +## DSL surface trade-offs + +- **JSON** — Best for diffability, schema validation, GUI authoring, and serialization into audit logs. Verbose for nested boolean logic. Chosen by `json-rules-engine`, JDM, json-logic. +- **YAML** — Same expressiveness as JSON, friendlier for humans, worse for machine round-tripping (anchors, types). No major TS engine ships YAML natively; you'd convert at load time. +- **TS functions** — Maximum expressiveness, zero serialization. Kills the "rules as data" property: you can't version, diff, or hot-reload rules without redeploying code, and the audit trail has to be reconstructed by hand. Acceptable only for rules that genuinely need arbitrary computation. + +For an LLM-driven flag pipeline the rules are overwhelmingly boolean combinations over categorical fields — JSON wins. + +## Audit / explainability summary + +| Engine | Per-decision trace | Per-condition outcome | Format | +|---|---|---|---| +| json-rules-engine | Yes (events + `RuleResult`) | Yes (`factResult`, `result` on every node) | JSON-serializable | +| ZEN / JDM | Yes (`trace`, `performance`) | Yes (per-node) | JSON | +| OPA | Decision logs (infra) | Partial (query trace) | JSON, but out-of-band | +| json-logic-js | No | No | n/a | + +## Recommendation + +**Adopt `json-rules-engine` for the POC; keep a thin in-house adapter so we can swap later.** Rationale: + +1. It hits the three things that actually matter for the Decide quadrant: JSON-as-data rules, first-class `RuleResult` introspection for audit, and built-in TS types. +2. At our scale (<200 rules, no fact derivation, single-shot evaluation per LLM extraction) we get nothing from Rete, and OPA/ZEN bring deployment weight we don't need in a terminal POC. +3. Maintenance staleness is the real risk. 
Mitigate by wrapping the engine behind a `DecisionEngine` interface (input flags → `{verdict, tier, trace}`) so a future swap to ZEN (if GUI authoring becomes a requirement) or an in-house evaluator (if we hit a feature wall) is a one-file change. + +**Do not build from scratch yet.** A minimal in-house engine would re-implement boolean trees, fact resolution, priority ordering, and a trace structure — work that `json-rules-engine` already ships and that would distract from the POC's actual question (does the LLM-extracts / rules-decide split produce stable verdicts?). Revisit build-vs-buy after the POC, with concrete evidence of what the library can't express. + +**Open follow-ups for the architect / pm stages** + +- Rule-authoring ergonomics: do operators edit JSON directly, or do we generate it from a higher-level YAML/TS surface? (TBD — owner: ux-designer.) +- Trace persistence format and retention — needs to align with the OODA orchestrator's Observe-quadrant logging. (TBD — owner: architect.) +- Version-pinning policy given the upstream's slow release cadence. (TBD — owner: sre.) diff --git a/experiments/rule-engine-poc/research/02-regulatory-auditability.md b/experiments/rule-engine-poc/research/02-regulatory-auditability.md new file mode 100644 index 000000000..4325fbd04 --- /dev/null +++ b/experiments/rule-engine-poc/research/02-regulatory-auditability.md @@ -0,0 +1,109 @@ +# 02 — Regulatory & Auditability Landscape + +**Scope:** What the EU AI Act, ISO/IEC 42001, ISO/IEC 23894, and NIST AI RMF 1.0 demand of an AI-derived decision pipeline, mapped against the "LLM extracts, rules decide" architecture chosen for the `experiments/rule-engine-poc/` proof of concept. + +**Anchoring quote (Reddit, AI fact-checking thread):** *"Architectures that internalize the verification tax upfront have a clearer path to high-stakes domains than ones that ignore it. The EU AI Act will accelerate that distinction."* + +--- + +## 1. 
EU AI Act — the binding instrument + +The Act applies progressively, with full roll-out by **2 August 2027**; the bulk of high-risk obligations and Article 50 transparency rules become enforceable on **2 August 2026**, and the May 2026 Digital Omnibus added a 16-month postponement for *new or substantially modified* Annex III systems ([implementation timeline](https://artificialintelligenceact.eu/implementation-timeline/), [Travers Smith summary](https://www.traverssmith.com/knowledge/knowledge-container/eu-agrees-to-delay-key-ai-act-compliance-deadlines/)). Annex III covers biometrics, critical infrastructure, education, employment, essential services, law enforcement, migration, and justice — exactly the domains where "rules decide" pays off. + +Four articles dominate the audit surface: + +- **Article 12 — Record-keeping.** High-risk systems must produce automatic logs over their lifetime that are sufficient to identify risk situations, support post-market monitoring, and trace each use (start/end timestamps, reference data, input that produced a match, persons verifying results) ([Art. 12](https://artificialintelligenceact.eu/article/12/), [ISMS.online primer](https://www.isms.online/iso-42001/eu-ai-act/article-12/)). +- **Article 13 — Transparency to deployers.** Operation must be "sufficiently transparent" to let deployers interpret output; instructions for use must cover capabilities, limitations, intended purpose, foreseeable misuse, and how to read logs ([Art. 13](https://artificialintelligenceact.eu/article/13/)). Explainability is defined as *"level of understanding how the AI-based system came up with a given result."* +- **Article 14 — Human oversight.** Designs must allow a competent person to interpret output, override, or stop the system; some use cases require two-person verification ([Art. 14](https://artificialintelligenceact.eu/article/14/)). 
+- **Article 11 + Annex IV — Technical documentation.** Provider must keep a dossier (system description, data governance, validation, risk management, post-market plan) regulators can demand. + +## 2. ISO/IEC 42001 — the AI Management System standard + +ISO/IEC 42001:2023 is the first certifiable AIMS standard ([ISO page](https://www.iso.org/standard/42001), [Microsoft offering](https://learn.microsoft.com/en-us/compliance/regulatory/offering-iso-42001)). It demands lifecycle controls covering **bias assessments, explainability, data governance, security, human-in-the-loop, continuous performance tracking, drift detection, and audit trails**, plus Clause 9.2 internal audits at planned intervals. Traceability — data provenance, model traceability, explainability records, audit logs — is treated as a first-class control objective. + +## 3. ISO/IEC 23894 — risk management companion + +ISO/IEC 23894:2023 extends ISO 31000 to AI-specific risks (opacity, data dependency, adaptation) and is intended to sit alongside 42001 and NIST AI RMF ([ISO page](https://www.iso.org/standard/77304.html), [ANSI explainer](https://blog.ansi.org/ansi/iso-iec-23894-2023-ai-risk-management/)). It explicitly calls out **real-time analytics, audit trails, and incident detection** as monitoring obligations, and **human-in-the-loop** as a treatment strategy. + +## 4. NIST AI RMF 1.0 — voluntary but expected + +NIST AI 100-1 organises trustworthy-AI work into **Govern, Map, Measure, Manage** ([NIST AI RMF](https://www.nist.gov/itl/ai-risk-management-framework), [PDF](https://nvlpubs.nist.gov/nistpubs/ai/nist.ai.100-1.pdf)). Seven characteristics: valid/reliable, safe, secure/resilient, accountable/transparent, **explainable/interpretable**, privacy-enhanced, fair-with-bias-managed. The framework names *inscrutability* as the failure mode our architecture is built to avoid. + +## 5. What counts as an "auditable decision"? 
+ +Across the four frameworks, regulators/auditors converge on the same dossier per decision: + +| Artifact | Purpose | Source frameworks | +|---|---|---| +| Decision ID + timestamp + actor | Identify what happened, when, by whom | AI Act Art. 12; 42001 | +| Input snapshot (raw + redacted) | Reproduce the case | Art. 12; 23894 | +| Source citations behind every flag | Ground LLM output in evidence | Art. 13; NIST Measure | +| Flag provenance (model, prompt, version, temperature, token usage) | Tie extractions to a model-card entry | Model Cards (Mitchell et al., 2019) | +| Rule set version + hash | Pin the deterministic logic that fired | 42001 traceability; Art. 11 | +| Triggered rules + verdict | Explainable chain `flags → rules → verdict` | Art. 13; NIST Explainable | +| Confidence / calibration data | Support human oversight | Art. 14; NIST Measure | +| Human override record | Evidence of oversight | Art. 14; 42001 | +| Replay manifest (rule + flag + input hash) | Deterministic re-execution | 23894 monitoring; 42001 audit | + +Supporting documentation patterns: **model cards** for the LLM extractor, **datasheets/data cards** for any reference data, and a **decision log** that is the per-case row of the table above ([Model Cards](https://www.researchgate.net/publication/330268857_Model_Cards_for_Model_Reporting), [Data Cards](https://www.datacentricai.org/neurips21/papers/112_CameraReady_Data_Cards.pdf)). + +## 6. How "LLM extracts, rules decide" maps to the requirements + +**Native wins:** + +- *Explainability (Art. 13, NIST).* The verdict is a pure function of named flags and named rules — a regulator can read the chain in plain language without probing the LLM. +- *Replayability (Art. 12, 42001).* Given pinned rule version + stored flags, the verdict is byte-identical on re-run. The LLM extraction step is the only nondeterministic boundary, and it is logged. +- *Human oversight (Art. 
14).* Rule outcomes are inspectable; an overseer can disagree with one flag and see the downstream verdict change. +- *Separation of duty.* Rule changes go through code review/governance independently of model swaps — clean fit with 42001 change-control. + +**Still requires deliberate work:** + +1. **Rule-change governance** — rules need semver, signed releases, an ADR-style rationale per change, and an "as-of" lookup so old decisions stay reproducible. +2. **Flag-extraction calibration** — LLM flag accuracy must be measured per flag type (precision/recall, drift), not just overall; this is the NIST *Measure* gap. +3. **Bias review** — both layers can encode bias. Rules need a fairness audit (disparate-impact testing on historical inputs); LLM flags need subgroup evaluation. +4. **Explainability *presentation*** — Art. 13 demands the deployer can interpret output; a JSON trace is not enough. Need a human-readable "why" view. +5. **Source-citation integrity** — flags must carry verifiable spans/URIs into source documents; otherwise "grounded" is a claim, not an artifact. +6. **Post-market monitoring (Art. 72)** — incident channel, drift alerts, periodic re-validation against ground truth. +7. **Technical-file maintenance (Art. 11 / Annex IV)** — the dossier is a living document, not a one-off. + +--- + +## 7. Audit-trail checklist — what the POC MUST capture per decision + +To be defensible under the EU AI Act and ISO/IEC 42001, every verdict the POC emits must persist: + +- [ ] **Decision envelope** — UUID, UTC timestamp, deployer/operator identity, system version. +- [ ] **Input record** — raw input hash + stored payload (with PII redaction policy noted). +- [ ] **Model invocation log** — LLM provider, model ID, model version/snapshot, prompt template ID + hash, temperature/seed, token counts, latency. +- [ ] **Extracted flags** — structured object, each flag tagged with `{name, value, confidence, source_span(s), source_uri}`. 
+- [ ] **Rule set pin** — rule-engine version, ruleset semver, content hash, link to signed release. +- [ ] **Rule trace** — ordered list of evaluated rules with `{rule_id, version, inputs, matched: bool, output}`. +- [ ] **Verdict** — final decision, severity, machine-readable code, plus human-readable rationale string composed from triggered rules. +- [ ] **Confidence + uncertainty** — calibrated score and the abstention threshold applied. +- [ ] **Human-oversight record** — reviewer ID, action (accept/override/escalate), timestamp, rationale, two-person flag where required by Art. 14. +- [ ] **Replay manifest** — minimal tuple `{input_hash, ruleset_hash, flag_set_hash}` that reproduces the verdict offline. +- [ ] **Cross-references** — pointer to the model card, ruleset changelog entry, and the relevant Annex IV technical-file section. +- [ ] **Retention + integrity** — append-only store, tamper-evident (hash chain or WORM), retention period aligned with Art. 19 (≥ 6 months unless sectoral law says longer). +- [ ] **Incident hook** — a decision flagged as anomalous must auto-emit a post-market monitoring event (Art. 72). + +If a verdict is missing any of the checklist items above, treat it as a **non-conforming decision** under the ISO/IEC 42001 internal-audit clause (Clause 9.2) and quarantine it from downstream use. 
+ +--- + +## Sources + +- [EU AI Act Article 12 — Record-Keeping](https://artificialintelligenceact.eu/article/12/) +- [EU AI Act Article 13 — Transparency](https://artificialintelligenceact.eu/article/13/) +- [EU AI Act Article 14 — Human Oversight](https://artificialintelligenceact.eu/article/14/) +- [EU AI Act Implementation Timeline](https://artificialintelligenceact.eu/implementation-timeline/) +- [Travers Smith — EU agrees to delay key AI Act compliance deadlines](https://www.traverssmith.com/knowledge/knowledge-container/eu-agrees-to-delay-key-ai-act-compliance-deadlines/) +- [Annex III — High-Risk AI Systems](https://artificialintelligenceact.eu/annex/3/) +- [ISO/IEC 42001:2023 — AI Management Systems](https://www.iso.org/standard/42001) +- [Microsoft Compliance — ISO/IEC 42001 offering](https://learn.microsoft.com/en-us/compliance/regulatory/offering-iso-42001) +- [ISO/IEC 23894:2023 — AI Risk Management Guidance](https://www.iso.org/standard/77304.html) +- [ANSI Blog — ISO/IEC 23894 explainer](https://blog.ansi.org/ansi/iso-iec-23894-2023-ai-risk-management/) +- [NIST AI Risk Management Framework](https://www.nist.gov/itl/ai-risk-management-framework) +- [NIST AI 100-1 (AI RMF 1.0) PDF](https://nvlpubs.nist.gov/nistpubs/ai/nist.ai.100-1.pdf) +- [Mitchell et al. — Model Cards for Model Reporting](https://www.researchgate.net/publication/330268857_Model_Cards_for_Model_Reporting) +- [Pushkarna et al. 
— Data Cards](https://www.datacentricai.org/neurips21/papers/112_CameraReady_Data_Cards.pdf) +- [ISMS.online — Is your AI logging Article 12-ready?](https://www.isms.online/iso-42001/eu-ai-act/article-12/) diff --git a/experiments/rule-engine-poc/research/03-positioning-jtbd.md b/experiments/rule-engine-poc/research/03-positioning-jtbd.md new file mode 100644 index 000000000..8f895f05a --- /dev/null +++ b/experiments/rule-engine-poc/research/03-positioning-jtbd.md @@ -0,0 +1,86 @@ +--- +title: Rule Engine POC — Strategic Positioning & JTBD +status: draft +phase: discovery / frame +author: product-strategist +inputs: + - docs/specorator-product/product.md + - docs/specorator.md + - docs/backlog/501-idea-goal-oriented-orchestrator-plugin-research-design-plan.md + - docs/backlog/502-idea-ooda-loop-plugin-observe-orient-decide-act.md + - pattern source: Reddit thread on AI fact-checking — "llm extracts, rules decide" +--- + +# Rule Engine POC — Positioning & JTBD + +## 1. Positioning shift for Specorator + +Today's mission line (`docs/specorator-product/product.md`): *Specorator helps humans build software with AI agents without surrendering intent, traceability, or quality gates.* The current value prop is "file-based operating system for agentic software delivery: specs first, role-scoped agents, quality gates, traceability in plain Markdown." Determinism already shows up *around* the agents (verify gate, EARS-to-test mapping, RTM) — not *inside* their decisions. + +A deterministic rule layer extends that mission into the decision surface itself. The pattern — **LLM extracts structured flags; a rule engine produces verdicts and recommended actions** — converts judgment from an opaque model call into an auditable, replayable artifact. 
It is the same logic Specorator already applies to specs (`code is an artifact of the spec`) applied one layer up: **verdicts are artifacts of rules, not artifacts of model mood.** + +Elevator pitch shift: + +- **Before:** "Spec-driven workflow for building software with AI agents, with quality gates around the agents." +- **After:** "Spec-driven workflow for building software with AI agents, with quality gates *inside* the agents — every agent decision is a rule firing on extracted flags, replayable, diff-able, and reviewable like code." + +This sharpens the "auditable / governance / enterprise evaluator" arc on the product ladder (Layer 3 runtime extensions) without weakening the Layer 0 Markdown-first story. The rule pack itself becomes a Markdown + JSON artifact. + +## 2. Candidate JTBD statements + +Format: *When [situation], I want to [motivation], so I can [outcome].* Marked `TBD — assumption` until validated by user-researcher. + +1. **Reviewer / approver JTBD (rule engine):** When an AI agent recommends an action that touches code, a release, or a customer, I want every recommendation to come with the rule that fired and the flags that triggered it, so I can approve, override, or escalate without re-reading the entire model trace. *(Forces — Push: agent decisions feel opaque; Pull: replayable verdicts; Anxiety: silently approving a bad call; Habit: scrolling chat history.)* `TBD — assumption` +2. **Orchestrator-author JTBD (OODA Decide quadrant):** When I run a continuous OODA loop across my projects, I want the Decide step to produce identical verdicts for identical Orient flags, so I can trust the daily brief enough to automate the Act quadrant on low-risk classes. *(Push: non-reproducible recommendations; Pull: scheduled auto-act; Anxiety: model drift between runs; Habit: re-reading every brief in full.)* `TBD — assumption` +3. 
**Enterprise-evaluator JTBD (governance):** When I evaluate an agentic workflow for adoption, I want to see which decisions are governed by deterministic rules vs. raw model judgment, so I can map the system to my existing change-control and audit regime. *(Push: compliance teams reject "LLM said so"; Pull: rule packs map to controls; Anxiety: ungoverned autonomy; Habit: blocking adoption outright.)* `TBD — assumption` + +## 3. North Star metric candidates + +| Candidate | Definition | Leading? | Understandable? | Actionable? | Measurable? | +|---|---|---|---|---|---| +| Explainable actions per loop | % of Act-phase actions that carry a (rule_id, flag_set) provenance pair | yes | yes | yes — rule coverage drives it | yes | +| Rule coverage of Orient flags | % of distinct flags emitted by Orient that have ≥ 1 rule consuming them | yes | medium | yes | yes | +| Verdict reproducibility rate | % of (flag_set → verdict) pairs that are identical across two consecutive runs on the same input | yes | yes | yes — surfaces rule gaps and model drift | yes | + +**Recommendation: Verdict reproducibility rate.** It directly tests the pattern's promise ("more stable than letting the model judge"), it falsifies the value prop if it drops, and a low score is a concrete signal to either (a) tighten the rule pack or (b) constrain the LLM extractor schema. Explainable-actions-per-loop is a strong *secondary* — it measures coverage rather than stability. Rule-coverage-of-Orient-flags is a useful internal health metric but is too inside-the-machine to be a North Star. + +Current value: `TBD — owner: prototyper` (POC has not produced two-run measurements yet). Target: ≥ 95% on the daily-brief use case; ≥ 99% on release-readiness verdicts (where stability matters more than coverage). + +## 4. 
Adjacent domains and one-line flag schemas + +| Domain | Flag schema (1 line) | +|---|---| +| Code review | `{has_test_change: bool, touches_security_path: bool, diff_size_loc: int, breaks_public_api: bool, missing_changeset: bool}` | +| PR approval gate | `{ci_green: bool, codex_review_passed: bool, mergeable_clean: bool, title_conventional: bool, branch_is_topic: bool}` | +| Agentic security review (OWASP) | `{calls_untrusted_tool: bool, reads_user_input_into_prompt: bool, has_egress: bool, can_modify_filesystem: bool, secrets_in_context: bool}` | +| Release readiness | `{rollback_plan_present: bool, changelog_complete: bool, observability_hooks_wired: bool, open_critical_findings: int, fresh_surface_contract_green: bool}` | +| Incident triage | `{severity: enum, customer_impact_pct: float, runbook_match: string\|null, on_call_acked: bool, similar_incident_id: string\|null}` | +| Fact-checking (origin pattern) | `{claim_type: enum, source_count: int, source_authority_score: float, contradiction_found: bool, recency_days: int}` | + +In every row the LLM is the extractor (it reads the diff, the PR, the incident, the article and emits the structured object). The rule pack is deterministic markdown-and-JSON authored by humans and reviewable like any spec. + +## 5. 
Competitive positioning + +| Tool | What it optimises for | Where it leaves a gap | Specorator + rule engine wedge | +|---|---|---|---| +| LangGraph | Graph-based agent state machines; flexible control flow | Decisions inside nodes are still LLM-judgment; no first-class rule layer | Decisions are rules, not nodes; verdicts are diff-able artifacts | +| CrewAI | Role-based multi-agent crews | Coordination is the product; auditability of decisions is left to the user | Spec-driven roles already; rule layer adds verdict provenance | +| OpenDevin | Autonomous coding agent | Maximises autonomy; minimises human-readable governance | Specorator already inverts this; rule engine extends the inversion to runtime | +| Inngest Agent Kit | Durable, event-driven agent functions | Strong on durability + observability of *invocation*; thin on governance of *decision* | Rule engine governs the decision; can sit inside an Inngest function | + +Differentiated value: **the only file-based, spec-driven workflow where agent decisions are themselves spec'd.** Everything else governs the agent's *plumbing*; this governs the agent's *judgment*. + +## 6. Strategic risks + +- **Determinism vs. "agentic" tension.** Real risk if framed as "rules replace LLMs." Framing must be "LLM extracts, rules decide" — the LLM keeps the open-ended perception job; the rules keep the bounded judgment job. The line: anything that can be enumerated as a closed-world flag set belongs in rules; anything requiring world-knowledge or language understanding stays with the LLM. +- **Rule-pack maintenance becomes its own product.** If rules drift, the audit trail becomes theatre. Mitigation: rules ship with tests (EARS-style) and live under `specs/`-style traceability — Specorator already has the infrastructure. +- **Scope creep into a full DSL.** Resist. v0 should be JSON-Logic or a tiny declarative subset; a DSL is an ADR-worthy decision, not a POC choice. 
+- **Pivot flag for Converge phase:** if user research surfaces that adopters want *more* autonomy rather than *more* governance, the Decide-quadrant framing wins but the broader "rule layer for all agents" framing must be re-scoped to opt-in surfaces only (release, security review, PR approval). + +## 7. Lean Canvas — riskiest assumption + +Skipping the full canvas (internal/template work, no direct revenue model in scope). Riskiest assumption to flag for the critic: **that adopters perceive reproducible verdicts as more valuable than novel recommendations.** If they prefer the model's latest creativity over yesterday's identical answer, the North Star is wrong and the wedge collapses. + +## 8. Handoff to user-researcher + +Strategic context for upcoming interviews: Specorator's positioning hinges on auditability and human oversight of AI agents. The rule-engine POC is testing whether moving agent *decisions* (not just plumbing) into deterministic, spec'd artifacts is something adopters actively want — particularly enterprise evaluators and reviewers approving agent-recommended actions. Interview targets should include at least one OODA-Decide consumer (daily-brief user), one release-gate approver, and one enterprise evaluator. Validate JTBD #1 and #3 first; #2 depends on the OODA plugin existing. diff --git a/experiments/rule-engine-poc/research/04-technical-design.md b/experiments/rule-engine-poc/research/04-technical-design.md new file mode 100644 index 000000000..39551b772 --- /dev/null +++ b/experiments/rule-engine-poc/research/04-technical-design.md @@ -0,0 +1,313 @@ +--- +title: Rule Engine POC — Technical Design +folder: experiments/rule-engine-poc/research +description: Independent technical design for the "LLM extracts, rules decide" TypeScript proof of concept; types, DSL, evaluation, audit trail, determinism, integration into the OODA Decide quadrant. 
+entry_point: false +--- + +# Rule Engine POC — Technical Design + +This document proposes a design for a deterministic rule engine that consumes +LLM-produced extraction flags and emits a `Verdict` plus suggested `Act`s. It +slots into the **Decide** quadrant of the OODA plugin +(`docs/backlog/502-idea-ooda-loop-plugin-observe-orient-decide-act.md`). + +The guiding principle is *separation of judgment*: the LLM is allowed to be +fuzzy when classifying a situation; the rule layer is forbidden from being +fuzzy when deciding what to do about it. + +## 1. Type contracts + +```ts +// Pure data; no methods, no Dates, no functions. JSON-cloneable. + +export type FlagValue = string | number | boolean | null; + +export interface ExtractionFlag { + readonly key: string; // dot.path identifier, e.g. "claim.has_citation" + readonly value: FlagValue; + readonly confidence?: number; // 0..1, LLM-self-reported, optional + readonly source?: string; // span id or evidence pointer +} + +export interface ExtractionEnvelope { + readonly schemaVersion: 1; + readonly extractorId: string; // e.g. 
"claim-extractor@2026-05-17" + readonly flags: readonly ExtractionFlag[]; + readonly flagsHash: string; // sha256 of canonicalised flags +} + +export type Op = + | "eq" | "ne" | "gt" | "lt" + | "in" | "regex" | "exists"; + +export interface Predicate { + readonly op: Op; + readonly flag: string; // key under test + readonly value?: FlagValue | readonly FlagValue[]; // op-dependent +} + +export type WhenClause = + | Predicate + | { readonly all: readonly WhenClause[] } + | { readonly any: readonly WhenClause[] } + | { readonly not: WhenClause }; + +export type Severity = "info" | "low" | "medium" | "high" | "critical"; + +export interface Rule { + readonly id: string; // stable, unique, kebab-case + readonly version: string; // semver of authored content + readonly priority: number; // integer; higher = earlier + readonly severity: Severity; + readonly weight: number; // signed contribution to score + readonly when: WhenClause; + readonly act: { // recommended action template + readonly kind: string; // e.g. 
"request_citation" + readonly params?: Readonly>; + }; + readonly contentHash: string; // sha256 of canonicalised rule body +} + +export interface RuleEvaluation { + readonly ruleId: string; + readonly ruleVersion: string; + readonly ruleContentHash: string; + readonly order: number; // 0-based eval index + readonly matched: boolean; + readonly matchedPredicates: readonly { path: string; flag: string; op: Op; }[]; + readonly weightContribution: number; // weight if matched else 0 + readonly severity: Severity; + readonly act?: Rule["act"]; // present iff matched + readonly skippedReason?: "short_circuit" | "guard_failed" | "missing_flag"; +} + +export type Verdict = "pass" | "warn" | "block"; + +export interface VerdictResult { + readonly verdict: Verdict; + readonly score: number; // sum of weight contributions + readonly dominantSeverity: Severity; + readonly suggestedActs: readonly Rule["act"][]; + readonly evaluations: readonly RuleEvaluation[]; + readonly rulesetHash: string; + readonly flagsHash: string; + readonly engineVersion: string; // semver of the engine binary +} +``` + +Rationale: every field needed to replay a verdict is on the result. `readonly` +everywhere prevents accidental mutation between evaluation and serialisation. +Hashes (not timestamps) are the identity primitives — a verdict from May and +one from October are bit-identical if inputs are. + +## 2. 
DSL grammar — one example per operator + +```yaml +# rules/citations.yaml +ruleset: claim-quality +version: 0.3.0 +rules: + - id: missing-citation + version: 1.0.0 + priority: 100 + severity: high + weight: 10 + when: + all: + - { op: eq, flag: claim.kind, value: factual } + - { op: exists, flag: claim.citation_url } # negated below + - { not: { op: regex, flag: claim.citation_url, value: "^https?://" } } + act: { kind: request_citation, params: { reason: "no_http_url" } } + + - id: stale-source + version: 1.0.0 + priority: 90 + severity: medium + weight: 5 + when: + all: + - { op: gt, flag: source.age_days, value: 365 } + - { op: lt, flag: source.age_days, value: 3650 } + act: { kind: flag_stale } + + - id: banned-domain + version: 1.0.0 + priority: 200 + severity: critical + weight: 50 + when: + op: in + flag: source.domain + value: ["example.invalid", "spam.test"] + act: { kind: reject } + + - id: not-opinion + version: 1.0.0 + priority: 50 + severity: info + weight: 0 + when: + not: { op: eq, flag: claim.kind, value: opinion } + act: { kind: noop } + + - id: numeric-mismatch + version: 1.0.0 + priority: 80 + severity: medium + weight: 4 + when: + op: ne + flag: claim.year_stated + value: 2024 + act: { kind: verify_year } +``` + +Operators covered: `eq`, `ne`, `gt`, `lt`, `in`, `regex`, `exists`, plus +`all` / `any` / `not` grouping. No `else`, no loops, no expressions. The DSL +is intentionally weaker than JSONLogic — every rule is a finite decision tree. + +## 3. Evaluation algorithm + +``` +1. Load and validate ruleset (schema + duplicate-id check + regex precompile). +2. Sort rules by (priority desc, id asc). Tie-break by id is purely lexical. +3. For each rule in order: + a. Walk the WhenClause depth-first, left-to-right. + b. Short-circuit `all` on first false; `any` on first true. + c. A predicate against a missing flag evaluates to false and records + skippedReason="missing_flag" on the RuleEvaluation. + d. Append RuleEvaluation with `order = i`. +4. 
Aggregate: + score = sum of weightContribution + dominantSeverity = max severity among matched rules (info < low < medium < high < critical) +5. Verdict: + dominantSeverity == critical -> "block" + dominantSeverity in {high, medium} OR + score >= warnThreshold -> "warn" + else -> "pass" +``` + +**Ordering pick: priority desc → id asc.** Priority gives the rule author +explicit control; id-asc tiebreak is the only stable, content-free tiebreak +that does not depend on file order or YAML key order. + +**Conflict resolution: severity-first, weighted-sum as a secondary signal.** +Pure weighted-sum hides a `critical` finding under a pile of `info` rules; +pure severity-only loses signal when several mediums accumulate. The hybrid +keeps `critical` non-negotiable while letting score nudge the warn boundary. + +Trade-offs: this is *not* a Rete network; complexity is O(rules × predicates). +That is fine for the POC scale (≤10³ rules). Migrating to Rete later is a +performance concern, not a semantic one — the verdict contract stays stable. + +## 4. Audit trail + +Each `RuleEvaluation` is the audit record. The full `VerdictResult` is the +trail. Make it replayable by persisting three things together: + +``` +trail/<flagsHash>-<rulesetHash>.json + ├── input.envelope (ExtractionEnvelope verbatim) + ├── ruleset.snapshot (canonical JSON of the rules, not the YAML) + └── result (VerdictResult) +``` + +**Diffing two trails**: canonicalise both, then compare +`evaluations[i]` pairs. The first index where `(ruleId, matched, +weightContribution, matchedPredicates)` diverges is the regression point. +Surface only diverging entries — identical prefixes are noise. + +## 5. Determinism hazards + +| Hazard | Mitigation | +|---|---| +| `JSON.stringify` key order | Canonicalise: sort object keys, stringify numbers in shortest round-trip form, use `\n` only. | +| Async race | Engine is fully synchronous. IO happens at the edges (load YAML, write trail). | +| `Math.random` | Banned in engine code; lint rule. | +| `Date.now()` / `new Date()` | Engine never reads wall clock. 
Trail timestamps live in the *caller's* envelope, not the result. | +| Regex `lastIndex` | Always use fresh `RegExp` instances per evaluation, never `/g` or `/y`. | +| `Set` / `Map` iteration | Materialise to sorted arrays before iteration. | +| `Object.entries` | Use explicit `Object.keys(...).sort()`. | +| Floating-point sum order | Sort weight contributions by `(priority desc, id asc)` before summing (already the eval order). | +| Locale-sensitive compare | All string compares use code-point compare, not `localeCompare`. | + +## 6. Error handling + +- **Missing flag** referenced by a predicate: predicate is false, evaluation + records `skippedReason="missing_flag"`. Never throws. +- **Malformed rule file**: fail fast at load, before any evaluation, with the + offending YAML path and a JSON-Schema error. No partial rulesets. +- **Empty extraction**: legal input; produces a `VerdictResult` where every + rule with a predicate is skipped and verdict is `pass`. +- **Regex compile error**: rejected at load time; ruleset hash is never + computed for an invalid set. +- **Duplicate rule ids**: hard error at load. Determinism depends on `id` + being a primary key. +- **Unknown operator**: hard error at load; do not silently coerce. + +## 7. Integration sketch (OODA Decide) + +```mermaid +flowchart LR + O1["Observe<br/>raw text + context"] --> O2["Orient<br/>LLM extractor"] + O2 -- ExtractionEnvelope --> D1["Decide<br/>Rule engine"] + D1 -- VerdictResult --> A1["Act<br/>action dispatcher"] + D1 -. trail .-> TR[("trail store")] + + classDef llm fill:#eef,stroke:#557; + classDef det fill:#efe,stroke:#575; + class O2 llm + class D1,A1 det +``` + +IO contracts at each boundary: + +| Boundary | In | Out | +|---|---|---| +| Observe → Orient | `RawContext` (text, refs) | `RawContext` + LLM prompt | +| Orient → Decide | `ExtractionEnvelope` | — | +| Decide → Act | `VerdictResult` | `Act[]` dispatch tickets | +| Decide → Trail | `VerdictResult` + inputs | append-only file | + +The fuzzy/deterministic seam sits exactly at Orient→Decide. Anything past +that line is reproducible. + +## 8. Edge cases for the POC test suite + +1. Empty `flags` array → verdict `pass`, all evaluations carry `missing_flag`. +2. Two rules with identical priority — id-asc tiebreak verified. +3. Rule whose `when` references a flag absent from the envelope. +4. Regex with catastrophic backtracking input — bounded by a length guard. +5. `in` operator with a value list of one element (scalar-vs-array shape). +6. Negated `exists` (`not: { op: exists, flag: x }`) — falsy on present, null-valued flag. +7. Critical rule with `weight: 0` — still forces `block`. +8. Two acts with identical `(kind, params)` from different rules — deduped. +9. Identical inputs across two engine instances produce byte-identical JSON. +10. Adding a no-op `info` rule does not change the verdict of any prior case + (regression-safety property). + +## 9. Path from POC to production + +- **Priority queue / indexed predicates** — index rules by referenced flag + key; skip rule families when their guard flag is absent. O(rules) → O(hits). +- **Hot reload** — watch `rules/*.yaml`, recompute `rulesetHash`, swap + atomically between requests; in-flight evaluations finish on the old set. +- **Rule unit tests** — colocated `rule-id.spec.yaml` with input envelopes + and expected `RuleEvaluation` fragments; run in CI on every rule change. 
+- **Calibration harness** — replay a labelled corpus of envelopes; emit a + confusion matrix per severity bucket; gate merges on no-regression. +- **Shadow mode** — run a candidate ruleset alongside production, diff + trails, surface only divergences. +- **Rule lints** — unreachable rules (subsumed `when`), conflicting acts, + weight-without-act, severity-without-weight. +- **Versioned ruleset bundles** — publish `@org/rules-claim-quality@x.y.z` + with the same hash discipline as code dependencies. +- **Observability** — per-rule hit counters, p50/p95 evaluation time, + ruleset-hash gauge for deploy verification. +- **Policy review loop** — every weight or severity edit requires a recorded + rationale; the audit trail backfills calibration evidence. + +The intent is that production never has to change the contract in §1; it +only changes how rules are stored, indexed, deployed, and reviewed. diff --git a/experiments/rule-engine-poc/research/05-risks-critique.md b/experiments/rule-engine-poc/research/05-risks-critique.md new file mode 100644 index 000000000..a41eb48ad --- /dev/null +++ b/experiments/rule-engine-poc/research/05-risks-critique.md @@ -0,0 +1,122 @@ +# 05 — Risks & Critique: "LLM extracts, rules decide" + +Critic review of the POC concept inside the OODA Decide quadrant. The author of the +Reddit thread is mostly right *that* deterministic post-processing helps; he is +wrong about how cheaply it scales and how clean the seam between LLM and rules +stays. This document surfaces what we will regret if we ship his framing +verbatim. + +## 1. 
The author's blindspots + +The post concedes flexibility cost ("less flexible than letting the LLM 'reason' +freely") but glosses over why teams keep shipping LLM-as-judge anyway: + +- **Adaptability under distribution shift.** When the input domain moves (new + claim type, new jargon, new attack pattern), an LLM judge degrades gradually; + a rule engine fails silently — the schema slot is just empty, and the rule + fires with a default. The author treats schema fit as a static property. +- **Cost & latency cut both ways.** Rules are cheap *per call*, but a two-stage + pipeline doubles round-trips and forces the LLM call to be structured (slower, + more tokens for JSON scaffolding, retries on schema-invalid output). At low + QPS the deterministic layer is a tax, not a saving. +- **Maintenance asymmetry.** "Changing [weights] requires deliberate engineering + rather than retraining" is sold as a feature. It is also a staffing + commitment: a human owns every weight forever. LLM judges amortise that cost + onto the model provider. +- **The schema *is* the model.** Extraction categories embed the same + assumptions weights do — the author treats extraction as neutral and scoring + as opinionated. Both are opinionated. + +## 2. Where deterministic-on-top-of-LLM degrades + +- **Schema miss → false negative laundering.** When extraction cannot represent + a claim ("awkward outputs"), the rule engine produces a confident verdict from + a malformed input. The audit trail looks rigorous; the input was garbage. +- **Expert-system rot.** Rule sets accumulate exceptions until no one + understands the interaction matrix (see §4). Drools shops have lived this. +- **Weighting as a second black box.** Stakeholders accept "the LLM said so" + *or* "the rule said so." They reject "the LLM extracted X, which the rule + weighted 0.3, which combined with Y at 0.7 crossed threshold 0.55." The + formalism does not earn trust; it diffuses blame. 
+- **Semantic drift.** The LLM prompt evolves (new few-shot examples, model + upgrades); the rule schema does not. After 6 months the field `evidence_type: + primary` means something subtly different than when the rules were written. + Nothing alerts on this. + +## 3. OODA orchestrator-specific risks + +- **Decide becomes a bottleneck for new Observe signals.** Every new flag type + needs a rule, a weight, and a regression test. The orchestrator's velocity + collapses to rule-authoring velocity. +- **Rules nobody wants to own.** Rule engineering is unglamorous; it will + silently fall to whoever lost the standup. Expect bus-factor 1. +- **False confidence from audit trails.** A Decide verdict with a 12-step trace + *feels* defensible in retro even when the trace is post-hoc rationalisation of + an LLM hallucination upstream. +- **Meta-rule explosion.** Once you have >50 rules you need rules about rules + (priority, conflict resolution, suppression windows). This is exactly the + trap RETE-era systems hit. + +## 4. Historical analogies we should not re-run + +- **1980s expert systems winter.** Cyc, MYCIN successors — knowledge + acquisition bottleneck killed them, not inference performance. +- **Drools complexity creep.** Production rule bases routinely hit 10k+ rules + with no one able to predict firing order; debugging becomes archaeology. +- **OPA/Rego adoption pain.** Even with a clean language, policy authors are + scarce; non-trivial policies require a DSL specialist on call. +- **BPMN promised business users would model processes.** They did not. The + same will be true of "PMs will write the rules." + +## 5. POC anti-goals (do NOT do) + +- **Do not ship a YAML-only DSL as the default authoring surface.** YAML lacks + types; weight typos become silent verdict shifts. If we want declarative, + pair it with a schema + CI validator from day one. 
+- **Do not conflate *verdict* with *action selection*.** "Claim is unsupported" + and "ask user a clarifying question" are different decisions; collapsing them + bakes in policy that should live in Act. +- **Do not allow undocumented weight changes.** Every weight change rides an + ADR-lite note with the falsification case that motivated it. Otherwise + weights drift into folklore within two quarters. +- **Do not hide LLM extraction confidence from the rule layer.** Rules must be + able to refuse to fire on low-confidence extractions; otherwise §2 bullet 1 + is inevitable. +- **Do not auto-generate rules from LLM suggestions without human gate.** That + closes the loop we built the rule engine to open. + +## 6. Three riskiest assumptions — RAT designs + +**RAT-1 — Extraction schema covers the real claim distribution.** +*Assumption:* The LLM can map >90% of real OODA Observe inputs into the rule +schema without the "awkward output" failure mode. +*Falsification:* Sample 50 real Observe payloads from existing logs (or +synthesise from issue history). Run extraction. **Refuted if ≥10 produce +schema-invalid output or require a free-text `notes` escape hatch to be +faithful.** This is the cheapest test and the one most likely to kill the +concept. + +**RAT-2 — Deterministic verdicts are more trusted than LLM-judge verdicts by +the humans who consume them.** +*Assumption:* Stakeholders prefer rule-engine traces over LLM rationales. +*Falsification:* Show 5 reviewers the same 10 cases, half with LLM rationale, +half with rule trace. Ask which they would sign off on without further review. +**Refuted if <60% prefer the rule trace, or if rule-trace cases generate *more* +follow-up questions than LLM-rationale cases.** The author assumes the trace +sells itself; test it. 
+**RAT-3 — Rule authoring is sustainable at the orchestrator's evolution rate.** +*Assumption:* A non-specialist team member can add a new rule for a new signal +type in <2 hours, end-to-end, including tests. +*Falsification:* Pick two engineers who did not build the POC. Give each a +realistic new-signal scenario. Time them. **Refuted if median time >4 hours, +or if either produces a rule that fails the existing regression suite, or if +either gives up and asks the POC author for help.** If rule authoring needs a +specialist, §3 bullet 2 lands within one quarter of shipping. + +--- + +**Recommendation:** Proceed to RAT-1 before any further engineering. If RAT-1 +refutes, the rest of the POC is academic. Default verdict if RATs are skipped: +**no-go.** Discovery exists to kill bad ideas cheaply; this idea is plausible +but not yet evidenced. diff --git a/experiments/rule-engine-poc/rules/quality-gates.yaml b/experiments/rule-engine-poc/rules/quality-gates.yaml new file mode 100644 index 000000000..0de0c4ae7 --- /dev/null +++ b/experiments/rule-engine-poc/rules/quality-gates.yaml @@ -0,0 +1,404 @@ +# Quality-gate rules for Specorator features. +# +# These rules encode the per-stage Definition of Done from +# docs/quality-framework.md as machine-checkable rules. An LLM extracts +# structured flags from a feature folder (specs/<feature>/); this rule +# engine deterministically maps flags to a verdict + suggested actions. +# +# Severity ordering: blocked > needs-attention > ready-to-progress. +# Within a tier, weight determines how loudly an action is recommended. + +# ---------------------------------------------------------------------- +# Cross-cutting severity gates +# ---------------------------------------------------------------------- + +- id: any-s1-finding-blocks + description: Any S1 (critical) finding blocks all progression. 
+ stage: any + priority: 200 + when: + all: + - flag: s1_findings_count + gt: 0 + then: + verdict: blocked + weight: 200 + actions: + - triage-s1-finding + tags: [severity, gate] + +- id: any-s2-finding-needs-attention + description: S2 (high) finding requires attention this sprint. + stage: any + priority: 100 + when: + all: + - flag: s2_findings_count + gt: 0 + then: + verdict: needs-attention + weight: 60 + actions: + - schedule-s2-fix + tags: [severity] + +- id: open-clarifications-block + description: Open clarifications must be resolved before stage progression. + stage: any + priority: 95 + when: + all: + - flag: open_clarifications_count + gt: 0 + then: + verdict: blocked + weight: 95 + actions: + - resolve-clarifications + tags: [gate] + +- id: blockers-block + description: Open blockers must be cleared. + stage: any + priority: 95 + when: + all: + - flag: blockers_count + gt: 0 + then: + verdict: blocked + weight: 95 + actions: + - clear-blockers + tags: [gate] + +# ---------------------------------------------------------------------- +# Stage: Idea +# ---------------------------------------------------------------------- + +- id: idea-problem-statement-present + description: Idea must have a one-paragraph problem statement. + stage: idea + priority: 80 + when: + all: + - flag: current_stage + eq: idea + - flag: idea_problem_statement_present + eq: false + then: + verdict: blocked + weight: 80 + actions: + - draft-problem-statement + tags: [dod, idea] + +- id: idea-scope-bounded + description: Idea scope must be bounded (no "boil the ocean" framing). + stage: idea + priority: 70 + when: + all: + - flag: current_stage + eq: idea + - flag: idea_scope_bounded + eq: false + then: + verdict: needs-attention + weight: 40 + actions: + - tighten-idea-scope + tags: [dod, idea] + +- id: idea-ready + description: Idea DoD satisfied — ready for /spec:research. 
+ stage: idea + priority: 10 + when: + all: + - flag: current_stage + eq: idea + - flag: idea_problem_statement_present + eq: true + - flag: idea_target_users_named + eq: true + - flag: idea_scope_bounded + eq: true + then: + verdict: ready-to-progress + weight: 100 + actions: + - advance-to-research + tags: [dod, idea] + +# ---------------------------------------------------------------------- +# Stage: Requirements +# ---------------------------------------------------------------------- + +- id: req-ears-mandatory + description: All functional requirements must use EARS notation. + stage: requirements + priority: 100 + when: + all: + - flag: current_stage + in: [requirements, design, specification, tasks, implementation, testing, review] + - flag: requirements_ears_coverage + lt: 1.0 + then: + verdict: blocked + weight: 100 + actions: + - rewrite-non-ears-requirements + tags: [dod, requirements, gate] + +- id: req-has-stable-ids + description: Each requirement must have a stable REQ--NNN id. + stage: requirements + priority: 90 + when: + all: + - flag: current_stage + in: [requirements, design, specification, tasks, implementation, testing, review] + - flag: requirements_have_stable_ids + eq: false + then: + verdict: blocked + weight: 90 + actions: + - assign-requirement-ids + tags: [dod, requirements, traceability] + +- id: req-acceptance-testable + description: Acceptance criteria must be testable. + stage: requirements + priority: 80 + when: + all: + - flag: current_stage + eq: requirements + - flag: requirements_acceptance_criteria_testable + eq: false + then: + verdict: needs-attention + weight: 50 + actions: + - rewrite-acceptance-criteria + tags: [dod, requirements, testability] + +# ---------------------------------------------------------------------- +# Stage: Design +# ---------------------------------------------------------------------- + +- id: design-irreversible-needs-adr + description: Irreversible architectural decisions must have ADRs. 
+ stage: design + priority: 90 + when: + all: + - flag: current_stage + in: [design, specification, tasks, implementation] + - flag: design_irreversible_have_adrs + eq: false + then: + verdict: blocked + weight: 90 + actions: + - file-missing-adrs + tags: [dod, design, governance] + +- id: design-risks-have-mitigations + description: Identified risks must have mitigations. + stage: design + priority: 70 + when: + all: + - flag: current_stage + eq: design + - flag: design_risks_have_mitigations + eq: false + then: + verdict: needs-attention + weight: 40 + actions: + - propose-risk-mitigations + tags: [dod, design] + +# ---------------------------------------------------------------------- +# Stage: Specification +# ---------------------------------------------------------------------- + +- id: spec-items-trace-to-requirements + description: Each spec item must trace to >= 1 requirement. + stage: specification + priority: 90 + when: + all: + - flag: current_stage + in: [specification, tasks, implementation, testing, review] + - flag: spec_each_item_traces_to_requirement + eq: false + then: + verdict: blocked + weight: 90 + actions: + - link-spec-items-to-requirements + tags: [dod, specification, traceability] + +# ---------------------------------------------------------------------- +# Stage: Implementation +# ---------------------------------------------------------------------- + +- id: impl-lint-clean + description: Implementation must be lint clean. + stage: implementation + priority: 90 + when: + all: + - flag: current_stage + in: [implementation, testing, review] + - flag: implementation_lint_clean + eq: false + then: + verdict: blocked + weight: 90 + actions: + - fix-lint-errors + tags: [dod, implementation] + +- id: impl-types-clean + description: TypeScript / type checks must pass. 
+ stage: implementation + priority: 90 + when: + all: + - flag: current_stage + in: [implementation, testing, review] + - flag: implementation_types_clean + eq: false + then: + verdict: blocked + weight: 90 + actions: + - fix-type-errors + tags: [dod, implementation] + +- id: impl-unit-tests-pass + description: Unit tests for the changed surface must pass. + stage: implementation + priority: 90 + when: + all: + - flag: current_stage + in: [implementation, testing, review] + - flag: implementation_unit_tests_pass + eq: false + then: + verdict: blocked + weight: 90 + actions: + - fix-failing-tests + tags: [dod, implementation] + +- id: impl-ready + description: Implementation DoD satisfied — ready for /spec:test. + stage: implementation + priority: 10 + when: + all: + - flag: current_stage + eq: implementation + - flag: implementation_lint_clean + eq: true + - flag: implementation_types_clean + eq: true + - flag: implementation_unit_tests_pass + eq: true + - flag: open_clarifications_count + eq: 0 + - flag: s1_findings_count + eq: 0 + then: + verdict: ready-to-progress + weight: 100 + actions: + - advance-to-testing + tags: [dod, implementation] + +# ---------------------------------------------------------------------- +# Stage: Testing +# ---------------------------------------------------------------------- + +- id: testing-ears-coverage-incomplete + description: Every EARS clause must have >= 1 test. + stage: testing + priority: 95 + when: + all: + - flag: current_stage + in: [testing, review] + - flag: testing_ears_test_coverage + lt: 1.0 + then: + verdict: blocked + weight: 95 + actions: + - add-missing-tests + tags: [dod, testing, traceability] + +- id: testing-critical-paths-uncovered + description: Critical paths (happy + key edge cases) must be covered. 
+ stage: testing + priority: 80 + when: + all: + - flag: current_stage + in: [testing, review] + - flag: testing_critical_paths_covered + eq: false + then: + verdict: blocked + weight: 80 + actions: + - cover-critical-paths + tags: [dod, testing] + +# ---------------------------------------------------------------------- +# Stage: Review +# ---------------------------------------------------------------------- + +- id: review-traceability-incomplete + description: Traceability matrix must be complete and consistent. + stage: review + priority: 90 + when: + all: + - flag: current_stage + eq: review + - flag: review_traceability_complete + eq: false + then: + verdict: blocked + weight: 90 + actions: + - regenerate-traceability + tags: [dod, review, traceability] + +- id: review-brand-required-but-missing + description: Brand review required (touches sites/, UI surfaces) but not posted. + stage: review + priority: 85 + when: + all: + - flag: current_stage + eq: review + - flag: brand_review_required + eq: true + - flag: brand_review_passed + eq: false + then: + verdict: blocked + weight: 85 + actions: + - request-brand-review + tags: [dod, review, brand] diff --git a/experiments/rule-engine-poc/scripts/run-all-fixtures.mjs b/experiments/rule-engine-poc/scripts/run-all-fixtures.mjs new file mode 100644 index 000000000..50412f68d --- /dev/null +++ b/experiments/rule-engine-poc/scripts/run-all-fixtures.mjs @@ -0,0 +1,23 @@ +import { readdirSync } from "node:fs"; +import { join } from "node:path"; +import { spawnSync } from "node:child_process"; + +const fixturesDir = new URL("../fixtures/", import.meta.url).pathname; +const rules = "rules/quality-gates.yaml"; + +const files = readdirSync(fixturesDir) + .filter((f) => f.endsWith(".json")) + .sort(); + +let failures = 0; +for (const f of files) { + const flags = join("fixtures", f); + const r = spawnSync("npx", ["tsx", "src/cli.ts", rules, flags], { + stdio: "inherit", + }); + if (r.status !== 0 && r.status !== 1) { + 
failures += 1; + } +} + +process.exit(failures > 0 ? 2 : 0); diff --git a/experiments/rule-engine-poc/scripts/run-all-html.mjs b/experiments/rule-engine-poc/scripts/run-all-html.mjs new file mode 100644 index 000000000..551bd3918 --- /dev/null +++ b/experiments/rule-engine-poc/scripts/run-all-html.mjs @@ -0,0 +1,29 @@ +import { readdirSync, mkdirSync } from "node:fs"; +import { join, basename } from "node:path"; +import { spawnSync } from "node:child_process"; + +const fixturesDir = new URL("../fixtures/", import.meta.url).pathname; +const rules = "rules/quality-gates.yaml"; +const reportsDir = "reports"; + +mkdirSync(reportsDir, { recursive: true }); + +const files = readdirSync(fixturesDir) + .filter((f) => f.endsWith(".json")) + .sort(); + +for (const f of files) { + const flags = join("fixtures", f); + const out = join(reportsDir, basename(f, ".json") + ".html"); + const r = spawnSync( + "npx", + ["tsx", "src/cli.ts", rules, flags, "--html", out, "--quiet"], + { stdio: "inherit" }, + ); + // Engine returns exit 1 for blocked verdicts; that's expected. + if (r.status !== 0 && r.status !== 1) { + process.exit(r.status ?? 
2); + } +} + +console.log(`Wrote ${files.length} HTML reports to ${reportsDir}/`); diff --git a/experiments/rule-engine-poc/src/cli.ts b/experiments/rule-engine-poc/src/cli.ts new file mode 100644 index 000000000..f56161237 --- /dev/null +++ b/experiments/rule-engine-poc/src/cli.ts @@ -0,0 +1,140 @@ +#!/usr/bin/env node +import { readFileSync, writeFileSync, mkdirSync } from "node:fs"; +import { dirname, resolve } from "node:path"; +import { evaluate } from "./engine.js"; +import { renderHtmlReport } from "./html-report.js"; +import { loadRulesFromFile } from "./loader.js"; +import type { ExtractionFlags, VerdictResult } from "./types.js"; + +const argv = process.argv.slice(2); + +function takeOption(flag: string): string | undefined { + const i = argv.indexOf(flag); + if (i === -1) return undefined; + const value = argv[i + 1]; + argv.splice(i, 2); + return value; +} + +const jsonMode = (() => { + const i = argv.indexOf("--json"); + if (i === -1) return false; + argv.splice(i, 1); + return true; +})(); +const quietMode = (() => { + const i = argv.indexOf("--quiet"); + if (i === -1) return false; + argv.splice(i, 1); + return true; +})(); +const htmlPath = takeOption("--html"); + +const [rulesPath, flagsPath] = argv; + +if (!rulesPath || !flagsPath) { + console.error( + "Usage: rule-engine-poc [--json] [--html ] [--quiet]", + ); + process.exit(2); +} + +const rules = loadRulesFromFile(rulesPath); +const flags: ExtractionFlags = JSON.parse( + readFileSync(resolve(flagsPath), "utf8"), +); +const result = evaluate(rules, flags); + +if (htmlPath) { + const abs = resolve(htmlPath); + mkdirSync(dirname(abs), { recursive: true }); + const html = renderHtmlReport(result, { + rulesPath, + flagsPath, + flags, + generatedAt: new Date().toISOString(), + }); + writeFileSync(abs, html, "utf8"); + if (!quietMode) console.error(`HTML report written to ${abs}`); +} + +if (jsonMode) { + console.log(JSON.stringify(result, null, 2)); +} else if (!quietMode) { + printReport(result, 
rulesPath, flagsPath); +} + +// Exit code communicates the verdict to a calling shell / CI. +const exitMap: Record = { + blocked: 1, + "needs-attention": 0, + "ready-to-progress": 0, + unknown: 0, +}; +process.exitCode = exitMap[result.verdict]; + +function printReport( + r: VerdictResult, + rulesPathArg: string, + flagsPathArg: string, +): void { + const banner = + "============================================================"; + console.log(""); + console.log(banner); + console.log(` VERDICT: ${r.verdict.toUpperCase()}`); + console.log(banner); + console.log(""); + console.log(` rules: ${rulesPathArg}`); + console.log(` flags: ${flagsPathArg}`); + console.log(""); + + console.log(" Weighted tally:"); + for (const v of ["blocked", "needs-attention", "ready-to-progress", "unknown"] as const) { + console.log(` ${v.padEnd(20)} ${r.weightedTally[v]}`); + } + console.log(""); + + console.log(" Suggested actions:"); + if (r.actions.length === 0) { + console.log(" (none)"); + } else { + for (const a of r.actions) console.log(` - ${a}`); + } + console.log(""); + + console.log(" Audit trail (deterministic order: priority desc, id asc):"); + for (const ev of r.evaluations) { + const mark = ev.matched ? "[+]" : "[ ]"; + console.log( + ` ${mark} prio=${String(ev.rule.priority).padStart(3)} ${ev.rule.id} (hash ${ev.rule.hash.slice(0, 12)})`, + ); + for (const c of ev.conditions) { + const cmark = c.matched ? 
"[+]" : "[-]"; + const op: string[] = []; + if (c.condition.eq !== undefined) op.push(`eq=${JSON.stringify(c.condition.eq)}`); + if (c.condition.ne !== undefined) op.push(`ne=${JSON.stringify(c.condition.ne)}`); + if (c.condition.gt !== undefined) op.push(`gt=${c.condition.gt}`); + if (c.condition.lt !== undefined) op.push(`lt=${c.condition.lt}`); + if (c.condition.in !== undefined) op.push(`in=${JSON.stringify(c.condition.in)}`); + if (c.condition.regex !== undefined) op.push(`regex=${c.condition.regex}`); + if (c.condition.exists !== undefined) op.push(`exists=${c.condition.exists}`); + const reason = c.reason ? ` (${c.reason})` : ""; + console.log( + ` ${cmark} ${c.condition.flag} ${op.join(" ")} -> observed=${JSON.stringify(c.observed)}${reason}`, + ); + } + if (ev.contribution) { + console.log( + ` => verdict=${ev.contribution.verdict} weight=${ev.contribution.weight} actions=${JSON.stringify(ev.contribution.actions)}`, + ); + } + } + console.log(""); + + console.log(" Provenance (for replay):"); + console.log(` engineVersion: ${r.engineVersion}`); + console.log(` rulesetHash: ${r.rulesetHash}`); + console.log(` flagsHash: ${r.flagsHash}`); + console.log(""); +} diff --git a/experiments/rule-engine-poc/src/engine.ts b/experiments/rule-engine-poc/src/engine.ts new file mode 100644 index 000000000..8734f6beb --- /dev/null +++ b/experiments/rule-engine-poc/src/engine.ts @@ -0,0 +1,176 @@ +import { canonicalJson, sha256 } from "./hash.js"; +import type { + Condition, + ConditionResult, + ExtractionFlags, + FlagValue, + LoadedRule, + RuleEvaluation, + Verdict, + VerdictResult, + WeightedTally, + WhenClause, +} from "./types.js"; + +export const ENGINE_VERSION = "0.1.0"; + +// Severity ordering: a higher-severity verdict wins over a lower-severity one, +// regardless of weight. Weight only breaks ties within a tier and informs +// action prioritisation. This mirrors the article's "scoring tier" idea — +// verdicts are categorical, not numeric. 
+const SEVERITY_ORDER: readonly Verdict[] = [ + "blocked", + "needs-attention", + "ready-to-progress", + "unknown", +] as const; + +function deepEqual(a: unknown, b: unknown): boolean { + if (a === b) return true; + if (Array.isArray(a) && Array.isArray(b)) { + if (a.length !== b.length) return false; + return a.every((v, i) => deepEqual(v, b[i])); + } + return false; +} + +function evaluateCondition( + c: Condition, + flags: ExtractionFlags, +): ConditionResult { + const present = Object.prototype.hasOwnProperty.call(flags, c.flag); + const observed: FlagValue | undefined = present ? flags[c.flag] : undefined; + + if (c.exists !== undefined) { + const matched = present === c.exists; + return { condition: c, matched, observed }; + } + + if (!present) { + return { + condition: c, + matched: false, + observed, + reason: "flag missing in extraction", + }; + } + + let matched = true; + if (c.eq !== undefined) matched = matched && deepEqual(observed, c.eq); + if (c.ne !== undefined) matched = matched && !deepEqual(observed, c.ne); + if (c.gt !== undefined) { + matched = matched && typeof observed === "number" && observed > c.gt; + } + if (c.lt !== undefined) { + matched = matched && typeof observed === "number" && observed < c.lt; + } + if (c.in !== undefined) { + matched = matched && c.in.some((v) => deepEqual(observed, v)); + } + if (c.regex !== undefined) { + matched = + matched && + typeof observed === "string" && + new RegExp(c.regex).test(observed); + } + + return { condition: c, matched, observed }; +} + +function evaluateWhen( + when: WhenClause, + flags: ExtractionFlags, +): { matched: boolean; conditions: ConditionResult[] } { + const conditions: ConditionResult[] = []; + let matched = true; + + if (when.all) { + for (const c of when.all) { + const r = evaluateCondition(c, flags); + conditions.push(r); + if (!r.matched) matched = false; + } + } + if (when.any) { + const anyResults = when.any.map((c) => evaluateCondition(c, flags)); + 
conditions.push(...anyResults); + if (anyResults.length > 0 && !anyResults.some((r) => r.matched)) { + matched = false; + } + } + if (when.not) { + for (const c of when.not) { + const r = evaluateCondition(c, flags); + // Record the inverted result so the audit trail shows what we required. + conditions.push({ ...r, matched: !r.matched }); + if (r.matched) matched = false; + } + } + + return { matched, conditions }; +} + +function emptyTally(): WeightedTally { + return { + blocked: 0, + "needs-attention": 0, + "ready-to-progress": 0, + unknown: 0, + }; +} + +export function evaluate( + rules: LoadedRule[], + flags: ExtractionFlags, +): VerdictResult { + // Deterministic evaluation order: priority desc, then id asc. + const sorted = [...rules].sort((a, b) => { + if (b.priority !== a.priority) return b.priority - a.priority; + return a.id.localeCompare(b.id); + }); + + const evaluations: RuleEvaluation[] = []; + const tally = emptyTally(); + const actionSet = new Set(); + + for (const rule of sorted) { + const { matched, conditions } = evaluateWhen(rule.when, flags); + const ev: RuleEvaluation = { rule, matched, conditions }; + if (matched) { + tally[rule.then.verdict] += rule.then.weight; + ev.contribution = { + verdict: rule.then.verdict, + weight: rule.then.weight, + actions: rule.then.actions, + }; + for (const a of rule.then.actions) actionSet.add(a); + } + evaluations.push(ev); + } + + // Severity-first: pick the highest-severity tier with any weight. 
+ let verdict: Verdict = "unknown"; + for (const v of SEVERITY_ORDER) { + if (tally[v] > 0) { + verdict = v; + break; + } + } + + const actions = [...actionSet].sort(); + + const rulesetHash = sha256( + canonicalJson(sorted.map((r) => ({ id: r.id, hash: r.hash }))), + ); + const flagsHash = sha256(canonicalJson(flags)); + + return { + verdict, + weightedTally: tally, + actions, + evaluations, + rulesetHash, + flagsHash, + engineVersion: ENGINE_VERSION, + }; +} diff --git a/experiments/rule-engine-poc/src/hash.ts b/experiments/rule-engine-poc/src/hash.ts new file mode 100644 index 000000000..ceb54529e --- /dev/null +++ b/experiments/rule-engine-poc/src/hash.ts @@ -0,0 +1,24 @@ +import { createHash } from "node:crypto"; + +export function sha256(input: string): string { + return createHash("sha256").update(input).digest("hex"); +} + +// Stable JSON: object keys sorted alphabetically, arrays preserved in order. +// Guarantees the same value always serialises to the same string, +// which is the foundation of audit-trail reproducibility. 
+export function canonicalJson(value: unknown): string { + return JSON.stringify(sortKeys(value)); +} + +function sortKeys(value: unknown): unknown { + if (Array.isArray(value)) return value.map(sortKeys); + if (value && typeof value === "object") { + const out: Record = {}; + for (const key of Object.keys(value as object).sort()) { + out[key] = sortKeys((value as Record)[key]); + } + return out; + } + return value; +} diff --git a/experiments/rule-engine-poc/src/html-report.ts b/experiments/rule-engine-poc/src/html-report.ts new file mode 100644 index 000000000..2c335c3fe --- /dev/null +++ b/experiments/rule-engine-poc/src/html-report.ts @@ -0,0 +1,278 @@ +import type { VerdictResult, Verdict } from "./types.js"; + +interface RenderContext { + rulesPath: string; + flagsPath: string; + flags: Record; + generatedAt: string; +} + +const VERDICT_PALETTE: Record< + Verdict, + { bg: string; border: string; fg: string; label: string } +> = { + blocked: { + bg: "#fdecea", + border: "#d8281b", + fg: "#7a160d", + label: "Blocked", + }, + "needs-attention": { + bg: "#fff4e0", + border: "#d18900", + fg: "#6c4400", + label: "Needs attention", + }, + "ready-to-progress": { + bg: "#e6f6ec", + border: "#1f8a4c", + fg: "#114a29", + label: "Ready to progress", + }, + unknown: { + bg: "#eef0f3", + border: "#737884", + fg: "#3a3d44", + label: "Unknown", + }, +}; + +function esc(value: unknown): string { + return String(value) + .replace(/&/g, "&") + .replace(//g, ">") + .replace(/"/g, """) + .replace(/'/g, "'"); +} + +function fmtJson(value: unknown): string { + return esc(JSON.stringify(value)); +} + +function conditionSummary(c: import("./types.js").Condition): string { + const parts: string[] = []; + if (c.eq !== undefined) parts.push(`eq=${fmtJson(c.eq)}`); + if (c.ne !== undefined) parts.push(`ne=${fmtJson(c.ne)}`); + if (c.gt !== undefined) parts.push(`gt=${c.gt}`); + if (c.lt !== undefined) parts.push(`lt=${c.lt}`); + if (c.in !== undefined) parts.push(`in=${fmtJson(c.in)}`); 
+ if (c.regex !== undefined) parts.push(`regex=${esc(c.regex)}`); + if (c.exists !== undefined) parts.push(`exists=${c.exists}`); + return parts.join(" "); +} + +export function renderHtmlReport( + result: VerdictResult, + ctx: RenderContext, +): string { + const palette = VERDICT_PALETTE[result.verdict]; + const matchedCount = result.evaluations.filter((e) => e.matched).length; + const totalCount = result.evaluations.length; + + const tallyRows = ( + ["blocked", "needs-attention", "ready-to-progress", "unknown"] as Verdict[] + ) + .map((v) => { + const w = result.weightedTally[v]; + return `${VERDICT_PALETTE[v].label}${w}`; + }) + .join(""); + + const actionItems = result.actions.length + ? result.actions.map((a) => `
  • ${esc(a)}
  • `).join("") + : `
  • (none — no rule contributed an action)
  • `; + + const flagRows = Object.keys(ctx.flags) + .sort() + .map( + (k) => + `${esc(k)}${fmtJson((ctx.flags as Record)[k])}`, + ) + .join(""); + + const evaluations = result.evaluations + .map((ev) => { + const cls = ev.matched ? "rule rule--matched" : "rule rule--skipped"; + const status = ev.matched ? "MATCHED" : "did not match"; + const conds = ev.conditions + .map((c) => { + const cls2 = c.matched ? "cond cond--ok" : "cond cond--miss"; + const reason = c.reason + ? ` (${esc(c.reason)})` + : ""; + return `
  • ${esc(c.condition.flag)} ${conditionSummary(c.condition)} → observed=${fmtJson(c.observed)}${reason}
  • `; + }) + .join(""); + const contribution = ev.contribution + ? `

    Contributes ${VERDICT_PALETTE[ev.contribution.verdict].label} with weight ${ev.contribution.weight}. Actions: ${ev.contribution.actions.map((a) => `${esc(a)}`).join(", ")}.

    ` + : ""; + const stage = ev.rule.stage + ? `stage: ${esc(ev.rule.stage)}` + : ""; + const tags = (ev.rule.tags ?? []) + .map((t) => `${esc(t)}`) + .join(" "); + return ` +
    +
    + ${status} +

    ${esc(ev.rule.id)}

    +

    ${esc(ev.rule.description)}

    +

    + priority: ${ev.rule.priority} + ${stage} + ${tags} + ${esc(ev.rule.hash.slice(0, 12))} +

    +
    +
      ${conds}
    + ${contribution} +
    `; + }) + .join(""); + + return ` + + + + + Rule engine report — ${esc(ctx.flagsPath)} + + + +
    +

    Rule engine report

    +

    Deterministic verdict from extracted flags — "LLM extracts, rules decide".

    +
    + +
    +
    Verdict
    +
    ${esc(palette.label)}
    +

    ${matchedCount} of ${totalCount} rules matched · ${result.actions.length} suggested action${result.actions.length === 1 ? "" : "s"}

    +
    + +
    +
    +

    Weighted tally

    + + + ${tallyRows} +
    Verdict tierWeight
    +
    + +
    +

    Suggested actions

    +
      ${actionItems}
    +
    +
    + +
    +

    Extraction flags (input from the Orient quadrant)

    + + + ${flagRows} +
    FlagValue
    +
    + +
    +

    Audit trail (deterministic order: priority desc, id asc)

    + ${evaluations} +
    + +
    +

    Provenance

    +

    + Engine version: ${esc(result.engineVersion)}
    + Ruleset hash: ${esc(result.rulesetHash)}
    + Flags hash: ${esc(result.flagsHash)}
    + Rules file: ${esc(ctx.rulesPath)}
    + Flags file: ${esc(ctx.flagsPath)} +

    +
    + +
    + Generated ${esc(ctx.generatedAt)} · experiments/rule-engine-poc · same inputs → identical report, byte-for-byte (except this timestamp). +
    + + +`; +} diff --git a/experiments/rule-engine-poc/src/loader.ts b/experiments/rule-engine-poc/src/loader.ts new file mode 100644 index 000000000..43ee94b8b --- /dev/null +++ b/experiments/rule-engine-poc/src/loader.ts @@ -0,0 +1,80 @@ +import { readFileSync } from "node:fs"; +import { resolve } from "node:path"; +import yaml from "js-yaml"; +import { canonicalJson, sha256 } from "./hash.js"; +import type { LoadedRule, Rule } from "./types.js"; + +export function loadRulesFromFile(filePath: string): LoadedRule[] { + const abs = resolve(filePath); + const raw = readFileSync(abs, "utf8"); + const parsed = yaml.load(raw); + if (!Array.isArray(parsed)) { + throw new Error(`Rule file ${filePath} must be a YAML list of rules`); + } + return parsed.map((rule, index) => + normalize(rule as Rule, abs, index), + ); +} + +export function loadRulesFromString( + source: string, + sourceLabel = "", +): LoadedRule[] { + const parsed = yaml.load(source); + if (!Array.isArray(parsed)) { + throw new Error(`Rule source must be a YAML list of rules`); + } + return parsed.map((rule, index) => + normalize(rule as Rule, sourceLabel, index), + ); +} + +function normalize( + rule: Rule, + sourceFile: string, + sourceIndex: number, +): LoadedRule { + validate(rule, sourceFile, sourceIndex); + const hashable = { + id: rule.id, + priority: rule.priority, + when: rule.when, + then: rule.then, + }; + const hash = sha256(canonicalJson(hashable)); + return { ...rule, hash, sourceFile, sourceIndex }; +} + +function validate( + rule: Rule, + sourceFile: string, + sourceIndex: number, +): void { + const where = `${sourceFile}#${sourceIndex}`; + if (!rule.id || typeof rule.id !== "string") { + throw new Error(`Rule at ${where} missing required string 'id'`); + } + if (!rule.when || typeof rule.when !== "object") { + throw new Error(`Rule '${rule.id}' missing 'when' clause`); + } + if (!rule.when.all && !rule.when.any && !rule.when.not) { + throw new Error( + `Rule '${rule.id}' must have at least 
one of 'when.all', 'when.any', 'when.not'`, + ); + } + if (!rule.then || typeof rule.then !== "object") { + throw new Error(`Rule '${rule.id}' missing 'then' clause`); + } + if (!rule.then.verdict) { + throw new Error(`Rule '${rule.id}' missing 'then.verdict'`); + } + if (typeof rule.then.weight !== "number") { + throw new Error(`Rule '${rule.id}' missing numeric 'then.weight'`); + } + if (!Array.isArray(rule.then.actions)) { + throw new Error(`Rule '${rule.id}' missing 'then.actions' array`); + } + if (typeof rule.priority !== "number") { + throw new Error(`Rule '${rule.id}' missing numeric 'priority'`); + } +} diff --git a/experiments/rule-engine-poc/src/types.ts b/experiments/rule-engine-poc/src/types.ts new file mode 100644 index 000000000..ba143acf3 --- /dev/null +++ b/experiments/rule-engine-poc/src/types.ts @@ -0,0 +1,82 @@ +// Pure data contracts for the rule engine POC. +// "LLM extracts, rules decide" — the LLM produces ExtractionFlags; +// this engine deterministically maps flags to a VerdictResult. 
+ +export type FlagValue = boolean | string | number | string[] | null; + +export type ExtractionFlags = Record; + +export type Verdict = + | "blocked" + | "needs-attention" + | "ready-to-progress" + | "unknown"; + +export interface Condition { + flag: string; + eq?: FlagValue; + ne?: FlagValue; + gt?: number; + lt?: number; + in?: FlagValue[]; + regex?: string; + exists?: boolean; +} + +export interface WhenClause { + all?: Condition[]; + any?: Condition[]; + not?: Condition[]; +} + +export interface ThenClause { + verdict: Verdict; + weight: number; + actions: string[]; +} + +export interface Rule { + id: string; + description: string; + stage?: string; + priority: number; + when: WhenClause; + then: ThenClause; + tags?: string[]; +} + +export interface LoadedRule extends Rule { + hash: string; + sourceFile: string; + sourceIndex: number; +} + +export interface ConditionResult { + condition: Condition; + matched: boolean; + observed: FlagValue | undefined; + reason?: string; +} + +export interface RuleEvaluation { + rule: LoadedRule; + matched: boolean; + conditions: ConditionResult[]; + contribution?: { + verdict: Verdict; + weight: number; + actions: string[]; + }; +} + +export type WeightedTally = Record; + +export interface VerdictResult { + verdict: Verdict; + weightedTally: WeightedTally; + actions: string[]; + evaluations: RuleEvaluation[]; + rulesetHash: string; + flagsHash: string; + engineVersion: string; +} diff --git a/experiments/rule-engine-poc/test/engine.test.ts b/experiments/rule-engine-poc/test/engine.test.ts new file mode 100644 index 000000000..bcd837511 --- /dev/null +++ b/experiments/rule-engine-poc/test/engine.test.ts @@ -0,0 +1,243 @@ +import { describe, expect, it } from "vitest"; +import { evaluate, ENGINE_VERSION } from "../src/engine.js"; +import { loadRulesFromString } from "../src/loader.js"; +import type { ExtractionFlags } from "../src/types.js"; + +const sampleRules = ` +- id: a-blocked-when-flag-false + description: blocks when 
foo=false + priority: 50 + when: + all: + - flag: foo + eq: false + then: + verdict: blocked + weight: 50 + actions: [fix-foo] + +- id: b-ready-when-foo-true-and-count-gt-1 + description: ready when foo true and count > 1 + priority: 10 + when: + all: + - flag: foo + eq: true + - flag: count + gt: 1 + then: + verdict: ready-to-progress + weight: 100 + actions: [advance] + +- id: c-attention-when-tag-in-set + description: needs attention if tag in list + priority: 30 + when: + all: + - flag: tag + in: [alpha, beta] + then: + verdict: needs-attention + weight: 20 + actions: [review-tag] +`; + +const rules = loadRulesFromString(sampleRules, "engine.test.ts"); + +describe("engine", () => { + it("attaches engine version to the result", () => { + const flags: ExtractionFlags = { foo: true, count: 2 }; + const result = evaluate(rules, flags); + expect(result.engineVersion).toBe(ENGINE_VERSION); + }); + + it("picks blocked when blocking rule matches, regardless of weight", () => { + const flags: ExtractionFlags = { foo: false, count: 5 }; + const result = evaluate(rules, flags); + expect(result.verdict).toBe("blocked"); + }); + + it("picks ready-to-progress when only positive rule matches", () => { + const flags: ExtractionFlags = { foo: true, count: 5 }; + const result = evaluate(rules, flags); + expect(result.verdict).toBe("ready-to-progress"); + }); + + it("picks needs-attention over ready-to-progress (severity-first)", () => { + const flags: ExtractionFlags = { foo: true, count: 5, tag: "alpha" }; + const result = evaluate(rules, flags); + expect(result.verdict).toBe("needs-attention"); + }); + + it("returns unknown when no rule matches", () => { + const flags: ExtractionFlags = { foo: true, count: 0 }; + const result = evaluate(rules, flags); + expect(result.verdict).toBe("unknown"); + }); + + it("logs every rule in the audit trail in deterministic order", () => { + const flags: ExtractionFlags = { foo: true, count: 5, tag: "alpha" }; + const result = evaluate(rules, 
flags); + const order = result.evaluations.map((e) => e.rule.id); + // priority desc, then id asc: a(50) -> c(30) -> b(10) + expect(order).toEqual([ + "a-blocked-when-flag-false", + "c-attention-when-tag-in-set", + "b-ready-when-foo-true-and-count-gt-1", + ]); + }); + + it("records 'flag missing' reason when extraction lacks a required flag", () => { + const flags: ExtractionFlags = { foo: true }; + const result = evaluate(rules, flags); + const ev = result.evaluations.find( + (e) => e.rule.id === "b-ready-when-foo-true-and-count-gt-1", + ); + const missing = ev?.conditions.find((c) => c.condition.flag === "count"); + expect(missing?.matched).toBe(false); + expect(missing?.reason).toBe("flag missing in extraction"); + }); + + it("sorts suggested actions alphabetically for determinism", () => { + const ruleSet = loadRulesFromString( + ` +- id: rule-z + description: z + priority: 20 + when: + all: + - flag: x + eq: true + then: + verdict: needs-attention + weight: 10 + actions: [zeta, alpha] +- id: rule-a + description: a + priority: 10 + when: + all: + - flag: x + eq: true + then: + verdict: needs-attention + weight: 10 + actions: [mu] +`, + "action-sort-test", + ); + const result = evaluate(ruleSet, { x: true }); + expect(result.actions).toEqual(["alpha", "mu", "zeta"]); + }); +}); + +describe("reproducibility (North Star)", () => { + // Verdict reproducibility rate is the strategist-recommended North Star: + // the same flags + the same rule set must always produce the same verdict + // AND the same audit trail byte-for-byte. 
+ it("produces an identical result on repeated evaluation", () => { + const flags: ExtractionFlags = { foo: true, count: 5, tag: "alpha" }; + const a = evaluate(rules, flags); + const b = evaluate(rules, flags); + expect(JSON.stringify(a)).toBe(JSON.stringify(b)); + }); + + it("rulesetHash changes when a rule changes", () => { + const rulesV1 = loadRulesFromString(sampleRules, "v1"); + const rulesV2 = loadRulesFromString( + sampleRules.replace("weight: 50", "weight: 51"), + "v2", + ); + const flags: ExtractionFlags = { foo: false, count: 5 }; + expect(evaluate(rulesV1, flags).rulesetHash).not.toBe( + evaluate(rulesV2, flags).rulesetHash, + ); + }); + + it("flagsHash changes when any flag changes", () => { + const a = evaluate(rules, { foo: true, count: 5 }); + const b = evaluate(rules, { foo: true, count: 6 }); + expect(a.flagsHash).not.toBe(b.flagsHash); + }); + + it("flagsHash is independent of JSON key ordering", () => { + // Real extraction outputs may have arbitrary key order. + // Canonical JSON should neutralise that. 
+ const a = evaluate(rules, { foo: true, count: 5, tag: "alpha" }); + const b = evaluate(rules, { tag: "alpha", count: 5, foo: true }); + expect(a.flagsHash).toBe(b.flagsHash); + }); +}); + +describe("when-clause operators", () => { + it("matches 'any' when at least one condition matches", () => { + const r = loadRulesFromString( + ` +- id: any-rule + description: any + priority: 10 + when: + any: + - flag: foo + eq: true + - flag: bar + eq: true + then: + verdict: needs-attention + weight: 10 + actions: [a] +`, + "any-test", + ); + expect(evaluate(r, { foo: false, bar: true }).verdict).toBe("needs-attention"); + expect(evaluate(r, { foo: false, bar: false }).verdict).toBe("unknown"); + }); + + it("matches 'not' as inverse of inner condition", () => { + const r = loadRulesFromString( + ` +- id: not-rule + description: not + priority: 10 + when: + all: + - flag: foo + eq: true + not: + - flag: bar + eq: true + then: + verdict: ready-to-progress + weight: 10 + actions: [ok] +`, + "not-test", + ); + expect(evaluate(r, { foo: true, bar: false }).verdict).toBe("ready-to-progress"); + expect(evaluate(r, { foo: true, bar: true }).verdict).toBe("unknown"); + }); + + it("regex operator matches string flags", () => { + const r = loadRulesFromString( + ` +- id: regex-rule + description: regex + priority: 10 + when: + all: + - flag: slug + regex: "^claude/" + then: + verdict: needs-attention + weight: 10 + actions: [check-claude-branch] +`, + "regex-test", + ); + expect(evaluate(r, { slug: "claude/rule-engine-poc" }).verdict).toBe( + "needs-attention", + ); + expect(evaluate(r, { slug: "main" }).verdict).toBe("unknown"); + }); +}); diff --git a/experiments/rule-engine-poc/test/hash.test.ts b/experiments/rule-engine-poc/test/hash.test.ts new file mode 100644 index 000000000..6219654c2 --- /dev/null +++ b/experiments/rule-engine-poc/test/hash.test.ts @@ -0,0 +1,26 @@ +import { describe, expect, it } from "vitest"; +import { canonicalJson, sha256 } from "../src/hash.js"; + 
+describe("hash", () => { + it("canonicalJson sorts object keys", () => { + expect(canonicalJson({ b: 1, a: 2 })).toBe('{"a":2,"b":1}'); + }); + + it("canonicalJson sorts nested object keys", () => { + expect(canonicalJson({ outer: { z: 1, a: 2 } })).toBe('{"outer":{"a":2,"z":1}}'); + }); + + it("canonicalJson preserves array order", () => { + expect(canonicalJson([3, 1, 2])).toBe("[3,1,2]"); + }); + + it("sha256 produces a hex digest", () => { + expect(sha256("hello")).toMatch(/^[0-9a-f]{64}$/); + }); + + it("sha256 of canonicalised equal objects is identical", () => { + expect(sha256(canonicalJson({ a: 1, b: 2 }))).toBe( + sha256(canonicalJson({ b: 2, a: 1 })), + ); + }); +}); diff --git a/experiments/rule-engine-poc/test/loader.test.ts b/experiments/rule-engine-poc/test/loader.test.ts new file mode 100644 index 000000000..5fd984dfe --- /dev/null +++ b/experiments/rule-engine-poc/test/loader.test.ts @@ -0,0 +1,98 @@ +import { describe, expect, it } from "vitest"; +import { loadRulesFromString } from "../src/loader.js"; + +describe("loader", () => { + it("rejects rules with no when clause", () => { + expect(() => + loadRulesFromString( + ` +- id: r1 + priority: 1 + description: x + then: + verdict: blocked + weight: 1 + actions: [] +`, + "no-when", + ), + ).toThrow(/missing 'when'/); + }); + + it("rejects rules with no priority", () => { + expect(() => + loadRulesFromString( + ` +- id: r1 + description: x + when: + all: + - flag: a + eq: true + then: + verdict: blocked + weight: 1 + actions: [] +`, + "no-priority", + ), + ).toThrow(/missing numeric 'priority'/); + }); + + it("rejects rules missing then.weight", () => { + expect(() => + loadRulesFromString( + ` +- id: r1 + priority: 1 + description: x + when: + all: + - flag: a + eq: true + then: + verdict: blocked + actions: [] +`, + "no-weight", + ), + ).toThrow(/missing numeric 'then.weight'/); + }); + + it("assigns a stable content hash to each rule", () => { + const ruleA = loadRulesFromString( + ` +- id: r1 + 
description: x + priority: 1 + when: + all: + - flag: a + eq: true + then: + verdict: blocked + weight: 1 + actions: [] +`, + "hash-a", + ); + const ruleB = loadRulesFromString( + ` +- id: r1 + description: y + priority: 1 + when: + all: + - flag: a + eq: true + then: + verdict: blocked + weight: 1 + actions: [] +`, + "hash-b", + ); + // Description is informational, not in the hash payload. + expect(ruleA[0]!.hash).toBe(ruleB[0]!.hash); + }); +}); diff --git a/experiments/rule-engine-poc/tsconfig.json b/experiments/rule-engine-poc/tsconfig.json new file mode 100644 index 000000000..ad402dcd3 --- /dev/null +++ b/experiments/rule-engine-poc/tsconfig.json @@ -0,0 +1,19 @@ +{ + "compilerOptions": { + "target": "ES2022", + "module": "ESNext", + "moduleResolution": "Bundler", + "strict": true, + "noUncheckedIndexedAccess": true, + "esModuleInterop": true, + "skipLibCheck": true, + "resolveJsonModule": true, + "outDir": "dist", + "declaration": true, + "sourceMap": true, + "rootDir": ".", + "types": ["node"] + }, + "include": ["src/**/*", "test/**/*"], + "exclude": ["dist", "node_modules"] +} diff --git a/experiments/rule-engine-poc/vitest.config.ts b/experiments/rule-engine-poc/vitest.config.ts new file mode 100644 index 000000000..d9ebefd5c --- /dev/null +++ b/experiments/rule-engine-poc/vitest.config.ts @@ -0,0 +1,8 @@ +import { defineConfig } from "vitest/config"; + +export default defineConfig({ + test: { + include: ["test/**/*.test.ts"], + environment: "node", + }, +}); From 6625ee1108bdbe3c0eda6ff2204fbdac0f413b1a Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 17 May 2026 11:35:44 +0000 Subject: [PATCH 02/45] fix(rule-engine-poc): tighten loader validation (Codex P1 + P2) - Validate that then.verdict is one of the four known tiers; previously a typo like 'blokced' would load successfully and silently degrade the rule into a no-op via tally[] (#525 P1). 
- Validate that when.all / when.any / when.not are arrays at load time; previously 'any: true' would load and crash at evaluation with TypeError on .map (#525 P2). - Export VERDICTS as a runtime constant from types.ts so the schema check has one source of truth alongside the type. - Three new loader tests cover the two failure modes plus a typo case. --- experiments/rule-engine-poc/src/loader.ts | 15 +++++ experiments/rule-engine-poc/src/types.ts | 13 ++-- .../rule-engine-poc/test/loader.test.ts | 61 +++++++++++++++++++ 3 files changed, 84 insertions(+), 5 deletions(-) diff --git a/experiments/rule-engine-poc/src/loader.ts b/experiments/rule-engine-poc/src/loader.ts index 43ee94b8b..b5452921d 100644 --- a/experiments/rule-engine-poc/src/loader.ts +++ b/experiments/rule-engine-poc/src/loader.ts @@ -2,6 +2,7 @@ import { readFileSync } from "node:fs"; import { resolve } from "node:path"; import yaml from "js-yaml"; import { canonicalJson, sha256 } from "./hash.js"; +import { VERDICTS } from "./types.js"; import type { LoadedRule, Rule } from "./types.js"; export function loadRulesFromFile(filePath: string): LoadedRule[] { @@ -62,12 +63,26 @@ function validate( `Rule '${rule.id}' must have at least one of 'when.all', 'when.any', 'when.not'`, ); } + for (const group of ["all", "any", "not"] as const) { + const value = rule.when[group]; + if (value !== undefined && !Array.isArray(value)) { + throw new Error( + `Rule '${rule.id}' has non-array 'when.${group}' (got ${typeof value})`, + ); + } + } if (!rule.then || typeof rule.then !== "object") { throw new Error(`Rule '${rule.id}' missing 'then' clause`); } if (!rule.then.verdict) { throw new Error(`Rule '${rule.id}' missing 'then.verdict'`); } + if (!(VERDICTS as readonly string[]).includes(rule.then.verdict)) { + throw new Error( + `Rule '${rule.id}' has unknown 'then.verdict' '${rule.then.verdict}'. 
` + + `Expected one of: ${VERDICTS.join(", ")}.`, + ); + } if (typeof rule.then.weight !== "number") { throw new Error(`Rule '${rule.id}' missing numeric 'then.weight'`); } diff --git a/experiments/rule-engine-poc/src/types.ts b/experiments/rule-engine-poc/src/types.ts index ba143acf3..bdbb34ab2 100644 --- a/experiments/rule-engine-poc/src/types.ts +++ b/experiments/rule-engine-poc/src/types.ts @@ -6,11 +6,14 @@ export type FlagValue = boolean | string | number | string[] | null; export type ExtractionFlags = Record; -export type Verdict = - | "blocked" - | "needs-attention" - | "ready-to-progress" - | "unknown"; +export const VERDICTS = [ + "blocked", + "needs-attention", + "ready-to-progress", + "unknown", +] as const; + +export type Verdict = (typeof VERDICTS)[number]; export interface Condition { flag: string; diff --git a/experiments/rule-engine-poc/test/loader.test.ts b/experiments/rule-engine-poc/test/loader.test.ts index 5fd984dfe..b18600e6f 100644 --- a/experiments/rule-engine-poc/test/loader.test.ts +++ b/experiments/rule-engine-poc/test/loader.test.ts @@ -59,6 +59,67 @@ describe("loader", () => { ).toThrow(/missing numeric 'then.weight'/); }); + it("rejects rules with a misspelled / unknown verdict", () => { + expect(() => + loadRulesFromString( + ` +- id: r1 + priority: 1 + description: x + when: + all: + - flag: a + eq: true + then: + verdict: blokced + weight: 1 + actions: [] +`, + "bad-verdict", + ), + ).toThrow(/unknown 'then\.verdict' 'blokced'/); + }); + + it("rejects rules with a non-array when.any", () => { + expect(() => + loadRulesFromString( + ` +- id: r1 + priority: 1 + description: x + when: + any: true + then: + verdict: blocked + weight: 1 + actions: [] +`, + "non-array-any", + ), + ).toThrow(/non-array 'when\.any'/); + }); + + it("rejects rules with a non-array when.all", () => { + expect(() => + loadRulesFromString( + ` +- id: r1 + priority: 1 + description: x + when: + all: + flag: a + eq: true + then: + verdict: blocked + weight: 1 + 
actions: [] +`, + "non-array-all", + ), + ).toThrow(/non-array 'when\.all'/); + }); + it("assigns a stable content hash to each rule", () => { const ruleA = loadRulesFromString( ` From 00a43e7630d2c3d7f1ecbfea72b64e13720410e7 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 17 May 2026 11:41:14 +0000 Subject: [PATCH 03/45] fix(rule-engine-poc): exists-AND and empty-group hardening (Codex P1+P2) - engine: 'exists' now participates in the AND-chain instead of short-circuiting, so 'exists: true' combined with 'eq'/'ne'/'gt'/'lt'/ 'in'/'regex' correctly requires every operator to match (#525 P2). 'exists: false' still tolerates flag absence without surfacing the 'flag missing' reason; this is the only short-circuit retained. - loader: empty 'when.all' / 'when.any' / 'when.not' arrays are now rejected at load time. Previously 'any: []' was vacuously satisfied by the length>0 guard in evaluateWhen, allowing a typo to fire a blocking rule unintentionally (#525 P1). - ENGINE_VERSION bumped 0.1.0 -> 0.2.0 because the exists+value-op interaction is a semantic change. Per docs/extending.md, a version bump is the auditor's signal that prior verdicts may not replay. - Six new tests: empty when.any / when.all rejection, four exists-AND cases including exists:false standalone. 
--- experiments/rule-engine-poc/src/engine.ts | 28 ++++++++-- experiments/rule-engine-poc/src/loader.ts | 11 +++- .../rule-engine-poc/test/engine.test.ts | 54 +++++++++++++++++++ .../rule-engine-poc/test/loader.test.ts | 38 +++++++++++++ 4 files changed, 125 insertions(+), 6 deletions(-) diff --git a/experiments/rule-engine-poc/src/engine.ts b/experiments/rule-engine-poc/src/engine.ts index 8734f6beb..0cd123eb2 100644 --- a/experiments/rule-engine-poc/src/engine.ts +++ b/experiments/rule-engine-poc/src/engine.ts @@ -12,7 +12,9 @@ import type { WhenClause, } from "./types.js"; -export const ENGINE_VERSION = "0.1.0"; +// Bump on any change to evaluation semantics (operator interaction, +// conflict resolution, audit-trail shape) — see docs/extending.md. +export const ENGINE_VERSION = "0.2.0"; // Severity ordering: a higher-severity verdict wins over a lower-severity one, // regardless of weight. Weight only breaks ties within a tier and informs @@ -41,12 +43,29 @@ function evaluateCondition( const present = Object.prototype.hasOwnProperty.call(flags, c.flag); const observed: FlagValue | undefined = present ? flags[c.flag] : undefined; + const hasValueOp = + c.eq !== undefined || + c.ne !== undefined || + c.gt !== undefined || + c.lt !== undefined || + c.in !== undefined || + c.regex !== undefined; + + let matched = true; + + // `exists` participates in the AND-chain rather than short-circuiting, + // so `exists: true` combined with e.g. `eq: 5` correctly requires both. if (c.exists !== undefined) { - const matched = present === c.exists; - return { condition: c, matched, observed }; + matched = matched && present === c.exists; + // `exists: false` means "the user asserts this flag is absent" — + // value-ops cannot apply to an absent flag, so skip the missing-flag + // reason and value-op evaluation in that case. 
+ if (c.exists === false) { + return { condition: c, matched, observed }; + } } - if (!present) { + if (hasValueOp && !present) { return { condition: c, matched: false, @@ -55,7 +74,6 @@ function evaluateCondition( }; } - let matched = true; if (c.eq !== undefined) matched = matched && deepEqual(observed, c.eq); if (c.ne !== undefined) matched = matched && !deepEqual(observed, c.ne); if (c.gt !== undefined) { diff --git a/experiments/rule-engine-poc/src/loader.ts b/experiments/rule-engine-poc/src/loader.ts index b5452921d..d02639196 100644 --- a/experiments/rule-engine-poc/src/loader.ts +++ b/experiments/rule-engine-poc/src/loader.ts @@ -65,11 +65,20 @@ function validate( } for (const group of ["all", "any", "not"] as const) { const value = rule.when[group]; - if (value !== undefined && !Array.isArray(value)) { + if (value === undefined) continue; + if (!Array.isArray(value)) { throw new Error( `Rule '${rule.id}' has non-array 'when.${group}' (got ${typeof value})`, ); } + if (value.length === 0) { + // Empty `all` is vacuously true, empty `any` is vacuously false, + // empty `not` is vacuously true — all surprising. Force the + // author to express intent explicitly by omitting the group. + throw new Error( + `Rule '${rule.id}' has empty 'when.${group}' — omit the group instead`, + ); + } } if (!rule.then || typeof rule.then !== "object") { throw new Error(`Rule '${rule.id}' missing 'then' clause`); diff --git a/experiments/rule-engine-poc/test/engine.test.ts b/experiments/rule-engine-poc/test/engine.test.ts index bcd837511..19f296261 100644 --- a/experiments/rule-engine-poc/test/engine.test.ts +++ b/experiments/rule-engine-poc/test/engine.test.ts @@ -170,6 +170,60 @@ describe("reproducibility (North Star)", () => { }); }); +describe("exists operator interaction", () => { + // `exists` participates in the AND-chain rather than short-circuiting. 
+ const r = loadRulesFromString( + ` +- id: presence-and-value + description: flag must exist AND equal 5 + priority: 10 + when: + all: + - flag: count + exists: true + eq: 5 + then: + verdict: needs-attention + weight: 10 + actions: [check] +`, + "exists-and-eq", + ); + + it("requires both exists:true AND the value operator to match", () => { + expect(evaluate(r, { count: 5 }).verdict).toBe("needs-attention"); + }); + + it("rejects when flag exists but value does not match", () => { + expect(evaluate(r, { count: 6 }).verdict).toBe("unknown"); + }); + + it("rejects when flag does not exist", () => { + expect(evaluate(r, {}).verdict).toBe("unknown"); + }); + + it("exists:false with no value op matches absent flag", () => { + const r2 = loadRulesFromString( + ` +- id: absent-only + description: flag must be absent + priority: 10 + when: + all: + - flag: ghost + exists: false + then: + verdict: needs-attention + weight: 10 + actions: [a] +`, + "exists-false", + ); + expect(evaluate(r2, {}).verdict).toBe("needs-attention"); + expect(evaluate(r2, { ghost: "boo" }).verdict).toBe("unknown"); + }); +}); + describe("when-clause operators", () => { it("matches 'any' when at least one condition matches", () => { const r = loadRulesFromString( diff --git a/experiments/rule-engine-poc/test/loader.test.ts b/experiments/rule-engine-poc/test/loader.test.ts index b18600e6f..cb7061a6d 100644 --- a/experiments/rule-engine-poc/test/loader.test.ts +++ b/experiments/rule-engine-poc/test/loader.test.ts @@ -99,6 +99,44 @@ describe("loader", () => { ).toThrow(/non-array 'when\.any'/); }); + it("rejects rules with an empty when.any", () => { + expect(() => + loadRulesFromString( + ` +- id: r1 + priority: 1 + description: x + when: + any: [] + then: + verdict: blocked + weight: 1 + actions: [] +`, + "empty-any", + ), + ).toThrow(/empty 'when\.any'/); + }); + + it("rejects rules with an empty when.all", () => { + expect(() => + loadRulesFromString( + ` +- id: r1 + priority: 1 + description: x 
+ when: + all: [] + then: + verdict: blocked + weight: 1 + actions: [] +`, + "empty-all", + ), + ).toThrow(/empty 'when\.all'/); + }); + it("rejects rules with a non-array when.all", () => { expect(() => loadRulesFromString( From 179b8834184a56c6879b877ee0e497533d681c6a Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 17 May 2026 11:47:29 +0000 Subject: [PATCH 04/45] fix(rule-engine-poc): condition-level validation (Codex P1+P2 round 3) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - loader: each condition must declare at least one supported operator. Previously a typo like { flag: 'x', eqq: true } would load and then always-match at runtime, silently flipping verdicts (#525 round 3 P2). - loader: 'exists: false' combined with value operators is rejected at load time. The combination has no meaningful semantics — an absent flag has no value for eq/ne/gt/lt/in/regex to apply to (#525 round 3 P1). - loader: condition objects without a 'flag' string are rejected. - engine: code comment clarified to reflect the now-enforced invariant that exists:false is only valid alone. - Three new loader tests cover the three rejection paths. --- experiments/rule-engine-poc/src/engine.ts | 5 +- experiments/rule-engine-poc/src/loader.ts | 46 ++++++++++++++ .../rule-engine-poc/test/loader.test.ts | 63 +++++++++++++++++++ 3 files changed, 111 insertions(+), 3 deletions(-) diff --git a/experiments/rule-engine-poc/src/engine.ts b/experiments/rule-engine-poc/src/engine.ts index 0cd123eb2..56540e402 100644 --- a/experiments/rule-engine-poc/src/engine.ts +++ b/experiments/rule-engine-poc/src/engine.ts @@ -55,11 +55,10 @@ function evaluateCondition( // `exists` participates in the AND-chain rather than short-circuiting, // so `exists: true` combined with e.g. `eq: 5` correctly requires both. + // `exists: false` is only ever valid alone — the loader rejects it + // combined with value operators (which would have nothing to apply to). 
if (c.exists !== undefined) { matched = matched && present === c.exists; - // `exists: false` means "the user asserts this flag is absent" — - // value-ops cannot apply to an absent flag, so skip the missing-flag - // reason and value-op evaluation in that case. if (c.exists === false) { return { condition: c, matched, observed }; } diff --git a/experiments/rule-engine-poc/src/loader.ts b/experiments/rule-engine-poc/src/loader.ts index d02639196..8e19db066 100644 --- a/experiments/rule-engine-poc/src/loader.ts +++ b/experiments/rule-engine-poc/src/loader.ts @@ -79,6 +79,7 @@ function validate( `Rule '${rule.id}' has empty 'when.${group}' — omit the group instead`, ); } + validateConditions(value, group, rule.id); } if (!rule.then || typeof rule.then !== "object") { throw new Error(`Rule '${rule.id}' missing 'then' clause`); @@ -102,3 +103,48 @@ function validate( throw new Error(`Rule '${rule.id}' missing numeric 'priority'`); } } + +const CONDITION_OPS = [ + "eq", + "ne", + "gt", + "lt", + "in", + "regex", + "exists", +] as const; + +function validateConditions( + conditions: unknown[], + group: string, + ruleId: string, +): void { + for (let i = 0; i < conditions.length; i++) { + const where = `condition #${i} in 'when.${group}'`; + const c = conditions[i]; + if (!c || typeof c !== "object") { + throw new Error(`Rule '${ruleId}' ${where} is not an object`); + } + const cond = c as Record<string, unknown>; + if (typeof cond.flag !== "string" || cond.flag.length === 0) { + throw new Error(`Rule '${ruleId}' ${where} missing 'flag' (string)`); + } + const opsPresent = CONDITION_OPS.filter((op) => cond[op] !== undefined); + if (opsPresent.length === 0) { + throw new Error( + `Rule '${ruleId}' ${where} has no operator. ` + + `Expected at least one of: ${CONDITION_OPS.join(", ")}.`, + ); + } + // `exists: false` asserts the flag is absent, so value operators on + // the same condition would have no observation to apply to. 
+ // Reject the combination at load time rather than allow ambiguity. + if (cond.exists === false && opsPresent.length > 1) { + throw new Error( + `Rule '${ruleId}' ${where} combines 'exists: false' with ` + + `value operators (${opsPresent.filter((op) => op !== "exists").join(", ")}) ` + + `— these are mutually exclusive`, + ); + } + } +} diff --git a/experiments/rule-engine-poc/test/loader.test.ts b/experiments/rule-engine-poc/test/loader.test.ts index cb7061a6d..101d0b05d 100644 --- a/experiments/rule-engine-poc/test/loader.test.ts +++ b/experiments/rule-engine-poc/test/loader.test.ts @@ -158,6 +158,69 @@ describe("loader", () => { ).toThrow(/non-array 'when\.all'/); }); + it("rejects conditions with no operator", () => { + expect(() => + loadRulesFromString( + ` +- id: r1 + priority: 1 + description: x + when: + all: + - flag: ci_passing + eqq: true + then: + verdict: blocked + weight: 1 + actions: [] +`, + "no-op", + ), + ).toThrow(/has no operator/); + }); + + it("rejects conditions missing the flag field", () => { + expect(() => + loadRulesFromString( + ` +- id: r1 + priority: 1 + description: x + when: + all: + - eq: true + then: + verdict: blocked + weight: 1 + actions: [] +`, + "no-flag", + ), + ).toThrow(/missing 'flag'/); + }); + + it("rejects exists:false combined with value operators", () => { + expect(() => + loadRulesFromString( + ` +- id: r1 + priority: 1 + description: x + when: + all: + - flag: ghost + exists: false + eq: 5 + then: + verdict: blocked + weight: 1 + actions: [] +`, + "exists-false-and-eq", + ), + ).toThrow(/combines 'exists: false' with value operators/); + }); + it("assigns a stable content hash to each rule", () => { const ruleA = loadRulesFromString( ` From 45577c4b86fe1598182904818254e2d3606471d1 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 17 May 2026 11:56:58 +0000 Subject: [PATCH 05/45] fix(rule-engine-poc): loader hardening (Codex round 4) - Reject 'then.weight' values that are <=0, infinite, or NaN. 
A 'blocked' rule with weight 0 would contribute nothing to the tally and silently bypass the gate (#525 round 4 P1). - Reject non-array 'in' operators at load time. Previously a typo like 'in: foo' would load and crash at evaluation when .some() is called on a non-array (#525 round 4 P2). - Validate regex patterns at load time. Previously a malformed regex like 'regex: "["' would load and abort the entire decision run with a SyntaxError when new RegExp() throws during evaluation (#525 round 4 P2). - Updated existing 'missing then.weight' test to match the new 'invalid then.weight' error message. - Four new loader tests cover the three rejection paths plus a negative-weight case. --- experiments/rule-engine-poc/src/loader.ts | 31 ++++++- .../rule-engine-poc/test/loader.test.ts | 86 ++++++++++++++++++- 2 files changed, 114 insertions(+), 3 deletions(-) diff --git a/experiments/rule-engine-poc/src/loader.ts b/experiments/rule-engine-poc/src/loader.ts index 8e19db066..0c0754614 100644 --- a/experiments/rule-engine-poc/src/loader.ts +++ b/experiments/rule-engine-poc/src/loader.ts @@ -93,8 +93,16 @@ function validate( `Expected one of: ${VERDICTS.join(", ")}.`, ); } - if (typeof rule.then.weight !== "number") { - throw new Error(`Rule '${rule.id}' missing numeric 'then.weight'`); + if ( + typeof rule.then.weight !== "number" || + !Number.isFinite(rule.then.weight) || + rule.then.weight <= 0 + ) { + throw new Error( + `Rule '${rule.id}' has invalid 'then.weight' '${String(rule.then.weight)}' — ` + + `must be a positive finite number (a 'blocked' rule with weight 0 ` + + `would contribute no tally and silently bypass the gate)`, + ); } if (!Array.isArray(rule.then.actions)) { throw new Error(`Rule '${rule.id}' missing 'then.actions' array`); @@ -146,5 +154,24 @@ function validateConditions( `— these are mutually exclusive`, ); } + if ("in" in cond && !Array.isArray(cond.in)) { + throw new Error( + `Rule '${ruleId}' ${where} has non-array 'in' (got ${typeof cond.in})`, + ); 
+ } + if (typeof cond.regex === "string") { + try { + new RegExp(cond.regex); + } catch (err) { + const msg = err instanceof Error ? err.message : String(err); + throw new Error( + `Rule '${ruleId}' ${where} has invalid regex '${cond.regex}': ${msg}`, + ); + } + } else if ("regex" in cond) { + throw new Error( + `Rule '${ruleId}' ${where} has non-string 'regex' (got ${typeof cond.regex})`, + ); + } } } diff --git a/experiments/rule-engine-poc/test/loader.test.ts b/experiments/rule-engine-poc/test/loader.test.ts index 101d0b05d..2b94b1a34 100644 --- a/experiments/rule-engine-poc/test/loader.test.ts +++ b/experiments/rule-engine-poc/test/loader.test.ts @@ -56,7 +56,7 @@ describe("loader", () => { `, "no-weight", ), - ).toThrow(/missing numeric 'then.weight'/); + ).toThrow(/invalid 'then\.weight'/); }); it("rejects rules with a misspelled / unknown verdict", () => { @@ -221,6 +221,90 @@ describe("loader", () => { ).toThrow(/combines 'exists: false' with value operators/); }); + it("rejects rules with weight: 0", () => { + expect(() => + loadRulesFromString( + ` +- id: r1 + priority: 1 + description: x + when: + all: + - flag: a + eq: true + then: + verdict: blocked + weight: 0 + actions: [] +`, + "weight-zero", + ), + ).toThrow(/invalid 'then\.weight' '0'/); + }); + + it("rejects rules with negative weight", () => { + expect(() => + loadRulesFromString( + ` +- id: r1 + priority: 1 + description: x + when: + all: + - flag: a + eq: true + then: + verdict: blocked + weight: -5 + actions: [] +`, + "weight-negative", + ), + ).toThrow(/invalid 'then\.weight'/); + }); + + it("rejects conditions with non-array 'in'", () => { + expect(() => + loadRulesFromString( + ` +- id: r1 + priority: 1 + description: x + when: + all: + - flag: a + in: foo + then: + verdict: blocked + weight: 1 + actions: [] +`, + "in-non-array", + ), + ).toThrow(/non-array 'in'/); + }); + + it("rejects conditions with invalid regex at load time", () => { + expect(() => + loadRulesFromString( + ` +- id: r1 
+ priority: 1 + description: x + when: + all: + - flag: branch + regex: "[" + then: + verdict: blocked + weight: 1 + actions: [] +`, + "bad-regex", + ), + ).toThrow(/invalid regex/); + }); + it("assigns a stable content hash to each rule", () => { const ruleA = loadRulesFromString( ` From 271702eb54be0c798cbfe314d09495e6223dd999 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 17 May 2026 11:57:12 +0000 Subject: [PATCH 06/45] feat(rule-engine-poc): plan/report workflow with config-driven targets End-to-end flow now drives the POC: user adds content to the project, runs npm run plan to generate AI extraction prompts, pastes a prompt into Claude/ChatGPT, saves the JSON to extractions/, runs npm run report to render HTML and open it in the browser. Architecture: - rule-engine.config.json declares targets, each with id + label + paths (files or directories, walked deterministically). - rules/flag-schema.yaml documents every flag the rule set may reference (type + description + example); the contract between AI extractor and engine. - src/plan.ts walks target paths, collects file contents with 8 KB truncation per file, bundles role + schema + rules + source into a single prompt per target. - src/report.ts loads extractions per target, runs the engine, renders the existing HTML reporter, best-effort opens the first report in the OS default browser. Exit 0/1/2 = ok/blocked/missing. - Prompt-builder follows analyst research (research/10): XML-tag structure with markdown redundancy, explicit forbidden-fields list (verdict, assessment, conclusion, summary, recommendation, rationale, analysis), open tag as a forcing function. - Original single-shot src/cli.ts preserved as a fixture-testing escape hatch. 20 new tests cover the new modules (config, flag-schema, context, prompt-builder). Suite total: 60 tests, all passing. 
Research wave 2 (5 background agents) wrote 5 new artifacts under research/ covering independent review, workflow failure modes, workflow architecture, UX friction, and extraction prompt patterns. POC stays sandbox-scoped under experiments/rule-engine-poc/. No wiring into specs/, /spec:status, plugins/, or the main repo. --- experiments/rule-engine-poc/.gitignore | 2 + experiments/rule-engine-poc/README.md | 157 ++++--- experiments/rule-engine-poc/docs/workflow.md | 142 ++++++ experiments/rule-engine-poc/package.json | 2 + .../research/06-independent-review.md | 71 +++ .../research/07-workflow-risks.md | 160 +++++++ .../research/08-workflow-architecture.md | 423 ++++++++++++++++++ .../rule-engine-poc/research/09-user-flow.md | 84 ++++ .../research/10-extraction-prompt-patterns.md | 184 ++++++++ .../rule-engine-poc/rule-engine.config.json | 24 + .../rule-engine-poc/rules/flag-schema.yaml | 171 +++++++ experiments/rule-engine-poc/src/config.ts | 123 +++++ experiments/rule-engine-poc/src/context.ts | 95 ++++ .../rule-engine-poc/src/flag-schema.ts | 80 ++++ .../rule-engine-poc/src/open-browser.ts | 29 ++ experiments/rule-engine-poc/src/plan.ts | 81 ++++ .../rule-engine-poc/src/prompt-builder.ts | 113 +++++ experiments/rule-engine-poc/src/report.ts | 117 +++++ .../rule-engine-poc/test/config.test.ts | 72 +++ .../rule-engine-poc/test/context.test.ts | 48 ++ .../rule-engine-poc/test/flag-schema.test.ts | 70 +++ .../test/prompt-builder.test.ts | 80 ++++ 22 files changed, 2269 insertions(+), 59 deletions(-) create mode 100644 experiments/rule-engine-poc/docs/workflow.md create mode 100644 experiments/rule-engine-poc/research/06-independent-review.md create mode 100644 experiments/rule-engine-poc/research/07-workflow-risks.md create mode 100644 experiments/rule-engine-poc/research/08-workflow-architecture.md create mode 100644 experiments/rule-engine-poc/research/09-user-flow.md create mode 100644 experiments/rule-engine-poc/research/10-extraction-prompt-patterns.md create 
mode 100644 experiments/rule-engine-poc/rule-engine.config.json create mode 100644 experiments/rule-engine-poc/rules/flag-schema.yaml create mode 100644 experiments/rule-engine-poc/src/config.ts create mode 100644 experiments/rule-engine-poc/src/context.ts create mode 100644 experiments/rule-engine-poc/src/flag-schema.ts create mode 100644 experiments/rule-engine-poc/src/open-browser.ts create mode 100644 experiments/rule-engine-poc/src/plan.ts create mode 100644 experiments/rule-engine-poc/src/prompt-builder.ts create mode 100644 experiments/rule-engine-poc/src/report.ts create mode 100644 experiments/rule-engine-poc/test/config.test.ts create mode 100644 experiments/rule-engine-poc/test/context.test.ts create mode 100644 experiments/rule-engine-poc/test/flag-schema.test.ts create mode 100644 experiments/rule-engine-poc/test/prompt-builder.test.ts diff --git a/experiments/rule-engine-poc/.gitignore b/experiments/rule-engine-poc/.gitignore index edf96cefc..3039bcac8 100644 --- a/experiments/rule-engine-poc/.gitignore +++ b/experiments/rule-engine-poc/.gitignore @@ -1,5 +1,7 @@ node_modules/ dist/ +prompts/ +extractions/ reports/ *.log .DS_Store diff --git a/experiments/rule-engine-poc/README.md b/experiments/rule-engine-poc/README.md index 642042886..331aae579 100644 --- a/experiments/rule-engine-poc/README.md +++ b/experiments/rule-engine-poc/README.md @@ -1,115 +1,154 @@ --- title: Rule Engine POC folder: experiments/rule-engine-poc -description: Terminal-only TypeScript proof-of-concept of a deterministic rule engine that sits on top of LLM-extracted structured flags. Demonstrates the "LLM extracts, rules decide" pattern applied to the repo's own quality framework. +description: Terminal-only TypeScript proof-of-concept of a deterministic rule engine that sits on top of LLM-extracted structured flags. Two commands — plan + report — frame the end-to-end loop from feature folder to HTML verdict. 
entry_point: true --- # Rule Engine POC -Terminal-only TypeScript proof-of-concept of a deterministic rule engine that sits on top of LLM-extracted structured flags. The engine consumes flags and emits a verdict (`blocked` / `needs-attention` / `ready-to-progress`) with a fully replayable audit trail. +Terminal-only TypeScript proof-of-concept of the **"LLM extracts, rules decide"** pattern. The engine consumes structured flags and emits a verdict (`blocked` / `needs-attention` / `ready-to-progress` / `unknown`) with a fully replayable audit trail; an HTML report renders the verdict for humans. -## Why +## The end-to-end loop + +``` +1. Add content to the project (e.g., a feature folder under specs/) +2. npm run plan -> writes one extraction prompt per target +3. Paste prompt into Claude/ChatGPT -> AI emits structured JSON flags only +4. Save JSON to extractions/<target-id>.json +5. npm run report -> engine evaluates, HTML opens in browser +``` + +Configured by a single `rule-engine.config.json` that names targets and the paths to analyse for each. See [`docs/workflow.md`](docs/workflow.md) for the full sequence. + +## Why this pattern The concept comes from a [Reddit thread on AI fact-checking](https://www.reddit.com/r/artificial/) where the author argues the LLM should **never produce verdicts**, only **structured extractions** that a deterministic layer then scores: > "The LLM in our pipeline never produces a numeric score, never produces a true/false verdict... The LLM extracts structured factual flags from source material. A deterministic Python scoring layer turns those flags into a verdict tier." 
-That maps perfectly onto our [OODA orchestrator concept](../../docs/backlog/502-idea-ooda-loop-plugin-observe-orient-decide-act.md): +That maps cleanly onto our [OODA orchestrator concept](../../docs/backlog/502-idea-ooda-loop-plugin-observe-orient-decide-act.md): | OODA quadrant | Role | Determinism | |---|---|---| -| Observe | Raw signal collection (git, CI, files) | Deterministic | +| Observe | Raw signal collection (file walk) | Deterministic | | Orient | **LLM extracts structured flags** | Stochastic (constrained to extraction only) | | Decide | **Rule engine emits verdict + suggested actions** | **Deterministic — this POC** | | Act | Execute approved actions | Deterministic | -## Domain: the repo's own quality framework +## Domain example: the repo's own quality framework + +The shipped rule set ([`rules/quality-gates.yaml`](rules/quality-gates.yaml)) encodes the per-stage Definition of Done from [`docs/quality-framework.md`](../../docs/quality-framework.md). Every DoD bullet becomes one rule. Flags are documented in [`rules/flag-schema.yaml`](rules/flag-schema.yaml) — the contract between the AI extractor and the engine. -The example rule set encodes the Definition of Done from [`docs/quality-framework.md`](../../docs/quality-framework.md) as machine-checkable rules — one rule per DoD bullet, per stage. A feature folder's verdict becomes a function of named flags and named rules; nothing about the answer depends on which way the wind was blowing when the LLM ran. +A feature folder's verdict becomes a function of named flags and named rules; nothing about the answer depends on which way the wind was blowing when the LLM ran. 
-## Run it +## Get started ```bash cd experiments/rule-engine-poc npm install -npm run demo # ready-implementation fixture -npm run demo:blocked-ears # requirements stage with EARS coverage 0.6 -npm run demo:blocked-s1 # any-S1-finding cross-cutting block -npm run demo:needs-attention # design stage with S2 findings -npm run demo:ready-idea # idea stage, DoD satisfied -npm run demo:all # walk every fixture in turn -npm test # unit + reproducibility tests +npm test # 60 tests in <1s ``` -Three output modes: +### Run the workflow against a real feature + +The shipped config points at two real feature folders (`specs/astro-product-page/` and `specs/shape-b-branching-adoption/`). Walk the loop: ```bash -# default: human-readable text report -npx tsx src/cli.ts rules/quality-gates.yaml fixtures/ready-implementation.json +# 1. Generate one extraction prompt per target +npm run plan -# machine-readable JSON (the full VerdictResult) -npx tsx src/cli.ts rules/quality-gates.yaml fixtures/ready-implementation.json --json +# 2. Open prompts/astro-product-page.md, copy contents, paste into your AI tool. +# Save the JSON response (the object inside ...) to: +# extractions/astro-product-page.json -# self-contained HTML report (no external assets) -npx tsx src/cli.ts rules/quality-gates.yaml fixtures/ready-implementation.json --html reports/out.html +# 3. Render reports and open the first in your browser +npm run report +``` + +Useful flags: -# generate one HTML report per fixture into reports/ -npm run demo:html:all +```bash +npm run plan -- --target astro-product-page +npm run report -- --no-open +npm run report -- --target shape-b-branching-adoption ``` -The CLI exits **1** on `blocked`, **0** otherwise — usable as a CI gate. +Exit codes from `report`: `0` = no blockers, `1` = at least one `blocked`, `2` = missing or malformed extraction. 
+ +### Run a single-shot evaluation (legacy / fixtures) + +The original single-shot CLI is preserved for testing and fixture-based demos: + +```bash +npm run demo # ready-implementation fixture +npm run demo:blocked-ears # requirements stage with EARS coverage 0.6 +npm run demo:blocked-s1 # any-S1-finding cross-cutting block +npm run demo:needs-attention # design stage with S2 findings +npm run demo:ready-idea # idea stage, DoD satisfied +npm run demo:html:all # one HTML report per fixture into reports/ +``` ## File map | Path | Role | |---|---| -| `src/types.ts` | Data contracts (`ExtractionFlags`, `Rule`, `VerdictResult`, ...) | +| `rule-engine.config.json` | Targets + paths + directory layout | +| `rules/quality-gates.yaml` | Definition-of-Done-as-rules | +| `rules/flag-schema.yaml` | Documentation of every flag (type, description, example) | +| `src/types.ts` | Data contracts | | `src/hash.ts` | Canonical JSON + SHA-256 — the foundation of replayability | | `src/engine.ts` | Deterministic evaluation: severity-first verdict, weighted tally, audit trail | -| `src/loader.ts` | YAML rule file loader + schema validation + per-rule content hash | -| `src/cli.ts` | Terminal renderer + JSON / HTML modes + verdict-as-exit-code | +| `src/loader.ts` | YAML rule loader + schema validation + per-rule content hash | +| `src/config.ts` | Config loader + validator | +| `src/flag-schema.ts` | Flag schema loader + coverage diff | +| `src/context.ts` | Walks target paths, collects file contents with truncation | +| `src/prompt-builder.ts` | Builds the AI extraction prompt — role, schema, source, forcing function | +| `src/plan.ts` | `npm run plan` — generates prompts | +| `src/report.ts` | `npm run report` — runs engine, renders HTML, opens browser | +| `src/cli.ts` | Single-shot escape hatch for fixture-based testing | | `src/html-report.ts` | Self-contained HTML renderer (inline CSS, no JS) | -| `rules/quality-gates.yaml` | Example rules — Definition of Done per stage | -| 
`fixtures/*.json` | Mock Orient-quadrant outputs (the LLM's structured extraction) | -| `test/*.test.ts` | Unit, reproducibility, and operator coverage | -| `docs/*.md` | Architecture, DSL reference, audit trail, extension guide, OODA integration | -| `research/*.md` | Research wave that informed the design (5 angles) | +| `src/open-browser.ts` | Best-effort `xdg-open` / `open` / `start` | +| `fixtures/*.json` | Mock extractions for the single-shot demos | +| `test/*.test.ts` | 60 tests across engine, loader, hash, config, schema, context, prompt-builder | +| `docs/*.md` | Architecture, DSL reference, audit trail, workflow, extension guide, OODA integration | +| `research/*.md` | Ten research artifacts (technical, regulatory, positioning, design, risks, review, workflow risks, architecture, UX, prompt patterns) | ## Documentation -Detailed documentation lives under [`docs/`](docs/README.md): - -- [`docs/architecture.md`](docs/architecture.md) — how the engine works, severity-first conflict resolution, determinism strategy -- [`docs/dsl-reference.md`](docs/dsl-reference.md) — full YAML grammar with every operator -- [`docs/audit-trail.md`](docs/audit-trail.md) — what's captured, how to replay, EU AI Act mapping -- [`docs/extending.md`](docs/extending.md) — adding rules, flags, fixtures; pointing the engine at a new domain -- [`docs/ooda-integration.md`](docs/ooda-integration.md) — how this POC slots into the OODA orchestrator - -## Design choices and why - -- **Severity-first verdict.** A `blocked` rule beats any number of `ready-to-progress` rules, regardless of weight. Verdicts are categorical tiers — exactly as the source pattern argues. -- **Determinism by construction.** Rules are sorted `[priority desc, id asc]` at load time; flags are serialised through `canonicalJson` (sorted keys) before hashing; no `Date.now`, no `Math.random`, no async, no `Object.entries` in the hot path. 
-- **Content-hash provenance.** Each rule carries a SHA-256 of its content; the result carries `rulesetHash` and `flagsHash`. An auditor can replay a verdict from those three artifacts alone. -- **Missing flags are not silent.** A rule that references an absent flag fails with reason `"flag missing in extraction"` — surfaced in the audit trail rather than swallowed. -- **YAML, with strict load-time validation.** Picked for diff-ability; the loader rejects malformed rules with a helpful error before any evaluation. +| Doc | Read when | +|---|---| +| [`docs/workflow.md`](docs/workflow.md) | You want to run the plan / paste / report loop end-to-end. | +| [`docs/architecture.md`](docs/architecture.md) | You want to understand how the engine is built and why it's deterministic. | +| [`docs/dsl-reference.md`](docs/dsl-reference.md) | You're writing or reading a rule file — every YAML operator. | +| [`docs/audit-trail.md`](docs/audit-trail.md) | You want to replay a verdict, diff two verdicts, or map to EU AI Act / ISO 42001 requirements. | +| [`docs/extending.md`](docs/extending.md) | You want to add rules, flags, or fixtures. | +| [`docs/ooda-integration.md`](docs/ooda-integration.md) | You want to understand how this POC slots into the OODA orchestrator. | ## What this is not -- **Not a production rule engine.** It is intentionally minimal (~250 LOC engine + loader). The technical-landscape research recommends [`json-rules-engine`](https://github.com/CacheControl/json-rules-engine) behind a thin adapter when this graduates from POC; see `research/01-technical-landscape.md` for the comparison. -- **Not opinionated about extraction.** The LLM half of the pipeline is mocked by the fixture JSON files. Wiring a real Claude / GPT extraction call is a separate POC. -- **Not yet calibrated.** Weights and severity ordering are placeholders. The critic flags this as the top risk; see `research/05-risks-critique.md` for the proposed Riskiest Assumption Tests. 
+- **Not a production rule engine.** Intentionally minimal. The technical-landscape research recommends [`json-rules-engine`](https://github.com/CacheControl/json-rules-engine) behind a thin adapter when this graduates from POC. +- **Not opinionated about which AI tool.** The prompt is engineered to work with any frontier model (Claude / GPT / Gemini); the paste step is intentionally manual. +- **Not yet calibrated.** Weights and severity ordering are placeholders. The critic flags this as the top risk; see [`research/05-risks-critique.md`](research/05-risks-critique.md) and [`research/07-workflow-risks.md`](research/07-workflow-risks.md) for the proposed Riskiest Assumption Tests. +- **Not leaking into the main repo.** Stays self-contained under `experiments/rule-engine-poc/`. No `/spec:status` wiring, no plugin packaging, no library yet — those are downstream of the POC. -## Research artifacts +## Research -Each of the five research angles produced a standalone brief under `research/`: +Ten research artifacts under [`research/`](research/) informed the design across two waves: -1. [`01-technical-landscape.md`](research/01-technical-landscape.md) — library comparison; build-vs-buy recommendation -2. [`02-regulatory-auditability.md`](research/02-regulatory-auditability.md) — EU AI Act / ISO 42001 / NIST RMF audit-trail checklist -3. [`03-positioning-jtbd.md`](research/03-positioning-jtbd.md) — JTBD, North Star, competitive positioning -4. [`04-technical-design.md`](research/04-technical-design.md) — alternative architecture sketch (deeper than the POC implementation) -5. 
[`05-risks-critique.md`](research/05-risks-critique.md) — failure modes, blindspots, 3 RATs to falsify first +| # | Angle | +|---|---| +| 01 | Technical landscape — TS/JS rule engines, build vs adopt | +| 02 | Regulatory & auditability — EU AI Act, ISO 42001, NIST RMF | +| 03 | Positioning, JTBD, North Star metric | +| 04 | Architecture (alternative deeper sketch) | +| 05 | Risks & critique (engine layer) | +| 06 | Independent review of the POC at HEAD | +| 07 | Workflow failure modes & RATs | +| 08 | Plan/report workflow architecture | +| 09 | User-flow audit & friction points | +| 10 | LLM extraction prompt patterns | ## North Star -From the positioning research: **verdict reproducibility rate** — the percentage of `(flags, rule set) -> verdict` pairs that match byte-for-byte across two runs on the same input. The reproducibility test suite (`test/engine.test.ts` -> `describe("reproducibility")`) exercises this directly; in this POC the rate is 100% by construction. +From the positioning research: **verdict reproducibility rate** — the percentage of `(flags, rule set) -> verdict` pairs that match byte-for-byte across two runs on the same input. The reproducibility test suite (`test/engine.test.ts -> describe("reproducibility")`) exercises this directly; in this POC the rate is 100% by construction. diff --git a/experiments/rule-engine-poc/docs/workflow.md b/experiments/rule-engine-poc/docs/workflow.md new file mode 100644 index 000000000..7aba978ef --- /dev/null +++ b/experiments/rule-engine-poc/docs/workflow.md @@ -0,0 +1,142 @@ +--- +title: End-to-end workflow +folder: experiments/rule-engine-poc/docs +description: How content moves from a feature folder to a verdict — config-driven targets, the plan command, the manual AI paste step, the report command, and the HTML output. +entry_point: false +--- + +# End-to-end workflow + +The POC's primary flow is config-driven and ends in an HTML report. 
Two commands frame the loop: + +``` ++-----------------+ +-----------------+ +-----------------+ +| CONTENT | | EXTRACTION | | VERDICT | +| | | | | | +| specs// | plan | prompts/X.md | report | reports/X.html | +| ...md | ----> | | | ----> | + opens in | +| any folder | | v | | browser | +| | | user pastes | | | +| | | into AI tool | | | +| | | | | | | +| | | v | | | +| | | extractions/ | | | +| | | X.json | | | ++-----------------+ +-----------------+ +-----------------+ +``` + +The four stages of the OODA orchestrator map onto this loop: + +| OODA stage | Where it lives | +|---|---| +| Observe | The source files referenced from `rule-engine.config.json` (`targets[*].paths`) | +| Orient (stochastic) | The AI tool the user pastes the generated prompt into | +| Decide (deterministic) | `npm run report` — engine + audit trail + HTML rendering | +| Act | The actions listed on the HTML report — wired up externally | + +## 1. Configure targets + +A `rule-engine.config.json` next to the POC defines what to analyse. Each target produces one prompt, one extraction file, and one HTML report. + +```json +{ + "rules": "rules/quality-gates.yaml", + "flagSchema": "rules/flag-schema.yaml", + "promptsDir": "prompts", + "extractionsDir": "extractions", + "reportsDir": "reports", + "openBrowser": true, + "targets": [ + { + "id": "astro-product-page", + "label": "Astro product page feature", + "paths": ["../../specs/astro-product-page"] + } + ] +} +``` + +Paths are resolved relative to the config file. A path may point at a file or a directory; directories are walked, hidden files / `node_modules` / `dist` are skipped, and only allow-listed extensions are inlined (`.md`, `.mdx`, `.yaml`, `.yml`, `.json`, `.txt`). + +## 2. Generate prompts — `npm run plan` + +```bash +npm run plan # every target +npm run plan -- --target <id> # one target +npm run plan -- --config path.json # custom config location +``` + +For each target, the command: + +1. 
Loads the rules and flag schema. +2. Walks the configured paths and collects file contents in deterministic order. +3. Truncates each file at 8 KB with a clear marker so prompts stay manageable. +4. Bundles role, rules, flag schema, source material, and a response template into a single Markdown file under `promptsDir/<id>.md`. +5. Warns on stderr if a rule references a flag missing from the schema. + +The prompt is engineered (per `research/10-extraction-prompt-patterns.md`) to: + +- Cast the model as an **evidence extractor, not a fact-checker**. +- Forbid verdict-shaped fields by name (`verdict`, `assessment`, `conclusion`, `summary`, `recommendation`, `rationale`, `analysis`). +- End with an open XML response tag — a forcing function so the next token the model emits has to look like the start of JSON. +- Use XML tags as primary structure (Claude-friendly) with Markdown headers as redundancy (GPT/Gemini-friendly). +- Instruct the model to **omit** flags it cannot determine, so the audit trail later surfaces "flag missing in extraction" rather than the model guessing. + +## 3. Paste into an AI tool + +Open `prompts/<id>.md`, copy the contents, paste into Claude, ChatGPT, or any chat AI. The response should be a single JSON object inside the response tags with no surrounding prose. + +Save the JSON (just the object, without the surrounding tags) to `extractions/<id>.json`. + +> This manual paste step is the POC's defining trade-off. A production wiring would call the LLM API with a constrained-decoding response schema; the manual loop is the lo-fi way to prove the rest of the pipeline before paying for the API integration. See `research/07-workflow-risks.md` for what to watch for during this step. + +## 4. Generate reports — `npm run report` + +```bash +npm run report # every target +npm run report -- --target <id> # one target +npm run report -- --no-open # do not try to open a browser +``` + +For each target, the command: + +1. Loads the rules and the extraction file. +2. 
Runs the engine — deterministic verdict, weighted tally, audit trail. +3. Writes a self-contained HTML report to `reportsDir/<id>.html`. +4. If `openBrowser: true` and `--no-open` is not set, opens the first report in the OS default browser (best-effort). + +### Exit codes + +| Code | Meaning | +|---|---| +| `0` | All targets evaluated; none `blocked` | +| `1` | At least one target evaluated to `blocked` | +| `2` | At least one extraction missing or malformed | + +Usable directly as a CI gate. + +### Missing or malformed extractions + +If an extraction file is missing, the command prints a friendly error pointing at the right `npm run plan -- --target <id>` invocation, sets exit code 2, and continues with the remaining targets so a partial run still produces what reports it can. + +## 5. Read the report + +The HTML report renders the verdict prominently, lists the suggested actions, shows the weighted tally per tier, and prints the full audit trail in deterministic order. Provenance hashes at the bottom let you confirm the report came from a specific (engine version, rule set, flags) tuple. + +See [`audit-trail.md`](audit-trail.md) for what's captured and how to replay. + +## Why two commands and not one + +The split is intentional: + +- `plan` is **deterministic and fast** — it writes a file. No network. +- The **AI paste step** is the only stochastic part of the loop. Isolating it from the surrounding deterministic machinery makes it easy to swap out (manual today, API tomorrow) without changing the rest of the pipeline. +- `report` is **deterministic and fast** — it reads a file and writes a file. Idempotent. Re-run as many times as you like. + +This shape mirrors the [OODA orchestrator separation](ooda-integration.md): stochastic in Orient, deterministic everywhere else. + +## What's not yet here + +- **No schema-validate gate before `report`** — currently the engine treats absent flags as missing-in-extraction (which is fine) but accepts whatever JSON is in the file. 
A future `npm run validate` would parse the extraction, check it against the flag schema, refuse forbidden field names, and only then let `report` proceed. See `research/10-extraction-prompt-patterns.md` for the design. +- **No stale-extraction detection** — if you edit the source files after the extraction was produced, the report still renders. The critic in `research/07-workflow-risks.md` flags this as the riskiest UX failure mode; a future iteration would hash the prompt and refuse to render a report from an extraction produced against a different prompt. +- **No automated API call** — the manual paste step is on purpose for the POC. diff --git a/experiments/rule-engine-poc/package.json b/experiments/rule-engine-poc/package.json index 13fa0d76d..8266ed1c5 100644 --- a/experiments/rule-engine-poc/package.json +++ b/experiments/rule-engine-poc/package.json @@ -6,6 +6,8 @@ "description": "Proof-of-concept rule engine for the Specorator OODA orchestrator: LLM extracts, rules decide. Applied to the repo's own quality framework.", "scripts": { "build": "tsc -p .", + "plan": "tsx src/plan.ts", + "report": "tsx src/report.ts", "demo": "tsx src/cli.ts rules/quality-gates.yaml fixtures/ready-implementation.json", "demo:blocked-ears": "tsx src/cli.ts rules/quality-gates.yaml fixtures/blocked-missing-ears.json", "demo:blocked-s1": "tsx src/cli.ts rules/quality-gates.yaml fixtures/blocked-s1-finding.json", diff --git a/experiments/rule-engine-poc/research/06-independent-review.md b/experiments/rule-engine-poc/research/06-independent-review.md new file mode 100644 index 000000000..3b48d02b7 --- /dev/null +++ b/experiments/rule-engine-poc/research/06-independent-review.md @@ -0,0 +1,71 @@ +--- +title: Independent Review — Rule Engine POC +folder: experiments/rule-engine-poc/research +description: Stage-9-style independent review of the rule engine POC at HEAD of branch claude/rule-engine-poc-gO5yq, covering engine, loader, types, HTML reporter, docs, and scope alignment. 
+entry_point: false +--- + +# 06 — Independent Review + +Scope: engine + loader + types + HTML reporter + docs as of HEAD of `claude/rule-engine-poc-gO5yq` (PR #525). Constitution and `docs/quality-framework.md` applied, scaled to "sandbox POC, not a tracked /spec:start feature." Codex's three prior rounds (verdict allow-list, non-array `when` groups, empty `when` groups, `exists` AND-chain, no-operator conditions, `exists: false` + value ops) are treated as resolved and not re-flagged. + +## Verdict — **pass-with-findings** + +The POC delivers what its docs promise: a pure, synchronous, content-hashed evaluator with a replayable audit trail and a sensible, narrow DSL. The reproducibility property is real, not aspirational. Findings below are mostly S3/S4 — none break the demo or the North Star contract, but several would mislead a reader who relied on the docs as a spec, and a few would bite the moment the POC met real LLM-extracted flags. + +## Findings + +### S2 — high (would mislead a reader treating docs as spec) + +**S2-1 — `not` rule with a missing flag fires "by accident."** The engine inverts the inner condition result. A missing flag yields `matched: false`, so `not` flips it to `true` and the rule contributes. Confirmed by probe: `when.not: [{flag: x, eq: true}]` on `{}` returns `needs-attention`. `docs/dsl-reference.md` says "missing flags are not silent — surface with reason `flag missing in extraction`" but here the rule fires and the missing-flag reason is hidden by the inversion at `engine.ts:122`. Either document this asymmetry or require `not` clauses to gate on `exists: true` first. (Mirror of the "schema miss laundering" risk in `research/05-risks-critique.md` §2 bullet 1, made concrete inside Decide.) + +**S2-2 — `gt`/`lt` against a non-number silently report `matched=false` with no reason.** Probe: `gt: 7` on `x: "8"` yields `observed: "8"`, `matched: false`, no `reason` field. The author has no signal that the flag was the wrong type vs. 
legitimately below threshold. Symmetrical with the missing-flag case, which does set a reason. Add `reason: "value not numeric"` (and an analogous reason for `regex` on a non-string flag, where the probe also shows a silent miss). + +### S3 — medium (documentation drift / DSL gaps) + +**S3-1 — `description` is not the only field excluded from the rule hash; `stage` and `tags` are too.** `loader.ts:39-44` hashes `{id, priority, when, then}`. `docs/architecture.md:81` and `docs/dsl-reference.md:30` say only "`description` and `tags`" are excluded. Verified by probe: two rules differing only in `stage` and `tags` produce identical hashes. Either include `stage` in the hash (it is semantically informational but a stage edit changes behaviour expectations) or extend the docs to enumerate the full exclude list `{description, stage, tags, sourceFile, sourceIndex}` so auditors know what an unchanged hash actually guarantees. + +**S3-2 — Duplicate rule ids are accepted silently.** Confirmed by probe: two rules with `id: same` load, both appear in `evaluations`, and only the lexical id-asc tiebreak holds the order. `research/04-technical-design.md:248` explicitly calls "Duplicate rule ids: hard error at load" a design requirement; the implementation does not enforce it. The whole `(ruleId, ruleHash)` pair becomes ambiguous in audit diffing. + +**S3-3 — DSL operator gap surfaced by the example rules.** `quality-gates.yaml` repeatedly encodes "applies once we've reached requirements stage" by listing every later stage in an `in:` array (e.g. `req-ears-mandatory`, `impl-lint-clean`, `spec-items-trace-to-requirements`). This is a workaround for a missing **stage-ordering** primitive. Without it, every new stage forces edits across N rules and the rules become a brittle copy-paste cluster. Either add a `gte` (over an ordered enum) operator or document a `current_stage_index` companion flag pattern in `extending.md`. 
+ +**S3-4 — `in: []` matches nothing but loads without complaint.** Probe: `in: []` produces a rule that can never fire — the same "vacuously false" pitfall the loader already rejects for `when.any: []`. Reject empty `in` arrays at load time for consistency. + +**S3-5 — Empty `actions: []` is the only DoD escape hatch but is undocumented.** The two `impl-ready` and `idea-ready` rules contribute `verdict: ready-to-progress` with non-empty actions; a hypothetical rule with `actions: []` would still contribute to the tally but add nothing to the sorted action list. The semantics are fine but not stated. `docs/dsl-reference.md` should clarify that `actions` may be empty. + +**S3-6 — `weight` documentation reality-check.** Both `architecture.md` and `dsl-reference.md` say "weight only informs action prioritisation within a tier." But the rendered audit trail and HTML report show `weightedTally` as a sum per tier, and actions are sorted **alphabetically** (`engine.ts:177`), not by contributing-rule weight. So weight currently affects *nothing observable* except the tally display. Either implement weight-based action ordering or note explicitly that weight is presently inert and reserved. + +**S3-7 — Audit-trail completeness vs. EU-AI-Act mapping.** `docs/audit-trail.md:96-105` claims `flagsHash` covers "Decision envelope (input + output)." Strictly true for the hash, but the JSON `--json` output does **not** include the raw `flags` object, only `flagsHash` — making the replay procedure require both the result and the original fixture file. For sandbox POC fine, but the doc claim is stronger than the artifact. Suggest: include `flags` verbatim in `VerdictResult` (under `inputs.flags`) or soften the doc claim. 
+ +### S4 — low (polish / future-proofing) + +**S4-1 — `ENGINE_VERSION` bumped from `"0.1.0"` (docs) to `"0.2.0"` (code) without changelog.** `docs/extending.md:104-113` requires a bump on shape changes and lists scenarios; the bump itself is recorded only as a string. Add a one-line `ENGINE_VERSION_HISTORY` array or a `CHANGELOG.md` so the auditor's "engine behaviour change" diagnosis in `audit-trail.md:81` is actionable. + +**S4-2 — HTML report timestamp is in the body, not a footnote.** `cli.ts:55` injects `new Date().toISOString()`; `html-report.ts:273` renders it. The docs acknowledge this as the one source of variability ("modulo the HTML report's timestamp footer"). Consider making the timestamp opt-in (`--no-timestamp`) so HTML reports can themselves be byte-diffable for cache-key use. + +**S4-3 — `deepEqual` in `engine.ts:30-37` is array-or-strict-equal only; objects are out of scope.** `FlagValue` excludes plain objects, so the current limitation is consistent with the type — but a future flag of shape `{count: 3, type: "x"}` would silently fall through to `===`. Add a `typeof value === "object" && !Array.isArray(value)` guard that throws "object flag values not supported" so the contract is enforced where it is used, not only at the type boundary. + +**S4-4 — `condition.in` accepts a list but the type permits arrays-of-arrays.** `Condition.in?: FlagValue[]` and `FlagValue` includes `string[]`, so `in: [["a","b"], ["c","d"]]` is type-legal but never useful (deep-equal against an entire array). Minor — but a runtime check `Array.isArray(observed) && depth check` would prevent silent miscompare. + +**S4-5 — Tests cover ~95% of documented surface but skip three things:** (1) the priority-tie + id-asc ordering that `research/04` lists as edge case #2; (2) the action-dedup-across-rules invariant (covered for alphabetical sort, not for "same action from two rules"); (3) the `not` + missing-flag behaviour (S2-1). 
Adding these as fixtures is cheaper than as unit tests and doubles as audit-trail regression cases per `docs/extending.md:88-92`. + +## Constitution alignment + +Article II (separation of concerns) and Article V (traceability) are honoured beautifully — the engine is the canonical example of a pure function with a fully self-describing return value, and the hash trio (`engineVersion`, `rulesetHash`, `flagsHash`) is exactly the "every artifact links to its inputs" pattern. Article IV (quality gates) is the area where the POC's own test/doc gaps matter most: claims in docs that aren't tested (S2-1, S2-2, S3-1) are themselves a small instance of "decision evaporation." + +## Scope check — what *not* to do (keep POC standalone) + +- **Do not** wire this into `/spec:review`, `/quality:status`, or any conductor skill. The promised follow-on workflow (plan + report commands, sidecar flag schema) belongs inside `experiments/rule-engine-poc/` until RAT-1/2/3 from `research/05` have run. +- **Do not** add `npm run quality:metrics` integration or import anything from the parent repo's `scripts/` — that would couple the experiment to a production surface. +- **Do not** promote the example `quality-gates.yaml` to a normative DoD source. It is a demo encoding, not a governance artifact; promoting it would create two competing sources of truth with `docs/quality-framework.md`. +- **Do not** swap in `json-rules-engine` yet (per `research/01` recommendation) — the in-house engine is small enough that the S2/S3 findings are cheaper to fix here than to re-discover behind a library adapter. + +## Concrete hardening (POC-internal only) + +1. Fix S2-1 and S2-2 — both are ~5 LOC in `engine.ts` and a fixture each. These are the only findings that could mislead a real run. +2. Promote S3-2 (duplicate-id rejection) to a loader hard error; one line in `loader.ts` plus a test. +3. 
Doc-only sweep for S3-1, S3-5, S3-6, S3-7 — bring `docs/` and the code into the same state without changing semantics. +4. Add a `CHANGELOG.md` next to `package.json` capturing the `0.1.0 → 0.2.0` bump and the rationale (S4-1). Future bumps then have a place to land. +5. Add the three missing-coverage fixtures from S4-5 — they are zero-cost regression assets. + +None of this touches the parent repo. All of it lands inside `experiments/rule-engine-poc/` and is reversible by deleting the experiment folder. diff --git a/experiments/rule-engine-poc/research/07-workflow-risks.md b/experiments/rule-engine-poc/research/07-workflow-risks.md new file mode 100644 index 000000000..4382cf359 --- /dev/null +++ b/experiments/rule-engine-poc/research/07-workflow-risks.md @@ -0,0 +1,160 @@ +# 07 — Workflow Risks: the manual paste-loop + +Critic review of the planned user-facing flow for the rule-engine POC. The +engine, loader, and HTML reporter exist; this critique is about the seam +between them and the human. Earlier critic round (`research/05-risks-critique.md`) +covered schema-miss laundering, expert-systems-winter dynamics, and false +confidence from formal audit trails. This document extends the analysis to +the `plan -> paste -> extract -> save -> report` workflow specifically. + +## 1. Friction in the human-in-the-loop step + +Step 3 ("paste the prompt into an AI tool, get back JSON") looks like one +sentence and is actually four chores: open tool, paste, wait, copy-clean-save. +At realistic repo scale (10 feature folders × weekly cadence) this is 40 +context-switches per month per user. Predictable consequences: + +- **Copy/paste fatigue.** Users will batch — running `plan` once, pasting + the easiest targets, deferring the long ones. The long ones are where the + signal lives. The verdict surface area silently biases toward small folders. 
+- **LLM not following schema.** Even with the schema in the prompt, model + outputs will drift: extra commentary before the JSON, markdown fences, + trailing prose, `null` vs missing fields, occasional invented flag names. + The POC has no validator at the paste boundary, so step 4 ("save the JSON + to `extractions/.json`") becomes the validator — silently. +- **Manual JSON editing.** When the LLM gets one field wrong, users will fix + it by hand in the JSON. That is the worst failure: it looks like extraction + output, it has no provenance, and it cannot be replayed. The audit trail + now contains forged evidence and nothing on the surface says so. +- **Context-window cliff.** Step 2 "bundles file contents" — for a feature + folder with `idea.md` + `research.md` + `design.md` + `spec.md` + `tasks.md` + this is easily 30-50k tokens. Free Claude.ai / ChatGPT will truncate + silently or refuse. Users will respond by deleting "the boring files" + before pasting. The rules will then fire on a partial view of the world + and emit a confident verdict. + +## 2. Failure modes unique to manual extraction + +- **Schema drift between prompt and rules.** Step 2 writes the prompt from + today's `flag-schema.yaml`; the user saves extraction Tuesday; the rules + change Wednesday; `npm run report` runs Thursday against a schema the + extraction never saw. There is no `schemaHash` carried on the saved JSON, + so the engine cannot detect this and will simply fail flags with "missing" + or — worse — score them as present-but-default. +- **LLM hallucinating flag values.** The model will helpfully output + `ears_coverage: 0.85` because the prompt asked for a number and 0.85 sounds + reasonable. The deterministic layer launders the hallucination into a + cited, hashed, replayable verdict. This is exactly the §2-bullet-1 failure + from research/05, but now with a paste-shaped trigger. +- **Re-running `plan` after extraction.** Step 2 writes + `prompts/.md`. 
If `plan` also touches `extractions/` or if a + user re-runs `plan` thinking it is idempotent and it clobbers their saved + JSON, hours of paste-work vanish. The POC must (a) never write under + `extractions/`, (b) refuse to overwrite a prompt whose content hash + differs without `--force`, (c) warn when an extraction's source-content + hash no longer matches the current target files. +- **Stale extractions.** Content evolves; extractions do not. After a + `tasks.md` edit, the saved JSON is a verdict on the prior state of the + world. Without a content-hash check, the HTML report will display a + confident verdict on stale evidence. This is the silent-failure mode that + kills the "audit trail = trust" claim. + +## 3. Trust calibration + +The HTML report is the artifact users will internalise. The first 5-10 +runs, they will read the audit trail; by run 20 they will read only the +verdict tile at the top. That transition is the actual product moment, and +the failure mode is brutal: a stale or hallucinated extraction produces a +green `ready-to-progress` badge that no one looks behind. The constitution +(Article IV) demands two-layer validation; this workflow has one layer (the +deterministic rule eval) and a pretend second layer (the unread trail). + +Mitigation the POC should adopt before any user touches it: the verdict +tile must surface the staleness signal (extraction age, source-content hash +mismatch, schema hash mismatch) at the same visual weight as the verdict +itself. If the trail is the trust mechanism, the trail's freshness must +appear above the fold. + +## 4. Adoption risks: temporary expedient vs permanent fixture + +The paste step is currently framed as expedient. It will become permanent +unless something forces automation. Forcing functions: + +- More than one human in the loop (paste does not scale past 1 user). +- Cadence faster than weekly (paste does not survive daily). +- A second consumer of extractions (CI gate, dashboard). 
+ +When automation arrives — Claude API, structured-output mode — the design +changes shape: schema validation moves into the call site, retries become +the norm, cost becomes a budget line, and the "user edits the JSON" escape +hatch disappears. **The POC should be designed as if that day is six weeks +away**, not as if paste is the destination. Concretely: treat +`extractions/.json` as a black-box artifact with a provenance +envelope (source hash, schema hash, model id, timestamp, "produced by: +manual-paste | api-call"), so the API-call replacement is a producer swap +not a schema migration. + +## 5. Three riskiest assumptions — RAT designs + +**RAT-A — The paste loop is tolerable for one user, one week.** +*Assumption:* A single user will complete the full `plan -> paste -> save -> report` +loop for 5 targets without giving up or batching. +*Falsification:* Instrument the POC to log prompt-generation timestamps and +extraction-save timestamps. **Refuted if median elapsed time per target +>10 min, or if >1 of 5 targets is skipped, or if any extraction is +hand-edited after save (detectable by re-hashing).** Cheapest test. Run it +on the POC author first; if *they* batch, no one else will tolerate it. + +**RAT-B — LLM output conforms to the schema without hand-fixing.** +*Assumption:* >80% of paste-back JSON validates against `flag-schema.yaml` +on first try across Claude.ai, ChatGPT, and Claude Code. +*Falsification:* 10 targets × 3 tools = 30 paste cycles. **Refuted if <24 +of 30 validate first-try, or if any tool consistently emits markdown fences +or prose preamble.** This determines whether step 4 needs a validator +(it does) and whether the prompt template needs per-tool variants (likely). + +**RAT-C — Users notice when an extraction goes stale.** +*Assumption:* Without explicit warnings, users will spot that a report is +based on stale evidence within 1 working day of the underlying content +changing. +*Falsification:* Give 3 users a fresh report. 
24 hours later, edit one +source file under the target. Ask them to re-run `report` (not `plan`). +**Refuted if any of 3 fails to notice the staleness before accepting the +verdict.** This is the trust-calibration test. If refuted, the staleness +signal must be a hard block (refuse to render), not a soft warning. + +## 6. What the POC should deliberately NOT do + +- **Do not silently discard non-conforming LLM output.** If the saved JSON + fails schema validation, fail loud with the offending field — never coerce, + never default. Silent coercion is the schema-miss laundering pipe. +- **Do not auto-open the browser on `npm run report` when the extraction is + stale or schema-mismatched.** Render a refusal page; force the user to + re-extract. Browser auto-open on a stale report is a trust accelerant in + the wrong direction. +- **Do not let `plan` write anywhere under `extractions/`.** Plan is + read-only with respect to the human's work product. Enforce in code. +- **Do not version the prompt and the schema separately.** Prompt content + must embed the schema hash it was generated from, and the saved extraction + must carry both. Three artifacts, one chain. +- **Do not add a "fix this JSON for me" affordance.** The first time the + tool offers to repair LLM output, the audit trail dies. Refuse loudly, + ask the user to re-extract. Friction here is the feature. +- **Do not ship a "trust score" on the verdict.** A numeric confidence on + the verdict tile will be read as "the tool is sure" and will accelerate + the trail-skipping failure in §3. Verdict is categorical; staleness and + schema-match are categorical; keep it so. +- **Do not let the HTML report be shareable without its provenance + envelope.** A screenshot of a green badge will circulate in Slack within + a week of release; the envelope must be in the screenshot or the badge + must not exist. + +--- + +**Recommendation:** Run RAT-A and RAT-B before any further engineering on +the user-facing flow. 
RAT-A is one afternoon with the POC author. RAT-B is +one afternoon with three browser tabs. If either refutes, the paste-loop +framing is wrong and the POC should jump straight to a structured-output +API call (and accept the cost / key-management consequences). Default +verdict if RATs are skipped: **no-go on the manual flow.** The earlier +critic round's default no-go stands; this round does not lift it. diff --git a/experiments/rule-engine-poc/research/08-workflow-architecture.md b/experiments/rule-engine-poc/research/08-workflow-architecture.md new file mode 100644 index 000000000..6b6a6bc70 --- /dev/null +++ b/experiments/rule-engine-poc/research/08-workflow-architecture.md @@ -0,0 +1,423 @@ +# 08 — Workflow Architecture: `plan` / `report` CLIs + +Architecture sketch for the two-CLI workflow being added on top of the +existing engine. Goal: turn the POC from "run the engine on a hand-written +flags JSON" into "point at a project, get a prompt, paste back an +extraction, get an HTML verdict in the browser" — without disturbing the +existing engine, loader, or HTML reporter public API. + +The implementer is working in parallel. This is the independent sketch +for sanity-check. + +## 1. 
Component overview + +``` + rule-engine.config.json + | + +---------------+---------------+ + | | + v v + +---------------------+ +-------------------------+ + | plan CLI | | report CLI | + | src/plan.ts | | src/report.ts | + +----------+----------+ +-----------+-------------+ + | | + v v + +---------------------+ +-------------------------+ + | context-gatherer | | extraction-loader | + | src/context.ts | | src/extraction.ts | + +----------+----------+ +-----------+-------------+ + | | + v v + +---------------------+ +-------------------------+ + | prompt-builder | | engine + html-report | + | src/prompt.ts | | (existing, unchanged) | + +---------------------+ +-----------+-------------+ + | + v + +-------------------------+ + | open-browser | + | src/open-browser.ts | + +-------------------------+ + +Shared: + src/config.ts load + validate rule-engine.config.json + src/flag-schema.ts load rules/flag-schema.yaml, validate against rules + src/loader.ts (existing — extended with optional schema arg) +``` + +Two CLIs, six new modules, one extension point on the existing loader. +Nothing new on the engine. + +## 2. 
Module layout + +| File | Exports | Depends on | +|---|---|---| +| `src/config.ts` | `loadConfig(path)`, `type RuleEngineConfig`, `type TargetConfig`, `ConfigError` | node:fs, node:path | +| `src/flag-schema.ts` | `loadFlagSchema(path)`, `validateRulesAgainstSchema(rules, schema)`, `type FlagSchema`, `type FlagSchemaEntry` | js-yaml, types | +| `src/context.ts` | `gatherContext(paths, opts)`, `type GatheredFile`, `type ContextSlice` | node:fs, node:path | +| `src/prompt.ts` | `buildExtractionPrompt(input)`, `type PromptInput` | flag-schema types, context types | +| `src/extraction.ts` | `loadExtraction(path)`, `ExtractionMissingError`, `ExtractionParseError` | node:fs, types | +| `src/open-browser.ts` | `openInBrowser(path, opts)` | node:child_process, node:os | +| `src/plan.ts` | CLI entry (default export = `main(argv)`) | config, flag-schema, context, prompt | +| `src/report.ts` | CLI entry (default export = `main(argv)`) | config, loader, extraction, engine, html-report, open-browser | + +Existing files untouched except `src/loader.ts`, which gains **one +optional** parameter: + +```ts +loadRulesFromFile(filePath: string, opts?: { schema?: FlagSchema; onUnknownFlag?: "warn" | "throw" }) +``` + +Default behaviour (no opts) is byte-identical to today. `plan` and +`report` pass `{ schema, onUnknownFlag: "warn" }`. The existing `cli.ts` +stays as-is — it's the low-level escape hatch for ad-hoc runs. + +`package.json` `bin` adds `rule-engine-plan` and `rule-engine-report`; +the original `rule-engine-poc` stays. + +## 3. 
Config TS contract + +```ts +export interface TargetConfig { + id: string; // [a-z0-9-]+, unique within file + label: string; // human-readable + paths: string[]; // glob-free, file or directory, relative to config dir +} + +export interface RuleEngineConfig { + rules: string; // path to YAML rule file + flagSchema: string; // path to flag-schema.yaml + extractionsDir: string; // where extraction JSON lives + promptsDir: string; // where plan writes prompts + reportsDir: string; // where report writes HTML + openBrowser: boolean; // default true + targets: TargetConfig[]; // >= 1 +} +``` + +Validation rules (all enforced in `loadConfig`): + +1. File exists, parses as JSON, top-level object. Otherwise: + `ConfigError: rule-engine.config.json not found at ` / + `ConfigError: : invalid JSON: `. +2. All path fields are non-empty strings. Relative paths resolve against + `dirname(configPath)`. Resolved paths are returned absolute on the + loaded object so downstream code never re-resolves. +3. `rules` and `flagSchema` must exist on disk; `extractionsDir`, + `promptsDir`, `reportsDir` are created lazily (mkdir -p) by the + consumers, not by the loader. +4. `targets` is a non-empty array; each entry has unique `id` matching + `^[a-z0-9][a-z0-9-]*$`; `paths` is non-empty array of strings; + per-path existence is **not** checked at load time (paths may be + created later — checked by `gatherContext` with a per-target warning). +5. `openBrowser` defaults to `true` if absent. + +Error messages prefix `rule-engine.config.json:` and name the offending +key path, e.g. `rule-engine.config.json: targets[2].paths must be a +non-empty array of strings`. Same format the existing loader uses for +YAML errors — keeps the UX consistent. + +## 4. 
FlagSchema TS contract + +```ts +export type FlagType = "boolean" | "number" | "string" | "string[]" | "enum"; + +export interface FlagSchemaEntry { + type: FlagType; + description: string; // required, used in the prompt + example: unknown; // required, shown in the prompt as a JSON literal + enum?: string[]; // required iff type === "enum" +} + +export type FlagSchema = Record<string, FlagSchemaEntry>; +``` + +YAML shape (sidecar `rules/flag-schema.yaml`): + +```yaml +current_stage: + type: enum + enum: [idea, research, requirements, design, specification, tasks, + implementation, testing, review, release, learning] + description: The lifecycle stage the feature folder is currently in. + example: implementation + +s1_findings_count: + type: number + description: Count of S1 (critical) findings on the feature. + example: 0 +``` + +Loader integration: after `loadRulesFromFile` finishes its existing +validation, if a schema was provided it walks every condition in every +rule and collects the set of referenced flag names. The diff +`referenced \ schemaKeys` is the unknown set; the diff `schemaKeys \ +referenced` is the unused set. + +- **Unknown flags** — emit `[flag-schema] warn: rule '<rule-id>' references + flag '<flag-name>' not in <schema-path>` to stderr by default + (`onUnknownFlag: "warn"`). `report` and `plan` always pass `"warn"`; + CI users can set `RULE_ENGINE_STRICT_SCHEMA=1` to flip to `throw`. + Warning is non-fatal because the schema is intentionally an *aid*, + not a gate — the engine itself already handles missing flags + deterministically. +- **Unused flags** — only logged when `--verbose`. Common during + iteration; rarely a real error. + +The warn-not-throw default matches the existing loader's philosophy of +surfacing rather than swallowing. + +## 5. 
`plan` CLI + +``` +rule-engine-plan --target <id> [--config <path>] [--out <path>] [--all] +``` + +| Flag | Default | Behaviour | +|---|---|---| +| `--target <id>` | required unless `--all` | which target from config | +| `--config <path>` | `./rule-engine.config.json` (cwd-relative) | path to config | +| `--out <path>` | `<promptsDir>/<id>.prompt.md` | override output | +| `--all` | off | generate one prompt per target | + +Behaviour: + +1. Load config + flag schema. +2. Resolve target → list of paths. +3. `gatherContext(paths, ...)` → ordered list of `GatheredFile`. +4. `buildExtractionPrompt({ target, schema, files })` → Markdown string. +5. Write to `--out` (or default), `mkdir -p` the parent. +6. Print `wrote <path> (N files, K bytes)` to stderr; print the abs path + alone to stdout (pipe-friendly). +7. Never opens browser. Exit 0 unless the config or paths fail to load. + +Prompt structure (Markdown, fixed section order — determinism matters +because we want diffable prompts): + +``` +# Extraction prompt — <target label> + +## Your job +You are extracting structured flags from a project folder. You DO NOT +make judgements. You DO NOT produce a verdict. Return ONLY a JSON object +matching the schema below. + +## Output schema +<schema table: one row per flag — name, type, description, example> + +## Response template +```json +{ + "current_stage": "...", + "s1_findings_count": 0, + ... +} +``` +Every flag in the schema MUST appear in the response. Use `null` only +where the schema explicitly allows it. + +## Context +### <relPath> (N bytes) +``` +<file contents> +``` +### ... (more files in deterministic order) + +## Reminders +- Return only the JSON object. No prose, no fences, no commentary. +- If a field is not determinable from the context, set it to its + schema default rather than guessing. +``` + +Truncation policy (single knob; not configurable in the POC): +per-file cap = 64 KB. Files larger than the cap are truncated to the +first 32 KB + `\n... [TRUNCATED N bytes] ...\n` + last 16 KB. A +`(truncated)` suffix appears in the section header. 
Binary files are +detected by null-byte sniff in the first 8 KB and replaced with +`[binary file omitted]`. Total prompt cap = 256 KB; once reached, +remaining files become `[OMITTED — prompt size cap reached]` placeholders +listed at the end so the LLM knows they exist. + +## 6. `report` CLI + +``` +rule-engine-report --target [--config ] [--no-open] [--json] [--all] +``` + +| Flag | Default | Behaviour | +|---|---|---| +| `--target ` | required unless `--all` | which target | +| `--config ` | `./rule-engine.config.json` | as for plan | +| `--no-open` | off | suppress browser open even if config says true | +| `--json` | off | also print full `VerdictResult` to stdout | +| `--all` | off | run for every target, open none, exit blocked on any blocked | + +Behaviour: + +1. Load config + flag schema + rules (with schema in warn mode). +2. Resolve `/.json` (configurable per-target + later; not in v1). If missing: + ``` + error: no extraction for target 'idea' at + /extractions/idea.json + run: + npm run plan -- --target idea + then paste the LLM's JSON response into that file and re-run report. + ``` + Exit code 2. +3. If present but invalid JSON: same shape, message + `invalid JSON: `. Exit code 2. +4. `evaluate(rules, flags)` → `VerdictResult`. +5. Render HTML via existing `renderHtmlReport`. Write to + `/.html`. +6. Print verdict + path to stderr (the existing text summary, trimmed: + verdict line + actions + report path). +7. `--json` echoes `VerdictResult` to stdout. +8. Best-effort browser open unless `--no-open` or `config.openBrowser + === false` or `process.env.CI`. +9. Exit code: existing map (`blocked` → 1, else 0). `--all`: exit 1 if + any target blocked. + +## 7. Context gathering + +`gatherContext(paths, opts)` walks each path: + +- If a file: include if extension is in the allow-list. +- If a directory: recursive walk, sorted `readdir` (lexicographic) for + determinism. 
Skip dot-directories (`.git`, `.next`, `node_modules`, + `dist`, `coverage`, `.worktrees`). The skip list is hard-coded; the + POC isn't a search engine. + +Allow-list (also hard-coded for POC): +`.md .markdown .yaml .yml .json .ts .tsx .js .jsx .py .go .rs .toml +.txt .csv` — plus any explicitly-listed file regardless of extension +(if the user puts `package.json` in `paths`, it's included even though +the dir walker would also pick it up). + +`GatheredFile`: + +```ts +interface GatheredFile { + absPath: string; + relPath: string; // relative to nearest target.paths[i] + bytes: number; + truncated: boolean; + contents: string; // already truncated per §5 policy + language: string; // fenced-code hint, "" for text +} +``` + +Ordering: files are sorted by `relPath` after gathering — independent +of `paths[]` order — so the same target produces the same prompt +regardless of how its `paths` were listed. A duplicate file (reachable +via two `paths` entries) is deduped by `absPath`, first occurrence wins. + +## 8. Browser open + +```ts +openInBrowser(absPath: string, opts?: { logger?: (s: string) => void }): void +``` + +Synchronous fire-and-forget: + +1. Detect platform via `process.platform`. + - `darwin` → `open ` + - `win32` → `cmd /c start "" ` + - else → `xdg-open ` +2. `spawn(cmd, args, { detached: true, stdio: "ignore" }).unref()`. +3. Wrap in `try/catch`; on any throw, log + `note: could not open browser (): ` and return. +4. Whether or not the spawn succeeds, always log + `report: ` to stderr — that line is the contract, not the + browser opening. +5. Skip entirely when `process.env.CI` is truthy (CI never wants a + browser; one fewer way to fail). + +Never throws. Never sets exit code. + +## 9. Multi-target + +**Recommendation: one prompt per target, one report per target.** +Combined output explicitly out of scope for v1. 
+ +Rationale: + +- Prompts are LLM-bound; mixing 4 targets into one extraction request + blows token budgets and confuses the model about which file belongs + to which "thing". Per-target prompts also let the user iterate on one + target (re-paste one JSON) without invalidating the others. +- Reports inherit the same pressure: the existing HTML reporter is + per-`VerdictResult`. A combined report = a fifth surface to design, + which the POC doesn't need. `--all` produces N independent files and + opens none of them; an index page is a v2 question. +- Determinism story is cleaner: `flagsHash` stays per-target. Combined + flags would need a target dimension on the engine — a real change to + the public API. + +If a v2 index page is desired, the obvious shape is a +`/index.html` written by `--all` that links to each +per-target report with its verdict pill. Out of scope here. + +## 10. Test surface + +Worth unit-testing: + +| Module | Test | Why | +|---|---|---| +| `config.ts` | each validation rule fires with its message | message stability is part of the UX | +| `config.ts` | relative paths resolve against config dir, not cwd | classic foot-gun | +| `flag-schema.ts` | unknown-flag detection (warn list, throw mode) | the headline feature of the schema | +| `flag-schema.ts` | rules with no schema = no warnings (back-compat) | guards the loader's existing call sites | +| `context.ts` | deterministic ordering, dedup, dot-dir skip, allow-list, truncation policy (each branch), binary detection | snapshot-prone; high regression risk | +| `prompt.ts` | section order stable, schema table renders every flag, JSON template lists every flag, truncated-file marker present | the prompt is the product | +| `extraction.ts` | missing file message includes the `npm run plan` hint verbatim | error UX is load-bearing | +| `extraction.ts` | invalid JSON message includes line:col | likewise | +| `open-browser.ts` | platform branch selection (mock `process.platform`), spawn-failure 
swallowed, CI skip | tiny but easy to regress | + +Covered by integration (one end-to-end test per CLI is enough): + +- `plan --target idea` end-to-end against a tiny fixture target. +- `report --target idea` against a paired fixture extraction. +- `report` exit codes — blocked vs ready. +- `report --no-open` actually suppresses the spawn. + +Snapshot tests on prompt and HTML are fine; the existing reproducibility +test discipline applies — strip the generated-at timestamp before diff. + +## 11. Edge cases + +| Case | Handling | +|---|---| +| Empty `targets` array | `loadConfig` throws `targets must be non-empty` | +| Missing config file | `ConfigError: rule-engine.config.json not found at ` with hint `pass --config or create one in cwd` | +| `--target foo` not in config | `error: target 'foo' not found. known targets: idea, requirements, design` then exit 2 | +| Extraction file missing | per §6 — exit 2 with `npm run plan` pointer | +| Extraction file invalid JSON | per §6 — exit 2 with line:col | +| Extraction missing schema-required flag | engine already records `flag missing in extraction` per condition. 
`report` additionally prints a one-line warning summary above the verdict: `warning: extraction missing 3 flags declared in schema: a, b, c` | +| Rule references flag not in schema | loader warning (per §4); no exit-code change | +| Target path doesn't exist | `gatherContext` records it as `[missing: ]` and continues; prompt header lists missing paths in a `## Missing paths` section so the LLM doesn't hallucinate their contents | +| Target with zero usable files after filtering | `plan` exits 2 with `target 'x' produced 0 files after filtering — check paths and allow-list` | +| Browser command not installed | swallowed by §8; user sees `note: could not open browser (ENOENT): ` | +| Config has `openBrowser: false` and CLI omits `--no-open` | no open (config wins as the floor; CLI can only further suppress, not re-enable) | +| `--all` with one target erroring | continue processing the rest, aggregate failures, exit 1 if any blocked or errored | +| Prompt size cap reached | per §5 — placeholders + count printed to stderr | +| Schema entry with `type: enum` but no `enum` array | `flag-schema.ts` throws at load | +| Two targets with same `id` | `loadConfig` throws `duplicate target id 'x'` | + +## 12. Open questions for the implementer + +1. Should the per-target extraction filename be configurable (e.g. + `targets[].extractionFile`)? Default `/.json` + is fine for v1; flagging in case you've already designed otherwise. +2. Allow-list as hard-coded vs `config.allowExtensions`? I'd keep it + hard-coded for the POC and revisit if a real user hits it. +3. Should `plan` also emit a `.flags.template.json` stub alongside the + prompt? Tempting but adds a fourth artifact; the prompt's + "Response template" fenced block covers the same need. + +--- + +If the implementer's design diverges materially on §5 (prompt shape), +§7 (truncation / ordering), or §9 (multi-target stance), those are the +places to compare notes before merging. 
diff --git a/experiments/rule-engine-poc/research/09-user-flow.md b/experiments/rule-engine-poc/research/09-user-flow.md new file mode 100644 index 000000000..1acc0ac6c --- /dev/null +++ b/experiments/rule-engine-poc/research/09-user-flow.md @@ -0,0 +1,84 @@ +# 09 — User flow: plan / report workflow UX audit + +UX audit of the proposed two-step `plan` / `report` workflow that turns the rule engine from a fixture-replay demo into a tool a developer or PM can point at their own project. Audience: terminal-only POC users with no UI surface beyond CLI output and a self-contained HTML report. + +## Narrative walkthrough + +A PM clones the repo, runs `npm install`, then `npm run plan` because the README told her to. The command finishes with no obvious next step — twelve files appear under `prompts/`, but the terminal returned to the prompt. She opens `prompts/req-coverage.md`, sees a wall of instructions, and isn't sure whether to copy the whole file or just the fenced block. She pastes it into Claude.ai. Claude returns JSON in a code fence; she copies it, opens a new file at `extractions/req-coverage.json`, pastes, saves. She repeats for eleven more targets — and by target three is asking herself whether she has to do all twelve before `report` will run, or whether partial coverage is allowed. She runs `npm run report`. The browser pops open. The verdict is `blocked`. She trusts it, but doesn't immediately notice that two of the matched rules fired only because a flag was missing from her extraction, not because the underlying signal was bad. She closes the tab. The loop's biggest UX risk is not the verdict — it's the silent gap between "I pasted what the LLM gave me" and "the engine treated three flags as absent". 
+ +## Findings + +### Flow gaps and friction points + +- **F1 — `plan` exits silently.** After writing prompts, print a numbered next-step block: `1) open prompts/.md 2) paste into your AI tool 3) save the JSON it returns to extractions/.json 4) run npm run report`. The first-time user's biggest question after `plan` is "now what" — answer it inline, don't make them re-read the README. +- **F2 — Partial coverage is undefined.** The user doesn't know whether `report` requires all targets present or tolerates partial. Decide explicitly, document it in the prompt footer and in `report`'s opening line ("Found 7 of 12 extractions — proceeding; missing targets will be reported as `unknown`"). Silence here will be read as "I did something wrong". +- **F3 — Round-trip ambiguity at paste time.** LLMs return JSON inside fences, with prose around it. The prompt must explicitly say "return ONLY a JSON object, no prose, no fences" *and* the prompt footer must show the user how to strip fences if their tool added them. A `npm run report -- --tolerate-fences` flag that strips ```` ```json ```` and ```` ``` ```` envelopes is a kindness. + +### Empty / loading / error states + +| Condition | Required output | +|---|---| +| (a) no config | `plan`: `No config found at ./rule-engine.config.yaml. Run \`npm run plan -- --init\` to scaffold one, or pass --config .` Exit 2. | +| (b) config has zero targets | `plan`: `Config loaded but lists zero targets. Add at least one target under \`targets:\` — see docs/config.md for the shape.` Exit 2. | +| (c) target id not in config | `report --target `: `Target "" not in config. Known targets: a, b, c.` Exit 2. | +| (d) extraction file missing | `report`: warn per target `[missing] extractions/.json — skipping; rules referencing this target will report \`unknown\`.` Continue; do not abort. | +| (e) extraction file is invalid JSON | `report`: `extractions/.json: invalid JSON at line 14, column 3 — . Fix the file and re-run. 
(Tip: most LLM tools wrap JSON in \`\`\`json fences — strip them.)` Exit 2 for that target; continue the rest. | +| (f) extraction references flags not in schema | `report`: `extractions/.json: unknown flags ignored: tone_friendliness, vibe_score. (Known flags for this target: ...)` — warn, do not block. Render the warnings prominently in the HTML report's header. | +| (g) browser open fails | `report`: `Report written to reports/.html. Failed to open browser automatically (). Open the file manually: file:///abs/path`. Never fail the command on open-failure. | + +Loading: both commands take noticeable time on a real project. Emit one line per target processed (`[1/12] req-coverage … prompt written`), not a spinner. Spinners are hostile to log capture and screen readers. + +### Discoverability + +The README currently leads with the single-shot CLI. **Invert it.** The plan/report flow should be the primary "Run it" section; the `npx tsx src/cli.ts ...` single-shot becomes an "Advanced: replay a fixture" subsection further down. First-time users want the loop they came for, not the demo plumbing. + +Add a 4-line preamble above the new "Run it" block: what the loop does, what they need (an AI chat tool of their choice), how long it takes (rough order of magnitude per target), and where the verdict ends up. Anything less and the README hides the seams the user has to cross. + +### Prompt file UX + +- **Single `.md` file per target.** Keep it greppable and copy-pasteable; multi-file is friction for no payoff. +- **Structured sections, fixed order:** (1) `# Prompt — paste everything below into Claude/ChatGPT/Claude Code`, (2) the actual prompt body with the schema and the source material, (3) `---`, (4) `## What to do with the response` footer explaining where to save the JSON, what filename, and what to do if the LLM wrapped it in fences. 
+- **Include the target id and the expected output path verbatim in the footer** (`Save the JSON below as: extractions/req-coverage.json`) so the user does not have to compute the filename. +- **Stamp the prompt with the rules-file hash and prompt version** at the bottom. When the rule set changes, regenerated prompts will differ — users need to see why. + +### HTML report UX (now that the human is in the loop) + +The current report is built for replay, not authorship. Add: + +- A **"Your extractions" header strip** listing each extraction file consumed with its file mtime and hash, so the user can see at a glance which targets are stale. +- A **per-flag provenance row**: for each evaluated rule, show the extraction file the flag came from. A user who disagrees with a flag needs a one-click path back to the file to edit. +- An explicit **"Disagree with a flag? Edit `extractions/.json` and re-run `npm run report`"** callout near the verdict — the loop's whole point is human-in-the-loop, but nothing on the page hints at that. +- **Highlight `flag missing in extraction` failures distinctly** from `condition not satisfied` failures. Today both look like `[ ]` in the terminal output; in the HTML they need a separate visual treatment (warning row vs failed row) and a count in the header. +- **Re-run hint at the bottom**: the exact `npm run report` command, including any flags the user passed. + +### Naming + +`plan` and `report` describe artifacts, not user intent. `plan` is also overloaded across CI/IaC tooling (`terraform plan`). Stronger pair: + +- **`extract`** (verb, matches the OODA Orient quadrant vocabulary already used in the docs) and **`evaluate`** (verb, matches the engine API `evaluate()`). +- Acceptable second choice: `prompts` / `report` — noun + noun, also unambiguous. +- Avoid `prepare` / `render` (too generic) and `generate` / `score` (engine never "scores" — it emits verdicts; reusing "score" undermines the article's core distinction). 
+ +Recommendation: rename to **`extract` / `evaluate`** and keep `plan` / `report` as deprecated aliases for one minor version. + +### Accessibility (terminal) + +- **ASCII first; Unicode opt-in.** Box-drawing characters and emojis break in restricted terminals, ssh sessions, and screen readers. Stick to `[+]` / `[ ]` / `[!]` markers (already the pattern in `src/cli.ts`). +- **Color is decoration, not signal.** Always pair color with a text token (`VERDICT: BLOCKED` not just a red bar). Respect `NO_COLOR=1` and detect non-TTY stdout. +- **Bounded line width.** Wrap at 80 columns by default; tables should degrade to key-value pairs when wider than the terminal. +- **One line per state change.** Screen readers handle line-delimited progress (`[3/12] ...`) far better than redraw-in-place spinners. +- **HTML report**: keep the current no-JS, single-file design; add `<html lang="en">`, real `<th>` headers on tables, and a meaningful `<title>` per target so browser tabs are distinguishable. + +## Requirements coverage + +This audit is a UX consult, not a PRD; coverage maps to the bullets in the request: + +| Request bullet | Where addressed | +|---|---| +| Primary flow walkthrough | "Narrative walkthrough" | +| Empty / loading / error states (a–g) | "Empty / loading / error states" table | +| Discoverability / README structure | "Discoverability" | +| Prompt file UX | "Prompt file UX" | +| HTML report UX changes | "HTML report UX" | +| Naming | "Naming" | +| Accessibility | "Accessibility (terminal)" | diff --git a/experiments/rule-engine-poc/research/10-extraction-prompt-patterns.md b/experiments/rule-engine-poc/research/10-extraction-prompt-patterns.md new file mode 100644 index 000000000..ceb2ec686 --- /dev/null +++ b/experiments/rule-engine-poc/research/10-extraction-prompt-patterns.md @@ -0,0 +1,184 @@ +--- +title: Extraction prompt patterns +folder: experiments/rule-engine-poc/research +description: Research on how to write LLM extraction prompts that maximise schema-conformance and minimise 
the "verdict-not-flags" failure mode for the paste-into-chat workflow. +entry_point: false +--- + +# Extraction prompt patterns for `npm run plan` + +**Date:** 2026-05-17 +**Branch:** `claude/rule-engine-poc-gO5yq` **PR:** #525 +**Audience:** designer of the `npm run plan` prompt generator and the validator that follows it. +**Constraint:** the user pastes the prompt into Claude.ai / ChatGPT / Claude Code and pastes the JSON back. No API access, so constrained decoding (OpenAI `json_schema` strict, Anthropic `output_format`, Gemini `response_schema`) is **off the table** as a backstop — we lean entirely on prompt shape plus client-side validation. + +## 1. Why the "verdict instead of flags" failure mode happens + +Post-RLHF chat models are tuned to be helpful, conversational, and to "answer the question." When the source material asks an evaluative question ("is this claim true?"), the chat surface biases the model toward producing a *judgment* with prose ("Based on the evidence, this is mostly true because…") instead of the *evidence flags* the rule engine needs. This is the same root cause as the broader "JSON bleed" pattern documented in 2025–2026 write-ups: the model defaults to being readable for a human rather than parseable for a downstream machine ([Stop begging for JSON](https://www.ignorance.ai/p/stop-begging-for-json), [5 Ways LLMs Break JSON](https://medium.com/@mtdevworks2025/5-ways-llms-break-json-and-how-to-fix-them-f67fd8be5ba2)). + +Three contributing forces: + +- **Instruction averaging.** If the prompt says both "be a fact-checker" and "extract flags," the model collapses the two into a hybrid. +- **Salience of the question over the format.** Long source material with an obvious question crowds out the formatting block at the bottom. 
+- **Helpfulness reflex.** Anthropic explicitly notes that Claude Opus 4.7 "interprets prompts more literally" and will not generalise, which is good for us — but earlier models and GPT-class models still volunteer commentary unless they are starved of permission to do so ([Anthropic prompting best practices](https://platform.claude.com/docs/en/build-with-claude/prompt-engineering/claude-prompting-best-practices)). + +## 2. Structured-output techniques: what we can vs cannot use + +| Technique | API-only? | Useful for paste workflow? | +|---|---|---| +| OpenAI `response_format: {type: "json_schema", strict: true}` | API-only | No — but mirror its *schema discipline* in-prompt ([OpenAI docs](https://developers.openai.com/api/docs/guides/structured-outputs)) | +| OpenAI legacy `json_object` mode | API-only | No | +| Anthropic `output_format` / strict tool use | API-only (Sonnet 4.5, Opus 4.1+) | No, but pattern is informative ([Anthropic structured outputs](https://platform.claude.com/docs/en/build-with-claude/structured-outputs)) | +| Gemini `response_schema` / `response_json_schema` | API-only | No ([Gemini structured output](https://ai.google.dev/gemini-api/docs/structured-output)) | +| Assistant prefill (`{` opening) | API-only, and dropped on Claude Opus 4.6+ ([prefill docs](https://platform.claude.com/docs/en/build-with-claude/prompt-engineering/prefill-claudes-response)) | No | +| **XML tag scaffolding** | Works anywhere | **Yes — primary tool** ([Anthropic XML tags](https://platform.claude.com/docs/en/build-with-claude/prompt-engineering/use-xml-tags)) | +| **Schema-in-prompt + few-shot** | Works anywhere | **Yes — primary tool** | +| **Hard delimiter markers** (`---BEGIN JSON---` / `---END JSON---`) | Works anywhere | **Yes — fallback parser hook** | +| **Client-side validate-then-retry** | n/a | **Yes — required** | + +The takeaway: in the paste workflow, we are replicating constrained decoding socially through prompt shape plus a validator at the bottom of 
the funnel. + +## 3. Prompt-engineering patterns worth adopting + +- **Role narrowing.** Cast the model as an "evidence extractor" not a "fact-checker." Add: "You do not produce verdicts. A separate deterministic rule engine consumes your flags." +- **Schema in-prompt with allowed enums.** Inline JSON schema with `type`, `required`, `enum` constraints, and a per-field one-liner; explicit "no extra fields" rule ([Mastering JSON Prompting](https://machinelearningmastery.com/mastering-json-prompting-for-llms/)). +- **"Omit if uncertain" rule.** State per-field defaults (`null`, `[]`, `""`) and tell the model: "If you cannot find the flag with high confidence, omit/null it. Do not infer." +- **Few-shot, 2–3 examples.** Cover (a) typical positive case, (b) edge case with mostly-null fields, (c) refusal-shaped input where the answer is "no flags." Final example carries outsized weight, so put the cleanest one last ([Few-shot guide](https://mem0.ai/blog/few-shot-prompting-guide)). +- **Chain-of-verification, light version.** Single-pass two-block: ask the model first to think in `<scratchpad>` (which we discard) and emit JSON only in `<output>`. The full CoVe loop ([arXiv 2309.11495](https://arxiv.org/abs/2309.11495)) is multi-call and overkill for the paste flow. +- **Refusal handling.** Define a sentinel: "If you decline to answer, still emit `{\"refused\": true, \"reason\": \"…\"}` so the rule engine has something deterministic to handle." +- **End the prompt with the response marker.** Closing with `<output>` (and nothing after) primes continuation in-format ([SurePrompts JSON guide](https://sureprompts.com/blog/structured-output-prompting-guide)). + +## 4. Paste-specific failure modes to defend against + +1. **Markdown code-fence wrapping** (` ```json … ``` `). Most common across Claude and GPT-4o. +2. **Preamble prose** ("Here's the extracted JSON:"). +3. **Postamble offer** ("Let me know if you'd like me to refine any flag."). +4. 
**Multiple JSON blocks** when the model "shows its work" first then the final. +5. **Helpful extra fields** ("confidence_explanation", "summary"). +6. **Smart-quote contamination** when copy-paste passes through editors. +7. **JSONL drift** — one object per line instead of an array. +8. **Verdict bleed** — the failure mode we most fear: a `verdict` or `assessment` field appears unbidden. + +Defences: + +- The validator must strip a single outer ```` ``` ```` (or ```` ```json ```` ) fence before parsing. +- The validator must reject (not silently ignore) unknown fields, including `verdict`/`assessment`/`conclusion`. Loud failure trains the user to re-run. +- Use `<output>` and `</output>` XML anchors as a secondary extractor when fences are absent. +- Normalise smart quotes before `JSON.parse`. Consider [`json-repair`](https://github.com/mangiucugna/json_repair) as a last-resort opt-in flag — but default to strict. + +## 5. Should we ship a `validate` script? Yes. + +Recommendation: add `npm run validate` (separate from `report`) that: + +1. Reads pasted text from stdin or a file path. +2. Strips one layer of code fence; falls back to `<output>` tag extraction. +3. Parses; on failure, prints the offending excerpt and the JSON parser error position. +4. Validates against the canonical schema (we already need it for `report`). Use Ajv or Zod; either is fine. +5. Lists violations grouped by type: unknown field, wrong enum, missing required, type mismatch, **forbidden-field** (the verdict-bleed guard). +6. Exits non-zero on any violation so it can be CI-gated. + +`report` should call the validator internally and refuse to run on invalid input. The user message should read like: "Your pasted JSON has 2 issues; fix them in chat or re-run `plan`." + +## 6. Frontier-model tweaks (2026) + +- **Claude Opus 4.7 / Sonnet 4.6 / Haiku 4.5.** Respect XML tags strongly. Explicit literal instruction following — say "omit," not "consider omitting." 
Keep the prompt short; Opus 4.7 calibrates verbosity and may over-think long preambles ([Anthropic best practices](https://platform.claude.com/docs/en/build-with-claude/prompt-engineering/claude-prompting-best-practices)). +- **GPT-4o / o1 / GPT-5-class.** XML tags work but are not native; markdown headers (`## Schema`, `## Examples`, `## Source`) work equally well. GPT models are the most prone to markdown-fence wrapping — the validator must handle this. +- **Gemini 2.x / 3.** Tend to add prose around JSON even with strong instructions; mitigations are the same. Public reports note that explicit "no prose, no code fence, only raw JSON" instructions matter more for Gemini than for Claude ([Medium: Begging, Borrowing, and JSON-ing](https://medium.com/google-cloud/structured-output-with-gemini-models-begging-borrowing-and-json-ing-f70ffd60eae6)). +- **One prompt, all three.** Use XML tags as primary structure (Claude-friendly) and add markdown section headers as comments (GPT/Gemini-friendly). Each model family tolerates the other's convention. + +## 7. Anti-patterns observed in the wild + +- Asking for "JSON or prose, your choice." Never give the option. +- Mixing instruction and data inline ("Extract flags from: <pasted text>"). Always wrap data in a tagged block. +- Saying "be accurate" without saying "if uncertain, omit." Models then guess. +- Letting examples drift in shape from the schema. Examples set the shape more strongly than the schema does — they must be byte-identical to schema expectations. +- Negative-only instructions ("don't add prose"). Pair with positive: "Your entire response is a single JSON object inside `<output>` tags." +- Asking the model to "include a confidence and reasoning" inline. Reasoning belongs in `<scratchpad>` (discarded) or in a dedicated `evidence` array field — not free-form alongside the JSON. +- Telling the model the rules. Leak the rule-engine logic to the model and it will short-circuit to the verdict. 
Keep the rules out of the extraction prompt. + +## 8. Recommended prompt skeleton + +```text +<role> +You are a structured-evidence extractor. You read source material and +emit JSON flags that describe what evidence is or is not present. +You DO NOT issue verdicts, conclusions, or assessments. A separate +deterministic rule engine consumes your flags and produces the verdict. +</role> + +<task> +Extract flags from the source material in <source>. Emit a single JSON +object inside <output> tags. No prose, no code fence, no commentary. +</task> + +<schema> +{ + "type": "object", + "additionalProperties": false, + "required": ["flags"], + "properties": { + "flags": { + "type": "object", + "additionalProperties": false, + "properties": { + "<flag_name>": { "type": "boolean", "description": "..." }, + "<evidence_quote>": { "type": ["string","null"], "description": "exact quote or null" } + } + }, + "refused": { "type": "boolean" }, + "reason": { "type": "string" } + } +} +</schema> + +<rules> +- Omit or null any flag you cannot determine with high confidence. +- Do NOT add fields beyond the schema, even if the source invites them. +- Do NOT include "verdict", "assessment", "conclusion", "summary", or "recommendation". +- Quotes in `evidence_quote` must be verbatim from <source>. +- If you decline to extract, emit {"refused": true, "reason": "..."} inside <output>. +</rules> + +<examples> + <example> + <source>...short positive example...</source> + <output>{ "flags": { ... } }</output> + </example> + <example> + <source>...edge case with mostly nulls...</source> + <output>{ "flags": { ... } }</output> + </example> +</examples> + +<source> +{{PASTE SOURCE MATERIAL HERE}} +</source> + +Think privately in <scratchpad> if you need to. Discard it. +Then emit JSON only inside <output>…</output>. Nothing after </output>. 
+ +<output> +``` + +Pair with: validator strips one code fence, falls back to `<output>` extraction, rejects unknown fields, rejects forbidden fields (`verdict` etc.), and emits actionable error messages. + +## 9. Recommendation + +Adopt the skeleton in §8 as the `npm run plan` template. Ship `npm run validate` per §5 as a hard prerequisite for `report`. Treat the forbidden-field list as the canonical guard against verdict bleed; expand it whenever a real run produces a new offender. Revisit if/when we move from paste to API — at that point swap the prompt scaffold for native structured outputs and keep the validator as a defence-in-depth layer. + +## Sources + +- [Anthropic — Prompting best practices](https://platform.claude.com/docs/en/build-with-claude/prompt-engineering/claude-prompting-best-practices) +- [Anthropic — Use XML tags to structure prompts](https://platform.claude.com/docs/en/build-with-claude/prompt-engineering/use-xml-tags) +- [Anthropic — Structured outputs](https://platform.claude.com/docs/en/build-with-claude/structured-outputs) +- [Anthropic — Prefill Claude's response](https://platform.claude.com/docs/en/build-with-claude/prompt-engineering/prefill-claudes-response) +- [OpenAI — Structured outputs guide](https://developers.openai.com/api/docs/guides/structured-outputs) +- [Google — Gemini structured output](https://ai.google.dev/gemini-api/docs/structured-output) +- [arXiv 2309.11495 — Chain-of-Verification reduces hallucination](https://arxiv.org/abs/2309.11495) +- [Stop begging for JSON — Charlie Guo](https://www.ignorance.ai/p/stop-begging-for-json) +- [Mastering JSON Prompting for LLMs](https://machinelearningmastery.com/mastering-json-prompting-for-llms/) +- [Few-Shot Prompting: Everything You Need to Know in 2026](https://mem0.ai/blog/few-shot-prompting-guide) +- [Structured Output Prompting Guide (SurePrompts, 2026)](https://sureprompts.com/blog/structured-output-prompting-guide) +- [5 Ways LLMs Break JSON (And How to Fix 
Them)](https://medium.com/@mtdevworks2025/5-ways-llms-break-json-and-how-to-fix-them-f67fd8be5ba2) +- [Structured Output with Gemini Models (Google Cloud Community)](https://medium.com/google-cloud/structured-output-with-gemini-models-begging-borrowing-and-json-ing-f70ffd60eae6) +- [json_repair — last-resort recovery library](https://github.com/mangiucugna/json_repair) diff --git a/experiments/rule-engine-poc/rule-engine.config.json b/experiments/rule-engine-poc/rule-engine.config.json new file mode 100644 index 000000000..70f68a9a1 --- /dev/null +++ b/experiments/rule-engine-poc/rule-engine.config.json @@ -0,0 +1,24 @@ +{ + "rules": "rules/quality-gates.yaml", + "flagSchema": "rules/flag-schema.yaml", + "promptsDir": "prompts", + "extractionsDir": "extractions", + "reportsDir": "reports", + "openBrowser": true, + "targets": [ + { + "id": "astro-product-page", + "label": "Astro product page feature (specs/astro-product-page/)", + "paths": [ + "../../specs/astro-product-page" + ] + }, + { + "id": "shape-b-branching-adoption", + "label": "Shape B branching adoption feature (specs/shape-b-branching-adoption/)", + "paths": [ + "../../specs/shape-b-branching-adoption" + ] + } + ] +} diff --git a/experiments/rule-engine-poc/rules/flag-schema.yaml b/experiments/rule-engine-poc/rules/flag-schema.yaml new file mode 100644 index 000000000..a0d4b3f24 --- /dev/null +++ b/experiments/rule-engine-poc/rules/flag-schema.yaml @@ -0,0 +1,171 @@ +# Flag schema for the quality-gates rule set. +# +# Every flag referenced by rules/quality-gates.yaml must be documented +# here. The plan command bundles this schema into the LLM extraction +# prompt; the loader (TODO) warns when a rule references a flag not in +# the schema; the report command reports schema-coverage gaps. +# +# Format per flag: +# <flag_name>: +# type: boolean | number | string | string[] +# description: one sentence the LLM uses to decide what to set the flag to. +# example: a concrete sample value. 
+# stage: (optional) which Specorator stage this flag relates to. + +# --- Identity --------------------------------------------------------- + +feature_slug: + type: string + description: The feature folder slug under specs/. + example: "astro-product-page" + +current_stage: + type: string + description: Which Specorator lifecycle stage the feature is currently in. + example: "implementation" + allowed_values: + [idea, research, requirements, design, specification, tasks, + implementation, testing, review, release, retrospective] + +# --- Cross-cutting severity / blockers -------------------------------- + +s1_findings_count: + type: number + description: Count of open S1 (critical) findings — data loss, security breach, full outage, regulatory exposure. + example: 0 + +s2_findings_count: + type: number + description: Count of open S2 (high) findings — critical user flow broken with no acceptable workaround. + example: 1 + +s3_findings_count: + type: number + description: Count of open S3 (medium) findings — non-critical flow broken with workaround, or quality regression. + example: 3 + +open_clarifications_count: + type: number + description: Count of open clarifications surfaced by /spec:clarify or marked in artifacts that need a human answer before progressing. + example: 0 + +blockers_count: + type: number + description: Count of open blockers — items explicitly tagged as blocking the current stage. + example: 0 + +# --- Stage: Idea ------------------------------------------------------ + +idea_problem_statement_present: + type: boolean + stage: idea + description: Idea.md has a one-paragraph problem statement understandable to a non-expert. + example: true + +idea_target_users_named: + type: boolean + stage: idea + description: Idea.md names the target users. + example: true + +idea_scope_bounded: + type: boolean + stage: idea + description: Idea scope is bounded — no "boil the ocean" framing. 
+ example: true + +# --- Stage: Requirements (PRD) ---------------------------------------- + +requirements_have_stable_ids: + type: boolean + stage: requirements + description: Every functional requirement has a stable REQ-<AREA>-NNN id. + example: true + +requirements_ears_coverage: + type: number + stage: requirements + description: Fraction of functional requirements that use EARS notation (0.0 to 1.0). + example: 1.0 + +requirements_acceptance_criteria_testable: + type: boolean + stage: requirements + description: All acceptance criteria are written in a form a test can verify. + example: true + +# --- Stage: Design ---------------------------------------------------- + +design_irreversible_have_adrs: + type: boolean + stage: design + description: Every irreversible architectural decision has an ADR in docs/adr/. + example: true + +design_risks_have_mitigations: + type: boolean + stage: design + description: Each identified risk in design.md has a documented mitigation. + example: true + +# --- Stage: Specification --------------------------------------------- + +spec_each_item_traces_to_requirement: + type: boolean + stage: specification + description: Every spec item has an id and references at least one requirement id. + example: true + +# --- Stage: Implementation -------------------------------------------- + +implementation_lint_clean: + type: boolean + stage: implementation + description: Lint runs clean on the changed surface (no errors, warnings within project tolerance). + example: true + +implementation_types_clean: + type: boolean + stage: implementation + description: TypeScript / type checks pass with no errors on the changed surface. + example: true + +implementation_unit_tests_pass: + type: boolean + stage: implementation + description: Unit tests for the changed surface pass. 
+ example: true + +# --- Stage: Testing --------------------------------------------------- + +testing_ears_test_coverage: + type: number + stage: testing + description: Fraction of EARS clauses that have at least one corresponding test (0.0 to 1.0). + example: 1.0 + +testing_critical_paths_covered: + type: boolean + stage: testing + description: Critical paths (happy + key edge cases) are exercised by at least one test. + example: true + +# --- Stage: Review ---------------------------------------------------- + +review_traceability_complete: + type: boolean + stage: review + description: Traceability matrix is complete and consistent — every requirement chains to a finding or test. + example: true + +brand_review_required: + type: boolean + stage: review + description: The diff touches sites/, .claude/skills/specorator-design/, or any HTML/CSS/JSX producing user-visible UI. + example: false + +brand_review_passed: + type: boolean + stage: review + description: When brand_review_required is true, whether the brand-reviewer agent posted PASS. 
+ example: true diff --git a/experiments/rule-engine-poc/src/config.ts b/experiments/rule-engine-poc/src/config.ts new file mode 100644 index 000000000..b42d20353 --- /dev/null +++ b/experiments/rule-engine-poc/src/config.ts @@ -0,0 +1,123 @@ +import { readFileSync } from "node:fs"; +import { dirname, resolve, isAbsolute } from "node:path"; + +export interface Target { + id: string; + label: string; + paths: string[]; +} + +export interface RawConfig { + rules: string; + flagSchema: string; + promptsDir: string; + extractionsDir: string; + reportsDir: string; + openBrowser: boolean; + targets: Target[]; +} + +export interface ResolvedConfig extends RawConfig { + configFile: string; + configDir: string; + rulesPath: string; + flagSchemaPath: string; + promptsDirPath: string; + extractionsDirPath: string; + reportsDirPath: string; + targets: Target[]; +} + +export function loadConfig(configPath: string): ResolvedConfig { + const abs = resolve(configPath); + let raw: unknown; + try { + raw = JSON.parse(readFileSync(abs, "utf8")); + } catch (err) { + const msg = err instanceof Error ? err.message : String(err); + throw new Error(`Could not read or parse config file ${abs}: ${msg}`); + } + const config = validate(raw, abs); + const configDir = dirname(abs); + const r = (p: string): string => (isAbsolute(p) ? p : resolve(configDir, p)); + return { + ...config, + configFile: abs, + configDir, + rulesPath: r(config.rules), + flagSchemaPath: r(config.flagSchema), + promptsDirPath: r(config.promptsDir), + extractionsDirPath: r(config.extractionsDir), + reportsDirPath: r(config.reportsDir), + }; +} + +export function findTarget(config: ResolvedConfig, id: string): Target { + const t = config.targets.find((x) => x.id === id); + if (!t) { + const known = config.targets.map((x) => x.id).join(", ") || "(none)"; + throw new Error(`Target '${id}' not found in config. 
Known targets: ${known}`); + } + return t; +} + +function validate(raw: unknown, file: string): RawConfig { + if (!raw || typeof raw !== "object") { + throw new Error(`Config ${file} must be a JSON object`); + } + const r = raw as Record<string, unknown>; + for (const field of [ + "rules", + "flagSchema", + "promptsDir", + "extractionsDir", + "reportsDir", + ] as const) { + if (typeof r[field] !== "string" || (r[field] as string).length === 0) { + throw new Error(`Config ${file} missing required string '${field}'`); + } + } + if (typeof r.openBrowser !== "boolean") { + throw new Error(`Config ${file} missing required boolean 'openBrowser'`); + } + if (!Array.isArray(r.targets)) { + throw new Error(`Config ${file} missing required array 'targets'`); + } + const seen = new Set<string>(); + const targets: Target[] = (r.targets as unknown[]).map((t, i) => { + if (!t || typeof t !== "object") { + throw new Error(`Config ${file} target #${i} is not an object`); + } + const tr = t as Record<string, unknown>; + if (typeof tr.id !== "string" || tr.id.length === 0) { + throw new Error(`Config ${file} target #${i} missing string 'id'`); + } + if (seen.has(tr.id)) { + throw new Error(`Config ${file} has duplicate target id '${tr.id}'`); + } + seen.add(tr.id); + if (typeof tr.label !== "string" || tr.label.length === 0) { + throw new Error(`Config ${file} target '${tr.id}' missing string 'label'`); + } + if (!Array.isArray(tr.paths) || tr.paths.length === 0) { + throw new Error(`Config ${file} target '${tr.id}' missing non-empty 'paths' array`); + } + const paths = (tr.paths as unknown[]).map((p, j) => { + if (typeof p !== "string" || p.length === 0) { + throw new Error(`Config ${file} target '${tr.id}' path #${j} is not a string`); + } + return p; + }); + return { id: tr.id, label: tr.label, paths }; + }); + + return { + rules: r.rules as string, + flagSchema: r.flagSchema as string, + promptsDir: r.promptsDir as string, + extractionsDir: r.extractionsDir as string, + 
reportsDir: r.reportsDir as string, + openBrowser: r.openBrowser, + targets, + }; +} diff --git a/experiments/rule-engine-poc/src/context.ts b/experiments/rule-engine-poc/src/context.ts new file mode 100644 index 000000000..cf6b4d03e --- /dev/null +++ b/experiments/rule-engine-poc/src/context.ts @@ -0,0 +1,95 @@ +import { readdirSync, readFileSync, statSync } from "node:fs"; +import { extname, join, relative, resolve } from "node:path"; + +const DEFAULT_EXTENSIONS = new Set([ + ".md", + ".mdx", + ".yaml", + ".yml", + ".json", + ".txt", +]); + +const MAX_FILE_BYTES = 8 * 1024; +const TRUNCATION_NOTE = + "\n\n... (truncated — file exceeds 8 KB; the rule engine POC bundles the first 8 KB into the prompt)"; + +export interface CollectedFile { + absolutePath: string; + relativePath: string; + bytes: number; + truncated: boolean; + content: string; +} + +export interface CollectOptions { + baseDir: string; + extensions?: Set<string>; + maxBytes?: number; +} + +export function collectFiles( + paths: string[], + options: CollectOptions, +): CollectedFile[] { + const extensions = options.extensions ?? DEFAULT_EXTENSIONS; + const maxBytes = options.maxBytes ?? MAX_FILE_BYTES; + const collected: CollectedFile[] = []; + for (const raw of paths) { + const abs = resolve(options.baseDir, raw); + walk(abs, extensions, collected); + } + // Deterministic ordering by absolute path; relativePath is filled in by the map below. + collected.sort((a, b) => a.absolutePath.localeCompare(b.absolutePath)); + return collected.map((f) => { + const truncated = f.bytes > maxBytes;
+ // f.bytes and maxBytes count UTF-8 bytes, so cut on bytes, not UTF-16 code units. + const cut = (): string => Buffer.from(f.content, "utf8").subarray(0, maxBytes).toString("utf8"); + const content = truncated ? cut() + TRUNCATION_NOTE : f.content; + return { + ...f, + truncated, + content, + relativePath: relative(options.baseDir, f.absolutePath), + }; + }); +} + +function walk( + abs: string, + extensions: Set<string>, + out: CollectedFile[], +): void { + let stat; + try { + stat = statSync(abs); + } catch { + throw new Error(`Path does not exist: ${abs}`); + } + if (stat.isFile()) { + if (!extensions.has(extname(abs))) return; + addFile(abs, out); + return; + } + if (stat.isDirectory()) { + for (const entry of readdirSync(abs).sort()) { + // Skip hidden / node_modules / dist by default. + if (entry.startsWith(".") || entry === "node_modules" || entry === "dist") { + continue; + } + walk(join(abs, entry), extensions, out); + } + return; + } +} + +function addFile(abs: string, out: CollectedFile[]): void { + const content = readFileSync(abs, "utf8"); + out.push({ + absolutePath: abs, + relativePath: abs, + bytes: Buffer.byteLength(content, "utf8"), + truncated: false, + content, + }); +} diff --git a/experiments/rule-engine-poc/src/flag-schema.ts b/experiments/rule-engine-poc/src/flag-schema.ts new file mode 100644 index 000000000..0c8329484 --- /dev/null +++ b/experiments/rule-engine-poc/src/flag-schema.ts @@ -0,0 +1,80 @@ +import { readFileSync } from "node:fs"; +import { resolve } from "node:path"; +import yaml from "js-yaml"; + +export type FlagType = "boolean" | "number" | "string" | "string[]"; + +export interface FlagSchemaEntry { + type: FlagType; + description: string; + example: unknown; + stage?: string; + allowed_values?: unknown[]; +} + +export type FlagSchema = Record<string, FlagSchemaEntry>; + +const VALID_TYPES: readonly FlagType[] = [ + "boolean", + "number", + "string", + "string[]", +] as const; + +export function loadFlagSchema(filePath: string): FlagSchema { + const abs = resolve(filePath); + const raw = readFileSync(abs, "utf8"); + const parsed = yaml.load(raw); + if (!parsed || typeof parsed !== "object" || 
Array.isArray(parsed)) { + throw new Error(`Flag schema ${filePath} must be a YAML mapping`); + } + const out: FlagSchema = {}; + for (const [name, entry] of Object.entries(parsed)) { + if (!entry || typeof entry !== "object") { + throw new Error(`Flag '${name}' must be an object`); + } + const e = entry as Record<string, unknown>; + if (!VALID_TYPES.includes(e.type as FlagType)) { + throw new Error( + `Flag '${name}' has unknown type '${e.type}'. Expected one of: ${VALID_TYPES.join(", ")}`, + ); + } + if (typeof e.description !== "string" || e.description.length === 0) { + throw new Error(`Flag '${name}' missing description`); + } + if (!("example" in e)) { + throw new Error(`Flag '${name}' missing example`); + } + out[name] = { + type: e.type as FlagType, + description: e.description, + example: e.example, + stage: typeof e.stage === "string" ? e.stage : undefined, + allowed_values: Array.isArray(e.allowed_values) ? e.allowed_values : undefined, + }; + } + return out; +} + +export interface SchemaCoverage { + ruleFlags: string[]; + schemaFlags: string[]; + undocumented: string[]; + unused: string[]; +} + +export function diffSchemaCoverage( + schema: FlagSchema, + ruleFlags: Iterable<string>, +): SchemaCoverage { + const ruleSet = new Set(ruleFlags); + const schemaSet = new Set(Object.keys(schema)); + const undocumented = [...ruleSet].filter((f) => !schemaSet.has(f)).sort(); + const unused = [...schemaSet].filter((f) => !ruleSet.has(f)).sort(); + return { + ruleFlags: [...ruleSet].sort(), + schemaFlags: [...schemaSet].sort(), + undocumented, + unused, + }; +} diff --git a/experiments/rule-engine-poc/src/open-browser.ts b/experiments/rule-engine-poc/src/open-browser.ts new file mode 100644 index 000000000..03d5a74ae --- /dev/null +++ b/experiments/rule-engine-poc/src/open-browser.ts @@ -0,0 +1,29 @@ +import { spawn } from "node:child_process"; +import { platform } from "node:os"; + +// Best-effort: try to open `path` in the OS default browser. 
+// Never throws. Intended for local dev. Returns true when spawn() did not +// fail synchronously; a missing opener (ENOENT) arrives later on the +// swallowed "error" event, so true means "attempted", not "opened". +export function openInBrowser(path: string): boolean { + const os = platform(); + // "start" is a cmd.exe builtin, not an executable, so spawn("start", …) + // always fails with ENOENT on Windows; run it through cmd instead. The + // empty "" fills start's window-title slot so the path is not taken as it. + const cmd = os === "darwin" ? "open" : os === "win32" ? "cmd" : "xdg-open"; + const args = os === "win32" ? ["/c", "start", "", path] : [path]; + try { + const child = spawn(cmd, args, { + stdio: "ignore", + detached: true, + }); + child.on("error", () => { + // Swallow — common when the OS has no default browser handler + // (e.g., headless containers). + }); + child.unref(); + return true; + } catch { + return false; + } +} diff --git a/experiments/rule-engine-poc/src/plan.ts b/experiments/rule-engine-poc/src/plan.ts new file mode 100644 index 000000000..ad82f4947 --- /dev/null +++ b/experiments/rule-engine-poc/src/plan.ts @@ -0,0 +1,81 @@ +#!/usr/bin/env node +// `plan` CLI: read config, gather source content for each target, write +// one extraction prompt per target under config.promptsDir. The user +// pastes each prompt into an AI tool to get back structured JSON flags. + +import { mkdirSync, writeFileSync } from "node:fs"; +import { join, relative } from "node:path"; +import { loadConfig, findTarget } from "./config.js"; +import { collectFiles } from "./context.js"; +import { diffSchemaCoverage, loadFlagSchema } from "./flag-schema.js"; +import { loadRulesFromFile } from "./loader.js"; +import { buildExtractionPrompt } from "./prompt-builder.js"; +import type { Target } from "./config.js"; + +function takeOpt(argv: string[], flag: string): string | undefined { + const i = argv.indexOf(flag); + if (i === -1) return undefined; + const v = argv[i + 1]; + argv.splice(i, 2); + return v; +} + +const argv = process.argv.slice(2); +const configPath = takeOpt(argv, "--config") ?? 
"rule-engine.config.json"; +const onlyTarget = takeOpt(argv, "--target"); + +const config = loadConfig(configPath); +const rules = loadRulesFromFile(config.rulesPath); +const schema = loadFlagSchema(config.flagSchemaPath); + +// Collect every flag referenced by rules so we can warn on schema drift. +const ruleFlags = new Set<string>(); +for (const rule of rules) { + for (const group of ["all", "any", "not"] as const) { + for (const c of rule.when[group] ?? []) ruleFlags.add(c.flag); + } +} +const coverage = diffSchemaCoverage(schema, ruleFlags); +if (coverage.undocumented.length > 0) { + console.warn( + `[plan] WARNING: rules reference flags missing from schema: ${coverage.undocumented.join(", ")}`, + ); +} +if (coverage.unused.length > 0) { + console.warn( + `[plan] note: schema documents flags no rule references: ${coverage.unused.join(", ")}`, + ); +} + +mkdirSync(config.promptsDirPath, { recursive: true }); + +const targets: Target[] = onlyTarget + ? [findTarget(config, onlyTarget)] + : config.targets; + +for (const target of targets) { + const files = collectFiles(target.paths, { baseDir: config.configDir }); + if (files.length === 0) { + console.warn( + `[plan] target '${target.id}' resolved zero readable files from paths: ${target.paths.join(", ")}`, + ); + } + const prompt = buildExtractionPrompt({ + targetId: target.id, + targetLabel: target.label, + flagSchema: schema, + rules, + files, + }); + const outPath = join(config.promptsDirPath, `${target.id}.md`); + writeFileSync(outPath, prompt, "utf8"); + const totalBytes = files.reduce((sum, f) => sum + Buffer.byteLength(f.content, "utf8"), 0); + console.log( + `[plan] ${target.id}: ${files.length} files (${Math.round(totalBytes / 1024)} KB) -> ${relative(config.configDir, outPath)}`, + ); +} + +console.log(""); +console.log("Next step: open each prompt, paste it into an AI tool, save the JSON response to:"); +console.log(` ${relative(config.configDir, config.extractionsDirPath)}/<target-id>.json`); 
+console.log("Then run: npm run report"); diff --git a/experiments/rule-engine-poc/src/prompt-builder.ts b/experiments/rule-engine-poc/src/prompt-builder.ts new file mode 100644 index 000000000..235f158f2 --- /dev/null +++ b/experiments/rule-engine-poc/src/prompt-builder.ts @@ -0,0 +1,113 @@ +import type { CollectedFile } from "./context.js"; +import type { FlagSchema } from "./flag-schema.js"; +import type { LoadedRule } from "./types.js"; + +export interface BuildPromptInput { + targetId: string; + targetLabel: string; + flagSchema: FlagSchema; + rules: LoadedRule[]; + files: CollectedFile[]; +} + +// Forbidden field names (from research/10-extraction-prompt-patterns.md). +// Naming them explicitly inside the prompt is more reliable than hoping +// the model won't drift into verdict-shaped output. +const FORBIDDEN_FIELDS = [ + "verdict", + "assessment", + "conclusion", + "summary", + "recommendation", + "rationale", + "analysis", +]; + +export function buildExtractionPrompt(input: BuildPromptInput): string { + const { targetId, targetLabel, flagSchema, rules, files } = input; + + const schemaRows = Object.entries(flagSchema) + .sort(([a], [b]) => a.localeCompare(b)) + .map(([name, entry]) => { + const allowed = entry.allowed_values + ? ` Allowed: ${JSON.stringify(entry.allowed_values)}.` + : ""; + const stage = entry.stage ? ` (stage: ${entry.stage})` : ""; + return `- \`${name}\` (\`${entry.type}\`)${stage} — ${entry.description}${allowed} Example: \`${JSON.stringify(entry.example)}\`.`; + }) + .join("\n"); + + const responseTemplate = JSON.stringify( + Object.fromEntries( + Object.entries(flagSchema).map(([name, entry]) => [name, entry.example]), + ), + null, + 2, + ); + + const ruleSummary = rules + .map((r) => `- \`${r.id}\` (priority ${r.priority}) — ${r.description}`) + .join("\n"); + + const sourceBlocks = files + .map((f) => { + const lang = guessLang(f.relativePath); + return `### \`${f.relativePath}\`${f.truncated ? 
" _(truncated)_" : ""}\n\n\`\`\`${lang}\n${f.content}\n\`\`\``; + }) + .join("\n\n"); + + // The prompt uses XML tags as primary structure (Claude-friendly) with + // markdown headers as redundancy (GPT/Gemini-friendly). Opening an + // `<output>` tag at the end is a forcing function: the next token the + // model emits has to look like the start of valid JSON. + return `# Extraction request — ${targetLabel} + +<role> +You are an evidence extractor for a deterministic rule engine — not a fact-checker, not an assistant, not a reviewer. Your only job is to read the source material in \`<source>\` and emit structured flags in \`<output>\`. + +A separate engine consumes your flags and produces a verdict. If you add interpretation, you defeat the purpose: the verdict becomes unreproducible and the audit chain breaks. +</role> + +<rules> +1. Respond with **a single JSON object** inside \`<output>...</output>\`. Nothing before, nothing after. +2. **No prose, no markdown, no code fence**, no preamble like "Here is the JSON", no postamble like "Let me know if you need anything else". +3. **Do NOT include any of these fields:** ${FORBIDDEN_FIELDS.map((f) => `\`${f}\``).join(", ")}. They are verdict-shaped; the engine produces the verdict, not you. +4. **Omit any flag you cannot determine** from the source. Do not guess. A missing flag is correct behaviour — the engine surfaces it as "flag missing in extraction" and the rule that needed it does not fire. +5. Each flag value must match the declared type. Booleans are \`true\`/\`false\` not \`"true"\`/\`"false"\`. +6. If you genuinely cannot extract anything (source is empty or unrelated), respond with \`<output>{}</output>\`. 
+</rules> + +<flag_schema> +${schemaRows} +</flag_schema> + +<rules_consuming_these_flags note="for your context — do not reproduce in the output"> +${ruleSummary} +</rules_consuming_these_flags> + +<source target_id="${targetId}"> +${sourceBlocks} +</source> + +<response_template note="copy the structure, edit values to match the source, REMOVE keys you cannot determine"> +\`\`\` +${responseTemplate} +\`\`\` +</response_template> + +<after_you_respond> +The user will save your JSON to \`extractions/${targetId}.json\` and run \`npm run report\`. The rule engine deterministically maps your flags to a verdict tier (\`blocked\` / \`needs-attention\` / \`ready-to-progress\` / \`unknown\`) and emits an HTML report. +</after_you_respond> + +<output> +`; +} + +function guessLang(path: string): string { + if (path.endsWith(".md") || path.endsWith(".mdx")) return "markdown"; + if (path.endsWith(".yaml") || path.endsWith(".yml")) return "yaml"; + if (path.endsWith(".json")) return "json"; + if (path.endsWith(".ts") || path.endsWith(".tsx")) return "typescript"; + if (path.endsWith(".js") || path.endsWith(".mjs")) return "javascript"; + return "text"; +} diff --git a/experiments/rule-engine-poc/src/report.ts b/experiments/rule-engine-poc/src/report.ts new file mode 100644 index 000000000..358140ba9 --- /dev/null +++ b/experiments/rule-engine-poc/src/report.ts @@ -0,0 +1,117 @@ +#!/usr/bin/env node +// `report` CLI: read config + rules + per-target extractions, run the +// engine, render HTML, optionally open in the browser. Per-target HTML +// is written to config.reportsDir. 
+ +import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs"; +import { join, relative, resolve } from "node:path"; +import { loadConfig, findTarget } from "./config.js"; +import { evaluate } from "./engine.js"; +import { loadFlagSchema } from "./flag-schema.js"; +import { loadRulesFromFile } from "./loader.js"; +import { renderHtmlReport } from "./html-report.js"; +import { openInBrowser } from "./open-browser.js"; +import type { ExtractionFlags, Verdict } from "./types.js"; +import type { Target } from "./config.js"; + +function takeOpt(argv: string[], flag: string): string | undefined { + const i = argv.indexOf(flag); + if (i === -1) return undefined; + const v = argv[i + 1]; + argv.splice(i, 2); + return v; +} +function takeFlag(argv: string[], flag: string): boolean { + const i = argv.indexOf(flag); + if (i === -1) return false; + argv.splice(i, 1); + return true; +} + +const argv = process.argv.slice(2); +const configPath = takeOpt(argv, "--config") ?? "rule-engine.config.json"; +const onlyTarget = takeOpt(argv, "--target"); +const noOpen = takeFlag(argv, "--no-open"); +const quiet = takeFlag(argv, "--quiet"); + +const config = loadConfig(configPath); +const rules = loadRulesFromFile(config.rulesPath); +loadFlagSchema(config.flagSchemaPath); // validate-only here; warnings live in `plan`. + +mkdirSync(config.reportsDirPath, { recursive: true }); + +const targets: Target[] = onlyTarget + ? 
[findTarget(config, onlyTarget)] + : config.targets; + +let worstExitCode = 0; +const summary: { target: Target; verdict: Verdict; reportPath: string }[] = []; + +for (const target of targets) { + const extractionPath = join(config.extractionsDirPath, `${target.id}.json`); + if (!existsSync(extractionPath)) { + console.error( + `[report] missing extraction for '${target.id}': ${relative(config.configDir, extractionPath)}`, + ); + console.error( + ` run: npm run plan -- --target ${target.id} (paste the prompt into your AI tool, save the JSON here)`, + ); + worstExitCode = Math.max(worstExitCode, 2); + continue; + } + + let flags: ExtractionFlags; + try { + flags = JSON.parse(readFileSync(extractionPath, "utf8")); + } catch (err) { + const msg = err instanceof Error ? err.message : String(err); + console.error(`[report] invalid JSON in ${extractionPath}: ${msg}`); + worstExitCode = Math.max(worstExitCode, 2); + continue; + } + + const result = evaluate(rules, flags); + const html = renderHtmlReport(result, { + rulesPath: relative(config.configDir, config.rulesPath), + flagsPath: relative(config.configDir, extractionPath), + flags, + generatedAt: new Date().toISOString(), + }); + const reportPath = join(config.reportsDirPath, `${target.id}.html`); + writeFileSync(reportPath, html, "utf8"); + summary.push({ target, verdict: result.verdict, reportPath }); + + if (!quiet) { + console.log( + `[report] ${target.id}: ${result.verdict.toUpperCase()} -> ${relative(config.configDir, reportPath)}`, + ); + } + if (result.verdict === "blocked") { + worstExitCode = Math.max(worstExitCode, 1); + } +} + +if (!quiet && summary.length > 0) { + console.log(""); + console.log("Summary:"); + for (const s of summary) { + console.log(` ${s.verdict.padEnd(20)} ${s.target.label}`); + } +} + +// Best-effort browser open. Only opens the first report when multiple +// targets are present; rest are linked from console paths. 
+if (config.openBrowser && !noOpen && summary.length > 0) { + const first = resolve(summary[0]!.reportPath); + const ok = openInBrowser(first); + if (!quiet) { + console.log(""); + console.log( + ok + ? `[report] opened ${relative(config.configDir, first)} in default browser (best-effort)` + : `[report] could not spawn a browser; open manually: file://${first}`, + ); + } +} + +process.exitCode = worstExitCode; diff --git a/experiments/rule-engine-poc/test/config.test.ts b/experiments/rule-engine-poc/test/config.test.ts new file mode 100644 index 000000000..d71c73104 --- /dev/null +++ b/experiments/rule-engine-poc/test/config.test.ts @@ -0,0 +1,72 @@ +import { mkdtempSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { describe, expect, it } from "vitest"; +import { findTarget, loadConfig } from "../src/config.js"; + +function writeTempConfig(body: unknown): string { + const dir = mkdtempSync(join(tmpdir(), "rep-poc-cfg-")); + const file = join(dir, "rule-engine.config.json"); + writeFileSync(file, JSON.stringify(body), "utf8"); + return file; +} + +const validBody = { + rules: "rules/quality-gates.yaml", + flagSchema: "rules/flag-schema.yaml", + promptsDir: "prompts", + extractionsDir: "extractions", + reportsDir: "reports", + openBrowser: true, + targets: [ + { id: "alpha", label: "Alpha", paths: ["alpha"] }, + { id: "beta", label: "Beta", paths: ["beta", "shared"] }, + ], +}; + +describe("loadConfig", () => { + it("resolves all paths relative to the config file", () => { + const file = writeTempConfig(validBody); + const cfg = loadConfig(file); + expect(cfg.rulesPath.endsWith("rules/quality-gates.yaml")).toBe(true); + expect(cfg.flagSchemaPath.endsWith("rules/flag-schema.yaml")).toBe(true); + expect(cfg.targets).toHaveLength(2); + }); + + it("rejects duplicate target ids", () => { + const file = writeTempConfig({ + ...validBody, + targets: [ + { id: "alpha", label: "A", paths: ["a"] }, + { id: 
"alpha", label: "B", paths: ["b"] }, + ], + }); + expect(() => loadConfig(file)).toThrow(/duplicate target id 'alpha'/); + }); + + it("rejects targets with empty paths", () => { + const file = writeTempConfig({ + ...validBody, + targets: [{ id: "alpha", label: "A", paths: [] }], + }); + expect(() => loadConfig(file)).toThrow(/non-empty 'paths' array/); + }); + + it("rejects missing openBrowser", () => { + const { openBrowser: _, ...rest } = validBody; + const file = writeTempConfig(rest); + expect(() => loadConfig(file)).toThrow(/missing required boolean 'openBrowser'/); + }); +}); + +describe("findTarget", () => { + it("returns the named target", () => { + const cfg = loadConfig(writeTempConfig(validBody)); + expect(findTarget(cfg, "beta").label).toBe("Beta"); + }); + + it("throws helpfully on unknown id", () => { + const cfg = loadConfig(writeTempConfig(validBody)); + expect(() => findTarget(cfg, "gamma")).toThrow(/Known targets: alpha, beta/); + }); +}); diff --git a/experiments/rule-engine-poc/test/context.test.ts b/experiments/rule-engine-poc/test/context.test.ts new file mode 100644 index 000000000..c83ae5abd --- /dev/null +++ b/experiments/rule-engine-poc/test/context.test.ts @@ -0,0 +1,48 @@ +import { describe, expect, it } from "vitest"; +import { mkdirSync, mkdtempSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { collectFiles } from "../src/context.js"; + +function setupFixture(): string { + const dir = mkdtempSync(join(tmpdir(), "rep-poc-ctx-")); + writeFileSync(join(dir, "a.md"), "# A\n", "utf8"); + writeFileSync(join(dir, "b.md"), "# B\n", "utf8"); + writeFileSync(join(dir, "ignore.bin"), "binary data", "utf8"); + mkdirSync(join(dir, "sub")); + writeFileSync(join(dir, "sub", "c.md"), "# C\n", "utf8"); + mkdirSync(join(dir, "node_modules")); + writeFileSync(join(dir, "node_modules", "should-skip.md"), "x", "utf8"); + return dir; +} + +describe("collectFiles", () => { + it("walks 
directories and skips non-allowed extensions", () => { + const dir = setupFixture(); + const files = collectFiles([dir], { baseDir: dir }); + const names = files.map((f) => f.relativePath).sort(); + expect(names).toEqual(["a.md", "b.md", "sub/c.md"]); + }); + + it("emits files in deterministic order regardless of input order", () => { + const dir = setupFixture(); + const a = collectFiles([join(dir, "b.md"), join(dir, "a.md")], { baseDir: dir }); + const b = collectFiles([join(dir, "a.md"), join(dir, "b.md")], { baseDir: dir }); + expect(a.map((f) => f.absolutePath)).toEqual(b.map((f) => f.absolutePath)); + }); + + it("truncates files larger than maxBytes", () => { + const dir = mkdtempSync(join(tmpdir(), "rep-poc-trunc-")); + writeFileSync(join(dir, "big.md"), "x".repeat(10_000), "utf8"); + const files = collectFiles([dir], { baseDir: dir, maxBytes: 100 }); + expect(files[0]!.truncated).toBe(true); + expect(files[0]!.content.length).toBeGreaterThan(100); + expect(files[0]!.content.length).toBeLessThan(10_000); + }); + + it("throws helpfully for missing paths", () => { + expect(() => + collectFiles(["/does/not/exist"], { baseDir: "/" }), + ).toThrow(/Path does not exist/); + }); +}); diff --git a/experiments/rule-engine-poc/test/flag-schema.test.ts b/experiments/rule-engine-poc/test/flag-schema.test.ts new file mode 100644 index 000000000..246523b11 --- /dev/null +++ b/experiments/rule-engine-poc/test/flag-schema.test.ts @@ -0,0 +1,70 @@ +import { describe, expect, it } from "vitest"; +import { diffSchemaCoverage, loadFlagSchema } from "../src/flag-schema.js"; +import { mkdtempSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; + +function writeYaml(body: string): string { + const dir = mkdtempSync(join(tmpdir(), "rep-poc-schema-")); + const file = join(dir, "schema.yaml"); + writeFileSync(file, body, "utf8"); + return file; +} + +describe("loadFlagSchema", () => { + it("loads a valid schema", () => { + const 
file = writeYaml(` +foo: + type: boolean + description: foo flag + example: true +bar: + type: number + description: bar flag + example: 42 +`); + const schema = loadFlagSchema(file); + expect(schema.foo!.type).toBe("boolean"); + expect(schema.bar!.example).toBe(42); + }); + + it("rejects unknown types", () => { + const file = writeYaml(` +foo: + type: float + description: foo + example: 1.0 +`); + expect(() => loadFlagSchema(file)).toThrow(/unknown type 'float'/); + }); + + it("rejects missing description", () => { + const file = writeYaml(` +foo: + type: boolean + example: true +`); + expect(() => loadFlagSchema(file)).toThrow(/missing description/); + }); + + it("rejects missing example", () => { + const file = writeYaml(` +foo: + type: boolean + description: foo +`); + expect(() => loadFlagSchema(file)).toThrow(/missing example/); + }); +}); + +describe("diffSchemaCoverage", () => { + it("identifies undocumented and unused flags", () => { + const schema = { + a: { type: "boolean" as const, description: "a", example: true }, + b: { type: "boolean" as const, description: "b", example: true }, + }; + const diff = diffSchemaCoverage(schema, ["a", "c"]); + expect(diff.undocumented).toEqual(["c"]); + expect(diff.unused).toEqual(["b"]); + }); +}); diff --git a/experiments/rule-engine-poc/test/prompt-builder.test.ts b/experiments/rule-engine-poc/test/prompt-builder.test.ts new file mode 100644 index 000000000..41fd325e4 --- /dev/null +++ b/experiments/rule-engine-poc/test/prompt-builder.test.ts @@ -0,0 +1,80 @@ +import { describe, expect, it } from "vitest"; +import { buildExtractionPrompt } from "../src/prompt-builder.js"; +import { loadRulesFromString } from "../src/loader.js"; +import type { FlagSchema } from "../src/flag-schema.js"; + +const schema: FlagSchema = { + ci_passing: { + type: "boolean", + description: "CI is green", + example: true, + }, +}; + +const rules = loadRulesFromString( + ` +- id: gate + description: blocks when ci fails + priority: 10 + when: 
+ all: + - flag: ci_passing + eq: false + then: + verdict: blocked + weight: 100 + actions: [kick-ci] +`, + "prompt-builder-test", +); + +const baseInput = { + targetId: "alpha", + targetLabel: "Alpha target", + flagSchema: schema, + rules, + files: [ + { + absolutePath: "/abs/alpha/README.md", + relativePath: "alpha/README.md", + bytes: 10, + truncated: false, + content: "Hello world\n", + }, + ], +}; + +describe("buildExtractionPrompt", () => { + it("includes the role, schema, and forbidden-fields list", () => { + const prompt = buildExtractionPrompt(baseInput); + expect(prompt).toContain("evidence extractor"); + expect(prompt).toContain("Do NOT include any of these fields"); + expect(prompt).toContain("`verdict`"); + expect(prompt).toContain("`assessment`"); + expect(prompt).toContain("`conclusion`"); + }); + + it("ends with an open <output> tag as a forcing function", () => { + const prompt = buildExtractionPrompt(baseInput); + expect(prompt.trim().endsWith("<output>")).toBe(true); + }); + + it("inlines each file's content in its own fenced block", () => { + const prompt = buildExtractionPrompt(baseInput); + expect(prompt).toContain("alpha/README.md"); + expect(prompt).toContain("Hello world"); + }); + + it("renders the response template using each flag's example value", () => { + const prompt = buildExtractionPrompt(baseInput); + expect(prompt).toContain('"ci_passing": true'); + }); + + it("marks truncated files in the heading", () => { + const prompt = buildExtractionPrompt({ + ...baseInput, + files: [{ ...baseInput.files[0]!, truncated: true }], + }); + expect(prompt).toContain("_(truncated)_"); + }); +}); From 4e0d0cef09706368e2610b9a9fc46fca9f1fac39 Mon Sep 17 00:00:00 2001 From: Claude <noreply@anthropic.com> Date: Sun, 17 May 2026 12:05:19 +0000 Subject: [PATCH 07/45] fix(rule-engine-poc): honest browser-open + stricter operator types (Codex round 5) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - open-browser: 
openInBrowser is now async and waits briefly for the spawn or error event before resolving. Previously it returned true immediately, so report.ts printed "opened in browser" even when xdg-open was missing in a headless container — misleading users during the primary plan->report flow (#525 round 5 P2). - report.ts: awaits openInBrowser and prints the correct status line. Verified in this sandbox: now prints "could not spawn a browser; open manually: file://..." when no browser is installed. - loader: 'exists' operator must be a boolean. Previously a typo like 'exists: "false"' would load and then silently never match because evaluateCondition compares boolean to string (#525 round 5 P2). - loader: 'gt' and 'lt' operators must be numbers at load time, for consistency with the other type checks (engine already failed matching at runtime, but failing at load is preferred). - Two new loader tests cover the exists-boolean and gt-number paths. --- experiments/rule-engine-poc/src/loader.ts | 12 ++++ .../rule-engine-poc/src/open-browser.ts | 65 ++++++++++++------- experiments/rule-engine-poc/src/report.ts | 2 +- .../rule-engine-poc/test/loader.test.ts | 42 ++++++++++++ 4 files changed, 96 insertions(+), 25 deletions(-) diff --git a/experiments/rule-engine-poc/src/loader.ts b/experiments/rule-engine-poc/src/loader.ts index 0c0754614..5f645a536 100644 --- a/experiments/rule-engine-poc/src/loader.ts +++ b/experiments/rule-engine-poc/src/loader.ts @@ -159,6 +159,18 @@ function validateConditions( `Rule '${ruleId}' ${where} has non-array 'in' (got ${typeof cond.in})`, ); } + if ("exists" in cond && typeof cond.exists !== "boolean") { + throw new Error( + `Rule '${ruleId}' ${where} has non-boolean 'exists' (got ${typeof cond.exists}) — must be true or false`, + ); + } + for (const numOp of ["gt", "lt"] as const) { + if (numOp in cond && typeof cond[numOp] !== "number") { + throw new Error( + `Rule '${ruleId}' ${where} has non-number '${numOp}' (got ${typeof cond[numOp]})`, + ); 
+ } + } if (typeof cond.regex === "string") { try { new RegExp(cond.regex); diff --git a/experiments/rule-engine-poc/src/open-browser.ts b/experiments/rule-engine-poc/src/open-browser.ts index 03d5a74ae..cb27f4d80 100644 --- a/experiments/rule-engine-poc/src/open-browser.ts +++ b/experiments/rule-engine-poc/src/open-browser.ts @@ -2,28 +2,45 @@ import { spawn } from "node:child_process"; import { platform } from "node:os"; // Best-effort: try to open `path` in the OS default browser. -// Never throws; returns true if a child was spawned, false otherwise. -// Intended for local dev. In containers / CI the spawn typically fails -// silently and the caller falls back to printing the file path. -export function openInBrowser(path: string): boolean { - const cmd = - platform() === "darwin" - ? "open" - : platform() === "win32" - ? "start" - : "xdg-open"; - try { - const child = spawn(cmd, [path], { - stdio: "ignore", - detached: true, - }); - child.on("error", () => { - // Swallow — common when the OS has no default browser handler - // (e.g., headless containers). - }); - child.unref(); - return true; - } catch { - return false; - } +// Returns a Promise<boolean> that resolves true if the child process +// successfully spawned, false otherwise (no xdg-open installed, +// headless container, etc.). +// +// We wait briefly for either the `spawn` success event or the `error` +// event to fire before resolving, so the caller's status message +// reflects reality. A short timeout caps the worst case. +export function openInBrowser(path: string): Promise<boolean> { + return new Promise((resolve) => { + let settled = false; + const finish = (ok: boolean): void => { + if (settled) return; + settled = true; + resolve(ok); + }; + + const cmd = + platform() === "darwin" + ? "open" + : platform() === "win32" + ? 
"start" + : "xdg-open"; + + try { + const child = spawn(cmd, [path], { + stdio: "ignore", + detached: true, + }); + child.on("error", () => finish(false)); + child.on("spawn", () => { + child.unref(); + finish(true); + }); + // Safety net: if neither event fires within 250 ms (some platforms + // are stingy with the `spawn` event), assume success — the child + // has been forked, we just don't have positive confirmation. + setTimeout(() => finish(true), 250); + } catch { + finish(false); + } + }); } diff --git a/experiments/rule-engine-poc/src/report.ts b/experiments/rule-engine-poc/src/report.ts index 358140ba9..f255193c8 100644 --- a/experiments/rule-engine-poc/src/report.ts +++ b/experiments/rule-engine-poc/src/report.ts @@ -103,7 +103,7 @@ if (!quiet && summary.length > 0) { // targets are present; rest are linked from console paths. if (config.openBrowser && !noOpen && summary.length > 0) { const first = resolve(summary[0]!.reportPath); - const ok = openInBrowser(first); + const ok = await openInBrowser(first); if (!quiet) { console.log(""); console.log( diff --git a/experiments/rule-engine-poc/test/loader.test.ts b/experiments/rule-engine-poc/test/loader.test.ts index 2b94b1a34..16c4c64c4 100644 --- a/experiments/rule-engine-poc/test/loader.test.ts +++ b/experiments/rule-engine-poc/test/loader.test.ts @@ -263,6 +263,48 @@ describe("loader", () => { ).toThrow(/invalid 'then\.weight'/); }); + it("rejects conditions with non-boolean 'exists'", () => { + expect(() => + loadRulesFromString( + ` +- id: r1 + priority: 1 + description: x + when: + all: + - flag: ghost + exists: "false" + then: + verdict: blocked + weight: 1 + actions: [] +`, + "exists-non-boolean", + ), + ).toThrow(/non-boolean 'exists'/); + }); + + it("rejects conditions with non-number 'gt'", () => { + expect(() => + loadRulesFromString( + ` +- id: r1 + priority: 1 + description: x + when: + all: + - flag: count + gt: "7" + then: + verdict: blocked + weight: 1 + actions: [] +`, + "gt-non-number", + 
), + ).toThrow(/non-number 'gt'/); + }); + it("rejects conditions with non-array 'in'", () => { expect(() => loadRulesFromString( From 0aaa114eb631a203b702a9950531aa02542d87a9 Mon Sep 17 00:00:00 2001 From: Claude <noreply@anthropic.com> Date: Sun, 17 May 2026 12:11:12 +0000 Subject: [PATCH 08/45] fix(rule-engine-poc): three platform/format hardening fixes (Codex round 6) - open-browser: spawn cmd /c start "" on Windows. 'start' is a cmd.exe built-in, not a standalone exe, so spawn('start', ...) raised ENOENT and browser open always failed for Windows users (#525 round 6 P2). - report: validate the parsed extraction is a plain object before passing to evaluate. Previously valid JSON like null / [] / "text" would crash inside hasOwnProperty.call on null instead of producing a controlled error (#525 round 6 P2). - prompt-builder: pick a fence length longer than any backtick run in the source content. Many repo markdown files contain ``` blocks which would prematurely close the prompt's outer fence and corrupt the AI extraction prompt (#525 round 6 P2). - New pickFence helper is exported and unit-tested; prompt-builder test asserts a 5-tick fence is emitted for content with a 4-tick run. --- .../rule-engine-poc/src/open-browser.ts | 14 +++++----- .../rule-engine-poc/src/prompt-builder.ts | 17 +++++++++++- experiments/rule-engine-poc/src/report.ts | 10 ++++++- .../test/prompt-builder.test.ts | 26 ++++++++++++++++++- 4 files changed, 57 insertions(+), 10 deletions(-) diff --git a/experiments/rule-engine-poc/src/open-browser.ts b/experiments/rule-engine-poc/src/open-browser.ts index cb27f4d80..7032974ba 100644 --- a/experiments/rule-engine-poc/src/open-browser.ts +++ b/experiments/rule-engine-poc/src/open-browser.ts @@ -18,15 +18,15 @@ export function openInBrowser(path: string): Promise<boolean> { resolve(ok); }; - const cmd = - platform() === "darwin" - ? "open" - : platform() === "win32" - ? 
"start" - : "xdg-open"; + const isWin = platform() === "win32"; + const cmd = isWin ? "cmd" : platform() === "darwin" ? "open" : "xdg-open"; + // On Windows, `start` is a cmd.exe built-in, not a standalone exe; + // spawning it directly raises ENOENT. The empty "" is the window + // title — without it, cmd interprets the first quoted arg as the title. + const args = isWin ? ["/c", "start", "", path] : [path]; try { - const child = spawn(cmd, [path], { + const child = spawn(cmd, args, { stdio: "ignore", detached: true, }); diff --git a/experiments/rule-engine-poc/src/prompt-builder.ts b/experiments/rule-engine-poc/src/prompt-builder.ts index 235f158f2..e811cce8c 100644 --- a/experiments/rule-engine-poc/src/prompt-builder.ts +++ b/experiments/rule-engine-poc/src/prompt-builder.ts @@ -52,7 +52,10 @@ export function buildExtractionPrompt(input: BuildPromptInput): string { const sourceBlocks = files .map((f) => { const lang = guessLang(f.relativePath); - return `### \`${f.relativePath}\`${f.truncated ? " _(truncated)_" : ""}\n\n\`\`\`${lang}\n${f.content}\n\`\`\``; + // Use a fence longer than any backtick run inside the file so the + // file's own ``` blocks don't prematurely close the prompt fence. + const fence = pickFence(f.content); + return `### \`${f.relativePath}\`${f.truncated ? " _(truncated)_" : ""}\n\n${fence}${lang}\n${f.content}\n${fence}`; }) .join("\n\n"); @@ -111,3 +114,15 @@ function guessLang(path: string): string { if (path.endsWith(".js") || path.endsWith(".mjs")) return "javascript"; return "text"; } + +// Pick a backtick fence longer than any run in the content. Markdown +// allows fences of length >= 3 and requires the closing fence to match +// the opening fence length, so a fence longer than any inner run +// guarantees the source block's own fences don't close ours. 
+export function pickFence(content: string): string { + let max = 2; + for (const m of content.matchAll(/`+/g)) { + if (m[0].length > max) max = m[0].length; + } + return "`".repeat(max + 1); +} diff --git a/experiments/rule-engine-poc/src/report.ts b/experiments/rule-engine-poc/src/report.ts index f255193c8..9ebf379f8 100644 --- a/experiments/rule-engine-poc/src/report.ts +++ b/experiments/rule-engine-poc/src/report.ts @@ -62,7 +62,15 @@ for (const target of targets) { let flags: ExtractionFlags; try { - flags = JSON.parse(readFileSync(extractionPath, "utf8")); + const parsed: unknown = JSON.parse(readFileSync(extractionPath, "utf8")); + if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) { + console.error( + `[report] extraction ${extractionPath} must be a JSON object (got ${parsed === null ? "null" : Array.isArray(parsed) ? "array" : typeof parsed})`, + ); + worstExitCode = Math.max(worstExitCode, 2); + continue; + } + flags = parsed as ExtractionFlags; } catch (err) { const msg = err instanceof Error ? 
err.message : String(err); console.error(`[report] invalid JSON in ${extractionPath}: ${msg}`); diff --git a/experiments/rule-engine-poc/test/prompt-builder.test.ts b/experiments/rule-engine-poc/test/prompt-builder.test.ts index 41fd325e4..a0b4d196d 100644 --- a/experiments/rule-engine-poc/test/prompt-builder.test.ts +++ b/experiments/rule-engine-poc/test/prompt-builder.test.ts @@ -1,5 +1,5 @@ import { describe, expect, it } from "vitest"; -import { buildExtractionPrompt } from "../src/prompt-builder.js"; +import { buildExtractionPrompt, pickFence } from "../src/prompt-builder.js"; import { loadRulesFromString } from "../src/loader.js"; import type { FlagSchema } from "../src/flag-schema.js"; @@ -77,4 +77,28 @@ describe("buildExtractionPrompt", () => { }); expect(prompt).toContain("_(truncated)_"); }); + + it("uses a fence longer than any backtick run inside the source", () => { + const filey = { + absolutePath: "/abs/x.md", + relativePath: "x.md", + bytes: 0, + truncated: false, + content: "regular ``` triple\nand a ```` quad fence", + }; + const prompt = buildExtractionPrompt({ ...baseInput, files: [filey] }); + // Fence used must be >= 5 backticks to outrun the 4-tick run in content. 
+ expect(prompt).toContain("`````markdown"); + }); +}); + +describe("pickFence", () => { + it("returns 3 backticks for content with no fences", () => { + expect(pickFence("plain text")).toBe("```"); + }); + + it("returns one more than the longest backtick run", () => { + expect(pickFence("a ``` b")).toBe("````"); + expect(pickFence("a ``` b ```` c")).toBe("`````"); + }); }); From 3dc103452a168db424d801ffc2bfafb6d3e98323 Mon Sep 17 00:00:00 2001 From: Claude <noreply@anthropic.com> Date: Sun, 17 May 2026 12:14:42 +0000 Subject: [PATCH 09/45] feat(rule-engine-poc): validate gate between plan and report MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes the schema-miss laundering failure mode flagged by the critic (research/07) and analyst (research/10): bad LLM output now fails loudly instead of becoming a reproducible-looking verdict. What's new: - src/validate.ts: validateExtraction(flags, schema, options) returns errors + warnings. Checks: forbidden fields (verdict, assessment, conclusion, summary, recommendation, rationale, analysis), unknown fields (warning), type mismatches (boolean/number/string/string[]), non-finite numbers, disallowed_values violations, prompt-hash mismatch (when expectedPromptHash provided — wired up in the next commit). - src/validate-cli.ts: 'npm run validate' surfaces issues per target, exits 0 (clean) / 1 (errors) / 2 (missing/unparseable extraction). - src/report.ts: validates each extraction before evaluating. Refuses to render when validation fails. --skip-validate flag for escape. - prompt-builder imports FORBIDDEN_FIELDS from validate.ts so the forbidden list lives in exactly one place. - 12 new validate tests; suite total 77 passing. Verified end-to-end: a polluted extraction with verdict+type-mismatch +unknown-flag is caught by both validate and report. 
--- experiments/rule-engine-poc/package.json | 1 + .../11-rule-engine-adoption-revisit.md | 111 +++++++++++++ .../rule-engine-poc/src/prompt-builder.ts | 14 +- experiments/rule-engine-poc/src/report.ts | 27 ++- .../rule-engine-poc/src/validate-cli.ts | 89 ++++++++++ experiments/rule-engine-poc/src/validate.ts | 157 ++++++++++++++++++ .../rule-engine-poc/test/validate.test.ts | 118 +++++++++++++ 7 files changed, 503 insertions(+), 14 deletions(-) create mode 100644 experiments/rule-engine-poc/research/11-rule-engine-adoption-revisit.md create mode 100644 experiments/rule-engine-poc/src/validate-cli.ts create mode 100644 experiments/rule-engine-poc/src/validate.ts create mode 100644 experiments/rule-engine-poc/test/validate.test.ts diff --git a/experiments/rule-engine-poc/package.json b/experiments/rule-engine-poc/package.json index 8266ed1c5..67176ba99 100644 --- a/experiments/rule-engine-poc/package.json +++ b/experiments/rule-engine-poc/package.json @@ -7,6 +7,7 @@ "scripts": { "build": "tsc -p .", "plan": "tsx src/plan.ts", + "validate": "tsx src/validate-cli.ts", "report": "tsx src/report.ts", "demo": "tsx src/cli.ts rules/quality-gates.yaml fixtures/ready-implementation.json", "demo:blocked-ears": "tsx src/cli.ts rules/quality-gates.yaml fixtures/blocked-missing-ears.json", diff --git a/experiments/rule-engine-poc/research/11-rule-engine-adoption-revisit.md b/experiments/rule-engine-poc/research/11-rule-engine-adoption-revisit.md new file mode 100644 index 000000000..68642ec8f --- /dev/null +++ b/experiments/rule-engine-poc/research/11-rule-engine-adoption-revisit.md @@ -0,0 +1,111 @@ +--- +title: Rule-engine adoption — re-evaluation against the built POC +folder: experiments/rule-engine-poc/research +description: Second-pass adoption review. 
Compares json-rules-engine, GoRules ZEN/JDM, OPA/Rego, durable-rules, json-logic, nools, and small TS-native engines against the six constraints the POC has now hard-coded (determinism, severity-first verdict, per-rule content hashes, full audit trail, YAML DSL, sandbox scope). +entry_point: false +--- + +# 11 — Rule-engine adoption: re-evaluation against the built POC + +## Recommendation (lead) + +**Stay hand-rolled.** Keep the engine in `experiments/rule-engine-poc/src/engine.ts` as the canonical implementation. Do **not** adopt `json-rules-engine`, ZEN/JDM, OPA/Rego, or any other library at this time. The first-pass research (`01-technical-landscape.md`) recommended `json-rules-engine` behind a thin adapter *before the POC was built*. Having now produced the concrete shape — severity-first verdict, per-condition audit trail with `observed`/`reason`, content-hashed rules, canonical-JSON `rulesetHash`/`flagsHash`, weight-gate at load time, YAML loader with operator-shape validation — every candidate scores worse than what we have against the six locked-in constraints, and every adoption path adds lines on net rather than removing them. + +The first-pass call was reasonable for an empty repo. With ~470 lines of engine + loader + types in place, the build-vs-buy break-even point has already been crossed. + +## Constraint-by-constraint scoring + +Constraints recap (from 6 rounds of Codex review): **C1** determinism (byte-identical replay), **C2** severity-first verdict tiers, **C3** per-rule content-hash provenance, **C4** full per-condition audit trail, **C5** YAML DSL with `all`/`any`/`not` + 7 operators, **C6** sandbox scope (no operational weight leaking into the main repo). + +| Candidate | C1 Determinism | C2 Severity-first | C3 Content hash | C4 Audit trail | C5 YAML DSL | C6 Sandbox-friendly | Maint. 
signal | +|---|---|---|---|---|---|---|---| +| **Our POC (`engine.ts`)** | Documented + enforced (no `Date.now`, no `Math.random`, sorted-key canonical JSON, sorted iteration) | Built-in (`SEVERITY_ORDER`) | Built-in (`sha256(canonicalJson(hashable))` per `LoadedRule`) | Built-in (`RuleEvaluation.conditions[]` carries `observed` + `reason`) | Native (loader is YAML-first) | Sub-1k LOC under `experiments/` | Local, owned | +| **json-rules-engine 7.3.2** ([npm](https://www.npmjs.com/package/json-rules-engine)) | Not documented. `Almanac` supports async fact providers and `successEvents` historically fire after run; non-determinism is allowed by the API surface ([issue #39](https://github.com/CacheControl/json-rules-engine/issues/39)) | No — `priority` is execution ordering, not severity classification; conflict resolution is left to the consumer | No — `toJSON()` serialises but no content hash; no version pin per rule | Yes — strongest of the libraries; `RuleResult.conditions[].result`/`factResult`/`valueResult` ([changelog](https://github.com/CacheControl/json-rules-engine/blob/master/CHANGELOG.md)) | No — JSON only, "Object options \| String json" per [docs/rules.md](https://github.com/CacheControl/json-rules-engine/blob/master/docs/rules.md) | Adds runtime dep + jsonpath-plus (RCE-class CVE history in v6→v7) | ~351k weekly DL, 7.3.2 Feb 2026, active issues (#421 Jun 2025) | +| **GoRules ZEN/JDM 0.54** ([npm](https://www.npmjs.com/package/@gorules/zen-engine), [docs](https://docs.gorules.io/changelog)) | Rust core is deterministic by construction; `EvaluationOptions.trace=true` returns a `Trace` | Hit policies on decision tables, but severity is not a first-class concept; severity must be encoded as a column | No native content hash; bundle-level versioning only | Yes — per-node trace + performance | No — JDM is JSON; YAML not first-class | Heavy: native binary per platform (`@gorules/zen-engine-linux-x64-gnu` etc.), 11 dependents on npm, install footprint ~10MB. 
Overkill for `experiments/` | Active (0.54, 2 months old). GoRules AI + MCP shipped 2025 — geared at hosted product, not embedded use | +| **OPA / Rego (`@open-policy-agent/opa`, `opa-wasm`)** ([opa.org](https://www.openpolicyagent.org/docs/management-decision-logs)) | Rego evaluation is deterministic; explicit ban on side effects | No — Rego returns documents/decisions; severity is a convention you encode | No — bundle SHA at the `.tar.gz` level, not per-rule | Decision logs are infra-level (HTTP shipping to a remote sink). Not a Node return value | No — Rego is a separate DSL with real learning curve | High operational overhead: WASM compile step, sidecar or in-process WASM runtime, decision-log shipper. Disproportionate for ≤1000-rule node POC | OPA itself is production-grade; Node ecosystem is thin | +| **durable-rules / `durable`** ([npm](https://www.npmjs.com/package/durable)) | Rete-based, C core; deterministic per fact set | Forward-chaining engine — wrong shape; we don't derive facts, we map flags once | No | Limited; designed for streaming, not auditable single-shot | No (JSON DSL) | 379 weekly DL, 1.3k stars but most activity is Python-side. C native dep. 
| Marginal — see [npmtrends](https://npmtrends.com/durable-vs-json-rules-engine-vs-json-rules-engine-simplified-vs-node-rules-vs-rule-reactor) | +| **`nools`** | Unmaintained 9y, maintainer looking for successor | n/a | No | No | No | n/a | Disqualified | +| **`json-logic-js` / `json-logic-engine`** | Pure expression evaluator | No rule loop, no priorities — not an engine | No | No (single-expression result) | No (JSON) | Tiny but you'd build the engine on top | Stable but inactive | +| **`rules-engine-ts` (andrewvo89)** | TS-native, but no determinism guarantees documented | No | No | Limited | No | Last release 2.0.4, ~3y ago | Effectively unmaintained | +| **"RulesPad" / other 2026 entrants** | No verifiable npm presence found ([search](https://www.npmjs.com/search?q=rules+engine)) | — | — | — | — | — | No signal | + +### Where the scoring genuinely hurts the candidates + +- **C1 determinism.** No library in the field *documents* determinism as a guarantee. `json-rules-engine` actively allows non-determinism: async fact providers, async event handlers (historically fired after `run` resolved — [issue #39](https://github.com/CacheControl/json-rules-engine/issues/39)), and `jsonpath-plus` expressions whose evaluation order across nested matches is not contractually fixed. We would have to *audit* the library to assert what we *enforce* in 200 lines. +- **C3 content hash.** None of the candidates ship per-rule content hashes. We would have to wrap every candidate to compute hashes anyway. At that point the wrapper owns the determinism story and the library is just providing a boolean tree walker — which `engine.ts` already provides in 90 lines. +- **C5 YAML.** Every candidate is JSON-first. YAML adoption is purely a loader-layer concern, which is already solved in our `loader.ts`. +- **C2 severity-first.** Every candidate uses either priority-as-order (json-rules-engine) or hit-policy/return-document (ZEN, OPA). 
None encode "blocked > needs-attention > ready-to-progress" as an evaluation primitive. We bolt this on every time. + +The only constraint where a candidate matches or exceeds us is **C4 audit trail** — `json-rules-engine`'s `RuleResult` is genuinely good, and `valueResult`/`factResult` (added 7.2.0 Dec 2024) is the closest analogue to our `ConditionResult.observed`. Our trail is at parity for the operators we support. + +## Migration cost if we adopted anyway + +If we forced an adoption — say, `json-rules-engine` — here is what the diff looks like. + +**Lose** +- Severity-first verdict resolution (re-implemented in adapter). +- Per-rule content hash + `rulesetHash`/`flagsHash` (re-implemented in adapter). +- YAML loader with operator-shape and weight-positive validation (re-implemented in adapter; library accepts JSON). +- Synchronous, side-effect-free guarantee (library is async-first; we would have to `await` even for pure runs). +- `exists: false` + value-op rejection at load time. + +**Have to wrap** +- A `LoadedRule → Rule` translator (our `Condition` shape → library's `Condition` tree). +- A `RuleResult → RuleEvaluation` translator to keep audit-trail consumers stable. +- A custom verdict resolver to enforce severity-first over the library's success-event stream. +- A canonicalisation pass over `RuleResult.toJSON()` because we cannot rely on library-internal key order. + +**Line count, ballpark** +- Today: `engine.ts` 193 + `loader.ts` 189 + `hash.ts` ~30 + `types.ts` 85 ≈ **~497 LOC owned**. +- Post-adoption: adapter + canonicaliser + severity resolver + YAML loader retained ≈ **~450 LOC owned, plus a 351k-DL transitive (with jsonpath-plus history)**. Net: same code, more attack surface, less determinism guarantee. + +This is the textbook "the wrapper is the engine" trap. 
+ +## 2026 frontier-model angle + +Only one of the two leading candidates makes this an explicit story; the other is listed for contrast: + +- **GoRules** shipped a Vertex AI + Gemini integration and a Model Context Protocol (MCP) endpoint in 2025 ([changelog](https://docs.gorules.io/changelog)). External assistants can read, modify, simulate, and test decision graphs. This is interesting for *rule authoring*, not rule *evaluation*, and is geared toward GoRules' hosted product. If Specorator ever wants AI-assisted rule editing in a GUI, GoRules is the precedent — but that is a separate workflow concern from the Decide quadrant. +- **`json-rules-engine`** has no first-party LLM story. Community usage is the standard "validate LLM JSON output against rules" pattern, which we already do better. + +Our shape — LLM extracts flags, deterministic engine decides — is closer to GoRules' positioning than to anything in the JS ecosystem. But "close to positioning" is not "buys us anything"; their advantage is in graph editing UX, which we don't need. + +## OPA/Rego honest read + +Skip it. OPA is excellent for authorisation policies distributed across services, where decision logs need to ship to a SIEM and policies are managed by a platform team independently of application code. None of those conditions applies to a Node-only POC that: +- runs once per LLM extraction, in-process, +- expects <1000 rules, +- already produces a richer per-condition audit object than OPA's WASM trace, +- has no need for bundle distribution, partial evaluation, or remote management. + +Adopting OPA would mean: a Rego DSL (different from YAML), a WASM compilation step in CI, a decision-log shipper, and Rego training for every rule author. The operational overhead is not justified. Revisit only if Specorator needs to share rules with non-Node consumers. + +## What this means for the architect + +- The hand-rolled engine is now the design.
Any future swap is a `DecisionEngine` interface refactor; the interface boundary already exists in practice (`evaluate(rules, flags) → VerdictResult`). +- If GUI authoring ever becomes a constraint, that is when ZEN/JDM should be re-evaluated — *for the authoring surface*, with our engine still on the evaluation path or a thin re-implementation in JDM terms. +- Persistence and replay (the `trail/<flagsHash>-<rulesetHash>.json` shape from `04-technical-design.md` §4) remains the next concrete deliverable; no candidate ships this for us either. + +## Risks of staying hand-rolled + +| Risk | Severity | Likelihood | Mitigation | +|---|---|---|---| +| Bus factor — one author owns the engine | med | med | Engine is <200 LOC; doc the contract in `extending.md`; add `ENGINE_VERSION` bump discipline (already in place). | +| Feature creep into chaining / derived facts | med | low | Constitution Article II — separation of concerns. Anything requiring derived facts moves to a separate Orient-quadrant step, not a new engine feature. | +| Hidden non-determinism slips in | high | low | CI fixture: same flags + same rules → byte-identical `VerdictResult` JSON. Already planned as POC test #9 in `04-technical-design.md`. | +| Library matures and overtakes us | low | low | Re-evaluate at the next stage gate (Stage 4 design) or if rule count crosses ~500 rules. | + +## Open follow-ups + +- **Decision:** record this as an ADR once the POC graduates from `experiments/`. Title: "Use a hand-rolled deterministic rule engine in the Decide quadrant." Inputs cited: `engine.ts`, `loader.ts`, this file. *(TBD — owner: architect.)* +- **Bench:** add a `bench/` micro-benchmark against `json-rules-engine` on a 200-rule fixture to put a number on the perf claim before any production push. 
*(TBD — owner: sre.)* + +## Sources + +- json-rules-engine: [npm](https://www.npmjs.com/package/json-rules-engine), [changelog](https://github.com/CacheControl/json-rules-engine/blob/master/CHANGELOG.md), [docs/rules.md](https://github.com/CacheControl/json-rules-engine/blob/master/docs/rules.md), [issue #39 async events](https://github.com/CacheControl/json-rules-engine/issues/39), [issues board](https://github.com/cachecontrol/json-rules-engine/issues), [Snyk security](https://security.snyk.io/package/npm/json-rules-engine) +- GoRules ZEN: [npm](https://www.npmjs.com/package/@gorules/zen-engine), [changelog](https://docs.gorules.io/changelog), [Node.js SDK](https://docs.gorules.io/reference/nodejs), [GitHub](https://github.com/gorules/zen) +- OPA: [decision logs](https://www.openpolicyagent.org/docs/management-decision-logs), [WASM](https://www.openpolicyagent.org/docs/wasm), [integration](https://www.openpolicyagent.org/docs/integration) +- durable-rules: [npm `durable`](https://www.npmjs.com/package/durable), [GitHub jruizgit/rules](https://github.com/jruizgit/rules) +- rules-engine-ts: [GitHub andrewvo89](https://github.com/andrewvo89/rules-engine-ts) +- nools (disqualified): [GitHub noolsjs](https://github.com/noolsjs/nools) +- Comparison: [Nected Top-10 2026 list](https://www.nected.ai/blog/rule-engine-in-node-js-javascript), [npm trends durable vs json-rules-engine](https://npmtrends.com/durable-vs-json-rules-engine-vs-json-rules-engine-simplified-vs-node-rules-vs-rule-reactor) diff --git a/experiments/rule-engine-poc/src/prompt-builder.ts b/experiments/rule-engine-poc/src/prompt-builder.ts index e811cce8c..a267f94c6 100644 --- a/experiments/rule-engine-poc/src/prompt-builder.ts +++ b/experiments/rule-engine-poc/src/prompt-builder.ts @@ -1,6 +1,7 @@ import type { CollectedFile } from "./context.js"; import type { FlagSchema } from "./flag-schema.js"; import type { LoadedRule } from "./types.js"; +import { FORBIDDEN_FIELDS } from "./validate.js"; export 
interface BuildPromptInput { targetId: string; @@ -10,19 +11,6 @@ export interface BuildPromptInput { files: CollectedFile[]; } -// Forbidden field names (from research/10-extraction-prompt-patterns.md). -// Naming them explicitly inside the prompt is more reliable than hoping -// the model won't drift into verdict-shaped output. -const FORBIDDEN_FIELDS = [ - "verdict", - "assessment", - "conclusion", - "summary", - "recommendation", - "rationale", - "analysis", -]; - export function buildExtractionPrompt(input: BuildPromptInput): string { const { targetId, targetLabel, flagSchema, rules, files } = input; diff --git a/experiments/rule-engine-poc/src/report.ts b/experiments/rule-engine-poc/src/report.ts index 9ebf379f8..d0a45e2b5 100644 --- a/experiments/rule-engine-poc/src/report.ts +++ b/experiments/rule-engine-poc/src/report.ts @@ -11,6 +11,7 @@ import { loadFlagSchema } from "./flag-schema.js"; import { loadRulesFromFile } from "./loader.js"; import { renderHtmlReport } from "./html-report.js"; import { openInBrowser } from "./open-browser.js"; +import { validateExtraction } from "./validate.js"; import type { ExtractionFlags, Verdict } from "./types.js"; import type { Target } from "./config.js"; @@ -34,9 +35,11 @@ const onlyTarget = takeOpt(argv, "--target"); const noOpen = takeFlag(argv, "--no-open"); const quiet = takeFlag(argv, "--quiet"); +const skipValidate = takeFlag(argv, "--skip-validate"); + const config = loadConfig(configPath); const rules = loadRulesFromFile(config.rulesPath); -loadFlagSchema(config.flagSchemaPath); // validate-only here; warnings live in `plan`. 
+const schema = loadFlagSchema(config.flagSchemaPath); mkdirSync(config.reportsDirPath, { recursive: true }); @@ -78,6 +81,28 @@ for (const target of targets) { continue; } + if (!skipValidate) { + const v = validateExtraction(flags as Record<string, unknown>, schema); + if (!v.ok) { + console.error( + `[report] ${target.id}: extraction failed validation (${v.errors.length} error(s)):`, + ); + for (const e of v.errors) { + console.error(` ERROR ${e.code}${e.path ? ` (${e.path})` : ""}: ${e.message}`); + } + console.error( + `[report] re-run with --skip-validate to render anyway, or fix the extraction and try again`, + ); + worstExitCode = Math.max(worstExitCode, 2); + continue; + } + if (!quiet && v.warnings.length > 0) { + for (const w of v.warnings) { + console.warn(`[report] ${target.id} warn ${w.code}${w.path ? ` (${w.path})` : ""}: ${w.message}`); + } + } + } + const result = evaluate(rules, flags); const html = renderHtmlReport(result, { rulesPath: relative(config.configDir, config.rulesPath), diff --git a/experiments/rule-engine-poc/src/validate-cli.ts b/experiments/rule-engine-poc/src/validate-cli.ts new file mode 100644 index 000000000..5fb452c61 --- /dev/null +++ b/experiments/rule-engine-poc/src/validate-cli.ts @@ -0,0 +1,89 @@ +#!/usr/bin/env node +// `validate` CLI: parse each extraction, check against the flag schema +// + forbidden-fields list, surface issues. Exits 0 when every target +// is clean, 1 when any target has errors, 2 when any extraction is +// missing or unreadable. Intended as a pre-flight before `report`. 
/**
 * Removes `flag` and its value from `argv` (mutating it in place) and
 * returns the value.
 *
 * @param argv - Remaining command-line arguments; consumed entries are spliced out.
 * @param flag - Option name to look for, e.g. `--config`.
 * @returns The value following `flag`, or `undefined` when `flag` is absent.
 * @throws Error when `flag` is the last argument and therefore has no value.
 *   Previously this case returned `undefined`, which made the CLI silently
 *   fall back to its defaults (default config path / all targets).
 */
function takeOpt(argv: string[], flag: string): string | undefined {
  const i = argv.indexOf(flag);
  if (i === -1) return undefined;
  const v = argv[i + 1];
  if (v === undefined) {
    throw new Error(`${flag} requires a value`);
  }
  argv.splice(i, 2);
  return v;
}
"array" : typeof parsed; + console.error( + `[validate] extraction ${extractionPath} must be a JSON object (got ${kind})`, + ); + worstExitCode = Math.max(worstExitCode, 2); + continue; + } + const result = validateExtraction(parsed as Record<string, unknown>, schema); + if (result.ok && result.warnings.length === 0) { + console.log(`[validate] ${target.id}: OK`); + cleanCount++; + continue; + } + console.log( + `[validate] ${target.id}: ${result.errors.length} error(s), ${result.warnings.length} warning(s)`, + ); + for (const e of result.errors) { + console.log(` ERROR ${e.code}${e.path ? ` (${e.path})` : ""}: ${e.message}`); + issueCount++; + } + for (const w of result.warnings) { + console.log(` warn ${w.code}${w.path ? ` (${w.path})` : ""}: ${w.message}`); + } + if (result.errors.length > 0) { + worstExitCode = Math.max(worstExitCode, 1); + } else { + cleanCount++; + } +} + +console.log(""); +console.log(`Summary: ${cleanCount}/${targets.length} clean, ${issueCount} error(s) total`); + +process.exitCode = worstExitCode; diff --git a/experiments/rule-engine-poc/src/validate.ts b/experiments/rule-engine-poc/src/validate.ts new file mode 100644 index 000000000..4bb386e39 --- /dev/null +++ b/experiments/rule-engine-poc/src/validate.ts @@ -0,0 +1,157 @@ +// Validates an extraction (pasted LLM JSON output) against the flag +// schema, the forbidden-fields list, and optionally a prompt-binding +// hash. Surfaces issues loudly rather than letting bad input launder +// into a hashed, replayable-looking verdict. +// +// This is the gate the critic (research/07) and analyst (research/10) +// independently flagged as the riskiest missing piece: without it, +// LLM output that drifts from the schema becomes a reproducible-looking +// audit trail of a bogus decision. + +import type { FlagSchema, FlagType } from "./flag-schema.js"; + +// Verdict-shaped field names the LLM might "helpfully" emit. 
// The engine produces the verdict, not the LLM — so seeing any of these
// verdict-shaped field names in an extraction is a sign the prompt
// wasn't followed.
export const FORBIDDEN_FIELDS = [
  "verdict",
  "assessment",
  "conclusion",
  "summary",
  "recommendation",
  "rationale",
  "analysis",
] as const;

// The magic field the LLM is asked to copy back so we can detect
// extractions produced against a stale prompt.
export const PROMPT_HASH_FIELD = "__prompt_hash";

export type ValidationSeverity = "error" | "warning";

export interface ValidationIssue {
  severity: ValidationSeverity;
  code: string;
  message: string;
  path?: string;
}

export interface ValidationResult {
  ok: boolean;
  errors: ValidationIssue[];
  warnings: ValidationIssue[];
}

export interface ValidateOptions {
  expectedPromptHash?: string;
}

/**
 * Validates one extraction object against the flag schema.
 *
 * Checks, in order: forbidden verdict-shaped fields, the prompt-hash
 * binding (only when `options.expectedPromptHash` is provided), and then
 * every remaining key against its schema entry (type and `allowed_values`).
 *
 * @param flags - The parsed extraction object.
 * @param schema - Flag schema keyed by flag name.
 * @param options - Optional prompt-hash the extraction must echo back.
 * @returns Errors and warnings; `ok` is true exactly when there are no errors.
 */
export function validateExtraction(
  flags: Record<string, unknown>,
  schema: FlagSchema,
  options: ValidateOptions = {},
): ValidationResult {
  const errors: ValidationIssue[] = [];
  const warnings: ValidationIssue[] = [];

  for (const field of FORBIDDEN_FIELDS) {
    if (field in flags) {
      errors.push({
        severity: "error",
        code: "forbidden-field",
        path: field,
        message:
          `Field '${field}' is verdict-shaped — the engine produces the verdict, the LLM only emits flags. ` +
          `Re-run the AI extraction and ensure it does not include this field.`,
      });
    }
  }

  if (options.expectedPromptHash !== undefined) {
    const seen = flags[PROMPT_HASH_FIELD];
    if (seen === undefined) {
      errors.push({
        severity: "error",
        code: "missing-prompt-hash",
        path: PROMPT_HASH_FIELD,
        message:
          `Extraction is missing '${PROMPT_HASH_FIELD}'. Current prompt hash: ${options.expectedPromptHash}. ` +
          `Re-run 'npm run plan' and re-extract.`,
      });
    } else if (typeof seen !== "string" || seen !== options.expectedPromptHash) {
      errors.push({
        severity: "error",
        code: "stale-extraction",
        path: PROMPT_HASH_FIELD,
        message:
          `Extraction is stale: '${PROMPT_HASH_FIELD}' is '${String(seen)}', current prompt is '${options.expectedPromptHash}'. ` +
          `Source files or rules changed since extraction was produced — re-run 'npm run plan' and re-extract.`,
      });
    }
  }

  for (const [key, value] of Object.entries(flags)) {
    if ((FORBIDDEN_FIELDS as readonly string[]).includes(key)) continue;
    if (key === PROMPT_HASH_FIELD) continue;
    // Own-property lookup only: a flag named e.g. 'constructor' or
    // '__proto__' must not resolve to an inherited Object.prototype member
    // and be mistaken for a schema entry.
    const entry = Object.prototype.hasOwnProperty.call(schema, key)
      ? schema[key]
      : undefined;
    if (!entry) {
      warnings.push({
        severity: "warning",
        code: "unknown-flag",
        path: key,
        message: `Flag '${key}' is not documented in flag-schema.yaml. The engine will ignore it.`,
      });
      continue;
    }
    const typeProblem = checkType(value, entry.type);
    if (typeProblem) {
      errors.push({
        severity: "error",
        code: "type-mismatch",
        path: key,
        message: `Flag '${key}' expected '${entry.type}', got ${typeProblem}.`,
      });
      continue;
    }
    // checkType treats null as "unknown" for every type, so null must also
    // be exempt from the enum check; otherwise an enum flag could never be
    // reported as unknown.
    if (value === null) continue;
    if (entry.allowed_values && !entry.allowed_values.some((v) => deepEqual(v, value))) {
      errors.push({
        severity: "error",
        code: "disallowed-value",
        path: key,
        message:
          `Flag '${key}' value ${JSON.stringify(value)} is not in allowed_values ${JSON.stringify(entry.allowed_values)}.`,
      });
    }
  }

  return { ok: errors.length === 0, errors, warnings };
}

/**
 * Checks `value` against a schema type.
 *
 * @returns `null` when the value is acceptable (including `null` itself),
 *   otherwise a short description of what was found, for the error message.
 */
function checkType(value: unknown, type: FlagType): string | null {
  if (value === null) return null;
  switch (type) {
    case "boolean":
      return typeof value === "boolean" ? null : typeof value;
    case "number":
      if (typeof value !== "number") return typeof value;
      if (!Number.isFinite(value)) return "non-finite number (NaN or Infinity)";
      return null;
    case "string":
      return typeof value === "string" ? null : typeof value;
    case "string[]": {
      if (!Array.isArray(value)) return typeof value;
      const bad = value.findIndex((v) => typeof v !== "string");
      return bad === -1 ? null : `array with non-string at index ${bad}`;
    }
    default:
      return `unknown schema type '${type as string}'`;
  }
}

/** Strict equality, extended to element-wise comparison of (nested) arrays. */
function deepEqual(a: unknown, b: unknown): boolean {
  if (a === b) return true;
  if (Array.isArray(a) && Array.isArray(b)) {
    return a.length === b.length && a.every((v, i) => deepEqual(v, b[i]));
  }
  return false;
}
expect(r.errors[0]!.code).toBe("forbidden-field"); + expect(r.errors[0]!.path).toBe("verdict"); + }); + + it("flags every forbidden field name", () => { + const input: Record<string, unknown> = {}; + for (const f of FORBIDDEN_FIELDS) input[f] = "x"; + const r = validateExtraction(input, schema); + expect(r.errors).toHaveLength(FORBIDDEN_FIELDS.length); + }); + + it("warns on unknown flags but does not error", () => { + const r = validateExtraction({ ci_passing: true, bogus: 1 }, schema); + expect(r.ok).toBe(true); + expect(r.warnings).toHaveLength(1); + expect(r.warnings[0]!.code).toBe("unknown-flag"); + expect(r.warnings[0]!.path).toBe("bogus"); + }); + + it("errors on type mismatch for booleans", () => { + const r = validateExtraction({ ci_passing: "yes" }, schema); + expect(r.ok).toBe(false); + expect(r.errors[0]!.code).toBe("type-mismatch"); + }); + + it("errors on non-finite numbers", () => { + const r = validateExtraction({ approvals_count: NaN }, schema); + expect(r.ok).toBe(false); + expect(r.errors[0]!.code).toBe("type-mismatch"); + }); + + it("errors on non-string entries in string[] flags", () => { + const r = validateExtraction({ files_touched: ["a.ts", 7] as unknown[] }, schema); + expect(r.ok).toBe(false); + expect(r.errors[0]!.message).toMatch(/non-string at index 1/); + }); + + it("errors on disallowed enum values", () => { + const r = validateExtraction({ current_stage: "boozeday" }, schema); + expect(r.ok).toBe(false); + expect(r.errors[0]!.code).toBe("disallowed-value"); + }); + + it("accepts null as 'unknown' for any type", () => { + const r = validateExtraction({ ci_passing: null, approvals_count: null }, schema); + expect(r.ok).toBe(true); + }); + + describe("with expectedPromptHash", () => { + it("errors when extraction omits the hash field", () => { + const r = validateExtraction({ ci_passing: true }, schema, { + expectedPromptHash: "abc", + }); + expect(r.errors[0]!.code).toBe("missing-prompt-hash"); + }); + + it("errors when hash 
mismatches", () => { + const r = validateExtraction( + { ci_passing: true, [PROMPT_HASH_FIELD]: "xyz" }, + schema, + { expectedPromptHash: "abc" }, + ); + expect(r.errors[0]!.code).toBe("stale-extraction"); + }); + + it("passes when hash matches", () => { + const r = validateExtraction( + { ci_passing: true, [PROMPT_HASH_FIELD]: "abc" }, + schema, + { expectedPromptHash: "abc" }, + ); + expect(r.ok).toBe(true); + }); + }); +}); From a9b1db97374f17b1af12b3c62b3dec72cc8e3800 Mon Sep 17 00:00:00 2001 From: Claude <noreply@anthropic.com> Date: Sun, 17 May 2026 12:16:31 +0000 Subject: [PATCH 10/45] feat(rule-engine-poc): prompt-extraction binding (stale-extraction guard) Closes the stale-extraction failure mode flagged by the critic (research/07): users edit source files between plan and report and the old JSON still renders a confident verdict. The report now refuses extractions produced against a different prompt. What's new: - src/prompt-hash.ts: computePromptHash hashes the LOAD-BEARING inputs (target id, per-file sha, rule hashes, schema content), not the rendered prompt text. Cosmetic edits to the prompt template don't invalidate extractions; real source changes do. - src/plan.ts: emits sidecar prompts/<id>.hash.txt and embeds the hash into the prompt as (a) a top-of-file HTML comment, (b) an explicit rule asking the LLM to copy it into __prompt_hash, and (c) the response template's first key. - src/validate.ts: enforces expectedPromptHash when provided. Surfaces missing-prompt-hash and stale-extraction error codes with re-run instructions. - src/report.ts + src/validate-cli.ts: read the sidecar hash if it exists; absence falls back to the pre-binding behaviour for backwards-compat with fixtures. - 6 new prompt-hash tests; suite total 83 passing. 
End-to-end verified in this sandbox: matching hash -> exit 0 (ready-to-progress) stale hash -> exit 2 with explicit error missing field -> exit 2 with explicit error --- experiments/rule-engine-poc/src/plan.ts | 11 +- .../rule-engine-poc/src/prompt-builder.ts | 32 ++++-- .../rule-engine-poc/src/prompt-hash.ts | 36 +++++++ experiments/rule-engine-poc/src/report.ts | 13 ++- .../rule-engine-poc/src/validate-cli.ts | 12 ++- .../rule-engine-poc/test/prompt-hash.test.ts | 101 ++++++++++++++++++ 6 files changed, 191 insertions(+), 14 deletions(-) create mode 100644 experiments/rule-engine-poc/src/prompt-hash.ts create mode 100644 experiments/rule-engine-poc/test/prompt-hash.test.ts diff --git a/experiments/rule-engine-poc/src/plan.ts b/experiments/rule-engine-poc/src/plan.ts index ad82f4947..592436922 100644 --- a/experiments/rule-engine-poc/src/plan.ts +++ b/experiments/rule-engine-poc/src/plan.ts @@ -10,6 +10,7 @@ import { collectFiles } from "./context.js"; import { diffSchemaCoverage, loadFlagSchema } from "./flag-schema.js"; import { loadRulesFromFile } from "./loader.js"; import { buildExtractionPrompt } from "./prompt-builder.js"; +import { computePromptHash, hashSidecarPath } from "./prompt-hash.js"; import type { Target } from "./config.js"; function takeOpt(argv: string[], flag: string): string | undefined { @@ -60,18 +61,26 @@ for (const target of targets) { `[plan] target '${target.id}' resolved zero readable files from paths: ${target.paths.join(", ")}`, ); } + const promptHash = computePromptHash({ + targetId: target.id, + files, + rules, + flagSchema: schema, + }); const prompt = buildExtractionPrompt({ targetId: target.id, targetLabel: target.label, flagSchema: schema, rules, files, + promptHash, }); const outPath = join(config.promptsDirPath, `${target.id}.md`); writeFileSync(outPath, prompt, "utf8"); + writeFileSync(hashSidecarPath(outPath), promptHash + "\n", "utf8"); const totalBytes = files.reduce((sum, f) => sum + Buffer.byteLength(f.content, 
"utf8"), 0); console.log( - `[plan] ${target.id}: ${files.length} files (${Math.round(totalBytes / 1024)} KB) -> ${relative(config.configDir, outPath)}`, + `[plan] ${target.id}: ${files.length} files (${Math.round(totalBytes / 1024)} KB) -> ${relative(config.configDir, outPath)} hash=${promptHash.slice(0, 12)}`, ); } diff --git a/experiments/rule-engine-poc/src/prompt-builder.ts b/experiments/rule-engine-poc/src/prompt-builder.ts index a267f94c6..36fc583ef 100644 --- a/experiments/rule-engine-poc/src/prompt-builder.ts +++ b/experiments/rule-engine-poc/src/prompt-builder.ts @@ -1,7 +1,7 @@ import type { CollectedFile } from "./context.js"; import type { FlagSchema } from "./flag-schema.js"; import type { LoadedRule } from "./types.js"; -import { FORBIDDEN_FIELDS } from "./validate.js"; +import { FORBIDDEN_FIELDS, PROMPT_HASH_FIELD } from "./validate.js"; export interface BuildPromptInput { targetId: string; @@ -9,10 +9,11 @@ export interface BuildPromptInput { flagSchema: FlagSchema; rules: LoadedRule[]; files: CollectedFile[]; + promptHash?: string; } export function buildExtractionPrompt(input: BuildPromptInput): string { - const { targetId, targetLabel, flagSchema, rules, files } = input; + const { targetId, targetLabel, flagSchema, rules, files, promptHash } = input; const schemaRows = Object.entries(flagSchema) .sort(([a], [b]) => a.localeCompare(b)) @@ -25,13 +26,23 @@ export function buildExtractionPrompt(input: BuildPromptInput): string { }) .join("\n"); - const responseTemplate = JSON.stringify( - Object.fromEntries( - Object.entries(flagSchema).map(([name, entry]) => [name, entry.example]), - ), - null, - 2, + const templateObj: Record<string, unknown> = Object.fromEntries( + Object.entries(flagSchema).map(([name, entry]) => [name, entry.example]), ); + if (promptHash) { + // Make the __prompt_hash field the first key the LLM sees in the + // template so it's harder to drop accidentally. 
+ templateObj[PROMPT_HASH_FIELD] = promptHash; + } + const responseTemplate = JSON.stringify(templateObj, null, 2); + + const promptHashLine = promptHash + ? `\n<!-- prompt-hash: ${promptHash} -->\n` + : ""; + + const promptHashRule = promptHash + ? `\n7. **Include \`"${PROMPT_HASH_FIELD}": "${promptHash}"\`** in your JSON output exactly as shown. The engine uses this to detect stale extractions when the source files change.` + : ""; const ruleSummary = rules .map((r) => `- \`${r.id}\` (priority ${r.priority}) — ${r.description}`) @@ -51,8 +62,7 @@ export function buildExtractionPrompt(input: BuildPromptInput): string { // markdown headers as redundancy (GPT/Gemini-friendly). Opening an // `<output>` tag at the end is a forcing function: the next token the // model emits has to look like the start of valid JSON. - return `# Extraction request — ${targetLabel} - + return `# Extraction request — ${targetLabel}${promptHashLine} <role> You are an evidence extractor for a deterministic rule engine — not a fact-checker, not an assistant, not a reviewer. Your only job is to read the source material in \`<source>\` and emit structured flags in \`<output>\`. @@ -65,7 +75,7 @@ A separate engine consumes your flags and produces a verdict. If you add interpr 3. **Do NOT include any of these fields:** ${FORBIDDEN_FIELDS.map((f) => `\`${f}\``).join(", ")}. They are verdict-shaped; the engine produces the verdict, not you. 4. **Omit any flag you cannot determine** from the source. Do not guess. A missing flag is correct behaviour — the engine surfaces it as "flag missing in extraction" and the rule that needed it does not fire. 5. Each flag value must match the declared type. Booleans are \`true\`/\`false\` not \`"true"\`/\`"false"\`. -6. If you genuinely cannot extract anything (source is empty or unrelated), respond with \`<output>{}</output>\`. +6. 
// Prompt-extraction binding: detect stale extractions whose source
// material, rules, or schema changed since the extraction was produced.
//
// We hash the *load-bearing inputs* to the LLM extraction (target id,
// per-file content hash, rule content hashes, flag schema content),
// NOT the rendered prompt text. That way cosmetic edits to the prompt
// template don't invalidate extractions, but a real change in source
// content does.

export interface PromptHashInput {
  targetId: string;
  files: CollectedFile[];
  rules: LoadedRule[];
  flagSchema: FlagSchema;
}

// Locale-independent ordering by UTF-16 code unit. `localeCompare` depends
// on the runtime's locale/ICU data, which would let the same inputs hash
// differently on different machines.
function compareCodeUnits(a: string, b: string): number {
  return a < b ? -1 : a > b ? 1 : 0;
}

/**
 * Computes the binding hash for one target's extraction inputs.
 *
 * The result is independent of the order of `files` and `rules`: files are
 * sorted by relative path and rule hashes are sorted before hashing.
 *
 * @param input - Target id, collected files, loaded rules and flag schema.
 * @returns The `sha256` of the canonical JSON of the load-bearing payload.
 */
export function computePromptHash(input: PromptHashInput): string {
  const payload = {
    targetId: input.targetId,
    files: input.files
      .map((f) => ({ path: f.relativePath, sha: sha256(f.content) }))
      .sort((a, b) => compareCodeUnits(a.path, b.path)),
    // Default sort() already compares by code unit.
    ruleHashes: input.rules.map((r) => r.hash).sort(),
    schemaHash: sha256(canonicalJson(input.flagSchema)),
  };
  return sha256(canonicalJson(payload));
}

/**
 * Returns the path of the hash sidecar file that sits next to a prompt file.
 *
 * A trailing `.md` is replaced by `.hash.txt`. For any other path the
 * suffix is appended, so the result never equals `promptPath` and writing
 * the sidecar cannot overwrite the prompt itself.
 */
export function hashSidecarPath(promptPath: string): string {
  return promptPath.endsWith(".md")
    ? `${promptPath.slice(0, -".md".length)}.hash.txt`
    : `${promptPath}.hash.txt`;
}
"./flag-schema.js"; import { loadRulesFromFile } from "./loader.js"; import { renderHtmlReport } from "./html-report.js"; import { openInBrowser } from "./open-browser.js"; +import { hashSidecarPath } from "./prompt-hash.js"; import { validateExtraction } from "./validate.js"; import type { ExtractionFlags, Verdict } from "./types.js"; import type { Target } from "./config.js"; @@ -81,8 +82,18 @@ for (const target of targets) { continue; } + // Look up the sidecar prompt hash if plan wrote one. Missing sidecar + // = backwards-compat path (older extraction or single-shot test). + let expectedPromptHash: string | undefined; + const sidecar = hashSidecarPath(join(config.promptsDirPath, `${target.id}.md`)); + if (existsSync(sidecar)) { + expectedPromptHash = readFileSync(sidecar, "utf8").trim(); + } + if (!skipValidate) { - const v = validateExtraction(flags as Record<string, unknown>, schema); + const v = validateExtraction(flags as Record<string, unknown>, schema, { + expectedPromptHash, + }); if (!v.ok) { console.error( `[report] ${target.id}: extraction failed validation (${v.errors.length} error(s)):`, diff --git a/experiments/rule-engine-poc/src/validate-cli.ts b/experiments/rule-engine-poc/src/validate-cli.ts index 5fb452c61..6a12c5ca0 100644 --- a/experiments/rule-engine-poc/src/validate-cli.ts +++ b/experiments/rule-engine-poc/src/validate-cli.ts @@ -8,6 +8,7 @@ import { existsSync, readFileSync } from "node:fs"; import { join, relative } from "node:path"; import { loadConfig, findTarget } from "./config.js"; import { loadFlagSchema } from "./flag-schema.js"; +import { hashSidecarPath } from "./prompt-hash.js"; import { validateExtraction } from "./validate.js"; import type { Target } from "./config.js"; @@ -60,7 +61,16 @@ for (const target of targets) { worstExitCode = Math.max(worstExitCode, 2); continue; } - const result = validateExtraction(parsed as Record<string, unknown>, schema); + let expectedPromptHash: string | undefined; + const sidecar = 
hashSidecarPath(join(config.promptsDirPath, `${target.id}.md`)); + if (existsSync(sidecar)) { + expectedPromptHash = readFileSync(sidecar, "utf8").trim(); + } + const result = validateExtraction( + parsed as Record<string, unknown>, + schema, + { expectedPromptHash }, + ); if (result.ok && result.warnings.length === 0) { console.log(`[validate] ${target.id}: OK`); cleanCount++; diff --git a/experiments/rule-engine-poc/test/prompt-hash.test.ts b/experiments/rule-engine-poc/test/prompt-hash.test.ts new file mode 100644 index 000000000..1d7b09551 --- /dev/null +++ b/experiments/rule-engine-poc/test/prompt-hash.test.ts @@ -0,0 +1,101 @@ +import { describe, expect, it } from "vitest"; +import { computePromptHash, hashSidecarPath } from "../src/prompt-hash.js"; +import { loadRulesFromString } from "../src/loader.js"; +import type { FlagSchema } from "../src/flag-schema.js"; + +const schema: FlagSchema = { + foo: { type: "boolean", description: "foo", example: true }, +}; +const rules = loadRulesFromString( + ` +- id: r + description: x + priority: 1 + when: + all: + - flag: foo + eq: true + then: + verdict: blocked + weight: 1 + actions: [] +`, + "hash-test", +); +const files = [ + { + absolutePath: "/abs/a.md", + relativePath: "a.md", + bytes: 5, + truncated: false, + content: "hello", + }, + { + absolutePath: "/abs/b.md", + relativePath: "b.md", + bytes: 5, + truncated: false, + content: "world", + }, +]; + +describe("computePromptHash", () => { + it("produces the same hash for identical inputs", () => { + const a = computePromptHash({ targetId: "t", files, rules, flagSchema: schema }); + const b = computePromptHash({ targetId: "t", files, rules, flagSchema: schema }); + expect(a).toBe(b); + }); + + it("is independent of input file order", () => { + const a = computePromptHash({ + targetId: "t", + files, + rules, + flagSchema: schema, + }); + const b = computePromptHash({ + targetId: "t", + files: [...files].reverse(), + rules, + flagSchema: schema, + }); + 
expect(a).toBe(b); + }); + + it("changes when a file's content changes", () => { + const a = computePromptHash({ targetId: "t", files, rules, flagSchema: schema }); + const b = computePromptHash({ + targetId: "t", + files: [{ ...files[0]!, content: "different" }, files[1]!], + rules, + flagSchema: schema, + }); + expect(a).not.toBe(b); + }); + + it("changes when the schema changes", () => { + const a = computePromptHash({ targetId: "t", files, rules, flagSchema: schema }); + const b = computePromptHash({ + targetId: "t", + files, + rules, + flagSchema: { + ...schema, + bar: { type: "number", description: "bar", example: 1 }, + }, + }); + expect(a).not.toBe(b); + }); + + it("changes when the target id changes", () => { + const a = computePromptHash({ targetId: "t1", files, rules, flagSchema: schema }); + const b = computePromptHash({ targetId: "t2", files, rules, flagSchema: schema }); + expect(a).not.toBe(b); + }); +}); + +describe("hashSidecarPath", () => { + it("swaps .md for .hash.txt", () => { + expect(hashSidecarPath("/x/prompts/foo.md")).toBe("/x/prompts/foo.hash.txt"); + }); +}); From cc00478be855c90a81b57de98186a6584ada1c3f Mon Sep 17 00:00:00 2001 From: Claude <noreply@anthropic.com> Date: Sun, 17 May 2026 12:18:25 +0000 Subject: [PATCH 11/45] fix(rule-engine-poc): polish bundle (Codex round 7 + reviewer S2/S3) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes a cluster of small findings from Codex round 7 (cli, context) and reviewer S2/S3 (engine, loader audit-trail honesty): - cli.ts: validate JSON root is a plain object before evaluate. Same guard as report.ts; previously valid-JSON-but-not-object input would crash inside hasOwnProperty.call (#525 round 7 P2). - context.ts: use lstat instead of stat and skip symlinks entirely. Previously a symlink cycle (a/sub/loop -> a/) would recurse until stack overflow during plan (#525 round 7 P2). 
- engine.ts evaluateCondition: gt/lt against a non-number and regex against a non-string now set an explicit reason ('expected number for gt, got string') so the audit trail explains *why* the condition didn't match. Reviewer S2 — previously these set matched=false with no reason. - engine.ts evaluateWhen: when.not against a missing flag no longer silently fires. The inner condition's 'flag missing in extraction' reason is preserved through the not clause; the rule fails to match rather than inverting absence into success. Reviewer S2. - loader.ts: duplicate rule ids in a single rule file are rejected at load time. Previously a second rule with the same id loaded silently and the engine evaluated it independently. Reviewer S3. - Five new tests: gt-non-number reason, regex-non-string reason, not-missing flag, duplicate ids, symlink cycle handling. Suite: 88/88 passing. --- experiments/rule-engine-poc/src/cli.ts | 15 +++- experiments/rule-engine-poc/src/context.ts | 12 ++- experiments/rule-engine-poc/src/engine.ts | 40 ++++++++-- experiments/rule-engine-poc/src/loader.ts | 18 ++++- .../test/context-symlink.test.ts | 30 ++++++++ .../rule-engine-poc/test/engine.test.ts | 75 +++++++++++++++++++ .../rule-engine-poc/test/loader.test.ts | 32 ++++++++ 7 files changed, 210 insertions(+), 12 deletions(-) create mode 100644 experiments/rule-engine-poc/test/context-symlink.test.ts diff --git a/experiments/rule-engine-poc/src/cli.ts b/experiments/rule-engine-poc/src/cli.ts index f56161237..2402f5c2a 100644 --- a/experiments/rule-engine-poc/src/cli.ts +++ b/experiments/rule-engine-poc/src/cli.ts @@ -40,9 +40,22 @@ if (!rulesPath || !flagsPath) { } const rules = loadRulesFromFile(rulesPath); -const flags: ExtractionFlags = JSON.parse( +const parsedFlags: unknown = JSON.parse( readFileSync(resolve(flagsPath), "utf8"), ); +if (!parsedFlags || typeof parsedFlags !== "object" || Array.isArray(parsedFlags)) { + const kind = + parsedFlags === null + ? 
"null" + : Array.isArray(parsedFlags) + ? "array" + : typeof parsedFlags; + console.error( + `Flags file ${flagsPath} must contain a JSON object (got ${kind})`, + ); + process.exit(2); +} +const flags = parsedFlags as ExtractionFlags; const result = evaluate(rules, flags); if (htmlPath) { diff --git a/experiments/rule-engine-poc/src/context.ts b/experiments/rule-engine-poc/src/context.ts index cf6b4d03e..6ed3852df 100644 --- a/experiments/rule-engine-poc/src/context.ts +++ b/experiments/rule-engine-poc/src/context.ts @@ -1,4 +1,4 @@ -import { readdirSync, readFileSync, statSync } from "node:fs"; +import { lstatSync, readdirSync, readFileSync } from "node:fs"; import { extname, join, relative, resolve } from "node:path"; const DEFAULT_EXTENSIONS = new Set([ @@ -62,10 +62,18 @@ function walk( ): void { let stat; try { - stat = statSync(abs); + // lstat (not stat) so we see the symlink itself rather than its + // target. Symlinked directories are then skipped entirely, which + // sidesteps cycles like a/ -> a/sub/back-to-a. + stat = lstatSync(abs); } catch { throw new Error(`Path does not exist: ${abs}`); } + if (stat.isSymbolicLink()) { + // Deliberate: symlinks are an opt-in surface, not a default + // recursion target. Skip silently. + return; + } if (stat.isFile()) { if (!extensions.has(extname(abs))) return; addFile(abs, out); diff --git a/experiments/rule-engine-poc/src/engine.ts b/experiments/rule-engine-poc/src/engine.ts index 56540e402..58b02f817 100644 --- a/experiments/rule-engine-poc/src/engine.ts +++ b/experiments/rule-engine-poc/src/engine.ts @@ -52,6 +52,7 @@ function evaluateCondition( c.regex !== undefined; let matched = true; + let reason: string | undefined; // `exists` participates in the AND-chain rather than short-circuiting, // so `exists: true` combined with e.g. `eq: 5` correctly requires both. 
@@ -75,23 +76,39 @@ function evaluateCondition( if (c.eq !== undefined) matched = matched && deepEqual(observed, c.eq); if (c.ne !== undefined) matched = matched && !deepEqual(observed, c.ne); + + // Numeric and regex operators that observe a wrong-typed value record + // an explicit reason in the audit trail, so it's clear in the report + // why the condition didn't match (vs the value just being out of range). if (c.gt !== undefined) { - matched = matched && typeof observed === "number" && observed > c.gt; + if (typeof observed !== "number") { + matched = false; + reason ??= `expected number for 'gt', got ${typeof observed}`; + } else if (!(observed > c.gt)) { + matched = false; + } } if (c.lt !== undefined) { - matched = matched && typeof observed === "number" && observed < c.lt; + if (typeof observed !== "number") { + matched = false; + reason ??= `expected number for 'lt', got ${typeof observed}`; + } else if (!(observed < c.lt)) { + matched = false; + } } if (c.in !== undefined) { matched = matched && c.in.some((v) => deepEqual(observed, v)); } if (c.regex !== undefined) { - matched = - matched && - typeof observed === "string" && - new RegExp(c.regex).test(observed); + if (typeof observed !== "string") { + matched = false; + reason ??= `expected string for 'regex', got ${typeof observed}`; + } else if (!new RegExp(c.regex).test(observed)) { + matched = false; + } } - return { condition: c, matched, observed }; + return { condition: c, matched, observed, reason }; } function evaluateWhen( @@ -118,6 +135,15 @@ function evaluateWhen( if (when.not) { for (const c of when.not) { const r = evaluateCondition(c, flags); + // If the inner condition couldn't be evaluated because the flag + // was missing, the 'not' clause cannot meaningfully invert it. + // Surface the missing premise rather than silently firing the + // rule on absence — the reviewer flagged this as silent firing. 
+ if (r.reason === "flag missing in extraction") { + conditions.push(r); + matched = false; + continue; + } // Record the inverted result so the audit trail shows what we required. conditions.push({ ...r, matched: !r.matched }); if (r.matched) matched = false; diff --git a/experiments/rule-engine-poc/src/loader.ts b/experiments/rule-engine-poc/src/loader.ts index 5f645a536..bc7e70d62 100644 --- a/experiments/rule-engine-poc/src/loader.ts +++ b/experiments/rule-engine-poc/src/loader.ts @@ -12,9 +12,11 @@ export function loadRulesFromFile(filePath: string): LoadedRule[] { if (!Array.isArray(parsed)) { throw new Error(`Rule file ${filePath} must be a YAML list of rules`); } - return parsed.map((rule, index) => + const rules = parsed.map((rule, index) => normalize(rule as Rule, abs, index), ); + assertUniqueIds(rules, abs); + return rules; } export function loadRulesFromString( @@ -25,9 +27,21 @@ export function loadRulesFromString( if (!Array.isArray(parsed)) { throw new Error(`Rule source must be a YAML list of rules`); } - return parsed.map((rule, index) => + const rules = parsed.map((rule, index) => normalize(rule as Rule, sourceLabel, index), ); + assertUniqueIds(rules, sourceLabel); + return rules; +} + +function assertUniqueIds(rules: LoadedRule[], where: string): void { + const seen = new Set<string>(); + for (const r of rules) { + if (seen.has(r.id)) { + throw new Error(`Duplicate rule id '${r.id}' in ${where}`); + } + seen.add(r.id); + } } function normalize( diff --git a/experiments/rule-engine-poc/test/context-symlink.test.ts b/experiments/rule-engine-poc/test/context-symlink.test.ts new file mode 100644 index 000000000..b3262ea54 --- /dev/null +++ b/experiments/rule-engine-poc/test/context-symlink.test.ts @@ -0,0 +1,30 @@ +import { describe, expect, it } from "vitest"; +import { + mkdirSync, + mkdtempSync, + symlinkSync, + writeFileSync, +} from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { collectFiles } from 
"../src/context.js"; + +describe("collectFiles symlink handling", () => { + it("skips symlinks rather than following them (no infinite recursion on cycles)", () => { + const dir = mkdtempSync(join(tmpdir(), "rep-poc-sym-")); + writeFileSync(join(dir, "a.md"), "# A\n", "utf8"); + mkdirSync(join(dir, "sub")); + writeFileSync(join(dir, "sub", "b.md"), "# B\n", "utf8"); + // Create a cycle: dir/sub/loop -> dir + try { + symlinkSync(dir, join(dir, "sub", "loop")); + } catch { + // Some filesystems (e.g., some CI containers) reject symlinks. + // In that case the test is a no-op; the implementation is still safe. + return; + } + const files = collectFiles([dir], { baseDir: dir }); + const names = files.map((f) => f.relativePath).sort(); + expect(names).toEqual(["a.md", "sub/b.md"]); + }); +}); diff --git a/experiments/rule-engine-poc/test/engine.test.ts b/experiments/rule-engine-poc/test/engine.test.ts index 19f296261..41b569767 100644 --- a/experiments/rule-engine-poc/test/engine.test.ts +++ b/experiments/rule-engine-poc/test/engine.test.ts @@ -224,6 +224,81 @@ describe("exists operator interaction", () => { }); }); +describe("reason surfacing in audit trail", () => { + it("emits explicit reason when gt receives a non-number", () => { + const r = loadRulesFromString( + ` +- id: numeric + description: numeric + priority: 10 + when: + all: + - flag: count + gt: 0 + then: + verdict: blocked + weight: 1 + actions: [] +`, + "gt-non-number", + ); + const result = evaluate(r, { count: "five" as unknown as number }); + const ev = result.evaluations[0]!; + const cond = ev.conditions[0]!; + expect(cond.matched).toBe(false); + expect(cond.reason).toMatch(/expected number for 'gt'/); + }); + + it("emits explicit reason when regex receives a non-string", () => { + const r = loadRulesFromString( + ` +- id: regex + description: regex + priority: 10 + when: + all: + - flag: tag + regex: "^x" + then: + verdict: blocked + weight: 1 + actions: [] +`, + "regex-non-string", + ); + const 
result = evaluate(r, { tag: 42 as unknown as string }); + const cond = result.evaluations[0]!.conditions[0]!; + expect(cond.matched).toBe(false); + expect(cond.reason).toMatch(/expected string for 'regex'/); + }); + + it("does not silently fire when.not against a missing flag", () => { + const r = loadRulesFromString( + ` +- id: not-missing + description: blocks unless x is true + priority: 10 + when: + not: + - flag: x + eq: true + then: + verdict: blocked + weight: 1 + actions: [] +`, + "not-missing", + ); + // Before fix: rule would fire because the inner eq=true returns matched=false + // (missing), and not inverts to matched=true. Now we surface the missing reason. + const result = evaluate(r, {}); + expect(result.verdict).toBe("unknown"); + const cond = result.evaluations[0]!.conditions[0]!; + expect(cond.matched).toBe(false); + expect(cond.reason).toMatch(/flag missing/); + }); +}); + describe("when-clause operators", () => { it("matches 'any' when at least one condition matches", () => { const r = loadRulesFromString( diff --git a/experiments/rule-engine-poc/test/loader.test.ts b/experiments/rule-engine-poc/test/loader.test.ts index 16c4c64c4..b959459ee 100644 --- a/experiments/rule-engine-poc/test/loader.test.ts +++ b/experiments/rule-engine-poc/test/loader.test.ts @@ -347,6 +347,38 @@ describe("loader", () => { ).toThrow(/invalid regex/); }); + it("rejects rule files with duplicate rule ids", () => { + expect(() => + loadRulesFromString( + ` +- id: dup + description: a + priority: 10 + when: + all: + - flag: x + eq: true + then: + verdict: blocked + weight: 1 + actions: [] +- id: dup + description: b + priority: 5 + when: + all: + - flag: y + eq: true + then: + verdict: needs-attention + weight: 1 + actions: [] +`, + "dup-ids", + ), + ).toThrow(/Duplicate rule id 'dup'/); + }); + it("assigns a stable content hash to each rule", () => { const ruleA = loadRulesFromString( ` From 314032076cbdacd56fc1fc1082abf9bf2bca86a8 Mon Sep 17 00:00:00 2001 From: Claude 
<noreply@anthropic.com> Date: Sun, 17 May 2026 12:22:37 +0000 Subject: [PATCH 12/45] docs(rule-engine-poc): research wave 3 strategist re-evaluation (research/13) --- .../research/13-strategy-v2.md | 81 +++++++++++++++++++ 1 file changed, 81 insertions(+) create mode 100644 experiments/rule-engine-poc/research/13-strategy-v2.md diff --git a/experiments/rule-engine-poc/research/13-strategy-v2.md b/experiments/rule-engine-poc/research/13-strategy-v2.md new file mode 100644 index 000000000..a0c843a70 --- /dev/null +++ b/experiments/rule-engine-poc/research/13-strategy-v2.md @@ -0,0 +1,81 @@ +--- +title: Rule-engine POC — strategic re-evaluation (v2) +folder: experiments/rule-engine-poc/research +description: Second strategic pass after the workflow is concrete — config-driven targets, plan/report split, validate gate refusing verdict-shaped output, prompt-hash binding for stale-extraction detection, HTML reports. Re-checks JTBD, moat, migration story, packaging, North Star, brand alignment, and the manual-paste design choice. +status: draft +phase: discovery / frame (revisit) +author: product-strategist +supersedes_partial: research/03-positioning-jtbd.md (sections 1, 3, 5) +--- + +# 13 — Strategy v2: positioning after the workflow exists + +## Updated positioning (one sentence) + +**Specorator's rule engine is a portable contract that forces every AI-extracted JSON object to bind to the prompt that produced it before a deterministic verdict is allowed to render** — the audit trail is the product, the YAML DSL and HTML report are the surface, the manual paste step is a feature, not a regression. + +## Does the JTBD hold? + +Mostly — but the centre of gravity shifted. The first pass framed three JTBDs (reviewer/approver, OODA-Decide author, enterprise evaluator) and the wedge was "make agentic decisions auditable." 
After building the full loop, the load-bearing job is narrower and sharper: **"when an AI gave me structured output, I want to refuse to act on it unless I can prove it came from the prompt I currently have in source control."** Validate-gate-refuses-verdict-shape and prompt-hash-binding are not features layered on top of the auditability story — they *are* the story. JTBD #1 (reviewer/approver) and #3 (enterprise evaluator) survive intact. JTBD #2 (OODA-Decide) is now contingent on the OODA plugin existing and should be parked until that lands. A new JTBD has emerged that #1–#3 did not name explicitly: **"verify the extraction matches the source it was extracted from"** — this is the stale-extraction problem and is the strongest single justification for the engine's existence in a world where LLMs are cheap. + +## Competitive moat — pick one + +Every major framework now ships structured output (OpenAI strict mode, Anthropic tool-use JSON, Vercel AI SDK `generateObject`, LangGraph + Pydantic, CrewAI structured outputs). Structured output is table stakes. The differentiation that matters is **prompt-hash-binding plus a validate gate that refuses verdict-shaped fields** — i.e. the workflow shape, not the DSL and not the verdict tiers. + +- The YAML DSL is replaceable (json-rules-engine, JDM, JSON-Logic all work). Not a moat. +- Severity-first verdict tiers are a five-line convention. Not a moat. +- The audit trail is best-in-class but `json-rules-engine` is at parity for tree walks. Modest moat. +- **The plan/paste/report loop where the prompt hash binds to the extraction and the validate gate enforces "no verdict fields" — that is genuinely unusual.** No major framework forces this contract; they all let the model emit verdict-shaped fields and trust the caller to ignore them. + +Pick: **the workflow shape (prompt-hash binding + validate gate) is the moat.** Everything else is implementation. 
+ +## Migration story: paste -> API call + +The split between `plan` and `report` was built for exactly this. An adopter who likes the pattern but wants real API calls swaps the single stochastic step. Concretely: + +- `src/plan.ts` already writes a self-contained prompt to disk with a deterministic content hash. An adopter writes an alternative entry point — `src/orient.ts` — that takes the same prompt string, calls their chosen LLM with constrained decoding (OpenAI strict mode / Anthropic tool use / Gemini schema), writes the JSON to `extractions/<id>.json` with the same prompt-hash header. +- `src/report.ts` stays untouched. The validate gate, engine, audit trail, and HTML output do not know whether the JSON came from a paste or an API call. +- The migration is a one-file addition, not a refactor. `prompt-builder.ts`, `engine.ts`, `loader.ts`, `hash.ts`, `html-report.ts` all stay byte-identical. + +The seam is stable. Confidence: high — provided the prompt-hash header contract is documented as the public interface (it currently isn't, mark `TBD — owner: architect`). + +## Pricing and packaging + +Three viable paths, ranked by fit to Specorator's stated non-goals ("do not turn Specorator into a hosted SaaS"): + +1. **Specorator skill / track (recommended).** Ship the engine as an opt-in track under `.claude/skills/rule-engine/` or as a Stage 9 (Review) sub-skill. Value capture: increases adoption of the broader Specorator template by adding a governance surface. No direct revenue. Aligns with the Layer 0 Markdown-first ladder and the non-goal of hosted SaaS. +2. **Standalone open-source library (`@luis85/rule-engine`).** Carve out the engine + loader + validate gate into a separate package, depend on it from the Specorator track. Value capture: ecosystem reach (other agentic frameworks adopt the pattern). Cost: separate release surface, separate docs. Worth it only if external adoption signals appear after path 1. +3. 
**Paid SaaS (rule-pack hosting, GUI authoring, decision-log SIEM shipping).** This is GoRules' territory and Specorator's product steering explicitly forbids it ("do not turn Specorator into a hosted SaaS"). Skip unless the constitution changes. + +Recommendation: **path 1 now, path 2 if external pull appears, never path 3.** Path 1 justifies the build cost because the engine reinforces the existing audit/governance arc on the product ladder without expanding scope. + +## North Star check + +The first pass picked **verdict reproducibility rate**. With the validate gate and prompt-hash binding now built, that metric is 100% by construction and no longer falsifiable. It has become a unit-test invariant, not a North Star. + +The metric that now captures the most value is **extraction conformance rate** — the percentage of LLM extractions that pass the validate gate on first try (correct JSON shape, no forbidden verdict-shaped fields, prompt hash matches, all required-by-rule flags present). It is leading (drops before adoption drops), understandable, actionable (fixes route to prompt or schema), and measurable from the existing logs. Stale-extraction catch rate is a strong secondary — it directly measures the prompt-hash-binding feature's value, but it is too narrow to be the headline. + +**Recommendation: switch the North Star to extraction conformance rate.** Target ≥ 90% on first paste; ≥ 99% on second paste after the model sees its own error. Current value `TBD — owner: prototyper` (no measurement loop yet). + +## Brand alignment with Specorator + +Same brand, same product. Specorator's brand promise is **workflow discipline that survives contact with stochastic AI**. The rule engine extends that promise from "discipline around agent plumbing" to "discipline inside agent decisions."
The vocabulary already matches — "specs are the source of truth" in Specorator becomes "rules are the source of truth" in the rule engine; "EARS maps 1:1 to tests" becomes "flags map 1:1 to rule conditions"; "traceability matrix" becomes "audit trail." This is the same product framed at a different layer. No second-brand work needed. Risk: if the engine grows a GUI authoring surface (ZEN/JDM territory), the brand starts to fracture — flag for the Decider if that direction is ever explored. + +## Open question — is the manual paste step temporary or permanent? + +Surface this for the facilitator. The current docs frame paste as "lo-fi until the API integration is paid for." That is one reading. A second reading is **paste is the on-prem / air-gapped / sovereignty story** — the only mode where the LLM, the extraction, and the verdict can live on three different sides of a firewall and the audit trail still works because the prompt and the extraction are both files on disk. If permanence is the chosen frame: + +- The plan/paste/report loop becomes the public interface, not a stepping stone. +- The migration story above becomes an alternative mode rather than the default future. +- Marketing reads "your LLM stays where it is; we just bind the answer to the question" — a credible enterprise-evaluator pitch. +- Engineering work shifts: better prompt rendering, better validate-gate error messages, better stale-extraction diagnostics — not API client code. + +Pivot flag: if user research surfaces that adopters want the paste loop *because of* sovereignty, not despite friction, the strategic outcome shifts and the divergent-thinker should be re-engaged on what a "permanent lo-fi mode" UI looks like. + +## Riskiest commercial assumption (cross-check) + +The critic's riskiest *technical* assumption was "rule weights are uncalibrated." 
The riskiest *commercial* assumption is different and unaddressed: **that adopters will accept the manual paste step long enough for the rest of the pipeline to demonstrate value.** If they bounce at the paste step, the validate gate and the prompt-hash binding never get a chance to differentiate from any other structured-output framework. The RAT for this is a five-user usability study on the paste loop — owner: user-researcher, not prototyper. + +## Handoff + +To the facilitator: raise the paste-step permanence question with the Decider before Phase 3 converge. To the user-researcher: validate JTBD #1 + #3 and the new "verify extraction matches source" job; run the paste-step RAT. To the architect: document the prompt-hash header as a public contract before any API-mode migration ships. From d191ff8e2524d89f33bde8a30346fd9ce9c05970 Mon Sep 17 00:00:00 2001 From: Claude <noreply@anthropic.com> Date: Sun, 17 May 2026 12:24:42 +0000 Subject: [PATCH 13/45] fix(rule-engine-poc): recompute prompt hash on report (research/14) Closes the 'paste the sidecar to bypass staleness' cheat the critic flagged as the highest-leverage fix in the post-validate workflow. Previously: report.ts and validate-cli.ts read prompts/<id>.hash.txt (plain text) and trusted its value. An operator under deadline pressure could open the sidecar, copy the hash into the extraction's __prompt_hash field, and silently re-render a stale verdict. Now: report and validate-cli recompute the hash from current source files + rules + schema (same code path as plan.ts). The sidecar still gets written for diagnostic / debugging purposes, but it is never the authority for whether an extraction is stale. A real change to any source file invalidates the extraction automatically. Smoke-tested in this sandbox: - Source unchanged, paste-the-sidecar cheat -> exit 0 (correct; extraction is still valid against current source). 
- Source mutated, same paste-the-sidecar cheat -> stale-extraction error with both the pasted hash and the recomputed hash printed. --- experiments/rule-engine-poc/src/report.ts | 34 +++++++++++++++---- .../rule-engine-poc/src/validate-cli.ts | 20 ++++++++--- 2 files changed, 44 insertions(+), 10 deletions(-) diff --git a/experiments/rule-engine-poc/src/report.ts b/experiments/rule-engine-poc/src/report.ts index a65291980..e777585ee 100644 --- a/experiments/rule-engine-poc/src/report.ts +++ b/experiments/rule-engine-poc/src/report.ts @@ -6,12 +6,13 @@ import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs"; import { join, relative, resolve } from "node:path"; import { loadConfig, findTarget } from "./config.js"; +import { collectFiles } from "./context.js"; import { evaluate } from "./engine.js"; import { loadFlagSchema } from "./flag-schema.js"; import { loadRulesFromFile } from "./loader.js"; import { renderHtmlReport } from "./html-report.js"; import { openInBrowser } from "./open-browser.js"; -import { hashSidecarPath } from "./prompt-hash.js"; +import { computePromptHash } from "./prompt-hash.js"; import { validateExtraction } from "./validate.js"; import type { ExtractionFlags, Verdict } from "./types.js"; import type { Target } from "./config.js"; @@ -82,12 +83,33 @@ for (const target of targets) { continue; } - // Look up the sidecar prompt hash if plan wrote one. Missing sidecar - // = backwards-compat path (older extraction or single-shot test). + // Recompute the prompt hash from CURRENT source + rules + schema. + // Trusting only the sidecar (`prompts/<id>.hash.txt`) would let an + // operator under deadline pressure paste the sidecar value into the + // extraction's __prompt_hash to bypass staleness detection — the + // sidecar is plain text. Recomputing means a real source-file change + // always invalidates the extraction. Critic research/14 flagged this + // as the highest-leverage fix in the post-validate workflow. 
let expectedPromptHash: string | undefined; - const sidecar = hashSidecarPath(join(config.promptsDirPath, `${target.id}.md`)); - if (existsSync(sidecar)) { - expectedPromptHash = readFileSync(sidecar, "utf8").trim(); + try { + const files = collectFiles(target.paths, { baseDir: config.configDir }); + expectedPromptHash = computePromptHash({ + targetId: target.id, + files, + rules, + flagSchema: schema, + }); + } catch (err) { + // Sources unavailable (deleted folder, etc.) — fall through without + // hash binding rather than crash. validate will warn separately if + // __prompt_hash is set but unverifiable. + expectedPromptHash = undefined; + const msg = err instanceof Error ? err.message : String(err); + if (!quiet) { + console.warn( + `[report] ${target.id}: could not recompute prompt hash (${msg}); skipping staleness check`, + ); + } } if (!skipValidate) { diff --git a/experiments/rule-engine-poc/src/validate-cli.ts b/experiments/rule-engine-poc/src/validate-cli.ts index 6a12c5ca0..8593c4a98 100644 --- a/experiments/rule-engine-poc/src/validate-cli.ts +++ b/experiments/rule-engine-poc/src/validate-cli.ts @@ -7,8 +7,10 @@ import { existsSync, readFileSync } from "node:fs"; import { join, relative } from "node:path"; import { loadConfig, findTarget } from "./config.js"; +import { collectFiles } from "./context.js"; import { loadFlagSchema } from "./flag-schema.js"; -import { hashSidecarPath } from "./prompt-hash.js"; +import { loadRulesFromFile } from "./loader.js"; +import { computePromptHash } from "./prompt-hash.js"; import { validateExtraction } from "./validate.js"; import type { Target } from "./config.js"; @@ -26,6 +28,7 @@ const onlyTarget = takeOpt(argv, "--target"); const config = loadConfig(configPath); const schema = loadFlagSchema(config.flagSchemaPath); +const rules = loadRulesFromFile(config.rulesPath); const targets: Target[] = onlyTarget ? 
[findTarget(config, onlyTarget)] @@ -61,10 +64,19 @@ for (const target of targets) { worstExitCode = Math.max(worstExitCode, 2); continue; } + // Recompute the prompt hash from current source rather than trusting + // the sidecar — see report.ts and research/14. let expectedPromptHash: string | undefined; - const sidecar = hashSidecarPath(join(config.promptsDirPath, `${target.id}.md`)); - if (existsSync(sidecar)) { - expectedPromptHash = readFileSync(sidecar, "utf8").trim(); + try { + const files = collectFiles(target.paths, { baseDir: config.configDir }); + expectedPromptHash = computePromptHash({ + targetId: target.id, + files, + rules, + flagSchema: schema, + }); + } catch { + expectedPromptHash = undefined; } const result = validateExtraction( parsed as Record<string, unknown>, From 927dabc4efa7292be9fe9656ff85b084fab08b60 Mon Sep 17 00:00:00 2001 From: Claude <noreply@anthropic.com> Date: Sun, 17 May 2026 12:24:42 +0000 Subject: [PATCH 14/45] =?UTF-8?q?docs(rule-engine-poc):=20research=20wave?= =?UTF-8?q?=203=20=E2=80=94=20critic,=20sre,=20user-researcher?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - research/14 (critic): three new failure modes the validate gate opened; ranked --skip-validate, sidecar-paste cheat, and reproducibility theatre. Highest-leverage fix already landed in the previous commit. - research/15 (sre): CI integration sketch with concrete cost math (~$0.56/target, $1,700/month at 20 PRs/day on Opus 4.7) and a Day-1/30/90 operational milestones path. - research/16 (user-researcher): 5-segment JTBD switch interview plan with sequencing (mine demand signal first, S1 indie devs next, fail fast before S2-S5), full sample script, RAT integration. 
--- .../research/14-new-failure-modes.md | 219 ++++++++++++++++++ .../research/15-ci-operations.md | 148 ++++++++++++ .../research/16-jtbd-switch-interviews.md | 154 ++++++++++++ 3 files changed, 521 insertions(+) create mode 100644 experiments/rule-engine-poc/research/14-new-failure-modes.md create mode 100644 experiments/rule-engine-poc/research/15-ci-operations.md create mode 100644 experiments/rule-engine-poc/research/16-jtbd-switch-interviews.md diff --git a/experiments/rule-engine-poc/research/14-new-failure-modes.md b/experiments/rule-engine-poc/research/14-new-failure-modes.md new file mode 100644 index 000000000..f45658d91 --- /dev/null +++ b/experiments/rule-engine-poc/research/14-new-failure-modes.md @@ -0,0 +1,219 @@ +# 14 — New failure modes introduced by the validate gate + +Critic anti-validation pass on the post-PR-#525 state. The schema-miss and +stale-extraction defences from `research/07` are in `src/validate.ts` +(`FORBIDDEN_FIELDS`, `expectedPromptHash` check) and +`src/prompt-hash.ts` + sidecar wiring in `plan.ts:64-80` / +`report.ts:85-115`. Old failure modes are mitigated. Below: what we bought +ourselves by closing them. Note: RAT-A and RAT-B from research/07 have not +been run, so the operator-behaviour claims here are theoretical — flagged +where they are. + +## 1. `--skip-validate` is a one-keystroke laundering pipe + +`report.ts:39` accepts `--skip-validate` and `report.ts:106` advertises it +in the error message itself: *"re-run with --skip-validate to render +anyway"*. We built the gate and then printed the bypass next to it. The +slippery slope is short: + +1. Friday afternoon, demo at 4pm, validation errors on one rule change. +2. `--skip-validate` once "to get the demo out". +3. Aliased in someone's shell. Then in CI. Then in the runbook. +4. The flag never comes off because removing it now breaks a green build. 
+ +The escape hatch was correct to ship (operator override exists in every +serious gate) but it must be **noisy, logged, and surfaced in the HTML +report itself**. Today, a `--skip-validate` run produces HTML +indistinguishable from a validated run — no badge, no provenance line, no +audit-trail entry. That is the actual defect. Recommend: stamp +`validated: skipped` into the HTML provenance footer and refuse to write +the verdict tile in its normal colour when validation was bypassed. + +## 2. Hand-edited `__prompt_hash` — the binding is honour-system + +The hash sidecar lives at `prompts/<id>.hash.txt` (`prompt-hash.ts:34`). +Nothing prevents an operator from opening it, copying the hex, and pasting +it into `extractions/<id>.json` to silence the stale-extraction error in +`validate.ts:81-90`. The check at `validate.ts:81` is string equality — +there is no signature, no second factor, no record of who/when the hash +was put there. The binding is **ceremonial** the moment one developer +learns the trick. + +Detection options, in order of feasibility: + +- **Re-hash on report**: at `report.ts:88-91`, recompute the prompt hash + from current source files + rules + schema and compare to the value in + the extraction. If source content changed since the sidecar was + written, the recomputed value will differ from both the sidecar and the + pasted value, and we can refuse. This is cheap and closes the obvious + cheat. +- **HMAC the sidecar** with a per-repo secret: the operator can still copy + it, but at least we know the value was produced by `plan` and not typed. + Marginal value vs. complexity. +- **Behavioural**: log every extraction load with `(promptHash, mtime of + extractions/<id>.json, mtime of prompts/<id>.md)`. If extraction mtime + is **later** than prompt mtime by hours-to-days but the hash matches, the + hash was almost certainly hand-pasted. Surface as a soft warning. + +The recompute option is the one I'd ship. 
It converts the binding from +"trust the sidecar" to "the inputs themselves are the source of truth", +which is what we should have built originally. + +## 3. The dark matter: hash matches, flags are wrong + +This is the honest limit of the work. `validate.ts:93-125` checks **type +conformance and `allowed_values` membership** — it cannot check +*truthfulness*. An LLM that emits `ears_coverage: 0.85` when the real +value is `0.20` passes every defence we shipped: forbidden-fields list is +clean, prompt-hash matches, type is `number`, value is in +`[0..1]`. The deterministic layer then launders the hallucination into a +verdict with a clean audit trail. This is `research/07 §2 bullet 2` +**unaltered** — we did not fix it, we narrowed the visible surface around +it. + +This is probably not addressable inside the deterministic layer. The +mitigations live elsewhere: + +- **Cross-tool quorum** — run the same prompt through two models, require + agreement on numeric values within tolerance. Expensive; punts on + systematic shared bias. +- **Evidence-quoting requirement** — the prompt already asks the model to + cite quotes (per `research/10`). The validator could require every + numeric flag to be accompanied by a `<flag>_evidence: string` field with + a non-empty source-quote, and reject extractions where the quote string + is not a substring of any collected file. This is enforceable + deterministically and would be the single highest-value addition the + validator could grow. +- Otherwise: accept the limit, lower the trust ceiling, **stop calling the + report a verdict** — call it a draft assessment. + +## 4. `FORBIDDEN_FIELDS` will grow; nobody owns the list + +`validate.ts:16-24` is seven names today. The list is a moving target: + +- Claude 4.x adds `considerations`, `caveats`. +- GPT-5 adds `notes`, `confidence`, `risks`. +- Future tuned models add whatever their RLHF pass rewards. + +No test fails when a new verdict-shaped field slips through. 
The list +update is reactive — somebody notices a hallucination, files an issue, +adds a string. Governance: where does this list live in 6 months? Two +places that would help: (a) emit a *warning* (not error) for **any** +top-level field not in `flagSchema` (already done at `validate.ts:97-105` +as `unknown-flag`) and treat the trend as data — if "considerations" shows +up in 30% of extractions across a week, it gets promoted to +`FORBIDDEN_FIELDS`; (b) put a comment in `validate.ts` naming the owner +and review cadence. Today it's nobody's job, which means it's the next +incident's job. + +## 5. Trust over-shoot: green validate ≠ green verdict + +The most dangerous consequence of shipping validate. Today's flow: + +``` +ok = no forbidden fields + types match + values in allowed_values + hash matches +``` + +That set says **nothing** about whether the extraction is *correct*. But +the next time `[report] X: extraction passed validation` prints, the +reader's prior shifts. The validate gate becomes a quality signal it was +never designed to be. The HTML report compounds this — there is no visible +distinction between "the schema layer is happy" and "the verdict is +trustworthy", and within 20 runs nobody reads the trail (per `research/07 +§3`). + +Mitigation: the HTML provenance footer must say, in plain words, *"This +report confirms (a) the engine ran, (b) the flags conformed to schema, +(c) the extraction was produced against the current prompt. It does NOT +confirm the flag values are correct."* Three lines. If we can't fit them +above the fold, the verdict tile is too prominent. + +## 6. Reproducibility theatre + +`computePromptHash` (`prompt-hash.ts:22-32`) makes a verdict reproducible +against a `(sourceContent, rules, schema)` tuple. What it does **not** +make reproducible is the LLM extraction itself. Two paste-runs against +identical prompts can produce different JSON — different numeric values, +different presence/absence of optional flags. 
Same `promptHash`, possibly +different verdict. + +The asymmetry to be honest about: the hash proves **the input to the LLM +was the same**, not **the output was the same**. Marketing this as +"reproducible verdicts" oversells it. The correct phrasing is "verdict is +reproducible from a saved extraction; the extraction itself is not". A +later operator who reads "reproducible" in the docs and assumes they can +recreate yesterday's verdict by re-pasting will be unpleasantly +surprised on a Tuesday. + +Concrete fix: in the HTML provenance, surface **both** the promptHash +*and* a hash of the saved extraction JSON. Re-running `plan` + paste +should produce a matching promptHash but is **expected** to produce a +different extractionHash. If extractionHash is identical across two +paste-runs by different operators, that itself is the suspicious signal +(somebody cached or copied). + +## 7. Audit-trail fatigue + +The trail's value plateaus fast. Today's `quality-gates.yaml` has on the +order of 10 rules; each evaluates 1–5 conditions. So ~30 lines of trail +per report. As rule count grows to 50+ — the natural endpoint of "every +quality-framework concern gets a rule" — the trail crosses the 200-line +mark and becomes read-once-then-ignored. The trail is most useful when +the verdict is *unexpected*; on the 19 of 20 reports where the verdict +matches intuition, nobody opens it. + +The fix is not "shorter trails" — full traceability is required by +constitution Article V. The fix is **differential surfacing**: at the top +of the report, highlight the 1-3 conditions whose state *changed* since +the previous report for this target. The trail stays complete for +audit; the human reads only the delta. Without this, the trail's +deterrent value against trust over-shoot (§5) approaches zero by month +two. + +## 8. 
Three new RATs for the post-validate flow + +**RAT-D — `--skip-validate` is invoked at least once within 4 weeks of +real use.** +*Assumption:* the escape hatch is rare and well-justified. +*Falsification:* instrument `report.ts` to write a log line to a +gitignored audit file every time `--skip-validate` fires, with timestamp, +target id, and the validation errors that would have fired. **Refuted if +any single operator invokes `--skip-validate` more than twice in a 4-week +window, or if `--skip-validate` appears in a CI config, a Makefile, an npm +script, or a shell alias.** Cheap. Worth running from day one of any +multi-user pilot. + +**RAT-E — The prompt-hash binding catches a real hand-paste cheat.** +*Assumption:* the binding is a sufficient deterrent against the +"hand-edit the hash" workaround. +*Falsification:* without warning, modify one source file under a target, +then ask three operators to make the previously-passing report render +again under time pressure (15-minute deadline). **Refuted if any of three +operators copies the new sidecar hex into the existing extraction without +re-pasting the prompt.** If even one does, the binding is honour-system +and the recompute-on-report fix from §2 must ship before any further +trust claims. + +**RAT-F — Operators distinguish "validate ok" from "verdict trustworthy" +after 10 reports.** +*Assumption:* the gate does not produce trust over-shoot. +*Falsification:* after 10 reports per operator over 2 weeks, ask each +operator (a) what `[report] X: passed validation` actually checked, and +(b) whether they would ship based on a green verdict tile alone. +**Refuted if any operator describes validate as "checking the analysis" +or "confirming the verdict", or if more than 1 of 3 says they would ship +on the verdict tile without reading the trail.** This is the §5 test. If +refuted, the HTML provenance changes from §5 are not optional, they are +the next sprint. 
+ +--- + +**Recommendation.** The validate gate closed the loudest failure modes and +opened three quieter ones: a flagged escape hatch (§1), a ceremonial +binding (§2), and trust over-shoot (§5). The first two are engineering +fixes (stamp `validated: skipped`; recompute hash on report). The third +is a docs-and-UI fix and the most expensive to leave alone, because it +compounds. **Default verdict: ship the validate gate, but treat §1, §2, +and §5 as blockers on the "promote POC to production" decision** — not +on further POC iteration. RAT-D and RAT-F are afternoon-cost; run them +before any external pilot. diff --git a/experiments/rule-engine-poc/research/15-ci-operations.md b/experiments/rule-engine-poc/research/15-ci-operations.md new file mode 100644 index 000000000..cd66aa186 --- /dev/null +++ b/experiments/rule-engine-poc/research/15-ci-operations.md @@ -0,0 +1,148 @@ +--- +title: 15 — CI integration & operations sketch +folder: experiments/rule-engine-poc/research +description: Operational picture for graduating the plan / validate / report workflow from a paste-loop POC into a CI-integrated, API-backed, observable production capability. Costs, secrets, rate limits, alerting, retention, failure modes, and a Day-1/30/90 milestone path. +entry_point: false +--- + +# 15 — CI integration & operations sketch + +This is an SRE-perspective gap analysis, not a build plan. The POC ships three CLIs (`plan` exit `0`, `validate` exit `0/1/2`, `report` exit `0/1/2`), 88 tests, and a prompt-hash sidecar pinning each extraction to the prompt that produced it. What would running this on every pull request actually require? What's missing? + +## 1. 
CI integration shape + +**Minimum-viable GitHub Action (sketch, not committed):** + +``` +on: pull_request +jobs: + rule-engine: + runs-on: ubuntu-latest + permissions: { contents: read, pull-requests: write } + steps: + - checkout (full depth so target paths resolve) + - setup-node 22 + npm ci in experiments/rule-engine-poc + - npm run plan # writes prompts/*.md + .hash + - run extractor (API call, see §1.2) # writes extractions/*.json + - npm run validate # exit 1 -> fail job + - npm run report -- --no-open # exit 1 -> fail job; exit 2 -> error + - upload-artifact: prompts/, extractions/, reports/, hashes + - PR comment: link to the HTML report from artifact URL +``` + +The human-paste step lives **nowhere** in CI. The paste loop is the lo-fi developer-laptop version; CI calls a real LLM API. The split between `plan` (writes a prompt) and "something invokes the model" (writes an extraction) is exactly the seam the POC was designed around — see [`docs/workflow.md`](../docs/workflow.md) §"Why two commands and not one". + +**Swapping orient/extraction to an Anthropic API call** means one new script (call it `extract.ts`) that: + +1. Reads `prompts/<id>.md`. +2. Calls `messages.create({model: "claude-opus-4-7", max_tokens: 4096, system: "...", messages: [{role: "user", content: <prompt>}]})`. +3. Parses the text between `<output>` tags, runs `JSON.parse`. +4. Writes `extractions/<id>.json` + a `<id>.meta.json` sidecar with `{model, modelVersion, requestId, inputTokens, outputTokens, latencyMs, timestamp, promptHash}` (the model-invocation log Article 12 wants — see [`research/02-regulatory-auditability.md`](02-regulatory-auditability.md) §7). +5. Retries with backoff on `429` / `529`; gives up after N attempts. + +Constrained-decoding (`response_format` JSON-schema) lands as a `v2` of `extract.ts` once stable; until then the `validate` step is the safety net. + +## 2. 
Secrets + +- `ANTHROPIC_API_KEY` lives in GitHub Actions **org-level secrets**, scoped to the rule-engine repo set, never echoed to logs. The extractor reads `process.env.ANTHROPIC_API_KEY` and **asserts non-empty** before doing anything else. +- Token redaction: the extractor must scrub `Authorization`, `x-api-key`, `Bearer …` patterns from any error string before writing the meta sidecar or stderr. A `tests/redaction.test.ts` enforces this — easy to forget, easy to leak. +- Per [`docs/steering/operations.md`](../../../docs/steering/operations.md), prod-affecting secrets rotate on a calendar; the extractor must tolerate a 24h overlap window (two keys valid simultaneously). +- Audit logs themselves are not secret, but the *source files* fed into the prompt may contain pre-publication content. Treat the prompt body as the same sensitivity tier as the source files; do **not** archive prompts longer than extractions (see §7). + +## 3. Cost + +**Per-target back-of-envelope (Opus 4.7, projected 2026 pricing in the order of $15 / $75 per million input/output tokens — confirm at procurement time):** + +- One 121 KB prompt ≈ 121,000 chars ≈ **~30k input tokens** (4 chars/token rough rule). +- Output: extraction JSON for the current schema is small — a few dozen flags, generous estimate **~1.5k output tokens**. +- Per-call cost ≈ 30,000 × $15/1M + 1,500 × $75/1M ≈ **$0.45 + $0.11 = ~$0.56 per target**. +- Five targets/PR (current config has two; assume growth) ≈ **$2.80 per PR run**. +- 20 PR pushes/day × $2.80 ≈ **$56/day = ~$1,700/month** in extraction cost alone. Same order as a small SaaS line item. + +**Budget controls:** a `MAX_COST_PER_RUN_USD` env var that aborts before the (N+1)-th call when projected spend exceeds it; daily budget cap at the org level via Anthropic console; PR comment that surfaces token usage so cost drift is visible to the engineer who caused it.
+ +Cheaper-model tier (Haiku/Sonnet) for non-merge-blocking previews makes sense at a 10× cost reduction; gate on `pull_request.draft == true` or an `extraction-tier: cheap` PR label. + +## 4. Rate limits + +Anthropic tier-3 API limits are in the range of 4k RPM / 400k input tokens-per-minute (subject to change — verify at scale-out). The bottleneck for this workload is **input TPM** because each call is ~30k tokens. + +- 5 targets/PR × 30k = 150k input tokens to extract a single PR. One PR = ~38% of one minute's TPM budget at tier 3. +- 50 PRs/day × 5 targets × 1 reviewer-cycle = **250 extraction calls/day**, ~7.5M input tokens/day. Headroom on daily limits is fine; *bursts* are the risk (a force-push storm or a merge train). +- Throttling: a **per-repo concurrency lock** (GitHub Actions `concurrency: rule-engine-${{ github.repository }}` with `cancel-in-progress: true`) keeps only one extractor running at a time per repo. Queueing across repos belongs in a small worker tier (see §9). + +## 5. Observability + +- **Structured logs**: today the CLIs print human strings. Production needs a `--json-logs` flag emitting one NDJSON record per target with `{cli, targetId, verdict, durationMs, promptHash, rulesetHash, flagsHash, engineVersion, errorCode?}`. Ship to whatever log aggregator the org uses (Loki/Datadog/CloudWatch); the existing audit-trail fields ([`docs/audit-trail.md`](../docs/audit-trail.md)) are the schema. +- **Per-target latencies**: extractor latency (LLM call) is the only meaningful one; `plan`/`validate`/`report` are sub-second. Track p50/p95/p99 by model ID. +- **Verdict distribution over time**: a daily aggregate of `count by verdict by target by week` — the headline drift signal. Goes on a Grafana board next to the verify-gate KPIs. +- **JSON outputs as a metrics source**: yes, but the report HTML is not the right surface.
The eventually-needed `--json` mode for `report` (the engine already returns `VerdictResult`; just expose it) is the metrics source. Until then, log emission has to cover it. + +## 6. Alerting + +- **Page-worthy** (rare): repeated `validate` failures on `main` for >1h — means prompt + LLM drifted enough that extractions don't match the schema. Either the model regressed or the schema changed under us. +- **Page-worthy**: extraction-cost burn-rate exceeds 2× the 30-day baseline (catches a runaway loop or a prompt explosion). +- **Ticket-worthy** (next-business-day): verdict-distribution shift — e.g., `blocked` rate > 2σ from rolling baseline. Could be a real quality problem, could be prompt drift; either way investigate. +- **Ticket-worthy**: stale-extraction rate > 0 in any 24h window. The prompt-hash sidecar makes this measurable; an unmatched hash means a developer edited sources between `plan` and `report` and the extraction is from the wrong prompt. +- **Do not page** on `report` exit-code 1 (`blocked`) — that's a working alarm, not a system fault. The alarm is for the engineer whose PR got blocked, not for SRE. + +## 7. Storage / retention + +The POC writes everything under `experiments/rule-engine-poc/{prompts,extractions,reports}`. Three options for production: + +| Tier | Where | How long | Why | +|---|---|---|---| +| Hot | GitHub Actions artifacts | 30 days | CI default; cheap; URL link from PR comments | +| Warm | S3 + lifecycle rule | 13 months | Cheap; sufficient for Article 19 "at least 6 months" floor and "as long as system in service" intent for high-risk uses | +| Cold | S3 Glacier | 7 years | For Annex IV technical-file evidence; only verdicts on flagged PRs, not the everyday stream | + +What to store, minimally, per run: `prompts/<id>.md`, `prompts/<id>.hash.txt`, `extractions/<id>.json`, `extractions/<id>.meta.json` (the model-invocation log), `reports/<id>.html`, and the engine's `VerdictResult` as JSON.
The hash chain (rulesetHash, flagsHash, engineVersion, promptHash) is what makes any of this replayable later. + +For EU AI Act: a non-trivial chunk of this **only** matters once the system is classified high-risk. For internal-dev-quality-gate use, the POC is well below the Annex III line — but the architecture lets us flip the switch without a re-platform. + +## 8. CI-specific failure modes + +- **Ephemeral runners**: the prompt-hash sidecar (`prompts/<id>.hash.txt`) must travel with the extraction. If extractions are committed to branches, hashes must be too; otherwise they only co-exist on a single runner. Easier: never commit, always rebuild both in the same job. +- **Browser open in CI**: `report.ts` already supports `--no-open`. CI must always pass it; the `openBrowser: true` config default is laptop-friendly, CI-hostile. A second look at `open-browser.ts` for `CI=true` env auto-detect would be friendly. +- **Colored output in CI logs**: the CLIs don't appear to ANSI-colour today (good); keep it that way, or gate on `NO_COLOR`/`process.stdout.isTTY`. +- **Path resolution**: `paths: ["../../specs/..."]` is repo-relative and works fine in CI as long as `actions/checkout` ran at the repo root with `fetch-depth: 0` (for branch comparisons later). +- **Concurrent PR runs touching the same target**: each runner has its own `extractions/` so this is safe — but if the eventual worker tier shares storage, lock per `targetId`. + +## 9. Production diff from POC + +The POC does not yet have, and production needs: + +1. **API extractor** (`src/extract.ts`) replacing the manual paste. Model-invocation log written as a sidecar. +2. **Persistent state**: extractions in object storage, not just a runner tmpfs. +3. **Queued workers**: pulls extraction jobs off SQS / Pub/Sub; smooths bursts; isolates the API budget from CI job timeouts. +4. **Drift dashboards**: verdict distribution, validate-failure rate, stale-extraction rate, per-target latency, $/run. +5.
**Model A/B harness**: route X% of extractions to a candidate model, diff the resulting `flagsHash`; verdict-disagreement rate is the metric. Mirrors what [`research/02`](02-regulatory-auditability.md) §6 calls "flag-extraction calibration". +6. **Fairness audit cron**: monthly disparate-impact pass over historical extractions (per `research/02` §6 point 3). +7. **Rule-change on-call**: rules under `rules/quality-gates.yaml` get a CODEOWNERS gate + an ADR-shaped changelog. A "rule bump" is a production-affecting change (Article IX); past authorisation does not extend across edits. +8. **Runbook** for the four pageable conditions in §6 — none exists today; this file is upstream of writing one. + +## 10. Day-1 / Day-30 / Day-90 + +**Day 1** (graduate from `experiments/`): +- API extractor lives behind an `--api` flag on the existing `plan`/`extract`/`report` split. +- Single non-blocking GitHub Actions workflow comments on PRs with the verdict and a link to the HTML report artifact. +- `ANTHROPIC_API_KEY` provisioned; cost cap at $50/day org-wide; concurrency lock per-repo. +- NDJSON logs to stdout, scraped by whatever log pipeline already exists. + +**Day 30** (trust): +- Verdict + cost + latency dashboard live; verify the verdict distribution is stable across a fortnight before promoting to blocking. +- Promote to **required check** on one repo as a pilot. +- S3 bucket with 13-month lifecycle; extraction + meta + verdict JSON copied out of CI artifacts on every run. +- Page-worthy alerts (§6) wired to the on-call rotation; one runbook page per alert. + +**Day 90** (steady-state): +- Required check on all in-scope repos. +- Model A/B harness running on 5% of traffic; first calibration report published. +- Monthly fairness-audit job; results into the dossier folder. 
+- Rule-set under semver with signed releases; `as-of` lookup for old verdicts proven by a quarterly replay drill (re-run last quarter's blocked PRs against current rules + pinned engine, confirm the chain still resolves). + +## What this is not + +- **Not a commitment to ship.** This is the operational picture against which a build proposal would be sized. +- **Not the regulatory readiness story.** [`research/02`](02-regulatory-auditability.md) is. This file is downstream of those obligations. +- **Not validated against real cost data.** All dollar figures are 2026 list-price projections; replace with negotiated rates before budgeting. diff --git a/experiments/rule-engine-poc/research/16-jtbd-switch-interviews.md b/experiments/rule-engine-poc/research/16-jtbd-switch-interviews.md new file mode 100644 index 000000000..3298e6111 --- /dev/null +++ b/experiments/rule-engine-poc/research/16-jtbd-switch-interviews.md @@ -0,0 +1,154 @@ +# 16 — JTBD Switch Interview Design + +Field-research plan to validate (or refute) the three JTBDs sketched in +`research/03-positioning-jtbd.md` before more engineering goes into the rule-engine POC. +Pattern under test: **"LLM extracts structured flags; deterministic rules emit a verdict."** +Interviews must surface whether anyone is currently *struggling enough* with the alternative +to switch — not whether they think the idea is "cool" in the abstract. + +> Status: research plan only. No interviews have been conducted. Every quote in +> §6 is illustrative scaffolding so the interviewer knows what a validating +> vs. refuting answer sounds like; none are attributable to a real participant. + +## 1. Candidate segments and switch hypotheses + +| # | Segment | Hypothesised Push (pain today) | Hypothesised Pull (what the POC offers) | Why they'd switch | +|---|---|---|---|---| +| **S1** | **Indie hackers / solo devs running Claude Code daily on their own repos** | "Claude told me my spec is ready; was it really?" 
Re-reading the chat to second-guess. | A printable HTML verdict they can keep beside their git log. | Lowest-friction adopters — they already paste prompts. | +| **S2** | **Engineering managers / staff engineers running quality gates over team output** | Reviews of agent-generated PRs are still done by eyeball; no consistent rubric. | A rule pack the team agrees on once, fires automatically per PR/feature. | Highest *budget* for governance tooling; slowest to switch. | +| **S3** | **Compliance / risk officers in regulated industries (fintech, health, EU AI Act–exposed)** | "Show me your model-governance evidence" → screenshots of chats. | Replayable, hash-stamped audit trail mappable to ISO 42001 / EU AI Act articles. | Strongest *required* pull; weakest hands-on AI fluency. | +| **S4** | **DevRel / PMs documenting agentic workflow patterns** | Their own pattern posts get cargo-culted; no way to test if readers actually got the same answer. | A shareable rule pack + verdict that proves the pattern reproduces. | Multiplier segment — they evangelise to S1 and S2. | +| **S5** | **AI engineers building agent products who need audit trails for customers** | Customers ask "why did your agent recommend X?" → "the LLM decided" doesn't fly. | Drop-in decision layer that yields rule-firing provenance per call. | Highest *willingness to pay* if it works; but most likely to build their own. | + +**Most-likely-to-validate pick: S1.** They are the cheapest to recruit, they already +live inside the workflow the POC assumes (terminal + paste-into-LLM), and if *they* +don't feel the pull, the more demanding segments won't either. Full S1 script in §6. + +## 2. Sampling and recruitment + +Target **5 interviews per segment, 25 total**. Floor is 3/segment per the Sprint 2.0 +heuristic ([AJ&Smart](https://ajsmart.com/design-sprint-2-0/)); 5 hits Nielsen's ~85% +coverage. 
JTBD recommends 12–20 to saturation *per job*; since we are testing +three candidate jobs across five segments, 25 is the leanest credible budget. + +| Segment | n | Recruitment channel | +|---|---|---| +| S1 indie devs | 5 | Hacker News "Who is hiring/being hired"-style callout; `r/ClaudeAI`, `r/LocalLLaMA`, `r/SaaS`; Indie Hackers DM; replies to the original Reddit fact-checking thread. | +| S2 eng managers | 5 | LinkedIn warm intros via the maintainer's network; LeadDev Slack; Rands Leadership Slack. | +| S3 compliance | 5 | Hardest. ISACA chapters, EU AI Act practitioner LinkedIn groups, two specialist consultancies the maintainer can ask for one warm intro each. Pay-for-time may be required. | +| S4 DevRel / PMs | 5 | DevRel Collective Slack, Lenny's Newsletter community, Twitter/X DM to authors of recent "agentic workflow" posts. | +| S5 AI engineers | 5 | `r/LocalLLaMA`, `r/MachineLearning`, AI Engineer Discord, LangChain / LangGraph community channels. | + +**Hard rule:** zero participants from the maintainer's day-job team or the Specorator +contributor list. If recruitment for S3 fails outright, document that as a finding +("compliance signal is hypothesised but unverified") rather than substitute a friendly proxy. + +## 3. Switch-interview script structure (all segments) + +Standard Re:Wired / Strategyn switch story, ~45 min, recorded with consent. + +1. **First Thought** (5 min): *"Take me back to the first time you thought 'I wish I could prove why my AI tool gave me that answer.' What were you doing?"* — anchor a specific moment, not a generic opinion. +2. **Push (Pain)** (10 min): *"What about your current way of working pushed you to look for something better?"* Probe for the emotional core, not the feature wish. +3. **Pull (New way)** (10 min): *"When you imagined a better answer, what did it look like? Did you try anything? What happened?"* +4. **Anxiety (Holding back)** (10 min): *"What worried you about changing? 
What might break, what might you lose?"* This is where rule-engine adoption *actually* dies — surface it. +5. **Habit (Inertia)** (5 min): *"What's your current workaround? How long have you been doing it that way?"* Strong habit = high switching cost regardless of how good the pull is. +6. **Decision Trigger** (5 min): *"If you did switch, what would have to happen the week before? Who would have to say yes?"* — names the actual sale, not the user. + +Forces of progress must net out *Push + Pull > Anxiety + Habit* for a switch to be plausible. + +## 4. Validating vs refuting prompts (signal detection) + +These are the specific probes that distinguish a real JTBD signal from polite agreement. + +| Prompt | Validates if you hear… | Refutes if you hear… | +|---|---|---| +| "Tell me about the last time you wished you could show *exactly* why an AI tool gave a verdict." | A concrete recent moment, named stakes ("the customer asked", "audit was in three weeks"), already-tried workaround. | "Hmm, I can imagine that being useful." Generic. No specific incident. | +| "Walk me through what happens today when an agent's recommendation surprises you." | "I re-read the entire chat", "I ask it again hoping for a different answer", "I just override it without telling anyone". Frustration. | "I trust the model." Or: "I just don't use it for those decisions." (Means the JTBD doesn't exist for them.) | +| "If I gave you an HTML page that said 'verdict: blocked, because rule R-12 fired on flag `missing_changeset=true`', what would you do with it?" | "Send it to my PM / compliance / the dev who opened the PR." Names a downstream consumer. | "I'd still check the chat to be sure." (Means the audit trail isn't trusted, just decorative — the §07 trail-skipping failure mode pre-empted.) | +| "Who has to agree before you'd commit to running this every week?" | Names a real second human (manager, compliance, customer). | "Just me, I'd play with it." 
(Indie toy, not a JTBD — fine for S1, fatal for S2/S3/S5.) | +| "What would have to be true for you to *not* read the audit trail and just trust the green badge?" | Long, hesitant answer; calls out staleness, schema mismatch, model swap as required guarantees. | "If it looks right I'll trust it." (Means RAT-C / trust calibration is broken before they even start.) | + +## 5. Mapping interviews to the three RATs + +The interviews **cannot replace the RATs** but they can re-prioritise and pre-falsify them. + +- **RAT-A (schema coverage).** Ask every participant to list the 5 flags they'd most want extracted from their own work artefacts. Compare against `rules/flag-schema.yaml`. **Refuted** if <60% of named flags exist in the schema and <30% can be added with one-line YAML. **Action:** if refuted, the schema is too quality-gates-shaped for the segments who would actually pay. +- **RAT-B (trust comparison).** Show two verdicts on the same artefact: one a plain LLM judgement, one a rule-engine HTML. Don't tell them which is which until they've stated which they'd act on. **Refuted** if <3 of 5 per segment pick the rule-engine output, *or* if they pick it but say "because it's prettier." +- **RAT-C (authoring time).** Hand a non-rules-expert the DSL reference and ask them to write 5 new rules covering their own domain. **Refuted** if median time >30 min or <3 of 5 rules pass `npm run report` first try. Authoring difficulty is the hidden killer — if only the maintainer can write rules, the rule pack becomes the bottleneck instead of the LLM. + +Two interviews can include hands-on RAT-A/RAT-B exercises in the same session. +RAT-C wants its own 1-hour session. + +## 6. Cheap pre-research (mine before recruiting) + +Before paying for 25 hours of interviews, mine signal that's already on the public internet. 
+ +- The **origin Reddit thread** on r/artificial fact-checking — re-read every top-level comment for switch-language ("I used to…", "the problem with just asking the LLM is…"). +- **r/LocalLLaMA, r/ClaudeAI, r/ChatGPTCoding** — search `"audit trail" agent`, `"deterministic" LLM`, `"why did" claude said`, `"trust the model"`. Note every comment that names a *moment* of distrust, not a feature wish. +- **GitHub Issues across `langchain`, `langgraph`, `crewai`, `inngest agent-kit`, `openai-agents`** — search for issues mentioning `audit`, `replay`, `deterministic`, `reproducible verdict`. Existing issues = pre-existing demand. +- **Hacker News searches** via `hn.algolia.com` for `LLM judge`, `LLM as critic`, `replay agent decision`. The volume + tone of replies tells you whether the JTBD is felt or just admired. +- **EU AI Act / ISO 42001 community forums** — searches on the IAPP and ISACA forums for `"AI agent" governance evidence`. If S3 has zero hits, S3 is theoretical not real. +- **The Codex / Claude Code / Cursor changelogs** — every change that adds "explain why" or "trace" is a competing answer to the same JTBD. List them. +- **Reverse Twitter/X search** for `"told me my code is fine"` + `claude OR chatgpt OR cursor` — captures the after-the-fact "I shouldn't have trusted it" tweet. + +Pre-research output: a 1-page **demand-signal log** with ≥ 15 dated quotes, each tagged +to a segment and a JTBD candidate. If the log is empty, the JTBD is conjectural and +interviews will be confirmation theatre. + +## 7. Failure criteria (the kill switch) + +Decision rule the facilitator should commit to *before* the first interview, per §3.6 of the Discovery Track. + +- **Go ("JTBD is real, build more"):** ≥ 3 of 5 interviews in **at least two of S1/S2/S5** surface a concrete recent struggling moment, name a downstream consumer for the verdict, *and* would re-use the tool weekly without prompting. RAT-A must not be refuted. 
+- **Pivot:** signal is strong in one segment only (e.g. only S3), or the named JTBD is adjacent but not the one in `research/03-positioning-jtbd.md` (e.g. "I want help authoring rules" rather than "I want trustworthy verdicts"). Re-frame and re-run Phase 1 with the new framing. +- **No-go ("JTBD is weak, stop"):** <3 of 5 surface a struggling moment in **any** segment, or the dominant theme is "the LLM is fine, I just override it." The POC stands as an engineering exercise; no further investment. + +Inconclusive (e.g. recruitment failed for S3) is **not** a Go. Mark and stop. + +## 8. Sample full interview script — S1 (indie hacker / solo dev) + +**Recruitment line (DM/email):** +> *"I'm researching how solo devs working with Claude Code / Cursor decide when to trust an AI-generated recommendation. 45 minutes, recorded, no sales pitch, $50 gift card. Interested?"* + +**Pre-interview:** confirm they ship code with AI assistance weekly; confirm they are not on the Specorator contributor list. + +**Opening (2 min):** *"Thanks for the time. I'm not going to show you anything I built — I want to understand how you currently work. Everything you say is anonymised. Is recording OK?"* + +**First Thought (5 min):** +- *"Walk me through the last time an AI tool gave you a recommendation about your own code or spec and you weren't sure whether to trust it."* +- Probe: *When was that? What were you working on? What did the AI actually say?* +- Probe: *What did you do in the next 60 seconds?* + +**Push (10 min):** +- *"What did that feel like in the moment?"* +- *"Has that happened before? How often?"* +- *"What's the worst version of that you remember — when not trusting it (or trusting it wrongly) actually cost you something?"* +- *"What were you using before you used [Claude Code / Cursor]? 
Was this better or worse?"* + +**Pull (10 min):** +- *"If you could rewind that moment and have anything appear on your screen to help you decide, what would it be?"* +- *"Have you tried building or finding anything like that? What happened?"* +- (If they describe something rule-like) *"Walk me through how you'd want to author the thing that decides."* + +**Anxiety (10 min):** +- *"If someone gave you a tool that produced a verdict — green / red / yellow — on your spec or code, what would worry you about using it?"* +- *"What's the worst way that tool could be wrong?"* +- *"Whose problem would it become if it was wrong?"* + +**Habit (5 min):** +- *"What's your actual current workaround? Walk me through what you did yesterday."* +- *"How long have you been doing it that way?"* + +**Decision trigger (5 min):** +- *"If you were going to add a new tool to your weekly workflow, what would have to be true? Who, if anyone, would have to OK it?"* +- *"What would make you abandon it after week 2?"* + +**Closing probe (the RAT-B card-sort, 5 min):** +- Show two artefacts side-by-side: a Claude chat saying "your spec looks ready to implement" *vs* a one-page HTML showing `verdict: ready-to-progress`, the firing rule, and the flags. Don't say which you built. +- *"If both of these came to you on the same morning, which would you act on? Why? What would you do with the other one?"* + +**Score immediately after** (interviewer, not participant): Push 1–5, Pull 1–5, Anxiety 1–5, Habit 1–5. A switch is plausible only when (Push + Pull) − (Anxiety + Habit) ≥ +3. + +--- + +**Recommendation to facilitator:** mine the pre-research log first (§6, half a day). If it produces < 15 dated demand-signal quotes across S1/S2/S5, **do not run interviews yet** — the framing in `research/03-positioning-jtbd.md` may need a Pivot pass before recruiting. 
If the log clears, run S1 first (cheapest, fastest), make the Go/Pivot/No-go call after the fifth S1 interview, and only commit to S2–S5 if S1 produces a Go signal. From 0509c11445fe75ec9060453d270fed10049eacd5 Mon Sep 17 00:00:00 2001 From: Claude <noreply@anthropic.com> Date: Sun, 17 May 2026 12:26:44 +0000 Subject: [PATCH 15/45] docs(rule-engine-poc): research wave 3 reviewer re-review + typo fixes - research/12 (reviewer): independent re-review at HEAD. Verdict pass-with-findings. S2-1 (docs drift: workflow.md still lists validate gate as 'not yet here' despite shipping), S2-2 (sidecar deletion bypasses prompt-hash binding entirely), S2-3 (--skip- validate is undocumented), and an S3 cluster on test count drift, HTML provenance, and type-mismatch error messages. - Fix two typos caught by CI spell check (typos v1.46.0): research/16 'pre-empted' -> 'confirmed' (reads more clearly anyway), research/12 'ci_passsing' -> 'ci_passingx' (illustrative typo recast to avoid typos-tool false positive). --- .../research/12-rereview-at-head.md | 78 +++++++++++++++++++ .../research/16-jtbd-switch-interviews.md | 2 +- 2 files changed, 79 insertions(+), 1 deletion(-) create mode 100644 experiments/rule-engine-poc/research/12-rereview-at-head.md diff --git a/experiments/rule-engine-poc/research/12-rereview-at-head.md b/experiments/rule-engine-poc/research/12-rereview-at-head.md new file mode 100644 index 000000000..3b069689a --- /dev/null +++ b/experiments/rule-engine-poc/research/12-rereview-at-head.md @@ -0,0 +1,78 @@ +--- +title: 12 — Re-review at HEAD (post-validate-gate, post-prompt-hash) +folder: experiments/rule-engine-poc/research +description: Second independent reviewer pass at HEAD of claude/rule-engine-poc-gO5yq (PR #525) after the validate gate, prompt-extraction binding, and Codex round-7 polish landed. Focused on the new modules and what slipped. 
+entry_point: false +--- + +# 12 — Re-review at HEAD + +Scope: HEAD of `claude/rule-engine-poc-gO5yq` after `cc00478` (Codex round 7 + 06-review S2/S3 fixes), `a9b1db9` (prompt-extraction binding), and `3dc1034` (validate gate). All 88 tests across 10 files pass locally in ~1.4s. The 06-review S2/S3 findings (`not` + missing flag, `gt/lt` reasons, duplicate ids, `description`/`stage`/`tags` hash docs, `in: []`) are verified resolved at HEAD and are not re-flagged. This pass focuses on the new validate + prompt-hash code, integration seams, and documentation drift. + +## Verdict — **pass-with-findings** + +The new defensive layer is the right shape: forbidden-fields list, type-checking, hash-binding-as-magic-field, and an early refusal in `report.ts` before any HTML is rendered. The engine is materially stronger than it was at the 06-review. What remains is (a) a real **bypass surface** around the hash sidecar that defeats the staleness guard, (b) substantial **documentation drift** — the workflow doc still tells the reader the validate gate and stale-extraction guard *do not exist* — and (c) a few **integration seams** that no test exercises end-to-end. None of these defeats the demo; all of them would mislead the first external user. + +## Findings + +### S2 — high (would mislead a user or undermine the guard the critic asked for) + +**S2-1 — `docs/workflow.md` still advertises the validate gate and stale-extraction check as not-yet-implemented.** `docs/workflow.md:138-142` "What's not yet here" lists `npm run validate`, the schema-validate gate, and stale-extraction detection as future work. All three are present at HEAD (`src/validate-cli.ts`, the gate inside `src/report.ts:93-115`, the prompt-hash binding in `src/prompt-hash.ts` + `src/plan.ts:80` + `src/report.ts:87-91`). A reader following `docs/workflow.md` will not run `npm run validate`, will not know `--skip-validate` exists, and will not understand why their pasted JSON is rejected. 
This is the single largest user-facing defect at HEAD. + +**S2-2 — Deleting the hash sidecar silently disables the staleness guard.** `src/report.ts:87-91` and `src/validate-cli.ts:65-68` both guard the hash check with `if (existsSync(sidecar))`. Comment says "missing sidecar = backwards-compat path." The threat model from research/07 §2 ("Stale extractions … no content-hash check … the silent-failure mode that kills the audit-trail claim") is defeated by `rm prompts/<id>.hash.txt`. A bad-faith user does not need to hand-edit `__prompt_hash`; they just delete the sidecar. Both layers (`validate-cli` and `report`) accept the unguarded extraction. The forcing function should be the reverse: refuse to validate/report when the sidecar is missing **and** the extraction was produced by `npm run plan` (i.e. when prompt exists), and only fall through silently when *neither* the prompt nor the sidecar exists (single-shot fixture path). Today there is no way to tell those two cases apart in `report.ts`. + +**S2-3 — `--skip-validate` is undocumented, has no warning at use, and lands in no place a future user will discover.** The flag exists in `src/report.ts:39` and is mentioned exactly once — in the error message printed *after* a validation failure (line 105). It is not in `README.md`, not in `docs/workflow.md`, and has no `--help` / usage banner. The constitution's intent (Article IX — reversibility, ask before bypassing) is partly honoured because the user has to opt in, but the affordance is exactly the "fix this JSON for me" anti-pattern research/07 §6 warns against once it becomes habitual. Minimum bar: document the flag explicitly, surface a one-line warning on stderr each time `--skip-validate` runs (so the bypass is visible in CI logs), and refuse to combine it with the upcoming staleness path. Better still: rename to `--allow-stale-extraction` (or similar) so the flag self-describes the trade-off. 
+ +### S3 — medium (drift, gaps, friction) + +**S3-1 — `README.md` claims 60 tests; vitest reports 88.** `README.md:50` ("60 tests in <1s") and `README.md:113` ("60 tests across …") are stale; the actual count is 88 across 10 files. Cosmetic, but the README is the front door. + +**S3-2 — `docs/README.md` claims "five briefs" of research; there are 11 (plus 13-16).** `docs/README.md:20` undersells the work and points the reader at an outdated framing. Adopters who follow the doc map will miss the entire workflow-risks / extraction-prompts / adoption-revisit thread. + +**S3-3 — No integration test for the `plan -> validate -> report` pipeline.** The 88 tests cover every leaf module (validate, prompt-hash, loader, engine, context, config, prompt-builder, html-report indirectly). Zero tests exercise `plan.ts`, `validate-cli.ts`, or `report.ts` end-to-end. The seams are exactly where the bugs in S2-1 and S2-2 live, and no regression test would have caught either. A single fixture-based integration test (tmpdir, write rules + schema + tiny target file, run plan, hand-write a clean extraction, run validate, run report, assert exit codes + sidecar presence + report contents) would close a real coverage hole. + +**S3-4 — HTML report does not surface the prompt-hash or staleness state.** `src/html-report.ts:261-270` lists `engineVersion`, `rulesetHash`, `flagsHash`, rules path, flags path — but not the prompt hash the extraction was validated against. Research/07 §3 ("verdict tile must surface staleness signal at same visual weight as the verdict") is unmet. Even a successful validation produces a report indistinguishable from one rendered with `--skip-validate` against a stale extraction. Minimum: a "Validated against prompt `<hash12>`" row in Provenance; better: a green/amber banner near the verdict. + +**S3-5 — `validate-cli` exit-code mapping is asymmetric with `report`.** `validate-cli.ts` returns `2` for missing/malformed JSON and `1` for any errors. 
`report.ts` returns `2` for missing extraction *or* failed validation, and `1` only for an actual `blocked` verdict. A CI gate that runs `npm run validate && npm run report` is consistent, but a user who runs only `report` cannot tell from the exit code alone whether the failure was structural (`2`) or semantic (`1`). The error stream distinguishes them; the exit code does not. Document the contract in `docs/workflow.md`'s exit-code table (which today only covers `report`). + +**S3-6 — `validate-cli` warnings on unknown flags do not affect exit code, but the user gets no flag to escalate warnings to errors.** A typo in an extraction (`ci_passingx: true` instead of `ci_passing: true`) produces an `unknown-flag` warning and exits 0. The engine then sees no `ci_passing` flag and the rule does not fire — a silent miss in the verdict that the validator *did* notice but chose not to block on. Add an opt-in `--strict` flag or change the default; otherwise the validator is advisory in exactly the case it should be load-bearing. + +**S3-7 — Type-mismatch error messages are clear for primitives but terse for the failing path.** `src/validate.ts:112` `Flag 'X' expected 'boolean', got string.` Fine for a developer; for the LLM-paste user the more useful payload is the observed *value*. Compare `disallowed-value` (line 122) which renders `JSON.stringify(value)`. Symmetry: include `got ${typeof value} (value: ${JSON.stringify(value)})` so the user can see what came back from the LLM. + +**S3-8 — `validate.ts:131` accepts `null` as "unknown" for any type, including `string[]`.** Documented in the test (`accepts null as 'unknown' for any type`) and the test passes for `ci_passing: null` and `approvals_count: null`. But the prompt template tells the LLM to **omit** a flag it cannot determine (`prompt-builder.ts:75`, rule #4). 
`null` and "missing" should mean the same thing to the engine; today they do not — `null` is present-but-typed-as-unknown, `missing` triggers `flag missing in extraction` in the engine's `evaluateCondition`. Either reject `null` in the validator (consistent with "omit it") or document that `null` is the explicit-unknown sentinel and update the engine's missing-flag handling to treat `null === missing`. + +### S4 — low (polish) + +**S4-1 — `prompt-hash.ts` hashes `rule.hash` strings rather than the rule content directly.** Functionally identical (rule.hash is itself a sha256 of canonicalJson), but a reader auditing the prompt-extraction binding has to chase one indirection to satisfy themselves the hash actually covers what the docstring claims. A one-line comment in `prompt-hash.ts:28` ("`r.hash` is the rule-level canonical-content hash from loader.ts; see types.ts LoadedRule") would close the gap. + +**S4-2 — `hashSidecarPath` uses string replace, not path-aware swap.** `src/prompt-hash.ts:35` does `replace(/\.md$/, ".hash.txt")`. A target id containing `.md` in a stem segment (e.g. `prompts/foo.md.bak.md`) would still work, but a misnamed input like `prompts/notes.md.md` would produce `prompts/notes.md.hash.txt`, leaving the first `.md` intact. Edge case; harmless given the call site always passes `<id>.md`, but switching to `path.join(dirname, basename(p, '.md') + '.hash.txt')` makes the invariant explicit. + +**S4-3 — `validate-cli` does not check that the sidecar's hash is well-formed.** `readFileSync(sidecar, 'utf8').trim()` is passed straight to `validateExtraction`. A garbled or zero-byte sidecar produces a `stale-extraction` error rather than a "your sidecar is corrupt" error. Loud-fail is acceptable; just note the case so a future user does not chase the wrong tail. 
+ +**S4-4 — `engine.ts:30-37` `deepEqual` still does not handle plain objects, and `FlagValue` still excludes them.** Carry-over from 06-review S4-3; same recommendation: throw an explicit "object flag values not supported" at the type boundary, or extend `FlagValue` once the OPA-style nested-fact case arrives. Low priority while extractions stay flat. + +**S4-5 — `validate.test.ts` is 12 tests, 100% of the validator's surface, but no test for the `expectedPromptHash: ""` edge case** (empty-string hash). Likely impossible to produce in practice (sha256 always returns 64 hex), but the type permits it and the `=== options.expectedPromptHash` check would pass on `{ __prompt_hash: "" }` against `expectedPromptHash: ""`. Either reject empty strings explicitly or accept the existing trust on the producer. + +## Integration check — does plan → paste → validate → report hang together? + +Manual trace, no fixture run (the workflow involves a real LLM paste): + +1. `npm run plan` writes `prompts/<id>.md` **and** `prompts/<id>.hash.txt`. Pass. +2. User pastes prompt into AI tool. Prompt embeds `__prompt_hash` in three places (`<!-- prompt-hash -->`, the rules section, the response template's first key). LLM-conformance odds are reasonable. +3. User saves JSON to `extractions/<id>.json`. The `__prompt_hash` field is the LLM's responsibility to copy; the design is correct and the forbidden-fields list explicitly excludes `__prompt_hash` from the "you shall not emit" list. +4. `npm run validate` reads sidecar, validates extraction, returns 0/1/2. Pass. +5. `npm run report` re-reads sidecar, re-validates (so validate is advisory, not mandatory — `report` is self-defending). Pass. + +The seams are clean *except* the S2-2 sidecar-delete bypass and the S2-1 doc gap. The architecture is right. + +## What this re-review does not cover + +- The 06-review's S3-3 (stage-ordering DSL primitive) is unaddressed at HEAD. Out of scope for this pass; revisit when rules are calibrated. 
+- The 06-review's S3-7 (`flags` not included verbatim in `VerdictResult`) is unaddressed; still a doc-vs-artifact mismatch in `docs/audit-trail.md:96-105`. +- The 06-review's S4-1 (`ENGINE_VERSION` bumped without changelog) — still no `CHANGELOG.md`. + +These are not new findings; they are simply not the focus of this pass. + +## Recommendation + +Two fixes are pre-merge: S2-1 (rewrite `docs/workflow.md`'s "What's not yet here" section and add a `## 2. Validate the extraction` section between paste and report) and S2-2 (refuse to operate when the prompt exists but the sidecar is missing; allow the no-sidecar path only when no prompt exists). S2-3, S3-1, S3-2, S3-4 are doc/UX fixes that should ride the same PR — they are all one-line edits. The S3 and S4 set is a one-afternoon polish pass that does not block. diff --git a/experiments/rule-engine-poc/research/16-jtbd-switch-interviews.md b/experiments/rule-engine-poc/research/16-jtbd-switch-interviews.md index 3298e6111..00fbe12ec 100644 --- a/experiments/rule-engine-poc/research/16-jtbd-switch-interviews.md +++ b/experiments/rule-engine-poc/research/16-jtbd-switch-interviews.md @@ -64,7 +64,7 @@ These are the specific probes that distinguish a real JTBD signal from polite ag |---|---|---| | "Tell me about the last time you wished you could show *exactly* why an AI tool gave a verdict." | A concrete recent moment, named stakes ("the customer asked", "audit was in three weeks"), already-tried workaround. | "Hmm, I can imagine that being useful." Generic. No specific incident. | | "Walk me through what happens today when an agent's recommendation surprises you." | "I re-read the entire chat", "I ask it again hoping for a different answer", "I just override it without telling anyone". Frustration. | "I trust the model." Or: "I just don't use it for those decisions." (Means the JTBD doesn't exist for them.) 
| -| "If I gave you an HTML page that said 'verdict: blocked, because rule R-12 fired on flag `missing_changeset=true`', what would you do with it?" | "Send it to my PM / compliance / the dev who opened the PR." Names a downstream consumer. | "I'd still check the chat to be sure." (Means the audit trail isn't trusted, just decorative — the §07 trail-skipping failure mode pre-empted.) | +| "If I gave you an HTML page that said 'verdict: blocked, because rule R-12 fired on flag `missing_changeset=true`', what would you do with it?" | "Send it to my PM / compliance / the dev who opened the PR." Names a downstream consumer. | "I'd still check the chat to be sure." (Means the audit trail isn't trusted, just decorative — the §07 trail-skipping failure mode confirmed.) | | "Who has to agree before you'd commit to running this every week?" | Names a real second human (manager, compliance, customer). | "Just me, I'd play with it." (Indie toy, not a JTBD — fine for S1, fatal for S2/S3/S5.) | | "What would have to be true for you to *not* read the audit trail and just trust the green badge?" | Long, hesitant answer; calls out staleness, schema mismatch, model swap as required guarantees. | "If it looks right I'll trust it." (Means RAT-C / trust calibration is broken before they even start.) 
| From 827e3b3fec8270d5151881189a1dde68373c2ca8 Mon Sep 17 00:00:00 2001 From: Claude <noreply@anthropic.com> Date: Sun, 17 May 2026 12:34:16 +0000 Subject: [PATCH 16/45] docs(rule-engine-poc): consolidated architecture doc with mermaid diagrams Replaces the engine-internals-focused architecture.md with a comprehensive system view covering: - System overview (component flowchart) - User flow (sequence diagram across plan/AI/validate/report) - Data flow (annotated with data shapes at each seam) - Engine internals (evaluate algorithm + per-condition + severity picker) - Validate gate + prompt-hash binding (sequence) - OODA mapping (Observe/Orient/Decide/Act with stochasticity boundary) - Module dependency graph (16 src/ modules) - Why these shapes (design choices + research refs) Seven Mermaid diagrams. docs/README.md now points to architecture.md as the start-here entry. --- experiments/rule-engine-poc/docs/README.md | 3 +- .../rule-engine-poc/docs/architecture.md | 398 +++++++++++++++--- 2 files changed, 343 insertions(+), 58 deletions(-) diff --git a/experiments/rule-engine-poc/docs/README.md b/experiments/rule-engine-poc/docs/README.md index 2ebe0e21e..34b2baf86 100644 --- a/experiments/rule-engine-poc/docs/README.md +++ b/experiments/rule-engine-poc/docs/README.md @@ -11,7 +11,8 @@ Detailed documentation for the POC. Start with the project [README](../README.md | Doc | Read when | |---|---| -| [`architecture.md`](architecture.md) | You want to understand how the engine is built and why it produces the same answer every time. | +| [`architecture.md`](architecture.md) | You want the full architecture picture — system map, user flow, data flow, engine internals, module graph, with Mermaid diagrams. **Start here.** | +| [`workflow.md`](workflow.md) | You want to run the `plan` → paste → `validate` → `report` loop end-to-end. 
| | [`dsl-reference.md`](dsl-reference.md) | You're writing or reading a rule file and need the full YAML grammar — every operator, every grouping construct. | | [`audit-trail.md`](audit-trail.md) | You need to replay a verdict, diff two verdicts, or map the audit trail to EU AI Act / ISO 42001 requirements. | | [`extending.md`](extending.md) | You want to add a rule, add a flag, point the engine at a new domain, or run the tests. | diff --git a/experiments/rule-engine-poc/docs/architecture.md b/experiments/rule-engine-poc/docs/architecture.md index abcd41302..ddba61882 100644 --- a/experiments/rule-engine-poc/docs/architecture.md +++ b/experiments/rule-engine-poc/docs/architecture.md @@ -1,93 +1,377 @@ --- title: Architecture folder: experiments/rule-engine-poc/docs -description: Component layout, evaluation algorithm, and determinism strategy for the rule engine POC. +description: System overview, user flow, data flow, engine internals, and module dependencies for the rule-engine POC. Mermaid diagrams throughout. entry_point: false --- # Architecture -How the engine is organised and why it produces the same answer every time. +A complete picture of the rule-engine POC: how the moving parts fit, how data flows between them, what the user does at each step, and what the engine does inside. -## Module layout +## Contents +1. [System overview](#1-system-overview) +2. [User flow](#2-user-flow) +3. [Data flow](#3-data-flow) +4. [Engine internals](#4-engine-internals) +5. [Validate gate + prompt-hash binding](#5-validate-gate--prompt-hash-binding) +6. [OODA mapping](#6-ooda-mapping) +7. [Module dependency graph](#7-module-dependency-graph) +8. [Why these shapes](#8-why-these-shapes) + +--- + +## 1. System overview + +The POC is three CLIs (`plan`, `validate`, `report`) over a deterministic core (`engine` + `loader`), plus the supporting modules that prepare extraction prompts and render HTML reports. 
+ +```mermaid +flowchart LR + subgraph IN["Inputs (on disk)"] + CFG["rule-engine.config.json"] + RULES["rules/quality-gates.yaml"] + SCHEMA["rules/flag-schema.yaml"] + SRC["target source files<br/>(e.g. specs/<slug>/)"] + end + + subgraph CLI["CLIs"] + PLAN["npm run plan"] + VAL["npm run validate"] + REP["npm run report"] + ONE["src/cli.ts<br/>single-shot"] + end + + subgraph WORK["Workspace (on disk)"] + PMT["prompts/<id>.md<br/>+ <id>.hash.txt"] + EXT["extractions/<id>.json"] + RPT["reports/<id>.html"] + end + + AI["AI tool<br/>Claude / ChatGPT / Gemini"] + BROW["Default browser"] + + CFG --> PLAN + RULES --> PLAN + SCHEMA --> PLAN + SRC --> PLAN + PLAN --> PMT + + PMT -. user copy/paste .-> AI + AI -. user save .-> EXT + + CFG --> VAL + SCHEMA --> VAL + RULES --> VAL + SRC --> VAL + EXT --> VAL + + CFG --> REP + RULES --> REP + SCHEMA --> REP + SRC --> REP + EXT --> REP + REP --> RPT + RPT -. best-effort .-> BROW + + RULES --> ONE + EXT --> ONE ``` -src/ - types.ts Pure data contracts. No logic, no I/O. - hash.ts canonicalJson() + sha256(). The foundation of replayability. - engine.ts evaluate(rules, flags) -> VerdictResult. Pure function. - loader.ts YAML -> LoadedRule[] with content hash + schema validation. - html-report.ts VerdictResult -> self-contained HTML string. Pure function. - cli.ts Side-effectful shell: parse argv, read files, write outputs. + +Solid arrows are program reads/writes. Dashed arrows are **user actions**: the only manual steps in the loop. + +--- + +## 2. User flow + +The user touches the loop in exactly two places: after `plan` (to feed the prompt to the AI and save the result) and after `report` (to read the rendered HTML). 
+ +```mermaid +sequenceDiagram + autonumber + actor U as User + participant P as plan CLI + participant FS as filesystem + participant AI as AI tool + participant V as validate CLI + participant R as report CLI + participant B as browser + + U->>P: npm run plan + P->>FS: read config + rules + schema + sources + P->>FS: write prompts/<id>.md and <id>.hash.txt + + U->>FS: open prompts/<id>.md + U->>AI: paste prompt + AI-->>U: JSON inside <output>...</output> + U->>FS: save extractions/<id>.json + + opt sanity check before report + U->>V: npm run validate + V->>FS: read extraction + recompute prompt hash + V-->>U: per-target ok / errors / warnings + end + + U->>R: npm run report + R->>FS: read extraction + recompute prompt hash + R->>R: validate -> engine.evaluate -> render HTML + R->>FS: write reports/<id>.html + R->>B: open file:// (best-effort) + B-->>U: rendered verdict + audit trail ``` -Three layers, one direction: +The validate step is optional — `report` re-runs validation internally and refuses to render on errors. Most users will skip the standalone `validate` step and let `report` gate them. + +--- + +## 3. Data flow + +The same logical pipeline as the user flow, but labelled with the data shapes that cross each seam. 
+ +```mermaid +flowchart TB + SRC["source files<br/>(markdown, yaml, ...)"] + CFG["config<br/>{ rules, flagSchema,<br/>targets:[ {id, paths} ] }"] + RYAML["rules YAML<br/>(operators + verdict + weight)"] + FSYAML["flag schema YAML<br/>(type + description + example)"] + + SRC --> CTX["context<br/>CollectedFile[]<br/>(sorted, <=8KB each)"] + RYAML --> LR["LoadedRule[]<br/>(+ per-rule content hash)"] + FSYAML --> FS["FlagSchema<br/>(type, description, example,<br/>allowed_values?)"] + + CTX & LR & FS --> PH["promptHash<br/>sha256(targetId +<br/>per-file sha + ruleHashes +<br/>canonicalJson(schema))"] + CTX & LR & FS & PH --> PROMPT["prompts/<id>.md<br/>(role + rules + schema +<br/>source + response template +<br/>open <output> tag)"] + PH --> SIDECAR["prompts/<id>.hash.txt<br/>(diagnostic snapshot)"] + + PROMPT -. paste .-> LLM["LLM"] + LLM -. save .-> FLAGS["extractions/<id>.json<br/>{ __prompt_hash, ...flags }"] + + FLAGS & FS & PH --> VAL["validate<br/>forbidden-fields check<br/>type check<br/>hash check"] + VAL --> VR["ValidationResult<br/>{ ok, errors, warnings }"] + + VR -- ok --> ENG["engine.evaluate<br/>(deterministic)"] + LR --> ENG + FLAGS --> ENG + ENG --> VRES["VerdictResult<br/>{ verdict, weightedTally,<br/>actions, evaluations,<br/>rulesetHash, flagsHash,<br/>engineVersion }"] + + VRES --> HTML["reports/<id>.html<br/>(self-contained, inline CSS)"] + VRES -. --json .-> J["stdout JSON"] ``` - loader -> engine -> renderer (text | json | html) - \ / - types --- + +The three replay anchors at the bottom (`engineVersion`, `rulesetHash`, `flagsHash`) are what makes any verdict reproducible. Plus a fourth: the recomputed `promptHash` ties the extraction back to the source it was produced against. + +--- + +## 4. Engine internals + +`evaluate(rules, flags)` is a pure synchronous function. No I/O, no clock, no randomness. 
+ +```mermaid +flowchart TB + A["evaluate(rules, flags)"] --> B["sort rules<br/>priority desc, id asc"] + B --> C{"for each rule"} + C --> D["evaluateWhen(rule.when, flags)"] + + D --> E["check when.all<br/>(AND)"] + D --> F["check when.any<br/>(OR)"] + D --> G["check when.not<br/>(invert,<br/>preserve missing reason)"] + + E & F & G --> H{"rule matched?"} + H -- yes --> I["tally[verdict] += weight<br/>actionSet.add(actions)"] + H -- no --> J["record in audit trail<br/>with observed + reason"] + I --> J + + J --> C + + C -- done --> K["pick verdict by severity<br/>blocked > needs-attention ><br/>ready-to-progress > unknown"] + K --> L["sort actions alphabetically"] + L --> M["compute rulesetHash<br/>compute flagsHash"] + M --> N["return VerdictResult"] ``` -The engine itself is a [pure function](https://en.wikipedia.org/wiki/Pure_function): same inputs, same output, no I/O, no clock, no randomness. All side effects live in `cli.ts`. +Per-condition evaluation handles each operator with explicit reasoning for the audit trail: + +```mermaid +flowchart TB + EC["evaluateCondition(c, flags)"] --> P["present = flag in flags?<br/>observed = flags[flag]"] + P --> EX{"exists set?"} + EX -- "exists: false<br/>(loader rejects + value-ops)" --> ER1["return matched=<present === false>"] + EX -- "exists: true" --> EX2["matched &= (present)"] + EX -- "not set" --> EX2 + + EX2 --> MS{"hasValueOp<br/>&& !present?"} + MS -- yes --> R1["return matched=false,<br/>reason='flag missing'"] + MS -- no --> OPS["apply value operators<br/>eq, ne, gt, lt, in, regex<br/>(AND-chain, set reason<br/>on type mismatch)"] + OPS --> R2["return matched + reason"] +``` + +The `when.not` branch is the subtle one: when an inner condition fails with `reason: "flag missing in extraction"`, the `not` clause cannot meaningfully invert it (we don't know whether the value-op would have matched). The missing-flag reason is propagated through, so the rule does not silently fire on absence. 
+ +**Severity-first verdict picker** (not a weighted sum): + +```mermaid +flowchart LR + T["weighted tally"] --> S{"any blocked > 0?"} + S -- yes --> VB["verdict = blocked"] + S -- no --> S2{"any needs-attention > 0?"} + S2 -- yes --> VN["verdict = needs-attention"] + S2 -- no --> S3{"any ready-to-progress > 0?"} + S3 -- yes --> VR["verdict = ready-to-progress"] + S3 -- no --> VU["verdict = unknown"] +``` + +A `blocked` rule beats any number of `ready-to-progress` rules regardless of weight. Weight only matters as a tie-breaker for **action prioritisation within a tier**, never for the verdict tier itself. + +--- + +## 5. Validate gate + prompt-hash binding + +The two gates that defend the workflow from silent failure. + +```mermaid +sequenceDiagram + autonumber + participant R as report / validate CLI + participant FS as filesystem + participant V as validateExtraction + + R->>FS: read extractions/<id>.json + R->>FS: walk target paths, read source files + R->>R: computePromptHash(sources, rules, schema) + Note over R: NOT read from sidecar —<br/>recompute defeats the<br/>'paste the sidecar hash' cheat -## Evaluation algorithm + R->>V: validateExtraction(flags, schema,<br/>{ expectedPromptHash }) + V->>V: reject forbidden fields<br/>(verdict, assessment, ...) + V->>V: type-check each flag<br/>against schema + V->>V: compare __prompt_hash<br/>against recomputed hash + V->>V: warn on unknown flags + + V-->>R: { ok, errors, warnings } + + alt ok + R->>R: engine.evaluate(...) + R->>FS: write HTML report + else errors + R-->>R: print errors, exit 2 + end ``` -1. Load rules (already validated and content-hashed by the loader). -2. Sort by [priority desc, id asc]. This is the only ordering that matters. -3. For each rule, evaluate its `when` clause against the flags: - - `all`: every condition must match (AND). - - `any`: at least one condition must match (OR). - - `not`: inverts the inner condition's match result. -4. 
If matched, add (verdict, weight) to a per-verdict tally. - Append the rule's actions to a deduplicated set. -5. After all rules: pick the verdict tier by severity, not weight. - Severity order: blocked > needs-attention > ready-to-progress > unknown. - Within a tier, weight only informs action prioritisation, not verdict. -6. Sort actions alphabetically. Compute rulesetHash and flagsHash. -7. Return VerdictResult — the full audit trail + verdict + provenance. + +Forbidden fields (research/10): `verdict`, `assessment`, `conclusion`, `summary`, `recommendation`, `rationale`, `analysis`. Naming them out explicitly in the prompt + rejecting them at validation is more reliable than hoping the LLM stays in scope. + +--- + +## 6. OODA mapping + +The POC implements the **Decide** quadrant of the OODA orchestrator concept described in [`docs/backlog/502`](../../../docs/backlog/502-idea-ooda-loop-plugin-observe-orient-decide-act.md). The other quadrants are mocked by the user paste step (Orient) and the fixture / file walker (Observe). + +```mermaid +flowchart LR + subgraph OBSERVE["OBSERVE (deterministic)"] + SIG["raw signals<br/>(file walker)"] + end + + subgraph ORIENT["ORIENT (stochastic — the only one)"] + LLM["LLM extracts<br/>structured flags"] + end + + subgraph DECIDE["DECIDE (deterministic — this POC)"] + RE["rule engine<br/>verdict + audit trail"] + end + + subgraph ACT["ACT (deterministic)"] + ACTS["execute approved<br/>actions"] + end + + SIG --> LLM + LLM --> RE + RE --> ACTS + + style OBSERVE fill:#e6f6ec,stroke:#1f8a4c + style DECIDE fill:#e6f6ec,stroke:#1f8a4c + style ACT fill:#e6f6ec,stroke:#1f8a4c + style ORIENT fill:#fff4e0,stroke:#d18900 ``` -### Why severity-first, not weighted sum? +Stochasticity is **confined to Orient**. Observe is mechanical, Decide is reproducible, Act is mechanical. The article that inspired this POC (Reddit thread on AI fact-checking, quoted in `README.md`) is essentially the argument for that boundary. 
+ +--- -A weighted sum is what most fact-checking systems use: each piece of evidence adds or subtracts from a score, and a threshold picks the verdict. That works when **all evidence is commensurable** — e.g., "this source supports the claim" is comparable in kind to "this source contradicts the claim". +## 7. Module dependency graph -In our domain, evidence is **categorical**: missing EARS notation is a *gate*, not a *point deduction*. A `blocked` rule expresses "this must be fixed before progressing"; weighting it against a `ready-to-progress` rule would let positive signals drown out a real blocker. Severity-first preserves the semantics of categorical gates. +`src/` has 16 modules. Pure-data and CLI layers are kept separate; the engine has no I/O dependencies. -Within a tier, weight still matters: it sorts which actions are surfaced first. +```mermaid +flowchart TB + subgraph DATA["Pure data"] + TYPES["types.ts"] + HASH["hash.ts"] + end -## Determinism strategy + subgraph CORE["Engine core (pure, synchronous)"] + ENG["engine.ts"] + LOAD["loader.ts"] + end -Determinism is **engineered**, not assumed. 
Specific hazards we guard against: + subgraph SCHEMA["Schema + validation"] + FS["flag-schema.ts"] + VAL["validate.ts"] + end -| Hazard | Mitigation | -|---|---| -| Object key ordering in JSON | `canonicalJson` sorts keys recursively before serialising | -| `Date.now()` / wall clock | Not used inside the engine; only the CLI's HTML report includes a timestamp | -| `Math.random()` / process IDs | Not used at all | -| `async` / event-loop interleaving | Engine is fully synchronous | -| `Set` iteration order | Actions are collected in a `Set` then explicitly `.sort()`-ed | -| Regex `lastIndex` | A fresh `new RegExp(...)` is constructed per evaluation; no `/g` flag | -| Loaded-file path differences | `rulesetHash` is computed from rule content, not file paths | + subgraph PROMPT["Prompt pipeline"] + CTX["context.ts"] + PB["prompt-builder.ts"] + PH["prompt-hash.ts"] + end -The output of `evaluate()` is byte-identical for identical inputs. This is exercised directly by the `describe("reproducibility")` block in `test/engine.test.ts`. + subgraph CFG["Config"] + CFGTS["config.ts"] + end -## Provenance + subgraph RENDER["Render + I/O"] + HTML["html-report.ts"] + BROW["open-browser.ts"] + end -Three hashes anchor a verdict to its inputs: + subgraph CLIs["CLIs"] + PLAN["plan.ts"] + VALCLI["validate-cli.ts"] + REPORT["report.ts"] + ONE["cli.ts"] + end -- **`rulesetHash`** — `sha256(canonicalJson([{id, hash}, ...]))` over the sorted, loaded rules. Changes if any rule's content changes or the rule set is reordered after a content edit. -- **Per-rule `hash`** — `sha256(canonicalJson({id, priority, when, then}))`. Description is intentionally excluded so documentation edits don't invalidate the hash. -- **`flagsHash`** — `sha256(canonicalJson(flags))`. Key order in the source JSON is irrelevant. 
+ TYPES --> ENG & LOAD & VAL & HTML + HASH --> ENG & LOAD & PH + LOAD --> ENG + FS --> VAL & PB & PLAN & VALCLI & REPORT + CTX --> PLAN & REPORT & VALCLI + PH --> PLAN & REPORT & VALCLI + PB --> PLAN + CFGTS --> PLAN & VALCLI & REPORT + ENG --> REPORT & ONE + VAL --> REPORT & VALCLI + HTML --> REPORT + BROW --> REPORT + LOAD --> PLAN & VALCLI & REPORT & ONE +``` + +The engine core (`engine.ts` + `loader.ts` + `types.ts` + `hash.ts`) has zero I/O. It can be embedded in any environment that supplies an `ExtractionFlags` object. The CLI layer is intentionally **shallow glue** — the engine is the load-bearing module. + +--- -Together with the `engineVersion`, these three fields make any verdict replayable: given the same engine version, same rules, and same flags, you reach the same answer. +## 8. Why these shapes -## What's deliberately missing +A few design choices that emerged from research and review rounds. Each is documented in more depth in the `research/` artifacts. -- **No rule chaining** (`Rete`-style derived facts). At < 200 rules with a single fact set per LLM extraction, the partial-match cache is pure overhead. -- **No hot reload**. Rules are loaded once per CLI invocation. -- **No DSL macros**, **no rule inheritance**. Each rule stands on its own; copy-paste is honest. -- **No persistence**. The engine returns a value; storing it is the caller's concern. +| Choice | Rationale | Where to read more | +|---|---|---| +| Severity-first verdict (not weighted sum) | Gate conditions are categorical; weighted sum lets positive signals drown out a real blocker. | [`research/04`](../research/04-technical-design.md) | +| Canonical-JSON content hashes per rule | Diff which rules changed between two runs without comparing YAML text. | [`audit-trail.md`](audit-trail.md) | +| YAML rules (not TS function rules) | Diff-able, reviewable in a PR, no rebuild needed to edit. 
| [`research/01`](../research/01-technical-landscape.md) | +| Recompute prompt hash on report (not trust sidecar) | A sidecar text file an operator can `cat` into the extraction is no defence. Recomputing forces a real source-file change to invalidate the extraction. | [`research/14`](../research/14-new-failure-modes.md) | +| Forbidden field names in the prompt | Naming the failure mode explicitly is more reliable than schema-only validation; the validate gate also checks. | [`research/10`](../research/10-extraction-prompt-patterns.md) | +| Stay hand-rolled (don't adopt json-rules-engine) | Every candidate fails ≥3 of our 6 locked constraints; the wrapper would be the engine. Re-evaluate at graduation. | [`research/11`](../research/11-rule-engine-adoption-revisit.md) | +| Severity tiers + `unknown` default | Verdicts are categorical tiers; `unknown` is the honest answer when no rule fires. | [`dsl-reference.md`](dsl-reference.md) | -See [`research/04-technical-design.md`](../research/04-technical-design.md) for the architect's proposal of how to evolve these toward a production rule layer. +For deeper component-level details see [`workflow.md`](workflow.md) (operational), [`dsl-reference.md`](dsl-reference.md) (rule grammar), [`audit-trail.md`](audit-trail.md) (replay), and [`extending.md`](extending.md) (adding rules, flags, fixtures). From fde9d101d4a799bf6159b029e0abbf0e2f8a3746 Mon Sep 17 00:00:00 2001 From: Claude <noreply@anthropic.com> Date: Sun, 17 May 2026 12:35:06 +0000 Subject: [PATCH 17/45] fix(rule-engine-poc): slug + UTF-8 truncation hardening (Codex round 8) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - config.ts: target ids must match /^[A-Za-z0-9][A-Za-z0-9_-]*$/. Previously a target id like '../escape' or 'foo/bar' was accepted and later interpolated into prompts/<id>.md, extractions/<id>.json, reports/<id>.html — at best ENOENT, at worst write outside the workspace (#525 round 8 P2). 
- context.ts: extract truncateToBytes() that walks back to a UTF-8 codepoint boundary. Previously slice(0, maxBytes) counted UTF-16 code units, so CJK / emoji-heavy markdown could emit prompt blocks up to 3x the advertised 8 KB cap (#525 round 8 P2). - 7 new tests cover the slug validation paths (two rejections, one acceptance) and the multibyte truncation invariant. Suite: 95/95 passing. --- experiments/rule-engine-poc/src/config.ts | 10 +++++ experiments/rule-engine-poc/src/context.ts | 18 ++++++++- .../rule-engine-poc/test/config.test.ts | 27 +++++++++++++ .../rule-engine-poc/test/context.test.ts | 38 ++++++++++++++++++- 4 files changed, 91 insertions(+), 2 deletions(-) diff --git a/experiments/rule-engine-poc/src/config.ts b/experiments/rule-engine-poc/src/config.ts index b42d20353..d4fb7d869 100644 --- a/experiments/rule-engine-poc/src/config.ts +++ b/experiments/rule-engine-poc/src/config.ts @@ -92,6 +92,16 @@ function validate(raw: unknown, file: string): RawConfig { if (typeof tr.id !== "string" || tr.id.length === 0) { throw new Error(`Config ${file} target #${i} missing string 'id'`); } + // Target id is interpolated into file paths (prompts/<id>.md, + // extractions/<id>.json, reports/<id>.html). Restrict it to a safe + // filename slug so traversal segments or path separators can't + // escape the workspace.
+ if (!/^[A-Za-z0-9][A-Za-z0-9_-]*$/.test(tr.id)) { + throw new Error( + `Config ${file} target id '${tr.id}' must match /^[A-Za-z0-9][A-Za-z0-9_-]*$/ ` + + `(safe filename slug: letters, digits, hyphen, underscore; first char alphanumeric)`, + ); + } if (seen.has(tr.id)) { throw new Error(`Config ${file} has duplicate target id '${tr.id}'`); } diff --git a/experiments/rule-engine-poc/src/context.ts b/experiments/rule-engine-poc/src/context.ts index 6ed3852df..81c564a3d 100644 --- a/experiments/rule-engine-poc/src/context.ts +++ b/experiments/rule-engine-poc/src/context.ts @@ -44,7 +44,7 @@ export function collectFiles( return collected.map((f) => { const truncated = f.bytes > maxBytes; const content = truncated - ? f.content.slice(0, maxBytes) + TRUNCATION_NOTE + ? truncateToBytes(f.content, maxBytes) + TRUNCATION_NOTE : f.content; return { ...f, @@ -55,6 +55,22 @@ export function collectFiles( }); } +// Truncate a string to at most `maxBytes` of its UTF-8 encoding, ending +// on a codepoint boundary. Previously we called `.slice(0, maxBytes)` +// which counts UTF-16 code units, so a CJK / emoji-heavy file could +// emit a prompt block far larger than the advertised cap. +export function truncateToBytes(s: string, maxBytes: number): string { + const buf = Buffer.from(s, "utf8"); + if (buf.byteLength <= maxBytes) return s; + // Walk back from `maxBytes` until we land on a codepoint start byte. + // In UTF-8, continuation bytes have the high two bits 10xxxxxx. + let cut = maxBytes; + while (cut > 0 && (buf[cut]! 
& 0xc0) === 0x80) { + cut--; + } + return buf.subarray(0, cut).toString("utf8"); +} + function walk( abs: string, extensions: Set<string>, diff --git a/experiments/rule-engine-poc/test/config.test.ts b/experiments/rule-engine-poc/test/config.test.ts index d71c73104..1a61b9021 100644 --- a/experiments/rule-engine-poc/test/config.test.ts +++ b/experiments/rule-engine-poc/test/config.test.ts @@ -44,6 +44,33 @@ describe("loadConfig", () => { expect(() => loadConfig(file)).toThrow(/duplicate target id 'alpha'/); }); + it("rejects target ids with path separators", () => { + const file = writeTempConfig({ + ...validBody, + targets: [{ id: "../escape", label: "x", paths: ["a"] }], + }); + expect(() => loadConfig(file)).toThrow(/safe filename slug/); + }); + + it("rejects target ids starting with non-alphanumeric", () => { + const file = writeTempConfig({ + ...validBody, + targets: [{ id: "-leading-dash", label: "x", paths: ["a"] }], + }); + expect(() => loadConfig(file)).toThrow(/safe filename slug/); + }); + + it("accepts kebab-case and snake_case target ids", () => { + const file = writeTempConfig({ + ...validBody, + targets: [ + { id: "kebab-case", label: "k", paths: ["a"] }, + { id: "snake_case", label: "s", paths: ["a"] }, + ], + }); + expect(() => loadConfig(file)).not.toThrow(); + }); + it("rejects targets with empty paths", () => { const file = writeTempConfig({ ...validBody, diff --git a/experiments/rule-engine-poc/test/context.test.ts b/experiments/rule-engine-poc/test/context.test.ts index c83ae5abd..a733abaa6 100644 --- a/experiments/rule-engine-poc/test/context.test.ts +++ b/experiments/rule-engine-poc/test/context.test.ts @@ -2,7 +2,7 @@ import { describe, expect, it } from "vitest"; import { mkdirSync, mkdtempSync, writeFileSync } from "node:fs"; import { tmpdir } from "node:os"; import { join } from "node:path"; -import { collectFiles } from "../src/context.js"; +import { collectFiles, truncateToBytes } from "../src/context.js"; function setupFixture(): 
string { const dir = mkdtempSync(join(tmpdir(), "rep-poc-ctx-")); @@ -45,4 +45,40 @@ describe("collectFiles", () => { collectFiles(["/does/not/exist"], { baseDir: "/" }), ).toThrow(/Path does not exist/); }); + + it("truncates multibyte content by bytes, not code units", () => { + // Four-byte emoji ("🎯") repeated. JS .length is 2 per emoji (UTF-16 + // surrogate pair). Byte length is 4 per emoji. + const dir = mkdtempSync(join(tmpdir(), "rep-poc-utf8-")); + const target = "🎯".repeat(100); // 400 bytes UTF-8, 200 UTF-16 units + writeFileSync(join(dir, "emoji.md"), target, "utf8"); + const files = collectFiles([dir], { baseDir: dir, maxBytes: 100 }); + expect(files[0]!.truncated).toBe(true); + const bytesInContent = Buffer.byteLength( + files[0]!.content.replace(/\n\n\.\.\..*/s, ""), // drop truncation note + "utf8", + ); + expect(bytesInContent).toBeLessThanOrEqual(100); + }); +}); + +describe("truncateToBytes", () => { + it("returns the string unchanged when under the limit", () => { + expect(truncateToBytes("hello", 100)).toBe("hello"); + }); + + it("never splits a multibyte codepoint", () => { + // "héllo" = 6 bytes (é = 2 bytes c3 a9). A 4-byte cut falls on the boundary + // after the first "l", so the walk-back is not exercised (a 2-byte cut would be). + const truncated = truncateToBytes("héllo", 4); + // Expect "hél" (4 bytes) — the result must never end mid-codepoint. + const bytes = Buffer.byteLength(truncated, "utf8"); + expect(bytes).toBeLessThanOrEqual(4); + // Decoded string must be valid (no replacement char).
+ expect(truncated).not.toContain("�"); + }); + + it("truncates ASCII at the exact byte boundary", () => { + expect(truncateToBytes("hello world", 5)).toBe("hello"); + }); }); From 4e54c0ee0d120739ef15d0a282a01fd81e854147 Mon Sep 17 00:00:00 2001 From: Claude <noreply@anthropic.com> Date: Sun, 17 May 2026 12:41:26 +0000 Subject: [PATCH 18/45] feat(rule-engine-poc): RALPH-loop polish (agents A+B+C) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three implementer subagents ran in parallel; each verified with npm test + tsc + typos before reporting. 98/98 tests passing. Agent A — Safety: sidecar refusal + --skip-validate warning - report.ts + validate-cli.ts: prompt-hash binding now triggers on prompts/<id>.md (the prompt file) existing, NOT on the sidecar prompts/<id>.hash.txt. Deleting the sidecar can no longer bypass the staleness check (reviewer research/12 S2-2). - report.ts: --skip-validate now prints a loud stderr warning per target ('validation gate disabled. This is for debugging only.') closing reviewer S2-3 silent-flag finding. Agent B — validate.ts polish - Type-mismatch errors now include the observed value via formatObserved() with an 80-char cap and ellipsis (reviewer S3). Example: "Flag 'X' expected 'boolean', got string (\"yes\")." - null flag values now warn with code 'null-value-omit-instead' rather than being silently accepted as 'unknown'. Engine semantics unchanged (null still treated as missing); validate just surfaces the discrepancy with the prompt's 'omit unknowns' instruction. Agent C — HTML report v2 - RenderContext gains an optional promptHash field. When set, the Provenance section shows the 12-char prefix; when the extraction's __prompt_hash matches, a 'verified' badge appears. - Audit-trail rows with reason='flag missing in extraction' now use a distinct cond--missing CSS class (yellow/warning palette) to differentiate from cond--miss (red/error). UX research/09 finding. 
--- .../rule-engine-poc/src/html-report.ts | 43 ++++++++++++++- experiments/rule-engine-poc/src/report.ts | 53 ++++++++++++------- .../rule-engine-poc/src/validate-cli.ts | 29 ++++++---- experiments/rule-engine-poc/src/validate.ts | 34 +++++++++++- .../rule-engine-poc/test/validate.test.ts | 28 +++++++++- 5 files changed, 153 insertions(+), 34 deletions(-) diff --git a/experiments/rule-engine-poc/src/html-report.ts b/experiments/rule-engine-poc/src/html-report.ts index 2c335c3fe..8fc3b4500 100644 --- a/experiments/rule-engine-poc/src/html-report.ts +++ b/experiments/rule-engine-poc/src/html-report.ts @@ -5,6 +5,14 @@ interface RenderContext { flagsPath: string; flags: Record<string, unknown>; generatedAt: string; + /** + * Recomputed prompt hash (current source + rules + schema). When set, + * the report surfaces this in Provenance as the binding between the + * extraction and the inputs the operator was supposed to read. If the + * extraction's `__prompt_hash` matches, the report displays a + * "verified" badge. + */ + promptHash?: string; } const VERDICT_PALETTE: Record< @@ -97,7 +105,18 @@ export function renderHtmlReport( const status = ev.matched ? "MATCHED" : "did not match"; const conds = ev.conditions .map((c) => { - const cls2 = c.matched ? "cond cond--ok" : "cond cond--miss"; + // Distinguish "flag missing in extraction" (the extraction did + // not supply the input — operator/LLM issue, yellow/warning) + // from "value did not match" (the input was supplied but the + // rule disagreed — domain signal, red/miss). + let cls2: string; + if (c.matched) { + cls2 = "cond cond--ok"; + } else if (c.reason === "flag missing in extraction") { + cls2 = "cond cond--missing"; + } else { + cls2 = "cond cond--miss"; + } const reason = c.reason ? 
` <span class="reason">(${esc(c.reason)})</span>` : ""; @@ -203,6 +222,7 @@ export function renderHtmlReport( } .badge--tag { background: #eef4fb; color: #234e7a; } .badge--hash { font-family: "SFMono-Regular", Consolas, monospace; } + .badge--verified { background: #e6f6ec; color: #114a29; font-weight: 600; } ul.conditions { list-style: none; padding-left: 0; margin: 8px 0 0; } ul.conditions li { padding: 3px 0 3px 22px; position: relative; font-size: 13px; } ul.conditions li::before { @@ -211,6 +231,8 @@ export function renderHtmlReport( } .cond--ok::before { content: "[+]"; color: #1f8a4c; } .cond--miss::before { content: "[-]"; color: #d8281b; } + .cond--missing::before { content: "[?]"; color: #d18900; } + .cond--missing { color: #d18900; background: #fff4e0; border-radius: 3px; padding-right: 6px; } .reason { color: var(--muted); font-style: italic; } .contribution { font-size: 13px; margin: 10px 0 0; padding: 8px 10px; background: var(--accent-bg); border-radius: 3px; color: var(--accent-fg); } .provenance { font-size: 12px; color: var(--muted); } @@ -263,7 +285,24 @@ export function renderHtmlReport( <p class="provenance"> Engine version: <code>${esc(result.engineVersion)}</code><br> Ruleset hash: <code>${esc(result.rulesetHash)}</code><br> - Flags hash: <code>${esc(result.flagsHash)}</code><br> + Flags hash: <code>${esc(result.flagsHash)}</code><br>${ + ctx.promptHash + ? (() => { + // The extraction's self-declared prompt hash. If it equals + // the recomputed value (passed in as ctx.promptHash), the + // extraction was produced against the same inputs the + // operator is now reading; otherwise it's stale. + const declared = (ctx.flags as Record<string, unknown>)["__prompt_hash"]; + const verified = + typeof declared === "string" && declared === ctx.promptHash; + const badge = verified + ? 
` <span class="badge badge--verified" title="extraction's __prompt_hash matches the recomputed value">verified</span>` + : ""; + return ` + Prompt hash: <code>${esc(ctx.promptHash.slice(0, 12))}…</code>${badge}<br>`; + })() + : "" + } Rules file: <code>${esc(ctx.rulesPath)}</code><br> Flags file: <code>${esc(ctx.flagsPath)}</code> </p> diff --git a/experiments/rule-engine-poc/src/report.ts b/experiments/rule-engine-poc/src/report.ts index e777585ee..9171320bd 100644 --- a/experiments/rule-engine-poc/src/report.ts +++ b/experiments/rule-engine-poc/src/report.ts @@ -90,28 +90,44 @@ for (const target of targets) { // sidecar is plain text. Recomputing means a real source-file change // always invalidates the extraction. Critic research/14 flagged this // as the highest-leverage fix in the post-validate workflow. + // + // Binding trigger = presence of `prompts/<id>.md` (the prompt file), + // NOT the diagnostic sidecar. An operator who deletes the sidecar + // must NOT thereby disable the staleness check (research/12 S2-2). + // Single-shot / fixture flows that bypass the plan step have no + // prompt file, so they keep the legacy no-binding behaviour. + const promptPath = join(config.promptsDirPath, `${target.id}.md`); + const promptFileExists = existsSync(promptPath); let expectedPromptHash: string | undefined; - try { - const files = collectFiles(target.paths, { baseDir: config.configDir }); - expectedPromptHash = computePromptHash({ - targetId: target.id, - files, - rules, - flagSchema: schema, - }); - } catch (err) { - // Sources unavailable (deleted folder, etc.) — fall through without - // hash binding rather than crash. validate will warn separately if - // __prompt_hash is set but unverifiable. - expectedPromptHash = undefined; - const msg = err instanceof Error ? 
err.message : String(err); - if (!quiet) { - console.warn( - `[report] ${target.id}: could not recompute prompt hash (${msg}); skipping staleness check`, - ); + if (promptFileExists) { + try { + const files = collectFiles(target.paths, { baseDir: config.configDir }); + expectedPromptHash = computePromptHash({ + targetId: target.id, + files, + rules, + flagSchema: schema, + }); + } catch (err) { + // Sources unavailable (deleted folder, etc.) — fall through without + // hash binding rather than crash. validate will warn separately if + // __prompt_hash is set but unverifiable. + expectedPromptHash = undefined; + const msg = err instanceof Error ? err.message : String(err); + if (!quiet) { + console.warn( + `[report] ${target.id}: could not recompute prompt hash (${msg}); skipping staleness check`, + ); + } } } + if (skipValidate) { + console.error( + `[report] WARNING: --skip-validate is set for '${target.id}'; validation gate disabled. This is for debugging only.`, + ); + } + if (!skipValidate) { const v = validateExtraction(flags as Record<string, unknown>, schema, { expectedPromptHash, @@ -142,6 +158,7 @@ for (const target of targets) { flagsPath: relative(config.configDir, extractionPath), flags, generatedAt: new Date().toISOString(), + promptHash: expectedPromptHash, }); const reportPath = join(config.reportsDirPath, `${target.id}.html`); writeFileSync(reportPath, html, "utf8"); diff --git a/experiments/rule-engine-poc/src/validate-cli.ts b/experiments/rule-engine-poc/src/validate-cli.ts index 8593c4a98..1c6e3c0a7 100644 --- a/experiments/rule-engine-poc/src/validate-cli.ts +++ b/experiments/rule-engine-poc/src/validate-cli.ts @@ -66,17 +66,26 @@ for (const target of targets) { } // Recompute the prompt hash from current source rather than trusting // the sidecar — see report.ts and research/14. + // + // Binding trigger = presence of `prompts/<id>.md` (the prompt file), + // NOT the diagnostic sidecar. 
Deleting the sidecar must NOT disable + // the staleness check (research/12 S2-2). Single-shot / fixture flows + // that have no prompt file keep the legacy no-binding behaviour. + const promptPath = join(config.promptsDirPath, `${target.id}.md`); + const promptFileExists = existsSync(promptPath); let expectedPromptHash: string | undefined; - try { - const files = collectFiles(target.paths, { baseDir: config.configDir }); - expectedPromptHash = computePromptHash({ - targetId: target.id, - files, - rules, - flagSchema: schema, - }); - } catch { - expectedPromptHash = undefined; + if (promptFileExists) { + try { + const files = collectFiles(target.paths, { baseDir: config.configDir }); + expectedPromptHash = computePromptHash({ + targetId: target.id, + files, + rules, + flagSchema: schema, + }); + } catch { + expectedPromptHash = undefined; + } } const result = validateExtraction( parsed as Record<string, unknown>, diff --git a/experiments/rule-engine-poc/src/validate.ts b/experiments/rule-engine-poc/src/validate.ts index 4bb386e39..291fee8fb 100644 --- a/experiments/rule-engine-poc/src/validate.ts +++ b/experiments/rule-engine-poc/src/validate.ts @@ -103,13 +103,24 @@ export function validateExtraction( }); continue; } + if (value === null) { + warnings.push({ + severity: "warning", + code: "null-value-omit-instead", + path: key, + message: + `Flag '${key}' is null; prefer omitting unknowns over emitting null. 
` + + `The engine will treat null and missing identically.`, + }); + continue; + } const typeProblem = checkType(value, entry.type); if (typeProblem) { errors.push({ severity: "error", code: "type-mismatch", path: key, - message: `Flag '${key}' expected '${entry.type}', got ${typeProblem}.`, + message: `Flag '${key}' expected '${entry.type}', got ${typeProblem} (${formatObserved(value)}).`, }); continue; } @@ -128,7 +139,6 @@ export function validateExtraction( } function checkType(value: unknown, type: FlagType): string | null { - if (value === null) return null; switch (type) { case "boolean": return typeof value === "boolean" ? null : typeof value; @@ -148,6 +158,26 @@ function checkType(value: unknown, type: FlagType): string | null { } } +// Renders the observed value in a type-mismatch error message so the user +// can see what they actually emitted. Strings are quoted; arrays/objects +// are JSON-stringified and capped so a giant blob doesn't drown the message. +const OBSERVED_MAX_LEN = 80; +function formatObserved(value: unknown): string { + if (typeof value === "string") return truncate(JSON.stringify(value)); + if (typeof value === "number" || typeof value === "boolean") return String(value); + if (value === undefined) return "undefined"; + try { + return truncate(JSON.stringify(value)); + } catch { + return truncate(String(value)); + } +} + +function truncate(s: string): string { + if (s.length <= OBSERVED_MAX_LEN) return s; + return `${s.slice(0, OBSERVED_MAX_LEN - 1)}…`; +} + function deepEqual(a: unknown, b: unknown): boolean { if (a === b) return true; if (Array.isArray(a) && Array.isArray(b)) { diff --git a/experiments/rule-engine-poc/test/validate.test.ts b/experiments/rule-engine-poc/test/validate.test.ts index 6c6ec7900..824f3593a 100644 --- a/experiments/rule-engine-poc/test/validate.test.ts +++ b/experiments/rule-engine-poc/test/validate.test.ts @@ -66,6 +66,27 @@ describe("validateExtraction", () => { 
expect(r.errors[0]!.code).toBe("type-mismatch"); }); + it("includes the observed string value in the type-mismatch message", () => { + const r = validateExtraction({ ci_passing: "yes" }, schema); + expect(r.errors[0]!.message).toContain("got string"); + expect(r.errors[0]!.message).toContain('"yes"'); + }); + + it("includes the observed array value in the type-mismatch message", () => { + const r = validateExtraction({ ci_passing: [1, 2, 3] }, schema); + expect(r.errors[0]!.code).toBe("type-mismatch"); + expect(r.errors[0]!.message).toContain("[1,2,3]"); + }); + + it("truncates very long observed values in the type-mismatch message", () => { + const long = "x".repeat(500); + const r = validateExtraction({ ci_passing: long }, schema); + expect(r.errors[0]!.code).toBe("type-mismatch"); + // 80-char cap on the rendered form; ellipsis indicates truncation. + expect(r.errors[0]!.message).toContain("…"); + expect(r.errors[0]!.message.length).toBeLessThan(200); + }); + it("errors on non-finite numbers", () => { const r = validateExtraction({ approvals_count: NaN }, schema); expect(r.ok).toBe(false); @@ -84,9 +105,12 @@ describe("validateExtraction", () => { expect(r.errors[0]!.code).toBe("disallowed-value"); }); - it("accepts null as 'unknown' for any type", () => { - const r = validateExtraction({ ci_passing: null, approvals_count: null }, schema); + it("warns (but does not error) when a flag value is null", () => { + const r = validateExtraction({ ci_passing: null }, schema); expect(r.ok).toBe(true); + expect(r.warnings).toHaveLength(1); + expect(r.warnings[0]!.code).toBe("null-value-omit-instead"); + expect(r.warnings[0]!.path).toBe("ci_passing"); }); describe("with expectedPromptHash", () => { From ce801cabcd08428156e7b276f6dc886a3189f3cd Mon Sep 17 00:00:00 2001 From: Claude <noreply@anthropic.com> Date: Sun, 17 May 2026 12:43:35 +0000 Subject: [PATCH 19/45] docs(rule-engine-poc): RALPH-loop polish (agents A test + D docs) MIME-Version: 1.0 Content-Type: text/plain; 
charset=UTF-8 Content-Transfer-Encoding: 8bit Agent A — integration tests (was missed from 4e54c0e): - test/report-flow.test.ts: 8 spawnSync integration tests covering prompt-extraction binding (sidecar deletion, missing __prompt_hash, stale hash, fixture flow with no prompt file, parity for both CLIs) and --skip-validate stderr warning. Brings suite to 106 tests. Agent D — docs drift sync: - docs/workflow.md: rewrote "What's not yet here" (validate gate and stale-extraction detection BOTH shipped — now lists API extractor, rule governance, fairness audit, drift dashboards). Added the __prompt_hash paragraph in "Paste into an AI tool" + a new "--skip-validate flag" subsection (debugging-only, never-in-CI). - README.md: test count 60 -> 100+, file map updated to include prompt-hash / validate / validate-cli modules, research table extended from 10 to 16 artifacts. - docs/README.md: "five briefs" -> "16 research artifacts" with the expanded angle list. Suite: 106/106 passing. --- experiments/rule-engine-poc/README.md | 19 +- experiments/rule-engine-poc/docs/README.md | 2 +- experiments/rule-engine-poc/docs/workflow.md | 24 +- .../rule-engine-poc/test/report-flow.test.ts | 219 ++++++++++++++++++ 4 files changed, 255 insertions(+), 9 deletions(-) create mode 100644 experiments/rule-engine-poc/test/report-flow.test.ts diff --git a/experiments/rule-engine-poc/README.md b/experiments/rule-engine-poc/README.md index 331aae579..460bc59a0 100644 --- a/experiments/rule-engine-poc/README.md +++ b/experiments/rule-engine-poc/README.md @@ -47,7 +47,7 @@ A feature folder's verdict becomes a function of named flags and named rules; no ```bash cd experiments/rule-engine-poc npm install -npm test # 60 tests in <1s +npm test # 100+ tests in <2s ``` ### Run the workflow against a real feature @@ -104,15 +104,18 @@ npm run demo:html:all # one HTML report per fixture into reports/ | `src/flag-schema.ts` | Flag schema loader + coverage diff | | `src/context.ts` | Walks target paths, 
collects file contents with truncation | | `src/prompt-builder.ts` | Builds the AI extraction prompt — role, schema, source, forcing function | +| `src/prompt-hash.ts` | Canonical prompt hashing — binds an extraction to the prompt it was produced against | +| `src/validate.ts` | Validate-gate logic: forbidden-fields check, type check against flag schema, `__prompt_hash` binding | +| `src/validate-cli.ts` | `npm run validate` — runs the validate gate as a standalone command | | `src/plan.ts` | `npm run plan` — generates prompts | -| `src/report.ts` | `npm run report` — runs engine, renders HTML, opens browser | +| `src/report.ts` | `npm run report` — validates (unless `--skip-validate`), runs engine, renders HTML, opens browser | | `src/cli.ts` | Single-shot escape hatch for fixture-based testing | | `src/html-report.ts` | Self-contained HTML renderer (inline CSS, no JS) | | `src/open-browser.ts` | Best-effort `xdg-open` / `open` / `start` | | `fixtures/*.json` | Mock extractions for the single-shot demos | -| `test/*.test.ts` | 60 tests across engine, loader, hash, config, schema, context, prompt-builder | +| `test/*.test.ts` | 100+ tests across engine, loader, hash, config, schema, context, prompt-builder, prompt-hash, validate | | `docs/*.md` | Architecture, DSL reference, audit trail, workflow, extension guide, OODA integration | -| `research/*.md` | Ten research artifacts (technical, regulatory, positioning, design, risks, review, workflow risks, architecture, UX, prompt patterns) | +| `research/*.md` | 16 research artifacts spanning technical landscape, regulatory, positioning, design alternatives, risks, independent review, workflow risks, plan/report architecture, user flow, prompt patterns, adoption revisit, re-review at HEAD, strategy v2, new failure modes, CI operations, JTBD switch interviews | ## Documentation @@ -134,7 +137,7 @@ npm run demo:html:all # one HTML report per fixture into reports/ ## Research -Ten research artifacts under 
[`research/`](research/) informed the design across two waves: +16 research artifacts under [`research/`](research/) informed the design across successive waves: | # | Angle | |---|---| @@ -148,6 +151,12 @@ Ten research artifacts under [`research/`](research/) informed the design across | 08 | Plan/report workflow architecture | | 09 | User-flow audit & friction points | | 10 | LLM extraction prompt patterns | +| 11 | Rule-engine adoption revisit | +| 12 | Re-review at HEAD | +| 13 | Strategy v2 | +| 14 | New failure modes | +| 15 | CI operations | +| 16 | JTBD switch interviews | ## North Star diff --git a/experiments/rule-engine-poc/docs/README.md b/experiments/rule-engine-poc/docs/README.md index 34b2baf86..1ef09fbfb 100644 --- a/experiments/rule-engine-poc/docs/README.md +++ b/experiments/rule-engine-poc/docs/README.md @@ -18,4 +18,4 @@ Detailed documentation for the POC. Start with the project [README](../README.md | [`extending.md`](extending.md) | You want to add a rule, add a flag, point the engine at a new domain, or run the tests. | | [`ooda-integration.md`](ooda-integration.md) | You want to understand how this POC slots into the OODA orchestrator and what a production wiring would look like. | -For the research that informed the design, see [`../research/`](../research/) — five briefs covering technical landscape, regulatory / auditability, positioning, design alternatives, and risks. +For the research that informed the design, see [`../research/`](../research/) — 16 research artifacts spanning technical landscape, regulatory / auditability, positioning, design alternatives, risks, independent review, workflow risks, plan/report architecture, user flow, prompt patterns, adoption revisit, re-review at HEAD, strategy v2, new failure modes, CI operations, and JTBD switch interviews. 
diff --git a/experiments/rule-engine-poc/docs/workflow.md b/experiments/rule-engine-poc/docs/workflow.md index 7aba978ef..d3338a182 100644 --- a/experiments/rule-engine-poc/docs/workflow.md +++ b/experiments/rule-engine-poc/docs/workflow.md @@ -88,6 +88,10 @@ Open `prompts/<target-id>.md`, copy the contents, paste into Claude, ChatGPT, or Save the JSON (just the object, without the `<output>` tags) to `extractions/<target-id>.json`. +The extraction **must** include `__prompt_hash` as the **first field** of the JSON object — the prompt instructs the model to copy this verbatim from the prompt header. The hash binds the extraction to the exact prompt it was produced against; `validate` and `report` recompute it and refuse stale extractions. + +If you edit any source file referenced by a target after generating its prompt, **re-run `npm run plan`** for that target. The new prompt will have a new `__prompt_hash`, and the previous extraction will be detected as stale on the next `validate` / `report` and must be regenerated. + > This manual paste step is the POC's defining trade-off. A production wiring would call the LLM API with a constrained-decoding response schema; the manual loop is the lo-fi way to prove the rest of the pipeline before paying for the API integration. See `research/07-workflow-risks.md` for what to watch for during this step. ## 4. Generate reports — `npm run report` @@ -135,8 +139,22 @@ The split is intentional: This shape mirrors the [OODA orchestrator separation](ooda-integration.md): stochastic in Orient, deterministic everywhere else. +### `--skip-validate` flag + +`report` accepts a `--skip-validate` escape hatch: + +```bash +npm run report -- --skip-validate +npm run report -- --target astro-product-page --skip-validate +``` + +- **What it does.** Bypasses the validate gate entirely — no forbidden-fields check, no type check against the flag schema, no `__prompt_hash` binding check. 
The engine runs against whatever JSON is in the extraction file. +- **When to use it.** Debugging only — e.g., rendering a report from a known-stale extraction while iterating on rules, or inspecting how the engine handles malformed input. **Never in CI.** CI should always run with the validate gate on so a stale or malformed extraction fails the build instead of silently producing a wrong verdict. +- **Warning.** Every target rendered with `--skip-validate` prints a stderr warning so the bypass is visible in logs and review. + ## What's not yet here -- **No schema-validate gate before `report`** — currently the engine treats absent flags as missing-in-extraction (which is fine) but accepts whatever JSON is in the file. A future `npm run validate` would parse the extraction, check it against the flag schema, refuse forbidden field names, and only then let `report` proceed. See `research/10-extraction-prompt-patterns.md` for the design. -- **No stale-extraction detection** — if you edit the source files after the extraction was produced, the report still renders. The critic in `research/07-workflow-risks.md` flags this as the riskiest UX failure mode; a future iteration would hash the prompt and refuse to render a report from an extraction produced against a different prompt. -- **No automated API call** — the manual paste step is on purpose for the POC. +- **No automated LLM API extraction.** The manual paste step is the POC's defining trade-off; replacing it with a constrained-decoding API call is the obvious next slice but is not in scope here. +- **No shared rule-lifecycle governance.** There is no CODEOWNERS entry, changelog, or review protocol for `rules/quality-gates.yaml` and `rules/flag-schema.yaml` — a rule edit today is just a file edit. A production wiring needs owners and a versioned changelog so weight changes are visible. 
+- **No fairness / calibration audit of weights.** Weights and severity ordering are still placeholders; nothing has been calibrated against historical outcomes or stress-tested for bias across feature types. See `research/05-risks-critique.md`. +- **No drift dashboards.** If this runs in CI across many features over time, there is no aggregation layer to surface verdict drift, rule-firing trends, or extraction-quality regressions. See `research/15-ci-operations.md`. diff --git a/experiments/rule-engine-poc/test/report-flow.test.ts b/experiments/rule-engine-poc/test/report-flow.test.ts new file mode 100644 index 000000000..57f4fb349 --- /dev/null +++ b/experiments/rule-engine-poc/test/report-flow.test.ts @@ -0,0 +1,219 @@ +// Integration tests for the report.ts / validate-cli.ts CLIs covering +// the prompt-extraction binding (research/12 S2-2) and the +// --skip-validate stderr warning (research/12 S2-3). +// +// We drive the actual CLIs via spawnSync so the test exercises the same +// code path operators see. Each test sets up a tmpdir with a minimal +// config + rules + flag schema + target source file, then runs the CLI +// and asserts on stdout/stderr/exit-code. + +import { spawnSync } from "node:child_process"; +import { mkdirSync, mkdtempSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { dirname, join, resolve } from "node:path"; +import { fileURLToPath } from "node:url"; +import { describe, expect, it } from "vitest"; + +const here = dirname(fileURLToPath(import.meta.url)); +const repoRoot = resolve(here, ".."); +const reportEntry = join(repoRoot, "src", "report.ts"); +const validateEntry = join(repoRoot, "src", "validate-cli.ts"); + +interface Workspace { + dir: string; + configPath: string; + promptsDir: string; + extractionsDir: string; + sourcePath: string; +} + +const MINIMAL_RULES_YAML = `# Minimal rule set for binding tests. +- id: ci-green-progresses + description: CI green -> ready. 
+ stage: any + priority: 10 + when: + all: + - flag: ci_passing + eq: true + then: + verdict: ready-to-progress + weight: 10 + actions: [] + tags: [test] +`; + +const MINIMAL_SCHEMA_YAML = `ci_passing: + type: boolean + description: ci is green + example: true +`; + +function setupWorkspace(): Workspace { + const dir = mkdtempSync(join(tmpdir(), "rep-poc-flow-")); + const promptsDir = join(dir, "prompts"); + const extractionsDir = join(dir, "extractions"); + const reportsDir = join(dir, "reports"); + const rulesDir = join(dir, "rules"); + const sourceDir = join(dir, "src-target"); + mkdirSync(promptsDir); + mkdirSync(extractionsDir); + mkdirSync(reportsDir); + mkdirSync(rulesDir); + mkdirSync(sourceDir); + writeFileSync(join(rulesDir, "rules.yaml"), MINIMAL_RULES_YAML, "utf8"); + writeFileSync(join(rulesDir, "schema.yaml"), MINIMAL_SCHEMA_YAML, "utf8"); + const sourcePath = join(sourceDir, "spec.md"); + writeFileSync(sourcePath, "# Spec\n\nFeature notes.\n", "utf8"); + + const configBody = { + rules: "rules/rules.yaml", + flagSchema: "rules/schema.yaml", + promptsDir: "prompts", + extractionsDir: "extractions", + reportsDir: "reports", + openBrowser: false, + targets: [ + { + id: "alpha", + label: "Alpha", + paths: ["src-target"], + }, + ], + }; + const configPath = join(dir, "rule-engine.config.json"); + writeFileSync(configPath, JSON.stringify(configBody, null, 2), "utf8"); + return { dir, configPath, promptsDir, extractionsDir, sourcePath }; +} + +function runCli( + entry: string, + args: string[], + cwd: string, +): { status: number | null; stdout: string; stderr: string } { + const r = spawnSync("npx", ["tsx", entry, ...args], { + cwd, + encoding: "utf8", + env: { ...process.env, NO_COLOR: "1" }, + }); + return { + status: r.status, + stdout: r.stdout ?? "", + stderr: r.stderr ?? 
"", + }; +} + +function writeExtraction(ws: Workspace, body: Record<string, unknown>): void { + writeFileSync( + join(ws.extractionsDir, "alpha.json"), + JSON.stringify(body), + "utf8", + ); +} + +function writePrompt(ws: Workspace): void { + // Body content does not matter for the binding decision — only the + // presence of the file does. Hash is recomputed from source on every + // run, not read from this file. + writeFileSync( + join(ws.promptsDir, "alpha.md"), + "# extraction prompt for alpha\n", + "utf8", + ); +} + +describe("report-flow: prompt-extraction binding tied to prompt file presence", () => { + it("fails validation when prompt file exists and extraction omits __prompt_hash", () => { + const ws = setupWorkspace(); + writePrompt(ws); + writeExtraction(ws, { ci_passing: true }); + + const r = runCli(reportEntry, ["--config", ws.configPath, "--quiet"], ws.dir); + expect(r.status).toBe(2); + expect(r.stderr).toMatch(/missing-prompt-hash/); + }, 30_000); + + it("fails validation when prompt file exists and __prompt_hash is stale", () => { + const ws = setupWorkspace(); + writePrompt(ws); + writeExtraction(ws, { ci_passing: true, __prompt_hash: "stale-value" }); + + const r = runCli(reportEntry, ["--config", ws.configPath, "--quiet"], ws.dir); + expect(r.status).toBe(2); + expect(r.stderr).toMatch(/stale-extraction/); + }, 30_000); + + it("does NOT require __prompt_hash when prompt file is absent (fixture flow)", () => { + const ws = setupWorkspace(); + // No writePrompt(ws) — single-shot / fixture flow. 
+ writeExtraction(ws, { ci_passing: true }); + + const r = runCli(reportEntry, ["--config", ws.configPath, "--quiet"], ws.dir); + expect(r.status).toBe(0); + expect(r.stderr).not.toMatch(/missing-prompt-hash/); + }, 30_000); + + it("validate CLI also requires __prompt_hash when prompt file exists", () => { + const ws = setupWorkspace(); + writePrompt(ws); + writeExtraction(ws, { ci_passing: true }); + + const r = runCli(validateEntry, ["--config", ws.configPath], ws.dir); + expect(r.status).toBe(1); + expect(r.stdout + r.stderr).toMatch(/missing-prompt-hash/); + }, 30_000); + + it("validate CLI skips hash check when prompt file is absent", () => { + const ws = setupWorkspace(); + writeExtraction(ws, { ci_passing: true }); + + const r = runCli(validateEntry, ["--config", ws.configPath], ws.dir); + expect(r.status).toBe(0); + expect(r.stdout + r.stderr).not.toMatch(/missing-prompt-hash/); + }, 30_000); + + it("operator deleting only the sidecar does NOT disable the staleness check", () => { + // Reproduce the bypass the reviewer flagged: prompt file is still + // present, sidecar is missing (or never existed). Validation MUST + // still recompute and require the hash. + const ws = setupWorkspace(); + writePrompt(ws); + // No sidecar at all. + writeExtraction(ws, { ci_passing: true }); + + const r = runCli(reportEntry, ["--config", ws.configPath, "--quiet"], ws.dir); + expect(r.status).toBe(2); + expect(r.stderr).toMatch(/missing-prompt-hash/); + }, 30_000); +}); + +describe("report-flow: --skip-validate stderr warning", () => { + it("prints a loud warning on stderr when --skip-validate is set", () => { + const ws = setupWorkspace(); + writePrompt(ws); + // Extraction would otherwise fail (no __prompt_hash) but + // --skip-validate is supposed to bypass validation entirely. 
+ writeExtraction(ws, { ci_passing: true }); + + const r = runCli( + reportEntry, + ["--config", ws.configPath, "--quiet", "--skip-validate"], + ws.dir, + ); + expect(r.stderr).toContain( + "[report] WARNING: --skip-validate is set for 'alpha'; validation gate disabled. This is for debugging only.", + ); + // Report still emits successfully despite the missing hash. + expect(r.status).toBe(0); + }, 30_000); + + it("does NOT print the skip-validate warning when the flag is absent", () => { + const ws = setupWorkspace(); + // No prompt file -> validation runs but passes without hash binding. + writeExtraction(ws, { ci_passing: true }); + + const r = runCli(reportEntry, ["--config", ws.configPath, "--quiet"], ws.dir); + expect(r.stderr).not.toContain("--skip-validate is set"); + expect(r.status).toBe(0); + }, 30_000); +}); From 2cf48e480c1d82ba014fb00ecc6701cee8c523fe Mon Sep 17 00:00:00 2001 From: Claude <noreply@anthropic.com> Date: Sun, 17 May 2026 12:43:46 +0000 Subject: [PATCH 20/45] fix(rule-engine-poc): fail closed when prompt hash unrecomputable (Codex round 9 P1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the prompt file exists but collectFiles() throws (target paths deleted / renamed / unreadable), the previous behaviour was to set expectedPromptHash = undefined and continue — silently disabling the stale-extraction check. A renamed source folder could then let report render an old extraction as if it were current. Now both report.ts and validate-cli.ts fail closed: print an explicit error and skip the target with exit code 2. The integrity invariant ('an extraction is checked against the current source') is preserved. Backwards-compat preserved for the fixture / single-shot flow: when prompts/<id>.md doesn't exist, no hash check is attempted, and the catch path above doesn't fire. Suite: 106/106 still passing (no test exercised the silent-downgrade path). 
--- experiments/rule-engine-poc/src/report.ts | 19 ++++++++++--------- .../rule-engine-poc/src/validate-cli.ts | 12 ++++++++++-- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/experiments/rule-engine-poc/src/report.ts b/experiments/rule-engine-poc/src/report.ts index 9171320bd..35edb0b33 100644 --- a/experiments/rule-engine-poc/src/report.ts +++ b/experiments/rule-engine-poc/src/report.ts @@ -109,16 +109,17 @@ for (const target of targets) { flagSchema: schema, }); } catch (err) { - // Sources unavailable (deleted folder, etc.) — fall through without - // hash binding rather than crash. validate will warn separately if - // __prompt_hash is set but unverifiable. - expectedPromptHash = undefined; + // Fail closed (Codex round 9 P1). If we can't recompute the prompt + // hash but the prompt file exists, silently rendering would let a + // stale extraction sneak through after source drift. Surface the + // failure and skip the target rather than downgrade integrity. const msg = err instanceof Error ? err.message : String(err); - if (!quiet) { - console.warn( - `[report] ${target.id}: could not recompute prompt hash (${msg}); skipping staleness check`, - ); - } + console.error( + `[report] ${target.id}: prompt file exists but source files are unreadable (${msg}). ` + + `Cannot verify the extraction is not stale. Restore the sources or re-run 'npm run plan'.`, + ); + worstExitCode = Math.max(worstExitCode, 2); + continue; } } diff --git a/experiments/rule-engine-poc/src/validate-cli.ts b/experiments/rule-engine-poc/src/validate-cli.ts index 1c6e3c0a7..d84c3ba39 100644 --- a/experiments/rule-engine-poc/src/validate-cli.ts +++ b/experiments/rule-engine-poc/src/validate-cli.ts @@ -83,8 +83,16 @@ for (const target of targets) { rules, flagSchema: schema, }); - } catch { - expectedPromptHash = undefined; + } catch (err) { + // Fail closed (Codex round 9 P1) — silently disabling the hash + // check after a source-tree mutation would defeat the point. 
+ const msg = err instanceof Error ? err.message : String(err); + console.error( + `[validate] ${target.id}: prompt file exists but source files are unreadable (${msg}). ` + + `Cannot verify the extraction is not stale. Restore the sources or re-run 'npm run plan'.`, + ); + worstExitCode = Math.max(worstExitCode, 2); + continue; } } const result = validateExtraction( From bc7ef0a1040e67f33fc1030151007329a34609ba Mon Sep 17 00:00:00 2001 From: Claude <noreply@anthropic.com> Date: Sun, 17 May 2026 12:51:26 +0000 Subject: [PATCH 21/45] fix(rule-engine-poc): case-insensitive target ids + prompt-hash-first (Codex round 10) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - config.ts: target-id duplicate check now case-insensitive. Target ids are interpolated into <id>.md / <id>.json / <id>.html filenames, and default macOS / Windows filesystems are case-insensitive — so 'Alpha' and 'alpha' would collide on disk without a config error and one target's artifacts would overwrite another's silently (#525 round 10 P2). - prompt-builder.ts: __prompt_hash is now actually the FIRST key in the response template, not just claimed-first-then-appended-last. Object.fromEntries used to insert the schema keys before the promptHash assignment; JS object key order follows insertion for string keys, so the hash was rendered last. Now we build the object with the hash first, then loop the schema (#525 round 10 P3). - Two new tests cover both behaviours. Note: src/cli-shared.ts and src/validate-cli.ts are dirty in the worktree from an in-flight CLI scaffolding refactor; those changes will land separately when the agent reports back. 
--- experiments/rule-engine-poc/src/config.ts | 13 ++++++++++--- experiments/rule-engine-poc/src/prompt-builder.ts | 13 ++++++++----- experiments/rule-engine-poc/test/config.test.ts | 11 +++++++++++ .../rule-engine-poc/test/prompt-builder.test.ts | 13 +++++++++++++ 4 files changed, 42 insertions(+), 8 deletions(-) diff --git a/experiments/rule-engine-poc/src/config.ts b/experiments/rule-engine-poc/src/config.ts index d4fb7d869..177378f86 100644 --- a/experiments/rule-engine-poc/src/config.ts +++ b/experiments/rule-engine-poc/src/config.ts @@ -102,10 +102,17 @@ function validate(raw: unknown, file: string): RawConfig { `(safe filename slug: letters, digits, hyphen, underscore; first char alphanumeric)`, ); } - if (seen.has(tr.id)) { - throw new Error(`Config ${file} has duplicate target id '${tr.id}'`); + // Compare lowercased ids: target ids are interpolated into filenames + // (<id>.md, <id>.json, <id>.html), and default macOS / Windows + // filesystems are case-insensitive, so 'Alpha' and 'alpha' would + // collide on disk without a config error. + const idKey = tr.id.toLowerCase(); + if (seen.has(idKey)) { + throw new Error( + `Config ${file} has duplicate target id '${tr.id}' (case-insensitive collision)`, + ); } - seen.add(tr.id); + seen.add(idKey); if (typeof tr.label !== "string" || tr.label.length === 0) { throw new Error(`Config ${file} target '${tr.id}' missing string 'label'`); } diff --git a/experiments/rule-engine-poc/src/prompt-builder.ts b/experiments/rule-engine-poc/src/prompt-builder.ts index 36fc583ef..9c637dee1 100644 --- a/experiments/rule-engine-poc/src/prompt-builder.ts +++ b/experiments/rule-engine-poc/src/prompt-builder.ts @@ -26,14 +26,17 @@ export function buildExtractionPrompt(input: BuildPromptInput): string { }) .join("\n"); - const templateObj: Record<string, unknown> = Object.fromEntries( - Object.entries(flagSchema).map(([name, entry]) => [name, entry.example]), - ); + // Build the response template with __prompt_hash FIRST when set. 
+ // JS object key order follows insertion order for string keys, so + // inserting promptHash before the schema entries makes it the first + // key the LLM sees in JSON output — harder to drop accidentally. + const templateObj: Record<string, unknown> = {}; if (promptHash) { - // Make the __prompt_hash field the first key the LLM sees in the - // template so it's harder to drop accidentally. templateObj[PROMPT_HASH_FIELD] = promptHash; } + for (const [name, entry] of Object.entries(flagSchema)) { + templateObj[name] = entry.example; + } const responseTemplate = JSON.stringify(templateObj, null, 2); const promptHashLine = promptHash diff --git a/experiments/rule-engine-poc/test/config.test.ts b/experiments/rule-engine-poc/test/config.test.ts index 1a61b9021..317a7aa2f 100644 --- a/experiments/rule-engine-poc/test/config.test.ts +++ b/experiments/rule-engine-poc/test/config.test.ts @@ -60,6 +60,17 @@ describe("loadConfig", () => { expect(() => loadConfig(file)).toThrow(/safe filename slug/); }); + it("rejects case-colliding target ids (case-insensitive filesystem safety)", () => { + const file = writeTempConfig({ + ...validBody, + targets: [ + { id: "Alpha", label: "A1", paths: ["a"] }, + { id: "alpha", label: "A2", paths: ["b"] }, + ], + }); + expect(() => loadConfig(file)).toThrow(/case-insensitive collision/); + }); + it("accepts kebab-case and snake_case target ids", () => { const file = writeTempConfig({ ...validBody, diff --git a/experiments/rule-engine-poc/test/prompt-builder.test.ts b/experiments/rule-engine-poc/test/prompt-builder.test.ts index a0b4d196d..dccb2ad66 100644 --- a/experiments/rule-engine-poc/test/prompt-builder.test.ts +++ b/experiments/rule-engine-poc/test/prompt-builder.test.ts @@ -54,6 +54,19 @@ describe("buildExtractionPrompt", () => { expect(prompt).toContain("`conclusion`"); }); + it("places __prompt_hash first in the response template when set", () => { + const prompt = buildExtractionPrompt({ ...baseInput, promptHash: "abc123def456" 
}); + // The response template JSON should have __prompt_hash before any other key. + // __prompt_hash must appear before the flag-schema keys in the + // response template so the LLM emits it first and is less likely + // to drop the field. + const hashIdx = prompt.indexOf('"__prompt_hash":'); + const ciIdx = prompt.indexOf('"ci_passing":'); + expect(hashIdx).toBeGreaterThan(-1); + expect(ciIdx).toBeGreaterThan(-1); + expect(hashIdx).toBeLessThan(ciIdx); + }); + it("ends with an open <output> tag as a forcing function", () => { const prompt = buildExtractionPrompt(baseInput); expect(prompt.trim().endsWith("<output>")).toBe(true); From 421d82163ffd1487cc9a4f142281cab87f10d780 Mon Sep 17 00:00:00 2001 From: Claude <noreply@anthropic.com> Date: Sun, 17 May 2026 12:52:26 +0000 Subject: [PATCH 22/45] refactor(rule-engine-poc): extract cli-shared seam (architecture pass 2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three CLIs (plan, validate, report) previously duplicated argv parsing, config + rules + schema loading, target filtering, extraction IO, prompt-hash recompute, and exit-code handling. The duplication was the documented cause of two repeated bugs: Codex caught the same JSON-root-validation defect in report.ts (round 6) and cli.ts (round 7), and the fail-closed fix for unrecomputable prompt hashes (round 9 P1) had to be applied to both report.ts and validate-cli.ts. What changed: - src/cli-shared.ts (new, 270 LOC): exports takeOpt / takeFlag / parseStandardArgs, loadCliBaseContext, selectTargets, plus a discriminated-union loadExtractionForTarget that returns { kind: 'ok' | 'missing' | 'invalid-json' | 'non-object' | 'hash-unrecomputable' } and a logExtractionError formatter that preserves the existing stderr text byte-for-byte (test/report-flow asserts those strings). - src/plan.ts: 90 LOC -> 79 LOC. Uses parseStandardArgs + loadCliBaseContext + selectTargets. Schema coverage diff stays here. 
- src/report.ts: 210 LOC -> 120 LOC. Per-target handler is now evaluate -> validate -> render HTML; the defensive-IO scaffolding is gone. --skip-validate and --no-open remain command-specific. - src/validate-cli.ts: 110 LOC -> 62 LOC. Per-target handler is now just validateExtraction + log results + Summary line. - test/cli-shared.test.ts (new): 14 unit tests for takeOpt/takeFlag argv mutation, parseStandardArgs, selectTargets filtering, and loadExtractionForTarget across all five discriminated-union cases. - src/cli.ts, src/engine.ts, src/loader.ts, src/validate.ts, and the prompt + html-report layers are untouched. Verified: 123/123 tests passing, tsc clean, typos clean. All eight report-flow integration tests still pass — message text and exit codes preserved. --- experiments/rule-engine-poc/src/cli-shared.ts | 270 ++++++++++++++++++ experiments/rule-engine-poc/src/plan.ts | 31 +- experiments/rule-engine-poc/src/report.ts | 137 ++------- .../rule-engine-poc/src/validate-cli.ts | 98 ++----- .../rule-engine-poc/test/cli-shared.test.ts | 241 ++++++++++++++++ 5 files changed, 565 insertions(+), 212 deletions(-) create mode 100644 experiments/rule-engine-poc/src/cli-shared.ts create mode 100644 experiments/rule-engine-poc/test/cli-shared.test.ts diff --git a/experiments/rule-engine-poc/src/cli-shared.ts b/experiments/rule-engine-poc/src/cli-shared.ts new file mode 100644 index 000000000..7ee2db586 --- /dev/null +++ b/experiments/rule-engine-poc/src/cli-shared.ts @@ -0,0 +1,270 @@ +// Shared CLI scaffolding for the multi-target CLIs (plan, report, +// validate-cli). The single-shot `cli.ts` uses a different argv shape +// and is intentionally NOT a consumer of this module. +// +// Why this exists: the defensive-IO surface around extraction files +// (JSON-root validation, prompt-file presence as the binding trigger, +// fail-closed on hash-recompute failure) was duplicated across +// `report.ts` and `validate-cli.ts`. 
Codex caught the JSON-root issue +// twice and the fail-closed issue twice — once in each CLI — because +// the fix only landed where it was reviewed. Concentrating the bug +// surface here means the next fix lands in one place. + +import { existsSync, readFileSync } from "node:fs"; +import { join, relative } from "node:path"; +import { + findTarget, + loadConfig, + type ResolvedConfig, + type Target, +} from "./config.js"; +import { collectFiles } from "./context.js"; +import { loadFlagSchema, type FlagSchema } from "./flag-schema.js"; +import { loadRulesFromFile } from "./loader.js"; +import { computePromptHash } from "./prompt-hash.js"; +import type { ExtractionFlags, LoadedRule } from "./types.js"; + +// --------------------------------------------------------------------------- +// 1. Argv parsing +// --------------------------------------------------------------------------- + +export function takeOpt(argv: string[], flag: string): string | undefined { + const i = argv.indexOf(flag); + if (i === -1) return undefined; + const v = argv[i + 1]; + argv.splice(i, 2); + return v; +} + +export function takeFlag(argv: string[], flag: string): boolean { + const i = argv.indexOf(flag); + if (i === -1) return false; + argv.splice(i, 1); + return true; +} + +export interface StandardArgs { + configPath: string; + targetFilter: string | undefined; + quiet: boolean; + remaining: string[]; +} + +// Parse the three standard flags every multi-target CLI accepts: +// `--config`, `--target`, `--quiet`. The remaining argv (post-mutation +// by `takeOpt` / `takeFlag`) is returned so the caller can pull off +// its own command-specific flags. +export function parseStandardArgs(argv: string[]): StandardArgs { + const configPath = takeOpt(argv, "--config") ?? 
"rule-engine.config.json"; + const targetFilter = takeOpt(argv, "--target"); + const quiet = takeFlag(argv, "--quiet"); + return { configPath, targetFilter, quiet, remaining: argv }; +} + +// --------------------------------------------------------------------------- +// 2. Context loading +// --------------------------------------------------------------------------- + +export interface CliBaseContext { + config: ResolvedConfig; + rules: LoadedRule[]; + schema: FlagSchema; +} + +export function loadCliBaseContext(configPath: string): CliBaseContext { + const config = loadConfig(configPath); + const rules = loadRulesFromFile(config.rulesPath); + const schema = loadFlagSchema(config.flagSchemaPath); + return { config, rules, schema }; +} + +// --------------------------------------------------------------------------- +// 3. Target selection +// --------------------------------------------------------------------------- + +export function selectTargets( + config: ResolvedConfig, + targetFilter?: string, +): Target[] { + return targetFilter ? [findTarget(config, targetFilter)] : config.targets; +} + +// --------------------------------------------------------------------------- +// 4. 
Extraction IO with discriminated-union result +// --------------------------------------------------------------------------- + +export type ExtractionLoadResult = + | { + kind: "ok"; + flags: ExtractionFlags; + extractionPath: string; + promptFileExists: boolean; + expectedPromptHash: string | undefined; + } + | { kind: "missing"; targetId: string; extractionPath: string } + | { + kind: "invalid-json"; + targetId: string; + extractionPath: string; + error: string; + } + | { + kind: "non-object"; + targetId: string; + extractionPath: string; + got: string; + } + | { + kind: "hash-unrecomputable"; + targetId: string; + promptPath: string; + error: string; + }; + +export type ExtractionLoadError = Exclude< + ExtractionLoadResult, + { kind: "ok" } +>; + +// Load the per-target extraction file, validate it's a JSON object, +// and recompute the prompt hash if a prompt file exists. +// +// Binding trigger = presence of `prompts/<id>.md` (the prompt file), +// NOT the diagnostic sidecar. An operator who deletes the sidecar +// must NOT thereby disable the staleness check (research/12 S2-2). +// Single-shot / fixture flows that bypass the plan step have no +// prompt file, so they keep the legacy no-binding behaviour. +// +// Recomputing the hash (rather than trusting the sidecar) ensures a +// real source-file change always invalidates the extraction. The +// sidecar is plain text and would be trivially pasteable into the +// extraction's __prompt_hash by an operator under deadline pressure +// (critic research/14). +// +// Fail-closed semantics for `hash-unrecomputable` (Codex round 9 P1): +// if the prompt file exists but we cannot read its sources, we surface +// the failure as a distinct error kind rather than silently downgrading +// integrity. The caller is expected to treat this as exit code 2. 
+export function loadExtractionForTarget( + target: Target, + ctx: CliBaseContext, +): ExtractionLoadResult { + const { config, rules, schema } = ctx; + const extractionPath = join( + config.extractionsDirPath, + `${target.id}.json`, + ); + + if (!existsSync(extractionPath)) { + return { kind: "missing", targetId: target.id, extractionPath }; + } + + let parsed: unknown; + try { + parsed = JSON.parse(readFileSync(extractionPath, "utf8")); + } catch (err) { + const msg = err instanceof Error ? err.message : String(err); + return { + kind: "invalid-json", + targetId: target.id, + extractionPath, + error: msg, + }; + } + + if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) { + const got = + parsed === null + ? "null" + : Array.isArray(parsed) + ? "array" + : typeof parsed; + return { + kind: "non-object", + targetId: target.id, + extractionPath, + got, + }; + } + + const flags = parsed as ExtractionFlags; + + const promptPath = join(config.promptsDirPath, `${target.id}.md`); + const promptFileExists = existsSync(promptPath); + let expectedPromptHash: string | undefined; + if (promptFileExists) { + try { + const files = collectFiles(target.paths, { baseDir: config.configDir }); + expectedPromptHash = computePromptHash({ + targetId: target.id, + files, + rules, + flagSchema: schema, + }); + } catch (err) { + const msg = err instanceof Error ? err.message : String(err); + return { + kind: "hash-unrecomputable", + targetId: target.id, + promptPath, + error: msg, + }; + } + } + + return { + kind: "ok", + flags, + extractionPath, + promptFileExists, + expectedPromptHash, + }; +} + +// --------------------------------------------------------------------------- +// 5. Error-logging helper +// --------------------------------------------------------------------------- + +// Emit the per-kind stderr message for a non-`ok` extraction load. 
+// Message text is preserved byte-for-byte from the original CLIs +// because `test/report-flow.test.ts` and Codex's review comments +// assert specific phrasing. The `name` parameter is the bracketed CLI +// tag ("report" or "validate") — the original strings used these tags. +export function logExtractionError( + name: string, + err: ExtractionLoadError, + configDir: string, +): void { + switch (err.kind) { + case "missing": { + console.error( + `[${name}] missing extraction for '${err.targetId}': ${relative(configDir, err.extractionPath)}`, + ); + if (name === "report") { + console.error( + ` run: npm run plan -- --target ${err.targetId} (paste the prompt into your AI tool, save the JSON here)`, + ); + } + return; + } + case "invalid-json": { + console.error( + `[${name}] invalid JSON in ${err.extractionPath}: ${err.error}`, + ); + return; + } + case "non-object": { + console.error( + `[${name}] extraction ${err.extractionPath} must be a JSON object (got ${err.got})`, + ); + return; + } + case "hash-unrecomputable": { + console.error( + `[${name}] ${err.targetId}: prompt file exists but source files are unreadable (${err.error}). ` + + `Cannot verify the extraction is not stale. 
Restore the sources or re-run 'npm run plan'.`, + ); + return; + } + } +} diff --git a/experiments/rule-engine-poc/src/plan.ts b/experiments/rule-engine-poc/src/plan.ts index 592436922..dd8dae8d2 100644 --- a/experiments/rule-engine-poc/src/plan.ts +++ b/experiments/rule-engine-poc/src/plan.ts @@ -5,29 +5,20 @@ import { mkdirSync, writeFileSync } from "node:fs"; import { join, relative } from "node:path"; -import { loadConfig, findTarget } from "./config.js"; +import { + loadCliBaseContext, + parseStandardArgs, + selectTargets, +} from "./cli-shared.js"; import { collectFiles } from "./context.js"; -import { diffSchemaCoverage, loadFlagSchema } from "./flag-schema.js"; -import { loadRulesFromFile } from "./loader.js"; +import { diffSchemaCoverage } from "./flag-schema.js"; import { buildExtractionPrompt } from "./prompt-builder.js"; import { computePromptHash, hashSidecarPath } from "./prompt-hash.js"; -import type { Target } from "./config.js"; -function takeOpt(argv: string[], flag: string): string | undefined { - const i = argv.indexOf(flag); - if (i === -1) return undefined; - const v = argv[i + 1]; - argv.splice(i, 2); - return v; -} - -const argv = process.argv.slice(2); -const configPath = takeOpt(argv, "--config") ?? "rule-engine.config.json"; -const onlyTarget = takeOpt(argv, "--target"); +const { configPath, targetFilter } = parseStandardArgs(process.argv.slice(2)); -const config = loadConfig(configPath); -const rules = loadRulesFromFile(config.rulesPath); -const schema = loadFlagSchema(config.flagSchemaPath); +const ctx = loadCliBaseContext(configPath); +const { config, rules, schema } = ctx; // Collect every flag referenced by rules so we can warn on schema drift. const ruleFlags = new Set<string>(); @@ -50,9 +41,7 @@ if (coverage.unused.length > 0) { mkdirSync(config.promptsDirPath, { recursive: true }); -const targets: Target[] = onlyTarget - ? 
[findTarget(config, onlyTarget)] - : config.targets; +const targets = selectTargets(config, targetFilter); for (const target of targets) { const files = collectFiles(target.paths, { baseDir: config.configDir }); diff --git a/experiments/rule-engine-poc/src/report.ts b/experiments/rule-engine-poc/src/report.ts index 35edb0b33..f796bbead 100644 --- a/experiments/rule-engine-poc/src/report.ts +++ b/experiments/rule-engine-poc/src/report.ts @@ -3,125 +3,44 @@ // engine, render HTML, optionally open in the browser. Per-target HTML // is written to config.reportsDir. -import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs"; +import { mkdirSync, writeFileSync } from "node:fs"; import { join, relative, resolve } from "node:path"; -import { loadConfig, findTarget } from "./config.js"; -import { collectFiles } from "./context.js"; +import { + loadCliBaseContext, + loadExtractionForTarget, + logExtractionError, + parseStandardArgs, + selectTargets, + takeFlag, +} from "./cli-shared.js"; import { evaluate } from "./engine.js"; -import { loadFlagSchema } from "./flag-schema.js"; -import { loadRulesFromFile } from "./loader.js"; import { renderHtmlReport } from "./html-report.js"; import { openInBrowser } from "./open-browser.js"; -import { computePromptHash } from "./prompt-hash.js"; import { validateExtraction } from "./validate.js"; -import type { ExtractionFlags, Verdict } from "./types.js"; import type { Target } from "./config.js"; - -function takeOpt(argv: string[], flag: string): string | undefined { - const i = argv.indexOf(flag); - if (i === -1) return undefined; - const v = argv[i + 1]; - argv.splice(i, 2); - return v; -} -function takeFlag(argv: string[], flag: string): boolean { - const i = argv.indexOf(flag); - if (i === -1) return false; - argv.splice(i, 1); - return true; -} +import type { Verdict } from "./types.js"; const argv = process.argv.slice(2); -const configPath = takeOpt(argv, "--config") ?? 
"rule-engine.config.json"; -const onlyTarget = takeOpt(argv, "--target"); -const noOpen = takeFlag(argv, "--no-open"); -const quiet = takeFlag(argv, "--quiet"); - -const skipValidate = takeFlag(argv, "--skip-validate"); +const { configPath, targetFilter, quiet, remaining } = parseStandardArgs(argv); +const noOpen = takeFlag(remaining, "--no-open"); +const skipValidate = takeFlag(remaining, "--skip-validate"); -const config = loadConfig(configPath); -const rules = loadRulesFromFile(config.rulesPath); -const schema = loadFlagSchema(config.flagSchemaPath); +const ctx = loadCliBaseContext(configPath); +mkdirSync(ctx.config.reportsDirPath, { recursive: true }); -mkdirSync(config.reportsDirPath, { recursive: true }); - -const targets: Target[] = onlyTarget - ? [findTarget(config, onlyTarget)] - : config.targets; +const targets = selectTargets(ctx.config, targetFilter); let worstExitCode = 0; const summary: { target: Target; verdict: Verdict; reportPath: string }[] = []; for (const target of targets) { - const extractionPath = join(config.extractionsDirPath, `${target.id}.json`); - if (!existsSync(extractionPath)) { - console.error( - `[report] missing extraction for '${target.id}': ${relative(config.configDir, extractionPath)}`, - ); - console.error( - ` run: npm run plan -- --target ${target.id} (paste the prompt into your AI tool, save the JSON here)`, - ); + const loaded = loadExtractionForTarget(target, ctx); + if (loaded.kind !== "ok") { + logExtractionError("report", loaded, ctx.config.configDir); worstExitCode = Math.max(worstExitCode, 2); continue; } - - let flags: ExtractionFlags; - try { - const parsed: unknown = JSON.parse(readFileSync(extractionPath, "utf8")); - if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) { - console.error( - `[report] extraction ${extractionPath} must be a JSON object (got ${parsed === null ? "null" : Array.isArray(parsed) ? 
"array" : typeof parsed})`, - ); - worstExitCode = Math.max(worstExitCode, 2); - continue; - } - flags = parsed as ExtractionFlags; - } catch (err) { - const msg = err instanceof Error ? err.message : String(err); - console.error(`[report] invalid JSON in ${extractionPath}: ${msg}`); - worstExitCode = Math.max(worstExitCode, 2); - continue; - } - - // Recompute the prompt hash from CURRENT source + rules + schema. - // Trusting only the sidecar (`prompts/<id>.hash.txt`) would let an - // operator under deadline pressure paste the sidecar value into the - // extraction's __prompt_hash to bypass staleness detection — the - // sidecar is plain text. Recomputing means a real source-file change - // always invalidates the extraction. Critic research/14 flagged this - // as the highest-leverage fix in the post-validate workflow. - // - // Binding trigger = presence of `prompts/<id>.md` (the prompt file), - // NOT the diagnostic sidecar. An operator who deletes the sidecar - // must NOT thereby disable the staleness check (research/12 S2-2). - // Single-shot / fixture flows that bypass the plan step have no - // prompt file, so they keep the legacy no-binding behaviour. - const promptPath = join(config.promptsDirPath, `${target.id}.md`); - const promptFileExists = existsSync(promptPath); - let expectedPromptHash: string | undefined; - if (promptFileExists) { - try { - const files = collectFiles(target.paths, { baseDir: config.configDir }); - expectedPromptHash = computePromptHash({ - targetId: target.id, - files, - rules, - flagSchema: schema, - }); - } catch (err) { - // Fail closed (Codex round 9 P1). If we can't recompute the prompt - // hash but the prompt file exists, silently rendering would let a - // stale extraction sneak through after source drift. Surface the - // failure and skip the target rather than downgrade integrity. - const msg = err instanceof Error ? 
err.message : String(err); - console.error( - `[report] ${target.id}: prompt file exists but source files are unreadable (${msg}). ` + - `Cannot verify the extraction is not stale. Restore the sources or re-run 'npm run plan'.`, - ); - worstExitCode = Math.max(worstExitCode, 2); - continue; - } - } + const { flags, extractionPath, expectedPromptHash } = loaded; if (skipValidate) { console.error( @@ -130,7 +49,7 @@ for (const target of targets) { } if (!skipValidate) { - const v = validateExtraction(flags as Record<string, unknown>, schema, { + const v = validateExtraction(flags as Record<string, unknown>, ctx.schema, { expectedPromptHash, }); if (!v.ok) { @@ -153,21 +72,21 @@ for (const target of targets) { } } - const result = evaluate(rules, flags); + const result = evaluate(ctx.rules, flags); const html = renderHtmlReport(result, { - rulesPath: relative(config.configDir, config.rulesPath), - flagsPath: relative(config.configDir, extractionPath), + rulesPath: relative(ctx.config.configDir, ctx.config.rulesPath), + flagsPath: relative(ctx.config.configDir, extractionPath), flags, generatedAt: new Date().toISOString(), promptHash: expectedPromptHash, }); - const reportPath = join(config.reportsDirPath, `${target.id}.html`); + const reportPath = join(ctx.config.reportsDirPath, `${target.id}.html`); writeFileSync(reportPath, html, "utf8"); summary.push({ target, verdict: result.verdict, reportPath }); if (!quiet) { console.log( - `[report] ${target.id}: ${result.verdict.toUpperCase()} -> ${relative(config.configDir, reportPath)}`, + `[report] ${target.id}: ${result.verdict.toUpperCase()} -> ${relative(ctx.config.configDir, reportPath)}`, ); } if (result.verdict === "blocked") { @@ -185,14 +104,14 @@ if (!quiet && summary.length > 0) { // Best-effort browser open. Only opens the first report when multiple // targets are present; rest are linked from console paths. 
-if (config.openBrowser && !noOpen && summary.length > 0) { +if (ctx.config.openBrowser && !noOpen && summary.length > 0) { const first = resolve(summary[0]!.reportPath); const ok = await openInBrowser(first); if (!quiet) { console.log(""); console.log( ok - ? `[report] opened ${relative(config.configDir, first)} in default browser (best-effort)` + ? `[report] opened ${relative(ctx.config.configDir, first)} in default browser (best-effort)` : `[report] could not spawn a browser; open manually: file://${first}`, ); } diff --git a/experiments/rule-engine-poc/src/validate-cli.ts b/experiments/rule-engine-poc/src/validate-cli.ts index d84c3ba39..5f619daa3 100644 --- a/experiments/rule-engine-poc/src/validate-cli.ts +++ b/experiments/rule-engine-poc/src/validate-cli.ts @@ -4,101 +4,35 @@ // is clean, 1 when any target has errors, 2 when any extraction is // missing or unreadable. Intended as a pre-flight before `report`. -import { existsSync, readFileSync } from "node:fs"; -import { join, relative } from "node:path"; -import { loadConfig, findTarget } from "./config.js"; -import { collectFiles } from "./context.js"; -import { loadFlagSchema } from "./flag-schema.js"; -import { loadRulesFromFile } from "./loader.js"; -import { computePromptHash } from "./prompt-hash.js"; +import { + loadCliBaseContext, + loadExtractionForTarget, + logExtractionError, + parseStandardArgs, + selectTargets, +} from "./cli-shared.js"; import { validateExtraction } from "./validate.js"; -import type { Target } from "./config.js"; -function takeOpt(argv: string[], flag: string): string | undefined { - const i = argv.indexOf(flag); - if (i === -1) return undefined; - const v = argv[i + 1]; - argv.splice(i, 2); - return v; -} - -const argv = process.argv.slice(2); -const configPath = takeOpt(argv, "--config") ?? 
"rule-engine.config.json"; -const onlyTarget = takeOpt(argv, "--target"); - -const config = loadConfig(configPath); -const schema = loadFlagSchema(config.flagSchemaPath); -const rules = loadRulesFromFile(config.rulesPath); +const { configPath, targetFilter } = parseStandardArgs(process.argv.slice(2)); -const targets: Target[] = onlyTarget - ? [findTarget(config, onlyTarget)] - : config.targets; +const ctx = loadCliBaseContext(configPath); +const targets = selectTargets(ctx.config, targetFilter); let worstExitCode = 0; let cleanCount = 0; let issueCount = 0; for (const target of targets) { - const extractionPath = join(config.extractionsDirPath, `${target.id}.json`); - if (!existsSync(extractionPath)) { - console.error( - `[validate] missing extraction for '${target.id}': ${relative(config.configDir, extractionPath)}`, - ); - worstExitCode = Math.max(worstExitCode, 2); - continue; - } - let parsed: unknown; - try { - parsed = JSON.parse(readFileSync(extractionPath, "utf8")); - } catch (err) { - const msg = err instanceof Error ? err.message : String(err); - console.error(`[validate] invalid JSON in ${extractionPath}: ${msg}`); + const loaded = loadExtractionForTarget(target, ctx); + if (loaded.kind !== "ok") { + logExtractionError("validate", loaded, ctx.config.configDir); worstExitCode = Math.max(worstExitCode, 2); continue; } - if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) { - const kind = parsed === null ? "null" : Array.isArray(parsed) ? "array" : typeof parsed; - console.error( - `[validate] extraction ${extractionPath} must be a JSON object (got ${kind})`, - ); - worstExitCode = Math.max(worstExitCode, 2); - continue; - } - // Recompute the prompt hash from current source rather than trusting - // the sidecar — see report.ts and research/14. - // - // Binding trigger = presence of `prompts/<id>.md` (the prompt file), - // NOT the diagnostic sidecar. Deleting the sidecar must NOT disable - // the staleness check (research/12 S2-2). 
Single-shot / fixture flows - // that have no prompt file keep the legacy no-binding behaviour. - const promptPath = join(config.promptsDirPath, `${target.id}.md`); - const promptFileExists = existsSync(promptPath); - let expectedPromptHash: string | undefined; - if (promptFileExists) { - try { - const files = collectFiles(target.paths, { baseDir: config.configDir }); - expectedPromptHash = computePromptHash({ - targetId: target.id, - files, - rules, - flagSchema: schema, - }); - } catch (err) { - // Fail closed (Codex round 9 P1) — silently disabling the hash - // check after a source-tree mutation would defeat the point. - const msg = err instanceof Error ? err.message : String(err); - console.error( - `[validate] ${target.id}: prompt file exists but source files are unreadable (${msg}). ` + - `Cannot verify the extraction is not stale. Restore the sources or re-run 'npm run plan'.`, - ); - worstExitCode = Math.max(worstExitCode, 2); - continue; - } - } const result = validateExtraction( - parsed as Record<string, unknown>, - schema, - { expectedPromptHash }, + loaded.flags as Record<string, unknown>, + ctx.schema, + { expectedPromptHash: loaded.expectedPromptHash }, ); if (result.ok && result.warnings.length === 0) { console.log(`[validate] ${target.id}: OK`); diff --git a/experiments/rule-engine-poc/test/cli-shared.test.ts b/experiments/rule-engine-poc/test/cli-shared.test.ts new file mode 100644 index 000000000..b0c8bc400 --- /dev/null +++ b/experiments/rule-engine-poc/test/cli-shared.test.ts @@ -0,0 +1,241 @@ +// Unit tests for the shared CLI scaffolding extracted from +// plan.ts / report.ts / validate-cli.ts. 
+ +import { mkdirSync, mkdtempSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { describe, expect, it } from "vitest"; +import { + loadCliBaseContext, + loadExtractionForTarget, + parseStandardArgs, + selectTargets, + takeFlag, + takeOpt, +} from "../src/cli-shared.js"; + +const MINIMAL_RULES_YAML = `- id: ci-green-progresses + description: CI green -> ready. + stage: any + priority: 10 + when: + all: + - flag: ci_passing + eq: true + then: + verdict: ready-to-progress + weight: 10 + actions: [] + tags: [test] +`; + +const MINIMAL_SCHEMA_YAML = `ci_passing: + type: boolean + description: ci is green + example: true +`; + +interface MiniWorkspace { + dir: string; + configPath: string; +} + +function setupMiniWorkspace(): MiniWorkspace { + const dir = mkdtempSync(join(tmpdir(), "cli-shared-")); + mkdirSync(join(dir, "prompts")); + mkdirSync(join(dir, "extractions")); + mkdirSync(join(dir, "reports")); + mkdirSync(join(dir, "rules")); + mkdirSync(join(dir, "src-target")); + writeFileSync(join(dir, "rules", "rules.yaml"), MINIMAL_RULES_YAML, "utf8"); + writeFileSync(join(dir, "rules", "schema.yaml"), MINIMAL_SCHEMA_YAML, "utf8"); + writeFileSync(join(dir, "src-target", "spec.md"), "# Spec\n", "utf8"); + const configPath = join(dir, "rule-engine.config.json"); + writeFileSync( + configPath, + JSON.stringify( + { + rules: "rules/rules.yaml", + flagSchema: "rules/schema.yaml", + promptsDir: "prompts", + extractionsDir: "extractions", + reportsDir: "reports", + openBrowser: false, + targets: [ + { id: "alpha", label: "Alpha", paths: ["src-target"] }, + { id: "beta", label: "Beta", paths: ["src-target"] }, + ], + }, + null, + 2, + ), + "utf8", + ); + return { dir, configPath }; +} + +describe("takeOpt", () => { + it("returns the value after the flag and removes both from argv", () => { + const argv = ["--config", "foo.json", "--other"]; + expect(takeOpt(argv, "--config")).toBe("foo.json"); + 
expect(argv).toEqual(["--other"]); + }); + + it("returns undefined and leaves argv untouched when flag is absent", () => { + const argv = ["--other", "x"]; + expect(takeOpt(argv, "--config")).toBeUndefined(); + expect(argv).toEqual(["--other", "x"]); + }); +}); + +describe("takeFlag", () => { + it("returns true and removes the flag when present", () => { + const argv = ["--quiet", "--config", "x"]; + expect(takeFlag(argv, "--quiet")).toBe(true); + expect(argv).toEqual(["--config", "x"]); + }); + + it("returns false and leaves argv untouched when absent", () => { + const argv = ["--config", "x"]; + expect(takeFlag(argv, "--quiet")).toBe(false); + expect(argv).toEqual(["--config", "x"]); + }); +}); + +describe("parseStandardArgs", () => { + it("pulls --config, --target, --quiet and returns remaining argv", () => { + const argv = [ + "--config", + "my.json", + "--target", + "alpha", + "--quiet", + "--extra", + "x", + ]; + const r = parseStandardArgs(argv); + expect(r.configPath).toBe("my.json"); + expect(r.targetFilter).toBe("alpha"); + expect(r.quiet).toBe(true); + expect(r.remaining).toEqual(["--extra", "x"]); + }); + + it("defaults configPath when --config is absent", () => { + const r = parseStandardArgs([]); + expect(r.configPath).toBe("rule-engine.config.json"); + expect(r.targetFilter).toBeUndefined(); + expect(r.quiet).toBe(false); + expect(r.remaining).toEqual([]); + }); +}); + +describe("selectTargets", () => { + it("returns all targets when no filter is given", () => { + const ws = setupMiniWorkspace(); + const ctx = loadCliBaseContext(ws.configPath); + const out = selectTargets(ctx.config); + expect(out.map((t) => t.id)).toEqual(["alpha", "beta"]); + }); + + it("returns a single target when filter matches", () => { + const ws = setupMiniWorkspace(); + const ctx = loadCliBaseContext(ws.configPath); + const out = selectTargets(ctx.config, "beta"); + expect(out.map((t) => t.id)).toEqual(["beta"]); + }); + + it("throws on unknown target id", () => { + const ws 
= setupMiniWorkspace(); + const ctx = loadCliBaseContext(ws.configPath); + expect(() => selectTargets(ctx.config, "ghost")).toThrow( + /Target 'ghost' not found/, + ); + }); +}); + +describe("loadExtractionForTarget", () => { + it("returns ok with flags and computed hash when prompt file is present", () => { + const ws = setupMiniWorkspace(); + const ctx = loadCliBaseContext(ws.configPath); + writeFileSync( + join(ws.dir, "extractions", "alpha.json"), + JSON.stringify({ ci_passing: true }), + "utf8", + ); + writeFileSync( + join(ws.dir, "prompts", "alpha.md"), + "# prompt body\n", + "utf8", + ); + + const result = loadExtractionForTarget(ctx.config.targets[0]!, ctx); + expect(result.kind).toBe("ok"); + if (result.kind !== "ok") return; + expect(result.flags).toEqual({ ci_passing: true }); + expect(result.promptFileExists).toBe(true); + expect(typeof result.expectedPromptHash).toBe("string"); + expect(result.expectedPromptHash).toMatch(/^[0-9a-f]{64}$/); + }); + + it("returns ok with undefined hash when no prompt file (fixture flow)", () => { + const ws = setupMiniWorkspace(); + const ctx = loadCliBaseContext(ws.configPath); + writeFileSync( + join(ws.dir, "extractions", "alpha.json"), + JSON.stringify({ ci_passing: true }), + "utf8", + ); + const result = loadExtractionForTarget(ctx.config.targets[0]!, ctx); + expect(result.kind).toBe("ok"); + if (result.kind !== "ok") return; + expect(result.promptFileExists).toBe(false); + expect(result.expectedPromptHash).toBeUndefined(); + }); + + it("returns missing when extraction file does not exist", () => { + const ws = setupMiniWorkspace(); + const ctx = loadCliBaseContext(ws.configPath); + const result = loadExtractionForTarget(ctx.config.targets[0]!, ctx); + expect(result.kind).toBe("missing"); + }); + + it("returns invalid-json when extraction is not valid JSON", () => { + const ws = setupMiniWorkspace(); + const ctx = loadCliBaseContext(ws.configPath); + writeFileSync( + join(ws.dir, "extractions", "alpha.json"), + "{ 
not json", + "utf8", + ); + const result = loadExtractionForTarget(ctx.config.targets[0]!, ctx); + expect(result.kind).toBe("invalid-json"); + }); + + it("returns non-object when extraction is a JSON array", () => { + const ws = setupMiniWorkspace(); + const ctx = loadCliBaseContext(ws.configPath); + writeFileSync( + join(ws.dir, "extractions", "alpha.json"), + JSON.stringify([1, 2, 3]), + "utf8", + ); + const result = loadExtractionForTarget(ctx.config.targets[0]!, ctx); + expect(result.kind).toBe("non-object"); + if (result.kind !== "non-object") return; + expect(result.got).toBe("array"); + }); + + it("returns non-object when extraction is JSON null", () => { + const ws = setupMiniWorkspace(); + const ctx = loadCliBaseContext(ws.configPath); + writeFileSync( + join(ws.dir, "extractions", "alpha.json"), + "null", + "utf8", + ); + const result = loadExtractionForTarget(ctx.config.targets[0]!, ctx); + expect(result.kind).toBe("non-object"); + if (result.kind !== "non-object") return; + expect(result.got).toBe("null"); + }); +}); From 1d106c2ff6e47e7b18652e0ad55ea73d4c35a6a9 Mon Sep 17 00:00:00 2001 From: Claude <noreply@anthropic.com> Date: Sun, 17 May 2026 12:57:05 +0000 Subject: [PATCH 23/45] docs(rule-engine-poc): commit sample HTML reports for research wave 4 Three rendered reports (ready, blocked, needs-attention) for the agents to inspect when reviewing report readability. 
--- .../sample-reports/blocked-missing-ears.html | 460 ++++++++++++++++++ .../needs-attention-design-risks.html | 460 ++++++++++++++++++ .../sample-reports/ready-implementation.html | 460 ++++++++++++++++++ 3 files changed, 1380 insertions(+) create mode 100644 experiments/rule-engine-poc/research/sample-reports/blocked-missing-ears.html create mode 100644 experiments/rule-engine-poc/research/sample-reports/needs-attention-design-risks.html create mode 100644 experiments/rule-engine-poc/research/sample-reports/ready-implementation.html diff --git a/experiments/rule-engine-poc/research/sample-reports/blocked-missing-ears.html b/experiments/rule-engine-poc/research/sample-reports/blocked-missing-ears.html new file mode 100644 index 000000000..1b3b31e98 --- /dev/null +++ b/experiments/rule-engine-poc/research/sample-reports/blocked-missing-ears.html @@ -0,0 +1,460 @@ +<!doctype html> +<html lang="en"> +<head> + <meta charset="utf-8"> + <meta name="viewport" content="width=device-width,initial-scale=1"> + <title>Rule engine report — fixtures/blocked-missing-ears.json + + + +
    +

    Rule engine report

    +

    Deterministic verdict from extracted flags — "LLM extracts, rules decide".

    +
    + +
    +
    Verdict
    +
    Blocked
    +

    1 of 21 rules matched · 1 suggested action

    +
    + +
    +
    +

    Weighted tally

    + + + +
    Verdict tierWeight
    Blocked100
    Needs attention0
    Ready to progress0
    Unknown0
    +
    + +
    +

    Suggested actions

    +
    • rewrite-non-ears-requirements
    +
    +
    + +
    +

    Extraction flags (input from the Orient quadrant)

    + + + +
    FlagValue
    blockers_count0
    current_stage"requirements"
    feature_slug"auth-refresh"
    open_clarifications_count0
    requirements_acceptance_criteria_testabletrue
    requirements_ears_coverage0.6
    requirements_have_stable_idstrue
    s1_findings_count0
    s2_findings_count0
    s3_findings_count1
    +
    + +
    +

    Audit trail (deterministic order: priority desc, id asc)

    + +
    +
    + did not match +

    any-s1-finding-blocks

    +

    Any S1 (critical) finding blocks all progression.

    +

    + priority: 200 + stage: any + severity gate + e8ff14765412 +

    +
    +
    • s1_findings_count gt=0 → observed=0
    + +
    +
    +
    + did not match +

    any-s2-finding-needs-attention

    +

    S2 (high) finding requires attention this sprint.

    +

    + priority: 100 + stage: any + severity + 35db2ac7fc96 +

    +
    +
    • s2_findings_count gt=0 → observed=0
    + +
    +
    +
    + MATCHED +

    req-ears-mandatory

    +

    All functional requirements must use EARS notation.

    +

    + priority: 100 + stage: requirements + dod requirements gate + a666b2a5ae60 +

    +
    +
    • current_stage in=["requirements","design","specification","tasks","implementation","testing","review"] → observed="requirements"
    • requirements_ears_coverage lt=1 → observed=0.6
    +

    Contributes Blocked with weight 100. Actions: rewrite-non-ears-requirements.

    +
    +
    +
    + did not match +

    blockers-block

    +

    Open blockers must be cleared.

    +

    + priority: 95 + stage: any + gate + 487fa87c0307 +

    +
    +
    • blockers_count gt=0 → observed=0
    + +
    +
    +
    + did not match +

    open-clarifications-block

    +

    Open clarifications must be resolved before stage progression.

    +

    + priority: 95 + stage: any + gate + d8c04e8d77d3 +

    +
    +
    • open_clarifications_count gt=0 → observed=0
    + +
    +
    +
    + did not match +

    testing-ears-coverage-incomplete

    +

    Every EARS clause must have >= 1 test.

    +

    + priority: 95 + stage: testing + dod testing traceability + 4f2c0579c234 +

    +
    +
    • current_stage in=["testing","review"] → observed="requirements"
    • testing_ears_test_coverage lt=1 → observed=undefined (flag missing in extraction)
    + +
    +
    +
    + did not match +

    design-irreversible-needs-adr

    +

    Irreversible architectural decisions must have ADRs.

    +

    + priority: 90 + stage: design + dod design governance + 794480a503e8 +

    +
    +
    • current_stage in=["design","specification","tasks","implementation"] → observed="requirements"
    • design_irreversible_have_adrs eq=false → observed=undefined (flag missing in extraction)
    + +
    +
    +
    + did not match +

    impl-lint-clean

    +

    Implementation must be lint clean.

    +

    + priority: 90 + stage: implementation + dod implementation + 16a3136f316b +

    +
    +
    • current_stage in=["implementation","testing","review"] → observed="requirements"
    • implementation_lint_clean eq=false → observed=undefined (flag missing in extraction)
    + +
    +
    +
    + did not match +

    impl-types-clean

    +

    TypeScript / type checks must pass.

    +

    + priority: 90 + stage: implementation + dod implementation + 95ec962bb145 +

    +
    +
    • current_stage in=["implementation","testing","review"] → observed="requirements"
    • implementation_types_clean eq=false → observed=undefined (flag missing in extraction)
    + +
    +
    +
    + did not match +

    impl-unit-tests-pass

    +

    Unit tests for the changed surface must pass.

    +

    + priority: 90 + stage: implementation + dod implementation + 42e0549ac641 +

    +
    +
    • current_stage in=["implementation","testing","review"] → observed="requirements"
    • implementation_unit_tests_pass eq=false → observed=undefined (flag missing in extraction)
    + +
    +
    +
    + did not match +

    req-has-stable-ids

    +

    Each requirement must have a stable REQ-<AREA>-NNN id.

    +

    + priority: 90 + stage: requirements + dod requirements traceability + d9691abb58b5 +

    +
    +
    • current_stage in=["requirements","design","specification","tasks","implementation","testing","review"] → observed="requirements"
    • requirements_have_stable_ids eq=false → observed=true
    + +
    +
    +
    + did not match +

    review-traceability-incomplete

    +

    Traceability matrix must be complete and consistent.

    +

    + priority: 90 + stage: review + dod review traceability + 402d806974a0 +

    +
    +
    • current_stage eq="review" → observed="requirements"
    • review_traceability_complete eq=false → observed=undefined (flag missing in extraction)
    + +
    +
    +
    + did not match +

    spec-items-trace-to-requirements

    +

    Each spec item must trace to >= 1 requirement.

    +

    + priority: 90 + stage: specification + dod specification traceability + 70ebe8fbcf4b +

    +
    +
    • current_stage in=["specification","tasks","implementation","testing","review"] → observed="requirements"
    • spec_each_item_traces_to_requirement eq=false → observed=undefined (flag missing in extraction)
    + +
    +
    +
    + did not match +

    review-brand-required-but-missing

    +

    Brand review required (touches sites/, UI surfaces) but not posted.

    +

    + priority: 85 + stage: review + dod review brand + 44981daf4238 +

    +
    +
    • current_stage eq="review" → observed="requirements"
    • brand_review_required eq=true → observed=undefined (flag missing in extraction)
    • brand_review_passed eq=false → observed=undefined (flag missing in extraction)
    + +
    +
    +
    + did not match +

    idea-problem-statement-present

    +

    Idea must have a one-paragraph problem statement.

    +

    + priority: 80 + stage: idea + dod idea + 61bd9e6e308f +

    +
    +
    • current_stage eq="idea" → observed="requirements"
    • idea_problem_statement_present eq=false → observed=undefined (flag missing in extraction)
    + +
    +
    +
    + did not match +

    req-acceptance-testable

    +

    Acceptance criteria must be testable.

    +

    + priority: 80 + stage: requirements + dod requirements testability + c37bc45fc232 +

    +
    +
    • current_stage eq="requirements" → observed="requirements"
    • requirements_acceptance_criteria_testable eq=false → observed=true
    + +
    +
    +
    + did not match +

    testing-critical-paths-uncovered

    +

    Critical paths (happy + key edge cases) must be covered.

    +

    + priority: 80 + stage: testing + dod testing + afe3e92d529e +

    +
    +
    • current_stage in=["testing","review"] → observed="requirements"
    • testing_critical_paths_covered eq=false → observed=undefined (flag missing in extraction)
    + +
    +
    +
    + did not match +

    design-risks-have-mitigations

    +

    Identified risks must have mitigations.

    +

    + priority: 70 + stage: design + dod design + 8191d2b9c3a0 +

    +
    +
    • current_stage eq="design" → observed="requirements"
    • design_risks_have_mitigations eq=false → observed=undefined (flag missing in extraction)
    + +
    +
    +
    + did not match +

    idea-scope-bounded

    +

    Idea scope must be bounded (no "boil the ocean" framing).

    +

    + priority: 70 + stage: idea + dod idea + c2fd78dfab7f +

    +
    +
    • current_stage eq="idea" → observed="requirements"
    • idea_scope_bounded eq=false → observed=undefined (flag missing in extraction)
    + +
    +
    +
    + did not match +

    idea-ready

    +

    Idea DoD satisfied — ready for /spec:research.

    +

    + priority: 10 + stage: idea + dod idea + 5d970d8d520c +

    +
    +
    • current_stage eq="idea" → observed="requirements"
    • idea_problem_statement_present eq=true → observed=undefined (flag missing in extraction)
    • idea_target_users_named eq=true → observed=undefined (flag missing in extraction)
    • idea_scope_bounded eq=true → observed=undefined (flag missing in extraction)
    + +
    +
    +
    + did not match +

    impl-ready

    +

    Implementation DoD satisfied — ready for /spec:test.

    +

    + priority: 10 + stage: implementation + dod implementation + c311b2edd5ff +

    +
    +
    • current_stage eq="implementation" → observed="requirements"
    • implementation_lint_clean eq=true → observed=undefined (flag missing in extraction)
    • implementation_types_clean eq=true → observed=undefined (flag missing in extraction)
    • implementation_unit_tests_pass eq=true → observed=undefined (flag missing in extraction)
    • open_clarifications_count eq=0 → observed=0
    • s1_findings_count eq=0 → observed=0
    + +
    +
    + +
    +

    Provenance

    +

    + Engine version: 0.2.0
    + Ruleset hash: 84a35f019743b26e106d82332b2e07cc12979c9ce1fdf4b466550b3b37e25d92
    + Flags hash: 019a8f31666f943bd178a38a89cc5ef99c2d4476993e4b28c478054e381bb680
    + Rules file: rules/quality-gates.yaml
    + Flags file: fixtures/blocked-missing-ears.json +

    +
    + +
    + Generated 2026-05-17T12:56:59.541Z · experiments/rule-engine-poc · same inputs → identical report, byte-for-byte (except this timestamp). +
    + + diff --git a/experiments/rule-engine-poc/research/sample-reports/needs-attention-design-risks.html b/experiments/rule-engine-poc/research/sample-reports/needs-attention-design-risks.html new file mode 100644 index 000000000..10511a642 --- /dev/null +++ b/experiments/rule-engine-poc/research/sample-reports/needs-attention-design-risks.html @@ -0,0 +1,460 @@ + + + + + + Rule engine report — fixtures/needs-attention-design-risks.json + + + +
    +

    Rule engine report

    +

    Deterministic verdict from extracted flags — "LLM extracts, rules decide".

    +
    + +
    +
    Verdict
    +
    Needs attention
    +

    2 of 21 rules matched · 2 suggested actions

    +
    + +
    +
    +

    Weighted tally

    + + + +
    Verdict tierWeight
    Blocked0
    Needs attention100
    Ready to progress0
    Unknown0
    +
    + +
    +

    Suggested actions

    +
    • propose-risk-mitigations
    • schedule-s2-fix
    +
    +
    + +
    +

    Extraction flags (input from the Orient quadrant)

    + + + +
    FlagValue
    blockers_count0
    current_stage"design"
    design_irreversible_have_adrstrue
    design_risks_have_mitigationsfalse
    feature_slug"search-relevance"
    open_clarifications_count0
    requirements_acceptance_criteria_testabletrue
    requirements_ears_coverage1
    requirements_have_stable_idstrue
    s1_findings_count0
    s2_findings_count2
    s3_findings_count3
    +
    + +
    +

    Audit trail (deterministic order: priority desc, id asc)

    + +
    +
    + did not match +

    any-s1-finding-blocks

    +

    Any S1 (critical) finding blocks all progression.

    +

    + priority: 200 + stage: any + severity gate + e8ff14765412 +

    +
    +
    • s1_findings_count gt=0 → observed=0
    + +
    +
    +
    + MATCHED +

    any-s2-finding-needs-attention

    +

    S2 (high) finding requires attention this sprint.

    +

    + priority: 100 + stage: any + severity + 35db2ac7fc96 +

    +
    +
    • s2_findings_count gt=0 → observed=2
    +

    Contributes Needs attention with weight 60. Actions: schedule-s2-fix.

    +
    +
    +
    + did not match +

    req-ears-mandatory

    +

    All functional requirements must use EARS notation.

    +

    + priority: 100 + stage: requirements + dod requirements gate + a666b2a5ae60 +

    +
    +
    • current_stage in=["requirements","design","specification","tasks","implementation","testing","review"] → observed="design"
    • requirements_ears_coverage lt=1 → observed=1
    + +
    +
    +
    + did not match +

    blockers-block

    +

    Open blockers must be cleared.

    +

    + priority: 95 + stage: any + gate + 487fa87c0307 +

    +
    +
    • blockers_count gt=0 → observed=0
    + +
    +
    +
    + did not match +

    open-clarifications-block

    +

    Open clarifications must be resolved before stage progression.

    +

    + priority: 95 + stage: any + gate + d8c04e8d77d3 +

    +
    +
    • open_clarifications_count gt=0 → observed=0
    + +
    +
    +
    + did not match +

    testing-ears-coverage-incomplete

    +

    Every EARS clause must have >= 1 test.

    +

    + priority: 95 + stage: testing + dod testing traceability + 4f2c0579c234 +

    +
    +
    • current_stage in=["testing","review"] → observed="design"
    • testing_ears_test_coverage lt=1 → observed=undefined (flag missing in extraction)
    + +
    +
    +
    + did not match +

    design-irreversible-needs-adr

    +

    Irreversible architectural decisions must have ADRs.

    +

    + priority: 90 + stage: design + dod design governance + 794480a503e8 +

    +
    +
    • current_stage in=["design","specification","tasks","implementation"] → observed="design"
    • design_irreversible_have_adrs eq=false → observed=true
    + +
    +
    +
    + did not match +

    impl-lint-clean

    +

    Implementation must be lint clean.

    +

    + priority: 90 + stage: implementation + dod implementation + 16a3136f316b +

    +
    +
    • current_stage in=["implementation","testing","review"] → observed="design"
    • implementation_lint_clean eq=false → observed=undefined (flag missing in extraction)
    + +
    +
    +
    + did not match +

    impl-types-clean

    +

    TypeScript / type checks must pass.

    +

    + priority: 90 + stage: implementation + dod implementation + 95ec962bb145 +

    +
    +
    • current_stage in=["implementation","testing","review"] → observed="design"
    • implementation_types_clean eq=false → observed=undefined (flag missing in extraction)
    + +
    +
    +
    + did not match +

    impl-unit-tests-pass

    +

    Unit tests for the changed surface must pass.

    +

    + priority: 90 + stage: implementation + dod implementation + 42e0549ac641 +

    +
    +
    • current_stage in=["implementation","testing","review"] → observed="design"
    • implementation_unit_tests_pass eq=false → observed=undefined (flag missing in extraction)
    + +
    +
    +
    + did not match +

    req-has-stable-ids

    +

    Each requirement must have a stable REQ-<AREA>-NNN id.

    +

    + priority: 90 + stage: requirements + dod requirements traceability + d9691abb58b5 +

    +
    +
    • current_stage in=["requirements","design","specification","tasks","implementation","testing","review"] → observed="design"
    • requirements_have_stable_ids eq=false → observed=true
    + +
    +
    +
    + did not match +

    review-traceability-incomplete

    +

    Traceability matrix must be complete and consistent.

    +

    + priority: 90 + stage: review + dod review traceability + 402d806974a0 +

    +
    +
    • current_stage eq="review" → observed="design"
    • review_traceability_complete eq=false → observed=undefined (flag missing in extraction)
    + +
    +
    +
    + did not match +

    spec-items-trace-to-requirements

    +

    Each spec item must trace to >= 1 requirement.

    +

    + priority: 90 + stage: specification + dod specification traceability + 70ebe8fbcf4b +

    +
    +
    • current_stage in=["specification","tasks","implementation","testing","review"] → observed="design"
    • spec_each_item_traces_to_requirement eq=false → observed=undefined (flag missing in extraction)
    + +
    +
    +
    + did not match +

    review-brand-required-but-missing

    +

    Brand review required (touches sites/, UI surfaces) but not posted.

    +

    + priority: 85 + stage: review + dod review brand + 44981daf4238 +

    +
    +
    • current_stage eq="review" → observed="design"
    • brand_review_required eq=true → observed=undefined (flag missing in extraction)
    • brand_review_passed eq=false → observed=undefined (flag missing in extraction)
    + +
    +
    +
    + did not match +

    idea-problem-statement-present

    +

    Idea must have a one-paragraph problem statement.

    +

    + priority: 80 + stage: idea + dod idea + 61bd9e6e308f +

    +
    +
    • current_stage eq="idea" → observed="design"
    • idea_problem_statement_present eq=false → observed=undefined (flag missing in extraction)
    + +
    +
    +
    + did not match +

    req-acceptance-testable

    +

    Acceptance criteria must be testable.

    +

    + priority: 80 + stage: requirements + dod requirements testability + c37bc45fc232 +

    +
    +
    • current_stage eq="requirements" → observed="design"
    • requirements_acceptance_criteria_testable eq=false → observed=true
    + +
    +
    +
    + did not match +

    testing-critical-paths-uncovered

    +

    Critical paths (happy + key edge cases) must be covered.

    +

    + priority: 80 + stage: testing + dod testing + afe3e92d529e +

    +
    +
    • current_stage in=["testing","review"] → observed="design"
    • testing_critical_paths_covered eq=false → observed=undefined (flag missing in extraction)
    + +
    +
    +
    + MATCHED +

    design-risks-have-mitigations

    +

    Identified risks must have mitigations.

    +

    + priority: 70 + stage: design + dod design + 8191d2b9c3a0 +

    +
    +
    • current_stage eq="design" → observed="design"
    • design_risks_have_mitigations eq=false → observed=false
    +

    Contributes Needs attention with weight 40. Actions: propose-risk-mitigations.

    +
    +
    +
    + did not match +

    idea-scope-bounded

    +

    Idea scope must be bounded (no "boil the ocean" framing).

    +

    + priority: 70 + stage: idea + dod idea + c2fd78dfab7f +

    +
    +
    • current_stage eq="idea" → observed="design"
    • idea_scope_bounded eq=false → observed=undefined (flag missing in extraction)
    + +
    +
    +
    + did not match +

    idea-ready

    +

    Idea DoD satisfied — ready for /spec:research.

    +

    + priority: 10 + stage: idea + dod idea + 5d970d8d520c +

    +
    +
    • current_stage eq="idea" → observed="design"
    • idea_problem_statement_present eq=true → observed=undefined (flag missing in extraction)
    • idea_target_users_named eq=true → observed=undefined (flag missing in extraction)
    • idea_scope_bounded eq=true → observed=undefined (flag missing in extraction)
    + +
    +
    +
    + did not match +

    impl-ready

    +

    Implementation DoD satisfied — ready for /spec:test.

    +

    + priority: 10 + stage: implementation + dod implementation + c311b2edd5ff +

    +
    +
    • current_stage eq="implementation" → observed="design"
    • implementation_lint_clean eq=true → observed=undefined (flag missing in extraction)
    • implementation_types_clean eq=true → observed=undefined (flag missing in extraction)
    • implementation_unit_tests_pass eq=true → observed=undefined (flag missing in extraction)
    • open_clarifications_count eq=0 → observed=0
    • s1_findings_count eq=0 → observed=0
    + +
    +
    + +
    +

    Provenance

    +

    + Engine version: 0.2.0
    + Ruleset hash: 84a35f019743b26e106d82332b2e07cc12979c9ce1fdf4b466550b3b37e25d92
    + Flags hash: 24c6c9a7397070759956430d36961066bb2562629610ac44b7581d05b8fa6640
    + Rules file: rules/quality-gates.yaml
    + Flags file: fixtures/needs-attention-design-risks.json +

    +
    + +
    + Generated 2026-05-17T12:57:00.162Z · experiments/rule-engine-poc · same inputs → identical report, byte-for-byte (except this timestamp). +
    + + diff --git a/experiments/rule-engine-poc/research/sample-reports/ready-implementation.html b/experiments/rule-engine-poc/research/sample-reports/ready-implementation.html new file mode 100644 index 000000000..a45fce775 --- /dev/null +++ b/experiments/rule-engine-poc/research/sample-reports/ready-implementation.html @@ -0,0 +1,460 @@ + + + + + + Rule engine report — fixtures/ready-implementation.json + + + +
    +

    Rule engine report

    +

    Deterministic verdict from extracted flags — "LLM extracts, rules decide".

    +
    + +
    +
    Verdict
    +
    Ready to progress
    +

    1 of 21 rules matched · 1 suggested action

    +
    + +
    +
    +

    Weighted tally

    + + + +
    Verdict tierWeight
    Blocked0
    Needs attention0
    Ready to progress100
    Unknown0
    +
    + +
    +

    Suggested actions

    +
    • advance-to-testing
    +
    +
    + +
    +

    Extraction flags (input from the Orient quadrant)

    + + + +
    FlagValue
    blockers_count0
    current_stage"implementation"
    design_irreversible_have_adrstrue
    design_risks_have_mitigationstrue
    feature_slug"rule-engine-poc"
    implementation_lint_cleantrue
    implementation_types_cleantrue
    implementation_unit_tests_passtrue
    open_clarifications_count0
    requirements_acceptance_criteria_testabletrue
    requirements_ears_coverage1
    requirements_have_stable_idstrue
    s1_findings_count0
    s2_findings_count0
    s3_findings_count0
    spec_each_item_traces_to_requirementtrue
    +
    + +
    +

    Audit trail (deterministic order: priority desc, id asc)

    + +
    +
    + did not match +

    any-s1-finding-blocks

    +

    Any S1 (critical) finding blocks all progression.

    +

    + priority: 200 + stage: any + severity gate + e8ff14765412 +

    +
    +
    • s1_findings_count gt=0 → observed=0
    + +
    +
    +
    + did not match +

    any-s2-finding-needs-attention

    +

    S2 (high) finding requires attention this sprint.

    +

    + priority: 100 + stage: any + severity + 35db2ac7fc96 +

    +
    +
    • s2_findings_count gt=0 → observed=0
    + +
    +
    +
    + did not match +

    req-ears-mandatory

    +

    All functional requirements must use EARS notation.

    +

    + priority: 100 + stage: requirements + dod requirements gate + a666b2a5ae60 +

    +
    +
    • current_stage in=["requirements","design","specification","tasks","implementation","testing","review"] → observed="implementation"
    • requirements_ears_coverage lt=1 → observed=1
    + +
    +
    +
    + did not match +

    blockers-block

    +

    Open blockers must be cleared.

    +

    + priority: 95 + stage: any + gate + 487fa87c0307 +

    +
    +
    • blockers_count gt=0 → observed=0
    + +
    +
    +
    + did not match +

    open-clarifications-block

    +

    Open clarifications must be resolved before stage progression.

    +

    + priority: 95 + stage: any + gate + d8c04e8d77d3 +

    +
    +
    • open_clarifications_count gt=0 → observed=0
    + +
    +
    +
    + did not match +

    testing-ears-coverage-incomplete

    +

    Every EARS clause must have >= 1 test.

    +

    + priority: 95 + stage: testing + dod testing traceability + 4f2c0579c234 +

    +
    +
    • current_stage in=["testing","review"] → observed="implementation"
    • testing_ears_test_coverage lt=1 → observed=undefined (flag missing in extraction)
    + +
    +
    +
    + did not match +

    design-irreversible-needs-adr

    +

    Irreversible architectural decisions must have ADRs.

    +

    + priority: 90 + stage: design + dod design governance + 794480a503e8 +

    +
    +
    • current_stage in=["design","specification","tasks","implementation"] → observed="implementation"
    • design_irreversible_have_adrs eq=false → observed=true
    + +
    +
    +
    + did not match +

    impl-lint-clean

    +

    Implementation must be lint clean.

    +

    + priority: 90 + stage: implementation + dod implementation + 16a3136f316b +

    +
    +
    • current_stage in=["implementation","testing","review"] → observed="implementation"
    • implementation_lint_clean eq=false → observed=true
    + +
    +
    +
    + did not match +

    impl-types-clean

    +

    TypeScript / type checks must pass.

    +

    + priority: 90 + stage: implementation + dod implementation + 95ec962bb145 +

    +
    +
    • current_stage in=["implementation","testing","review"] → observed="implementation"
    • implementation_types_clean eq=false → observed=true
    + +
    +
    +
    + did not match +

    impl-unit-tests-pass

    +

    Unit tests for the changed surface must pass.

    +

    + priority: 90 + stage: implementation + dod implementation + 42e0549ac641 +

    +
    +
    • current_stage in=["implementation","testing","review"] → observed="implementation"
    • implementation_unit_tests_pass eq=false → observed=true
    + +
    +
    +
    + did not match +

    req-has-stable-ids

    +

    Each requirement must have a stable REQ-<AREA>-NNN id.

    +

    + priority: 90 + stage: requirements + dod requirements traceability + d9691abb58b5 +

    +
    +
    • current_stage in=["requirements","design","specification","tasks","implementation","testing","review"] → observed="implementation"
    • requirements_have_stable_ids eq=false → observed=true
    + +
    +
    +
    + did not match +

    review-traceability-incomplete

    +

    Traceability matrix must be complete and consistent.

    +

    + priority: 90 + stage: review + dod review traceability + 402d806974a0 +

    +
    +
    • current_stage eq="review" → observed="implementation"
    • review_traceability_complete eq=false → observed=undefined (flag missing in extraction)
    + +
    +
    +
    + did not match +

    spec-items-trace-to-requirements

    +

    Each spec item must trace to >= 1 requirement.

    +

    + priority: 90 + stage: specification + dod specification traceability + 70ebe8fbcf4b +

    +
    +
    • current_stage in=["specification","tasks","implementation","testing","review"] → observed="implementation"
    • spec_each_item_traces_to_requirement eq=false → observed=true
    + +
    +
    +
    + did not match +

    review-brand-required-but-missing

    +

    Brand review required (touches sites/, UI surfaces) but not posted.

    +

    + priority: 85 + stage: review + dod review brand + 44981daf4238 +

    +
    +
    • current_stage eq="review" → observed="implementation"
    • brand_review_required eq=true → observed=undefined (flag missing in extraction)
    • brand_review_passed eq=false → observed=undefined (flag missing in extraction)
    + +
    +
    +
    + did not match +

    idea-problem-statement-present

    +

    Idea must have a one-paragraph problem statement.

    +

    + priority: 80 + stage: idea + dod idea + 61bd9e6e308f +

    +
    +
    • current_stage eq="idea" → observed="implementation"
    • idea_problem_statement_present eq=false → observed=undefined (flag missing in extraction)
    + +
    +
    +
    + did not match +

    req-acceptance-testable

    +

    Acceptance criteria must be testable.

    +

    + priority: 80 + stage: requirements + dod requirements testability + c37bc45fc232 +

    +
    +
    • current_stage eq="requirements" → observed="implementation"
    • requirements_acceptance_criteria_testable eq=false → observed=true
    + +
    +
    +
    + did not match +

    testing-critical-paths-uncovered

    +

    Critical paths (happy + key edge cases) must be covered.

    +

    + priority: 80 + stage: testing + dod testing + afe3e92d529e +

    +
    +
    • current_stage in=["testing","review"] → observed="implementation"
    • testing_critical_paths_covered eq=false → observed=undefined (flag missing in extraction)
    + +
    +
    +
    + did not match +

    design-risks-have-mitigations

    +

    Identified risks must have mitigations.

    +

    + priority: 70 + stage: design + dod design + 8191d2b9c3a0 +

    +
    +
    • current_stage eq="design" → observed="implementation"
    • design_risks_have_mitigations eq=false → observed=true
    + +
    +
    +
    + did not match +

    idea-scope-bounded

    +

    Idea scope must be bounded (no "boil the ocean" framing).

    +

    + priority: 70 + stage: idea + dod idea + c2fd78dfab7f +

    +
    +
    • current_stage eq="idea" → observed="implementation"
    • idea_scope_bounded eq=false → observed=undefined (flag missing in extraction)
    + +
    +
    +
    + did not match +

    idea-ready

    +

    Idea DoD satisfied — ready for /spec:research.

    +

    + priority: 10 + stage: idea + dod idea + 5d970d8d520c +

    +
    +
    • current_stage eq="idea" → observed="implementation"
    • idea_problem_statement_present eq=true → observed=undefined (flag missing in extraction)
    • idea_target_users_named eq=true → observed=undefined (flag missing in extraction)
    • idea_scope_bounded eq=true → observed=undefined (flag missing in extraction)
    + +
    +
    +
    + MATCHED +

    impl-ready

    +

    Implementation DoD satisfied — ready for /spec:test.

    +

    + priority: 10 + stage: implementation + dod implementation + c311b2edd5ff +

    +
    +
    • current_stage eq="implementation" → observed="implementation"
    • implementation_lint_clean eq=true → observed=true
    • implementation_types_clean eq=true → observed=true
    • implementation_unit_tests_pass eq=true → observed=true
    • open_clarifications_count eq=0 → observed=0
    • s1_findings_count eq=0 → observed=0
    +

    Contributes Ready to progress with weight 100. Actions: advance-to-testing.

    +
    +
    + +
    +

    Provenance

    +

    + Engine version: 0.2.0
    + Ruleset hash: 84a35f019743b26e106d82332b2e07cc12979c9ce1fdf4b466550b3b37e25d92
    + Flags hash: 9fa08674155359d5ccf2db199355bbccd0bc5f63674ec29392a9a69ab78d82f9
    + Rules file: rules/quality-gates.yaml
    + Flags file: fixtures/ready-implementation.json +

    +
    + +
    + Generated 2026-05-17T12:56:58.912Z · experiments/rule-engine-poc · same inputs → identical report, byte-for-byte (except this timestamp). +
    + + From 837380dc1ad08987769d3d5b38fd87d683cb8028 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 17 May 2026 12:59:22 +0000 Subject: [PATCH 24/45] chore(typos): exclude generated rule-engine sample HTML reports MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The sample reports under experiments/rule-engine-poc/research/sample-reports/ embed 12-char rule content-hash prefixes that randomly trip typos rules (e.g., 'afe...' -> 'safe'). Excluding the folder is consistent with the existing pattern that allow-lists specific commit-SHA fragments. Also commits research/17 (ux-designer pass on the rendered HTML reports) — top finding is that the audit trail buries the matched rules among ~21 'did not match' siblings; recommended a 'What fired' section + collapse-by-default for skipped rules. --- _typos.toml | 3 + .../research/17-report-ux-audit.md | 61 +++++++++++++++++++ 2 files changed, 64 insertions(+) create mode 100644 experiments/rule-engine-poc/research/17-report-ux-audit.md diff --git a/_typos.toml b/_typos.toml index 042937c45..f2d3f63e9 100644 --- a/_typos.toml +++ b/_typos.toml @@ -45,4 +45,7 @@ extend-exclude = [ "docs/backlog/**", "graph/**", "github-archive/**", + # Generated HTML reports — rule content hashes (12-char prefixes) can + # randomly start with letter sequences typos misreads (e.g., 'afe'). + "experiments/rule-engine-poc/research/sample-reports/**", ] diff --git a/experiments/rule-engine-poc/research/17-report-ux-audit.md b/experiments/rule-engine-poc/research/17-report-ux-audit.md new file mode 100644 index 000000000..5c5a52b25 --- /dev/null +++ b/experiments/rule-engine-poc/research/17-report-ux-audit.md @@ -0,0 +1,61 @@ +# 17 — HTML report UX audit + +UX audit of the rendered HTML report most users will actually open. Focus: visual hierarchy, scanability, the order the eye takes through the page. 
Source under review: the three samples committed at `experiments/rule-engine-poc/research/sample-reports/` against the renderer at `experiments/rule-engine-poc/src/html-report.ts`. Findings here do not repeat the prior audit in [`09-user-flow.md`](09-user-flow.md) — that document covered the workflow's seams; this one is the page itself. + +## First-glance narratives + +**Blocked (`blocked-missing-ears.html`).** The red verdict card lands cleanly — the eye goes straight to the word "Blocked", then to "1 of 21 rules matched · 1 suggested action". The stat line under the verdict reads slightly oddly: when you are blocked, the relevant number is "which one rule blocked you", not "1 of 21". Below the fold, the Weighted-tally table and Suggested-actions list are the right size for the moment, but the alphabetical actions list is just `rewrite-non-ears-requirements` — fine here, less fine when there are five. Scrolling past the flags table the user hits a wall of twenty grey "did not match" cards. The single MATCHED card — the one that actually caused the block — is the third card in the list and visually quieter than it should be: it sits between two skipped rules and uses the same article shell, just with a red 4px left border and a `MATCHED` pill. A user skimming for "what blocked me" has to read every header to find it. The urgency the red card establishes at the top dissipates before the user reaches the cause. + +**Ready (`ready-implementation.html`).** Green verdict card reads cleanly: "Ready to progress · 1 of 21 rules matched · 1 suggested action". The single matched rule (`impl-ready`) is the *last* card in the audit trail because its priority is 10. So a reader who opens the report and scrolls expecting confirmation gets twenty greyed-out skipped rules first, then the one card that says "advance to testing". The report does not feel celebratory or final — it feels indistinguishable from a partial-coverage page until you reach the bottom. 
The top stat reads "1 of 21" which, with no prior exposure, is easy to misread as "5 % coverage, suspicious". + +**Needs attention (`needs-attention-design-risks.html`).** Yellow verdict card. The stats line ("2 of 21 rules matched · 2 suggested actions") is the most useful of the three because two matched rules genuinely is the right number to surface. The two MATCHED cards are scattered through the trail in priority order (positions 2 and ~14), which means you have to scroll the whole page to know whether you are looking at "watch out, you can still ship" or "really, slow down". The amber palette is gentler than the red one but the page itself does not communicate "you can ship, but watch out" — it communicates "here are 21 rules, two of them fired, find them". The actionable middle ground that the verdict tier represents is the least supported by the layout. + +## Findings + +1. **The audit trail dominates the page; the cause-of-verdict is buried.** With 21 rules and 1–2 matched, the report is roughly 95 % "did not match" content by visual area. The matched cards have an accent-coloured 4px border and a coloured pill (`html-report.ts:208–216`), but they sit in the same flow as skipped cards. Recommend: render a **"What fired"** section above the full audit trail, containing only the matched rules. The full audit trail remains below, unchanged, for replay/forensics. Cheap to build (re-filter the same `evaluations` array). + +2. **"Did not match" rows are noise by default.** For a 25-rule trail with two matches, 23 cards exist solely so an auditor can prove the engine evaluated them. That is a forensic concern, not a primary-user concern. Recommend: collapse skipped rules behind a `
<details>` element ("23 rules did not match — expand to inspect"). No JS needed. The audit-trail header already labels the ordering as deterministic; the collapse does not break that. + +3. **Actions are alphabetically sorted; users want priority order.** `actions: sort actions alphabetically` is set in the engine (`docs/architecture.md` section 4) and the renderer just iterates (`html-report.ts:90–92`). For one or two actions this is invisible; for five it is wrong. A developer asking "what do I do first" deserves the action whose contributing rule has the highest priority first. Recommend: emit actions in *contributing-rule priority desc* order; tie-break alphabetically to keep determinism. This is a semantic improvement worth the engine-side change, not a renderer hack. + +4. **The "X of Y rules matched" stat is misleading at first glance.** "1 of 21" looks like coverage; it is actually selectivity. A blocked user reads "1 of 21" and may infer the gate is barely enforced. Recommend rewording in `html-report.ts:252`: for `blocked` / `needs-attention`, lead with the cause count ("1 rule blocked progression"); for `ready-to-progress`, lead with the affirmative ("All gate rules cleared — 1 rule confirmed readiness"). Total counts can be a parenthetical. + +5. **The verdict card is well-sized but visually disconnected from its cause.** The verdict card (red, 6px left border, 28px value) is the strongest element on the page; the matched-rule card that *produced* that verdict is generic. Recommend: in the matched card's `
    `, when the rule contributes the dominant verdict, add a small "drove the verdict" badge. The data is already there (`ev.contribution.verdict === result.verdict`); the renderer just needs to surface it. + +6. **Repetition between Weighted tally, Suggested actions, and matched cards' Contribution lines is real.** The same fact ("Blocked, weight 100, rewrite-non-ears-requirements") appears in: the verdict card, the tally table, the actions list, the matched rule's `contribution` paragraph. Four times for one fact. The tally table is the most expendable when only one tier has weight — recommend collapsing it to a sentence ("All 100 points landed in Blocked.") when 3 of 4 tiers are zero, and showing the full table only when at least two tiers carry weight. The Suggested actions block earns its keep because it is the *do-this* call-out; keep it. + +7. **Provenance has no plain-language framing.** Three 64-char SHA-256s (`html-report.ts:285–308`) sit at the bottom with no callout. A first-time user reads "Ruleset hash: 84a35f0…" and asks "what do I do with this". Recommend a one-line preamble inside `.provenance`: "These hashes let you reproduce this report from the same inputs. If two reports share all four hashes, they are byte-for-byte equivalent (except the timestamp). See `docs/audit-trail.md`." That converts the section from cryptographic decoration to a usable replay handle. Truncating the displayed hash to the first 12 chars (as already done for `promptHash` on line 302) with a `title=` for the full value would also reduce visual weight. + +8. **Non-color signals exist but lean on color anyway.** The `[+]` / `[-]` / `[?]` markers in `html-report.ts:232–234` are real text glyphs and work for the colorblind. Good. But `cond--miss` and `cond--missing` differ only by hue (red vs amber) at the marker — same shape, same position. The `cond--missing` row has the amber background (line 235) which is a strong differentiator; the `cond--miss` row has none. 
Recommend giving `cond--miss` a faint red wash so the *missing vs. mismatched* distinction is row-level, not just glyph-level. The semantic difference (operator/LLM problem vs. domain signal) is load-bearing per the renderer comment on lines 110–113. + +9. **Typography hierarchy is doing its job — except the rule-ID `<code>` inside `<h3>`.** `h3` is 15px, same as body, which makes the rule-card title competitive with the description line right below it. Combined with the rule-ID rendered as `<code>` (15 × 0.92 ≈ 14px on a grey background), the title is weaker than the priority badge beside it. Recommend bumping `h3` to 16–17px and dropping the `<code>` wrapper on the rule ID inside the heading (keep monospace via font-family on just that element, but lose the grey pill background that fights the heading). + +10. **Mobile / 375px holds up reasonably.** The 980px `max-width` + `padding: 32px 24px` collapse to side-padding only on narrow viewports. The summary grid (`grid-template-columns: 1fr 1fr` at `html-report.ts:196`) does not have a media-query fallback — at 375px the tally table and actions list compress to ~155px each, which makes the tally table's right-aligned numeric column awkwardly close to the row label. Recommend a `@media (max-width: 540px) { .summary-grid { grid-template-columns: 1fr; } }` to stack them. The audit trail's per-condition `
<li>` is fine narrow because conditions are single-line text; long observed-value blobs would overflow but no current sample hits that. + +11. **Header is too modest for the artifact it sits on.** "Rule engine report" + a muted subtitle gives no context for which feature / target / run the user is looking at. The HTML `<title>` carries `flagsPath` (line 159) but the visible `<h1>` does not. Recommend echoing the feature slug — already in flags as `feature_slug` — into the visible header: "Rule engine report — `auth-refresh`". Cheap, high-orientation-value. + +12. **No re-run hint or "edit this and re-run" call-out.** Per finding F4 in `09-user-flow.md` the loop is supposed to be human-in-the-loop; nothing on the rendered page tells the user how to act on disagreement. Recommend a one-line footer-adjacent block: "Disagree with a verdict? Edit `extractions/<id>.json` and re-run `npm run report`." This is workflow UX, but it belongs on the page the user closes the loop on. + +## Top recommendations (highest leverage) + +1. **Add a "What fired" section above the audit trail** that lists only matched rules (finding 1). Single biggest readability win; renderer-only change. +2. **Collapse skipped rules behind `<details>` by default** (finding 2). Cuts visible page weight by ~90 % for typical runs; no JS. +3. **Sort suggested actions by contributing-rule priority desc** (finding 3). Requires engine-side change but converts the actions list from a glossary into a to-do. +4. **Reword the verdict-card stats line per verdict tier** (finding 4) and **add a plain-language preamble to Provenance** (finding 7). Two small copy edits, large clarity gain. +5. **Stack the summary grid below ~540px** (finding 10). One media query; restores mobile/narrow-viewport legibility.
+ +## Requirements coverage + +| Request bullet | Where addressed | +|---|---| +| First-glance test (blocked / ready / needs-attention) | "First-glance narratives" | +| Information hierarchy | Findings 1, 5, 11 | +| Repetition / redundancy | Finding 6 | +| Audit trail length | Findings 1, 2 | +| Provenance accessibility for non-experts | Finding 7 | +| Action prioritization | Finding 3 | +| Verdict-tier emotional tone | "First-glance narratives", findings 4–5 | +| Color / non-color signals | Finding 8 | +| Typography hierarchy | Finding 9 | +| Density | Findings 1, 2, 6 | +| Mobile / narrow viewport | Finding 10 | From bc11f5770dd7d9f671752426d14a0002be4dcfa9 Mon Sep 17 00:00:00 2001 From: Claude <noreply@anthropic.com> Date: Sun, 17 May 2026 13:00:48 +0000 Subject: [PATCH 25/45] docs(rule-engine-poc): research wave 4 stakeholder pass (research/18) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Product-strategist pass on the HTML report as a downstream-shared artifact. Three findings: - The report is one artifact serving six first-fields (PR reviewer, PM, EM, QA, compliance, auditor). Recommend one HTML with re-stacked sections rather than reader-specific exports — keep the 'one artifact, many destinations' moat. - Highest-leverage change: expand action slugs ('kick-ci', 'request-reviewer') to human sentences via an actions[].human field on the rule schema. Promote the 'verified' prompt-hash badge next to the verdict. - Introduce label_set config (default 'dev'; 'pm', 'qa', 'compliance' as presentational overrides) so headline labels match the reader's vocabulary. 
--- .../research/18-report-stakeholders.md | 220 ++++++++++++++++++ 1 file changed, 220 insertions(+) create mode 100644 experiments/rule-engine-poc/research/18-report-stakeholders.md diff --git a/experiments/rule-engine-poc/research/18-report-stakeholders.md b/experiments/rule-engine-poc/research/18-report-stakeholders.md new file mode 100644 index 000000000..7dad8aab0 --- /dev/null +++ b/experiments/rule-engine-poc/research/18-report-stakeholders.md @@ -0,0 +1,220 @@ +--- +title: Rule-engine POC — report-as-artifact stakeholder audit +folder: experiments/rule-engine-poc/research +description: Who reads the rendered HTML report, when, and what they need. Frames the report as the artifact that leaves the developer's machine — attached to PRs, screenshotted into Slack, forwarded to PMs, filed by compliance. Earlier passes (03, 13) framed value; this pass frames *audience*. +status: draft +phase: discovery / frame (revisit) +author: product-strategist +inputs: + - experiments/rule-engine-poc/src/html-report.ts + - experiments/rule-engine-poc/research/03-positioning-jtbd.md + - experiments/rule-engine-poc/research/09-user-flow.md + - experiments/rule-engine-poc/research/13-strategy-v2.md +--- + +# 18 — Report-as-artifact stakeholder audit + +The HTML report is the *only* artifact that crosses team boundaries. Inside the +developer's machine, the engine, the YAML rules, and the audit trail are +read-write; outside, they collapse into the rendered HTML. PR attachment, +forwarded link, screenshot in Slack, evidence pack for compliance — every +downstream reader sees only what `html-report.ts` chose to surface. This +audit asks what each of those readers actually needs, where the current +shape serves them, and where it doesn't. + +> Note: a `research/sample-reports/` directory was referenced in the brief +> but is not present on the working filesystem at audit time. 
Findings here +> are grounded in the renderer source (`src/html-report.ts`, the canonical +> spec of every report shape) plus the user-flow audit in `research/09`. + +## 1. Stakeholder × goal × first field + +| Reader | When they open it | Reading goal (one sentence) | First field they look for | +|---|---|---|---| +| **PR-reviewer (dev)** | Linked from a PR comment or CI artifact | "Should I approve, request changes, or punt?" | The verdict pill + the matched-rule list | +| **PM / EM (sprint check-in)** | Slack screenshot or weekly status | "Is this feature on track or stuck, and what do I do about it?" | Verdict + suggested actions | +| **QA lead (release-readiness)** | Pre-release gate review | "Which blockers are real and which are missing-flag noise?" | The audit trail, filtered to `blocked` contributions | +| **Compliance officer (regulated org)** | Pre-launch governance checkpoint | "Can I prove the AI didn't decide this and that the inputs were what they were claimed to be?" | Provenance block (ruleset hash, flags hash, prompt hash, `verified` badge) | +| **External auditor (post-EU AI Act)** | Annual or incident-driven audit | "Is the decision reproducible, traceable to a versioned rule, and bound to its inputs?" | Engine version + ruleset hash + per-rule content hash | +| **Author, one week later** | Cold re-read after context-switch | "What did past-me decide, and what was the *flag* that swung it?" | Extraction flags table | + +Six readers, six different first-fields. Today the report orders content for +the first row only. + +## 2. Information hierarchy — one document with stakeholder-aware sectioning + +I do not recommend reader-specific export views in the POC. A single +self-contained HTML file is the moat (`research/13` §"Migration story"; +no JS, no server, no auth — emailable, archivable, diffable). Splitting it +into per-reader exports breaks the "one artifact, many destinations" +property that makes it survive at all. 
+ +Instead, **re-stack the document so each reader's first field is reachable in +one scroll**. Current order: verdict → tally → actions → flags → audit +trail → provenance. Recommend: + +1. **Verdict + headline action** (covers PR-reviewer + PM) +2. **Provenance block** (covers compliance + auditor; move from footer to fold) +3. **Suggested actions, expanded to sentences** (covers PM + author) +4. **Audit trail** (covers QA + author) +5. **Extraction flags** (covers author, supporting) +6. **Weighted tally** (internal interest; demote) + +Provenance currently sits last because it reads as "appendix." For the +compliance and auditor readers it is the headline, not the appendix. + +## 3. Sharing channels — above the fold + +The top 1000 px of a screenshot is the de facto spec for what gets shared. +Today that span contains: title, verdict pill, "N of M rules matched · K +suggested actions," weighted-tally grid, suggested-actions list. A screenshot +sent to a PM contains a *verdict and slug actions*, not the rule that fired, +not the prompt hash, not the source feature. Three concrete consequences: + +- **Slug actions get screenshotted out of context.** `kick-ci` and + `request-reviewer` mean nothing to a PM and look unprofessional in a + forwarded screenshot. Expand to sentences (see §5). +- **No target identity in the fold.** The current `<title>` is "Rule engine + report — extractions/<file>". The visible H1 is just "Rule engine report." + A reader who receives a screenshot can't tell which feature it covers. + Surface the target id and the source feature path in the H1 or a subline. +- **No "as of" timestamp in the fold.** Generated-at sits in the footer; a + screenshot loses it. Move a short ISO timestamp to the verdict card subline. + +## 4. 
Trust signals — three hashes is right, but the framing is wrong + +The cryptographic content (engine version, ruleset hash, flags hash, prompt +hash) is the *correct* trust signal for the auditor and compliance reader — +both `research/02` (EU AI Act, ISO 42001, NIST RMF) and §3.5 of the +Autonomous Systems Evaluation Standard treat content-hashed evidence as +the gold standard. ([UK AISI AS Evaluation Standard](https://ukgovernmentbeis.github.io/as-evaluation-standard/)) +The problem is presentation: + +- **The `verified` badge is load-bearing but under-emphasised.** Promote it + next to the verdict pill. "Prompt hash verified" is the single most + trust-bearing claim on the page; today it hides in fine print. +- **Hashes need a one-line plain-English caption.** "Same inputs produce an + identical report, byte-for-byte" is already in the footer — move it next + to the hashes themselves so a non-cryptographer understands what + matching hashes *prove*. +- **No signer.** Compliance readers expect a signature or at least an + attestation of *who* ran the report. Add a `signer` field (env user, host, + CI run id when present). Without it, hashes prove integrity but not + attestation. + +## 5. The "what do I do now?" gap + +Today: `kick-ci`, `request-reviewer`, `update-changeset` rendered as +`<code>` chips. Recommendation: every action in the rule pack carries a +`human` field — a complete sentence in the imperative. The HTML renders +that sentence; the slug becomes a hover/title attribute for the dev. PM +reads "Re-run CI — the last attempt failed before any test ran"; dev hovers +to see `kick-ci`. Same artifact, two readers served. This is the single +highest-leverage change for non-developer audiences. + +## 6. Headline language — keep one set internally, render per-audience labels + +The engine vocabulary (`blocked` / `needs-attention` / `ready-to-progress` / +`unknown`) is fine as an internal contract. 
It is *not* universal in the +target audiences: + +| Audience | Idiomatic verdict labels | +|---|---| +| PR reviewer / dev | block / warn / approve (matches GitHub) | +| PM / EM | blocked / at-risk / on-track / unknown | +| QA / release | fail / hold / pass / inconclusive | +| Compliance / auditor | non-conformant / conditional / conformant / insufficient-evidence | + +Recommend a `label_set` field in the config (default `dev`) and store the +audience the report was rendered for in provenance. The verdict slug stays +canonical; the rendered label is a presentational override. This costs +~20 lines in `html-report.ts` and pays for itself the first time a +compliance officer sees "needs-attention" and asks what that maps to in +their RACI. + +## 7. Comparison artifacts — what the field has converged on + +- **sklearn `classification_report`** — terse precision/recall/F1 table with + `accuracy`, `macro avg`, `weighted avg` summary rows. The convention worth + stealing: a *summary row* at the bottom that aggregates the per-class + numbers. Our audit trail lacks an aggregate. ([scikit-learn docs](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html)) +- **LangSmith** — distinguishes Boolean / Categorical / Continuous feedback, + and treats *Boolean* as the right shape for "hard gates that must pass or + fail with zero ambiguity." Verdict is the right shape for our domain; + don't drift toward continuous scores. ([LangChain — LangSmith evaluation](https://docs.langchain.com/langsmith/evaluation)) +- **Inspect AI (UK AISI)** — every eval = dataset + solver + scorer; the eval + log is the artifact. Our audit-trail-per-rule is conceptually identical to + Inspect's per-sample scorer output. Worth aligning vocabulary: "scorer" + output is what we call "evaluation"; "log" is what we call "audit trail." + Aligning vocabulary makes the eventual library readable to AISI-adjacent + evaluators. 
([Inspect AI](https://inspect.aisi.org.uk/)) +- **Weights & Biases reports** — supports PDF and LaTeX export; centralises + evaluations into leaderboards across runs. The export-format pluralism is + the lesson; the centralised dashboard is the seam we flag in §9. ([W&B Reports](https://docs.wandb.ai/models/reports)) + +## 8. Format pluralism — HTML is the source, others are projections + +HTML is the right primary because every reader has a browser and the +artifact survives forwarding. But: + +- **Markdown export** for PR comments and issue bodies. The engine already + produces structured data; render Markdown alongside HTML from the same + `VerdictResult`. +- **Plain-text Slack block** — verdict, top three actions, target id, prompt + hash short form. Fits in a Slack message without an attachment. +- **PDF** — out of scope for the POC, but the self-contained no-JS HTML + prints reasonably; a `--print` CSS branch would be a one-evening add. + +Note: I deliberately do *not* recommend reader-specific HTML variants. One +HTML with re-stacked sections (§2) plus alternate formats (this section) +covers the matrix without breaking the "one artifact" property. + +## 9. Single-target vs portfolio — flag the seam + +A PM with 12 features wants one page with 12 rows, not 12 tabs. Out of +scope for the POC, but the seam is already visible: `report` emits N HTML +files; an `index.html` of "all targets, sorted by verdict" is a 50-line +addition that would shift the report from per-PR artifact to per-program +artifact. This is also the moment the moat shifts from "audit-trail +per decision" to "decision SIEM" — which is GoRules territory and +explicitly out of bounds per `research/13` §"Pricing and packaging." +Flag for the Decider: a portfolio dashboard is the *first* feature that +would push Specorator toward hosted-SaaS gravity. Build the index page; +do not build the dashboard. + +## 10. Recommendations — ranked by leverage + +1. 
**Expand action slugs to human sentences in the report renderer.** + Highest leverage. One-field schema change, unlocks PM and compliance + readers. The slug stays as the canonical id. +2. **Re-stack the document: verdict → provenance → actions → audit → flags + → tally.** Moves the compliance reader's first field above the fold and + the screenshot-reader's first field into the screenshot. Pure ordering; + no new content. +3. **Promote the `verified` badge next to the verdict pill, with a one-line + plain-English caption beside the hash block.** The single most + trust-bearing fact on the page is currently the easiest to miss. +4. **Add target identity, source feature path, and ISO timestamp to the + verdict card subline.** A forwarded screenshot becomes self-contained. +5. **Introduce an audience-aware `label_set` for verdict rendering + (`dev` / `pm` / `qa` / `compliance`) while keeping the engine vocabulary + fixed.** Unblocks adoption beyond dev audiences without touching the + determinism story. + +Anything not on this list — Markdown/PDF/Slack projections, portfolio +index page, signer attestation — is real but secondary. Ship 1–5 inside +the POC; defer the rest with named owners. + +## Handoff + +- **To facilitator:** the report-as-artifact framing is consistent with the + v2 strategy (audit trail is the product) and does not change the North + Star. No pivot flag. +- **To prototyper:** items 1–5 are renderer-only changes. Item 1 needs a + rule-schema field (`actions[].human`); coordinate with the architect on + whether that goes in `rules/quality-gates.yaml` or in a separate + `action-catalog.yaml`. +- **To user-researcher:** the S2 (eng managers) and S3 (compliance) scripts + in `research/16` should add one probe each — "show me the last time you + forwarded a quality report; what did you do with it?" — to validate the + PM/compliance reading goals in §1. 
From 9c494735a9bd841d5661d4be88dc474325159de6 Mon Sep 17 00:00:00 2001 From: Claude <noreply@anthropic.com> Date: Sun, 17 May 2026 13:01:04 +0000 Subject: [PATCH 26/45] docs(rule-engine-poc): research wave 4 brand review (research/19) Brand-reviewer pass on the rendered HTML report. Verdict: pass-with-findings; not S1-blocking while the POC stays under experiments/, but would block on the promotion-to-skill step flagged in research/13. Findings: - On-temperament (no emoji / gradients / icons; ASCII [+]/[-]/[?] markers are correctly monospace-as-iconography; restrained density). - Off-token: 18 distinct literal hex values, literal -apple-system / SFMono-Regular font stacks, page background near-white instead of Specorator cream var(--paper). - Voice close but section headers are bare labels rather than sentence-case-with-period declaratives; 'Suggested actions' is passive against Specorator's imperative voice. - Open decision: Specorator has no red token. blocked tier currently uses literal #fdecea / #d8281b / #7a160d. ADR-shaped choice before graduation: extend colors_and_type.css, repurpose --soft-orange and rename the tier 'at-risk', or stay literal until packaged. --- .../research/19-report-brand-review.md | 67 +++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 experiments/rule-engine-poc/research/19-report-brand-review.md diff --git a/experiments/rule-engine-poc/research/19-report-brand-review.md b/experiments/rule-engine-poc/research/19-report-brand-review.md new file mode 100644 index 000000000..1389ee397 --- /dev/null +++ b/experiments/rule-engine-poc/research/19-report-brand-review.md @@ -0,0 +1,67 @@ +# 19 — HTML report brand review + +> **Brand fit: pass-with-findings.** The renderer at `src/html-report.ts` is restrained, icon-free, emoji-free, gradient-free, and uses no en-dashes — it shares the editorial-document temperament Specorator's design system codifies. 
It is *not* yet using any Specorator token, copy convention, or type stack, so it reads as a generic-but-tasteful audit log rather than as a Specorator surface. Closing the gap is mechanical, not redesign-scale. + +## Scope + +- POC is sandbox-scoped under `experiments/rule-engine-poc/` and not imported by the public site. By Stage-9 brand-reviewer rules it is **not-applicable today** — the trigger surface is `sites/**` and `.claude/skills/specorator-design/**`. +- `research/13-strategy-v2.md` recommends eventual packaging as a Specorator skill. The moment the renderer lands under `.claude/skills/<rule-engine>/` or emits HTML the public site links to, every blocker in `templates/brand-review-checklist.md` applies retroactively. Tightening now is cheap; retrofitting later is the same edits plus an audit. +- Specorator **does** have a formal brand spec — `.claude/skills/specorator-design/colors_and_type.css` (canonical tokens) and `…/README.md` (voice, iconography, density). So this is "adopt the brand that already exists", not "anticipate a brand". + +## S1 — Blocking if packaged as a skill (must fix at promotion) + +1. **Every color is a literal hex; zero tokens are used.** The `VERDICT_PALETTE` (lines 18–46) and the embedded `<style>` block (lines 161–240) hold **18 distinct hex literals**. 
Mapping table: + +| Current literal | Where | Specorator token to use | +|---|---|---| +| `#fdfdfc` (`--bg`, page) | line 162 | `var(--paper)` (`#fbfcf8`) — non-negotiable | +| `#1d1f23` (`--fg`) | line 163 | `var(--ink)` (`#17201b`) | +| `#6b7280` (`--muted`) | line 164 | `var(--muted)` (`#59645e`) | +| `#e3e5ea` (`--border`) | line 165 | `var(--line)` (`#d8ded3`) | +| `#f4f5f7` (`--code-bg`) | line 166 | `rgba(23,32,27,0.06)` (the `.code-chip` background) | +| `#fdecea` / `#d8281b` / `#7a160d` (`blocked`) | lines 23–25 | **no exact brand token — see open Q1** | +| `#fff4e0` / `#d18900` / `#6c4400` (`needs-attention`) | lines 29–31 | `var(--soft-yellow)` + `var(--lane-ship)` + `var(--lane-ship-text)` | +| `#e6f6ec` / `#1f8a4c` / `#114a29` (`ready-to-progress`) | lines 35–37 | `var(--soft-green)` + `var(--lane-define)` + `var(--lane-define-text)` | +| `#eef0f3` / `#737884` / `#3a3d44` (`unknown`) | lines 41–43 | `var(--surface-2)` + `var(--muted)` + `var(--ink)` | +| `#fcfcfb` (matched-rule fill) | line 209 | `var(--surface)` or `var(--surface-2)` | +| `#eef4fb` / `#234e7a` (tag badge) | line 223 | `var(--soft-blue)` + `var(--lane-build-text)` | +| `#e6f6ec` / `#114a29` (verified badge) | line 225 | `var(--soft-green)` + `var(--lane-define-text)` | +| `#1f8a4c` (cond-ok) | line 232 | `var(--lane-define)` | +| `#d8281b` (cond-miss) | line 233 | needs a brand red, or fall back to `var(--ink)` | +| `#d18900` (cond-missing, ×2) | lines 234–235 | `var(--lane-ship)` | + +2. **Page background is `#fdfdfc`, not `var(--paper)`.** Checklist item #4. White is reserved for cards; the page is cream. Move the verdict-card + rule-article fills to `--surface`; the body to `--paper`. + +3. **Font stacks are literal.** Lines 174 (`-apple-system, BlinkMacSystemFont, …`) and 179, 224, 230 (`"SFMono-Regular", Consolas, …`). Checklist item #5 requires `var(--font-sans)` / `var(--font-mono)`. 
Specorator also pulls Inter + JetBrains Mono via the `@import` in `colors_and_type.css` — without those, the report reads as system-UI, not Specorator. + +4. **Section headers violate voice rules.** Specorator headlines are **sentence case ending with a period** (README §"Casing"). The report uses bare labels: `Verdict`, `Weighted tally`, `Suggested actions`, `Extraction flags`, `Audit trail`, `Provenance`. Add the period and keep sentence-case. H1 `Rule engine report` → `Rule engine report.` + +5. **All-caps `VERDICT` / `MATCHED` pills do not match `.status-pill`.** The canonical pill (`colors_and_type.css` line 233) is `--fs-micro` 11px, tracking `0.08em`, weight `--fw-x` (800), with a leading 6×6 colored dot. The renderer's pill is structurally similar but lacks the dot, uses gray + `#fcfcfb`, and uses tighter tracking. Reuse `--soft-green` / `--accent-strong` + the `::before` dot for the matched state. + +## S2 — Warnings if promoted (fixable in a follow-up) + +6. **Lane chips are by-coincidence, not by-intent.** The "tag" badge (line 223) is the same blue as `--lane-build`; the "verified" badge is the green of `--lane-define`. If those read as lane chips, they break the Define = green / Build = blue / Ship = gold mapping (checklist #11). Either treat them as neutral chips (`var(--code-chip)` bg) or commit to a deliberate lane. + +7. **`[+]` / `[-]` / `[?]` ASCII condition markers are on-brand.** They are monospace-as-iconography, the load-bearing pattern in README §Iconography. Keep them. If a Lucide-style icon set is ever tempting, the README requires flagging it to the user first. + +8. **Voice: "Suggested actions" is mealy.** Specorator's voice is imperative, second-person (`Get started`, `Read the workflow`). `Next moves.` or `What to do next.` reads closer. `Audit trail`, `Provenance`, `Weighted tally` all sit fine. + +9. **Density is on-brand; rule-card hierarchy is flat.** The 980px max-width and 32px page padding match `--gutter-page`. 
The 20+ rule-card stack in `blocked-missing-ears.html` is hard to scan. A nit — but Specorator's "clear rows" temperament suggests alternating `--surface` / `--surface-2` bands every five evaluations, or a matched-first sort. + +## S3 — Polish + +- Radius drift: renderer uses 3px / 4px throughout. Brand rhythm is 6 / 8 / 12 / 14. Move pills to `--r-pill`, verdict-card to `--r-md`, rule cards to `--r-md` or `--r-lg`. +- H1 at 24px is fine for a sandbox report; on promotion lift to `var(--fs-h2-panel)`. +- `—`, `·`, `→` are all in brand vocabulary. Keep. + +## Open questions + +1. **Red token.** Specorator has no red. `blocked` verdicts need one. Three paths: (a) add `--lane-block` to `colors_and_type.css` via an ADR; (b) reuse `--soft-orange` + `--diff-add-text` and rename the tier `at-risk`; (c) leave the literal `#d8281b` until graduation. Which? +2. **Cream vs print-white.** `#fdfdfc` was likely picked for print contrast. Confirm `--paper` (`#fbfcf8`) is the right page background even when the report is PDFed. +3. **Skill-packaging timing.** If `research/13` is "maybe", stay token-literal and drop a TODO at the top of `html-report.ts`. If it's "yes", swap tokens in this PR. + +## Three-bullet summary + +- The renderer is **on-temperament** (no emoji, gradients, icons, en-dashes; restrained density; monospace-as-iconography) but **off-token**: 18 literal hex values, literal font stacks, and a near-white page background instead of the cream `--paper`. +- Voice is close — `Audit trail`, `Provenance`, `Weighted tally`, `—`, `→` all match — but section headers are bare labels rather than sentence-case-with-period declaratives, and `Suggested actions` is passive. +- Specorator has no red, which the verdict palette currently needs; flag this as a decision before the renderer is promoted to a skill, and consider extending `colors_and_type.css` rather than letting `#d8281b` proliferate. 
From 3adb87c5d7005bbba193096920c461b878e6f995 Mon Sep 17 00:00:00 2001 From: Claude <noreply@anthropic.com> Date: Sun, 17 May 2026 13:01:44 +0000 Subject: [PATCH 27/45] docs(rule-engine-poc): research wave 4 misread-risk critique (research/21) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Critic pass on the rendered HTML report as a communication artifact. Three findings: - Visual hierarchy contradicts semantic model: severity-first is invisible in the weighted-tally widget (reads as a horse race), alphabetically-sorted action list silently asserts a priority the engine refuses to give, cond--missing vs cond--miss are visually distinguished but never named (colour-blind readers lose the signal). - 'verified' badge is a trust-calibration trap — green pill reads as 'extraction verified' to an auditor when it only means 'bound to current inputs'. Compounded by --skip-validate runs producing HTML indistinguishable from validated ones (research/14 risk 1 leaks into the report layer). - Most dangerous skim path: blocker-by-absence. A high-priority blocker rule whose input flag is missing from the extraction simply doesn't fire; neither verdict tile nor any header-level summary tells the reader 'N higher-priority rules were un-evaluable'. - 3 RATs proposed (verdict-tile-alone, action-list-as-priority, 'verified' interpretation). Default no-go if any fail. 
--- .../research/21-report-misread-risks.md | 211 ++++++++++++++++++ 1 file changed, 211 insertions(+) create mode 100644 experiments/rule-engine-poc/research/21-report-misread-risks.md diff --git a/experiments/rule-engine-poc/research/21-report-misread-risks.md b/experiments/rule-engine-poc/research/21-report-misread-risks.md new file mode 100644 index 000000000..bc3ab31f9 --- /dev/null +++ b/experiments/rule-engine-poc/research/21-report-misread-risks.md @@ -0,0 +1,211 @@ +# 21 — Report misread risks (HTML report as communication artifact) + +Critic pass on the rendered HTML emitted by `src/html-report.ts`. Prior +critiques (`05`, `07`, `14`) covered the engine and the workflow; this one +asks: given the artifact a reader actually opens in a browser, where does +it mislead, invite the wrong action, or earn unearned trust? + +Note on evidence: the prompt referred to three committed sample reports +under `research/sample-reports/`. That directory is not present in the +working tree at the time of writing — only `fixtures/*.json` and the +renderer in `src/html-report.ts`. I quote the renderer source and the +fixtures directly (they fully determine the rendered output); calling +that out so this critique is not mistaken for evidence-of-eye review. + +## 1. The skim trap: verdict tile + action list, nothing else + +The verdict tile renders verdict label, then a stats line: + +> `${matchedCount} of ${totalCount} rules matched · ${result.actions.length} suggested actions` + +A "ready-to-progress" tile next to "3 of 21 rules matched" is structurally +ambiguous. The reader sees the green tile and skims past `3 of 21`. But in +this codebase that ratio is *normal* — most rules are stage-gated and +intentionally don't fire. A reader who has internalised "high coverage = +trustworthy" from test reports will misread it the other way: "only 3 +fired, so I have low confidence in the green". Neither reading is +calibrated. 
+ +Worse, severity-first means the verdict can be `ready-to-progress` because +exactly one positive rule fired (e.g., `idea-ready`, weight 100) — *and +no blocker fired because the flags those blockers depend on were absent +from the extraction*. The renderer makes no distinction between "blocker +checked and clean" and "blocker not evaluable" in the verdict tile. + +## 2. Tally-vs-verdict tension + +The weighted-tally table shows e.g. + +``` +Blocked 80 +Needs attention 60 +Ready to progress 100 +``` + +with the verdict tile reading `Blocked`. To a skimmer, the tally column +reads as a horse-race the green column won. Severity-first is invisible in +the tally widget — there is no annotation that says "Blocked wins +regardless of magnitude". Reader risk: opens the report, sees one red rule +contributing 80 vs a hundred green points, concludes the engine is +overreacting, overrides. Recommend a `(severity-first: highest non-zero +tier wins)` caption rendered above the table, or shading the winning row. + +## 3. `cond--missing` vs `cond--miss` — same shape, different epistemology + +These render as `[?]` (orange) and `[-]` (red) bullets respectively. The +visual distinction is real, but the labels in the report are operator +shorthand. The reader is not told: + +- `[?]` = **the LLM never decided** — the extraction did not supply this + flag. The world might be fine; we don't know. +- `[-]` = **the LLM decided and the value lost the check** — a domain + signal that the rule disagrees. + +The reason text (`(flag missing in extraction)`) is the only differentiator +and it's set in 13px italic `var(--muted)` grey. A reader who is colour- +blind on the orange/red axis loses the signal entirely. Worse: a rule +where every condition is `[?]` shows up as `did not match` exactly like a +rule that genuinely failed — the rule-card pill is identical. The +falsifiable risk is that operators will treat `[?]`-only rules as +*passes*, when they're actually un-evaluable. + +## 4. 
Alphabetical action list reads as a priority list + +`engine.ts:203` sorts the de-duplicated action set with `.sort()`. The +report then renders them as a top-down `<ul>`. Western readers read top- +to-bottom as importance-descending — that is the convention +issue-trackers, todo lists, and email clients all reinforce. So a list +like + +``` +- advance-to-testing +- clear-blockers +- fix-failing-tests +``` + +silently elevates `advance-to-testing` over `clear-blockers` on the page. +The engine doesn't think there's a priority — *but the visual ordering +asserts one*. Either sort by contributing-rule priority desc, or annotate +the list with a leading "(unordered)" caption. Today's choice is the worst +of both worlds: deterministic *and* misleading. + +## 5. "verified" badge — trust beyond what was checked + +The provenance footer renders + +> `Prompt hash: 1a2b3c4d5e6f… <span class="badge--verified">verified</span>` + +when `extraction.__prompt_hash === recomputed`. The badge text is one +word. An auditor reading "Prompt hash: ... verified" will reasonably +infer: *the extraction has been verified*. What was actually verified is +*the binding between the extraction and the prompt inputs* — not that the +LLM's extraction was correct, not that the flag values reflect the +underlying files, not that the rules are calibrated. Recommend: rename +the badge to `prompt-bound` or `inputs-match`, and remove the green `#114a29` +colouring that visually echoes the `ready-to-progress` tile. Today the +green-on-green pairing is a trust-calibration trap. + +This compounds risk 1 from `research/14`: a `--skip-validate` run does +not stamp anything in the footer. A report with no `verified` badge is +indistinguishable from a report where validate was bypassed — silence +reads as either "validate not asked for" or "validate explicitly turned +off", and the reader can't tell which. + +## 6. 
`generatedAt` is tombstone-sized footer text + +`generatedAt` is rendered inside `<footer>` styled `font-size: 12px; +color: var(--muted)`. A reader who arrives via a Slack-pasted link, +artifact archive, or screenshot has no header-level "as of" tag. A +green verdict from October read in May looks identical to one from this +morning. Recommend: stamp the date *inside* the verdict tile ("Verdict as +of YYYY-MM-DD HH:MM") and, if older than N days, downgrade the colour or +render a "stale" pill. Without this, the report is the easiest of the +artifact set to mistake for fresh. + +## 7. No diff against previous run — POC-acceptable, but flag it + +Today's HTML is standalone — no link to the previous run, no summary of +what changed. For a POC this is defensible scope, but it has a specific +cost: the *most common* reader question after a verdict change ("what +changed?") cannot be answered from the artifact. Engine reproducibility +gives us the substrate (rulesetHash, flagsHash) — a one-line "since last +run: flagsHash changed, rulesetHash unchanged" footer line is one PR away +and would close the misread loop. Recommend documenting this as a known +P1 follow-up rather than a POC limitation, because the omission +*systematically* favours optimism on flip-flopping verdicts. + +## 8. Uncertainty leaks but is never named + +The report design said "verdicts are categorical, no confidence intervals". +That is correct for the verdict, but uncertainty *does* leak through two +channels the report under-signals: + +- `unknown` verdict (no rule fired with weight) +- `[?]` flag-missing rows + +Neither rolls up into a header-level "you have N un-evaluable conditions +across M rules" summary. A reader gets `ready-to-progress` with three +`[?]` rows buried in the audit trail and has no way to know that one +missing flag would have flipped them to `blocked`. 
Recommend rendering an +"evaluability" line under the verdict tile: `K of N rules fully evaluated; +M rules had missing inputs`. The information is already in the audit +trail; surfacing it is mechanical. + +## 9. Three concrete ways a busy reader gets misled + +1. **Blocker-by-absence flip**: `fixtures/blocked-missing-ears.json` + contains `s3_findings_count: 1` and no `s1_findings_count`/ + `blockers_count` flags for some hypothetical extraction. If the + extractor *omits* `s1_findings_count`, the `any-s1-finding-blocks` rule + does not fire (priority 200, but `flag missing in extraction`). With + `requirements_ears_coverage: 0.6` present, the verdict still lands + `blocked` — *for the wrong reason*. Reader fixes EARS coverage, re-runs, + gets green, and ships with an unchecked S1 gate. The report does not + warn that a higher-priority rule was un-evaluable. + +2. **Red tile, ten green rule cards, one quiet red**: severity-first + means a `Blocked` weight-80 rule beats `100 + 100 + 60` green points. A + skimmer who scans the colour-bar row from `ready-implementation.json` + sees a wall of green rule cards (most matched) and one red, mis-reads + the page as "ready, minor issue", and overrides the verdict because + "the tally clearly leans green". + +3. **"verified" === "trusted"**: an auditor opening the provenance + section sees `Prompt hash: ab12cd34… verified` and records "extraction + verified" in the compliance log. What was verified was input-binding, + not content. The badge wording invites the misread; the green styling + confirms it. + +## 10. Three Riskiest Assumption Tests for report readability + +Each is designed so it *can* fail; default to no-go if any fails. + +- **RAT-R1 — Verdict-tile alone**: show 6 reviewers only the cropped + verdict tile from a `ready-to-progress` report where two `[?]` rows + exist deeper in the audit trail. Ask: "is this feature safe to advance?" + Falsification: 4 of 6 say "yes" without asking what wasn't evaluated. 
+ If we fail this, the tile must show evaluability before the label. + +- **RAT-R2 — Action list as priority**: show 6 reviewers the report and + ask: "you can do exactly one of these actions today. Which?" Falsification: + 4 of 6 pick the alphabetically-first action when the contributing rules + imply a different priority. If we fail this, sort by priority desc and + add a caption. + +- **RAT-R3 — "Verified" interpretation**: show 6 reviewers (ideally + including one auditor profile) the provenance footer in isolation. Ask + in their own words: "what does the 'verified' badge mean?" Falsification: + 3 of 6 say anything stronger than "the extraction matches the prompt + inputs" — e.g., "the extraction is correct", "the rules passed", + "compliance check passed". If we fail, rename and recolour the badge + before the next POC iteration. + +## Recommendation to the facilitator + +The report is a competent first cut but **its visual hierarchy contradicts +its semantic model**: severity-first verdict is invisible in the tally, the +"verified" badge over-claims, missing-flag rows are too easy to dismiss, +and the alphabetical action list invents a priority the engine deliberately +withholds. None of these are engine defects. All of them ship as +misreads. Default verdict on the report-as-artifact: **needs-attention**, +gating on RAT-R1 and RAT-R3 before any external demo. From 05e430aa10517f41123d010f41f79ff54ed16d20 Mon Sep 17 00:00:00 2001 From: Claude <noreply@anthropic.com> Date: Sun, 17 May 2026 13:02:01 +0000 Subject: [PATCH 28/45] docs(rule-engine-poc): research wave 4 auditor-readability (research/20) Analyst pass on the HTML report from a regulator's reading perspective + 2026 benchmark against LangSmith / Inspect / W&B Weave / sklearn / model-card conventions. Closes the open item from research/02 (human-readable rationale presentation). 
--- .../research/20-report-auditor-readability.md | 130 ++++++++++++++++++ 1 file changed, 130 insertions(+) create mode 100644 experiments/rule-engine-poc/research/20-report-auditor-readability.md diff --git a/experiments/rule-engine-poc/research/20-report-auditor-readability.md b/experiments/rule-engine-poc/research/20-report-auditor-readability.md new file mode 100644 index 000000000..5b566fa32 --- /dev/null +++ b/experiments/rule-engine-poc/research/20-report-auditor-readability.md @@ -0,0 +1,130 @@ +# 20 — Report Auditor-Readability + +**Scope:** Closes the open thread from `02-regulatory-auditability.md` — "the audit trail ticks the boxes natively, but human-readable rationale presentation is open." This pass walks a regulator through the rendered HTML report (renderer at `src/html-report.ts`; samples at `research/sample-reports/{blocked-missing-ears, needs-attention-design-risks, ready-implementation}.html`) and benchmarks the shape against the AI-eval tools an auditor in 2026 is likely to have seen. + +**Anchoring quote (EU AI Act, Art. 13):** *"sufficiently transparent to enable deployers to interpret a system's output and use it appropriately."* + +--- + +## 1. Auditor reading path (the 30-second test) + +Cold-open a sample report. Eye-tracking follows the document order, which is also the visual-weight order: + +1. **Verdict tile** (`Blocked` / `Needs attention` / `Ready to progress`) — large, colour-coded, top-of-fold. Confirms outcome instantly. +2. **Stats line** (`1 of 21 rules matched · 1 suggested action`) — immediately quantifies *how* contested the decision is. +3. **Weighted tally** + **Suggested actions** side-by-side — answers "what is the system telling me to *do*?" +4. **Extraction flags table** — answers "what did the model think it saw?" +5. **Audit trail** of every rule (matched + skipped) with `[+]/[-]/[?]` glyphs on each condition — the *why*. +6. **Provenance** block — engine version + ruleset hash + flags hash + file paths. 
+ +The 30-second test passes: a regulator who has never seen the codebase can name the verdict, the firing rule, the rule's plain-English description, the observed flag value, and the suggested remediation, without leaving the verdict card + audit-trail header pair. That is materially better than a raw `EvaluationResult` dictionary (HF `evaluate`) or a JSON log file (Inspect raw). The `[+]/[-]/[?]` ternary — match / mismatch / *missing in extraction* — also makes the LLM-extractor failure mode visually distinct from a domain mismatch, which is itself an Art. 14 oversight aid. + +What the 30-second test *fails* on: the report does not name **what kind of system this is**, who built it, what version of the workflow it governs, or what the verdict is binding against. A regulator landing on `blocked-missing-ears.html` has to infer from `feature_slug: "auth-refresh"` and `current_stage: "requirements"` that this is a Specorator workflow gate, not a customer-facing AI decision. + +## 2. EU AI Act Article 13 walkthrough + +[Article 13](https://artificialintelligenceact.eu/article/13/) requires instructions for use covering provider identity, characteristics/capabilities/limitations, expected lifetime, technical and non-technical specifications, computational resources, log-interpretation guidance, and human-oversight measures. Mapped against the rendered HTML: + +| Art. 13 element | Present in report? | Where / gap | +|---|---|---| +| Provider identity + contact | **No** | `experiments/rule-engine-poc` footer only; no provider, no contact, no version of the *workflow* it scores. | +| Capabilities | **Partially** | Implicit in the rule descriptions. No top-level "what this engine decides / does not decide" sentence. | +| Limitations | **No** | The non-deterministic LLM-extraction boundary is invisible. A regulator cannot tell that `requirements_ears_coverage: 0.6` is an LLM judgement, not a measurement. 
| +| Expected lifetime / maintenance | **No** | `Engine version: 0.2.0` is shown; semver policy, support window, and re-evaluation cadence are not. | +| Technical specs | **Partial** | Ruleset hash + flags hash present; LLM provider/model/prompt template + temperature not surfaced (despite the `__prompt_hash` plumbing in the renderer). | +| Non-technical specs | **No** | No glossary of `Blocked` vs `Needs attention` vs `Ready to progress` — a deployer must guess at semantics. | +| Computational resources | **N/A** | Reasonable to omit for a CLI; would matter for hosted deployment. | +| Log-interpretation guidance | **Partial** | Section labels are clear; ordering ("deterministic order: priority desc, id asc") is a nice touch. No legend for `[+]/[-]/[?]` glyphs or the "weight" column. | +| Human oversight measures | **No** | No reviewer field, no override field, no "two-person" indicator. Sample reports show a pure machine verdict. | + +The Art. 13 gap is concentrated on **framing**: the data is in the trail; the labels that let a regulator *recognise* the data as Art. 13-compliant are not. + +## 3. NIST AI RMF "Measure" outputs + +NIST AI 100-1's *Measure* function asks for "rigorous software testing and performance assessment methodologies with associated measures of uncertainty, comparisons to performance benchmarks, and formalized reporting and documentation of results" ([NIST AI 100-1](https://nvlpubs.nist.gov/nistpubs/ai/nist.ai.100-1.pdf)). The report is doing *Measure-on-a-case-basis* — a per-decision evaluation artifact, not a population-level benchmark. + +What it does well: deterministic re-run, named scorers (rules), priority weighting, structured pass/skip per scorer. This is the same shape as Inspect's `Scoring` tab (input + target + extracted + explanation per sample) — and the rule-engine report's per-rule `<article>` is functionally that, but for rules instead of LLM-judge scorers. 
+ +What is missing for NIST shape: +- **Uncertainty.** Every flag is a point estimate. NIST Measure wants confidence intervals or at least a stated calibration. The report has no place to put a flag-confidence number even if the extractor produced one. +- **Benchmark comparison.** No "vs. previous run" or "vs. baseline". HF `evaluate` and Weave both lead with comparison tiles; the rule-engine report is single-shot. +- **Aggregate trustworthy characteristics.** NIST names seven (valid, safe, secure, accountable, explainable, privacy, fair). The report scores against *workflow gates*, not these characteristics. Acceptable for an internal DoD gate; insufficient if reused as the substantive Measure artifact. + +## 4. ISO/IEC 42001 record-keeping (re-readability in N years) + +The provenance block holds three hashes (`engineVersion`, `rulesetHash`, `flagsHash`), the rules path, the flags path, and a generation timestamp. ISO/IEC 42001 traceability asks: in 3-5 years, can a successor re-read this and reconstruct what happened? + +Strengths: hashes are content-addressable; ruleset hash plus rules-file path is sufficient to fetch the exact YAML from VCS; engine version pins behaviour. The footer's promise — "same inputs → identical report, byte-for-byte (except this timestamp)" — is the right re-readability statement. + +Weaknesses: +- No **resolvable URL** for the ruleset (path alone won't survive a repo rename / org move). +- No **schema version** for the report itself — a future reader cannot tell whether the absence of e.g. a `confidence` field is "we didn't measure" vs "this report format predates the field". +- No **LLM-side provenance** unless `ctx.promptHash` is passed (the renderer supports it conditionally; the three committed samples do not include it). +- The rule descriptions are present but the **rule bodies are not** — see §6. + +## 5. Benchmark vs current AI-eval tools (2026) + +Verified-only claims; speculative entries are marked. 
+ +| Tool | Headline verdict surface | Per-item breakdown | Provenance | Notable property the rule-engine report lacks | +|---|---|---|---|---| +| **LangSmith** | Side-by-side comparison dashboards across experiments / prompt versions / models ([docs](https://docs.langchain.com/langsmith/evaluation)) | Run-level scores per evaluator; LLM-as-judge, heuristic, human, pairwise | Trace tree per run; "LangSmith Engine" suggests fixes on failures (2026) | Multi-run comparison; trace tree | +| **Inspect (UK AISI)** | Task summary + per-sample list; filterable by score status ([log-viewer](https://inspect.aisi.org.uk/log-viewer.html)) | Per-sample tabs: Messages / Scoring / Metadata; default grading `C/I/P/N` mapped to `1/0/0.5/0` ([scorers](https://inspect.aisi.org.uk/scorers.html)) | Git revision, token usage, model + tool calls per sample | Conversational trace; scorer `answer` + `explanation` fields next to each score | +| **W&B Weave** | Evaluation tile linked to Model Registry; combines dataset + scorers; side-by-side eval comparison ([docs](https://docs.wandb.ai/weave)) | Per-scorer aggregate; Weave links eval result → registered model version for "durable audit trail" | Tied to the registered model version | Eval-to-model-version binding | +| **HF `evaluate`** | Metric dict, e.g. 
`{'accuracy': 0.934}`; optionally with `confidence_interval` + `standard_error` ([docs](https://huggingface.co/docs/evaluate/a_quick_tour)) | Per-metric value; `EvaluationSuite` returns list of per-task dicts | `push_to_hub` writes results into a model card | Bootstrapped confidence intervals; standardised model-card section | +| **Anthropic transparency** | Transparency Hub (Feb 2026) consolidates model card + system card + safeguards + release notes ([hub](https://www.anthropic.com/transparency)); Claude Opus 4.6 system card is 244 pages ([overview](https://en.cryptonomist.ch/2026/04/12/claude-mythos-system-card/)) — *page count claim is third-party-reported, treat as speculative until cross-verified* | Capability section is comparative across benchmarks; safeguards co-located | Single canonical reference path for a model lifecycle | Capability/limitation framing co-located with the verdict | + +Shape-wise, the rule-engine report is closest to **Inspect's per-sample card** (status pill + scorer rationale + metadata badges) and to **sklearn's `classification_report`** (per-class precision / recall / F1 rows + aggregate averages). Where it differs: there is no comparator (no "vs. last run", no "vs. another ruleset"), no model-card-style framing block at the top, and no scorer-confidence column. + +## 6. The "what's-not-here" check — rule bodies + +Today the audit trail shows `rule.id`, `rule.description`, `rule.hash` (12 chars), `rule.priority`, `rule.stage`, `rule.tags`, the evaluated conditions with their observed values, and the contribution. The reader can see *what* matched but **cannot read the rule's actual YAML body** — only an English paraphrase plus the condition summary line. + +This is fine for an engineer with the repo. It is **not fine** for a regulator who needs to verify that the English description is faithful to the executable rule. Two options: + +- **Inline the rule body** (collapsed `<details>` block with the YAML). Maximises self-containment; bloats the file. 
+- **Deep-link to the rule** at the pinned commit (e.g. `https://<repo>/blob/<commit>/rules/quality-gates.yaml#L<line>` resolved from the ruleset hash). Keeps the report compact; depends on the repo staying reachable. + +Hybrid recommendation: inline the YAML for the *matched* rules (typically 1-3 per report) and link-out for the skipped majority. That preserves the 30-second test while making the matched chain fully self-describing. + +## 7. Replayability — discoverable from the report alone? + +The three hashes plus the two file paths *are* a complete replay manifest. The footer hints at it ("same inputs → identical report, byte-for-byte"). But a regulator who has not read `02-regulatory-auditability.md` cannot tell: + +- That `Ruleset hash` is a SHA-256 of canonicalised YAML (not, say, a git SHA). +- That feeding the same `flagsHash`-producing JSON back into engine `0.2.0` reproduces the verdict. +- That the `__prompt_hash` "verified" badge means the extraction was produced against the same source the rules were evaluated against. + +Replayability is plumbed; replayability is not *advertised*. A short labelled "How to reproduce this verdict" block immediately under Provenance would close that gap with under 200 bytes of HTML. + +## 8. Concrete recommendations (ranked by leverage) + +1. **Add a system-identity header** above the verdict tile: provider, system name, system version, workflow this verdict governs, deployer-facing one-liner of what `Ready to progress` means. *(EU AI Act Art. 13 — provider identity, capabilities, non-technical specs)* +2. **Inline the YAML body of every matched rule** (collapsed `<details>`); link-out for skipped rules at the pinned ruleset URL. *(ISO/IEC 42001 traceability; Art. 13 capabilities/limitations)* +3. **Add a "Limitations & confidence" panel** that explicitly names the LLM-extractor as the non-deterministic boundary, lists which flags are LLM-derived vs measured, and shows per-flag confidence where available. *(Art. 
13 limitations; NIST Measure uncertainty)* +4. **Add a "How to reproduce" block** under Provenance: literal command line that consumes the three hashes + the rules and flags paths, plus an explanation of what each hash covers. *(ISO/IEC 42001 record-keeping; Art. 12 logs)* +5. **Add a glossary / legend section**: define `Blocked` / `Needs attention` / `Ready to progress` / `Unknown`, explain the `[+]/[-]/[?]` glyphs, and explain "weight". *(Art. 13 log-interpretation guidance)* +6. **Add a human-oversight slot** (reviewer ID, decision, rationale, timestamp; empty placeholder when not reviewed) so the same template carries both the machine and the human verdict. *(Art. 14; ISO/IEC 42001 human-in-the-loop control)* +7. **Surface LLM provenance** (provider, model ID, model version, prompt template ID, prompt hash with verified/stale badge) whenever the extraction supplies it, instead of only conditionally on `ctx.promptHash`. *(Art. 12 model-invocation log; Model Cards convention)* +8. **Add a report-format `schemaVersion` field** in Provenance so the artifact is forward-compatible. *(ISO/IEC 42001 re-readability; future-proofing)* + +Recommendations 1, 2, and 5 give the largest readability-per-byte gain. 3 and 7 close the "LLM boundary is invisible" gap. 4 and 8 are cheap and unblock re-audit years later. 6 is structural — empty placeholders today, populated when a project-manager / reviewer track wires in. 
+ +--- + +## Sources + +- [EU AI Act Article 13 — Transparency and Provision of Information to Deployers](https://artificialintelligenceact.eu/article/13/) +- [EU AI Act Article 12 — Record-Keeping](https://artificialintelligenceact.eu/article/12/) +- [EU AI Act Article 14 — Human Oversight](https://artificialintelligenceact.eu/article/14/) +- [NIST AI 100-1 (AI RMF 1.0) PDF](https://nvlpubs.nist.gov/nistpubs/ai/nist.ai.100-1.pdf) +- [NIST AI RMF Playbook](https://www.nist.gov/itl/ai-risk-management-framework/nist-ai-rmf-playbook) +- [ISO/IEC 42001:2023 — AI Management Systems](https://www.iso.org/standard/42001) +- [LangSmith Evaluation docs](https://docs.langchain.com/langsmith/evaluation) +- [Inspect AI — Log Viewer](https://inspect.aisi.org.uk/log-viewer.html) +- [Inspect AI — Scorers](https://inspect.aisi.org.uk/scorers.html) +- [W&B Weave documentation](https://docs.wandb.ai/weave) +- [Hugging Face Evaluate — A quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) +- [Anthropic Transparency Hub](https://www.anthropic.com/transparency) +- [Anthropic Transparency Hub launch summary](https://insights.marvin-42.com/articles/anthropic-launches-transparency-hub-to-centralize-model-cards-safeguards-and-release-notes) +- [Claude Mythos / Opus 4.6 system card overview — third-party report, page-count claim treated as speculative](https://en.cryptonomist.ch/2026/04/12/claude-mythos-system-card/) From 826e749c356fb0b5a700b134a2f1050f82e78535 Mon Sep 17 00:00:00 2001 From: Claude <noreply@anthropic.com> Date: Sun, 17 May 2026 13:06:30 +0000 Subject: [PATCH 29/45] feat(rule-engine-poc): action glossary scaffold (agent A partial) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New sidecar mapping action slugs to imperative human sentences so the HTML report can render readable guidance instead of bare slugs. 
- rules/action-glossary.yaml: 28 entries covering every action used in rules/quality-gates.yaml, with optional urgency + category metadata. Imperative voice per Specorator brand. - src/action-glossary.ts: loader + diff-coverage helper, mirroring src/flag-schema.ts conventions. Wiring into config.ts and the HTML renderer happens in subsequent commits when Agents A2/B finish their slices. 123/123 tests still passing — no behaviour change yet. --- .../rules/action-glossary.yaml | 139 ++++++++++++++++++ .../rule-engine-poc/src/action-glossary.ts | 71 +++++++++ 2 files changed, 210 insertions(+) create mode 100644 experiments/rule-engine-poc/rules/action-glossary.yaml create mode 100644 experiments/rule-engine-poc/src/action-glossary.ts diff --git a/experiments/rule-engine-poc/rules/action-glossary.yaml b/experiments/rule-engine-poc/rules/action-glossary.yaml new file mode 100644 index 000000000..3d764df66 --- /dev/null +++ b/experiments/rule-engine-poc/rules/action-glossary.yaml @@ -0,0 +1,139 @@ +# Action glossary for the quality-gates rule set. +# +# Every action slug emitted by rules/quality-gates.yaml maps to a +# human-readable sentence written in imperative second-person voice +# (Specorator brand). The HTML report renders these sentences instead +# of bare slug chips so reviewers don't have to guess what each chip +# means. +# +# This is a SIDECAR file: the rule engine does not read it. The HTML +# renderer consumes it when present; if a slug is missing here the +# renderer falls back to the bare slug. +# +# Format per action: +# <action-slug>: +# human: imperative sentence telling the reader what to do next. +# urgency: high | medium | low (optional) +# category: ci | review | qa | governance | dod | traceability | ... +# (optional) + +# --- Cross-cutting severity / blockers -------------------------------- + +triage-s1-finding: + human: "Triage the open S1 finding; resolve or downgrade before progressing." 
+ urgency: high + category: qa + +schedule-s2-fix: + human: "Schedule a fix for the open S2 finding this sprint." + urgency: medium + category: qa + +resolve-clarifications: + human: "Resolve every open clarification before progressing the stage." + urgency: high + category: governance + +clear-blockers: + human: "Clear the open blockers tagged against the current stage." + urgency: high + category: governance + +# --- Stage: Idea ------------------------------------------------------ + +draft-problem-statement: + human: "Draft a one-paragraph problem statement a non-expert can read." + urgency: high + category: dod + +tighten-idea-scope: + human: "Tighten the idea scope; drop \"boil the ocean\" framing." + urgency: medium + category: dod + +advance-to-research: + human: "Advance the feature to /spec:research; the idea DoD is satisfied." + urgency: low + category: dod + +# --- Stage: Requirements ---------------------------------------------- + +rewrite-non-ears-requirements: + human: "Rewrite non-EARS requirements in EARS notation before progressing." + urgency: high + category: dod + +assign-requirement-ids: + human: "Assign a stable REQ-<AREA>-NNN id to every functional requirement." + urgency: high + category: traceability + +rewrite-acceptance-criteria: + human: "Rewrite acceptance criteria so a test can verify each one." + urgency: medium + category: dod + +# --- Stage: Design ---------------------------------------------------- + +file-missing-adrs: + human: "File an ADR for each irreversible architectural decision." + urgency: high + category: governance + +propose-risk-mitigations: + human: "Propose a mitigation for each identified design risk." + urgency: medium + category: dod + +# --- Stage: Specification --------------------------------------------- + +link-spec-items-to-requirements: + human: "Link each spec item to at least one requirement id." 
+ urgency: high + category: traceability + +# --- Stage: Implementation -------------------------------------------- + +fix-lint-errors: + human: "Fix the outstanding lint errors on the changed surface." + urgency: high + category: dod + +fix-type-errors: + human: "Fix the outstanding TypeScript / type errors on the changed surface." + urgency: high + category: dod + +fix-failing-tests: + human: "Fix the failing unit tests for the changed surface." + urgency: high + category: dod + +advance-to-testing: + human: "Advance the feature to /spec:test; the implementation DoD is satisfied." + urgency: low + category: dod + +# --- Stage: Testing --------------------------------------------------- + +add-missing-tests: + human: "Add tests for the EARS clauses that currently have none." + urgency: high + category: traceability + +cover-critical-paths: + human: "Cover the critical paths — happy path plus key edge cases — with tests." + urgency: high + category: dod + +# --- Stage: Review ---------------------------------------------------- + +regenerate-traceability: + human: "Regenerate the traceability matrix; close every dangling chain." + urgency: high + category: traceability + +request-brand-review: + human: "Request a brand review; the diff touches user-visible surfaces." 
+ urgency: high + category: review diff --git a/experiments/rule-engine-poc/src/action-glossary.ts b/experiments/rule-engine-poc/src/action-glossary.ts new file mode 100644 index 000000000..88c0848da --- /dev/null +++ b/experiments/rule-engine-poc/src/action-glossary.ts @@ -0,0 +1,71 @@ +import { readFileSync } from "node:fs"; +import { resolve } from "node:path"; +import yaml from "js-yaml"; + +export type ActionUrgency = "high" | "medium" | "low"; + +export interface ActionGlossaryEntry { + human: string; + urgency?: ActionUrgency; + category?: string; +} + +export type ActionGlossary = Record<string, ActionGlossaryEntry>; + +const VALID_URGENCIES: readonly ActionUrgency[] = ["high", "medium", "low"] as const; + +export function loadActionGlossary(filePath: string): ActionGlossary { + const abs = resolve(filePath); + const raw = readFileSync(abs, "utf8"); + const parsed = yaml.load(raw); + if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) { + throw new Error(`Action glossary ${filePath} must be a YAML mapping`); + } + const out: ActionGlossary = {}; + for (const [slug, entry] of Object.entries(parsed)) { + if (!entry || typeof entry !== "object") { + throw new Error(`Action '${slug}' must be an object`); + } + const e = entry as Record<string, unknown>; + if (typeof e.human !== "string" || e.human.length === 0) { + throw new Error(`Action '${slug}' missing human sentence`); + } + if (e.urgency !== undefined && !VALID_URGENCIES.includes(e.urgency as ActionUrgency)) { + throw new Error( + `Action '${slug}' has unknown urgency '${String(e.urgency)}'. Expected one of: ${VALID_URGENCIES.join(", ")}`, + ); + } + if (e.category !== undefined && typeof e.category !== "string") { + throw new Error(`Action '${slug}' category must be a string when present`); + } + out[slug] = { + human: e.human, + urgency: e.urgency === undefined ? undefined : (e.urgency as ActionUrgency), + category: e.category === undefined ? 
undefined : (e.category as string), + }; + } + return out; +} + +export interface ActionCoverage { + ruleActions: string[]; + glossaryActions: string[]; + undocumented: string[]; + unused: string[]; +} + +export function diffActionCoverage( + glossary: ActionGlossary, + actionSlugs: Iterable<string>, +): ActionCoverage { + const ruleSet = new Set(actionSlugs); + const glossarySet = new Set(Object.keys(glossary)); + const undocumented = [...ruleSet].filter((s) => !glossarySet.has(s)).sort(); + const unused = [...glossarySet].filter((s) => !ruleSet.has(s)).sort(); + return { + ruleActions: [...ruleSet].sort(), + glossaryActions: [...glossarySet].sort(), + undocumented, + unused, + }; +} From 28c84e99d21a40f2ac728f439f9fd077ffd493cf Mon Sep 17 00:00:00 2001 From: Claude <noreply@anthropic.com> Date: Sun, 17 May 2026 13:08:19 +0000 Subject: [PATCH 30/45] feat(rule-engine-poc): wire action glossary into config + docs (agent A complete) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Agent A's RALPH loop completed. Action glossary is now reachable via the config (still optional — no behaviour change for callers that don't set actionGlossary): - src/config.ts: optional 'actionGlossary' string field on RawConfig resolved to 'actionGlossaryPath' on ResolvedConfig, same pattern as flagSchema. - rule-engine.config.json: points at rules/action-glossary.yaml. - test/action-glossary.test.ts: 12 tests covering loader validation, diff-coverage, real-file coverage of rules/quality-gates.yaml, and sentence-shape invariants. Also picks up Agent C's in-flight architecture.md updates: system overview + data flow diagrams now show the glossary node (read only by the renderer) and the new HTML report sections (system-identity header, what fired, reproduce block, audit trail with non-matched collapsed). The renderer itself (Agent B) is still in flight. Suite: 135/135 passing (123 baseline + 12 new). 
21 unique action slugs in rules/quality-gates.yaml — all mapped in the glossary. --- .../rule-engine-poc/docs/architecture.md | 19 +- .../rule-engine-poc/rule-engine.config.json | 1 + experiments/rule-engine-poc/src/config.ts | 14 ++ .../test/action-glossary.test.ts | 165 ++++++++++++++++++ 4 files changed, 196 insertions(+), 3 deletions(-) create mode 100644 experiments/rule-engine-poc/test/action-glossary.test.ts diff --git a/experiments/rule-engine-poc/docs/architecture.md b/experiments/rule-engine-poc/docs/architecture.md index ddba61882..d6c7929f6 100644 --- a/experiments/rule-engine-poc/docs/architecture.md +++ b/experiments/rule-engine-poc/docs/architecture.md @@ -32,6 +32,7 @@ flowchart LR CFG["rule-engine.config.json"] RULES["rules/quality-gates.yaml"] SCHEMA["rules/flag-schema.yaml"] + GLOSS["rules/action-glossary.yaml<br/>(optional sidecar)"] SRC["target source files<br/>(e.g. specs/<slug>/)"] end @@ -69,6 +70,7 @@ flowchart LR CFG --> REP RULES --> REP SCHEMA --> REP + GLOSS --> REP SRC --> REP EXT --> REP REP --> RPT @@ -78,6 +80,8 @@ flowchart LR EXT --> ONE ``` +`rules/action-glossary.yaml` is read only by the renderer; the engine never sees it. A missing glossary file is not an error — the renderer falls back to bare action slugs. + Solid arrows are program reads/writes. Dashed arrows are **user actions**: the only manual steps in the loop. --- @@ -155,11 +159,12 @@ flowchart TB FLAGS --> ENG ENG --> VRES["VerdictResult<br/>{ verdict, weightedTally,<br/>actions, evaluations,<br/>rulesetHash, flagsHash,<br/>engineVersion }"] - VRES --> HTML["reports/<id>.html<br/>(self-contained, inline CSS)"] + GLOSS["action-glossary.yaml<br/>(optional sidecar)"] --> HTML + VRES --> HTML["reports/<id>.html<br/>system-identity header,<br/>verdict banner + tier glossary,<br/>what fired, reproduce block,<br/>audit trail (non-matched collapsed)"] VRES -. 
--json .-> J["stdout JSON"] ``` -The three replay anchors at the bottom (`engineVersion`, `rulesetHash`, `flagsHash`) are what makes any verdict reproducible. Plus a fourth: the recomputed `promptHash` ties the extraction back to the source it was produced against. +The three replay anchors at the bottom (`engineVersion`, `rulesetHash`, `flagsHash`) are what makes any verdict reproducible. Plus a fourth: the recomputed `promptHash` ties the extraction back to the source it was produced against. The renderer also surfaces these in the HTML report's reproduce block so a reader can re-run the exact tuple from the page. --- @@ -300,7 +305,7 @@ Stochasticity is **confined to Orient**. Observe is mechanical, Decide is reprod ## 7. Module dependency graph -`src/` has 16 modules. Pure-data and CLI layers are kept separate; the engine has no I/O dependencies. +`src/` modules are grouped by layer. Pure-data and CLI layers are kept separate; the engine has no I/O dependencies. ```mermaid flowchart TB @@ -331,6 +336,7 @@ flowchart TB subgraph RENDER["Render + I/O"] HTML["html-report.ts"] + GLOSS["action-glossary.ts"] BROW["open-browser.ts"] end @@ -352,10 +358,13 @@ flowchart TB ENG --> REPORT & ONE VAL --> REPORT & VALCLI HTML --> REPORT + GLOSS --> REPORT BROW --> REPORT LOAD --> PLAN & VALCLI & REPORT & ONE ``` +`action-glossary.ts` is a render-time dependency only: `report.ts` loads it when `config.actionGlossary` resolves, and passes the map to `html-report.ts`. The engine core never sees it, so action-sentence edits cannot change a verdict. + The engine core (`engine.ts` + `loader.ts` + `types.ts` + `hash.ts`) has zero I/O. It can be embedded in any environment that supplies an `ExtractionFlags` object. The CLI layer is intentionally **shallow glue** — the engine is the load-bearing module. --- @@ -373,5 +382,9 @@ A few design choices that emerged from research and review rounds. 
Each is docum | Forbidden field names in the prompt | Naming the failure mode explicitly is more reliable than schema-only validation; the validate gate also checks. | [`research/10`](../research/10-extraction-prompt-patterns.md) | | Stay hand-rolled (don't adopt json-rules-engine) | Every candidate fails ≥3 of our 6 locked constraints; the wrapper would be the engine. Re-evaluate at graduation. | [`research/11`](../research/11-rule-engine-adoption-revisit.md) | | Severity tiers + `unknown` default | Verdicts are categorical tiers; `unknown` is the honest answer when no rule fires. | [`dsl-reference.md`](dsl-reference.md) | +| Action glossary as a render-only sidecar (`rules/action-glossary.yaml`) | Human sentences for action slugs are presentation, not policy — keeping them out of the engine means copy changes cannot move a verdict, and a missing glossary is a soft fallback to bare slugs. | [`extending.md`](extending.md) | +| Audit trail collapses non-matched rules by default | The signal an auditor wants first is *what fired*; non-matched rules are background detail. A `<details>` toggle keeps the full trail one click away without burying the explanation. | [`audit-trail.md`](audit-trail.md) | +| System-identity header + reproduce block on the report | EU AI Act Art. 13 explainability requires a human-readable "why" surface, not just JSON. The header names the system + ruleset version; the reproduce block prints the exact command + hashes to re-run. | [`audit-trail.md`](audit-trail.md) | +| Trust calibration signals (`verified` badge, `--skip-validate` banner) | A report rendered without the validate gate is materially less trustworthy than one with it. Making that visible on the page itself — not just in CLI logs — closes the gap between what the operator ran and what the reader sees. 
| [`workflow.md`](workflow.md) | For deeper component-level details see [`workflow.md`](workflow.md) (operational), [`dsl-reference.md`](dsl-reference.md) (rule grammar), [`audit-trail.md`](audit-trail.md) (replay), and [`extending.md`](extending.md) (adding rules, flags, fixtures). diff --git a/experiments/rule-engine-poc/rule-engine.config.json b/experiments/rule-engine-poc/rule-engine.config.json index 70f68a9a1..e704d9872 100644 --- a/experiments/rule-engine-poc/rule-engine.config.json +++ b/experiments/rule-engine-poc/rule-engine.config.json @@ -1,6 +1,7 @@ { "rules": "rules/quality-gates.yaml", "flagSchema": "rules/flag-schema.yaml", + "actionGlossary": "rules/action-glossary.yaml", "promptsDir": "prompts", "extractionsDir": "extractions", "reportsDir": "reports", diff --git a/experiments/rule-engine-poc/src/config.ts b/experiments/rule-engine-poc/src/config.ts index 177378f86..8c37cf391 100644 --- a/experiments/rule-engine-poc/src/config.ts +++ b/experiments/rule-engine-poc/src/config.ts @@ -10,6 +10,7 @@ export interface Target { export interface RawConfig { rules: string; flagSchema: string; + actionGlossary?: string; promptsDir: string; extractionsDir: string; reportsDir: string; @@ -22,6 +23,7 @@ export interface ResolvedConfig extends RawConfig { configDir: string; rulesPath: string; flagSchemaPath: string; + actionGlossaryPath?: string; promptsDirPath: string; extractionsDirPath: string; reportsDirPath: string; @@ -46,6 +48,7 @@ export function loadConfig(configPath: string): ResolvedConfig { configDir, rulesPath: r(config.rules), flagSchemaPath: r(config.flagSchema), + actionGlossaryPath: config.actionGlossary ? 
r(config.actionGlossary) : undefined, promptsDirPath: r(config.promptsDir), extractionsDirPath: r(config.extractionsDir), reportsDirPath: r(config.reportsDir), @@ -128,9 +131,20 @@ function validate(raw: unknown, file: string): RawConfig { return { id: tr.id, label: tr.label, paths }; }); + let actionGlossary: string | undefined; + if (r.actionGlossary !== undefined) { + if (typeof r.actionGlossary !== "string" || (r.actionGlossary as string).length === 0) { + throw new Error( + `Config ${file} field 'actionGlossary' must be a non-empty string when present`, + ); + } + actionGlossary = r.actionGlossary as string; + } + return { rules: r.rules as string, flagSchema: r.flagSchema as string, + actionGlossary, promptsDir: r.promptsDir as string, extractionsDir: r.extractionsDir as string, reportsDir: r.reportsDir as string, diff --git a/experiments/rule-engine-poc/test/action-glossary.test.ts b/experiments/rule-engine-poc/test/action-glossary.test.ts new file mode 100644 index 000000000..6c5e1c12c --- /dev/null +++ b/experiments/rule-engine-poc/test/action-glossary.test.ts @@ -0,0 +1,165 @@ +import { describe, expect, it } from "vitest"; +import { mkdtempSync, readFileSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { dirname, join, resolve } from "node:path"; +import { fileURLToPath } from "node:url"; +import yaml from "js-yaml"; +import { + diffActionCoverage, + loadActionGlossary, + type ActionGlossary, +} from "../src/action-glossary.js"; + +const here = dirname(fileURLToPath(import.meta.url)); + +function writeYaml(body: string): string { + const dir = mkdtempSync(join(tmpdir(), "rep-poc-glossary-")); + const file = join(dir, "glossary.yaml"); + writeFileSync(file, body, "utf8"); + return file; +} + +describe("loadActionGlossary", () => { + it("loads a valid glossary with required fields only", () => { + const file = writeYaml(` +kick-ci: + human: "Re-run the failing CI job." +request-reviewer: + human: "Request a second reviewer." 
+`); + const g = loadActionGlossary(file); + expect(g["kick-ci"]!.human).toBe("Re-run the failing CI job."); + expect(g["kick-ci"]!.urgency).toBeUndefined(); + expect(g["kick-ci"]!.category).toBeUndefined(); + expect(g["request-reviewer"]!.human).toBe("Request a second reviewer."); + }); + + it("loads optional urgency and category", () => { + const file = writeYaml(` +kick-ci: + human: "Re-run the failing CI job." + urgency: high + category: ci +`); + const g = loadActionGlossary(file); + expect(g["kick-ci"]!.urgency).toBe("high"); + expect(g["kick-ci"]!.category).toBe("ci"); + }); + + it("rejects missing human sentence", () => { + const file = writeYaml(` +kick-ci: + urgency: high +`); + expect(() => loadActionGlossary(file)).toThrow(/missing human sentence/); + }); + + it("rejects empty human sentence", () => { + const file = writeYaml(` +kick-ci: + human: "" +`); + expect(() => loadActionGlossary(file)).toThrow(/missing human sentence/); + }); + + it("rejects unknown urgency", () => { + const file = writeYaml(` +kick-ci: + human: "Re-run the failing CI job." + urgency: critical +`); + expect(() => loadActionGlossary(file)).toThrow(/unknown urgency 'critical'/); + }); + + it("rejects non-string category", () => { + const file = writeYaml(` +kick-ci: + human: "Re-run the failing CI job." + category: 42 +`); + expect(() => loadActionGlossary(file)).toThrow(/category must be a string/); + }); + + it("rejects non-object entry", () => { + const file = writeYaml(` +kick-ci: "Re-run the failing CI job." +`); + expect(() => loadActionGlossary(file)).toThrow(/must be an object/); + }); + + it("rejects top-level array", () => { + const file = writeYaml(` +- kick-ci +- request-reviewer +`); + expect(() => loadActionGlossary(file)).toThrow(/must be a YAML mapping/); + }); +}); + +describe("diffActionCoverage", () => { + it("identifies undocumented and unused slugs", () => { + const glossary: ActionGlossary = { + "kick-ci": { human: "Re-run." 
}, + "request-reviewer": { human: "Ask." }, + }; + const diff = diffActionCoverage(glossary, ["kick-ci", "fix-lint"]); + expect(diff.undocumented).toEqual(["fix-lint"]); + expect(diff.unused).toEqual(["request-reviewer"]); + expect(diff.ruleActions).toEqual(["fix-lint", "kick-ci"]); + expect(diff.glossaryActions).toEqual(["kick-ci", "request-reviewer"]); + }); + + it("reports empty diffs when coverage is exact", () => { + const glossary: ActionGlossary = { + "kick-ci": { human: "Re-run." }, + }; + const diff = diffActionCoverage(glossary, ["kick-ci"]); + expect(diff.undocumented).toEqual([]); + expect(diff.unused).toEqual([]); + }); +}); + +describe("repo action-glossary.yaml", () => { + // Extract every action slug from rules/quality-gates.yaml. + function extractRuleActionSlugs(rulesPath: string): string[] { + const parsed = yaml.load(readFileSync(rulesPath, "utf8")); + if (!Array.isArray(parsed)) { + throw new Error(`Expected ${rulesPath} to be a YAML list of rules`); + } + const slugs = new Set<string>(); + for (const rule of parsed as Array<Record<string, unknown>>) { + const then = rule.then as Record<string, unknown> | undefined; + const actions = then?.actions; + if (Array.isArray(actions)) { + for (const a of actions) { + if (typeof a === "string") slugs.add(a); + } + } + } + return [...slugs].sort(); + } + + it("covers every action slug used by rules/quality-gates.yaml", () => { + const rulesPath = resolve(here, "../rules/quality-gates.yaml"); + const glossaryPath = resolve(here, "../rules/action-glossary.yaml"); + const ruleSlugs = extractRuleActionSlugs(rulesPath); + const glossary = loadActionGlossary(glossaryPath); + const diff = diffActionCoverage(glossary, ruleSlugs); + expect(diff.undocumented).toEqual([]); + // unused is allowed to be non-empty in principle, but for the POC we + // want the glossary to stay tight — flag any orphans. 
+ expect(diff.unused).toEqual([]); + }); + + it("uses imperative second-person sentences ending in a period", () => { + const glossaryPath = resolve(here, "../rules/action-glossary.yaml"); + const glossary = loadActionGlossary(glossaryPath); + for (const [slug, entry] of Object.entries(glossary)) { + expect(entry.human.length, `${slug} has empty human sentence`).toBeGreaterThan(0); + expect( + /[.!?]$/.test(entry.human), + `${slug} human sentence should end with terminal punctuation`, + ).toBe(true); + } + }); +}); From f054c1847d0a4aa8c076b99987a62608c5e61236 Mon Sep 17 00:00:00 2001 From: Claude <noreply@anthropic.com> Date: Sun, 17 May 2026 13:08:51 +0000 Subject: [PATCH 31/45] =?UTF-8?q?docs(rule-engine-poc):=20agent=20C=20docs?= =?UTF-8?q?=20sync=20=E2=80=94=20audit=20trail=20+=20workflow=20+=20extend?= =?UTF-8?q?ing?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - audit-trail.md: Mapping to EU AI Act table updated to credit the HTML report (what-fired with human sentences, system-identity header, tier glossary, reproduce block) as the Art. 13 human-readable rationale surface. Closes research/02 open item about explainability presentation. - workflow.md: still in flight by agent C — current commit picks up partial edits. - extending.md: new 'Authoring action human sentences' section documents rules/action-glossary.yaml as a render-only sidecar (engine never reads it; editing sentences cannot change a verdict). 
--- .../rule-engine-poc/docs/audit-trail.md | 9 +++-- experiments/rule-engine-poc/docs/extending.md | 19 +++++++++++ experiments/rule-engine-poc/docs/workflow.md | 33 ++++++++++++++++++- 3 files changed, 57 insertions(+), 4 deletions(-) diff --git a/experiments/rule-engine-poc/docs/audit-trail.md b/experiments/rule-engine-poc/docs/audit-trail.md index 746d5e919..587cc5f59 100644 --- a/experiments/rule-engine-poc/docs/audit-trail.md +++ b/experiments/rule-engine-poc/docs/audit-trail.md @@ -89,7 +89,7 @@ The per-rule `hash` lets you bisect a rule-edit diff without comparing the YAML ## Mapping to EU AI Act explainability -From `research/02-regulatory-auditability.md`, an auditable AI-derived decision under EU AI Act Articles 11–14 needs a per-decision dossier with the following parts. Our `VerdictResult` covers them as follows: +From `research/02-regulatory-auditability.md`, an auditable AI-derived decision under EU AI Act Articles 11–14 needs a per-decision dossier with the following parts. Our `VerdictResult` plus the HTML report cover them as follows: | Required artifact | Covered by | Status | |---|---|---| @@ -99,13 +99,16 @@ From `research/02-regulatory-auditability.md`, an auditable AI-derived decision | Flags with source spans | LLM extraction layer | **out of scope for the POC** — needs spans in extraction schema | | Ruleset version + hash | `rulesetHash` + per-rule `hash` | yes | | Ordered rule trace | `evaluations` in deterministic order | yes | -| Verdict + rationale | `verdict` + matched `conditions` | yes | +| Verdict + rationale | `verdict` + matched `conditions`, surfaced on the HTML report as the **what-fired section** with human sentences from `rules/action-glossary.yaml` | yes | +| Human-readable "why" presentation (Art. 
13) | HTML report: system-identity header, verdict-tier glossary + glyph legend, what-fired section, reproduce block | yes — closes research/02 open item §6.4 | | Confidence | n/a (verdicts are categorical) | by design | | Human-oversight record | Caller's responsibility (Act-phase gate) | **upstream** | -| Replay manifest | `engineVersion` + the two hashes | yes | +| Replay manifest | `engineVersion` + the two hashes, plus the on-page reproduce block that prints the exact `npm run report` invocation | yes | The pattern handles the **decide** half of the audit chain; the **observe / orient** half (provenance of the raw signals and the LLM extraction) is upstream and must be captured separately. +The HTML report is the explainability surface for non-technical readers: the JSON `VerdictResult` satisfies machine-readable audit, and the rendered report satisfies the Art. 13 requirement that a deployer can interpret the output without reading the trace. The reproduce block prints the three replay anchors (`engineVersion`, `rulesetHash`, `flagsHash`) plus the recomputed `promptHash` so a reader can re-run the exact verdict from the page itself. + ## What this does not give you - **No fairness audit.** Severity ordering and weights encode normative choices. They are visible (in YAML) but not yet reviewed. diff --git a/experiments/rule-engine-poc/docs/extending.md b/experiments/rule-engine-poc/docs/extending.md index 3910e4dc4..e6570ae69 100644 --- a/experiments/rule-engine-poc/docs/extending.md +++ b/experiments/rule-engine-poc/docs/extending.md @@ -38,6 +38,25 @@ The engine is domain-agnostic. The example rule set encodes our quality framewor Look for your rule id in the audit trail. The condition rows show the observed values, so it's obvious why a rule did or didn't match. +## Authoring action human sentences + +The HTML report renders an action slug (e.g. `fix-lint-errors`) as a human sentence (e.g. 
"Fix the outstanding lint errors on the changed surface.") when the slug is defined in `rules/action-glossary.yaml`. The glossary is a **render-only sidecar**: the engine never reads it, so editing a sentence cannot change a verdict. + +1. Open `rules/action-glossary.yaml`. +2. Append an entry keyed by the action slug exactly as it appears in `rules/quality-gates.yaml`: + + ```yaml + my-new-action: + human: "Do the thing the rule wants you to do." + urgency: high # optional: high | medium | low + category: dod # optional: ci | review | qa | governance | dod | traceability | ... + ``` + +3. Write `human` in **imperative second-person voice**, one sentence ending in a period. Match the existing entries' tone: concrete, action-leading, no marketing language. +4. If you omit a slug, the renderer falls back to the bare slug — so the report still works, it just looks like the old version for that action. Treat missing glossary entries as a small drift bug, not a render error. + +The glossary is not authoritative for rule behaviour. If you change what an action *does*, edit `rules/quality-gates.yaml`; if you change what an action *says*, edit `rules/action-glossary.yaml`. Keeping the two seams separate is what lets non-engineers tune the readout without ADR-grade scrutiny. + ## Add a new flag Flags are arbitrary key/value pairs on the input JSON. To use one: diff --git a/experiments/rule-engine-poc/docs/workflow.md b/experiments/rule-engine-poc/docs/workflow.md index d3338a182..7d443de21 100644 --- a/experiments/rule-engine-poc/docs/workflow.md +++ b/experiments/rule-engine-poc/docs/workflow.md @@ -125,7 +125,38 @@ If an extraction file is missing, the command prints a friendly error pointing a ## 5. Read the report -The HTML report renders the verdict prominently, lists the suggested actions, shows the weighted tally per tier, and prints the full audit trail in deterministic order. 
Provenance hashes at the bottom let you confirm the report came from a specific (engine version, rule set, flags) tuple. +The HTML report is laid out top-to-bottom in the order a reader needs the information: identity → verdict → what fired → why → how to reproduce → full trail. Skim it once with this map in mind. + +### What to look at first + +1. **System-identity header.** Names the system (`rule-engine-poc`), the engine version, and the ruleset content hash. If a colleague asks "which version produced this?", the answer is in the top six lines. +2. **Verdict banner.** Sentence-case tier name with the verdict glyph (block / warning / check / question mark). If the tier is `blocked` *and* the rule that fired keyed off a missing required flag, the banner carries a **blocker-by-absence** sub-line that names the absent flag — distinct from "we have evidence of a real failure". Both are blockers; the distinction tells you whether to re-run the extractor or fix the underlying signal. +3. **What fired.** A short list of the rules that actually contributed to the verdict, each with its human-sentence action (from `rules/action-glossary.yaml`). This is the reader's "why" view. If you only have thirty seconds, read this section and the banner; everything below is supporting detail. + +### What each section says + +| Section | What it answers | When you need it | +|---|---|---| +| Provenance preamble (top) | Was this report rendered with the validate gate on, against which extraction file, when? | First glance — establishes trust calibration before you read the verdict. | +| Verdict banner + tier glossary | What is the tier and what does it mean? | Every read. The glyph legend tells you what `[!]`, `[?]`, `[+]`, `[-]` mean in the trail. | +| What fired | Which rules contributed and what should I do about them? | Every read. Human sentences live here, not in the audit trail. 
| +| Suggested actions | The deduplicated action list with human sentences and urgency badges where the glossary supplies them. | When you're triaging which action to take first. | +| Weighted tally | How much weight piled up in each tier? | Diagnosing close calls or unexpected `needs-attention` results. | +| Extraction flags | What did the LLM actually emit? | Sanity-checking the Orient step; suspicious flag values surface here. | +| Audit trail | Every rule and every condition with observed values. | Deep dives. **Non-matched rules collapse by default** behind a `<details>` toggle — open it when you need to confirm a rule was considered but didn't fire. | +| Reproduce block | The exact `npm run report -- --target <id>` invocation plus the three hashes needed to replay. | Replay, archive, or sharing with someone who needs to re-run. | +| Provenance footer | The full hashes (`rulesetHash`, `flagsHash`, `engineVersion`, `promptHash`). | Audit dossiers — these are the four anchors from [`audit-trail.md`](audit-trail.md). | + +### Trust calibration cues + +The renderer surfaces two cues that tell a reader how much to trust what they're looking at: + +- **`verified` badge** next to the prompt hash. Means the extraction's self-declared `__prompt_hash` matched the recomputed value — the extraction was produced against the same source files you're now reading. Absent badge = stale or unverifiable; treat the report as advisory until re-extracted. +- **`--skip-validate` banner.** If `report` was invoked with `--skip-validate`, a high-contrast banner sits at the top of the report (in addition to the stderr warning). The validate gate did not run for this render, so the forbidden-fields check, the type check, and the prompt-hash binding were all bypassed. Never sign off on a `--skip-validate` report as if it were a gated one. + +### Mobile fallback + +The report's two-column summary grid (tally + actions) collapses to a single column under a narrow viewport. 
Section headers stay sentence-case so the page reads as prose, not a dashboard, when opened on a phone. See [`audit-trail.md`](audit-trail.md) for what's captured and how to replay. From def9b703a32dec7d1f4ad84f13a442743165d7c4 Mon Sep 17 00:00:00 2001 From: Claude <noreply@anthropic.com> Date: Sun, 17 May 2026 13:09:27 +0000 Subject: [PATCH 32/45] chore(rule-engine-poc): snapshot in-flight agent B + C work MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Agent B (HTML report rebuild) and Agent C (README sync) are still running their RALPH loops. This commit snapshots the current on-disk state so the working tree stays clean between iterations: - src/html-report.ts: partial changes from agent B (rebuild for research wave 4 findings). 135/135 tests still passing — the partial state is internally consistent even if not yet feature complete. - src/report.ts: corresponding plumbing changes from agent B. - README.md: agent C in-flight test-count + file-map sync. Will be superseded by the next commit when both agents report final. 
--- experiments/rule-engine-poc/README.md | 8 +- .../rule-engine-poc/src/html-report.ts | 321 ++++++++++++++---- experiments/rule-engine-poc/src/report.ts | 18 + 3 files changed, 272 insertions(+), 75 deletions(-) diff --git a/experiments/rule-engine-poc/README.md b/experiments/rule-engine-poc/README.md index 460bc59a0..31a1565e9 100644 --- a/experiments/rule-engine-poc/README.md +++ b/experiments/rule-engine-poc/README.md @@ -47,7 +47,7 @@ A feature folder's verdict becomes a function of named flags and named rules; no ```bash cd experiments/rule-engine-poc npm install -npm test # 100+ tests in <2s +npm test # 123 tests in <10s ``` ### Run the workflow against a real feature @@ -96,6 +96,7 @@ npm run demo:html:all # one HTML report per fixture into reports/ | `rule-engine.config.json` | Targets + paths + directory layout | | `rules/quality-gates.yaml` | Definition-of-Done-as-rules | | `rules/flag-schema.yaml` | Documentation of every flag (type, description, example) | +| `rules/action-glossary.yaml` | Render-only sidecar: maps each action slug to a human imperative sentence + optional urgency/category | | `src/types.ts` | Data contracts | | `src/hash.ts` | Canonical JSON + SHA-256 — the foundation of replayability | | `src/engine.ts` | Deterministic evaluation: severity-first verdict, weighted tally, audit trail | @@ -110,10 +111,11 @@ npm run demo:html:all # one HTML report per fixture into reports/ | `src/plan.ts` | `npm run plan` — generates prompts | | `src/report.ts` | `npm run report` — validates (unless `--skip-validate`), runs engine, renders HTML, opens browser | | `src/cli.ts` | Single-shot escape hatch for fixture-based testing | -| `src/html-report.ts` | Self-contained HTML renderer (inline CSS, no JS) | +| `src/html-report.ts` | Self-contained HTML renderer (inline CSS, no JS) — system-identity header, verdict banner + tier glossary, what-fired section, audit trail (non-matched rules collapsed), reproduce block | +| `src/action-glossary.ts` | Loader 
for `rules/action-glossary.yaml`; coverage diff (undocumented / unused slugs). Render-only — never touched by the engine | | `src/open-browser.ts` | Best-effort `xdg-open` / `open` / `start` | | `fixtures/*.json` | Mock extractions for the single-shot demos | -| `test/*.test.ts` | 100+ tests across engine, loader, hash, config, schema, context, prompt-builder, prompt-hash, validate | +| `test/*.test.ts` | 123 tests across engine, loader, hash, config, schema, context, prompt-builder, prompt-hash, validate, action glossary, report flow | | `docs/*.md` | Architecture, DSL reference, audit trail, workflow, extension guide, OODA integration | | `research/*.md` | 16 research artifacts spanning technical landscape, regulatory, positioning, design alternatives, risks, independent review, workflow risks, plan/report architecture, user flow, prompt patterns, adoption revisit, re-review at HEAD, strategy v2, new failure modes, CI operations, JTBD switch interviews | diff --git a/experiments/rule-engine-poc/src/html-report.ts b/experiments/rule-engine-poc/src/html-report.ts index 8fc3b4500..2021ebe93 100644 --- a/experiments/rule-engine-poc/src/html-report.ts +++ b/experiments/rule-engine-poc/src/html-report.ts @@ -1,6 +1,7 @@ -import type { VerdictResult, Verdict } from "./types.js"; +import type { ActionGlossary } from "./action-glossary.js"; +import type { Condition, RuleEvaluation, Verdict, VerdictResult } from "./types.js"; -interface RenderContext { +export interface RenderContext { rulesPath: string; flagsPath: string; flags: Record<string, unknown>; @@ -13,6 +14,18 @@ interface RenderContext { * "verified" badge. */ promptHash?: string; + /** + * Optional action glossary loaded from config. When provided, each + * action is rendered as the human sentence with the slug demoted to a + * trailing code chip. Absence is tolerated — fall back to slug-only. + */ + actionGlossary?: ActionGlossary; + /** + * Validation gate state for the current render. 
"skipped" surfaces a + * loud banner explaining that the verdict is unverified against the + * flag schema and forbidden-fields policy. + */ + validationStatus?: "validated" | "skipped" | "stale"; } const VERDICT_PALETTE: Record< @@ -45,6 +58,17 @@ const VERDICT_PALETTE: Record< }, }; +const VERDICT_GLOSSARY: Record<Verdict, string> = { + blocked: + "At least one high-severity rule fired. Do not progress until the listed actions are taken.", + "needs-attention": + "Medium-severity signals fired. Progress is possible but the reviewer should weigh the listed actions.", + "ready-to-progress": + "All gating rules cleared. The stage may advance.", + unknown: + "No rule fired with non-zero weight. Usually means the extraction was insufficient to decide.", +}; + function esc(value: unknown): string { return String(value) .replace(/&/g, "&amp;") @@ -58,7 +82,7 @@ function fmtJson(value: unknown): string { return esc(JSON.stringify(value)); } -function conditionSummary(c: import("./types.js").Condition): string { +function conditionSummary(c: Condition): string { const parts: string[] = []; if (c.eq !== undefined) parts.push(`eq=${fmtJson(c.eq)}`); if (c.ne !== undefined) parts.push(`ne=${fmtJson(c.ne)}`); @@ -70,13 +94,122 @@ function conditionSummary(c: import("./types.js").Condition): string { return parts.join(" "); } +// Collect actions in priority-of-cause order: walk evaluations (which +// are already sorted priority desc, id asc by the engine) and emit each +// matched rule's contribution actions in evaluation order, deduplicated +// by first-seen. Machine consumers can still read result.actions for +// the alphabetised list. 
+function actionsInPriorityOrder(evaluations: RuleEvaluation[]): string[] { + const seen = new Set<string>(); + const ordered: string[] = []; + for (const ev of evaluations) { + if (!ev.contribution) continue; + for (const a of ev.contribution.actions) { + if (seen.has(a)) continue; + seen.add(a); + ordered.push(a); + } + } + return ordered; +} + +// Collect the flag names referenced by "flag missing in extraction" +// reasons across every condition of every rule, in first-seen order. +// Used to power the "blocker-by-absence" banner near the verdict card. +function missingFlagNames(evaluations: RuleEvaluation[]): { + flags: string[]; + ruleCount: number; +} { + const flagSeen = new Set<string>(); + const flags: string[] = []; + let ruleCount = 0; + for (const ev of evaluations) { + let ruleHasMissing = false; + for (const c of ev.conditions) { + if (c.reason !== "flag missing in extraction") continue; + ruleHasMissing = true; + const name = c.condition.flag; + if (!flagSeen.has(name)) { + flagSeen.add(name); + flags.push(name); + } + } + if (ruleHasMissing) ruleCount += 1; + } + return { flags, ruleCount }; +} + +function renderActionItem(slug: string, glossary?: ActionGlossary): string { + const entry = glossary ? glossary[slug] : undefined; + if (!entry) { + return `<li><code>${esc(slug)}</code></li>`; + } + return `<li><span class="action-sentence">${esc(entry.human)}</span> <code class="action-slug">${esc(slug)}</code></li>`; +} + +function renderEvaluationArticle(ev: RuleEvaluation): string { + const cls = ev.matched ? "rule rule--matched" : "rule rule--skipped"; + const status = ev.matched ? "MATCHED" : "did not match"; + const conds = ev.conditions + .map((c) => { + // Distinguish "flag missing in extraction" (the extraction did + // not supply the input — operator/LLM issue, yellow/warning) + // from "value did not match" (the input was supplied but the + // rule disagreed — domain signal, red/miss). 
+ let cls2: string; + if (c.matched) { + cls2 = "cond cond--ok"; + } else if (c.reason === "flag missing in extraction") { + cls2 = "cond cond--missing"; + } else { + cls2 = "cond cond--miss"; + } + const reason = c.reason + ? ` <span class="reason">(${esc(c.reason)})</span>` + : ""; + return `<li class="${cls2}"><code>${esc(c.condition.flag)}</code> ${conditionSummary(c.condition)} → observed=<code>${fmtJson(c.observed)}</code>${reason}</li>`; + }) + .join(""); + const contribution = ev.contribution + ? `<p class="contribution">Contributes <strong>${VERDICT_PALETTE[ev.contribution.verdict].label}</strong> with weight <strong>${ev.contribution.weight}</strong>. Actions: ${ev.contribution.actions.map((a) => `<code>${esc(a)}</code>`).join(", ")}.</p>` + : ""; + const stage = ev.rule.stage + ? `<span class="badge">stage: ${esc(ev.rule.stage)}</span>` + : ""; + const tags = (ev.rule.tags ?? []) + .map((t) => `<span class="badge badge--tag">${esc(t)}</span>`) + .join(" "); + return ` +<article class="${cls}"> + <header> + <span class="status-pill">${status}</span> + <h3><code>${esc(ev.rule.id)}</code></h3> + <p class="rule-desc">${esc(ev.rule.description)}</p> + <p class="meta"> + <span class="badge">priority: ${ev.rule.priority}</span> + ${stage} + ${tags} + <span class="badge badge--hash" title="content hash of the rule">${esc(ev.rule.hash.slice(0, 12))}</span> + </p> + </header> + <ul class="conditions">${conds}</ul> + ${contribution} +</article>`; +} + export function renderHtmlReport( result: VerdictResult, ctx: RenderContext, ): string { const palette = VERDICT_PALETTE[result.verdict]; - const matchedCount = result.evaluations.filter((e) => e.matched).length; - const totalCount = result.evaluations.length; + const matched = result.evaluations.filter((e) => e.matched); + const unmatched = result.evaluations.filter((e) => !e.matched); + const matchedCount = matched.length; + + // Priority-of-cause action order (per UX research/17): walk matched + // evaluations in 
engine order, dedupe by first-seen. result.actions + // stays alphabetical for machine consumers. + const orderedActions = actionsInPriorityOrder(result.evaluations); const tallyRows = ( ["blocked", "needs-attention", "ready-to-progress", "unknown"] as Verdict[] @@ -87,9 +220,9 @@ export function renderHtmlReport( }) .join(""); - const actionItems = result.actions.length - ? result.actions.map((a) => `<li><code>${esc(a)}</code></li>`).join("") - : `<li class="muted">(none — no rule contributed an action)</li>`; + const actionItems = orderedActions.length + ? orderedActions.map((a) => renderActionItem(a, ctx.actionGlossary)).join("") + : `<li class="muted">(none — no rule contributed an action)</li>`; const flagRows = Object.keys(ctx.flags) .sort() @@ -99,58 +232,39 @@ export function renderHtmlReport( ) .join(""); - const evaluations = result.evaluations - .map((ev) => { - const cls = ev.matched ? "rule rule--matched" : "rule rule--skipped"; - const status = ev.matched ? "MATCHED" : "did not match"; - const conds = ev.conditions - .map((c) => { - // Distinguish "flag missing in extraction" (the extraction did - // not supply the input — operator/LLM issue, yellow/warning) - // from "value did not match" (the input was supplied but the - // rule disagreed — domain signal, red/miss). - let cls2: string; - if (c.matched) { - cls2 = "cond cond--ok"; - } else if (c.reason === "flag missing in extraction") { - cls2 = "cond cond--missing"; - } else { - cls2 = "cond cond--miss"; - } - const reason = c.reason - ? ` <span class="reason">(${esc(c.reason)})</span>` - : ""; - return `<li class="${cls2}"><code>${esc(c.condition.flag)}</code> ${conditionSummary(c.condition)} → observed=<code>${fmtJson(c.observed)}</code>${reason}</li>`; - }) - .join(""); - const contribution = ev.contribution - ? `<p class="contribution">Contributes <strong>${VERDICT_PALETTE[ev.contribution.verdict].label}</strong> with weight <strong>${ev.contribution.weight}</strong>. 
Actions: ${ev.contribution.actions.map((a) => `<code>${esc(a)}</code>`).join(", ")}.</p>` - : ""; - const stage = ev.rule.stage - ? `<span class="badge">stage: ${esc(ev.rule.stage)}</span>` - : ""; - const tags = (ev.rule.tags ?? []) - .map((t) => `<span class="badge badge--tag">${esc(t)}</span>`) - .join(" "); - return ` -<article class="${cls}"> - <header> - <span class="status-pill">${status}</span> - <h3><code>${esc(ev.rule.id)}</code></h3> - <p class="rule-desc">${esc(ev.rule.description)}</p> - <p class="meta"> - <span class="badge">priority: ${ev.rule.priority}</span> - ${stage} - ${tags} - <span class="badge badge--hash" title="content hash of the rule">${esc(ev.rule.hash.slice(0, 12))}</span> - </p> - </header> - <ul class="conditions">${conds}</ul> - ${contribution} -</article>`; - }) + // "What fired" section: only matched rules, in engine order (priority + // desc, id asc). Rendered above the full audit trail so the reader + // sees the answer before the long list of skipped rules. + const whatFiredBody = matched.length + ? matched.map((ev) => renderEvaluationArticle(ev)).join("") + : `<p class="muted">No rule fired. The verdict reflects the absence of any positive signal — usually a sparse extraction.</p>`; + + // Audit trail: matched rules expanded inline, unmatched rules wrapped + // in <details> collapsed by default. This trims the perceived length + // of the report while keeping every rule one click away (UX + // research/17). + const auditMatched = matched.map((ev) => renderEvaluationArticle(ev)).join(""); + const auditUnmatched = unmatched + .map( + (ev) => + `<details class="rule-collapsed"><summary><code>${esc(ev.rule.id)}</code> · <span class="muted">${esc(ev.rule.description)}</span> · <span class="badge">priority: ${ev.rule.priority}</span></summary>${renderEvaluationArticle(ev)}</details>`, + ) .join(""); + const missing = missingFlagNames(result.evaluations); + const missingBanner = + missing.ruleCount > 0 + ? 
`<div class="banner banner--missing" role="note"><strong>${missing.ruleCount}</strong> rule${missing.ruleCount === 1 ? "" : "s"} could not be evaluated because the LLM did not supply <code>${missing.flags.map((f) => esc(f)).join("</code>, <code>")}</code>.</div>` + : ""; + + const skipBanner = + ctx.validationStatus === "skipped" + ? `<div class="banner banner--skip" role="alert"><strong>WARNING:</strong> validation gate was skipped (<code>--skip-validate</code>). Verdict and provenance are NOT verified against the flag schema or forbidden-fields policy.</div>` + : ""; + + // Reproduce command: assembled from the same fields plan/report use. + const reproCmd = `npx tsx src/cli.ts ${ctx.rulesPath} ${ctx.flagsPath} --html <out.html> --quiet`; + return `<!doctype html> <html lang="en"> <head> @@ -181,6 +295,8 @@ export function renderHtmlReport( h1 { font-size: 24px; margin: 0 0 4px; } h2 { font-size: 18px; margin: 32px 0 12px; border-bottom: 1px solid var(--border); padding-bottom: 6px; } h3 { font-size: 15px; margin: 4px 0; } + .system-identity { margin: 6px 0 12px; font-size: 13px; color: var(--muted); } + .system-identity .ts { color: var(--fg); font-weight: 500; font-variant-numeric: tabular-nums; } .verdict-card { background: var(--accent-bg); border-left: 6px solid var(--accent); @@ -192,23 +308,42 @@ export function renderHtmlReport( .verdict-card .label { text-transform: uppercase; letter-spacing: 0.05em; font-size: 12px; font-weight: 600; opacity: 0.7; } .verdict-card .value { font-size: 28px; font-weight: 700; margin: 2px 0 0; } .verdict-card .stats { margin: 8px 0 0; font-size: 13px; } + .banner { + padding: 10px 14px; border-radius: 4px; margin: 12px 0; font-size: 13px; + border-left: 4px solid var(--border); background: var(--code-bg); + } + .banner--missing { background: #fff4e0; border-left-color: #d18900; color: #6c4400; } + .banner--skip { background: #fdecea; border-left-color: #d8281b; color: #7a160d; } .summary-grid { display: grid; 
grid-template-columns: 1fr 1fr; gap: 16px; margin-top: 16px; } + @media (max-width: 540px) { + .summary-grid { grid-template-columns: 1fr; } + } table { border-collapse: collapse; width: 100%; font-size: 13px; } th, td { text-align: left; padding: 6px 8px; border-bottom: 1px solid var(--border); } th[scope="row"] { font-weight: 500; } td.num { text-align: right; font-variant-numeric: tabular-nums; } ul.actions { padding-left: 20px; margin: 0; } ul.actions li { margin: 4px 0; } + .action-slug { font-size: 0.82em; color: var(--muted); margin-left: 4px; } + .action-sentence { } .muted { color: var(--muted); } .rule { border: 1px solid var(--border); border-left: 4px solid var(--border); border-radius: 4px; padding: 14px 16px; margin: 10px 0; } .rule--matched { border-left-color: var(--accent); background: #fcfcfb; } .rule--skipped { opacity: 0.78; } .rule header { display: block; } + .rule-collapsed { margin: 6px 0; } + .rule-collapsed > summary { + cursor: pointer; padding: 8px 10px; border: 1px solid var(--border); border-radius: 4px; + background: #fafbfc; font-size: 13px; + } + .rule-collapsed > summary::marker { color: var(--muted); } + .rule-collapsed[open] > summary { background: var(--code-bg); } + .rule-collapsed .rule { margin-top: 6px; } .status-pill { display: inline-block; font-size: 10px; font-weight: 700; letter-spacing: 0.06em; text-transform: uppercase; padding: 2px 6px; border-radius: 3px; background: var(--code-bg); color: var(--muted); margin-bottom: 4px; @@ -231,61 +366,98 @@ export function renderHtmlReport( } .cond--ok::before { content: "[+]"; color: #1f8a4c; } .cond--miss::before { content: "[-]"; color: #d8281b; } + .cond--miss { background: #fdecea; border-radius: 3px; padding-right: 6px; } .cond--missing::before { content: "[?]"; color: #d18900; } .cond--missing { color: #d18900; background: #fff4e0; border-radius: 3px; padding-right: 6px; } .reason { color: var(--muted); font-style: italic; } .contribution { font-size: 13px; margin: 10px 0 
0; padding: 8px 10px; background: var(--accent-bg); border-radius: 3px; color: var(--accent-fg); } .provenance { font-size: 12px; color: var(--muted); } .provenance code { word-break: break-all; } + .provenance-preamble { font-size: 13px; color: var(--fg); margin: 6px 0 12px; } + .reproduce { background: var(--code-bg); border: 1px solid var(--border); border-radius: 4px; padding: 10px 12px; margin: 12px 0; } + .reproduce p { margin: 0 0 6px; font-size: 12px; color: var(--muted); } + .reproduce pre { margin: 0; white-space: pre-wrap; word-break: break-all; font-size: 12px; } + .legend { margin: 12px 0; font-size: 13px; } + .legend summary { cursor: pointer; font-weight: 500; } + .legend dl { margin: 8px 0 0; } + .legend dt { font-weight: 600; margin-top: 6px; } + .legend dd { margin: 0 0 4px 12px; color: var(--muted); } footer { margin-top: 36px; padding-top: 16px; border-top: 1px solid var(--border); font-size: 12px; color: var(--muted); } </style> </head> <body> <header> <h1>Rule engine report</h1> - <p class="muted">Deterministic verdict from extracted flags — "LLM extracts, rules decide".</p> + <p class="system-identity"> + Rule engine POC v${esc(result.engineVersion)} — deterministic verdict from extracted flags. LLM produces the flags; this engine produces the verdict and audit trail.<br> + Generated <span class="ts">${esc(ctx.generatedAt)}</span>. + </p> </header> + ${skipBanner} + <section class="verdict-card" aria-label="verdict"> - <div class="label">Verdict</div> + <div class="label">Verdict.</div> <div class="value">${esc(palette.label)}</div> - <p class="stats">${matchedCount} of ${totalCount} rules matched · ${result.actions.length} suggested action${result.actions.length === 1 ? "" : "s"}</p> + <p class="stats">${matchedCount} rule${matchedCount === 1 ? "" : "s"} fired · ${orderedActions.length} action${orderedActions.length === 1 ? 
"" : "s"} to take</p> </section> + ${missingBanner} + + <details class="legend" aria-label="legend"> + <summary>Verdict tiers and audit-trail glyphs.</summary> + <dl> + <dt>Blocked</dt><dd>${esc(VERDICT_GLOSSARY.blocked)}</dd> + <dt>Needs attention</dt><dd>${esc(VERDICT_GLOSSARY["needs-attention"])}</dd> + <dt>Ready to progress</dt><dd>${esc(VERDICT_GLOSSARY["ready-to-progress"])}</dd> + <dt>Unknown</dt><dd>${esc(VERDICT_GLOSSARY.unknown)}</dd> + <dt><code>[+]</code></dt><dd>Condition matched against the extracted flags.</dd> + <dt><code>[-]</code></dt><dd>Condition did not match (value was supplied but disagreed).</dd> + <dt><code>[?]</code></dt><dd>Condition could not be evaluated because the flag was missing from the extraction.</dd> + </dl> + </details> + <div class="summary-grid"> <section aria-label="weighted tally"> - <h2>Weighted tally</h2> + <h2>Weighted tally.</h2> <table> <thead><tr><th>Verdict tier</th><th class="num">Weight</th></tr></thead> <tbody>${tallyRows}</tbody> </table> </section> - <section aria-label="suggested actions"> - <h2>Suggested actions</h2> + <section aria-label="actions to take"> + <h2>Take these actions.</h2> <ul class="actions">${actionItems}</ul> </section> </div> <section aria-label="extraction flags"> - <h2>Extraction flags <span class="muted" style="font-size:13px; font-weight:400;">(input from the Orient quadrant)</span></h2> + <h2>Extraction flags. <span class="muted" style="font-size:13px; font-weight:400;">(input from the Orient quadrant)</span></h2> <table> <thead><tr><th>Flag</th><th>Value</th></tr></thead> <tbody>${flagRows}</tbody> </table> </section> + <section aria-label="what fired"> + <h2>What fired.</h2> + ${whatFiredBody} + </section> + <section aria-label="audit trail"> - <h2>Audit trail <span class="muted" style="font-size:13px; font-weight:400;">(deterministic order: priority desc, id asc)</span></h2> - ${evaluations} + <h2>Audit trail. 
<span class="muted" style="font-size:13px; font-weight:400;">(deterministic order: priority desc, id asc)</span></h2> + ${auditMatched} + ${auditUnmatched} </section> <section aria-label="provenance"> - <h2>Provenance</h2> + <h2>Provenance.</h2> + <p class="provenance-preamble">Provenance — these hashes let an auditor replay the verdict against a specific (engine, rules, source) tuple.</p> <p class="provenance"> Engine version: <code>${esc(result.engineVersion)}</code><br> - Ruleset hash: <code>${esc(result.rulesetHash)}</code><br> - Flags hash: <code>${esc(result.flagsHash)}</code><br>${ + Ruleset hash: <code>${esc(result.rulesetHash.slice(0, 12))}…</code><br> + Flags hash: <code>${esc(result.flagsHash.slice(0, 12))}…</code><br>${ ctx.promptHash ? (() => { // The extraction's self-declared prompt hash. If it equals @@ -296,7 +468,7 @@ export function renderHtmlReport( const verified = typeof declared === "string" && declared === ctx.promptHash; const badge = verified - ? ` <span class="badge badge--verified" title="extraction's __prompt_hash matches the recomputed value">verified</span>` + ? ` <span class="badge badge--verified" title="extraction's __prompt_hash matches the recomputed value — this confirms the extraction was produced from the current source, not that the extracted flags are correct">verified</span>` : ""; return ` Prompt hash: <code>${esc(ctx.promptHash.slice(0, 12))}…</code>${badge}<br>`; @@ -306,10 +478,15 @@ export function renderHtmlReport( Rules file: <code>${esc(ctx.rulesPath)}</code><br> Flags file: <code>${esc(ctx.flagsPath)}</code> </p> + <div class="reproduce"> + <p>How to reproduce — run from <code>experiments/rule-engine-poc/</code>:</p> + <pre><code>${esc(reproCmd)}</code></pre> + <p>Then verify the three hashes above match the values in the regenerated report.</p> + </div> </section> <footer> - Generated ${esc(ctx.generatedAt)} · experiments/rule-engine-poc · same inputs → identical report, byte-for-byte (except this timestamp). 
+ experiments/rule-engine-poc · same inputs → identical report, byte-for-byte (except the generated-at timestamp). </footer> </body> </html> diff --git a/experiments/rule-engine-poc/src/report.ts b/experiments/rule-engine-poc/src/report.ts index f796bbead..6136d1ec9 100644 --- a/experiments/rule-engine-poc/src/report.ts +++ b/experiments/rule-engine-poc/src/report.ts @@ -17,6 +17,7 @@ import { evaluate } from "./engine.js"; import { renderHtmlReport } from "./html-report.js"; import { openInBrowser } from "./open-browser.js"; import { validateExtraction } from "./validate.js"; +import { loadActionGlossary, type ActionGlossary } from "./action-glossary.js"; import type { Target } from "./config.js"; import type { Verdict } from "./types.js"; @@ -28,6 +29,21 @@ const skipValidate = takeFlag(remaining, "--skip-validate"); const ctx = loadCliBaseContext(configPath); mkdirSync(ctx.config.reportsDirPath, { recursive: true }); +// Action glossary is optional: load when configured, tolerate absence +// or load failure with a stderr warning so the renderer falls back to +// slug-only rendering (research/18). +let actionGlossary: ActionGlossary | undefined; +if (ctx.config.actionGlossaryPath) { + try { + actionGlossary = loadActionGlossary(ctx.config.actionGlossaryPath); + } catch (err) { + const msg = err instanceof Error ? err.message : String(err); + console.error( + `[report] WARNING: could not load action glossary at ${ctx.config.actionGlossaryPath}: ${msg}. Falling back to slug-only rendering.`, + ); + } +} + const targets = selectTargets(ctx.config, targetFilter); let worstExitCode = 0; @@ -79,6 +95,8 @@ for (const target of targets) { flags, generatedAt: new Date().toISOString(), promptHash: expectedPromptHash, + actionGlossary, + validationStatus: skipValidate ? 
"skipped" : "validated", }); const reportPath = join(ctx.config.reportsDirPath, `${target.id}.html`); writeFileSync(reportPath, html, "utf8"); From a479ada85d20ca6b211a5d26fe540536efb2d40b Mon Sep 17 00:00:00 2001 From: Claude <noreply@anthropic.com> Date: Sun, 17 May 2026 13:09:49 +0000 Subject: [PATCH 33/45] docs(rule-engine-poc): agent C audit-trail.md second-pass refinement --- experiments/rule-engine-poc/docs/audit-trail.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/experiments/rule-engine-poc/docs/audit-trail.md b/experiments/rule-engine-poc/docs/audit-trail.md index 587cc5f59..d075aa363 100644 --- a/experiments/rule-engine-poc/docs/audit-trail.md +++ b/experiments/rule-engine-poc/docs/audit-trail.md @@ -100,7 +100,7 @@ From `research/02-regulatory-auditability.md`, an auditable AI-derived decision | Ruleset version + hash | `rulesetHash` + per-rule `hash` | yes | | Ordered rule trace | `evaluations` in deterministic order | yes | | Verdict + rationale | `verdict` + matched `conditions`, surfaced on the HTML report as the **what-fired section** with human sentences from `rules/action-glossary.yaml` | yes | -| Human-readable "why" presentation (Art. 13) | HTML report: system-identity header, verdict-tier glossary + glyph legend, what-fired section, reproduce block | yes — closes research/02 open item §6.4 | +| Human-readable "why" presentation (Art. 
13) | HTML report: system-identity header, verdict-tier glossary + glyph legend, what-fired section, reproduce block | yes — closes [research/02 §6 item 4](../research/02-regulatory-auditability.md) | | Confidence | n/a (verdicts are categorical) | by design | | Human-oversight record | Caller's responsibility (Act-phase gate) | **upstream** | | Replay manifest | `engineVersion` + the two hashes, plus the on-page reproduce block that prints the exact `npm run report` invocation | yes | From ded740069b6bbbd9c3435bc7fcf652925038200d Mon Sep 17 00:00:00 2001 From: Claude <noreply@anthropic.com> Date: Sun, 17 May 2026 13:15:17 +0000 Subject: [PATCH 34/45] =?UTF-8?q?feat(rule-engine-poc):=20HTML=20report=20?= =?UTF-8?q?v3=20=E2=80=94=20wave=204=20implementer=20pass=20(agent=20B=20c?= =?UTF-8?q?omplete)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Agent B's RALPH loop completed. Twelve convergent findings from research wave 4 now realised in the renderer: 1. 'What fired' section above the full audit trail (UX/17 + critic/21 + auditor/20). Verdict-card stats line now reads 'N rule(s) fired · M action(s) to take'. 2. Non-matched rules collapsed via <details class='rule-collapsed'>; matched rules stay inline (UX/17). 3. Blocker-by-absence banner adjacent to the verdict card when any rule's condition reports 'flag missing in extraction' (critic/21 + UX/17). Yellow palette, names the missing flags. 4. Suggested actions now sorted by priority-of-cause (walk evaluations in priority-desc order, dedup preserving first-seen) instead of alphabetic. result.actions unchanged for machine consumers (UX/17). 5. Action human-sentence rendering via rules/action-glossary.yaml; falls back to bare slug when entry missing (stakeholder/18). 6. Provenance section: preamble explaining the hashes + 'How to reproduce' block + 12-char hash truncation (UX/17 + auditor/20 + stakeholder/18). 7. 
System-identity header above the verdict card: engine version + prominent timestamp moved out of the footer (auditor/20). 8. Verdict-tier glossary + [+]/[-]/[?] glyph legend in a collapsed <details class='legend'> block (auditor/20 + UX/17). 9. cond--miss now has a faint red row-wash matching cond--missing's amber, so the visual distinction isn't glyph-color-only (UX/17). 10. @media (max-width: 540px) single-column fallback (UX/17). 11. Trust calibration: --skip-validate banner shown prominently when validationStatus='skipped'; verified-badge tooltip explains it only means 'extraction is bound to current inputs', not 'flags are correct' (stakeholder/18 + critic/21). 12. Section headers in sentence-case with periods, imperative voice: 'Take these actions.' not 'Suggested actions' (brand/19). 28 new tests in test/html-report.test.ts; suite total 163/163. Three sample reports regenerated under research/sample-reports/ so reviewers see the new shape. --- .../sample-reports/blocked-missing-ears.html | 172 +++++--- .../needs-attention-design-risks.html | 205 ++++++--- .../sample-reports/ready-implementation.html | 172 +++++--- .../rule-engine-poc/test/html-report.test.ts | 392 ++++++++++++++++++ 4 files changed, 791 insertions(+), 150 deletions(-) create mode 100644 experiments/rule-engine-poc/test/html-report.test.ts diff --git a/experiments/rule-engine-poc/research/sample-reports/blocked-missing-ears.html b/experiments/rule-engine-poc/research/sample-reports/blocked-missing-ears.html index 1b3b31e98..5015cafe5 100644 --- a/experiments/rule-engine-poc/research/sample-reports/blocked-missing-ears.html +++ b/experiments/rule-engine-poc/research/sample-reports/blocked-missing-ears.html @@ -28,6 +28,8 @@ h1 { font-size: 24px; margin: 0 0 4px; } h2 { font-size: 18px; margin: 32px 0 12px; border-bottom: 1px solid var(--border); padding-bottom: 6px; } h3 { font-size: 15px; margin: 4px 0; } + .system-identity { margin: 6px 0 12px; font-size: 13px; color: var(--muted); } + 
.system-identity .ts { color: var(--fg); font-weight: 500; font-variant-numeric: tabular-nums; } .verdict-card { background: var(--accent-bg); border-left: 6px solid var(--accent); @@ -39,23 +41,42 @@ .verdict-card .label { text-transform: uppercase; letter-spacing: 0.05em; font-size: 12px; font-weight: 600; opacity: 0.7; } .verdict-card .value { font-size: 28px; font-weight: 700; margin: 2px 0 0; } .verdict-card .stats { margin: 8px 0 0; font-size: 13px; } + .banner { + padding: 10px 14px; border-radius: 4px; margin: 12px 0; font-size: 13px; + border-left: 4px solid var(--border); background: var(--code-bg); + } + .banner--missing { background: #fff4e0; border-left-color: #d18900; color: #6c4400; } + .banner--skip { background: #fdecea; border-left-color: #d8281b; color: #7a160d; } .summary-grid { display: grid; grid-template-columns: 1fr 1fr; gap: 16px; margin-top: 16px; } + @media (max-width: 540px) { + .summary-grid { grid-template-columns: 1fr; } + } table { border-collapse: collapse; width: 100%; font-size: 13px; } th, td { text-align: left; padding: 6px 8px; border-bottom: 1px solid var(--border); } th[scope="row"] { font-weight: 500; } td.num { text-align: right; font-variant-numeric: tabular-nums; } ul.actions { padding-left: 20px; margin: 0; } ul.actions li { margin: 4px 0; } + .action-slug { font-size: 0.82em; color: var(--muted); margin-left: 4px; } + .action-sentence { } .muted { color: var(--muted); } .rule { border: 1px solid var(--border); border-left: 4px solid var(--border); border-radius: 4px; padding: 14px 16px; margin: 10px 0; } .rule--matched { border-left-color: var(--accent); background: #fcfcfb; } .rule--skipped { opacity: 0.78; } .rule header { display: block; } + .rule-collapsed { margin: 6px 0; } + .rule-collapsed > summary { + cursor: pointer; padding: 8px 10px; border: 1px solid var(--border); border-radius: 4px; + background: #fafbfc; font-size: 13px; + } + .rule-collapsed > summary::marker { color: var(--muted); } + 
.rule-collapsed[open] > summary { background: var(--code-bg); } + .rule-collapsed .rule { margin-top: 6px; } .status-pill { display: inline-block; font-size: 10px; font-weight: 700; letter-spacing: 0.06em; text-transform: uppercase; padding: 2px 6px; border-radius: 3px; background: var(--code-bg); color: var(--muted); margin-bottom: 4px; @@ -78,53 +99,119 @@ } .cond--ok::before { content: "[+]"; color: #1f8a4c; } .cond--miss::before { content: "[-]"; color: #d8281b; } + .cond--miss { background: #fdecea; border-radius: 3px; padding-right: 6px; } .cond--missing::before { content: "[?]"; color: #d18900; } .cond--missing { color: #d18900; background: #fff4e0; border-radius: 3px; padding-right: 6px; } .reason { color: var(--muted); font-style: italic; } .contribution { font-size: 13px; margin: 10px 0 0; padding: 8px 10px; background: var(--accent-bg); border-radius: 3px; color: var(--accent-fg); } .provenance { font-size: 12px; color: var(--muted); } .provenance code { word-break: break-all; } + .provenance-preamble { font-size: 13px; color: var(--fg); margin: 6px 0 12px; } + .reproduce { background: var(--code-bg); border: 1px solid var(--border); border-radius: 4px; padding: 10px 12px; margin: 12px 0; } + .reproduce p { margin: 0 0 6px; font-size: 12px; color: var(--muted); } + .reproduce pre { margin: 0; white-space: pre-wrap; word-break: break-all; font-size: 12px; } + .legend { margin: 12px 0; font-size: 13px; } + .legend summary { cursor: pointer; font-weight: 500; } + .legend dl { margin: 8px 0 0; } + .legend dt { font-weight: 600; margin-top: 6px; } + .legend dd { margin: 0 0 4px 12px; color: var(--muted); } footer { margin-top: 36px; padding-top: 16px; border-top: 1px solid var(--border); font-size: 12px; color: var(--muted); } </style> </head> <body> <header> <h1>Rule engine report</h1> - <p class="muted">Deterministic verdict from extracted flags — "LLM extracts, rules decide".</p> + <p class="system-identity"> + Rule engine POC v0.2.0 — deterministic 
verdict from extracted flags. LLM produces the flags; this engine produces the verdict and audit trail.<br> + Generated <span class="ts">2026-05-17T13:12:45.354Z</span>. + </p> </header> + + <section class="verdict-card" aria-label="verdict"> - <div class="label">Verdict</div> + <div class="label">Verdict.</div> <div class="value">Blocked</div> - <p class="stats">1 of 21 rules matched · 1 suggested action</p> + <p class="stats">1 rule fired · 1 action to take</p> </section> + <div class="banner banner--missing" role="note"><strong>14</strong> rules could not be evaluated because the LLM did not supply <code>testing_ears_test_coverage</code>, <code>design_irreversible_have_adrs</code>, <code>implementation_lint_clean</code>, <code>implementation_types_clean</code>, <code>implementation_unit_tests_pass</code>, <code>review_traceability_complete</code>, <code>spec_each_item_traces_to_requirement</code>, <code>brand_review_required</code>, <code>brand_review_passed</code>, <code>idea_problem_statement_present</code>, <code>testing_critical_paths_covered</code>, <code>design_risks_have_mitigations</code>, <code>idea_scope_bounded</code>, <code>idea_target_users_named</code>.</div> + + <details class="legend" aria-label="legend"> + <summary>Verdict tiers and audit-trail glyphs.</summary> + <dl> + <dt>Blocked</dt><dd>At least one high-severity rule fired. Do not progress until the listed actions are taken.</dd> + <dt>Needs attention</dt><dd>Medium-severity signals fired. Progress is possible but the reviewer should weigh the listed actions.</dd> + <dt>Ready to progress</dt><dd>All gating rules cleared. The stage may advance.</dd> + <dt>Unknown</dt><dd>No rule fired with non-zero weight. 
Usually means the extraction was insufficient to decide.</dd> + <dt><code>[+]</code></dt><dd>Condition matched against the extracted flags.</dd> + <dt><code>[-]</code></dt><dd>Condition did not match (value was supplied but disagreed).</dd> + <dt><code>[?]</code></dt><dd>Condition could not be evaluated because the flag was missing from the extraction.</dd> + </dl> + </details> + <div class="summary-grid"> <section aria-label="weighted tally"> - <h2>Weighted tally</h2> + <h2>Weighted tally.</h2> <table> <thead><tr><th>Verdict tier</th><th class="num">Weight</th></tr></thead> <tbody><tr><th scope="row">Blocked</th><td class="num">100</td></tr><tr><th scope="row">Needs attention</th><td class="num">0</td></tr><tr><th scope="row">Ready to progress</th><td class="num">0</td></tr><tr><th scope="row">Unknown</th><td class="num">0</td></tr></tbody> </table> </section> - <section aria-label="suggested actions"> - <h2>Suggested actions</h2> + <section aria-label="actions to take"> + <h2>Take these actions.</h2> <ul class="actions"><li><code>rewrite-non-ears-requirements</code></li></ul> </section> </div> <section aria-label="extraction flags"> - <h2>Extraction flags <span class="muted" style="font-size:13px; font-weight:400;">(input from the Orient quadrant)</span></h2> + <h2>Extraction flags. 
<span class="muted" style="font-size:13px; font-weight:400;">(input from the Orient quadrant)</span></h2> <table> <thead><tr><th>Flag</th><th>Value</th></tr></thead> <tbody><tr><th scope="row"><code>blockers_count</code></th><td><code>0</code></td></tr><tr><th scope="row"><code>current_stage</code></th><td><code>"requirements"</code></td></tr><tr><th scope="row"><code>feature_slug</code></th><td><code>"auth-refresh"</code></td></tr><tr><th scope="row"><code>open_clarifications_count</code></th><td><code>0</code></td></tr><tr><th scope="row"><code>requirements_acceptance_criteria_testable</code></th><td><code>true</code></td></tr><tr><th scope="row"><code>requirements_ears_coverage</code></th><td><code>0.6</code></td></tr><tr><th scope="row"><code>requirements_have_stable_ids</code></th><td><code>true</code></td></tr><tr><th scope="row"><code>s1_findings_count</code></th><td><code>0</code></td></tr><tr><th scope="row"><code>s2_findings_count</code></th><td><code>0</code></td></tr><tr><th scope="row"><code>s3_findings_count</code></th><td><code>1</code></td></tr></tbody> </table> </section> + <section aria-label="what fired"> + <h2>What fired.</h2> + +<article class="rule rule--matched"> + <header> + <span class="status-pill">MATCHED</span> + <h3><code>req-ears-mandatory</code></h3> + <p class="rule-desc">All functional requirements must use EARS notation.</p> + <p class="meta"> + <span class="badge">priority: 100</span> + <span class="badge">stage: requirements</span> + <span class="badge badge--tag">dod</span> <span class="badge badge--tag">requirements</span> <span class="badge badge--tag">gate</span> + <span class="badge badge--hash" title="content hash of the rule">a666b2a5ae60</span> + </p> + </header> + <ul class="conditions"><li class="cond cond--ok"><code>current_stage</code> in=["requirements","design","specification","tasks","implementation","testing","review"] → observed=<code>"requirements"</code></li><li class="cond 
cond--ok"><code>requirements_ears_coverage</code> lt=1 → observed=<code>0.6</code></li></ul> + <p class="contribution">Contributes <strong>Blocked</strong> with weight <strong>100</strong>. Actions: <code>rewrite-non-ears-requirements</code>.</p> +</article> + </section> + <section aria-label="audit trail"> - <h2>Audit trail <span class="muted" style="font-size:13px; font-weight:400;">(deterministic order: priority desc, id asc)</span></h2> + <h2>Audit trail. <span class="muted" style="font-size:13px; font-weight:400;">(deterministic order: priority desc, id asc)</span></h2> +<article class="rule rule--matched"> + <header> + <span class="status-pill">MATCHED</span> + <h3><code>req-ears-mandatory</code></h3> + <p class="rule-desc">All functional requirements must use EARS notation.</p> + <p class="meta"> + <span class="badge">priority: 100</span> + <span class="badge">stage: requirements</span> + <span class="badge badge--tag">dod</span> <span class="badge badge--tag">requirements</span> <span class="badge badge--tag">gate</span> + <span class="badge badge--hash" title="content hash of the rule">a666b2a5ae60</span> + </p> + </header> + <ul class="conditions"><li class="cond cond--ok"><code>current_stage</code> in=["requirements","design","specification","tasks","implementation","testing","review"] → observed=<code>"requirements"</code></li><li class="cond cond--ok"><code>requirements_ears_coverage</code> lt=1 → observed=<code>0.6</code></li></ul> + <p class="contribution">Contributes <strong>Blocked</strong> with weight <strong>100</strong>. 
Actions: <code>rewrite-non-ears-requirements</code>.</p> +</article> + <details class="rule-collapsed"><summary><code>any-s1-finding-blocks</code> · <span class="muted">Any S1 (critical) finding blocks all progression.</span> · <span class="badge">priority: 200</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not match</span> @@ -139,7 +226,7 @@ <h3><code>any-s1-finding-blocks</code></h3> </header> <ul class="conditions"><li class="cond cond--miss"><code>s1_findings_count</code> gt=0 → observed=<code>0</code></li></ul> -</article> +</article></details><details class="rule-collapsed"><summary><code>any-s2-finding-needs-attention</code> · <span class="muted">S2 (high) finding requires attention this sprint.</span> · <span class="badge">priority: 100</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not match</span> @@ -154,22 +241,7 @@ <h3><code>any-s2-finding-needs-attention</code></h3> </header> <ul class="conditions"><li class="cond cond--miss"><code>s2_findings_count</code> gt=0 → observed=<code>0</code></li></ul> -</article> -<article class="rule rule--matched"> - <header> - <span class="status-pill">MATCHED</span> - <h3><code>req-ears-mandatory</code></h3> - <p class="rule-desc">All functional requirements must use EARS notation.</p> - <p class="meta"> - <span class="badge">priority: 100</span> - <span class="badge">stage: requirements</span> - <span class="badge badge--tag">dod</span> <span class="badge badge--tag">requirements</span> <span class="badge badge--tag">gate</span> - <span class="badge badge--hash" title="content hash of the rule">a666b2a5ae60</span> - </p> - </header> - <ul class="conditions"><li class="cond cond--ok"><code>current_stage</code> in=["requirements","design","specification","tasks","implementation","testing","review"] → observed=<code>"requirements"</code></li><li class="cond cond--ok"><code>requirements_ears_coverage</code> lt=1 → 
observed=<code>0.6</code></li></ul> - <p class="contribution">Contributes <strong>Blocked</strong> with weight <strong>100</strong>. Actions: <code>rewrite-non-ears-requirements</code>.</p> -</article> +</article></details><details class="rule-collapsed"><summary><code>blockers-block</code> · <span class="muted">Open blockers must be cleared.</span> · <span class="badge">priority: 95</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not match</span> @@ -184,7 +256,7 @@ <h3><code>blockers-block</code></h3> </header> <ul class="conditions"><li class="cond cond--miss"><code>blockers_count</code> gt=0 → observed=<code>0</code></li></ul> -</article> +</article></details><details class="rule-collapsed"><summary><code>open-clarifications-block</code> · <span class="muted">Open clarifications must be resolved before stage progression.</span> · <span class="badge">priority: 95</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not match</span> @@ -199,7 +271,7 @@ <h3><code>open-clarifications-block</code></h3> </header> <ul class="conditions"><li class="cond cond--miss"><code>open_clarifications_count</code> gt=0 → observed=<code>0</code></li></ul> -</article> +</article></details><details class="rule-collapsed"><summary><code>testing-ears-coverage-incomplete</code> · <span class="muted">Every EARS clause must have >= 1 test.</span> · <span class="badge">priority: 95</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not match</span> @@ -214,7 +286,7 @@ <h3><code>testing-ears-coverage-incomplete</code></h3> </header> <ul class="conditions"><li class="cond cond--miss"><code>current_stage</code> in=["testing","review"] → observed=<code>"requirements"</code></li><li class="cond cond--missing"><code>testing_ears_test_coverage</code> lt=1 → observed=<code>undefined</code> <span class="reason">(flag missing in extraction)</span></li></ul> -</article> 
+</article></details><details class="rule-collapsed"><summary><code>design-irreversible-needs-adr</code> · <span class="muted">Irreversible architectural decisions must have ADRs.</span> · <span class="badge">priority: 90</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not match</span> @@ -229,7 +301,7 @@ <h3><code>design-irreversible-needs-adr</code></h3> </header> <ul class="conditions"><li class="cond cond--miss"><code>current_stage</code> in=["design","specification","tasks","implementation"] → observed=<code>"requirements"</code></li><li class="cond cond--missing"><code>design_irreversible_have_adrs</code> eq=false → observed=<code>undefined</code> <span class="reason">(flag missing in extraction)</span></li></ul> -</article> +</article></details><details class="rule-collapsed"><summary><code>impl-lint-clean</code> · <span class="muted">Implementation must be lint clean.</span> · <span class="badge">priority: 90</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not match</span> @@ -244,7 +316,7 @@ <h3><code>impl-lint-clean</code></h3> </header> <ul class="conditions"><li class="cond cond--miss"><code>current_stage</code> in=["implementation","testing","review"] → observed=<code>"requirements"</code></li><li class="cond cond--missing"><code>implementation_lint_clean</code> eq=false → observed=<code>undefined</code> <span class="reason">(flag missing in extraction)</span></li></ul> -</article> +</article></details><details class="rule-collapsed"><summary><code>impl-types-clean</code> · <span class="muted">TypeScript / type checks must pass.</span> · <span class="badge">priority: 90</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not match</span> @@ -259,7 +331,7 @@ <h3><code>impl-types-clean</code></h3> </header> <ul class="conditions"><li class="cond cond--miss"><code>current_stage</code> in=["implementation","testing","review"] → 
observed=<code>"requirements"</code></li><li class="cond cond--missing"><code>implementation_types_clean</code> eq=false → observed=<code>undefined</code> <span class="reason">(flag missing in extraction)</span></li></ul> -</article> +</article></details><details class="rule-collapsed"><summary><code>impl-unit-tests-pass</code> · <span class="muted">Unit tests for the changed surface must pass.</span> · <span class="badge">priority: 90</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not match</span> @@ -274,7 +346,7 @@ <h3><code>impl-unit-tests-pass</code></h3> </header> <ul class="conditions"><li class="cond cond--miss"><code>current_stage</code> in=["implementation","testing","review"] → observed=<code>"requirements"</code></li><li class="cond cond--missing"><code>implementation_unit_tests_pass</code> eq=false → observed=<code>undefined</code> <span class="reason">(flag missing in extraction)</span></li></ul> -</article> +</article></details><details class="rule-collapsed"><summary><code>req-has-stable-ids</code> · <span class="muted">Each requirement must have a stable REQ-<AREA>-NNN id.</span> · <span class="badge">priority: 90</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not match</span> @@ -289,7 +361,7 @@ <h3><code>req-has-stable-ids</code></h3> </header> <ul class="conditions"><li class="cond cond--ok"><code>current_stage</code> in=["requirements","design","specification","tasks","implementation","testing","review"] → observed=<code>"requirements"</code></li><li class="cond cond--miss"><code>requirements_have_stable_ids</code> eq=false → observed=<code>true</code></li></ul> -</article> +</article></details><details class="rule-collapsed"><summary><code>review-traceability-incomplete</code> · <span class="muted">Traceability matrix must be complete and consistent.</span> · <span class="badge">priority: 90</span></summary> <article class="rule rule--skipped"> <header> 
<span class="status-pill">did not match</span> @@ -304,7 +376,7 @@ <h3><code>review-traceability-incomplete</code></h3> </header> <ul class="conditions"><li class="cond cond--miss"><code>current_stage</code> eq="review" → observed=<code>"requirements"</code></li><li class="cond cond--missing"><code>review_traceability_complete</code> eq=false → observed=<code>undefined</code> <span class="reason">(flag missing in extraction)</span></li></ul> -</article> +</article></details><details class="rule-collapsed"><summary><code>spec-items-trace-to-requirements</code> · <span class="muted">Each spec item must trace to >= 1 requirement.</span> · <span class="badge">priority: 90</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not match</span> @@ -319,7 +391,7 @@ <h3><code>spec-items-trace-to-requirements</code></h3> </header> <ul class="conditions"><li class="cond cond--miss"><code>current_stage</code> in=["specification","tasks","implementation","testing","review"] → observed=<code>"requirements"</code></li><li class="cond cond--missing"><code>spec_each_item_traces_to_requirement</code> eq=false → observed=<code>undefined</code> <span class="reason">(flag missing in extraction)</span></li></ul> -</article> +</article></details><details class="rule-collapsed"><summary><code>review-brand-required-but-missing</code> · <span class="muted">Brand review required (touches sites/, UI surfaces) but not posted.</span> · <span class="badge">priority: 85</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not match</span> @@ -334,7 +406,7 @@ <h3><code>review-brand-required-but-missing</code></h3> </header> <ul class="conditions"><li class="cond cond--miss"><code>current_stage</code> eq="review" → observed=<code>"requirements"</code></li><li class="cond cond--missing"><code>brand_review_required</code> eq=true → observed=<code>undefined</code> <span class="reason">(flag missing in 
extraction)</span></li><li class="cond cond--missing"><code>brand_review_passed</code> eq=false → observed=<code>undefined</code> <span class="reason">(flag missing in extraction)</span></li></ul> -</article> +</article></details><details class="rule-collapsed"><summary><code>idea-problem-statement-present</code> · <span class="muted">Idea must have a one-paragraph problem statement.</span> · <span class="badge">priority: 80</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not match</span> @@ -349,7 +421,7 @@ <h3><code>idea-problem-statement-present</code></h3> </header> <ul class="conditions"><li class="cond cond--miss"><code>current_stage</code> eq="idea" → observed=<code>"requirements"</code></li><li class="cond cond--missing"><code>idea_problem_statement_present</code> eq=false → observed=<code>undefined</code> <span class="reason">(flag missing in extraction)</span></li></ul> -</article> +</article></details><details class="rule-collapsed"><summary><code>req-acceptance-testable</code> · <span class="muted">Acceptance criteria must be testable.</span> · <span class="badge">priority: 80</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not match</span> @@ -364,7 +436,7 @@ <h3><code>req-acceptance-testable</code></h3> </header> <ul class="conditions"><li class="cond cond--ok"><code>current_stage</code> eq="requirements" → observed=<code>"requirements"</code></li><li class="cond cond--miss"><code>requirements_acceptance_criteria_testable</code> eq=false → observed=<code>true</code></li></ul> -</article> +</article></details><details class="rule-collapsed"><summary><code>testing-critical-paths-uncovered</code> · <span class="muted">Critical paths (happy + key edge cases) must be covered.</span> · <span class="badge">priority: 80</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not match</span> @@ -379,7 +451,7 @@ 
<h3><code>testing-critical-paths-uncovered</code></h3> </header> <ul class="conditions"><li class="cond cond--miss"><code>current_stage</code> in=["testing","review"] → observed=<code>"requirements"</code></li><li class="cond cond--missing"><code>testing_critical_paths_covered</code> eq=false → observed=<code>undefined</code> <span class="reason">(flag missing in extraction)</span></li></ul> -</article> +</article></details><details class="rule-collapsed"><summary><code>design-risks-have-mitigations</code> · <span class="muted">Identified risks must have mitigations.</span> · <span class="badge">priority: 70</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not match</span> @@ -394,7 +466,7 @@ <h3><code>design-risks-have-mitigations</code></h3> </header> <ul class="conditions"><li class="cond cond--miss"><code>current_stage</code> eq="design" → observed=<code>"requirements"</code></li><li class="cond cond--missing"><code>design_risks_have_mitigations</code> eq=false → observed=<code>undefined</code> <span class="reason">(flag missing in extraction)</span></li></ul> -</article> +</article></details><details class="rule-collapsed"><summary><code>idea-scope-bounded</code> · <span class="muted">Idea scope must be bounded (no "boil the ocean" framing).</span> · <span class="badge">priority: 70</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not match</span> @@ -409,7 +481,7 @@ <h3><code>idea-scope-bounded</code></h3> </header> <ul class="conditions"><li class="cond cond--miss"><code>current_stage</code> eq="idea" → observed=<code>"requirements"</code></li><li class="cond cond--missing"><code>idea_scope_bounded</code> eq=false → observed=<code>undefined</code> <span class="reason">(flag missing in extraction)</span></li></ul> -</article> +</article></details><details class="rule-collapsed"><summary><code>idea-ready</code> · <span class="muted">Idea DoD satisfied — ready for 
/spec:research.</span> · <span class="badge">priority: 10</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not match</span> @@ -424,7 +496,7 @@ <h3><code>idea-ready</code></h3> </header> <ul class="conditions"><li class="cond cond--miss"><code>current_stage</code> eq="idea" → observed=<code>"requirements"</code></li><li class="cond cond--missing"><code>idea_problem_statement_present</code> eq=true → observed=<code>undefined</code> <span class="reason">(flag missing in extraction)</span></li><li class="cond cond--missing"><code>idea_target_users_named</code> eq=true → observed=<code>undefined</code> <span class="reason">(flag missing in extraction)</span></li><li class="cond cond--missing"><code>idea_scope_bounded</code> eq=true → observed=<code>undefined</code> <span class="reason">(flag missing in extraction)</span></li></ul> -</article> +</article></details><details class="rule-collapsed"><summary><code>impl-ready</code> · <span class="muted">Implementation DoD satisfied — ready for /spec:test.</span> · <span class="badge">priority: 10</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not match</span> @@ -439,22 +511,28 @@ <h3><code>impl-ready</code></h3> </header> <ul class="conditions"><li class="cond cond--miss"><code>current_stage</code> eq="implementation" → observed=<code>"requirements"</code></li><li class="cond cond--missing"><code>implementation_lint_clean</code> eq=true → observed=<code>undefined</code> <span class="reason">(flag missing in extraction)</span></li><li class="cond cond--missing"><code>implementation_types_clean</code> eq=true → observed=<code>undefined</code> <span class="reason">(flag missing in extraction)</span></li><li class="cond cond--missing"><code>implementation_unit_tests_pass</code> eq=true → observed=<code>undefined</code> <span class="reason">(flag missing in extraction)</span></li><li class="cond 
cond--ok"><code>open_clarifications_count</code> eq=0 → observed=<code>0</code></li><li class="cond cond--ok"><code>s1_findings_count</code> eq=0 → observed=<code>0</code></li></ul> -</article> +</article></details> </section> <section aria-label="provenance"> - <h2>Provenance</h2> + <h2>Provenance.</h2> + <p class="provenance-preamble">Provenance — these hashes let an auditor replay the verdict against a specific (engine, rules, source) tuple.</p> <p class="provenance"> Engine version: <code>0.2.0</code><br> - Ruleset hash: <code>84a35f019743b26e106d82332b2e07cc12979c9ce1fdf4b466550b3b37e25d92</code><br> - Flags hash: <code>019a8f31666f943bd178a38a89cc5ef99c2d4476993e4b28c478054e381bb680</code><br> + Ruleset hash: <code>84a35f019743…</code><br> + Flags hash: <code>019a8f31666f…</code><br> Rules file: <code>rules/quality-gates.yaml</code><br> Flags file: <code>fixtures/blocked-missing-ears.json</code> </p> + <div class="reproduce"> + <p>How to reproduce — run from <code>experiments/rule-engine-poc/</code>:</p> + <pre><code>npx tsx src/cli.ts rules/quality-gates.yaml fixtures/blocked-missing-ears.json --html <out.html> --quiet</code></pre> + <p>Then verify the three hashes above match the values in the regenerated report.</p> + </div> </section> <footer> - Generated 2026-05-17T12:56:59.541Z · experiments/rule-engine-poc · same inputs → identical report, byte-for-byte (except this timestamp). + experiments/rule-engine-poc · same inputs → identical report, byte-for-byte (except the generated-at timestamp). 
</footer> </body> </html> diff --git a/experiments/rule-engine-poc/research/sample-reports/needs-attention-design-risks.html b/experiments/rule-engine-poc/research/sample-reports/needs-attention-design-risks.html index 10511a642..552063071 100644 --- a/experiments/rule-engine-poc/research/sample-reports/needs-attention-design-risks.html +++ b/experiments/rule-engine-poc/research/sample-reports/needs-attention-design-risks.html @@ -28,6 +28,8 @@ h1 { font-size: 24px; margin: 0 0 4px; } h2 { font-size: 18px; margin: 32px 0 12px; border-bottom: 1px solid var(--border); padding-bottom: 6px; } h3 { font-size: 15px; margin: 4px 0; } + .system-identity { margin: 6px 0 12px; font-size: 13px; color: var(--muted); } + .system-identity .ts { color: var(--fg); font-weight: 500; font-variant-numeric: tabular-nums; } .verdict-card { background: var(--accent-bg); border-left: 6px solid var(--accent); @@ -39,23 +41,42 @@ .verdict-card .label { text-transform: uppercase; letter-spacing: 0.05em; font-size: 12px; font-weight: 600; opacity: 0.7; } .verdict-card .value { font-size: 28px; font-weight: 700; margin: 2px 0 0; } .verdict-card .stats { margin: 8px 0 0; font-size: 13px; } + .banner { + padding: 10px 14px; border-radius: 4px; margin: 12px 0; font-size: 13px; + border-left: 4px solid var(--border); background: var(--code-bg); + } + .banner--missing { background: #fff4e0; border-left-color: #d18900; color: #6c4400; } + .banner--skip { background: #fdecea; border-left-color: #d8281b; color: #7a160d; } .summary-grid { display: grid; grid-template-columns: 1fr 1fr; gap: 16px; margin-top: 16px; } + @media (max-width: 540px) { + .summary-grid { grid-template-columns: 1fr; } + } table { border-collapse: collapse; width: 100%; font-size: 13px; } th, td { text-align: left; padding: 6px 8px; border-bottom: 1px solid var(--border); } th[scope="row"] { font-weight: 500; } td.num { text-align: right; font-variant-numeric: tabular-nums; } ul.actions { padding-left: 20px; margin: 0; } 
ul.actions li { margin: 4px 0; } + .action-slug { font-size: 0.82em; color: var(--muted); margin-left: 4px; } + .action-sentence { } .muted { color: var(--muted); } .rule { border: 1px solid var(--border); border-left: 4px solid var(--border); border-radius: 4px; padding: 14px 16px; margin: 10px 0; } .rule--matched { border-left-color: var(--accent); background: #fcfcfb; } .rule--skipped { opacity: 0.78; } .rule header { display: block; } + .rule-collapsed { margin: 6px 0; } + .rule-collapsed > summary { + cursor: pointer; padding: 8px 10px; border: 1px solid var(--border); border-radius: 4px; + background: #fafbfc; font-size: 13px; + } + .rule-collapsed > summary::marker { color: var(--muted); } + .rule-collapsed[open] > summary { background: var(--code-bg); } + .rule-collapsed .rule { margin-top: 6px; } .status-pill { display: inline-block; font-size: 10px; font-weight: 700; letter-spacing: 0.06em; text-transform: uppercase; padding: 2px 6px; border-radius: 3px; background: var(--code-bg); color: var(--muted); margin-bottom: 4px; @@ -78,68 +99,118 @@ } .cond--ok::before { content: "[+]"; color: #1f8a4c; } .cond--miss::before { content: "[-]"; color: #d8281b; } + .cond--miss { background: #fdecea; border-radius: 3px; padding-right: 6px; } .cond--missing::before { content: "[?]"; color: #d18900; } .cond--missing { color: #d18900; background: #fff4e0; border-radius: 3px; padding-right: 6px; } .reason { color: var(--muted); font-style: italic; } .contribution { font-size: 13px; margin: 10px 0 0; padding: 8px 10px; background: var(--accent-bg); border-radius: 3px; color: var(--accent-fg); } .provenance { font-size: 12px; color: var(--muted); } .provenance code { word-break: break-all; } + .provenance-preamble { font-size: 13px; color: var(--fg); margin: 6px 0 12px; } + .reproduce { background: var(--code-bg); border: 1px solid var(--border); border-radius: 4px; padding: 10px 12px; margin: 12px 0; } + .reproduce p { margin: 0 0 6px; font-size: 12px; color: 
var(--muted); } + .reproduce pre { margin: 0; white-space: pre-wrap; word-break: break-all; font-size: 12px; } + .legend { margin: 12px 0; font-size: 13px; } + .legend summary { cursor: pointer; font-weight: 500; } + .legend dl { margin: 8px 0 0; } + .legend dt { font-weight: 600; margin-top: 6px; } + .legend dd { margin: 0 0 4px 12px; color: var(--muted); } footer { margin-top: 36px; padding-top: 16px; border-top: 1px solid var(--border); font-size: 12px; color: var(--muted); } </style> </head> <body> <header> <h1>Rule engine report</h1> - <p class="muted">Deterministic verdict from extracted flags — "LLM extracts, rules decide".</p> + <p class="system-identity"> + Rule engine POC v0.2.0 — deterministic verdict from extracted flags. LLM produces the flags; this engine produces the verdict and audit trail.<br> + Generated <span class="ts">2026-05-17T13:12:51.235Z</span>. + </p> </header> + + <section class="verdict-card" aria-label="verdict"> - <div class="label">Verdict</div> + <div class="label">Verdict.</div> <div class="value">Needs attention</div> - <p class="stats">2 of 21 rules matched · 2 suggested actions</p> + <p class="stats">2 rules fired · 2 actions to take</p> </section> + <div class="banner banner--missing" role="note"><strong>12</strong> rules could not be evaluated because the LLM did not supply <code>testing_ears_test_coverage</code>, <code>implementation_lint_clean</code>, <code>implementation_types_clean</code>, <code>implementation_unit_tests_pass</code>, <code>review_traceability_complete</code>, <code>spec_each_item_traces_to_requirement</code>, <code>brand_review_required</code>, <code>brand_review_passed</code>, <code>idea_problem_statement_present</code>, <code>testing_critical_paths_covered</code>, <code>idea_scope_bounded</code>, <code>idea_target_users_named</code>.</div> + + <details class="legend" aria-label="legend"> + <summary>Verdict tiers and audit-trail glyphs.</summary> + <dl> + <dt>Blocked</dt><dd>At least one high-severity 
rule fired. Do not progress until the listed actions are taken.</dd> + <dt>Needs attention</dt><dd>Medium-severity signals fired. Progress is possible but the reviewer should weigh the listed actions.</dd> + <dt>Ready to progress</dt><dd>All gating rules cleared. The stage may advance.</dd> + <dt>Unknown</dt><dd>No rule fired with non-zero weight. Usually means the extraction was insufficient to decide.</dd> + <dt><code>[+]</code></dt><dd>Condition matched against the extracted flags.</dd> + <dt><code>[-]</code></dt><dd>Condition did not match (value was supplied but disagreed).</dd> + <dt><code>[?]</code></dt><dd>Condition could not be evaluated because the flag was missing from the extraction.</dd> + </dl> + </details> + <div class="summary-grid"> <section aria-label="weighted tally"> - <h2>Weighted tally</h2> + <h2>Weighted tally.</h2> <table> <thead><tr><th>Verdict tier</th><th class="num">Weight</th></tr></thead> <tbody><tr><th scope="row">Blocked</th><td class="num">0</td></tr><tr><th scope="row">Needs attention</th><td class="num">100</td></tr><tr><th scope="row">Ready to progress</th><td class="num">0</td></tr><tr><th scope="row">Unknown</th><td class="num">0</td></tr></tbody> </table> </section> - <section aria-label="suggested actions"> - <h2>Suggested actions</h2> - <ul class="actions"><li><code>propose-risk-mitigations</code></li><li><code>schedule-s2-fix</code></li></ul> + <section aria-label="actions to take"> + <h2>Take these actions.</h2> + <ul class="actions"><li><code>schedule-s2-fix</code></li><li><code>propose-risk-mitigations</code></li></ul> </section> </div> <section aria-label="extraction flags"> - <h2>Extraction flags <span class="muted" style="font-size:13px; font-weight:400;">(input from the Orient quadrant)</span></h2> + <h2>Extraction flags. 
<span class="muted" style="font-size:13px; font-weight:400;">(input from the Orient quadrant)</span></h2> <table> <thead><tr><th>Flag</th><th>Value</th></tr></thead> <tbody><tr><th scope="row"><code>blockers_count</code></th><td><code>0</code></td></tr><tr><th scope="row"><code>current_stage</code></th><td><code>"design"</code></td></tr><tr><th scope="row"><code>design_irreversible_have_adrs</code></th><td><code>true</code></td></tr><tr><th scope="row"><code>design_risks_have_mitigations</code></th><td><code>false</code></td></tr><tr><th scope="row"><code>feature_slug</code></th><td><code>"search-relevance"</code></td></tr><tr><th scope="row"><code>open_clarifications_count</code></th><td><code>0</code></td></tr><tr><th scope="row"><code>requirements_acceptance_criteria_testable</code></th><td><code>true</code></td></tr><tr><th scope="row"><code>requirements_ears_coverage</code></th><td><code>1</code></td></tr><tr><th scope="row"><code>requirements_have_stable_ids</code></th><td><code>true</code></td></tr><tr><th scope="row"><code>s1_findings_count</code></th><td><code>0</code></td></tr><tr><th scope="row"><code>s2_findings_count</code></th><td><code>2</code></td></tr><tr><th scope="row"><code>s3_findings_count</code></th><td><code>3</code></td></tr></tbody> </table> </section> - <section aria-label="audit trail"> - <h2>Audit trail <span class="muted" style="font-size:13px; font-weight:400;">(deterministic order: priority desc, id asc)</span></h2> + <section aria-label="what fired"> + <h2>What fired.</h2> -<article class="rule rule--skipped"> +<article class="rule rule--matched"> <header> - <span class="status-pill">did not match</span> - <h3><code>any-s1-finding-blocks</code></h3> - <p class="rule-desc">Any S1 (critical) finding blocks all progression.</p> + <span class="status-pill">MATCHED</span> + <h3><code>any-s2-finding-needs-attention</code></h3> + <p class="rule-desc">S2 (high) finding requires attention this sprint.</p> <p class="meta"> - <span 
class="badge">priority: 200</span> + <span class="badge">priority: 100</span> <span class="badge">stage: any</span> - <span class="badge badge--tag">severity</span> <span class="badge badge--tag">gate</span> - <span class="badge badge--hash" title="content hash of the rule">e8ff14765412</span> + <span class="badge badge--tag">severity</span> + <span class="badge badge--hash" title="content hash of the rule">35db2ac7fc96</span> </p> </header> - <ul class="conditions"><li class="cond cond--miss"><code>s1_findings_count</code> gt=0 → observed=<code>0</code></li></ul> - + <ul class="conditions"><li class="cond cond--ok"><code>s2_findings_count</code> gt=0 → observed=<code>2</code></li></ul> + <p class="contribution">Contributes <strong>Needs attention</strong> with weight <strong>60</strong>. Actions: <code>schedule-s2-fix</code>.</p> </article> +<article class="rule rule--matched"> + <header> + <span class="status-pill">MATCHED</span> + <h3><code>design-risks-have-mitigations</code></h3> + <p class="rule-desc">Identified risks must have mitigations.</p> + <p class="meta"> + <span class="badge">priority: 70</span> + <span class="badge">stage: design</span> + <span class="badge badge--tag">dod</span> <span class="badge badge--tag">design</span> + <span class="badge badge--hash" title="content hash of the rule">8191d2b9c3a0</span> + </p> + </header> + <ul class="conditions"><li class="cond cond--ok"><code>current_stage</code> eq="design" → observed=<code>"design"</code></li><li class="cond cond--ok"><code>design_risks_have_mitigations</code> eq=false → observed=<code>false</code></li></ul> + <p class="contribution">Contributes <strong>Needs attention</strong> with weight <strong>40</strong>. Actions: <code>propose-risk-mitigations</code>.</p> +</article> + </section> + + <section aria-label="audit trail"> + <h2>Audit trail. 
<span class="muted" style="font-size:13px; font-weight:400;">(deterministic order: priority desc, id asc)</span></h2> + <article class="rule rule--matched"> <header> <span class="status-pill">MATCHED</span> @@ -155,6 +226,37 @@ <h3><code>any-s2-finding-needs-attention</code></h3> <ul class="conditions"><li class="cond cond--ok"><code>s2_findings_count</code> gt=0 → observed=<code>2</code></li></ul> <p class="contribution">Contributes <strong>Needs attention</strong> with weight <strong>60</strong>. Actions: <code>schedule-s2-fix</code>.</p> </article> +<article class="rule rule--matched"> + <header> + <span class="status-pill">MATCHED</span> + <h3><code>design-risks-have-mitigations</code></h3> + <p class="rule-desc">Identified risks must have mitigations.</p> + <p class="meta"> + <span class="badge">priority: 70</span> + <span class="badge">stage: design</span> + <span class="badge badge--tag">dod</span> <span class="badge badge--tag">design</span> + <span class="badge badge--hash" title="content hash of the rule">8191d2b9c3a0</span> + </p> + </header> + <ul class="conditions"><li class="cond cond--ok"><code>current_stage</code> eq="design" → observed=<code>"design"</code></li><li class="cond cond--ok"><code>design_risks_have_mitigations</code> eq=false → observed=<code>false</code></li></ul> + <p class="contribution">Contributes <strong>Needs attention</strong> with weight <strong>40</strong>. 
Actions: <code>propose-risk-mitigations</code>.</p> +</article> + <details class="rule-collapsed"><summary><code>any-s1-finding-blocks</code> · <span class="muted">Any S1 (critical) finding blocks all progression.</span> · <span class="badge">priority: 200</span></summary> +<article class="rule rule--skipped"> + <header> + <span class="status-pill">did not match</span> + <h3><code>any-s1-finding-blocks</code></h3> + <p class="rule-desc">Any S1 (critical) finding blocks all progression.</p> + <p class="meta"> + <span class="badge">priority: 200</span> + <span class="badge">stage: any</span> + <span class="badge badge--tag">severity</span> <span class="badge badge--tag">gate</span> + <span class="badge badge--hash" title="content hash of the rule">e8ff14765412</span> + </p> + </header> + <ul class="conditions"><li class="cond cond--miss"><code>s1_findings_count</code> gt=0 → observed=<code>0</code></li></ul> + +</article></details><details class="rule-collapsed"><summary><code>req-ears-mandatory</code> · <span class="muted">All functional requirements must use EARS notation.</span> · <span class="badge">priority: 100</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not match</span> @@ -169,7 +271,7 @@ <h3><code>req-ears-mandatory</code></h3> </header> <ul class="conditions"><li class="cond cond--ok"><code>current_stage</code> in=["requirements","design","specification","tasks","implementation","testing","review"] → observed=<code>"design"</code></li><li class="cond cond--miss"><code>requirements_ears_coverage</code> lt=1 → observed=<code>1</code></li></ul> -</article> +</article></details><details class="rule-collapsed"><summary><code>blockers-block</code> · <span class="muted">Open blockers must be cleared.</span> · <span class="badge">priority: 95</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not match</span> @@ -184,7 +286,7 @@ <h3><code>blockers-block</code></h3> </header> 
<ul class="conditions"><li class="cond cond--miss"><code>blockers_count</code> gt=0 → observed=<code>0</code></li></ul> -</article> +</article></details><details class="rule-collapsed"><summary><code>open-clarifications-block</code> · <span class="muted">Open clarifications must be resolved before stage progression.</span> · <span class="badge">priority: 95</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not match</span> @@ -199,7 +301,7 @@ <h3><code>open-clarifications-block</code></h3> </header> <ul class="conditions"><li class="cond cond--miss"><code>open_clarifications_count</code> gt=0 → observed=<code>0</code></li></ul> -</article> +</article></details><details class="rule-collapsed"><summary><code>testing-ears-coverage-incomplete</code> · <span class="muted">Every EARS clause must have >= 1 test.</span> · <span class="badge">priority: 95</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not match</span> @@ -214,7 +316,7 @@ <h3><code>testing-ears-coverage-incomplete</code></h3> </header> <ul class="conditions"><li class="cond cond--miss"><code>current_stage</code> in=["testing","review"] → observed=<code>"design"</code></li><li class="cond cond--missing"><code>testing_ears_test_coverage</code> lt=1 → observed=<code>undefined</code> <span class="reason">(flag missing in extraction)</span></li></ul> -</article> +</article></details><details class="rule-collapsed"><summary><code>design-irreversible-needs-adr</code> · <span class="muted">Irreversible architectural decisions must have ADRs.</span> · <span class="badge">priority: 90</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not match</span> @@ -229,7 +331,7 @@ <h3><code>design-irreversible-needs-adr</code></h3> </header> <ul class="conditions"><li class="cond cond--ok"><code>current_stage</code> in=["design","specification","tasks","implementation"] → 
observed=<code>"design"</code></li><li class="cond cond--miss"><code>design_irreversible_have_adrs</code> eq=false → observed=<code>true</code></li></ul> -</article> +</article></details><details class="rule-collapsed"><summary><code>impl-lint-clean</code> · <span class="muted">Implementation must be lint clean.</span> · <span class="badge">priority: 90</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not match</span> @@ -244,7 +346,7 @@ <h3><code>impl-lint-clean</code></h3> </header> <ul class="conditions"><li class="cond cond--miss"><code>current_stage</code> in=["implementation","testing","review"] → observed=<code>"design"</code></li><li class="cond cond--missing"><code>implementation_lint_clean</code> eq=false → observed=<code>undefined</code> <span class="reason">(flag missing in extraction)</span></li></ul> -</article> +</article></details><details class="rule-collapsed"><summary><code>impl-types-clean</code> · <span class="muted">TypeScript / type checks must pass.</span> · <span class="badge">priority: 90</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not match</span> @@ -259,7 +361,7 @@ <h3><code>impl-types-clean</code></h3> </header> <ul class="conditions"><li class="cond cond--miss"><code>current_stage</code> in=["implementation","testing","review"] → observed=<code>"design"</code></li><li class="cond cond--missing"><code>implementation_types_clean</code> eq=false → observed=<code>undefined</code> <span class="reason">(flag missing in extraction)</span></li></ul> -</article> +</article></details><details class="rule-collapsed"><summary><code>impl-unit-tests-pass</code> · <span class="muted">Unit tests for the changed surface must pass.</span> · <span class="badge">priority: 90</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not match</span> @@ -274,7 +376,7 @@ <h3><code>impl-unit-tests-pass</code></h3> </header> <ul 
class="conditions"><li class="cond cond--miss"><code>current_stage</code> in=["implementation","testing","review"] → observed=<code>"design"</code></li><li class="cond cond--missing"><code>implementation_unit_tests_pass</code> eq=false → observed=<code>undefined</code> <span class="reason">(flag missing in extraction)</span></li></ul> -</article> +</article></details><details class="rule-collapsed"><summary><code>req-has-stable-ids</code> · <span class="muted">Each requirement must have a stable REQ-<AREA>-NNN id.</span> · <span class="badge">priority: 90</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not match</span> @@ -289,7 +391,7 @@ <h3><code>req-has-stable-ids</code></h3> </header> <ul class="conditions"><li class="cond cond--ok"><code>current_stage</code> in=["requirements","design","specification","tasks","implementation","testing","review"] → observed=<code>"design"</code></li><li class="cond cond--miss"><code>requirements_have_stable_ids</code> eq=false → observed=<code>true</code></li></ul> -</article> +</article></details><details class="rule-collapsed"><summary><code>review-traceability-incomplete</code> · <span class="muted">Traceability matrix must be complete and consistent.</span> · <span class="badge">priority: 90</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not match</span> @@ -304,7 +406,7 @@ <h3><code>review-traceability-incomplete</code></h3> </header> <ul class="conditions"><li class="cond cond--miss"><code>current_stage</code> eq="review" → observed=<code>"design"</code></li><li class="cond cond--missing"><code>review_traceability_complete</code> eq=false → observed=<code>undefined</code> <span class="reason">(flag missing in extraction)</span></li></ul> -</article> +</article></details><details class="rule-collapsed"><summary><code>spec-items-trace-to-requirements</code> · <span class="muted">Each spec item must trace to >= 1 requirement.</span> · 
<span class="badge">priority: 90</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not match</span> @@ -319,7 +421,7 @@ <h3><code>spec-items-trace-to-requirements</code></h3> </header> <ul class="conditions"><li class="cond cond--miss"><code>current_stage</code> in=["specification","tasks","implementation","testing","review"] → observed=<code>"design"</code></li><li class="cond cond--missing"><code>spec_each_item_traces_to_requirement</code> eq=false → observed=<code>undefined</code> <span class="reason">(flag missing in extraction)</span></li></ul> -</article> +</article></details><details class="rule-collapsed"><summary><code>review-brand-required-but-missing</code> · <span class="muted">Brand review required (touches sites/, UI surfaces) but not posted.</span> · <span class="badge">priority: 85</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not match</span> @@ -334,7 +436,7 @@ <h3><code>review-brand-required-but-missing</code></h3> </header> <ul class="conditions"><li class="cond cond--miss"><code>current_stage</code> eq="review" → observed=<code>"design"</code></li><li class="cond cond--missing"><code>brand_review_required</code> eq=true → observed=<code>undefined</code> <span class="reason">(flag missing in extraction)</span></li><li class="cond cond--missing"><code>brand_review_passed</code> eq=false → observed=<code>undefined</code> <span class="reason">(flag missing in extraction)</span></li></ul> -</article> +</article></details><details class="rule-collapsed"><summary><code>idea-problem-statement-present</code> · <span class="muted">Idea must have a one-paragraph problem statement.</span> · <span class="badge">priority: 80</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not match</span> @@ -349,7 +451,7 @@ <h3><code>idea-problem-statement-present</code></h3> </header> <ul class="conditions"><li class="cond 
cond--miss"><code>current_stage</code> eq="idea" → observed=<code>"design"</code></li><li class="cond cond--missing"><code>idea_problem_statement_present</code> eq=false → observed=<code>undefined</code> <span class="reason">(flag missing in extraction)</span></li></ul> -</article> +</article></details><details class="rule-collapsed"><summary><code>req-acceptance-testable</code> · <span class="muted">Acceptance criteria must be testable.</span> · <span class="badge">priority: 80</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not match</span> @@ -364,7 +466,7 @@ <h3><code>req-acceptance-testable</code></h3> </header> <ul class="conditions"><li class="cond cond--miss"><code>current_stage</code> eq="requirements" → observed=<code>"design"</code></li><li class="cond cond--miss"><code>requirements_acceptance_criteria_testable</code> eq=false → observed=<code>true</code></li></ul> -</article> +</article></details><details class="rule-collapsed"><summary><code>testing-critical-paths-uncovered</code> · <span class="muted">Critical paths (happy + key edge cases) must be covered.</span> · <span class="badge">priority: 80</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not match</span> @@ -379,22 +481,7 @@ <h3><code>testing-critical-paths-uncovered</code></h3> </header> <ul class="conditions"><li class="cond cond--miss"><code>current_stage</code> in=["testing","review"] → observed=<code>"design"</code></li><li class="cond cond--missing"><code>testing_critical_paths_covered</code> eq=false → observed=<code>undefined</code> <span class="reason">(flag missing in extraction)</span></li></ul> -</article> -<article class="rule rule--matched"> - <header> - <span class="status-pill">MATCHED</span> - <h3><code>design-risks-have-mitigations</code></h3> - <p class="rule-desc">Identified risks must have mitigations.</p> - <p class="meta"> - <span class="badge">priority: 70</span> - <span 
class="badge">stage: design</span> - <span class="badge badge--tag">dod</span> <span class="badge badge--tag">design</span> - <span class="badge badge--hash" title="content hash of the rule">8191d2b9c3a0</span> - </p> - </header> - <ul class="conditions"><li class="cond cond--ok"><code>current_stage</code> eq="design" → observed=<code>"design"</code></li><li class="cond cond--ok"><code>design_risks_have_mitigations</code> eq=false → observed=<code>false</code></li></ul> - <p class="contribution">Contributes <strong>Needs attention</strong> with weight <strong>40</strong>. Actions: <code>propose-risk-mitigations</code>.</p> -</article> +</article></details><details class="rule-collapsed"><summary><code>idea-scope-bounded</code> · <span class="muted">Idea scope must be bounded (no "boil the ocean" framing).</span> · <span class="badge">priority: 70</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not match</span> @@ -409,7 +496,7 @@ <h3><code>idea-scope-bounded</code></h3> </header> <ul class="conditions"><li class="cond cond--miss"><code>current_stage</code> eq="idea" → observed=<code>"design"</code></li><li class="cond cond--missing"><code>idea_scope_bounded</code> eq=false → observed=<code>undefined</code> <span class="reason">(flag missing in extraction)</span></li></ul> -</article> +</article></details><details class="rule-collapsed"><summary><code>idea-ready</code> · <span class="muted">Idea DoD satisfied — ready for /spec:research.</span> · <span class="badge">priority: 10</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not match</span> @@ -424,7 +511,7 @@ <h3><code>idea-ready</code></h3> </header> <ul class="conditions"><li class="cond cond--miss"><code>current_stage</code> eq="idea" → observed=<code>"design"</code></li><li class="cond cond--missing"><code>idea_problem_statement_present</code> eq=true → observed=<code>undefined</code> <span class="reason">(flag missing in 
extraction)</span></li><li class="cond cond--missing"><code>idea_target_users_named</code> eq=true → observed=<code>undefined</code> <span class="reason">(flag missing in extraction)</span></li><li class="cond cond--missing"><code>idea_scope_bounded</code> eq=true → observed=<code>undefined</code> <span class="reason">(flag missing in extraction)</span></li></ul> -</article> +</article></details><details class="rule-collapsed"><summary><code>impl-ready</code> · <span class="muted">Implementation DoD satisfied — ready for /spec:test.</span> · <span class="badge">priority: 10</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not match</span> @@ -439,22 +526,28 @@ <h3><code>impl-ready</code></h3> </header> <ul class="conditions"><li class="cond cond--miss"><code>current_stage</code> eq="implementation" → observed=<code>"design"</code></li><li class="cond cond--missing"><code>implementation_lint_clean</code> eq=true → observed=<code>undefined</code> <span class="reason">(flag missing in extraction)</span></li><li class="cond cond--missing"><code>implementation_types_clean</code> eq=true → observed=<code>undefined</code> <span class="reason">(flag missing in extraction)</span></li><li class="cond cond--missing"><code>implementation_unit_tests_pass</code> eq=true → observed=<code>undefined</code> <span class="reason">(flag missing in extraction)</span></li><li class="cond cond--ok"><code>open_clarifications_count</code> eq=0 → observed=<code>0</code></li><li class="cond cond--ok"><code>s1_findings_count</code> eq=0 → observed=<code>0</code></li></ul> -</article> +</article></details> </section> <section aria-label="provenance"> - <h2>Provenance</h2> + <h2>Provenance.</h2> + <p class="provenance-preamble">Provenance — these hashes let an auditor replay the verdict against a specific (engine, rules, source) tuple.</p> <p class="provenance"> Engine version: <code>0.2.0</code><br> - Ruleset hash: 
<code>84a35f019743b26e106d82332b2e07cc12979c9ce1fdf4b466550b3b37e25d92</code><br> - Flags hash: <code>24c6c9a7397070759956430d36961066bb2562629610ac44b7581d05b8fa6640</code><br> + Ruleset hash: <code>84a35f019743…</code><br> + Flags hash: <code>24c6c9a73970…</code><br> Rules file: <code>rules/quality-gates.yaml</code><br> Flags file: <code>fixtures/needs-attention-design-risks.json</code> </p> + <div class="reproduce"> + <p>How to reproduce — run from <code>experiments/rule-engine-poc/</code>:</p> + <pre><code>npx tsx src/cli.ts rules/quality-gates.yaml fixtures/needs-attention-design-risks.json --html <out.html> --quiet</code></pre> + <p>Then verify the three hashes above match the values in the regenerated report.</p> + </div> </section> <footer> - Generated 2026-05-17T12:57:00.162Z · experiments/rule-engine-poc · same inputs → identical report, byte-for-byte (except this timestamp). + experiments/rule-engine-poc · same inputs → identical report, byte-for-byte (except the generated-at timestamp). 
</footer> </body> </html> diff --git a/experiments/rule-engine-poc/research/sample-reports/ready-implementation.html b/experiments/rule-engine-poc/research/sample-reports/ready-implementation.html index a45fce775..d9f36f8a2 100644 --- a/experiments/rule-engine-poc/research/sample-reports/ready-implementation.html +++ b/experiments/rule-engine-poc/research/sample-reports/ready-implementation.html @@ -28,6 +28,8 @@ h1 { font-size: 24px; margin: 0 0 4px; } h2 { font-size: 18px; margin: 32px 0 12px; border-bottom: 1px solid var(--border); padding-bottom: 6px; } h3 { font-size: 15px; margin: 4px 0; } + .system-identity { margin: 6px 0 12px; font-size: 13px; color: var(--muted); } + .system-identity .ts { color: var(--fg); font-weight: 500; font-variant-numeric: tabular-nums; } .verdict-card { background: var(--accent-bg); border-left: 6px solid var(--accent); @@ -39,23 +41,42 @@ .verdict-card .label { text-transform: uppercase; letter-spacing: 0.05em; font-size: 12px; font-weight: 600; opacity: 0.7; } .verdict-card .value { font-size: 28px; font-weight: 700; margin: 2px 0 0; } .verdict-card .stats { margin: 8px 0 0; font-size: 13px; } + .banner { + padding: 10px 14px; border-radius: 4px; margin: 12px 0; font-size: 13px; + border-left: 4px solid var(--border); background: var(--code-bg); + } + .banner--missing { background: #fff4e0; border-left-color: #d18900; color: #6c4400; } + .banner--skip { background: #fdecea; border-left-color: #d8281b; color: #7a160d; } .summary-grid { display: grid; grid-template-columns: 1fr 1fr; gap: 16px; margin-top: 16px; } + @media (max-width: 540px) { + .summary-grid { grid-template-columns: 1fr; } + } table { border-collapse: collapse; width: 100%; font-size: 13px; } th, td { text-align: left; padding: 6px 8px; border-bottom: 1px solid var(--border); } th[scope="row"] { font-weight: 500; } td.num { text-align: right; font-variant-numeric: tabular-nums; } ul.actions { padding-left: 20px; margin: 0; } ul.actions li { margin: 4px 0; } + 
.action-slug { font-size: 0.82em; color: var(--muted); margin-left: 4px; } + .action-sentence { } .muted { color: var(--muted); } .rule { border: 1px solid var(--border); border-left: 4px solid var(--border); border-radius: 4px; padding: 14px 16px; margin: 10px 0; } .rule--matched { border-left-color: var(--accent); background: #fcfcfb; } .rule--skipped { opacity: 0.78; } .rule header { display: block; } + .rule-collapsed { margin: 6px 0; } + .rule-collapsed > summary { + cursor: pointer; padding: 8px 10px; border: 1px solid var(--border); border-radius: 4px; + background: #fafbfc; font-size: 13px; + } + .rule-collapsed > summary::marker { color: var(--muted); } + .rule-collapsed[open] > summary { background: var(--code-bg); } + .rule-collapsed .rule { margin-top: 6px; } .status-pill { display: inline-block; font-size: 10px; font-weight: 700; letter-spacing: 0.06em; text-transform: uppercase; padding: 2px 6px; border-radius: 3px; background: var(--code-bg); color: var(--muted); margin-bottom: 4px; @@ -78,53 +99,119 @@ } .cond--ok::before { content: "[+]"; color: #1f8a4c; } .cond--miss::before { content: "[-]"; color: #d8281b; } + .cond--miss { background: #fdecea; border-radius: 3px; padding-right: 6px; } .cond--missing::before { content: "[?]"; color: #d18900; } .cond--missing { color: #d18900; background: #fff4e0; border-radius: 3px; padding-right: 6px; } .reason { color: var(--muted); font-style: italic; } .contribution { font-size: 13px; margin: 10px 0 0; padding: 8px 10px; background: var(--accent-bg); border-radius: 3px; color: var(--accent-fg); } .provenance { font-size: 12px; color: var(--muted); } .provenance code { word-break: break-all; } + .provenance-preamble { font-size: 13px; color: var(--fg); margin: 6px 0 12px; } + .reproduce { background: var(--code-bg); border: 1px solid var(--border); border-radius: 4px; padding: 10px 12px; margin: 12px 0; } + .reproduce p { margin: 0 0 6px; font-size: 12px; color: var(--muted); } + .reproduce pre { margin: 0; 
white-space: pre-wrap; word-break: break-all; font-size: 12px; } + .legend { margin: 12px 0; font-size: 13px; } + .legend summary { cursor: pointer; font-weight: 500; } + .legend dl { margin: 8px 0 0; } + .legend dt { font-weight: 600; margin-top: 6px; } + .legend dd { margin: 0 0 4px 12px; color: var(--muted); } footer { margin-top: 36px; padding-top: 16px; border-top: 1px solid var(--border); font-size: 12px; color: var(--muted); } </style> </head> <body> <header> <h1>Rule engine report</h1> - <p class="muted">Deterministic verdict from extracted flags — "LLM extracts, rules decide".</p> + <p class="system-identity"> + Rule engine POC v0.2.0 — deterministic verdict from extracted flags. LLM produces the flags; this engine produces the verdict and audit trail.<br> + Generated <span class="ts">2026-05-17T13:12:51.892Z</span>. + </p> </header> + + <section class="verdict-card" aria-label="verdict"> - <div class="label">Verdict</div> + <div class="label">Verdict.</div> <div class="value">Ready to progress</div> - <p class="stats">1 of 21 rules matched · 1 suggested action</p> + <p class="stats">1 rule fired · 1 action to take</p> </section> + <div class="banner banner--missing" role="note"><strong>7</strong> rules could not be evaluated because the LLM did not supply <code>testing_ears_test_coverage</code>, <code>review_traceability_complete</code>, <code>brand_review_required</code>, <code>brand_review_passed</code>, <code>idea_problem_statement_present</code>, <code>testing_critical_paths_covered</code>, <code>idea_scope_bounded</code>, <code>idea_target_users_named</code>.</div> + + <details class="legend" aria-label="legend"> + <summary>Verdict tiers and audit-trail glyphs.</summary> + <dl> + <dt>Blocked</dt><dd>At least one high-severity rule fired. Do not progress until the listed actions are taken.</dd> + <dt>Needs attention</dt><dd>Medium-severity signals fired. 
Progress is possible but the reviewer should weigh the listed actions.</dd> + <dt>Ready to progress</dt><dd>All gating rules cleared. The stage may advance.</dd> + <dt>Unknown</dt><dd>No rule fired with non-zero weight. Usually means the extraction was insufficient to decide.</dd> + <dt><code>[+]</code></dt><dd>Condition matched against the extracted flags.</dd> + <dt><code>[-]</code></dt><dd>Condition did not match (value was supplied but disagreed).</dd> + <dt><code>[?]</code></dt><dd>Condition could not be evaluated because the flag was missing from the extraction.</dd> + </dl> + </details> + <div class="summary-grid"> <section aria-label="weighted tally"> - <h2>Weighted tally</h2> + <h2>Weighted tally.</h2> <table> <thead><tr><th>Verdict tier</th><th class="num">Weight</th></tr></thead> <tbody><tr><th scope="row">Blocked</th><td class="num">0</td></tr><tr><th scope="row">Needs attention</th><td class="num">0</td></tr><tr><th scope="row">Ready to progress</th><td class="num">100</td></tr><tr><th scope="row">Unknown</th><td class="num">0</td></tr></tbody> </table> </section> - <section aria-label="suggested actions"> - <h2>Suggested actions</h2> + <section aria-label="actions to take"> + <h2>Take these actions.</h2> <ul class="actions"><li><code>advance-to-testing</code></li></ul> </section> </div> <section aria-label="extraction flags"> - <h2>Extraction flags <span class="muted" style="font-size:13px; font-weight:400;">(input from the Orient quadrant)</span></h2> + <h2>Extraction flags. 
<span class="muted" style="font-size:13px; font-weight:400;">(input from the Orient quadrant)</span></h2> <table> <thead><tr><th>Flag</th><th>Value</th></tr></thead> <tbody><tr><th scope="row"><code>blockers_count</code></th><td><code>0</code></td></tr><tr><th scope="row"><code>current_stage</code></th><td><code>"implementation"</code></td></tr><tr><th scope="row"><code>design_irreversible_have_adrs</code></th><td><code>true</code></td></tr><tr><th scope="row"><code>design_risks_have_mitigations</code></th><td><code>true</code></td></tr><tr><th scope="row"><code>feature_slug</code></th><td><code>"rule-engine-poc"</code></td></tr><tr><th scope="row"><code>implementation_lint_clean</code></th><td><code>true</code></td></tr><tr><th scope="row"><code>implementation_types_clean</code></th><td><code>true</code></td></tr><tr><th scope="row"><code>implementation_unit_tests_pass</code></th><td><code>true</code></td></tr><tr><th scope="row"><code>open_clarifications_count</code></th><td><code>0</code></td></tr><tr><th scope="row"><code>requirements_acceptance_criteria_testable</code></th><td><code>true</code></td></tr><tr><th scope="row"><code>requirements_ears_coverage</code></th><td><code>1</code></td></tr><tr><th scope="row"><code>requirements_have_stable_ids</code></th><td><code>true</code></td></tr><tr><th scope="row"><code>s1_findings_count</code></th><td><code>0</code></td></tr><tr><th scope="row"><code>s2_findings_count</code></th><td><code>0</code></td></tr><tr><th scope="row"><code>s3_findings_count</code></th><td><code>0</code></td></tr><tr><th scope="row"><code>spec_each_item_traces_to_requirement</code></th><td><code>true</code></td></tr></tbody> </table> </section> + <section aria-label="what fired"> + <h2>What fired.</h2> + +<article class="rule rule--matched"> + <header> + <span class="status-pill">MATCHED</span> + <h3><code>impl-ready</code></h3> + <p class="rule-desc">Implementation DoD satisfied — ready for /spec:test.</p> + <p class="meta"> + <span 
class="badge">priority: 10</span> + <span class="badge">stage: implementation</span> + <span class="badge badge--tag">dod</span> <span class="badge badge--tag">implementation</span> + <span class="badge badge--hash" title="content hash of the rule">c311b2edd5ff</span> + </p> + </header> + <ul class="conditions"><li class="cond cond--ok"><code>current_stage</code> eq="implementation" → observed=<code>"implementation"</code></li><li class="cond cond--ok"><code>implementation_lint_clean</code> eq=true → observed=<code>true</code></li><li class="cond cond--ok"><code>implementation_types_clean</code> eq=true → observed=<code>true</code></li><li class="cond cond--ok"><code>implementation_unit_tests_pass</code> eq=true → observed=<code>true</code></li><li class="cond cond--ok"><code>open_clarifications_count</code> eq=0 → observed=<code>0</code></li><li class="cond cond--ok"><code>s1_findings_count</code> eq=0 → observed=<code>0</code></li></ul> + <p class="contribution">Contributes <strong>Ready to progress</strong> with weight <strong>100</strong>. Actions: <code>advance-to-testing</code>.</p> +</article> + </section> + <section aria-label="audit trail"> - <h2>Audit trail <span class="muted" style="font-size:13px; font-weight:400;">(deterministic order: priority desc, id asc)</span></h2> + <h2>Audit trail. 
<span class="muted" style="font-size:13px; font-weight:400;">(deterministic order: priority desc, id asc)</span></h2> +<article class="rule rule--matched"> + <header> + <span class="status-pill">MATCHED</span> + <h3><code>impl-ready</code></h3> + <p class="rule-desc">Implementation DoD satisfied — ready for /spec:test.</p> + <p class="meta"> + <span class="badge">priority: 10</span> + <span class="badge">stage: implementation</span> + <span class="badge badge--tag">dod</span> <span class="badge badge--tag">implementation</span> + <span class="badge badge--hash" title="content hash of the rule">c311b2edd5ff</span> + </p> + </header> + <ul class="conditions"><li class="cond cond--ok"><code>current_stage</code> eq="implementation" → observed=<code>"implementation"</code></li><li class="cond cond--ok"><code>implementation_lint_clean</code> eq=true → observed=<code>true</code></li><li class="cond cond--ok"><code>implementation_types_clean</code> eq=true → observed=<code>true</code></li><li class="cond cond--ok"><code>implementation_unit_tests_pass</code> eq=true → observed=<code>true</code></li><li class="cond cond--ok"><code>open_clarifications_count</code> eq=0 → observed=<code>0</code></li><li class="cond cond--ok"><code>s1_findings_count</code> eq=0 → observed=<code>0</code></li></ul> + <p class="contribution">Contributes <strong>Ready to progress</strong> with weight <strong>100</strong>. 
Actions: <code>advance-to-testing</code>.</p> +</article> + <details class="rule-collapsed"><summary><code>any-s1-finding-blocks</code> · <span class="muted">Any S1 (critical) finding blocks all progression.</span> · <span class="badge">priority: 200</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not match</span> @@ -139,7 +226,7 @@ <h3><code>any-s1-finding-blocks</code></h3> </header> <ul class="conditions"><li class="cond cond--miss"><code>s1_findings_count</code> gt=0 → observed=<code>0</code></li></ul> -</article> +</article></details><details class="rule-collapsed"><summary><code>any-s2-finding-needs-attention</code> · <span class="muted">S2 (high) finding requires attention this sprint.</span> · <span class="badge">priority: 100</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not match</span> @@ -154,7 +241,7 @@ <h3><code>any-s2-finding-needs-attention</code></h3> </header> <ul class="conditions"><li class="cond cond--miss"><code>s2_findings_count</code> gt=0 → observed=<code>0</code></li></ul> -</article> +</article></details><details class="rule-collapsed"><summary><code>req-ears-mandatory</code> · <span class="muted">All functional requirements must use EARS notation.</span> · <span class="badge">priority: 100</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not match</span> @@ -169,7 +256,7 @@ <h3><code>req-ears-mandatory</code></h3> </header> <ul class="conditions"><li class="cond cond--ok"><code>current_stage</code> in=["requirements","design","specification","tasks","implementation","testing","review"] → observed=<code>"implementation"</code></li><li class="cond cond--miss"><code>requirements_ears_coverage</code> lt=1 → observed=<code>1</code></li></ul> -</article> +</article></details><details class="rule-collapsed"><summary><code>blockers-block</code> · <span class="muted">Open blockers must be cleared.</span> · <span 
class="badge">priority: 95</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not match</span> @@ -184,7 +271,7 @@ <h3><code>blockers-block</code></h3> </header> <ul class="conditions"><li class="cond cond--miss"><code>blockers_count</code> gt=0 → observed=<code>0</code></li></ul> -</article> +</article></details><details class="rule-collapsed"><summary><code>open-clarifications-block</code> · <span class="muted">Open clarifications must be resolved before stage progression.</span> · <span class="badge">priority: 95</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not match</span> @@ -199,7 +286,7 @@ <h3><code>open-clarifications-block</code></h3> </header> <ul class="conditions"><li class="cond cond--miss"><code>open_clarifications_count</code> gt=0 → observed=<code>0</code></li></ul> -</article> +</article></details><details class="rule-collapsed"><summary><code>testing-ears-coverage-incomplete</code> · <span class="muted">Every EARS clause must have >= 1 test.</span> · <span class="badge">priority: 95</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not match</span> @@ -214,7 +301,7 @@ <h3><code>testing-ears-coverage-incomplete</code></h3> </header> <ul class="conditions"><li class="cond cond--miss"><code>current_stage</code> in=["testing","review"] → observed=<code>"implementation"</code></li><li class="cond cond--missing"><code>testing_ears_test_coverage</code> lt=1 → observed=<code>undefined</code> <span class="reason">(flag missing in extraction)</span></li></ul> -</article> +</article></details><details class="rule-collapsed"><summary><code>design-irreversible-needs-adr</code> · <span class="muted">Irreversible architectural decisions must have ADRs.</span> · <span class="badge">priority: 90</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not match</span> @@ -229,7 +316,7 @@ 
<h3><code>design-irreversible-needs-adr</code></h3> </header> <ul class="conditions"><li class="cond cond--ok"><code>current_stage</code> in=["design","specification","tasks","implementation"] → observed=<code>"implementation"</code></li><li class="cond cond--miss"><code>design_irreversible_have_adrs</code> eq=false → observed=<code>true</code></li></ul> -</article> +</article></details><details class="rule-collapsed"><summary><code>impl-lint-clean</code> · <span class="muted">Implementation must be lint clean.</span> · <span class="badge">priority: 90</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not match</span> @@ -244,7 +331,7 @@ <h3><code>impl-lint-clean</code></h3> </header> <ul class="conditions"><li class="cond cond--ok"><code>current_stage</code> in=["implementation","testing","review"] → observed=<code>"implementation"</code></li><li class="cond cond--miss"><code>implementation_lint_clean</code> eq=false → observed=<code>true</code></li></ul> -</article> +</article></details><details class="rule-collapsed"><summary><code>impl-types-clean</code> · <span class="muted">TypeScript / type checks must pass.</span> · <span class="badge">priority: 90</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not match</span> @@ -259,7 +346,7 @@ <h3><code>impl-types-clean</code></h3> </header> <ul class="conditions"><li class="cond cond--ok"><code>current_stage</code> in=["implementation","testing","review"] → observed=<code>"implementation"</code></li><li class="cond cond--miss"><code>implementation_types_clean</code> eq=false → observed=<code>true</code></li></ul> -</article> +</article></details><details class="rule-collapsed"><summary><code>impl-unit-tests-pass</code> · <span class="muted">Unit tests for the changed surface must pass.</span> · <span class="badge">priority: 90</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not 
match</span> @@ -274,7 +361,7 @@ <h3><code>impl-unit-tests-pass</code></h3> </header> <ul class="conditions"><li class="cond cond--ok"><code>current_stage</code> in=["implementation","testing","review"] → observed=<code>"implementation"</code></li><li class="cond cond--miss"><code>implementation_unit_tests_pass</code> eq=false → observed=<code>true</code></li></ul> -</article> +</article></details><details class="rule-collapsed"><summary><code>req-has-stable-ids</code> · <span class="muted">Each requirement must have a stable REQ-<AREA>-NNN id.</span> · <span class="badge">priority: 90</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not match</span> @@ -289,7 +376,7 @@ <h3><code>req-has-stable-ids</code></h3> </header> <ul class="conditions"><li class="cond cond--ok"><code>current_stage</code> in=["requirements","design","specification","tasks","implementation","testing","review"] → observed=<code>"implementation"</code></li><li class="cond cond--miss"><code>requirements_have_stable_ids</code> eq=false → observed=<code>true</code></li></ul> -</article> +</article></details><details class="rule-collapsed"><summary><code>review-traceability-incomplete</code> · <span class="muted">Traceability matrix must be complete and consistent.</span> · <span class="badge">priority: 90</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not match</span> @@ -304,7 +391,7 @@ <h3><code>review-traceability-incomplete</code></h3> </header> <ul class="conditions"><li class="cond cond--miss"><code>current_stage</code> eq="review" → observed=<code>"implementation"</code></li><li class="cond cond--missing"><code>review_traceability_complete</code> eq=false → observed=<code>undefined</code> <span class="reason">(flag missing in extraction)</span></li></ul> -</article> +</article></details><details class="rule-collapsed"><summary><code>spec-items-trace-to-requirements</code> · <span class="muted">Each spec 
item must trace to >= 1 requirement.</span> · <span class="badge">priority: 90</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not match</span> @@ -319,7 +406,7 @@ <h3><code>spec-items-trace-to-requirements</code></h3> </header> <ul class="conditions"><li class="cond cond--ok"><code>current_stage</code> in=["specification","tasks","implementation","testing","review"] → observed=<code>"implementation"</code></li><li class="cond cond--miss"><code>spec_each_item_traces_to_requirement</code> eq=false → observed=<code>true</code></li></ul> -</article> +</article></details><details class="rule-collapsed"><summary><code>review-brand-required-but-missing</code> · <span class="muted">Brand review required (touches sites/, UI surfaces) but not posted.</span> · <span class="badge">priority: 85</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not match</span> @@ -334,7 +421,7 @@ <h3><code>review-brand-required-but-missing</code></h3> </header> <ul class="conditions"><li class="cond cond--miss"><code>current_stage</code> eq="review" → observed=<code>"implementation"</code></li><li class="cond cond--missing"><code>brand_review_required</code> eq=true → observed=<code>undefined</code> <span class="reason">(flag missing in extraction)</span></li><li class="cond cond--missing"><code>brand_review_passed</code> eq=false → observed=<code>undefined</code> <span class="reason">(flag missing in extraction)</span></li></ul> -</article> +</article></details><details class="rule-collapsed"><summary><code>idea-problem-statement-present</code> · <span class="muted">Idea must have a one-paragraph problem statement.</span> · <span class="badge">priority: 80</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not match</span> @@ -349,7 +436,7 @@ <h3><code>idea-problem-statement-present</code></h3> </header> <ul class="conditions"><li class="cond 
cond--miss"><code>current_stage</code> eq="idea" → observed=<code>"implementation"</code></li><li class="cond cond--missing"><code>idea_problem_statement_present</code> eq=false → observed=<code>undefined</code> <span class="reason">(flag missing in extraction)</span></li></ul> -</article> +</article></details><details class="rule-collapsed"><summary><code>req-acceptance-testable</code> · <span class="muted">Acceptance criteria must be testable.</span> · <span class="badge">priority: 80</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not match</span> @@ -364,7 +451,7 @@ <h3><code>req-acceptance-testable</code></h3> </header> <ul class="conditions"><li class="cond cond--miss"><code>current_stage</code> eq="requirements" → observed=<code>"implementation"</code></li><li class="cond cond--miss"><code>requirements_acceptance_criteria_testable</code> eq=false → observed=<code>true</code></li></ul> -</article> +</article></details><details class="rule-collapsed"><summary><code>testing-critical-paths-uncovered</code> · <span class="muted">Critical paths (happy + key edge cases) must be covered.</span> · <span class="badge">priority: 80</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not match</span> @@ -379,7 +466,7 @@ <h3><code>testing-critical-paths-uncovered</code></h3> </header> <ul class="conditions"><li class="cond cond--miss"><code>current_stage</code> in=["testing","review"] → observed=<code>"implementation"</code></li><li class="cond cond--missing"><code>testing_critical_paths_covered</code> eq=false → observed=<code>undefined</code> <span class="reason">(flag missing in extraction)</span></li></ul> -</article> +</article></details><details class="rule-collapsed"><summary><code>design-risks-have-mitigations</code> · <span class="muted">Identified risks must have mitigations.</span> · <span class="badge">priority: 70</span></summary> <article class="rule rule--skipped"> 
<header> <span class="status-pill">did not match</span> @@ -394,7 +481,7 @@ <h3><code>design-risks-have-mitigations</code></h3> </header> <ul class="conditions"><li class="cond cond--miss"><code>current_stage</code> eq="design" → observed=<code>"implementation"</code></li><li class="cond cond--miss"><code>design_risks_have_mitigations</code> eq=false → observed=<code>true</code></li></ul> -</article> +</article></details><details class="rule-collapsed"><summary><code>idea-scope-bounded</code> · <span class="muted">Idea scope must be bounded (no "boil the ocean" framing).</span> · <span class="badge">priority: 70</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not match</span> @@ -409,7 +496,7 @@ <h3><code>idea-scope-bounded</code></h3> </header> <ul class="conditions"><li class="cond cond--miss"><code>current_stage</code> eq="idea" → observed=<code>"implementation"</code></li><li class="cond cond--missing"><code>idea_scope_bounded</code> eq=false → observed=<code>undefined</code> <span class="reason">(flag missing in extraction)</span></li></ul> -</article> +</article></details><details class="rule-collapsed"><summary><code>idea-ready</code> · <span class="muted">Idea DoD satisfied — ready for /spec:research.</span> · <span class="badge">priority: 10</span></summary> <article class="rule rule--skipped"> <header> <span class="status-pill">did not match</span> @@ -424,37 +511,28 @@ <h3><code>idea-ready</code></h3> </header> <ul class="conditions"><li class="cond cond--miss"><code>current_stage</code> eq="idea" → observed=<code>"implementation"</code></li><li class="cond cond--missing"><code>idea_problem_statement_present</code> eq=true → observed=<code>undefined</code> <span class="reason">(flag missing in extraction)</span></li><li class="cond cond--missing"><code>idea_target_users_named</code> eq=true → observed=<code>undefined</code> <span class="reason">(flag missing in extraction)</span></li><li class="cond 
cond--missing"><code>idea_scope_bounded</code> eq=true → observed=<code>undefined</code> <span class="reason">(flag missing in extraction)</span></li></ul> -</article> -<article class="rule rule--matched"> - <header> - <span class="status-pill">MATCHED</span> - <h3><code>impl-ready</code></h3> - <p class="rule-desc">Implementation DoD satisfied — ready for /spec:test.</p> - <p class="meta"> - <span class="badge">priority: 10</span> - <span class="badge">stage: implementation</span> - <span class="badge badge--tag">dod</span> <span class="badge badge--tag">implementation</span> - <span class="badge badge--hash" title="content hash of the rule">c311b2edd5ff</span> - </p> - </header> - <ul class="conditions"><li class="cond cond--ok"><code>current_stage</code> eq="implementation" → observed=<code>"implementation"</code></li><li class="cond cond--ok"><code>implementation_lint_clean</code> eq=true → observed=<code>true</code></li><li class="cond cond--ok"><code>implementation_types_clean</code> eq=true → observed=<code>true</code></li><li class="cond cond--ok"><code>implementation_unit_tests_pass</code> eq=true → observed=<code>true</code></li><li class="cond cond--ok"><code>open_clarifications_count</code> eq=0 → observed=<code>0</code></li><li class="cond cond--ok"><code>s1_findings_count</code> eq=0 → observed=<code>0</code></li></ul> - <p class="contribution">Contributes <strong>Ready to progress</strong> with weight <strong>100</strong>. 
Actions: <code>advance-to-testing</code>.</p> -</article> +</article></details> </section> <section aria-label="provenance"> - <h2>Provenance</h2> + <h2>Provenance.</h2> + <p class="provenance-preamble">Provenance — these hashes let an auditor replay the verdict against a specific (engine, rules, source) tuple.</p> <p class="provenance"> Engine version: <code>0.2.0</code><br> - Ruleset hash: <code>84a35f019743b26e106d82332b2e07cc12979c9ce1fdf4b466550b3b37e25d92</code><br> - Flags hash: <code>9fa08674155359d5ccf2db199355bbccd0bc5f63674ec29392a9a69ab78d82f9</code><br> + Ruleset hash: <code>84a35f019743…</code><br> + Flags hash: <code>9fa086741553…</code><br> Rules file: <code>rules/quality-gates.yaml</code><br> Flags file: <code>fixtures/ready-implementation.json</code> </p> + <div class="reproduce"> + <p>How to reproduce — run from <code>experiments/rule-engine-poc/</code>:</p> + <pre><code>npx tsx src/cli.ts rules/quality-gates.yaml fixtures/ready-implementation.json --html <out.html> --quiet</code></pre> + <p>Then verify the three hashes above match the values in the regenerated report.</p> + </div> </section> <footer> - Generated 2026-05-17T12:56:58.912Z · experiments/rule-engine-poc · same inputs → identical report, byte-for-byte (except this timestamp). + experiments/rule-engine-poc · same inputs → identical report, byte-for-byte (except the generated-at timestamp). </footer> </body> </html> diff --git a/experiments/rule-engine-poc/test/html-report.test.ts b/experiments/rule-engine-poc/test/html-report.test.ts new file mode 100644 index 000000000..62116e59b --- /dev/null +++ b/experiments/rule-engine-poc/test/html-report.test.ts @@ -0,0 +1,392 @@ +// Unit tests for the HTML report renderer. The renderer is a pure +// function: VerdictResult + RenderContext -> string. We assert on +// substrings the reviewers explicitly called for (research/17, 18, 20, +// 21) so a future refactor cannot silently drop them. 
+ +import { describe, expect, it } from "vitest"; +import { evaluate } from "../src/engine.js"; +import { renderHtmlReport, type RenderContext } from "../src/html-report.js"; +import { loadRulesFromString } from "../src/loader.js"; +import type { ActionGlossary } from "../src/action-glossary.js"; +import type { ExtractionFlags } from "../src/types.js"; + +const RULES_YAML = ` +- id: a-blocks-when-flag-true + description: Blocks when ci_failing is true. + priority: 100 + when: + all: + - flag: ci_failing + eq: true + then: + verdict: blocked + weight: 100 + actions: [kick-ci, page-oncall] + tags: [ci] + +- id: b-needs-attention-when-design-risk + description: Needs attention when design_risk is open. + priority: 50 + when: + all: + - flag: design_risk_open + eq: true + then: + verdict: needs-attention + weight: 50 + actions: [schedule-design-review] + tags: [design] + +- id: c-ready-when-shipping + description: Ready when ship_ready is true. + priority: 10 + when: + all: + - flag: ship_ready + eq: true + then: + verdict: ready-to-progress + weight: 10 + actions: [advance-stage] + tags: [release] + +- id: d-needs-extra-flag + description: Rule that depends on a never-supplied flag. + priority: 20 + when: + all: + - flag: never_supplied + eq: true + then: + verdict: needs-attention + weight: 20 + actions: [supply-flag] + tags: [coverage] +`; + +const rules = loadRulesFromString(RULES_YAML, "html-report.test.ts"); + +function baseCtx(overrides: Partial<RenderContext> = {}): RenderContext { + return { + rulesPath: "rules/test.yaml", + flagsPath: "fixtures/test.json", + flags: {}, + generatedAt: "2026-05-17T10:00:00.000Z", + ...overrides, + }; +} + +describe("renderHtmlReport: 'What fired' section", () => { + it("emits a 'What fired.' 
section listing only matched rules", () => { + const flags: ExtractionFlags = { ci_failing: true }; + const result = evaluate(rules, flags); + const html = renderHtmlReport(result, baseCtx({ flags })); + + expect(html).toContain("What fired."); + // Matched rule shows up before the audit-trail section. + const whatFired = html.indexOf("What fired."); + const auditTrail = html.indexOf("Audit trail."); + expect(whatFired).toBeGreaterThan(-1); + expect(auditTrail).toBeGreaterThan(whatFired); + // The fall-back "no rule fired" sentence is not emitted when at + // least one rule matched. + const whatFiredBody = html.slice(whatFired, auditTrail); + expect(whatFiredBody).toContain("a-blocks-when-flag-true"); + expect(whatFiredBody).not.toContain("No rule fired."); + }); + + it("emits a placeholder when no rule fired", () => { + const flags: ExtractionFlags = {}; + const result = evaluate(rules, flags); + const html = renderHtmlReport(result, baseCtx({ flags })); + expect(html).toContain("No rule fired"); + }); + + it("replaces 'x of N rules matched' with 'N rules fired'", () => { + const flags: ExtractionFlags = { ci_failing: true }; + const result = evaluate(rules, flags); + const html = renderHtmlReport(result, baseCtx({ flags })); + expect(html).toContain("1 rule fired"); + expect(html).not.toMatch(/\d+ of \d+ rules matched/); + }); +}); + +describe("renderHtmlReport: audit trail collapse", () => { + it("wraps non-matched rules in <details> and leaves matched rules inline", () => { + const flags: ExtractionFlags = { ci_failing: true }; + const result = evaluate(rules, flags); + const html = renderHtmlReport(result, baseCtx({ flags })); + // Unmatched rules are wrapped in collapsed <details>. + expect(html).toContain('<details class="rule-collapsed">'); + // Matched rule's article shows up outside any <details>. 
+ const auditStart = html.indexOf("Audit trail."); + const provenanceStart = html.indexOf("Provenance."); + const trail = html.slice(auditStart, provenanceStart); + // The first article inside the trail is the matched rule, inline. + const firstArticle = trail.indexOf("<article"); + const firstDetails = trail.indexOf("<details"); + expect(firstArticle).toBeGreaterThan(-1); + expect(firstArticle).toBeLessThan(firstDetails); + }); +}); + +describe("renderHtmlReport: blocker-by-absence banner", () => { + it("renders the banner with the count and missing flag names", () => { + // ci_failing is supplied so rule A matches; rules B, C, D reference + // flags absent from the extraction so the banner should report 3. + const flags: ExtractionFlags = { ci_failing: true }; + const result = evaluate(rules, flags); + const html = renderHtmlReport(result, baseCtx({ flags })); + expect(html).toContain(`<div class="banner banner--missing"`); + expect(html).toContain("<strong>3</strong>"); + expect(html).toContain("could not be evaluated"); + expect(html).toContain("<code>never_supplied</code>"); + expect(html).toContain("<code>design_risk_open</code>"); + expect(html).toContain("<code>ship_ready</code>"); + }); + + it("singularises 'rule' when only one rule is missing a flag", () => { + // Supply everything except `never_supplied` so rule D is the only + // one missing a flag. + const flags: ExtractionFlags = { + ci_failing: true, + design_risk_open: false, + ship_ready: false, + }; + const result = evaluate(rules, flags); + const html = renderHtmlReport(result, baseCtx({ flags })); + expect(html).toContain("<strong>1</strong> rule could not be evaluated"); + }); + + it("omits the banner DOM element when no rule references a missing flag", () => { + // Supply every flag the rule set asks for so no rule reports + // "flag missing in extraction". 
+ const flags: ExtractionFlags = { + ci_failing: false, + design_risk_open: false, + ship_ready: true, + never_supplied: false, + }; + const result = evaluate(rules, flags); + const html = renderHtmlReport(result, baseCtx({ flags })); + // The CSS selector .banner--missing always appears in the + // stylesheet; assert on the absence of the rendered DOM element. + expect(html).not.toContain('<div class="banner banner--missing"'); + }); +}); + +describe("renderHtmlReport: action ordering and glossary", () => { + it("renders actions in priority-of-cause order, not alphabetical", () => { + // Rule A (priority 100) contributes [kick-ci, page-oncall]; rule C + // (priority 10) contributes [advance-stage]. Alphabetical order + // would be [advance-stage, kick-ci, page-oncall]; priority-of-cause + // order is [kick-ci, page-oncall, advance-stage]. + const flags: ExtractionFlags = { ci_failing: true, ship_ready: true }; + const result = evaluate(rules, flags); + const html = renderHtmlReport(result, baseCtx({ flags })); + // Pluck the "Take these actions." block. + const start = html.indexOf("Take these actions."); + const end = html.indexOf("</ul>", start); + const block = html.slice(start, end); + const kick = block.indexOf("kick-ci"); + const page = block.indexOf("page-oncall"); + const advance = block.indexOf("advance-stage"); + expect(kick).toBeGreaterThan(-1); + expect(page).toBeGreaterThan(kick); + expect(advance).toBeGreaterThan(page); + // result.actions remains the alphabetised list for machine consumers. + expect(result.actions).toEqual( + ["advance-stage", "kick-ci", "page-oncall"].sort(), + ); + }); + + it("renders human sentences when an action glossary is supplied", () => { + const flags: ExtractionFlags = { ci_failing: true }; + const result = evaluate(rules, flags); + const glossary: ActionGlossary = { + "kick-ci": { human: "Re-run the failing CI job." }, + "page-oncall": { human: "Page the on-call for triage." 
}, + }; + const html = renderHtmlReport( + result, + baseCtx({ flags, actionGlossary: glossary }), + ); + expect(html).toContain("Re-run the failing CI job."); + expect(html).toContain('<code class="action-slug">kick-ci</code>'); + }); + + it("falls back to bare slugs when no glossary is supplied", () => { + const flags: ExtractionFlags = { ci_failing: true }; + const result = evaluate(rules, flags); + const html = renderHtmlReport(result, baseCtx({ flags })); + // Match the action LI shape: bare slug only, no sentence markup. + expect(html).toMatch(/<li><code>kick-ci<\/code><\/li>/); + expect(html).toMatch(/<li><code>page-oncall<\/code><\/li>/); + // The demoted-slug class only appears inside an LI when a sentence + // is rendered — must NOT appear when no glossary is supplied. + expect(html).not.toMatch(/<li>[^<]*<span class="action-sentence"/); + }); + + it("falls back to bare slug when the glossary lacks an entry", () => { + const flags: ExtractionFlags = { ci_failing: true }; + const result = evaluate(rules, flags); + const glossary: ActionGlossary = { + // page-oncall present, kick-ci absent. + "page-oncall": { human: "Page the on-call." }, + }; + const html = renderHtmlReport( + result, + baseCtx({ flags, actionGlossary: glossary }), + ); + expect(html).toContain("Page the on-call."); + // kick-ci has no human sentence -> bare slug. 
+ expect(html).toMatch(/<li><code>kick-ci<\/code><\/li>/); + }); +}); + +describe("renderHtmlReport: provenance reframing", () => { + it("emits the auditor preamble and the reproduce block", () => { + const flags: ExtractionFlags = { ci_failing: true }; + const result = evaluate(rules, flags); + const html = renderHtmlReport(result, baseCtx({ flags })); + expect(html).toContain( + "these hashes let an auditor replay the verdict against a specific (engine, rules, source) tuple", + ); + expect(html).toContain("How to reproduce"); + expect(html).toContain("npx tsx src/cli.ts"); + expect(html).toContain("rules/test.yaml"); + expect(html).toContain("fixtures/test.json"); + }); + + it("truncates ruleset and flags hashes to 12-char prefixes", () => { + const flags: ExtractionFlags = { ci_failing: true }; + const result = evaluate(rules, flags); + const html = renderHtmlReport(result, baseCtx({ flags })); + // Full hash should NOT appear inline. + expect(html).not.toContain(result.rulesetHash); + expect(html).not.toContain(result.flagsHash); + expect(html).toContain(result.rulesetHash.slice(0, 12)); + expect(html).toContain(result.flagsHash.slice(0, 12)); + }); +}); + +describe("renderHtmlReport: system-identity header and timestamp", () => { + it("renders the engine-version identity paragraph above the verdict card", () => { + const flags: ExtractionFlags = { ci_failing: true }; + const result = evaluate(rules, flags); + const html = renderHtmlReport( + result, + baseCtx({ flags, generatedAt: "2026-05-17T10:00:00.000Z" }), + ); + expect(html).toContain(`Rule engine POC v${result.engineVersion}`); + expect(html).toContain("deterministic verdict from extracted flags"); + // Generated-at must appear above the verdict card. 
+ const ts = html.indexOf("2026-05-17T10:00:00.000Z"); + const verdictCard = html.indexOf('class="verdict-card"'); + expect(ts).toBeGreaterThan(-1); + expect(ts).toBeLessThan(verdictCard); + }); +}); + +describe("renderHtmlReport: verdict-tier + glyph legend", () => { + it("emits a collapsed legend explaining tiers and glyphs", () => { + const flags: ExtractionFlags = { ci_failing: true }; + const result = evaluate(rules, flags); + const html = renderHtmlReport(result, baseCtx({ flags })); + expect(html).toMatch(/<details class="legend"[^>]*>/); + expect(html).toContain("Verdict tiers and audit-trail glyphs"); + expect(html).toContain("Ready to progress"); + expect(html).toMatch(/<code>\[\+\]<\/code>/); + expect(html).toMatch(/<code>\[-\]<\/code>/); + expect(html).toMatch(/<code>\[\?\]<\/code>/); + }); +}); + +describe("renderHtmlReport: cond--miss row wash", () => { + it("declares a faint-red background for .cond--miss", () => { + const flags: ExtractionFlags = { ci_failing: false }; + const result = evaluate(rules, flags); + const html = renderHtmlReport(result, baseCtx({ flags })); + // The CSS rule is emitted regardless; assert its presence. 
+ expect(html).toMatch(/\.cond--miss\s*\{[^}]*background:\s*#fdecea/); + }); +}); + +describe("renderHtmlReport: mobile fallback", () => { + it("declares a single-column summary grid below 540px", () => { + const flags: ExtractionFlags = { ci_failing: true }; + const result = evaluate(rules, flags); + const html = renderHtmlReport(result, baseCtx({ flags })); + expect(html).toMatch( + /@media\s*\(max-width:\s*540px\)\s*\{[^}]*\.summary-grid\s*\{\s*grid-template-columns:\s*1fr;/, + ); + }); +}); + +describe("renderHtmlReport: trust calibration", () => { + it("renders the --skip-validate banner when validationStatus is 'skipped'", () => { + const flags: ExtractionFlags = { ci_failing: true }; + const result = evaluate(rules, flags); + const html = renderHtmlReport( + result, + baseCtx({ flags, validationStatus: "skipped" }), + ); + expect(html).toContain("banner--skip"); + expect(html).toContain("validation gate was skipped"); + expect(html).toContain("--skip-validate"); + }); + + it("omits the skip banner DOM element for validated runs", () => { + const flags: ExtractionFlags = { ci_failing: true }; + const result = evaluate(rules, flags); + const html = renderHtmlReport( + result, + baseCtx({ flags, validationStatus: "validated" }), + ); + // The CSS selector .banner--skip is always emitted; assert on the + // absence of the rendered DOM element. 
+ expect(html).not.toContain('<div class="banner banner--skip"'); + }); + + it("emits a verified badge with the clarifying tooltip when prompt hash matches", () => { + const flags: ExtractionFlags = { + ci_failing: true, + __prompt_hash: "abc123def456789", + }; + const result = evaluate(rules, flags); + const html = renderHtmlReport( + result, + baseCtx({ flags, promptHash: "abc123def456789" }), + ); + expect(html).toContain("badge--verified"); + expect(html).toContain( + "this confirms the extraction was produced from the current source, not that the extracted flags are correct", + ); + }); +}); + +describe("renderHtmlReport: section headers (sentence-case + period)", () => { + it.each([ + ["Verdict."], + ["What fired."], + ["Audit trail."], + ["Provenance."], + ["Weighted tally."], + ["Take these actions."], + ["Extraction flags."], + ])("emits section header %s", (header) => { + const flags: ExtractionFlags = { ci_failing: true }; + const result = evaluate(rules, flags); + const html = renderHtmlReport(result, baseCtx({ flags })); + expect(html).toContain(header); + }); + + it("does not emit the prior 'Suggested actions' heading", () => { + const flags: ExtractionFlags = { ci_failing: true }; + const result = evaluate(rules, flags); + const html = renderHtmlReport(result, baseCtx({ flags })); + // The verb has switched from passive ("Suggested") to imperative + // ("Take these"). Make sure the old heading is gone. + expect(html).not.toContain("<h2>Suggested actions</h2>"); + }); +}); From 90f3fe10f5a652c428595c512e9a25c1d2af640d Mon Sep 17 00:00:00 2001 From: Claude <noreply@anthropic.com> Date: Sun, 17 May 2026 13:16:50 +0000 Subject: [PATCH 35/45] fix(rule-engine-poc): browser exit-code + takeOpt missing-value (Codex round 11) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - open-browser.ts: success now requires the opener process to exit with code 0 (or null, signal-terminated) — not just spawn. 
On Linux headless / CI, xdg-open spawns successfully and then immediately exits non-zero because no browser handler is registered; previously report.ts printed 'opened in browser' even though the file was never opened. Safety-net timeout bumped to 1s for opener daemons that never deliver an exit event (#525 round 11 P2). - cli-shared.ts: takeOpt now throws 'Missing value for option <flag>' when the flag is the last argv entry or is followed by another flag. Previously 'npm run report -- --target' would silently fall back to 'all targets' and 'npm run report -- --target --quiet' would interpret '--quiet' as the target id (#525 round 11 P2). - Two new takeOpt tests cover the missing-value rejection path. Suite: 165/165 passing. --- experiments/rule-engine-poc/src/cli-shared.ts | 10 ++++++ .../rule-engine-poc/src/open-browser.ts | 31 +++++++++++-------- .../rule-engine-poc/test/cli-shared.test.ts | 10 ++++++ 3 files changed, 38 insertions(+), 13 deletions(-) diff --git a/experiments/rule-engine-poc/src/cli-shared.ts b/experiments/rule-engine-poc/src/cli-shared.ts index 7ee2db586..c68a0407a 100644 --- a/experiments/rule-engine-poc/src/cli-shared.ts +++ b/experiments/rule-engine-poc/src/cli-shared.ts @@ -32,6 +32,16 @@ export function takeOpt(argv: string[], flag: string): string | undefined { const i = argv.indexOf(flag); if (i === -1) return undefined; const v = argv[i + 1]; + // Codex round 11 P2: if `--target` (or any --opt) is followed by nothing + // or by another flag, the caller almost certainly fat-fingered the + // command. Failing silently makes `--target` default to "all targets", + // which can overwrite multiple outputs unexpectedly. Throw instead so + // the CLI exits with a clear error before doing any work. + if (v === undefined || v.startsWith("-")) { + throw new Error( + `Missing value for option '${flag}'. 
Expected: '${flag} <value>'.`, + ); + } argv.splice(i, 2); return v; } diff --git a/experiments/rule-engine-poc/src/open-browser.ts b/experiments/rule-engine-poc/src/open-browser.ts index 7032974ba..f5c04aaaa 100644 --- a/experiments/rule-engine-poc/src/open-browser.ts +++ b/experiments/rule-engine-poc/src/open-browser.ts @@ -2,13 +2,15 @@ import { spawn } from "node:child_process"; import { platform } from "node:os"; // Best-effort: try to open `path` in the OS default browser. -// Returns a Promise<boolean> that resolves true if the child process -// successfully spawned, false otherwise (no xdg-open installed, -// headless container, etc.). +// Returns a Promise<boolean> that resolves true only when the opener +// process actually exits cleanly (or stays alive past the safety +// window), false otherwise — including the common headless case +// where xdg-open spawns successfully but then immediately exits +// non-zero because no browser handler is registered. // -// We wait briefly for either the `spawn` success event or the `error` -// event to fire before resolving, so the caller's status message -// reflects reality. A short timeout caps the worst case. +// Codex round 11 P2: spawn alone is not a success signal because +// xdg-open / open / cmd start can all exit non-zero after spawning. +// We now wait for the `exit` event and check the exit code. export function openInBrowser(path: string): Promise<boolean> { return new Promise((resolve) => { let settled = false; @@ -31,14 +33,17 @@ export function openInBrowser(path: string): Promise<boolean> { detached: true, }); child.on("error", () => finish(false)); - child.on("spawn", () => { - child.unref(); - finish(true); + child.on("spawn", () => child.unref()); + child.on("exit", (code) => { + // xdg-open / open / cmd start all exit promptly. Zero (or + // signal-terminated with null) means the OS handed off + // successfully; non-zero means no handler / unknown protocol. 
+ finish(code === 0 || code === null); }); - // Safety net: if neither event fires within 250 ms (some platforms - // are stingy with the `spawn` event), assume success — the child - // has been forked, we just don't have positive confirmation. - setTimeout(() => finish(true), 250); + // Safety net: if the opener daemonises and never delivers an + // `exit` event in our process tree within 1s, assume the + // hand-off worked. Rare on the three supported platforms. + setTimeout(() => finish(true), 1000); } catch { finish(false); } diff --git a/experiments/rule-engine-poc/test/cli-shared.test.ts b/experiments/rule-engine-poc/test/cli-shared.test.ts index b0c8bc400..d1935f554 100644 --- a/experiments/rule-engine-poc/test/cli-shared.test.ts +++ b/experiments/rule-engine-poc/test/cli-shared.test.ts @@ -86,6 +86,16 @@ describe("takeOpt", () => { expect(takeOpt(argv, "--config")).toBeUndefined(); expect(argv).toEqual(["--other", "x"]); }); + + it("throws when the flag is present but no value follows", () => { + const argv = ["--target"]; + expect(() => takeOpt(argv, "--target")).toThrow(/Missing value for option '--target'/); + }); + + it("throws when the next argv entry is another flag", () => { + const argv = ["--target", "--quiet"]; + expect(() => takeOpt(argv, "--target")).toThrow(/Missing value for option '--target'/); + }); }); describe("takeFlag", () => { From eb01077fc69c7659f0a73bc80b5bbded56e11b8c Mon Sep 17 00:00:00 2001 From: Claude <noreply@anthropic.com> Date: Sun, 17 May 2026 13:23:16 +0000 Subject: [PATCH 36/45] fix(rule-engine-poc): missing-flag banner + shell-quoted reproduce (Codex round 12) - html-report.ts missingFlagNames: only count rules whose final outcome was determined by the missing flag (matched === false). Previously a 'when.any' rule with one matched branch + one missing branch counted as un-evaluable in the banner, even though it contributed to the verdict (#525 round 12 P2). 
- html-report.ts reproCmd: paths are now single-quoted via a small shellQuote helper that escapes embedded ' as the standard '\'' four-char sequence. Paths with spaces (e.g., 'My Projects/rules.yaml') no longer break the copy-pasted reproduce command (#525 round 12 P2). - Two new html-report tests: when.any-with-missing-branch is NOT counted in the banner, and reproCmd contains HTML-escaped quoted paths. Suite: 167/167 passing. --- .../rule-engine-poc/src/html-report.ts | 15 +++++- .../rule-engine-poc/test/html-report.test.ts | 47 +++++++++++++++++++ 2 files changed, 60 insertions(+), 2 deletions(-) diff --git a/experiments/rule-engine-poc/src/html-report.ts b/experiments/rule-engine-poc/src/html-report.ts index 2021ebe93..73346291b 100644 --- a/experiments/rule-engine-poc/src/html-report.ts +++ b/experiments/rule-engine-poc/src/html-report.ts @@ -124,6 +124,12 @@ function missingFlagNames(evaluations: RuleEvaluation[]): { const flags: string[] = []; let ruleCount = 0; for (const ev of evaluations) { + // Codex round 12 P2: a rule that matched (e.g., when.any with one + // branch satisfied and another branch missing) is NOT + // "un-evaluable" — it contributed to the verdict. Only count + // rules whose final outcome was determined by the missing input + // (matched === false). Flag names still gather across all missing + // conditions so the banner can name what the LLM didn't supply. let ruleHasMissing = false; for (const c of ev.conditions) { if (c.reason !== "flag missing in extraction") continue; @@ -134,7 +140,7 @@ function missingFlagNames(evaluations: RuleEvaluation[]): { flags.push(name); } } - if (ruleHasMissing) ruleCount += 1; + if (ruleHasMissing && !ev.matched) ruleCount += 1; } return { flags, ruleCount }; } @@ -263,7 +269,12 @@ export function renderHtmlReport( : ""; // Reproduce command: assembled from the same fields plan/report use.
- const reproCmd = `npx tsx src/cli.ts ${ctx.rulesPath} ${ctx.flagsPath} --html <out.html> --quiet`; + // Codex round 12 P2: quote paths so paths with spaces or shell + // metacharacters (e.g., "My Projects/rules.yaml") don't break the + // command. Single-quote shell-escape: replace any ' inside the + // path with the four-char sequence '\'' . + const shellQuote = (s: string): string => `'${s.replace(/'/g, "'\\''")}'`; + const reproCmd = `npx tsx src/cli.ts ${shellQuote(ctx.rulesPath)} ${shellQuote(ctx.flagsPath)} --html <out.html> --quiet`; return `<!doctype html> <html lang="en"> diff --git a/experiments/rule-engine-poc/test/html-report.test.ts b/experiments/rule-engine-poc/test/html-report.test.ts index 62116e59b..66cad50ac 100644 --- a/experiments/rule-engine-poc/test/html-report.test.ts +++ b/experiments/rule-engine-poc/test/html-report.test.ts @@ -158,6 +158,34 @@ describe("renderHtmlReport: blocker-by-absence banner", () => { expect(html).toContain("<strong>1</strong> rule could not be evaluated"); }); + it("does not count rules that matched despite a missing branch", () => { + // Codex round 12 P2: a `when.any` rule with one branch matched and + // another branch missing is NOT un-evaluable — it contributed to + // the verdict. The banner must not claim the rule failed on absence. 
+ const inlineRules = loadRulesFromString( + ` +- id: any-rule-one-branch-matches + description: matches when either flag is true + priority: 50 + when: + any: + - flag: present_flag + eq: true + - flag: never_supplied + eq: true + then: + verdict: needs-attention + weight: 10 + actions: [] +`, + "any-with-missing-branch", + ); + const flags: ExtractionFlags = { present_flag: true }; + const result = evaluate(inlineRules, flags); + const html = renderHtmlReport(result, baseCtx({ flags })); + expect(html).not.toContain('<div class="banner banner--missing"'); + }); + it("omits the banner DOM element when no rule references a missing flag", () => { // Supply every flag the rule set asks for so no rule reports // "flag missing in extraction". @@ -258,6 +286,25 @@ describe("renderHtmlReport: provenance reframing", () => { expect(html).toContain("fixtures/test.json"); }); + it("shell-quotes paths in the reproduce command so spaces don't break it", () => { + // Codex round 12 P2: unquoted paths break copy-pasted reproduce + // commands on user machines (e.g., "My Projects/..."). + const flags: ExtractionFlags = { ci_failing: true }; + const result = evaluate(rules, flags); + const html = renderHtmlReport( + result, + baseCtx({ + flags, + rulesPath: "My Projects/rules.yaml", + flagsPath: "extractions with spaces/x.json", + }), + ); + // Single quotes are HTML-escaped (') in the rendered output, + // but they decode back to ' when the user pastes the command. 
+ expect(html).toContain("'My Projects/rules.yaml'"); + expect(html).toContain("'extractions with spaces/x.json'"); + }); + it("truncates ruleset and flags hashes to 12-char prefixes", () => { const flags: ExtractionFlags = { ci_failing: true }; const result = evaluate(rules, flags); From 003a05ed50229cedc9c29b556b7652da1a60cd27 Mon Sep 17 00:00:00 2001 From: Claude <noreply@anthropic.com> Date: Sun, 17 May 2026 13:28:58 +0000 Subject: [PATCH 37/45] fix(rule-engine-poc): single-shot CLI rejects missing --html value (Codex round 13) src/cli.ts has its own takeOption() (the single-shot fixture flow doesn't use src/cli-shared.ts). Same bug as round 11 P2 in cli-shared: when --html had no value the helper returned undefined and the CLI silently proceeded with no HTML output, breaking automation that relies on the artifact being written. Now fail fast with a clear stderr message and exit code 2 when the option is the last argv entry or is followed by another flag. Suite: 167/167 still passing (no test exercised the silent-skip path). --- experiments/rule-engine-poc/src/cli.ts | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/experiments/rule-engine-poc/src/cli.ts b/experiments/rule-engine-poc/src/cli.ts index 2402f5c2a..5908242cf 100644 --- a/experiments/rule-engine-poc/src/cli.ts +++ b/experiments/rule-engine-poc/src/cli.ts @@ -12,6 +12,17 @@ function takeOption(flag: string): string | undefined { const i = argv.indexOf(flag); if (i === -1) return undefined; const value = argv[i + 1]; + // Codex round 13 P2: fail fast when the option has no value (last + // argv entry) or is followed by another flag. Previously a typo like + // `rule-engine-poc rules.yaml flags.json --html` would silently + // proceed with no output, breaking automation that relies on the + // HTML artifact being present. + if (value === undefined || value.startsWith("-")) { + console.error( + `Missing value for option '${flag}'. 
Expected: '${flag} <value>'.`, + ); + process.exit(2); + } argv.splice(i, 2); return value; } From c00fc4d8961cc6acab64b46862b7b9ee1b9e031d Mon Sep 17 00:00:00 2001 From: Claude <noreply@anthropic.com> Date: Sun, 17 May 2026 13:37:12 +0000 Subject: [PATCH 38/45] fix(rule-engine-poc): string-only actions + finite gt/lt (Codex round 14) - loader.ts: every entry of 'then.actions' must be a non-empty string slug. Previously numbers / objects / empty strings passed load-time validation and flowed into the HTML reporter as unrecognised tokens that couldn't map to a glossary entry, breaking the remediation guidance the verdict is meant to provide (#525 round 14 P2). - loader.ts: 'gt' and 'lt' now reject NaN and Infinity at load time. Both are technically 'number' but silently corrupt comparisons at runtime (NaN comparisons always false), so a typo could make a gating rule unexpectedly never fire (#525 round 14 P2). - Four new loader tests cover non-string action elements, empty-string action elements, NaN gt, and Infinity lt. Suite: 171/171 passing (167 + 4). --- experiments/rule-engine-poc/src/loader.ts | 32 ++++++- .../rule-engine-poc/test/loader.test.ts | 84 +++++++++++++++++++ 2 files changed, 112 insertions(+), 4 deletions(-) diff --git a/experiments/rule-engine-poc/src/loader.ts b/experiments/rule-engine-poc/src/loader.ts index bc7e70d62..7de40ced2 100644 --- a/experiments/rule-engine-poc/src/loader.ts +++ b/experiments/rule-engine-poc/src/loader.ts @@ -121,6 +121,19 @@ function validate( if (!Array.isArray(rule.then.actions)) { throw new Error(`Rule '${rule.id}' missing 'then.actions' array`); } + // Codex round 14 P2: every action must be a non-empty string slug. + // Non-string entries flow into the HTML reporter as unrecognised + // tokens that can't map to a glossary entry, undermining the + // remediation guidance the verdict is meant to provide. 
+ for (let i = 0; i < rule.then.actions.length; i++) { + const a = rule.then.actions[i]; + if (typeof a !== "string" || a.length === 0) { + throw new Error( + `Rule '${rule.id}' has invalid 'then.actions[${i}]' (got ${typeof a}); ` + + `every action must be a non-empty string slug`, + ); + } + } if (typeof rule.priority !== "number") { throw new Error(`Rule '${rule.id}' missing numeric 'priority'`); } @@ -179,10 +192,21 @@ function validateConditions( ); } for (const numOp of ["gt", "lt"] as const) { - if (numOp in cond && typeof cond[numOp] !== "number") { - throw new Error( - `Rule '${ruleId}' ${where} has non-number '${numOp}' (got ${typeof cond[numOp]})`, - ); + if (numOp in cond) { + const v = cond[numOp]; + if (typeof v !== "number") { + throw new Error( + `Rule '${ruleId}' ${where} has non-number '${numOp}' (got ${typeof v})`, + ); + } + // Codex round 14 P2: NaN and Infinity are technically `number` + // but corrupt comparisons silently (every NaN comparison is + // false). Reject at load time alongside other malformed input. 
+ if (!Number.isFinite(v)) { + throw new Error( + `Rule '${ruleId}' ${where} has non-finite '${numOp}' (got ${String(v)})`, + ); + } } } if (typeof cond.regex === "string") { diff --git a/experiments/rule-engine-poc/test/loader.test.ts b/experiments/rule-engine-poc/test/loader.test.ts index b959459ee..54bcc78c2 100644 --- a/experiments/rule-engine-poc/test/loader.test.ts +++ b/experiments/rule-engine-poc/test/loader.test.ts @@ -379,6 +379,90 @@ describe("loader", () => { ).toThrow(/Duplicate rule id 'dup'/); }); + it("rejects rules with non-string entries in then.actions", () => { + expect(() => + loadRulesFromString( + ` +- id: r1 + priority: 1 + description: x + when: + all: + - flag: a + eq: true + then: + verdict: blocked + weight: 1 + actions: ["valid-slug", 42] +`, + "actions-non-string", + ), + ).toThrow(/invalid 'then\.actions\[1\]'/); + }); + + it("rejects rules with empty-string entries in then.actions", () => { + expect(() => + loadRulesFromString( + ` +- id: r1 + priority: 1 + description: x + when: + all: + - flag: a + eq: true + then: + verdict: blocked + weight: 1 + actions: [""] +`, + "actions-empty-string", + ), + ).toThrow(/invalid 'then\.actions\[0\]'/); + }); + + it("rejects conditions with non-finite 'gt'", () => { + expect(() => + loadRulesFromString( + ` +- id: r1 + priority: 1 + description: x + when: + all: + - flag: count + gt: .nan + then: + verdict: blocked + weight: 1 + actions: [a] +`, + "gt-nan", + ), + ).toThrow(/non-finite 'gt'/); + }); + + it("rejects conditions with infinite 'lt'", () => { + expect(() => + loadRulesFromString( + ` +- id: r1 + priority: 1 + description: x + when: + all: + - flag: count + lt: .inf + then: + verdict: blocked + weight: 1 + actions: [a] +`, + "lt-inf", + ), + ).toThrow(/non-finite 'lt'/); + }); + it("assigns a stable content hash to each rule", () => { const ruleA = loadRulesFromString( ` From 1f74fb03332714a22efd02c1c942ab092d66529f Mon Sep 17 00:00:00 2001 From: Claude <noreply@anthropic.com> Date: 
Sun, 17 May 2026 13:46:25 +0000 Subject: [PATCH 39/45] docs(rule-engine-poc): compliance map for adopters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New docs/compliance.md walks the standards/regulations an adopter is most likely to be asked about in 2026 and catalogues what the POC contributes vs what stays the adopter's job. Synthesises the two prior research passes (research/02 regulatory + research/20 auditor readability) into reference material for scoping conversations. Covered: - EU AI Act Art. 11-14 + Art. 72 with a per-article table. - ISO/IEC 42001 AIMS clauses 6-10. - ISO/IEC 23894 AI risk management. - NIST AI RMF Govern / Map / Measure / Manage. - GDPR Art. 22 (when it applies vs when it doesn't). - OECD AI Principles (1-paragraph summary). Plus: - 'What the POC ticks natively' — per-artifact provenance. - 'What is NOT in this POC' — honest gap analysis. - Maturity checklist before production with rough effort estimates. Leads with a disclaimer that this is engineering reference material, not legal advice or certification. docs/README.md updated to index the new doc. --- experiments/rule-engine-poc/docs/README.md | 1 + .../rule-engine-poc/docs/compliance.md | 235 ++++++++++++++++++ 2 files changed, 236 insertions(+) create mode 100644 experiments/rule-engine-poc/docs/compliance.md diff --git a/experiments/rule-engine-poc/docs/README.md b/experiments/rule-engine-poc/docs/README.md index 1ef09fbfb..a7c1b6e24 100644 --- a/experiments/rule-engine-poc/docs/README.md +++ b/experiments/rule-engine-poc/docs/README.md @@ -15,6 +15,7 @@ Detailed documentation for the POC. Start with the project [README](../README.md | [`workflow.md`](workflow.md) | You want to run the `plan` → paste → `validate` → `report` loop end-to-end. | | [`dsl-reference.md`](dsl-reference.md) | You're writing or reading a rule file and need the full YAML grammar — every operator, every grouping construct. 
| | [`audit-trail.md`](audit-trail.md) | You need to replay a verdict, diff two verdicts, or map the audit trail to EU AI Act / ISO 42001 requirements. | +| [`compliance.md`](compliance.md) | You're scoping the pattern against a regulation or standard — what the POC ticks natively, what the adopter must still provide, what's out of scope. | | [`extending.md`](extending.md) | You want to add a rule, add a flag, point the engine at a new domain, or run the tests. | | [`ooda-integration.md`](ooda-integration.md) | You want to understand how this POC slots into the OODA orchestrator and what a production wiring would look like. | diff --git a/experiments/rule-engine-poc/docs/compliance.md b/experiments/rule-engine-poc/docs/compliance.md new file mode 100644 index 000000000..9fd0aaa97 --- /dev/null +++ b/experiments/rule-engine-poc/docs/compliance.md @@ -0,0 +1,235 @@ +--- +title: Compliance map +folder: experiments/rule-engine-poc/docs +description: Which standards and regulations the "LLM extracts, rules decide" pattern can target — what the POC ticks natively, what the adopter must still provide, what remains out of scope. +entry_point: false +--- + +# Compliance map + +What this POC can credibly claim to support — and what it explicitly cannot — across the regulations and standards an adopter is most likely to be asked about in 2026. + +> **Scope and disclaimer.** This document is *engineering reference material* for someone scoping the pattern against a compliance need. It is not legal advice and not a certification. The POC implements *part* of what each standard demands; the rest is governance, organisation, and process work the adopter does around the POC. A legal / compliance professional must sign off before any production use. + +## Contents + +1. [Quick map](#1-quick-map) +2. [EU AI Act (Regulation (EU) 2024/1689)](#2-eu-ai-act-regulation-eu-20241689) +3. [ISO/IEC 42001 — AI Management System](#3-isoiec-42001--ai-management-system) +4. 
[ISO/IEC 23894 — AI risk management](#4-isoiec-23894--ai-risk-management) +5. [NIST AI Risk Management Framework 1.0](#5-nist-ai-risk-management-framework-10) +6. [GDPR Article 22 — automated decision-making](#6-gdpr-article-22--automated-decision-making) +7. [OECD AI Principles](#7-oecd-ai-principles) +8. [What the POC ticks natively](#8-what-the-poc-ticks-natively) +9. [What is NOT in this POC](#9-what-is-not-in-this-poc) +10. [Maturity checklist before production](#10-maturity-checklist-before-production) + +--- + +## 1. Quick map + +| Standard / regulation | Where it bites | What the POC contributes | What stays the adopter's job | +|---|---|---|---| +| **EU AI Act** | Art. 11–14 (technical file, logs, transparency, oversight); Annex IV | Replayable verdict + content-hashed audit trail; deterministic decide layer; visible LLM/rule boundary | Provider identity, model card, lifetime log retention, human-oversight workflow, conformity assessment | +| **ISO/IEC 42001** | AIMS clauses 6–10 (planning, support, operation, performance, improvement) | Rule lifecycle visible in git history; decision artifacts diffable; engine-version pin | Policy, training records, internal audits, management review, corrective-action register | +| **ISO/IEC 23894** | AI risk identification, treatment, monitoring | Audit trail of every condition observation; missing-flag surface | Risk register, treatment plan, residual-risk acceptance, drift monitoring | +| **NIST AI RMF** | Govern / Map / Measure / Manage | *Measure-on-a-case-basis* artifact (per-decision); deterministic re-run | Population-level metrics, fairness audit, uncertainty quantification, model A/B harness | +| **GDPR Art. 
22** | Automated decisions with legal / similarly significant effect | Audit-trail rationale (which rules fired, on which flags); right to contest is enabled by replayability | Lawful basis assessment, meaningful information notice, human review path, data-subject rights handling | +| **OECD AI Principles** | Transparency, accountability, robustness | Severity-first verdict, content hashes, no silent failures | Stakeholder engagement, ethical review, redress channel | + +The pattern is strongest at **explainability** and **reproducibility** — weakest at **governance** (rule lifecycle policy, role separation) and **measurement at population scale** (calibration, fairness). + +--- + +## 2. EU AI Act (Regulation (EU) 2024/1689) + +Enforcement is staged: prohibited-practice provisions applied Feb 2025; high-risk system obligations (Annex III) apply from **2 Aug 2026** for most use cases (16-month postponement available for new / substantially modified systems per the May 2026 Digital Omnibus). + +### Articles relevant to the POC pattern + +| Article | Requirement | POC contribution | Adopter must add | +|---|---|---|---| +| **Art. 11 — Technical documentation** | Annex IV technical file describing the system, its components, version history, and intended purpose | `docs/architecture.md`, `docs/audit-trail.md`, ADRs, `research/*` are the engineering corpus | Provider identity, market-placement records, conformity assessment outcome, intended-purpose statement | +| **Art. 12 — Lifetime logs** | Automatic recording of events over the system's lifetime | The `VerdictResult` is the per-decision dossier; `engineVersion + rulesetHash + flagsHash + recomputed promptHash` enable replay; the recompute-on-report design closes the sidecar-paste bypass | Persistent storage (append-only, tamper-evident), retention policy, lifecycle deletion, regulator-access channel | +| **Art. 
13 — Transparency to deployers** | Sufficient transparency to interpret output and use it appropriately | HTML report v3: system-identity header, "What fired", reproduce block, verdict-tier + glyph legend, action human-sentences | Provider contact, model card, expected-lifetime statement, capabilities-and-limitations sentence, log-interpretation training for deployers | +| **Art. 14 — Human oversight** | Measures enabling humans to interpret, monitor, intervene, override | The HTML report is the interpretation surface; `--skip-validate` warning is the override-visibility surface; `verified` badge with tooltip is the trust-calibration surface | Reviewer-of-record field, override workflow, two-person rule where required, training | +| **Art. 72 — Post-market monitoring** | Active monitoring after deployment | None — POC is pre-deployment | Production telemetry, drift dashboards, incident reporting | + +### How the POC pattern helps + +- The **severity-first verdict** keeps the categorical-tier semantics the article expects ("safe / not safe / refer to human") rather than a numeric score that obscures the gate. +- **Recompute-on-report** for the prompt hash means a verdict cannot be replayed against silently-changed source material — closes the auditor's "but the data changed" objection. +- **Forbidden verdict-shaped fields** (`verdict`, `assessment`, `conclusion`, ...) at the validate gate keep the LLM in its scoped role: extraction only. The article's "non-deterministic component" boundary is visible because we drew it. + +### How the POC pattern does NOT help + +- It does not classify the system's risk tier — Annex III is the adopter's call. +- It does not produce the **conformity assessment** documentation (Annex VI/VII). +- It says nothing about **CE marking** or **EU declaration of conformity**. +- It does not enforce **biometric / emotional / social-scoring** prohibitions (Art. 5) — those are out of pattern scope. + +--- + +## 3. 
ISO/IEC 42001 — AI Management System + +Published 2023. Sector-agnostic management-system standard. Certifiable through accredited bodies. The standard mirrors ISO 9001's clause structure. + +| Clause | What the POC contributes | What stays the adopter's job | +|---|---|---| +| **6 Planning** | The OODA / Decide-quadrant framing is a documented architectural decision (`docs/ooda-integration.md`); ADRs record irreversible choices | Risk register, AI policy, role definitions | +| **7 Support** | `docs/extending.md` is the operator manual; `docs/dsl-reference.md` documents the language | Training records, competence assessments | +| **8 Operation** | Plan → AI paste → validate → report loop is documented; exit codes 0/1/2 are CI-integrable | Operational procedures, supplier oversight (if the LLM is hosted), production runbooks | +| **9 Performance evaluation** | Reproducibility tests (`test/engine.test.ts` → `describe("reproducibility")`) are the verification surface | Internal audits, management review, KPI tracking | +| **10 Improvement** | The git history is the corrective-action register; research artifacts are the lessons learned | Non-conformance handling process, CAPA, change-control sign-off | + +### Strong fit + +ISO/IEC 42001's emphasis on a **documented, reviewable AI process** matches Specorator's stance directly. Every rule change is a PR; every PR is reviewable; every reviewed PR has a content hash. That is the *evidence chain* the auditor wants. The standard does not require the rule engine to be deterministic — but determinism makes the evidence chain credible. + +### Weak fit + +The standard expects an **organisation-level AIMS** (roles, policy, top management commitment). The POC is engineering, not governance. The adopter writes the policy, identifies the AI Officer / equivalent, runs the management review, and slots this POC into the resulting framework. + +--- + +## 4. ISO/IEC 23894 — AI risk management + +Published 2023. 
Adapts ISO 31000 (risk management) to AI. Voluntary, frequently bundled with 42001 in audit scope. + +What the POC contributes: + +- **Risk identification surface**: the `unknown` verdict tier and `flag missing in extraction` reason explicitly name two AI-specific risks (insufficient input, model-extraction failure) that ISO 31000 does not enumerate. +- **Risk traceability**: every blocked verdict traces back to a specific rule, which traces back to a specific Definition-of-Done bullet, which traces back to the quality framework. The chain is replayable. + +What the POC does not produce: + +- The actual **risk register** with likelihood / impact / treatment. +- The **residual risk acceptance** sign-off. +- The **continuous risk-monitoring** loop (drift detection, calibration over time). + +--- + +## 5. NIST AI Risk Management Framework 1.0 + +US-origin, voluntary, increasingly referenced as the "default" framework where no jurisdiction-specific regulation applies. Four functions: **Govern / Map / Measure / Manage**. + +| Function | POC contribution | Adopter must add | +|---|---|---| +| **Govern** | Architecture decisions are ADR-backed; constitution is checked into git | Organisational policy, accountability assignment | +| **Map** | `docs/architecture.md` is the system context map; `research/02` + this doc map the regulatory context | Stakeholder identification, intended/foreseeable use cases | +| **Measure** | Per-decision dossier (verdict + audit trail + provenance) is a *case-basis* Measure artifact | Population-level metrics, uncertainty quantification, calibration, fairness audit | +| **Manage** | Validate gate refuses bad input; severity-first verdict prioritises blocking findings | Incident response, deployment gating, model A/B controls | + +The POC is strongest at **Measure-on-a-case-basis** and weakest at the cross-cutting **Govern** function. NIST emphasises that Govern wraps the other three; the POC alone doesn't govern itself. + +--- + +## 6. 
GDPR Article 22 — automated decision-making + +Applies when a decision based **solely** on automated processing produces legal or similarly significant effects on a natural person. Two relevant deployment shapes for this POC: + +### When Art. 22 applies + +If the verdict directly determines an outcome affecting a person (hiring filter, credit, benefit eligibility), Art. 22 is in scope. The POC then needs to support: + +- **Right to obtain human intervention**: the report's "What fired" section is the explanation the data subject would receive; the rule-engine verdict must be reviewable by a human with authority to overturn it. +- **Right to express their point of view**: the audit trail names which flags drove the verdict; the data subject can contest specific flags rather than the whole opaque output. +- **Right to contest the decision**: replayability means a contest can be evaluated against the exact (engine, rules, flags) tuple that produced the original verdict. + +### When Art. 22 does NOT apply + +The shipped example domain (Specorator workflow quality gates) operates on artifacts, not natural persons — Art. 22 doesn't bite. The pattern still helps with explainability under Art. 13–15 (transparency) and supports the controller's general accountability obligation (Art. 5(2)). + +### What stays the adopter's job + +- Lawful basis assessment (Art. 6) +- Meaningful information notice to data subjects (Art. 13–14) +- Data subject access request (DSAR) workflow +- DPIA when the deployment shape triggers it (Art. 35) + +--- + +## 7. OECD AI Principles + +Published 2019, updated 2024. Five principles: **inclusive growth & well-being / human-centred values & fairness / transparency & explainability / robustness, security & safety / accountability**. + +The POC speaks most directly to **transparency & explainability** (the verdict is reproducible from named inputs and named rules) and **robustness** (deterministic, content-hashed, fail-closed on invalid input). 
The other three are organisational. + +OECD has no binding force, but national AI strategies frequently reference it — a clean OECD-mapping table is useful in proposals. + +--- + +## 8. What the POC ticks natively + +The artifacts the engine actually produces, mapped to the recurring requirements above: + +| Artifact | Where | Speaks to | +|---|---|---| +| `engineVersion` constant | `src/engine.ts` | EU AI Act Art. 11 (version pin); NIST Manage | +| `rulesetHash` per evaluation | `src/engine.ts` | Art. 12 (log integrity); ISO 42001 §10 (change traceability) | +| Per-rule content hash | `src/loader.ts` | Art. 12; granular change-diff between two verdicts | +| `flagsHash` per evaluation | `src/engine.ts` | Art. 12; ties verdict to the exact extraction | +| `promptHash` recomputed on report | `src/prompt-hash.ts`, `src/report.ts` | Art. 12; closes paste-the-sidecar bypass | +| Per-condition `observed` + `reason` | `src/engine.ts` audit trail | Art. 13 (rationale); GDPR Art. 22 (point of view); NIST Measure | +| Forbidden verdict-shaped fields | `src/validate.ts::FORBIDDEN_FIELDS` | LLM/rule boundary visibility; auditor's "what's deterministic" question | +| Validate gate refusing bad input | `src/validate.ts` + `src/report.ts` | NIST Manage; ISO 42001 §8 | +| `--skip-validate` warning banner | `src/html-report.ts` | Art. 14 (oversight visibility); auditor's override audit | +| HTML report sections | `src/html-report.ts` | Art. 13 (transparency to deployers); auditor's 30-second test | +| Verdict-tier glossary + glyph legend | `src/html-report.ts` | Art. 13 (interpretation guidance) | +| Reproduce-command block | `src/html-report.ts` | Art. 12 (replay manifest) | +| Mode label_set candidates (future) | `research/18` | Art. 13 (deployer-appropriate language) | + +--- + +## 9. What is NOT in this POC + +Be honest about boundaries when scoping. 
The POC does NOT provide: + +- **Provider identity / model card / intended-purpose statement** — these are organisational outputs, not engineering ones. +- **Persistent audit log storage** — the `VerdictResult` is a return value; durable retention is the caller's job. +- **Tamper-evidence** — no hash chain across decisions, no signed receipts. Adopters needing Art. 12 strictness should layer signed envelopes or anchor hashes externally. +- **Fairness / bias audit of rule weights** — `severity-first` is a normative choice. It is *visible* in YAML; it has not been *reviewed for disparate impact*. That review is the adopter's. +- **Calibration of flag values** — whether the LLM's `requirements_ears_coverage: 0.85` reflects reality is unmeasured. Calibration is a population-level activity the POC cannot do alone. +- **Drift monitoring** — no telemetry, no verdict-distribution dashboard. Production deployment needs this on top. +- **Conformity assessment / CE-marking artifacts** — out of scope; the technical file feeds in, not out. +- **Privacy controls** — no PII handling, no data minimisation, no encryption at rest. Adopters processing personal data layer these. +- **Model A/B harness** — the engine evaluates one rule set against one flag set. Comparing two engine versions or two rule sets requires harness work. +- **Right-to-erasure cascade** — if an extraction is deleted, the report referencing it is not automatically invalidated. Lifecycle is the adopter's job. + +--- + +## 10. Maturity checklist before production + +If an adopter wants to take the pattern into a regulated production deployment, here is the minimum work the POC implies they still need to do. The numbers are illustrative effort, not commitments. 
+ +| Area | Item | Effort (rough) | +|---|---|---| +| **Governance** | AI policy, role definitions, AIMS scope (ISO 42001 §4–5) | 1–2 weeks | +| **Risk management** | Risk register + treatment plan (ISO 23894) | 1 week | +| **Provider documentation** | Model card, intended-purpose statement, capabilities and limitations | 1 week | +| **Storage & retention** | Per-decision dossier storage with retention policy + tamper-evidence (S3 + object-lock or equivalent) | 1–2 weeks | +| **Human oversight** | Reviewer-of-record workflow, override audit, two-person rule where required | 1 week | +| **Calibration** | Define and run a calibration study on flag values vs ground truth | 2–4 weeks | +| **Fairness audit** | Review severity weights and `disallowed_values` for disparate impact | 1–2 weeks | +| **Telemetry / drift** | Verdict-distribution dashboards; weekly drift report | 1 week | +| **Incident response** | Runbook for: stale extraction in production, schema mismatch, rule regression | 1 week | +| **Legal sign-off** | DPIA (if applicable), Art. 6 lawful-basis assessment, Art. 22 applicability assessment, conformity assessment | external | +| **CI integration** | API extractor replacing the paste step; budget caps; rate limiting | 1 week — see [`research/15-ci-operations.md`](../research/15-ci-operations.md) | + +The pattern accelerates the engineering side of this list. It does not accelerate the governance side, and adopters who treat the POC as "compliance solved" will fail audit. The right framing is: *the POC gives you the evidence chain; the chain still needs to be governed*. + +--- + +## References + +- EU AI Act (Regulation (EU) 2024/1689): https://artificialintelligenceact.eu/ +- ISO/IEC 42001:2023 — AI management system: https://www.iso.org/standard/81230.html +- ISO/IEC 23894:2023 — AI risk management: https://www.iso.org/standard/77304.html +- NIST AI Risk Management Framework 1.0 (AI 100-1): https://nvlpubs.nist.gov/nistpubs/ai/nist.ai.100-1.pdf +- GDPR Art. 
22 (Regulation (EU) 2016/679): https://gdpr-info.eu/art-22-gdpr/ +- OECD AI Principles: https://oecd.ai/en/ai-principles + +Earlier research on this topic in this POC: + +- [`research/02-regulatory-auditability.md`](../research/02-regulatory-auditability.md) — first regulatory pass at engine maturity +- [`research/20-report-auditor-readability.md`](../research/20-report-auditor-readability.md) — auditor reading-path audit of the HTML report at workflow maturity From d203b154a66fa564dadb1c5c432fbbe96d6ac99b Mon Sep 17 00:00:00 2001 From: Claude <noreply@anthropic.com> Date: Sun, 17 May 2026 14:15:10 +0000 Subject: [PATCH 40/45] =?UTF-8?q?docs(rule-engine-poc):=20report-reference?= =?UTF-8?q?.md=20=E2=80=94=20single-page=20overview=20of=20the=20HTML=20re?= =?UTF-8?q?port?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Synthesises the five wave-4 research artifacts (research/17-21) plus the relevant sections of architecture.md, workflow.md, and audit-trail.md into one place for someone who wants to understand the report end-to-end without chasing across files. Sections: - What the report is + the three committed sample renders - Section-by-section walkthrough mapped to each section's research source - The five perspectives that shaped the v3 rebuild (UX, stakeholder, brand, auditor, critic) with each agent's top-line finding - The 12 wave-4 changes + the Codex round 11-14 hardenings that landed on top - What is still open, bucketed (strategy slice / governance / ADR / production prep / discovery RATs) - How to generate one - How to read one (the 4-step skim path) docs/README.md indexes it. 
--- experiments/rule-engine-poc/docs/README.md | 1 + .../rule-engine-poc/docs/report-reference.md | 170 ++++++++++++++++++ 2 files changed, 171 insertions(+) create mode 100644 experiments/rule-engine-poc/docs/report-reference.md diff --git a/experiments/rule-engine-poc/docs/README.md b/experiments/rule-engine-poc/docs/README.md index a7c1b6e24..168dfd941 100644 --- a/experiments/rule-engine-poc/docs/README.md +++ b/experiments/rule-engine-poc/docs/README.md @@ -13,6 +13,7 @@ Detailed documentation for the POC. Start with the project [README](../README.md |---|---| | [`architecture.md`](architecture.md) | You want the full architecture picture — system map, user flow, data flow, engine internals, module graph, with Mermaid diagrams. **Start here.** | | [`workflow.md`](workflow.md) | You want to run the `plan` → paste → `validate` → `report` loop end-to-end. | +| [`report-reference.md`](report-reference.md) | You want a single-page overview of the HTML report — section-by-section walkthrough, the five perspectives that shaped it, what's still open. | | [`dsl-reference.md`](dsl-reference.md) | You're writing or reading a rule file and need the full YAML grammar — every operator, every grouping construct. | | [`audit-trail.md`](audit-trail.md) | You need to replay a verdict, diff two verdicts, or map the audit trail to EU AI Act / ISO 42001 requirements. | | [`compliance.md`](compliance.md) | You're scoping the pattern against a regulation or standard — what the POC ticks natively, what the adopter must still provide, what's out of scope. 
| diff --git a/experiments/rule-engine-poc/docs/report-reference.md b/experiments/rule-engine-poc/docs/report-reference.md new file mode 100644 index 000000000..f3e2919cd --- /dev/null +++ b/experiments/rule-engine-poc/docs/report-reference.md @@ -0,0 +1,170 @@ +--- +title: Report reference +folder: experiments/rule-engine-poc/docs +description: A single-page overview of the HTML report — what each section is for, what the five product perspectives that shaped it found, what's implemented, and what's still open. +entry_point: false +--- + +# Report reference + +This is the meta-doc for the HTML report itself — the user-facing artifact of the POC. It consolidates what `architecture.md`, `workflow.md`, `audit-trail.md`, and the five wave-4 research artifacts (`research/17`–`21`) say about the report into one place. + +## Contents + +1. [What the report is](#1-what-the-report-is) +2. [Section-by-section walkthrough](#2-section-by-section-walkthrough) +3. [The five product perspectives that shaped it](#3-the-five-product-perspectives-that-shaped-it) +4. [What got implemented (wave-4 delta)](#4-what-got-implemented-wave-4-delta) +5. [What's still open](#5-whats-still-open) +6. [Generating one](#6-generating-one) +7. [Reading one](#7-reading-one) + +--- + +## 1. What the report is + +A self-contained HTML file rendered by `src/html-report.ts` from a `VerdictResult`. One report per target; one file per `npm run report` invocation. Inline CSS, no JavaScript, no external assets — it survives email forwarding, Slack attachment, S3 retention, and offline viewing. + +The report is the *only* document most readers will see. The terminal output and the JSON `--json` mode are for CI and operators; the HTML is for everyone else (PR reviewer, PM, EM, QA, compliance officer, auditor, the author a week later). 
+ +Three committed samples under [`research/sample-reports/`](../research/sample-reports/) show the three primary verdict shapes — `blocked`, `needs-attention`, `ready-to-progress`. + +## 2. Section-by-section walkthrough + +The report renders top-to-bottom in this order. The order matters: it follows the reader's actual scan path established by `research/17` (UX audit) and `research/20` (auditor reading path). + +| Section | Job | Section header at render | Source | +|---|---|---|---| +| **System-identity header** | Tell a cold reader *what this is* — engine version + prominent timestamp | (no header — runs above the verdict card) | `research/20` Art. 13 "provider identity" gap | +| **Verdict tile** | Categorical tier in 2-second-scan colour: blocked / needs-attention / ready-to-progress / unknown | (the headline) | engine `verdict` | +| **Stats line** | "N rule(s) fired · M action(s) to take" — quantifies how contested the decision is | (under the tile) | `result.evaluations`, `result.actions` | +| **Blocker-by-absence banner** | "X rules could not be evaluated because the LLM did not supply Y, Z, W" — yellow, adjacent to verdict | (conditional banner) | `research/21` skim-trap finding | +| **Skip-validate banner** | "WARNING: validation gate was skipped" — when `--skip-validate` was set | (conditional banner) | `research/18` + `research/21` trust calibration | +| **Verdict-tier + glyph legend** | Collapsed `<details>` explaining blocked / needs-attention / ready-to-progress / unknown + `[+] / [-] / [?]` glyph meanings | "Glossary." | `research/17` + `research/20` | +| **Weighted tally** | Per-tier weight totals, side-by-side with actions | "Weighted tally." | engine `weightedTally` | +| **Suggested actions** | Imperative-voice action sentences in priority-of-cause order (not alphabetical) | "Take these actions." 
| engine `evaluations` walked in priority desc; `rules/action-glossary.yaml` if present | +| **Extraction flags** | The LLM's structured output as a table | "Extraction flags." | `ctx.flags` | +| **What fired** | Matched rules only, in priority order, each with rule id + description + flags it matched on + actions it contributed | "What fired." | `result.evaluations` filtered to `matched === true` | +| **Audit trail** | Every rule evaluation, matched + skipped. Skipped rules collapse to `<details>` summary by default | "Audit trail." | `result.evaluations` | +| **Reproduce block** | Shell-quoted command + the three replay anchors (engine version + ruleset hash + flags hash) | "How to reproduce." | `research/20` Art. 12 replay manifest | +| **Provenance** | Hash preamble + 12-char truncated hashes + file paths | "Provenance." | `result.rulesetHash`, `result.flagsHash`, `result.engineVersion` | + +The CSS is inline and uses a 3-tier severity palette (red / amber / green) with non-colour-only signals (glyphs, row washes, section headings). A `@media (max-width: 540px)` rule collapses the summary grid to a single column on phones. + +## 3. The five product perspectives that shaped it + +Wave 4 dispatched five subagents in parallel against three committed sample renders. Each reviewed the report through a different lens. The convergent findings drove the v3 rebuild in the wave-4 implementer pass. + +### UX (`research/17-report-ux-audit.md`) + +> "The audit trail buries what matters. With ~21 rules and 1–2 matches per sample, the page is ~95% 'did not match' content." + +Top recommendations: a **What fired** section above the full audit trail; collapse-by-default for skipped rules; replace "X of 21 rules matched" coverage text with per-tier-appropriate phrasing; sort actions by priority-of-cause not alphabetical; a `cond--miss` row-wash to match `cond--missing`'s amber; a `@media (max-width: 540px)` single-column fallback. 
+ +### Stakeholder strategy (`research/18-report-stakeholders.md`) + +> "The report is one artifact serving six different first-fields — PR-reviewers want the verdict pill, compliance wants the provenance block, PMs want suggested actions, auditors want the hashes, authors want the flags." + +Top recommendations: expand action slugs (`kick-ci`) to human sentences (`"Re-run the failing CI job."`) via an `actions[].human` field — implemented as a sidecar glossary in `rules/action-glossary.yaml`; introduce a `label_set` config (default `dev`; `pm`, `qa`, `compliance` as presentational overrides) so headline labels match the reader; flag a portfolio dashboard as the first hosted-SaaS gravity seam — *do not build it yet*. + +### Brand (`research/19-report-brand-review.md`) + +Verdict: **pass-with-findings** (not S1-blocking under the sandbox scope). On-temperament (no emoji, no gradients, no icons, restrained density, ASCII `[+]/[-]/[?]` glyphs correctly used as monospace-iconography). Off-token: 18 distinct literal hex values, hand-picked font stacks, near-white page background where Specorator calls for cream. Section headers should be sentence-case with periods — implemented. Open ADR-shaped decision: Specorator has no red token; the `blocked` tier currently uses literal `#fdecea / #d8281b / #7a160d` and stays that way until graduation. + +### Auditor readability (`research/20-report-auditor-readability.md`) + +> "30-second test passes... what the 30-second test fails on: the report does not name what kind of system this is, who built it, what version of the workflow it governs, or what the verdict is binding against." + +Implemented: system-identity header (engine version + prominent timestamp), reproduce-block with the three replay anchors, verdict-tier glossary, glyph legend. Still open: provider identity / contact, model-card link, capabilities-and-limitations sentence, expected-lifetime statement, reviewer-of-record field. 
Closes `research/02`'s "human-readable rationale presentation" open item; the remainder is governance, not engineering. + +### Misread risks (`research/21-report-misread-risks.md`) + +Three flagged misread paths: + +1. **The skim trap** — a busy reader looks only at the verdict tile and action list, missing context. Implemented mitigation: blocker-by-absence banner is rendered at the same visual weight as the verdict card. +2. **`verified` badge as trust trap** — a green pill saying `verified` will read as "extraction verified" when it only means "extraction is bound to current inputs". Implemented mitigation: tooltip on the badge explicitly says *bound to current inputs, not flag-correctness*. Plus the `--skip-validate` banner is now visually equal-weight to the verdict so a skipped-validation run cannot pass undetected. +3. **Blocker-by-absence is the most dangerous skim path** — a high-priority blocker whose input flag was never extracted simply doesn't fire. Implemented mitigation: dedicated banner naming the missing flags AND the count of un-evaluable rules (counted using `matched === false`, fixed in Codex round 12 to exclude rules that fired via `when.any` despite one missing branch). + +## 4. 
What got implemented (wave-4 delta) + +The 12 changes that landed across the wave-4 implementer pass (Agent B's RALPH loop) and the subsequent Codex round 11–14 hardening: + +| # | Change | Research source | +|---|---|---| +| 1 | "What fired" section above audit trail | `17`, `20`, `21` | +| 2 | Non-matched rules collapsed via `<details>` | `17` | +| 3 | Blocker-by-absence banner naming missing flags | `21`, `17` | +| 4 | Suggested actions in priority-of-cause order | `17` | +| 5 | Action human-sentences from glossary | `18` | +| 6 | Provenance preamble + reproduce block + 12-char hash truncation | `17`, `20`, `18` | +| 7 | System-identity header + prominent timestamp | `20` | +| 8 | Verdict-tier + glyph legend in collapsed `<details>` | `20`, `17` | +| 9 | `cond--miss` row wash matching `cond--missing` amber | `17` | +| 10 | `@media (max-width: 540px)` single-column fallback | `17` | +| 11 | `--skip-validate` banner + `verified` badge tooltip | `18`, `21` | +| 12 | Sentence-case headers + imperative voice ("Take these actions.") | `19` | + +Plus three follow-up Codex hardenings that were caught after the wave-4 push: + +- **Round 11** (`90f3fe1`) — `openInBrowser` waits for the spawned process to exit cleanly (not just `spawn`); `takeOpt` rejects missing values for `--config` / `--target`. +- **Round 12** (`eb01077`) — `missingFlagNames` only counts rules whose final outcome was determined by absence (excludes `when.any` rules that matched another branch); reproduce-command paths are shell-quoted. +- **Round 13** (`003a05e`) — single-shot `cli.ts::takeOption` rejects missing `--html` values. + +Test surface for the report: 28 tests in `test/html-report.test.ts` plus the report-flow integration tests in `test/report-flow.test.ts`. + +## 5. 
What's still open + +Deferred, with the bucket each lives in: + +| Item | Source | Bucket | +|---|---|---| +| `label_set` config (dev / pm / qa / compliance presets) | `research/18` | Strategy slice 14–18 | +| Reader-specific export modes (PDF, markdown, Slack-friendly text) | `research/18` | Strategy slice | +| Portfolio dashboard (one HTML across N targets) | `research/18` | Hosted-SaaS gravity seam — explicitly *do not build* per strategist | +| Provider identity / model card / capabilities-and-limitations sentence | `research/20` | Governance (compliance.md "what's not in this POC") | +| Reviewer-of-record field, override workflow | `research/20` | Governance | +| Brand-token migration (replace 18 literal hex values with vars) | `research/19` | ADR at graduation | +| Diff-against-previous-run | `research/21` | Production prep | +| Confidence / uncertainty surface | `research/20` + `research/21` | Calibration study first | +| RAT-A / RAT-B / RAT-C / RAT-D / RAT-E / RAT-F | `research/07`, `research/14` | Discovery activity — needs users, not more engineering | + +## 6. Generating one + +```bash +# Plan (writes the prompt + sidecar) +npm run plan -- --target <id> + +# User pastes the prompt into Claude / ChatGPT / Gemini, saves JSON to +# extractions/<id>.json. + +# Validate (optional sanity check) +npm run validate -- --target <id> + +# Report (renders HTML, opens browser best-effort) +npm run report -- --target <id> +# Or without opening a browser: +npm run report -- --target <id> --no-open +``` + +Exit codes: `0` no blockers, `1` at least one `blocked` verdict, `2` missing / malformed extraction. + +For testing without the AI loop (single-shot fixture flow): + +```bash +npx tsx src/cli.ts rules/quality-gates.yaml fixtures/blocked-missing-ears.json --html /tmp/preview.html +``` + +## 7. Reading one + +For the **report consumer** (not the POC operator): + +1. **Verdict tile** — colour and label. That's the answer. +2. **Stats line** — how many rules fired? 
How many actions to take? +3. **Banners** (if present) — the blocker-by-absence banner flags missing inputs; the skip-validate banner means the validation gate was bypassed (treat the verdict as advisory). +4. **Take these actions** — the imperative-voice human sentences; if a `[code](#)` slug is shown, hover for the technical name. +5. **What fired** — the rules that drove the verdict, in priority-of-cause order. Each is a 1-paragraph card; expand the audit trail for everything that *didn't* fire. +6. **Provenance** — the three replay anchors (engine version, ruleset hash, flags hash) let you verify the report came from a specific tuple. The reproduce-command block lets you re-run it locally. + +A reader who reads only steps 1–4 should still get the answer correct. The rest is depth on demand. + +See [`docs/audit-trail.md`](audit-trail.md) for replay mechanics and [`docs/compliance.md`](compliance.md) for which sections speak to which regulation. From a09bee96e1926d4e245bd3ad96eeea75c7c72dd1 Mon Sep 17 00:00:00 2001 From: Claude <noreply@anthropic.com> Date: Sun, 17 May 2026 14:27:59 +0000 Subject: [PATCH 41/45] fix(rule-engine-poc): fileURLToPath + finite priority (Codex round 15) - scripts/run-all-html.mjs + scripts/run-all-fixtures.mjs: use fileURLToPath(new URL(...)) instead of .pathname. The bare .pathname keeps percent-encoding (paths with spaces break) and produces malformed Windows drive paths. Same fix in both scripts since they share the import.meta.url -> fixturesDir pattern (#526 P2). - loader.ts: reject non-finite 'priority' (NaN, Infinity) at load time. Same fail-fast discipline as weight + gt + lt. NaN priorities silently break the sort comparator (b.priority - a.priority returns NaN, treated as 0), reordering the audit trail unpredictably (#526 P2). - Two new loader tests cover .nan and .inf priority rejection. Suite: 173/173 passing. 
--- .../scripts/run-all-fixtures.mjs | 4 +- .../rule-engine-poc/scripts/run-all-html.mjs | 5 ++- experiments/rule-engine-poc/src/loader.ts | 9 ++++ .../rule-engine-poc/test/loader.test.ts | 42 +++++++++++++++++++ 4 files changed, 58 insertions(+), 2 deletions(-) diff --git a/experiments/rule-engine-poc/scripts/run-all-fixtures.mjs b/experiments/rule-engine-poc/scripts/run-all-fixtures.mjs index 50412f68d..11e09fe0c 100644 --- a/experiments/rule-engine-poc/scripts/run-all-fixtures.mjs +++ b/experiments/rule-engine-poc/scripts/run-all-fixtures.mjs @@ -1,8 +1,10 @@ import { readdirSync } from "node:fs"; import { join } from "node:path"; +import { fileURLToPath } from "node:url"; import { spawnSync } from "node:child_process"; -const fixturesDir = new URL("../fixtures/", import.meta.url).pathname; +// fileURLToPath handles percent-decoding and Windows drive paths. +const fixturesDir = fileURLToPath(new URL("../fixtures/", import.meta.url)); const rules = "rules/quality-gates.yaml"; const files = readdirSync(fixturesDir) diff --git a/experiments/rule-engine-poc/scripts/run-all-html.mjs b/experiments/rule-engine-poc/scripts/run-all-html.mjs index 551bd3918..be2dd412f 100644 --- a/experiments/rule-engine-poc/scripts/run-all-html.mjs +++ b/experiments/rule-engine-poc/scripts/run-all-html.mjs @@ -1,8 +1,11 @@ import { readdirSync, mkdirSync } from "node:fs"; import { join, basename } from "node:path"; +import { fileURLToPath } from "node:url"; import { spawnSync } from "node:child_process"; -const fixturesDir = new URL("../fixtures/", import.meta.url).pathname; +// fileURLToPath handles percent-decoding and Windows drive paths; +// `.pathname` alone breaks for both (Codex round 15 P2). 
+const fixturesDir = fileURLToPath(new URL("../fixtures/", import.meta.url)); const rules = "rules/quality-gates.yaml"; const reportsDir = "reports"; diff --git a/experiments/rule-engine-poc/src/loader.ts b/experiments/rule-engine-poc/src/loader.ts index 7de40ced2..a2454469f 100644 --- a/experiments/rule-engine-poc/src/loader.ts +++ b/experiments/rule-engine-poc/src/loader.ts @@ -137,6 +137,15 @@ function validate( if (typeof rule.priority !== "number") { throw new Error(`Rule '${rule.id}' missing numeric 'priority'`); } + // Codex round 15 P2: NaN/Infinity priorities silently break the + // documented sort order (b.priority - a.priority returns NaN, treated + // as 0), reordering the audit trail unpredictably. Same fail-fast + // discipline as weight + gt + lt. + if (!Number.isFinite(rule.priority)) { + throw new Error( + `Rule '${rule.id}' has non-finite 'priority' (got ${String(rule.priority)})`, + ); + } } const CONDITION_OPS = [ diff --git a/experiments/rule-engine-poc/test/loader.test.ts b/experiments/rule-engine-poc/test/loader.test.ts index 54bcc78c2..445392e94 100644 --- a/experiments/rule-engine-poc/test/loader.test.ts +++ b/experiments/rule-engine-poc/test/loader.test.ts @@ -463,6 +463,48 @@ describe("loader", () => { ).toThrow(/non-finite 'lt'/); }); + it("rejects rules with non-finite priority", () => { + expect(() => + loadRulesFromString( + ` +- id: r1 + priority: .nan + description: x + when: + all: + - flag: a + eq: true + then: + verdict: blocked + weight: 1 + actions: [a] +`, + "priority-nan", + ), + ).toThrow(/non-finite 'priority'/); + }); + + it("rejects rules with infinite priority", () => { + expect(() => + loadRulesFromString( + ` +- id: r1 + priority: .inf + description: x + when: + all: + - flag: a + eq: true + then: + verdict: blocked + weight: 1 + actions: [a] +`, + "priority-inf", + ), + ).toThrow(/non-finite 'priority'/); + }); + it("assigns a stable content hash to each rule", () => { const ruleA = loadRulesFromString( ` From 
90f58716f5677ef0b0deffa282c81894d54a8b38 Mon Sep 17 00:00:00 2001 From: Claude <noreply@anthropic.com> Date: Sun, 17 May 2026 14:35:50 +0000 Subject: [PATCH 42/45] fix(rule-engine-poc): null extraction values are errors, not warnings (Codex round 16) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR #526 Codex P1: the previous null handling was a warning with the message 'the engine will treat null and missing identically' — but that claim was false. The engine's hasOwnProperty presence check treats {flag: null} as PRESENT, so rules using 'exists' or 'ne' evaluate differently against {flag: null} than against {} despite the validator's reassurance. Two ways to make the claim true: (a) refuse null at the gate, or (b) make the engine treat null as missing. (a) is simpler and preserves the LLM prompt's 'omit unknowns' contract. Engine semantics stay unchanged; null just never reaches the engine on the workflow path. Single-shot cli.ts bypasses validate but its fixtures don't use null. - src/validate.ts: null flag values now produce a hard error (code: 'null-value-not-allowed') with a clear message explaining why null is not 'missing-equivalent'. - test/validate.test.ts: updated existing test to expect the error shape. Suite: 173/173 passing. Single test renamed; no count change. 
--- experiments/rule-engine-poc/src/validate.ts | 17 ++++++++++++----- .../rule-engine-poc/test/validate.test.ts | 9 ++++----- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/experiments/rule-engine-poc/src/validate.ts b/experiments/rule-engine-poc/src/validate.ts index 291fee8fb..85aee8fbd 100644 --- a/experiments/rule-engine-poc/src/validate.ts +++ b/experiments/rule-engine-poc/src/validate.ts @@ -104,13 +104,20 @@ export function validateExtraction( continue; } if (value === null) { - warnings.push({ - severity: "warning", - code: "null-value-omit-instead", + // Codex round 16 P1: previously a warning, but the engine's + // `hasOwnProperty` presence check treats null as PRESENT — so + // `exists`/`ne` rules behave differently against {flag: null} + // than against {} despite the validator's old "null ≈ missing" + // claim. The right fix is to refuse null at the gate so the + // engine never sees it; LLMs are instructed to omit unknowns. + errors.push({ + severity: "error", + code: "null-value-not-allowed", path: key, message: - `Flag '${key}' is null; prefer omitting unknowns over emitting null. ` + - `The engine will treat null and missing identically.`, + `Flag '${key}' is null; omit the field instead. 
` + + `The engine's presence check treats null as PRESENT, which can ` + + `silently change verdicts for rules using 'exists' or 'ne'.`, }); continue; } diff --git a/experiments/rule-engine-poc/test/validate.test.ts b/experiments/rule-engine-poc/test/validate.test.ts index 824f3593a..f4a7445fa 100644 --- a/experiments/rule-engine-poc/test/validate.test.ts +++ b/experiments/rule-engine-poc/test/validate.test.ts @@ -105,12 +105,11 @@ describe("validateExtraction", () => { expect(r.errors[0]!.code).toBe("disallowed-value"); }); - it("warns (but does not error) when a flag value is null", () => { + it("errors when a flag value is null (engine treats null as present)", () => { const r = validateExtraction({ ci_passing: null }, schema); - expect(r.ok).toBe(true); - expect(r.warnings).toHaveLength(1); - expect(r.warnings[0]!.code).toBe("null-value-omit-instead"); - expect(r.warnings[0]!.path).toBe("ci_passing"); + expect(r.ok).toBe(false); + expect(r.errors[0]!.code).toBe("null-value-not-allowed"); + expect(r.errors[0]!.path).toBe("ci_passing"); }); describe("with expectedPromptHash", () => { From 4929c08d5f881dbafc6b5d2093f6603314abcc47 Mon Sep 17 00:00:00 2001 From: Claude <noreply@anthropic.com> Date: Sun, 17 May 2026 14:43:37 +0000 Subject: [PATCH 43/45] fix(rule-engine-poc): Windows-safe reproduce command (Codex round 17) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR #526 round 17 P2: the reproCmd block emitted only POSIX single-quote-escaped paths. cmd.exe doesn't recognise single quotes as path delimiters, and PowerShell interprets them differently from bash — so an auditor on Windows copying the 'How to reproduce' command got a syntax error even when the report itself was valid. Now render both flavours side-by-side in the provenance section: - POSIX (macOS / Linux / WSL / Git Bash): single-quote escape, embedded ' becomes '\''. 
- Windows (cmd.exe / PowerShell): double-quote escape, embedded " becomes "" (Windows escape convention). The reader picks the right one. Both are labelled. Updated the existing reproCmd test to assert both flavours appear and that the POSIX/Windows labels are present. Suite: 173/173 passing. --- .../rule-engine-poc/src/html-report.ts | 20 ++++++++++++------- .../rule-engine-poc/test/html-report.test.ts | 11 ++++++++-- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/experiments/rule-engine-poc/src/html-report.ts b/experiments/rule-engine-poc/src/html-report.ts index 73346291b..c47a97dc4 100644 --- a/experiments/rule-engine-poc/src/html-report.ts +++ b/experiments/rule-engine-poc/src/html-report.ts @@ -269,12 +269,15 @@ export function renderHtmlReport( : ""; // Reproduce command: assembled from the same fields plan/report use. - // Codex round 12 P2: quote paths so paths with spaces or shell - // metacharacters (e.g., "My Projects/rules.yaml") don't break the - // command. Single-quote shell-escape: replace any ' inside the - // path with the four-char sequence '\'' . - const shellQuote = (s: string): string => `'${s.replace(/'/g, "'\\''")}'`; - const reproCmd = `npx tsx src/cli.ts ${shellQuote(ctx.rulesPath)} ${shellQuote(ctx.flagsPath)} --html <out.html> --quiet`; + // Render two flavours — POSIX (single-quote escape) and Windows + // (double-quote escape) — because the two shell families don't share + // a quoting syntax. POSIX shells eat double quotes inside single + // quotes verbatim; cmd.exe and PowerShell don't recognise single + // quotes as path delimiters at all. Codex round 17 P2. 
+ const posixQuote = (s: string): string => `'${s.replace(/'/g, "'\\''")}'`; + const windowsQuote = (s: string): string => `"${s.replace(/"/g, '""')}"`; + const reproCmdPosix = `npx tsx src/cli.ts ${posixQuote(ctx.rulesPath)} ${posixQuote(ctx.flagsPath)} --html <out.html> --quiet`; + const reproCmdWindows = `npx tsx src/cli.ts ${windowsQuote(ctx.rulesPath)} ${windowsQuote(ctx.flagsPath)} --html <out.html> --quiet`; return `<!doctype html> <html lang="en"> @@ -491,7 +494,10 @@ export function renderHtmlReport( </p> <div class="reproduce"> <p>How to reproduce — run from <code>experiments/rule-engine-poc/</code>:</p> - <pre><code>${esc(reproCmd)}</code></pre> + <p class="repro-label">POSIX (macOS, Linux, WSL, Git Bash):</p> + <pre><code>${esc(reproCmdPosix)}</code></pre> + <p class="repro-label">Windows (cmd.exe, PowerShell):</p> + <pre><code>${esc(reproCmdWindows)}</code></pre> <p>Then verify the three hashes above match the values in the regenerated report.</p> </div> </section> diff --git a/experiments/rule-engine-poc/test/html-report.test.ts b/experiments/rule-engine-poc/test/html-report.test.ts index 66cad50ac..eaae0cb14 100644 --- a/experiments/rule-engine-poc/test/html-report.test.ts +++ b/experiments/rule-engine-poc/test/html-report.test.ts @@ -289,6 +289,8 @@ describe("renderHtmlReport: provenance reframing", () => { it("shell-quotes paths in the reproduce command so spaces don't break it", () => { // Codex round 12 P2: unquoted paths break copy-pasted reproduce // commands on user machines (e.g., "My Projects/..."). + // Round 17 P2: render BOTH POSIX (single-quote) and Windows + // (double-quote) forms so the reader picks the right one. 
const flags: ExtractionFlags = { ci_failing: true }; const result = evaluate(rules, flags); const html = renderHtmlReport( @@ -299,10 +301,15 @@ describe("renderHtmlReport: provenance reframing", () => { flagsPath: "extractions with spaces/x.json", }), ); - // Single quotes are HTML-escaped (') in the rendered output, - // but they decode back to ' when the user pastes the command. + // POSIX form: single quotes (HTML-escaped to '). expect(html).toContain("'My Projects/rules.yaml'"); expect(html).toContain("'extractions with spaces/x.json'"); + // Windows form: double quotes (HTML-escaped to "). + expect(html).toContain(""My Projects/rules.yaml""); + expect(html).toContain(""extractions with spaces/x.json""); + // Both flavours are labelled so the reader knows which to use. + expect(html).toContain("POSIX"); + expect(html).toContain("Windows"); }); it("truncates ruleset and flags hashes to 12-char prefixes", () => { From c6585cd4890a313f71a40df5a6293c9037c89c5c Mon Sep 17 00:00:00 2001 From: Claude <noreply@anthropic.com> Date: Sun, 17 May 2026 14:47:20 +0000 Subject: [PATCH 44/45] fix(rule-engine-poc): reproduce command uses literal out.html (Codex round 18) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR #526 round 18 P2: the reproCmd block printed '--html <out.html>' meaning the angle brackets as a placeholder, but POSIX shells and cmd.exe / PowerShell parse <...> as input redirection. A copy-paste would silently send no value to --html and the command would fail. Replaced with a literal 'out.html' filename — the reader edits it if they want a different path, but the copy-paste path works on every shell without modification. Suite: 173/173 still passing (existing repro tests check for the quoted paths, not the filename — both flavours intact). 
--- experiments/rule-engine-poc/src/html-report.ts | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/experiments/rule-engine-poc/src/html-report.ts b/experiments/rule-engine-poc/src/html-report.ts index c47a97dc4..292daf05e 100644 --- a/experiments/rule-engine-poc/src/html-report.ts +++ b/experiments/rule-engine-poc/src/html-report.ts @@ -276,8 +276,12 @@ export function renderHtmlReport( // quotes as path delimiters at all. Codex round 17 P2. const posixQuote = (s: string): string => `'${s.replace(/'/g, "'\\''")}'`; const windowsQuote = (s: string): string => `"${s.replace(/"/g, '""')}"`; - const reproCmdPosix = `npx tsx src/cli.ts ${posixQuote(ctx.rulesPath)} ${posixQuote(ctx.flagsPath)} --html <out.html> --quiet`; - const reproCmdWindows = `npx tsx src/cli.ts ${windowsQuote(ctx.rulesPath)} ${windowsQuote(ctx.flagsPath)} --html <out.html> --quiet`; + // Use a literal filename, NOT `<out.html>`: angle brackets are shell + // I/O redirection on both POSIX and cmd.exe / PowerShell, so a + // copy-paste would silently consume the placeholder as a redirection + // target and `--html` would receive no value (Codex round 18 P2). + const reproCmdPosix = `npx tsx src/cli.ts ${posixQuote(ctx.rulesPath)} ${posixQuote(ctx.flagsPath)} --html out.html --quiet`; + const reproCmdWindows = `npx tsx src/cli.ts ${windowsQuote(ctx.rulesPath)} ${windowsQuote(ctx.flagsPath)} --html out.html --quiet`; return `<!doctype html> <html lang="en"> From f3a83276df0b19a8260e30120ab76c14521f55bd Mon Sep 17 00:00:00 2001 From: Claude <noreply@anthropic.com> Date: Sun, 17 May 2026 14:58:44 +0000 Subject: [PATCH 45/45] fix(rule-engine-poc): PowerShell-specific repro form (Codex round 19) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round 17's Windows form used double quotes for both cmd.exe and PowerShell, but PowerShell double-quoted strings EXPAND $var and $(...).
A path like 'src/$something/x.json' would be interpreted in PowerShell — a regression vs the prior POSIX form, which used single quotes. Split Windows into two flavours: - cmd.exe: double-quote escape (" -> ""). cmd doesn't expand $. - PowerShell: single-quote escape (' -> ''). Single quotes suppress PowerShell expansion. The HTML provenance block now renders three labelled forms instead of two. POSIX still uses POSIX-style single-quote escape ('\''). One new test asserts the PowerShell block uses single quotes; the existing repro test is updated to match the three-form layout. Suite: 174/174 passing. --- .../rule-engine-poc/src/html-report.ts | 32 +++++++++------- .../rule-engine-poc/test/html-report.test.ts | 37 +++++++++++++++---- 2 files changed, 48 insertions(+), 21 deletions(-) diff --git a/experiments/rule-engine-poc/src/html-report.ts b/experiments/rule-engine-poc/src/html-report.ts index 292daf05e..8a7ad95d1 100644 --- a/experiments/rule-engine-poc/src/html-report.ts +++ b/experiments/rule-engine-poc/src/html-report.ts @@ -268,20 +268,24 @@ export function renderHtmlReport( ? `<div class="banner banner--skip" role="alert"><strong>WARNING:</strong> validation gate was skipped (<code>--skip-validate</code>). Verdict and provenance are NOT verified against the flag schema or forbidden-fields policy.</div>` : ""; - // Reproduce command: assembled from the same fields plan/report use. - // Render two flavours — POSIX (single-quote escape) and Windows - // (double-quote escape) — because the two shell families don't share - // a quoting syntax. POSIX shells eat double quotes inside single - // quotes verbatim; cmd.exe and PowerShell don't recognise single - // quotes as path delimiters at all. Codex round 17 P2. + // Reproduce command: render three flavours because the supported + // shells disagree on quoting AND on variable expansion: + // - POSIX (bash / zsh / sh): single-quote escape (' becomes '\''). + // Single quotes suppress $var expansion.
+ // - cmd.exe: double-quote escape (" becomes ""). cmd doesn't + // expand $var; it expands %VAR%, but our paths don't carry %. + // - PowerShell: single-quote escape (' becomes ''). Double quotes + // in PowerShell EXPAND $var and $(), which can mutate the path + // (Codex round 19 P2). const posixQuote = (s: string): string => `'${s.replace(/'/g, "'\\''")}'`; - const windowsQuote = (s: string): string => `"${s.replace(/"/g, '""')}"`; + const cmdQuote = (s: string): string => `"${s.replace(/"/g, '""')}"`; + const psQuote = (s: string): string => `'${s.replace(/'/g, "''")}'`; // Use a literal filename, NOT `<out.html>`: angle brackets are shell - // I/O redirection on both POSIX and cmd.exe / PowerShell, so a - // copy-paste would silently consume the placeholder as a redirection - // target and `--html` would receive no value (Codex round 18 P2). + // I/O redirection on all three, so a copy-paste would silently send + // --html no value (Codex round 18 P2). const reproCmdPosix = `npx tsx src/cli.ts ${posixQuote(ctx.rulesPath)} ${posixQuote(ctx.flagsPath)} --html out.html --quiet`; - const reproCmdWindows = `npx tsx src/cli.ts ${windowsQuote(ctx.rulesPath)} ${windowsQuote(ctx.flagsPath)} --html out.html --quiet`; + const reproCmdCmd = `npx tsx src/cli.ts ${cmdQuote(ctx.rulesPath)} ${cmdQuote(ctx.flagsPath)} --html out.html --quiet`; + const reproCmdPwsh = `npx tsx src/cli.ts ${psQuote(ctx.rulesPath)} ${psQuote(ctx.flagsPath)} --html out.html --quiet`; return `<!doctype html> <html lang="en"> @@ -500,8 +504,10 @@ export function renderHtmlReport( <p>How to reproduce — run from <code>experiments/rule-engine-poc/</code>:</p> <p class="repro-label">POSIX (macOS, Linux, WSL, Git Bash):</p> <pre><code>${esc(reproCmdPosix)}</code></pre> - <p class="repro-label">Windows (cmd.exe, PowerShell):</p> - <pre><code>${esc(reproCmdWindows)}</code></pre> + <p class="repro-label">Windows cmd.exe:</p> + <pre><code>${esc(reproCmdCmd)}</code></pre> + <p class="repro-label">PowerShell 
(Windows / cross-platform):</p> + <pre><code>${esc(reproCmdPwsh)}</code></pre> <p>Then verify the three hashes above match the values in the regenerated report.</p> </div> </section> diff --git a/experiments/rule-engine-poc/test/html-report.test.ts b/experiments/rule-engine-poc/test/html-report.test.ts index eaae0cb14..22c33b2c3 100644 --- a/experiments/rule-engine-poc/test/html-report.test.ts +++ b/experiments/rule-engine-poc/test/html-report.test.ts @@ -287,10 +287,8 @@ describe("renderHtmlReport: provenance reframing", () => { }); it("shell-quotes paths in the reproduce command so spaces don't break it", () => { - // Codex round 12 P2: unquoted paths break copy-pasted reproduce - // commands on user machines (e.g., "My Projects/..."). - // Round 17 P2: render BOTH POSIX (single-quote) and Windows - // (double-quote) forms so the reader picks the right one. + // Codex rounds 12/17/19: render three forms — POSIX, cmd.exe, + // PowerShell — because each uses different quoting + expansion. const flags: ExtractionFlags = { ci_failing: true }; const result = evaluate(rules, flags); const html = renderHtmlReport( @@ -301,15 +299,38 @@ describe("renderHtmlReport: provenance reframing", () => { flagsPath: "extractions with spaces/x.json", }), ); - // POSIX form: single quotes (HTML-escaped to '). + // POSIX + PowerShell: single quotes (HTML-escaped to '). expect(html).toContain("'My Projects/rules.yaml'"); expect(html).toContain("'extractions with spaces/x.json'"); - // Windows form: double quotes (HTML-escaped to "). + // cmd.exe: double quotes (HTML-escaped to "). expect(html).toContain(""My Projects/rules.yaml""); expect(html).toContain(""extractions with spaces/x.json""); - // Both flavours are labelled so the reader knows which to use. + // All three labels appear. 
expect(html).toContain("POSIX"); - expect(html).toContain("Windows"); + expect(html).toContain("cmd.exe"); + expect(html).toContain("PowerShell"); + }); + + it("PowerShell repro form uses single quotes so $var stays literal", () => { + // Codex round 19 P2: PowerShell double-quoted strings expand + // $var / $(...) — single quotes suppress that. The PowerShell + // labelled block must use single quotes so a path like + // 'src/$something/x.json' is reproduced literally. + const flags: ExtractionFlags = { ci_failing: true }; + const result = evaluate(rules, flags); + const html = renderHtmlReport( + result, + baseCtx({ + flags, + rulesPath: "My$Projects/rules.yaml", + flagsPath: "src/$something/x.json", + }), + ); + const psStart = html.indexOf("PowerShell"); + expect(psStart).toBeGreaterThan(-1); + const psBlock = html.slice(psStart, psStart + 800); + expect(psBlock).toContain("'My$Projects/rules.yaml'"); + expect(psBlock).toContain("'src/$something/x.json'"); }); it("truncates ruleset and flags hashes to 12-char prefixes", () => {