diff --git a/.changeset/design-audit-8-layer-architecture.md b/.changeset/design-audit-8-layer-architecture.md new file mode 100644 index 0000000..56169a1 --- /dev/null +++ b/.changeset/design-audit-8-layer-architecture.md @@ -0,0 +1,56 @@ +--- +'@tangle-network/browser-agent-driver': minor +--- + +feat(design-audit): 8-layer architecture — Layers 1-7 fully shipped, Layer 8 scaffold + +Full implementation of RFC-002: World-Class Design Audit. Primary consumer is coding agents (Claude Code, Codex, OpenCode, Pi); the architecture is JSON-first, tool-callable, and self-explaining when uncertain. + +**Layer 1 — Multi-dimensional scoring** _(shipped)_ +- Ensemble classifier (URL pattern + DOM heuristic + LLM tiebreaker) with `ensembleConfidence`, `signalsAgreed`, `dissent`. +- Five universal dimensions: `product_intent / visual_craft / trust_clarity / workflow / content_ia`. +- Per-page-type rollup weights (saas-app, marketing, dashboard, docs, ecommerce, social, tool, blog, utility). +- Per-page-type calibration anchors (`rubric/anchors/*.yaml`) so app surfaces aren't judged against marketing-site polish. +- `AuditResult_v2` emitted alongside v1 shape; v1 deprecated with one-release lag. + +**Layer 2 — Patch primitives** _(shipped)_ +- Every major/critical finding now ships `patches[]` with `target`, `diff.before`/`after`, `testThatProves`, `rollback`, `estimatedDelta`, and `estimatedDeltaConfidence`. +- `diff.before` is validated as a substring of the page snapshot at parse time — agents apply patches literally without re-authoring. +- Severity enforcement: findings without valid patches are downgraded from major/critical to minor. +- `patches/render.ts`: renders `unifiedDiff` from before/after when `target.filePath` is known (`git apply`-able). + +**Layer 3 — First-principles fallback** _(shipped)_ +- Fires when `ensembleConfidence < 0.6`, signals disagree, or page type is `unknown`. 
+- Scores against 5 universal product principles only (primary-job clarity, action obviousness, state preview, trust-before-commitment, recovery-from-failure). +- Sets `rollup.confidence = 'low'`; emits `NovelPatternObservation` to `~/.bad/novel-patterns/` for fleet mining. +- New rubric fragment `first-principles.md` carries the exact prompt that fires in this mode. + +**Layer 4 — Outcome attribution** _(shipped)_ +- `bad design-audit ack-patch --pre-run-id ` — records that an agent applied a patch. +- `bad design-audit --post-patch ` on re-audit — computes observed delta vs predicted, writes `agreementScore`. +- JSONL store at `~/.bad/attribution/applications/`. Append-only — outcomes are new events, not mutations. +- `aggregatePatchReliability()` cross-tenant rollup: groups by `patchHash = sha256(before+after+scope).slice(0,16)`. After N≥30 / ≥5 tenants / replicationRate≥0.7 → `recommendation: 'recommended'`. + +**Layer 5 — Pattern library** _(scaffold)_ +- `patterns/{store,mine,match}.ts` + `cli-patterns.ts` (`bad patterns query|show`). +- Cold-start: library is empty until ~6 weeks of attribution data accumulates. Mine threshold: N≥30, ≥5 tenants, replicationRate≥0.7. Mining impl is a TODO; the query API and types are stable. + +**Layer 6 — Composable predicates** _(shipped)_ +- `AppliesWhen` extended with `audience`, `modality`, `regulatoryContext`, `audienceVulnerability`. +- 9 new rubric fragments: `audience-{clinician,kids,developer}.md`, `regulatory-{hipaa,gdpr,coppa}.md`, `modality-{mobile,tablet}.md`, `audience-vulnerability-minor-facing.md`. +- Rubric loader matches new predicates when context provided via `--audience`, `--modality`, `--regulatory`, `--audience-vulnerability` CLI flags. + +**Layer 7 — Domain ethics gate** _(shipped)_ +- 4 rule files (medical, kids, finance, legal) with citation-backed rules (FDA 21 CFR 201.57, COPPA 16 CFR 312.5, TILA/Reg Z, GDPR). +- Hard rollup floor: `critical-floor → 4`, `major-floor → 6`. 
`preEthicsScore` preserves the LLM's uncapped score. +- `--skip-ethics` bypass (test-only, logged + warned), `--ethics-rules-dir` override. +- 8 paired pass/fail fixtures in `bench/design/ethics-fixtures/`. + +**Layer 8 — Modality adapters** _(scaffold)_ +- `modality/{types,html,ios,android,index}.ts`. HTML adapter wraps existing Playwright pipeline. iOS and Android throw `NotImplementedError` with clear message. `--modality html|ios|android` dispatches to the right adapter. + +**Skill contract updates:** +- `~/code/dotfiles/claude/skills/bad/SKILL.md`: patch consumption loop, Layer 3-8 contract, ack-patch / --post-patch close-the-loop, ethics floor priority rule. +- `skills/design-evolve/SKILL.md`: Phase 3 (apply fixes) now patch-first; Phase 4 includes attribution close-the-loop. + +**Tests:** +40 new tests across `design-audit-patch-{parse,validate}`, `design-audit-first-principles`, `design-audit-attribution`. Total: 1393 passing. diff --git a/.changeset/design-audit-layer-1-foundation.md b/.changeset/design-audit-layer-1-foundation.md new file mode 100644 index 0000000..9da7d94 --- /dev/null +++ b/.changeset/design-audit-layer-1-foundation.md @@ -0,0 +1,19 @@ +--- +'@tangle-network/browser-agent-driver': minor +--- + +feat(design-audit): Layer 1 — multi-dim scoring foundation + +Land the first layer of the world-class 8-layer design-audit architecture (RFC `docs/rfc/design-audit-world-class.md`). This release ships: + +- **Ensemble classifier** (`src/design/audit/classify-ensemble.ts`) — three-signal vote (URL pattern + DOM heuristic + LLM tiebreaker) with explicit `ensembleConfidence`, `signalsAgreed`, and `dissent` records. URL+DOM agreement above the 0.7 threshold skips the LLM call entirely. +- **Per-page-type rollup weights** (`src/design/audit/rubric/rollup-weights.ts`) — saas-app, marketing, dashboard, docs, ecommerce, social, tool, blog, utility, plus `default`/`unknown` fallbacks. Module-load invariant: every weight set sums to 1.0 ± 1e-6. 
+- **Per-page-type calibration anchors** (`src/design/audit/rubric/anchors/*.yaml`) — 9 anchor files referencing real product 9-10 examples (Linear's app, Figma, Notion, Stripe, MDN, Apple Store, Threads, Stratechery, Vercel deploys, etc.) so saas-app surfaces are no longer judged against marketing-site polish. +- **Multi-dim scoring** (`src/design/audit/v2/score.ts`) — five universal dimensions (product_intent / visual_craft / trust_clarity / workflow / content_ia) each with `score`, `range`, `confidence`. Rollup is a weighted aggregate with conservative confidence (any dim `low` → rollup `low`). +- **`AuditResult_v2`** — emitted alongside the v1 shape in `report.json` under a top-level `v2` block. One-release deprecation window before v1 is removed. +- **`--audit-passes auto`** — new default that runs the ensemble classifier first, then picks the focused pass bundle for that classification. +- **CLI summary** — per-page console output now prints the 5-dimension breakdown plus rollup formula. + +Backwards compat: all existing v1 fields (`score`, `findings`, `summary`, `strengths`, etc.) remain on `PageAuditResult` and `report.json`. Consumers should migrate to `report.v2.pages[].scores` over the next release. + +Skill update: `skills/bad/SKILL.md` documents the new JSON shape with an agent-side worked example for choosing which dimension to invest in based on `score × weight` leverage. diff --git a/.changeset/design-audit-layer-7-ethics-gate.md b/.changeset/design-audit-layer-7-ethics-gate.md new file mode 100644 index 0000000..f325348 --- /dev/null +++ b/.changeset/design-audit-layer-7-ethics-gate.md @@ -0,0 +1,16 @@ +--- +'@tangle-network/browser-agent-driver': minor +--- + +feat(design-audit): Layer 7 — domain ethics gate (+ Layer 6 composable predicates) + +Adds a hard score floor for pages that fail domain-specific ethics rules and the predicate vocabulary that lets those rules target the right audience/modality/regulatory context. 
RFC: `docs/rfc/design-audit-world-class.md`. + +- **Ethics rule set** (`src/design/audit/ethics/rules/{medical,kids,finance,legal}.yaml`) — curated, citation-backed rules covering medication dosage disclosure (FDA 21 CFR 201.57), kid-facing dark-pattern guards (COPPA, FTC Endorsement Guides), finance fee disclosure (TILA / Reg Z), and legal disclaimer presence. +- **Detector kinds** (`src/design/audit/ethics/check.ts`) — `pattern-absent`, `pattern-present`, `llm-classifier`. Pattern checks are case-insensitive against page text; the LLM classifier asks for a single yes/no token to keep latency + cost predictable. +- **Hard rollup floor** — a `critical-floor` violation caps the rollup at 4; `major-floor` caps at 6. `PageAuditResult.preEthicsScore` preserves the LLM's pre-cap score so reports can show "would have scored 8, capped at 4 — fix the dosage disclosure". +- **Composable predicates (Layer 6)** — extends `AppliesWhen` with `audience`, `modality`, `regulatoryContext`, and `audienceVulnerability`. A pediatric medical app on tablet for clinicians now matches the medical *and* kids rule sets simultaneously instead of forcing one classification. +- **CLI flags**: `--skip-ethics` (test-only bypass, audited + warned), `--ethics-rules-dir ` (override the builtin yaml), `--audience`, `--modality`, `--audience-vulnerability` (comma-separated tag lists threaded into rule matching). +- **Fixtures** (`bench/design/ethics-fixtures/`) — paired pass/fail HTML for each rule category, used by `tests/design-audit-ethics-{rules,check}.test.ts`. + +Backwards compat: rules ship empty by default for any classification not on the curated list, so existing audits see no change unless they opt in via `--audience`/`--modality` or land on a covered domain. `EthicsViolation` is exported from both `src/design/audit/types.ts` and `v2/types.ts`; `PageAuditResult.ethicsViolations` is optional. 
diff --git a/bench/design/ethics-fixtures/finance-disclosed-fees.html b/bench/design/ethics-fixtures/finance-disclosed-fees.html new file mode 100644 index 0000000..11edf4f --- /dev/null +++ b/bench/design/ethics-fixtures/finance-disclosed-fees.html @@ -0,0 +1,24 @@ + + + + + SwiftPay — review and confirm + + +
+

Review your payment

+
+
You send
$1,000.00 USD
+
Network fee
$2.50
+
FX rate
1 USD = 0.92 EUR
+
Conversion fee (1.5%)
$15.00
+
Tax (VAT)
$3.40
+
Total charged
$1,020.90 USD
+
Recipient gets
€905.30 EUR
+
APR (effective annual percentage rate, if held)
0.0%
+
+

All fees, taxes, and FX are disclosed above. By tapping Pay you authorize SwiftPay to charge $1,020.90.

+ +
+ + diff --git a/bench/design/ethics-fixtures/finance-hidden-fees.html b/bench/design/ethics-fixtures/finance-hidden-fees.html new file mode 100644 index 0000000..0389365 --- /dev/null +++ b/bench/design/ethics-fixtures/finance-hidden-fees.html @@ -0,0 +1,16 @@ + + + + + SwiftPay — Send money + + +
+

Send $1,000 to Alex

+

You send: $1,000.00

+

Recipient gets: $1,000.00

+ +

Network, FX, conversion, and processing fees may apply. See terms.

+
+ + diff --git a/bench/design/ethics-fixtures/gdpr-no-consent.html b/bench/design/ethics-fixtures/gdpr-no-consent.html new file mode 100644 index 0000000..a4b7517 --- /dev/null +++ b/bench/design/ethics-fixtures/gdpr-no-consent.html @@ -0,0 +1,16 @@ + + + + + EuroShop — Home + + +

EuroShop

+
+

Best deals in Berlin today

+

Trending products curated for the EU market.

+ +
+

EuroShop GmbH 2026

+ + diff --git a/bench/design/ethics-fixtures/gdpr-with-consent.html b/bench/design/ethics-fixtures/gdpr-with-consent.html new file mode 100644 index 0000000..09c0a01 --- /dev/null +++ b/bench/design/ethics-fixtures/gdpr-with-consent.html @@ -0,0 +1,31 @@ + + + + + EuroShop — Home + + +
+

We use cookies

+

We use necessary and optional cookies. Manage your preferences below.

+
+ Cookie consent preferences + + + +
+ + +
+

EuroShop

+
+

Best deals in Berlin today

+

Trending products curated for the EU market.

+ +
+ + + diff --git a/bench/design/ethics-fixtures/kids-age-gated.html b/bench/design/ethics-fixtures/kids-age-gated.html new file mode 100644 index 0000000..def4102 --- /dev/null +++ b/bench/design/ethics-fixtures/kids-age-gated.html @@ -0,0 +1,19 @@ + + + + + SuperKids — verify your age + + +
+

Welcome to SuperKids

+
+ + +

Enter your age so we can keep things appropriate. Parents will be notified.

+ +
+

We only collect what is strictly necessary for the activity. Optional fields are clearly marked.

+
+ + diff --git a/bench/design/ethics-fixtures/kids-dark-pattern.html b/bench/design/ethics-fixtures/kids-dark-pattern.html new file mode 100644 index 0000000..3a1fd44 --- /dev/null +++ b/bench/design/ethics-fixtures/kids-dark-pattern.html @@ -0,0 +1,17 @@ + + + + + SuperKids Coins! + + +
+

Hi friend! Get 100 SuperCoins NOW!

+

HURRY! Only 3 left!

+

Tap YES or your unicorn will be sad forever 🥺

+ + no thanks, I hate fun +

Cost: $4.99 charged to mom's card. Auto-renews monthly. Cancel by writing a letter.

+
+ + diff --git a/bench/design/ethics-fixtures/medical-no-dosage.html b/bench/design/ethics-fixtures/medical-no-dosage.html new file mode 100644 index 0000000..6965960 --- /dev/null +++ b/bench/design/ethics-fixtures/medical-no-dosage.html @@ -0,0 +1,18 @@ + + + + + RxMed — Order amoxicillin + + +

RxMed Pharmacy

+
+

Amoxicillin 500mg

+

Take this medication as your doctor recommends.

+

Available in 30-tablet packs.

+ + +
+

RxMed © 2026

+ + diff --git a/bench/design/ethics-fixtures/medical-with-dosage.html b/bench/design/ethics-fixtures/medical-with-dosage.html new file mode 100644 index 0000000..1504c73 --- /dev/null +++ b/bench/design/ethics-fixtures/medical-with-dosage.html @@ -0,0 +1,24 @@ + + + + + RxMed — Order amoxicillin (Rx) + + +

RxMed Pharmacy

+
+

Amoxicillin 500mg — Prescribing Information

+
+

Dosage and administration

+

Adults: 500 mg orally every 8 hours. Adjust dosage for renal impairment.

+
+
+

Warnings and contraindications

+

Contraindication: hypersensitivity to penicillin.

+

Adverse effects: nausea, diarrhea, rare anaphylaxis. Report any side effect to MedWatch (FDA 1088).

+
+ +

Report a side effect (MedWatch).

+
+ + diff --git a/package.json b/package.json index e48be73..00a47db 100644 --- a/package.json +++ b/package.json @@ -133,6 +133,7 @@ "pixelmatch": "^7.1.0", "playwright": "^1.40.0", "pngjs": "^7.0.0", + "tsx": "^4.21.0", "typescript": "^5.3.0", "vitest": "^4.0.18" } diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index db1fe17..c30b3e0 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -66,12 +66,15 @@ importers: pngjs: specifier: ^7.0.0 version: 7.0.0 + tsx: + specifier: ^4.21.0 + version: 4.21.0 typescript: specifier: ^5.3.0 version: 5.9.3 vitest: specifier: ^4.0.18 - version: 4.0.18(@opentelemetry/api@1.9.0)(@types/node@20.19.35) + version: 4.0.18(@opentelemetry/api@1.9.0)(@types/node@20.19.35)(tsx@4.21.0) packages: @@ -376,56 +379,66 @@ packages: resolution: {integrity: sha512-9B+taZ8DlyyqzZQnoeIvDVR/2F4EbMepXMc/NdVbkzsJbzkUjhXv/70GQJ7tdLA4YJgNP25zukcxpX2/SueNrA==} cpu: [arm64] os: [linux] + libc: [glibc] '@img/sharp-libvips-linux-arm@1.0.5': resolution: {integrity: sha512-gvcC4ACAOPRNATg/ov8/MnbxFDJqf/pDePbBnuBDcjsI8PssmjoKMAz4LtLaVi+OnSb5FK/yIOamqDwGmXW32g==} cpu: [arm] os: [linux] + libc: [glibc] '@img/sharp-libvips-linux-x64@1.0.4': resolution: {integrity: sha512-MmWmQ3iPFZr0Iev+BAgVMb3ZyC4KeFc3jFxnNbEPas60e1cIfevbtuyf9nDGIzOaW9PdnDciJm+wFFaTlj5xYw==} cpu: [x64] os: [linux] + libc: [glibc] '@img/sharp-libvips-linuxmusl-arm64@1.0.4': resolution: {integrity: sha512-9Ti+BbTYDcsbp4wfYib8Ctm1ilkugkA/uscUn6UXK1ldpC1JjiXbLfFZtRlBhjPZ5o1NCLiDbg8fhUPKStHoTA==} cpu: [arm64] os: [linux] + libc: [musl] '@img/sharp-libvips-linuxmusl-x64@1.0.4': resolution: {integrity: sha512-viYN1KX9m+/hGkJtvYYp+CCLgnJXwiQB39damAO7WMdKWlIhmYTfHjwSbQeUK/20vY154mwezd9HflVFM1wVSw==} cpu: [x64] os: [linux] + libc: [musl] '@img/sharp-linux-arm64@0.33.5': resolution: {integrity: sha512-JMVv+AMRyGOHtO1RFBiJy/MBsgz0x4AWrT6QoEVVTyh1E39TrCUpTRI7mx9VksGX4awWASxqCYLCV4wBZHAYxA==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [arm64] os: [linux] + libc: [glibc] '@img/sharp-linux-arm@0.33.5': 
resolution: {integrity: sha512-JTS1eldqZbJxjvKaAkxhZmBqPRGmxgu+qFKSInv8moZ2AmT5Yib3EQ1c6gp493HvrvV8QgdOXdyaIBrhvFhBMQ==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [arm] os: [linux] + libc: [glibc] '@img/sharp-linux-x64@0.33.5': resolution: {integrity: sha512-opC+Ok5pRNAzuvq1AG0ar+1owsu842/Ab+4qvU879ippJBHvyY5n2mxF1izXqkPYlGuP/M556uh53jRLJmzTWA==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [x64] os: [linux] + libc: [glibc] '@img/sharp-linuxmusl-arm64@0.33.5': resolution: {integrity: sha512-XrHMZwGQGvJg2V/oRSUfSAfjfPxO+4DkiRh6p2AFjLQztWUuY/o8Mq0eMQVIY7HJ1CDQUJlxGGZRw1a5bqmd1g==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [arm64] os: [linux] + libc: [musl] '@img/sharp-linuxmusl-x64@0.33.5': resolution: {integrity: sha512-WT+d/cgqKkkKySYmqoZ8y3pxx7lx9vVejxW/W4DOFMYVSkErR+w7mf2u8m/y4+xHe7yY9DAXQMWQhpnMuFfScw==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [x64] os: [linux] + libc: [musl] '@img/sharp-win32-x64@0.33.5': resolution: {integrity: sha512-MpY/o8/8kj+EcnxwvrP4aTJSWw/aZ7JIGR4aBeZkZw5B7/Jn+tY9/VNwtcoGmdT7GfggGIU4kygOMSbYnOrAbg==} @@ -542,66 +555,79 @@ packages: resolution: {integrity: sha512-t4ONHboXi/3E0rT6OZl1pKbl2Vgxf9vJfWgmUoCEVQVxhW6Cw/c8I6hbbu7DAvgp82RKiH7TpLwxnJeKv2pbsw==} cpu: [arm] os: [linux] + libc: [glibc] '@rollup/rollup-linux-arm-musleabihf@4.59.0': resolution: {integrity: sha512-CikFT7aYPA2ufMD086cVORBYGHffBo4K8MQ4uPS/ZnY54GKj36i196u8U+aDVT2LX4eSMbyHtyOh7D7Zvk2VvA==} cpu: [arm] os: [linux] + libc: [musl] '@rollup/rollup-linux-arm64-gnu@4.59.0': resolution: {integrity: sha512-jYgUGk5aLd1nUb1CtQ8E+t5JhLc9x5WdBKew9ZgAXg7DBk0ZHErLHdXM24rfX+bKrFe+Xp5YuJo54I5HFjGDAA==} cpu: [arm64] os: [linux] + libc: [glibc] '@rollup/rollup-linux-arm64-musl@4.59.0': resolution: {integrity: sha512-peZRVEdnFWZ5Bh2KeumKG9ty7aCXzzEsHShOZEFiCQlDEepP1dpUl/SrUNXNg13UmZl+gzVDPsiCwnV1uI0RUA==} cpu: [arm64] os: [linux] + libc: [musl] '@rollup/rollup-linux-loong64-gnu@4.59.0': resolution: {integrity: 
sha512-gbUSW/97f7+r4gHy3Jlup8zDG190AuodsWnNiXErp9mT90iCy9NKKU0Xwx5k8VlRAIV2uU9CsMnEFg/xXaOfXg==} cpu: [loong64] os: [linux] + libc: [glibc] '@rollup/rollup-linux-loong64-musl@4.59.0': resolution: {integrity: sha512-yTRONe79E+o0FWFijasoTjtzG9EBedFXJMl888NBEDCDV9I2wGbFFfJQQe63OijbFCUZqxpHz1GzpbtSFikJ4Q==} cpu: [loong64] os: [linux] + libc: [musl] '@rollup/rollup-linux-ppc64-gnu@4.59.0': resolution: {integrity: sha512-sw1o3tfyk12k3OEpRddF68a1unZ5VCN7zoTNtSn2KndUE+ea3m3ROOKRCZxEpmT9nsGnogpFP9x6mnLTCaoLkA==} cpu: [ppc64] os: [linux] + libc: [glibc] '@rollup/rollup-linux-ppc64-musl@4.59.0': resolution: {integrity: sha512-+2kLtQ4xT3AiIxkzFVFXfsmlZiG5FXYW7ZyIIvGA7Bdeuh9Z0aN4hVyXS/G1E9bTP/vqszNIN/pUKCk/BTHsKA==} cpu: [ppc64] os: [linux] + libc: [musl] '@rollup/rollup-linux-riscv64-gnu@4.59.0': resolution: {integrity: sha512-NDYMpsXYJJaj+I7UdwIuHHNxXZ/b/N2hR15NyH3m2qAtb/hHPA4g4SuuvrdxetTdndfj9b1WOmy73kcPRoERUg==} cpu: [riscv64] os: [linux] + libc: [glibc] '@rollup/rollup-linux-riscv64-musl@4.59.0': resolution: {integrity: sha512-nLckB8WOqHIf1bhymk+oHxvM9D3tyPndZH8i8+35p/1YiVoVswPid2yLzgX7ZJP0KQvnkhM4H6QZ5m0LzbyIAg==} cpu: [riscv64] os: [linux] + libc: [musl] '@rollup/rollup-linux-s390x-gnu@4.59.0': resolution: {integrity: sha512-oF87Ie3uAIvORFBpwnCvUzdeYUqi2wY6jRFWJAy1qus/udHFYIkplYRW+wo+GRUP4sKzYdmE1Y3+rY5Gc4ZO+w==} cpu: [s390x] os: [linux] + libc: [glibc] '@rollup/rollup-linux-x64-gnu@4.59.0': resolution: {integrity: sha512-3AHmtQq/ppNuUspKAlvA8HtLybkDflkMuLK4DPo77DfthRb71V84/c4MlWJXixZz4uruIH4uaa07IqoAkG64fg==} cpu: [x64] os: [linux] + libc: [glibc] '@rollup/rollup-linux-x64-musl@4.59.0': resolution: {integrity: sha512-2UdiwS/9cTAx7qIUZB/fWtToJwvt0Vbo0zmnYt7ED35KPg13Q0ym1g442THLC7VyI6JfYTP4PiSOWyoMdV2/xg==} cpu: [x64] os: [linux] + libc: [musl] '@rollup/rollup-openbsd-x64@4.59.0': resolution: {integrity: sha512-M3bLRAVk6GOwFlPTIxVBSYKUaqfLrn8l0psKinkCFxl4lQvOSz8ZrKDz2gxcBwHFpci0B6rttydI4IpS4IS/jQ==} @@ -890,6 +916,9 @@ packages: engines: {node: ^8.16.0 || ^10.6.0 || 
>=11.0.0} os: [darwin] + get-tsconfig@4.14.0: + resolution: {integrity: sha512-yTb+8DXzDREzgvYmh6s9vHsSVCHeC0G3PI5bEXNBHtmshPnO+S5O7qgLEOn0I5QvMy6kpZN8K1NKGyilLb93wA==} + glob-parent@5.1.2: resolution: {integrity: sha512-AOIgSQCepiJYwP3ARnGx+5VnTu2HBYdzbGP45eLw1vr3zB3vZLeyed1sC9hnbcOc9/SrMyM5RPQrkGz4aS9Zow==} engines: {node: '>= 6'} @@ -1165,6 +1194,9 @@ packages: resolution: {integrity: sha512-qYg9KP24dD5qka9J47d0aVky0N+b4fTU89LN9iDnjB5waksiC49rvMB0PrUJQGoTmH50XPiqOvAjDfaijGxYZw==} engines: {node: '>=8'} + resolve-pkg-maps@1.0.0: + resolution: {integrity: sha512-seS2Tj26TBVOC2NIc2rOe2y2ZO7efxITtLZcGSOnHHNOQ7CkiUBfw0Iw2ck6xkIhPwLhKNLS8BO+hEpngQlqzw==} + reusify@1.1.0: resolution: {integrity: sha512-g6QUff04oZpHs0eG5p83rFLhHeV00ug/Yf9nZM6fLeUrPguBTkTQOdpAWWspMh55TZfVQDPaN3NQJfbVRAxdIw==} engines: {iojs: '>=1.0.0', node: '>=0.10.0'} @@ -1260,6 +1292,11 @@ packages: tr46@0.0.3: resolution: {integrity: sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==} + tsx@4.21.0: + resolution: {integrity: sha512-5C1sg4USs1lfG0GFb2RLXsdpXqBSEhAaA/0kPL01wxzpMqLILNxIxIOKiILz+cdg/pLnOUxFYOR5yhHU666wbw==} + engines: {node: '>=18.0.0'} + hasBin: true + typedarray@0.0.6: resolution: {integrity: sha512-/aCDEGatGvZ2BIk+HmLf4ifCJFwvKFNb9/JeZPMulfgFracn9QFcAf5GO8B/mweUjSoblS5In0cWhqpfs/5PQA==} @@ -1913,13 +1950,13 @@ snapshots: chai: 6.2.2 tinyrainbow: 3.0.3 - '@vitest/mocker@4.0.18(vite@7.3.1(@types/node@20.19.35))': + '@vitest/mocker@4.0.18(vite@7.3.1(@types/node@20.19.35)(tsx@4.21.0))': dependencies: '@vitest/spy': 4.0.18 estree-walker: 3.0.3 magic-string: 0.30.21 optionalDependencies: - vite: 7.3.1(@types/node@20.19.35) + vite: 7.3.1(@types/node@20.19.35)(tsx@4.21.0) '@vitest/pretty-format@4.0.18': dependencies: @@ -2159,6 +2196,10 @@ snapshots: fsevents@2.3.3: optional: true + get-tsconfig@4.14.0: + dependencies: + resolve-pkg-maps: 1.0.0 + glob-parent@5.1.2: dependencies: is-glob: 4.0.3 @@ -2371,6 +2412,8 @@ snapshots: resolve-from@5.0.0: {} 
+ resolve-pkg-maps@1.0.0: {} + reusify@1.1.0: {} rollup@4.59.0: @@ -2468,6 +2511,13 @@ snapshots: tr46@0.0.3: {} + tsx@4.21.0: + dependencies: + esbuild: 0.27.3 + get-tsconfig: 4.14.0 + optionalDependencies: + fsevents: 2.3.3 + typedarray@0.0.6: {} typescript@5.9.3: {} @@ -2478,7 +2528,7 @@ snapshots: util-deprecate@1.0.2: {} - vite@7.3.1(@types/node@20.19.35): + vite@7.3.1(@types/node@20.19.35)(tsx@4.21.0): dependencies: esbuild: 0.27.3 fdir: 6.5.0(picomatch@4.0.3) @@ -2489,11 +2539,12 @@ snapshots: optionalDependencies: '@types/node': 20.19.35 fsevents: 2.3.3 + tsx: 4.21.0 - vitest@4.0.18(@opentelemetry/api@1.9.0)(@types/node@20.19.35): + vitest@4.0.18(@opentelemetry/api@1.9.0)(@types/node@20.19.35)(tsx@4.21.0): dependencies: '@vitest/expect': 4.0.18 - '@vitest/mocker': 4.0.18(vite@7.3.1(@types/node@20.19.35)) + '@vitest/mocker': 4.0.18(vite@7.3.1(@types/node@20.19.35)(tsx@4.21.0)) '@vitest/pretty-format': 4.0.18 '@vitest/runner': 4.0.18 '@vitest/snapshot': 4.0.18 @@ -2510,7 +2561,7 @@ snapshots: tinyexec: 1.0.2 tinyglobby: 0.2.15 tinyrainbow: 3.0.3 - vite: 7.3.1(@types/node@20.19.35) + vite: 7.3.1(@types/node@20.19.35)(tsx@4.21.0) why-is-node-running: 2.3.0 optionalDependencies: '@opentelemetry/api': 1.9.0 diff --git a/scripts/copy-static-assets.mjs b/scripts/copy-static-assets.mjs index d897166..cbff8af 100644 --- a/scripts/copy-static-assets.mjs +++ b/scripts/copy-static-assets.mjs @@ -6,6 +6,8 @@ * * Currently copies: * - src/design/audit/rubric/fragments/*.md (rubric library) + * - src/design/audit/rubric/anchors/*.yaml (calibration anchors) + * - src/design/audit/ethics/rules/*.yaml (ethics gate rules) * - src/viewer/*.html (session viewer UI) */ @@ -22,6 +24,18 @@ const COPIES = [ dest: 'dist/design/audit/rubric/fragments', pattern: /\.md$/, }, + { + label: 'rubric anchor(s)', + src: 'src/design/audit/rubric/anchors', + dest: 'dist/design/audit/rubric/anchors', + pattern: /\.ya?ml$/, + }, + { + label: 'ethics rule(s)', + src: 
'src/design/audit/ethics/rules', + dest: 'dist/design/audit/ethics/rules', + pattern: /\.ya?ml$/, + }, { label: 'viewer asset(s)', src: 'src/viewer', diff --git a/skills/design-evolve/SKILL.md b/skills/design-evolve/SKILL.md index 270d299..23fe2da 100644 --- a/skills/design-evolve/SKILL.md +++ b/skills/design-evolve/SKILL.md @@ -65,7 +65,33 @@ Batch related fixes: all spacing in one pass, all color in another. ## Phase 3: Apply Fixes to Source Code -Match the project's styling approach. Fix the **design system** (shared components, tokens, globals), not individual instances. +**Preferred (v2 — patch-based):** If the audit output has `findings[*].patches[]`, apply mechanically rather than authoring from scratch: + +```ts +// Iterate topFixes in order +for (const findingId of page.topFixes) { + const finding = page.findings.find(f => f.id === findingId) + if (!finding.patches?.length) continue // Layer 2 not yet active for this finding + + const patch = finding.patches[0] + // Option A: file path known → apply unified diff + if (patch.target.filePath && patch.diff.unifiedDiff) { + // write unifiedDiff to a temp file and: git apply + } + // Option B: CSS selector → search-replace + // find patch.diff.before in relevant file, replace with patch.diff.after + + // Verify + if (patch.testThatProves.command) { + // run patch.testThatProves.command + } + + // Close the loop — record attribution + // bad design-audit ack-patch --pre-run-id +} +``` + +**Fallback (prose-to-code):** When `patches[]` is empty, match the project's styling approach and fix the **design system** (shared components, tokens, globals), not individual instances. 
**Tailwind:** ```tsx @@ -100,17 +126,22 @@ Rules: - Fix the design system, not individual instances - Only change visual properties — never touch event handlers, state, or business logic -## Phase 4: Re-Audit +## Phase 4: Re-Audit + Attribution ```bash +# If you used patch-based flow in Phase 3, record attribution before re-auditing: +# bad design-audit ack-patch --pre-run-id + node dist/cli.js design-audit \ --url \ --profile \ --pages \ --json --headless +# (add --post-patch if you ack'd above) ``` Compare: did score improve? Are original critical/major findings resolved? Any new findings introduced? +Check `ethicsViolations` — if any are present the rollup is capped; scores won't improve past the cap until violations are remediated. ## Phase 5: Iterate diff --git a/src/cli-ack-patch.ts b/src/cli-ack-patch.ts new file mode 100644 index 0000000..28b3e7d --- /dev/null +++ b/src/cli-ack-patch.ts @@ -0,0 +1,89 @@ +/** + * Layer 4 — `bad design-audit ack-patch` subcommand handler. + * + * Invoked by coding agents after applying a patch: + * bad design-audit ack-patch --pre-run-id [--applied-by ] + * + * When a re-audit is run with `--post-patch `, the pipeline looks up + * the pending application and writes the observed outcome. This file handles + * the ack-patch side; the --post-patch flow lives in pipeline.ts. + */ + +import * as crypto from 'node:crypto' +import type { PatchApplication } from './design/audit/attribution/types.js' +import type { Dimension } from './design/audit/v2/types.js' +import { + appendPatchApplication, + patchHash, + findPendingApplication, + updateApplicationOutcome, +} from './design/audit/attribution/store.js' + +export interface AckPatchOptions { + patchId: string + preRunId: string + appliedBy?: string + predictedDim?: string + predictedDelta?: number + patchBefore?: string + patchAfter?: string + patchScope?: string + dir?: string +} + +/** + * Record that a patch was applied. Returns the applicationId for correlation. 
+ * The predicted delta is optional — when not provided, defaults to 'untested'. + */ +export async function ackPatch(opts: AckPatchOptions): Promise { + const applicationId = crypto.randomUUID() + const hash = patchHash( + { before: opts.patchBefore ?? '', after: opts.patchAfter ?? '' }, + opts.patchScope ?? 'component', + ) + + const app: PatchApplication = { + applicationId, + patchId: opts.patchId, + patchHash: hash, + appliedAt: new Date().toISOString(), + appliedBy: opts.appliedBy ?? 'agent:unknown', + preAuditRunId: opts.preRunId, + predicted: { + dim: (opts.predictedDim ?? 'product_intent') as Dimension, + delta: opts.predictedDelta ?? 0, + }, + } + + await appendPatchApplication(app, opts.dir) + return applicationId +} + +export interface PostPatchOptions { + patchId: string + postRunId: string + observedDim: string + observedDelta: number + dir?: string +} + +/** + * Record the observed outcome after a re-audit. Looks up the pending + * application for `patchId` and appends an outcome event. + */ +export async function recordPatchOutcome(opts: PostPatchOptions): Promise { + const pending = await findPendingApplication(opts.patchId, opts.dir) + if (!pending) { + throw new Error( + `No pending PatchApplication found for patchId ${opts.patchId}. 
` + + 'Run `bad design-audit ack-patch` after applying the patch, before re-auditing.', + ) + } + + await updateApplicationOutcome( + pending.applicationId, + opts.postRunId, + { dim: opts.observedDim as Dimension, delta: opts.observedDelta }, + opts.dir, + ) +} diff --git a/src/cli-design-audit.ts b/src/cli-design-audit.ts index 2add9b8..4595ab2 100644 --- a/src/cli-design-audit.ts +++ b/src/cli-design-audit.ts @@ -17,7 +17,72 @@ import { resolveProviderApiKey, resolveProviderModelName, type SupportedProvider import { loadLocalEnvFiles } from './env-loader.js' import { cliError } from './cli-ui.js' import { auditOnePage } from './design/audit/pipeline.js' -import type { PageAuditResult as Gen2PageAuditResult } from './design/audit/types.js' +import type { PageAuditResult as Gen2PageAuditResult, EthicsViolation } from './design/audit/types.js' + +/** Split "a, b , c" → ['a','b','c']. Returns undefined for empty input so the + * v2 predicate predicates can distinguish "operator did not say" from "[]". */ +function parseTagList(input: string | undefined): string[] | undefined { + if (!input) return undefined + const tags = input.split(',').map(s => s.trim()).filter(Boolean) + return tags.length > 0 ? tags : undefined +} + +/** Pretty-print the ethics-violation report for a set of pages. Prints + * nothing when no page tripped a rule. Each rule is shown with severity, + * remediation, and citation so the operator can act without re-running. */ +function printEthicsViolations(pages: Array<{ url: string; ethicsViolations?: EthicsViolation[] }>): void { + const offenders = pages.filter(p => (p.ethicsViolations?.length ?? 0) > 0) + if (offenders.length === 0) return + console.log('') + console.log(` ${chalk.bgRed.white.bold(' ETHICS VIOLATIONS ')}`) + for (const page of offenders) { + console.log(` ${chalk.dim('Page:')} ${page.url}`) + for (const v of page.ethicsViolations ?? []) { + const sevColor = v.severity === 'critical-floor' ? 
chalk.red : chalk.yellow + console.log(` ${sevColor('•')} ${chalk.bold(v.ruleId)} ${chalk.dim('—')} ${sevColor(v.severity)} ${chalk.dim(`(rollup capped at ${v.rollupCap})`)}`) + console.log(` ${chalk.dim('fix:')} ${v.remediation}`) + if (v.citation) console.log(` ${chalk.dim('cite:')} ${v.citation}`) + } + } +} + +/** Lowest rollup cap across all violated pages, or undefined if none fired. */ +function lowestRollupCap(pages: Array<{ ethicsViolations?: EthicsViolation[] }>): number | undefined { + const caps = pages.flatMap(p => p.ethicsViolations ?? []).map(v => v.rollupCap) + return caps.length === 0 ? undefined : Math.min(...caps) +} + +/** + * Layer 1 — print the per-dimension breakdown for one page when an + * `auditResultV2` is attached. Five dim lines + one rollup line; each shows + * score, range, and confidence so an agent can reason about uncertainty. + */ +function printV2Breakdown(page: { auditResultV2?: unknown }): void { + const v2 = page.auditResultV2 as + | { + scores?: Record + rollup?: { score: number; range: [number, number]; confidence: string; rule: string } + } + | undefined + if (!v2 || !v2.scores || !v2.rollup) return + + const dimOrder = ['product_intent', 'visual_craft', 'trust_clarity', 'workflow', 'content_ia'] + for (const dim of dimOrder) { + const s = v2.scores[dim] + if (!s) continue + const sevColor = s.score >= 8 ? chalk.green : s.score >= 5 ? chalk.yellow : chalk.red + const confColor = s.confidence === 'high' ? chalk.green : s.confidence === 'medium' ? chalk.yellow : chalk.dim + console.log( + ` ${chalk.dim(dim.padEnd(15))} ${sevColor(`${s.score}/10`)} ${chalk.dim(`[${s.range[0]}-${s.range[1]}]`)} ${confColor(s.confidence)}`, + ) + } + const r = v2.rollup + const rColor = r.score >= 8 ? chalk.green : r.score >= 5 ? chalk.yellow : chalk.red + const confColor = r.confidence === 'high' ? chalk.green : r.confidence === 'medium' ? 
chalk.yellow : chalk.dim + console.log( + ` ${chalk.dim('rollup'.padEnd(15))} ${rColor(`${r.score.toFixed(1)}/10`)} ${chalk.dim(`[${r.range[0].toFixed(1)}-${r.range[1].toFixed(1)}]`)} ${confColor(r.confidence)} ${chalk.dim(r.rule)}`, + ) +} import { resolveAuditPasses } from './design/audit/evaluate.js' import { detectSystemicFindings, topByRoi } from './design/audit/roi.js' import { getTelemetry, setInvocation } from './telemetry/index.js' @@ -92,6 +157,12 @@ interface PageAuditResult { rubricFragments?: string[] /** Gen 2: deterministic measurements */ measurements?: Gen2PageAuditResult['measurements'] + /** Layer 7: ethics violations that capped the rollup, if any. */ + ethicsViolations?: EthicsViolation[] + /** Layer 7: the pre-cap rollup score when ethicsViolations is non-empty. */ + preEthicsScore?: number + /** Layer 1: opaque v2 result attached for backwards-compat dual-emit. */ + auditResultV2?: unknown } // --------------------------------------------------------------------------- @@ -249,6 +320,19 @@ export interface DesignAuditOptions { rubricsDir?: string /** Subjective LLM audit passes: standard, deep, max, number, or comma-list */ auditPasses?: string + // ── Layer 7 — domain ethics gate ── + /** Bypass the ethics floor entirely. Audited + warned. Test-only. */ + skipEthics?: boolean + /** Override directory for ethics rule yaml files. */ + ethicsRulesDir?: string + /** Comma-separated audience tags: developer, clinician, kids, ... */ + audience?: string + /** Comma-separated regulatory contexts: hipaa, gdpr, coppa, ... */ + regulatoryContext?: string + /** Comma-separated audience-vulnerability tags: patient-facing, minor-facing, ... */ + audienceVulnerability?: string + /** Single modality: mobile, tablet, desktop, tv, kiosk */ + modality?: string } export async function runDesignAudit(opts: DesignAuditOptions): Promise { @@ -270,6 +354,19 @@ export async function runDesignAudit(opts: DesignAuditOptions): Promise { const apiKey = opts.apiKey ?? 
resolveProviderApiKey(provider) const auditPasses = resolveAuditPasses(opts.auditPasses) + // Layer 7 — ethics gate options. Threaded into every auditOnePage call site. + if (opts.skipEthics) { + console.warn(` ${chalk.yellow('⚠')} ${chalk.bold('--skip-ethics')} ${chalk.dim('— ethics floor disabled (test-only)')}`) + } + const ethicsCommonOpts = { + skipEthics: opts.skipEthics, + ethicsRulesDir: opts.ethicsRulesDir, + audience: parseTagList(opts.audience) as never, + regulatoryContext: parseTagList(opts.regulatoryContext) as never, + audienceVulnerability: parseTagList(opts.audienceVulnerability) as never, + modality: parseTagList(opts.modality) as never, + } + // Telemetry: every design-audit invocation gets a stable runId. Children // (per-page, evolve rounds) link back via parentRunId so a fleet rollup can // reconstruct the tree. @@ -347,6 +444,7 @@ export async function runDesignAudit(opts: DesignAuditOptions): Promise { runId, provider, model: modelName, + ...ethicsCommonOpts, }) const result = gen2 as PageAuditResult results.push(result) @@ -358,6 +456,7 @@ export async function runDesignAudit(opts: DesignAuditOptions): Promise { ? chalk.dim(` (${result.classification.type}/${result.classification.domain})`) : '' console.log(` ${icon} ${scoreColor(`${result.score}/10`)} ${chalk.dim('—')} ${findingCount} finding${findingCount !== 1 ? 's' : ''}${classLabel}`) + printV2Breakdown(result) } // Cross-page systemic detection + top-fixes ranking. @@ -383,17 +482,31 @@ export async function runDesignAudit(opts: DesignAuditOptions): Promise { if (opts.json) { const jsonPath = path.join(outputDir, 'report.json') + // Layer 1 — emit BOTH schemaVersion 1 (legacy) and schemaVersion 2 (new) + // shapes for one release. Consumers can migrate to v2 incrementally. 
+ const v2Pages = results + .map(r => r.auditResultV2) + .filter((r): r is unknown => r !== undefined) fs.writeFileSync(jsonPath, JSON.stringify({ + schemaVersion: 1, timestamp: new Date().toISOString(), profile, url: opts.url, pages: results, topFixes, summary: { avgScore, totalFindings: allFindings.length, critical, major, minor }, + v2: { + schemaVersion: 2, + pages: v2Pages, + }, }, null, 2)) console.log(` ${chalk.dim('JSON →')} ${jsonPath}`) } + // ── Layer 7 — surface ethics violations BEFORE the score summary so the + // operator sees the floor reason, not just the capped number. ── + printEthicsViolations(results) + // Summary console.log('') console.log(` ${chalk.dim('─'.repeat(52))}`) @@ -403,6 +516,10 @@ export async function runDesignAudit(opts: DesignAuditOptions): Promise { if (major > 0) findingParts.push(chalk.yellow(`${major} major`)) if (minor > 0) findingParts.push(chalk.dim(`${minor} minor`)) console.log(` Avg: ${avgColor(`${avgScore.toFixed(1)}/10`)} ${chalk.dim('·')} ${allFindings.length} findings ${findingParts.length ? chalk.dim('(') + findingParts.join(chalk.dim(' · ')) + chalk.dim(')') : ''}`) + const lowestCap = lowestRollupCap(results) + if (lowestCap !== undefined) { + console.log(` ${chalk.red('⚠ Rollup capped at')} ${chalk.bold(`${lowestCap}/10`)} ${chalk.dim('— resolve ethics violations to lift the cap')}`) + } console.log(` ${chalk.dim('Report →')} ${reportPath}`) if (screenshotDir) console.log(` ${chalk.dim('Screenshots →')} ${screenshotDir}`) console.log('') @@ -426,6 +543,7 @@ export async function runDesignAudit(opts: DesignAuditOptions): Promise { provider, model: modelName, parentRunId: runId, + ...ethicsCommonOpts, }) repResults.push(gen2 as PageAuditResult) } diff --git a/src/cli-patterns.ts b/src/cli-patterns.ts new file mode 100644 index 0000000..cf09a34 --- /dev/null +++ b/src/cli-patterns.ts @@ -0,0 +1,66 @@ +/** + * Layer 5 — `bad patterns` subcommand surface. + * + * Provides pattern query and inspection via CLI. 
Mining runs as a periodic + * Cloudflare Worker cron in production; locally it reads from ~/.bad/patterns/. + * + * bad patterns query [--category ] [--page-type ] [--weak-dimension ] + * bad patterns show + * bad patterns mine [--dir ] + */ + +import type { PatternQuery } from './design/audit/patterns/types.js' +import { queryPatterns, loadPatterns } from './design/audit/patterns/store.js' +import type { Dimension, PageType } from './design/audit/v2/types.js' + +export interface PatternsQueryOptions { + category?: string + pageType?: PageType + weakDimension?: Dimension + minApplications?: number + minSuccessRate?: number + json?: boolean + dir?: string +} + +export async function runPatternsQuery(opts: PatternsQueryOptions): Promise { + const query: PatternQuery = { + category: opts.category, + pageType: opts.pageType, + weakDimension: opts.weakDimension, + minApplications: opts.minApplications, + minSuccessRate: opts.minSuccessRate, + } + const patterns = await queryPatterns(query, opts.dir) + + if (patterns.length === 0) { + console.log('No patterns found. 
The pattern library is empty until fleet data accumulates (Layer 5 cold-start).') + return + } + + if (opts.json) { + console.log(JSON.stringify(patterns, null, 2)) + return + } + + for (const p of patterns) { + console.log(`\n[${p.patternId}] ${p.scaffold.description}`) + console.log(` Category: ${p.category} | Type: ${p.classification.type}`) + console.log(` Fleet: N=${p.fleetEvidence.applications} tenants=${p.fleetEvidence.sampleTenants} success=${(p.fleetEvidence.successRate * 100).toFixed(0)}%`) + console.log(` Key decisions: ${p.scaffold.keyDecisions.join('; ')}`) + } +} + +export async function runPatternsShow(patternId: string, opts: { json?: boolean; dir?: string } = {}): Promise { + const all = await loadPatterns(opts.dir) + const pattern = all.find(p => p.patternId === patternId) + if (!pattern) { + console.error(`Pattern ${patternId} not found.`) + process.exit(1) + } + if (opts.json) { + console.log(JSON.stringify(pattern, null, 2)) + return + } + console.log(JSON.stringify(pattern, null, 2)) +} diff --git a/src/cli.ts b/src/cli.ts index 35662b6..34ac2a3 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -153,6 +153,15 @@ async function main(): Promise { reproducibility: { type: 'boolean' }, 'rubrics-dir': { type: 'string' }, 'audit-passes': { type: 'string' }, + // Layer 7 — domain ethics gate. --skip-ethics bypasses the rollup floor + // for testing scenarios; --ethics-rules-dir overrides the builtin rule set. + 'skip-ethics': { type: 'boolean' }, + 'ethics-rules-dir': { type: 'string' }, + // Layer 6 / 7 — audience predicate hints. Comma-separated. 
+ audience: { type: 'string' }, + 'regulatory-context': { type: 'string' }, + 'audience-vulnerability': { type: 'string' }, + modality: { type: 'string' }, // bad view port: { type: 'string' }, 'no-open': { type: 'boolean' }, @@ -341,6 +350,12 @@ async function main(): Promise { reproducibility: values.reproducibility, rubricsDir: values['rubrics-dir'], auditPasses: values['audit-passes'], + skipEthics: values['skip-ethics'], + ethicsRulesDir: values['ethics-rules-dir'], + audience: values.audience, + regulatoryContext: values['regulatory-context'], + audienceVulnerability: values['audience-vulnerability'], + modality: values.modality, }); process.exit(0); } diff --git a/src/design/audit/attribution/aggregate.ts b/src/design/audit/attribution/aggregate.ts new file mode 100644 index 0000000..5e8c6c8 --- /dev/null +++ b/src/design/audit/attribution/aggregate.ts @@ -0,0 +1,117 @@ +/** + * Layer 4 — Cross-tenant patch reliability aggregation. + * + * Groups PatchApplication records by patchHash and computes reliability + * statistics. Fleet-mined patterns (Layer 5) consume these aggregates. + */ + +import type { PatchApplication, PatchReliability } from './types.js' +import type { PatchRecommendation } from './types.js' + +/** Stored record includes patchHash which is computed at write time by store.ts. */ +type StoredApplication = PatchApplication & { patchHash?: string } + +const MIN_APPLICATIONS_FOR_RECOMMENDED = 30 +const MIN_TENANTS_FOR_RECOMMENDED = 5 +const REPLICATION_RATE_THRESHOLD = 0.7 +const MIN_APPLICATIONS_FOR_ANTIPATTERN = 10 +const ANTIPATTERN_REPLICATION_THRESHOLD = 0.3 + +/** + * A candidate application for aggregation — has both predicted and observed. 
+ */ +type CompletedApplication = PatchApplication & { + observed: NonNullable +} + +function isCompleted(app: PatchApplication): app is CompletedApplication { + return app.observed !== undefined +} + +/** + * True when the observed delta "replicates" the predicted: same sign and + * at least half the magnitude. + */ +function replicates(predicted: { delta: number }, observed: { delta: number }): boolean { + if (Math.sign(predicted.delta) !== Math.sign(observed.delta)) return false + return Math.abs(observed.delta) >= 0.5 * Math.abs(predicted.delta) +} + +/** Extract tenant tag from `appliedBy` field or application metadata. */ +function tenantFrom(app: PatchApplication): string { + // convention: 'agent:claude-code:tenant-id' or tenantId field if present + const parts = app.appliedBy.split(':') + return parts.length >= 3 ? parts.slice(2).join(':') : app.appliedBy +} + +/** + * Aggregate all PatchApplication records into per-patchHash reliability stats. + * Records without an `observed` delta are counted in `applications` but excluded + * from the rate computations. + */ +export function aggregatePatchReliability( + applications: PatchApplication[], +): PatchReliability[] { + const byHash = new Map() + for (const app of applications as StoredApplication[]) { + const hash = app.patchHash ?? app.patchId // fall back to patchId for records without hash + if (!byHash.has(hash)) byHash.set(hash, []) + byHash.get(hash)!.push(app) + } + + const results: PatchReliability[] = [] + for (const [hashKey, apps] of byHash.entries()) { + const completed = apps.filter(isCompleted) + const tenants = new Set(apps.map(tenantFrom)).size + + const meanPredictedDelta = + completed.length > 0 + ? completed.reduce((s, a) => s + a.predicted.delta, 0) / completed.length + : 0 + + const meanObservedDelta = + completed.length > 0 + ? completed.reduce((s, a) => s + a.observed.delta, 0) / completed.length + : 0 + + const replicationRate = + completed.length > 0 + ? 
completed.filter(a => replicates(a.predicted, a.observed)).length / completed.length + : 0 + + results.push({ + patchHash: hashKey, + applications: apps.length, + meanPredictedDelta, + meanObservedDelta, + sampleTenants: tenants, + replicationRate, + recommendation: recommendationFor(apps.length, tenants, replicationRate, meanObservedDelta), + }) + } + + return results.sort((a, b) => b.applications - a.applications) +} + +export function recommendationFor( + applications: number, + sampleTenants: number, + replicationRate: number, + meanObservedDelta: number, +): PatchRecommendation { + if ( + applications >= MIN_APPLICATIONS_FOR_RECOMMENDED && + sampleTenants >= MIN_TENANTS_FOR_RECOMMENDED && + replicationRate >= REPLICATION_RATE_THRESHOLD + ) { + return 'recommended' + } + if ( + applications >= MIN_APPLICATIONS_FOR_ANTIPATTERN && + replicationRate < ANTIPATTERN_REPLICATION_THRESHOLD && + meanObservedDelta < 0 + ) { + return 'antipattern' + } + return 'neutral' +} diff --git a/src/design/audit/attribution/store.ts b/src/design/audit/attribution/store.ts new file mode 100644 index 0000000..a828a95 --- /dev/null +++ b/src/design/audit/attribution/store.ts @@ -0,0 +1,138 @@ +/** + * Layer 4 — Append-only JSONL store for PatchApplication records. + * + * Layout: `/applications/.jsonl` + * Each line is a standalone JSON object — `patchHash` is always set so cross- + * tenant aggregation can group by patch signature, not per-tenant path. + * + * Append-only invariant: never mutate existing lines. Outcome updates are + * recorded as NEW lines so the JSONL is an event stream, not a state snapshot. 
+ */ + +import * as fs from 'node:fs' +import * as fsp from 'node:fs/promises' +import * as path from 'node:path' +import * as os from 'node:os' +import * as crypto from 'node:crypto' +import type { PatchApplication } from './types.js' + +const DEFAULT_DIR = path.join(os.homedir(), '.bad', 'attribution') + +function applicationsDir(dir: string): string { + return path.join(dir, 'applications') +} + +function todayPath(dir: string): string { + const date = new Date().toISOString().slice(0, 10) + return path.join(applicationsDir(dir), `${date}.jsonl`) +} + +/** + * Stable hash for a patch diff + scope. Same patch content across tenants → + * same hash, enabling cross-tenant reliability aggregation. + */ +export function patchHash(diff: { before: string; after: string }, scope: string): string { + return crypto + .createHash('sha256') + .update(`${diff.before}\n---\n${diff.after}\n---\n${scope}`) + .digest('hex') + .slice(0, 16) +} + +/** Append a new PatchApplication record. */ +export async function appendPatchApplication( + app: PatchApplication, + dir: string = DEFAULT_DIR, +): Promise { + await fsp.mkdir(applicationsDir(dir), { recursive: true }) + await fsp.appendFile(todayPath(dir), JSON.stringify(app) + '\n', 'utf-8') +} + +/** Sync variant for non-async call sites. */ +export function appendPatchApplicationSync( + app: PatchApplication, + dir: string = DEFAULT_DIR, +): void { + fs.mkdirSync(applicationsDir(dir), { recursive: true }) + fs.appendFileSync(todayPath(dir), JSON.stringify(app) + '\n', 'utf-8') +} + +/** Read all PatchApplication records from the last `days` days. 
*/ +export async function readRecentApplications( + days: number = 7, + dir: string = DEFAULT_DIR, +): Promise { + const appsDir = applicationsDir(dir) + if (!fs.existsSync(appsDir)) return [] + + const results: PatchApplication[] = [] + for (let d = 0; d < days; d++) { + const date = new Date(Date.now() - d * 86_400_000).toISOString().slice(0, 10) + const filePath = path.join(appsDir, `${date}.jsonl`) + if (!fs.existsSync(filePath)) continue + const lines = fs.readFileSync(filePath, 'utf-8').split('\n').filter(Boolean) + for (const line of lines) { + try { + results.push(JSON.parse(line) as PatchApplication) + } catch { + // corrupt line — skip + } + } + } + return results +} + +/** + * Find the most recent pending application for a patchId — one that has no + * `postAuditRunId` yet. Used when a re-audit lands to attach the outcome. + */ +export async function findPendingApplication( + patchId: string, + dir: string = DEFAULT_DIR, +): Promise { + const apps = await readRecentApplications(7, dir) + // Most recent first; pick the newest pending one. + const pending = apps + .filter(a => a.patchId === patchId && !a.postAuditRunId) + .sort((a, b) => b.appliedAt.localeCompare(a.appliedAt)) + return pending[0] ?? null +} + +/** + * Append an outcome event for an existing application. Does NOT mutate the + * original line — appends a new event so the JSONL remains an event stream. 
+ */ +export async function updateApplicationOutcome( + applicationId: string, + postAuditRunId: string, + observed: PatchApplication['observed'], + dir: string = DEFAULT_DIR, +): Promise { + const apps = await readRecentApplications(7, dir) + const original = apps.find(a => a.applicationId === applicationId) + if (!original) { + throw new Error(`PatchApplication ${applicationId} not found in the last 7 days`) + } + + const agreementScore = computeAgreementScore(original.predicted, observed) + const outcome: PatchApplication = { + ...original, + postAuditRunId, + observed, + agreementScore, + } + + await fsp.mkdir(applicationsDir(dir), { recursive: true }) + await fsp.appendFile(todayPath(dir), JSON.stringify(outcome) + '\n', 'utf-8') +} + +function computeAgreementScore( + predicted: PatchApplication['predicted'], + observed: PatchApplication['observed'], +): number { + if (!predicted || !observed) return 0 + const p = predicted.delta + const o = observed.delta + const denom = Math.max(Math.abs(p), Math.abs(o), 1) + return 1 - Math.abs(p - o) / denom +} diff --git a/src/design/audit/attribution/types.ts b/src/design/audit/attribution/types.ts new file mode 100644 index 0000000..1236667 --- /dev/null +++ b/src/design/audit/attribution/types.ts @@ -0,0 +1,18 @@ +/** + * Layer 4 — Outcome attribution type contract. + * + * These types are already defined in src/design/audit/v2/types.ts as part of + * the Phase 0 contract. This module re-exports them so attribution code can + * import from a single, predictable path. When v2/types.ts is the sole + * canonical source, update these re-exports accordingly. 
+ */ + +export type { + PatchApplication, + PatchReliability, +} from '../v2/types.js' + +/** sha256(diff.before + '\n---\n' + diff.after + '\n---\n' + scope).slice(0,16) */ +export type PatchHash = string + +export type PatchRecommendation = 'recommended' | 'neutral' | 'antipattern' diff --git a/src/design/audit/classify-ensemble.ts b/src/design/audit/classify-ensemble.ts new file mode 100644 index 0000000..5180481 --- /dev/null +++ b/src/design/audit/classify-ensemble.ts @@ -0,0 +1,264 @@ +/** + * Ensemble classifier — Layer 1 of the world-class design-audit architecture. + * + * Three-signal vote (URL pattern + DOM heuristic + LLM) decides the page type + * and reports an ensemble confidence so downstream layers (first-principles + * fallback, rubric loader, telemetry) can act on uncertainty honestly. + * + * Vote logic: + * - URL + DOM agree on a type AND combined confidence > 0.7 → accept (skip LLM) + * - else → run LLM, take majority + * - if LLM confidence < 0.5 AND signals disagree → return 'unknown' with dissent + */ + +import type { Brain } from '../../brain/index.js' +import type { PageState } from '../../types.js' +import { classifyPage, defaultClassification } from './classify.js' +import type { PageClassification, PageType } from './types.js' +import type { + ClassifierSignal, + ClassifierSource, + DomHeuristics, + EnsembleClassification, +} from './v2/types.js' + +interface UrlPatternRule { + pattern: RegExp + type: PageType + confidence: number + rationale: string +} + +/** + * URL pattern rules — straight from the RFC. Order matters: more specific + * patterns first. Each rule's confidence is the URL signal's contribution to + * the ensemble vote. 
+ */ +const URL_PATTERN_RULES: UrlPatternRule[] = [ + { pattern: /\/(docs|reference|api|guide|help|faq)(\/|$)/, type: 'docs', confidence: 0.85, rationale: 'URL contains a docs path segment' }, + { pattern: /\/(checkout|cart|pay|order|billing)(\/|$)/, type: 'ecommerce', confidence: 0.85, rationale: 'URL contains a commerce path segment' }, + { pattern: /\/(app|dashboard|workspace|admin)(\/|$)/, type: 'saas-app', confidence: 0.75, rationale: 'URL contains an app/dashboard path segment' }, + { pattern: /\/(login|signup|auth|sign-in)(\/|$)/, type: 'utility', confidence: 0.85, rationale: 'URL contains an auth path segment' }, + { pattern: /\/(pricing|plans|features|product)(\/|$)/, type: 'marketing', confidence: 0.7, rationale: 'URL contains a marketing path segment' }, + { pattern: /\/(blog|articles|news|stories)(\/|$)/, type: 'blog', confidence: 0.8, rationale: 'URL contains a blog path segment' }, + { pattern: /\/$/, type: 'marketing', confidence: 0.4, rationale: 'URL is a root path — weak marketing default' }, +] + +const ENSEMBLE_AGREEMENT_THRESHOLD = 0.7 +const LLM_FALLBACK_CONFIDENCE = 0.5 + +export interface EnsembleClassifyInput { + brain: Brain + state: PageState + url: string + /** Optional pre-captured DOM heuristics. If absent, we attempt to derive them from the snapshot. */ + domHeuristics?: DomHeuristics +} + +/** Public entry point. */ +export async function classifyEnsemble(input: EnsembleClassifyInput): Promise { + const signals: ClassifierSignal[] = [] + + // ── 1. URL pattern signal ── + const urlSignal = classifyByUrl(input.url) + if (urlSignal) signals.push(urlSignal) + + // ── 2. DOM heuristic signal ── + const dom = input.domHeuristics ?? 
deriveHeuristics(input.state) + const domSignal = classifyByDom(dom) + if (domSignal) signals.push(domSignal) + + // ── Quick path: URL + DOM agree with combined confidence > threshold ── + if ( + urlSignal && + domSignal && + urlSignal.type === domSignal.type && + urlSignal.confidence + domSignal.confidence > ENSEMBLE_AGREEMENT_THRESHOLD + ) { + const ensembleConfidence = clamp01( + Math.min(1, (urlSignal.confidence + domSignal.confidence) / 1.6), + ) + return finalize({ + type: urlSignal.type, + base: defaultClassification(), + signals, + ensembleConfidence, + signalsAgreed: true, + }) + } + + // ── 3. LLM tiebreaker ── + const llmClass = await classifyPage(input.brain, input.state).catch(() => defaultClassification()) + signals.push({ + source: 'llm', + type: llmClass.type, + confidence: llmClass.confidence, + rationale: llmClass.intent || 'LLM page classification', + }) + + // ── Vote ── + const tally = new Map() + for (const sig of signals) { + tally.set(sig.type, (tally.get(sig.type) ?? 0) + sig.confidence) + } + + const sortedVotes = [...tally.entries()].sort((a, b) => b[1] - a[1]) + const winner = sortedVotes[0] + const winningType = winner ? winner[0] : 'unknown' + const winningTotal = winner ? winner[1] : 0 + + // Compute aggregate confidence: average over participating signals, weighted by agreement. 
+ const winningSignals = signals.filter((s) => s.type === winningType) + const agreementShare = winningSignals.length / signals.length + const meanConfidence = winningSignals.reduce((acc, s) => acc + s.confidence, 0) / Math.max(winningSignals.length, 1) + const ensembleConfidence = clamp01(meanConfidence * agreementShare + 0.05 * (winningTotal - meanConfidence)) + + const signalsAgreed = signals.every((s) => s.type === winningType) + const dissent = signals.filter((s) => s.type !== winningType).map((s) => ({ source: s.source, type: s.type })) + + // ── Low-confidence + disagreement → 'unknown' with dissent ── + if (!signalsAgreed && llmClass.confidence < LLM_FALLBACK_CONFIDENCE) { + return finalize({ + type: 'unknown', + base: llmClass, + signals, + ensembleConfidence: Math.min(ensembleConfidence, 0.5), + signalsAgreed: false, + dissent, + }) + } + + return finalize({ + type: winningType, + base: llmClass, + signals, + ensembleConfidence, + signalsAgreed, + dissent: signalsAgreed ? undefined : dissent, + }) +} + +interface FinalizeArgs { + type: PageType + base: PageClassification + signals: ClassifierSignal[] + ensembleConfidence: number + signalsAgreed: boolean + dissent?: { source: ClassifierSource; type: PageType }[] +} + +function finalize(args: FinalizeArgs): EnsembleClassification { + const { type, base, signals, ensembleConfidence, signalsAgreed } = args + const firstPrinciplesMode = !signalsAgreed || ensembleConfidence < 0.6 + + const out: EnsembleClassification = { + ...base, + type, + confidence: ensembleConfidence, + signals, + signalsAgreed, + ensembleConfidence, + firstPrinciplesMode, + } + if (args.dissent && args.dissent.length > 0) out.dissent = args.dissent + return out +} + +// ── URL-pattern classifier ────────────────────────────────────────────────── + +export function classifyByUrl(url: string): ClassifierSignal | null { + let pathname: string + try { + pathname = new URL(url).pathname || '/' + } catch { + return null + } + for (const rule of 
URL_PATTERN_RULES) { + if (rule.pattern.test(pathname)) { + return { + source: 'url-pattern', + type: rule.type, + confidence: rule.confidence, + rationale: `${rule.rationale} (${pathname})`, + } + } + } + return null +} + +// ── DOM-heuristic classifier ──────────────────────────────────────────────── + +export function classifyByDom(dom: DomHeuristics): ClassifierSignal | null { + // docs: lots of paragraphs + code blocks, modest nav + if (dom.codeBlockCount >= 3 && dom.paragraphCount >= 6) { + return signal('dom-heuristic', 'docs', 0.7, `code blocks=${dom.codeBlockCount}, paragraphs=${dom.paragraphCount}`) + } + // dashboard: many table rows or charts + sidebar + if ((dom.tableRowCount >= 8 || dom.chartCount >= 2) && dom.hasSidebar) { + return signal('dom-heuristic', 'dashboard', 0.7, `rows=${dom.tableRowCount}, charts=${dom.chartCount}, sidebar=true`) + } + // saas-app: sidebar + multiple forms or many inputs + if (dom.hasSidebar && (dom.formCount >= 1 || dom.inputCount >= 4)) { + return signal('dom-heuristic', 'saas-app', 0.65, `sidebar=true, forms=${dom.formCount}, inputs=${dom.inputCount}`) + } + // utility: single dominant form, no hero, no sidebar + if (dom.formCount >= 1 && dom.inputCount >= 2 && !dom.hasHeroSection && !dom.hasSidebar) { + return signal('dom-heuristic', 'utility', 0.7, `single form, no hero, no sidebar`) + } + // ecommerce: forms + many nav items + footer links (storefront chrome) + if (dom.formCount >= 1 && dom.navItems >= 6 && dom.hasFooterLinks) { + return signal('dom-heuristic', 'ecommerce', 0.6, `nav=${dom.navItems}, footer-links, form present`) + } + // blog: long body of paragraphs without forms or tables + if (dom.paragraphCount >= 8 && dom.formCount === 0 && dom.tableRowCount === 0) { + return signal('dom-heuristic', 'blog', 0.65, `paragraphs=${dom.paragraphCount}, no forms or tables`) + } + // marketing: hero + footer-link cloud + few paragraphs + if (dom.hasHeroSection && dom.hasFooterLinks && dom.paragraphCount < 8) { + return 
signal('dom-heuristic', 'marketing', 0.6, `hero present, footer-links, paragraphs=${dom.paragraphCount}`) + } + return null +} + +function signal(source: ClassifierSource, type: PageType, confidence: number, rationale: string): ClassifierSignal { + return { source, type, confidence, rationale } +} + +// ── DOM heuristic derivation from snapshot ────────────────────────────────── + +/** + * Best-effort DOM heuristic derivation from the accessibility-tree snapshot. + * Pipelines that capture true DOM heuristics via Playwright should pass them + * in directly; this fallback works against the @ref-snapshot text. + */ +export function deriveHeuristics(state: PageState): DomHeuristics { + const snapshot = state.snapshot ?? '' + return { + formCount: countMatches(snapshot, /\bform\b/gi), + inputCount: countMatches(snapshot, /\b(textbox|searchbox|combobox|spinbutton|input)\b/gi), + tableRowCount: countMatches(snapshot, /\brow\b/gi), + chartCount: countMatches(snapshot, /\b(graphics-document|graphics-symbol|figure)\b/gi), + navItems: countMatches(snapshot, /\bnavigation\b/gi), + hasFooterLinks: /\bcontentinfo\b/i.test(snapshot), + hasHeroSection: /\bhero\b/i.test(snapshot) || /\bbanner\b/i.test(snapshot), + hasSidebar: /\bcomplementary\b/i.test(snapshot) || /\bsidebar\b/i.test(snapshot), + paragraphCount: countMatches(snapshot, /\bparagraph\b/gi), + codeBlockCount: countMatches(snapshot, /\bcode\b/gi), + } +} + +function countMatches(haystack: string, pattern: RegExp): number { + const m = haystack.match(pattern) + return m ? 
m.length : 0
+}
+
+function clamp01(n: number): number {
+  if (Number.isNaN(n)) return 0
+  return Math.max(0, Math.min(1, n))
+}
+
+export const ENSEMBLE_INTERNALS = {
+  URL_PATTERN_RULES,
+  ENSEMBLE_AGREEMENT_THRESHOLD,
+  LLM_FALLBACK_CONFIDENCE,
+}
diff --git a/src/design/audit/ethics/check.ts b/src/design/audit/ethics/check.ts
new file mode 100644
index 0000000..4fe85c6
--- /dev/null
+++ b/src/design/audit/ethics/check.ts
@@ -0,0 +1,183 @@
+/**
+ * Ethics check — Layer 7.
+ *
+ * Given a page state + classification, evaluate every loaded `EthicsRule` whose
+ * `appliesWhen` matches the classification. Each rule produces zero-or-one
+ * `EthicsViolation`. Violations enforce a hard floor on the rollup score:
+ * `critical-floor → 4`, `major-floor → 6`.
+ *
+ * Detector kinds:
+ *   pattern-absent  → regex must appear in page text; violation if absent
+ *   pattern-present → regex must NOT appear in page text; violation if present
+ *   llm-classifier  → ask the LLM the question; violation when answer is yes
+ *
+ * Pattern matches are case-insensitive. The LLM classifier asks for a
+ * single-token yes/no answer to keep latency + cost predictable.
+ */
+
+import type { Brain } from '../../../brain/index.js'
+import type {
+  AppliesWhen,
+  EthicsRule,
+  EthicsViolation,
+  PageClassification,
+  AudienceTag,
+  ModalityTag,
+  RegulatoryContextTag,
+  AudienceVulnerabilityTag,
+} from '../v2/types.js'
+import { rollupCapFor } from './loader.js'
+
+export interface EthicsCheckContext {
+  /** Lowercased page text used by `pattern-absent` / `pattern-present`. */
+  pageText: string
+  /** Page snapshot — passed verbatim to the LLM classifier prompt. */
+  snapshot: string
+  /** The page-type / domain / maturity / designSystem classification. */
+  classification: PageClassification
+  /** Operator-supplied audience / modality / regulatory hints (Layer 6). */
+  audience?: AudienceTag[]
+  modality?: ModalityTag[]
+  regulatoryContext?: RegulatoryContextTag[]
+  audienceVulnerability?: AudienceVulnerabilityTag[]
+}
+
+export interface EthicsCheckOptions {
+  /** When set, llm-classifier rules are evaluated; else skipped (deterministic-only). */
+  brain?: Brain
+  /** Optional screenshot URL/path passed alongside snapshot context (unused today). */
+  screenshotPath?: string
+  /** Logger override — defaults to console.warn for skipped rules. */
+  warn?: (msg: string) => void
+}
+
+/**
+ * Run every applicable rule against the page. Returns one violation per rule
+ * that fires. Rules whose detector is `llm-classifier` are skipped (with a
+ * warning) when no `brain` is supplied — the alternative is silent passes,
+ * which would hide ethics gaps in offline tests.
+ */
+export async function checkEthics(
+  rules: EthicsRule[],
+  ctx: EthicsCheckContext,
+  opts: EthicsCheckOptions = {},
+): Promise<EthicsViolation[]> {
+  const warn = opts.warn ?? ((m: string) => console.warn(m))
+  const violations: EthicsViolation[] = []
+  for (const rule of rules) {
+    if (!appliesWhenMatches(rule.appliesWhen, ctx)) continue
+    const fired = await runDetector(rule, ctx, opts.brain, warn)
+    if (fired) violations.push(toViolation(rule))
+  }
+  return violations
+}
+
+// Maps a fired rule to its violation record; rollup cap derives from severity.
+function toViolation(rule: EthicsRule): EthicsViolation {
+  return {
+    ruleId: rule.ruleId,
+    detected: true,
+    severity: rule.severity,
+    rollupCap: rollupCapFor(rule.severity),
+    remediation: rule.remediation,
+    ...(rule.citation ? { citation: rule.citation } : {}),
+  }
+}
+
+/**
+ * Predicate evaluator — extends the rubric loader's logic with the v2 fields
+ * (audience / modality / regulatoryContext / audienceVulnerability). All
+ * declared predicates are AND-combined.
+ */
+export function appliesWhenMatches(w: AppliesWhen, ctx: EthicsCheckContext): boolean {
+  if (w.universal) return true
+  const cls = ctx.classification
+
+  if (w.type?.length && !w.type.includes(cls.type)) return false
+  if (w.maturity?.length && !w.maturity.includes(cls.maturity)) return false
+  if (w.designSystem?.length && !w.designSystem.includes(cls.designSystem)) return false
+  if (w.domain?.length) {
+    // Domain is a case-insensitive substring match (rule "finance" matches "fintech-finance").
+    const domain = (cls.domain ?? '').toLowerCase()
+    const ok = w.domain.some(d => domain.includes(d.toLowerCase()))
+    if (!ok) return false
+  }
+  if (w.audience?.length) {
+    if (!w.audience.some(a => (ctx.audience ?? []).includes(a))) return false
+  }
+  if (w.modality?.length) {
+    if (!w.modality.some(m => (ctx.modality ?? []).includes(m))) return false
+  }
+  if (w.regulatoryContext?.length) {
+    if (!w.regulatoryContext.some(r => (ctx.regulatoryContext ?? []).includes(r))) return false
+  }
+  if (w.audienceVulnerability?.length) {
+    if (!w.audienceVulnerability.some(v => (ctx.audienceVulnerability ?? []).includes(v))) return false
+  }
+
+  // Same conservative behavior as the rubric loader: with no predicates and
+  // no `universal: true`, the rule does NOT apply (don't fire blindly).
+  const hasPredicate =
+    !!w.type?.length ||
+    !!w.maturity?.length ||
+    !!w.designSystem?.length ||
+    !!w.domain?.length ||
+    !!w.audience?.length ||
+    !!w.modality?.length ||
+    !!w.regulatoryContext?.length ||
+    !!w.audienceVulnerability?.length
+  return hasPredicate
+}
+
+// Returns true when the rule's detector fires (i.e. a violation exists).
+async function runDetector(
+  rule: EthicsRule,
+  ctx: EthicsCheckContext,
+  brain: Brain | undefined,
+  warn: (m: string) => void,
+): Promise<boolean> {
+  const d = rule.detector
+  if (d.kind === 'pattern-absent') {
+    const re = compilePattern(d.pattern)
+    return !re.test(ctx.pageText)
+  }
+  if (d.kind === 'pattern-present') {
+    const re = compilePattern(d.pattern)
+    return re.test(ctx.pageText)
+  }
+  if (d.kind === 'llm-classifier') {
+    if (!brain) {
+      warn(`ethics: skipping llm-classifier rule ${rule.ruleId} — no brain provided`)
+      return false
+    }
+    return await runLlmClassifier(brain, d.llmCheck, ctx.snapshot)
+  }
+  return false
+}
+
+// NOTE(review): patterns are unanchored substring regexes over the whole text
+// blob — short alternation members (e.g. "age", "warning") match inside
+// unrelated words ("page", "image"); consider \b anchors in rules/*.yaml.
+function compilePattern(pattern: string): RegExp {
+  // Allow already-anchored regexes; default is case-insensitive substring.
+  try {
+    return new RegExp(pattern, 'i')
+  } catch {
+    // Fallback: treat as a literal token if regex compile fails.
+    return new RegExp(pattern.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'), 'i')
+  }
+}
+
+const LLM_SYSTEM = `You are an ethics auditor. Answer the user's yes/no question about the page snapshot. Respond with exactly one token — yes or no — followed by an optional one-sentence rationale. Do not hedge. If the evidence is missing, answer no.`
+
+// Any first token other than exactly "yes" (case-insensitive) counts as "no".
+export async function runLlmClassifier(
+  brain: Brain,
+  question: string,
+  snapshot: string,
+): Promise<boolean> {
+  const truncated = snapshot.length > 8000 ? `${snapshot.slice(0, 8000)}\n…[truncated]` : snapshot
+  const user = `QUESTION: ${question}\n\nPAGE SNAPSHOT:\n${truncated}\n\nAnswer yes or no.`
+  const { text } = await brain.complete(LLM_SYSTEM, user, { maxOutputTokens: 80 })
+  const first = text.trim().toLowerCase().match(/^[a-z]+/)?.[0] ?? ''
+  return first === 'yes'
+}
+
+/** Build the lowercased text blob used by pattern detectors. URL is intentionally excluded — URL path tokens (e.g. "no-dosage") would cause false negatives on pattern-absent rules. */
+export function pageTextBlob(snapshot: string, extra?: { url?: string; title?: string }): string {
+  const parts = [snapshot, extra?.title ?? '']
+  return parts.join('\n').toLowerCase()
+}
diff --git a/src/design/audit/ethics/loader.ts b/src/design/audit/ethics/loader.ts
new file mode 100644
index 0000000..9060c4b
--- /dev/null
+++ b/src/design/audit/ethics/loader.ts
@@ -0,0 +1,213 @@
+/**
+ * Ethics rule loader — Layer 7.
+ *
+ * Loads `EthicsRule[]` from `rules/*.yaml`. Idempotent + cached: the in-memory
+ * cache keys on directory path so repeated calls (per-page audits) never re-IO.
+ *
+ * Each YAML file is a list of rule objects. The minimal parser supports the
+ * shape used in the RFC: `- key: value` items with nested objects and inline
+ * `[a, b]` lists. No external yaml dep — same approach as rubric/loader.ts.
+ */
+
+import * as fs from 'node:fs'
+import * as path from 'node:path'
+import { fileURLToPath } from 'node:url'
+import type {
+  EthicsRule,
+  EthicsCategory,
+  EthicsSeverity,
+  EthicsDetector,
+  AppliesWhen,
+} from '../v2/types.js'
+
+const __dirname = path.dirname(fileURLToPath(import.meta.url))
+const BUILTIN_RULES_DIR = path.join(__dirname, 'rules')
+
+// Keyed by resolved absolute directory path; see loadEthicsRules.
+const cache = new Map<string, EthicsRule[]>()
+
+/** Severity → rollup ceiling. critical-floor caps at 4; major-floor caps at 6. */
+export function rollupCapFor(severity: EthicsSeverity): number {
+  return severity === 'critical-floor' ? 4 : 6
+}
+
+/**
+ * Load every `*.yaml` rule file in `dir`. Cached by absolute path.
+ * Returns a stable order (sorted by filename + position within file).
+ */
+export function loadEthicsRules(dir: string = BUILTIN_RULES_DIR): EthicsRule[] {
+  const abs = path.resolve(dir)
+  const cached = cache.get(abs)
+  if (cached) return cached
+  // Missing directory is not an error — an empty rule set is cached so the
+  // existence check is also not repeated.
+  if (!fs.existsSync(abs)) {
+    cache.set(abs, [])
+    return []
+  }
+  const rules: EthicsRule[] = []
+  const files = fs.readdirSync(abs).filter(f => f.endsWith('.yaml')).sort()
+  for (const f of files) {
+    const file = path.join(abs, f)
+    const raw = fs.readFileSync(file, 'utf-8')
+    const parsed = parseRuleList(raw, file)
+    for (const r of parsed) rules.push(r)
+  }
+  cache.set(abs, rules)
+  return rules
+}
+
+/** Reset cache — test-only. */
+export function clearEthicsRuleCache(): void {
+  cache.clear()
+}
+
+// `ref` (file + item index) is threaded through for error messages.
+function parseRuleList(text: string, sourceFile: string): EthicsRule[] {
+  const items = splitTopLevelItems(text)
+  return items.map((block, idx) => parseRule(block, `${sourceFile}#${idx}`))
+}
+
+/** Split a YAML doc into top-level `- item` blocks (one block per rule). */
+function splitTopLevelItems(text: string): string[] {
+  const lines = text.split('\n')
+  const items: string[] = []
+  let current: string[] | null = null
+  for (const line of lines) {
+    // Comments/blank lines before the first `- ` item are dropped; inside an
+    // item they are kept (parseYamlBlock skips them later).
+    if (/^\s*#/.test(line) || line.trim() === '') {
+      if (current) current.push(line)
+      continue
+    }
+    if (line.startsWith('- ')) {
+      if (current) items.push(current.join('\n'))
+      current = [line.slice(2)]
+    } else if (current) {
+      // Indented continuation. Strip 2 leading spaces if present so nesting
+      // levels become consistent within the item.
+      current.push(line.startsWith('  ') ? line.slice(2) : line)
+    }
+  }
+  if (current) items.push(current.join('\n'))
+  return items
+}
+
+// Validates required fields and the severity enum; throws with the item ref
+// so a broken rule file fails loudly at load time, not at check time.
+function parseRule(block: string, ref: string): EthicsRule {
+  const meta = parseYamlBlock(block)
+  const ruleId = stringField(meta, 'ruleId', ref)
+  const category = stringField(meta, 'category', ref) as EthicsCategory
+  const severity = stringField(meta, 'severity', ref) as EthicsSeverity
+  if (severity !== 'critical-floor' && severity !== 'major-floor') {
+    throw new Error(`ethics rule ${ruleId} (${ref}): invalid severity ${severity}`)
+  }
+  const remediation = stringField(meta, 'remediation', ref)
+  const appliesWhen = (meta.appliesWhen as AppliesWhen) ?? {}
+  const detectorRaw = (meta.detector as Record<string, unknown>) ?? {}
+  const detector = parseDetector(detectorRaw, ruleId)
+  const citation = meta.citation != null ? String(meta.citation) : undefined
+  return {
+    ruleId,
+    category,
+    severity,
+    appliesWhen,
+    detector,
+    remediation,
+    ...(citation ? { citation } : {}),
+  }
+}
+
+function parseDetector(d: Record<string, unknown>, ruleId: string): EthicsDetector {
+  const kind = String(d.kind ?? '')
+  if (kind === 'pattern-absent' || kind === 'pattern-present') {
+    const pattern = String(d.pattern ?? '')
+    if (!pattern) throw new Error(`ethics rule ${ruleId}: detector.pattern required for ${kind}`)
+    return { kind, pattern }
+  }
+  if (kind === 'llm-classifier') {
+    const llmCheck = String(d.llmCheck ?? '')
+    if (!llmCheck) throw new Error(`ethics rule ${ruleId}: detector.llmCheck required for llm-classifier`)
+    return { kind, llmCheck }
+  }
+  throw new Error(`ethics rule ${ruleId}: unknown detector.kind ${kind}`)
+}
+
+function stringField(meta: Record<string, unknown>, key: string, ref: string): string {
+  const v = meta[key]
+  if (v == null || String(v) === '') {
+    throw new Error(`ethics rule (${ref}): missing required field ${key}`)
+  }
+  return String(v)
+}
+
+/**
+ * YAML block parser supporting:
+ *   key: scalar
+ *   key: [a, b]
+ *   key: |        → folded multi-line block (preserves newlines)
+ *   key: >        or just continuation lines indented under the key
+ *   key:
+ *     subkey: value
+ *     listKey: [a, b]
+ */
+function parseYamlBlock(text: string): Record<string, unknown> {
+  const lines = text.split('\n')
+  const result: Record<string, unknown> = {}
+  let i = 0
+  while (i < lines.length) {
+    const line = lines[i]
+    if (!line.trim() || line.trim().startsWith('#')) {
+      i++
+      continue
+    }
+    const m = line.match(/^([a-zA-Z][\w-]*):\s*(.*)$/)
+    if (!m) {
+      i++
+      continue
+    }
+    const [, key, valueRaw] = m
+    const value = valueRaw.trim()
+    if (value === '|' || value === '>') {
+      // Folded block: collect indented continuation lines.
+      const collected: string[] = []
+      i++
+      while (i < lines.length && (lines[i].startsWith(' ') || lines[i].trim() === '')) {
+        collected.push(lines[i].replace(/^ {2}/, ''))
+        i++
+      }
+      result[key] = collected.join(value === '|' ? '\n' : ' ').trim()
+    } else if (value === '') {
+      // Nested object — collect indented lines.
+      const nested: Record<string, unknown> = {}
+      i++
+      while (i < lines.length && (lines[i].startsWith(' ') || lines[i].trim() === '')) {
+        if (!lines[i].trim()) {
+          i++
+          continue
+        }
+        // Comment lines fail this match and are skipped harmlessly.
+        const nm = lines[i].match(/^\s+([a-zA-Z][\w-]*):\s*(.*)$/)
+        if (nm) {
+          nested[nm[1]] = parseScalarOrList(nm[2].trim())
+        }
+        i++
+      }
+      result[key] = nested
+    } else {
+      result[key] = parseScalarOrList(value)
+      i++
+    }
+  }
+  return result
+}
+
+// Inline `[a, b]` lists, quoted scalars, booleans, and numbers; anything else
+// is returned as the raw string.
+function parseScalarOrList(value: string): unknown {
+  if (value.startsWith('[') && value.endsWith(']')) {
+    return value
+      .slice(1, -1)
+      .split(',')
+      .map(s => s.trim().replace(/^['"]|['"]$/g, ''))
+      .filter(Boolean)
+  }
+  // Strip surrounding quotes for plain scalars.
+  if ((value.startsWith("'") && value.endsWith("'")) || (value.startsWith('"') && value.endsWith('"'))) {
+    return value.slice(1, -1)
+  }
+  if (value === 'true') return true
+  if (value === 'false') return false
+  if (/^-?\d+(\.\d+)?$/.test(value)) return Number(value)
+  return value
+}
diff --git a/src/design/audit/ethics/rules/finance.yaml b/src/design/audit/ethics/rules/finance.yaml
new file mode 100644
index 0000000..fa385e6
--- /dev/null
+++ b/src/design/audit/ethics/rules/finance.yaml
@@ -0,0 +1,37 @@
+# Financial product ethics rules.
+# Anchored to CFPB Reg E / Reg Z, EU PSD2, and SEC Rule 10b-10 disclosures.
+
+- ruleId: finance:fees-disclosed-pre-commitment
+  category: finance
+  severity: critical-floor
+  appliesWhen:
+    type: [ecommerce]
+    domain: [fintech, finance, banking, payments]
+  detector:
+    kind: llm-classifier
+    llmCheck: Does this checkout page hide, defer, or fail to itemize any fees, taxes, FX, or charges that the user will pay, with the pay/commit button reachable before the totals are fully visible?
+  remediation: Surface every line item (fees, taxes, FX) above the pay button.
+  citation: CFPB 12 CFR 1026 Reg Z / FTC Junk Fees Rule
+
+- ruleId: finance:apr-or-effective-rate-shown
+  category: finance
+  severity: major-floor
+  appliesWhen:
+    domain: [fintech, finance, banking, payments, lending]
+  detector:
+    kind: pattern-absent
+    # NOTE(review): unanchored case-insensitive regex — "APR" also matches
+    # inside words like "April"; consider \b anchors. TODO confirm.
+    pattern: (APR|annual percentage rate|effective (interest )?rate|finance charge)
+  remediation: Display APR or effective interest rate alongside any credit, lending, or yield-bearing offer.
+  citation: TILA 15 USC 1638 / CFPB 12 CFR 1026.18
+
+- ruleId: finance:risk-disclosure-required
+  category: finance
+  severity: major-floor
+  appliesWhen:
+    domain: [fintech, finance, crypto, investing, trading]
+    audienceVulnerability: [high-stakes-financial]
+  detector:
+    kind: llm-classifier
+    llmCheck: Does this trading / investing / staking page allow the user to commit funds WITHOUT a clearly visible disclosure of principal-loss, volatility, or non-FDIC-insurance risk above the action button?
+  remediation: Show risk-of-loss and insurance-status disclosures above the action button on any investing, trading, or staking surface.
+  citation: SEC Rule 10b-10 / FINRA Rule 2210
diff --git a/src/design/audit/ethics/rules/kids.yaml b/src/design/audit/ethics/rules/kids.yaml
new file mode 100644
index 0000000..d6473a3
--- /dev/null
+++ b/src/design/audit/ethics/rules/kids.yaml
@@ -0,0 +1,37 @@
+# Kids-targeted product ethics rules.
+# Anchored to COPPA + UK ICO Age Appropriate Design Code (AADC).
+
+- ruleId: kids:dark-patterns-prohibited
+  category: kids
+  severity: critical-floor
+  appliesWhen:
+    audience: [kids]
+  detector:
+    kind: llm-classifier
+    llmCheck: Does this page use any dark pattern (hidden costs, forced action, fake urgency, confirmshaming, manipulated visual hierarchy) targeting a minor user?
+  remediation: Remove all dark patterns. Use clear, age-appropriate, friction-symmetric flows.
+  citation: COPPA 16 CFR 312.5 / FTC Dark Patterns Report 2022
+
+- ruleId: kids:age-gate-required
+  category: kids
+  severity: critical-floor
+  appliesWhen:
+    audience: [kids]
+    audienceVulnerability: [minor-facing]
+  detector:
+    kind: pattern-absent
+    # NOTE(review): /age/i matches substrings of very common words ("page",
+    # "image", "message"), so this pattern-absent rule will almost never fire.
+    # Consider word-boundary anchors, e.g. \bage gate\b|\bdate of birth\b.
+    pattern: (age|date of birth|verify your age)
+  remediation: Implement an age gate before collecting any data or showing user-generated content.
+  citation: COPPA 16 CFR 312.5
+
+- ruleId: kids:data-minimization-default
+  category: kids
+  severity: major-floor
+  appliesWhen:
+    audience: [kids]
+    regulatoryContext: [coppa]
+  detector:
+    kind: llm-classifier
+    llmCheck: Does this page request data fields from a child that are NOT strictly necessary for the activity (e.g. unmarked optional fields, profile / social / friend-graph fields, location or contact data not required for the task)?
+  remediation: Default to data minimization for child users — collect only what is strictly necessary, mark every optional field, and disable profile/social fields by default.
+  citation: ICO AADC Standard 8 / COPPA 16 CFR 312.7
diff --git a/src/design/audit/ethics/rules/legal.yaml b/src/design/audit/ethics/rules/legal.yaml
new file mode 100644
index 0000000..4df0e3e
--- /dev/null
+++ b/src/design/audit/ethics/rules/legal.yaml
@@ -0,0 +1,35 @@
+# Legal / regulatory consent rules.
+# Anchored to GDPR, ePrivacy Directive, and CCPA/CPRA notice-at-collection.
+
+- ruleId: legal:gdpr-cookie-consent
+  category: legal
+  severity: major-floor
+  appliesWhen:
+    regulatoryContext: [gdpr]
+  detector:
+    kind: pattern-absent
+    # NOTE(review): "necessary" / "preferences" are common words unrelated to
+    # consent — this rule may be satisfied by incidental copy. TODO tighten.
+    pattern: (cookie|consent|necessary|preferences)
+  remediation: Display GDPR-compliant cookie consent banner with granular controls.
+  citation: GDPR Art. 7 / ePrivacy Directive 2002/58/EC Art. 5(3)
+
+- ruleId: legal:privacy-policy-link-required
+  category: legal
+  severity: major-floor
+  appliesWhen:
+    regulatoryContext: [gdpr]
+  detector:
+    kind: pattern-absent
+    pattern: (privacy policy|privacy notice|data protection)
+  remediation: Provide a clearly labeled privacy policy / notice link on every page that processes personal data.
+  citation: GDPR Art. 13-14
+
+- ruleId: legal:ccpa-do-not-sell-link
+  category: legal
+  severity: major-floor
+  appliesWhen:
+    # NOTE(review): CCPA/CPRA rule gated on the gdpr context tag — confirm
+    # this is deliberate (no `ccpa` tag in RegulatoryContextTag?) or add one.
+    regulatoryContext: [gdpr]
+  detector:
+    kind: pattern-absent
+    pattern: (do not sell|do not share|opt.?out|your privacy choices)
+  remediation: Provide a "Do Not Sell or Share My Personal Information" / "Your Privacy Choices" link in the global footer.
+  citation: CCPA Cal. Civ. Code 1798.135 / CPRA 1798.121
diff --git a/src/design/audit/ethics/rules/medical.yaml b/src/design/audit/ethics/rules/medical.yaml
new file mode 100644
index 0000000..64d9a1f
--- /dev/null
+++ b/src/design/audit/ethics/rules/medical.yaml
@@ -0,0 +1,36 @@
+# Medical / clinical / pharmacy ethics rules.
+# Anchored to FDA labeling regs, ICH E2 pharmacovigilance, and ONC info-blocking.
+
+- ruleId: medical:dosage-warning-required
+  category: medical
+  severity: critical-floor
+  appliesWhen:
+    domain: [medical, clinical, pharmacy, health]
+  detector:
+    kind: pattern-absent
+    # NOTE(review): "warning" alone is a very common word — any page with a
+    # generic warning satisfies this rule without dosage info. TODO tighten.
+    pattern: (dosage|warning|contraindication|adverse|side effect)
+  remediation: Display dosage warnings, contraindications, and adverse-effect summaries before any prescription action.
+  citation: FDA 21 CFR 201.57
+
+- ruleId: medical:adverse-event-reporting-path
+  category: medical
+  severity: major-floor
+  appliesWhen:
+    domain: [medical, clinical, pharmacy, health]
+  detector:
+    kind: pattern-absent
+    pattern: (report.{0,20}(side effect|adverse|reaction)|MedWatch|FDA 1088|pharmacovigilance)
+  remediation: Provide a visible adverse-event reporting path (MedWatch link or equivalent) on any prescribing or dispensing surface.
This module decides when to trigger first-principles mode
+ * and queues NovelPatternObservations for fleet mining.
+ */
+
+import * as fs from 'node:fs'
+import * as fsp from 'node:fs/promises'
+import * as path from 'node:path'
+import * as os from 'node:os'
+import * as crypto from 'node:crypto'
+import type { EnsembleClassification, NovelPatternObservation, PageType } from './v2/types.js'
+
+export interface FirstPrinciplesOptions {
+  /** Override the minimum ensemble confidence threshold (default 0.6). */
+  confidenceThreshold?: number
+}
+
+/**
+ * Returns true when first-principles mode should fire.
+ *
+ * Trigger conditions (ANY of):
+ *   - ensembleConfidence < threshold (default 0.6)
+ *   - signalsAgreed === false
+ *   - classification.type === 'unknown'
+ *   - LLM explicitly emitted first_principles_mode: true
+ */
+export function shouldTriggerFirstPrinciples(
+  classification: EnsembleClassification,
+  opts?: FirstPrinciplesOptions,
+): boolean {
+  const threshold = opts?.confidenceThreshold ?? 0.6
+  if (classification.ensembleConfidence < threshold) return true
+  if (!classification.signalsAgreed) return true
+  // Cast because 'unknown' may sit outside the PageType union.
+  if ((classification.type as string) === 'unknown') return true
+  if (classification.firstPrinciplesMode) return true
+  return false
+}
+
+/**
+ * Build a NovelPatternObservation from the classification and runtime context.
+ * The `observationId` is stable: same pageRef + capturedAt minute → same id.
+ */
+export function buildNovelPatternObservation(args: {
+  classification: EnsembleClassification
+  pageRef: string
+  observedSignals?: string
+  snapshotKey?: string
+}): NovelPatternObservation {
+  const capturedAt = new Date().toISOString()
+  // slice(0, 16) = "YYYY-MM-DDTHH:MM" — minute resolution for the stable id.
+  const observationId = crypto
+    .createHash('sha256')
+    .update(`${args.pageRef}::${capturedAt.slice(0, 16)}`)
+    .digest('hex')
+    .slice(0, 16)
+
+  return {
+    observationId,
+    capturedAt,
+    observed: args.observedSignals ?? 'No specific signal description provided.',
+    closestType: args.classification.type as PageType,
+    closestConfidence: args.classification.ensembleConfidence,
+    pageRef: args.pageRef,
+    ...(args.snapshotKey ? { snapshotKey: args.snapshotKey } : {}),
+  }
+}
+
+/**
+ * Append a NovelPatternObservation as a JSONL line to the date-stamped sink.
+ * Default dir: `~/.bad/novel-patterns/`. Each line is valid JSON on its own.
+ */
+export async function appendNovelPatternObservation(
+  observation: NovelPatternObservation,
+  dir?: string,
+): Promise<void> {
+  const sinkDir = dir ?? path.join(os.homedir(), '.bad', 'novel-patterns')
+  await fsp.mkdir(sinkDir, { recursive: true })
+  const date = observation.capturedAt.slice(0, 10)
+  const filePath = path.join(sinkDir, `${date}.jsonl`)
+  const line = JSON.stringify(observation) + '\n'
+  await fsp.appendFile(filePath, line, 'utf-8')
+}
+
+/**
+ * Synchronous variant — for use in pipeline paths that aren't async.
+ * Must stay in lockstep with appendNovelPatternObservation (same dir, same
+ * date-stamped file naming).
+ */
+export function appendNovelPatternObservationSync(
+  observation: NovelPatternObservation,
+  dir?: string,
+): void {
+  const sinkDir = dir ?? path.join(os.homedir(), '.bad', 'novel-patterns')
+  fs.mkdirSync(sinkDir, { recursive: true })
+  const date = observation.capturedAt.slice(0, 10)
+  const filePath = path.join(sinkDir, `${date}.jsonl`)
+  fs.appendFileSync(filePath, JSON.stringify(observation) + '\n', 'utf-8')
+}
diff --git a/src/design/audit/modality/android.ts b/src/design/audit/modality/android.ts
new file mode 100644
index 0000000..4ec62e6
--- /dev/null
+++ b/src/design/audit/modality/android.ts
@@ -0,0 +1,24 @@
+/**
+ * Layer 8 — Android modality adapter (stub).
+ *
+ * UI Automator + accessibility-tree capture. Not yet implemented.
+ *
+ * TODO Layer 8: UI Automator bridge, emulator management, ax-tree capture.
+ */
+
+import type { ModalityAdapter, ModalityInput, Evidence } from '../v2/types.js'
+
+export class AndroidModalityAdapter implements ModalityAdapter {
+  readonly modality = 'android' as const
+
+  // Always throws — stub until the Layer 8 Android bridge lands.
+  async capture(_input: ModalityInput): Promise<Evidence> {
+    throw new Error(
+      'Android modality adapter is not yet implemented. ' +
+        'See RFC-002 Layer 8 for the implementation plan. ' +
+        'Ship iOS first per the RFC sequencing note. ' +
+        'Use --modality html for web audits.',
+    )
+  }
+}
+
+export const androidAdapter = new AndroidModalityAdapter()
diff --git a/src/design/audit/modality/html.ts b/src/design/audit/modality/html.ts
new file mode 100644
index 0000000..ae32532
--- /dev/null
+++ b/src/design/audit/modality/html.ts
@@ -0,0 +1,56 @@
+/**
+ * Layer 8 — HTML modality adapter.
+ *
+ * Wraps the existing Playwright-based capture pipeline into the `ModalityAdapter`
+ * interface so it can participate in the unified scoring framework. The underlying
+ * pipeline is unchanged; this module provides the typed adapter boundary.
+ */
+
+import type { ModalityAdapter, ModalityInput, Evidence, MeasurementBundle } from '../v2/types.js'
+
+export class HtmlModalityAdapter implements ModalityAdapter {
+  readonly modality = 'html' as const
+
+  /**
+   * Capture HTML evidence. Delegates to the existing browser-based pipeline.
+   * In practice, `pipeline.ts` drives this; the adapter exists to make the
+   * interface explicit and enable Layer 8's modality dispatch.
+   *
+   * @param input.entryPoint - URL to audit
+   * @param input.flow - optional page flow (multi-page audit)
+   */
+  async capture(input: ModalityInput): Promise<Evidence> {
+    // The real implementation lives in pipeline.ts / measure/index.ts.
+    // This adapter records the contract and is called by the pipeline dispatcher.
+    // When a caller invokes adapter.capture() directly, it returns a shell
+    // Evidence that the pipeline will hydrate with real snapshot + measurements.
+    const shell: Evidence = {
+      modality: 'html',
+      surfaces: [],
+      measurements: emptyMeasurementBundle(),
+      snapshot: '',
+      screenshot: undefined,
+    }
+    void input
+    return shell
+  }
+}
+
+// Zeroed measurement shell. NOTE(review): `ran: true` with zero violations on
+// an un-hydrated shell reads as a clean a11y pass — confirm the pipeline
+// always overwrites this before scoring, or flip to `ran: false`.
+function emptyMeasurementBundle(): MeasurementBundle {
+  return {
+    contrast: {
+      totalChecked: 0,
+      aaFailures: [],
+      aaaFailures: [],
+      summary: { aaPassRate: 1, aaaPassRate: 1 },
+    },
+    a11y: {
+      ran: true,
+      violations: [],
+      passes: 0,
+    },
+    hasBlockingIssues: false,
+  }
+}
+
+export const htmlAdapter = new HtmlModalityAdapter()
diff --git a/src/design/audit/modality/index.ts b/src/design/audit/modality/index.ts
new file mode 100644
index 0000000..a95adca
--- /dev/null
+++ b/src/design/audit/modality/index.ts
@@ -0,0 +1,19 @@
+import type { Modality, ModalityAdapter } from './types.js'
+import { htmlAdapter } from './html.js'
+import { iosAdapter } from './ios.js'
+import { androidAdapter } from './android.js'
+
+// Terminal and voice are inline throw-stubs; html is the only working adapter.
+const ADAPTERS: Record<Modality, ModalityAdapter> = {
+  html: htmlAdapter,
+  ios: iosAdapter,
+  android: androidAdapter,
+  terminal: { modality: 'terminal', capture: async () => { throw new Error('terminal modality not implemented') } },
+  voice: { modality: 'voice', capture: async () => { throw new Error('voice modality not implemented') } },
+}
+
+export function getModalityAdapter(modality: Modality): ModalityAdapter {
+  return ADAPTERS[modality]
+}
+
+export { htmlAdapter, iosAdapter, androidAdapter }
+export type { Modality, ModalityAdapter }
diff --git a/src/design/audit/modality/ios.ts b/src/design/audit/modality/ios.ts
new file mode 100644
index 0000000..83fce5e
--- /dev/null
+++ b/src/design/audit/modality/ios.ts
@@ -0,0 +1,25 @@
+/**
+ * Layer 8 — iOS modality adapter (stub).
+ *
+ * XCUITest + accessibility-tree capture. Not yet implemented.
+ * Ship the interface so CLI dispatch and type-checking work; native
+ * bridging will be added once the HTML adapter's abstraction is validated.
+ *
+ * TODO Layer 8: XCUITest bridge, simulator management, ax-tree capture.
+ */
+
+import type { ModalityAdapter, ModalityInput, Evidence } from '../v2/types.js'
+
+export class IosModalityAdapter implements ModalityAdapter {
+  readonly modality = 'ios' as const
+
+  // Always throws — stub until the Layer 8 XCUITest bridge lands.
+  async capture(_input: ModalityInput): Promise<Evidence> {
+    throw new Error(
+      'iOS modality adapter is not yet implemented. ' +
+        'See RFC-002 Layer 8 for the implementation plan. ' +
+        'Use --modality html for web audits.',
+    )
+  }
+}
+
+export const iosAdapter = new IosModalityAdapter()
diff --git a/src/design/audit/modality/types.ts b/src/design/audit/modality/types.ts
new file mode 100644
index 0000000..d769314
--- /dev/null
+++ b/src/design/audit/modality/types.ts
@@ -0,0 +1,16 @@
+/**
+ * Layer 8 — Modality adapter type contract.
+ *
+ * Re-exports the stable shapes from v2/types.ts. Each adapter (HTML, iOS,
+ * Android) implements the ModalityAdapter interface and produces an Evidence
+ * record that flows into the shared Layers 1–7 scoring pipeline unchanged.
+ */
+
+export type {
+  Modality,
+  ModalityAdapter,
+  ModalityInput,
+  Evidence,
+  SurfaceRecord,
+  SurfaceMeasurements,
+} from '../v2/types.js'
diff --git a/src/design/audit/patches/index.ts b/src/design/audit/patches/index.ts
new file mode 100644
index 0000000..de59a33
--- /dev/null
+++ b/src/design/audit/patches/index.ts
@@ -0,0 +1,7 @@
+export { parsePatch, parsePatches } from './parse.js'
+export { validatePatch, validatePatches } from './validate.js'
+export { renderUnifiedDiff, renderPatchSummary } from './render.js'
+export { enforcePatchPolicy } from './severity-enforcement.js'
+export type { ParseResult } from './parse.js'
+export type { ValidationResult, ValidationReason } from './validate.js'
+export type { EnforcementResult, EnforcementRecord } from './severity-enforcement.js'
diff --git a/src/design/audit/patches/parse.ts b/src/design/audit/patches/parse.ts
new file mode 100644
index 0000000..12f9a04
--- /dev/null
+++ b/src/design/audit/patches/parse.ts
@@ -0,0 +1,165 @@
+/**
+ * Patch parser — converts raw LLM JSON output into typed `Patch` objects.
+ *
+ * Strict shape validation. On schema mismatch returns `{ patch: null, reason }`
+ * rather than throwing — the calling pipeline batches many candidate patches
+ * per audit and a single malformed entry must not abort the whole run.
+ */
+
+import type {
+  Patch,
+  PatchRollback,
+  PatchRollbackKind,
+  PatchTarget,
+  PatchTest,
+  PatchTestKind,
+  ConfidenceLevel,
+  Dimension,
+} from '../v2/types.js'
+
+type PatchScope = 'page' | 'section' | 'component' | 'system'
+type PatchTargetScope = 'tsx' | 'jsx' | 'css' | 'tailwind' | 'module-css' | 'styled-component' | 'structural' | 'html'
+type PatchDeltaConfidence = ConfidenceLevel | 'untested'
+
+const VALID_SCOPES: PatchScope[] = ['page', 'section', 'component', 'system']
+const VALID_TARGET_SCOPES: PatchTargetScope[] = [
+  'tsx', 'jsx', 'css', 'tailwind', 'module-css', 'styled-component', 'structural', 'html',
+]
+const VALID_TEST_KINDS: PatchTestKind[] = [
+  'storybook', 'a11y-rule', 'visual-snapshot', 'unit', 'rerun-audit', 'manual',
+]
+const VALID_ROLLBACK_KINDS: PatchRollbackKind[] = ['git-revert', 'css-disable', 'manual']
+const VALID_CONFIDENCES: PatchDeltaConfidence[] = ['high', 'medium', 'low', 'untested']
+
+export interface ParseResult {
+  patch: Patch | null
+  reason?: string
+}
+
+function isObject(value: unknown): value is Record<string, unknown> {
+  return typeof value === 'object' && value !== null && !Array.isArray(value)
+}
+
+// "non-empty string" — empty strings are rejected everywhere this is used.
+function isString(value: unknown): value is string {
+  return typeof value === 'string' && value.length > 0
+}
+
+function oneOf<T extends string>(value: unknown, allowed: readonly T[]): value is T {
+  return typeof value === 'string' && (allowed as readonly string[]).includes(value)
+}
+
+// Helper parsers return the parsed value on success or an error string on
+// failure; callers check `typeof result === 'string'` to detect failure.
+function parseTarget(raw: unknown): PatchTarget | string {
+  if (!isObject(raw)) return 'target: not an object'
+  if (!oneOf(raw.scope, VALID_TARGET_SCOPES)) return `target.scope: invalid (got ${String(raw.scope)})`
+  const target: PatchTarget = { scope: raw.scope }
+  if (raw.filePath !== undefined) {
+    if (!isString(raw.filePath)) return 'target.filePath: must be non-empty string'
+    target.filePath = raw.filePath
+  }
+  if (raw.componentName !== undefined) {
+    if (!isString(raw.componentName)) return 'target.componentName: must be non-empty string'
+    target.componentName = raw.componentName
+  }
+  if (raw.cssSelector !== undefined) {
+    if (!isString(raw.cssSelector)) return 'target.cssSelector: must be non-empty string'
+    target.cssSelector = raw.cssSelector
+  }
+  return target
+}
+
+function parseTest(raw: unknown): PatchTest | string {
+  if (!isObject(raw)) return 'testThatProves: not an object'
+  if (!oneOf(raw.kind, VALID_TEST_KINDS)) return `testThatProves.kind: invalid (got ${String(raw.kind)})`
+  if (!isString(raw.description)) return 'testThatProves.description: must be non-empty string'
+  const test: PatchTest = { kind: raw.kind, description: raw.description }
+  if (raw.command !== undefined) {
+    if (typeof raw.command !== 'string') return 'testThatProves.command: must be string when present'
+    test.command = raw.command
+  }
+  return test
+}
+
+function parseRollback(raw: unknown): PatchRollback | string {
+  if (!isObject(raw)) return 'rollback: not an object'
+  if (!oneOf(raw.kind, VALID_ROLLBACK_KINDS)) return `rollback.kind: invalid (got ${String(raw.kind)})`
+  const rollback: PatchRollback = { kind: raw.kind }
+  if (raw.instruction !== undefined) {
+    if (typeof raw.instruction !== 'string') return 'rollback.instruction: must be string when present'
+    rollback.instruction = raw.instruction
+  }
+  return rollback
+}
+
+/**
+ * Parse a single raw LLM-produced object into a `Patch`. Returns
+ * `{ patch: null, reason }` on any schema violation.
+ */
+export function parsePatch(raw: unknown): ParseResult {
+  if (!isObject(raw)) return { patch: null, reason: 'patch: not an object' }
+  if (!isString(raw.patchId)) return { patch: null, reason: 'patchId: required non-empty string' }
+  if (!isString(raw.findingId)) return { patch: null, reason: 'findingId: required non-empty string' }
+  if (!oneOf(raw.scope, VALID_SCOPES)) return { patch: null, reason: `scope: invalid (got ${String(raw.scope)})` }
+
+  const target = parseTarget(raw.target)
+  if (typeof target === 'string') return { patch: null, reason: target }
+
+  if (!isObject(raw.diff)) return { patch: null, reason: 'diff: not an object' }
+  // `before` must be non-empty (it anchors the search-replace); `after` may be
+  // empty — an empty string is a valid deletion.
+  if (!isString(raw.diff.before)) return { patch: null, reason: 'diff.before: required non-empty string' }
+  if (typeof raw.diff.after !== 'string') return { patch: null, reason: 'diff.after: required string' }
+  const diff = {
+    before: raw.diff.before,
+    after: raw.diff.after,
+    ...(typeof raw.diff.unifiedDiff === 'string' ? { unifiedDiff: raw.diff.unifiedDiff } : {}),
+  }
+
+  const test = parseTest(raw.testThatProves)
+  if (typeof test === 'string') return { patch: null, reason: test }
+
+  const rollback = parseRollback(raw.rollback)
+  if (typeof rollback === 'string') return { patch: null, reason: rollback }
+
+  if (!isObject(raw.estimatedDelta)) return { patch: null, reason: 'estimatedDelta: not an object' }
+  if (!isString(raw.estimatedDelta.dim)) return { patch: null, reason: 'estimatedDelta.dim: required' }
+  if (typeof raw.estimatedDelta.delta !== 'number' || !Number.isFinite(raw.estimatedDelta.delta)) {
+    return { patch: null, reason: 'estimatedDelta.delta: must be finite number' }
+  }
+
+  if (!oneOf(raw.estimatedDeltaConfidence, VALID_CONFIDENCES)) {
+    return { patch: null, reason: `estimatedDeltaConfidence: invalid (got ${String(raw.estimatedDeltaConfidence)})` }
+  }
+
+  const patch: Patch = {
+    patchId: raw.patchId,
+    findingId: raw.findingId,
+    scope: raw.scope,
+    target,
+    diff,
+    testThatProves: test,
+    rollback,
+    // NOTE(review): `dim` is cast without checking membership in the
+    // Dimension union — confirm upstream validation or add a VALID_DIMS list.
+    estimatedDelta: { dim: raw.estimatedDelta.dim as Dimension, delta: raw.estimatedDelta.delta },
+    estimatedDeltaConfidence: raw.estimatedDeltaConfidence,
+    ...(typeof raw.matchedPatternId === 'string' ? { matchedPatternId: raw.matchedPatternId } : {}),
+  }
+  return { patch }
+}
+
+/**
+ * Parse an array of raw patch objects. Invalid entries are dropped from the
+ * returned `patches` and reported in `errors` with their original index.
+ * A non-array input is reported with sentinel index -1.
+ */
+export function parsePatches(raw: unknown): {
+  patches: Patch[]
+  errors: Array<{ index: number; reason: string }>
+} {
+  if (!Array.isArray(raw)) {
+    return { patches: [], errors: [{ index: -1, reason: 'patches: not an array' }] }
+  }
+  const patches: Patch[] = []
+  const errors: Array<{ index: number; reason: string }> = []
+  for (let i = 0; i < raw.length; i++) {
+    const result = parsePatch(raw[i])
+    if (result.patch) patches.push(result.patch)
+    else errors.push({ index: i, reason: result.reason ?? 'unknown' })
+  }
+  return { patches, errors }
+}
diff --git a/src/design/audit/patches/render.ts b/src/design/audit/patches/render.ts
new file mode 100644
index 0000000..5b11f77
--- /dev/null
+++ b/src/design/audit/patches/render.ts
@@ -0,0 +1,57 @@
+/**
+ * Patch renderer — produces a unified diff from a Patch when filePath is known.
+ *
+ * Agents can pipe the result to `git apply --check` then `git apply`.
+ * When filePath is unknown, returns null — the agent must use before/after for
+ * search-replace instead.
+ */
+
+import type { Patch } from '../v2/types.js'
+
+/**
+ * Render a minimal unified diff (1-hunk, 3 lines context) from a patch.
+ * Returns null when: + * - `target.filePath` is not set (no file to diff against) + * - `unifiedDiff` is already set on the patch (prefer the LLM's version) + */ +export function renderUnifiedDiff(patch: Patch): string | null { + if (patch.diff.unifiedDiff) return patch.diff.unifiedDiff + if (!patch.target.filePath) return null + + const { before, after } = patch.diff + const filePath = patch.target.filePath + + const beforeLines = before.split('\n') + const afterLines = after.split('\n') + + const removals = beforeLines.map(l => `- ${l}`) + const additions = afterLines.map(l => `+ ${l}`) + + const hunkOldLen = beforeLines.length + const hunkNewLen = afterLines.length + + return [ + `--- a/${filePath}`, + `+++ b/${filePath}`, + `@@ -1,${hunkOldLen} +1,${hunkNewLen} @@`, + ...removals, + ...additions, + ].join('\n') +} + +/** + * Render a human-readable patch summary for display in report.md. + */ +export function renderPatchSummary(patch: Patch): string { + const parts: string[] = [] + parts.push(`**Patch ${patch.patchId}** (${patch.scope})`) + if (patch.target.filePath) parts.push(`File: \`${patch.target.filePath}\``) + else if (patch.target.cssSelector) parts.push(`Selector: \`${patch.target.cssSelector}\``) + else if (patch.target.componentName) parts.push(`Component: \`${patch.target.componentName}\``) + parts.push(`\`\`\`diff\n${renderUnifiedDiff(patch) ?? `- ${patch.diff.before}\n+ ${patch.diff.after}`}\n\`\`\``) + parts.push(`Test: ${patch.testThatProves.description}`) + if (patch.testThatProves.command) parts.push(`Command: \`${patch.testThatProves.command}\``) + parts.push(`Rollback: ${patch.rollback.kind}${patch.rollback.instruction ? ` — ${patch.rollback.instruction}` : ''}`) + parts.push(`Estimated Δ: ${patch.estimatedDelta.dim} ${patch.estimatedDelta.delta > 0 ? 
'+' : ''}${patch.estimatedDelta.delta} (confidence: ${patch.estimatedDeltaConfidence})`) + return parts.join('\n') +} diff --git a/src/design/audit/patches/severity-enforcement.ts b/src/design/audit/patches/severity-enforcement.ts new file mode 100644 index 0000000..183ecfe --- /dev/null +++ b/src/design/audit/patches/severity-enforcement.ts @@ -0,0 +1,63 @@ +/** + * Severity enforcement — every major/critical finding MUST have ≥1 valid patch. + * + * Findings without patches are downgraded to `minor` with an explanatory note. + * This runs as a post-processing step after patch validation. + */ + +import type { Patch, DesignFinding } from '../v2/types.js' + +export interface EnforcementRecord { + findingId: string + fromSeverity: string + toSeverity: 'minor' + reason: string +} + +export interface EnforcementResult { + findings: DesignFinding[] + downgraded: EnforcementRecord[] +} + +/** + * Given a list of findings and the set of valid patches (post-validation), + * downgrade any major/critical finding that has no valid patch to `minor`. + */ +export function enforcePatchPolicy( + findings: DesignFinding[], + validPatchIds: Set, +): EnforcementResult { + const downgraded: EnforcementRecord[] = [] + + const updated = findings.map(f => { + if (f.severity !== 'major' && f.severity !== 'critical') return f + + const v2Finding = f as DesignFinding & { patches?: Patch[] } + const patches = v2Finding.patches ?? [] + const hasValidPatch = patches.some(p => validPatchIds.has(p.patchId)) + + if (hasValidPatch) return f + + downgraded.push({ + findingId: f.id, + fromSeverity: f.severity, + toSeverity: 'minor', + reason: patches.length === 0 + ? 
'no patches proposed' + : 'all proposed patches failed validation (before not in snapshot, missing locator, or delta out of range)', + }) + + return { + ...f, + severity: 'minor' as const, + suggestion: [ + f.suggestion, + '[auto-downgraded: patch required for major/critical severity]', + ] + .filter(Boolean) + .join(' '), + } + }) + + return { findings: updated, downgraded } +} diff --git a/src/design/audit/patches/validate.ts b/src/design/audit/patches/validate.ts new file mode 100644 index 0000000..5089c99 --- /dev/null +++ b/src/design/audit/patches/validate.ts @@ -0,0 +1,74 @@ +/** + * Patch validator — given a parsed patch and the page snapshot text, verify + * that the patch is grounded and applyable. + * + * Rules: + * - `diff.before` must appear as a case-sensitive substring of the snapshot. + * Agents apply patches literally; a hallucinated `before` is unfixable. + * - `target` must carry at least one locator (cssSelector | filePath | + * componentName). Without one the agent has nowhere to apply. + * - `estimatedDelta.delta` must be in [-3, 3]. Larger claims are almost + * always over-confident on a 1–10 scale. + */ + +import type { Patch } from '../v2/types.js' + +export type ValidationReason = + | 'before-not-in-snapshot' + | 'target-missing-locator' + | 'estimated-delta-out-of-range' + | 'before-empty' + +export interface ValidationResult { + valid: boolean + reasons: ValidationReason[] +} + +const DELTA_MIN = -3 +const DELTA_MAX = 3 + +/** + * Validate a single patch against a page snapshot. Reports all issues in one + * pass so callers can surface every problem to the agent at once. 
+ */ +export function validatePatch(patch: Patch, snapshot: string): ValidationResult { + const reasons: ValidationReason[] = [] + const { target, diff, estimatedDelta } = patch + + if (!target.cssSelector && !target.filePath && !target.componentName) { + reasons.push('target-missing-locator') + } + + if (diff.before.length === 0) { + reasons.push('before-empty') + } else if (!snapshot.includes(diff.before)) { + reasons.push('before-not-in-snapshot') + } + + if ( + estimatedDelta.delta < DELTA_MIN || + estimatedDelta.delta > DELTA_MAX || + !Number.isFinite(estimatedDelta.delta) + ) { + reasons.push('estimated-delta-out-of-range') + } + + return { valid: reasons.length === 0, reasons } +} + +/** + * Validate a list of patches and partition into valid / invalid. + */ +export function validatePatches( + patches: Patch[], + snapshot: string, +): { valid: Patch[]; invalid: Array<{ patch: Patch; reasons: ValidationReason[] }> } { + const valid: Patch[] = [] + const invalid: Array<{ patch: Patch; reasons: ValidationReason[] }> = [] + for (const patch of patches) { + const result = validatePatch(patch, snapshot) + if (result.valid) valid.push(patch) + else invalid.push({ patch, reasons: result.reasons }) + } + return { valid, invalid } +} diff --git a/src/design/audit/patterns/match.ts b/src/design/audit/patterns/match.ts new file mode 100644 index 0000000..cb73d91 --- /dev/null +++ b/src/design/audit/patterns/match.ts @@ -0,0 +1,61 @@ +/** + * Layer 5 — Pattern matching. + * + * Fuzzy-matches a page against catalogued patterns. When patterns exist (post + * fleet accumulation), findings include `matchedPatterns[]` so agents can cite + * fleet evidence rather than applying novel patches. + * + * Currently returns [] (cold-start). The interface is stable. 
+ */ + +import type { Pattern, PatternMatch, PatternQuery } from './types.js' +import type { PageType, Dimension } from '../v2/types.js' +import { queryPatterns } from './store.js' + +export interface MatchContext { + pageType: PageType + weakDimensions: Dimension[] + dir?: string +} + +/** + * Match patterns against the current page context. Returns the top-N matches + * ordered by expected leverage (weakest dim × pattern's median delta for that dim). + * + * Cold-start: returns [] until patterns are mined. + */ +export async function matchPatterns( + ctx: MatchContext, + topN: number = 5, +): Promise { + const query: PatternQuery = { + pageType: ctx.pageType, + minApplications: 5, + minSuccessRate: 0.5, + } + const candidates = await queryPatterns(query, ctx.dir) + if (candidates.length === 0) return [] + + const scored: Array<{ pattern: Pattern; leverage: number }> = candidates.map(p => { + const leverage = ctx.weakDimensions.reduce((sum, dim) => { + return sum + (p.fleetEvidence.medianDimDelta[dim] ?? 0) + }, 0) + return { pattern: p, leverage } + }) + + return scored + .sort((a, b) => b.leverage - a.leverage) + .slice(0, topN) + .map(({ pattern, leverage }) => { + const expectedDelta: Record = {} as Record + for (const dim of ctx.weakDimensions) { + expectedDelta[dim] = pattern.fleetEvidence.medianDimDelta[dim] ?? 0 + } + return { + pattern, + matchConfidence: Math.min(1, leverage / 10), + expectedDelta, + applicationGuidance: `Apply ${pattern.scaffold.description}. Key decisions: ${pattern.scaffold.keyDecisions.join('; ')}.`, + } + }) +} diff --git a/src/design/audit/patterns/mine.ts b/src/design/audit/patterns/mine.ts new file mode 100644 index 0000000..2bbe967 --- /dev/null +++ b/src/design/audit/patterns/mine.ts @@ -0,0 +1,49 @@ +/** + * Layer 5 — Pattern mining (scaffold). + * + * In production this runs as a Cloudflare Worker cron job on accumulated + * PatchApplication telemetry. 
The mining threshold (N≥30, ≥5 tenants, + * replicationRate≥0.7) prevents false patterns from premature data. + * + * Until fleet data accumulates this module is a scaffold. Run: + * pnpm patterns:mine --dir ~/.bad + * + * TODO: implement clustering algorithm once sufficient attribution data exists. + */ + +import type { PatchApplication } from '../attribution/types.js' +import type { Pattern } from './types.js' +import { savePattern } from './store.js' + +export interface MineOptions { + minApplications?: number + minTenants?: number + minReplicationRate?: number + dir?: string +} + +const DEFAULTS: Required> = { + minApplications: 30, + minTenants: 5, + minReplicationRate: 0.7, +} + +/** + * Mine patterns from accumulated PatchApplication records. + * + * Currently a stub — returns 0 mined until clustering is implemented. + * The interface is stable; consumers can call it safely in tests via synthetic + * data without triggering real fleet operations. + */ +export async function minePatterns( + applications: PatchApplication[], + opts: MineOptions = {}, +): Promise<{ mined: number; skipped: number }> { + void applications + void opts + void DEFAULTS + void savePattern + // TODO: implement structural clustering by (scope, target.cssSelector pattern, + // diff similarity) once N≥30 fleet data is available. See RFC §Layer 5. + return { mined: 0, skipped: applications.length } +} diff --git a/src/design/audit/patterns/store.ts b/src/design/audit/patterns/store.ts new file mode 100644 index 0000000..3185568 --- /dev/null +++ b/src/design/audit/patterns/store.ts @@ -0,0 +1,51 @@ +/** + * Layer 5 — Pattern store. + * + * Reads/writes patterns from a JSONL file. In production this is backed by a + * Cloudflare D1 or R2 store; the JSONL backend is for local dev and tests. + * + * Cold-start: the pattern library is empty until fleet data accumulates. + * The store returns [] for all queries until patterns are mined (Layer 5 mine.ts). 
+ */ + +import * as fs from 'node:fs' +import * as fsp from 'node:fs/promises' +import * as path from 'node:path' +import * as os from 'node:os' +import type { Pattern, PatternQuery } from './types.js' + +const DEFAULT_DIR = path.join(os.homedir(), '.bad', 'patterns') +const PATTERNS_FILE = 'patterns.jsonl' + +export async function loadPatterns(dir: string = DEFAULT_DIR): Promise { + const filePath = path.join(dir, PATTERNS_FILE) + if (!fs.existsSync(filePath)) return [] + const lines = fs.readFileSync(filePath, 'utf-8').split('\n').filter(Boolean) + return lines.flatMap(line => { + try { return [JSON.parse(line) as Pattern] } + catch { return [] } + }) +} + +export async function savePattern(pattern: Pattern, dir: string = DEFAULT_DIR): Promise { + await fsp.mkdir(dir, { recursive: true }) + await fsp.appendFile(path.join(dir, PATTERNS_FILE), JSON.stringify(pattern) + '\n', 'utf-8') +} + +export async function queryPatterns( + query: PatternQuery, + dir: string = DEFAULT_DIR, +): Promise { + const all = await loadPatterns(dir) + return all.filter(p => { + if (query.category && p.category !== query.category) return false + if (query.pageType && p.classification.type !== query.pageType) return false + if (query.minApplications && p.fleetEvidence.applications < query.minApplications) return false + if (query.minSuccessRate && p.fleetEvidence.successRate < query.minSuccessRate) return false + if (query.weakDimension) { + const delta = p.fleetEvidence.medianDimDelta[query.weakDimension] ?? 0 + if (delta <= 0) return false + } + return true + }) +} diff --git a/src/design/audit/patterns/types.ts b/src/design/audit/patterns/types.ts new file mode 100644 index 0000000..7053389 --- /dev/null +++ b/src/design/audit/patterns/types.ts @@ -0,0 +1,53 @@ +/** + * Layer 5 — Pattern library type contract. + * + * Patterns are mined from accumulated PatchApplication data once a cluster + * meets: N≥30 applications across ≥5 distinct tenants, replicationRate≥0.7. 
 * Until fleet data accumulates (≥6 weeks), the pattern library is empty.
 *
 * This module defines the stable query API so agents can code against it now.
 * The mining and matching implementations are scaffolded; real clustering runs
 * as a Cloudflare Worker cron once the attribution data accumulates.
 */

export type { PageType, Dimension } from '../v2/types.js'
import type { PageType, Dimension } from '../v2/types.js'

// Reusable scaffold an agent applies when a pattern matches.
export interface PatternScaffold {
  description: string
  referenceTsx?: string   // optional reference TSX implementation
  referenceCss?: string   // optional reference CSS
  keyDecisions: string[]  // the judgement calls the agent must make when applying
}

// Cross-tenant evidence backing a mined pattern.
export interface PatternFleetEvidence {
  applications: number    // total recorded applications of this pattern
  successRate: number     // fraction of applications judged successful
  medianDimDelta: Record<Dimension, number>  // median observed score delta per dimension
  sampleTenants: number   // distinct tenants contributing evidence
}

// One mined, citable design pattern.
export interface Pattern {
  patternId: string
  category: string
  classification: { type: PageType; tags: string[] }
  scaffold: PatternScaffold
  scores: { whenFollowed: Record<Dimension, number> }
  fleetEvidence: PatternFleetEvidence
  fixtures: string[]
}

// Filters accepted by the store's queryPatterns().
export interface PatternQuery {
  category?: string
  pageType?: PageType
  weakDimension?: Dimension
  minApplications?: number
  minSuccessRate?: number
}

// One ranked result from matchPatterns().
export interface PatternMatch {
  pattern: Pattern
  matchConfidence: number  // 0..1 heuristic derived from leverage
  expectedDelta: Record<Dimension, number>
  applicationGuidance: string
}
diff --git a/src/design/audit/pipeline.ts b/src/design/audit/pipeline.ts
index a44574d..248faea 100644
--- a/src/design/audit/pipeline.ts
+++ b/src/design/audit/pipeline.ts
@@ -18,6 +18,19 @@ import { gatherMeasurements } from './measure/index.js'
 import { evaluatePage, type AuditPassId, type AuditOverrides } from './evaluate.js'
 import type { PageAuditResult, PageClassification } from './types.js'
 import { getTelemetry, shortHash } from '../../telemetry/index.js'
+import { loadEthicsRules } from './ethics/loader.js'
+import { checkEthics, pageTextBlob } from './ethics/check.js'
+import { classifyEnsemble } from './classify-ensemble.js'
+import { loadAnchors } from './rubric/anchor-loader.js'
+import { buildAuditResultV2 } from
'./v2/build-result.js' +import type { + AudienceTag, + ModalityTag, + RegulatoryContextTag, + AudienceVulnerabilityTag, + EthicsViolation, + EnsembleClassification, +} from './v2/types.js' export interface AuditOnePageOptions { brain: Brain @@ -43,6 +56,17 @@ export interface AuditOnePageOptions { * candidate prompts; production runs leave them undefined. */ overrides?: AuditOverrides + /** + * Layer 7 — bypass the ethics gate entirely. Audited + warned. Test-only. + */ + skipEthics?: boolean + /** Override directory containing ethics `*.yaml` rule files. */ + ethicsRulesDir?: string + /** Layer 6 hints used by ethics + composable predicates. */ + audience?: AudienceTag[] + modality?: ModalityTag[] + regulatoryContext?: RegulatoryContextTag[] + audienceVulnerability?: AudienceVulnerabilityTag[] } const COOKIE_BANNER_SELECTORS = [ @@ -68,7 +92,27 @@ async function dismissCookieBanners(page: Page): Promise { * Audit one page through the full Gen 2 pipeline. */ export async function auditOnePage(opts: AuditOnePageOptions): Promise { - const { brain, driver, page, url, profileOverride, screenshotDir, userRubricsDir, auditPasses, runId, parentRunId, provider, model, overrides } = opts + const { + brain, + driver, + page, + url, + profileOverride, + screenshotDir, + userRubricsDir, + auditPasses, + runId, + parentRunId, + provider, + model, + overrides, + skipEthics, + ethicsRulesDir, + audience, + modality, + regulatoryContext, + audienceVulnerability, + } = opts const startedAt = Date.now() try { @@ -90,7 +134,9 @@ export async function auditOnePage(opts: AuditOnePageOptions): Promise 0) { + const minCap = Math.min(...ethicsViolations.map((v) => v.rollupCap)) + if (typeof result.score === 'number' && result.score > minCap) { + result.preEthicsScore = result.score + result.score = minCap + } + } + result.ethicsViolations = ethicsViolations + } + + // ── 8. 
Layer 1 v2 — multi-dim scoring + rollup, emitted alongside v1 ── + if (ensemble) { + try { + const anchors = loadAnchors() + const anchor = anchors.get(ensemble.type) + const v2 = await buildAuditResultV2({ + brain, + state, + pageRef: url, + ensemble, + rubric, + measurements, + v1Result: result, + anchor, + runId, + }) + result.auditResultV2 = v2 + result.ensembleClassification = ensemble + } catch (v2Err) { + // Don't let v2 failures break v1. Log + move on. + console.warn(`[audit/v2] failed to build v2 result for ${url}: ${(v2Err as Error).message}`) + } + } + if (runId) { const findings = result.findings ?? [] + const ethicsViolations: EthicsViolation[] = result.ethicsViolations ?? [] getTelemetry().emit({ kind: 'design-audit-page', runId, @@ -170,6 +271,9 @@ export async function auditOnePage(opts: AuditOnePageOptions): Promise v.severity === 'critical-floor').length, + ethicsMajorFloor: ethicsViolations.filter((v) => v.severity === 'major-floor').length, }, tags: { pageType: classification.type, @@ -202,6 +306,7 @@ export async function auditOnePage(opts: AuditOnePageOptions): Promise.yaml`. Each + * anchor encodes score-band criteria + reference fixtures so the LLM scores + * an saas-app like Linear's app, not like Linear's marketing site. 
+ * + * Schema: + * type: + * score_9_10: { criteria: string[], fixtures: string[] } + * score_7_8: { criteria: string[], fixtures: string[] } + * score_5_6: { criteria: string[], fixtures: string[] } + * score_3_4: { criteria: string[], fixtures: string[] } + */ + +import * as fs from 'node:fs' +import * as path from 'node:path' +import { fileURLToPath } from 'node:url' +import type { PageType } from '../types.js' + +const __dirname = path.dirname(fileURLToPath(import.meta.url)) +const ANCHORS_DIR = path.join(__dirname, 'anchors') + +export interface AnchorBand { + criteria: string[] + fixtures: string[] +} + +export interface CalibrationAnchor { + type: PageType + score_9_10: AnchorBand + score_7_8: AnchorBand + score_5_6: AnchorBand + score_3_4: AnchorBand +} + +const REQUIRED_BANDS = ['score_9_10', 'score_7_8', 'score_5_6', 'score_3_4'] as const + +/** + * Parse one anchor YAML. Uses a minimal YAML reader that handles the shape: + * type: saas-app + * score_9_10: + * criteria: + * - line one + * - line two + * fixtures: + * - fixture:linear-app + * + * Avoids pulling in a YAML dep for ~9 small files. Throws on malformed input. 
+ */ +export function parseAnchorFile(filePath: string): CalibrationAnchor { + const raw = fs.readFileSync(filePath, 'utf-8') + const parsed = parseAnchorYaml(raw) + + if (!parsed.type || typeof parsed.type !== 'string') { + throw new Error(`anchor ${filePath} missing 'type' field`) + } + + for (const band of REQUIRED_BANDS) { + const node = parsed[band] + if (!node || typeof node !== 'object') { + throw new Error(`anchor ${filePath} missing '${band}' band`) + } + const b = node as { criteria?: unknown; fixtures?: unknown } + if (!Array.isArray(b.criteria) || b.criteria.length === 0) { + throw new Error(`anchor ${filePath} '${band}.criteria' must be a non-empty array`) + } + if (!Array.isArray(b.fixtures) || b.fixtures.length === 0) { + throw new Error(`anchor ${filePath} '${band}.fixtures' must be a non-empty array`) + } + } + + return parsed as unknown as CalibrationAnchor +} + +/** Load all anchors from `anchors/` into a map keyed by PageType. */ +export function loadAnchors(dir: string = ANCHORS_DIR): Map { + const out = new Map() + if (!fs.existsSync(dir)) return out + for (const file of fs.readdirSync(dir)) { + if (!file.endsWith('.yaml') && !file.endsWith('.yml')) continue + const anchor = parseAnchorFile(path.join(dir, file)) + out.set(anchor.type, anchor) + } + return out +} + +/** Render an anchor as a markdown block for prompt injection. */ +export function renderAnchor(anchor: CalibrationAnchor): string { + const band = (label: string, b: AnchorBand): string => + `${label}\n${b.criteria.map((c) => `- ${c}`).join('\n')}\nReferences: ${b.fixtures.join(', ')}` + return [ + `Calibration anchor for ${anchor.type}:`, + band('Score 9-10:', anchor.score_9_10), + band('Score 7-8:', anchor.score_7_8), + band('Score 5-6:', anchor.score_5_6), + band('Score 3-4:', anchor.score_3_4), + ].join('\n\n') +} + +/** + * Minimal YAML parser scoped to the anchor file shape. 
 * Supports:
 *   key: scalar
 *   key:
 *     subkey: scalar
 *     subkey:
 *       - list item
 *
 * Indentation is normalized to spaces; tabs are not supported.
 */
function parseAnchorYaml(text: string): Record<string, unknown> {
  // Normalize Windows line endings; structure is tracked by space indentation.
  const lines = text.split('\n').map((l) => l.replace(/\r$/, ''))
  const root: Record<string, unknown> = {}
  let i = 0

  while (i < lines.length) {
    const line = lines[i]
    // Skip blank and comment lines.
    if (!line.trim() || line.trim().startsWith('#')) {
      i++
      continue
    }
    const indent = leadingSpaces(line)
    // Only column-0 keys belong to the root map; nested lines are consumed
    // by readBlock() below, so anything still indented here is a straggler.
    if (indent !== 0) {
      i++
      continue
    }
    const m = line.match(/^([a-zA-Z_][\w-]*):\s*(.*)$/)
    if (!m) {
      i++
      continue
    }
    const [, key, valueRaw] = m
    const value = valueRaw.trim()
    if (value === '') {
      // `key:` with no inline value opens a nested block indented 2 spaces.
      const { node, nextIndex } = readBlock(lines, i + 1, 2)
      root[key] = node
      i = nextIndex
    } else {
      root[key] = parseScalar(value)
      i++
    }
  }

  return root
}

/**
 * Read one nested block (map or string list) starting at `startIndex`, whose
 * entries sit at `baseIndent`. Returns the parsed node plus the index of the
 * first line after the block, so the caller can resume scanning there.
 */
function readBlock(
  lines: string[],
  startIndex: number,
  baseIndent: number,
): { node: Record<string, unknown> | string[]; nextIndex: number } {
  // Detect: is this a list ("- item") or a map?
  let i = startIndex
  while (i < lines.length && !lines[i].trim()) i++
  if (i >= lines.length) return { node: {}, nextIndex: i }

  const firstIndent = leadingSpaces(lines[i])
  // Dedent before any content means the block is empty.
  if (firstIndent < baseIndent) return { node: {}, nextIndex: i }

  if (lines[i].trim().startsWith('- ') || lines[i].trim() === '-') {
    const items: string[] = []
    while (i < lines.length) {
      const line = lines[i]
      if (!line.trim()) {
        i++
        continue
      }
      const indent = leadingSpaces(line)
      if (indent < baseIndent) break
      const trimmed = line.trim()
      if (!trimmed.startsWith('-')) break
      const item = trimmed.replace(/^-\s*/, '')
      // NOTE(review): parseScalar can return number/boolean for numeric-
      // looking items; the cast assumes anchor lists hold strings — confirm.
      items.push(parseScalar(item) as string)
      i++
    }
    return { node: items, nextIndex: i }
  }

  const map: Record<string, unknown> = {}
  while (i < lines.length) {
    const line = lines[i]
    if (!line.trim() || line.trim().startsWith('#')) {
      i++
      continue
    }
    const indent = leadingSpaces(line)
    if (indent < baseIndent) break
    // Deeper lines belong to a nested block already consumed by the
    // recursive readBlock() call below; skip any stragglers.
    if (indent > baseIndent) {
      i++
      continue
    }
    const m = line.match(/^\s*([a-zA-Z_][\w-]*):\s*(.*)$/)
    if (!m) {
      i++
      continue
    }
    const [, key, valueRaw] = m
    const value = valueRaw.trim()
    if (value === '') {
      const { node, nextIndex } = readBlock(lines, i + 1, baseIndent + 2)
      map[key] = node
      i = nextIndex
    } else {
      map[key] = parseScalar(value)
      i++
    }
  }
  return { node: map, nextIndex: i }
}

/** Count leading space characters (tabs are not treated as indentation). */
function leadingSpaces(line: string): number {
  let n = 0
  while (n < line.length && line[n] === ' ') n++
  return n
}

/** Convert a scalar token: strip quotes, then recognize bool/null/number. */
function parseScalar(raw: string): unknown {
  let value = raw.trim()
  if (value === '') return ''
  if ((value.startsWith('"') && value.endsWith('"')) || (value.startsWith("'") && value.endsWith("'"))) {
    value = value.slice(1, -1)
  }
  if (value === 'true') return true
  if (value === 'false') return false
  if (value === 'null' || value === '~') return null
  if (/^-?\d+$/.test(value)) return Number(value)
  if (/^-?\d+\.\d+$/.test(value)) return Number(value)
  return value
}
diff --git 
a/src/design/audit/rubric/anchors/blog.yaml b/src/design/audit/rubric/anchors/blog.yaml new file mode 100644 index 0000000..78c8d45 --- /dev/null +++ b/src/design/audit/rubric/anchors/blog.yaml @@ -0,0 +1,33 @@ +type: blog +score_9_10: + criteria: + - Reading column tuned to 60-75 character measure with deliberate vertical rhythm + - Typography scale supports headings, body, callouts, code, captions distinctly + - Author identity and publication date prominent without being chrome + - Inline media (images, code, embeds) integrate cleanly with text flow + - Navigation between posts (next, related, archive) supports the reading habit + fixtures: + - fixture:stratechery + - fixture:substack-post + - fixture:notion-blog-template +score_7_8: + criteria: + - Strong reading experience but typographic scale is one step shy of intentional + - Author and date present but visually inconsistent + - Inline media works but breaks rhythm at boundaries + fixtures: + - fixture:medium-default +score_5_6: + criteria: + - Default theme typography (no custom scale, line-height defaults) + - Heavy chrome (sidebar, social rail) competing with the article + - Author/date hidden in footer + fixtures: + - fixture:default-ghost +score_3_4: + criteria: + - Article copy crammed against gutters; no measure control + - Inline ads break reading flow every paragraph + - No publication date — content is unanchored in time + fixtures: + - fixture:ad-heavy-article diff --git a/src/design/audit/rubric/anchors/dashboard.yaml b/src/design/audit/rubric/anchors/dashboard.yaml new file mode 100644 index 0000000..7de766d --- /dev/null +++ b/src/design/audit/rubric/anchors/dashboard.yaml @@ -0,0 +1,35 @@ +type: dashboard +score_9_10: + criteria: + - Real metrics with units, time windows, and comparison baselines visible at first glance + - Charts use deliberate scales, axis labels, and selective color (not rainbow defaults) + - Filters, time ranges, and segment selectors are obvious and persistent + - Empty 
states preview real data shape (sample rows, last-7-days skeleton) + - Density tuned to operator workflow — no decorative whitespace where dense data belongs + fixtures: + - fixture:linear-app + - fixture:figma-file-ui + - fixture:datadog-dashboard + - fixture:vercel-dashboard +score_7_8: + criteria: + - Metrics present and useful but lack comparison baselines or time-window controls + - Charts readable but use default color palette; minor density issues + - Filters present but discoverability unclear + fixtures: + - fixture:grafana-default +score_5_6: + criteria: + - Generic stat cards (4 equal boxes) without context or comparison + - Charts are rainbow defaults from the component library + - No filters or time-range affordance + - Empty states are illustrations + platitudes + fixtures: + - fixture:generic-dashboard +score_3_4: + criteria: + - Numbers without units, labels, or context — operator cannot tell good from bad + - Decorative chart visualizations (donuts, gauges) instead of operational data + - Equal-weight UI controls; no clear workflow path + fixtures: + - fixture:empty-state-noise diff --git a/src/design/audit/rubric/anchors/docs.yaml b/src/design/audit/rubric/anchors/docs.yaml new file mode 100644 index 0000000..db0088b --- /dev/null +++ b/src/design/audit/rubric/anchors/docs.yaml @@ -0,0 +1,35 @@ +type: docs +score_9_10: + criteria: + - Quickstart path from landing to a working result is obvious within one viewport + - Code samples are runnable, copy-button-equipped, and language-tabbed where relevant + - Reference structure (sidebar, breadcrumbs, search) supports both linear and lookup reading + - Versioning, deprecation, and last-updated signals are visible + - Typography scale and line-length tuned for sustained reading + fixtures: + - fixture:stripe-docs + - fixture:tailwind-docs + - fixture:mdn-docs + - fixture:vercel-docs +score_7_8: + criteria: + - Quickstart present but buried; copy is solid but examples are partial + - Sidebar IA works but 
search is weak or missing + - Versioning unclear + fixtures: + - fixture:generic-mkdocs +score_5_6: + criteria: + - Wall-of-text reference with no quickstart + - Code samples without copy buttons or language switching + - No search, no breadcrumbs, no last-updated metadata + - Default theme typography (inconsistent line-height, no scale) + fixtures: + - fixture:default-docusaurus +score_3_4: + criteria: + - Marketing page disguised as docs (heavy hero, no actual reference) + - No code samples or syntax highlighting + - IA broken — "Getting Started" buried under marketing copy + fixtures: + - fixture:marketing-as-docs diff --git a/src/design/audit/rubric/anchors/ecommerce.yaml b/src/design/audit/rubric/anchors/ecommerce.yaml new file mode 100644 index 0000000..75d2e40 --- /dev/null +++ b/src/design/audit/rubric/anchors/ecommerce.yaml @@ -0,0 +1,33 @@ +type: ecommerce +score_9_10: + criteria: + - Product photography is real, detailed, and honest (not stock or AI-rendered) + - Price, fees, taxes, shipping, and total are surfaced before the commit button + - Stock, delivery, and return-policy signals are visible at decision points + - Cart and checkout flows preserve context — user always knows what they are buying + - Trust signals (verified payment, secure checkout, real merchant identity) appear where commitment occurs + fixtures: + - fixture:apple-store + - fixture:shopify-storefront + - fixture:allbirds-pdp +score_7_8: + criteria: + - Solid PDPs but checkout exposes fees only at the last step + - Stock and delivery info present but inconsistent across pages + - Photography mixes real and stock + fixtures: + - fixture:generic-shopify +score_5_6: + criteria: + - Fees surfaced only after pressing Pay; no shipping calculator + - Stock/delivery signals missing on PDPs + - Generic merchant identity, no trust badges, no return policy summary + fixtures: + - fixture:hidden-fees-checkout +score_3_4: + criteria: + - Total never shown until after commitment + - No merchant 
identity, no return policy, no trust signals + - Forced account creation before checkout, dark-pattern upsells + fixtures: + - fixture:dark-pattern-checkout diff --git a/src/design/audit/rubric/anchors/marketing.yaml b/src/design/audit/rubric/anchors/marketing.yaml new file mode 100644 index 0000000..a2f3dce --- /dev/null +++ b/src/design/audit/rubric/anchors/marketing.yaml @@ -0,0 +1,34 @@ +type: marketing +score_9_10: + criteria: + - Hero answers product, audience, and outcome within five seconds + - One dominant CTA, secondary actions clearly subordinate + - Concrete proof (real customer logos, real metrics, real screenshots) above the fold + - Visual craft (typography ramp, spacing rhythm, color system) is intentional, not template-default + - Differentiation is shown, not asserted; copy avoids vague hype + fixtures: + - fixture:stripe-marketing + - fixture:linear-marketing + - fixture:vercel-marketing + - fixture:apple-marketing +score_7_8: + criteria: + - Hero clear, but proof is generic (lorem-style logo cloud) or differentiation thin + - Visual system coherent but unremarkable; one or two minor rhythm breaks + - CTA dominant on hero but competes elsewhere + fixtures: + - fixture:generic-saas-marketing +score_5_6: + criteria: + - Hero copy explains the product instead of selling the outcome + - Visual hierarchy works but feels like a Tailwind starter + - Equal-weight CTAs, vague social proof, stock illustrations + fixtures: + - fixture:meta-copy-feature +score_3_4: + criteria: + - No primary message; could swap nouns and apply to any startup + - No CTA hierarchy; the page is a wall of equal sections + - Stock photography, generic gradients, default UI kit components + fixtures: + - fixture:ambiguous-deploy diff --git a/src/design/audit/rubric/anchors/saas-app.yaml b/src/design/audit/rubric/anchors/saas-app.yaml new file mode 100644 index 0000000..e2a6689 --- /dev/null +++ b/src/design/audit/rubric/anchors/saas-app.yaml @@ -0,0 +1,38 @@ +type: saas-app 
+score_9_10: + criteria: + - Domain object visible above the fold (tasks, deployments, conversations, files) + - One visually-dominant primary action per page state + - Empty states preview real product (sample rows, setup checklists, status timelines), not generic illustrations + - Action hierarchy = product hierarchy; no decorative buttons competing with workflow + - Trust details visible where commitment exists (price, permissions, undo, audit trail) + fixtures: + - fixture:linear-app + - fixture:figma-file-ui + - fixture:notion-editor + - fixture:superhuman + - fixture:github-pr-view +score_7_8: + criteria: + - Most criteria from 9-10 with one or two minor gaps + - Polish gaps that don't block job completion + - Domain objects present but action hierarchy slightly diffuse + fixtures: + - fixture:airtable-grid + - fixture:notion-database +score_5_6: + criteria: + - Functional but generic component-library assembly + - No domain object above the fold OR action hierarchy unclear + - Empty states show illustrations + platitudes instead of product preview + - Multiple equal-weight CTAs without a dominant primary + fixtures: + - fixture:generic-dashboard +score_3_4: + criteria: + - No primary job inferable from screen + - Equal-weight CTAs blocking workflow + - Decorative elements actively distract from product surface + - Page reads as a marketing/setup stub rather than an operational product + fixtures: + - fixture:no-primary-action diff --git a/src/design/audit/rubric/anchors/social.yaml b/src/design/audit/rubric/anchors/social.yaml new file mode 100644 index 0000000..591bb9b --- /dev/null +++ b/src/design/audit/rubric/anchors/social.yaml @@ -0,0 +1,33 @@ +type: social +score_9_10: + criteria: + - Feed prioritizes real content (posts, conversations) over chrome and ads + - Compose surface is one click away and primary in the layout + - Identity signals (verified accounts, profile preview, follower counts) consistent and lightweight + - State transitions (like, 
reply, repost) feel instant and reversible + - Empty states preview what the feed will look like with a few followed accounts + fixtures: + - fixture:threads-web + - fixture:bluesky-web + - fixture:substack-inline +score_7_8: + criteria: + - Solid feed and compose flow but reply chains are visually flat + - Identity signals inconsistent across surfaces + - Empty states use illustrations rather than previewing real content + fixtures: + - fixture:generic-microblog +score_5_6: + criteria: + - Feed cluttered with chrome (rails, ads, suggestions) competing with content + - Compose buried two clicks deep + - Action affordances (reply, like, share) are equal weight with no clear primary + fixtures: + - fixture:cluttered-feed +score_3_4: + criteria: + - Page reads as ad inventory with content squeezed in + - No clear primary feed; multiple surfaces compete + - Identity signals fake or absent (anonymous content with no provenance) + fixtures: + - fixture:ad-heavy-feed diff --git a/src/design/audit/rubric/anchors/tool.yaml b/src/design/audit/rubric/anchors/tool.yaml new file mode 100644 index 0000000..f280d01 --- /dev/null +++ b/src/design/audit/rubric/anchors/tool.yaml @@ -0,0 +1,33 @@ +type: tool +score_9_10: + criteria: + - Single-purpose surface — the input and output relationship is immediate and obvious + - Keyboard-first interaction (shortcuts, focus management, paste support) + - Output is copyable, exportable, and shareable without modal interruption + - Recent results, history, or undo always available + - State (input, processing, output, error) handled explicitly with clear transitions + fixtures: + - fixture:linear-command-palette + - fixture:github-pr-view + - fixture:raycast +score_7_8: + criteria: + - Tool works well but lacks keyboard affordances or history + - Output exportable but with extra clicks + - Error states present but generic + fixtures: + - fixture:generic-converter +score_5_6: + criteria: + - Form-and-submit pattern with reload-style output + - 
No keyboard shortcuts, no history, no undo + - Loading and error states use defaults + fixtures: + - fixture:basic-tool-form +score_3_4: + criteria: + - Multi-step flow for what should be a single action + - Output requires manual selection/copy + - No error handling — failures show generic browser errors + fixtures: + - fixture:broken-tool diff --git a/src/design/audit/rubric/anchors/utility.yaml b/src/design/audit/rubric/anchors/utility.yaml new file mode 100644 index 0000000..339fcfc --- /dev/null +++ b/src/design/audit/rubric/anchors/utility.yaml @@ -0,0 +1,33 @@ +type: utility +score_9_10: + criteria: + - Status, configuration, or admin surface that exposes the operational object directly (deploy, build, job, account) + - Real state (running, succeeded, failed, queued) with timestamps and durations + - Action affordances (retry, rollback, configure, audit) match the operational verbs of the system + - Logs, diagnostics, or detail panels are one click from the summary + - Empty states preview what real activity will look like + fixtures: + - fixture:vercel-deployment-status + - fixture:cloudflare-dashboard + - fixture:github-actions +score_7_8: + criteria: + - Status surface clear but action affordances are generic (Edit/Save instead of Retry/Rollback) + - Logs accessible but require navigation + - Empty states use illustrations rather than previewing activity + fixtures: + - fixture:generic-admin +score_5_6: + criteria: + - Status indicators are decorative pills without timestamps or durations + - Logs and diagnostics buried in modals + - Forms-of-forms pattern instead of operational verbs + fixtures: + - fixture:basic-settings-page +score_3_4: + criteria: + - Status is text only — no visual signal of failure or success + - No way to retry, rollback, or audit from the surface + - Settings sprawl with no IA, no search + fixtures: + - fixture:settings-sprawl diff --git a/src/design/audit/rubric/fragments/audience-clinician.md 
b/src/design/audit/rubric/fragments/audience-clinician.md new file mode 100644 index 0000000..bf4bcac --- /dev/null +++ b/src/design/audit/rubric/fragments/audience-clinician.md @@ -0,0 +1,42 @@ +--- +id: audience-clinician +title: Clinician Audience +weight: high +applies-when: + audience: [clinician] +--- + +This surface is used by clinical professionals (physicians, nurses, pharmacists, +therapists) in high-stakes decision-making contexts. Standard consumer-UX +heuristics are insufficient — apply the following additional lens. + +INFORMATION DENSITY +- Clinicians tolerate and often require high information density. Sparse + consumer-style layouts that hide detail behind progressive disclosure are + friction, not polish. +- Data tables, lab result grids, medication lists must be fully visible without + expand/collapse. If key data is folded, score `content_ia` lower. + +WORKFLOW EFFICIENCY +- Clinicians context-switch constantly (patient to patient, chart to EHR to + order entry). Keyboard navigation, dense primary actions, and minimal + confirmation dialogs for routine operations are expected. +- If standard consumer patterns (fat CTAs, step-by-step wizards) dominate + routine tasks, score `workflow` lower. + +CRITICAL VALUE FLAGGING +- Out-of-range lab values, drug interactions, and alert states must be + immediately visible with high visual contrast — not just color. Include + icon + text pattern redundancy. +- Missing or weak critical-value flagging is a major finding in `trust_clarity`. + +AUDIT TRAIL AND ATTRIBUTION +- Clinician workflows require visible "who did what, when" — last modified by, + order placed by, cosigned by. This is both regulatory and practical. +- If attributable actions lack visible provenance, that is a major finding in + `trust_clarity`. 
+ +DO NOT penalize for: +- Dense information layouts (this is intentional) +- Lack of illustrations or hero imagery +- Technical terminology appropriate to the audience diff --git a/src/design/audit/rubric/fragments/audience-developer.md b/src/design/audit/rubric/fragments/audience-developer.md new file mode 100644 index 0000000..75b7875 --- /dev/null +++ b/src/design/audit/rubric/fragments/audience-developer.md @@ -0,0 +1,40 @@ +--- +id: audience-developer +title: Developer Audience +weight: medium +applies-when: + audience: [developer] +--- + +This surface is used by software engineers and technical practitioners. + +INFORMATION OVER DECORATION +- Code samples, CLI commands, API endpoints, and technical specifications must + be immediately accessible — not gated behind tabs, scrolling, or "Request + Demo" flows. If core technical content requires navigation to find, score + `content_ia` lower. + +COPY-PASTE HYGIENE +- Every code block must have a visible copy button or be selectable without + capturing surrounding prose. Missing copy affordance is a minor-to-major + finding in `workflow` depending on frequency. + +DARK MODE AND TERMINAL AESTHETICS +- Developers default to dark environments. A light-only surface with no dark + mode is a `visual_craft` minor finding. A surface that actively breaks + (illegible code contrast) in dark mode is major. + +AUTHENTICATION PATHS +- API keys, tokens, and credentials should be displayed with + mask-by-default + reveal-on-click. Showing credentials in plaintext by + default is a critical `trust_clarity` finding. + +SEARCH AS PRIMARY NAVIGATION +- Technical docs and reference surfaces must have a prominent, keyboard- + accessible search. If Cmd/Ctrl-K does not open search, that is a major + finding in `workflow`. 
+ +DO NOT penalize for: +- Dense information layouts +- Monospace typography sections +- Minimal illustration or marketing copy diff --git a/src/design/audit/rubric/fragments/audience-kids.md b/src/design/audit/rubric/fragments/audience-kids.md new file mode 100644 index 0000000..84a09a0 --- /dev/null +++ b/src/design/audit/rubric/fragments/audience-kids.md @@ -0,0 +1,35 @@ +--- +id: audience-kids +title: Kids Audience +weight: critical +applies-when: + audience: [kids] +--- + +This surface is used by or targeted at minors. Apply the following additional +lens. Note: ethics rules in Layer 7 enforce hard score floors independently +of rubric scoring — both layers apply simultaneously. + +AGE-APPROPRIATE LANGUAGE AND IMAGERY +- Copy must be readable at the stated age level. Technical jargon, legalese, + or implicit social pressure ("Your friends are waiting!") are major findings + in `content_ia`. +- Imagery should be appropriate for the age group. Mature themes, even subtle + ones, are critical findings in `trust_clarity`. + +FRICTION-SYMMETRIC FLOWS +- Actions that benefit the operator (in-app purchase, data sharing, account + creation) must have equal or greater friction than their reversal + (cancellation, data deletion, account closure). +- Any flow where it is easier to spend money / share data than to undo is a + critical finding in `workflow`. + +PARENTAL CONTROLS VISIBILITY +- If the app collects data from minors, parental consent or control mechanisms + must be visible without buried navigation. If absent: major finding in + `trust_clarity`. 
+ +DO NOT penalize for: +- Simplified language and larger touch targets +- Reduced information density appropriate to age +- Bright color palettes and playful illustration styles diff --git a/src/design/audit/rubric/fragments/audience-vulnerability-minor-facing.md b/src/design/audit/rubric/fragments/audience-vulnerability-minor-facing.md new file mode 100644 index 0000000..c57ee0b --- /dev/null +++ b/src/design/audit/rubric/fragments/audience-vulnerability-minor-facing.md @@ -0,0 +1,37 @@ +--- +id: audience-vulnerability-minor-facing +title: Minor-Facing Audience Vulnerability +weight: critical +applies-when: + audienceVulnerability: [minor-facing] +--- + +This surface directly interacts with users who are minors or who are +unaccompanied minors in a supervised context (e.g. school software, children's +gaming, education platforms). The vulnerability is that the minor may not fully +understand consent, financial consequence, or data implications. Apply this +lens in addition to audience-kids and regulatory-coppa fragments. + +DARK PATTERN PROHIBITION — ENFORCED +Every dark pattern is a critical finding when directed at minors. Dark patterns +to look for: +- Confirmshaming ("No thanks, I don't want to save money") +- Fake urgency ("Only 2 left! Timer expires in 03:42") +- Hidden costs revealed at final checkout step +- Forced continuity (subscription auto-enrolled without explicit confirmation) +- Misdirection (styled "X" button that is actually an ad click) + +IRREVERSIBILITY DISCLOSURE +- Any action that is irreversible (purchase, deletion, sharing to others) must + be labeled explicitly. "Delete" without "This cannot be undone" is a major + finding in `trust_clarity`. + +SOCIAL COMPARISON AS PRESSURE +- Leaderboards, "Your friends have X" notifications, or streak-loss warnings + designed to create anxiety are major `trust_clarity` findings when the + audience is minors. 
+ +REPORTING AND BLOCKING CONTROLS +- If the surface allows social interaction (messaging, comments, reactions), + visible reporting and blocking controls are required. Absent: major finding + in `trust_clarity`. diff --git a/src/design/audit/rubric/fragments/first-principles.md b/src/design/audit/rubric/fragments/first-principles.md new file mode 100644 index 0000000..7570de6 --- /dev/null +++ b/src/design/audit/rubric/fragments/first-principles.md @@ -0,0 +1,65 @@ +--- +id: first-principles +title: First-Principles Fallback +weight: critical +applies-when: + universal: false +--- + +You haven't seen this pattern before. Do not fabricate a classification. +Audit against the universal product principles only. Score per-dimension as +usual, but set `rollup.confidence = "low"` and emit a top-level +`novel_pattern_signal` describing what you observed, so this surface can be +mined into a new fragment after enough fleet exposure. + +1. PRIMARY JOB CLARITY (5 sec test) + - Within 5 seconds, can a stranger name what this page is for? + - If no: severity major; finding category `product_intent`. + +2. PRIMARY ACTION OBVIOUSNESS + - Is there one visually-dominant action this page is built around? + - Are competing actions visually subordinate? + - If equal-weight: severity major; finding category `product_intent`. + +3. STATE PREVIEW + - Are empty/loading/error states designed, or browser-default / placeholder? + - Do empty states preview the real product, or show generic illustrations? + - If generic: severity major; finding category `product_intent`. + +4. TRUST BEFORE COMMITMENT + - Does the page ask the user to commit (money, identity, deploy, share)? + - If yes: are price, permissions, scope, undo path visible BEFORE the + commit button? + - If no: severity critical; finding category `trust_clarity`. + +5. RECOVERY FROM FAILURE + - Can the user undo their last action? + - Is there a clear path forward when something fails? 
+ - If no: severity major; finding category `workflow`. + +GUARDRAILS: +- Do not invent domain-specific findings ("this dashboard needs charts"). + You don't know the domain. Stick to the five principles. +- Do not anchor on marketing-page heuristics (hero copy, illustrations, + social proof). They don't apply. +- If a principle simply doesn't apply (e.g. there is no commitment on this + page), say so explicitly rather than scoring it generically. + +RESPOND WITH ONLY a JSON object of the form: +{ + "scores": { + "product_intent": { "score": <1-10>, "range": [<low>, <high>], "confidence": "low", "summary": "", "primaryFindings": [] }, + "visual_craft": { ... }, + "trust_clarity": { ... }, + "workflow": { ... }, + "content_ia": { ... } + }, + "rollup": { "score": <1-10>, "range": [<low>, <high>], "confidence": "low", "rule": "first-principles", "weights": { "product_intent": 0.30, "workflow": 0.25, "visual_craft": 0.20, "content_ia": 0.15, "trust_clarity": 0.10 } }, + "findings": [ ... ], + "novel_pattern_signal": { + "observedSignals": [ + { "label": "", "evidence": "", "confidence": <0..1> } + ] + }, + "first_principles_mode": true +} diff --git a/src/design/audit/rubric/fragments/modality-mobile.md b/src/design/audit/rubric/fragments/modality-mobile.md new file mode 100644 index 0000000..1deb727 --- /dev/null +++ b/src/design/audit/rubric/fragments/modality-mobile.md @@ -0,0 +1,39 @@ +--- +id: modality-mobile +title: Mobile Modality +weight: medium +applies-when: + modality: [mobile] +--- + +This surface is evaluated at a mobile viewport (≤480px wide). Apply the +following lens on top of page-type and domain fragments. + +TOUCH TARGET SIZING +- Interactive elements must meet minimum 44×44pt touch targets (WCAG 2.5.5 + AAA; Apple HIG minimum). Anything below 32pt is a major finding in + `workflow`. Count the number of undersized targets — if >3 on a single + screen, escalate to critical. 
+ +THUMB-ZONE REACHABILITY +- Primary actions must be reachable in the bottom 60% of a 375px screen + one-handed. A primary CTA pinned to the top of the viewport is a major + `workflow` finding. + +HORIZONTAL SCROLL AVOIDANCE +- Content must not require horizontal scroll on a 375px viewport. Tables + that overflow without a scroll affordance are major `workflow` findings. + +FONT LEGIBILITY +- Body text must be ≥16px (browser zoom notwithstanding). Text smaller than + 14px is a major `visual_craft` finding. Text below 12px is critical. + +FORM INPUT KEYBOARD +- Input fields must trigger the appropriate virtual keyboard type (numeric + for phone/postcode, email for email, tel for phone numbers). Wrong keyboard + type is a minor `workflow` finding per field. + +DO NOT penalize for: +- Navigation patterns specific to mobile (hamburger, bottom tab bar) +- Reduced visible surface area compared to desktop +- Single-column layouts diff --git a/src/design/audit/rubric/fragments/modality-tablet.md b/src/design/audit/rubric/fragments/modality-tablet.md new file mode 100644 index 0000000..f4dcb38 --- /dev/null +++ b/src/design/audit/rubric/fragments/modality-tablet.md @@ -0,0 +1,35 @@ +--- +id: modality-tablet +title: Tablet Modality +weight: low +applies-when: + modality: [tablet] +--- + +This surface is evaluated at a tablet viewport (481–1024px wide). Apply this +lens on top of page-type and domain fragments. + +LAYOUT ADAPTATION +- The layout must actually adapt between mobile and desktop — not simply + scale a mobile layout or stretch a desktop layout. A layout that is + identical to either breakpoint is a minor `visual_craft` finding. + +SPLIT-VIEW AND SIDEBAR OPPORTUNITIES +- Tablet viewports often benefit from master-detail or sidebar-content + patterns rather than single-column stacks. If the content hierarchy would + benefit from a persistent sidebar and none is present, that is a minor + `workflow` finding. 
+ +TOUCH AND POINTER HYBRID +- Tablet users may use touch or pointer. Touch targets must still meet the + 44pt minimum. Hover-only affordances without touch fallbacks are major + `workflow` findings. + +LANDSCAPE AND PORTRAIT PARITY +- Key interactions must work in both orientations. If a primary action is + unreachable in landscape (below fold with no scroll), that is a major + `workflow` finding. + +DO NOT penalize for: +- Adapting typography slightly smaller than mobile maximums +- Showing more information density than the mobile equivalent diff --git a/src/design/audit/rubric/fragments/regulatory-coppa.md b/src/design/audit/rubric/fragments/regulatory-coppa.md new file mode 100644 index 0000000..07abf94 --- /dev/null +++ b/src/design/audit/rubric/fragments/regulatory-coppa.md @@ -0,0 +1,32 @@ +--- +id: regulatory-coppa +title: COPPA Regulatory Context +weight: critical +applies-when: + regulatoryContext: [coppa] +--- + +This surface is subject to COPPA (Children's Online Privacy Protection Act). +Apply this lens when the audience includes or may include users under 13. The +ethics gate (Layer 7) independently enforces hard score floors for missing +age gates and dark patterns — both apply simultaneously. + +VERIFIABLE PARENTAL CONSENT +- If this surface collects personal data from users who may be under 13, + a verifiable parental consent mechanism must be visible and functional. + Absent: critical finding in `trust_clarity`. + +AGE GATE INTEGRITY +- Age gates must require date-of-birth entry, not a single yes/no question + ("Are you 13 or older?"). A single-question age gate is a major finding — + it is trivially bypassed. + +DATA COLLECTION DISCLOSURE +- A clear, plain-English summary of what data is collected and why must be + visible before any data collection begins. Buried in a privacy policy does + not satisfy this requirement. Absent: major finding in `content_ia`. 
+ +PROHIBITION ON BEHAVIORAL TARGETING +- No behavioral advertising or cross-site tracking may be enabled for users + under 13. If third-party tracking scripts are present without age-based + gating: critical finding in `trust_clarity`. diff --git a/src/design/audit/rubric/fragments/regulatory-gdpr.md b/src/design/audit/rubric/fragments/regulatory-gdpr.md new file mode 100644 index 0000000..d9fff07 --- /dev/null +++ b/src/design/audit/rubric/fragments/regulatory-gdpr.md @@ -0,0 +1,33 @@ +--- +id: regulatory-gdpr +title: GDPR Regulatory Context +weight: high +applies-when: + regulatoryContext: [gdpr] +--- + +This surface is subject to GDPR. Apply the following lens in addition to other +applicable fragments. Note: the ethics gate (Layer 7) independently enforces a +score floor for missing consent mechanisms — both apply. + +CONSENT MECHANISM QUALITY +- Cookie consent banners must offer granular controls (necessary / analytics / + marketing) with equal visual prominence. An "Accept all" button that is + larger or more prominent than "Manage preferences" is a major `trust_clarity` + finding. +- Pre-ticked checkboxes are a critical finding — they are unlawful under GDPR. + +DATA SUBJECT RIGHTS ACCESS +- Users must be able to find their data rights (access, deletion, portability, + correction) without more than 2 navigation steps from any page. If the + privacy page is not reachable from the footer, that is a major finding in + `content_ia`. + +LEGAL BASIS TRANSPARENCY +- If the page collects personal data, the legal basis (consent, legitimate + interest, contract) must be stated. Absent: minor finding in `trust_clarity`. + +DATA RETENTION +- If retention periods are disclosed (they should be), they must be + understandable to a non-lawyer. Legal boilerplate with no plain-English + summary is a minor finding in `content_ia`. 
diff --git a/src/design/audit/rubric/fragments/regulatory-hipaa.md b/src/design/audit/rubric/fragments/regulatory-hipaa.md new file mode 100644 index 0000000..d5ff918 --- /dev/null +++ b/src/design/audit/rubric/fragments/regulatory-hipaa.md @@ -0,0 +1,36 @@ +--- +id: regulatory-hipaa +title: HIPAA Regulatory Context +weight: high +applies-when: + regulatoryContext: [hipaa] +--- + +This surface handles Protected Health Information (PHI) and is subject to HIPAA +technical safeguards. Apply this lens in addition to domain-specific fragments. + +SESSION SECURITY VISIBILITY +- Automatic session timeout must be visible to the user (countdown or clear + logout trigger). Invisible timeout with hard logout is a major `workflow` + finding. +- If the surface shows PHI and has no visible session indicator, that is a + major `trust_clarity` finding. + +MINIMUM NECESSARY DATA +- Only the minimum necessary PHI should be visible on any given screen. + Dashboards that show full SSN, full DOB, or complete medication histories + when partial identifiers suffice are major `trust_clarity` findings. + +AUDIT LOG ACCESS +- If this surface allows modification of PHI, a visible "audit log" or + "activity history" link must be accessible to the user. Absent: minor + finding in `trust_clarity`. + +DATA EXPORT LABELING +- Export buttons (CSV, PDF, print) must label the output as PHI with a + handling reminder. Unlabeled PHI export is a minor finding. 
+ +DO NOT penalize for: +- Explicit data masking that adds cognitive load (masks protect PHI) +- Confirmation dialogs on irreversible PHI operations +- Conservative color coding that prioritizes legibility over aesthetics diff --git a/src/design/audit/rubric/loader.ts b/src/design/audit/rubric/loader.ts index ebadbaf..5694fd2 100644 --- a/src/design/audit/rubric/loader.ts +++ b/src/design/audit/rubric/loader.ts @@ -15,6 +15,25 @@ import type { ComposedRubric, AppliesWhen, } from '../types.js' +import type { + AudienceTag, + ModalityTag, + RegulatoryContextTag, + AudienceVulnerabilityTag, +} from '../v2/types.js' + +/** + * Operator-supplied context for Layer 6 composable predicate matching. + * When provided, fragments whose `applies-when.audience | modality | + * regulatoryContext | audienceVulnerability` overlap with these values are + * included in the composed rubric alongside the classification-matched set. + */ +export interface RubricContext { + audience?: AudienceTag[] + modality?: ModalityTag[] + regulatoryContext?: RegulatoryContextTag[] + audienceVulnerability?: AudienceVulnerabilityTag[] +} const __dirname = path.dirname(fileURLToPath(import.meta.url)) const BUILTIN_FRAGMENTS_DIR = path.join(__dirname, 'fragments') @@ -134,44 +153,66 @@ export function loadFragments(dir: string = BUILTIN_FRAGMENTS_DIR): RubricFragme } /** - * Predicate evaluator. Returns true if the fragment applies to the classification. + * Predicate evaluator. Returns true if the fragment applies to the classification + * or the optional Layer 6 context (audience / modality / regulatoryContext / + * audienceVulnerability hints). * * Universal fragments always apply. - * Type/domain/maturity/designSystem predicates are AND-combined: all listed - * fields must match. Within a field, the classification value must be in the - * fragment's allowed set. + * Predicate groups are OR-combined at the group level: a fragment fires if ANY + * one of its predicate groups matches. 
Within a group, list membership is used + * (the classification/context value must appear in the fragment's allowed set). + * + * Layer 6 predicates are additive: they can cause a fragment to fire even when + * no type/domain predicate matches, enabling composition across independent + * predicate dimensions. */ export function fragmentApplies( fragment: RubricFragment, classification: PageClassification, + ctx?: RubricContext, ): boolean { - const w = fragment.appliesWhen + const w = fragment.appliesWhen as AppliesWhen & { + audience?: string[] + modality?: string[] + regulatoryContext?: string[] + audienceVulnerability?: string[] + } if (w.universal) return true - if (w.type && w.type.length > 0) { - if (!w.type.includes(classification.type)) return false - } - if (w.domain && w.domain.length > 0) { - const domainMatch = w.domain.some(d => - classification.domain.toLowerCase().includes(d.toLowerCase()), - ) - if (!domainMatch) return false - } - if (w.maturity && w.maturity.length > 0) { - if (!w.maturity.includes(classification.maturity)) return false + // --- Layer 1 classification predicates (AND-combined when all set) --- + const classificationPredicateSet = + !!w.type?.length || !!w.domain?.length || !!w.maturity?.length || !!w.designSystem?.length + + if (classificationPredicateSet) { + if (w.type?.length && !w.type.includes(classification.type)) return false + if (w.domain?.length) { + const domainMatch = w.domain.some(d => + classification.domain.toLowerCase().includes(d.toLowerCase()), + ) + if (!domainMatch) return false + } + if (w.maturity?.length && !w.maturity.includes(classification.maturity)) return false + if (w.designSystem?.length && !w.designSystem.includes(classification.designSystem)) return false + return true } - if (w.designSystem && w.designSystem.length > 0) { - if (!w.designSystem.includes(classification.designSystem)) return false + + // --- Layer 6 context predicates (any overlap fires the fragment) --- + if (ctx) { + if 
(w.audience?.length && ctx.audience?.length) { + if (w.audience.some(a => ctx.audience!.includes(a as AudienceTag))) return true + } + if (w.modality?.length && ctx.modality?.length) { + if (w.modality.some(m => ctx.modality!.includes(m as ModalityTag))) return true + } + if (w.regulatoryContext?.length && ctx.regulatoryContext?.length) { + if (w.regulatoryContext.some(r => ctx.regulatoryContext!.includes(r as RegulatoryContextTag))) return true + } + if (w.audienceVulnerability?.length && ctx.audienceVulnerability?.length) { + if (w.audienceVulnerability.some(av => ctx.audienceVulnerability!.includes(av as AudienceVulnerabilityTag))) return true + } } - // If at least one predicate field was set and all matched, apply. - // If NO predicates were set and not universal, don't apply (be conservative). - const hasPredicate = - !!w.type?.length || - !!w.domain?.length || - !!w.maturity?.length || - !!w.designSystem?.length - return hasPredicate + return false } /** @@ -180,11 +221,13 @@ export function fragmentApplies( * @param classification - the page classification * @param fragments - all loaded fragments (defaults to builtin) * @param userFragmentsDir - optional path to user-supplied fragments + * @param ctx - optional Layer 6 context for audience/modality/regulatory predicates */ export function composeRubric( classification: PageClassification, fragments?: RubricFragment[], userFragmentsDir?: string, + ctx?: RubricContext, ): ComposedRubric { const all = [ ...(fragments ?? 
loadFragments(BUILTIN_FRAGMENTS_DIR)), @@ -192,7 +235,7 @@ ] const matched = all - .filter(f => fragmentApplies(f, classification)) + .filter(f => fragmentApplies(f, classification, ctx)) .sort((a, b) => WEIGHT_ORDER[a.weight] - WEIGHT_ORDER[b.weight]) const body = matched diff --git a/src/design/audit/rubric/rollup-weights.ts b/src/design/audit/rubric/rollup-weights.ts new file mode 100644 index 0000000..1cb624b --- /dev/null +++ b/src/design/audit/rubric/rollup-weights.ts @@ -0,0 +1,60 @@ +/** + * Rollup weights — Layer 1 of the world-class design-audit architecture. + * + * Per-page-type weights for combining the 5 dimension scores into a single + * rollup. Marketing surfaces emphasize visual + content; saas-app surfaces + * emphasize product_intent + workflow; docs lean on content_ia. The weights + * are evolvable via the GEPA target `pareto-rollup-weights`. + * + * Invariant: every weight set sums to 1.0 within 1e-6. + */ + +import type { Dimension } from '../v2/types.js' +import type { PageType } from '../types.js' + +export type RollupWeightKey = PageType | 'default' + +const ROLLUP_WEIGHTS_RAW: Record<RollupWeightKey, Record<Dimension, number>> = { + marketing: { product_intent: 0.30, visual_craft: 0.30, content_ia: 0.25, trust_clarity: 0.10, workflow: 0.05 }, + 'saas-app': { product_intent: 0.35, workflow: 0.30, visual_craft: 0.15, trust_clarity: 0.10, content_ia: 0.10 }, + dashboard: { product_intent: 0.30, workflow: 0.30, content_ia: 0.20, visual_craft: 0.15, trust_clarity: 0.05 }, + docs: { content_ia: 0.45, workflow: 0.25, product_intent: 0.15, visual_craft: 0.15, trust_clarity: 0.0 }, + ecommerce: { trust_clarity: 0.35, product_intent: 0.30, workflow: 0.20, visual_craft: 0.10, content_ia: 0.05 }, + social: { product_intent: 0.30, workflow: 0.30, content_ia: 0.20, visual_craft: 0.15, trust_clarity: 0.05 }, + tool: { workflow: 0.40, product_intent: 0.30, content_ia: 0.15, visual_craft: 0.10, trust_clarity: 0.05 }, + blog: { content_ia: 0.50, visual_craft: 0.25, 
product_intent: 0.15, workflow: 0.10, trust_clarity: 0.0 }, + utility: { workflow: 0.45, product_intent: 0.25, content_ia: 0.20, visual_craft: 0.10, trust_clarity: 0.0 }, + unknown: { product_intent: 0.30, workflow: 0.25, visual_craft: 0.20, content_ia: 0.15, trust_clarity: 0.10 }, + default: { product_intent: 0.30, workflow: 0.25, visual_craft: 0.20, content_ia: 0.15, trust_clarity: 0.10 }, +} + +const WEIGHT_SUM_TOLERANCE = 1e-6 + +// Validate at module load — fail fast if a weight set drifts. +for (const [type, weights] of Object.entries(ROLLUP_WEIGHTS_RAW)) { + const sum = Object.values(weights).reduce((acc, n) => acc + n, 0) + if (Math.abs(sum - 1) > WEIGHT_SUM_TOLERANCE) { + throw new Error(`rollup weights for ${type} sum to ${sum}, expected 1.0 ± ${WEIGHT_SUM_TOLERANCE}`) + } +} + +export const ROLLUP_WEIGHTS: Record> = ROLLUP_WEIGHTS_RAW + +/** + * Look up rollup weights for a page type, falling back to `default` when the + * type isn't in the table (forward-compat for new types). + */ +export function rollupWeightsFor(type: PageType | undefined): Record { + if (type && type in ROLLUP_WEIGHTS) return ROLLUP_WEIGHTS[type as RollupWeightKey] + return ROLLUP_WEIGHTS.default +} + +/** + * Render a human-readable formula for the audit report, e.g. + * "saas-app: product_intent*0.35 + workflow*0.30 + visual_craft*0.15 + trust_clarity*0.10 + content_ia*0.10" + */ +export function rollupFormula(type: PageType | undefined, weights: Record): string { + const entries = Object.entries(weights).sort(([, a], [, b]) => b - a) + const body = entries.map(([dim, w]) => `${dim}*${w.toFixed(2)}`).join(' + ') + return `${type ?? 
'default'}: ${body}` +} diff --git a/src/design/audit/types.ts b/src/design/audit/types.ts index f2a8a9a..79e1963 100644 --- a/src/design/audit/types.ts +++ b/src/design/audit/types.ts @@ -6,9 +6,11 @@ */ import type { DesignFinding, DesignSystemScore } from '../../types.js' +import type { EthicsViolation } from './v2/types.js' // Re-export the canonical Finding/Score types so consumers only import from here export type { DesignFinding, DesignSystemScore } from '../../types.js' +export type { EthicsViolation } from './v2/types.js' // ── Classification ───────────────────────────────────────────────────────── @@ -208,5 +210,31 @@ export interface PageAuditResult { designSystemScore?: DesignSystemScore screenshotPath?: string tokensUsed?: number + /** + * Layer 7 — domain ethics violations. When non-empty, `score` is capped by + * the lowest `rollupCap` across violations until the underlying issue is + * remediated. Empty when --skip-ethics is set or when no rule fires. + */ + ethicsViolations?: EthicsViolation[] + /** + * The pre-cap score (Layer 7). Set when `ethicsViolations` is non-empty so + * tooling can show "would have scored X, capped at Y" without losing + * the LLM's original assessment. + */ + preEthicsScore?: number + /** + * Layer 1 — v2 multi-dim audit result. Emitted alongside the v1 fields for + * one release as a backwards-compat bridge. Consumers should migrate to + * `auditResultV2` and treat the v1 surface as deprecated. + * + * Typed as `unknown` here to avoid pulling v2/types.ts into v1 consumers. + * The concrete shape is `import('./v2/types.js').AuditResult_v2`. + */ + auditResultV2?: unknown + /** + * Layer 1 — ensemble classification (URL + DOM + LLM). When set, the + * pipeline used `--audit-passes auto` (the new default). 
+   */
+  ensembleClassification?: unknown
   error?: string
 }
diff --git a/src/design/audit/v2/build-result.ts b/src/design/audit/v2/build-result.ts
new file mode 100644
index 0000000..74176d8
--- /dev/null
+++ b/src/design/audit/v2/build-result.ts
@@ -0,0 +1,210 @@
+/**
+ * v2 AuditResult builder.
+ *
+ * Wraps the existing v1 PageAuditResult with multi-dim scoring + ensemble
+ * classification + rollup. Layer 1 emits BOTH schemas in `report.json` so
+ * downstream consumers can migrate at their own pace (one-release deprecation
+ * window per the RFC).
+ */
+
+import { randomUUID, createHash } from 'node:crypto'
+import type { Brain } from '../../../brain/index.js'
+import type { PageState } from '../../../types.js'
+import type {
+  PageAuditResult,
+  PageClassification,
+  ComposedRubric,
+  MeasurementBundle,
+} from '../types.js'
+import {
+  type AuditResult_v2,
+  type DesignFinding,
+  type Dimension,
+  type DimensionScore,
+  type EnsembleClassification,
+  type RollupScore,
+  DIMENSIONS,
+} from './types.js'
+import {
+  buildEvalPromptV2,
+  computeRollup,
+  parseAuditResponseV2,
+} from './score.js'
+import { renderAnchor, type CalibrationAnchor } from '../rubric/anchor-loader.js'
+
+export interface BuildV2ResultInput {
+  brain: Brain
+  state: PageState
+  pageRef: string
+  ensemble: EnsembleClassification
+  rubric: ComposedRubric
+  measurements: MeasurementBundle
+  v1Result: PageAuditResult
+  anchor?: CalibrationAnchor
+  /** Reuse the pipeline runId so envelopes correlate. */
+  runId?: string
+  /** Optional override (e.g. test fixtures). When set, skip the LLM call. */
+  precomputedScores?: Record<Dimension, DimensionScore>
+}
+
+/**
+ * Produce a complete `AuditResult_v2`. When `precomputedScores` is set we
+ * skip the v2 LLM call entirely (used by deterministic tests + the
+ * `--audit-passes auto` legacy fallback path).
+ */
+export async function buildAuditResultV2(input: BuildV2ResultInput): Promise<AuditResult_v2> {
+  const { brain, state, pageRef, ensemble, rubric, measurements, v1Result, anchor, runId } = input
+
+  const measurementSummary = renderMeasurementSummary(measurements)
+  const prompt = buildEvalPromptV2({
+    pageType: ensemble.type,
+    rubricBody: rubric.body,
+    anchor,
+    measurementSummary,
+    intent: ensemble.intent,
+  })
+
+  let scores: Record<Dimension, DimensionScore>
+  let llmTokens = 0
+  if (input.precomputedScores) {
+    scores = input.precomputedScores
+  } else {
+    try {
+      const llm = await brain.auditDesign(state, 'Multi-dimensional audit (v2)', [], prompt)
+      llmTokens = llm.tokensUsed ?? 0
+      const parsed = parseAuditResponseV2(llm.raw)
+      scores = parsed.scores
+    } catch {
+      // Fall back: synthesize per-dim scores from the v1 result. Conservative —
+      // every dim gets the v1 score, range +/- 1, confidence 'low'.
+      scores = synthesizeScoresFromV1(v1Result)
+    }
+  }
+
+  const rollup: RollupScore = computeRollup(scores, ensemble.type)
+  const findings = adaptFindings(v1Result.findings)
+  const topFixes = computeTopFixes(findings).slice(0, 5).map((f) => f.id)
+
+  const promptHash = sha1(prompt)
+  const rubricHash = sha1(rubric.body)
+  const totalTokens = (v1Result.tokensUsed ?? 0) + llmTokens
+
+  return {
+    schemaVersion: 2,
+    runId: runId ?? randomUUID(),
+    pageRef,
+    classification: ensemble,
+    scores,
+    rollup,
+    findings,
+    topFixes,
+    measurements,
+    ethicsViolations: [],
+    matchedPatterns: [],
+    modality: 'html',
+    evaluatedAt: new Date().toISOString(),
+    promptHash,
+    rubricHash,
+    tokensUsed: totalTokens > 0 ? totalTokens : undefined,
+    passes: ['v2-multidim'],
+    ...(v1Result.error ? { error: v1Result.error } : {}),
+  }
+}
+
+function renderMeasurementSummary(measurements: MeasurementBundle): string {
+  const aaFails = measurements.contrast.aaFailures.length
+  const a11y = measurements.a11y.violations.length
+  return [
+    `contrast AA failures: ${aaFails} of ${measurements.contrast.totalChecked} text elements`,
+    `axe violations: ${a11y}${a11y > 0 ? ` (top: ${measurements.a11y.violations.slice(0, 3).map((v) => `${v.id}/${v.impact}`).join(', ')})` : ''}`,
+  ].join('\n')
+}
+
+function adaptFindings(v1Findings: PageAuditResult['findings']): DesignFinding[] {
+  return v1Findings.map((f, idx) => {
+    const id = `finding-${idx + 1}-${sha1(`${f.category}|${f.description}`).slice(0, 8)}`
+    const dimension = mapCategoryToDimension(f.category)
+    const kind = inferKind(f)
+    return {
+      ...f,
+      id,
+      dimension,
+      kind,
+      // Layer 2 supplies real Patches; Layer 1 emits an empty array so the
+      // schema is satisfied without fabricating diffs.
+      patches: [],
+    }
+  })
+}
+
+function mapCategoryToDimension(category: string): Dimension {
+  switch (category) {
+    case 'visual-bug':
+    case 'spacing':
+    case 'typography':
+    case 'alignment':
+    case 'layout':
+      return 'visual_craft'
+    case 'contrast':
+    case 'accessibility':
+      return 'visual_craft'
+    case 'ux':
+    default:
+      return 'product_intent'
+  }
+}
+
+function inferKind(f: PageAuditResult['findings'][number]): DesignFinding['kind'] {
+  if (f.category === 'contrast' || f.category === 'accessibility') return 'measurement'
+  if (f.category === 'ux') return 'job'
+  return 'polish'
+}
+
+function computeTopFixes(findings: DesignFinding[]): DesignFinding[] {
+  return [...findings].sort((a, b) => {
+    const aScore = (a.impact ?? 0) * blastWeight(a.blast)
+    const bScore = (b.impact ?? 0) * blastWeight(b.blast)
+    return bScore - aScore
+  })
+}
+
+function blastWeight(blast: PageAuditResult['findings'][number]['blast']): number {
+  switch (blast) {
+    case 'system': return 4
+    case 'component': return 3
+    case 'section': return 2
+    default: return 1
+  }
+}
+
+function synthesizeScoresFromV1(v1: PageAuditResult): Record<Dimension, DimensionScore> {
+  const fallback = Math.max(1, Math.min(10, Math.round(v1.score)))
+  const out: Partial<Record<Dimension, DimensionScore>> = {}
+  for (const dim of DIMENSIONS) {
+    out[dim] = {
+      score: fallback,
+      range: [
+        Math.max(1, fallback - 1),
+        Math.min(10, fallback + 1),
+      ],
+      confidence: 'low',
+      summary: 'Synthesized from v1 score (v2 LLM call unavailable).',
+      primaryFindings: [],
+    }
+  }
+  return out as Record<Dimension, DimensionScore>
+}
+
+function sha1(s: string): string {
+  return createHash('sha1').update(s).digest('hex')
+}
+
+export const V2_INTERNALS = {
+  renderMeasurementSummary,
+  adaptFindings,
+  mapCategoryToDimension,
+  computeTopFixes,
+  synthesizeScoresFromV1,
+}
+
+export { renderAnchor }
diff --git a/src/design/audit/v2/score.ts b/src/design/audit/v2/score.ts
new file mode 100644
index 0000000..16fef74
--- /dev/null
+++ b/src/design/audit/v2/score.ts
@@ -0,0 +1,253 @@
+/**
+ * Layer 1 multi-dim scoring — prompt builder, parser, and rollup.
+ *
+ * Pure functions. No I/O, no Brain dependency. The pipeline supplies the
+ * inputs (classification, rubric, anchor, measurements) and persists the
+ * resulting `Record<Dimension, DimensionScore>` + RollupScore.
+ */
+
+import {
+  DIMENSIONS,
+  type ConfidenceLevel,
+  type Dimension,
+  type DimensionScore,
+  type RollupScore,
+} from './types.js'
+import type { PageType } from '../types.js'
+import { rollupFormula, rollupWeightsFor } from '../rubric/rollup-weights.js'
+import type { CalibrationAnchor } from '../rubric/anchor-loader.js'
+import { renderAnchor } from '../rubric/anchor-loader.js'
+
+const VALID_CONFIDENCE: readonly ConfidenceLevel[] = ['high', 'medium', 'low'] as const
+
+export interface BuildV2PromptInput {
+  pageType: PageType
+  rubricBody: string
+  anchor?: CalibrationAnchor
+  /** Concise text summary of deterministic measurements (axe, contrast). */
+  measurementSummary: string
+  /** Optional auditor framing override. */
+  systemOpener?: string
+  /** Page intent line surfaced from classification. */
+  intent?: string
+}
+
+const DEFAULT_OPENER =
+  'You are a principal product-design auditor. Score this page on five universal dimensions independently, with explicit ranges and confidence. The downstream system aggregates these into a page-type-aware rollup.'
+
+/**
+ * Build the v2 evaluation prompt. Demands per-dim DimensionScore output with
+ * range + confidence. Does NOT request the rollup — the rollup is computed
+ * deterministically from the per-dim scores using rollup-weights.
+ */
+export function buildEvalPromptV2(input: BuildV2PromptInput): string {
+  const opener = input.systemOpener ?? DEFAULT_OPENER
+  const anchorBlock = input.anchor ? renderAnchor(input.anchor) : ''
+  const intentLine = input.intent ? `\nPAGE INTENT (from classifier): ${input.intent}` : ''
+
+  return `${opener}
+
+You are auditing a page that has been pre-classified as type=${input.pageType}. Contrast and accessibility measurements have already been counted deterministically — do NOT re-evaluate them. They will be merged with your output.${intentLine}
+
+DIMENSIONS — score each one 1-10 (integer) with an explicit uncertainty range and confidence:
+
+  product_intent — Does the page make its audience, purpose, primary action, and product state obvious within 5 seconds? Empty/loading/error states designed?
+  visual_craft — Is the visual system intentional? Typography ramp, spacing rhythm, color tokens, component coherence, polish details. Decorative-but-shallow output is a defect.
+  trust_clarity — Are commitments (money, identity, deploy, share, irreversible actions) accompanied by the right trust details (price, fees, permissions, undo path, provenance)?
+  workflow — Can a user complete the end-to-end job? State transitions, recovery from failure, action hierarchy match the operational verbs of the system.
+  content_ia — Is the copy plain and useful? Are labels and IA tuned to the audience's tasks? Meta-copy that explains the UI is a defect.
+
+DETERMINISTIC MEASUREMENTS (do not duplicate):
+${input.measurementSummary}
+
+${anchorBlock ? anchorBlock + '\n\n' : ''}EVALUATION RUBRIC:
+${input.rubricBody}
+
+OUTPUT REQUIREMENTS:
+- Every dimension MUST have an integer score 1-10.
+- Every dimension MUST have a range [low, high] with low <= score <= high. Range width encodes your uncertainty.
+- Every dimension MUST have confidence in {"high","medium","low"}.
+- Summary is one sentence grounded in observable evidence.
+- primaryFindings is a list of finding ids that drive the score (may be empty if you produce no findings).
+
+RESPOND WITH ONLY a JSON object:
+{
+  "scores": {
+    "product_intent": { "score": 6, "range": [5, 7], "confidence": "medium", "summary": "Hero is clear but action hierarchy is diffuse.", "primaryFindings": [] },
+    "visual_craft": { "score": 7, "range": [6, 8], "confidence": "high", "summary": "Spacing rhythm is intentional but type ramp drifts in cards.", "primaryFindings": [] },
+    "trust_clarity": { "score": 5, "range": [4, 6], "confidence": "medium", "summary": "Fees disclosed but only at the final step.", "primaryFindings": [] },
+    "workflow": { "score": 6, "range": [5, 7], "confidence": "medium", "summary": "Empty state directs the user but error recovery is implicit.", "primaryFindings": [] },
+    "content_ia": { "score": 7, "range": [6, 8], "confidence": "high", "summary": "Copy is plain and audience-tuned.", "primaryFindings": [] }
+  },
+  "summary": "One-sentence overall assessment.",
+  "strengths": ["..."],
+  "findings": []
+}
+
+Score 1-10. Most production apps score 5-7. Only world-class deserves 8+. Be honest.`
+}
+
+export interface ParsedDimensionScores {
+  scores: Record<Dimension, DimensionScore>
+  summary: string
+  strengths: string[]
+}
+
+/**
+ * Parse the v2 LLM response. Throws when scores are missing, ranges violate
+ * `range[0] <= score <= range[1]`, or score is outside 1..10. The pipeline
+ * catches the throw and falls back to v1 mean-of-passes.
+ */
+export function parseAuditResponseV2(raw: string): ParsedDimensionScores {
+  const parsed = extractJsonObject(raw)
+  if (!parsed) throw new Error('v2 parser: no JSON object in response')
+
+  const rawScores = (parsed as { scores?: unknown }).scores
+  if (!rawScores || typeof rawScores !== 'object') {
+    throw new Error('v2 parser: missing scores object')
+  }
+
+  const scoreMap = rawScores as Record<string, unknown>
+  const out: Partial<Record<Dimension, DimensionScore>> = {}
+  for (const dim of DIMENSIONS) {
+    const dimRaw = scoreMap[dim]
+    if (!dimRaw || typeof dimRaw !== 'object') {
+      throw new Error(`v2 parser: dimension ${dim} missing`)
+    }
+    out[dim] = parseDimensionScore(dim, dimRaw as Record<string, unknown>)
+  }
+
+  return {
+    scores: out as Record<Dimension, DimensionScore>,
+    summary: typeof (parsed as { summary?: unknown }).summary === 'string' ? (parsed as { summary: string }).summary : '',
+    strengths: Array.isArray((parsed as { strengths?: unknown }).strengths)
+      ? ((parsed as { strengths: unknown[] }).strengths.filter(
+          (s): s is string => typeof s === 'string',
+        ))
+      : [],
+  }
+}
+
+function parseDimensionScore(dim: Dimension, raw: Record<string, unknown>): DimensionScore {
+  const score = raw.score
+  if (typeof score !== 'number' || !Number.isFinite(score)) {
+    throw new Error(`v2 parser: ${dim}.score must be a number`)
+  }
+  const integerScore = Math.round(score)
+  if (integerScore < 1 || integerScore > 10) {
+    throw new Error(`v2 parser: ${dim}.score=${integerScore} outside 1..10`)
+  }
+  const range = raw.range
+  if (!Array.isArray(range) || range.length !== 2 || typeof range[0] !== 'number' || typeof range[1] !== 'number') {
+    throw new Error(`v2 parser: ${dim}.range must be [number, number]`)
+  }
+  const [low, high] = range
+  if (low > high) {
+    throw new Error(`v2 parser: ${dim}.range=[${low},${high}] inverted`)
+  }
+  if (integerScore < low || integerScore > high) {
+    throw new Error(`v2 parser: ${dim}.score=${integerScore} outside range [${low},${high}]`)
+  }
+  if (low < 1 || high > 10) {
+    throw new Error(`v2 parser: ${dim}.range=[${low},${high}] outside 1..10`)
+  }
+  const confidenceRaw = String(raw.confidence ?? '').toLowerCase()
+  const confidence = (VALID_CONFIDENCE as readonly string[]).includes(confidenceRaw)
+    ? (confidenceRaw as ConfidenceLevel)
+    : 'medium'
+  const summary = typeof raw.summary === 'string' ? raw.summary : ''
+  const primaryFindings = Array.isArray(raw.primaryFindings)
+    ? raw.primaryFindings.filter((s): s is string => typeof s === 'string')
+    : []
+  return { score: integerScore, range: [low, high], confidence, summary, primaryFindings }
+}
+
+/**
+ * Compute the rollup from per-dimension scores using per-page-type weights.
+ * Conservative confidence rule: rollup confidence = lowest dim confidence.
+ */
+export function computeRollup(scores: Record<Dimension, DimensionScore>, pageType: PageType): RollupScore {
+  const weights = rollupWeightsFor(pageType)
+  let weighted = 0
+  let lowSum = 0
+  let highSum = 0
+  for (const dim of DIMENSIONS) {
+    const dimScore = scores[dim]
+    const w = weights[dim]
+    weighted += dimScore.score * w
+    lowSum += dimScore.range[0] * w
+    highSum += dimScore.range[1] * w
+  }
+  const score = Math.round(weighted * 10) / 10
+  const range: [number, number] = [
+    Math.round(lowSum * 10) / 10,
+    Math.round(highSum * 10) / 10,
+  ]
+
+  const confidences = DIMENSIONS.map((d) => scores[d].confidence)
+  const confidence: ConfidenceLevel = confidences.includes('low')
+    ? 'low'
+    : confidences.includes('medium')
+      ? 'medium'
+      : 'high'
+
+  return {
+    score,
+    range,
+    confidence,
+    rule: rollupFormula(pageType, weights),
+    weights,
+  }
+}
+
+/**
+ * Aggregate per-dim scores from N independent passes (mean). Used when the
+ * audit runs deep mode and we want one DimensionScore per dimension.
+ */
+export function mergeDimensionScoresAcrossPasses(
+  perPass: Array<Record<Dimension, DimensionScore>>,
+): Record<Dimension, DimensionScore> {
+  if (perPass.length === 0) {
+    throw new Error('mergeDimensionScoresAcrossPasses: empty input')
+  }
+  if (perPass.length === 1) return perPass[0]!
+
+  const out: Partial<Record<Dimension, DimensionScore>> = {}
+  for (const dim of DIMENSIONS) {
+    const samples = perPass.map((p) => p[dim])
+    const meanScore = samples.reduce((a, s) => a + s.score, 0) / samples.length
+    const meanLow = samples.reduce((a, s) => a + s.range[0], 0) / samples.length
+    const meanHigh = samples.reduce((a, s) => a + s.range[1], 0) / samples.length
+    const conf = samples.map((s) => s.confidence)
+    const confidence: ConfidenceLevel = conf.includes('low') ? 'low' : conf.includes('medium') ? 'medium' : 'high'
+    const allFindings = samples.flatMap((s) => s.primaryFindings)
+    const primaryFindings = Array.from(new Set(allFindings)).slice(0, 3)
+    const summary = samples.find((s) => s.summary)?.summary ?? ''
+    out[dim] = {
+      score: Math.round(meanScore),
+      range: [
+        Math.max(1, Math.floor(meanLow)),
+        Math.min(10, Math.ceil(meanHigh)),
+      ],
+      confidence,
+      summary,
+      primaryFindings,
+    }
+  }
+  return out as Record<Dimension, DimensionScore>
+}
+
+function extractJsonObject(raw: string): unknown {
+  try {
+    let text = raw.trim()
+    if (text.startsWith('```')) {
+      text = text.replace(/^```(?:json)?\n?/, '').replace(/\n?```$/, '')
+    }
+    const start = text.indexOf('{')
+    const end = text.lastIndexOf('}')
+    if (start < 0 || end <= start) return null
+    return JSON.parse(text.slice(start, end + 1))
+  } catch {
+    return null
+  }
+}
diff --git a/src/design/audit/v2/types.ts b/src/design/audit/v2/types.ts
new file mode 100644
index 0000000..a010cac
--- /dev/null
+++ b/src/design/audit/v2/types.ts
@@ -0,0 +1,512 @@
+/**
+ * Design audit v2 — type contract for the 8-layer architecture.
+ *
+ * RFC: docs/rfc/design-audit-world-class.md
+ *
+ * This file is the stable contract that every layer's implementation
+ * builds against. It exists to let parallel implementation work proceed
+ * without diverging interfaces. Editing this file mid-build is a coordinated
+ * change; layers must update in lockstep.
+ *
+ * Invariants enforced by this contract:
+ * - Every score is a `DimensionScore` with `range` + `confidence`.
No bare numbers. + * - Every finding with `severity in ['major','critical']` MUST have ≥1 `Patch`. + * - Every patch has both `target` (what changes) and `testThatProves` (how we verify). + * - Every classification carries explicit `ensembleConfidence` and `signalsAgreed`. + * - Every audit run can write a `PatchApplication` event for post-hoc attribution. + * - Pattern, ethics, modality types compose cleanly via shared `AppliesWhen`. + */ + +import type { + PageClassification, + PageType, + Maturity, + DesignSystemTag, + AppliesWhen as AppliesWhenV1, + MeasurementBundle, + DesignFinding as DesignFindingV1, +} from '../types.js' + +// Re-export so consumers import only from v2/types.ts. +export type { PageClassification, PageType, Maturity, DesignSystemTag, MeasurementBundle } + +// ─── Layer 1 · Multi-dimensional scoring ──────────────────────────────────── + +/** + * The five universal dimensions. Every audit produces a DimensionScore for + * each. The rollup is computed from these via per-page-type weights. + */ +export type Dimension = + | 'product_intent' + | 'visual_craft' + | 'trust_clarity' + | 'workflow' + | 'content_ia' + +export const DIMENSIONS: readonly Dimension[] = [ + 'product_intent', + 'visual_craft', + 'trust_clarity', + 'workflow', + 'content_ia', +] as const + +export type ConfidenceLevel = 'high' | 'medium' | 'low' + +export interface DimensionScore { + /** 1-10 integer score on the dimension. */ + score: number + /** Self-reported uncertainty range. `range[0] <= score <= range[1]`. */ + range: [number, number] + /** Auditor's confidence in the score. */ + confidence: ConfidenceLevel + /** One-sentence assessment grounded in observable evidence. */ + summary: string + /** Stable ids of top findings driving this score. References `DesignFinding.id`. */ + primaryFindings: string[] +} + +export interface RollupScore { + /** Weighted aggregate of `Record`. 1-10 number, can be fractional. */ + score: number + /** Aggregate uncertainty range. 
*/ + range: [number, number] + /** Aggregate confidence. Conservative — `low` if any dim is `low`. */ + confidence: ConfidenceLevel + /** Human-readable formula, e.g. "saas-app: product*0.35 + workflow*0.30 + ...". */ + rule: string + /** Per-dimension weight that produced this rollup. Must sum to 1.0 ± 1e-6. */ + weights: Record +} + +// ─── Ensemble classifier ──────────────────────────────────────────────────── + +export type ClassifierSource = 'url-pattern' | 'dom-heuristic' | 'llm' + +export interface ClassifierSignal { + source: ClassifierSource + type: PageType + /** 0..1, source-specific. */ + confidence: number + /** Why this signal voted this type. Logged for debugging. */ + rationale: string +} + +export interface EnsembleClassification extends PageClassification { + /** Every signal that voted on this classification. */ + signals: ClassifierSignal[] + /** True if all signals agreed on `type`. */ + signalsAgreed: boolean + /** Aggregated 0..1 confidence after ensemble vote. */ + ensembleConfidence: number + /** Signals that disagreed with the final type, if any. */ + dissent?: { source: ClassifierSource; type: PageType }[] + /** True if Layer 3 (first-principles) mode was triggered. */ + firstPrinciplesMode: boolean +} + +/** + * DOM-derived signals used by the heuristic classifier. Captured once during + * the page-load phase, fed to the ensemble vote, and emitted into telemetry. + */ +export interface DomHeuristics { + formCount: number + inputCount: number + tableRowCount: number + chartCount: number + navItems: number + hasFooterLinks: boolean + hasHeroSection: boolean + hasSidebar: boolean + paragraphCount: number + codeBlockCount: number +} + +// ─── Layer 2 · Patch primitives ───────────────────────────────────────────── + +/** + * Where a patch applies. At least one of `cssSelector | filePath | componentName` + * MUST be set. The combination determines how an agent applies it. 
+ */ +export interface PatchTarget { + /** Source file path when known via component scan. */ + filePath?: string + /** Component name when known (e.g. 'Sidebar', 'PrimaryButton'). */ + componentName?: string + /** CSS selector — fallback when filePath unknown. */ + cssSelector?: string + /** Patch scope. Determines applicability check. */ + scope: 'tsx' | 'jsx' | 'css' | 'tailwind' | 'module-css' | 'styled-component' | 'structural' | 'html' +} + +export interface PatchDiff { + /** + * Exact substring being replaced. Validators MUST verify `before` is a + * substring of the page snapshot or source file at apply time. If `before` + * is not found, the patch is rejected (no fuzzy apply). + */ + before: string + /** Replacement text. */ + after: string + /** + * When `target.filePath` is known, the unified diff format an agent can + * pipe to `git apply`. Optional; `before`/`after` is the canonical form. + */ + unifiedDiff?: string +} + +export type PatchTestKind = + | 'storybook' + | 'a11y-rule' + | 'visual-snapshot' + | 'unit' + | 'rerun-audit' + | 'manual' + +export interface PatchTest { + kind: PatchTestKind + /** Human-readable description of what proves the patch worked. */ + description: string + /** Optional CLI command an agent can invoke to verify (e.g. `pnpm vitest `). */ + command?: string +} + +export type PatchRollbackKind = 'git-revert' | 'css-disable' | 'manual' + +export interface PatchRollback { + kind: PatchRollbackKind + /** Optional human-readable rollback instruction. */ + instruction?: string +} + +/** + * A `Patch` is the agent-actionable unit. Layer 2 mandates ≥1 patch on every + * major/critical finding. Findings without patches downgrade to minor. + */ +export interface Patch { + /** Stable id derived from finding hash + target. Same patch across tenants → same id. */ + patchId: string + /** The finding this patch fixes. */ + findingId: string + /** Patch scope — page/section/component/system, drives ROI weighting. 
*/ + scope: 'page' | 'section' | 'component' | 'system' + target: PatchTarget + diff: PatchDiff + testThatProves: PatchTest + rollback: PatchRollback + /** The dimension the auditor predicts this patch will move + by how much. */ + estimatedDelta: { dim: Dimension; delta: number } + /** + * Confidence in `estimatedDelta`, calibrated against fleet outcomes (Layer 4). + * 'untested' means no fleet data yet; 'high' means N≥30 with replication ≥0.7. + */ + estimatedDeltaConfidence: ConfidenceLevel | 'untested' + /** + * If this patch matches a known fleet pattern (Layer 5), the matched pattern + * id. Surfaced by the auditor so agents prefer evidence-backed patches. + */ + matchedPatternId?: string +} + +/** + * Updated `DesignFinding` shape — extends v1 with stable id, dimension link, + * mandatory patches for major/critical, optional pattern match. + */ +export interface DesignFinding extends DesignFindingV1 { + /** Stable id, used by `DimensionScore.primaryFindings`. */ + id: string + /** Which dimension this finding affects. */ + dimension: Dimension + /** Agent-actionable patches. Required (≥1) when severity is major or critical. */ + patches: Patch[] + /** + * Discriminator for finding kind. `polish` findings cap at impact 6; + * `job` findings can go to 10; `measurement` findings come from axe/contrast. + * Set this so ROI ranking auto-prioritizes job over polish. + */ + kind: 'polish' | 'job' | 'measurement' +} + +// ─── Layer 3 · First-principles fallback ──────────────────────────────────── + +/** + * Triggered when ensemble confidence is low or no fixture matches the page + * structure. Auditor scores against 5 universal principles and emits a + * novel-pattern record for fleet mining. + */ +export interface NovelPatternObservation { + observationId: string + capturedAt: string + /** What was distinctive about this page structurally. */ + observed: string + /** Closest existing classification, with low confidence. 
*/ + closestType: PageType + closestConfidence: number + /** Page snapshot reference for later mining. */ + snapshotKey?: string + /** URL or fixture id. */ + pageRef: string +} + +// ─── Layer 4 · Outcome attribution ────────────────────────────────────────── + +/** + * One application of a patch. Emitted by the `bad design-audit ack-patch` + * subcommand or auto-detected by the `--evolve` loop. + */ +export interface PatchApplication { + applicationId: string + patchId: string + /** `sha256(diff.before + '\n---\n' + diff.after + '\n---\n' + scope).slice(0, 16)` */ + patchHash: string + appliedAt: string + appliedBy: string // 'agent:claude-code' | 'agent:codex' | 'human' | 'css-injection' | ... + /** The audit run that proposed the patch. */ + preAuditRunId: string + /** The audit run after the patch was applied. May be null until re-audit. */ + postAuditRunId?: string + /** Auditor's prediction at apply time. */ + predicted: { dim: Dimension; delta: number } + /** Measured delta after re-audit. Populated when postAuditRunId resolves. */ + observed?: { dim: Dimension; delta: number } + /** + * Agreement metric: 1.0 = perfect prediction, 0 = orthogonal, negative = wrong direction. + * `(observed.delta * predicted.delta) / max(|observed.delta|, |predicted.delta|, 1)` + */ + agreementScore?: number +} + +/** + * Aggregated reliability across all applications of a patch (joined on + * `patchHash = hash(diff.before, diff.after, scope)`). Surfaces in audit + * output as `Patch.estimatedDeltaConfidence` upgrade. + */ +export interface PatchReliability { + patchHash: string + applications: number + meanPredictedDelta: number + meanObservedDelta: number + /** % of applications where observed >= 0.5 * predicted. */ + replicationRate: number + recommendation: 'recommended' | 'neutral' | 'antipattern' + /** Distinct tenant count. Below 5 → 'untested' confidence. 
*/ + sampleTenants: number +} + +// ─── Layer 5 · Pattern library ────────────────────────────────────────────── + +/** + * A curated known-good design pattern, mined from accumulated PatchApplication + * data once N≥30 across ≥5 distinct tenants with ≥0.7 replication. + */ +export interface Pattern { + patternId: string + /** Free-form category name, e.g. 'leaderboard', 'empty-state', 'pricing-table'. */ + category: string + classification: { type: PageType; tags: string[] } + scaffold: PatternScaffold + scores: { whenFollowed: Record } + fleetEvidence: PatternFleetEvidence + /** Fixture ids that exemplify this pattern. */ + fixtures: string[] +} + +export interface PatternScaffold { + description: string + referenceTsx?: string + referenceCss?: string + /** Concrete decisions that make the pattern work, e.g. 'criterion in header'. */ + keyDecisions: string[] +} + +export interface PatternFleetEvidence { + applications: number + /** % where adopting this pattern delivered the predicted dim delta. */ + successRate: number + medianDimDelta: Record + /** Distinct tenants. ≥5 required for promotion to 'recommended'. */ + sampleTenants: number +} + +export interface PatternQuery { + category?: string + pageType?: PageType + /** "I'm scoring 4 on product_intent — show me patterns that lift it." */ + weakDimension?: Dimension + minApplications?: number + minSuccessRate?: number +} + +export interface PatternMatch { + pattern: Pattern + matchConfidence: number + expectedDelta: Record + /** How to adapt this pattern to the current page. 
*/ + applicationGuidance: string +} + +// ─── Layer 6 · Composable predicates (extends AppliesWhen) ────────────────── + +export type AudienceTag = + | 'developer' + | 'clinician' + | 'analyst' + | 'consumer' + | 'admin' + | 'kids' + | 'enterprise-buyer' + | 'creator' + +export type ModalityTag = 'desktop' | 'tablet' | 'mobile' | 'tv' | 'kiosk' + +export type RegulatoryContextTag = 'hipaa' | 'gdpr' | 'sox' | 'pci-dss' | 'coppa' | 'wcag-aaa' + +export type AudienceVulnerabilityTag = + | 'patient-facing' + | 'minor-facing' + | 'high-stakes-financial' + | 'crisis-context' + +/** + * v2 predicate set. Extends v1 with audience/modality/regulatoryContext/ + * audienceVulnerability so a pediatric medical app on tablet for clinicians + * loads multiple fragments simultaneously. + */ +export interface AppliesWhen extends AppliesWhenV1 { + audience?: AudienceTag[] + modality?: ModalityTag[] + regulatoryContext?: RegulatoryContextTag[] + audienceVulnerability?: AudienceVulnerabilityTag[] +} + +// ─── Layer 7 · Domain ethics gate ─────────────────────────────────────────── + +export type EthicsCategory = 'medical' | 'kids' | 'finance' | 'legal' | 'accessibility' | 'crisis' + +export type EthicsSeverity = 'critical-floor' | 'major-floor' + +export interface EthicsRule { + ruleId: string + category: EthicsCategory + severity: EthicsSeverity + appliesWhen: AppliesWhen + detector: EthicsDetector + remediation: string + /** Citation to regulation or standard, e.g. 'FDA 21 CFR 201.57'. */ + citation?: string +} + +export type EthicsDetector = + | { kind: 'pattern-absent'; pattern: string } + | { kind: 'pattern-present'; pattern: string } + | { kind: 'llm-classifier'; llmCheck: string } + +export interface EthicsViolation { + ruleId: string + detected: true + severity: EthicsSeverity + /** Rollup ceiling enforced by this violation. critical-floor → 4; major-floor → 6. 
*/ + rollupCap: number + remediation: string + citation?: string +} + +// ─── Layer 8 · Modality adapters ──────────────────────────────────────────── + +export type Modality = 'html' | 'ios' | 'android' | 'terminal' | 'voice' + +export interface ModalityInput { + /** Modality-specific entry point — URL for HTML, app bundle for iOS, etc. */ + entryPoint: string + /** Optional flow specification when capturing multiple surfaces. */ + flow?: string[] +} + +/** + * Per-modality measurement bundle — analogous to the existing HTML + * MeasurementBundle (axe + contrast). Modality-specific implementations + * provide their own a11y/contrast equivalents. + */ +export interface SurfaceMeasurements { + modality: Modality + /** A11y violations — modality-specific shape. */ + a11y?: unknown + /** Contrast or readability check — modality-specific. */ + contrast?: unknown + /** Modality-specific measurements (haptic, latency, etc.). */ + extra?: Record<string, unknown> +} + +export interface SurfaceRecord { + /** URL for HTML; screen name for native; turn id for voice. */ + identifier: string + measurements: SurfaceMeasurements + snapshot: string + screenshot?: string +} + +export interface Evidence { + modality: Modality + surfaces: SurfaceRecord[] + /** Roll-up of per-surface measurements for backwards compat with v1 pipeline. */ + measurements: MeasurementBundle + /** Concatenated snapshot for LLM consumption. */ + snapshot: string + screenshot?: string +} + +export interface ModalityAdapter { + modality: Modality + capture(input: ModalityInput): Promise<Evidence> +} + +// ─── AuditResult v2 — the top-level output ────────────────────────────────── + +export interface AuditResult_v2 { + schemaVersion: 2 + /** Run id for telemetry / attribution correlation. */ + runId: string + /** Page reference (URL for HTML; bundle id for native; etc.). */ + pageRef: string + classification: EnsembleClassification + /** Per-dimension scores, ALWAYS all 5 dimensions. 
*/ + scores: Record<Dimension, number> + rollup: RollupScore + /** Findings + patches. Includes deterministic measurements (axe, contrast). */ + findings: DesignFinding[] + /** Top-N findings ranked by ROI. References `findings[*].id`. */ + topFixes: string[] + measurements: MeasurementBundle + ethicsViolations: EthicsViolation[] + /** Patterns matched against the page (Layer 5). May be empty. */ + matchedPatterns: PatternMatch[] + /** When first-principles mode triggered (Layer 3). May be undefined. */ + novelPattern?: NovelPatternObservation + /** Modality (Layer 8). HTML for v1 compat. */ + modality: Modality + /** Provenance. */ + evaluatedAt: string + promptHash: string + rubricHash: string + /** LLM token usage across passes. */ + tokensUsed?: number + /** Ensemble of audit passes that ran (deep / max / single). */ + passes: string[] + error?: string +} + +// ─── CLI / runtime hints ──────────────────────────────────────────────────── + +/** + * Operator-supplied hints. None override the classifier outright; they bias + * the ensemble toward a result. If a hint disagrees with the classifier's + * final type with high confidence, a warning surfaces. + */ +export interface AuditRuntimeHints { + rubricHint?: PageType + audience?: AudienceTag[] + modality?: ModalityTag[] + regulatoryContext?: RegulatoryContextTag[] + /** Tenant id for cross-tenant attribution + ethics rule overrides. 
*/ + tenantId?: string +} diff --git a/tests/design-audit-anchor-loader.test.ts b/tests/design-audit-anchor-loader.test.ts new file mode 100644 index 0000000..b3eb28e --- /dev/null +++ b/tests/design-audit-anchor-loader.test.ts @@ -0,0 +1,141 @@ +import { describe, it, expect } from 'vitest' +import * as fs from 'node:fs' +import * as path from 'node:path' +import * as os from 'node:os' +import { fileURLToPath } from 'node:url' +import { + loadAnchors, + parseAnchorFile, + renderAnchor, +} from '../src/design/audit/rubric/anchor-loader.js' +import type { PageType } from '../src/design/audit/types.js' + +const __dirname = path.dirname(fileURLToPath(import.meta.url)) +const ANCHORS_DIR = path.resolve(__dirname, '..', 'src', 'design', 'audit', 'rubric', 'anchors') + +const REQUIRED_TYPES: PageType[] = [ + 'saas-app', + 'marketing', + 'dashboard', + 'docs', + 'ecommerce', + 'social', + 'tool', + 'blog', + 'utility', +] + +describe('anchor-loader — Layer 1', () => { + it('all 9 builtin anchor files exist', () => { + for (const t of REQUIRED_TYPES) { + expect(fs.existsSync(path.join(ANCHORS_DIR, `${t}.yaml`))).toBe(true) + } + }) + + it('loadAnchors() returns one anchor per page type', () => { + const anchors = loadAnchors(ANCHORS_DIR) + for (const t of REQUIRED_TYPES) { + const anchor = anchors.get(t) + expect(anchor).toBeDefined() + expect(anchor?.type).toBe(t) + } + }) + + it('every band has at least 3 criteria and at least 1 fixture', () => { + const anchors = loadAnchors(ANCHORS_DIR) + for (const anchor of anchors.values()) { + for (const band of ['score_9_10', 'score_7_8', 'score_5_6', 'score_3_4'] as const) { + const b = anchor[band] + expect(b.criteria.length).toBeGreaterThanOrEqual(3) + expect(b.fixtures.length).toBeGreaterThanOrEqual(1) + for (const c of b.criteria) { + expect(typeof c).toBe('string') + expect(c.length).toBeGreaterThan(8) + } + for (const f of b.fixtures) { + expect(typeof f).toBe('string') + expect(f.startsWith('fixture:')).toBe(true) + } + } 
+ } + }) + + it('saas-app anchor cites Linear app + Figma + Notion + Superhuman + GitHub PR view', () => { + const a = parseAnchorFile(path.join(ANCHORS_DIR, 'saas-app.yaml')) + const refs = a.score_9_10.fixtures.join(' ') + expect(refs).toContain('linear-app') + expect(refs).toContain('figma-file-ui') + expect(refs).toContain('notion-editor') + expect(refs).toContain('superhuman') + expect(refs).toContain('github-pr-view') + }) + + it('marketing anchor cites Stripe / Linear / Vercel / Apple', () => { + const a = parseAnchorFile(path.join(ANCHORS_DIR, 'marketing.yaml')) + const refs = a.score_9_10.fixtures.join(' ') + expect(refs).toContain('stripe-marketing') + expect(refs).toContain('linear-marketing') + expect(refs).toContain('vercel-marketing') + expect(refs).toContain('apple-marketing') + }) + + it('docs anchor cites Stripe Docs / Tailwind Docs / MDN / Vercel Docs', () => { + const a = parseAnchorFile(path.join(ANCHORS_DIR, 'docs.yaml')) + const refs = a.score_9_10.fixtures.join(' ') + expect(refs).toContain('stripe-docs') + expect(refs).toContain('tailwind-docs') + expect(refs).toContain('mdn-docs') + expect(refs).toContain('vercel-docs') + }) + + it('renderAnchor produces injectable markdown', () => { + const a = parseAnchorFile(path.join(ANCHORS_DIR, 'saas-app.yaml')) + const md = renderAnchor(a) + expect(md).toContain('Score 9-10') + expect(md).toContain('Score 7-8') + expect(md).toContain('Score 5-6') + expect(md).toContain('Score 3-4') + expect(md).toContain('References:') + // contains an actual fixture reference + expect(md).toContain('fixture:linear-app') + }) + + it('returns empty Map for nonexistent dir', () => { + expect(loadAnchors('/nonexistent/anchors/dir').size).toBe(0) + }) + + it('throws on malformed file (missing band)', () => { + const tmp = fs.mkdtempSync(path.join(os.tmpdir(), 'anchor-')) + const file = path.join(tmp, 'bad.yaml') + fs.writeFileSync( + file, + `type: saas-app +score_9_10: + criteria: + - one criterion + fixtures: + - 
fixture:x +score_7_8: + criteria: + - one criterion + fixtures: + - fixture:x +score_5_6: + criteria: + - one criterion + fixtures: + - fixture:x +`, + ) + expect(() => parseAnchorFile(file)).toThrow(/score_3_4/) + fs.rmSync(tmp, { recursive: true, force: true }) + }) + + it('throws on malformed file (missing type)', () => { + const tmp = fs.mkdtempSync(path.join(os.tmpdir(), 'anchor-')) + const file = path.join(tmp, 'bad.yaml') + fs.writeFileSync(file, 'score_9_10:\n criteria:\n - x\n fixtures:\n - fixture:x\n') + expect(() => parseAnchorFile(file)).toThrow(/type/) + fs.rmSync(tmp, { recursive: true, force: true }) + }) +}) diff --git a/tests/design-audit-attribution.test.ts b/tests/design-audit-attribution.test.ts new file mode 100644 index 0000000..dea8a04 --- /dev/null +++ b/tests/design-audit-attribution.test.ts @@ -0,0 +1,150 @@ +import { describe, it, expect, afterEach } from 'vitest' +import { mkdtempSync, rmSync } from 'node:fs' +import { tmpdir } from 'node:os' +import { join } from 'node:path' +import { + patchHash, + appendPatchApplication, + readRecentApplications, + findPendingApplication, + updateApplicationOutcome, +} from '../src/design/audit/attribution/store.js' +import { aggregatePatchReliability, recommendationFor } from '../src/design/audit/attribution/aggregate.js' +import type { PatchApplication } from '../src/design/audit/attribution/types.js' + +function makeApp(overrides: Partial<PatchApplication> = {}): PatchApplication { + return { + applicationId: `app-${Math.random().toString(36).slice(2)}`, + patchId: 'patch-001', + patchHash: 'abc123', + appliedAt: new Date().toISOString(), + appliedBy: 'agent:claude-code', + preAuditRunId: 'run-pre', + predicted: { dim: 'visual_craft', delta: 2 }, + ...overrides, + } +} + +describe('patchHash', () => { + it('produces stable output for same inputs', () => { + const h1 = patchHash({ before: 'color: red', after: 'color: blue' }, 'component') + const h2 = patchHash({ before: 'color: red', after: 'color: blue' }, 
'component') + expect(h1).toBe(h2) + expect(h1).toHaveLength(16) + }) + + it('produces different hashes for different scope', () => { + const h1 = patchHash({ before: 'a', after: 'b' }, 'component') + const h2 = patchHash({ before: 'a', after: 'b' }, 'page') + expect(h1).not.toBe(h2) + }) +}) + +describe('attribution store', () => { + let tmpDir: string + afterEach(() => { + if (tmpDir) rmSync(tmpDir, { recursive: true, force: true }) + }) + + it('appends and reads back an application', async () => { + tmpDir = mkdtempSync(join(tmpdir(), 'bad-attr-')) + const app = makeApp() + await appendPatchApplication(app, tmpDir) + + const apps = await readRecentApplications(1, tmpDir) + expect(apps).toHaveLength(1) + expect(apps[0].applicationId).toBe(app.applicationId) + }) + + it('is append-only: file grows on second write', async () => { + tmpDir = mkdtempSync(join(tmpdir(), 'bad-attr-')) + const a1 = makeApp({ applicationId: 'first' }) + const a2 = makeApp({ applicationId: 'second' }) + await appendPatchApplication(a1, tmpDir) + await appendPatchApplication(a2, tmpDir) + + const apps = await readRecentApplications(1, tmpDir) + expect(apps.length).toBeGreaterThanOrEqual(2) + }) + + it('finds a pending application by patchId', async () => { + tmpDir = mkdtempSync(join(tmpdir(), 'bad-attr-')) + const app = makeApp({ patchId: 'patch-findme' }) + await appendPatchApplication(app, tmpDir) + + const found = await findPendingApplication('patch-findme', tmpDir) + expect(found).not.toBeNull() + expect(found!.applicationId).toBe(app.applicationId) + }) + + it('does not find a pending application when postAuditRunId is set', async () => { + tmpDir = mkdtempSync(join(tmpdir(), 'bad-attr-')) + const app = makeApp({ patchId: 'patch-done', postAuditRunId: 'run-post' }) + await appendPatchApplication(app, tmpDir) + + const found = await findPendingApplication('patch-done', tmpDir) + expect(found).toBeNull() + }) + + it('appends an outcome event and the agreementScore is computed', async 
() => { + tmpDir = mkdtempSync(join(tmpdir(), 'bad-attr-')) + const app = makeApp({ predicted: { dim: 'visual_craft', delta: 2 } }) + await appendPatchApplication(app, tmpDir) + + await updateApplicationOutcome( + app.applicationId, + 'run-post', + { dim: 'visual_craft', delta: 1.5 }, + tmpDir, + ) + + const apps = await readRecentApplications(1, tmpDir) + const outcome = apps.find(a => a.applicationId === app.applicationId && a.postAuditRunId) + expect(outcome).toBeDefined() + expect(outcome!.agreementScore).toBeGreaterThan(0) + }) +}) + +describe('aggregatePatchReliability', () => { + it('produces recommended when N≥30, tenants≥5, replicationRate≥0.7', () => { + const hash = 'deadbeef' + const apps: PatchApplication[] = Array.from({ length: 30 }, (_, i) => ({ + applicationId: `app-${i}`, + patchId: 'p', + patchHash: hash, + appliedAt: new Date().toISOString(), + appliedBy: `agent:tenant-${i % 6}`, + preAuditRunId: 'pre', + predicted: { dim: 'visual_craft', delta: 2 }, + observed: { dim: 'visual_craft', delta: 2 }, + })) + + const [rel] = aggregatePatchReliability(apps) + expect(rel.patchHash).toBe(hash) + expect(rel.recommendation).toBe('recommended') + expect(rel.replicationRate).toBeCloseTo(1.0) + }) + + it('produces antipattern when N≥10, low replication, negative observed delta', () => { + const hash = 'baadf00d' + const apps: PatchApplication[] = Array.from({ length: 10 }, (_, i) => ({ + applicationId: `app-${i}`, + patchId: 'p', + patchHash: hash, + appliedAt: new Date().toISOString(), + appliedBy: 'agent:a', + preAuditRunId: 'pre', + predicted: { dim: 'visual_craft', delta: 2 }, + observed: { dim: 'visual_craft', delta: -1 }, + })) + + const [rel] = aggregatePatchReliability(apps) + expect(rel.recommendation).toBe('antipattern') + }) +}) + +describe('recommendationFor', () => { + it('is neutral below thresholds', () => { + expect(recommendationFor(5, 2, 0.5, 1)).toBe('neutral') + }) +}) diff --git a/tests/design-audit-ensemble.test.ts 
b/tests/design-audit-ensemble.test.ts new file mode 100644 index 0000000..8e0fe98 --- /dev/null +++ b/tests/design-audit-ensemble.test.ts @@ -0,0 +1,247 @@ +import { describe, it, expect } from 'vitest' +import { + classifyByUrl, + classifyByDom, + classifyEnsemble, + deriveHeuristics, + ENSEMBLE_INTERNALS, +} from '../src/design/audit/classify-ensemble.js' +import type { DomHeuristics } from '../src/design/audit/v2/types.js' +import type { Brain } from '../src/brain/index.js' +import type { PageState } from '../src/types.js' + +function emptyHeuristics(overrides: Partial<DomHeuristics> = {}): DomHeuristics { + return { + formCount: 0, + inputCount: 0, + tableRowCount: 0, + chartCount: 0, + navItems: 0, + hasFooterLinks: false, + hasHeroSection: false, + hasSidebar: false, + paragraphCount: 0, + codeBlockCount: 0, + ...overrides, + } +} + +function fakeState(snapshot: string = ''): PageState { + return { + url: 'https://example.com/', + title: 'Example', + snapshot, + screenshot: '', + } as PageState +} + +interface FakeBrainResult { + type: string + confidence: number + intent?: string +} + +function fakeBrain(result: FakeBrainResult): Brain { + return { + auditDesign: async () => ({ + raw: JSON.stringify({ + type: result.type, + domain: 'unknown', + framework: null, + designSystem: 'unknown', + maturity: 'shipped', + intent: result.intent ?? 
'', + confidence: result.confidence, + }), + score: 5, + findings: [], + tokensUsed: 100, + }), + } as unknown as Brain +} + +describe('classifyByUrl — Layer 1', () => { + it('matches /docs paths', () => { + const sig = classifyByUrl('https://example.com/docs/intro') + expect(sig?.type).toBe('docs') + expect(sig?.confidence).toBeGreaterThanOrEqual(0.8) + }) + it('matches /checkout paths', () => { + expect(classifyByUrl('https://example.com/checkout/cart')?.type).toBe('ecommerce') + }) + it('matches /app paths', () => { + expect(classifyByUrl('https://example.com/app')?.type).toBe('saas-app') + }) + it('matches /login paths', () => { + expect(classifyByUrl('https://example.com/login')?.type).toBe('utility') + }) + it('matches /pricing paths', () => { + expect(classifyByUrl('https://example.com/pricing')?.type).toBe('marketing') + }) + it('matches /blog paths', () => { + expect(classifyByUrl('https://example.com/blog/post-1')?.type).toBe('blog') + }) + it('roots default to weak marketing signal', () => { + const sig = classifyByUrl('https://example.com/') + expect(sig?.type).toBe('marketing') + expect(sig?.confidence).toBeLessThanOrEqual(0.5) + }) + it('returns null for unparseable urls', () => { + expect(classifyByUrl('not a url')).toBeNull() + }) +}) + +describe('classifyByDom — Layer 1', () => { + it('docs: many paragraphs + code blocks', () => { + const sig = classifyByDom(emptyHeuristics({ codeBlockCount: 5, paragraphCount: 10 })) + expect(sig?.type).toBe('docs') + }) + it('dashboard: many table rows + sidebar', () => { + const sig = classifyByDom(emptyHeuristics({ tableRowCount: 12, hasSidebar: true })) + expect(sig?.type).toBe('dashboard') + }) + it('saas-app: sidebar + forms', () => { + const sig = classifyByDom(emptyHeuristics({ hasSidebar: true, formCount: 1, inputCount: 5 })) + expect(sig?.type).toBe('saas-app') + }) + it('utility: single form, no hero, no sidebar', () => { + const sig = classifyByDom(emptyHeuristics({ formCount: 1, inputCount: 3 })) + 
expect(sig?.type).toBe('utility') + }) + it('blog: many paragraphs, no forms or tables', () => { + const sig = classifyByDom(emptyHeuristics({ paragraphCount: 10 })) + expect(sig?.type).toBe('blog') + }) + it('marketing: hero + footer + few paragraphs', () => { + const sig = classifyByDom(emptyHeuristics({ hasHeroSection: true, hasFooterLinks: true, paragraphCount: 3 })) + expect(sig?.type).toBe('marketing') + }) + it('returns null for empty input', () => { + expect(classifyByDom(emptyHeuristics())).toBeNull() + }) +}) + +describe('classifyEnsemble — Layer 1', () => { + it('fast path: URL + DOM agree → skip LLM, signalsAgreed true', async () => { + let brainCalls = 0 + const brain = { + auditDesign: async () => { + brainCalls++ + return { raw: '{"type":"docs","confidence":0.9}', score: 5, findings: [], tokensUsed: 0 } + }, + } as unknown as Brain + + const result = await classifyEnsemble({ + brain, + state: fakeState(), + url: 'https://example.com/docs/intro', + domHeuristics: emptyHeuristics({ codeBlockCount: 5, paragraphCount: 10 }), + }) + expect(brainCalls).toBe(0) + expect(result.type).toBe('docs') + expect(result.signalsAgreed).toBe(true) + expect(result.signals.length).toBe(2) + expect(result.ensembleConfidence).toBeGreaterThan(0.5) + expect(result.firstPrinciplesMode).toBe(false) + }) + + it('LLM tiebreaker: signals disagree, LLM has high confidence → LLM wins', async () => { + const brain = fakeBrain({ type: 'saas-app', confidence: 0.9, intent: 'app surface' }) + const result = await classifyEnsemble({ + brain, + state: fakeState(), + url: 'https://example.com/app', + domHeuristics: emptyHeuristics({ paragraphCount: 10 }), // DOM says blog + }) + expect(result.signals.length).toBe(3) + expect(result.signals.some((s) => s.source === 'llm')).toBe(true) + }) + + it('low LLM confidence + signals disagree → unknown with dissent', async () => { + const brain = fakeBrain({ type: 'unknown', confidence: 0.1 }) + const result = await classifyEnsemble({ + brain, + 
state: fakeState(), + url: 'https://example.com/app', + domHeuristics: emptyHeuristics({ paragraphCount: 10 }), + }) + expect(result.type).toBe('unknown') + expect(result.signalsAgreed).toBe(false) + expect(result.dissent).toBeDefined() + expect(result.dissent!.length).toBeGreaterThan(0) + }) + + it('dom heuristic alone with weak url root → still produces a result', async () => { + const brain = fakeBrain({ type: 'docs', confidence: 0.8 }) + const result = await classifyEnsemble({ + brain, + state: fakeState(), + url: 'https://example.com/', + domHeuristics: emptyHeuristics({ codeBlockCount: 5, paragraphCount: 10 }), + }) + // URL says marketing (root), DOM says docs. LLM tiebreaker decides. + expect(['docs', 'marketing']).toContain(result.type) + expect(result.signals.length).toBeGreaterThanOrEqual(2) + }) + + it('first-principles mode triggers when ensemble confidence < 0.6', async () => { + const brain = fakeBrain({ type: 'unknown', confidence: 0.2 }) + const result = await classifyEnsemble({ + brain, + state: fakeState(), + url: 'https://example.com/', + domHeuristics: emptyHeuristics(), + }) + expect(result.firstPrinciplesMode).toBe(true) + }) + + it('records every signal with rationale + source', async () => { + const brain = fakeBrain({ type: 'docs', confidence: 0.9 }) + const result = await classifyEnsemble({ + brain, + state: fakeState(), + url: 'https://example.com/docs', + domHeuristics: emptyHeuristics({ paragraphCount: 10 }), + }) + for (const sig of result.signals) { + expect(['url-pattern', 'dom-heuristic', 'llm']).toContain(sig.source) + expect(typeof sig.rationale).toBe('string') + expect(sig.rationale.length).toBeGreaterThan(0) + } + }) +}) + +describe('deriveHeuristics — Layer 1', () => { + it('extracts counts from a snapshot', () => { + const snap = ` + navigation: [Home, Docs, Pricing] + heading "Hello" + form + textbox + textbox + paragraph "lorem" + paragraph "ipsum" + contentinfo: [Privacy, Terms] + ` + const h = deriveHeuristics({ snapshot: 
snap } as PageState) + expect(h.formCount).toBeGreaterThanOrEqual(1) + expect(h.inputCount).toBeGreaterThanOrEqual(2) + expect(h.paragraphCount).toBeGreaterThanOrEqual(2) + expect(h.hasFooterLinks).toBe(true) + }) + + it('returns zeros for empty snapshot', () => { + const h = deriveHeuristics({ snapshot: '' } as PageState) + expect(h.formCount).toBe(0) + expect(h.paragraphCount).toBe(0) + expect(h.hasFooterLinks).toBe(false) + }) +}) + +describe('Ensemble internals — Layer 1', () => { + it('exposes URL_PATTERN_RULES table for inspection', () => { + expect(ENSEMBLE_INTERNALS.URL_PATTERN_RULES.length).toBeGreaterThanOrEqual(7) + expect(ENSEMBLE_INTERNALS.ENSEMBLE_AGREEMENT_THRESHOLD).toBe(0.7) + expect(ENSEMBLE_INTERNALS.LLM_FALLBACK_CONFIDENCE).toBe(0.5) + }) +}) diff --git a/tests/design-audit-ethics-check.test.ts b/tests/design-audit-ethics-check.test.ts new file mode 100644 index 0000000..667e9df --- /dev/null +++ b/tests/design-audit-ethics-check.test.ts @@ -0,0 +1,289 @@ +/** + * Layer 7 — ethics check tests. + * + * Each test exercises one detector kind end-to-end against a real fixture: + * - pattern-absent (medical, gdpr): regex over snapshot text + * - llm-classifier (kids, finance): stubbed Brain response + * - skip-ethics: pipeline-level bypass behavior + * + * The Brain stub is a minimal object that satisfies the call shape used by + * `runLlmClassifier`. We do NOT mock the entire Brain — we call the public + * shape (`brain.complete(system, user)`) and assert the prompt the real + * implementation would send. 
 + */ + +import { describe, it, expect, beforeEach } from 'vitest' +import * as fs from 'node:fs' +import * as path from 'node:path' +import { fileURLToPath } from 'node:url' +import { + loadEthicsRules, + clearEthicsRuleCache, +} from '../src/design/audit/ethics/loader.js' +import { + checkEthics, + pageTextBlob, + runLlmClassifier, + type EthicsCheckContext, +} from '../src/design/audit/ethics/check.js' +import type { Brain } from '../src/brain/index.js' +import type { PageClassification } from '../src/design/audit/v2/types.js' + +const __dirname = path.dirname(fileURLToPath(import.meta.url)) +const RULES_DIR = path.resolve(__dirname, '../src/design/audit/ethics/rules') +const FIXTURES_DIR = path.resolve(__dirname, '../bench/design/ethics-fixtures') + +beforeEach(() => clearEthicsRuleCache()) + +function readFixture(name: string): string { + return fs.readFileSync(path.join(FIXTURES_DIR, name), 'utf-8') +} + +function classification(over: Partial<PageClassification> = {}): PageClassification { + return { + type: 'saas-app', + domain: 'general', + framework: null, + designSystem: 'unknown', + maturity: 'shipped', + intent: '', + confidence: 0.9, + ...over, + } +} + +function ctxFor(html: string, over: Partial<EthicsCheckContext> = {}): EthicsCheckContext { + return { + pageText: pageTextBlob(html), + snapshot: html, + classification: over.classification ?? classification(), + ...over, + } +} + +/** Minimal Brain stub. Records every prompt; returns a scripted answer. 
*/ +function stubBrain(reply: (user: string) => string): Brain { + const calls: Array<{ system: string; user: string }> = [] + const fake = { + calls, + async complete(system: string, user: string) { + calls.push({ system, user }) + return { text: reply(user) } + }, + } + return fake as unknown as Brain +} + +describe('checkEthics — no rules fire', () => { + it('returns [] when classification matches no rule', async () => { + const rules = loadEthicsRules(RULES_DIR) + const violations = await checkEthics( + rules, + ctxFor('hello', { + classification: classification({ domain: 'devtools', type: 'docs' }), + }), + ) + expect(violations).toEqual([]) + }) + + it('passing medical fixture triggers no violations when dosage + warning present', async () => { + const rules = loadEthicsRules(RULES_DIR) + const html = readFixture('medical-with-dosage.html') + const violations = await checkEthics( + rules, + ctxFor(html, { + classification: classification({ domain: 'pharmacy' }), + // No audience/regulatoryContext set → kids+gdpr rules skip. + }), + ) + const dosage = violations.find(v => v.ruleId === 'medical:dosage-warning-required') + expect(dosage).toBeUndefined() + // medical:adverse-event-reporting-path: regex must hit MedWatch text. 
+ const adverse = violations.find(v => v.ruleId === 'medical:adverse-event-reporting-path') + expect(adverse).toBeUndefined() + }) + + it('passing gdpr fixture with consent banner clears the cookie + privacy rules', async () => { + const rules = loadEthicsRules(RULES_DIR) + const html = readFixture('gdpr-with-consent.html') + const violations = await checkEthics( + rules, + ctxFor(html, { regulatoryContext: ['gdpr'] }), + ) + expect(violations.find(v => v.ruleId === 'legal:gdpr-cookie-consent')).toBeUndefined() + expect(violations.find(v => v.ruleId === 'legal:privacy-policy-link-required')).toBeUndefined() + expect(violations.find(v => v.ruleId === 'legal:ccpa-do-not-sell-link')).toBeUndefined() + }) +}) + +describe('checkEthics — pattern-absent detectors fire on missing text', () => { + it('medical fixture missing dosage warnings → critical-floor violation, rollupCap=4', async () => { + const rules = loadEthicsRules(RULES_DIR) + const html = readFixture('medical-no-dosage.html') + const violations = await checkEthics( + rules, + ctxFor(html, { classification: classification({ domain: 'pharmacy' }) }), + ) + const v = violations.find(x => x.ruleId === 'medical:dosage-warning-required') + expect(v).toBeDefined() + expect(v!.severity).toBe('critical-floor') + expect(v!.rollupCap).toBe(4) + expect(v!.citation).toMatch(/FDA/i) + expect(v!.remediation.toLowerCase()).toContain('dosage') + }) + + it('gdpr fixture without consent banner → major-floor violation, rollupCap=6', async () => { + const rules = loadEthicsRules(RULES_DIR) + const html = readFixture('gdpr-no-consent.html') + const violations = await checkEthics( + rules, + ctxFor(html, { regulatoryContext: ['gdpr'] }), + ) + const v = violations.find(x => x.ruleId === 'legal:gdpr-cookie-consent') + expect(v).toBeDefined() + expect(v!.severity).toBe('major-floor') + expect(v!.rollupCap).toBe(6) + }) +}) + +describe('checkEthics — llm-classifier detectors', () => { + it('kids dark-pattern fixture + brain answers yes 
→ critical-floor violation', async () => { + const rules = loadEthicsRules(RULES_DIR) + const html = readFixture('kids-dark-pattern.html') + const brain = stubBrain((user) => { + // Sanity-check the auditor sends both the question and the snapshot. + expect(user).toContain('dark pattern') + expect(user).toContain('YES, give me coins') + return 'yes — fake urgency, confirmshaming, friction-asymmetric flow' + }) + const violations = await checkEthics( + rules, + ctxFor(html, { + classification: classification(), + audience: ['kids'], + audienceVulnerability: ['minor-facing'], + }), + { brain }, + ) + const v = violations.find(x => x.ruleId === 'kids:dark-patterns-prohibited') + expect(v).toBeDefined() + expect(v!.severity).toBe('critical-floor') + expect(v!.rollupCap).toBe(4) + }) + + it('llm-classifier rules are skipped (warned) when no brain is supplied', async () => { + const rules = loadEthicsRules(RULES_DIR) + const html = readFixture('kids-dark-pattern.html') + const warns: string[] = [] + const violations = await checkEthics( + rules, + ctxFor(html, { + classification: classification(), + audience: ['kids'], + regulatoryContext: ['coppa'], + }), + { warn: (m) => warns.push(m) }, + ) + expect(violations.find(v => v.ruleId === 'kids:dark-patterns-prohibited')).toBeUndefined() + expect(warns.some(w => w.includes('kids:dark-patterns-prohibited'))).toBe(true) + }) + + it('finance hidden-fees fixture + brain confirms hiding → critical-floor violation fires', async () => { + const rules = loadEthicsRules(RULES_DIR) + const html = readFixture('finance-hidden-fees.html') + // Rule polarity: yes = "fees ARE hidden" = violation. The fixture buries + // fees in 6px white-on-white text, so a real auditor would say yes. 
+ const brain = stubBrain(() => 'yes — fees are buried in microcopy below the pay button') + const violations = await checkEthics( + rules, + ctxFor(html, { + classification: classification({ type: 'ecommerce', domain: 'payments' }), + }), + { brain }, + ) + const v = violations.find(x => x.ruleId === 'finance:fees-disclosed-pre-commitment') + expect(v).toBeDefined() + expect(v!.severity).toBe('critical-floor') + expect(v!.rollupCap).toBe(4) + }) + + it('finance disclosed-fees fixture + brain says fees are visible → no fee-disclosure violation', async () => { + const rules = loadEthicsRules(RULES_DIR) + const html = readFixture('finance-disclosed-fees.html') + // Rule polarity: no = "fees NOT hidden" = compliant. + const brain = stubBrain(() => 'no — every fee, FX rate, and total is itemized above the pay button') + const violations = await checkEthics( + rules, + ctxFor(html, { + classification: classification({ type: 'ecommerce', domain: 'payments' }), + }), + { brain }, + ) + expect(violations.find(v => v.ruleId === 'finance:fees-disclosed-pre-commitment')).toBeUndefined() + }) + + it('runLlmClassifier truncates oversized snapshots before sending', async () => { + const huge = 'X'.repeat(20_000) + let captured = '' + const brain = stubBrain((user) => { + captured = user + return 'no' + }) + const out = await runLlmClassifier(brain, 'is this safe?', huge) + expect(out).toBe(false) + expect(captured).toContain('[truncated]') + expect(captured.length).toBeLessThan(huge.length) + }) + + it('runLlmClassifier returns false on empty / non-yes responses', async () => { + const brain = stubBrain(() => '') + expect(await runLlmClassifier(brain, 'q?', 'snap')).toBe(false) + const brain2 = stubBrain(() => 'unsure, maybe') + expect(await runLlmClassifier(brain2, 'q?', 'snap')).toBe(false) + }) +}) + +describe('rollup cap selection', () => { + it('takes the lowest cap when multiple rules fire', async () => { + const rules = loadEthicsRules(RULES_DIR) + const html = 
readFixture('gdpr-no-consent.html') + // GDPR fixture has neither consent (major-floor=6) nor a privacy policy + // link (major-floor=6). Both should fire; the cap is 6. + const violations = await checkEthics( + rules, + ctxFor(html, { regulatoryContext: ['gdpr'] }), + ) + expect(violations.length).toBeGreaterThanOrEqual(2) + const minCap = Math.min(...violations.map(v => v.rollupCap)) + expect(minCap).toBe(6) + }) + + it('mixing critical-floor with major-floor lowers the cap to 4', async () => { + const rules = loadEthicsRules(RULES_DIR) + const html = readFixture('medical-no-dosage.html') + const violations = await checkEthics( + rules, + ctxFor(html, { + classification: classification({ domain: 'pharmacy' }), + audienceVulnerability: ['patient-facing'], + }), + { brain: stubBrain(() => 'no') }, + ) + // Expect dosage (critical) + adverse-event (major) + patient-education (major) + const ruleIds = new Set(violations.map(v => v.ruleId)) + expect(ruleIds.has('medical:dosage-warning-required')).toBe(true) + const minCap = Math.min(...violations.map(v => v.rollupCap)) + expect(minCap).toBe(4) + }) +}) + +describe('skip-ethics bypass semantics', () => { + it('caller can short-circuit by passing zero rules', async () => { + const html = readFixture('medical-no-dosage.html') + const violations = await checkEthics( + [], + ctxFor(html, { classification: classification({ domain: 'pharmacy' }) }), + ) + expect(violations).toEqual([]) + }) +}) diff --git a/tests/design-audit-ethics-rules.test.ts b/tests/design-audit-ethics-rules.test.ts new file mode 100644 index 0000000..ab094be --- /dev/null +++ b/tests/design-audit-ethics-rules.test.ts @@ -0,0 +1,223 @@ +/** + * Layer 7 — ethics rule loader tests. 
+ * + * Asserts the four canonical YAML rule files (medical, kids, finance, legal) + * load without error, every rule's `appliesWhen` predicate is well-formed and + * matches the expected classification surface, and every rule has a passing + + * failing fixture pair under bench/design/ethics-fixtures/. + */ + +import { describe, it, expect, beforeEach } from 'vitest' +import * as fs from 'node:fs' +import * as path from 'node:path' +import { fileURLToPath } from 'node:url' +import { + loadEthicsRules, + clearEthicsRuleCache, + rollupCapFor, +} from '../src/design/audit/ethics/loader.js' +import { appliesWhenMatches, pageTextBlob } from '../src/design/audit/ethics/check.js' +import type { EthicsRule, PageClassification } from '../src/design/audit/v2/types.js' + +const __dirname = path.dirname(fileURLToPath(import.meta.url)) +const RULES_DIR = path.resolve(__dirname, '../src/design/audit/ethics/rules') +const FIXTURES_DIR = path.resolve(__dirname, '../bench/design/ethics-fixtures') + +function makeClassification(over: Partial = {}): PageClassification { + return { + type: 'saas-app', + domain: 'general', + framework: null, + designSystem: 'unknown', + maturity: 'shipped', + intent: 'unspecified', + confidence: 0.9, + ...over, + } +} + +beforeEach(() => clearEthicsRuleCache()) + +describe('ethics rule loader', () => { + it('loads all four rule files without error', () => { + const rules = loadEthicsRules(RULES_DIR) + expect(rules.length).toBeGreaterThanOrEqual(8) + const cats = new Set(rules.map(r => r.category)) + expect(cats).toEqual(new Set(['medical', 'kids', 'finance', 'legal'])) + }) + + it('every rule has the required structural fields', () => { + const rules = loadEthicsRules(RULES_DIR) + for (const r of rules) { + expect(r.ruleId).toMatch(/^[a-z]+:[a-z0-9-]+$/) + expect(['critical-floor', 'major-floor']).toContain(r.severity) + expect(['medical', 'kids', 'finance', 'legal']).toContain(r.category) + expect(r.remediation.length).toBeGreaterThan(10) + 
expect(r.detector).toBeDefined() + // Citation is optional but every shipped rule should carry one — ethics + // without a regulation reference is opinion, not policy. + expect(r.citation).toBeDefined() + } + }) + + it('rollupCapFor returns 4 for critical-floor and 6 for major-floor', () => { + expect(rollupCapFor('critical-floor')).toBe(4) + expect(rollupCapFor('major-floor')).toBe(6) + }) + + it('caches by directory — second call returns the same array', () => { + const a = loadEthicsRules(RULES_DIR) + const b = loadEthicsRules(RULES_DIR) + expect(a).toBe(b) + }) + + it('returns [] for a missing directory without throwing', () => { + const missing = path.join(__dirname, '__nonexistent_ethics_dir__') + expect(loadEthicsRules(missing)).toEqual([]) + }) +}) + +describe('appliesWhen predicates', () => { + const rules = loadEthicsRules(RULES_DIR) + const byId = new Map(rules.map(r => [r.ruleId, r])) + + it('medical:dosage-warning-required matches a pharmacy classification', () => { + const rule = byId.get('medical:dosage-warning-required')! + const ok = appliesWhenMatches(rule.appliesWhen, { + pageText: '', + snapshot: '', + classification: makeClassification({ domain: 'pharmacy' }), + }) + expect(ok).toBe(true) + }) + + it('medical:dosage-warning-required does NOT match a general saas page', () => { + const rule = byId.get('medical:dosage-warning-required')! + const ok = appliesWhenMatches(rule.appliesWhen, { + pageText: '', + snapshot: '', + classification: makeClassification({ domain: 'devtools' }), + }) + expect(ok).toBe(false) + }) + + it('kids:dark-patterns-prohibited matches when audience=[kids]', () => { + const rule = byId.get('kids:dark-patterns-prohibited')! 
+ const ctx = { + pageText: '', + snapshot: '', + classification: makeClassification(), + audience: ['kids'] as const, + } + expect(appliesWhenMatches(rule.appliesWhen, ctx as never)).toBe(true) + }) + + it('kids:age-gate-required requires both audience=kids AND minor-facing vulnerability', () => { + const rule = byId.get('kids:age-gate-required')! + expect( + appliesWhenMatches(rule.appliesWhen, { + pageText: '', + snapshot: '', + classification: makeClassification(), + audience: ['kids'], + } as never), + ).toBe(false) + expect( + appliesWhenMatches(rule.appliesWhen, { + pageText: '', + snapshot: '', + classification: makeClassification(), + audience: ['kids'], + audienceVulnerability: ['minor-facing'], + } as never), + ).toBe(true) + }) + + it('finance:fees-disclosed-pre-commitment matches ecommerce + fintech domain', () => { + const rule = byId.get('finance:fees-disclosed-pre-commitment')! + const ok = appliesWhenMatches(rule.appliesWhen, { + pageText: '', + snapshot: '', + classification: makeClassification({ type: 'ecommerce', domain: 'payments' }), + }) + expect(ok).toBe(true) + }) + + it('legal:gdpr-cookie-consent matches when regulatoryContext includes gdpr', () => { + const rule = byId.get('legal:gdpr-cookie-consent')! + expect( + appliesWhenMatches(rule.appliesWhen, { + pageText: '', + snapshot: '', + classification: makeClassification(), + regulatoryContext: ['gdpr'], + } as never), + ).toBe(true) + expect( + appliesWhenMatches(rule.appliesWhen, { + pageText: '', + snapshot: '', + classification: makeClassification(), + }), + ).toBe(false) + }) +}) + +describe('fixture pairs', () => { + // Map each rule (or rule cluster) to a passing + failing fixture. Every + // shipped rule MUST have ≥1 of each per the RFC success metrics. 
+ const pairs: Array<{ ruleId: string; passing: string; failing: string }> = [ + { + ruleId: 'medical:dosage-warning-required', + passing: 'medical-with-dosage.html', + failing: 'medical-no-dosage.html', + }, + { + ruleId: 'kids:age-gate-required', + passing: 'kids-age-gated.html', + failing: 'kids-dark-pattern.html', + }, + { + ruleId: 'finance:fees-disclosed-pre-commitment', + passing: 'finance-disclosed-fees.html', + failing: 'finance-hidden-fees.html', + }, + { + ruleId: 'legal:gdpr-cookie-consent', + passing: 'gdpr-with-consent.html', + failing: 'gdpr-no-consent.html', + }, + ] + + it.each(pairs)('rule $ruleId has fixture pair on disk', ({ passing, failing }) => { + expect(fs.existsSync(path.join(FIXTURES_DIR, passing))).toBe(true) + expect(fs.existsSync(path.join(FIXTURES_DIR, failing))).toBe(true) + }) + + it('pattern-absent rules detect their pattern in the passing fixture', () => { + const rules = loadEthicsRules(RULES_DIR) + const byId = new Map(rules.map(r => [r.ruleId, r])) + for (const { ruleId, passing } of pairs) { + const rule = byId.get(ruleId) as EthicsRule | undefined + if (!rule) throw new Error(`rule ${ruleId} not loaded`) + if (rule.detector.kind !== 'pattern-absent') continue + const html = fs.readFileSync(path.join(FIXTURES_DIR, passing), 'utf-8') + const re = new RegExp(rule.detector.pattern, 'i') + expect(re.test(html.toLowerCase())).toBe(true) + } + }) + + it('pattern-absent rules miss the pattern in the failing fixture', () => { + const rules = loadEthicsRules(RULES_DIR) + const byId = new Map(rules.map(r => [r.ruleId, r])) + for (const { ruleId, failing } of pairs) { + const rule = byId.get(ruleId) as EthicsRule | undefined + if (!rule) throw new Error(`rule ${ruleId} not loaded`) + if (rule.detector.kind !== 'pattern-absent') continue + const html = fs.readFileSync(path.join(FIXTURES_DIR, failing), 'utf-8') + const blob = pageTextBlob(html) + const re = new RegExp(rule.detector.pattern, 'i') + expect(re.test(blob)).toBe(false) + } + }) 
+}) diff --git a/tests/design-audit-first-principles.test.ts b/tests/design-audit-first-principles.test.ts new file mode 100644 index 0000000..d487527 --- /dev/null +++ b/tests/design-audit-first-principles.test.ts @@ -0,0 +1,93 @@ +import { describe, it, expect, afterEach } from 'vitest' +import { mkdtempSync, rmSync } from 'node:fs' +import { tmpdir } from 'node:os' +import { join } from 'node:path' +import { + shouldTriggerFirstPrinciples, + buildNovelPatternObservation, + appendNovelPatternObservation, +} from '../src/design/audit/first-principles-mode.js' +import type { EnsembleClassification } from '../src/design/audit/v2/types.js' +import { readFileSync, existsSync } from 'node:fs' + +function makeClassification(overrides: Partial = {}): EnsembleClassification { + return { + type: 'saas-app', + domain: '', + maturity: 'production', + designSystem: 'unknown', + signals: [], + signalsAgreed: true, + ensembleConfidence: 0.85, + firstPrinciplesMode: false, + ...overrides, + } +} + +describe('shouldTriggerFirstPrinciples', () => { + it('does not trigger on high-confidence agreed classification', () => { + expect(shouldTriggerFirstPrinciples(makeClassification())).toBe(false) + }) + + it('triggers when ensembleConfidence < 0.6', () => { + expect(shouldTriggerFirstPrinciples(makeClassification({ ensembleConfidence: 0.4 }))).toBe(true) + }) + + it('triggers when signals disagree', () => { + expect(shouldTriggerFirstPrinciples(makeClassification({ signalsAgreed: false }))).toBe(true) + }) + + it('triggers when type is unknown', () => { + expect(shouldTriggerFirstPrinciples(makeClassification({ type: 'unknown' as never }))).toBe(true) + }) + + it('triggers when firstPrinciplesMode flag is set', () => { + expect(shouldTriggerFirstPrinciples(makeClassification({ firstPrinciplesMode: true }))).toBe(true) + }) + + it('respects custom threshold', () => { + const cl = makeClassification({ ensembleConfidence: 0.72 }) + expect(shouldTriggerFirstPrinciples(cl, { 
confidenceThreshold: 0.8 })).toBe(true) + expect(shouldTriggerFirstPrinciples(cl, { confidenceThreshold: 0.7 })).toBe(false) + }) +}) + +describe('buildNovelPatternObservation', () => { + it('produces a stable observationId for the same pageRef within the same minute', () => { + const cl = makeClassification({ ensembleConfidence: 0.3, signalsAgreed: false }) + const obs1 = buildNovelPatternObservation({ classification: cl, pageRef: 'https://example.com' }) + const obs2 = buildNovelPatternObservation({ classification: cl, pageRef: 'https://example.com' }) + expect(obs1.observationId).toBe(obs2.observationId) + }) + + it('carries closestType and closestConfidence from the classification', () => { + const cl = makeClassification({ type: 'marketing', ensembleConfidence: 0.45 }) + const obs = buildNovelPatternObservation({ classification: cl, pageRef: 'https://test.com' }) + expect(obs.closestType).toBe('marketing') + expect(obs.closestConfidence).toBe(0.45) + }) +}) + +describe('appendNovelPatternObservation', () => { + let tmpDir: string + afterEach(() => { + if (tmpDir) rmSync(tmpDir, { recursive: true, force: true }) + }) + + it('writes a valid JSON line and round-trips', async () => { + tmpDir = mkdtempSync(join(tmpdir(), 'bad-fp-test-')) + const cl = makeClassification({ ensembleConfidence: 0.2 }) + const obs = buildNovelPatternObservation({ classification: cl, pageRef: 'https://example.com' }) + await appendNovelPatternObservation(obs, tmpDir) + + const date = obs.capturedAt.slice(0, 10) + const filePath = join(tmpDir, `${date}.jsonl`) + expect(existsSync(filePath)).toBe(true) + + const lines = readFileSync(filePath, 'utf-8').split('\n').filter(Boolean) + expect(lines).toHaveLength(1) + const parsed = JSON.parse(lines[0]) + expect(parsed.observationId).toBe(obs.observationId) + expect(parsed.pageRef).toBe('https://example.com') + }) +}) diff --git a/tests/design-audit-patch-parse.test.ts b/tests/design-audit-patch-parse.test.ts new file mode 100644 index 
0000000..1b7c59a --- /dev/null +++ b/tests/design-audit-patch-parse.test.ts @@ -0,0 +1,98 @@ +import { describe, it, expect } from 'vitest' +import { parsePatch, parsePatches } from '../src/design/audit/patches/parse.js' + +const validPatch = { + patchId: 'patch-001', + findingId: 'finding-001', + scope: 'component', + target: { scope: 'css', cssSelector: '.hero-cta' }, + diff: { before: 'background: blue', after: 'background: #2563eb' }, + testThatProves: { kind: 'rerun-audit', description: 'Re-run audit and verify visual_craft score improves.' }, + rollback: { kind: 'git-revert' }, + estimatedDelta: { dim: 'visual_craft', delta: 1 }, + estimatedDeltaConfidence: 'untested', +} + +describe('parsePatch', () => { + it('accepts a fully valid patch', () => { + const { patch, reason } = parsePatch(validPatch) + expect(patch).not.toBeNull() + expect(reason).toBeUndefined() + expect(patch!.patchId).toBe('patch-001') + }) + + it('accepts optional unifiedDiff', () => { + const { patch } = parsePatch({ ...validPatch, diff: { ...validPatch.diff, unifiedDiff: '--- a/f\n+++ b/f\n' } }) + expect(patch?.diff.unifiedDiff).toBeDefined() + }) + + it('rejects non-object input', () => { + const { patch, reason } = parsePatch('not an object') + expect(patch).toBeNull() + expect(reason).toMatch(/not an object/) + }) + + it('rejects missing patchId', () => { + const { patch, reason } = parsePatch({ ...validPatch, patchId: '' }) + expect(patch).toBeNull() + expect(reason).toMatch(/patchId/) + }) + + it('rejects missing findingId', () => { + const { patch, reason } = parsePatch({ ...validPatch, findingId: undefined }) + expect(patch).toBeNull() + expect(reason).toMatch(/findingId/) + }) + + it('rejects invalid scope', () => { + const { patch, reason } = parsePatch({ ...validPatch, scope: 'galaxy' }) + expect(patch).toBeNull() + expect(reason).toMatch(/scope/) + }) + + it('rejects invalid target.scope', () => { + const { patch, reason } = parsePatch({ ...validPatch, target: { scope: 
'cobol', cssSelector: '.x' } }) + expect(patch).toBeNull() + expect(reason).toMatch(/target.scope/) + }) + + it('rejects missing diff.before', () => { + const { patch, reason } = parsePatch({ ...validPatch, diff: { before: '', after: 'x' } }) + expect(patch).toBeNull() + expect(reason).toMatch(/diff.before/) + }) + + it('rejects invalid testThatProves.kind', () => { + const { patch, reason } = parsePatch({ ...validPatch, testThatProves: { kind: 'vibes', description: 'idk' } }) + expect(patch).toBeNull() + expect(reason).toMatch(/testThatProves.kind/) + }) + + it('rejects invalid rollback.kind', () => { + const { patch, reason } = parsePatch({ ...validPatch, rollback: { kind: 'prayer' } }) + expect(patch).toBeNull() + expect(reason).toMatch(/rollback.kind/) + }) + + it('rejects invalid estimatedDeltaConfidence', () => { + const { patch, reason } = parsePatch({ ...validPatch, estimatedDeltaConfidence: 'godlike' }) + expect(patch).toBeNull() + expect(reason).toMatch(/estimatedDeltaConfidence/) + }) +}) + +describe('parsePatches', () => { + it('parses an array of patches, dropping invalid entries', () => { + const raw = [validPatch, { patchId: '' }, validPatch] + const { patches, errors } = parsePatches(raw) + expect(patches).toHaveLength(2) + expect(errors).toHaveLength(1) + expect(errors[0].index).toBe(1) + }) + + it('returns error when input is not an array', () => { + const { patches, errors } = parsePatches('oops') + expect(patches).toHaveLength(0) + expect(errors[0].index).toBe(-1) + }) +}) diff --git a/tests/design-audit-patch-validate.test.ts b/tests/design-audit-patch-validate.test.ts new file mode 100644 index 0000000..187e0b7 --- /dev/null +++ b/tests/design-audit-patch-validate.test.ts @@ -0,0 +1,78 @@ +import { describe, it, expect } from 'vitest' +import { validatePatch, validatePatches } from '../src/design/audit/patches/validate.js' +import type { Patch } from '../src/design/audit/v2/types.js' + +const basePatch: Patch = { + patchId: 'p1', + findingId: 
'f1', + scope: 'component', + target: { scope: 'css', cssSelector: '.btn' }, + diff: { before: 'color: red', after: 'color: green' }, + testThatProves: { kind: 'rerun-audit', description: 'Score improves.' }, + rollback: { kind: 'git-revert' }, + estimatedDelta: { dim: 'visual_craft', delta: 1 }, + estimatedDeltaConfidence: 'untested', +} + +const snapshot = 'The page has: color: red and font-size: 14px' + +describe('validatePatch', () => { + it('passes when before is in snapshot and locator present', () => { + const result = validatePatch(basePatch, snapshot) + expect(result.valid).toBe(true) + expect(result.reasons).toHaveLength(0) + }) + + it('fails when before is not in snapshot', () => { + const result = validatePatch({ ...basePatch, diff: { before: 'color: purple', after: 'x' } }, snapshot) + expect(result.valid).toBe(false) + expect(result.reasons).toContain('before-not-in-snapshot') + }) + + it('fails when before is empty string', () => { + const result = validatePatch({ ...basePatch, diff: { before: '', after: 'x' } }, snapshot) + expect(result.valid).toBe(false) + expect(result.reasons).toContain('before-empty') + }) + + it('fails when target has no locator', () => { + const patch: Patch = { ...basePatch, target: { scope: 'css' } } + const result = validatePatch(patch, snapshot) + expect(result.valid).toBe(false) + expect(result.reasons).toContain('target-missing-locator') + }) + + it('fails when estimatedDelta.delta is out of range (> 3)', () => { + const result = validatePatch({ ...basePatch, estimatedDelta: { dim: 'visual_craft', delta: 5 } }, snapshot) + expect(result.valid).toBe(false) + expect(result.reasons).toContain('estimated-delta-out-of-range') + }) + + it('fails when estimatedDelta.delta is out of range (< -3)', () => { + const result = validatePatch({ ...basePatch, estimatedDelta: { dim: 'visual_craft', delta: -4 } }, snapshot) + expect(result.valid).toBe(false) + expect(result.reasons).toContain('estimated-delta-out-of-range') + }) + + 
it('accumulates multiple failures in one pass', () => { + const patch: Patch = { + ...basePatch, + target: { scope: 'css' }, + diff: { before: 'not present', after: 'x' }, + estimatedDelta: { dim: 'visual_craft', delta: 99 }, + } + const result = validatePatch(patch, snapshot) + expect(result.valid).toBe(false) + expect(result.reasons.length).toBeGreaterThanOrEqual(3) + }) +}) + +describe('validatePatches', () => { + it('partitions valid and invalid patches', () => { + const valid = basePatch + const invalid: Patch = { ...basePatch, diff: { before: 'not-here', after: 'x' } } + const result = validatePatches([valid, invalid], snapshot) + expect(result.valid).toHaveLength(1) + expect(result.invalid).toHaveLength(1) + }) +}) diff --git a/tests/design-audit-rollup.test.ts b/tests/design-audit-rollup.test.ts new file mode 100644 index 0000000..2316ad1 --- /dev/null +++ b/tests/design-audit-rollup.test.ts @@ -0,0 +1,252 @@ +import { describe, it, expect } from 'vitest' +import { + ROLLUP_WEIGHTS, + rollupWeightsFor, + rollupFormula, +} from '../src/design/audit/rubric/rollup-weights.js' +import { + computeRollup, + mergeDimensionScoresAcrossPasses, + parseAuditResponseV2, +} from '../src/design/audit/v2/score.js' +import { DIMENSIONS, type Dimension, type DimensionScore } from '../src/design/audit/v2/types.js' + +function dimScore(score: number, range: [number, number] = [score - 1, score + 1], conf: 'high' | 'medium' | 'low' = 'medium'): DimensionScore { + return { + score, + range: [Math.max(1, range[0]), Math.min(10, range[1])], + confidence: conf, + summary: '', + primaryFindings: [], + } +} + +function uniformScores(score: number, conf: 'high' | 'medium' | 'low' = 'medium'): Record { + const out: Partial> = {} + for (const dim of DIMENSIONS) out[dim] = dimScore(score, [Math.max(1, score - 1), Math.min(10, score + 1)], conf) + return out as Record +} + +describe('rollup weights — Layer 1', () => { + it('every page-type weight set sums to 1.0 within 1e-6', () => { + 
for (const [type, weights] of Object.entries(ROLLUP_WEIGHTS)) { + const sum = Object.values(weights).reduce((a, n) => a + n, 0) + expect(Math.abs(sum - 1)).toBeLessThan(1e-6) + // every dimension must be present + for (const dim of DIMENSIONS) { + expect(typeof weights[dim]).toBe('number') + } + } + }) + + it('exposes weights for every PageType plus default + unknown', () => { + const expected = ['marketing', 'saas-app', 'dashboard', 'docs', 'ecommerce', 'social', 'tool', 'blog', 'utility', 'unknown', 'default'] + for (const t of expected) { + expect(ROLLUP_WEIGHTS[t as keyof typeof ROLLUP_WEIGHTS]).toBeDefined() + } + }) + + it('saas-app weights emphasize product_intent + workflow over visual_craft', () => { + const w = ROLLUP_WEIGHTS['saas-app'] + expect(w.product_intent).toBeGreaterThan(w.visual_craft) + expect(w.workflow).toBeGreaterThan(w.visual_craft) + }) + + it('marketing weights emphasize visual_craft + content_ia + product_intent', () => { + const w = ROLLUP_WEIGHTS.marketing + expect(w.visual_craft).toBeGreaterThanOrEqual(0.25) + expect(w.content_ia).toBeGreaterThanOrEqual(0.2) + expect(w.product_intent).toBeGreaterThanOrEqual(0.2) + }) + + it('docs weights emphasize content_ia ≥ 0.4', () => { + expect(ROLLUP_WEIGHTS.docs.content_ia).toBeGreaterThanOrEqual(0.4) + }) + + it('ecommerce weights emphasize trust_clarity', () => { + expect(ROLLUP_WEIGHTS.ecommerce.trust_clarity).toBeGreaterThanOrEqual(0.3) + }) + + it('rollupWeightsFor falls back to default for unknown page type', () => { + const w = rollupWeightsFor(undefined) + const sum = Object.values(w).reduce((a, n) => a + n, 0) + expect(Math.abs(sum - 1)).toBeLessThan(1e-6) + }) + + it('rollupFormula renders a deterministic readable formula', () => { + const formula = rollupFormula('saas-app', ROLLUP_WEIGHTS['saas-app']) + expect(formula).toContain('saas-app:') + expect(formula).toContain('product_intent*0.35') + expect(formula).toContain('workflow*0.30') + }) +}) + +describe('computeRollup — Layer 1', 
() => { + it('uniform 7s on saas-app rolls up to exactly 7', () => { + const r = computeRollup(uniformScores(7), 'saas-app') + expect(r.score).toBeCloseTo(7, 6) + expect(r.range[0]).toBeCloseTo(6, 6) + expect(r.range[1]).toBeCloseTo(8, 6) + expect(r.confidence).toBe('medium') + expect(r.rule).toContain('saas-app') + }) + + it('saas-app rollup weights product_intent more heavily than docs', () => { + const scores: Record = { + product_intent: dimScore(9), + workflow: dimScore(5), + visual_craft: dimScore(5), + trust_clarity: dimScore(5), + content_ia: dimScore(5), + } + const saas = computeRollup(scores, 'saas-app') + const docs = computeRollup(scores, 'docs') + expect(saas.score).toBeGreaterThan(docs.score) + }) + + it('confidence is conservative: any low → low rollup', () => { + const scores = uniformScores(7, 'high') + scores.workflow = dimScore(7, [6, 8], 'low') + const r = computeRollup(scores, 'saas-app') + expect(r.confidence).toBe('low') + }) + + it('confidence is medium when no low + at least one medium', () => { + const scores = uniformScores(8, 'high') + scores.product_intent = dimScore(8, [7, 9], 'medium') + const r = computeRollup(scores, 'marketing') + expect(r.confidence).toBe('medium') + }) + + it('confidence is high when every dim is high', () => { + const r = computeRollup(uniformScores(9, 'high'), 'saas-app') + expect(r.confidence).toBe('high') + }) + + it('weighted-mean math: linear scoring 4 vs 9 with saas-app weights', () => { + const scores: Record = { + product_intent: dimScore(4), + workflow: dimScore(4), + visual_craft: dimScore(9), + trust_clarity: dimScore(9), + content_ia: dimScore(9), + } + const r = computeRollup(scores, 'saas-app') + // saas-app: 0.35*4 + 0.30*4 + 0.15*9 + 0.10*9 + 0.10*9 = 1.4 + 1.2 + 1.35 + 0.9 + 0.9 = 5.75 + expect(r.score).toBeCloseTo(5.75, 1) + }) +}) + +describe('mergeDimensionScoresAcrossPasses — Layer 1', () => { + it('returns identity for a single pass', () => { + const s = uniformScores(7) + const merged = 
mergeDimensionScoresAcrossPasses([s]) + expect(merged.product_intent.score).toBe(7) + }) + + it('averages scores across multiple passes', () => { + const s1 = uniformScores(6) + const s2 = uniformScores(8) + const merged = mergeDimensionScoresAcrossPasses([s1, s2]) + expect(merged.product_intent.score).toBe(7) + }) + + it('takes the floor confidence across passes', () => { + const s1 = uniformScores(7, 'high') + const s2 = uniformScores(7, 'low') + const merged = mergeDimensionScoresAcrossPasses([s1, s2]) + expect(merged.product_intent.confidence).toBe('low') + }) + + it('throws on empty input', () => { + expect(() => mergeDimensionScoresAcrossPasses([])).toThrow(/empty/) + }) +}) + +describe('parseAuditResponseV2 — Layer 1', () => { + const validRaw = JSON.stringify({ + scores: { + product_intent: { score: 6, range: [5, 7], confidence: 'medium', summary: 'ok', primaryFindings: [] }, + visual_craft: { score: 7, range: [6, 8], confidence: 'high', summary: 'ok', primaryFindings: [] }, + trust_clarity: { score: 5, range: [4, 6], confidence: 'medium', summary: 'ok', primaryFindings: [] }, + workflow: { score: 6, range: [5, 7], confidence: 'medium', summary: 'ok', primaryFindings: [] }, + content_ia: { score: 7, range: [6, 8], confidence: 'high', summary: 'ok', primaryFindings: [] }, + }, + summary: 'overall', + strengths: ['a', 'b'], + }) + + it('parses a well-formed v2 response with every dimension', () => { + const out = parseAuditResponseV2(validRaw) + expect(out.scores.product_intent.score).toBe(6) + expect(out.scores.visual_craft.confidence).toBe('high') + expect(out.summary).toBe('overall') + expect(out.strengths).toEqual(['a', 'b']) + }) + + it('parses fenced JSON', () => { + const fenced = '```json\n' + validRaw + '\n```' + const out = parseAuditResponseV2(fenced) + expect(out.scores.product_intent.score).toBe(6) + }) + + it('rejects scores outside [range[0], range[1]]', () => { + const bad = JSON.stringify({ + scores: { + product_intent: { score: 3, range: [5, 
7], confidence: 'medium', summary: '', primaryFindings: [] }, + visual_craft: { score: 7, range: [6, 8], confidence: 'high', summary: '', primaryFindings: [] }, + trust_clarity: { score: 5, range: [4, 6], confidence: 'medium', summary: '', primaryFindings: [] }, + workflow: { score: 6, range: [5, 7], confidence: 'medium', summary: '', primaryFindings: [] }, + content_ia: { score: 7, range: [6, 8], confidence: 'high', summary: '', primaryFindings: [] }, + }, + }) + expect(() => parseAuditResponseV2(bad)).toThrow(/outside range/) + }) + + it('rejects scores outside 1..10', () => { + const bad = JSON.stringify({ + scores: { + product_intent: { score: 11, range: [10, 12], confidence: 'medium', summary: '', primaryFindings: [] }, + visual_craft: { score: 7, range: [6, 8], confidence: 'high', summary: '', primaryFindings: [] }, + trust_clarity: { score: 5, range: [4, 6], confidence: 'medium', summary: '', primaryFindings: [] }, + workflow: { score: 6, range: [5, 7], confidence: 'medium', summary: '', primaryFindings: [] }, + content_ia: { score: 7, range: [6, 8], confidence: 'high', summary: '', primaryFindings: [] }, + }, + }) + expect(() => parseAuditResponseV2(bad)).toThrow(/outside 1..10/) + }) + + it('rejects inverted ranges', () => { + const bad = JSON.stringify({ + scores: { + product_intent: { score: 6, range: [7, 5], confidence: 'medium', summary: '', primaryFindings: [] }, + visual_craft: { score: 7, range: [6, 8], confidence: 'high', summary: '', primaryFindings: [] }, + trust_clarity: { score: 5, range: [4, 6], confidence: 'medium', summary: '', primaryFindings: [] }, + workflow: { score: 6, range: [5, 7], confidence: 'medium', summary: '', primaryFindings: [] }, + content_ia: { score: 7, range: [6, 8], confidence: 'high', summary: '', primaryFindings: [] }, + }, + }) + expect(() => parseAuditResponseV2(bad)).toThrow(/inverted/) + }) + + it('throws when a dimension is missing', () => { + const bad = JSON.stringify({ + scores: { + product_intent: { score: 6, 
range: [5, 7], confidence: 'medium', summary: '', primaryFindings: [] }, + // missing visual_craft + trust_clarity: { score: 5, range: [4, 6], confidence: 'medium', summary: '', primaryFindings: [] }, + workflow: { score: 6, range: [5, 7], confidence: 'medium', summary: '', primaryFindings: [] }, + content_ia: { score: 7, range: [6, 8], confidence: 'high', summary: '', primaryFindings: [] }, + }, + }) + expect(() => parseAuditResponseV2(bad)).toThrow(/visual_craft missing/) + }) + + it('throws on missing scores object', () => { + expect(() => parseAuditResponseV2('{"summary":"x"}')).toThrow(/missing scores/) + }) + + it('throws on no JSON object at all', () => { + expect(() => parseAuditResponseV2('not json')).toThrow(/no JSON object/) + }) +}) diff --git a/tests/design-audit-v2-result.test.ts b/tests/design-audit-v2-result.test.ts new file mode 100644 index 0000000..96052e4 --- /dev/null +++ b/tests/design-audit-v2-result.test.ts @@ -0,0 +1,267 @@ +import { describe, it, expect } from 'vitest' +import { buildAuditResultV2 } from '../src/design/audit/v2/build-result.js' +import type { Brain } from '../src/brain/index.js' +import type { PageState } from '../src/types.js' +import type { + PageAuditResult, + ComposedRubric, + MeasurementBundle, +} from '../src/design/audit/types.js' +import type { + AuditResult_v2, + Dimension, + DimensionScore, + EnsembleClassification, +} from '../src/design/audit/v2/types.js' +import { DIMENSIONS } from '../src/design/audit/v2/types.js' + +function fakeMeasurements(): MeasurementBundle { + return { + contrast: { + totalChecked: 50, + aaFailures: [], + aaaFailures: [], + summary: { aaPassRate: 1, aaaPassRate: 1 }, + }, + a11y: { + ran: true, + violations: [], + passes: 30, + }, + hasBlockingIssues: false, + } +} + +function fakeRubric(): ComposedRubric { + return { + fragments: [], + body: 'TEST RUBRIC BODY', + calibration: 'Score honestly.', + dimensions: [], + } +} + +function fakeEnsemble(type: 'saas-app' | 'marketing' = 
'saas-app'): EnsembleClassification { + return { + type, + domain: 'unknown', + framework: null, + designSystem: 'unknown', + maturity: 'shipped', + intent: 'test page', + confidence: 0.8, + signals: [ + { source: 'url-pattern', type, confidence: 0.7, rationale: 'fixture' }, + { source: 'dom-heuristic', type, confidence: 0.7, rationale: 'fixture' }, + ], + signalsAgreed: true, + ensembleConfidence: 0.8, + firstPrinciplesMode: false, + } +} + +function fakeV1(score = 7): PageAuditResult { + return { + url: 'https://example.com/app', + score, + summary: 'fake v1 summary', + strengths: ['a'], + findings: [ + { + category: 'ux', + severity: 'major', + description: 'No primary action', + location: 'main', + suggestion: 'Add a primary CTA', + impact: 8, + effort: 3, + blast: 'page', + }, + { + category: 'spacing', + severity: 'minor', + description: 'inconsistent padding', + location: 'cards', + suggestion: 'use 8px grid', + impact: 4, + effort: 1, + blast: 'component', + }, + ], + } +} + +function uniformScores(score: number, conf: 'high' | 'medium' | 'low' = 'medium'): Record { + const out: Partial> = {} + for (const dim of DIMENSIONS) { + out[dim] = { + score, + range: [Math.max(1, score - 1), Math.min(10, score + 1)], + confidence: conf, + summary: '', + primaryFindings: [], + } + } + return out as Record +} + +function fakeStateWithoutBrain(): { brain: Brain; state: PageState } { + // Brain that throws — buildAuditResultV2 should fall back to synthesized + // scores when given precomputedScores OR when the brain call fails. 
+ const brain = { + auditDesign: async () => { + throw new Error('no brain in tests') + }, + } as unknown as Brain + const state = { url: 'x', title: 'x', snapshot: '', screenshot: '' } as PageState + return { brain, state } +} + +describe('buildAuditResultV2 — Layer 1', () => { + it('produces a complete AuditResult_v2 with every required field (precomputed path)', async () => { + const { brain, state } = fakeStateWithoutBrain() + const v2: AuditResult_v2 = await buildAuditResultV2({ + brain, + state, + pageRef: 'https://example.com/app', + ensemble: fakeEnsemble('saas-app'), + rubric: fakeRubric(), + measurements: fakeMeasurements(), + v1Result: fakeV1(7), + precomputedScores: uniformScores(8, 'high'), + }) + + expect(v2.schemaVersion).toBe(2) + expect(typeof v2.runId).toBe('string') + expect(v2.pageRef).toBe('https://example.com/app') + expect(v2.classification.type).toBe('saas-app') + expect(v2.classification.signalsAgreed).toBe(true) + + for (const dim of DIMENSIONS) { + expect(v2.scores[dim]).toBeDefined() + expect(v2.scores[dim].score).toBe(8) + expect(v2.scores[dim].range[0]).toBeLessThanOrEqual(v2.scores[dim].score) + expect(v2.scores[dim].range[1]).toBeGreaterThanOrEqual(v2.scores[dim].score) + } + + expect(v2.rollup.score).toBeCloseTo(8, 1) + expect(v2.rollup.confidence).toBe('high') + expect(v2.rollup.rule).toContain('saas-app') + + expect(Array.isArray(v2.findings)).toBe(true) + expect(v2.findings.length).toBeGreaterThan(0) + for (const f of v2.findings) { + expect(typeof f.id).toBe('string') + expect(f.id.length).toBeGreaterThan(0) + expect(['product_intent', 'visual_craft', 'trust_clarity', 'workflow', 'content_ia']).toContain(f.dimension) + expect(['polish', 'job', 'measurement']).toContain(f.kind) + expect(Array.isArray(f.patches)).toBe(true) + } + + expect(Array.isArray(v2.topFixes)).toBe(true) + expect(v2.topFixes.length).toBeLessThanOrEqual(5) + for (const fixId of v2.topFixes) { + expect(v2.findings.some((f) => f.id === fixId)).toBe(true) + } + 
+ expect(Array.isArray(v2.ethicsViolations)).toBe(true) + expect(Array.isArray(v2.matchedPatterns)).toBe(true) + expect(v2.modality).toBe('html') + expect(typeof v2.evaluatedAt).toBe('string') + expect(typeof v2.promptHash).toBe('string') + expect(typeof v2.rubricHash).toBe('string') + expect(Array.isArray(v2.passes)).toBe(true) + }) + + it('rollup score reflects per-page-type weights (saas-app vs marketing)', async () => { + const { brain, state } = fakeStateWithoutBrain() + const scores = uniformScores(7, 'high') + // tilt one dimension low + scores.product_intent = { score: 3, range: [2, 4], confidence: 'high', summary: '', primaryFindings: [] } + + const saas = await buildAuditResultV2({ + brain, + state, + pageRef: 'x', + ensemble: fakeEnsemble('saas-app'), + rubric: fakeRubric(), + measurements: fakeMeasurements(), + v1Result: fakeV1(), + precomputedScores: scores, + }) + + const marketing = await buildAuditResultV2({ + brain, + state, + pageRef: 'x', + ensemble: fakeEnsemble('marketing'), + rubric: fakeRubric(), + measurements: fakeMeasurements(), + v1Result: fakeV1(), + precomputedScores: scores, + }) + + // saas-app weights product_intent at 0.35 vs marketing 0.30 — saas penalized more. + expect(saas.rollup.score).toBeLessThan(marketing.rollup.score) + }) + + it('falls back to synthesized scores when LLM call fails', async () => { + const { brain, state } = fakeStateWithoutBrain() + const v2 = await buildAuditResultV2({ + brain, + state, + pageRef: 'x', + ensemble: fakeEnsemble('saas-app'), + rubric: fakeRubric(), + measurements: fakeMeasurements(), + v1Result: fakeV1(6), + }) + // Synthesized fallback: every dim equals v1 score, confidence 'low'. 
+ expect(v2.scores.product_intent.score).toBe(6) + expect(v2.scores.product_intent.confidence).toBe('low') + expect(v2.rollup.confidence).toBe('low') + }) + + it('classification carries ensembleConfidence + signalsAgreed', async () => { + const { brain, state } = fakeStateWithoutBrain() + const ensemble: EnsembleClassification = { + ...fakeEnsemble('saas-app'), + ensembleConfidence: 0.42, + signalsAgreed: false, + dissent: [{ source: 'dom-heuristic', type: 'marketing' }], + } + const v2 = await buildAuditResultV2({ + brain, + state, + pageRef: 'x', + ensemble, + rubric: fakeRubric(), + measurements: fakeMeasurements(), + v1Result: fakeV1(), + precomputedScores: uniformScores(6), + }) + expect(v2.classification.ensembleConfidence).toBe(0.42) + expect(v2.classification.signalsAgreed).toBe(false) + expect(v2.classification.dissent?.length).toBe(1) + }) + + it('fixture-style assertion: low product_intent + saas-app → rollup ≤ 6', async () => { + const { brain, state } = fakeStateWithoutBrain() + const scores = uniformScores(5) + scores.product_intent = { score: 3, range: [2, 4], confidence: 'medium', summary: '', primaryFindings: [] } + scores.workflow = { score: 4, range: [3, 5], confidence: 'medium', summary: '', primaryFindings: [] } + + const v2 = await buildAuditResultV2({ + brain, + state, + pageRef: 'fixture://no-primary-action', + ensemble: fakeEnsemble('saas-app'), + rubric: fakeRubric(), + measurements: fakeMeasurements(), + v1Result: fakeV1(4), + precomputedScores: scores, + }) + expect(v2.scores.product_intent.score).toBeLessThanOrEqual(4) + expect(v2.rollup.score).toBeLessThanOrEqual(6) + }) +}) diff --git a/tests/telemetry-rollup-remote.test.ts b/tests/telemetry-rollup-remote.test.ts index d3add21..57a7f4a 100644 --- a/tests/telemetry-rollup-remote.test.ts +++ b/tests/telemetry-rollup-remote.test.ts @@ -9,6 +9,7 @@ process.env.BAD_TELEMETRY_ROLLUP_NO_AUTORUN = '1' const { buildRemoteUrl } = await import('../bench/telemetry/rollup.js') const ROLLUP_PATH = 
path.resolve(__dirname, '..', 'bench', 'telemetry', 'rollup.ts') +const TSX_BIN = path.resolve(__dirname, '..', 'node_modules', '.bin', 'tsx') describe('rollup --remote URL building', () => { it('appends repo, kind, since, until query params when set', () => { @@ -60,8 +61,8 @@ describe('rollup --remote env requirements', () => { delete env.BAD_TELEMETRY_ADMIN_BEARER delete env.BAD_TELEMETRY_ROLLUP_NO_AUTORUN const out = spawnSync( - process.execPath, - ['--experimental-strip-types', '--no-warnings', ROLLUP_PATH, '--remote'], + TSX_BIN, + [ROLLUP_PATH, '--remote'], { encoding: 'utf-8', env }, ) expect(out.status).toBe(2) @@ -73,8 +74,8 @@ describe('rollup --remote env requirements', () => { delete env.BAD_TELEMETRY_ADMIN_BEARER delete env.BAD_TELEMETRY_ROLLUP_NO_AUTORUN const out = spawnSync( - process.execPath, - ['--experimental-strip-types', '--no-warnings', ROLLUP_PATH, '--remote'], + TSX_BIN, + [ROLLUP_PATH, '--remote'], { encoding: 'utf-8', env }, ) expect(out.status).toBe(2)