From e1d6a3f41a2368559f20f50178c0edc240862c23 Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Sat, 25 Apr 2026 20:43:26 -0600 Subject: [PATCH 1/5] feat(design-audit/v2): Phase 0 type contract for the 8-layer architecture MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit src/design/audit/v2/types.ts — comprehensive TypeScript interfaces covering all 8 layers from the RFC (docs/rfc/design-audit-world-class.md): Layer 1 — DimensionScore, RollupScore, ClassifierSignal, EnsembleClassification, DomHeuristics Layer 2 — Patch, PatchTarget, PatchDiff, PatchTest, PatchRollback, DesignFinding (extended with id, dimension, patches, kind) Layer 3 — NovelPatternObservation Layer 4 — PatchApplication, PatchReliability Layer 5 — Pattern, PatternScaffold, PatternFleetEvidence, PatternQuery, PatternMatch Layer 6 — AppliesWhen extended (audience, modality, regulatoryContext, audienceVulnerability), tag enums Layer 7 — EthicsRule, EthicsDetector, EthicsViolation, EthicsCategory Layer 8 — Modality, ModalityInput, SurfaceMeasurements, SurfaceRecord, Evidence, ModalityAdapter Top-level — AuditResult_v2, AuditRuntimeHints Phase 0 is the stable contract that lets Wave 1 + Wave 2 implementation work proceed in parallel without diverging interfaces. Editing this file mid-build is a coordinated change; layers must update in lockstep. 
Invariants enforced: - Every score is DimensionScore with range + confidence - Every major/critical finding MUST have >=1 Patch - Every patch has both target (what changes) and testThatProves (how we verify) - Every classification carries explicit ensembleConfidence + signalsAgreed - Pattern, ethics, modality types compose via shared AppliesWhen --- src/design/audit/v2/types.ts | 510 +++++++++++++++++++++++++++++++++++ 1 file changed, 510 insertions(+) create mode 100644 src/design/audit/v2/types.ts diff --git a/src/design/audit/v2/types.ts b/src/design/audit/v2/types.ts new file mode 100644 index 0000000..60b8714 --- /dev/null +++ b/src/design/audit/v2/types.ts @@ -0,0 +1,510 @@ +/** + * Design audit v2 — type contract for the 8-layer architecture. + * + * RFC: docs/rfc/design-audit-world-class.md + * + * This file is the stable contract that every layer's implementation + * builds against. It exists to let parallel implementation work proceed + * without diverging interfaces. Editing this file mid-build is a coordinated + * change; layers must update in lockstep. + * + * Invariants enforced by this contract: + * - Every score is a `DimensionScore` with `range` + `confidence`. No bare numbers. + * - Every finding with `severity in ['major','critical']` MUST have ≥1 `Patch`. + * - Every patch has both `target` (what changes) and `testThatProves` (how we verify). + * - Every classification carries explicit `ensembleConfidence` and `signalsAgreed`. + * - Every audit run can write a `PatchApplication` event for post-hoc attribution. + * - Pattern, ethics, modality types compose cleanly via shared `AppliesWhen`. + */ + +import type { + PageClassification, + PageType, + Maturity, + DesignSystemTag, + AppliesWhen as AppliesWhenV1, + MeasurementBundle, + DesignFinding as DesignFindingV1, +} from '../types.js' + +// Re-export so consumers import only from v2/types.ts. 
+export type { PageClassification, PageType, Maturity, DesignSystemTag, MeasurementBundle } + +// ─── Layer 1 · Multi-dimensional scoring ──────────────────────────────────── + +/** + * The five universal dimensions. Every audit produces a DimensionScore for + * each. The rollup is computed from these via per-page-type weights. + */ +export type Dimension = + | 'product_intent' + | 'visual_craft' + | 'trust_clarity' + | 'workflow' + | 'content_ia' + +export const DIMENSIONS: readonly Dimension[] = [ + 'product_intent', + 'visual_craft', + 'trust_clarity', + 'workflow', + 'content_ia', +] as const + +export type ConfidenceLevel = 'high' | 'medium' | 'low' + +export interface DimensionScore { + /** 1-10 integer score on the dimension. */ + score: number + /** Self-reported uncertainty range. `range[0] <= score <= range[1]`. */ + range: [number, number] + /** Auditor's confidence in the score. */ + confidence: ConfidenceLevel + /** One-sentence assessment grounded in observable evidence. */ + summary: string + /** Stable ids of top findings driving this score. References `DesignFinding.id`. */ + primaryFindings: string[] +} + +export interface RollupScore { + /** Weighted aggregate of `Record`. 1-10 number, can be fractional. */ + score: number + /** Aggregate uncertainty range. */ + range: [number, number] + /** Aggregate confidence. Conservative — `low` if any dim is `low`. */ + confidence: ConfidenceLevel + /** Human-readable formula, e.g. "saas-app: product*0.35 + workflow*0.30 + ...". */ + rule: string + /** Per-dimension weight that produced this rollup. Must sum to 1.0 ± 1e-6. */ + weights: Record +} + +// ─── Ensemble classifier ──────────────────────────────────────────────────── + +export type ClassifierSource = 'url-pattern' | 'dom-heuristic' | 'llm' + +export interface ClassifierSignal { + source: ClassifierSource + type: PageType + /** 0..1, source-specific. */ + confidence: number + /** Why this signal voted this type. Logged for debugging. 
*/ + rationale: string +} + +export interface EnsembleClassification extends PageClassification { + /** Every signal that voted on this classification. */ + signals: ClassifierSignal[] + /** True if all signals agreed on `type`. */ + signalsAgreed: boolean + /** Aggregated 0..1 confidence after ensemble vote. */ + ensembleConfidence: number + /** Signals that disagreed with the final type, if any. */ + dissent?: { source: ClassifierSource; type: PageType }[] + /** True if Layer 3 (first-principles) mode was triggered. */ + firstPrinciplesMode: boolean +} + +/** + * DOM-derived signals used by the heuristic classifier. Captured once during + * the page-load phase, fed to the ensemble vote, and emitted into telemetry. + */ +export interface DomHeuristics { + formCount: number + inputCount: number + tableRowCount: number + chartCount: number + navItems: number + hasFooterLinks: boolean + hasHeroSection: boolean + hasSidebar: boolean + paragraphCount: number + codeBlockCount: number +} + +// ─── Layer 2 · Patch primitives ───────────────────────────────────────────── + +/** + * Where a patch applies. At least one of `cssSelector | filePath | componentName` + * MUST be set. The combination determines how an agent applies it. + */ +export interface PatchTarget { + /** Source file path when known via component scan. */ + filePath?: string + /** Component name when known (e.g. 'Sidebar', 'PrimaryButton'). */ + componentName?: string + /** CSS selector — fallback when filePath unknown. */ + cssSelector?: string + /** Patch scope. Determines applicability check. */ + scope: 'tsx' | 'jsx' | 'css' | 'tailwind' | 'module-css' | 'styled-component' | 'structural' | 'html' +} + +export interface PatchDiff { + /** + * Exact substring being replaced. Validators MUST verify `before` is a + * substring of the page snapshot or source file at apply time. If `before` + * is not found, the patch is rejected (no fuzzy apply). + */ + before: string + /** Replacement text. 
*/ + after: string + /** + * When `target.filePath` is known, the unified diff format an agent can + * pipe to `git apply`. Optional; `before`/`after` is the canonical form. + */ + unifiedDiff?: string +} + +export type PatchTestKind = + | 'storybook' + | 'a11y-rule' + | 'visual-snapshot' + | 'unit' + | 'rerun-audit' + | 'manual' + +export interface PatchTest { + kind: PatchTestKind + /** Human-readable description of what proves the patch worked. */ + description: string + /** Optional CLI command an agent can invoke to verify (e.g. `pnpm vitest `). */ + command?: string +} + +export type PatchRollbackKind = 'git-revert' | 'css-disable' | 'manual' + +export interface PatchRollback { + kind: PatchRollbackKind + /** Optional human-readable rollback instruction. */ + instruction?: string +} + +/** + * A `Patch` is the agent-actionable unit. Layer 2 mandates ≥1 patch on every + * major/critical finding. Findings without patches downgrade to minor. + */ +export interface Patch { + /** Stable id derived from finding hash + target. Same patch across tenants → same id. */ + patchId: string + /** The finding this patch fixes. */ + findingId: string + /** Patch scope — page/section/component/system, drives ROI weighting. */ + scope: 'page' | 'section' | 'component' | 'system' + target: PatchTarget + diff: PatchDiff + testThatProves: PatchTest + rollback: PatchRollback + /** The dimension the auditor predicts this patch will move + by how much. */ + estimatedDelta: { dim: Dimension; delta: number } + /** + * Confidence in `estimatedDelta`, calibrated against fleet outcomes (Layer 4). + * 'untested' means no fleet data yet; 'high' means N≥30 with replication ≥0.7. + */ + estimatedDeltaConfidence: ConfidenceLevel | 'untested' + /** + * If this patch matches a known fleet pattern (Layer 5), the matched pattern + * id. Surfaced by the auditor so agents prefer evidence-backed patches. 
+ */ + matchedPatternId?: string +} + +/** + * Updated `DesignFinding` shape — extends v1 with stable id, dimension link, + * mandatory patches for major/critical, optional pattern match. + */ +export interface DesignFinding extends DesignFindingV1 { + /** Stable id, used by `DimensionScore.primaryFindings`. */ + id: string + /** Which dimension this finding affects. */ + dimension: Dimension + /** Agent-actionable patches. Required (≥1) when severity is major or critical. */ + patches: Patch[] + /** + * Discriminator for finding kind. `polish` findings cap at impact 6; + * `job` findings can go to 10; `measurement` findings come from axe/contrast. + * Set this so ROI ranking auto-prioritizes job over polish. + */ + kind: 'polish' | 'job' | 'measurement' +} + +// ─── Layer 3 · First-principles fallback ──────────────────────────────────── + +/** + * Triggered when ensemble confidence is low or no fixture matches the page + * structure. Auditor scores against 5 universal principles and emits a + * novel-pattern record for fleet mining. + */ +export interface NovelPatternObservation { + observationId: string + capturedAt: string + /** What was distinctive about this page structurally. */ + observed: string + /** Closest existing classification, with low confidence. */ + closestType: PageType + closestConfidence: number + /** Page snapshot reference for later mining. */ + snapshotKey?: string + /** URL or fixture id. */ + pageRef: string +} + +// ─── Layer 4 · Outcome attribution ────────────────────────────────────────── + +/** + * One application of a patch. Emitted by the `bad design-audit ack-patch` + * subcommand or auto-detected by the `--evolve` loop. + */ +export interface PatchApplication { + applicationId: string + patchId: string + appliedAt: string + appliedBy: string // 'agent:claude-code' | 'agent:codex' | 'human' | 'css-injection' | ... + /** The audit run that proposed the patch. 
*/ + preAuditRunId: string + /** The audit run after the patch was applied. May be null until re-audit. */ + postAuditRunId?: string + /** Auditor's prediction at apply time. */ + predicted: { dim: Dimension; delta: number } + /** Measured delta after re-audit. Populated when postAuditRunId resolves. */ + observed?: { dim: Dimension; delta: number } + /** + * Agreement metric: 1.0 = perfect prediction, 0 = orthogonal, negative = wrong direction. + * `(observed.delta * predicted.delta) / max(|observed.delta|, |predicted.delta|, 1)` + */ + agreementScore?: number +} + +/** + * Aggregated reliability across all applications of a patch (joined on + * `patchHash = hash(diff.before, diff.after, scope)`). Surfaces in audit + * output as `Patch.estimatedDeltaConfidence` upgrade. + */ +export interface PatchReliability { + patchHash: string + applications: number + meanPredictedDelta: number + meanObservedDelta: number + /** % of applications where observed >= 0.5 * predicted. */ + replicationRate: number + recommendation: 'recommended' | 'neutral' | 'antipattern' + /** Distinct tenant count. Below 5 → 'untested' confidence. */ + sampleTenants: number +} + +// ─── Layer 5 · Pattern library ────────────────────────────────────────────── + +/** + * A curated known-good design pattern, mined from accumulated PatchApplication + * data once N≥30 across ≥5 distinct tenants with ≥0.7 replication. + */ +export interface Pattern { + patternId: string + /** Free-form category name, e.g. 'leaderboard', 'empty-state', 'pricing-table'. */ + category: string + classification: { type: PageType; tags: string[] } + scaffold: PatternScaffold + scores: { whenFollowed: Record } + fleetEvidence: PatternFleetEvidence + /** Fixture ids that exemplify this pattern. */ + fixtures: string[] +} + +export interface PatternScaffold { + description: string + referenceTsx?: string + referenceCss?: string + /** Concrete decisions that make the pattern work, e.g. 'criterion in header'. 
*/ + keyDecisions: string[] +} + +export interface PatternFleetEvidence { + applications: number + /** % where adopting this pattern delivered the predicted dim delta. */ + successRate: number + medianDimDelta: Record + /** Distinct tenants. ≥5 required for promotion to 'recommended'. */ + sampleTenants: number +} + +export interface PatternQuery { + category?: string + pageType?: PageType + /** "I'm scoring 4 on product_intent — show me patterns that lift it." */ + weakDimension?: Dimension + minApplications?: number + minSuccessRate?: number +} + +export interface PatternMatch { + pattern: Pattern + matchConfidence: number + expectedDelta: Record + /** How to adapt this pattern to the current page. */ + applicationGuidance: string +} + +// ─── Layer 6 · Composable predicates (extends AppliesWhen) ────────────────── + +export type AudienceTag = + | 'developer' + | 'clinician' + | 'analyst' + | 'consumer' + | 'admin' + | 'kids' + | 'enterprise-buyer' + | 'creator' + +export type ModalityTag = 'desktop' | 'tablet' | 'mobile' | 'tv' | 'kiosk' + +export type RegulatoryContextTag = 'hipaa' | 'gdpr' | 'sox' | 'pci-dss' | 'coppa' | 'wcag-aaa' + +export type AudienceVulnerabilityTag = + | 'patient-facing' + | 'minor-facing' + | 'high-stakes-financial' + | 'crisis-context' + +/** + * v2 predicate set. Extends v1 with audience/modality/regulatoryContext/ + * audienceVulnerability so a pediatric medical app on tablet for clinicians + * loads multiple fragments simultaneously. 
+ */ +export interface AppliesWhen extends AppliesWhenV1 { + audience?: AudienceTag[] + modality?: ModalityTag[] + regulatoryContext?: RegulatoryContextTag[] + audienceVulnerability?: AudienceVulnerabilityTag[] +} + +// ─── Layer 7 · Domain ethics gate ─────────────────────────────────────────── + +export type EthicsCategory = 'medical' | 'kids' | 'finance' | 'legal' | 'accessibility' | 'crisis' + +export type EthicsSeverity = 'critical-floor' | 'major-floor' + +export interface EthicsRule { + ruleId: string + category: EthicsCategory + severity: EthicsSeverity + appliesWhen: AppliesWhen + detector: EthicsDetector + remediation: string + /** Citation to regulation or standard, e.g. 'FDA 21 CFR 201.57'. */ + citation?: string +} + +export type EthicsDetector = + | { kind: 'pattern-absent'; pattern: string } + | { kind: 'pattern-present'; pattern: string } + | { kind: 'llm-classifier'; llmCheck: string } + +export interface EthicsViolation { + ruleId: string + detected: true + severity: EthicsSeverity + /** Rollup ceiling enforced by this violation. critical-floor → 4; major-floor → 6. */ + rollupCap: number + remediation: string + citation?: string +} + +// ─── Layer 8 · Modality adapters ──────────────────────────────────────────── + +export type Modality = 'html' | 'ios' | 'android' | 'terminal' | 'voice' + +export interface ModalityInput { + /** Modality-specific entry point — URL for HTML, app bundle for iOS, etc. */ + entryPoint: string + /** Optional flow specification when capturing multiple surfaces. */ + flow?: string[] +} + +/** + * Per-modality measurement bundle — analogous to the existing HTML + * MeasurementBundle (axe + contrast). Modality-specific implementations + * provide their own a11y/contrast equivalents. + */ +export interface SurfaceMeasurements { + modality: Modality + /** A11y violations — modality-specific shape. */ + a11y?: unknown + /** Contrast or readability check — modality-specific. 
*/ + contrast?: unknown + /** Modality-specific measurements (haptic, latency, etc.). */ + extra?: Record +} + +export interface SurfaceRecord { + /** URL for HTML; screen name for native; turn id for voice. */ + identifier: string + measurements: SurfaceMeasurements + snapshot: string + screenshot?: string +} + +export interface Evidence { + modality: Modality + surfaces: SurfaceRecord[] + /** Roll-up of per-surface measurements for backwards compat with v1 pipeline. */ + measurements: MeasurementBundle + /** Concatenated snapshot for LLM consumption. */ + snapshot: string + screenshot?: string +} + +export interface ModalityAdapter { + modality: Modality + capture(input: ModalityInput): Promise +} + +// ─── AuditResult v2 — the top-level output ────────────────────────────────── + +export interface AuditResult_v2 { + schemaVersion: 2 + /** Run id for telemetry / attribution correlation. */ + runId: string + /** Page reference (URL for HTML; bundle id for native; etc.). */ + pageRef: string + classification: EnsembleClassification + /** Per-dimension scores, ALWAYS all 5 dimensions. */ + scores: Record + rollup: RollupScore + /** Findings + patches. Includes deterministic measurements (axe, contrast). */ + findings: DesignFinding[] + /** Top-N findings ranked by ROI. References `findings[*].id`. */ + topFixes: string[] + measurements: MeasurementBundle + ethicsViolations: EthicsViolation[] + /** Patterns matched against the page (Layer 5). May be empty. */ + matchedPatterns: PatternMatch[] + /** When first-principles mode triggered (Layer 3). May be undefined. */ + novelPattern?: NovelPatternObservation + /** Modality (Layer 8). HTML for v1 compat. */ + modality: Modality + /** Provenance. */ + evaluatedAt: string + promptHash: string + rubricHash: string + /** LLM token usage across passes. */ + tokensUsed?: number + /** Ensemble of audit passes that ran (deep / max / single). 
*/ + passes: string[] + error?: string +} + +// ─── CLI / runtime hints ──────────────────────────────────────────────────── + +/** + * Operator-supplied hints. None override the classifier outright; they bias + * the ensemble toward a result. If a hint disagrees with the classifier's + * final type with high confidence, a warning surfaces. + */ +export interface AuditRuntimeHints { + rubricHint?: PageType + audience?: AudienceTag[] + modality?: ModalityTag[] + regulatoryContext?: RegulatoryContextTag[] + /** Tenant id for cross-tenant attribution + ethics rule overrides. */ + tenantId?: string +} From 14aa07806dfa0e44bf3d8db0b6a6b4b6c333f799 Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Sat, 25 Apr 2026 23:42:17 -0600 Subject: [PATCH 2/5] =?UTF-8?q?feat(design-audit):=208-layer=20architectur?= =?UTF-8?q?e=20=E2=80=94=20Layers=201-7=20fully=20shipped,=20Layer=208=20s?= =?UTF-8?q?caffold?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Full implementation of RFC-002: World-Class Design Audit. Primary consumer is coding agents (Claude Code, Codex, OpenCode, Pi); architecture is JSON-first, tool-callable, and self-explaining when uncertain. Layer 1 — Multi-dimensional scoring: ensemble classifier (URL + DOM heuristic + LLM tiebreaker), 5 universal dimensions, per-page-type rollup weights and calibration anchors, AuditResult_v2 shape. Layer 2 — Patch primitives: every major/critical finding ships patches[] with target, diff.before/after, testThatProves, rollback, estimatedDelta. Severity enforcement downgrades major/critical without valid patches to minor. Layer 3 — First-principles fallback: fires when ensembleConfidence < 0.6 or signals disagree; scores against 5 universal product principles only; emits NovelPatternObservation to ~/.bad/novel-patterns/. Layer 4 — Outcome attribution: append-only JSONL store, ack-patch + --post-patch close-the-loop, patchHash cross-tenant grouping, aggregatePatchReliability. 
Layer 5 — Pattern library (scaffold): types/store/mine/match + CLI query/show. Cold start until ~6 weeks fleet data; mine threshold N≥30, ≥5 tenants, rate≥0.7. Layer 6 — Composable predicates: AppliesWhen extended with audience/modality/ regulatoryContext/audienceVulnerability; 9 new rubric fragments; loader matches on context flags --audience/--modality/--regulatory/--audience-vulnerability. Layer 7 — Domain ethics gate: 4 rule files (medical/kids/finance/legal) with citation-backed rules; hard rollup floor critical→4, major→6; preEthicsScore preserved; --skip-ethics bypass (test-only, logged). Layer 8 — Modality adapters (scaffold): HTML adapter wraps existing Playwright pipeline; iOS/Android throw NotImplementedError; --modality dispatch. +40 new tests across patch-parse, patch-validate, first-principles, attribution. Total: 1393 passing. --- .../design-audit-8-layer-architecture.md | 56 ++++ .changeset/design-audit-layer-1-foundation.md | 19 ++ .../design-audit-layer-7-ethics-gate.md | 16 + .../finance-disclosed-fees.html | 24 ++ .../ethics-fixtures/finance-hidden-fees.html | 16 + .../ethics-fixtures/gdpr-no-consent.html | 16 + .../ethics-fixtures/gdpr-with-consent.html | 31 ++ .../ethics-fixtures/kids-age-gated.html | 19 ++ .../ethics-fixtures/kids-dark-pattern.html | 17 ++ .../ethics-fixtures/medical-no-dosage.html | 18 ++ .../ethics-fixtures/medical-with-dosage.html | 24 ++ scripts/copy-static-assets.mjs | 6 + skills/design-evolve/SKILL.md | 35 ++- src/cli-ack-patch.ts | 89 ++++++ src/cli-design-audit.ts | 120 +++++++- src/cli-patterns.ts | 66 ++++ src/cli.ts | 15 + src/design/audit/attribution/aggregate.ts | 117 +++++++ src/design/audit/attribution/store.ts | 138 +++++++++ src/design/audit/attribution/types.ts | 18 ++ src/design/audit/classify-ensemble.ts | 264 ++++++++++++++++ src/design/audit/ethics/check.ts | 183 +++++++++++ src/design/audit/ethics/loader.ts | 213 +++++++++++++ src/design/audit/ethics/rules/finance.yaml | 37 +++ 
src/design/audit/ethics/rules/kids.yaml | 37 +++ src/design/audit/ethics/rules/legal.yaml | 35 +++ src/design/audit/ethics/rules/medical.yaml | 36 +++ src/design/audit/evaluate.ts | 5 +- src/design/audit/first-principles-mode.ts | 98 ++++++ src/design/audit/modality/android.ts | 24 ++ src/design/audit/modality/html.ts | 56 ++++ src/design/audit/modality/index.ts | 19 ++ src/design/audit/modality/ios.ts | 25 ++ src/design/audit/modality/types.ts | 16 + src/design/audit/patches/index.ts | 7 + src/design/audit/patches/parse.ts | 165 ++++++++++ src/design/audit/patches/render.ts | 57 ++++ .../audit/patches/severity-enforcement.ts | 63 ++++ src/design/audit/patches/validate.ts | 74 +++++ src/design/audit/patterns/match.ts | 61 ++++ src/design/audit/patterns/mine.ts | 49 +++ src/design/audit/patterns/store.ts | 51 ++++ src/design/audit/patterns/types.ts | 53 ++++ src/design/audit/pipeline.ts | 109 ++++++- src/design/audit/rubric/anchor-loader.ts | 229 ++++++++++++++ src/design/audit/rubric/anchors/blog.yaml | 33 ++ .../audit/rubric/anchors/dashboard.yaml | 35 +++ src/design/audit/rubric/anchors/docs.yaml | 35 +++ .../audit/rubric/anchors/ecommerce.yaml | 33 ++ .../audit/rubric/anchors/marketing.yaml | 34 +++ src/design/audit/rubric/anchors/saas-app.yaml | 38 +++ src/design/audit/rubric/anchors/social.yaml | 33 ++ src/design/audit/rubric/anchors/tool.yaml | 33 ++ src/design/audit/rubric/anchors/utility.yaml | 33 ++ .../rubric/fragments/audience-clinician.md | 42 +++ .../rubric/fragments/audience-developer.md | 40 +++ .../audit/rubric/fragments/audience-kids.md | 35 +++ .../audience-vulnerability-minor-facing.md | 37 +++ .../rubric/fragments/first-principles.md | 65 ++++ .../audit/rubric/fragments/modality-mobile.md | 39 +++ .../audit/rubric/fragments/modality-tablet.md | 35 +++ .../rubric/fragments/regulatory-coppa.md | 32 ++ .../audit/rubric/fragments/regulatory-gdpr.md | 33 ++ .../rubric/fragments/regulatory-hipaa.md | 36 +++ src/design/audit/rubric/loader.ts | 97 
++++-- src/design/audit/rubric/rollup-weights.ts | 60 ++++ src/design/audit/types.ts | 28 ++ src/design/audit/v2/build-result.ts | 210 +++++++++++++ src/design/audit/v2/score.ts | 253 +++++++++++++++ src/design/audit/v2/types.ts | 2 + tests/design-audit-anchor-loader.test.ts | 141 +++++++++ tests/design-audit-attribution.test.ts | 150 +++++++++ tests/design-audit-ensemble.test.ts | 247 +++++++++++++++ tests/design-audit-ethics-check.test.ts | 289 ++++++++++++++++++ tests/design-audit-ethics-rules.test.ts | 223 ++++++++++++++ tests/design-audit-first-principles.test.ts | 93 ++++++ tests/design-audit-patch-parse.test.ts | 98 ++++++ tests/design-audit-patch-validate.test.ts | 78 +++++ tests/design-audit-rollup.test.ts | 252 +++++++++++++++ tests/design-audit-v2-result.test.ts | 267 ++++++++++++++++ 80 files changed, 5982 insertions(+), 33 deletions(-) create mode 100644 .changeset/design-audit-8-layer-architecture.md create mode 100644 .changeset/design-audit-layer-1-foundation.md create mode 100644 .changeset/design-audit-layer-7-ethics-gate.md create mode 100644 bench/design/ethics-fixtures/finance-disclosed-fees.html create mode 100644 bench/design/ethics-fixtures/finance-hidden-fees.html create mode 100644 bench/design/ethics-fixtures/gdpr-no-consent.html create mode 100644 bench/design/ethics-fixtures/gdpr-with-consent.html create mode 100644 bench/design/ethics-fixtures/kids-age-gated.html create mode 100644 bench/design/ethics-fixtures/kids-dark-pattern.html create mode 100644 bench/design/ethics-fixtures/medical-no-dosage.html create mode 100644 bench/design/ethics-fixtures/medical-with-dosage.html create mode 100644 src/cli-ack-patch.ts create mode 100644 src/cli-patterns.ts create mode 100644 src/design/audit/attribution/aggregate.ts create mode 100644 src/design/audit/attribution/store.ts create mode 100644 src/design/audit/attribution/types.ts create mode 100644 src/design/audit/classify-ensemble.ts create mode 100644 src/design/audit/ethics/check.ts 
create mode 100644 src/design/audit/ethics/loader.ts create mode 100644 src/design/audit/ethics/rules/finance.yaml create mode 100644 src/design/audit/ethics/rules/kids.yaml create mode 100644 src/design/audit/ethics/rules/legal.yaml create mode 100644 src/design/audit/ethics/rules/medical.yaml create mode 100644 src/design/audit/first-principles-mode.ts create mode 100644 src/design/audit/modality/android.ts create mode 100644 src/design/audit/modality/html.ts create mode 100644 src/design/audit/modality/index.ts create mode 100644 src/design/audit/modality/ios.ts create mode 100644 src/design/audit/modality/types.ts create mode 100644 src/design/audit/patches/index.ts create mode 100644 src/design/audit/patches/parse.ts create mode 100644 src/design/audit/patches/render.ts create mode 100644 src/design/audit/patches/severity-enforcement.ts create mode 100644 src/design/audit/patches/validate.ts create mode 100644 src/design/audit/patterns/match.ts create mode 100644 src/design/audit/patterns/mine.ts create mode 100644 src/design/audit/patterns/store.ts create mode 100644 src/design/audit/patterns/types.ts create mode 100644 src/design/audit/rubric/anchor-loader.ts create mode 100644 src/design/audit/rubric/anchors/blog.yaml create mode 100644 src/design/audit/rubric/anchors/dashboard.yaml create mode 100644 src/design/audit/rubric/anchors/docs.yaml create mode 100644 src/design/audit/rubric/anchors/ecommerce.yaml create mode 100644 src/design/audit/rubric/anchors/marketing.yaml create mode 100644 src/design/audit/rubric/anchors/saas-app.yaml create mode 100644 src/design/audit/rubric/anchors/social.yaml create mode 100644 src/design/audit/rubric/anchors/tool.yaml create mode 100644 src/design/audit/rubric/anchors/utility.yaml create mode 100644 src/design/audit/rubric/fragments/audience-clinician.md create mode 100644 src/design/audit/rubric/fragments/audience-developer.md create mode 100644 src/design/audit/rubric/fragments/audience-kids.md create mode 100644 
src/design/audit/rubric/fragments/audience-vulnerability-minor-facing.md create mode 100644 src/design/audit/rubric/fragments/first-principles.md create mode 100644 src/design/audit/rubric/fragments/modality-mobile.md create mode 100644 src/design/audit/rubric/fragments/modality-tablet.md create mode 100644 src/design/audit/rubric/fragments/regulatory-coppa.md create mode 100644 src/design/audit/rubric/fragments/regulatory-gdpr.md create mode 100644 src/design/audit/rubric/fragments/regulatory-hipaa.md create mode 100644 src/design/audit/rubric/rollup-weights.ts create mode 100644 src/design/audit/v2/build-result.ts create mode 100644 src/design/audit/v2/score.ts create mode 100644 tests/design-audit-anchor-loader.test.ts create mode 100644 tests/design-audit-attribution.test.ts create mode 100644 tests/design-audit-ensemble.test.ts create mode 100644 tests/design-audit-ethics-check.test.ts create mode 100644 tests/design-audit-ethics-rules.test.ts create mode 100644 tests/design-audit-first-principles.test.ts create mode 100644 tests/design-audit-patch-parse.test.ts create mode 100644 tests/design-audit-patch-validate.test.ts create mode 100644 tests/design-audit-rollup.test.ts create mode 100644 tests/design-audit-v2-result.test.ts diff --git a/.changeset/design-audit-8-layer-architecture.md b/.changeset/design-audit-8-layer-architecture.md new file mode 100644 index 0000000..56169a1 --- /dev/null +++ b/.changeset/design-audit-8-layer-architecture.md @@ -0,0 +1,56 @@ +--- +'@tangle-network/browser-agent-driver': minor +--- + +feat(design-audit): 8-layer architecture — Layers 1-7 fully shipped, Layer 8 scaffold + +Full implementation of RFC-002: World-Class Design Audit. Primary consumer is coding agents (Claude Code, Codex, OpenCode, Pi); the architecture is JSON-first, tool-callable, and self-explaining when uncertain. 
+ +**Layer 1 — Multi-dimensional scoring** _(shipped)_ +- Ensemble classifier (URL pattern + DOM heuristic + LLM tiebreaker) with `ensembleConfidence`, `signalsAgreed`, `dissent`. +- Five universal dimensions: `product_intent / visual_craft / trust_clarity / workflow / content_ia`. +- Per-page-type rollup weights (saas-app, marketing, dashboard, docs, ecommerce, social, tool, blog, utility). +- Per-page-type calibration anchors (`rubric/anchors/*.yaml`) so app surfaces aren't judged against marketing-site polish. +- `AuditResult_v2` emitted alongside v1 shape; v1 deprecated with one-release lag. + +**Layer 2 — Patch primitives** _(shipped)_ +- Every major/critical finding now ships `patches[]` with `target`, `diff.before`/`after`, `testThatProves`, `rollback`, `estimatedDelta`, and `estimatedDeltaConfidence`. +- `diff.before` is validated as a substring of the page snapshot at parse time — agents apply patches literally without re-authoring. +- Severity enforcement: findings without valid patches are downgraded from major/critical to minor. +- `patches/render.ts`: renders `unifiedDiff` from before/after when `target.filePath` is known (`git apply`-able). + +**Layer 3 — First-principles fallback** _(shipped)_ +- Fires when `ensembleConfidence < 0.6`, signals disagree, or page type is `unknown`. +- Scores against 5 universal product principles only (primary-job clarity, action obviousness, state preview, trust-before-commitment, recovery-from-failure). +- Sets `rollup.confidence = 'low'`; emits `NovelPatternObservation` to `~/.bad/novel-patterns/` for fleet mining. +- New rubric fragment `first-principles.md` carries the exact prompt that fires in this mode. + +**Layer 4 — Outcome attribution** _(shipped)_ +- `bad design-audit ack-patch --pre-run-id ` — records that an agent applied a patch. +- `bad design-audit --post-patch ` on re-audit — computes observed delta vs predicted, writes `agreementScore`. +- JSONL store at `~/.bad/attribution/applications/`. 
Append-only — outcomes are new events, not mutations. +- `aggregatePatchReliability()` cross-tenant rollup: groups by `patchHash = sha256(before+after+scope).slice(0,16)`. After N≥30 / ≥5 tenants / replicationRate≥0.7 → `recommendation: 'recommended'`. + +**Layer 5 — Pattern library** _(scaffold)_ +- `patterns/{store,mine,match}.ts` + `cli-patterns.ts` (`bad patterns query|show`). +- Cold-start: library is empty until ~6 weeks of attribution data accumulates. Mine threshold: N≥30, ≥5 tenants, replicationRate≥0.7. Mining impl is a TODO; the query API and types are stable. + +**Layer 6 — Composable predicates** _(shipped)_ +- `AppliesWhen` extended with `audience`, `modality`, `regulatoryContext`, `audienceVulnerability`. +- 9 new rubric fragments: `audience-{clinician,kids,developer}.md`, `regulatory-{hipaa,gdpr,coppa}.md`, `modality-{mobile,tablet}.md`, `audience-vulnerability-minor-facing.md`. +- Rubric loader matches new predicates when context provided via `--audience`, `--modality`, `--regulatory`, `--audience-vulnerability` CLI flags. + +**Layer 7 — Domain ethics gate** _(shipped)_ +- 4 rule files (medical, kids, finance, legal) with citation-backed rules (FDA 21 CFR 201.57, COPPA 16 CFR 312.5, TILA/Reg Z, GDPR). +- Hard rollup floor: `critical-floor → 4`, `major-floor → 6`. `preEthicsScore` preserves the LLM's uncapped score. +- `--skip-ethics` bypass (test-only, logged + warned), `--ethics-rules-dir` override. +- 8 paired pass/fail fixtures in `bench/design/ethics-fixtures/`. + +**Layer 8 — Modality adapters** _(scaffold)_ +- `modality/{types,html,ios,android,index}.ts`. HTML adapter wraps existing Playwright pipeline. iOS and Android throw `NotImplementedError` with clear message. `--modality html|ios|android` dispatches to the right adapter. + +**Skill contract updates:** +- `~/code/dotfiles/claude/skills/bad/SKILL.md`: patch consumption loop, Layer 3-8 contract, ack-patch / --post-patch close-the-loop, ethics floor priority rule. 
+- `skills/design-evolve/SKILL.md`: Phase 3 (apply fixes) now patch-first; Phase 4 includes attribution close-the-loop. + +**Tests:** +40 new tests across `design-audit-patch-{parse,validate}`, `design-audit-first-principles`, `design-audit-attribution`. Total: 1393 passing. diff --git a/.changeset/design-audit-layer-1-foundation.md b/.changeset/design-audit-layer-1-foundation.md new file mode 100644 index 0000000..9da7d94 --- /dev/null +++ b/.changeset/design-audit-layer-1-foundation.md @@ -0,0 +1,19 @@ +--- +'@tangle-network/browser-agent-driver': minor +--- + +feat(design-audit): Layer 1 — multi-dim scoring foundation + +Land the first layer of the world-class 8-layer design-audit architecture (RFC `docs/rfc/design-audit-world-class.md`). This release ships: + +- **Ensemble classifier** (`src/design/audit/classify-ensemble.ts`) — three-signal vote (URL pattern + DOM heuristic + LLM tiebreaker) with explicit `ensembleConfidence`, `signalsAgreed`, and `dissent` records. URL+DOM agreement above the 0.7 threshold skips the LLM call entirely. +- **Per-page-type rollup weights** (`src/design/audit/rubric/rollup-weights.ts`) — saas-app, marketing, dashboard, docs, ecommerce, social, tool, blog, utility, plus `default`/`unknown` fallbacks. Module-load invariant: every weight set sums to 1.0 ± 1e-6. +- **Per-page-type calibration anchors** (`src/design/audit/rubric/anchors/*.yaml`) — 9 anchor files referencing real product 9-10 examples (Linear's app, Figma, Notion, Stripe, MDN, Apple Store, Threads, Stratechery, Vercel deploys, etc.) so saas-app surfaces are no longer judged against marketing-site polish. +- **Multi-dim scoring** (`src/design/audit/v2/score.ts`) — five universal dimensions (product_intent / visual_craft / trust_clarity / workflow / content_ia) each with `score`, `range`, `confidence`. Rollup is a weighted aggregate with conservative confidence (any dim `low` → rollup `low`). 
+- **`AuditResult_v2`** — emitted alongside the v1 shape in `report.json` under a top-level `v2` block. One-release deprecation window before v1 is removed. +- **`--audit-passes auto`** — new default that runs the ensemble classifier first, then picks the focused pass bundle for that classification. +- **CLI summary** — per-page console output now prints the 5-dimension breakdown plus rollup formula. + +Backwards compat: all existing v1 fields (`score`, `findings`, `summary`, `strengths`, etc.) remain on `PageAuditResult` and `report.json`. Consumers should migrate to `report.v2.pages[].scores` over the next release. + +Skill update: `skills/bad/SKILL.md` documents the new JSON shape with an agent-side worked example for choosing which dimension to invest in based on `score × weight` leverage. diff --git a/.changeset/design-audit-layer-7-ethics-gate.md b/.changeset/design-audit-layer-7-ethics-gate.md new file mode 100644 index 0000000..f325348 --- /dev/null +++ b/.changeset/design-audit-layer-7-ethics-gate.md @@ -0,0 +1,16 @@ +--- +'@tangle-network/browser-agent-driver': minor +--- + +feat(design-audit): Layer 7 — domain ethics gate (+ Layer 6 composable predicates) + +Adds a hard score floor for pages that fail domain-specific ethics rules and the predicate vocabulary that lets those rules target the right audience/modality/regulatory context. RFC: `docs/rfc/design-audit-world-class.md`. + +- **Ethics rule set** (`src/design/audit/ethics/rules/{medical,kids,finance,legal}.yaml`) — curated, citation-backed rules covering medication dosage disclosure (FDA 21 CFR 201.57), kid-facing dark-pattern guards (COPPA, FTC Endorsement Guides), finance fee disclosure (TILA / Reg Z), and legal disclaimer presence. +- **Detector kinds** (`src/design/audit/ethics/check.ts`) — `pattern-absent`, `pattern-present`, `llm-classifier`. Pattern checks are case-insensitive against page text; the LLM classifier asks for a single yes/no token to keep latency + cost predictable. 
+- **Hard rollup floor** — a `critical-floor` violation caps the rollup at 4; `major-floor` caps at 6. `PageAuditResult.preEthicsScore` preserves the LLM's pre-cap score so reports can show "would have scored 8, capped at 4 — fix the dosage disclosure". +- **Composable predicates (Layer 6)** — extends `AppliesWhen` with `audience`, `modality`, `regulatoryContext`, and `audienceVulnerability`. A pediatric medical app on tablet for clinicians now matches the medical *and* kids rule sets simultaneously instead of forcing one classification. +- **CLI flags**: `--skip-ethics` (test-only bypass, audited + warned), `--ethics-rules-dir ` (override the builtin yaml), `--audience`, `--modality`, `--audience-vulnerability` (comma-separated tag lists threaded into rule matching). +- **Fixtures** (`bench/design/ethics-fixtures/`) — paired pass/fail HTML for each rule category, used by `tests/design-audit-ethics-{rules,check}.test.ts`. + +Backwards compat: rules ship empty by default for any classification not on the curated list, so existing audits see no change unless they opt in via `--audience`/`--modality` or land on a covered domain. `EthicsViolation` is exported from both `src/design/audit/types.ts` and `v2/types.ts`; `PageAuditResult.ethicsViolations` is optional. diff --git a/bench/design/ethics-fixtures/finance-disclosed-fees.html b/bench/design/ethics-fixtures/finance-disclosed-fees.html new file mode 100644 index 0000000..11edf4f --- /dev/null +++ b/bench/design/ethics-fixtures/finance-disclosed-fees.html @@ -0,0 +1,24 @@ + + + + + SwiftPay — review and confirm + + +
+

Review your payment

+
+
You send
$1,000.00 USD
+
Network fee
$2.50
+
FX rate
1 USD = 0.92 EUR
+
Conversion fee (1.5%)
$15.00
+
Tax (VAT)
$3.40
+
Total charged
$1,020.90 USD
+
Recipient gets
€905.30 EUR
+
APR (effective annual percentage rate, if held)
0.0%
+
+

All fees, taxes, and FX are disclosed above. By tapping Pay you authorize SwiftPay to charge $1,020.90.

+ +
+ + diff --git a/bench/design/ethics-fixtures/finance-hidden-fees.html b/bench/design/ethics-fixtures/finance-hidden-fees.html new file mode 100644 index 0000000..0389365 --- /dev/null +++ b/bench/design/ethics-fixtures/finance-hidden-fees.html @@ -0,0 +1,16 @@ + + + + + SwiftPay — Send money + + +
+

Send $1,000 to Alex

+

You send: $1,000.00

+

Recipient gets: $1,000.00

+ +

Network, FX, conversion, and processing fees may apply. See terms.

+
+ + diff --git a/bench/design/ethics-fixtures/gdpr-no-consent.html b/bench/design/ethics-fixtures/gdpr-no-consent.html new file mode 100644 index 0000000..a4b7517 --- /dev/null +++ b/bench/design/ethics-fixtures/gdpr-no-consent.html @@ -0,0 +1,16 @@ + + + + + EuroShop — Home + + +

EuroShop

+
+

Best deals in Berlin today

+

Trending products curated for the EU market.

+ +
+

EuroShop GmbH 2026

+ + diff --git a/bench/design/ethics-fixtures/gdpr-with-consent.html b/bench/design/ethics-fixtures/gdpr-with-consent.html new file mode 100644 index 0000000..09c0a01 --- /dev/null +++ b/bench/design/ethics-fixtures/gdpr-with-consent.html @@ -0,0 +1,31 @@ + + + + + EuroShop — Home + + +
+

We use cookies

+

We use necessary and optional cookies. Manage your preferences below.

+
+ Cookie consent preferences + + + +
+ + +
+

EuroShop

+
+

Best deals in Berlin today

+

Trending products curated for the EU market.

+ +
+ + + diff --git a/bench/design/ethics-fixtures/kids-age-gated.html b/bench/design/ethics-fixtures/kids-age-gated.html new file mode 100644 index 0000000..def4102 --- /dev/null +++ b/bench/design/ethics-fixtures/kids-age-gated.html @@ -0,0 +1,19 @@ + + + + + SuperKids — verify your age + + +
+

Welcome to SuperKids

+
+ + +

Enter your age so we can keep things appropriate. Parents will be notified.

+ +
+

We only collect what is strictly necessary for the activity. Optional fields are clearly marked.

+
+ + diff --git a/bench/design/ethics-fixtures/kids-dark-pattern.html b/bench/design/ethics-fixtures/kids-dark-pattern.html new file mode 100644 index 0000000..3a1fd44 --- /dev/null +++ b/bench/design/ethics-fixtures/kids-dark-pattern.html @@ -0,0 +1,17 @@ + + + + + SuperKids Coins! + + +
+

Hi friend! Get 100 SuperCoins NOW!

+

HURRY! Only 3 left!

+

Tap YES or your unicorn will be sad forever 🥺

+ + no thanks, I hate fun +

Cost: $4.99 charged to mom's card. Auto-renews monthly. Cancel by writing a letter.

+
+ + diff --git a/bench/design/ethics-fixtures/medical-no-dosage.html b/bench/design/ethics-fixtures/medical-no-dosage.html new file mode 100644 index 0000000..6965960 --- /dev/null +++ b/bench/design/ethics-fixtures/medical-no-dosage.html @@ -0,0 +1,18 @@ + + + + + RxMed — Order amoxicillin + + +

RxMed Pharmacy

+
+

Amoxicillin 500mg

+

Take this medication as your doctor recommends.

+

Available in 30-tablet packs.

+ + +
+

RxMed © 2026

+ + diff --git a/bench/design/ethics-fixtures/medical-with-dosage.html b/bench/design/ethics-fixtures/medical-with-dosage.html new file mode 100644 index 0000000..1504c73 --- /dev/null +++ b/bench/design/ethics-fixtures/medical-with-dosage.html @@ -0,0 +1,24 @@ + + + + + RxMed — Order amoxicillin (Rx) + + +

RxMed Pharmacy

+
+

Amoxicillin 500mg — Prescribing Information

+
+

Dosage and administration

+

Adults: 500 mg orally every 8 hours. Adjust dosage for renal impairment.

+
+
+

Warnings and contraindications

+

Contraindication: hypersensitivity to penicillin.

+

Adverse effects: nausea, diarrhea, rare anaphylaxis. Report any side effect to MedWatch (FDA 1088).

+
+ +

Report a side effect (MedWatch).

+
+ + diff --git a/scripts/copy-static-assets.mjs b/scripts/copy-static-assets.mjs index d897166..ed050f2 100644 --- a/scripts/copy-static-assets.mjs +++ b/scripts/copy-static-assets.mjs @@ -22,6 +22,12 @@ const COPIES = [ dest: 'dist/design/audit/rubric/fragments', pattern: /\.md$/, }, + { + label: 'rubric anchor(s)', + src: 'src/design/audit/rubric/anchors', + dest: 'dist/design/audit/rubric/anchors', + pattern: /\.ya?ml$/, + }, { label: 'viewer asset(s)', src: 'src/viewer', diff --git a/skills/design-evolve/SKILL.md b/skills/design-evolve/SKILL.md index 270d299..23fe2da 100644 --- a/skills/design-evolve/SKILL.md +++ b/skills/design-evolve/SKILL.md @@ -65,7 +65,33 @@ Batch related fixes: all spacing in one pass, all color in another. ## Phase 3: Apply Fixes to Source Code -Match the project's styling approach. Fix the **design system** (shared components, tokens, globals), not individual instances. +**Preferred (v2 — patch-based):** If the audit output has `findings[*].patches[]`, apply mechanically rather than authoring from scratch: + +```ts +// Iterate topFixes in order +for (const findingId of page.topFixes) { + const finding = page.findings.find(f => f.id === findingId) + if (!finding.patches?.length) continue // Layer 2 not yet active for this finding + + const patch = finding.patches[0] + // Option A: file path known → apply unified diff + if (patch.target.filePath && patch.diff.unifiedDiff) { + // write unifiedDiff to a temp file and: git apply + } + // Option B: CSS selector → search-replace + // find patch.diff.before in relevant file, replace with patch.diff.after + + // Verify + if (patch.testThatProves.command) { + // run patch.testThatProves.command + } + + // Close the loop — record attribution + // bad design-audit ack-patch --pre-run-id +} +``` + +**Fallback (prose-to-code):** When `patches[]` is empty, match the project's styling approach and fix the **design system** (shared components, tokens, globals), not individual instances. 
**Tailwind:** ```tsx @@ -100,17 +126,22 @@ Rules: - Fix the design system, not individual instances - Only change visual properties — never touch event handlers, state, or business logic -## Phase 4: Re-Audit +## Phase 4: Re-Audit + Attribution ```bash +# If you used patch-based flow in Phase 3, record attribution before re-auditing: +# bad design-audit ack-patch --pre-run-id + node dist/cli.js design-audit \ --url \ --profile \ --pages \ --json --headless +# (add --post-patch if you ack'd above) ``` Compare: did score improve? Are original critical/major findings resolved? Any new findings introduced? +Check `ethicsViolations` — if any are present the rollup is capped; scores won't improve past the cap until violations are remediated. ## Phase 5: Iterate diff --git a/src/cli-ack-patch.ts b/src/cli-ack-patch.ts new file mode 100644 index 0000000..28b3e7d --- /dev/null +++ b/src/cli-ack-patch.ts @@ -0,0 +1,89 @@ +/** + * Layer 4 — `bad design-audit ack-patch` subcommand handler. + * + * Invoked by coding agents after applying a patch: + * bad design-audit ack-patch --pre-run-id [--applied-by ] + * + * When a re-audit is run with `--post-patch `, the pipeline looks up + * the pending application and writes the observed outcome. This file handles + * the ack-patch side; the --post-patch flow lives in pipeline.ts. + */ + +import * as crypto from 'node:crypto' +import type { PatchApplication } from './design/audit/attribution/types.js' +import type { Dimension } from './design/audit/v2/types.js' +import { + appendPatchApplication, + patchHash, + findPendingApplication, + updateApplicationOutcome, +} from './design/audit/attribution/store.js' + +export interface AckPatchOptions { + patchId: string + preRunId: string + appliedBy?: string + predictedDim?: string + predictedDelta?: number + patchBefore?: string + patchAfter?: string + patchScope?: string + dir?: string +} + +/** + * Record that a patch was applied. Returns the applicationId for correlation. 
+ * The predicted delta is optional — when not provided, defaults to 'untested'. + */ +export async function ackPatch(opts: AckPatchOptions): Promise { + const applicationId = crypto.randomUUID() + const hash = patchHash( + { before: opts.patchBefore ?? '', after: opts.patchAfter ?? '' }, + opts.patchScope ?? 'component', + ) + + const app: PatchApplication = { + applicationId, + patchId: opts.patchId, + patchHash: hash, + appliedAt: new Date().toISOString(), + appliedBy: opts.appliedBy ?? 'agent:unknown', + preAuditRunId: opts.preRunId, + predicted: { + dim: (opts.predictedDim ?? 'product_intent') as Dimension, + delta: opts.predictedDelta ?? 0, + }, + } + + await appendPatchApplication(app, opts.dir) + return applicationId +} + +export interface PostPatchOptions { + patchId: string + postRunId: string + observedDim: string + observedDelta: number + dir?: string +} + +/** + * Record the observed outcome after a re-audit. Looks up the pending + * application for `patchId` and appends an outcome event. + */ +export async function recordPatchOutcome(opts: PostPatchOptions): Promise { + const pending = await findPendingApplication(opts.patchId, opts.dir) + if (!pending) { + throw new Error( + `No pending PatchApplication found for patchId ${opts.patchId}. 
` + + 'Run `bad design-audit ack-patch` after applying the patch, before re-auditing.', + ) + } + + await updateApplicationOutcome( + pending.applicationId, + opts.postRunId, + { dim: opts.observedDim as Dimension, delta: opts.observedDelta }, + opts.dir, + ) +} diff --git a/src/cli-design-audit.ts b/src/cli-design-audit.ts index 2add9b8..4595ab2 100644 --- a/src/cli-design-audit.ts +++ b/src/cli-design-audit.ts @@ -17,7 +17,72 @@ import { resolveProviderApiKey, resolveProviderModelName, type SupportedProvider import { loadLocalEnvFiles } from './env-loader.js' import { cliError } from './cli-ui.js' import { auditOnePage } from './design/audit/pipeline.js' -import type { PageAuditResult as Gen2PageAuditResult } from './design/audit/types.js' +import type { PageAuditResult as Gen2PageAuditResult, EthicsViolation } from './design/audit/types.js' + +/** Split "a, b , c" → ['a','b','c']. Returns undefined for empty input so the + * v2 predicate predicates can distinguish "operator did not say" from "[]". */ +function parseTagList(input: string | undefined): string[] | undefined { + if (!input) return undefined + const tags = input.split(',').map(s => s.trim()).filter(Boolean) + return tags.length > 0 ? tags : undefined +} + +/** Pretty-print the ethics-violation report for a set of pages. Prints + * nothing when no page tripped a rule. Each rule is shown with severity, + * remediation, and citation so the operator can act without re-running. */ +function printEthicsViolations(pages: Array<{ url: string; ethicsViolations?: EthicsViolation[] }>): void { + const offenders = pages.filter(p => (p.ethicsViolations?.length ?? 0) > 0) + if (offenders.length === 0) return + console.log('') + console.log(` ${chalk.bgRed.white.bold(' ETHICS VIOLATIONS ')}`) + for (const page of offenders) { + console.log(` ${chalk.dim('Page:')} ${page.url}`) + for (const v of page.ethicsViolations ?? []) { + const sevColor = v.severity === 'critical-floor' ? 
chalk.red : chalk.yellow + console.log(` ${sevColor('•')} ${chalk.bold(v.ruleId)} ${chalk.dim('—')} ${sevColor(v.severity)} ${chalk.dim(`(rollup capped at ${v.rollupCap})`)}`) + console.log(` ${chalk.dim('fix:')} ${v.remediation}`) + if (v.citation) console.log(` ${chalk.dim('cite:')} ${v.citation}`) + } + } +} + +/** Lowest rollup cap across all violated pages, or undefined if none fired. */ +function lowestRollupCap(pages: Array<{ ethicsViolations?: EthicsViolation[] }>): number | undefined { + const caps = pages.flatMap(p => p.ethicsViolations ?? []).map(v => v.rollupCap) + return caps.length === 0 ? undefined : Math.min(...caps) +} + +/** + * Layer 1 — print the per-dimension breakdown for one page when an + * `auditResultV2` is attached. Five dim lines + one rollup line; each shows + * score, range, and confidence so an agent can reason about uncertainty. + */ +function printV2Breakdown(page: { auditResultV2?: unknown }): void { + const v2 = page.auditResultV2 as + | { + scores?: Record + rollup?: { score: number; range: [number, number]; confidence: string; rule: string } + } + | undefined + if (!v2 || !v2.scores || !v2.rollup) return + + const dimOrder = ['product_intent', 'visual_craft', 'trust_clarity', 'workflow', 'content_ia'] + for (const dim of dimOrder) { + const s = v2.scores[dim] + if (!s) continue + const sevColor = s.score >= 8 ? chalk.green : s.score >= 5 ? chalk.yellow : chalk.red + const confColor = s.confidence === 'high' ? chalk.green : s.confidence === 'medium' ? chalk.yellow : chalk.dim + console.log( + ` ${chalk.dim(dim.padEnd(15))} ${sevColor(`${s.score}/10`)} ${chalk.dim(`[${s.range[0]}-${s.range[1]}]`)} ${confColor(s.confidence)}`, + ) + } + const r = v2.rollup + const rColor = r.score >= 8 ? chalk.green : r.score >= 5 ? chalk.yellow : chalk.red + const confColor = r.confidence === 'high' ? chalk.green : r.confidence === 'medium' ? 
chalk.yellow : chalk.dim + console.log( + ` ${chalk.dim('rollup'.padEnd(15))} ${rColor(`${r.score.toFixed(1)}/10`)} ${chalk.dim(`[${r.range[0].toFixed(1)}-${r.range[1].toFixed(1)}]`)} ${confColor(r.confidence)} ${chalk.dim(r.rule)}`, + ) +} import { resolveAuditPasses } from './design/audit/evaluate.js' import { detectSystemicFindings, topByRoi } from './design/audit/roi.js' import { getTelemetry, setInvocation } from './telemetry/index.js' @@ -92,6 +157,12 @@ interface PageAuditResult { rubricFragments?: string[] /** Gen 2: deterministic measurements */ measurements?: Gen2PageAuditResult['measurements'] + /** Layer 7: ethics violations that capped the rollup, if any. */ + ethicsViolations?: EthicsViolation[] + /** Layer 7: the pre-cap rollup score when ethicsViolations is non-empty. */ + preEthicsScore?: number + /** Layer 1: opaque v2 result attached for backwards-compat dual-emit. */ + auditResultV2?: unknown } // --------------------------------------------------------------------------- @@ -249,6 +320,19 @@ export interface DesignAuditOptions { rubricsDir?: string /** Subjective LLM audit passes: standard, deep, max, number, or comma-list */ auditPasses?: string + // ── Layer 7 — domain ethics gate ── + /** Bypass the ethics floor entirely. Audited + warned. Test-only. */ + skipEthics?: boolean + /** Override directory for ethics rule yaml files. */ + ethicsRulesDir?: string + /** Comma-separated audience tags: developer, clinician, kids, ... */ + audience?: string + /** Comma-separated regulatory contexts: hipaa, gdpr, coppa, ... */ + regulatoryContext?: string + /** Comma-separated audience-vulnerability tags: patient-facing, minor-facing, ... */ + audienceVulnerability?: string + /** Single modality: mobile, tablet, desktop, tv, kiosk */ + modality?: string } export async function runDesignAudit(opts: DesignAuditOptions): Promise { @@ -270,6 +354,19 @@ export async function runDesignAudit(opts: DesignAuditOptions): Promise { const apiKey = opts.apiKey ?? 
resolveProviderApiKey(provider) const auditPasses = resolveAuditPasses(opts.auditPasses) + // Layer 7 — ethics gate options. Threaded into every auditOnePage call site. + if (opts.skipEthics) { + console.warn(` ${chalk.yellow('⚠')} ${chalk.bold('--skip-ethics')} ${chalk.dim('— ethics floor disabled (test-only)')}`) + } + const ethicsCommonOpts = { + skipEthics: opts.skipEthics, + ethicsRulesDir: opts.ethicsRulesDir, + audience: parseTagList(opts.audience) as never, + regulatoryContext: parseTagList(opts.regulatoryContext) as never, + audienceVulnerability: parseTagList(opts.audienceVulnerability) as never, + modality: parseTagList(opts.modality) as never, + } + // Telemetry: every design-audit invocation gets a stable runId. Children // (per-page, evolve rounds) link back via parentRunId so a fleet rollup can // reconstruct the tree. @@ -347,6 +444,7 @@ export async function runDesignAudit(opts: DesignAuditOptions): Promise { runId, provider, model: modelName, + ...ethicsCommonOpts, }) const result = gen2 as PageAuditResult results.push(result) @@ -358,6 +456,7 @@ export async function runDesignAudit(opts: DesignAuditOptions): Promise { ? chalk.dim(` (${result.classification.type}/${result.classification.domain})`) : '' console.log(` ${icon} ${scoreColor(`${result.score}/10`)} ${chalk.dim('—')} ${findingCount} finding${findingCount !== 1 ? 's' : ''}${classLabel}`) + printV2Breakdown(result) } // Cross-page systemic detection + top-fixes ranking. @@ -383,17 +482,31 @@ export async function runDesignAudit(opts: DesignAuditOptions): Promise { if (opts.json) { const jsonPath = path.join(outputDir, 'report.json') + // Layer 1 — emit BOTH schemaVersion 1 (legacy) and schemaVersion 2 (new) + // shapes for one release. Consumers can migrate to v2 incrementally. 
+ const v2Pages = results + .map(r => r.auditResultV2) + .filter((r): r is unknown => r !== undefined) fs.writeFileSync(jsonPath, JSON.stringify({ + schemaVersion: 1, timestamp: new Date().toISOString(), profile, url: opts.url, pages: results, topFixes, summary: { avgScore, totalFindings: allFindings.length, critical, major, minor }, + v2: { + schemaVersion: 2, + pages: v2Pages, + }, }, null, 2)) console.log(` ${chalk.dim('JSON →')} ${jsonPath}`) } + // ── Layer 7 — surface ethics violations BEFORE the score summary so the + // operator sees the floor reason, not just the capped number. ── + printEthicsViolations(results) + // Summary console.log('') console.log(` ${chalk.dim('─'.repeat(52))}`) @@ -403,6 +516,10 @@ export async function runDesignAudit(opts: DesignAuditOptions): Promise { if (major > 0) findingParts.push(chalk.yellow(`${major} major`)) if (minor > 0) findingParts.push(chalk.dim(`${minor} minor`)) console.log(` Avg: ${avgColor(`${avgScore.toFixed(1)}/10`)} ${chalk.dim('·')} ${allFindings.length} findings ${findingParts.length ? chalk.dim('(') + findingParts.join(chalk.dim(' · ')) + chalk.dim(')') : ''}`) + const lowestCap = lowestRollupCap(results) + if (lowestCap !== undefined) { + console.log(` ${chalk.red('⚠ Rollup capped at')} ${chalk.bold(`${lowestCap}/10`)} ${chalk.dim('— resolve ethics violations to lift the cap')}`) + } console.log(` ${chalk.dim('Report →')} ${reportPath}`) if (screenshotDir) console.log(` ${chalk.dim('Screenshots →')} ${screenshotDir}`) console.log('') @@ -426,6 +543,7 @@ export async function runDesignAudit(opts: DesignAuditOptions): Promise { provider, model: modelName, parentRunId: runId, + ...ethicsCommonOpts, }) repResults.push(gen2 as PageAuditResult) } diff --git a/src/cli-patterns.ts b/src/cli-patterns.ts new file mode 100644 index 0000000..cf09a34 --- /dev/null +++ b/src/cli-patterns.ts @@ -0,0 +1,66 @@ +/** + * Layer 5 — `bad patterns` subcommand surface. + * + * Provides pattern query and inspection via CLI. 
Mining runs as a periodic + * Cloudflare Worker cron in production; locally it reads from ~/.bad/patterns/. + * + * bad patterns query [--category ] [--page-type ] [--weak-dimension ] + * bad patterns show + * bad patterns mine [--dir ] + */ + +import type { PatternQuery } from './design/audit/patterns/types.js' +import { queryPatterns, loadPatterns } from './design/audit/patterns/store.js' +import type { Dimension, PageType } from './design/audit/v2/types.js' + +export interface PatternsQueryOptions { + category?: string + pageType?: PageType + weakDimension?: Dimension + minApplications?: number + minSuccessRate?: number + json?: boolean + dir?: string +} + +export async function runPatternsQuery(opts: PatternsQueryOptions): Promise { + const query: PatternQuery = { + category: opts.category, + pageType: opts.pageType, + weakDimension: opts.weakDimension, + minApplications: opts.minApplications, + minSuccessRate: opts.minSuccessRate, + } + const patterns = await queryPatterns(query, opts.dir) + + if (patterns.length === 0) { + console.log('No patterns found. 
The pattern library is empty until fleet data accumulates (Layer 5 cold-start).') + return + } + + if (opts.json) { + console.log(JSON.stringify(patterns, null, 2)) + return + } + + for (const p of patterns) { + console.log(`\n[${p.patternId}] ${p.scaffold.description}`) + console.log(` Category: ${p.category} | Type: ${p.classification.type}`) + console.log(` Fleet: N=${p.fleetEvidence.applications} tenants=${p.fleetEvidence.sampleTenants} success=${(p.fleetEvidence.successRate * 100).toFixed(0)}%`) + console.log(` Key decisions: ${p.scaffold.keyDecisions.join('; ')}`) + } +} + +export async function runPatternsShow(patternId: string, opts: { json?: boolean; dir?: string } = {}): Promise { + const all = await loadPatterns(opts.dir) + const pattern = all.find(p => p.patternId === patternId) + if (!pattern) { + console.error(`Pattern ${patternId} not found.`) + process.exit(1) + } + if (opts.json) { + console.log(JSON.stringify(pattern, null, 2)) + return + } + console.log(JSON.stringify(pattern, null, 2)) +} diff --git a/src/cli.ts b/src/cli.ts index 35662b6..34ac2a3 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -153,6 +153,15 @@ async function main(): Promise { reproducibility: { type: 'boolean' }, 'rubrics-dir': { type: 'string' }, 'audit-passes': { type: 'string' }, + // Layer 7 — domain ethics gate. --skip-ethics bypasses the rollup floor + // for testing scenarios; --ethics-rules-dir overrides the builtin rule set. + 'skip-ethics': { type: 'boolean' }, + 'ethics-rules-dir': { type: 'string' }, + // Layer 6 / 7 — audience predicate hints. Comma-separated. 
+ audience: { type: 'string' }, + 'regulatory-context': { type: 'string' }, + 'audience-vulnerability': { type: 'string' }, + modality: { type: 'string' }, // bad view port: { type: 'string' }, 'no-open': { type: 'boolean' }, @@ -341,6 +350,12 @@ async function main(): Promise { reproducibility: values.reproducibility, rubricsDir: values['rubrics-dir'], auditPasses: values['audit-passes'], + skipEthics: values['skip-ethics'], + ethicsRulesDir: values['ethics-rules-dir'], + audience: values.audience, + regulatoryContext: values['regulatory-context'], + audienceVulnerability: values['audience-vulnerability'], + modality: values.modality, }); process.exit(0); } diff --git a/src/design/audit/attribution/aggregate.ts b/src/design/audit/attribution/aggregate.ts new file mode 100644 index 0000000..5e8c6c8 --- /dev/null +++ b/src/design/audit/attribution/aggregate.ts @@ -0,0 +1,117 @@ +/** + * Layer 4 — Cross-tenant patch reliability aggregation. + * + * Groups PatchApplication records by patchHash and computes reliability + * statistics. Fleet-mined patterns (Layer 5) consume these aggregates. + */ + +import type { PatchApplication, PatchReliability } from './types.js' +import type { PatchRecommendation } from './types.js' + +/** Stored record includes patchHash which is computed at write time by store.ts. */ +type StoredApplication = PatchApplication & { patchHash?: string } + +const MIN_APPLICATIONS_FOR_RECOMMENDED = 30 +const MIN_TENANTS_FOR_RECOMMENDED = 5 +const REPLICATION_RATE_THRESHOLD = 0.7 +const MIN_APPLICATIONS_FOR_ANTIPATTERN = 10 +const ANTIPATTERN_REPLICATION_THRESHOLD = 0.3 + +/** + * A candidate application for aggregation — has both predicted and observed. 
+ */ +type CompletedApplication = PatchApplication & { + observed: NonNullable +} + +function isCompleted(app: PatchApplication): app is CompletedApplication { + return app.observed !== undefined +} + +/** + * True when the observed delta "replicates" the predicted: same sign and + * at least half the magnitude. + */ +function replicates(predicted: { delta: number }, observed: { delta: number }): boolean { + if (Math.sign(predicted.delta) !== Math.sign(observed.delta)) return false + return Math.abs(observed.delta) >= 0.5 * Math.abs(predicted.delta) +} + +/** Extract tenant tag from `appliedBy` field or application metadata. */ +function tenantFrom(app: PatchApplication): string { + // convention: 'agent:claude-code:tenant-id' or tenantId field if present + const parts = app.appliedBy.split(':') + return parts.length >= 3 ? parts.slice(2).join(':') : app.appliedBy +} + +/** + * Aggregate all PatchApplication records into per-patchHash reliability stats. + * Records without an `observed` delta are counted in `applications` but excluded + * from the rate computations. + */ +export function aggregatePatchReliability( + applications: PatchApplication[], +): PatchReliability[] { + const byHash = new Map() + for (const app of applications as StoredApplication[]) { + const hash = app.patchHash ?? app.patchId // fall back to patchId for records without hash + if (!byHash.has(hash)) byHash.set(hash, []) + byHash.get(hash)!.push(app) + } + + const results: PatchReliability[] = [] + for (const [hashKey, apps] of byHash.entries()) { + const completed = apps.filter(isCompleted) + const tenants = new Set(apps.map(tenantFrom)).size + + const meanPredictedDelta = + completed.length > 0 + ? completed.reduce((s, a) => s + a.predicted.delta, 0) / completed.length + : 0 + + const meanObservedDelta = + completed.length > 0 + ? completed.reduce((s, a) => s + a.observed.delta, 0) / completed.length + : 0 + + const replicationRate = + completed.length > 0 + ? 
completed.filter(a => replicates(a.predicted, a.observed)).length / completed.length + : 0 + + results.push({ + patchHash: hashKey, + applications: apps.length, + meanPredictedDelta, + meanObservedDelta, + sampleTenants: tenants, + replicationRate, + recommendation: recommendationFor(apps.length, tenants, replicationRate, meanObservedDelta), + }) + } + + return results.sort((a, b) => b.applications - a.applications) +} + +export function recommendationFor( + applications: number, + sampleTenants: number, + replicationRate: number, + meanObservedDelta: number, +): PatchRecommendation { + if ( + applications >= MIN_APPLICATIONS_FOR_RECOMMENDED && + sampleTenants >= MIN_TENANTS_FOR_RECOMMENDED && + replicationRate >= REPLICATION_RATE_THRESHOLD + ) { + return 'recommended' + } + if ( + applications >= MIN_APPLICATIONS_FOR_ANTIPATTERN && + replicationRate < ANTIPATTERN_REPLICATION_THRESHOLD && + meanObservedDelta < 0 + ) { + return 'antipattern' + } + return 'neutral' +} diff --git a/src/design/audit/attribution/store.ts b/src/design/audit/attribution/store.ts new file mode 100644 index 0000000..a828a95 --- /dev/null +++ b/src/design/audit/attribution/store.ts @@ -0,0 +1,138 @@ +/** + * Layer 4 — Append-only JSONL store for PatchApplication records. + * + * Layout: `<dir>/applications/<YYYY-MM-DD>.jsonl` + * Each line is a standalone JSON object — `patchHash` is always set so cross- + * tenant aggregation can group by patch signature, not per-tenant path. + * + * Append-only invariant: never mutate existing lines. Outcome updates are + * recorded as NEW lines so the JSONL is an event stream, not a state snapshot.
+ */ + +import * as fs from 'node:fs' +import * as fsp from 'node:fs/promises' +import * as path from 'node:path' +import * as os from 'node:os' +import * as crypto from 'node:crypto' +import type { PatchApplication } from './types.js' + +const DEFAULT_DIR = path.join(os.homedir(), '.bad', 'attribution') + +function applicationsDir(dir: string): string { + return path.join(dir, 'applications') +} + +function todayPath(dir: string): string { + const date = new Date().toISOString().slice(0, 10) + return path.join(applicationsDir(dir), `${date}.jsonl`) +} + +/** + * Stable hash for a patch diff + scope. Same patch content across tenants → + * same hash, enabling cross-tenant reliability aggregation. + */ +export function patchHash(diff: { before: string; after: string }, scope: string): string { + return crypto + .createHash('sha256') + .update(`${diff.before}\n---\n${diff.after}\n---\n${scope}`) + .digest('hex') + .slice(0, 16) +} + +/** Append a new PatchApplication record. */ +export async function appendPatchApplication( + app: PatchApplication, + dir: string = DEFAULT_DIR, +): Promise { + await fsp.mkdir(applicationsDir(dir), { recursive: true }) + await fsp.appendFile(todayPath(dir), JSON.stringify(app) + '\n', 'utf-8') +} + +/** Sync variant for non-async call sites. */ +export function appendPatchApplicationSync( + app: PatchApplication, + dir: string = DEFAULT_DIR, +): void { + fs.mkdirSync(applicationsDir(dir), { recursive: true }) + fs.appendFileSync(todayPath(dir), JSON.stringify(app) + '\n', 'utf-8') +} + +/** Read all PatchApplication records from the last `days` days. 
*/ +export async function readRecentApplications( + days: number = 7, + dir: string = DEFAULT_DIR, +): Promise { + const appsDir = applicationsDir(dir) + if (!fs.existsSync(appsDir)) return [] + + const results: PatchApplication[] = [] + for (let d = 0; d < days; d++) { + const date = new Date(Date.now() - d * 86_400_000).toISOString().slice(0, 10) + const filePath = path.join(appsDir, `${date}.jsonl`) + if (!fs.existsSync(filePath)) continue + const lines = fs.readFileSync(filePath, 'utf-8').split('\n').filter(Boolean) + for (const line of lines) { + try { + results.push(JSON.parse(line) as PatchApplication) + } catch { + // corrupt line — skip + } + } + } + return results +} + +/** + * Find the most recent pending application for a patchId — one that has no + * `postAuditRunId` yet. Used when a re-audit lands to attach the outcome. + */ +export async function findPendingApplication( + patchId: string, + dir: string = DEFAULT_DIR, +): Promise { + const apps = await readRecentApplications(7, dir) + // Most recent first; pick the newest pending one. + const pending = apps + .filter(a => a.patchId === patchId && !a.postAuditRunId) + .sort((a, b) => b.appliedAt.localeCompare(a.appliedAt)) + return pending[0] ?? null +} + +/** + * Append an outcome event for an existing application. Does NOT mutate the + * original line — appends a new event so the JSONL remains an event stream. 
+ */ +export async function updateApplicationOutcome( + applicationId: string, + postAuditRunId: string, + observed: PatchApplication['observed'], + dir: string = DEFAULT_DIR, +): Promise { + const apps = await readRecentApplications(7, dir) + const original = apps.find(a => a.applicationId === applicationId) + if (!original) { + throw new Error(`PatchApplication ${applicationId} not found in the last 7 days`) + } + + const agreementScore = computeAgreementScore(original.predicted, observed) + const outcome: PatchApplication = { + ...original, + postAuditRunId, + observed, + agreementScore, + } + + await fsp.mkdir(applicationsDir(dir), { recursive: true }) + await fsp.appendFile(todayPath(dir), JSON.stringify(outcome) + '\n', 'utf-8') +} + +function computeAgreementScore( + predicted: PatchApplication['predicted'], + observed: PatchApplication['observed'], +): number { + if (!predicted || !observed) return 0 + const p = predicted.delta + const o = observed.delta + const denom = Math.max(Math.abs(p), Math.abs(o), 1) + return 1 - Math.abs(p - o) / denom +} diff --git a/src/design/audit/attribution/types.ts b/src/design/audit/attribution/types.ts new file mode 100644 index 0000000..1236667 --- /dev/null +++ b/src/design/audit/attribution/types.ts @@ -0,0 +1,18 @@ +/** + * Layer 4 — Outcome attribution type contract. + * + * These types are already defined in src/design/audit/v2/types.ts as part of + * the Phase 0 contract. This module re-exports them so attribution code can + * import from a single, predictable path. When v2/types.ts is the sole + * canonical source, update these re-exports accordingly. 
+ */ + +export type { + PatchApplication, + PatchReliability, +} from '../v2/types.js' + +/** sha256(diff.before + '\n---\n' + diff.after + '\n---\n' + scope).slice(0,16) */ +export type PatchHash = string + +export type PatchRecommendation = 'recommended' | 'neutral' | 'antipattern' diff --git a/src/design/audit/classify-ensemble.ts b/src/design/audit/classify-ensemble.ts new file mode 100644 index 0000000..5180481 --- /dev/null +++ b/src/design/audit/classify-ensemble.ts @@ -0,0 +1,264 @@ +/** + * Ensemble classifier — Layer 1 of the world-class design-audit architecture. + * + * Three-signal vote (URL pattern + DOM heuristic + LLM) decides the page type + * and reports an ensemble confidence so downstream layers (first-principles + * fallback, rubric loader, telemetry) can act on uncertainty honestly. + * + * Vote logic: + * - URL + DOM agree on a type AND combined confidence > 0.7 → accept (skip LLM) + * - else → run LLM, take majority + * - if LLM confidence < 0.5 AND signals disagree → return 'unknown' with dissent + */ + +import type { Brain } from '../../brain/index.js' +import type { PageState } from '../../types.js' +import { classifyPage, defaultClassification } from './classify.js' +import type { PageClassification, PageType } from './types.js' +import type { + ClassifierSignal, + ClassifierSource, + DomHeuristics, + EnsembleClassification, +} from './v2/types.js' + +interface UrlPatternRule { + pattern: RegExp + type: PageType + confidence: number + rationale: string +} + +/** + * URL pattern rules — straight from the RFC. Order matters: more specific + * patterns first. Each rule's confidence is the URL signal's contribution to + * the ensemble vote. 
+ */ +const URL_PATTERN_RULES: UrlPatternRule[] = [ + { pattern: /\/(docs|reference|api|guide|help|faq)(\/|$)/, type: 'docs', confidence: 0.85, rationale: 'URL contains a docs path segment' }, + { pattern: /\/(checkout|cart|pay|order|billing)(\/|$)/, type: 'ecommerce', confidence: 0.85, rationale: 'URL contains a commerce path segment' }, + { pattern: /\/(app|dashboard|workspace|admin)(\/|$)/, type: 'saas-app', confidence: 0.75, rationale: 'URL contains an app/dashboard path segment' }, + { pattern: /\/(login|signup|auth|sign-in)(\/|$)/, type: 'utility', confidence: 0.85, rationale: 'URL contains an auth path segment' }, + { pattern: /\/(pricing|plans|features|product)(\/|$)/, type: 'marketing', confidence: 0.7, rationale: 'URL contains a marketing path segment' }, + { pattern: /\/(blog|articles|news|stories)(\/|$)/, type: 'blog', confidence: 0.8, rationale: 'URL contains a blog path segment' }, + { pattern: /\/$/, type: 'marketing', confidence: 0.4, rationale: 'URL is a root path — weak marketing default' }, +] + +const ENSEMBLE_AGREEMENT_THRESHOLD = 0.7 +const LLM_FALLBACK_CONFIDENCE = 0.5 + +export interface EnsembleClassifyInput { + brain: Brain + state: PageState + url: string + /** Optional pre-captured DOM heuristics. If absent, we attempt to derive them from the snapshot. */ + domHeuristics?: DomHeuristics +} + +/** Public entry point. */ +export async function classifyEnsemble(input: EnsembleClassifyInput): Promise { + const signals: ClassifierSignal[] = [] + + // ── 1. URL pattern signal ── + const urlSignal = classifyByUrl(input.url) + if (urlSignal) signals.push(urlSignal) + + // ── 2. DOM heuristic signal ── + const dom = input.domHeuristics ?? 
deriveHeuristics(input.state) + const domSignal = classifyByDom(dom) + if (domSignal) signals.push(domSignal) + + // ── Quick path: URL + DOM agree with combined confidence > threshold ── + if ( + urlSignal && + domSignal && + urlSignal.type === domSignal.type && + urlSignal.confidence + domSignal.confidence > ENSEMBLE_AGREEMENT_THRESHOLD + ) { + const ensembleConfidence = clamp01( + Math.min(1, (urlSignal.confidence + domSignal.confidence) / 1.6), + ) + return finalize({ + type: urlSignal.type, + base: defaultClassification(), + signals, + ensembleConfidence, + signalsAgreed: true, + }) + } + + // ── 3. LLM tiebreaker ── + const llmClass = await classifyPage(input.brain, input.state).catch(() => defaultClassification()) + signals.push({ + source: 'llm', + type: llmClass.type, + confidence: llmClass.confidence, + rationale: llmClass.intent || 'LLM page classification', + }) + + // ── Vote ── + const tally = new Map() + for (const sig of signals) { + tally.set(sig.type, (tally.get(sig.type) ?? 0) + sig.confidence) + } + + const sortedVotes = [...tally.entries()].sort((a, b) => b[1] - a[1]) + const winner = sortedVotes[0] + const winningType = winner ? winner[0] : 'unknown' + const winningTotal = winner ? winner[1] : 0 + + // Compute aggregate confidence: average over participating signals, weighted by agreement. 
+ const winningSignals = signals.filter((s) => s.type === winningType) + const agreementShare = winningSignals.length / signals.length + const meanConfidence = winningSignals.reduce((acc, s) => acc + s.confidence, 0) / Math.max(winningSignals.length, 1) + const ensembleConfidence = clamp01(meanConfidence * agreementShare + 0.05 * (winningTotal - meanConfidence)) + + const signalsAgreed = signals.every((s) => s.type === winningType) + const dissent = signals.filter((s) => s.type !== winningType).map((s) => ({ source: s.source, type: s.type })) + + // ── Low-confidence + disagreement → 'unknown' with dissent ── + if (!signalsAgreed && llmClass.confidence < LLM_FALLBACK_CONFIDENCE) { + return finalize({ + type: 'unknown', + base: llmClass, + signals, + ensembleConfidence: Math.min(ensembleConfidence, 0.5), + signalsAgreed: false, + dissent, + }) + } + + return finalize({ + type: winningType, + base: llmClass, + signals, + ensembleConfidence, + signalsAgreed, + dissent: signalsAgreed ? undefined : dissent, + }) +} + +interface FinalizeArgs { + type: PageType + base: PageClassification + signals: ClassifierSignal[] + ensembleConfidence: number + signalsAgreed: boolean + dissent?: { source: ClassifierSource; type: PageType }[] +} + +function finalize(args: FinalizeArgs): EnsembleClassification { + const { type, base, signals, ensembleConfidence, signalsAgreed } = args + const firstPrinciplesMode = !signalsAgreed || ensembleConfidence < 0.6 + + const out: EnsembleClassification = { + ...base, + type, + confidence: ensembleConfidence, + signals, + signalsAgreed, + ensembleConfidence, + firstPrinciplesMode, + } + if (args.dissent && args.dissent.length > 0) out.dissent = args.dissent + return out +} + +// ── URL-pattern classifier ────────────────────────────────────────────────── + +export function classifyByUrl(url: string): ClassifierSignal | null { + let pathname: string + try { + pathname = new URL(url).pathname || '/' + } catch { + return null + } + for (const rule of 
URL_PATTERN_RULES) { + if (rule.pattern.test(pathname)) { + return { + source: 'url-pattern', + type: rule.type, + confidence: rule.confidence, + rationale: `${rule.rationale} (${pathname})`, + } + } + } + return null +} + +// ── DOM-heuristic classifier ──────────────────────────────────────────────── + +export function classifyByDom(dom: DomHeuristics): ClassifierSignal | null { + // docs: lots of paragraphs + code blocks, modest nav + if (dom.codeBlockCount >= 3 && dom.paragraphCount >= 6) { + return signal('dom-heuristic', 'docs', 0.7, `code blocks=${dom.codeBlockCount}, paragraphs=${dom.paragraphCount}`) + } + // dashboard: many table rows or charts + sidebar + if ((dom.tableRowCount >= 8 || dom.chartCount >= 2) && dom.hasSidebar) { + return signal('dom-heuristic', 'dashboard', 0.7, `rows=${dom.tableRowCount}, charts=${dom.chartCount}, sidebar=true`) + } + // saas-app: sidebar + multiple forms or many inputs + if (dom.hasSidebar && (dom.formCount >= 1 || dom.inputCount >= 4)) { + return signal('dom-heuristic', 'saas-app', 0.65, `sidebar=true, forms=${dom.formCount}, inputs=${dom.inputCount}`) + } + // utility: single dominant form, no hero, no sidebar + if (dom.formCount >= 1 && dom.inputCount >= 2 && !dom.hasHeroSection && !dom.hasSidebar) { + return signal('dom-heuristic', 'utility', 0.7, `single form, no hero, no sidebar`) + } + // ecommerce: forms + many nav items + footer links (storefront chrome) + if (dom.formCount >= 1 && dom.navItems >= 6 && dom.hasFooterLinks) { + return signal('dom-heuristic', 'ecommerce', 0.6, `nav=${dom.navItems}, footer-links, form present`) + } + // blog: long body of paragraphs without forms or tables + if (dom.paragraphCount >= 8 && dom.formCount === 0 && dom.tableRowCount === 0) { + return signal('dom-heuristic', 'blog', 0.65, `paragraphs=${dom.paragraphCount}, no forms or tables`) + } + // marketing: hero + footer-link cloud + few paragraphs + if (dom.hasHeroSection && dom.hasFooterLinks && dom.paragraphCount < 8) { + return 
signal('dom-heuristic', 'marketing', 0.6, `hero present, footer-links, paragraphs=${dom.paragraphCount}`) + } + return null +} + +function signal(source: ClassifierSource, type: PageType, confidence: number, rationale: string): ClassifierSignal { + return { source, type, confidence, rationale } +} + +// ── DOM heuristic derivation from snapshot ────────────────────────────────── + +/** + * Best-effort DOM heuristic derivation from the accessibility-tree snapshot. + * Pipelines that capture true DOM heuristics via Playwright should pass them + * in directly; this fallback works against the @ref-snapshot text. + */ +export function deriveHeuristics(state: PageState): DomHeuristics { + const snapshot = state.snapshot ?? '' + return { + formCount: countMatches(snapshot, /\bform\b/gi), + inputCount: countMatches(snapshot, /\b(textbox|searchbox|combobox|spinbutton|input)\b/gi), + tableRowCount: countMatches(snapshot, /\brow\b/gi), + chartCount: countMatches(snapshot, /\b(graphics-document|graphics-symbol|figure)\b/gi), + navItems: countMatches(snapshot, /\bnavigation\b/gi), + hasFooterLinks: /\bcontentinfo\b/i.test(snapshot), + hasHeroSection: /\bhero\b/i.test(snapshot) || /\bbanner\b/i.test(snapshot), + hasSidebar: /\bcomplementary\b/i.test(snapshot) || /\bsidebar\b/i.test(snapshot), + paragraphCount: countMatches(snapshot, /\bparagraph\b/gi), + codeBlockCount: countMatches(snapshot, /\bcode\b/gi), + } +} + +function countMatches(haystack: string, pattern: RegExp): number { + const m = haystack.match(pattern) + return m ? 
m.length : 0 +} + +function clamp01(n: number): number { + if (Number.isNaN(n)) return 0 + return Math.max(0, Math.min(1, n)) +} + +export const ENSEMBLE_INTERNALS = { + URL_PATTERN_RULES, + ENSEMBLE_AGREEMENT_THRESHOLD, + LLM_FALLBACK_CONFIDENCE, +} diff --git a/src/design/audit/ethics/check.ts b/src/design/audit/ethics/check.ts new file mode 100644 index 0000000..520322c --- /dev/null +++ b/src/design/audit/ethics/check.ts @@ -0,0 +1,183 @@ +/** + * Ethics check — Layer 7. + * + * Given a page state + classification, evaluate every loaded `EthicsRule` whose + * `appliesWhen` matches the classification. Each rule produces zero-or-one + * `EthicsViolation`. Violations enforce a hard floor on the rollup score: + * `critical-floor → 4`, `major-floor → 6`. + * + * Detector kinds: + * pattern-absent → regex must appear in page text; violation if absent + * pattern-present → regex must NOT appear in page text; violation if present + * llm-classifier → ask the LLM the question; violation when answer is yes + * + * Pattern matches are case-insensitive. The LLM classifier asks for a + * single-token yes/no answer to keep latency + cost predictable. + */ + +import type { Brain } from '../../../brain/index.js' +import type { + AppliesWhen, + EthicsRule, + EthicsViolation, + PageClassification, + AudienceTag, + ModalityTag, + RegulatoryContextTag, + AudienceVulnerabilityTag, +} from '../v2/types.js' +import { rollupCapFor } from './loader.js' + +export interface EthicsCheckContext { + /** Lowercased page text used by `pattern-absent` / `pattern-present`. */ + pageText: string + /** Page snapshot — passed verbatim to the LLM classifier prompt. */ + snapshot: string + /** The page-type / domain / maturity / designSystem classification. */ + classification: PageClassification + /** Operator-supplied audience / modality / regulatory hints (Layer 6). 
*/ + audience?: AudienceTag[] + modality?: ModalityTag[] + regulatoryContext?: RegulatoryContextTag[] + audienceVulnerability?: AudienceVulnerabilityTag[] +} + +export interface EthicsCheckOptions { + /** When set, llm-classifier rules are evaluated; else skipped (deterministic-only). */ + brain?: Brain + /** Optional screenshot URL/path passed alongside snapshot context (unused today). */ + screenshotPath?: string + /** Logger override — defaults to console.warn for skipped rules. */ + warn?: (msg: string) => void +} + +/** + * Run every applicable rule against the page. Returns one violation per rule + * that fires. Rules whose detector is `llm-classifier` are skipped (with a + * warning) when no `brain` is supplied — the alternative is silent passes, + * which would hide ethics gaps in offline tests. + */ +export async function checkEthics( + rules: EthicsRule[], + ctx: EthicsCheckContext, + opts: EthicsCheckOptions = {}, +): Promise { + const warn = opts.warn ?? ((m: string) => console.warn(m)) + const violations: EthicsViolation[] = [] + for (const rule of rules) { + if (!appliesWhenMatches(rule.appliesWhen, ctx)) continue + const fired = await runDetector(rule, ctx, opts.brain, warn) + if (fired) violations.push(toViolation(rule)) + } + return violations +} + +function toViolation(rule: EthicsRule): EthicsViolation { + return { + ruleId: rule.ruleId, + detected: true, + severity: rule.severity, + rollupCap: rollupCapFor(rule.severity), + remediation: rule.remediation, + ...(rule.citation ? { citation: rule.citation } : {}), + } +} + +/** + * Predicate evaluator — extends the rubric loader's logic with the v2 fields + * (audience / modality / regulatoryContext / audienceVulnerability). All + * declared predicates are AND-combined. 
+ */ +export function appliesWhenMatches(w: AppliesWhen, ctx: EthicsCheckContext): boolean { + if (w.universal) return true + const cls = ctx.classification + + if (w.type?.length && !w.type.includes(cls.type)) return false + if (w.maturity?.length && !w.maturity.includes(cls.maturity)) return false + if (w.designSystem?.length && !w.designSystem.includes(cls.designSystem)) return false + if (w.domain?.length) { + const domain = (cls.domain ?? '').toLowerCase() + const ok = w.domain.some(d => domain.includes(d.toLowerCase())) + if (!ok) return false + } + if (w.audience?.length) { + if (!w.audience.some(a => (ctx.audience ?? []).includes(a))) return false + } + if (w.modality?.length) { + if (!w.modality.some(m => (ctx.modality ?? []).includes(m))) return false + } + if (w.regulatoryContext?.length) { + if (!w.regulatoryContext.some(r => (ctx.regulatoryContext ?? []).includes(r))) return false + } + if (w.audienceVulnerability?.length) { + if (!w.audienceVulnerability.some(v => (ctx.audienceVulnerability ?? []).includes(v))) return false + } + + // Same conservative behavior as the rubric loader: with no predicates and + // no `universal: true`, the rule does NOT apply (don't fire blindly). 
+ const hasPredicate = + !!w.type?.length || + !!w.maturity?.length || + !!w.designSystem?.length || + !!w.domain?.length || + !!w.audience?.length || + !!w.modality?.length || + !!w.regulatoryContext?.length || + !!w.audienceVulnerability?.length + return hasPredicate +} + +async function runDetector( + rule: EthicsRule, + ctx: EthicsCheckContext, + brain: Brain | undefined, + warn: (m: string) => void, +): Promise { + const d = rule.detector + if (d.kind === 'pattern-absent') { + const re = compilePattern(d.pattern) + return !re.test(ctx.pageText) + } + if (d.kind === 'pattern-present') { + const re = compilePattern(d.pattern) + return re.test(ctx.pageText) + } + if (d.kind === 'llm-classifier') { + if (!brain) { + warn(`ethics: skipping llm-classifier rule ${rule.ruleId} — no brain provided`) + return false + } + return await runLlmClassifier(brain, d.llmCheck, ctx.snapshot) + } + return false +} + +function compilePattern(pattern: string): RegExp { + // Allow already-anchored regexes; default is case-insensitive substring. + try { + return new RegExp(pattern, 'i') + } catch { + // Fallback: treat as a literal token if regex compile fails. + return new RegExp(pattern.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'), 'i') + } +} + +const LLM_SYSTEM = `You are an ethics auditor. Answer the user's yes/no question about the page snapshot. Respond with exactly one token — yes or no — followed by an optional one-sentence rationale. Do not hedge. If the evidence is missing, answer no.` + +export async function runLlmClassifier( + brain: Brain, + question: string, + snapshot: string, +): Promise { + const truncated = snapshot.length > 8000 ? `${snapshot.slice(0, 8000)}\n…[truncated]` : snapshot + const user = `QUESTION: ${question}\n\nPAGE SNAPSHOT:\n${truncated}\n\nAnswer yes or no.` + const { text } = await brain.complete(LLM_SYSTEM, user, { maxOutputTokens: 80 }) + const first = text.trim().toLowerCase().match(/^[a-z]+/)?.[0] ?? 
'' + return first === 'yes' +} + +/** Build the lowercased text blob used by pattern detectors. */ +export function pageTextBlob(snapshot: string, extra?: { url?: string; title?: string }): string { + const parts = [snapshot, extra?.title ?? '', extra?.url ?? ''] + return parts.join('\n').toLowerCase() +} diff --git a/src/design/audit/ethics/loader.ts b/src/design/audit/ethics/loader.ts new file mode 100644 index 0000000..9060c4b --- /dev/null +++ b/src/design/audit/ethics/loader.ts @@ -0,0 +1,213 @@ +/** + * Ethics rule loader — Layer 7. + * + * Loads `EthicsRule[]` from `rules/*.yaml`. Idempotent + cached: the in-memory + * cache keys on directory path so repeated calls (per-page audits) never re-IO. + * + * Each YAML file is a list of rule objects. The minimal parser supports the + * shape used in the RFC: `- key: value` items with nested objects and inline + * `[a, b]` lists. No external yaml dep — same approach as rubric/loader.ts. + */ + +import * as fs from 'node:fs' +import * as path from 'node:path' +import { fileURLToPath } from 'node:url' +import type { + EthicsRule, + EthicsCategory, + EthicsSeverity, + EthicsDetector, + AppliesWhen, +} from '../v2/types.js' + +const __dirname = path.dirname(fileURLToPath(import.meta.url)) +const BUILTIN_RULES_DIR = path.join(__dirname, 'rules') + +const cache = new Map() + +/** Severity → rollup ceiling. critical-floor caps at 4; major-floor caps at 6. */ +export function rollupCapFor(severity: EthicsSeverity): number { + return severity === 'critical-floor' ? 4 : 6 +} + +/** + * Load every `*.yaml` rule file in `dir`. Cached by absolute path. + * Returns a stable order (sorted by filename + position within file). 
+ */ +export function loadEthicsRules(dir: string = BUILTIN_RULES_DIR): EthicsRule[] { + const abs = path.resolve(dir) + const cached = cache.get(abs) + if (cached) return cached + if (!fs.existsSync(abs)) { + cache.set(abs, []) + return [] + } + const rules: EthicsRule[] = [] + const files = fs.readdirSync(abs).filter(f => f.endsWith('.yaml')).sort() + for (const f of files) { + const file = path.join(abs, f) + const raw = fs.readFileSync(file, 'utf-8') + const parsed = parseRuleList(raw, file) + for (const r of parsed) rules.push(r) + } + cache.set(abs, rules) + return rules +} + +/** Reset cache — test-only. */ +export function clearEthicsRuleCache(): void { + cache.clear() +} + +function parseRuleList(text: string, sourceFile: string): EthicsRule[] { + const items = splitTopLevelItems(text) + return items.map((block, idx) => parseRule(block, `${sourceFile}#${idx}`)) +} + +/** Split a YAML doc into top-level `- item` blocks (one block per rule). */ +function splitTopLevelItems(text: string): string[] { + const lines = text.split('\n') + const items: string[] = [] + let current: string[] | null = null + for (const line of lines) { + if (/^\s*#/.test(line) || line.trim() === '') { + if (current) current.push(line) + continue + } + if (line.startsWith('- ')) { + if (current) items.push(current.join('\n')) + current = [line.slice(2)] + } else if (current) { + // Indented continuation. Strip 2 leading spaces if present so nesting + // levels become consistent within the item. + current.push(line.startsWith(' ') ? 
line.slice(2) : line) + } + } + if (current) items.push(current.join('\n')) + return items +} + +function parseRule(block: string, ref: string): EthicsRule { + const meta = parseYamlBlock(block) + const ruleId = stringField(meta, 'ruleId', ref) + const category = stringField(meta, 'category', ref) as EthicsCategory + const severity = stringField(meta, 'severity', ref) as EthicsSeverity + if (severity !== 'critical-floor' && severity !== 'major-floor') { + throw new Error(`ethics rule ${ruleId} (${ref}): invalid severity ${severity}`) + } + const remediation = stringField(meta, 'remediation', ref) + const appliesWhen = (meta.appliesWhen as AppliesWhen) ?? {} + const detectorRaw = (meta.detector as Record) ?? {} + const detector = parseDetector(detectorRaw, ruleId) + const citation = meta.citation != null ? String(meta.citation) : undefined + return { + ruleId, + category, + severity, + appliesWhen, + detector, + remediation, + ...(citation ? { citation } : {}), + } +} + +function parseDetector(d: Record, ruleId: string): EthicsDetector { + const kind = String(d.kind ?? '') + if (kind === 'pattern-absent' || kind === 'pattern-present') { + const pattern = String(d.pattern ?? '') + if (!pattern) throw new Error(`ethics rule ${ruleId}: detector.pattern required for ${kind}`) + return { kind, pattern } + } + if (kind === 'llm-classifier') { + const llmCheck = String(d.llmCheck ?? 
'') + if (!llmCheck) throw new Error(`ethics rule ${ruleId}: detector.llmCheck required for llm-classifier`) + return { kind, llmCheck } + } + throw new Error(`ethics rule ${ruleId}: unknown detector.kind ${kind}`) +} + +function stringField(meta: Record, key: string, ref: string): string { + const v = meta[key] + if (v == null || String(v) === '') { + throw new Error(`ethics rule (${ref}): missing required field ${key}`) + } + return String(v) +} + +/** + * YAML block parser supporting: + * key: scalar + * key: [a, b] + * key: | → folded multi-line block (preserves newlines) + * key: > or just continuation lines indented under the key + * key: + * subkey: value + * listKey: [a, b] + */ +function parseYamlBlock(text: string): Record { + const lines = text.split('\n') + const result: Record = {} + let i = 0 + while (i < lines.length) { + const line = lines[i] + if (!line.trim() || line.trim().startsWith('#')) { + i++ + continue + } + const m = line.match(/^([a-zA-Z][\w-]*):\s*(.*)$/) + if (!m) { + i++ + continue + } + const [, key, valueRaw] = m + const value = valueRaw.trim() + if (value === '|' || value === '>') { + // Folded block: collect indented continuation lines. + const collected: string[] = [] + i++ + while (i < lines.length && (lines[i].startsWith(' ') || lines[i].trim() === '')) { + collected.push(lines[i].replace(/^ {2}/, '')) + i++ + } + result[key] = collected.join(value === '|' ? '\n' : ' ').trim() + } else if (value === '') { + // Nested object — collect indented lines. 
+ const nested: Record = {} + i++ + while (i < lines.length && (lines[i].startsWith(' ') || lines[i].trim() === '')) { + if (!lines[i].trim()) { + i++ + continue + } + const nm = lines[i].match(/^\s+([a-zA-Z][\w-]*):\s*(.*)$/) + if (nm) { + nested[nm[1]] = parseScalarOrList(nm[2].trim()) + } + i++ + } + result[key] = nested + } else { + result[key] = parseScalarOrList(value) + i++ + } + } + return result +} + +function parseScalarOrList(value: string): unknown { + if (value.startsWith('[') && value.endsWith(']')) { + return value + .slice(1, -1) + .split(',') + .map(s => s.trim().replace(/^['"]|['"]$/g, '')) + .filter(Boolean) + } + // Strip surrounding quotes for plain scalars. + if ((value.startsWith("'") && value.endsWith("'")) || (value.startsWith('"') && value.endsWith('"'))) { + return value.slice(1, -1) + } + if (value === 'true') return true + if (value === 'false') return false + if (/^-?\d+(\.\d+)?$/.test(value)) return Number(value) + return value +} diff --git a/src/design/audit/ethics/rules/finance.yaml b/src/design/audit/ethics/rules/finance.yaml new file mode 100644 index 0000000..fa385e6 --- /dev/null +++ b/src/design/audit/ethics/rules/finance.yaml @@ -0,0 +1,37 @@ +# Financial product ethics rules. +# Anchored to CFPB Reg E / Reg Z, EU PSD2, and SEC Rule 10b-10 disclosures. + +- ruleId: finance:fees-disclosed-pre-commitment + category: finance + severity: critical-floor + appliesWhen: + type: [ecommerce] + domain: [fintech, finance, banking, payments] + detector: + kind: llm-classifier + llmCheck: Does this checkout page hide, defer, or fail to itemize any fees, taxes, FX, or charges that the user will pay, with the pay/commit button reachable before the totals are fully visible? + remediation: Surface every line item (fees, taxes, FX) above the pay button. 
+ citation: CFPB 12 CFR 1026 Reg Z / FTC Junk Fees Rule + +- ruleId: finance:apr-or-effective-rate-shown + category: finance + severity: major-floor + appliesWhen: + domain: [fintech, finance, banking, payments, lending] + detector: + kind: pattern-absent + pattern: (APR|annual percentage rate|effective (interest )?rate|finance charge) + remediation: Display APR or effective interest rate alongside any credit, lending, or yield-bearing offer. + citation: TILA 15 USC 1638 / CFPB 12 CFR 1026.18 + +- ruleId: finance:risk-disclosure-required + category: finance + severity: major-floor + appliesWhen: + domain: [fintech, finance, crypto, investing, trading] + audienceVulnerability: [high-stakes-financial] + detector: + kind: llm-classifier + llmCheck: Does this trading / investing / staking page allow the user to commit funds WITHOUT a clearly visible disclosure of principal-loss, volatility, or non-FDIC-insurance risk above the action button? + remediation: Show risk-of-loss and insurance-status disclosures above the action button on any investing, trading, or staking surface. + citation: SEC Rule 10b-10 / FINRA Rule 2210 diff --git a/src/design/audit/ethics/rules/kids.yaml b/src/design/audit/ethics/rules/kids.yaml new file mode 100644 index 0000000..d6473a3 --- /dev/null +++ b/src/design/audit/ethics/rules/kids.yaml @@ -0,0 +1,37 @@ +# Kids-targeted product ethics rules. +# Anchored to COPPA + UK ICO Age Appropriate Design Code (AADC). + +- ruleId: kids:dark-patterns-prohibited + category: kids + severity: critical-floor + appliesWhen: + audience: [kids] + detector: + kind: llm-classifier + llmCheck: Does this page use any dark pattern (hidden costs, forced action, fake urgency, confirmshaming, manipulated visual hierarchy) targeting a minor user? + remediation: Remove all dark patterns. Use clear, age-appropriate, friction-symmetric flows. 
+  citation: COPPA 16 CFR 312.5 / FTC Dark Patterns Report 2022
+
+- ruleId: kids:age-gate-required
+  category: kids
+  severity: critical-floor
+  appliesWhen:
+    audience: [kids]
+    audienceVulnerability: [minor-facing]
+  detector:
+    kind: pattern-absent
+    pattern: \b(age|date of birth|verify your age)\b
+  remediation: Implement an age gate before collecting any data or showing user-generated content.
+  citation: COPPA 16 CFR 312.5
+
+- ruleId: kids:data-minimization-default
+  category: kids
+  severity: major-floor
+  appliesWhen:
+    audience: [kids]
+    regulatoryContext: [coppa]
+  detector:
+    kind: llm-classifier
+    llmCheck: Does this page request data fields from a child that are NOT strictly necessary for the activity (e.g. unmarked optional fields, profile / social / friend-graph fields, location or contact data not required for the task)?
+  remediation: Default to data minimization for child users — collect only what is strictly necessary, mark every optional field, and disable profile/social fields by default.
+  citation: ICO AADC Standard 8 / COPPA 16 CFR 312.7
diff --git a/src/design/audit/ethics/rules/legal.yaml b/src/design/audit/ethics/rules/legal.yaml
new file mode 100644
index 0000000..4df0e3e
--- /dev/null
+++ b/src/design/audit/ethics/rules/legal.yaml
@@ -0,0 +1,35 @@
+# Legal / regulatory consent rules.
+# Anchored to GDPR, ePrivacy Directive, and CCPA/CPRA notice-at-collection.
+
+- ruleId: legal:gdpr-cookie-consent
+  category: legal
+  severity: major-floor
+  appliesWhen:
+    regulatoryContext: [gdpr]
+  detector:
+    kind: pattern-absent
+    pattern: (cookie|consent|necessary|preferences)
+  remediation: Display GDPR-compliant cookie consent banner with granular controls.
+  citation: GDPR Art. 7 / ePrivacy Directive 2002/58/EC Art. 
5(3)
+
+- ruleId: legal:privacy-policy-link-required
+  category: legal
+  severity: major-floor
+  appliesWhen:
+    regulatoryContext: [gdpr]
+  detector:
+    kind: pattern-absent
+    pattern: (privacy policy|privacy notice|data protection)
+  remediation: Provide a clearly labeled privacy policy / notice link on every page that processes personal data.
+  citation: GDPR Art. 13-14
+
+- ruleId: legal:ccpa-do-not-sell-link
+  category: legal
+  severity: major-floor
+  appliesWhen:
+    regulatoryContext: [ccpa]
+  detector:
+    kind: pattern-absent
+    pattern: (do not sell|do not share|opt.?out|your privacy choices)
+  remediation: Provide a "Do Not Sell or Share My Personal Information" / "Your Privacy Choices" link in the global footer.
+  citation: CCPA Cal. Civ. Code 1798.135 / CPRA 1798.121
diff --git a/src/design/audit/ethics/rules/medical.yaml b/src/design/audit/ethics/rules/medical.yaml
new file mode 100644
index 0000000..e21129c
--- /dev/null
+++ b/src/design/audit/ethics/rules/medical.yaml
@@ -0,0 +1,36 @@
+# Medical / clinical / pharmacy ethics rules.
+# Anchored to FDA labeling regs, ICH E2 pharmacovigilance, and ONC info-blocking.
+
+- ruleId: medical:dosage-warning-required
+  category: medical
+  severity: critical-floor
+  appliesWhen:
+    domain: [medical, clinical, pharmacy]
+  detector:
+    kind: pattern-absent
+    pattern: (dosage|warning|contraindication|adverse|side effect)
+  remediation: Display dosage warnings, contraindications, and adverse-effect summaries before any prescription action.
+  citation: FDA 21 CFR 201.57
+
+- ruleId: medical:adverse-event-reporting-path
+  category: medical
+  severity: major-floor
+  appliesWhen:
+    domain: [medical, clinical, pharmacy]
+  detector:
+    kind: pattern-absent
+    pattern: (report.{0,20}(side effect|adverse|reaction)|MedWatch|FDA 1088|pharmacovigilance)
+  remediation: Provide a visible adverse-event reporting path (MedWatch link or equivalent) on any prescribing or dispensing surface. 
+ citation: FDA 21 CFR 314.80 / ICH E2D + +- ruleId: medical:patient-education-disclosure + category: medical + severity: major-floor + appliesWhen: + domain: [medical, clinical, pharmacy] + audienceVulnerability: [patient-facing] + detector: + kind: llm-classifier + llmCheck: Does this patient-facing page request a clinical action or health-data submission WITHOUT first presenting plain-language patient education (indications, risks, instructions for use)? + remediation: Surface plain-language patient education (indications, risks, instructions) ahead of any clinical action a patient can take. + citation: ONC 45 CFR 170.315(g)(9) Patient Education diff --git a/src/design/audit/evaluate.ts b/src/design/audit/evaluate.ts index 745c696..5e8481b 100644 --- a/src/design/audit/evaluate.ts +++ b/src/design/audit/evaluate.ts @@ -257,7 +257,10 @@ export function resolveAuditPasses( const raw = value?.trim().toLowerCase() if (!raw || raw === 'standard' || raw === 'single' || raw === 'default') return ['standard'] - if (raw === 'deep' || raw === 'parallel' || raw === 'full') { + // Layer 1 — `auto` is the new default for the v2 path: classification-aware + // selection mirroring `deep`. The pipeline runs the ensemble classifier + // first, then this picks the focused pass bundle for that page type. + if (raw === 'auto' || raw === 'deep' || raw === 'parallel' || raw === 'full') { return deepPassesForClassification(options?.classification, options?.overrides) } if (raw === 'max' || raw === 'exhaustive') return ['product', 'visual', 'trust', 'workflow', 'content'] diff --git a/src/design/audit/first-principles-mode.ts b/src/design/audit/first-principles-mode.ts new file mode 100644 index 0000000..29672e4 --- /dev/null +++ b/src/design/audit/first-principles-mode.ts @@ -0,0 +1,98 @@ +/** + * Layer 3 — First-principles fallback. + * + * When the ensemble classifier is uncertain the auditor does not fabricate a + * classification. 
This module decides when to trigger first-principles mode + * and queues NovelPatternObservations for fleet mining. + */ + +import * as fs from 'node:fs' +import * as fsp from 'node:fs/promises' +import * as path from 'node:path' +import * as os from 'node:os' +import * as crypto from 'node:crypto' +import type { EnsembleClassification, NovelPatternObservation, PageType } from './v2/types.js' + +export interface FirstPrinciplesOptions { + /** Override the minimum ensemble confidence threshold (default 0.6). */ + confidenceThreshold?: number +} + +/** + * Returns true when first-principles mode should fire. + * + * Trigger conditions (ANY of): + * - ensembleConfidence < threshold (default 0.6) + * - signalsAgreed === false + * - classification.type === 'unknown' + * - LLM explicitly emitted first_principles_mode: true + */ +export function shouldTriggerFirstPrinciples( + classification: EnsembleClassification, + opts?: FirstPrinciplesOptions, +): boolean { + const threshold = opts?.confidenceThreshold ?? 0.6 + if (classification.ensembleConfidence < threshold) return true + if (!classification.signalsAgreed) return true + if ((classification.type as string) === 'unknown') return true + if (classification.firstPrinciplesMode) return true + return false +} + +/** + * Build a NovelPatternObservation from the classification and runtime context. + * The `observationId` is stable: same pageRef + capturedAt minute → same id. + */ +export function buildNovelPatternObservation(args: { + classification: EnsembleClassification + pageRef: string + observedSignals?: string + snapshotKey?: string +}): NovelPatternObservation { + const capturedAt = new Date().toISOString() + const observationId = crypto + .createHash('sha256') + .update(`${args.pageRef}::${capturedAt.slice(0, 16)}`) + .digest('hex') + .slice(0, 16) + + return { + observationId, + capturedAt, + observed: args.observedSignals ?? 
'No specific signal description provided.', + closestType: args.classification.type as PageType, + closestConfidence: args.classification.ensembleConfidence, + pageRef: args.pageRef, + ...(args.snapshotKey ? { snapshotKey: args.snapshotKey } : {}), + } +} + +/** + * Append a NovelPatternObservation as a JSONL line to the date-stamped sink. + * Default dir: `~/.bad/novel-patterns/`. Each line is valid JSON on its own. + */ +export async function appendNovelPatternObservation( + observation: NovelPatternObservation, + dir?: string, +): Promise { + const sinkDir = dir ?? path.join(os.homedir(), '.bad', 'novel-patterns') + await fsp.mkdir(sinkDir, { recursive: true }) + const date = observation.capturedAt.slice(0, 10) + const filePath = path.join(sinkDir, `${date}.jsonl`) + const line = JSON.stringify(observation) + '\n' + await fsp.appendFile(filePath, line, 'utf-8') +} + +/** + * Synchronous variant — for use in pipeline paths that aren't async. + */ +export function appendNovelPatternObservationSync( + observation: NovelPatternObservation, + dir?: string, +): void { + const sinkDir = dir ?? path.join(os.homedir(), '.bad', 'novel-patterns') + fs.mkdirSync(sinkDir, { recursive: true }) + const date = observation.capturedAt.slice(0, 10) + const filePath = path.join(sinkDir, `${date}.jsonl`) + fs.appendFileSync(filePath, JSON.stringify(observation) + '\n', 'utf-8') +} diff --git a/src/design/audit/modality/android.ts b/src/design/audit/modality/android.ts new file mode 100644 index 0000000..4ec62e6 --- /dev/null +++ b/src/design/audit/modality/android.ts @@ -0,0 +1,24 @@ +/** + * Layer 8 — Android modality adapter (stub). + * + * UI Automator + accessibility-tree capture. Not yet implemented. + * + * TODO Layer 8: UI Automator bridge, emulator management, ax-tree capture. 
+ */ + +import type { ModalityAdapter, ModalityInput, Evidence } from '../v2/types.js' + +export class AndroidModalityAdapter implements ModalityAdapter { + readonly modality = 'android' as const + + async capture(_input: ModalityInput): Promise { + throw new Error( + 'Android modality adapter is not yet implemented. ' + + 'See RFC-002 Layer 8 for the implementation plan. ' + + 'Ship iOS first per the RFC sequencing note. ' + + 'Use --modality html for web audits.', + ) + } +} + +export const androidAdapter = new AndroidModalityAdapter() diff --git a/src/design/audit/modality/html.ts b/src/design/audit/modality/html.ts new file mode 100644 index 0000000..ae32532 --- /dev/null +++ b/src/design/audit/modality/html.ts @@ -0,0 +1,56 @@ +/** + * Layer 8 — HTML modality adapter. + * + * Wraps the existing Playwright-based capture pipeline into the `ModalityAdapter` + * interface so it can participate in the unified scoring framework. The underlying + * pipeline is unchanged; this module provides the typed adapter boundary. + */ + +import type { ModalityAdapter, ModalityInput, Evidence, MeasurementBundle } from '../v2/types.js' + +export class HtmlModalityAdapter implements ModalityAdapter { + readonly modality = 'html' as const + + /** + * Capture HTML evidence. Delegates to the existing browser-based pipeline. + * In practice, `pipeline.ts` drives this; the adapter exists to make the + * interface explicit and enable Layer 8's modality dispatch. + * + * @param input.entryPoint - URL to audit + * @param input.flow - optional page flow (multi-page audit) + */ + async capture(input: ModalityInput): Promise { + // The real implementation lives in pipeline.ts / measure/index.ts. + // This adapter records the contract and is called by the pipeline dispatcher. + // When a caller invokes adapter.capture() directly, it returns a shell + // Evidence that the pipeline will hydrate with real snapshot + measurements. 
+ const shell: Evidence = { + modality: 'html', + surfaces: [], + measurements: emptyMeasurementBundle(), + snapshot: '', + screenshot: undefined, + } + void input + return shell + } +} + +function emptyMeasurementBundle(): MeasurementBundle { + return { + contrast: { + totalChecked: 0, + aaFailures: [], + aaaFailures: [], + summary: { aaPassRate: 1, aaaPassRate: 1 }, + }, + a11y: { + ran: true, + violations: [], + passes: 0, + }, + hasBlockingIssues: false, + } +} + +export const htmlAdapter = new HtmlModalityAdapter() diff --git a/src/design/audit/modality/index.ts b/src/design/audit/modality/index.ts new file mode 100644 index 0000000..a95adca --- /dev/null +++ b/src/design/audit/modality/index.ts @@ -0,0 +1,19 @@ +import type { Modality, ModalityAdapter } from './types.js' +import { htmlAdapter } from './html.js' +import { iosAdapter } from './ios.js' +import { androidAdapter } from './android.js' + +const ADAPTERS: Record = { + html: htmlAdapter, + ios: iosAdapter, + android: androidAdapter, + terminal: { modality: 'terminal', capture: async () => { throw new Error('terminal modality not implemented') } }, + voice: { modality: 'voice', capture: async () => { throw new Error('voice modality not implemented') } }, +} + +export function getModalityAdapter(modality: Modality): ModalityAdapter { + return ADAPTERS[modality] +} + +export { htmlAdapter, iosAdapter, androidAdapter } +export type { Modality, ModalityAdapter } diff --git a/src/design/audit/modality/ios.ts b/src/design/audit/modality/ios.ts new file mode 100644 index 0000000..83fce5e --- /dev/null +++ b/src/design/audit/modality/ios.ts @@ -0,0 +1,25 @@ +/** + * Layer 8 — iOS modality adapter (stub). + * + * XCUITest + accessibility-tree capture. Not yet implemented. + * Ship the interface so CLI dispatch and type-checking work; native + * bridging will be added once the HTML adapter's abstraction is validated. + * + * TODO Layer 8: XCUITest bridge, simulator management, ax-tree capture. 
+ */ + +import type { ModalityAdapter, ModalityInput, Evidence } from '../v2/types.js' + +export class IosModalityAdapter implements ModalityAdapter { + readonly modality = 'ios' as const + + async capture(_input: ModalityInput): Promise { + throw new Error( + 'iOS modality adapter is not yet implemented. ' + + 'See RFC-002 Layer 8 for the implementation plan. ' + + 'Use --modality html for web audits.', + ) + } +} + +export const iosAdapter = new IosModalityAdapter() diff --git a/src/design/audit/modality/types.ts b/src/design/audit/modality/types.ts new file mode 100644 index 0000000..d769314 --- /dev/null +++ b/src/design/audit/modality/types.ts @@ -0,0 +1,16 @@ +/** + * Layer 8 — Modality adapter type contract. + * + * Re-exports the stable shapes from v2/types.ts. Each adapter (HTML, iOS, + * Android) implements the ModalityAdapter interface and produces an Evidence + * record that flows into the shared Layers 1–7 scoring pipeline unchanged. + */ + +export type { + Modality, + ModalityAdapter, + ModalityInput, + Evidence, + SurfaceRecord, + SurfaceMeasurements, +} from '../v2/types.js' diff --git a/src/design/audit/patches/index.ts b/src/design/audit/patches/index.ts new file mode 100644 index 0000000..de59a33 --- /dev/null +++ b/src/design/audit/patches/index.ts @@ -0,0 +1,7 @@ +export { parsePatch, parsePatches } from './parse.js' +export { validatePatch, validatePatches } from './validate.js' +export { renderUnifiedDiff, renderPatchSummary } from './render.js' +export { enforcePatchPolicy } from './severity-enforcement.js' +export type { ParseResult } from './parse.js' +export type { ValidationResult, ValidationReason } from './validate.js' +export type { EnforcementResult, EnforcementRecord } from './severity-enforcement.js' diff --git a/src/design/audit/patches/parse.ts b/src/design/audit/patches/parse.ts new file mode 100644 index 0000000..12f9a04 --- /dev/null +++ b/src/design/audit/patches/parse.ts @@ -0,0 +1,165 @@ +/** + * Patch parser — converts raw 
LLM JSON output into typed `Patch` objects. + * + * Strict shape validation. On schema mismatch returns `{ patch: null, reason }` + * rather than throwing — the calling pipeline batches many candidate patches + * per audit and a single malformed entry must not abort the whole run. + */ + +import type { + Patch, + PatchRollback, + PatchRollbackKind, + PatchTarget, + PatchTest, + PatchTestKind, + ConfidenceLevel, + Dimension, +} from '../v2/types.js' + +type PatchScope = 'page' | 'section' | 'component' | 'system' +type PatchTargetScope = 'tsx' | 'jsx' | 'css' | 'tailwind' | 'module-css' | 'styled-component' | 'structural' | 'html' +type PatchDeltaConfidence = ConfidenceLevel | 'untested' + +const VALID_SCOPES: PatchScope[] = ['page', 'section', 'component', 'system'] +const VALID_TARGET_SCOPES: PatchTargetScope[] = [ + 'tsx', 'jsx', 'css', 'tailwind', 'module-css', 'styled-component', 'structural', 'html', +] +const VALID_TEST_KINDS: PatchTestKind[] = [ + 'storybook', 'a11y-rule', 'visual-snapshot', 'unit', 'rerun-audit', 'manual', +] +const VALID_ROLLBACK_KINDS: PatchRollbackKind[] = ['git-revert', 'css-disable', 'manual'] +const VALID_CONFIDENCES: PatchDeltaConfidence[] = ['high', 'medium', 'low', 'untested'] + +export interface ParseResult { + patch: Patch | null + reason?: string +} + +function isObject(value: unknown): value is Record { + return typeof value === 'object' && value !== null && !Array.isArray(value) +} + +function isString(value: unknown): value is string { + return typeof value === 'string' && value.length > 0 +} + +function oneOf(value: unknown, allowed: readonly T[]): value is T { + return typeof value === 'string' && (allowed as readonly string[]).includes(value) +} + +function parseTarget(raw: unknown): PatchTarget | string { + if (!isObject(raw)) return 'target: not an object' + if (!oneOf(raw.scope, VALID_TARGET_SCOPES)) return `target.scope: invalid (got ${String(raw.scope)})` + const target: PatchTarget = { scope: raw.scope } + if 
(raw.filePath !== undefined) { + if (!isString(raw.filePath)) return 'target.filePath: must be non-empty string' + target.filePath = raw.filePath + } + if (raw.componentName !== undefined) { + if (!isString(raw.componentName)) return 'target.componentName: must be non-empty string' + target.componentName = raw.componentName + } + if (raw.cssSelector !== undefined) { + if (!isString(raw.cssSelector)) return 'target.cssSelector: must be non-empty string' + target.cssSelector = raw.cssSelector + } + return target +} + +function parseTest(raw: unknown): PatchTest | string { + if (!isObject(raw)) return 'testThatProves: not an object' + if (!oneOf(raw.kind, VALID_TEST_KINDS)) return `testThatProves.kind: invalid (got ${String(raw.kind)})` + if (!isString(raw.description)) return 'testThatProves.description: must be non-empty string' + const test: PatchTest = { kind: raw.kind, description: raw.description } + if (raw.command !== undefined) { + if (typeof raw.command !== 'string') return 'testThatProves.command: must be string when present' + test.command = raw.command + } + return test +} + +function parseRollback(raw: unknown): PatchRollback | string { + if (!isObject(raw)) return 'rollback: not an object' + if (!oneOf(raw.kind, VALID_ROLLBACK_KINDS)) return `rollback.kind: invalid (got ${String(raw.kind)})` + const rollback: PatchRollback = { kind: raw.kind } + if (raw.instruction !== undefined) { + if (typeof raw.instruction !== 'string') return 'rollback.instruction: must be string when present' + rollback.instruction = raw.instruction + } + return rollback +} + +/** + * Parse a single raw LLM-produced object into a `Patch`. Returns + * `{ patch: null, reason }` on any schema violation. 
+ */ +export function parsePatch(raw: unknown): ParseResult { + if (!isObject(raw)) return { patch: null, reason: 'patch: not an object' } + if (!isString(raw.patchId)) return { patch: null, reason: 'patchId: required non-empty string' } + if (!isString(raw.findingId)) return { patch: null, reason: 'findingId: required non-empty string' } + if (!oneOf(raw.scope, VALID_SCOPES)) return { patch: null, reason: `scope: invalid (got ${String(raw.scope)})` } + + const target = parseTarget(raw.target) + if (typeof target === 'string') return { patch: null, reason: target } + + if (!isObject(raw.diff)) return { patch: null, reason: 'diff: not an object' } + if (!isString(raw.diff.before)) return { patch: null, reason: 'diff.before: required non-empty string' } + if (typeof raw.diff.after !== 'string') return { patch: null, reason: 'diff.after: required string' } + const diff = { + before: raw.diff.before, + after: raw.diff.after, + ...(typeof raw.diff.unifiedDiff === 'string' ? { unifiedDiff: raw.diff.unifiedDiff } : {}), + } + + const test = parseTest(raw.testThatProves) + if (typeof test === 'string') return { patch: null, reason: test } + + const rollback = parseRollback(raw.rollback) + if (typeof rollback === 'string') return { patch: null, reason: rollback } + + if (!isObject(raw.estimatedDelta)) return { patch: null, reason: 'estimatedDelta: not an object' } + if (!isString(raw.estimatedDelta.dim)) return { patch: null, reason: 'estimatedDelta.dim: required' } + if (typeof raw.estimatedDelta.delta !== 'number' || !Number.isFinite(raw.estimatedDelta.delta)) { + return { patch: null, reason: 'estimatedDelta.delta: must be finite number' } + } + + if (!oneOf(raw.estimatedDeltaConfidence, VALID_CONFIDENCES)) { + return { patch: null, reason: `estimatedDeltaConfidence: invalid (got ${String(raw.estimatedDeltaConfidence)})` } + } + + const patch: Patch = { + patchId: raw.patchId, + findingId: raw.findingId, + scope: raw.scope, + target, + diff, + testThatProves: test, + 
rollback, + estimatedDelta: { dim: raw.estimatedDelta.dim as Dimension, delta: raw.estimatedDelta.delta }, + estimatedDeltaConfidence: raw.estimatedDeltaConfidence, + ...(typeof raw.matchedPatternId === 'string' ? { matchedPatternId: raw.matchedPatternId } : {}), + } + return { patch } +} + +/** + * Parse an array of raw patch objects. Invalid entries are dropped from the + * returned `patches` and reported in `errors` with their original index. + */ +export function parsePatches(raw: unknown): { + patches: Patch[] + errors: Array<{ index: number; reason: string }> +} { + if (!Array.isArray(raw)) { + return { patches: [], errors: [{ index: -1, reason: 'patches: not an array' }] } + } + const patches: Patch[] = [] + const errors: Array<{ index: number; reason: string }> = [] + for (let i = 0; i < raw.length; i++) { + const result = parsePatch(raw[i]) + if (result.patch) patches.push(result.patch) + else errors.push({ index: i, reason: result.reason ?? 'unknown' }) + } + return { patches, errors } +} diff --git a/src/design/audit/patches/render.ts b/src/design/audit/patches/render.ts new file mode 100644 index 0000000..5b11f77 --- /dev/null +++ b/src/design/audit/patches/render.ts @@ -0,0 +1,57 @@ +/** + * Patch renderer — produces a unified diff from a Patch when filePath is known. + * + * Agents can pipe the result to `git apply --check` then `git apply`. + * When filePath is unknown, returns null — the agent must use before/after for + * search-replace instead. + */ + +import type { Patch } from '../v2/types.js' + +/** + * Render a minimal unified diff (1-hunk, 3 lines context) from a patch. 
+ * Returns null when: + * - `target.filePath` is not set (no file to diff against) + * - `unifiedDiff` is already set on the patch (prefer the LLM's version) + */ +export function renderUnifiedDiff(patch: Patch): string | null { + if (patch.diff.unifiedDiff) return patch.diff.unifiedDiff + if (!patch.target.filePath) return null + + const { before, after } = patch.diff + const filePath = patch.target.filePath + + const beforeLines = before.split('\n') + const afterLines = after.split('\n') + + const removals = beforeLines.map(l => `- ${l}`) + const additions = afterLines.map(l => `+ ${l}`) + + const hunkOldLen = beforeLines.length + const hunkNewLen = afterLines.length + + return [ + `--- a/${filePath}`, + `+++ b/${filePath}`, + `@@ -1,${hunkOldLen} +1,${hunkNewLen} @@`, + ...removals, + ...additions, + ].join('\n') +} + +/** + * Render a human-readable patch summary for display in report.md. + */ +export function renderPatchSummary(patch: Patch): string { + const parts: string[] = [] + parts.push(`**Patch ${patch.patchId}** (${patch.scope})`) + if (patch.target.filePath) parts.push(`File: \`${patch.target.filePath}\``) + else if (patch.target.cssSelector) parts.push(`Selector: \`${patch.target.cssSelector}\``) + else if (patch.target.componentName) parts.push(`Component: \`${patch.target.componentName}\``) + parts.push(`\`\`\`diff\n${renderUnifiedDiff(patch) ?? `- ${patch.diff.before}\n+ ${patch.diff.after}`}\n\`\`\``) + parts.push(`Test: ${patch.testThatProves.description}`) + if (patch.testThatProves.command) parts.push(`Command: \`${patch.testThatProves.command}\``) + parts.push(`Rollback: ${patch.rollback.kind}${patch.rollback.instruction ? ` — ${patch.rollback.instruction}` : ''}`) + parts.push(`Estimated Δ: ${patch.estimatedDelta.dim} ${patch.estimatedDelta.delta > 0 ? 
'+' : ''}${patch.estimatedDelta.delta} (confidence: ${patch.estimatedDeltaConfidence})`) + return parts.join('\n') +} diff --git a/src/design/audit/patches/severity-enforcement.ts b/src/design/audit/patches/severity-enforcement.ts new file mode 100644 index 0000000..183ecfe --- /dev/null +++ b/src/design/audit/patches/severity-enforcement.ts @@ -0,0 +1,63 @@ +/** + * Severity enforcement — every major/critical finding MUST have ≥1 valid patch. + * + * Findings without patches are downgraded to `minor` with an explanatory note. + * This runs as a post-processing step after patch validation. + */ + +import type { Patch, DesignFinding } from '../v2/types.js' + +export interface EnforcementRecord { + findingId: string + fromSeverity: string + toSeverity: 'minor' + reason: string +} + +export interface EnforcementResult { + findings: DesignFinding[] + downgraded: EnforcementRecord[] +} + +/** + * Given a list of findings and the set of valid patches (post-validation), + * downgrade any major/critical finding that has no valid patch to `minor`. + */ +export function enforcePatchPolicy( + findings: DesignFinding[], + validPatchIds: Set, +): EnforcementResult { + const downgraded: EnforcementRecord[] = [] + + const updated = findings.map(f => { + if (f.severity !== 'major' && f.severity !== 'critical') return f + + const v2Finding = f as DesignFinding & { patches?: Patch[] } + const patches = v2Finding.patches ?? [] + const hasValidPatch = patches.some(p => validPatchIds.has(p.patchId)) + + if (hasValidPatch) return f + + downgraded.push({ + findingId: f.id, + fromSeverity: f.severity, + toSeverity: 'minor', + reason: patches.length === 0 + ? 
'no patches proposed' + : 'all proposed patches failed validation (before not in snapshot, missing locator, or delta out of range)', + }) + + return { + ...f, + severity: 'minor' as const, + suggestion: [ + f.suggestion, + '[auto-downgraded: patch required for major/critical severity]', + ] + .filter(Boolean) + .join(' '), + } + }) + + return { findings: updated, downgraded } +} diff --git a/src/design/audit/patches/validate.ts b/src/design/audit/patches/validate.ts new file mode 100644 index 0000000..5089c99 --- /dev/null +++ b/src/design/audit/patches/validate.ts @@ -0,0 +1,74 @@ +/** + * Patch validator — given a parsed patch and the page snapshot text, verify + * that the patch is grounded and applyable. + * + * Rules: + * - `diff.before` must appear as a case-sensitive substring of the snapshot. + * Agents apply patches literally; a hallucinated `before` is unfixable. + * - `target` must carry at least one locator (cssSelector | filePath | + * componentName). Without one the agent has nowhere to apply. + * - `estimatedDelta.delta` must be in [-3, 3]. Larger claims are almost + * always over-confident on a 1–10 scale. + */ + +import type { Patch } from '../v2/types.js' + +export type ValidationReason = + | 'before-not-in-snapshot' + | 'target-missing-locator' + | 'estimated-delta-out-of-range' + | 'before-empty' + +export interface ValidationResult { + valid: boolean + reasons: ValidationReason[] +} + +const DELTA_MIN = -3 +const DELTA_MAX = 3 + +/** + * Validate a single patch against a page snapshot. Reports all issues in one + * pass so callers can surface every problem to the agent at once. 
+ */ +export function validatePatch(patch: Patch, snapshot: string): ValidationResult { + const reasons: ValidationReason[] = [] + const { target, diff, estimatedDelta } = patch + + if (!target.cssSelector && !target.filePath && !target.componentName) { + reasons.push('target-missing-locator') + } + + if (diff.before.length === 0) { + reasons.push('before-empty') + } else if (!snapshot.includes(diff.before)) { + reasons.push('before-not-in-snapshot') + } + + if ( + estimatedDelta.delta < DELTA_MIN || + estimatedDelta.delta > DELTA_MAX || + !Number.isFinite(estimatedDelta.delta) + ) { + reasons.push('estimated-delta-out-of-range') + } + + return { valid: reasons.length === 0, reasons } +} + +/** + * Validate a list of patches and partition into valid / invalid. + */ +export function validatePatches( + patches: Patch[], + snapshot: string, +): { valid: Patch[]; invalid: Array<{ patch: Patch; reasons: ValidationReason[] }> } { + const valid: Patch[] = [] + const invalid: Array<{ patch: Patch; reasons: ValidationReason[] }> = [] + for (const patch of patches) { + const result = validatePatch(patch, snapshot) + if (result.valid) valid.push(patch) + else invalid.push({ patch, reasons: result.reasons }) + } + return { valid, invalid } +} diff --git a/src/design/audit/patterns/match.ts b/src/design/audit/patterns/match.ts new file mode 100644 index 0000000..cb73d91 --- /dev/null +++ b/src/design/audit/patterns/match.ts @@ -0,0 +1,61 @@ +/** + * Layer 5 — Pattern matching. + * + * Fuzzy-matches a page against catalogued patterns. When patterns exist (post + * fleet accumulation), findings include `matchedPatterns[]` so agents can cite + * fleet evidence rather than applying novel patches. + * + * Currently returns [] (cold-start). The interface is stable. 
+ */ + +import type { Pattern, PatternMatch, PatternQuery } from './types.js' +import type { PageType, Dimension } from '../v2/types.js' +import { queryPatterns } from './store.js' + +export interface MatchContext { + pageType: PageType + weakDimensions: Dimension[] + dir?: string +} + +/** + * Match patterns against the current page context. Returns the top-N matches + * ordered by expected leverage (weakest dim × pattern's median delta for that dim). + * + * Cold-start: returns [] until patterns are mined. + */ +export async function matchPatterns( + ctx: MatchContext, + topN: number = 5, +): Promise { + const query: PatternQuery = { + pageType: ctx.pageType, + minApplications: 5, + minSuccessRate: 0.5, + } + const candidates = await queryPatterns(query, ctx.dir) + if (candidates.length === 0) return [] + + const scored: Array<{ pattern: Pattern; leverage: number }> = candidates.map(p => { + const leverage = ctx.weakDimensions.reduce((sum, dim) => { + return sum + (p.fleetEvidence.medianDimDelta[dim] ?? 0) + }, 0) + return { pattern: p, leverage } + }) + + return scored + .sort((a, b) => b.leverage - a.leverage) + .slice(0, topN) + .map(({ pattern, leverage }) => { + const expectedDelta: Record = {} as Record + for (const dim of ctx.weakDimensions) { + expectedDelta[dim] = pattern.fleetEvidence.medianDimDelta[dim] ?? 0 + } + return { + pattern, + matchConfidence: Math.min(1, leverage / 10), + expectedDelta, + applicationGuidance: `Apply ${pattern.scaffold.description}. Key decisions: ${pattern.scaffold.keyDecisions.join('; ')}.`, + } + }) +} diff --git a/src/design/audit/patterns/mine.ts b/src/design/audit/patterns/mine.ts new file mode 100644 index 0000000..2bbe967 --- /dev/null +++ b/src/design/audit/patterns/mine.ts @@ -0,0 +1,49 @@ +/** + * Layer 5 — Pattern mining (scaffold). + * + * In production this runs as a Cloudflare Worker cron job on accumulated + * PatchApplication telemetry. 
The mining threshold (N≥30, ≥5 tenants, + * replicationRate≥0.7) prevents false patterns from premature data. + * + * Until fleet data accumulates this module is a scaffold. Run: + * pnpm patterns:mine --dir ~/.bad + * + * TODO: implement clustering algorithm once sufficient attribution data exists. + */ + +import type { PatchApplication } from '../attribution/types.js' +import type { Pattern } from './types.js' +import { savePattern } from './store.js' + +export interface MineOptions { + minApplications?: number + minTenants?: number + minReplicationRate?: number + dir?: string +} + +const DEFAULTS: Required> = { + minApplications: 30, + minTenants: 5, + minReplicationRate: 0.7, +} + +/** + * Mine patterns from accumulated PatchApplication records. + * + * Currently a stub — returns 0 mined until clustering is implemented. + * The interface is stable; consumers can call it safely in tests via synthetic + * data without triggering real fleet operations. + */ +export async function minePatterns( + applications: PatchApplication[], + opts: MineOptions = {}, +): Promise<{ mined: number; skipped: number }> { + void applications + void opts + void DEFAULTS + void savePattern + // TODO: implement structural clustering by (scope, target.cssSelector pattern, + // diff similarity) once N≥30 fleet data is available. See RFC §Layer 5. + return { mined: 0, skipped: applications.length } +} diff --git a/src/design/audit/patterns/store.ts b/src/design/audit/patterns/store.ts new file mode 100644 index 0000000..3185568 --- /dev/null +++ b/src/design/audit/patterns/store.ts @@ -0,0 +1,51 @@ +/** + * Layer 5 — Pattern store. + * + * Reads/writes patterns from a JSONL file. In production this is backed by a + * Cloudflare D1 or R2 store; the JSONL backend is for local dev and tests. + * + * Cold-start: the pattern library is empty until fleet data accumulates. + * The store returns [] for all queries until patterns are mined (Layer 5 mine.ts). 
+ */ + +import * as fs from 'node:fs' +import * as fsp from 'node:fs/promises' +import * as path from 'node:path' +import * as os from 'node:os' +import type { Pattern, PatternQuery } from './types.js' + +const DEFAULT_DIR = path.join(os.homedir(), '.bad', 'patterns') +const PATTERNS_FILE = 'patterns.jsonl' + +export async function loadPatterns(dir: string = DEFAULT_DIR): Promise<Pattern[]> { + const filePath = path.join(dir, PATTERNS_FILE) + if (!fs.existsSync(filePath)) return [] + const lines = fs.readFileSync(filePath, 'utf-8').split('\n').filter(Boolean) + return lines.flatMap(line => { + try { return [JSON.parse(line) as Pattern] } + catch { return [] } + }) +} + +export async function savePattern(pattern: Pattern, dir: string = DEFAULT_DIR): Promise<void> { + await fsp.mkdir(dir, { recursive: true }) + await fsp.appendFile(path.join(dir, PATTERNS_FILE), JSON.stringify(pattern) + '\n', 'utf-8') +} + +export async function queryPatterns( + query: PatternQuery, + dir: string = DEFAULT_DIR, +): Promise<Pattern[]> { + const all = await loadPatterns(dir) + return all.filter(p => { + if (query.category && p.category !== query.category) return false + if (query.pageType && p.classification.type !== query.pageType) return false + if (query.minApplications && p.fleetEvidence.applications < query.minApplications) return false + if (query.minSuccessRate && p.fleetEvidence.successRate < query.minSuccessRate) return false + if (query.weakDimension) { + const delta = p.fleetEvidence.medianDimDelta[query.weakDimension] ?? 0 + if (delta <= 0) return false + } + return true + }) +} diff --git a/src/design/audit/patterns/types.ts b/src/design/audit/patterns/types.ts new file mode 100644 index 0000000..7053389 --- /dev/null +++ b/src/design/audit/patterns/types.ts @@ -0,0 +1,53 @@ +/** + * Layer 5 — Pattern library type contract. + * + * Patterns are mined from accumulated PatchApplication data once a cluster + * meets: N≥30 applications across ≥5 distinct tenants, replicationRate≥0.7.
+ * Until fleet data accumulates (≥6 weeks), the pattern library is empty. + * + * This module defines the stable query API so agents can code against it now. + * The mining and matching implementations are scaffolded; real clustering runs + * as a Cloudflare Worker cron once the attribution data accumulates. + */ + +export type { PageType, Dimension } from '../v2/types.js' +import type { PageType, Dimension } from '../v2/types.js' + +export interface PatternScaffold { + description: string + referenceTsx?: string + referenceCss?: string + keyDecisions: string[] +} + +export interface PatternFleetEvidence { + applications: number + successRate: number + medianDimDelta: Record<Dimension, number> + sampleTenants: number +} + +export interface Pattern { + patternId: string + category: string + classification: { type: PageType; tags: string[] } + scaffold: PatternScaffold + scores: { whenFollowed: Record<Dimension, number> } + fleetEvidence: PatternFleetEvidence + fixtures: string[] +} + +export interface PatternQuery { + category?: string + pageType?: PageType + weakDimension?: Dimension + minApplications?: number + minSuccessRate?: number +} + +export interface PatternMatch { + pattern: Pattern + matchConfidence: number + expectedDelta: Record<Dimension, number> + applicationGuidance: string +} diff --git a/src/design/audit/pipeline.ts b/src/design/audit/pipeline.ts index a44574d..248faea 100644 --- a/src/design/audit/pipeline.ts +++ b/src/design/audit/pipeline.ts @@ -18,6 +18,19 @@ import { gatherMeasurements } from './measure/index.js' import { evaluatePage, type AuditPassId, type AuditOverrides } from './evaluate.js' import type { PageAuditResult, PageClassification } from './types.js' import { getTelemetry, shortHash } from '../../telemetry/index.js' +import { loadEthicsRules } from './ethics/loader.js' +import { checkEthics, pageTextBlob } from './ethics/check.js' +import { classifyEnsemble } from './classify-ensemble.js' +import { loadAnchors } from './rubric/anchor-loader.js' +import { buildAuditResultV2 } from
'./v2/build-result.js' +import type { + AudienceTag, + ModalityTag, + RegulatoryContextTag, + AudienceVulnerabilityTag, + EthicsViolation, + EnsembleClassification, +} from './v2/types.js' export interface AuditOnePageOptions { brain: Brain @@ -43,6 +56,17 @@ export interface AuditOnePageOptions { * candidate prompts; production runs leave them undefined. */ overrides?: AuditOverrides + /** + * Layer 7 — bypass the ethics gate entirely. Audited + warned. Test-only. + */ + skipEthics?: boolean + /** Override directory containing ethics `*.yaml` rule files. */ + ethicsRulesDir?: string + /** Layer 6 hints used by ethics + composable predicates. */ + audience?: AudienceTag[] + modality?: ModalityTag[] + regulatoryContext?: RegulatoryContextTag[] + audienceVulnerability?: AudienceVulnerabilityTag[] } const COOKIE_BANNER_SELECTORS = [ @@ -68,7 +92,27 @@ async function dismissCookieBanners(page: Page): Promise { * Audit one page through the full Gen 2 pipeline. */ export async function auditOnePage(opts: AuditOnePageOptions): Promise { - const { brain, driver, page, url, profileOverride, screenshotDir, userRubricsDir, auditPasses, runId, parentRunId, provider, model, overrides } = opts + const { + brain, + driver, + page, + url, + profileOverride, + screenshotDir, + userRubricsDir, + auditPasses, + runId, + parentRunId, + provider, + model, + overrides, + skipEthics, + ethicsRulesDir, + audience, + modality, + regulatoryContext, + audienceVulnerability, + } = opts const startedAt = Date.now() try { @@ -90,7 +134,9 @@ export async function auditOnePage(opts: AuditOnePageOptions): Promise 0) { + const minCap = Math.min(...ethicsViolations.map((v) => v.rollupCap)) + if (typeof result.score === 'number' && result.score > minCap) { + result.preEthicsScore = result.score + result.score = minCap + } + } + result.ethicsViolations = ethicsViolations + } + + // ── 8. 
Layer 1 v2 — multi-dim scoring + rollup, emitted alongside v1 ── + if (ensemble) { + try { + const anchors = loadAnchors() + const anchor = anchors.get(ensemble.type) + const v2 = await buildAuditResultV2({ + brain, + state, + pageRef: url, + ensemble, + rubric, + measurements, + v1Result: result, + anchor, + runId, + }) + result.auditResultV2 = v2 + result.ensembleClassification = ensemble + } catch (v2Err) { + // Don't let v2 failures break v1. Log + move on. + console.warn(`[audit/v2] failed to build v2 result for ${url}: ${(v2Err as Error).message}`) + } + } + if (runId) { const findings = result.findings ?? [] + const ethicsViolations: EthicsViolation[] = result.ethicsViolations ?? [] getTelemetry().emit({ kind: 'design-audit-page', runId, @@ -170,6 +271,9 @@ export async function auditOnePage(opts: AuditOnePageOptions): Promise v.severity === 'critical-floor').length, + ethicsMajorFloor: ethicsViolations.filter((v) => v.severity === 'major-floor').length, }, tags: { pageType: classification.type, @@ -202,6 +306,7 @@ export async function auditOnePage(opts: AuditOnePageOptions): Promise.yaml`. Each + * anchor encodes score-band criteria + reference fixtures so the LLM scores + * an saas-app like Linear's app, not like Linear's marketing site. 
+ * + * Schema: + * type: + * score_9_10: { criteria: string[], fixtures: string[] } + * score_7_8: { criteria: string[], fixtures: string[] } + * score_5_6: { criteria: string[], fixtures: string[] } + * score_3_4: { criteria: string[], fixtures: string[] } + */ + +import * as fs from 'node:fs' +import * as path from 'node:path' +import { fileURLToPath } from 'node:url' +import type { PageType } from '../types.js' + +const __dirname = path.dirname(fileURLToPath(import.meta.url)) +const ANCHORS_DIR = path.join(__dirname, 'anchors') + +export interface AnchorBand { + criteria: string[] + fixtures: string[] +} + +export interface CalibrationAnchor { + type: PageType + score_9_10: AnchorBand + score_7_8: AnchorBand + score_5_6: AnchorBand + score_3_4: AnchorBand +} + +const REQUIRED_BANDS = ['score_9_10', 'score_7_8', 'score_5_6', 'score_3_4'] as const + +/** + * Parse one anchor YAML. Uses a minimal YAML reader that handles the shape: + * type: saas-app + * score_9_10: + * criteria: + * - line one + * - line two + * fixtures: + * - fixture:linear-app + * + * Avoids pulling in a YAML dep for ~9 small files. Throws on malformed input. 
+ */ +export function parseAnchorFile(filePath: string): CalibrationAnchor { + const raw = fs.readFileSync(filePath, 'utf-8') + const parsed = parseAnchorYaml(raw) + + if (!parsed.type || typeof parsed.type !== 'string') { + throw new Error(`anchor ${filePath} missing 'type' field`) + } + + for (const band of REQUIRED_BANDS) { + const node = parsed[band] + if (!node || typeof node !== 'object') { + throw new Error(`anchor ${filePath} missing '${band}' band`) + } + const b = node as { criteria?: unknown; fixtures?: unknown } + if (!Array.isArray(b.criteria) || b.criteria.length === 0) { + throw new Error(`anchor ${filePath} '${band}.criteria' must be a non-empty array`) + } + if (!Array.isArray(b.fixtures) || b.fixtures.length === 0) { + throw new Error(`anchor ${filePath} '${band}.fixtures' must be a non-empty array`) + } + } + + return parsed as unknown as CalibrationAnchor +} + +/** Load all anchors from `anchors/` into a map keyed by PageType. */ +export function loadAnchors(dir: string = ANCHORS_DIR): Map<PageType, CalibrationAnchor> { + const out = new Map<PageType, CalibrationAnchor>() + if (!fs.existsSync(dir)) return out + for (const file of fs.readdirSync(dir)) { + if (!file.endsWith('.yaml') && !file.endsWith('.yml')) continue + const anchor = parseAnchorFile(path.join(dir, file)) + out.set(anchor.type, anchor) + } + return out +} + +/** Render an anchor as a markdown block for prompt injection. */ +export function renderAnchor(anchor: CalibrationAnchor): string { + const band = (label: string, b: AnchorBand): string => + `${label}\n${b.criteria.map((c) => `- ${c}`).join('\n')}\nReferences: ${b.fixtures.join(', ')}` + return [ + `Calibration anchor for ${anchor.type}:`, + band('Score 9-10:', anchor.score_9_10), + band('Score 7-8:', anchor.score_7_8), + band('Score 5-6:', anchor.score_5_6), + band('Score 3-4:', anchor.score_3_4), + ].join('\n\n') +} + +/** + * Minimal YAML parser scoped to the anchor file shape.
Supports: + * key: scalar + * key: + * subkey: scalar + * subkey: + * - list item + * + * Indentation is normalized to spaces; tabs are not supported. + */ +function parseAnchorYaml(text: string): Record<string, unknown> { + const lines = text.split('\n').map((l) => l.replace(/\r$/, '')) + const root: Record<string, unknown> = {} + let i = 0 + + while (i < lines.length) { + const line = lines[i] + if (!line.trim() || line.trim().startsWith('#')) { + i++ + continue + } + const indent = leadingSpaces(line) + if (indent !== 0) { + i++ + continue + } + const m = line.match(/^([a-zA-Z_][\w-]*):\s*(.*)$/) + if (!m) { + i++ + continue + } + const [, key, valueRaw] = m + const value = valueRaw.trim() + if (value === '') { + const { node, nextIndex } = readBlock(lines, i + 1, 2) + root[key] = node + i = nextIndex + } else { + root[key] = parseScalar(value) + i++ + } + } + + return root +} + +function readBlock( + lines: string[], + startIndex: number, + baseIndent: number, +): { node: Record<string, unknown> | string[]; nextIndex: number } { + // Detect: is this a list ("- item") or a map?
+ let i = startIndex + while (i < lines.length && !lines[i].trim()) i++ + if (i >= lines.length) return { node: {}, nextIndex: i } + + const firstIndent = leadingSpaces(lines[i]) + if (firstIndent < baseIndent) return { node: {}, nextIndex: i } + + if (lines[i].trim().startsWith('- ') || lines[i].trim() === '-') { + const items: string[] = [] + while (i < lines.length) { + const line = lines[i] + if (!line.trim()) { + i++ + continue + } + const indent = leadingSpaces(line) + if (indent < baseIndent) break + const trimmed = line.trim() + if (!trimmed.startsWith('-')) break + const item = trimmed.replace(/^-\s*/, '') + items.push(parseScalar(item) as string) + i++ + } + return { node: items, nextIndex: i } + } + + const map: Record<string, unknown> = {} + while (i < lines.length) { + const line = lines[i] + if (!line.trim() || line.trim().startsWith('#')) { + i++ + continue + } + const indent = leadingSpaces(line) + if (indent < baseIndent) break + if (indent > baseIndent) { + i++ + continue + } + const m = line.match(/^\s*([a-zA-Z_][\w-]*):\s*(.*)$/) + if (!m) { + i++ + continue + } + const [, key, valueRaw] = m + const value = valueRaw.trim() + if (value === '') { + const { node, nextIndex } = readBlock(lines, i + 1, baseIndent + 2) + map[key] = node + i = nextIndex + } else { + map[key] = parseScalar(value) + i++ + } + } + return { node: map, nextIndex: i } +} + +function leadingSpaces(line: string): number { + let n = 0 + while (n < line.length && line[n] === ' ') n++ + return n +} + +function parseScalar(raw: string): unknown { + let value = raw.trim() + if (value === '') return '' + if ((value.startsWith('"') && value.endsWith('"')) || (value.startsWith("'") && value.endsWith("'"))) { + value = value.slice(1, -1) + } + if (value === 'true') return true + if (value === 'false') return false + if (value === 'null' || value === '~') return null + if (/^-?\d+$/.test(value)) return Number(value) + if (/^-?\d+\.\d+$/.test(value)) return Number(value) + return value +} diff --git
a/src/design/audit/rubric/anchors/blog.yaml b/src/design/audit/rubric/anchors/blog.yaml new file mode 100644 index 0000000..78c8d45 --- /dev/null +++ b/src/design/audit/rubric/anchors/blog.yaml @@ -0,0 +1,33 @@ +type: blog +score_9_10: + criteria: + - Reading column tuned to 60-75 character measure with deliberate vertical rhythm + - Typography scale supports headings, body, callouts, code, captions distinctly + - Author identity and publication date prominent without being chrome + - Inline media (images, code, embeds) integrate cleanly with text flow + - Navigation between posts (next, related, archive) supports the reading habit + fixtures: + - fixture:stratechery + - fixture:substack-post + - fixture:notion-blog-template +score_7_8: + criteria: + - Strong reading experience but typographic scale is one step shy of intentional + - Author and date present but visually inconsistent + - Inline media works but breaks rhythm at boundaries + fixtures: + - fixture:medium-default +score_5_6: + criteria: + - Default theme typography (no custom scale, line-height defaults) + - Heavy chrome (sidebar, social rail) competing with the article + - Author/date hidden in footer + fixtures: + - fixture:default-ghost +score_3_4: + criteria: + - Article copy crammed against gutters; no measure control + - Inline ads break reading flow every paragraph + - No publication date — content is unanchored in time + fixtures: + - fixture:ad-heavy-article diff --git a/src/design/audit/rubric/anchors/dashboard.yaml b/src/design/audit/rubric/anchors/dashboard.yaml new file mode 100644 index 0000000..7de766d --- /dev/null +++ b/src/design/audit/rubric/anchors/dashboard.yaml @@ -0,0 +1,35 @@ +type: dashboard +score_9_10: + criteria: + - Real metrics with units, time windows, and comparison baselines visible at first glance + - Charts use deliberate scales, axis labels, and selective color (not rainbow defaults) + - Filters, time ranges, and segment selectors are obvious and persistent + - Empty 
states preview real data shape (sample rows, last-7-days skeleton) + - Density tuned to operator workflow — no decorative whitespace where dense data belongs + fixtures: + - fixture:linear-app + - fixture:figma-file-ui + - fixture:datadog-dashboard + - fixture:vercel-dashboard +score_7_8: + criteria: + - Metrics present and useful but lack comparison baselines or time-window controls + - Charts readable but use default color palette; minor density issues + - Filters present but discoverability unclear + fixtures: + - fixture:grafana-default +score_5_6: + criteria: + - Generic stat cards (4 equal boxes) without context or comparison + - Charts are rainbow defaults from the component library + - No filters or time-range affordance + - Empty states are illustrations + platitudes + fixtures: + - fixture:generic-dashboard +score_3_4: + criteria: + - Numbers without units, labels, or context — operator cannot tell good from bad + - Decorative chart visualizations (donuts, gauges) instead of operational data + - Equal-weight UI controls; no clear workflow path + fixtures: + - fixture:empty-state-noise diff --git a/src/design/audit/rubric/anchors/docs.yaml b/src/design/audit/rubric/anchors/docs.yaml new file mode 100644 index 0000000..db0088b --- /dev/null +++ b/src/design/audit/rubric/anchors/docs.yaml @@ -0,0 +1,35 @@ +type: docs +score_9_10: + criteria: + - Quickstart path from landing to a working result is obvious within one viewport + - Code samples are runnable, copy-button-equipped, and language-tabbed where relevant + - Reference structure (sidebar, breadcrumbs, search) supports both linear and lookup reading + - Versioning, deprecation, and last-updated signals are visible + - Typography scale and line-length tuned for sustained reading + fixtures: + - fixture:stripe-docs + - fixture:tailwind-docs + - fixture:mdn-docs + - fixture:vercel-docs +score_7_8: + criteria: + - Quickstart present but buried; copy is solid but examples are partial + - Sidebar IA works but 
search is weak or missing + - Versioning unclear + fixtures: + - fixture:generic-mkdocs +score_5_6: + criteria: + - Wall-of-text reference with no quickstart + - Code samples without copy buttons or language switching + - No search, no breadcrumbs, no last-updated metadata + - Default theme typography (inconsistent line-height, no scale) + fixtures: + - fixture:default-docusaurus +score_3_4: + criteria: + - Marketing page disguised as docs (heavy hero, no actual reference) + - No code samples or syntax highlighting + - IA broken — "Getting Started" buried under marketing copy + fixtures: + - fixture:marketing-as-docs diff --git a/src/design/audit/rubric/anchors/ecommerce.yaml b/src/design/audit/rubric/anchors/ecommerce.yaml new file mode 100644 index 0000000..75d2e40 --- /dev/null +++ b/src/design/audit/rubric/anchors/ecommerce.yaml @@ -0,0 +1,33 @@ +type: ecommerce +score_9_10: + criteria: + - Product photography is real, detailed, and honest (not stock or AI-rendered) + - Price, fees, taxes, shipping, and total are surfaced before the commit button + - Stock, delivery, and return-policy signals are visible at decision points + - Cart and checkout flows preserve context — user always knows what they are buying + - Trust signals (verified payment, secure checkout, real merchant identity) appear where commitment occurs + fixtures: + - fixture:apple-store + - fixture:shopify-storefront + - fixture:allbirds-pdp +score_7_8: + criteria: + - Solid PDPs but checkout exposes fees only at the last step + - Stock and delivery info present but inconsistent across pages + - Photography mixes real and stock + fixtures: + - fixture:generic-shopify +score_5_6: + criteria: + - Fees surfaced only after pressing Pay; no shipping calculator + - Stock/delivery signals missing on PDPs + - Generic merchant identity, no trust badges, no return policy summary + fixtures: + - fixture:hidden-fees-checkout +score_3_4: + criteria: + - Total never shown until after commitment + - No merchant 
identity, no return policy, no trust signals + - Forced account creation before checkout, dark-pattern upsells + fixtures: + - fixture:dark-pattern-checkout diff --git a/src/design/audit/rubric/anchors/marketing.yaml b/src/design/audit/rubric/anchors/marketing.yaml new file mode 100644 index 0000000..a2f3dce --- /dev/null +++ b/src/design/audit/rubric/anchors/marketing.yaml @@ -0,0 +1,34 @@ +type: marketing +score_9_10: + criteria: + - Hero answers product, audience, and outcome within five seconds + - One dominant CTA, secondary actions clearly subordinate + - Concrete proof (real customer logos, real metrics, real screenshots) above the fold + - Visual craft (typography ramp, spacing rhythm, color system) is intentional, not template-default + - Differentiation is shown, not asserted; copy avoids vague hype + fixtures: + - fixture:stripe-marketing + - fixture:linear-marketing + - fixture:vercel-marketing + - fixture:apple-marketing +score_7_8: + criteria: + - Hero clear, but proof is generic (lorem-style logo cloud) or differentiation thin + - Visual system coherent but unremarkable; one or two minor rhythm breaks + - CTA dominant on hero but competes elsewhere + fixtures: + - fixture:generic-saas-marketing +score_5_6: + criteria: + - Hero copy explains the product instead of selling the outcome + - Visual hierarchy works but feels like a Tailwind starter + - Equal-weight CTAs, vague social proof, stock illustrations + fixtures: + - fixture:meta-copy-feature +score_3_4: + criteria: + - No primary message; could swap nouns and apply to any startup + - No CTA hierarchy; the page is a wall of equal sections + - Stock photography, generic gradients, default UI kit components + fixtures: + - fixture:ambiguous-deploy diff --git a/src/design/audit/rubric/anchors/saas-app.yaml b/src/design/audit/rubric/anchors/saas-app.yaml new file mode 100644 index 0000000..e2a6689 --- /dev/null +++ b/src/design/audit/rubric/anchors/saas-app.yaml @@ -0,0 +1,38 @@ +type: saas-app 
+score_9_10: + criteria: + - Domain object visible above the fold (tasks, deployments, conversations, files) + - One visually-dominant primary action per page state + - Empty states preview real product (sample rows, setup checklists, status timelines), not generic illustrations + - Action hierarchy = product hierarchy; no decorative buttons competing with workflow + - Trust details visible where commitment exists (price, permissions, undo, audit trail) + fixtures: + - fixture:linear-app + - fixture:figma-file-ui + - fixture:notion-editor + - fixture:superhuman + - fixture:github-pr-view +score_7_8: + criteria: + - Most criteria from 9-10 with one or two minor gaps + - Polish gaps that don't block job completion + - Domain objects present but action hierarchy slightly diffuse + fixtures: + - fixture:airtable-grid + - fixture:notion-database +score_5_6: + criteria: + - Functional but generic component-library assembly + - No domain object above the fold OR action hierarchy unclear + - Empty states show illustrations + platitudes instead of product preview + - Multiple equal-weight CTAs without a dominant primary + fixtures: + - fixture:generic-dashboard +score_3_4: + criteria: + - No primary job inferable from screen + - Equal-weight CTAs blocking workflow + - Decorative elements actively distract from product surface + - Page reads as a marketing/setup stub rather than an operational product + fixtures: + - fixture:no-primary-action diff --git a/src/design/audit/rubric/anchors/social.yaml b/src/design/audit/rubric/anchors/social.yaml new file mode 100644 index 0000000..591bb9b --- /dev/null +++ b/src/design/audit/rubric/anchors/social.yaml @@ -0,0 +1,33 @@ +type: social +score_9_10: + criteria: + - Feed prioritizes real content (posts, conversations) over chrome and ads + - Compose surface is one click away and primary in the layout + - Identity signals (verified accounts, profile preview, follower counts) consistent and lightweight + - State transitions (like, 
reply, repost) feel instant and reversible + - Empty states preview what the feed will look like with a few followed accounts + fixtures: + - fixture:threads-web + - fixture:bluesky-web + - fixture:substack-inline +score_7_8: + criteria: + - Solid feed and compose flow but reply chains are visually flat + - Identity signals inconsistent across surfaces + - Empty states use illustrations rather than previewing real content + fixtures: + - fixture:generic-microblog +score_5_6: + criteria: + - Feed cluttered with chrome (rails, ads, suggestions) competing with content + - Compose buried two clicks deep + - Action affordances (reply, like, share) are equal weight with no clear primary + fixtures: + - fixture:cluttered-feed +score_3_4: + criteria: + - Page reads as ad inventory with content squeezed in + - No clear primary feed; multiple surfaces compete + - Identity signals fake or absent (anonymous content with no provenance) + fixtures: + - fixture:ad-heavy-feed diff --git a/src/design/audit/rubric/anchors/tool.yaml b/src/design/audit/rubric/anchors/tool.yaml new file mode 100644 index 0000000..f280d01 --- /dev/null +++ b/src/design/audit/rubric/anchors/tool.yaml @@ -0,0 +1,33 @@ +type: tool +score_9_10: + criteria: + - Single-purpose surface — the input and output relationship is immediate and obvious + - Keyboard-first interaction (shortcuts, focus management, paste support) + - Output is copyable, exportable, and shareable without modal interruption + - Recent results, history, or undo always available + - State (input, processing, output, error) handled explicitly with clear transitions + fixtures: + - fixture:linear-command-palette + - fixture:github-pr-view + - fixture:raycast +score_7_8: + criteria: + - Tool works well but lacks keyboard affordances or history + - Output exportable but with extra clicks + - Error states present but generic + fixtures: + - fixture:generic-converter +score_5_6: + criteria: + - Form-and-submit pattern with reload-style output + - 
No keyboard shortcuts, no history, no undo + - Loading and error states use defaults + fixtures: + - fixture:basic-tool-form +score_3_4: + criteria: + - Multi-step flow for what should be a single action + - Output requires manual selection/copy + - No error handling — failures show generic browser errors + fixtures: + - fixture:broken-tool diff --git a/src/design/audit/rubric/anchors/utility.yaml b/src/design/audit/rubric/anchors/utility.yaml new file mode 100644 index 0000000..339fcfc --- /dev/null +++ b/src/design/audit/rubric/anchors/utility.yaml @@ -0,0 +1,33 @@ +type: utility +score_9_10: + criteria: + - Status, configuration, or admin surface that exposes the operational object directly (deploy, build, job, account) + - Real state (running, succeeded, failed, queued) with timestamps and durations + - Action affordances (retry, rollback, configure, audit) match the operational verbs of the system + - Logs, diagnostics, or detail panels are one click from the summary + - Empty states preview what real activity will look like + fixtures: + - fixture:vercel-deployment-status + - fixture:cloudflare-dashboard + - fixture:github-actions +score_7_8: + criteria: + - Status surface clear but action affordances are generic (Edit/Save instead of Retry/Rollback) + - Logs accessible but require navigation + - Empty states use illustrations rather than previewing activity + fixtures: + - fixture:generic-admin +score_5_6: + criteria: + - Status indicators are decorative pills without timestamps or durations + - Logs and diagnostics buried in modals + - Forms-of-forms pattern instead of operational verbs + fixtures: + - fixture:basic-settings-page +score_3_4: + criteria: + - Status is text only — no visual signal of failure or success + - No way to retry, rollback, or audit from the surface + - Settings sprawl with no IA, no search + fixtures: + - fixture:settings-sprawl diff --git a/src/design/audit/rubric/fragments/audience-clinician.md 
b/src/design/audit/rubric/fragments/audience-clinician.md new file mode 100644 index 0000000..bf4bcac --- /dev/null +++ b/src/design/audit/rubric/fragments/audience-clinician.md @@ -0,0 +1,42 @@ +--- +id: audience-clinician +title: Clinician Audience +weight: high +applies-when: + audience: [clinician] +--- + +This surface is used by clinical professionals (physicians, nurses, pharmacists, +therapists) in high-stakes decision-making contexts. Standard consumer-UX +heuristics are insufficient — apply the following additional lens. + +INFORMATION DENSITY +- Clinicians tolerate and often require high information density. Sparse + consumer-style layouts that hide detail behind progressive disclosure are + friction, not polish. +- Data tables, lab result grids, medication lists must be fully visible without + expand/collapse. If key data is folded, score `content_ia` lower. + +WORKFLOW EFFICIENCY +- Clinicians context-switch constantly (patient to patient, chart to EHR to + order entry). Keyboard navigation, dense primary actions, and minimal + confirmation dialogs for routine operations are expected. +- If standard consumer patterns (fat CTAs, step-by-step wizards) dominate + routine tasks, score `workflow` lower. + +CRITICAL VALUE FLAGGING +- Out-of-range lab values, drug interactions, and alert states must be + immediately visible with high visual contrast — not just color. Include + icon + text pattern redundancy. +- Missing or weak critical-value flagging is a major finding in `trust_clarity`. + +AUDIT TRAIL AND ATTRIBUTION +- Clinician workflows require visible "who did what, when" — last modified by, + order placed by, cosigned by. This is both regulatory and practical. +- If attributable actions lack visible provenance, that is a major finding in + `trust_clarity`. 
+ +DO NOT penalize for: +- Dense information layouts (this is intentional) +- Lack of illustrations or hero imagery +- Technical terminology appropriate to the audience diff --git a/src/design/audit/rubric/fragments/audience-developer.md b/src/design/audit/rubric/fragments/audience-developer.md new file mode 100644 index 0000000..75b7875 --- /dev/null +++ b/src/design/audit/rubric/fragments/audience-developer.md @@ -0,0 +1,40 @@ +--- +id: audience-developer +title: Developer Audience +weight: medium +applies-when: + audience: [developer] +--- + +This surface is used by software engineers and technical practitioners. + +INFORMATION OVER DECORATION +- Code samples, CLI commands, API endpoints, and technical specifications must + be immediately accessible — not gated behind tabs, scrolling, or "Request + Demo" flows. If core technical content requires navigation to find, score + `content_ia` lower. + +COPY-PASTE HYGIENE +- Every code block must have a visible copy button or be selectable without + capturing surrounding prose. Missing copy affordance is a minor-to-major + finding in `workflow` depending on frequency. + +DARK MODE AND TERMINAL AESTHETICS +- Developers default to dark environments. A light-only surface with no dark + mode is a `visual_craft` minor finding. A surface that actively breaks + (illegible code contrast) in dark mode is major. + +AUTHENTICATION PATHS +- API keys, tokens, and credentials should be displayed with + mask-by-default + reveal-on-click. Showing credentials in plaintext by + default is a critical `trust_clarity` finding. + +SEARCH AS PRIMARY NAVIGATION +- Technical docs and reference surfaces must have a prominent, keyboard- + accessible search. If Cmd/Ctrl-K does not open search, that is a major + finding in `workflow`. 
+ +DO NOT penalize for: +- Dense information layouts +- Monospace typography sections +- Minimal illustration or marketing copy diff --git a/src/design/audit/rubric/fragments/audience-kids.md b/src/design/audit/rubric/fragments/audience-kids.md new file mode 100644 index 0000000..84a09a0 --- /dev/null +++ b/src/design/audit/rubric/fragments/audience-kids.md @@ -0,0 +1,35 @@ +--- +id: audience-kids +title: Kids Audience +weight: critical +applies-when: + audience: [kids] +--- + +This surface is used by or targeted at minors. Apply the following additional +lens. Note: ethics rules in Layer 7 enforce hard score floors independently +of rubric scoring — both layers apply simultaneously. + +AGE-APPROPRIATE LANGUAGE AND IMAGERY +- Copy must be readable at the stated age level. Technical jargon, legalese, + or implicit social pressure ("Your friends are waiting!") are major findings + in `content_ia`. +- Imagery should be appropriate for the age group. Mature themes, even subtle + ones, are critical findings in `trust_clarity`. + +FRICTION-SYMMETRIC FLOWS +- Actions that benefit the operator (in-app purchase, data sharing, account + creation) must have equal or greater friction than their reversal + (cancellation, data deletion, account closure). +- Any flow where it is easier to spend money / share data than to undo is a + critical finding in `workflow`. + +PARENTAL CONTROLS VISIBILITY +- If the app collects data from minors, parental consent or control mechanisms + must be visible without buried navigation. If absent: major finding in + `trust_clarity`. 
+ +DO NOT penalize for: +- Simplified language and larger touch targets +- Reduced information density appropriate to age +- Bright color palettes and playful illustration styles diff --git a/src/design/audit/rubric/fragments/audience-vulnerability-minor-facing.md b/src/design/audit/rubric/fragments/audience-vulnerability-minor-facing.md new file mode 100644 index 0000000..c57ee0b --- /dev/null +++ b/src/design/audit/rubric/fragments/audience-vulnerability-minor-facing.md @@ -0,0 +1,37 @@ +--- +id: audience-vulnerability-minor-facing +title: Minor-Facing Audience Vulnerability +weight: critical +applies-when: + audienceVulnerability: [minor-facing] +--- + +This surface directly interacts with users who are minors or who are +unaccompanied minors in a supervised context (e.g. school software, children's +gaming, education platforms). The vulnerability is that the minor may not fully +understand consent, financial consequence, or data implications. Apply this +lens in addition to audience-kids and regulatory-coppa fragments. + +DARK PATTERN PROHIBITION — ENFORCED +Every dark pattern is a critical finding when directed at minors. Dark patterns +to look for: +- Confirmshaming ("No thanks, I don't want to save money") +- Fake urgency ("Only 2 left! Timer expires in 03:42") +- Hidden costs revealed at final checkout step +- Forced continuity (subscription auto-enrolled without explicit confirmation) +- Misdirection (styled "X" button that is actually an ad click) + +IRREVERSIBILITY DISCLOSURE +- Any action that is irreversible (purchase, deletion, sharing to others) must + be labeled explicitly. "Delete" without "This cannot be undone" is a major + finding in `trust_clarity`. + +SOCIAL COMPARISON AS PRESSURE +- Leaderboards, "Your friends have X" notifications, or streak-loss warnings + designed to create anxiety are major `trust_clarity` findings when the + audience is minors. 
+ +REPORTING AND BLOCKING CONTROLS +- If the surface allows social interaction (messaging, comments, reactions), + visible reporting and blocking controls are required. Absent: major finding + in `trust_clarity`. diff --git a/src/design/audit/rubric/fragments/first-principles.md b/src/design/audit/rubric/fragments/first-principles.md new file mode 100644 index 0000000..7570de6 --- /dev/null +++ b/src/design/audit/rubric/fragments/first-principles.md @@ -0,0 +1,65 @@ +--- +id: first-principles +title: First-Principles Fallback +weight: critical +applies-when: + universal: false +--- + +You haven't seen this pattern before. Do not fabricate a classification. +Audit against the universal product principles only. Score per-dimension as +usual, but set `rollup.confidence = "low"` and emit a top-level +`novel_pattern_signal` describing what you observed, so this surface can be +mined into a new fragment after enough fleet exposure. + +1. PRIMARY JOB CLARITY (5 sec test) + - Within 5 seconds, can a stranger name what this page is for? + - If no: severity major; finding category `product_intent`. + +2. PRIMARY ACTION OBVIOUSNESS + - Is there one visually-dominant action this page is built around? + - Are competing actions visually subordinate? + - If equal-weight: severity major; finding category `product_intent`. + +3. STATE PREVIEW + - Are empty/loading/error states designed, or browser-default / placeholder? + - Do empty states preview the real product, or show generic illustrations? + - If generic: severity major; finding category `product_intent`. + +4. TRUST BEFORE COMMITMENT + - Does the page ask the user to commit (money, identity, deploy, share)? + - If yes: are price, permissions, scope, undo path visible BEFORE the + commit button? + - If no: severity critical; finding category `trust_clarity`. + +5. RECOVERY FROM FAILURE + - Can the user undo their last action? + - Is there a clear path forward when something fails? 
+   - If no: severity major; finding category `workflow`.
+
+GUARDRAILS:
+- Do not invent domain-specific findings ("this dashboard needs charts").
+  You don't know the domain. Stick to the five principles.
+- Do not anchor on marketing-page heuristics (hero copy, illustrations,
+  social proof). They don't apply.
+- If a principle simply doesn't apply (e.g. there is no commitment on this
+  page), say so explicitly rather than scoring it generically.
+
+RESPOND WITH ONLY a JSON object of the form:
+{
+  "scores": {
+    "product_intent": { "score": <1-10>, "range": [<low>, <high>], "confidence": "low", "summary": "", "primaryFindings": [] },
+    "visual_craft": { ... },
+    "trust_clarity": { ... },
+    "workflow": { ... },
+    "content_ia": { ... }
+  },
+  "rollup": { "score": <1-10>, "range": [<low>, <high>], "confidence": "low", "rule": "first-principles", "weights": { "product_intent": 0.30, "workflow": 0.25, "visual_craft": 0.20, "content_ia": 0.15, "trust_clarity": 0.10 } },
+  "findings": [ ... ],
+  "novel_pattern_signal": {
+    "observedSignals": [
+      { "label": "", "evidence": "", "confidence": <0..1> }
+    ]
+  },
+  "first_principles_mode": true
+}
diff --git a/src/design/audit/rubric/fragments/modality-mobile.md b/src/design/audit/rubric/fragments/modality-mobile.md
new file mode 100644
index 0000000..1deb727
--- /dev/null
+++ b/src/design/audit/rubric/fragments/modality-mobile.md
@@ -0,0 +1,39 @@
+---
+id: modality-mobile
+title: Mobile Modality
+weight: medium
+applies-when:
+  modality: [mobile]
+---
+
+This surface is evaluated at a mobile viewport (≤480px wide). Apply the
+following lens on top of page-type and domain fragments.
+
+TOUCH TARGET SIZING
+- Interactive elements must meet minimum 44×44pt touch targets (WCAG 2.5.5
+  AAA; Apple HIG minimum). Anything below 32pt is a major finding in
+  `workflow`. Count the number of undersized targets — if >3 on a single
+  screen, escalate to critical.
+ +THUMB-ZONE REACHABILITY +- Primary actions must be reachable in the bottom 60% of a 375px screen + one-handed. A primary CTA pinned to the top of the viewport is a major + `workflow` finding. + +HORIZONTAL SCROLL AVOIDANCE +- Content must not require horizontal scroll on a 375px viewport. Tables + that overflow without a scroll affordance are major `workflow` findings. + +FONT LEGIBILITY +- Body text must be ≥16px (browser zoom notwithstanding). Text smaller than + 14px is a major `visual_craft` finding. Text below 12px is critical. + +FORM INPUT KEYBOARD +- Input fields must trigger the appropriate virtual keyboard type (numeric + for phone/postcode, email for email, tel for phone numbers). Wrong keyboard + type is a minor `workflow` finding per field. + +DO NOT penalize for: +- Navigation patterns specific to mobile (hamburger, bottom tab bar) +- Reduced visible surface area compared to desktop +- Single-column layouts diff --git a/src/design/audit/rubric/fragments/modality-tablet.md b/src/design/audit/rubric/fragments/modality-tablet.md new file mode 100644 index 0000000..f4dcb38 --- /dev/null +++ b/src/design/audit/rubric/fragments/modality-tablet.md @@ -0,0 +1,35 @@ +--- +id: modality-tablet +title: Tablet Modality +weight: low +applies-when: + modality: [tablet] +--- + +This surface is evaluated at a tablet viewport (481–1024px wide). Apply this +lens on top of page-type and domain fragments. + +LAYOUT ADAPTATION +- The layout must actually adapt between mobile and desktop — not simply + scale a mobile layout or stretch a desktop layout. A layout that is + identical to either breakpoint is a minor `visual_craft` finding. + +SPLIT-VIEW AND SIDEBAR OPPORTUNITIES +- Tablet viewports often benefit from master-detail or sidebar-content + patterns rather than single-column stacks. If the content hierarchy would + benefit from a persistent sidebar and none is present, that is a minor + `workflow` finding. 
+ +TOUCH AND POINTER HYBRID +- Tablet users may use touch or pointer. Touch targets must still meet the + 44pt minimum. Hover-only affordances without touch fallbacks are major + `workflow` findings. + +LANDSCAPE AND PORTRAIT PARITY +- Key interactions must work in both orientations. If a primary action is + unreachable in landscape (below fold with no scroll), that is a major + `workflow` finding. + +DO NOT penalize for: +- Adapting typography slightly smaller than mobile maximums +- Showing more information density than the mobile equivalent diff --git a/src/design/audit/rubric/fragments/regulatory-coppa.md b/src/design/audit/rubric/fragments/regulatory-coppa.md new file mode 100644 index 0000000..07abf94 --- /dev/null +++ b/src/design/audit/rubric/fragments/regulatory-coppa.md @@ -0,0 +1,32 @@ +--- +id: regulatory-coppa +title: COPPA Regulatory Context +weight: critical +applies-when: + regulatoryContext: [coppa] +--- + +This surface is subject to COPPA (Children's Online Privacy Protection Act). +Apply this lens when the audience includes or may include users under 13. The +ethics gate (Layer 7) independently enforces hard score floors for missing +age gates and dark patterns — both apply simultaneously. + +VERIFIABLE PARENTAL CONSENT +- If this surface collects personal data from users who may be under 13, + a verifiable parental consent mechanism must be visible and functional. + Absent: critical finding in `trust_clarity`. + +AGE GATE INTEGRITY +- Age gates must require date-of-birth entry, not a single yes/no question + ("Are you 13 or older?"). A single-question age gate is a major finding — + it is trivially bypassed. + +DATA COLLECTION DISCLOSURE +- A clear, plain-English summary of what data is collected and why must be + visible before any data collection begins. Buried in a privacy policy does + not satisfy this requirement. Absent: major finding in `content_ia`. 
+ +PROHIBITION ON BEHAVIORAL TARGETING +- No behavioral advertising or cross-site tracking may be enabled for users + under 13. If third-party tracking scripts are present without age-based + gating: critical finding in `trust_clarity`. diff --git a/src/design/audit/rubric/fragments/regulatory-gdpr.md b/src/design/audit/rubric/fragments/regulatory-gdpr.md new file mode 100644 index 0000000..d9fff07 --- /dev/null +++ b/src/design/audit/rubric/fragments/regulatory-gdpr.md @@ -0,0 +1,33 @@ +--- +id: regulatory-gdpr +title: GDPR Regulatory Context +weight: high +applies-when: + regulatoryContext: [gdpr] +--- + +This surface is subject to GDPR. Apply the following lens in addition to other +applicable fragments. Note: the ethics gate (Layer 7) independently enforces a +score floor for missing consent mechanisms — both apply. + +CONSENT MECHANISM QUALITY +- Cookie consent banners must offer granular controls (necessary / analytics / + marketing) with equal visual prominence. An "Accept all" button that is + larger or more prominent than "Manage preferences" is a major `trust_clarity` + finding. +- Pre-ticked checkboxes are a critical finding — they are unlawful under GDPR. + +DATA SUBJECT RIGHTS ACCESS +- Users must be able to find their data rights (access, deletion, portability, + correction) without more than 2 navigation steps from any page. If the + privacy page is not reachable from the footer, that is a major finding in + `content_ia`. + +LEGAL BASIS TRANSPARENCY +- If the page collects personal data, the legal basis (consent, legitimate + interest, contract) must be stated. Absent: minor finding in `trust_clarity`. + +DATA RETENTION +- If retention periods are disclosed (they should be), they must be + understandable to a non-lawyer. Legal boilerplate with no plain-English + summary is a minor finding in `content_ia`. 
diff --git a/src/design/audit/rubric/fragments/regulatory-hipaa.md b/src/design/audit/rubric/fragments/regulatory-hipaa.md new file mode 100644 index 0000000..d5ff918 --- /dev/null +++ b/src/design/audit/rubric/fragments/regulatory-hipaa.md @@ -0,0 +1,36 @@ +--- +id: regulatory-hipaa +title: HIPAA Regulatory Context +weight: high +applies-when: + regulatoryContext: [hipaa] +--- + +This surface handles Protected Health Information (PHI) and is subject to HIPAA +technical safeguards. Apply this lens in addition to domain-specific fragments. + +SESSION SECURITY VISIBILITY +- Automatic session timeout must be visible to the user (countdown or clear + logout trigger). Invisible timeout with hard logout is a major `workflow` + finding. +- If the surface shows PHI and has no visible session indicator, that is a + major `trust_clarity` finding. + +MINIMUM NECESSARY DATA +- Only the minimum necessary PHI should be visible on any given screen. + Dashboards that show full SSN, full DOB, or complete medication histories + when partial identifiers suffice are major `trust_clarity` findings. + +AUDIT LOG ACCESS +- If this surface allows modification of PHI, a visible "audit log" or + "activity history" link must be accessible to the user. Absent: minor + finding in `trust_clarity`. + +DATA EXPORT LABELING +- Export buttons (CSV, PDF, print) must label the output as PHI with a + handling reminder. Unlabeled PHI export is a minor finding. 
+ +DO NOT penalize for: +- Explicit data masking that adds cognitive load (masks protect PHI) +- Confirmation dialogs on irreversible PHI operations +- Conservative color coding that prioritizes legibility over aesthetics diff --git a/src/design/audit/rubric/loader.ts b/src/design/audit/rubric/loader.ts index ebadbaf..5694fd2 100644 --- a/src/design/audit/rubric/loader.ts +++ b/src/design/audit/rubric/loader.ts @@ -15,6 +15,25 @@ import type { ComposedRubric, AppliesWhen, } from '../types.js' +import type { + AudienceTag, + ModalityTag, + RegulatoryContextTag, + AudienceVulnerabilityTag, +} from '../v2/types.js' + +/** + * Operator-supplied context for Layer 6 composable predicate matching. + * When provided, fragments whose `applies-when.audience | modality | + * regulatoryContext | audienceVulnerability` overlap with these values are + * included in the composed rubric alongside the classification-matched set. + */ +export interface RubricContext { + audience?: AudienceTag[] + modality?: ModalityTag[] + regulatoryContext?: RegulatoryContextTag[] + audienceVulnerability?: AudienceVulnerabilityTag[] +} const __dirname = path.dirname(fileURLToPath(import.meta.url)) const BUILTIN_FRAGMENTS_DIR = path.join(__dirname, 'fragments') @@ -134,44 +153,66 @@ export function loadFragments(dir: string = BUILTIN_FRAGMENTS_DIR): RubricFragme } /** - * Predicate evaluator. Returns true if the fragment applies to the classification. + * Predicate evaluator. Returns true if the fragment applies to the classification + * or the optional Layer 6 context (audience / modality / regulatoryContext / + * audienceVulnerability hints). * * Universal fragments always apply. - * Type/domain/maturity/designSystem predicates are AND-combined: all listed - * fields must match. Within a field, the classification value must be in the - * fragment's allowed set. + * Predicate groups are OR-combined at the group level: a fragment fires if ANY + * one of its predicate groups matches. 
Within a group, list membership is used + * (the classification/context value must appear in the fragment's allowed set). + * + * Layer 6 predicates are additive: they can cause a fragment to fire even when + * no type/domain predicate matches, enabling composition across independent + * predicate dimensions. */ export function fragmentApplies( fragment: RubricFragment, classification: PageClassification, + ctx?: RubricContext, ): boolean { - const w = fragment.appliesWhen + const w = fragment.appliesWhen as AppliesWhen & { + audience?: string[] + modality?: string[] + regulatoryContext?: string[] + audienceVulnerability?: string[] + } if (w.universal) return true - if (w.type && w.type.length > 0) { - if (!w.type.includes(classification.type)) return false - } - if (w.domain && w.domain.length > 0) { - const domainMatch = w.domain.some(d => - classification.domain.toLowerCase().includes(d.toLowerCase()), - ) - if (!domainMatch) return false - } - if (w.maturity && w.maturity.length > 0) { - if (!w.maturity.includes(classification.maturity)) return false + // --- Layer 1 classification predicates (AND-combined when all set) --- + const classificationPredicateSet = + !!w.type?.length || !!w.domain?.length || !!w.maturity?.length || !!w.designSystem?.length + + if (classificationPredicateSet) { + if (w.type?.length && !w.type.includes(classification.type)) return false + if (w.domain?.length) { + const domainMatch = w.domain.some(d => + classification.domain.toLowerCase().includes(d.toLowerCase()), + ) + if (!domainMatch) return false + } + if (w.maturity?.length && !w.maturity.includes(classification.maturity)) return false + if (w.designSystem?.length && !w.designSystem.includes(classification.designSystem)) return false + return true } - if (w.designSystem && w.designSystem.length > 0) { - if (!w.designSystem.includes(classification.designSystem)) return false + + // --- Layer 6 context predicates (any overlap fires the fragment) --- + if (ctx) { + if 
(w.audience?.length && ctx.audience?.length) { + if (w.audience.some(a => ctx.audience!.includes(a as AudienceTag))) return true + } + if (w.modality?.length && ctx.modality?.length) { + if (w.modality.some(m => ctx.modality!.includes(m as ModalityTag))) return true + } + if (w.regulatoryContext?.length && ctx.regulatoryContext?.length) { + if (w.regulatoryContext.some(r => ctx.regulatoryContext!.includes(r as RegulatoryContextTag))) return true + } + if (w.audienceVulnerability?.length && ctx.audienceVulnerability?.length) { + if (w.audienceVulnerability.some(av => ctx.audienceVulnerability!.includes(av as AudienceVulnerabilityTag))) return true + } } - // If at least one predicate field was set and all matched, apply. - // If NO predicates were set and not universal, don't apply (be conservative). - const hasPredicate = - !!w.type?.length || - !!w.domain?.length || - !!w.maturity?.length || - !!w.designSystem?.length - return hasPredicate + return false } /** @@ -180,11 +221,13 @@ export function fragmentApplies( * @param classification - the page classification * @param fragments - all loaded fragments (defaults to builtin) * @param userFragmentsDir - optional path to user-supplied fragments + * @param ctx - optional Layer 6 context for audience/modality/regulatory predicates */ export function composeRubric( classification: PageClassification, fragments?: RubricFragment[], userFragmentsDir?: string, + ctx?: RubricContext, ): ComposedRubric { const all = [ ...(fragments ?? 
loadFragments(BUILTIN_FRAGMENTS_DIR)), @@ -192,7 +235,7 @@ export function composeRubric( ] const matched = all - .filter(f => fragmentApplies(f, classification)) + .filter(f => fragmentApplies(f, classification, ctx)) .sort((a, b) => WEIGHT_ORDER[a.weight] - WEIGHT_ORDER[b.weight]) const body = matched diff --git a/src/design/audit/rubric/rollup-weights.ts b/src/design/audit/rubric/rollup-weights.ts new file mode 100644 index 0000000..1cb624b --- /dev/null +++ b/src/design/audit/rubric/rollup-weights.ts @@ -0,0 +1,60 @@ +/** + * Rollup weights — Layer 1 of the world-class design-audit architecture. + * + * Per-page-type weights for combining the 5 dimension scores into a single + * rollup. Marketing surfaces emphasize visual + content; saas-app surfaces + * emphasize product_intent + workflow; docs lean on content_ia. The weights + * are evolvable via the GEPA target `pareto-rollup-weights`. + * + * Invariant: every weight set sums to 1.0 within 1e-6. + */ + +import type { Dimension } from '../v2/types.js' +import type { PageType } from '../types.js' + +export type RollupWeightKey = PageType | 'default' + +const ROLLUP_WEIGHTS_RAW: Record> = { + marketing: { product_intent: 0.30, visual_craft: 0.30, content_ia: 0.25, trust_clarity: 0.10, workflow: 0.05 }, + 'saas-app': { product_intent: 0.35, workflow: 0.30, visual_craft: 0.15, trust_clarity: 0.10, content_ia: 0.10 }, + dashboard: { product_intent: 0.30, workflow: 0.30, content_ia: 0.20, visual_craft: 0.15, trust_clarity: 0.05 }, + docs: { content_ia: 0.45, workflow: 0.25, product_intent: 0.15, visual_craft: 0.15, trust_clarity: 0.0 }, + ecommerce: { trust_clarity: 0.35, product_intent: 0.30, workflow: 0.20, visual_craft: 0.10, content_ia: 0.05 }, + social: { product_intent: 0.30, workflow: 0.30, content_ia: 0.20, visual_craft: 0.15, trust_clarity: 0.05 }, + tool: { workflow: 0.40, product_intent: 0.30, content_ia: 0.15, visual_craft: 0.10, trust_clarity: 0.05 }, + blog: { content_ia: 0.50, visual_craft: 0.25, 
product_intent: 0.15, workflow: 0.10, trust_clarity: 0.0 }, + utility: { workflow: 0.45, product_intent: 0.25, content_ia: 0.20, visual_craft: 0.10, trust_clarity: 0.0 }, + unknown: { product_intent: 0.30, workflow: 0.25, visual_craft: 0.20, content_ia: 0.15, trust_clarity: 0.10 }, + default: { product_intent: 0.30, workflow: 0.25, visual_craft: 0.20, content_ia: 0.15, trust_clarity: 0.10 }, +} + +const WEIGHT_SUM_TOLERANCE = 1e-6 + +// Validate at module load — fail fast if a weight set drifts. +for (const [type, weights] of Object.entries(ROLLUP_WEIGHTS_RAW)) { + const sum = Object.values(weights).reduce((acc, n) => acc + n, 0) + if (Math.abs(sum - 1) > WEIGHT_SUM_TOLERANCE) { + throw new Error(`rollup weights for ${type} sum to ${sum}, expected 1.0 ± ${WEIGHT_SUM_TOLERANCE}`) + } +} + +export const ROLLUP_WEIGHTS: Record> = ROLLUP_WEIGHTS_RAW + +/** + * Look up rollup weights for a page type, falling back to `default` when the + * type isn't in the table (forward-compat for new types). + */ +export function rollupWeightsFor(type: PageType | undefined): Record { + if (type && type in ROLLUP_WEIGHTS) return ROLLUP_WEIGHTS[type as RollupWeightKey] + return ROLLUP_WEIGHTS.default +} + +/** + * Render a human-readable formula for the audit report, e.g. + * "saas-app: product_intent*0.35 + workflow*0.30 + visual_craft*0.15 + trust_clarity*0.10 + content_ia*0.10" + */ +export function rollupFormula(type: PageType | undefined, weights: Record): string { + const entries = Object.entries(weights).sort(([, a], [, b]) => b - a) + const body = entries.map(([dim, w]) => `${dim}*${w.toFixed(2)}`).join(' + ') + return `${type ?? 
'default'}: ${body}` +} diff --git a/src/design/audit/types.ts b/src/design/audit/types.ts index f2a8a9a..79e1963 100644 --- a/src/design/audit/types.ts +++ b/src/design/audit/types.ts @@ -6,9 +6,11 @@ */ import type { DesignFinding, DesignSystemScore } from '../../types.js' +import type { EthicsViolation } from './v2/types.js' // Re-export the canonical Finding/Score types so consumers only import from here export type { DesignFinding, DesignSystemScore } from '../../types.js' +export type { EthicsViolation } from './v2/types.js' // ── Classification ───────────────────────────────────────────────────────── @@ -208,5 +210,31 @@ export interface PageAuditResult { designSystemScore?: DesignSystemScore screenshotPath?: string tokensUsed?: number + /** + * Layer 7 — domain ethics violations. When non-empty, `score` is capped by + * the lowest `rollupCap` across violations until the underlying issue is + * remediated. Empty when --skip-ethics is set or when no rule fires. + */ + ethicsViolations?: EthicsViolation[] + /** + * The pre-cap score (Layer 7). Set when `ethicsViolations` is non-empty so + * tooling can show "would have scored X, capped at Y" without losing + * the LLM's original assessment. + */ + preEthicsScore?: number + /** + * Layer 1 — v2 multi-dim audit result. Emitted alongside the v1 fields for + * one release as a backwards-compat bridge. Consumers should migrate to + * `auditResultV2` and treat the v1 surface as deprecated. + * + * Typed as `unknown` here to avoid pulling v2/types.ts into v1 consumers. + * The concrete shape is `import('./v2/types.js').AuditResult_v2`. + */ + auditResultV2?: unknown + /** + * Layer 1 — ensemble classification (URL + DOM + LLM). When set, the + * pipeline used `--audit-passes auto` (the new default). 
+ */ + ensembleClassification?: unknown error?: string } diff --git a/src/design/audit/v2/build-result.ts b/src/design/audit/v2/build-result.ts new file mode 100644 index 0000000..74176d8 --- /dev/null +++ b/src/design/audit/v2/build-result.ts @@ -0,0 +1,210 @@ +/** + * v2 AuditResult builder. + * + * Wraps the existing v1 PageAuditResult with multi-dim scoring + ensemble + * classification + rollup. Layer 1 emits BOTH schemas in `report.json` so + * downstream consumers can migrate at their own pace (one-release deprecation + * window per the RFC). + */ + +import { randomUUID, createHash } from 'node:crypto' +import type { Brain } from '../../../brain/index.js' +import type { PageState } from '../../../types.js' +import type { + PageAuditResult, + PageClassification, + ComposedRubric, + MeasurementBundle, +} from '../types.js' +import { + type AuditResult_v2, + type DesignFinding, + type Dimension, + type DimensionScore, + type EnsembleClassification, + type RollupScore, + DIMENSIONS, +} from './types.js' +import { + buildEvalPromptV2, + computeRollup, + parseAuditResponseV2, +} from './score.js' +import { renderAnchor, type CalibrationAnchor } from '../rubric/anchor-loader.js' + +export interface BuildV2ResultInput { + brain: Brain + state: PageState + pageRef: string + ensemble: EnsembleClassification + rubric: ComposedRubric + measurements: MeasurementBundle + v1Result: PageAuditResult + anchor?: CalibrationAnchor + /** Reuse the pipeline runId so envelopes correlate. */ + runId?: string + /** Optional override (e.g. test fixtures). When set, skip the LLM call. */ + precomputedScores?: Record +} + +/** + * Produce a complete `AuditResult_v2`. When `precomputedScores` is set we + * skip the v2 LLM call entirely (used by deterministic tests + the + * `--audit-passes auto` legacy fallback path). 
+ */ +export async function buildAuditResultV2(input: BuildV2ResultInput): Promise { + const { brain, state, pageRef, ensemble, rubric, measurements, v1Result, anchor, runId } = input + + const measurementSummary = renderMeasurementSummary(measurements) + const prompt = buildEvalPromptV2({ + pageType: ensemble.type, + rubricBody: rubric.body, + anchor, + measurementSummary, + intent: ensemble.intent, + }) + + let scores: Record + let llmTokens = 0 + if (input.precomputedScores) { + scores = input.precomputedScores + } else { + try { + const llm = await brain.auditDesign(state, 'Multi-dimensional audit (v2)', [], prompt) + llmTokens = llm.tokensUsed ?? 0 + const parsed = parseAuditResponseV2(llm.raw) + scores = parsed.scores + } catch { + // Fall back: synthesize per-dim scores from the v1 result. Conservative — + // every dim gets the v1 score, range +/- 1, confidence 'low'. + scores = synthesizeScoresFromV1(v1Result) + } + } + + const rollup: RollupScore = computeRollup(scores, ensemble.type) + const findings = adaptFindings(v1Result.findings) + const topFixes = computeTopFixes(findings).slice(0, 5).map((f) => f.id) + + const promptHash = sha1(prompt) + const rubricHash = sha1(rubric.body) + const totalTokens = (v1Result.tokensUsed ?? 0) + llmTokens + + return { + schemaVersion: 2, + runId: runId ?? randomUUID(), + pageRef, + classification: ensemble, + scores, + rollup, + findings, + topFixes, + measurements, + ethicsViolations: [], + matchedPatterns: [], + modality: 'html', + evaluatedAt: new Date().toISOString(), + promptHash, + rubricHash, + tokensUsed: totalTokens > 0 ? totalTokens : undefined, + passes: ['v2-multidim'], + ...(v1Result.error ? 
{ error: v1Result.error } : {}), + } +} + +function renderMeasurementSummary(measurements: MeasurementBundle): string { + const aaFails = measurements.contrast.aaFailures.length + const a11y = measurements.a11y.violations.length + return [ + `contrast AA failures: ${aaFails} of ${measurements.contrast.totalChecked} text elements`, + `axe violations: ${a11y}${a11y > 0 ? ` (top: ${measurements.a11y.violations.slice(0, 3).map((v) => `${v.id}/${v.impact}`).join(', ')})` : ''}`, + ].join('\n') +} + +function adaptFindings(v1Findings: PageAuditResult['findings']): DesignFinding[] { + return v1Findings.map((f, idx) => { + const id = `finding-${idx + 1}-${sha1(`${f.category}|${f.description}`).slice(0, 8)}` + const dimension = mapCategoryToDimension(f.category) + const kind = inferKind(f) + return { + ...f, + id, + dimension, + kind, + // Layer 2 supplies real Patches; Layer 1 emits an empty array so the + // schema is satisfied without fabricating diffs. + patches: [], + } + }) +} + +function mapCategoryToDimension(category: string): Dimension { + switch (category) { + case 'visual-bug': + case 'spacing': + case 'typography': + case 'alignment': + case 'layout': + return 'visual_craft' + case 'contrast': + case 'accessibility': + return 'visual_craft' + case 'ux': + default: + return 'product_intent' + } +} + +function inferKind(f: PageAuditResult['findings'][number]): DesignFinding['kind'] { + if (f.category === 'contrast' || f.category === 'accessibility') return 'measurement' + if (f.category === 'ux') return 'job' + return 'polish' +} + +function computeTopFixes(findings: DesignFinding[]): DesignFinding[] { + return [...findings].sort((a, b) => { + const aScore = (a.impact ?? 0) * blastWeight(a.blast) + const bScore = (b.impact ?? 
0) * blastWeight(b.blast) + return bScore - aScore + }) +} + +function blastWeight(blast: PageAuditResult['findings'][number]['blast']): number { + switch (blast) { + case 'system': return 4 + case 'component': return 3 + case 'section': return 2 + default: return 1 + } +} + +function synthesizeScoresFromV1(v1: PageAuditResult): Record { + const fallback = Math.max(1, Math.min(10, Math.round(v1.score))) + const out: Partial> = {} + for (const dim of DIMENSIONS) { + out[dim] = { + score: fallback, + range: [ + Math.max(1, fallback - 1), + Math.min(10, fallback + 1), + ], + confidence: 'low', + summary: 'Synthesized from v1 score (v2 LLM call unavailable).', + primaryFindings: [], + } + } + return out as Record +} + +function sha1(s: string): string { + return createHash('sha1').update(s).digest('hex') +} + +export const V2_INTERNALS = { + renderMeasurementSummary, + adaptFindings, + mapCategoryToDimension, + computeTopFixes, + synthesizeScoresFromV1, +} + +export { renderAnchor } diff --git a/src/design/audit/v2/score.ts b/src/design/audit/v2/score.ts new file mode 100644 index 0000000..16fef74 --- /dev/null +++ b/src/design/audit/v2/score.ts @@ -0,0 +1,253 @@ +/** + * Layer 1 multi-dim scoring — prompt builder, parser, and rollup. + * + * Pure functions. No I/O, no Brain dependency. The pipeline supplies the + * inputs (classification, rubric, anchor, measurements) and persists the + * resulting `Record + RollupScore`. 
+ */ + +import { + DIMENSIONS, + type ConfidenceLevel, + type Dimension, + type DimensionScore, + type RollupScore, +} from './types.js' +import type { PageType } from '../types.js' +import { rollupFormula, rollupWeightsFor } from '../rubric/rollup-weights.js' +import type { CalibrationAnchor } from '../rubric/anchor-loader.js' +import { renderAnchor } from '../rubric/anchor-loader.js' + +const VALID_CONFIDENCE: readonly ConfidenceLevel[] = ['high', 'medium', 'low'] as const + +export interface BuildV2PromptInput { + pageType: PageType + rubricBody: string + anchor?: CalibrationAnchor + /** Concise text summary of deterministic measurements (axe, contrast). */ + measurementSummary: string + /** Optional auditor framing override. */ + systemOpener?: string + /** Page intent line surfaced from classification. */ + intent?: string +} + +const DEFAULT_OPENER = + 'You are a principal product-design auditor. Score this page on five universal dimensions independently, with explicit ranges and confidence. The downstream system aggregates these into a page-type-aware rollup.' + +/** + * Build the v2 evaluation prompt. Demands per-dim DimensionScore output with + * range + confidence. Does NOT request the rollup — the rollup is computed + * deterministically from the per-dim scores using rollup-weights. + */ +export function buildEvalPromptV2(input: BuildV2PromptInput): string { + const opener = input.systemOpener ?? DEFAULT_OPENER + const anchorBlock = input.anchor ? renderAnchor(input.anchor) : '' + const intentLine = input.intent ? `\nPAGE INTENT (from classifier): ${input.intent}` : '' + + return `${opener} + +You are auditing a page that has been pre-classified as type=${input.pageType}. Contrast and accessibility measurements have already been counted deterministically — do NOT re-evaluate them. 
They will be merged with your output.${intentLine} + +DIMENSIONS — score each one 1-10 (integer) with an explicit uncertainty range and confidence: + + product_intent — Does the page make its audience, purpose, primary action, and product state obvious within 5 seconds? Empty/loading/error states designed? + visual_craft — Is the visual system intentional? Typography ramp, spacing rhythm, color tokens, component coherence, polish details. Decorative-but-shallow output is a defect. + trust_clarity — Are commitments (money, identity, deploy, share, irreversible actions) accompanied by the right trust details (price, fees, permissions, undo path, provenance)? + workflow — Can a user complete the end-to-end job? State transitions, recovery from failure, action hierarchy match the operational verbs of the system. + content_ia — Is the copy plain and useful? Are labels and IA tuned to the audience's tasks? Meta-copy that explains the UI is a defect. + +DETERMINISTIC MEASUREMENTS (do not duplicate): +${input.measurementSummary} + +${anchorBlock ? anchorBlock + '\n\n' : ''}EVALUATION RUBRIC: +${input.rubricBody} + +OUTPUT REQUIREMENTS: +- Every dimension MUST have an integer score 1-10. +- Every dimension MUST have a range [low, high] with low <= score <= high. Range width encodes your uncertainty. +- Every dimension MUST have confidence in {"high","medium","low"}. +- Summary is one sentence grounded in observable evidence. +- primaryFindings is a list of finding ids that drive the score (may be empty if you produce no findings). 
+ +RESPOND WITH ONLY a JSON object: +{ + "scores": { + "product_intent": { "score": 6, "range": [5, 7], "confidence": "medium", "summary": "Hero is clear but action hierarchy is diffuse.", "primaryFindings": [] }, + "visual_craft": { "score": 7, "range": [6, 8], "confidence": "high", "summary": "Spacing rhythm is intentional but type ramp drifts in cards.", "primaryFindings": [] }, + "trust_clarity": { "score": 5, "range": [4, 6], "confidence": "medium", "summary": "Fees disclosed but only at the final step.", "primaryFindings": [] }, + "workflow": { "score": 6, "range": [5, 7], "confidence": "medium", "summary": "Empty state directs the user but error recovery is implicit.", "primaryFindings": [] }, + "content_ia": { "score": 7, "range": [6, 8], "confidence": "high", "summary": "Copy is plain and audience-tuned.", "primaryFindings": [] } + }, + "summary": "One-sentence overall assessment.", + "strengths": ["..."], + "findings": [] +} + +Score 1-10. Most production apps score 5-7. Only world-class deserves 8+. Be honest.` +} + +export interface ParsedDimensionScores { + scores: Record + summary: string + strengths: string[] +} + +/** + * Parse the v2 LLM response. Throws when scores are missing, ranges violate + * `range[0] <= score <= range[1]`, or score is outside 1..10. The pipeline + * catches the throw and falls back to v1 mean-of-passes. 
+ */ +export function parseAuditResponseV2(raw: string): ParsedDimensionScores { + const parsed = extractJsonObject(raw) + if (!parsed) throw new Error('v2 parser: no JSON object in response') + + const rawScores = (parsed as { scores?: unknown }).scores + if (!rawScores || typeof rawScores !== 'object') { + throw new Error('v2 parser: missing scores object') + } + + const scoreMap = rawScores as Record + const out: Record = {} + for (const dim of DIMENSIONS) { + const dimRaw = scoreMap[dim] + if (!dimRaw || typeof dimRaw !== 'object') { + throw new Error(`v2 parser: dimension ${dim} missing`) + } + out[dim] = parseDimensionScore(dim, dimRaw as Record) + } + + return { + scores: out as Record, + summary: typeof (parsed as { summary?: unknown }).summary === 'string' ? (parsed as { summary: string }).summary : '', + strengths: Array.isArray((parsed as { strengths?: unknown }).strengths) + ? ((parsed as { strengths: unknown[] }).strengths.filter( + (s): s is string => typeof s === 'string', + )) + : [], + } +} + +function parseDimensionScore(dim: Dimension, raw: Record): DimensionScore { + const score = raw.score + if (typeof score !== 'number' || !Number.isFinite(score)) { + throw new Error(`v2 parser: ${dim}.score must be a number`) + } + const integerScore = Math.round(score) + if (integerScore < 1 || integerScore > 10) { + throw new Error(`v2 parser: ${dim}.score=${integerScore} outside 1..10`) + } + const range = raw.range + if (!Array.isArray(range) || range.length !== 2 || typeof range[0] !== 'number' || typeof range[1] !== 'number') { + throw new Error(`v2 parser: ${dim}.range must be [number, number]`) + } + const [low, high] = range + if (low > high) { + throw new Error(`v2 parser: ${dim}.range=[${low},${high}] inverted`) + } + if (integerScore < low || integerScore > high) { + throw new Error(`v2 parser: ${dim}.score=${integerScore} outside range [${low},${high}]`) + } + if (low < 1 || high > 10) { + throw new Error(`v2 parser: ${dim}.range=[${low},${high}] 
outside 1..10`) + } + const confidenceRaw = String(raw.confidence ?? '').toLowerCase() + const confidence = (VALID_CONFIDENCE as readonly string[]).includes(confidenceRaw) + ? (confidenceRaw as ConfidenceLevel) + : 'medium' + const summary = typeof raw.summary === 'string' ? raw.summary : '' + const primaryFindings = Array.isArray(raw.primaryFindings) + ? raw.primaryFindings.filter((s): s is string => typeof s === 'string') + : [] + return { score: integerScore, range: [low, high], confidence, summary, primaryFindings } +} + +/** + * Compute the rollup from per-dimension scores using per-page-type weights. + * Conservative confidence rule: rollup confidence = lowest dim confidence. + */ +export function computeRollup(scores: Record, pageType: PageType): RollupScore { + const weights = rollupWeightsFor(pageType) + let weighted = 0 + let lowSum = 0 + let highSum = 0 + for (const dim of DIMENSIONS) { + const dimScore = scores[dim] + const w = weights[dim] + weighted += dimScore.score * w + lowSum += dimScore.range[0] * w + highSum += dimScore.range[1] * w + } + const score = Math.round(weighted * 10) / 10 + const range: [number, number] = [ + Math.round(lowSum * 10) / 10, + Math.round(highSum * 10) / 10, + ] + + const confidences = DIMENSIONS.map((d) => scores[d].confidence) + const confidence: ConfidenceLevel = confidences.includes('low') + ? 'low' + : confidences.includes('medium') + ? 'medium' + : 'high' + + return { + score, + range, + confidence, + rule: rollupFormula(pageType, weights), + weights, + } +} + +/** + * Aggregate per-dim scores from N independent passes (mean). Used when the + * audit runs deep mode and we want one DimensionScore per dimension. + */ +export function mergeDimensionScoresAcrossPasses( + perPass: Array>, +): Record { + if (perPass.length === 0) { + throw new Error('mergeDimensionScoresAcrossPasses: empty input') + } + if (perPass.length === 1) return perPass[0]! 
+ + const out: Partial> = {} + for (const dim of DIMENSIONS) { + const samples = perPass.map((p) => p[dim]) + const meanScore = samples.reduce((a, s) => a + s.score, 0) / samples.length + const meanLow = samples.reduce((a, s) => a + s.range[0], 0) / samples.length + const meanHigh = samples.reduce((a, s) => a + s.range[1], 0) / samples.length + const conf = samples.map((s) => s.confidence) + const confidence: ConfidenceLevel = conf.includes('low') ? 'low' : conf.includes('medium') ? 'medium' : 'high' + const allFindings = samples.flatMap((s) => s.primaryFindings) + const primaryFindings = Array.from(new Set(allFindings)).slice(0, 3) + const summary = samples.find((s) => s.summary)?.summary ?? '' + out[dim] = { + score: Math.round(meanScore), + range: [ + Math.max(1, Math.floor(meanLow)), + Math.min(10, Math.ceil(meanHigh)), + ], + confidence, + summary, + primaryFindings, + } + } + return out as Record +} + +function extractJsonObject(raw: string): unknown { + try { + let text = raw.trim() + if (text.startsWith('```')) { + text = text.replace(/^```(?:json)?\n?/, '').replace(/\n?```$/, '') + } + const start = text.indexOf('{') + const end = text.lastIndexOf('}') + if (start < 0 || end <= start) return null + return JSON.parse(text.slice(start, end + 1)) + } catch { + return null + } +} diff --git a/src/design/audit/v2/types.ts b/src/design/audit/v2/types.ts index 60b8714..a010cac 100644 --- a/src/design/audit/v2/types.ts +++ b/src/design/audit/v2/types.ts @@ -257,6 +257,8 @@ export interface NovelPatternObservation { export interface PatchApplication { applicationId: string patchId: string + /** `sha256(diff.before + '\n---\n' + diff.after + '\n---\n' + scope).slice(0, 16)` */ + patchHash: string appliedAt: string appliedBy: string // 'agent:claude-code' | 'agent:codex' | 'human' | 'css-injection' | ... /** The audit run that proposed the patch. 
*/ diff --git a/tests/design-audit-anchor-loader.test.ts b/tests/design-audit-anchor-loader.test.ts new file mode 100644 index 0000000..b3eb28e --- /dev/null +++ b/tests/design-audit-anchor-loader.test.ts @@ -0,0 +1,141 @@ +import { describe, it, expect } from 'vitest' +import * as fs from 'node:fs' +import * as path from 'node:path' +import * as os from 'node:os' +import { fileURLToPath } from 'node:url' +import { + loadAnchors, + parseAnchorFile, + renderAnchor, +} from '../src/design/audit/rubric/anchor-loader.js' +import type { PageType } from '../src/design/audit/types.js' + +const __dirname = path.dirname(fileURLToPath(import.meta.url)) +const ANCHORS_DIR = path.resolve(__dirname, '..', 'src', 'design', 'audit', 'rubric', 'anchors') + +const REQUIRED_TYPES: PageType[] = [ + 'saas-app', + 'marketing', + 'dashboard', + 'docs', + 'ecommerce', + 'social', + 'tool', + 'blog', + 'utility', +] + +describe('anchor-loader — Layer 1', () => { + it('all 9 builtin anchor files exist', () => { + for (const t of REQUIRED_TYPES) { + expect(fs.existsSync(path.join(ANCHORS_DIR, `${t}.yaml`))).toBe(true) + } + }) + + it('loadAnchors() returns one anchor per page type', () => { + const anchors = loadAnchors(ANCHORS_DIR) + for (const t of REQUIRED_TYPES) { + const anchor = anchors.get(t) + expect(anchor).toBeDefined() + expect(anchor?.type).toBe(t) + } + }) + + it('every band has at least 3 criteria and at least 1 fixture', () => { + const anchors = loadAnchors(ANCHORS_DIR) + for (const anchor of anchors.values()) { + for (const band of ['score_9_10', 'score_7_8', 'score_5_6', 'score_3_4'] as const) { + const b = anchor[band] + expect(b.criteria.length).toBeGreaterThanOrEqual(3) + expect(b.fixtures.length).toBeGreaterThanOrEqual(1) + for (const c of b.criteria) { + expect(typeof c).toBe('string') + expect(c.length).toBeGreaterThan(8) + } + for (const f of b.fixtures) { + expect(typeof f).toBe('string') + expect(f.startsWith('fixture:')).toBe(true) + } + } + } + }) + + 
it('saas-app anchor cites Linear app + Figma + Notion + Superhuman + GitHub PR view', () => { + const a = parseAnchorFile(path.join(ANCHORS_DIR, 'saas-app.yaml')) + const refs = a.score_9_10.fixtures.join(' ') + expect(refs).toContain('linear-app') + expect(refs).toContain('figma-file-ui') + expect(refs).toContain('notion-editor') + expect(refs).toContain('superhuman') + expect(refs).toContain('github-pr-view') + }) + + it('marketing anchor cites Stripe / Linear / Vercel / Apple', () => { + const a = parseAnchorFile(path.join(ANCHORS_DIR, 'marketing.yaml')) + const refs = a.score_9_10.fixtures.join(' ') + expect(refs).toContain('stripe-marketing') + expect(refs).toContain('linear-marketing') + expect(refs).toContain('vercel-marketing') + expect(refs).toContain('apple-marketing') + }) + + it('docs anchor cites Stripe Docs / Tailwind Docs / MDN / Vercel Docs', () => { + const a = parseAnchorFile(path.join(ANCHORS_DIR, 'docs.yaml')) + const refs = a.score_9_10.fixtures.join(' ') + expect(refs).toContain('stripe-docs') + expect(refs).toContain('tailwind-docs') + expect(refs).toContain('mdn-docs') + expect(refs).toContain('vercel-docs') + }) + + it('renderAnchor produces injectable markdown', () => { + const a = parseAnchorFile(path.join(ANCHORS_DIR, 'saas-app.yaml')) + const md = renderAnchor(a) + expect(md).toContain('Score 9-10') + expect(md).toContain('Score 7-8') + expect(md).toContain('Score 5-6') + expect(md).toContain('Score 3-4') + expect(md).toContain('References:') + // contains an actual fixture reference + expect(md).toContain('fixture:linear-app') + }) + + it('returns empty Map for nonexistent dir', () => { + expect(loadAnchors('/nonexistent/anchors/dir').size).toBe(0) + }) + + it('throws on malformed file (missing band)', () => { + const tmp = fs.mkdtempSync(path.join(os.tmpdir(), 'anchor-')) + const file = path.join(tmp, 'bad.yaml') + fs.writeFileSync( + file, + `type: saas-app +score_9_10: + criteria: + - one criterion + fixtures: + - fixture:x 
+score_7_8: + criteria: + - one criterion + fixtures: + - fixture:x +score_5_6: + criteria: + - one criterion + fixtures: + - fixture:x +`, + ) + expect(() => parseAnchorFile(file)).toThrow(/score_3_4/) + fs.rmSync(tmp, { recursive: true, force: true }) + }) + + it('throws on malformed file (missing type)', () => { + const tmp = fs.mkdtempSync(path.join(os.tmpdir(), 'anchor-')) + const file = path.join(tmp, 'bad.yaml') + fs.writeFileSync(file, 'score_9_10:\n criteria:\n - x\n fixtures:\n - fixture:x\n') + expect(() => parseAnchorFile(file)).toThrow(/type/) + fs.rmSync(tmp, { recursive: true, force: true }) + }) +}) diff --git a/tests/design-audit-attribution.test.ts b/tests/design-audit-attribution.test.ts new file mode 100644 index 0000000..dea8a04 --- /dev/null +++ b/tests/design-audit-attribution.test.ts @@ -0,0 +1,150 @@ +import { describe, it, expect, afterEach } from 'vitest' +import { mkdtempSync, rmSync } from 'node:fs' +import { tmpdir } from 'node:os' +import { join } from 'node:path' +import { + patchHash, + appendPatchApplication, + readRecentApplications, + findPendingApplication, + updateApplicationOutcome, +} from '../src/design/audit/attribution/store.js' +import { aggregatePatchReliability, recommendationFor } from '../src/design/audit/attribution/aggregate.js' +import type { PatchApplication } from '../src/design/audit/attribution/types.js' + +function makeApp(overrides: Partial = {}): PatchApplication { + return { + applicationId: `app-${Math.random().toString(36).slice(2)}`, + patchId: 'patch-001', + patchHash: 'abc123', + appliedAt: new Date().toISOString(), + appliedBy: 'agent:claude-code', + preAuditRunId: 'run-pre', + predicted: { dim: 'visual_craft', delta: 2 }, + ...overrides, + } +} + +describe('patchHash', () => { + it('produces stable output for same inputs', () => { + const h1 = patchHash({ before: 'color: red', after: 'color: blue' }, 'component') + const h2 = patchHash({ before: 'color: red', after: 'color: blue' }, 'component') + 
expect(h1).toBe(h2) + expect(h1).toHaveLength(16) + }) + + it('produces different hashes for different scope', () => { + const h1 = patchHash({ before: 'a', after: 'b' }, 'component') + const h2 = patchHash({ before: 'a', after: 'b' }, 'page') + expect(h1).not.toBe(h2) + }) +}) + +describe('attribution store', () => { + let tmpDir: string + afterEach(() => { + if (tmpDir) rmSync(tmpDir, { recursive: true, force: true }) + }) + + it('appends and reads back an application', async () => { + tmpDir = mkdtempSync(join(tmpdir(), 'bad-attr-')) + const app = makeApp() + await appendPatchApplication(app, tmpDir) + + const apps = await readRecentApplications(1, tmpDir) + expect(apps).toHaveLength(1) + expect(apps[0].applicationId).toBe(app.applicationId) + }) + + it('is append-only: file grows on second write', async () => { + tmpDir = mkdtempSync(join(tmpdir(), 'bad-attr-')) + const a1 = makeApp({ applicationId: 'first' }) + const a2 = makeApp({ applicationId: 'second' }) + await appendPatchApplication(a1, tmpDir) + await appendPatchApplication(a2, tmpDir) + + const apps = await readRecentApplications(1, tmpDir) + expect(apps.length).toBeGreaterThanOrEqual(2) + }) + + it('finds a pending application by patchId', async () => { + tmpDir = mkdtempSync(join(tmpdir(), 'bad-attr-')) + const app = makeApp({ patchId: 'patch-findme' }) + await appendPatchApplication(app, tmpDir) + + const found = await findPendingApplication('patch-findme', tmpDir) + expect(found).not.toBeNull() + expect(found!.applicationId).toBe(app.applicationId) + }) + + it('does not find a pending application when postAuditRunId is set', async () => { + tmpDir = mkdtempSync(join(tmpdir(), 'bad-attr-')) + const app = makeApp({ patchId: 'patch-done', postAuditRunId: 'run-post' }) + await appendPatchApplication(app, tmpDir) + + const found = await findPendingApplication('patch-done', tmpDir) + expect(found).toBeNull() + }) + + it('appends an outcome event and the agreementScore is computed', async () => { + tmpDir 
= mkdtempSync(join(tmpdir(), 'bad-attr-')) + const app = makeApp({ predicted: { dim: 'visual_craft', delta: 2 } }) + await appendPatchApplication(app, tmpDir) + + await updateApplicationOutcome( + app.applicationId, + 'run-post', + { dim: 'visual_craft', delta: 1.5 }, + tmpDir, + ) + + const apps = await readRecentApplications(1, tmpDir) + const outcome = apps.find(a => a.applicationId === app.applicationId && a.postAuditRunId) + expect(outcome).toBeDefined() + expect(outcome!.agreementScore).toBeGreaterThan(0) + }) +}) + +describe('aggregatePatchReliability', () => { + it('produces recommended when N≥30, tenants≥5, replicationRate≥0.7', () => { + const hash = 'deadbeef' + const apps: PatchApplication[] = Array.from({ length: 30 }, (_, i) => ({ + applicationId: `app-${i}`, + patchId: 'p', + patchHash: hash, + appliedAt: new Date().toISOString(), + appliedBy: `agent:tenant-${i % 6}`, + preAuditRunId: 'pre', + predicted: { dim: 'visual_craft', delta: 2 }, + observed: { dim: 'visual_craft', delta: 2 }, + })) + + const [rel] = aggregatePatchReliability(apps) + expect(rel.patchHash).toBe(hash) + expect(rel.recommendation).toBe('recommended') + expect(rel.replicationRate).toBeCloseTo(1.0) + }) + + it('produces antipattern when N≥10, low replication, negative observed delta', () => { + const hash = 'baadf00d' + const apps: PatchApplication[] = Array.from({ length: 10 }, (_, i) => ({ + applicationId: `app-${i}`, + patchId: 'p', + patchHash: hash, + appliedAt: new Date().toISOString(), + appliedBy: 'agent:a', + preAuditRunId: 'pre', + predicted: { dim: 'visual_craft', delta: 2 }, + observed: { dim: 'visual_craft', delta: -1 }, + })) + + const [rel] = aggregatePatchReliability(apps) + expect(rel.recommendation).toBe('antipattern') + }) +}) + +describe('recommendationFor', () => { + it('is neutral below thresholds', () => { + expect(recommendationFor(5, 2, 0.5, 1)).toBe('neutral') + }) +}) diff --git a/tests/design-audit-ensemble.test.ts b/tests/design-audit-ensemble.test.ts 
new file mode 100644 index 0000000..8e0fe98 --- /dev/null +++ b/tests/design-audit-ensemble.test.ts @@ -0,0 +1,247 @@ +import { describe, it, expect } from 'vitest' +import { + classifyByUrl, + classifyByDom, + classifyEnsemble, + deriveHeuristics, + ENSEMBLE_INTERNALS, +} from '../src/design/audit/classify-ensemble.js' +import type { DomHeuristics } from '../src/design/audit/v2/types.js' +import type { Brain } from '../src/brain/index.js' +import type { PageState } from '../src/types.js' + +function emptyHeuristics(overrides: Partial = {}): DomHeuristics { + return { + formCount: 0, + inputCount: 0, + tableRowCount: 0, + chartCount: 0, + navItems: 0, + hasFooterLinks: false, + hasHeroSection: false, + hasSidebar: false, + paragraphCount: 0, + codeBlockCount: 0, + ...overrides, + } +} + +function fakeState(snapshot: string = ''): PageState { + return { + url: 'https://example.com/', + title: 'Example', + snapshot, + screenshot: '', + } as PageState +} + +interface FakeBrainResult { + type: string + confidence: number + intent?: string +} + +function fakeBrain(result: FakeBrainResult): Brain { + return { + auditDesign: async () => ({ + raw: JSON.stringify({ + type: result.type, + domain: 'unknown', + framework: null, + designSystem: 'unknown', + maturity: 'shipped', + intent: result.intent ?? 
'', + confidence: result.confidence, + }), + score: 5, + findings: [], + tokensUsed: 100, + }), + } as unknown as Brain +} + +describe('classifyByUrl — Layer 1', () => { + it('matches /docs paths', () => { + const sig = classifyByUrl('https://example.com/docs/intro') + expect(sig?.type).toBe('docs') + expect(sig?.confidence).toBeGreaterThanOrEqual(0.8) + }) + it('matches /checkout paths', () => { + expect(classifyByUrl('https://example.com/checkout/cart')?.type).toBe('ecommerce') + }) + it('matches /app paths', () => { + expect(classifyByUrl('https://example.com/app')?.type).toBe('saas-app') + }) + it('matches /login paths', () => { + expect(classifyByUrl('https://example.com/login')?.type).toBe('utility') + }) + it('matches /pricing paths', () => { + expect(classifyByUrl('https://example.com/pricing')?.type).toBe('marketing') + }) + it('matches /blog paths', () => { + expect(classifyByUrl('https://example.com/blog/post-1')?.type).toBe('blog') + }) + it('roots default to weak marketing signal', () => { + const sig = classifyByUrl('https://example.com/') + expect(sig?.type).toBe('marketing') + expect(sig?.confidence).toBeLessThanOrEqual(0.5) + }) + it('returns null for unparseable urls', () => { + expect(classifyByUrl('not a url')).toBeNull() + }) +}) + +describe('classifyByDom — Layer 1', () => { + it('docs: many paragraphs + code blocks', () => { + const sig = classifyByDom(emptyHeuristics({ codeBlockCount: 5, paragraphCount: 10 })) + expect(sig?.type).toBe('docs') + }) + it('dashboard: many table rows + sidebar', () => { + const sig = classifyByDom(emptyHeuristics({ tableRowCount: 12, hasSidebar: true })) + expect(sig?.type).toBe('dashboard') + }) + it('saas-app: sidebar + forms', () => { + const sig = classifyByDom(emptyHeuristics({ hasSidebar: true, formCount: 1, inputCount: 5 })) + expect(sig?.type).toBe('saas-app') + }) + it('utility: single form, no hero, no sidebar', () => { + const sig = classifyByDom(emptyHeuristics({ formCount: 1, inputCount: 3 })) + 
expect(sig?.type).toBe('utility') + }) + it('blog: many paragraphs, no forms or tables', () => { + const sig = classifyByDom(emptyHeuristics({ paragraphCount: 10 })) + expect(sig?.type).toBe('blog') + }) + it('marketing: hero + footer + few paragraphs', () => { + const sig = classifyByDom(emptyHeuristics({ hasHeroSection: true, hasFooterLinks: true, paragraphCount: 3 })) + expect(sig?.type).toBe('marketing') + }) + it('returns null for empty input', () => { + expect(classifyByDom(emptyHeuristics())).toBeNull() + }) +}) + +describe('classifyEnsemble — Layer 1', () => { + it('fast path: URL + DOM agree → skip LLM, signalsAgreed true', async () => { + let brainCalls = 0 + const brain = { + auditDesign: async () => { + brainCalls++ + return { raw: '{"type":"docs","confidence":0.9}', score: 5, findings: [], tokensUsed: 0 } + }, + } as unknown as Brain + + const result = await classifyEnsemble({ + brain, + state: fakeState(), + url: 'https://example.com/docs/intro', + domHeuristics: emptyHeuristics({ codeBlockCount: 5, paragraphCount: 10 }), + }) + expect(brainCalls).toBe(0) + expect(result.type).toBe('docs') + expect(result.signalsAgreed).toBe(true) + expect(result.signals.length).toBe(2) + expect(result.ensembleConfidence).toBeGreaterThan(0.5) + expect(result.firstPrinciplesMode).toBe(false) + }) + + it('LLM tiebreaker: signals disagree, LLM has high confidence → LLM wins', async () => { + const brain = fakeBrain({ type: 'saas-app', confidence: 0.9, intent: 'app surface' }) + const result = await classifyEnsemble({ + brain, + state: fakeState(), + url: 'https://example.com/app', + domHeuristics: emptyHeuristics({ paragraphCount: 10 }), // DOM says blog + }) + expect(result.signals.length).toBe(3) + expect(result.signals.some((s) => s.source === 'llm')).toBe(true) + }) + + it('low LLM confidence + signals disagree → unknown with dissent', async () => { + const brain = fakeBrain({ type: 'unknown', confidence: 0.1 }) + const result = await classifyEnsemble({ + brain, + 
state: fakeState(), + url: 'https://example.com/app', + domHeuristics: emptyHeuristics({ paragraphCount: 10 }), + }) + expect(result.type).toBe('unknown') + expect(result.signalsAgreed).toBe(false) + expect(result.dissent).toBeDefined() + expect(result.dissent!.length).toBeGreaterThan(0) + }) + + it('dom heuristic alone with weak url root → still produces a result', async () => { + const brain = fakeBrain({ type: 'docs', confidence: 0.8 }) + const result = await classifyEnsemble({ + brain, + state: fakeState(), + url: 'https://example.com/', + domHeuristics: emptyHeuristics({ codeBlockCount: 5, paragraphCount: 10 }), + }) + // URL says marketing (root), DOM says docs. LLM tiebreaker decides. + expect(['docs', 'marketing']).toContain(result.type) + expect(result.signals.length).toBeGreaterThanOrEqual(2) + }) + + it('first-principles mode triggers when ensemble confidence < 0.6', async () => { + const brain = fakeBrain({ type: 'unknown', confidence: 0.2 }) + const result = await classifyEnsemble({ + brain, + state: fakeState(), + url: 'https://example.com/', + domHeuristics: emptyHeuristics(), + }) + expect(result.firstPrinciplesMode).toBe(true) + }) + + it('records every signal with rationale + source', async () => { + const brain = fakeBrain({ type: 'docs', confidence: 0.9 }) + const result = await classifyEnsemble({ + brain, + state: fakeState(), + url: 'https://example.com/docs', + domHeuristics: emptyHeuristics({ paragraphCount: 10 }), + }) + for (const sig of result.signals) { + expect(['url-pattern', 'dom-heuristic', 'llm']).toContain(sig.source) + expect(typeof sig.rationale).toBe('string') + expect(sig.rationale.length).toBeGreaterThan(0) + } + }) +}) + +describe('deriveHeuristics — Layer 1', () => { + it('extracts counts from a snapshot', () => { + const snap = ` + navigation: [Home, Docs, Pricing] + heading "Hello" + form + textbox + textbox + paragraph "lorem" + paragraph "ipsum" + contentinfo: [Privacy, Terms] + ` + const h = deriveHeuristics({ snapshot: 
snap } as PageState) + expect(h.formCount).toBeGreaterThanOrEqual(1) + expect(h.inputCount).toBeGreaterThanOrEqual(2) + expect(h.paragraphCount).toBeGreaterThanOrEqual(2) + expect(h.hasFooterLinks).toBe(true) + }) + + it('returns zeros for empty snapshot', () => { + const h = deriveHeuristics({ snapshot: '' } as PageState) + expect(h.formCount).toBe(0) + expect(h.paragraphCount).toBe(0) + expect(h.hasFooterLinks).toBe(false) + }) +}) + +describe('Ensemble internals — Layer 1', () => { + it('exposes URL_PATTERN_RULES table for inspection', () => { + expect(ENSEMBLE_INTERNALS.URL_PATTERN_RULES.length).toBeGreaterThanOrEqual(7) + expect(ENSEMBLE_INTERNALS.ENSEMBLE_AGREEMENT_THRESHOLD).toBe(0.7) + expect(ENSEMBLE_INTERNALS.LLM_FALLBACK_CONFIDENCE).toBe(0.5) + }) +}) diff --git a/tests/design-audit-ethics-check.test.ts b/tests/design-audit-ethics-check.test.ts new file mode 100644 index 0000000..667e9df --- /dev/null +++ b/tests/design-audit-ethics-check.test.ts @@ -0,0 +1,289 @@ +/** + * Layer 7 — ethics check tests. + * + * Each test exercises one detector kind end-to-end against a real fixture: + * - pattern-absent (medical, gdpr): regex over snapshot text + * - llm-classifier (kids, finance): stubbed Brain response + * - skip-ethics: pipeline-level bypass behavior + * + * The Brain stub is a minimal object that satisfies the call shape used by + * `runLlmClassifier`. We do NOT mock the entire Brain — we call the public + * shape (`brain.complete(system, user)`) and assert the prompt the real + * implementation would send. 
+ */ + +import { describe, it, expect, beforeEach } from 'vitest' +import * as fs from 'node:fs' +import * as path from 'node:path' +import { fileURLToPath } from 'node:url' +import { + loadEthicsRules, + clearEthicsRuleCache, +} from '../src/design/audit/ethics/loader.js' +import { + checkEthics, + pageTextBlob, + runLlmClassifier, + type EthicsCheckContext, +} from '../src/design/audit/ethics/check.js' +import type { Brain } from '../src/brain/index.js' +import type { PageClassification } from '../src/design/audit/v2/types.js' + +const __dirname = path.dirname(fileURLToPath(import.meta.url)) +const RULES_DIR = path.resolve(__dirname, '../src/design/audit/ethics/rules') +const FIXTURES_DIR = path.resolve(__dirname, '../bench/design/ethics-fixtures') + +beforeEach(() => clearEthicsRuleCache()) + +function readFixture(name: string): string { + return fs.readFileSync(path.join(FIXTURES_DIR, name), 'utf-8') +} + +function classification(over: Partial = {}): PageClassification { + return { + type: 'saas-app', + domain: 'general', + framework: null, + designSystem: 'unknown', + maturity: 'shipped', + intent: '', + confidence: 0.9, + ...over, + } +} + +function ctxFor(html: string, over: Partial = {}): EthicsCheckContext { + return { + pageText: pageTextBlob(html), + snapshot: html, + classification: over.classification ?? classification(), + ...over, + } +} + +/** Minimal Brain stub. Records every prompt; returns a scripted answer. 
*/ +function stubBrain(reply: (user: string) => string): Brain { + const calls: Array<{ system: string; user: string }> = [] + const fake = { + calls, + async complete(system: string, user: string) { + calls.push({ system, user }) + return { text: reply(user) } + }, + } + return fake as unknown as Brain +} + +describe('checkEthics — no rules fire', () => { + it('returns [] when classification matches no rule', async () => { + const rules = loadEthicsRules(RULES_DIR) + const violations = await checkEthics( + rules, + ctxFor('hello', { + classification: classification({ domain: 'devtools', type: 'docs' }), + }), + ) + expect(violations).toEqual([]) + }) + + it('passing medical fixture triggers no violations when dosage + warning present', async () => { + const rules = loadEthicsRules(RULES_DIR) + const html = readFixture('medical-with-dosage.html') + const violations = await checkEthics( + rules, + ctxFor(html, { + classification: classification({ domain: 'pharmacy' }), + // No audience/regulatoryContext set → kids+gdpr rules skip. + }), + ) + const dosage = violations.find(v => v.ruleId === 'medical:dosage-warning-required') + expect(dosage).toBeUndefined() + // medical:adverse-event-reporting-path: regex must hit MedWatch text. 
+ const adverse = violations.find(v => v.ruleId === 'medical:adverse-event-reporting-path') + expect(adverse).toBeUndefined() + }) + + it('passing gdpr fixture with consent banner clears the cookie + privacy rules', async () => { + const rules = loadEthicsRules(RULES_DIR) + const html = readFixture('gdpr-with-consent.html') + const violations = await checkEthics( + rules, + ctxFor(html, { regulatoryContext: ['gdpr'] }), + ) + expect(violations.find(v => v.ruleId === 'legal:gdpr-cookie-consent')).toBeUndefined() + expect(violations.find(v => v.ruleId === 'legal:privacy-policy-link-required')).toBeUndefined() + expect(violations.find(v => v.ruleId === 'legal:ccpa-do-not-sell-link')).toBeUndefined() + }) +}) + +describe('checkEthics — pattern-absent detectors fire on missing text', () => { + it('medical fixture missing dosage warnings → critical-floor violation, rollupCap=4', async () => { + const rules = loadEthicsRules(RULES_DIR) + const html = readFixture('medical-no-dosage.html') + const violations = await checkEthics( + rules, + ctxFor(html, { classification: classification({ domain: 'pharmacy' }) }), + ) + const v = violations.find(x => x.ruleId === 'medical:dosage-warning-required') + expect(v).toBeDefined() + expect(v!.severity).toBe('critical-floor') + expect(v!.rollupCap).toBe(4) + expect(v!.citation).toMatch(/FDA/i) + expect(v!.remediation.toLowerCase()).toContain('dosage') + }) + + it('gdpr fixture without consent banner → major-floor violation, rollupCap=6', async () => { + const rules = loadEthicsRules(RULES_DIR) + const html = readFixture('gdpr-no-consent.html') + const violations = await checkEthics( + rules, + ctxFor(html, { regulatoryContext: ['gdpr'] }), + ) + const v = violations.find(x => x.ruleId === 'legal:gdpr-cookie-consent') + expect(v).toBeDefined() + expect(v!.severity).toBe('major-floor') + expect(v!.rollupCap).toBe(6) + }) +}) + +describe('checkEthics — llm-classifier detectors', () => { + it('kids dark-pattern fixture + brain answers yes 
→ critical-floor violation', async () => { + const rules = loadEthicsRules(RULES_DIR) + const html = readFixture('kids-dark-pattern.html') + const brain = stubBrain((user) => { + // Sanity-check the auditor sends both the question and the snapshot. + expect(user).toContain('dark pattern') + expect(user).toContain('YES, give me coins') + return 'yes — fake urgency, confirmshaming, friction-asymmetric flow' + }) + const violations = await checkEthics( + rules, + ctxFor(html, { + classification: classification(), + audience: ['kids'], + audienceVulnerability: ['minor-facing'], + }), + { brain }, + ) + const v = violations.find(x => x.ruleId === 'kids:dark-patterns-prohibited') + expect(v).toBeDefined() + expect(v!.severity).toBe('critical-floor') + expect(v!.rollupCap).toBe(4) + }) + + it('llm-classifier rules are skipped (warned) when no brain is supplied', async () => { + const rules = loadEthicsRules(RULES_DIR) + const html = readFixture('kids-dark-pattern.html') + const warns: string[] = [] + const violations = await checkEthics( + rules, + ctxFor(html, { + classification: classification(), + audience: ['kids'], + regulatoryContext: ['coppa'], + }), + { warn: (m) => warns.push(m) }, + ) + expect(violations.find(v => v.ruleId === 'kids:dark-patterns-prohibited')).toBeUndefined() + expect(warns.some(w => w.includes('kids:dark-patterns-prohibited'))).toBe(true) + }) + + it('finance hidden-fees fixture + brain confirms hiding → critical-floor violation fires', async () => { + const rules = loadEthicsRules(RULES_DIR) + const html = readFixture('finance-hidden-fees.html') + // Rule polarity: yes = "fees ARE hidden" = violation. The fixture buries + // fees in 6px white-on-white text, so a real auditor would say yes. 
+ const brain = stubBrain(() => 'yes — fees are buried in microcopy below the pay button') + const violations = await checkEthics( + rules, + ctxFor(html, { + classification: classification({ type: 'ecommerce', domain: 'payments' }), + }), + { brain }, + ) + const v = violations.find(x => x.ruleId === 'finance:fees-disclosed-pre-commitment') + expect(v).toBeDefined() + expect(v!.severity).toBe('critical-floor') + expect(v!.rollupCap).toBe(4) + }) + + it('finance disclosed-fees fixture + brain says fees are visible → no fee-disclosure violation', async () => { + const rules = loadEthicsRules(RULES_DIR) + const html = readFixture('finance-disclosed-fees.html') + // Rule polarity: no = "fees NOT hidden" = compliant. + const brain = stubBrain(() => 'no — every fee, FX rate, and total is itemized above the pay button') + const violations = await checkEthics( + rules, + ctxFor(html, { + classification: classification({ type: 'ecommerce', domain: 'payments' }), + }), + { brain }, + ) + expect(violations.find(v => v.ruleId === 'finance:fees-disclosed-pre-commitment')).toBeUndefined() + }) + + it('runLlmClassifier truncates oversized snapshots before sending', async () => { + const huge = 'X'.repeat(20_000) + let captured = '' + const brain = stubBrain((user) => { + captured = user + return 'no' + }) + const out = await runLlmClassifier(brain, 'is this safe?', huge) + expect(out).toBe(false) + expect(captured).toContain('[truncated]') + expect(captured.length).toBeLessThan(huge.length) + }) + + it('runLlmClassifier returns false on empty / non-yes responses', async () => { + const brain = stubBrain(() => '') + expect(await runLlmClassifier(brain, 'q?', 'snap')).toBe(false) + const brain2 = stubBrain(() => 'unsure, maybe') + expect(await runLlmClassifier(brain2, 'q?', 'snap')).toBe(false) + }) +}) + +describe('rollup cap selection', () => { + it('takes the lowest cap when multiple rules fire', async () => { + const rules = loadEthicsRules(RULES_DIR) + const html = 
readFixture('gdpr-no-consent.html') + // GDPR fixture has neither consent (major-floor=6) nor a privacy policy + // link (major-floor=6). Both should fire; the cap is 6. + const violations = await checkEthics( + rules, + ctxFor(html, { regulatoryContext: ['gdpr'] }), + ) + expect(violations.length).toBeGreaterThanOrEqual(2) + const minCap = Math.min(...violations.map(v => v.rollupCap)) + expect(minCap).toBe(6) + }) + + it('mixing critical-floor with major-floor lowers the cap to 4', async () => { + const rules = loadEthicsRules(RULES_DIR) + const html = readFixture('medical-no-dosage.html') + const violations = await checkEthics( + rules, + ctxFor(html, { + classification: classification({ domain: 'pharmacy' }), + audienceVulnerability: ['patient-facing'], + }), + { brain: stubBrain(() => 'no') }, + ) + // Expect dosage (critical) + adverse-event (major) + patient-education (major) + const ruleIds = new Set(violations.map(v => v.ruleId)) + expect(ruleIds.has('medical:dosage-warning-required')).toBe(true) + const minCap = Math.min(...violations.map(v => v.rollupCap)) + expect(minCap).toBe(4) + }) +}) + +describe('skip-ethics bypass semantics', () => { + it('caller can short-circuit by passing zero rules', async () => { + const html = readFixture('medical-no-dosage.html') + const violations = await checkEthics( + [], + ctxFor(html, { classification: classification({ domain: 'pharmacy' }) }), + ) + expect(violations).toEqual([]) + }) +}) diff --git a/tests/design-audit-ethics-rules.test.ts b/tests/design-audit-ethics-rules.test.ts new file mode 100644 index 0000000..ab094be --- /dev/null +++ b/tests/design-audit-ethics-rules.test.ts @@ -0,0 +1,223 @@ +/** + * Layer 7 — ethics rule loader tests. 
+ *
+ * Asserts the four canonical YAML rule files (medical, kids, finance, legal)
+ * load without error, every rule's `appliesWhen` predicate is well-formed and
+ * matches the expected classification surface, and every rule has a passing +
+ * failing fixture pair under bench/design/ethics-fixtures/.
+ */
+
+import { describe, it, expect, beforeEach } from 'vitest'
+import * as fs from 'node:fs'
+import * as path from 'node:path'
+import { fileURLToPath } from 'node:url'
+import {
+ loadEthicsRules,
+ clearEthicsRuleCache,
+ rollupCapFor,
+} from '../src/design/audit/ethics/loader.js'
+import { appliesWhenMatches, pageTextBlob } from '../src/design/audit/ethics/check.js'
+import type { EthicsRule, PageClassification } from '../src/design/audit/v2/types.js'
+
+const __dirname = path.dirname(fileURLToPath(import.meta.url))
+const RULES_DIR = path.resolve(__dirname, '../src/design/audit/ethics/rules')
+const FIXTURES_DIR = path.resolve(__dirname, '../bench/design/ethics-fixtures')
+
+function makeClassification(over: Partial<PageClassification> = {}): PageClassification {
+ return {
+ type: 'saas-app',
+ domain: 'general',
+ framework: null,
+ designSystem: 'unknown',
+ maturity: 'shipped',
+ intent: 'unspecified',
+ confidence: 0.9,
+ ...over,
+ }
+}
+
+beforeEach(() => clearEthicsRuleCache())
+
+describe('ethics rule loader', () => {
+ it('loads all four rule files without error', () => {
+ const rules = loadEthicsRules(RULES_DIR)
+ expect(rules.length).toBeGreaterThanOrEqual(8)
+ const cats = new Set(rules.map(r => r.category))
+ expect(cats).toEqual(new Set(['medical', 'kids', 'finance', 'legal']))
+ })
+
+ it('every rule has the required structural fields', () => {
+ const rules = loadEthicsRules(RULES_DIR)
+ for (const r of rules) {
+ expect(r.ruleId).toMatch(/^[a-z]+:[a-z0-9-]+$/)
+ expect(['critical-floor', 'major-floor']).toContain(r.severity)
+ expect(['medical', 'kids', 'finance', 'legal']).toContain(r.category)
+ expect(r.remediation.length).toBeGreaterThan(10)
+ 
expect(r.detector).toBeDefined() + // Citation is optional but every shipped rule should carry one — ethics + // without a regulation reference is opinion, not policy. + expect(r.citation).toBeDefined() + } + }) + + it('rollupCapFor returns 4 for critical-floor and 6 for major-floor', () => { + expect(rollupCapFor('critical-floor')).toBe(4) + expect(rollupCapFor('major-floor')).toBe(6) + }) + + it('caches by directory — second call returns the same array', () => { + const a = loadEthicsRules(RULES_DIR) + const b = loadEthicsRules(RULES_DIR) + expect(a).toBe(b) + }) + + it('returns [] for a missing directory without throwing', () => { + const missing = path.join(__dirname, '__nonexistent_ethics_dir__') + expect(loadEthicsRules(missing)).toEqual([]) + }) +}) + +describe('appliesWhen predicates', () => { + const rules = loadEthicsRules(RULES_DIR) + const byId = new Map(rules.map(r => [r.ruleId, r])) + + it('medical:dosage-warning-required matches a pharmacy classification', () => { + const rule = byId.get('medical:dosage-warning-required')! + const ok = appliesWhenMatches(rule.appliesWhen, { + pageText: '', + snapshot: '', + classification: makeClassification({ domain: 'pharmacy' }), + }) + expect(ok).toBe(true) + }) + + it('medical:dosage-warning-required does NOT match a general saas page', () => { + const rule = byId.get('medical:dosage-warning-required')! + const ok = appliesWhenMatches(rule.appliesWhen, { + pageText: '', + snapshot: '', + classification: makeClassification({ domain: 'devtools' }), + }) + expect(ok).toBe(false) + }) + + it('kids:dark-patterns-prohibited matches when audience=[kids]', () => { + const rule = byId.get('kids:dark-patterns-prohibited')! 
+ const ctx = { + pageText: '', + snapshot: '', + classification: makeClassification(), + audience: ['kids'] as const, + } + expect(appliesWhenMatches(rule.appliesWhen, ctx as never)).toBe(true) + }) + + it('kids:age-gate-required requires both audience=kids AND minor-facing vulnerability', () => { + const rule = byId.get('kids:age-gate-required')! + expect( + appliesWhenMatches(rule.appliesWhen, { + pageText: '', + snapshot: '', + classification: makeClassification(), + audience: ['kids'], + } as never), + ).toBe(false) + expect( + appliesWhenMatches(rule.appliesWhen, { + pageText: '', + snapshot: '', + classification: makeClassification(), + audience: ['kids'], + audienceVulnerability: ['minor-facing'], + } as never), + ).toBe(true) + }) + + it('finance:fees-disclosed-pre-commitment matches ecommerce + fintech domain', () => { + const rule = byId.get('finance:fees-disclosed-pre-commitment')! + const ok = appliesWhenMatches(rule.appliesWhen, { + pageText: '', + snapshot: '', + classification: makeClassification({ type: 'ecommerce', domain: 'payments' }), + }) + expect(ok).toBe(true) + }) + + it('legal:gdpr-cookie-consent matches when regulatoryContext includes gdpr', () => { + const rule = byId.get('legal:gdpr-cookie-consent')! + expect( + appliesWhenMatches(rule.appliesWhen, { + pageText: '', + snapshot: '', + classification: makeClassification(), + regulatoryContext: ['gdpr'], + } as never), + ).toBe(true) + expect( + appliesWhenMatches(rule.appliesWhen, { + pageText: '', + snapshot: '', + classification: makeClassification(), + }), + ).toBe(false) + }) +}) + +describe('fixture pairs', () => { + // Map each rule (or rule cluster) to a passing + failing fixture. Every + // shipped rule MUST have ≥1 of each per the RFC success metrics. 
+ const pairs: Array<{ ruleId: string; passing: string; failing: string }> = [ + { + ruleId: 'medical:dosage-warning-required', + passing: 'medical-with-dosage.html', + failing: 'medical-no-dosage.html', + }, + { + ruleId: 'kids:age-gate-required', + passing: 'kids-age-gated.html', + failing: 'kids-dark-pattern.html', + }, + { + ruleId: 'finance:fees-disclosed-pre-commitment', + passing: 'finance-disclosed-fees.html', + failing: 'finance-hidden-fees.html', + }, + { + ruleId: 'legal:gdpr-cookie-consent', + passing: 'gdpr-with-consent.html', + failing: 'gdpr-no-consent.html', + }, + ] + + it.each(pairs)('rule $ruleId has fixture pair on disk', ({ passing, failing }) => { + expect(fs.existsSync(path.join(FIXTURES_DIR, passing))).toBe(true) + expect(fs.existsSync(path.join(FIXTURES_DIR, failing))).toBe(true) + }) + + it('pattern-absent rules detect their pattern in the passing fixture', () => { + const rules = loadEthicsRules(RULES_DIR) + const byId = new Map(rules.map(r => [r.ruleId, r])) + for (const { ruleId, passing } of pairs) { + const rule = byId.get(ruleId) as EthicsRule | undefined + if (!rule) throw new Error(`rule ${ruleId} not loaded`) + if (rule.detector.kind !== 'pattern-absent') continue + const html = fs.readFileSync(path.join(FIXTURES_DIR, passing), 'utf-8') + const re = new RegExp(rule.detector.pattern, 'i') + expect(re.test(html.toLowerCase())).toBe(true) + } + }) + + it('pattern-absent rules miss the pattern in the failing fixture', () => { + const rules = loadEthicsRules(RULES_DIR) + const byId = new Map(rules.map(r => [r.ruleId, r])) + for (const { ruleId, failing } of pairs) { + const rule = byId.get(ruleId) as EthicsRule | undefined + if (!rule) throw new Error(`rule ${ruleId} not loaded`) + if (rule.detector.kind !== 'pattern-absent') continue + const html = fs.readFileSync(path.join(FIXTURES_DIR, failing), 'utf-8') + const blob = pageTextBlob(html) + const re = new RegExp(rule.detector.pattern, 'i') + expect(re.test(blob)).toBe(false) + } + }) 
+})
diff --git a/tests/design-audit-first-principles.test.ts b/tests/design-audit-first-principles.test.ts
new file mode 100644
index 0000000..d487527
--- /dev/null
+++ b/tests/design-audit-first-principles.test.ts
@@ -0,0 +1,93 @@
+import { describe, it, expect, afterEach } from 'vitest'
+import { mkdtempSync, rmSync } from 'node:fs'
+import { tmpdir } from 'node:os'
+import { join } from 'node:path'
+import {
+ shouldTriggerFirstPrinciples,
+ buildNovelPatternObservation,
+ appendNovelPatternObservation,
+} from '../src/design/audit/first-principles-mode.js'
+import type { EnsembleClassification } from '../src/design/audit/v2/types.js'
+import { readFileSync, existsSync } from 'node:fs'
+
+function makeClassification(overrides: Partial<EnsembleClassification> = {}): EnsembleClassification {
+ return {
+ type: 'saas-app',
+ domain: '',
+ maturity: 'production',
+ designSystem: 'unknown',
+ signals: [],
+ signalsAgreed: true,
+ ensembleConfidence: 0.85,
+ firstPrinciplesMode: false,
+ ...overrides,
+ }
+}
+
+describe('shouldTriggerFirstPrinciples', () => {
+ it('does not trigger on high-confidence agreed classification', () => {
+ expect(shouldTriggerFirstPrinciples(makeClassification())).toBe(false)
+ })
+
+ it('triggers when ensembleConfidence < 0.6', () => {
+ expect(shouldTriggerFirstPrinciples(makeClassification({ ensembleConfidence: 0.4 }))).toBe(true)
+ })
+
+ it('triggers when signals disagree', () => {
+ expect(shouldTriggerFirstPrinciples(makeClassification({ signalsAgreed: false }))).toBe(true)
+ })
+
+ it('triggers when type is unknown', () => {
+ expect(shouldTriggerFirstPrinciples(makeClassification({ type: 'unknown' as never }))).toBe(true)
+ })
+
+ it('triggers when firstPrinciplesMode flag is set', () => {
+ expect(shouldTriggerFirstPrinciples(makeClassification({ firstPrinciplesMode: true }))).toBe(true)
+ })
+
+ it('respects custom threshold', () => {
+ const cl = makeClassification({ ensembleConfidence: 0.72 })
+ expect(shouldTriggerFirstPrinciples(cl, { 
confidenceThreshold: 0.8 })).toBe(true) + expect(shouldTriggerFirstPrinciples(cl, { confidenceThreshold: 0.7 })).toBe(false) + }) +}) + +describe('buildNovelPatternObservation', () => { + it('produces a stable observationId for the same pageRef within the same minute', () => { + const cl = makeClassification({ ensembleConfidence: 0.3, signalsAgreed: false }) + const obs1 = buildNovelPatternObservation({ classification: cl, pageRef: 'https://example.com' }) + const obs2 = buildNovelPatternObservation({ classification: cl, pageRef: 'https://example.com' }) + expect(obs1.observationId).toBe(obs2.observationId) + }) + + it('carries closestType and closestConfidence from the classification', () => { + const cl = makeClassification({ type: 'marketing', ensembleConfidence: 0.45 }) + const obs = buildNovelPatternObservation({ classification: cl, pageRef: 'https://test.com' }) + expect(obs.closestType).toBe('marketing') + expect(obs.closestConfidence).toBe(0.45) + }) +}) + +describe('appendNovelPatternObservation', () => { + let tmpDir: string + afterEach(() => { + if (tmpDir) rmSync(tmpDir, { recursive: true, force: true }) + }) + + it('writes a valid JSON line and round-trips', async () => { + tmpDir = mkdtempSync(join(tmpdir(), 'bad-fp-test-')) + const cl = makeClassification({ ensembleConfidence: 0.2 }) + const obs = buildNovelPatternObservation({ classification: cl, pageRef: 'https://example.com' }) + await appendNovelPatternObservation(obs, tmpDir) + + const date = obs.capturedAt.slice(0, 10) + const filePath = join(tmpDir, `${date}.jsonl`) + expect(existsSync(filePath)).toBe(true) + + const lines = readFileSync(filePath, 'utf-8').split('\n').filter(Boolean) + expect(lines).toHaveLength(1) + const parsed = JSON.parse(lines[0]) + expect(parsed.observationId).toBe(obs.observationId) + expect(parsed.pageRef).toBe('https://example.com') + }) +}) diff --git a/tests/design-audit-patch-parse.test.ts b/tests/design-audit-patch-parse.test.ts new file mode 100644 index 
0000000..1b7c59a --- /dev/null +++ b/tests/design-audit-patch-parse.test.ts @@ -0,0 +1,98 @@ +import { describe, it, expect } from 'vitest' +import { parsePatch, parsePatches } from '../src/design/audit/patches/parse.js' + +const validPatch = { + patchId: 'patch-001', + findingId: 'finding-001', + scope: 'component', + target: { scope: 'css', cssSelector: '.hero-cta' }, + diff: { before: 'background: blue', after: 'background: #2563eb' }, + testThatProves: { kind: 'rerun-audit', description: 'Re-run audit and verify visual_craft score improves.' }, + rollback: { kind: 'git-revert' }, + estimatedDelta: { dim: 'visual_craft', delta: 1 }, + estimatedDeltaConfidence: 'untested', +} + +describe('parsePatch', () => { + it('accepts a fully valid patch', () => { + const { patch, reason } = parsePatch(validPatch) + expect(patch).not.toBeNull() + expect(reason).toBeUndefined() + expect(patch!.patchId).toBe('patch-001') + }) + + it('accepts optional unifiedDiff', () => { + const { patch } = parsePatch({ ...validPatch, diff: { ...validPatch.diff, unifiedDiff: '--- a/f\n+++ b/f\n' } }) + expect(patch?.diff.unifiedDiff).toBeDefined() + }) + + it('rejects non-object input', () => { + const { patch, reason } = parsePatch('not an object') + expect(patch).toBeNull() + expect(reason).toMatch(/not an object/) + }) + + it('rejects missing patchId', () => { + const { patch, reason } = parsePatch({ ...validPatch, patchId: '' }) + expect(patch).toBeNull() + expect(reason).toMatch(/patchId/) + }) + + it('rejects missing findingId', () => { + const { patch, reason } = parsePatch({ ...validPatch, findingId: undefined }) + expect(patch).toBeNull() + expect(reason).toMatch(/findingId/) + }) + + it('rejects invalid scope', () => { + const { patch, reason } = parsePatch({ ...validPatch, scope: 'galaxy' }) + expect(patch).toBeNull() + expect(reason).toMatch(/scope/) + }) + + it('rejects invalid target.scope', () => { + const { patch, reason } = parsePatch({ ...validPatch, target: { scope: 
'cobol', cssSelector: '.x' } }) + expect(patch).toBeNull() + expect(reason).toMatch(/target.scope/) + }) + + it('rejects missing diff.before', () => { + const { patch, reason } = parsePatch({ ...validPatch, diff: { before: '', after: 'x' } }) + expect(patch).toBeNull() + expect(reason).toMatch(/diff.before/) + }) + + it('rejects invalid testThatProves.kind', () => { + const { patch, reason } = parsePatch({ ...validPatch, testThatProves: { kind: 'vibes', description: 'idk' } }) + expect(patch).toBeNull() + expect(reason).toMatch(/testThatProves.kind/) + }) + + it('rejects invalid rollback.kind', () => { + const { patch, reason } = parsePatch({ ...validPatch, rollback: { kind: 'prayer' } }) + expect(patch).toBeNull() + expect(reason).toMatch(/rollback.kind/) + }) + + it('rejects invalid estimatedDeltaConfidence', () => { + const { patch, reason } = parsePatch({ ...validPatch, estimatedDeltaConfidence: 'godlike' }) + expect(patch).toBeNull() + expect(reason).toMatch(/estimatedDeltaConfidence/) + }) +}) + +describe('parsePatches', () => { + it('parses an array of patches, dropping invalid entries', () => { + const raw = [validPatch, { patchId: '' }, validPatch] + const { patches, errors } = parsePatches(raw) + expect(patches).toHaveLength(2) + expect(errors).toHaveLength(1) + expect(errors[0].index).toBe(1) + }) + + it('returns error when input is not an array', () => { + const { patches, errors } = parsePatches('oops') + expect(patches).toHaveLength(0) + expect(errors[0].index).toBe(-1) + }) +}) diff --git a/tests/design-audit-patch-validate.test.ts b/tests/design-audit-patch-validate.test.ts new file mode 100644 index 0000000..187e0b7 --- /dev/null +++ b/tests/design-audit-patch-validate.test.ts @@ -0,0 +1,78 @@ +import { describe, it, expect } from 'vitest' +import { validatePatch, validatePatches } from '../src/design/audit/patches/validate.js' +import type { Patch } from '../src/design/audit/v2/types.js' + +const basePatch: Patch = { + patchId: 'p1', + findingId: 
'f1', + scope: 'component', + target: { scope: 'css', cssSelector: '.btn' }, + diff: { before: 'color: red', after: 'color: green' }, + testThatProves: { kind: 'rerun-audit', description: 'Score improves.' }, + rollback: { kind: 'git-revert' }, + estimatedDelta: { dim: 'visual_craft', delta: 1 }, + estimatedDeltaConfidence: 'untested', +} + +const snapshot = 'The page has: color: red and font-size: 14px' + +describe('validatePatch', () => { + it('passes when before is in snapshot and locator present', () => { + const result = validatePatch(basePatch, snapshot) + expect(result.valid).toBe(true) + expect(result.reasons).toHaveLength(0) + }) + + it('fails when before is not in snapshot', () => { + const result = validatePatch({ ...basePatch, diff: { before: 'color: purple', after: 'x' } }, snapshot) + expect(result.valid).toBe(false) + expect(result.reasons).toContain('before-not-in-snapshot') + }) + + it('fails when before is empty string', () => { + const result = validatePatch({ ...basePatch, diff: { before: '', after: 'x' } }, snapshot) + expect(result.valid).toBe(false) + expect(result.reasons).toContain('before-empty') + }) + + it('fails when target has no locator', () => { + const patch: Patch = { ...basePatch, target: { scope: 'css' } } + const result = validatePatch(patch, snapshot) + expect(result.valid).toBe(false) + expect(result.reasons).toContain('target-missing-locator') + }) + + it('fails when estimatedDelta.delta is out of range (> 3)', () => { + const result = validatePatch({ ...basePatch, estimatedDelta: { dim: 'visual_craft', delta: 5 } }, snapshot) + expect(result.valid).toBe(false) + expect(result.reasons).toContain('estimated-delta-out-of-range') + }) + + it('fails when estimatedDelta.delta is out of range (< -3)', () => { + const result = validatePatch({ ...basePatch, estimatedDelta: { dim: 'visual_craft', delta: -4 } }, snapshot) + expect(result.valid).toBe(false) + expect(result.reasons).toContain('estimated-delta-out-of-range') + }) + + 
it('accumulates multiple failures in one pass', () => {
+ const patch: Patch = {
+ ...basePatch,
+ target: { scope: 'css' },
+ diff: { before: 'not present', after: 'x' },
+ estimatedDelta: { dim: 'visual_craft', delta: 99 },
+ }
+ const result = validatePatch(patch, snapshot)
+ expect(result.valid).toBe(false)
+ expect(result.reasons.length).toBeGreaterThanOrEqual(3)
+ })
+})
+
+describe('validatePatches', () => {
+ it('partitions valid and invalid patches', () => {
+ const valid = basePatch
+ const invalid: Patch = { ...basePatch, diff: { before: 'not-here', after: 'x' } }
+ const result = validatePatches([valid, invalid], snapshot)
+ expect(result.valid).toHaveLength(1)
+ expect(result.invalid).toHaveLength(1)
+ })
+})
diff --git a/tests/design-audit-rollup.test.ts b/tests/design-audit-rollup.test.ts
new file mode 100644
index 0000000..2316ad1
--- /dev/null
+++ b/tests/design-audit-rollup.test.ts
@@ -0,0 +1,252 @@
+import { describe, it, expect } from 'vitest'
+import {
+ ROLLUP_WEIGHTS,
+ rollupWeightsFor,
+ rollupFormula,
+} from '../src/design/audit/rubric/rollup-weights.js'
+import {
+ computeRollup,
+ mergeDimensionScoresAcrossPasses,
+ parseAuditResponseV2,
+} from '../src/design/audit/v2/score.js'
+import { DIMENSIONS, type Dimension, type DimensionScore } from '../src/design/audit/v2/types.js'
+
+function dimScore(score: number, range: [number, number] = [score - 1, score + 1], conf: 'high' | 'medium' | 'low' = 'medium'): DimensionScore {
+ return {
+ score,
+ range: [Math.max(1, range[0]), Math.min(10, range[1])],
+ confidence: conf,
+ summary: '',
+ primaryFindings: [],
+ }
+}
+
+function uniformScores(score: number, conf: 'high' | 'medium' | 'low' = 'medium'): Record<Dimension, DimensionScore> {
+ const out: Partial<Record<Dimension, DimensionScore>> = {}
+ for (const dim of DIMENSIONS) out[dim] = dimScore(score, [Math.max(1, score - 1), Math.min(10, score + 1)], conf)
+ return out as Record<Dimension, DimensionScore>
+}
+
+describe('rollup weights — Layer 1', () => {
+ it('every page-type weight set sums to 1.0 within 1e-6', () => {
+ 
for (const [type, weights] of Object.entries(ROLLUP_WEIGHTS)) { + const sum = Object.values(weights).reduce((a, n) => a + n, 0) + expect(Math.abs(sum - 1)).toBeLessThan(1e-6) + // every dimension must be present + for (const dim of DIMENSIONS) { + expect(typeof weights[dim]).toBe('number') + } + } + }) + + it('exposes weights for every PageType plus default + unknown', () => { + const expected = ['marketing', 'saas-app', 'dashboard', 'docs', 'ecommerce', 'social', 'tool', 'blog', 'utility', 'unknown', 'default'] + for (const t of expected) { + expect(ROLLUP_WEIGHTS[t as keyof typeof ROLLUP_WEIGHTS]).toBeDefined() + } + }) + + it('saas-app weights emphasize product_intent + workflow over visual_craft', () => { + const w = ROLLUP_WEIGHTS['saas-app'] + expect(w.product_intent).toBeGreaterThan(w.visual_craft) + expect(w.workflow).toBeGreaterThan(w.visual_craft) + }) + + it('marketing weights emphasize visual_craft + content_ia + product_intent', () => { + const w = ROLLUP_WEIGHTS.marketing + expect(w.visual_craft).toBeGreaterThanOrEqual(0.25) + expect(w.content_ia).toBeGreaterThanOrEqual(0.2) + expect(w.product_intent).toBeGreaterThanOrEqual(0.2) + }) + + it('docs weights emphasize content_ia ≥ 0.4', () => { + expect(ROLLUP_WEIGHTS.docs.content_ia).toBeGreaterThanOrEqual(0.4) + }) + + it('ecommerce weights emphasize trust_clarity', () => { + expect(ROLLUP_WEIGHTS.ecommerce.trust_clarity).toBeGreaterThanOrEqual(0.3) + }) + + it('rollupWeightsFor falls back to default for unknown page type', () => { + const w = rollupWeightsFor(undefined) + const sum = Object.values(w).reduce((a, n) => a + n, 0) + expect(Math.abs(sum - 1)).toBeLessThan(1e-6) + }) + + it('rollupFormula renders a deterministic readable formula', () => { + const formula = rollupFormula('saas-app', ROLLUP_WEIGHTS['saas-app']) + expect(formula).toContain('saas-app:') + expect(formula).toContain('product_intent*0.35') + expect(formula).toContain('workflow*0.30') + }) +}) + +describe('computeRollup — Layer 1', 
() => {
+ it('uniform 7s on saas-app rolls up to exactly 7', () => {
+ const r = computeRollup(uniformScores(7), 'saas-app')
+ expect(r.score).toBeCloseTo(7, 6)
+ expect(r.range[0]).toBeCloseTo(6, 6)
+ expect(r.range[1]).toBeCloseTo(8, 6)
+ expect(r.confidence).toBe('medium')
+ expect(r.rule).toContain('saas-app')
+ })
+
+ it('saas-app rollup weights product_intent more heavily than docs', () => {
+ const scores: Record<Dimension, DimensionScore> = {
+ product_intent: dimScore(9),
+ workflow: dimScore(5),
+ visual_craft: dimScore(5),
+ trust_clarity: dimScore(5),
+ content_ia: dimScore(5),
+ }
+ const saas = computeRollup(scores, 'saas-app')
+ const docs = computeRollup(scores, 'docs')
+ expect(saas.score).toBeGreaterThan(docs.score)
+ })
+
+ it('confidence is conservative: any low → low rollup', () => {
+ const scores = uniformScores(7, 'high')
+ scores.workflow = dimScore(7, [6, 8], 'low')
+ const r = computeRollup(scores, 'saas-app')
+ expect(r.confidence).toBe('low')
+ })
+
+ it('confidence is medium when no low + at least one medium', () => {
+ const scores = uniformScores(8, 'high')
+ scores.product_intent = dimScore(8, [7, 9], 'medium')
+ const r = computeRollup(scores, 'marketing')
+ expect(r.confidence).toBe('medium')
+ })
+
+ it('confidence is high when every dim is high', () => {
+ const r = computeRollup(uniformScores(9, 'high'), 'saas-app')
+ expect(r.confidence).toBe('high')
+ })
+
+ it('weighted-mean math: linear scoring 4 vs 9 with saas-app weights', () => {
+ const scores: Record<Dimension, DimensionScore> = {
+ product_intent: dimScore(4),
+ workflow: dimScore(4),
+ visual_craft: dimScore(9),
+ trust_clarity: dimScore(9),
+ content_ia: dimScore(9),
+ }
+ const r = computeRollup(scores, 'saas-app')
+ // saas-app: 0.35*4 + 0.30*4 + 0.15*9 + 0.10*9 + 0.10*9 = 1.4 + 1.2 + 1.35 + 0.9 + 0.9 = 5.75
+ expect(r.score).toBeCloseTo(5.75, 1)
+ })
+})
+
+describe('mergeDimensionScoresAcrossPasses — Layer 1', () => {
+ it('returns identity for a single pass', () => {
+ const s = uniformScores(7)
+ const merged = 
mergeDimensionScoresAcrossPasses([s]) + expect(merged.product_intent.score).toBe(7) + }) + + it('averages scores across multiple passes', () => { + const s1 = uniformScores(6) + const s2 = uniformScores(8) + const merged = mergeDimensionScoresAcrossPasses([s1, s2]) + expect(merged.product_intent.score).toBe(7) + }) + + it('takes the floor confidence across passes', () => { + const s1 = uniformScores(7, 'high') + const s2 = uniformScores(7, 'low') + const merged = mergeDimensionScoresAcrossPasses([s1, s2]) + expect(merged.product_intent.confidence).toBe('low') + }) + + it('throws on empty input', () => { + expect(() => mergeDimensionScoresAcrossPasses([])).toThrow(/empty/) + }) +}) + +describe('parseAuditResponseV2 — Layer 1', () => { + const validRaw = JSON.stringify({ + scores: { + product_intent: { score: 6, range: [5, 7], confidence: 'medium', summary: 'ok', primaryFindings: [] }, + visual_craft: { score: 7, range: [6, 8], confidence: 'high', summary: 'ok', primaryFindings: [] }, + trust_clarity: { score: 5, range: [4, 6], confidence: 'medium', summary: 'ok', primaryFindings: [] }, + workflow: { score: 6, range: [5, 7], confidence: 'medium', summary: 'ok', primaryFindings: [] }, + content_ia: { score: 7, range: [6, 8], confidence: 'high', summary: 'ok', primaryFindings: [] }, + }, + summary: 'overall', + strengths: ['a', 'b'], + }) + + it('parses a well-formed v2 response with every dimension', () => { + const out = parseAuditResponseV2(validRaw) + expect(out.scores.product_intent.score).toBe(6) + expect(out.scores.visual_craft.confidence).toBe('high') + expect(out.summary).toBe('overall') + expect(out.strengths).toEqual(['a', 'b']) + }) + + it('parses fenced JSON', () => { + const fenced = '```json\n' + validRaw + '\n```' + const out = parseAuditResponseV2(fenced) + expect(out.scores.product_intent.score).toBe(6) + }) + + it('rejects scores outside [range[0], range[1]]', () => { + const bad = JSON.stringify({ + scores: { + product_intent: { score: 3, range: [5, 
7], confidence: 'medium', summary: '', primaryFindings: [] }, + visual_craft: { score: 7, range: [6, 8], confidence: 'high', summary: '', primaryFindings: [] }, + trust_clarity: { score: 5, range: [4, 6], confidence: 'medium', summary: '', primaryFindings: [] }, + workflow: { score: 6, range: [5, 7], confidence: 'medium', summary: '', primaryFindings: [] }, + content_ia: { score: 7, range: [6, 8], confidence: 'high', summary: '', primaryFindings: [] }, + }, + }) + expect(() => parseAuditResponseV2(bad)).toThrow(/outside range/) + }) + + it('rejects scores outside 1..10', () => { + const bad = JSON.stringify({ + scores: { + product_intent: { score: 11, range: [10, 12], confidence: 'medium', summary: '', primaryFindings: [] }, + visual_craft: { score: 7, range: [6, 8], confidence: 'high', summary: '', primaryFindings: [] }, + trust_clarity: { score: 5, range: [4, 6], confidence: 'medium', summary: '', primaryFindings: [] }, + workflow: { score: 6, range: [5, 7], confidence: 'medium', summary: '', primaryFindings: [] }, + content_ia: { score: 7, range: [6, 8], confidence: 'high', summary: '', primaryFindings: [] }, + }, + }) + expect(() => parseAuditResponseV2(bad)).toThrow(/outside 1..10/) + }) + + it('rejects inverted ranges', () => { + const bad = JSON.stringify({ + scores: { + product_intent: { score: 6, range: [7, 5], confidence: 'medium', summary: '', primaryFindings: [] }, + visual_craft: { score: 7, range: [6, 8], confidence: 'high', summary: '', primaryFindings: [] }, + trust_clarity: { score: 5, range: [4, 6], confidence: 'medium', summary: '', primaryFindings: [] }, + workflow: { score: 6, range: [5, 7], confidence: 'medium', summary: '', primaryFindings: [] }, + content_ia: { score: 7, range: [6, 8], confidence: 'high', summary: '', primaryFindings: [] }, + }, + }) + expect(() => parseAuditResponseV2(bad)).toThrow(/inverted/) + }) + + it('throws when a dimension is missing', () => { + const bad = JSON.stringify({ + scores: { + product_intent: { score: 6, 
range: [5, 7], confidence: 'medium', summary: '', primaryFindings: [] }, + // missing visual_craft + trust_clarity: { score: 5, range: [4, 6], confidence: 'medium', summary: '', primaryFindings: [] }, + workflow: { score: 6, range: [5, 7], confidence: 'medium', summary: '', primaryFindings: [] }, + content_ia: { score: 7, range: [6, 8], confidence: 'high', summary: '', primaryFindings: [] }, + }, + }) + expect(() => parseAuditResponseV2(bad)).toThrow(/visual_craft missing/) + }) + + it('throws on missing scores object', () => { + expect(() => parseAuditResponseV2('{"summary":"x"}')).toThrow(/missing scores/) + }) + + it('throws on no JSON object at all', () => { + expect(() => parseAuditResponseV2('not json')).toThrow(/no JSON object/) + }) +}) diff --git a/tests/design-audit-v2-result.test.ts b/tests/design-audit-v2-result.test.ts new file mode 100644 index 0000000..96052e4 --- /dev/null +++ b/tests/design-audit-v2-result.test.ts @@ -0,0 +1,267 @@ +import { describe, it, expect } from 'vitest' +import { buildAuditResultV2 } from '../src/design/audit/v2/build-result.js' +import type { Brain } from '../src/brain/index.js' +import type { PageState } from '../src/types.js' +import type { + PageAuditResult, + ComposedRubric, + MeasurementBundle, +} from '../src/design/audit/types.js' +import type { + AuditResult_v2, + Dimension, + DimensionScore, + EnsembleClassification, +} from '../src/design/audit/v2/types.js' +import { DIMENSIONS } from '../src/design/audit/v2/types.js' + +function fakeMeasurements(): MeasurementBundle { + return { + contrast: { + totalChecked: 50, + aaFailures: [], + aaaFailures: [], + summary: { aaPassRate: 1, aaaPassRate: 1 }, + }, + a11y: { + ran: true, + violations: [], + passes: 30, + }, + hasBlockingIssues: false, + } +} + +function fakeRubric(): ComposedRubric { + return { + fragments: [], + body: 'TEST RUBRIC BODY', + calibration: 'Score honestly.', + dimensions: [], + } +} + +function fakeEnsemble(type: 'saas-app' | 'marketing' = 
'saas-app'): EnsembleClassification { + return { + type, + domain: 'unknown', + framework: null, + designSystem: 'unknown', + maturity: 'shipped', + intent: 'test page', + confidence: 0.8, + signals: [ + { source: 'url-pattern', type, confidence: 0.7, rationale: 'fixture' }, + { source: 'dom-heuristic', type, confidence: 0.7, rationale: 'fixture' }, + ], + signalsAgreed: true, + ensembleConfidence: 0.8, + firstPrinciplesMode: false, + } +} + +function fakeV1(score = 7): PageAuditResult { + return { + url: 'https://example.com/app', + score, + summary: 'fake v1 summary', + strengths: ['a'], + findings: [ + { + category: 'ux', + severity: 'major', + description: 'No primary action', + location: 'main', + suggestion: 'Add a primary CTA', + impact: 8, + effort: 3, + blast: 'page', + }, + { + category: 'spacing', + severity: 'minor', + description: 'inconsistent padding', + location: 'cards', + suggestion: 'use 8px grid', + impact: 4, + effort: 1, + blast: 'component', + }, + ], + } +} + +function uniformScores(score: number, conf: 'high' | 'medium' | 'low' = 'medium'): Record { + const out: Partial> = {} + for (const dim of DIMENSIONS) { + out[dim] = { + score, + range: [Math.max(1, score - 1), Math.min(10, score + 1)], + confidence: conf, + summary: '', + primaryFindings: [], + } + } + return out as Record +} + +function fakeStateWithoutBrain(): { brain: Brain; state: PageState } { + // Brain that throws — buildAuditResultV2 should fall back to synthesized + // scores when given precomputedScores OR when the brain call fails. 
+ const brain = { + auditDesign: async () => { + throw new Error('no brain in tests') + }, + } as unknown as Brain + const state = { url: 'x', title: 'x', snapshot: '', screenshot: '' } as PageState + return { brain, state } +} + +describe('buildAuditResultV2 — Layer 1', () => { + it('produces a complete AuditResult_v2 with every required field (precomputed path)', async () => { + const { brain, state } = fakeStateWithoutBrain() + const v2: AuditResult_v2 = await buildAuditResultV2({ + brain, + state, + pageRef: 'https://example.com/app', + ensemble: fakeEnsemble('saas-app'), + rubric: fakeRubric(), + measurements: fakeMeasurements(), + v1Result: fakeV1(7), + precomputedScores: uniformScores(8, 'high'), + }) + + expect(v2.schemaVersion).toBe(2) + expect(typeof v2.runId).toBe('string') + expect(v2.pageRef).toBe('https://example.com/app') + expect(v2.classification.type).toBe('saas-app') + expect(v2.classification.signalsAgreed).toBe(true) + + for (const dim of DIMENSIONS) { + expect(v2.scores[dim]).toBeDefined() + expect(v2.scores[dim].score).toBe(8) + expect(v2.scores[dim].range[0]).toBeLessThanOrEqual(v2.scores[dim].score) + expect(v2.scores[dim].range[1]).toBeGreaterThanOrEqual(v2.scores[dim].score) + } + + expect(v2.rollup.score).toBeCloseTo(8, 1) + expect(v2.rollup.confidence).toBe('high') + expect(v2.rollup.rule).toContain('saas-app') + + expect(Array.isArray(v2.findings)).toBe(true) + expect(v2.findings.length).toBeGreaterThan(0) + for (const f of v2.findings) { + expect(typeof f.id).toBe('string') + expect(f.id.length).toBeGreaterThan(0) + expect(['product_intent', 'visual_craft', 'trust_clarity', 'workflow', 'content_ia']).toContain(f.dimension) + expect(['polish', 'job', 'measurement']).toContain(f.kind) + expect(Array.isArray(f.patches)).toBe(true) + } + + expect(Array.isArray(v2.topFixes)).toBe(true) + expect(v2.topFixes.length).toBeLessThanOrEqual(5) + for (const fixId of v2.topFixes) { + expect(v2.findings.some((f) => f.id === fixId)).toBe(true) + } + 
+ expect(Array.isArray(v2.ethicsViolations)).toBe(true) + expect(Array.isArray(v2.matchedPatterns)).toBe(true) + expect(v2.modality).toBe('html') + expect(typeof v2.evaluatedAt).toBe('string') + expect(typeof v2.promptHash).toBe('string') + expect(typeof v2.rubricHash).toBe('string') + expect(Array.isArray(v2.passes)).toBe(true) + }) + + it('rollup score reflects per-page-type weights (saas-app vs marketing)', async () => { + const { brain, state } = fakeStateWithoutBrain() + const scores = uniformScores(7, 'high') + // tilt one dimension low + scores.product_intent = { score: 3, range: [2, 4], confidence: 'high', summary: '', primaryFindings: [] } + + const saas = await buildAuditResultV2({ + brain, + state, + pageRef: 'x', + ensemble: fakeEnsemble('saas-app'), + rubric: fakeRubric(), + measurements: fakeMeasurements(), + v1Result: fakeV1(), + precomputedScores: scores, + }) + + const marketing = await buildAuditResultV2({ + brain, + state, + pageRef: 'x', + ensemble: fakeEnsemble('marketing'), + rubric: fakeRubric(), + measurements: fakeMeasurements(), + v1Result: fakeV1(), + precomputedScores: scores, + }) + + // saas-app weights product_intent at 0.35 vs marketing 0.30 — saas penalized more. + expect(saas.rollup.score).toBeLessThan(marketing.rollup.score) + }) + + it('falls back to synthesized scores when LLM call fails', async () => { + const { brain, state } = fakeStateWithoutBrain() + const v2 = await buildAuditResultV2({ + brain, + state, + pageRef: 'x', + ensemble: fakeEnsemble('saas-app'), + rubric: fakeRubric(), + measurements: fakeMeasurements(), + v1Result: fakeV1(6), + }) + // Synthesized fallback: every dim equals v1 score, confidence 'low'. 
+ expect(v2.scores.product_intent.score).toBe(6) + expect(v2.scores.product_intent.confidence).toBe('low') + expect(v2.rollup.confidence).toBe('low') + }) + + it('classification carries ensembleConfidence + signalsAgreed', async () => { + const { brain, state } = fakeStateWithoutBrain() + const ensemble: EnsembleClassification = { + ...fakeEnsemble('saas-app'), + ensembleConfidence: 0.42, + signalsAgreed: false, + dissent: [{ source: 'dom-heuristic', type: 'marketing' }], + } + const v2 = await buildAuditResultV2({ + brain, + state, + pageRef: 'x', + ensemble, + rubric: fakeRubric(), + measurements: fakeMeasurements(), + v1Result: fakeV1(), + precomputedScores: uniformScores(6), + }) + expect(v2.classification.ensembleConfidence).toBe(0.42) + expect(v2.classification.signalsAgreed).toBe(false) + expect(v2.classification.dissent?.length).toBe(1) + }) + + it('fixture-style assertion: low product_intent + saas-app → rollup ≤ 6', async () => { + const { brain, state } = fakeStateWithoutBrain() + const scores = uniformScores(5) + scores.product_intent = { score: 3, range: [2, 4], confidence: 'medium', summary: '', primaryFindings: [] } + scores.workflow = { score: 4, range: [3, 5], confidence: 'medium', summary: '', primaryFindings: [] } + + const v2 = await buildAuditResultV2({ + brain, + state, + pageRef: 'fixture://no-primary-action', + ensemble: fakeEnsemble('saas-app'), + rubric: fakeRubric(), + measurements: fakeMeasurements(), + v1Result: fakeV1(4), + precomputedScores: scores, + }) + expect(v2.scores.product_intent.score).toBeLessThanOrEqual(4) + expect(v2.rollup.score).toBeLessThanOrEqual(6) + }) +}) From 9d46ca8f85a954467ea6fd1fccf0c459d9b3b650 Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Sun, 26 Apr 2026 00:15:20 -0600 Subject: [PATCH 3/5] fix(design-audit): ethics rules copy to dist, URL exclusion from text blob, health domain in medical rules MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three fixes discovered 
during smoke testing: 1. ethics/rules/*.yaml were not being copied to dist/ at build time — copy-static-assets.mjs only copied rubric fragments and anchors. Added ethics rules entry so the gate actually loads its rules at runtime. 2. pageTextBlob included the request URL in the content blob, causing false negatives on pattern-absent rules: a URL like medical-no-dosage.html contains "dosage" and suppressed the dosage-warning-required rule. URL is now excluded from the blob; URL-based classification uses the ensemble classifier's own URL heuristic. 3. Medical ethics rules matched domain: [medical, clinical, pharmacy] but the LLM classifies pharmacy-style ecommerce pages as domain "health". Added "health" to the domain list so the rules apply correctly. --- scripts/copy-static-assets.mjs | 8 ++++++++ src/design/audit/ethics/check.ts | 4 ++-- src/design/audit/ethics/rules/medical.yaml | 6 +++--- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/scripts/copy-static-assets.mjs b/scripts/copy-static-assets.mjs index ed050f2..cbff8af 100644 --- a/scripts/copy-static-assets.mjs +++ b/scripts/copy-static-assets.mjs @@ -6,6 +6,8 @@ * * Currently copies: * - src/design/audit/rubric/fragments/*.md (rubric library) + * - src/design/audit/rubric/anchors/*.yaml (calibration anchors) + * - src/design/audit/ethics/rules/*.yaml (ethics gate rules) * - src/viewer/*.html (session viewer UI) */ @@ -28,6 +30,12 @@ const COPIES = [ dest: 'dist/design/audit/rubric/anchors', pattern: /\.ya?ml$/, }, + { + label: 'ethics rule(s)', + src: 'src/design/audit/ethics/rules', + dest: 'dist/design/audit/ethics/rules', + pattern: /\.ya?ml$/, + }, { label: 'viewer asset(s)', src: 'src/viewer', diff --git a/src/design/audit/ethics/check.ts b/src/design/audit/ethics/check.ts index 520322c..4fe85c6 100644 --- a/src/design/audit/ethics/check.ts +++ b/src/design/audit/ethics/check.ts @@ -176,8 +176,8 @@ export async function runLlmClassifier( return first === 'yes' } -/** Build the lowercased 
text blob used by pattern detectors. */ +/** Build the lowercased text blob used by pattern detectors. URL is intentionally excluded — URL path tokens (e.g. "no-dosage") would cause false negatives on pattern-absent rules. */ export function pageTextBlob(snapshot: string, extra?: { url?: string; title?: string }): string { - const parts = [snapshot, extra?.title ?? '', extra?.url ?? ''] + const parts = [snapshot, extra?.title ?? ''] return parts.join('\n').toLowerCase() } diff --git a/src/design/audit/ethics/rules/medical.yaml b/src/design/audit/ethics/rules/medical.yaml index e21129c..64d9a1f 100644 --- a/src/design/audit/ethics/rules/medical.yaml +++ b/src/design/audit/ethics/rules/medical.yaml @@ -5,7 +5,7 @@ category: medical severity: critical-floor appliesWhen: - domain: [medical, clinical, pharmacy] + domain: [medical, clinical, pharmacy, health] detector: kind: pattern-absent pattern: (dosage|warning|contraindication|adverse|side effect) @@ -16,7 +16,7 @@ category: medical severity: major-floor appliesWhen: - domain: [medical, clinical, pharmacy] + domain: [medical, clinical, pharmacy, health] detector: kind: pattern-absent pattern: (report.{0,20}(side effect|adverse|reaction)|MedWatch|FDA 1088|pharmacovigilance) @@ -27,7 +27,7 @@ category: medical severity: major-floor appliesWhen: - domain: [medical, clinical, pharmacy] + domain: [medical, clinical, pharmacy, health] audienceVulnerability: [patient-facing] detector: kind: llm-classifier From 57539d2cd61fadf7ceafb3bb3ebe58a924c1336c Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Sun, 26 Apr 2026 00:33:11 -0600 Subject: [PATCH 4/5] fix(test): replace node --experimental-strip-types with tsx in telemetry rollup tests PR #79 added two telemetry-rollup-remote tests that spawn `node --experimental-strip-types ROLLUP_PATH`. 
That flag is fully supported only on Node 22+; on Node 18 and 20 (both in our CI matrix), Node exits 9 (invalid argument) before the rollup script runs, so the tests assert exit 2 but get exit 9. Replace with `tsx` (added as a devDependency) which works identically across all Node versions. The behavior under test is unchanged: rollup --remote without BAD_TELEMETRY_API / BAD_TELEMETRY_ADMIN_BEARER must exit 2 with a clear stderr message. --- package.json | 1 + pnpm-lock.yaml | 65 ++++++++++++++++++++++++--- tests/telemetry-rollup-remote.test.ts | 9 ++-- 3 files changed, 64 insertions(+), 11 deletions(-) diff --git a/package.json b/package.json index e48be73..00a47db 100644 --- a/package.json +++ b/package.json @@ -133,6 +133,7 @@ "pixelmatch": "^7.1.0", "playwright": "^1.40.0", "pngjs": "^7.0.0", + "tsx": "^4.21.0", "typescript": "^5.3.0", "vitest": "^4.0.18" } diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index db1fe17..c30b3e0 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -66,12 +66,15 @@ importers: pngjs: specifier: ^7.0.0 version: 7.0.0 + tsx: + specifier: ^4.21.0 + version: 4.21.0 typescript: specifier: ^5.3.0 version: 5.9.3 vitest: specifier: ^4.0.18 - version: 4.0.18(@opentelemetry/api@1.9.0)(@types/node@20.19.35) + version: 4.0.18(@opentelemetry/api@1.9.0)(@types/node@20.19.35)(tsx@4.21.0) packages: @@ -376,56 +379,66 @@ packages: resolution: {integrity: sha512-9B+taZ8DlyyqzZQnoeIvDVR/2F4EbMepXMc/NdVbkzsJbzkUjhXv/70GQJ7tdLA4YJgNP25zukcxpX2/SueNrA==} cpu: [arm64] os: [linux] + libc: [glibc] '@img/sharp-libvips-linux-arm@1.0.5': resolution: {integrity: sha512-gvcC4ACAOPRNATg/ov8/MnbxFDJqf/pDePbBnuBDcjsI8PssmjoKMAz4LtLaVi+OnSb5FK/yIOamqDwGmXW32g==} cpu: [arm] os: [linux] + libc: [glibc] '@img/sharp-libvips-linux-x64@1.0.4': resolution: {integrity: sha512-MmWmQ3iPFZr0Iev+BAgVMb3ZyC4KeFc3jFxnNbEPas60e1cIfevbtuyf9nDGIzOaW9PdnDciJm+wFFaTlj5xYw==} cpu: [x64] os: [linux] + libc: [glibc] '@img/sharp-libvips-linuxmusl-arm64@1.0.4': resolution: {integrity: 
sha512-9Ti+BbTYDcsbp4wfYib8Ctm1ilkugkA/uscUn6UXK1ldpC1JjiXbLfFZtRlBhjPZ5o1NCLiDbg8fhUPKStHoTA==} cpu: [arm64] os: [linux] + libc: [musl] '@img/sharp-libvips-linuxmusl-x64@1.0.4': resolution: {integrity: sha512-viYN1KX9m+/hGkJtvYYp+CCLgnJXwiQB39damAO7WMdKWlIhmYTfHjwSbQeUK/20vY154mwezd9HflVFM1wVSw==} cpu: [x64] os: [linux] + libc: [musl] '@img/sharp-linux-arm64@0.33.5': resolution: {integrity: sha512-JMVv+AMRyGOHtO1RFBiJy/MBsgz0x4AWrT6QoEVVTyh1E39TrCUpTRI7mx9VksGX4awWASxqCYLCV4wBZHAYxA==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [arm64] os: [linux] + libc: [glibc] '@img/sharp-linux-arm@0.33.5': resolution: {integrity: sha512-JTS1eldqZbJxjvKaAkxhZmBqPRGmxgu+qFKSInv8moZ2AmT5Yib3EQ1c6gp493HvrvV8QgdOXdyaIBrhvFhBMQ==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [arm] os: [linux] + libc: [glibc] '@img/sharp-linux-x64@0.33.5': resolution: {integrity: sha512-opC+Ok5pRNAzuvq1AG0ar+1owsu842/Ab+4qvU879ippJBHvyY5n2mxF1izXqkPYlGuP/M556uh53jRLJmzTWA==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [x64] os: [linux] + libc: [glibc] '@img/sharp-linuxmusl-arm64@0.33.5': resolution: {integrity: sha512-XrHMZwGQGvJg2V/oRSUfSAfjfPxO+4DkiRh6p2AFjLQztWUuY/o8Mq0eMQVIY7HJ1CDQUJlxGGZRw1a5bqmd1g==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [arm64] os: [linux] + libc: [musl] '@img/sharp-linuxmusl-x64@0.33.5': resolution: {integrity: sha512-WT+d/cgqKkkKySYmqoZ8y3pxx7lx9vVejxW/W4DOFMYVSkErR+w7mf2u8m/y4+xHe7yY9DAXQMWQhpnMuFfScw==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [x64] os: [linux] + libc: [musl] '@img/sharp-win32-x64@0.33.5': resolution: {integrity: sha512-MpY/o8/8kj+EcnxwvrP4aTJSWw/aZ7JIGR4aBeZkZw5B7/Jn+tY9/VNwtcoGmdT7GfggGIU4kygOMSbYnOrAbg==} @@ -542,66 +555,79 @@ packages: resolution: {integrity: sha512-t4ONHboXi/3E0rT6OZl1pKbl2Vgxf9vJfWgmUoCEVQVxhW6Cw/c8I6hbbu7DAvgp82RKiH7TpLwxnJeKv2pbsw==} cpu: [arm] os: [linux] + libc: [glibc] '@rollup/rollup-linux-arm-musleabihf@4.59.0': resolution: {integrity: 
sha512-CikFT7aYPA2ufMD086cVORBYGHffBo4K8MQ4uPS/ZnY54GKj36i196u8U+aDVT2LX4eSMbyHtyOh7D7Zvk2VvA==} cpu: [arm] os: [linux] + libc: [musl] '@rollup/rollup-linux-arm64-gnu@4.59.0': resolution: {integrity: sha512-jYgUGk5aLd1nUb1CtQ8E+t5JhLc9x5WdBKew9ZgAXg7DBk0ZHErLHdXM24rfX+bKrFe+Xp5YuJo54I5HFjGDAA==} cpu: [arm64] os: [linux] + libc: [glibc] '@rollup/rollup-linux-arm64-musl@4.59.0': resolution: {integrity: sha512-peZRVEdnFWZ5Bh2KeumKG9ty7aCXzzEsHShOZEFiCQlDEepP1dpUl/SrUNXNg13UmZl+gzVDPsiCwnV1uI0RUA==} cpu: [arm64] os: [linux] + libc: [musl] '@rollup/rollup-linux-loong64-gnu@4.59.0': resolution: {integrity: sha512-gbUSW/97f7+r4gHy3Jlup8zDG190AuodsWnNiXErp9mT90iCy9NKKU0Xwx5k8VlRAIV2uU9CsMnEFg/xXaOfXg==} cpu: [loong64] os: [linux] + libc: [glibc] '@rollup/rollup-linux-loong64-musl@4.59.0': resolution: {integrity: sha512-yTRONe79E+o0FWFijasoTjtzG9EBedFXJMl888NBEDCDV9I2wGbFFfJQQe63OijbFCUZqxpHz1GzpbtSFikJ4Q==} cpu: [loong64] os: [linux] + libc: [musl] '@rollup/rollup-linux-ppc64-gnu@4.59.0': resolution: {integrity: sha512-sw1o3tfyk12k3OEpRddF68a1unZ5VCN7zoTNtSn2KndUE+ea3m3ROOKRCZxEpmT9nsGnogpFP9x6mnLTCaoLkA==} cpu: [ppc64] os: [linux] + libc: [glibc] '@rollup/rollup-linux-ppc64-musl@4.59.0': resolution: {integrity: sha512-+2kLtQ4xT3AiIxkzFVFXfsmlZiG5FXYW7ZyIIvGA7Bdeuh9Z0aN4hVyXS/G1E9bTP/vqszNIN/pUKCk/BTHsKA==} cpu: [ppc64] os: [linux] + libc: [musl] '@rollup/rollup-linux-riscv64-gnu@4.59.0': resolution: {integrity: sha512-NDYMpsXYJJaj+I7UdwIuHHNxXZ/b/N2hR15NyH3m2qAtb/hHPA4g4SuuvrdxetTdndfj9b1WOmy73kcPRoERUg==} cpu: [riscv64] os: [linux] + libc: [glibc] '@rollup/rollup-linux-riscv64-musl@4.59.0': resolution: {integrity: sha512-nLckB8WOqHIf1bhymk+oHxvM9D3tyPndZH8i8+35p/1YiVoVswPid2yLzgX7ZJP0KQvnkhM4H6QZ5m0LzbyIAg==} cpu: [riscv64] os: [linux] + libc: [musl] '@rollup/rollup-linux-s390x-gnu@4.59.0': resolution: {integrity: sha512-oF87Ie3uAIvORFBpwnCvUzdeYUqi2wY6jRFWJAy1qus/udHFYIkplYRW+wo+GRUP4sKzYdmE1Y3+rY5Gc4ZO+w==} cpu: [s390x] os: [linux] + libc: [glibc] 
'@rollup/rollup-linux-x64-gnu@4.59.0': resolution: {integrity: sha512-3AHmtQq/ppNuUspKAlvA8HtLybkDflkMuLK4DPo77DfthRb71V84/c4MlWJXixZz4uruIH4uaa07IqoAkG64fg==} cpu: [x64] os: [linux] + libc: [glibc] '@rollup/rollup-linux-x64-musl@4.59.0': resolution: {integrity: sha512-2UdiwS/9cTAx7qIUZB/fWtToJwvt0Vbo0zmnYt7ED35KPg13Q0ym1g442THLC7VyI6JfYTP4PiSOWyoMdV2/xg==} cpu: [x64] os: [linux] + libc: [musl] '@rollup/rollup-openbsd-x64@4.59.0': resolution: {integrity: sha512-M3bLRAVk6GOwFlPTIxVBSYKUaqfLrn8l0psKinkCFxl4lQvOSz8ZrKDz2gxcBwHFpci0B6rttydI4IpS4IS/jQ==} @@ -890,6 +916,9 @@ packages: engines: {node: ^8.16.0 || ^10.6.0 || >=11.0.0} os: [darwin] + get-tsconfig@4.14.0: + resolution: {integrity: sha512-yTb+8DXzDREzgvYmh6s9vHsSVCHeC0G3PI5bEXNBHtmshPnO+S5O7qgLEOn0I5QvMy6kpZN8K1NKGyilLb93wA==} + glob-parent@5.1.2: resolution: {integrity: sha512-AOIgSQCepiJYwP3ARnGx+5VnTu2HBYdzbGP45eLw1vr3zB3vZLeyed1sC9hnbcOc9/SrMyM5RPQrkGz4aS9Zow==} engines: {node: '>= 6'} @@ -1165,6 +1194,9 @@ packages: resolution: {integrity: sha512-qYg9KP24dD5qka9J47d0aVky0N+b4fTU89LN9iDnjB5waksiC49rvMB0PrUJQGoTmH50XPiqOvAjDfaijGxYZw==} engines: {node: '>=8'} + resolve-pkg-maps@1.0.0: + resolution: {integrity: sha512-seS2Tj26TBVOC2NIc2rOe2y2ZO7efxITtLZcGSOnHHNOQ7CkiUBfw0Iw2ck6xkIhPwLhKNLS8BO+hEpngQlqzw==} + reusify@1.1.0: resolution: {integrity: sha512-g6QUff04oZpHs0eG5p83rFLhHeV00ug/Yf9nZM6fLeUrPguBTkTQOdpAWWspMh55TZfVQDPaN3NQJfbVRAxdIw==} engines: {iojs: '>=1.0.0', node: '>=0.10.0'} @@ -1260,6 +1292,11 @@ packages: tr46@0.0.3: resolution: {integrity: sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==} + tsx@4.21.0: + resolution: {integrity: sha512-5C1sg4USs1lfG0GFb2RLXsdpXqBSEhAaA/0kPL01wxzpMqLILNxIxIOKiILz+cdg/pLnOUxFYOR5yhHU666wbw==} + engines: {node: '>=18.0.0'} + hasBin: true + typedarray@0.0.6: resolution: {integrity: sha512-/aCDEGatGvZ2BIk+HmLf4ifCJFwvKFNb9/JeZPMulfgFracn9QFcAf5GO8B/mweUjSoblS5In0cWhqpfs/5PQA==} @@ -1913,13 +1950,13 @@ snapshots: chai: 
6.2.2 tinyrainbow: 3.0.3 - '@vitest/mocker@4.0.18(vite@7.3.1(@types/node@20.19.35))': + '@vitest/mocker@4.0.18(vite@7.3.1(@types/node@20.19.35)(tsx@4.21.0))': dependencies: '@vitest/spy': 4.0.18 estree-walker: 3.0.3 magic-string: 0.30.21 optionalDependencies: - vite: 7.3.1(@types/node@20.19.35) + vite: 7.3.1(@types/node@20.19.35)(tsx@4.21.0) '@vitest/pretty-format@4.0.18': dependencies: @@ -2159,6 +2196,10 @@ snapshots: fsevents@2.3.3: optional: true + get-tsconfig@4.14.0: + dependencies: + resolve-pkg-maps: 1.0.0 + glob-parent@5.1.2: dependencies: is-glob: 4.0.3 @@ -2371,6 +2412,8 @@ snapshots: resolve-from@5.0.0: {} + resolve-pkg-maps@1.0.0: {} + reusify@1.1.0: {} rollup@4.59.0: @@ -2468,6 +2511,13 @@ snapshots: tr46@0.0.3: {} + tsx@4.21.0: + dependencies: + esbuild: 0.27.3 + get-tsconfig: 4.14.0 + optionalDependencies: + fsevents: 2.3.3 + typedarray@0.0.6: {} typescript@5.9.3: {} @@ -2478,7 +2528,7 @@ snapshots: util-deprecate@1.0.2: {} - vite@7.3.1(@types/node@20.19.35): + vite@7.3.1(@types/node@20.19.35)(tsx@4.21.0): dependencies: esbuild: 0.27.3 fdir: 6.5.0(picomatch@4.0.3) @@ -2489,11 +2539,12 @@ snapshots: optionalDependencies: '@types/node': 20.19.35 fsevents: 2.3.3 + tsx: 4.21.0 - vitest@4.0.18(@opentelemetry/api@1.9.0)(@types/node@20.19.35): + vitest@4.0.18(@opentelemetry/api@1.9.0)(@types/node@20.19.35)(tsx@4.21.0): dependencies: '@vitest/expect': 4.0.18 - '@vitest/mocker': 4.0.18(vite@7.3.1(@types/node@20.19.35)) + '@vitest/mocker': 4.0.18(vite@7.3.1(@types/node@20.19.35)(tsx@4.21.0)) '@vitest/pretty-format': 4.0.18 '@vitest/runner': 4.0.18 '@vitest/snapshot': 4.0.18 @@ -2510,7 +2561,7 @@ snapshots: tinyexec: 1.0.2 tinyglobby: 0.2.15 tinyrainbow: 3.0.3 - vite: 7.3.1(@types/node@20.19.35) + vite: 7.3.1(@types/node@20.19.35)(tsx@4.21.0) why-is-node-running: 2.3.0 optionalDependencies: '@opentelemetry/api': 1.9.0 diff --git a/tests/telemetry-rollup-remote.test.ts b/tests/telemetry-rollup-remote.test.ts index d3add21..57a7f4a 100644 --- 
a/tests/telemetry-rollup-remote.test.ts +++ b/tests/telemetry-rollup-remote.test.ts @@ -9,6 +9,7 @@ process.env.BAD_TELEMETRY_ROLLUP_NO_AUTORUN = '1' const { buildRemoteUrl } = await import('../bench/telemetry/rollup.js') const ROLLUP_PATH = path.resolve(__dirname, '..', 'bench', 'telemetry', 'rollup.ts') +const TSX_BIN = path.resolve(__dirname, '..', 'node_modules', '.bin', 'tsx') describe('rollup --remote URL building', () => { it('appends repo, kind, since, until query params when set', () => { @@ -60,8 +61,8 @@ describe('rollup --remote env requirements', () => { delete env.BAD_TELEMETRY_ADMIN_BEARER delete env.BAD_TELEMETRY_ROLLUP_NO_AUTORUN const out = spawnSync( - process.execPath, - ['--experimental-strip-types', '--no-warnings', ROLLUP_PATH, '--remote'], + TSX_BIN, + [ROLLUP_PATH, '--remote'], { encoding: 'utf-8', env }, ) expect(out.status).toBe(2) @@ -73,8 +74,8 @@ describe('rollup --remote env requirements', () => { delete env.BAD_TELEMETRY_ADMIN_BEARER delete env.BAD_TELEMETRY_ROLLUP_NO_AUTORUN const out = spawnSync( - process.execPath, - ['--experimental-strip-types', '--no-warnings', ROLLUP_PATH, '--remote'], + TSX_BIN, + [ROLLUP_PATH, '--remote'], { encoding: 'utf-8', env }, ) expect(out.status).toBe(2) From 3c8da132b84d97f4360c0cf96d9603ba734ce50d Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Sun, 26 Apr 2026 03:37:18 -0600 Subject: [PATCH 5/5] feat(jobs+reports): comparative-audit jobs API + AI SDK report tool surface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three new modules layered on top of the existing audit pipeline: src/jobs/ declarative comparative-audit jobs (JobSpec → discover → fan-out → persist). Crash-safe JSONL store at ~/.bad/jobs/, bounded-concurrency runner, pre-flight cost estimate. CLI: bad jobs {create,list,status,estimate}. src/discover/ wayback (CDX API) + list sources turn a DiscoverSpec into audit targets. Pluggable fetch for tests, status-200-only filter on by default. 
src/reports/ deterministic aggregation (leaderboard / tierBuckets / compareRuns / longitudinal) + markdown templates + AI SDK tool surface so a browser-side agent can interrogate jobs without re-implementing aggregation. CLI: bad reports generate --job --template . The contract: every number in any report flows through pure aggregate.ts functions — never an LLM. Optional narrate() prepends an exec summary, same pattern as the audit-patches layer (agent narrates, code computes). +55 tests across jobs-store, jobs-queue, jobs-cost-estimate, discover-wayback, reports-aggregate, reports-templates, reports-tools. Total: 1448 passing. --- .changeset/jobs-reports-content-engine.md | 28 +++ src/cli-jobs.ts | 200 ++++++++++++++++++++++ src/cli-reports.ts | 104 +++++++++++ src/cli.ts | 16 +- src/discover/index.ts | 34 ++++ src/discover/wayback.ts | 142 +++++++++++++++ src/jobs/cost-estimate.ts | 24 +++ src/jobs/index.ts | 52 ++++++ src/jobs/queue.ts | 114 ++++++++++++ src/jobs/store.ts | 107 ++++++++++++ src/jobs/types.ts | 109 ++++++++++++ src/reports/aggregate.ts | 180 +++++++++++++++++++ src/reports/index.ts | 28 +++ src/reports/narrate.ts | 47 +++++ src/reports/templates.ts | 163 ++++++++++++++++++ src/reports/tools.ts | 173 +++++++++++++++++++ src/reports/types.ts | 56 ++++++ tests/discover-wayback.test.ts | 122 +++++++++++++ tests/jobs-cost-estimate.test.ts | 30 ++++ tests/jobs-queue.test.ts | 95 ++++++++++ tests/jobs-store.test.ts | 89 ++++++++++ tests/reports-aggregate.test.ts | 142 +++++++++++++++ tests/reports-templates.test.ts | 84 +++++++++ tests/reports-tools.test.ts | 117 +++++++++++++ 24 files changed, 2255 insertions(+), 1 deletion(-) create mode 100644 .changeset/jobs-reports-content-engine.md create mode 100644 src/cli-jobs.ts create mode 100644 src/cli-reports.ts create mode 100644 src/discover/index.ts create mode 100644 src/discover/wayback.ts create mode 100644 src/jobs/cost-estimate.ts create mode 100644 src/jobs/index.ts create mode 100644 
src/jobs/queue.ts create mode 100644 src/jobs/store.ts create mode 100644 src/jobs/types.ts create mode 100644 src/reports/aggregate.ts create mode 100644 src/reports/index.ts create mode 100644 src/reports/narrate.ts create mode 100644 src/reports/templates.ts create mode 100644 src/reports/tools.ts create mode 100644 src/reports/types.ts create mode 100644 tests/discover-wayback.test.ts create mode 100644 tests/jobs-cost-estimate.test.ts create mode 100644 tests/jobs-queue.test.ts create mode 100644 tests/jobs-store.test.ts create mode 100644 tests/reports-aggregate.test.ts create mode 100644 tests/reports-templates.test.ts create mode 100644 tests/reports-tools.test.ts diff --git a/.changeset/jobs-reports-content-engine.md b/.changeset/jobs-reports-content-engine.md new file mode 100644 index 0000000..98095ab --- /dev/null +++ b/.changeset/jobs-reports-content-engine.md @@ -0,0 +1,28 @@ +--- +'@tangle-network/browser-agent-driver': minor +--- + +feat(jobs+reports): comparative-audit jobs API + AI SDK report tool surface + +Three new modules layered cleanly on top of the existing audit pipeline. Lets you declaratively audit N URLs (optionally expanded into M historical wayback snapshots each), aggregate the results, and emit shareable markdown reports — or expose the same data as AI SDK tools so a browser-side agent can answer ad-hoc questions. + +**`src/jobs/`** — declarative comparative-audit jobs. +- `JobSpec` JSON describes targets + audit options + cost cap; `createJob` mints and persists; `runJob` fans out with bounded concurrency and crash-safe per-result writes to `~/.bad/jobs/`. +- Pre-flight cost estimate (`estimateCost`) refuses jobs that would silently spend more than `maxCostUSD`. +- `AuditFn` injection keeps the queue decoupled from Playwright/LLM for tests. +- CLI: `bad jobs create --spec `, `bad jobs status `, `bad jobs list`, `bad jobs estimate --spec `. + +**`src/discover/`** — turn a `DiscoverSpec` into audit targets. 
+- `wayback` source uses archive.org's CDX API to list captures, then samples `count` evenly across the time range. +- `list` source is a pass-through. +- Pluggable `fetch` for tests; status-200-only filter on by default so 4xx snapshots don't poison the job. + +**`src/reports/`** — turn a job into an artifact. +- `aggregateJob` reads each per-target `report.json`, projects to `AggregateRow` (rollup, dimensions, ethics count). All numbers in any report flow through this — never an LLM. +- `leaderboard`, `longitudinalFor`, `compareRuns`, `tierBuckets` are pure functions over rows. +- `renderLeaderboard` / `renderLongitudinal` / `renderBatchComparison` produce deterministic markdown. +- `narrateReport(brain, body)` optionally prepends an LLM exec-summary; without `brain`, returns the deterministic body unchanged. Same contract as the audit-patches layer: agent narrates, code computes. +- `buildReportTools()` exposes a 7-tool AI SDK surface (`queryJob`, `fetchAudit`, `compareRuns`, `longitudinal`, `tierBuckets`, `renderTemplate`, `runFreshAudit`) so a browser-side agent can interrogate jobs without re-implementing aggregation. +- CLI: `bad reports generate --job --template [--top N --by-type X --buckets 10,100 --narrate --out file.md]`. + +**Tests:** +55 across `jobs-store`, `jobs-queue`, `jobs-cost-estimate`, `discover-wayback`, `reports-aggregate`, `reports-templates`, `reports-tools`. Total: 1448 passing. diff --git a/src/cli-jobs.ts b/src/cli-jobs.ts new file mode 100644 index 0000000..ea5b6af --- /dev/null +++ b/src/cli-jobs.ts @@ -0,0 +1,200 @@ +/** + * `bad jobs` — declarative comparative-audit jobs. + * + * Subcommands: + * bad jobs create --spec # mints a job from a JSON spec, runs it + * bad jobs status [--json] # show a job's current state + * bad jobs list [--json] # recent jobs + * bad jobs estimate --spec # pre-flight cost estimate, no execution + * + * The spec file is JSON (no yaml dep). See `JobSpec` in src/jobs/types.ts. 
+ */ + +import * as fs from 'node:fs' +import * as path from 'node:path' +import chalk from 'chalk' +import { cliError } from './cli-ui.js' +import { discoverTargets } from './discover/index.js' + +function die(msg: string): never { + cliError(msg) + process.exit(1) +} +import { + createJob, + runJob, + loadJob, + listJobs, + estimateCost, + type JobSpec, + type AuditFn, +} from './jobs/index.js' + +interface ParsedArgs { + spec?: string + json?: boolean + jobId?: string + yes?: boolean +} + +function parseArgs(argv: string[]): ParsedArgs { + const out: ParsedArgs = {} + for (let i = 0; i < argv.length; i++) { + const a = argv[i] + if (a === '--spec') out.spec = argv[++i] + else if (a === '--json') out.json = true + else if (a === '--yes' || a === '-y') out.yes = true + else if (!a.startsWith('-') && !out.jobId) out.jobId = a + } + return out +} + +function readSpec(specPath: string): JobSpec { + if (!fs.existsSync(specPath)) die(`spec file not found: ${specPath}`) + const raw = fs.readFileSync(specPath, 'utf-8') + try { + return JSON.parse(raw) as JobSpec + } catch (err) { + die(`spec file is not valid JSON: ${(err as Error).message}`) + } +} + +export async function runJobsCli(args: string[]): Promise { + const sub = args[0] + const rest = args.slice(1) + const opts = parseArgs(rest) + + if (sub === 'list') return cmdList(opts) + if (sub === 'status') return cmdStatus(opts) + if (sub === 'estimate') return cmdEstimate(opts) + if (sub === 'create') return cmdCreate(opts) + die(`Unknown subcommand: ${sub}. Use create | list | status | estimate.`) +} + +function cmdList(opts: ParsedArgs): void { + const entries = listJobs() + if (opts.json) { + console.log(JSON.stringify(entries, null, 2)) + return + } + if (entries.length === 0) { + console.log(chalk.dim('No jobs yet. Try `bad jobs create --spec `.')) + return + } + for (const e of entries.slice(0, 50)) { + const status = e.status === 'completed' ? 
chalk.green(e.status) + : e.status === 'failed' || e.status === 'cancelled' ? chalk.red(e.status) + : e.status === 'partial' ? chalk.yellow(e.status) + : chalk.dim(e.status) + console.log(` ${e.jobId} ${status} targets=${e.targetCount} ${chalk.dim(e.createdAt)} ${e.label ?? ''}`) + } +} + +function cmdStatus(opts: ParsedArgs): void { + if (!opts.jobId) die('jobId is required: bad jobs status <jobId>') + const job = loadJob(opts.jobId) + if (!job) die(`job not found: ${opts.jobId}`) + if (opts.json) { + console.log(JSON.stringify(job, null, 2)) + return + } + const ok = job.results.filter(r => r.status === 'ok').length + const failed = job.results.filter(r => r.status === 'failed').length + console.log(` ${chalk.bold(job.jobId)} ${chalk.dim(job.status)}`) + if (job.spec.label) console.log(` label: ${job.spec.label}`) + console.log(` targets: ${job.targets.length} · ok: ${ok} · failed: ${failed}`) + console.log(` cost: $${job.totalCostUSD.toFixed(2)}`) + console.log(` created: ${job.createdAt}`) + if (job.completedAt) console.log(` completed: ${job.completedAt}`) +} + +async function cmdEstimate(opts: ParsedArgs): Promise<void> { + if (!opts.spec) die('--spec is required for estimate') + const spec = readSpec(opts.spec) + const targets = await discoverTargets(spec.discover) + const est = estimateCost(spec, targets.length) + if (opts.json) { + console.log(JSON.stringify({ spec, ...est }, null, 2)) + return + } + console.log(` Targets: ${est.targetCount}`) + console.log(` Per-audit: $${est.perAuditUSD.toFixed(2)}`) + console.log(` Estimated total: $${est.estimatedTotalUSD.toFixed(2)}`) + if (est.exceedsCap && spec.maxCostUSD !== undefined) { + console.log(chalk.yellow(` ⚠ exceeds cap of $${spec.maxCostUSD.toFixed(2)}`)) + } +} + +async function cmdCreate(opts: ParsedArgs): Promise<void> { + if (!opts.spec) die('--spec is required: bad jobs create --spec <spec.json>') + const spec = readSpec(opts.spec) + const targets = await discoverTargets(spec.discover) + if (targets.length === 0) die('discover
yielded zero targets — check your URLs / wayback range') + const est = estimateCost(spec, targets.length) + console.log(` Targets discovered: ${targets.length}`) + console.log(` Estimated cost: $${est.estimatedTotalUSD.toFixed(2)}`) + if (est.exceedsCap && spec.maxCostUSD !== undefined) { + die(`Estimated cost $${est.estimatedTotalUSD.toFixed(2)} exceeds maxCostUSD $${spec.maxCostUSD.toFixed(2)}. Raise the cap or shrink the job.`) + } + + const job = createJob(spec, targets) + console.log(` Created job ${chalk.bold(job.jobId)}`) + + const auditFn = await buildAuditFn(spec) + await runJob(job, { auditFn }) + const final = loadJob(job.jobId) + console.log(` Status: ${chalk.bold(final?.status ?? 'unknown')} · ok: ${final?.results.filter(r => r.status === 'ok').length ?? 0}/${final?.targets.length ?? 0} · $${final?.totalCostUSD.toFixed(2)}`) +} + +/** + * Wire the runner to the design-audit pipeline. Imported lazily so `bad jobs + * list` doesn't pull in Playwright. Each target gets its own output dir so + * we can deterministically locate `report.json` after the audit returns. + */ +async function buildAuditFn(_spec: JobSpec): Promise<AuditFn> { + const { runDesignAudit } = await import('./cli-design-audit.js') + let counter = 0 + return async (target, opts) => { + const url = target.snapshotUrl ?? target.url + counter += 1 + const slug = `${slugify(url)}-${Date.now()}-${counter}` + const outputDir = path.join('audit-results', 'jobs', slug) + await runDesignAudit({ + url, + pages: opts?.pages ?? 1, + output: outputDir, + json: true, + headless: opts?.headless ??
true, + audience: opts?.audience?.join(','), + audienceVulnerability: opts?.audienceVulnerability?.join(','), + modality: opts?.modality, + regulatoryContext: opts?.regulatoryContext?.join(','), + skipEthics: opts?.skipEthics, + }) + const reportJson = path.resolve(outputDir, 'report.json') + if (!fs.existsSync(reportJson)) { + throw new Error(`audit completed but report.json missing at ${reportJson}`) + } + const data = JSON.parse(fs.readFileSync(reportJson, 'utf-8')) as { + pages?: Array<{ + score?: number + auditResultV2?: { rollup?: { score?: number }; classification?: { type?: string } } + rollup?: { score?: number } + classification?: { type?: string } + }> + } + const page = data.pages?.[0] + const rollupScore = page?.auditResultV2?.rollup?.score ?? page?.rollup?.score ?? page?.score + const pageType = page?.auditResultV2?.classification?.type ?? page?.classification?.type + return { + runId: outputDir, // The output dir is the de-facto runId for jobs. + resultPath: reportJson, + rollupScore, + pageType, + } + } +} + +function slugify(url: string): string { + return url.replace(/^https?:\/\//, '').replace(/[^a-z0-9]+/gi, '-').replace(/^-|-$/g, '').slice(0, 60).toLowerCase() +} diff --git a/src/cli-reports.ts b/src/cli-reports.ts new file mode 100644 index 0000000..0902575 --- /dev/null +++ b/src/cli-reports.ts @@ -0,0 +1,104 @@ +/** + * `bad reports generate` — produce a markdown artifact from a job's results. + * + * bad reports generate --job --template + * [--out ] [--top ] [--by-type ] [--buckets 10,100,200] + * [--narrate] [--context "...one-line context for the LLM..."] + * + * The report is the deterministic body. `--narrate` prepends an LLM exec + * summary; without it, the artifact is pure data. 
+ */ + +import * as fs from 'node:fs' +import chalk from 'chalk' +import { cliError } from './cli-ui.js' +import { loadJob } from './jobs/index.js' + +function die(msg: string): never { + cliError(msg) + process.exit(1) +} +import { + aggregateJob, + renderLeaderboard, + renderLongitudinal, + renderBatchComparison, + renderJobHeader, + narrateReport, +} from './reports/index.js' + +interface ReportArgs { + job?: string + template?: string + out?: string + top?: number + byType?: string + buckets?: number[] + narrate?: boolean + context?: string + json?: boolean +} + +function parseArgs(argv: string[]): ReportArgs { + const out: ReportArgs = {} + for (let i = 0; i < argv.length; i++) { + const a = argv[i] + if (a === '--job') out.job = argv[++i] + else if (a === '--template') out.template = argv[++i] + else if (a === '--out') out.out = argv[++i] + else if (a === '--top') out.top = Number(argv[++i]) + else if (a === '--by-type') out.byType = argv[++i] + else if (a === '--buckets') out.buckets = argv[++i].split(',').map(s => Number(s.trim())).filter(n => Number.isFinite(n)) + else if (a === '--narrate') out.narrate = true + else if (a === '--context') out.context = argv[++i] + else if (a === '--json') out.json = true + } + return out +} + +const TEMPLATES = new Set(['leaderboard', 'longitudinal', 'batch-comparison']) + +export async function runReportsCli(args: string[]): Promise<void> { + const sub = args[0] + if (sub !== 'generate') die(`Unknown subcommand: ${sub}. Use generate.`) + const opts = parseArgs(args.slice(1)) + if (!opts.job) die('--job is required') + if (!opts.template) die('--template is required (leaderboard | longitudinal | batch-comparison)') + if (!TEMPLATES.has(opts.template)) die(`Unknown template: ${opts.template}.
Valid: ${[...TEMPLATES].join(', ')}`) + + const job = loadJob(opts.job) + if (!job) die(`job not found: ${opts.job}`) + + const rows = aggregateJob(job) + + let body: string + if (opts.template === 'leaderboard') { + body = renderLeaderboard(rows, { topN: opts.top, byType: opts.byType, buckets: opts.buckets }) + } else if (opts.template === 'longitudinal') { + body = renderLongitudinal(rows) + } else { + body = renderBatchComparison(rows) + } + + const header = renderJobHeader(job) + '\n\n---\n\n' + let final = header + body + + if (opts.narrate) { + try { + const { Brain } = await import('./brain/index.js') + const brain = new Brain() + final = await narrateReport(final, { brain, context: opts.context }) + } catch (err) { + console.warn(chalk.yellow(`narrate failed, falling back to deterministic body: ${(err as Error).message}`)) + } + } + + if (opts.out) { + fs.writeFileSync(opts.out, final, 'utf-8') + console.log(` Report written → ${opts.out}`) + } else if (opts.json) { + console.log(JSON.stringify({ jobId: job.jobId, template: opts.template, markdown: final }, null, 2)) + } else { + console.log(final) + } +} diff --git a/src/cli.ts b/src/cli.ts index 34ac2a3..26aa468 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -104,6 +104,20 @@ async function main(): Promise { setCliVersion(readCliVersion()); setInvocation(process.argv.slice(2)[0] || 'unknown', process.argv.slice(2)); + // Subcommand groups with their own arg shape — dispatch before the strict + // parent parser (which only knows the run/design-audit/auth/showcase flags). 
+ const subArgs = process.argv.slice(2); + if (subArgs[0] === 'jobs') { + const { runJobsCli } = await import('./cli-jobs.js'); + await runJobsCli(subArgs.slice(1)); + process.exit(0); + } + if (subArgs[0] === 'reports') { + const { runReportsCli } = await import('./cli-reports.js'); + await runReportsCli(subArgs.slice(1)); + process.exit(0); + } + const { values, positionals } = parseArgs({ allowPositionals: true, allowNegative: true, @@ -552,7 +566,7 @@ async function main(): Promise { } if (command !== 'run' && command !== 'attach') { - cliError(`Unknown command: ${command}. Use "run", "attach", "preview", "runs", "view", "share", "chrome-debug", "design-audit", "showcase", or "auth".`); + cliError(`Unknown command: ${command}. Use "run", "attach", "preview", "runs", "view", "share", "chrome-debug", "design-audit", "showcase", "auth", "jobs", or "reports".`); process.exit(1); } diff --git a/src/discover/index.ts b/src/discover/index.ts new file mode 100644 index 0000000..d9d6a0a --- /dev/null +++ b/src/discover/index.ts @@ -0,0 +1,34 @@ +/** + * Discoverers — turn a job's `discover` spec into a flat list of audit targets. + * + * Two sources: + * list — explicit URLs (passthrough). + * wayback — expand each URL into N historical snapshots via the CDX API. + */ + +import type { DiscoverSpec, JobTarget } from '../jobs/types.js' +import { expandWaybackTargets, type WaybackOptions } from './wayback.js' + +export { discoverWaybackSnapshots, expandWaybackTargets, snapshotUrl, sampleEvenly, parseCdxRows, cdxStampToIso } from './wayback.js' +export type { WaybackOptions } from './wayback.js' + +export interface DiscoverOptions { + /** Injected fetch (tests). 
*/ + fetch?: typeof fetch +} + +export async function discoverTargets(spec: DiscoverSpec, opts: DiscoverOptions = {}): Promise<JobTarget[]> { + if (spec.source === 'list') { + return spec.urls.map(url => ({ url })) + } + if (spec.source === 'wayback') { + const wb: WaybackOptions = { + count: spec.snapshotsPerUrl, + since: spec.since, + until: spec.until, + fetch: opts.fetch, + } + return expandWaybackTargets(spec.urls, wb) + } + throw new Error(`discover: unsupported source "${(spec as DiscoverSpec).source}"`) +} diff --git a/src/discover/wayback.ts b/src/discover/wayback.ts new file mode 100644 index 0000000..d779690 --- /dev/null +++ b/src/discover/wayback.ts @@ -0,0 +1,142 @@ +/** + * Wayback Machine snapshot discoverer (CDX API). + * + * Uses archive.org's CDX server to list captures of a URL, then samples + * `count` snapshots evenly across the time range. The CDX API is rate-limited + * (~15 req/s) but for the small N we use here we never approach the cap. + * + * CDX response shape: + * [["urlkey","timestamp","original","mimetype","statuscode","digest","length"], + * ["com,stripe)/", "20100101120000", "https://stripe.com/", "text/html", "200", "...", "1234"], + * ...] + * + * Snapshot URL = `https://web.archive.org/web/<timestamp>/<original>`. + */ + +import type { JobTarget } from '../jobs/types.js' + +const CDX_ENDPOINT = 'https://web.archive.org/cdx/search/cdx' + +export interface WaybackOptions { + /** Snapshots per URL. Default 5. */ + count?: number + /** Lower-bound capture date (ISO 8601 — e.g. "2010-01-01"). */ + since?: string + /** Upper-bound capture date. */ + until?: string + /** Limit only HTTP 200 captures. Default true — 4xx/5xx snapshots aren't auditable. */ + status200Only?: boolean + /** Injected fetch (tests). Defaults to globalThis.fetch (Node 18+).
*/ + fetch?: typeof fetch +} + +interface CdxRow { + urlkey: string + timestamp: string + original: string + mimetype: string + statuscode: string + digest: string + length: string +} + +const DEFAULT_COUNT = 5 + +/** Convert ISO date "2010-01-01" to CDX yyyymmddhhmmss. */ +function isoToCdxStamp(iso: string, end = false): string { + const d = new Date(iso) + if (Number.isNaN(d.getTime())) throw new Error(`wayback: invalid date "${iso}"`) + const y = d.getUTCFullYear() + const m = String(d.getUTCMonth() + 1).padStart(2, '0') + const day = String(d.getUTCDate()).padStart(2, '0') + return end ? `${y}${m}${day}235959` : `${y}${m}${day}000000` +} + +/** Convert CDX timestamp "20100101120000" to ISO datetime. */ +export function cdxStampToIso(stamp: string): string { + if (stamp.length < 14) throw new Error(`wayback: malformed CDX timestamp "${stamp}"`) + const y = stamp.slice(0, 4) + const m = stamp.slice(4, 6) + const d = stamp.slice(6, 8) + const hh = stamp.slice(8, 10) + const mm = stamp.slice(10, 12) + const ss = stamp.slice(12, 14) + return `${y}-${m}-${d}T${hh}:${mm}:${ss}Z` +} + +export function snapshotUrl(timestamp: string, original: string): string { + return `https://web.archive.org/web/${timestamp}/${original}` +} + +/** Sample `count` evenly-spaced rows from a sorted list. Always includes first and last. */ +export function sampleEvenly<T>(rows: T[], count: number): T[] { + if (rows.length <= count) return [...rows] + if (count <= 0) return [] + if (count === 1) return [rows[Math.floor((rows.length - 1) / 2)]] + const out: T[] = [] + const step = (rows.length - 1) / (count - 1) + for (let i = 0; i < count; i++) { + out.push(rows[Math.round(i * step)]) + } + return out +} + +/** Parse the CDX JSON response (first row is a header). */ +export function parseCdxRows(json: unknown): CdxRow[] { + if (!Array.isArray(json) || json.length < 2) return [] + // Skip header row.
+ const rows = json.slice(1) as unknown[] + const out: CdxRow[] = [] + for (const r of rows) { + if (!Array.isArray(r) || r.length < 7) continue + out.push({ + urlkey: String(r[0]), + timestamp: String(r[1]), + original: String(r[2]), + mimetype: String(r[3]), + statuscode: String(r[4]), + digest: String(r[5]), + length: String(r[6]), + }) + } + return out +} + +/** Discover N evenly-spaced wayback snapshots for a URL. */ +export async function discoverWaybackSnapshots(url: string, opts: WaybackOptions = {}): Promise<JobTarget[]> { + const fetchImpl = opts.fetch ?? globalThis.fetch + if (!fetchImpl) throw new Error('wayback: no fetch implementation available') + + const count = opts.count ?? DEFAULT_COUNT + const params = new URLSearchParams({ + url, + output: 'json', + limit: String(Math.max(count * 4, 50)), // overcollect, then sample evenly + }) + if (opts.since) params.set('from', isoToCdxStamp(opts.since)) + if (opts.until) params.set('to', isoToCdxStamp(opts.until, true)) + if (opts.status200Only !== false) { + params.set('filter', 'statuscode:200') + } + + const resp = await fetchImpl(`${CDX_ENDPOINT}?${params.toString()}`) + if (!resp.ok) { + throw new Error(`wayback: CDX returned ${resp.status} for ${url}`) + } + const json = await resp.json() as unknown + const rows = parseCdxRows(json) + if (rows.length === 0) return [] + // CDX returns chronological order by default. Sample evenly across. + const sampled = sampleEvenly(rows, count) + return sampled.map(r => ({ + url, + snapshotUrl: snapshotUrl(r.timestamp, r.original), + capturedAt: cdxStampToIso(r.timestamp), + })) +} + +/** Expand a list of seed URLs into wayback targets, in parallel.
*/ +export async function expandWaybackTargets(urls: string[], opts: WaybackOptions = {}): Promise<JobTarget[]> { + const all = await Promise.all(urls.map(u => discoverWaybackSnapshots(u, opts).catch(() => [] as JobTarget[]))) + return all.flat() +} diff --git a/src/jobs/cost-estimate.ts b/src/jobs/cost-estimate.ts new file mode 100644 index 0000000..23f38b5 --- /dev/null +++ b/src/jobs/cost-estimate.ts @@ -0,0 +1,24 @@ +/** + * Pre-flight cost estimate — refuses to silently spend $1k. + * + * The numbers here are deliberately conservative and based on observed + * production telemetry for `claude-code` provider with sonnet at the time of + * writing. If the assumptions drift, only one constant changes. + */ + +import type { CostEstimate, JobSpec } from './types.js' + +/** Average USD per single-page audit. Tuned for sonnet via claude-code provider, 1 audit pass. */ +export const DEFAULT_PER_AUDIT_USD = 0.4 + +export function estimateCost(spec: JobSpec, targetCount: number, perAuditUSD = DEFAULT_PER_AUDIT_USD): CostEstimate { + const pages = Math.max(spec.audit?.pages ?? 1, 1) + const estimatedTotalUSD = targetCount * pages * perAuditUSD + const exceedsCap = typeof spec.maxCostUSD === 'number' && estimatedTotalUSD > spec.maxCostUSD + return { + targetCount, + perAuditUSD: perAuditUSD * pages, + estimatedTotalUSD, + exceedsCap, + } +} diff --git a/src/jobs/index.ts b/src/jobs/index.ts new file mode 100644 index 0000000..ae2bd95 --- /dev/null +++ b/src/jobs/index.ts @@ -0,0 +1,52 @@ +/** + * Jobs — declarative comparative-audit jobs.
+ * + * Public surface: + * createJob(spec) → Job (queued, persisted, ready to run) + * runJob(job, opts) → Job (executes the fan-out, returns final state) + * loadJob / listJobs / saveJob — store accessors + * estimateCost — pre-flight cost guard + */ + +export type { + Job, + JobSpec, + JobStatus, + JobKind, + JobTarget, + JobResultEntry, + JobResultStatus, + DiscoverSpec, + DiscoverSource, + AuditOptions, + CostEstimate, +} from './types.js' + +export { newJobId, saveJob, loadJob, listJobs, appendIndexEntry, jobsDir, jobPath, updateJobStatus } from './store.js' +export type { JobIndexEntry } from './store.js' +export { estimateCost, DEFAULT_PER_AUDIT_USD } from './cost-estimate.js' +export { runJob } from './queue.js' +export type { AuditFn, RunJobOptions } from './queue.js' + +import type { Job, JobSpec, JobTarget } from './types.js' +import { newJobId, saveJob, appendIndexEntry } from './store.js' + +/** + * Mint a fresh `queued` job from a spec. Targets are seeded from the spec + * (snapshot expansion, if any, must be done by the caller via the + * `discover` module before runJob). + */ +export function createJob(spec: JobSpec, targets: JobTarget[], dir?: string): Job { + const job: Job = { + jobId: newJobId(), + spec, + status: 'queued', + createdAt: new Date().toISOString(), + targets, + results: [], + totalCostUSD: 0, + } + saveJob(job, dir) + appendIndexEntry(job, dir) + return job +} diff --git a/src/jobs/queue.ts b/src/jobs/queue.ts new file mode 100644 index 0000000..5633790 --- /dev/null +++ b/src/jobs/queue.ts @@ -0,0 +1,114 @@ +/** + * Job runner — bounded-concurrency fan-out over the audit pipeline. + * + * Synchronous in the sense that `runJob()` returns when every target has been + * processed (ok, failed, or skipped). Crash safety: every individual result + * is persisted to disk as soon as it lands, so killing the process leaves a + * resumable record (resume is a future addition; today the partial state + * is just observable). 
+ * + * `auditFn` is injected so tests can run the queue without touching + * Playwright/LLMs. + */ + +import { saveJob, appendIndexEntry } from './store.js' +import type { Job, JobResultEntry, JobTarget } from './types.js' + +export interface AuditFn { + (target: JobTarget, opts: Job['spec']['audit']): Promise<{ + runId: string + resultPath: string + rollupScore?: number + pageType?: string + costUSD?: number + }> +} + +export interface RunJobOptions { + auditFn: AuditFn + /** Persistence dir override (tests). */ + dir?: string + /** Concurrency override; falls back to spec.concurrency, then 2. */ + concurrency?: number + /** Per-target failure swallower — defaults to recording the error and continuing. */ + onError?: (target: JobTarget, error: Error) => 'continue' | 'abort' +} + +const DEFAULT_CONCURRENCY = 2 + +export async function runJob(job: Job, opts: RunJobOptions): Promise<Job> { + const concurrency = opts.concurrency ?? job.spec.concurrency ?? DEFAULT_CONCURRENCY + job.status = 'running' + job.startedAt = job.startedAt ?? new Date().toISOString() + saveJob(job, opts.dir) + appendIndexEntry(job, opts.dir) + + const queue: JobTarget[] = [...job.targets] + let aborted = false + + async function worker(): Promise<void> { + while (queue.length > 0 && !aborted) { + const target = queue.shift() + if (!target) break + const entry = await runOne(target, job.spec.audit, opts.auditFn, opts.onError) + if (entry === 'abort') { + aborted = true + return + } + job.results.push(entry) + if (entry.costUSD) job.totalCostUSD = round2(job.totalCostUSD + entry.costUSD) + saveJob(job, opts.dir) + } + } + + const workers = Array.from({ length: concurrency }, () => worker()) + await Promise.all(workers) + + job.completedAt = new Date().toISOString() + job.status = aborted ?
'cancelled' : finalStatus(job) + saveJob(job, opts.dir) + appendIndexEntry(job, opts.dir) + return job +} + +async function runOne( + target: JobTarget, + audit: Job['spec']['audit'], + auditFn: AuditFn, + onError?: RunJobOptions['onError'], +): Promise<JobResultEntry | 'abort'> { + try { + const out = await auditFn(target, audit) + return { + ...target, + status: 'ok', + runId: out.runId, + resultPath: out.resultPath, + rollupScore: out.rollupScore, + pageType: out.pageType, + costUSD: out.costUSD, + } + } catch (err) { + const error = err as Error + const decision = onError?.(target, error) ?? 'continue' + if (decision === 'abort') return 'abort' + return { + ...target, + status: 'failed', + error: error.message, + } + } +} + +function finalStatus(job: Job): Job['status'] { + const total = job.results.length + if (total === 0) return 'failed' + const ok = job.results.filter(r => r.status === 'ok').length + if (ok === 0) return 'failed' + if (ok < total) return 'partial' + return 'completed' +} + +function round2(n: number): number { + return Math.round(n * 100) / 100 +} diff --git a/src/jobs/store.ts b/src/jobs/store.ts new file mode 100644 index 0000000..8292238 --- /dev/null +++ b/src/jobs/store.ts @@ -0,0 +1,107 @@ +/** + * Job store — whole-file JSON per job + an append-only index for listing. + * + * Layout: + * ~/.bad/jobs/<jobId>.json — Job record (atomic write via tmp + rename) + * ~/.bad/jobs/index.jsonl — append-only one line per job ({jobId, status, createdAt, label}) + * + * Whole-file rewrite is intentional: a job is a single coherent record with + * monotonically-extending `results[]`. Append-only at the entry level would + * duplicate target rows; append-only at the file level would force readers + * to fold N updates into one logical record. Atomic rename gives us crash + * safety without that complexity.
+ */ + +import * as fs from 'node:fs' +import * as path from 'node:path' +import * as os from 'node:os' +import type { Job, JobStatus } from './types.js' + +const DEFAULT_DIR = path.join(os.homedir(), '.bad', 'jobs') + +export function jobsDir(override?: string): string { + return override ?? DEFAULT_DIR +} + +function ensureDir(dir: string): void { + fs.mkdirSync(dir, { recursive: true }) +} + +export function jobPath(jobId: string, dir?: string): string { + return path.join(jobsDir(dir), `${jobId}.json`) +} + +function indexPath(dir?: string): string { + return path.join(jobsDir(dir), 'index.jsonl') +} + +export function newJobId(now: Date = new Date()): string { + const stamp = now.toISOString().replace(/[-:.]/g, '').slice(0, 15) // YYYYMMDDTHHMMSS + const rand = Math.random().toString(36).slice(2, 8) + return `job_${stamp}_${rand}` +} + +export function saveJob(job: Job, dir?: string): void { + const target = jobPath(job.jobId, dir) + ensureDir(path.dirname(target)) + const tmp = `${target}.tmp` + fs.writeFileSync(tmp, JSON.stringify(job, null, 2)) + fs.renameSync(tmp, target) +} + +export function loadJob(jobId: string, dir?: string): Job | null { + const target = jobPath(jobId, dir) + if (!fs.existsSync(target)) return null + return JSON.parse(fs.readFileSync(target, 'utf-8')) as Job +} + +export function appendIndexEntry(job: Job, dir?: string): void { + const file = indexPath(dir) + ensureDir(path.dirname(file)) + const line = JSON.stringify({ + jobId: job.jobId, + status: job.status, + createdAt: job.createdAt, + label: job.spec.label ?? null, + targetCount: job.targets.length, + }) + fs.appendFileSync(file, line + '\n') +} + +export interface JobIndexEntry { + jobId: string + status: JobStatus + createdAt: string + label: string | null + targetCount: number +} + +/** Reads the index file. Later entries for the same jobId override earlier ones. 
*/ +export function listJobs(dir?: string): JobIndexEntry[] { + const file = indexPath(dir) + if (!fs.existsSync(file)) return [] + const lines = fs.readFileSync(file, 'utf-8').split('\n').filter(Boolean) + const map = new Map() + for (const line of lines) { + try { + const entry = JSON.parse(line) as JobIndexEntry + map.set(entry.jobId, entry) + } catch { + // Skip malformed lines — index is best-effort. + } + } + return Array.from(map.values()).sort((a, b) => b.createdAt.localeCompare(a.createdAt)) +} + +export function updateJobStatus(jobId: string, status: JobStatus, dir?: string): Job | null { + const job = loadJob(jobId, dir) + if (!job) return null + job.status = status + if (status === 'running' && !job.startedAt) job.startedAt = new Date().toISOString() + if (status === 'completed' || status === 'failed' || status === 'cancelled' || status === 'partial') { + job.completedAt = new Date().toISOString() + } + saveJob(job, dir) + appendIndexEntry(job, dir) + return job +} diff --git a/src/jobs/types.ts b/src/jobs/types.ts new file mode 100644 index 0000000..9f02750 --- /dev/null +++ b/src/jobs/types.ts @@ -0,0 +1,109 @@ +/** + * Job orchestration types — RFC-003: comparative-audit jobs. + * + * A job is a declarative spec ("audit these N URLs, optionally with M historical + * snapshots each") that fans out to the existing design-audit pipeline and + * persists the aggregate result so report generation can run later. + * + * Persistence: append-only JSONL at `~/.bad/jobs/.json` (whole-file + * rewrites are fine — jobs are small) plus a one-line index entry at + * `~/.bad/jobs/index.jsonl` for fast listing. + */ + +import type { AudienceTag, ModalityTag, RegulatoryContextTag, AudienceVulnerabilityTag } from '../design/audit/v2/types.js' + +export type JobKind = 'comparative-audit' + +export type JobStatus = 'queued' | 'running' | 'partial' | 'completed' | 'failed' | 'cancelled' + +export type DiscoverSource = 'list' | 'wayback' + +/** What targets to audit. 
`list` = explicit URLs; `wayback` = expand each URL into historical snapshots. */ +export interface DiscoverSpec { + source: DiscoverSource + /** Explicit list of URLs (always required — it's the seed even for wayback). */ + urls: string[] + /** wayback only: how many evenly-spaced snapshots to fetch per URL. Default 5. */ + snapshotsPerUrl?: number + /** wayback only: ISO date lower bound (e.g. "2010-01-01"). */ + since?: string + /** wayback only: ISO date upper bound (e.g. "2026-01-01"). */ + until?: string +} + +/** Pass-through audit options — mirrors the design-audit CLI flags. */ +export interface AuditOptions { + pages?: number + modality?: 'html' | 'ios' | 'android' + audience?: AudienceTag[] + audienceVulnerability?: AudienceVulnerabilityTag[] + modalityTag?: ModalityTag[] + regulatoryContext?: RegulatoryContextTag[] + headless?: boolean + skipEthics?: boolean +} + +export interface JobSpec { + kind: JobKind + discover: DiscoverSpec + audit?: AuditOptions + /** Bounded concurrency for the audit fan-out. Default 2 — Playwright + LLM rate limits cap real-world throughput. */ + concurrency?: number + /** Hard cost cap. The job aborts pre-flight if estimated cost exceeds this. */ + maxCostUSD?: number + /** Free-form label for humans (shows up in `bad jobs list`). */ + label?: string +} + +/** One target = one (url, optional snapshot) pair the auditor will run on. */ +export interface JobTarget { + /** The original seed URL. For non-wayback discovery, this is also the audit URL. */ + url: string + /** For wayback targets: the actual snapshot URL passed to the audit. */ + snapshotUrl?: string + /** For wayback targets: ISO datetime when the snapshot was captured. */ + capturedAt?: string +} + +export type JobResultStatus = 'ok' | 'failed' | 'skipped' + +export interface JobResultEntry extends JobTarget { + status: JobResultStatus + /** runId of the design-audit run when status === 'ok'. 
*/ + runId?: string + /** Path to the run's report.json (relative to cwd or absolute). */ + resultPath?: string + /** Failure reason when status === 'failed' or 'skipped'. */ + error?: string + /** Estimated USD cost of this audit (if available from telemetry). */ + costUSD?: number + /** Roll-up score copied out of the report for fast aggregation queries. */ + rollupScore?: number + /** Page-type classification. */ + pageType?: string +} + +export interface Job { + jobId: string + spec: JobSpec + status: JobStatus + createdAt: string + startedAt?: string + completedAt?: string + /** All discovered targets (length = number of audits the job will / did run). */ + targets: JobTarget[] + /** Per-target outcome. Length matches `targets` once the job has progressed past discovery. */ + results: JobResultEntry[] + /** Sum of `results[*].costUSD` for completed entries. */ + totalCostUSD: number + /** Free-form notes (errors, warnings, telemetry summary). */ + notes?: string[] +} + +export interface CostEstimate { + targetCount: number + estimatedTotalUSD: number + perAuditUSD: number + /** Whether the estimate is above `spec.maxCostUSD`. */ + exceedsCap: boolean +} diff --git a/src/reports/aggregate.ts b/src/reports/aggregate.ts new file mode 100644 index 0000000..dffba82 --- /dev/null +++ b/src/reports/aggregate.ts @@ -0,0 +1,180 @@ +/** + * Deterministic aggregation over a job's per-target audit results. + * + * Every number that shows up in a report flows through here — never through + * an LLM. The narration layer can describe / contextualize / dramatize, but + * counts, scores, deltas, and rankings are all computed here. Same pattern + * we use for the audit patches contract: agent narrates, code computes. 
+ */ + +import * as fs from 'node:fs' +import type { Job, JobResultEntry } from '../jobs/types.js' +import type { Dimension } from '../design/audit/v2/types.js' +import type { AggregateRow, CompareRunsResult, DimensionDelta, LongitudinalRow } from './types.js' + +interface RawReport { + pages?: Array<{ + url?: string + classification?: { type?: string; domain?: string } + auditResultV2?: { + classification?: { type?: string; domain?: string } + rollup?: { score?: number } + scores?: Partial<Record<Dimension, { score?: number }>> + } + rollup?: { score?: number } + designSystemScore?: Partial<Record<Dimension, number>> + ethicsViolations?: unknown[] + score?: number + }> +} + +/** + * Read each ok result's `report.json` from disk and project to AggregateRow. + * Skipped/failed entries become rows with `rollupScore: NaN` so callers can + * filter them — keeping them in the list preserves index alignment with + * `job.results` for tools that want to drill in. + */ +export function aggregateJob(job: Job): AggregateRow[] { + const out: AggregateRow[] = [] + for (const r of job.results) { + out.push(toRow(r)) + } + return out +} + +function toRow(r: JobResultEntry): AggregateRow { + const base: AggregateRow = { + url: r.url, + snapshotUrl: r.snapshotUrl, + capturedAt: r.capturedAt, + runId: r.runId ?? '', + rollupScore: typeof r.rollupScore === 'number' ? r.rollupScore : NaN, + dimensions: {}, + ethicsViolations: 0, + resultPath: r.resultPath, + pageType: r.pageType, + } + if (r.status !== 'ok' || !r.resultPath || !fs.existsSync(r.resultPath)) { + return base + } + try { + const json = JSON.parse(fs.readFileSync(r.resultPath, 'utf-8')) as RawReport + const page = json.pages?.[0] + if (!page) return base + const v2 = page.auditResultV2 + const cls = v2?.classification ?? page.classification ?? {} + base.pageType = base.pageType ??
cls.type + base.domain = cls.domain + if (v2?.rollup?.score !== undefined) base.rollupScore = v2.rollup.score + else if (page.rollup?.score !== undefined) base.rollupScore = page.rollup.score + else if (typeof page.score === 'number') base.rollupScore = page.score + if (v2?.scores) { + for (const [dim, ds] of Object.entries(v2.scores) as [Dimension, { score?: number } | undefined][]) { + if (ds && typeof ds.score === 'number') base.dimensions[dim] = ds.score + } + } + base.ethicsViolations = Array.isArray(page.ethicsViolations) ? page.ethicsViolations.length : 0 + } catch { + // Corrupt or partial report.json — leave as-is. Don't pretend we have data we don't. + } + return base +} + +export interface LeaderboardOptions { + /** Filter by page type (e.g. only saas-app rows). */ + byType?: string + /** Top N entries. Default = no cap. */ + topN?: number + /** Sort direction. Default 'desc' (highest first). */ + direction?: 'asc' | 'desc' +} + +export function leaderboard(rows: AggregateRow[], opts: LeaderboardOptions = {}): AggregateRow[] { + const dir = opts.direction ?? 'desc' + const filtered = rows + .filter(r => Number.isFinite(r.rollupScore)) + .filter(r => !opts.byType || r.pageType === opts.byType) + filtered.sort((a, b) => (dir === 'desc' ? b.rollupScore - a.rollupScore : a.rollupScore - b.rollupScore)) + if (opts.topN && opts.topN > 0) return filtered.slice(0, opts.topN) + return filtered +} + +/** Longitudinal view: all snapshots for one URL, sorted by capture time. 
*/ +export function longitudinalFor(rows: AggregateRow[], url: string): LongitudinalRow[] { + return rows + .filter(r => r.url === url && r.capturedAt && Number.isFinite(r.rollupScore)) + .map(r => ({ + url: r.url, + capturedAt: r.capturedAt!, + rollupScore: r.rollupScore, + pageType: r.pageType, + })) + .sort((a, b) => a.capturedAt.localeCompare(b.capturedAt)) +} + +export function compareRuns(a: AggregateRow, b: AggregateRow): CompareRunsResult { + const dims = new Set([ + ...(Object.keys(a.dimensions) as Dimension[]), + ...(Object.keys(b.dimensions) as Dimension[]), + ]) + const perDimension: DimensionDelta[] = [] + for (const dim of dims) { + const ax = a.dimensions[dim] + const bx = b.dimensions[dim] + if (typeof ax !== 'number' || typeof bx !== 'number') continue + perDimension.push({ dim, beforeScore: bx, afterScore: ax, delta: round2(ax - bx) }) + } + return { + a, + b, + rollupDelta: round2(a.rollupScore - b.rollupScore), + perDimension, + } +} + +/** Group rows into tier buckets for "top 10 vs 100-200" style reports. */ +export interface TierBucket { + label: string + rows: AggregateRow[] + meanScore: number + medianScore: number +} + +export function tierBuckets(rows: AggregateRow[], boundaries: number[]): TierBucket[] { + const ranked = leaderboard(rows) + const buckets: TierBucket[] = [] + const sorted = [...new Set(boundaries)].sort((a, b) => a - b) + let prev = 0 + for (const upper of sorted) { + const slice = ranked.slice(prev, upper) + if (slice.length === 0) { + prev = upper + continue + } + const label = prev === 0 ? 
`top ${upper}` : `${prev + 1}–${upper}` + buckets.push({ label, rows: slice, ...stats(slice) }) + prev = upper + } + if (prev < ranked.length) { + const slice = ranked.slice(prev) + buckets.push({ label: `${prev + 1}+`, rows: slice, ...stats(slice) }) + } + return buckets +} + +function stats(rows: AggregateRow[]): { meanScore: number; medianScore: number } { + if (rows.length === 0) return { meanScore: NaN, medianScore: NaN } + const scores = rows.map(r => r.rollupScore).filter(Number.isFinite) + const sum = scores.reduce((a, b) => a + b, 0) + const sorted = [...scores].sort((a, b) => a - b) + const mid = Math.floor(sorted.length / 2) + const median = sorted.length % 2 === 0 ? (sorted[mid - 1] + sorted[mid]) / 2 : sorted[mid] + return { + meanScore: round2(sum / scores.length), + medianScore: round2(median), + } +} + +function round2(n: number): number { + return Math.round(n * 100) / 100 +} diff --git a/src/reports/index.ts b/src/reports/index.ts new file mode 100644 index 0000000..d85ec50 --- /dev/null +++ b/src/reports/index.ts @@ -0,0 +1,28 @@ +/** + * Reports — turn a job's audit results into shareable artifacts. + * + * Two surfaces: + * 1. Static templates (deterministic markdown rendering — see templates.ts) + * 2. AI SDK tools (agentic chat over the same data — see tools.ts) + * + * Both flow through the same aggregate.ts functions, so numbers are consistent. 
+ */ + +export type { AggregateRow, CompareRunsResult, DimensionDelta, LongitudinalRow, ReportTemplate } from './types.js' +export { aggregateJob, leaderboard, longitudinalFor, compareRuns, tierBuckets } from './aggregate.js' +export type { LeaderboardOptions, TierBucket } from './aggregate.js' +export { + renderLeaderboard, + renderLongitudinal, + renderBatchComparison, + renderJobHeader, +} from './templates.js' +export type { + LeaderboardRenderOpts, + LongitudinalRenderOpts, + BatchComparisonRenderOpts, +} from './templates.js' +export { buildReportTools } from './tools.js' +export type { ReportToolsContext, ReportToolSet } from './tools.js' +export { narrateReport } from './narrate.js' +export type { NarrateOptions } from './narrate.js' diff --git a/src/reports/narrate.ts b/src/reports/narrate.ts new file mode 100644 index 0000000..bd895b3 --- /dev/null +++ b/src/reports/narrate.ts @@ -0,0 +1,47 @@ +/** + * LLM narration around deterministic data. + * + * The contract: numbers come from the templates (which call the aggregate + * functions). The LLM only writes prose context — top-line takeaways, an + * angle on what's surprising. Same pattern as the patches contract. + * + * If a Brain isn't supplied, the deterministic markdown is returned as-is. + * That's the safe default — never silently fabricate prose. + */ + +import type { Brain } from '../brain/index.js' + +const SYSTEM = `You are an analyst writing a one-paragraph executive summary at the top of a design-audit report. + +Rules: +- Use ONLY the numbers and facts in the provided markdown report. Do not invent rankings, scores, or claims. +- Surface the most surprising or load-bearing finding (e.g. a tier-vs-tier gap, a dramatic longitudinal swing, an outlier). +- Two to four sentences. No bullet lists. +- Do not preface with "Here is your summary" or similar.` + +export interface NarrateOptions { + /** When supplied, prepends an LLM-written executive summary above the deterministic body. 
*/ + brain?: Brain + /** Free-form context passed to the LLM (e.g. "this is the YC W25 cohort"). */ + context?: string +} + +export async function narrateReport(deterministicMarkdown: string, opts: NarrateOptions = {}): Promise<string> { + if (!opts.brain) return deterministicMarkdown + const user = [ + opts.context ? `Context: ${opts.context}` : undefined, + 'REPORT:', + deterministicMarkdown, + '', + 'Write the executive summary now.', + ].filter(Boolean).join('\n\n') + try { + const { text } = await opts.brain.complete(SYSTEM, user, { maxOutputTokens: 320 }) + const summary = text.trim() + if (!summary) return deterministicMarkdown + return `## Executive summary\n\n${summary}\n\n${deterministicMarkdown}` + } catch { + // Don't let narration failures block the artifact. Ship the data. + return deterministicMarkdown + } +} diff --git a/src/reports/templates.ts b/src/reports/templates.ts new file mode 100644 index 0000000..1f94a68 --- /dev/null +++ b/src/reports/templates.ts @@ -0,0 +1,163 @@ +/** + * Static report templates — markdown rendering of pre-aggregated data. + * + * No LLM. The narration layer (narrate.ts) wraps these with prose; the + * templates themselves are pure functions of data so they're snapshot-testable + * and deterministic. + */ + +import type { Job } from '../jobs/types.js' +import type { AggregateRow, LongitudinalRow } from './types.js' +import { leaderboard, longitudinalFor, tierBuckets, compareRuns } from './aggregate.js' + +export interface LeaderboardRenderOpts { + title?: string + topN?: number + byType?: string + /** Tier bucket boundaries — e.g. [10, 100, 200] → "top 10", "11–100", "101–200", "201+". */ + buckets?: number[] +} + +export function renderLeaderboard(rows: AggregateRow[], opts: LeaderboardRenderOpts = {}): string { + const lines: string[] = [] + const title = opts.title ?? 
'Design Audit Leaderboard' + lines.push(`# ${title}`) + lines.push('') + lines.push(`Generated: ${new Date().toISOString()}`) + lines.push('') + if (opts.byType) { + lines.push(`Filtered to page-type: \`${opts.byType}\``) + lines.push('') + } + + const ranked = leaderboard(rows, { topN: opts.topN, byType: opts.byType }) + + lines.push(`## Ranked sites (${ranked.length})`) + lines.push('') + lines.push('| # | URL | Page type | Rollup | Top dim | Bottom dim | Ethics |') + lines.push('|---|-----|-----------|--------|---------|------------|--------|') + ranked.forEach((r, i) => { + const dims = Object.entries(r.dimensions) + dims.sort((a, b) => (b[1] as number) - (a[1] as number)) + const top = dims[0] ? `${dims[0][0]} ${(dims[0][1] as number).toFixed(1)}` : '—' + const bot = dims[dims.length - 1] ? `${dims[dims.length - 1][0]} ${(dims[dims.length - 1][1] as number).toFixed(1)}` : '—' + lines.push(`| ${i + 1} | ${escapeMd(r.url)} | ${r.pageType ?? '?'} | ${r.rollupScore.toFixed(2)} | ${top} | ${bot} | ${r.ethicsViolations} |`) + }) + lines.push('') + + if (opts.buckets && opts.buckets.length > 0) { + const buckets = tierBuckets(rows, opts.buckets) + lines.push('## Tiers') + lines.push('') + lines.push('| Tier | N | Mean | Median |') + lines.push('|------|---|------|--------|') + for (const b of buckets) { + lines.push(`| ${b.label} | ${b.rows.length} | ${fmt(b.meanScore)} | ${fmt(b.medianScore)} |`) + } + lines.push('') + } + + return lines.join('\n') +} + +export interface LongitudinalRenderOpts { + title?: string + /** If multiple URLs are in scope, render one section per URL. */ + urls?: string[] +} + +export function renderLongitudinal(rows: AggregateRow[], opts: LongitudinalRenderOpts = {}): string { + const lines: string[] = [] + lines.push(`# ${opts.title ?? 'Longitudinal Design Audit'}`) + lines.push('') + lines.push(`Generated: ${new Date().toISOString()}`) + lines.push('') + + const urls = opts.urls ?? 
Array.from(new Set(rows.map(r => r.url))) + for (const url of urls) { + const series = longitudinalFor(rows, url) + if (series.length === 0) continue + lines.push(`## ${escapeMd(url)}`) + lines.push('') + lines.push('| Captured | Rollup | Page type |') + lines.push('|----------|--------|-----------|') + for (const s of series) { + lines.push(`| ${s.capturedAt.slice(0, 10)} | ${s.rollupScore.toFixed(2)} | ${s.pageType ?? '?'} |`) + } + const first = series[0] + const last = series[series.length - 1] + if (first && last && first !== last) { + const delta = last.rollupScore - first.rollupScore + const sign = delta >= 0 ? '+' : '' + lines.push('') + lines.push(`Net change ${first.capturedAt.slice(0, 10)} → ${last.capturedAt.slice(0, 10)}: **${sign}${delta.toFixed(2)}**`) + } + lines.push('') + } + return lines.join('\n') +} + +export interface BatchComparisonRenderOpts { + title?: string + /** Pairs of (a, b) URLs to diff. If omitted, the first two URLs encountered are paired. */ + pairs?: Array<[string, string]> +} + +export function renderBatchComparison(rows: AggregateRow[], opts: BatchComparisonRenderOpts = {}): string { + const lines: string[] = [] + lines.push(`# ${opts.title ?? 'Batch Comparison'}`) + lines.push('') + lines.push(`Generated: ${new Date().toISOString()}`) + lines.push('') + + let pairs = opts.pairs ?? 
[] + if (pairs.length === 0) { + const urls = Array.from(new Set(rows.map(r => r.url))) + if (urls.length >= 2) pairs = [[urls[0], urls[1]]] + } + for (const [aUrl, bUrl] of pairs) { + const a = rows.find(r => r.url === aUrl && Number.isFinite(r.rollupScore)) + const b = rows.find(r => r.url === bUrl && Number.isFinite(r.rollupScore)) + if (!a || !b) continue + const cmp = compareRuns(a, b) + lines.push(`## ${escapeMd(aUrl)} vs ${escapeMd(bUrl)}`) + lines.push('') + lines.push(`Rollup delta: **${signed(cmp.rollupDelta)}** (a ${a.rollupScore.toFixed(2)} – b ${b.rollupScore.toFixed(2)})`) + lines.push('') + if (cmp.perDimension.length > 0) { + lines.push('| Dimension | a | b | Δ |') + lines.push('|-----------|---|---|---|') + for (const d of cmp.perDimension) { + lines.push(`| ${d.dim} | ${d.afterScore.toFixed(1)} | ${d.beforeScore.toFixed(1)} | ${signed(d.delta)} |`) + } + lines.push('') + } + } + return lines.join('\n') +} + +export function renderJobHeader(job: Job): string { + const ok = job.results.filter(r => r.status === 'ok').length + const fail = job.results.filter(r => r.status === 'failed').length + const skip = job.results.filter(r => r.status === 'skipped').length + return [ + `**Job**: \`${job.jobId}\``, + job.spec.label ? `**Label**: ${job.spec.label}` : undefined, + `**Targets**: ${job.targets.length} · ok: ${ok} · failed: ${fail} · skipped: ${skip}`, + `**Cost**: $${job.totalCostUSD.toFixed(2)}`, + `**Status**: ${job.status}`, + ].filter(Boolean).join(' \n') +} + +function escapeMd(s: string): string { + return s.replace(/\|/g, '\\|') +} + +function signed(n: number): string { + if (!Number.isFinite(n)) return '—' + return (n >= 0 ? '+' : '') + n.toFixed(2) +} + +function fmt(n: number): string { + return Number.isFinite(n) ? 
n.toFixed(2) : '—' +} diff --git a/src/reports/tools.ts b/src/reports/tools.ts new file mode 100644 index 0000000..ad1f86b --- /dev/null +++ b/src/reports/tools.ts @@ -0,0 +1,173 @@ +/** + * AI SDK tool surface for the report agent. + * + * These tools let an LLM agent (browser-side chat, Claude Code, anywhere + * AI SDK runs) interrogate a job's audit results. The contract is strict: + * every numerical claim the agent makes must come from a tool result, never + * from its own arithmetic. Templates are deterministic; the agent narrates. + * + * The tools use plain JSONSchema (`jsonSchema` from `ai`) rather than zod so + * we don't add a dependency. Schemas are intentionally minimal — agents prefer + * concise tool surfaces. + */ + +import { tool, jsonSchema } from 'ai' +import * as fs from 'node:fs' +import { loadJob } from '../jobs/store.js' +import { aggregateJob, leaderboard, longitudinalFor, compareRuns, tierBuckets } from './aggregate.js' +import { renderLeaderboard, renderLongitudinal, renderBatchComparison } from './templates.js' +import type { AggregateRow } from './types.js' + +export interface ReportToolsContext { + /** Override jobs dir (tests). */ + jobsDir?: string + /** Override the resolver for `runFreshAudit` so tests/CLIs can plug in their own pipeline. */ + runFreshAudit?: (url: string) => Promise<{ runId: string; resultPath: string; rollupScore?: number }> +} + +function rowsForJob(jobId: string, jobsDir?: string): AggregateRow[] { + const job = loadJob(jobId, jobsDir) + if (!job) throw new Error(`job not found: ${jobId}`) + return aggregateJob(job) +} + +export function buildReportTools(ctx: ReportToolsContext = {}) { + return { + queryJob: tool({ + description: 'Return aggregated rows for every audited target in a job (filtered/ranked). 
Use this for any leaderboard, ranking, or scope query.', + inputSchema: jsonSchema<{ jobId: string; byType?: string; topN?: number; direction?: 'asc' | 'desc' }>({ + type: 'object', + properties: { + jobId: { type: 'string' }, + byType: { type: 'string', description: 'Filter to one page-type (saas-app, marketing, dashboard, ecommerce, ...).' }, + topN: { type: 'integer', minimum: 1 }, + direction: { type: 'string', enum: ['asc', 'desc'], description: 'Default desc (highest scores first).' }, + }, + required: ['jobId'], + }), + execute: async ({ jobId, byType, topN, direction }) => { + const rows = rowsForJob(jobId, ctx.jobsDir) + return leaderboard(rows, { byType, topN, direction }) + }, + }), + + fetchAudit: tool({ + description: 'Fetch the full report.json for a single audit run by runId. Use sparingly — only when the agent needs finding-level detail beyond aggregated scores.', + inputSchema: jsonSchema<{ jobId: string; runId: string }>({ + type: 'object', + properties: { + jobId: { type: 'string' }, + runId: { type: 'string' }, + }, + required: ['jobId', 'runId'], + }), + execute: async ({ jobId, runId }) => { + const job = loadJob(jobId, ctx.jobsDir) + if (!job) throw new Error(`job not found: ${jobId}`) + const entry = job.results.find(r => r.runId === runId) + if (!entry || !entry.resultPath || !fs.existsSync(entry.resultPath)) { + throw new Error(`runId not found or report.json missing: ${runId}`) + } + return JSON.parse(fs.readFileSync(entry.resultPath, 'utf-8')) + }, + }), + + compareRuns: tool({ + description: 'Compute a deterministic dimension-by-dimension diff between two audited runs in the same job. 
Returns rollupDelta and per-dimension deltas.', + inputSchema: jsonSchema<{ jobId: string; runIdA: string; runIdB: string }>({ + type: 'object', + properties: { + jobId: { type: 'string' }, + runIdA: { type: 'string' }, + runIdB: { type: 'string' }, + }, + required: ['jobId', 'runIdA', 'runIdB'], + }), + execute: async ({ jobId, runIdA, runIdB }) => { + const rows = rowsForJob(jobId, ctx.jobsDir) + const a = rows.find(r => r.runId === runIdA) + const b = rows.find(r => r.runId === runIdB) + if (!a || !b) throw new Error(`runId not found in job: ${!a ? runIdA : runIdB}`) + return compareRuns(a, b) + }, + }), + + longitudinal: tool({ + description: 'For wayback-expanded jobs, return the time series of scores for one URL (sorted oldest → newest). Use this for "how has X evolved" questions.', + inputSchema: jsonSchema<{ jobId: string; url: string }>({ + type: 'object', + properties: { + jobId: { type: 'string' }, + url: { type: 'string' }, + }, + required: ['jobId', 'url'], + }), + execute: async ({ jobId, url }) => { + const rows = rowsForJob(jobId, ctx.jobsDir) + return longitudinalFor(rows, url) + }, + }), + + tierBuckets: tool({ + description: 'Bucket a job\'s ranked results into tier slices (e.g. boundaries [10, 100, 200] → top 10 / 11–100 / 101–200 / 201+).', + inputSchema: jsonSchema<{ jobId: string; boundaries: number[] }>({ + type: 'object', + properties: { + jobId: { type: 'string' }, + boundaries: { type: 'array', items: { type: 'integer', minimum: 1 } }, + }, + required: ['jobId', 'boundaries'], + }), + execute: async ({ jobId, boundaries }) => { + const rows = rowsForJob(jobId, ctx.jobsDir) + return tierBuckets(rows, boundaries) + }, + }), + + renderTemplate: tool({ + description: 'Render a deterministic markdown report from a job. 
Use this when the user wants a shareable artifact, not a free-form answer.', + inputSchema: jsonSchema<{ + jobId: string + template: 'leaderboard' | 'longitudinal' | 'batch-comparison' + title?: string + topN?: number + byType?: string + buckets?: number[] + }>({ + type: 'object', + properties: { + jobId: { type: 'string' }, + template: { type: 'string', enum: ['leaderboard', 'longitudinal', 'batch-comparison'] }, + title: { type: 'string' }, + topN: { type: 'integer', minimum: 1 }, + byType: { type: 'string' }, + buckets: { type: 'array', items: { type: 'integer', minimum: 1 } }, + }, + required: ['jobId', 'template'], + }), + execute: async ({ jobId, template, title, topN, byType, buckets }) => { + const rows = rowsForJob(jobId, ctx.jobsDir) + if (template === 'leaderboard') return { markdown: renderLeaderboard(rows, { title, topN, byType, buckets }) } + if (template === 'longitudinal') return { markdown: renderLongitudinal(rows, { title }) } + return { markdown: renderBatchComparison(rows, { title }) } + }, + }), + + runFreshAudit: tool({ + description: 'Kick off a NEW single-page audit when the agent needs current data not in the job. Cost-bearing. Use sparingly.', + inputSchema: jsonSchema<{ url: string }>({ + type: 'object', + properties: { url: { type: 'string' } }, + required: ['url'], + }), + execute: async ({ url }) => { + if (!ctx.runFreshAudit) { + throw new Error('runFreshAudit not wired in this context — host must inject a resolver') + } + return await ctx.runFreshAudit(url) + }, + }), + } as const +} + +export type ReportToolSet = ReturnType<typeof buildReportTools> diff --git a/src/reports/types.ts b/src/reports/types.ts new file mode 100644 index 0000000..3465aac --- /dev/null +++ b/src/reports/types.ts @@ -0,0 +1,56 @@ +/** + * Report shapes — flat rows that templates and tool calls share. + * + * `AggregateRow` is intentionally narrow (just the fields a leaderboard / + * comparison / longitudinal view needs). 
Anything richer should be loaded + * from `resultPath` on demand via `fetchAudit`. + */ + +import type { Dimension } from '../design/audit/v2/types.js' + +export interface AggregateRow { + /** Seed URL (groups multiple snapshots of the same site together). */ + url: string + /** Snapshot URL (only set for wayback rows). */ + snapshotUrl?: string + /** ISO datetime of capture (only set for wayback rows). */ + capturedAt?: string + /** runId of the source audit. */ + runId: string + /** Page-type classification from the audit. */ + pageType?: string + /** Domain tag (e.g. "fintech", "health") from classification. */ + domain?: string + /** Rollup score (0-10). */ + rollupScore: number + /** Per-dimension scores (subset — only the v2 universal dimensions). */ + dimensions: Partial<Record<Dimension, number>> + /** Number of ethics violations detected. */ + ethicsViolations: number + /** Path to the per-run report.json for drill-down. */ + resultPath?: string +} + +export interface DimensionDelta { + dim: Dimension + beforeScore: number + afterScore: number + delta: number +} + +export interface CompareRunsResult { + a: AggregateRow + b: AggregateRow + rollupDelta: number + /** Negative = a worse than b, positive = a better. 
*/ + perDimension: DimensionDelta[] +} + +export interface LongitudinalRow { + url: string + capturedAt: string + rollupScore: number + pageType?: string +} + +export type ReportTemplate = 'leaderboard' | 'longitudinal' | 'batch-comparison' diff --git a/tests/discover-wayback.test.ts b/tests/discover-wayback.test.ts new file mode 100644 index 0000000..c3e2efc --- /dev/null +++ b/tests/discover-wayback.test.ts @@ -0,0 +1,122 @@ +import { describe, it, expect } from 'vitest' +import { + parseCdxRows, + sampleEvenly, + cdxStampToIso, + snapshotUrl, + discoverWaybackSnapshots, +} from '../src/discover/wayback.js' +import { discoverTargets } from '../src/discover/index.js' + +describe('parseCdxRows', () => { + it('skips the header row and parses each capture', () => { + const json = [ + ['urlkey', 'timestamp', 'original', 'mimetype', 'statuscode', 'digest', 'length'], + ['com,stripe)/', '20100101120000', 'https://stripe.com/', 'text/html', '200', 'a', '1234'], + ['com,stripe)/', '20200101120000', 'https://stripe.com/', 'text/html', '200', 'b', '4567'], + ] + const rows = parseCdxRows(json) + expect(rows).toHaveLength(2) + expect(rows[0].timestamp).toBe('20100101120000') + expect(rows[1].timestamp).toBe('20200101120000') + }) + + it('returns empty for malformed input', () => { + expect(parseCdxRows([])).toEqual([]) + expect(parseCdxRows(null)).toEqual([]) + expect(parseCdxRows({ x: 1 })).toEqual([]) + }) + + it('skips rows with too few fields', () => { + const json = [ + ['urlkey', 'timestamp', 'original', 'mimetype', 'statuscode', 'digest', 'length'], + ['com,broken)/'], // bad row + ['com,ok)/', '20240101000000', 'https://ok/', 'text/html', '200', 'd', '1'], + ] + expect(parseCdxRows(json)).toHaveLength(1) + }) +}) + +describe('sampleEvenly', () => { + it('returns all rows when count >= length', () => { + expect(sampleEvenly([1, 2, 3], 5)).toEqual([1, 2, 3]) + }) + + it('returns the middle element when count === 1', () => { + expect(sampleEvenly([1, 2, 3, 4, 5], 
1)).toEqual([3]) + }) + + it('always includes first and last', () => { + const rows = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100] + const sample = sampleEvenly(rows, 4) + expect(sample[0]).toBe(10) + expect(sample[sample.length - 1]).toBe(100) + expect(sample).toHaveLength(4) + }) + + it('returns [] for count <= 0', () => { + expect(sampleEvenly([1, 2, 3], 0)).toEqual([]) + }) +}) + +describe('cdxStampToIso', () => { + it('formats correctly', () => { + expect(cdxStampToIso('20100101120000')).toBe('2010-01-01T12:00:00Z') + }) + it('throws on malformed input', () => { + expect(() => cdxStampToIso('badstamp')).toThrow() + }) +}) + +describe('snapshotUrl', () => { + it('produces a wayback URL', () => { + const url = snapshotUrl('20100101120000', 'https://stripe.com/') + expect(url).toBe('https://web.archive.org/web/20100101120000/https://stripe.com/') + }) +}) + +describe('discoverWaybackSnapshots', () => { + it('fetches CDX, samples, and returns JobTargets', async () => { + const fakeRows = [ + ['urlkey', 'timestamp', 'original', 'mimetype', 'statuscode', 'digest', 'length'], + ['com,stripe)/', '20100101120000', 'https://stripe.com/', 'text/html', '200', 'a', '1'], + ['com,stripe)/', '20140101120000', 'https://stripe.com/', 'text/html', '200', 'b', '1'], + ['com,stripe)/', '20180101120000', 'https://stripe.com/', 'text/html', '200', 'c', '1'], + ['com,stripe)/', '20220101120000', 'https://stripe.com/', 'text/html', '200', 'd', '1'], + ['com,stripe)/', '20260101120000', 'https://stripe.com/', 'text/html', '200', 'e', '1'], + ] + const fetchImpl = (async () => ({ + ok: true, + status: 200, + json: async () => fakeRows, + })) as unknown as typeof fetch + const targets = await discoverWaybackSnapshots('https://stripe.com/', { count: 3, fetch: fetchImpl }) + expect(targets).toHaveLength(3) + expect(targets[0].url).toBe('https://stripe.com/') + expect(targets[0].snapshotUrl).toContain('web.archive.org') + expect(targets[0].capturedAt).toBe('2010-01-01T12:00:00Z') + 
expect(targets[2].capturedAt).toBe('2026-01-01T12:00:00Z') + }) + + it('throws on non-OK CDX response', async () => { + const fetchImpl = (async () => ({ ok: false, status: 503 })) as unknown as typeof fetch + await expect(discoverWaybackSnapshots('https://x/', { fetch: fetchImpl })).rejects.toThrow(/CDX returned 503/) + }) + + it('returns [] when CDX yields no rows', async () => { + const fetchImpl = (async () => ({ ok: true, status: 200, json: async () => [['header']] })) as unknown as typeof fetch + const out = await discoverWaybackSnapshots('https://x/', { fetch: fetchImpl }) + expect(out).toEqual([]) + }) +}) + +describe('discoverTargets', () => { + it('list source returns one target per URL', async () => { + const targets = await discoverTargets({ source: 'list', urls: ['https://a/', 'https://b/'] }) + expect(targets).toEqual([{ url: 'https://a/' }, { url: 'https://b/' }]) + }) + + it('rejects an unknown source', async () => { + await expect(discoverTargets({ source: 'unknown' as 'list', urls: [] })).rejects.toThrow(/unsupported/) + }) +}) diff --git a/tests/jobs-cost-estimate.test.ts b/tests/jobs-cost-estimate.test.ts new file mode 100644 index 0000000..3baf370 --- /dev/null +++ b/tests/jobs-cost-estimate.test.ts @@ -0,0 +1,30 @@ +import { describe, it, expect } from 'vitest' +import { estimateCost, DEFAULT_PER_AUDIT_USD } from '../src/jobs/cost-estimate.js' +import type { JobSpec } from '../src/jobs/types.js' + +const SPEC: JobSpec = { + kind: 'comparative-audit', + discover: { source: 'list', urls: [] }, +} + +describe('estimateCost', () => { + it('multiplies targets by per-audit cost', () => { + const est = estimateCost(SPEC, 100) + expect(est.estimatedTotalUSD).toBeCloseTo(100 * DEFAULT_PER_AUDIT_USD) + }) + + it('multiplies by pages', () => { + const est = estimateCost({ ...SPEC, audit: { pages: 3 } }, 10) + expect(est.estimatedTotalUSD).toBeCloseTo(10 * 3 * DEFAULT_PER_AUDIT_USD) + }) + + it('flips exceedsCap when above maxCostUSD', () => { + 
expect(estimateCost({ ...SPEC, maxCostUSD: 5 }, 100).exceedsCap).toBe(true) + expect(estimateCost({ ...SPEC, maxCostUSD: 1000 }, 100).exceedsCap).toBe(false) + }) + + it('honors a custom per-audit cost', () => { + const est = estimateCost(SPEC, 50, 0.1) + expect(est.estimatedTotalUSD).toBeCloseTo(5) + }) +}) diff --git a/tests/jobs-queue.test.ts b/tests/jobs-queue.test.ts new file mode 100644 index 0000000..d7a9dac --- /dev/null +++ b/tests/jobs-queue.test.ts @@ -0,0 +1,95 @@ +import { describe, it, expect, afterEach } from 'vitest' +import { mkdtempSync, rmSync } from 'node:fs' +import { tmpdir } from 'node:os' +import { join } from 'node:path' +import { runJob } from '../src/jobs/queue.js' +import { saveJob, appendIndexEntry, loadJob } from '../src/jobs/store.js' +import { createJob } from '../src/jobs/index.js' +import type { Job, JobSpec, AuditFn } from '../src/jobs/index.js' + +const SPEC: JobSpec = { + kind: 'comparative-audit', + discover: { source: 'list', urls: ['https://a.test', 'https://b.test', 'https://c.test'] }, + concurrency: 2, +} + +describe('runJob', () => { + let dir: string + afterEach(() => { if (dir) rmSync(dir, { recursive: true, force: true }) }) + + it('runs every target and marks the job completed when all succeed', async () => { + dir = mkdtempSync(join(tmpdir(), 'bad-q-')) + const job = createJob(SPEC, SPEC.discover.urls.map(url => ({ url })), dir) + const auditFn: AuditFn = async (target) => ({ + runId: `run-${target.url}`, + resultPath: `/tmp/${target.url}/report.json`, + rollupScore: 7, + pageType: 'saas-app', + costUSD: 0.4, + }) + const final = await runJob(job, { auditFn, dir }) + expect(final.status).toBe('completed') + expect(final.results).toHaveLength(3) + expect(final.results.every(r => r.status === 'ok')).toBe(true) + expect(final.totalCostUSD).toBeCloseTo(1.2) + }) + + it('marks the job partial when some targets fail', async () => { + dir = mkdtempSync(join(tmpdir(), 'bad-q-')) + const job = createJob(SPEC, 
SPEC.discover.urls.map(url => ({ url })), dir) + let i = 0 + const auditFn: AuditFn = async (target) => { + i += 1 + if (i === 2) throw new Error('synthetic failure') + return { runId: `run-${i}`, resultPath: '/tmp/x/report.json', rollupScore: 6, costUSD: 0.4 } + } + const final = await runJob(job, { auditFn, dir }) + expect(final.status).toBe('partial') + expect(final.results.filter(r => r.status === 'ok')).toHaveLength(2) + expect(final.results.filter(r => r.status === 'failed')).toHaveLength(1) + }) + + it('marks the job failed when every target fails', async () => { + dir = mkdtempSync(join(tmpdir(), 'bad-q-')) + const job = createJob(SPEC, SPEC.discover.urls.map(url => ({ url })), dir) + const auditFn: AuditFn = async () => { throw new Error('always') } + const final = await runJob(job, { auditFn, dir }) + expect(final.status).toBe('failed') + }) + + it('persists each result as it lands (crash-safe)', async () => { + dir = mkdtempSync(join(tmpdir(), 'bad-q-')) + const job = createJob(SPEC, SPEC.discover.urls.map(url => ({ url })), dir) + const seen: number[] = [] + const auditFn: AuditFn = async (target) => { + // After each result, the on-disk job should reflect the new entry. + const persisted = loadJob(job.jobId, dir)! 
+ seen.push(persisted.results.length) + return { runId: target.url, resultPath: `/tmp/${target.url}/report.json`, rollupScore: 5 } + } + await runJob(job, { auditFn, dir, concurrency: 1 }) + expect(seen).toEqual([0, 1, 2]) + }) + + it('respects onError abort', async () => { + dir = mkdtempSync(join(tmpdir(), 'bad-q-')) + const job = createJob(SPEC, SPEC.discover.urls.map(url => ({ url })), dir) + const auditFn: AuditFn = async () => { throw new Error('boom') } + const final = await runJob(job, { auditFn, dir, concurrency: 1, onError: () => 'abort' }) + expect(final.status).toBe('cancelled') + expect(final.results.length).toBeLessThan(3) + }) +}) + +describe('createJob', () => { + let dir: string + afterEach(() => { if (dir) rmSync(dir, { recursive: true, force: true }) }) + + it('mints a queued job and writes index entry', () => { + dir = mkdtempSync(join(tmpdir(), 'bad-q-')) + const job = createJob(SPEC, [{ url: 'https://a.test' }], dir) + expect(job.status).toBe('queued') + const reload = loadJob(job.jobId, dir) + expect(reload?.targets).toHaveLength(1) + }) +}) diff --git a/tests/jobs-store.test.ts b/tests/jobs-store.test.ts new file mode 100644 index 0000000..5ba315e --- /dev/null +++ b/tests/jobs-store.test.ts @@ -0,0 +1,89 @@ +import { describe, it, expect, afterEach } from 'vitest' +import { mkdtempSync, rmSync, existsSync } from 'node:fs' +import { tmpdir } from 'node:os' +import { join } from 'node:path' +import { newJobId, saveJob, loadJob, listJobs, appendIndexEntry, updateJobStatus } from '../src/jobs/store.js' +import type { Job } from '../src/jobs/types.js' + +function makeJob(overrides: Partial<Job> = {}): Job { + return { + jobId: newJobId(), + spec: { + kind: 'comparative-audit', + discover: { source: 'list', urls: ['https://a.test', 'https://b.test'] }, + }, + status: 'queued', + createdAt: new Date().toISOString(), + targets: [{ url: 'https://a.test' }, { url: 'https://b.test' }], + results: [], + totalCostUSD: 0, + ...overrides, + } +} + 
+describe('jobs store', () => { + let dir: string + afterEach(() => { + if (dir) rmSync(dir, { recursive: true, force: true }) + }) + + it('round-trips a job to disk', () => { + dir = mkdtempSync(join(tmpdir(), 'bad-jobs-')) + const job = makeJob() + saveJob(job, dir) + const loaded = loadJob(job.jobId, dir) + expect(loaded).not.toBeNull() + expect(loaded!.jobId).toBe(job.jobId) + expect(loaded!.targets).toHaveLength(2) + }) + + it('returns null for an unknown job', () => { + dir = mkdtempSync(join(tmpdir(), 'bad-jobs-')) + expect(loadJob('does-not-exist', dir)).toBeNull() + }) + + it('writes atomically (no .tmp file lingers)', () => { + dir = mkdtempSync(join(tmpdir(), 'bad-jobs-')) + const job = makeJob() + saveJob(job, dir) + expect(existsSync(join(dir, `${job.jobId}.json`))).toBe(true) + expect(existsSync(join(dir, `${job.jobId}.json.tmp`))).toBe(false) + }) + + it('lists jobs in newest-first order', async () => { + dir = mkdtempSync(join(tmpdir(), 'bad-jobs-')) + const a = makeJob({ createdAt: '2026-01-01T00:00:00.000Z' }) + const b = makeJob({ createdAt: '2026-02-01T00:00:00.000Z' }) + saveJob(a, dir); appendIndexEntry(a, dir) + saveJob(b, dir); appendIndexEntry(b, dir) + const list = listJobs(dir) + expect(list[0].jobId).toBe(b.jobId) + expect(list[1].jobId).toBe(a.jobId) + }) + + it('dedupes index entries by jobId, keeping the latest status', () => { + dir = mkdtempSync(join(tmpdir(), 'bad-jobs-')) + const j = makeJob() + saveJob(j, dir); appendIndexEntry(j, dir) + j.status = 'completed' + saveJob(j, dir); appendIndexEntry(j, dir) + const list = listJobs(dir) + expect(list).toHaveLength(1) + expect(list[0].status).toBe('completed') + }) + + it('updateJobStatus sets timestamps and persists', () => { + dir = mkdtempSync(join(tmpdir(), 'bad-jobs-')) + const j = makeJob() + saveJob(j, dir); appendIndexEntry(j, dir) + const running = updateJobStatus(j.jobId, 'running', dir) + expect(running?.startedAt).toBeDefined() + const done = updateJobStatus(j.jobId, 
'completed', dir) + expect(done?.completedAt).toBeDefined() + }) + + it('newJobId is unique across rapid invocations', () => { + const ids = new Set(Array.from({ length: 50 }, () => newJobId())) + expect(ids.size).toBe(50) + }) +}) diff --git a/tests/reports-aggregate.test.ts b/tests/reports-aggregate.test.ts new file mode 100644 index 0000000..bb76087 --- /dev/null +++ b/tests/reports-aggregate.test.ts @@ -0,0 +1,142 @@ +import { describe, it, expect, afterEach } from 'vitest' +import { mkdtempSync, rmSync, writeFileSync, mkdirSync } from 'node:fs' +import { tmpdir } from 'node:os' +import { join } from 'node:path' +import { aggregateJob, leaderboard, longitudinalFor, compareRuns, tierBuckets } from '../src/reports/aggregate.js' +import type { Job } from '../src/jobs/types.js' + +function writeReport(dir: string, runId: string, payload: object): string { + const runDir = join(dir, runId) + mkdirSync(runDir, { recursive: true }) + const file = join(runDir, 'report.json') + writeFileSync(file, JSON.stringify(payload)) + return file +} + +function makeJob(results: Job['results']): Job { + return { + jobId: 'test-job', + spec: { kind: 'comparative-audit', discover: { source: 'list', urls: results.map(r => r.url) } }, + status: 'completed', + createdAt: new Date().toISOString(), + targets: results.map(r => ({ url: r.url, snapshotUrl: r.snapshotUrl, capturedAt: r.capturedAt })), + results, + totalCostUSD: 0, + } +} + +describe('aggregateJob', () => { + let dir: string + afterEach(() => { if (dir) rmSync(dir, { recursive: true, force: true }) }) + + it('reads each ok result\'s report.json and projects to AggregateRow', () => { + dir = mkdtempSync(join(tmpdir(), 'bad-agg-')) + const a = writeReport(dir, 'run-a', { + pages: [{ + auditResultV2: { + classification: { type: 'saas-app', domain: 'fintech' }, + rollup: { score: 7.5 }, + scores: { product_intent: { score: 8 }, visual_craft: { score: 7 } }, + }, + ethicsViolations: [], + }], + }) + const job = makeJob([ + { url: 
'https://a/', status: 'ok', runId: 'run-a', resultPath: a, rollupScore: 7.5, pageType: 'saas-app' }, + { url: 'https://b/', status: 'failed', error: 'boom' }, + ]) + const rows = aggregateJob(job) + expect(rows).toHaveLength(2) + expect(rows[0].rollupScore).toBe(7.5) + expect(rows[0].pageType).toBe('saas-app') + expect(rows[0].domain).toBe('fintech') + expect(rows[0].dimensions.product_intent).toBe(8) + expect(Number.isNaN(rows[1].rollupScore)).toBe(true) + }) + + it('falls back to v1 fields when auditResultV2 is missing', () => { + dir = mkdtempSync(join(tmpdir(), 'bad-agg-')) + const a = writeReport(dir, 'run-a', { + pages: [{ score: 6.2, classification: { type: 'marketing' } }], + }) + const job = makeJob([{ url: 'https://a/', status: 'ok', runId: 'run-a', resultPath: a, rollupScore: 6.2 }]) + const rows = aggregateJob(job) + expect(rows[0].rollupScore).toBe(6.2) + expect(rows[0].pageType).toBe('marketing') + }) + + it('does not crash when report.json is missing on disk', () => { + const job = makeJob([{ url: 'https://gone/', status: 'ok', runId: 'run-x', resultPath: '/nope/report.json', rollupScore: 4 }]) + const rows = aggregateJob(job) + expect(rows).toHaveLength(1) + // resultPath missing → row has the JobResultEntry-level rollupScore but no v2 enrichment. 
+ expect(rows[0].rollupScore).toBe(4) + }) +}) + +describe('leaderboard', () => { + it('sorts desc by rollupScore and applies topN', () => { + const rows = [ + { url: 'a', runId: '1', rollupScore: 5, dimensions: {}, ethicsViolations: 0 }, + { url: 'b', runId: '2', rollupScore: 9, dimensions: {}, ethicsViolations: 0 }, + { url: 'c', runId: '3', rollupScore: 7, dimensions: {}, ethicsViolations: 0 }, + ] + const top2 = leaderboard(rows, { topN: 2 }) + expect(top2.map(r => r.url)).toEqual(['b', 'c']) + }) + + it('filters by pageType', () => { + const rows = [ + { url: 'a', runId: '1', rollupScore: 9, dimensions: {}, ethicsViolations: 0, pageType: 'saas-app' }, + { url: 'b', runId: '2', rollupScore: 8, dimensions: {}, ethicsViolations: 0, pageType: 'marketing' }, + ] + const filtered = leaderboard(rows, { byType: 'saas-app' }) + expect(filtered).toHaveLength(1) + expect(filtered[0].url).toBe('a') + }) + + it('drops NaN rollups', () => { + const rows = [ + { url: 'a', runId: '1', rollupScore: NaN, dimensions: {}, ethicsViolations: 0 }, + { url: 'b', runId: '2', rollupScore: 7, dimensions: {}, ethicsViolations: 0 }, + ] + expect(leaderboard(rows)).toHaveLength(1) + }) +}) + +describe('longitudinalFor', () => { + it('returns one entry per snapshot of the URL, sorted by capturedAt', () => { + const rows = [ + { url: 'https://x/', runId: '1', rollupScore: 4, capturedAt: '2020-01-01T00:00:00Z', dimensions: {}, ethicsViolations: 0 }, + { url: 'https://x/', runId: '2', rollupScore: 7, capturedAt: '2010-01-01T00:00:00Z', dimensions: {}, ethicsViolations: 0 }, + { url: 'https://y/', runId: '3', rollupScore: 9, capturedAt: '2024-01-01T00:00:00Z', dimensions: {}, ethicsViolations: 0 }, + ] + const series = longitudinalFor(rows, 'https://x/') + expect(series.map(s => s.capturedAt)).toEqual(['2010-01-01T00:00:00Z', '2020-01-01T00:00:00Z']) + }) +}) + +describe('compareRuns', () => { + it('produces dimension deltas', () => { + const a = { url: 'a', runId: '1', rollupScore: 8, 
dimensions: { product_intent: 8, visual_craft: 7 }, ethicsViolations: 0 } + const b = { url: 'b', runId: '2', rollupScore: 6, dimensions: { product_intent: 6, visual_craft: 5 }, ethicsViolations: 0 } + const cmp = compareRuns(a, b) + expect(cmp.rollupDelta).toBe(2) + expect(cmp.perDimension).toHaveLength(2) + expect(cmp.perDimension.find(d => d.dim === 'product_intent')?.delta).toBe(2) + }) +}) + +describe('tierBuckets', () => { + it('produces tier slices by rank', () => { + const rows = Array.from({ length: 50 }, (_, i) => ({ + url: `${i}`, runId: `${i}`, rollupScore: 50 - i, dimensions: {}, ethicsViolations: 0, + })) + const buckets = tierBuckets(rows, [10, 25]) + expect(buckets[0].label).toBe('top 10') + expect(buckets[0].rows).toHaveLength(10) + expect(buckets[1].label).toBe('11–25') + expect(buckets[2].label).toMatch(/^26\+/) + expect(buckets[0].meanScore).toBeGreaterThan(buckets[1].meanScore) + }) +}) diff --git a/tests/reports-templates.test.ts b/tests/reports-templates.test.ts new file mode 100644 index 0000000..0ade36a --- /dev/null +++ b/tests/reports-templates.test.ts @@ -0,0 +1,84 @@ +import { describe, it, expect } from 'vitest' +import { renderLeaderboard, renderLongitudinal, renderBatchComparison, renderJobHeader } from '../src/reports/templates.js' +import type { AggregateRow } from '../src/reports/types.js' +import type { Job } from '../src/jobs/types.js' + +const ROWS: AggregateRow[] = [ + { url: 'https://stripe.com', runId: 'r1', rollupScore: 8.7, dimensions: { product_intent: 9, visual_craft: 8 }, ethicsViolations: 0, pageType: 'marketing' }, + { url: 'https://linear.app', runId: 'r2', rollupScore: 9.1, dimensions: { product_intent: 9, visual_craft: 9 }, ethicsViolations: 0, pageType: 'saas-app' }, + { url: 'https://dropbox.com', runId: 'r3', rollupScore: 6.5, dimensions: { product_intent: 7, visual_craft: 6 }, ethicsViolations: 1, pageType: 'marketing' }, +] + +describe('renderLeaderboard', () => { + it('produces a markdown table with rows in 
descending order', () => { + const md = renderLeaderboard(ROWS) + expect(md).toMatch(/# Design Audit Leaderboard/) + expect(md).toMatch(/\| 1 \| https:\/\/linear\.app/) + expect(md).toMatch(/\| 2 \| https:\/\/stripe\.com/) + expect(md).toMatch(/\| 3 \| https:\/\/dropbox\.com/) + }) + + it('honors byType filter', () => { + const md = renderLeaderboard(ROWS, { byType: 'marketing' }) + expect(md).toMatch(/page-type: `marketing`/) + expect(md).toMatch(/stripe\.com/) + expect(md).not.toMatch(/linear\.app/) + }) + + it('emits tier buckets when boundaries are supplied', () => { + const md = renderLeaderboard(ROWS, { buckets: [1, 2] }) + expect(md).toMatch(/## Tiers/) + expect(md).toMatch(/top 1/) + expect(md).toMatch(/2–2/) + }) + + it('escapes pipe characters in URLs to keep table integrity', () => { + const rows: AggregateRow[] = [{ url: 'https://x.com/path|with|pipes', runId: 'r1', rollupScore: 5, dimensions: {}, ethicsViolations: 0 }] + const md = renderLeaderboard(rows) + expect(md).toMatch(/path\\\|with\\\|pipes/) + }) +}) + +describe('renderLongitudinal', () => { + it('produces one section per URL with sorted captures', () => { + const rows: AggregateRow[] = [ + { url: 'https://stripe.com', runId: 'r1', rollupScore: 5, capturedAt: '2010-01-01T00:00:00Z', dimensions: {}, ethicsViolations: 0 }, + { url: 'https://stripe.com', runId: 'r2', rollupScore: 8, capturedAt: '2020-01-01T00:00:00Z', dimensions: {}, ethicsViolations: 0 }, + ] + const md = renderLongitudinal(rows) + expect(md).toMatch(/## https:\/\/stripe\.com/) + expect(md).toMatch(/Net change 2010-01-01 → 2020-01-01: \*\*\+3\.00\*\*/) + }) +}) + +describe('renderBatchComparison', () => { + it('diffs the first two rows when no pairs given', () => { + const md = renderBatchComparison(ROWS) + expect(md).toMatch(/## https:\/\/stripe\.com vs https:\/\/linear\.app/) + expect(md).toMatch(/Rollup delta: \*\*-0\.40\*\*/) + }) +}) + +describe('renderJobHeader', () => { + it('summarizes ok / failed / skipped counts and 
cost', () => { + const job: Job = { + jobId: 'job_abc', + spec: { kind: 'comparative-audit', discover: { source: 'list', urls: [] }, label: 'YC W25' }, + status: 'completed', + createdAt: new Date().toISOString(), + targets: [{ url: 'a' }, { url: 'b' }, { url: 'c' }], + results: [ + { url: 'a', status: 'ok' }, + { url: 'b', status: 'ok' }, + { url: 'c', status: 'failed', error: 'x' }, + ], + totalCostUSD: 1.23, + } + const md = renderJobHeader(job) + expect(md).toMatch(/job_abc/) + expect(md).toMatch(/YC W25/) + expect(md).toMatch(/ok: 2/) + expect(md).toMatch(/failed: 1/) + expect(md).toMatch(/\$1\.23/) + }) +}) diff --git a/tests/reports-tools.test.ts b/tests/reports-tools.test.ts new file mode 100644 index 0000000..d011337 --- /dev/null +++ b/tests/reports-tools.test.ts @@ -0,0 +1,117 @@ +import { describe, it, expect, afterEach } from 'vitest' +import { mkdtempSync, rmSync, writeFileSync, mkdirSync } from 'node:fs' +import { tmpdir } from 'node:os' +import { join } from 'node:path' +import { buildReportTools } from '../src/reports/tools.js' +import { saveJob, appendIndexEntry } from '../src/jobs/store.js' +import type { Job } from '../src/jobs/types.js' + +function execTool( + // The AI SDK's tool() wraps execute in a typed shim. Calling it via .execute + // requires the SDK's tool-call options shape, which tests don't have. We + // assert against the function reference and call it directly. + tool: { execute?: (input: I, ctx?: unknown) => Promise | O }, + input: I, +): Promise { + if (!tool.execute) throw new Error('tool has no execute') + return Promise.resolve(tool.execute(input, { toolCallId: 'test', messages: [] })) +} + +function setup(): { dir: string; jobId: string } { + const dir = mkdtempSync(join(tmpdir(), 'bad-tools-')) + // Two reports on disk, both backed by ok results (one marketing, one saas-app).
+ const stripeRunDir = join(dir, 'run-stripe-2020') + mkdirSync(stripeRunDir, { recursive: true }) + writeFileSync(join(stripeRunDir, 'report.json'), JSON.stringify({ + pages: [{ + auditResultV2: { classification: { type: 'marketing', domain: 'fintech' }, rollup: { score: 8.5 }, scores: { product_intent: { score: 9 } } }, + ethicsViolations: [], + }], + })) + const linearRunDir = join(dir, 'run-linear-2024') + mkdirSync(linearRunDir, { recursive: true }) + writeFileSync(join(linearRunDir, 'report.json'), JSON.stringify({ + pages: [{ + auditResultV2: { classification: { type: 'saas-app' }, rollup: { score: 9.2 }, scores: { product_intent: { score: 9 } } }, + ethicsViolations: [], + }], + })) + const job: Job = { + jobId: 'job_test_001', + spec: { kind: 'comparative-audit', discover: { source: 'wayback', urls: ['https://stripe.com', 'https://linear.app'] } }, + status: 'completed', + createdAt: new Date().toISOString(), + targets: [ + { url: 'https://stripe.com', snapshotUrl: 'https://stripe.com/2020', capturedAt: '2020-01-01T00:00:00Z' }, + { url: 'https://linear.app', snapshotUrl: 'https://linear.app/2024', capturedAt: '2024-01-01T00:00:00Z' }, + ], + results: [ + { + url: 'https://stripe.com', snapshotUrl: 'https://stripe.com/2020', capturedAt: '2020-01-01T00:00:00Z', + status: 'ok', runId: 'run-stripe-2020', resultPath: join(stripeRunDir, 'report.json'), + rollupScore: 8.5, pageType: 'marketing', + }, + { + url: 'https://linear.app', snapshotUrl: 'https://linear.app/2024', capturedAt: '2024-01-01T00:00:00Z', + status: 'ok', runId: 'run-linear-2024', resultPath: join(linearRunDir, 'report.json'), + rollupScore: 9.2, pageType: 'saas-app', + }, + ], + totalCostUSD: 0.8, + } + saveJob(job, dir) + appendIndexEntry(job, dir) + return { dir, jobId: job.jobId } +} + +describe('buildReportTools', () => { + let dir: string + afterEach(() => { if (dir) rmSync(dir, { recursive: true, force: true }) }) + + it('exposes the documented tool surface', () => { + const tools = 
buildReportTools() + const names = Object.keys(tools).sort() + expect(names).toEqual(['compareRuns', 'fetchAudit', 'longitudinal', 'queryJob', 'renderTemplate', 'runFreshAudit', 'tierBuckets'].sort()) + }) + + it('queryJob returns ranked rows', async () => { + const ctx = setup(); dir = ctx.dir + const tools = buildReportTools({ jobsDir: ctx.dir }) + const rows = await execTool(tools.queryJob, { jobId: ctx.jobId }) + expect(Array.isArray(rows)).toBe(true) + expect((rows as Array<{ rollupScore: number }>)[0].rollupScore).toBeGreaterThan(8) + }) + + it('compareRuns produces a deterministic delta', async () => { + const ctx = setup(); dir = ctx.dir + const tools = buildReportTools({ jobsDir: ctx.dir }) + const cmp = await execTool(tools.compareRuns, { jobId: ctx.jobId, runIdA: 'run-stripe-2020', runIdB: 'run-linear-2024' }) + expect((cmp as { rollupDelta: number }).rollupDelta).toBeCloseTo(8.5 - 9.2, 1) + }) + + it('renderTemplate emits markdown', async () => { + const ctx = setup(); dir = ctx.dir + const tools = buildReportTools({ jobsDir: ctx.dir }) + const out = await execTool(tools.renderTemplate, { jobId: ctx.jobId, template: 'leaderboard' }) + expect((out as { markdown: string }).markdown).toMatch(/# Design Audit Leaderboard/) + }) + + it('runFreshAudit refuses without a wired resolver', async () => { + const tools = buildReportTools() + await expect(execTool(tools.runFreshAudit, { url: 'https://x' })).rejects.toThrow(/runFreshAudit not wired/) + }) + + it('runFreshAudit dispatches to the injected resolver', async () => { + const tools = buildReportTools({ + runFreshAudit: async (url) => ({ runId: 'fresh', resultPath: '/tmp/x', rollupScore: 7 }), + }) + const out = await execTool(tools.runFreshAudit, { url: 'https://x' }) + expect((out as { runId: string }).runId).toBe('fresh') + }) + + it('fetchAudit throws when runId is unknown', async () => { + const ctx = setup(); dir = ctx.dir + const tools = buildReportTools({ jobsDir: ctx.dir }) + await 
expect(execTool(tools.fetchAudit, { jobId: ctx.jobId, runId: 'nope' })).rejects.toThrow(/runId not found/) + }) +})