From 093f3448a94c60cf74d4fd255367fa904aaabacd Mon Sep 17 00:00:00 2001
From: Jin Choi <jinchoi@u.northwestern.edu>
Date: Mon, 25 May 2026 14:55:47 -0700
Subject: [PATCH 1/3] feat(gate): add read-only insight quality audit (Phase 1)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds scripts/insight-gate.ts + scripts/lib/insight-gate.ts: a deterministic,
recomputable semantic-quality audit over the insight corpus. Sibling to
health.ts (structural validation) and verify-contracts.ts (doc/code surface) —
this layer asks the quality questions those don't: is the stance directional?
is it attributed to a real source? does its topic match its path? is it a
near-duplicate?

Checks (all deterministic, no LLM in the gate):
- stance present / stance directional (heuristic floor)
- attribution resolves to a known source (by normalized title or url)
- topic matches file path
- nearest-INSIGHT-neighbor cosine novelty: exact cosine on the stored float32
  vectors (the vec0 table returns L2 distance, not cosine), INS-only via k=64
  overfetch, with a block-threshold simulation and deduped triage pairs

Phase 1 is audit-only: it reports, never blocks (exit 1 only on operational
failure). brain.db is opened read-only. Outputs land in the gitignored
knowledge-base/meta/. Forward enforcement (Phase 2) will reuse these checks.

Tests: 47 (unit + in-memory sqlite-vec integration covering blob round-trip,
PRI-/MM- filtering, and exact cosine). Run via: npm run gate -- --audit --all

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 package.json                     |   1 +
 scripts/insight-gate.ts          | 242 ++++++++++++++
 scripts/lib/insight-gate.test.ts | 337 +++++++++++++++++++
 scripts/lib/insight-gate.ts      | 558 +++++++++++++++++++++++++++++++
 4 files changed, 1138 insertions(+)
 create mode 100644 scripts/insight-gate.ts
 create mode 100644 scripts/lib/insight-gate.test.ts
 create mode 100644 scripts/lib/insight-gate.ts

diff --git a/package.json b/package.json
index 8821e4dae2..78e48f0d9c 100644
--- a/package.json
+++ b/package.json
@@ -22,6 +22,7 @@
     "create-principles": "tsx scripts/create-principles.ts",
     "sleep": "tsx scripts/sleep.ts",
     "quality-score": "tsx scripts/quality-score.ts",
+    "gate": "tsx scripts/insight-gate.ts",
     "ask": "tsx scripts/ask.ts",
     "autoknowledge": "tsx scripts/autoknowledge.ts",
     "wake": "tsx scripts/wake.ts",
diff --git a/scripts/insight-gate.ts b/scripts/insight-gate.ts
new file mode 100644
index 0000000000..f588ba334f
--- /dev/null
+++ b/scripts/insight-gate.ts
@@ -0,0 +1,242 @@
+#!/usr/bin/env npx tsx
+/**
+ * insight-gate.ts — Semantic quality gate for insights (CLI).
+ *
+ * PHASE 1 (this file): AUDIT ONLY. Read-only X-ray of insight quality across
+ * the corpus. Quality findings never exit 1 (operational failures do) — it reports so the
+ * pass-bar can be calibrated against the real distribution before forward enforcement is switched on.
+ *
+ * Usage:
+ *   npx tsx scripts/insight-gate.ts --audit --all          # full corpus X-ray
+ *   npx tsx scripts/insight-gate.ts --audit --since 2026-05-01
+ *   npx tsx scripts/insight-gate.ts --audit --all --json   # machine-readable
+ *   npx tsx scripts/insight-gate.ts --audit --examples 30  # more failing samples
+ *
+ * Outputs (in addition to stdout):
+ *   knowledge-base/meta/gate-report.json   latest full report (overwritten)
+ *   knowledge-base/meta/gate-log.jsonl     one summary line per run (appended)
+ *
+ * Phase 2 (not yet wired) will add forward enforcement: scope to a batch and
+ * exit 1 on failures, reusing the exact check functions in lib/insight-gate.ts.
+ */
+
+import { appendFileSync, mkdirSync, writeFileSync } from "node:fs";
+import { dirname, join } from "node:path";
+import Database from "better-sqlite3";
+import * as sqliteVec from "sqlite-vec";
+
+import {
+  auditInsights,
+  buildNoveltyComputer,
+  buildSourceIndex,
+  loadGateInsights,
+  type AuditReport,
+  type CheckId,
+  type NearestFn,
+} from "./lib/insight-gate";
+import { KB_ROOT } from "./lib/kb-root";
+
+const REPORT_PATH = join(KB_ROOT, "meta", "gate-report.json");
+const LOG_PATH = join(KB_ROOT, "meta", "gate-log.jsonl");
+const DB_PATH = join(KB_ROOT, "db", "brain.db");
+
+const CHECK_LABELS: Record<CheckId, string> = {
+  stance_present: "stance present",
+  stance_directional: "stance directional",
+  attribution_resolves: "attribution resolves",
+  topic_matches_path: "topic matches path",
+};
+
+// ─── Args ─────────────────────────────────────────────────────────────
+
+interface Args {
+  since: string | null;
+  json: boolean;
+  exampleLimit: number;
+}
+
+function parseArgs(argv: string[]): Args {
+  // Token after `flag`; undefined if the flag is absent/last or the token is itself a `--flag`.
+  const arg = (flag: string) => argv.find((v, i) => argv[i - 1] === flag && !v.startsWith("--"));
+  return {
+    since: arg("--since") ?? null,
+    json: argv.includes("--json"),
+    exampleLimit: Math.max(0, parseInt(arg("--examples") ?? "", 10)) || 15, // NaN / <= 0 → default 15
+  };
+}
+
+// ─── Novelty wiring (graceful: structural audit survives a vec failure) ──
+
+function tryBuildNovelty(): { nearest: NearestFn | undefined; close: () => void } {
+  let db: InstanceType<typeof Database> | undefined;
+  try {
+    // Read-only: an audit must never migrate schema or create tables on brain.db.
+    // We load sqlite-vec (no DDL) and query the existing embeddings table directly,
+    // deliberately bypassing initDb()/initVectorTable() which run CREATE/ALTER.
+    db = new Database(DB_PATH, { readonly: true, fileMustExist: true });
+    sqliteVec.load(db);
+    return { nearest: buildNoveltyComputer(db), close: () => db?.close() };
+  } catch (err) {
+    db?.close(); // opened but extension load / setup failed: release the handle, don't leak it
+    const reason = err instanceof Error ? err.message : String(err);
+    console.warn(`WARN: novelty unavailable (${reason}). Reporting structural checks only.`);
+    return { nearest: undefined, close: () => {} };
+  }
+}
+
+// ─── Main ─────────────────────────────────────────────────────────────
+
+async function main(): Promise<void> {
+  const args = parseArgs(process.argv.slice(2));
+
+  let insights = await loadGateInsights();
+  const scope = args.since ? `since ${args.since}` : "all";
+  if (args.since) {
+    insights = insights.filter((i) => i.dateExtracted >= args.since!);
+  }
+
+  const sourceIndex = await buildSourceIndex();
+  const { nearest, close } = tryBuildNovelty();
+
+  let report: AuditReport;
+  try {
+    report = auditInsights(insights, sourceIndex, nearest, {
+      scope,
+      exampleLimit: args.exampleLimit,
+    }).report;
+  } finally {
+    close();
+  }
+
+  persist(report);
+
+  if (args.json) {
+    console.log(JSON.stringify(report, null, 2));
+  } else {
+    printReport(report);
+  }
+}
+
+// ─── Persistence ──────────────────────────────────────────────────────
+
+function persist(report: AuditReport): void {
+  mkdirSync(dirname(REPORT_PATH), { recursive: true });
+  writeFileSync(REPORT_PATH, JSON.stringify(report, null, 2) + "\n", "utf-8");
+
+  const logLine = {
+    generated_at: report.generated_at,
+    scope: report.scope,
+    total: report.total,
+    coverage: report.coverage,
+    novelty_available: report.novelty.available,
+    block_simulation: report.novelty.block_simulation,
+    singleton_topics: report.fragmentation.singletons,
+  };
+  appendFileSync(LOG_PATH, JSON.stringify(logLine) + "\n", "utf-8");
+}
+
+// ─── Pretty printer ───────────────────────────────────────────────────
+
+function printReport(report: AuditReport): void {
+  const total = report.total;
+
+  console.log("╔════════════════════════════════════════════════════════╗");
+  console.log("║   Zuhn — Insight Gate (AUDIT · read-only)                ║");
+  console.log("╚════════════════════════════════════════════════════════╝");
+  console.log(`Scope: ${report.scope}    Insights: ${total.toLocaleString()}`);
+
+  if (total === 0) {
+    console.log("\nNo insights in scope.");
+    return;
+  }
+
+  console.log("\nQuality coverage (insights passing each check):");
+  for (const checkId of Object.keys(report.coverage) as CheckId[]) {
+    const n = report.coverage[checkId];
+    console.log(
+      `  ${CHECK_LABELS[checkId].padEnd(22)} ${fmt(n).padStart(7)} / ${fmt(total)}  ${pct(n, total)}`
+    );
+  }
+
+  const { available, embedded, unembedded, no_neighbor, histogram, block_simulation, mean_similarity, top_pairs } =
+    report.novelty;
+  if (!available) {
+    console.log("\nNovelty — not computed (embeddings DB unavailable; structural checks only).");
+  } else {
+    const withNeighbor = embedded - no_neighbor;
+    const noNeighborNote = no_neighbor > 0 ? `, ${fmt(no_neighbor)} no-neighbor` : "";
+    console.log(
+      `\nNovelty — nearest INSIGHT neighbor, exact cosine (${fmt(embedded)} embedded, ${fmt(unembedded)} unembedded${noNeighborNote}):`
+    );
+    if (withNeighbor === 0) {
+      console.log("  (no insight-to-insight neighbors available)");
+    } else {
+      for (const [label, count] of Object.entries(histogram)) {
+        console.log(`  ${label.padEnd(10)} ${fmt(count).padStart(7)}  ${bar(count, withNeighbor)}`);
+      }
+      console.log(`  mean similarity: ${mean_similarity === null ? "n/a" : mean_similarity.toFixed(3)}`);
+
+      console.log("\nBlock simulation (insights that WOULD be rejected as near-dupes):");
+      for (const [threshold, count] of Object.entries(block_simulation)) {
+        console.log(`  >= ${threshold}   ${fmt(count).padStart(7)}  (${pct(count, withNeighbor)} of measured)`);
+      }
+
+      if (top_pairs.length > 0) {
+        console.log("\nClosest insight pairs (dedup triage — highest similarity first):");
+        for (const p of top_pairs.slice(0, 10)) {
+          console.log(`  ${p.similarity.toFixed(3)}  ${p.id}  ~  ${p.nearestId}`);
+        }
+      }
+    }
+  }
+
+  const frag = report.fragmentation;
+  console.log(
+    `\nTopic fragmentation: ${fmt(frag.singletons)} singleton topics / ${fmt(frag.total_topics)} total`
+  );
+  if (frag.singletons > 0) {
+    const preview = frag.singleton_topics.slice(0, 12).join(", ");
+    const more = frag.singletons > 12 ? `, … (+${frag.singletons - 12} more)` : "";
+    console.log(`  ${preview}${more}`);
+  }
+
+  console.log("\nSample failures:");
+  let anyFailures = false;
+  for (const checkId of Object.keys(report.failing_examples) as CheckId[]) {
+    const examples = report.failing_examples[checkId];
+    if (examples.length === 0) continue;
+    anyFailures = true;
+    console.log(`  ${CHECK_LABELS[checkId]} (${fmt(total - report.coverage[checkId])} failing):`);
+    for (const ex of examples) {
+      const detail = ex.detail ? ` — ${ex.detail}` : "";
+      console.log(`    ${ex.id}  ${ex.relPath}${detail}`);
+    }
+  }
+  if (!anyFailures) console.log("  none — all checks pass.");
+
+  console.log(`\nFull report: ${REPORT_PATH}`);
+}
+
+// ─── Format helpers ───────────────────────────────────────────────────
+
+function fmt(n: number): string {
+  return n.toLocaleString();
+}
+
+function pct(n: number, total: number): string {
+  return total === 0 ? "0.0%" : `${((100 * n) / total).toFixed(1)}%`;
+}
+
+function bar(count: number, total: number): string {
+  if (total === 0) return "";
+  const width = Math.round((count / total) * 30);
+  return "█".repeat(width);
+}
+
+main().catch((err) => {
+  // Contract: AUDIT never exits non-zero on QUALITY findings (it only reports).
+  // A non-zero exit here means an OPERATIONAL failure — corpus unreadable, report
+  // unwritable, etc. — which should fail loud rather than pretend success.
+  console.error("insight-gate.ts failed:", err);
+  process.exit(1);
+});
diff --git a/scripts/lib/insight-gate.test.ts b/scripts/lib/insight-gate.test.ts
new file mode 100644
index 0000000000..695b94aed0
--- /dev/null
+++ b/scripts/lib/insight-gate.test.ts
@@ -0,0 +1,337 @@
+import { describe, expect, it } from "vitest";
+import Database from "better-sqlite3";
+
+import {
+  auditInsights,
+  buildNoveltyComputer,
+  checkAttributionResolves,
+  checkStanceDirectional,
+  checkStancePresent,
+  checkTopicMatchesPath,
+  cosineSimilarity,
+  isDirectionalStance,
+  normalizeTitle,
+  normalizeUrl,
+  type GateInsight,
+  type NearestFn,
+  type SourceIndex,
+} from "./insight-gate";
+import { initVectorTable, upsertEmbedding } from "./vector-search";
+
+// ─── Fixtures ─────────────────────────────────────────────────────────
+
+function mk(over: Partial<GateInsight> = {}): GateInsight {
+  return {
+    id: "INS-260525-AAAA",
+    domain: "ai-development",
+    topic: "agents",
+    title: "A title",
+    stance: "Agents fail because harness context is incomplete",
+    sources: [{ title: "Known Source", url: "https://example.com/a" }],
+    dateExtracted: "2026-05-25",
+    filePath: "/abs/domains/ai-development/agents/x.md",
+    relPath: "domains/ai-development/agents/x.md",
+    ...over,
+  };
+}
+
+const INDEX: SourceIndex = {
+  titles: new Set(["known source"]),
+  urls: new Set(["example.com/a"]),
+};
+
+// ─── isDirectionalStance ──────────────────────────────────────────────
+
+describe("isDirectionalStance", () => {
+  const directional = [
+    "AI will displace most white-collar workers",
+    "Context engineering beats model choice for agent reliability",
+    "Teams should adopt typed schemas because runtime checks rot",
+    "Verification is the bottleneck, so agents need a quality gate",
+    "Founders who ship early outperform those who plan",
+    "Memory capture is commoditizing rather than a durable moat",
+    // Regression: real corpus stances the heuristic wrongly flagged before the
+    // vacuous-list narrowing + directional-verb additions (incidental "good"/"bad").
+    "Building intuitive feel for what an AI is good at and when it hallucinates is a valid form of knowledge, not a substitute for 'real' understanding.",
+    "The booster-doomer debate creates a false spectrum where the only question is whether powerful AI is good or bad",
+    "Distributing systems across multiple geographic regions provides fault tolerance against regional failures but introduces data consistency challenges",
+  ];
+  for (const s of directional) {
+    it(`passes directional claim: "${s.slice(0, 40)}"`, () => {
+      expect(isDirectionalStance(s)).toBe(true);
+    });
+  }
+
+  const weak = [
+    "This discusses AI's impact on jobs",
+    "An overview of agent patterns",
+    "It covers prompt engineering basics",
+    "AI is important",
+    "Embeddings are interesting",
+    "The talk explains how transformers work",
+    "AI matters", // too short
+    "", // empty
+  ];
+  for (const s of weak) {
+    it(`rejects weak/descriptive claim: "${s.slice(0, 40)}"`, () => {
+      expect(isDirectionalStance(s)).toBe(false);
+    });
+  }
+
+  it("lets a directional marker override a vacuous value word", () => {
+    expect(isDirectionalStance("Latency is important because it predicts churn")).toBe(true);
+  });
+});
+
+// ─── Stance checks ────────────────────────────────────────────────────
+
+describe("checkStancePresent", () => {
+  it("fails on empty / whitespace-only stance", () => {
+    expect(checkStancePresent(mk({ stance: "" })).passed).toBe(false);
+    expect(checkStancePresent(mk({ stance: "   " })).passed).toBe(false);
+  });
+  it("passes when a stance exists", () => {
+    expect(checkStancePresent(mk({ stance: "X beats Y" })).passed).toBe(true);
+  });
+});
+
+describe("checkStanceDirectional", () => {
+  it("reports 'no stance' when absent (does not crash)", () => {
+    const r = checkStanceDirectional(mk({ stance: "" }));
+    expect(r.passed).toBe(false);
+    expect(r.detail).toBe("no stance");
+  });
+  it("fails a vacuous stance with a helpful detail", () => {
+    const r = checkStanceDirectional(mk({ stance: "AI is important" }));
+    expect(r.passed).toBe(false);
+    expect(r.detail).toContain("not directional");
+  });
+  it("passes a directional stance", () => {
+    expect(checkStanceDirectional(mk({ stance: "X reduces cost because Y" })).passed).toBe(true);
+  });
+});
+
+// ─── Normalizers ──────────────────────────────────────────────────────
+
+describe("normalizeTitle", () => {
+  it("trims, lowercases, and collapses whitespace", () => {
+    expect(normalizeTitle("  Some   Source  ")).toBe("some source");
+  });
+});
+
+describe("normalizeUrl", () => {
+  it("strips protocol, www, and trailing slashes; lowercases", () => {
+    expect(normalizeUrl("https://www.Example.com/x/")).toBe("example.com/x");
+    expect(normalizeUrl("http://example.com/a")).toBe("example.com/a");
+  });
+});
+
+// ─── Attribution ──────────────────────────────────────────────────────
+
+describe("checkAttributionResolves", () => {
+  it("fails when sources[] is empty (orphan)", () => {
+    const r = checkAttributionResolves(mk({ sources: [] }), INDEX);
+    expect(r.passed).toBe(false);
+    expect(r.detail).toBe("no sources[]");
+  });
+  it("resolves by normalized url", () => {
+    const ins = mk({ sources: [{ url: "https://www.example.com/a/" }] });
+    expect(checkAttributionResolves(ins, INDEX).passed).toBe(true);
+  });
+  it("resolves by normalized title", () => {
+    const ins = mk({ sources: [{ title: "  known source " }] });
+    expect(checkAttributionResolves(ins, INDEX).passed).toBe(true);
+  });
+  it("passes if any one of several refs resolves", () => {
+    const ins = mk({ sources: [{ title: "Unknown" }, { url: "https://example.com/a" }] });
+    expect(checkAttributionResolves(ins, INDEX).passed).toBe(true);
+  });
+  it("fails when nothing resolves", () => {
+    const ins = mk({ sources: [{ title: "Ghost", url: "https://nope.com/z" }] });
+    expect(checkAttributionResolves(ins, INDEX).passed).toBe(false);
+  });
+});
+
+// ─── Topic / path ─────────────────────────────────────────────────────
+
+describe("checkTopicMatchesPath", () => {
+  it("passes when frontmatter domain/topic match the path", () => {
+    expect(checkTopicMatchesPath(mk()).passed).toBe(true);
+  });
+  it("fails on a domain/topic mismatch and explains it", () => {
+    const r = checkTopicMatchesPath(mk({ domain: "startups" }));
+    expect(r.passed).toBe(false);
+    expect(r.detail).toContain("!= path");
+  });
+});
+
+// ─── Cosine similarity ────────────────────────────────────────────────
+
+describe("cosineSimilarity", () => {
+  it("is 1 for identical vectors", () => {
+    const v = new Float32Array([1, 2, 3]);
+    expect(cosineSimilarity(v, v)).toBeCloseTo(1, 6);
+  });
+  it("is 0 for orthogonal vectors", () => {
+    expect(cosineSimilarity(new Float32Array([1, 0]), new Float32Array([0, 1]))).toBeCloseTo(0, 6);
+  });
+  it("is -1 for opposite vectors", () => {
+    expect(cosineSimilarity(new Float32Array([1, 1]), new Float32Array([-1, -1]))).toBeCloseTo(-1, 6);
+  });
+  it("is invariant to magnitude (direction only)", () => {
+    expect(cosineSimilarity(new Float32Array([2, 0]), new Float32Array([5, 0]))).toBeCloseTo(1, 6);
+  });
+  it("returns 0 for a zero vector", () => {
+    expect(cosineSimilarity(new Float32Array([0, 0]), new Float32Array([1, 1]))).toBe(0);
+  });
+});
+
+// ─── Audit aggregation ────────────────────────────────────────────────
+
+describe("auditInsights", () => {
+  const good = mk({
+    id: "INS-1",
+    stance: "X reduces cost because Y",
+    sources: [{ title: "Known Source" }],
+    relPath: "domains/ai-development/agents/a.md",
+  });
+  const weak = mk({
+    id: "INS-2",
+    stance: "AI is important",
+    sources: [{ url: "https://example.com/a" }],
+    relPath: "domains/ai-development/agents/b.md",
+  });
+  const orphan = mk({
+    id: "INS-3",
+    stance: "",
+    sources: [],
+    domain: "startups",
+    topic: "lonely",
+    relPath: "domains/startups/lonely/c.md",
+  });
+
+  it("aggregates coverage counts per check", () => {
+    const { report } = auditInsights([good, weak, orphan], INDEX);
+    expect(report.total).toBe(3);
+    expect(report.coverage.stance_present).toBe(2); // good + weak
+    expect(report.coverage.stance_directional).toBe(1); // good only
+    expect(report.coverage.attribution_resolves).toBe(2); // good (title) + weak (url)
+    expect(report.coverage.topic_matches_path).toBe(3); // all filed correctly
+  });
+
+  it("marks novelty unavailable (not 'all unembedded') when no novelty fn is supplied", () => {
+    const { report } = auditInsights([good, weak, orphan], INDEX);
+    expect(report.novelty.available).toBe(false);
+    expect(report.novelty.embedded).toBe(0);
+    expect(report.novelty.unembedded).toBe(0);
+    expect(report.novelty.no_neighbor).toBe(0);
+    expect(report.novelty.mean_similarity).toBeNull();
+    expect(report.novelty.top_pairs).toEqual([]);
+  });
+
+  it("separates 'embedded but no neighbor' from 'unembedded' when novelty is available", () => {
+    const nearest: NearestFn = (id) =>
+      id === "INS-1"
+        ? { nearestId: null, similarity: null, selfEmbedded: true } // embedded, but no INS neighbor
+        : { nearestId: null, similarity: null, selfEmbedded: false }; // not embedded at all
+    const { report } = auditInsights([good, weak], INDEX, nearest);
+    expect(report.novelty.available).toBe(true);
+    expect(report.novelty.embedded).toBe(1);
+    expect(report.novelty.no_neighbor).toBe(1);
+    expect(report.novelty.unembedded).toBe(1);
+  });
+
+  it("detects singleton topics from the file path", () => {
+    const { report } = auditInsights([good, weak, orphan], INDEX);
+    expect(report.fragmentation.total_topics).toBe(2);
+    expect(report.fragmentation.singletons).toBe(1);
+    expect(report.fragmentation.singleton_topics).toEqual(["startups/lonely"]);
+  });
+
+  it("collects failing examples with details", () => {
+    const { report } = auditInsights([good, weak, orphan], INDEX);
+    expect(report.failing_examples.stance_present.map((e) => e.id)).toEqual(["INS-3"]);
+    expect(report.failing_examples.stance_directional.map((e) => e.id)).toEqual(["INS-2", "INS-3"]);
+    expect(report.failing_examples.attribution_resolves.map((e) => e.id)).toEqual(["INS-3"]);
+  });
+
+  it("respects exampleLimit", () => {
+    const orphans = Array.from({ length: 5 }, (_, i) =>
+      mk({ id: `INS-X${i}`, sources: [] })
+    );
+    const { report } = auditInsights(orphans, INDEX, undefined, { exampleLimit: 2 });
+    expect(report.failing_examples.attribution_resolves).toHaveLength(2);
+  });
+
+  it("builds the cosine histogram and block simulation from a novelty fn", () => {
+    const nearest: NearestFn = (id) =>
+      id === "INS-1"
+        ? { nearestId: "INS-9", similarity: 0.97, selfEmbedded: true }
+        : { nearestId: null, similarity: null, selfEmbedded: false };
+    const { report } = auditInsights([good], INDEX, nearest);
+    expect(report.novelty.available).toBe(true);
+    expect(report.novelty.embedded).toBe(1);
+    expect(report.novelty.histogram[">=0.95"]).toBe(1);
+    expect(report.novelty.block_simulation["0.85"]).toBe(1);
+    expect(report.novelty.block_simulation["0.95"]).toBe(1);
+    expect(report.novelty.mean_similarity).toBe(0.97);
+    expect(report.novelty.top_pairs).toEqual([
+      { id: "INS-1", nearestId: "INS-9", similarity: 0.97 },
+    ]);
+  });
+});
+
+// ─── buildNoveltyComputer (integration: real in-memory sqlite-vec) ─────
+// Exercises the DB path codex flagged as untested: float32 blob round-trip,
+// PRI-/MM- filtering, and exact-cosine computation.
+
+describe("buildNoveltyComputer (in-memory sqlite-vec)", () => {
+  /** Build a 768-dim vector from [index, value] pairs (rest zero). */
+  function vec768(pairs: Array<[number, number]>): number[] {
+    const v = new Array(768).fill(0);
+    for (const [i, x] of pairs) v[i] = x;
+    return v;
+  }
+
+  it("returns the nearest INSIGHT neighbor with exact cosine, skipping PRI-/MM-", () => {
+    const db = new Database(":memory:");
+    initVectorTable(db);
+    upsertEmbedding(db, "INS-A", vec768([[0, 1]]));
+    upsertEmbedding(db, "PRI-C", vec768([[0, 1], [1, 0.001]])); // ~identical to A, but a principle
+    upsertEmbedding(db, "MM-D", vec768([[0, 1], [2, 0.001]])); // ~identical to A, but a mental model
+    upsertEmbedding(db, "INS-B", vec768([[0, 1], [1, 0.3]])); // cosine(A,B) = 1/sqrt(1.09) ≈ 0.958
+
+    const nearest = buildNoveltyComputer(db);
+    const r = nearest("INS-A");
+
+    expect(r.selfEmbedded).toBe(true);
+    expect(r.nearestId).toBe("INS-B"); // PRI-C and MM-D are nearer but filtered out
+    expect(r.similarity).toBeCloseTo(0.958, 2);
+    db.close();
+  });
+
+  it("reports selfEmbedded=false when the insight has no embedding row", () => {
+    const db = new Database(":memory:");
+    initVectorTable(db);
+    upsertEmbedding(db, "INS-B", vec768([[0, 1]]));
+
+    const r = buildNoveltyComputer(db)("INS-MISSING");
+    expect(r.selfEmbedded).toBe(false);
+    expect(r.nearestId).toBeNull();
+    expect(r.similarity).toBeNull();
+    db.close();
+  });
+
+  it("reports selfEmbedded=true with no neighbor when only non-insight rows exist", () => {
+    const db = new Database(":memory:");
+    initVectorTable(db);
+    upsertEmbedding(db, "INS-A", vec768([[0, 1]]));
+    upsertEmbedding(db, "PRI-C", vec768([[0, 1], [1, 0.1]]));
+
+    const r = buildNoveltyComputer(db)("INS-A");
+    expect(r.selfEmbedded).toBe(true);
+    expect(r.nearestId).toBeNull();
+    expect(r.similarity).toBeNull();
+    db.close();
+  });
+});
diff --git a/scripts/lib/insight-gate.ts b/scripts/lib/insight-gate.ts
new file mode 100644
index 0000000000..921ba63c46
--- /dev/null
+++ b/scripts/lib/insight-gate.ts
@@ -0,0 +1,558 @@
+/**
+ * insight-gate.ts (lib) — Semantic quality gate for insights.
+ *
+ * Sibling to lib/verify-contracts.ts: a set of pure, deterministic,
+ * RECOMPUTABLE checks plus an audit aggregator. Unlike health.ts
+ * (structural: does the YAML parse?) this layer asks the semantic-quality
+ * questions: is the stance directional? is it attributed to a real source?
+ * is it filed under a topic that matches its path? is it a near-duplicate?
+ *
+ * Architecture boundary (CLAUDE.md): NO LLM here. Every check is a pure
+ * function of current state, so the verdict can be recomputed at any time
+ * without storing it. The LLM rubric stays in quality-score.ts (the
+ * advisory/audit layer) — never in the gate.
+ *
+ * Phase 1 is AUDIT ONLY: these functions report; the CLI exits 1 only on operational failure.
+ * Forward enforcement (Phase 2) reuses the exact same check functions.
+ */
+
+import { readFile } from "node:fs/promises";
+import { relative } from "node:path";
+import fg from "fast-glob";
+import matter from "gray-matter";
+import type Database from "better-sqlite3";
+
+import { KB_ROOT } from "./kb-root";
+
+// ─── Types ────────────────────────────────────────────────────────────
+
+export type CheckId =
+  | "stance_present"
+  | "stance_directional"
+  | "attribution_resolves"
+  | "topic_matches_path";
+
+export const CHECK_IDS: CheckId[] = [
+  "stance_present",
+  "stance_directional",
+  "attribution_resolves",
+  "topic_matches_path",
+];
+
+/** A source reference as stored on an insight (sources[] in frontmatter). */
+export interface SourceRef {
+  title?: string;
+  url?: string;
+}
+
+/** Defensively-parsed insight — fields may be missing on older insights. */
+export interface GateInsight {
+  id: string;
+  domain: string;
+  topic: string;
+  title: string;
+  stance: string; // "" when absent
+  sources: SourceRef[];
+  dateExtracted: string; // "" when absent
+  filePath: string; // absolute
+  relPath: string; // relative to KB_ROOT, e.g. domains/<domain>/<topic>/<file>.md
+}
+
+export interface CheckResult {
+  checkId: CheckId;
+  passed: boolean;
+  detail?: string;
+}
+
+export interface NoveltyResult {
+  /** Nearest INSIGHT neighbor id, or null when none was found. */
+  nearestId: string | null;
+  /** Exact cosine similarity to the nearest insight neighbor, or null. */
+  similarity: number | null;
+  /** Whether THIS insight has an embedding — distinguishes "unembedded" from "no neighbor found". */
+  selfEmbedded: boolean;
+}
+
+export interface InsightAudit {
+  id: string;
+  relPath: string;
+  checks: CheckResult[];
+  novelty: NoveltyResult;
+}
+
+/** Normalized lookup sets built from the sources/ directory. */
+export interface SourceIndex {
+  titles: Set<string>;
+  urls: Set<string>;
+}
+
+export interface AuditReport {
+  generated_at: string;
+  scope: string;
+  total: number;
+  /** Number of insights PASSING each check. */
+  coverage: Record<CheckId, number>;
+  novelty: {
+    /** Whether novelty was computed at all (false when the embeddings DB was unavailable). */
+    available: boolean;
+    /** Insights that have their own embedding. */
+    embedded: number;
+    /** Insights with no embedding row. */
+    unembedded: number;
+    /** Embedded insights for which no INS- neighbor was found (e.g. outliers). */
+    no_neighbor: number;
+    /** Bucketed counts of nearest-neighbor similarity (insights with a neighbor only). */
+    histogram: Record<string, number>;
+    /** For each candidate block threshold, how many insights have a neighbor >= it. */
+    block_simulation: Record<string, number>;
+    mean_similarity: number | null;
+    /** Closest insight pairs by similarity, for dedup triage (highest first). */
+    top_pairs: Array<{ id: string; nearestId: string; similarity: number }>;
+  };
+  fragmentation: {
+    total_topics: number;
+    singletons: number;
+    singleton_topics: string[]; // "domain/topic"
+  };
+  /** Up to `exampleLimit` failing insights per check, for triage. */
+  failing_examples: Record<CheckId, Array<{ id: string; relPath: string; detail?: string }>>;
+}
+
+// ─── Stance directionality heuristic ──────────────────────────────────
+//
+// A real stance is an ASSERTION that could be true or false — a directional
+// claim — not a description of the source ("this discusses X") or a vacuous
+// value statement ("X is important"). This is a deterministic FLOOR: it will
+// have false positives/negatives. Audit mode reports the rate so the bar can
+// be calibrated against the corpus rather than guessed. Patterns are exported
+// so they are testable and tweakable.
+
+/** Verbs/phrases that describe the source instead of asserting a claim. */
+export const STANCE_DESCRIPTIVE =
+  /\b(discusses|covers|explores|explains|describes|examines|outlines|highlights|mentions|summari[sz]es|talks about|is about|provides? an overview|gives? an overview|an overview of|introduction to|a look at)\b/i;
+
+// Bare value claims with no direction ("X is important/interesting/...").
+// Deliberately narrow: only "puffery" adjectives that almost never appear
+// except as the vacuous predicate of a non-claim. Generic words (good, bad,
+// complex, common, ...) were removed because they fire incidentally inside
+// genuinely directional stances ("AI is good at X", "good enough to ...").
+export const STANCE_VACUOUS_VALUE =
+  /\b(is|are|was|were|can be|remains?|stays?|seems?)\s+(?:very\s+|increasingly\s+|quite\s+|really\s+|highly\s+|often\s+|generally\s+)?(important|interesting|useful|valuable|key|crucial|essential|powerful|significant|notable|relevant)\b/i;
+
+/** Causal / comparative / predictive / consequential markers => directional. */
+// NOTE: keep this set disjoint from STANCE_DESCRIPTIVE — a verb in both (e.g.
+// "introduces", which can mean "describes" or "brings about") would be checked
+// directional-first and silently pass source-descriptive stances.
+export const STANCE_DIRECTIONAL =
+  /\b(because|since|so that|therefore|thus|hence|leads? to|causes?|drives?|predicts?|will|won't|should|shouldn't|must|cannot|can't|outperforms?|beats?|exceeds?|fails?|unless|instead of|rather than|more than|less than|faster than|slower than|better than|worse than|enables?|prevents?|requires?|eliminates?|replaces?|reduces?|increases?|decreases?|creates?|trades?|transforms?|determines?|undermines?|forces?|favou?rs?|breaks?|shifts?|results? in|means that|implies|trumps?|wins?|loses?|matters? more)\b/i;
+
+const STANCE_MIN_LENGTH = 15;
+
+/**
+ * Heuristic: is this stance a directional, falsifiable claim (vs. descriptive
+ * or vacuous)? Order matters — an explicit directional marker passes even when
+ * a weak word is also present ("X is important BECAUSE it predicts Y").
+ */
+export function isDirectionalStance(stance: string): boolean {
+  const text = stance.trim();
+  const longEnough = text.length >= STANCE_MIN_LENGTH;
+  // An explicit directional marker wins over any descriptive/vacuous wording.
+  if (longEnough && STANCE_DIRECTIONAL.test(text)) return true;
+  const weak = STANCE_DESCRIPTIVE.test(text) || STANCE_VACUOUS_VALUE.test(text);
+  return longEnough && !weak;
+}
+
+// ─── Normalizers ──────────────────────────────────────────────────────
+
+export function normalizeTitle(title: string): string {
+  return title.toLowerCase().trim().split(/\s+/).join(" "); // case-fold, collapse whitespace runs
+}
+
+export function normalizeUrl(url: string): string {
+  // Case-fold, then peel off scheme, leading "www." and trailing slashes in turn.
+  const folded = url.trim().toLowerCase();
+  const noScheme = folded.replace(/^https?:\/\//, "");
+  const noWww = noScheme.replace(/^www\./, "");
+  const bare = noWww.replace(/\/+$/, "");
+  return bare;
+}
+
+// ─── Pure checks ──────────────────────────────────────────────────────
+
+function result(checkId: CheckId, passed: boolean, detail?: string): CheckResult {
+  return { checkId, passed, ...(detail === undefined ? {} : { detail }) }; // omit an absent detail key
+}
+
+export function checkStancePresent(ins: GateInsight): CheckResult {
+  const hasStance = ins.stance.trim() !== ""; // whitespace-only counts as absent
+  if (hasStance) return result("stance_present", true);
+  return result("stance_present", false, "no stance");
+}
+
+export function checkStanceDirectional(ins: GateInsight): CheckResult {
+  const stance = ins.stance.trim();
+  // An absent stance cannot be directional; presence itself is reported by
+  // checkStancePresent, so this only records the knock-on failure.
+  if (stance === "") return result("stance_directional", false, "no stance");
+  if (isDirectionalStance(stance)) return result("stance_directional", true);
+  // Quote a shortened copy of the offending stance for triage.
+  const quoted = truncate(stance, 80);
+  return result("stance_directional", false, `not directional: "${quoted}"`);
+}
+
+/**
+ * Passes if AT LEAST ONE of the insight's source references resolves to a
+ * known source file (by normalized url or title). An insight with no
+ * sources[] is an orphan and fails.
+ */
+export function checkAttributionResolves(
+  ins: GateInsight,
+  index: SourceIndex
+): CheckResult {
+  const refs = ins.sources;
+  // No references at all: the insight is an orphan.
+  if (refs.length === 0) {
+    return result("attribution_resolves", false, "no sources[]");
+  }
+  // A reference resolves when its normalized url OR title is a known source;
+  // empty/missing fields never match.
+  const resolves = (ref: SourceRef): boolean => {
+    const byUrl = !!ref.url && index.urls.has(normalizeUrl(ref.url));
+    const byTitle = !!ref.title && index.titles.has(normalizeTitle(ref.title));
+    return byUrl || byTitle;
+  };
+  if (refs.some(resolves)) {
+    return result("attribution_resolves", true);
+  }
+  // Nothing resolved: list what was referenced so the failure can be triaged.
+  const labels = refs.map((ref) => ref.title || ref.url || "(empty)").join(", ");
+  const detail = `no source ref resolves (${truncate(labels, 80)})`;
+  return result("attribution_resolves", false, detail);
+}
+
+/**
+ * Passes if the insight's frontmatter domain/topic match its file location
+ * (domains/<domain>/<topic>/...). Catches misfiled insights and frontmatter
+ * that drifted from the path.
+ */
+export function checkTopicMatchesPath(ins: GateInsight): CheckResult {
+  // relPath looks like domains/<domain>/<topic>/<file>.md, so segments 1 and 2
+  // give the on-disk location; a missing segment compares as "".
+  const [, pathDomain = "", pathTopic = ""] = splitPath(ins.relPath);
+  const misfiled = pathDomain !== ins.domain || pathTopic !== ins.topic;
+  if (!misfiled) {
+    return result("topic_matches_path", true);
+  }
+  // Report both sides so the drift is visible in the audit output.
+  const declared = `${ins.domain}/${ins.topic}`;
+  const actual = `${pathDomain}/${pathTopic}`;
+  return result("topic_matches_path", false, `frontmatter ${declared} != path ${actual}`);
+}
+
+export function runChecks(ins: GateInsight, index: SourceIndex): CheckResult[] {
+  // Result order mirrors CHECK_IDS: stance presence, stance direction,
+  // attribution, topic/path.
+  const stancePresent = checkStancePresent(ins);
+  const stanceDirectional = checkStanceDirectional(ins);
+  const attribution = checkAttributionResolves(ins, index);
+  return [stancePresent, stanceDirectional, attribution, checkTopicMatchesPath(ins)];
+}
+
+// ─── Loaders ──────────────────────────────────────────────────────────
+
+const INSIGHT_GLOB = "domains/**/*.md";
+const INSIGHT_IGNORE = ["**/_overview.md", "**/_summary.md", "**/_index.md"];
+
+export async function loadGateInsights(kbRoot: string = KB_ROOT): Promise<GateInsight[]> {
+  const paths = await fg(INSIGHT_GLOB, { cwd: kbRoot, absolute: true, ignore: INSIGHT_IGNORE });
+  // Frontmatter of one file, or null when it cannot be read/parsed; such files
+  // are skipped silently here because health.ts reports them separately.
+  const readFrontmatter = async (path: string): Promise<Record<string, unknown> | null> => {
+    try {
+      return matter(await readFile(path, "utf-8")).data as Record<string, unknown>;
+    } catch {
+      return null;
+    }
+  };
+
+  const loaded: GateInsight[] = [];
+  for (const filePath of paths) {
+    const fm = await readFrontmatter(filePath);
+    // Without a string id the file cannot be tracked as an insight.
+    if (!fm || typeof fm.id !== "string") continue;
+    // Absent optional fields become "" / [] so older insights still load.
+    loaded.push({
+      id: fm.id,
+      domain: asString(fm.domain),
+      topic: asString(fm.topic),
+      title: asString(fm.title),
+      stance: asString(fm.stance),
+      dateExtracted: asString(fm.date_extracted),
+      sources: parseSources(fm.sources),
+      filePath,
+      relPath: relative(kbRoot, filePath),
+    });
+  }
+  return loaded;
+}
+
+export async function buildSourceIndex(kbRoot: string = KB_ROOT): Promise<SourceIndex> {
+  const index: SourceIndex = { titles: new Set<string>(), urls: new Set<string>() };
+  const sourceFiles = await fg("sources/**/*.md", {
+    cwd: kbRoot,
+    absolute: true,
+    ignore: ["**/raw/**", "**/_index.md"], // skip anything under raw/ and _index.md files
+  });
+  for (const sourceFile of sourceFiles) {
+    let fm: Record<string, unknown>;
+    try {
+      fm = matter(await readFile(sourceFile, "utf-8")).data as Record<string, unknown>;
+    } catch {
+      continue; // unreadable or unparseable source file: leave it out of the index
+    }
+    const title = fm?.title;
+    const url = fm?.url;
+    if (typeof title === "string") index.titles.add(normalizeTitle(title));
+    if (typeof url === "string") index.urls.add(normalizeUrl(url));
+  }
+  return index;
+}
+
+// ─── Novelty (cosine nearest-neighbor) ────────────────────────────────
+//
+// Two-step, deliberately metric-agnostic:
+//   1. vec0 KNN (`embedding MATCH ? AND k = N`) to find candidate neighbors.
+//      The live embeddings table returns L2 distance (it was created before
+//      distance_metric=cosine was added; CREATE…IF NOT EXISTS no-ops on the
+//      existing table). For the L2-normalized vectors Zuhn stores, L2 order
+//      and cosine order coincide, so the KNN ranking is correct.
+//   2. Compute EXACT cosine on the actual stored vectors — so the reported
+//      value never depends on which distance metric vec0 happens to use.
+//
+// Neighbors are filtered to INS- ids: the dedup signal is insight-to-insight,
+// and the table also holds principles/mental models (PRI-/MM-). We overfetch
+// (large k) and filter in JS so a real insight neighbor isn't missed behind a
+// run of PRI-/MM- rows in a tight compression cluster.
+
+const KNN_K = 64;
+
+/** Reinterpret a vec0 float32 blob as a Float32Array (copy → 4-byte aligned). */
+function toFloat32(buf: Buffer): Float32Array {
+  const start = buf.byteOffset; // a Buffer may be a view into a larger pool, so copy its own bytes only
+  return new Float32Array(buf.buffer.slice(start, start + buf.length));
+}
+
+export function cosineSimilarity(a: Float32Array, b: Float32Array): number {
+  // Only the shared prefix is compared when the lengths differ.
+  const len = a.length < b.length ? a.length : b.length;
+  let dot = 0, normA = 0, normB = 0;
+  for (let k = 0; k < len; k += 1) {
+    const x = a[k];
+    const y = b[k];
+    dot += x * y;
+    normA += x * x;
+    normB += y * y;
+  }
+  return normA === 0 || normB === 0 ? 0 : dot / (Math.sqrt(normA) * Math.sqrt(normB)); // zero vector → 0, not NaN
+}
+
+export type NearestFn = (id: string) => NoveltyResult;
+
+export function buildNoveltyComputer(db: Database.Database): NearestFn {
+  const selectEmbedding = db.prepare("SELECT embedding FROM embeddings WHERE id = ?");
+  const selectNeighbors = db.prepare(
+    `SELECT id, distance FROM embeddings WHERE embedding MATCH ? AND k = ${KNN_K}`
+  );
+  const vectorOf = (rowId: string) =>
+    selectEmbedding.get(rowId) as { embedding: Buffer } | undefined;
+  return function nearest(id: string): NoveltyResult {
+    const own = vectorOf(id);
+    if (!own) return { nearestId: null, similarity: null, selfEmbedded: false };
+
+    // Candidates come back nearest-first: take the first other INS- row.
+    const candidates = selectNeighbors.all(own.embedding) as Array<{ id: string; distance: number }>;
+    const hit = candidates.find((c) => c.id.startsWith("INS-") && c.id !== id);
+    if (!hit) return { nearestId: null, similarity: null, selfEmbedded: true };
+
+    // Score with exact cosine on the stored vectors, not the KNN distance.
+    const theirs = vectorOf(hit.id);
+    if (!theirs) return { nearestId: hit.id, similarity: null, selfEmbedded: true };
+    const similarity = cosineSimilarity(toFloat32(own.embedding), toFloat32(theirs.embedding));
+    return { nearestId: hit.id, similarity, selfEmbedded: true };
+  };
+}
+
+// ─── Audit aggregation ────────────────────────────────────────────────
+
+/** Histogram buckets for nearest-neighbor similarity (high → low). */
+const SIM_BUCKETS: Array<{ label: string; min: number }> = [
+  { label: ">=0.95", min: 0.95 },
+  { label: "0.90-0.95", min: 0.9 },
+  { label: "0.85-0.90", min: 0.85 },
+  { label: "0.80-0.85", min: 0.8 },
+  { label: "0.70-0.80", min: 0.7 },
+  { label: "<0.70", min: -Infinity },
+];
+
+/** Candidate "too duplicate to admit" thresholds, for block simulation. */
+export const BLOCK_THRESHOLDS = [0.85, 0.9, 0.95];
+
+export function auditInsights(
+  insights: GateInsight[],
+  index: SourceIndex,
+  nearest?: NearestFn,
+  options: { scope?: string; exampleLimit?: number } = {}
+): { audits: InsightAudit[]; report: AuditReport } {
+  const exampleLimit = options.exampleLimit ?? 15; // max failing examples kept per check
+  // Distinguish "novelty not computed" (no DB) from "computed, found nothing".
+  const noveltyAvailable = nearest !== undefined;
+
+  const coverage = emptyCheckRecord(); // per check: how many insights PASS
+  const failingExamples = emptyExampleRecord(); // per check: first failing insights, capped
+  const histogram: Record<string, number> = Object.fromEntries(
+    SIM_BUCKETS.map((b) => [b.label, 0])
+  );
+  const blockSimulation: Record<string, number> = Object.fromEntries(
+    BLOCK_THRESHOLDS.map((t) => [t.toFixed(2), 0])
+  );
+  const topicCounts = new Map<string, number>(); // "domain/topic" (from path) → insight count
+
+  let embedded = 0; // insights that have their own embedding
+  let unembedded = 0; // insights with no embedding row
+  let noNeighbor = 0; // embedded, but no usable neighbor/similarity
+  let simSum = 0; // running sum of similarities, for the mean
+  let simCount = 0; // how many similarities went into simSum
+  const pairs: Array<{ id: string; nearestId: string; similarity: number }> = [];
+
+  const audits: InsightAudit[] = [];
+
+  for (const ins of insights) {
+    const checks = runChecks(ins, index);
+    for (const check of checks) {
+      if (check.passed) {
+        coverage[check.checkId] += 1;
+      } else if (failingExamples[check.checkId].length < exampleLimit) {
+        failingExamples[check.checkId].push({
+          id: ins.id,
+          relPath: ins.relPath,
+          detail: check.detail,
+        });
+      }
+    }
+
+    const novelty: NoveltyResult = nearest
+      ? nearest(ins.id)
+      : { nearestId: null, similarity: null, selfEmbedded: false };
+
+    // Only account for novelty when it was actually computed; otherwise leave
+    // all counts at zero (report.novelty.available signals "not measured").
+    if (noveltyAvailable) {
+      if (!novelty.selfEmbedded) {
+        unembedded += 1;
+      } else if (novelty.similarity === null || novelty.nearestId === null) {
+        embedded += 1;
+        noNeighbor += 1;
+      } else {
+        embedded += 1;
+        simSum += novelty.similarity;
+        simCount += 1;
+        histogram[bucketLabel(novelty.similarity)] += 1;
+        for (const t of BLOCK_THRESHOLDS) {
+          if (novelty.similarity >= t) blockSimulation[t.toFixed(2)] += 1;
+        }
+        pairs.push({ id: ins.id, nearestId: novelty.nearestId, similarity: novelty.similarity });
+      }
+    }
+
+    // Topic counts keyed by actual file location (domain/topic from path).
+    const parts = splitPath(ins.relPath);
+    const topicKey = `${parts[1] ?? "?"}/${parts[2] ?? "?"}`;
+    topicCounts.set(topicKey, (topicCounts.get(topicKey) ?? 0) + 1);
+
+    audits.push({ id: ins.id, relPath: ins.relPath, checks, novelty });
+  }
+
+  const singletonTopics = [...topicCounts.entries()]
+    .filter(([, n]) => n === 1)
+    .map(([key]) => key)
+    .sort();
+
+  // Dedup reciprocal pairs (A→B and B→A are the same near-dupe) by unordered key,
+  // keeping the first occurrence after the descending sort below, i.e. the
+  // highest-similarity instance. Note: sort() reorders the local `pairs` in place.
+  const seenPairs = new Set<string>();
+  const topPairs = pairs
+    .sort((a, b) => b.similarity - a.similarity)
+    .filter((p) => {
+      const key = p.id < p.nearestId ? `${p.id}|${p.nearestId}` : `${p.nearestId}|${p.id}`;
+      if (seenPairs.has(key)) return false;
+      seenPairs.add(key);
+      return true;
+    })
+    .slice(0, 50)
+    // Full precision in the report/JSON — rounding happens only at display.
+    .map((p) => ({ id: p.id, nearestId: p.nearestId, similarity: p.similarity }));
+
+  const report: AuditReport = {
+    generated_at: new Date().toISOString(),
+    scope: options.scope ?? "all",
+    total: insights.length,
+    coverage,
+    novelty: {
+      available: noveltyAvailable,
+      embedded,
+      unembedded,
+      no_neighbor: noNeighbor,
+      histogram,
+      block_simulation: blockSimulation,
+      mean_similarity: simCount > 0 ? simSum / simCount : null,
+      top_pairs: topPairs,
+    },
+    fragmentation: {
+      total_topics: topicCounts.size,
+      singletons: singletonTopics.length,
+      singleton_topics: singletonTopics,
+    },
+    failing_examples: failingExamples,
+  };
+
+  return { audits, report };
+}
+
+// ─── Internal helpers ─────────────────────────────────────────────────
+
+function asString(value: unknown): string {
+  // Unquoted YAML dates may arrive as Date objects: keep them as sortable YYYY-MM-DD (UTC).
+  if (value instanceof Date && !Number.isNaN(value.getTime())) return value.toISOString().slice(0, 10);
+  return value === null || value === undefined ? "" : String(value); // absent → "", strings pass through
+}
+
+function parseSources(value: unknown): SourceRef[] {
+  const refs: SourceRef[] = [];
+  for (const entry of Array.isArray(value) ? value : []) { // non-arrays yield no refs
+    if (!entry || typeof entry !== "object") continue;
+    const { title, url } = entry as Record<string, unknown>;
+    refs.push({ title: typeof title === "string" ? title : undefined, url: typeof url === "string" ? url : undefined });
+  }
+  return refs;
+}
+
+function splitPath(relPath: string): string[] {
+  return relPath.replace(/\\/g, "/").split("/"); // accept both "/" and "\" separators
+}
+
+function truncate(text: string, max: number): string {
+  return text.length > max ? `${text.slice(0, max - 1)}…` : text; // ellipsis counts toward max
+}
+
+function emptyCheckRecord(): Record<CheckId, number> {
+  return Object.fromEntries(CHECK_IDS.map((id) => [id, 0])) as Record<CheckId, number>; // one zero per check
+}
+
+function emptyExampleRecord(): Record<CheckId, Array<{ id: string; relPath: string; detail?: string }>> {
+  return Object.fromEntries(CHECK_IDS.map((id) => [id, []])) as unknown as ReturnType<typeof emptyExampleRecord>; // fresh list per check
+}
+
+function bucketLabel(similarity: number): string {
+  // Buckets are ordered high → low, so the first floor reached is the match.
+  for (const { label, min } of SIM_BUCKETS) if (similarity >= min) return label;
+  return SIM_BUCKETS[SIM_BUCKETS.length - 1].label; // only reached when nothing compares true (NaN)
+}

From 67ffd2dd189a28caba5edc914134a709c18c1f93 Mon Sep 17 00:00:00 2001
From: Jin Choi <jinchoi@u.northwestern.edu>
Date: Mon, 25 May 2026 15:20:09 -0700
Subject: [PATCH 2/3] feat(gate): add forward enforcement (Phase 2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Wires the audit checks into a forward-only enforcement gate.

- enforceGate() (lib): partitions blocking failures from non-blocking
  warnings. Near-duplicate (>= cosine threshold, default 0.95) blocks when
  measurable; missing stance blocks by default. stance_directional (a
  heuristic) and attribution_resolves (would reject legitimate synthetic
  insights) are warnings by default, promotable via blockingChecks. An
  unembedded new insight WARNS (dup check unmeasurable) rather than silently
  skipping — fail-closing there would halt ingestion whenever Ollama is down.

- insight-gate.ts --enforce: scopes via --changed (uncommitted insight files;
  fails CLOSED on git error and on changed files that don't load), --since, or
  all. Exits 1 on blocking failures.

- post-ingest gate step: fatal, after embed (needs vectors) and before
  learn/auto-git (a blocked batch must not commit; placement preserves
  forward-only scoping since learn mutates existing insights). Mirrors the
  existing fatal reindex step.

Conservative by design: blocks only on missing stance + near-identical
duplicate, so it won't false-block legitimate or synthetic insights. Ratchet
--max-similarity or promote warning-checks once trusted.

Tests: 8 enforceGate cases. Verified e2e: --changed exit 0; blocking scope exit 1.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 scripts/insight-gate.ts          | 161 ++++++++++++++++++++++++++++---
 scripts/lib/insight-gate.test.ts |  68 +++++++++++++
 scripts/lib/insight-gate.ts      |  85 ++++++++++++++++
 scripts/post-ingest.ts           |  37 ++++++-
 4 files changed, 338 insertions(+), 13 deletions(-)

diff --git a/scripts/insight-gate.ts b/scripts/insight-gate.ts
index f588ba334f..fdef23f757 100644
--- a/scripts/insight-gate.ts
+++ b/scripts/insight-gate.ts
@@ -2,24 +2,27 @@
 /**
  * insight-gate.ts — Semantic quality gate for insights (CLI).
  *
- * PHASE 1 (this file): AUDIT ONLY. Read-only X-ray of insight quality across
- * the corpus. Never exits 1 — it reports so the pass-bar can be calibrated
- * against the real distribution before forward enforcement is switched on.
+ * Two modes:
+ *   --audit    READ-ONLY X-ray of insight quality across the corpus. Never
+ *              exits 1 on quality findings — it reports so the pass-bar can be
+ *              calibrated against the real distribution.
+ *   --enforce  Forward gate over a batch of NEW insights. Exits 1 if any
+ *              BLOCKING issue is found (missing stance, or a near-duplicate).
+ *              Wired as a fatal pre-commit step in post-ingest.
  *
  * Usage:
- *   npx tsx scripts/insight-gate.ts --audit --all          # full corpus X-ray
- *   npx tsx scripts/insight-gate.ts --audit --since 2026-05-01
- *   npx tsx scripts/insight-gate.ts --audit --all --json   # machine-readable
- *   npx tsx scripts/insight-gate.ts --audit --examples 30  # more failing samples
+ *   npx tsx scripts/insight-gate.ts --audit --all                 # full corpus X-ray
+ *   npx tsx scripts/insight-gate.ts --audit --since 2026-05-01 [--json]
+ *   npx tsx scripts/insight-gate.ts --enforce --changed           # gate uncommitted insights
+ *   npx tsx scripts/insight-gate.ts --enforce --since 2026-05-01  # gate by date
+ *   npx tsx scripts/insight-gate.ts --enforce --changed --max-similarity 0.93
  *
- * Outputs (in addition to stdout):
+ * Audit outputs (in addition to stdout):
  *   knowledge-base/meta/gate-report.json   latest full report (overwritten)
  *   knowledge-base/meta/gate-log.jsonl     one summary line per run (appended)
- *
- * Phase 2 (not yet wired) will add forward enforcement: scope to a batch and
- * exit 1 on failures, reusing the exact check functions in lib/insight-gate.ts.
  */
 
+import { execFileSync } from "node:child_process";
 import { appendFileSync, mkdirSync, writeFileSync } from "node:fs";
 import { dirname, join } from "node:path";
 import Database from "better-sqlite3";
@@ -29,9 +32,12 @@ import {
   auditInsights,
   buildNoveltyComputer,
   buildSourceIndex,
+  enforceGate,
   loadGateInsights,
+  DEFAULT_MAX_SIMILARITY,
   type AuditReport,
   type CheckId,
+  type EnforceResult,
   type NearestFn,
 } from "./lib/insight-gate";
 import { KB_ROOT } from "./lib/kb-root";
@@ -87,8 +93,15 @@ function tryBuildNovelty(): { nearest: NearestFn | undefined; close: () => void
 // ─── Main ─────────────────────────────────────────────────────────────
 
 async function main(): Promise<void> {
-  const args = parseArgs(process.argv.slice(2));
+  const argv = process.argv.slice(2);
+  if (argv.includes("--enforce")) {
+    await runEnforce(argv);
+  } else {
+    await runAudit(parseArgs(argv));
+  }
+}
 
+async function runAudit(args: Args): Promise<void> {
   let insights = await loadGateInsights();
   const scope = args.since ? `since ${args.since}` : "all";
   if (args.since) {
@@ -117,6 +130,130 @@ async function main(): Promise<void> {
   }
 }
 
+// ─── Enforce (Phase 2) ────────────────────────────────────────────────
+
+/** Forward gate over the scoped insights; exits 1 on a blocking issue or on unusable input. */
+async function runEnforce(argv: string[]): Promise<void> {
+  const simIdx = argv.indexOf("--max-similarity");
+  const maxSimilarity = simIdx !== -1 ? Number(argv[simIdx + 1]) : DEFAULT_MAX_SIMILARITY;
+  if (!(maxSimilarity > 0 && maxSimilarity <= 1)) { // fail closed: a threshold > 1 would never block
+    console.error(`ENFORCE aborted: --max-similarity must be a number in (0, 1], got "${argv[simIdx + 1]}"`);
+    process.exit(1);
+  }
+  const sinceIdx = argv.indexOf("--since");
+  const since = sinceIdx !== -1 ? argv[sinceIdx + 1] ?? null : null;
+
+  let insights = await loadGateInsights();
+  let scope: string;
+  if (argv.includes("--changed")) {
+    // Fail CLOSED: never treat "git broke" as "nothing changed" (the batch would sail through).
+    let changed: Set<string>;
+    try {
+      changed = getChangedInsightRelPaths();
+    } catch (err) {
+      console.error(
+        `ENFORCE aborted: cannot determine changed insights via git — ${(err as Error).message}`
+      );
+      process.exit(1);
+    }
+    // Fail CLOSED too when a changed file was dropped by loadGateInsights() (it would vanish from the batch).
+    const loadedPaths = new Set(insights.map((i) => i.relPath));
+    const unaccounted = [...changed].filter((p) => !loadedPaths.has(p));
+    if (unaccounted.length > 0) {
+      console.error(
+        `ENFORCE aborted: ${unaccounted.length} changed insight file(s) could not be loaded ` +
+          "(unreadable, invalid frontmatter, or missing id):"
+      );
+      for (const p of unaccounted) console.error(`  ✗ ${p}`);
+      console.error("Fix these (run: npm run health) and re-run.");
+      process.exit(1);
+    }
+    insights = insights.filter((i) => changed.has(i.relPath));
+    scope = "changed (uncommitted)";
+  } else if (since) {
+    insights = insights.filter((i) => i.dateExtracted >= since);
+    scope = `since ${since}`;
+  } else {
+    scope = "all";
+  }
+
+  console.log(
+    `Insight Gate (ENFORCE) — scope: ${scope} · ${insights.length} insight(s) · block ≥ ${maxSimilarity} cosine`
+  );
+
+  if (insights.length === 0) {
+    console.log("No insights in scope — nothing to gate.");
+    return; // exit 0
+  }
+
+  const sourceIndex = await buildSourceIndex();
+  const { nearest, close } = tryBuildNovelty();
+  let result: EnforceResult;
+  try {
+    result = enforceGate(insights, sourceIndex, nearest, { maxSimilarity });
+  } finally {
+    close();
+  }
+  if (!nearest) {
+    console.warn("WARN: embeddings unavailable — near-duplicate check skipped this run.");
+  }
+
+  if (result.warnings.length > 0) {
+    console.log(`\n${result.warnings.length} warning(s) (non-blocking):`);
+    for (const w of result.warnings) {
+      console.log(`  ⚠ ${w.id} [${w.checkId}] ${w.reason}`);
+    }
+  }
+
+  if (result.failures.length > 0) {
+    console.error(`\n✗ GATE FAILED — ${result.failures.length} blocking issue(s):`);
+    for (const f of result.failures) {
+      console.error(`  ✗ ${f.id} [${f.checkId}] ${f.reason}`);
+      console.error(`      ${f.relPath}`);
+    }
+    console.error("\nThis batch is not admissible. Fix the above and re-run.");
+    process.exit(1);
+  }
+
+  console.log(`\n✓ GATE PASSED — ${insights.length} insight(s) admitted.`);
+}
+
+/**
+ * Insight files (KB-relative paths) that are untracked, added or modified vs HEAD.
+ *
+ * Deleted files are left out (--diff-filter=d): nothing is left on disk to gate,
+ * and listing them would trip the "could not be loaded" abort in runEnforce.
+ * Paths are read NUL-delimited (-z) so git never quotes them — a quoted
+ * non-ASCII name would fail the ".md" test and silently drop out of the batch.
+ *
+ * FAILS CLOSED: git errors propagate; the caller treats a throw as fatal (exit 1).
+ */
+function getChangedInsightRelPaths(): Set<string> {
+  const repoRoot = execFileSync("git", ["rev-parse", "--show-toplevel"], {
+    encoding: "utf-8",
+    cwd: KB_ROOT,
+  }).trim();
+  const git = (args: string[]): string =>
+    execFileSync("git", args, { encoding: "utf-8", cwd: repoRoot });
+
+  const out = new Set<string>();
+  const blocks = [
+    git(["ls-files", "-z", "--others", "--exclude-standard", "--", "knowledge-base/domains/"]),
+    git(["diff", "-z", "--name-only", "--diff-filter=d", "HEAD", "--", "knowledge-base/domains/"]),
+  ];
+  for (const block of blocks) {
+    for (const line of block.split("\0")) {
+      if (!line.endsWith(".md")) continue; // also skips the empty trailing field
+      const rel = line.replace(/^knowledge-base\//, "");
+      // _overview/_summary/_index files are regenerated, not insights (mirrors
+      // loadGateInsights' ignore set); skip them so they are not "unaccounted".
+      if ((rel.split("/").pop() ?? "").startsWith("_")) continue;
+      out.add(rel);
+    }
+  }
+  return out;
+}
+
 // ─── Persistence ──────────────────────────────────────────────────────
 
 function persist(report: AuditReport): void {
diff --git a/scripts/lib/insight-gate.test.ts b/scripts/lib/insight-gate.test.ts
index 695b94aed0..b94c9565a5 100644
--- a/scripts/lib/insight-gate.test.ts
+++ b/scripts/lib/insight-gate.test.ts
@@ -4,6 +4,7 @@ import Database from "better-sqlite3";
 import {
   auditInsights,
   buildNoveltyComputer,
+  enforceGate,
   checkAttributionResolves,
   checkStanceDirectional,
   checkStancePresent,
@@ -335,3 +336,70 @@ describe("buildNoveltyComputer (in-memory sqlite-vec)", () => {
     db.close();
   });
 });
+
+// ─── enforceGate (Phase 2 forward enforcement) ────────────────────────
+
+describe("enforceGate", () => {
+  const clean = mk({
+    id: "INS-1",
+    stance: "X reduces cost because Y",
+    sources: [{ title: "Known Source" }],
+    relPath: "domains/ai-development/agents/a.md",
+  });
+
+  it("admits a clean insight: no failures, no warnings", () => {
+    const { failures, warnings } = enforceGate([clean], INDEX, undefined);
+    expect(failures).toEqual([]);
+    expect(warnings).toEqual([]);
+  });
+
+  it("blocks a missing stance; directional + attribution failures are warnings (default policy)", () => {
+    const bad = mk({ id: "INS-2", stance: "", sources: [], relPath: "domains/ai-development/agents/b.md" });
+    const { failures, warnings } = enforceGate([bad], INDEX, undefined);
+    expect(failures.map((f) => f.checkId)).toEqual(["stance_present"]);
+    const warned = warnings.map((w) => w.checkId);
+    expect(warned).toContain("stance_directional");
+    expect(warned).toContain("attribution_resolves");
+  });
+
+  it("blocks a near-duplicate at/above the threshold", () => {
+    const nearest: NearestFn = () => ({ nearestId: "INS-9", similarity: 0.96, selfEmbedded: true });
+    const { failures } = enforceGate([clean], INDEX, nearest);
+    expect(failures.map((f) => f.checkId)).toEqual(["novelty"]);
+    expect(failures[0].reason).toContain("INS-9");
+  });
+
+  it("does not block a distinct insight below the threshold", () => {
+    const nearest: NearestFn = () => ({ nearestId: "INS-9", similarity: 0.8, selfEmbedded: true });
+    expect(enforceGate([clean], INDEX, nearest).failures).toEqual([]);
+  });
+
+  it("respects a custom maxSimilarity, inclusive at the boundary", () => {
+    const nearest: NearestFn = () => ({ nearestId: "INS-9", similarity: 0.91, selfEmbedded: true });
+    const blocked = (maxSimilarity: number) => enforceGate([clean], INDEX, nearest, { maxSimilarity }).failures;
+    expect(blocked(0.95)).toEqual([]);
+    expect(blocked(0.91).map((f) => f.checkId)).toEqual(["novelty"]); // equal to the threshold still blocks
+    expect(blocked(0.9).map((f) => f.checkId)).toEqual(["novelty"]);
+  });
+
+  it("can promote a heuristic check to blocking via blockingChecks", () => {
+    const weak = mk({ id: "INS-3", stance: "AI is important", sources: [{ title: "Known Source" }] });
+    expect(enforceGate([weak], INDEX, undefined).failures).toEqual([]); // directional warns by default
+    const { failures } = enforceGate([weak], INDEX, undefined, {
+      blockingChecks: ["stance_present", "stance_directional"],
+    });
+    expect(failures.map((f) => f.checkId)).toEqual(["stance_directional"]);
+  });
+
+  it("skips the near-duplicate check when novelty is unavailable", () => {
+    expect(enforceGate([clean], INDEX, undefined).failures).toEqual([]);
+  });
+
+  it("warns (not silently skips) when a new insight has no embedding", () => {
+    const noEmbedding: NearestFn = () => ({ nearestId: null, similarity: null, selfEmbedded: false });
+    const { failures, warnings } = enforceGate([clean], INDEX, noEmbedding);
+    expect(failures).toEqual([]);
+    expect(warnings.map((w) => w.checkId)).toContain("novelty");
+    expect(warnings.find((w) => w.checkId === "novelty")?.reason).toContain("no embedding");
+  });
+});
diff --git a/scripts/lib/insight-gate.ts b/scripts/lib/insight-gate.ts
index 921ba63c46..f4f67da30d 100644
--- a/scripts/lib/insight-gate.ts
+++ b/scripts/lib/insight-gate.ts
@@ -382,6 +382,91 @@ export function buildNoveltyComputer(db: Database.Database): NearestFn {
   };
 }
 
+// ─── Enforcement (Phase 2) ────────────────────────────────────────────
+//
+// Forward-only gate: given a batch of NEW insights, return the blocking
+// failures and non-blocking warnings. Reuses the exact same deterministic
+// checks as the audit — the only difference is that here some of them have
+// teeth (a non-empty failures list means the batch is refused / exit 1).
+
+export interface GateFailure {
+  id: string; // id of the insight that tripped the check
+  relPath: string; // relPath of that same insight
+  checkId: CheckId | "novelty"; // a boolean check's id, or "novelty" for the near-duplicate check
+  reason: string; // the check's detail (its id when no detail is given), or the novelty message
+}
+
+export interface EnforceResult {
+  failures: GateFailure[]; // blocking — a non-empty list means the caller should refuse the batch
+  warnings: GateFailure[]; // non-blocking — reported for the human, never refuses the batch
+}
+
+export interface EnforceOptions {
+  /** Block a new insight whose nearest INSIGHT neighbor is >= this cosine. Default: DEFAULT_MAX_SIMILARITY. */
+  maxSimilarity?: number;
+  /** Which boolean checks block (the rest become warnings). Default: DEFAULT_BLOCKING_CHECKS. */
+  blockingChecks?: CheckId[];
+}
+
+export const DEFAULT_MAX_SIMILARITY = 0.95; // inclusive cosine cut-off: a nearest neighbor at or above it blocks
+// Conservative default: only the objective, near-zero-false-positive checks
+// block. stance_directional is a heuristic (false positives); attribution
+// would reject legitimate synthetic/cross-domain insights — both start as
+// warnings and can be promoted once the corpus behavior is trusted.
+export const DEFAULT_BLOCKING_CHECKS: CheckId[] = ["stance_present"];
+
+/**
+ * Forward-only gate over a batch of NEW insights.
+ *
+ * Every failed check from `runChecks` is filed as a blocking failure when its
+ * id is in the blocking set, otherwise as a warning. When `nearest` is given,
+ * a measured cosine at or above the threshold is always a blocking failure,
+ * and an insight without an embedding gets a "novelty" warning instead of
+ * being skipped silently. Without `nearest`, no duplicate check runs at all.
+ *
+ * @param options - `maxSimilarity` (default DEFAULT_MAX_SIMILARITY) and
+ *   `blockingChecks` (default DEFAULT_BLOCKING_CHECKS).
+ * @returns Failures (refuse the batch) and warnings (report only), in input order.
+ */
+export function enforceGate(
+  insights: GateInsight[],
+  index: SourceIndex,
+  nearest: NearestFn | undefined,
+  options: EnforceOptions = {}
+): EnforceResult {
+  const threshold = options.maxSimilarity ?? DEFAULT_MAX_SIMILARITY;
+  const blockingIds = new Set(options.blockingChecks ?? DEFAULT_BLOCKING_CHECKS);
+  const result: EnforceResult = { failures: [], warnings: [] };
+
+  for (const insight of insights) {
+    // Boolean checks: route each failed one by the blocking policy.
+    for (const { passed, checkId, detail } of runChecks(insight, index)) {
+      if (passed) continue;
+      const bucket = blockingIds.has(checkId) ? result.failures : result.warnings;
+      bucket.push({ id: insight.id, relPath: insight.relPath, checkId, reason: detail ?? checkId });
+    }
+
+    if (!nearest) continue; // novelty unavailable for this run
+    const { selfEmbedded, similarity, nearestId } = nearest(insight.id);
+    const novelty = { id: insight.id, relPath: insight.relPath, checkId: "novelty" as const };
+    if (!selfEmbedded) {
+      // No vector for this insight (e.g. embed was skipped): failing closed
+      // would halt ingestion whenever embeddings are down, so warn instead.
+      result.warnings.push({
+        ...novelty,
+        reason: "not checked — no embedding (run embed first to gate duplicates)",
+      });
+      continue;
+    }
+    // `!(a >= b)` rather than `a < b` so a NaN similarity never blocks.
+    if (similarity === null || nearestId === null || !(similarity >= threshold)) continue;
+    const reason = `${similarity.toFixed(3)} cosine near-duplicate of ${nearestId}`;
+    result.failures.push({ ...novelty, reason });
+  }
+
+  return result;
+}
+
 // ─── Audit aggregation ────────────────────────────────────────────────
 
 /** Histogram buckets for nearest-neighbor similarity (high → low). */
diff --git a/scripts/post-ingest.ts b/scripts/post-ingest.ts
index 5c94f32bfc..505d284041 100644
--- a/scripts/post-ingest.ts
+++ b/scripts/post-ingest.ts
@@ -5,7 +5,7 @@ import { execFileSync } from "node:child_process";
 import { readFile, writeFile, mkdir } from "node:fs/promises";
 
 const PROJECT_ROOT = join(__dirname, "..");
-const KB_ROOT = join(PROJECT_ROOT, "knowledge-base");
+import { KB_ROOT } from "./lib/kb-root";
 const ACTIVITY_LOG = join(KB_ROOT, "meta", "activity.md");
 
 // ─── Helpers ──────────────────────────────────────────────────────────
@@ -179,6 +179,41 @@ async function main(): Promise<void> {
     console.warn("\nEmbed step failed (non-fatal) — FTS5-only mode active.");
   }
 
+  // Step 3.5: Gate (forward enforcement — FATAL, like reindex)
+  //
+  // Refuse to commit the batch if any NEW (uncommitted) insight fails a
+  // blocking check: missing stance, or a near-duplicate (>= cosine threshold).
+  // Runs AFTER embed so the cosine check has vectors, and BEFORE auto-git so a
+  // failing batch never lands. Scoped to uncommitted insight files (--changed),
+  // so a clean run (no new insights) passes trivially. Fully recoverable: fix
+  // the flagged insight(s) and re-run post-ingest (idempotent).
+  //
+  // Placement is deliberately BEFORE learn/views and must stay there: learn
+  // rewrites related[]/confidence on many EXISTING insights, which would
+  // balloon the --changed set to include grandfathered insights and break
+  // forward-only scoping (the gate would then block on pre-existing dupes /
+  // stanceless insights). The gate validates INTRINSIC quality — stance,
+  // attribution, novelty — fields fixed at extract time that learn never
+  // touches, so gating before learn loses no coverage.
+  const gateResult = runStep("Gate", [
+    "npx", "tsx", join(PROJECT_ROOT, "scripts", "insight-gate.ts"),
+    "--enforce", "--changed",
+  ]);
+
+  if (gateResult.ok) {
+    results.push({ step: "gate", status: "PASSED" });
+  } else {
+    results.push({ step: "gate", status: "BLOCKED" });
+    await appendToActivityLog(
+      `post-ingest gate BLOCKED the batch — aborting before auto-git: ${gateResult.error}`
+    );
+    console.error(
+      "\nPipeline ABORTED: gate blocked the batch (NOT committed). Fix the flagged insights and re-run."
+    );
+    printSummary(results);
+    process.exit(1);
+  }
+
   // Step 4: Learn (auto-connections, emergence detection, confidence propagation)
   const learnResult = runStep("Learn", [
     "npx", "tsx", join(PROJECT_ROOT, "scripts", "learn.ts"),

From fbeedbc0293bfc8470856a517b34acaf182404bd Mon Sep 17 00:00:00 2001
From: Jin Choi <jinchoi@u.northwestern.edu>
Date: Mon, 25 May 2026 15:32:04 -0700
Subject: [PATCH 3/3] chore(gate): track scripts/lib/kb-root.ts (KB_ROOT
 resolver)

The Phase 1/2 gate imports KB_ROOT from ./lib/kb-root, but the module was
untracked (created by an in-progress refactor that was never committed), so the
gate would not build on a fresh checkout. Track it here as the first committed
code to depend on it. Self-contained (only imports node:path); resolves
ZUHN_KB_ROOT or defaults to <repo>/knowledge-base.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 scripts/lib/kb-root.ts | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)
 create mode 100644 scripts/lib/kb-root.ts

diff --git a/scripts/lib/kb-root.ts b/scripts/lib/kb-root.ts
new file mode 100644
index 0000000000..7d45ed6f5b
--- /dev/null
+++ b/scripts/lib/kb-root.ts
@@ -0,0 +1,24 @@
+// ─── KB_ROOT — single source of truth ─────────────────────────────────
+//
+// Every script and library module imports KB_ROOT from this file instead of
+// deriving it from its own `__dirname`. Pointing Zuhn at another knowledge
+// base (a customer corpus, a demo corpus, an isolated test KB) then takes a
+// single env-var change: `ZUHN_KB_ROOT=/path/to/other/kb`.
+//
+// Resolution:
+//   1. ZUHN_KB_ROOT set and non-empty → `resolve()` it (a relative value is
+//      anchored at process.cwd(); an absolute one is only normalized).
+//   2. Unset or empty → <repo>/knowledge-base/.
+//
+// The ZUHN_* name matches the existing convention in inbox-server.ts
+// (ZUHN_INBOX_PORT, ZUHN_INBOX_HOST) and daemon.ts (ZUHN_MAX_AGENTS).
+
+import { join, resolve } from "node:path";
+
+/** Explicit override from the environment; "" counts as not set. */
+const kbRootOverride = process.env.ZUHN_KB_ROOT;
+
+/** Default location: this file is scripts/lib/kb-root.ts, two levels below the repo root. */
+const repoDefaultKbRoot = (): string => join(__dirname, "..", "..", "knowledge-base");
+
+export const KB_ROOT = kbRootOverride ? resolve(kbRootOverride) : repoDefaultKbRoot();