From 7c34cd8816533c76d51e061452a50b902318c398 Mon Sep 17 00:00:00 2001
From: Ed Heltzel <402910+edheltzel@users.noreply.github.com>
Date: Wed, 10 Jun 2026 22:29:03 -0400
Subject: [PATCH 1/2] =?UTF-8?q?feat:=20add=20recall=20dedup=20=E2=80=94=20?=
 =?UTF-8?q?non-destructive=20dedup=20with=20provenance-aware=20survivor=20?=
 =?UTF-8?q?selection?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Dry-run by default; --execute marks duplicates in the new dedup_lineage
table (schema migration 9→10) without touching the records; --delete is
the destructive opt-in (recall export --backup recommended first).

- Exact/normalized detection within table + project; cross-table
  candidates are report-only
- Semantic detection over stored embeddings (pairwise cosine, no
  embedding service call; conservative 0.95 default threshold; skip is
  reported when no embeddings exist; no transitive chaining)
- Survivor priority user_authored > verbatim > extracted > derived >
  unknown (PROVENANCE_VALUES), then richness, importance, recency, id
- Marked duplicates hidden from all search paths (FTS5, semantic,
  hybrid, MCP) unless --include-duplicates is passed
- dedup_lineage included in recall export for a portable audit trail
- Destructive deletes go through chunked() per the chunk.ts audit note;
  FK-referenced duplicates (LoA message ranges, LoA parents) are kept
  as marked instead of failing the transaction
- Fix latent blobToEmbedding crash on bun:sqlite Uint8Array blobs

Closes #45
---
 CHANGELOG.md                  |  12 ++
 docs/architecture.md          |  10 ++
 docs/cli-reference.md         |  45 ++++-
 src/commands/dedup.ts         | 145 ++++++++++++++++
 src/commands/embed.ts         |  15 +-
 src/commands/search.ts        |   4 +-
 src/db/migrations.ts          |   6 +
 src/db/schema.ts              |  26 +++
 src/index.ts                  |  32 +++-
 src/lib/dedup.ts              | Bin 0 -> 22314 bytes
 src/lib/embeddings.ts         |  13 +-
 src/lib/export.ts             |   7 +-
 src/lib/memory.ts             |  15 ++
 src/mcp-server.ts             |   4 +
 tests/commands/dedup.test.ts  | 314 ++++++++++++++++++++++++++++++++++
 tests/commands/export.test.ts |   2 +
 tests/db/migrations.test.ts   |   3 +-
 tests/lib/dedup.test.ts       | 200 ++++++++++++++++++++++
 18 files changed, 839 insertions(+), 14 deletions(-)
 create mode 100644 src/commands/dedup.ts
 create mode 100644 src/lib/dedup.ts
 create mode 100644 tests/commands/dedup.test.ts
 create mode 100644 tests/lib/dedup.test.ts

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4a11335..abf0eda 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,6 +12,18 @@ while MCP tool names (`memory_search`, `memory_add`, etc.) remain stable.
 
 ### Added
 
+- **`recall dedup`** — non-destructive dedup with provenance-aware survivor
+  selection (#45): dry-run by default, `--execute` marks duplicates in the new
+  `dedup_lineage` table (schema migration 9→10) without touching the records,
+  `--delete` is the destructive opt-in (take `recall export --backup` first).
+  Detection combines normalized-text matching with semantic matching over
+  stored embeddings (conservative 0.95 default threshold, skip reported when
+  embeddings are unavailable). Survivor priority is `user_authored > verbatim
+  > extracted > derived > unknown`, then richness, importance, recency.
+  Within-table only; cross-table candidates are report-only. Marked
+  duplicates are hidden from every search path unless
+  `recall search --include-duplicates` is passed, and lineage rows are
+  included in `recall export`.
 - **`recall export`** — portable and disaster-recovery exports (#43): JSON,
   Markdown, SQL dump, and SQLite (`VACUUM INTO`) formats with a manifest
   (counts + provenance counts including explicit `unknown`), a stdout/file/
diff --git a/docs/architecture.md b/docs/architecture.md
index a7d859e..d84e960 100644
--- a/docs/architecture.md
+++ b/docs/architecture.md
@@ -70,6 +70,7 @@ both.
 | telos | Purpose framework entries (optional) | Yes |
 | documents | Imported standalone markdown documents (optional) | Yes |
 | embeddings | Vector embeddings for semantic search (768-dim, nomic-embed-text) | N/A |
+| dedup_lineage | Duplicate lineage audit trail from `recall dedup` (survivor, duplicate, reason, similarity, status) | No |
 
 All FTS5-indexed tables have automatic sync triggers.
 
@@ -88,6 +89,15 @@ rows stay `NULL` (unknown) until classified with
 `recall provenance backfill`, which only acts on deterministic write-path
 evidence and never guesses.
 
+The `dedup_lineage` table was added in schema migration 9→10. `recall dedup`
+marks duplicate records non-destructively by writing lineage rows here
+(survivor table/id, duplicate table/id, reason, similarity, status); marked
+duplicates stay in their source tables but are hidden from search unless
+`--include-duplicates` is passed. Survivor selection follows provenance order
+(`user_authored > verbatim > extracted > derived > unknown`), then richness,
+importance, and recency. Dedup acts within a table only; cross-table
+candidates are report-only.
+
 ## Tiered RecallStart (v0.7.0+)
 
 The `RecallStart` hook injects two tiers at the top of every session:
diff --git a/docs/cli-reference.md b/docs/cli-reference.md
index c3d38ff..671de42 100644
--- a/docs/cli-reference.md
+++ b/docs/cli-reference.md
@@ -17,6 +17,7 @@ recall search "query" -t decisions      # Hard-filter to decisions only
 recall search "query" --bias-type decisions # Prefer decisions, still show other matching tables
 recall search "query" -p myproject      # Filter by project
 recall search "query" --show-provenance # Show provenance for every result
+recall search "query" --include-duplicates # Include records marked by recall dedup
 recall semantic "query"                 # Semantic search (explicit)
 recall hybrid "query"                   # Hybrid search (explicit)
 ```
@@ -46,6 +47,8 @@ FTS5 supports boolean operators and prefix matching:
 
 By default, search output stays quiet about [Record Provenance](#record-provenance) when a record carries a known value, and visibly flags records whose provenance is unknown (legacy rows that predate the provenance column). Pass `--show-provenance` to display the provenance of every result.
 
+Records marked as duplicates by [`recall dedup`](#dedup) are hidden from every search path (keyword, semantic, hybrid) by default — the records and their lineage remain in the database. Pass `--include-duplicates` to show them.
+
 ---
 
 ## Capture
@@ -269,7 +272,7 @@ Formats:
 
 - **json / markdown** — app-level export of the durable memory tables
   (`sessions`, `messages`, `decisions`, `learnings`, `breadcrumbs`,
-  `loa_entries`). Every row of a provenance-bearing table carries an explicit
+  `loa_entries`, `dedup_lineage`). Every row of a provenance-bearing table carries an explicit
   `provenance` field; legacy `NULL` provenance is exported as the literal
   `unknown` — never omitted, never guessed (see Record Provenance above).
   Embeddings are excluded.
@@ -300,6 +303,46 @@ included.
 overwrites an existing file (a `-N` suffix is added on collision), and prints
 the output path.
 
+## Dedup
+
+Detect and mark duplicate memory records without erasing evidence or lineage.
+
+```bash
+recall dedup                            # Dry-run report (default — writes nothing)
+recall dedup --execute                  # Mark duplicates (non-destructive)
+recall dedup --execute --delete         # Destructive opt-in: hard-delete duplicates
+recall dedup -t breadcrumbs             # Scope to one table
+recall dedup -p myproject               # Scope to one project
+recall dedup --threshold 0.98           # Stricter semantic matching (default 0.95)
+recall dedup --no-semantic              # Exact/normalized text pass only
+```
+
+Safety model:
+
+- **Dry-run by default.** Mutations require `--execute`.
+- **Non-destructive by default.** `--execute` writes lineage rows to the
+  `dedup_lineage` table (survivor, duplicate, reason, similarity, status,
+  timestamp); the duplicate records themselves stay intact and are merely
+  hidden from search. `--delete` is the destructive opt-in and requires
+  `--execute` — run `recall export --backup` first.
+- **Within-table only.** Dedup never merges across tables (or across
+  projects). Cross-table duplicate candidates are report-only.
+- **Survivor priority** is `user_authored > verbatim > extracted > derived >
+  unknown` ([Record Provenance](#record-provenance)); ties break by richness
+  (longer normalized text), importance, recency, then lowest id.
+- **Detection** combines exact/normalized text matching with semantic
+  matching over stored embeddings (no embedding service call needed). The
+  semantic pass is skipped — and reported as skipped — when no embeddings
+  exist; records are never merged below the configured `--threshold`
+  (conservative default: 0.95 cosine similarity). Records with fewer than 20
+  significant characters are never candidates.
+- **Lifecycle-aware.** Only `active` decisions participate; superseded and
+  reverted decisions are managed by the decision lifecycle, not dedup.
+
+Marked duplicates are hidden from all search paths by default; see
+`recall search --include-duplicates`. Lineage is included in
+`recall export`, so the audit trail is portable.
+
 ## Admin
 
 ```bash
diff --git a/src/commands/dedup.ts b/src/commands/dedup.ts
new file mode 100644
index 0000000..7c865bf
--- /dev/null
+++ b/src/commands/dedup.ts
@@ -0,0 +1,164 @@
+// recall dedup command (issue #45).
+//
+// Dry-run by default. --execute marks duplicates (non-destructive: records
+// stay intact, hidden from search via dedup_lineage). --delete is the
+// destructive opt-in and requires --execute; take a `recall export --backup`
+// first. Core logic lives in src/lib/dedup.ts.
+
+import { getDb } from '../db/connection.js';
+import {
+  applyDedupPlan,
+  DEDUP_TABLES,
+  DEFAULT_SEMANTIC_THRESHOLD,
+  planDedup,
+  type ApplyResult,
+  type DedupPlan,
+} from '../lib/dedup.js';
+import type { ProvenanceTable } from '../types/index.js';
+
+/** Options accepted by runDedup; every field is optional. */
+export interface DedupOptions {
+  /** Apply the plan via applyDedupPlan; a dry-run when false or omitted. */
+  execute?: boolean;
+  /** Passed to applyDedupPlan as `destructive`; rejected unless `execute` is set. */
+  delete?: boolean;
+  /** One of DEDUP_TABLES, or 'all' (the default when omitted). */
+  table?: string;
+  /** Project scope, forwarded to planDedup unchanged. */
+  project?: string;
+  /** Must be a finite number in (0, 1]; defaults to DEFAULT_SEMANTIC_THRESHOLD. */
+  threshold?: number;
+  /** Forwarded to planDedup unchanged (driven by the CLI's --no-semantic flag). */
+  semantic?: boolean;
+}
+
+/** Outcome of runDedup: the computed plan plus the apply result (null on a dry-run). */
+export interface DedupRunResult {
+  plan: DedupPlan;
+  applied: ApplyResult | null;
+}
+
+/**
+ * Entry point for `recall dedup`: validates the options, builds a plan with
+ * planDedup, prints a per-table report, and calls applyDedupPlan only when
+ * `execute` is set.
+ *
+ * @param options - CLI flags; see DedupOptions.
+ * @returns The plan and apply result, or undefined when option validation
+ *   fails (an error is printed and process.exitCode is set to 1).
+ */
+export function runDedup(options: DedupOptions = {}): DedupRunResult | undefined {
+  const execute = options.execute ?? false;
+  const destructive = options.delete ?? false;
+
+  if (destructive && !execute) {
+    console.error('--delete requires --execute. Dry-run never deletes.');
+    process.exitCode = 1;
+    return undefined;
+  }
+
+  const target = options.table ?? 'all';
+  if (target !== 'all' && !(DEDUP_TABLES as readonly string[]).includes(target)) {
+    console.error(
+      `Invalid --table "${target}". Valid tables: ${DEDUP_TABLES.join(', ')}, all.`
+    );
+    process.exitCode = 1;
+    return undefined;
+  }
+
+  const threshold = options.threshold ?? DEFAULT_SEMANTIC_THRESHOLD;
+  if (!Number.isFinite(threshold) || threshold <= 0 || threshold > 1) {
+    console.error(`Invalid --threshold "${options.threshold}". Expected a number in (0, 1].`);
+    process.exitCode = 1;
+    return undefined;
+  }
+
+  const db = getDb();
+  const plan = planDedup(db, {
+    tables: target === 'all' ? undefined : [target as ProvenanceTable],
+    project: options.project,
+    threshold,
+    semantic: options.semantic,
+  });
+
+  const mode = !execute
+    ? '[DRY RUN — no changes written]'
+    : destructive
+      ? '[EXECUTE + DELETE — destructive: duplicates will be removed]'
+      : '[EXECUTE — marking duplicates, non-destructive]';
+  console.log(`${mode}\n`);
+
+  if (destructive) {
+    console.log("Recommended: run 'recall export --backup' before destructive dedup.\n");
+  }
+
+  const verb = execute ? (destructive ? 'delete' : 'mark') : 'would mark';
+  let totalPlanned = 0;
+  for (const report of plan.tables) {
+    totalPlanned += report.planned.length;
+    const unchanged = report.scanned - report.planned.length;
+    const skipped: string[] = [];
+    if (report.alreadyMarked > 0) skipped.push(`${report.alreadyMarked} already marked`);
+    if (report.tooShort > 0) skipped.push(`${report.tooShort} too short`);
+    const skippedNote = skipped.length > 0 ? ` (${skipped.join(', ')})` : '';
+    console.log(
+      `${report.table}: scanned ${report.scanned}, exact groups ${report.exactGroups}, ` +
+      `semantic pairs ${report.semanticPairs}, ${verb} ${report.planned.length}, ` +
+      `unchanged ${unchanged}${skippedNote}`
+    );
+    // Print at most three sample entries per table; the rest are summarized.
+    for (const entry of report.planned.slice(0, 3)) {
+      const sim = entry.similarity !== null ? ` @ ${entry.similarity.toFixed(3)}` : '';
+      console.log(
+        `  #${entry.duplicate_id} → survivor #${entry.survivor_id} [${entry.reason}${sim}]`
+      );
+    }
+    if (report.planned.length > 3) {
+      console.log(`  ...and ${report.planned.length - 3} more`);
+    }
+  }
+  console.log('');
+
+  if (plan.semanticSkipped) {
+    console.log(`Semantic pass: skipped — ${plan.semanticSkipped}`);
+  } else {
+    console.log(`Semantic pass: threshold ${plan.threshold}`);
+  }
+
+  const crossText = plan.crossTable.textMatches;
+  console.log(
+    `Cross-table (report-only, never acted on): ${crossText.length} text match group(s), ` +
+    `${plan.crossTable.semanticPairs} semantic pair(s)`
+  );
+  // Print at most five cross-table text match groups; the rest are summarized.
+  for (const match of crossText.slice(0, 5)) {
+    const members = match.members.map(m => `${m.table}#${m.id}`).join(' ↔ ');
+    const projectTag = match.project ? ` [${match.project}]` : '';
+    console.log(`  ${members}${projectTag}`);
+  }
+  if (crossText.length > 5) {
+    console.log(`  ...and ${crossText.length - 5} more`);
+  }
+  console.log('');
+
+  // Dry-run: return before applyDedupPlan is ever called.
+  if (!execute) {
+    if (totalPlanned > 0) {
+      console.log('Re-run with --execute to mark duplicates (non-destructive).');
+      console.log("Marked duplicates are hidden from search; use 'recall search --include-duplicates' to see them.");
+    } else {
+      console.log('No duplicates found.');
+    }
+    return { plan, applied: null };
+  }
+
+  const applied = applyDedupPlan(db, plan, { destructive });
+  if (destructive) {
+    const fkNote = applied.fkProtected > 0
+      ? ` (${applied.fkProtected} kept as marked — referenced by LoA lineage)`
+      : '';
+    console.log(`Deleted ${applied.deleted} duplicate(s)${fkNote}.`);
+  } else {
+    console.log(`Marked ${applied.marked} duplicate(s). Records remain intact and recoverable.`);
+  }
+  return { plan, applied };
+}
diff --git a/src/commands/embed.ts b/src/commands/embed.ts
index c9ae4a0..66bd623 100644
--- a/src/commands/embed.ts
+++ b/src/commands/embed.ts
@@ -2,8 +2,17 @@
 
 import { getDb } from '../db/connection.js';
 import { embed, embeddingToBlob, blobToEmbedding, cosineSimilarity, checkEmbeddingService, reciprocalRankFusion, EMBEDDING_MODEL } from '../lib/embeddings.js';
+import { notMarkedDuplicateSql } from '../lib/dedup.js';
 import { search as ftsSearch } from '../lib/memory.js';
 
+// WHERE clause for embeddings reads: hides marked duplicates (recall dedup,
+// issue #45), matching FTS5; the optional table filter is quote-escaped.
+function embeddingsWhere(table?: string): string {
+  const conditions = [notMarkedDuplicateSql('source_table', 'source_id')];
+  if (table) conditions.push(`source_table = '${table.replace(/'/g, "''")}'`);
+  return `WHERE ${conditions.join(' AND ')}`;
+}
+
 interface EmbedOptions {
   table?: 'loa' | 'decisions' | 'messages' | 'learnings';
   limit?: number;
@@ -164,11 +173,10 @@ export async function runSemanticSearch(query: string, options: { table?: string
   const queryEmbedding = queryResult.embedding;
 
   // Get all embeddings (for now, brute force - will optimize later)
-  const tableFilter = options.table ? `WHERE source_table = '${options.table}'` : '';
   const embeddings = db.prepare(`
     SELECT id, source_table, source_id, embedding
     FROM embeddings
-    ${tableFilter}
+    ${embeddingsWhere(options.table)}
   `).all() as Array<{ id: number; source_table: string; source_id: number; embedding: Buffer }>;
 
   if (embeddings.length === 0) {
@@ -304,11 +312,10 @@ export async function runHybridSearch(query: string, options: { table?: string;
   const queryEmbedding = queryResult.embedding;
 
   // Get embeddings from database
-  const tableFilter = options.table ? `WHERE source_table = '${options.table}'` : '';
   const embeddings = db.prepare(`
     SELECT id, source_table, source_id, embedding
     FROM embeddings
-    ${tableFilter}
+    ${embeddingsWhere(options.table)}
   `).all() as Array<{ id: number; source_table: string; source_id: number; embedding: Buffer }>;
 
   // Calculate similarities
diff --git a/src/commands/search.ts b/src/commands/search.ts
index 2c13579..9c782a7 100644
--- a/src/commands/search.ts
+++ b/src/commands/search.ts
@@ -8,6 +8,7 @@ interface SearchOptions {
   biasType?: string;
   limit?: number;
   showProvenance?: boolean;
+  includeDuplicates?: boolean;
 }
 
 export function runSearch(query: string, options: SearchOptions): void {
@@ -23,7 +24,8 @@ export function runSearch(query: string, options: SearchOptions): void {
     project: options.project,
     table: options.table,
     biasType: options.biasType as SearchTable | undefined,
-    limit: options.limit || 20
+    limit: options.limit || 20,
+    includeDuplicates: options.includeDuplicates
   });
 
   if (results.length === 0) {
diff --git a/src/db/migrations.ts b/src/db/migrations.ts
index 415e1a4..e5c4cbe 100644
--- a/src/db/migrations.ts
+++ b/src/db/migrations.ts
@@ -198,6 +198,12 @@ export const MIGRATIONS: Migration[] = [
       }
     }
   },
+
+  // Migration 9 → 10: Dedup lineage table (issue #45).
+  // No-op — dedup_lineage and its indexes are brand new, handled by the
+  // CREATE TABLE IF NOT EXISTS DDL that runs before migrations (same
+  // precedent as migration 3 → 4 for the extraction tables).
+  (_db) => {},
 ];
 
 // ---------------------------------------------------------------------------
diff --git a/src/db/schema.ts b/src/db/schema.ts
index 7f60912..269e0b9 100644
--- a/src/db/schema.ts
+++ b/src/db/schema.ts
@@ -181,6 +181,24 @@ CREATE TABLE IF NOT EXISTS procedures (
   times_observed INTEGER DEFAULT 2,
   confidence TEXT DEFAULT 'medium' CHECK (confidence IN ('high', 'medium', 'low'))
 );
+
+-- Dedup lineage (issue #45): persistent audit trail of duplicate marking.
+-- Non-destructive by default — a 'marked' row hides the duplicate from search
+-- while the underlying record stays intact. 'deleted' records a destructive
+-- opt-in removal. 'reverted' is reserved vocabulary for a future unmark path
+-- (CHECK constraints cannot be widened later without a table rebuild).
+CREATE TABLE IF NOT EXISTS dedup_lineage (
+  id INTEGER PRIMARY KEY AUTOINCREMENT,
+  created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
+  survivor_table TEXT NOT NULL,
+  survivor_id INTEGER NOT NULL,
+  duplicate_table TEXT NOT NULL,
+  duplicate_id INTEGER NOT NULL,
+  reason TEXT NOT NULL CHECK (reason IN ('exact', 'semantic')),
+  similarity REAL,
+  status TEXT NOT NULL DEFAULT 'marked' CHECK (status IN ('marked', 'deleted', 'reverted')),
+  detail TEXT
+);
 `;
 
 export const CREATE_INDEXES = `
@@ -231,6 +249,14 @@ CREATE INDEX IF NOT EXISTS idx_documents_created ON documents(created_at);
 
 -- Extraction session indexes
 CREATE INDEX IF NOT EXISTS idx_extraction_sessions_ts ON extraction_sessions(timestamp DESC);
+
+-- Dedup lineage indexes: the partial unique index guarantees a record can be
+-- an actively marked duplicate at most once (idempotence); the survivor index
+-- supports lineage audits.
+CREATE UNIQUE INDEX IF NOT EXISTS idx_dedup_lineage_duplicate
+  ON dedup_lineage(duplicate_table, duplicate_id) WHERE status = 'marked';
+CREATE INDEX IF NOT EXISTS idx_dedup_lineage_survivor
+  ON dedup_lineage(survivor_table, survivor_id);
 `;
 
 export const CREATE_FTS = `
diff --git a/src/index.ts b/src/index.ts
index 4040e17..a2e2aaf 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -30,6 +30,8 @@ import { runOnboard } from './commands/onboard.js';
 import { runMigrate } from './commands/migrate.js';
 import { runPath } from './commands/path.js';
 import { runExport } from './commands/export.js';
+import { runDedup } from './commands/dedup.js';
+import { DEFAULT_SEMANTIC_THRESHOLD } from './lib/dedup.js';
 import { closeDb } from './db/connection.js';
 
 const program = new Command();
@@ -180,13 +182,15 @@ program
   .option('--bias-type <table>', 'Softly boost one table without filtering others (messages, loa, decisions, learnings, breadcrumbs)')
   .option('-l, --limit <n>', 'Max results', '20')
   .option('--show-provenance', 'Show provenance for every result (default: only unknown provenance is flagged)')
+  .option('--include-duplicates', 'Include records marked as duplicates by recall dedup (hidden by default)')
   .action((query, options) => {
     runSearch(query, {
       project: options.project,
       table: options.table,
       biasType: options.biasType,
       limit: parseInt(options.limit, 10),
-      showProvenance: options.showProvenance
+      showProvenance: options.showProvenance,
+      includeDuplicates: options.includeDuplicates
     });
     closeDb();
   });
@@ -658,6 +662,30 @@ program
     closeDb();
   });
 
+// recall dedup — non-destructive duplicate detection (issue #45)
+// Dry-run by default; --execute marks duplicates in dedup_lineage (records
+// stay intact, hidden from search); --delete is the destructive opt-in.
+program
+  .command('dedup')
+  .description('Detect and mark duplicate memory records (dry-run by default; non-destructive)')
+  .option('--execute', 'Apply the plan: mark duplicates (default is dry-run)')
+  .option('--delete', "Destructive opt-in: hard-delete duplicates instead of marking (requires --execute; run 'recall export --backup' first)")
+  .option('-t, --table <table>', 'Target table: messages, decisions, learnings, breadcrumbs, loa_entries, all', 'all')
+  .option('-p, --project <name>', 'Scope to one project')
+  .option('--threshold <n>', 'Semantic similarity threshold in (0, 1]', String(DEFAULT_SEMANTIC_THRESHOLD))
+  .option('--no-semantic', 'Skip the semantic (embeddings) pass')
+  .action((options) => {
+    runDedup({
+      execute: options.execute,
+      delete: options.delete,
+      table: options.table,
+      project: options.project,
+      threshold: parseFloat(options.threshold),
+      semantic: options.semantic
+    });
+    closeDb();
+  });
+
 // Default command: recall <query> → hybrid search (Phase 3: best of both worlds)
 program
   .arguments('[query]')
@@ -668,7 +696,7 @@ program
   .option('-k, --keyword', 'Use keyword search only (FTS5)')
   .option('-v, --vector', 'Use vector search only (semantic)')
   .action(async (query, options) => {
-    if (query && !['init', 'add', 'search', 'recent', 'show', 'stats', 'import', 'import-conversations', 'loa', 'telos', 'docs', 'dump', 'embed', 'semantic', 'hybrid', 'doctor', 'importance', 'provenance', 'pin', 'unpin', 'decision', 'prune', 'cluster', 'import-legacy', 'benchmark', 'onboard', 'migrate', 'path', 'export'].includes(query)) {
+    if (query && !['init', 'add', 'search', 'recent', 'show', 'stats', 'import', 'import-conversations', 'loa', 'telos', 'docs', 'dump', 'embed', 'semantic', 'hybrid', 'doctor', 'importance', 'provenance', 'pin', 'unpin', 'decision', 'prune', 'cluster', 'import-legacy', 'benchmark', 'onboard', 'migrate', 'path', 'export', 'dedup'].includes(query)) {
       if (options.keyword) {
         // FTS5 only
         runSearch(query, {
diff --git a/src/lib/dedup.ts b/src/lib/dedup.ts
new file mode 100644
index 0000000000000000000000000000000000000000..cd40610a8f935aaf96882a2aff362ea6982130c3
GIT binary patch
literal 22314
zcmcg!>rxy?mfqibinP`-U2Q`nImR2Y$VeeW+hQz%md4}RWmu^0E}%qpRjaC6z+fWw
zA@&LPN%s5B$y;?d-f@Ix35Nw)nI|*P{Z_TLW$J8_7KNE+)5YBU&wu^5nN)RVis~|-
zm?UqSMP~l_kKgtOPqw!3pE;<?&1u%O^<vWIpEA4dCU39IyskcFWm-<M&GaS(kY-VT
z%0E@LX|f{2^r{?~<14P2<PH8JJiAd@)w8t7|CVyrMp#+Z)2z<R%MEi`SBts))?~A^
zY;nAKn)6xT<c+z`<}Kl+jhQd7cXm6k>NcC2i$y70HwLGx&AdqS(oEAfHBDvm)--K;
z*O*0_x0@~Qm5z&yaQxN(Xy#XGlbLT!I-LS|UCpz)y)&F_VYXKpCism!u14uahMBW!
znibmwx@q?6yUlu0n(-a5aFH&G_Jx@(TDf)uT6|pOIDA_2{6#*&`Lf$=vS@+KHk*?O
z1fnZVDi<@ew7zZ($cF_fj%9d3lJl&{%PhUjgsh-5klDz6F{8y|0cXscw4P!?krDUA
zB~X*i(;Bca97tWw+s(Z6XCS`-NMlHRY?&HCo=MVj8peRH51@VDRAq5DFuQftG@BmF
zCLG&Lur3E-Z~-z;osoc>vP`c%a_ls(^Qz9<yKS>*vidAtv{&HSsd;HY;W2PI!>`$G
zTjO{#Odk5g-w4Qn*w<xsQ<~)c(czzm2d@uyhi89&{r2bKh^TAt=2^3qm(%QafZG8d
zps#7bXa)YM7WE`E)rCTW7_?V?aCMGTkLxVGCS~h<a#dzcV-m;K*t^KeOU&Ic`HUB%
zm~2q^vT|~_VWC}RB?RIII5#;s7m_p0+N_@CW!}sz;zy4A`A8XHQq4dvoS<n157}&-
zO{X|8*cBpjlY=`c*=1$?Bi#|``;_B8q5^Q!GRwd@1M?oslniA?shM0sYH$glv}|$;
zegW+B(VYS!58}CJLIarrM@+%Z6UbOT%ZrpGGcR|5PMr0sDgcfWh+|vA@`eQWJ&?aS
zsTO66BZCQ(CIjZ{$ySk%w<cF09|+UmHVs$?qKn&IXLn83;<lH0DY8T`$@wpDhDSr9
z@}+rY-tNEKKQ^z<H_SzTi?fcgO9R^NMJ(>6V7$nSX+tP^IbXD!*s8#q^Zf&pyxJHH
z2K{qW<kSG<b}&etfn1rntUv}_uv%PR0c$vFHcg-qo+#DLy_tYZ+U!l*T$u-RQCBn5
zo78vnw(7kIllGvW#?(4)%6L(3Hy?|<&EjcxgAFtKx3~Mp!?V%rABJbUZ+<@b>1?$B
zA49)-V6pb0iC2${YJ6M`9kXme^MlK?Q6QGveX!;ErWtgAJTWE`qT|=!za5S?x`QfN
z9HeBK_ufjZ89Ju=-FV{n9c;zYC_6vd`synq^Puy@Xi*`H+JN>Vn^RmPTQttVdF7><
zWwWY=-sTp~z<jm!L~R7FT|(gYhI>E1KhqnSoz+{tc*32Zn6Jz(=0k44XJY?Ei;J*&
z!!%3V2~^_cz`SnFv%$Z9Yf&2u0?Z#^mDMbtYzlT^er{W6unB~?*)Z)*MfD&$oB9x%
zyx#Qs4H5*Sk<GHQZC(&iTHa9$Uz%|S4dSg@TP-G6VBAFI7^<?`oMkm^g3cqZygY(W
z*P!Kx*FV2KJ{t|+y*@bJ-#t5ib2J>iIefbZB=KqiFGu#YH&+!nH&T6aF=!VEJxp?q
z4PoGL+E~K~DS44#1_oxt1u!yH*ATx;xvoi`R@YBA%u|B7Zur|>);#UgVnDPAqsT6y
zq(B9zDJ-YWfk|uVb2P#tvlLc{I*S+s4rb=2T0kL<d2Xo7HHdh(e{g1bWcZikv$w;8
zACKPv2j4sc1}rBDgYS_kkFvA@JMNfXb_*-iGryXiV-Lt+JpZj4=@IC;Xt+XksVotF
zluhOLn(7wz49nemnDMN>NTI-G(_Lpl%{`@!jJ_Son+Oe<1qyN7lncmx&A+ULhB|FU
zPJYF+qOeb_Ll=X=RosMN%3^@3;ihM4>tGTX#kRRhzxnoGU_FWRm_t;Mg-<BKf-}$u
zQsn(U>Ufb_!{3%6zZL7c2C|3A9lc?0^K4Vhbyi!LoTos-RUXzL+EQy+w$LT_xR*0t
zXOH9CPw67w8_VBV8s>D@!ia#sww3P6Y(#P}wC)xWwMc<KjSde?x|qVTg#VD{g-I;^
zgVVj`H;vRN3?(!s{54q>&<`BJNN#?<AA);N58nIJs?@0*olX%GFP7J#HiQ_T`#4_f
zr)5>b!7kwWuN*~OHRphf(<YwTws>>(1eP4=g?P=Wojxw;)WD?@|CG9ZQWZryhtj;c
zf`xD9kZ35sbcR_~2@w>>oD7Vexrnh709{oT9Dn*h+#k>Ln(@?T0Zm~ex{e}q81;C}
zjsY-f95v}usSY4x*IB!$OVbS6>MeZIdKb5VD+4WC;C#u}N$=_D76P+gFB<%7^R<i_
z3<A_F>A!eF7E#MNTK3=#{K;#fFWuf6VM7Vie8`{=SD(R`5)UbIG1Qyj?UUc<{P$l3
zzyILpw{KxDXg<VTq)j_OWDBaS>?H9}0dbo$Vw=E=g*8Q#;2BRM)b(f%PRJ3qO6F^{
zD8zGtNaKRuHPEkGS@cSNVUjQAv}!_zm4gj5prF*N1Cvj0VLevoNhEW4k<38~PgR+`
zvtu@&!!ur8qp;03OvP<Xz+H(nQg|o~q$k#T24q|yY=G7e7Z)VMnPFt(w674asLD$a
z*-12zO0ilW9?^RtcLc+Nd9ZZ^1k53QaTkS62<wF=JRF0!s)fwbOAoOWk_YQ4W+SN|
zs@kz?l3FF}jq8RPFAsGor{oaM-ic5lJdh3oZu$u2!(Z}v96u(x$OQuF3lQs>=LQbB
z$LS#0F2iZ#Xuz+C2eEIwC}JTWnu3#(@gUfD^XirPmIqGKl~H$)Xm<Cs;Q^;YzweI@
zg(3)BpzqA?&qqhYgX6Q~{ddFB@#}Z*sn1~j5mHykD>ZZii|jU^RF`!+zoHUGF2Q0a
z9R#+p*AD~!`kh%Xfub-60stx4HqUuDD{Z*s6qf}Cro#T#U-Jr76Iz}}BsaCp6ep)(
zi+jLys$zKp_w%L}O1SI8ot$E4uL)!aEAnPXd4XZ(4dQ|uGpy^XPT*iN&E`&p_<S};
z#4SsEBK*pk#3$r!$DAOJm8YizjwP$yAFGV}=pqW@<k{&)=LR}FXpo4?lIMN%;B<%i
z(Pr<g??%%m{u0b0e}@<_OJx6AQVQaEc+|^*f6DHX$uh>)ny>RO?<WJ<>fxXGtKEtG
z-0SHO+{uS?cT(z<Azk)3c_X1&KU`IDx{~>#lCkiqUd)lGZX^kj`ihrGYzGyJcr;^J
z3P-$IuaBVJrg9p=Uz3F3X!-KOZOa^Vaw-g@WFOxFYoIzA=3P4fPEGyB3fsM;%{Jmv
zfFl?hPv}>Nt@{GhnVoBZ3^b9Y?vV`RvFtp!%-ZDIKBrofr+r;Jm@k^EWMZFY1zZ|k
zgse_-y<tu!r}FF}h$xv?=X^?wMb;$!)4@evARtOE&CW}6>7)f2r*E)$oii`lNC}JH
zNvuI>wFLC8P>6OcrMAuMx=!!DyMHXgisCEt@G@qsqma{tt+y%fY^ssJggZ>)&C{|J
z%Zdzu;}YRu$8h8=dV{g(FpmXg0_rkf2mVEHt%0vDeVo6RkN`{uQm^H*r3L?AOeT{2
zxcDNkYaO9~4BM$FSR!p}B0I1SkN|xOmE92>Jqd)T%cxxevZPhP9D(0u)nJy+lL;Bw
zs;_<|I+mnO{<qA02kz#@+q_(4+~1%efWR3!E)gp7G=Dr_3wWq*rwW;d<Bw<WrTr7n
z==LN?VIzSsp^1Fx_jUQhvUu7vtd~2@Ic8JuQ>mJA>5FBq<E?xG(mBauAJ+uvE<er`
zwt!u6;0SKTh2!9A^em$=P(tW!xFc1OrDZJA+(r`YnmmnQXuWAAB|q?HLDJ`vp*Nbg
zg0QD!RH;O`eO3-e5tSWO1WKgKHTYv@2U&4%Ma|caynXsr)DI&2T4-Tn4CY-*DY6|x
zEKTs+%1u>TYiz~~rrRW6X8A<|B<3IUODHV34b0DIx~7>R^GX*R`PWT5eM6bg1qPDH
z!JduSH=gS-DgP4;E44VKDPX!hCt;_P2}*f#!xR~Pn8peoK9ZJ_y_Gz?_>NG3geOQC
zqYRU6!f8Y%3^^v8it2GvEKoF)KoSMRsjRxX8`t352FX<9mQowoA(DJtKgJBzB%T#B
z$iNC8pPWik+Z8w*uReim5`*ce%sFc<E6+RH1f@BXL%}?Ifq%ZUvVb~`4FOI5_18|P
zFcXk){s+v@!5Gh9;MX;4K7=(+aWZ~%G&#vdnNcyQ=A-N2AAVSRNGgboVAAct+YfcT
z#9zHz9;ImUhps?a{)azw);o32cp`>i$CnkK1Q+VP1;<*diR6dCjJ?KusK!9%l(`k=
zbl?LvhE*I*1lzMARGC%_2-~KLP_Vj=V;Q_dYQa#z5*e9kIq;IdT8s^BvVseYt=>X0
z9@G)Ax?Gz{ZU;&Yh}s5l<NvT`!Xh@*a^k|9V-}uL*w2~@16>&%SR&mB?a)LXp1SHv
zb5;HXM*mFz&)<N)Jzu|+f_+_0n1)AWE+M)&Sxf}L$&_;-oJRtI_|s!aSdz>)%*Q@V
zp#c9l2!~(*9sl|3uQFvmKzK_HGAJt6?KEqVB_G{*-$XMd(&Hoi;)fae0s6-Qe*sXz
zstpe1XlesKd*%}nF;ZTst#m+_QRPZ^g*beww&rPjCAG=Vmv!9?IlAZs5E&>@R+7Dn
zwuN3)cY}lt%(<X@cssApodG(}5xL0U5Gq~^F*K@HT7yYa&4QvVjs!pr0W9+C%$)a-
z@@vqs(Dcsf`le#Zs$wxi_9(l6c%o`<r)G#*S&P@?`FURqlX|~Q3~JqWy;et2`JL3O
z_gu<$^sz|%6`bo&KZjRvf3J@(4-Svb@GtwL<B_3pFlID-JKQ}s&&>}<hws!>`bLat
zAyc$2JuL<vhk6Uh`Iq<p94bu4ygt}7ow;C5PSn#a&0<-wQ$`~5+UWMPbE_{?&p{&8
z(=UYxQkLHUnWXsy83Nl{^4T-<q5O4BX&=Gv>{jJPekpz*vtql#J3hg!4!TYKytYpr
zd1;U&zy1sDz^gE<C5~T^cS1>D0jf>RTQ=sP4heq)E1xn<POYR!{Orjqxkt&{OBctU
zZGh4ODDlPS3b;E;y`3K2yiZwAG6y)wf^HBxGq$t_==>HU6rK$`e2V-co7_#{7Q!h*
zGm7L#X_wqm*d`>4X7Gq?h=^3XPv|sPs7wHucxBSRgFDq3T3N1OBI#3#<0<8CZQU9?
zQ5z-o^0U$I>w~l1!-F68f82(9%cAvn?|q$Lh_j(J`#__Y;Ng3q+(|rgWr9UdvXdUR
zXknmx$TF?atcUt8Yv8ka&pgoSa!q9*_6i{PY(1oq3hZq}9I$Pk24S)Ud8&IM2&fsN
zuiFc*33zb^P`w7s!dlJ$xvfHFL2^2&AxZ1t{w6Y_E7%iO=^0R0gB8}`qs`kQ;|VWP
zxWkh(VS7}m*CKieHgbBa<!6%hC~MHp){W$_`EpK5LMr8W=N@yYttzSZhp|vLLVa@z
zQLsS@oHva>0K_$nftSbV7s9wf{}+Q_^C?fotLwG}A!10P_8-O<B6QM80nxogFC5xK
zkh4RLT7nP;*(iP<RX0dSv$ujaX?-&Q!z5{2;xQ0Osr7`PjwQa^#3wQ-_PhQHv>j7M
zrMB}&;s_ofE*hyMVW#zey^`W=x%ddDbtr^@t?vtxa0I)@6w^eXYR?_BBGxB@4DtF^
z@NdLfMwEh`jp@N8p~tGna~Oln95aVd#Dkzvk!C+x89YVa9hu9p%w9|q`Mxm5r9Jm#
z2iru;P6r>Vyi9sxB|BN}cGdy|Q<hZN&6p+UayV<A^XZ1eqWoeT9XjoKGM^X14|2zP
zqa{uBU|x@$c0$L4-b9y#HIXT@$A#?m)pCoMV6DE6sCV$?y`Wf!4v+SRN9Ox~heslZ
zvb}8h{T!Yt{!yX?iCC38;6lwGgF|O>7*b+2hnxUno*yev7~Q9{AtMr6zIfq+2i=l=
zqBFLwIQlSD>rhUCMA-I>Dr9tz!zfSWZX~0C-hi3|lzkHLmTn2F)Ywd{wKy)JZQDCv
zBK1l@3@`<ZO9<QYiS&Qy6uFt+hgOmaJ8BVkQ)GsKS(xarzt;KgbfhZ|XE`IHvTV%F
zLmCf&nZ*uIY_j?jb9|qK;n`MW5CQ970f`tINnJkq)OVM7iE3bgr84l>QU!n;>uo}g
z8OQX09OKvtPuqzV{bQF`K-OTbo(pIRYCB*?wvLjKE-1vfS9aaZq-|9SqQmqe(8Ul8
zG}fAUcxoAK8--EKacqZ)ws%1j_I*}(h|tbL-_GRL+5ztK-a9O|XJ!qEP|390hk<a^
zc=@+s`(Q_r>``*^0%6#yK~fJ{0c2R=4``<-N_)~dg{F$doGsE(V{v~l%^5%^y{qiD
z7r`KbeMgh8HCX;}$8w>&su?fu!u}^6so6BoPdgN$C~%pLiqYBX>5Jrm3Jfa$)~-jj
zK#jB*UG;6$KUvfW?OIgH)`gc6R5)9z4Iu>I>EZNNwW!kRQgGKKv0;@|Tx+XeFV$TU
z;|F3;k3|MU<m=oT4DZWps1_AI&%mc^Vm(#k!G+YT^=O6qqRJu-b|0c?*9@lI{33b~
zI>4l?<T1~J9_t3Yfasf=D&_pk`+7OPeAw;|=F^9B))my%0#{0g*Qb)ueS;v}J6;=P
zOS(39t_>bLf{Id~MRd<HeTNQ1*IL}A0GMK}V*|>tE@|c6%Kgdd@1T**2++3sODJjl
zpN+9}iINAP`l~<|Ib4S}9<1}MUIg&l=KIA3G82|0s&-Mkp}w_+)|Csb6j+Hz7#$MO
zDOZ3Pu=t5Fm+0WNmU_13vA>&^h$5K;5Jy~kt^0WW1D>)bYtE6(M8t^(3ml$1M;R9S
z&LDk;oWEv96Nr_ICkV*~ZS3xW?m?_yr4VJxFvI?9w$=Z_SAr~%GcZoGgU&;2hSDx|
zL)F5YCMcx1rg)<WE=V>7w?mV;SBv=G4DzoxCAvN^&c!q6lh7cGX0cX6g9`$>KsO^(
z5SWHOIuvU3Z3V5EtargLuA%h#7DYA%ZaQk>H<%Q%W|=f*%^b+}H9YrRDNPSVxJt6W
zA;;xt?LCim`Ufta)j$^MXM*%IC>F>YoKK*?`y@jm8snLAeur|jEyhYfcaehI9~tJe
z*dJ#OkJiAACa62TMwKDxvwdu~c5XsQq9ayVI<?X@GGN=EC;{F-!pmOMW`I|*R5Cgq
z3}q#IWM*b(QaT=tbQmm=B|=p1cT#2#>A(F0B>hwd&Ik%d0YsRu5W!-<JM-^BE7}Jw
zTG>=UI<C5{l;bXjwCgX@2|(XJD0lt7V4jGDw)wn}_kuoae%e}?ko1$x!ZA&W_El)s
zZ>?z|zN|~)u*f0joWx_cCa69{{6NY_KeCU>Gq`}Y?_<5!GM<u%56~dVm|@$*E#r?>
zt>rF)z<ZvHmvFh0=!P1=`O0v7IyQGBPr{cDr~uu&2H;2}Gms{M_um@4dAPy6x&#_v
ze^iyBOl)VjTQa)N=X0LB<4^NOQW%!wXk!}=3Lo>GCS2CEL~sr8{uqLgNm|#GLo5h7
z4(CjsHG!-R@*TlTmxCxqY!KknA>7zw38R51?l2gPi}}HA(WEuXR<I_?W1V}E^S2ym
zuPqX=>wci<%rEW|lGx{Pwu6V~f*Ezr-R5VwosV1GAg8+6g{^Eh=Cui!;H!pdOZz;w
zm;nK(;t_)8Z$$|cRBrd>OY^{f&vg$G$t=IFz%`+g=V)YOCuXQ(u@N3{frenu9mTuL
z?j1sroh1y?b9M_5!{}3gU<bkvXz_4)7J-B*4vOB)?iZ?~az^*A1l$t7hP%X65fe$&
zrs7~b<yo?1ae=aY!L$HUo9OPMPLXn8h4kVLUZ|wQ<6iyJm#d{ALgwYtyKzG=4{u`P
zbrrsNtc7fSZBZZbU1NR4n9r%Zku+ZKFA=PW=8~&~H-jmukWM$j$u4sI51?gNy{W$=
zgrdh|S1Z;V#_jCwi{HnK@{V4t<Atu@f@}ANkG-%aDojE>rv)Myeh47ofl4QpbLQpM
z)*U*z>q1Ki45)2=IO();jv%9%_g+p#U)gp``CYA+ZMW3*S&?7n(EZkvd>p{8i?tCS
z(aFRKP`}aE2T}QKrLpZjg6Pm91k&2mlAN$md#(!l6PqA{5334mwH;L9xdge7So8+8
zvVg&V{2A$4TYnnnM|s8RI_s9{;tLdruk7m3FVeWaPHBhxLgVNGUNBA&<GSWvM7Hcm
z(|DHZH71B<zzFNuM76Lrwkp7plfVlMtjpjY26sW8x<B3HCC@?T0E(?qxWwHiG8nk!
zQXZz`Ts(6S9aCE0!jD_RF?X(0p*wiz$RU&_J_eP`EyS{W-#NHUA~0}Aq8@>gdN(1L
zr#K9a-;v5{;6;PXosM(c(yM3boQAp!XYuOOGYwB|Phc;lxxrv|oZgnbkv5luqIUIs
z@XWPD639AUmM{T4f^Ai~F#o|ngiqWt=^MKwrPYoSc?+BK6@B8zvZ<5Lc!DC7&|o)3
z3^gCkgk%oQjmQc-Ks|lIZZv9ixr4f9-p6Plp^H-6CZ~K8*K_a;@7{Ag68O-I{s>Xq
zbCI@*W`nz;vKA@7B{+J1Y4EkZgCQ&2?NX+GQPkK^G`RByCy{Ap9GVO$5Jmi~x0uxM
z3guc}d5x@}sCXg#mEK9IhrfU;8}ct;LH&e*{dNY6ZBoy0va{Zu#kt%|tSo+nL;F+M
zq6dS-FB~i<HSTW<8^C(gyrI*MH?21}%WBij=;Li&A8_8|SBEA|+C0)kNK%W<>j=>!
zdbbM^j6xkwTQ!W+Z&v?WKGw+ECyp94QK6OC7DgcTo&0^IRlGWrbd@Ke*QE;A&r+$A
zh+Gc8+Ud4(c$_^t7T*~N(2y{8Lc4T<zFmY`t2RA{WOSSgblGkv?kf5uu_K4m(iRm;
zu@J*}f(s={%^sL>l~0K>3Bz!BwsyF7!Gtc>IsW1&cVtggnkLsM1r_H^ZQ=7dwEm8J
z;Pu&Wd?+6+agsxxG^7D>l(dnRA~6~MTA;r3kvs|`vJwC(Du5i5^LE1h93A);&G<16
zz+PbyY`1Z+9t5z=OAhn)8M;wYcpW%9)U@nlJNKxm?cY?^f$VCM#@$7Dxs7=9i`cXm
zFM@0IV~tOn+@Ubh*4l8mY2Q_>*iz=ZcG%6VUJ>g#aGRZPgjm_r9N<7#<ad$cWdDGK
z9`ZgrKOj(N@?`-~ojMHc5k8ut-%){xQc39#G5n~SCT`fP=+3m~PD4*kJRl+h#(Wjc
zil@Kdbe|};QKlue$ETi69FwWC2c?{iq-@3{>wgibq;p^;^-E3fnHz~2!#=kc|4ox`
zKWZfJnuSj&*mH&e@^|<G0b*bqPv?ejMm*vlh<)!wp~7~Ed-Pu^zQE+Y?BJ;Fe_^X~
zUIx`^bJ(#;MF)dGxxO8rUkQ(BS;5LNc3Cc57Da6Z(ztRZd-j@1x~Kz0-D($ylaw|o
z9}h9h<e5-Ex<xthpR7Q2VI_k<hPLB7>d-Gg`~=L{Hcl<y!N7$aeAo$?wt=u56M{%q
z!RyE+%)QhY<hxc<wn+T$;iFo%{=l-pd2}Fr`^OVnYaXasbSe-gam}{Sh`|uC1x{Y8
zu%SGiI<{FtUG)4nIw<ZaQ{wFfTofJS_M*~xha^4T?MDM47P=}YlmK#e3uL%}HmR~9
z8P{mN;)Rqm@EHV@W$<wTu@d^dCHcw=m#%gcN#50m+Uq)s6pMZeS<c7qs6mF}OURFj
zpMU?rjLb62sE&zNK|8qcE*?cleUajwJ~EZAv8`#C&l9N_BFuvyi;y=F5o!lkJBTLI
zWk3g7%(@3bAZM$_+vXYQtCb}Ay~A41Sb?@~gEq!3*?KG3x<?e#Tb}DwhPC#we=r&z
z9h?1w<3sEV&T@Ue*oQyfw|6CD7t5eH5OJb=>RD4B+r(0oBct&m*YT1}PpZdnKU{{d
zQkgaXagJ>fg4Kfj75HWZGT8D`1mD{c&81XH@hu^rGvL>G#Be!&oTO;0qf;|O^%qb9
zE;aZZ0-8@rC=}|lrtBf1oNazw;JaXSL0!pE+U)%GD0wu1O4Ez$U4b0xr7J0usNmMg
zmPGUgCuxaoB|{9pjn-2+-wS!f?dQ(yZ?cDwcT@!<*8GY8BR5B{g0924!zvs#E*~yC
z*$ylVxdIlAtg_gGSO3#8o<1Ruf>AwsM5b{<v?WC3ND{seVNaxZY^yzCeav-*=M_Y1
z9^9sk$f1!BNRhmwlD4uL?n_K5=lP`UXt~Rr$L0?a7KwfU;>bOZFeZriC`{x8%Pk9$
zA0n<&v|JpZ<wb}Q;!T-dpw1XXyb?PiwiM_{%PZ(raW=2n-R1aHk<CxPVTQxG@Xhi`
zVm{(ZMGjZ?Q6$G(E)uCMuj<4&Xh`R)(0&{8(eeVzS0~Hk`84+7l+%Lutx9cxa715P
z;PDiK*HH@C(skvj%YTQ3UcRk9TUr?&uP%*~=mB7G?$kBn6RQE2VT(SrgPvo^FOHaK
zN_1`lzX>5Mp&n1*2Eny`X3dCuD4w6_y!|rz93Z|1s2>5O>Gxmz6KyBa-_j=w?T438
ziR0^r_=&_%8*JGW)tpia9nAwXN0VhM&T@NVRfp}BOQDx189%g$w+FzjGFg07)gHZ9
z=qJ<8LAIj?E{|HE&iC*(%kfaF*se>}matJ&<mV6(s73oc^w04*Ag~ouRkGoFxSoJ8
wPw^JP<+7W^FZtjOucXh{H{`58fCsK8u@bU!ppf{}ynv`$n@nGF1|FXLFZ(@T`2YX_

literal 0
HcmV?d00001

diff --git a/src/lib/embeddings.ts b/src/lib/embeddings.ts
index 8dc22f4..a46632b 100644
--- a/src/lib/embeddings.ts
+++ b/src/lib/embeddings.ts
@@ -84,12 +84,17 @@ export function embeddingToBlob(embedding: number[]): Buffer {
 }
 
 /**
- * Convert SQLite BLOB back to embedding array
+ * Convert SQLite BLOB back to embedding array.
+ * bun:sqlite returns BLOB columns as Uint8Array (not Buffer) — wrap without
+ * copying so readFloatLE is available either way.
  */
-export function blobToEmbedding(blob: Buffer): number[] {
+export function blobToEmbedding(blob: Buffer | Uint8Array): number[] {
+  const buf = Buffer.isBuffer(blob)
+    ? blob
+    : Buffer.from(blob.buffer, blob.byteOffset, blob.byteLength);
   const embedding: number[] = [];
-  for (let i = 0; i < blob.length; i += 4) {
-    embedding.push(blob.readFloatLE(i));
+  for (let i = 0; i < buf.length; i += 4) {
+    embedding.push(buf.readFloatLE(i));
   }
   return embedding;
 }
diff --git a/src/lib/export.ts b/src/lib/export.ts
index 2a072bb..a97573d 100644
--- a/src/lib/export.ts
+++ b/src/lib/export.ts
@@ -25,7 +25,11 @@ import { SQLITE_SAFE_CHUNK_SIZE } from './chunk.js';
 import { getMigrationVersion } from '../db/migrations.js';
 import { VERSION } from '../version.js';
 
-/** Durable memory tables included in app-level (JSON/Markdown/SQL) exports. */
+/**
+ * Durable tables included in app-level (JSON/Markdown/SQL) exports: the
+ * memory tables plus dedup_lineage (issue #45), so duplicate lineage stays
+ * portable and auditable alongside the records it describes.
+ */
 export const EXPORT_TABLES = [
   'sessions',
   'messages',
@@ -33,6 +37,7 @@ export const EXPORT_TABLES = [
   'learnings',
   'breadcrumbs',
   'loa_entries',
+  'dedup_lineage',
 ] as const;
 export type ExportTable = typeof EXPORT_TABLES[number];
 
diff --git a/src/lib/memory.ts b/src/lib/memory.ts
index eb89ce3..4ad246c 100644
--- a/src/lib/memory.ts
+++ b/src/lib/memory.ts
@@ -2,6 +2,7 @@
 
 import { getDb, getDbPath } from '../db/connection.js';
 import { existsSync, statSync } from 'fs';
+import { notMarkedDuplicateSql } from './dedup.js';
 import type { Session, Message, Decision, Learning, Breadcrumb, LoaEntry, Stats, SearchResult, Provenance } from '../types/index.js';
 
 // ============ Sessions ============
@@ -271,6 +272,15 @@ export interface MemorySearchOptions {
   table?: string;
   limit?: number;
   biasType?: SearchTable;
+  /** Include records marked as duplicates by `recall dedup` (hidden by default). */
+  includeDuplicates?: boolean;
+}
+
+// Marked duplicates (issue #45) are hidden from search by default — the
+// lineage row in dedup_lineage keeps them recoverable and auditable.
+function duplicateFilter(options: MemorySearchOptions | undefined, physicalTable: string, idExpr: string): string {
+  if (options?.includeDuplicates) return '';
+  return `AND ${notMarkedDuplicateSql(`'${physicalTable}'`, idExpr)}`;
 }
 
 const TYPE_BIAS_RANK_MULTIPLIER = 4;
@@ -313,6 +323,7 @@ export function search(query: string, options?: MemorySearchOptions): SearchResu
           FROM messages_fts f
           JOIN messages m ON m.id = f.rowid
           WHERE messages_fts MATCH ?
+          ${duplicateFilter(options, 'messages', 'm.id')}
           ${options?.project ? 'AND m.project = ?' : ''}
           ORDER BY f.rank
           LIMIT ?
@@ -325,6 +336,7 @@ export function search(query: string, options?: MemorySearchOptions): SearchResu
           JOIN decisions d ON d.id = f.rowid
           WHERE decisions_fts MATCH ?
           AND d.status = 'active'
+          ${duplicateFilter(options, 'decisions', 'd.id')}
           ${options?.project ? 'AND d.project = ?' : ''}
           ORDER BY CASE d.confidence WHEN 'high' THEN 0 WHEN 'medium' THEN 1 WHEN 'low' THEN 2 ELSE 1 END, f.rank
           LIMIT ?
@@ -336,6 +348,7 @@ export function search(query: string, options?: MemorySearchOptions): SearchResu
           FROM learnings_fts f
           JOIN learnings l ON l.id = f.rowid
           WHERE learnings_fts MATCH ?
+          ${duplicateFilter(options, 'learnings', 'l.id')}
           ${options?.project ? 'AND l.project = ?' : ''}
           ORDER BY f.rank
           LIMIT ?
@@ -347,6 +360,7 @@ export function search(query: string, options?: MemorySearchOptions): SearchResu
           FROM breadcrumbs_fts f
           JOIN breadcrumbs b ON b.id = f.rowid
           WHERE breadcrumbs_fts MATCH ?
+          ${duplicateFilter(options, 'breadcrumbs', 'b.id')}
           ${options?.project ? 'AND b.project = ?' : ''}
           ORDER BY f.rank
           LIMIT ?
@@ -358,6 +372,7 @@ export function search(query: string, options?: MemorySearchOptions): SearchResu
           FROM loa_fts f
           JOIN loa_entries l ON l.id = f.rowid
           WHERE loa_fts MATCH ?
+          ${duplicateFilter(options, 'loa_entries', 'l.id')}
           ${options?.project ? 'AND l.project = ?' : ''}
           ORDER BY f.rank
           LIMIT ?
diff --git a/src/mcp-server.ts b/src/mcp-server.ts
index 958d001..242085f 100644
--- a/src/mcp-server.ts
+++ b/src/mcp-server.ts
@@ -66,6 +66,7 @@ import {
 	reciprocalRankFusion,
 	checkEmbeddingService,
 } from "./lib/embeddings.js";
+import { notMarkedDuplicateSql } from "./lib/dedup.js";
 import type { Provenance } from "./types/index.js";
 import { existsSync } from "fs";
 
@@ -118,9 +119,12 @@ async function hybridSearch(
 			const queryResult = await embed(query);
 			const queryEmbedding = queryResult.embedding;
 
+			// Marked duplicates (recall dedup, issue #45) keep their embeddings
+			// but are hidden from the vector path, matching the FTS5 default.
 			const embeddings = db
 				.prepare(`
         SELECT source_table, source_id, embedding FROM embeddings
+        WHERE ${notMarkedDuplicateSql("source_table", "source_id")}
       `)
 				.all() as Array<{
 				source_table: string;
diff --git a/tests/commands/dedup.test.ts b/tests/commands/dedup.test.ts
new file mode 100644
index 0000000..cc60f98
--- /dev/null
+++ b/tests/commands/dedup.test.ts
@@ -0,0 +1,314 @@
+// recall dedup — issue #45 acceptance criteria.
+//
+// Behavior under test:
+// - dry-run by default: reports the plan, writes nothing
+// - --execute marks duplicates in dedup_lineage; records stay intact
+// - exact/normalized duplicate detection within table + project
+// - survivor priority user_authored > verbatim > extracted > derived > unknown
+// - semantic pass uses stored embeddings only; skipped (and reported) when
+//   none exist; threshold respected, below-threshold never merged
+// - idempotence: a second execute finds nothing new
+// - cross-table candidates are report-only
+// - search hides marked duplicates by default; --include-duplicates shows them
+// - lineage rows persist full audit detail
+// - --delete (destructive opt-in) removes rows + embeddings; FK-referenced
+//   duplicates are kept as marked instead of failing the transaction
+
+import { describe, test, expect, beforeEach, afterEach } from 'bun:test';
+import { setupTestDb, teardownTestDb } from '../helpers/setup';
+import { runDedup } from '../../src/commands/dedup';
+import { embeddingToBlob } from '../../src/lib/embeddings';
+import { getDb } from '../../src/db/connection';
+import { search } from '../../src/lib/memory';
+import {
+  createSession,
+  addMessage,
+  addDecision,
+  addBreadcrumb,
+  createLoaEntry,
+  supersedeDecision,
+} from '../../src/lib/memory';
+
+const originalLog = console.log;
+const originalError = console.error;
+const originalExitCode = process.exitCode;
+
+beforeEach(() => {
+  setupTestDb();
+  console.log = () => {};
+  console.error = () => {};
+});
+
+afterEach(() => {
+  console.log = originalLog;
+  console.error = originalError;
+  process.exitCode = originalExitCode;
+  teardownTestDb();
+});
+
+// Long enough to clear MIN_DEDUP_TEXT_LENGTH after normalization.
+const CRUMB = 'Always use bun for every script in this repository, never npm.';
+const OTHER = 'A completely different breadcrumb about the release process.';
+
+function lineageRows(): Array<Record<string, unknown>> {
+  return getDb().prepare('SELECT * FROM dedup_lineage ORDER BY id').all() as Array<Record<string, unknown>>;
+}
+
+function insertEmbedding(table: string, id: number, vector: number[]): void {
+  getDb().prepare(
+    `INSERT OR REPLACE INTO embeddings (source_table, source_id, model, dimensions, embedding)
+     VALUES (?, ?, 'test', ?, ?)`
+  ).run(table, id, vector.length, embeddingToBlob(vector));
+}
+
+describe('dry-run vs execute', () => {
+  test('dry-run reports the plan and writes nothing', () => {
+    addBreadcrumb({ content: CRUMB, importance: 5 });
+    addBreadcrumb({ content: CRUMB, importance: 5 });
+    addBreadcrumb({ content: OTHER, importance: 5 });
+
+    const result = runDedup({})!;
+    const crumbs = result.plan.tables.find(t => t.table === 'breadcrumbs')!;
+    expect(crumbs.exactGroups).toBe(1);
+    expect(crumbs.planned.length).toBe(1);
+    expect(result.applied).toBeNull();
+    expect(lineageRows().length).toBe(0);
+    expect(search('breadcrumb OR bun').length).toBe(3);
+  });
+
+  test('--execute marks duplicates non-destructively and search hides them', () => {
+    const id1 = addBreadcrumb({ content: CRUMB, importance: 5 });
+    const id2 = addBreadcrumb({ content: CRUMB, importance: 5 });
+
+    const result = runDedup({ execute: true })!;
+    expect(result.applied?.marked).toBe(1);
+
+    // Non-destructive: both records still exist.
+    const count = getDb().prepare('SELECT COUNT(*) AS c FROM breadcrumbs').get() as { c: number };
+    expect(count.c).toBe(2);
+
+    // Search hides the marked duplicate by default...
+    const hidden = search('bun');
+    expect(hidden.length).toBe(1);
+    // ...and shows it again with the explicit include option.
+    const shown = search('bun', { includeDuplicates: true });
+    expect(shown.length).toBe(2);
+    expect(shown.map(r => r.id).sort()).toEqual([id1, id2].sort());
+  });
+
+  test('--delete without --execute is rejected', () => {
+    expect(runDedup({ delete: true })).toBeUndefined();
+    expect(process.exitCode).toBe(1);
+  });
+});
+
+describe('exact detection and survivor selection', () => {
+  test('normalized variants dedupe; provenance picks the survivor', () => {
+    const extracted = addBreadcrumb({ content: CRUMB, importance: 9, provenance: 'extracted' });
+    const authored = addBreadcrumb({ content: CRUMB.toUpperCase(), importance: 1, provenance: 'user_authored' });
+    const legacy = addBreadcrumb({ content: `  ${CRUMB}  `, importance: 9 });
+
+    runDedup({ execute: true });
+
+    const rows = lineageRows();
+    expect(rows.length).toBe(2);
+    for (const row of rows) {
+      expect(row.survivor_id).toBe(authored);
+      expect(row.survivor_table).toBe('breadcrumbs');
+      expect(row.reason).toBe('exact');
+      expect(row.status).toBe('marked');
+    }
+    expect(rows.map(r => r.duplicate_id).sort()).toEqual([extracted, legacy].sort());
+  });
+
+  test('identical text in different projects is not deduped', () => {
+    addBreadcrumb({ content: CRUMB, importance: 5, project: 'alpha' });
+    addBreadcrumb({ content: CRUMB, importance: 5, project: 'beta' });
+    const result = runDedup({ execute: true })!;
+    expect(result.applied?.marked).toBe(0);
+  });
+
+  test('short records are never candidates', () => {
+    addBreadcrumb({ content: 'ok', importance: 5 });
+    addBreadcrumb({ content: 'ok', importance: 5 });
+    const result = runDedup({})!;
+    const crumbs = result.plan.tables.find(t => t.table === 'breadcrumbs')!;
+    expect(crumbs.tooShort).toBe(2);
+    expect(crumbs.planned.length).toBe(0);
+  });
+
+  test('only active decisions participate', () => {
+    const text = 'Adopt SQLite WAL mode for every database connection in Recall.';
+    const stale = addDecision({ decision: text, status: 'active' });
+    supersedeDecision(stale);
+    addDecision({ decision: text, status: 'active' });
+
+    const result = runDedup({ execute: true })!;
+    expect(result.applied?.marked).toBe(0);
+  });
+
+  test('--table scopes the run', () => {
+    addBreadcrumb({ content: CRUMB, importance: 5 });
+    addBreadcrumb({ content: CRUMB, importance: 5 });
+    createSession({ session_id: 's1', started_at: '2026-01-01T00:00:00Z' });
+    addMessage({ session_id: 's1', timestamp: '2026-01-01T00:00:01Z', role: 'user', content: CRUMB });
+    addMessage({ session_id: 's1', timestamp: '2026-01-01T00:00:02Z', role: 'user', content: CRUMB });
+
+    const result = runDedup({ execute: true, table: 'messages' })!;
+    expect(result.plan.tables.length).toBe(1);
+    expect(result.applied?.marked).toBe(1);
+    expect(lineageRows().every(r => r.duplicate_table === 'messages')).toBe(true);
+  });
+
+  test('invalid --table and --threshold are rejected', () => {
+    expect(runDedup({ table: 'sessions' })).toBeUndefined();
+    expect(process.exitCode).toBe(1);
+    process.exitCode = 0;
+    expect(runDedup({ threshold: 1.5 })).toBeUndefined();
+    expect(process.exitCode).toBe(1);
+  });
+});
+
+describe('semantic pass', () => {
+  test('skipped with a clear reason when no embeddings exist', () => {
+    addBreadcrumb({ content: CRUMB, importance: 5 });
+    addBreadcrumb({ content: OTHER, importance: 5 });
+    const result = runDedup({})!;
+    expect(result.plan.semanticSkipped).toContain('embed');
+  });
+
+  test('skipped when disabled via --no-semantic', () => {
+    addBreadcrumb({ content: CRUMB, importance: 5 });
+    const result = runDedup({ semantic: false })!;
+    expect(result.plan.semanticSkipped).toContain('--no-semantic');
+  });
+
+  test('marks pairs at the threshold, never below it', () => {
+    const near = Math.sqrt(1 - 0.97 * 0.97);
+    const a = addBreadcrumb({ content: CRUMB, importance: 5 });
+    const b = addBreadcrumb({ content: OTHER, importance: 5 });
+    insertEmbedding('breadcrumbs', a, [1, 0, 0]);
+    insertEmbedding('breadcrumbs', b, [0.97, near, 0]);
+
+    // Above the default 0.95 threshold → marked, with similarity recorded.
+    const marked = runDedup({ execute: true })!;
+    expect(marked.applied?.marked).toBe(1);
+    const row = lineageRows()[0];
+    expect(row.reason).toBe('semantic');
+    expect(row.similarity as number).toBeCloseTo(0.97, 3);
+
+    // Provenance is tied here, so richness decides: the longer normalized text survives.
+    const longer = CRUMB.length >= OTHER.length ? a : b;
+    expect(row.survivor_id).toBe(longer);
+  });
+
+  test('a stricter threshold leaves the same pair untouched', () => {
+    const near = Math.sqrt(1 - 0.97 * 0.97);
+    const a = addBreadcrumb({ content: CRUMB, importance: 5 });
+    const b = addBreadcrumb({ content: OTHER, importance: 5 });
+    insertEmbedding('breadcrumbs', a, [1, 0, 0]);
+    insertEmbedding('breadcrumbs', b, [0.97, near, 0]);
+
+    const result = runDedup({ execute: true, threshold: 0.99 })!;
+    expect(result.applied?.marked).toBe(0);
+    expect(lineageRows().length).toBe(0);
+  });
+});
+
+describe('idempotence', () => {
+  test('a second execute finds nothing new', () => {
+    addBreadcrumb({ content: CRUMB, importance: 5 });
+    addBreadcrumb({ content: CRUMB, importance: 5 });
+    addBreadcrumb({ content: CRUMB, importance: 5 });
+
+    const first = runDedup({ execute: true })!;
+    expect(first.applied?.marked).toBe(2);
+    const after = lineageRows().length;
+
+    const second = runDedup({ execute: true })!;
+    expect(second.applied?.marked).toBe(0);
+    const crumbs = second.plan.tables.find(t => t.table === 'breadcrumbs')!;
+    expect(crumbs.alreadyMarked).toBe(2);
+    expect(lineageRows().length).toBe(after);
+  });
+});
+
+describe('cross-table candidates are report-only', () => {
+  test('reported, never marked', () => {
+    createSession({ session_id: 's1', started_at: '2026-01-01T00:00:00Z' });
+    addMessage({ session_id: 's1', timestamp: '2026-01-01T00:00:01Z', role: 'user', content: CRUMB });
+    addBreadcrumb({ content: CRUMB, importance: 5 });
+
+    const result = runDedup({ execute: true })!;
+    expect(result.plan.crossTable.textMatches.length).toBe(1);
+    expect(result.applied?.marked).toBe(0);
+    expect(lineageRows().length).toBe(0);
+    // Both records remain searchable.
+    expect(search('bun').length).toBe(2);
+  });
+});
+
+describe('lineage persistence', () => {
+  test('rows carry full audit detail', () => {
+    const survivor = addBreadcrumb({ content: CRUMB, importance: 5, provenance: 'user_authored', project: 'demo' });
+    const dup = addBreadcrumb({ content: CRUMB, importance: 5, provenance: 'extracted', project: 'demo' });
+
+    runDedup({ execute: true });
+
+    const row = lineageRows()[0];
+    expect(row.survivor_table).toBe('breadcrumbs');
+    expect(row.survivor_id).toBe(survivor);
+    expect(row.duplicate_table).toBe('breadcrumbs');
+    expect(row.duplicate_id).toBe(dup);
+    expect(row.reason).toBe('exact');
+    expect(row.similarity).toBe(1);
+    expect(row.status).toBe('marked');
+    expect(typeof row.created_at).toBe('string');
+    const detail = JSON.parse(row.detail as string);
+    expect(detail.survivor_provenance).toBe('user_authored');
+    expect(detail.duplicate_provenance).toBe('extracted');
+    expect(detail.project).toBe('demo');
+  });
+});
+
+describe('destructive opt-in (--execute --delete)', () => {
+  test('deletes duplicates and their embeddings; lineage records the deletion', () => {
+    const survivor = addBreadcrumb({ content: CRUMB, importance: 5, provenance: 'user_authored' });
+    const dup = addBreadcrumb({ content: CRUMB, importance: 5 });
+    insertEmbedding('breadcrumbs', dup, [1, 0, 0]);
+
+    const result = runDedup({ execute: true, delete: true })!;
+    expect(result.applied?.deleted).toBe(1);
+
+    const db = getDb();
+    const remaining = db.prepare('SELECT id FROM breadcrumbs').all() as Array<{ id: number }>;
+    expect(remaining.map(r => r.id)).toEqual([survivor]);
+    const embeddings = db.prepare('SELECT COUNT(*) AS c FROM embeddings').get() as { c: number };
+    expect(embeddings.c).toBe(0);
+    expect(lineageRows()[0].status).toBe('deleted');
+  });
+
+  test('FK-referenced duplicates are kept as marked instead of failing', () => {
+    createSession({ session_id: 's1', started_at: '2026-01-01T00:00:00Z' });
+    const survivor = addMessage({ session_id: 's1', timestamp: '2026-01-02T00:00:00Z', role: 'user', content: CRUMB });
+    const referenced = addMessage({ session_id: 's1', timestamp: '2026-01-01T00:00:00Z', role: 'user', content: CRUMB });
+    // The older message loses survivorship (recency tie-break) but is pinned
+    // by an LoA message range — hard-deleting it would violate the FK.
+    createLoaEntry({
+      title: 'entry', fabric_extract: 'body',
+      message_range_start: referenced, message_range_end: referenced,
+    });
+
+    const result = runDedup({ execute: true, delete: true })!;
+    expect(result.applied?.deleted).toBe(0);
+    expect(result.applied?.fkProtected).toBe(1);
+
+    const row = lineageRows()[0];
+    expect(row.duplicate_id).toBe(referenced);
+    expect(row.survivor_id).toBe(survivor);
+    expect(row.status).toBe('marked');
+    // The record still exists, just hidden.
+    const msg = getDb().prepare('SELECT id FROM messages WHERE id = ?').get(referenced);
+    expect(msg).toBeDefined();
+  });
+});
diff --git a/tests/commands/export.test.ts b/tests/commands/export.test.ts
index b7e0274..799c1f7 100644
--- a/tests/commands/export.test.ts
+++ b/tests/commands/export.test.ts
@@ -134,6 +134,7 @@ describe('manifest', () => {
       learnings: 1,
       breadcrumbs: 1,
       loa_entries: 1,
+      dedup_lineage: 0,
     });
     expect(manifest.provenance_counts.messages).toEqual({ unknown: 1, verbatim: 1 });
     expect(manifest.provenance_counts.decisions).toEqual({ unknown: 1, user_authored: 1 });
@@ -199,6 +200,7 @@ describe('SQL dump', () => {
         learnings: 1,
         breadcrumbs: 1,
         loa_entries: 1,
+        dedup_lineage: 0,
       });
 
       // Legacy NULL restores as NULL; known values survive verbatim
diff --git a/tests/db/migrations.test.ts b/tests/db/migrations.test.ts
index 8a52d55..62added 100644
--- a/tests/db/migrations.test.ts
+++ b/tests/db/migrations.test.ts
@@ -188,7 +188,8 @@ describe('MIGRATIONS array', () => {
   test('has expected number of migrations', () => {
     // 7 → 8: importance column on messages/decisions/learnings/loa_entries (Sprint #4)
     // 8 → 9: provenance column on all five memory tables (issue #42)
-    expect(MIGRATIONS.length).toBe(9);
+    // 9 → 10: dedup_lineage table (issue #45)
+    expect(MIGRATIONS.length).toBe(10);
   });
 
   test('all entries are functions', () => {
diff --git a/tests/lib/dedup.test.ts b/tests/lib/dedup.test.ts
new file mode 100644
index 0000000..e6ff9dd
--- /dev/null
+++ b/tests/lib/dedup.test.ts
@@ -0,0 +1,200 @@
+// recall dedup — pure logic (issue #45).
+//
+// Unit tests over the pure exported functions: normalization, survivor
+// selection (provenance > richness > importance > recency > id), exact
+// grouping, cross-table matching, and semantic pairing. Property-based
+// suites over these functions are issue #44 phase 2 — not here.
+
+import { describe, test, expect } from 'bun:test';
+import {
+  compareForSurvivor,
+  DEFAULT_SEMANTIC_THRESHOLD,
+  findCrossTableMatches,
+  findExactGroups,
+  findSemanticPairs,
+  normalizeText,
+  provenanceRank,
+  selectSurvivor,
+  type DedupCandidate,
+} from '../../src/lib/dedup';
+import { PROVENANCE_VALUES } from '../../src/types/index';
+
+function candidate(overrides: Partial<DedupCandidate> = {}): DedupCandidate {
+  return {
+    table: 'breadcrumbs',
+    id: 1,
+    project: null,
+    provenance: null,
+    importance: 5,
+    created_at: '2026-01-01 00:00:00',
+    key: 'k',
+    textLength: 50,
+    ...overrides,
+  };
+}
+
+describe('normalizeText', () => {
+  test('lowercases, strips quotes, collapses whitespace', () => {
+    expect(normalizeText(`  Use "Bun"   for\n'everything'  `)).toBe('use bun for everything');
+  });
+
+  test('identical meaning with different casing/spacing normalizes equal', () => {
+    expect(normalizeText('Ship  THE  feature')).toBe(normalizeText("ship the 'feature'"));
+  });
+});
+
+describe('provenanceRank', () => {
+  test('orders user_authored > verbatim > extracted > derived > unknown', () => {
+    const ranks = [...PROVENANCE_VALUES, null].map(p => provenanceRank(p));
+    expect(ranks).toEqual([...ranks].sort((a, b) => a - b));
+    expect(provenanceRank('user_authored')).toBeLessThan(provenanceRank('verbatim'));
+    expect(provenanceRank('verbatim')).toBeLessThan(provenanceRank('extracted'));
+    expect(provenanceRank('extracted')).toBeLessThan(provenanceRank('derived'));
+    expect(provenanceRank('derived')).toBeLessThan(provenanceRank(null));
+  });
+});
+
+describe('selectSurvivor', () => {
+  test('provenance dominates richness, importance, and recency', () => {
+    const weakButAuthored = candidate({ id: 1, provenance: 'user_authored', textLength: 10, importance: 1, created_at: '2020-01-01 00:00:00' });
+    const richButExtracted = candidate({ id: 2, provenance: 'extracted', textLength: 9999, importance: 10, created_at: '2026-01-01 00:00:00' });
+    const { survivor, duplicates } = selectSurvivor([richButExtracted, weakButAuthored]);
+    expect(survivor.id).toBe(1);
+    expect(duplicates.map(d => d.id)).toEqual([2]);
+  });
+
+  test('richness breaks provenance ties', () => {
+    const short = candidate({ id: 1, textLength: 10 });
+    const long = candidate({ id: 2, textLength: 200 });
+    expect(selectSurvivor([short, long]).survivor.id).toBe(2);
+  });
+
+  test('importance breaks richness ties', () => {
+    const low = candidate({ id: 1, importance: 3 });
+    const high = candidate({ id: 2, importance: 8 });
+    expect(selectSurvivor([low, high]).survivor.id).toBe(2);
+  });
+
+  test('recency breaks importance ties', () => {
+    const older = candidate({ id: 1, created_at: '2025-01-01 00:00:00' });
+    const newer = candidate({ id: 2, created_at: '2026-01-01 00:00:00' });
+    expect(selectSurvivor([older, newer]).survivor.id).toBe(2);
+  });
+
+  test('lowest id is the final deterministic tie-break', () => {
+    const a = candidate({ id: 7 });
+    const b = candidate({ id: 3 });
+    expect(selectSurvivor([a, b]).survivor.id).toBe(3);
+    // Total order: comparator never reports equality for distinct ids.
+    expect(compareForSurvivor(a, b)).toBeGreaterThan(0);
+  });
+
+  test('throws on an empty group', () => {
+    expect(() => selectSurvivor([])).toThrow();
+  });
+});
+
+describe('findExactGroups', () => {
+  test('groups same table + project + key; singletons excluded', () => {
+    const groups = findExactGroups([
+      candidate({ id: 1, key: 'a' }),
+      candidate({ id: 2, key: 'a' }),
+      candidate({ id: 3, key: 'b' }),
+    ]);
+    expect(groups.length).toBe(1);
+    expect(groups[0].map(c => c.id).sort()).toEqual([1, 2]);
+  });
+
+  test('identical text in different projects is not grouped', () => {
+    const groups = findExactGroups([
+      candidate({ id: 1, key: 'a', project: 'alpha' }),
+      candidate({ id: 2, key: 'a', project: 'beta' }),
+    ]);
+    expect(groups.length).toBe(0);
+  });
+});
+
+describe('findCrossTableMatches', () => {
+  test('reports same text across tables, ignores single-table matches', () => {
+    const matches = findCrossTableMatches([
+      candidate({ id: 1, key: 'a', table: 'breadcrumbs' }),
+      candidate({ id: 2, key: 'a', table: 'messages' }),
+      candidate({ id: 3, key: 'b', table: 'breadcrumbs' }),
+      candidate({ id: 4, key: 'b', table: 'breadcrumbs' }),
+    ]);
+    expect(matches.length).toBe(1);
+    expect(matches[0].members.map(m => m.table).sort()).toEqual(['breadcrumbs', 'messages']);
+  });
+});
+
+describe('findSemanticPairs', () => {
+  const unit = (x: number, y: number, z: number) => [x, y, z];
+
+  test('pairs at/above threshold; orthogonal vectors never pair', () => {
+    const near = Math.sqrt(1 - 0.97 * 0.97);
+    const pairs = findSemanticPairs(
+      [
+        { candidate: candidate({ id: 1, key: 'a' }), embedding: unit(1, 0, 0) },
+        { candidate: candidate({ id: 2, key: 'b' }), embedding: unit(0.97, near, 0) },
+        { candidate: candidate({ id: 3, key: 'c' }), embedding: unit(0, 0, 1) },
+      ],
+      DEFAULT_SEMANTIC_THRESHOLD
+    );
+    expect(pairs.length).toBe(1);
+    expect([pairs[0].a.id, pairs[0].b.id].sort()).toEqual([1, 2]);
+    expect(pairs[0].similarity).toBeCloseTo(0.97, 5);
+  });
+
+  test('below-threshold pairs are never produced', () => {
+    const near = Math.sqrt(1 - 0.97 * 0.97);
+    const pairs = findSemanticPairs(
+      [
+        { candidate: candidate({ id: 1, key: 'a' }), embedding: unit(1, 0, 0) },
+        { candidate: candidate({ id: 2, key: 'b' }), embedding: unit(0.97, near, 0) },
+      ],
+      0.99
+    );
+    expect(pairs.length).toBe(0);
+  });
+
+  test('same-table pairs across projects are ignored; cross-table pairs are flagged', () => {
+    const pairs = findSemanticPairs(
+      [
+        { candidate: candidate({ id: 1, key: 'a', project: 'alpha' }), embedding: unit(1, 0, 0) },
+        { candidate: candidate({ id: 2, key: 'b', project: 'beta' }), embedding: unit(1, 0, 0) },
+        { candidate: candidate({ id: 3, key: 'c', table: 'messages', project: 'alpha' }), embedding: unit(1, 0, 0) },
+      ],
+      0.95
+    );
+    // 1↔2: same table, different projects — ignored.
+    // 1↔3 and 2↔3: cross-table — report-only pairs.
+    expect(pairs.every(p => !p.sameTable)).toBe(true);
+    expect(pairs.length).toBe(2);
+  });
+
+  test('identical normalized keys are left to the exact pass', () => {
+    const pairs = findSemanticPairs(
+      [
+        { candidate: candidate({ id: 1, key: 'same' }), embedding: unit(1, 0, 0) },
+        { candidate: candidate({ id: 2, key: 'same' }), embedding: unit(1, 0, 0) },
+      ],
+      0.95
+    );
+    expect(pairs.length).toBe(0);
+  });
+
+  test('results are sorted strongest-first', () => {
+    const mid = Math.sqrt(1 - 0.96 * 0.96);
+    const near = Math.sqrt(1 - 0.99 * 0.99);
+    const pairs = findSemanticPairs(
+      [
+        { candidate: candidate({ id: 1, key: 'a' }), embedding: unit(1, 0, 0) },
+        { candidate: candidate({ id: 2, key: 'b' }), embedding: unit(0.99, near, 0) },
+        { candidate: candidate({ id: 3, key: 'c' }), embedding: unit(0.96, mid, 0) },
+      ],
+      0.95
+    );
+    const sims = pairs.map(p => p.similarity);
+    expect(sims).toEqual([...sims].sort((a, b) => b - a));
+  });
+});

From 167f1b8dabb0c85c5537e10de6e8ec4e47514b37 Mon Sep 17 00:00:00 2001
From: Ed Heltzel <402910+edheltzel@users.noreply.github.com>
Date: Thu, 11 Jun 2026 03:59:49 -0400
Subject: [PATCH 2/2] =?UTF-8?q?fix:=20never=20re-mark=20a=20planned=20surv?=
 =?UTF-8?q?ivor=20=E2=80=94=20one-hop=20dedup=20lineage;=20de-binary=20ded?=
 =?UTF-8?q?up.ts?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Resolves both blockers from the PR #60 review (comment 4676730890):

- planDedup now tracks plannedSurvivorKeys in both passes and skips any
  semantic pair whose member is already a planned survivor, so lineage
  stays one hop deep — no transitive chaining within or across passes.
  Regression test pins the reviewer's A-B-C repro (cos 0.99/0.96/0.91
  at the default 0.95 threshold).
- The raw NUL groupKey separators are now \x00 escape sequences, so git
  diffs src/lib/dedup.ts as text; the groupKey comment names the
  separator choice.
---
 src/lib/dedup.ts             | Bin 22314 -> 22875 bytes
 tests/commands/dedup.test.ts |  31 +++++++++++++++++++++++++++++++
 2 files changed, 31 insertions(+)

diff --git a/src/lib/dedup.ts b/src/lib/dedup.ts
index cd40610a8f935aaf96882a2aff362ea6982130c3..fbfce6e271270aacb669d052c19736bd2cfeeb2a 100644
GIT binary patch
delta 623
zcma)(!D>@M6oyGEE<))-ZbiEKg+S60Z@UnLQfTQy2`#$N?p)+#<|KF0>&!4SNxifo
zeTVcJy68g1Y+Q-&;m#*<ZcQYXE}Yfy^Pm5m^PQVdm7iZLU$UfH!?d3qOkOI^r3X2#
zVG9GJG)tVRkRfD@*C)qdFUT@F??3MBbnxr)3IxxV90lW|ivk2`Z32uCLkFcnx+v3n
zWp)eRplgO&l2hzUFEo$Xq08t!GOiyRb_)oj1M861rq{`<jj~z&T%^{S0gHhBeKZ>5
zQk_-Szpp&F{rz|{XspLNA7Jx10?W5UH&P?xW@`s;s=If@BQAVOQZ_sFb3Zqxfx0=5
zcb60;$DP@`!_N;>f@5KvT%>~{D9EABm7u`oDmBdC_V#qI{dCfbyUS?0r*O%NMSC<O
zEsnG!4olu|{)@-)vk&A>{{SMKF`cqo&atH2?Md=@H|lc&AE@v!nvod`$!uFwY8TH{
k56$~{WaOI^$!y}{VzK96ir?CK<pu7_|HpCm_$XQZ4cH~vX8-^I

delta 122
zcmcb;iE-6B#tj)Fle0zaC)<d+i7=>CC+igy<!7ZPmnhiVE2yj2GHjkBx{sNWVRM{h
zmip$O4#rHI^PQKm3FPM`rz#{W6s0ET7o{j<7EhM-k`T-+;ZlGCeSL-G#JtJQUWzOV
Wj~b>-{_a(|`IxsC<K|FbHXZ=Gz9<g>

diff --git a/tests/commands/dedup.test.ts b/tests/commands/dedup.test.ts
index cc60f98..5c309b0 100644
--- a/tests/commands/dedup.test.ts
+++ b/tests/commands/dedup.test.ts
@@ -213,6 +213,37 @@ describe('semantic pass', () => {
     expect(result.applied?.marked).toBe(0);
     expect(lineageRows().length).toBe(0);
   });
+
+  test('a planned survivor is never re-marked by a weaker pair (no transitive chaining)', () => {
+    // PR #60 review repro: cos(A,B)=0.99, cos(B,C)=0.96, cos(A,C)≈0.91 with
+    // the default 0.95 threshold. Ascending text length pins the survivor
+    // orientation: B survives the strongest pair, so without the survivor
+    // guard the weaker B/C pair re-marks B — leaving A's only visible
+    // neighbor at 0.91, below the threshold.
+    const a = addBreadcrumb({ content: 'Review queue triage happens before standup.', importance: 5 });
+    const b = addBreadcrumb({ content: 'Review queue triage always happens before the standup.', importance: 5 });
+    const c = addBreadcrumb({ content: 'Review queue triage must always happen before the morning standup.', importance: 5 });
+    insertEmbedding('breadcrumbs', a, [0.99, Math.sqrt(1 - 0.99 * 0.99), 0]);
+    insertEmbedding('breadcrumbs', b, [1, 0, 0]);
+    insertEmbedding('breadcrumbs', c, [0.96, -Math.sqrt(1 - 0.96 * 0.96), 0]);
+
+    const result = runDedup({ execute: true })!;
+
+    // Only the strongest pair is marked; B/C is skipped because B already
+    // survives A.
+    expect(result.applied?.marked).toBe(1);
+    const rows = lineageRows();
+    expect(rows.length).toBe(1);
+    expect(rows[0].duplicate_id).toBe(a);
+    expect(rows[0].survivor_id).toBe(b);
+    expect(rows[0].similarity as number).toBeCloseTo(0.99, 3);
+
+    // One-hop lineage invariant: no survivor is itself marked as a duplicate.
+    const duplicates = new Set(rows.map(r => `${r.duplicate_table}:${r.duplicate_id}`));
+    for (const row of rows) {
+      expect(duplicates.has(`${row.survivor_table}:${row.survivor_id}`)).toBe(false);
+    }
+  });
 });
 
 describe('idempotence', () => {