From 7c34cd8816533c76d51e061452a50b902318c398 Mon Sep 17 00:00:00 2001
From: Ed Heltzel <402910+edheltzel@users.noreply.github.com>
Date: Wed, 10 Jun 2026 22:29:03 -0400
Subject: [PATCH 1/2] =?UTF-8?q?feat:=20add=20recall=20dedup=20=E2=80=94=20?=
=?UTF-8?q?non-destructive=20dedup=20with=20provenance-aware=20survivor=20?=
=?UTF-8?q?selection?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Dry-run by default; --execute marks duplicates in the new dedup_lineage
table (schema migration 9→10) without touching the records; --delete is
the destructive opt-in (recall export --backup recommended first).
- Exact/normalized detection within table + project; cross-table
candidates are report-only
- Semantic detection over stored embeddings (pairwise cosine, no
embedding service call; conservative 0.95 default threshold; skip is
reported when no embeddings exist; no transitive chaining)
- Survivor priority user_authored > verbatim > extracted > derived >
unknown (PROVENANCE_VALUES), then richness, importance, recency, id
- Marked duplicates hidden from all search paths (FTS5, semantic,
hybrid, MCP) unless --include-duplicates is passed
- dedup_lineage included in recall export for a portable audit trail
- Destructive deletes go through chunked() per the chunk.ts audit note;
FK-referenced duplicates (LoA message ranges, LoA parents) are kept
as marked instead of failing the transaction
- Fix latent blobToEmbedding crash on bun:sqlite Uint8Array blobs
Closes #45
---
CHANGELOG.md | 12 ++
docs/architecture.md | 10 ++
docs/cli-reference.md | 45 ++++-
src/commands/dedup.ts | 145 ++++++++++++++++
src/commands/embed.ts | 15 +-
src/commands/search.ts | 4 +-
src/db/migrations.ts | 6 +
src/db/schema.ts | 26 +++
src/index.ts | 32 +++-
src/lib/dedup.ts | Bin 0 -> 22314 bytes
src/lib/embeddings.ts | 13 +-
src/lib/export.ts | 7 +-
src/lib/memory.ts | 15 ++
src/mcp-server.ts | 4 +
tests/commands/dedup.test.ts | 314 ++++++++++++++++++++++++++++++++++
tests/commands/export.test.ts | 2 +
tests/db/migrations.test.ts | 3 +-
tests/lib/dedup.test.ts | 200 ++++++++++++++++++++++
18 files changed, 839 insertions(+), 14 deletions(-)
create mode 100644 src/commands/dedup.ts
create mode 100644 src/lib/dedup.ts
create mode 100644 tests/commands/dedup.test.ts
create mode 100644 tests/lib/dedup.test.ts
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4a11335..abf0eda 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,6 +12,18 @@ while MCP tool names (`memory_search`, `memory_add`, etc.) remain stable.
### Added
+- **`recall dedup`** — non-destructive dedup with provenance-aware survivor
+ selection (#45): dry-run by default, `--execute` marks duplicates in the new
+ `dedup_lineage` table (schema migration 9→10) without touching the records,
+ `--delete` is the destructive opt-in (take `recall export --backup` first).
+ Detection combines normalized-text matching with semantic matching over
+ stored embeddings (conservative 0.95 default threshold, skip reported when
+ embeddings are unavailable). Survivor priority is `user_authored > verbatim
+ > extracted > derived > unknown`, then richness, importance, recency.
+ Within-table only; cross-table candidates are report-only. Marked
+ duplicates are hidden from every search path unless
+ `recall search --include-duplicates` is passed, and lineage rows are
+ included in `recall export`.
- **`recall export`** — portable and disaster-recovery exports (#43): JSON,
Markdown, SQL dump, and SQLite (`VACUUM INTO`) formats with a manifest
(counts + provenance counts including explicit `unknown`), a stdout/file/
diff --git a/docs/architecture.md b/docs/architecture.md
index a7d859e..d84e960 100644
--- a/docs/architecture.md
+++ b/docs/architecture.md
@@ -70,6 +70,7 @@ both.
| telos | Purpose framework entries (optional) | Yes |
| documents | Imported standalone markdown documents (optional) | Yes |
| embeddings | Vector embeddings for semantic search (768-dim, nomic-embed-text) | N/A |
+| dedup_lineage | Duplicate lineage audit trail from `recall dedup` (survivor, duplicate, reason, similarity, status) | No |
All FTS5-indexed tables have automatic sync triggers.
@@ -88,6 +89,15 @@ rows stay `NULL` (unknown) until classified with
`recall provenance backfill`, which only acts on deterministic write-path
evidence and never guesses.
+The `dedup_lineage` table was added in schema migration 9→10. `recall dedup`
+marks duplicate records non-destructively by writing lineage rows here
+(survivor table/id, duplicate table/id, reason, similarity, status); marked
+duplicates stay in their source tables but are hidden from search unless
+`--include-duplicates` is passed. Survivor selection follows provenance order
+(`user_authored > verbatim > extracted > derived > unknown`), then richness,
+importance, and recency. Dedup acts within a table only; cross-table
+candidates are report-only.
+
## Tiered RecallStart (v0.7.0+)
The `RecallStart` hook injects two tiers at the top of every session:
diff --git a/docs/cli-reference.md b/docs/cli-reference.md
index c3d38ff..671de42 100644
--- a/docs/cli-reference.md
+++ b/docs/cli-reference.md
@@ -17,6 +17,7 @@ recall search "query" -t decisions # Hard-filter to decisions only
recall search "query" --bias-type decisions # Prefer decisions, still show other matching tables
recall search "query" -p myproject # Filter by project
recall search "query" --show-provenance # Show provenance for every result
+recall search "query" --include-duplicates # Include records marked by recall dedup
recall semantic "query" # Semantic search (explicit)
recall hybrid "query" # Hybrid search (explicit)
```
@@ -46,6 +47,8 @@ FTS5 supports boolean operators and prefix matching:
By default, search output stays quiet about [Record Provenance](#record-provenance) when a record carries a known value, and visibly flags records whose provenance is unknown (legacy rows that predate the provenance column). Pass `--show-provenance` to display the provenance of every result.
+Records marked as duplicates by [`recall dedup`](#dedup) are hidden from every search path (keyword, semantic, hybrid) by default — the records and their lineage remain in the database. Pass `--include-duplicates` to show them.
+
---
## Capture
@@ -269,7 +272,7 @@ Formats:
- **json / markdown** — app-level export of the durable memory tables
(`sessions`, `messages`, `decisions`, `learnings`, `breadcrumbs`,
- `loa_entries`). Every row of a provenance-bearing table carries an explicit
+ `loa_entries`, `dedup_lineage`). Every row of a provenance-bearing table carries an explicit
`provenance` field; legacy `NULL` provenance is exported as the literal
`unknown` — never omitted, never guessed (see Record Provenance above).
Embeddings are excluded.
@@ -300,6 +303,46 @@ included.
overwrites an existing file (a `-N` suffix is added on collision), and prints
the output path.
+## Dedup
+
+Detect and mark duplicate memory records without erasing evidence or lineage.
+
+```bash
+recall dedup # Dry-run report (default — writes nothing)
+recall dedup --execute # Mark duplicates (non-destructive)
+recall dedup --execute --delete # Destructive opt-in: hard-delete duplicates
+recall dedup -t breadcrumbs # Scope to one table
+recall dedup -p myproject # Scope to one project
+recall dedup --threshold 0.98 # Stricter semantic matching (default 0.95)
+recall dedup --no-semantic # Exact/normalized text pass only
+```
+
+Safety model:
+
+- **Dry-run by default.** Mutations require `--execute`.
+- **Non-destructive by default.** `--execute` writes lineage rows to the
+ `dedup_lineage` table (survivor, duplicate, reason, similarity, status,
+ timestamp); the duplicate records themselves stay intact and are merely
+ hidden from search. `--delete` is the destructive opt-in and requires
+ `--execute` — run `recall export --backup` first.
+- **Within-table only.** Dedup never merges across tables (or across
+ projects). Cross-table duplicate candidates are report-only.
+- **Survivor priority** is `user_authored > verbatim > extracted > derived >
+ unknown` ([Record Provenance](#record-provenance)); ties break by richness
+ (longer normalized text), importance, recency, then lowest id.
+- **Detection** combines exact/normalized text matching with semantic
+ matching over stored embeddings (no embedding service call needed). The
+ semantic pass is skipped — and reported as skipped — when no embeddings
+ exist; records are never merged below the configured `--threshold`
+ (conservative default: 0.95 cosine similarity). Records with fewer than 20
+ significant characters are never candidates.
+- **Lifecycle-aware.** Only `active` decisions participate; superseded and
+ reverted decisions are managed by the decision lifecycle, not dedup.
+
+Marked duplicates are hidden from all search paths by default; see
+`recall search --include-duplicates`. Lineage is included in
+`recall export`, so the audit trail is portable.
+
## Admin
```bash
diff --git a/src/commands/dedup.ts b/src/commands/dedup.ts
new file mode 100644
index 0000000..7c865bf
--- /dev/null
+++ b/src/commands/dedup.ts
@@ -0,0 +1,145 @@
+// recall dedup command (issue #45).
+//
+// Dry-run by default. --execute marks duplicates (non-destructive: records
+// stay intact, hidden from search via dedup_lineage). --delete is the
+// destructive opt-in and requires --execute; take a `recall export --backup`
+// first. Core logic lives in src/lib/dedup.ts.
+
+import { getDb } from '../db/connection.js';
+import {
+ applyDedupPlan,
+ DEDUP_TABLES,
+ DEFAULT_SEMANTIC_THRESHOLD,
+ planDedup,
+ type ApplyResult,
+ type DedupPlan,
+} from '../lib/dedup.js';
+import type { ProvenanceTable } from '../types/index.js';
+
+/** Options accepted by `runDedup`; every field is optional. */
+export interface DedupOptions {
+ /** Apply the plan instead of only reporting it; treated as false when omitted. */
+ execute?: boolean;
+ /** Destructive mode; `runDedup` rejects it unless `execute` is also set. */
+ delete?: boolean;
+ /** Target table: one of `DEDUP_TABLES` or `'all'` (used when omitted). */
+ table?: string;
+ /** Project scope, forwarded unchanged to `planDedup`. */
+ project?: string;
+ /** Semantic similarity threshold in (0, 1]; falls back to `DEFAULT_SEMANTIC_THRESHOLD`. */
+ threshold?: number;
+ /** Forwarded unchanged to `planDedup`; presumably `false` disables the semantic pass — confirm in src/lib/dedup.ts. */
+ semantic?: boolean;
+}
+
+/** Result of `runDedup`: the computed plan and, when it was applied, the apply outcome. */
+export interface DedupRunResult {
+ plan: DedupPlan;
+ /** `null` on a dry run, where the plan is reported but not applied. */
+ applied: ApplyResult | null;
+}
+
+/**
+ * Runs the `recall dedup` command: validates the options, builds a dedup plan,
+ * prints a per-table report, and applies the plan only when `execute` is set.
+ *
+ * @param options - See `DedupOptions`; with no options this is a dry run over all tables.
+ * @returns The plan plus the apply result (`applied` is `null` on a dry run),
+ * or `undefined` when option validation fails — in that case an error is
+ * printed and `process.exitCode` is set to 1.
+ */
+export function runDedup(options: DedupOptions = {}): DedupRunResult | undefined {
+ const execute = options.execute ?? false;
+ const destructive = options.delete ?? false;
+
+ if (destructive && !execute) {
+ console.error('--delete requires --execute. Dry-run never deletes.');
+ process.exitCode = 1;
+ return undefined;
+ }
+
+ const target = options.table ?? 'all';
+ if (target !== 'all' && !(DEDUP_TABLES as readonly string[]).includes(target)) {
+ console.error(
+ `Invalid --table "${target}". Valid tables: ${DEDUP_TABLES.join(', ')}, all.`
+ );
+ process.exitCode = 1;
+ return undefined;
+ }
+
+ // NaN (e.g. an unparsable CLI value run through parseFloat) fails the isFinite check.
+ const threshold = options.threshold ?? DEFAULT_SEMANTIC_THRESHOLD;
+ if (!Number.isFinite(threshold) || threshold <= 0 || threshold > 1) {
+ console.error(`Invalid --threshold "${options.threshold}". Expected a number in (0, 1].`);
+ process.exitCode = 1;
+ return undefined;
+ }
+
+ // The plan is computed in both dry-run and execute modes; only the
+ // applyDedupPlan call further down is gated on `execute`.
+ const db = getDb();
+ const plan = planDedup(db, {
+ tables: target === 'all' ? undefined : [target as ProvenanceTable],
+ project: options.project,
+ threshold,
+ semantic: options.semantic,
+ });
+
+ const mode = !execute
+ ? '[DRY RUN — no changes written]'
+ : destructive
+ ? '[EXECUTE + DELETE — destructive: duplicates will be removed]'
+ : '[EXECUTE — marking duplicates, non-destructive]';
+ console.log(`${mode}\n`);
+
+ if (destructive) {
+ console.log("Recommended: run 'recall export --backup' before destructive dedup.\n");
+ }
+
+ const verb = execute ? (destructive ? 'delete' : 'mark') : 'would mark';
+ let totalPlanned = 0;
+ for (const report of plan.tables) {
+ totalPlanned += report.planned.length;
+ // Scanned records that are not planned for marking/deletion.
+ const unchanged = report.scanned - report.planned.length;
+ const skipped: string[] = [];
+ if (report.alreadyMarked > 0) skipped.push(`${report.alreadyMarked} already marked`);
+ if (report.tooShort > 0) skipped.push(`${report.tooShort} too short`);
+ const skippedNote = skipped.length > 0 ? ` (${skipped.join(', ')})` : '';
+ console.log(
+ `${report.table}: scanned ${report.scanned}, exact groups ${report.exactGroups}, ` +
+ `semantic pairs ${report.semanticPairs}, ${verb} ${report.planned.length}, ` +
+ `unchanged ${unchanged}${skippedNote}`
+ );
+ // Preview at most three planned entries per table; the rest are summarized.
+ for (const entry of report.planned.slice(0, 3)) {
+ const sim = entry.similarity !== null ? ` @ ${entry.similarity.toFixed(3)}` : '';
+ console.log(
+ ` #${entry.duplicate_id} → survivor #${entry.survivor_id} [${entry.reason}${sim}]`
+ );
+ }
+ if (report.planned.length > 3) {
+ console.log(` ...and ${report.planned.length - 3} more`);
+ }
+ }
+ console.log('');
+
+ // A truthy semanticSkipped is printed as the reason the semantic pass did not run.
+ if (plan.semanticSkipped) {
+ console.log(`Semantic pass: skipped — ${plan.semanticSkipped}`);
+ } else {
+ console.log(`Semantic pass: threshold ${plan.threshold}`);
+ }
+
+ // Cross-table matches are only printed here; this function never acts on them.
+ const crossText = plan.crossTable.textMatches;
+ console.log(
+ `Cross-table (report-only, never acted on): ${crossText.length} text match group(s), ` +
+ `${plan.crossTable.semanticPairs} semantic pair(s)`
+ );
+ // Preview at most five cross-table groups.
+ for (const match of crossText.slice(0, 5)) {
+ const members = match.members.map(m => `${m.table}#${m.id}`).join(' ↔ ');
+ const projectTag = match.project ? ` [${match.project}]` : '';
+ console.log(` ${members}${projectTag}`);
+ }
+ if (crossText.length > 5) {
+ console.log(` ...and ${crossText.length - 5} more`);
+ }
+ console.log('');
+
+ // Dry run stops here: print the next-step hint and return without applying.
+ if (!execute) {
+ if (totalPlanned > 0) {
+ console.log('Re-run with --execute to mark duplicates (non-destructive).');
+ console.log("Marked duplicates are hidden from search; use 'recall search --include-duplicates' to see them.");
+ } else {
+ console.log('No duplicates found.');
+ }
+ return { plan, applied: null };
+ }
+
+ const applied = applyDedupPlan(db, plan, { destructive });
+ if (destructive) {
+ const fkNote = applied.fkProtected > 0
+ ? ` (${applied.fkProtected} kept as marked — referenced by LoA lineage)`
+ : '';
+ console.log(`Deleted ${applied.deleted} duplicate(s)${fkNote}.`);
+ } else {
+ console.log(`Marked ${applied.marked} duplicate(s). Records remain intact and recoverable.`);
+ }
+ return { plan, applied };
+}
diff --git a/src/commands/embed.ts b/src/commands/embed.ts
index c9ae4a0..66bd623 100644
--- a/src/commands/embed.ts
+++ b/src/commands/embed.ts
@@ -2,8 +2,17 @@
import { getDb } from '../db/connection.js';
import { embed, embeddingToBlob, blobToEmbedding, cosineSimilarity, checkEmbeddingService, reciprocalRankFusion, EMBEDDING_MODEL } from '../lib/embeddings.js';
+import { notMarkedDuplicateSql } from '../lib/dedup.js';
import { search as ftsSearch } from '../lib/memory.js';
+/**
+ * Builds the WHERE clause used by the semantic and hybrid search queries over
+ * the `embeddings` table.
+ *
+ * Marked duplicates (recall dedup, issue #45) keep their embeddings but are
+ * hidden from the semantic search paths, matching the FTS5 default.
+ *
+ * @param table - Optional `source_table` value to restrict the query to.
+ * @returns A complete `WHERE …` clause, ready to splice into the SELECT.
+ */
+function embeddingsWhere(table?: string): string {
+  const conditions = [notMarkedDuplicateSql('source_table', 'source_id')];
+  if (table) {
+    // The value is inlined into the SQL text, so double any single quote to
+    // keep it a plain string literal even when the caller passes odd input.
+    const literal = table.replace(/'/g, "''");
+    conditions.push(`source_table = '${literal}'`);
+  }
+  return `WHERE ${conditions.join(' AND ')}`;
+}
+
interface EmbedOptions {
table?: 'loa' | 'decisions' | 'messages' | 'learnings';
limit?: number;
@@ -164,11 +173,10 @@ export async function runSemanticSearch(query: string, options: { table?: string
const queryEmbedding = queryResult.embedding;
// Get all embeddings (for now, brute force - will optimize later)
- const tableFilter = options.table ? `WHERE source_table = '${options.table}'` : '';
const embeddings = db.prepare(`
SELECT id, source_table, source_id, embedding
FROM embeddings
- ${tableFilter}
+ ${embeddingsWhere(options.table)}
`).all() as Array<{ id: number; source_table: string; source_id: number; embedding: Buffer }>;
if (embeddings.length === 0) {
@@ -304,11 +312,10 @@ export async function runHybridSearch(query: string, options: { table?: string;
const queryEmbedding = queryResult.embedding;
// Get embeddings from database
- const tableFilter = options.table ? `WHERE source_table = '${options.table}'` : '';
const embeddings = db.prepare(`
SELECT id, source_table, source_id, embedding
FROM embeddings
- ${tableFilter}
+ ${embeddingsWhere(options.table)}
`).all() as Array<{ id: number; source_table: string; source_id: number; embedding: Buffer }>;
// Calculate similarities
diff --git a/src/commands/search.ts b/src/commands/search.ts
index 2c13579..9c782a7 100644
--- a/src/commands/search.ts
+++ b/src/commands/search.ts
@@ -8,6 +8,7 @@ interface SearchOptions {
biasType?: string;
limit?: number;
showProvenance?: boolean;
+ includeDuplicates?: boolean;
}
export function runSearch(query: string, options: SearchOptions): void {
@@ -23,7 +24,8 @@ export function runSearch(query: string, options: SearchOptions): void {
project: options.project,
table: options.table,
biasType: options.biasType as SearchTable | undefined,
- limit: options.limit || 20
+ limit: options.limit || 20,
+ includeDuplicates: options.includeDuplicates
});
if (results.length === 0) {
diff --git a/src/db/migrations.ts b/src/db/migrations.ts
index 415e1a4..e5c4cbe 100644
--- a/src/db/migrations.ts
+++ b/src/db/migrations.ts
@@ -198,6 +198,12 @@ export const MIGRATIONS: Migration[] = [
}
}
},
+
+ // Migration 9 → 10: Dedup lineage table (issue #45).
+ // No-op — dedup_lineage and its indexes are brand new, handled by the
+ // CREATE TABLE IF NOT EXISTS DDL that runs before migrations (same
+ // precedent as migration 3 → 4 for the extraction tables).
+ (_db) => {},
];
// ---------------------------------------------------------------------------
diff --git a/src/db/schema.ts b/src/db/schema.ts
index 7f60912..269e0b9 100644
--- a/src/db/schema.ts
+++ b/src/db/schema.ts
@@ -181,6 +181,24 @@ CREATE TABLE IF NOT EXISTS procedures (
times_observed INTEGER DEFAULT 2,
confidence TEXT DEFAULT 'medium' CHECK (confidence IN ('high', 'medium', 'low'))
);
+
+-- Dedup lineage (issue #45): persistent audit trail of duplicate marking.
+-- Non-destructive by default — a 'marked' row hides the duplicate from search
+-- while the underlying record stays intact. 'deleted' records a destructive
+-- opt-in removal. 'reverted' is reserved vocabulary for a future unmark path
+-- (CHECK constraints cannot be widened later without a table rebuild).
+CREATE TABLE IF NOT EXISTS dedup_lineage (
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
+ created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
+ survivor_table TEXT NOT NULL,
+ survivor_id INTEGER NOT NULL,
+ duplicate_table TEXT NOT NULL,
+ duplicate_id INTEGER NOT NULL,
+ reason TEXT NOT NULL CHECK (reason IN ('exact', 'semantic')),
+ similarity REAL,
+ status TEXT NOT NULL DEFAULT 'marked' CHECK (status IN ('marked', 'deleted', 'reverted')),
+ detail TEXT
+);
`;
export const CREATE_INDEXES = `
@@ -231,6 +249,14 @@ CREATE INDEX IF NOT EXISTS idx_documents_created ON documents(created_at);
-- Extraction session indexes
CREATE INDEX IF NOT EXISTS idx_extraction_sessions_ts ON extraction_sessions(timestamp DESC);
+
+-- Dedup lineage indexes: the partial unique index guarantees a record can be
+-- an actively marked duplicate at most once (idempotence); the survivor index
+-- supports lineage audits.
+CREATE UNIQUE INDEX IF NOT EXISTS idx_dedup_lineage_duplicate
+ ON dedup_lineage(duplicate_table, duplicate_id) WHERE status = 'marked';
+CREATE INDEX IF NOT EXISTS idx_dedup_lineage_survivor
+ ON dedup_lineage(survivor_table, survivor_id);
`;
export const CREATE_FTS = `
diff --git a/src/index.ts b/src/index.ts
index 4040e17..a2e2aaf 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -30,6 +30,8 @@ import { runOnboard } from './commands/onboard.js';
import { runMigrate } from './commands/migrate.js';
import { runPath } from './commands/path.js';
import { runExport } from './commands/export.js';
+import { runDedup } from './commands/dedup.js';
+import { DEFAULT_SEMANTIC_THRESHOLD } from './lib/dedup.js';
import { closeDb } from './db/connection.js';
const program = new Command();
@@ -180,13 +182,15 @@ program
.option('--bias-type
', 'Softly boost one table without filtering others (messages, loa, decisions, learnings, breadcrumbs)')
.option('-l, --limit ', 'Max results', '20')
.option('--show-provenance', 'Show provenance for every result (default: only unknown provenance is flagged)')
+ .option('--include-duplicates', 'Include records marked as duplicates by recall dedup (hidden by default)')
.action((query, options) => {
runSearch(query, {
project: options.project,
table: options.table,
biasType: options.biasType,
limit: parseInt(options.limit, 10),
- showProvenance: options.showProvenance
+ showProvenance: options.showProvenance,
+ includeDuplicates: options.includeDuplicates
});
closeDb();
});
@@ -658,6 +662,30 @@ program
closeDb();
});
+// recall dedup — non-destructive duplicate detection (issue #45)
+// Dry-run by default; --execute marks duplicates in dedup_lineage (records
+// stay intact, hidden from search); --delete is the destructive opt-in.
+program
+  .command('dedup')
+  .description('Detect and mark duplicate memory records (dry-run by default; non-destructive)')
+  .option('--execute', 'Apply the plan: mark duplicates (default is dry-run)')
+  .option('--delete', "Destructive opt-in: hard-delete duplicates instead of marking (requires --execute; run 'recall export --backup' first)")
+  // Value-taking options need a <placeholder>: without one commander parses
+  // them as boolean flags, so runDedup would receive `true` instead of the
+  // table/project string and parseFloat(options.threshold) would be NaN.
+  .option('-t, --table <table>', 'Target table: messages, decisions, learnings, breadcrumbs, loa_entries, all', 'all')
+  .option('-p, --project <project>', 'Scope to one project')
+  .option('--threshold <n>', `Semantic similarity threshold (0-1)`, String(DEFAULT_SEMANTIC_THRESHOLD))
+  .option('--no-semantic', 'Skip the semantic (embeddings) pass')
+  .action((options) => {
+    // Option values arrive as strings; the threshold is parsed here and its
+    // range is validated inside runDedup.
+    runDedup({
+      execute: options.execute,
+      delete: options.delete,
+      table: options.table,
+      project: options.project,
+      threshold: parseFloat(options.threshold),
+      semantic: options.semantic
+    });
+    closeDb();
+  });
+
// Default command: recall → hybrid search (Phase 3: best of both worlds)
program
.arguments('[query]')
@@ -668,7 +696,7 @@ program
.option('-k, --keyword', 'Use keyword search only (FTS5)')
.option('-v, --vector', 'Use vector search only (semantic)')
.action(async (query, options) => {
- if (query && !['init', 'add', 'search', 'recent', 'show', 'stats', 'import', 'import-conversations', 'loa', 'telos', 'docs', 'dump', 'embed', 'semantic', 'hybrid', 'doctor', 'importance', 'provenance', 'pin', 'unpin', 'decision', 'prune', 'cluster', 'import-legacy', 'benchmark', 'onboard', 'migrate', 'path', 'export'].includes(query)) {
+ if (query && !['init', 'add', 'search', 'recent', 'show', 'stats', 'import', 'import-conversations', 'loa', 'telos', 'docs', 'dump', 'embed', 'semantic', 'hybrid', 'doctor', 'importance', 'provenance', 'pin', 'unpin', 'decision', 'prune', 'cluster', 'import-legacy', 'benchmark', 'onboard', 'migrate', 'path', 'export', 'dedup'].includes(query)) {
if (options.keyword) {
// FTS5 only
runSearch(query, {
diff --git a/src/lib/dedup.ts b/src/lib/dedup.ts
new file mode 100644
index 0000000000000000000000000000000000000000..cd40610a8f935aaf96882a2aff362ea6982130c3
GIT binary patch
literal 22314
zcmcg!>rxy?mfqibinP`-U2Q`nImR2Y$VeeW+hQz%md4}RWmu^0E}%qpRjaC6z+fWw
zA@&LPN%s5B$y;?d-f@Ix35Nw)nI|*P{Z_TLW$J8_7KNE+)5YBU&wu^5nN)RVis~|-
zm?UqSMP~l_kKgtOPqw!3pE;&1u%O^NcC2i$y70HwLGx&AdqS(oEAfHBDvm)--K;
z*O*0_x0@~Qm5z&yaQxN(Xy#XGlbLT!I-LS|UCpz)y)&F_VYXKpCism!u14uahMBW!
znibmwx@q?6yUlu0n(-a5aFH&G_Jx@(TDf)uT6|pOIDA_2{6#*&`Lf$=vS@+KHk*?O
z1fnZVDi<@ew7zZ($cF_fj%9d3lJl&{%PhUjgsh-5klDz6F{8y|0cXscw4P!?krDUA
zB~X*i(;Bca97tWw+s(Z6XCS`-NMlHRY?&HCo=MVj8peRH51@VDRAq5DFuQftG@BmF
zCLG&Lur3E-Z~-z;osoc>vP`c%a_ls(^Qz9*vidAtv{&HSsd;HY;W2PI!>`$G
zTjO{#Odk5g-w4Qn*w6V7$nSX+tP^IbXD!*s8#q^Zf&pyxJHH
z2K{qW1?$B
zA49)-V6pb0iC2${YJ6M`9kXme^MlK?Q6QGveX!;ErWtgAJTWE`qT|=!za5S?x`QfN
z9HeBK_ufjZ89Ju=-FV{n9c;zYC_6vd`synq^Puy@Xi*`H+JN>Vn^RmPTQttVdF7><
zWwWY=-sTp~zE1KhqnSoz+{tc*32Zn6Jz(=0k44XJY?Ei;J*&
z!!%3V2~^_cz`SnFv%$Z9Yf&2u0?Z#^mDMbtYzlT^er{W6unB~?*)Z)*MfD&$oB9x%
zyx#Qs4H5*SkiIefbZB=KqiFGu#YH&+!nH&T6aF=!VEJxp?q
z4PoGL+E~K~DS44#1_oxt1u!yH*ATx;xvoi`R@YBA%u|B7Zur|>);#UgVnDPAqsT6y
zq(B9zDJ-YWfk|uVb2P#tvlLc{I*S+s4rb=2T0kLNTI-G(_Lpl%{`@!jJ_Son+Oe<1qyN7lncmx&A+ULhB|FU
zPJYF+qOeb_Ll=X=RosMN%3^@3;ihM4>tGTX#kRRhzxnoGU_FWRm_t;Mg-Ufb_!{3%6zZL7c2C|3A9lc?0^K4Vhbyi!LoTos-RUXzL+EQy+w$LT_xR*0t
zXOH9CPw67w8_VBV8s>D@!ia#sww3P6Y(#P}wC)xWwMc&<`BJNN#?b!7kwWuN*~OHRphf(>(1eP4=g?P=Wojxw;)WD?@|CG9ZQWZryhtj;c
zf`xD9kZ35sbcR_~2@w>>oD7Vexrnh709{oT9Dn*h+#k>Ln(@?T0Zm~ex{e}q81;C}
zjsY-f95v}usSY4x*IB!$OVbS6>MeZIdKb5VD+4WC;C#u}N$=_D76P+gFB<%7^RA!eF7E#MNTK3=#{K;#fFWuf6VM7Vie8`{=SD(R`5)UbIG1Qyj?UUc<{P$l3
zzyILpw{KxDXg?H9}0dbo$Vw=E=g*8Q#;2BRM)b(f%PRJ3qO6F^{
zD8zGtNaKRuHPEkGS@cSNVUjQAv}!_zm4gj5prF*N1Cvj0VLevoNhEW4k<38~PgR+`
zvtu@&!!ur8qp;03OvP=LQbB
z$LS#0F2iZ#Xuz+C2eEIwC}JTWnu3#(@gUfD^XirPmIqGKl~H$)XmZZii|jU^RF`!+zoHUGF2Q0a
z9R#+p*AD~!`kh%Xfub-60stx4HqUuDD{Z*s6qf}Cro#T#U-Jr76Iz}}BsaCp6ep)(
zi+jLys$zKp_w%L}O1SI8ot$E4uL)!aEAnPXd4XZ(4dQ|uGpy^XPT*iN&E`&p_)ny>RO?fxXGtKEtG
z-0SHO+{uS?cT(z#lCkiqUd)lGZX^kj`ihrGYzGyJcr;^J
z3P-$IuaBVJrg9p=Uz3F3X!-KOZOa^Vaw-g@WFOxFYoIzA=3P4fPEGyB3fsM;%{Jmv
zfFl?hPv}>Nt@{GhnVoBZ3^b9Y?vV`RvFtp!%-ZDIKBrofr+r;Jm@k^EWMZFY1zZ|k
zgse_-y7)f2r*E)$oii`lNC}JH
zNvuI>wFLC8P>6OcrMAuMx=!!DyMHXgisCEt@G@qsqma{tt+y%fY^ssJggZ>)&C{|J
z%Zdzu;}YRu$8h8=dV{g(FpmXg0_rkf2mVEHt%0vDeVo6RkN`{uQm^H*r3L?AOeT{2
zxcDNkYaO9~4BM$FSR!p}B0I1SkN|xOmE92>Jqd)T%cxxevZPhP9D(0u)nJy+lL;Bw
zs;_<|I+mnO{mJA>5FBqTYiz~~rrRW6X8A<|B<3IUODHV34b0DIx~7>R^GX*R`PWT5eM6bg1qPDH
z!JduSH=gS-DgP4;E44VKDPX!hCt;_P2}*f#!xR~Pn8peoK9ZJ_y_Gz?_>NG3geOQC
zqYRU6!f8Y%3^^v8it2GvEKoF)KoSMRsjRxX8`t352FX<9mQowoA(DJtKgJBzB%T#B
z$iNC8pPWik+Z8w*uReim5`*ce%sFci$CnkK1Q+VP1;<*diR6dCjJ?KusK!9%l(`k=
zbl?LvhE*I*1lzMARGC%_2-~KLP_Vj=V;Q_dYQa#z5*e9kIq;IdT8s^BvVseYt=>X0
z9@G)Ax?Gz{ZU;&Yh}s5l&%SR&mB?a)LXp1SHv
zb5;HXM*mFz&))%*Q@V
zp#c9l2!~(*9sl|3uQFvmKzK_HGAJt6?KEqVB_G{*-$XMd(&Hoi;)fae0s6-Qe*sXz
zstpe1XlesKd*%}nF;ZTst#m+_QRPZ^g*beww&rPjCAG=Vmv!9?IlAZs5E&>@R+7Dn
zwuN3)cY}lt%(}`bLat
zAyc$2JuL{jJPekpz*vtql#J3hg!4!TYKytYpr
zd1;U&zy1sDz^gED0jf>RTQ=sP4heq)E1xnHU6rK$`e2V-co7_#{7Q!h*
zGm7L#X_wqm*d`>4X7Gq?h=^3XPv|sPs7wHucxBSRgFDq3T3N1OBI#3#<0<8CZQU9?
zQ5z-o^0U$I>w~l1!-F68f82(9%cAvn?|q$Lh_j(J`#__Y;Ng3q+(|rgWr9UdvXdUR
zXknmx$TF?atcUt8Yv8ka&pgoSa!q9*_6i{PY(1oq3hZq}9I$Pk24S)Ud8&IM2&fsN
zuiFc*33zb^P`w7s!dlJ$xvfHFL2^2&AxZ1t{w6Y_E7%iO=^0R0gB8}`qs`kQ;|VWP
zxWkh(VS7}m*CKieHgbBa0K_$nftSbV7s9wf{}+Q_^C?fotLwG}A!10P_8-Oj7M
zrMB}&;s_ofE*hyMVW#zey^`W=x%ddDbtr^@t?vtxa0I)@6w^eXYR?_BBGxB@4DtF^
z@NdLfMwEh`jp@N8p~tGna~Oln95aVd#Dkzvk!C+x89YVa9hu9p%w9|q`Mxm5r9Jm#
z2iru;P6r>Vyi9sxB|BN}cGdy|Q7*b+2hnxUno*yev7~Q9{AtMr6zIfq+2i=l=
zqBFLwIQlSD>rhUCMA-I>Dr9tz!zfSWZX~0C-hi3|lzkHLmTn2F)Ywd{wKy)JZQDCv
zBK1l@3@`QU!n;>uo}g
z8OQX09OKvtPuqzV{bQF`K-OTbo(pIRYCB*?wvLjKE-1vfS9aaZq-|9SqQmqe(8Ul8
zG}fAUcxoAK8--EKacqZ)ws%1j_I*}(h|tbL-_GRL+5ztK-a9O|XJ!qEP|390hk``*^0%6#yK~fJ{0c2R=4``<-N_)~dg{F$doGsE(V{v~l%^5%^y{qiD
z7r`KbeMgh8HCX;}$8w>&su?fu!u}^6so6BoPdgN$C~%pLiqYBX>5Jrm3Jfa$)~-jj
zK#jB*UG;6$KUvfW?OIgH)`gc6R5)9z4Iu>I>EZNNwW!kRQgGKKv0;@|Tx+XeFV$TU
z;|F3;k3|MU4l?bLf{Id~MRdtE@|c6%Kgdd@1T**2++3sODJjl
zpN+9}iINAP`l~<|Ib4S}9<1}MUIg&l=KIA3G82|0s&-Mkp}w_+)|Csb6j+Hz7#$MO
zDOZ3Pu=t5Fm+0WNmU_13vA>&^h$5K;5Jy~kt^0WW1D>)bYtE6(M8t^(3ml$1M;R9S
z&LDk;oWEv96Nr_ICkV*~ZS3xW?m?_yr4VJxFvI?9w$=Z_SAr~%GcZoGgU&;2hSDx|
zL)F5YCMcx1rg)7w?mV;SBv=G4DzoxCAvN^&c!q6lh7cGX0cX6g9`$>KsO^(
z5SWHOIuvU3Z3V5EtargLuA%h#7DYA%ZaQk>H<%Q%W|=f*%^b+}H9YrRDNPSVxJt6W
zA;;xt?LCim`Ufta)j$^MXM*%IC>F>YoKK*?`y@jm8snLAeur|jEyhYfcaehI9~tJe
z*dJ#OkJiAACa62TMwKDxvwdu~c5XsQq9ayVIA(F0B>hwd&Ik%d0YsRu5W!-=UI?z|zN|~)u*f0joWx_cCa69{{6NY_KeCU>Gq`}Y?_<5!GM
zt>rF)zGCS2CEL~sr8{uqLgNm|#GLo5h7
z4(CjsHG!-R@*TlTmxCxqY!KknA>7zw38R51?l2gPi}}HA(WEuXRwci<%rEW|lGx{Pwu6V~f*Ezr-R5VwosV1GAg8+6g{^Eh=Cui!;H!pdOZz;w
zm;nK(;t_)8Z$$|cRBrd>OY^{f&vg$G$t=IFz%`+g=V)YOCuXQ(u@N3{frenu9mTuL
z?j1sroh1y?b9M_5!{}3gUrv)Myeh47ofl4QpbLQpM
z)*U*z>q1Ki45)2=IO();jv%9%_g+p#U)gp``CYA+ZMW3*S&?7n(EZkvd>p{8i?tCS
z(aFRKP`}aE2T}QKrLpZjg6Pm91k&2mlAN$md#(!l6PqA{5334mwH;L9xdge7So8+8
zvVg&V{2A$4TYnnnM|s8RI_s9{;tLdruk7m3FVeWaPHBhxLgVNGUNBA&gdN(1L
zr#K9a-;v5{;6;PXosM(c(yM3boQAp!XYuOOGYwB|Phc;lxxrv|oZgnbkv5luqIUIs
z@XWPD639AUmM{T4f^Ai~F#o|ngiqWt=^MKwrPYoSc?+BK6@B8zvZ<5Lc!DC7&|o)3
z3^gCkgk%oQjmQc-Ks|lIZZv9ixr4f9-p6Plp^H-6CZ~K8*K_a;@7{Ag68O-I{s>Xq
zbCI@*W`nz;vKA@7B{+J1Y4EkZgCQ&2?NX+GQPkK^G`RByCy{Ap9GVO$5Jmi~x0uxM
z3guc}d5x@}sCXg#mEK9IhrfU;8}ct;LH&e*{dNY6ZBoy0va{Zu#kt%|tSo+nL;F+M
zq6dS-FB~ij=>!
zdbbM^j6xkwTQ!W+Z&v?WKGw+ECyp94QK6OC7DgcTo&0^IRlGWrbd@Ke*QE;A&r+$A
zh+Gc8+Ud4(c$_^t7T*~N(2y{8Lc4Tl~0K>3Bz!BwsyF7!Gtc>IsW1&cVtggnkLsM1r_H^ZQ=7dwEm8J
z;Pu&Wd?+6+agsxxG^7D>l(dnRA~6~MTA;r3kvs|`vJwC(Du5i5^LE1h93A);&G<16
zz+PbyY`1Z+9t5z=OAhn)8M;wYcpW%9)U@nlJNKxm?cY?^f$VCM#@$7Dxs7=9i`cXm
zFM@0IV~tOn+@Ubh*4l8mY2Q_>*iz=ZcG%6VUJ>g#aGRZPgjm_r9N<7#wgibq;p^;^-E3fnHz~2!#=kc|4ox`
zKWZfJnuSj&*mH&e@^|d-Gg`~=L{HclRD4B+r(0oBct&m*YT1}PpZdnKU{{d
zQkgaXagJ>fg4Kfj75HWZGT8D`1mD{c&81XH@hu^rGvL>G#Be!&oTO;0qf;|O^%qb9
zE;aZZ0-8@rC=}|lrtBf1oNazw;JaXSL0!pE+U)%GD0wu1O4Ez$U4b0xr7J0usNmMg
zmPGUgCuxaoB|{9pjn-2+-wS!f?dQ(yZ?cDwcT@!<*8GY8BR5B{g0924!zvs#E*~yC
z*$ylVxdIlAtg_gGSO3#8o<1Ruf>AwsM5b{bOZFeZriC`{x8%Pk9$
zA0n<&v|JpZ5Mp&n1*2Eny`X3dCuD4w6_y!|rz93Z|1s2>5O>Gxmz6KyBa-_j=w?T438
ziR0^r_=&_%8*JGW)tpia9nAwXN0VhM&T@NVRfp}BOQDx189%g$w+FzjGFg07)gHZ9
z=qJ<8LAIj?E{|HE&iC*(%kfaF*se>}matJ& verbatim > extracted > derived > unknown
+// - semantic pass uses stored embeddings only; skipped (and reported) when
+// none exist; threshold respected, below-threshold never merged
+// - idempotence: a second execute finds nothing new
+// - cross-table candidates are report-only
+// - search hides marked duplicates by default; --include-duplicates shows them
+// - lineage rows persist full audit detail
+// - --delete (destructive opt-in) removes rows + embeddings; FK-referenced
+// duplicates are kept as marked instead of failing the transaction
+
+import { describe, test, expect, beforeEach, afterEach } from 'bun:test';
+import { setupTestDb, teardownTestDb } from '../helpers/setup';
+import { runDedup } from '../../src/commands/dedup';
+import { embeddingToBlob } from '../../src/lib/embeddings';
+import { getDb } from '../../src/db/connection';
+import { search } from '../../src/lib/memory';
+import {
+ createSession,
+ addMessage,
+ addDecision,
+ addBreadcrumb,
+ createLoaEntry,
+ supersedeDecision,
+} from '../../src/lib/memory';
+
+// Captured once at module load so afterEach can restore them.
+const originalLog = console.log;
+const originalError = console.error;
+const originalExitCode = process.exitCode;
+
+// Before each test: set up the test database (see ../helpers/setup) and
+// silence console output, since runDedup prints its report via console.log
+// and its validation errors via console.error.
+beforeEach(() => {
+ setupTestDb();
+ console.log = () => {};
+ console.error = () => {};
+});
+
+// After each test: restore the console and the exit code (runDedup sets
+// process.exitCode = 1 on invalid options), then tear the database down.
+afterEach(() => {
+ console.log = originalLog;
+ console.error = originalError;
+ process.exitCode = originalExitCode;
+ teardownTestDb();
+});
+
+// Long enough to clear MIN_DEDUP_TEXT_LENGTH after normalization.
+const CRUMB = 'Always use bun for every script in this repository, never npm.';
+const OTHER = 'A completely different breadcrumb about the release process.';
+
+/** Returns every `dedup_lineage` row, ordered by id, as loosely typed records. */
+function lineageRows(): Array<Record<string, unknown>> {
+  const rows = getDb().prepare('SELECT * FROM dedup_lineage ORDER BY id').all();
+  return rows as Array<Record<string, unknown>>;
+}
+
+/**
+ * Writes `vector` into the `embeddings` table for the record (`table`, `id`)
+ * using INSERT OR REPLACE, with model `'test'` and `dimensions` set to the
+ * vector length.
+ */
+function insertEmbedding(table: string, id: number, vector: number[]): void {
+  const upsert = getDb().prepare(
+ `INSERT OR REPLACE INTO embeddings (source_table, source_id, model, dimensions, embedding)
+ VALUES (?, ?, 'test', ?, ?)`
+  );
+  const dimensions = vector.length;
+  const blob = embeddingToBlob(vector);
+  upsert.run(table, id, dimensions, blob);
+}
+
+// Covers the three run modes of runDedup: dry run (default), --execute
+// (mark only), and the rejected --delete-without---execute combination.
+describe('dry-run vs execute', () => {
+ test('dry-run reports the plan and writes nothing', () => {
+ // Two identical breadcrumbs form one exact group; the third is unrelated.
+ addBreadcrumb({ content: CRUMB, importance: 5 });
+ addBreadcrumb({ content: CRUMB, importance: 5 });
+ addBreadcrumb({ content: OTHER, importance: 5 });
+
+ const result = runDedup({})!;
+ const crumbs = result.plan.tables.find(t => t.table === 'breadcrumbs')!;
+ expect(crumbs.exactGroups).toBe(1);
+ expect(crumbs.planned.length).toBe(1);
+ // Nothing was written: no apply result, no lineage rows, and all three
+ // records are still returned by search.
+ expect(result.applied).toBeNull();
+ expect(lineageRows().length).toBe(0);
+ expect(search('breadcrumb OR bun').length).toBe(3);
+ });
+
+ test('--execute marks duplicates non-destructively and search hides them', () => {
+ const id1 = addBreadcrumb({ content: CRUMB, importance: 5 });
+ const id2 = addBreadcrumb({ content: CRUMB, importance: 5 });
+
+ const result = runDedup({ execute: true })!;
+ expect(result.applied?.marked).toBe(1);
+
+ // Non-destructive: both records still exist.
+ const count = getDb().prepare('SELECT COUNT(*) AS c FROM breadcrumbs').get() as { c: number };
+ expect(count.c).toBe(2);
+
+ // Search hides the marked duplicate by default...
+ const hidden = search('bun');
+ expect(hidden.length).toBe(1);
+ // ...and shows it again with the explicit include option.
+ const shown = search('bun', { includeDuplicates: true });
+ expect(shown.length).toBe(2);
+ // Both sides use the same default sort, so the comparison is order-insensitive.
+ expect(shown.map(r => r.id).sort()).toEqual([id1, id2].sort());
+ });
+
+ test('--delete without --execute is rejected', () => {
+ // Validation failure: runDedup returns undefined and sets process.exitCode to 1.
+ expect(runDedup({ delete: true })).toBeUndefined();
+ expect(process.exitCode).toBe(1);
+ });
+});
+
+describe('exact detection and survivor selection', () => {
+ test('normalized variants dedupe; provenance picks the survivor', () => {
+ const extracted = addBreadcrumb({ content: CRUMB, importance: 9, provenance: 'extracted' });
+ const authored = addBreadcrumb({ content: CRUMB.toUpperCase(), importance: 1, provenance: 'user_authored' });
+ const legacy = addBreadcrumb({ content: ` ${CRUMB} `, importance: 9 });
+
+ runDedup({ execute: true });
+
+ const rows = lineageRows();
+ expect(rows.length).toBe(2);
+ for (const row of rows) {
+ expect(row.survivor_id).toBe(authored);
+ expect(row.survivor_table).toBe('breadcrumbs');
+ expect(row.reason).toBe('exact');
+ expect(row.status).toBe('marked');
+ }
+ expect(rows.map(r => r.duplicate_id).sort()).toEqual([extracted, legacy].sort());
+ });
+
+ test('identical text in different projects is not deduped', () => {
+ addBreadcrumb({ content: CRUMB, importance: 5, project: 'alpha' });
+ addBreadcrumb({ content: CRUMB, importance: 5, project: 'beta' });
+ const result = runDedup({ execute: true })!;
+ expect(result.applied?.marked).toBe(0);
+ });
+
+ test('short records are never candidates', () => {
+ addBreadcrumb({ content: 'ok', importance: 5 });
+ addBreadcrumb({ content: 'ok', importance: 5 });
+ const result = runDedup({})!;
+ const crumbs = result.plan.tables.find(t => t.table === 'breadcrumbs')!;
+ expect(crumbs.tooShort).toBe(2);
+ expect(crumbs.planned.length).toBe(0);
+ });
+
+ test('only active decisions participate', () => {
+ const text = 'Adopt SQLite WAL mode for every database connection in Recall.';
+ const stale = addDecision({ decision: text, status: 'active' });
+ supersedeDecision(stale);
+ addDecision({ decision: text, status: 'active' });
+
+ const result = runDedup({ execute: true })!;
+ expect(result.applied?.marked).toBe(0);
+ });
+
+ test('--table scopes the run', () => {
+ addBreadcrumb({ content: CRUMB, importance: 5 });
+ addBreadcrumb({ content: CRUMB, importance: 5 });
+ createSession({ session_id: 's1', started_at: '2026-01-01T00:00:00Z' });
+ addMessage({ session_id: 's1', timestamp: '2026-01-01T00:00:01Z', role: 'user', content: CRUMB });
+ addMessage({ session_id: 's1', timestamp: '2026-01-01T00:00:02Z', role: 'user', content: CRUMB });
+
+ const result = runDedup({ execute: true, table: 'messages' })!;
+ expect(result.plan.tables.length).toBe(1);
+ expect(result.applied?.marked).toBe(1);
+ expect(lineageRows().every(r => r.duplicate_table === 'messages')).toBe(true);
+ });
+
+ test('invalid --table and --threshold are rejected', () => {
+ expect(runDedup({ table: 'sessions' })).toBeUndefined();
+ expect(process.exitCode).toBe(1);
+ process.exitCode = 0;
+ expect(runDedup({ threshold: 1.5 })).toBeUndefined();
+ expect(process.exitCode).toBe(1);
+ });
+});
+
+describe('semantic pass', () => {
+ test('skipped with a clear reason when no embeddings exist', () => {
+ addBreadcrumb({ content: CRUMB, importance: 5 });
+ addBreadcrumb({ content: OTHER, importance: 5 });
+ const result = runDedup({})!;
+ expect(result.plan.semanticSkipped).toContain('embed');
+ });
+
+ test('skipped when disabled via --no-semantic', () => {
+ addBreadcrumb({ content: CRUMB, importance: 5 });
+ const result = runDedup({ semantic: false })!;
+ expect(result.plan.semanticSkipped).toContain('--no-semantic');
+ });
+
+ test('marks pairs at the threshold, never below it', () => {
+ const near = Math.sqrt(1 - 0.97 * 0.97);
+ const a = addBreadcrumb({ content: CRUMB, importance: 5 });
+ const b = addBreadcrumb({ content: OTHER, importance: 5 });
+ insertEmbedding('breadcrumbs', a, [1, 0, 0]);
+ insertEmbedding('breadcrumbs', b, [0.97, near, 0]);
+
+ // Above the default 0.95 threshold → marked, with similarity recorded.
+ const marked = runDedup({ execute: true })!;
+ expect(marked.applied?.marked).toBe(1);
+ const row = lineageRows()[0];
+ expect(row.reason).toBe('semantic');
+ expect(row.similarity as number).toBeCloseTo(0.97, 3);
+
+ // Survivor is the richer record (longer normalized text wins the tie).
+ const longer = CRUMB.length >= OTHER.length ? a : b;
+ expect(row.survivor_id).toBe(longer);
+ });
+
+ test('a stricter threshold leaves the same pair untouched', () => {
+ const near = Math.sqrt(1 - 0.97 * 0.97);
+ const a = addBreadcrumb({ content: CRUMB, importance: 5 });
+ const b = addBreadcrumb({ content: OTHER, importance: 5 });
+ insertEmbedding('breadcrumbs', a, [1, 0, 0]);
+ insertEmbedding('breadcrumbs', b, [0.97, near, 0]);
+
+ const result = runDedup({ execute: true, threshold: 0.99 })!;
+ expect(result.applied?.marked).toBe(0);
+ expect(lineageRows().length).toBe(0);
+ });
+});
+
+describe('idempotence', () => {
+ test('a second execute finds nothing new', () => {
+ addBreadcrumb({ content: CRUMB, importance: 5 });
+ addBreadcrumb({ content: CRUMB, importance: 5 });
+ addBreadcrumb({ content: CRUMB, importance: 5 });
+
+ const first = runDedup({ execute: true })!;
+ expect(first.applied?.marked).toBe(2);
+ const after = lineageRows().length;
+
+ const second = runDedup({ execute: true })!;
+ expect(second.applied?.marked).toBe(0);
+ const crumbs = second.plan.tables.find(t => t.table === 'breadcrumbs')!;
+ expect(crumbs.alreadyMarked).toBe(2);
+ expect(lineageRows().length).toBe(after);
+ });
+});
+
+describe('cross-table candidates are report-only', () => {
+ test('reported, never marked', () => {
+ createSession({ session_id: 's1', started_at: '2026-01-01T00:00:00Z' });
+ addMessage({ session_id: 's1', timestamp: '2026-01-01T00:00:01Z', role: 'user', content: CRUMB });
+ addBreadcrumb({ content: CRUMB, importance: 5 });
+
+ const result = runDedup({ execute: true })!;
+ expect(result.plan.crossTable.textMatches.length).toBe(1);
+ expect(result.applied?.marked).toBe(0);
+ expect(lineageRows().length).toBe(0);
+ // Both records remain searchable.
+ expect(search('bun').length).toBe(2);
+ });
+});
+
+describe('lineage persistence', () => {
+ test('rows carry full audit detail', () => {
+ const survivor = addBreadcrumb({ content: CRUMB, importance: 5, provenance: 'user_authored', project: 'demo' });
+ const dup = addBreadcrumb({ content: CRUMB, importance: 5, provenance: 'extracted', project: 'demo' });
+
+ runDedup({ execute: true });
+
+ const row = lineageRows()[0];
+ expect(row.survivor_table).toBe('breadcrumbs');
+ expect(row.survivor_id).toBe(survivor);
+ expect(row.duplicate_table).toBe('breadcrumbs');
+ expect(row.duplicate_id).toBe(dup);
+ expect(row.reason).toBe('exact');
+ expect(row.similarity).toBe(1);
+ expect(row.status).toBe('marked');
+ expect(typeof row.created_at).toBe('string');
+ const detail = JSON.parse(row.detail as string);
+ expect(detail.survivor_provenance).toBe('user_authored');
+ expect(detail.duplicate_provenance).toBe('extracted');
+ expect(detail.project).toBe('demo');
+ });
+});
+
+describe('destructive opt-in (--execute --delete)', () => {
+ test('deletes duplicates and their embeddings; lineage records the deletion', () => {
+ const survivor = addBreadcrumb({ content: CRUMB, importance: 5, provenance: 'user_authored' });
+ const dup = addBreadcrumb({ content: CRUMB, importance: 5 });
+ insertEmbedding('breadcrumbs', dup, [1, 0, 0]);
+
+ const result = runDedup({ execute: true, delete: true })!;
+ expect(result.applied?.deleted).toBe(1);
+
+ const db = getDb();
+ const remaining = db.prepare('SELECT id FROM breadcrumbs').all() as Array<{ id: number }>;
+ expect(remaining.map(r => r.id)).toEqual([survivor]);
+ const embeddings = db.prepare('SELECT COUNT(*) AS c FROM embeddings').get() as { c: number };
+ expect(embeddings.c).toBe(0);
+ expect(lineageRows()[0].status).toBe('deleted');
+ });
+
+ test('FK-referenced duplicates are kept as marked instead of failing', () => {
+ createSession({ session_id: 's1', started_at: '2026-01-01T00:00:00Z' });
+ const survivor = addMessage({ session_id: 's1', timestamp: '2026-01-02T00:00:00Z', role: 'user', content: CRUMB });
+ const referenced = addMessage({ session_id: 's1', timestamp: '2026-01-01T00:00:00Z', role: 'user', content: CRUMB });
+ // The older message loses survivorship (recency tie-break) but is pinned
+ // by an LoA message range — hard-deleting it would violate the FK.
+ createLoaEntry({
+ title: 'entry', fabric_extract: 'body',
+ message_range_start: referenced, message_range_end: referenced,
+ });
+
+ const result = runDedup({ execute: true, delete: true })!;
+ expect(result.applied?.deleted).toBe(0);
+ expect(result.applied?.fkProtected).toBe(1);
+
+ const row = lineageRows()[0];
+ expect(row.duplicate_id).toBe(referenced);
+ expect(row.survivor_id).toBe(survivor);
+ expect(row.status).toBe('marked');
+ // The record still exists, just hidden.
+ const msg = getDb().prepare('SELECT id FROM messages WHERE id = ?').get(referenced);
+ expect(msg).toBeDefined();
+ });
+});
diff --git a/tests/commands/export.test.ts b/tests/commands/export.test.ts
index b7e0274..799c1f7 100644
--- a/tests/commands/export.test.ts
+++ b/tests/commands/export.test.ts
@@ -134,6 +134,7 @@ describe('manifest', () => {
learnings: 1,
breadcrumbs: 1,
loa_entries: 1,
+ dedup_lineage: 0,
});
expect(manifest.provenance_counts.messages).toEqual({ unknown: 1, verbatim: 1 });
expect(manifest.provenance_counts.decisions).toEqual({ unknown: 1, user_authored: 1 });
@@ -199,6 +200,7 @@ describe('SQL dump', () => {
learnings: 1,
breadcrumbs: 1,
loa_entries: 1,
+ dedup_lineage: 0,
});
// Legacy NULL restores as NULL; known values survive verbatim
diff --git a/tests/db/migrations.test.ts b/tests/db/migrations.test.ts
index 8a52d55..62added 100644
--- a/tests/db/migrations.test.ts
+++ b/tests/db/migrations.test.ts
@@ -188,7 +188,8 @@ describe('MIGRATIONS array', () => {
test('has expected number of migrations', () => {
// 7 → 8: importance column on messages/decisions/learnings/loa_entries (Sprint #4)
// 8 → 9: provenance column on all five memory tables (issue #42)
- expect(MIGRATIONS.length).toBe(9);
+ // 9 → 10: dedup_lineage table (issue #45)
+ expect(MIGRATIONS.length).toBe(10);
});
test('all entries are functions', () => {
diff --git a/tests/lib/dedup.test.ts b/tests/lib/dedup.test.ts
new file mode 100644
index 0000000..e6ff9dd
--- /dev/null
+++ b/tests/lib/dedup.test.ts
@@ -0,0 +1,200 @@
+// recall dedup — pure logic (issue #45).
+//
+// Unit tests over the pure exported functions: normalization, survivor
+// selection (provenance > richness > importance > recency > id), exact
+// grouping, cross-table matching, and semantic pairing. Property-based
+// suites over these functions are issue #44 phase 2 — not here.
+
+import { describe, test, expect } from 'bun:test';
+import {
+ compareForSurvivor,
+ DEFAULT_SEMANTIC_THRESHOLD,
+ findCrossTableMatches,
+ findExactGroups,
+ findSemanticPairs,
+ normalizeText,
+ provenanceRank,
+ selectSurvivor,
+ type DedupCandidate,
+} from '../../src/lib/dedup';
+import { PROVENANCE_VALUES } from '../../src/types/index';
+
+function candidate(overrides: Partial<DedupCandidate> = {}): DedupCandidate {
+ return {
+ table: 'breadcrumbs',
+ id: 1,
+ project: null,
+ provenance: null,
+ importance: 5,
+ created_at: '2026-01-01 00:00:00',
+ key: 'k',
+ textLength: 50,
+ ...overrides,
+ };
+}
+
+describe('normalizeText', () => {
+ test('lowercases, strips quotes, collapses whitespace', () => {
+ expect(normalizeText(` Use "Bun" for\n'everything' `)).toBe('use bun for everything');
+ });
+
+ test('identical meaning with different casing/spacing normalizes equal', () => {
+ expect(normalizeText('Ship THE feature')).toBe(normalizeText("ship the 'feature'"));
+ });
+});
+
+describe('provenanceRank', () => {
+ test('orders user_authored > verbatim > extracted > derived > unknown', () => {
+ const ranks = [...PROVENANCE_VALUES, null].map(p => provenanceRank(p));
+ expect(ranks).toEqual([...ranks].sort((a, b) => a - b));
+ expect(provenanceRank('user_authored')).toBeLessThan(provenanceRank('verbatim'));
+ expect(provenanceRank('verbatim')).toBeLessThan(provenanceRank('extracted'));
+ expect(provenanceRank('extracted')).toBeLessThan(provenanceRank('derived'));
+ expect(provenanceRank('derived')).toBeLessThan(provenanceRank(null));
+ });
+});
+
+describe('selectSurvivor', () => {
+ test('provenance dominates richness, importance, and recency', () => {
+ const weakButAuthored = candidate({ id: 1, provenance: 'user_authored', textLength: 10, importance: 1, created_at: '2020-01-01 00:00:00' });
+ const richButExtracted = candidate({ id: 2, provenance: 'extracted', textLength: 9999, importance: 10, created_at: '2026-01-01 00:00:00' });
+ const { survivor, duplicates } = selectSurvivor([richButExtracted, weakButAuthored]);
+ expect(survivor.id).toBe(1);
+ expect(duplicates.map(d => d.id)).toEqual([2]);
+ });
+
+ test('richness breaks provenance ties', () => {
+ const short = candidate({ id: 1, textLength: 10 });
+ const long = candidate({ id: 2, textLength: 200 });
+ expect(selectSurvivor([short, long]).survivor.id).toBe(2);
+ });
+
+ test('importance breaks richness ties', () => {
+ const low = candidate({ id: 1, importance: 3 });
+ const high = candidate({ id: 2, importance: 8 });
+ expect(selectSurvivor([low, high]).survivor.id).toBe(2);
+ });
+
+ test('recency breaks importance ties', () => {
+ const older = candidate({ id: 1, created_at: '2025-01-01 00:00:00' });
+ const newer = candidate({ id: 2, created_at: '2026-01-01 00:00:00' });
+ expect(selectSurvivor([older, newer]).survivor.id).toBe(2);
+ });
+
+ test('lowest id is the final deterministic tie-break', () => {
+ const a = candidate({ id: 7 });
+ const b = candidate({ id: 3 });
+ expect(selectSurvivor([a, b]).survivor.id).toBe(3);
+ // Total order: comparator never reports equality for distinct ids.
+ expect(compareForSurvivor(a, b)).toBeGreaterThan(0);
+ });
+
+ test('throws on an empty group', () => {
+ expect(() => selectSurvivor([])).toThrow();
+ });
+});
+
+describe('findExactGroups', () => {
+ test('groups same table + project + key; singletons excluded', () => {
+ const groups = findExactGroups([
+ candidate({ id: 1, key: 'a' }),
+ candidate({ id: 2, key: 'a' }),
+ candidate({ id: 3, key: 'b' }),
+ ]);
+ expect(groups.length).toBe(1);
+ expect(groups[0].map(c => c.id).sort()).toEqual([1, 2]);
+ });
+
+ test('identical text in different projects is not grouped', () => {
+ const groups = findExactGroups([
+ candidate({ id: 1, key: 'a', project: 'alpha' }),
+ candidate({ id: 2, key: 'a', project: 'beta' }),
+ ]);
+ expect(groups.length).toBe(0);
+ });
+});
+
+describe('findCrossTableMatches', () => {
+ test('reports same text across tables, ignores single-table matches', () => {
+ const matches = findCrossTableMatches([
+ candidate({ id: 1, key: 'a', table: 'breadcrumbs' }),
+ candidate({ id: 2, key: 'a', table: 'messages' }),
+ candidate({ id: 3, key: 'b', table: 'breadcrumbs' }),
+ candidate({ id: 4, key: 'b', table: 'breadcrumbs' }),
+ ]);
+ expect(matches.length).toBe(1);
+ expect(matches[0].members.map(m => m.table).sort()).toEqual(['breadcrumbs', 'messages']);
+ });
+});
+
+describe('findSemanticPairs', () => {
+ const unit = (x: number, y: number, z: number) => [x, y, z];
+
+ test('pairs at/above threshold; orthogonal vectors never pair', () => {
+ const near = Math.sqrt(1 - 0.97 * 0.97);
+ const pairs = findSemanticPairs(
+ [
+ { candidate: candidate({ id: 1, key: 'a' }), embedding: unit(1, 0, 0) },
+ { candidate: candidate({ id: 2, key: 'b' }), embedding: unit(0.97, near, 0) },
+ { candidate: candidate({ id: 3, key: 'c' }), embedding: unit(0, 0, 1) },
+ ],
+ DEFAULT_SEMANTIC_THRESHOLD
+ );
+ expect(pairs.length).toBe(1);
+ expect([pairs[0].a.id, pairs[0].b.id].sort()).toEqual([1, 2]);
+ expect(pairs[0].similarity).toBeCloseTo(0.97, 5);
+ });
+
+ test('below-threshold pairs are never produced', () => {
+ const near = Math.sqrt(1 - 0.97 * 0.97);
+ const pairs = findSemanticPairs(
+ [
+ { candidate: candidate({ id: 1, key: 'a' }), embedding: unit(1, 0, 0) },
+ { candidate: candidate({ id: 2, key: 'b' }), embedding: unit(0.97, near, 0) },
+ ],
+ 0.99
+ );
+ expect(pairs.length).toBe(0);
+ });
+
+ test('same-table pairs across projects are ignored; cross-table pairs are flagged', () => {
+ const pairs = findSemanticPairs(
+ [
+ { candidate: candidate({ id: 1, key: 'a', project: 'alpha' }), embedding: unit(1, 0, 0) },
+ { candidate: candidate({ id: 2, key: 'b', project: 'beta' }), embedding: unit(1, 0, 0) },
+ { candidate: candidate({ id: 3, key: 'c', table: 'messages', project: 'alpha' }), embedding: unit(1, 0, 0) },
+ ],
+ 0.95
+ );
+ // 1↔2: same table, different projects — ignored.
+ // 1↔3 and 2↔3: cross-table — report-only pairs.
+ expect(pairs.every(p => !p.sameTable)).toBe(true);
+ expect(pairs.length).toBe(2);
+ });
+
+ test('identical normalized keys are left to the exact pass', () => {
+ const pairs = findSemanticPairs(
+ [
+ { candidate: candidate({ id: 1, key: 'same' }), embedding: unit(1, 0, 0) },
+ { candidate: candidate({ id: 2, key: 'same' }), embedding: unit(1, 0, 0) },
+ ],
+ 0.95
+ );
+ expect(pairs.length).toBe(0);
+ });
+
+ test('results are sorted strongest-first', () => {
+ const mid = Math.sqrt(1 - 0.96 * 0.96);
+ const near = Math.sqrt(1 - 0.99 * 0.99);
+ const pairs = findSemanticPairs(
+ [
+ { candidate: candidate({ id: 1, key: 'a' }), embedding: unit(1, 0, 0) },
+ { candidate: candidate({ id: 2, key: 'b' }), embedding: unit(0.99, near, 0) },
+ { candidate: candidate({ id: 3, key: 'c' }), embedding: unit(0.96, mid, 0) },
+ ],
+ 0.95
+ );
+ const sims = pairs.map(p => p.similarity);
+ expect(sims).toEqual([...sims].sort((a, b) => b - a));
+ });
+});
From 167f1b8dabb0c85c5537e10de6e8ec4e47514b37 Mon Sep 17 00:00:00 2001
From: Ed Heltzel <402910+edheltzel@users.noreply.github.com>
Date: Thu, 11 Jun 2026 03:59:49 -0400
Subject: [PATCH 2/2] =?UTF-8?q?fix:=20never=20re-mark=20a=20planned=20surv?=
=?UTF-8?q?ivor=20=E2=80=94=20one-hop=20dedup=20lineage;=20de-binary=20ded?=
=?UTF-8?q?up.ts?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Resolves both blockers from the PR #60 review (comment 4676730890):
- planDedup now tracks plannedSurvivorKeys in both passes and skips any
semantic pair whose member is already a planned survivor, so lineage
stays one hop deep — no transitive chaining within or across passes.
Regression test pins the reviewer's A-B-C repro (cos 0.99/0.96/0.91
at the default 0.95 threshold).
- The raw NUL groupKey separators are now \x00 escape sequences, so git
diffs src/lib/dedup.ts as text; the groupKey comment names the
separator choice.
---
src/lib/dedup.ts | Bin 22314 -> 22875 bytes
tests/commands/dedup.test.ts | 31 +++++++++++++++++++++++++++++++
2 files changed, 31 insertions(+)
diff --git a/src/lib/dedup.ts b/src/lib/dedup.ts
index cd40610a8f935aaf96882a2aff362ea6982130c3..fbfce6e271270aacb669d052c19736bd2cfeeb2a 100644
GIT binary patch
delta 623
zcma)(!D>@M6oyGEE<))-ZbiEKg+S60Z@UnLQfTQy2`#$N?p)+#<|KF0>&!4SNxifo
zeTVcJy68g1Y+Q-&;m#*5
zQk_-Szpp&F{rz|{XspLNA7Jx10?W5UH&P?xW@`s;s=If@BQAVOQZ_sFb3Zqxfx0=5
zcb60;$DP@`!_N;>f@5KvT%>~{D9EABm7u`oDmBdC_V#qI{dCfbyUS?0r*O%NMSC{{SMKF`cqo&atH2?Md=@H|lc&AE@v!nvod`$!uFwY8TH{
k56$~{WaOI^$!y}{VzK96ir?CKCC+igy-{_a(|`IxsC
diff --git a/tests/commands/dedup.test.ts b/tests/commands/dedup.test.ts
index cc60f98..5c309b0 100644
--- a/tests/commands/dedup.test.ts
+++ b/tests/commands/dedup.test.ts
@@ -213,6 +213,37 @@ describe('semantic pass', () => {
expect(result.applied?.marked).toBe(0);
expect(lineageRows().length).toBe(0);
});
+
+ test('a planned survivor is never re-marked by a weaker pair (no transitive chaining)', () => {
+ // PR #60 review repro: cos(A,B)=0.99, cos(B,C)=0.96, cos(A,C)≈0.91 with
+ // the default 0.95 threshold. Ascending text length pins the survivor
+ // orientation: B survives the strongest pair, so without the survivor
+ // guard the weaker B/C pair re-marks B — leaving A's only visible
+ // neighbor at 0.91, below the threshold.
+ const a = addBreadcrumb({ content: 'Review queue triage happens before standup.', importance: 5 });
+ const b = addBreadcrumb({ content: 'Review queue triage always happens before the standup.', importance: 5 });
+ const c = addBreadcrumb({ content: 'Review queue triage must always happen before the morning standup.', importance: 5 });
+ insertEmbedding('breadcrumbs', a, [0.99, Math.sqrt(1 - 0.99 * 0.99), 0]);
+ insertEmbedding('breadcrumbs', b, [1, 0, 0]);
+ insertEmbedding('breadcrumbs', c, [0.96, -Math.sqrt(1 - 0.96 * 0.96), 0]);
+
+ const result = runDedup({ execute: true })!;
+
+ // Only the strongest pair is marked; B/C is skipped because B already
+ // survives A.
+ expect(result.applied?.marked).toBe(1);
+ const rows = lineageRows();
+ expect(rows.length).toBe(1);
+ expect(rows[0].duplicate_id).toBe(a);
+ expect(rows[0].survivor_id).toBe(b);
+ expect(rows[0].similarity as number).toBeCloseTo(0.99, 3);
+
+ // One-hop lineage invariant: no survivor is itself marked as a duplicate.
+ const duplicates = new Set(rows.map(r => `${r.duplicate_table}:${r.duplicate_id}`));
+ for (const row of rows) {
+ expect(duplicates.has(`${row.survivor_table}:${row.survivor_id}`)).toBe(false);
+ }
+ });
});
describe('idempotence', () => {