From 6047ece203725c9b87ddc27eeb43165d7c40ecc5 Mon Sep 17 00:00:00 2001 From: Ed Heltzel <402910+edheltzel@users.noreply.github.com> Date: Thu, 11 Jun 2026 16:49:50 -0400 Subject: [PATCH 1/2] fix(dedup): make recorded survivors sticky across runs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cross-run re-marking of a prior survivor could break the visible-survivor safety property: dedup_lineage accumulated marked->marked chains, leaving hidden (or, with --delete, permanently removed) records whose only visible neighbor was never measured against them. - Sticky survivors (option a): planDedup loads survivor keys from dedup_lineage (status 'marked'/'deleted', mirroring loadMarkedDuplicates) and seeds plannedSurvivorKeys with them, extending the within-plan one-hop guard across runs; the exact pass skips them as duplicates too. Skips are reported per table (stickySkipped) and in the CLI summary. - Destructive-mode guard (option c, defense-in-depth): applyDedupPlan refuses — throws before writing — when a planned duplicate is itself a recorded survivor; runDedup surfaces the refusal as a clean error with exit code 1. - Property suite: cross-run sticky-survivor property (two waves plus angle-targeted challengers) pins 'a recorded survivor is never re-marked across runs' and final-lineage survivor/duplicate disjointness; the scope note now states the cross-run invariant. Mutation-checked: removing the exclusion fails exactly the new property. Chain compression (option b) and mark->delete escalation (#73) are deliberately untouched. Closes #63 --- docs/cli-reference.md | 7 ++ src/commands/dedup.ts | 12 ++- src/lib/dedup.ts | 68 +++++++++++++- tests/lib/dedup.property.test.ts | 154 +++++++++++++++++++++++++++++-- 4 files changed, 229 insertions(+), 12 deletions(-) diff --git a/docs/cli-reference.md b/docs/cli-reference.md index f0a265f..bbd7fcd 100644 --- a/docs/cli-reference.md +++ b/docs/cli-reference.md @@ -336,6 +336,13 @@ Safety model: exist; records are never merged below the configured `--threshold` (conservative default: 0.95 cosine similarity). Records with fewer than 20 significant characters are never candidates. +- **Survivors are sticky.** A record recorded as a survivor in + `dedup_lineage` is never re-marked as a duplicate by a later run, so every + hidden or deleted record keeps a visible survivor across runs. The + tradeoff is order-dependence: an early survivor is never consolidated + under a later, higher-priority record. As defense-in-depth, `--delete` + refuses outright (no changes written) if a plan would delete a recorded + survivor. - **Lifecycle-aware.** Only `active` decisions participate; superseded and reverted decisions are managed by the decision lifecycle, not dedup. diff --git a/src/commands/dedup.ts b/src/commands/dedup.ts index 7c865bf..35b28af 100644 --- a/src/commands/dedup.ts +++ b/src/commands/dedup.ts @@ -83,6 +83,7 @@ export function runDedup(options: DedupOptions = {}): DedupRunResult | undefined const skipped: string[] = []; if (report.alreadyMarked > 0) skipped.push(`${report.alreadyMarked} already marked`); if (report.tooShort > 0) skipped.push(`${report.tooShort} too short`); + if (report.stickySkipped > 0) skipped.push(`${report.stickySkipped} prior survivor(s) kept visible`); const skippedNote = skipped.length > 0 ? ` (${skipped.join(', ')})` : ''; console.log( `${report.table}: scanned ${report.scanned}, exact groups ${report.exactGroups}, ` + @@ -132,7 +133,16 @@ export function runDedup(options: DedupOptions = {}): DedupRunResult | undefined return { plan, applied: null }; } - const applied = applyDedupPlan(db, plan, { destructive }); + let applied: ApplyResult; + try { + applied = applyDedupPlan(db, plan, { destructive }); + } catch (err) { + // The destructive survivor guard (issue #63) throws before writing; + // surface its message as a clean refusal instead of a stack trace. + console.error(err instanceof Error ? err.message : String(err)); + process.exitCode = 1; + return undefined; + } if (destructive) { const fkNote = applied.fkProtected > 0 ? ` (${applied.fkProtected} kept as marked — referenced by LoA lineage)` diff --git a/src/lib/dedup.ts b/src/lib/dedup.ts index fbfce6e..061a1d5 100644 --- a/src/lib/dedup.ts +++ b/src/lib/dedup.ts @@ -18,6 +18,13 @@ // - Semantic detection compares stored embeddings pairwise — no embedding // service call is needed. Pairs are never chained transitively: every // marked duplicate has a direct similarity >= threshold to its survivor. +// - Survivors are sticky across runs (issue #63): a record recorded as a +// survivor in dedup_lineage (status 'marked' or 'deleted') is never +// re-marked as a duplicate by a later run, so every hidden or deleted +// record keeps a visible survivor. Conservative and order-dependent by +// design — an early survivor is never consolidated under a later record. +// Destructive mode independently refuses to delete a recorded survivor +// (defense-in-depth should planning ever regress). // // Bind-count note (see src/lib/chunk.ts): scans use keyset pagination // (`WHERE id > ? LIMIT ?`, fixed binds). Destructive deletion builds @@ -347,6 +354,21 @@ export function loadMarkedDuplicates(db: Database): Set { return new Set(rows.map(r => `${r.duplicate_table}:${r.duplicate_id}`)); } +/** + * (table, id) pairs recorded as survivors in dedup_lineage — sticky across + * runs (issue #63): their hidden ('marked') or removed ('deleted') + * duplicates rely on the survivor staying visible, so these records are + * never re-marked. 'reverted' rows carry no such obligation — the duplicate + * is visible again. + */ +export function loadRecordedSurvivors(db: Database): Set { + const rows = db.prepare( + `SELECT DISTINCT survivor_table, survivor_id FROM dedup_lineage + WHERE status IN ('marked', 'deleted')` + ).all() as Array<{ survivor_table: string; survivor_id: number }>; + return new Set(rows.map(r => `${r.survivor_table}:${r.survivor_id}`)); +} + /** Stored embeddings for one table, keyed by source row id. */ export function loadEmbeddings(db: Database, table: ProvenanceTable): Map { const rows = db.prepare( @@ -394,6 +416,8 @@ export interface DedupTableReport { scanned: number; tooShort: number; alreadyMarked: number; + /** Marks not planned because a sticky prior survivor was involved (issue #63). */ + stickySkipped: number; exactGroups: number; semanticPairs: number; planned: LineageEntry[]; @@ -451,13 +475,18 @@ export function planDedup(db: Database, options: PlanDedupOptions = {}): DedupPl const threshold = options.threshold ?? DEFAULT_SEMANTIC_THRESHOLD; const semantic = options.semantic ?? true; const marked = loadMarkedDuplicates(db); + // Sticky survivors (issue #63): seeding plannedSurvivorKeys with survivors + // recorded by prior runs extends the within-plan one-hop guard across + // runs — a recorded survivor is never re-marked, so its hidden or deleted + // duplicates always keep a visible representative. + const stickySurvivors = loadRecordedSurvivors(db); const reports = new Map(); const eligible = new Map(); // Exact pass — within table + project. const plannedDuplicateKeys = new Set(); - const plannedSurvivorKeys = new Set(); + const plannedSurvivorKeys = new Set(stickySurvivors); for (const table of tables) { const scan = scanCandidates(db, table, options.project); const fresh = scan.candidates.filter(c => !marked.has(`${c.table}:${c.id}`)); @@ -466,6 +495,7 @@ export function planDedup(db: Database, options: PlanDedupOptions = {}): DedupPl scanned: scan.scanned, tooShort: scan.tooShort, alreadyMarked: scan.candidates.length - fresh.length, + stickySkipped: 0, exactGroups: 0, semanticPairs: 0, planned: [], @@ -475,6 +505,12 @@ export function planDedup(db: Database, options: PlanDedupOptions = {}): DedupPl const { survivor, duplicates } = selectSurvivor(group); plannedSurvivorKeys.add(`${survivor.table}:${survivor.id}`); for (const dup of duplicates) { + // A sticky prior survivor stays visible even when a newer record + // out-ranks it within its exact group (issue #63). + if (stickySurvivors.has(`${dup.table}:${dup.id}`)) { + report.stickySkipped++; + continue; + } report.planned.push(toLineage(survivor, dup, 'exact', 1.0)); plannedDuplicateKeys.add(`${dup.table}:${dup.id}`); } @@ -520,10 +556,16 @@ export function planDedup(db: Database, options: PlanDedupOptions = {}): DedupPl const bKey = `${pair.b.table}:${pair.b.id}`; // Greedy, strongest-first: a record planned as a duplicate can // neither survive nor be re-marked, and a record planned as a - // survivor (in either pass) can never be re-marked by a weaker - // pair — lineage stays one hop deep, no transitive chaining. + // survivor (in either pass, this run or — via the sticky seed — any + // prior run) can never be re-marked by a weaker pair — lineage stays + // one hop deep, no transitive chaining. if (plannedDuplicateKeys.has(aKey) || plannedDuplicateKeys.has(bKey)) continue; - if (plannedSurvivorKeys.has(aKey) || plannedSurvivorKeys.has(bKey)) continue; + if (plannedSurvivorKeys.has(aKey) || plannedSurvivorKeys.has(bKey)) { + if (stickySurvivors.has(aKey) || stickySurvivors.has(bKey)) { + reports.get(pair.a.table)!.stickySkipped++; + } + continue; + } const { survivor, duplicates } = selectSurvivor([pair.a, pair.b]); const report = reports.get(pair.a.table)!; report.semanticPairs++; @@ -553,7 +595,10 @@ export interface ApplyResult { * Apply a plan: insert lineage rows ('marked' by default). With * `destructive`, duplicate rows are hard-deleted (with their embeddings) and * lineage status is 'deleted' — except FK-referenced rows, which stay marked. - * Runs in one transaction; failures roll back everything. + * Runs in one transaction; failures roll back everything. Destructive mode + * refuses (throws, nothing written) when a planned duplicate is itself a + * recorded survivor in dedup_lineage — planDedup never produces such a plan, + * so this guard is defense-in-depth for issue #63. */ export function applyDedupPlan( db: Database, @@ -565,6 +610,19 @@ export function applyDedupPlan( const result: ApplyResult = { marked: 0, deleted: 0, fkProtected: 0 }; if (entries.length === 0) return result; + if (destructive) { + const survivors = loadRecordedSurvivors(db); + const conflicts = entries.filter(e => survivors.has(`${e.duplicate_table}:${e.duplicate_id}`)); + if (conflicts.length > 0) { + const sample = conflicts[0]; + throw new Error( + `destructive dedup refused: ${conflicts.length} planned duplicate(s) are recorded survivors ` + + `in dedup_lineage (e.g. ${sample.duplicate_table}#${sample.duplicate_id}). Deleting a survivor ` + + `would leave its marked or deleted duplicates with no visible record. No changes were made.` + ); + } + } + const insert = db.prepare(` INSERT INTO dedup_lineage (survivor_table, survivor_id, duplicate_table, duplicate_id, reason, similarity, status, detail) diff --git a/tests/lib/dedup.property.test.ts b/tests/lib/dedup.property.test.ts index 22209c3..094aa4a 100644 --- a/tests/lib/dedup.property.test.ts +++ b/tests/lib/dedup.property.test.ts @@ -10,12 +10,13 @@ // recency > lowest id; cos of the angle difference), never read back from // the implementation. // -// Scope note: the one-hop lineage guarantee is WITHIN a single plan ("a -// planned survivor never re-marked" — the PR #60 blocker fix). Across runs, -// a prior survivor may legitimately be re-marked by a later semantic pass; -// the repeated-runs property therefore asserts auditability (uniqueness, -// record preservation, no re-marking of marked records), not cross-run -// one-hop. +// Scope note: the one-hop lineage guarantee holds WITHIN a single plan ("a +// planned survivor never re-marked" — the PR #60 blocker fix) AND across +// runs (issue #63 sticky survivors: a survivor recorded in dedup_lineage is +// never re-marked by a later run, so every hidden or deleted record keeps a +// visible survivor). The repeated-runs property asserts auditability +// (uniqueness, record preservation, no re-marking of marked records); the +// cross-run sticky-survivor property below pins the cross-run invariant. // // Generator sizes are bounded (≤5 groups × ≤4 members) so the suite stays // practical under normal `bun test` runs. @@ -459,3 +460,144 @@ describe('dedup apply, idempotence, and repeated-run auditability', () => { ); }); }); + +describe('dedup cross-run sticky survivors (issue #63)', () => { + const thresholdArb = fc.integer({ min: 50, max: 99 }).map(n => n / 100); + + // A challenger shadows one wave-1 record: same table, project, and + // embedding angle (so it pairs at/above any threshold with whatever + // visible record that angle reaches — recorded survivors included), with + // top provenance, importance, and richness so it out-ranks incumbents. + // Without sticky survivors, a challenger landing on a recorded survivor + // re-marks it — exactly the issue #63 hazard. + interface Challenger { + /** Index into the wave-1 records (mod length). */ + target: number; + extraLen: number; + day: number; + } + const challengerArb: fc.Arbitrary = fc.record({ + target: fc.nat({ max: 30 }), + extraLen: fc.integer({ min: 40, max: 60 }), + day: dayArb, + }); + + function insertChallengers(db: Database, challengers: Challenger[], wave1: InsRecord[]): void { + const records: InsRecord[] = challengers.map((c, i) => { + const target = wave1[c.target % wave1.length]; + const text = `cross-run challenger ${i} body text ${'x'.repeat(c.extraLen)}`; + // Wave-1 records all come from GenMember, so the project narrowing is sound. + const meta = { + project: target.project as GenMember['project'], + provenance: 'user_authored' as Prov, + importance: 10, + day: c.day, + }; + const id = insertRecord(db, target.table, text, meta); + return { + table: target.table, + id, + project: target.project, + provenance: meta.provenance, + importance: meta.importance, + createdAt: createdAt(c.day), + normalized: text, + angleDeg: target.angleDeg, + }; + }); + insertEmbeddings(db, records); + } + + test('a recorded survivor is never re-marked by a later run, in plan or in final lineage', () => { + fc.assert( + fc.property( + corpusArb, + corpusArb, + fc.array(challengerArb, { minLength: 1, maxLength: 3 }), + thresholdArb, + (wave1Corpus, wave2Corpus, challengers, threshold) => { + withDb(db => { + const wave1 = insertCorpus(db, wave1Corpus); + insertEmbeddings(db, wave1); + applyDedupPlan(db, planDedup(db, { semantic: true, threshold })); + const survivorsAfterRun1 = new Set( + markedLineageTuples(db).map(t => `${t[0]}:${t[1]}`) + ); + + // Wave 2 arrives between runs: a second corpus reusing the same + // canonical texts (new members join wave-1 exact groups and can + // out-rank the recorded survivor) plus angle-targeted semantic + // challengers. + const wave2 = insertCorpus(db, wave2Corpus); + insertEmbeddings(db, wave2); + insertChallengers(db, challengers, wave1); + + // Sticky: run 2 never plans a run-1 survivor as a duplicate. + const plan2 = planDedup(db, { semantic: true, threshold }); + for (const e of plan2.tables.flatMap(t => t.planned)) { + expect(survivorsAfterRun1.has(`${e.duplicate_table}:${e.duplicate_id}`)).toBe(false); + } + applyDedupPlan(db, plan2); + + // Visible-survivor property over the final lineage: no record is + // both a recorded survivor and an actively marked duplicate, so + // every hidden record's survivor is itself visible. + const finalTuples = markedLineageTuples(db); + const markedDupKeys = new Set(finalTuples.map(t => `${t[2]}:${t[3]}`)); + for (const t of finalTuples) { + expect(markedDupKeys.has(`${t[0]}:${t[1]}`)).toBe(false); + } + }); + } + ) + ); + }); + + test('destructive apply refuses, without writing, a plan that would delete a recorded survivor (defense-in-depth)', () => { + withDb(db => { + // Real run 1: an exact pair — the user-authored record survives. + const idA = insertRecord(db, 'breadcrumbs', canonical(0), { + project: null, provenance: 'user_authored', importance: 10, day: 1, + }); + insertRecord(db, 'breadcrumbs', canonical(0), { + project: null, provenance: null, importance: 5, day: 1, + }); + applyDedupPlan(db, planDedup(db, { semantic: false })); + expect(markedLineageTuples(db).map(t => `${t[0]}:${t[1]}`)).toEqual([`breadcrumbs:${idA}`]); + + // Handcrafted plan claiming the recorded survivor as a duplicate — + // planDedup never produces this (the sticky-survivor exclusion), so + // the guard is exercised directly. + const idC = insertRecord(db, 'breadcrumbs', canonical(1), { + project: null, provenance: 'user_authored', importance: 10, day: 2, + }); + const plan: DedupPlan = { + threshold: 0.95, + semanticSkipped: null, + crossTable: { textMatches: [], semanticPairs: 0 }, + tables: [{ + table: 'breadcrumbs', + scanned: 0, + tooShort: 0, + alreadyMarked: 0, + stickySkipped: 0, + exactGroups: 0, + semanticPairs: 1, + planned: [{ + survivor_table: 'breadcrumbs', + survivor_id: idC, + duplicate_table: 'breadcrumbs', + duplicate_id: idA, + reason: 'semantic', + similarity: 0.99, + detail: '{}', + }], + }], + }; + + const before = snapshot(db); + expect(() => applyDedupPlan(db, plan, { destructive: true })).toThrow(/destructive dedup refused/); + expect(snapshot(db)).toBe(before); + }); + }); +}); From fb60cf579583cce9f7bae3e998705cdbd4e5f787 Mon Sep 17 00:00:00 2001 From: Ed Heltzel <402910+edheltzel@users.noreply.github.com> Date: Thu, 11 Jun 2026 17:17:02 -0400 Subject: [PATCH 2/2] test(dedup): pin 'deleted'-status survivor stickiness deterministically MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Review R1 on PR #79: mutating loadRecordedSurvivors' filter from status IN ('marked','deleted') to status = 'marked' passed the whole suite — no test created a 'deleted' lineage row and asserted stickiness on its survivor, so a one-token regression would silently re-expose the destructive data-loss half of issue #63 and defeat both defense layers at once (planner seed and guard share the loader). New deterministic test: run 1 applies an exact pair destructively (lineage status 'deleted'), then asserts run 2 never re-plans the recorded survivor (stickySkipped reported instead) and that a handcrafted destructive plan against that survivor still refuses write-free. Mutation re-run: dropping 'deleted' from the filter now fails exactly this test (654 pass / 1 fail); restored, full suite 655 pass / 0 fail, lint clean. No source change — the filter was already correct. --- tests/lib/dedup.property.test.ts | 58 ++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/tests/lib/dedup.property.test.ts b/tests/lib/dedup.property.test.ts index 094aa4a..74d8c1f 100644 --- a/tests/lib/dedup.property.test.ts +++ b/tests/lib/dedup.property.test.ts @@ -600,4 +600,62 @@ describe('dedup cross-run sticky survivors (issue #63)', () => { expect(snapshot(db)).toBe(before); }); }); + + test("a survivor recorded by a destructive run ('deleted' lineage) is sticky: never re-planned, and the guard still refuses", () => { + withDb(db => { + // Run 1, destructive: exact pair — A survives, B is hard-deleted, so + // the only lineage row carries status 'deleted' (not 'marked'). + const idA = insertRecord(db, 'breadcrumbs', canonical(0), { + project: null, provenance: 'user_authored', importance: 10, day: 1, + }); + insertRecord(db, 'breadcrumbs', canonical(0), { + project: null, provenance: null, importance: 5, day: 1, + }); + const run1 = applyDedupPlan(db, planDedup(db, { semantic: false }), { destructive: true }); + expect(run1).toEqual({ marked: 0, deleted: 1, fkProtected: 0 }); + const lineage = db.prepare( + `SELECT survivor_id, status FROM dedup_lineage` + ).all() as Array<{ survivor_id: number; status: string }>; + expect(lineage).toEqual([{ survivor_id: idA, status: 'deleted' }]); + + // Run 2: an equally-ranked but newer exact duplicate of A arrives and + // wins the group on recency — without 'deleted'-status stickiness, A + // (the only visible trace of hard-deleted B) would be re-marked. + const idC = insertRecord(db, 'breadcrumbs', canonical(0), { + project: null, provenance: 'user_authored', importance: 10, day: 2, + }); + const plan2 = planDedup(db, { semantic: false }); + expect(plan2.tables.flatMap(t => t.planned)).toEqual([]); + expect(plan2.tables.find(t => t.table === 'breadcrumbs')!.stickySkipped).toBe(1); + + // Defense-in-depth shares the same loader: a handcrafted destructive + // plan against the 'deleted'-row survivor must still refuse. + const handcrafted: DedupPlan = { + threshold: 0.95, + semanticSkipped: null, + crossTable: { textMatches: [], semanticPairs: 0 }, + tables: [{ + table: 'breadcrumbs', + scanned: 0, + tooShort: 0, + alreadyMarked: 0, + stickySkipped: 0, + exactGroups: 1, + semanticPairs: 0, + planned: [{ + survivor_table: 'breadcrumbs', + survivor_id: idC, + duplicate_table: 'breadcrumbs', + duplicate_id: idA, + reason: 'exact', + similarity: 1.0, + detail: '{}', + }], + }], + }; + const before = snapshot(db); + expect(() => applyDedupPlan(db, handcrafted, { destructive: true })).toThrow(/destructive dedup refused/); + expect(snapshot(db)).toBe(before); + }); + }); });