Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions docs/cli-reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -336,6 +336,13 @@ Safety model:
exist; records are never merged below the configured `--threshold`
(conservative default: 0.95 cosine similarity). Records with fewer than 20
significant characters are never candidates.
- **Survivors are sticky.** Once `dedup_lineage` lists a record as the
survivor of a `marked` or `deleted` duplicate, later runs never re-mark that
record as a duplicate, so every hidden or deleted record keeps a visible
survivor across runs. Lineage rows that have been reverted do not make a
survivor sticky, because the duplicate is visible again. The tradeoff is
order-dependence: an early survivor is never consolidated under a later,
higher-priority record. As defense-in-depth, `--delete` refuses outright
(no changes written) if a plan would delete a recorded survivor.
- **Lifecycle-aware.** Only `active` decisions participate; superseded and
reverted decisions are managed by the decision lifecycle, not dedup.

Expand Down
12 changes: 11 additions & 1 deletion src/commands/dedup.ts
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ export function runDedup(options: DedupOptions = {}): DedupRunResult | undefined
const skipped: string[] = [];
if (report.alreadyMarked > 0) skipped.push(`${report.alreadyMarked} already marked`);
if (report.tooShort > 0) skipped.push(`${report.tooShort} too short`);
if (report.stickySkipped > 0) skipped.push(`${report.stickySkipped} prior survivor(s) kept visible`);
const skippedNote = skipped.length > 0 ? ` (${skipped.join(', ')})` : '';
console.log(
`${report.table}: scanned ${report.scanned}, exact groups ${report.exactGroups}, ` +
Expand Down Expand Up @@ -132,7 +133,16 @@ export function runDedup(options: DedupOptions = {}): DedupRunResult | undefined
return { plan, applied: null };
}

const applied = applyDedupPlan(db, plan, { destructive });
let applied: ApplyResult;
try {
applied = applyDedupPlan(db, plan, { destructive });
} catch (err) {
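// The destructive survivor guard (issue #63) throws before writing;
// surface its message as a clean refusal instead of a stack trace.
// NOTE(review): this catch is unconditional, so any other error thrown by
// applyDedupPlan is also reduced to its message plus exit code 1 — confirm
// that losing the stack trace for non-guard failures is intended.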
console.error(err instanceof Error ? err.message : String(err));
process.exitCode = 1;
return undefined;
}
if (destructive) {
const fkNote = applied.fkProtected > 0
? ` (${applied.fkProtected} kept as marked — referenced by LoA lineage)`
Expand Down
68 changes: 63 additions & 5 deletions src/lib/dedup.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,13 @@
// - Semantic detection compares stored embeddings pairwise — no embedding
// service call is needed. Pairs are never chained transitively: every
// marked duplicate has a direct similarity >= threshold to its survivor.
// - Survivors are sticky across runs (issue #63): a record recorded as a
// survivor in dedup_lineage (status 'marked' or 'deleted') is never
// re-marked as a duplicate by a later run, so every hidden or deleted
// record keeps a visible survivor. Conservative and order-dependent by
// design — an early survivor is never consolidated under a later record.
// Destructive mode independently refuses to delete a recorded survivor
// (defense-in-depth should planning ever regress).
//
// Bind-count note (see src/lib/chunk.ts): scans use keyset pagination
// (`WHERE id > ? LIMIT ?`, fixed binds). Destructive deletion builds
Expand Down Expand Up @@ -347,6 +354,21 @@ export function loadMarkedDuplicates(db: Database): Set<string> {
return new Set(rows.map(r => `${r.duplicate_table}:${r.duplicate_id}`));
}

/**
 * Collects the `table:id` keys of every record that dedup_lineage names as a
 * survivor in a row whose status is 'marked' or 'deleted' (issue #63).
 *
 * Such survivors are sticky across runs: the duplicates behind them are
 * hidden ('marked') or removed ('deleted') and depend on the survivor
 * staying visible, so these records must never be re-marked. Rows with
 * status 'reverted' are excluded — their duplicate is visible again and the
 * survivor carries no obligation.
 *
 * @param db - Database handle used to query dedup_lineage.
 * @returns Set of `${survivor_table}:${survivor_id}` keys, one per distinct survivor.
 */
export function loadRecordedSurvivors(db: Database): Set<string> {
  const statement = db.prepare(
`SELECT DISTINCT survivor_table, survivor_id FROM dedup_lineage
WHERE status IN ('marked', 'deleted')`
  );
  const lineageRows = statement.all() as Array<{ survivor_table: string; survivor_id: number }>;

  // Same key format the planner uses for its survivor/duplicate sets.
  const survivorKeys = new Set<string>();
  for (const { survivor_table, survivor_id } of lineageRows) {
    survivorKeys.add(`${survivor_table}:${survivor_id}`);
  }
  return survivorKeys;
}

/** Stored embeddings for one table, keyed by source row id. */
export function loadEmbeddings(db: Database, table: ProvenanceTable): Map<number, number[]> {
const rows = db.prepare(
Expand Down Expand Up @@ -394,6 +416,8 @@ export interface DedupTableReport {
scanned: number;
tooShort: number;
alreadyMarked: number;
/** Marks not planned because a sticky prior survivor was involved (issue #63). */
stickySkipped: number;
exactGroups: number;
semanticPairs: number;
planned: LineageEntry[];
Expand Down Expand Up @@ -451,13 +475,18 @@ export function planDedup(db: Database, options: PlanDedupOptions = {}): DedupPl
const threshold = options.threshold ?? DEFAULT_SEMANTIC_THRESHOLD;
const semantic = options.semantic ?? true;
const marked = loadMarkedDuplicates(db);
// Sticky survivors (issue #63): seeding plannedSurvivorKeys with survivors
// recorded by prior runs extends the within-plan one-hop guard across
// runs — a recorded survivor is never re-marked, so its hidden or deleted
// duplicates always keep a visible representative.
const stickySurvivors = loadRecordedSurvivors(db);

const reports = new Map<ProvenanceTable, DedupTableReport>();
const eligible = new Map<ProvenanceTable, DedupCandidate[]>();

// Exact pass — within table + project.
const plannedDuplicateKeys = new Set<string>();
const plannedSurvivorKeys = new Set<string>();
const plannedSurvivorKeys = new Set<string>(stickySurvivors);
for (const table of tables) {
const scan = scanCandidates(db, table, options.project);
const fresh = scan.candidates.filter(c => !marked.has(`${c.table}:${c.id}`));
Expand All @@ -466,6 +495,7 @@ export function planDedup(db: Database, options: PlanDedupOptions = {}): DedupPl
scanned: scan.scanned,
tooShort: scan.tooShort,
alreadyMarked: scan.candidates.length - fresh.length,
stickySkipped: 0,
exactGroups: 0,
semanticPairs: 0,
planned: [],
Expand All @@ -475,6 +505,12 @@ export function planDedup(db: Database, options: PlanDedupOptions = {}): DedupPl
const { survivor, duplicates } = selectSurvivor(group);
plannedSurvivorKeys.add(`${survivor.table}:${survivor.id}`);
for (const dup of duplicates) {
// A sticky prior survivor stays visible even when a newer record
// out-ranks it within its exact group (issue #63).
if (stickySurvivors.has(`${dup.table}:${dup.id}`)) {
report.stickySkipped++;
continue;
}
report.planned.push(toLineage(survivor, dup, 'exact', 1.0));
plannedDuplicateKeys.add(`${dup.table}:${dup.id}`);
}
Expand Down Expand Up @@ -520,10 +556,16 @@ export function planDedup(db: Database, options: PlanDedupOptions = {}): DedupPl
const bKey = `${pair.b.table}:${pair.b.id}`;
// Greedy, strongest-first: a record planned as a duplicate can
// neither survive nor be re-marked, and a record planned as a
// survivor (in either pass) can never be re-marked by a weaker
// pair — lineage stays one hop deep, no transitive chaining.
// survivor (in either pass, this run or — via the sticky seed — any
// prior run) can never be re-marked by a weaker pair — lineage stays
// one hop deep, no transitive chaining.
if (plannedDuplicateKeys.has(aKey) || plannedDuplicateKeys.has(bKey)) continue;
if (plannedSurvivorKeys.has(aKey) || plannedSurvivorKeys.has(bKey)) continue;
if (plannedSurvivorKeys.has(aKey) || plannedSurvivorKeys.has(bKey)) {
if (stickySurvivors.has(aKey) || stickySurvivors.has(bKey)) {
reports.get(pair.a.table)!.stickySkipped++;
}
continue;
}
const { survivor, duplicates } = selectSurvivor([pair.a, pair.b]);
const report = reports.get(pair.a.table)!;
report.semanticPairs++;
Expand Down Expand Up @@ -553,7 +595,10 @@ export interface ApplyResult {
* Apply a plan: insert lineage rows ('marked' by default). With
* `destructive`, duplicate rows are hard-deleted (with their embeddings) and
* lineage status is 'deleted' — except FK-referenced rows, which stay marked.
* Runs in one transaction; failures roll back everything.
* Runs in one transaction; failures roll back everything. Destructive mode
* refuses (throws, nothing written) when a planned duplicate is itself a
* recorded survivor in dedup_lineage — planDedup never produces such a plan,
* so this guard is defense-in-depth for issue #63.
*/
export function applyDedupPlan(
db: Database,
Expand All @@ -565,6 +610,19 @@ export function applyDedupPlan(
const result: ApplyResult = { marked: 0, deleted: 0, fkProtected: 0 };
if (entries.length === 0) return result;

if (destructive) {
const survivors = loadRecordedSurvivors(db);
const conflicts = entries.filter(e => survivors.has(`${e.duplicate_table}:${e.duplicate_id}`));
if (conflicts.length > 0) {
const sample = conflicts[0];
throw new Error(
`destructive dedup refused: ${conflicts.length} planned duplicate(s) are recorded survivors ` +
`in dedup_lineage (e.g. ${sample.duplicate_table}#${sample.duplicate_id}). Deleting a survivor ` +
`would leave its marked or deleted duplicates with no visible record. No changes were made.`
);
}
}

const insert = db.prepare(`
INSERT INTO dedup_lineage
(survivor_table, survivor_id, duplicate_table, duplicate_id, reason, similarity, status, detail)
Expand Down
Loading
Loading