diff --git a/__tests__/edges-unique.test.ts b/__tests__/edges-unique.test.ts new file mode 100644 index 00000000..49eced53 --- /dev/null +++ b/__tests__/edges-unique.test.ts @@ -0,0 +1,166 @@ +/** + * Edge Uniqueness Tests + * + * Regression tests for the bug where `INSERT OR IGNORE INTO edges` was + * silently a no-op: the only candidate key was the AUTOINCREMENT id (which + * never conflicts), so duplicate edges accumulated on every re-emission / + * re-resolution. + * + * Fix: a UNIQUE index on (source, target, kind, COALESCE(line, -1), + * COALESCE(col, -1)) backs a fresh-install schema and is also applied via + * migration v4 (with a dedup pass over existing rows). + */ + +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; +import { DatabaseConnection } from '../src/db'; +import { QueryBuilder } from '../src/db/queries'; +import { Edge, Node } from '../src/types'; +import { runMigrations, getCurrentVersion, CURRENT_SCHEMA_VERSION } from '../src/db/migrations'; + +function tempDb(): { dir: string; db: DatabaseConnection; q: QueryBuilder } { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-edges-unique-')); + const db = DatabaseConnection.initialize(path.join(dir, 'test.db')); + const q = new QueryBuilder(db.getDb()); + return { dir, db, q }; +} + +function cleanup(dir: string, db: DatabaseConnection) { + db.close(); + if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true }); +} + +function makeNode(id: string, name: string): Node { + return { + id, + kind: 'function', + name, + qualifiedName: `f::${name}`, + filePath: 'a.ts', + language: 'typescript', + startLine: 1, + endLine: 1, + startColumn: 0, + endColumn: 0, + updatedAt: Date.now(), + }; +} + +function edgesCount(db: DatabaseConnection): number { + const row = db.getDb().prepare('SELECT COUNT(*) as c FROM edges').get() as { c: number }; + return row.c; +} + +describe('Edge UNIQUE constraint (bug #2)', () => { + let dir: string; + let db: DatabaseConnection; + let q: QueryBuilder; + + beforeEach(() => { + ({ dir, db, q } = tempDb()); + q.insertNodes([makeNode('n1', 'foo'), makeNode('n2', 'bar')]); + }); + + afterEach(() => cleanup(dir, db)); + + it('rejects duplicate (source, target, kind, line, col)', () => { + const e: Edge = { source: 'n1', target: 'n2', kind: 'calls', line: 10, column: 5 }; + q.insertEdge(e); + q.insertEdge(e); // INSERT OR IGNORE — should be a no-op now + expect(edgesCount(db)).toBe(1); + }); + + it('treats two NULL line edges as duplicates (COALESCE in unique index)', () => { + const e: Edge = { source: 'n1', target: 'n2', kind: 'calls' }; + q.insertEdge(e); + q.insertEdge(e); + expect(edgesCount(db)).toBe(1); + }); + + it('allows same source/target/kind on different lines', () => { + q.insertEdge({ source: 'n1', target: 'n2', kind: 'calls', line: 1 }); + q.insertEdge({ source: 'n1', target: 'n2', kind: 'calls', line: 2 }); + expect(edgesCount(db)).toBe(2); + }); + + it('allows same source/target/line on different kinds', () => { + q.insertEdge({ source: 'n1', target: 'n2', kind: 'calls', line: 1 }); + q.insertEdge({ source: 'n1', target: 'n2', kind: 'references', line: 1 }); + expect(edgesCount(db)).toBe(2); + }); + + it('insertEdges (batch) dedupes within the same call', () => { + const e: Edge = { source: 'n1', target: 'n2', kind: 'calls', line: 1, column: 1 }; + q.insertEdges([e, e, e]); + expect(edgesCount(db)).toBe(1); + }); + + it('survives the same edge being re-emitted across many cycles', () => { + const e: Edge = { source: 'n1', target: 'n2', kind: 'calls', line: 1 }; + for (let i = 0; i < 100; i++) { + q.insertEdge(e); + } + expect(edgesCount(db)).toBe(1); + }); +}); + +describe('Migration v4: dedup existing edges', () => { + let dir: string; + let dbPath: string; + + beforeEach(() => { + dir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-migr-v4-')); + dbPath = path.join(dir, 'test.db'); + }); + + afterEach(() => { + if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true }); + }); + + it('collapses pre-existing duplicates and adds the UNIQUE index', () => { + // Build a v3-shaped database manually: schema, but simulate a stale + // version row + insert duplicates that the missing UNIQUE index let + // through. We use the real initialize() path then drop the index + + // version row to back-date the DB. + const db = DatabaseConnection.initialize(dbPath); + db.getDb().exec(`DROP INDEX IF EXISTS idx_edges_unique;`); + db.getDb().exec(`DELETE FROM schema_versions;`); + db.getDb().prepare( + 'INSERT INTO schema_versions (version, applied_at, description) VALUES (3, ?, ?)' + ).run(Date.now(), 'simulated v3'); + + const q = new QueryBuilder(db.getDb()); + q.insertNodes([makeNode('n1', 'foo'), makeNode('n2', 'bar')]); + // Force-insert duplicates via raw SQL (bypassing the constraint that + // is now absent). Three rows that should collapse to one. + const stmt = db.getDb().prepare( + 'INSERT INTO edges (source, target, kind, line, col) VALUES (?, ?, ?, ?, ?)' + ); + stmt.run('n1', 'n2', 'calls', 10, 5); + stmt.run('n1', 'n2', 'calls', 10, 5); + stmt.run('n1', 'n2', 'calls', 10, 5); + // And one with NULL line/col, also duplicated + stmt.run('n1', 'n2', 'references', null, null); + stmt.run('n1', 'n2', 'references', null, null); + + expect(edgesCount(db)).toBe(5); + expect(getCurrentVersion(db.getDb())).toBe(3); + + // Run migrations forward + runMigrations(db.getDb(), 3); + + expect(getCurrentVersion(db.getDb())).toBe(CURRENT_SCHEMA_VERSION); + expect(CURRENT_SCHEMA_VERSION).toBeGreaterThanOrEqual(4); + // 3 calls dups → 1, 2 references dups → 1 + expect(edgesCount(db)).toBe(2); + + // Now the constraint is enforced: another duplicate insert is a no-op. + const q2 = new QueryBuilder(db.getDb()); + q2.insertEdge({ source: 'n1', target: 'n2', kind: 'calls', line: 10, column: 5 }); + expect(edgesCount(db)).toBe(2); + + db.close(); + }); +}); diff --git a/__tests__/foundation.test.ts b/__tests__/foundation.test.ts index 9ee437da..4e8f204a 100644 --- a/__tests__/foundation.test.ts +++ b/__tests__/foundation.test.ts @@ -305,7 +305,7 @@ describe('Database Connection', () => { const version = db.getSchemaVersion(); expect(version).not.toBeNull(); - expect(version?.version).toBe(3); + expect(version?.version).toBe(4); db.close(); }); diff --git a/__tests__/pr19-improvements.test.ts b/__tests__/pr19-improvements.test.ts index 5fbe17d7..d43dceb2 100644 --- a/__tests__/pr19-improvements.test.ts +++ b/__tests__/pr19-improvements.test.ts @@ -299,7 +299,7 @@ describe('Best-Candidate Resolution', () => { describe('Schema v2 Migration', () => { it.skipIf(!HAS_SQLITE)('should have correct current schema version', async () => { const { CURRENT_SCHEMA_VERSION } = await import('../src/db/migrations'); - expect(CURRENT_SCHEMA_VERSION).toBe(3); + expect(CURRENT_SCHEMA_VERSION).toBe(4); }); it.skipIf(!HAS_SQLITE)('should have migration for version 2', async () => { diff --git a/src/db/migrations.ts b/src/db/migrations.ts index 0a256dbc..270265e2 100644 --- a/src/db/migrations.ts +++ b/src/db/migrations.ts @@ -9,7 +9,7 @@ import { SqliteDatabase } from './sqlite-adapter'; /** * Current schema version */ -export const CURRENT_SCHEMA_VERSION = 3; +export const CURRENT_SCHEMA_VERSION = 4; /** * Migration definition @@ -54,6 +54,27 @@ const migrations: Migration[] = [ `); }, }, + { + version: 4, + description: 'Dedup edges and enforce UNIQUE(source, target, kind, line, col) so INSERT OR IGNORE actually dedupes', + up: (db) => { + // Without a UNIQUE constraint the existing `INSERT OR IGNORE INTO + // edges` was a no-op for dedup purposes (the only candidate key was + // the AUTOINCREMENT id, which never conflicts). Existing databases + // can therefore contain accumulated duplicates from re-emission / + // re-resolution. Collapse those before adding the constraint, then + // create the UNIQUE index that future inserts will conflict against. + db.exec(` + DELETE FROM edges + WHERE id NOT IN ( + SELECT MIN(id) FROM edges + GROUP BY source, target, kind, COALESCE(line, -1), COALESCE(col, -1) + ); + CREATE UNIQUE INDEX IF NOT EXISTS idx_edges_unique + ON edges(source, target, kind, COALESCE(line, -1), COALESCE(col, -1)); + `); + }, + }, ]; /** diff --git a/src/db/schema.sql b/src/db/schema.sql index dd0a9f06..82c3e9b6 100644 --- a/src/db/schema.sql +++ b/src/db/schema.sql @@ -129,6 +129,15 @@ CREATE INDEX IF NOT EXISTS idx_edges_kind ON edges(kind); CREATE INDEX IF NOT EXISTS idx_edges_source_kind ON edges(source, kind); CREATE INDEX IF NOT EXISTS idx_edges_target_kind ON edges(target, kind); +-- Uniqueness for (source, target, kind, line, col). The id column is an +-- AUTOINCREMENT primary key, so without this index `INSERT OR IGNORE` +-- would never see a conflict — duplicate edges would silently accumulate +-- on every re-resolution / re-emission. COALESCE keeps two NULL line/col +-- values comparable as equal (SQLite treats raw NULLs in a UNIQUE index +-- as distinct). +CREATE UNIQUE INDEX IF NOT EXISTS idx_edges_unique + ON edges(source, target, kind, COALESCE(line, -1), COALESCE(col, -1)); + -- File indexes CREATE INDEX IF NOT EXISTS idx_files_language ON files(language); CREATE INDEX IF NOT EXISTS idx_files_modified_at ON files(modified_at);