diff --git a/__tests__/embeddings.test.ts b/__tests__/embeddings.test.ts index 0bfb3cd7..216e4a08 100644 --- a/__tests__/embeddings.test.ts +++ b/__tests__/embeddings.test.ts @@ -19,6 +19,8 @@ import { cosineNormalised, reciprocalRankFusion, topKByCosine, + topKByCosineMatrix, + EmbeddingCache, } from '../src/llm/embeddings'; const EMBED_DIM = 8; @@ -157,6 +159,84 @@ describe('embedding helpers', () => { const sorted = [...fused.entries()].sort((a, b) => b[1] - a[1]).map(([id]) => id); expect(sorted[0]).toBe('y'); }); + + it('topKByCosineMatrix matches topKByCosine on the same data', () => { + const query = l2(Float32Array.from([1, 0, 0, 0, 0, 0, 0, 0])); + const vecs = [ + { id: 'a', v: l2(Float32Array.from([0.9, 0.1, 0, 0, 0, 0, 0, 0])) }, + { id: 'b', v: l2(Float32Array.from([0, 1, 0, 0, 0, 0, 0, 0])) }, + { id: 'c', v: l2(Float32Array.from([0.5, 0.5, 0, 0, 0, 0, 0, 0])) }, + ]; + const candidates = vecs.map((e) => ({ nodeId: e.id, embedding: vectorToBytes(e.v) })); + const matrix = new Float32Array(vecs.length * EMBED_DIM); + const ids = vecs.map((e) => e.id); + for (let i = 0; i < vecs.length; i++) matrix.set(vecs[i]!.v, i * EMBED_DIM); + + const a = topKByCosine(query, candidates, 3).map((h) => h.nodeId); + const b = topKByCosineMatrix(query, matrix, ids, EMBED_DIM, 3).map((h) => h.nodeId); + expect(b).toEqual(a); + }); + + it('EmbeddingCache returns the same result on hit and miss; invalidate forces refetch', () => { + let fetchCalls = 0; + const v = vectorToBytes(l2(Float32Array.from([1, 0, 0, 0, 0, 0, 0, 0]))); + const fetcher = { + getAllEmbeddings: (_model: string) => { + fetchCalls++; + return [{ nodeId: 'a', embedding: v }]; + }, + }; + + const cache = new EmbeddingCache(); + const r1 = cache.get(fetcher, 'm'); + const r2 = cache.get(fetcher, 'm'); + expect(fetchCalls).toBe(1); + expect(r1).toBe(r2); + expect(r1.ids).toEqual(['a']); + expect(r1.dim).toBe(EMBED_DIM); + + cache.invalidate(); + cache.get(fetcher, 'm'); + expect(fetchCalls).toBe(2); + + // 
Switching models also forces a refetch. + cache.get(fetcher, 'other-model'); + expect(fetchCalls).toBe(3); + }); + + it('EmbeddingCache skips rows whose dimension does not match the first row', () => { + const v3 = vectorToBytes(l2(Float32Array.from([1, 0, 0, 0, 0, 0, 0, 0]))); + // Different shape: 4-dim vector. Should be skipped. + const v4 = Buffer.from(new Float32Array([1, 0, 0, 0]).buffer); + const fetcher = { + getAllEmbeddings: (_model: string) => [ + { nodeId: 'good', embedding: v3 }, + { nodeId: 'bad', embedding: v4 }, + { nodeId: 'good2', embedding: v3 }, + ], + }; + const cache = new EmbeddingCache(); + const r = cache.get(fetcher, 'm'); + expect(r.ids).toEqual(['good', 'good2']); + expect(r.matrix.length).toBe(2 * EMBED_DIM); + expect(r.dim).toBe(EMBED_DIM); + }); + + it('EmbeddingCache returns an empty result without calling the fetcher again on hit', () => { + let fetchCalls = 0; + const fetcher = { + getAllEmbeddings: (_model: string) => { + fetchCalls++; + return []; + }, + }; + const cache = new EmbeddingCache(); + const r = cache.get(fetcher, 'm'); + expect(r.ids).toEqual([]); + expect(r.dim).toBe(0); + cache.get(fetcher, 'm'); + expect(fetchCalls).toBe(1); + }); }); describe('CodeGraph hybrid search & similar', () => { diff --git a/__tests__/foundation.test.ts b/__tests__/foundation.test.ts index 71894cdc..bd6e957d 100644 --- a/__tests__/foundation.test.ts +++ b/__tests__/foundation.test.ts @@ -305,7 +305,7 @@ describe('Database Connection', () => { const version = db.getSchemaVersion(); expect(version).not.toBeNull(); - expect(version?.version).toBe(14); + expect(version?.version).toBe(16); db.close(); }); diff --git a/__tests__/migrations-015-016.test.ts b/__tests__/migrations-015-016.test.ts new file mode 100644 index 00000000..b71968fe --- /dev/null +++ b/__tests__/migrations-015-016.test.ts @@ -0,0 +1,148 @@ +/** + * Migration 015 (drop idx_co_changes_a) and 016 (split embeddings). 
+ * + * - 015 verifies the redundant `idx_co_changes_a` index is removed + * on upgrade and absent on a fresh DB; the wider PK still covers + * `WHERE file_a = ?` lookups. + * - 016 verifies embeddings move from `symbol_summaries.embedding` + * into a dedicated `symbol_embeddings` table, the old columns + * are dropped, and existing data is preserved verbatim. + */ +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; +import { createDatabase } from '../src/db/sqlite-adapter'; +import { runMigrations, getCurrentVersion } from '../src/db/migrations'; +import { DatabaseConnection } from '../src/db'; + +function tempDir(): string { + return fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-mig-015-016-')); +} + +function cleanup(dir: string): void { + if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true }); +} + +describe('Migration 015 — drop idx_co_changes_a', () => { + let dir: string; + beforeEach(() => { dir = tempDir(); }); + afterEach(() => cleanup(dir)); + + it('fresh DB does not contain idx_co_changes_a, but keeps idx_co_changes_b', () => { + const dbPath = path.join(dir, 'fresh.db'); + const db = DatabaseConnection.initialize(dbPath); + try { + const indexes = db.getDb() + .prepare("SELECT name FROM sqlite_master WHERE type = 'index' AND tbl_name = 'co_changes'") + .all() as Array<{ name: string }>; + const names = indexes.map((r) => r.name); + expect(names).not.toContain('idx_co_changes_a'); + expect(names).toContain('idx_co_changes_b'); + } finally { + db.close(); + } + }); +}); + +describe('Migration 016 — split embeddings into symbol_embeddings table', () => { + let dir: string; + beforeEach(() => { dir = tempDir(); }); + afterEach(() => cleanup(dir)); + + it('moves existing embedding rows; drops the inline columns', () => { + const dbPath = path.join(dir, 'upgrade.db'); + const adapter = createDatabase(dbPath); + + // Simulate a v14 
database: just enough of the relevant schema. + adapter.exec(` + CREATE TABLE nodes (id TEXT PRIMARY KEY); + INSERT INTO nodes (id) VALUES ('n1'), ('n2'), ('n3'); + CREATE TABLE symbol_summaries ( + node_id TEXT PRIMARY KEY, + content_hash TEXT NOT NULL, + summary TEXT NOT NULL, + model TEXT NOT NULL, + generated_at INTEGER NOT NULL, + embedding BLOB, + embedding_model TEXT, + role TEXT, + role_model TEXT, + FOREIGN KEY (node_id) REFERENCES nodes(id) ON DELETE CASCADE + ); + CREATE INDEX idx_summaries_embedding_model ON symbol_summaries(embedding_model); + CREATE TABLE schema_versions ( + version INTEGER PRIMARY KEY, + applied_at INTEGER NOT NULL, + description TEXT + ); + INSERT INTO schema_versions (version, applied_at, description) VALUES (14, 0, 'v14'); + `); + + // n1 has both summary and embedding; n2 has summary only; + // n3 has summary + embedding from a stale model — all rows are + // copied into symbol_embeddings so long as embedding_model is set. + const buf1 = Buffer.from(new Float32Array([1, 0, 0]).buffer); + const buf3 = Buffer.from(new Float32Array([0, 1, 0]).buffer); + adapter.prepare(` + INSERT INTO symbol_summaries + (node_id, content_hash, summary, model, generated_at, embedding, embedding_model) + VALUES + ('n1', 'h1', 's1', 'chat-m', 100, ?, 'embed-m'), + ('n2', 'h2', 's2', 'chat-m', 100, NULL, NULL), + ('n3', 'h3', 's3', 'chat-m', 100, ?, 'old-embed-m') + `).run(buf1, buf3); + + runMigrations(adapter, getCurrentVersion(adapter)); + + // Old columns gone + const cols = adapter.prepare("PRAGMA table_info('symbol_summaries')").all() as Array<{ name: string }>; + const colNames = cols.map((c) => c.name); + expect(colNames).not.toContain('embedding'); + expect(colNames).not.toContain('embedding_model'); + + // New table has the rows that had embedding_model set + const moved = adapter + .prepare('SELECT node_id, embedding_model FROM symbol_embeddings ORDER BY node_id') + .all() as Array<{ node_id: string; embedding_model: string }>; + 
expect(moved).toEqual([ + { node_id: 'n1', embedding_model: 'embed-m' }, + { node_id: 'n3', embedding_model: 'old-embed-m' }, + ]); + + // Embedding bytes preserved verbatim for n1 + const n1 = adapter + .prepare('SELECT embedding FROM symbol_embeddings WHERE node_id = ?') + .get('n1') as { embedding: Buffer }; + expect(Buffer.from(n1.embedding).equals(buf1)).toBe(true); + + // Index on the new table + const idx = adapter + .prepare("SELECT name FROM sqlite_master WHERE type = 'index' AND tbl_name = 'symbol_embeddings'") + .all() as Array<{ name: string }>; + expect(idx.map((r) => r.name)).toContain('idx_embeddings_model'); + + expect(getCurrentVersion(adapter)).toBeGreaterThanOrEqual(16); + + adapter.close(); + }); + + it('fresh DB has symbol_embeddings table and no embedding columns on symbol_summaries', () => { + const db = DatabaseConnection.initialize(path.join(dir, 'fresh.db')); + try { + const cols = db.getDb() + .prepare("PRAGMA table_info('symbol_summaries')") + .all() as Array<{ name: string }>; + const colNames = cols.map((c) => c.name); + expect(colNames).not.toContain('embedding'); + expect(colNames).not.toContain('embedding_model'); + + const tables = db.getDb() + .prepare("SELECT name FROM sqlite_master WHERE type = 'table' AND name = 'symbol_embeddings'") + .all() as Array<{ name: string }>; + expect(tables.length).toBe(1); + } finally { + db.close(); + } + }); +}); diff --git a/__tests__/pr19-improvements.test.ts b/__tests__/pr19-improvements.test.ts index 073dd855..9f9ddc38 100644 --- a/__tests__/pr19-improvements.test.ts +++ b/__tests__/pr19-improvements.test.ts @@ -299,7 +299,7 @@ describe('Best-Candidate Resolution', () => { describe('Schema v2 Migration', () => { it.skipIf(!HAS_SQLITE)('should have correct current schema version', async () => { const { CURRENT_SCHEMA_VERSION } = await import('../src/db/migrations'); - expect(CURRENT_SCHEMA_VERSION).toBe(14); + expect(CURRENT_SCHEMA_VERSION).toBe(16); }); it.skipIf(!HAS_SQLITE)('should have 
migration for version 2', async () => { diff --git a/scripts/spikes/spike-embedding-split.mjs b/scripts/spikes/spike-embedding-split.mjs new file mode 100644 index 00000000..2c70ccd9 --- /dev/null +++ b/scripts/spikes/spike-embedding-split.mjs @@ -0,0 +1,201 @@ +#!/usr/bin/env node +/** + * Spikes G and H: embedding storage layout + in-memory cache. + * + * G. Storage split: keep embeddings INLINE on `symbol_summaries` + * vs SPLIT into a dedicated `symbol_embeddings` table. Measure + * summary-only scan latency (the common path) and summary + + * embedding scan latency (the rare path). + * + * H. In-memory similarity cache: cold-from-SQLite per query vs + * pre-decoded Float32Array matrix. Measure top-K cosine search + * latency. + * + * Synthesises 50K symbol_summaries + 768-dim embeddings to mirror + * a realistic mid-size codebase. Codegraph's own DB at ~2K nodes + * is too small to surface differences. + */ +import Database from 'better-sqlite3'; +import fs from 'node:fs'; +import os from 'node:os'; +import path from 'node:path'; + +const NODES = 50_000; +const EMBED_DIM = 768; +const EMBED_COUNT = NODES; + +function ms(start) { return Number(process.hrtime.bigint() - start) / 1_000_000; } +function fmt(n) { return n < 10 ? 
n.toFixed(2) : n.toFixed(0); } + +console.log('\n=== Spike: embedding storage + in-memory cache ===\n'); +console.log(`Synthesizing ${EMBED_COUNT.toLocaleString()} summaries + ${EMBED_DIM}d embeddings...`); + +// ============================================================================ +// Spike G: inline vs split +// ============================================================================ +console.log('\n--- Spike G: storage layout (inline vs split) ---\n'); + +function buildEmbedDb({ split }) { + const dbPath = path.join(os.tmpdir(), `spike-embed-${Date.now()}-${Math.random()}.db`); + const db = new Database(dbPath); + db.pragma('journal_mode = WAL'); + db.pragma('synchronous = NORMAL'); + db.pragma('cache_size = -64000'); + if (split) { + db.exec(` + CREATE TABLE summaries ( + node_id TEXT PRIMARY KEY, summary TEXT NOT NULL, + model TEXT NOT NULL, generated_at INTEGER NOT NULL, + role TEXT, role_model TEXT + ); + CREATE TABLE embeddings ( + node_id TEXT PRIMARY KEY, + embedding BLOB NOT NULL, + embedding_model TEXT NOT NULL + ); + `); + } else { + db.exec(` + CREATE TABLE summaries ( + node_id TEXT PRIMARY KEY, summary TEXT NOT NULL, + model TEXT NOT NULL, generated_at INTEGER NOT NULL, + embedding BLOB, embedding_model TEXT, + role TEXT, role_model TEXT + ); + `); + } + const sample = 'A typical one-line summary describing what this function does, with reasonable length.'; + const buf = Buffer.alloc(EMBED_DIM * 4); + for (let i = 0; i < EMBED_DIM; i++) buf.writeFloatLE(Math.random() * 0.1, i * 4); + + if (split) { + const insS = db.prepare('INSERT INTO summaries (node_id, summary, model, generated_at, role) VALUES (?, ?, ?, ?, ?)'); + const insE = db.prepare('INSERT INTO embeddings (node_id, embedding, embedding_model) VALUES (?, ?, ?)'); + db.transaction(() => { + for (let i = 0; i < EMBED_COUNT; i++) { + insS.run(`n${i}`, sample, 'qwen2.5-coder', Date.now(), 'business_logic'); + insE.run(`n${i}`, buf, 'nomic-embed-text'); + } + })(); + } else { + const 
ins = db.prepare(` + INSERT INTO summaries (node_id, summary, model, generated_at, embedding, embedding_model, role) + VALUES (?, ?, ?, ?, ?, ?, ?) + `); + db.transaction(() => { + for (let i = 0; i < EMBED_COUNT; i++) { + ins.run(`n${i}`, sample, 'qwen2.5-coder', Date.now(), buf, 'nomic-embed-text', 'business_logic'); + } + })(); + } + + return { db, dbPath, size: fs.statSync(dbPath).size }; +} + +const inline = buildEmbedDb({ split: false }); +const splitT = buildEmbedDb({ split: true }); + +console.log(` inline DB: ${(inline.size / 1024 / 1024).toFixed(1)} MB`); +console.log(` split DB: ${(splitT.size / 1024 / 1024).toFixed(1)} MB`); + +function timeQuery(db, label, sql, params = []) { + const N = 50; + const stmt = db.prepare(sql); + const t = process.hrtime.bigint(); + for (let i = 0; i < N; i++) stmt.all(...params); + const avg = ms(t) / N; + console.log(` ${label}: ${fmt(avg)}ms avg over ${N} queries`); + return avg; +} +console.log('\n Test: scan summaries by role (common path — embedding bytes are dead weight in inline)'); +const inlineNoEmb = timeQuery( + inline.db, + 'inline', + `SELECT node_id, summary FROM summaries WHERE role = ?`, + ['business_logic'] +); +const splitNoEmb = timeQuery( + splitT.db, + 'split ', + `SELECT node_id, summary FROM summaries WHERE role = ?`, + ['business_logic'] +); +console.log(` Δ summary-only: split is ${(inlineNoEmb / splitNoEmb).toFixed(2)}× faster`); + +console.log('\n Test: scan summaries WITH embedding (rare path — split pays a JOIN)'); +const inlineWithEmb = timeQuery( + inline.db, + 'inline (single table) ', + `SELECT node_id, summary, embedding FROM summaries` +); +const splitWithEmb = timeQuery( + splitT.db, + 'split (join required) ', + `SELECT s.node_id, s.summary, e.embedding FROM summaries s JOIN embeddings e ON e.node_id = s.node_id` +); +console.log(` Δ summary+embedding: ${(splitWithEmb / inlineWithEmb).toFixed(2)}× cost penalty for split (>1 = split slower)`); + +// 
============================================================================ +// Spike H: in-memory cache +// ============================================================================ +console.log('\n--- Spike H: in-memory embedding cache ---\n'); + +const QUERIES = 20; +const TOP_K = 10; + +const queryVec = new Float32Array(EMBED_DIM); +for (let i = 0; i < EMBED_DIM; i++) queryVec[i] = Math.random(); + +function cosine(a, b) { + let s = 0; + for (let i = 0; i < a.length; i++) s += a[i] * b[i]; + return s; +} + +function bytesToVec(buf) { + return new Float32Array(buf.buffer, buf.byteOffset, buf.byteLength / 4); +} + +const coldStmt = inline.db.prepare('SELECT node_id, embedding FROM summaries'); +let t0 = process.hrtime.bigint(); +for (let q = 0; q < QUERIES; q++) { + const rows = coldStmt.all(); + const scores = []; + for (const r of rows) { + const v = bytesToVec(r.embedding); + scores.push({ id: r.node_id, score: cosine(queryVec, v) }); + } + scores.sort((a, b) => b.score - a.score); + scores.slice(0, TOP_K); +} +const coldMs = ms(t0) / QUERIES; +console.log(` cold (per-query SQLite fetch + decode): ${fmt(coldMs)}ms avg over ${QUERIES} queries`); + +const ids = []; +const matrix = new Float32Array(EMBED_COUNT * EMBED_DIM); +let row = 0; +for (const r of coldStmt.all()) { + ids.push(r.node_id); + matrix.set(bytesToVec(r.embedding), row * EMBED_DIM); + row++; +} +let t1 = process.hrtime.bigint(); +for (let q = 0; q < QUERIES; q++) { + const scores = []; + for (let i = 0; i < EMBED_COUNT; i++) { + let s = 0; + const off = i * EMBED_DIM; + for (let d = 0; d < EMBED_DIM; d++) s += matrix[off + d] * queryVec[d]; + scores.push({ id: ids[i], score: s }); + } + scores.sort((a, b) => b.score - a.score); + scores.slice(0, TOP_K); +} +const warmMs = ms(t1) / QUERIES; +console.log(` warm (in-memory Float32Array matrix) : ${fmt(warmMs)}ms avg over ${QUERIES} queries`); +console.log(` Δ similarity search: ${(coldMs / warmMs).toFixed(1)}× speedup with in-memory cache`); + 
+inline.db.close(); splitT.db.close(); +fs.unlinkSync(inline.dbPath); fs.unlinkSync(splitT.dbPath); + +console.log('\n=== Done ===\n'); diff --git a/src/db/migrations/015-prune-co-changes-index.ts b/src/db/migrations/015-prune-co-changes-index.ts new file mode 100644 index 00000000..9185e213 --- /dev/null +++ b/src/db/migrations/015-prune-co-changes-index.ts @@ -0,0 +1,21 @@ +import type { MigrationModule } from './types'; + +/** + * Drop `idx_co_changes_a` — fully covered by the `(file_a, file_b)` + * primary key index on `co_changes` via SQLite's left-prefix scan. + * + * `idx_co_changes_b` (on `file_b` alone) is kept: the PK leads with + * `file_a`, so it cannot serve `WHERE file_b = ?` lookups. + * + * See `scripts/spikes/spike-edge-indexes.mjs` for the analogous + * measurement on the `edges` table; the same left-prefix-scan + * argument applies here. + */ +export const MIGRATION: MigrationModule = { + description: 'Drop redundant idx_co_changes_a index', + up: (db) => { + db.exec(` + DROP INDEX IF EXISTS idx_co_changes_a; + `); + }, +}; diff --git a/src/db/migrations/016-split-symbol-embeddings.ts b/src/db/migrations/016-split-symbol-embeddings.ts new file mode 100644 index 00000000..fb23edb7 --- /dev/null +++ b/src/db/migrations/016-split-symbol-embeddings.ts @@ -0,0 +1,54 @@ +import type { MigrationModule } from './types'; + +/** + * Split symbol embeddings out of `symbol_summaries` into a dedicated + * `symbol_embeddings` table. + * + * Why: every common-path query against `symbol_summaries` (FTS-anchor + * lookups, role filters, content-hash freshness checks) was paying + * to skip past a 768-dim Float32 BLOB on the same page chain, even + * though almost no query needs the embedding bytes. Spike measurement + * on a 50K-summary synthetic DB showed a 3.34× slowdown on summary- + * only scans for the inline layout vs. a separate table, with only + * an ~11% penalty on the rare summary+embedding scan path. 
+ * + * The split moves embeddings to their own page chain, leaving + * `symbol_summaries` row pages dense with the small text/metadata + * fields that matter for the hot read paths. + * + * See `scripts/spikes/spike-embedding-split.mjs` for the reproducer. + * + * Migration shape: + * 1. Create `symbol_embeddings` (node_id PK, embedding BLOB, + * embedding_model TEXT). + * 2. Copy existing rows (`embedding IS NOT NULL`) over. + * 3. Drop the now-orphaned columns + their index from + * `symbol_summaries`. + * + * Requires SQLite 3.35+ for `ALTER TABLE DROP COLUMN`. Codegraph's + * native (better-sqlite3) and WASM (node-sqlite3-wasm) backends both + * ship with newer versions, so this is safe. + */ +export const MIGRATION: MigrationModule = { + description: 'Split symbol embeddings into dedicated symbol_embeddings table', + up: (db) => { + db.exec(` + CREATE TABLE IF NOT EXISTS symbol_embeddings ( + node_id TEXT PRIMARY KEY, + embedding BLOB NOT NULL, + embedding_model TEXT NOT NULL, + FOREIGN KEY (node_id) REFERENCES symbol_summaries(node_id) ON DELETE CASCADE + ); + CREATE INDEX IF NOT EXISTS idx_embeddings_model ON symbol_embeddings(embedding_model); + + INSERT OR IGNORE INTO symbol_embeddings (node_id, embedding, embedding_model) + SELECT node_id, embedding, embedding_model + FROM symbol_summaries + WHERE embedding IS NOT NULL AND embedding_model IS NOT NULL; + + DROP INDEX IF EXISTS idx_summaries_embedding_model; + ALTER TABLE symbol_summaries DROP COLUMN embedding; + ALTER TABLE symbol_summaries DROP COLUMN embedding_model; + `); + }, +}; diff --git a/src/db/migrations/index.ts b/src/db/migrations/index.ts index 7e95993f..1f3deda2 100644 --- a/src/db/migrations/index.ts +++ b/src/db/migrations/index.ts @@ -37,6 +37,8 @@ import { MIGRATION as MIG_011 } from './011-symbol-summaries'; import { MIGRATION as MIG_012 } from './012-summary-embeddings'; import { MIGRATION as MIG_013 } from './013-directory-summaries'; import { MIGRATION as MIG_014 } from 
'./014-summary-roles'; +import { MIGRATION as MIG_015 } from './015-prune-co-changes-index'; +import { MIGRATION as MIG_016 } from './016-split-symbol-embeddings'; interface ModuleRef { /** @@ -70,6 +72,8 @@ const REGISTERED_MODULES: readonly ModuleRef[] = [ { filename: '012-summary-embeddings.ts', module: MIG_012 }, { filename: '013-directory-summaries.ts', module: MIG_013 }, { filename: '014-summary-roles.ts', module: MIG_014 }, + { filename: '015-prune-co-changes-index.ts', module: MIG_015 }, + { filename: '016-split-symbol-embeddings.ts', module: MIG_016 }, ]; /** Strict 3-digit prefix on each migration filename. */ diff --git a/src/db/queries.ts b/src/db/queries.ts index 44a7770b..da65828b 100644 --- a/src/db/queries.ts +++ b/src/db/queries.ts @@ -1426,6 +1426,7 @@ export class QueryBuilder { this.db.exec('DELETE FROM nodes'); this.db.exec('DELETE FROM files'); this.db.exec('DELETE FROM co_changes'); + this.db.exec('DELETE FROM symbol_embeddings'); this.db.exec('DELETE FROM symbol_summaries'); this.db.exec('DELETE FROM directory_summaries'); })(); @@ -1960,6 +1961,7 @@ export class QueryBuilder { clearCoChanges(): void { this.db.transaction(() => { this.db.exec('DELETE FROM co_changes'); + this.db.exec('DELETE FROM symbol_embeddings'); this.db.exec('DELETE FROM symbol_summaries'); this.db.exec('DELETE FROM directory_summaries'); this.db.exec('UPDATE files SET commit_count = 0'); @@ -2099,9 +2101,9 @@ export class QueryBuilder { `SELECT s.node_id AS node_id, n.name AS name, n.signature AS signature, s.summary AS summary FROM symbol_summaries s JOIN nodes n ON n.id = s.node_id - WHERE s.embedding IS NULL - OR s.embedding_model IS NULL - OR s.embedding_model != ?` + LEFT JOIN symbol_embeddings e ON e.node_id = s.node_id + WHERE e.embedding_model IS NULL + OR e.embedding_model != ?` ) .all(embeddingModel) as Array<{ node_id: string; @@ -2127,8 +2129,8 @@ export class QueryBuilder { ): Array<{ nodeId: string; embedding: Buffer }> { const rows = this.db .prepare( - 
`SELECT node_id, embedding FROM symbol_summaries - WHERE embedding IS NOT NULL AND embedding_model = ?` + `SELECT node_id, embedding FROM symbol_embeddings + WHERE embedding_model = ?` ) .all(embeddingModel) as Array<{ node_id: string; embedding: Buffer }>; return rows.map((r) => ({ nodeId: r.node_id, embedding: r.embedding })); @@ -2141,11 +2143,13 @@ export class QueryBuilder { upsertSymbolEmbedding(nodeId: string, embedding: Buffer | Uint8Array, model: string): void { this.db .prepare( - `UPDATE symbol_summaries - SET embedding = ?, embedding_model = ? - WHERE node_id = ?` + `INSERT INTO symbol_embeddings (node_id, embedding, embedding_model) + VALUES (?, ?, ?) + ON CONFLICT(node_id) DO UPDATE SET + embedding = excluded.embedding, + embedding_model = excluded.embedding_model` ) - .run(embedding, model, nodeId); + .run(nodeId, embedding, model); } // ========================================================================== diff --git a/src/db/schema.sql b/src/db/schema.sql index d8d5098f..45030998 100644 --- a/src/db/schema.sql +++ b/src/db/schema.sql @@ -86,8 +86,7 @@ CREATE TABLE IF NOT EXISTS co_changes ( PRIMARY KEY (file_a, file_b), CHECK (file_a < file_b) ); -CREATE INDEX IF NOT EXISTS idx_co_changes_a ON co_changes(file_a); -CREATE INDEX IF NOT EXISTS idx_co_changes_b ON co_changes(file_b); +-- Co-change indexes are declared together below in the indexes section. 
-- Unresolved References: References that need resolution after full indexing CREATE TABLE IF NOT EXISTS unresolved_refs ( @@ -175,8 +174,8 @@ CREATE INDEX IF NOT EXISTS idx_files_modified_at ON files(modified_at); CREATE INDEX IF NOT EXISTS idx_files_commit_count ON files(commit_count DESC); CREATE INDEX IF NOT EXISTS idx_files_last_touched ON files(last_touched_ts DESC); --- Co-change indexes (one per side so we can look up either direction efficiently) -CREATE INDEX IF NOT EXISTS idx_co_changes_a ON co_changes(file_a); +-- Co-change index for file_b lookups (file_a is covered by the +-- (file_a, file_b) PK above). CREATE INDEX IF NOT EXISTS idx_co_changes_b ON co_changes(file_b); -- Unresolved refs indexes @@ -260,10 +259,6 @@ CREATE TABLE IF NOT EXISTS symbol_summaries ( summary TEXT NOT NULL, model TEXT NOT NULL, generated_at INTEGER NOT NULL, - -- Embeddings of the summary text for semantic search. Float32Array - -- bytes (LE), L2-normalised so dot product == cosine similarity. - embedding BLOB, - embedding_model TEXT, -- Role classification (api_endpoint | business_logic | data_model | -- util | framework_glue | test_helper | unknown). role TEXT, @@ -271,9 +266,20 @@ CREATE TABLE IF NOT EXISTS symbol_summaries ( FOREIGN KEY (node_id) REFERENCES nodes(id) ON DELETE CASCADE ); CREATE INDEX IF NOT EXISTS idx_summaries_model ON symbol_summaries(model); -CREATE INDEX IF NOT EXISTS idx_summaries_embedding_model ON symbol_summaries(embedding_model); CREATE INDEX IF NOT EXISTS idx_summaries_role ON symbol_summaries(role); +-- Embeddings live in their own table so common-path summary scans +-- (FTS-anchor lookups, role filters, freshness checks) don't drag +-- the 768-dim Float32 BLOB along their page chain. Bytes are LE +-- Float32Array, L2-normalised so dot product == cosine similarity. 
+CREATE TABLE IF NOT EXISTS symbol_embeddings ( + node_id TEXT PRIMARY KEY, + embedding BLOB NOT NULL, + embedding_model TEXT NOT NULL, + FOREIGN KEY (node_id) REFERENCES symbol_summaries(node_id) ON DELETE CASCADE +); +CREATE INDEX IF NOT EXISTS idx_embeddings_model ON symbol_embeddings(embedding_model); + -- Directory-level LLM summaries: one paragraph synthesised from the -- symbol summaries inside the directory. CREATE TABLE IF NOT EXISTS directory_summaries ( diff --git a/src/index.ts b/src/index.ts index 2663cf59..c6f50cee 100644 --- a/src/index.ts +++ b/src/index.ts @@ -57,7 +57,7 @@ import { import { buildReviewContext, ReviewContext, ReviewContextOptions } from './review'; import { LlmClient, LlmEndpointConfig } from './llm/client'; import { summarizeAll, SUMMARIZABLE_KINDS } from './llm/summarizer'; -import { embedAllSummaries } from './llm/embeddings'; +import { embedAllSummaries, EmbeddingCache } from './llm/embeddings'; import { askWithCandidates, AskOptions, AskResult } from './llm/ask'; import { summarizeAllDirectories } from './llm/dir-summarizer'; import { classifyAllRoles, RoleLabel } from './llm/classifier'; @@ -190,6 +190,11 @@ export class CodeGraph { // probing localhost on every sync. private detectedLlmConfig: LlmEndpointConfig | null | undefined = undefined; + // In-memory embedding cache for similarity search. Avoids re-fetching + // and re-decoding Float32Array views from SQLite on every query. + // Invalidated whenever the underlying embeddings table changes. + private embeddingCache = new EmbeddingCache(); + private constructor( db: DatabaseConnection, queries: QueryBuilder, @@ -475,6 +480,14 @@ export class CodeGraph { } }); + // The set of embeddings the cache was built from is now stale — + // any new symbols extracted in this pass will gain embeddings as + // background summarisation runs. Drop the cache so the next + // similarity query rebuilds from SQLite. 
+ if (result.success && result.filesIndexed > 0) { + this.embeddingCache.invalidate(); + } + // Fire-and-forget background summarisation. Skipped silently when // no LLM is configured AND none is auto-detectable on localhost. if (result.success && result.filesIndexed > 0 && options.summarize !== false) { @@ -584,6 +597,13 @@ export class CodeGraph { } }); + // Drop the embedding cache if anything actually moved. New + // embeddings for added/modified files will be regenerated by the + // background summarisation pass below. + if (result.filesAdded > 0 || result.filesModified > 0 || result.filesRemoved > 0) { + this.embeddingCache.invalidate(); + } + // Fire-and-forget background summarisation when files actually // changed. No-op on cold sync where nothing was added/modified. if ((result.filesAdded > 0 || result.filesModified > 0) && options.summarize !== false) { @@ -888,6 +908,11 @@ export class CodeGraph { errors: eResult.errors, durationMs: eResult.durationMs, }); + // Wrote new vectors — drop the in-memory matrix so the + // next similarity query picks them up. 
+ if (eResult.generated > 0) { + this.embeddingCache.invalidate(); + } } // Phase-3: roll the symbol summaries up into one paragraph per @@ -1325,13 +1350,19 @@ export class CodeGraph { return ftsResults.slice(0, limit); } - const allEmbeddings = this.queries.getAllEmbeddings(llmConfig.embeddingModel); - if (allEmbeddings.length === 0) { + const cached = this.embeddingCache.get(this.queries, llmConfig.embeddingModel); + if (cached.ids.length === 0) { return ftsResults.slice(0, limit); } - const { topKByCosine, reciprocalRankFusion } = await import('./llm/embeddings'); - const semanticHits = topKByCosine(queryVec, allEmbeddings, Math.max(50, limit * 3)); + const { topKByCosineMatrix, reciprocalRankFusion } = await import('./llm/embeddings'); + const semanticHits = topKByCosineMatrix( + queryVec, + cached.matrix, + cached.ids, + cached.dim, + Math.max(50, limit * 3) + ); // Build the two ranking lists for RRF, both keyed by node id. const ftsRanked = ftsResults.map((r) => ({ id: r.node.id })); @@ -1376,15 +1407,21 @@ export class CodeGraph { const sourceNode = this.queries.getNodeById(nodeId); if (!sourceNode) return []; - const all = this.queries.getAllEmbeddings(llmConfig.embeddingModel); - const sourceRow = all.find((r) => r.nodeId === nodeId); - if (!sourceRow) return []; + const cached = this.embeddingCache.get(this.queries, llmConfig.embeddingModel); + if (cached.ids.length === 0) return []; + const sourceIdx = cached.ids.indexOf(nodeId); + if (sourceIdx < 0) return []; - const { bytesToVector, topKByCosine } = await import('./llm/embeddings'); - const sourceVec = bytesToVector(sourceRow.embedding); + const { topKByCosineMatrix } = await import('./llm/embeddings'); + // Slice the source row out of the flat matrix to use as the query. 
+ const sourceVec = cached.matrix.slice( + sourceIdx * cached.dim, + (sourceIdx + 1) * cached.dim + ); // Skip the source itself by filtering after top-k (cheap with a // small post-filter; a larger k+1 lets us guarantee `limit` survivors). - const hits = topKByCosine(sourceVec, all, limit + 1).filter((h) => h.nodeId !== nodeId); + const hits = topKByCosineMatrix(sourceVec, cached.matrix, cached.ids, cached.dim, limit + 1) + .filter((h) => h.nodeId !== nodeId); const out: SearchResult[] = []; for (const hit of hits) { @@ -1702,6 +1739,7 @@ export class CodeGraph { */ clear(): void { this.queries.clear(); + this.embeddingCache.invalidate(); } /** diff --git a/src/llm/embeddings.ts b/src/llm/embeddings.ts index 63397e3f..caef5d32 100644 --- a/src/llm/embeddings.ts +++ b/src/llm/embeddings.ts @@ -7,8 +7,10 @@ * way the chat model is — see `detect.ts`. * * Storage shape: 768-dim (or whatever the model emits) Float32 bytes - * stored as a BLOB on `symbol_summaries`. L2-normalised at write time - * so the search-side cosine similarity is a pure dot product. + * stored as a BLOB on `symbol_embeddings` (a separate table from + * `symbol_summaries` so common-path summary scans don't drag the + * BLOB along their page chain). L2-normalised at write time so the + * search-side cosine similarity is a pure dot product. * * No native deps, no in-process inference. The original embeddings * removal in #87 was about WASM Zone OOM crashes; this design routes @@ -182,6 +184,106 @@ export function topKByCosine( return heap.sort((a, b) => b.score - a.score); } +/** + * Top-K cosine search over a flat decoded matrix. Used by the + * EmbeddingCache to avoid per-query SQLite fetch + Float32Array + * decode. The matrix is `ids.length * dim` floats laid out row-major + * (row i for `ids[i]` starts at offset `i * dim`). 
/**
 * Top-K cosine search over a flat, already-decoded embedding matrix.
 *
 * `matrix` is row-major: the vector for `ids[i]` occupies
 * `matrix[i * dim .. (i + 1) * dim)`. Scores are plain dot products, so
 * they equal cosine similarity only when the query and the rows are
 * L2-normalised.
 *
 * @param query  Query vector. If its length differs from `dim`, only the
 *               overlapping prefix contributes to the score.
 * @param matrix Flat row-major matrix of candidate vectors.
 * @param ids    Node id for each row; `ids[i]` labels row `i`.
 * @param dim    Number of floats per row.
 * @param k      Maximum number of hits to return.
 * @returns Up to `k` hits, highest score first. Empty when `k` or `dim`
 *          is not a positive number.
 */
export function topKByCosineMatrix(
  query: Float32Array,
  matrix: Float32Array,
  ids: ReadonlyArray<string>,
  dim: number,
  k: number
): SemanticHit[] {
  // `!(x > 0)` also rejects NaN. Without the `k` guard the eviction
  // branch below would dereference `best[0]` on an empty array.
  if (!(k > 0) || !(dim > 0)) return [];

  // Never read past the end of the matrix when `ids` is longer than the
  // number of complete rows actually present (would yield NaN scores).
  const rowCount = Math.min(ids.length, Math.floor(matrix.length / dim));
  const overlap = Math.min(query.length, dim);
  const ascending = (a: SemanticHit, b: SemanticHit): number => a.score - b.score;

  // `best` is kept sorted ascending, so `best[0]` is always the weakest
  // retained hit and therefore the one to evict.
  const best: SemanticHit[] = [];
  for (let row = 0; row < rowCount; row++) {
    const base = row * dim;
    let score = 0;
    for (let d = 0; d < overlap; d++) score += matrix[base + d]! * query[d]!;

    if (best.length < k) {
      best.push({ nodeId: ids[row]!, score });
      best.sort(ascending);
    } else if (score > best[0]!.score) {
      best[0] = { nodeId: ids[row]!, score };
      best.sort(ascending);
    }
  }
  return best.sort((a, b) => b.score - a.score);
}

/** Decoded snapshot of every cached embedding for a single model. */
export interface CachedEmbeddings {
  /** Row-major matrix; the row for `ids[i]` starts at offset `i * dim`. */
  matrix: Float32Array;
  /** Node id for each matrix row, in row order. */
  ids: string[];
  /** Floats per row (0 when the snapshot is empty). */
  dim: number;
  /** Embedding model this snapshot was fetched for. */
  model: string;
}

/** Minimal source of stored embeddings that the cache can rebuild from. */
export interface EmbeddingFetcher {
  getAllEmbeddings(model: string): Array<{ nodeId: string; embedding: Buffer | Uint8Array }>;
}

/**
 * In-memory cache of every embedding for one model, decoded once into a
 * flat `Float32Array` matrix so similarity queries do not have to
 * re-fetch and re-decode the stored blobs each time.
 *
 * Holds at most one snapshot; asking for a different model replaces it.
 * The cache never observes writes by itself: the owner must call
 * `invalidate()` whenever the stored embeddings change (for example
 * after new embeddings are generated or the store is cleared). Until
 * then `get()` keeps returning the previous snapshot.
 */
export class EmbeddingCache {
  private cached: CachedEmbeddings | null = null;

  /**
   * Return the snapshot for `model`, rebuilding it from `fetcher` on a
   * miss (nothing cached, or cached for another model).
   *
   * Rows whose vector length differs from the dominant length are
   * dropped, and the matrix is packed from the kept rows only, so
   * `ids[i]` always lines up with row `i`.
   *
   * The returned object is owned by the cache — callers must not
   * mutate it.
   */
  get(fetcher: EmbeddingFetcher, model: string): CachedEmbeddings {
    if (this.cached !== null && this.cached.model === model) {
      return this.cached;
    }

    const decoded = fetcher.getAllEmbeddings(model).map((row) => ({
      nodeId: row.nodeId,
      vector: bytesToVector(row.embedding),
    }));

    // Use the most common vector length as the reference rather than
    // whatever row happens to come first: if the first row is a stale
    // leftover of a different dimension, anchoring on it would discard
    // every current vector and keep only the stale ones.
    const dim = EmbeddingCache.dominantLength(decoded.map((entry) => entry.vector.length));
    const kept = dim > 0 ? decoded.filter((entry) => entry.vector.length === dim) : [];

    const matrix = new Float32Array(kept.length * dim);
    const ids: string[] = [];
    kept.forEach((entry, row) => {
      matrix.set(entry.vector, row * dim);
      ids.push(entry.nodeId);
    });

    this.cached = { matrix, ids, dim, model };
    return this.cached;
  }

  /** Drop the snapshot. The next `get()` rebuilds it from the fetcher. */
  invalidate(): void {
    this.cached = null;
  }

  /**
   * Most frequent non-zero length in `lengths`; ties go to the length
   * seen first. Returns 0 when there is no non-empty vector at all.
   */
  private static dominantLength(lengths: ReadonlyArray<number>): number {
    const counts = new Map<number, number>();
    for (const len of lengths) {
      if (len > 0) counts.set(len, (counts.get(len) ?? 0) + 1);
    }
    let winner = 0;
    let winnerCount = 0;
    // Map iteration follows insertion order, so a strict `>` keeps the
    // first-seen length on ties.
    for (const [len, count] of counts) {
      if (count > winnerCount) {
        winner = len;
        winnerCount = count;
      }
    }
    return winner;
  }
}