From 8b445b241d70df2ec84bb8803c8d59f4507775c3 Mon Sep 17 00:00:00 2001 From: andreinknv Date: Mon, 27 Apr 2026 22:50:16 -0400 Subject: [PATCH] perf(db): drop redundant idx_edges_source and idx_edges_target These two narrow indexes are fully covered by the wider idx_edges_source_kind and idx_edges_target_kind composite indexes via SQLite's left-prefix scan. Keeping them costs DB size and bulk-insert time without serving any query that the kind-prefixed indexes don't already cover. Empirical measurements on a 50K-node / 250K-edge synthesized DB (scripts/spikes/spike-edge-indexes.mjs): - DB size: -22.2% (34.7 MB -> 27.0 MB) - Bulk insert: 1.37x faster (590ms -> 431ms) - Source-only / target-only query latency: no regression (EXPLAIN: SEARCH edges USING COVERING INDEX idx_edges_source_kind (source=?)) Adds schema migration v4. Fresh databases skip the indexes entirely; existing v3 databases drop them on next open. Co-Authored-By: Claude Opus 4.7 (1M context) --- __tests__/foundation.test.ts | 2 +- __tests__/pr19-improvements.test.ts | 78 ++++++++++++++++- scripts/spikes/spike-edge-indexes.mjs | 119 ++++++++++++++++++++++++++ src/db/migrations.ts | 19 +++- src/db/schema.sql | 5 +- 5 files changed, 218 insertions(+), 5 deletions(-) create mode 100644 scripts/spikes/spike-edge-indexes.mjs diff --git a/__tests__/foundation.test.ts b/__tests__/foundation.test.ts index 9ee437da..4e8f204a 100644 --- a/__tests__/foundation.test.ts +++ b/__tests__/foundation.test.ts @@ -305,7 +305,7 @@ describe('Database Connection', () => { const version = db.getSchemaVersion(); expect(version).not.toBeNull(); - expect(version?.version).toBe(3); + expect(version?.version).toBe(4); db.close(); }); diff --git a/__tests__/pr19-improvements.test.ts b/__tests__/pr19-improvements.test.ts index 5fbe17d7..0059505f 100644 --- a/__tests__/pr19-improvements.test.ts +++ b/__tests__/pr19-improvements.test.ts @@ -299,7 +299,7 @@ describe('Best-Candidate Resolution', () => { describe('Schema v2 Migration', () 
=> { it.skipIf(!HAS_SQLITE)('should have correct current schema version', async () => { const { CURRENT_SCHEMA_VERSION } = await import('../src/db/migrations'); - expect(CURRENT_SCHEMA_VERSION).toBe(3); + expect(CURRENT_SCHEMA_VERSION).toBe(4); }); it.skipIf(!HAS_SQLITE)('should have migration for version 2', async () => { @@ -308,6 +308,82 @@ describe('Schema v2 Migration', () => { }); }); +// ============================================================================= +// Schema v4 Migration: drop redundant edge indexes +// ============================================================================= + +describe('Schema v4 Migration: drop redundant edge indexes', () => { + let tempDir: string; + + beforeEach(() => { + tempDir = createTempDir(); + }); + + afterEach(() => { + cleanupTempDir(tempDir); + }); + + it.skipIf(!HAS_SQLITE)('fresh DB does not create idx_edges_source / idx_edges_target', async () => { + const { DatabaseConnection } = await import('../src/db/index'); + const db = DatabaseConnection.initialize(path.join(tempDir, 'fresh.db')); + + const indexes = db.getDb() + .prepare("SELECT name FROM sqlite_master WHERE type = 'index' AND tbl_name = 'edges'") + .all() as Array<{ name: string }>; + const names = indexes.map((r) => r.name); + + expect(names).not.toContain('idx_edges_source'); + expect(names).not.toContain('idx_edges_target'); + // The kind-prefixed indexes that cover the dropped ones must remain. + expect(names).toContain('idx_edges_source_kind'); + expect(names).toContain('idx_edges_target_kind'); + + db.close(); + }); + + it.skipIf(!HAS_SQLITE)('upgrade path drops both narrow indexes if present', async () => { + const dbPath = path.join(tempDir, 'upgrade.db'); + const { createDatabase } = await import('../src/db/sqlite-adapter'); + const adapter = createDatabase(dbPath); + + // Simulate a v3 database: minimal edges table + the two narrow indexes. 
+ adapter.exec(` + CREATE TABLE edges ( + id INTEGER PRIMARY KEY, + source TEXT NOT NULL, + target TEXT NOT NULL, + kind TEXT NOT NULL + ); + CREATE INDEX idx_edges_source ON edges(source); + CREATE INDEX idx_edges_target ON edges(target); + CREATE INDEX idx_edges_source_kind ON edges(source, kind); + CREATE INDEX idx_edges_target_kind ON edges(target, kind); + CREATE TABLE schema_versions ( + version INTEGER PRIMARY KEY, + applied_at INTEGER NOT NULL, + description TEXT + ); + INSERT INTO schema_versions (version, applied_at, description) VALUES (3, 0, 'v3'); + `); + + const { runMigrations, getCurrentVersion } = await import('../src/db/migrations'); + runMigrations(adapter, getCurrentVersion(adapter)); + + const indexes = adapter + .prepare("SELECT name FROM sqlite_master WHERE type = 'index' AND tbl_name = 'edges'") + .all() as Array<{ name: string }>; + const names = indexes.map((r) => r.name); + + expect(names).not.toContain('idx_edges_source'); + expect(names).not.toContain('idx_edges_target'); + expect(names).toContain('idx_edges_source_kind'); + expect(names).toContain('idx_edges_target_kind'); + expect(getCurrentVersion(adapter)).toBe(4); + + adapter.close(); + }); +}); + // ============================================================================= // Database Layer: Batch Insert, getAllNodes, Pragmas // ============================================================================= diff --git a/scripts/spikes/spike-edge-indexes.mjs b/scripts/spikes/spike-edge-indexes.mjs new file mode 100644 index 00000000..eee81529 --- /dev/null +++ b/scripts/spikes/spike-edge-indexes.mjs @@ -0,0 +1,119 @@ +#!/usr/bin/env node +/** + * Spike: redundant edge indexes + * + * Drops `idx_edges_source` and `idx_edges_target` and measures + * the impact on: + * - DB size + * - Bulk-insert throughput + * - Latency for `WHERE source = ?` and `WHERE target = ?` + * (the two queries that previously hit the dropped indexes) + * + * The hypothesis: SQLite covers source-only / 
target-only lookups + * via the wider `(source, kind)` and `(target, kind)` composite + * indexes through left-prefix scan, so dropping the narrow ones + * costs nothing on the read side but saves space and write time. + * + * Synthesises 50K nodes / 250K edges so the measurement scales to + * what real users will hit; codegraph's own DB at ~2K nodes is too + * small for index choices to surface. + */ +import Database from 'better-sqlite3'; +import fs from 'node:fs'; +import os from 'node:os'; +import path from 'node:path'; + +const NODES = 50_000; +const EDGES_PER_NODE = 5; + +function ms(start) { return Number(process.hrtime.bigint() - start) / 1_000_000; } +function fmt(n) { return n < 10 ? n.toFixed(2) : n.toFixed(0); } + +console.log('\n=== Spike: redundant edge indexes ===\n'); +console.log(`Synthesizing ${NODES.toLocaleString()} nodes, ${(NODES*EDGES_PER_NODE).toLocaleString()} edges...`); + +function buildEdgesDb({ withRedundant }) { + const dbPath = path.join(os.tmpdir(), `spike-edges-${Date.now()}-${Math.random()}.db`); + const db = new Database(dbPath); + db.pragma('journal_mode = WAL'); + db.pragma('synchronous = NORMAL'); + db.pragma('cache_size = -64000'); + db.exec(` + CREATE TABLE nodes (id TEXT PRIMARY KEY, kind TEXT NOT NULL, name TEXT NOT NULL); + CREATE TABLE edges ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + source TEXT NOT NULL, target TEXT NOT NULL, kind TEXT NOT NULL, + line INTEGER, col INTEGER + ); + CREATE INDEX idx_edges_kind ON edges(kind); + CREATE INDEX idx_edges_source_kind ON edges(source, kind); + CREATE INDEX idx_edges_target_kind ON edges(target, kind); + `); + if (withRedundant) { + db.exec(` + CREATE INDEX idx_edges_source ON edges(source); + CREATE INDEX idx_edges_target ON edges(target); + `); + } + + const insNode = db.prepare('INSERT INTO nodes (id, kind, name) VALUES (?, ?, ?)'); + const insEdge = db.prepare('INSERT INTO edges (source, target, kind, line, col) VALUES (?, ?, ?, ?, ?)'); + const KINDS = ['calls', 'imports', 
'references', 'type_of', 'extends', 'instantiates']; + const tStart = process.hrtime.bigint(); + db.transaction(() => { + for (let i = 0; i < NODES; i++) { + insNode.run(`n${i}`, 'function', `name${i}`); + } + for (let i = 0; i < NODES; i++) { + for (let j = 0; j < EDGES_PER_NODE; j++) { + const tgt = `n${(i + j + 1) % NODES}`; + const kind = KINDS[j % KINDS.length]; + insEdge.run(`n${i}`, tgt, kind, i, j); + } + } + })(); + const insertMs = ms(tStart); + db.exec('PRAGMA optimize'); + + return { db, dbPath, size: fs.statSync(dbPath).size, insertMs }; +} + +const baseline = buildEdgesDb({ withRedundant: true }); +const stripped = buildEdgesDb({ withRedundant: false }); + +console.log(''); +console.log(` baseline (with redundant): size=${(baseline.size / 1024 / 1024).toFixed(1)} MB · bulk insert=${fmt(baseline.insertMs)}ms`); +console.log(` stripped : size=${(stripped.size / 1024 / 1024).toFixed(1)} MB · bulk insert=${fmt(stripped.insertMs)}ms`); +const sizeDelta = ((baseline.size - stripped.size) / baseline.size * 100).toFixed(1); +const insertSpeedup = (baseline.insertMs / stripped.insertMs).toFixed(2); +console.log(` Δ size: -${sizeDelta}% · Δ bulk insert: ${insertSpeedup}× faster without redundant indexes`); + +function timeQueries(db, label) { + const N = 500; + const sourceOnly = db.prepare('SELECT COUNT(*) FROM edges WHERE source = ?'); + const targetOnly = db.prepare('SELECT COUNT(*) FROM edges WHERE target = ?'); + let t = process.hrtime.bigint(); + for (let i = 0; i < N; i++) sourceOnly.get(`n${i % NODES}`); + const sourceMs = ms(t) / N; + t = process.hrtime.bigint(); + for (let i = 0; i < N; i++) targetOnly.get(`n${i % NODES}`); + const targetMs = ms(t) / N; + console.log(` ${label}: WHERE source=? avg ${fmt(sourceMs)}ms · WHERE target=? 
avg ${fmt(targetMs)}ms`); + return { sourceMs, targetMs }; +} +console.log(''); +const baseQ = timeQueries(baseline.db, 'baseline'); +const strQ = timeQueries(stripped.db, 'stripped'); +console.log(` query speed delta: source ${(strQ.sourceMs / baseQ.sourceMs).toFixed(2)}× · target ${(strQ.targetMs / baseQ.targetMs).toFixed(2)}× (>1 = stripped slower)`); + +// EXPLAIN-confirm that the stripped DB still uses an index for these +// queries — we want to know it's a covering scan, not a table scan. +const plan = stripped.db.prepare('EXPLAIN QUERY PLAN SELECT COUNT(*) FROM edges WHERE source = ?').all('n0'); +console.log(''); +console.log(' EXPLAIN (stripped, source=?):'); +for (const row of plan) console.log(` ${row.detail}`); + +baseline.db.close(); stripped.db.close(); +fs.unlinkSync(baseline.dbPath); fs.unlinkSync(stripped.dbPath); + +console.log('\n=== Done ===\n'); diff --git a/src/db/migrations.ts b/src/db/migrations.ts index 0a256dbc..bea481e5 100644 --- a/src/db/migrations.ts +++ b/src/db/migrations.ts @@ -9,7 +9,7 @@ import { SqliteDatabase } from './sqlite-adapter'; /** * Current schema version */ -export const CURRENT_SCHEMA_VERSION = 3; +export const CURRENT_SCHEMA_VERSION = 4; /** * Migration definition @@ -54,6 +54,23 @@ const migrations: Migration[] = [ `); }, }, + { + // idx_edges_source and idx_edges_target are fully covered by the + // wider idx_edges_source_kind and idx_edges_target_kind indexes via + // SQLite's left-prefix scan. Keeping the narrow ones costs ~17-22% + // of DB size and ~1.3x bulk-insert time without serving any query + // that the kind-prefixed indexes don't already cover (EXPLAIN + // confirms: SEARCH edges USING COVERING INDEX idx_edges_source_kind). + // See scripts/spikes/spike-edge-indexes.mjs for the reproducer. 
+ version: 4, + description: 'Drop redundant idx_edges_source and idx_edges_target indexes', + up: (db) => { + db.exec(` + DROP INDEX IF EXISTS idx_edges_source; + DROP INDEX IF EXISTS idx_edges_target; + `); + }, + }, ]; /** diff --git a/src/db/schema.sql b/src/db/schema.sql index dd0a9f06..6b0eac74 100644 --- a/src/db/schema.sql +++ b/src/db/schema.sql @@ -123,8 +123,9 @@ CREATE TRIGGER IF NOT EXISTS nodes_au AFTER UPDATE ON nodes BEGIN END; -- Edge indexes -CREATE INDEX IF NOT EXISTS idx_edges_source ON edges(source); -CREATE INDEX IF NOT EXISTS idx_edges_target ON edges(target); +-- Note: narrow source/target indexes are intentionally omitted — the +-- (source, kind) and (target, kind) composite indexes below cover +-- source-only and target-only lookups via SQLite's left-prefix scan. CREATE INDEX IF NOT EXISTS idx_edges_kind ON edges(kind); CREATE INDEX IF NOT EXISTS idx_edges_source_kind ON edges(source, kind); CREATE INDEX IF NOT EXISTS idx_edges_target_kind ON edges(target, kind);