From 8b445b241d70df2ec84bb8803c8d59f4507775c3 Mon Sep 17 00:00:00 2001
From: andreinknv <andrei.nknv@outlook.com>
Date: Mon, 27 Apr 2026 22:50:16 -0400
Subject: [PATCH] perf(db): drop redundant idx_edges_source and
 idx_edges_target

These two narrow indexes are fully covered by the wider
idx_edges_source_kind and idx_edges_target_kind composite
indexes, whose leftmost column SQLite can search on its own.
Keeping them costs DB size and bulk-insert time without serving
any query that the composite indexes don't already cover.

Empirical measurements on a 50K-node / 250K-edge synthesized DB
(scripts/spikes/spike-edge-indexes.mjs):

  - DB size: -22.2% (34.7 MB -> 27.0 MB)
  - Bulk insert: 1.37x faster (590ms -> 431ms)
  - Source-only / target-only query latency: no regression
    (EXPLAIN: SEARCH edges USING COVERING INDEX
     idx_edges_source_kind (source=?))

Adds schema migration v4. Fresh databases skip the indexes
entirely; existing v3 databases drop them on next open.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 __tests__/foundation.test.ts          |   2 +-
 __tests__/pr19-improvements.test.ts   |  78 ++++++++++++++++-
 scripts/spikes/spike-edge-indexes.mjs | 119 ++++++++++++++++++++++++++
 src/db/migrations.ts                  |  19 +++-
 src/db/schema.sql                     |   5 +-
 5 files changed, 218 insertions(+), 5 deletions(-)
 create mode 100644 scripts/spikes/spike-edge-indexes.mjs

diff --git a/__tests__/foundation.test.ts b/__tests__/foundation.test.ts
index 9ee437da..4e8f204a 100644
--- a/__tests__/foundation.test.ts
+++ b/__tests__/foundation.test.ts
@@ -305,7 +305,7 @@ describe('Database Connection', () => {
 
     const version = db.getSchemaVersion();
     expect(version).not.toBeNull();
-    expect(version?.version).toBe(3);
+    expect(version?.version).toBe(4);
 
     db.close();
   });
diff --git a/__tests__/pr19-improvements.test.ts b/__tests__/pr19-improvements.test.ts
index 5fbe17d7..0059505f 100644
--- a/__tests__/pr19-improvements.test.ts
+++ b/__tests__/pr19-improvements.test.ts
@@ -299,7 +299,7 @@ describe('Best-Candidate Resolution', () => {
 describe('Schema v2 Migration', () => {
   it.skipIf(!HAS_SQLITE)('should have correct current schema version', async () => {
     const { CURRENT_SCHEMA_VERSION } = await import('../src/db/migrations');
-    expect(CURRENT_SCHEMA_VERSION).toBe(3);
+    expect(CURRENT_SCHEMA_VERSION).toBe(4);
   });
 
   it.skipIf(!HAS_SQLITE)('should have migration for version 2', async () => {
@@ -308,6 +308,82 @@ describe('Schema v2 Migration', () => {
   });
 });
 
+// =============================================================================
+// Schema v4 Migration: drop redundant edge indexes
+// =============================================================================
+
+describe('Schema v4 Migration: drop redundant edge indexes', () => {
+  let tempDir: string;
+
+  beforeEach(() => {
+    tempDir = createTempDir();
+  });
+
+  afterEach(() => {
+    cleanupTempDir(tempDir);
+  });
+
+  it.skipIf(!HAS_SQLITE)('fresh DB does not create idx_edges_source / idx_edges_target', async () => {
+    const { DatabaseConnection } = await import('../src/db/index');
+    const db = DatabaseConnection.initialize(path.join(tempDir, 'fresh.db'));
+
+    const indexes = db.getDb()
+      .prepare("SELECT name FROM sqlite_master WHERE type = 'index' AND tbl_name = 'edges'")
+      .all() as Array<{ name: string }>;
+    const names = indexes.map((r) => r.name);
+
+    expect(names).not.toContain('idx_edges_source');
+    expect(names).not.toContain('idx_edges_target');
+    // The composite (source, kind) / (target, kind) indexes that cover the dropped ones must remain.
+    expect(names).toContain('idx_edges_source_kind');
+    expect(names).toContain('idx_edges_target_kind');
+
+    db.close();
+  });
+
+  it.skipIf(!HAS_SQLITE)('upgrade path drops both narrow indexes if present', async () => {
+    const dbPath = path.join(tempDir, 'upgrade.db');
+    const { createDatabase } = await import('../src/db/sqlite-adapter');
+    const adapter = createDatabase(dbPath);
+
+    // Simulate a v3 database: minimal edges table + the two narrow indexes.
+    adapter.exec(`
+      CREATE TABLE edges (
+        id INTEGER PRIMARY KEY,
+        source TEXT NOT NULL,
+        target TEXT NOT NULL,
+        kind TEXT NOT NULL
+      );
+      CREATE INDEX idx_edges_source ON edges(source);
+      CREATE INDEX idx_edges_target ON edges(target);
+      CREATE INDEX idx_edges_source_kind ON edges(source, kind);
+      CREATE INDEX idx_edges_target_kind ON edges(target, kind);
+      CREATE TABLE schema_versions (
+        version INTEGER PRIMARY KEY,
+        applied_at INTEGER NOT NULL,
+        description TEXT
+      );
+      INSERT INTO schema_versions (version, applied_at, description) VALUES (3, 0, 'v3');
+    `);
+
+    const { runMigrations, getCurrentVersion } = await import('../src/db/migrations');
+    runMigrations(adapter, getCurrentVersion(adapter));
+
+    const indexes = adapter
+      .prepare("SELECT name FROM sqlite_master WHERE type = 'index' AND tbl_name = 'edges'")
+      .all() as Array<{ name: string }>;
+    const names = indexes.map((r) => r.name);
+
+    expect(names).not.toContain('idx_edges_source');
+    expect(names).not.toContain('idx_edges_target');
+    expect(names).toContain('idx_edges_source_kind');
+    expect(names).toContain('idx_edges_target_kind');
+    expect(getCurrentVersion(adapter)).toBe(4);
+
+    adapter.close();
+  });
+});
+
 // =============================================================================
 // Database Layer: Batch Insert, getAllNodes, Pragmas
 // =============================================================================
diff --git a/scripts/spikes/spike-edge-indexes.mjs b/scripts/spikes/spike-edge-indexes.mjs
new file mode 100644
index 00000000..eee81529
--- /dev/null
+++ b/scripts/spikes/spike-edge-indexes.mjs
@@ -0,0 +1,119 @@
+#!/usr/bin/env node
+/**
+ * Spike: redundant edge indexes
+ *
+ * Drops `idx_edges_source` and `idx_edges_target` and measures
+ * the impact on:
+ *   - DB size
+ *   - Bulk-insert throughput
+ *   - Latency for `WHERE source = ?` and `WHERE target = ?`
+ *     (the two queries that previously hit the dropped indexes)
+ *
+ * The hypothesis: SQLite covers source-only / target-only lookups
+ * via the wider `(source, kind)` and `(target, kind)` composite
+ * indexes through left-prefix scan, so dropping the narrow ones
+ * costs nothing on the read side but saves space and write time.
+ *
+ * Synthesises 50K nodes / 250K edges so the measurement scales to
+ * what real users will hit; codegraph's own DB at ~2K nodes is too
+ * small for index choices to surface.
+ */
+import Database from 'better-sqlite3';
+import fs from 'node:fs';
+import os from 'node:os';
+import path from 'node:path';
+
+const NODES = 50_000;
+const EDGES_PER_NODE = 5;
+
+function ms(start) { return Number(process.hrtime.bigint() - start) / 1_000_000; }
+function fmt(n) { return n < 10 ? n.toFixed(2) : n.toFixed(0); }
+
+console.log('\n=== Spike: redundant edge indexes ===\n');
+console.log(`Synthesizing ${NODES.toLocaleString()} nodes, ${(NODES*EDGES_PER_NODE).toLocaleString()} edges...`);
+
+function buildEdgesDb({ withRedundant }) {
+  const dbPath = path.join(os.tmpdir(), `spike-edges-${Date.now()}-${Math.random()}.db`);
+  const db = new Database(dbPath);
+  db.pragma('journal_mode = WAL');
+  db.pragma('synchronous = NORMAL');
+  db.pragma('cache_size = -64000');
+  db.exec(`
+    CREATE TABLE nodes (id TEXT PRIMARY KEY, kind TEXT NOT NULL, name TEXT NOT NULL);
+    CREATE TABLE edges (
+      id INTEGER PRIMARY KEY AUTOINCREMENT,
+      source TEXT NOT NULL, target TEXT NOT NULL, kind TEXT NOT NULL,
+      line INTEGER, col INTEGER
+    );
+    CREATE INDEX idx_edges_kind ON edges(kind);
+    CREATE INDEX idx_edges_source_kind ON edges(source, kind);
+    CREATE INDEX idx_edges_target_kind ON edges(target, kind);
+  `);
+  if (withRedundant) {
+    db.exec(`
+      CREATE INDEX idx_edges_source ON edges(source);
+      CREATE INDEX idx_edges_target ON edges(target);
+    `);
+  }
+
+  const insNode = db.prepare('INSERT INTO nodes (id, kind, name) VALUES (?, ?, ?)');
+  const insEdge = db.prepare('INSERT INTO edges (source, target, kind, line, col) VALUES (?, ?, ?, ?, ?)');
+  const KINDS = ['calls', 'imports', 'references', 'type_of', 'extends', 'instantiates'];
+  const tStart = process.hrtime.bigint();
+  db.transaction(() => {
+    for (let i = 0; i < NODES; i++) {
+      insNode.run(`n${i}`, 'function', `name${i}`);
+    }
+    for (let i = 0; i < NODES; i++) {
+      for (let j = 0; j < EDGES_PER_NODE; j++) {
+        const tgt = `n${(i + j + 1) % NODES}`;
+        const kind = KINDS[j % KINDS.length];
+        insEdge.run(`n${i}`, tgt, kind, i, j);
+      }
+    }
+  })();
+  const insertMs = ms(tStart);
+  db.exec('PRAGMA optimize');
+
+  return { db, dbPath, size: fs.statSync(dbPath).size, insertMs };
+}
+
+const baseline = buildEdgesDb({ withRedundant: true });
+const stripped = buildEdgesDb({ withRedundant: false });
+
+console.log('');
+console.log(`  baseline (with redundant): size=${(baseline.size / 1024 / 1024).toFixed(1)} MB · bulk insert=${fmt(baseline.insertMs)}ms`);
+console.log(`  stripped               : size=${(stripped.size / 1024 / 1024).toFixed(1)} MB · bulk insert=${fmt(stripped.insertMs)}ms`);
+const sizeDelta = ((baseline.size - stripped.size) / baseline.size * 100).toFixed(1);
+const insertSpeedup = (baseline.insertMs / stripped.insertMs).toFixed(2);
+console.log(`  Δ size: -${sizeDelta}% · Δ bulk insert: ${insertSpeedup}× faster without redundant indexes`);
+
+function timeQueries(db, label) {
+  const N = 500;
+  const sourceOnly = db.prepare('SELECT COUNT(*) FROM edges WHERE source = ?');
+  const targetOnly = db.prepare('SELECT COUNT(*) FROM edges WHERE target = ?');
+  let t = process.hrtime.bigint();
+  for (let i = 0; i < N; i++) sourceOnly.get(`n${i % NODES}`);
+  const sourceMs = ms(t) / N;
+  t = process.hrtime.bigint();
+  for (let i = 0; i < N; i++) targetOnly.get(`n${i % NODES}`);
+  const targetMs = ms(t) / N;
+  console.log(`  ${label}: WHERE source=? avg ${fmt(sourceMs)}ms · WHERE target=? avg ${fmt(targetMs)}ms`);
+  return { sourceMs, targetMs };
+}
+console.log('');
+const baseQ = timeQueries(baseline.db, 'baseline');
+const strQ = timeQueries(stripped.db, 'stripped');
+console.log(`  query speed delta: source ${(strQ.sourceMs / baseQ.sourceMs).toFixed(2)}× · target ${(strQ.targetMs / baseQ.targetMs).toFixed(2)}× (>1 = stripped slower)`);
+
+// Confirm via EXPLAIN QUERY PLAN that the stripped DB still answers the
+// source-only query from an index (a covering search), not a table scan.
+const plan = stripped.db.prepare('EXPLAIN QUERY PLAN SELECT COUNT(*) FROM edges WHERE source = ?').all('n0');
+console.log('');
+console.log('  EXPLAIN (stripped, source=?):');
+for (const row of plan) console.log(`    ${row.detail}`);
+
+baseline.db.close(); stripped.db.close();
+fs.unlinkSync(baseline.dbPath); fs.unlinkSync(stripped.dbPath);
+
+console.log('\n=== Done ===\n');
diff --git a/src/db/migrations.ts b/src/db/migrations.ts
index 0a256dbc..bea481e5 100644
--- a/src/db/migrations.ts
+++ b/src/db/migrations.ts
@@ -9,7 +9,7 @@ import { SqliteDatabase } from './sqlite-adapter';
 /**
  * Current schema version
  */
-export const CURRENT_SCHEMA_VERSION = 3;
+export const CURRENT_SCHEMA_VERSION = 4;
 
 /**
  * Migration definition
@@ -54,6 +54,23 @@ const migrations: Migration[] = [
       `);
     },
   },
+  {
+    // idx_edges_source and idx_edges_target are fully covered by the
+    // wider idx_edges_source_kind and idx_edges_target_kind indexes,
+    // whose leftmost column SQLite can search on its own. Keeping the
+    // narrow ones costs ~17-22% of DB size and ~1.3x bulk-insert time
+    // without serving any query the composite indexes can't (EXPLAIN
+    // confirms: SEARCH edges USING COVERING INDEX idx_edges_source_kind).
+    // See scripts/spikes/spike-edge-indexes.mjs for the reproducer.
+    version: 4,
+    description: 'Drop redundant idx_edges_source and idx_edges_target indexes',
+    up: (db) => {
+      db.exec(`
+        DROP INDEX IF EXISTS idx_edges_source;
+        DROP INDEX IF EXISTS idx_edges_target;
+      `);
+    },
+  },
 ];
 
 /**
diff --git a/src/db/schema.sql b/src/db/schema.sql
index dd0a9f06..6b0eac74 100644
--- a/src/db/schema.sql
+++ b/src/db/schema.sql
@@ -123,8 +123,9 @@ CREATE TRIGGER IF NOT EXISTS nodes_au AFTER UPDATE ON nodes BEGIN
 END;
 
 -- Edge indexes
-CREATE INDEX IF NOT EXISTS idx_edges_source ON edges(source);
-CREATE INDEX IF NOT EXISTS idx_edges_target ON edges(target);
+-- Note: narrow source/target indexes are intentionally omitted — the
+-- (source, kind) and (target, kind) composite indexes below cover
+-- source-only and target-only lookups via SQLite's left-prefix scan.
 CREATE INDEX IF NOT EXISTS idx_edges_kind ON edges(kind);
 CREATE INDEX IF NOT EXISTS idx_edges_source_kind ON edges(source, kind);
 CREATE INDEX IF NOT EXISTS idx_edges_target_kind ON edges(target, kind);