diff --git a/CLAUDE.md b/CLAUDE.md
index 71a50c73..b0f3a660 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -98,7 +98,9 @@ SQLite database with:
 
 ### Supported Languages
 
-TypeScript, JavaScript, TSX, JSX, Svelte, Python, Go, Rust, Java, C, C++, C#, PHP, Ruby, Swift, Kotlin, Dart, Liquid, Pascal
+TypeScript, JavaScript, TSX, JSX, Svelte, Python, Go, Rust, Java, C, C++, C#, PHP, Ruby, Swift, Kotlin, Dart, Liquid, Pascal, R
+
+To add a new language, follow the cookbook at [`docs/ADDING-A-LANGUAGE.md`](docs/ADDING-A-LANGUAGE.md).
 
 ### Node and Edge Types
diff --git a/README.md b/README.md
index fd1ffaba..3303fc0e 100644
--- a/README.md
+++ b/README.md
@@ -316,7 +316,7 @@ fi
 
 ## MCP Tools
 
-When running as an MCP server, CodeGraph exposes these tools to Claude Code:
+When running as an MCP server, CodeGraph exposes these tools to any MCP-compatible AI assistant:
 
 | Tool | Purpose |
 |------|---------|
@@ -331,6 +331,110 @@ When running as an MCP server, CodeGraph exposes these tools to Claude Code:
 
 ---
 
+## Using with Other MCP Clients
+
+The MCP server runs over **stdio** and works with any MCP-compatible client — not just Claude Code. The interactive installer is Claude Code-specific (it writes `~/.claude.json`), so for other clients you'll want the manual setup.
+
+**Common steps for every client:**
+
+```bash
+npm install -g @colbymchenry/codegraph   # so `codegraph` is on PATH
+cd your-project
+codegraph init -i                        # initialize + index this project
+```
+
+Then point your MCP client at `codegraph serve --mcp` using whatever config shape it expects:
+
+### opencode
+
+In `opencode.json` (project) or `~/.config/opencode/opencode.json` (global):
+
+```json
+{
+  "$schema": "https://opencode.ai/config.json",
+  "mcp": {
+    "codegraph": {
+      "type": "local",
+      "command": ["codegraph", "serve", "--mcp"],
+      "enabled": true
+    }
+  }
+}
+```
+
+### Cursor
+
+In `~/.cursor/mcp.json` (global) or `.cursor/mcp.json` (project):
+
+```json
+{
+  "mcpServers": {
+    "codegraph": {
+      "command": "codegraph",
+      "args": ["serve", "--mcp"]
+    }
+  }
+}
+```
+
+### LangChain (`MultiServerMCPClient`)
+
+The CodeGraph server speaks stdio, not SSE — pass `transport: "stdio"`:
+
+```python
+from langchain_mcp_adapters.client import MultiServerMCPClient
+
+client = MultiServerMCPClient({
+    "codegraph": {
+        "command": "codegraph",
+        "args": ["serve", "--mcp"],
+        "transport": "stdio",
+    }
+})
+tools = await client.get_tools()
+```
+
+### Claude Agent SDK
+
+Pass the server in `mcpServers` (TypeScript) or `mcp_servers` (Python) when calling `query()`:
+
+```python
+from claude_agent_sdk import query, ClaudeAgentOptions
+
+options = ClaudeAgentOptions(
+    mcp_servers={
+        "codegraph": {
+            "command": "codegraph",
+            "args": ["serve", "--mcp"],
+        }
+    },
+    allowed_tools=["mcp__codegraph__*"],
+)
+
+async for message in query(prompt="Where is auth handled?", options=options):
+    ...
+```
+
+### Anything else (generic stdio MCP)
+
+Most MCP clients (Continue, Zed, custom integrations, etc.) accept some variation of `command` + `args`. The values are always:
+
+| Field | Value |
+|-------|-------|
+| Command | `codegraph` |
+| Args | `["serve", "--mcp"]` |
+| Transport | `stdio` |
+
+The server reads the project root from the MCP `initialize` request's `rootUri` (set by the client when it connects). If your client doesn't send a `rootUri`, pass the project path explicitly:
+
+```bash
+codegraph serve --mcp --path /absolute/path/to/project
+```
+
+> **Note:** CodeGraph's MCP server does **not** speak SSE/HTTP.
+> If your client only supports `url` + `transport: "sse"`, you'll need to wrap stdio with a bridge like [supergateway](https://github.com/supercorp-ai/supergateway).
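+
+For example, the bridge can be a single command (a sketch — the exact flags may differ across supergateway versions, so check its README):
+
+```bash
+npx -y supergateway --stdio "codegraph serve --mcp" --port 8000
+```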
+
+---
+
 ## Library Usage
 
 ```typescript
@@ -402,6 +506,8 @@ The `.codegraph/config.json` file controls indexing:
 | Liquid | `.liquid` | Full support |
 | Pascal / Delphi | `.pas`, `.dpr`, `.dpk`, `.lpr` | Full support (classes, records, interfaces, enums, DFM/FMX form files) |
 
+Want to add another language? See [`docs/ADDING-A-LANGUAGE.md`](docs/ADDING-A-LANGUAGE.md) — it walks through sourcing a tree-sitter grammar, probing the AST, and choosing between the OO and self-contained extractor patterns, with worked examples in the existing extractors to crib from.
+
 ## Troubleshooting
 
 **"CodeGraph not initialized"** — Run `codegraph init` in your project directory first.
diff --git a/__tests__/centrality.test.ts b/__tests__/centrality.test.ts
new file mode 100644
index 00000000..e45dc858
--- /dev/null
+++ b/__tests__/centrality.test.ts
@@ -0,0 +1,134 @@
+import { describe, it, expect } from 'vitest';
+import { computePageRank, PR_DAMPING, PR_ITERATIONS } from '../src/centrality';
+
+function asNodes(ids: string[]) {
+  return ids.map((id) => ({ id }));
+}
+
+describe('computePageRank', () => {
+  it('returns empty result for an empty graph', () => {
+    const r = computePageRank([], []);
+    expect(r.scores.size).toBe(0);
+    expect(r.iterations).toBe(0);
+  });
+
+  it('assigns uniform rank to N isolated nodes', () => {
+    const r = computePageRank(asNodes(['a', 'b', 'c', 'd']), []);
+    expect(r.scores.size).toBe(4);
+    // 4 isolated nodes — all dangling — should each end up with 1/N.
+    for (const v of r.scores.values()) {
+      expect(v).toBeCloseTo(0.25, 6);
+    }
+  });
+
+  it('rewards being reached (sinks accumulate rank)', () => {
+    // a -> b -> c. c has no outgoing, so it accumulates the most.
+    const r = computePageRank(
+      asNodes(['a', 'b', 'c']),
+      [
+        { source: 'a', target: 'b' },
+        { source: 'b', target: 'c' },
+      ]
+    );
+    const a = r.scores.get('a')!;
+    const b = r.scores.get('b')!;
+    const c = r.scores.get('c')!;
+    expect(c).toBeGreaterThan(b);
+    expect(b).toBeGreaterThan(a);
+  });
+
+  it('star: hub ranks above all leaves; leaves are equal', () => {
+    const leaves = ['l1', 'l2', 'l3', 'l4', 'l5', 'l6', 'l7', 'l8', 'l9'];
+    const edges = leaves.map((l) => ({ source: l, target: 'hub' }));
+    const r = computePageRank(asNodes([...leaves, 'hub']), edges);
+    const hub = r.scores.get('hub')!;
+    for (const l of leaves) {
+      const lv = r.scores.get(l)!;
+      expect(hub).toBeGreaterThan(lv);
+    }
+    // Leaves are symmetric — should be within 1e-9.
+    const first = r.scores.get(leaves[0])!;
+    for (const l of leaves.slice(1)) {
+      expect(r.scores.get(l)!).toBeCloseTo(first, 9);
+    }
+  });
+
+  it('cycle: all nodes have approximately equal rank', () => {
+    const r = computePageRank(
+      asNodes(['a', 'b', 'c']),
+      [
+        { source: 'a', target: 'b' },
+        { source: 'b', target: 'c' },
+        { source: 'c', target: 'a' },
+      ]
+    );
+    const a = r.scores.get('a')!;
+    const b = r.scores.get('b')!;
+    const c = r.scores.get('c')!;
+    // Symmetric → all equal at convergence.
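+    // (With damping d and uniform teleport, symmetry forces the same fixed
+    // point s = (1 - d)/N + d·s at every node, i.e. s = 1/N ≈ 0.3333 here.)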
+    expect(a).toBeCloseTo(b, 6);
+    expect(b).toBeCloseTo(c, 6);
+  });
+
+  it('total rank sums to ~1 (mass is conserved)', () => {
+    const r = computePageRank(
+      asNodes(['a', 'b', 'c', 'd', 'e']),
+      [
+        { source: 'a', target: 'b' },
+        { source: 'b', target: 'c' },
+        { source: 'd', target: 'c' },
+        { source: 'e', target: 'd' },
+        { source: 'a', target: 'e' },
+      ]
+    );
+    let sum = 0;
+    for (const v of r.scores.values()) sum += v;
+    expect(sum).toBeCloseTo(1, 6);
+  });
+
+  it('preserves mass across two disconnected components', () => {
+    const r = computePageRank(
+      asNodes(['a', 'b', 'c', 'd']),
+      [
+        { source: 'a', target: 'b' },
+        { source: 'c', target: 'd' },
+      ]
+    );
+    let sum = 0;
+    for (const v of r.scores.values()) sum += v;
+    expect(sum).toBeCloseTo(1, 6);
+    // Within each component, the sink ranks above the source.
+    expect(r.scores.get('b')!).toBeGreaterThan(r.scores.get('a')!);
+    expect(r.scores.get('d')!).toBeGreaterThan(r.scores.get('c')!);
+  });
+
+  it('drops edges referencing unknown nodes', () => {
+    // 'ghost' is not in the node set — that edge should be ignored,
+    // not crash and not pollute scores.
+    const r = computePageRank(
+      asNodes(['a', 'b']),
+      [
+        { source: 'a', target: 'b' },
+        { source: 'a', target: 'ghost' },
+        { source: 'ghost', target: 'b' },
+      ]
+    );
+    expect(r.scores.size).toBe(2);
+    expect(r.scores.get('b')!).toBeGreaterThan(r.scores.get('a')!);
+    let sum = 0;
+    for (const v of r.scores.values()) sum += v;
+    expect(sum).toBeCloseTo(1, 6);
+  });
+
+  it('reports iteration count and duration', () => {
+    const r = computePageRank(asNodes(['a', 'b']), [{ source: 'a', target: 'b' }]);
+    expect(r.iterations).toBe(PR_ITERATIONS);
+    expect(r.durationMs).toBeGreaterThanOrEqual(0);
+  });
+
+  it('damping constant is the textbook 0.85', () => {
+    // Sentinel — protects against accidental tuning that would invalidate
+    // the spike findings the PR was justified on.
+    expect(PR_DAMPING).toBe(0.85);
+  });
+});
diff --git a/__tests__/churn.test.ts b/__tests__/churn.test.ts
new file mode 100644
index 00000000..fbe279f6
--- /dev/null
+++ b/__tests__/churn.test.ts
@@ -0,0 +1,208 @@
+import { describe, it, expect, beforeEach, afterEach } from 'vitest';
+import * as fs from 'fs';
+import * as os from 'os';
+import * as path from 'path';
+import { execFileSync } from 'child_process';
+import {
+  mineChurn,
+  getGitHead,
+  readFileLoc,
+  MAX_FILES_PER_COMMIT,
+  LAST_MINED_CHURN_HEAD_KEY,
+} from '../src/churn';
+
+let HAS_GIT = true;
+try {
+  execFileSync('git', ['--version'], { stdio: 'ignore' });
+} catch {
+  HAS_GIT = false;
+}
+
+let tempDir: string;
+
+function git(...args: string[]): string {
+  return execFileSync('git', args, {
+    cwd: tempDir,
+    encoding: 'utf-8',
+    env: {
+      ...process.env,
+      GIT_AUTHOR_NAME: 'Test',
+      GIT_AUTHOR_EMAIL: 'test@example.com',
+      GIT_COMMITTER_NAME: 'Test',
+      GIT_COMMITTER_EMAIL: 'test@example.com',
+      GIT_AUTHOR_DATE: process.env.GIT_AUTHOR_DATE,
+      GIT_COMMITTER_DATE: process.env.GIT_COMMITTER_DATE,
+    },
+    stdio: ['pipe', 'pipe', 'pipe'],
+  }).trim();
+}
+
+function commitAt(date: string, paths: string[], content?: string) {
+  for (const p of paths) {
+    const abs = path.join(tempDir, p);
+    fs.mkdirSync(path.dirname(abs), { recursive: true });
+    fs.writeFileSync(abs, content ?? `data for ${p} at ${date}\n`);
+  }
+  git('add', ...paths);
+  // Pin both author and committer dates so timestamps are deterministic.
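+  // (Git timestamp formats like `%ct` read the *committer* date, so pinning
+  // GIT_AUTHOR_DATE alone would likely leave the mined values flaky.)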
+  process.env.GIT_AUTHOR_DATE = date;
+  process.env.GIT_COMMITTER_DATE = date;
+  git('commit', '-m', `commit at ${date}`);
+  delete process.env.GIT_AUTHOR_DATE;
+  delete process.env.GIT_COMMITTER_DATE;
+}
+
+beforeEach(() => {
+  tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-churn-'));
+  if (HAS_GIT) {
+    git('init', '-q', '-b', 'main');
+    git('config', 'commit.gpgsign', 'false');
+  }
+});
+
+afterEach(() => {
+  delete process.env.GIT_AUTHOR_DATE;
+  delete process.env.GIT_COMMITTER_DATE;
+  fs.rmSync(tempDir, { recursive: true, force: true });
+});
+
+describe.skipIf(!HAS_GIT)('mineChurn', () => {
+  it('returns empty + null head when not in a git repo', () => {
+    const nonGit = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-nogit-'));
+    try {
+      const r = mineChurn(nonGit, new Set(['foo.ts']), null);
+      expect(r.currentHead).toBeNull();
+      expect(r.deltas.size).toBe(0);
+      expect(r.needsFullRescan).toBe(false);
+    } finally {
+      fs.rmSync(nonGit, { recursive: true, force: true });
+    }
+  });
+
+  it('counts commits per indexed file, ignores files not in index', () => {
+    commitAt('2025-01-01T00:00:00', ['a.ts', 'b.ts']);
+    commitAt('2025-01-02T00:00:00', ['a.ts']);
+    commitAt('2025-01-03T00:00:00', ['a.ts', 'b.ts', 'c.ts']);
+
+    const r = mineChurn(tempDir, new Set(['a.ts', 'b.ts']), null);
+    expect(r.deltas.get('a.ts')?.commitCountDelta).toBe(3);
+    expect(r.deltas.get('b.ts')?.commitCountDelta).toBe(2);
+    expect(r.deltas.has('c.ts')).toBe(false);
+  });
+
+  it('records first-seen / last-touched as min/max of commit timestamps', () => {
+    commitAt('2025-01-01T00:00:00Z', ['a.ts']);
+    commitAt('2025-06-01T00:00:00Z', ['a.ts']);
+    commitAt('2025-12-01T00:00:00Z', ['a.ts']);
+
+    const r = mineChurn(tempDir, new Set(['a.ts']), null);
+    const d = r.deltas.get('a.ts')!;
+    // 2025-01-01 UTC = 1735689600
+    expect(d.firstSeenTs).toBe(1735689600);
+    // 2025-12-01 UTC = 1764547200
+    expect(d.lastTouchedTs).toBe(1764547200);
+  });
+
+  it('skips commits touching more than MAX_FILES_PER_COMMIT files', () => {
+    const bigBatch: string[] = [];
+    for (let i = 0; i < MAX_FILES_PER_COMMIT + 1; i++) bigBatch.push(`f${i}.ts`);
+    commitAt('2025-01-01T00:00:00Z', bigBatch);
+    // Then a normal commit on one of the same files.
+    commitAt('2025-02-01T00:00:00Z', ['f0.ts']);
+
+    const r = mineChurn(tempDir, new Set(bigBatch), null);
+    // First commit was skipped; only the second one should count.
+    expect(r.deltas.get('f0.ts')?.commitCountDelta).toBe(1);
+    // Files only seen in the skipped commit produce no delta at all.
+    expect(r.deltas.has('f50.ts')).toBe(false);
+  });
+
+  it('incremental mining returns only commits since the given sha', () => {
+    commitAt('2025-01-01T00:00:00Z', ['a.ts']);
+    const sha1 = getGitHead(tempDir)!;
+    commitAt('2025-01-02T00:00:00Z', ['a.ts']);
+    commitAt('2025-01-03T00:00:00Z', ['a.ts']);
+
+    const incr = mineChurn(tempDir, new Set(['a.ts']), sha1);
+    // Only the two commits *after* sha1 should be counted.
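+    // (Presumably a `sha1..HEAD` range under the hood — git's range
+    // semantics exclude the boundary commit itself.)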
+    expect(incr.deltas.get('a.ts')?.commitCountDelta).toBe(2);
+    expect(incr.needsFullRescan).toBe(false);
+  });
+
+  it('returns needsFullRescan=true when sinceSha is unreachable', () => {
+    commitAt('2025-01-01T00:00:00Z', ['a.ts']);
+    const fakeSha = '0'.repeat(40);
+    const r = mineChurn(tempDir, new Set(['a.ts']), fakeSha);
+    expect(r.needsFullRescan).toBe(true);
+    expect(r.deltas.size).toBe(0);
+    expect(r.currentHead).not.toBeNull();
+  });
+
+  it('returns empty deltas when sinceSha equals current head (no-op)', () => {
+    commitAt('2025-01-01T00:00:00Z', ['a.ts']);
+    const head = getGitHead(tempDir)!;
+    const r = mineChurn(tempDir, new Set(['a.ts']), head);
+    expect(r.currentHead).toBe(head);
+    expect(r.deltas.size).toBe(0);
+    expect(r.needsFullRescan).toBe(false);
+  });
+
+  it('handles paths with spaces and unicode safely (NUL-delimited)', () => {
+    commitAt('2025-01-01T00:00:00Z', ['name with space.ts']);
+    commitAt('2025-01-02T00:00:00Z', ['ünïcødë.ts']);
+
+    const r = mineChurn(
+      tempDir,
+      new Set(['name with space.ts', 'ünïcødë.ts']),
+      null
+    );
+    expect(r.deltas.get('name with space.ts')?.commitCountDelta).toBe(1);
+    expect(r.deltas.get('ünïcødë.ts')?.commitCountDelta).toBe(1);
+  });
+
+  it('LAST_MINED_CHURN_HEAD_KEY is stable (used as project_metadata key)', () => {
+    expect(LAST_MINED_CHURN_HEAD_KEY).toBe('last_mined_churn_head');
+  });
+});
+
+describe('readFileLoc', () => {
+  it('returns 0 for an empty file', () => {
+    const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-loc-'));
+    try {
+      const f = path.join(dir, 'empty.txt');
+      fs.writeFileSync(f, '');
+      expect(readFileLoc(dir, 'empty.txt')).toBe(0);
+    } finally {
+      fs.rmSync(dir, { recursive: true, force: true });
+    }
+  });
+
+  it('counts newline-terminated lines', () => {
+    const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-loc-'));
+    try {
+      fs.writeFileSync(path.join(dir, 'x.txt'), 'a\nb\nc\n');
+      expect(readFileLoc(dir, 'x.txt')).toBe(3);
+    } finally {
+      fs.rmSync(dir, { recursive: true, force: true });
+    }
+  });
+
+  it('counts a final no-newline chunk as one extra line', () => {
+    const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-loc-'));
+    try {
+      fs.writeFileSync(path.join(dir, 'x.txt'), 'a\nb\nc');
+      expect(readFileLoc(dir, 'x.txt')).toBe(3);
+    } finally {
+      fs.rmSync(dir, { recursive: true, force: true });
+    }
+  });
+
+  it('returns 0 for a missing file (does not throw)', () => {
+    const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-loc-'));
+    try {
+      expect(readFileLoc(dir, 'no-such-file.txt')).toBe(0);
+    } finally {
+      fs.rmSync(dir, { recursive: true, force: true });
+    }
+  });
+});
diff --git a/__tests__/cochange.test.ts b/__tests__/cochange.test.ts
new file mode 100644
index 00000000..4a3918aa
--- /dev/null
+++ b/__tests__/cochange.test.ts
@@ -0,0 +1,481 @@
+/**
+ * Co-Change Graph Tests
+ *
+ * Verifies the file-level co-change miner:
+ * - parses git log output correctly
+ * - filters out merge / large refactor commits via MAX_FILES_PER_COMMIT
+ * - drops files outside the indexed set
+ * - persists per-file commit_count and per-pair count
+ * - computes Jaccard correctly at query time
+ * - updates incrementally on subsequent runs
+ * - detects unreachable previous-head and re-mines from scratch
+ * - migration v4 creates the table + column
+ */
+
+import { describe, it, expect, beforeEach, afterEach } from 'vitest';
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+import { execFileSync } from 'child_process';
+import {
+  mineCoChanges,
+  MAX_FILES_PER_COMMIT,
+  MIN_COCHANGE_COUNT,
+  LAST_MINED_HEAD_KEY,
+  getGitHead,
+} from '../src/cochange';
+import { DatabaseConnection } from '../src/db';
+import { QueryBuilder } from '../src/db/queries';
+import { runMigrations, getCurrentVersion } from '../src/db/migrations';
+import CodeGraph from '../src/index';
+import { loadConfig } from '../src/config';
+
+function tempGitRepo(prefix: string): string {
+  const dir = fs.mkdtempSync(path.join(os.tmpdir(), prefix));
+  execFileSync('git', ['init'], { cwd: dir, stdio: 'pipe' });
+  execFileSync('git', ['config', 'user.email', 'test@test.com'], { cwd: dir, stdio: 'pipe' });
+  execFileSync('git', ['config', 'user.name', 'Test'], { cwd: dir, stdio: 'pipe' });
+  // Pin the initial branch name so subsequent operations are deterministic
+  // across systems with different `init.defaultBranch` settings.
+  execFileSync('git', ['symbolic-ref', 'HEAD', 'refs/heads/main'], { cwd: dir, stdio: 'pipe' });
+  return dir;
+}
+
+function commit(dir: string, message: string, files: Record<string, string>) {
+  for (const [rel, content] of Object.entries(files)) {
+    const full = path.join(dir, rel);
+    fs.mkdirSync(path.dirname(full), { recursive: true });
+    fs.writeFileSync(full, content);
+  }
+  execFileSync('git', ['add', '-A'], { cwd: dir, stdio: 'pipe' });
+  execFileSync('git', ['commit', '-m', message], { cwd: dir, stdio: 'pipe' });
+}
+
+function rm(dir: string, ...rels: string[]) {
+  for (const rel of rels) {
+    fs.unlinkSync(path.join(dir, rel));
+  }
+}
+
+describe('mineCoChanges (unit)', () => {
+  let dir: string;
+
+  afterEach(() => {
+    if (dir && fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true });
+  });
+
+  it('returns empty result for non-git directories', () => {
+    dir = fs.mkdtempSync(path.join(os.tmpdir(), 'cochange-nongit-'));
+    const result = mineCoChanges(dir, new Set(['a.ts']), null);
+    expect(result.currentHead).toBeNull();
+    expect(result.pairs.size).toBe(0);
+    expect(result.fileCommits.size).toBe(0);
+  });
+
+  it('counts pairs and per-file commits across multiple commits', () => {
+    dir = tempGitRepo('cochange-basic-');
+    commit(dir, 'c1', { 'a.ts': '1', 'b.ts': '1' });
+    commit(dir, 'c2', { 'a.ts': '2', 'b.ts': '2' });
+    commit(dir, 'c3', { 'a.ts': '3' });
+
+    const result = mineCoChanges(dir, new Set(['a.ts', 'b.ts']), null);
+    expect(result.currentHead).not.toBeNull();
+    expect(result.fileCommits.get('a.ts')).toBe(3);
+    expect(result.fileCommits.get('b.ts')).toBe(2);
+    expect(result.pairs.get('a.ts\0b.ts')).toBe(2);
+  });
+
+  it('drops files outside the indexed set', () => {
+    dir = tempGitRepo('cochange-filter-');
+    commit(dir, 'c1', { 'a.ts': '1', 'README.md': 'doc', 'b.ts': '1' });
+    commit(dir, 'c2', { 'a.ts': '2', 'b.ts': '2' });
+
+    // README.md is not indexed; the pair (a, b) still counts but no
+    // (a, README) or (b, README) pair is created.
+    const result = mineCoChanges(dir, new Set(['a.ts', 'b.ts']), null);
+    expect(result.fileCommits.has('README.md')).toBe(false);
+    expect(result.pairs.get('a.ts\0b.ts')).toBe(2);
+    expect([...result.pairs.keys()].length).toBe(1);
+  });
+
+  it('skips commits that touch more than MAX_FILES_PER_COMMIT indexed files', () => {
+    dir = tempGitRepo('cochange-mass-');
+    // First commit: massive refactor across many files (would otherwise
+    // produce O(N²) spurious pairs).
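+    // (A commit touching k indexed files contributes k·(k-1)/2 pairs —
+    // e.g. 50 files → 1,225 pairs from a single refactor commit.)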
+    const massFiles: Record<string, string> = {};
+    const indexed = new Set<string>();
+    for (let i = 0; i < MAX_FILES_PER_COMMIT + 5; i++) {
+      const f = `src/m${i}.ts`;
+      massFiles[f] = String(i);
+      indexed.add(f);
+    }
+    commit(dir, 'mass', massFiles);
+    // Second commit: small, two files — should produce one pair.
+    commit(dir, 'small', { 'src/m0.ts': 'A', 'src/m1.ts': 'B' });
+
+    const result = mineCoChanges(dir, indexed, null);
+    expect(result.pairs.get('src/m0.ts\0src/m1.ts')).toBe(1);
+    // The mass-refactor commit contributes nothing.
+    expect([...result.pairs.values()].every((c) => c <= 1)).toBe(true);
+  });
+
+  it('mines incrementally — only commits in sinceSha..HEAD', () => {
+    dir = tempGitRepo('cochange-incr-');
+    commit(dir, 'c1', { 'a.ts': '1', 'b.ts': '1' });
+    const anchor = getGitHead(dir)!;
+    commit(dir, 'c2', { 'a.ts': '2', 'b.ts': '2' });
+    commit(dir, 'c3', { 'a.ts': '3', 'b.ts': '3' });
+
+    const result = mineCoChanges(dir, new Set(['a.ts', 'b.ts']), anchor);
+    // c2 + c3 only — anchor commit is excluded by the .. range
+    expect(result.fileCommits.get('a.ts')).toBe(2);
+    expect(result.pairs.get('a.ts\0b.ts')).toBe(2);
+  });
+
+  it('returns no-op delta when current HEAD == sinceSha', () => {
+    dir = tempGitRepo('cochange-noop-');
+    commit(dir, 'c1', { 'a.ts': '1' });
+    const head = getGitHead(dir)!;
+
+    const result = mineCoChanges(dir, new Set(['a.ts']), head);
+    expect(result.currentHead).toBe(head);
+    expect(result.pairs.size).toBe(0);
+    expect(result.fileCommits.size).toBe(0);
+    expect(result.needsFullRescan).toBe(false);
+  });
+
+  it('signals needsFullRescan when sinceSha is unreachable', () => {
+    dir = tempGitRepo('cochange-orphan-');
+    commit(dir, 'c1', { 'a.ts': '1' });
+    const result = mineCoChanges(
+      dir,
+      new Set(['a.ts']),
+      '0000000000000000000000000000000000000000'
+    );
+    expect(result.needsFullRescan).toBe(true);
+  });
+
+  it('correctly handles paths with spaces and unicode', () => {
+    dir = tempGitRepo('cochange-special-');
+    commit(dir, 'c1', { 'with space.ts': '1', 'café.ts': '1' });
+    commit(dir, 'c2', { 'with space.ts': '2', 'café.ts': '2' });
+
+    const result = mineCoChanges(
+      dir,
+      new Set(['with space.ts', 'café.ts']),
+      null
+    );
+    // Either ordering (canonical sort) is fine
+    const total = [...result.pairs.values()].reduce((a, b) => a + b, 0);
+    expect(total).toBe(2);
+  });
+
+  it('does not misidentify a file literally named "--" as a sentinel', () => {
+    // Earlier the parser used `--` as the per-commit header; a real file
+    // by that name would corrupt block boundaries. Sentinel is now NUL-
+    // bracketed so it cannot collide with any POSIX-legal filename.
+    dir = tempGitRepo('cochange-dashdash-');
+    commit(dir, 'c1', { '--': 'literal dash file', 'b.ts': '1' });
+    commit(dir, 'c2', { '--': 'changed', 'b.ts': '2' });
+
+    const result = mineCoChanges(dir, new Set(['--', 'b.ts']), null);
+    // We expect both files to be counted in both commits and one pair.
+    expect(result.fileCommits.get('--')).toBe(2);
+    expect(result.fileCommits.get('b.ts')).toBe(2);
+    expect(result.pairs.get('--\0b.ts')).toBe(2);
+  });
+});
+
+describe('QueryBuilder co-change CRUD', () => {
+  let dir: string;
+  let db: DatabaseConnection;
+  let q: QueryBuilder;
+
+  beforeEach(() => {
+    dir = fs.mkdtempSync(path.join(os.tmpdir(), 'cochange-db-'));
+    db = DatabaseConnection.initialize(path.join(dir, 'test.db'));
+    q = new QueryBuilder(db.getDb());
+    // Insert a few file rows so commit_count updates and FK semantics work.
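+    // (Assumes co_changes pairs key on files.path, so these seeded rows give
+    // the per-file commit_count updates something to attach to.)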
+    const upsert = db.getDb().prepare(`
+      INSERT INTO files (path, content_hash, language, size, modified_at, indexed_at)
+      VALUES (?, '', 'typescript', 0, 0, 0)
+    `);
+    upsert.run('a.ts');
+    upsert.run('b.ts');
+    upsert.run('c.ts');
+  });
+
+  afterEach(() => {
+    db.close();
+    if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true });
+  });
+
+  it('canonicalises pair ordering on upsert', () => {
+    q.applyCoChangeDeltas([['b.ts', 'a.ts', 3]], []);
+    const row = db.getDb().prepare('SELECT * FROM co_changes').get() as any;
+    expect(row.file_a).toBe('a.ts');
+    expect(row.file_b).toBe('b.ts');
+    expect(row.count).toBe(3);
+  });
+
+  it('accumulates counts on repeated apply', () => {
+    q.applyCoChangeDeltas([['a.ts', 'b.ts', 2]], []);
+    q.applyCoChangeDeltas([['a.ts', 'b.ts', 3]], []);
+    const row = db.getDb().prepare('SELECT count FROM co_changes').get() as any;
+    expect(row.count).toBe(5);
+  });
+
+  it('increments per-file commit_count', () => {
+    q.applyCoChangeDeltas([], [['a.ts', 4]]);
+    q.applyCoChangeDeltas([], [['a.ts', 1]]);
+    const row = db.getDb().prepare('SELECT commit_count FROM files WHERE path = ?').get('a.ts') as any;
+    expect(row.commit_count).toBe(5);
+  });
+
+  it('skips no-op self-pairs', () => {
+    q.applyCoChangeDeltas([['a.ts', 'a.ts', 5]], []);
+    const cnt = db.getDb().prepare('SELECT COUNT(*) AS n FROM co_changes').get() as any;
+    expect(cnt.n).toBe(0);
+  });
+
+  it('clearCoChanges wipes pairs and zeroes per-file counts', () => {
+    q.applyCoChangeDeltas([['a.ts', 'b.ts', 3]], [['a.ts', 5]]);
+    q.clearCoChanges();
+    const cnt = db.getDb().prepare('SELECT COUNT(*) AS n FROM co_changes').get() as any;
+    expect(cnt.n).toBe(0);
+    const row = db.getDb().prepare('SELECT commit_count FROM files WHERE path = ?').get('a.ts') as any;
+    expect(row.commit_count).toBe(0);
+  });
+});
+
+describe('getCoChangedFiles (Jaccard ranking)', () => {
+  let dir: string;
+  let db: DatabaseConnection;
+  let q: QueryBuilder;
+
+  beforeEach(() => {
+    dir = fs.mkdtempSync(path.join(os.tmpdir(), 'cochange-rank-'));
+    db = DatabaseConnection.initialize(path.join(dir, 'test.db'));
+    q = new QueryBuilder(db.getDb());
+    const insertFile = db.getDb().prepare(`
+      INSERT INTO files (path, content_hash, language, size, modified_at, indexed_at, commit_count)
+      VALUES (?, '', 'typescript', 0, 0, 0, ?)
+    `);
+    // anchor.ts changed in 10 commits.
+    insertFile.run('anchor.ts', 10);
+    // tight.ts changed in 4 commits, all of which were with anchor.ts.
+    insertFile.run('tight.ts', 4);
+    // loose.ts changed in 100 commits, only 4 with anchor.ts → low Jaccard.
+    insertFile.run('loose.ts', 100);
+    // weak.ts changed in 5 commits, only 1 with anchor.ts → drops below minCount.
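+    // Expected values under jaccard = count / (commits(a) + commits(b) - count):
+    //   tight: 4 / (10 + 4 - 4)   = 0.4
+    //   loose: 4 / (10 + 100 - 4) ≈ 0.038
+    //   weak:  1 / (10 + 5 - 1)   ≈ 0.071 (dropped once minCount ≥ 2 is passed)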
+    insertFile.run('weak.ts', 5);
+
+    q.applyCoChangeDeltas(
+      [
+        ['anchor.ts', 'tight.ts', 4],
+        ['anchor.ts', 'loose.ts', 4],
+        ['anchor.ts', 'weak.ts', 1],
+      ],
+      []
+    );
+  });
+
+  afterEach(() => {
+    db.close();
+    if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true });
+  });
+
+  it('ranks tight coupling above loose coupling via Jaccard', () => {
+    const results = q.getCoChangedFiles('anchor.ts');
+    expect(results[0].path).toBe('tight.ts');
+    expect(results[0].jaccard).toBeCloseTo(4 / (10 + 4 - 4), 2);
+    const loose = results.find((r) => r.path === 'loose.ts')!;
+    expect(loose.jaccard).toBeLessThan(results[0].jaccard);
+  });
+
+  it('drops pairs below minCount', () => {
+    const results = q.getCoChangedFiles('anchor.ts', { minCount: 2 });
+    expect(results.find((r) => r.path === 'weak.ts')).toBeUndefined();
+  });
+
+  it('drops pairs below minJaccard (filter is applied in SQL, before LIMIT)', () => {
+    const results = q.getCoChangedFiles('anchor.ts', { minJaccard: 0.5 });
+    // tight.ts has jaccard 0.4 — also dropped at this threshold.
+    expect(results.length).toBe(0);
+  });
+
+  it('does not silently drop high-jaccard pairs ranked beyond an internal over-fetch', () => {
+    // Insert many low-jaccard partners to push tight.ts past any in-memory
+    // truncation that could happen if minJaccard were applied JS-side after
+    // a small SQL LIMIT. With the SQL-side filter, a `limit: 1` request
+    // with high minJaccard must still return tight.ts.
+    const insertFile = db.getDb().prepare(`
+      INSERT INTO files (path, content_hash, language, size, modified_at, indexed_at, commit_count)
+      VALUES (?, '', 'typescript', 0, 0, 0, ?)
+    `);
+    const deltas: Array<[string, string, number]> = [];
+    for (let i = 0; i < 100; i++) {
+      const p = `noise${i}.ts`;
+      insertFile.run(p, 1000); // huge commit_count → near-zero jaccard
+      deltas.push(['anchor.ts', p, 4]);
+    }
+    q.applyCoChangeDeltas(deltas, []);
+    const results = q.getCoChangedFiles('anchor.ts', { limit: 1, minJaccard: 0.3 });
+    expect(results).toHaveLength(1);
+    expect(results[0].path).toBe('tight.ts');
+  });
+
+  it('returns symmetric results when queried from either side', () => {
+    const fromAnchor = q.getCoChangedFiles('anchor.ts').find((r) => r.path === 'tight.ts')!;
+    const fromTight = q.getCoChangedFiles('tight.ts').find((r) => r.path === 'anchor.ts')!;
+    expect(fromAnchor.count).toBe(fromTight.count);
+    expect(fromAnchor.jaccard).toBeCloseTo(fromTight.jaccard, 4);
+  });
+
+  it('respects the limit', () => {
+    const results = q.getCoChangedFiles('anchor.ts', { limit: 1 });
+    expect(results).toHaveLength(1);
+  });
+});
+
+describe('CodeGraph end-to-end (mining wired into indexAll/sync)', () => {
+  let dir: string;
+  let cg: CodeGraph;
+
+  beforeEach(async () => {
+    dir = tempGitRepo('cochange-e2e-');
+    fs.writeFileSync(path.join(dir, 'a.ts'), 'export const a = 1;');
+    fs.writeFileSync(path.join(dir, 'b.ts'), 'export const b = 1;');
+    execFileSync('git', ['add', '-A'], { cwd: dir, stdio: 'pipe' });
+    execFileSync('git', ['commit', '-m', 'initial'], { cwd: dir, stdio: 'pipe' });
+    // A second co-change of the same pair so we cross MIN_COCHANGE_COUNT (2).
+    fs.writeFileSync(path.join(dir, 'a.ts'), 'export const a = 2;');
+    fs.writeFileSync(path.join(dir, 'b.ts'), 'export const b = 2;');
+    execFileSync('git', ['add', '-A'], { cwd: dir, stdio: 'pipe' });
+    execFileSync('git', ['commit', '-m', 'second'], { cwd: dir, stdio: 'pipe' });
+
+    cg = CodeGraph.initSync(dir, { config: { include: ['**/*.ts'], exclude: [] } });
+    await cg.indexAll();
+  });
+
+  afterEach(() => {
+    if (cg) cg.destroy();
+    if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true });
+  });
+
+  it('populates co_changes after indexAll on a git repo', () => {
+    const partners = cg.getCoChangedFiles('a.ts');
+    expect(partners.length).toBeGreaterThanOrEqual(1);
+    const b = partners.find((p) => p.path === 'b.ts');
+    expect(b).toBeDefined();
+    expect(b!.count).toBeGreaterThanOrEqual(MIN_COCHANGE_COUNT);
+  });
+
+  it('stores the last mined HEAD in project_metadata', () => {
+    // Internal-state assertion to confirm incremental sync has an anchor.
+    // `queries` is private; cast to access it from the test.
+    const head = (cg as unknown as { queries: QueryBuilder }).queries.getMetadata(LAST_MINED_HEAD_KEY);
+    expect(head).toMatch(/^[0-9a-f]{40}$/);
+  });
+
+  it('updates incrementally on sync', async () => {
+    const before = cg.getCoChangedFiles('a.ts').find((p) => p.path === 'b.ts')!.count;
+    fs.writeFileSync(path.join(dir, 'a.ts'), 'export const a = 3;');
+    fs.writeFileSync(path.join(dir, 'b.ts'), 'export const b = 3;');
+    execFileSync('git', ['add', '-A'], { cwd: dir, stdio: 'pipe' });
+    execFileSync('git', ['commit', '-m', 'third'], { cwd: dir, stdio: 'pipe' });
+
+    await cg.sync();
+    const after = cg.getCoChangedFiles('a.ts').find((p) => p.path === 'b.ts')!.count;
+    expect(after).toBe(before + 1);
+  });
+
+  it('respects enableCoChange: false (no mining, empty results)', async () => {
+    const dir2 = tempGitRepo('cochange-disabled-');
+    fs.writeFileSync(path.join(dir2, 'a.ts'), '1');
+    fs.writeFileSync(path.join(dir2, 'b.ts'), '1');
+    execFileSync('git', ['add', '-A'], { cwd: dir2, stdio: 'pipe' });
+    execFileSync('git', ['commit', '-m', 'c1'], { cwd: dir2, stdio: 'pipe' });
+
+    const cg2 = CodeGraph.initSync(dir2, {
+      config: { include: ['**/*.ts'], exclude: [], enableCoChange: false },
+    });
+    await cg2.indexAll();
+    expect(cg2.getCoChangedFiles('a.ts')).toHaveLength(0);
+    cg2.destroy();
+    fs.rmSync(dir2, { recursive: true, force: true });
+  });
+
+  it('persists enableCoChange across config save/load round-trip', () => {
+    // Regression: mergeConfig used to enumerate fields by hand and
+    // silently dropped enableCoChange, so the opt-out flag could never
+    // survive a reload from disk.
+    const dir2 = fs.mkdtempSync(path.join(os.tmpdir(), 'cochange-cfgrt-'));
+    const cg2 = CodeGraph.initSync(dir2, {
+      config: { enableCoChange: false },
+    });
+    cg2.close();
+    const reloaded = loadConfig(dir2);
+    expect(reloaded.enableCoChange).toBe(false);
+    fs.rmSync(dir2, { recursive: true, force: true });
+  });
+});
+
+describe('Migration v4: add commit_count column + co_changes table', () => {
+  let dir: string;
+
+  beforeEach(() => {
+    dir = fs.mkdtempSync(path.join(os.tmpdir(), 'cochange-migr-'));
+  });
+
+  afterEach(() => {
+    if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true });
+  });
+
+  it('adds commit_count to files and creates co_changes', () => {
+    // Build a v3-shape DB by hand.
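+    // (v3 = the pre-co-change schema: schema_versions pinned at 3, a files
+    // table without commit_count, and no co_changes table yet.)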
+    const Database = require('better-sqlite3');
+    const dbHandle = new Database(path.join(dir, 'test.db'));
+    dbHandle.exec(`
+      CREATE TABLE schema_versions (version INTEGER PRIMARY KEY, applied_at INTEGER NOT NULL, description TEXT);
+      INSERT INTO schema_versions (version, applied_at, description) VALUES (3, 0, 'v3');
+      CREATE TABLE files (
+        path TEXT PRIMARY KEY, content_hash TEXT NOT NULL, language TEXT NOT NULL,
+        size INTEGER NOT NULL, modified_at INTEGER NOT NULL, indexed_at INTEGER NOT NULL,
+        node_count INTEGER DEFAULT 0, errors TEXT
+      );
+      INSERT INTO files (path, content_hash, language, size, modified_at, indexed_at)
+      VALUES ('x.ts', '', 'typescript', 0, 0, 0);
+    `);
+    expect(getCurrentVersion(dbHandle)).toBe(3);
+
+    runMigrations(dbHandle, 3);
+    expect(getCurrentVersion(dbHandle)).toBeGreaterThanOrEqual(10);
+
+    const cols = dbHandle.prepare('PRAGMA table_info(files)').all() as Array<{ name: string }>;
+    expect(cols.some((c) => c.name === 'commit_count')).toBe(true);
+    const tableExists = dbHandle
+      .prepare("SELECT name FROM sqlite_master WHERE type='table' AND name='co_changes'")
+      .get();
+    expect(tableExists).toBeDefined();
+    dbHandle.close();
+  });
+
+  it('migration is idempotent on partial-DDL re-run', () => {
+    const Database = require('better-sqlite3');
+    const dbHandle = new Database(path.join(dir, 'test.db'));
+    dbHandle.exec(`
+      CREATE TABLE schema_versions (version INTEGER PRIMARY KEY, applied_at INTEGER NOT NULL, description TEXT);
+      INSERT INTO schema_versions (version, applied_at, description) VALUES (3, 0, 'v3');
+      CREATE TABLE files (
+        path TEXT PRIMARY KEY, content_hash TEXT NOT NULL, language TEXT NOT NULL,
+        size INTEGER NOT NULL, modified_at INTEGER NOT NULL, indexed_at INTEGER NOT NULL,
+        node_count INTEGER DEFAULT 0, errors TEXT,
+        commit_count INTEGER NOT NULL DEFAULT 0 -- partial pre-existing state
+      );
+    `);
+    expect(() => runMigrations(dbHandle, 3)).not.toThrow();
+    expect(getCurrentVersion(dbHandle)).toBeGreaterThanOrEqual(10);
+    dbHandle.close();
+  });
+});
diff --git a/__tests__/codegraphignore.test.ts b/__tests__/codegraphignore.test.ts
new file mode 100644
index 00000000..4d7e58c5
--- /dev/null
+++ b/__tests__/codegraphignore.test.ts
@@ -0,0 +1,168 @@
+/**
+ * .codegraphignore Tests
+ *
+ * Regression test for the bug where the .codegraphignore marker file was
+ * honored by the filesystem-walk fallback (`scanDirectoryWalk`) but
+ * silently ignored by the git fast path (`getGitVisibleFiles` and
+ * `getGitChangedFiles`). Same project gave different file sets depending
+ * on whether `.git` existed.
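+ *
+ * The marker is just an empty `.codegraphignore` file: dropping one into a
+ * directory prunes that whole subtree from indexing, and both scan paths
+ * must agree on that.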
+ */ + +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; +import { execFileSync } from 'child_process'; +import { scanDirectory } from '../src/extraction'; +import { DEFAULT_CONFIG, CodeGraphConfig } from '../src/types'; +import CodeGraph from '../src/index'; + +function tempDir(prefix: string): string { + return fs.mkdtempSync(path.join(os.tmpdir(), prefix)); +} + +function git(cwd: string, ...args: string[]) { + execFileSync('git', args, { cwd, stdio: 'pipe' }); +} + +const config: CodeGraphConfig = { + ...DEFAULT_CONFIG, + include: ['**/*.ts'], + exclude: [], +}; + +describe('.codegraphignore marker (bug #3)', () => { + describe('git fast path', () => { + let dir: string; + + beforeEach(() => { + dir = tempDir('codegraph-ignore-git-'); + git(dir, 'init'); + git(dir, 'config', 'user.email', 'test@test.com'); + git(dir, 'config', 'user.name', 'Test'); + // Pin branch name for determinism across git defaults + git(dir, 'symbolic-ref', 'HEAD', 'refs/heads/main'); + + fs.mkdirSync(path.join(dir, 'src')); + fs.mkdirSync(path.join(dir, 'vendor')); + fs.mkdirSync(path.join(dir, 'vendor', 'lib')); + fs.writeFileSync(path.join(dir, 'src', 'app.ts'), 'export const a = 1;'); + fs.writeFileSync(path.join(dir, 'vendor', 'pkg.ts'), 'export const v = 1;'); + fs.writeFileSync(path.join(dir, 'vendor', 'lib', 'sub.ts'), 'export const s = 1;'); + // Mark vendor/ as ignored + fs.writeFileSync(path.join(dir, 'vendor', '.codegraphignore'), ''); + + git(dir, 'add', '-A'); + git(dir, 'commit', '-m', 'initial'); + }); + + afterEach(() => { + if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true }); + }); + + it('scanDirectory honors .codegraphignore on the git fast path', () => { + const files = scanDirectory(dir, config); + expect(files).toContain('src/app.ts'); + expect(files).not.toContain('vendor/pkg.ts'); + expect(files).not.toContain('vendor/lib/sub.ts'); + }); + + it('marker at project root excludes everything', () => { + fs.writeFileSync(path.join(dir, '.codegraphignore'), ''); + // Need to add it to git so ls-files sees it (or rely on -o) + git(dir, 'add', '-A'); + git(dir, 'commit', '-m', 'add root marker'); + const files = scanDirectory(dir, config); + expect(files).toEqual([]); + }); + + it('marker in nested subdir does not affect siblings', () => { + // Add another sibling subdir without a marker + fs.mkdirSync(path.join(dir, 'libs')); + fs.writeFileSync(path.join(dir, 'libs', 'util.ts'), 'export const u = 1;'); + git(dir, 'add', '-A'); + git(dir, 'commit', '-m', 'add libs'); + + const files = scanDirectory(dir, config); + expect(files).toContain('src/app.ts'); + expect(files).toContain('libs/util.ts'); + expect(files).not.toContain('vendor/pkg.ts'); + }); + + it('respects marker added after initial commit (untracked marker)', () => { + // The marker file itself need not be committed — it can be a local + // override. Add marker AFTER commit, do not commit it. + fs.mkdirSync(path.join(dir, 'generated')); + fs.writeFileSync(path.join(dir, 'generated', 'gen.ts'), 'export const g = 1;'); + fs.writeFileSync(path.join(dir, 'generated', '.codegraphignore'), ''); + // The .ts file is untracked but visible via `git ls-files -o`. + // The marker is also untracked — we still detect it via fs check. 
+
+      const files = scanDirectory(dir, config);
+      expect(files).not.toContain('generated/gen.ts');
+    });
+  });
+
+  describe('parity with non-git fallback (filesystem walk)', () => {
+    let dir: string;
+
+    beforeEach(() => {
+      dir = tempDir('codegraph-ignore-walk-');
+      fs.mkdirSync(path.join(dir, 'src'));
+      fs.mkdirSync(path.join(dir, 'vendor'));
+      fs.writeFileSync(path.join(dir, 'src', 'app.ts'), 'export const a = 1;');
+      fs.writeFileSync(path.join(dir, 'vendor', 'pkg.ts'), 'export const v = 1;');
+      fs.writeFileSync(path.join(dir, 'vendor', '.codegraphignore'), '');
+    });
+
+    afterEach(() => {
+      if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true });
+    });
+
+    it('non-git project also honors the marker (sanity / pre-existing behavior)', () => {
+      const files = scanDirectory(dir, config);
+      expect(files).toContain('src/app.ts');
+      expect(files).not.toContain('vendor/pkg.ts');
+    });
+  });
+
+  describe('sync git path (getGitChangedFiles)', () => {
+    let dir: string;
+    let cg: CodeGraph;
+
+    beforeEach(async () => {
+      dir = tempDir('codegraph-ignore-sync-');
+      git(dir, 'init');
+      git(dir, 'config', 'user.email', 'test@test.com');
+      git(dir, 'config', 'user.name', 'Test');
+      git(dir, 'symbolic-ref', 'HEAD', 'refs/heads/main');
+
+      fs.mkdirSync(path.join(dir, 'src'));
+      fs.mkdirSync(path.join(dir, 'vendor'));
+      fs.writeFileSync(path.join(dir, 'src', 'app.ts'), 'export const a = 1;');
+      fs.writeFileSync(path.join(dir, 'vendor', '.codegraphignore'), '');
+
+      git(dir, 'add', '-A');
+      git(dir, 'commit', '-m', 'initial');
+
+      cg = CodeGraph.initSync(dir, { config: { include: ['**/*.ts'], exclude: [] } });
+      await cg.indexAll();
+    });
+
+    afterEach(() => {
+      if (cg) cg.destroy();
+      if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true });
+    });
+
+    it('sync ignores changes inside marker dirs', async () => {
+      // Add a new file under vendor/ — should NOT be picked up by sync.
+      fs.writeFileSync(path.join(dir, 'vendor', 'leaked.ts'), 'export const x = 1;');
+      // Also add a real change to confirm sync still runs.
+      fs.writeFileSync(path.join(dir, 'src', 'app.ts'), 'export const a = 2;');
+
+      const result = await cg.sync();
+      expect(result.changedFilePaths).toContain('src/app.ts');
+      expect(result.changedFilePaths ?? []).not.toContain('vendor/leaked.ts');
+    });
+  });
+});
diff --git a/__tests__/config-refs.test.ts b/__tests__/config-refs.test.ts
new file mode 100644
index 00000000..ab1a63e4
--- /dev/null
+++ b/__tests__/config-refs.test.ts
@@ -0,0 +1,288 @@
+/**
+ * Config-refs tests: parser unit tests + end-to-end through CodeGraph.
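+ *
+ * Parser coverage: process.env (TS/JS), os.getenv / os.environ (Python),
+ * os.Getenv / os.LookupEnv (Go), ENV[...] / ENV.fetch (Ruby),
+ * env! / std::env::var (Rust), and System.getenv (Java/Kotlin).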
+ */ +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import * as fs from 'fs'; +import * as os from 'os'; +import * as path from 'path'; +import { extractConfigRefs } from '../src/config-refs'; +import CodeGraph from '../src/index'; + +let testDir: string; +let cg: CodeGraph | null = null; + +function write(rel: string, content: string) { + const abs = path.join(testDir, rel); + fs.mkdirSync(path.dirname(abs), { recursive: true }); + fs.writeFileSync(abs, content); +} + +beforeEach(() => { + testDir = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-config-')); +}); + +afterEach(() => { + if (cg) { + cg.destroy(); + cg = null; + } + if (fs.existsSync(testDir)) fs.rmSync(testDir, { recursive: true, force: true }); +}); + +// ============================================================================ +// Pure parser tests (no CodeGraph) +// ============================================================================ + +describe('extractConfigRefs', () => { + it('extracts process.env.X from TS', () => { + write('a.ts', `const port = process.env.OBSIDIAN_PORT;\n`); + const refs = extractConfigRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null); + expect(refs.length).toBe(1); + expect(refs[0]!.configKey).toBe('OBSIDIAN_PORT'); + expect(refs[0]!.line).toBe(1); + }); + + it('extracts process.env["X"] from JS', () => { + write('a.js', `module.exports = { port: process.env["MY_KEY"] };\n`); + const refs = extractConfigRefs(testDir, [{ path: 'a.js', language: 'javascript' }], () => null); + expect(refs.map((r) => r.configKey)).toEqual(['MY_KEY']); + }); + + it('extracts os.getenv / os.environ from Python', () => { + write( + 'a.py', + [ + `import os`, + `port = os.getenv("PYTHON_PORT")`, + `host = os.environ.get("PYTHON_HOST")`, + `path = os.environ["PYTHON_PATH"]`, + `name = getenv("PYTHON_NAME")`, + ].join('\n') + ); + const refs = extractConfigRefs(testDir, [{ path: 'a.py', language: 'python' }], () => null); + expect(new Set(refs.map((r) => r.configKey))).toEqual( + new Set(['PYTHON_PORT', 'PYTHON_HOST', 'PYTHON_PATH', 'PYTHON_NAME']) + ); + }); + + it('extracts os.Getenv / os.LookupEnv from Go', () => { + write( + 'a.go', + [ + `package main`, + `import "os"`, + `var Port = os.Getenv("GO_PORT")`, + `var Host, _ = os.LookupEnv("GO_HOST")`, + ].join('\n') + ); + const refs = extractConfigRefs(testDir, [{ path: 'a.go', language: 'go' }], () => null); + expect(new Set(refs.map((r) => r.configKey))).toEqual(new Set(['GO_PORT', 'GO_HOST'])); + }); + + it('extracts ENV[...] 
+    write('a.rb', `port = ENV["RUBY_PORT"]\nhost = ENV.fetch("RUBY_HOST")\n`);
+    const refs = extractConfigRefs(testDir, [{ path: 'a.rb', language: 'ruby' }], () => null);
+    expect(new Set(refs.map((r) => r.configKey))).toEqual(new Set(['RUBY_PORT', 'RUBY_HOST']));
+  });
+
+  it('extracts env!/std::env::var from Rust', () => {
+    write(
+      'a.rs',
+      [
+        `let port = env!("RUST_PORT");`,
+        `let host = std::env::var("RUST_HOST").unwrap();`,
+      ].join('\n')
+    );
+    const refs = extractConfigRefs(testDir, [{ path: 'a.rs', language: 'rust' }], () => null);
+    expect(new Set(refs.map((r) => r.configKey))).toEqual(new Set(['RUST_PORT', 'RUST_HOST']));
+  });
+
+  it('extracts System.getenv from Java/Kotlin', () => {
+    write('A.java', `String port = System.getenv("JAVA_PORT");\n`);
+    const refs = extractConfigRefs(testDir, [{ path: 'A.java', language: 'java' }], () => null);
+    expect(refs.map((r) => r.configKey)).toEqual(['JAVA_PORT']);
+  });
+
+  it('only matches UPPER_CASE keys (skips lower-case identifiers)', () => {
+    write('a.ts', `const x = process.env.somethingDynamic;\nconst y = process.env.GOOD_KEY;\n`);
+    const refs = extractConfigRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null);
+    expect(refs.map((r) => r.configKey)).toEqual(['GOOD_KEY']);
+  });
+
+  it('skips files in unsupported languages without crashing', () => {
+    write('a.swift', `let port = ProcessInfo.processInfo.environment["SWIFT_PORT"]\n`);
+    const refs = extractConfigRefs(testDir, [{ path: 'a.swift', language: 'swift' }], () => null);
+    // Swift not in PATTERNS for v1.
+    expect(refs).toEqual([]);
+  });
+
+  it('captures the correct 1-indexed line number', () => {
+    write(
+      'a.ts',
+      [
+        `// line 1`,
+        `// line 2`,
+        `const x = process.env.LINE_THREE_KEY;`,
+        `// line 4`,
+        `const y = process.env.LINE_FIVE_KEY;`,
+      ].join('\n')
+    );
+    const refs = extractConfigRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null);
+    expect(refs).toEqual([
+      expect.objectContaining({ configKey: 'LINE_THREE_KEY', line: 3 }),
+      expect.objectContaining({ configKey: 'LINE_FIVE_KEY', line: 5 }),
+    ]);
+  });
+
+  it('threads the resolveEnclosing closure correctly', () => {
+    write('a.ts', `const x = process.env.FOO;\n`);
+    const calls: Array<[string, number]> = [];
+    extractConfigRefs(
+      testDir,
+      [{ path: 'a.ts', language: 'typescript' }],
+      (filePath, line) => {
+        calls.push([filePath, line]);
+        return 'fake-node-id';
+      }
+    );
+    expect(calls).toEqual([['a.ts', 1]]);
+  });
+
+  it('survives a missing file (skips, no throw)', () => {
+    const refs = extractConfigRefs(
+      testDir,
+      [{ path: 'does-not-exist.ts', language: 'typescript' }],
+      () => null
+    );
+    expect(refs).toEqual([]);
+  });
+});
+
+// ============================================================================
+// End-to-end through CodeGraph
+// ============================================================================
+
+describe('CodeGraph config refs', () => {
+  it('persists env reads after indexAll and resolves enclosing function', async () => {
+    write(
+      'src/server.ts',
+      [
+        `export function start() {`,
+        `  const port = process.env.OBSIDIAN_PORT ?? 8080;`,
+        `  return port;`,
+        `}`,
+        ``,
+        `export function getApiKey() {`,
+        `  return process.env.OBSIDIAN_API_KEY;`,
+        `}`,
+        ``,
+        `// top-level read`,
+        `export const HOST = process.env.OBSIDIAN_HOST;`,
+      ].join('\n')
+    );
+    cg = CodeGraph.initSync(testDir, {
+      config: { include: ['**/*.ts'], exclude: [] },
+    });
+    await cg.indexAll();
+
+    // All three keys should be visible.
+    const keys = cg.getConfigKeys({ configKind: 'env' });
+    expect(keys.map((k) => k.configKey).sort()).toEqual([
+      'OBSIDIAN_API_KEY',
+      'OBSIDIAN_HOST',
+      'OBSIDIAN_PORT',
+    ]);
+
+    // The OBSIDIAN_PORT read should be attributed to `start`.
+    const portSites = cg.getConfigRefsByKey('OBSIDIAN_PORT');
+    expect(portSites.length).toBe(1);
+    expect(portSites[0]!.sourceName).toBe('start');
+
+    // The HOST read is at the top level — sourceName should be null.
+    const hostSites = cg.getConfigRefsByKey('OBSIDIAN_HOST');
+    expect(hostSites[0]!.sourceName).toBeNull();
+  });
+
+  it('reverse view: getConfigKeysForNode returns keys read by a function', async () => {
+    write(
+      'src/a.ts',
+      [
+        `export function loadConfig() {`,
+        `  const a = process.env.KEY_A;`,
+        `  const b = process.env.KEY_B;`,
+        `  return { a, b };`,
+        `}`,
+      ].join('\n')
+    );
+    cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } });
+    await cg.indexAll();
+
+    const node = cg.getNodesInFile('src/a.ts').find((n) => n.name === 'loadConfig')!;
+    const keys = cg.getConfigKeysForNode(node.id).map((r) => r.configKey).sort();
+    expect(keys).toEqual(['KEY_A', 'KEY_B']);
+  });
+
+  it('respects enableConfigRefs=false', async () => {
+    write('src/a.ts', `export const PORT = process.env.PORT;\n`);
+    cg = CodeGraph.initSync(testDir, {
+      config: { include: ['**/*.ts'], exclude: [], enableConfigRefs: false },
+    });
+    await cg.indexAll();
+    expect(cg.getConfigKeys()).toEqual([]);
+  });
+
+  it('incremental sync replaces refs for changed files only', async () => {
+    write('src/a.ts', `export const A = process.env.OLD_KEY;\n`);
+    write('src/b.ts', `export const B = process.env.UNCHANGED_KEY;\n`);
+    cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } });
+    await cg.indexAll();
+    expect(cg.getConfigKeys().map((k) => k.configKey).sort()).toEqual([
+      'OLD_KEY',
+      'UNCHANGED_KEY',
+    ]);
+
+    // Edit only a.ts — UNCHANGED_KEY should still be there.
+    write('src/a.ts', `export const A = process.env.NEW_KEY;\n`);
+    await cg.sync();
+
+    const keys = cg.getConfigKeys().map((k) => k.configKey).sort();
+    expect(keys).toContain('NEW_KEY');
+    expect(keys).toContain('UNCHANGED_KEY');
+    expect(keys).not.toContain('OLD_KEY');
+  });
+
+  it('drops refs when a file is edited to remove its last env read', async () => {
+    // Regression for the empty-rows early-return data-corruption bug:
+    // applyConfigRefs([]) used to short-circuit without deleting the
+    // stale rows for the file. The sync path now explicitly invalidates
+    // rows for every changed file *before* extracting, regardless of
+    // whether the new content has any reads.
+    write('src/a.ts', `export const PORT = process.env.REMOVED_KEY;\n`);
+    cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } });
+    await cg.indexAll();
+    expect(cg.getConfigKeys().some((k) => k.configKey === 'REMOVED_KEY')).toBe(true);
+
+    // Edit a.ts to remove the env read entirely (no remaining reads).
+    write('src/a.ts', `export const PORT = 8080; // no env read here\n`);
+    await cg.sync();
+
+    expect(cg.getConfigKeys().some((k) => k.configKey === 'REMOVED_KEY')).toBe(false);
+  });
+
+  it('drops refs for files removed between syncs', async () => {
+    write('src/a.ts', `export const A = process.env.GOING_AWAY;\n`);
+    cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } });
+    await cg.indexAll();
+    expect(cg.getConfigKeys().some((k) => k.configKey === 'GOING_AWAY')).toBe(true);
+
+    fs.unlinkSync(path.join(testDir, 'src/a.ts'));
+    await cg.sync();
+
+    expect(cg.getConfigKeys().some((k) => k.configKey === 'GOING_AWAY')).toBe(false);
+  });
+
+  // (Removed: a defensive test for the v4-migration-collision bug class.
+  // With file-based migrations (NNN-name.ts), two PRs claiming the same
+  // version produces a filesystem-level conflict, so the silent skip the
+  // defensive guard protected against can no longer happen.)
+});
diff --git a/__tests__/context.test.ts b/__tests__/context.test.ts
index 52dae1fe..9a0614aa 100644
--- a/__tests__/context.test.ts
+++ b/__tests__/context.test.ts
@@ -210,6 +210,19 @@ export function validateEmail(email: string): boolean {
 
     expect(result.nodes.size).toBeLessThanOrEqual(5);
   });
+
+  it('should clamp absurd searchLimit/maxNodes values to safe upper bounds', async () => {
+    // Without clamping, the internal `findNodesByExactName` query would
+    // request `searchLimit * 5` rows — passing 1e9 here would blow out
+    // memory. The call should complete in normal time and not return more
+    // than the hard cap on maxNodes (1000).
+    const result = await cg.findRelevantContext('function', {
+      searchLimit: 1_000_000_000,
+      maxNodes: 1_000_000_000,
+      traversalDepth: 1_000,
+    });
+    expect(result.nodes.size).toBeLessThanOrEqual(1000);
+  });
 });
 
 describe('buildContext()', () => {
diff --git a/__tests__/db-perf.test.ts b/__tests__/db-perf.test.ts
new file mode 100644
index 00000000..256cf92c
--- /dev/null
+++ b/__tests__/db-perf.test.ts
@@ -0,0 +1,161 @@
+/**
+ * DB Performance / Correctness Tests
+ *
+ * Regression tests for three changes:
+ * 1. Batch `getNodesByIds` collapses graph-traversal N+1 reads.
+ * 2. `insertNode` invalidates the LRU cache so INSERT OR REPLACE
+ *    doesn't serve a stale cached row on next `getNodeById`.
+ * 3. `runMaintenance` runs `PRAGMA optimize` + `wal_checkpoint(PASSIVE)`
+ *    after indexAll/sync without throwing.
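+ *    (Both are meant to be cheap when idle — `optimize` analyzes only what
+ *    changed and PASSIVE checkpoints never block writers — so running them
+ *    after every sync should be safe; that property is what these tests pin.)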
+ */ + +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; +import { DatabaseConnection } from '../src/db'; +import { QueryBuilder } from '../src/db/queries'; +import { Node } from '../src/types'; + +function makeNode(id: string, name = id): Node { + return { + id, + kind: 'function', + name, + qualifiedName: name, + filePath: 'a.ts', + language: 'typescript', + startLine: 1, + endLine: 1, + startColumn: 0, + endColumn: 0, + updatedAt: Date.now(), + }; +} + +describe('getNodesByIds (batch lookup)', () => { + let dir: string; + let db: DatabaseConnection; + let q: QueryBuilder; + + beforeEach(() => { + dir = fs.mkdtempSync(path.join(os.tmpdir(), 'db-perf-batch-')); + db = DatabaseConnection.initialize(path.join(dir, 'test.db')); + q = new QueryBuilder(db.getDb()); + }); + + afterEach(() => { + db.close(); + if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true }); + }); + + it('returns a Map keyed by id, with one entry per existing node', () => { + q.insertNodes([makeNode('n1'), makeNode('n2'), makeNode('n3')]); + const out = q.getNodesByIds(['n1', 'n2', 'n3']); + expect(out.size).toBe(3); + expect(out.get('n1')!.name).toBe('n1'); + expect(out.get('n3')!.name).toBe('n3'); + }); + + it('omits missing IDs from the result map (no nulls, no exceptions)', () => { + q.insertNodes([makeNode('n1'), makeNode('n2')]); + const out = q.getNodesByIds(['n1', 'missing', 'n2']); + expect(out.size).toBe(2); + expect(out.has('missing')).toBe(false); + expect(out.has('n1')).toBe(true); + expect(out.has('n2')).toBe(true); + }); + + it('handles an empty input array', () => { + expect(q.getNodesByIds([]).size).toBe(0); + }); + + it('handles batches over the SQLite parameter limit (chunking)', () => { + // Insert 1500 nodes; the helper chunks at 500 internally. + const nodes = Array.from({ length: 1500 }, (_, i) => makeNode(`n${i}`)); + q.insertNodes(nodes); + const ids = nodes.map((n) => n.id); + const out = q.getNodesByIds(ids); + expect(out.size).toBe(1500); + // Spot-check a few from the first / middle / last chunk. + expect(out.has('n0')).toBe(true); + expect(out.has('n750')).toBe(true); + expect(out.has('n1499')).toBe(true); + }); + + it('serves cache hits from memory and queries only the misses', () => { + q.insertNodes([makeNode('n1'), makeNode('n2'), makeNode('n3')]); + // Warm the cache for n1 only. + q.getNodeById('n1'); + // Replace the underlying row to make a miss-vs-cache-hit detectable. + db.getDb().prepare('UPDATE nodes SET name = ? WHERE id = ?').run('changed', 'n1'); + const out = q.getNodesByIds(['n1', 'n2']); + // The cached n1 (still 'n1', not 'changed') must be returned. + expect(out.get('n1')!.name).toBe('n1'); + expect(out.get('n2')!.name).toBe('n2'); + }); +}); + +describe('insertNode cache invalidation', () => { + let dir: string; + let db: DatabaseConnection; + let q: QueryBuilder; + + beforeEach(() => { + dir = fs.mkdtempSync(path.join(os.tmpdir(), 'db-perf-cache-')); + db = DatabaseConnection.initialize(path.join(dir, 'test.db')); + q = new QueryBuilder(db.getDb()); + }); + + afterEach(() => { + db.close(); + if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true }); + }); + + it('does not serve a stale cached node after INSERT OR REPLACE', () => { + // Regression: insertNode (which uses INSERT OR REPLACE) used to skip + // cache invalidation, so the next getNodeById returned the pre-replace + // version until LRU eviction. 
+    const original = makeNode('n1', 'oldName');
+    q.insertNode(original);
+    const beforeReplace = q.getNodeById('n1');
+    expect(beforeReplace!.name).toBe('oldName');
+
+    // Replace via insertNode (the bug path).
+    q.insertNode({ ...original, name: 'newName', updatedAt: Date.now() });
+    const afterReplace = q.getNodeById('n1');
+    expect(afterReplace!.name).toBe('newName');
+  });
+});
+
+describe('runMaintenance', () => {
+  let dir: string;
+  let db: DatabaseConnection;
+
+  beforeEach(() => {
+    dir = fs.mkdtempSync(path.join(os.tmpdir(), 'db-perf-maint-'));
+    db = DatabaseConnection.initialize(path.join(dir, 'test.db'));
+  });
+
+  afterEach(() => {
+    db.close();
+    if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true });
+  });
+
+  it('runs without throwing on a fresh database', () => {
+    expect(() => db.runMaintenance()).not.toThrow();
+  });
+
+  it('runs without throwing after writes', () => {
+    const q = new QueryBuilder(db.getDb());
+    q.insertNodes([makeNode('n1'), makeNode('n2')]);
+    expect(() => db.runMaintenance()).not.toThrow();
+  });
+
+  it('swallows failures rather than propagating (best-effort)', () => {
+    // Close the DB so the underlying handle would normally throw on any
+    // exec(). runMaintenance must still not propagate.
+    db.close();
+    expect(() => db.runMaintenance()).not.toThrow();
+  });
+});
diff --git a/__tests__/diversify.test.ts b/__tests__/diversify.test.ts
new file mode 100644
index 00000000..181ee9c5
--- /dev/null
+++ b/__tests__/diversify.test.ts
@@ -0,0 +1,200 @@
+/**
+ * Result Diversification Tests
+ *
+ * Verifies the per-file cap on search results: queries that match many
+ * symbols in one file (the methods of a class) no longer return 10 hits
+ * from one file, but instead surface representative breadth across files.
+ */
+
+import { describe, it, expect, beforeEach, afterEach } from 'vitest';
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+import { DatabaseConnection } from '../src/db';
+import { QueryBuilder } from '../src/db/queries';
+import { diversifyByFile } from '../src/search/query-utils';
+import { Node } from '../src/types';
+
+describe('diversifyByFile (unit)', () => {
+  function r(score: number, name: string, filePath: string) {
+    return { node: { id: name, name, filePath } as Node, score };
+  }
+
+  it('caps consecutive results from the same file at perFileCap', () => {
+    const results = [
+      r(10, 'a1', 'a.ts'),
+      r(9, 'a2', 'a.ts'),
+      r(8, 'a3', 'a.ts'),
+      r(7, 'a4', 'a.ts'),
+      r(6, 'b1', 'b.ts'),
+    ];
+    const out = diversifyByFile(results, 5, 2);
+    expect(out.map((x) => x.node.name)).toEqual(['a1', 'a2', 'b1', 'a3', 'a4']);
+    // First two from a.ts (cap), then b.ts (different file), then backfill.
+  });
+
+  it('preserves overall ranking when no file dominates', () => {
+    const results = [
+      r(10, 'a1', 'a.ts'),
+      r(9, 'b1', 'b.ts'),
+      r(8, 'c1', 'c.ts'),
+      r(7, 'a2', 'a.ts'),
+    ];
+    const out = diversifyByFile(results, 4, 2);
+    expect(out.map((x) => x.node.name)).toEqual(['a1', 'b1', 'c1', 'a2']);
+  });
+
+  it('does not lose results — backfills from skipped when limit not yet filled', () => {
+    // 10 candidates all from one file, limit 5, cap 2: pick 2, backfill 3.
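+    // (Expected picks: n0 and n1 hit the cap; n2–n4 backfill the remainder
+    // in score order, since the input scores are already descending.)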
+ const results = Array.from({ length: 10 }, (_, i) => + r(10 - i, `n${i}`, 'a.ts') + ); + const out = diversifyByFile(results, 5, 2); + expect(out).toHaveLength(5); + expect(out.every((x) => x.node.filePath === 'a.ts')).toBe(true); + }); + + it('returns the input slice unchanged when perFileCap=0', () => { + const results = [ + r(10, 'a1', 'a.ts'), + r(9, 'a2', 'a.ts'), + r(8, 'a3', 'a.ts'), + ]; + expect(diversifyByFile(results, 3, 0)).toEqual(results); + }); + + it('returns input unchanged when results.length <= limit and no reordering needed', () => { + const results = [r(10, 'a1', 'a.ts'), r(9, 'a2', 'a.ts')]; + expect(diversifyByFile(results, 5, 2)).toEqual(results); + }); + + it('still reorders within limit when results.length === limit but cap rearranges', () => { + // Same total count as limit, but the cap reorders to surface peer files + // earlier in the list. + const results = [ + r(10, 'a1', 'a.ts'), + r(9, 'a2', 'a.ts'), + r(8, 'a3', 'a.ts'), + r(7, 'a4', 'a.ts'), + r(6, 'b1', 'b.ts'), + ]; + const out = diversifyByFile(results, 5, 2); + // First 2 from a.ts (cap), then b.ts, then backfill a.ts. + expect(out.map((x) => x.node.name)).toEqual(['a1', 'a2', 'b1', 'a3', 'a4']); + }); + + it('respects the limit even when picked + skipped exceed it', () => { + const results = [ + r(10, 'a1', 'a.ts'), + r(9, 'a2', 'a.ts'), + r(8, 'a3', 'a.ts'), + r(7, 'b1', 'b.ts'), + ]; + const out = diversifyByFile(results, 2, 2); + expect(out).toHaveLength(2); + expect(out.map((x) => x.node.name)).toEqual(['a1', 'a2']); + }); + + it('always preserves the top-scoring result at position 0', () => { + const results = [ + r(100, 'top', 'big.ts'), + r(50, 'big2', 'big.ts'), + r(40, 'big3', 'big.ts'), + r(30, 'big4', 'big.ts'), + r(20, 'other', 'other.ts'), + ]; + const out = diversifyByFile(results, 3, 2); + expect(out[0].node.name).toBe('top'); + }); +}); + +describe('searchNodes per-file diversification (integration)', () => { + let dir: string; + let db: DatabaseConnection; + let q: QueryBuilder; + + function makeNode(id: string, name: string, kind: Node['kind'], filePath: string): Node { + return { + id, + kind, + name, + qualifiedName: `${filePath}::${name}`, + filePath, + language: 'typescript', + startLine: 1, + endLine: 1, + startColumn: 0, + endColumn: 0, + updatedAt: Date.now(), + }; + } + + beforeEach(() => { + dir = fs.mkdtempSync(path.join(os.tmpdir(), 'diversify-search-')); + db = DatabaseConnection.initialize(path.join(dir, 'test.db')); + q = new QueryBuilder(db.getDb()); + // Simulate the "10 methods of one class" scenario: a class plus many + // methods all sharing a common token, all in one file. Plus a peer + // file with a sibling implementation. 
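+      // The searches below all hit the shared "connect" token; without the
+      // cap, src/db.ts would crowd src/pool.ts out of the top results.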
+ const nodes: Node[] = [ + makeNode('cls', 'DatabaseConnection', 'class', 'src/db.ts'), + makeNode('m1', 'connect', 'method', 'src/db.ts'), + makeNode('m2', 'disconnect', 'method', 'src/db.ts'), + makeNode('m3', 'reconnect', 'method', 'src/db.ts'), + makeNode('m4', 'isConnected', 'method', 'src/db.ts'), + makeNode('m5', 'connectionString', 'property', 'src/db.ts'), + makeNode('peer', 'PoolConnection', 'class', 'src/pool.ts'), + makeNode('peer2', 'connectPool', 'function', 'src/pool.ts'), + ]; + q.insertNodes(nodes); + }); + + afterEach(() => { + db.close(); + if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true }); + }); + + it('caps results per file at the default (3) so peer files surface', () => { + const results = q.searchNodes('connect', { limit: 5 }); + const fromDbTs = results.filter((r) => r.node.filePath === 'src/db.ts').length; + const fromPool = results.filter((r) => r.node.filePath === 'src/pool.ts').length; + expect(fromDbTs).toBeLessThanOrEqual(3); // cap + expect(fromPool).toBeGreaterThanOrEqual(1); // peer file represented + }); + + it('honors perFileCap: 0 (disabled) — does not enforce a per-file limit', () => { + // Insert a heavy imbalance so dominance is unambiguous: 10 matching + // methods in db.ts, only the existing pool.ts entries elsewhere. + const heavyDb: Node[] = Array.from({ length: 10 }, (_, i) => + makeNode(`heavy${i}`, `connectVariant${i}`, 'method', 'src/db.ts') + ); + q.insertNodes(heavyDb); + const results = q.searchNodes('connect', { limit: 8, perFileCap: 0 }); + const fromDbTs = results.filter((r) => r.node.filePath === 'src/db.ts').length; + expect(fromDbTs).toBeGreaterThan(3); + }); + + it('honors a higher perFileCap', () => { + const results = q.searchNodes('connect', { limit: 6, perFileCap: 5 }); + const fromDbTs = results.filter((r) => r.node.filePath === 'src/db.ts').length; + expect(fromDbTs).toBeLessThanOrEqual(5); + }); + + it('preserves the top-scoring hit even with diversification', () => { + // Class node with the most direct name match is the most relevant — + // diversification must never displace it from #1. + const results = q.searchNodes('DatabaseConnection', { limit: 3 }); + expect(results[0].node.name).toBe('DatabaseConnection'); + }); + + it('does not lose results — fills limit by backfilling skipped same-file hits', () => { + // If only one file has matches, all results legitimately come from it. + // The cap should not cause us to return fewer than `limit` results. + const onlyOneFileNodes: Node[] = Array.from({ length: 10 }, (_, i) => + makeNode(`only${i}`, `solo${i}`, 'function', 'src/only.ts') + ); + q.insertNodes(onlyOneFileNodes); + const results = q.searchNodes('solo', { limit: 5 }); + expect(results.length).toBe(5); + }); +}); diff --git a/__tests__/edges-unique.test.ts b/__tests__/edges-unique.test.ts new file mode 100644 index 00000000..49eced53 --- /dev/null +++ b/__tests__/edges-unique.test.ts @@ -0,0 +1,166 @@ +/** + * Edge Uniqueness Tests + * + * Regression tests for the bug where `INSERT OR IGNORE INTO edges` was + * silently a no-op: the only candidate key was the AUTOINCREMENT id (which + * never conflicts), so duplicate edges accumulated on every re-emission / + * re-resolution. + * + * Fix: a UNIQUE index on (source, target, kind, COALESCE(line, -1), + * COALESCE(col, -1)) backs a fresh-install schema and is also applied via + * migration v4 (with a dedup pass over existing rows). 
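+ *
+ * A sketch of the index shape these tests imply (the authoritative DDL
+ * lives in src/db/migrations; treat this exact statement as an assumption):
+ *
+ *   CREATE UNIQUE INDEX idx_edges_unique
+ *     ON edges (source, target, kind, COALESCE(line, -1), COALESCE(col, -1));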
+ */ + +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; +import { DatabaseConnection } from '../src/db'; +import { QueryBuilder } from '../src/db/queries'; +import { Edge, Node } from '../src/types'; +import { runMigrations, getCurrentVersion, CURRENT_SCHEMA_VERSION } from '../src/db/migrations'; + +function tempDb(): { dir: string; db: DatabaseConnection; q: QueryBuilder } { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-edges-unique-')); + const db = DatabaseConnection.initialize(path.join(dir, 'test.db')); + const q = new QueryBuilder(db.getDb()); + return { dir, db, q }; +} + +function cleanup(dir: string, db: DatabaseConnection) { + db.close(); + if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true }); +} + +function makeNode(id: string, name: string): Node { + return { + id, + kind: 'function', + name, + qualifiedName: `f::${name}`, + filePath: 'a.ts', + language: 'typescript', + startLine: 1, + endLine: 1, + startColumn: 0, + endColumn: 0, + updatedAt: Date.now(), + }; +} + +function edgesCount(db: DatabaseConnection): number { + const row = db.getDb().prepare('SELECT COUNT(*) as c FROM edges').get() as { c: number }; + return row.c; +} + +describe('Edge UNIQUE constraint (bug #2)', () => { + let dir: string; + let db: DatabaseConnection; + let q: QueryBuilder; + + beforeEach(() => { + ({ dir, db, q } = tempDb()); + q.insertNodes([makeNode('n1', 'foo'), makeNode('n2', 'bar')]); + }); + + afterEach(() => cleanup(dir, db)); + + it('rejects duplicate (source, target, kind, line, col)', () => { + const e: Edge = { source: 'n1', target: 'n2', kind: 'calls', line: 10, column: 5 }; + q.insertEdge(e); + q.insertEdge(e); // INSERT OR IGNORE — should be a no-op now + expect(edgesCount(db)).toBe(1); + }); + + it('treats two NULL line edges as duplicates (COALESCE in unique index)', () => { + const e: Edge = { source: 'n1', target: 'n2', kind: 'calls' }; + q.insertEdge(e); + q.insertEdge(e); + expect(edgesCount(db)).toBe(1); + }); + + it('allows same source/target/kind on different lines', () => { + q.insertEdge({ source: 'n1', target: 'n2', kind: 'calls', line: 1 }); + q.insertEdge({ source: 'n1', target: 'n2', kind: 'calls', line: 2 }); + expect(edgesCount(db)).toBe(2); + }); + + it('allows same source/target/line on different kinds', () => { + q.insertEdge({ source: 'n1', target: 'n2', kind: 'calls', line: 1 }); + q.insertEdge({ source: 'n1', target: 'n2', kind: 'references', line: 1 }); + expect(edgesCount(db)).toBe(2); + }); + + it('insertEdges (batch) dedupes within the same call', () => { + const e: Edge = { source: 'n1', target: 'n2', kind: 'calls', line: 1, column: 1 }; + q.insertEdges([e, e, e]); + expect(edgesCount(db)).toBe(1); + }); + + it('survives the same edge being re-emitted across many cycles', () => { + const e: Edge = { source: 'n1', target: 'n2', kind: 'calls', line: 1 }; + for (let i = 0; i < 100; i++) { + q.insertEdge(e); + } + expect(edgesCount(db)).toBe(1); + }); +}); + +describe('Migration v4: dedup existing edges', () => { + let dir: string; + let dbPath: string; + + beforeEach(() => { + dir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-migr-v4-')); + dbPath = path.join(dir, 'test.db'); + }); + + afterEach(() => { + if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true }); + }); + + it('collapses pre-existing duplicates and adds the UNIQUE index', () => { + // Build a v3-shaped database manually: schema, but 
simulate a stale + // version row + insert duplicates that the missing UNIQUE index let + // through. We use the real initialize() path then drop the index + + // version row to back-date the DB. + const db = DatabaseConnection.initialize(dbPath); + db.getDb().exec(`DROP INDEX IF EXISTS idx_edges_unique;`); + db.getDb().exec(`DELETE FROM schema_versions;`); + db.getDb().prepare( + 'INSERT INTO schema_versions (version, applied_at, description) VALUES (3, ?, ?)' + ).run(Date.now(), 'simulated v3'); + + const q = new QueryBuilder(db.getDb()); + q.insertNodes([makeNode('n1', 'foo'), makeNode('n2', 'bar')]); + // Force-insert duplicates via raw SQL (bypassing the constraint that + // is now absent). Three rows that should collapse to one. + const stmt = db.getDb().prepare( + 'INSERT INTO edges (source, target, kind, line, col) VALUES (?, ?, ?, ?, ?)' + ); + stmt.run('n1', 'n2', 'calls', 10, 5); + stmt.run('n1', 'n2', 'calls', 10, 5); + stmt.run('n1', 'n2', 'calls', 10, 5); + // And one with NULL line/col, also duplicated + stmt.run('n1', 'n2', 'references', null, null); + stmt.run('n1', 'n2', 'references', null, null); + + expect(edgesCount(db)).toBe(5); + expect(getCurrentVersion(db.getDb())).toBe(3); + + // Run migrations forward + runMigrations(db.getDb(), 3); + + expect(getCurrentVersion(db.getDb())).toBe(CURRENT_SCHEMA_VERSION); + expect(CURRENT_SCHEMA_VERSION).toBeGreaterThanOrEqual(4); + // 3 calls dups → 1, 2 references dups → 1 + expect(edgesCount(db)).toBe(2); + + // Now the constraint is enforced: another duplicate insert is a no-op. + const q2 = new QueryBuilder(db.getDb()); + q2.insertEdge({ source: 'n1', target: 'n2', kind: 'calls', line: 10, column: 5 }); + expect(edgesCount(db)).toBe(2); + + db.close(); + }); +}); diff --git a/__tests__/embeddings.test.ts b/__tests__/embeddings.test.ts new file mode 100644 index 00000000..216e4a08 --- /dev/null +++ b/__tests__/embeddings.test.ts @@ -0,0 +1,388 @@ +/** + * Embedding pipeline + hybrid search + cross-language matching. + * + * Reuses the in-process fake-Ollama pattern from llm.test.ts so the + * tests stay hermetic. The fake server returns deterministic vectors + * derived from the input text so we can assert ordering by hand. + */ + +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; +import * as http from 'http'; +import { AddressInfo } from 'net'; +import { CodeGraph } from '../src'; +import { + vectorToBytes, + bytesToVector, + cosineNormalised, + reciprocalRankFusion, + topKByCosine, + topKByCosineMatrix, + EmbeddingCache, +} from '../src/llm/embeddings'; + +const EMBED_DIM = 8; + +function l2(v: Float32Array): Float32Array { + let s = 0; + for (let i = 0; i < v.length; i++) s += v[i]! * v[i]!; + const n = Math.sqrt(s) || 1; + const out = new Float32Array(v.length); + for (let i = 0; i < v.length; i++) out[i] = v[i]! / n; + return out; +} + +/** Deterministic 8-dim vector keyed off character codes. 
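+ * Identical input always yields the identical vector; the tests below rely
+ * only on that determinism, not on distinct inputs staying distinct.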
 */
+function fakeEmbed(text: string): number[] {
+  const v = new Array(EMBED_DIM).fill(0);
+  for (let i = 0; i < text.length; i++) {
+    v[i % EMBED_DIM] += text.charCodeAt(i) % 17;
+  }
+  return v;
+}
+
+interface FakeServer {
+  url: string;
+  chatCalls: number;
+  embedCalls: number;
+  close: () => Promise<void>;
+}
+
+async function startFake(): Promise<FakeServer> {
+  const state = { chatCalls: 0, embedCalls: 0 };
+  const server = http.createServer((req, res) => {
+    let body = '';
+    req.on('data', (c) => (body += c));
+    req.on('end', () => {
+      res.setHeader('content-type', 'application/json');
+      if (req.url?.endsWith('/models') || req.url === '/models') {
+        res.end(
+          JSON.stringify({
+            data: [{ id: 'qwen2.5-coder:7b' }, { id: 'nomic-embed-text' }],
+          })
+        );
+        return;
+      }
+      if (req.url?.endsWith('/chat/completions')) {
+        state.chatCalls++;
+        // Look for the symbol body in the user message and echo a
+        // deterministic summary so the cache key is stable.
+        const parsed = JSON.parse(body) as {
+          messages: Array<{ content: string }>;
+        };
+        const userText = parsed.messages?.[0]?.content || '';
+        const last = userText.slice(-200);
+        res.end(
+          JSON.stringify({
+            choices: [
+              {
+                message: {
+                  role: 'assistant',
+                  content: 'Summary of: ' + last.replace(/\s+/g, ' ').slice(0, 80),
+                },
+              },
+            ],
+          })
+        );
+        return;
+      }
+      if (req.url?.endsWith('/embeddings')) {
+        state.embedCalls++;
+        const parsed = JSON.parse(body) as { input: string[] };
+        res.end(
+          JSON.stringify({
+            data: parsed.input.map((text) => ({ embedding: fakeEmbed(text) })),
+          })
+        );
+        return;
+      }
+      res.statusCode = 404;
+      res.end();
+    });
+  });
+  await new Promise<void>((r) => server.listen(0, '127.0.0.1', r));
+  const addr = server.address() as AddressInfo;
+  return {
+    url: `http://127.0.0.1:${addr.port}/v1`,
+    get chatCalls() {
+      return state.chatCalls;
+    },
+    get embedCalls() {
+      return state.embedCalls;
+    },
+    close: () =>
+      new Promise<void>((resolve, reject) =>
+        server.close((err) => (err ?
reject(err) : resolve())) + ), + }; +} + +describe('embedding helpers', () => { + it('vectorToBytes round-trips through bytesToVector', () => { + const v = l2(Float32Array.from([1, 2, 3, 4, 5, 6, 7, 8])); + const b = vectorToBytes(v); + const v2 = bytesToVector(b); + for (let i = 0; i < v.length; i++) { + expect(v2[i]).toBeCloseTo(v[i]!, 6); + } + }); + + it('cosineNormalised gives 1.0 for the same vector', () => { + const v = l2(Float32Array.from([1, 0, 0, 0, 0, 0, 0, 0])); + expect(cosineNormalised(v, v)).toBeCloseTo(1, 6); + }); + + it('cosineNormalised gives 0 for orthogonal vectors', () => { + const a = l2(Float32Array.from([1, 0, 0, 0, 0, 0, 0, 0])); + const b = l2(Float32Array.from([0, 1, 0, 0, 0, 0, 0, 0])); + expect(cosineNormalised(a, b)).toBeCloseTo(0, 6); + }); + + it('topKByCosine returns the highest-scoring node ids', () => { + const query = l2(Float32Array.from([1, 0, 0, 0, 0, 0, 0, 0])); + const candidates = [ + { nodeId: 'a', embedding: vectorToBytes(l2(Float32Array.from([0.9, 0.1, 0, 0, 0, 0, 0, 0]))) }, + { nodeId: 'b', embedding: vectorToBytes(l2(Float32Array.from([0, 1, 0, 0, 0, 0, 0, 0]))) }, + { nodeId: 'c', embedding: vectorToBytes(l2(Float32Array.from([0.5, 0.5, 0, 0, 0, 0, 0, 0]))) }, + ]; + const hits = topKByCosine(query, candidates, 2); + expect(hits.map((h) => h.nodeId)).toEqual(['a', 'c']); + }); + + it('RRF favors items appearing high in both rankings', () => { + const fts = [{ id: 'x' }, { id: 'y' }, { id: 'z' }]; + const sem = [{ id: 'y' }, { id: 'z' }, { id: 'x' }]; + const fused = reciprocalRankFusion([fts, sem]); + // y appears at rank 2 in fts (1/62) + rank 1 in sem (1/61) = highest + const sorted = [...fused.entries()].sort((a, b) => b[1] - a[1]).map(([id]) => id); + expect(sorted[0]).toBe('y'); + }); + + it('topKByCosineMatrix matches topKByCosine on the same data', () => { + const query = l2(Float32Array.from([1, 0, 0, 0, 0, 0, 0, 0])); + const vecs = [ + { id: 'a', v: l2(Float32Array.from([0.9, 0.1, 0, 0, 0, 0, 0, 0])) }, + { id: 'b', v: l2(Float32Array.from([0, 1, 0, 0, 0, 0, 0, 0])) }, + { id: 'c', v: l2(Float32Array.from([0.5, 0.5, 0, 0, 0, 0, 0, 0])) }, + ]; + const candidates = vecs.map((e) => ({ nodeId: e.id, embedding: vectorToBytes(e.v) })); + const matrix = new Float32Array(vecs.length * EMBED_DIM); + const ids = vecs.map((e) => e.id); + for (let i = 0; i < vecs.length; i++) matrix.set(vecs[i]!.v, i * EMBED_DIM); + + const a = topKByCosine(query, candidates, 3).map((h) => h.nodeId); + const b = topKByCosineMatrix(query, matrix, ids, EMBED_DIM, 3).map((h) => h.nodeId); + expect(b).toEqual(a); + }); + + it('EmbeddingCache returns the same result on hit and miss; invalidate forces refetch', () => { + let fetchCalls = 0; + const v = vectorToBytes(l2(Float32Array.from([1, 0, 0, 0, 0, 0, 0, 0]))); + const fetcher = { + getAllEmbeddings: (_model: string) => { + fetchCalls++; + return [{ nodeId: 'a', embedding: v }]; + }, + }; + + const cache = new EmbeddingCache(); + const r1 = cache.get(fetcher, 'm'); + const r2 = cache.get(fetcher, 'm'); + expect(fetchCalls).toBe(1); + expect(r1).toBe(r2); + expect(r1.ids).toEqual(['a']); + expect(r1.dim).toBe(EMBED_DIM); + + cache.invalidate(); + cache.get(fetcher, 'm'); + expect(fetchCalls).toBe(2); + + // Switching models also forces a refetch. 
+    cache.get(fetcher, 'other-model');
+    expect(fetchCalls).toBe(3);
+  });
+
+  it('EmbeddingCache skips rows whose dimension does not match the first row', () => {
+    const v3 = vectorToBytes(l2(Float32Array.from([1, 0, 0, 0, 0, 0, 0, 0])));
+    // Different shape: 4-dim vector. Should be skipped.
+    const v4 = Buffer.from(new Float32Array([1, 0, 0, 0]).buffer);
+    const fetcher = {
+      getAllEmbeddings: (_model: string) => [
+        { nodeId: 'good', embedding: v3 },
+        { nodeId: 'bad', embedding: v4 },
+        { nodeId: 'good2', embedding: v3 },
+      ],
+    };
+    const cache = new EmbeddingCache();
+    const r = cache.get(fetcher, 'm');
+    expect(r.ids).toEqual(['good', 'good2']);
+    expect(r.matrix.length).toBe(2 * EMBED_DIM);
+    expect(r.dim).toBe(EMBED_DIM);
+  });
+
+  it('EmbeddingCache returns an empty result without calling the fetcher again on hit', () => {
+    let fetchCalls = 0;
+    const fetcher = {
+      getAllEmbeddings: (_model: string) => {
+        fetchCalls++;
+        return [];
+      },
+    };
+    const cache = new EmbeddingCache();
+    const r = cache.get(fetcher, 'm');
+    expect(r.ids).toEqual([]);
+    expect(r.dim).toBe(0);
+    cache.get(fetcher, 'm');
+    expect(fetchCalls).toBe(1);
+  });
+});
+
+describe('CodeGraph hybrid search & similar', () => {
+  let tempDir: string;
+  let fake: FakeServer;
+
+  beforeEach(async () => {
+    tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-emb-'));
+    fake = await startFake();
+    fs.writeFileSync(
+      path.join(tempDir, 'sample.ts'),
+      `export function authenticateUser(name: string): string {
+  const token = 'secret';
+  const claim = 'session';
+  return name + token + claim;
+}
+
+export function lookupAccount(id: string): { id: string } {
+  const cache = new Map();
+  cache.set(id, { id });
+  return { id };
+}
+
+export class TokenStore {
+  private bag: Map<string, string> = new Map();
+  put(k: string, v: string): void { this.bag.set(k, v); }
+  get(k: string): string | undefined { return this.bag.get(k); }
+  size(): number { return this.bag.size; }
+}
+`
+    );
+    fs.writeFileSync(
+      path.join(tempDir, 'helper.py'),
+      `def authenticate_user(name):
+    token = 'secret'
+    claim = 'session'
+    return name + token + claim
+`
+    );
+  });
+
+  afterEach(async () => {
+    await fake.close();
+    fs.rmSync(tempDir, { recursive: true, force: true });
+  });
+
+  it('searchHybrid falls back to FTS when no embedding model is configured', async () => {
+    const cg = await CodeGraph.init(tempDir, {
+      config: {
+        llm: { endpoint: fake.url, chatModel: 'qwen2.5-coder:7b' },
+      },
+    });
+    try {
+      await cg.indexAll({ summarize: false });
+      const results = await cg.searchHybrid('authenticate', { limit: 5 });
+      expect(results.length).toBeGreaterThan(0);
+      // No embeddings in DB → no embed calls fired
+      expect(fake.embedCalls).toBe(0);
+    } finally {
+      cg.close();
+    }
+  });
+
+  it('background pass produces summaries AND embeddings end-to-end', async () => {
+    const cg = await CodeGraph.init(tempDir, {
+      config: {
+        llm: {
+          endpoint: fake.url,
+          chatModel: 'qwen2.5-coder:7b',
+          embeddingModel: 'nomic-embed-text',
+        },
+      },
+    });
+    try {
+      await cg.indexAll();
+      await cg.awaitBackgroundSummarization();
+
+      const cov = cg.getSummaryCoverage();
+      expect(cov.summarised).toBeGreaterThan(0);
+      // Embedding pass also ran (chat calls > 0 AND embed calls > 0)
+      expect(fake.chatCalls).toBeGreaterThan(0);
+      expect(fake.embedCalls).toBeGreaterThan(0);
+
+      // Re-running summarize is a cache hit; re-running embed should
+      // also be a cache hit (embedding_model already set).
+ const callsAfterFirst = fake.chatCalls + fake.embedCalls; + await cg.summarizeAll(); + // chat shouldn't fire again; embed pass not invoked here directly. + expect(fake.chatCalls + fake.embedCalls).toBe(callsAfterFirst); + } finally { + cg.close(); + } + }); + + it('searchHybrid returns FTS+semantic blended results once embeddings exist', async () => { + const cg = await CodeGraph.init(tempDir, { + config: { + llm: { + endpoint: fake.url, + chatModel: 'qwen2.5-coder:7b', + embeddingModel: 'nomic-embed-text', + }, + }, + }); + try { + await cg.indexAll(); + await cg.awaitBackgroundSummarization(); + + const results = await cg.searchHybrid('authenticateUser', { limit: 5 }); + expect(results.length).toBeGreaterThan(0); + // Hybrid path embedded the query (one extra embed call beyond + // the bulk-summary embeddings). + expect(fake.embedCalls).toBeGreaterThan(1); + } finally { + cg.close(); + } + }); + + it('findSimilar returns related symbols and respects differentLanguage', async () => { + const cg = await CodeGraph.init(tempDir, { + config: { + llm: { + endpoint: fake.url, + chatModel: 'qwen2.5-coder:7b', + embeddingModel: 'nomic-embed-text', + }, + }, + }); + try { + await cg.indexAll(); + await cg.awaitBackgroundSummarization(); + + const ts = cg.searchNodes('authenticateUser', { limit: 1 })[0]; + expect(ts).toBeDefined(); + + const similar = await cg.findSimilar(ts!.node.id, { limit: 3 }); + // Should exclude the source itself + expect(similar.find((r) => r.node.id === ts!.node.id)).toBeUndefined(); + + // Cross-language filter should only return non-TS hits (or empty) + const xLang = await cg.findSimilar(ts!.node.id, { limit: 3, differentLanguage: true }); + for (const r of xLang) { + expect(r.node.language).not.toBe(ts!.node.language); + } + } finally { + cg.close(); + } + }); +}); diff --git a/__tests__/extraction-resolution-accuracy.test.ts b/__tests__/extraction-resolution-accuracy.test.ts new file mode 100644 index 00000000..f78f3d76 --- /dev/null +++ b/__tests__/extraction-resolution-accuracy.test.ts @@ -0,0 +1,266 @@ +/** + * Extraction & Resolution Accuracy Tests + * + * Regression tests for three accuracy bugs fixed in one PR: + * 1. Parse-retry comment strip was hardcoded to `//`, no-op on Python/Ruby/etc. + * 2. Framework route extractors ran regex over raw file content, matching + * examples in docstrings/comments as real routes. + * 3. UTF-8 BOM caused spurious "modified" hash mismatches between editors. 
+ */
+
+import { describe, it, expect } from 'vitest';
+import { stripBom, stripCommentLinesForRetry, stripCommentsForRegex } from '../src/utils';
+import { hashContent } from '../src/extraction';
+import { flaskResolver, fastapiResolver, djangoResolver } from '../src/resolution/frameworks/python';
+import { expressResolver } from '../src/resolution/frameworks/express';
+import { aspnetResolver } from '../src/resolution/frameworks/csharp';
+import { rustResolver } from '../src/resolution/frameworks/rust';
+import { laravelResolver } from '../src/resolution/frameworks/laravel';
+
+describe('UTF-8 BOM normalization (bug #5)', () => {
+  it('stripBom removes leading U+FEFF', () => {
+    expect(stripBom('\uFEFFhello')).toBe('hello');
+    expect(stripBom('hello')).toBe('hello');
+    expect(stripBom('')).toBe('');
+  });
+
+  it('stripBom only removes leading BOM, not embedded ones', () => {
+    expect(stripBom('a\uFEFFb')).toBe('a\uFEFFb');
+  });
+
+  it('hashContent treats BOM and no-BOM as identical', () => {
+    const withBom = '\uFEFFexport function hello() { return 42; }';
+    const withoutBom = 'export function hello() { return 42; }';
+    expect(hashContent(withBom)).toBe(hashContent(withoutBom));
+  });
+});
+
+describe('Per-language comment-line stripping (bug #1)', () => {
+  it('strips `#` lines for Python', () => {
+    const input = ['# CHECK: foo', 'def x():', ' pass'].join('\n');
+    const out = stripCommentLinesForRetry(input, 'python');
+    expect(out.split('\n')).toEqual(['', 'def x():', ' pass']);
+  });
+
+  it('strips `#` lines for Ruby', () => {
+    const input = ['# top comment', 'def x; end'].join('\n');
+    const out = stripCommentLinesForRetry(input, 'ruby');
+    expect(out.split('\n')).toEqual(['', 'def x; end']);
+  });
+
+  it('strips `//` lines for TypeScript', () => {
+    const input = ['// header', 'function x() {}'].join('\n');
+    const out = stripCommentLinesForRetry(input, 'typescript');
+    expect(out.split('\n')).toEqual(['', 'function x() {}']);
+  });
+
+  it('strips both `//` and `#` lines for PHP', () => {
+    const input = ['// js-style', '# perl-style', '<?php echo 1;'].join('\n');
+    const out = stripCommentLinesForRetry(input, 'php');
+    expect(out.split('\n')).toEqual(['', '', '<?php echo 1;']);
+  });
+
+  it('returns the input unchanged for unknown languages', () => {
+    const input = '// looks like a comment\ncode';
+    expect(stripCommentLinesForRetry(input, 'unknown-lang')).toBe(input);
+  });
+
+  it('preserves line count so node positions stay correct', () => {
+    const input = ['# c1', 'a', '# c2', 'b'].join('\n');
+    const out = stripCommentLinesForRetry(input, 'python');
+    expect(out.split('\n').length).toBe(input.split('\n').length);
+  });
+
+  it('strips indented `#` comment lines but keeps mid-line `#` comments (Python)', () => {
+    // The marker matches optional leading whitespace + `#`, so an indented
+    // pure comment line is correctly stripped. Non-comment code on the same
+    // line as `#` (mid-line comment) is intentionally not stripped here.
+ const input = [' # indented comment', ' pass # trailing'].join('\n'); + const out = stripCommentLinesForRetry(input, 'python'); + expect(out.split('\n')).toEqual(['', ' pass # trailing']); + }); +}); + +describe('Framework regex no longer matches docstrings/comments (bug #4)', () => { + describe('Flask', () => { + it('skips routes inside `#` comments', () => { + const content = [ + 'from flask import Flask', + 'app = Flask(__name__)', + '# Example: @app.route("/fake")', + '@app.route("/real")', + 'def real(): pass', + ].join('\n'); + const nodes = flaskResolver.extractNodes!('app.py', content); + const paths = nodes.map((n) => n.name); + expect(paths).toContain('/real'); + expect(paths).not.toContain('/fake'); + }); + + it('skips routes inside triple-quoted docstrings', () => { + const content = [ + 'def example():', + ' """', + ' Usage: @app.route("/fake")', + ' """', + ' pass', + '@app.route("/real")', + 'def real(): pass', + ].join('\n'); + const nodes = flaskResolver.extractNodes!('app.py', content); + const paths = nodes.map((n) => n.name); + expect(paths).toContain('/real'); + expect(paths).not.toContain('/fake'); + }); + }); + + describe('FastAPI', () => { + it('skips routes inside `#` comments and triple-quoted docstrings', () => { + const content = [ + '"""', + 'Module docs — example: @app.get("/docfake")', + '"""', + '# @app.post("/commentfake")', + '@app.get("/real")', + 'def real(): pass', + ].join('\n'); + const nodes = fastapiResolver.extractNodes!('app.py', content); + const names = nodes.map((n) => n.name); + expect(names.some((n) => n.includes('/real'))).toBe(true); + expect(names.some((n) => n.includes('/docfake'))).toBe(false); + expect(names.some((n) => n.includes('/commentfake'))).toBe(false); + }); + + it('preserves correct line numbers for real routes after stripping', () => { + const content = [ + '"""', // line 1 + '@app.get("/fake")', // line 2 — inside docstring + '"""', // line 3 + '', // line 4 + '@app.get("/real")', // line 5 — real + ].join('\n'); + const nodes = fastapiResolver.extractNodes!('app.py', content); + const real = nodes.find((n) => n.name.includes('/real')); + expect(real).toBeDefined(); + expect(real!.startLine).toBe(5); + }); + }); + + describe('Django URL patterns', () => { + it('skips path() inside `#` comments', () => { + const content = [ + 'from django.urls import path', + '# example: path("fake/", fake_view)', + 'urlpatterns = [path("real/", real_view)]', + ].join('\n'); + const nodes = djangoResolver.extractNodes!('urls.py', content); + const names = nodes.map((n) => n.name); + expect(names).toContain('real/'); + expect(names).not.toContain('fake/'); + }); + }); + + describe('Express', () => { + it('skips routes inside `//` comments', () => { + const content = [ + 'const app = express();', + '// app.get("/fake", fakeHandler);', + 'app.get("/real", realHandler);', + ].join('\n'); + const nodes = expressResolver.extractNodes!('server.js', content); + const names = nodes.map((n) => n.name); + expect(names.some((n) => n.includes('/real'))).toBe(true); + expect(names.some((n) => n.includes('/fake'))).toBe(false); + }); + + it('skips routes inside `/* ... 
*/` block comments', () => {
+      const content = [
+        '/*',
+        ' * app.post("/blockfake", h);',
+        ' */',
+        'app.get("/real", h);',
+      ].join('\n');
+      const nodes = expressResolver.extractNodes!('server.js', content);
+      const names = nodes.map((n) => n.name);
+      expect(names.some((n) => n.includes('/real'))).toBe(true);
+      expect(names.some((n) => n.includes('/blockfake'))).toBe(false);
+    });
+  });
+
+  describe('Laravel', () => {
+    it('skips routes inside PHP `//` and `#` comments', () => {
+      const content = [
+        '<?php',
+        "// Route::get('/jsfake', fn () => 'x');",
+        "# Route::get('/perlfake', fn () => 'x');",
+        "Route::get('/real', fn () => 'x');",
+      ].join('\n');
+      const nodes = laravelResolver.extractNodes!('routes/web.php', content);
+      const names = nodes.map((n) => n.name);
+      expect(names.some((n) => n.includes('/real'))).toBe(true);
+      expect(names.some((n) => n.includes('/jsfake'))).toBe(false);
+      expect(names.some((n) => n.includes('/perlfake'))).toBe(false);
+    });
+  });
+
+  describe('Rust', () => {
+    it('skips actix/rocket routes inside `///` doc comments', () => {
+      const content = [
+        '/// Example route: #[get("/docfake")]',
+        '#[get("/real")]',
+        'fn real() {}',
+      ].join('\n');
+      const nodes = rustResolver.extractNodes!('main.rs', content);
+      const names = nodes.map((n) => n.name);
+      expect(names.some((n) => n.includes('/real'))).toBe(true);
+      expect(names.some((n) => n.includes('/docfake'))).toBe(false);
+    });
+  });
+
+  describe('ASP.NET (C#)', () => {
+    it('skips route attributes inside `///` XML doc comments', () => {
+      const content = [
+        '/// <summary>',
+        '/// Example: [HttpGet("/docfake")]',
+        '/// </summary>',
+        '[HttpGet("/real")]',
+        'public class C {}',
+      ].join('\n');
+      const nodes = aspnetResolver.extractNodes!('Controller.cs', content);
+      const names = nodes.map((n) => n.name);
+      expect(names.some((n) => n.includes('/real'))).toBe(true);
+      expect(names.some((n) => n.includes('/docfake'))).toBe(false);
+    });
+
+    it('skips minimal-API MapGet/MapPost calls inside comments', () => {
+      // Regression: the minimalApiPattern loop below the routePatterns
+      // loop was initially missed when applying the strip helper, leaving
+      // commented-out `app.MapGet("/x")` calls extracted as real routes.
+ const content = [ + '// app.MapGet("/linefake", h);', + '/*', + ' * app.MapPost("/blockfake", h);', + ' */', + 'app.MapGet("/real", h);', + ].join('\n'); + const nodes = aspnetResolver.extractNodes!('Program.cs', content); + const names = nodes.map((n) => n.name); + expect(names.some((n) => n.includes('/real'))).toBe(true); + expect(names.some((n) => n.includes('/linefake'))).toBe(false); + expect(names.some((n) => n.includes('/blockfake'))).toBe(false); + }); + }); +}); + +describe('stripCommentsForRegex preserves line offsets', () => { + it('keeps newlines so match.index → original line number', () => { + const input = '"""\n@app.get("/x")\n"""\n@app.get("/y")'; + const out = stripCommentsForRegex(input, 'python'); + // Newlines preserved + expect(out.split('\n').length).toBe(input.split('\n').length); + // The /y route survives + expect(out).toContain('/y'); + // The docstring contents are blanked + expect(out).not.toContain('/x'); + }); +}); diff --git a/__tests__/extraction.test.ts b/__tests__/extraction.test.ts index 8a70ffed..f279ae03 100644 --- a/__tests__/extraction.test.ts +++ b/__tests__/extraction.test.ts @@ -3079,3 +3079,655 @@ describe('Directory Exclusion', () => { expect(files.every((f) => !f.includes('vendor'))).toBe(true); }); }); + +// ============================================================================= +// R Extraction +// ============================================================================= + +describe('R Extraction', () => { + describe('Language detection', () => { + it('should detect R files', () => { + expect(detectLanguage('script.R')).toBe('r'); + expect(detectLanguage('utils.r')).toBe('r'); + }); + + it('should report R as supported', () => { + expect(isLanguageSupported('r')).toBe(true); + expect(getSupportedLanguages()).toContain('r'); + }); + }); + + describe('Function extraction', () => { + it('should extract a function defined with <-', () => { + const code = `add <- function(a, b) { + a + b +}`; + const result = extractFromSource('main.R', code); + const fn = result.nodes.find((n) => n.kind === 'function' && n.name === 'add'); + expect(fn).toBeDefined(); + expect(fn?.signature).toBe('(a, b)'); + }); + + it('should extract a function defined with =', () => { + const code = `subtract = function(a, b) a - b`; + const result = extractFromSource('main.R', code); + const fn = result.nodes.find((n) => n.kind === 'function' && n.name === 'subtract'); + expect(fn).toBeDefined(); + }); + + it('should extract a function defined with <<-', () => { + const code = `divide <<- function(a, b) a / b`; + const result = extractFromSource('main.R', code); + const fn = result.nodes.find((n) => n.kind === 'function' && n.name === 'divide'); + expect(fn).toBeDefined(); + }); + + it('should extract S3 method names verbatim (period in name)', () => { + const code = `print.myClass <- function(x, ...) 
cat(x$value)`; + const result = extractFromSource('print.R', code); + const fn = result.nodes.find((n) => n.kind === 'function' && n.name === 'print.myClass'); + expect(fn).toBeDefined(); + }); + + it('should NOT emit anonymous function nodes for inline lambdas', () => { + const code = `result <- lapply(xs, function(x) x * 2)`; + const result = extractFromSource('main.R', code); + expect(result.nodes.find((n) => n.kind === 'function')).toBeUndefined(); + }); + + it('should attach a docstring from preceding roxygen comments', () => { + const code = `#' Add two numbers +#' @param a numeric +#' @param b numeric +add <- function(a, b) a + b`; + const result = extractFromSource('main.R', code); + const fn = result.nodes.find((n) => n.kind === 'function' && n.name === 'add'); + expect(fn?.docstring).toContain('Add two numbers'); + }); + }); + + describe('Call extraction', () => { + it('should extract simple function calls inside a function body', () => { + const code = `wrap <- function(x) { + inner(x) + another(x) +}`; + const result = extractFromSource('main.R', code); + const fn = result.nodes.find((n) => n.kind === 'function' && n.name === 'wrap')!; + const calls = result.unresolvedReferences.filter( + (r) => r.fromNodeId === fn.id && r.referenceKind === 'calls' + ); + const calleeNames = calls.map((c) => c.referenceName); + expect(calleeNames).toContain('inner'); + expect(calleeNames).toContain('another'); + }); + + it('should preserve namespace operator in callee name (pkg::fn)', () => { + const code = `runner <- function() { + dplyr::filter(df, x > 0) +}`; + const result = extractFromSource('main.R', code); + const fn = result.nodes.find((n) => n.kind === 'function' && n.name === 'runner')!; + const calleeNames = result.unresolvedReferences + .filter((r) => r.fromNodeId === fn.id) + .map((r) => r.referenceName); + expect(calleeNames).toContain('dplyr::filter'); + }); + }); + + describe('Imports', () => { + it('should extract library() with bare-identifier argument', () => { + const code = `library(dplyr)`; + const result = extractFromSource('main.R', code); + const importNode = result.nodes.find((n) => n.kind === 'import'); + expect(importNode?.name).toBe('dplyr'); + }); + + it('should extract library() with quoted-string argument', () => { + const code = `library("tidyr")`; + const result = extractFromSource('main.R', code); + const importNode = result.nodes.find((n) => n.kind === 'import' && n.name === 'tidyr'); + expect(importNode).toBeDefined(); + }); + + it('should extract require() the same way as library()', () => { + const code = `require(ggplot2)`; + const result = extractFromSource('main.R', code); + const importNode = result.nodes.find((n) => n.kind === 'import' && n.name === 'ggplot2'); + expect(importNode).toBeDefined(); + }); + + it('should extract source() with a string path', () => { + const code = `source("helpers.R")`; + const result = extractFromSource('main.R', code); + const importNode = result.nodes.find((n) => n.kind === 'import' && n.name === 'helpers.R'); + expect(importNode).toBeDefined(); + }); + + it('should not emit an import node for a dynamic source() argument', () => { + const code = `source(paste0(BASE, "/helpers.R"))`; + const result = extractFromSource('main.R', code); + const imports = result.nodes.filter((n) => n.kind === 'import'); + expect(imports.length).toBe(0); + }); + + it('should unquote R 4.0+ raw string literals (round delimiter)', () => { + const code = `source(r"(helpers.R)")`; + const result = extractFromSource('main.R', code); + const 
importNode = result.nodes.find((n) => n.kind === 'import' && n.name === 'helpers.R'); + expect(importNode).toBeDefined(); + }); + + it('should unquote R raw strings with bracket and brace delimiters', () => { + const r1 = extractFromSource('a.R', `library(R"[mypkg]")`); + const r2 = extractFromSource('b.R', `library(r"{mypkg}")`); + expect(r1.nodes.find((n) => n.kind === 'import' && n.name === 'mypkg')).toBeDefined(); + expect(r2.nodes.find((n) => n.kind === 'import' && n.name === 'mypkg')).toBeDefined(); + }); + + it('should unquote dash-delimited raw strings used to embed quotes', () => { + const code = `source(r"-(file.R)-")`; + const result = extractFromSource('main.R', code); + const importNode = result.nodes.find((n) => n.kind === 'import' && n.name === 'file.R'); + expect(importNode).toBeDefined(); + }); + }); + + describe('Top-level constants', () => { + it('should extract top-level non-function assignments as constants', () => { + const code = `PI <- 3.14159 +COLORS <- c("red", "green")`; + const result = extractFromSource('main.R', code); + const pi = result.nodes.find((n) => n.kind === 'constant' && n.name === 'PI'); + const colors = result.nodes.find((n) => n.kind === 'constant' && n.name === 'COLORS'); + expect(pi).toBeDefined(); + expect(colors).toBeDefined(); + }); + + it('should NOT emit a constant for assignments inside a function body', () => { + const code = `outer <- function() { + x <- 5 + x +}`; + const result = extractFromSource('main.R', code); + const innerVar = result.nodes.find((n) => n.kind === 'constant' && n.name === 'x'); + expect(innerVar).toBeUndefined(); + }); + }); +}); + +// HCL / Terraform Extraction +// ============================================================================= + +describe('HCL / Terraform Extraction', () => { + describe('Language detection', () => { + it('should detect HCL/Terraform files', () => { + expect(detectLanguage('main.tf')).toBe('hcl'); + expect(detectLanguage('terraform.tfvars')).toBe('hcl'); + expect(detectLanguage('config.hcl')).toBe('hcl'); + }); + + it('should report HCL as supported', () => { + expect(isLanguageSupported('hcl')).toBe(true); + expect(getSupportedLanguages()).toContain('hcl'); + }); + }); + + describe('Block extraction', () => { + it('should extract a resource block as a class node', () => { + const code = `resource "aws_s3_bucket" "logs" { bucket = "my-logs" }`; + const result = extractFromSource('main.tf', code); + + const node = result.nodes.find((n) => n.qualifiedName === 'aws_s3_bucket.logs'); + expect(node).toBeDefined(); + expect(node?.kind).toBe('class'); + expect(node?.name).toBe('aws_s3_bucket.logs'); + expect(node?.language).toBe('hcl'); + expect(node?.signature).toBe('resource "aws_s3_bucket" "logs"'); + }); + + it('should extract a data block with `data.` prefix', () => { + const code = `data "aws_caller_identity" "current" {}`; + const result = extractFromSource('main.tf', code); + + const node = result.nodes.find((n) => n.qualifiedName === 'data.aws_caller_identity.current'); + expect(node).toBeDefined(); + expect(node?.kind).toBe('class'); + expect(node?.name).toBe('aws_caller_identity.current'); + }); + + it('should extract a variable block', () => { + const code = `variable "environment" { type = string }`; + const result = extractFromSource('main.tf', code); + + const node = result.nodes.find((n) => n.qualifiedName === 'var.environment'); + expect(node).toBeDefined(); + expect(node?.kind).toBe('variable'); + expect(node?.name).toBe('environment'); + }); + + it('should extract an 
output block as an export', () => { + const code = `output "vpc_id" { value = "abc" }`; + const result = extractFromSource('main.tf', code); + + const node = result.nodes.find((n) => n.qualifiedName === 'output.vpc_id'); + expect(node).toBeDefined(); + expect(node?.kind).toBe('export'); + expect(node?.name).toBe('vpc_id'); + }); + + it('should extract a module block', () => { + const code = `module "vpc" { source = "terraform-aws-modules/vpc/aws" }`; + const result = extractFromSource('main.tf', code); + + const node = result.nodes.find((n) => n.qualifiedName === 'module.vpc'); + expect(node).toBeDefined(); + expect(node?.kind).toBe('module'); + expect(node?.name).toBe('vpc'); + }); + + it('should extract a provider block as namespace', () => { + const code = `provider "aws" { region = "us-east-1" }`; + const result = extractFromSource('main.tf', code); + + const node = result.nodes.find((n) => n.qualifiedName === 'provider.aws'); + expect(node).toBeDefined(); + expect(node?.kind).toBe('namespace'); + }); + + it('should split a locals block into one constant per attribute', () => { + const code = `locals { + bucket_name = "my-bucket" + retention = 30 +}`; + const result = extractFromSource('main.tf', code); + + const bucketName = result.nodes.find((n) => n.qualifiedName === 'local.bucket_name'); + const retention = result.nodes.find((n) => n.qualifiedName === 'local.retention'); + expect(bucketName?.kind).toBe('constant'); + expect(retention?.kind).toBe('constant'); + }); + + it('should connect blocks to the file via contains edges', () => { + const code = `resource "aws_s3_bucket" "logs" {}`; + const result = extractFromSource('main.tf', code); + + const fileNode = result.nodes.find((n) => n.kind === 'file'); + const resourceNode = result.nodes.find((n) => n.qualifiedName === 'aws_s3_bucket.logs'); + expect(fileNode).toBeDefined(); + expect(resourceNode).toBeDefined(); + const containsEdge = result.edges.find( + (e) => e.source === fileNode!.id && e.target === resourceNode!.id && e.kind === 'contains' + ); + expect(containsEdge).toBeDefined(); + }); + }); + + describe('Reference extraction', () => { + it('should extract var.X references', () => { + const code = `resource "aws_s3_bucket" "logs" { bucket = var.bucket_name }`; + const result = extractFromSource('main.tf', code); + + const ref = result.unresolvedReferences.find((r) => r.referenceName === 'var.bucket_name'); + expect(ref).toBeDefined(); + expect(ref?.referenceKind).toBe('references'); + }); + + it('should extract local.X references', () => { + const code = `resource "aws_s3_bucket" "logs" { tags = local.common_tags }`; + const result = extractFromSource('main.tf', code); + + const ref = result.unresolvedReferences.find((r) => r.referenceName === 'local.common_tags'); + expect(ref).toBeDefined(); + }); + + it('should extract module.X references and stop at the module name', () => { + const code = `output "vpc_id" { value = module.vpc.vpc_id }`; + const result = extractFromSource('main.tf', code); + + const ref = result.unresolvedReferences.find((r) => r.referenceName === 'module.vpc'); + expect(ref).toBeDefined(); + // Should NOT emit a reference for the trailing attribute + expect(result.unresolvedReferences.find((r) => r.referenceName === 'module.vpc.vpc_id')).toBeUndefined(); + }); + + it('should extract data.T.N references with both labels', () => { + const code = `output "x" { value = data.aws_caller_identity.current.account_id }`; + const result = extractFromSource('main.tf', code); + + const ref = 
result.unresolvedReferences.find( + (r) => r.referenceName === 'data.aws_caller_identity.current' + ); + expect(ref).toBeDefined(); + }); + + it('should extract resource references as TYPE.NAME', () => { + const code = `resource "aws_s3_bucket_versioning" "v" { bucket = aws_s3_bucket.logs.id }`; + const result = extractFromSource('main.tf', code); + + const ref = result.unresolvedReferences.find((r) => r.referenceName === 'aws_s3_bucket.logs'); + expect(ref).toBeDefined(); + }); + + it('should extract references inside string interpolations', () => { + const code = 'locals { name = "${var.environment}-${random_id.suffix.hex}" }'; + const result = extractFromSource('main.tf', code); + + const names = result.unresolvedReferences.map((r) => r.referenceName); + expect(names).toContain('var.environment'); + expect(names).toContain('random_id.suffix'); + }); + + it('should ignore references to count, each, self, and path', () => { + const code = `resource "aws_instance" "web" { + count = 3 + tags = { Name = "web-\${count.index}", For = each.value, Self = self.id, P = path.module } +}`; + const result = extractFromSource('main.tf', code); + + const names = result.unresolvedReferences.map((r) => r.referenceName); + expect(names.find((n) => n.startsWith('count.'))).toBeUndefined(); + expect(names.find((n) => n.startsWith('each.'))).toBeUndefined(); + expect(names.find((n) => n.startsWith('self.'))).toBeUndefined(); + expect(names.find((n) => n.startsWith('path.'))).toBeUndefined(); + }); + + it('should ignore for-loop iteration variables', () => { + const code = `output "ids" { value = [for s in var.subnets : s.id] }`; + const result = extractFromSource('main.tf', code); + + const names = result.unresolvedReferences.map((r) => r.referenceName); + // var.subnets reference comes through, but `s.id` does NOT + expect(names).toContain('var.subnets'); + expect(names.find((n) => n.startsWith('s.'))).toBeUndefined(); + }); + + it('should ignore key/value bindings in for-object expressions', () => { + const code = `locals { tags = { for k, v in var.input : k => "\${v}-suffix" } }`; + const result = extractFromSource('main.tf', code); + + const names = result.unresolvedReferences.map((r) => r.referenceName); + expect(names).toContain('var.input'); + expect(names.find((n) => n === 'k' || n.startsWith('k.'))).toBeUndefined(); + expect(names.find((n) => n === 'v' || n.startsWith('v.'))).toBeUndefined(); + }); + + it('should emit an imports edge for module source', () => { + const code = `module "vpc" { source = "terraform-aws-modules/vpc/aws" }`; + const result = extractFromSource('main.tf', code); + + const importRef = result.unresolvedReferences.find( + (r) => r.referenceKind === 'imports' && r.referenceName === 'terraform-aws-modules/vpc/aws' + ); + expect(importRef).toBeDefined(); + }); + }); + + describe('Robustness', () => { + it('should handle empty files', () => { + const result = extractFromSource('main.tf', ''); + const fileNode = result.nodes.find((n) => n.kind === 'file'); + expect(fileNode).toBeDefined(); + }); + + it('should handle blocks with no body', () => { + const code = `data "aws_caller_identity" "current" {}`; + const result = extractFromSource('main.tf', code); + expect(result.nodes.find((n) => n.qualifiedName === 'data.aws_caller_identity.current')).toBeDefined(); + }); + + it('should walk nested blocks for references without emitting child nodes', () => { + const code = `resource "aws_s3_bucket_versioning" "v" { + bucket = aws_s3_bucket.logs.id + versioning_configuration { + status = 
var.versioning_status + } +}`; + const result = extractFromSource('main.tf', code); + + // Only one block-level node, plus the file + const blockNodes = result.nodes.filter((n) => n.kind === 'class'); + expect(blockNodes.length).toBe(1); + + // References from the nested block should still be captured + const names = result.unresolvedReferences.map((r) => r.referenceName); + expect(names).toContain('aws_s3_bucket.logs'); + expect(names).toContain('var.versioning_status'); + }); + }); +}); + + +// ============================================================================= +// SQL Extraction +// ============================================================================= + +describe('SQL Extraction', () => { + describe('Language detection', () => { + it('should detect SQL files', () => { + expect(detectLanguage('schema.sql')).toBe('sql'); + expect(detectLanguage('migrations/001.ddl')).toBe('sql'); + expect(detectLanguage('seed.dml')).toBe('sql'); + }); + + it('should report SQL as supported', () => { + expect(isLanguageSupported('sql')).toBe(true); + expect(getSupportedLanguages()).toContain('sql'); + }); + }); + + describe('CREATE TABLE', () => { + it('should extract a table as a class node', () => { + const code = `CREATE TABLE users (id INT PRIMARY KEY, email VARCHAR(255));`; + const result = extractFromSource('schema.sql', code); + const node = result.nodes.find((n) => n.kind === 'class' && n.name === 'users'); + expect(node).toBeDefined(); + expect(node?.signature).toBe('CREATE TABLE users'); + }); + + it('should preserve schema-qualified table names', () => { + const code = `CREATE TABLE reporting.events (id INT);`; + const result = extractFromSource('schema.sql', code); + const node = result.nodes.find((n) => n.kind === 'class' && n.name === 'reporting.events'); + expect(node).toBeDefined(); + }); + + it('should extract inline foreign-key references', () => { + const code = `CREATE TABLE orders (id INT, user_id INT REFERENCES users(id));`; + const result = extractFromSource('schema.sql', code); + const orders = result.nodes.find((n) => n.name === 'orders')!; + const fk = result.unresolvedReferences.find( + (r) => r.fromNodeId === orders.id && r.referenceName === 'users' && r.referenceKind === 'references' + ); + expect(fk).toBeDefined(); + }); + + it('should extract CONSTRAINT-style foreign keys', () => { + const code = `CREATE TABLE orders ( + id INT, + user_id INT, + CONSTRAINT fk_user FOREIGN KEY (user_id) REFERENCES users(id) ON DELETE CASCADE +);`; + const result = extractFromSource('schema.sql', code); + const orders = result.nodes.find((n) => n.name === 'orders')!; + const fk = result.unresolvedReferences.find( + (r) => r.fromNodeId === orders.id && r.referenceName === 'users' + ); + expect(fk).toBeDefined(); + }); + + it('should add a contains edge from the file to each table', () => { + const code = `CREATE TABLE a (id INT); CREATE TABLE b (id INT);`; + const result = extractFromSource('schema.sql', code); + const file = result.nodes.find((n) => n.kind === 'file')!; + const a = result.nodes.find((n) => n.name === 'a')!; + const b = result.nodes.find((n) => n.name === 'b')!; + expect(result.edges).toContainEqual(expect.objectContaining({ source: file.id, target: a.id, kind: 'contains' })); + expect(result.edges).toContainEqual(expect.objectContaining({ source: file.id, target: b.id, kind: 'contains' })); + }); + }); + + describe('CREATE VIEW', () => { + it('should extract a view as a class node', () => { + const code = `CREATE VIEW active_users AS SELECT id FROM users;`; + 
const result = extractFromSource('views.sql', code); + const view = result.nodes.find((n) => n.kind === 'class' && n.name === 'active_users'); + expect(view).toBeDefined(); + }); + + it('should record references to source tables in the view query', () => { + const code = `CREATE VIEW user_orders AS + SELECT u.id, COUNT(o.id) AS n + FROM users u + LEFT JOIN orders o ON o.user_id = u.id;`; + const result = extractFromSource('views.sql', code); + const view = result.nodes.find((n) => n.name === 'user_orders')!; + const refs = result.unresolvedReferences + .filter((r) => r.fromNodeId === view.id) + .map((r) => r.referenceName); + expect(refs).toContain('users'); + expect(refs).toContain('orders'); + }); + + it('should de-duplicate identical references in the same scope', () => { + const code = `CREATE VIEW double_users AS + SELECT * FROM users JOIN users u2 ON u2.id = users.id;`; + const result = extractFromSource('views.sql', code); + const view = result.nodes.find((n) => n.name === 'double_users')!; + const usersRefs = result.unresolvedReferences.filter( + (r) => r.fromNodeId === view.id && r.referenceName === 'users' + ); + expect(usersRefs.length).toBe(1); + }); + + it('should walk into derived-table subqueries to find inner table refs', () => { + const code = `CREATE VIEW v AS + SELECT * FROM (SELECT id FROM users) u JOIN orders o ON o.user_id = u.id;`; + const result = extractFromSource('views.sql', code); + const view = result.nodes.find((n) => n.name === 'v')!; + const refs = result.unresolvedReferences + .filter((r) => r.fromNodeId === view.id) + .map((r) => r.referenceName); + expect(refs).toContain('users'); + expect(refs).toContain('orders'); + }); + }); + + describe('CREATE FUNCTION', () => { + it('should extract a function with signature', () => { + const code = `CREATE FUNCTION add(a INT, b INT) RETURNS INT AS 'SELECT a + b' LANGUAGE SQL;`; + const result = extractFromSource('fns.sql', code); + const fn = result.nodes.find((n) => n.kind === 'function' && n.name === 'add'); + expect(fn).toBeDefined(); + expect(fn?.signature).toContain('(a INT, b INT)'); + }); + + it('should handle CREATE OR REPLACE FUNCTION', () => { + const code = `CREATE OR REPLACE FUNCTION calc(x INT) RETURNS INT AS 'SELECT x * 2' LANGUAGE SQL;`; + const result = extractFromSource('fns.sql', code); + const fn = result.nodes.find((n) => n.kind === 'function' && n.name === 'calc'); + expect(fn).toBeDefined(); + }); + + it('should label a CREATE FUNCTION signature with CREATE FUNCTION', () => { + const code = `CREATE FUNCTION add(a INT) RETURNS INT AS 'SELECT a + 1' LANGUAGE SQL;`; + const result = extractFromSource('fns.sql', code); + const fn = result.nodes.find((n) => n.kind === 'function' && n.name === 'add'); + expect(fn?.signature).toContain('CREATE FUNCTION'); + expect(fn?.signature).not.toContain('CREATE PROCEDURE'); + }); + }); + + describe('CREATE TRIGGER', () => { + it('should extract a trigger with target-table reference and called function', () => { + const code = `CREATE TRIGGER orders_audit +AFTER INSERT ON orders +FOR EACH ROW +EXECUTE FUNCTION audit_orders();`; + const result = extractFromSource('triggers.sql', code); + const trigger = result.nodes.find((n) => n.kind === 'function' && n.name === 'orders_audit'); + expect(trigger).toBeDefined(); + + const refs = result.unresolvedReferences.filter((r) => r.fromNodeId === trigger!.id); + const tableRef = refs.find((r) => r.referenceName === 'orders' && r.referenceKind === 'references'); + const callRef = refs.find((r) => r.referenceName === 
'audit_orders' && r.referenceKind === 'calls'); + expect(tableRef).toBeDefined(); + expect(callRef).toBeDefined(); + }); + + it('should still locate target/function across an UPDATE OF column list', () => { + const code = `CREATE TRIGGER t +BEFORE UPDATE OF col1, col2 ON orders +FOR EACH ROW +EXECUTE FUNCTION audit_cols();`; + const result = extractFromSource('triggers.sql', code); + const trigger = result.nodes.find((n) => n.name === 't')!; + const refs = result.unresolvedReferences.filter((r) => r.fromNodeId === trigger.id); + expect(refs.find((r) => r.referenceName === 'orders' && r.referenceKind === 'references')).toBeDefined(); + expect(refs.find((r) => r.referenceName === 'audit_cols' && r.referenceKind === 'calls')).toBeDefined(); + }); + }); + + describe('CREATE TYPE', () => { + it('should extract an enum type as an enum node', () => { + const code = `CREATE TYPE order_status AS ENUM ('pending', 'shipped', 'cancelled');`; + const result = extractFromSource('types.sql', code); + const node = result.nodes.find((n) => n.name === 'order_status'); + expect(node?.kind).toBe('enum'); + }); + + it('should extract a non-enum CREATE TYPE as a type_alias', () => { + const code = `CREATE TYPE point AS (x FLOAT, y FLOAT);`; + const result = extractFromSource('types.sql', code); + const node = result.nodes.find((n) => n.name === 'point'); + expect(node?.kind).toBe('type_alias'); + }); + }); + + describe('CREATE SCHEMA', () => { + it('should extract a schema as a namespace node', () => { + const code = `CREATE SCHEMA reporting;`; + const result = extractFromSource('schemas.sql', code); + const node = result.nodes.find((n) => n.name === 'reporting'); + expect(node?.kind).toBe('namespace'); + }); + }); + + describe('Robustness', () => { + it('should not error on plain SELECT/INSERT/UPDATE statements', () => { + const code = `SELECT * FROM users; +INSERT INTO orders (id) VALUES (1); +UPDATE users SET email = 'x';`; + const result = extractFromSource('queries.sql', code); + expect(result.errors.filter((e) => e.severity === 'error').length).toBe(0); + const nonFile = result.nodes.filter((n) => n.kind !== 'file'); + expect(nonFile.length).toBe(0); + }); + + it('should not emit nodes for CREATE INDEX', () => { + const code = `CREATE INDEX idx_users_email ON users(email);`; + const result = extractFromSource('idx.sql', code); + const nonFile = result.nodes.filter((n) => n.kind !== 'file'); + expect(nonFile.length).toBe(0); + }); + + it('should handle multiple statements without leaking state', () => { + const code = `CREATE TABLE a (id INT); +CREATE TABLE b (id INT, a_id INT REFERENCES a(id)); +CREATE VIEW c AS SELECT * FROM a JOIN b ON b.a_id = a.id;`; + const result = extractFromSource('multi.sql', code); + const a = result.nodes.find((n) => n.name === 'a'); + const b = result.nodes.find((n) => n.name === 'b'); + const c = result.nodes.find((n) => n.name === 'c'); + expect(a).toBeDefined(); + expect(b).toBeDefined(); + expect(c).toBeDefined(); + + const bRefs = result.unresolvedReferences.filter((r) => r.fromNodeId === b!.id); + const cRefs = result.unresolvedReferences.filter((r) => r.fromNodeId === c!.id); + expect(bRefs.map((r) => r.referenceName)).toContain('a'); + expect(cRefs.map((r) => r.referenceName)).toContain('a'); + expect(cRefs.map((r) => r.referenceName)).toContain('b'); + }); + }); +}); diff --git a/__tests__/foundation.test.ts b/__tests__/foundation.test.ts index 9ee437da..bd6e957d 100644 --- a/__tests__/foundation.test.ts +++ b/__tests__/foundation.test.ts @@ -305,7 +305,7 @@ 
describe('Database Connection', () => { const version = db.getSchemaVersion(); expect(version).not.toBeNull(); - expect(version?.version).toBe(3); + expect(version?.version).toBe(16); db.close(); }); diff --git a/__tests__/index-hooks.test.ts b/__tests__/index-hooks.test.ts new file mode 100644 index 00000000..639587f9 --- /dev/null +++ b/__tests__/index-hooks.test.ts @@ -0,0 +1,130 @@ +/** + * Index-hook framework: register a fake hook at runtime, run an + * indexAll/sync against a synthetic project, assert the hook ran + * with the expected context shape and that errors are caught. + * + * The registry's static-import list (`REGISTERED_HOOKS`) is empty + * on main today; tests poke at the runner directly through + * `runAfterIndexAll`/`runAfterSync` rather than mutating that + * list. + */ +import { describe, it, expect } from 'vitest'; +import { + runAfterIndexAll, + runAfterSync, + getRegisteredHooks, + type IndexHook, + type IndexHookContext, +} from '../src/index-hooks/registry'; +import type { SyncResult } from '../src/extraction'; + +function makeFakeContext(): IndexHookContext { + // Hooks should not mutate the context; for the runner-shape + // tests we hand them stubs typed `as any` — the runner doesn't + // touch any of these fields itself. + return { + projectRoot: '/tmp/fake-project', + /* eslint-disable @typescript-eslint/no-explicit-any */ + config: {} as any, + queries: {} as any, + db: {} as any, + /* eslint-enable */ + }; +} + +const fakeSyncResult: SyncResult = { + filesChecked: 0, + filesAdded: 0, + filesModified: 0, + filesRemoved: 0, + nodesUpdated: 0, + durationMs: 0, +}; + +describe('index-hooks registry — runner', () => { + it('registered hooks expose stable {name, afterIndexAll|afterSync} shape', () => { + const hooks = getRegisteredHooks(); + expect(hooks.length).toBeGreaterThanOrEqual(0); + for (const h of hooks) { + expect(typeof h.name).toBe('string'); + expect(h.afterIndexAll === undefined || typeof h.afterIndexAll === 'function').toBe(true); + expect(h.afterSync === undefined || typeof h.afterSync === 'function').toBe(true); + } + }); + + it('runAfterIndexAll returns one outcome per registered hook, swallowing per-hook errors', async () => { + // Registered hooks will throw on the fake `{} as any` ctx; the + // runner contract is to catch + report each error so one bad + // hook never fails the whole pass. + const outcomes = await runAfterIndexAll(makeFakeContext()); + const expectedCount = getRegisteredHooks().filter((h) => h.afterIndexAll).length; + expect(outcomes.length).toBe(expectedCount); + for (const o of outcomes) { + expect(typeof o.name).toBe('string'); + expect(o.phase).toBe('indexAll'); + expect(typeof o.durationMs).toBe('number'); + } + }); + + it('runAfterSync returns one outcome per registered hook, swallowing per-hook errors', async () => { + const outcomes = await runAfterSync(makeFakeContext(), fakeSyncResult); + const expectedCount = getRegisteredHooks().filter((h) => h.afterSync).length; + expect(outcomes.length).toBe(expectedCount); + for (const o of outcomes) { + expect(typeof o.name).toBe('string'); + expect(o.phase).toBe('sync'); + expect(typeof o.durationMs).toBe('number'); + } + }); +}); + +describe('index-hooks runner — fake-hook injection', () => { + // Helper: temporarily inject a fake hook by wrapping the runner + // directly. The runner accepts no array argument today; this + // suite exercises the public surface (runAfterIndexAll / + // runAfterSync) by simulating what a registered hook would do. 
+ // When real hooks land, REGISTERED_HOOKS in registry.ts will + // contain them and this fixture-style approach disappears. + + it('a hook with afterIndexAll receives the context and is awaited', async () => { + // Build a one-off hook and call it directly — the runner's + // contract is "for each registered hook, await afterIndexAll + // if defined." We exercise that contract by calling the hook + // ourselves to confirm the IndexHookContext shape stays usable + // by hook implementations. + let captured: IndexHookContext | null = null; + const hook: IndexHook = { + name: 'fake-hook', + async afterIndexAll(ctx) { + captured = ctx; + }, + }; + const ctx = makeFakeContext(); + await hook.afterIndexAll!(ctx); + expect(captured).toBe(ctx); + }); + + it('a hook with afterSync receives both ctx and result', async () => { + let capturedCtx: IndexHookContext | null = null; + let capturedResult: SyncResult | null = null; + const hook: IndexHook = { + name: 'fake-hook', + async afterSync(ctx, result) { + capturedCtx = ctx; + capturedResult = result; + }, + }; + const ctx = makeFakeContext(); + await hook.afterSync!(ctx, fakeSyncResult); + expect(capturedCtx).toBe(ctx); + expect(capturedResult).toBe(fakeSyncResult); + }); + + it('a hook missing afterIndexAll is silently skipped', () => { + // Just a typing assertion: an IndexHook without afterIndexAll + // is allowed (both methods are optional). + const hook: IndexHook = { name: 'sync-only' }; + expect(hook.afterIndexAll).toBeUndefined(); + expect(hook.afterSync).toBeUndefined(); + }); +}); diff --git a/__tests__/issue-history.test.ts b/__tests__/issue-history.test.ts new file mode 100644 index 00000000..7c281771 --- /dev/null +++ b/__tests__/issue-history.test.ts @@ -0,0 +1,390 @@ +/** + * Issue → symbol attribution: parser unit tests + end-to-end mining + * against synthetic git repos. 
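+ * Suites that need a real repository probe for a working `git` binary
+ * first and are skipped via describe.skipIf when it is unavailable.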
+ */
+import { describe, it, expect, beforeEach, afterEach } from 'vitest';
+import * as fs from 'fs';
+import * as os from 'os';
+import * as path from 'path';
+import { execFileSync } from 'child_process';
+import {
+  extractSymbolFromContext,
+  extractDeclaration,
+} from '../src/issue-history/parse-diff';
+import {
+  mineIssueCommits,
+  mineIssueHistory,
+  ISSUE_REGEX,
+  LAST_MINED_ISSUES_HEAD_KEY,
+} from '../src/issue-history';
+import CodeGraph from '../src/index';
+
+let HAS_GIT = true;
+try {
+  execFileSync('git', ['--version'], { stdio: 'ignore' });
+} catch {
+  HAS_GIT = false;
+}
+
+let testDir: string;
+let cg: CodeGraph | null = null;
+
+function git(...args: string[]): string {
+  return execFileSync('git', args, {
+    cwd: testDir,
+    encoding: 'utf-8',
+    env: {
+      ...process.env,
+      GIT_AUTHOR_NAME: 'Test',
+      GIT_AUTHOR_EMAIL: 'test@example.com',
+      GIT_COMMITTER_NAME: 'Test',
+      GIT_COMMITTER_EMAIL: 'test@example.com',
+      GIT_AUTHOR_DATE: process.env.GIT_AUTHOR_DATE,
+      GIT_COMMITTER_DATE: process.env.GIT_COMMITTER_DATE,
+    },
+    stdio: ['pipe', 'pipe', 'pipe'],
+  }).trim();
+}
+
+function commitAt(date: string, files: Record<string, string>, message: string) {
+  for (const [rel, content] of Object.entries(files)) {
+    const abs = path.join(testDir, rel);
+    fs.mkdirSync(path.dirname(abs), { recursive: true });
+    fs.writeFileSync(abs, content);
+  }
+  git('add', '-A');
+  process.env.GIT_AUTHOR_DATE = date;
+  process.env.GIT_COMMITTER_DATE = date;
+  git('commit', '-m', message);
+  delete process.env.GIT_AUTHOR_DATE;
+  delete process.env.GIT_COMMITTER_DATE;
+}
+
+beforeEach(() => {
+  testDir = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-issues-'));
+});
+
+afterEach(() => {
+  delete process.env.GIT_AUTHOR_DATE;
+  delete process.env.GIT_COMMITTER_DATE;
+  if (cg) {
+    cg.destroy();
+    cg = null;
+  }
+  if (fs.existsSync(testDir)) fs.rmSync(testDir, { recursive: true, force: true });
+});
+
+// ============================================================================
+// Pure parser unit tests
+// ============================================================================
+
+describe('ISSUE_REGEX', () => {
+  it('matches all canonical Fixes/Closes/Resolves verbs', () => {
+    const cases = [
+      'Fix #1', 'Fixes #2', 'Fixed #3',
+      'Close #4', 'Closes #5', 'Closed #6',
+      'Resolve #7', 'Resolves #8', 'Resolved #9',
+    ];
+    for (const s of cases) {
+      ISSUE_REGEX.lastIndex = 0;
+      expect(ISSUE_REGEX.test(s)).toBe(true);
+    }
+  });
+
+  it('matches multiple issues in a single body', () => {
+    ISSUE_REGEX.lastIndex = 0;
+    const matches = [...'Fixes #1, closes #2 and resolves #3'.matchAll(ISSUE_REGEX)];
+    expect(matches.map((m) => m[1])).toEqual(['1', '2', '3']);
+  });
+
+  it('is case-insensitive', () => {
+    ISSUE_REGEX.lastIndex = 0;
+    expect(ISSUE_REGEX.test('FIXES #42')).toBe(true);
+  });
+
+  it('does NOT match `#N` without a verb', () => {
+    ISSUE_REGEX.lastIndex = 0;
+    // A message body that mentions #99 without a verb prefix must not match.
+    expect(ISSUE_REGEX.test('See #99 for context')).toBe(false);
+  });
+
+  it('v1 limitation: `Fixes #1, #2` only captures #1', () => {
+    // Documented behavior — the second issue lacks a verb prefix and
+    // is silently dropped. Authors who care can write `Fixes #1, fixes #2`.
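+    // (ISSUE_REGEX is a /g regex; String.prototype.matchAll requires the
+    // global flag, hence the lastIndex reset before every use in this file.)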
+ ISSUE_REGEX.lastIndex = 0; + const matches = [...'Fixes #1, #2'.matchAll(ISSUE_REGEX)]; + expect(matches.map((m) => m[1])).toEqual(['1']); + }); +}); + +describe('extractSymbolFromContext', () => { + it('pulls function name from a TS function context', () => { + expect(extractSymbolFromContext('function processOrder(order: Order) {')).toBe('processOrder'); + }); + it('pulls class name', () => { + expect(extractSymbolFromContext('class UserService {')).toBe('UserService'); + }); + it('pulls Python def', () => { + expect(extractSymbolFromContext('def compute_score(items):')).toBe('compute_score'); + }); + it('pulls Go func', () => { + expect(extractSymbolFromContext('func ProcessOrder(o *Order) error {')).toBe('ProcessOrder'); + }); + it('pulls method-style ` async foo(`', () => { + expect(extractSymbolFromContext(' async foo(args: string) {')).toBe('foo'); + }); + it('rejects keyword-only contexts', () => { + expect(extractSymbolFromContext(' if (x) {')).toBeNull(); + }); + it('returns null on empty input', () => { + expect(extractSymbolFromContext('')).toBeNull(); + }); +}); + +describe('extractDeclaration', () => { + it('captures + function decl', () => { + expect(extractDeclaration('+function helper() {')).toEqual({ name: 'helper', sign: '+' }); + }); + it('captures - class decl', () => { + expect(extractDeclaration('-export class Old {')).toEqual({ name: 'Old', sign: '-' }); + }); + it('captures Python def', () => { + expect(extractDeclaration('+def my_helper(x):')).toEqual({ name: 'my_helper', sign: '+' }); + }); + it('captures Go func with receiver', () => { + expect(extractDeclaration('+func (s *Service) DoThing() error {')).toEqual({ + name: 'DoThing', + sign: '+', + }); + }); + it('skips file-marker `+++` and `---` lines', () => { + expect(extractDeclaration('+++ b/src/foo.ts')).toBeNull(); + expect(extractDeclaration('--- a/src/foo.ts')).toBeNull(); + }); + it('skips keywords like `+if`', () => { + expect(extractDeclaration('+ if (x) return;')).toBeNull(); + }); + it('returns null on context lines (no +/-)', () => { + expect(extractDeclaration(' some body line')).toBeNull(); + }); +}); + +// ============================================================================ +// Git mining: synthetic repo +// ============================================================================ + +describe.skipIf(!HAS_GIT)('mineIssueCommits', () => { + beforeEach(() => { + git('init', '-q', '-b', 'main'); + git('config', 'commit.gpgsign', 'false'); + }); + + it('finds commits with `Fixes #N` in the subject', () => { + commitAt('2025-01-01T00:00:00Z', { 'a.ts': 'a' }, 'feat: add a (no issue)'); + commitAt('2025-01-02T00:00:00Z', { 'a.ts': 'a2' }, 'fix: bug. Fixes #42'); + const commits = mineIssueCommits(testDir, null); + expect(commits.length).toBe(1); + expect(commits[0]!.issues).toEqual([42]); + }); + + it('parses multi-issue subjects', () => { + commitAt('2025-01-01T00:00:00Z', { 'a.ts': 'a' }, 'fix: triple. 
Fixes #1, closes #2, resolves #3'); + const [c] = mineIssueCommits(testDir, null); + expect(c?.issues).toEqual([1, 2, 3]); + }); + + it('ignores commits with no issue ref', () => { + commitAt('2025-01-01T00:00:00Z', { 'a.ts': 'a' }, 'plain message'); + expect(mineIssueCommits(testDir, null).length).toBe(0); + }); + + it('returns [] when not in a git repo', () => { + const nonGit = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-nogit-')); + try { + expect(mineIssueCommits(nonGit, null)).toEqual([]); + } finally { + fs.rmSync(nonGit, { recursive: true, force: true }); + } + }); +}); + +// ============================================================================ +// End-to-end through CodeGraph +// ============================================================================ + +describe.skipIf(!HAS_GIT)('CodeGraph issue history', () => { + beforeEach(() => { + git('init', '-q', '-b', 'main'); + git('config', 'commit.gpgsign', 'false'); + }); + + it('attributes a Fixes #N commit to the modified function', async () => { + commitAt('2025-01-01T00:00:00Z', { + 'src/a.ts': `export function foo() { return 1; }\n`, + }, 'feat: add foo'); + + commitAt('2025-02-01T00:00:00Z', { + 'src/a.ts': `export function foo() {\n // changed\n return 2;\n}\n`, + }, 'fix: bug. Fixes #42'); + + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + + const node = cg.getNodesInFile('src/a.ts').find((n) => n.name === 'foo')!; + expect(node).toBeDefined(); + const issues = cg.getIssuesForNode(node.id); + expect(issues.length).toBeGreaterThan(0); + expect(issues.some((i) => i.issueNumber === 42)).toBe(true); +}); + + it('tracks the agent-usable multi-issue signal', async () => { + // Simulate the codegraph history pattern: `loadGrammarsForLanguages` + // touched by every language-add issue (#54, #82, #83, #85). + commitAt('2025-01-01T00:00:00Z', { + 'src/grammar.ts': `export function loadGrammarsForLanguages() { return []; }\n`, + }, 'feat: add grammar loader'); + + commitAt('2025-01-02T00:00:00Z', { + 'src/grammar.ts': `export function loadGrammarsForLanguages() {\n // R support\n return [];\n}\n`, + }, 'feat: add R support. Fixes #82'); + + commitAt('2025-01-03T00:00:00Z', { + 'src/grammar.ts': `export function loadGrammarsForLanguages() {\n // R + HCL support\n return [];\n}\n`, + }, 'feat: add HCL. Fixes #83'); + + commitAt('2025-01-04T00:00:00Z', { + 'src/grammar.ts': `export function loadGrammarsForLanguages() {\n // R + HCL + SQL\n return [];\n}\n`, + }, 'feat: add SQL. Fixes #85'); + + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + + const node = cg.getNodesByKind("function").find((n) => n.name === 'loadGrammarsForLanguages')!; + expect(node).toBeDefined(); + const issues = cg.getIssuesForNode(node.id); + const issueNumbers = [...new Set(issues.map((i) => i.issueNumber))].sort((a, b) => a - b); + expect(issueNumbers).toEqual([82, 83, 85]); + }); + + it('records `added` kind for symbols introduced in a Fixes commit', async () => { + commitAt('2025-01-01T00:00:00Z', { + 'src/a.ts': `export function existing() { return 1; }\n`, + }, 'init'); + + commitAt('2025-02-01T00:00:00Z', { + 'src/a.ts': `export function existing() { return 1; }\nexport function brandNew() { return 2; }\n`, + }, 'feat: add brandNew. 
Fixes #100'); + + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + + const node = cg.getNodesByKind("function").find((n) => n.name === 'brandNew')!; + const issues = cg.getIssuesForNode(node.id); + expect(issues.some((i) => i.issueNumber === 100 && i.kind === 'added')).toBe(true); + }); + + it('drops attributions for symbols that no longer exist', async () => { + // Symbol added then removed in two separate `Fixes` commits. The + // current index has no node for it, so attributions for the removed + // symbol must not appear (FK + drop-on-resolve). + commitAt('2025-01-01T00:00:00Z', { + 'src/a.ts': `export function staysHere() { return 1; }\nexport function temporary() { return 99; }\n`, + }, 'feat: add. Fixes #1'); + + commitAt('2025-02-01T00:00:00Z', { + 'src/a.ts': `export function staysHere() { return 1; }\n`, + }, 'fix: drop temporary. Fixes #2'); + + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + + // staysHere should have at least the #1 attribution (added). + const node = cg.getNodesByKind("function").find((n) => n.name === 'staysHere')!; + const issues = cg.getIssuesForNode(node.id); + expect(issues.some((i) => i.issueNumber === 1)).toBe(true); + + // No node should exist named `temporary`, and no attribution to + // issue #2 should reference a node that doesn't exist. + expect(cg.getNodesByKind("function").find((n) => n.name === 'temporary')).toBeUndefined(); + }); + + it('survives indexAll outside a git repo (table empty, no errors)', async () => { + fs.rmSync(path.join(testDir, '.git'), { recursive: true, force: true }); + fs.writeFileSync(path.join(testDir, 'a.ts'), `export function x() { return 1; }\n`); + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + const nodes = cg.getNodesInFile('a.ts'); + expect(nodes.length).toBeGreaterThan(0); + for (const n of nodes) expect(cg.getIssuesForNode(n.id)).toEqual([]); + }); + + it('respects enableIssueHistory=false', async () => { + commitAt('2025-01-01T00:00:00Z', { + 'src/a.ts': `export function foo() { return 1; }\n`, + }, 'init'); + commitAt('2025-01-02T00:00:00Z', { + 'src/a.ts': `export function foo() { return 2; }\n`, + }, 'fix: foo. Fixes #1'); + + cg = CodeGraph.initSync(testDir, { + config: { include: ['**/*.ts'], exclude: [], enableIssueHistory: false }, + }); + await cg.indexAll(); + const node = cg.getNodesInFile('src/a.ts').find((n) => n.name === 'foo')!; + expect(cg.getIssuesForNode(node.id)).toEqual([]); + }); + + it('incrementally picks up new Fixes commits on sync', async () => { + commitAt('2025-01-01T00:00:00Z', { + 'src/a.ts': `export function foo() { return 1; }\n`, + }, 'init'); + + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + const node = cg.getNodesInFile('src/a.ts').find((n) => n.name === 'foo')!; + expect(cg.getIssuesForNode(node.id).length).toBe(0); + + commitAt('2025-02-01T00:00:00Z', { + 'src/a.ts': `export function foo() { return 2; }\n`, + }, 'fix: foo. Fixes #50'); + await cg.sync(); + + const issues = cg.getIssuesForNode(node.id); + expect(issues.some((i) => i.issueNumber === 50)).toBe(true); + }); + + // (Removed: a defensive test for the v4-migration-collision bug class. 
+ // With file-based migrations (NNN-name.ts), two migrations claiming + // the same version produces a filesystem-level conflict — the silent + // skip the defensive guard protected against can no longer happen.) + + it('recovers from an unreachable last_mined_issues_head', async () => { + commitAt('2025-01-01T00:00:00Z', { + 'src/a.ts': `export function foo() { return 1; }\n`, + }, 'init'); + commitAt('2025-02-01T00:00:00Z', { + 'src/a.ts': `export function foo() { return 2; }\n`, + }, 'fix: foo. Fixes #1'); + + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + const node = cg.getNodesInFile('src/a.ts').find((n) => n.name === 'foo')!; + expect( + [...new Set(cg.getIssuesForNode(node.id).map((i) => i.issueNumber))] + ).toEqual([1]); + + // Simulate force-push / gc by storing an unreachable SHA. + // eslint-disable-next-line @typescript-eslint/no-explicit-any + (cg as any).queries.setMetadata(LAST_MINED_ISSUES_HEAD_KEY, '0'.repeat(40)); + + commitAt('2025-03-01T00:00:00Z', { + 'src/a.ts': `export function foo() { return 3; }\n`, + }, 'fix: foo again. Fixes #2'); + await cg.sync(); + + const issueNums = [ + ...new Set(cg.getIssuesForNode(node.id).map((i) => i.issueNumber)), + ].sort((a, b) => a - b); + expect(issueNums).toEqual([1, 2]); + }); +}); diff --git a/__tests__/language-registry.test.ts b/__tests__/language-registry.test.ts new file mode 100644 index 00000000..9afdd59a --- /dev/null +++ b/__tests__/language-registry.test.ts @@ -0,0 +1,157 @@ +/** + * Language registry: structural invariants. + * + * These tests guard against the "parallel list" failure mode that + * the registry refactor exists to prevent. If a future PR adds a + * grammar-backed language but forgets to wire it through one of + * the derived consumers, one of these tests should catch it. + */ +import { describe, it, expect } from 'vitest'; +import { + getLanguageDefs, + getLanguageDefByExtension, + getLanguageDefByName, +} from '../src/extraction/languages/registry'; +import { EXTRACTORS } from '../src/extraction/languages'; +import { + detectLanguage, + isLanguageSupported, + getSupportedLanguages, + getLanguageDisplayName, + EXTENSION_MAP, +} from '../src/extraction/grammars'; + +describe('language registry — single source of truth', () => { + it('has at least the original 19 languages', () => { + const defs = getLanguageDefs(); + expect(defs.length).toBeGreaterThanOrEqual(19); + }); + + it('every def has unique non-empty name', () => { + const names = new Set(); + for (const def of getLanguageDefs()) { + expect(def.name).toBeTruthy(); + expect(names.has(def.name)).toBe(false); + names.add(def.name); + } + }); + + it('extensions are unique across registry (one ext maps to one language)', () => { + const seen = new Map(); + for (const def of getLanguageDefs()) { + for (const ext of def.extensions) { + const lower = ext.toLowerCase(); + if (seen.has(lower)) { + // The .h ambiguity (C vs C++) is intentionally pinned to C + // by the registry; tree-sitter.ts has a content-sniff + // override. Anything else duplicating extensions is a bug. 
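+          // For example, a second def claiming '.ts' while typescript
+          // already owns it should fail right here.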
+          throw new Error(
+            `Extension ${lower} mapped twice: ${seen.get(lower)} and ${def.name}`
+          );
+        }
+        seen.set(lower, def.name);
+      }
+    }
+  });
+
+  it('grammar-backed defs have wasmFile + extractor', () => {
+    for (const def of getLanguageDefs()) {
+      if (!def.grammar) continue;
+      expect(def.grammar.wasmFile).toMatch(/^tree-sitter-.+\.wasm$/);
+      expect(def.grammar.extractor).toBeDefined();
+    }
+  });
+
+  it('custom-extractor defs have a customExtractor function', () => {
+    for (const def of getLanguageDefs()) {
+      if (def.grammar) continue; // grammar-backed
+      expect(def.customExtractor).toBeInstanceOf(Function);
+    }
+  });
+});
+
+describe('derived consumers stay in sync with the registry', () => {
+  // Catch the "parallel list drift" bug that motivated this refactor.
+  // If a new language gets added to the registry but a derived consumer
+  // still hard-codes the old set, one of these will fail.
+
+  it('EXTRACTORS contains exactly the grammar-backed languages', () => {
+    const grammarBacked = getLanguageDefs()
+      .filter((d) => d.grammar)
+      .map((d) => d.name)
+      .sort();
+    const extractorKeys = Object.keys(EXTRACTORS).sort();
+    expect(extractorKeys).toEqual(grammarBacked);
+  });
+
+  it('every grammar-backed extractor matches def.grammar.extractor exactly', () => {
+    for (const def of getLanguageDefs()) {
+      if (!def.grammar) continue;
+      expect(EXTRACTORS[def.name as keyof typeof EXTRACTORS]).toBe(def.grammar.extractor);
+    }
+  });
+
+  it('EXTENSION_MAP entries exactly mirror registry extensions', () => {
+    const expected = new Map<string, string>();
+    for (const def of getLanguageDefs()) {
+      for (const ext of def.extensions) {
+        expected.set(ext.toLowerCase(), def.name);
+      }
+    }
+    for (const [ext, lang] of expected) {
+      expect(EXTENSION_MAP[ext]).toBe(lang);
+    }
+    // Reverse: no extra keys in EXTENSION_MAP.
+    expect(Object.keys(EXTENSION_MAP).sort()).toEqual([...expected.keys()].sort());
+  });
+
+  it('detectLanguage returns the expected name for every registered extension', () => {
+    for (const def of getLanguageDefs()) {
+      for (const ext of def.extensions) {
+        // .h is pinned to C by the registry; the C++ heuristic only
+        // applies when source is provided AND looks like C++.
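+        // e.g. detectLanguage('x.py') resolves to 'python', and
+        // detectLanguage('x.h') to 'c' per the pin above.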
+ expect(detectLanguage(`x${ext}`)).toBe(def.name); + } + } + }); + + it('isLanguageSupported returns true for every registered language and false for unknown', () => { + for (const def of getLanguageDefs()) { + expect(isLanguageSupported(def.name as never)).toBe(true); + } + expect(isLanguageSupported('unknown' as never)).toBe(false); + }); + + it('getSupportedLanguages returns exactly the registry names', () => { + const fromRegistry = getLanguageDefs().map((d) => d.name).sort(); + const supported = (getSupportedLanguages() as string[]).sort(); + expect(supported).toEqual(fromRegistry); + }); + + it('getLanguageDisplayName uses each defs displayName', () => { + for (const def of getLanguageDefs()) { + expect(getLanguageDisplayName(def.name as never)).toBe(def.displayName); + } + }); +}); + +describe('lookup helpers', () => { + it('getLanguageDefByName returns the def for a registered name', () => { + expect(getLanguageDefByName('typescript')?.displayName).toBe('TypeScript'); + }); + + it('getLanguageDefByName returns undefined for unknown names', () => { + expect(getLanguageDefByName('nonexistent-language-name')).toBeUndefined(); + }); + + it('getLanguageDefByExtension is case-insensitive', () => { + expect(getLanguageDefByExtension('.TS')?.name).toBe('typescript'); + expect(getLanguageDefByExtension('.ts')?.name).toBe('typescript'); + }); + + it('Pascal extensionOverrides routes .dfm and .fmx to a customExtractor', () => { + const def = getLanguageDefByName('pascal'); + expect(def?.extensionOverrides?.['.dfm']?.customExtractor).toBeInstanceOf(Function); + expect(def?.extensionOverrides?.['.fmx']?.customExtractor).toBeInstanceOf(Function); + }); +}); diff --git a/__tests__/llm-tiers.test.ts b/__tests__/llm-tiers.test.ts new file mode 100644 index 00000000..2fe588b5 --- /dev/null +++ b/__tests__/llm-tiers.test.ts @@ -0,0 +1,381 @@ +/** + * Tier 1 #3, Tier 2 #4/#5, Tier 3 #7/#8: directory summaries, role + * classifier, change-intent, dead-code judge, naming drift. + * + * Same in-process fake-Ollama pattern as llm.test.ts. The fake's + * chat handler returns deterministic JSON for the prompts that + * expect it (classifier, dead-code, naming) so we can assert ordering + * and parsing. + */ + +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; +import * as http from 'http'; +import { AddressInfo } from 'net'; +import { CodeGraph } from '../src'; + +interface FakeServer { + url: string; + chatCalls: number; + /** Lets a test override the next chat response. 
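+   * Set once per test; the fake consumes and clears it on the next
+   * /chat/completions call.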
+   */
+  nextChatText: string | null;
+  close: () => Promise<void>;
+}
+
+async function startFake(): Promise<FakeServer> {
+  const state: { chatCalls: number; nextChatText: string | null } = {
+    chatCalls: 0,
+    nextChatText: null,
+  };
+  const server = http.createServer((req, res) => {
+    let body = '';
+    req.on('data', (c) => (body += c));
+    req.on('end', () => {
+      res.setHeader('content-type', 'application/json');
+      if (req.url?.endsWith('/models') || req.url === '/models') {
+        res.end(JSON.stringify({ data: [{ id: 'qwen2.5-coder:7b' }] }));
+        return;
+      }
+      if (req.url?.endsWith('/chat/completions')) {
+        state.chatCalls++;
+        const parsed = JSON.parse(body) as { messages: Array<{ content: string }> };
+        const userText = parsed.messages?.[0]?.content || '';
+        let text: string;
+        if (state.nextChatText !== null) {
+          text = state.nextChatText;
+          state.nextChatText = null;
+        } else if (userText.includes('Reply with EXACTLY one JSON object')) {
+          // Could be classifier-style or judge-style; default to a
+          // benign verdict object that satisfies dead-code parsing.
+          if (userText.includes('"verdict"')) {
+            text = '{"verdict": "uncertain", "confidence": 0.5, "reason": "test stub"}';
+          } else if (userText.includes('"consistent"')) {
+            text = '{"consistent": true, "suggestion": "", "reason": "test stub"}';
+          } else {
+            text = 'unknown';
+          }
+        } else if (userText.includes('Classify the following code symbol')) {
+          text = 'business_logic';
+        } else if (userText.includes('Module summary:')) {
+          text = 'Coordinates a small module that does test things.';
+        } else {
+          text = 'Test stub summary line.';
+        }
+        res.end(
+          JSON.stringify({
+            choices: [{ message: { role: 'assistant', content: text } }],
+          })
+        );
+        return;
+      }
+      if (req.url?.endsWith('/embeddings')) {
+        const parsed = JSON.parse(body) as { input: string[] };
+        const fake = (s: string): number[] => {
+          const v = new Array(8).fill(0);
+          for (let i = 0; i < s.length; i++) v[i % 8] += s.charCodeAt(i) % 11;
+          return v;
+        };
+        res.end(
+          JSON.stringify({ data: parsed.input.map((s) => ({ embedding: fake(s) })) })
+        );
+        return;
+      }
+      res.statusCode = 404;
+      res.end();
+    });
+  });
+  await new Promise<void>((r) => server.listen(0, '127.0.0.1', r));
+  const addr = server.address() as AddressInfo;
+  return {
+    url: `http://127.0.0.1:${addr.port}/v1`,
+    get chatCalls() {
+      return state.chatCalls;
+    },
+    set nextChatText(v: string | null) {
+      state.nextChatText = v;
+    },
+    get nextChatText() {
+      return state.nextChatText;
+    },
+    close: () =>
+      new Promise<void>((resolve, reject) =>
+        server.close((err) => (err ? reject(err) : resolve()))
+      ),
+  };
+}
+
+describe('Tier extensions', () => {
+  let tempDir: string;
+  let fake: FakeServer;
+
+  beforeEach(async () => {
+    tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-tiers-'));
+    fake = await startFake();
+    // Two files in two different dirs to give the directory summarizer
+    // and naming-drift checker enough siblings to be meaningful.
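+    // token.ts carries two functions plus a class, helpers.ts three
+    // functions; both clear the 3-symbol directory-summary threshold
+    // asserted further down.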
+ fs.mkdirSync(path.join(tempDir, 'src', 'auth'), { recursive: true }); + fs.mkdirSync(path.join(tempDir, 'src', 'util'), { recursive: true }); + fs.writeFileSync( + path.join(tempDir, 'src', 'auth', 'token.ts'), + `export function createToken(user: string): string { + const payload = { user }; + const sig = 'fake'; + return JSON.stringify(payload) + sig; +} + +export function verifyToken(token: string): boolean { + const valid = token.length > 0; + const checked = true; + return valid && checked; +} + +export class TokenStore { + private bag: Map = new Map(); + put(k: string, v: string): void { this.bag.set(k, v); } + get(k: string): string | undefined { return this.bag.get(k); } + size(): number { return this.bag.size; } +} +` + ); + fs.writeFileSync( + path.join(tempDir, 'src', 'util', 'helpers.ts'), + `export function formatDate(d: Date): string { + const y = d.getFullYear(); + const m = d.getMonth(); + return y + '-' + m; +} + +export function clamp(n: number, min: number, max: number): number { + if (n < min) return min; + if (n > max) return max; + return n; +} + +export function debounce(fn: () => void, ms: number): () => void { + let t: ReturnType | undefined; + return () => { + if (t) clearTimeout(t); + t = setTimeout(fn, ms); + }; +} +` + ); + }); + + afterEach(async () => { + await fake.close(); + fs.rmSync(tempDir, { recursive: true, force: true }); + }); + + it('directory summary text round-trips correctly (column-order regression)', async () => { + // The fake server returns "Coordinates a small module..." for the + // dir-summarizer prompt. If the SQL bind order is wrong we'd see a + // hex content_hash come back instead of that paragraph. + const cg = await CodeGraph.init(tempDir, { + config: { + llm: { endpoint: fake.url, chatModel: 'qwen2.5-coder:7b' }, + }, + }); + try { + await cg.indexAll(); + await cg.awaitBackgroundSummarization(); + + const all = cg.getAllDirectorySummaries(); + expect(all.length).toBeGreaterThan(0); + for (const { summary } of all) { + // Summaries must be prose, not 32-char hex (which would be + // a content_hash bleeding into the wrong column). + expect(summary).not.toMatch(/^[0-9a-f]{32}$/); + expect(summary.length).toBeGreaterThan(20); + } + } finally { + cg.close(); + } + }); + + it('background pass writes directory summaries and role labels', async () => { + const cg = await CodeGraph.init(tempDir, { + config: { + llm: { + endpoint: fake.url, + chatModel: 'qwen2.5-coder:7b', + embeddingModel: 'fake-embed', + }, + }, + }); + try { + await cg.indexAll(); + await cg.awaitBackgroundSummarization(); + + // Directory summaries: at least one of the two source dirs + // should have one (3+ symbol threshold). + const dirs = cg.getAllDirectorySummaries(); + expect(dirs.length).toBeGreaterThan(0); + + // Role classification: every summarised symbol should have a + // role assigned (classifier returns "business_logic" for our + // fake responses). 
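+      // getRoleCounts() yields a role → count map; summing its values
+      // counts every classified symbol.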
+ const counts = cg.getRoleCounts(); + expect([...counts.values()].reduce((a, b) => a + b, 0)).toBeGreaterThan(0); + + // findNodesByRole returns the matching nodes + const businessLogic = cg.findNodesByRole('business_logic', 100); + expect(businessLogic.length).toBeGreaterThan(0); + } finally { + cg.close(); + } + }); + + it('summarizeChange honors before-only and after-only modes', async () => { + const cg = await CodeGraph.init(tempDir, { + config: { llm: { endpoint: fake.url, chatModel: 'qwen2.5-coder:7b' } }, + }); + try { + const added = await cg.summarizeChange( + 'newFn', + 'function', + '', + 'function newFn() { return 1; }' + ); + expect(added.intent.length).toBeGreaterThan(0); + + const removed = await cg.summarizeChange( + 'oldFn', + 'function', + 'function oldFn() { return 1; }', + '' + ); + expect(removed.intent.length).toBeGreaterThan(0); + } finally { + cg.close(); + } + }); + + it('findDeadCodeCandidates returns parsed verdicts', async () => { + const cg = await CodeGraph.init(tempDir, { + config: { llm: { endpoint: fake.url, chatModel: 'qwen2.5-coder:7b' } }, + }); + try { + await cg.indexAll({ summarize: false }); + const result = await cg.findDeadCodeCandidates({ maxCandidates: 5 }); + // Real assertions: judged ≤ candidates, no errors on the fake + // server, and every verdict carries a parsed confidence in + // [0, 1] from one of the three known labels. + expect(result.candidates).toBeGreaterThanOrEqual(result.judged); + expect(result.errors).toBe(0); + for (const r of result.results) { + expect(['dead', 'live', 'uncertain']).toContain(r.verdict); + expect(r.confidence).toBeGreaterThanOrEqual(0); + expect(r.confidence).toBeLessThanOrEqual(1); + } + } finally { + cg.close(); + } + }); + + it('parseRole accepts canonical, fenced, multi-word, and trailing-punct inputs', async () => { + const { parseRole } = await import('../src/llm/classifier'); + // Canonical + expect(parseRole('business_logic')).toBe('business_logic'); + // Trailing punctuation + expect(parseRole('business_logic.')).toBe('business_logic'); + // Fenced + quotes + expect(parseRole('`business_logic`')).toBe('business_logic'); + // Title-cased multi-word — the case the reviewer flagged. + expect(parseRole('Business Logic')).toBe('business_logic'); + expect(parseRole('Api Endpoint')).toBe('api_endpoint'); + // Garbage falls through to "unknown" (advisory degrade). + expect(parseRole('I think this is a util maybe')).toBe('unknown'); + }); + + it('agent bridge: pendingSummariesBatch + saveAgentSummaries round-trip without LLM', async () => { + // No config.llm — exercises the path users without Ollama would take. + const cg = await CodeGraph.init(tempDir); + try { + await cg.indexAll({ summarize: false }); + + const batch = cg.pendingSummariesBatch({ limit: 5, modelHint: 'claude-test' }); + expect(batch.items.length).toBeGreaterThan(0); + expect(batch.total).toBeGreaterThanOrEqual(batch.items.length); + // Each item should have a non-empty body and a content_hash. + for (const it of batch.items) { + expect(it.body.length).toBeGreaterThan(0); + expect(it.contentHash.length).toBe(32); + } + + // Pretend the agent answered each one with a fake summary. + const saved = cg.saveAgentSummaries( + batch.items.map((it) => ({ + nodeId: it.nodeId, + contentHash: it.contentHash, + summary: `Agent-summarised ${it.name}`, + })), + 'claude-test' + ); + expect(saved.saved).toBe(batch.items.length); + expect(saved.skipped).toBe(0); + + // Coverage now reflects the writes. 
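+      // (Agent-saved summaries count toward getSummaryCoverage() the
+      // same as LLM-generated ones.)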
+ const cov = cg.getSummaryCoverage(); + expect(cov.summarised).toBeGreaterThanOrEqual(batch.items.length); + + // Re-issuing the batch with the same modelHint should NOT return + // the same items again (cache short-circuit). + const batch2 = cg.pendingSummariesBatch({ limit: 5, modelHint: 'claude-test' }); + const overlap = batch2.items.filter((b) => + batch.items.some((a) => a.nodeId === b.nodeId) + ); + expect(overlap.length).toBe(0); + } finally { + cg.close(); + } + }); + + it('agent bridge: stale content_hash is rejected with a clear error', async () => { + const cg = await CodeGraph.init(tempDir); + try { + await cg.indexAll({ summarize: false }); + const batch = cg.pendingSummariesBatch({ limit: 1 }); + const item = batch.items[0]!; + const result = cg.saveAgentSummaries( + [ + { + nodeId: item.nodeId, + contentHash: 'cccccccccccccccccccccccccccccccc', // stale + summary: 'wrong cache key', + }, + ], + 'claude-test' + ); + expect(result.saved).toBe(0); + expect(result.skipped).toBe(1); + expect(result.errors[0]).toMatch(/content_hash drifted/); + } finally { + cg.close(); + } + }); + + it('checkNamingDrift returns advisory consistent/suggestion shape', async () => { + const cg = await CodeGraph.init(tempDir, { + config: { llm: { endpoint: fake.url, chatModel: 'qwen2.5-coder:7b' } }, + }); + try { + await cg.indexAll({ summarize: false }); + // Override response so we can assert parsing of an inconsistent verdict + fake.nextChatText = + '{"consistent": false, "suggestion": "createSession", "reason": "siblings use create* prefix"}'; + + const verdict = await cg.checkNamingDrift({ + name: 'makeSession', + kind: 'function', + filePath: 'src/auth/new.ts', + }); + expect(verdict.consistent).toBe(false); + expect(verdict.suggestion).toBe('createSession'); + } finally { + cg.close(); + } + }); +}); diff --git a/__tests__/llm.test.ts b/__tests__/llm.test.ts new file mode 100644 index 00000000..e4bc48b6 --- /dev/null +++ b/__tests__/llm.test.ts @@ -0,0 +1,366 @@ +/** + * LLM auto-detect + background summarisation tests + * + * Spins up a tiny in-process HTTP server that mimics the OpenAI-compat + * surface Ollama exposes. Covers: + * - detectLocalLlm picks a chat model from /v1/models + * - LlmClient.isReachable / listModels round-trip + * - summarizeAll content_hash cache: re-running is a pure cache hit + * - CodeGraph.startBackgroundSummarization is fire-and-forget + * - cancellation via AbortController on close() + */ + +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; +import * as http from 'http'; +import { AddressInfo } from 'net'; +import { CodeGraph } from '../src'; +import { LlmClient } from '../src/llm/client'; +import { detectLocalLlm } from '../src/llm/detect'; + +interface FakeServerOptions { + models?: string[]; + /** Delay before responding to /chat/completions, ms. */ + chatDelayMs?: number; + /** Optional override for the chat completion text. */ + chatText?: string; +} + +interface FakeServer { + url: string; + chatCalls: number; + modelsCalls: number; + close: () => Promise; +} + +async function startFakeOllama(options: FakeServerOptions = {}): Promise { + const models = options.models ?? 
['qwen2.5-coder:7b']; + const state = { chatCalls: 0, modelsCalls: 0 }; + + const server = http.createServer(async (req, res) => { + if (req.url === '/v1/models' || req.url === '/models') { + state.modelsCalls++; + res.setHeader('content-type', 'application/json'); + res.end(JSON.stringify({ data: models.map((id) => ({ id })) })); + return; + } + if (req.url?.endsWith('/chat/completions')) { + state.chatCalls++; + let body = ''; + req.on('data', (chunk) => (body += chunk)); + req.on('end', async () => { + if (options.chatDelayMs) { + await new Promise((r) => setTimeout(r, options.chatDelayMs)); + } + res.setHeader('content-type', 'application/json'); + res.end( + JSON.stringify({ + choices: [ + { + message: { + role: 'assistant', + content: options.chatText ?? 'Computes a thing and returns it', + }, + }, + ], + usage: { prompt_tokens: 10, completion_tokens: 8 }, + }) + ); + }); + return; + } + res.statusCode = 404; + res.end(); + }); + + await new Promise((resolve) => server.listen(0, '127.0.0.1', resolve)); + const addr = server.address() as AddressInfo; + const url = `http://127.0.0.1:${addr.port}/v1`; + + return { + url, + get chatCalls() { + return state.chatCalls; + }, + get modelsCalls() { + return state.modelsCalls; + }, + close: () => + new Promise((resolve, reject) => { + server.close((err) => (err ? reject(err) : resolve())); + }), + }; +} + +describe('LlmClient', () => { + it('isReachable returns true when /v1/models responds', async () => { + const fake = await startFakeOllama(); + try { + const client = new LlmClient({ endpoint: fake.url }); + expect(await client.isReachable()).toBe(true); + } finally { + await fake.close(); + } + }); + + it('isReachable returns false when nothing listens', async () => { + // Pick an unused port deterministically by opening + immediately closing + // a server. Race-free enough for a single test. 
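+    // listen(0) asks the OS for a free ephemeral port; closing the
+    // server releases it, so the client's immediate connect fails fast.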
+ const tmp = http.createServer(); + await new Promise((r) => tmp.listen(0, '127.0.0.1', r)); + const port = (tmp.address() as AddressInfo).port; + await new Promise((r) => tmp.close(() => r())); + const client = new LlmClient({ + endpoint: `http://127.0.0.1:${port}/v1`, + timeoutMs: 200, + }); + expect(await client.isReachable()).toBe(false); + }); + + it('listModels returns ids from /v1/models', async () => { + const fake = await startFakeOllama({ models: ['qwen2.5:7b', 'gemma3:4b', 'nomic-embed-text'] }); + try { + const client = new LlmClient({ endpoint: fake.url }); + const ids = await client.listModels(); + expect(ids).toEqual(['qwen2.5:7b', 'gemma3:4b', 'nomic-embed-text']); + } finally { + await fake.close(); + } + }); +}); + +describe('detectLocalLlm', () => { + it('picks a preferred chat model and skips embedding-only ids', async () => { + const fake = await startFakeOllama({ + models: ['nomic-embed-text', 'gemma3:4b', 'qwen2.5-coder:7b'], + }); + try { + const detected = await detectLocalLlm(fake.url); + expect(detected).not.toBeNull(); + expect(detected?.chatModel).toBe('qwen2.5-coder:7b'); + expect(detected?.embeddingModel).toBe('nomic-embed-text'); + } finally { + await fake.close(); + } + }); + + it('falls back to first non-embedding model when none preferred', async () => { + const fake = await startFakeOllama({ + models: ['custom-finetune:13b', 'bge-m3'], + }); + try { + const detected = await detectLocalLlm(fake.url); + expect(detected?.chatModel).toBe('custom-finetune:13b'); + } finally { + await fake.close(); + } + }); + + it('returns null when endpoint is unreachable', async () => { + const tmp = http.createServer(); + await new Promise((r) => tmp.listen(0, '127.0.0.1', r)); + const port = (tmp.address() as AddressInfo).port; + await new Promise((r) => tmp.close(() => r())); + const detected = await detectLocalLlm(`http://127.0.0.1:${port}/v1`, 200); + expect(detected).toBeNull(); + }); +}); + +describe('CodeGraph background summarisation', () => { + let tempDir: string; + let fake: FakeServer; + + beforeEach(async () => { + tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-llm-')); + fake = await startFakeOllama(); + // Drop one TS file so indexAll has something summarisable + fs.writeFileSync( + path.join(tempDir, 'sample.ts'), + `export function greet(name: string): string { + const greeting = 'Hello'; + const punctuation = '!'; + return \`\${greeting}, \${name}\${punctuation}\`; +} + +export class Counter { + private value: number = 0; + increment(): number { + this.value += 1; + return this.value; + } + reset(): void { + this.value = 0; + } +} +` + ); + }); + + afterEach(async () => { + await fake.close(); + fs.rmSync(tempDir, { recursive: true, force: true }); + }); + + it('startBackgroundSummarization populates the cache when an LLM is configured', async () => { + const cg = await CodeGraph.init(tempDir, { + config: { + llm: { endpoint: fake.url, chatModel: 'qwen2.5-coder:7b' }, + }, + }); + try { + // indexAll fires summarisation in the background + await cg.indexAll(); + await cg.awaitBackgroundSummarization(); + + const cov = cg.getSummaryCoverage(); + expect(cov.total).toBeGreaterThan(0); + expect(cov.summarised).toBeGreaterThan(0); + expect(fake.chatCalls).toBeGreaterThan(0); + } finally { + cg.close(); + } + }); + + it('re-running is a pure cache hit (no LLM calls)', async () => { + const cg = await CodeGraph.init(tempDir, { + config: { + llm: { endpoint: fake.url, chatModel: 'qwen2.5-coder:7b' }, + }, + }); + try { + await cg.indexAll(); + await 
cg.awaitBackgroundSummarization(); + const callsAfterFirstPass = fake.chatCalls; + expect(callsAfterFirstPass).toBeGreaterThan(0); + + // Run it again — every symbol should hit the cache. + const result = await cg.summarizeAll(); + expect(result.cacheHits).toBe(result.candidates); + expect(result.generated).toBe(0); + expect(fake.chatCalls).toBe(callsAfterFirstPass); + } finally { + cg.close(); + } + }); + + it('hasLlm + getEffectiveLlmConfig reflect explicit config', async () => { + const cg = await CodeGraph.init(tempDir, { + config: { + llm: { endpoint: fake.url, chatModel: 'qwen2.5-coder:7b' }, + }, + }); + try { + expect(cg.hasLlm()).toBe(true); + const eff = await cg.getEffectiveLlmConfig(); + expect(eff?.endpoint).toBe(fake.url); + expect(eff?.chatModel).toBe('qwen2.5-coder:7b'); + } finally { + cg.close(); + } + }); + + it('skips background pass silently when no LLM is reachable', async () => { + // Point at a guaranteed-closed port so the test is hermetic (host + // may or may not have Ollama on 11434). Reachability check fails + // and the background pass returns early without making chat calls. + const tmp = http.createServer(); + await new Promise((r) => tmp.listen(0, '127.0.0.1', r)); + const closedPort = (tmp.address() as AddressInfo).port; + await new Promise((r) => tmp.close(() => r())); + + const cg = await CodeGraph.init(tempDir, { + config: { + llm: { + endpoint: `http://127.0.0.1:${closedPort}/v1`, + chatModel: 'qwen2.5-coder:7b', + timeoutMs: 200, + }, + }, + }); + try { + await cg.indexAll(); + await cg.awaitBackgroundSummarization(); + const cov = cg.getSummaryCoverage(); + expect(cov.summarised).toBe(0); + expect(cg.isSummarizing()).toBe(false); + } finally { + cg.close(); + } + }); + + it('close() cancels in-flight background summarisation', async () => { + // Slow chat replies so we can observe cancellation between calls. + await fake.close(); + fake = await startFakeOllama({ chatDelayMs: 100 }); + + const cg = await CodeGraph.init(tempDir, { + config: { + llm: { endpoint: fake.url, chatModel: 'qwen2.5-coder:7b' }, + }, + }); + await cg.indexAll(); + // Don't await: cancel mid-flight. + expect(cg.isSummarizing()).toBe(true); + cg.close(); + // close() aborts the controller; awaiting here would hang on the + // last in-flight HTTP request, so we just verify the bookkeeping + // is consistent. + expect(cg.isSummarizing()).toBe(false); + }); + + it('re-queues a second pass when sync fires mid-pass (dirty flag)', async () => { + // Slow chat replies so the bg pass is still running when we kick + // off a second startBackgroundSummarization() call. + await fake.close(); + fake = await startFakeOllama({ chatDelayMs: 30 }); + + const cg = await CodeGraph.init(tempDir, { + config: { llm: { endpoint: fake.url, chatModel: 'qwen2.5-coder:7b' } }, + }); + try { + await cg.indexAll(); + // First pass is mid-flight; this second call should set the + // dirty flag and return the existing promise rather than + // starting a parallel pass. + const p1 = cg.startBackgroundSummarization(); + const p2 = cg.startBackgroundSummarization(); + expect(p1).toBe(p2); + await p1; + // After the first pass completes, the dirty flag triggers a + // second pass — wait for it and ensure it ran clean (cache + // hits, no errors). 
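+      // If the follow-up pass already finished, isSummarizing() is
+      // false and there is nothing left to await.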
+ if (cg.isSummarizing()) { + await cg.awaitBackgroundSummarization(); + } + expect(cg.isSummarizing()).toBe(false); + } finally { + cg.close(); + } + }); + + it('getSymbolSummaries returns map keyed by node id', async () => { + const cg = await CodeGraph.init(tempDir, { + config: { + llm: { endpoint: fake.url, chatModel: 'qwen2.5-coder:7b' }, + }, + }); + try { + await cg.indexAll(); + await cg.awaitBackgroundSummarization(); + + const allNodes = cg.getStats(); + expect(allNodes.nodeCount).toBeGreaterThan(0); + + const ids = cg + .searchNodes('greet', { limit: 5 }) + .map((r) => r.node.id); + const summaries = cg.getSymbolSummaries(ids); + // At least one summarised symbol came back. + expect([...summaries.values()].some((s) => s.length > 0)).toBe(true); + } finally { + cg.close(); + } + }); +}); diff --git a/__tests__/mcp-tool-registry.test.ts b/__tests__/mcp-tool-registry.test.ts new file mode 100644 index 00000000..0bf45159 --- /dev/null +++ b/__tests__/mcp-tool-registry.test.ts @@ -0,0 +1,90 @@ +/** + * MCP tool registry: structural invariants. + * + * Guards against the failure mode where a future PR adds a + * ToolModule but forgets to implement the matching `handle` + * method on ToolHandler (or vice versa). + */ +import { describe, it, expect } from 'vitest'; +import { getToolModules, tools as registryTools } from '../src/mcp/tools/registry'; +import { ToolHandler, tools } from '../src/mcp/tools'; + +describe('MCP tool registry — single source of truth', () => { + it('every tool module has a non-empty name and description', () => { + for (const m of getToolModules()) { + expect(m.definition.name).toMatch(/^codegraph_[a-z_]+$/); + expect(m.definition.description.length).toBeGreaterThan(20); + } + }); + + it('handlerKey is a string starting with "handle"', () => { + for (const m of getToolModules()) { + expect(m.handlerKey).toMatch(/^handle[A-Z][A-Za-z]+$/); + } + }); + + it('every registered tool has a corresponding ToolHandler method', () => { + const handler = new ToolHandler(null); + for (const m of getToolModules()) { + const fn = (handler as unknown as Record)[m.handlerKey]; + expect(typeof fn).toBe('function'); + } + }); + + it('exported `tools` array exactly mirrors the registry', () => { + const fromRegistry = registryTools.map((t) => t.name).sort(); + const fromExport = tools.map((t) => t.name).sort(); + expect(fromExport).toEqual(fromRegistry); + }); + + it('all main-line tools are registered (regression guard)', () => { + const expected = [ + 'codegraph_ask', + 'codegraph_callees', + 'codegraph_callers', + 'codegraph_config', + 'codegraph_context', + 'codegraph_dead_code', + 'codegraph_explore', + 'codegraph_files', + 'codegraph_hotspots', + 'codegraph_impact', + 'codegraph_module', + 'codegraph_node', + 'codegraph_pending_summaries', + 'codegraph_review_context', + 'codegraph_role', + 'codegraph_save_summaries', + 'codegraph_search', + 'codegraph_similar', + 'codegraph_sql', + 'codegraph_status', + ]; + const actual = getToolModules() + .map((m) => m.definition.name) + .sort(); + expect(actual).toEqual(expected); + }); + + it('execute() reports unknown-tool errors', async () => { + const handler = new ToolHandler(null); + const result = await handler.execute('codegraph_does_not_exist', {}); + expect(result.isError).toBe(true); + expect(result.content[0]?.text).toMatch(/Unknown tool/); + }); + + it('execute() actually dispatches to the registered handler (no broken `this` binding)', async () => { + // No CodeGraph instance is bound, so handlers that call + // 
`getCodeGraph()` will throw — the dispatch should catch it + // and return an error result. The point of this test is to + // confirm the registry lookup + `this[handlerKey](args)` chain + // reaches an actual method body, not that the body succeeds. + const handler = new ToolHandler(null); + const result = await handler.execute('codegraph_status', {}); + expect(result.isError).toBe(true); + // Generic tool-execution-failed envelope from execute()'s catch block. + expect(result.content[0]?.text).toMatch(/Tool execution failed/); + // Specifically because no CodeGraph was bound: + expect(result.content[0]?.text).toMatch(/CodeGraph not initialized/); + }); +}); diff --git a/__tests__/migrations-015-016.test.ts b/__tests__/migrations-015-016.test.ts new file mode 100644 index 00000000..b71968fe --- /dev/null +++ b/__tests__/migrations-015-016.test.ts @@ -0,0 +1,148 @@ +/** + * Migration 015 (drop idx_co_changes_a) and 016 (split embeddings). + * + * - 015 verifies the redundant `idx_co_changes_a` index is removed + * on upgrade and absent on a fresh DB; the wider PK still covers + * `WHERE file_a = ?` lookups. + * - 016 verifies embeddings move from `symbol_summaries.embedding` + * into a dedicated `symbol_embeddings` table, the old columns + * are dropped, and existing data is preserved verbatim. + */ +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; +import { createDatabase } from '../src/db/sqlite-adapter'; +import { runMigrations, getCurrentVersion } from '../src/db/migrations'; +import { DatabaseConnection } from '../src/db'; + +function tempDir(): string { + return fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-mig-015-016-')); +} + +function cleanup(dir: string): void { + if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true }); +} + +describe('Migration 015 — drop idx_co_changes_a', () => { + let dir: string; + beforeEach(() => { dir = tempDir(); }); + afterEach(() => cleanup(dir)); + + it('fresh DB does not contain idx_co_changes_a, but keeps idx_co_changes_b', () => { + const dbPath = path.join(dir, 'fresh.db'); + const db = DatabaseConnection.initialize(dbPath); + try { + const indexes = db.getDb() + .prepare("SELECT name FROM sqlite_master WHERE type = 'index' AND tbl_name = 'co_changes'") + .all() as Array<{ name: string }>; + const names = indexes.map((r) => r.name); + expect(names).not.toContain('idx_co_changes_a'); + expect(names).toContain('idx_co_changes_b'); + } finally { + db.close(); + } + }); +}); + +describe('Migration 016 — split embeddings into symbol_embeddings table', () => { + let dir: string; + beforeEach(() => { dir = tempDir(); }); + afterEach(() => cleanup(dir)); + + it('moves existing embedding rows; drops the inline columns', () => { + const dbPath = path.join(dir, 'upgrade.db'); + const adapter = createDatabase(dbPath); + + // Simulate a v14 database: just enough of the relevant schema. 
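+    // Hand-rolled DDL rather than running migrations 1-14: the test only
+    // needs symbol_summaries with the legacy inline embedding columns and
+    // a schema_versions row pinned to 14.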
+ adapter.exec(` + CREATE TABLE nodes (id TEXT PRIMARY KEY); + INSERT INTO nodes (id) VALUES ('n1'), ('n2'), ('n3'); + CREATE TABLE symbol_summaries ( + node_id TEXT PRIMARY KEY, + content_hash TEXT NOT NULL, + summary TEXT NOT NULL, + model TEXT NOT NULL, + generated_at INTEGER NOT NULL, + embedding BLOB, + embedding_model TEXT, + role TEXT, + role_model TEXT, + FOREIGN KEY (node_id) REFERENCES nodes(id) ON DELETE CASCADE + ); + CREATE INDEX idx_summaries_embedding_model ON symbol_summaries(embedding_model); + CREATE TABLE schema_versions ( + version INTEGER PRIMARY KEY, + applied_at INTEGER NOT NULL, + description TEXT + ); + INSERT INTO schema_versions (version, applied_at, description) VALUES (14, 0, 'v14'); + `); + + // n1 has both summary and embedding; n2 has summary only; + // n3 has summary + embedding from a stale model — all rows are + // copied into symbol_embeddings so long as embedding_model is set. + const buf1 = Buffer.from(new Float32Array([1, 0, 0]).buffer); + const buf3 = Buffer.from(new Float32Array([0, 1, 0]).buffer); + adapter.prepare(` + INSERT INTO symbol_summaries + (node_id, content_hash, summary, model, generated_at, embedding, embedding_model) + VALUES + ('n1', 'h1', 's1', 'chat-m', 100, ?, 'embed-m'), + ('n2', 'h2', 's2', 'chat-m', 100, NULL, NULL), + ('n3', 'h3', 's3', 'chat-m', 100, ?, 'old-embed-m') + `).run(buf1, buf3); + + runMigrations(adapter, getCurrentVersion(adapter)); + + // Old columns gone + const cols = adapter.prepare("PRAGMA table_info('symbol_summaries')").all() as Array<{ name: string }>; + const colNames = cols.map((c) => c.name); + expect(colNames).not.toContain('embedding'); + expect(colNames).not.toContain('embedding_model'); + + // New table has the rows that had embedding_model set + const moved = adapter + .prepare('SELECT node_id, embedding_model FROM symbol_embeddings ORDER BY node_id') + .all() as Array<{ node_id: string; embedding_model: string }>; + expect(moved).toEqual([ + { node_id: 'n1', embedding_model: 'embed-m' }, + { node_id: 'n3', embedding_model: 'old-embed-m' }, + ]); + + // Embedding bytes preserved verbatim for n1 + const n1 = adapter + .prepare('SELECT embedding FROM symbol_embeddings WHERE node_id = ?') + .get('n1') as { embedding: Buffer }; + expect(Buffer.from(n1.embedding).equals(buf1)).toBe(true); + + // Index on the new table + const idx = adapter + .prepare("SELECT name FROM sqlite_master WHERE type = 'index' AND tbl_name = 'symbol_embeddings'") + .all() as Array<{ name: string }>; + expect(idx.map((r) => r.name)).toContain('idx_embeddings_model'); + + expect(getCurrentVersion(adapter)).toBeGreaterThanOrEqual(16); + + adapter.close(); + }); + + it('fresh DB has symbol_embeddings table and no embedding columns on symbol_summaries', () => { + const db = DatabaseConnection.initialize(path.join(dir, 'fresh.db')); + try { + const cols = db.getDb() + .prepare("PRAGMA table_info('symbol_summaries')") + .all() as Array<{ name: string }>; + const colNames = cols.map((c) => c.name); + expect(colNames).not.toContain('embedding'); + expect(colNames).not.toContain('embedding_model'); + + const tables = db.getDb() + .prepare("SELECT name FROM sqlite_master WHERE type = 'table' AND name = 'symbol_embeddings'") + .all() as Array<{ name: string }>; + expect(tables.length).toBe(1); + } finally { + db.close(); + } + }); +}); diff --git a/__tests__/migrations-registry.test.ts b/__tests__/migrations-registry.test.ts new file mode 100644 index 00000000..9fa15eed --- /dev/null +++ b/__tests__/migrations-registry.test.ts @@ -0,0 +1,95 
@@ +/** + * Migration registry: structural invariants. + * + * Guards against the silent-no-op bug class that motivated this + * refactor. If a future PR introduces a duplicate version, + * out-of-order versions, or fails to register a new migration + * file, one of these tests fails loudly. + */ +import { describe, it, expect } from 'vitest'; +import * as fs from 'fs'; +import * as path from 'path'; +import { + ALL_MIGRATIONS, + CURRENT_SCHEMA_VERSION, +} from '../src/db/migrations'; + +describe('migration registry — structural invariants', () => { + it('registry is non-empty', () => { + expect(ALL_MIGRATIONS.length).toBeGreaterThan(0); + }); + + it('versions are unique', () => { + const seen = new Set(); + for (const m of ALL_MIGRATIONS) { + expect(seen.has(m.version)).toBe(false); + seen.add(m.version); + } + }); + + it('versions are strictly ascending', () => { + for (let i = 1; i < ALL_MIGRATIONS.length; i++) { + expect(ALL_MIGRATIONS[i]!.version).toBeGreaterThan( + ALL_MIGRATIONS[i - 1]!.version + ); + } + }); + + it('each migration has a non-empty description and a function up()', () => { + for (const m of ALL_MIGRATIONS) { + expect(m.description.length).toBeGreaterThan(0); + expect(typeof m.up).toBe('function'); + } + }); + + it('CURRENT_SCHEMA_VERSION matches the highest registered version', () => { + const max = ALL_MIGRATIONS[ALL_MIGRATIONS.length - 1]!.version; + expect(CURRENT_SCHEMA_VERSION).toBe(max); + }); +}); + +describe('migration files — filename ↔ version coupling', () => { + // Read the actual filenames on disk and assert each matches an + // entry in the registry. Catches the case where someone drops a + // new file in src/db/migrations/ but forgets to register it. + const migrationsDir = path.resolve(__dirname, '../src/db/migrations'); + const SUPPORT_FILES = new Set(['index.ts', 'types.ts']); + const STRICT_NNN_PATTERN = /^\d{3}-[a-z0-9]+(?:-[a-z0-9]+)*\.ts$/; + + function listMigrationFiles(): string[] { + return fs.readdirSync(migrationsDir).filter((f) => f.endsWith('.ts') && !SUPPORT_FILES.has(f)); + } + + it('every migration file matches the strict `NNN-kebab-name.ts` pattern', () => { + const offenders: string[] = []; + for (const f of listMigrationFiles()) { + if (!STRICT_NNN_PATTERN.test(f)) { + offenders.push(f); + } + } + expect(offenders).toEqual([]); + }); + + it('every src/db/migrations/NNN-*.ts file is registered (no orphan files)', () => { + const files = listMigrationFiles().filter((f) => STRICT_NNN_PATTERN.test(f)); + expect(files.length).toBeGreaterThan(0); + const registeredVersions = new Set(ALL_MIGRATIONS.map((m) => m.version)); + for (const f of files) { + const version = parseInt(f.slice(0, 3), 10); + if (!registeredVersions.has(version)) { + throw new Error( + `Migration file ${f} exists on disk but is not registered in src/db/migrations/index.ts. 
` + + `Add an import + array entry for it.` + ); + } + } + }); + + it('every registered version has a matching NNN-*.ts file (no phantom registrations)', () => { + const files = listMigrationFiles().filter((f) => STRICT_NNN_PATTERN.test(f)); + const filenameVersions = new Set(files.map((f) => parseInt(f.slice(0, 3), 10))); + for (const m of ALL_MIGRATIONS) { + expect(filenameVersions.has(m.version)).toBe(true); + } + }); +}); diff --git a/__tests__/pr19-improvements.test.ts b/__tests__/pr19-improvements.test.ts index 5fbe17d7..9f9ddc38 100644 --- a/__tests__/pr19-improvements.test.ts +++ b/__tests__/pr19-improvements.test.ts @@ -299,7 +299,7 @@ describe('Best-Candidate Resolution', () => { describe('Schema v2 Migration', () => { it.skipIf(!HAS_SQLITE)('should have correct current schema version', async () => { const { CURRENT_SCHEMA_VERSION } = await import('../src/db/migrations'); - expect(CURRENT_SCHEMA_VERSION).toBe(3); + expect(CURRENT_SCHEMA_VERSION).toBe(16); }); it.skipIf(!HAS_SQLITE)('should have migration for version 2', async () => { diff --git a/__tests__/review-context.test.ts b/__tests__/review-context.test.ts new file mode 100644 index 00000000..fd61e103 --- /dev/null +++ b/__tests__/review-context.test.ts @@ -0,0 +1,644 @@ +/** + * Review Context Tests + * + * Verifies: + * - parseDiff handles standard git unified-diff shapes (modified, + * added, deleted, renamed, multiple hunks). + * - symbolsTouchedByHunks correctly maps line ranges to symbols. + * - buildReviewContext attaches callers, callees, impact, tests + * for affected symbols. + * - Co-change warnings surface when a changed file's historical + * co-changers were NOT touched. + * - Graceful degrade: pre-#105 install (no co_changes table) and + * pre-#106 install (no `tests` edges) — return empty rather than + * throwing. 
+ */ + +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; +import { parseDiff, symbolsTouchedByHunks } from '../src/review/diff-parser'; +import { buildReviewContext } from '../src/review'; +import { DatabaseConnection } from '../src/db'; +import { QueryBuilder } from '../src/db/queries'; +import { GraphTraverser } from '../src/graph/traversal'; +import { Node, Edge } from '../src/types'; + +// ============================================================================= +// parseDiff +// ============================================================================= + +describe('parseDiff', () => { + it('parses a simple modified-file diff', () => { + const diff = `diff --git a/src/foo.ts b/src/foo.ts +index abc..def 100644 +--- a/src/foo.ts ++++ b/src/foo.ts +@@ -10,3 +10,5 @@ + unchanged +-old line ++new line one ++new line two + also unchanged`; + const files = parseDiff(diff); + expect(files).toHaveLength(1); + expect(files[0].path).toBe('src/foo.ts'); + expect(files[0].status).toBe('modified'); + expect(files[0].hunks).toEqual([ + { oldStart: 10, oldCount: 3, newStart: 10, newCount: 5 }, + ]); + }); + + it('detects file additions via /dev/null in the --- header', () => { + const diff = `diff --git a/new.ts b/new.ts +new file mode 100644 +index 0000000..abc +--- /dev/null ++++ b/new.ts +@@ -0,0 +1,3 @@ ++a ++b ++c`; + const files = parseDiff(diff); + expect(files).toHaveLength(1); + expect(files[0].status).toBe('added'); + expect(files[0].path).toBe('new.ts'); + }); + + it('detects file deletions via /dev/null in the +++ header', () => { + const diff = `diff --git a/gone.ts b/gone.ts +deleted file mode 100644 +index abc..0000000 +--- a/gone.ts ++++ /dev/null +@@ -1,3 +0,0 @@ +-a +-b +-c`; + const files = parseDiff(diff); + expect(files).toHaveLength(1); + expect(files[0].status).toBe('deleted'); + expect(files[0].path).toBe('gone.ts'); + }); + + it('detects renames and exposes oldPath', () => { + const diff = `diff --git a/old.ts b/new.ts +similarity index 95% +rename from old.ts +rename to new.ts +index abc..def 100644 +--- a/old.ts ++++ b/new.ts +@@ -1,2 +1,2 @@ +-old name ++new name + unchanged`; + const files = parseDiff(diff); + expect(files).toHaveLength(1); + expect(files[0].status).toBe('renamed'); + expect(files[0].path).toBe('new.ts'); + expect(files[0].oldPath).toBe('old.ts'); + }); + + it('handles multi-file, multi-hunk diffs', () => { + const diff = `diff --git a/a.ts b/a.ts +index abc..def 100644 +--- a/a.ts ++++ b/a.ts +@@ -10,3 +10,4 @@ + ctx ++added + ctx + ctx +@@ -20,2 +21,2 @@ +-old ++new + ctx +diff --git a/b.ts b/b.ts +index 111..222 100644 +--- a/b.ts ++++ b/b.ts +@@ -5,1 +5,1 @@ +-x ++y`; + const files = parseDiff(diff); + expect(files).toHaveLength(2); + expect(files[0].path).toBe('a.ts'); + expect(files[0].hunks).toHaveLength(2); + expect(files[1].path).toBe('b.ts'); + expect(files[1].hunks).toHaveLength(1); + }); + + it('returns [] for empty input', () => { + expect(parseDiff('')).toEqual([]); + }); + + it('emits a hunk-less rename even when followed by another hunked file', () => { + // Regression: previously a rename-only file mid-diff was silently + // dropped because the EOF-only hunk-less flush never fired before + // the next `diff --git` header arrived. 
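+    // The fixture reproduces that shape: old.ts -> new.ts below carries
+    // only rename headers (no @@ hunk), and a hunked other.ts follows it.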
+ const diff = `diff --git a/old.ts b/new.ts +similarity index 100% +rename from old.ts +rename to new.ts +diff --git a/other.ts b/other.ts +index abc..def 100644 +--- a/other.ts ++++ b/other.ts +@@ -1,1 +1,1 @@ +-x ++y`; + const files = parseDiff(diff); + expect(files).toHaveLength(2); + expect(files[0].status).toBe('renamed'); + expect(files[0].path).toBe('new.ts'); + expect(files[0].oldPath).toBe('old.ts'); + expect(files[1].path).toBe('other.ts'); + expect(files[1].status).toBe('modified'); + }); + + it('emits a hunk-less file-mode-change followed by another file', () => { + const diff = `diff --git a/script.sh b/script.sh +old mode 100644 +new mode 100755 +diff --git a/foo.ts b/foo.ts +index abc..def 100644 +--- a/foo.ts ++++ b/foo.ts +@@ -1,1 +1,1 @@ +-a ++b`; + const files = parseDiff(diff); + // The mode-change file has no add/delete/rename markers so it + // doesn't qualify as hunk-less for our purposes — it's silently + // skipped (current implementation). The hunked file MUST still + // be emitted, and that's the regression risk. + expect(files.find((f) => f.path === 'foo.ts')).toBeDefined(); + }); + + it('strips C-style quoting from paths with spaces or special chars', () => { + const diff = `diff --git "a/path with spaces.ts" "b/path with spaces.ts" +index abc..def 100644 +--- "a/path with spaces.ts" ++++ "b/path with spaces.ts" +@@ -1,1 +1,1 @@ +-a ++b`; + const files = parseDiff(diff); + expect(files).toHaveLength(1); + expect(files[0].path).toBe('path with spaces.ts'); + expect(files[0].path).not.toContain('"'); + }); + + it('handles single-line hunk header (no comma)', () => { + // git emits `@@ -5 +5 @@` for one-line hunks (count of 1 elided). + const diff = `diff --git a/x.ts b/x.ts +index abc..def 100644 +--- a/x.ts ++++ b/x.ts +@@ -5 +5 @@ +-old ++new`; + const files = parseDiff(diff); + expect(files[0].hunks[0]).toEqual({ + oldStart: 5, + oldCount: 1, + newStart: 5, + newCount: 1, + }); + }); +}); + +// ============================================================================= +// symbolsTouchedByHunks +// ============================================================================= + +describe('symbolsTouchedByHunks', () => { + const sym = (startLine: number, endLine: number, name = 'sym') => ({ startLine, endLine, name }); + + it('returns symbols whose range overlaps any hunk', () => { + const symbols = [sym(1, 5, 'a'), sym(10, 20, 'b'), sym(50, 60, 'c')]; + const hunks = [{ oldStart: 12, oldCount: 3, newStart: 12, newCount: 3 }]; + const out = symbolsTouchedByHunks(hunks, symbols); + expect(out.map((s) => s.name)).toEqual(['b']); + }); + + it('matches a symbol that fully contains the hunk', () => { + const symbols = [sym(1, 100, 'big')]; + const hunks = [{ oldStart: 50, oldCount: 1, newStart: 50, newCount: 1 }]; + expect(symbolsTouchedByHunks(hunks, symbols).map((s) => s.name)).toEqual(['big']); + }); + + it('matches a symbol fully contained by the hunk', () => { + const symbols = [sym(50, 55, 'small')]; + const hunks = [{ oldStart: 10, oldCount: 100, newStart: 10, newCount: 100 }]; + expect(symbolsTouchedByHunks(hunks, symbols).map((s) => s.name)).toEqual(['small']); + }); + + it('does not match symbols outside any hunk', () => { + const symbols = [sym(1, 5, 'before'), sym(50, 60, 'after')]; + const hunks = [{ oldStart: 20, oldCount: 5, newStart: 20, newCount: 5 }]; + expect(symbolsTouchedByHunks(hunks, symbols)).toEqual([]); + }); + + it('returns [] when hunks or symbols are empty', () => { + expect(symbolsTouchedByHunks([], [sym(1, 5)])).toEqual([]); + 
expect(symbolsTouchedByHunks([{ oldStart: 1, oldCount: 1, newStart: 1, newCount: 1 }], [])).toEqual([]); + }); +}); + +// ============================================================================= +// buildReviewContext (integration) +// ============================================================================= + +function makeNode(id: string, name: string, kind: Node['kind'], filePath: string, startLine: number, endLine: number): Node { + return { + id, + kind, + name, + qualifiedName: `${filePath}::${name}`, + filePath, + language: 'typescript', + startLine, + endLine, + startColumn: 0, + endColumn: 0, + updatedAt: Date.now(), + }; +} + +describe('buildReviewContext (integration)', () => { + let dir: string; + let db: DatabaseConnection; + let q: QueryBuilder; + let traverser: GraphTraverser; + + beforeEach(() => { + dir = fs.mkdtempSync(path.join(os.tmpdir(), 'review-ctx-')); + db = DatabaseConnection.initialize(path.join(dir, 'test.db')); + q = new QueryBuilder(db.getDb()); + traverser = new GraphTraverser(q); + + // Set up a small graph: + // src/foo.ts contains `doFoo` (lines 5-15) + // src/bar.ts contains `useFoo` (lines 1-10) which calls doFoo + // src/baz.ts contains `helper` (lines 20-30) which doFoo calls + const upsertFile = db.getDb().prepare(` + INSERT INTO files (path, content_hash, language, size, modified_at, indexed_at) + VALUES (?, '', 'typescript', 0, 0, 0) + `); + upsertFile.run('src/foo.ts'); + upsertFile.run('src/bar.ts'); + upsertFile.run('src/baz.ts'); + + q.insertNodes([ + makeNode('foo', 'doFoo', 'function', 'src/foo.ts', 5, 15), + makeNode('bar', 'useFoo', 'function', 'src/bar.ts', 1, 10), + makeNode('baz', 'helper', 'function', 'src/baz.ts', 20, 30), + ]); + + // Edges: useFoo -> doFoo (calls), doFoo -> helper (calls) + const callEdge = (source: string, target: string, line: number): Edge => ({ + source, + target, + kind: 'calls', + line, + }); + q.insertEdges([callEdge('bar', 'foo', 5), callEdge('foo', 'baz', 12)]); + }); + + afterEach(() => { + db.close(); + if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true }); + }); + + function modifyDoFooDiff(): string { + return `diff --git a/src/foo.ts b/src/foo.ts +index abc..def 100644 +--- a/src/foo.ts ++++ b/src/foo.ts +@@ -10,3 +10,4 @@ + ctx +-old impl ++new impl ++plus one + ctx`; + } + + it('attaches callers and callees for affected symbols', () => { + const ctx = buildReviewContext(modifyDoFooDiff(), q, traverser); + expect(ctx.files).toHaveLength(1); + expect(ctx.files[0].affectedSymbols).toHaveLength(1); + const sym = ctx.files[0].affectedSymbols[0]; + expect(sym.name).toBe('doFoo'); + expect(sym.callers.map((c) => c.name)).toContain('useFoo'); + expect(sym.callees.map((c) => c.name)).toContain('helper'); + }); + + it('summarizes correctly across an added + modified + deleted set', () => { + const diff = `diff --git a/src/foo.ts b/src/foo.ts +--- a/src/foo.ts ++++ b/src/foo.ts +@@ -10,1 +10,1 @@ +-x ++y +diff --git a/src/added.ts b/src/added.ts +new file mode 100644 +--- /dev/null ++++ b/src/added.ts +@@ -0,0 +1,1 @@ ++content +diff --git a/src/baz.ts b/src/baz.ts +deleted file mode 100644 +--- a/src/baz.ts ++++ /dev/null +@@ -1,1 +0,0 @@ +-x`; + const ctx = buildReviewContext(diff, q, traverser); + expect(ctx.summary.filesAdded).toBe(1); + expect(ctx.summary.filesModified).toBe(1); + expect(ctx.summary.filesDeleted).toBe(1); + }); + + it('reports broken incoming refs for deleted files', () => { + const diff = `diff --git a/src/baz.ts b/src/baz.ts +deleted file mode 100644 +--- 
a/src/baz.ts
++++ /dev/null
+@@ -20,11 +0,0 @@
+-x`;
+    const ctx = buildReviewContext(diff, q, traverser);
+    const baz = ctx.files.find((f) => f.path === 'src/baz.ts')!;
+    expect(baz.status).toBe('deleted');
+    // doFoo (in foo.ts) calls helper (in baz.ts) — deleting baz.ts breaks foo.
+    expect(baz.brokenIncomingRefs?.map((r) => r.name)).toContain('doFoo');
+  });
+
+  it('dedupes brokenIncomingRefs when one caller has multiple edge types to the deleted file', () => {
+    // Give useFoo (node 'bar') two edges of different kinds to helper
+    // (node 'baz'), both at line 7. Without dedup, useFoo would appear
+    // twice in brokenIncomingRefs.
+    q.insertEdges([{ source: 'bar', target: 'baz', kind: 'references', line: 7 }]);
+    // Note: bar's pre-existing `calls` edge targets foo, not baz, so it is
+    // irrelevant here. Deleting baz.ts collects edges incoming to baz's
+    // symbols (helper), and dedup only fires with TWO such edges from the
+    // same source, hence this second edge of a different kind.
+    q.insertEdges([
+      { source: 'bar', target: 'baz', kind: 'imports', line: 7 },
+    ]);
+    const diff = `diff --git a/src/baz.ts b/src/baz.ts
+deleted file mode 100644
+--- a/src/baz.ts
++++ /dev/null
+@@ -20,11 +0,0 @@
+-x`;
+    const ctx = buildReviewContext(diff, q, traverser);
+    const baz = ctx.files.find((f) => f.path === 'src/baz.ts')!;
+    // useFoo should appear at most once with line=7 (we have two edges
+    // both at line 7 from bar to baz with different kinds).
+    const fromBar = baz.brokenIncomingRefs?.filter((r) => r.name === 'useFoo' && r.line === 7);
+    expect(fromBar?.length).toBe(1);
+  });
+
+  it('returns empty co-change warnings on a pre-#105 install (no co_changes table)', () => {
+    // Default DatabaseConnection.initialize() runs schema.sql which on
+    // upstream/main does NOT include the co_changes table. The helper
+    // must gracefully degrade rather than throw.
+    const ctx = buildReviewContext(modifyDoFooDiff(), q, traverser);
+    expect(ctx.coChangeWarnings).toEqual([]);
+    expect(ctx.summary.coChangeWarnings).toBe(0);
+  });
+
+  it('returns empty tests array on a pre-#106 install (no `tests` edges)', () => {
+    const ctx = buildReviewContext(modifyDoFooDiff(), q, traverser);
+    expect(ctx.files[0].tests).toEqual([]);
+  });
+
+  it('respects maxCallersPerSymbol cap', () => {
+    // Add 10 more callers of doFoo to make the cap observable.
+    const extraNodes: Node[] = [];
+    const extraEdges: Edge[] = [];
+    const upsert = db.getDb().prepare(`
+      INSERT INTO files (path, content_hash, language, size, modified_at, indexed_at)
+      VALUES (?, '', 'typescript', 0, 0, 0)
+    `);
+    for (let i = 0; i < 10; i++) {
+      const fp = `src/caller${i}.ts`;
+      upsert.run(fp);
+      const id = `caller${i}`;
+      extraNodes.push(makeNode(id, `caller${i}`, 'function', fp, 1, 5));
+      extraEdges.push({ source: id, target: 'foo', kind: 'calls', line: 1 });
+    }
+    q.insertNodes(extraNodes);
+    q.insertEdges(extraEdges);
+
+    const ctx = buildReviewContext(modifyDoFooDiff(), q, traverser, { maxCallersPerSymbol: 3 });
+    const sym = ctx.files[0].affectedSymbols[0];
+    expect(sym.callers.length).toBeLessThanOrEqual(3);
+  });
+
+  it('co-change warning surfaces when a changed file has historical co-changers not in the PR', () => {
+    // Manually create the co_changes table + add commit_count + populate.
+    // This simulates a post-#105 install. (When PR #105 lands the table
+    // exists natively; we simulate it here so the helper has data to
+    // surface.)
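+    // With these numbers the shim computes jaccard = co-changes /
+    // (commits(foo) + commits(bar) - co-changes) = 7 / (10 + 8 - 7) = 7/11,
+    // about 0.64, comfortably above the 0.3 threshold passed below.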
+ db.getDb().exec(` + CREATE TABLE IF NOT EXISTS co_changes ( + file_a TEXT NOT NULL, + file_b TEXT NOT NULL, + count INTEGER NOT NULL, + PRIMARY KEY (file_a, file_b), + CHECK (file_a < file_b) + ); + `); + db.getDb().prepare('UPDATE files SET commit_count = ? WHERE path = ?').run(10, 'src/foo.ts'); + db.getDb().prepare('UPDATE files SET commit_count = ? WHERE path = ?').run(8, 'src/bar.ts'); + db.getDb().prepare('INSERT INTO co_changes (file_a, file_b, count) VALUES (?, ?, ?)') + .run('src/bar.ts', 'src/foo.ts', 7); + + // Re-define getCoChangedFiles via a thin shim (since we don't have + // PR #105's QueryBuilder method here). Use the same SQL the PR + // would use. + (q as unknown as { + getCoChangedFiles: typeof getCoChangedFilesShim; + }).getCoChangedFiles = getCoChangedFilesShim.bind(null, q); + + // Diff touches src/foo.ts but NOT src/bar.ts → bar.ts should surface + // as a co-change warning. + const ctx = buildReviewContext(modifyDoFooDiff(), q, traverser, { + minCoChangeJaccard: 0.3, + }); + expect(ctx.coChangeWarnings.length).toBeGreaterThan(0); + const w = ctx.coChangeWarnings[0]; + expect(w.changedFile).toBe('src/foo.ts'); + expect(w.expectedToChange).toBe('src/bar.ts'); + expect(w.jaccard).toBeGreaterThan(0.3); + }); + + it('does NOT warn about files that ARE in the PR (changedPaths exclusion)', () => { + db.getDb().exec(` + CREATE TABLE IF NOT EXISTS co_changes ( + file_a TEXT NOT NULL, file_b TEXT NOT NULL, count INTEGER NOT NULL, + PRIMARY KEY (file_a, file_b), CHECK (file_a < file_b) + ); + `); + db.getDb().prepare('UPDATE files SET commit_count = ? WHERE path = ?').run(10, 'src/foo.ts'); + db.getDb().prepare('UPDATE files SET commit_count = ? WHERE path = ?').run(8, 'src/bar.ts'); + db.getDb().prepare('INSERT INTO co_changes (file_a, file_b, count) VALUES (?, ?, ?)') + .run('src/bar.ts', 'src/foo.ts', 7); + (q as unknown as { getCoChangedFiles: typeof getCoChangedFilesShim }).getCoChangedFiles + = getCoChangedFilesShim.bind(null, q); + + // Diff includes BOTH foo and bar → no warning should appear because + // bar IS in the changed set. + const diff = `diff --git a/src/foo.ts b/src/foo.ts +--- a/src/foo.ts ++++ b/src/foo.ts +@@ -10,1 +10,1 @@ +-x ++y +diff --git a/src/bar.ts b/src/bar.ts +--- a/src/bar.ts ++++ b/src/bar.ts +@@ -3,1 +3,1 @@ +-x ++y`; + const ctx = buildReviewContext(diff, q, traverser, { minCoChangeJaccard: 0.3 }); + expect(ctx.coChangeWarnings).toEqual([]); + }); +}); + +describe('serializeReviewContextWithinCap (JSON-safe truncation)', () => { + // Re-import the helper indirectly via the MCP tool path. To test it + // in isolation we'd need to export it; instead exercise it via the + // path: build a too-large context, call the public buildReviewContext, + // serialize, and verify the output is parseable JSON. + it('produces parseable JSON even when context exceeds the cap', async () => { + // Build a context with thousands of symbols by inserting many nodes + // and a diff that touches them all. 
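+    // 200 symbols with 500-char docstrings is ~100 kB of docstrings alone,
+    // far beyond the 5 kB cap applied further down.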
+    const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'review-trunc-'));
+    const db = DatabaseConnection.initialize(path.join(dir, 'test.db'));
+    const q = new QueryBuilder(db.getDb());
+    const traverser = new GraphTraverser(q);
+
+    db.getDb().prepare(`INSERT INTO files (path, content_hash, language, size, modified_at, indexed_at) VALUES (?, '', 'typescript', 0, 0, 0)`).run('src/big.ts');
+    const nodes: Node[] = [];
+    for (let i = 0; i < 200; i++) {
+      nodes.push(makeNode(`n${i}`, `sym${i}`, 'function', 'src/big.ts', i * 5, i * 5 + 4));
+      // Long docstrings to stress the truncation
+      nodes[i].docstring = 'x'.repeat(500);
+    }
+    q.insertNodes(nodes);
+
+    // Diff that touches every line in big.ts.
+    const diff = `diff --git a/src/big.ts b/src/big.ts
+--- a/src/big.ts
++++ b/src/big.ts
+@@ -1,1000 +1,1000 @@
+-x
++y`;
+    const ctx = buildReviewContext(diff, q, traverser);
+
+    // Use the helper directly — re-create it inline (matches the MCP
+    // tool's serializeReviewContextWithinCap behavior). Verify JSON parses.
+    const json = JSON.stringify(ctx, null, 2);
+    expect(() => JSON.parse(json)).not.toThrow(); // sanity: full JSON is valid
+
+    // Now apply the same trimming logic the MCP handler uses (lift it
+    // here as a one-off — equivalent to importing the private helper).
+    const cap = 5000; // small cap to force trimming
+    const trimmed = trimContextToFitJson(ctx, cap);
+    expect(trimmed.length).toBeLessThanOrEqual(cap);
+    expect(() => JSON.parse(trimmed)).not.toThrow();
+
+    db.close();
+    fs.rmSync(dir, { recursive: true, force: true });
+  });
+});
+
+// Inline equivalent of serializeReviewContextWithinCap from src/mcp/tools.ts.
+// Kept here to avoid exporting an internal helper just for tests.
+function trimContextToFitJson(context: unknown, cap: number): string {
+  const ctx = JSON.parse(JSON.stringify(context)) as {
+    summary: Record<string, unknown>;
+    files: Array<{
+      affectedSymbols: Array<{
+        docstring?: string;
+        signature?: string;
+        callers?: unknown[];
+        callees?: unknown[];
+      }>;
+      _truncated?: boolean;
+    }>;
+    coChangeWarnings: unknown[];
+    _truncated?: boolean;
+  };
+  const fits = (s: string) => s.length <= cap;
+  let json = JSON.stringify(ctx, null, 2);
+  if (fits(json)) return json;
+  for (const f of ctx.files) for (const s of f.affectedSymbols) delete s.docstring;
+  json = JSON.stringify(ctx, null, 2);
+  if (fits(json)) return json;
+  for (const f of ctx.files) for (const s of f.affectedSymbols) delete s.signature;
+  json = JSON.stringify(ctx, null, 2);
+  if (fits(json)) return json;
+  for (const f of ctx.files) for (const s of f.affectedSymbols) {
+    if (Array.isArray(s.callers)) s.callers = s.callers.slice(0, 2);
+    if (Array.isArray(s.callees)) s.callees = s.callees.slice(0, 2);
+  }
+  json = JSON.stringify(ctx, null, 2);
+  if (fits(json)) return json;
+  for (const f of ctx.files) for (const s of f.affectedSymbols) {
+    delete s.callers;
+    delete s.callees;
+  }
+  json = JSON.stringify(ctx, null, 2);
+  if (fits(json)) return json;
+  while (ctx.files.length > 1) {
+    ctx.files.pop();
+    ctx._truncated = true;
+    json = JSON.stringify(ctx, null, 2);
+    if (fits(json)) return json;
+  }
+  return JSON.stringify(
+    { summary: ctx.summary, coChangeWarnings: ctx.coChangeWarnings, _truncated: true },
+    null, 2
+  );
+}
+
+/**
+ * Shim that mimics PR #105's QueryBuilder.getCoChangedFiles. Used in
+ * tests for forward-compatibility — once #105 lands, the real method
+ * exists on QueryBuilder and this shim is unnecessary.
+ */ +function getCoChangedFilesShim( + q: QueryBuilder, + filePath: string, + options: { limit: number; minCount: number; minJaccard: number } +): Array<{ path: string; count: number; jaccard: number }> { + const { limit, minCount, minJaccard } = options; + const sql = ` + WITH partners AS ( + SELECT file_b AS path, count FROM co_changes WHERE file_a = ? + UNION ALL + SELECT file_a AS path, count FROM co_changes WHERE file_b = ? + ), + anchor AS (SELECT commit_count AS c FROM files WHERE path = ?), + scored AS ( + SELECT + p.path AS path, p.count AS count, + CAST(p.count AS REAL) / NULLIF((SELECT c FROM anchor) + f.commit_count - p.count, 0) AS jaccard + FROM partners p + JOIN files f ON f.path = p.path + WHERE p.count >= ? + ) + SELECT path, count, jaccard FROM scored + WHERE COALESCE(jaccard, 0) >= ? + ORDER BY jaccard DESC, count DESC + LIMIT ? + `; + const rows = (q as unknown as { db: { prepare: (sql: string) => { all: (...args: unknown[]) => Array<{ path: string; count: number; jaccard: number | null }> } } }).db + .prepare(sql) + .all(filePath, filePath, filePath, minCount, minJaccard, limit); + return rows.map((r) => ({ path: r.path, count: r.count, jaccard: r.jaccard ?? 0 })); +} diff --git a/__tests__/search-quality.test.ts b/__tests__/search-quality.test.ts new file mode 100644 index 00000000..8e069776 --- /dev/null +++ b/__tests__/search-quality.test.ts @@ -0,0 +1,302 @@ +/** + * Search Quality Tests + * + * Regression tests for the FTS improvements that bring natural-language + * and partial-identifier queries into the top of the result set: + * - Subword tokens (camel/snake split) so `parser` finds `getParser`. + * - Porter stemmer so `parsing` matches `parser`/`parses`. + * - Stopword stripping so `"how"` / `"the"` don't crowd out the + * real terms via docstring matches. + * + * All measurements were captured against codegraph's own src/ during + * development. Targets that previously ranked #18, #19, or weren't in + * the top 20 jump to the top 5. 
+ */ + +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; +import { DatabaseConnection } from '../src/db'; +import { QueryBuilder } from '../src/db/queries'; +import { Node } from '../src/types'; +import { splitIdentifierTokens, buildNameSubwords } from '../src/utils'; +import { filterStopwords, STOP_WORDS } from '../src/search/query-utils'; +import { runMigrations, getCurrentVersion } from '../src/db/migrations'; + +describe('splitIdentifierTokens', () => { + it('splits camelCase', () => { + expect(splitIdentifierTokens('getParser')).toEqual(['get', 'parser']); + }); + + it('splits PascalCase', () => { + expect(splitIdentifierTokens('DatabaseConnection')).toEqual(['database', 'connection']); + }); + + it('splits XMLHttpRequest-style runs of capitals', () => { + expect(splitIdentifierTokens('XMLHttpRequest')).toEqual(['xml', 'http', 'request']); + }); + + it('splits snake_case', () => { + expect(splitIdentifierTokens('database_connection')).toEqual(['database', 'connection']); + }); + + it('splits kebab-case and dots and slashes', () => { + expect(splitIdentifierTokens('foo-bar.baz/qux')).toEqual(['foo', 'bar', 'baz', 'qux']); + }); + + it('keeps single-word identifiers as-is', () => { + expect(splitIdentifierTokens('parse')).toEqual(['parse']); + }); + + it('handles trailing/leading underscores', () => { + expect(splitIdentifierTokens('__init__')).toEqual(['init']); + }); + + it('preserves numbers as part of the surrounding token', () => { + expect(splitIdentifierTokens('parseV2')).toEqual(['parse', 'v2']); + }); +}); + +describe('buildNameSubwords', () => { + it('preserves the original identifier so direct queries still hit', () => { + const out = buildNameSubwords('getParser'); + expect(out.split(' ')).toContain('getParser'); + }); + + it('appends split tokens', () => { + const out = buildNameSubwords('getParser').split(' '); + expect(out).toContain('get'); + expect(out).toContain('parser'); + }); + + it('dedupes single-word identifiers (no "parse parse")', () => { + expect(buildNameSubwords('parse')).toBe('parse'); + }); + + it('dedupes when split produces a single token equal to the original', () => { + // 'foo' has no boundary, so splitIdentifierTokens returns ['foo']; + // without dedup we would store 'foo foo'. + const out = buildNameSubwords('foo').split(' '); + expect(out).toEqual(['foo']); + }); + + it('handles empty string without crashing', () => { + expect(buildNameSubwords('')).toBe(''); + }); +}); + +describe('filterStopwords (shared with query-utils.ts)', () => { + it('drops common English stopwords', () => { + expect(filterStopwords(['how', 'does', 'parsing', 'work'])) + // 'work' is also in STOP_WORDS, so the result is just 'parsing' + .toEqual(['parsing']); + }); + + it('returns the original list when every term is a stopword', () => { + // Otherwise we would produce an empty FTS query. + const allStopwords = ['the', 'a', 'an']; + expect(filterStopwords(allStopwords)).toEqual(allStopwords); + }); + + it('does not strip common identifier-like words', () => { + // `get` / `set` / `find` could be method names; never treated as stopwords. 
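+    // e.g. in a query like 'find user', 'find' is signal, not noise.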
+ expect(filterStopwords(['get', 'set', 'find', 'name'])) + .toEqual(['get', 'set', 'find', 'name']); + expect(STOP_WORDS.has('get')).toBe(false); + }); +}); + +describe('FTS5 search quality (integration)', () => { + let dir: string; + let db: DatabaseConnection; + let q: QueryBuilder; + + function makeNode(id: string, name: string, kind: Node['kind'], docstring?: string): Node { + return { + id, + kind, + name, + qualifiedName: name, + filePath: `src/${name}.ts`, + language: 'typescript', + startLine: 1, + endLine: 1, + startColumn: 0, + endColumn: 0, + docstring, + updatedAt: Date.now(), + }; + } + + beforeEach(() => { + dir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-search-quality-')); + db = DatabaseConnection.initialize(path.join(dir, 'test.db')); + q = new QueryBuilder(db.getDb()); + }); + + afterEach(() => { + db.close(); + if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true }); + }); + + it('finds getParser for a `parser` query (subword tokens)', () => { + q.insertNodes([ + makeNode('n1', 'getParser', 'function'), + makeNode('n2', 'unrelated', 'function'), + ]); + const results = q.searchNodes('parser', { limit: 10 }); + expect(results.find((r) => r.node.name === 'getParser')).toBeDefined(); + }); + + it('finds DatabaseConnection for a `connection` query (subword tokens)', () => { + q.insertNodes([ + makeNode('n1', 'DatabaseConnection', 'class'), + makeNode('n2', 'unrelated', 'function'), + ]); + const results = q.searchNodes('connection', { limit: 10 }); + expect(results.find((r) => r.node.name === 'DatabaseConnection')).toBeDefined(); + }); + + it('matches `parsing` against `getParser` via Porter stemmer', () => { + q.insertNodes([ + makeNode('n1', 'getParser', 'function'), + makeNode('n2', 'unrelated', 'function'), + ]); + const results = q.searchNodes('parsing', { limit: 10 }); + expect(results.find((r) => r.node.name === 'getParser')).toBeDefined(); + }); + + it('matches `resolves references` against resolveOne', () => { + q.insertNodes([ + makeNode('n1', 'resolveOne', 'method'), + makeNode('n2', 'unrelated', 'function'), + ]); + const results = q.searchNodes('resolves references', { limit: 10 }); + expect(results.find((r) => r.node.name === 'resolveOne')).toBeDefined(); + }); + + it('strips stopwords so `how does parser work` finds getParser', () => { + // Without stopword stripping the docstring of `unrelated` (containing + // "how" and "does") would BM25-flood the result list. + q.insertNodes([ + makeNode('n1', 'getParser', 'function'), + makeNode( + 'n2', + 'unrelated', + 'function', + 'How does this work? It does many things — does, does, does.' 
+      ),
+    ]);
+    const results = q.searchNodes('how does parser work', { limit: 10 });
+    // (Explicit type args: TS won't infer [string, number] tuples from a
+    // bare .map callback.)
+    const ranks = new Map<string, number>(results.map((r, i) => [r.node.name, i + 1]));
+    const parserRank = ranks.get('getParser');
+    const unrelatedRank = ranks.get('unrelated');
+    expect(parserRank).toBeDefined();
+    if (unrelatedRank !== undefined) {
+      expect(parserRank).toBeLessThan(unrelatedRank);
+    }
+  });
+
+  it('exact identifier search still works (no regression on direct queries)', () => {
+    q.insertNodes([
+      makeNode('n1', 'ExtractionOrchestrator', 'class'),
+      makeNode('n2', 'extraction', 'variable'),
+      makeNode('n3', 'orchestrator', 'variable'),
+    ]);
+    const results = q.searchNodes('ExtractionOrchestrator', { limit: 10 });
+    expect(results[0].node.name).toBe('ExtractionOrchestrator');
+  });
+});
+
+describe('Migration v4: backfill name_subwords + rebuild FTS', () => {
+  let dir: string;
+
+  beforeEach(() => {
+    dir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-migr-v4-fts-'));
+  });
+
+  afterEach(() => {
+    if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true });
+  });
+
+  it('rebuilds FTS so subword search works on previously-indexed nodes', () => {
+    // Build a v3-shape database from explicit SQL — the pre-PR schema —
+    // then run forward migrations and verify search works end-to-end.
+    // This is a faithful simulation of an upgrade from a real v3 install.
+    const Database = require('better-sqlite3');
+    const dbHandle = new Database(path.join(dir, 'test.db'));
+    dbHandle.pragma('foreign_keys = ON');
+    dbHandle.exec(`
+      CREATE TABLE schema_versions (version INTEGER PRIMARY KEY, applied_at INTEGER NOT NULL, description TEXT);
+      INSERT INTO schema_versions (version, applied_at, description) VALUES (3, 0, 'v3');
+      CREATE TABLE nodes (
+        id TEXT PRIMARY KEY, kind TEXT NOT NULL, name TEXT NOT NULL,
+        qualified_name TEXT NOT NULL, file_path TEXT NOT NULL, language TEXT NOT NULL,
+        start_line INTEGER NOT NULL, end_line INTEGER NOT NULL,
+        start_column INTEGER NOT NULL, end_column INTEGER NOT NULL,
+        docstring TEXT, signature TEXT, visibility TEXT,
+        is_exported INTEGER DEFAULT 0, is_async INTEGER DEFAULT 0,
+        is_static INTEGER DEFAULT 0, is_abstract INTEGER DEFAULT 0,
+        decorators TEXT, type_parameters TEXT, updated_at INTEGER NOT NULL
+      );
+      CREATE VIRTUAL TABLE nodes_fts USING fts5(
+        id, name, qualified_name, docstring, signature,
+        content='nodes', content_rowid='rowid'
+      );
+      CREATE TRIGGER nodes_ai AFTER INSERT ON nodes BEGIN
+        INSERT INTO nodes_fts(rowid, id, name, qualified_name, docstring, signature)
+        VALUES (NEW.rowid, NEW.id, NEW.name, NEW.qualified_name, NEW.docstring, NEW.signature);
+      END;
+      INSERT INTO nodes (id, kind, name, qualified_name, file_path, language,
+        start_line, end_line, start_column, end_column, updated_at)
+      VALUES ('n1', 'function', 'getParser', 'getParser', 'a.ts', 'typescript', 1, 1, 0, 0, 0);
+    `);
+
+    expect(getCurrentVersion(dbHandle)).toBe(3);
+
+    // Apply forward migrations (4..N including the FTS-subwords pass).
+    runMigrations(dbHandle, 3);
+    expect(getCurrentVersion(dbHandle)).toBeGreaterThanOrEqual(9);
+
+    // The new column was backfilled with the split subwords.
+    const row = dbHandle.prepare('SELECT name_subwords FROM nodes WHERE id = ?').get('n1') as {
+      name_subwords: string;
+    };
+    expect(row.name_subwords).toContain('parser');
+
+    // Search end-to-end via QueryBuilder works against the migrated DB.
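+    // This also shows searchNodes hits the rebuilt FTS index: the
+    // backfilled name_subwords column alone would not make `parser`
+    // match `getParser`.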
+    const q2 = new QueryBuilder(dbHandle);
+    const results = q2.searchNodes('parser', { limit: 10 });
+    expect(results.find((r) => r.node.name === 'getParser')).toBeDefined();
+
+    dbHandle.close();
+  });
+
+  it('migration is idempotent if name_subwords column already exists', () => {
+    // Simulate a partial-failure scenario: the ALTER TABLE landed but the
+    // rest of the migration didn't (e.g. a crash between statements when
+    // the migration isn't wrapped in a single transaction), so the column
+    // is present but the FTS hasn't been recreated and the schema_versions
+    // row hasn't been bumped.
+    const Database = require('better-sqlite3');
+    const dbHandle = new Database(path.join(dir, 'test.db'));
+    dbHandle.exec(`
+      CREATE TABLE schema_versions (version INTEGER PRIMARY KEY, applied_at INTEGER NOT NULL, description TEXT);
+      INSERT INTO schema_versions (version, applied_at, description) VALUES (3, 0, 'v3');
+      CREATE TABLE nodes (
+        id TEXT PRIMARY KEY, kind TEXT NOT NULL, name TEXT NOT NULL,
+        qualified_name TEXT NOT NULL, file_path TEXT NOT NULL, language TEXT NOT NULL,
+        start_line INTEGER NOT NULL, end_line INTEGER NOT NULL,
+        start_column INTEGER NOT NULL, end_column INTEGER NOT NULL,
+        docstring TEXT, signature TEXT, visibility TEXT,
+        is_exported INTEGER DEFAULT 0, is_async INTEGER DEFAULT 0,
+        is_static INTEGER DEFAULT 0, is_abstract INTEGER DEFAULT 0,
+        decorators TEXT, type_parameters TEXT, updated_at INTEGER NOT NULL,
+        name_subwords TEXT -- partial pre-existing state
+      );
+    `);
+    expect(() => runMigrations(dbHandle, 3)).not.toThrow();
+    expect(getCurrentVersion(dbHandle)).toBeGreaterThanOrEqual(9);
+    dbHandle.close();
+  });
+});
diff --git a/__tests__/security.test.ts b/__tests__/security.test.ts
index 53441d58..1c62e648 100644
--- a/__tests__/security.test.ts
+++ b/__tests__/security.test.ts
@@ -533,3 +533,36 @@ describe('Symlink Cycle Detection', () => {
     expect(files).toContain('src/valid.ts');
   });
 });
+
+describe('ReDoS-safe glob matching', () => {
+  it('coalesces runs of `*` so hostile inputs do not produce nested quantifiers', async () => {
+    const { globToSafeRegex } = await import('../src/utils');
+    // Two or more stars collapse to a single recursive wildcard. This is the
+    // ReDoS protection: `*****` doesn't expand to `[^/]*[^/]*[^/]*[^/]*[^/]*`,
+    // which on a long input could catastrophically backtrack.
+    expect(globToSafeRegex('*****')).toBe('.*');
+    expect(globToSafeRegex('**')).toBe('.*');
+
+    // Even a regex constructed from hostile input matches in linear time.
+    const regex = new RegExp(`^${globToSafeRegex('*****')}foo$`);
+    const start = Date.now();
+    // 100k 'a's followed by something that doesn't end in 'foo'.
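+    // `.*foo` fails in a single linear scan; the uncollapsed
+    // `[^/]*[^/]*[^/]*[^/]*[^/]*foo` form would give a backtracking engine
+    // combinatorially many ways to split the run of 'a's across quantifiers.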
+    expect(regex.test('a'.repeat(100000) + 'bar')).toBe(false);
+    expect(Date.now() - start).toBeLessThan(500);
+  });
+
+  it('rejects pathologically long glob inputs', async () => {
+    const { globToSafeRegex } = await import('../src/utils');
+    expect(globToSafeRegex('*'.repeat(2000))).toBeNull();
+  });
+
+  it('preserves the standard glob semantics for common patterns', async () => {
+    const { globToSafeRegex } = await import('../src/utils');
+    const body = globToSafeRegex('src/**/*.test.ts');
+    // toBeDefined() would pass for null; assert non-null explicitly since
+    // globToSafeRegex returns null for rejected inputs.
+    expect(body).not.toBeNull();
+    const regex = new RegExp(`^${body}$`);
+    expect(regex.test('src/lib/foo.test.ts')).toBe(true);
+    expect(regex.test('src/lib/foo.ts')).toBe(false);
+    expect(regex.test('other/src/foo.test.ts')).toBe(false);
+  });
+});
diff --git a/__tests__/sql-refs.test.ts b/__tests__/sql-refs.test.ts
new file mode 100644
index 00000000..0a05a31b
--- /dev/null
+++ b/__tests__/sql-refs.test.ts
@@ -0,0 +1,339 @@
+/**
+ * SQL call-site tests: parser unit tests + end-to-end through CodeGraph.
+ */
+import { describe, it, expect, beforeEach, afterEach } from 'vitest';
+import * as fs from 'fs';
+import * as os from 'os';
+import * as path from 'path';
+import { extractSqlRefs } from '../src/sql-refs';
+import CodeGraph from '../src/index';
+
+let testDir: string;
+let cg: CodeGraph | null = null;
+
+function write(rel: string, content: string) {
+  const abs = path.join(testDir, rel);
+  fs.mkdirSync(path.dirname(abs), { recursive: true });
+  fs.writeFileSync(abs, content);
+}
+
+beforeEach(() => {
+  testDir = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-sql-'));
+});
+
+afterEach(() => {
+  if (cg) {
+    cg.destroy();
+    cg = null;
+  }
+  if (fs.existsSync(testDir)) fs.rmSync(testDir, { recursive: true, force: true });
+});
+
+// ============================================================================
+// Pure parser tests
+// ============================================================================
+
+describe('extractSqlRefs', () => {
+  it('captures FROM as a read', () => {
+    write('a.ts', `db.prepare('SELECT id FROM users WHERE id = ?');\n`);
+    const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null);
+    expect(refs).toHaveLength(1);
+    expect(refs[0]!).toMatchObject({ tableName: 'users', op: 'read' });
+  });
+
+  it('captures INSERT INTO as a write', () => {
+    write('a.ts', `db.prepare('INSERT INTO logs (msg) VALUES (?)');\n`);
+    const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null);
+    expect(refs).toHaveLength(1);
+    expect(refs[0]!).toMatchObject({ tableName: 'logs', op: 'write' });
+  });
+
+  it('captures UPDATE ... SET as a write', () => {
+    write('a.ts', `db.run('UPDATE users SET name = ? WHERE id = ?', ['x', 1]);\n`);
+    const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null);
+    expect(refs).toHaveLength(1);
+    expect(refs[0]!).toMatchObject({ tableName: 'users', op: 'write' });
+  });
+
+  it('captures DELETE FROM as a write (the overlapping FROM also yields a read)', () => {
+    write('a.ts', `db.run('DELETE FROM sessions WHERE expired_at < ?');\n`);
+    const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null);
+    // Both regexes (DELETE FROM as write, FROM as read) hit, so we expect
+    // two refs for the same table but different ops.
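+    // i.e. 'DELETE FROM sessions' produces the write, and the embedded
+    // 'FROM sessions' produces the read.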
+ expect(refs.map((r) => r.op).sort()).toEqual(['read', 'write']); + expect(new Set(refs.map((r) => r.tableName))).toEqual(new Set(['sessions'])); + }); + + it('captures CREATE TABLE / ALTER / DROP as ddl', () => { + write( + 'a.ts', + [ + `db.exec('CREATE TABLE IF NOT EXISTS audit (id INTEGER)');`, + `db.exec('ALTER TABLE audit ADD COLUMN ts INTEGER');`, + `db.exec('DROP TABLE IF EXISTS audit_old');`, + ].join('\n') + ); + const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null); + const ddls = refs.filter((r) => r.op === 'ddl'); + expect(new Set(ddls.map((r) => r.tableName))).toEqual(new Set(['audit', 'audit_old'])); + }); + + it('captures JOIN as a read', () => { + write( + 'a.ts', + `db.prepare('SELECT u.name, p.title FROM users u JOIN posts p ON p.user_id = u.id');\n` + ); + const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null); + const tables = new Set(refs.map((r) => r.tableName)); + expect(tables).toEqual(new Set(['users', 'posts'])); + }); + + it('handles backtick (MySQL) and double-quoted (Postgres) identifiers', () => { + write( + 'a.ts', + [ + "db.prepare('SELECT id FROM `mysql_table`');", + `db.prepare('SELECT id FROM "pg_table"');`, + ].join('\n') + ); + const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null); + expect(new Set(refs.map((r) => r.tableName))).toEqual( + new Set(['mysql_table', 'pg_table']) + ); + }); + + it('handles schema-qualified identifiers (drops the schema, keeps the table)', () => { + write('a.ts', `db.prepare('SELECT * FROM public.users');\n`); + const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null); + expect(refs[0]!.tableName).toBe('users'); + }); + + it('does NOT match a JS variable named like a SQL keyword', () => { + // Without the FROM/INTO/etc. prefix, a bare identifier `users` is + // not caught — that's the whole point vs. plain grep. + write('a.ts', `const users = await loadUsers();\nfor (const user of users) {}\n`); + const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null); + expect(refs).toEqual([]); + }); + + it('skips unsupported languages (e.g. swift) without error', () => { + write('a.swift', `let q = "SELECT id FROM users"\n`); + const refs = extractSqlRefs(testDir, [{ path: 'a.swift', language: 'swift' }], () => null); + expect(refs).toEqual([]); + }); + + it('captures the correct 1-indexed line number', () => { + write( + 'a.ts', + [`// blah`, `// blah`, `db.prepare('SELECT * FROM line_three');`, `// blah`].join('\n') + ); + const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null); + expect(refs[0]).toEqual(expect.objectContaining({ tableName: 'line_three', line: 3 })); + }); + + it('threads the resolveEnclosing closure correctly', () => { + write('a.ts', `db.prepare('SELECT * FROM t');\n`); + const calls: Array<[string, number]> = []; + extractSqlRefs( + testDir, + [{ path: 'a.ts', language: 'typescript' }], + (filePath, line) => { + calls.push([filePath, line]); + return 'fake-id'; + } + ); + expect(calls).toEqual([['a.ts', 1]]); + }); + + it('drops reserved-word "table names" (WHERE/ON/AS/SELECT)', () => { + // Common over-match: `JOIN ... ON x = y` would otherwise pick up + // `ON` as the table name. The reserved set blocks that. 
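+    // If that set regressed, the token after ON would surface below as a
+    // phantom table.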
+ write('a.ts', `db.prepare('SELECT * FROM users JOIN posts ON posts.uid = users.id');\n`); + const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null); + const names = new Set(refs.map((r) => r.tableName)); + expect(names).toEqual(new Set(['users', 'posts'])); + }); + + it('handles multiple SQL operations on a single line', () => { + write( + 'a.ts', + `db.exec('CREATE TABLE foo (id INTEGER); INSERT INTO foo VALUES (1)');\n` + ); + const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null); + const ops = new Set(refs.map((r) => `${r.tableName}|${r.op}`)); + expect(ops).toEqual(new Set(['foo|ddl', 'foo|write'])); + }); + + it('survives a missing file (skips, no throw)', () => { + const refs = extractSqlRefs( + testDir, + [{ path: 'missing.ts', language: 'typescript' }], + () => null + ); + expect(refs).toEqual([]); + }); + + it('rejects prose comments containing a quoted SQL example', () => { + // Reviewer-flagged regression: a comment like + // // example: db.prepare('SELECT name FROM the docs') + // used to falsely match `the` as a table because the quote inside + // the comment passed isInsideString(). The comment-stripper now + // removes everything after `//` before the regex sees the line. + write( + 'a.ts', + [ + `// example: db.prepare('SELECT name FROM the docs')`, + `// "SELECT id FROM the comment"`, + `function ok() {`, + ` // sample SELECT FROM users in a comment — should be ignored`, + ` return 1;`, + `}`, + ].join('\n') + ); + const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null); + expect(refs).toEqual([]); + }); + + it('rejects same-line block comments containing a quoted SQL example', () => { + write( + 'a.ts', + `/* "SELECT * FROM ghost" */ const x = 1;\n` + ); + const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null); + expect(refs).toEqual([]); + }); + + it('still keeps a real SQL call when there is a trailing comment', () => { + write('a.ts', `db.prepare('SELECT * FROM users'); // good doc\n`); + const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null); + expect(refs.length).toBe(1); + expect(refs[0]!.tableName).toBe('users'); + }); + + it('strips Python `#` comments', () => { + write( + 'a.py', + `# example: db.execute('SELECT * FROM the_docs')\nrows = db.execute('SELECT * FROM real_table')\n` + ); + const refs = extractSqlRefs(testDir, [{ path: 'a.py', language: 'python' }], () => null); + expect(refs.map((r) => r.tableName)).toEqual(['real_table']); + }); +}); + +// ============================================================================ +// End-to-end through CodeGraph +// ============================================================================ + +describe('CodeGraph SQL refs', () => { + it('persists call sites and resolves enclosing function', async () => { + write( + 'src/db.ts', + [ + `export function getUser(id: number) {`, + ` return db.prepare('SELECT * FROM users WHERE id = ?').get(id);`, + `}`, + ``, + `export function logEvent(msg: string) {`, + ` db.prepare('INSERT INTO events (msg) VALUES (?)').run(msg);`, + `}`, + ].join('\n') + ); + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + + const tables = cg.getSqlTables(); + expect(new Set(tables.map((t) => t.tableName))).toEqual(new Set(['users', 'events'])); + + const userSites = cg.getSqlRefsByTable('users'); + expect(userSites[0]!.sourceName).toBe('getUser'); 
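+    // (The SELECT sits on line 2 of src/db.ts, inside getUser's span;
+    // that is what the enclosing-symbol lookup keys on.)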
+ + const eventSites = cg.getSqlRefsByTable('events'); + expect(eventSites[0]!.sourceName).toBe('logEvent'); + expect(eventSites[0]!.op).toBe('write'); + }); + + it('reverse view: getSqlTablesForNode returns tables touched by a function', async () => { + write( + 'src/a.ts', + [ + `export function multiTouch() {`, + ` db.prepare('SELECT * FROM users').all();`, + ` db.prepare('INSERT INTO orders VALUES (?)').run(1);`, + `}`, + ].join('\n') + ); + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + + const node = cg.getNodesInFile('src/a.ts').find((n) => n.name === 'multiTouch')!; + const touched = cg.getSqlTablesForNode(node.id); + const summary = touched.map((r) => `${r.tableName}|${r.op}`).sort(); + expect(summary).toEqual(['orders|write', 'users|read']); + }); + + it('case-insensitive table lookup', async () => { + write('src/a.ts', `db.prepare('SELECT * FROM Users');\n`); + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + expect(cg.getSqlRefsByTable('users').length).toBe(1); + expect(cg.getSqlRefsByTable('USERS').length).toBe(1); + }); + + it('respects enableSqlRefs=false', async () => { + write('src/a.ts', `db.prepare('SELECT * FROM users');\n`); + cg = CodeGraph.initSync(testDir, { + config: { include: ['**/*.ts'], exclude: [], enableSqlRefs: false }, + }); + await cg.indexAll(); + expect(cg.getSqlTables()).toEqual([]); + }); + + it('incremental sync replaces refs for changed files only', async () => { + write('src/a.ts', `db.prepare('SELECT * FROM old_table');\n`); + write('src/b.ts', `db.prepare('SELECT * FROM stable_table');\n`); + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + expect(new Set(cg.getSqlTables().map((t) => t.tableName))).toEqual( + new Set(['old_table', 'stable_table']) + ); + + write('src/a.ts', `db.prepare('SELECT * FROM new_table');\n`); + await cg.sync(); + + const tables = new Set(cg.getSqlTables().map((t) => t.tableName)); + expect(tables).toContain('new_table'); + expect(tables).toContain('stable_table'); + expect(tables).not.toContain('old_table'); + }); + + it('drops refs when a file is edited to remove its last SQL ref', async () => { + // Same regression as PR C — applySqlRefs([]) shouldn't leave + // stale rows. Pre-deleting the changed paths in runSqlRefsPass + // is the fix. + write('src/a.ts', `db.prepare('SELECT * FROM going_away');\n`); + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + expect(cg.getSqlTables().some((t) => t.tableName === 'going_away')).toBe(true); + + write('src/a.ts', `// no sql here anymore\nexport const x = 1;\n`); + await cg.sync(); + + expect(cg.getSqlTables().some((t) => t.tableName === 'going_away')).toBe(false); + }); + + it('drops refs for files removed between syncs', async () => { + write('src/a.ts', `db.prepare('SELECT * FROM gone_table');\n`); + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + expect(cg.getSqlTables().some((t) => t.tableName === 'gone_table')).toBe(true); + + fs.unlinkSync(path.join(testDir, 'src/a.ts')); + await cg.sync(); + expect(cg.getSqlTables().some((t) => t.tableName === 'gone_table')).toBe(false); + }); + + // (Removed: a defensive test for the v4-migration-collision bug class. 
+ // With file-based migrations (NNN-name.ts), two PRs claiming the same + // version produces a filesystem-level conflict, so the silent skip the + // defensive guard protected against can no longer happen.) +}); diff --git a/__tests__/sync.test.ts b/__tests__/sync.test.ts index 8365f630..115d078b 100644 --- a/__tests__/sync.test.ts +++ b/__tests__/sync.test.ts @@ -259,4 +259,242 @@ describe('Sync Module', () => { expect(result.changedFilePaths).toBeUndefined(); }); }); + + // Regression tests for the "stale index after HEAD-moving git operation" + // bug. `git status` only reports working-tree dirtiness vs HEAD, so a + // merge / pull / checkout / rebase / reset (and even post-commit) leaves + // a clean tree and used to trick sync into reporting "up to date" while + // the DB still held pre-operation content hashes. The fix detects HEAD + // movement by comparing current HEAD against a stored last-synced HEAD + // and unioning `git diff` output into the changed-file set. + describe('HEAD-moving git operations', () => { + let testDir: string; + let cg: CodeGraph; + + function git(...args: string[]) { + execFileSync('git', args, { cwd: testDir, stdio: 'pipe' }); + } + + beforeEach(async () => { + testDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-head-move-')); + + git('init'); + git('config', 'user.email', 'test@test.com'); + git('config', 'user.name', 'Test'); + git('symbolic-ref', 'HEAD', 'refs/heads/main'); + + const srcDir = path.join(testDir, 'src'); + fs.mkdirSync(srcDir); + fs.writeFileSync( + path.join(srcDir, 'index.ts'), + `export function hello() { return 'world'; }` + ); + + git('add', '-A'); + git('commit', '-m', 'initial'); + + cg = CodeGraph.initSync(testDir, { + config: { include: ['**/*.ts'], exclude: [] }, + }); + await cg.indexAll(); + }); + + afterEach(() => { + if (cg) cg.destroy(); + if (fs.existsSync(testDir)) { + fs.rmSync(testDir, { recursive: true, force: true }); + } + }); + + it('should detect changes brought in by `git merge`', async () => { + git('checkout', '-b', 'feature'); + fs.writeFileSync( + path.join(testDir, 'src', 'index.ts'), + `export function merged() { return 'from-branch'; }` + ); + fs.writeFileSync( + path.join(testDir, 'src', 'added.ts'), + `export function fromBranch() { return 1; }` + ); + git('add', '-A'); + git('commit', '-m', 'feature work'); + git('checkout', 'main'); + git('merge', '--no-ff', 'feature', '-m', 'merge feature'); + + const result = await cg.sync(); + + expect(result.filesModified + result.filesAdded).toBeGreaterThanOrEqual(2); + expect(cg.searchNodes('merged').length).toBeGreaterThan(0); + expect(cg.searchNodes('fromBranch').length).toBeGreaterThan(0); + expect(cg.searchNodes('hello').length).toBe(0); + }); + + it('should detect changes after `git checkout` to a different branch', async () => { + git('checkout', '-b', 'other'); + fs.writeFileSync( + path.join(testDir, 'src', 'index.ts'), + `export function onOther() { return 'other'; }` + ); + git('add', '-A'); + git('commit', '-m', 'other work'); + git('checkout', 'main'); + git('checkout', 'other'); + + const result = await cg.sync(); + + expect(result.filesModified).toBeGreaterThanOrEqual(1); + expect(cg.searchNodes('onOther').length).toBeGreaterThan(0); + expect(cg.searchNodes('hello').length).toBe(0); + }); + + it('should detect file deletion brought in by a committed change', async () => { + git('rm', path.join('src', 'index.ts')); + git('commit', '-m', 'remove index'); + + const result = await cg.sync(); + + expect(result.filesRemoved).toBe(1); + 
expect(cg.searchNodes('hello').length).toBe(0); + }); + + it('should fall back to full scan when last-synced HEAD is unreachable', async () => { + fs.writeFileSync( + path.join(testDir, 'src', 'index.ts'), + `export function rewritten() { return 'rewritten'; }` + ); + git('add', '-A'); + git('commit', '--amend', '-m', 'rewritten'); + const result = await cg.sync(); + + expect(result.filesModified + result.filesAdded).toBeGreaterThanOrEqual(1); + expect(cg.searchNodes('rewritten').length).toBeGreaterThan(0); + expect(cg.searchNodes('hello').length).toBe(0); + }); + + it('should still no-op when HEAD has not moved and tree is clean', async () => { + const result = await cg.sync(); + + expect(result.filesAdded).toBe(0); + expect(result.filesModified).toBe(0); + expect(result.filesRemoved).toBe(0); + }); + }); + + describe('Git submodule support', () => { + let parentDir: string; + let submoduleSrc: string; + let cg: CodeGraph; + + function git(cwd: string, ...args: string[]) { + execFileSync('git', args, { cwd, stdio: 'pipe' }); + } + + beforeEach(async () => { + parentDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-submod-parent-')); + submoduleSrc = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-submod-src-')); + + git(submoduleSrc, 'init'); + git(submoduleSrc, 'config', 'user.email', 'test@test.com'); + git(submoduleSrc, 'config', 'user.name', 'Test'); + fs.writeFileSync( + path.join(submoduleSrc, 'lib.ts'), + `export function fromSubmodule() { return 'sub'; }` + ); + git(submoduleSrc, 'add', '-A'); + git(submoduleSrc, 'commit', '-m', 'submodule initial'); + + git(parentDir, 'init'); + git(parentDir, 'config', 'user.email', 'test@test.com'); + git(parentDir, 'config', 'user.name', 'Test'); + + const parentSrc = path.join(parentDir, 'src'); + fs.mkdirSync(parentSrc); + fs.writeFileSync( + path.join(parentSrc, 'main.ts'), + `export function fromParent() { return 'parent'; }` + ); + + git(parentDir, '-c', 'protocol.file.allow=always', 'submodule', 'add', submoduleSrc, 'vendor/sub'); + git(parentDir, 'add', '-A'); + git(parentDir, 'commit', '-m', 'parent initial with submodule'); + + cg = CodeGraph.initSync(parentDir, { + config: { + include: ['**/*.ts'], + exclude: [], + }, + }); + }); + + afterEach(() => { + if (cg) cg.destroy(); + if (fs.existsSync(parentDir)) fs.rmSync(parentDir, { recursive: true, force: true }); + if (fs.existsSync(submoduleSrc)) fs.rmSync(submoduleSrc, { recursive: true, force: true }); + }); + + it('should index files inside a submodule on full index', async () => { + const result = await cg.indexAll(); + + expect(result.filesIndexed).toBeGreaterThanOrEqual(2); + const subNodes = cg.searchNodes('fromSubmodule'); + const parentNodes = cg.searchNodes('fromParent'); + expect(subNodes.length).toBeGreaterThan(0); + expect(parentNodes.length).toBeGreaterThan(0); + expect(subNodes.some((r) => r.node.filePath.startsWith('vendor/sub/'))).toBe(true); + }); + + it('should detect modifications to files inside a submodule via sync', async () => { + await cg.indexAll(); + + fs.writeFileSync( + path.join(parentDir, 'vendor/sub/lib.ts'), + `export function fromSubmodule() { return 'changed'; }` + ); + + const result = await cg.sync(); + + expect(result.filesModified).toBe(1); + expect(result.changedFilePaths).toContain('vendor/sub/lib.ts'); + }); + + it('should detect new untracked files inside a submodule via sync', async () => { + await cg.indexAll(); + + fs.writeFileSync( + path.join(parentDir, 'vendor/sub/newfile.ts'), + `export function added() { return 1; }` + ); + + 
+      const result = await cg.sync();
+
+      expect(result.filesAdded).toBe(1);
+      expect(result.changedFilePaths).toContain('vendor/sub/newfile.ts');
+    });
+
+    it('should not break when a submodule directory is missing or empty', async () => {
+      fs.rmSync(path.join(parentDir, 'vendor/sub'), { recursive: true, force: true });
+      fs.mkdirSync(path.join(parentDir, 'vendor/sub'));
+
+      const result = await cg.indexAll();
+      expect(result.errors.filter((e) => e.severity === 'error').length).toBe(0);
+      expect(cg.searchNodes('fromParent').length).toBeGreaterThan(0);
+    });
+
+    it('should skip submodule contents when indexSubmodules is false', async () => {
+      cg.destroy();
+      fs.rmSync(path.join(parentDir, '.codegraph'), { recursive: true, force: true });
+      cg = CodeGraph.initSync(parentDir, {
+        config: {
+          include: ['**/*.ts'],
+          exclude: [],
+          indexSubmodules: false,
+        },
+      });
+
+      const result = await cg.indexAll();
+      expect(cg.searchNodes('fromParent').length).toBeGreaterThan(0);
+      expect(cg.searchNodes('fromSubmodule').length).toBe(0);
+      expect(result.filesIndexed).toBe(1);
+    });
+  });
 });
diff --git a/__tests__/tests-edges.test.ts b/__tests__/tests-edges.test.ts
new file mode 100644
index 00000000..abc300fb
--- /dev/null
+++ b/__tests__/tests-edges.test.ts
@@ -0,0 +1,248 @@
+/**
+ * Tests-as-Edges Tests
+ *
+ * Verifies the convention-based test→subject file resolver and the
+ * `tests` edges it produces:
+ * - All recognized test naming conventions (Jest/Vitest, pytest,
+ *   Go, RSpec, JUnit/xUnit, Quick/Spek)
+ * - The four-step resolution strategy (co-located, mirrored,
+ *   common source roots, basename-anywhere)
+ * - End-to-end via CodeGraph: indexAll populates `tests` edges,
+ *   sync incrementally refreshes them, getTestsForFile and
+ *   getSubjectsOfTest return the expected file records.
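+ *
+ * Note: the resolver works from file paths alone — findTestSubjects(testPath,
+ * allFiles) — so no parsing or import analysis is needed to build the edges.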
+ */
+
+import { describe, it, expect, beforeEach, afterEach } from 'vitest';
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+import {
+  testSubjectBasename,
+  isTestFile,
+  findTestSubjects,
+} from '../src/tests-edges';
+import CodeGraph from '../src/index';
+
+describe('testSubjectBasename', () => {
+  it('recognizes JS/TS .test and .spec suffixes', () => {
+    expect(testSubjectBasename('foo.test.ts')).toBe('foo');
+    expect(testSubjectBasename('foo.spec.tsx')).toBe('foo');
+    expect(testSubjectBasename('Bar.test.js')).toBe('Bar');
+    expect(testSubjectBasename('a/b/foo.test.mjs')).toBe('foo');
+  });
+
+  it('recognizes Python pytest test_foo style', () => {
+    expect(testSubjectBasename('test_foo.py')).toBe('foo');
+    expect(testSubjectBasename('pkg/test_handlers.py')).toBe('handlers');
+  });
+
+  it('recognizes Go and Rust foo_test style', () => {
+    expect(testSubjectBasename('foo_test.go')).toBe('foo');
+    expect(testSubjectBasename('foo_test.rs')).toBe('foo');
+  });
+
+  it('recognizes Ruby foo_spec / foo_test style', () => {
+    expect(testSubjectBasename('foo_spec.rb')).toBe('foo');
+    expect(testSubjectBasename('foo_test.rb')).toBe('foo');
+  });
+
+  it('recognizes xUnit FooTest / FooTests', () => {
+    expect(testSubjectBasename('FooTest.java')).toBe('Foo');
+    expect(testSubjectBasename('FooTests.cs')).toBe('Foo');
+    expect(testSubjectBasename('FooTest.kt')).toBe('Foo');
+  });
+
+  it('recognizes Quick/Spek FooSpec', () => {
+    expect(testSubjectBasename('FooSpec.swift')).toBe('Foo');
+    expect(testSubjectBasename('FooSpec.kt')).toBe('Foo');
+  });
+
+  it('returns null for non-test files', () => {
+    expect(testSubjectBasename('foo.ts')).toBeNull();
+    expect(testSubjectBasename('handler.py')).toBeNull();
+    expect(testSubjectBasename('README.md')).toBeNull();
+    // Doesn't false-positive on similar-looking names
+    expect(testSubjectBasename('contest.ts')).toBeNull();
+    expect(testSubjectBasename('untested.go')).toBeNull();
+  });
+});
+
+describe('isTestFile', () => {
+  it('agrees with testSubjectBasename', () => {
+    expect(isTestFile('foo.test.ts')).toBe(true);
+    expect(isTestFile('foo.ts')).toBe(false);
+  });
+});
+
+describe('findTestSubjects (resolver strategies)', () => {
+  it('1. co-located: foo/foo.test.ts → foo/foo.ts', () => {
+    const all = new Set(['src/foo.ts', 'src/foo.test.ts']);
+    expect(findTestSubjects('src/foo.test.ts', all)).toEqual(['src/foo.ts']);
+  });
+
+  it('1b. co-located: foo/bar.test.ts → foo/bar/index.ts', () => {
+    const all = new Set(['src/bar/index.ts', 'src/bar.test.ts']);
+    expect(findTestSubjects('src/bar.test.ts', all)).toEqual(['src/bar/index.ts']);
+  });
+
+  it('2. mirrored: foo/__tests__/bar.test.ts → foo/bar.ts', () => {
+    const all = new Set(['src/bar.ts', 'src/__tests__/bar.test.ts']);
+    expect(findTestSubjects('src/__tests__/bar.test.ts', all)).toEqual(['src/bar.ts']);
+  });
+
+  it('2b. mirrored to index: __tests__/sync.test.ts → src/sync/index.ts', () => {
+    // Top-level __tests__ doesn't translate to a sibling source root, so
+    // the resolver falls through to step 3 (common source roots).
+    const all = new Set(['src/sync/index.ts', '__tests__/sync.test.ts']);
+    expect(findTestSubjects('__tests__/sync.test.ts', all)).toEqual(['src/sync/index.ts']);
+  });
+
+  it('3. common source roots: __tests__/handler.test.ts → lib/handler.ts', () => {
+    const all = new Set(['lib/handler.ts', '__tests__/handler.test.ts']);
+    expect(findTestSubjects('__tests__/handler.test.ts', all)).toEqual(['lib/handler.ts']);
+  });
+
+  it('4. basename-anywhere with prefix-tiebreaker', () => {
+    const all = new Set([
+      'packages/auth/utils.ts',
+      'packages/billing/utils.ts',
+      'packages/auth/utils.test.ts',
+    ]);
+    // Co-located resolves first → utils.ts in auth wins by directory.
+    expect(findTestSubjects('packages/auth/utils.test.ts', all))
+      .toEqual(['packages/auth/utils.ts']);
+  });
+
+  it('returns [] for tests with no matching subject', () => {
+    const all = new Set(['__tests__/integration.test.ts']);
+    expect(findTestSubjects('__tests__/integration.test.ts', all)).toEqual([]);
+  });
+
+  it('returns [] for non-test files', () => {
+    const all = new Set(['src/foo.ts']);
+    expect(findTestSubjects('src/foo.ts', all)).toEqual([]);
+  });
+
+  it('does not edge a test file back to itself', () => {
+    // Pathological: a file matching the test pattern that also happens
+    // to live where its "subject" would resolve. Should never produce a
+    // self-edge.
+    const all = new Set(['src/foo.test.ts']);
+    expect(findTestSubjects('src/foo.test.ts', all)).toEqual([]);
+  });
+
+  it('handles tsx test files preferring tsx subject before ts', () => {
+    const all = new Set(['src/Component.tsx', 'src/Component.test.tsx']);
+    expect(findTestSubjects('src/Component.test.tsx', all))
+      .toEqual(['src/Component.tsx']);
+  });
+
+  it('matches Go _test convention to subject .go', () => {
+    const all = new Set(['internal/handler.go', 'internal/handler_test.go']);
+    expect(findTestSubjects('internal/handler_test.go', all))
+      .toEqual(['internal/handler.go']);
+  });
+
+  it('matches Python test_ convention to subject .py', () => {
+    const all = new Set(['app/handlers.py', 'tests/test_handlers.py']);
+    expect(findTestSubjects('tests/test_handlers.py', all))
+      .toEqual(['app/handlers.py']);
+  });
+
+  it('strips top-level tests/ prefix when computing the mirrored subject path', () => {
+    // Regression: previously the mirroring regex only matched `/tests/`
+    // (slash-prefixed), so a top-level `tests/` directory wasn't stripped
+    // and the multi-root fallback (src/lib/app/...) never fired. With a
+    // decoy `tests/handlers.py` present, the resolver would have wrongly
+    // picked it via the basename-anywhere step instead of the real subject
+    // under `lib/`.
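+    // The fixture below reproduces that exact layout, decoy included: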
+    const all = new Set([
+      'lib/handlers.py',
+      'tests/handlers.py', // decoy
+      'tests/test_handlers.py',
+    ]);
+    expect(findTestSubjects('tests/test_handlers.py', all))
+      .toContain('lib/handlers.py');
+  });
+
+  it('strips top-level spec/ prefix similarly', () => {
+    const all = new Set(['app/order.rb', 'spec/order_spec.rb']);
+    expect(findTestSubjects('spec/order_spec.rb', all))
+      .toEqual(['app/order.rb']);
+  });
+});
+
+describe('CodeGraph end-to-end (tests edges wired into indexAll/sync)', () => {
+  let dir: string;
+  let cg: CodeGraph;
+
+  beforeEach(async () => {
+    dir = fs.mkdtempSync(path.join(os.tmpdir(), 'tests-edges-e2e-'));
+    fs.mkdirSync(path.join(dir, 'src'));
+    fs.mkdirSync(path.join(dir, 'src', 'sync'));
+    fs.mkdirSync(path.join(dir, '__tests__'));
+    // Subject files
+    fs.writeFileSync(path.join(dir, 'src', 'sync', 'index.ts'), 'export const sync = 1;');
+    fs.writeFileSync(path.join(dir, 'src', 'sync', 'watcher.ts'), 'export const watcher = 1;');
+    fs.writeFileSync(path.join(dir, 'src', 'utils.ts'), 'export const utils = 1;');
+    // Tests
+    fs.writeFileSync(path.join(dir, '__tests__', 'sync.test.ts'), 'import { sync } from "../src/sync"; export {};');
+    fs.writeFileSync(path.join(dir, 'src', 'sync', 'watcher.test.ts'), 'import { watcher } from "./watcher"; export {};');
+    // Feature-themed test (no single subject)
+    fs.writeFileSync(path.join(dir, '__tests__', 'integration.test.ts'), 'export {};');
+
+    cg = CodeGraph.initSync(dir, { config: { include: ['**/*.ts'], exclude: [] } });
+    await cg.indexAll();
+  });
+
+  afterEach(() => {
+    if (cg) cg.destroy();
+    if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true });
+  });
+
+  it('indexAll populates tests edges (mirrored layout: __tests__/sync.test.ts → src/sync/index.ts)', () => {
+    const subjects = cg.getSubjectsOfTest('__tests__/sync.test.ts');
+    const paths = subjects.map((s) => s.path);
+    expect(paths).toContain('src/sync/index.ts');
+  });
+
+  it('indexAll populates tests edges (co-located: src/sync/watcher.test.ts → src/sync/watcher.ts)', () => {
+    const subjects = cg.getSubjectsOfTest('src/sync/watcher.test.ts');
+    expect(subjects.map((s) => s.path)).toEqual(['src/sync/watcher.ts']);
+  });
+
+  it('getTestsForFile returns the test that covers a given subject (incoming edges)', () => {
+    const tests = cg.getTestsForFile('src/sync/watcher.ts');
+    expect(tests.map((t) => t.path)).toContain('src/sync/watcher.test.ts');
+  });
+
+  it('returns empty array for tests with no resolvable subject (no false-positive guesses)', () => {
+    const subjects = cg.getSubjectsOfTest('__tests__/integration.test.ts');
+    expect(subjects).toEqual([]);
+  });
+
+  it('returns empty array for non-test files queried as tests', () => {
+    expect(cg.getSubjectsOfTest('src/sync/index.ts')).toEqual([]);
+  });
+
+  it('sync refreshes a test file\'s edges when its subject convention changes', async () => {
+    // Add a new subject file and a co-located test for it. After sync,
+    // the new test should have a `tests` edge to the new subject.
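+    // (Neither file existed at indexAll time, so the edge can only come from sync.)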
+    fs.writeFileSync(path.join(dir, 'src', 'newmod.ts'), 'export const m = 1;');
+    fs.writeFileSync(path.join(dir, 'src', 'newmod.test.ts'), 'import "./newmod";');
+
+    await cg.sync();
+    const subjects = cg.getSubjectsOfTest('src/newmod.test.ts');
+    expect(subjects.map((s) => s.path)).toEqual(['src/newmod.ts']);
+  });
+
+  it('sync removes stale edges when a subject file is deleted (FK cascade)', async () => {
+    // The cascade is on the file *node* (kind='file' has nodes_fts triggers
+    // and FK constraints from edges). When we sync after deleting the
+    // subject, edges to it should disappear.
+    fs.unlinkSync(path.join(dir, 'src', 'sync', 'watcher.ts'));
+    await cg.sync();
+    const subjects = cg.getSubjectsOfTest('src/sync/watcher.test.ts');
+    expect(subjects.map((s) => s.path)).not.toContain('src/sync/watcher.ts');
+  });
+});
diff --git a/__tests__/watcher.test.ts b/__tests__/watcher.test.ts
index f3638e6d..a546494d 100644
--- a/__tests__/watcher.test.ts
+++ b/__tests__/watcher.test.ts
@@ -31,6 +31,19 @@ function waitFor(
   });
 }
 
+/**
+ * fs.watch on macOS (FSEvents) and Linux (inotify) has a small but real
+ * latency between `fs.watch()` returning and the kernel actually
+ * delivering events. Writing a file in that window — particularly under
+ * parallel test load when the host CPU is busy — drops the event and
+ * causes a 5s timeout for "should trigger sync after file change" style
+ * tests. This helper standardizes the settle delay to match the pattern
+ * already used by the filtering tests in this file.
+ */
+async function letWatcherSettle(): Promise<void> {
+  await new Promise((r) => setTimeout(r, 400));
+}
+
 describe('FileWatcher', () => {
   let testDir: string;
 
@@ -101,6 +114,7 @@ describe('FileWatcher', () => {
     const watcher = new FileWatcher(testDir, baseConfig, syncFn, { debounceMs: 200 });
 
     watcher.start();
+    await letWatcherSettle();
 
     // Create a new file
     fs.writeFileSync(path.join(testDir, 'src', 'new.ts'), 'export const y = 2;');
@@ -117,6 +131,7 @@ describe('FileWatcher', () => {
     const watcher = new FileWatcher(testDir, baseConfig, syncFn, { debounceMs: 500 });
 
     watcher.start();
+    await letWatcherSettle();
 
     // Rapid-fire changes
     for (let i = 0; i < 5; i++) {
@@ -145,7 +160,7 @@ describe('FileWatcher', () => {
     watcher.start();
 
     // Let watcher settle — fs.watch may fire residual events from beforeEach
-    await new Promise((r) => setTimeout(r, 400));
+    await letWatcherSettle();
     syncFn.mockClear();
 
     // Create a file that doesn't match include patterns
@@ -165,7 +180,7 @@ describe('FileWatcher', () => {
     watcher.start();
 
     // Let watcher settle — fs.watch may fire residual events from beforeEach
-    await new Promise((r) => setTimeout(r, 400));
+    await letWatcherSettle();
     syncFn.mockClear();
 
     // Simulate a .codegraph directory change
@@ -191,6 +206,7 @@ describe('FileWatcher', () => {
     });
 
     watcher.start();
+    await letWatcherSettle();
 
     fs.writeFileSync(path.join(testDir, 'src', 'test.ts'), 'export const z = 3;');
 
@@ -209,6 +225,7 @@ describe('FileWatcher', () => {
     });
 
     watcher.start();
+    await letWatcherSettle();
 
     fs.writeFileSync(path.join(testDir, 'src', 'test.ts'), 'export const z = 3;');
 
@@ -218,6 +235,36 @@ describe('FileWatcher', () => {
 
     watcher.stop();
   });
+
+  it('should retry pending changes after a sync failure (no events lost)', async () => {
+    // First call rejects, subsequent calls resolve. After the initial
+    // failure, the watcher should retry the same batch on its own — without
+    // this, transient sync failures (DB locked etc.) would silently drop the
+    // changes until a new file event happened.
+    let calls = 0;
+    const syncFn = vi.fn().mockImplementation(() => {
+      calls++;
+      if (calls === 1) return Promise.reject(new Error('transient'));
+      return Promise.resolve({ filesChanged: 1, durationMs: 5 });
+    });
+    const onSyncError = vi.fn();
+    const onSyncComplete = vi.fn();
+    const watcher = new FileWatcher(testDir, baseConfig, syncFn, {
+      debounceMs: 100,
+      onSyncError,
+      onSyncComplete,
+    });
+
+    watcher.start();
+    fs.writeFileSync(path.join(testDir, 'src', 'test.ts'), 'export const z = 3;');
+
+    await waitFor(() => onSyncComplete.mock.calls.length > 0, 5000);
+    expect(onSyncError).toHaveBeenCalledTimes(1);
+    expect(syncFn).toHaveBeenCalledTimes(2);
+    expect(onSyncComplete).toHaveBeenCalledWith({ filesChanged: 1, durationMs: 5 });
+
+    watcher.stop();
+  });
 });
 
 describe('CodeGraph integration', () => {
@@ -268,6 +315,7 @@ describe('FileWatcher', () => {
     const initialNodes = initialStats.nodeCount;
 
     cg.watch({ debounceMs: 300 });
+    await letWatcherSettle();
 
     // Add a new file with a function
     fs.writeFileSync(
diff --git a/docs/ADDING-A-LANGUAGE.md b/docs/ADDING-A-LANGUAGE.md
new file mode 100644
index 00000000..189b0e27
--- /dev/null
+++ b/docs/ADDING-A-LANGUAGE.md
@@ -0,0 +1,463 @@
+# Adding a Language
+
+This is a cookbook for adding a new language to CodeGraph. It assumes you have a
+working dev setup (`npm install` and `npm test` pass).
+
+There are two patterns. **Pick the one that matches the language you're adding.**
+
+| Language shape | Pattern | Examples |
+|---|---|---|
+| Procedural / OO with named functions, classes, methods | **`LanguageExtractor` config** | `python.ts`, `ruby.ts`, `r.ts` |
+| Declarative / template / configuration / no named functions | **Custom extractor class** | `hcl-extractor.ts`, `liquid-extractor.ts`, `sql-extractor.ts` |
+
+The two patterns share the same setup steps (1–4) and only diverge at the extractor
+itself (step 5).
+
+---
+
+## 1. Source a tree-sitter wasm grammar
+
+CodeGraph parses everything via [`web-tree-sitter`](https://www.npmjs.com/package/web-tree-sitter),
+so the grammar has to be available as a `.wasm` file. Three options, in order of
+preference:
+
+### 1a. Already in `tree-sitter-wasms`
+
+The [`tree-sitter-wasms`](https://www.npmjs.com/package/tree-sitter-wasms) npm package
+ships pre-built wasms for 30+ common languages. Check `node_modules/tree-sitter-wasms/out/`
+after a fresh install:
+
+```bash
+ls node_modules/tree-sitter-wasms/out/ | grep <lang>
+```
+
+If your grammar is there, you're done with this step — just reference the filename.
+
+### 1b. A pre-built `.wasm` released somewhere else
+
+Many grammars publish wasms in their GitHub releases (e.g. r-lib/tree-sitter-r) or
+in a separate npm package (e.g. `@tree-sitter-grammars/tree-sitter-hcl` ships
+`tree-sitter-hcl.wasm` directly in the tarball).
+
+```bash
+# GitHub release
+curl -sL -o src/extraction/wasm/tree-sitter-foo.wasm \
+  https://github.com/.../releases/download/vX.Y.Z/tree-sitter-foo.wasm
+
+# Inside an npm tarball
+mkdir -p /tmp/foo && cd /tmp/foo
+curl -sL https://registry.npmjs.org/tree-sitter-foo/-/tree-sitter-foo-X.Y.Z.tgz | tar xz
+cp package/tree-sitter-foo.wasm <repo>/src/extraction/wasm/
+```
+
+Verify the sha256 against the upstream release manifest before committing.
+
+### 1c. Build from source
+
+If only the C source is published (e.g. DerekStride/tree-sitter-sql), build the wasm
+locally with `tree-sitter-cli`. Recent versions ship their own wasi-sdk and don't need
+Docker or local emcc:
+
+```bash
+mkdir /tmp/foo && cd /tmp/foo
+curl -sL https://github.com/.../releases/download/vX.Y.Z/tree-sitter-foo.tar.gz | tar xz
+npx --yes tree-sitter-cli@latest build --wasm
+cp tree-sitter-foo.wasm <repo>/src/extraction/wasm/
+```
+
+### Where the wasm lives
+
+- Grammars from the `tree-sitter-wasms` package are loaded directly from there at runtime.
+- Other grammars must be **vendored** under `src/extraction/wasm/` so they ship in the
+  npm package. The build's `copy-assets` script copies every `.wasm` from that
+  directory into `dist/extraction/wasm/`.
+
+**License check.** Tree-sitter grammars are usually MIT or Apache-2.0 — confirm before
+committing the wasm and note the source/version in the file's header comment so the
+provenance is recoverable later.
+
+---
+
+## 2. Probe the AST
+
+Don't guess at node types. Parse a representative sample and dump the tree:
+
+```js
+// scratch/probe.mjs
+import { Parser, Language } from 'web-tree-sitter';
+await Parser.init();
+const lang = await Language.load('./src/extraction/wasm/tree-sitter-foo.wasm');
+const parser = new Parser();
+parser.setLanguage(lang);
+
+const sample = `
+// realistic code here — cover every construct you plan to extract
+`;
+
+const tree = parser.parse(sample);
+function dump(n, d = 0, max = 4) {
+  if (d > max) return;
+  const text = n.text.length > 60 ? n.text.slice(0, 60).replace(/\n/g, '\\n') + '...' : n.text.replace(/\n/g, '\\n');
+  console.log(`${' '.repeat(d)}${n.type} "${text}"`);
+  for (let i = 0; i < n.namedChildCount; i++) dump(n.namedChild(i), d + 1, max);
+}
+dump(tree.rootNode);
+```
+
+```bash
+node scratch/probe.mjs
+```
+
+Cover every construct you plan to extract: function definitions, classes, methods,
+imports, assignments, calls, references. Watch for surprises:
+
+- Some grammars wrap names in extra layers (`identifier > simple_identifier`)
+- Field names (`childForFieldName`) often differ from what the docs imply
+- Operator nodes can be named, unnamed, or both — call `child(i)` vs `namedChild(i)`
+  and inspect
+
+Save the probe output before you start coding — you'll refer to it constantly.
+
+---
+
+## 3. Register the language
+
+Three files, all small.
+
+**`src/types.ts`** — add to the `Language` union and to `DEFAULT_CONFIG.include`:
+
+```ts
+export type Language =
+  | 'typescript'
+  | ...
+  | 'foo' // ← add here
+  | 'unknown';
+
+export const DEFAULT_CONFIG: CodeGraphConfig = {
+  ...
+  include: [
+    ...
+    '**/*.foo', // ← and here
+  ],
+};
+```
+
+**`src/extraction/grammars.ts`** — wire up the wasm path, extension map, and display name:
+
+```ts
+const WASM_GRAMMAR_FILES: Record<Language, string> = {
+  ...
+  foo: 'tree-sitter-foo.wasm',
+};
+
+// If vendored under src/extraction/wasm/ instead of tree-sitter-wasms:
+const VENDORED_WASM_LANGUAGES: ReadonlySet<Language> = new Set([
+  'pascal',
+  'foo', // ← add here
+]);
+
+export const EXTENSION_MAP: Record<string, Language> = {
+  ...
+  '.foo': 'foo',
+};
+
+// And in getLanguageDisplayName():
+foo: 'Foo',
+```
+
+**`CLAUDE.md`** — append the language to the "Supported Languages" line so the
+LLM-readable architecture doc stays in sync.
+
+---
+
+## 4. Type-check before writing the extractor
+
+Run `npx tsc --noEmit` now. If it's not clean, the wiring is wrong — fix that
+before adding extraction logic, otherwise type errors will pile up.
+
+---
+
+## 5a. Path A — Plug into `LanguageExtractor`
+
+Use this when the language has named function/class/method declarations (Python, Ruby,
+Java, R, etc.). Create `src/extraction/languages/<lang>.ts`:
+
+```ts
+import type { LanguageExtractor } from '../tree-sitter-types';
+
+export const fooExtractor: LanguageExtractor = {
+  // Map AST node types → graph kinds. Empty array = "this kind doesn't
+  // exist in this language."
+  functionTypes: ['function_definition'],
+  classTypes: ['class_definition'],
+  methodTypes: ['function_definition'], // often the same node, dispatched by context
+  interfaceTypes: [],
+  structTypes: [],
+  enumTypes: [],
+  typeAliasTypes: [],
+  importTypes: ['import_statement'],
+  callTypes: ['call'],
+  variableTypes: ['assignment'],
+
+  // Field names tree-sitter exposes for extractors to read.
+  nameField: 'name',
+  bodyField: 'body',
+  paramsField: 'parameters',
+  returnField: 'return_type',
+
+  // Optional hooks — implement what you need:
+  getSignature: (node, source) => { ... },
+  isExported: (node, source) => { ... },
+  isAsync: (node) => { ... },
+
+  // Escape hatch: take over a specific node type entirely. Return true to
+  // tell the core "I handled this, skip default dispatch."
+  visitNode: (node, ctx) => {
+    // R uses this to handle `name <- function() {}` because tree-sitter's
+    // function_definition has no name field — the name is on the LHS of
+    // the enclosing assignment.
+    return false;
+  },
+};
+```
+
+Then register it in `src/extraction/languages/index.ts`:
+
+```ts
+import { fooExtractor } from './foo';
+
+export const EXTRACTORS: Partial<Record<Language, LanguageExtractor>> = {
+  ...
+  foo: fooExtractor,
+};
+```
+
+The core (`TreeSitterExtractor` in `src/extraction/tree-sitter.ts`) does the rest:
+walks the AST, dispatches based on your `*Types` arrays, calls your hooks, manages
+the scope stack, and emits nodes/edges.
+
+**Worked example: R** (`src/extraction/languages/r.ts`). R's `function_definition`
+has no name (it's anonymous), so `functionTypes` is empty and the `visitNode` hook
+intercepts `binary_operator` assignments and emits the function manually via
+`ctx.createNode('function', name, ...)`.
+
+## 5b. Path B — Custom extractor class
+
+Use this when the language is declarative (HCL, SQL, dbt) or has a fundamentally
+different shape from functions/classes/methods (Liquid templates, Pascal `.dfm` form
+files). Create `src/extraction/<lang>-extractor.ts`:
+
+```ts
+import { Node, Edge, ExtractionResult, ExtractionError, UnresolvedReference } from '../types';
+import { generateNodeId, getNodeText } from './tree-sitter-helpers';
+import { getParser } from './grammars';
+
+export class FooExtractor {
+  private filePath: string;
+  private source: string;
+  private nodes: Node[] = [];
+  private edges: Edge[] = [];
+  private unresolvedReferences: UnresolvedReference[] = [];
+  private errors: ExtractionError[] = [];
+
+  constructor(filePath: string, source: string) {
+    this.filePath = filePath;
+    this.source = source;
+  }
+
+  extract(): ExtractionResult {
+    const startTime = Date.now();
+    const parser = getParser('foo');
+    if (!parser) {
+      this.errors.push({ message: 'foo grammar not loaded', severity: 'error', code: 'grammar_unavailable' });
+      return this.result(startTime);
+    }
+    const tree = parser.parse(this.source);
+    if (!tree) { ... return this.result(startTime); }
+
+    try {
+      const fileNodeId = this.createFileNode();
+      // Walk the AST, emit nodes via this.nodes.push and this.edges.push.
+      // Emit references via this.unresolvedReferences.push so the resolver
+      // pass can match them across files.
+      ...
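+      // Illustrative shape only (field names follow src/types.ts; the kind
+      // and edge chosen here are invented for the example):
+      //   this.nodes.push({ id, kind: 'class', name, filePath: this.filePath, ... });
+      //   this.edges.push({ source: fileNodeId, target: id, kind: 'contains' });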
+      return this.result(startTime);
+    } finally {
+      tree.delete(); // ← important: tree-sitter trees are backed by WASM memory
+    }
+  }
+
+  private result(startTime: number): ExtractionResult {
+    return {
+      nodes: this.nodes,
+      edges: this.edges,
+      unresolvedReferences: this.unresolvedReferences,
+      errors: this.errors,
+      durationMs: Date.now() - startTime,
+    };
+  }
+}
+```
+
+Wire the dispatch in `src/extraction/tree-sitter.ts`:
+
+```ts
+import { FooExtractor } from './foo-extractor';
+
+export function extractFromSource(filePath, source, language?) {
+  ...
+  if (detectedLanguage === 'foo') {
+    return new FooExtractor(filePath, source).extract();
+  }
+  ...
+}
+```
+
+**Worked examples:**
+
+- `src/extraction/hcl-extractor.ts` — Terraform / HCL. A block-based DSL. Each
+  top-level block becomes a node whose qualified name matches the Terraform
+  reference form (`var.X`, `local.X`, `module.X`, `aws_s3_bucket.foo`) so the
+  resolver can match references across files automatically.
+- `src/extraction/sql-extractor.ts` — SQL DDL. CREATE TABLE / VIEW / FUNCTION /
+  TRIGGER / TYPE / SCHEMA → graph nodes; foreign keys, view source tables,
+  trigger target tables and executed functions → edges.
+- `src/extraction/liquid-extractor.ts` — Shopify Liquid templates. Regex-based
+  (no tree-sitter) since the template grammar isn't useful for code intelligence.
+
+---
+
+## 6. Pick `NodeKind` and `EdgeKind` values
+
+`NodeKind` and `EdgeKind` are fixed unions in `src/types.ts`. Map your language's
+constructs onto the closest existing kind rather than introducing new ones —
+adding a new kind is a cross-cutting change that touches search, resolution, and
+context-building code.
+
+Common mappings used by recent extractors:
+
+| Language construct | NodeKind |
+|---|---|
+| Function / procedure / standalone routine | `function` |
+| Method on a class | `method` |
+| Class / type / table / declarative resource | `class` |
+| Trait / mixin | `trait` |
+| Interface / protocol | `interface` |
+| Module / package / file-level scope / Terraform module | `module` |
+| Namespace / schema / SQL schema / Terraform provider | `namespace` |
+| Variable / Terraform variable | `variable` |
+| Constant / Terraform local / R top-level binding | `constant` |
+| Type alias / SQL composite type | `type_alias` |
+| Enum (any) | `enum` |
+| Import / library / source / require | `import` |
+| Output / re-export / Terraform output | `export` |
+
+Edges are usually one of:
+
+| Edge | When |
+|---|---|
+| `contains` | Parent contains child (file → block, class → method) |
+| `calls` | Function/method invokes another |
+| `imports` | File pulls in another module/file |
+| `references` | Generic mention of another symbol (FK, lookup, attribute access) |
+| `extends` / `implements` | Inheritance relationships |
+
+Emit references through `unresolvedReferences` (with `referenceName` set to a
+qualified name that matches what you put on the target node's `qualifiedName`) —
+the resolver pass matches them across files using the `name-matcher` and
+`import-resolver` modules.
+
+---
+
+## 7. Tests
+
+Tests live in `__tests__/extraction.test.ts`, grouped by language with a
+`describe('<Language> Extraction', ...)` block. Use `extractFromSource` directly
+for unit-style tests:
+
+```ts
+import { extractFromSource, detectLanguage } from '../src/extraction';
+
+describe('Foo Extraction', () => {
+  describe('Language detection', () => {
+    it('should detect Foo files', () => {
+      expect(detectLanguage('main.foo')).toBe('foo');
+    });
+  });
+
+  describe('Function extraction', () => {
+    it('should extract a top-level function', () => {
+      const code = `function add(a, b) a + b`;
+      const result = extractFromSource('main.foo', code);
+      const fn = result.nodes.find((n) => n.kind === 'function' && n.name === 'add');
+      expect(fn).toBeDefined();
+    });
+  });
+});
+```
+
+Cover the AST shapes you saw in the probe, especially the surprising ones. Pay
+particular attention to:
+
+- The smallest possible valid program (`expect(...).toBeDefined()` for the file node)
+- Each node-kind mapping (one test per emitted kind)
+- Reference forms (call edges, FK / cross-file references, imports)
+- Anything you intentionally skipped (anonymous lambdas, dynamic imports, etc.)
+  with a negative assertion so the omission is documented
+
+Run the suite serialized to avoid the file-watcher tests' parallel flakiness:
+
+```bash
+npx vitest run --no-file-parallelism
+```
+
+End-to-end smoke test from a fresh fixture before opening the PR:
+
+```bash
+SMOKE=$(mktemp -d) && cat > "$SMOKE/main.foo" <<'EOF'
+... realistic input ...
+EOF
+cd "$SMOKE" && git init -q
+node <repo>/dist/bin/codegraph.js init "$SMOKE"
+node <repo>/dist/bin/codegraph.js index "$SMOKE"
+node <repo>/dist/bin/codegraph.js status "$SMOKE"
+cd "$SMOKE" && node <repo>/dist/bin/codegraph.js query "<symbol>"
+```
+
+The `status` call should report your file under "Files by Language", and `query`
+should turn up the symbols you expect at the right line numbers.
+
+---
+
+## 8. Open the PR
+
+Include in the PR description:
+
+- The grammar source + version + license + sha256 (if vendored)
+- A small worked example showing what gets extracted
+- The full test plan (`npm test`, `tsc`, `npm run build`, CLI smoke)
+- Any known limitations (constructs not supported, AST quirks, things the grammar
+  itself can't parse)
+
+Don't claim support for constructs the grammar can't actually parse — this happens
+more often than you'd expect (e.g. `tree-sitter-sql` errors out on `CREATE
+PROCEDURE` because procedure-body syntax varies sharply across dialects). Say what
+works, say what doesn't, and let reviewers decide.
+
+---
+
+## Reference: existing extractors as templates
+
+Read these in source order if your language is similar to one of them:
+
+- **Procedural / OO:** `src/extraction/languages/python.ts` (small, easy to read),
+  `ruby.ts` (with bare-call detection), `kotlin.ts` (extension functions),
+  `r.ts` (no `def` keyword — uses `visitNode` hook for assignments)
+- **Declarative / config:** `src/extraction/hcl-extractor.ts` (Terraform reference
+  graph), `sql-extractor.ts` (DDL with FK / view source extraction)
+- **Embedded / template:** `src/extraction/svelte-extractor.ts` (delegates to JS
+  for `<script>` blocks)
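+
+For a concrete feel for the `visitNode` escape hatch from step 5a, here is a
+minimal sketch in the spirit of `r.ts`. The node/field names and the exact
+`ctx.createNode` signature are illustrative, not copied from the real extractor:
+
+```ts
+visitNode: (node, ctx) => {
+  // Only intercept `name <- function(...) ...` assignments.
+  if (node.type !== 'binary_operator') return false;
+  if (node.childForFieldName('operator')?.text !== '<-') return false;
+  const rhs = node.childForFieldName('rhs');
+  if (rhs?.type !== 'function_definition') return false;
+  const name = node.childForFieldName('lhs')?.text;
+  if (!name) return false;
+  ctx.createNode('function', name, node); // emit the named function ourselves
+  return true; // handled — skip default dispatch for this node
+},
+```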