diff --git a/CLAUDE.md b/CLAUDE.md
index 71a50c73..f91a3d20 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -98,7 +98,7 @@ SQLite database with:
### Supported Languages
-TypeScript, JavaScript, TSX, JSX, Svelte, Python, Go, Rust, Java, C, C++, C#, PHP, Ruby, Swift, Kotlin, Dart, Liquid, Pascal
+TypeScript, JavaScript, TSX, JSX, Svelte, Python, Go, Rust, Java, C, C++, C#, PHP, Ruby, Swift, Kotlin, Dart, Liquid, Pascal, R
### Node and Edge Types
diff --git a/__tests__/centrality.test.ts b/__tests__/centrality.test.ts
new file mode 100644
index 00000000..e45dc858
--- /dev/null
+++ b/__tests__/centrality.test.ts
@@ -0,0 +1,134 @@
+import { describe, it, expect } from 'vitest';
+import { computePageRank, PR_DAMPING, PR_ITERATIONS } from '../src/centrality';
+
+function asNodes(ids: string[]) {
+ return ids.map((id) => ({ id }));
+}
+
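+// Update rule the assertions below assume: standard PageRank with dangling
+// mass redistributed uniformly. This is a sketch; the real implementation
+// lives in src/centrality and may differ in detail.
+//   r'(v) = (1 - d)/N + d * (danglingMass/N + sum over u->v of r(u)/outdeg(u))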
+describe('computePageRank', () => {
+ it('returns empty result for an empty graph', () => {
+ const r = computePageRank([], []);
+ expect(r.scores.size).toBe(0);
+ expect(r.iterations).toBe(0);
+ });
+
+ it('assigns uniform rank to N isolated nodes', () => {
+ const r = computePageRank(asNodes(['a', 'b', 'c', 'd']), []);
+ expect(r.scores.size).toBe(4);
+ // 4 isolated nodes — all dangling — should each end up with 1/N.
+ for (const v of r.scores.values()) {
+ expect(v).toBeCloseTo(0.25, 6);
+ }
+ });
+
+ it('rewards being reached (sinks accumulate rank)', () => {
+ // a -> b -> c. c has no outgoing, so it accumulates the most.
+ const r = computePageRank(
+ asNodes(['a', 'b', 'c']),
+ [
+ { source: 'a', target: 'b' },
+ { source: 'b', target: 'c' },
+ ]
+ );
+ const a = r.scores.get('a')!;
+ const b = r.scores.get('b')!;
+ const c = r.scores.get('c')!;
+ expect(c).toBeGreaterThan(b);
+ expect(b).toBeGreaterThan(a);
+ });
+
+ it('star: hub ranks above all leaves; leaves are equal', () => {
+ const leaves = ['l1', 'l2', 'l3', 'l4', 'l5', 'l6', 'l7', 'l8', 'l9'];
+ const edges = leaves.map((l) => ({ source: l, target: 'hub' }));
+ const r = computePageRank(asNodes([...leaves, 'hub']), edges);
+ const hub = r.scores.get('hub')!;
+ for (const l of leaves) {
+ const lv = r.scores.get(l)!;
+ expect(hub).toBeGreaterThan(lv);
+ }
+ // Leaves are symmetric — should be within 1e-9.
+ const first = r.scores.get(leaves[0])!;
+ for (const l of leaves.slice(1)) {
+ expect(r.scores.get(l)!).toBeCloseTo(first, 9);
+ }
+ });
+
+ it('cycle: all nodes have approximately equal rank', () => {
+ const r = computePageRank(
+ asNodes(['a', 'b', 'c']),
+ [
+ { source: 'a', target: 'b' },
+ { source: 'b', target: 'c' },
+ { source: 'c', target: 'a' },
+ ]
+ );
+ const a = r.scores.get('a')!;
+ const b = r.scores.get('b')!;
+ const c = r.scores.get('c')!;
+ // Symmetric → all equal at convergence.
+ expect(a).toBeCloseTo(b, 6);
+ expect(b).toBeCloseTo(c, 6);
+ });
+
+ it('total rank sums to ~1 (mass is conserved)', () => {
+ const r = computePageRank(
+ asNodes(['a', 'b', 'c', 'd', 'e']),
+ [
+ { source: 'a', target: 'b' },
+ { source: 'b', target: 'c' },
+ { source: 'd', target: 'c' },
+ { source: 'e', target: 'd' },
+ { source: 'a', target: 'e' },
+ ]
+ );
+ let sum = 0;
+ for (const v of r.scores.values()) sum += v;
+ expect(sum).toBeCloseTo(1, 6);
+ });
+
+ it('preserves mass across two disconnected components', () => {
+ const r = computePageRank(
+ asNodes(['a', 'b', 'c', 'd']),
+ [
+ { source: 'a', target: 'b' },
+ { source: 'c', target: 'd' },
+ ]
+ );
+ let sum = 0;
+ for (const v of r.scores.values()) sum += v;
+ expect(sum).toBeCloseTo(1, 6);
+ // Within each component, the sink ranks above the source.
+ expect(r.scores.get('b')!).toBeGreaterThan(r.scores.get('a')!);
+ expect(r.scores.get('d')!).toBeGreaterThan(r.scores.get('c')!);
+ });
+
+ it('drops edges referencing unknown nodes', () => {
+ // 'ghost' is not in the node set — that edge should be ignored,
+ // not crash and not pollute scores.
+ const r = computePageRank(
+ asNodes(['a', 'b']),
+ [
+ { source: 'a', target: 'b' },
+ { source: 'a', target: 'ghost' },
+ { source: 'ghost', target: 'b' },
+ ]
+ );
+ expect(r.scores.size).toBe(2);
+ expect(r.scores.get('b')!).toBeGreaterThan(r.scores.get('a')!);
+ let sum = 0;
+ for (const v of r.scores.values()) sum += v;
+ expect(sum).toBeCloseTo(1, 6);
+ });
+
+ it('reports iteration count and duration', () => {
+ const r = computePageRank(asNodes(['a', 'b']), [{ source: 'a', target: 'b' }]);
+ expect(r.iterations).toBe(PR_ITERATIONS);
+ expect(r.durationMs).toBeGreaterThanOrEqual(0);
+ });
+
+ it('damping constant is the textbook 0.85', () => {
+ // Sentinel — protects against accidental tuning that would invalidate
+ // the spike findings the PR was justified on.
+ expect(PR_DAMPING).toBe(0.85);
+ });
+});
diff --git a/__tests__/churn.test.ts b/__tests__/churn.test.ts
new file mode 100644
index 00000000..fbe279f6
--- /dev/null
+++ b/__tests__/churn.test.ts
@@ -0,0 +1,208 @@
+import { describe, it, expect, beforeEach, afterEach } from 'vitest';
+import * as fs from 'fs';
+import * as os from 'os';
+import * as path from 'path';
+import { execFileSync } from 'child_process';
+import {
+ mineChurn,
+ getGitHead,
+ readFileLoc,
+ MAX_FILES_PER_COMMIT,
+ LAST_MINED_CHURN_HEAD_KEY,
+} from '../src/churn';
+
+let HAS_GIT = true;
+try {
+ execFileSync('git', ['--version'], { stdio: 'ignore' });
+} catch {
+ HAS_GIT = false;
+}
+
+let tempDir: string;
+
+function git(...args: string[]): string {
+ return execFileSync('git', args, {
+ cwd: tempDir,
+ encoding: 'utf-8',
+ env: {
+ ...process.env,
+ GIT_AUTHOR_NAME: 'Test',
+ GIT_AUTHOR_EMAIL: 'test@example.com',
+ GIT_COMMITTER_NAME: 'Test',
+ GIT_COMMITTER_EMAIL: 'test@example.com',
+ GIT_AUTHOR_DATE: process.env.GIT_AUTHOR_DATE,
+ GIT_COMMITTER_DATE: process.env.GIT_COMMITTER_DATE,
+ },
+ stdio: ['pipe', 'pipe', 'pipe'],
+ }).trim();
+}
+
+function commitAt(date: string, paths: string[], content?: string) {
+ for (const p of paths) {
+ const abs = path.join(tempDir, p);
+ fs.mkdirSync(path.dirname(abs), { recursive: true });
+ fs.writeFileSync(abs, content ?? `data for ${p} at ${date}\n`);
+ }
+ git('add', ...paths);
+ // Pin both author and committer dates so timestamps are deterministic.
+ process.env.GIT_AUTHOR_DATE = date;
+ process.env.GIT_COMMITTER_DATE = date;
+ git('commit', '-m', `commit at ${date}`);
+ delete process.env.GIT_AUTHOR_DATE;
+ delete process.env.GIT_COMMITTER_DATE;
+}
+
+beforeEach(() => {
+ tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-churn-'));
+ if (HAS_GIT) {
+ git('init', '-q', '-b', 'main');
+ git('config', 'commit.gpgsign', 'false');
+ }
+});
+
+afterEach(() => {
+ delete process.env.GIT_AUTHOR_DATE;
+ delete process.env.GIT_COMMITTER_DATE;
+ fs.rmSync(tempDir, { recursive: true, force: true });
+});
+
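+// Note: mineChurn presumably shells out to git log with NUL-delimited output
+// (something like `git log --name-only -z`; exact flags assumed, not verified
+// here), which is why the spaces/unicode path test below expects no quoting
+// problems.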
+describe.skipIf(!HAS_GIT)('mineChurn', () => {
+ it('returns empty + null head when not in a git repo', () => {
+ const nonGit = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-nogit-'));
+ try {
+ const r = mineChurn(nonGit, new Set(['foo.ts']), null);
+ expect(r.currentHead).toBeNull();
+ expect(r.deltas.size).toBe(0);
+ expect(r.needsFullRescan).toBe(false);
+ } finally {
+ fs.rmSync(nonGit, { recursive: true, force: true });
+ }
+ });
+
+ it('counts commits per indexed file, ignores files not in index', () => {
+ commitAt('2025-01-01T00:00:00', ['a.ts', 'b.ts']);
+ commitAt('2025-01-02T00:00:00', ['a.ts']);
+ commitAt('2025-01-03T00:00:00', ['a.ts', 'b.ts', 'c.ts']);
+
+ const r = mineChurn(tempDir, new Set(['a.ts', 'b.ts']), null);
+ expect(r.deltas.get('a.ts')?.commitCountDelta).toBe(3);
+ expect(r.deltas.get('b.ts')?.commitCountDelta).toBe(2);
+ expect(r.deltas.has('c.ts')).toBe(false);
+ });
+
+ it('records first-seen / last-touched as min/max of commit timestamps', () => {
+ commitAt('2025-01-01T00:00:00Z', ['a.ts']);
+ commitAt('2025-06-01T00:00:00Z', ['a.ts']);
+ commitAt('2025-12-01T00:00:00Z', ['a.ts']);
+
+ const r = mineChurn(tempDir, new Set(['a.ts']), null);
+ const d = r.deltas.get('a.ts')!;
+ // 2025-01-01 UTC = 1735689600
+ expect(d.firstSeenTs).toBe(1735689600);
+ // 2025-12-01 UTC = 1764547200
+ expect(d.lastTouchedTs).toBe(1764547200);
+ });
+
+ it('skips commits touching more than MAX_FILES_PER_COMMIT files', () => {
+ const bigBatch: string[] = [];
+ for (let i = 0; i < MAX_FILES_PER_COMMIT + 1; i++) bigBatch.push(`f${i}.ts`);
+ commitAt('2025-01-01T00:00:00Z', bigBatch);
+ // Then a normal commit on one of the same files.
+ commitAt('2025-02-01T00:00:00Z', ['f0.ts']);
+
+ const r = mineChurn(tempDir, new Set(bigBatch), null);
+ // First commit was skipped; only the second one should count.
+ expect(r.deltas.get('f0.ts')?.commitCountDelta).toBe(1);
+ // Files only seen in the skipped commit produce no delta at all.
+ expect(r.deltas.has('f50.ts')).toBe(false);
+ });
+
+ it('incremental mining returns only commits since the given sha', () => {
+ commitAt('2025-01-01T00:00:00Z', ['a.ts']);
+ const sha1 = getGitHead(tempDir)!;
+ commitAt('2025-01-02T00:00:00Z', ['a.ts']);
+ commitAt('2025-01-03T00:00:00Z', ['a.ts']);
+
+ const incr = mineChurn(tempDir, new Set(['a.ts']), sha1);
+ // Only the two commits *after* sha1 should be counted.
+ expect(incr.deltas.get('a.ts')?.commitCountDelta).toBe(2);
+ expect(incr.needsFullRescan).toBe(false);
+ });
+
+ it('returns needsFullRescan=true when sinceSha is unreachable', () => {
+ commitAt('2025-01-01T00:00:00Z', ['a.ts']);
+ const fakeSha = '0'.repeat(40);
+ const r = mineChurn(tempDir, new Set(['a.ts']), fakeSha);
+ expect(r.needsFullRescan).toBe(true);
+ expect(r.deltas.size).toBe(0);
+ expect(r.currentHead).not.toBeNull();
+ });
+
+ it('returns empty deltas when sinceSha equals current head (no-op)', () => {
+ commitAt('2025-01-01T00:00:00Z', ['a.ts']);
+ const head = getGitHead(tempDir)!;
+ const r = mineChurn(tempDir, new Set(['a.ts']), head);
+ expect(r.currentHead).toBe(head);
+ expect(r.deltas.size).toBe(0);
+ expect(r.needsFullRescan).toBe(false);
+ });
+
+ it('handles paths with spaces and unicode safely (NUL-delimited)', () => {
+ commitAt('2025-01-01T00:00:00Z', ['name with space.ts']);
+ commitAt('2025-01-02T00:00:00Z', ['ünïcødë.ts']);
+
+ const r = mineChurn(
+ tempDir,
+ new Set(['name with space.ts', 'ünïcødë.ts']),
+ null
+ );
+ expect(r.deltas.get('name with space.ts')?.commitCountDelta).toBe(1);
+ expect(r.deltas.get('ünïcødë.ts')?.commitCountDelta).toBe(1);
+ });
+
+ it('LAST_MINED_CHURN_HEAD_KEY is stable (used as project_metadata key)', () => {
+ expect(LAST_MINED_CHURN_HEAD_KEY).toBe('last_mined_churn_head');
+ });
+});
+
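+// LOC rule the cases below pin down (a sketch of the assumed counting logic):
+//   loc = (content.match(/\n/g)?.length ?? 0)
+//       + (content.length > 0 && !content.endsWith('\n') ? 1 : 0)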
+describe('readFileLoc', () => {
+ it('returns 0 for an empty file', () => {
+ const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-loc-'));
+ try {
+ const f = path.join(dir, 'empty.txt');
+ fs.writeFileSync(f, '');
+ expect(readFileLoc(dir, 'empty.txt')).toBe(0);
+ } finally {
+ fs.rmSync(dir, { recursive: true, force: true });
+ }
+ });
+
+ it('counts newline-terminated lines', () => {
+ const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-loc-'));
+ try {
+ fs.writeFileSync(path.join(dir, 'x.txt'), 'a\nb\nc\n');
+ expect(readFileLoc(dir, 'x.txt')).toBe(3);
+ } finally {
+ fs.rmSync(dir, { recursive: true, force: true });
+ }
+ });
+
+ it('counts a final no-newline chunk as one extra line', () => {
+ const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-loc-'));
+ try {
+ fs.writeFileSync(path.join(dir, 'x.txt'), 'a\nb\nc');
+ expect(readFileLoc(dir, 'x.txt')).toBe(3);
+ } finally {
+ fs.rmSync(dir, { recursive: true, force: true });
+ }
+ });
+
+ it('returns 0 for a missing file (does not throw)', () => {
+ const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-loc-'));
+ try {
+ expect(readFileLoc(dir, 'no-such-file.txt')).toBe(0);
+ } finally {
+ fs.rmSync(dir, { recursive: true, force: true });
+ }
+ });
+});
diff --git a/__tests__/codegraphignore.test.ts b/__tests__/codegraphignore.test.ts
new file mode 100644
index 00000000..4d7e58c5
--- /dev/null
+++ b/__tests__/codegraphignore.test.ts
@@ -0,0 +1,168 @@
+/**
+ * .codegraphignore Tests
+ *
+ * Regression test for the bug where the .codegraphignore marker file was
+ * honored by the filesystem-walk fallback (`scanDirectoryWalk`) but
+ * silently ignored by the git fast path (`getGitVisibleFiles` and
+ * `getGitChangedFiles`). Same project gave different file sets depending
+ * on whether `.git` existed.
+ */
+
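+// Shape of the presumed fix (a sketch; the real logic lives in src/extraction):
+// after listing files via git (tracked + untracked, e.g. `git ls-files -co
+// --exclude-standard`), drop every path that has a `.codegraphignore` marker
+// in an ancestor directory, checked on the filesystem so untracked markers
+// still work. `ancestorDirsOf` below is a hypothetical helper:
+//   files.filter((f) => !ancestorDirsOf(f).some((d) =>
+//     fs.existsSync(path.join(root, d, '.codegraphignore'))));
+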
+import { describe, it, expect, beforeEach, afterEach } from 'vitest';
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+import { execFileSync } from 'child_process';
+import { scanDirectory } from '../src/extraction';
+import { DEFAULT_CONFIG, CodeGraphConfig } from '../src/types';
+import CodeGraph from '../src/index';
+
+function tempDir(prefix: string): string {
+ return fs.mkdtempSync(path.join(os.tmpdir(), prefix));
+}
+
+function git(cwd: string, ...args: string[]) {
+ execFileSync('git', args, { cwd, stdio: 'pipe' });
+}
+
+const config: CodeGraphConfig = {
+ ...DEFAULT_CONFIG,
+ include: ['**/*.ts'],
+ exclude: [],
+};
+
+describe('.codegraphignore marker (bug #3)', () => {
+ describe('git fast path', () => {
+ let dir: string;
+
+ beforeEach(() => {
+ dir = tempDir('codegraph-ignore-git-');
+ git(dir, 'init');
+ git(dir, 'config', 'user.email', 'test@test.com');
+ git(dir, 'config', 'user.name', 'Test');
+ // Pin branch name for determinism across git defaults
+ git(dir, 'symbolic-ref', 'HEAD', 'refs/heads/main');
+
+ fs.mkdirSync(path.join(dir, 'src'));
+ fs.mkdirSync(path.join(dir, 'vendor'));
+ fs.mkdirSync(path.join(dir, 'vendor', 'lib'));
+ fs.writeFileSync(path.join(dir, 'src', 'app.ts'), 'export const a = 1;');
+ fs.writeFileSync(path.join(dir, 'vendor', 'pkg.ts'), 'export const v = 1;');
+ fs.writeFileSync(path.join(dir, 'vendor', 'lib', 'sub.ts'), 'export const s = 1;');
+ // Mark vendor/ as ignored
+ fs.writeFileSync(path.join(dir, 'vendor', '.codegraphignore'), '');
+
+ git(dir, 'add', '-A');
+ git(dir, 'commit', '-m', 'initial');
+ });
+
+ afterEach(() => {
+ if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true });
+ });
+
+ it('scanDirectory honors .codegraphignore on the git fast path', () => {
+ const files = scanDirectory(dir, config);
+ expect(files).toContain('src/app.ts');
+ expect(files).not.toContain('vendor/pkg.ts');
+ expect(files).not.toContain('vendor/lib/sub.ts');
+ });
+
+ it('marker at project root excludes everything', () => {
+ fs.writeFileSync(path.join(dir, '.codegraphignore'), '');
+ // Need to add it to git so ls-files sees it (or rely on -o)
+ git(dir, 'add', '-A');
+ git(dir, 'commit', '-m', 'add root marker');
+ const files = scanDirectory(dir, config);
+ expect(files).toEqual([]);
+ });
+
+ it('marker in nested subdir does not affect siblings', () => {
+ // Add another sibling subdir without a marker
+ fs.mkdirSync(path.join(dir, 'libs'));
+ fs.writeFileSync(path.join(dir, 'libs', 'util.ts'), 'export const u = 1;');
+ git(dir, 'add', '-A');
+ git(dir, 'commit', '-m', 'add libs');
+
+ const files = scanDirectory(dir, config);
+ expect(files).toContain('src/app.ts');
+ expect(files).toContain('libs/util.ts');
+ expect(files).not.toContain('vendor/pkg.ts');
+ });
+
+ it('respects marker added after initial commit (untracked marker)', () => {
+ // The marker file itself need not be committed — it can be a local
+ // override. Add marker AFTER commit, do not commit it.
+ fs.mkdirSync(path.join(dir, 'generated'));
+ fs.writeFileSync(path.join(dir, 'generated', 'gen.ts'), 'export const g = 1;');
+ fs.writeFileSync(path.join(dir, 'generated', '.codegraphignore'), '');
+ // The .ts file is untracked but visible via `git ls-files -o`.
+ // The marker is also untracked — we still detect it via fs check.
+
+ const files = scanDirectory(dir, config);
+ expect(files).not.toContain('generated/gen.ts');
+ });
+ });
+
+ describe('parity with non-git fallback (filesystem walk)', () => {
+ let dir: string;
+
+ beforeEach(() => {
+ dir = tempDir('codegraph-ignore-walk-');
+ fs.mkdirSync(path.join(dir, 'src'));
+ fs.mkdirSync(path.join(dir, 'vendor'));
+ fs.writeFileSync(path.join(dir, 'src', 'app.ts'), 'export const a = 1;');
+ fs.writeFileSync(path.join(dir, 'vendor', 'pkg.ts'), 'export const v = 1;');
+ fs.writeFileSync(path.join(dir, 'vendor', '.codegraphignore'), '');
+ });
+
+ afterEach(() => {
+ if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true });
+ });
+
+ it('non-git project also honors the marker (sanity / pre-existing behavior)', () => {
+ const files = scanDirectory(dir, config);
+ expect(files).toContain('src/app.ts');
+ expect(files).not.toContain('vendor/pkg.ts');
+ });
+ });
+
+ describe('sync git path (getGitChangedFiles)', () => {
+ let dir: string;
+ let cg: CodeGraph;
+
+ beforeEach(async () => {
+ dir = tempDir('codegraph-ignore-sync-');
+ git(dir, 'init');
+ git(dir, 'config', 'user.email', 'test@test.com');
+ git(dir, 'config', 'user.name', 'Test');
+ git(dir, 'symbolic-ref', 'HEAD', 'refs/heads/main');
+
+ fs.mkdirSync(path.join(dir, 'src'));
+ fs.mkdirSync(path.join(dir, 'vendor'));
+ fs.writeFileSync(path.join(dir, 'src', 'app.ts'), 'export const a = 1;');
+ fs.writeFileSync(path.join(dir, 'vendor', '.codegraphignore'), '');
+
+ git(dir, 'add', '-A');
+ git(dir, 'commit', '-m', 'initial');
+
+ cg = CodeGraph.initSync(dir, { config: { include: ['**/*.ts'], exclude: [] } });
+ await cg.indexAll();
+ });
+
+ afterEach(() => {
+ if (cg) cg.destroy();
+ if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true });
+ });
+
+ it('sync ignores changes inside marker dirs', async () => {
+ // Add a new file under vendor/ — should NOT be picked up by sync.
+ fs.writeFileSync(path.join(dir, 'vendor', 'leaked.ts'), 'export const x = 1;');
+ // Also add a real change to confirm sync still runs.
+ fs.writeFileSync(path.join(dir, 'src', 'app.ts'), 'export const a = 2;');
+
+ const result = await cg.sync();
+ expect(result.changedFilePaths).toContain('src/app.ts');
+ expect(result.changedFilePaths ?? []).not.toContain('vendor/leaked.ts');
+ });
+ });
+});
diff --git a/__tests__/config-refs.test.ts b/__tests__/config-refs.test.ts
new file mode 100644
index 00000000..ab1a63e4
--- /dev/null
+++ b/__tests__/config-refs.test.ts
@@ -0,0 +1,288 @@
+/**
+ * Config-refs tests: parser unit tests + end-to-end through CodeGraph.
+ */
+import { describe, it, expect, beforeEach, afterEach } from 'vitest';
+import * as fs from 'fs';
+import * as os from 'os';
+import * as path from 'path';
+import { extractConfigRefs } from '../src/config-refs';
+import CodeGraph from '../src/index';
+
+let testDir: string;
+let cg: CodeGraph | null = null;
+
+function write(rel: string, content: string) {
+ const abs = path.join(testDir, rel);
+ fs.mkdirSync(path.dirname(abs), { recursive: true });
+ fs.writeFileSync(abs, content);
+}
+
+beforeEach(() => {
+ testDir = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-config-'));
+});
+
+afterEach(() => {
+ if (cg) {
+ cg.destroy();
+ cg = null;
+ }
+ if (fs.existsSync(testDir)) fs.rmSync(testDir, { recursive: true, force: true });
+});
+
+// ============================================================================
+// Pure parser tests (no CodeGraph)
+// ============================================================================
+
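+// Pattern shape the TS/JS cases below assume (a sketch; the real per-language
+// PATTERNS table lives in src/config-refs and may differ):
+//   /process\.env(?:\.([A-Z][A-Z0-9_]*)\b|\[["']([A-Z][A-Z0-9_]*)["']\])/g
+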
+describe('extractConfigRefs', () => {
+ it('extracts process.env.X from TS', () => {
+ write('a.ts', `const port = process.env.OBSIDIAN_PORT;\n`);
+ const refs = extractConfigRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null);
+ expect(refs.length).toBe(1);
+ expect(refs[0]!.configKey).toBe('OBSIDIAN_PORT');
+ expect(refs[0]!.line).toBe(1);
+ });
+
+ it('extracts process.env["X"] from JS', () => {
+ write('a.js', `module.exports = { port: process.env["MY_KEY"] };\n`);
+ const refs = extractConfigRefs(testDir, [{ path: 'a.js', language: 'javascript' }], () => null);
+ expect(refs.map((r) => r.configKey)).toEqual(['MY_KEY']);
+ });
+
+ it('extracts os.getenv / os.environ from Python', () => {
+ write(
+ 'a.py',
+ [
+ `import os`,
+ `port = os.getenv("PYTHON_PORT")`,
+ `host = os.environ.get("PYTHON_HOST")`,
+ `path = os.environ["PYTHON_PATH"]`,
+ `name = getenv("PYTHON_NAME")`,
+ ].join('\n')
+ );
+ const refs = extractConfigRefs(testDir, [{ path: 'a.py', language: 'python' }], () => null);
+ expect(new Set(refs.map((r) => r.configKey))).toEqual(
+ new Set(['PYTHON_PORT', 'PYTHON_HOST', 'PYTHON_PATH', 'PYTHON_NAME'])
+ );
+ });
+
+ it('extracts os.Getenv / os.LookupEnv from Go', () => {
+ write(
+ 'a.go',
+ [
+ `package main`,
+ `import "os"`,
+ `var Port = os.Getenv("GO_PORT")`,
+ `var Host, _ = os.LookupEnv("GO_HOST")`,
+ ].join('\n')
+ );
+ const refs = extractConfigRefs(testDir, [{ path: 'a.go', language: 'go' }], () => null);
+ expect(new Set(refs.map((r) => r.configKey))).toEqual(new Set(['GO_PORT', 'GO_HOST']));
+ });
+
+ it('extracts ENV[...] / ENV.fetch from Ruby', () => {
+ write('a.rb', `port = ENV["RUBY_PORT"]\nhost = ENV.fetch("RUBY_HOST")\n`);
+ const refs = extractConfigRefs(testDir, [{ path: 'a.rb', language: 'ruby' }], () => null);
+ expect(new Set(refs.map((r) => r.configKey))).toEqual(new Set(['RUBY_PORT', 'RUBY_HOST']));
+ });
+
+ it('extracts env!/std::env::var from Rust', () => {
+ write(
+ 'a.rs',
+ [
+ `let port = env!("RUST_PORT");`,
+ `let host = std::env::var("RUST_HOST").unwrap();`,
+ ].join('\n')
+ );
+ const refs = extractConfigRefs(testDir, [{ path: 'a.rs', language: 'rust' }], () => null);
+ expect(new Set(refs.map((r) => r.configKey))).toEqual(new Set(['RUST_PORT', 'RUST_HOST']));
+ });
+
+ it('extracts System.getenv from Java/Kotlin', () => {
+ write('A.java', `String port = System.getenv("JAVA_PORT");\n`);
+ const refs = extractConfigRefs(testDir, [{ path: 'A.java', language: 'java' }], () => null);
+ expect(refs.map((r) => r.configKey)).toEqual(['JAVA_PORT']);
+ });
+
+  it('only matches UPPER_CASE keys (skips camelCase/lower-case identifiers)', () => {
+ write('a.ts', `const x = process.env.somethingDynamic;\nconst y = process.env.GOOD_KEY;\n`);
+ const refs = extractConfigRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null);
+ expect(refs.map((r) => r.configKey)).toEqual(['GOOD_KEY']);
+ });
+
+ it('skips files in unsupported languages without crashing', () => {
+ write('a.swift', `let port = ProcessInfo.processInfo.environment["SWIFT_PORT"]\n`);
+ const refs = extractConfigRefs(testDir, [{ path: 'a.swift', language: 'swift' }], () => null);
+ // Swift not in PATTERNS for v1.
+ expect(refs).toEqual([]);
+ });
+
+ it('captures the correct 1-indexed line number', () => {
+ write(
+ 'a.ts',
+ [
+ `// line 1`,
+ `// line 2`,
+ `const x = process.env.LINE_THREE_KEY;`,
+ `// line 4`,
+ `const y = process.env.LINE_FIVE_KEY;`,
+ ].join('\n')
+ );
+ const refs = extractConfigRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null);
+ expect(refs).toEqual([
+ expect.objectContaining({ configKey: 'LINE_THREE_KEY', line: 3 }),
+ expect.objectContaining({ configKey: 'LINE_FIVE_KEY', line: 5 }),
+ ]);
+ });
+
+ it('threads the resolveEnclosing closure correctly', () => {
+ write('a.ts', `const x = process.env.FOO;\n`);
+ const calls: Array<[string, number]> = [];
+ extractConfigRefs(
+ testDir,
+ [{ path: 'a.ts', language: 'typescript' }],
+ (filePath, line) => {
+ calls.push([filePath, line]);
+ return 'fake-node-id';
+ }
+ );
+ expect(calls).toEqual([['a.ts', 1]]);
+ });
+
+ it('survives a missing file (skips, no throw)', () => {
+ const refs = extractConfigRefs(
+ testDir,
+ [{ path: 'does-not-exist.ts', language: 'typescript' }],
+ () => null
+ );
+ expect(refs).toEqual([]);
+ });
+});
+
+// ============================================================================
+// End-to-end through CodeGraph
+// ============================================================================
+
+describe('CodeGraph config refs', () => {
+ it('persists env reads after indexAll and resolves enclosing function', async () => {
+ write(
+ 'src/server.ts',
+ [
+ `export function start() {`,
+ ` const port = process.env.OBSIDIAN_PORT ?? 8080;`,
+ ` return port;`,
+ `}`,
+ ``,
+ `export function getApiKey() {`,
+ ` return process.env.OBSIDIAN_API_KEY;`,
+ `}`,
+ ``,
+ `// top-level read`,
+ `export const HOST = process.env.OBSIDIAN_HOST;`,
+ ].join('\n')
+ );
+ cg = CodeGraph.initSync(testDir, {
+ config: { include: ['**/*.ts'], exclude: [] },
+ });
+ await cg.indexAll();
+
+ // All three keys should be visible.
+ const keys = cg.getConfigKeys({ configKind: 'env' });
+ expect(keys.map((k) => k.configKey).sort()).toEqual([
+ 'OBSIDIAN_API_KEY',
+ 'OBSIDIAN_HOST',
+ 'OBSIDIAN_PORT',
+ ]);
+
+ // The OBSIDIAN_PORT read should be attributed to `start`.
+ const portSites = cg.getConfigRefsByKey('OBSIDIAN_PORT');
+ expect(portSites.length).toBe(1);
+ expect(portSites[0]!.sourceName).toBe('start');
+
+ // The HOST read is at the top level — sourceName should be null.
+ const hostSites = cg.getConfigRefsByKey('OBSIDIAN_HOST');
+ expect(hostSites[0]!.sourceName).toBeNull();
+ });
+
+ it('reverse view: getConfigKeysForNode returns keys read by a function', async () => {
+ write(
+ 'src/a.ts',
+ [
+ `export function loadConfig() {`,
+ ` const a = process.env.KEY_A;`,
+ ` const b = process.env.KEY_B;`,
+ ` return { a, b };`,
+ `}`,
+ ].join('\n')
+ );
+ cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } });
+ await cg.indexAll();
+
+ const node = cg.getNodesInFile('src/a.ts').find((n) => n.name === 'loadConfig')!;
+ const keys = cg.getConfigKeysForNode(node.id).map((r) => r.configKey).sort();
+ expect(keys).toEqual(['KEY_A', 'KEY_B']);
+ });
+
+ it('respects enableConfigRefs=false', async () => {
+ write('src/a.ts', `export const PORT = process.env.PORT;\n`);
+ cg = CodeGraph.initSync(testDir, {
+ config: { include: ['**/*.ts'], exclude: [], enableConfigRefs: false },
+ });
+ await cg.indexAll();
+ expect(cg.getConfigKeys()).toEqual([]);
+ });
+
+ it('incremental sync replaces refs for changed files only', async () => {
+ write('src/a.ts', `export const A = process.env.OLD_KEY;\n`);
+ write('src/b.ts', `export const B = process.env.UNCHANGED_KEY;\n`);
+ cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } });
+ await cg.indexAll();
+ expect(cg.getConfigKeys().map((k) => k.configKey).sort()).toEqual([
+ 'OLD_KEY',
+ 'UNCHANGED_KEY',
+ ]);
+
+ // Edit only a.ts — UNCHANGED_KEY should still be there.
+ write('src/a.ts', `export const A = process.env.NEW_KEY;\n`);
+ await cg.sync();
+
+ const keys = cg.getConfigKeys().map((k) => k.configKey).sort();
+ expect(keys).toContain('NEW_KEY');
+ expect(keys).toContain('UNCHANGED_KEY');
+ expect(keys).not.toContain('OLD_KEY');
+ });
+
+ it('drops refs when a file is edited to remove its last env read', async () => {
+ // Regression for the empty-rows early-return data-corruption bug:
+ // applyConfigRefs([]) used to short-circuit without deleting the
+ // stale rows for the file. The sync path now explicitly invalidates
+ // rows for every changed file *before* extracting, regardless of
+ // whether the new content has any reads.
+ write('src/a.ts', `export const PORT = process.env.REMOVED_KEY;\n`);
+ cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } });
+ await cg.indexAll();
+ expect(cg.getConfigKeys().some((k) => k.configKey === 'REMOVED_KEY')).toBe(true);
+
+ // Edit a.ts to remove the env read entirely (no remaining reads).
+ write('src/a.ts', `export const PORT = 8080; // no env read here\n`);
+ await cg.sync();
+
+ expect(cg.getConfigKeys().some((k) => k.configKey === 'REMOVED_KEY')).toBe(false);
+ });
+
+ it('drops refs for files removed between syncs', async () => {
+ write('src/a.ts', `export const A = process.env.GOING_AWAY;\n`);
+ cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } });
+ await cg.indexAll();
+ expect(cg.getConfigKeys().some((k) => k.configKey === 'GOING_AWAY')).toBe(true);
+
+ fs.unlinkSync(path.join(testDir, 'src/a.ts'));
+ await cg.sync();
+
+ expect(cg.getConfigKeys().some((k) => k.configKey === 'GOING_AWAY')).toBe(false);
+ });
+
+ // (Removed: a defensive test for the v4-migration-collision bug class.
+ // With file-based migrations (NNN-name.ts), two PRs claiming the same
+ // version produces a filesystem-level conflict, so the silent skip the
+ // defensive guard protected against can no longer happen.)
+});
diff --git a/__tests__/context.test.ts b/__tests__/context.test.ts
index 52dae1fe..9a0614aa 100644
--- a/__tests__/context.test.ts
+++ b/__tests__/context.test.ts
@@ -210,6 +210,19 @@ export function validateEmail(email: string): boolean {
expect(result.nodes.size).toBeLessThanOrEqual(5);
});
+
+ it('should clamp absurd searchLimit/maxNodes values to safe upper bounds', async () => {
+ // Without clamping, the internal `findNodesByExactName` query would
+ // request `searchLimit * 5` rows — passing 1e9 here would blow out
+ // memory. The call should complete in normal time and not return more
+ // than the hard cap on maxNodes (1000).
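+    // (Presumed clamp shape, a sketch: maxNodes = Math.min(maxNodes, 1000),
+    // with searchLimit capped similarly before the `* 5` row fan-out.)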
+ const result = await cg.findRelevantContext('function', {
+ searchLimit: 1_000_000_000,
+ maxNodes: 1_000_000_000,
+ traversalDepth: 1_000,
+ });
+ expect(result.nodes.size).toBeLessThanOrEqual(1000);
+ });
});
describe('buildContext()', () => {
diff --git a/__tests__/db-perf.test.ts b/__tests__/db-perf.test.ts
new file mode 100644
index 00000000..256cf92c
--- /dev/null
+++ b/__tests__/db-perf.test.ts
@@ -0,0 +1,161 @@
+/**
+ * DB Performance / Correctness Tests
+ *
+ * Regression tests for three changes:
+ * 1. Batch `getNodesByIds` collapses graph-traversal N+1 reads.
+ * 2. `insertNode` invalidates the LRU cache so INSERT OR REPLACE
+ * doesn't serve a stale cached row on next `getNodeById`.
+ * 3. `runMaintenance` runs `PRAGMA optimize` + `wal_checkpoint(PASSIVE)`
+ * after indexAll/sync without throwing.
+ */
+
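+// Chunked-IN sketch of what the batch lookup presumably does internally
+// (the 500-row chunk size is exercised indirectly by the 1500-node test below;
+// the exact implementation may differ; chunksOf is a hypothetical helper):
+//   for (const chunk of chunksOf(ids, 500)) {
+//     const qs = chunk.map(() => '?').join(',');
+//     rows.push(...db.prepare(`SELECT * FROM nodes WHERE id IN (${qs})`).all(...chunk));
+//   }
+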
+import { describe, it, expect, beforeEach, afterEach } from 'vitest';
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+import { DatabaseConnection } from '../src/db';
+import { QueryBuilder } from '../src/db/queries';
+import { Node } from '../src/types';
+
+function makeNode(id: string, name = id): Node {
+ return {
+ id,
+ kind: 'function',
+ name,
+ qualifiedName: name,
+ filePath: 'a.ts',
+ language: 'typescript',
+ startLine: 1,
+ endLine: 1,
+ startColumn: 0,
+ endColumn: 0,
+ updatedAt: Date.now(),
+ };
+}
+
+describe('getNodesByIds (batch lookup)', () => {
+ let dir: string;
+ let db: DatabaseConnection;
+ let q: QueryBuilder;
+
+ beforeEach(() => {
+ dir = fs.mkdtempSync(path.join(os.tmpdir(), 'db-perf-batch-'));
+ db = DatabaseConnection.initialize(path.join(dir, 'test.db'));
+ q = new QueryBuilder(db.getDb());
+ });
+
+ afterEach(() => {
+ db.close();
+ if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true });
+ });
+
+ it('returns a Map keyed by id, with one entry per existing node', () => {
+ q.insertNodes([makeNode('n1'), makeNode('n2'), makeNode('n3')]);
+ const out = q.getNodesByIds(['n1', 'n2', 'n3']);
+ expect(out.size).toBe(3);
+ expect(out.get('n1')!.name).toBe('n1');
+ expect(out.get('n3')!.name).toBe('n3');
+ });
+
+ it('omits missing IDs from the result map (no nulls, no exceptions)', () => {
+ q.insertNodes([makeNode('n1'), makeNode('n2')]);
+ const out = q.getNodesByIds(['n1', 'missing', 'n2']);
+ expect(out.size).toBe(2);
+ expect(out.has('missing')).toBe(false);
+ expect(out.has('n1')).toBe(true);
+ expect(out.has('n2')).toBe(true);
+ });
+
+ it('handles an empty input array', () => {
+ expect(q.getNodesByIds([]).size).toBe(0);
+ });
+
+ it('handles batches over the SQLite parameter limit (chunking)', () => {
+ // Insert 1500 nodes; the helper chunks at 500 internally.
+ const nodes = Array.from({ length: 1500 }, (_, i) => makeNode(`n${i}`));
+ q.insertNodes(nodes);
+ const ids = nodes.map((n) => n.id);
+ const out = q.getNodesByIds(ids);
+ expect(out.size).toBe(1500);
+ // Spot-check a few from the first / middle / last chunk.
+ expect(out.has('n0')).toBe(true);
+ expect(out.has('n750')).toBe(true);
+ expect(out.has('n1499')).toBe(true);
+ });
+
+ it('serves cache hits from memory and queries only the misses', () => {
+ q.insertNodes([makeNode('n1'), makeNode('n2'), makeNode('n3')]);
+ // Warm the cache for n1 only.
+ q.getNodeById('n1');
+ // Replace the underlying row to make a miss-vs-cache-hit detectable.
+ db.getDb().prepare('UPDATE nodes SET name = ? WHERE id = ?').run('changed', 'n1');
+ const out = q.getNodesByIds(['n1', 'n2']);
+ // The cached n1 (still 'n1', not 'changed') must be returned.
+ expect(out.get('n1')!.name).toBe('n1');
+ expect(out.get('n2')!.name).toBe('n2');
+ });
+});
+
+describe('insertNode cache invalidation', () => {
+ let dir: string;
+ let db: DatabaseConnection;
+ let q: QueryBuilder;
+
+ beforeEach(() => {
+ dir = fs.mkdtempSync(path.join(os.tmpdir(), 'db-perf-cache-'));
+ db = DatabaseConnection.initialize(path.join(dir, 'test.db'));
+ q = new QueryBuilder(db.getDb());
+ });
+
+ afterEach(() => {
+ db.close();
+ if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true });
+ });
+
+ it('does not serve a stale cached node after INSERT OR REPLACE', () => {
+ // Regression: insertNode (which uses INSERT OR REPLACE) used to skip
+ // cache invalidation, so the next getNodeById returned the pre-replace
+ // version until LRU eviction.
+ const original = makeNode('n1', 'oldName');
+ q.insertNode(original);
+ const beforeReplace = q.getNodeById('n1');
+ expect(beforeReplace!.name).toBe('oldName');
+
+ // Replace via insertNode (the bug path).
+ q.insertNode({ ...original, name: 'newName', updatedAt: Date.now() });
+ const afterReplace = q.getNodeById('n1');
+ expect(afterReplace!.name).toBe('newName');
+ });
+});
+
+describe('runMaintenance', () => {
+ let dir: string;
+ let db: DatabaseConnection;
+
+ beforeEach(() => {
+ dir = fs.mkdtempSync(path.join(os.tmpdir(), 'db-perf-maint-'));
+ db = DatabaseConnection.initialize(path.join(dir, 'test.db'));
+ });
+
+ afterEach(() => {
+ db.close();
+ if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true });
+ });
+
+ it('runs without throwing on a fresh database', () => {
+ expect(() => db.runMaintenance()).not.toThrow();
+ });
+
+ it('runs without throwing after writes', () => {
+ const q = new QueryBuilder(db.getDb());
+ q.insertNodes([makeNode('n1'), makeNode('n2')]);
+ expect(() => db.runMaintenance()).not.toThrow();
+ });
+
+ it('swallows failures rather than propagating (best-effort)', () => {
+ // Close the DB so the underlying handle would normally throw on any
+ // exec(). runMaintenance must still not propagate.
+ db.close();
+ expect(() => db.runMaintenance()).not.toThrow();
+ });
+});
diff --git a/__tests__/diversify.test.ts b/__tests__/diversify.test.ts
new file mode 100644
index 00000000..181ee9c5
--- /dev/null
+++ b/__tests__/diversify.test.ts
@@ -0,0 +1,200 @@
+/**
+ * Result Diversification Tests
+ *
+ * Verifies the per-file cap on search results: queries that match many
+ * symbols in one file (the methods of a class) no longer return 10 hits
+ * from one file, but instead surface representative breadth across files.
+ */
+
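+// Assumed algorithm for diversifyByFile(results, limit, perFileCap), pinned
+// down by the unit tests below (a sketch, not the literal implementation):
+//   1. Walk results in score order; take a hit unless its file already has
+//      perFileCap picks, otherwise set it aside.
+//   2. If fewer than `limit` were taken, backfill from the set-aside hits in
+//      score order, so results are reordered but never lost.
+//   perFileCap = 0 disables the cap entirely.
+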
+import { describe, it, expect, beforeEach, afterEach } from 'vitest';
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+import { DatabaseConnection } from '../src/db';
+import { QueryBuilder } from '../src/db/queries';
+import { diversifyByFile } from '../src/search/query-utils';
+import { Node } from '../src/types';
+
+describe('diversifyByFile (unit)', () => {
+ function r(score: number, name: string, filePath: string) {
+ return { node: { id: name, name, filePath } as Node, score };
+ }
+
+ it('caps consecutive results from the same file at perFileCap', () => {
+ const results = [
+ r(10, 'a1', 'a.ts'),
+ r(9, 'a2', 'a.ts'),
+ r(8, 'a3', 'a.ts'),
+ r(7, 'a4', 'a.ts'),
+ r(6, 'b1', 'b.ts'),
+ ];
+ const out = diversifyByFile(results, 5, 2);
+ expect(out.map((x) => x.node.name)).toEqual(['a1', 'a2', 'b1', 'a3', 'a4']);
+ // First two from a.ts (cap), then b.ts (different file), then backfill.
+ });
+
+ it('preserves overall ranking when no file dominates', () => {
+ const results = [
+ r(10, 'a1', 'a.ts'),
+ r(9, 'b1', 'b.ts'),
+ r(8, 'c1', 'c.ts'),
+ r(7, 'a2', 'a.ts'),
+ ];
+ const out = diversifyByFile(results, 4, 2);
+ expect(out.map((x) => x.node.name)).toEqual(['a1', 'b1', 'c1', 'a2']);
+ });
+
+ it('does not lose results — backfills from skipped when limit not yet filled', () => {
+ // 10 candidates all from one file, limit 5, cap 2: pick 2, backfill 3.
+ const results = Array.from({ length: 10 }, (_, i) =>
+ r(10 - i, `n${i}`, 'a.ts')
+ );
+ const out = diversifyByFile(results, 5, 2);
+ expect(out).toHaveLength(5);
+ expect(out.every((x) => x.node.filePath === 'a.ts')).toBe(true);
+ });
+
+ it('returns the input slice unchanged when perFileCap=0', () => {
+ const results = [
+ r(10, 'a1', 'a.ts'),
+ r(9, 'a2', 'a.ts'),
+ r(8, 'a3', 'a.ts'),
+ ];
+ expect(diversifyByFile(results, 3, 0)).toEqual(results);
+ });
+
+ it('returns input unchanged when results.length <= limit and no reordering needed', () => {
+ const results = [r(10, 'a1', 'a.ts'), r(9, 'a2', 'a.ts')];
+ expect(diversifyByFile(results, 5, 2)).toEqual(results);
+ });
+
+ it('still reorders within limit when results.length === limit but cap rearranges', () => {
+ // Same total count as limit, but the cap reorders to surface peer files
+ // earlier in the list.
+ const results = [
+ r(10, 'a1', 'a.ts'),
+ r(9, 'a2', 'a.ts'),
+ r(8, 'a3', 'a.ts'),
+ r(7, 'a4', 'a.ts'),
+ r(6, 'b1', 'b.ts'),
+ ];
+ const out = diversifyByFile(results, 5, 2);
+ // First 2 from a.ts (cap), then b.ts, then backfill a.ts.
+ expect(out.map((x) => x.node.name)).toEqual(['a1', 'a2', 'b1', 'a3', 'a4']);
+ });
+
+ it('respects the limit even when picked + skipped exceed it', () => {
+ const results = [
+ r(10, 'a1', 'a.ts'),
+ r(9, 'a2', 'a.ts'),
+ r(8, 'a3', 'a.ts'),
+ r(7, 'b1', 'b.ts'),
+ ];
+ const out = diversifyByFile(results, 2, 2);
+ expect(out).toHaveLength(2);
+ expect(out.map((x) => x.node.name)).toEqual(['a1', 'a2']);
+ });
+
+ it('always preserves the top-scoring result at position 0', () => {
+ const results = [
+ r(100, 'top', 'big.ts'),
+ r(50, 'big2', 'big.ts'),
+ r(40, 'big3', 'big.ts'),
+ r(30, 'big4', 'big.ts'),
+ r(20, 'other', 'other.ts'),
+ ];
+ const out = diversifyByFile(results, 3, 2);
+ expect(out[0].node.name).toBe('top');
+ });
+});
+
+describe('searchNodes per-file diversification (integration)', () => {
+ let dir: string;
+ let db: DatabaseConnection;
+ let q: QueryBuilder;
+
+ function makeNode(id: string, name: string, kind: Node['kind'], filePath: string): Node {
+ return {
+ id,
+ kind,
+ name,
+ qualifiedName: `${filePath}::${name}`,
+ filePath,
+ language: 'typescript',
+ startLine: 1,
+ endLine: 1,
+ startColumn: 0,
+ endColumn: 0,
+ updatedAt: Date.now(),
+ };
+ }
+
+ beforeEach(() => {
+ dir = fs.mkdtempSync(path.join(os.tmpdir(), 'diversify-search-'));
+ db = DatabaseConnection.initialize(path.join(dir, 'test.db'));
+ q = new QueryBuilder(db.getDb());
+ // Simulate the "10 methods of one class" scenario: a class plus many
+ // methods all sharing a common token, all in one file. Plus a peer
+ // file with a sibling implementation.
+ const nodes: Node[] = [
+ makeNode('cls', 'DatabaseConnection', 'class', 'src/db.ts'),
+ makeNode('m1', 'connect', 'method', 'src/db.ts'),
+ makeNode('m2', 'disconnect', 'method', 'src/db.ts'),
+ makeNode('m3', 'reconnect', 'method', 'src/db.ts'),
+ makeNode('m4', 'isConnected', 'method', 'src/db.ts'),
+ makeNode('m5', 'connectionString', 'property', 'src/db.ts'),
+ makeNode('peer', 'PoolConnection', 'class', 'src/pool.ts'),
+ makeNode('peer2', 'connectPool', 'function', 'src/pool.ts'),
+ ];
+ q.insertNodes(nodes);
+ });
+
+ afterEach(() => {
+ db.close();
+ if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true });
+ });
+
+ it('caps results per file at the default (3) so peer files surface', () => {
+ const results = q.searchNodes('connect', { limit: 5 });
+ const fromDbTs = results.filter((r) => r.node.filePath === 'src/db.ts').length;
+ const fromPool = results.filter((r) => r.node.filePath === 'src/pool.ts').length;
+ expect(fromDbTs).toBeLessThanOrEqual(3); // cap
+ expect(fromPool).toBeGreaterThanOrEqual(1); // peer file represented
+ });
+
+ it('honors perFileCap: 0 (disabled) — does not enforce a per-file limit', () => {
+ // Insert a heavy imbalance so dominance is unambiguous: 10 matching
+ // methods in db.ts, only the existing pool.ts entries elsewhere.
+ const heavyDb: Node[] = Array.from({ length: 10 }, (_, i) =>
+ makeNode(`heavy${i}`, `connectVariant${i}`, 'method', 'src/db.ts')
+ );
+ q.insertNodes(heavyDb);
+ const results = q.searchNodes('connect', { limit: 8, perFileCap: 0 });
+ const fromDbTs = results.filter((r) => r.node.filePath === 'src/db.ts').length;
+ expect(fromDbTs).toBeGreaterThan(3);
+ });
+
+ it('honors a higher perFileCap', () => {
+ const results = q.searchNodes('connect', { limit: 6, perFileCap: 5 });
+ const fromDbTs = results.filter((r) => r.node.filePath === 'src/db.ts').length;
+ expect(fromDbTs).toBeLessThanOrEqual(5);
+ });
+
+ it('preserves the top-scoring hit even with diversification', () => {
+ // Class node with the most direct name match is the most relevant —
+ // diversification must never displace it from #1.
+ const results = q.searchNodes('DatabaseConnection', { limit: 3 });
+ expect(results[0].node.name).toBe('DatabaseConnection');
+ });
+
+ it('does not lose results — fills limit by backfilling skipped same-file hits', () => {
+ // If only one file has matches, all results legitimately come from it.
+ // The cap should not cause us to return fewer than `limit` results.
+ const onlyOneFileNodes: Node[] = Array.from({ length: 10 }, (_, i) =>
+ makeNode(`only${i}`, `solo${i}`, 'function', 'src/only.ts')
+ );
+ q.insertNodes(onlyOneFileNodes);
+ const results = q.searchNodes('solo', { limit: 5 });
+ expect(results.length).toBe(5);
+ });
+});
diff --git a/__tests__/edges-unique.test.ts b/__tests__/edges-unique.test.ts
new file mode 100644
index 00000000..49eced53
--- /dev/null
+++ b/__tests__/edges-unique.test.ts
@@ -0,0 +1,166 @@
+/**
+ * Edge Uniqueness Tests
+ *
+ * Regression tests for the bug where `INSERT OR IGNORE INTO edges` was
+ * silently a no-op: the only candidate key was the AUTOINCREMENT id (which
+ * never conflicts), so duplicate edges accumulated on every re-emission /
+ * re-resolution.
+ *
+ * Fix: a UNIQUE index on (source, target, kind, COALESCE(line, -1),
+ * COALESCE(col, -1)) backs a fresh-install schema and is also applied via
+ * migration v4 (with a dedup pass over existing rows).
+ */
+
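+// The presumed index DDL (a sketch reconstructed from the description above;
+// idx_edges_unique is the name the migration test below drops to back-date):
+//   CREATE UNIQUE INDEX IF NOT EXISTS idx_edges_unique
+//     ON edges (source, target, kind, COALESCE(line, -1), COALESCE(col, -1));
+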
+import { describe, it, expect, beforeEach, afterEach } from 'vitest';
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+import { DatabaseConnection } from '../src/db';
+import { QueryBuilder } from '../src/db/queries';
+import { Edge, Node } from '../src/types';
+import { runMigrations, getCurrentVersion, CURRENT_SCHEMA_VERSION } from '../src/db/migrations';
+
+function tempDb(): { dir: string; db: DatabaseConnection; q: QueryBuilder } {
+ const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-edges-unique-'));
+ const db = DatabaseConnection.initialize(path.join(dir, 'test.db'));
+ const q = new QueryBuilder(db.getDb());
+ return { dir, db, q };
+}
+
+function cleanup(dir: string, db: DatabaseConnection) {
+ db.close();
+ if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true });
+}
+
+function makeNode(id: string, name: string): Node {
+ return {
+ id,
+ kind: 'function',
+ name,
+ qualifiedName: `f::${name}`,
+ filePath: 'a.ts',
+ language: 'typescript',
+ startLine: 1,
+ endLine: 1,
+ startColumn: 0,
+ endColumn: 0,
+ updatedAt: Date.now(),
+ };
+}
+
+function edgesCount(db: DatabaseConnection): number {
+ const row = db.getDb().prepare('SELECT COUNT(*) as c FROM edges').get() as { c: number };
+ return row.c;
+}
+
+describe('Edge UNIQUE constraint (bug #2)', () => {
+ let dir: string;
+ let db: DatabaseConnection;
+ let q: QueryBuilder;
+
+ beforeEach(() => {
+ ({ dir, db, q } = tempDb());
+ q.insertNodes([makeNode('n1', 'foo'), makeNode('n2', 'bar')]);
+ });
+
+ afterEach(() => cleanup(dir, db));
+
+ it('rejects duplicate (source, target, kind, line, col)', () => {
+ const e: Edge = { source: 'n1', target: 'n2', kind: 'calls', line: 10, column: 5 };
+ q.insertEdge(e);
+ q.insertEdge(e); // INSERT OR IGNORE — should be a no-op now
+ expect(edgesCount(db)).toBe(1);
+ });
+
+ it('treats two NULL line edges as duplicates (COALESCE in unique index)', () => {
+ const e: Edge = { source: 'n1', target: 'n2', kind: 'calls' };
+ q.insertEdge(e);
+ q.insertEdge(e);
+ expect(edgesCount(db)).toBe(1);
+ });
+
+ it('allows same source/target/kind on different lines', () => {
+ q.insertEdge({ source: 'n1', target: 'n2', kind: 'calls', line: 1 });
+ q.insertEdge({ source: 'n1', target: 'n2', kind: 'calls', line: 2 });
+ expect(edgesCount(db)).toBe(2);
+ });
+
+ it('allows same source/target/line on different kinds', () => {
+ q.insertEdge({ source: 'n1', target: 'n2', kind: 'calls', line: 1 });
+ q.insertEdge({ source: 'n1', target: 'n2', kind: 'references', line: 1 });
+ expect(edgesCount(db)).toBe(2);
+ });
+
+ it('insertEdges (batch) dedupes within the same call', () => {
+ const e: Edge = { source: 'n1', target: 'n2', kind: 'calls', line: 1, column: 1 };
+ q.insertEdges([e, e, e]);
+ expect(edgesCount(db)).toBe(1);
+ });
+
+ it('survives the same edge being re-emitted across many cycles', () => {
+ const e: Edge = { source: 'n1', target: 'n2', kind: 'calls', line: 1 };
+ for (let i = 0; i < 100; i++) {
+ q.insertEdge(e);
+ }
+ expect(edgesCount(db)).toBe(1);
+ });
+});
+
+describe('Migration v4: dedup existing edges', () => {
+ let dir: string;
+ let dbPath: string;
+
+ beforeEach(() => {
+ dir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-migr-v4-'));
+ dbPath = path.join(dir, 'test.db');
+ });
+
+ afterEach(() => {
+ if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true });
+ });
+
+ it('collapses pre-existing duplicates and adds the UNIQUE index', () => {
+ // Build a v3-shaped database manually: schema, but simulate a stale
+ // version row + insert duplicates that the missing UNIQUE index let
+ // through. We use the real initialize() path then drop the index +
+ // version row to back-date the DB.
+ const db = DatabaseConnection.initialize(dbPath);
+ db.getDb().exec(`DROP INDEX IF EXISTS idx_edges_unique;`);
+ db.getDb().exec(`DELETE FROM schema_versions;`);
+ db.getDb().prepare(
+ 'INSERT INTO schema_versions (version, applied_at, description) VALUES (3, ?, ?)'
+ ).run(Date.now(), 'simulated v3');
+
+ const q = new QueryBuilder(db.getDb());
+ q.insertNodes([makeNode('n1', 'foo'), makeNode('n2', 'bar')]);
+ // Force-insert duplicates via raw SQL (bypassing the constraint that
+ // is now absent). Three rows that should collapse to one.
+ const stmt = db.getDb().prepare(
+ 'INSERT INTO edges (source, target, kind, line, col) VALUES (?, ?, ?, ?, ?)'
+ );
+ stmt.run('n1', 'n2', 'calls', 10, 5);
+ stmt.run('n1', 'n2', 'calls', 10, 5);
+ stmt.run('n1', 'n2', 'calls', 10, 5);
+ // And one with NULL line/col, also duplicated
+ stmt.run('n1', 'n2', 'references', null, null);
+ stmt.run('n1', 'n2', 'references', null, null);
+
+ expect(edgesCount(db)).toBe(5);
+ expect(getCurrentVersion(db.getDb())).toBe(3);
+
+ // Run migrations forward
+ runMigrations(db.getDb(), 3);
+
+ expect(getCurrentVersion(db.getDb())).toBe(CURRENT_SCHEMA_VERSION);
+ expect(CURRENT_SCHEMA_VERSION).toBeGreaterThanOrEqual(4);
+ // 3 calls dups → 1, 2 references dups → 1
+ expect(edgesCount(db)).toBe(2);
+
+ // Now the constraint is enforced: another duplicate insert is a no-op.
+ const q2 = new QueryBuilder(db.getDb());
+ q2.insertEdge({ source: 'n1', target: 'n2', kind: 'calls', line: 10, column: 5 });
+ expect(edgesCount(db)).toBe(2);
+
+ db.close();
+ });
+});
diff --git a/__tests__/extraction-resolution-accuracy.test.ts b/__tests__/extraction-resolution-accuracy.test.ts
new file mode 100644
index 00000000..f78f3d76
--- /dev/null
+++ b/__tests__/extraction-resolution-accuracy.test.ts
@@ -0,0 +1,266 @@
+/**
+ * Extraction & Resolution Accuracy Tests
+ *
+ * Regression tests for three accuracy bugs fixed in one PR:
+ * 1. Parse-retry comment strip was hardcoded to `//`, no-op on Python/Ruby/etc.
+ * 2. Framework route extractors ran regex over raw file content, matching
+ * examples in docstrings/comments as real routes.
+ * 3. UTF-8 BOM caused spurious "modified" hash mismatches between editors.
+ */
+
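+// Contract the route tests below rely on (a sketch of the assumed behavior):
+// stripCommentsForRegex blanks comment/docstring bodies but preserves every
+// newline, so a regex match offset still maps back to the original line, e.g.
+//   const lineOf = (s: string, i: number) => s.slice(0, i).split('\n').length;
+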
+import { describe, it, expect } from 'vitest';
+import { stripBom, stripCommentLinesForRetry, stripCommentsForRegex } from '../src/utils';
+import { hashContent } from '../src/extraction';
+import { flaskResolver, fastapiResolver, djangoResolver } from '../src/resolution/frameworks/python';
+import { expressResolver } from '../src/resolution/frameworks/express';
+import { aspnetResolver } from '../src/resolution/frameworks/csharp';
+import { rustResolver } from '../src/resolution/frameworks/rust';
+import { laravelResolver } from '../src/resolution/frameworks/laravel';
+
+describe('UTF-8 BOM normalization (bug #5)', () => {
+ it('stripBom removes leading U+FEFF', () => {
+    expect(stripBom('\uFEFFhello')).toBe('hello');
+    expect(stripBom('hello')).toBe('hello');
+    expect(stripBom('\uFEFF')).toBe('');
+ });
+
+ it('stripBom only removes leading BOM, not embedded ones', () => {
+    expect(stripBom('a\uFEFFb')).toBe('a\uFEFFb');
+ });
+
+ it('hashContent treats BOM and no-BOM as identical', () => {
+    const withBom = '\uFEFFexport function hello() { return 42; }';
+ const withoutBom = 'export function hello() { return 42; }';
+ expect(hashContent(withBom)).toBe(hashContent(withoutBom));
+ });
+});
+
+describe('Per-language comment-line stripping (bug #1)', () => {
+ it('strips `#` lines for Python', () => {
+ const input = ['# CHECK: foo', 'def x():', ' pass'].join('\n');
+ const out = stripCommentLinesForRetry(input, 'python');
+ expect(out.split('\n')).toEqual(['', 'def x():', ' pass']);
+ });
+
+ it('strips `#` lines for Ruby', () => {
+ const input = ['# top comment', 'def x; end'].join('\n');
+ const out = stripCommentLinesForRetry(input, 'ruby');
+ expect(out.split('\n')).toEqual(['', 'def x; end']);
+ });
+
+ it('strips `//` lines for TypeScript', () => {
+ const input = ['// header', 'function x() {}'].join('\n');
+ const out = stripCommentLinesForRetry(input, 'typescript');
+ expect(out.split('\n')).toEqual(['', 'function x() {}']);
+ });
+
+  it('strips both `//` and `#` lines for PHP', () => {
+    const input = ['// js-style', '# perl-style', '<?php $x = 1;'].join('\n');
+    const out = stripCommentLinesForRetry(input, 'php');
+    expect(out.split('\n')).toEqual(['', '', '<?php $x = 1;']);
+  });
+
+  it('returns the input unchanged for unknown languages', () => {
+    const input = '// looks like a comment\ncode';
+    expect(stripCommentLinesForRetry(input, 'unknown-lang')).toBe(input);
+  });
+
+ it('preserves line count so node positions stay correct', () => {
+ const input = ['# c1', 'a', '# c2', 'b'].join('\n');
+ const out = stripCommentLinesForRetry(input, 'python');
+ expect(out.split('\n').length).toBe(input.split('\n').length);
+ });
+
+  it('strips indented `#` comment lines in Python but keeps mid-line trailing comments', () => {
+ // The marker matches optional leading whitespace + `#`, so an indented
+ // pure comment line is correctly stripped. Non-comment code on the same
+ // line as `#` (mid-line comment) is intentionally not stripped here.
+ const input = [' # indented comment', ' pass # trailing'].join('\n');
+ const out = stripCommentLinesForRetry(input, 'python');
+ expect(out.split('\n')).toEqual(['', ' pass # trailing']);
+ });
+});
+
+describe('Framework regex no longer matches docstrings/comments (bug #4)', () => {
+ describe('Flask', () => {
+ it('skips routes inside `#` comments', () => {
+ const content = [
+ 'from flask import Flask',
+ 'app = Flask(__name__)',
+ '# Example: @app.route("/fake")',
+ '@app.route("/real")',
+ 'def real(): pass',
+ ].join('\n');
+ const nodes = flaskResolver.extractNodes!('app.py', content);
+ const paths = nodes.map((n) => n.name);
+ expect(paths).toContain('/real');
+ expect(paths).not.toContain('/fake');
+ });
+
+ it('skips routes inside triple-quoted docstrings', () => {
+ const content = [
+ 'def example():',
+ ' """',
+ ' Usage: @app.route("/fake")',
+ ' """',
+ ' pass',
+ '@app.route("/real")',
+ 'def real(): pass',
+ ].join('\n');
+ const nodes = flaskResolver.extractNodes!('app.py', content);
+ const paths = nodes.map((n) => n.name);
+ expect(paths).toContain('/real');
+ expect(paths).not.toContain('/fake');
+ });
+ });
+
+ describe('FastAPI', () => {
+ it('skips routes inside `#` comments and triple-quoted docstrings', () => {
+ const content = [
+ '"""',
+ 'Module docs — example: @app.get("/docfake")',
+ '"""',
+ '# @app.post("/commentfake")',
+ '@app.get("/real")',
+ 'def real(): pass',
+ ].join('\n');
+ const nodes = fastapiResolver.extractNodes!('app.py', content);
+ const names = nodes.map((n) => n.name);
+ expect(names.some((n) => n.includes('/real'))).toBe(true);
+ expect(names.some((n) => n.includes('/docfake'))).toBe(false);
+ expect(names.some((n) => n.includes('/commentfake'))).toBe(false);
+ });
+
+ it('preserves correct line numbers for real routes after stripping', () => {
+ const content = [
+ '"""', // line 1
+ '@app.get("/fake")', // line 2 — inside docstring
+ '"""', // line 3
+ '', // line 4
+ '@app.get("/real")', // line 5 — real
+ ].join('\n');
+ const nodes = fastapiResolver.extractNodes!('app.py', content);
+ const real = nodes.find((n) => n.name.includes('/real'));
+ expect(real).toBeDefined();
+ expect(real!.startLine).toBe(5);
+ });
+ });
+
+ describe('Django URL patterns', () => {
+ it('skips path() inside `#` comments', () => {
+ const content = [
+ 'from django.urls import path',
+ '# example: path("fake/", fake_view)',
+ 'urlpatterns = [path("real/", real_view)]',
+ ].join('\n');
+ const nodes = djangoResolver.extractNodes!('urls.py', content);
+ const names = nodes.map((n) => n.name);
+ expect(names).toContain('real/');
+ expect(names).not.toContain('fake/');
+ });
+ });
+
+ describe('Express', () => {
+ it('skips routes inside `//` comments', () => {
+ const content = [
+ 'const app = express();',
+ '// app.get("/fake", fakeHandler);',
+ 'app.get("/real", realHandler);',
+ ].join('\n');
+ const nodes = expressResolver.extractNodes!('server.js', content);
+ const names = nodes.map((n) => n.name);
+ expect(names.some((n) => n.includes('/real'))).toBe(true);
+ expect(names.some((n) => n.includes('/fake'))).toBe(false);
+ });
+
+ it('skips routes inside `/* ... */` block comments', () => {
+ const content = [
+ '/*',
+ ' * app.post("/blockfake", h);',
+ ' */',
+ 'app.get("/real", h);',
+ ].join('\n');
+ const nodes = expressResolver.extractNodes!('server.js', content);
+ const names = nodes.map((n) => n.name);
+ expect(names.some((n) => n.includes('/real'))).toBe(true);
+ expect(names.some((n) => n.includes('/blockfake'))).toBe(false);
+ });
+ });
+
+ describe('Laravel', () => {
+ it('skips routes inside PHP `//` and `#` comments', () => {
+ const content = [
+        '<?php',
+        '// Route::get("/jsfake", $handler);',
+        '# Route::get("/perlfake", $handler);',
+        'Route::get("/real", $handler);',
+      ].join('\n');
+      const nodes = laravelResolver.extractNodes!('routes/web.php', content);
+      const names = nodes.map((n) => n.name);
+ expect(names.some((n) => n.includes('/real'))).toBe(true);
+ expect(names.some((n) => n.includes('/jsfake'))).toBe(false);
+ expect(names.some((n) => n.includes('/perlfake'))).toBe(false);
+ });
+ });
+
+ describe('Rust', () => {
+ it('skips actix/rocket routes inside `///` doc comments', () => {
+ const content = [
+ '/// Example route: #[get("/docfake")]',
+ '#[get("/real")]',
+ 'fn real() {}',
+ ].join('\n');
+ const nodes = rustResolver.extractNodes!('main.rs', content);
+ const names = nodes.map((n) => n.name);
+ expect(names.some((n) => n.includes('/real'))).toBe(true);
+ expect(names.some((n) => n.includes('/docfake'))).toBe(false);
+ });
+ });
+
+ describe('ASP.NET (C#)', () => {
+ it('skips route attributes inside `///` XML doc comments', () => {
+ const content = [
+        '/// <summary>',
+        '/// Example: [HttpGet("/docfake")]',
+        '/// </summary>',
+ '[HttpGet("/real")]',
+ 'public class C {}',
+ ].join('\n');
+ const nodes = aspnetResolver.extractNodes!('Controller.cs', content);
+ const names = nodes.map((n) => n.name);
+ expect(names.some((n) => n.includes('/real'))).toBe(true);
+ expect(names.some((n) => n.includes('/docfake'))).toBe(false);
+ });
+
+ it('skips minimal-API MapGet/MapPost calls inside comments', () => {
+ // Regression: the minimalApiPattern loop below the routePatterns
+ // loop was initially missed when applying the strip helper, leaving
+ // commented-out `app.MapGet("/x")` calls extracted as real routes.
+ const content = [
+ '// app.MapGet("/linefake", h);',
+ '/*',
+ ' * app.MapPost("/blockfake", h);',
+ ' */',
+ 'app.MapGet("/real", h);',
+ ].join('\n');
+ const nodes = aspnetResolver.extractNodes!('Program.cs', content);
+ const names = nodes.map((n) => n.name);
+ expect(names.some((n) => n.includes('/real'))).toBe(true);
+ expect(names.some((n) => n.includes('/linefake'))).toBe(false);
+ expect(names.some((n) => n.includes('/blockfake'))).toBe(false);
+ });
+ });
+});
+
+describe('stripCommentsForRegex preserves line offsets', () => {
+ it('keeps newlines so match.index → original line number', () => {
+ const input = '"""\n@app.get("/x")\n"""\n@app.get("/y")';
+ const out = stripCommentsForRegex(input, 'python');
+ // Newlines preserved
+ expect(out.split('\n').length).toBe(input.split('\n').length);
+ // The /y route survives
+ expect(out).toContain('/y');
+ // The docstring contents are blanked
+ expect(out).not.toContain('/x');
+ });
+});
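+
+// Why the newline guarantee matters (hedged sketch; resolver internals
+// may differ): route extractors typically recover a match's 1-based
+// line number as
+//
+//   const line = stripped.slice(0, match.index).split('\n').length;
+//
+// which is only correct if stripping blanks out comment bodies without
+// deleting newlines.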
diff --git a/__tests__/extraction.test.ts b/__tests__/extraction.test.ts
index 8a70ffed..d4f7344c 100644
--- a/__tests__/extraction.test.ts
+++ b/__tests__/extraction.test.ts
@@ -3079,3 +3079,420 @@ describe('Directory Exclusion', () => {
expect(files.every((f) => !f.includes('vendor'))).toBe(true);
});
});
+
+// =============================================================================
+// R Extraction
+// =============================================================================
+
+describe('R Extraction', () => {
+ describe('Language detection', () => {
+ it('should detect R files', () => {
+ expect(detectLanguage('script.R')).toBe('r');
+ expect(detectLanguage('utils.r')).toBe('r');
+ });
+
+ it('should report R as supported', () => {
+ expect(isLanguageSupported('r')).toBe(true);
+ expect(getSupportedLanguages()).toContain('r');
+ });
+ });
+
+ describe('Function extraction', () => {
+ it('should extract a function defined with <-', () => {
+ const code = `add <- function(a, b) {
+ a + b
+}`;
+ const result = extractFromSource('main.R', code);
+ const fn = result.nodes.find((n) => n.kind === 'function' && n.name === 'add');
+ expect(fn).toBeDefined();
+ expect(fn?.signature).toBe('(a, b)');
+ });
+
+ it('should extract a function defined with =', () => {
+ const code = `subtract = function(a, b) a - b`;
+ const result = extractFromSource('main.R', code);
+ const fn = result.nodes.find((n) => n.kind === 'function' && n.name === 'subtract');
+ expect(fn).toBeDefined();
+ });
+
+ it('should extract a function defined with <<-', () => {
+ const code = `divide <<- function(a, b) a / b`;
+ const result = extractFromSource('main.R', code);
+ const fn = result.nodes.find((n) => n.kind === 'function' && n.name === 'divide');
+ expect(fn).toBeDefined();
+ });
+
+ it('should extract S3 method names verbatim (period in name)', () => {
+ const code = `print.myClass <- function(x, ...) cat(x$value)`;
+ const result = extractFromSource('print.R', code);
+ const fn = result.nodes.find((n) => n.kind === 'function' && n.name === 'print.myClass');
+ expect(fn).toBeDefined();
+ });
+
+ it('should NOT emit anonymous function nodes for inline lambdas', () => {
+ const code = `result <- lapply(xs, function(x) x * 2)`;
+ const result = extractFromSource('main.R', code);
+ expect(result.nodes.find((n) => n.kind === 'function')).toBeUndefined();
+ });
+
+ it('should attach a docstring from preceding roxygen comments', () => {
+ const code = `#' Add two numbers
+#' @param a numeric
+#' @param b numeric
+add <- function(a, b) a + b`;
+ const result = extractFromSource('main.R', code);
+ const fn = result.nodes.find((n) => n.kind === 'function' && n.name === 'add');
+ expect(fn?.docstring).toContain('Add two numbers');
+ });
+ });
+
+ describe('Call extraction', () => {
+ it('should extract simple function calls inside a function body', () => {
+ const code = `wrap <- function(x) {
+ inner(x)
+ another(x)
+}`;
+ const result = extractFromSource('main.R', code);
+ const fn = result.nodes.find((n) => n.kind === 'function' && n.name === 'wrap')!;
+ const calls = result.unresolvedReferences.filter(
+ (r) => r.fromNodeId === fn.id && r.referenceKind === 'calls'
+ );
+ const calleeNames = calls.map((c) => c.referenceName);
+ expect(calleeNames).toContain('inner');
+ expect(calleeNames).toContain('another');
+ });
+
+ it('should preserve namespace operator in callee name (pkg::fn)', () => {
+ const code = `runner <- function() {
+ dplyr::filter(df, x > 0)
+}`;
+ const result = extractFromSource('main.R', code);
+ const fn = result.nodes.find((n) => n.kind === 'function' && n.name === 'runner')!;
+ const calleeNames = result.unresolvedReferences
+ .filter((r) => r.fromNodeId === fn.id)
+ .map((r) => r.referenceName);
+ expect(calleeNames).toContain('dplyr::filter');
+ });
+ });
+
+ describe('Imports', () => {
+ it('should extract library() with bare-identifier argument', () => {
+ const code = `library(dplyr)`;
+ const result = extractFromSource('main.R', code);
+ const importNode = result.nodes.find((n) => n.kind === 'import');
+ expect(importNode?.name).toBe('dplyr');
+ });
+
+ it('should extract library() with quoted-string argument', () => {
+ const code = `library("tidyr")`;
+ const result = extractFromSource('main.R', code);
+ const importNode = result.nodes.find((n) => n.kind === 'import' && n.name === 'tidyr');
+ expect(importNode).toBeDefined();
+ });
+
+ it('should extract require() the same way as library()', () => {
+ const code = `require(ggplot2)`;
+ const result = extractFromSource('main.R', code);
+ const importNode = result.nodes.find((n) => n.kind === 'import' && n.name === 'ggplot2');
+ expect(importNode).toBeDefined();
+ });
+
+ it('should extract source() with a string path', () => {
+ const code = `source("helpers.R")`;
+ const result = extractFromSource('main.R', code);
+ const importNode = result.nodes.find((n) => n.kind === 'import' && n.name === 'helpers.R');
+ expect(importNode).toBeDefined();
+ });
+
+ it('should not emit an import node for a dynamic source() argument', () => {
+ const code = `source(paste0(BASE, "/helpers.R"))`;
+ const result = extractFromSource('main.R', code);
+ const imports = result.nodes.filter((n) => n.kind === 'import');
+ expect(imports.length).toBe(0);
+ });
+
+ it('should unquote R 4.0+ raw string literals (round delimiter)', () => {
+ const code = `source(r"(helpers.R)")`;
+ const result = extractFromSource('main.R', code);
+ const importNode = result.nodes.find((n) => n.kind === 'import' && n.name === 'helpers.R');
+ expect(importNode).toBeDefined();
+ });
+
+ it('should unquote R raw strings with bracket and brace delimiters', () => {
+ const r1 = extractFromSource('a.R', `library(R"[mypkg]")`);
+ const r2 = extractFromSource('b.R', `library(r"{mypkg}")`);
+ expect(r1.nodes.find((n) => n.kind === 'import' && n.name === 'mypkg')).toBeDefined();
+ expect(r2.nodes.find((n) => n.kind === 'import' && n.name === 'mypkg')).toBeDefined();
+ });
+
+ it('should unquote dash-delimited raw strings used to embed quotes', () => {
+ const code = `source(r"-(file.R)-")`;
+ const result = extractFromSource('main.R', code);
+ const importNode = result.nodes.find((n) => n.kind === 'import' && n.name === 'file.R');
+ expect(importNode).toBeDefined();
+ });
+ });
+
+ describe('Top-level constants', () => {
+ it('should extract top-level non-function assignments as constants', () => {
+ const code = `PI <- 3.14159
+COLORS <- c("red", "green")`;
+ const result = extractFromSource('main.R', code);
+ const pi = result.nodes.find((n) => n.kind === 'constant' && n.name === 'PI');
+ const colors = result.nodes.find((n) => n.kind === 'constant' && n.name === 'COLORS');
+ expect(pi).toBeDefined();
+ expect(colors).toBeDefined();
+ });
+
+ it('should NOT emit a constant for assignments inside a function body', () => {
+ const code = `outer <- function() {
+ x <- 5
+ x
+}`;
+ const result = extractFromSource('main.R', code);
+ const innerVar = result.nodes.find((n) => n.kind === 'constant' && n.name === 'x');
+ expect(innerVar).toBeUndefined();
+ });
+ });
+});
+
+// =============================================================================
+// HCL / Terraform Extraction
+// =============================================================================
+
+describe('HCL / Terraform Extraction', () => {
+ describe('Language detection', () => {
+ it('should detect HCL/Terraform files', () => {
+ expect(detectLanguage('main.tf')).toBe('hcl');
+ expect(detectLanguage('terraform.tfvars')).toBe('hcl');
+ expect(detectLanguage('config.hcl')).toBe('hcl');
+ });
+
+ it('should report HCL as supported', () => {
+ expect(isLanguageSupported('hcl')).toBe(true);
+ expect(getSupportedLanguages()).toContain('hcl');
+ });
+ });
+
+ describe('Block extraction', () => {
+ it('should extract a resource block as a class node', () => {
+ const code = `resource "aws_s3_bucket" "logs" { bucket = "my-logs" }`;
+ const result = extractFromSource('main.tf', code);
+
+ const node = result.nodes.find((n) => n.qualifiedName === 'aws_s3_bucket.logs');
+ expect(node).toBeDefined();
+ expect(node?.kind).toBe('class');
+ expect(node?.name).toBe('aws_s3_bucket.logs');
+ expect(node?.language).toBe('hcl');
+ expect(node?.signature).toBe('resource "aws_s3_bucket" "logs"');
+ });
+
+ it('should extract a data block with `data.` prefix', () => {
+ const code = `data "aws_caller_identity" "current" {}`;
+ const result = extractFromSource('main.tf', code);
+
+ const node = result.nodes.find((n) => n.qualifiedName === 'data.aws_caller_identity.current');
+ expect(node).toBeDefined();
+ expect(node?.kind).toBe('class');
+ expect(node?.name).toBe('aws_caller_identity.current');
+ });
+
+ it('should extract a variable block', () => {
+ const code = `variable "environment" { type = string }`;
+ const result = extractFromSource('main.tf', code);
+
+ const node = result.nodes.find((n) => n.qualifiedName === 'var.environment');
+ expect(node).toBeDefined();
+ expect(node?.kind).toBe('variable');
+ expect(node?.name).toBe('environment');
+ });
+
+ it('should extract an output block as an export', () => {
+ const code = `output "vpc_id" { value = "abc" }`;
+ const result = extractFromSource('main.tf', code);
+
+ const node = result.nodes.find((n) => n.qualifiedName === 'output.vpc_id');
+ expect(node).toBeDefined();
+ expect(node?.kind).toBe('export');
+ expect(node?.name).toBe('vpc_id');
+ });
+
+ it('should extract a module block', () => {
+ const code = `module "vpc" { source = "terraform-aws-modules/vpc/aws" }`;
+ const result = extractFromSource('main.tf', code);
+
+ const node = result.nodes.find((n) => n.qualifiedName === 'module.vpc');
+ expect(node).toBeDefined();
+ expect(node?.kind).toBe('module');
+ expect(node?.name).toBe('vpc');
+ });
+
+ it('should extract a provider block as namespace', () => {
+ const code = `provider "aws" { region = "us-east-1" }`;
+ const result = extractFromSource('main.tf', code);
+
+ const node = result.nodes.find((n) => n.qualifiedName === 'provider.aws');
+ expect(node).toBeDefined();
+ expect(node?.kind).toBe('namespace');
+ });
+
+ it('should split a locals block into one constant per attribute', () => {
+ const code = `locals {
+ bucket_name = "my-bucket"
+ retention = 30
+}`;
+ const result = extractFromSource('main.tf', code);
+
+ const bucketName = result.nodes.find((n) => n.qualifiedName === 'local.bucket_name');
+ const retention = result.nodes.find((n) => n.qualifiedName === 'local.retention');
+ expect(bucketName?.kind).toBe('constant');
+ expect(retention?.kind).toBe('constant');
+ });
+
+ it('should connect blocks to the file via contains edges', () => {
+ const code = `resource "aws_s3_bucket" "logs" {}`;
+ const result = extractFromSource('main.tf', code);
+
+ const fileNode = result.nodes.find((n) => n.kind === 'file');
+ const resourceNode = result.nodes.find((n) => n.qualifiedName === 'aws_s3_bucket.logs');
+ expect(fileNode).toBeDefined();
+ expect(resourceNode).toBeDefined();
+ const containsEdge = result.edges.find(
+ (e) => e.source === fileNode!.id && e.target === resourceNode!.id && e.kind === 'contains'
+ );
+ expect(containsEdge).toBeDefined();
+ });
+ });
+
+ describe('Reference extraction', () => {
+ it('should extract var.X references', () => {
+ const code = `resource "aws_s3_bucket" "logs" { bucket = var.bucket_name }`;
+ const result = extractFromSource('main.tf', code);
+
+ const ref = result.unresolvedReferences.find((r) => r.referenceName === 'var.bucket_name');
+ expect(ref).toBeDefined();
+ expect(ref?.referenceKind).toBe('references');
+ });
+
+ it('should extract local.X references', () => {
+ const code = `resource "aws_s3_bucket" "logs" { tags = local.common_tags }`;
+ const result = extractFromSource('main.tf', code);
+
+ const ref = result.unresolvedReferences.find((r) => r.referenceName === 'local.common_tags');
+ expect(ref).toBeDefined();
+ });
+
+ it('should extract module.X references and stop at the module name', () => {
+ const code = `output "vpc_id" { value = module.vpc.vpc_id }`;
+ const result = extractFromSource('main.tf', code);
+
+ const ref = result.unresolvedReferences.find((r) => r.referenceName === 'module.vpc');
+ expect(ref).toBeDefined();
+ // Should NOT emit a reference for the trailing attribute
+ expect(result.unresolvedReferences.find((r) => r.referenceName === 'module.vpc.vpc_id')).toBeUndefined();
+ });
+
+ it('should extract data.T.N references with both labels', () => {
+ const code = `output "x" { value = data.aws_caller_identity.current.account_id }`;
+ const result = extractFromSource('main.tf', code);
+
+ const ref = result.unresolvedReferences.find(
+ (r) => r.referenceName === 'data.aws_caller_identity.current'
+ );
+ expect(ref).toBeDefined();
+ });
+
+ it('should extract resource references as TYPE.NAME', () => {
+ const code = `resource "aws_s3_bucket_versioning" "v" { bucket = aws_s3_bucket.logs.id }`;
+ const result = extractFromSource('main.tf', code);
+
+ const ref = result.unresolvedReferences.find((r) => r.referenceName === 'aws_s3_bucket.logs');
+ expect(ref).toBeDefined();
+ });
+
+ it('should extract references inside string interpolations', () => {
+ const code = 'locals { name = "${var.environment}-${random_id.suffix.hex}" }';
+ const result = extractFromSource('main.tf', code);
+
+ const names = result.unresolvedReferences.map((r) => r.referenceName);
+ expect(names).toContain('var.environment');
+ expect(names).toContain('random_id.suffix');
+ });
+
+ it('should ignore references to count, each, self, and path', () => {
+ const code = `resource "aws_instance" "web" {
+ count = 3
+ tags = { Name = "web-\${count.index}", For = each.value, Self = self.id, P = path.module }
+}`;
+ const result = extractFromSource('main.tf', code);
+
+ const names = result.unresolvedReferences.map((r) => r.referenceName);
+ expect(names.find((n) => n.startsWith('count.'))).toBeUndefined();
+ expect(names.find((n) => n.startsWith('each.'))).toBeUndefined();
+ expect(names.find((n) => n.startsWith('self.'))).toBeUndefined();
+ expect(names.find((n) => n.startsWith('path.'))).toBeUndefined();
+ });
+
+ it('should ignore for-loop iteration variables', () => {
+ const code = `output "ids" { value = [for s in var.subnets : s.id] }`;
+ const result = extractFromSource('main.tf', code);
+
+ const names = result.unresolvedReferences.map((r) => r.referenceName);
+ // var.subnets reference comes through, but `s.id` does NOT
+ expect(names).toContain('var.subnets');
+ expect(names.find((n) => n.startsWith('s.'))).toBeUndefined();
+ });
+
+ it('should ignore key/value bindings in for-object expressions', () => {
+ const code = `locals { tags = { for k, v in var.input : k => "\${v}-suffix" } }`;
+ const result = extractFromSource('main.tf', code);
+
+ const names = result.unresolvedReferences.map((r) => r.referenceName);
+ expect(names).toContain('var.input');
+ expect(names.find((n) => n === 'k' || n.startsWith('k.'))).toBeUndefined();
+ expect(names.find((n) => n === 'v' || n.startsWith('v.'))).toBeUndefined();
+ });
+
+ it('should emit an imports edge for module source', () => {
+ const code = `module "vpc" { source = "terraform-aws-modules/vpc/aws" }`;
+ const result = extractFromSource('main.tf', code);
+
+ const importRef = result.unresolvedReferences.find(
+ (r) => r.referenceKind === 'imports' && r.referenceName === 'terraform-aws-modules/vpc/aws'
+ );
+ expect(importRef).toBeDefined();
+ });
+ });
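+
+ // The skip rules above amount to a root-prefix check (hedged sketch;
+ // `BUILTIN_ROOTS` and `forVars` are hypothetical names):
+ //
+ //   const BUILTIN_ROOTS = new Set(['count', 'each', 'self', 'path']);
+ //   if (BUILTIN_ROOTS.has(root) || forVars.has(root)) continue;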
+
+ describe('Robustness', () => {
+ it('should handle empty files', () => {
+ const result = extractFromSource('main.tf', '');
+ const fileNode = result.nodes.find((n) => n.kind === 'file');
+ expect(fileNode).toBeDefined();
+ });
+
+ it('should handle blocks with no body', () => {
+ const code = `data "aws_caller_identity" "current" {}`;
+ const result = extractFromSource('main.tf', code);
+ expect(result.nodes.find((n) => n.qualifiedName === 'data.aws_caller_identity.current')).toBeDefined();
+ });
+
+ it('should walk nested blocks for references without emitting child nodes', () => {
+ const code = `resource "aws_s3_bucket_versioning" "v" {
+ bucket = aws_s3_bucket.logs.id
+ versioning_configuration {
+ status = var.versioning_status
+ }
+}`;
+ const result = extractFromSource('main.tf', code);
+
+ // Only one block-level node, plus the file
+ const blockNodes = result.nodes.filter((n) => n.kind === 'class');
+ expect(blockNodes.length).toBe(1);
+
+ // References from the nested block should still be captured
+ const names = result.unresolvedReferences.map((r) => r.referenceName);
+ expect(names).toContain('aws_s3_bucket.logs');
+ expect(names).toContain('var.versioning_status');
+ });
+ });
+});
diff --git a/__tests__/foundation.test.ts b/__tests__/foundation.test.ts
index 9ee437da..97c04dcb 100644
--- a/__tests__/foundation.test.ts
+++ b/__tests__/foundation.test.ts
@@ -305,7 +305,7 @@ describe('Database Connection', () => {
const version = db.getSchemaVersion();
expect(version).not.toBeNull();
- expect(version?.version).toBe(3);
+ expect(version?.version).toBe(9);
db.close();
});
diff --git a/__tests__/index-hooks.test.ts b/__tests__/index-hooks.test.ts
new file mode 100644
index 00000000..639587f9
--- /dev/null
+++ b/__tests__/index-hooks.test.ts
@@ -0,0 +1,130 @@
+/**
+ * Index-hook framework: register a fake hook at runtime, run an
+ * indexAll/sync against a synthetic project, assert the hook ran
+ * with the expected context shape and that errors are caught.
+ *
+ * The registry's static-import list (`REGISTERED_HOOKS`) is empty
+ * on main today; tests poke at the runner directly through
+ * `runAfterIndexAll`/`runAfterSync` rather than mutating that
+ * list.
+ */
+import { describe, it, expect } from 'vitest';
+import {
+ runAfterIndexAll,
+ runAfterSync,
+ getRegisteredHooks,
+ type IndexHook,
+ type IndexHookContext,
+} from '../src/index-hooks/registry';
+import type { SyncResult } from '../src/extraction';
+
+function makeFakeContext(): IndexHookContext {
+ // Hooks should not mutate the context; for the runner-shape
+ // tests we hand them stubs typed `as any` — the runner doesn't
+ // touch any of these fields itself.
+ return {
+ projectRoot: '/tmp/fake-project',
+ /* eslint-disable @typescript-eslint/no-explicit-any */
+ config: {} as any,
+ queries: {} as any,
+ db: {} as any,
+ /* eslint-enable */
+ };
+}
+
+const fakeSyncResult: SyncResult = {
+ filesChecked: 0,
+ filesAdded: 0,
+ filesModified: 0,
+ filesRemoved: 0,
+ nodesUpdated: 0,
+ durationMs: 0,
+};
+
+describe('index-hooks registry — runner', () => {
+ it('registered hooks expose stable {name, afterIndexAll|afterSync} shape', () => {
+ const hooks = getRegisteredHooks();
+ expect(hooks.length).toBeGreaterThanOrEqual(0);
+ for (const h of hooks) {
+ expect(typeof h.name).toBe('string');
+ expect(h.afterIndexAll === undefined || typeof h.afterIndexAll === 'function').toBe(true);
+ expect(h.afterSync === undefined || typeof h.afterSync === 'function').toBe(true);
+ }
+ });
+
+ it('runAfterIndexAll returns one outcome per registered hook, swallowing per-hook errors', async () => {
+ // Registered hooks will throw on the fake `{} as any` ctx; the
+ // runner contract is to catch + report each error so one bad
+ // hook never fails the whole pass.
+ const outcomes = await runAfterIndexAll(makeFakeContext());
+ const expectedCount = getRegisteredHooks().filter((h) => h.afterIndexAll).length;
+ expect(outcomes.length).toBe(expectedCount);
+ for (const o of outcomes) {
+ expect(typeof o.name).toBe('string');
+ expect(o.phase).toBe('indexAll');
+ expect(typeof o.durationMs).toBe('number');
+ }
+ });
+
+ it('runAfterSync returns one outcome per registered hook, swallowing per-hook errors', async () => {
+ const outcomes = await runAfterSync(makeFakeContext(), fakeSyncResult);
+ const expectedCount = getRegisteredHooks().filter((h) => h.afterSync).length;
+ expect(outcomes.length).toBe(expectedCount);
+ for (const o of outcomes) {
+ expect(typeof o.name).toBe('string');
+ expect(o.phase).toBe('sync');
+ expect(typeof o.durationMs).toBe('number');
+ }
+ });
+});
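+
+// The runner contract the suite above pins down, as a hedged sketch
+// (the shipped implementation may differ in detail):
+//
+//   const outcomes = [];
+//   for (const h of getRegisteredHooks()) {
+//     if (!h.afterIndexAll) continue;
+//     const start = Date.now();
+//     try {
+//       await h.afterIndexAll(ctx);
+//     } catch {
+//       // report the error, never rethrow: one bad hook must not fail the pass
+//     }
+//     outcomes.push({ name: h.name, phase: 'indexAll', durationMs: Date.now() - start });
+//   }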
+
+describe('index-hooks runner — fake-hook injection', () => {
+ // Helper: temporarily inject a fake hook by wrapping the runner
+ // directly. The runner accepts no array argument today; this
+ // suite exercises the public surface (runAfterIndexAll /
+ // runAfterSync) by simulating what a registered hook would do.
+ // When real hooks land, REGISTERED_HOOKS in registry.ts will
+ // contain them and this fixture-style approach disappears.
+
+ it('a hook with afterIndexAll receives the context and is awaited', async () => {
+ // Build a one-off hook and call it directly — the runner's
+ // contract is "for each registered hook, await afterIndexAll
+ // if defined." We exercise that contract by calling the hook
+ // ourselves to confirm the IndexHookContext shape stays usable
+ // by hook implementations.
+ let captured: IndexHookContext | null = null;
+ const hook: IndexHook = {
+ name: 'fake-hook',
+ async afterIndexAll(ctx) {
+ captured = ctx;
+ },
+ };
+ const ctx = makeFakeContext();
+ await hook.afterIndexAll!(ctx);
+ expect(captured).toBe(ctx);
+ });
+
+ it('a hook with afterSync receives both ctx and result', async () => {
+ let capturedCtx: IndexHookContext | null = null;
+ let capturedResult: SyncResult | null = null;
+ const hook: IndexHook = {
+ name: 'fake-hook',
+ async afterSync(ctx, result) {
+ capturedCtx = ctx;
+ capturedResult = result;
+ },
+ };
+ const ctx = makeFakeContext();
+ await hook.afterSync!(ctx, fakeSyncResult);
+ expect(capturedCtx).toBe(ctx);
+ expect(capturedResult).toBe(fakeSyncResult);
+ });
+
+ it('a hook missing afterIndexAll is silently skipped', () => {
+ // Just a typing assertion: an IndexHook without afterIndexAll
+ // is allowed (both methods are optional).
+ const hook: IndexHook = { name: 'sync-only' };
+ expect(hook.afterIndexAll).toBeUndefined();
+ expect(hook.afterSync).toBeUndefined();
+ });
+});
diff --git a/__tests__/issue-history.test.ts b/__tests__/issue-history.test.ts
new file mode 100644
index 00000000..7c281771
--- /dev/null
+++ b/__tests__/issue-history.test.ts
@@ -0,0 +1,390 @@
+/**
+ * Issue → symbol attribution: parser unit tests + end-to-end mining
+ * against synthetic git repos.
+ */
+import { describe, it, expect, beforeEach, afterEach } from 'vitest';
+import * as fs from 'fs';
+import * as os from 'os';
+import * as path from 'path';
+import { execFileSync } from 'child_process';
+import {
+ extractSymbolFromContext,
+ extractDeclaration,
+} from '../src/issue-history/parse-diff';
+import {
+ mineIssueCommits,
+ mineIssueHistory,
+ ISSUE_REGEX,
+ LAST_MINED_ISSUES_HEAD_KEY,
+} from '../src/issue-history';
+import CodeGraph from '../src/index';
+
+let HAS_GIT = true;
+try {
+ execFileSync('git', ['--version'], { stdio: 'ignore' });
+} catch {
+ HAS_GIT = false;
+}
+
+let testDir: string;
+let cg: CodeGraph | null = null;
+
+function git(...args: string[]): string {
+ return execFileSync('git', args, {
+ cwd: testDir,
+ encoding: 'utf-8',
+ env: {
+ ...process.env,
+ GIT_AUTHOR_NAME: 'Test',
+ GIT_AUTHOR_EMAIL: 'test@example.com',
+ GIT_COMMITTER_NAME: 'Test',
+ GIT_COMMITTER_EMAIL: 'test@example.com',
+ GIT_AUTHOR_DATE: process.env.GIT_AUTHOR_DATE,
+ GIT_COMMITTER_DATE: process.env.GIT_COMMITTER_DATE,
+ },
+ stdio: ['pipe', 'pipe', 'pipe'],
+ }).trim();
+}
+
+function commitAt(date: string, files: Record<string, string>, message: string) {
+ for (const [rel, content] of Object.entries(files)) {
+ const abs = path.join(testDir, rel);
+ fs.mkdirSync(path.dirname(abs), { recursive: true });
+ fs.writeFileSync(abs, content);
+ }
+ git('add', '-A');
+ process.env.GIT_AUTHOR_DATE = date;
+ process.env.GIT_COMMITTER_DATE = date;
+ git('commit', '-m', message);
+ delete process.env.GIT_AUTHOR_DATE;
+ delete process.env.GIT_COMMITTER_DATE;
+}
+
+beforeEach(() => {
+ testDir = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-issues-'));
+});
+
+afterEach(() => {
+ delete process.env.GIT_AUTHOR_DATE;
+ delete process.env.GIT_COMMITTER_DATE;
+ if (cg) {
+ cg.destroy();
+ cg = null;
+ }
+ if (fs.existsSync(testDir)) fs.rmSync(testDir, { recursive: true, force: true });
+});
+
+// ============================================================================
+// Pure parser unit tests
+// ============================================================================
+
+describe('ISSUE_REGEX', () => {
+ it('matches all canonical Fixes/Closes/Resolves verbs', () => {
+ const cases = [
+ 'Fix #1', 'Fixes #2', 'Fixed #3',
+ 'Close #4', 'Closes #5', 'Closed #6',
+ 'Resolve #7', 'Resolves #8', 'Resolved #9',
+ ];
+ for (const s of cases) {
+ ISSUE_REGEX.lastIndex = 0;
+ expect(ISSUE_REGEX.test(s)).toBe(true);
+ }
+ });
+
+ it('matches multiple issues in a single body', () => {
+ ISSUE_REGEX.lastIndex = 0;
+ const matches = [...'Fixes #1, closes #2 and resolves #3'.matchAll(ISSUE_REGEX)];
+ expect(matches.map((m) => m[1])).toEqual(['1', '2', '3']);
+ });
+
+ it('is case-insensitive', () => {
+ ISSUE_REGEX.lastIndex = 0;
+ expect(ISSUE_REGEX.test('FIXES #42')).toBe(true);
+ });
+
+ it('does NOT match `#N` without a verb', () => {
+ ISSUE_REGEX.lastIndex = 0;
+ // A body that merely mentions #99 with no verb prefix must not match.
+ expect(ISSUE_REGEX.test('See #99 for context')).toBe(false);
+ });
+
+ it('v1 limitation: `Fixes #1, #2` only captures #1', () => {
+ // Documented behavior — the second issue lacks a verb prefix and
+ // is silently dropped. Authors who care can write `Fixes #1, fixes #2`.
+ ISSUE_REGEX.lastIndex = 0;
+ const matches = [...'Fixes #1, #2'.matchAll(ISSUE_REGEX)];
+ expect(matches.map((m) => m[1])).toEqual(['1']);
+ });
+});
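+
+// One regex satisfying every case above (hedged sketch; the shipped
+// ISSUE_REGEX may differ):
+//
+//   /\b(?:fix(?:es|ed)?|close[sd]?|resolve[sd]?)\s+#(\d+)/gi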
+
+describe('extractSymbolFromContext', () => {
+ it('pulls function name from a TS function context', () => {
+ expect(extractSymbolFromContext('function processOrder(order: Order) {')).toBe('processOrder');
+ });
+ it('pulls class name', () => {
+ expect(extractSymbolFromContext('class UserService {')).toBe('UserService');
+ });
+ it('pulls Python def', () => {
+ expect(extractSymbolFromContext('def compute_score(items):')).toBe('compute_score');
+ });
+ it('pulls Go func', () => {
+ expect(extractSymbolFromContext('func ProcessOrder(o *Order) error {')).toBe('ProcessOrder');
+ });
+ it('pulls method-style ` async foo(`', () => {
+ expect(extractSymbolFromContext(' async foo(args: string) {')).toBe('foo');
+ });
+ it('rejects keyword-only contexts', () => {
+ expect(extractSymbolFromContext(' if (x) {')).toBeNull();
+ });
+ it('returns null on empty input', () => {
+ expect(extractSymbolFromContext('')).toBeNull();
+ });
+});
+
+describe('extractDeclaration', () => {
+ it('captures + function decl', () => {
+ expect(extractDeclaration('+function helper() {')).toEqual({ name: 'helper', sign: '+' });
+ });
+ it('captures - class decl', () => {
+ expect(extractDeclaration('-export class Old {')).toEqual({ name: 'Old', sign: '-' });
+ });
+ it('captures Python def', () => {
+ expect(extractDeclaration('+def my_helper(x):')).toEqual({ name: 'my_helper', sign: '+' });
+ });
+ it('captures Go func with receiver', () => {
+ expect(extractDeclaration('+func (s *Service) DoThing() error {')).toEqual({
+ name: 'DoThing',
+ sign: '+',
+ });
+ });
+ it('skips file-marker `+++` and `---` lines', () => {
+ expect(extractDeclaration('+++ b/src/foo.ts')).toBeNull();
+ expect(extractDeclaration('--- a/src/foo.ts')).toBeNull();
+ });
+ it('skips keywords like `+if`', () => {
+ expect(extractDeclaration('+ if (x) return;')).toBeNull();
+ });
+ it('returns null on context lines (no +/-)', () => {
+ expect(extractDeclaration(' some body line')).toBeNull();
+ });
+});
+
+// ============================================================================
+// Git mining: synthetic repo
+// ============================================================================
+
+describe.skipIf(!HAS_GIT)('mineIssueCommits', () => {
+ beforeEach(() => {
+ git('init', '-q', '-b', 'main');
+ git('config', 'commit.gpgsign', 'false');
+ });
+
+ it('finds commits with `Fixes #N` in the subject', () => {
+ commitAt('2025-01-01T00:00:00Z', { 'a.ts': 'a' }, 'feat: add a (no issue)');
+ commitAt('2025-01-02T00:00:00Z', { 'a.ts': 'a2' }, 'fix: bug. Fixes #42');
+ const commits = mineIssueCommits(testDir, null);
+ expect(commits.length).toBe(1);
+ expect(commits[0]!.issues).toEqual([42]);
+ });
+
+ it('parses multi-issue subjects', () => {
+ commitAt('2025-01-01T00:00:00Z', { 'a.ts': 'a' }, 'fix: triple. Fixes #1, closes #2, resolves #3');
+ const [c] = mineIssueCommits(testDir, null);
+ expect(c?.issues).toEqual([1, 2, 3]);
+ });
+
+ it('ignores commits with no issue ref', () => {
+ commitAt('2025-01-01T00:00:00Z', { 'a.ts': 'a' }, 'plain message');
+ expect(mineIssueCommits(testDir, null).length).toBe(0);
+ });
+
+ it('returns [] when not in a git repo', () => {
+ const nonGit = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-nogit-'));
+ try {
+ expect(mineIssueCommits(nonGit, null)).toEqual([]);
+ } finally {
+ fs.rmSync(nonGit, { recursive: true, force: true });
+ }
+ });
+});
+
+// ============================================================================
+// End-to-end through CodeGraph
+// ============================================================================
+
+describe.skipIf(!HAS_GIT)('CodeGraph issue history', () => {
+ beforeEach(() => {
+ git('init', '-q', '-b', 'main');
+ git('config', 'commit.gpgsign', 'false');
+ });
+
+ it('attributes a Fixes #N commit to the modified function', async () => {
+ commitAt('2025-01-01T00:00:00Z', {
+ 'src/a.ts': `export function foo() { return 1; }\n`,
+ }, 'feat: add foo');
+
+ commitAt('2025-02-01T00:00:00Z', {
+ 'src/a.ts': `export function foo() {\n // changed\n return 2;\n}\n`,
+ }, 'fix: bug. Fixes #42');
+
+ cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } });
+ await cg.indexAll();
+
+ const node = cg.getNodesInFile('src/a.ts').find((n) => n.name === 'foo')!;
+ expect(node).toBeDefined();
+ const issues = cg.getIssuesForNode(node.id);
+ expect(issues.length).toBeGreaterThan(0);
+ expect(issues.some((i) => i.issueNumber === 42)).toBe(true);
+ });
+
+ it('tracks the agent-usable multi-issue signal', async () => {
+ // Simulate the codegraph history pattern: `loadGrammarsForLanguages`
+ // touched by every language-add issue (#54, #82, #83, #85).
+ commitAt('2025-01-01T00:00:00Z', {
+ 'src/grammar.ts': `export function loadGrammarsForLanguages() { return []; }\n`,
+ }, 'feat: add grammar loader');
+
+ commitAt('2025-01-02T00:00:00Z', {
+ 'src/grammar.ts': `export function loadGrammarsForLanguages() {\n // R support\n return [];\n}\n`,
+ }, 'feat: add R support. Fixes #82');
+
+ commitAt('2025-01-03T00:00:00Z', {
+ 'src/grammar.ts': `export function loadGrammarsForLanguages() {\n // R + HCL support\n return [];\n}\n`,
+ }, 'feat: add HCL. Fixes #83');
+
+ commitAt('2025-01-04T00:00:00Z', {
+ 'src/grammar.ts': `export function loadGrammarsForLanguages() {\n // R + HCL + SQL\n return [];\n}\n`,
+ }, 'feat: add SQL. Fixes #85');
+
+ cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } });
+ await cg.indexAll();
+
+ const node = cg.getNodesByKind("function").find((n) => n.name === 'loadGrammarsForLanguages')!;
+ expect(node).toBeDefined();
+ const issues = cg.getIssuesForNode(node.id);
+ const issueNumbers = [...new Set(issues.map((i) => i.issueNumber))].sort((a, b) => a - b);
+ expect(issueNumbers).toEqual([82, 83, 85]);
+ });
+
+ it('records `added` kind for symbols introduced in a Fixes commit', async () => {
+ commitAt('2025-01-01T00:00:00Z', {
+ 'src/a.ts': `export function existing() { return 1; }\n`,
+ }, 'init');
+
+ commitAt('2025-02-01T00:00:00Z', {
+ 'src/a.ts': `export function existing() { return 1; }\nexport function brandNew() { return 2; }\n`,
+ }, 'feat: add brandNew. Fixes #100');
+
+ cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } });
+ await cg.indexAll();
+
+ const node = cg.getNodesByKind("function").find((n) => n.name === 'brandNew')!;
+ const issues = cg.getIssuesForNode(node.id);
+ expect(issues.some((i) => i.issueNumber === 100 && i.kind === 'added')).toBe(true);
+ });
+
+ it('drops attributions for symbols that no longer exist', async () => {
+ // Symbol added then removed in two separate `Fixes` commits. The
+ // current index has no node for it, so attributions for the removed
+ // symbol must not appear (FK + drop-on-resolve).
+ commitAt('2025-01-01T00:00:00Z', {
+ 'src/a.ts': `export function staysHere() { return 1; }\nexport function temporary() { return 99; }\n`,
+ }, 'feat: add. Fixes #1');
+
+ commitAt('2025-02-01T00:00:00Z', {
+ 'src/a.ts': `export function staysHere() { return 1; }\n`,
+ }, 'fix: drop temporary. Fixes #2');
+
+ cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } });
+ await cg.indexAll();
+
+ // staysHere should have at least the #1 attribution (added).
+ const node = cg.getNodesByKind("function").find((n) => n.name === 'staysHere')!;
+ const issues = cg.getIssuesForNode(node.id);
+ expect(issues.some((i) => i.issueNumber === 1)).toBe(true);
+
+ // No node should exist named `temporary`, and no attribution to
+ // issue #2 should reference a node that doesn't exist.
+ expect(cg.getNodesByKind("function").find((n) => n.name === 'temporary')).toBeUndefined();
+ });
+
+ it('survives indexAll outside a git repo (table empty, no errors)', async () => {
+ fs.rmSync(path.join(testDir, '.git'), { recursive: true, force: true });
+ fs.writeFileSync(path.join(testDir, 'a.ts'), `export function x() { return 1; }\n`);
+ cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } });
+ await cg.indexAll();
+ const nodes = cg.getNodesInFile('a.ts');
+ expect(nodes.length).toBeGreaterThan(0);
+ for (const n of nodes) expect(cg.getIssuesForNode(n.id)).toEqual([]);
+ });
+
+ it('respects enableIssueHistory=false', async () => {
+ commitAt('2025-01-01T00:00:00Z', {
+ 'src/a.ts': `export function foo() { return 1; }\n`,
+ }, 'init');
+ commitAt('2025-01-02T00:00:00Z', {
+ 'src/a.ts': `export function foo() { return 2; }\n`,
+ }, 'fix: foo. Fixes #1');
+
+ cg = CodeGraph.initSync(testDir, {
+ config: { include: ['**/*.ts'], exclude: [], enableIssueHistory: false },
+ });
+ await cg.indexAll();
+ const node = cg.getNodesInFile('src/a.ts').find((n) => n.name === 'foo')!;
+ expect(cg.getIssuesForNode(node.id)).toEqual([]);
+ });
+
+ it('incrementally picks up new Fixes commits on sync', async () => {
+ commitAt('2025-01-01T00:00:00Z', {
+ 'src/a.ts': `export function foo() { return 1; }\n`,
+ }, 'init');
+
+ cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } });
+ await cg.indexAll();
+ const node = cg.getNodesInFile('src/a.ts').find((n) => n.name === 'foo')!;
+ expect(cg.getIssuesForNode(node.id).length).toBe(0);
+
+ commitAt('2025-02-01T00:00:00Z', {
+ 'src/a.ts': `export function foo() { return 2; }\n`,
+ }, 'fix: foo. Fixes #50');
+ await cg.sync();
+
+ const issues = cg.getIssuesForNode(node.id);
+ expect(issues.some((i) => i.issueNumber === 50)).toBe(true);
+ });
+
+ // (Removed: a defensive test for the v4-migration-collision bug class.
+ // With file-based migrations (NNN-name.ts), two migrations claiming
+ // the same version produces a filesystem-level conflict — the silent
+ // skip the defensive guard protected against can no longer happen.)
+
+ it('recovers from an unreachable last_mined_issues_head', async () => {
+ commitAt('2025-01-01T00:00:00Z', {
+ 'src/a.ts': `export function foo() { return 1; }\n`,
+ }, 'init');
+ commitAt('2025-02-01T00:00:00Z', {
+ 'src/a.ts': `export function foo() { return 2; }\n`,
+ }, 'fix: foo. Fixes #1');
+
+ cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } });
+ await cg.indexAll();
+ const node = cg.getNodesInFile('src/a.ts').find((n) => n.name === 'foo')!;
+ expect(
+ [...new Set(cg.getIssuesForNode(node.id).map((i) => i.issueNumber))]
+ ).toEqual([1]);
+
+ // Simulate force-push / gc by storing an unreachable SHA.
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
+ (cg as any).queries.setMetadata(LAST_MINED_ISSUES_HEAD_KEY, '0'.repeat(40));
+
+ commitAt('2025-03-01T00:00:00Z', {
+ 'src/a.ts': `export function foo() { return 3; }\n`,
+ }, 'fix: foo again. Fixes #2');
+ await cg.sync();
+
+ const issueNums = [
+ ...new Set(cg.getIssuesForNode(node.id).map((i) => i.issueNumber)),
+ ].sort((a, b) => a - b);
+ expect(issueNums).toEqual([1, 2]);
+ });
+});
diff --git a/__tests__/language-registry.test.ts b/__tests__/language-registry.test.ts
new file mode 100644
index 00000000..9afdd59a
--- /dev/null
+++ b/__tests__/language-registry.test.ts
@@ -0,0 +1,157 @@
+/**
+ * Language registry: structural invariants.
+ *
+ * These tests guard against the "parallel list" failure mode that
+ * the registry refactor exists to prevent. If a future PR adds a
+ * grammar-backed language but forgets to wire it through one of
+ * the derived consumers, one of these tests should catch it.
+ */
+import { describe, it, expect } from 'vitest';
+import {
+ getLanguageDefs,
+ getLanguageDefByExtension,
+ getLanguageDefByName,
+} from '../src/extraction/languages/registry';
+import { EXTRACTORS } from '../src/extraction/languages';
+import {
+ detectLanguage,
+ isLanguageSupported,
+ getSupportedLanguages,
+ getLanguageDisplayName,
+ EXTENSION_MAP,
+} from '../src/extraction/grammars';
+
+describe('language registry — single source of truth', () => {
+ it('has at least the original 19 languages', () => {
+ const defs = getLanguageDefs();
+ expect(defs.length).toBeGreaterThanOrEqual(19);
+ });
+
+ it('every def has unique non-empty name', () => {
+ const names = new Set<string>();
+ for (const def of getLanguageDefs()) {
+ expect(def.name).toBeTruthy();
+ expect(names.has(def.name)).toBe(false);
+ names.add(def.name);
+ }
+ });
+
+ it('extensions are unique across registry (one ext maps to one language)', () => {
+ const seen = new Map<string, string>();
+ for (const def of getLanguageDefs()) {
+ for (const ext of def.extensions) {
+ const lower = ext.toLowerCase();
+ if (seen.has(lower)) {
+ // The .h ambiguity (C vs C++) is intentionally pinned to C
+ // by the registry; tree-sitter.ts has a content-sniff
+ // override. Anything else duplicating extensions is a bug.
+ throw new Error(
+ `Extension ${lower} mapped twice: ${seen.get(lower)} and ${def.name}`
+ );
+ }
+ seen.set(lower, def.name);
+ }
+ }
+ });
+
+ it('grammar-backed defs have wasmFile + extractor', () => {
+ for (const def of getLanguageDefs()) {
+ if (!def.grammar) continue;
+ expect(def.grammar.wasmFile).toMatch(/^tree-sitter-.+\.wasm$/);
+ expect(def.grammar.extractor).toBeDefined();
+ }
+ });
+
+ it('custom-extractor defs have a customExtractor function', () => {
+ for (const def of getLanguageDefs()) {
+ if (def.grammar) continue; // grammar-backed
+ expect(def.customExtractor).toBeInstanceOf(Function);
+ }
+ });
+});
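+
+// The def shape these invariants exercise, as a hedged sketch covering
+// only the fields the assertions touch (the real interface may carry
+// more; the wasm filename here is illustrative):
+//
+//   {
+//     name: 'typescript',
+//     displayName: 'TypeScript',
+//     extensions: ['.ts', ...],
+//     grammar?: { wasmFile: 'tree-sitter-typescript.wasm', extractor },
+//     customExtractor?,    // required when `grammar` is absent
+//     extensionOverrides?, // e.g. Pascal's .dfm/.fmx
+//   }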
+
+describe('derived consumers stay in sync with the registry', () => {
+ // Catch the "parallel list drift" bug that motivated this refactor.
+ // If a new language gets added to registry but a derived consumer
+ // still hard-codes the old set, one of these will fail.
+
+ it('EXTRACTORS contains exactly the grammar-backed languages', () => {
+ const grammarBacked = getLanguageDefs()
+ .filter((d) => d.grammar)
+ .map((d) => d.name)
+ .sort();
+ const extractorKeys = Object.keys(EXTRACTORS).sort();
+ expect(extractorKeys).toEqual(grammarBacked);
+ });
+
+ it('every grammar-backed extractor matches def.grammar.extractor exactly', () => {
+ for (const def of getLanguageDefs()) {
+ if (!def.grammar) continue;
+ expect(EXTRACTORS[def.name as keyof typeof EXTRACTORS]).toBe(def.grammar.extractor);
+ }
+ });
+
+ it('EXTENSION_MAP entries exactly mirror registry extensions', () => {
+ const expected = new Map<string, string>();
+ for (const def of getLanguageDefs()) {
+ for (const ext of def.extensions) {
+ expected.set(ext.toLowerCase(), def.name);
+ }
+ }
+ for (const [ext, lang] of expected) {
+ expect(EXTENSION_MAP[ext]).toBe(lang);
+ }
+ // Reverse: no extra keys in EXTENSION_MAP.
+ expect(Object.keys(EXTENSION_MAP).sort()).toEqual([...expected.keys()].sort());
+ });
+
+ it('detectLanguage returns the expected name for every registered extension', () => {
+ for (const def of getLanguageDefs()) {
+ for (const ext of def.extensions) {
+ // .h is pinned to C by the registry; the C++ heuristic only
+ // applies when source is provided AND looks like C++.
+ expect(detectLanguage(`x${ext}`)).toBe(def.name);
+ }
+ }
+ });
+
+ it('isLanguageSupported returns true for every registered language and false for unknown', () => {
+ for (const def of getLanguageDefs()) {
+ expect(isLanguageSupported(def.name as never)).toBe(true);
+ }
+ expect(isLanguageSupported('unknown' as never)).toBe(false);
+ });
+
+ it('getSupportedLanguages returns exactly the registry names', () => {
+ const fromRegistry = getLanguageDefs().map((d) => d.name).sort();
+ const supported = (getSupportedLanguages() as string[]).sort();
+ expect(supported).toEqual(fromRegistry);
+ });
+
+ it('getLanguageDisplayName uses each def\'s displayName', () => {
+ for (const def of getLanguageDefs()) {
+ expect(getLanguageDisplayName(def.name as never)).toBe(def.displayName);
+ }
+ });
+});
+
+describe('lookup helpers', () => {
+ it('getLanguageDefByName returns the def for a registered name', () => {
+ expect(getLanguageDefByName('typescript')?.displayName).toBe('TypeScript');
+ });
+
+ it('getLanguageDefByName returns undefined for unknown names', () => {
+ expect(getLanguageDefByName('nonexistent-language-name')).toBeUndefined();
+ });
+
+ it('getLanguageDefByExtension is case-insensitive', () => {
+ expect(getLanguageDefByExtension('.TS')?.name).toBe('typescript');
+ expect(getLanguageDefByExtension('.ts')?.name).toBe('typescript');
+ });
+
+ it('Pascal extensionOverrides routes .dfm and .fmx to a customExtractor', () => {
+ const def = getLanguageDefByName('pascal');
+ expect(def?.extensionOverrides?.['.dfm']?.customExtractor).toBeInstanceOf(Function);
+ expect(def?.extensionOverrides?.['.fmx']?.customExtractor).toBeInstanceOf(Function);
+ });
+});
diff --git a/__tests__/mcp-tool-registry.test.ts b/__tests__/mcp-tool-registry.test.ts
new file mode 100644
index 00000000..2da0efc5
--- /dev/null
+++ b/__tests__/mcp-tool-registry.test.ts
@@ -0,0 +1,82 @@
+/**
+ * MCP tool registry: structural invariants.
+ *
+ * Guards against the failure mode where a future PR adds a
+ * ToolModule but forgets to implement the matching `handle`
+ * method on ToolHandler (or vice versa).
+ */
+import { describe, it, expect } from 'vitest';
+import { getToolModules, tools as registryTools } from '../src/mcp/tools/registry';
+import { ToolHandler, tools } from '../src/mcp/tools';
+
+describe('MCP tool registry — single source of truth', () => {
+ it('every tool module has a non-empty name and description', () => {
+ for (const m of getToolModules()) {
+ expect(m.definition.name).toMatch(/^codegraph_[a-z_]+$/);
+ expect(m.definition.description.length).toBeGreaterThan(20);
+ }
+ });
+
+ it('handlerKey is a string starting with "handle"', () => {
+ for (const m of getToolModules()) {
+ expect(m.handlerKey).toMatch(/^handle[A-Z][A-Za-z]+$/);
+ }
+ });
+
+ it('every registered tool has a corresponding ToolHandler method', () => {
+ const handler = new ToolHandler(null);
+ for (const m of getToolModules()) {
+ const fn = (handler as unknown as Record<string, unknown>)[m.handlerKey];
+ expect(typeof fn).toBe('function');
+ }
+ });
+
+ it('exported `tools` array exactly mirrors the registry', () => {
+ const fromRegistry = registryTools.map((t) => t.name).sort();
+ const fromExport = tools.map((t) => t.name).sort();
+ expect(fromExport).toEqual(fromRegistry);
+ });
+
+ it('all main-line tools are registered (regression guard)', () => {
+ const expected = [
+ 'codegraph_callees',
+ 'codegraph_callers',
+ 'codegraph_config',
+ 'codegraph_context',
+ 'codegraph_explore',
+ 'codegraph_files',
+ 'codegraph_hotspots',
+ 'codegraph_impact',
+ 'codegraph_node',
+ 'codegraph_search',
+ 'codegraph_sql',
+ 'codegraph_status',
+ ];
+ const actual = getToolModules()
+ .map((m) => m.definition.name)
+ .sort();
+ expect(actual).toEqual(expected);
+ });
+
+ it('execute() reports unknown-tool errors', async () => {
+ const handler = new ToolHandler(null);
+ const result = await handler.execute('codegraph_does_not_exist', {});
+ expect(result.isError).toBe(true);
+ expect(result.content[0]?.text).toMatch(/Unknown tool/);
+ });
+
+ it('execute() actually dispatches to the registered handler (no broken `this` binding)', async () => {
+ // No CodeGraph instance is bound, so handlers that call
+ // `getCodeGraph()` will throw — the dispatch should catch it
+ // and return an error result. The point of this test is to
+ // confirm the registry lookup + `this[handlerKey](args)` chain
+ // reaches an actual method body, not that the body succeeds.
+ const handler = new ToolHandler(null);
+ const result = await handler.execute('codegraph_status', {});
+ expect(result.isError).toBe(true);
+ // Generic tool-execution-failed envelope from execute()'s catch block.
+ expect(result.content[0]?.text).toMatch(/Tool execution failed/);
+ // Specifically because no CodeGraph was bound:
+ expect(result.content[0]?.text).toMatch(/CodeGraph not initialized/);
+ });
+});
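+
+// Each registered module couples a tool definition to the ToolHandler
+// method that serves it (hedged sketch; `handleStatus` is a
+// hypothetical pairing consistent with the handlerKey pattern):
+//
+//   {
+//     definition: { name: 'codegraph_status', description: /* >20 chars */ },
+//     handlerKey: 'handleStatus',
+//   }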
diff --git a/__tests__/migrations-registry.test.ts b/__tests__/migrations-registry.test.ts
new file mode 100644
index 00000000..9fa15eed
--- /dev/null
+++ b/__tests__/migrations-registry.test.ts
@@ -0,0 +1,95 @@
+/**
+ * Migration registry: structural invariants.
+ *
+ * Guards against the silent-no-op bug class that motivated this
+ * refactor. If a future PR introduces a duplicate version,
+ * out-of-order versions, or fails to register a new migration
+ * file, one of these tests fails loudly.
+ */
+import { describe, it, expect } from 'vitest';
+import * as fs from 'fs';
+import * as path from 'path';
+import {
+ ALL_MIGRATIONS,
+ CURRENT_SCHEMA_VERSION,
+} from '../src/db/migrations';
+
+describe('migration registry — structural invariants', () => {
+ it('registry is non-empty', () => {
+ expect(ALL_MIGRATIONS.length).toBeGreaterThan(0);
+ });
+
+ it('versions are unique', () => {
+ const seen = new Set<number>();
+ for (const m of ALL_MIGRATIONS) {
+ expect(seen.has(m.version)).toBe(false);
+ seen.add(m.version);
+ }
+ });
+
+ it('versions are strictly ascending', () => {
+ for (let i = 1; i < ALL_MIGRATIONS.length; i++) {
+ expect(ALL_MIGRATIONS[i]!.version).toBeGreaterThan(
+ ALL_MIGRATIONS[i - 1]!.version
+ );
+ }
+ });
+
+ it('each migration has a non-empty description and a function up()', () => {
+ for (const m of ALL_MIGRATIONS) {
+ expect(m.description.length).toBeGreaterThan(0);
+ expect(typeof m.up).toBe('function');
+ }
+ });
+
+ it('CURRENT_SCHEMA_VERSION matches the highest registered version', () => {
+ const max = ALL_MIGRATIONS[ALL_MIGRATIONS.length - 1]!.version;
+ expect(CURRENT_SCHEMA_VERSION).toBe(max);
+ });
+});
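+
+// Each registry entry couples a version to its DDL (hedged sketch of
+// the fields the assertions above touch):
+//
+//   { version: 9, description: 'non-empty text', up(db) { /* DDL */ } }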
+
+describe('migration files — filename ↔ version coupling', () => {
+ // Read the actual filenames on disk and assert each matches an
+ // entry in the registry. Catches the case where someone drops a
+ // new file in src/db/migrations/ but forgets to register it.
+ const migrationsDir = path.resolve(__dirname, '../src/db/migrations');
+ const SUPPORT_FILES = new Set(['index.ts', 'types.ts']);
+ const STRICT_NNN_PATTERN = /^\d{3}-[a-z0-9]+(?:-[a-z0-9]+)*\.ts$/;
+
+ function listMigrationFiles(): string[] {
+ return fs.readdirSync(migrationsDir).filter((f) => f.endsWith('.ts') && !SUPPORT_FILES.has(f));
+ }
+
+ it('every migration file matches the strict `NNN-kebab-name.ts` pattern', () => {
+ const offenders: string[] = [];
+ for (const f of listMigrationFiles()) {
+ if (!STRICT_NNN_PATTERN.test(f)) {
+ offenders.push(f);
+ }
+ }
+ expect(offenders).toEqual([]);
+ });
+
+ it('every src/db/migrations/NNN-*.ts file is registered (no orphan files)', () => {
+ const files = listMigrationFiles().filter((f) => STRICT_NNN_PATTERN.test(f));
+ expect(files.length).toBeGreaterThan(0);
+ const registeredVersions = new Set(ALL_MIGRATIONS.map((m) => m.version));
+ for (const f of files) {
+ const version = parseInt(f.slice(0, 3), 10);
+ if (!registeredVersions.has(version)) {
+ throw new Error(
+ `Migration file ${f} exists on disk but is not registered in src/db/migrations/index.ts. ` +
+ `Add an import + array entry for it.`
+ );
+ }
+ }
+ });
+
+ it('every registered version has a matching NNN-*.ts file (no phantom registrations)', () => {
+ const files = listMigrationFiles().filter((f) => STRICT_NNN_PATTERN.test(f));
+ const filenameVersions = new Set(files.map((f) => parseInt(f.slice(0, 3), 10)));
+ for (const m of ALL_MIGRATIONS) {
+ expect(filenameVersions.has(m.version)).toBe(true);
+ }
+ });
+});
diff --git a/__tests__/pr19-improvements.test.ts b/__tests__/pr19-improvements.test.ts
index 5fbe17d7..b69d9068 100644
--- a/__tests__/pr19-improvements.test.ts
+++ b/__tests__/pr19-improvements.test.ts
@@ -299,7 +299,7 @@ describe('Best-Candidate Resolution', () => {
describe('Schema v2 Migration', () => {
it.skipIf(!HAS_SQLITE)('should have correct current schema version', async () => {
const { CURRENT_SCHEMA_VERSION } = await import('../src/db/migrations');
- expect(CURRENT_SCHEMA_VERSION).toBe(3);
+ expect(CURRENT_SCHEMA_VERSION).toBe(9);
});
it.skipIf(!HAS_SQLITE)('should have migration for version 2', async () => {
diff --git a/__tests__/search-quality.test.ts b/__tests__/search-quality.test.ts
new file mode 100644
index 00000000..8e069776
--- /dev/null
+++ b/__tests__/search-quality.test.ts
@@ -0,0 +1,302 @@
+/**
+ * Search Quality Tests
+ *
+ * Regression tests for the FTS improvements that bring natural-language
+ * and partial-identifier queries into the top of the result set:
+ * - Subword tokens (camel/snake split) so `parser` finds `getParser`.
+ * - Porter stemmer so `parsing` matches `parser`/`parses`.
+ * - Stopword stripping so `"how"` / `"the"` don't crowd out the
+ * real terms via docstring matches.
+ *
+ * All measurements were captured against codegraph's own src/ during
+ * development. Targets that previously ranked #18, #19, or missed the
+ * top 20 now land in the top 5.
+ */
+
+import { describe, it, expect, beforeEach, afterEach } from 'vitest';
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+import { DatabaseConnection } from '../src/db';
+import { QueryBuilder } from '../src/db/queries';
+import { Node } from '../src/types';
+import { splitIdentifierTokens, buildNameSubwords } from '../src/utils';
+import { filterStopwords, STOP_WORDS } from '../src/search/query-utils';
+import { runMigrations, getCurrentVersion } from '../src/db/migrations';
+
+describe('splitIdentifierTokens', () => {
+ it('splits camelCase', () => {
+ expect(splitIdentifierTokens('getParser')).toEqual(['get', 'parser']);
+ });
+
+ it('splits PascalCase', () => {
+ expect(splitIdentifierTokens('DatabaseConnection')).toEqual(['database', 'connection']);
+ });
+
+ it('splits XMLHttpRequest-style runs of capitals', () => {
+ expect(splitIdentifierTokens('XMLHttpRequest')).toEqual(['xml', 'http', 'request']);
+ });
+
+ it('splits snake_case', () => {
+ expect(splitIdentifierTokens('database_connection')).toEqual(['database', 'connection']);
+ });
+
+ it('splits kebab-case and dots and slashes', () => {
+ expect(splitIdentifierTokens('foo-bar.baz/qux')).toEqual(['foo', 'bar', 'baz', 'qux']);
+ });
+
+ it('keeps single-word identifiers as-is', () => {
+ expect(splitIdentifierTokens('parse')).toEqual(['parse']);
+ });
+
+ it('handles trailing/leading underscores', () => {
+ expect(splitIdentifierTokens('__init__')).toEqual(['init']);
+ });
+
+ it('preserves numbers as part of the surrounding token', () => {
+ expect(splitIdentifierTokens('parseV2')).toEqual(['parse', 'v2']);
+ });
+});
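+
+// A splitting rule consistent with all of the above, as a hedged sketch
+// (`splitSketch` is hypothetical; the shipped tokenizer may differ):
+//
+//   function splitSketch(id: string): string[] {
+//     return id
+//       .replace(/([a-z0-9])([A-Z])/g, '$1 $2')    // camelCase boundary
+//       .replace(/([A-Z]+)([A-Z][a-z])/g, '$1 $2') // XMLHttp → XML Http
+//       .split(/[^A-Za-z0-9]+/)                    // _, -, ., /
+//       .filter(Boolean)
+//       .map((t) => t.toLowerCase());
+//   }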
+
+describe('buildNameSubwords', () => {
+ it('preserves the original identifier so direct queries still hit', () => {
+ const out = buildNameSubwords('getParser');
+ expect(out.split(' ')).toContain('getParser');
+ });
+
+ it('appends split tokens', () => {
+ const out = buildNameSubwords('getParser').split(' ');
+ expect(out).toContain('get');
+ expect(out).toContain('parser');
+ });
+
+ it('dedupes single-word identifiers (no "parse parse")', () => {
+ expect(buildNameSubwords('parse')).toBe('parse');
+ });
+
+ it('dedupes when split produces a single token equal to the original', () => {
+ // 'foo' has no boundary, so splitIdentifierTokens returns ['foo'];
+ // without dedup we would store 'foo foo'.
+ const out = buildNameSubwords('foo').split(' ');
+ expect(out).toEqual(['foo']);
+ });
+
+ it('handles empty string without crashing', () => {
+ expect(buildNameSubwords('')).toBe('');
+ });
+});
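+
+// buildNameSubwords is, in effect (hedged sketch):
+//
+//   dedupe([name, ...splitIdentifierTokens(name)]).join(' ')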
+
+describe('filterStopwords (shared with query-utils.ts)', () => {
+ it('drops common English stopwords', () => {
+ expect(filterStopwords(['how', 'does', 'parsing', 'work']))
+ // 'work' is also in STOP_WORDS, so the result is just 'parsing'
+ .toEqual(['parsing']);
+ });
+
+ it('returns the original list when every term is a stopword', () => {
+ // Otherwise we would produce an empty FTS query.
+ const allStopwords = ['the', 'a', 'an'];
+ expect(filterStopwords(allStopwords)).toEqual(allStopwords);
+ });
+
+ it('does not strip common identifier-like words', () => {
+ // `get` / `set` / `find` could be method names; never treated as stopwords.
+ expect(filterStopwords(['get', 'set', 'find', 'name']))
+ .toEqual(['get', 'set', 'find', 'name']);
+ expect(STOP_WORDS.has('get')).toBe(false);
+ });
+});
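+
+// filterStopwords is, in effect (hedged sketch):
+//
+//   const kept = terms.filter((t) => !STOP_WORDS.has(t));
+//   return kept.length > 0 ? kept : terms; // never emit an empty FTS query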
+
+describe('FTS5 search quality (integration)', () => {
+ let dir: string;
+ let db: DatabaseConnection;
+ let q: QueryBuilder;
+
+ function makeNode(id: string, name: string, kind: Node['kind'], docstring?: string): Node {
+ return {
+ id,
+ kind,
+ name,
+ qualifiedName: name,
+ filePath: `src/${name}.ts`,
+ language: 'typescript',
+ startLine: 1,
+ endLine: 1,
+ startColumn: 0,
+ endColumn: 0,
+ docstring,
+ updatedAt: Date.now(),
+ };
+ }
+
+ beforeEach(() => {
+ dir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-search-quality-'));
+ db = DatabaseConnection.initialize(path.join(dir, 'test.db'));
+ q = new QueryBuilder(db.getDb());
+ });
+
+ afterEach(() => {
+ db.close();
+ if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true });
+ });
+
+ it('finds getParser for a `parser` query (subword tokens)', () => {
+ q.insertNodes([
+ makeNode('n1', 'getParser', 'function'),
+ makeNode('n2', 'unrelated', 'function'),
+ ]);
+ const results = q.searchNodes('parser', { limit: 10 });
+ expect(results.find((r) => r.node.name === 'getParser')).toBeDefined();
+ });
+
+ it('finds DatabaseConnection for a `connection` query (subword tokens)', () => {
+ q.insertNodes([
+ makeNode('n1', 'DatabaseConnection', 'class'),
+ makeNode('n2', 'unrelated', 'function'),
+ ]);
+ const results = q.searchNodes('connection', { limit: 10 });
+ expect(results.find((r) => r.node.name === 'DatabaseConnection')).toBeDefined();
+ });
+
+ it('matches `parsing` against `getParser` via Porter stemmer', () => {
+ q.insertNodes([
+ makeNode('n1', 'getParser', 'function'),
+ makeNode('n2', 'unrelated', 'function'),
+ ]);
+ const results = q.searchNodes('parsing', { limit: 10 });
+ expect(results.find((r) => r.node.name === 'getParser')).toBeDefined();
+ });
+
+ it('matches `resolves references` against resolveOne', () => {
+ q.insertNodes([
+ makeNode('n1', 'resolveOne', 'method'),
+ makeNode('n2', 'unrelated', 'function'),
+ ]);
+ const results = q.searchNodes('resolves references', { limit: 10 });
+ expect(results.find((r) => r.node.name === 'resolveOne')).toBeDefined();
+ });
+
+ it('strips stopwords so `how does parser work` finds getParser', () => {
+ // Without stopword stripping the docstring of `unrelated` (containing
+ // "how" and "does") would BM25-flood the result list.
+ q.insertNodes([
+ makeNode('n1', 'getParser', 'function'),
+ makeNode(
+ 'n2',
+ 'unrelated',
+ 'function',
+ 'How does this work? It does many things — does, does, does.'
+ ),
+ ]);
+ const results = q.searchNodes('how does parser work', { limit: 10 });
+ const ranks = new Map(results.map((r, i) => [r.node.name, i + 1]));
+ const parserRank = ranks.get('getParser');
+ const unrelatedRank = ranks.get('unrelated');
+ expect(parserRank).toBeDefined();
+ if (unrelatedRank !== undefined) {
+ expect(parserRank).toBeLessThan(unrelatedRank);
+ }
+ });
+
+ it('exact identifier search still works (no regression on direct queries)', () => {
+ q.insertNodes([
+ makeNode('n1', 'ExtractionOrchestrator', 'class'),
+ makeNode('n2', 'extraction', 'variable'),
+ makeNode('n3', 'orchestrator', 'variable'),
+ ]);
+ const results = q.searchNodes('ExtractionOrchestrator', { limit: 10 });
+ expect(results[0].node.name).toBe('ExtractionOrchestrator');
+ });
+});
+
+describe('Migration v4: backfill name_subwords + rebuild FTS', () => {
+ let dir: string;
+
+ beforeEach(() => {
+ dir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-migr-v4-fts-'));
+ });
+
+ afterEach(() => {
+ if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true });
+ });
+
+ it('rebuilds FTS so subword search works on previously-indexed nodes', () => {
+ // Build a v3-shape database from explicit SQL — the pre-PR schema —
+ // then run forward migrations and verify search works end-to-end.
+ // This is a faithful simulation of an upgrade from a real v3 install.
+ const Database = require('better-sqlite3');
+ const dbHandle = new Database(path.join(dir, 'test.db'));
+ dbHandle.pragma('foreign_keys = ON');
+ dbHandle.exec(`
+ CREATE TABLE schema_versions (version INTEGER PRIMARY KEY, applied_at INTEGER NOT NULL, description TEXT);
+ INSERT INTO schema_versions (version, applied_at, description) VALUES (3, 0, 'v3');
+ CREATE TABLE nodes (
+ id TEXT PRIMARY KEY, kind TEXT NOT NULL, name TEXT NOT NULL,
+ qualified_name TEXT NOT NULL, file_path TEXT NOT NULL, language TEXT NOT NULL,
+ start_line INTEGER NOT NULL, end_line INTEGER NOT NULL,
+ start_column INTEGER NOT NULL, end_column INTEGER NOT NULL,
+ docstring TEXT, signature TEXT, visibility TEXT,
+ is_exported INTEGER DEFAULT 0, is_async INTEGER DEFAULT 0,
+ is_static INTEGER DEFAULT 0, is_abstract INTEGER DEFAULT 0,
+ decorators TEXT, type_parameters TEXT, updated_at INTEGER NOT NULL
+ );
+ CREATE VIRTUAL TABLE nodes_fts USING fts5(
+ id, name, qualified_name, docstring, signature,
+ content='nodes', content_rowid='rowid'
+ );
+ CREATE TRIGGER nodes_ai AFTER INSERT ON nodes BEGIN
+ INSERT INTO nodes_fts(rowid, id, name, qualified_name, docstring, signature)
+ VALUES (NEW.rowid, NEW.id, NEW.name, NEW.qualified_name, NEW.docstring, NEW.signature);
+ END;
+ INSERT INTO nodes (id, kind, name, qualified_name, file_path, language,
+ start_line, end_line, start_column, end_column, updated_at)
+ VALUES ('n1', 'function', 'getParser', 'getParser', 'a.ts', 'typescript', 1, 1, 0, 0, 0);
+ `);
+
+ expect(getCurrentVersion(dbHandle)).toBe(3);
+
+ // Apply forward migrations (4..N including the FTS-subwords pass).
+ runMigrations(dbHandle, 3);
+ expect(getCurrentVersion(dbHandle)).toBeGreaterThanOrEqual(9);
+
+ // The new column was backfilled with the split subwords.
+ const row = dbHandle.prepare('SELECT name_subwords FROM nodes WHERE id = ?').get('n1') as {
+ name_subwords: string;
+ };
+ expect(row.name_subwords).toContain('parser');
+
+ // Search end-to-end via QueryBuilder works against the migrated DB.
+ const q2 = new QueryBuilder(dbHandle);
+ const results = q2.searchNodes('parser', { limit: 10 });
+ expect(results.find((r) => r.node.name === 'getParser')).toBeDefined();
+
+ dbHandle.close();
+ });
+
+ it('migration is idempotent if name_subwords column already exists', () => {
+ // Simulate a partial-failure scenario: an earlier migration run was
+ // interrupted after the ALTER TABLE landed but before the rest, so
+ // the column is present but the FTS hasn't been recreated and the
+ // schema_versions row hasn't been bumped.
+ const Database = require('better-sqlite3');
+ const dbHandle = new Database(path.join(dir, 'test.db'));
+ dbHandle.exec(`
+ CREATE TABLE schema_versions (version INTEGER PRIMARY KEY, applied_at INTEGER NOT NULL, description TEXT);
+ INSERT INTO schema_versions (version, applied_at, description) VALUES (3, 0, 'v3');
+ CREATE TABLE nodes (
+ id TEXT PRIMARY KEY, kind TEXT NOT NULL, name TEXT NOT NULL,
+ qualified_name TEXT NOT NULL, file_path TEXT NOT NULL, language TEXT NOT NULL,
+ start_line INTEGER NOT NULL, end_line INTEGER NOT NULL,
+ start_column INTEGER NOT NULL, end_column INTEGER NOT NULL,
+ docstring TEXT, signature TEXT, visibility TEXT,
+ is_exported INTEGER DEFAULT 0, is_async INTEGER DEFAULT 0,
+ is_static INTEGER DEFAULT 0, is_abstract INTEGER DEFAULT 0,
+ decorators TEXT, type_parameters TEXT, updated_at INTEGER NOT NULL,
+ name_subwords TEXT -- partial pre-existing state
+ );
+ `);
+ expect(() => runMigrations(dbHandle, 3)).not.toThrow();
+ expect(getCurrentVersion(dbHandle)).toBeGreaterThanOrEqual(9);
+ dbHandle.close();
+ });
+});
diff --git a/__tests__/security.test.ts b/__tests__/security.test.ts
index 53441d58..1c62e648 100644
--- a/__tests__/security.test.ts
+++ b/__tests__/security.test.ts
@@ -533,3 +533,36 @@ describe('Symlink Cycle Detection', () => {
expect(files).toContain('src/valid.ts');
});
});
+
+describe('ReDoS-safe glob matching', () => {
+ it('coalesces runs of `*` so hostile inputs do not produce nested quantifiers', async () => {
+ const { globToSafeRegex } = await import('../src/utils');
+ // Two or more stars collapse to a single recursive wildcard. This is the
+ // ReDoS protection: `*****` doesn't expand to `[^/]*[^/]*[^/]*[^/]*[^/]*`,
+ // which on a long input could catastrophically backtrack.
+ expect(globToSafeRegex('*****')).toBe('.*');
+ expect(globToSafeRegex('**')).toBe('.*');
+
+ // Even a constructed-from-hostile-input regex matches in linear time.
+ const regex = new RegExp(`^${globToSafeRegex('*****')}foo$`);
+ const start = Date.now();
+ // 100k 'a's followed by something that doesn't end in 'foo'.
+ expect(regex.test('a'.repeat(100000) + 'bar')).toBe(false);
+ expect(Date.now() - start).toBeLessThan(500);
+ });
+
+ it('rejects pathologically long glob inputs', async () => {
+ const { globToSafeRegex } = await import('../src/utils');
+ expect(globToSafeRegex('*'.repeat(2000))).toBeNull();
+ });
+
+ it('preserves the standard glob semantics for common patterns', async () => {
+ const { globToSafeRegex } = await import('../src/utils');
+ const body = globToSafeRegex('src/**/*.test.ts');
+ expect(body).toBeDefined();
+ const regex = new RegExp(`^${body}$`);
+ expect(regex.test('src/lib/foo.test.ts')).toBe(true);
+ expect(regex.test('src/lib/foo.ts')).toBe(false);
+ expect(regex.test('other/src/foo.test.ts')).toBe(false);
+ });
+});
diff --git a/__tests__/sql-refs.test.ts b/__tests__/sql-refs.test.ts
new file mode 100644
index 00000000..7fb201c7
--- /dev/null
+++ b/__tests__/sql-refs.test.ts
@@ -0,0 +1,339 @@
+/**
+ * SQL call-site tests: parser unit tests + end-to-end through CodeGraph.
+ */
+import { describe, it, expect, beforeEach, afterEach } from 'vitest';
+import * as fs from 'fs';
+import * as os from 'os';
+import * as path from 'path';
+import { extractSqlRefs } from '../src/sql-refs';
+import CodeGraph from '../src/index';
+
+let testDir: string;
+let cg: CodeGraph | null = null;
+
+function write(rel: string, content: string) {
+ const abs = path.join(testDir, rel);
+ fs.mkdirSync(path.dirname(abs), { recursive: true });
+ fs.writeFileSync(abs, content);
+}
+
+beforeEach(() => {
+ testDir = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-sql-'));
+});
+
+afterEach(() => {
+ if (cg) {
+ cg.destroy();
+ cg = null;
+ }
+ if (fs.existsSync(testDir)) fs.rmSync(testDir, { recursive: true, force: true });
+});
+
+// ============================================================================
+// Pure parser tests
+// ============================================================================
+
+describe('extractSqlRefs', () => {
+ it('captures FROM as a read', () => {
+ write('a.ts', `db.prepare('SELECT id FROM users WHERE id = ?');\n`);
+ const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null);
+ expect(refs).toHaveLength(1);
+ expect(refs[0]!).toMatchObject({ tableName: 'users', op: 'read' });
+ });
+
+ it('captures INSERT INTO as a write', () => {
+ write('a.ts', `db.prepare('INSERT INTO logs (msg) VALUES (?)');\n`);
+ const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null);
+ expect(refs).toHaveLength(1);
+ expect(refs[0]!).toMatchObject({ tableName: 'logs', op: 'write' });
+ });
+
+ it('captures UPDATE ... SET as a write', () => {
+ write('a.ts', `db.run('UPDATE users SET name = ? WHERE id = ?', ['x', 1]);\n`);
+ const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null);
+ expect(refs).toHaveLength(1);
+ expect(refs[0]!).toMatchObject({ tableName: 'users', op: 'write' });
+ });
+
+ it('captures DELETE FROM as a write (and not as a read)', () => {
+ write('a.ts', `db.run('DELETE FROM sessions WHERE expired_at < ?');\n`);
+ const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null);
+ // Both regexes (DELETE FROM as write, FROM as read) hit, so we expect
+ // two refs for the same table but different ops.
+ expect(refs.map((r) => r.op).sort()).toEqual(['read', 'write']);
+ expect(new Set(refs.map((r) => r.tableName))).toEqual(new Set(['sessions']));
+ });
+
+ it('captures CREATE TABLE / ALTER / DROP as ddl', () => {
+ write(
+ 'a.ts',
+ [
+ `db.exec('CREATE TABLE IF NOT EXISTS audit (id INTEGER)');`,
+ `db.exec('ALTER TABLE audit ADD COLUMN ts INTEGER');`,
+ `db.exec('DROP TABLE IF EXISTS audit_old');`,
+ ].join('\n')
+ );
+ const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null);
+ const ddls = refs.filter((r) => r.op === 'ddl');
+ expect(new Set(ddls.map((r) => r.tableName))).toEqual(new Set(['audit', 'audit_old']));
+ });
+
+ it('captures JOIN as a read', () => {
+ write(
+ 'a.ts',
+ `db.prepare('SELECT u.name, p.title FROM users u JOIN posts p ON p.user_id = u.id');\n`
+ );
+ const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null);
+ const tables = new Set(refs.map((r) => r.tableName));
+ expect(tables).toEqual(new Set(['users', 'posts']));
+ });
+
+ it('handles backtick (MySQL) and double-quoted (Postgres) identifiers', () => {
+ write(
+ 'a.ts',
+ [
+ "db.prepare('SELECT id FROM `mysql_table`');",
+ `db.prepare('SELECT id FROM "pg_table"');`,
+ ].join('\n')
+ );
+ const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null);
+ expect(new Set(refs.map((r) => r.tableName))).toEqual(
+ new Set(['mysql_table', 'pg_table'])
+ );
+ });
+
+ it('handles schema-qualified identifiers (drops the schema, keeps the table)', () => {
+ write('a.ts', `db.prepare('SELECT * FROM public.users');\n`);
+ const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null);
+ expect(refs[0]!.tableName).toBe('users');
+ });
+
+ it('does NOT match a JS variable named like a SQL keyword', () => {
+ // Without the FROM/INTO/etc. prefix, a bare identifier `users` is
+ // not caught — that's the whole point vs. plain grep.
+ write('a.ts', `const users = await loadUsers();\nfor (const user of users) {}\n`);
+ const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null);
+ expect(refs).toEqual([]);
+ });
+
+ it('skips unsupported languages (e.g. swift) without error', () => {
+ write('a.swift', `let q = "SELECT id FROM users"\n`);
+ const refs = extractSqlRefs(testDir, [{ path: 'a.swift', language: 'swift' }], () => null);
+ expect(refs).toEqual([]);
+ });
+
+ it('captures the correct 1-indexed line number', () => {
+ write(
+ 'a.ts',
+ [`// blah`, `// blah`, `db.prepare('SELECT * FROM line_three');`, `// blah`].join('\n')
+ );
+ const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null);
+ expect(refs[0]).toEqual(expect.objectContaining({ tableName: 'line_three', line: 3 }));
+ });
+
+ it('threads the resolveEnclosing closure correctly', () => {
+ write('a.ts', `db.prepare('SELECT * FROM t');\n`);
+ const calls: Array<[string, number]> = [];
+ extractSqlRefs(
+ testDir,
+ [{ path: 'a.ts', language: 'typescript' }],
+ (filePath, line) => {
+ calls.push([filePath, line]);
+ return 'fake-id';
+ }
+ );
+ expect(calls).toEqual([['a.ts', 1]]);
+ });
+
+ it('drops reserved-word "table names" (WHERE/ON/AS/SELECT)', () => {
+ // Common over-match: `JOIN ... ON x = y` would otherwise pick up
+ // `ON` as the table name. The reserved set blocks that.
+ write('a.ts', `db.prepare('SELECT * FROM users JOIN posts ON posts.uid = users.id');\n`);
+ const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null);
+ const names = new Set(refs.map((r) => r.tableName));
+ expect(names).toEqual(new Set(['users', 'posts']));
+ });
+
+ it('handles multiple SQL operations on a single line', () => {
+ write(
+ 'a.ts',
+ `db.exec('CREATE TABLE foo (id INTEGER); INSERT INTO foo VALUES (1)');\n`
+ );
+ const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null);
+ const ops = new Set(refs.map((r) => `${r.tableName}|${r.op}`));
+ expect(ops).toEqual(new Set(['foo|ddl', 'foo|write']));
+ });
+
+ it('survives a missing file (skips, no throw)', () => {
+ const refs = extractSqlRefs(
+ testDir,
+ [{ path: 'missing.ts', language: 'typescript' }],
+ () => null
+ );
+ expect(refs).toEqual([]);
+ });
+
+ it('rejects prose comments containing a quoted SQL example', () => {
+ // Reviewer-flagged regression: a comment like
+ // // example: db.prepare('SELECT name FROM the docs')
+ // used to falsely match `the` as a table because the quote inside
+ // the comment passed isInsideString(). The comment-stripper now
+ // removes everything after `//` before the regex sees the line.
+ write(
+ 'a.ts',
+ [
+ `// example: db.prepare('SELECT name FROM the docs')`,
+ `// "SELECT id FROM the comment"`,
+ `function ok() {`,
+ ` // sample SELECT FROM users in a comment — should be ignored`,
+ ` return 1;`,
+ `}`,
+ ].join('\n')
+ );
+ const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null);
+ expect(refs).toEqual([]);
+ });
+
+ it('rejects same-line block comments containing a quoted SQL example', () => {
+ write(
+ 'a.ts',
+ `/* "SELECT * FROM ghost" */ const x = 1;\n`
+ );
+ const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null);
+ expect(refs).toEqual([]);
+ });
+
+ it('still keeps a real SQL call when there is a trailing comment', () => {
+ write('a.ts', `db.prepare('SELECT * FROM users'); // good doc\n`);
+ const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null);
+ expect(refs.length).toBe(1);
+ expect(refs[0]!.tableName).toBe('users');
+ });
+
+ it('strips Python `#` comments', () => {
+ write(
+ 'a.py',
+ `# example: db.execute('SELECT * FROM the_docs')\nrows = db.execute('SELECT * FROM real_table')\n`
+ );
+ const refs = extractSqlRefs(testDir, [{ path: 'a.py', language: 'python' }], () => null);
+ expect(refs.map((r) => r.tableName)).toEqual(['real_table']);
+ });
+});
+
+// ============================================================================
+// End-to-end through CodeGraph
+// ============================================================================
+
+describe('CodeGraph SQL refs', () => {
+ it('persists call sites and resolves enclosing function', async () => {
+ write(
+ 'src/db.ts',
+ [
+ `export function getUser(id: number) {`,
+ ` return db.prepare('SELECT * FROM users WHERE id = ?').get(id);`,
+ `}`,
+ ``,
+ `export function logEvent(msg: string) {`,
+ ` db.prepare('INSERT INTO events (msg) VALUES (?)').run(msg);`,
+ `}`,
+ ].join('\n')
+ );
+ cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } });
+ await cg.indexAll();
+
+ const tables = cg.getSqlTables();
+ expect(new Set(tables.map((t) => t.tableName))).toEqual(new Set(['users', 'events']));
+
+ const userSites = cg.getSqlRefsByTable('users');
+ expect(userSites[0]!.sourceName).toBe('getUser');
+
+ const eventSites = cg.getSqlRefsByTable('events');
+ expect(eventSites[0]!.sourceName).toBe('logEvent');
+ expect(eventSites[0]!.op).toBe('write');
+ });
+
+ it('reverse view: getSqlTablesForNode returns tables touched by a function', async () => {
+ write(
+ 'src/a.ts',
+ [
+ `export function multiTouch() {`,
+ ` db.prepare('SELECT * FROM a').all();`,
+ ` db.prepare('INSERT INTO b VALUES (?)').run(1);`,
+ `}`,
+ ].join('\n')
+ );
+ cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } });
+ await cg.indexAll();
+
+ const node = cg.getNodesInFile('src/a.ts').find((n) => n.name === 'multiTouch')!;
+ const touched = cg.getSqlTablesForNode(node.id);
+ const summary = touched.map((r) => `${r.tableName}|${r.op}`).sort();
+ expect(summary).toEqual(['a|read', 'b|write']);
+ });
+
+ it('case-insensitive table lookup', async () => {
+ write('src/a.ts', `db.prepare('SELECT * FROM Users');\n`);
+ cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } });
+ await cg.indexAll();
+ expect(cg.getSqlRefsByTable('users').length).toBe(1);
+ expect(cg.getSqlRefsByTable('USERS').length).toBe(1);
+ });
+
+ it('respects enableSqlRefs=false', async () => {
+ write('src/a.ts', `db.prepare('SELECT * FROM users');\n`);
+ cg = CodeGraph.initSync(testDir, {
+ config: { include: ['**/*.ts'], exclude: [], enableSqlRefs: false },
+ });
+ await cg.indexAll();
+ expect(cg.getSqlTables()).toEqual([]);
+ });
+
+ it('incremental sync replaces refs for changed files only', async () => {
+ write('src/a.ts', `db.prepare('SELECT * FROM old_table');\n`);
+ write('src/b.ts', `db.prepare('SELECT * FROM stable_table');\n`);
+ cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } });
+ await cg.indexAll();
+ expect(new Set(cg.getSqlTables().map((t) => t.tableName))).toEqual(
+ new Set(['old_table', 'stable_table'])
+ );
+
+ write('src/a.ts', `db.prepare('SELECT * FROM new_table');\n`);
+ await cg.sync();
+
+ const tables = new Set(cg.getSqlTables().map((t) => t.tableName));
+ expect(tables).toContain('new_table');
+ expect(tables).toContain('stable_table');
+ expect(tables).not.toContain('old_table');
+ });
+
+ it('drops refs when a file is edited to remove its last SQL ref', async () => {
+ // Same regression as PR C — applySqlRefs([]) shouldn't leave
+ // stale rows. Pre-deleting the changed paths in runSqlRefsPass
+ // is the fix.
+ write('src/a.ts', `db.prepare('SELECT * FROM going_away');\n`);
+ cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } });
+ await cg.indexAll();
+ expect(cg.getSqlTables().some((t) => t.tableName === 'going_away')).toBe(true);
+
+ write('src/a.ts', `// no sql here anymore\nexport const x = 1;\n`);
+ await cg.sync();
+
+ expect(cg.getSqlTables().some((t) => t.tableName === 'going_away')).toBe(false);
+ });
+
+ it('drops refs for files removed between syncs', async () => {
+ write('src/a.ts', `db.prepare('SELECT * FROM gone_table');\n`);
+ cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } });
+ await cg.indexAll();
+ expect(cg.getSqlTables().some((t) => t.tableName === 'gone_table')).toBe(true);
+
+ fs.unlinkSync(path.join(testDir, 'src/a.ts'));
+ await cg.sync();
+ expect(cg.getSqlTables().some((t) => t.tableName === 'gone_table')).toBe(false);
+ });
+
+ // (Removed: a defensive test for the v4-migration-collision bug class.
+ // With file-based migrations (NNN-name.ts), two PRs claiming the same
+ // version produce a filesystem-level conflict, so the silent skip the
+ // defensive guard protected against can no longer happen.)
+});
diff --git a/__tests__/sync.test.ts b/__tests__/sync.test.ts
index 8365f630..cb657274 100644
--- a/__tests__/sync.test.ts
+++ b/__tests__/sync.test.ts
@@ -259,4 +259,140 @@ describe('Sync Module', () => {
expect(result.changedFilePaths).toBeUndefined();
});
});
+
+ // Regression tests for the "stale index after HEAD-moving git operation"
+ // bug. `git status` only reports working-tree dirtiness vs HEAD, so a
+ // merge / pull / checkout / rebase / reset (and even post-commit) leaves
+ // a clean tree and used to trick sync into reporting "up to date" while
+ // the DB still held pre-operation content hashes. The fix detects HEAD
+ // movement by comparing current HEAD against a stored last-synced HEAD
+ // and unioning `git diff` output into the changed-file set.
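+ // Sketch of that detection (illustrative pseudocode, not the real API):
+ //   changed = statusFiles ∪ (headMoved ? diffNameOnly(lastSyncedHead, head) : ∅)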
+ describe('HEAD-moving git operations', () => {
+ let testDir: string;
+ let cg: CodeGraph;
+
+ function git(...args: string[]) {
+ execFileSync('git', args, { cwd: testDir, stdio: 'pipe' });
+ }
+
+ beforeEach(async () => {
+ testDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-head-move-'));
+
+ git('init');
+ git('config', 'user.email', 'test@test.com');
+ git('config', 'user.name', 'Test');
+ // Pin initial branch name so subsequent checkouts are deterministic
+ // across git versions that default to master vs main.
+ git('symbolic-ref', 'HEAD', 'refs/heads/main');
+
+ const srcDir = path.join(testDir, 'src');
+ fs.mkdirSync(srcDir);
+ fs.writeFileSync(
+ path.join(srcDir, 'index.ts'),
+ `export function hello() { return 'world'; }`
+ );
+
+ git('add', '-A');
+ git('commit', '-m', 'initial');
+
+ cg = CodeGraph.initSync(testDir, {
+ config: { include: ['**/*.ts'], exclude: [] },
+ });
+ await cg.indexAll();
+ });
+
+ afterEach(() => {
+ if (cg) cg.destroy();
+ if (fs.existsSync(testDir)) {
+ fs.rmSync(testDir, { recursive: true, force: true });
+ }
+ });
+
+ it('should detect changes brought in by `git merge`', async () => {
+ // Branch off, modify on the branch, commit, switch back, merge.
+ git('checkout', '-b', 'feature');
+ fs.writeFileSync(
+ path.join(testDir, 'src', 'index.ts'),
+ `export function merged() { return 'from-branch'; }`
+ );
+ fs.writeFileSync(
+ path.join(testDir, 'src', 'added.ts'),
+ `export function fromBranch() { return 1; }`
+ );
+ git('add', '-A');
+ git('commit', '-m', 'feature work');
+ git('checkout', 'main');
+ git('merge', '--no-ff', 'feature', '-m', 'merge feature');
+
+ // Working tree is clean post-merge — `git status` shows nothing.
+ const result = await cg.sync();
+
+ expect(result.filesModified + result.filesAdded).toBeGreaterThanOrEqual(2);
+ expect(cg.searchNodes('merged').length).toBeGreaterThan(0);
+ expect(cg.searchNodes('fromBranch').length).toBeGreaterThan(0);
+ expect(cg.searchNodes('hello').length).toBe(0);
+ });
+
+ it('should detect changes after `git checkout` to a different branch', async () => {
+ git('checkout', '-b', 'other');
+ fs.writeFileSync(
+ path.join(testDir, 'src', 'index.ts'),
+ `export function onOther() { return 'other'; }`
+ );
+ git('add', '-A');
+ git('commit', '-m', 'other work');
+ git('checkout', 'main');
+ // We're back on main, where `hello` exists; now switch to `other`.
+ // Before the fix, sync after that checkout would no-op because the
+ // working tree matches HEAD (= other). But the index was last synced
+ // against main, so we expect the diff main..other to flow through
+ // and bring the index in line with the current branch.
+ git('checkout', 'other');
+
+ const result = await cg.sync();
+
+ expect(result.filesModified).toBeGreaterThanOrEqual(1);
+ expect(cg.searchNodes('onOther').length).toBeGreaterThan(0);
+ expect(cg.searchNodes('hello').length).toBe(0);
+ });
+
+ it('should detect file deletion brought in by a committed change', async () => {
+ git('rm', path.join('src', 'index.ts'));
+ git('commit', '-m', 'remove index');
+
+ const result = await cg.sync();
+
+ expect(result.filesRemoved).toBe(1);
+ expect(cg.searchNodes('hello').length).toBe(0);
+ });
+
+ it('should fall back to full scan when last-synced HEAD is unreachable', async () => {
+ // Modify and commit, then rewrite history so the previously-synced
+ // HEAD (recorded by indexAll in beforeEach) is no longer reachable.
+ fs.writeFileSync(
+ path.join(testDir, 'src', 'index.ts'),
+ `export function rewritten() { return 'rewritten'; }`
+ );
+ git('add', '-A');
+ git('commit', '--amend', '-m', 'rewritten');
+ // `git gc --prune=now` would actually delete the orphaned commit, but
+ // amending already moves HEAD to a new SHA the index has never seen,
+ // and the old SHA may or may not still be reachable. We verify behavior
+ // is correct either way: sync brings the index in line with current state.
+ const result = await cg.sync();
+
+ expect(result.filesModified + result.filesAdded).toBeGreaterThanOrEqual(1);
+ expect(cg.searchNodes('rewritten').length).toBeGreaterThan(0);
+ expect(cg.searchNodes('hello').length).toBe(0);
+ });
+
+ it('should still no-op when HEAD has not moved and tree is clean', async () => {
+ // Sanity: the new HEAD-tracking code must not introduce spurious work.
+ const result = await cg.sync();
+
+ expect(result.filesAdded).toBe(0);
+ expect(result.filesModified).toBe(0);
+ expect(result.filesRemoved).toBe(0);
+ });
+ });
});
diff --git a/__tests__/watcher.test.ts b/__tests__/watcher.test.ts
index f3638e6d..a546494d 100644
--- a/__tests__/watcher.test.ts
+++ b/__tests__/watcher.test.ts
@@ -31,6 +31,19 @@ function waitFor(
});
}
+/**
+ * fs.watch on macOS (FSEvents) and Linux (inotify) has a small but real
+ * latency between `fs.watch()` returning and the kernel actually
+ * delivering events. Writing a file in that window — particularly under
+ * parallel test load when the host CPU is busy — drops the event and
+ * causes a 5s timeout for "should trigger sync after file change" style
+ * tests. This helper standardizes the settle delay to match the pattern
+ * already used by the filtering tests in this file.
+ */
+async function letWatcherSettle(): Promise<void> {
+ await new Promise((r) => setTimeout(r, 400));
+}
+
describe('FileWatcher', () => {
let testDir: string;
@@ -101,6 +114,7 @@ describe('FileWatcher', () => {
const watcher = new FileWatcher(testDir, baseConfig, syncFn, { debounceMs: 200 });
watcher.start();
+ await letWatcherSettle();
// Create a new file
fs.writeFileSync(path.join(testDir, 'src', 'new.ts'), 'export const y = 2;');
@@ -117,6 +131,7 @@ describe('FileWatcher', () => {
const watcher = new FileWatcher(testDir, baseConfig, syncFn, { debounceMs: 500 });
watcher.start();
+ await letWatcherSettle();
// Rapid-fire changes
for (let i = 0; i < 5; i++) {
@@ -145,7 +160,7 @@ describe('FileWatcher', () => {
watcher.start();
// Let watcher settle — fs.watch may fire residual events from beforeEach
- await new Promise((r) => setTimeout(r, 400));
+ await letWatcherSettle();
syncFn.mockClear();
// Create a file that doesn't match include patterns
@@ -165,7 +180,7 @@ describe('FileWatcher', () => {
watcher.start();
// Let watcher settle — fs.watch may fire residual events from beforeEach
- await new Promise((r) => setTimeout(r, 400));
+ await letWatcherSettle();
syncFn.mockClear();
// Simulate a .codegraph directory change
@@ -191,6 +206,7 @@ describe('FileWatcher', () => {
});
watcher.start();
+ await letWatcherSettle();
fs.writeFileSync(path.join(testDir, 'src', 'test.ts'), 'export const z = 3;');
@@ -209,6 +225,7 @@ describe('FileWatcher', () => {
});
watcher.start();
+ await letWatcherSettle();
fs.writeFileSync(path.join(testDir, 'src', 'test.ts'), 'export const z = 3;');
@@ -218,6 +235,36 @@ describe('FileWatcher', () => {
watcher.stop();
});
+
+ it('should retry pending changes after a sync failure (no events lost)', async () => {
+ // First call rejects, subsequent calls resolve. After the initial
+ // failure, the watcher should retry the same batch on its own — without
+ // this, transient sync failures (DB locked etc.) would silently drop the
+ // changes until a new file event happened.
+ let calls = 0;
+ const syncFn = vi.fn().mockImplementation(() => {
+ calls++;
+ if (calls === 1) return Promise.reject(new Error('transient'));
+ return Promise.resolve({ filesChanged: 1, durationMs: 5 });
+ });
+ const onSyncError = vi.fn();
+ const onSyncComplete = vi.fn();
+ const watcher = new FileWatcher(testDir, baseConfig, syncFn, {
+ debounceMs: 100,
+ onSyncError,
+ onSyncComplete,
+ });
+
+ watcher.start();
+ fs.writeFileSync(path.join(testDir, 'src', 'test.ts'), 'export const z = 3;');
+
+ await waitFor(() => onSyncComplete.mock.calls.length > 0, 5000);
+ expect(onSyncError).toHaveBeenCalledTimes(1);
+ expect(syncFn).toHaveBeenCalledTimes(2);
+ expect(onSyncComplete).toHaveBeenCalledWith({ filesChanged: 1, durationMs: 5 });
+
+ watcher.stop();
+ });
});
describe('CodeGraph integration', () => {
@@ -268,6 +315,7 @@ describe('FileWatcher', () => {
const initialNodes = initialStats.nodeCount;
cg.watch({ debounceMs: 300 });
+ await letWatcherSettle();
// Add a new file with a function
fs.writeFileSync(
diff --git a/scripts/battle-test.mjs b/scripts/battle-test.mjs
new file mode 100644
index 00000000..071ec3a4
--- /dev/null
+++ b/scripts/battle-test.mjs
@@ -0,0 +1,150 @@
+#!/usr/bin/env node
+/**
+ * Battle test: drive every feature shipped on `battle-test/all-shipped`
+ * against a real repo and print a comprehensive report.
+ *
+ * Validates:
+ * - migrations: schema is at CURRENT_SCHEMA_VERSION with every registered migration applied
+ * - extraction: nodes/edges/files indexed
+ * - centrality: PageRank scores populated, top-N nonempty
+ * - churn: per-file commit counts, LOC, last-touched timestamps
+ * - hotspots: risk scoring (centrality × churn) returns ranked rows
+ * - issue-history: Fixes/Closes/Resolves attribution
+ * - config-refs: env var read sites
+ * - sql-refs: table read/write/DDL call sites
+ * - MCP tool registry: 11 tools registered
+ * - Index-hook registry: 5 hooks registered
+ *
+ * Usage: node scripts/battle-test.mjs [targetPath]  (defaults to cwd)
+ */
+
+import path from 'node:path';
+import fs from 'node:fs';
+import process from 'node:process';
+
+const targetPath = path.resolve(process.argv[2] ?? process.cwd());
+if (!fs.existsSync(targetPath)) {
+ console.error(`battle-test: target path does not exist: ${targetPath}`);
+ process.exit(1);
+}
+
+console.log(`\n=== Battle test: ${targetPath} ===\n`);
+
+const { CodeGraph } = await import('../dist/index.js');
+
+// Reset .codegraph if present so we exercise the fresh-init path
+const cgDir = path.join(targetPath, '.codegraph');
+if (fs.existsSync(cgDir)) {
+ fs.rmSync(cgDir, { recursive: true, force: true });
+}
+
+const cg = await CodeGraph.init(targetPath);
+
+const t0 = Date.now();
+const result = await cg.indexAll();
+const indexMs = Date.now() - t0;
+console.log(`✓ indexAll completed in ${indexMs}ms — files=${result.filesIndexed} nodes=${result.nodesCreated} edges=${result.edgesCreated}`);
+
+const stats = cg.getStats();
+console.log(` stats: ${stats.fileCount} files, ${stats.nodeCount} nodes, ${stats.edgeCount} edges`);
+
+// ----- migrations -----
+const { CURRENT_SCHEMA_VERSION, ALL_MIGRATIONS } = await import('../dist/db/migrations.js');
+const versions = ALL_MIGRATIONS.map((m) => m.version).join(',');
+console.log(`✓ schema v${CURRENT_SCHEMA_VERSION}, registered migrations: ${versions}`);
+
+// ----- index-hook registry -----
+const { getRegisteredHooks } = await import('../dist/index-hooks/registry.js');
+const hooks = getRegisteredHooks();
+console.log(`✓ ${hooks.length} index-hooks registered: ${hooks.map((h) => h.name).join(', ')}`);
+
+// ----- mcp tool registry -----
+const { getToolModules } = await import('../dist/mcp/tools/registry.js');
+const tools = getToolModules();
+console.log(`✓ ${tools.length} MCP tools registered: ${tools.map((t) => t.definition.name).join(', ')}`);
+
+// ----- centrality -----
+const top = cg.getTopCentralNodes({ limit: 5 });
+console.log(`\n--- centrality ---`);
+if (top.length === 0) {
+ console.log(` ✗ no centrality scores computed`);
+} else {
+ console.log(` ✓ top 5 by centrality:`);
+ for (const n of top) {
+ console.log(` ${n.centrality?.toFixed(5)} ${n.kind} ${n.name} (${n.filePath}:${n.startLine})`);
+ }
+}
+
+// ----- churn -----
+console.log(`\n--- churn ---`);
+const sample = cg.getStats().fileCount > 0
+ ? cg.getHotspots({ limit: 1, minCommits: 0 })[0]
+ : null;
+if (sample) {
+ const churn = cg.getFileChurn(sample.filePath);
+ console.log(` ✓ sample file ${sample.filePath}: commits=${churn?.commitCount} loc=${churn?.loc} lastTouched=${churn?.lastTouchedTs}`);
+} else {
+ console.log(` (no churn data — likely not in a git repo)`);
+}
+
+// ----- hotspots -----
+console.log(`\n--- hotspots ---`);
+const hot = cg.getHotspots({ limit: 5, minCommits: 0 });
+if (hot.length === 0) {
+ console.log(` (no hotspots)`);
+} else {
+ console.log(` ✓ top 5 by risk:`);
+ for (const r of hot) {
+ console.log(` risk=${r.riskScore.toFixed(4)} commits=${r.commitCount} loc=${r.loc} ${r.filePath}`);
+ }
+}
+
+// ----- issue history -----
+console.log(`\n--- issue history ---`);
+let issueCount = 0;
+let nodesWithIssues = 0;
+const allNodes = cg.getStats().nodeCount;
+// Sample up to 200 top-central nodes; count how many have any issue history
+const sampleNodes = cg.getTopCentralNodes({ limit: 200 });
+for (const n of sampleNodes) {
+ const issues = cg.getIssuesForNode(n.id);
+ if (issues.length > 0) {
+ nodesWithIssues++;
+ issueCount += issues.length;
+ }
+}
+console.log(` sampled ${sampleNodes.length} of ${allNodes} nodes: ${nodesWithIssues} have issue refs (${issueCount} attributions)`);
+
+// ----- config refs -----
+console.log(`\n--- config refs ---`);
+const envKeys = cg.getConfigKeys({ configKind: 'env', limit: 10 });
+if (envKeys.length === 0) {
+ console.log(` (no env-var read sites)`);
+} else {
+ console.log(` ✓ top 10 env vars (${envKeys.length}/${cg.getConfigKeys({ configKind: 'env', limit: 9999 }).length}):`);
+ for (const k of envKeys) {
+ console.log(` ${k.reads.toString().padStart(4)} reads ${k.distinctFiles} files ${k.configKey}`);
+ }
+}
+
+// ----- sql refs -----
+console.log(`\n--- sql refs ---`);
+const tables = cg.getSqlTables({ limit: 10 });
+if (tables.length === 0) {
+ console.log(` (no SQL string-literal call sites)`);
+} else {
+ console.log(` ✓ top 10 tables:`);
+ for (const t of tables) {
+ console.log(` r=${t.reads} w=${t.writes} d=${t.ddl} ${t.tableName}`);
+ }
+}
+
+// ----- sync regression -----
+console.log(`\n--- sync round-trip ---`);
+const t1 = Date.now();
+const syncResult = await cg.sync();
+const syncMs = Date.now() - t1;
+console.log(` ✓ sync no-op in ${syncMs}ms — added=${syncResult.filesAdded} modified=${syncResult.filesModified} removed=${syncResult.filesRemoved}`);
+
+cg.close();
+console.log(`\n=== battle test PASS ===\n`);
diff --git a/src/bin/codegraph.ts b/src/bin/codegraph.ts
index d118a1fd..44ccc873 100644
--- a/src/bin/codegraph.ts
+++ b/src/bin/codegraph.ts
@@ -23,6 +23,7 @@ import * as path from 'path';
import * as fs from 'fs';
import { getCodeGraphDir, isInitialized } from '../directory';
import { createShimmerProgress } from '../ui/shimmer-progress';
+import { globToSafeRegex } from '../utils';
// Lazy-load heavy modules (CodeGraph, runInstaller) to keep CLI startup fast.
async function loadCodeGraph(): Promise {
@@ -1158,16 +1159,15 @@ program
/\/spec\//,
];
- // Custom filter pattern
+ // Custom filter pattern (ReDoS-safe — globToSafeRegex coalesces
+ // consecutive wildcards so hostile inputs can't produce nested
+ // quantifiers like `.+.+.+`).
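+ // e.g. `*****` collapses to a single `.*`, and pathologically long
+ // patterns come back as null, leaving customFilter unset; see
+ // __tests__/security.test.ts for the behavioral contract.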
let customFilter: RegExp | null = null;
if (options.filter) {
- // Convert glob to regex: ** → .+, * → [^/]*, . → \.
- const regex = options.filter
- .replace(/[+[\]{}()^$|\\]/g, '\\$&')
- .replace(/\./g, '\\.')
- .replace(/\*\*/g, '.+')
- .replace(/\*/g, '[^/]*');
- customFilter = new RegExp(regex);
+ const regexBody = globToSafeRegex(options.filter);
+ if (regexBody !== null) {
+ customFilter = new RegExp(regexBody);
+ }
}
function isTestFile(filePath: string): boolean {
diff --git a/src/centrality/index.ts b/src/centrality/index.ts
new file mode 100644
index 00000000..d03f2206
--- /dev/null
+++ b/src/centrality/index.ts
@@ -0,0 +1,126 @@
+/**
+ * Centrality computation
+ *
+ * Computes PageRank over the `calls` + `references` subgraph; the
+ * caller persists each node's score on the `nodes.centrality` column.
+ * This module is pure compute (no I/O), so the caller owns reading
+ * edges, writing scores, and deciding when to re-run.
+ *
+ * PageRank is the right shape for "what is structurally important?"
+ * because it rewards being reached (weighted by the importance of who
+ * reaches you), not just raw in-degree. A method called once from a
+ * central interface ranks above a method called many times from a
+ * leaf script.
+ *
+ * Edges of kind `contains` are deliberately excluded — they encode
+ * lexical containment (file → class → method), which would dominate
+ * the rank and hide actual reference flow.
+ *
+ * Side benefit observed in spike data: PageRank accidentally surfaces
+ * resolver false-positives. Generic short names (`trim`, `run`) that
+ * the resolver over-merges across files accumulate edges from many
+ * sources and float to the top alongside genuine hubs. Useful as a
+ * diagnostic; not a goal of this module.
+ */
+
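+// The per-iteration update implemented below is the standard damped
+// form (d = PR_DAMPING, N = node count, danglingSum = total rank held
+// by nodes with no outgoing edges):
+//
+//   pr'[t] = (1 - d)/N + d * (danglingSum/N + Σ_{s→t} pr[s]/outDeg[s])
+//
+// Worked check on a two-node graph a→b from a uniform start [0.5, 0.5]:
+// pr'[b] = 0.075 + 0.85 * (0.5/2 + 0.5/1) = 0.7125 and pr'[a] = 0.2875,
+// so total mass stays exactly 1.
+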
+/** Damping factor — fraction of rank propagated through edges each step. */
+export const PR_DAMPING = 0.85;
+
+/**
+ * Iteration count. PageRank converges geometrically; 40 iterations puts
+ * us well below 1e-6 residual on graphs we've seen, with no per-graph
+ * tuning needed.
+ */
+export const PR_ITERATIONS = 40;
+
+/** Edge kinds that contribute to centrality. */
+export const PR_EDGE_KINDS = ['calls', 'references'] as const;
+
+export type PrEdgeKind = (typeof PR_EDGE_KINDS)[number];
+
+export interface CentralityResult {
+ /** nodeId → PageRank score in (0, 1). Sums to ~1.0 across all nodes. */
+ scores: Map;
+ /** Iterations actually run (currently always PR_ITERATIONS — kept for forward compat). */
+ iterations: number;
+ /** Wall-clock duration in milliseconds. */
+ durationMs: number;
+}
+
+interface NodeRef {
+ id: string;
+}
+
+interface EdgeRef {
+ source: string;
+ target: string;
+}
+
+/**
+ * Compute PageRank scores for the supplied nodes/edges.
+ *
+ * @param nodes All graph nodes (only `id` is read).
+ * @param edges Edges that contribute to centrality. Caller is
+ * responsible for filtering to `PR_EDGE_KINDS`.
+ *
+ * Edges referencing unknown node ids are silently dropped — the
+ * underlying graph has FK cascades, so dangling references can only
+ * occur mid-write and are not our problem to fix here.
+ */
+export function computePageRank(nodes: NodeRef[], edges: EdgeRef[]): CentralityResult {
+ const start = Date.now();
+ const N = nodes.length;
+ const scores = new Map<string, number>();
+ if (N === 0) {
+ return { scores, iterations: 0, durationMs: Date.now() - start };
+ }
+
+ // Index nodes for tight numeric loops. Float64Array gives ~3× speedup
+ // over Array(N).fill on million-edge graphs and costs nothing on
+ // smaller ones.
+ const idx = new Map<string, number>();
+ for (let i = 0; i < N; i++) {
+ const n = nodes[i]!;
+ idx.set(n.id, i);
+ }
+
+ const inEdges: number[][] = Array.from({ length: N }, () => []);
+ const outDeg = new Int32Array(N);
+ for (const e of edges) {
+ const s = idx.get(e.source);
+ const t = idx.get(e.target);
+ if (s === undefined || t === undefined) continue;
+ inEdges[t]!.push(s);
+ outDeg[s]! += 1;
+ }
+
+ let pr = new Float64Array(N).fill(1 / N);
+ const baseline = (1 - PR_DAMPING) / N;
+
+ for (let it = 0; it < PR_ITERATIONS; it++) {
+ const next = new Float64Array(N).fill(baseline);
+
+ // Distribute the rank of dangling nodes (no outgoing edges) uniformly.
+ // Without this the total rank decays each iteration.
+ let danglingSum = 0;
+ for (let i = 0; i < N; i++) {
+ if (outDeg[i] === 0) danglingSum += pr[i]!;
+ }
+ const danglingShare = (PR_DAMPING * danglingSum) / N;
+ for (let i = 0; i < N; i++) next[i]! += danglingShare;
+
+ for (let t = 0; t < N; t++) {
+ const sources = inEdges[t]!;
+ let s = 0;
+ for (let k = 0; k < sources.length; k++) {
+ const src = sources[k]!;
+ s += pr[src]! / outDeg[src]!;
+ }
+ next[t]! += PR_DAMPING * s;
+ }
+ pr = next;
+ }
+
+ for (let i = 0; i < N; i++) scores.set(nodes[i]!.id, pr[i]!);
+ return { scores, iterations: PR_ITERATIONS, durationMs: Date.now() - start };
+}
diff --git a/src/churn/index.ts b/src/churn/index.ts
new file mode 100644
index 00000000..1c332886
--- /dev/null
+++ b/src/churn/index.ts
@@ -0,0 +1,259 @@
+/**
+ * Per-file churn mining
+ *
+ * Reads `git log` to compute four signals per indexed file:
+ * - commit_count (how often the file gets touched)
+ * - first_seen_ts (when it entered the codebase)
+ * - last_touched_ts (how recently it was modified)
+ * - loc (line count of the current on-disk content)
+ *
+ * Combined with PageRank centrality (see ../centrality), these answer
+ * "where do bugs hide?" — central files that change often are the
+ * highest-expected-value review targets, validated empirically against
+ * codegraph's own history (e.g. `src/extraction/tree-sitter.ts`).
+ *
+ * Storage strategy: scalar columns on `files` (one row already exists
+ * per indexed path; adding columns avoids a JOIN on every read).
+ *
+ * Incremental update: persist `last_mined_churn_head` in
+ * project_metadata; on subsequent mines, only enumerate commits in
+ * `..HEAD`. This keeps `sync` fast on long histories. If the
+ * stored sha is unreachable (force-push, gc), the caller gets
+ * `needsFullRescan: true` and re-mines from scratch after `clearChurn`.
+ *
+ * Rename note: `git log --name-only` (without `--follow`) reports
+ * post-rename paths only. The pre-rename history is therefore not
+ * counted toward the new path's `commit_count`. `--follow` would fix
+ * this but is documented as O(N) per file and shells out individually,
+ * so v1 accepts the under-count and surfaces it in the doc-comment on
+ * `commitCount` in types.ts.
+ */
+
+import { execFileSync } from 'child_process';
+import * as fs from 'fs';
+import * as path from 'path';
+import { logDebug } from '../errors';
+
+/**
+ * Skip commits that touch more than this many indexed files. Merge
+ * commits and mass refactors otherwise inflate every file's
+ * commit_count without any real coupling signal.
+ */
+export const MAX_FILES_PER_COMMIT = 50;
+
+/** Sentinel for `git log --pretty=tformat:`; no real path parses as a commit header. */
+const COMMIT_HEADER_PREFIX = 'CGCMT-';
+
+/** Project-metadata key holding the HEAD SHA of the last mined commit. */
+export const LAST_MINED_CHURN_HEAD_KEY = 'last_mined_churn_head';
+
+/** Hard cap on git output we'll buffer (bytes). Matches cochange. */
+const MAX_GIT_BUFFER = 200 * 1024 * 1024;
+
+/** Wall-clock cap on a single git invocation (ms). */
+const GIT_TIMEOUT_MS = 60_000;
+
+export interface FileChurnDelta {
+ path: string;
+ /** Commits to add to the existing commit_count. */
+ commitCountDelta: number;
+ /**
+ * Most recent commit timestamp (unix seconds) seen in this delta.
+ * Caller takes max() with the existing value.
+ */
+ lastTouchedTs: number;
+ /**
+ * Earliest commit timestamp (unix seconds) in this delta. Caller
+ * applies `COALESCE(existing, this)` so the first-seen column only
+ * gets written once.
+ */
+ firstSeenTs: number;
+}
+
+export interface ChurnMineResult {
+ deltas: Map<string, FileChurnDelta>;
+ /** HEAD SHA reached by this run; null when not in a git repo. */
+ currentHead: string | null;
+ /**
+ * True when the caller's `sinceSha` was unreachable (force-push, gc).
+ * Caller should `clearChurn()` and re-mine with `sinceSha=null`.
+ */
+ needsFullRescan: boolean;
+}
+
+/**
+ * Get the current HEAD commit SHA, or null when not in a git repo or
+ * the repo has no commits yet.
+ */
+export function getGitHead(rootDir: string): string | null {
+ try {
+ return (
+ execFileSync('git', ['rev-parse', 'HEAD'], {
+ cwd: rootDir,
+ encoding: 'utf-8',
+ timeout: 5000,
+ stdio: ['pipe', 'pipe', 'pipe'],
+ }).trim() || null
+ );
+ } catch {
+ return null;
+ }
+}
+
+/**
+ * Verify that a stored SHA is still reachable from HEAD. After
+ * force-push or `git gc` it can disappear, in which case incremental
+ * mining would silently miss commits.
+ */
+function isShaReachable(rootDir: string, sha: string): boolean {
+ try {
+ execFileSync('git', ['cat-file', '-e', `${sha}^{commit}`], {
+ cwd: rootDir,
+ timeout: 5000,
+ stdio: ['pipe', 'pipe', 'pipe'],
+ });
+ return true;
+ } catch {
+ return false;
+ }
+}
+
+/**
+ * Read the LOC of a file as currently on disk. Cheap; always fresh.
+ *
+ * Counts newline-delimited lines: a file with content `"a\nb\n"`
+ * reports 2; an empty file reports 0; a file ending without a newline
+ * still reports the visible-line count.
+ */
+export function readFileLoc(rootDir: string, relPath: string): number {
+ try {
+ const abs = path.join(rootDir, relPath);
+ const content = fs.readFileSync(abs, 'utf8');
+ if (content.length === 0) return 0;
+ let lines = 0;
+ for (let i = 0; i < content.length; i++) if (content.charCodeAt(i) === 10) lines++;
+ // Trailing chunk without final newline still counts as a line.
+ if (content.charCodeAt(content.length - 1) !== 10) lines++;
+ return lines;
+ } catch {
+ return 0;
+ }
+}
+
+/**
+ * Mine git log for per-file commit metrics.
+ *
+ * @param rootDir Project root.
+ * @param indexedFiles Paths we care about (deltas only emitted for
+ * these). Files outside this set are ignored
+ * per-commit so churn doesn't accumulate for
+ * paths the index has no other knowledge of.
+ * @param sinceSha `null` for full scan; otherwise mine only
+ * `<sinceSha>..HEAD`. Unreachable shas trigger
+ * `needsFullRescan: true`.
+ */
+export function mineChurn(
+ rootDir: string,
+ indexedFiles: Set<string>,
+ sinceSha: string | null
+): ChurnMineResult {
+ const empty: ChurnMineResult = {
+ deltas: new Map(),
+ currentHead: null,
+ needsFullRescan: false,
+ };
+
+ const head = getGitHead(rootDir);
+ if (!head) return empty;
+
+ if (sinceSha && !isShaReachable(rootDir, sinceSha)) {
+ return { deltas: new Map(), currentHead: head, needsFullRescan: true };
+ }
+
+ // No-op: nothing has happened since last mine.
+ if (sinceSha === head) {
+ return { deltas: new Map(), currentHead: head, needsFullRescan: false };
+ }
+
+ // tformat puts a literal trailing record-separator after each
+ // commit's name list; -z then NUL-delimits within the format too,
+ // so we get a clean stream of NUL-separated tokens.
+ const args = [
+ 'log',
+ '--no-merges',
+ '--name-only',
+ `--pretty=tformat:${COMMIT_HEADER_PREFIX}%H|%ct`,
+ '-z',
+ ];
+ if (sinceSha) args.push(`${sinceSha}..HEAD`);
+
+ let raw: string;
+ try {
+ raw = execFileSync('git', args, {
+ cwd: rootDir,
+ encoding: 'utf-8',
+ timeout: GIT_TIMEOUT_MS,
+ maxBuffer: MAX_GIT_BUFFER,
+ stdio: ['pipe', 'pipe', 'pipe'],
+ });
+ } catch (err) {
+ logDebug(`mineChurn: git log failed: ${err instanceof Error ? err.message : String(err)}`);
+ return { deltas: new Map(), currentHead: head, needsFullRescan: false };
+ }
+
+ // Parse: tformat emits `CGCMT-<sha>|<ts>\0\n<path>\0<path>\0...` for
+ // each commit. Each token between NULs is either a commit header or a
+ // path; paths arrive with a leading '\n' on the first one of each
+ // commit (the tformat record-separator). We walk tokens linearly,
+ // switching commit context on each header.
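+ // Example stream (two commits, newest first, both touching a.ts):
+ //   ["CGCMT-<sha1>|1700000000", "\na.ts", "b.ts", "CGCMT-<sha2>|1690000000", "\na.ts"]
+ // yields a.ts → { commitCountDelta: 2, lastTouchedTs: 1700000000, firstSeenTs: 1690000000 }
+ // and b.ts → { commitCountDelta: 1, ... }, assuming both paths are indexed.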
+ const tokens = raw.split('\0');
+ const headerRe = /^CGCMT-([0-9a-f]{40})\|(\d+)$/;
+ const deltas = new Map<string, FileChurnDelta>();
+
+ let curTs = 0;
+ let curPaths: string[] = [];
+ let curActive = false;
+
+ function flush() {
+ if (!curActive) return;
+ if (curPaths.length > 0 && curPaths.length <= MAX_FILES_PER_COMMIT) {
+ for (const p of curPaths) {
+ if (!indexedFiles.has(p)) continue;
+ const cur = deltas.get(p);
+ if (cur) {
+ cur.commitCountDelta += 1;
+ if (curTs > cur.lastTouchedTs) cur.lastTouchedTs = curTs;
+ if (curTs < cur.firstSeenTs) cur.firstSeenTs = curTs;
+ } else {
+ deltas.set(p, {
+ path: p,
+ commitCountDelta: 1,
+ lastTouchedTs: curTs,
+ firstSeenTs: curTs,
+ });
+ }
+ }
+ }
+ curPaths = [];
+ curActive = false;
+ }
+
+ for (const rawTok of tokens) {
+ if (rawTok === '') continue;
+ // Strip a single leading \n introduced by tformat's record separator.
+ const tok = rawTok.startsWith('\n') ? rawTok.slice(1) : rawTok;
+ if (tok === '') continue;
+ const m = headerRe.exec(tok);
+ if (m) {
+ flush();
+ curTs = parseInt(m[2]!, 10);
+ curActive = true;
+ } else if (curActive) {
+ curPaths.push(tok);
+ }
+ // Tokens before the first header (shouldn't happen) are ignored.
+ }
+ flush();
+
+ return { deltas, currentHead: head, needsFullRescan: false };
+}
diff --git a/src/config-refs/index.ts b/src/config-refs/index.ts
new file mode 100644
index 00000000..1ef47ae9
--- /dev/null
+++ b/src/config-refs/index.ts
@@ -0,0 +1,188 @@
+/**
+ * Config-reference extraction
+ *
+ * Scans indexed source files for known config-read patterns
+ * (`process.env.X`, `os.getenv("X")`, etc.) and records each read
+ * site as a row in `config_refs`. Each row links to its enclosing
+ * function via a line-range lookup against the existing nodes table,
+ * so an agent asking "what reads OBSIDIAN_PORT?" gets a list of real
+ * functions, not a grep wall.
+ *
+ * Why a separate table, not graph nodes/edges: env vars don't have a
+ * single source-of-truth file (they're a global namespace), so giving
+ * them a synthetic file_path would pollute the main graph. The table
+ * is queried via a dedicated MCP tool (`codegraph_config`) and via
+ * augmented `codegraph_node` output (per-function "reads:" line).
+ *
+ * Spike validation (mcp-obsidian-extended): 71 reads, 19 distinct
+ * keys; 8× OBSIDIAN_PORT, 8× TOOL_PRESET surface as central
+ * config knobs. Codegraph-itself is sparse (4 reads) — this feature
+ * shines on service-style codebases.
+ *
+ * V1 scope: env-only, regex-based per-language. YAML key reads,
+ * LaunchDarkly flags, etc. are deliberately out of scope; the schema
+ * already supports them via `config_kind` so adding them later is a
+ * pattern addition, not a redesign.
+ */
+
+import * as fs from 'fs';
+import * as path from 'path';
+import { logDebug } from '../errors';
+
+export type ConfigKind = 'env';
+
+export interface ConfigRef {
+ configKind: ConfigKind;
+ configKey: string;
+ /** Indexed-symbol id for the enclosing function/method. NULL = top-level. */
+ sourceNodeId: string | null;
+ filePath: string;
+ line: number;
+}
+
+interface PatternDef {
+ /** Languages this pattern applies to (matches `Language` in types.ts). */
+ languages: string[];
+ /** Regex with capture group 1 = config key. */
+ re: RegExp;
+}
+
+/**
+ * Per-language read-pattern catalogue.
+ *
+ * Patterns intentionally match only UPPER_CASE keys, the
+ * near-universal convention for env vars. This avoids false positives
+ * like `process.env.foo` (a Node variable) or `os.getenv(some_var)`
+ * (dynamic).
+ */
+const PATTERNS: PatternDef[] = [
+ // process.env.FOO / process.env["FOO"] (TS, JS, TSX, JSX)
+ {
+ languages: ['typescript', 'javascript', 'tsx', 'jsx'],
+ re: /process\.env\.([A-Z_][A-Z0-9_]*)/g,
+ },
+ {
+ languages: ['typescript', 'javascript', 'tsx', 'jsx'],
+ re: /process\.env\[\s*['"]([A-Z_][A-Z0-9_]*)['"]\s*\]/g,
+ },
+ // os.getenv("FOO") / os.environ.get("FOO") / os.environ["FOO"]
+ {
+ languages: ['python'],
+ re: /\bos\.getenv\(\s*['"]([A-Z_][A-Z0-9_]*)['"]/g,
+ },
+ {
+ languages: ['python'],
+ re: /\bos\.environ\.get\(\s*['"]([A-Z_][A-Z0-9_]*)['"]/g,
+ },
+ {
+ languages: ['python'],
+ re: /\bos\.environ\[\s*['"]([A-Z_][A-Z0-9_]*)['"]\s*\]/g,
+ },
+ // Bare getenv("FOO") (Python convention with `from os import getenv`)
+ {
+ languages: ['python'],
+ re: /\bgetenv\(\s*['"]([A-Z_][A-Z0-9_]*)['"]/g,
+ },
+ // os.Getenv("FOO") / os.LookupEnv("FOO") (Go)
+ {
+ languages: ['go'],
+ re: /\bos\.(?:Getenv|LookupEnv)\(\s*"([A-Z_][A-Z0-9_]*)"/g,
+ },
+ // System.getenv("FOO") (Java/Kotlin)
+ {
+ languages: ['java', 'kotlin'],
+ re: /\bSystem\.getenv\(\s*"([A-Z_][A-Z0-9_]*)"/g,
+ },
+ // ENV["FOO"] / ENV.fetch("FOO") (Ruby)
+ {
+ languages: ['ruby'],
+ re: /\bENV\[\s*['"]([A-Z_][A-Z0-9_]*)['"]\s*\]/g,
+ },
+ {
+ languages: ['ruby'],
+ re: /\bENV\.fetch\(\s*['"]([A-Z_][A-Z0-9_]*)['"]/g,
+ },
+ // Rust: env!("FOO") / std::env::var("FOO")
+ {
+ languages: ['rust'],
+ re: /\benv!\(\s*"([A-Z_][A-Z0-9_]*)"/g,
+ },
+ {
+ languages: ['rust'],
+ re: /\bstd::env::var\(\s*"([A-Z_][A-Z0-9_]*)"/g,
+ },
+];
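+
+// Example: a read like `const port = process.env.OBSIDIAN_PORT;` at
+// line 12 of a hypothetical src/server.ts produces
+//   { configKind: 'env', configKey: 'OBSIDIAN_PORT',
+//     sourceNodeId: <enclosing node id, or null at top level>,
+//     filePath: 'src/server.ts', line: 12 }.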
+
+/** Languages covered by at least one pattern; files in any other language are skipped. */
+const SUPPORTED_LANGUAGES = new Set(
+ PATTERNS.flatMap((p) => p.languages)
+);
+
+/**
+ * Resolver supplied by caller: (filePath, line) → enclosing nodeId
+ * (function/method/class). Returns null when the read is at the file's
+ * top level — the row still gets persisted with NULL source_node_id.
+ */
+export type EnclosingNodeResolver = (filePath: string, line: number) => string | null;
+
+export interface FileTarget {
+ path: string;
+ language: string;
+}
+
+/**
+ * Scan a list of (path, language) targets and return all read sites.
+ * File reads + regex only, no DB access; the caller owns DB writes via `applyConfigRefs`.
+ *
+ * Files we can't read (deleted, permission, binary) are silently
+ * skipped — extraction has already validated readability for the rest.
+ */
+export function extractConfigRefs(
+ rootDir: string,
+ targets: Iterable<FileTarget>,
+ resolveEnclosing: EnclosingNodeResolver
+): ConfigRef[] {
+ const refs: ConfigRef[] = [];
+ for (const t of targets) {
+ if (!SUPPORTED_LANGUAGES.has(t.language)) continue;
+ let src: string;
+ try {
+ src = fs.readFileSync(path.join(rootDir, t.path), 'utf8');
+ } catch (err) {
+ logDebug(`extractConfigRefs: read failed for ${t.path}: ${err instanceof Error ? err.message : String(err)}`);
+ continue;
+ }
+ // Iterate lines so we can attribute each match to a 1-indexed line.
+ const lines = src.split('\n');
+ for (let i = 0; i < lines.length; i++) {
+ const line = lines[i]!;
+ // Cheap pre-filter to skip the 99% of lines that obviously
+ // contain no env reference. Cuts per-file cost dramatically on
+ // big repos.
+ if (
+ !line.includes('env') &&
+ !line.includes('Env') &&
+ !line.includes('ENV')
+ ) {
+ continue;
+ }
+ for (const pat of PATTERNS) {
+ if (!pat.languages.includes(t.language)) continue;
+ pat.re.lastIndex = 0;
+ let m: RegExpExecArray | null;
+ while ((m = pat.re.exec(line)) !== null) {
+ const key = m[1]!;
+ const lineNo = i + 1;
+ refs.push({
+ configKind: 'env',
+ configKey: key,
+ sourceNodeId: resolveEnclosing(t.path, lineNo),
+ filePath: t.path,
+ line: lineNo,
+ });
+ }
+ }
+ }
+ }
+ return refs;
+}
diff --git a/src/config.ts b/src/config.ts
index 9ab1032a..f1d70250 100644
--- a/src/config.ts
+++ b/src/config.ts
@@ -128,6 +128,11 @@ function mergeConfig(
extractDocstrings: overrides.extractDocstrings ?? defaults.extractDocstrings,
trackCallSites: overrides.trackCallSites ?? defaults.trackCallSites,
customPatterns: overrides.customPatterns ?? defaults.customPatterns,
+ enableCentrality: overrides.enableCentrality ?? defaults.enableCentrality,
+ enableChurn: overrides.enableChurn ?? defaults.enableChurn,
+ enableIssueHistory: overrides.enableIssueHistory ?? defaults.enableIssueHistory,
+ enableConfigRefs: overrides.enableConfigRefs ?? defaults.enableConfigRefs,
+ enableSqlRefs: overrides.enableSqlRefs ?? defaults.enableSqlRefs,
};
}
diff --git a/src/context/index.ts b/src/context/index.ts
index 94192377..08f25657 100644
--- a/src/context/index.ts
+++ b/src/context/index.ts
@@ -286,6 +286,14 @@ export class ContextBuilder {
options: FindRelevantContextOptions = {}
): Promise {
const opts = { ...DEFAULT_FIND_OPTIONS, ...options };
+ // Bound user-supplied limits — `searchLimit` is multiplied by 5 in
+ // findNodesByExactName (line 312) and feeds several other unbounded
+ // operations below, so a request with `searchLimit: 1_000_000` would
+ // pull millions of rows before any filtering. 100 is well above the
+ // largest legitimate use we've seen.
+ opts.searchLimit = Math.min(Math.max(1, opts.searchLimit), 100);
+ opts.maxNodes = Math.min(Math.max(1, opts.maxNodes), 1000);
+ opts.traversalDepth = Math.min(Math.max(0, opts.traversalDepth), 10);
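+ // e.g. { searchLimit: 1_000_000 } is clamped to 100, { traversalDepth: 99 } to 10.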
// Start with empty subgraph
const nodes = new Map();
diff --git a/src/db/index.ts b/src/db/index.ts
index 34e99338..da85caea 100644
--- a/src/db/index.ts
+++ b/src/db/index.ts
@@ -152,6 +152,36 @@ export class DatabaseConnection {
this.db.exec('ANALYZE');
}
+ /**
+ * Lightweight, non-blocking maintenance to run after bulk writes
+ * (indexAll, sync). Two operations:
+ *
+ * - `PRAGMA optimize` — incremental ANALYZE; SQLite only re-analyzes
+ * tables whose row counts changed materially since the last
+ * ANALYZE. Without it, the query planner has no statistics on the
+ * freshly-bulk-loaded tables and can pick suboptimal indexes.
+ *
+ * - `PRAGMA wal_checkpoint(PASSIVE)` — fold pending WAL pages back
+ * into the main database file so the WAL file doesn't grow
+ * unboundedly between automatic checkpoints (auto-fires at 1000
+ * pages by default; large indexAll runs blow past that).
+ *
+ * Failures in either operation are silently swallowed; both are
+ * best-effort optimizations, never load-bearing for correctness.
+ */
+ runMaintenance(): void {
+ try {
+ this.db.exec('PRAGMA optimize');
+ } catch {
+ // ignore
+ }
+ try {
+ this.db.exec('PRAGMA wal_checkpoint(PASSIVE)');
+ } catch {
+ // ignore (e.g., not in WAL mode)
+ }
+ }
+
/**
* Close the database connection
*/
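
The intended call site is after the bulk writes commit, never inside them; a sketch under that assumption (`indexAll` and `bulkIndex` are hypothetical names, not APIs from this PR):

```ts
// Hypothetical wiring: run maintenance once the bulk transaction has
// committed, so the passive checkpoint actually has WAL pages to fold.
async function indexAll(conn: DatabaseConnection, files: string[]): Promise<void> {
  await bulkIndex(conn, files); // heavy writes; WAL grows past the 1000-page auto-checkpoint
  conn.runMaintenance();        // best-effort: PRAGMA optimize + wal_checkpoint(PASSIVE)
}
```
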
diff --git a/src/db/migrations.ts b/src/db/migrations.ts
index 0a256dbc..98325247 100644
--- a/src/db/migrations.ts
+++ b/src/db/migrations.ts
@@ -1,60 +1,26 @@
/**
- * Database Migrations
+ * Database Migrations — runner + backward-compat surface.
*
- * Schema versioning and migration support.
+ * The migration definitions themselves live in
+ * `./migrations/NNN-description.ts`, one file per migration, with
+ * version derived from the filename prefix. This file is the
+ * runner (read schema_versions, apply pending in order) and the
+ * stable API surface that the rest of the codebase imports.
+ *
+ * Adding a migration: see `./migrations/index.ts`.
*/
import { SqliteDatabase } from './sqlite-adapter';
+import { ALL_MIGRATIONS, CURRENT_SCHEMA_VERSION as REGISTRY_CURRENT } from './migrations/index';
+import type { Migration } from './migrations/types';
/**
- * Current schema version
+ * Highest registered migration version. Derived from the
+ * registry; re-exported here unchanged so existing consumers
+ * (`import { CURRENT_SCHEMA_VERSION } from './migrations'`) keep
+ * working.
*/
-export const CURRENT_SCHEMA_VERSION = 3;
-
-/**
- * Migration definition
- */
-interface Migration {
- version: number;
- description: string;
- up: (db: SqliteDatabase) => void;
-}
-
-/**
- * All migrations in order
- *
- * Note: Version 1 is the initial schema, handled by schema.sql
- * Future migrations go here.
- */
-const migrations: Migration[] = [
- {
- version: 2,
- description: 'Add project metadata, provenance tracking, and unresolved ref context',
- up: (db) => {
- db.exec(`
- CREATE TABLE IF NOT EXISTS project_metadata (
- key TEXT PRIMARY KEY,
- value TEXT NOT NULL,
- updated_at INTEGER NOT NULL
- );
- ALTER TABLE unresolved_refs ADD COLUMN file_path TEXT NOT NULL DEFAULT '';
- ALTER TABLE unresolved_refs ADD COLUMN language TEXT NOT NULL DEFAULT 'unknown';
- ALTER TABLE edges ADD COLUMN provenance TEXT DEFAULT NULL;
- CREATE INDEX IF NOT EXISTS idx_unresolved_file_path ON unresolved_refs(file_path);
- CREATE INDEX IF NOT EXISTS idx_edges_provenance ON edges(provenance);
- `);
- },
- },
- {
- version: 3,
- description: 'Add lower(name) expression index for memory-efficient case-insensitive lookups',
- up: (db) => {
- db.exec(`
- CREATE INDEX IF NOT EXISTS idx_nodes_lower_name ON nodes(lower(name));
- `);
- },
- },
-];
+export const CURRENT_SCHEMA_VERSION: number = REGISTRY_CURRENT;
/**
* Get the current schema version from the database
@@ -84,17 +50,14 @@ function recordMigration(db: SqliteDatabase, version: number, description: strin
* Run all pending migrations
*/
export function runMigrations(db: SqliteDatabase, fromVersion: number): void {
- const pending = migrations.filter((m) => m.version > fromVersion);
-
- if (pending.length === 0) {
- return;
- }
+ const pending = ALL_MIGRATIONS.filter((m) => m.version > fromVersion);
+ if (pending.length === 0) return;
- // Sort by version
- pending.sort((a, b) => a.version - b.version);
+ // ALL_MIGRATIONS is already sorted by version; re-sorting after the
+ // filter is cheap and re-confirms the invariant.
+ const ordered = [...pending].sort((a, b) => a.version - b.version);
- // Run each migration in a transaction
- for (const migration of pending) {
+ for (const migration of ordered) {
db.transaction(() => {
migration.up(db);
recordMigration(db, migration.version, migration.description);
@@ -111,13 +74,15 @@ export function needsMigration(db: SqliteDatabase): boolean {
}
/**
- * Get list of pending migrations
+ * Get list of pending migrations.
+ *
+ * Returned as a fresh mutable array (not the underlying readonly
+ * registry) so callers that previously assigned the result to a
+ * `Migration[]`-typed variable keep working unchanged.
*/
export function getPendingMigrations(db: SqliteDatabase): Migration[] {
const current = getCurrentVersion(db);
- return migrations
- .filter((m) => m.version > current)
- .sort((a, b) => a.version - b.version);
+ return ALL_MIGRATIONS.filter((m) => m.version > current).slice();
}
/**
@@ -136,3 +101,7 @@ export function getMigrationHistory(
description: row.description,
}));
}
+
+// Re-export the registry surface for callers that want it.
+export { ALL_MIGRATIONS } from './migrations/index';
+export type { Migration, MigrationModule } from './migrations/types';
diff --git a/src/db/migrations/002-project-metadata.ts b/src/db/migrations/002-project-metadata.ts
new file mode 100644
index 00000000..9fe7945b
--- /dev/null
+++ b/src/db/migrations/002-project-metadata.ts
@@ -0,0 +1,19 @@
+import type { MigrationModule } from './types';
+
+export const MIGRATION: MigrationModule = {
+ description: 'Add project metadata, provenance tracking, and unresolved ref context',
+ up: (db) => {
+ db.exec(`
+ CREATE TABLE IF NOT EXISTS project_metadata (
+ key TEXT PRIMARY KEY,
+ value TEXT NOT NULL,
+ updated_at INTEGER NOT NULL
+ );
+ ALTER TABLE unresolved_refs ADD COLUMN file_path TEXT NOT NULL DEFAULT '';
+ ALTER TABLE unresolved_refs ADD COLUMN language TEXT NOT NULL DEFAULT 'unknown';
+ ALTER TABLE edges ADD COLUMN provenance TEXT DEFAULT NULL;
+ CREATE INDEX IF NOT EXISTS idx_unresolved_file_path ON unresolved_refs(file_path);
+ CREATE INDEX IF NOT EXISTS idx_edges_provenance ON edges(provenance);
+ `);
+ },
+};
diff --git a/src/db/migrations/003-lower-name-index.ts b/src/db/migrations/003-lower-name-index.ts
new file mode 100644
index 00000000..ff5416eb
--- /dev/null
+++ b/src/db/migrations/003-lower-name-index.ts
@@ -0,0 +1,10 @@
+import type { MigrationModule } from './types';
+
+export const MIGRATION: MigrationModule = {
+ description: 'Add lower(name) expression index for memory-efficient case-insensitive lookups',
+ up: (db) => {
+ db.exec(`
+ CREATE INDEX IF NOT EXISTS idx_nodes_lower_name ON nodes(lower(name));
+ `);
+ },
+};
diff --git a/src/db/migrations/004-centrality-churn.ts b/src/db/migrations/004-centrality-churn.ts
new file mode 100644
index 00000000..82d30ffe
--- /dev/null
+++ b/src/db/migrations/004-centrality-churn.ts
@@ -0,0 +1,42 @@
+import type { MigrationModule } from './types';
+
+export const MIGRATION: MigrationModule = {
+ description: 'Add centrality on nodes; per-file churn metrics on files',
+ up: (db) => {
+ // ALTER TABLE ADD COLUMN is not idempotent on SQLite — guard with
+ // PRAGMA table_info so re-running after a partial DDL failure (or
+ // landing alongside another migration that touches the same files
+ // columns) does not throw "duplicate column name".
+ const tableExists = (name: string): boolean =>
+ (db.prepare(`SELECT COUNT(*) AS c FROM sqlite_master WHERE type='table' AND name=?`)
+ .get(name) as { c: number }).c > 0;
+
+ if (tableExists('nodes')) {
+ const nodeCols = db.prepare(`PRAGMA table_info(nodes);`).all() as Array<{ name: string }>;
+ if (!nodeCols.some((c) => c.name === 'centrality')) {
+ db.exec(`ALTER TABLE nodes ADD COLUMN centrality REAL DEFAULT NULL;`);
+ }
+ db.exec(`CREATE INDEX IF NOT EXISTS idx_nodes_centrality ON nodes(centrality DESC);`);
+ }
+
+ if (tableExists('files')) {
+ const fileCols = db.prepare(`PRAGMA table_info(files);`).all() as Array<{ name: string }>;
+ if (!fileCols.some((c) => c.name === 'commit_count')) {
+ db.exec(`ALTER TABLE files ADD COLUMN commit_count INTEGER NOT NULL DEFAULT 0;`);
+ }
+ if (!fileCols.some((c) => c.name === 'loc')) {
+ db.exec(`ALTER TABLE files ADD COLUMN loc INTEGER NOT NULL DEFAULT 0;`);
+ }
+ if (!fileCols.some((c) => c.name === 'first_seen_ts')) {
+ db.exec(`ALTER TABLE files ADD COLUMN first_seen_ts INTEGER DEFAULT NULL;`);
+ }
+ if (!fileCols.some((c) => c.name === 'last_touched_ts')) {
+ db.exec(`ALTER TABLE files ADD COLUMN last_touched_ts INTEGER DEFAULT NULL;`);
+ }
+ db.exec(`
+ CREATE INDEX IF NOT EXISTS idx_files_commit_count ON files(commit_count DESC);
+ CREATE INDEX IF NOT EXISTS idx_files_last_touched ON files(last_touched_ts DESC);
+ `);
+ }
+ },
+};
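
The four per-column guards follow one pattern; a sketch of the shared helper they could be factored into (not part of this PR; assumes the same `prepare`/`exec` adapter surface the migration uses):

```ts
import type { SqliteDatabase } from '../sqlite-adapter';

// SQLite has no ADD COLUMN IF NOT EXISTS, so existence is checked via
// PRAGMA table_info before issuing the ALTER.
function addColumnIfMissing(db: SqliteDatabase, table: string, column: string, ddl: string): void {
  const cols = db.prepare(`PRAGMA table_info(${table});`).all() as Array<{ name: string }>;
  if (!cols.some((c) => c.name === column)) {
    db.exec(`ALTER TABLE ${table} ADD COLUMN ${column} ${ddl};`);
  }
}

// Equivalent to the guard above:
//   addColumnIfMissing(db, 'nodes', 'centrality', 'REAL DEFAULT NULL');
```
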
diff --git a/src/db/migrations/005-symbol-issues.ts b/src/db/migrations/005-symbol-issues.ts
new file mode 100644
index 00000000..7af13795
--- /dev/null
+++ b/src/db/migrations/005-symbol-issues.ts
@@ -0,0 +1,19 @@
+import type { MigrationModule } from './types';
+
+export const MIGRATION: MigrationModule = {
+ description: 'Add symbol_issues table for issue→symbol attribution from git history',
+ up: (db) => {
+ db.exec(`
+ CREATE TABLE IF NOT EXISTS symbol_issues (
+ node_id TEXT NOT NULL,
+ issue_number INTEGER NOT NULL,
+ commit_sha TEXT NOT NULL,
+ kind TEXT NOT NULL CHECK (kind IN ('modified','added','removed')),
+ PRIMARY KEY (node_id, issue_number, commit_sha, kind),
+ FOREIGN KEY (node_id) REFERENCES nodes(id) ON DELETE CASCADE
+ );
+ CREATE INDEX IF NOT EXISTS idx_symbol_issues_node ON symbol_issues(node_id);
+ CREATE INDEX IF NOT EXISTS idx_symbol_issues_issue ON symbol_issues(issue_number);
+ `);
+ },
+};
diff --git a/src/db/migrations/006-config-refs.ts b/src/db/migrations/006-config-refs.ts
new file mode 100644
index 00000000..8fed1a91
--- /dev/null
+++ b/src/db/migrations/006-config-refs.ts
@@ -0,0 +1,24 @@
+import type { MigrationModule } from './types';
+
+export const MIGRATION: MigrationModule = {
+ description: 'Add config_refs table for env var / feature flag read sites',
+ up: (db) => {
+ db.exec(`
+ CREATE TABLE IF NOT EXISTS config_refs (
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
+ config_kind TEXT NOT NULL,
+ config_key TEXT NOT NULL,
+ source_node_id TEXT,
+ file_path TEXT NOT NULL,
+ line INTEGER NOT NULL,
+ FOREIGN KEY (source_node_id) REFERENCES nodes(id) ON DELETE CASCADE
+ );
+ CREATE INDEX IF NOT EXISTS idx_config_refs_key
+ ON config_refs(config_kind, config_key);
+ CREATE INDEX IF NOT EXISTS idx_config_refs_node
+ ON config_refs(source_node_id);
+ CREATE INDEX IF NOT EXISTS idx_config_refs_file
+ ON config_refs(file_path);
+ `);
+ },
+};
diff --git a/src/db/migrations/007-sql-refs.ts b/src/db/migrations/007-sql-refs.ts
new file mode 100644
index 00000000..629d070f
--- /dev/null
+++ b/src/db/migrations/007-sql-refs.ts
@@ -0,0 +1,24 @@
+import type { MigrationModule } from './types';
+
+export const MIGRATION: MigrationModule = {
+ description: 'Add sql_refs table for SQL string-literal references to tables',
+ up: (db) => {
+ db.exec(`
+ CREATE TABLE IF NOT EXISTS sql_refs (
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
+ table_name TEXT NOT NULL,
+ op TEXT NOT NULL CHECK (op IN ('read','write','ddl')),
+ source_node_id TEXT,
+ file_path TEXT NOT NULL,
+ line INTEGER NOT NULL,
+ FOREIGN KEY (source_node_id) REFERENCES nodes(id) ON DELETE CASCADE
+ );
+ CREATE INDEX IF NOT EXISTS idx_sql_refs_table
+ ON sql_refs(lower(table_name));
+ CREATE INDEX IF NOT EXISTS idx_sql_refs_node
+ ON sql_refs(source_node_id);
+ CREATE INDEX IF NOT EXISTS idx_sql_refs_file
+ ON sql_refs(file_path);
+ `);
+ },
+};
diff --git a/src/db/migrations/008-edges-unique.ts b/src/db/migrations/008-edges-unique.ts
new file mode 100644
index 00000000..ed7e5372
--- /dev/null
+++ b/src/db/migrations/008-edges-unique.ts
@@ -0,0 +1,29 @@
+import type { MigrationModule } from './types';
+
+export const MIGRATION: MigrationModule = {
+ description:
+ 'Dedup edges and enforce UNIQUE(source, target, kind, line, col) so INSERT OR IGNORE actually dedupes',
+ up: (db) => {
+ // Tolerate edges-table-missing (synthetic test DBs that only need
+ // the FTS / nodes side of the schema): if there's no edges table,
+ // there are no duplicates to dedup or unique constraint to add.
+ const hasEdges = (db
+ .prepare(`SELECT COUNT(*) AS c FROM sqlite_master WHERE type='table' AND name='edges'`)
+ .get() as { c: number }).c > 0;
+ if (!hasEdges) return;
+
+ // Without a UNIQUE constraint the existing `INSERT OR IGNORE INTO
+ // edges` was a no-op for dedup purposes. Collapse accumulated
+ // duplicates first, then add the UNIQUE index. COALESCE keeps
+ // NULL line/col values comparable.
+ db.exec(`
+ DELETE FROM edges
+ WHERE id NOT IN (
+ SELECT MIN(id) FROM edges
+ GROUP BY source, target, kind, COALESCE(line, -1), COALESCE(col, -1)
+ );
+ CREATE UNIQUE INDEX IF NOT EXISTS idx_edges_unique
+ ON edges(source, target, kind, COALESCE(line, -1), COALESCE(col, -1));
+ `);
+ },
+};
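
For reviewers unfamiliar with the SQLite quirk the COALESCE addresses: a plain UNIQUE index treats NULLs as distinct, so `INSERT OR IGNORE` never conflicts on NULL line/col. A standalone sketch (better-sqlite3 used purely for the demo):

```ts
import Database from 'better-sqlite3';

const db = new Database(':memory:');
db.exec(`CREATE TABLE t (source TEXT, target TEXT, kind TEXT, line INT, col INT,
         UNIQUE(source, target, kind, line, col));`);
db.prepare(`INSERT OR IGNORE INTO t VALUES ('a','b','calls',NULL,NULL)`).run();
db.prepare(`INSERT OR IGNORE INTO t VALUES ('a','b','calls',NULL,NULL)`).run();
// Both rows land: NULL never equals NULL for uniqueness purposes.
// Indexing on COALESCE(line,-1), COALESCE(col,-1) instead makes the
// second insert a genuine no-op, which is what this migration sets up.
```
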
diff --git a/src/db/migrations/009-fts-subwords-porter.ts b/src/db/migrations/009-fts-subwords-porter.ts
new file mode 100644
index 00000000..032058cc
--- /dev/null
+++ b/src/db/migrations/009-fts-subwords-porter.ts
@@ -0,0 +1,68 @@
+import type { MigrationModule } from './types';
+import { buildNameSubwords } from '../../utils';
+
+export const MIGRATION: MigrationModule = {
+ description:
+ 'Add name_subwords + Porter stemmer to FTS so natural-language and partial-identifier queries work',
+ up: (db) => {
+ // 1. Add the synthetic subwords column to nodes — idempotent so a
+ // re-run after a partial DDL failure (SQLite auto-commits DDL,
+ // so only some of these statements may have landed) doesn't fail
+ // with "duplicate column name".
+ const cols = db.prepare(`PRAGMA table_info(nodes);`).all() as Array<{ name: string }>;
+ if (!cols.some((c) => c.name === 'name_subwords')) {
+ db.exec(`ALTER TABLE nodes ADD COLUMN name_subwords TEXT;`);
+ }
+
+ // 2. Drop the existing FTS table + triggers. We can't ALTER the
+ // FTS5 tokenizer in place; recreating is the supported path.
+ db.exec(`
+ DROP TRIGGER IF EXISTS nodes_ai;
+ DROP TRIGGER IF EXISTS nodes_ad;
+ DROP TRIGGER IF EXISTS nodes_au;
+ DROP TABLE IF EXISTS nodes_fts;
+ `);
+
+ // 3. Recreate the FTS table — but DO NOT recreate the triggers yet.
+ db.exec(`
+ CREATE VIRTUAL TABLE nodes_fts USING fts5(
+ id, name, qualified_name, docstring, signature, name_subwords,
+ content='nodes',
+ content_rowid='rowid',
+ tokenize="porter unicode61"
+ );
+ `);
+
+ // 4. Backfill name_subwords.
+ const rows = db
+ .prepare('SELECT id, name FROM nodes')
+ .all() as Array<{ id: string; name: string }>;
+ const update = db.prepare('UPDATE nodes SET name_subwords = ? WHERE id = ?');
+ for (const row of rows) {
+ update.run(buildNameSubwords(row.name), row.id);
+ }
+
+ // 5. Rebuild the FTS index from the content table.
+ db.exec(`INSERT INTO nodes_fts(nodes_fts) VALUES('rebuild');`);
+
+ // 6. Re-attach the triggers — fire on subsequent application writes.
+ db.exec(`
+ CREATE TRIGGER nodes_ai AFTER INSERT ON nodes BEGIN
+ INSERT INTO nodes_fts(rowid, id, name, qualified_name, docstring, signature, name_subwords)
+ VALUES (NEW.rowid, NEW.id, NEW.name, NEW.qualified_name, NEW.docstring, NEW.signature, NEW.name_subwords);
+ END;
+
+ CREATE TRIGGER nodes_ad AFTER DELETE ON nodes BEGIN
+ INSERT INTO nodes_fts(nodes_fts, rowid, id, name, qualified_name, docstring, signature, name_subwords)
+ VALUES ('delete', OLD.rowid, OLD.id, OLD.name, OLD.qualified_name, OLD.docstring, OLD.signature, OLD.name_subwords);
+ END;
+
+ CREATE TRIGGER nodes_au AFTER UPDATE ON nodes BEGIN
+ INSERT INTO nodes_fts(nodes_fts, rowid, id, name, qualified_name, docstring, signature, name_subwords)
+ VALUES ('delete', OLD.rowid, OLD.id, OLD.name, OLD.qualified_name, OLD.docstring, OLD.signature, OLD.name_subwords);
+ INSERT INTO nodes_fts(rowid, id, name, qualified_name, docstring, signature, name_subwords)
+ VALUES (NEW.rowid, NEW.id, NEW.name, NEW.qualified_name, NEW.docstring, NEW.signature, NEW.name_subwords);
+ END;
+ `);
+ },
+};
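
`buildNameSubwords` lives in `src/utils.ts` and is not shown in this diff; the backfill above assumes roughly the following contract (sketch only, the real implementation may differ in detail):

```ts
// Assumed behavior: split camelCase and snake_case into space-joined,
// lowercased subword tokens that FTS5 then indexes as separate terms.
function buildNameSubwordsSketch(name: string): string {
  return name
    .replace(/([a-z0-9])([A-Z])/g, '$1 $2') // camelCase boundary
    .replace(/[_\-.]+/g, ' ')               // snake/kebab separators
    .toLowerCase()
    .trim();
}
// buildNameSubwordsSketch('getParserState')   -> 'get parser state'
// buildNameSubwordsSketch('HTTP_MAX_RETRIES') -> 'http max retries'
```
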
diff --git a/src/db/migrations/index.ts b/src/db/migrations/index.ts
new file mode 100644
index 00000000..b1d7b9a6
--- /dev/null
+++ b/src/db/migrations/index.ts
@@ -0,0 +1,118 @@
+/**
+ * Migration registry.
+ *
+ * Adding a new schema migration is:
+ *
+ * 1. Pick the next free 3-digit prefix (`NNN`) — `git ls-files
+ * 'src/db/migrations/[0-9]*.ts'` shows what's taken.
+ * 2. Create `src/db/migrations/NNN-kebab-name.ts`
+ * exporting a `MIGRATION: MigrationModule` (just `description`
+ * and `up(db)`).
+ * 3. Add **one** import line and **one** array entry to this file.
+ *
+ * **Why filename-derived versions instead of a field?** Two PRs
+ * adding migrations independently used to collide on the
+ * `migrations[]` array AND the `CURRENT_SCHEMA_VERSION` const.
+ * With monolithic migrations.ts, "I claimed v4 / you claimed v4"
+ * resolved as "second PR's v4 silently no-ops" — a real bug class
+ * (PR #113's reviewer caught one). With filename-derived versions,
+ * two PRs both creating `004-foo.ts` produce a filesystem-level
+ * conflict the maintainer sees instantly.
+ *
+ * `CURRENT_SCHEMA_VERSION` is the max of all registered versions.
+ */
+
+import type { Migration, MigrationModule } from './types';
+
+import { MIGRATION as MIG_002 } from './002-project-metadata';
+import { MIGRATION as MIG_003 } from './003-lower-name-index';
+import { MIGRATION as MIG_004 } from './004-centrality-churn';
+import { MIGRATION as MIG_005 } from './005-symbol-issues';
+import { MIGRATION as MIG_006 } from './006-config-refs';
+import { MIGRATION as MIG_007 } from './007-sql-refs';
+import { MIGRATION as MIG_008 } from './008-edges-unique';
+import { MIGRATION as MIG_009 } from './009-fts-subwords-porter';
+
+interface ModuleRef {
+ /**
+ * Source filename. The 3-digit prefix is the source of truth for
+ * the version number — `validateRegistered` parses it. Keep this
+ * field in sync with the actual file on disk; the
+ * filesystem-cross-check test catches drift.
+ */
+ filename: string;
+ module: MigrationModule;
+}
+
+/**
+ * Static-import list of every migration. Two PRs adding
+ * migrations both add a single entry here; alphabetical ordering
+ * puts adjacent additions on different lines unless the version
+ * numbers themselves collide, in which case the filesystem
+ * collision on `NNN-*.ts` surfaces the conflict instantly.
+ */
+const REGISTERED_MODULES: readonly ModuleRef[] = [
+ { filename: '002-project-metadata.ts', module: MIG_002 },
+ { filename: '003-lower-name-index.ts', module: MIG_003 },
+ { filename: '004-centrality-churn.ts', module: MIG_004 },
+ { filename: '005-symbol-issues.ts', module: MIG_005 },
+ { filename: '006-config-refs.ts', module: MIG_006 },
+ { filename: '007-sql-refs.ts', module: MIG_007 },
+ { filename: '008-edges-unique.ts', module: MIG_008 },
+ { filename: '009-fts-subwords-porter.ts', module: MIG_009 },
+];
+
+/** Strict 3-digit prefix on each migration filename. */
+const FILENAME_PATTERN = /^(\d{3})-[a-z0-9]+(?:-[a-z0-9]+)*\.ts$/;
+
+/**
+ * Validate the registered set: filenames match the strict
+ * `NNN-name.ts` shape, version is parsed from the prefix (no
+ * hand-typed version field that can drift), versions are unique,
+ * and the result is sorted ascending. Throws loudly at module
+ * load if any invariant is violated rather than silently dropping
+ * a migration during `runMigrations()`.
+ */
+function validateRegistered(refs: readonly ModuleRef[]): readonly Migration[] {
+ if (refs.length === 0) {
+ throw new Error('[CodeGraph] migrations registry is empty');
+ }
+ const parsed = refs.map((r) => {
+ const m = FILENAME_PATTERN.exec(r.filename);
+ if (!m) {
+ throw new Error(
+ `[CodeGraph] migration filename "${r.filename}" does not match ` +
+ `expected pattern NNN-kebab-name.ts (3-digit prefix, lowercase kebab-case body)`
+ );
+ }
+ const version = parseInt(m[1]!, 10);
+ return {
+ version,
+ filename: r.filename,
+ description: r.module.description,
+ up: r.module.up,
+ };
+ });
+ const sorted = [...parsed].sort((a, b) => a.version - b.version);
+ for (let i = 1; i < sorted.length; i++) {
+ if (sorted[i]!.version === sorted[i - 1]!.version) {
+ throw new Error(
+ `[CodeGraph] duplicate migration version ${sorted[i]!.version}: ` +
+ `${sorted[i - 1]!.filename} vs ${sorted[i]!.filename}`
+ );
+ }
+ }
+ return sorted.map((r) => ({
+ version: r.version,
+ description: r.description,
+ up: r.up,
+ }));
+}
+
+export const ALL_MIGRATIONS: readonly Migration[] = validateRegistered(REGISTERED_MODULES);
+
+/**
+ * Highest registered migration version. Derived from the registry
+ * (no hand-maintained constant to keep in sync).
+ */
+export const CURRENT_SCHEMA_VERSION: number = ALL_MIGRATIONS[ALL_MIGRATIONS.length - 1]!.version;
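
Concretely, the three-step recipe from the header comment looks like this for a hypothetical next migration (file name and contents invented for illustration):

```ts
// src/db/migrations/010-nodes-kind-index.ts
import type { MigrationModule } from './types';

export const MIGRATION: MigrationModule = {
  description: 'Example: add an index on nodes(kind)',
  up: (db) => {
    db.exec(`CREATE INDEX IF NOT EXISTS idx_nodes_kind ON nodes(kind);`);
  },
};

// ...and in this registry file, one import plus one entry:
//   import { MIGRATION as MIG_010 } from './010-nodes-kind-index';
//   { filename: '010-nodes-kind-index.ts', module: MIG_010 },
```
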
diff --git a/src/db/migrations/types.ts b/src/db/migrations/types.ts
new file mode 100644
index 00000000..479af672
--- /dev/null
+++ b/src/db/migrations/types.ts
@@ -0,0 +1,25 @@
+/**
+ * Migration registry types.
+ *
+ * Each migration ships its own self-contained file
+ * (`./NNN-description.ts`) exporting a `MIGRATION:
+ * MigrationModule`. The version number is derived from the
+ * leading 3-digit prefix on the filename, NOT from a field in the
+ * module — this guarantees no two PRs can claim the same version
+ * silently (filenames collide on the filesystem; SQL migrations
+ * never silently no-op).
+ */
+
+import type { SqliteDatabase } from '../sqlite-adapter';
+
+export interface MigrationModule {
+ /** One-line description for `schema_versions` table + diagnostics. */
+ readonly description: string;
+ /** The actual schema-mutation function. Wrapped in a transaction. */
+ readonly up: (db: SqliteDatabase) => void;
+}
+
+export interface Migration extends MigrationModule {
+ /** Version derived from filename's leading NNN prefix. */
+ readonly version: number;
+}
diff --git a/src/db/queries.ts b/src/db/queries.ts
index 51f1a1ad..4a3edb90 100644
--- a/src/db/queries.ts
+++ b/src/db/queries.ts
@@ -17,8 +17,8 @@ import {
SearchOptions,
SearchResult,
} from '../types';
-import { safeJsonParse } from '../utils';
-import { kindBonus, nameMatchBonus, scorePathRelevance } from '../search/query-utils';
+import { safeJsonParse, buildNameSubwords } from '../utils';
+import { kindBonus, nameMatchBonus, scorePathRelevance, filterStopwords, diversifyByFile } from '../search/query-utils';
/**
* Database row types (snake_case from SQLite)
@@ -44,6 +44,7 @@ interface NodeRow {
decorators: string | null;
type_parameters: string | null;
updated_at: number;
+ centrality: number | null;
}
interface EdgeRow {
@@ -66,6 +67,10 @@ interface FileRow {
indexed_at: number;
node_count: number;
errors: string | null;
+ commit_count: number | null;
+ loc: number | null;
+ first_seen_ts: number | null;
+ last_touched_ts: number | null;
}
interface UnresolvedRefRow {
@@ -105,6 +110,7 @@ function rowToNode(row: NodeRow): Node {
decorators: row.decorators ? safeJsonParse(row.decorators, undefined) : undefined,
typeParameters: row.type_parameters ? safeJsonParse(row.type_parameters, undefined) : undefined,
updatedAt: row.updated_at,
+ centrality: row.centrality ?? undefined,
};
}
@@ -136,6 +142,10 @@ function rowToFileRecord(row: FileRow): FileRecord {
indexedAt: row.indexed_at,
nodeCount: row.node_count,
errors: row.errors ? safeJsonParse(row.errors, undefined) : undefined,
+ commitCount: row.commit_count ?? 0,
+ loc: row.loc ?? 0,
+ firstSeenTs: row.first_seen_ts ?? null,
+ lastTouchedTs: row.last_touched_ts ?? null,
};
}
@@ -170,7 +180,6 @@ export class QueryBuilder {
getFileByPath?: SqliteStatement;
getAllFiles?: SqliteStatement;
insertUnresolved?: SqliteStatement;
- deleteUnresolvedByNode?: SqliteStatement;
getUnresolvedByName?: SqliteStatement;
getNodesByName?: SqliteStatement;
getNodesByQualifiedNameExact?: SqliteStatement;
@@ -185,6 +194,14 @@ export class QueryBuilder {
this.db = db;
}
+ /**
+ * Execute a callback inside a single SQLite transaction. Useful when a
+ * caller needs several `QueryBuilder` operations to commit atomically.
+ */
+ transaction<T>(fn: () => T): T {
+ return this.db.transaction(fn)();
+ }
+
// ===========================================================================
// Node Operations
// ===========================================================================
@@ -200,13 +217,13 @@ export class QueryBuilder {
start_line, end_line, start_column, end_column,
docstring, signature, visibility,
is_exported, is_async, is_static, is_abstract,
- decorators, type_parameters, updated_at
+ decorators, type_parameters, updated_at, name_subwords
) VALUES (
@id, @kind, @name, @qualifiedName, @filePath, @language,
@startLine, @endLine, @startColumn, @endColumn,
@docstring, @signature, @visibility,
@isExported, @isAsync, @isStatic, @isAbstract,
- @decorators, @typeParameters, @updatedAt
+ @decorators, @typeParameters, @updatedAt, @nameSubwords
)
`);
}
@@ -223,6 +240,12 @@ export class QueryBuilder {
return;
}
+ // INSERT OR REPLACE may overwrite a node we have cached. Drop the
+ // stale entry so the next getNodeById sees the new row, not the old
+ // one (matches the cache-invalidation pattern used by updateNode and
+ // deleteNode below).
+ this.nodeCache.delete(node.id);
+
try {
this.stmts.insertNode.run({
id: node.id,
@@ -245,6 +268,7 @@ export class QueryBuilder {
decorators: node.decorators ? JSON.stringify(node.decorators) : null,
typeParameters: node.typeParameters ? JSON.stringify(node.typeParameters) : null,
updatedAt: node.updatedAt ?? Date.now(),
+ nameSubwords: buildNameSubwords(node.name),
});
} catch (error) {
throw error;
@@ -287,7 +311,8 @@ export class QueryBuilder {
is_abstract = @isAbstract,
decorators = @decorators,
type_parameters = @typeParameters,
- updated_at = @updatedAt
+ updated_at = @updatedAt,
+ name_subwords = @nameSubwords
WHERE id = @id
`);
}
@@ -322,6 +347,7 @@ export class QueryBuilder {
decorators: node.decorators ? JSON.stringify(node.decorators) : null,
typeParameters: node.typeParameters ? JSON.stringify(node.typeParameters) : null,
updatedAt: node.updatedAt ?? Date.now(),
+ nameSubwords: buildNameSubwords(node.name),
});
}
@@ -379,6 +405,59 @@ export class QueryBuilder {
return node;
}
+ /**
+ * Batch lookup: fetch many nodes by ID in a single SQL round-trip.
+ *
+ * Replaces the N+1 pattern in graph traversal where every edge would
+ * trigger its own `getNodeById` call. For a function with 50 callers
+ * this collapses 50 point reads into one IN-list query (~10-50x
+ * faster end-to-end).
+ *
+ * Returns a Map keyed by id so callers can preserve their own ordering
+ * (typically the order edges were returned from the graph). Missing IDs
+ * are simply absent from the map.
+ *
+ * Cache-aware: ids already in the LRU cache are served from memory and
+ * the SQL query only touches the misses.
+ */
+ getNodesByIds(ids: readonly string[]): Map<string, Node> {
+ const out = new Map<string, Node>();
+ if (ids.length === 0) return out;
+
+ // Serve cache hits first; build the miss list for SQL.
+ const misses: string[] = [];
+ for (const id of ids) {
+ const cached = this.nodeCache.get(id);
+ if (cached !== undefined) {
+ // LRU touch
+ this.nodeCache.delete(id);
+ this.nodeCache.set(id, cached);
+ out.set(id, cached);
+ } else {
+ misses.push(id);
+ }
+ }
+ if (misses.length === 0) return out;
+
+ // Chunk under SQLite's parameter limit (default 999, raised to 32766
+ // in better-sqlite3 builds — chunk at 500 for safety across both
+ // backends and to keep the query plan simple).
+ const CHUNK = 500;
+ for (let i = 0; i < misses.length; i += CHUNK) {
+ const chunk = misses.slice(i, i + CHUNK);
+ const placeholders = chunk.map(() => '?').join(',');
+ const rows = this.db
+ .prepare(`SELECT * FROM nodes WHERE id IN (${placeholders})`)
+ .all(...chunk) as NodeRow[];
+ for (const row of rows) {
+ const node = rowToNode(row);
+ out.set(node.id, node);
+ this.cacheNode(node);
+ }
+ }
+ return out;
+ }
+
/**
* Add a node to the cache, evicting oldest if needed
*/
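
A before/after sketch of the traversal call site this replaces (edge and query-builder names are assumed; the real caller lives in the graph layer):

```ts
// Before: one point read per edge, i.e. the N+1 pattern.
//   const callers = edges.map((e) => qb.getNodeById(e.source));

// After: one batched IN-list query; the caller keeps its own ordering.
const byId = qb.getNodesByIds(edges.map((e) => e.source));
const callers = edges
  .map((e) => byId.get(e.source))
  .filter((n): n is Node => n !== undefined); // missing IDs are simply absent
```
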
@@ -478,7 +557,13 @@ export class QueryBuilder {
* 3. Score results based on match quality
*/
searchNodes(query: string, options: SearchOptions = {}): SearchResult[] {
- const { kinds, languages, limit = 100, offset = 0 } = options;
+ const { kinds, languages, limit = 100, offset = 0, perFileCap = 3 } = options;
+
+ // Note on over-fetching: searchNodesFTS already over-fetches by 5x
+ // internally (Math.max(limit*5, 100)) so its own rescoring pass has
+ // headroom. That same headroom feeds the per-file diversification
+ // below — no additional outer multiplier needed. Keeping this comment
+ // here so future readers don't reintroduce a multiplier-on-multiplier.
// First try FTS5 with prefix matching
let results = this.searchNodesFTS(query, { kinds, languages, limit, offset });
@@ -530,10 +615,23 @@ export class QueryBuilder {
+ nameMatchBonus(r.node.name, query),
}));
results.sort((a, b) => b.score - a.score);
- // Trim to requested limit after rescoring
- if (results.length > limit) {
- results = results.slice(0, limit);
- }
+ }
+
+ // Diversification: cap per-file results so the top-K isn't dominated
+ // by the methods of a single class. Top-scoring hit per file is always
+ // included; the cap only kicks in for the second-and-onward members
+ // of the same file. perFileCap=0 disables.
+ //
+ // Guard `results.length > limit`: when results <= limit there's
+ // nothing to drop, so the existing score order is already what the
+ // caller will see. (`diversifyByFile` is also safe to call here and
+ // would reorder within the same set, but the existing rescore order
+ // is already meaningful and we don't want to perturb it without
+ // benefit.)
+ if (perFileCap > 0 && results.length > limit) {
+ results = diversifyByFile(results, limit, perFileCap);
+ } else if (results.length > limit) {
+ results = results.slice(0, limit);
}
return results;
@@ -545,30 +643,38 @@ export class QueryBuilder {
private searchNodesFTS(query: string, options: SearchOptions): SearchResult[] {
const { kinds, languages, limit = 100, offset = 0 } = options;
- // Add prefix wildcard for better matching (e.g., "auth" matches "AuthService", "authenticate")
- // Escape special FTS5 characters and add prefix wildcard
- const ftsQuery = query
- .replace(/['"*():^]/g, '') // Remove FTS5 special chars
+ // Build the FTS query in three steps:
+ // 1. Strip characters with special meaning to FTS5 and split on whitespace.
+ // 2. Drop FTS5 boolean operators (AND/OR/NOT/NEAR) — prevents user input
+ // from injecting boolean structure into the OR-join below.
+ // 3. Drop English stopwords for natural-language queries — words like
+ // "how" / "the" otherwise become OR'd hits against any prose-bearing
+ // docstring and crowd out the actually-relevant identifier tokens.
+ const rawTerms = query
+ .replace(/['"*():^]/g, '')
.split(/\s+/)
- .filter(term => term.length > 0)
- // Strip FTS5 boolean operators to prevent query manipulation
- .filter(term => !/^(AND|OR|NOT|NEAR)$/i.test(term))
- .map(term => `"${term}"*`) // Prefix match each term
+ .filter((term) => term.length > 0)
+ .filter((term) => !/^(AND|OR|NOT|NEAR)$/i.test(term));
+
+ const filteredTerms = filterStopwords(rawTerms);
+
+ const ftsQuery = filteredTerms
+ .map((term) => `"${term}"*`) // Prefix match each term
.join(' OR ');
if (!ftsQuery) {
return [];
}
- // BM25 column weights: id=0, name=20, qualified_name=5, docstring=1, signature=2
- // Heavy name weight ensures exact/prefix name matches rank above incidental
- // mentions in long docstrings or qualified names of nested symbols.
- // Fetch 5x requested limit so post-hoc rescoring (kindBonus, pathRelevance,
- // nameMatchBonus) can promote results that BM25 alone undervalues.
+ // BM25 column weights: id=0, name=20, qualified_name=5, docstring=1,
+ // signature=2, name_subwords=10. Heavy name weight keeps exact and prefix
+ // name matches above incidental mentions in long docstrings; the new
+ // name_subwords column at 10× lets queries hit subword tokens like
+ // `parser` against `getParser` without burying full-name matches.
const ftsLimit = Math.max(limit * 5, 100);
let sql = `
- SELECT nodes.*, bm25(nodes_fts, 0, 20, 5, 1, 2) as score
+ SELECT nodes.*, bm25(nodes_fts, 0, 20, 5, 1, 2, 10) as score
FROM nodes_fts
JOIN nodes ON nodes_fts.id = nodes.id
WHERE nodes_fts MATCH ?
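
Worked through on a sample natural-language query (the exact stopword list inside `filterStopwords` is an assumption here):

```ts
const q = 'how does the auth token refresh';
const rawTerms = q
  .replace(/['"*():^]/g, '')
  .split(/\s+/)
  .filter((t) => t.length > 0)
  .filter((t) => !/^(AND|OR|NOT|NEAR)$/i.test(t));
// rawTerms -> ['how', 'does', 'the', 'auth', 'token', 'refresh']
const filtered = filterStopwords(rawTerms);
// filtered -> ['auth', 'token', 'refresh']   (assuming how/does/the are stopwords)
const ftsQuery = filtered.map((t) => `"${t}"*`).join(' OR ');
// ftsQuery -> '"auth"* OR "token"* OR "refresh"*'
```
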
@@ -916,7 +1022,12 @@ export class QueryBuilder {
// ===========================================================================
/**
- * Insert or update a file record
+ * Insert or update a file record.
+ *
+ * Churn columns (commit_count, loc, first_seen_ts, last_touched_ts)
+ * are deliberately omitted from the ON CONFLICT update list — they
+ * are managed exclusively by `applyChurnDeltas` / `applyLocUpdates`.
+ * Adding them here would clobber mined git history on every re-index.
*/
upsertFile(file: FileRecord): void {
if (!this.stmts.upsertFile) {
@@ -1032,17 +1143,8 @@ export class QueryBuilder {
insert();
}
- /**
- * Delete unresolved references from a node
- */
- deleteUnresolvedByNode(nodeId: string): void {
- if (!this.stmts.deleteUnresolvedByNode) {
- this.stmts.deleteUnresolvedByNode = this.db.prepare(
- 'DELETE FROM unresolved_refs WHERE from_node_id = ?'
- );
- }
- this.stmts.deleteUnresolvedByNode.run(nodeId);
- }
+ // (deleteUnresolvedByNode removed — never called; FK cascade on
+ // nodes(id) → unresolved_refs.from_node_id handles cleanup automatically.)
/**
* Get unresolved references by name (for resolution)
@@ -1295,4 +1397,526 @@ export class QueryBuilder {
this.db.exec('DELETE FROM files');
})();
}
+
+ // ===========================================================================
+ // Centrality (PageRank scores on nodes)
+ // ===========================================================================
+
+ /**
+ * Apply PageRank scores to the nodes table in a single transaction.
+ * Existing scores for ids not in the map are NOT cleared — call
+ * `clearCentrality()` first for a from-scratch recompute.
+ */
+ applyCentralityScores(scores: Map<string, number>): void {
+ if (scores.size === 0) return;
+ const stmt = this.db.prepare('UPDATE nodes SET centrality = ? WHERE id = ?');
+ this.db.transaction(() => {
+ for (const [id, score] of scores) {
+ stmt.run(score, id);
+ }
+ })();
+ // Cached node objects now have stale centrality. Drop the cache;
+ // subsequent reads pull the fresh value.
+ this.nodeCache.clear();
+ }
+
+ /** Reset all centrality values to NULL (fresh-recompute path). */
+ clearCentrality(): void {
+ this.db.exec('UPDATE nodes SET centrality = NULL');
+ this.nodeCache.clear();
+ }
+
+ /**
+ * Get top-N nodes by centrality, descending. Filters out NULL
+ * centrality (= not yet computed). Optional `kind` filter narrows
+ * to one node kind; optional `minCentrality` filters out the long
+ * tail of essentially-zero ranks.
+ */
+ getTopNodesByCentrality(opts: {
+ limit?: number;
+ kind?: NodeKind;
+ minCentrality?: number;
+ } = {}): Node[] {
+ const limit = opts.limit ?? 25;
+ const minCentrality = opts.minCentrality ?? 0;
+ const where: string[] = ['centrality IS NOT NULL', 'centrality >= ?'];
+ const params: (string | number)[] = [minCentrality];
+ if (opts.kind) {
+ where.push('kind = ?');
+ params.push(opts.kind);
+ }
+ const sql = `SELECT * FROM nodes WHERE ${where.join(' AND ')}
+ ORDER BY centrality DESC LIMIT ?`;
+ params.push(limit);
+ const rows = this.db.prepare(sql).all(...params) as NodeRow[];
+ return rows.map(rowToNode);
+ }
+
+ /**
+ * Compute the rank (1-based) of a single node by centrality.
+ * Returns null if the node has no centrality yet.
+ */
+ getCentralityRank(nodeId: string): { rank: number; total: number } | null {
+ const row = this.db
+ .prepare('SELECT centrality FROM nodes WHERE id = ?')
+ .get(nodeId) as { centrality: number | null } | undefined;
+ if (!row || row.centrality === null) return null;
+ const above = this.db
+ .prepare('SELECT COUNT(*) AS c FROM nodes WHERE centrality > ?')
+ .get(row.centrality) as { c: number };
+ const total = this.db
+ .prepare('SELECT COUNT(*) AS c FROM nodes WHERE centrality IS NOT NULL')
+ .get() as { c: number };
+ return { rank: above.c + 1, total: total.c };
+ }
+
+ // ===========================================================================
+ // Per-file churn (mined from git log)
+ // ===========================================================================
+
+ /**
+ * Apply churn deltas to the files table. For each delta:
+ * commit_count += commitCountDelta
+ * last_touched_ts = MAX(existing, lastTouchedTs)
+ * first_seen_ts = COALESCE(existing, firstSeenTs) // sticky
+ *
+ * Files in the delta map but not in the files table (uncommon —
+ * they'd have to be mined-but-never-indexed) are silently skipped.
+ */
+ applyChurnDeltas(
+ deltas: Iterable<{
+ path: string;
+ commitCountDelta: number;
+ lastTouchedTs: number;
+ firstSeenTs: number;
+ }>
+ ): void {
+ const stmt = this.db.prepare(
+ `UPDATE files
+ SET commit_count = commit_count + ?,
+ last_touched_ts = MAX(COALESCE(last_touched_ts, 0), ?),
+ first_seen_ts = COALESCE(first_seen_ts, ?)
+ WHERE path = ?`
+ );
+ this.db.transaction(() => {
+ for (const d of deltas) {
+ stmt.run(d.commitCountDelta, d.lastTouchedTs, d.firstSeenTs, d.path);
+ }
+ })();
+ }
+
+ /** Reset all churn columns; used before a full re-mine. Does not touch `loc`. */
+ clearChurn(): void {
+ this.db.exec(
+ `UPDATE files SET commit_count = 0, last_touched_ts = NULL, first_seen_ts = NULL`
+ );
+ }
+
+ /** Update the on-disk LOC for a single file. Cheap; called per changed file. */
+ updateFileLoc(filePath: string, loc: number): void {
+ this.db.prepare('UPDATE files SET loc = ? WHERE path = ?').run(loc, filePath);
+ }
+
+ /** Bulk LOC update — used during indexAll to refresh LOC for every indexed file. */
+ applyLocUpdates(entries: Iterable<{ path: string; loc: number }>): void {
+ const stmt = this.db.prepare('UPDATE files SET loc = ? WHERE path = ?');
+ this.db.transaction(() => {
+ for (const e of entries) stmt.run(e.loc, e.path);
+ })();
+ }
+
+ getTopFilesByChurn(opts: { limit?: number; minCommits?: number } = {}): FileRecord[] {
+ const limit = opts.limit ?? 25;
+ const minCommits = opts.minCommits ?? 1;
+ const rows = this.db
+ .prepare(
+ `SELECT * FROM files WHERE commit_count >= ?
+ ORDER BY commit_count DESC LIMIT ?`
+ )
+ .all(minCommits, limit) as FileRow[];
+ return rows.map(rowToFileRecord);
+ }
+
+ /**
+ * Hotspots: files ranked by `risk = (Σ centrality of nodes in file) × commit_count`.
+ *
+ * Both inputs are optional in their own right; with neither computed,
+ * every file scores 0 and the ranking is meaningless, so pass
+ * minCommits / minCentrality to filter. Sorting modes:
+ * - 'risk' : the combined score (default; what "hotspot" means)
+ * - 'centrality' : pure structural importance
+ * - 'churn' : pure change frequency
+ */
+ getHotspots(opts: {
+ limit?: number;
+ minCommits?: number;
+ minCentrality?: number;
+ sortBy?: 'risk' | 'centrality' | 'churn';
+ } = {}): Array<{
+ filePath: string;
+ fileCentrality: number;
+ commitCount: number;
+ loc: number;
+ lastTouchedTs: number | null;
+ riskScore: number;
+ }> {
+ const limit = opts.limit ?? 15;
+ const minCommits = opts.minCommits ?? 0;
+ const minCentrality = opts.minCentrality ?? 0;
+ const sortBy = opts.sortBy ?? 'risk';
+
+ const orderBy =
+ sortBy === 'centrality'
+ ? 'fileCentrality DESC'
+ : sortBy === 'churn'
+ ? 'commitCount DESC'
+ : 'riskScore DESC';
+
+ // Aggregate centrality at file level. LEFT JOIN so files without any
+ // indexed nodes (rare — schema-only files) still surface if they have churn.
+ const sql = `
+ SELECT
+ f.path AS filePath,
+ COALESCE(n_agg.fc, 0.0) AS fileCentrality,
+ f.commit_count AS commitCount,
+ f.loc AS loc,
+ f.last_touched_ts AS lastTouchedTs,
+ COALESCE(n_agg.fc, 0.0) * f.commit_count AS riskScore
+ FROM files f
+ LEFT JOIN (
+ SELECT file_path, SUM(centrality) AS fc
+ FROM nodes WHERE centrality IS NOT NULL
+ GROUP BY file_path
+ ) n_agg ON n_agg.file_path = f.path
+ WHERE f.commit_count >= ? AND COALESCE(n_agg.fc, 0.0) >= ?
+ ORDER BY ${orderBy}
+ LIMIT ?
+ `;
+ const rows = this.db.prepare(sql).all(minCommits, minCentrality, limit) as Array<{
+ filePath: string;
+ fileCentrality: number;
+ commitCount: number;
+ loc: number;
+ lastTouchedTs: number | null;
+ riskScore: number;
+ }>;
+ return rows;
+ }
+
+ // ===========================================================================
+ // Symbol-issue attributions (mined from git history)
+ // ===========================================================================
+
+ applyIssueAttributions(
+ rows: Iterable<{
+ nodeId: string;
+ issueNumber: number;
+ commitSha: string;
+ kind: 'modified' | 'added' | 'removed';
+ }>
+ ): void {
+ const stmt = this.db.prepare(
+ `INSERT OR IGNORE INTO symbol_issues (node_id, issue_number, commit_sha, kind)
+ VALUES (?, ?, ?, ?)`
+ );
+ this.db.transaction(() => {
+ for (const r of rows) {
+ stmt.run(r.nodeId, r.issueNumber, r.commitSha, r.kind);
+ }
+ })();
+ }
+
+ clearIssueAttributions(): void {
+ this.db.exec('DELETE FROM symbol_issues');
+ }
+
+ getIssuesForNode(nodeId: string): Array<{
+ issueNumber: number;
+ kind: 'modified' | 'added' | 'removed';
+ commitSha: string;
+ }> {
+ return this.db
+ .prepare(
+ `SELECT issue_number AS issueNumber, kind, commit_sha AS commitSha
+ FROM symbol_issues
+ WHERE node_id = ?
+ ORDER BY issue_number ASC, kind ASC`
+ )
+ .all(nodeId) as Array<{
+ issueNumber: number;
+ kind: 'modified' | 'added' | 'removed';
+ commitSha: string;
+ }>;
+ }
+
+ getNodesForIssue(issueNumber: number): Array<{
+ nodeId: string;
+ kind: 'modified' | 'added' | 'removed';
+ commitSha: string;
+ }> {
+ return this.db
+ .prepare(
+ `SELECT node_id AS nodeId, kind, commit_sha AS commitSha
+ FROM symbol_issues
+ WHERE issue_number = ?
+ ORDER BY node_id ASC`
+ )
+ .all(issueNumber) as Array<{
+ nodeId: string;
+ kind: 'modified' | 'added' | 'removed';
+ commitSha: string;
+ }>;
+ }
+
+ // ===========================================================================
+ // Config references (env vars / feature flags read sites)
+ // ===========================================================================
+
+ applyConfigRefs(
+ rows: Array<{
+ configKind: 'env';
+ configKey: string;
+ sourceNodeId: string | null;
+ filePath: string;
+ line: number;
+ }>
+ ): void {
+ if (rows.length === 0) return;
+ const distinctFiles = new Set(rows.map((r) => r.filePath));
+ const deleteStmt = this.db.prepare('DELETE FROM config_refs WHERE file_path = ?');
+ const insertStmt = this.db.prepare(
+ `INSERT INTO config_refs (config_kind, config_key, source_node_id, file_path, line)
+ VALUES (?, ?, ?, ?, ?)`
+ );
+ this.db.transaction(() => {
+ for (const f of distinctFiles) deleteStmt.run(f);
+ for (const r of rows) {
+ insertStmt.run(r.configKind, r.configKey, r.sourceNodeId, r.filePath, r.line);
+ }
+ })();
+ }
+
+ clearConfigRefs(): void {
+ this.db.exec('DELETE FROM config_refs');
+ }
+
+ deleteConfigRefsForPaths(filePaths: Iterable<string>): void {
+ const stmt = this.db.prepare('DELETE FROM config_refs WHERE file_path = ?');
+ this.db.transaction(() => {
+ for (const p of filePaths) stmt.run(p);
+ })();
+ }
+
+ pruneOrphanedConfigRefs(): void {
+ this.db.exec(
+ `DELETE FROM config_refs WHERE file_path NOT IN (SELECT path FROM files)`
+ );
+ }
+
+ getConfigKeys(opts: { configKind?: 'env'; limit?: number } = {}): Array<{
+ configKey: string;
+ reads: number;
+ distinctFiles: number;
+ }> {
+ const limit = opts.limit ?? 200;
+ const where = opts.configKind ? 'WHERE config_kind = ?' : '';
+ const params = opts.configKind ? [opts.configKind, limit] : [limit];
+ return this.db
+ .prepare(
+ `SELECT config_key AS configKey,
+ COUNT(*) AS reads,
+ COUNT(DISTINCT file_path) AS distinctFiles
+ FROM config_refs
+ ${where}
+ GROUP BY config_key
+ ORDER BY reads DESC, config_key ASC
+ LIMIT ?`
+ )
+ .all(...params) as Array<{ configKey: string; reads: number; distinctFiles: number }>;
+ }
+
+ getConfigRefsByKey(
+ configKey: string,
+ opts: { configKind?: 'env' } = {}
+ ): Array<{
+ filePath: string;
+ line: number;
+ sourceNodeId: string | null;
+ sourceName: string | null;
+ sourceKind: string | null;
+ }> {
+ const kind = opts.configKind ?? 'env';
+ return this.db
+ .prepare(
+ `SELECT cr.file_path AS filePath,
+ cr.line AS line,
+ cr.source_node_id AS sourceNodeId,
+ n.name AS sourceName,
+ n.kind AS sourceKind
+ FROM config_refs cr
+ LEFT JOIN nodes n ON n.id = cr.source_node_id
+ WHERE cr.config_kind = ? AND cr.config_key = ?
+ ORDER BY cr.file_path ASC, cr.line ASC`
+ )
+ .all(kind, configKey) as Array<{
+ filePath: string;
+ line: number;
+ sourceNodeId: string | null;
+ sourceName: string | null;
+ sourceKind: string | null;
+ }>;
+ }
+
+ getConfigKeysForNode(nodeId: string): Array<{ configKey: string; line: number }> {
+ return this.db
+ .prepare(
+ `SELECT config_key AS configKey, line
+ FROM config_refs
+ WHERE source_node_id = ?
+ ORDER BY config_key ASC, line ASC`
+ )
+ .all(nodeId) as Array<{ configKey: string; line: number }>;
+ }
+
+ // ===========================================================================
+ // SQL references (table-name string-literal refs from app code)
+ // ===========================================================================
+
+ applySqlRefs(
+ rows: Array<{
+ tableName: string;
+ op: 'read' | 'write' | 'ddl';
+ sourceNodeId: string | null;
+ filePath: string;
+ line: number;
+ }>
+ ): void {
+ if (rows.length === 0) return;
+ const stmt = this.db.prepare(
+ `INSERT INTO sql_refs (table_name, op, source_node_id, file_path, line)
+ VALUES (?, ?, ?, ?, ?)`
+ );
+ this.db.transaction(() => {
+ for (const r of rows) {
+ stmt.run(r.tableName, r.op, r.sourceNodeId, r.filePath, r.line);
+ }
+ })();
+ }
+
+ replaceAllSqlRefs(
+ rows: Array<{
+ tableName: string;
+ op: 'read' | 'write' | 'ddl';
+ sourceNodeId: string | null;
+ filePath: string;
+ line: number;
+ }>
+ ): void {
+ const insert = this.db.prepare(
+ `INSERT INTO sql_refs (table_name, op, source_node_id, file_path, line)
+ VALUES (?, ?, ?, ?, ?)`
+ );
+ this.db.transaction(() => {
+ this.db.exec('DELETE FROM sql_refs');
+ for (const r of rows) {
+ insert.run(r.tableName, r.op, r.sourceNodeId, r.filePath, r.line);
+ }
+ })();
+ }
+
+ deleteSqlRefsForPaths(filePaths: Iterable<string>): void {
+ const stmt = this.db.prepare('DELETE FROM sql_refs WHERE file_path = ?');
+ this.db.transaction(() => {
+ for (const p of filePaths) stmt.run(p);
+ })();
+ }
+
+ clearSqlRefs(): void {
+ this.db.exec('DELETE FROM sql_refs');
+ }
+
+ pruneOrphanedSqlRefs(): void {
+ this.db.exec(
+ `DELETE FROM sql_refs WHERE file_path NOT IN (SELECT path FROM files)`
+ );
+ }
+
+ getSqlTables(opts: { limit?: number } = {}): Array<{
+ tableName: string;
+ reads: number;
+ writes: number;
+ ddl: number;
+ total: number;
+ }> {
+ const limit = opts.limit ?? 100;
+ return this.db
+ .prepare(
+ `SELECT lower(table_name) AS tableName,
+ SUM(CASE WHEN op = 'read' THEN 1 ELSE 0 END) AS reads,
+ SUM(CASE WHEN op = 'write' THEN 1 ELSE 0 END) AS writes,
+ SUM(CASE WHEN op = 'ddl' THEN 1 ELSE 0 END) AS ddl,
+ COUNT(*) AS total
+ FROM sql_refs
+ GROUP BY lower(table_name)
+ ORDER BY total DESC, tableName ASC
+ LIMIT ?`
+ )
+ .all(limit) as Array<{
+ tableName: string;
+ reads: number;
+ writes: number;
+ ddl: number;
+ total: number;
+ }>;
+ }
+
+ getSqlRefsByTable(
+ tableName: string,
+ opts: { op?: 'read' | 'write' | 'ddl' } = {}
+ ): Array<{
+ op: 'read' | 'write' | 'ddl';
+ filePath: string;
+ line: number;
+ sourceNodeId: string | null;
+ sourceName: string | null;
+ sourceKind: string | null;
+ }> {
+ const params: Array<string> = [tableName.toLowerCase()];
+ let opFilter = '';
+ if (opts.op) {
+ opFilter = ' AND sr.op = ?';
+ params.push(opts.op);
+ }
+ return this.db
+ .prepare(
+ `SELECT sr.op AS op,
+ sr.file_path AS filePath,
+ sr.line AS line,
+ sr.source_node_id AS sourceNodeId,
+ n.name AS sourceName,
+ n.kind AS sourceKind
+ FROM sql_refs sr
+ LEFT JOIN nodes n ON n.id = sr.source_node_id
+ WHERE lower(sr.table_name) = ?${opFilter}
+ ORDER BY sr.file_path ASC, sr.line ASC`
+ )
+ .all(...params) as Array<{
+ op: 'read' | 'write' | 'ddl';
+ filePath: string;
+ line: number;
+ sourceNodeId: string | null;
+ sourceName: string | null;
+ sourceKind: string | null;
+ }>;
+ }
+
+ getSqlTablesForNode(nodeId: string): Array<{ tableName: string; op: string }> {
+ return this.db
+ .prepare(
+ `SELECT DISTINCT lower(table_name) AS tableName, op
+ FROM sql_refs
+ WHERE source_node_id = ?
+ ORDER BY tableName ASC, op ASC`
+ )
+ .all(nodeId) as Array<{ tableName: string; op: string }>;
+ }
}
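
A sketch of how the new churn/centrality surface is meant to be consumed (the reporting call site is hypothetical; nothing in this hunk wires it up):

```ts
// riskScore = (sum of node centrality in file) x commit_count, so both
// the PageRank pass and the git-history mining must have run for
// anything interesting to surface.
const hotspots = qb.getHotspots({ limit: 10, minCommits: 5, sortBy: 'risk' });
for (const h of hotspots) {
  console.log(
    `${h.filePath}  risk=${h.riskScore.toFixed(4)} ` +
      `(centrality=${h.fileCentrality.toFixed(4)}, commits=${h.commitCount}, loc=${h.loc})`
  );
}
```
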
diff --git a/src/db/schema.sql b/src/db/schema.sql
index dd0a9f06..be75f5de 100644
--- a/src/db/schema.sql
+++ b/src/db/schema.sql
@@ -37,7 +37,13 @@ CREATE TABLE IF NOT EXISTS nodes (
is_abstract INTEGER DEFAULT 0,
decorators TEXT, -- JSON array
type_parameters TEXT, -- JSON array
- updated_at INTEGER NOT NULL
+ updated_at INTEGER NOT NULL,
+ centrality REAL DEFAULT NULL, -- PageRank over calls+references; NULL until first compute
+ -- Camel/snake-split tokens of `name`, joined by spaces. The default
+ -- FTS5 tokenizer indexes each as a separate term, so a query for
+ -- `parser` finds `getParser` etc. Populated by buildNameSubwords()
+ -- in src/utils.ts on every insert/update.
+ name_subwords TEXT
);
-- Edges: Relationships between nodes
@@ -63,7 +69,12 @@ CREATE TABLE IF NOT EXISTS files (
modified_at INTEGER NOT NULL,
indexed_at INTEGER NOT NULL,
node_count INTEGER DEFAULT 0,
- errors TEXT -- JSON array
+ errors TEXT, -- JSON array
+ -- Churn signals (mined from git log)
+ commit_count INTEGER NOT NULL DEFAULT 0,
+ loc INTEGER NOT NULL DEFAULT 0,
+ first_seen_ts INTEGER DEFAULT NULL, -- unix seconds
+ last_touched_ts INTEGER DEFAULT NULL -- unix seconds
);
-- Unresolved References: References that need resolution after full indexing
@@ -92,34 +103,42 @@ CREATE INDEX IF NOT EXISTS idx_nodes_file_path ON nodes(file_path);
CREATE INDEX IF NOT EXISTS idx_nodes_language ON nodes(language);
CREATE INDEX IF NOT EXISTS idx_nodes_file_line ON nodes(file_path, start_line);
CREATE INDEX IF NOT EXISTS idx_nodes_lower_name ON nodes(lower(name));
+CREATE INDEX IF NOT EXISTS idx_nodes_centrality ON nodes(centrality DESC);
-- Full-text search index on node names, docstrings, and signatures
+-- The Porter stemmer collapses morphological variants so a query for
+-- `parsing` matches a docstring or subword containing `parser`/`parse`.
+-- This is the largest single quality lift for natural-language queries
+-- (verified empirically: targets that ranked #18-#19 or weren't in the
+-- top 20 jump to the top 5 — see __tests__/search-quality.test.ts).
CREATE VIRTUAL TABLE IF NOT EXISTS nodes_fts USING fts5(
id,
name,
qualified_name,
docstring,
signature,
+ name_subwords,
content='nodes',
- content_rowid='rowid'
+ content_rowid='rowid',
+ tokenize="porter unicode61"
);
-- Triggers to keep FTS index in sync
CREATE TRIGGER IF NOT EXISTS nodes_ai AFTER INSERT ON nodes BEGIN
- INSERT INTO nodes_fts(rowid, id, name, qualified_name, docstring, signature)
- VALUES (NEW.rowid, NEW.id, NEW.name, NEW.qualified_name, NEW.docstring, NEW.signature);
+ INSERT INTO nodes_fts(rowid, id, name, qualified_name, docstring, signature, name_subwords)
+ VALUES (NEW.rowid, NEW.id, NEW.name, NEW.qualified_name, NEW.docstring, NEW.signature, NEW.name_subwords);
END;
CREATE TRIGGER IF NOT EXISTS nodes_ad AFTER DELETE ON nodes BEGIN
- INSERT INTO nodes_fts(nodes_fts, rowid, id, name, qualified_name, docstring, signature)
- VALUES ('delete', OLD.rowid, OLD.id, OLD.name, OLD.qualified_name, OLD.docstring, OLD.signature);
+ INSERT INTO nodes_fts(nodes_fts, rowid, id, name, qualified_name, docstring, signature, name_subwords)
+ VALUES ('delete', OLD.rowid, OLD.id, OLD.name, OLD.qualified_name, OLD.docstring, OLD.signature, OLD.name_subwords);
END;
CREATE TRIGGER IF NOT EXISTS nodes_au AFTER UPDATE ON nodes BEGIN
- INSERT INTO nodes_fts(nodes_fts, rowid, id, name, qualified_name, docstring, signature)
- VALUES ('delete', OLD.rowid, OLD.id, OLD.name, OLD.qualified_name, OLD.docstring, OLD.signature);
- INSERT INTO nodes_fts(rowid, id, name, qualified_name, docstring, signature)
- VALUES (NEW.rowid, NEW.id, NEW.name, NEW.qualified_name, NEW.docstring, NEW.signature);
+ INSERT INTO nodes_fts(nodes_fts, rowid, id, name, qualified_name, docstring, signature, name_subwords)
+ VALUES ('delete', OLD.rowid, OLD.id, OLD.name, OLD.qualified_name, OLD.docstring, OLD.signature, OLD.name_subwords);
+ INSERT INTO nodes_fts(rowid, id, name, qualified_name, docstring, signature, name_subwords)
+ VALUES (NEW.rowid, NEW.id, NEW.name, NEW.qualified_name, NEW.docstring, NEW.signature, NEW.name_subwords);
END;
-- Edge indexes
@@ -129,9 +148,20 @@ CREATE INDEX IF NOT EXISTS idx_edges_kind ON edges(kind);
CREATE INDEX IF NOT EXISTS idx_edges_source_kind ON edges(source, kind);
CREATE INDEX IF NOT EXISTS idx_edges_target_kind ON edges(target, kind);
+-- Uniqueness for (source, target, kind, line, col). The id column is an
+-- AUTOINCREMENT primary key, so without this index `INSERT OR IGNORE`
+-- would never see a conflict — duplicate edges would silently accumulate
+-- on every re-resolution / re-emission. COALESCE keeps two NULL line/col
+-- values comparable as equal (SQLite treats raw NULLs in a UNIQUE index
+-- as distinct).
+CREATE UNIQUE INDEX IF NOT EXISTS idx_edges_unique
+ ON edges(source, target, kind, COALESCE(line, -1), COALESCE(col, -1));
+
-- File indexes
CREATE INDEX IF NOT EXISTS idx_files_language ON files(language);
CREATE INDEX IF NOT EXISTS idx_files_modified_at ON files(modified_at);
+CREATE INDEX IF NOT EXISTS idx_files_commit_count ON files(commit_count DESC);
+CREATE INDEX IF NOT EXISTS idx_files_last_touched ON files(last_touched_ts DESC);
-- Unresolved refs indexes
CREATE INDEX IF NOT EXISTS idx_unresolved_from_node ON unresolved_refs(from_node_id);
@@ -146,3 +176,61 @@ CREATE TABLE IF NOT EXISTS project_metadata (
value TEXT NOT NULL,
updated_at INTEGER NOT NULL
);
+
+-- Issue → symbol attribution mined from git history.
+-- One row per (node, issue, commit, kind) tuple; kind is 'modified'
+-- (enclosing function changed by hunk), 'added' (declaration on a +
+-- line), or 'removed' (declaration on a - line, dropped at lookup
+-- time when no current node matches).
+CREATE TABLE IF NOT EXISTS symbol_issues (
+ node_id TEXT NOT NULL,
+ issue_number INTEGER NOT NULL,
+ commit_sha TEXT NOT NULL,
+ kind TEXT NOT NULL CHECK (kind IN ('modified','added','removed')),
+ PRIMARY KEY (node_id, issue_number, commit_sha, kind),
+ FOREIGN KEY (node_id) REFERENCES nodes(id) ON DELETE CASCADE
+);
+CREATE INDEX IF NOT EXISTS idx_symbol_issues_node ON symbol_issues(node_id);
+CREATE INDEX IF NOT EXISTS idx_symbol_issues_issue ON symbol_issues(issue_number);
+
+-- Config references: read sites for env vars / feature flags / etc.
+-- One row per syntactic occurrence in source. config_kind narrows to
+-- 'env' (process.env, os.getenv, ...) for v1; future kinds add YAML
+-- keys, LaunchDarkly flags, etc. source_node_id may be NULL for
+-- top-level reads that aren't inside a function/method.
+CREATE TABLE IF NOT EXISTS config_refs (
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
+ config_kind TEXT NOT NULL,
+ config_key TEXT NOT NULL,
+ source_node_id TEXT,
+ file_path TEXT NOT NULL,
+ line INTEGER NOT NULL,
+ FOREIGN KEY (source_node_id) REFERENCES nodes(id) ON DELETE CASCADE
+);
+CREATE INDEX IF NOT EXISTS idx_config_refs_key
+ ON config_refs(config_kind, config_key);
+CREATE INDEX IF NOT EXISTS idx_config_refs_node
+ ON config_refs(source_node_id);
+CREATE INDEX IF NOT EXISTS idx_config_refs_file
+ ON config_refs(file_path);
+
+-- SQL references: per-call-site links from app code to a table name.
+-- One row per syntactic occurrence in source. op is 'read' (SELECT,
+-- FROM in non-DDL), 'write' (INSERT/UPDATE/DELETE), or 'ddl'
+-- (CREATE TABLE / ALTER TABLE / DROP TABLE -- rare in app code but
+-- catches migration scripts).
+CREATE TABLE IF NOT EXISTS sql_refs (
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
+ table_name TEXT NOT NULL,
+ op TEXT NOT NULL CHECK (op IN ('read','write','ddl')),
+ source_node_id TEXT,
+ file_path TEXT NOT NULL,
+ line INTEGER NOT NULL,
+ FOREIGN KEY (source_node_id) REFERENCES nodes(id) ON DELETE CASCADE
+);
+CREATE INDEX IF NOT EXISTS idx_sql_refs_table
+ ON sql_refs(lower(table_name));
+CREATE INDEX IF NOT EXISTS idx_sql_refs_node
+ ON sql_refs(source_node_id);
+CREATE INDEX IF NOT EXISTS idx_sql_refs_file
+ ON sql_refs(file_path);
diff --git a/src/default-config.ts b/src/default-config.ts
new file mode 100644
index 00000000..34769609
--- /dev/null
+++ b/src/default-config.ts
@@ -0,0 +1,199 @@
+/**
+ * Default project configuration.
+ *
+ * Lives in its own file (separate from `types.ts`) because the
+ * `include` glob list is derived from the language registry — and
+ * the registry transitively imports `types.ts` via per-language
+ * files, which would create an evaluation cycle if `default-config`
+ * were itself imported by `types.ts` eagerly.
+ *
+ * **Lazy include resolution.** The `include` array is built on
+ * first access via a property getter, not at module load. By the
+ * time anything reads `DEFAULT_CONFIG.include`, the registry has
+ * fully evaluated, so all language definitions are available.
+ */
+
+import type { CodeGraphConfig } from './types';
+import { getLanguageDefs } from './extraction/languages/registry';
+
+let _includeCache: string[] | null = null;
+function buildIncludeGlobs(): string[] {
+ if (_includeCache) return _includeCache;
+ const seen = new Set<string>();
+ const out: string[] = [];
+ for (const def of getLanguageDefs()) {
+ for (const glob of def.includeGlobs) {
+ if (seen.has(glob)) continue;
+ seen.add(glob);
+ out.push(glob);
+ }
+ }
+ _includeCache = out;
+ return out;
+}
+
+const baseConfig: CodeGraphConfig = {
+ version: 1,
+ rootDir: '.',
+ include: [], // populated lazily via the getter below
+ exclude: [
+ // Version control
+ '**/.git/**',
+
+ // Dependencies
+ '**/node_modules/**',
+ '**/vendor/**',
+ '**/Pods/**',
+
+ // Generic build outputs
+ '**/dist/**',
+ '**/build/**',
+ '**/out/**',
+ '**/bin/**',
+ '**/obj/**',
+ '**/target/**',
+
+ // JavaScript/TypeScript
+ '**/*.min.js',
+ '**/*.bundle.js',
+ '**/.next/**',
+ '**/.nuxt/**',
+ '**/.svelte-kit/**',
+ '**/.output/**',
+ '**/.turbo/**',
+ '**/.cache/**',
+ '**/.parcel-cache/**',
+ '**/.vite/**',
+ '**/.astro/**',
+ '**/.docusaurus/**',
+ '**/.gatsby/**',
+ '**/.webpack/**',
+ '**/.nx/**',
+ '**/.yarn/cache/**',
+ '**/.pnpm-store/**',
+ '**/storybook-static/**',
+
+ // React Native / Expo
+ '**/.expo/**',
+ '**/web-build/**',
+ '**/ios/Pods/**',
+ '**/ios/build/**',
+ '**/android/build/**',
+ '**/android/.gradle/**',
+
+ // Python
+ '**/__pycache__/**',
+ '**/.venv/**',
+ '**/venv/**',
+ '**/site-packages/**',
+ '**/dist-packages/**',
+ '**/.pytest_cache/**',
+ '**/.mypy_cache/**',
+ '**/.ruff_cache/**',
+ '**/.tox/**',
+ '**/.nox/**',
+ '**/*.egg-info/**',
+ '**/.eggs/**',
+
+ // Go
+ '**/go/pkg/mod/**',
+
+ // Rust
+ '**/target/debug/**',
+ '**/target/release/**',
+
+ // Java/Kotlin/Gradle
+ '**/.gradle/**',
+ '**/.m2/**',
+ '**/generated-sources/**',
+ '**/.kotlin/**',
+
+ // Dart/Flutter
+ '**/.dart_tool/**',
+
+ // C#/.NET
+ '**/.vs/**',
+ '**/.nuget/**',
+ '**/artifacts/**',
+ '**/publish/**',
+
+ // C/C++
+ '**/cmake-build-*/**',
+ '**/CMakeFiles/**',
+ '**/bazel-*/**',
+ '**/vcpkg_installed/**',
+ '**/.conan/**',
+ '**/Debug/**',
+ '**/Release/**',
+ '**/x64/**',
+ '**/.pio/**', // PlatformIO (IoT/embedded build artifacts and library deps)
+
+ // Electron
+ '**/release/**',
+ '**/*.app/**',
+ '**/*.asar',
+
+ // Swift/iOS/Xcode
+ '**/DerivedData/**',
+ '**/.build/**',
+ '**/.swiftpm/**',
+ '**/xcuserdata/**',
+ '**/Carthage/Build/**',
+ '**/SourcePackages/**',
+
+ // Delphi/Pascal
+ '**/__history/**',
+ '**/__recovery/**',
+ '**/*.dcu',
+
+ // PHP
+ '**/.composer/**',
+ '**/storage/framework/**',
+ '**/bootstrap/cache/**',
+
+ // Ruby
+ '**/.bundle/**',
+ '**/tmp/cache/**',
+ '**/public/assets/**',
+ '**/public/packs/**',
+ '**/.yardoc/**',
+
+ // Testing/Coverage
+ '**/coverage/**',
+ '**/htmlcov/**',
+ '**/.nyc_output/**',
+ '**/test-results/**',
+ '**/.coverage/**',
+
+ // IDE/Editor
+ '**/.idea/**',
+
+ // Logs and temp
+ '**/logs/**',
+ '**/tmp/**',
+ '**/temp/**',
+
+ // Documentation build output
+ '**/_build/**',
+ '**/docs/_build/**',
+ '**/site/**',
+ ],
+ languages: [],
+ frameworks: [],
+ maxFileSize: 1024 * 1024, // 1MB
+ extractDocstrings: true,
+ trackCallSites: true,
+ enableCentrality: true,
+ enableChurn: true,
+ enableIssueHistory: true,
+ enableConfigRefs: true,
+ enableSqlRefs: true,
+};
+
+Object.defineProperty(baseConfig, 'include', {
+ get: () => buildIncludeGlobs(),
+ enumerable: true,
+ configurable: true,
+});
+
+export const DEFAULT_CONFIG: CodeGraphConfig = baseConfig;
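+
+// Example (illustrative): the first read of `include` runs the getter and
+// caches the result. The '**/*.ts' glob is an assumption about what the
+// TypeScript language def registers.
+//
+//   import { DEFAULT_CONFIG } from './default-config';
+//   const globs = DEFAULT_CONFIG.include; // built from the registry, then cached
+//   console.log(globs.includes('**/*.ts')); // presumably true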
diff --git a/src/extraction/grammars.ts b/src/extraction/grammars.ts
index df264fb3..5c2aec09 100644
--- a/src/extraction/grammars.ts
+++ b/src/extraction/grammars.ts
@@ -4,77 +4,63 @@
* Uses web-tree-sitter (WASM) for universal cross-platform support.
* Grammars are loaded lazily — only languages actually present in the project
* are compiled, keeping V8 WASM memory pressure low on large codebases.
+ *
+ * As of the language-registry refactor, all per-language metadata
+ * (WASM filenames, file extensions, display names, vendored flag)
+ * lives in `./languages/<lang>.ts` and is auto-collected by
+ * `./languages/registry.ts`. The exports kept here
+ * (`EXTENSION_MAP`, `getSupportedLanguages`, `getLanguageDisplayName`)
+ * remain for backward compat but are derived from the registry.
*/
import * as path from 'path';
import { Parser, Language as WasmLanguage } from 'web-tree-sitter';
import { Language } from '../types';
+import { getLanguageDefs, getLanguageDefByExtension, getLanguageDefByName } from './languages/registry';
export type GrammarLanguage = Exclude<Language, 'unknown' | 'svelte' | 'liquid'>;
/**
- * WASM filename map — maps each language to its .wasm grammar file
- * in the tree-sitter-wasms package.
+ * File extension → Language mapping, computed lazily on first read.
+ *
+ * Cannot be a top-level IIFE: the registry transitively pulls in
+ * `tree-sitter.ts` (via custom-extractor language defs), which
+ * imports this file — building the map at module load would TDZ
+ * against `ALL_DEFS` in the registry. Use the `getExtensionMap()`
+ * function for an explicit lazy entry point, or read
+ * `EXTENSION_MAP` (a Proxy that materialises on first property
+ * access).
*/
-const WASM_GRAMMAR_FILES: Record<GrammarLanguage, string> = {
- typescript: 'tree-sitter-typescript.wasm',
- tsx: 'tree-sitter-tsx.wasm',
- javascript: 'tree-sitter-javascript.wasm',
- jsx: 'tree-sitter-javascript.wasm',
- python: 'tree-sitter-python.wasm',
- go: 'tree-sitter-go.wasm',
- rust: 'tree-sitter-rust.wasm',
- java: 'tree-sitter-java.wasm',
- c: 'tree-sitter-c.wasm',
- cpp: 'tree-sitter-cpp.wasm',
- csharp: 'tree-sitter-c_sharp.wasm',
- php: 'tree-sitter-php.wasm',
- ruby: 'tree-sitter-ruby.wasm',
- swift: 'tree-sitter-swift.wasm',
- kotlin: 'tree-sitter-kotlin.wasm',
- dart: 'tree-sitter-dart.wasm',
- pascal: 'tree-sitter-pascal.wasm',
-};
+let _extensionMapCache: Record<string, Language> | null = null;
+export function getExtensionMap(): Record<string, Language> {
+ if (_extensionMapCache) return _extensionMapCache;
+ const out: Record<string, Language> = {};
+ for (const def of getLanguageDefs()) {
+ for (const ext of def.extensions) {
+ out[ext.toLowerCase()] = def.name as Language;
+ }
+ }
+ _extensionMapCache = out;
+ return out;
+}
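+
+// Illustrative shape of a registry def, inferred from the fields read here
+// and in loadGrammarsForLanguages below (not the actual exported type; the
+// includeGlobs value is a guess):
+//   { name: 'pascal', displayName: 'Pascal / Delphi',
+//     extensions: ['.pas', '.dpr'], includeGlobs: ['**/*.pas'],
+//     grammar: { wasmFile: 'tree-sitter-pascal.wasm', vendored: true } }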
/**
- * File extension to Language mapping
+ * Backward-compat: a Proxy that lazy-builds the extension map on
+ * first property access. Existing callers can keep doing
+ * `EXTENSION_MAP['.ts']` without changes.
*/
-export const EXTENSION_MAP: Record<string, Language> = {
- '.ts': 'typescript',
- '.tsx': 'tsx',
- '.js': 'javascript',
- '.mjs': 'javascript',
- '.cjs': 'javascript',
- '.jsx': 'jsx',
- '.py': 'python',
- '.pyw': 'python',
- '.go': 'go',
- '.rs': 'rust',
- '.java': 'java',
- '.c': 'c',
- '.h': 'c', // Could also be C++, defaulting to C
- '.cpp': 'cpp',
- '.cc': 'cpp',
- '.cxx': 'cpp',
- '.hpp': 'cpp',
- '.hxx': 'cpp',
- '.cs': 'csharp',
- '.php': 'php',
- '.rb': 'ruby',
- '.rake': 'ruby',
- '.swift': 'swift',
- '.kt': 'kotlin',
- '.kts': 'kotlin',
- '.dart': 'dart',
- '.liquid': 'liquid',
- '.svelte': 'svelte',
- '.pas': 'pascal',
- '.dpr': 'pascal',
- '.dpk': 'pascal',
- '.lpr': 'pascal',
- '.dfm': 'pascal',
- '.fmx': 'pascal',
-};
+export const EXTENSION_MAP: Record<string, Language> = new Proxy({} as Record<string, Language>, {
+ get(_t, key: string) { return getExtensionMap()[key]; },
+ has(_t, key: string) { return key in getExtensionMap(); },
+ ownKeys() { return Object.keys(getExtensionMap()); },
+ getOwnPropertyDescriptor(_t, key: string) {
+ const map = getExtensionMap();
+ if (key in map) {
+ return { configurable: true, enumerable: true, writable: false, value: map[key] };
+ }
+ return undefined;
+ },
+});
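+
+// Example (illustrative): all three access styles go through the Proxy traps.
+//   EXTENSION_MAP['.py']        // 'python', via `get` (map built on first access)
+//   Object.keys(EXTENSION_MAP)  // every known extension, via `ownKeys`
+//   '.rs' in EXTENSION_MAP      // true, via `has`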
/**
* Caches for loaded grammars and parsers
@@ -108,21 +94,28 @@ export async function loadGrammarsForLanguages(languages: Language[]): Promise<void>
- lang in WASM_GRAMMAR_FILES &&
- !languageCache.has(lang) &&
- !unavailableGrammarErrors.has(lang)
- );
+ // Deduplicate; filter to languages that have a tree-sitter grammar
+ // (registry's `def.grammar` field) and aren't already loaded.
+ const seen = new Set<Language>();
+ const toLoad: Array<{ lang: Language; wasmFile: string; vendored: boolean }> = [];
+ for (const lang of languages) {
+ if (seen.has(lang)) continue;
+ seen.add(lang);
+ if (languageCache.has(lang) || unavailableGrammarErrors.has(lang)) continue;
+ const def = getLanguageDefByName(lang);
+ if (!def?.grammar) continue;
+ toLoad.push({
+ lang,
+ wasmFile: def.grammar.wasmFile,
+ vendored: def.grammar.vendored === true,
+ });
+ }
// Load grammars sequentially to avoid web-tree-sitter WASM race condition on Node 20+
// See: https://github.com/tree-sitter/tree-sitter/issues/2338
- for (const lang of toLoad) {
- const wasmFile = WASM_GRAMMAR_FILES[lang];
+ for (const { lang, wasmFile, vendored } of toLoad) {
try {
- // Pascal ships its own WASM (not in tree-sitter-wasms)
- const wasmPath = lang === 'pascal'
+ const wasmPath = vendored
? path.join(__dirname, 'wasm', wasmFile)
: require.resolve(`tree-sitter-wasms/out/${wasmFile}`);
const language = await WasmLanguage.load(wasmPath);
@@ -140,7 +133,9 @@ export async function loadGrammarsForLanguages(languages: Language[]): Promise<void> {
- const allLanguages = Object.keys(WASM_GRAMMAR_FILES) as GrammarLanguage[];
+ const allLanguages = getLanguageDefs()
+ .filter((d) => d.grammar)
+ .map((d) => d.name as Language);
await loadGrammarsForLanguages(allLanguages);
}
@@ -176,7 +171,8 @@ export function getParser(language: Language): Parser | null {
*/
export function detectLanguage(filePath: string, source?: string): Language {
const ext = filePath.substring(filePath.lastIndexOf('.')).toLowerCase();
- const lang = EXTENSION_MAP[ext] || 'unknown';
+ const def = getLanguageDefByExtension(ext);
+ const lang = (def?.name as Language) ?? 'unknown';
// .h files could be C or C++ — check source content for C++ features
if (lang === 'c' && ext === '.h' && source) {
@@ -196,29 +192,30 @@ function looksLikeCpp(source: string): boolean {
}
/**
- * Check if a language is supported (has a grammar defined).
- * Returns true if the grammar exists, even if not yet loaded.
+ * Check if a language is supported (has a grammar or custom extractor).
+ * Returns true if a registry entry exists, even if its grammar isn't loaded.
*/
export function isLanguageSupported(language: Language): boolean {
- if (language === 'svelte') return true; // custom extractor (script block delegation)
- if (language === 'liquid') return true; // custom regex extractor
if (language === 'unknown') return false;
- return language in WASM_GRAMMAR_FILES;
+ return getLanguageDefByName(language) !== undefined;
}
/**
* Check if a grammar has been loaded and is ready for parsing.
+ * Custom-extractor languages (no `grammar` field) are always "ready".
*/
export function isGrammarLoaded(language: Language): boolean {
- if (language === 'svelte' || language === 'liquid') return true;
+ const def = getLanguageDefByName(language);
+ if (!def) return false;
+ if (!def.grammar) return true; // custom extractor — always available
return languageCache.has(language);
}
/**
- * Get all supported languages (those with grammar definitions).
+ * Get all supported languages from the registry.
*/
export function getSupportedLanguages(): Language[] {
- return [...(Object.keys(WASM_GRAMMAR_FILES) as GrammarLanguage[]), 'svelte', 'liquid'];
+ return getLanguageDefs().map((d) => d.name as Language);
}
/**
@@ -237,54 +234,33 @@ export function resetParser(language: Language): void {
}
/**
- * Clear parser/grammar caches (useful for testing)
+ * Clear parser cache (useful for testing).
+ *
+ * Note: `languageCache` is intentionally NOT cleared — the WASM
+ * `Language` modules are expensive to load and stay cached so a
+ * subsequent `getParser` call can rebuild a fresh `Parser` instance
+ * without re-reading the .wasm file. To fully re-init, set
+ * `parserInitialized = false` and call `initGrammars()` again.
*/
export function clearParserCache(): void {
for (const parser of parserCache.values()) {
- parser.delete();
+ try { parser.delete(); } catch { /* ignore */ }
}
parserCache.clear();
- // Note: languageCache is NOT cleared — WASM languages persist.
- // To fully re-init, set parserInitialized = false and call initGrammars() again.
unavailableGrammarErrors.clear();
}
/**
- * Report grammars that failed to load.
+ * Get unavailable grammar errors (for diagnostics)
*/
-export function getUnavailableGrammarErrors(): Partial<Record<Language, string>> {
- const out: Partial<Record<Language, string>> = {};
- for (const [language, message] of unavailableGrammarErrors.entries()) {
- out[language] = message;
- }
- return out;
+export function getUnavailableGrammarErrors(): Record<string, string> {
+ return Object.fromEntries(unavailableGrammarErrors);
}
/**
- * Get language display name
+ * Human-readable display name (e.g. "TypeScript", "Pascal / Delphi").
+ * Returns the canonical name unchanged if no display name is registered.
*/
export function getLanguageDisplayName(language: Language): string {
- const names: Record<Language, string> = {
- typescript: 'TypeScript',
- javascript: 'JavaScript',
- tsx: 'TypeScript (TSX)',
- jsx: 'JavaScript (JSX)',
- python: 'Python',
- go: 'Go',
- rust: 'Rust',
- java: 'Java',
- c: 'C',
- cpp: 'C++',
- csharp: 'C#',
- php: 'PHP',
- ruby: 'Ruby',
- swift: 'Swift',
- kotlin: 'Kotlin',
- dart: 'Dart',
- svelte: 'Svelte',
- liquid: 'Liquid',
- pascal: 'Pascal / Delphi',
- unknown: 'Unknown',
- };
- return names[language] || language;
+ return getLanguageDefByName(language)?.displayName ?? language;
}
diff --git a/src/extraction/hcl-extractor.ts b/src/extraction/hcl-extractor.ts
new file mode 100644
index 00000000..3d810c88
--- /dev/null
+++ b/src/extraction/hcl-extractor.ts
@@ -0,0 +1,587 @@
+import type { Node as SyntaxNode } from 'web-tree-sitter';
+import { Node, Edge, ExtractionResult, ExtractionError, UnresolvedReference, NodeKind } from '../types';
+import { generateNodeId, getNodeText } from './tree-sitter-helpers';
+import { getParser } from './grammars';
+
+/**
+ * HclExtractor — extracts a Terraform/HCL file into the graph.
+ *
+ * HCL is a declarative configuration language: there are no functions,
+ * classes, or methods. The unit of structure is the **block**:
+ *
+ * [