diff --git a/CLAUDE.md b/CLAUDE.md
index 71a50c73..b0f3a660 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -98,7 +98,9 @@ SQLite database with:
 
 ### Supported Languages
 
-TypeScript, JavaScript, TSX, JSX, Svelte, Python, Go, Rust, Java, C, C++, C#, PHP, Ruby, Swift, Kotlin, Dart, Liquid, Pascal
+TypeScript, JavaScript, TSX, JSX, Svelte, Python, Go, Rust, Java, C, C++, C#, PHP, Ruby, Swift, Kotlin, Dart, Liquid, Pascal, R
+
+To add a new language, follow the cookbook at [`docs/ADDING-A-LANGUAGE.md`](docs/ADDING-A-LANGUAGE.md).
 
 ### Node and Edge Types
diff --git a/README.md b/README.md
index fd1ffaba..3303fc0e 100644
--- a/README.md
+++ b/README.md
@@ -316,7 +316,7 @@ fi
 
 ## MCP Tools
 
-When running as an MCP server, CodeGraph exposes these tools to Claude Code:
+When running as an MCP server, CodeGraph exposes these tools to any MCP-compatible AI assistant:
 
 | Tool | Purpose |
 |------|---------|
@@ -331,6 +331,110 @@ When running as an MCP server, CodeGraph exposes these tools to Claude Code:
 
 ---
 
+## Using with Other MCP Clients
+
+The MCP server runs over **stdio** and works with any MCP-compatible client — not just Claude Code. The interactive installer is Claude Code-specific (it writes `~/.claude.json`), so for other clients you'll want the manual setup.
+
+**Common steps for every client:**
+
+```bash
+npm install -g @colbymchenry/codegraph   # so `codegraph` is on PATH
+cd your-project
+codegraph init -i                        # initialize + index this project
+```
+
+Then point your MCP client at `codegraph serve --mcp` using whatever config shape it expects:
+
+### opencode
+
+In `opencode.json` (project) or `~/.config/opencode/opencode.json` (global):
+
+```json
+{
+  "$schema": "https://opencode.ai/config.json",
+  "mcp": {
+    "codegraph": {
+      "type": "local",
+      "command": ["codegraph", "serve", "--mcp"],
+      "enabled": true
+    }
+  }
+}
+```
+
+### Cursor
+
+In `~/.cursor/mcp.json` (global) or `.cursor/mcp.json` (project):
+
+```json
+{
+  "mcpServers": {
+    "codegraph": {
+      "command": "codegraph",
+      "args": ["serve", "--mcp"]
+    }
+  }
+}
+```
+
+### LangChain (`MultiServerMCPClient`)
+
+The CodeGraph server speaks stdio, not SSE — pass `transport: "stdio"`:
+
+```python
+from langchain_mcp_adapters.client import MultiServerMCPClient
+
+client = MultiServerMCPClient({
+    "codegraph": {
+        "command": "codegraph",
+        "args": ["serve", "--mcp"],
+        "transport": "stdio",
+    }
+})
+tools = await client.get_tools()
+```
+
+### Claude Agent SDK
+
+Pass the server in `mcpServers` (TypeScript) or `mcp_servers` (Python) when calling `query()`:
+
+```python
+from claude_agent_sdk import query, ClaudeAgentOptions
+
+options = ClaudeAgentOptions(
+    mcp_servers={
+        "codegraph": {
+            "command": "codegraph",
+            "args": ["serve", "--mcp"],
+        }
+    },
+    allowed_tools=["mcp__codegraph__*"],
+)
+
+async for message in query(prompt="Where is auth handled?", options=options):
+    ...
+```
+
+### Anything else (generic stdio MCP)
+
+Most MCP clients (Continue, Zed, custom integrations, etc.) accept some variation of `command` + `args`. The values are always:
+
+| Field | Value |
+|-------|-------|
+| Command | `codegraph` |
+| Args | `["serve", "--mcp"]` |
+| Transport | `stdio` |
+
+The server reads the project root from the MCP `initialize` request's `rootUri` (set by the client when it connects). If your client doesn't send a `rootUri`, pass the project path explicitly:
+
+```bash
+codegraph serve --mcp --path /absolute/path/to/project
+```
+
+> **Note:** CodeGraph's MCP server does **not** speak SSE/HTTP.
+> If your client only supports `url` + `transport: "sse"`, you'll need to wrap stdio with a bridge like [supergateway](https://github.com/supercorp-ai/supergateway).
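+
+For example, the bridge can be a single command (a sketch — the exact flags may differ across supergateway versions, so check its README):
+
+```bash
+npx -y supergateway --stdio "codegraph serve --mcp" --port 8000
+```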
+
+---
+
 ## Library Usage
 
 ```typescript
@@ -402,6 +506,8 @@ The `.codegraph/config.json` file controls indexing:
 | Liquid | `.liquid` | Full support |
 | Pascal / Delphi | `.pas`, `.dpr`, `.dpk`, `.lpr` | Full support (classes, records, interfaces, enums, DFM/FMX form files) |
 
+Want to add another language? See [`docs/ADDING-A-LANGUAGE.md`](docs/ADDING-A-LANGUAGE.md) — it walks through sourcing a tree-sitter grammar, probing the AST, and choosing between the OO and self-contained extractor patterns, with worked examples in the existing extractors to crib from.
+
 ## Troubleshooting
 
 **"CodeGraph not initialized"** — Run `codegraph init` in your project directory first.
diff --git a/__tests__/centrality.test.ts b/__tests__/centrality.test.ts
new file mode 100644
index 00000000..e45dc858
--- /dev/null
+++ b/__tests__/centrality.test.ts
@@ -0,0 +1,134 @@
+import { describe, it, expect } from 'vitest';
+import { computePageRank, PR_DAMPING, PR_ITERATIONS } from '../src/centrality';
+
+function asNodes(ids: string[]) {
+  return ids.map((id) => ({ id }));
+}
+
+describe('computePageRank', () => {
+  it('returns empty result for an empty graph', () => {
+    const r = computePageRank([], []);
+    expect(r.scores.size).toBe(0);
+    expect(r.iterations).toBe(0);
+  });
+
+  it('assigns uniform rank to N isolated nodes', () => {
+    const r = computePageRank(asNodes(['a', 'b', 'c', 'd']), []);
+    expect(r.scores.size).toBe(4);
+    // 4 isolated nodes — all dangling — should each end up with 1/N.
+    for (const v of r.scores.values()) {
+      expect(v).toBeCloseTo(0.25, 6);
+    }
+  });
+
+  it('rewards being reached (sinks accumulate rank)', () => {
+    // a -> b -> c. c has no outgoing, so it accumulates the most.
+    const r = computePageRank(
+      asNodes(['a', 'b', 'c']),
+      [
+        { source: 'a', target: 'b' },
+        { source: 'b', target: 'c' },
+      ]
+    );
+    const a = r.scores.get('a')!;
+    const b = r.scores.get('b')!;
+    const c = r.scores.get('c')!;
+    expect(c).toBeGreaterThan(b);
+    expect(b).toBeGreaterThan(a);
+  });
+
+  it('star: hub ranks above all leaves; leaves are equal', () => {
+    const leaves = ['l1', 'l2', 'l3', 'l4', 'l5', 'l6', 'l7', 'l8', 'l9'];
+    const edges = leaves.map((l) => ({ source: l, target: 'hub' }));
+    const r = computePageRank(asNodes([...leaves, 'hub']), edges);
+    const hub = r.scores.get('hub')!;
+    for (const l of leaves) {
+      const lv = r.scores.get(l)!;
+      expect(hub).toBeGreaterThan(lv);
+    }
+    // Leaves are symmetric — should be within 1e-9.
+    const first = r.scores.get(leaves[0])!;
+    for (const l of leaves.slice(1)) {
+      expect(r.scores.get(l)!).toBeCloseTo(first, 9);
+    }
+  });
+
+  it('cycle: all nodes have approximately equal rank', () => {
+    const r = computePageRank(
+      asNodes(['a', 'b', 'c']),
+      [
+        { source: 'a', target: 'b' },
+        { source: 'b', target: 'c' },
+        { source: 'c', target: 'a' },
+      ]
+    );
+    const a = r.scores.get('a')!;
+    const b = r.scores.get('b')!;
+    const c = r.scores.get('c')!;
+    // Symmetric → all equal at convergence.
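+    // (With damping d and uniform teleport, symmetry forces the same fixed
+    // point s = (1 - d)/N + d·s at every node, i.e. s = 1/N ≈ 0.3333 here.)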
+    expect(a).toBeCloseTo(b, 6);
+    expect(b).toBeCloseTo(c, 6);
+  });
+
+  it('total rank sums to ~1 (mass is conserved)', () => {
+    const r = computePageRank(
+      asNodes(['a', 'b', 'c', 'd', 'e']),
+      [
+        { source: 'a', target: 'b' },
+        { source: 'b', target: 'c' },
+        { source: 'd', target: 'c' },
+        { source: 'e', target: 'd' },
+        { source: 'a', target: 'e' },
+      ]
+    );
+    let sum = 0;
+    for (const v of r.scores.values()) sum += v;
+    expect(sum).toBeCloseTo(1, 6);
+  });
+
+  it('preserves mass across two disconnected components', () => {
+    const r = computePageRank(
+      asNodes(['a', 'b', 'c', 'd']),
+      [
+        { source: 'a', target: 'b' },
+        { source: 'c', target: 'd' },
+      ]
+    );
+    let sum = 0;
+    for (const v of r.scores.values()) sum += v;
+    expect(sum).toBeCloseTo(1, 6);
+    // Within each component, the sink ranks above the source.
+    expect(r.scores.get('b')!).toBeGreaterThan(r.scores.get('a')!);
+    expect(r.scores.get('d')!).toBeGreaterThan(r.scores.get('c')!);
+  });
+
+  it('drops edges referencing unknown nodes', () => {
+    // 'ghost' is not in the node set — that edge should be ignored,
+    // not crash and not pollute scores.
+    const r = computePageRank(
+      asNodes(['a', 'b']),
+      [
+        { source: 'a', target: 'b' },
+        { source: 'a', target: 'ghost' },
+        { source: 'ghost', target: 'b' },
+      ]
+    );
+    expect(r.scores.size).toBe(2);
+    expect(r.scores.get('b')!).toBeGreaterThan(r.scores.get('a')!);
+    let sum = 0;
+    for (const v of r.scores.values()) sum += v;
+    expect(sum).toBeCloseTo(1, 6);
+  });
+
+  it('reports iteration count and duration', () => {
+    const r = computePageRank(asNodes(['a', 'b']), [{ source: 'a', target: 'b' }]);
+    expect(r.iterations).toBe(PR_ITERATIONS);
+    expect(r.durationMs).toBeGreaterThanOrEqual(0);
+  });
+
+  it('damping constant is the textbook 0.85', () => {
+    // Sentinel — protects against accidental tuning that would invalidate
+    // the spike findings the PR was justified on.
+    expect(PR_DAMPING).toBe(0.85);
+  });
+});
diff --git a/__tests__/churn.test.ts b/__tests__/churn.test.ts
new file mode 100644
index 00000000..fbe279f6
--- /dev/null
+++ b/__tests__/churn.test.ts
@@ -0,0 +1,208 @@
+import { describe, it, expect, beforeEach, afterEach } from 'vitest';
+import * as fs from 'fs';
+import * as os from 'os';
+import * as path from 'path';
+import { execFileSync } from 'child_process';
+import {
+  mineChurn,
+  getGitHead,
+  readFileLoc,
+  MAX_FILES_PER_COMMIT,
+  LAST_MINED_CHURN_HEAD_KEY,
+} from '../src/churn';
+
+let HAS_GIT = true;
+try {
+  execFileSync('git', ['--version'], { stdio: 'ignore' });
+} catch {
+  HAS_GIT = false;
+}
+
+let tempDir: string;
+
+function git(...args: string[]): string {
+  return execFileSync('git', args, {
+    cwd: tempDir,
+    encoding: 'utf-8',
+    env: {
+      ...process.env,
+      GIT_AUTHOR_NAME: 'Test',
+      GIT_AUTHOR_EMAIL: 'test@example.com',
+      GIT_COMMITTER_NAME: 'Test',
+      GIT_COMMITTER_EMAIL: 'test@example.com',
+      GIT_AUTHOR_DATE: process.env.GIT_AUTHOR_DATE,
+      GIT_COMMITTER_DATE: process.env.GIT_COMMITTER_DATE,
+    },
+    stdio: ['pipe', 'pipe', 'pipe'],
+  }).trim();
+}
+
+function commitAt(date: string, paths: string[], content?: string) {
+  for (const p of paths) {
+    const abs = path.join(tempDir, p);
+    fs.mkdirSync(path.dirname(abs), { recursive: true });
+    fs.writeFileSync(abs, content ?? `data for ${p} at ${date}\n`);
+  }
+  git('add', ...paths);
+  // Pin both author and committer dates so timestamps are deterministic.
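+  // (Git timestamp formats like `%ct` read the *committer* date, so pinning
+  // GIT_AUTHOR_DATE alone would likely leave the mined values flaky.)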
+  process.env.GIT_AUTHOR_DATE = date;
+  process.env.GIT_COMMITTER_DATE = date;
+  git('commit', '-m', `commit at ${date}`);
+  delete process.env.GIT_AUTHOR_DATE;
+  delete process.env.GIT_COMMITTER_DATE;
+}
+
+beforeEach(() => {
+  tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-churn-'));
+  if (HAS_GIT) {
+    git('init', '-q', '-b', 'main');
+    git('config', 'commit.gpgsign', 'false');
+  }
+});
+
+afterEach(() => {
+  delete process.env.GIT_AUTHOR_DATE;
+  delete process.env.GIT_COMMITTER_DATE;
+  fs.rmSync(tempDir, { recursive: true, force: true });
+});
+
+describe.skipIf(!HAS_GIT)('mineChurn', () => {
+  it('returns empty + null head when not in a git repo', () => {
+    const nonGit = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-nogit-'));
+    try {
+      const r = mineChurn(nonGit, new Set(['foo.ts']), null);
+      expect(r.currentHead).toBeNull();
+      expect(r.deltas.size).toBe(0);
+      expect(r.needsFullRescan).toBe(false);
+    } finally {
+      fs.rmSync(nonGit, { recursive: true, force: true });
+    }
+  });
+
+  it('counts commits per indexed file, ignores files not in index', () => {
+    commitAt('2025-01-01T00:00:00', ['a.ts', 'b.ts']);
+    commitAt('2025-01-02T00:00:00', ['a.ts']);
+    commitAt('2025-01-03T00:00:00', ['a.ts', 'b.ts', 'c.ts']);
+
+    const r = mineChurn(tempDir, new Set(['a.ts', 'b.ts']), null);
+    expect(r.deltas.get('a.ts')?.commitCountDelta).toBe(3);
+    expect(r.deltas.get('b.ts')?.commitCountDelta).toBe(2);
+    expect(r.deltas.has('c.ts')).toBe(false);
+  });
+
+  it('records first-seen / last-touched as min/max of commit timestamps', () => {
+    commitAt('2025-01-01T00:00:00Z', ['a.ts']);
+    commitAt('2025-06-01T00:00:00Z', ['a.ts']);
+    commitAt('2025-12-01T00:00:00Z', ['a.ts']);
+
+    const r = mineChurn(tempDir, new Set(['a.ts']), null);
+    const d = r.deltas.get('a.ts')!;
+    // 2025-01-01 UTC = 1735689600
+    expect(d.firstSeenTs).toBe(1735689600);
+    // 2025-12-01 UTC = 1764547200
+    expect(d.lastTouchedTs).toBe(1764547200);
+  });
+
+  it('skips commits touching more than MAX_FILES_PER_COMMIT files', () => {
+    const bigBatch: string[] = [];
+    for (let i = 0; i < MAX_FILES_PER_COMMIT + 1; i++) bigBatch.push(`f${i}.ts`);
+    commitAt('2025-01-01T00:00:00Z', bigBatch);
+    // Then a normal commit on one of the same files.
+    commitAt('2025-02-01T00:00:00Z', ['f0.ts']);
+
+    const r = mineChurn(tempDir, new Set(bigBatch), null);
+    // First commit was skipped; only the second one should count.
+    expect(r.deltas.get('f0.ts')?.commitCountDelta).toBe(1);
+    // Files only seen in the skipped commit produce no delta at all.
+    expect(r.deltas.has('f50.ts')).toBe(false);
+  });
+
+  it('incremental mining returns only commits since the given sha', () => {
+    commitAt('2025-01-01T00:00:00Z', ['a.ts']);
+    const sha1 = getGitHead(tempDir)!;
+    commitAt('2025-01-02T00:00:00Z', ['a.ts']);
+    commitAt('2025-01-03T00:00:00Z', ['a.ts']);
+
+    const incr = mineChurn(tempDir, new Set(['a.ts']), sha1);
+    // Only the two commits *after* sha1 should be counted.
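+    // (Presumably a `sha1..HEAD` range under the hood — git's range
+    // semantics exclude the boundary commit itself.)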
+    expect(incr.deltas.get('a.ts')?.commitCountDelta).toBe(2);
+    expect(incr.needsFullRescan).toBe(false);
+  });
+
+  it('returns needsFullRescan=true when sinceSha is unreachable', () => {
+    commitAt('2025-01-01T00:00:00Z', ['a.ts']);
+    const fakeSha = '0'.repeat(40);
+    const r = mineChurn(tempDir, new Set(['a.ts']), fakeSha);
+    expect(r.needsFullRescan).toBe(true);
+    expect(r.deltas.size).toBe(0);
+    expect(r.currentHead).not.toBeNull();
+  });
+
+  it('returns empty deltas when sinceSha equals current head (no-op)', () => {
+    commitAt('2025-01-01T00:00:00Z', ['a.ts']);
+    const head = getGitHead(tempDir)!;
+    const r = mineChurn(tempDir, new Set(['a.ts']), head);
+    expect(r.currentHead).toBe(head);
+    expect(r.deltas.size).toBe(0);
+    expect(r.needsFullRescan).toBe(false);
+  });
+
+  it('handles paths with spaces and unicode safely (NUL-delimited)', () => {
+    commitAt('2025-01-01T00:00:00Z', ['name with space.ts']);
+    commitAt('2025-01-02T00:00:00Z', ['ünïcødë.ts']);
+
+    const r = mineChurn(
+      tempDir,
+      new Set(['name with space.ts', 'ünïcødë.ts']),
+      null
+    );
+    expect(r.deltas.get('name with space.ts')?.commitCountDelta).toBe(1);
+    expect(r.deltas.get('ünïcødë.ts')?.commitCountDelta).toBe(1);
+  });
+
+  it('LAST_MINED_CHURN_HEAD_KEY is stable (used as project_metadata key)', () => {
+    expect(LAST_MINED_CHURN_HEAD_KEY).toBe('last_mined_churn_head');
+  });
+});
+
+describe('readFileLoc', () => {
+  it('returns 0 for an empty file', () => {
+    const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-loc-'));
+    try {
+      const f = path.join(dir, 'empty.txt');
+      fs.writeFileSync(f, '');
+      expect(readFileLoc(dir, 'empty.txt')).toBe(0);
+    } finally {
+      fs.rmSync(dir, { recursive: true, force: true });
+    }
+  });
+
+  it('counts newline-terminated lines', () => {
+    const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-loc-'));
+    try {
+      fs.writeFileSync(path.join(dir, 'x.txt'), 'a\nb\nc\n');
+      expect(readFileLoc(dir, 'x.txt')).toBe(3);
+    } finally {
+      fs.rmSync(dir, { recursive: true, force: true });
+    }
+  });
+
+  it('counts a final no-newline chunk as one extra line', () => {
+    const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-loc-'));
+    try {
+      fs.writeFileSync(path.join(dir, 'x.txt'), 'a\nb\nc');
+      expect(readFileLoc(dir, 'x.txt')).toBe(3);
+    } finally {
+      fs.rmSync(dir, { recursive: true, force: true });
+    }
+  });
+
+  it('returns 0 for a missing file (does not throw)', () => {
+    const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-loc-'));
+    try {
+      expect(readFileLoc(dir, 'no-such-file.txt')).toBe(0);
+    } finally {
+      fs.rmSync(dir, { recursive: true, force: true });
+    }
+  });
+});
diff --git a/__tests__/cochange.test.ts b/__tests__/cochange.test.ts
new file mode 100644
index 00000000..4a3918aa
--- /dev/null
+++ b/__tests__/cochange.test.ts
@@ -0,0 +1,481 @@
+/**
+ * Co-Change Graph Tests
+ *
+ * Verifies the file-level co-change miner:
+ * - parses git log output correctly
+ * - filters out merge / large refactor commits via MAX_FILES_PER_COMMIT
+ * - drops files outside the indexed set
+ * - persists per-file commit_count and per-pair count
+ * - computes Jaccard correctly at query time
+ * - updates incrementally on subsequent runs
+ * - detects unreachable previous-head and re-mines from scratch
+ * - migration v4 creates the table + column
+ */
+
+import { describe, it, expect, beforeEach, afterEach } from 'vitest';
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+import { execFileSync } from 'child_process';
+import {
+  mineCoChanges,
+  MAX_FILES_PER_COMMIT,
+  MIN_COCHANGE_COUNT,
+  LAST_MINED_HEAD_KEY,
+  getGitHead,
+} from '../src/cochange';
+import { DatabaseConnection } from '../src/db';
+import { QueryBuilder } from '../src/db/queries';
+import { runMigrations, getCurrentVersion } from '../src/db/migrations';
+import CodeGraph from '../src/index';
+import { loadConfig } from '../src/config';
+
+function tempGitRepo(prefix: string): string {
+  const dir = fs.mkdtempSync(path.join(os.tmpdir(), prefix));
+  execFileSync('git', ['init'], { cwd: dir, stdio: 'pipe' });
+  execFileSync('git', ['config', 'user.email', 'test@test.com'], { cwd: dir, stdio: 'pipe' });
+  execFileSync('git', ['config', 'user.name', 'Test'], { cwd: dir, stdio: 'pipe' });
+  // Pin the initial branch name so subsequent operations are deterministic
+  // across systems with different `init.defaultBranch` settings.
+  execFileSync('git', ['symbolic-ref', 'HEAD', 'refs/heads/main'], { cwd: dir, stdio: 'pipe' });
+  return dir;
+}
+
+function commit(dir: string, message: string, files: Record<string, string>) {
+  for (const [rel, content] of Object.entries(files)) {
+    const full = path.join(dir, rel);
+    fs.mkdirSync(path.dirname(full), { recursive: true });
+    fs.writeFileSync(full, content);
+  }
+  execFileSync('git', ['add', '-A'], { cwd: dir, stdio: 'pipe' });
+  execFileSync('git', ['commit', '-m', message], { cwd: dir, stdio: 'pipe' });
+}
+
+function rm(dir: string, ...rels: string[]) {
+  for (const rel of rels) {
+    fs.unlinkSync(path.join(dir, rel));
+  }
+}
+
+describe('mineCoChanges (unit)', () => {
+  let dir: string;
+
+  afterEach(() => {
+    if (dir && fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true });
+  });
+
+  it('returns empty result for non-git directories', () => {
+    dir = fs.mkdtempSync(path.join(os.tmpdir(), 'cochange-nongit-'));
+    const result = mineCoChanges(dir, new Set(['a.ts']), null);
+    expect(result.currentHead).toBeNull();
+    expect(result.pairs.size).toBe(0);
+    expect(result.fileCommits.size).toBe(0);
+  });
+
+  it('counts pairs and per-file commits across multiple commits', () => {
+    dir = tempGitRepo('cochange-basic-');
+    commit(dir, 'c1', { 'a.ts': '1', 'b.ts': '1' });
+    commit(dir, 'c2', { 'a.ts': '2', 'b.ts': '2' });
+    commit(dir, 'c3', { 'a.ts': '3' });
+
+    const result = mineCoChanges(dir, new Set(['a.ts', 'b.ts']), null);
+    expect(result.currentHead).not.toBeNull();
+    expect(result.fileCommits.get('a.ts')).toBe(3);
+    expect(result.fileCommits.get('b.ts')).toBe(2);
+    expect(result.pairs.get('a.ts\0b.ts')).toBe(2);
+  });
+
+  it('drops files outside the indexed set', () => {
+    dir = tempGitRepo('cochange-filter-');
+    commit(dir, 'c1', { 'a.ts': '1', 'README.md': 'doc', 'b.ts': '1' });
+    commit(dir, 'c2', { 'a.ts': '2', 'b.ts': '2' });
+
+    // README.md is not indexed; the pair (a, b) still counts but no
+    // (a, README) or (b, README) pair is created.
+    const result = mineCoChanges(dir, new Set(['a.ts', 'b.ts']), null);
+    expect(result.fileCommits.has('README.md')).toBe(false);
+    expect(result.pairs.get('a.ts\0b.ts')).toBe(2);
+    expect([...result.pairs.keys()].length).toBe(1);
+  });
+
+  it('skips commits that touch more than MAX_FILES_PER_COMMIT indexed files', () => {
+    dir = tempGitRepo('cochange-mass-');
+    // First commit: massive refactor across many files (would otherwise
+    // produce O(N²) spurious pairs).
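+    // (A commit touching k indexed files contributes k·(k-1)/2 pairs —
+    // e.g. 50 files → 1,225 pairs from a single refactor commit.)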
+    const massFiles: Record<string, string> = {};
+    const indexed = new Set<string>();
+    for (let i = 0; i < MAX_FILES_PER_COMMIT + 5; i++) {
+      const f = `src/m${i}.ts`;
+      massFiles[f] = String(i);
+      indexed.add(f);
+    }
+    commit(dir, 'mass', massFiles);
+    // Second commit: small, two files — should produce one pair.
+    commit(dir, 'small', { 'src/m0.ts': 'A', 'src/m1.ts': 'B' });
+
+    const result = mineCoChanges(dir, indexed, null);
+    expect(result.pairs.get('src/m0.ts\0src/m1.ts')).toBe(1);
+    // The mass-refactor commit contributes nothing.
+    expect([...result.pairs.values()].every((c) => c <= 1)).toBe(true);
+  });
+
+  it('mines incrementally — only commits in sinceSha..HEAD', () => {
+    dir = tempGitRepo('cochange-incr-');
+    commit(dir, 'c1', { 'a.ts': '1', 'b.ts': '1' });
+    const anchor = getGitHead(dir)!;
+    commit(dir, 'c2', { 'a.ts': '2', 'b.ts': '2' });
+    commit(dir, 'c3', { 'a.ts': '3', 'b.ts': '3' });
+
+    const result = mineCoChanges(dir, new Set(['a.ts', 'b.ts']), anchor);
+    // c2 + c3 only — anchor commit is excluded by the .. range
+    expect(result.fileCommits.get('a.ts')).toBe(2);
+    expect(result.pairs.get('a.ts\0b.ts')).toBe(2);
+  });
+
+  it('returns no-op delta when current HEAD == sinceSha', () => {
+    dir = tempGitRepo('cochange-noop-');
+    commit(dir, 'c1', { 'a.ts': '1' });
+    const head = getGitHead(dir)!;
+
+    const result = mineCoChanges(dir, new Set(['a.ts']), head);
+    expect(result.currentHead).toBe(head);
+    expect(result.pairs.size).toBe(0);
+    expect(result.fileCommits.size).toBe(0);
+    expect(result.needsFullRescan).toBe(false);
+  });
+
+  it('signals needsFullRescan when sinceSha is unreachable', () => {
+    dir = tempGitRepo('cochange-orphan-');
+    commit(dir, 'c1', { 'a.ts': '1' });
+    const result = mineCoChanges(
+      dir,
+      new Set(['a.ts']),
+      '0000000000000000000000000000000000000000'
+    );
+    expect(result.needsFullRescan).toBe(true);
+  });
+
+  it('correctly handles paths with spaces and unicode', () => {
+    dir = tempGitRepo('cochange-special-');
+    commit(dir, 'c1', { 'with space.ts': '1', 'café.ts': '1' });
+    commit(dir, 'c2', { 'with space.ts': '2', 'café.ts': '2' });
+
+    const result = mineCoChanges(
+      dir,
+      new Set(['with space.ts', 'café.ts']),
+      null
+    );
+    // Either ordering (canonical sort) is fine
+    const total = [...result.pairs.values()].reduce((a, b) => a + b, 0);
+    expect(total).toBe(2);
+  });
+
+  it('does not misidentify a file literally named "--" as a sentinel', () => {
+    // Earlier the parser used `--` as the per-commit header; a real file
+    // by that name would corrupt block boundaries. Sentinel is now NUL-
+    // bracketed so it cannot collide with any POSIX-legal filename.
+    dir = tempGitRepo('cochange-dashdash-');
+    commit(dir, 'c1', { '--': 'literal dash file', 'b.ts': '1' });
+    commit(dir, 'c2', { '--': 'changed', 'b.ts': '2' });
+
+    const result = mineCoChanges(dir, new Set(['--', 'b.ts']), null);
+    // We expect both files to be counted in both commits and one pair.
+    expect(result.fileCommits.get('--')).toBe(2);
+    expect(result.fileCommits.get('b.ts')).toBe(2);
+    expect(result.pairs.get('--\0b.ts')).toBe(2);
+  });
+});
+
+describe('QueryBuilder co-change CRUD', () => {
+  let dir: string;
+  let db: DatabaseConnection;
+  let q: QueryBuilder;
+
+  beforeEach(() => {
+    dir = fs.mkdtempSync(path.join(os.tmpdir(), 'cochange-db-'));
+    db = DatabaseConnection.initialize(path.join(dir, 'test.db'));
+    q = new QueryBuilder(db.getDb());
+    // Insert a few file rows so commit_count updates and FK semantics work.
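+    // (Assumes co_changes pairs key on files.path, so these seeded rows give
+    // the per-file commit_count updates something to attach to.)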
+    const upsert = db.getDb().prepare(`
+      INSERT INTO files (path, content_hash, language, size, modified_at, indexed_at)
+      VALUES (?, '', 'typescript', 0, 0, 0)
+    `);
+    upsert.run('a.ts');
+    upsert.run('b.ts');
+    upsert.run('c.ts');
+  });
+
+  afterEach(() => {
+    db.close();
+    if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true });
+  });
+
+  it('canonicalises pair ordering on upsert', () => {
+    q.applyCoChangeDeltas([['b.ts', 'a.ts', 3]], []);
+    const row = db.getDb().prepare('SELECT * FROM co_changes').get() as any;
+    expect(row.file_a).toBe('a.ts');
+    expect(row.file_b).toBe('b.ts');
+    expect(row.count).toBe(3);
+  });
+
+  it('accumulates counts on repeated apply', () => {
+    q.applyCoChangeDeltas([['a.ts', 'b.ts', 2]], []);
+    q.applyCoChangeDeltas([['a.ts', 'b.ts', 3]], []);
+    const row = db.getDb().prepare('SELECT count FROM co_changes').get() as any;
+    expect(row.count).toBe(5);
+  });
+
+  it('increments per-file commit_count', () => {
+    q.applyCoChangeDeltas([], [['a.ts', 4]]);
+    q.applyCoChangeDeltas([], [['a.ts', 1]]);
+    const row = db.getDb().prepare('SELECT commit_count FROM files WHERE path = ?').get('a.ts') as any;
+    expect(row.commit_count).toBe(5);
+  });
+
+  it('skips no-op self-pairs', () => {
+    q.applyCoChangeDeltas([['a.ts', 'a.ts', 5]], []);
+    const cnt = db.getDb().prepare('SELECT COUNT(*) AS n FROM co_changes').get() as any;
+    expect(cnt.n).toBe(0);
+  });
+
+  it('clearCoChanges wipes pairs and zeroes per-file counts', () => {
+    q.applyCoChangeDeltas([['a.ts', 'b.ts', 3]], [['a.ts', 5]]);
+    q.clearCoChanges();
+    const cnt = db.getDb().prepare('SELECT COUNT(*) AS n FROM co_changes').get() as any;
+    expect(cnt.n).toBe(0);
+    const row = db.getDb().prepare('SELECT commit_count FROM files WHERE path = ?').get('a.ts') as any;
+    expect(row.commit_count).toBe(0);
+  });
+});
+
+describe('getCoChangedFiles (Jaccard ranking)', () => {
+  let dir: string;
+  let db: DatabaseConnection;
+  let q: QueryBuilder;
+
+  beforeEach(() => {
+    dir = fs.mkdtempSync(path.join(os.tmpdir(), 'cochange-rank-'));
+    db = DatabaseConnection.initialize(path.join(dir, 'test.db'));
+    q = new QueryBuilder(db.getDb());
+    const insertFile = db.getDb().prepare(`
+      INSERT INTO files (path, content_hash, language, size, modified_at, indexed_at, commit_count)
+      VALUES (?, '', 'typescript', 0, 0, 0, ?)
+    `);
+    // anchor.ts changed in 10 commits.
+    insertFile.run('anchor.ts', 10);
+    // tight.ts changed in 4 commits, all of which were with anchor.ts.
+    insertFile.run('tight.ts', 4);
+    // loose.ts changed in 100 commits, only 4 with anchor.ts → low Jaccard.
+    insertFile.run('loose.ts', 100);
+    // weak.ts changed in 5 commits, only 1 with anchor.ts → drops below minCount.
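+    // Expected values under jaccard = count / (commits(a) + commits(b) - count):
+    //   tight: 4 / (10 + 4 - 4)   = 0.4
+    //   loose: 4 / (10 + 100 - 4) ≈ 0.038
+    //   weak:  1 / (10 + 5 - 1)   ≈ 0.071 (dropped once minCount ≥ 2 is passed)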
+    insertFile.run('weak.ts', 5);
+
+    q.applyCoChangeDeltas(
+      [
+        ['anchor.ts', 'tight.ts', 4],
+        ['anchor.ts', 'loose.ts', 4],
+        ['anchor.ts', 'weak.ts', 1],
+      ],
+      []
+    );
+  });
+
+  afterEach(() => {
+    db.close();
+    if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true });
+  });
+
+  it('ranks tight coupling above loose coupling via Jaccard', () => {
+    const results = q.getCoChangedFiles('anchor.ts');
+    expect(results[0].path).toBe('tight.ts');
+    expect(results[0].jaccard).toBeCloseTo(4 / (10 + 4 - 4), 2);
+    const loose = results.find((r) => r.path === 'loose.ts')!;
+    expect(loose.jaccard).toBeLessThan(results[0].jaccard);
+  });
+
+  it('drops pairs below minCount', () => {
+    const results = q.getCoChangedFiles('anchor.ts', { minCount: 2 });
+    expect(results.find((r) => r.path === 'weak.ts')).toBeUndefined();
+  });
+
+  it('drops pairs below minJaccard (filter is applied in SQL, before LIMIT)', () => {
+    const results = q.getCoChangedFiles('anchor.ts', { minJaccard: 0.5 });
+    // tight.ts has jaccard 0.4 — also dropped at this threshold.
+    expect(results.length).toBe(0);
+  });
+
+  it('does not silently drop high-jaccard pairs ranked beyond an internal over-fetch', () => {
+    // Insert many low-jaccard partners to push tight.ts past any in-memory
+    // truncation that could happen if minJaccard were applied JS-side after
+    // a small SQL LIMIT. With the SQL-side filter, a `limit: 1` request
+    // with high minJaccard must still return tight.ts.
+    const insertFile = db.getDb().prepare(`
+      INSERT INTO files (path, content_hash, language, size, modified_at, indexed_at, commit_count)
+      VALUES (?, '', 'typescript', 0, 0, 0, ?)
+    `);
+    const deltas: Array<[string, string, number]> = [];
+    for (let i = 0; i < 100; i++) {
+      const p = `noise${i}.ts`;
+      insertFile.run(p, 1000); // huge commit_count → near-zero jaccard
+      deltas.push(['anchor.ts', p, 4]);
+    }
+    q.applyCoChangeDeltas(deltas, []);
+    const results = q.getCoChangedFiles('anchor.ts', { limit: 1, minJaccard: 0.3 });
+    expect(results).toHaveLength(1);
+    expect(results[0].path).toBe('tight.ts');
+  });
+
+  it('returns symmetric results when queried from either side', () => {
+    const fromAnchor = q.getCoChangedFiles('anchor.ts').find((r) => r.path === 'tight.ts')!;
+    const fromTight = q.getCoChangedFiles('tight.ts').find((r) => r.path === 'anchor.ts')!;
+    expect(fromAnchor.count).toBe(fromTight.count);
+    expect(fromAnchor.jaccard).toBeCloseTo(fromTight.jaccard, 4);
+  });
+
+  it('respects the limit', () => {
+    const results = q.getCoChangedFiles('anchor.ts', { limit: 1 });
+    expect(results).toHaveLength(1);
+  });
+});
+
+describe('CodeGraph end-to-end (mining wired into indexAll/sync)', () => {
+  let dir: string;
+  let cg: CodeGraph;
+
+  beforeEach(async () => {
+    dir = tempGitRepo('cochange-e2e-');
+    fs.writeFileSync(path.join(dir, 'a.ts'), 'export const a = 1;');
+    fs.writeFileSync(path.join(dir, 'b.ts'), 'export const b = 1;');
+    execFileSync('git', ['add', '-A'], { cwd: dir, stdio: 'pipe' });
+    execFileSync('git', ['commit', '-m', 'initial'], { cwd: dir, stdio: 'pipe' });
+    // A second co-change of the same pair so we cross MIN_COCHANGE_COUNT (2).
+    fs.writeFileSync(path.join(dir, 'a.ts'), 'export const a = 2;');
+    fs.writeFileSync(path.join(dir, 'b.ts'), 'export const b = 2;');
+    execFileSync('git', ['add', '-A'], { cwd: dir, stdio: 'pipe' });
+    execFileSync('git', ['commit', '-m', 'second'], { cwd: dir, stdio: 'pipe' });
+
+    cg = CodeGraph.initSync(dir, { config: { include: ['**/*.ts'], exclude: [] } });
+    await cg.indexAll();
+  });
+
+  afterEach(() => {
+    if (cg) cg.destroy();
+    if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true });
+  });
+
+  it('populates co_changes after indexAll on a git repo', () => {
+    const partners = cg.getCoChangedFiles('a.ts');
+    expect(partners.length).toBeGreaterThanOrEqual(1);
+    const b = partners.find((p) => p.path === 'b.ts');
+    expect(b).toBeDefined();
+    expect(b!.count).toBeGreaterThanOrEqual(MIN_COCHANGE_COUNT);
+  });
+
+  it('stores the last mined HEAD in project_metadata', () => {
+    // Internal-state assertion to confirm incremental sync has an anchor.
+    // `queries` is private; cast to access it from the test.
+    const head = (cg as unknown as { queries: QueryBuilder }).queries.getMetadata(LAST_MINED_HEAD_KEY);
+    expect(head).toMatch(/^[0-9a-f]{40}$/);
+  });
+
+  it('updates incrementally on sync', async () => {
+    const before = cg.getCoChangedFiles('a.ts').find((p) => p.path === 'b.ts')!.count;
+    fs.writeFileSync(path.join(dir, 'a.ts'), 'export const a = 3;');
+    fs.writeFileSync(path.join(dir, 'b.ts'), 'export const b = 3;');
+    execFileSync('git', ['add', '-A'], { cwd: dir, stdio: 'pipe' });
+    execFileSync('git', ['commit', '-m', 'third'], { cwd: dir, stdio: 'pipe' });
+
+    await cg.sync();
+    const after = cg.getCoChangedFiles('a.ts').find((p) => p.path === 'b.ts')!.count;
+    expect(after).toBe(before + 1);
+  });
+
+  it('respects enableCoChange: false (no mining, empty results)', async () => {
+    const dir2 = tempGitRepo('cochange-disabled-');
+    fs.writeFileSync(path.join(dir2, 'a.ts'), '1');
+    fs.writeFileSync(path.join(dir2, 'b.ts'), '1');
+    execFileSync('git', ['add', '-A'], { cwd: dir2, stdio: 'pipe' });
+    execFileSync('git', ['commit', '-m', 'c1'], { cwd: dir2, stdio: 'pipe' });
+
+    const cg2 = CodeGraph.initSync(dir2, {
+      config: { include: ['**/*.ts'], exclude: [], enableCoChange: false },
+    });
+    await cg2.indexAll();
+    expect(cg2.getCoChangedFiles('a.ts')).toHaveLength(0);
+    cg2.destroy();
+    fs.rmSync(dir2, { recursive: true, force: true });
+  });
+
+  it('persists enableCoChange across config save/load round-trip', () => {
+    // Regression: mergeConfig used to enumerate fields by hand and
+    // silently dropped enableCoChange, so the opt-out flag could never
+    // survive a reload from disk.
+    const dir2 = fs.mkdtempSync(path.join(os.tmpdir(), 'cochange-cfgrt-'));
+    const cg2 = CodeGraph.initSync(dir2, {
+      config: { enableCoChange: false },
+    });
+    cg2.close();
+    const reloaded = loadConfig(dir2);
+    expect(reloaded.enableCoChange).toBe(false);
+    fs.rmSync(dir2, { recursive: true, force: true });
+  });
+});
+
+describe('Migration v4: add commit_count column + co_changes table', () => {
+  let dir: string;
+
+  beforeEach(() => {
+    dir = fs.mkdtempSync(path.join(os.tmpdir(), 'cochange-migr-'));
+  });
+
+  afterEach(() => {
+    if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true });
+  });
+
+  it('adds commit_count to files and creates co_changes', () => {
+    // Build a v3-shape DB by hand.
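+    // (v3 = the pre-co-change schema: schema_versions pinned at 3, a files
+    // table without commit_count, and no co_changes table yet.)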
+    const Database = require('better-sqlite3');
+    const dbHandle = new Database(path.join(dir, 'test.db'));
+    dbHandle.exec(`
+      CREATE TABLE schema_versions (version INTEGER PRIMARY KEY, applied_at INTEGER NOT NULL, description TEXT);
+      INSERT INTO schema_versions (version, applied_at, description) VALUES (3, 0, 'v3');
+      CREATE TABLE files (
+        path TEXT PRIMARY KEY, content_hash TEXT NOT NULL, language TEXT NOT NULL,
+        size INTEGER NOT NULL, modified_at INTEGER NOT NULL, indexed_at INTEGER NOT NULL,
+        node_count INTEGER DEFAULT 0, errors TEXT
+      );
+      INSERT INTO files (path, content_hash, language, size, modified_at, indexed_at)
+      VALUES ('x.ts', '', 'typescript', 0, 0, 0);
+    `);
+    expect(getCurrentVersion(dbHandle)).toBe(3);
+
+    runMigrations(dbHandle, 3);
+    expect(getCurrentVersion(dbHandle)).toBeGreaterThanOrEqual(10);
+
+    const cols = dbHandle.prepare('PRAGMA table_info(files)').all() as Array<{ name: string }>;
+    expect(cols.some((c) => c.name === 'commit_count')).toBe(true);
+    const tableExists = dbHandle
+      .prepare("SELECT name FROM sqlite_master WHERE type='table' AND name='co_changes'")
+      .get();
+    expect(tableExists).toBeDefined();
+    dbHandle.close();
+  });
+
+  it('migration is idempotent on partial-DDL re-run', () => {
+    const Database = require('better-sqlite3');
+    const dbHandle = new Database(path.join(dir, 'test.db'));
+    dbHandle.exec(`
+      CREATE TABLE schema_versions (version INTEGER PRIMARY KEY, applied_at INTEGER NOT NULL, description TEXT);
+      INSERT INTO schema_versions (version, applied_at, description) VALUES (3, 0, 'v3');
+      CREATE TABLE files (
+        path TEXT PRIMARY KEY, content_hash TEXT NOT NULL, language TEXT NOT NULL,
+        size INTEGER NOT NULL, modified_at INTEGER NOT NULL, indexed_at INTEGER NOT NULL,
+        node_count INTEGER DEFAULT 0, errors TEXT,
+        commit_count INTEGER NOT NULL DEFAULT 0 -- partial pre-existing state
+      );
+    `);
+    expect(() => runMigrations(dbHandle, 3)).not.toThrow();
+    expect(getCurrentVersion(dbHandle)).toBeGreaterThanOrEqual(10);
+    dbHandle.close();
+  });
+});
diff --git a/__tests__/codegraphignore.test.ts b/__tests__/codegraphignore.test.ts
new file mode 100644
index 00000000..4d7e58c5
--- /dev/null
+++ b/__tests__/codegraphignore.test.ts
@@ -0,0 +1,168 @@
+/**
+ * .codegraphignore Tests
+ *
+ * Regression test for the bug where the .codegraphignore marker file was
+ * honored by the filesystem-walk fallback (`scanDirectoryWalk`) but
+ * silently ignored by the git fast path (`getGitVisibleFiles` and
+ * `getGitChangedFiles`). Same project gave different file sets depending
+ * on whether `.git` existed.
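+ *
+ * The marker is just an empty `.codegraphignore` file: dropping one into a
+ * directory prunes that whole subtree from indexing, and both scan paths
+ * must agree on that.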
+ */ + +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; +import { execFileSync } from 'child_process'; +import { scanDirectory } from '../src/extraction'; +import { DEFAULT_CONFIG, CodeGraphConfig } from '../src/types'; +import CodeGraph from '../src/index'; + +function tempDir(prefix: string): string { + return fs.mkdtempSync(path.join(os.tmpdir(), prefix)); +} + +function git(cwd: string, ...args: string[]) { + execFileSync('git', args, { cwd, stdio: 'pipe' }); +} + +const config: CodeGraphConfig = { + ...DEFAULT_CONFIG, + include: ['**/*.ts'], + exclude: [], +}; + +describe('.codegraphignore marker (bug #3)', () => { + describe('git fast path', () => { + let dir: string; + + beforeEach(() => { + dir = tempDir('codegraph-ignore-git-'); + git(dir, 'init'); + git(dir, 'config', 'user.email', 'test@test.com'); + git(dir, 'config', 'user.name', 'Test'); + // Pin branch name for determinism across git defaults + git(dir, 'symbolic-ref', 'HEAD', 'refs/heads/main'); + + fs.mkdirSync(path.join(dir, 'src')); + fs.mkdirSync(path.join(dir, 'vendor')); + fs.mkdirSync(path.join(dir, 'vendor', 'lib')); + fs.writeFileSync(path.join(dir, 'src', 'app.ts'), 'export const a = 1;'); + fs.writeFileSync(path.join(dir, 'vendor', 'pkg.ts'), 'export const v = 1;'); + fs.writeFileSync(path.join(dir, 'vendor', 'lib', 'sub.ts'), 'export const s = 1;'); + // Mark vendor/ as ignored + fs.writeFileSync(path.join(dir, 'vendor', '.codegraphignore'), ''); + + git(dir, 'add', '-A'); + git(dir, 'commit', '-m', 'initial'); + }); + + afterEach(() => { + if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true }); + }); + + it('scanDirectory honors .codegraphignore on the git fast path', () => { + const files = scanDirectory(dir, config); + expect(files).toContain('src/app.ts'); + expect(files).not.toContain('vendor/pkg.ts'); + expect(files).not.toContain('vendor/lib/sub.ts'); + }); + + it('marker at project root excludes everything', () => { + fs.writeFileSync(path.join(dir, '.codegraphignore'), ''); + // Need to add it to git so ls-files sees it (or rely on -o) + git(dir, 'add', '-A'); + git(dir, 'commit', '-m', 'add root marker'); + const files = scanDirectory(dir, config); + expect(files).toEqual([]); + }); + + it('marker in nested subdir does not affect siblings', () => { + // Add another sibling subdir without a marker + fs.mkdirSync(path.join(dir, 'libs')); + fs.writeFileSync(path.join(dir, 'libs', 'util.ts'), 'export const u = 1;'); + git(dir, 'add', '-A'); + git(dir, 'commit', '-m', 'add libs'); + + const files = scanDirectory(dir, config); + expect(files).toContain('src/app.ts'); + expect(files).toContain('libs/util.ts'); + expect(files).not.toContain('vendor/pkg.ts'); + }); + + it('respects marker added after initial commit (untracked marker)', () => { + // The marker file itself need not be committed — it can be a local + // override. Add marker AFTER commit, do not commit it. + fs.mkdirSync(path.join(dir, 'generated')); + fs.writeFileSync(path.join(dir, 'generated', 'gen.ts'), 'export const g = 1;'); + fs.writeFileSync(path.join(dir, 'generated', '.codegraphignore'), ''); + // The .ts file is untracked but visible via `git ls-files -o`. + // The marker is also untracked — we still detect it via fs check. 
+
+      const files = scanDirectory(dir, config);
+      expect(files).not.toContain('generated/gen.ts');
+    });
+  });
+
+  describe('parity with non-git fallback (filesystem walk)', () => {
+    let dir: string;
+
+    beforeEach(() => {
+      dir = tempDir('codegraph-ignore-walk-');
+      fs.mkdirSync(path.join(dir, 'src'));
+      fs.mkdirSync(path.join(dir, 'vendor'));
+      fs.writeFileSync(path.join(dir, 'src', 'app.ts'), 'export const a = 1;');
+      fs.writeFileSync(path.join(dir, 'vendor', 'pkg.ts'), 'export const v = 1;');
+      fs.writeFileSync(path.join(dir, 'vendor', '.codegraphignore'), '');
+    });
+
+    afterEach(() => {
+      if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true });
+    });
+
+    it('non-git project also honors the marker (sanity / pre-existing behavior)', () => {
+      const files = scanDirectory(dir, config);
+      expect(files).toContain('src/app.ts');
+      expect(files).not.toContain('vendor/pkg.ts');
+    });
+  });
+
+  describe('sync git path (getGitChangedFiles)', () => {
+    let dir: string;
+    let cg: CodeGraph;
+
+    beforeEach(async () => {
+      dir = tempDir('codegraph-ignore-sync-');
+      git(dir, 'init');
+      git(dir, 'config', 'user.email', 'test@test.com');
+      git(dir, 'config', 'user.name', 'Test');
+      git(dir, 'symbolic-ref', 'HEAD', 'refs/heads/main');
+
+      fs.mkdirSync(path.join(dir, 'src'));
+      fs.mkdirSync(path.join(dir, 'vendor'));
+      fs.writeFileSync(path.join(dir, 'src', 'app.ts'), 'export const a = 1;');
+      fs.writeFileSync(path.join(dir, 'vendor', '.codegraphignore'), '');
+
+      git(dir, 'add', '-A');
+      git(dir, 'commit', '-m', 'initial');
+
+      cg = CodeGraph.initSync(dir, { config: { include: ['**/*.ts'], exclude: [] } });
+      await cg.indexAll();
+    });
+
+    afterEach(() => {
+      if (cg) cg.destroy();
+      if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true });
+    });
+
+    it('sync ignores changes inside marker dirs', async () => {
+      // Add a new file under vendor/ — should NOT be picked up by sync.
+      fs.writeFileSync(path.join(dir, 'vendor', 'leaked.ts'), 'export const x = 1;');
+      // Also add a real change to confirm sync still runs.
+      fs.writeFileSync(path.join(dir, 'src', 'app.ts'), 'export const a = 2;');
+
+      const result = await cg.sync();
+      expect(result.changedFilePaths).toContain('src/app.ts');
+      expect(result.changedFilePaths ?? []).not.toContain('vendor/leaked.ts');
+    });
+  });
+});
diff --git a/__tests__/config-refs.test.ts b/__tests__/config-refs.test.ts
new file mode 100644
index 00000000..ab1a63e4
--- /dev/null
+++ b/__tests__/config-refs.test.ts
@@ -0,0 +1,288 @@
+/**
+ * Config-refs tests: parser unit tests + end-to-end through CodeGraph.
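+ *
+ * Parser coverage: process.env (TS/JS), os.getenv / os.environ (Python),
+ * os.Getenv / os.LookupEnv (Go), ENV[...] / ENV.fetch (Ruby),
+ * env! / std::env::var (Rust), and System.getenv (Java/Kotlin).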
+ */ +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import * as fs from 'fs'; +import * as os from 'os'; +import * as path from 'path'; +import { extractConfigRefs } from '../src/config-refs'; +import CodeGraph from '../src/index'; + +let testDir: string; +let cg: CodeGraph | null = null; + +function write(rel: string, content: string) { + const abs = path.join(testDir, rel); + fs.mkdirSync(path.dirname(abs), { recursive: true }); + fs.writeFileSync(abs, content); +} + +beforeEach(() => { + testDir = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-config-')); +}); + +afterEach(() => { + if (cg) { + cg.destroy(); + cg = null; + } + if (fs.existsSync(testDir)) fs.rmSync(testDir, { recursive: true, force: true }); +}); + +// ============================================================================ +// Pure parser tests (no CodeGraph) +// ============================================================================ + +describe('extractConfigRefs', () => { + it('extracts process.env.X from TS', () => { + write('a.ts', `const port = process.env.OBSIDIAN_PORT;\n`); + const refs = extractConfigRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null); + expect(refs.length).toBe(1); + expect(refs[0]!.configKey).toBe('OBSIDIAN_PORT'); + expect(refs[0]!.line).toBe(1); + }); + + it('extracts process.env["X"] from JS', () => { + write('a.js', `module.exports = { port: process.env["MY_KEY"] };\n`); + const refs = extractConfigRefs(testDir, [{ path: 'a.js', language: 'javascript' }], () => null); + expect(refs.map((r) => r.configKey)).toEqual(['MY_KEY']); + }); + + it('extracts os.getenv / os.environ from Python', () => { + write( + 'a.py', + [ + `import os`, + `port = os.getenv("PYTHON_PORT")`, + `host = os.environ.get("PYTHON_HOST")`, + `path = os.environ["PYTHON_PATH"]`, + `name = getenv("PYTHON_NAME")`, + ].join('\n') + ); + const refs = extractConfigRefs(testDir, [{ path: 'a.py', language: 'python' }], () => null); + expect(new Set(refs.map((r) => r.configKey))).toEqual( + new Set(['PYTHON_PORT', 'PYTHON_HOST', 'PYTHON_PATH', 'PYTHON_NAME']) + ); + }); + + it('extracts os.Getenv / os.LookupEnv from Go', () => { + write( + 'a.go', + [ + `package main`, + `import "os"`, + `var Port = os.Getenv("GO_PORT")`, + `var Host, _ = os.LookupEnv("GO_HOST")`, + ].join('\n') + ); + const refs = extractConfigRefs(testDir, [{ path: 'a.go', language: 'go' }], () => null); + expect(new Set(refs.map((r) => r.configKey))).toEqual(new Set(['GO_PORT', 'GO_HOST'])); + }); + + it('extracts ENV[...] 
+    write('a.rb', `port = ENV["RUBY_PORT"]\nhost = ENV.fetch("RUBY_HOST")\n`);
+    const refs = extractConfigRefs(testDir, [{ path: 'a.rb', language: 'ruby' }], () => null);
+    expect(new Set(refs.map((r) => r.configKey))).toEqual(new Set(['RUBY_PORT', 'RUBY_HOST']));
+  });
+
+  it('extracts env!/std::env::var from Rust', () => {
+    write(
+      'a.rs',
+      [
+        `let port = env!("RUST_PORT");`,
+        `let host = std::env::var("RUST_HOST").unwrap();`,
+      ].join('\n')
+    );
+    const refs = extractConfigRefs(testDir, [{ path: 'a.rs', language: 'rust' }], () => null);
+    expect(new Set(refs.map((r) => r.configKey))).toEqual(new Set(['RUST_PORT', 'RUST_HOST']));
+  });
+
+  it('extracts System.getenv from Java/Kotlin', () => {
+    write('A.java', `String port = System.getenv("JAVA_PORT");\n`);
+    const refs = extractConfigRefs(testDir, [{ path: 'A.java', language: 'java' }], () => null);
+    expect(refs.map((r) => r.configKey)).toEqual(['JAVA_PORT']);
+  });
+
+  it('only matches UPPER_CASE keys (skips lower-case identifiers)', () => {
+    write('a.ts', `const x = process.env.somethingDynamic;\nconst y = process.env.GOOD_KEY;\n`);
+    const refs = extractConfigRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null);
+    expect(refs.map((r) => r.configKey)).toEqual(['GOOD_KEY']);
+  });
+
+  it('skips files in unsupported languages without crashing', () => {
+    write('a.swift', `let port = ProcessInfo.processInfo.environment["SWIFT_PORT"]\n`);
+    const refs = extractConfigRefs(testDir, [{ path: 'a.swift', language: 'swift' }], () => null);
+    // Swift not in PATTERNS for v1.
+    expect(refs).toEqual([]);
+  });
+
+  it('captures the correct 1-indexed line number', () => {
+    write(
+      'a.ts',
+      [
+        `// line 1`,
+        `// line 2`,
+        `const x = process.env.LINE_THREE_KEY;`,
+        `// line 4`,
+        `const y = process.env.LINE_FIVE_KEY;`,
+      ].join('\n')
+    );
+    const refs = extractConfigRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null);
+    expect(refs).toEqual([
+      expect.objectContaining({ configKey: 'LINE_THREE_KEY', line: 3 }),
+      expect.objectContaining({ configKey: 'LINE_FIVE_KEY', line: 5 }),
+    ]);
+  });
+
+  it('threads the resolveEnclosing closure correctly', () => {
+    write('a.ts', `const x = process.env.FOO;\n`);
+    const calls: Array<[string, number]> = [];
+    extractConfigRefs(
+      testDir,
+      [{ path: 'a.ts', language: 'typescript' }],
+      (filePath, line) => {
+        calls.push([filePath, line]);
+        return 'fake-node-id';
+      }
+    );
+    expect(calls).toEqual([['a.ts', 1]]);
+  });
+
+  it('survives a missing file (skips, no throw)', () => {
+    const refs = extractConfigRefs(
+      testDir,
+      [{ path: 'does-not-exist.ts', language: 'typescript' }],
+      () => null
+    );
+    expect(refs).toEqual([]);
+  });
+});
+
+// ============================================================================
+// End-to-end through CodeGraph
+// ============================================================================
+
+describe('CodeGraph config refs', () => {
+  it('persists env reads after indexAll and resolves enclosing function', async () => {
+    write(
+      'src/server.ts',
+      [
+        `export function start() {`,
+        `  const port = process.env.OBSIDIAN_PORT ?? 8080;`,
+        `  return port;`,
+        `}`,
+        ``,
+        `export function getApiKey() {`,
+        `  return process.env.OBSIDIAN_API_KEY;`,
+        `}`,
+        ``,
+        `// top-level read`,
+        `export const HOST = process.env.OBSIDIAN_HOST;`,
+      ].join('\n')
+    );
+    cg = CodeGraph.initSync(testDir, {
+      config: { include: ['**/*.ts'], exclude: [] },
+    });
+    await cg.indexAll();
+
+    // All three keys should be visible.
+    const keys = cg.getConfigKeys({ configKind: 'env' });
+    expect(keys.map((k) => k.configKey).sort()).toEqual([
+      'OBSIDIAN_API_KEY',
+      'OBSIDIAN_HOST',
+      'OBSIDIAN_PORT',
+    ]);
+
+    // The OBSIDIAN_PORT read should be attributed to `start`.
+    const portSites = cg.getConfigRefsByKey('OBSIDIAN_PORT');
+    expect(portSites.length).toBe(1);
+    expect(portSites[0]!.sourceName).toBe('start');
+
+    // The HOST read is at the top level — sourceName should be null.
+    const hostSites = cg.getConfigRefsByKey('OBSIDIAN_HOST');
+    expect(hostSites[0]!.sourceName).toBeNull();
+  });
+
+  it('reverse view: getConfigKeysForNode returns keys read by a function', async () => {
+    write(
+      'src/a.ts',
+      [
+        `export function loadConfig() {`,
+        `  const a = process.env.KEY_A;`,
+        `  const b = process.env.KEY_B;`,
+        `  return { a, b };`,
+        `}`,
+      ].join('\n')
+    );
+    cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } });
+    await cg.indexAll();
+
+    const node = cg.getNodesInFile('src/a.ts').find((n) => n.name === 'loadConfig')!;
+    const keys = cg.getConfigKeysForNode(node.id).map((r) => r.configKey).sort();
+    expect(keys).toEqual(['KEY_A', 'KEY_B']);
+  });
+
+  it('respects enableConfigRefs=false', async () => {
+    write('src/a.ts', `export const PORT = process.env.PORT;\n`);
+    cg = CodeGraph.initSync(testDir, {
+      config: { include: ['**/*.ts'], exclude: [], enableConfigRefs: false },
+    });
+    await cg.indexAll();
+    expect(cg.getConfigKeys()).toEqual([]);
+  });
+
+  it('incremental sync replaces refs for changed files only', async () => {
+    write('src/a.ts', `export const A = process.env.OLD_KEY;\n`);
+    write('src/b.ts', `export const B = process.env.UNCHANGED_KEY;\n`);
+    cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } });
+    await cg.indexAll();
+    expect(cg.getConfigKeys().map((k) => k.configKey).sort()).toEqual([
+      'OLD_KEY',
+      'UNCHANGED_KEY',
+    ]);
+
+    // Edit only a.ts — UNCHANGED_KEY should still be there.
+    write('src/a.ts', `export const A = process.env.NEW_KEY;\n`);
+    await cg.sync();
+
+    const keys = cg.getConfigKeys().map((k) => k.configKey).sort();
+    expect(keys).toContain('NEW_KEY');
+    expect(keys).toContain('UNCHANGED_KEY');
+    expect(keys).not.toContain('OLD_KEY');
+  });
+
+  it('drops refs when a file is edited to remove its last env read', async () => {
+    // Regression for the empty-rows early-return data-corruption bug:
+    // applyConfigRefs([]) used to short-circuit without deleting the
+    // stale rows for the file. The sync path now explicitly invalidates
+    // rows for every changed file *before* extracting, regardless of
+    // whether the new content has any reads.
+    write('src/a.ts', `export const PORT = process.env.REMOVED_KEY;\n`);
+    cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } });
+    await cg.indexAll();
+    expect(cg.getConfigKeys().some((k) => k.configKey === 'REMOVED_KEY')).toBe(true);
+
+    // Edit a.ts to remove the env read entirely (no remaining reads).
+    write('src/a.ts', `export const PORT = 8080; // no env read here\n`);
+    await cg.sync();
+
+    expect(cg.getConfigKeys().some((k) => k.configKey === 'REMOVED_KEY')).toBe(false);
+  });
+
+  it('drops refs for files removed between syncs', async () => {
+    write('src/a.ts', `export const A = process.env.GOING_AWAY;\n`);
+    cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } });
+    await cg.indexAll();
+    expect(cg.getConfigKeys().some((k) => k.configKey === 'GOING_AWAY')).toBe(true);
+
+    fs.unlinkSync(path.join(testDir, 'src/a.ts'));
+    await cg.sync();
+
+    expect(cg.getConfigKeys().some((k) => k.configKey === 'GOING_AWAY')).toBe(false);
+  });
+
+  // (Removed: a defensive test for the v4-migration-collision bug class.
+  // With file-based migrations (NNN-name.ts), two PRs claiming the same
+  // version produces a filesystem-level conflict, so the silent skip the
+  // defensive guard protected against can no longer happen.)
+});
diff --git a/__tests__/context.test.ts b/__tests__/context.test.ts
index 52dae1fe..9a0614aa 100644
--- a/__tests__/context.test.ts
+++ b/__tests__/context.test.ts
@@ -210,6 +210,19 @@ export function validateEmail(email: string): boolean {
 
     expect(result.nodes.size).toBeLessThanOrEqual(5);
   });
+
+  it('should clamp absurd searchLimit/maxNodes values to safe upper bounds', async () => {
+    // Without clamping, the internal `findNodesByExactName` query would
+    // request `searchLimit * 5` rows — passing 1e9 here would blow out
+    // memory. The call should complete in normal time and not return more
+    // than the hard cap on maxNodes (1000).
+    const result = await cg.findRelevantContext('function', {
+      searchLimit: 1_000_000_000,
+      maxNodes: 1_000_000_000,
+      traversalDepth: 1_000,
+    });
+    expect(result.nodes.size).toBeLessThanOrEqual(1000);
+  });
 });
 
 describe('buildContext()', () => {
diff --git a/__tests__/db-perf.test.ts b/__tests__/db-perf.test.ts
new file mode 100644
index 00000000..256cf92c
--- /dev/null
+++ b/__tests__/db-perf.test.ts
@@ -0,0 +1,161 @@
+/**
+ * DB Performance / Correctness Tests
+ *
+ * Regression tests for three changes:
+ * 1. Batch `getNodesByIds` collapses graph-traversal N+1 reads.
+ * 2. `insertNode` invalidates the LRU cache so INSERT OR REPLACE
+ *    doesn't serve a stale cached row on next `getNodeById`.
+ * 3. `runMaintenance` runs `PRAGMA optimize` + `wal_checkpoint(PASSIVE)`
+ *    after indexAll/sync without throwing.
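+ *    (Both are meant to be cheap when idle — `optimize` analyzes only what
+ *    changed and PASSIVE checkpoints never block writers — so running them
+ *    after every sync should be safe; that property is what these tests pin.)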
+ */ + +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; +import { DatabaseConnection } from '../src/db'; +import { QueryBuilder } from '../src/db/queries'; +import { Node } from '../src/types'; + +function makeNode(id: string, name = id): Node { + return { + id, + kind: 'function', + name, + qualifiedName: name, + filePath: 'a.ts', + language: 'typescript', + startLine: 1, + endLine: 1, + startColumn: 0, + endColumn: 0, + updatedAt: Date.now(), + }; +} + +describe('getNodesByIds (batch lookup)', () => { + let dir: string; + let db: DatabaseConnection; + let q: QueryBuilder; + + beforeEach(() => { + dir = fs.mkdtempSync(path.join(os.tmpdir(), 'db-perf-batch-')); + db = DatabaseConnection.initialize(path.join(dir, 'test.db')); + q = new QueryBuilder(db.getDb()); + }); + + afterEach(() => { + db.close(); + if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true }); + }); + + it('returns a Map keyed by id, with one entry per existing node', () => { + q.insertNodes([makeNode('n1'), makeNode('n2'), makeNode('n3')]); + const out = q.getNodesByIds(['n1', 'n2', 'n3']); + expect(out.size).toBe(3); + expect(out.get('n1')!.name).toBe('n1'); + expect(out.get('n3')!.name).toBe('n3'); + }); + + it('omits missing IDs from the result map (no nulls, no exceptions)', () => { + q.insertNodes([makeNode('n1'), makeNode('n2')]); + const out = q.getNodesByIds(['n1', 'missing', 'n2']); + expect(out.size).toBe(2); + expect(out.has('missing')).toBe(false); + expect(out.has('n1')).toBe(true); + expect(out.has('n2')).toBe(true); + }); + + it('handles an empty input array', () => { + expect(q.getNodesByIds([]).size).toBe(0); + }); + + it('handles batches over the SQLite parameter limit (chunking)', () => { + // Insert 1500 nodes; the helper chunks at 500 internally. + const nodes = Array.from({ length: 1500 }, (_, i) => makeNode(`n${i}`)); + q.insertNodes(nodes); + const ids = nodes.map((n) => n.id); + const out = q.getNodesByIds(ids); + expect(out.size).toBe(1500); + // Spot-check a few from the first / middle / last chunk. + expect(out.has('n0')).toBe(true); + expect(out.has('n750')).toBe(true); + expect(out.has('n1499')).toBe(true); + }); + + it('serves cache hits from memory and queries only the misses', () => { + q.insertNodes([makeNode('n1'), makeNode('n2'), makeNode('n3')]); + // Warm the cache for n1 only. + q.getNodeById('n1'); + // Replace the underlying row to make a miss-vs-cache-hit detectable. + db.getDb().prepare('UPDATE nodes SET name = ? WHERE id = ?').run('changed', 'n1'); + const out = q.getNodesByIds(['n1', 'n2']); + // The cached n1 (still 'n1', not 'changed') must be returned. + expect(out.get('n1')!.name).toBe('n1'); + expect(out.get('n2')!.name).toBe('n2'); + }); +}); + +describe('insertNode cache invalidation', () => { + let dir: string; + let db: DatabaseConnection; + let q: QueryBuilder; + + beforeEach(() => { + dir = fs.mkdtempSync(path.join(os.tmpdir(), 'db-perf-cache-')); + db = DatabaseConnection.initialize(path.join(dir, 'test.db')); + q = new QueryBuilder(db.getDb()); + }); + + afterEach(() => { + db.close(); + if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true }); + }); + + it('does not serve a stale cached node after INSERT OR REPLACE', () => { + // Regression: insertNode (which uses INSERT OR REPLACE) used to skip + // cache invalidation, so the next getNodeById returned the pre-replace + // version until LRU eviction. 
+    const original = makeNode('n1', 'oldName');
+    q.insertNode(original);
+    const beforeReplace = q.getNodeById('n1');
+    expect(beforeReplace!.name).toBe('oldName');
+
+    // Replace via insertNode (the bug path).
+    q.insertNode({ ...original, name: 'newName', updatedAt: Date.now() });
+    const afterReplace = q.getNodeById('n1');
+    expect(afterReplace!.name).toBe('newName');
+  });
+});
+
+describe('runMaintenance', () => {
+  let dir: string;
+  let db: DatabaseConnection;
+
+  beforeEach(() => {
+    dir = fs.mkdtempSync(path.join(os.tmpdir(), 'db-perf-maint-'));
+    db = DatabaseConnection.initialize(path.join(dir, 'test.db'));
+  });
+
+  afterEach(() => {
+    db.close();
+    if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true });
+  });
+
+  it('runs without throwing on a fresh database', () => {
+    expect(() => db.runMaintenance()).not.toThrow();
+  });
+
+  it('runs without throwing after writes', () => {
+    const q = new QueryBuilder(db.getDb());
+    q.insertNodes([makeNode('n1'), makeNode('n2')]);
+    expect(() => db.runMaintenance()).not.toThrow();
+  });
+
+  it('swallows failures rather than propagating (best-effort)', () => {
+    // Close the DB so the underlying handle would normally throw on any
+    // exec(). runMaintenance must still not propagate.
+    db.close();
+    expect(() => db.runMaintenance()).not.toThrow();
+  });
+});
diff --git a/__tests__/diversify.test.ts b/__tests__/diversify.test.ts
new file mode 100644
index 00000000..181ee9c5
--- /dev/null
+++ b/__tests__/diversify.test.ts
@@ -0,0 +1,200 @@
+/**
+ * Result Diversification Tests
+ *
+ * Verifies the per-file cap on search results: queries that match many
+ * symbols in one file (the methods of a class) no longer return 10 hits
+ * from one file, but instead surface representative breadth across files.
+ */
+
+import { describe, it, expect, beforeEach, afterEach } from 'vitest';
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+import { DatabaseConnection } from '../src/db';
+import { QueryBuilder } from '../src/db/queries';
+import { diversifyByFile } from '../src/search/query-utils';
+import { Node } from '../src/types';
+
+describe('diversifyByFile (unit)', () => {
+  function r(score: number, name: string, filePath: string) {
+    return { node: { id: name, name, filePath } as Node, score };
+  }
+
+  it('caps consecutive results from the same file at perFileCap', () => {
+    const results = [
+      r(10, 'a1', 'a.ts'),
+      r(9, 'a2', 'a.ts'),
+      r(8, 'a3', 'a.ts'),
+      r(7, 'a4', 'a.ts'),
+      r(6, 'b1', 'b.ts'),
+    ];
+    const out = diversifyByFile(results, 5, 2);
+    expect(out.map((x) => x.node.name)).toEqual(['a1', 'a2', 'b1', 'a3', 'a4']);
+    // First two from a.ts (cap), then b.ts (different file), then backfill.
+  });
+
+  it('preserves overall ranking when no file dominates', () => {
+    const results = [
+      r(10, 'a1', 'a.ts'),
+      r(9, 'b1', 'b.ts'),
+      r(8, 'c1', 'c.ts'),
+      r(7, 'a2', 'a.ts'),
+    ];
+    const out = diversifyByFile(results, 4, 2);
+    expect(out.map((x) => x.node.name)).toEqual(['a1', 'b1', 'c1', 'a2']);
+  });
+
+  it('does not lose results — backfills from skipped when limit not yet filled', () => {
+    // 10 candidates all from one file, limit 5, cap 2: pick 2, backfill 3.
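+    // (Expected picks: n0 and n1 hit the cap; n2–n4 backfill the remainder
+    // in score order, since the input scores are already descending.)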
+ const results = Array.from({ length: 10 }, (_, i) => + r(10 - i, `n${i}`, 'a.ts') + ); + const out = diversifyByFile(results, 5, 2); + expect(out).toHaveLength(5); + expect(out.every((x) => x.node.filePath === 'a.ts')).toBe(true); + }); + + it('returns the input slice unchanged when perFileCap=0', () => { + const results = [ + r(10, 'a1', 'a.ts'), + r(9, 'a2', 'a.ts'), + r(8, 'a3', 'a.ts'), + ]; + expect(diversifyByFile(results, 3, 0)).toEqual(results); + }); + + it('returns input unchanged when results.length <= limit and no reordering needed', () => { + const results = [r(10, 'a1', 'a.ts'), r(9, 'a2', 'a.ts')]; + expect(diversifyByFile(results, 5, 2)).toEqual(results); + }); + + it('still reorders within limit when results.length === limit but cap rearranges', () => { + // Same total count as limit, but the cap reorders to surface peer files + // earlier in the list. + const results = [ + r(10, 'a1', 'a.ts'), + r(9, 'a2', 'a.ts'), + r(8, 'a3', 'a.ts'), + r(7, 'a4', 'a.ts'), + r(6, 'b1', 'b.ts'), + ]; + const out = diversifyByFile(results, 5, 2); + // First 2 from a.ts (cap), then b.ts, then backfill a.ts. + expect(out.map((x) => x.node.name)).toEqual(['a1', 'a2', 'b1', 'a3', 'a4']); + }); + + it('respects the limit even when picked + skipped exceed it', () => { + const results = [ + r(10, 'a1', 'a.ts'), + r(9, 'a2', 'a.ts'), + r(8, 'a3', 'a.ts'), + r(7, 'b1', 'b.ts'), + ]; + const out = diversifyByFile(results, 2, 2); + expect(out).toHaveLength(2); + expect(out.map((x) => x.node.name)).toEqual(['a1', 'a2']); + }); + + it('always preserves the top-scoring result at position 0', () => { + const results = [ + r(100, 'top', 'big.ts'), + r(50, 'big2', 'big.ts'), + r(40, 'big3', 'big.ts'), + r(30, 'big4', 'big.ts'), + r(20, 'other', 'other.ts'), + ]; + const out = diversifyByFile(results, 3, 2); + expect(out[0].node.name).toBe('top'); + }); +}); + +describe('searchNodes per-file diversification (integration)', () => { + let dir: string; + let db: DatabaseConnection; + let q: QueryBuilder; + + function makeNode(id: string, name: string, kind: Node['kind'], filePath: string): Node { + return { + id, + kind, + name, + qualifiedName: `${filePath}::${name}`, + filePath, + language: 'typescript', + startLine: 1, + endLine: 1, + startColumn: 0, + endColumn: 0, + updatedAt: Date.now(), + }; + } + + beforeEach(() => { + dir = fs.mkdtempSync(path.join(os.tmpdir(), 'diversify-search-')); + db = DatabaseConnection.initialize(path.join(dir, 'test.db')); + q = new QueryBuilder(db.getDb()); + // Simulate the "10 methods of one class" scenario: a class plus many + // methods all sharing a common token, all in one file. Plus a peer + // file with a sibling implementation. 
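+      // The searches below all hit the shared "connect" token; without the
+      // cap, src/db.ts would crowd src/pool.ts out of the top results.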
+ const nodes: Node[] = [ + makeNode('cls', 'DatabaseConnection', 'class', 'src/db.ts'), + makeNode('m1', 'connect', 'method', 'src/db.ts'), + makeNode('m2', 'disconnect', 'method', 'src/db.ts'), + makeNode('m3', 'reconnect', 'method', 'src/db.ts'), + makeNode('m4', 'isConnected', 'method', 'src/db.ts'), + makeNode('m5', 'connectionString', 'property', 'src/db.ts'), + makeNode('peer', 'PoolConnection', 'class', 'src/pool.ts'), + makeNode('peer2', 'connectPool', 'function', 'src/pool.ts'), + ]; + q.insertNodes(nodes); + }); + + afterEach(() => { + db.close(); + if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true }); + }); + + it('caps results per file at the default (3) so peer files surface', () => { + const results = q.searchNodes('connect', { limit: 5 }); + const fromDbTs = results.filter((r) => r.node.filePath === 'src/db.ts').length; + const fromPool = results.filter((r) => r.node.filePath === 'src/pool.ts').length; + expect(fromDbTs).toBeLessThanOrEqual(3); // cap + expect(fromPool).toBeGreaterThanOrEqual(1); // peer file represented + }); + + it('honors perFileCap: 0 (disabled) — does not enforce a per-file limit', () => { + // Insert a heavy imbalance so dominance is unambiguous: 10 matching + // methods in db.ts, only the existing pool.ts entries elsewhere. + const heavyDb: Node[] = Array.from({ length: 10 }, (_, i) => + makeNode(`heavy${i}`, `connectVariant${i}`, 'method', 'src/db.ts') + ); + q.insertNodes(heavyDb); + const results = q.searchNodes('connect', { limit: 8, perFileCap: 0 }); + const fromDbTs = results.filter((r) => r.node.filePath === 'src/db.ts').length; + expect(fromDbTs).toBeGreaterThan(3); + }); + + it('honors a higher perFileCap', () => { + const results = q.searchNodes('connect', { limit: 6, perFileCap: 5 }); + const fromDbTs = results.filter((r) => r.node.filePath === 'src/db.ts').length; + expect(fromDbTs).toBeLessThanOrEqual(5); + }); + + it('preserves the top-scoring hit even with diversification', () => { + // Class node with the most direct name match is the most relevant — + // diversification must never displace it from #1. + const results = q.searchNodes('DatabaseConnection', { limit: 3 }); + expect(results[0].node.name).toBe('DatabaseConnection'); + }); + + it('does not lose results — fills limit by backfilling skipped same-file hits', () => { + // If only one file has matches, all results legitimately come from it. + // The cap should not cause us to return fewer than `limit` results. + const onlyOneFileNodes: Node[] = Array.from({ length: 10 }, (_, i) => + makeNode(`only${i}`, `solo${i}`, 'function', 'src/only.ts') + ); + q.insertNodes(onlyOneFileNodes); + const results = q.searchNodes('solo', { limit: 5 }); + expect(results.length).toBe(5); + }); +}); diff --git a/__tests__/edges-unique.test.ts b/__tests__/edges-unique.test.ts new file mode 100644 index 00000000..49eced53 --- /dev/null +++ b/__tests__/edges-unique.test.ts @@ -0,0 +1,166 @@ +/** + * Edge Uniqueness Tests + * + * Regression tests for the bug where `INSERT OR IGNORE INTO edges` was + * silently a no-op: the only candidate key was the AUTOINCREMENT id (which + * never conflicts), so duplicate edges accumulated on every re-emission / + * re-resolution. + * + * Fix: a UNIQUE index on (source, target, kind, COALESCE(line, -1), + * COALESCE(col, -1)) backs a fresh-install schema and is also applied via + * migration v4 (with a dedup pass over existing rows). 
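+ *
+ * A sketch of the index shape these tests imply (the authoritative DDL
+ * lives in src/db/migrations; treat this exact statement as an assumption):
+ *
+ *   CREATE UNIQUE INDEX idx_edges_unique
+ *     ON edges (source, target, kind, COALESCE(line, -1), COALESCE(col, -1));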
+ */ + +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; +import { DatabaseConnection } from '../src/db'; +import { QueryBuilder } from '../src/db/queries'; +import { Edge, Node } from '../src/types'; +import { runMigrations, getCurrentVersion, CURRENT_SCHEMA_VERSION } from '../src/db/migrations'; + +function tempDb(): { dir: string; db: DatabaseConnection; q: QueryBuilder } { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-edges-unique-')); + const db = DatabaseConnection.initialize(path.join(dir, 'test.db')); + const q = new QueryBuilder(db.getDb()); + return { dir, db, q }; +} + +function cleanup(dir: string, db: DatabaseConnection) { + db.close(); + if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true }); +} + +function makeNode(id: string, name: string): Node { + return { + id, + kind: 'function', + name, + qualifiedName: `f::${name}`, + filePath: 'a.ts', + language: 'typescript', + startLine: 1, + endLine: 1, + startColumn: 0, + endColumn: 0, + updatedAt: Date.now(), + }; +} + +function edgesCount(db: DatabaseConnection): number { + const row = db.getDb().prepare('SELECT COUNT(*) as c FROM edges').get() as { c: number }; + return row.c; +} + +describe('Edge UNIQUE constraint (bug #2)', () => { + let dir: string; + let db: DatabaseConnection; + let q: QueryBuilder; + + beforeEach(() => { + ({ dir, db, q } = tempDb()); + q.insertNodes([makeNode('n1', 'foo'), makeNode('n2', 'bar')]); + }); + + afterEach(() => cleanup(dir, db)); + + it('rejects duplicate (source, target, kind, line, col)', () => { + const e: Edge = { source: 'n1', target: 'n2', kind: 'calls', line: 10, column: 5 }; + q.insertEdge(e); + q.insertEdge(e); // INSERT OR IGNORE — should be a no-op now + expect(edgesCount(db)).toBe(1); + }); + + it('treats two NULL line edges as duplicates (COALESCE in unique index)', () => { + const e: Edge = { source: 'n1', target: 'n2', kind: 'calls' }; + q.insertEdge(e); + q.insertEdge(e); + expect(edgesCount(db)).toBe(1); + }); + + it('allows same source/target/kind on different lines', () => { + q.insertEdge({ source: 'n1', target: 'n2', kind: 'calls', line: 1 }); + q.insertEdge({ source: 'n1', target: 'n2', kind: 'calls', line: 2 }); + expect(edgesCount(db)).toBe(2); + }); + + it('allows same source/target/line on different kinds', () => { + q.insertEdge({ source: 'n1', target: 'n2', kind: 'calls', line: 1 }); + q.insertEdge({ source: 'n1', target: 'n2', kind: 'references', line: 1 }); + expect(edgesCount(db)).toBe(2); + }); + + it('insertEdges (batch) dedupes within the same call', () => { + const e: Edge = { source: 'n1', target: 'n2', kind: 'calls', line: 1, column: 1 }; + q.insertEdges([e, e, e]); + expect(edgesCount(db)).toBe(1); + }); + + it('survives the same edge being re-emitted across many cycles', () => { + const e: Edge = { source: 'n1', target: 'n2', kind: 'calls', line: 1 }; + for (let i = 0; i < 100; i++) { + q.insertEdge(e); + } + expect(edgesCount(db)).toBe(1); + }); +}); + +describe('Migration v4: dedup existing edges', () => { + let dir: string; + let dbPath: string; + + beforeEach(() => { + dir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-migr-v4-')); + dbPath = path.join(dir, 'test.db'); + }); + + afterEach(() => { + if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true }); + }); + + it('collapses pre-existing duplicates and adds the UNIQUE index', () => { + // Build a v3-shaped database manually: schema, but 
simulate a stale + // version row + insert duplicates that the missing UNIQUE index let + // through. We use the real initialize() path then drop the index + + // version row to back-date the DB. + const db = DatabaseConnection.initialize(dbPath); + db.getDb().exec(`DROP INDEX IF EXISTS idx_edges_unique;`); + db.getDb().exec(`DELETE FROM schema_versions;`); + db.getDb().prepare( + 'INSERT INTO schema_versions (version, applied_at, description) VALUES (3, ?, ?)' + ).run(Date.now(), 'simulated v3'); + + const q = new QueryBuilder(db.getDb()); + q.insertNodes([makeNode('n1', 'foo'), makeNode('n2', 'bar')]); + // Force-insert duplicates via raw SQL (bypassing the constraint that + // is now absent). Three rows that should collapse to one. + const stmt = db.getDb().prepare( + 'INSERT INTO edges (source, target, kind, line, col) VALUES (?, ?, ?, ?, ?)' + ); + stmt.run('n1', 'n2', 'calls', 10, 5); + stmt.run('n1', 'n2', 'calls', 10, 5); + stmt.run('n1', 'n2', 'calls', 10, 5); + // And one with NULL line/col, also duplicated + stmt.run('n1', 'n2', 'references', null, null); + stmt.run('n1', 'n2', 'references', null, null); + + expect(edgesCount(db)).toBe(5); + expect(getCurrentVersion(db.getDb())).toBe(3); + + // Run migrations forward + runMigrations(db.getDb(), 3); + + expect(getCurrentVersion(db.getDb())).toBe(CURRENT_SCHEMA_VERSION); + expect(CURRENT_SCHEMA_VERSION).toBeGreaterThanOrEqual(4); + // 3 calls dups → 1, 2 references dups → 1 + expect(edgesCount(db)).toBe(2); + + // Now the constraint is enforced: another duplicate insert is a no-op. + const q2 = new QueryBuilder(db.getDb()); + q2.insertEdge({ source: 'n1', target: 'n2', kind: 'calls', line: 10, column: 5 }); + expect(edgesCount(db)).toBe(2); + + db.close(); + }); +}); diff --git a/__tests__/embeddings.test.ts b/__tests__/embeddings.test.ts new file mode 100644 index 00000000..216e4a08 --- /dev/null +++ b/__tests__/embeddings.test.ts @@ -0,0 +1,388 @@ +/** + * Embedding pipeline + hybrid search + cross-language matching. + * + * Reuses the in-process fake-Ollama pattern from llm.test.ts so the + * tests stay hermetic. The fake server returns deterministic vectors + * derived from the input text so we can assert ordering by hand. + */ + +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; +import * as http from 'http'; +import { AddressInfo } from 'net'; +import { CodeGraph } from '../src'; +import { + vectorToBytes, + bytesToVector, + cosineNormalised, + reciprocalRankFusion, + topKByCosine, + topKByCosineMatrix, + EmbeddingCache, +} from '../src/llm/embeddings'; + +const EMBED_DIM = 8; + +function l2(v: Float32Array): Float32Array { + let s = 0; + for (let i = 0; i < v.length; i++) s += v[i]! * v[i]!; + const n = Math.sqrt(s) || 1; + const out = new Float32Array(v.length); + for (let i = 0; i < v.length; i++) out[i] = v[i]! / n; + return out; +} + +/** Deterministic 8-dim vector keyed off character codes. 
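+ * Identical input always yields the identical vector; the tests below rely
+ * only on that determinism, not on distinct inputs staying distinct.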
 */
+function fakeEmbed(text: string): number[] {
+  const v = new Array(EMBED_DIM).fill(0);
+  for (let i = 0; i < text.length; i++) {
+    v[i % EMBED_DIM] += text.charCodeAt(i) % 17;
+  }
+  return v;
+}
+
+interface FakeServer {
+  url: string;
+  chatCalls: number;
+  embedCalls: number;
+  close: () => Promise<void>;
+}
+
+async function startFake(): Promise<FakeServer> {
+  const state = { chatCalls: 0, embedCalls: 0 };
+  const server = http.createServer((req, res) => {
+    let body = '';
+    req.on('data', (c) => (body += c));
+    req.on('end', () => {
+      res.setHeader('content-type', 'application/json');
+      if (req.url?.endsWith('/models') || req.url === '/models') {
+        res.end(
+          JSON.stringify({
+            data: [{ id: 'qwen2.5-coder:7b' }, { id: 'nomic-embed-text' }],
+          })
+        );
+        return;
+      }
+      if (req.url?.endsWith('/chat/completions')) {
+        state.chatCalls++;
+        // Look for the symbol body in the user message and echo a
+        // deterministic summary so the cache key is stable.
+        const parsed = JSON.parse(body) as {
+          messages: Array<{ content: string }>;
+        };
+        const userText = parsed.messages?.[0]?.content || '';
+        const last = userText.slice(-200);
+        res.end(
+          JSON.stringify({
+            choices: [
+              {
+                message: {
+                  role: 'assistant',
+                  content: 'Summary of: ' + last.replace(/\s+/g, ' ').slice(0, 80),
+                },
+              },
+            ],
+          })
+        );
+        return;
+      }
+      if (req.url?.endsWith('/embeddings')) {
+        state.embedCalls++;
+        const parsed = JSON.parse(body) as { input: string[] };
+        res.end(
+          JSON.stringify({
+            data: parsed.input.map((text) => ({ embedding: fakeEmbed(text) })),
+          })
+        );
+        return;
+      }
+      res.statusCode = 404;
+      res.end();
+    });
+  });
+  await new Promise<void>((r) => server.listen(0, '127.0.0.1', r));
+  const addr = server.address() as AddressInfo;
+  return {
+    url: `http://127.0.0.1:${addr.port}/v1`,
+    get chatCalls() {
+      return state.chatCalls;
+    },
+    get embedCalls() {
+      return state.embedCalls;
+    },
+    close: () =>
+      new Promise<void>((resolve, reject) =>
+        server.close((err) => (err ?
reject(err) : resolve())) + ), + }; +} + +describe('embedding helpers', () => { + it('vectorToBytes round-trips through bytesToVector', () => { + const v = l2(Float32Array.from([1, 2, 3, 4, 5, 6, 7, 8])); + const b = vectorToBytes(v); + const v2 = bytesToVector(b); + for (let i = 0; i < v.length; i++) { + expect(v2[i]).toBeCloseTo(v[i]!, 6); + } + }); + + it('cosineNormalised gives 1.0 for the same vector', () => { + const v = l2(Float32Array.from([1, 0, 0, 0, 0, 0, 0, 0])); + expect(cosineNormalised(v, v)).toBeCloseTo(1, 6); + }); + + it('cosineNormalised gives 0 for orthogonal vectors', () => { + const a = l2(Float32Array.from([1, 0, 0, 0, 0, 0, 0, 0])); + const b = l2(Float32Array.from([0, 1, 0, 0, 0, 0, 0, 0])); + expect(cosineNormalised(a, b)).toBeCloseTo(0, 6); + }); + + it('topKByCosine returns the highest-scoring node ids', () => { + const query = l2(Float32Array.from([1, 0, 0, 0, 0, 0, 0, 0])); + const candidates = [ + { nodeId: 'a', embedding: vectorToBytes(l2(Float32Array.from([0.9, 0.1, 0, 0, 0, 0, 0, 0]))) }, + { nodeId: 'b', embedding: vectorToBytes(l2(Float32Array.from([0, 1, 0, 0, 0, 0, 0, 0]))) }, + { nodeId: 'c', embedding: vectorToBytes(l2(Float32Array.from([0.5, 0.5, 0, 0, 0, 0, 0, 0]))) }, + ]; + const hits = topKByCosine(query, candidates, 2); + expect(hits.map((h) => h.nodeId)).toEqual(['a', 'c']); + }); + + it('RRF favors items appearing high in both rankings', () => { + const fts = [{ id: 'x' }, { id: 'y' }, { id: 'z' }]; + const sem = [{ id: 'y' }, { id: 'z' }, { id: 'x' }]; + const fused = reciprocalRankFusion([fts, sem]); + // y appears at rank 2 in fts (1/62) + rank 1 in sem (1/61) = highest + const sorted = [...fused.entries()].sort((a, b) => b[1] - a[1]).map(([id]) => id); + expect(sorted[0]).toBe('y'); + }); + + it('topKByCosineMatrix matches topKByCosine on the same data', () => { + const query = l2(Float32Array.from([1, 0, 0, 0, 0, 0, 0, 0])); + const vecs = [ + { id: 'a', v: l2(Float32Array.from([0.9, 0.1, 0, 0, 0, 0, 0, 0])) }, + { id: 'b', v: l2(Float32Array.from([0, 1, 0, 0, 0, 0, 0, 0])) }, + { id: 'c', v: l2(Float32Array.from([0.5, 0.5, 0, 0, 0, 0, 0, 0])) }, + ]; + const candidates = vecs.map((e) => ({ nodeId: e.id, embedding: vectorToBytes(e.v) })); + const matrix = new Float32Array(vecs.length * EMBED_DIM); + const ids = vecs.map((e) => e.id); + for (let i = 0; i < vecs.length; i++) matrix.set(vecs[i]!.v, i * EMBED_DIM); + + const a = topKByCosine(query, candidates, 3).map((h) => h.nodeId); + const b = topKByCosineMatrix(query, matrix, ids, EMBED_DIM, 3).map((h) => h.nodeId); + expect(b).toEqual(a); + }); + + it('EmbeddingCache returns the same result on hit and miss; invalidate forces refetch', () => { + let fetchCalls = 0; + const v = vectorToBytes(l2(Float32Array.from([1, 0, 0, 0, 0, 0, 0, 0]))); + const fetcher = { + getAllEmbeddings: (_model: string) => { + fetchCalls++; + return [{ nodeId: 'a', embedding: v }]; + }, + }; + + const cache = new EmbeddingCache(); + const r1 = cache.get(fetcher, 'm'); + const r2 = cache.get(fetcher, 'm'); + expect(fetchCalls).toBe(1); + expect(r1).toBe(r2); + expect(r1.ids).toEqual(['a']); + expect(r1.dim).toBe(EMBED_DIM); + + cache.invalidate(); + cache.get(fetcher, 'm'); + expect(fetchCalls).toBe(2); + + // Switching models also forces a refetch. 
+    cache.get(fetcher, 'other-model');
+    expect(fetchCalls).toBe(3);
+  });
+
+  it('EmbeddingCache skips rows whose dimension does not match the first row', () => {
+    const v3 = vectorToBytes(l2(Float32Array.from([1, 0, 0, 0, 0, 0, 0, 0])));
+    // Different shape: 4-dim vector. Should be skipped.
+    const v4 = Buffer.from(new Float32Array([1, 0, 0, 0]).buffer);
+    const fetcher = {
+      getAllEmbeddings: (_model: string) => [
+        { nodeId: 'good', embedding: v3 },
+        { nodeId: 'bad', embedding: v4 },
+        { nodeId: 'good2', embedding: v3 },
+      ],
+    };
+    const cache = new EmbeddingCache();
+    const r = cache.get(fetcher, 'm');
+    expect(r.ids).toEqual(['good', 'good2']);
+    expect(r.matrix.length).toBe(2 * EMBED_DIM);
+    expect(r.dim).toBe(EMBED_DIM);
+  });
+
+  it('EmbeddingCache returns an empty result without calling the fetcher again on hit', () => {
+    let fetchCalls = 0;
+    const fetcher = {
+      getAllEmbeddings: (_model: string) => {
+        fetchCalls++;
+        return [];
+      },
+    };
+    const cache = new EmbeddingCache();
+    const r = cache.get(fetcher, 'm');
+    expect(r.ids).toEqual([]);
+    expect(r.dim).toBe(0);
+    cache.get(fetcher, 'm');
+    expect(fetchCalls).toBe(1);
+  });
+});
+
+describe('CodeGraph hybrid search & similar', () => {
+  let tempDir: string;
+  let fake: FakeServer;
+
+  beforeEach(async () => {
+    tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-emb-'));
+    fake = await startFake();
+    fs.writeFileSync(
+      path.join(tempDir, 'sample.ts'),
+      `export function authenticateUser(name: string): string {
+  const token = 'secret';
+  const claim = 'session';
+  return name + token + claim;
+}
+
+export function lookupAccount(id: string): { id: string } {
+  const cache = new Map();
+  cache.set(id, { id });
+  return { id };
+}
+
+export class TokenStore {
+  private bag: Map<string, string> = new Map();
+  put(k: string, v: string): void { this.bag.set(k, v); }
+  get(k: string): string | undefined { return this.bag.get(k); }
+  size(): number { return this.bag.size; }
+}
+`
+    );
+    fs.writeFileSync(
+      path.join(tempDir, 'helper.py'),
+      `def authenticate_user(name):
+    token = 'secret'
+    claim = 'session'
+    return name + token + claim
+`
+    );
+  });
+
+  afterEach(async () => {
+    await fake.close();
+    fs.rmSync(tempDir, { recursive: true, force: true });
+  });
+
+  it('searchHybrid falls back to FTS when no embedding model is configured', async () => {
+    const cg = await CodeGraph.init(tempDir, {
+      config: {
+        llm: { endpoint: fake.url, chatModel: 'qwen2.5-coder:7b' },
+      },
+    });
+    try {
+      await cg.indexAll({ summarize: false });
+      const results = await cg.searchHybrid('authenticate', { limit: 5 });
+      expect(results.length).toBeGreaterThan(0);
+      // No embeddings in DB → no embed calls fired
+      expect(fake.embedCalls).toBe(0);
+    } finally {
+      cg.close();
+    }
+  });
+
+  it('background pass produces summaries AND embeddings end-to-end', async () => {
+    const cg = await CodeGraph.init(tempDir, {
+      config: {
+        llm: {
+          endpoint: fake.url,
+          chatModel: 'qwen2.5-coder:7b',
+          embeddingModel: 'nomic-embed-text',
+        },
+      },
+    });
+    try {
+      await cg.indexAll();
+      await cg.awaitBackgroundSummarization();
+
+      const cov = cg.getSummaryCoverage();
+      expect(cov.summarised).toBeGreaterThan(0);
+      // Embedding pass also ran (chat calls > 0 AND embed calls > 0)
+      expect(fake.chatCalls).toBeGreaterThan(0);
+      expect(fake.embedCalls).toBeGreaterThan(0);
+
+      // Re-running summarize is a cache hit; re-running embed should
+      // also be a cache hit (embedding_model already set).
+ const callsAfterFirst = fake.chatCalls + fake.embedCalls; + await cg.summarizeAll(); + // chat shouldn't fire again; embed pass not invoked here directly. + expect(fake.chatCalls + fake.embedCalls).toBe(callsAfterFirst); + } finally { + cg.close(); + } + }); + + it('searchHybrid returns FTS+semantic blended results once embeddings exist', async () => { + const cg = await CodeGraph.init(tempDir, { + config: { + llm: { + endpoint: fake.url, + chatModel: 'qwen2.5-coder:7b', + embeddingModel: 'nomic-embed-text', + }, + }, + }); + try { + await cg.indexAll(); + await cg.awaitBackgroundSummarization(); + + const results = await cg.searchHybrid('authenticateUser', { limit: 5 }); + expect(results.length).toBeGreaterThan(0); + // Hybrid path embedded the query (one extra embed call beyond + // the bulk-summary embeddings). + expect(fake.embedCalls).toBeGreaterThan(1); + } finally { + cg.close(); + } + }); + + it('findSimilar returns related symbols and respects differentLanguage', async () => { + const cg = await CodeGraph.init(tempDir, { + config: { + llm: { + endpoint: fake.url, + chatModel: 'qwen2.5-coder:7b', + embeddingModel: 'nomic-embed-text', + }, + }, + }); + try { + await cg.indexAll(); + await cg.awaitBackgroundSummarization(); + + const ts = cg.searchNodes('authenticateUser', { limit: 1 })[0]; + expect(ts).toBeDefined(); + + const similar = await cg.findSimilar(ts!.node.id, { limit: 3 }); + // Should exclude the source itself + expect(similar.find((r) => r.node.id === ts!.node.id)).toBeUndefined(); + + // Cross-language filter should only return non-TS hits (or empty) + const xLang = await cg.findSimilar(ts!.node.id, { limit: 3, differentLanguage: true }); + for (const r of xLang) { + expect(r.node.language).not.toBe(ts!.node.language); + } + } finally { + cg.close(); + } + }); +}); diff --git a/__tests__/extraction-resolution-accuracy.test.ts b/__tests__/extraction-resolution-accuracy.test.ts new file mode 100644 index 00000000..f78f3d76 --- /dev/null +++ b/__tests__/extraction-resolution-accuracy.test.ts @@ -0,0 +1,266 @@ +/** + * Extraction & Resolution Accuracy Tests + * + * Regression tests for three accuracy bugs fixed in one PR: + * 1. Parse-retry comment strip was hardcoded to `//`, no-op on Python/Ruby/etc. + * 2. Framework route extractors ran regex over raw file content, matching + * examples in docstrings/comments as real routes. + * 3. UTF-8 BOM caused spurious "modified" hash mismatches between editors. 
+ */
+
+import { describe, it, expect } from 'vitest';
+import { stripBom, stripCommentLinesForRetry, stripCommentsForRegex } from '../src/utils';
+import { hashContent } from '../src/extraction';
+import { flaskResolver, fastapiResolver, djangoResolver } from '../src/resolution/frameworks/python';
+import { expressResolver } from '../src/resolution/frameworks/express';
+import { aspnetResolver } from '../src/resolution/frameworks/csharp';
+import { rustResolver } from '../src/resolution/frameworks/rust';
+import { laravelResolver } from '../src/resolution/frameworks/laravel';
+
+describe('UTF-8 BOM normalization (bug #5)', () => {
+  it('stripBom removes leading U+FEFF', () => {
+    expect(stripBom('\uFEFFhello')).toBe('hello');
+    expect(stripBom('hello')).toBe('hello');
+    expect(stripBom('')).toBe('');
+  });
+
+  it('stripBom only removes leading BOM, not embedded ones', () => {
+    expect(stripBom('a\uFEFFb')).toBe('a\uFEFFb');
+  });
+
+  it('hashContent treats BOM and no-BOM as identical', () => {
+    const withBom = '\uFEFFexport function hello() { return 42; }';
+    const withoutBom = 'export function hello() { return 42; }';
+    expect(hashContent(withBom)).toBe(hashContent(withoutBom));
+  });
+});
+
+describe('Per-language comment-line stripping (bug #1)', () => {
+  it('strips `#` lines for Python', () => {
+    const input = ['# CHECK: foo', 'def x():', ' pass'].join('\n');
+    const out = stripCommentLinesForRetry(input, 'python');
+    expect(out.split('\n')).toEqual(['', 'def x():', ' pass']);
+  });
+
+  it('strips `#` lines for Ruby', () => {
+    const input = ['# top comment', 'def x; end'].join('\n');
+    const out = stripCommentLinesForRetry(input, 'ruby');
+    expect(out.split('\n')).toEqual(['', 'def x; end']);
+  });
+
+  it('strips `//` lines for TypeScript', () => {
+    const input = ['// header', 'function x() {}'].join('\n');
+    const out = stripCommentLinesForRetry(input, 'typescript');
+    expect(out.split('\n')).toEqual(['', 'function x() {}']);
+  });
+
+  it('strips both `//` and `#` lines for PHP', () => {
+    const input = ['// js-style', '# perl-style', '<?php echo 1;'].join('\n');
+    const out = stripCommentLinesForRetry(input, 'php');
+    expect(out.split('\n')).toEqual(['', '', '<?php echo 1;']);
+  });
+
+  it('returns the input unchanged for unknown languages', () => {
+    const input = '// looks like a comment\ncode';
+    expect(stripCommentLinesForRetry(input, 'unknown-lang')).toBe(input);
+  });
+
+  it('preserves line count so node positions stay correct', () => {
+    const input = ['# c1', 'a', '# c2', 'b'].join('\n');
+    const out = stripCommentLinesForRetry(input, 'python');
+    expect(out.split('\n').length).toBe(input.split('\n').length);
+  });
+
+  it('strips indented `#` comment lines but keeps mid-line `#` comments (Python)', () => {
+    // The marker matches optional leading whitespace + `#`, so an indented
+    // pure comment line is correctly stripped. Non-comment code on the same
+    // line as `#` (mid-line comment) is intentionally not stripped here.
+ const input = [' # indented comment', ' pass # trailing'].join('\n'); + const out = stripCommentLinesForRetry(input, 'python'); + expect(out.split('\n')).toEqual(['', ' pass # trailing']); + }); +}); + +describe('Framework regex no longer matches docstrings/comments (bug #4)', () => { + describe('Flask', () => { + it('skips routes inside `#` comments', () => { + const content = [ + 'from flask import Flask', + 'app = Flask(__name__)', + '# Example: @app.route("/fake")', + '@app.route("/real")', + 'def real(): pass', + ].join('\n'); + const nodes = flaskResolver.extractNodes!('app.py', content); + const paths = nodes.map((n) => n.name); + expect(paths).toContain('/real'); + expect(paths).not.toContain('/fake'); + }); + + it('skips routes inside triple-quoted docstrings', () => { + const content = [ + 'def example():', + ' """', + ' Usage: @app.route("/fake")', + ' """', + ' pass', + '@app.route("/real")', + 'def real(): pass', + ].join('\n'); + const nodes = flaskResolver.extractNodes!('app.py', content); + const paths = nodes.map((n) => n.name); + expect(paths).toContain('/real'); + expect(paths).not.toContain('/fake'); + }); + }); + + describe('FastAPI', () => { + it('skips routes inside `#` comments and triple-quoted docstrings', () => { + const content = [ + '"""', + 'Module docs — example: @app.get("/docfake")', + '"""', + '# @app.post("/commentfake")', + '@app.get("/real")', + 'def real(): pass', + ].join('\n'); + const nodes = fastapiResolver.extractNodes!('app.py', content); + const names = nodes.map((n) => n.name); + expect(names.some((n) => n.includes('/real'))).toBe(true); + expect(names.some((n) => n.includes('/docfake'))).toBe(false); + expect(names.some((n) => n.includes('/commentfake'))).toBe(false); + }); + + it('preserves correct line numbers for real routes after stripping', () => { + const content = [ + '"""', // line 1 + '@app.get("/fake")', // line 2 — inside docstring + '"""', // line 3 + '', // line 4 + '@app.get("/real")', // line 5 — real + ].join('\n'); + const nodes = fastapiResolver.extractNodes!('app.py', content); + const real = nodes.find((n) => n.name.includes('/real')); + expect(real).toBeDefined(); + expect(real!.startLine).toBe(5); + }); + }); + + describe('Django URL patterns', () => { + it('skips path() inside `#` comments', () => { + const content = [ + 'from django.urls import path', + '# example: path("fake/", fake_view)', + 'urlpatterns = [path("real/", real_view)]', + ].join('\n'); + const nodes = djangoResolver.extractNodes!('urls.py', content); + const names = nodes.map((n) => n.name); + expect(names).toContain('real/'); + expect(names).not.toContain('fake/'); + }); + }); + + describe('Express', () => { + it('skips routes inside `//` comments', () => { + const content = [ + 'const app = express();', + '// app.get("/fake", fakeHandler);', + 'app.get("/real", realHandler);', + ].join('\n'); + const nodes = expressResolver.extractNodes!('server.js', content); + const names = nodes.map((n) => n.name); + expect(names.some((n) => n.includes('/real'))).toBe(true); + expect(names.some((n) => n.includes('/fake'))).toBe(false); + }); + + it('skips routes inside `/* ... 
*/` block comments', () => {
+      const content = [
+        '/*',
+        ' * app.post("/blockfake", h);',
+        ' */',
+        'app.get("/real", h);',
+      ].join('\n');
+      const nodes = expressResolver.extractNodes!('server.js', content);
+      const names = nodes.map((n) => n.name);
+      expect(names.some((n) => n.includes('/real'))).toBe(true);
+      expect(names.some((n) => n.includes('/blockfake'))).toBe(false);
+    });
+  });
+
+  describe('Laravel', () => {
+    it('skips routes inside PHP `//` and `#` comments', () => {
+      const content = [
+        '<?php',
+        "// Route::get('/jsfake', fn () => 'x');",
+        "# Route::get('/perlfake', fn () => 'x');",
+        "Route::get('/real', fn () => 'x');",
+      ].join('\n');
+      const nodes = laravelResolver.extractNodes!('routes/web.php', content);
+      const names = nodes.map((n) => n.name);
+      expect(names.some((n) => n.includes('/real'))).toBe(true);
+      expect(names.some((n) => n.includes('/jsfake'))).toBe(false);
+      expect(names.some((n) => n.includes('/perlfake'))).toBe(false);
+    });
+  });
+
+  describe('Rust', () => {
+    it('skips actix/rocket routes inside `///` doc comments', () => {
+      const content = [
+        '/// Example route: #[get("/docfake")]',
+        '#[get("/real")]',
+        'fn real() {}',
+      ].join('\n');
+      const nodes = rustResolver.extractNodes!('main.rs', content);
+      const names = nodes.map((n) => n.name);
+      expect(names.some((n) => n.includes('/real'))).toBe(true);
+      expect(names.some((n) => n.includes('/docfake'))).toBe(false);
+    });
+  });
+
+  describe('ASP.NET (C#)', () => {
+    it('skips route attributes inside `///` XML doc comments', () => {
+      const content = [
+        '/// <summary>',
+        '/// Example: [HttpGet("/docfake")]',
+        '/// </summary>',
+        '[HttpGet("/real")]',
+        'public class C {}',
+      ].join('\n');
+      const nodes = aspnetResolver.extractNodes!('Controller.cs', content);
+      const names = nodes.map((n) => n.name);
+      expect(names.some((n) => n.includes('/real'))).toBe(true);
+      expect(names.some((n) => n.includes('/docfake'))).toBe(false);
+    });
+
+    it('skips minimal-API MapGet/MapPost calls inside comments', () => {
+      // Regression: the minimalApiPattern loop below the routePatterns
+      // loop was initially missed when applying the strip helper, leaving
+      // commented-out `app.MapGet("/x")` calls extracted as real routes.
+ const content = [ + '// app.MapGet("/linefake", h);', + '/*', + ' * app.MapPost("/blockfake", h);', + ' */', + 'app.MapGet("/real", h);', + ].join('\n'); + const nodes = aspnetResolver.extractNodes!('Program.cs', content); + const names = nodes.map((n) => n.name); + expect(names.some((n) => n.includes('/real'))).toBe(true); + expect(names.some((n) => n.includes('/linefake'))).toBe(false); + expect(names.some((n) => n.includes('/blockfake'))).toBe(false); + }); + }); +}); + +describe('stripCommentsForRegex preserves line offsets', () => { + it('keeps newlines so match.index → original line number', () => { + const input = '"""\n@app.get("/x")\n"""\n@app.get("/y")'; + const out = stripCommentsForRegex(input, 'python'); + // Newlines preserved + expect(out.split('\n').length).toBe(input.split('\n').length); + // The /y route survives + expect(out).toContain('/y'); + // The docstring contents are blanked + expect(out).not.toContain('/x'); + }); +}); diff --git a/__tests__/extraction.test.ts b/__tests__/extraction.test.ts index 8a70ffed..f279ae03 100644 --- a/__tests__/extraction.test.ts +++ b/__tests__/extraction.test.ts @@ -3079,3 +3079,655 @@ describe('Directory Exclusion', () => { expect(files.every((f) => !f.includes('vendor'))).toBe(true); }); }); + +// ============================================================================= +// R Extraction +// ============================================================================= + +describe('R Extraction', () => { + describe('Language detection', () => { + it('should detect R files', () => { + expect(detectLanguage('script.R')).toBe('r'); + expect(detectLanguage('utils.r')).toBe('r'); + }); + + it('should report R as supported', () => { + expect(isLanguageSupported('r')).toBe(true); + expect(getSupportedLanguages()).toContain('r'); + }); + }); + + describe('Function extraction', () => { + it('should extract a function defined with <-', () => { + const code = `add <- function(a, b) { + a + b +}`; + const result = extractFromSource('main.R', code); + const fn = result.nodes.find((n) => n.kind === 'function' && n.name === 'add'); + expect(fn).toBeDefined(); + expect(fn?.signature).toBe('(a, b)'); + }); + + it('should extract a function defined with =', () => { + const code = `subtract = function(a, b) a - b`; + const result = extractFromSource('main.R', code); + const fn = result.nodes.find((n) => n.kind === 'function' && n.name === 'subtract'); + expect(fn).toBeDefined(); + }); + + it('should extract a function defined with <<-', () => { + const code = `divide <<- function(a, b) a / b`; + const result = extractFromSource('main.R', code); + const fn = result.nodes.find((n) => n.kind === 'function' && n.name === 'divide'); + expect(fn).toBeDefined(); + }); + + it('should extract S3 method names verbatim (period in name)', () => { + const code = `print.myClass <- function(x, ...) 
cat(x$value)`; + const result = extractFromSource('print.R', code); + const fn = result.nodes.find((n) => n.kind === 'function' && n.name === 'print.myClass'); + expect(fn).toBeDefined(); + }); + + it('should NOT emit anonymous function nodes for inline lambdas', () => { + const code = `result <- lapply(xs, function(x) x * 2)`; + const result = extractFromSource('main.R', code); + expect(result.nodes.find((n) => n.kind === 'function')).toBeUndefined(); + }); + + it('should attach a docstring from preceding roxygen comments', () => { + const code = `#' Add two numbers +#' @param a numeric +#' @param b numeric +add <- function(a, b) a + b`; + const result = extractFromSource('main.R', code); + const fn = result.nodes.find((n) => n.kind === 'function' && n.name === 'add'); + expect(fn?.docstring).toContain('Add two numbers'); + }); + }); + + describe('Call extraction', () => { + it('should extract simple function calls inside a function body', () => { + const code = `wrap <- function(x) { + inner(x) + another(x) +}`; + const result = extractFromSource('main.R', code); + const fn = result.nodes.find((n) => n.kind === 'function' && n.name === 'wrap')!; + const calls = result.unresolvedReferences.filter( + (r) => r.fromNodeId === fn.id && r.referenceKind === 'calls' + ); + const calleeNames = calls.map((c) => c.referenceName); + expect(calleeNames).toContain('inner'); + expect(calleeNames).toContain('another'); + }); + + it('should preserve namespace operator in callee name (pkg::fn)', () => { + const code = `runner <- function() { + dplyr::filter(df, x > 0) +}`; + const result = extractFromSource('main.R', code); + const fn = result.nodes.find((n) => n.kind === 'function' && n.name === 'runner')!; + const calleeNames = result.unresolvedReferences + .filter((r) => r.fromNodeId === fn.id) + .map((r) => r.referenceName); + expect(calleeNames).toContain('dplyr::filter'); + }); + }); + + describe('Imports', () => { + it('should extract library() with bare-identifier argument', () => { + const code = `library(dplyr)`; + const result = extractFromSource('main.R', code); + const importNode = result.nodes.find((n) => n.kind === 'import'); + expect(importNode?.name).toBe('dplyr'); + }); + + it('should extract library() with quoted-string argument', () => { + const code = `library("tidyr")`; + const result = extractFromSource('main.R', code); + const importNode = result.nodes.find((n) => n.kind === 'import' && n.name === 'tidyr'); + expect(importNode).toBeDefined(); + }); + + it('should extract require() the same way as library()', () => { + const code = `require(ggplot2)`; + const result = extractFromSource('main.R', code); + const importNode = result.nodes.find((n) => n.kind === 'import' && n.name === 'ggplot2'); + expect(importNode).toBeDefined(); + }); + + it('should extract source() with a string path', () => { + const code = `source("helpers.R")`; + const result = extractFromSource('main.R', code); + const importNode = result.nodes.find((n) => n.kind === 'import' && n.name === 'helpers.R'); + expect(importNode).toBeDefined(); + }); + + it('should not emit an import node for a dynamic source() argument', () => { + const code = `source(paste0(BASE, "/helpers.R"))`; + const result = extractFromSource('main.R', code); + const imports = result.nodes.filter((n) => n.kind === 'import'); + expect(imports.length).toBe(0); + }); + + it('should unquote R 4.0+ raw string literals (round delimiter)', () => { + const code = `source(r"(helpers.R)")`; + const result = extractFromSource('main.R', code); + const 
importNode = result.nodes.find((n) => n.kind === 'import' && n.name === 'helpers.R'); + expect(importNode).toBeDefined(); + }); + + it('should unquote R raw strings with bracket and brace delimiters', () => { + const r1 = extractFromSource('a.R', `library(R"[mypkg]")`); + const r2 = extractFromSource('b.R', `library(r"{mypkg}")`); + expect(r1.nodes.find((n) => n.kind === 'import' && n.name === 'mypkg')).toBeDefined(); + expect(r2.nodes.find((n) => n.kind === 'import' && n.name === 'mypkg')).toBeDefined(); + }); + + it('should unquote dash-delimited raw strings used to embed quotes', () => { + const code = `source(r"-(file.R)-")`; + const result = extractFromSource('main.R', code); + const importNode = result.nodes.find((n) => n.kind === 'import' && n.name === 'file.R'); + expect(importNode).toBeDefined(); + }); + }); + + describe('Top-level constants', () => { + it('should extract top-level non-function assignments as constants', () => { + const code = `PI <- 3.14159 +COLORS <- c("red", "green")`; + const result = extractFromSource('main.R', code); + const pi = result.nodes.find((n) => n.kind === 'constant' && n.name === 'PI'); + const colors = result.nodes.find((n) => n.kind === 'constant' && n.name === 'COLORS'); + expect(pi).toBeDefined(); + expect(colors).toBeDefined(); + }); + + it('should NOT emit a constant for assignments inside a function body', () => { + const code = `outer <- function() { + x <- 5 + x +}`; + const result = extractFromSource('main.R', code); + const innerVar = result.nodes.find((n) => n.kind === 'constant' && n.name === 'x'); + expect(innerVar).toBeUndefined(); + }); + }); +}); + +// HCL / Terraform Extraction +// ============================================================================= + +describe('HCL / Terraform Extraction', () => { + describe('Language detection', () => { + it('should detect HCL/Terraform files', () => { + expect(detectLanguage('main.tf')).toBe('hcl'); + expect(detectLanguage('terraform.tfvars')).toBe('hcl'); + expect(detectLanguage('config.hcl')).toBe('hcl'); + }); + + it('should report HCL as supported', () => { + expect(isLanguageSupported('hcl')).toBe(true); + expect(getSupportedLanguages()).toContain('hcl'); + }); + }); + + describe('Block extraction', () => { + it('should extract a resource block as a class node', () => { + const code = `resource "aws_s3_bucket" "logs" { bucket = "my-logs" }`; + const result = extractFromSource('main.tf', code); + + const node = result.nodes.find((n) => n.qualifiedName === 'aws_s3_bucket.logs'); + expect(node).toBeDefined(); + expect(node?.kind).toBe('class'); + expect(node?.name).toBe('aws_s3_bucket.logs'); + expect(node?.language).toBe('hcl'); + expect(node?.signature).toBe('resource "aws_s3_bucket" "logs"'); + }); + + it('should extract a data block with `data.` prefix', () => { + const code = `data "aws_caller_identity" "current" {}`; + const result = extractFromSource('main.tf', code); + + const node = result.nodes.find((n) => n.qualifiedName === 'data.aws_caller_identity.current'); + expect(node).toBeDefined(); + expect(node?.kind).toBe('class'); + expect(node?.name).toBe('aws_caller_identity.current'); + }); + + it('should extract a variable block', () => { + const code = `variable "environment" { type = string }`; + const result = extractFromSource('main.tf', code); + + const node = result.nodes.find((n) => n.qualifiedName === 'var.environment'); + expect(node).toBeDefined(); + expect(node?.kind).toBe('variable'); + expect(node?.name).toBe('environment'); + }); + + it('should extract an 
output block as an export', () => { + const code = `output "vpc_id" { value = "abc" }`; + const result = extractFromSource('main.tf', code); + + const node = result.nodes.find((n) => n.qualifiedName === 'output.vpc_id'); + expect(node).toBeDefined(); + expect(node?.kind).toBe('export'); + expect(node?.name).toBe('vpc_id'); + }); + + it('should extract a module block', () => { + const code = `module "vpc" { source = "terraform-aws-modules/vpc/aws" }`; + const result = extractFromSource('main.tf', code); + + const node = result.nodes.find((n) => n.qualifiedName === 'module.vpc'); + expect(node).toBeDefined(); + expect(node?.kind).toBe('module'); + expect(node?.name).toBe('vpc'); + }); + + it('should extract a provider block as namespace', () => { + const code = `provider "aws" { region = "us-east-1" }`; + const result = extractFromSource('main.tf', code); + + const node = result.nodes.find((n) => n.qualifiedName === 'provider.aws'); + expect(node).toBeDefined(); + expect(node?.kind).toBe('namespace'); + }); + + it('should split a locals block into one constant per attribute', () => { + const code = `locals { + bucket_name = "my-bucket" + retention = 30 +}`; + const result = extractFromSource('main.tf', code); + + const bucketName = result.nodes.find((n) => n.qualifiedName === 'local.bucket_name'); + const retention = result.nodes.find((n) => n.qualifiedName === 'local.retention'); + expect(bucketName?.kind).toBe('constant'); + expect(retention?.kind).toBe('constant'); + }); + + it('should connect blocks to the file via contains edges', () => { + const code = `resource "aws_s3_bucket" "logs" {}`; + const result = extractFromSource('main.tf', code); + + const fileNode = result.nodes.find((n) => n.kind === 'file'); + const resourceNode = result.nodes.find((n) => n.qualifiedName === 'aws_s3_bucket.logs'); + expect(fileNode).toBeDefined(); + expect(resourceNode).toBeDefined(); + const containsEdge = result.edges.find( + (e) => e.source === fileNode!.id && e.target === resourceNode!.id && e.kind === 'contains' + ); + expect(containsEdge).toBeDefined(); + }); + }); + + describe('Reference extraction', () => { + it('should extract var.X references', () => { + const code = `resource "aws_s3_bucket" "logs" { bucket = var.bucket_name }`; + const result = extractFromSource('main.tf', code); + + const ref = result.unresolvedReferences.find((r) => r.referenceName === 'var.bucket_name'); + expect(ref).toBeDefined(); + expect(ref?.referenceKind).toBe('references'); + }); + + it('should extract local.X references', () => { + const code = `resource "aws_s3_bucket" "logs" { tags = local.common_tags }`; + const result = extractFromSource('main.tf', code); + + const ref = result.unresolvedReferences.find((r) => r.referenceName === 'local.common_tags'); + expect(ref).toBeDefined(); + }); + + it('should extract module.X references and stop at the module name', () => { + const code = `output "vpc_id" { value = module.vpc.vpc_id }`; + const result = extractFromSource('main.tf', code); + + const ref = result.unresolvedReferences.find((r) => r.referenceName === 'module.vpc'); + expect(ref).toBeDefined(); + // Should NOT emit a reference for the trailing attribute + expect(result.unresolvedReferences.find((r) => r.referenceName === 'module.vpc.vpc_id')).toBeUndefined(); + }); + + it('should extract data.T.N references with both labels', () => { + const code = `output "x" { value = data.aws_caller_identity.current.account_id }`; + const result = extractFromSource('main.tf', code); + + const ref = 
result.unresolvedReferences.find( + (r) => r.referenceName === 'data.aws_caller_identity.current' + ); + expect(ref).toBeDefined(); + }); + + it('should extract resource references as TYPE.NAME', () => { + const code = `resource "aws_s3_bucket_versioning" "v" { bucket = aws_s3_bucket.logs.id }`; + const result = extractFromSource('main.tf', code); + + const ref = result.unresolvedReferences.find((r) => r.referenceName === 'aws_s3_bucket.logs'); + expect(ref).toBeDefined(); + }); + + it('should extract references inside string interpolations', () => { + const code = 'locals { name = "${var.environment}-${random_id.suffix.hex}" }'; + const result = extractFromSource('main.tf', code); + + const names = result.unresolvedReferences.map((r) => r.referenceName); + expect(names).toContain('var.environment'); + expect(names).toContain('random_id.suffix'); + }); + + it('should ignore references to count, each, self, and path', () => { + const code = `resource "aws_instance" "web" { + count = 3 + tags = { Name = "web-\${count.index}", For = each.value, Self = self.id, P = path.module } +}`; + const result = extractFromSource('main.tf', code); + + const names = result.unresolvedReferences.map((r) => r.referenceName); + expect(names.find((n) => n.startsWith('count.'))).toBeUndefined(); + expect(names.find((n) => n.startsWith('each.'))).toBeUndefined(); + expect(names.find((n) => n.startsWith('self.'))).toBeUndefined(); + expect(names.find((n) => n.startsWith('path.'))).toBeUndefined(); + }); + + it('should ignore for-loop iteration variables', () => { + const code = `output "ids" { value = [for s in var.subnets : s.id] }`; + const result = extractFromSource('main.tf', code); + + const names = result.unresolvedReferences.map((r) => r.referenceName); + // var.subnets reference comes through, but `s.id` does NOT + expect(names).toContain('var.subnets'); + expect(names.find((n) => n.startsWith('s.'))).toBeUndefined(); + }); + + it('should ignore key/value bindings in for-object expressions', () => { + const code = `locals { tags = { for k, v in var.input : k => "\${v}-suffix" } }`; + const result = extractFromSource('main.tf', code); + + const names = result.unresolvedReferences.map((r) => r.referenceName); + expect(names).toContain('var.input'); + expect(names.find((n) => n === 'k' || n.startsWith('k.'))).toBeUndefined(); + expect(names.find((n) => n === 'v' || n.startsWith('v.'))).toBeUndefined(); + }); + + it('should emit an imports edge for module source', () => { + const code = `module "vpc" { source = "terraform-aws-modules/vpc/aws" }`; + const result = extractFromSource('main.tf', code); + + const importRef = result.unresolvedReferences.find( + (r) => r.referenceKind === 'imports' && r.referenceName === 'terraform-aws-modules/vpc/aws' + ); + expect(importRef).toBeDefined(); + }); + }); + + describe('Robustness', () => { + it('should handle empty files', () => { + const result = extractFromSource('main.tf', ''); + const fileNode = result.nodes.find((n) => n.kind === 'file'); + expect(fileNode).toBeDefined(); + }); + + it('should handle blocks with no body', () => { + const code = `data "aws_caller_identity" "current" {}`; + const result = extractFromSource('main.tf', code); + expect(result.nodes.find((n) => n.qualifiedName === 'data.aws_caller_identity.current')).toBeDefined(); + }); + + it('should walk nested blocks for references without emitting child nodes', () => { + const code = `resource "aws_s3_bucket_versioning" "v" { + bucket = aws_s3_bucket.logs.id + versioning_configuration { + status = 
var.versioning_status + } +}`; + const result = extractFromSource('main.tf', code); + + // Only one block-level node, plus the file + const blockNodes = result.nodes.filter((n) => n.kind === 'class'); + expect(blockNodes.length).toBe(1); + + // References from the nested block should still be captured + const names = result.unresolvedReferences.map((r) => r.referenceName); + expect(names).toContain('aws_s3_bucket.logs'); + expect(names).toContain('var.versioning_status'); + }); + }); +}); + + +// ============================================================================= +// SQL Extraction +// ============================================================================= + +describe('SQL Extraction', () => { + describe('Language detection', () => { + it('should detect SQL files', () => { + expect(detectLanguage('schema.sql')).toBe('sql'); + expect(detectLanguage('migrations/001.ddl')).toBe('sql'); + expect(detectLanguage('seed.dml')).toBe('sql'); + }); + + it('should report SQL as supported', () => { + expect(isLanguageSupported('sql')).toBe(true); + expect(getSupportedLanguages()).toContain('sql'); + }); + }); + + describe('CREATE TABLE', () => { + it('should extract a table as a class node', () => { + const code = `CREATE TABLE users (id INT PRIMARY KEY, email VARCHAR(255));`; + const result = extractFromSource('schema.sql', code); + const node = result.nodes.find((n) => n.kind === 'class' && n.name === 'users'); + expect(node).toBeDefined(); + expect(node?.signature).toBe('CREATE TABLE users'); + }); + + it('should preserve schema-qualified table names', () => { + const code = `CREATE TABLE reporting.events (id INT);`; + const result = extractFromSource('schema.sql', code); + const node = result.nodes.find((n) => n.kind === 'class' && n.name === 'reporting.events'); + expect(node).toBeDefined(); + }); + + it('should extract inline foreign-key references', () => { + const code = `CREATE TABLE orders (id INT, user_id INT REFERENCES users(id));`; + const result = extractFromSource('schema.sql', code); + const orders = result.nodes.find((n) => n.name === 'orders')!; + const fk = result.unresolvedReferences.find( + (r) => r.fromNodeId === orders.id && r.referenceName === 'users' && r.referenceKind === 'references' + ); + expect(fk).toBeDefined(); + }); + + it('should extract CONSTRAINT-style foreign keys', () => { + const code = `CREATE TABLE orders ( + id INT, + user_id INT, + CONSTRAINT fk_user FOREIGN KEY (user_id) REFERENCES users(id) ON DELETE CASCADE +);`; + const result = extractFromSource('schema.sql', code); + const orders = result.nodes.find((n) => n.name === 'orders')!; + const fk = result.unresolvedReferences.find( + (r) => r.fromNodeId === orders.id && r.referenceName === 'users' + ); + expect(fk).toBeDefined(); + }); + + it('should add a contains edge from the file to each table', () => { + const code = `CREATE TABLE a (id INT); CREATE TABLE b (id INT);`; + const result = extractFromSource('schema.sql', code); + const file = result.nodes.find((n) => n.kind === 'file')!; + const a = result.nodes.find((n) => n.name === 'a')!; + const b = result.nodes.find((n) => n.name === 'b')!; + expect(result.edges).toContainEqual(expect.objectContaining({ source: file.id, target: a.id, kind: 'contains' })); + expect(result.edges).toContainEqual(expect.objectContaining({ source: file.id, target: b.id, kind: 'contains' })); + }); + }); + + describe('CREATE VIEW', () => { + it('should extract a view as a class node', () => { + const code = `CREATE VIEW active_users AS SELECT id FROM users;`; + 
const result = extractFromSource('views.sql', code); + const view = result.nodes.find((n) => n.kind === 'class' && n.name === 'active_users'); + expect(view).toBeDefined(); + }); + + it('should record references to source tables in the view query', () => { + const code = `CREATE VIEW user_orders AS + SELECT u.id, COUNT(o.id) AS n + FROM users u + LEFT JOIN orders o ON o.user_id = u.id;`; + const result = extractFromSource('views.sql', code); + const view = result.nodes.find((n) => n.name === 'user_orders')!; + const refs = result.unresolvedReferences + .filter((r) => r.fromNodeId === view.id) + .map((r) => r.referenceName); + expect(refs).toContain('users'); + expect(refs).toContain('orders'); + }); + + it('should de-duplicate identical references in the same scope', () => { + const code = `CREATE VIEW double_users AS + SELECT * FROM users JOIN users u2 ON u2.id = users.id;`; + const result = extractFromSource('views.sql', code); + const view = result.nodes.find((n) => n.name === 'double_users')!; + const usersRefs = result.unresolvedReferences.filter( + (r) => r.fromNodeId === view.id && r.referenceName === 'users' + ); + expect(usersRefs.length).toBe(1); + }); + + it('should walk into derived-table subqueries to find inner table refs', () => { + const code = `CREATE VIEW v AS + SELECT * FROM (SELECT id FROM users) u JOIN orders o ON o.user_id = u.id;`; + const result = extractFromSource('views.sql', code); + const view = result.nodes.find((n) => n.name === 'v')!; + const refs = result.unresolvedReferences + .filter((r) => r.fromNodeId === view.id) + .map((r) => r.referenceName); + expect(refs).toContain('users'); + expect(refs).toContain('orders'); + }); + }); + + describe('CREATE FUNCTION', () => { + it('should extract a function with signature', () => { + const code = `CREATE FUNCTION add(a INT, b INT) RETURNS INT AS 'SELECT a + b' LANGUAGE SQL;`; + const result = extractFromSource('fns.sql', code); + const fn = result.nodes.find((n) => n.kind === 'function' && n.name === 'add'); + expect(fn).toBeDefined(); + expect(fn?.signature).toContain('(a INT, b INT)'); + }); + + it('should handle CREATE OR REPLACE FUNCTION', () => { + const code = `CREATE OR REPLACE FUNCTION calc(x INT) RETURNS INT AS 'SELECT x * 2' LANGUAGE SQL;`; + const result = extractFromSource('fns.sql', code); + const fn = result.nodes.find((n) => n.kind === 'function' && n.name === 'calc'); + expect(fn).toBeDefined(); + }); + + it('should label a CREATE FUNCTION signature with CREATE FUNCTION', () => { + const code = `CREATE FUNCTION add(a INT) RETURNS INT AS 'SELECT a + 1' LANGUAGE SQL;`; + const result = extractFromSource('fns.sql', code); + const fn = result.nodes.find((n) => n.kind === 'function' && n.name === 'add'); + expect(fn?.signature).toContain('CREATE FUNCTION'); + expect(fn?.signature).not.toContain('CREATE PROCEDURE'); + }); + }); + + describe('CREATE TRIGGER', () => { + it('should extract a trigger with target-table reference and called function', () => { + const code = `CREATE TRIGGER orders_audit +AFTER INSERT ON orders +FOR EACH ROW +EXECUTE FUNCTION audit_orders();`; + const result = extractFromSource('triggers.sql', code); + const trigger = result.nodes.find((n) => n.kind === 'function' && n.name === 'orders_audit'); + expect(trigger).toBeDefined(); + + const refs = result.unresolvedReferences.filter((r) => r.fromNodeId === trigger!.id); + const tableRef = refs.find((r) => r.referenceName === 'orders' && r.referenceKind === 'references'); + const callRef = refs.find((r) => r.referenceName === 
'audit_orders' && r.referenceKind === 'calls'); + expect(tableRef).toBeDefined(); + expect(callRef).toBeDefined(); + }); + + it('should still locate target/function across an UPDATE OF column list', () => { + const code = `CREATE TRIGGER t +BEFORE UPDATE OF col1, col2 ON orders +FOR EACH ROW +EXECUTE FUNCTION audit_cols();`; + const result = extractFromSource('triggers.sql', code); + const trigger = result.nodes.find((n) => n.name === 't')!; + const refs = result.unresolvedReferences.filter((r) => r.fromNodeId === trigger.id); + expect(refs.find((r) => r.referenceName === 'orders' && r.referenceKind === 'references')).toBeDefined(); + expect(refs.find((r) => r.referenceName === 'audit_cols' && r.referenceKind === 'calls')).toBeDefined(); + }); + }); + + describe('CREATE TYPE', () => { + it('should extract an enum type as an enum node', () => { + const code = `CREATE TYPE order_status AS ENUM ('pending', 'shipped', 'cancelled');`; + const result = extractFromSource('types.sql', code); + const node = result.nodes.find((n) => n.name === 'order_status'); + expect(node?.kind).toBe('enum'); + }); + + it('should extract a non-enum CREATE TYPE as a type_alias', () => { + const code = `CREATE TYPE point AS (x FLOAT, y FLOAT);`; + const result = extractFromSource('types.sql', code); + const node = result.nodes.find((n) => n.name === 'point'); + expect(node?.kind).toBe('type_alias'); + }); + }); + + describe('CREATE SCHEMA', () => { + it('should extract a schema as a namespace node', () => { + const code = `CREATE SCHEMA reporting;`; + const result = extractFromSource('schemas.sql', code); + const node = result.nodes.find((n) => n.name === 'reporting'); + expect(node?.kind).toBe('namespace'); + }); + }); + + describe('Robustness', () => { + it('should not error on plain SELECT/INSERT/UPDATE statements', () => { + const code = `SELECT * FROM users; +INSERT INTO orders (id) VALUES (1); +UPDATE users SET email = 'x';`; + const result = extractFromSource('queries.sql', code); + expect(result.errors.filter((e) => e.severity === 'error').length).toBe(0); + const nonFile = result.nodes.filter((n) => n.kind !== 'file'); + expect(nonFile.length).toBe(0); + }); + + it('should not emit nodes for CREATE INDEX', () => { + const code = `CREATE INDEX idx_users_email ON users(email);`; + const result = extractFromSource('idx.sql', code); + const nonFile = result.nodes.filter((n) => n.kind !== 'file'); + expect(nonFile.length).toBe(0); + }); + + it('should handle multiple statements without leaking state', () => { + const code = `CREATE TABLE a (id INT); +CREATE TABLE b (id INT, a_id INT REFERENCES a(id)); +CREATE VIEW c AS SELECT * FROM a JOIN b ON b.a_id = a.id;`; + const result = extractFromSource('multi.sql', code); + const a = result.nodes.find((n) => n.name === 'a'); + const b = result.nodes.find((n) => n.name === 'b'); + const c = result.nodes.find((n) => n.name === 'c'); + expect(a).toBeDefined(); + expect(b).toBeDefined(); + expect(c).toBeDefined(); + + const bRefs = result.unresolvedReferences.filter((r) => r.fromNodeId === b!.id); + const cRefs = result.unresolvedReferences.filter((r) => r.fromNodeId === c!.id); + expect(bRefs.map((r) => r.referenceName)).toContain('a'); + expect(cRefs.map((r) => r.referenceName)).toContain('a'); + expect(cRefs.map((r) => r.referenceName)).toContain('b'); + }); + }); +}); diff --git a/__tests__/foundation.test.ts b/__tests__/foundation.test.ts index 9ee437da..bd6e957d 100644 --- a/__tests__/foundation.test.ts +++ b/__tests__/foundation.test.ts @@ -305,7 +305,7 @@ 
describe('Database Connection', () => { const version = db.getSchemaVersion(); expect(version).not.toBeNull(); - expect(version?.version).toBe(3); + expect(version?.version).toBe(16); db.close(); }); diff --git a/__tests__/index-hooks.test.ts b/__tests__/index-hooks.test.ts new file mode 100644 index 00000000..639587f9 --- /dev/null +++ b/__tests__/index-hooks.test.ts @@ -0,0 +1,130 @@ +/** + * Index-hook framework: register a fake hook at runtime, run an + * indexAll/sync against a synthetic project, assert the hook ran + * with the expected context shape and that errors are caught. + * + * The registry's static-import list (`REGISTERED_HOOKS`) is empty + * on main today; tests poke at the runner directly through + * `runAfterIndexAll`/`runAfterSync` rather than mutating that + * list. + */ +import { describe, it, expect } from 'vitest'; +import { + runAfterIndexAll, + runAfterSync, + getRegisteredHooks, + type IndexHook, + type IndexHookContext, +} from '../src/index-hooks/registry'; +import type { SyncResult } from '../src/extraction'; + +function makeFakeContext(): IndexHookContext { + // Hooks should not mutate the context; for the runner-shape + // tests we hand them stubs typed `as any` — the runner doesn't + // touch any of these fields itself. + return { + projectRoot: '/tmp/fake-project', + /* eslint-disable @typescript-eslint/no-explicit-any */ + config: {} as any, + queries: {} as any, + db: {} as any, + /* eslint-enable */ + }; +} + +const fakeSyncResult: SyncResult = { + filesChecked: 0, + filesAdded: 0, + filesModified: 0, + filesRemoved: 0, + nodesUpdated: 0, + durationMs: 0, +}; + +describe('index-hooks registry — runner', () => { + it('registered hooks expose stable {name, afterIndexAll|afterSync} shape', () => { + const hooks = getRegisteredHooks(); + expect(hooks.length).toBeGreaterThanOrEqual(0); + for (const h of hooks) { + expect(typeof h.name).toBe('string'); + expect(h.afterIndexAll === undefined || typeof h.afterIndexAll === 'function').toBe(true); + expect(h.afterSync === undefined || typeof h.afterSync === 'function').toBe(true); + } + }); + + it('runAfterIndexAll returns one outcome per registered hook, swallowing per-hook errors', async () => { + // Registered hooks will throw on the fake `{} as any` ctx; the + // runner contract is to catch + report each error so one bad + // hook never fails the whole pass. + const outcomes = await runAfterIndexAll(makeFakeContext()); + const expectedCount = getRegisteredHooks().filter((h) => h.afterIndexAll).length; + expect(outcomes.length).toBe(expectedCount); + for (const o of outcomes) { + expect(typeof o.name).toBe('string'); + expect(o.phase).toBe('indexAll'); + expect(typeof o.durationMs).toBe('number'); + } + }); + + it('runAfterSync returns one outcome per registered hook, swallowing per-hook errors', async () => { + const outcomes = await runAfterSync(makeFakeContext(), fakeSyncResult); + const expectedCount = getRegisteredHooks().filter((h) => h.afterSync).length; + expect(outcomes.length).toBe(expectedCount); + for (const o of outcomes) { + expect(typeof o.name).toBe('string'); + expect(o.phase).toBe('sync'); + expect(typeof o.durationMs).toBe('number'); + } + }); +}); + +describe('index-hooks runner — fake-hook injection', () => { + // Helper: temporarily inject a fake hook by wrapping the runner + // directly. The runner accepts no array argument today; this + // suite exercises the public surface (runAfterIndexAll / + // runAfterSync) by simulating what a registered hook would do. 
+ // When real hooks land, REGISTERED_HOOKS in registry.ts will + // contain them and this fixture-style approach disappears. + + it('a hook with afterIndexAll receives the context and is awaited', async () => { + // Build a one-off hook and call it directly — the runner's + // contract is "for each registered hook, await afterIndexAll + // if defined." We exercise that contract by calling the hook + // ourselves to confirm the IndexHookContext shape stays usable + // by hook implementations. + let captured: IndexHookContext | null = null; + const hook: IndexHook = { + name: 'fake-hook', + async afterIndexAll(ctx) { + captured = ctx; + }, + }; + const ctx = makeFakeContext(); + await hook.afterIndexAll!(ctx); + expect(captured).toBe(ctx); + }); + + it('a hook with afterSync receives both ctx and result', async () => { + let capturedCtx: IndexHookContext | null = null; + let capturedResult: SyncResult | null = null; + const hook: IndexHook = { + name: 'fake-hook', + async afterSync(ctx, result) { + capturedCtx = ctx; + capturedResult = result; + }, + }; + const ctx = makeFakeContext(); + await hook.afterSync!(ctx, fakeSyncResult); + expect(capturedCtx).toBe(ctx); + expect(capturedResult).toBe(fakeSyncResult); + }); + + it('a hook missing afterIndexAll is silently skipped', () => { + // Just a typing assertion: an IndexHook without afterIndexAll + // is allowed (both methods are optional). + const hook: IndexHook = { name: 'sync-only' }; + expect(hook.afterIndexAll).toBeUndefined(); + expect(hook.afterSync).toBeUndefined(); + }); +}); diff --git a/__tests__/issue-history.test.ts b/__tests__/issue-history.test.ts new file mode 100644 index 00000000..7c281771 --- /dev/null +++ b/__tests__/issue-history.test.ts @@ -0,0 +1,390 @@ +/** + * Issue → symbol attribution: parser unit tests + end-to-end mining + * against synthetic git repos. 
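+ * Suites that need a real repository probe for a working `git` binary
+ * first and are skipped via describe.skipIf when it is unavailable.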
+ */
+import { describe, it, expect, beforeEach, afterEach } from 'vitest';
+import * as fs from 'fs';
+import * as os from 'os';
+import * as path from 'path';
+import { execFileSync } from 'child_process';
+import {
+  extractSymbolFromContext,
+  extractDeclaration,
+} from '../src/issue-history/parse-diff';
+import {
+  mineIssueCommits,
+  mineIssueHistory,
+  ISSUE_REGEX,
+  LAST_MINED_ISSUES_HEAD_KEY,
+} from '../src/issue-history';
+import CodeGraph from '../src/index';
+
+let HAS_GIT = true;
+try {
+  execFileSync('git', ['--version'], { stdio: 'ignore' });
+} catch {
+  HAS_GIT = false;
+}
+
+let testDir: string;
+let cg: CodeGraph | null = null;
+
+function git(...args: string[]): string {
+  return execFileSync('git', args, {
+    cwd: testDir,
+    encoding: 'utf-8',
+    env: {
+      ...process.env,
+      GIT_AUTHOR_NAME: 'Test',
+      GIT_AUTHOR_EMAIL: 'test@example.com',
+      GIT_COMMITTER_NAME: 'Test',
+      GIT_COMMITTER_EMAIL: 'test@example.com',
+      GIT_AUTHOR_DATE: process.env.GIT_AUTHOR_DATE,
+      GIT_COMMITTER_DATE: process.env.GIT_COMMITTER_DATE,
+    },
+    stdio: ['pipe', 'pipe', 'pipe'],
+  }).trim();
+}
+
+function commitAt(date: string, files: Record<string, string>, message: string) {
+  for (const [rel, content] of Object.entries(files)) {
+    const abs = path.join(testDir, rel);
+    fs.mkdirSync(path.dirname(abs), { recursive: true });
+    fs.writeFileSync(abs, content);
+  }
+  git('add', '-A');
+  process.env.GIT_AUTHOR_DATE = date;
+  process.env.GIT_COMMITTER_DATE = date;
+  git('commit', '-m', message);
+  delete process.env.GIT_AUTHOR_DATE;
+  delete process.env.GIT_COMMITTER_DATE;
+}
+
+beforeEach(() => {
+  testDir = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-issues-'));
+});
+
+afterEach(() => {
+  delete process.env.GIT_AUTHOR_DATE;
+  delete process.env.GIT_COMMITTER_DATE;
+  if (cg) {
+    cg.destroy();
+    cg = null;
+  }
+  if (fs.existsSync(testDir)) fs.rmSync(testDir, { recursive: true, force: true });
+});
+
+// ============================================================================
+// Pure parser unit tests
+// ============================================================================
+
+describe('ISSUE_REGEX', () => {
+  it('matches all canonical Fixes/Closes/Resolves verbs', () => {
+    const cases = [
+      'Fix #1', 'Fixes #2', 'Fixed #3',
+      'Close #4', 'Closes #5', 'Closed #6',
+      'Resolve #7', 'Resolves #8', 'Resolved #9',
+    ];
+    for (const s of cases) {
+      ISSUE_REGEX.lastIndex = 0;
+      expect(ISSUE_REGEX.test(s)).toBe(true);
+    }
+  });
+
+  it('matches multiple issues in a single body', () => {
+    ISSUE_REGEX.lastIndex = 0;
+    const matches = [...'Fixes #1, closes #2 and resolves #3'.matchAll(ISSUE_REGEX)];
+    expect(matches.map((m) => m[1])).toEqual(['1', '2', '3']);
+  });
+
+  it('is case-insensitive', () => {
+    ISSUE_REGEX.lastIndex = 0;
+    expect(ISSUE_REGEX.test('FIXES #42')).toBe(true);
+  });
+
+  it('does NOT match `#N` without a verb', () => {
+    ISSUE_REGEX.lastIndex = 0;
+    // A message body that mentions #99 without a verb prefix must not match.
+    expect(ISSUE_REGEX.test('See #99 for context')).toBe(false);
+  });
+
+  it('v1 limitation: `Fixes #1, #2` only captures #1', () => {
+    // Documented behavior — the second issue lacks a verb prefix and
+    // is silently dropped. Authors who care can write `Fixes #1, fixes #2`.
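+    // (ISSUE_REGEX is a /g regex; String.prototype.matchAll requires the
+    // global flag, hence the lastIndex reset before every use in this file.)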
+ ISSUE_REGEX.lastIndex = 0; + const matches = [...'Fixes #1, #2'.matchAll(ISSUE_REGEX)]; + expect(matches.map((m) => m[1])).toEqual(['1']); + }); +}); + +describe('extractSymbolFromContext', () => { + it('pulls function name from a TS function context', () => { + expect(extractSymbolFromContext('function processOrder(order: Order) {')).toBe('processOrder'); + }); + it('pulls class name', () => { + expect(extractSymbolFromContext('class UserService {')).toBe('UserService'); + }); + it('pulls Python def', () => { + expect(extractSymbolFromContext('def compute_score(items):')).toBe('compute_score'); + }); + it('pulls Go func', () => { + expect(extractSymbolFromContext('func ProcessOrder(o *Order) error {')).toBe('ProcessOrder'); + }); + it('pulls method-style ` async foo(`', () => { + expect(extractSymbolFromContext(' async foo(args: string) {')).toBe('foo'); + }); + it('rejects keyword-only contexts', () => { + expect(extractSymbolFromContext(' if (x) {')).toBeNull(); + }); + it('returns null on empty input', () => { + expect(extractSymbolFromContext('')).toBeNull(); + }); +}); + +describe('extractDeclaration', () => { + it('captures + function decl', () => { + expect(extractDeclaration('+function helper() {')).toEqual({ name: 'helper', sign: '+' }); + }); + it('captures - class decl', () => { + expect(extractDeclaration('-export class Old {')).toEqual({ name: 'Old', sign: '-' }); + }); + it('captures Python def', () => { + expect(extractDeclaration('+def my_helper(x):')).toEqual({ name: 'my_helper', sign: '+' }); + }); + it('captures Go func with receiver', () => { + expect(extractDeclaration('+func (s *Service) DoThing() error {')).toEqual({ + name: 'DoThing', + sign: '+', + }); + }); + it('skips file-marker `+++` and `---` lines', () => { + expect(extractDeclaration('+++ b/src/foo.ts')).toBeNull(); + expect(extractDeclaration('--- a/src/foo.ts')).toBeNull(); + }); + it('skips keywords like `+if`', () => { + expect(extractDeclaration('+ if (x) return;')).toBeNull(); + }); + it('returns null on context lines (no +/-)', () => { + expect(extractDeclaration(' some body line')).toBeNull(); + }); +}); + +// ============================================================================ +// Git mining: synthetic repo +// ============================================================================ + +describe.skipIf(!HAS_GIT)('mineIssueCommits', () => { + beforeEach(() => { + git('init', '-q', '-b', 'main'); + git('config', 'commit.gpgsign', 'false'); + }); + + it('finds commits with `Fixes #N` in the subject', () => { + commitAt('2025-01-01T00:00:00Z', { 'a.ts': 'a' }, 'feat: add a (no issue)'); + commitAt('2025-01-02T00:00:00Z', { 'a.ts': 'a2' }, 'fix: bug. Fixes #42'); + const commits = mineIssueCommits(testDir, null); + expect(commits.length).toBe(1); + expect(commits[0]!.issues).toEqual([42]); + }); + + it('parses multi-issue subjects', () => { + commitAt('2025-01-01T00:00:00Z', { 'a.ts': 'a' }, 'fix: triple. 
Fixes #1, closes #2, resolves #3'); + const [c] = mineIssueCommits(testDir, null); + expect(c?.issues).toEqual([1, 2, 3]); + }); + + it('ignores commits with no issue ref', () => { + commitAt('2025-01-01T00:00:00Z', { 'a.ts': 'a' }, 'plain message'); + expect(mineIssueCommits(testDir, null).length).toBe(0); + }); + + it('returns [] when not in a git repo', () => { + const nonGit = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-nogit-')); + try { + expect(mineIssueCommits(nonGit, null)).toEqual([]); + } finally { + fs.rmSync(nonGit, { recursive: true, force: true }); + } + }); +}); + +// ============================================================================ +// End-to-end through CodeGraph +// ============================================================================ + +describe.skipIf(!HAS_GIT)('CodeGraph issue history', () => { + beforeEach(() => { + git('init', '-q', '-b', 'main'); + git('config', 'commit.gpgsign', 'false'); + }); + + it('attributes a Fixes #N commit to the modified function', async () => { + commitAt('2025-01-01T00:00:00Z', { + 'src/a.ts': `export function foo() { return 1; }\n`, + }, 'feat: add foo'); + + commitAt('2025-02-01T00:00:00Z', { + 'src/a.ts': `export function foo() {\n // changed\n return 2;\n}\n`, + }, 'fix: bug. Fixes #42'); + + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + + const node = cg.getNodesInFile('src/a.ts').find((n) => n.name === 'foo')!; + expect(node).toBeDefined(); + const issues = cg.getIssuesForNode(node.id); + expect(issues.length).toBeGreaterThan(0); + expect(issues.some((i) => i.issueNumber === 42)).toBe(true); +}); + + it('tracks the agent-usable multi-issue signal', async () => { + // Simulate the codegraph history pattern: `loadGrammarsForLanguages` + // touched by every language-add issue (#54, #82, #83, #85). + commitAt('2025-01-01T00:00:00Z', { + 'src/grammar.ts': `export function loadGrammarsForLanguages() { return []; }\n`, + }, 'feat: add grammar loader'); + + commitAt('2025-01-02T00:00:00Z', { + 'src/grammar.ts': `export function loadGrammarsForLanguages() {\n // R support\n return [];\n}\n`, + }, 'feat: add R support. Fixes #82'); + + commitAt('2025-01-03T00:00:00Z', { + 'src/grammar.ts': `export function loadGrammarsForLanguages() {\n // R + HCL support\n return [];\n}\n`, + }, 'feat: add HCL. Fixes #83'); + + commitAt('2025-01-04T00:00:00Z', { + 'src/grammar.ts': `export function loadGrammarsForLanguages() {\n // R + HCL + SQL\n return [];\n}\n`, + }, 'feat: add SQL. Fixes #85'); + + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + + const node = cg.getNodesByKind("function").find((n) => n.name === 'loadGrammarsForLanguages')!; + expect(node).toBeDefined(); + const issues = cg.getIssuesForNode(node.id); + const issueNumbers = [...new Set(issues.map((i) => i.issueNumber))].sort((a, b) => a - b); + expect(issueNumbers).toEqual([82, 83, 85]); + }); + + it('records `added` kind for symbols introduced in a Fixes commit', async () => { + commitAt('2025-01-01T00:00:00Z', { + 'src/a.ts': `export function existing() { return 1; }\n`, + }, 'init'); + + commitAt('2025-02-01T00:00:00Z', { + 'src/a.ts': `export function existing() { return 1; }\nexport function brandNew() { return 2; }\n`, + }, 'feat: add brandNew. 
Fixes #100'); + + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + + const node = cg.getNodesByKind("function").find((n) => n.name === 'brandNew')!; + const issues = cg.getIssuesForNode(node.id); + expect(issues.some((i) => i.issueNumber === 100 && i.kind === 'added')).toBe(true); + }); + + it('drops attributions for symbols that no longer exist', async () => { + // Symbol added then removed in two separate `Fixes` commits. The + // current index has no node for it, so attributions for the removed + // symbol must not appear (FK + drop-on-resolve). + commitAt('2025-01-01T00:00:00Z', { + 'src/a.ts': `export function staysHere() { return 1; }\nexport function temporary() { return 99; }\n`, + }, 'feat: add. Fixes #1'); + + commitAt('2025-02-01T00:00:00Z', { + 'src/a.ts': `export function staysHere() { return 1; }\n`, + }, 'fix: drop temporary. Fixes #2'); + + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + + // staysHere should have at least the #1 attribution (added). + const node = cg.getNodesByKind("function").find((n) => n.name === 'staysHere')!; + const issues = cg.getIssuesForNode(node.id); + expect(issues.some((i) => i.issueNumber === 1)).toBe(true); + + // No node should exist named `temporary`, and no attribution to + // issue #2 should reference a node that doesn't exist. + expect(cg.getNodesByKind("function").find((n) => n.name === 'temporary')).toBeUndefined(); + }); + + it('survives indexAll outside a git repo (table empty, no errors)', async () => { + fs.rmSync(path.join(testDir, '.git'), { recursive: true, force: true }); + fs.writeFileSync(path.join(testDir, 'a.ts'), `export function x() { return 1; }\n`); + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + const nodes = cg.getNodesInFile('a.ts'); + expect(nodes.length).toBeGreaterThan(0); + for (const n of nodes) expect(cg.getIssuesForNode(n.id)).toEqual([]); + }); + + it('respects enableIssueHistory=false', async () => { + commitAt('2025-01-01T00:00:00Z', { + 'src/a.ts': `export function foo() { return 1; }\n`, + }, 'init'); + commitAt('2025-01-02T00:00:00Z', { + 'src/a.ts': `export function foo() { return 2; }\n`, + }, 'fix: foo. Fixes #1'); + + cg = CodeGraph.initSync(testDir, { + config: { include: ['**/*.ts'], exclude: [], enableIssueHistory: false }, + }); + await cg.indexAll(); + const node = cg.getNodesInFile('src/a.ts').find((n) => n.name === 'foo')!; + expect(cg.getIssuesForNode(node.id)).toEqual([]); + }); + + it('incrementally picks up new Fixes commits on sync', async () => { + commitAt('2025-01-01T00:00:00Z', { + 'src/a.ts': `export function foo() { return 1; }\n`, + }, 'init'); + + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + const node = cg.getNodesInFile('src/a.ts').find((n) => n.name === 'foo')!; + expect(cg.getIssuesForNode(node.id).length).toBe(0); + + commitAt('2025-02-01T00:00:00Z', { + 'src/a.ts': `export function foo() { return 2; }\n`, + }, 'fix: foo. Fixes #50'); + await cg.sync(); + + const issues = cg.getIssuesForNode(node.id); + expect(issues.some((i) => i.issueNumber === 50)).toBe(true); + }); + + // (Removed: a defensive test for the v4-migration-collision bug class. 
+ // With file-based migrations (NNN-name.ts), two migrations claiming + // the same version produces a filesystem-level conflict — the silent + // skip the defensive guard protected against can no longer happen.) + + it('recovers from an unreachable last_mined_issues_head', async () => { + commitAt('2025-01-01T00:00:00Z', { + 'src/a.ts': `export function foo() { return 1; }\n`, + }, 'init'); + commitAt('2025-02-01T00:00:00Z', { + 'src/a.ts': `export function foo() { return 2; }\n`, + }, 'fix: foo. Fixes #1'); + + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + const node = cg.getNodesInFile('src/a.ts').find((n) => n.name === 'foo')!; + expect( + [...new Set(cg.getIssuesForNode(node.id).map((i) => i.issueNumber))] + ).toEqual([1]); + + // Simulate force-push / gc by storing an unreachable SHA. + // eslint-disable-next-line @typescript-eslint/no-explicit-any + (cg as any).queries.setMetadata(LAST_MINED_ISSUES_HEAD_KEY, '0'.repeat(40)); + + commitAt('2025-03-01T00:00:00Z', { + 'src/a.ts': `export function foo() { return 3; }\n`, + }, 'fix: foo again. Fixes #2'); + await cg.sync(); + + const issueNums = [ + ...new Set(cg.getIssuesForNode(node.id).map((i) => i.issueNumber)), + ].sort((a, b) => a - b); + expect(issueNums).toEqual([1, 2]); + }); +}); diff --git a/__tests__/language-registry.test.ts b/__tests__/language-registry.test.ts new file mode 100644 index 00000000..9afdd59a --- /dev/null +++ b/__tests__/language-registry.test.ts @@ -0,0 +1,157 @@ +/** + * Language registry: structural invariants. + * + * These tests guard against the "parallel list" failure mode that + * the registry refactor exists to prevent. If a future PR adds a + * grammar-backed language but forgets to wire it through one of + * the derived consumers, one of these tests should catch it. + */ +import { describe, it, expect } from 'vitest'; +import { + getLanguageDefs, + getLanguageDefByExtension, + getLanguageDefByName, +} from '../src/extraction/languages/registry'; +import { EXTRACTORS } from '../src/extraction/languages'; +import { + detectLanguage, + isLanguageSupported, + getSupportedLanguages, + getLanguageDisplayName, + EXTENSION_MAP, +} from '../src/extraction/grammars'; + +describe('language registry — single source of truth', () => { + it('has at least the original 19 languages', () => { + const defs = getLanguageDefs(); + expect(defs.length).toBeGreaterThanOrEqual(19); + }); + + it('every def has unique non-empty name', () => { + const names = new Set(); + for (const def of getLanguageDefs()) { + expect(def.name).toBeTruthy(); + expect(names.has(def.name)).toBe(false); + names.add(def.name); + } + }); + + it('extensions are unique across registry (one ext maps to one language)', () => { + const seen = new Map(); + for (const def of getLanguageDefs()) { + for (const ext of def.extensions) { + const lower = ext.toLowerCase(); + if (seen.has(lower)) { + // The .h ambiguity (C vs C++) is intentionally pinned to C + // by the registry; tree-sitter.ts has a content-sniff + // override. Anything else duplicating extensions is a bug. 
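+          // For example, a second def claiming '.ts' while typescript
+          // already owns it should fail right here.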
+          throw new Error(
+            `Extension ${lower} mapped twice: ${seen.get(lower)} and ${def.name}`
+          );
+        }
+        seen.set(lower, def.name);
+      }
+    }
+  });
+
+  it('grammar-backed defs have wasmFile + extractor', () => {
+    for (const def of getLanguageDefs()) {
+      if (!def.grammar) continue;
+      expect(def.grammar.wasmFile).toMatch(/^tree-sitter-.+\.wasm$/);
+      expect(def.grammar.extractor).toBeDefined();
+    }
+  });
+
+  it('custom-extractor defs have a customExtractor function', () => {
+    for (const def of getLanguageDefs()) {
+      if (def.grammar) continue; // grammar-backed
+      expect(def.customExtractor).toBeInstanceOf(Function);
+    }
+  });
+});
+
+describe('derived consumers stay in sync with the registry', () => {
+  // Catch the "parallel list drift" bug that motivated this refactor.
+  // If a new language gets added to the registry but a derived consumer
+  // still hard-codes the old set, one of these will fail.
+
+  it('EXTRACTORS contains exactly the grammar-backed languages', () => {
+    const grammarBacked = getLanguageDefs()
+      .filter((d) => d.grammar)
+      .map((d) => d.name)
+      .sort();
+    const extractorKeys = Object.keys(EXTRACTORS).sort();
+    expect(extractorKeys).toEqual(grammarBacked);
+  });
+
+  it('every grammar-backed extractor matches def.grammar.extractor exactly', () => {
+    for (const def of getLanguageDefs()) {
+      if (!def.grammar) continue;
+      expect(EXTRACTORS[def.name as keyof typeof EXTRACTORS]).toBe(def.grammar.extractor);
+    }
+  });
+
+  it('EXTENSION_MAP entries exactly mirror registry extensions', () => {
+    const expected = new Map<string, string>();
+    for (const def of getLanguageDefs()) {
+      for (const ext of def.extensions) {
+        expected.set(ext.toLowerCase(), def.name);
+      }
+    }
+    for (const [ext, lang] of expected) {
+      expect(EXTENSION_MAP[ext]).toBe(lang);
+    }
+    // Reverse: no extra keys in EXTENSION_MAP.
+    expect(Object.keys(EXTENSION_MAP).sort()).toEqual([...expected.keys()].sort());
+  });
+
+  it('detectLanguage returns the expected name for every registered extension', () => {
+    for (const def of getLanguageDefs()) {
+      for (const ext of def.extensions) {
+        // .h is pinned to C by the registry; the C++ heuristic only
+        // applies when source is provided AND looks like C++.
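+        // e.g. detectLanguage('x.py') resolves to 'python', and
+        // detectLanguage('x.h') to 'c' per the pin above.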
+ expect(detectLanguage(`x${ext}`)).toBe(def.name); + } + } + }); + + it('isLanguageSupported returns true for every registered language and false for unknown', () => { + for (const def of getLanguageDefs()) { + expect(isLanguageSupported(def.name as never)).toBe(true); + } + expect(isLanguageSupported('unknown' as never)).toBe(false); + }); + + it('getSupportedLanguages returns exactly the registry names', () => { + const fromRegistry = getLanguageDefs().map((d) => d.name).sort(); + const supported = (getSupportedLanguages() as string[]).sort(); + expect(supported).toEqual(fromRegistry); + }); + + it('getLanguageDisplayName uses each defs displayName', () => { + for (const def of getLanguageDefs()) { + expect(getLanguageDisplayName(def.name as never)).toBe(def.displayName); + } + }); +}); + +describe('lookup helpers', () => { + it('getLanguageDefByName returns the def for a registered name', () => { + expect(getLanguageDefByName('typescript')?.displayName).toBe('TypeScript'); + }); + + it('getLanguageDefByName returns undefined for unknown names', () => { + expect(getLanguageDefByName('nonexistent-language-name')).toBeUndefined(); + }); + + it('getLanguageDefByExtension is case-insensitive', () => { + expect(getLanguageDefByExtension('.TS')?.name).toBe('typescript'); + expect(getLanguageDefByExtension('.ts')?.name).toBe('typescript'); + }); + + it('Pascal extensionOverrides routes .dfm and .fmx to a customExtractor', () => { + const def = getLanguageDefByName('pascal'); + expect(def?.extensionOverrides?.['.dfm']?.customExtractor).toBeInstanceOf(Function); + expect(def?.extensionOverrides?.['.fmx']?.customExtractor).toBeInstanceOf(Function); + }); +}); diff --git a/__tests__/llm-tiers.test.ts b/__tests__/llm-tiers.test.ts new file mode 100644 index 00000000..2fe588b5 --- /dev/null +++ b/__tests__/llm-tiers.test.ts @@ -0,0 +1,381 @@ +/** + * Tier 1 #3, Tier 2 #4/#5, Tier 3 #7/#8: directory summaries, role + * classifier, change-intent, dead-code judge, naming drift. + * + * Same in-process fake-Ollama pattern as llm.test.ts. The fake's + * chat handler returns deterministic JSON for the prompts that + * expect it (classifier, dead-code, naming) so we can assert ordering + * and parsing. + */ + +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; +import * as http from 'http'; +import { AddressInfo } from 'net'; +import { CodeGraph } from '../src'; + +interface FakeServer { + url: string; + chatCalls: number; + /** Lets a test override the next chat response. 
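+   * Set once per test; the fake consumes and clears it on the next
+   * /chat/completions call.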
+   */
+  nextChatText: string | null;
+  close: () => Promise<void>;
+}
+
+async function startFake(): Promise<FakeServer> {
+  const state: { chatCalls: number; nextChatText: string | null } = {
+    chatCalls: 0,
+    nextChatText: null,
+  };
+  const server = http.createServer((req, res) => {
+    let body = '';
+    req.on('data', (c) => (body += c));
+    req.on('end', () => {
+      res.setHeader('content-type', 'application/json');
+      if (req.url?.endsWith('/models') || req.url === '/models') {
+        res.end(JSON.stringify({ data: [{ id: 'qwen2.5-coder:7b' }] }));
+        return;
+      }
+      if (req.url?.endsWith('/chat/completions')) {
+        state.chatCalls++;
+        const parsed = JSON.parse(body) as { messages: Array<{ content: string }> };
+        const userText = parsed.messages?.[0]?.content || '';
+        let text: string;
+        if (state.nextChatText !== null) {
+          text = state.nextChatText;
+          state.nextChatText = null;
+        } else if (userText.includes('Reply with EXACTLY one JSON object')) {
+          // Could be classifier-style or judge-style; default to a
+          // benign verdict object that satisfies dead-code parsing.
+          if (userText.includes('"verdict"')) {
+            text = '{"verdict": "uncertain", "confidence": 0.5, "reason": "test stub"}';
+          } else if (userText.includes('"consistent"')) {
+            text = '{"consistent": true, "suggestion": "", "reason": "test stub"}';
+          } else {
+            text = 'unknown';
+          }
+        } else if (userText.includes('Classify the following code symbol')) {
+          text = 'business_logic';
+        } else if (userText.includes('Module summary:')) {
+          text = 'Coordinates a small module that does test things.';
+        } else {
+          text = 'Test stub summary line.';
+        }
+        res.end(
+          JSON.stringify({
+            choices: [{ message: { role: 'assistant', content: text } }],
+          })
+        );
+        return;
+      }
+      if (req.url?.endsWith('/embeddings')) {
+        const parsed = JSON.parse(body) as { input: string[] };
+        const fake = (s: string): number[] => {
+          const v = new Array(8).fill(0);
+          for (let i = 0; i < s.length; i++) v[i % 8] += s.charCodeAt(i) % 11;
+          return v;
+        };
+        res.end(
+          JSON.stringify({ data: parsed.input.map((s) => ({ embedding: fake(s) })) })
+        );
+        return;
+      }
+      res.statusCode = 404;
+      res.end();
+    });
+  });
+  await new Promise<void>((r) => server.listen(0, '127.0.0.1', r));
+  const addr = server.address() as AddressInfo;
+  return {
+    url: `http://127.0.0.1:${addr.port}/v1`,
+    get chatCalls() {
+      return state.chatCalls;
+    },
+    set nextChatText(v: string | null) {
+      state.nextChatText = v;
+    },
+    get nextChatText() {
+      return state.nextChatText;
+    },
+    close: () =>
+      new Promise<void>((resolve, reject) =>
+        server.close((err) => (err ? reject(err) : resolve()))
+      ),
+  };
+}
+
+describe('Tier extensions', () => {
+  let tempDir: string;
+  let fake: FakeServer;
+
+  beforeEach(async () => {
+    tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-tiers-'));
+    fake = await startFake();
+    // Two files in two different dirs to give the directory summarizer
+    // and naming-drift checker enough siblings to be meaningful.
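+    // token.ts carries two functions plus a class, helpers.ts three
+    // functions; both clear the 3-symbol directory-summary threshold
+    // asserted further down.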
+ fs.mkdirSync(path.join(tempDir, 'src', 'auth'), { recursive: true }); + fs.mkdirSync(path.join(tempDir, 'src', 'util'), { recursive: true }); + fs.writeFileSync( + path.join(tempDir, 'src', 'auth', 'token.ts'), + `export function createToken(user: string): string { + const payload = { user }; + const sig = 'fake'; + return JSON.stringify(payload) + sig; +} + +export function verifyToken(token: string): boolean { + const valid = token.length > 0; + const checked = true; + return valid && checked; +} + +export class TokenStore { + private bag: Map = new Map(); + put(k: string, v: string): void { this.bag.set(k, v); } + get(k: string): string | undefined { return this.bag.get(k); } + size(): number { return this.bag.size; } +} +` + ); + fs.writeFileSync( + path.join(tempDir, 'src', 'util', 'helpers.ts'), + `export function formatDate(d: Date): string { + const y = d.getFullYear(); + const m = d.getMonth(); + return y + '-' + m; +} + +export function clamp(n: number, min: number, max: number): number { + if (n < min) return min; + if (n > max) return max; + return n; +} + +export function debounce(fn: () => void, ms: number): () => void { + let t: ReturnType | undefined; + return () => { + if (t) clearTimeout(t); + t = setTimeout(fn, ms); + }; +} +` + ); + }); + + afterEach(async () => { + await fake.close(); + fs.rmSync(tempDir, { recursive: true, force: true }); + }); + + it('directory summary text round-trips correctly (column-order regression)', async () => { + // The fake server returns "Coordinates a small module..." for the + // dir-summarizer prompt. If the SQL bind order is wrong we'd see a + // hex content_hash come back instead of that paragraph. + const cg = await CodeGraph.init(tempDir, { + config: { + llm: { endpoint: fake.url, chatModel: 'qwen2.5-coder:7b' }, + }, + }); + try { + await cg.indexAll(); + await cg.awaitBackgroundSummarization(); + + const all = cg.getAllDirectorySummaries(); + expect(all.length).toBeGreaterThan(0); + for (const { summary } of all) { + // Summaries must be prose, not 32-char hex (which would be + // a content_hash bleeding into the wrong column). + expect(summary).not.toMatch(/^[0-9a-f]{32}$/); + expect(summary.length).toBeGreaterThan(20); + } + } finally { + cg.close(); + } + }); + + it('background pass writes directory summaries and role labels', async () => { + const cg = await CodeGraph.init(tempDir, { + config: { + llm: { + endpoint: fake.url, + chatModel: 'qwen2.5-coder:7b', + embeddingModel: 'fake-embed', + }, + }, + }); + try { + await cg.indexAll(); + await cg.awaitBackgroundSummarization(); + + // Directory summaries: at least one of the two source dirs + // should have one (3+ symbol threshold). + const dirs = cg.getAllDirectorySummaries(); + expect(dirs.length).toBeGreaterThan(0); + + // Role classification: every summarised symbol should have a + // role assigned (classifier returns "business_logic" for our + // fake responses). 
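+      // getRoleCounts() yields a role → count map; summing its values
+      // counts every classified symbol.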
+ const counts = cg.getRoleCounts(); + expect([...counts.values()].reduce((a, b) => a + b, 0)).toBeGreaterThan(0); + + // findNodesByRole returns the matching nodes + const businessLogic = cg.findNodesByRole('business_logic', 100); + expect(businessLogic.length).toBeGreaterThan(0); + } finally { + cg.close(); + } + }); + + it('summarizeChange honors before-only and after-only modes', async () => { + const cg = await CodeGraph.init(tempDir, { + config: { llm: { endpoint: fake.url, chatModel: 'qwen2.5-coder:7b' } }, + }); + try { + const added = await cg.summarizeChange( + 'newFn', + 'function', + '', + 'function newFn() { return 1; }' + ); + expect(added.intent.length).toBeGreaterThan(0); + + const removed = await cg.summarizeChange( + 'oldFn', + 'function', + 'function oldFn() { return 1; }', + '' + ); + expect(removed.intent.length).toBeGreaterThan(0); + } finally { + cg.close(); + } + }); + + it('findDeadCodeCandidates returns parsed verdicts', async () => { + const cg = await CodeGraph.init(tempDir, { + config: { llm: { endpoint: fake.url, chatModel: 'qwen2.5-coder:7b' } }, + }); + try { + await cg.indexAll({ summarize: false }); + const result = await cg.findDeadCodeCandidates({ maxCandidates: 5 }); + // Real assertions: judged ≤ candidates, no errors on the fake + // server, and every verdict carries a parsed confidence in + // [0, 1] from one of the three known labels. + expect(result.candidates).toBeGreaterThanOrEqual(result.judged); + expect(result.errors).toBe(0); + for (const r of result.results) { + expect(['dead', 'live', 'uncertain']).toContain(r.verdict); + expect(r.confidence).toBeGreaterThanOrEqual(0); + expect(r.confidence).toBeLessThanOrEqual(1); + } + } finally { + cg.close(); + } + }); + + it('parseRole accepts canonical, fenced, multi-word, and trailing-punct inputs', async () => { + const { parseRole } = await import('../src/llm/classifier'); + // Canonical + expect(parseRole('business_logic')).toBe('business_logic'); + // Trailing punctuation + expect(parseRole('business_logic.')).toBe('business_logic'); + // Fenced + quotes + expect(parseRole('`business_logic`')).toBe('business_logic'); + // Title-cased multi-word — the case the reviewer flagged. + expect(parseRole('Business Logic')).toBe('business_logic'); + expect(parseRole('Api Endpoint')).toBe('api_endpoint'); + // Garbage falls through to "unknown" (advisory degrade). + expect(parseRole('I think this is a util maybe')).toBe('unknown'); + }); + + it('agent bridge: pendingSummariesBatch + saveAgentSummaries round-trip without LLM', async () => { + // No config.llm — exercises the path users without Ollama would take. + const cg = await CodeGraph.init(tempDir); + try { + await cg.indexAll({ summarize: false }); + + const batch = cg.pendingSummariesBatch({ limit: 5, modelHint: 'claude-test' }); + expect(batch.items.length).toBeGreaterThan(0); + expect(batch.total).toBeGreaterThanOrEqual(batch.items.length); + // Each item should have a non-empty body and a content_hash. + for (const it of batch.items) { + expect(it.body.length).toBeGreaterThan(0); + expect(it.contentHash.length).toBe(32); + } + + // Pretend the agent answered each one with a fake summary. + const saved = cg.saveAgentSummaries( + batch.items.map((it) => ({ + nodeId: it.nodeId, + contentHash: it.contentHash, + summary: `Agent-summarised ${it.name}`, + })), + 'claude-test' + ); + expect(saved.saved).toBe(batch.items.length); + expect(saved.skipped).toBe(0); + + // Coverage now reflects the writes. 
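+      // (Agent-saved summaries count toward getSummaryCoverage() the
+      // same as LLM-generated ones.)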
+ const cov = cg.getSummaryCoverage(); + expect(cov.summarised).toBeGreaterThanOrEqual(batch.items.length); + + // Re-issuing the batch with the same modelHint should NOT return + // the same items again (cache short-circuit). + const batch2 = cg.pendingSummariesBatch({ limit: 5, modelHint: 'claude-test' }); + const overlap = batch2.items.filter((b) => + batch.items.some((a) => a.nodeId === b.nodeId) + ); + expect(overlap.length).toBe(0); + } finally { + cg.close(); + } + }); + + it('agent bridge: stale content_hash is rejected with a clear error', async () => { + const cg = await CodeGraph.init(tempDir); + try { + await cg.indexAll({ summarize: false }); + const batch = cg.pendingSummariesBatch({ limit: 1 }); + const item = batch.items[0]!; + const result = cg.saveAgentSummaries( + [ + { + nodeId: item.nodeId, + contentHash: 'cccccccccccccccccccccccccccccccc', // stale + summary: 'wrong cache key', + }, + ], + 'claude-test' + ); + expect(result.saved).toBe(0); + expect(result.skipped).toBe(1); + expect(result.errors[0]).toMatch(/content_hash drifted/); + } finally { + cg.close(); + } + }); + + it('checkNamingDrift returns advisory consistent/suggestion shape', async () => { + const cg = await CodeGraph.init(tempDir, { + config: { llm: { endpoint: fake.url, chatModel: 'qwen2.5-coder:7b' } }, + }); + try { + await cg.indexAll({ summarize: false }); + // Override response so we can assert parsing of an inconsistent verdict + fake.nextChatText = + '{"consistent": false, "suggestion": "createSession", "reason": "siblings use create* prefix"}'; + + const verdict = await cg.checkNamingDrift({ + name: 'makeSession', + kind: 'function', + filePath: 'src/auth/new.ts', + }); + expect(verdict.consistent).toBe(false); + expect(verdict.suggestion).toBe('createSession'); + } finally { + cg.close(); + } + }); +}); diff --git a/__tests__/llm.test.ts b/__tests__/llm.test.ts new file mode 100644 index 00000000..e4bc48b6 --- /dev/null +++ b/__tests__/llm.test.ts @@ -0,0 +1,366 @@ +/** + * LLM auto-detect + background summarisation tests + * + * Spins up a tiny in-process HTTP server that mimics the OpenAI-compat + * surface Ollama exposes. Covers: + * - detectLocalLlm picks a chat model from /v1/models + * - LlmClient.isReachable / listModels round-trip + * - summarizeAll content_hash cache: re-running is a pure cache hit + * - CodeGraph.startBackgroundSummarization is fire-and-forget + * - cancellation via AbortController on close() + */ + +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; +import * as http from 'http'; +import { AddressInfo } from 'net'; +import { CodeGraph } from '../src'; +import { LlmClient } from '../src/llm/client'; +import { detectLocalLlm } from '../src/llm/detect'; + +interface FakeServerOptions { + models?: string[]; + /** Delay before responding to /chat/completions, ms. */ + chatDelayMs?: number; + /** Optional override for the chat completion text. */ + chatText?: string; +} + +interface FakeServer { + url: string; + chatCalls: number; + modelsCalls: number; + close: () => Promise; +} + +async function startFakeOllama(options: FakeServerOptions = {}): Promise { + const models = options.models ?? 
['qwen2.5-coder:7b']; + const state = { chatCalls: 0, modelsCalls: 0 }; + + const server = http.createServer(async (req, res) => { + if (req.url === '/v1/models' || req.url === '/models') { + state.modelsCalls++; + res.setHeader('content-type', 'application/json'); + res.end(JSON.stringify({ data: models.map((id) => ({ id })) })); + return; + } + if (req.url?.endsWith('/chat/completions')) { + state.chatCalls++; + let body = ''; + req.on('data', (chunk) => (body += chunk)); + req.on('end', async () => { + if (options.chatDelayMs) { + await new Promise((r) => setTimeout(r, options.chatDelayMs)); + } + res.setHeader('content-type', 'application/json'); + res.end( + JSON.stringify({ + choices: [ + { + message: { + role: 'assistant', + content: options.chatText ?? 'Computes a thing and returns it', + }, + }, + ], + usage: { prompt_tokens: 10, completion_tokens: 8 }, + }) + ); + }); + return; + } + res.statusCode = 404; + res.end(); + }); + + await new Promise((resolve) => server.listen(0, '127.0.0.1', resolve)); + const addr = server.address() as AddressInfo; + const url = `http://127.0.0.1:${addr.port}/v1`; + + return { + url, + get chatCalls() { + return state.chatCalls; + }, + get modelsCalls() { + return state.modelsCalls; + }, + close: () => + new Promise((resolve, reject) => { + server.close((err) => (err ? reject(err) : resolve())); + }), + }; +} + +describe('LlmClient', () => { + it('isReachable returns true when /v1/models responds', async () => { + const fake = await startFakeOllama(); + try { + const client = new LlmClient({ endpoint: fake.url }); + expect(await client.isReachable()).toBe(true); + } finally { + await fake.close(); + } + }); + + it('isReachable returns false when nothing listens', async () => { + // Pick an unused port deterministically by opening + immediately closing + // a server. Race-free enough for a single test. 
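+    // listen(0) asks the OS for a free ephemeral port; closing the
+    // server releases it, so the client's immediate connect fails fast.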
+ const tmp = http.createServer(); + await new Promise((r) => tmp.listen(0, '127.0.0.1', r)); + const port = (tmp.address() as AddressInfo).port; + await new Promise((r) => tmp.close(() => r())); + const client = new LlmClient({ + endpoint: `http://127.0.0.1:${port}/v1`, + timeoutMs: 200, + }); + expect(await client.isReachable()).toBe(false); + }); + + it('listModels returns ids from /v1/models', async () => { + const fake = await startFakeOllama({ models: ['qwen2.5:7b', 'gemma3:4b', 'nomic-embed-text'] }); + try { + const client = new LlmClient({ endpoint: fake.url }); + const ids = await client.listModels(); + expect(ids).toEqual(['qwen2.5:7b', 'gemma3:4b', 'nomic-embed-text']); + } finally { + await fake.close(); + } + }); +}); + +describe('detectLocalLlm', () => { + it('picks a preferred chat model and skips embedding-only ids', async () => { + const fake = await startFakeOllama({ + models: ['nomic-embed-text', 'gemma3:4b', 'qwen2.5-coder:7b'], + }); + try { + const detected = await detectLocalLlm(fake.url); + expect(detected).not.toBeNull(); + expect(detected?.chatModel).toBe('qwen2.5-coder:7b'); + expect(detected?.embeddingModel).toBe('nomic-embed-text'); + } finally { + await fake.close(); + } + }); + + it('falls back to first non-embedding model when none preferred', async () => { + const fake = await startFakeOllama({ + models: ['custom-finetune:13b', 'bge-m3'], + }); + try { + const detected = await detectLocalLlm(fake.url); + expect(detected?.chatModel).toBe('custom-finetune:13b'); + } finally { + await fake.close(); + } + }); + + it('returns null when endpoint is unreachable', async () => { + const tmp = http.createServer(); + await new Promise((r) => tmp.listen(0, '127.0.0.1', r)); + const port = (tmp.address() as AddressInfo).port; + await new Promise((r) => tmp.close(() => r())); + const detected = await detectLocalLlm(`http://127.0.0.1:${port}/v1`, 200); + expect(detected).toBeNull(); + }); +}); + +describe('CodeGraph background summarisation', () => { + let tempDir: string; + let fake: FakeServer; + + beforeEach(async () => { + tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-llm-')); + fake = await startFakeOllama(); + // Drop one TS file so indexAll has something summarisable + fs.writeFileSync( + path.join(tempDir, 'sample.ts'), + `export function greet(name: string): string { + const greeting = 'Hello'; + const punctuation = '!'; + return \`\${greeting}, \${name}\${punctuation}\`; +} + +export class Counter { + private value: number = 0; + increment(): number { + this.value += 1; + return this.value; + } + reset(): void { + this.value = 0; + } +} +` + ); + }); + + afterEach(async () => { + await fake.close(); + fs.rmSync(tempDir, { recursive: true, force: true }); + }); + + it('startBackgroundSummarization populates the cache when an LLM is configured', async () => { + const cg = await CodeGraph.init(tempDir, { + config: { + llm: { endpoint: fake.url, chatModel: 'qwen2.5-coder:7b' }, + }, + }); + try { + // indexAll fires summarisation in the background + await cg.indexAll(); + await cg.awaitBackgroundSummarization(); + + const cov = cg.getSummaryCoverage(); + expect(cov.total).toBeGreaterThan(0); + expect(cov.summarised).toBeGreaterThan(0); + expect(fake.chatCalls).toBeGreaterThan(0); + } finally { + cg.close(); + } + }); + + it('re-running is a pure cache hit (no LLM calls)', async () => { + const cg = await CodeGraph.init(tempDir, { + config: { + llm: { endpoint: fake.url, chatModel: 'qwen2.5-coder:7b' }, + }, + }); + try { + await cg.indexAll(); + await 
cg.awaitBackgroundSummarization(); + const callsAfterFirstPass = fake.chatCalls; + expect(callsAfterFirstPass).toBeGreaterThan(0); + + // Run it again — every symbol should hit the cache. + const result = await cg.summarizeAll(); + expect(result.cacheHits).toBe(result.candidates); + expect(result.generated).toBe(0); + expect(fake.chatCalls).toBe(callsAfterFirstPass); + } finally { + cg.close(); + } + }); + + it('hasLlm + getEffectiveLlmConfig reflect explicit config', async () => { + const cg = await CodeGraph.init(tempDir, { + config: { + llm: { endpoint: fake.url, chatModel: 'qwen2.5-coder:7b' }, + }, + }); + try { + expect(cg.hasLlm()).toBe(true); + const eff = await cg.getEffectiveLlmConfig(); + expect(eff?.endpoint).toBe(fake.url); + expect(eff?.chatModel).toBe('qwen2.5-coder:7b'); + } finally { + cg.close(); + } + }); + + it('skips background pass silently when no LLM is reachable', async () => { + // Point at a guaranteed-closed port so the test is hermetic (host + // may or may not have Ollama on 11434). Reachability check fails + // and the background pass returns early without making chat calls. + const tmp = http.createServer(); + await new Promise((r) => tmp.listen(0, '127.0.0.1', r)); + const closedPort = (tmp.address() as AddressInfo).port; + await new Promise((r) => tmp.close(() => r())); + + const cg = await CodeGraph.init(tempDir, { + config: { + llm: { + endpoint: `http://127.0.0.1:${closedPort}/v1`, + chatModel: 'qwen2.5-coder:7b', + timeoutMs: 200, + }, + }, + }); + try { + await cg.indexAll(); + await cg.awaitBackgroundSummarization(); + const cov = cg.getSummaryCoverage(); + expect(cov.summarised).toBe(0); + expect(cg.isSummarizing()).toBe(false); + } finally { + cg.close(); + } + }); + + it('close() cancels in-flight background summarisation', async () => { + // Slow chat replies so we can observe cancellation between calls. + await fake.close(); + fake = await startFakeOllama({ chatDelayMs: 100 }); + + const cg = await CodeGraph.init(tempDir, { + config: { + llm: { endpoint: fake.url, chatModel: 'qwen2.5-coder:7b' }, + }, + }); + await cg.indexAll(); + // Don't await: cancel mid-flight. + expect(cg.isSummarizing()).toBe(true); + cg.close(); + // close() aborts the controller; awaiting here would hang on the + // last in-flight HTTP request, so we just verify the bookkeeping + // is consistent. + expect(cg.isSummarizing()).toBe(false); + }); + + it('re-queues a second pass when sync fires mid-pass (dirty flag)', async () => { + // Slow chat replies so the bg pass is still running when we kick + // off a second startBackgroundSummarization() call. + await fake.close(); + fake = await startFakeOllama({ chatDelayMs: 30 }); + + const cg = await CodeGraph.init(tempDir, { + config: { llm: { endpoint: fake.url, chatModel: 'qwen2.5-coder:7b' } }, + }); + try { + await cg.indexAll(); + // First pass is mid-flight; this second call should set the + // dirty flag and return the existing promise rather than + // starting a parallel pass. + const p1 = cg.startBackgroundSummarization(); + const p2 = cg.startBackgroundSummarization(); + expect(p1).toBe(p2); + await p1; + // After the first pass completes, the dirty flag triggers a + // second pass — wait for it and ensure it ran clean (cache + // hits, no errors). 
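+      // If the follow-up pass already finished, isSummarizing() is
+      // false and there is nothing left to await.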
+ if (cg.isSummarizing()) { + await cg.awaitBackgroundSummarization(); + } + expect(cg.isSummarizing()).toBe(false); + } finally { + cg.close(); + } + }); + + it('getSymbolSummaries returns map keyed by node id', async () => { + const cg = await CodeGraph.init(tempDir, { + config: { + llm: { endpoint: fake.url, chatModel: 'qwen2.5-coder:7b' }, + }, + }); + try { + await cg.indexAll(); + await cg.awaitBackgroundSummarization(); + + const allNodes = cg.getStats(); + expect(allNodes.nodeCount).toBeGreaterThan(0); + + const ids = cg + .searchNodes('greet', { limit: 5 }) + .map((r) => r.node.id); + const summaries = cg.getSymbolSummaries(ids); + // At least one summarised symbol came back. + expect([...summaries.values()].some((s) => s.length > 0)).toBe(true); + } finally { + cg.close(); + } + }); +}); diff --git a/__tests__/mcp-tool-registry.test.ts b/__tests__/mcp-tool-registry.test.ts new file mode 100644 index 00000000..0bf45159 --- /dev/null +++ b/__tests__/mcp-tool-registry.test.ts @@ -0,0 +1,90 @@ +/** + * MCP tool registry: structural invariants. + * + * Guards against the failure mode where a future PR adds a + * ToolModule but forgets to implement the matching `handle` + * method on ToolHandler (or vice versa). + */ +import { describe, it, expect } from 'vitest'; +import { getToolModules, tools as registryTools } from '../src/mcp/tools/registry'; +import { ToolHandler, tools } from '../src/mcp/tools'; + +describe('MCP tool registry — single source of truth', () => { + it('every tool module has a non-empty name and description', () => { + for (const m of getToolModules()) { + expect(m.definition.name).toMatch(/^codegraph_[a-z_]+$/); + expect(m.definition.description.length).toBeGreaterThan(20); + } + }); + + it('handlerKey is a string starting with "handle"', () => { + for (const m of getToolModules()) { + expect(m.handlerKey).toMatch(/^handle[A-Z][A-Za-z]+$/); + } + }); + + it('every registered tool has a corresponding ToolHandler method', () => { + const handler = new ToolHandler(null); + for (const m of getToolModules()) { + const fn = (handler as unknown as Record)[m.handlerKey]; + expect(typeof fn).toBe('function'); + } + }); + + it('exported `tools` array exactly mirrors the registry', () => { + const fromRegistry = registryTools.map((t) => t.name).sort(); + const fromExport = tools.map((t) => t.name).sort(); + expect(fromExport).toEqual(fromRegistry); + }); + + it('all main-line tools are registered (regression guard)', () => { + const expected = [ + 'codegraph_ask', + 'codegraph_callees', + 'codegraph_callers', + 'codegraph_config', + 'codegraph_context', + 'codegraph_dead_code', + 'codegraph_explore', + 'codegraph_files', + 'codegraph_hotspots', + 'codegraph_impact', + 'codegraph_module', + 'codegraph_node', + 'codegraph_pending_summaries', + 'codegraph_review_context', + 'codegraph_role', + 'codegraph_save_summaries', + 'codegraph_search', + 'codegraph_similar', + 'codegraph_sql', + 'codegraph_status', + ]; + const actual = getToolModules() + .map((m) => m.definition.name) + .sort(); + expect(actual).toEqual(expected); + }); + + it('execute() reports unknown-tool errors', async () => { + const handler = new ToolHandler(null); + const result = await handler.execute('codegraph_does_not_exist', {}); + expect(result.isError).toBe(true); + expect(result.content[0]?.text).toMatch(/Unknown tool/); + }); + + it('execute() actually dispatches to the registered handler (no broken `this` binding)', async () => { + // No CodeGraph instance is bound, so handlers that call + // 
`getCodeGraph()` will throw — the dispatch should catch it + // and return an error result. The point of this test is to + // confirm the registry lookup + `this[handlerKey](args)` chain + // reaches an actual method body, not that the body succeeds. + const handler = new ToolHandler(null); + const result = await handler.execute('codegraph_status', {}); + expect(result.isError).toBe(true); + // Generic tool-execution-failed envelope from execute()'s catch block. + expect(result.content[0]?.text).toMatch(/Tool execution failed/); + // Specifically because no CodeGraph was bound: + expect(result.content[0]?.text).toMatch(/CodeGraph not initialized/); + }); +}); diff --git a/__tests__/migrations-015-016.test.ts b/__tests__/migrations-015-016.test.ts new file mode 100644 index 00000000..b71968fe --- /dev/null +++ b/__tests__/migrations-015-016.test.ts @@ -0,0 +1,148 @@ +/** + * Migration 015 (drop idx_co_changes_a) and 016 (split embeddings). + * + * - 015 verifies the redundant `idx_co_changes_a` index is removed + * on upgrade and absent on a fresh DB; the wider PK still covers + * `WHERE file_a = ?` lookups. + * - 016 verifies embeddings move from `symbol_summaries.embedding` + * into a dedicated `symbol_embeddings` table, the old columns + * are dropped, and existing data is preserved verbatim. + */ +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; +import { createDatabase } from '../src/db/sqlite-adapter'; +import { runMigrations, getCurrentVersion } from '../src/db/migrations'; +import { DatabaseConnection } from '../src/db'; + +function tempDir(): string { + return fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-mig-015-016-')); +} + +function cleanup(dir: string): void { + if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true }); +} + +describe('Migration 015 — drop idx_co_changes_a', () => { + let dir: string; + beforeEach(() => { dir = tempDir(); }); + afterEach(() => cleanup(dir)); + + it('fresh DB does not contain idx_co_changes_a, but keeps idx_co_changes_b', () => { + const dbPath = path.join(dir, 'fresh.db'); + const db = DatabaseConnection.initialize(dbPath); + try { + const indexes = db.getDb() + .prepare("SELECT name FROM sqlite_master WHERE type = 'index' AND tbl_name = 'co_changes'") + .all() as Array<{ name: string }>; + const names = indexes.map((r) => r.name); + expect(names).not.toContain('idx_co_changes_a'); + expect(names).toContain('idx_co_changes_b'); + } finally { + db.close(); + } + }); +}); + +describe('Migration 016 — split embeddings into symbol_embeddings table', () => { + let dir: string; + beforeEach(() => { dir = tempDir(); }); + afterEach(() => cleanup(dir)); + + it('moves existing embedding rows; drops the inline columns', () => { + const dbPath = path.join(dir, 'upgrade.db'); + const adapter = createDatabase(dbPath); + + // Simulate a v14 database: just enough of the relevant schema. 
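+    // Hand-rolled DDL rather than running migrations 1-14: the test only
+    // needs symbol_summaries with the legacy inline embedding columns and
+    // a schema_versions row pinned to 14.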
+ adapter.exec(` + CREATE TABLE nodes (id TEXT PRIMARY KEY); + INSERT INTO nodes (id) VALUES ('n1'), ('n2'), ('n3'); + CREATE TABLE symbol_summaries ( + node_id TEXT PRIMARY KEY, + content_hash TEXT NOT NULL, + summary TEXT NOT NULL, + model TEXT NOT NULL, + generated_at INTEGER NOT NULL, + embedding BLOB, + embedding_model TEXT, + role TEXT, + role_model TEXT, + FOREIGN KEY (node_id) REFERENCES nodes(id) ON DELETE CASCADE + ); + CREATE INDEX idx_summaries_embedding_model ON symbol_summaries(embedding_model); + CREATE TABLE schema_versions ( + version INTEGER PRIMARY KEY, + applied_at INTEGER NOT NULL, + description TEXT + ); + INSERT INTO schema_versions (version, applied_at, description) VALUES (14, 0, 'v14'); + `); + + // n1 has both summary and embedding; n2 has summary only; + // n3 has summary + embedding from a stale model — all rows are + // copied into symbol_embeddings so long as embedding_model is set. + const buf1 = Buffer.from(new Float32Array([1, 0, 0]).buffer); + const buf3 = Buffer.from(new Float32Array([0, 1, 0]).buffer); + adapter.prepare(` + INSERT INTO symbol_summaries + (node_id, content_hash, summary, model, generated_at, embedding, embedding_model) + VALUES + ('n1', 'h1', 's1', 'chat-m', 100, ?, 'embed-m'), + ('n2', 'h2', 's2', 'chat-m', 100, NULL, NULL), + ('n3', 'h3', 's3', 'chat-m', 100, ?, 'old-embed-m') + `).run(buf1, buf3); + + runMigrations(adapter, getCurrentVersion(adapter)); + + // Old columns gone + const cols = adapter.prepare("PRAGMA table_info('symbol_summaries')").all() as Array<{ name: string }>; + const colNames = cols.map((c) => c.name); + expect(colNames).not.toContain('embedding'); + expect(colNames).not.toContain('embedding_model'); + + // New table has the rows that had embedding_model set + const moved = adapter + .prepare('SELECT node_id, embedding_model FROM symbol_embeddings ORDER BY node_id') + .all() as Array<{ node_id: string; embedding_model: string }>; + expect(moved).toEqual([ + { node_id: 'n1', embedding_model: 'embed-m' }, + { node_id: 'n3', embedding_model: 'old-embed-m' }, + ]); + + // Embedding bytes preserved verbatim for n1 + const n1 = adapter + .prepare('SELECT embedding FROM symbol_embeddings WHERE node_id = ?') + .get('n1') as { embedding: Buffer }; + expect(Buffer.from(n1.embedding).equals(buf1)).toBe(true); + + // Index on the new table + const idx = adapter + .prepare("SELECT name FROM sqlite_master WHERE type = 'index' AND tbl_name = 'symbol_embeddings'") + .all() as Array<{ name: string }>; + expect(idx.map((r) => r.name)).toContain('idx_embeddings_model'); + + expect(getCurrentVersion(adapter)).toBeGreaterThanOrEqual(16); + + adapter.close(); + }); + + it('fresh DB has symbol_embeddings table and no embedding columns on symbol_summaries', () => { + const db = DatabaseConnection.initialize(path.join(dir, 'fresh.db')); + try { + const cols = db.getDb() + .prepare("PRAGMA table_info('symbol_summaries')") + .all() as Array<{ name: string }>; + const colNames = cols.map((c) => c.name); + expect(colNames).not.toContain('embedding'); + expect(colNames).not.toContain('embedding_model'); + + const tables = db.getDb() + .prepare("SELECT name FROM sqlite_master WHERE type = 'table' AND name = 'symbol_embeddings'") + .all() as Array<{ name: string }>; + expect(tables.length).toBe(1); + } finally { + db.close(); + } + }); +}); diff --git a/__tests__/migrations-registry.test.ts b/__tests__/migrations-registry.test.ts new file mode 100644 index 00000000..9fa15eed --- /dev/null +++ b/__tests__/migrations-registry.test.ts @@ -0,0 +1,95 
@@ +/** + * Migration registry: structural invariants. + * + * Guards against the silent-no-op bug class that motivated this + * refactor. If a future PR introduces a duplicate version, + * out-of-order versions, or fails to register a new migration + * file, one of these tests fails loudly. + */ +import { describe, it, expect } from 'vitest'; +import * as fs from 'fs'; +import * as path from 'path'; +import { + ALL_MIGRATIONS, + CURRENT_SCHEMA_VERSION, +} from '../src/db/migrations'; + +describe('migration registry — structural invariants', () => { + it('registry is non-empty', () => { + expect(ALL_MIGRATIONS.length).toBeGreaterThan(0); + }); + + it('versions are unique', () => { + const seen = new Set(); + for (const m of ALL_MIGRATIONS) { + expect(seen.has(m.version)).toBe(false); + seen.add(m.version); + } + }); + + it('versions are strictly ascending', () => { + for (let i = 1; i < ALL_MIGRATIONS.length; i++) { + expect(ALL_MIGRATIONS[i]!.version).toBeGreaterThan( + ALL_MIGRATIONS[i - 1]!.version + ); + } + }); + + it('each migration has a non-empty description and a function up()', () => { + for (const m of ALL_MIGRATIONS) { + expect(m.description.length).toBeGreaterThan(0); + expect(typeof m.up).toBe('function'); + } + }); + + it('CURRENT_SCHEMA_VERSION matches the highest registered version', () => { + const max = ALL_MIGRATIONS[ALL_MIGRATIONS.length - 1]!.version; + expect(CURRENT_SCHEMA_VERSION).toBe(max); + }); +}); + +describe('migration files — filename ↔ version coupling', () => { + // Read the actual filenames on disk and assert each matches an + // entry in the registry. Catches the case where someone drops a + // new file in src/db/migrations/ but forgets to register it. + const migrationsDir = path.resolve(__dirname, '../src/db/migrations'); + const SUPPORT_FILES = new Set(['index.ts', 'types.ts']); + const STRICT_NNN_PATTERN = /^\d{3}-[a-z0-9]+(?:-[a-z0-9]+)*\.ts$/; + + function listMigrationFiles(): string[] { + return fs.readdirSync(migrationsDir).filter((f) => f.endsWith('.ts') && !SUPPORT_FILES.has(f)); + } + + it('every migration file matches the strict `NNN-kebab-name.ts` pattern', () => { + const offenders: string[] = []; + for (const f of listMigrationFiles()) { + if (!STRICT_NNN_PATTERN.test(f)) { + offenders.push(f); + } + } + expect(offenders).toEqual([]); + }); + + it('every src/db/migrations/NNN-*.ts file is registered (no orphan files)', () => { + const files = listMigrationFiles().filter((f) => STRICT_NNN_PATTERN.test(f)); + expect(files.length).toBeGreaterThan(0); + const registeredVersions = new Set(ALL_MIGRATIONS.map((m) => m.version)); + for (const f of files) { + const version = parseInt(f.slice(0, 3), 10); + if (!registeredVersions.has(version)) { + throw new Error( + `Migration file ${f} exists on disk but is not registered in src/db/migrations/index.ts. 
` + + `Add an import + array entry for it.` + ); + } + } + }); + + it('every registered version has a matching NNN-*.ts file (no phantom registrations)', () => { + const files = listMigrationFiles().filter((f) => STRICT_NNN_PATTERN.test(f)); + const filenameVersions = new Set(files.map((f) => parseInt(f.slice(0, 3), 10))); + for (const m of ALL_MIGRATIONS) { + expect(filenameVersions.has(m.version)).toBe(true); + } + }); +}); diff --git a/__tests__/pr19-improvements.test.ts b/__tests__/pr19-improvements.test.ts index 5fbe17d7..9f9ddc38 100644 --- a/__tests__/pr19-improvements.test.ts +++ b/__tests__/pr19-improvements.test.ts @@ -299,7 +299,7 @@ describe('Best-Candidate Resolution', () => { describe('Schema v2 Migration', () => { it.skipIf(!HAS_SQLITE)('should have correct current schema version', async () => { const { CURRENT_SCHEMA_VERSION } = await import('../src/db/migrations'); - expect(CURRENT_SCHEMA_VERSION).toBe(3); + expect(CURRENT_SCHEMA_VERSION).toBe(16); }); it.skipIf(!HAS_SQLITE)('should have migration for version 2', async () => { diff --git a/__tests__/review-context.test.ts b/__tests__/review-context.test.ts new file mode 100644 index 00000000..fd61e103 --- /dev/null +++ b/__tests__/review-context.test.ts @@ -0,0 +1,644 @@ +/** + * Review Context Tests + * + * Verifies: + * - parseDiff handles standard git unified-diff shapes (modified, + * added, deleted, renamed, multiple hunks). + * - symbolsTouchedByHunks correctly maps line ranges to symbols. + * - buildReviewContext attaches callers, callees, impact, tests + * for affected symbols. + * - Co-change warnings surface when a changed file's historical + * co-changers were NOT touched. + * - Graceful degrade: pre-#105 install (no co_changes table) and + * pre-#106 install (no `tests` edges) — return empty rather than + * throwing. 
+ */ + +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; +import { parseDiff, symbolsTouchedByHunks } from '../src/review/diff-parser'; +import { buildReviewContext } from '../src/review'; +import { DatabaseConnection } from '../src/db'; +import { QueryBuilder } from '../src/db/queries'; +import { GraphTraverser } from '../src/graph/traversal'; +import { Node, Edge } from '../src/types'; + +// ============================================================================= +// parseDiff +// ============================================================================= + +describe('parseDiff', () => { + it('parses a simple modified-file diff', () => { + const diff = `diff --git a/src/foo.ts b/src/foo.ts +index abc..def 100644 +--- a/src/foo.ts ++++ b/src/foo.ts +@@ -10,3 +10,5 @@ + unchanged +-old line ++new line one ++new line two + also unchanged`; + const files = parseDiff(diff); + expect(files).toHaveLength(1); + expect(files[0].path).toBe('src/foo.ts'); + expect(files[0].status).toBe('modified'); + expect(files[0].hunks).toEqual([ + { oldStart: 10, oldCount: 3, newStart: 10, newCount: 5 }, + ]); + }); + + it('detects file additions via /dev/null in the --- header', () => { + const diff = `diff --git a/new.ts b/new.ts +new file mode 100644 +index 0000000..abc +--- /dev/null ++++ b/new.ts +@@ -0,0 +1,3 @@ ++a ++b ++c`; + const files = parseDiff(diff); + expect(files).toHaveLength(1); + expect(files[0].status).toBe('added'); + expect(files[0].path).toBe('new.ts'); + }); + + it('detects file deletions via /dev/null in the +++ header', () => { + const diff = `diff --git a/gone.ts b/gone.ts +deleted file mode 100644 +index abc..0000000 +--- a/gone.ts ++++ /dev/null +@@ -1,3 +0,0 @@ +-a +-b +-c`; + const files = parseDiff(diff); + expect(files).toHaveLength(1); + expect(files[0].status).toBe('deleted'); + expect(files[0].path).toBe('gone.ts'); + }); + + it('detects renames and exposes oldPath', () => { + const diff = `diff --git a/old.ts b/new.ts +similarity index 95% +rename from old.ts +rename to new.ts +index abc..def 100644 +--- a/old.ts ++++ b/new.ts +@@ -1,2 +1,2 @@ +-old name ++new name + unchanged`; + const files = parseDiff(diff); + expect(files).toHaveLength(1); + expect(files[0].status).toBe('renamed'); + expect(files[0].path).toBe('new.ts'); + expect(files[0].oldPath).toBe('old.ts'); + }); + + it('handles multi-file, multi-hunk diffs', () => { + const diff = `diff --git a/a.ts b/a.ts +index abc..def 100644 +--- a/a.ts ++++ b/a.ts +@@ -10,3 +10,4 @@ + ctx ++added + ctx + ctx +@@ -20,2 +21,2 @@ +-old ++new + ctx +diff --git a/b.ts b/b.ts +index 111..222 100644 +--- a/b.ts ++++ b/b.ts +@@ -5,1 +5,1 @@ +-x ++y`; + const files = parseDiff(diff); + expect(files).toHaveLength(2); + expect(files[0].path).toBe('a.ts'); + expect(files[0].hunks).toHaveLength(2); + expect(files[1].path).toBe('b.ts'); + expect(files[1].hunks).toHaveLength(1); + }); + + it('returns [] for empty input', () => { + expect(parseDiff('')).toEqual([]); + }); + + it('emits a hunk-less rename even when followed by another hunked file', () => { + // Regression: previously a rename-only file mid-diff was silently + // dropped because the EOF-only hunk-less flush never fired before + // the next `diff --git` header arrived. 
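+    // The fixture reproduces that shape: old.ts -> new.ts below carries
+    // only rename headers (no @@ hunk), and a hunked other.ts follows it.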
+ const diff = `diff --git a/old.ts b/new.ts +similarity index 100% +rename from old.ts +rename to new.ts +diff --git a/other.ts b/other.ts +index abc..def 100644 +--- a/other.ts ++++ b/other.ts +@@ -1,1 +1,1 @@ +-x ++y`; + const files = parseDiff(diff); + expect(files).toHaveLength(2); + expect(files[0].status).toBe('renamed'); + expect(files[0].path).toBe('new.ts'); + expect(files[0].oldPath).toBe('old.ts'); + expect(files[1].path).toBe('other.ts'); + expect(files[1].status).toBe('modified'); + }); + + it('emits a hunk-less file-mode-change followed by another file', () => { + const diff = `diff --git a/script.sh b/script.sh +old mode 100644 +new mode 100755 +diff --git a/foo.ts b/foo.ts +index abc..def 100644 +--- a/foo.ts ++++ b/foo.ts +@@ -1,1 +1,1 @@ +-a ++b`; + const files = parseDiff(diff); + // The mode-change file has no add/delete/rename markers so it + // doesn't qualify as hunk-less for our purposes — it's silently + // skipped (current implementation). The hunked file MUST still + // be emitted, and that's the regression risk. + expect(files.find((f) => f.path === 'foo.ts')).toBeDefined(); + }); + + it('strips C-style quoting from paths with spaces or special chars', () => { + const diff = `diff --git "a/path with spaces.ts" "b/path with spaces.ts" +index abc..def 100644 +--- "a/path with spaces.ts" ++++ "b/path with spaces.ts" +@@ -1,1 +1,1 @@ +-a ++b`; + const files = parseDiff(diff); + expect(files).toHaveLength(1); + expect(files[0].path).toBe('path with spaces.ts'); + expect(files[0].path).not.toContain('"'); + }); + + it('handles single-line hunk header (no comma)', () => { + // git emits `@@ -5 +5 @@` for one-line hunks (count of 1 elided). + const diff = `diff --git a/x.ts b/x.ts +index abc..def 100644 +--- a/x.ts ++++ b/x.ts +@@ -5 +5 @@ +-old ++new`; + const files = parseDiff(diff); + expect(files[0].hunks[0]).toEqual({ + oldStart: 5, + oldCount: 1, + newStart: 5, + newCount: 1, + }); + }); +}); + +// ============================================================================= +// symbolsTouchedByHunks +// ============================================================================= + +describe('symbolsTouchedByHunks', () => { + const sym = (startLine: number, endLine: number, name = 'sym') => ({ startLine, endLine, name }); + + it('returns symbols whose range overlaps any hunk', () => { + const symbols = [sym(1, 5, 'a'), sym(10, 20, 'b'), sym(50, 60, 'c')]; + const hunks = [{ oldStart: 12, oldCount: 3, newStart: 12, newCount: 3 }]; + const out = symbolsTouchedByHunks(hunks, symbols); + expect(out.map((s) => s.name)).toEqual(['b']); + }); + + it('matches a symbol that fully contains the hunk', () => { + const symbols = [sym(1, 100, 'big')]; + const hunks = [{ oldStart: 50, oldCount: 1, newStart: 50, newCount: 1 }]; + expect(symbolsTouchedByHunks(hunks, symbols).map((s) => s.name)).toEqual(['big']); + }); + + it('matches a symbol fully contained by the hunk', () => { + const symbols = [sym(50, 55, 'small')]; + const hunks = [{ oldStart: 10, oldCount: 100, newStart: 10, newCount: 100 }]; + expect(symbolsTouchedByHunks(hunks, symbols).map((s) => s.name)).toEqual(['small']); + }); + + it('does not match symbols outside any hunk', () => { + const symbols = [sym(1, 5, 'before'), sym(50, 60, 'after')]; + const hunks = [{ oldStart: 20, oldCount: 5, newStart: 20, newCount: 5 }]; + expect(symbolsTouchedByHunks(hunks, symbols)).toEqual([]); + }); + + it('returns [] when hunks or symbols are empty', () => { + expect(symbolsTouchedByHunks([], [sym(1, 5)])).toEqual([]); + 
expect(symbolsTouchedByHunks([{ oldStart: 1, oldCount: 1, newStart: 1, newCount: 1 }], [])).toEqual([]); + }); +}); + +// ============================================================================= +// buildReviewContext (integration) +// ============================================================================= + +function makeNode(id: string, name: string, kind: Node['kind'], filePath: string, startLine: number, endLine: number): Node { + return { + id, + kind, + name, + qualifiedName: `${filePath}::${name}`, + filePath, + language: 'typescript', + startLine, + endLine, + startColumn: 0, + endColumn: 0, + updatedAt: Date.now(), + }; +} + +describe('buildReviewContext (integration)', () => { + let dir: string; + let db: DatabaseConnection; + let q: QueryBuilder; + let traverser: GraphTraverser; + + beforeEach(() => { + dir = fs.mkdtempSync(path.join(os.tmpdir(), 'review-ctx-')); + db = DatabaseConnection.initialize(path.join(dir, 'test.db')); + q = new QueryBuilder(db.getDb()); + traverser = new GraphTraverser(q); + + // Set up a small graph: + // src/foo.ts contains `doFoo` (lines 5-15) + // src/bar.ts contains `useFoo` (lines 1-10) which calls doFoo + // src/baz.ts contains `helper` (lines 20-30) which doFoo calls + const upsertFile = db.getDb().prepare(` + INSERT INTO files (path, content_hash, language, size, modified_at, indexed_at) + VALUES (?, '', 'typescript', 0, 0, 0) + `); + upsertFile.run('src/foo.ts'); + upsertFile.run('src/bar.ts'); + upsertFile.run('src/baz.ts'); + + q.insertNodes([ + makeNode('foo', 'doFoo', 'function', 'src/foo.ts', 5, 15), + makeNode('bar', 'useFoo', 'function', 'src/bar.ts', 1, 10), + makeNode('baz', 'helper', 'function', 'src/baz.ts', 20, 30), + ]); + + // Edges: useFoo -> doFoo (calls), doFoo -> helper (calls) + const callEdge = (source: string, target: string, line: number): Edge => ({ + source, + target, + kind: 'calls', + line, + }); + q.insertEdges([callEdge('bar', 'foo', 5), callEdge('foo', 'baz', 12)]); + }); + + afterEach(() => { + db.close(); + if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true }); + }); + + function modifyDoFooDiff(): string { + return `diff --git a/src/foo.ts b/src/foo.ts +index abc..def 100644 +--- a/src/foo.ts ++++ b/src/foo.ts +@@ -10,3 +10,4 @@ + ctx +-old impl ++new impl ++plus one + ctx`; + } + + it('attaches callers and callees for affected symbols', () => { + const ctx = buildReviewContext(modifyDoFooDiff(), q, traverser); + expect(ctx.files).toHaveLength(1); + expect(ctx.files[0].affectedSymbols).toHaveLength(1); + const sym = ctx.files[0].affectedSymbols[0]; + expect(sym.name).toBe('doFoo'); + expect(sym.callers.map((c) => c.name)).toContain('useFoo'); + expect(sym.callees.map((c) => c.name)).toContain('helper'); + }); + + it('summarizes correctly across an added + modified + deleted set', () => { + const diff = `diff --git a/src/foo.ts b/src/foo.ts +--- a/src/foo.ts ++++ b/src/foo.ts +@@ -10,1 +10,1 @@ +-x ++y +diff --git a/src/added.ts b/src/added.ts +new file mode 100644 +--- /dev/null ++++ b/src/added.ts +@@ -0,0 +1,1 @@ ++content +diff --git a/src/baz.ts b/src/baz.ts +deleted file mode 100644 +--- a/src/baz.ts ++++ /dev/null +@@ -1,1 +0,0 @@ +-x`; + const ctx = buildReviewContext(diff, q, traverser); + expect(ctx.summary.filesAdded).toBe(1); + expect(ctx.summary.filesModified).toBe(1); + expect(ctx.summary.filesDeleted).toBe(1); + }); + + it('reports broken incoming refs for deleted files', () => { + const diff = `diff --git a/src/baz.ts b/src/baz.ts +deleted file mode 100644 +--- 
a/src/baz.ts
++++ /dev/null
+@@ -20,11 +0,0 @@
+-x`;
+    const ctx = buildReviewContext(diff, q, traverser);
+    const baz = ctx.files.find((f) => f.path === 'src/baz.ts')!;
+    expect(baz.status).toBe('deleted');
+    // doFoo (in foo.ts) calls helper (in baz.ts) — deleting baz.ts breaks foo.
+    expect(baz.brokenIncomingRefs?.map((r) => r.name)).toContain('doFoo');
+  });
+
+  it('dedupes brokenIncomingRefs when one caller has multiple edge types to the deleted file', () => {
+    // Give useFoo (node 'bar') two edges of different kinds to helper
+    // (node 'baz'), both at line 7. Without dedup, useFoo would appear
+    // twice in brokenIncomingRefs.
+    q.insertEdges([{ source: 'bar', target: 'baz', kind: 'references', line: 7 }]);
+    // Note: bar's pre-existing `calls` edge targets foo, not baz, so it is
+    // irrelevant here. Deleting baz.ts collects edges incoming to baz's
+    // symbols (helper), and dedup only fires with TWO such edges from the
+    // same source, hence this second edge of a different kind.
+    q.insertEdges([
+      { source: 'bar', target: 'baz', kind: 'imports', line: 7 },
+    ]);
+    const diff = `diff --git a/src/baz.ts b/src/baz.ts
+deleted file mode 100644
+--- a/src/baz.ts
++++ /dev/null
+@@ -20,11 +0,0 @@
+-x`;
+    const ctx = buildReviewContext(diff, q, traverser);
+    const baz = ctx.files.find((f) => f.path === 'src/baz.ts')!;
+    // useFoo should appear at most once with line=7 (we have two edges
+    // both at line 7 from bar to baz with different kinds).
+    const fromBar = baz.brokenIncomingRefs?.filter((r) => r.name === 'useFoo' && r.line === 7);
+    expect(fromBar?.length).toBe(1);
+  });
+
+  it('returns empty co-change warnings on a pre-#105 install (no co_changes table)', () => {
+    // Default DatabaseConnection.initialize() runs schema.sql which on
+    // upstream/main does NOT include the co_changes table. The helper
+    // must gracefully degrade rather than throw.
+    const ctx = buildReviewContext(modifyDoFooDiff(), q, traverser);
+    expect(ctx.coChangeWarnings).toEqual([]);
+    expect(ctx.summary.coChangeWarnings).toBe(0);
+  });
+
+  it('returns empty tests array on a pre-#106 install (no `tests` edges)', () => {
+    const ctx = buildReviewContext(modifyDoFooDiff(), q, traverser);
+    expect(ctx.files[0].tests).toEqual([]);
+  });
+
+  it('respects maxCallersPerSymbol cap', () => {
+    // Add 10 more callers of doFoo to make the cap observable.
+    const extraNodes: Node[] = [];
+    const extraEdges: Edge[] = [];
+    const upsert = db.getDb().prepare(`
+      INSERT INTO files (path, content_hash, language, size, modified_at, indexed_at)
+      VALUES (?, '', 'typescript', 0, 0, 0)
+    `);
+    for (let i = 0; i < 10; i++) {
+      const fp = `src/caller${i}.ts`;
+      upsert.run(fp);
+      const id = `caller${i}`;
+      extraNodes.push(makeNode(id, `caller${i}`, 'function', fp, 1, 5));
+      extraEdges.push({ source: id, target: 'foo', kind: 'calls', line: 1 });
+    }
+    q.insertNodes(extraNodes);
+    q.insertEdges(extraEdges);
+
+    const ctx = buildReviewContext(modifyDoFooDiff(), q, traverser, { maxCallersPerSymbol: 3 });
+    const sym = ctx.files[0].affectedSymbols[0];
+    expect(sym.callers.length).toBeLessThanOrEqual(3);
+  });
+
+  it('co-change warning surfaces when a changed file has historical co-changers not in the PR', () => {
+    // Manually create the co_changes table + add commit_count + populate.
+    // This simulates a post-#105 install. (When PR #105 lands the table
+    // exists natively; we simulate it here so the helper has data to
+    // surface.)
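+    // With these numbers the shim computes jaccard = co-changes /
+    // (commits(foo) + commits(bar) - co-changes) = 7 / (10 + 8 - 7) = 7/11,
+    // about 0.64, comfortably above the 0.3 threshold passed below.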
+ db.getDb().exec(` + CREATE TABLE IF NOT EXISTS co_changes ( + file_a TEXT NOT NULL, + file_b TEXT NOT NULL, + count INTEGER NOT NULL, + PRIMARY KEY (file_a, file_b), + CHECK (file_a < file_b) + ); + `); + db.getDb().prepare('UPDATE files SET commit_count = ? WHERE path = ?').run(10, 'src/foo.ts'); + db.getDb().prepare('UPDATE files SET commit_count = ? WHERE path = ?').run(8, 'src/bar.ts'); + db.getDb().prepare('INSERT INTO co_changes (file_a, file_b, count) VALUES (?, ?, ?)') + .run('src/bar.ts', 'src/foo.ts', 7); + + // Re-define getCoChangedFiles via a thin shim (since we don't have + // PR #105's QueryBuilder method here). Use the same SQL the PR + // would use. + (q as unknown as { + getCoChangedFiles: typeof getCoChangedFilesShim; + }).getCoChangedFiles = getCoChangedFilesShim.bind(null, q); + + // Diff touches src/foo.ts but NOT src/bar.ts → bar.ts should surface + // as a co-change warning. + const ctx = buildReviewContext(modifyDoFooDiff(), q, traverser, { + minCoChangeJaccard: 0.3, + }); + expect(ctx.coChangeWarnings.length).toBeGreaterThan(0); + const w = ctx.coChangeWarnings[0]; + expect(w.changedFile).toBe('src/foo.ts'); + expect(w.expectedToChange).toBe('src/bar.ts'); + expect(w.jaccard).toBeGreaterThan(0.3); + }); + + it('does NOT warn about files that ARE in the PR (changedPaths exclusion)', () => { + db.getDb().exec(` + CREATE TABLE IF NOT EXISTS co_changes ( + file_a TEXT NOT NULL, file_b TEXT NOT NULL, count INTEGER NOT NULL, + PRIMARY KEY (file_a, file_b), CHECK (file_a < file_b) + ); + `); + db.getDb().prepare('UPDATE files SET commit_count = ? WHERE path = ?').run(10, 'src/foo.ts'); + db.getDb().prepare('UPDATE files SET commit_count = ? WHERE path = ?').run(8, 'src/bar.ts'); + db.getDb().prepare('INSERT INTO co_changes (file_a, file_b, count) VALUES (?, ?, ?)') + .run('src/bar.ts', 'src/foo.ts', 7); + (q as unknown as { getCoChangedFiles: typeof getCoChangedFilesShim }).getCoChangedFiles + = getCoChangedFilesShim.bind(null, q); + + // Diff includes BOTH foo and bar → no warning should appear because + // bar IS in the changed set. + const diff = `diff --git a/src/foo.ts b/src/foo.ts +--- a/src/foo.ts ++++ b/src/foo.ts +@@ -10,1 +10,1 @@ +-x ++y +diff --git a/src/bar.ts b/src/bar.ts +--- a/src/bar.ts ++++ b/src/bar.ts +@@ -3,1 +3,1 @@ +-x ++y`; + const ctx = buildReviewContext(diff, q, traverser, { minCoChangeJaccard: 0.3 }); + expect(ctx.coChangeWarnings).toEqual([]); + }); +}); + +describe('serializeReviewContextWithinCap (JSON-safe truncation)', () => { + // Re-import the helper indirectly via the MCP tool path. To test it + // in isolation we'd need to export it; instead exercise it via the + // path: build a too-large context, call the public buildReviewContext, + // serialize, and verify the output is parseable JSON. + it('produces parseable JSON even when context exceeds the cap', async () => { + // Build a context with thousands of symbols by inserting many nodes + // and a diff that touches them all. 
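+    // 200 symbols with 500-char docstrings is ~100 kB of docstrings alone,
+    // far beyond the 5 kB cap applied further down.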
+    const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'review-trunc-'));
+    const db = DatabaseConnection.initialize(path.join(dir, 'test.db'));
+    const q = new QueryBuilder(db.getDb());
+    const traverser = new GraphTraverser(q);
+
+    db.getDb().prepare(`INSERT INTO files (path, content_hash, language, size, modified_at, indexed_at) VALUES (?, '', 'typescript', 0, 0, 0)`).run('src/big.ts');
+    const nodes: Node[] = [];
+    for (let i = 0; i < 200; i++) {
+      nodes.push(makeNode(`n${i}`, `sym${i}`, 'function', 'src/big.ts', i * 5, i * 5 + 4));
+      // Long docstrings to stress the truncation
+      nodes[i].docstring = 'x'.repeat(500);
+    }
+    q.insertNodes(nodes);
+
+    // Diff that touches every line in big.ts.
+    const diff = `diff --git a/src/big.ts b/src/big.ts
+--- a/src/big.ts
++++ b/src/big.ts
+@@ -1,1000 +1,1000 @@
+-x
++y`;
+    const ctx = buildReviewContext(diff, q, traverser);
+
+    // Use the helper directly — re-create it inline (matches the MCP
+    // tool's serializeReviewContextWithinCap behavior). Verify JSON parses.
+    const json = JSON.stringify(ctx, null, 2);
+    expect(() => JSON.parse(json)).not.toThrow(); // sanity: full JSON is valid
+
+    // Now apply the same trimming logic the MCP handler uses (lift it
+    // here as a one-off — equivalent to importing the private helper).
+    const cap = 5000; // small cap to force trimming
+    const trimmed = trimContextToFitJson(ctx, cap);
+    expect(trimmed.length).toBeLessThanOrEqual(cap);
+    expect(() => JSON.parse(trimmed)).not.toThrow();
+
+    db.close();
+    fs.rmSync(dir, { recursive: true, force: true });
+  });
+});
+
+// Inline equivalent of serializeReviewContextWithinCap from src/mcp/tools.ts.
+// Kept here to avoid exporting an internal helper just for tests.
+function trimContextToFitJson(context: unknown, cap: number): string {
+  const ctx = JSON.parse(JSON.stringify(context)) as {
+    summary: Record<string, unknown>;
+    files: Array<{
+      affectedSymbols: Array<{
+        docstring?: string;
+        signature?: string;
+        callers?: unknown[];
+        callees?: unknown[];
+      }>;
+      _truncated?: boolean;
+    }>;
+    coChangeWarnings: unknown[];
+    _truncated?: boolean;
+  };
+  const fits = (s: string) => s.length <= cap;
+  let json = JSON.stringify(ctx, null, 2);
+  if (fits(json)) return json;
+  for (const f of ctx.files) for (const s of f.affectedSymbols) delete s.docstring;
+  json = JSON.stringify(ctx, null, 2);
+  if (fits(json)) return json;
+  for (const f of ctx.files) for (const s of f.affectedSymbols) delete s.signature;
+  json = JSON.stringify(ctx, null, 2);
+  if (fits(json)) return json;
+  for (const f of ctx.files) for (const s of f.affectedSymbols) {
+    if (Array.isArray(s.callers)) s.callers = s.callers.slice(0, 2);
+    if (Array.isArray(s.callees)) s.callees = s.callees.slice(0, 2);
+  }
+  json = JSON.stringify(ctx, null, 2);
+  if (fits(json)) return json;
+  for (const f of ctx.files) for (const s of f.affectedSymbols) {
+    delete s.callers;
+    delete s.callees;
+  }
+  json = JSON.stringify(ctx, null, 2);
+  if (fits(json)) return json;
+  while (ctx.files.length > 1) {
+    ctx.files.pop();
+    ctx._truncated = true;
+    json = JSON.stringify(ctx, null, 2);
+    if (fits(json)) return json;
+  }
+  return JSON.stringify(
+    { summary: ctx.summary, coChangeWarnings: ctx.coChangeWarnings, _truncated: true },
+    null, 2
+  );
+}
+
+/**
+ * Shim that mimics PR #105's QueryBuilder.getCoChangedFiles. Used in
+ * tests for forward-compatibility — once #105 lands, the real method
+ * exists on QueryBuilder and this shim is unnecessary.
+ */ +function getCoChangedFilesShim( + q: QueryBuilder, + filePath: string, + options: { limit: number; minCount: number; minJaccard: number } +): Array<{ path: string; count: number; jaccard: number }> { + const { limit, minCount, minJaccard } = options; + const sql = ` + WITH partners AS ( + SELECT file_b AS path, count FROM co_changes WHERE file_a = ? + UNION ALL + SELECT file_a AS path, count FROM co_changes WHERE file_b = ? + ), + anchor AS (SELECT commit_count AS c FROM files WHERE path = ?), + scored AS ( + SELECT + p.path AS path, p.count AS count, + CAST(p.count AS REAL) / NULLIF((SELECT c FROM anchor) + f.commit_count - p.count, 0) AS jaccard + FROM partners p + JOIN files f ON f.path = p.path + WHERE p.count >= ? + ) + SELECT path, count, jaccard FROM scored + WHERE COALESCE(jaccard, 0) >= ? + ORDER BY jaccard DESC, count DESC + LIMIT ? + `; + const rows = (q as unknown as { db: { prepare: (sql: string) => { all: (...args: unknown[]) => Array<{ path: string; count: number; jaccard: number | null }> } } }).db + .prepare(sql) + .all(filePath, filePath, filePath, minCount, minJaccard, limit); + return rows.map((r) => ({ path: r.path, count: r.count, jaccard: r.jaccard ?? 0 })); +} diff --git a/__tests__/search-quality.test.ts b/__tests__/search-quality.test.ts new file mode 100644 index 00000000..8e069776 --- /dev/null +++ b/__tests__/search-quality.test.ts @@ -0,0 +1,302 @@ +/** + * Search Quality Tests + * + * Regression tests for the FTS improvements that bring natural-language + * and partial-identifier queries into the top of the result set: + * - Subword tokens (camel/snake split) so `parser` finds `getParser`. + * - Porter stemmer so `parsing` matches `parser`/`parses`. + * - Stopword stripping so `"how"` / `"the"` don't crowd out the + * real terms via docstring matches. + * + * All measurements were captured against codegraph's own src/ during + * development. Targets that previously ranked #18, #19, or weren't in + * the top 20 jump to the top 5. 
+ */ + +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; +import { DatabaseConnection } from '../src/db'; +import { QueryBuilder } from '../src/db/queries'; +import { Node } from '../src/types'; +import { splitIdentifierTokens, buildNameSubwords } from '../src/utils'; +import { filterStopwords, STOP_WORDS } from '../src/search/query-utils'; +import { runMigrations, getCurrentVersion } from '../src/db/migrations'; + +describe('splitIdentifierTokens', () => { + it('splits camelCase', () => { + expect(splitIdentifierTokens('getParser')).toEqual(['get', 'parser']); + }); + + it('splits PascalCase', () => { + expect(splitIdentifierTokens('DatabaseConnection')).toEqual(['database', 'connection']); + }); + + it('splits XMLHttpRequest-style runs of capitals', () => { + expect(splitIdentifierTokens('XMLHttpRequest')).toEqual(['xml', 'http', 'request']); + }); + + it('splits snake_case', () => { + expect(splitIdentifierTokens('database_connection')).toEqual(['database', 'connection']); + }); + + it('splits kebab-case and dots and slashes', () => { + expect(splitIdentifierTokens('foo-bar.baz/qux')).toEqual(['foo', 'bar', 'baz', 'qux']); + }); + + it('keeps single-word identifiers as-is', () => { + expect(splitIdentifierTokens('parse')).toEqual(['parse']); + }); + + it('handles trailing/leading underscores', () => { + expect(splitIdentifierTokens('__init__')).toEqual(['init']); + }); + + it('preserves numbers as part of the surrounding token', () => { + expect(splitIdentifierTokens('parseV2')).toEqual(['parse', 'v2']); + }); +}); + +describe('buildNameSubwords', () => { + it('preserves the original identifier so direct queries still hit', () => { + const out = buildNameSubwords('getParser'); + expect(out.split(' ')).toContain('getParser'); + }); + + it('appends split tokens', () => { + const out = buildNameSubwords('getParser').split(' '); + expect(out).toContain('get'); + expect(out).toContain('parser'); + }); + + it('dedupes single-word identifiers (no "parse parse")', () => { + expect(buildNameSubwords('parse')).toBe('parse'); + }); + + it('dedupes when split produces a single token equal to the original', () => { + // 'foo' has no boundary, so splitIdentifierTokens returns ['foo']; + // without dedup we would store 'foo foo'. + const out = buildNameSubwords('foo').split(' '); + expect(out).toEqual(['foo']); + }); + + it('handles empty string without crashing', () => { + expect(buildNameSubwords('')).toBe(''); + }); +}); + +describe('filterStopwords (shared with query-utils.ts)', () => { + it('drops common English stopwords', () => { + expect(filterStopwords(['how', 'does', 'parsing', 'work'])) + // 'work' is also in STOP_WORDS, so the result is just 'parsing' + .toEqual(['parsing']); + }); + + it('returns the original list when every term is a stopword', () => { + // Otherwise we would produce an empty FTS query. + const allStopwords = ['the', 'a', 'an']; + expect(filterStopwords(allStopwords)).toEqual(allStopwords); + }); + + it('does not strip common identifier-like words', () => { + // `get` / `set` / `find` could be method names; never treated as stopwords. 
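+    // e.g. in a query like 'find user', 'find' is signal, not noise.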
+ expect(filterStopwords(['get', 'set', 'find', 'name'])) + .toEqual(['get', 'set', 'find', 'name']); + expect(STOP_WORDS.has('get')).toBe(false); + }); +}); + +describe('FTS5 search quality (integration)', () => { + let dir: string; + let db: DatabaseConnection; + let q: QueryBuilder; + + function makeNode(id: string, name: string, kind: Node['kind'], docstring?: string): Node { + return { + id, + kind, + name, + qualifiedName: name, + filePath: `src/${name}.ts`, + language: 'typescript', + startLine: 1, + endLine: 1, + startColumn: 0, + endColumn: 0, + docstring, + updatedAt: Date.now(), + }; + } + + beforeEach(() => { + dir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-search-quality-')); + db = DatabaseConnection.initialize(path.join(dir, 'test.db')); + q = new QueryBuilder(db.getDb()); + }); + + afterEach(() => { + db.close(); + if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true }); + }); + + it('finds getParser for a `parser` query (subword tokens)', () => { + q.insertNodes([ + makeNode('n1', 'getParser', 'function'), + makeNode('n2', 'unrelated', 'function'), + ]); + const results = q.searchNodes('parser', { limit: 10 }); + expect(results.find((r) => r.node.name === 'getParser')).toBeDefined(); + }); + + it('finds DatabaseConnection for a `connection` query (subword tokens)', () => { + q.insertNodes([ + makeNode('n1', 'DatabaseConnection', 'class'), + makeNode('n2', 'unrelated', 'function'), + ]); + const results = q.searchNodes('connection', { limit: 10 }); + expect(results.find((r) => r.node.name === 'DatabaseConnection')).toBeDefined(); + }); + + it('matches `parsing` against `getParser` via Porter stemmer', () => { + q.insertNodes([ + makeNode('n1', 'getParser', 'function'), + makeNode('n2', 'unrelated', 'function'), + ]); + const results = q.searchNodes('parsing', { limit: 10 }); + expect(results.find((r) => r.node.name === 'getParser')).toBeDefined(); + }); + + it('matches `resolves references` against resolveOne', () => { + q.insertNodes([ + makeNode('n1', 'resolveOne', 'method'), + makeNode('n2', 'unrelated', 'function'), + ]); + const results = q.searchNodes('resolves references', { limit: 10 }); + expect(results.find((r) => r.node.name === 'resolveOne')).toBeDefined(); + }); + + it('strips stopwords so `how does parser work` finds getParser', () => { + // Without stopword stripping the docstring of `unrelated` (containing + // "how" and "does") would BM25-flood the result list. + q.insertNodes([ + makeNode('n1', 'getParser', 'function'), + makeNode( + 'n2', + 'unrelated', + 'function', + 'How does this work? It does many things — does, does, does.' 
+      ),
+    ]);
+    const results = q.searchNodes('how does parser work', { limit: 10 });
+    // (Explicit type args: TS won't infer [string, number] tuples from a
+    // bare .map callback.)
+    const ranks = new Map<string, number>(results.map((r, i) => [r.node.name, i + 1]));
+    const parserRank = ranks.get('getParser');
+    const unrelatedRank = ranks.get('unrelated');
+    expect(parserRank).toBeDefined();
+    if (unrelatedRank !== undefined) {
+      expect(parserRank).toBeLessThan(unrelatedRank);
+    }
+  });
+
+  it('exact identifier search still works (no regression on direct queries)', () => {
+    q.insertNodes([
+      makeNode('n1', 'ExtractionOrchestrator', 'class'),
+      makeNode('n2', 'extraction', 'variable'),
+      makeNode('n3', 'orchestrator', 'variable'),
+    ]);
+    const results = q.searchNodes('ExtractionOrchestrator', { limit: 10 });
+    expect(results[0].node.name).toBe('ExtractionOrchestrator');
+  });
+});
+
+describe('Migration v4: backfill name_subwords + rebuild FTS', () => {
+  let dir: string;
+
+  beforeEach(() => {
+    dir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-migr-v4-fts-'));
+  });
+
+  afterEach(() => {
+    if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true });
+  });
+
+  it('rebuilds FTS so subword search works on previously-indexed nodes', () => {
+    // Build a v3-shape database from explicit SQL — the pre-PR schema —
+    // then run forward migrations and verify search works end-to-end.
+    // This is a faithful simulation of an upgrade from a real v3 install.
+    const Database = require('better-sqlite3');
+    const dbHandle = new Database(path.join(dir, 'test.db'));
+    dbHandle.pragma('foreign_keys = ON');
+    dbHandle.exec(`
+      CREATE TABLE schema_versions (version INTEGER PRIMARY KEY, applied_at INTEGER NOT NULL, description TEXT);
+      INSERT INTO schema_versions (version, applied_at, description) VALUES (3, 0, 'v3');
+      CREATE TABLE nodes (
+        id TEXT PRIMARY KEY, kind TEXT NOT NULL, name TEXT NOT NULL,
+        qualified_name TEXT NOT NULL, file_path TEXT NOT NULL, language TEXT NOT NULL,
+        start_line INTEGER NOT NULL, end_line INTEGER NOT NULL,
+        start_column INTEGER NOT NULL, end_column INTEGER NOT NULL,
+        docstring TEXT, signature TEXT, visibility TEXT,
+        is_exported INTEGER DEFAULT 0, is_async INTEGER DEFAULT 0,
+        is_static INTEGER DEFAULT 0, is_abstract INTEGER DEFAULT 0,
+        decorators TEXT, type_parameters TEXT, updated_at INTEGER NOT NULL
+      );
+      CREATE VIRTUAL TABLE nodes_fts USING fts5(
+        id, name, qualified_name, docstring, signature,
+        content='nodes', content_rowid='rowid'
+      );
+      CREATE TRIGGER nodes_ai AFTER INSERT ON nodes BEGIN
+        INSERT INTO nodes_fts(rowid, id, name, qualified_name, docstring, signature)
+        VALUES (NEW.rowid, NEW.id, NEW.name, NEW.qualified_name, NEW.docstring, NEW.signature);
+      END;
+      INSERT INTO nodes (id, kind, name, qualified_name, file_path, language,
+        start_line, end_line, start_column, end_column, updated_at)
+      VALUES ('n1', 'function', 'getParser', 'getParser', 'a.ts', 'typescript', 1, 1, 0, 0, 0);
+    `);
+
+    expect(getCurrentVersion(dbHandle)).toBe(3);
+
+    // Apply forward migrations (4..N including the FTS-subwords pass).
+    runMigrations(dbHandle, 3);
+    expect(getCurrentVersion(dbHandle)).toBeGreaterThanOrEqual(9);
+
+    // The new column was backfilled with the split subwords.
+    const row = dbHandle.prepare('SELECT name_subwords FROM nodes WHERE id = ?').get('n1') as {
+      name_subwords: string;
+    };
+    expect(row.name_subwords).toContain('parser');
+
+    // Search end-to-end via QueryBuilder works against the migrated DB.
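+    // This also shows searchNodes hits the rebuilt FTS index: the
+    // backfilled name_subwords column alone would not make `parser`
+    // match `getParser`.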
+    const q2 = new QueryBuilder(dbHandle);
+    const results = q2.searchNodes('parser', { limit: 10 });
+    expect(results.find((r) => r.node.name === 'getParser')).toBeDefined();
+
+    dbHandle.close();
+  });
+
+  it('migration is idempotent if name_subwords column already exists', () => {
+    // Simulate a partial-failure scenario: the ALTER TABLE landed but the
+    // rest of the migration didn't (e.g. a crash between statements when
+    // the migration isn't wrapped in a single transaction), so the column
+    // is present but the FTS hasn't been recreated and the schema_versions
+    // row hasn't been bumped.
+    const Database = require('better-sqlite3');
+    const dbHandle = new Database(path.join(dir, 'test.db'));
+    dbHandle.exec(`
+      CREATE TABLE schema_versions (version INTEGER PRIMARY KEY, applied_at INTEGER NOT NULL, description TEXT);
+      INSERT INTO schema_versions (version, applied_at, description) VALUES (3, 0, 'v3');
+      CREATE TABLE nodes (
+        id TEXT PRIMARY KEY, kind TEXT NOT NULL, name TEXT NOT NULL,
+        qualified_name TEXT NOT NULL, file_path TEXT NOT NULL, language TEXT NOT NULL,
+        start_line INTEGER NOT NULL, end_line INTEGER NOT NULL,
+        start_column INTEGER NOT NULL, end_column INTEGER NOT NULL,
+        docstring TEXT, signature TEXT, visibility TEXT,
+        is_exported INTEGER DEFAULT 0, is_async INTEGER DEFAULT 0,
+        is_static INTEGER DEFAULT 0, is_abstract INTEGER DEFAULT 0,
+        decorators TEXT, type_parameters TEXT, updated_at INTEGER NOT NULL,
+        name_subwords TEXT -- partial pre-existing state
+      );
+    `);
+    expect(() => runMigrations(dbHandle, 3)).not.toThrow();
+    expect(getCurrentVersion(dbHandle)).toBeGreaterThanOrEqual(9);
+    dbHandle.close();
+  });
+});
diff --git a/__tests__/security.test.ts b/__tests__/security.test.ts
index 53441d58..1c62e648 100644
--- a/__tests__/security.test.ts
+++ b/__tests__/security.test.ts
@@ -533,3 +533,36 @@ describe('Symlink Cycle Detection', () => {
     expect(files).toContain('src/valid.ts');
   });
 });
+
+describe('ReDoS-safe glob matching', () => {
+  it('coalesces runs of `*` so hostile inputs do not produce nested quantifiers', async () => {
+    const { globToSafeRegex } = await import('../src/utils');
+    // Two or more stars collapse to a single recursive wildcard. This is the
+    // ReDoS protection: `*****` doesn't expand to `[^/]*[^/]*[^/]*[^/]*[^/]*`,
+    // which on a long input could catastrophically backtrack.
+    expect(globToSafeRegex('*****')).toBe('.*');
+    expect(globToSafeRegex('**')).toBe('.*');
+
+    // Even a regex constructed from hostile input matches in linear time.
+    const regex = new RegExp(`^${globToSafeRegex('*****')}foo$`);
+    const start = Date.now();
+    // 100k 'a's followed by something that doesn't end in 'foo'.
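+    // `.*foo` fails in a single linear scan; the uncollapsed
+    // `[^/]*[^/]*[^/]*[^/]*[^/]*foo` form would give a backtracking engine
+    // combinatorially many ways to split the run of 'a's across quantifiers.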
+    expect(regex.test('a'.repeat(100000) + 'bar')).toBe(false);
+    expect(Date.now() - start).toBeLessThan(500);
+  });
+
+  it('rejects pathologically long glob inputs', async () => {
+    const { globToSafeRegex } = await import('../src/utils');
+    expect(globToSafeRegex('*'.repeat(2000))).toBeNull();
+  });
+
+  it('preserves the standard glob semantics for common patterns', async () => {
+    const { globToSafeRegex } = await import('../src/utils');
+    const body = globToSafeRegex('src/**/*.test.ts');
+    // toBeDefined() would pass for null; assert non-null explicitly since
+    // globToSafeRegex returns null for rejected inputs.
+    expect(body).not.toBeNull();
+    const regex = new RegExp(`^${body}$`);
+    expect(regex.test('src/lib/foo.test.ts')).toBe(true);
+    expect(regex.test('src/lib/foo.ts')).toBe(false);
+    expect(regex.test('other/src/foo.test.ts')).toBe(false);
+  });
+});
diff --git a/__tests__/sql-refs.test.ts b/__tests__/sql-refs.test.ts
new file mode 100644
index 00000000..0a05a31b
--- /dev/null
+++ b/__tests__/sql-refs.test.ts
@@ -0,0 +1,339 @@
+/**
+ * SQL call-site tests: parser unit tests + end-to-end through CodeGraph.
+ */
+import { describe, it, expect, beforeEach, afterEach } from 'vitest';
+import * as fs from 'fs';
+import * as os from 'os';
+import * as path from 'path';
+import { extractSqlRefs } from '../src/sql-refs';
+import CodeGraph from '../src/index';
+
+let testDir: string;
+let cg: CodeGraph | null = null;
+
+function write(rel: string, content: string) {
+  const abs = path.join(testDir, rel);
+  fs.mkdirSync(path.dirname(abs), { recursive: true });
+  fs.writeFileSync(abs, content);
+}
+
+beforeEach(() => {
+  testDir = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-sql-'));
+});
+
+afterEach(() => {
+  if (cg) {
+    cg.destroy();
+    cg = null;
+  }
+  if (fs.existsSync(testDir)) fs.rmSync(testDir, { recursive: true, force: true });
+});
+
+// ============================================================================
+// Pure parser tests
+// ============================================================================
+
+describe('extractSqlRefs', () => {
+  it('captures FROM as a read', () => {
+    write('a.ts', `db.prepare('SELECT id FROM users WHERE id = ?');\n`);
+    const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null);
+    expect(refs).toHaveLength(1);
+    expect(refs[0]!).toMatchObject({ tableName: 'users', op: 'read' });
+  });
+
+  it('captures INSERT INTO as a write', () => {
+    write('a.ts', `db.prepare('INSERT INTO logs (msg) VALUES (?)');\n`);
+    const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null);
+    expect(refs).toHaveLength(1);
+    expect(refs[0]!).toMatchObject({ tableName: 'logs', op: 'write' });
+  });
+
+  it('captures UPDATE ... SET as a write', () => {
+    write('a.ts', `db.run('UPDATE users SET name = ? WHERE id = ?', ['x', 1]);\n`);
+    const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null);
+    expect(refs).toHaveLength(1);
+    expect(refs[0]!).toMatchObject({ tableName: 'users', op: 'write' });
+  });
+
+  it('captures DELETE FROM as a write (the overlapping FROM also yields a read)', () => {
+    write('a.ts', `db.run('DELETE FROM sessions WHERE expired_at < ?');\n`);
+    const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null);
+    // Both regexes (DELETE FROM as write, FROM as read) hit, so we expect
+    // two refs for the same table but different ops.
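+    // i.e. 'DELETE FROM sessions' produces the write, and the embedded
+    // 'FROM sessions' produces the read.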
+ expect(refs.map((r) => r.op).sort()).toEqual(['read', 'write']); + expect(new Set(refs.map((r) => r.tableName))).toEqual(new Set(['sessions'])); + }); + + it('captures CREATE TABLE / ALTER / DROP as ddl', () => { + write( + 'a.ts', + [ + `db.exec('CREATE TABLE IF NOT EXISTS audit (id INTEGER)');`, + `db.exec('ALTER TABLE audit ADD COLUMN ts INTEGER');`, + `db.exec('DROP TABLE IF EXISTS audit_old');`, + ].join('\n') + ); + const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null); + const ddls = refs.filter((r) => r.op === 'ddl'); + expect(new Set(ddls.map((r) => r.tableName))).toEqual(new Set(['audit', 'audit_old'])); + }); + + it('captures JOIN as a read', () => { + write( + 'a.ts', + `db.prepare('SELECT u.name, p.title FROM users u JOIN posts p ON p.user_id = u.id');\n` + ); + const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null); + const tables = new Set(refs.map((r) => r.tableName)); + expect(tables).toEqual(new Set(['users', 'posts'])); + }); + + it('handles backtick (MySQL) and double-quoted (Postgres) identifiers', () => { + write( + 'a.ts', + [ + "db.prepare('SELECT id FROM `mysql_table`');", + `db.prepare('SELECT id FROM "pg_table"');`, + ].join('\n') + ); + const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null); + expect(new Set(refs.map((r) => r.tableName))).toEqual( + new Set(['mysql_table', 'pg_table']) + ); + }); + + it('handles schema-qualified identifiers (drops the schema, keeps the table)', () => { + write('a.ts', `db.prepare('SELECT * FROM public.users');\n`); + const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null); + expect(refs[0]!.tableName).toBe('users'); + }); + + it('does NOT match a JS variable named like a SQL keyword', () => { + // Without the FROM/INTO/etc. prefix, a bare identifier `users` is + // not caught — that's the whole point vs. plain grep. + write('a.ts', `const users = await loadUsers();\nfor (const user of users) {}\n`); + const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null); + expect(refs).toEqual([]); + }); + + it('skips unsupported languages (e.g. swift) without error', () => { + write('a.swift', `let q = "SELECT id FROM users"\n`); + const refs = extractSqlRefs(testDir, [{ path: 'a.swift', language: 'swift' }], () => null); + expect(refs).toEqual([]); + }); + + it('captures the correct 1-indexed line number', () => { + write( + 'a.ts', + [`// blah`, `// blah`, `db.prepare('SELECT * FROM line_three');`, `// blah`].join('\n') + ); + const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null); + expect(refs[0]).toEqual(expect.objectContaining({ tableName: 'line_three', line: 3 })); + }); + + it('threads the resolveEnclosing closure correctly', () => { + write('a.ts', `db.prepare('SELECT * FROM t');\n`); + const calls: Array<[string, number]> = []; + extractSqlRefs( + testDir, + [{ path: 'a.ts', language: 'typescript' }], + (filePath, line) => { + calls.push([filePath, line]); + return 'fake-id'; + } + ); + expect(calls).toEqual([['a.ts', 1]]); + }); + + it('drops reserved-word "table names" (WHERE/ON/AS/SELECT)', () => { + // Common over-match: `JOIN ... ON x = y` would otherwise pick up + // `ON` as the table name. The reserved set blocks that. 
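+    // If that set regressed, the token after ON would surface below as a
+    // phantom table.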
+ write('a.ts', `db.prepare('SELECT * FROM users JOIN posts ON posts.uid = users.id');\n`); + const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null); + const names = new Set(refs.map((r) => r.tableName)); + expect(names).toEqual(new Set(['users', 'posts'])); + }); + + it('handles multiple SQL operations on a single line', () => { + write( + 'a.ts', + `db.exec('CREATE TABLE foo (id INTEGER); INSERT INTO foo VALUES (1)');\n` + ); + const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null); + const ops = new Set(refs.map((r) => `${r.tableName}|${r.op}`)); + expect(ops).toEqual(new Set(['foo|ddl', 'foo|write'])); + }); + + it('survives a missing file (skips, no throw)', () => { + const refs = extractSqlRefs( + testDir, + [{ path: 'missing.ts', language: 'typescript' }], + () => null + ); + expect(refs).toEqual([]); + }); + + it('rejects prose comments containing a quoted SQL example', () => { + // Reviewer-flagged regression: a comment like + // // example: db.prepare('SELECT name FROM the docs') + // used to falsely match `the` as a table because the quote inside + // the comment passed isInsideString(). The comment-stripper now + // removes everything after `//` before the regex sees the line. + write( + 'a.ts', + [ + `// example: db.prepare('SELECT name FROM the docs')`, + `// "SELECT id FROM the comment"`, + `function ok() {`, + ` // sample SELECT FROM users in a comment — should be ignored`, + ` return 1;`, + `}`, + ].join('\n') + ); + const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null); + expect(refs).toEqual([]); + }); + + it('rejects same-line block comments containing a quoted SQL example', () => { + write( + 'a.ts', + `/* "SELECT * FROM ghost" */ const x = 1;\n` + ); + const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null); + expect(refs).toEqual([]); + }); + + it('still keeps a real SQL call when there is a trailing comment', () => { + write('a.ts', `db.prepare('SELECT * FROM users'); // good doc\n`); + const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null); + expect(refs.length).toBe(1); + expect(refs[0]!.tableName).toBe('users'); + }); + + it('strips Python `#` comments', () => { + write( + 'a.py', + `# example: db.execute('SELECT * FROM the_docs')\nrows = db.execute('SELECT * FROM real_table')\n` + ); + const refs = extractSqlRefs(testDir, [{ path: 'a.py', language: 'python' }], () => null); + expect(refs.map((r) => r.tableName)).toEqual(['real_table']); + }); +}); + +// ============================================================================ +// End-to-end through CodeGraph +// ============================================================================ + +describe('CodeGraph SQL refs', () => { + it('persists call sites and resolves enclosing function', async () => { + write( + 'src/db.ts', + [ + `export function getUser(id: number) {`, + ` return db.prepare('SELECT * FROM users WHERE id = ?').get(id);`, + `}`, + ``, + `export function logEvent(msg: string) {`, + ` db.prepare('INSERT INTO events (msg) VALUES (?)').run(msg);`, + `}`, + ].join('\n') + ); + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + + const tables = cg.getSqlTables(); + expect(new Set(tables.map((t) => t.tableName))).toEqual(new Set(['users', 'events'])); + + const userSites = cg.getSqlRefsByTable('users'); + expect(userSites[0]!.sourceName).toBe('getUser'); 
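+    // (The SELECT sits on line 2 of src/db.ts, inside getUser's span;
+    // that is what the enclosing-symbol lookup keys on.)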
+ + const eventSites = cg.getSqlRefsByTable('events'); + expect(eventSites[0]!.sourceName).toBe('logEvent'); + expect(eventSites[0]!.op).toBe('write'); + }); + + it('reverse view: getSqlTablesForNode returns tables touched by a function', async () => { + write( + 'src/a.ts', + [ + `export function multiTouch() {`, + ` db.prepare('SELECT * FROM users').all();`, + ` db.prepare('INSERT INTO orders VALUES (?)').run(1);`, + `}`, + ].join('\n') + ); + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + + const node = cg.getNodesInFile('src/a.ts').find((n) => n.name === 'multiTouch')!; + const touched = cg.getSqlTablesForNode(node.id); + const summary = touched.map((r) => `${r.tableName}|${r.op}`).sort(); + expect(summary).toEqual(['orders|write', 'users|read']); + }); + + it('case-insensitive table lookup', async () => { + write('src/a.ts', `db.prepare('SELECT * FROM Users');\n`); + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + expect(cg.getSqlRefsByTable('users').length).toBe(1); + expect(cg.getSqlRefsByTable('USERS').length).toBe(1); + }); + + it('respects enableSqlRefs=false', async () => { + write('src/a.ts', `db.prepare('SELECT * FROM users');\n`); + cg = CodeGraph.initSync(testDir, { + config: { include: ['**/*.ts'], exclude: [], enableSqlRefs: false }, + }); + await cg.indexAll(); + expect(cg.getSqlTables()).toEqual([]); + }); + + it('incremental sync replaces refs for changed files only', async () => { + write('src/a.ts', `db.prepare('SELECT * FROM old_table');\n`); + write('src/b.ts', `db.prepare('SELECT * FROM stable_table');\n`); + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + expect(new Set(cg.getSqlTables().map((t) => t.tableName))).toEqual( + new Set(['old_table', 'stable_table']) + ); + + write('src/a.ts', `db.prepare('SELECT * FROM new_table');\n`); + await cg.sync(); + + const tables = new Set(cg.getSqlTables().map((t) => t.tableName)); + expect(tables).toContain('new_table'); + expect(tables).toContain('stable_table'); + expect(tables).not.toContain('old_table'); + }); + + it('drops refs when a file is edited to remove its last SQL ref', async () => { + // Same regression as PR C — applySqlRefs([]) shouldn't leave + // stale rows. Pre-deleting the changed paths in runSqlRefsPass + // is the fix. + write('src/a.ts', `db.prepare('SELECT * FROM going_away');\n`); + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + expect(cg.getSqlTables().some((t) => t.tableName === 'going_away')).toBe(true); + + write('src/a.ts', `// no sql here anymore\nexport const x = 1;\n`); + await cg.sync(); + + expect(cg.getSqlTables().some((t) => t.tableName === 'going_away')).toBe(false); + }); + + it('drops refs for files removed between syncs', async () => { + write('src/a.ts', `db.prepare('SELECT * FROM gone_table');\n`); + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + expect(cg.getSqlTables().some((t) => t.tableName === 'gone_table')).toBe(true); + + fs.unlinkSync(path.join(testDir, 'src/a.ts')); + await cg.sync(); + expect(cg.getSqlTables().some((t) => t.tableName === 'gone_table')).toBe(false); + }); + + // (Removed: a defensive test for the v4-migration-collision bug class. 
+ // With file-based migrations (NNN-name.ts), two PRs claiming the same + // version produces a filesystem-level conflict, so the silent skip the + // defensive guard protected against can no longer happen.) +}); diff --git a/__tests__/sync.test.ts b/__tests__/sync.test.ts index 8365f630..115d078b 100644 --- a/__tests__/sync.test.ts +++ b/__tests__/sync.test.ts @@ -259,4 +259,242 @@ describe('Sync Module', () => { expect(result.changedFilePaths).toBeUndefined(); }); }); + + // Regression tests for the "stale index after HEAD-moving git operation" + // bug. `git status` only reports working-tree dirtiness vs HEAD, so a + // merge / pull / checkout / rebase / reset (and even post-commit) leaves + // a clean tree and used to trick sync into reporting "up to date" while + // the DB still held pre-operation content hashes. The fix detects HEAD + // movement by comparing current HEAD against a stored last-synced HEAD + // and unioning `git diff` output into the changed-file set. + describe('HEAD-moving git operations', () => { + let testDir: string; + let cg: CodeGraph; + + function git(...args: string[]) { + execFileSync('git', args, { cwd: testDir, stdio: 'pipe' }); + } + + beforeEach(async () => { + testDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-head-move-')); + + git('init'); + git('config', 'user.email', 'test@test.com'); + git('config', 'user.name', 'Test'); + git('symbolic-ref', 'HEAD', 'refs/heads/main'); + + const srcDir = path.join(testDir, 'src'); + fs.mkdirSync(srcDir); + fs.writeFileSync( + path.join(srcDir, 'index.ts'), + `export function hello() { return 'world'; }` + ); + + git('add', '-A'); + git('commit', '-m', 'initial'); + + cg = CodeGraph.initSync(testDir, { + config: { include: ['**/*.ts'], exclude: [] }, + }); + await cg.indexAll(); + }); + + afterEach(() => { + if (cg) cg.destroy(); + if (fs.existsSync(testDir)) { + fs.rmSync(testDir, { recursive: true, force: true }); + } + }); + + it('should detect changes brought in by `git merge`', async () => { + git('checkout', '-b', 'feature'); + fs.writeFileSync( + path.join(testDir, 'src', 'index.ts'), + `export function merged() { return 'from-branch'; }` + ); + fs.writeFileSync( + path.join(testDir, 'src', 'added.ts'), + `export function fromBranch() { return 1; }` + ); + git('add', '-A'); + git('commit', '-m', 'feature work'); + git('checkout', 'main'); + git('merge', '--no-ff', 'feature', '-m', 'merge feature'); + + const result = await cg.sync(); + + expect(result.filesModified + result.filesAdded).toBeGreaterThanOrEqual(2); + expect(cg.searchNodes('merged').length).toBeGreaterThan(0); + expect(cg.searchNodes('fromBranch').length).toBeGreaterThan(0); + expect(cg.searchNodes('hello').length).toBe(0); + }); + + it('should detect changes after `git checkout` to a different branch', async () => { + git('checkout', '-b', 'other'); + fs.writeFileSync( + path.join(testDir, 'src', 'index.ts'), + `export function onOther() { return 'other'; }` + ); + git('add', '-A'); + git('commit', '-m', 'other work'); + git('checkout', 'main'); + git('checkout', 'other'); + + const result = await cg.sync(); + + expect(result.filesModified).toBeGreaterThanOrEqual(1); + expect(cg.searchNodes('onOther').length).toBeGreaterThan(0); + expect(cg.searchNodes('hello').length).toBe(0); + }); + + it('should detect file deletion brought in by a committed change', async () => { + git('rm', path.join('src', 'index.ts')); + git('commit', '-m', 'remove index'); + + const result = await cg.sync(); + + expect(result.filesRemoved).toBe(1); + 
expect(cg.searchNodes('hello').length).toBe(0); + }); + + it('should fall back to full scan when last-synced HEAD is unreachable', async () => { + fs.writeFileSync( + path.join(testDir, 'src', 'index.ts'), + `export function rewritten() { return 'rewritten'; }` + ); + git('add', '-A'); + git('commit', '--amend', '-m', 'rewritten'); + const result = await cg.sync(); + + expect(result.filesModified + result.filesAdded).toBeGreaterThanOrEqual(1); + expect(cg.searchNodes('rewritten').length).toBeGreaterThan(0); + expect(cg.searchNodes('hello').length).toBe(0); + }); + + it('should still no-op when HEAD has not moved and tree is clean', async () => { + const result = await cg.sync(); + + expect(result.filesAdded).toBe(0); + expect(result.filesModified).toBe(0); + expect(result.filesRemoved).toBe(0); + }); + }); + + describe('Git submodule support', () => { + let parentDir: string; + let submoduleSrc: string; + let cg: CodeGraph; + + function git(cwd: string, ...args: string[]) { + execFileSync('git', args, { cwd, stdio: 'pipe' }); + } + + beforeEach(async () => { + parentDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-submod-parent-')); + submoduleSrc = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-submod-src-')); + + git(submoduleSrc, 'init'); + git(submoduleSrc, 'config', 'user.email', 'test@test.com'); + git(submoduleSrc, 'config', 'user.name', 'Test'); + fs.writeFileSync( + path.join(submoduleSrc, 'lib.ts'), + `export function fromSubmodule() { return 'sub'; }` + ); + git(submoduleSrc, 'add', '-A'); + git(submoduleSrc, 'commit', '-m', 'submodule initial'); + + git(parentDir, 'init'); + git(parentDir, 'config', 'user.email', 'test@test.com'); + git(parentDir, 'config', 'user.name', 'Test'); + + const parentSrc = path.join(parentDir, 'src'); + fs.mkdirSync(parentSrc); + fs.writeFileSync( + path.join(parentSrc, 'main.ts'), + `export function fromParent() { return 'parent'; }` + ); + + git(parentDir, '-c', 'protocol.file.allow=always', 'submodule', 'add', submoduleSrc, 'vendor/sub'); + git(parentDir, 'add', '-A'); + git(parentDir, 'commit', '-m', 'parent initial with submodule'); + + cg = CodeGraph.initSync(parentDir, { + config: { + include: ['**/*.ts'], + exclude: [], + }, + }); + }); + + afterEach(() => { + if (cg) cg.destroy(); + if (fs.existsSync(parentDir)) fs.rmSync(parentDir, { recursive: true, force: true }); + if (fs.existsSync(submoduleSrc)) fs.rmSync(submoduleSrc, { recursive: true, force: true }); + }); + + it('should index files inside a submodule on full index', async () => { + const result = await cg.indexAll(); + + expect(result.filesIndexed).toBeGreaterThanOrEqual(2); + const subNodes = cg.searchNodes('fromSubmodule'); + const parentNodes = cg.searchNodes('fromParent'); + expect(subNodes.length).toBeGreaterThan(0); + expect(parentNodes.length).toBeGreaterThan(0); + expect(subNodes.some((r) => r.node.filePath.startsWith('vendor/sub/'))).toBe(true); + }); + + it('should detect modifications to files inside a submodule via sync', async () => { + await cg.indexAll(); + + fs.writeFileSync( + path.join(parentDir, 'vendor/sub/lib.ts'), + `export function fromSubmodule() { return 'changed'; }` + ); + + const result = await cg.sync(); + + expect(result.filesModified).toBe(1); + expect(result.changedFilePaths).toContain('vendor/sub/lib.ts'); + }); + + it('should detect new untracked files inside a submodule via sync', async () => { + await cg.indexAll(); + + fs.writeFileSync( + path.join(parentDir, 'vendor/sub/newfile.ts'), + `export function added() { return 1; }` + ); + + 
+      const result = await cg.sync();
+
+      expect(result.filesAdded).toBe(1);
+      expect(result.changedFilePaths).toContain('vendor/sub/newfile.ts');
+    });
+
+    it('should not break when a submodule directory is missing or empty', async () => {
+      fs.rmSync(path.join(parentDir, 'vendor/sub'), { recursive: true, force: true });
+      fs.mkdirSync(path.join(parentDir, 'vendor/sub'));
+
+      const result = await cg.indexAll();
+      expect(result.errors.filter((e) => e.severity === 'error').length).toBe(0);
+      expect(cg.searchNodes('fromParent').length).toBeGreaterThan(0);
+    });
+
+    it('should skip submodule contents when indexSubmodules is false', async () => {
+      cg.destroy();
+      fs.rmSync(path.join(parentDir, '.codegraph'), { recursive: true, force: true });
+      cg = CodeGraph.initSync(parentDir, {
+        config: {
+          include: ['**/*.ts'],
+          exclude: [],
+          indexSubmodules: false,
+        },
+      });
+
+      const result = await cg.indexAll();
+      expect(cg.searchNodes('fromParent').length).toBeGreaterThan(0);
+      expect(cg.searchNodes('fromSubmodule').length).toBe(0);
+      expect(result.filesIndexed).toBe(1);
+    });
+  });
 });
diff --git a/__tests__/tests-edges.test.ts b/__tests__/tests-edges.test.ts
new file mode 100644
index 00000000..abc300fb
--- /dev/null
+++ b/__tests__/tests-edges.test.ts
@@ -0,0 +1,248 @@
+/**
+ * Tests-as-Edges Tests
+ *
+ * Verifies the convention-based test→subject file resolver and the
+ * `tests` edges it produces:
+ * - All recognized test naming conventions (Jest/Vitest, pytest,
+ *   Go, RSpec, JUnit/xUnit, Quick/Spek)
+ * - The four-step resolution strategy (co-located, mirrored,
+ *   common source roots, basename-anywhere)
+ * - End-to-end via CodeGraph: indexAll populates `tests` edges,
+ *   sync incrementally refreshes them, getTestsForFile and
+ *   getSubjectsOfTest return the expected file records.
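+ *
+ * Note: the resolver works from file paths alone — findTestSubjects(testPath,
+ * allFiles) — so no parsing or import analysis is needed to build the edges.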
+ */
+
+import { describe, it, expect, beforeEach, afterEach } from 'vitest';
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+import {
+  testSubjectBasename,
+  isTestFile,
+  findTestSubjects,
+} from '../src/tests-edges';
+import CodeGraph from '../src/index';
+
+describe('testSubjectBasename', () => {
+  it('recognizes JS/TS .test and .spec suffixes', () => {
+    expect(testSubjectBasename('foo.test.ts')).toBe('foo');
+    expect(testSubjectBasename('foo.spec.tsx')).toBe('foo');
+    expect(testSubjectBasename('Bar.test.js')).toBe('Bar');
+    expect(testSubjectBasename('a/b/foo.test.mjs')).toBe('foo');
+  });
+
+  it('recognizes Python pytest test_foo style', () => {
+    expect(testSubjectBasename('test_foo.py')).toBe('foo');
+    expect(testSubjectBasename('pkg/test_handlers.py')).toBe('handlers');
+  });
+
+  it('recognizes Go and Rust foo_test style', () => {
+    expect(testSubjectBasename('foo_test.go')).toBe('foo');
+    expect(testSubjectBasename('foo_test.rs')).toBe('foo');
+  });
+
+  it('recognizes Ruby foo_spec / foo_test style', () => {
+    expect(testSubjectBasename('foo_spec.rb')).toBe('foo');
+    expect(testSubjectBasename('foo_test.rb')).toBe('foo');
+  });
+
+  it('recognizes xUnit FooTest / FooTests', () => {
+    expect(testSubjectBasename('FooTest.java')).toBe('Foo');
+    expect(testSubjectBasename('FooTests.cs')).toBe('Foo');
+    expect(testSubjectBasename('FooTest.kt')).toBe('Foo');
+  });
+
+  it('recognizes Quick/Spek FooSpec', () => {
+    expect(testSubjectBasename('FooSpec.swift')).toBe('Foo');
+    expect(testSubjectBasename('FooSpec.kt')).toBe('Foo');
+  });
+
+  it('returns null for non-test files', () => {
+    expect(testSubjectBasename('foo.ts')).toBeNull();
+    expect(testSubjectBasename('handler.py')).toBeNull();
+    expect(testSubjectBasename('README.md')).toBeNull();
+    // Doesn't false-positive on similar-looking names
+    expect(testSubjectBasename('contest.ts')).toBeNull();
+    expect(testSubjectBasename('untested.go')).toBeNull();
+  });
+});
+
+describe('isTestFile', () => {
+  it('agrees with testSubjectBasename', () => {
+    expect(isTestFile('foo.test.ts')).toBe(true);
+    expect(isTestFile('foo.ts')).toBe(false);
+  });
+});
+
+describe('findTestSubjects (resolver strategies)', () => {
+  it('1. co-located: foo/foo.test.ts → foo/foo.ts', () => {
+    const all = new Set(['src/foo.ts', 'src/foo.test.ts']);
+    expect(findTestSubjects('src/foo.test.ts', all)).toEqual(['src/foo.ts']);
+  });
+
+  it('1b. co-located: foo/bar.test.ts → foo/bar/index.ts', () => {
+    const all = new Set(['src/bar/index.ts', 'src/bar.test.ts']);
+    expect(findTestSubjects('src/bar.test.ts', all)).toEqual(['src/bar/index.ts']);
+  });
+
+  it('2. mirrored: foo/__tests__/bar.test.ts → foo/bar.ts', () => {
+    const all = new Set(['src/bar.ts', 'src/__tests__/bar.test.ts']);
+    expect(findTestSubjects('src/__tests__/bar.test.ts', all)).toEqual(['src/bar.ts']);
+  });
+
+  it('2b. mirrored to index: __tests__/sync.test.ts → src/sync/index.ts', () => {
+    // Top-level __tests__ doesn't translate to a sibling source root, so
+    // the resolver falls through to step 3 (common source roots).
+    const all = new Set(['src/sync/index.ts', '__tests__/sync.test.ts']);
+    expect(findTestSubjects('__tests__/sync.test.ts', all)).toEqual(['src/sync/index.ts']);
+  });
+
+  it('3. common source roots: __tests__/handler.test.ts → lib/handler.ts', () => {
+    const all = new Set(['lib/handler.ts', '__tests__/handler.test.ts']);
+    expect(findTestSubjects('__tests__/handler.test.ts', all)).toEqual(['lib/handler.ts']);
+  });
+
+  it('4. basename-anywhere with prefix-tiebreaker', () => {
+    const all = new Set([
+      'packages/auth/utils.ts',
+      'packages/billing/utils.ts',
+      'packages/auth/utils.test.ts',
+    ]);
+    // Co-located resolves first → utils.ts in auth wins by directory.
+    expect(findTestSubjects('packages/auth/utils.test.ts', all))
+      .toEqual(['packages/auth/utils.ts']);
+  });
+
+  it('returns [] for tests with no matching subject', () => {
+    const all = new Set(['__tests__/integration.test.ts']);
+    expect(findTestSubjects('__tests__/integration.test.ts', all)).toEqual([]);
+  });
+
+  it('returns [] for non-test files', () => {
+    const all = new Set(['src/foo.ts']);
+    expect(findTestSubjects('src/foo.ts', all)).toEqual([]);
+  });
+
+  it('does not edge a test file back to itself', () => {
+    // Pathological: a file matching the test pattern that also happens
+    // to live where its "subject" would resolve. Should never produce a
+    // self-edge.
+    const all = new Set(['src/foo.test.ts']);
+    expect(findTestSubjects('src/foo.test.ts', all)).toEqual([]);
+  });
+
+  it('handles tsx test files preferring tsx subject before ts', () => {
+    const all = new Set(['src/Component.tsx', 'src/Component.test.tsx']);
+    expect(findTestSubjects('src/Component.test.tsx', all))
+      .toEqual(['src/Component.tsx']);
+  });
+
+  it('matches Go _test convention to subject .go', () => {
+    const all = new Set(['internal/handler.go', 'internal/handler_test.go']);
+    expect(findTestSubjects('internal/handler_test.go', all))
+      .toEqual(['internal/handler.go']);
+  });
+
+  it('matches Python test_ convention to subject .py', () => {
+    const all = new Set(['app/handlers.py', 'tests/test_handlers.py']);
+    expect(findTestSubjects('tests/test_handlers.py', all))
+      .toEqual(['app/handlers.py']);
+  });
+
+  it('strips top-level tests/ prefix when computing the mirrored subject path', () => {
+    // Regression: previously the mirroring regex only matched `/tests/`
+    // (slash-prefixed), so a top-level `tests/` directory wasn't stripped
+    // and the multi-root fallback (src/lib/app/...) never fired. With a
+    // decoy `tests/handlers.py` present, the resolver would have wrongly
+    // picked it via the basename-anywhere step instead of the real subject
+    // under `lib/`.
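+    // The fixture below reproduces that exact layout, decoy included: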
+    const all = new Set([
+      'lib/handlers.py',
+      'tests/handlers.py', // decoy
+      'tests/test_handlers.py',
+    ]);
+    expect(findTestSubjects('tests/test_handlers.py', all))
+      .toContain('lib/handlers.py');
+  });
+
+  it('strips top-level spec/ prefix similarly', () => {
+    const all = new Set(['app/order.rb', 'spec/order_spec.rb']);
+    expect(findTestSubjects('spec/order_spec.rb', all))
+      .toEqual(['app/order.rb']);
+  });
+});
+
+describe('CodeGraph end-to-end (tests edges wired into indexAll/sync)', () => {
+  let dir: string;
+  let cg: CodeGraph;
+
+  beforeEach(async () => {
+    dir = fs.mkdtempSync(path.join(os.tmpdir(), 'tests-edges-e2e-'));
+    fs.mkdirSync(path.join(dir, 'src'));
+    fs.mkdirSync(path.join(dir, 'src', 'sync'));
+    fs.mkdirSync(path.join(dir, '__tests__'));
+    // Subject files
+    fs.writeFileSync(path.join(dir, 'src', 'sync', 'index.ts'), 'export const sync = 1;');
+    fs.writeFileSync(path.join(dir, 'src', 'sync', 'watcher.ts'), 'export const watcher = 1;');
+    fs.writeFileSync(path.join(dir, 'src', 'utils.ts'), 'export const utils = 1;');
+    // Tests
+    fs.writeFileSync(path.join(dir, '__tests__', 'sync.test.ts'), 'import { sync } from "../src/sync"; export {};');
+    fs.writeFileSync(path.join(dir, 'src', 'sync', 'watcher.test.ts'), 'import { watcher } from "./watcher"; export {};');
+    // Feature-themed test (no single subject)
+    fs.writeFileSync(path.join(dir, '__tests__', 'integration.test.ts'), 'export {};');
+
+    cg = CodeGraph.initSync(dir, { config: { include: ['**/*.ts'], exclude: [] } });
+    await cg.indexAll();
+  });
+
+  afterEach(() => {
+    if (cg) cg.destroy();
+    if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true });
+  });
+
+  it('indexAll populates tests edges (mirrored layout: __tests__/sync.test.ts → src/sync/index.ts)', () => {
+    const subjects = cg.getSubjectsOfTest('__tests__/sync.test.ts');
+    const paths = subjects.map((s) => s.path);
+    expect(paths).toContain('src/sync/index.ts');
+  });
+
+  it('indexAll populates tests edges (co-located: src/sync/watcher.test.ts → src/sync/watcher.ts)', () => {
+    const subjects = cg.getSubjectsOfTest('src/sync/watcher.test.ts');
+    expect(subjects.map((s) => s.path)).toEqual(['src/sync/watcher.ts']);
+  });
+
+  it('getTestsForFile returns the test that covers a given subject (incoming edges)', () => {
+    const tests = cg.getTestsForFile('src/sync/watcher.ts');
+    expect(tests.map((t) => t.path)).toContain('src/sync/watcher.test.ts');
+  });
+
+  it('returns empty array for tests with no resolvable subject (no false-positive guesses)', () => {
+    const subjects = cg.getSubjectsOfTest('__tests__/integration.test.ts');
+    expect(subjects).toEqual([]);
+  });
+
+  it('returns empty array for non-test files queried as tests', () => {
+    expect(cg.getSubjectsOfTest('src/sync/index.ts')).toEqual([]);
+  });
+
+  it('sync refreshes a test file\'s edges when its subject convention changes', async () => {
+    // Add a new subject file and a co-located test for it. After sync,
+    // the new test should have a `tests` edge to the new subject.
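+    // (Neither file existed at indexAll time, so the edge can only come from sync.)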
+    fs.writeFileSync(path.join(dir, 'src', 'newmod.ts'), 'export const m = 1;');
+    fs.writeFileSync(path.join(dir, 'src', 'newmod.test.ts'), 'import "./newmod";');
+
+    await cg.sync();
+    const subjects = cg.getSubjectsOfTest('src/newmod.test.ts');
+    expect(subjects.map((s) => s.path)).toEqual(['src/newmod.ts']);
+  });
+
+  it('sync removes stale edges when a subject file is deleted (FK cascade)', async () => {
+    // The cascade is on the file *node* (kind='file' has nodes_fts triggers
+    // and FK constraints from edges). When we sync after deleting the
+    // subject, edges to it should disappear.
+    fs.unlinkSync(path.join(dir, 'src', 'sync', 'watcher.ts'));
+    await cg.sync();
+    const subjects = cg.getSubjectsOfTest('src/sync/watcher.test.ts');
+    expect(subjects.map((s) => s.path)).not.toContain('src/sync/watcher.ts');
+  });
+});
diff --git a/__tests__/watcher.test.ts b/__tests__/watcher.test.ts
index f3638e6d..a546494d 100644
--- a/__tests__/watcher.test.ts
+++ b/__tests__/watcher.test.ts
@@ -31,6 +31,19 @@ function waitFor(
   });
 }
 
+/**
+ * fs.watch on macOS (FSEvents) and Linux (inotify) has a small but real
+ * latency between `fs.watch()` returning and the kernel actually
+ * delivering events. Writing a file in that window — particularly under
+ * parallel test load when the host CPU is busy — drops the event and
+ * causes a 5s timeout for "should trigger sync after file change" style
+ * tests. This helper standardizes the settle delay to match the pattern
+ * already used by the filtering tests in this file.
+ */
+async function letWatcherSettle(): Promise<void> {
+  await new Promise((r) => setTimeout(r, 400));
+}
+
 describe('FileWatcher', () => {
   let testDir: string;
 
@@ -101,6 +114,7 @@ describe('FileWatcher', () => {
     const watcher = new FileWatcher(testDir, baseConfig, syncFn, { debounceMs: 200 });
 
     watcher.start();
+    await letWatcherSettle();
 
     // Create a new file
     fs.writeFileSync(path.join(testDir, 'src', 'new.ts'), 'export const y = 2;');
@@ -117,6 +131,7 @@ describe('FileWatcher', () => {
     const watcher = new FileWatcher(testDir, baseConfig, syncFn, { debounceMs: 500 });
 
     watcher.start();
+    await letWatcherSettle();
 
     // Rapid-fire changes
     for (let i = 0; i < 5; i++) {
@@ -145,7 +160,7 @@ describe('FileWatcher', () => {
     watcher.start();
 
     // Let watcher settle — fs.watch may fire residual events from beforeEach
-    await new Promise((r) => setTimeout(r, 400));
+    await letWatcherSettle();
     syncFn.mockClear();
 
     // Create a file that doesn't match include patterns
@@ -165,7 +180,7 @@ describe('FileWatcher', () => {
     watcher.start();
 
     // Let watcher settle — fs.watch may fire residual events from beforeEach
-    await new Promise((r) => setTimeout(r, 400));
+    await letWatcherSettle();
     syncFn.mockClear();
 
     // Simulate a .codegraph directory change
@@ -191,6 +206,7 @@ describe('FileWatcher', () => {
     });
 
     watcher.start();
+    await letWatcherSettle();
 
     fs.writeFileSync(path.join(testDir, 'src', 'test.ts'), 'export const z = 3;');
 
@@ -209,6 +225,7 @@ describe('FileWatcher', () => {
     });
 
     watcher.start();
+    await letWatcherSettle();
 
     fs.writeFileSync(path.join(testDir, 'src', 'test.ts'), 'export const z = 3;');
 
@@ -218,6 +235,36 @@ describe('FileWatcher', () => {
 
     watcher.stop();
   });
+
+  it('should retry pending changes after a sync failure (no events lost)', async () => {
+    // First call rejects, subsequent calls resolve. After the initial
+    // failure, the watcher should retry the same batch on its own — without
+    // this, transient sync failures (DB locked etc.) would silently drop the
+    // changes until a new file event happened.
+    let calls = 0;
+    const syncFn = vi.fn().mockImplementation(() => {
+      calls++;
+      if (calls === 1) return Promise.reject(new Error('transient'));
+      return Promise.resolve({ filesChanged: 1, durationMs: 5 });
+    });
+    const onSyncError = vi.fn();
+    const onSyncComplete = vi.fn();
+    const watcher = new FileWatcher(testDir, baseConfig, syncFn, {
+      debounceMs: 100,
+      onSyncError,
+      onSyncComplete,
+    });
+
+    watcher.start();
+    fs.writeFileSync(path.join(testDir, 'src', 'test.ts'), 'export const z = 3;');
+
+    await waitFor(() => onSyncComplete.mock.calls.length > 0, 5000);
+    expect(onSyncError).toHaveBeenCalledTimes(1);
+    expect(syncFn).toHaveBeenCalledTimes(2);
+    expect(onSyncComplete).toHaveBeenCalledWith({ filesChanged: 1, durationMs: 5 });
+
+    watcher.stop();
+  });
 });
 
 describe('CodeGraph integration', () => {
@@ -268,6 +315,7 @@ describe('FileWatcher', () => {
     const initialNodes = initialStats.nodeCount;
 
     cg.watch({ debounceMs: 300 });
+    await letWatcherSettle();
 
     // Add a new file with a function
     fs.writeFileSync(
diff --git a/docs/ADDING-A-LANGUAGE.md b/docs/ADDING-A-LANGUAGE.md
new file mode 100644
index 00000000..189b0e27
--- /dev/null
+++ b/docs/ADDING-A-LANGUAGE.md
@@ -0,0 +1,463 @@
+# Adding a Language
+
+This is a cookbook for adding a new language to CodeGraph. It assumes you have a
+working dev setup (`npm install` and `npm test` pass).
+
+There are two patterns. **Pick the one that matches the language you're adding.**
+
+| Language shape | Pattern | Examples |
+|---|---|---|
+| Procedural / OO with named functions, classes, methods | **`LanguageExtractor` config** | `python.ts`, `ruby.ts`, `r.ts` |
+| Declarative / template / configuration / no named functions | **Custom extractor class** | `hcl-extractor.ts`, `liquid-extractor.ts`, `sql-extractor.ts` |
+
+The two patterns share the same setup steps (1–4) and only diverge at the extractor
+itself (step 5).
+
+---
+
+## 1. Source a tree-sitter wasm grammar
+
+CodeGraph parses everything via [`web-tree-sitter`](https://www.npmjs.com/package/web-tree-sitter),
+so the grammar has to be available as a `.wasm` file. Three options, in order of
+preference:
+
+### 1a. Already in `tree-sitter-wasms`
+
+The [`tree-sitter-wasms`](https://www.npmjs.com/package/tree-sitter-wasms) npm package
+ships pre-built wasms for 30+ common languages. Check `node_modules/tree-sitter-wasms/out/`
+after a fresh install:
+
+```bash
+ls node_modules/tree-sitter-wasms/out/ | grep <lang>
+```
+
+If your grammar is there, you're done with this step — just reference the filename.
+
+### 1b. A pre-built `.wasm` released somewhere else
+
+Many grammars publish wasms in their GitHub releases (e.g. r-lib/tree-sitter-r) or
+in a separate npm package (e.g. `@tree-sitter-grammars/tree-sitter-hcl` ships
+`tree-sitter-hcl.wasm` directly in the tarball).
+
+```bash
+# GitHub release
+curl -sL -o src/extraction/wasm/tree-sitter-foo.wasm \
+  https://github.com/.../releases/download/vX.Y.Z/tree-sitter-foo.wasm
+
+# Inside an npm tarball
+mkdir -p /tmp/foo && cd /tmp/foo
+curl -sL https://registry.npmjs.org/tree-sitter-foo/-/tree-sitter-foo-X.Y.Z.tgz | tar xz
+cp package/tree-sitter-foo.wasm <repo>/src/extraction/wasm/
+```
+
+Verify the sha256 against the upstream release manifest before committing.
+
+### 1c. Build from source
+
+If only the C source is published (e.g. DerekStride/tree-sitter-sql), build the wasm
+locally with `tree-sitter-cli`. Recent versions ship their own wasi-sdk and don't need
+Docker or local emcc:
+
+```bash
+mkdir /tmp/foo && cd /tmp/foo
+curl -sL https://github.com/.../releases/download/vX.Y.Z/tree-sitter-foo.tar.gz | tar xz
+npx --yes tree-sitter-cli@latest build --wasm
+cp tree-sitter-foo.wasm <repo>/src/extraction/wasm/
+```
+
+### Where the wasm lives
+
+- Grammars from the `tree-sitter-wasms` package are loaded directly from there at runtime.
+- Other grammars must be **vendored** under `src/extraction/wasm/` so they ship in the
+  npm package. The build's `copy-assets` script copies every `.wasm` from that
+  directory into `dist/extraction/wasm/`.
+
+**License check.** Tree-sitter grammars are usually MIT or Apache-2.0 — confirm before
+committing the wasm and note the source/version in the file's header comment so the
+provenance is recoverable later.
+
+---
+
+## 2. Probe the AST
+
+Don't guess at node types. Parse a representative sample and dump the tree:
+
+```js
+// scratch/probe.mjs
+import { Parser, Language } from 'web-tree-sitter';
+await Parser.init();
+const lang = await Language.load('./src/extraction/wasm/tree-sitter-foo.wasm');
+const parser = new Parser();
+parser.setLanguage(lang);
+
+const sample = `
+// realistic code here — cover every construct you plan to extract
+`;
+
+const tree = parser.parse(sample);
+function dump(n, d = 0, max = 4) {
+  if (d > max) return;
+  const text = n.text.length > 60 ? n.text.slice(0, 60).replace(/\n/g, '\\n') + '...' : n.text.replace(/\n/g, '\\n');
+  console.log(`${' '.repeat(d)}${n.type} "${text}"`);
+  for (let i = 0; i < n.namedChildCount; i++) dump(n.namedChild(i), d + 1, max);
+}
+dump(tree.rootNode);
+```
+
+```bash
+node scratch/probe.mjs
+```
+
+Cover every construct you plan to extract: function definitions, classes, methods,
+imports, assignments, calls, references. Watch for surprises:
+
+- Some grammars wrap names in extra layers (`identifier > simple_identifier`)
+- Field names (`childForFieldName`) often differ from what the docs imply
+- Operator nodes can be named, unnamed, or both — call `child(i)` vs `namedChild(i)`
+  and inspect
+
+Save the probe output before you start coding — you'll refer to it constantly.
+
+---
+
+## 3. Register the language
+
+Three files, all small.
+
+**`src/types.ts`** — add to the `Language` union and to `DEFAULT_CONFIG.include`:
+
+```ts
+export type Language =
+  | 'typescript'
+  | ...
+  | 'foo' // ← add here
+  | 'unknown';
+
+export const DEFAULT_CONFIG: CodeGraphConfig = {
+  ...
+  include: [
+    ...
+    '**/*.foo', // ← and here
+  ],
+};
+```
+
+**`src/extraction/grammars.ts`** — wire up the wasm path, extension map, and display name:
+
+```ts
+const WASM_GRAMMAR_FILES: Record<Language, string> = {
+  ...
+  foo: 'tree-sitter-foo.wasm',
+};
+
+// If vendored under src/extraction/wasm/ instead of tree-sitter-wasms:
+const VENDORED_WASM_LANGUAGES: ReadonlySet<Language> = new Set([
+  'pascal',
+  'foo', // ← add here
+]);
+
+export const EXTENSION_MAP: Record<string, Language> = {
+  ...
+  '.foo': 'foo',
+};
+
+// And in getLanguageDisplayName():
+foo: 'Foo',
+```
+
+**`CLAUDE.md`** — append the language to the "Supported Languages" line so the
+LLM-readable architecture doc stays in sync.
+
+---
+
+## 4. Type-check before writing the extractor
+
+Run `npx tsc --noEmit` now. If it's not clean, the wiring is wrong — fix that
+before adding extraction logic, otherwise type errors will pile up.
+
+---
+
+## 5a. Path A — Plug into `LanguageExtractor`
+
+Use this when the language has named function/class/method declarations (Python, Ruby,
+Java, R, etc.). Create `src/extraction/languages/<lang>.ts`:
+
+```ts
+import type { LanguageExtractor } from '../tree-sitter-types';
+
+export const fooExtractor: LanguageExtractor = {
+  // Map AST node types → graph kinds. Empty array = "this kind doesn't
+  // exist in this language."
+  functionTypes: ['function_definition'],
+  classTypes: ['class_definition'],
+  methodTypes: ['function_definition'], // often the same node, dispatched by context
+  interfaceTypes: [],
+  structTypes: [],
+  enumTypes: [],
+  typeAliasTypes: [],
+  importTypes: ['import_statement'],
+  callTypes: ['call'],
+  variableTypes: ['assignment'],
+
+  // Field names tree-sitter exposes for extractors to read.
+  nameField: 'name',
+  bodyField: 'body',
+  paramsField: 'parameters',
+  returnField: 'return_type',
+
+  // Optional hooks — implement what you need:
+  getSignature: (node, source) => { ... },
+  isExported: (node, source) => { ... },
+  isAsync: (node) => { ... },
+
+  // Escape hatch: take over a specific node type entirely. Return true to
+  // tell the core "I handled this, skip default dispatch."
+  visitNode: (node, ctx) => {
+    // R uses this to handle `name <- function() {}` because tree-sitter's
+    // function_definition has no name field — the name is on the LHS of
+    // the enclosing assignment.
+    return false;
+  },
+};
+```
+
+Then register it in `src/extraction/languages/index.ts`:
+
+```ts
+import { fooExtractor } from './foo';
+
+export const EXTRACTORS: Partial<Record<Language, LanguageExtractor>> = {
+  ...
+  foo: fooExtractor,
+};
+```
+
+The core (`TreeSitterExtractor` in `src/extraction/tree-sitter.ts`) does the rest:
+walks the AST, dispatches based on your `*Types` arrays, calls your hooks, manages
+the scope stack, and emits nodes/edges.
+
+**Worked example: R** (`src/extraction/languages/r.ts`). R's `function_definition`
+has no name (it's anonymous), so `functionTypes` is empty and the `visitNode` hook
+intercepts `binary_operator` assignments and emits the function manually via
+`ctx.createNode('function', name, ...)`.
+
+## 5b. Path B — Custom extractor class
+
+Use this when the language is declarative (HCL, SQL, dbt) or has a fundamentally
+different shape from functions/classes/methods (Liquid templates, Pascal `.dfm` form
+files). Create `src/extraction/<lang>-extractor.ts`:
+
+```ts
+import { Node, Edge, ExtractionResult, ExtractionError, UnresolvedReference } from '../types';
+import { generateNodeId, getNodeText } from './tree-sitter-helpers';
+import { getParser } from './grammars';
+
+export class FooExtractor {
+  private filePath: string;
+  private source: string;
+  private nodes: Node[] = [];
+  private edges: Edge[] = [];
+  private unresolvedReferences: UnresolvedReference[] = [];
+  private errors: ExtractionError[] = [];
+
+  constructor(filePath: string, source: string) {
+    this.filePath = filePath;
+    this.source = source;
+  }
+
+  extract(): ExtractionResult {
+    const startTime = Date.now();
+    const parser = getParser('foo');
+    if (!parser) {
+      this.errors.push({ message: 'foo grammar not loaded', severity: 'error', code: 'grammar_unavailable' });
+      return this.result(startTime);
+    }
+    const tree = parser.parse(this.source);
+    if (!tree) { ... return this.result(startTime); }
+
+    try {
+      const fileNodeId = this.createFileNode();
+      // Walk the AST, emit nodes via this.nodes.push and this.edges.push.
+      // Emit references via this.unresolvedReferences.push so the resolver
+      // pass can match them across files.
+      ...
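+      // Illustrative shape only (field names follow src/types.ts; the kind
+      // and edge chosen here are invented for the example):
+      //   this.nodes.push({ id, kind: 'class', name, filePath: this.filePath, ... });
+      //   this.edges.push({ source: fileNodeId, target: id, kind: 'contains' });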
+      return this.result(startTime);
+    } finally {
+      tree.delete(); // ← important: tree-sitter trees are backed by WASM memory
+    }
+  }
+
+  private result(startTime: number): ExtractionResult {
+    return {
+      nodes: this.nodes,
+      edges: this.edges,
+      unresolvedReferences: this.unresolvedReferences,
+      errors: this.errors,
+      durationMs: Date.now() - startTime,
+    };
+  }
+}
+```
+
+Wire the dispatch in `src/extraction/tree-sitter.ts`:
+
+```ts
+import { FooExtractor } from './foo-extractor';
+
+export function extractFromSource(filePath, source, language?) {
+  ...
+  if (detectedLanguage === 'foo') {
+    return new FooExtractor(filePath, source).extract();
+  }
+  ...
+}
+```
+
+**Worked examples:**
+
+- `src/extraction/hcl-extractor.ts` — Terraform / HCL. A block-based DSL. Each
+  top-level block becomes a node whose qualified name matches the Terraform
+  reference form (`var.X`, `local.X`, `module.X`, `aws_s3_bucket.foo`) so the
+  resolver can match references across files automatically.
+- `src/extraction/sql-extractor.ts` — SQL DDL. CREATE TABLE / VIEW / FUNCTION /
+  TRIGGER / TYPE / SCHEMA → graph nodes; foreign keys, view source tables,
+  trigger target tables and executed functions → edges.
+- `src/extraction/liquid-extractor.ts` — Shopify Liquid templates. Regex-based
+  (no tree-sitter) since the template grammar isn't useful for code intelligence.
+
+---
+
+## 6. Pick `NodeKind` and `EdgeKind` values
+
+`NodeKind` and `EdgeKind` are fixed unions in `src/types.ts`. Map your language's
+constructs onto the closest existing kind rather than introducing new ones —
+adding a new kind is a cross-cutting change that touches search, resolution, and
+context-building code.
+
+Common mappings used by recent extractors:
+
+| Language construct | NodeKind |
+|---|---|
+| Function / procedure / standalone routine | `function` |
+| Method on a class | `method` |
+| Class / type / table / declarative resource | `class` |
+| Trait / mixin | `trait` |
+| Interface / protocol | `interface` |
+| Module / package / file-level scope / Terraform module | `module` |
+| Namespace / schema / SQL schema / Terraform provider | `namespace` |
+| Variable / Terraform variable | `variable` |
+| Constant / Terraform local / R top-level binding | `constant` |
+| Type alias / SQL composite type | `type_alias` |
+| Enum (any) | `enum` |
+| Import / library / source / require | `import` |
+| Output / re-export / Terraform output | `export` |
+
+Edges are usually one of:
+
+| Edge | When |
+|---|---|
+| `contains` | Parent contains child (file → block, class → method) |
+| `calls` | Function/method invokes another |
+| `imports` | File pulls in another module/file |
+| `references` | Generic mention of another symbol (FK, lookup, attribute access) |
+| `extends` / `implements` | Inheritance relationships |
+
+Emit references through `unresolvedReferences` (with `referenceName` set to a
+qualified name that matches what you put on the target node's `qualifiedName`) —
+the resolver pass matches them across files using the `name-matcher` and
+`import-resolver` modules.
+
+---
+
+## 7. Tests
+
+Tests live in `__tests__/extraction.test.ts`, grouped by language with a
+`describe('<Language> Extraction', ...)` block. Use `extractFromSource` directly
+for unit-style tests:
+
+```ts
+import { extractFromSource, detectLanguage } from '../src/extraction';
+
+describe('Foo Extraction', () => {
+  describe('Language detection', () => {
+    it('should detect Foo files', () => {
+      expect(detectLanguage('main.foo')).toBe('foo');
+    });
+  });
+
+  describe('Function extraction', () => {
+    it('should extract a top-level function', () => {
+      const code = `function add(a, b) a + b`;
+      const result = extractFromSource('main.foo', code);
+      const fn = result.nodes.find((n) => n.kind === 'function' && n.name === 'add');
+      expect(fn).toBeDefined();
+    });
+  });
+});
+```
+
+Cover the AST shapes you saw in the probe, especially the surprising ones. Pay
+particular attention to:
+
+- The smallest possible valid program (`expect(...).toBeDefined()` for the file node)
+- Each node-kind mapping (one test per emitted kind)
+- Reference forms (call edges, FK / cross-file references, imports)
+- Anything you intentionally skipped (anonymous lambdas, dynamic imports, etc.)
+  with a negative assertion so the omission is documented
+
+Run the suite serialized to avoid the file-watcher tests' parallel flakiness:
+
+```bash
+npx vitest run --no-file-parallelism
+```
+
+End-to-end smoke test from a fresh fixture before opening the PR:
+
+```bash
+SMOKE=$(mktemp -d) && cat > "$SMOKE/main.foo" <<'EOF'
+... realistic input ...
+EOF
+cd "$SMOKE" && git init -q
+node <repo>/dist/bin/codegraph.js init "$SMOKE"
+node <repo>/dist/bin/codegraph.js index "$SMOKE"
+node <repo>/dist/bin/codegraph.js status "$SMOKE"
+cd "$SMOKE" && node <repo>/dist/bin/codegraph.js query "<symbol>"
+```
+
+The `status` call should report your file under "Files by Language", and `query`
+should turn up the symbols you expect at the right line numbers.
+
+---
+
+## 8. Open the PR
+
+Include in the PR description:
+
+- The grammar source + version + license + sha256 (if vendored)
+- A small worked example showing what gets extracted
+- The full test plan (`npm test`, `tsc`, `npm run build`, CLI smoke)
+- Any known limitations (constructs not supported, AST quirks, things the grammar
+  itself can't parse)
+
+Don't claim support for constructs the grammar can't actually parse — this happens
+more often than you'd expect (e.g. `tree-sitter-sql` errors out on `CREATE
+PROCEDURE` because procedure-body syntax varies sharply across dialects). Say what
+works, say what doesn't, and let reviewers decide.
+
+---
+
+## Reference: existing extractors as templates
+
+Read these in source order if your language is similar to one of them:
+
+- **Procedural / OO:** `src/extraction/languages/python.ts` (small, easy to read),
+  `ruby.ts` (with bare-call detection), `kotlin.ts` (extension functions),
+  `r.ts` (no `def` keyword — uses `visitNode` hook for assignments)
+- **Declarative / config:** `src/extraction/hcl-extractor.ts` (Terraform reference
+  graph), `sql-extractor.ts` (DDL with FK / view source extraction)
+- **Embedded / template:** `src/extraction/svelte-extractor.ts` (delegates to JS
+  for `<script>` blocks)
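+
+For a concrete feel for the `visitNode` escape hatch from step 5a, here is a
+minimal sketch in the spirit of `r.ts`. The node/field names and the exact
+`ctx.createNode` signature are illustrative, not copied from the real extractor:
+
+```ts
+visitNode: (node, ctx) => {
+  // Only intercept `name <- function(...) ...` assignments.
+  if (node.type !== 'binary_operator') return false;
+  if (node.childForFieldName('operator')?.text !== '<-') return false;
+  const rhs = node.childForFieldName('rhs');
+  if (rhs?.type !== 'function_definition') return false;
+  const name = node.childForFieldName('lhs')?.text;
+  if (!name) return false;
+  ctx.createNode('function', name, node); // emit the named function ourselves
+  return true; // handled — skip default dispatch for this node
+},
+```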