From 0a3b32adc6c8db4b0be746757cb49a6f03202d0b Mon Sep 17 00:00:00 2001 From: andreinknv Date: Mon, 27 Apr 2026 16:27:57 -0400 Subject: [PATCH 1/9] =?UTF-8?q?refactor:=20per-language=20registry=20?= =?UTF-8?q?=E2=80=94=20eliminate=20cross-PR=20conflict=20surface?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adding a new language used to require coordinated edits to 6 shared lists across 4 files (Language union in types.ts; DEFAULT_CONFIG.include; WASM_GRAMMAR_FILES, EXTENSION_MAP, and getLanguageDisplayName in grammars.ts; EXTRACTORS map in languages/index.ts). Two PRs adding different languages typically conflicted on every one of those. After this refactor, adding a new language is: 1. Drop a file at src/extraction/languages/<name>.ts exporting an <NAME>_DEF: LanguageDef constant. 2. Add ONE import line and ONE array entry to src/extraction/languages/registry.ts (alphabetical position — adjacent additions are still possible but rare). That's it. grammars.ts, types.ts, tree-sitter.ts dispatch, and the default include globs are all derived from the registry. ## What's in a LanguageDef ```ts interface LanguageDef { name: string; // canonical id displayName: string; // "Pascal / Delphi" extensions: readonly string[]; // ['.pas', '.dpr', ...] includeGlobs: readonly string[]; grammar?: { wasmFile, vendored?, extractor }; // tree-sitter customExtractor?: (fp, src) => ExtractionResult; // Liquid, Svelte extensionOverrides?: { '.dfm': { customExtractor } }; // Pascal forms } ``` Each existing language file now exports both its `xxxExtractor` (unchanged) AND a new `XXX_DEF`. New files were added for tsx, jsx, svelte, liquid (the latter two wrap their existing custom extractor classes via the customExtractor field). ## Refactored consumers - src/extraction/grammars.ts: WASM_GRAMMAR_FILES removed (was internal-only); EXTENSION_MAP now a Proxy that lazy-builds from the registry on first access (avoids TDZ in cyclic load paths). 
loadGrammarsForLanguages, isLanguageSupported, isGrammarLoaded, getSupportedLanguages, getLanguageDisplayName, detectLanguage — all read from registry. - src/extraction/tree-sitter.ts: extractFromSource's if-chain (svelte / liquid / pascal+.dfm/.fmx) replaced with one lookup: def.extensionOverrides[ext]?.customExtractor || def.customExtractor. Drops direct imports of LiquidExtractor, SvelteExtractor, DfmExtractor. - src/types.ts: DEFAULT_CONFIG moved to src/default-config.ts (cycle break). types.ts re-exports for backward compat. The `include` array is now built lazily from each LanguageDef's includeGlobs. ## What still requires a one-line edit The Language string union in types.ts still hard-codes the known languages (typescript | javascript | … | unknown). New languages added to the registry work at runtime as strings, but adding the literal here is required IF the resolver wants to do exhaustive narrowing on the new language (resolution/index.ts and resolution/import-resolver.ts have a few `language === 'X'` branches). Most new languages don't need such branches. This trade-off keeps strict narrowing for the existing handful of language-specific code paths while making everything else registry-driven. ## Tests 380/380 pass. No new tests; behavior is identical. Existing extraction.test.ts and pr19-improvements.test.ts heavily exercise detectLanguage, isLanguageSupported, getSupportedLanguages, and loadAllGrammars — all green. ## Follow-ups (out of scope) - Auto-discovery in registry.ts via fs.readdirSync — works in built dist/ but vite-node doesn't support extensionless require() of TS source. A small build-time generator could remove the static import list entirely. - Splitting __tests__/extraction.test.ts into per-language test files — eliminates the test-end-of-file conflict surface that every language PR currently hits. 
- Similar registry refactors for: - MCP tool definitions (each tool self-registers; no shared tools[] array or case-switch in execute()) - Migration files (each migration in src/db/migrations/NNN-*.ts; auto-discovered by version) - Index/sync hooks (centrality, churn, issue-history, config-refs, sql-refs, cochange all currently mutate CodeGraph.indexAll/sync; an IndexHook interface would make each pass self-contained) Co-Authored-By: Claude Opus 4.7 (1M context) --- src/default-config.ts | 194 +++++++++++++++++++++++ src/extraction/grammars.ts | 204 +++++++++++------------- src/extraction/languages/c-cpp.ts | 18 +++ src/extraction/languages/csharp.ts | 9 ++ src/extraction/languages/dart.ts | 9 ++ src/extraction/languages/go.ts | 9 ++ src/extraction/languages/java.ts | 9 ++ src/extraction/languages/javascript.ts | 9 ++ src/extraction/languages/jsx.ts | 14 ++ src/extraction/languages/kotlin.ts | 9 ++ src/extraction/languages/liquid.ts | 16 ++ src/extraction/languages/pascal.ts | 27 ++++ src/extraction/languages/php.ts | 9 ++ src/extraction/languages/python.ts | 9 ++ src/extraction/languages/registry.ts | 102 ++++++++++++ src/extraction/languages/ruby.ts | 9 ++ src/extraction/languages/rust.ts | 9 ++ src/extraction/languages/svelte.ts | 15 ++ src/extraction/languages/swift.ts | 9 ++ src/extraction/languages/tsx.ts | 14 ++ src/extraction/languages/types.ts | 83 ++++++++++ src/extraction/languages/typescript.ts | 9 ++ src/extraction/tree-sitter.ts | 31 ++-- src/types.ts | 205 +------------------------ 24 files changed, 697 insertions(+), 334 deletions(-) create mode 100644 src/default-config.ts create mode 100644 src/extraction/languages/jsx.ts create mode 100644 src/extraction/languages/liquid.ts create mode 100644 src/extraction/languages/registry.ts create mode 100644 src/extraction/languages/svelte.ts create mode 100644 src/extraction/languages/tsx.ts create mode 100644 src/extraction/languages/types.ts diff --git a/src/default-config.ts b/src/default-config.ts new 
file mode 100644 index 00000000..5c59179c --- /dev/null +++ b/src/default-config.ts @@ -0,0 +1,194 @@ +/** + * Default project configuration. + * + * Lives in its own file (separate from `types.ts`) because the + * `include` glob list is derived from the language registry — and + * the registry transitively imports `types.ts` via per-language + * files, which would create an evaluation cycle if `default-config` + * were itself imported by `types.ts` eagerly. + * + * **Lazy include resolution.** The `include` array is built on + * first access via a property getter, not at module load. By the + * time anything reads `DEFAULT_CONFIG.include`, the registry has + * fully evaluated, so all language definitions are available. + */ + +import type { CodeGraphConfig } from './types'; +import { getLanguageDefs } from './extraction/languages/registry'; + +let _includeCache: string[] | null = null; +function buildIncludeGlobs(): string[] { + if (_includeCache) return _includeCache; + const seen = new Set(); + const out: string[] = []; + for (const def of getLanguageDefs()) { + for (const glob of def.includeGlobs) { + if (seen.has(glob)) continue; + seen.add(glob); + out.push(glob); + } + } + _includeCache = out; + return out; +} + +const baseConfig: CodeGraphConfig = { + version: 1, + rootDir: '.', + include: [], // populated lazily via the getter below + exclude: [ + // Version control + '**/.git/**', + + // Dependencies + '**/node_modules/**', + '**/vendor/**', + '**/Pods/**', + + // Generic build outputs + '**/dist/**', + '**/build/**', + '**/out/**', + '**/bin/**', + '**/obj/**', + '**/target/**', + + // JavaScript/TypeScript + '**/*.min.js', + '**/*.bundle.js', + '**/.next/**', + '**/.nuxt/**', + '**/.svelte-kit/**', + '**/.output/**', + '**/.turbo/**', + '**/.cache/**', + '**/.parcel-cache/**', + '**/.vite/**', + '**/.astro/**', + '**/.docusaurus/**', + '**/.gatsby/**', + '**/.webpack/**', + '**/.nx/**', + '**/.yarn/cache/**', + '**/.pnpm-store/**', + 
'**/storybook-static/**', + + // React Native / Expo + '**/.expo/**', + '**/web-build/**', + '**/ios/Pods/**', + '**/ios/build/**', + '**/android/build/**', + '**/android/.gradle/**', + + // Python + '**/__pycache__/**', + '**/.venv/**', + '**/venv/**', + '**/site-packages/**', + '**/dist-packages/**', + '**/.pytest_cache/**', + '**/.mypy_cache/**', + '**/.ruff_cache/**', + '**/.tox/**', + '**/.nox/**', + '**/*.egg-info/**', + '**/.eggs/**', + + // Go + '**/go/pkg/mod/**', + + // Rust + '**/target/debug/**', + '**/target/release/**', + + // Java/Kotlin/Gradle + '**/.gradle/**', + '**/.m2/**', + '**/generated-sources/**', + '**/.kotlin/**', + + // Dart/Flutter + '**/.dart_tool/**', + + // C#/.NET + '**/.vs/**', + '**/.nuget/**', + '**/artifacts/**', + '**/publish/**', + + // C/C++ + '**/cmake-build-*/**', + '**/CMakeFiles/**', + '**/bazel-*/**', + '**/vcpkg_installed/**', + '**/.conan/**', + '**/Debug/**', + '**/Release/**', + '**/x64/**', + '**/.pio/**', // Platform.io (IoT/embedded build artifacts and library deps) + + // Electron + '**/release/**', + '**/*.app/**', + '**/*.asar', + + // Swift/iOS/Xcode + '**/DerivedData/**', + '**/.build/**', + '**/.swiftpm/**', + '**/xcuserdata/**', + '**/Carthage/Build/**', + '**/SourcePackages/**', + + // Delphi/Pascal + '**/__history/**', + '**/__recovery/**', + '**/*.dcu', + + // PHP + '**/.composer/**', + '**/storage/framework/**', + '**/bootstrap/cache/**', + + // Ruby + '**/.bundle/**', + '**/tmp/cache/**', + '**/public/assets/**', + '**/public/packs/**', + '**/.yardoc/**', + + // Testing/Coverage + '**/coverage/**', + '**/htmlcov/**', + '**/.nyc_output/**', + '**/test-results/**', + '**/.coverage/**', + + // IDE/Editor + '**/.idea/**', + + // Logs and temp + '**/logs/**', + '**/tmp/**', + '**/temp/**', + + // Documentation build output + '**/_build/**', + '**/docs/_build/**', + '**/site/**', + ], + languages: [], + frameworks: [], + maxFileSize: 1024 * 1024, // 1MB + extractDocstrings: true, + trackCallSites: true, +}; + 
+Object.defineProperty(baseConfig, 'include', { + get: () => buildIncludeGlobs(), + enumerable: true, + configurable: true, +}); + +export const DEFAULT_CONFIG: CodeGraphConfig = baseConfig; diff --git a/src/extraction/grammars.ts b/src/extraction/grammars.ts index df264fb3..5c2aec09 100644 --- a/src/extraction/grammars.ts +++ b/src/extraction/grammars.ts @@ -4,77 +4,63 @@ * Uses web-tree-sitter (WASM) for universal cross-platform support. * Grammars are loaded lazily — only languages actually present in the project * are compiled, keeping V8 WASM memory pressure low on large codebases. + * + * As of the language-registry refactor, all per-language metadata + * (WASM filenames, file extensions, display names, vendored flag) + * lives in `./languages/.ts` and is auto-collected by + * `./languages/registry.ts`. The constants exported here + * (`EXTENSION_MAP`, `getSupportedLanguages`, `getLanguageDisplayName`) + * remain for backward compat but are derived from the registry. */ import * as path from 'path'; import { Parser, Language as WasmLanguage } from 'web-tree-sitter'; import { Language } from '../types'; +import { getLanguageDefs, getLanguageDefByExtension, getLanguageDefByName } from './languages/registry'; export type GrammarLanguage = Exclude; /** - * WASM filename map — maps each language to its .wasm grammar file - * in the tree-sitter-wasms package. + * File extension → Language mapping, computed lazily on first read. + * + * Cannot be a top-level IIFE: the registry transitively pulls in + * `tree-sitter.ts` (via custom-extractor language defs), which + * imports this file — building the map at module load would TDZ + * against `ALL_DEFS` in the registry. Use the `getExtensionMap()` + * function for an explicit lazy entry point, or read + * `EXTENSION_MAP` (a Proxy that materialises on first property + * access). 
*/ -const WASM_GRAMMAR_FILES: Record = { - typescript: 'tree-sitter-typescript.wasm', - tsx: 'tree-sitter-tsx.wasm', - javascript: 'tree-sitter-javascript.wasm', - jsx: 'tree-sitter-javascript.wasm', - python: 'tree-sitter-python.wasm', - go: 'tree-sitter-go.wasm', - rust: 'tree-sitter-rust.wasm', - java: 'tree-sitter-java.wasm', - c: 'tree-sitter-c.wasm', - cpp: 'tree-sitter-cpp.wasm', - csharp: 'tree-sitter-c_sharp.wasm', - php: 'tree-sitter-php.wasm', - ruby: 'tree-sitter-ruby.wasm', - swift: 'tree-sitter-swift.wasm', - kotlin: 'tree-sitter-kotlin.wasm', - dart: 'tree-sitter-dart.wasm', - pascal: 'tree-sitter-pascal.wasm', -}; +let _extensionMapCache: Record | null = null; +export function getExtensionMap(): Record { + if (_extensionMapCache) return _extensionMapCache; + const out: Record = {}; + for (const def of getLanguageDefs()) { + for (const ext of def.extensions) { + out[ext.toLowerCase()] = def.name as Language; + } + } + _extensionMapCache = out; + return out; +} /** - * File extension to Language mapping + * Backward-compat: a Proxy that lazy-builds the extension map on + * first property access. Existing callers can keep doing + * `EXTENSION_MAP['.ts']` without changes. 
*/ -export const EXTENSION_MAP: Record = { - '.ts': 'typescript', - '.tsx': 'tsx', - '.js': 'javascript', - '.mjs': 'javascript', - '.cjs': 'javascript', - '.jsx': 'jsx', - '.py': 'python', - '.pyw': 'python', - '.go': 'go', - '.rs': 'rust', - '.java': 'java', - '.c': 'c', - '.h': 'c', // Could also be C++, defaulting to C - '.cpp': 'cpp', - '.cc': 'cpp', - '.cxx': 'cpp', - '.hpp': 'cpp', - '.hxx': 'cpp', - '.cs': 'csharp', - '.php': 'php', - '.rb': 'ruby', - '.rake': 'ruby', - '.swift': 'swift', - '.kt': 'kotlin', - '.kts': 'kotlin', - '.dart': 'dart', - '.liquid': 'liquid', - '.svelte': 'svelte', - '.pas': 'pascal', - '.dpr': 'pascal', - '.dpk': 'pascal', - '.lpr': 'pascal', - '.dfm': 'pascal', - '.fmx': 'pascal', -}; +export const EXTENSION_MAP: Record = new Proxy({} as Record, { + get(_t, key: string) { return getExtensionMap()[key]; }, + has(_t, key: string) { return key in getExtensionMap(); }, + ownKeys() { return Object.keys(getExtensionMap()); }, + getOwnPropertyDescriptor(_t, key: string) { + const map = getExtensionMap(); + if (key in map) { + return { configurable: true, enumerable: true, writable: false, value: map[key] }; + } + return undefined; + }, +}); /** * Caches for loaded grammars and parsers @@ -108,21 +94,28 @@ export async function loadGrammarsForLanguages(languages: Language[]): Promise - lang in WASM_GRAMMAR_FILES && - !languageCache.has(lang) && - !unavailableGrammarErrors.has(lang) - ); + // Deduplicate; filter to languages that have a tree-sitter grammar + // (registry's `def.grammar` field) and aren't already loaded. 
+ const seen = new Set(); + const toLoad: Array<{ lang: Language; wasmFile: string; vendored: boolean }> = []; + for (const lang of languages) { + if (seen.has(lang)) continue; + seen.add(lang); + if (languageCache.has(lang) || unavailableGrammarErrors.has(lang)) continue; + const def = getLanguageDefByName(lang); + if (!def?.grammar) continue; + toLoad.push({ + lang, + wasmFile: def.grammar.wasmFile, + vendored: def.grammar.vendored === true, + }); + } // Load grammars sequentially to avoid web-tree-sitter WASM race condition on Node 20+ // See: https://github.com/tree-sitter/tree-sitter/issues/2338 - for (const lang of toLoad) { - const wasmFile = WASM_GRAMMAR_FILES[lang]; + for (const { lang, wasmFile, vendored } of toLoad) { try { - // Pascal ships its own WASM (not in tree-sitter-wasms) - const wasmPath = lang === 'pascal' + const wasmPath = vendored ? path.join(__dirname, 'wasm', wasmFile) : require.resolve(`tree-sitter-wasms/out/${wasmFile}`); const language = await WasmLanguage.load(wasmPath); @@ -140,7 +133,9 @@ export async function loadGrammarsForLanguages(languages: Language[]): Promise { - const allLanguages = Object.keys(WASM_GRAMMAR_FILES) as GrammarLanguage[]; + const allLanguages = getLanguageDefs() + .filter((d) => d.grammar) + .map((d) => d.name as Language); await loadGrammarsForLanguages(allLanguages); } @@ -176,7 +171,8 @@ export function getParser(language: Language): Parser | null { */ export function detectLanguage(filePath: string, source?: string): Language { const ext = filePath.substring(filePath.lastIndexOf('.')).toLowerCase(); - const lang = EXTENSION_MAP[ext] || 'unknown'; + const def = getLanguageDefByExtension(ext); + const lang = (def?.name as Language) ?? 'unknown'; // .h files could be C or C++ — check source content for C++ features if (lang === 'c' && ext === '.h' && source) { @@ -196,29 +192,30 @@ function looksLikeCpp(source: string): boolean { } /** - * Check if a language is supported (has a grammar defined). 
- * Returns true if the grammar exists, even if not yet loaded. + * Check if a language is supported (has a grammar or custom extractor). + * Returns true if a registry entry exists, even if its grammar isn't loaded. */ export function isLanguageSupported(language: Language): boolean { - if (language === 'svelte') return true; // custom extractor (script block delegation) - if (language === 'liquid') return true; // custom regex extractor if (language === 'unknown') return false; - return language in WASM_GRAMMAR_FILES; + return getLanguageDefByName(language) !== undefined; } /** * Check if a grammar has been loaded and is ready for parsing. + * Custom-extractor languages (no `grammar` field) are always "ready". */ export function isGrammarLoaded(language: Language): boolean { - if (language === 'svelte' || language === 'liquid') return true; + const def = getLanguageDefByName(language); + if (!def) return false; + if (!def.grammar) return true; // custom extractor — always available return languageCache.has(language); } /** - * Get all supported languages (those with grammar definitions). + * Get all supported languages from the registry. */ export function getSupportedLanguages(): Language[] { - return [...(Object.keys(WASM_GRAMMAR_FILES) as GrammarLanguage[]), 'svelte', 'liquid']; + return getLanguageDefs().map((d) => d.name as Language); } /** @@ -237,54 +234,33 @@ export function resetParser(language: Language): void { } /** - * Clear parser/grammar caches (useful for testing) + * Clear parser cache (useful for testing). + * + * Note: `languageCache` is intentionally NOT cleared — the WASM + * `Language` modules are expensive to load and stay cached so a + * subsequent `getParser` call can rebuild a fresh `Parser` instance + * without re-reading the .wasm file. To fully re-init, set + * `parserInitialized = false` and call `initGrammars()` again. 
*/ export function clearParserCache(): void { for (const parser of parserCache.values()) { - parser.delete(); + try { parser.delete(); } catch { /* ignore */ } } parserCache.clear(); - // Note: languageCache is NOT cleared — WASM languages persist. - // To fully re-init, set parserInitialized = false and call initGrammars() again. unavailableGrammarErrors.clear(); } /** - * Report grammars that failed to load. + * Get unavailable grammar errors (for diagnostics) */ -export function getUnavailableGrammarErrors(): Partial> { - const out: Partial> = {}; - for (const [language, message] of unavailableGrammarErrors.entries()) { - out[language] = message; - } - return out; +export function getUnavailableGrammarErrors(): Record { + return Object.fromEntries(unavailableGrammarErrors); } /** - * Get language display name + * Human-readable display name (e.g. "TypeScript", "Pascal / Delphi"). + * Returns the canonical name unchanged if no display name is registered. */ export function getLanguageDisplayName(language: Language): string { - const names: Record = { - typescript: 'TypeScript', - javascript: 'JavaScript', - tsx: 'TypeScript (TSX)', - jsx: 'JavaScript (JSX)', - python: 'Python', - go: 'Go', - rust: 'Rust', - java: 'Java', - c: 'C', - cpp: 'C++', - csharp: 'C#', - php: 'PHP', - ruby: 'Ruby', - swift: 'Swift', - kotlin: 'Kotlin', - dart: 'Dart', - svelte: 'Svelte', - liquid: 'Liquid', - pascal: 'Pascal / Delphi', - unknown: 'Unknown', - }; - return names[language] || language; + return getLanguageDefByName(language)?.displayName ?? 
language; } diff --git a/src/extraction/languages/c-cpp.ts b/src/extraction/languages/c-cpp.ts index 66219d4f..8ed3a9de 100644 --- a/src/extraction/languages/c-cpp.ts +++ b/src/extraction/languages/c-cpp.ts @@ -114,3 +114,21 @@ export const cppExtractor: LanguageExtractor = { return null; }, }; + +import type { LanguageDef } from './types'; +export const C_DEF: LanguageDef = { + name: 'c', + displayName: 'C', + // .h is also listed for C; tree-sitter.ts contains a `.h might be C++` + // heuristic that overrides this on a content-sniff basis. + extensions: ['.c', '.h'], + includeGlobs: ['**/*.c', '**/*.h'], + grammar: { wasmFile: 'tree-sitter-c.wasm', extractor: cExtractor }, +}; +export const CPP_DEF: LanguageDef = { + name: 'cpp', + displayName: 'C++', + extensions: ['.cpp', '.cc', '.cxx', '.hpp', '.hxx'], + includeGlobs: ['**/*.cpp', '**/*.cc', '**/*.cxx', '**/*.hpp', '**/*.hxx'], + grammar: { wasmFile: 'tree-sitter-cpp.wasm', extractor: cppExtractor }, +}; diff --git a/src/extraction/languages/csharp.ts b/src/extraction/languages/csharp.ts index 9de53734..c66aea69 100644 --- a/src/extraction/languages/csharp.ts +++ b/src/extraction/languages/csharp.ts @@ -65,3 +65,12 @@ export const csharpExtractor: LanguageExtractor = { return null; }, }; + +import type { LanguageDef } from './types'; +export const CSHARP_DEF: LanguageDef = { + name: 'csharp', + displayName: 'C#', + extensions: ['.cs'], + includeGlobs: ['**/*.cs'], + grammar: { wasmFile: 'tree-sitter-c_sharp.wasm', extractor: csharpExtractor }, +}; diff --git a/src/extraction/languages/dart.ts b/src/extraction/languages/dart.ts index 5b545d04..d704d826 100644 --- a/src/extraction/languages/dart.ts +++ b/src/extraction/languages/dart.ts @@ -193,3 +193,12 @@ export const dartExtractor: LanguageExtractor = { return undefined; }, }; + +import type { LanguageDef } from './types'; +export const DART_DEF: LanguageDef = { + name: 'dart', + displayName: 'Dart', + extensions: ['.dart'], + includeGlobs: ['**/*.dart'], + 
grammar: { wasmFile: 'tree-sitter-dart.wasm', extractor: dartExtractor }, +}; diff --git a/src/extraction/languages/go.ts b/src/extraction/languages/go.ts index 898e6165..5de68ffa 100644 --- a/src/extraction/languages/go.ts +++ b/src/extraction/languages/go.ts @@ -49,3 +49,12 @@ export const goExtractor: LanguageExtractor = { return match?.[1]; }, }; + +import type { LanguageDef } from './types'; +export const GO_DEF: LanguageDef = { + name: 'go', + displayName: 'Go', + extensions: ['.go'], + includeGlobs: ['**/*.go'], + grammar: { wasmFile: 'tree-sitter-go.wasm', extractor: goExtractor }, +}; diff --git a/src/extraction/languages/java.ts b/src/extraction/languages/java.ts index 638533f0..9613217c 100644 --- a/src/extraction/languages/java.ts +++ b/src/extraction/languages/java.ts @@ -57,3 +57,12 @@ export const javaExtractor: LanguageExtractor = { return null; }, }; + +import type { LanguageDef } from './types'; +export const JAVA_DEF: LanguageDef = { + name: 'java', + displayName: 'Java', + extensions: ['.java'], + includeGlobs: ['**/*.java'], + grammar: { wasmFile: 'tree-sitter-java.wasm', extractor: javaExtractor }, +}; diff --git a/src/extraction/languages/javascript.ts b/src/extraction/languages/javascript.ts index 0a0d6780..946e1c5c 100644 --- a/src/extraction/languages/javascript.ts +++ b/src/extraction/languages/javascript.ts @@ -82,3 +82,12 @@ export const javascriptExtractor: LanguageExtractor = { return null; }, }; + +import type { LanguageDef } from './types'; +export const JAVASCRIPT_DEF: LanguageDef = { + name: 'javascript', + displayName: 'JavaScript', + extensions: ['.js', '.mjs', '.cjs'], + includeGlobs: ['**/*.js'], + grammar: { wasmFile: 'tree-sitter-javascript.wasm', extractor: javascriptExtractor }, +}; diff --git a/src/extraction/languages/jsx.ts b/src/extraction/languages/jsx.ts new file mode 100644 index 00000000..5091ee64 --- /dev/null +++ b/src/extraction/languages/jsx.ts @@ -0,0 +1,14 @@ +/** + * JSX — reuses the JavaScript extractor 
(the JS grammar handles JSX + * via the same `tree-sitter-javascript.wasm` file). + */ +import { javascriptExtractor } from './javascript'; +import type { LanguageDef } from './types'; + +export const JSX_DEF: LanguageDef = { + name: 'jsx', + displayName: 'JSX', + extensions: ['.jsx'], + includeGlobs: ['**/*.jsx'], + grammar: { wasmFile: 'tree-sitter-javascript.wasm', extractor: javascriptExtractor }, +}; diff --git a/src/extraction/languages/kotlin.ts b/src/extraction/languages/kotlin.ts index 19c38624..77d15609 100644 --- a/src/extraction/languages/kotlin.ts +++ b/src/extraction/languages/kotlin.ts @@ -236,3 +236,12 @@ export const kotlinExtractor: LanguageExtractor = { return null; }, }; + +import type { LanguageDef } from './types'; +export const KOTLIN_DEF: LanguageDef = { + name: 'kotlin', + displayName: 'Kotlin', + extensions: ['.kt', '.kts'], + includeGlobs: ['**/*.kt'], + grammar: { wasmFile: 'tree-sitter-kotlin.wasm', extractor: kotlinExtractor }, +}; diff --git a/src/extraction/languages/liquid.ts b/src/extraction/languages/liquid.ts new file mode 100644 index 00000000..ead2f978 --- /dev/null +++ b/src/extraction/languages/liquid.ts @@ -0,0 +1,16 @@ +/** + * Liquid — custom regex-based extractor for Shopify Liquid templates. + * Tree-sitter has no production-quality Liquid grammar; the + * `LiquidExtractor` does targeted pattern matching for snippet + * includes and Drop variable references. 
+ */ +import { LiquidExtractor } from '../liquid-extractor'; +import type { LanguageDef } from './types'; + +export const LIQUID_DEF: LanguageDef = { + name: 'liquid', + displayName: 'Liquid', + extensions: ['.liquid'], + includeGlobs: ['**/*.liquid'], + customExtractor: (filePath, source) => new LiquidExtractor(filePath, source).extract(), +}; diff --git a/src/extraction/languages/pascal.ts b/src/extraction/languages/pascal.ts index aed6a59f..a196c7b0 100644 --- a/src/extraction/languages/pascal.ts +++ b/src/extraction/languages/pascal.ts @@ -60,3 +60,30 @@ export const pascalExtractor: LanguageExtractor = { return node.type === 'declConst'; }, }; + +import type { LanguageDef } from './types'; +import { DfmExtractor } from '../dfm-extractor'; + +const dfmCustomExtractor = (filePath: string, source: string) => + new DfmExtractor(filePath, source).extract(); + +export const PASCAL_DEF: LanguageDef = { + name: 'pascal', + displayName: 'Pascal / Delphi', + extensions: ['.pas', '.dpr', '.dpk', '.lpr', '.dfm', '.fmx'], + includeGlobs: [ + '**/*.pas', '**/*.dpr', '**/*.dpk', '**/*.lpr', + '**/*.dfm', '**/*.fmx', + ], + grammar: { + wasmFile: 'tree-sitter-pascal.wasm', + vendored: true, + extractor: pascalExtractor, + }, + // .dfm/.fmx are Delphi/FireMonkey form files — declarative property + // definitions, not Pascal source. Route them to the dedicated DfmExtractor. 
+ extensionOverrides: { + '.dfm': { customExtractor: dfmCustomExtractor }, + '.fmx': { customExtractor: dfmCustomExtractor }, + }, +}; diff --git a/src/extraction/languages/php.ts b/src/extraction/languages/php.ts index 1133f979..30271286 100644 --- a/src/extraction/languages/php.ts +++ b/src/extraction/languages/php.ts @@ -103,3 +103,12 @@ export const phpExtractor: LanguageExtractor = { return null; }, }; + +import type { LanguageDef } from './types'; +export const PHP_DEF: LanguageDef = { + name: 'php', + displayName: 'PHP', + extensions: ['.php'], + includeGlobs: ['**/*.php'], + grammar: { wasmFile: 'tree-sitter-php.wasm', extractor: phpExtractor }, +}; diff --git a/src/extraction/languages/python.ts b/src/extraction/languages/python.ts index 77807d66..2cddcf40 100644 --- a/src/extraction/languages/python.ts +++ b/src/extraction/languages/python.ts @@ -51,3 +51,12 @@ export const pythonExtractor: LanguageExtractor = { return null; }, }; + +import type { LanguageDef } from './types'; +export const PYTHON_DEF: LanguageDef = { + name: 'python', + displayName: 'Python', + extensions: ['.py', '.pyw'], + includeGlobs: ['**/*.py'], + grammar: { wasmFile: 'tree-sitter-python.wasm', extractor: pythonExtractor }, +}; diff --git a/src/extraction/languages/registry.ts b/src/extraction/languages/registry.ts new file mode 100644 index 00000000..1f4ca6ae --- /dev/null +++ b/src/extraction/languages/registry.ts @@ -0,0 +1,102 @@ +/** + * Language registry — central import + collection of every per-language + * `LanguageDef`. Adding a new language is: + * + * 1. Create `src/extraction/languages/.ts` exporting an + * `_DEF: LanguageDef` constant. + * 2. Add **one** import line and **one** array entry to this file. + * + * This file is the only place a "central list" of languages lives, + * so adjacent-line conflicts between PRs adding different languages + * are limited to whichever alphabetical neighborhood they target. 
+ * + * Note: an earlier draft used `fs.readdirSync` auto-discovery which + * eliminated even this file, but `require()` of extensionless paths + * doesn't work under vitest's vite-node loader for `.ts` source. A + * generated-barrel build step would restore zero-list-edits and is + * tracked as a follow-up. + */ + +import type { LanguageDef } from './types'; + +// ===================================================================== +// Imports — one per language, alphabetical by name +// ===================================================================== +import { C_DEF, CPP_DEF } from './c-cpp'; +import { CSHARP_DEF } from './csharp'; +import { DART_DEF } from './dart'; +import { GO_DEF } from './go'; +import { JAVA_DEF } from './java'; +import { JAVASCRIPT_DEF } from './javascript'; +import { JSX_DEF } from './jsx'; +import { KOTLIN_DEF } from './kotlin'; +import { LIQUID_DEF } from './liquid'; +import { PASCAL_DEF } from './pascal'; +import { PHP_DEF } from './php'; +import { PYTHON_DEF } from './python'; +import { RUBY_DEF } from './ruby'; +import { RUST_DEF } from './rust'; +import { SVELTE_DEF } from './svelte'; +import { SWIFT_DEF } from './swift'; +import { TSX_DEF } from './tsx'; +import { TYPESCRIPT_DEF } from './typescript'; + +// ===================================================================== +// Registry — alphabetical by name +// ===================================================================== +const ALL_DEFS: readonly LanguageDef[] = [ + C_DEF, + CPP_DEF, + CSHARP_DEF, + DART_DEF, + GO_DEF, + JAVA_DEF, + JAVASCRIPT_DEF, + JSX_DEF, + KOTLIN_DEF, + LIQUID_DEF, + PASCAL_DEF, + PHP_DEF, + PYTHON_DEF, + RUBY_DEF, + RUST_DEF, + SVELTE_DEF, + SWIFT_DEF, + TSX_DEF, + TYPESCRIPT_DEF, +]; + +let byName: Map | null = null; +let byExtension: Map | null = null; + +function ensureIndexes(): void { + if (byName && byExtension) return; + byName = new Map(); + byExtension = new Map(); + for (const def of ALL_DEFS) { + byName.set(def.name, def); + for 
(const ext of def.extensions) { + byExtension.set(ext.toLowerCase(), def); + } + } +} + +export function getLanguageDefs(): readonly LanguageDef[] { + return ALL_DEFS; +} + +export function getLanguageDefByName(name: string): LanguageDef | undefined { + ensureIndexes(); + return byName!.get(name); +} + +export function getLanguageDefByExtension(ext: string): LanguageDef | undefined { + ensureIndexes(); + return byExtension!.get(ext.toLowerCase()); +} + +/** Reset cached indexes. Used by tests; no-op in production paths. */ +export function _resetRegistryCacheForTests(): void { + byName = null; + byExtension = null; +} diff --git a/src/extraction/languages/ruby.ts b/src/extraction/languages/ruby.ts index b5426165..810ac26a 100644 --- a/src/extraction/languages/ruby.ts +++ b/src/extraction/languages/ruby.ts @@ -109,3 +109,12 @@ export const rubyExtractor: LanguageExtractor = { return null; }, }; + +import type { LanguageDef } from './types'; +export const RUBY_DEF: LanguageDef = { + name: 'ruby', + displayName: 'Ruby', + extensions: ['.rb', '.rake'], + includeGlobs: ['**/*.rb'], + grammar: { wasmFile: 'tree-sitter-ruby.wasm', extractor: rubyExtractor }, +}; diff --git a/src/extraction/languages/rust.ts b/src/extraction/languages/rust.ts index 0266a2fd..35c957c0 100644 --- a/src/extraction/languages/rust.ts +++ b/src/extraction/languages/rust.ts @@ -114,3 +114,12 @@ export const rustExtractor: LanguageExtractor = { return null; }, }; + +import type { LanguageDef } from './types'; +export const RUST_DEF: LanguageDef = { + name: 'rust', + displayName: 'Rust', + extensions: ['.rs'], + includeGlobs: ['**/*.rs'], + grammar: { wasmFile: 'tree-sitter-rust.wasm', extractor: rustExtractor }, +}; diff --git a/src/extraction/languages/svelte.ts b/src/extraction/languages/svelte.ts new file mode 100644 index 00000000..7f7ab889 --- /dev/null +++ b/src/extraction/languages/svelte.ts @@ -0,0 +1,15 @@ +/** + * Svelte — custom extractor that delegates the script block back + * through 
the universal extraction pipeline as TypeScript/JavaScript, + * then merges in template-level call references. + */ +import { SvelteExtractor } from '../svelte-extractor'; +import type { LanguageDef } from './types'; + +export const SVELTE_DEF: LanguageDef = { + name: 'svelte', + displayName: 'Svelte', + extensions: ['.svelte'], + includeGlobs: ['**/*.svelte'], + customExtractor: (filePath, source) => new SvelteExtractor(filePath, source).extract(), +}; diff --git a/src/extraction/languages/swift.ts b/src/extraction/languages/swift.ts index 373fa8a9..fe1ac5ce 100644 --- a/src/extraction/languages/swift.ts +++ b/src/extraction/languages/swift.ts @@ -81,3 +81,12 @@ export const swiftExtractor: LanguageExtractor = { return null; }, }; + +import type { LanguageDef } from './types'; +export const SWIFT_DEF: LanguageDef = { + name: 'swift', + displayName: 'Swift', + extensions: ['.swift'], + includeGlobs: ['**/*.swift'], + grammar: { wasmFile: 'tree-sitter-swift.wasm', extractor: swiftExtractor }, +}; diff --git a/src/extraction/languages/tsx.ts b/src/extraction/languages/tsx.ts new file mode 100644 index 00000000..f4cbe536 --- /dev/null +++ b/src/extraction/languages/tsx.ts @@ -0,0 +1,14 @@ +/** + * TSX (TypeScript + JSX) — reuses the TypeScript extractor with a + * dedicated grammar so JSX-specific node types parse correctly. + */ +import { typescriptExtractor } from './typescript'; +import type { LanguageDef } from './types'; + +export const TSX_DEF: LanguageDef = { + name: 'tsx', + displayName: 'TSX', + extensions: ['.tsx'], + includeGlobs: ['**/*.tsx'], + grammar: { wasmFile: 'tree-sitter-tsx.wasm', extractor: typescriptExtractor }, +}; diff --git a/src/extraction/languages/types.ts b/src/extraction/languages/types.ts new file mode 100644 index 00000000..a93e1930 --- /dev/null +++ b/src/extraction/languages/types.ts @@ -0,0 +1,83 @@ +/** + * Per-language registry types. 
+ * + * Each language ships its own self-contained `LanguageDef` (file + * extensions, default-config globs, grammar config, etc.) so that + * adding a new language is a single-file addition rather than 6 + * coordinated edits across `types.ts`, `grammars.ts`, and the + * `extraction/languages/index.ts` barrel. The registry + * (`./registry`) auto-discovers definitions at module load. + */ + +import type { LanguageExtractor } from '../tree-sitter-types'; +import type { ExtractionResult } from '../../types'; + +/** + * Custom extraction function for languages that don't fit the + * universal tree-sitter AST shape (Liquid, Svelte, HCL, SQL, + * Pascal DFM/FMX form files). + */ +export type CustomExtractorFn = (filePath: string, source: string) => ExtractionResult; + +export interface GrammarBackedConfig { + /** + * WASM grammar filename. Resolved either against the + * `tree-sitter-wasms` npm package or, if `vendored` is true, + * against `src/extraction/wasm/`. + */ + wasmFile: string; + /** + * True when the WASM is shipped under `src/extraction/wasm/` + * because no pre-built grammar exists in `tree-sitter-wasms`. + */ + vendored?: boolean; + /** + * Per-language tree-sitter extraction config consumed by + * `TreeSitterExtractor`. The existing per-language objects + * (e.g. `typescriptExtractor`) are passed in here unchanged. + */ + extractor: LanguageExtractor; +} + +export interface LanguageDef { + /** + * Canonical language name. Stored as the `language` value on + * `Node`, `Edge`, and `FileRecord` rows. Should match an entry + * in the `Language` union in `src/types.ts` for known + * languages; new registry-only languages are accepted as + * strings at runtime. + */ + name: string; + /** Human-readable display label (e.g. "HCL / Terraform"). */ + displayName: string; + /** + * File extensions, lower-cased, with leading dot. Each + * extension uniquely maps to one language (caller should not + * register the same extension twice). 
*/ + extensions: readonly string[]; + /** + * Default-config include glob patterns. Combined into + * `DEFAULT_CONFIG.include` at registry load. + */ + includeGlobs: readonly string[]; + /** + * Tree-sitter grammar config. Absent for purely-custom + * languages like Liquid (regex-based) and Svelte (script + * delegation). + */ + grammar?: GrammarBackedConfig; + /** + * Whole-language custom extractor. Used when `grammar` is + * absent. If both are present, `extensionOverrides` and + * `customExtractor` win over `grammar`. + */ + customExtractor?: CustomExtractorFn; + /** + * Per-extension override. Used by Pascal where `.dfm`/`.fmx` + * (form files) are extracted by `DfmExtractor` rather than the + * tree-sitter Pascal grammar. Keys are lower-cased extensions + * with the leading dot. + */ + extensionOverrides?: Readonly<Record<string, { customExtractor: CustomExtractorFn }>>; +} diff --git a/src/extraction/languages/typescript.ts b/src/extraction/languages/typescript.ts index 9540dd94..9f82e675 100644 --- a/src/extraction/languages/typescript.ts +++ b/src/extraction/languages/typescript.ts @@ -1,5 +1,6 @@ import { getNodeText, getChildByField } from '../tree-sitter-helpers'; import type { LanguageExtractor } from '../tree-sitter-types'; +import type { LanguageDef } from './types'; export const typescriptExtractor: LanguageExtractor = { functionTypes: ['function_declaration', 'arrow_function', 'function_expression'], @@ -116,3 +117,11 @@ export const typescriptExtractor: LanguageExtractor = { return null; }, }; + +export const TYPESCRIPT_DEF: LanguageDef = { + name: 'typescript', + displayName: 'TypeScript', + extensions: ['.ts'], + includeGlobs: ['**/*.ts'], + grammar: { wasmFile: 'tree-sitter-typescript.wasm', extractor: typescriptExtractor }, +}; diff --git a/src/extraction/tree-sitter.ts b/src/extraction/tree-sitter.ts index 7345d91f..f0bd4b7c 100644 --- a/src/extraction/tree-sitter.ts +++ b/src/extraction/tree-sitter.ts @@ -19,9 +19,7 @@ import { getParser, detectLanguage, isLanguageSupported } from
'./grammars'; import { generateNodeId, getNodeText, getChildByField, getPrecedingDocstring } from './tree-sitter-helpers'; import type { LanguageExtractor, ExtractorContext } from './tree-sitter-types'; import { EXTRACTORS } from './languages'; -import { LiquidExtractor } from './liquid-extractor'; -import { SvelteExtractor } from './svelte-extractor'; -import { DfmExtractor } from './dfm-extractor'; +import { getLanguageDefByName } from './languages/registry'; // Re-export for backward compatibility export { generateNodeId } from './tree-sitter-helpers'; @@ -2319,28 +2317,21 @@ export function extractFromSource( ): ExtractionResult { const detectedLanguage = language || detectLanguage(filePath, source); const fileExtension = path.extname(filePath).toLowerCase(); + const def = getLanguageDefByName(detectedLanguage); - // Use custom extractor for Svelte - if (detectedLanguage === 'svelte') { - const extractor = new SvelteExtractor(filePath, source); - return extractor.extract(); + // Per-extension override wins (e.g. Pascal `.dfm`/`.fmx` route to + // DfmExtractor rather than the tree-sitter Pascal grammar). + const override = def?.extensionOverrides?.[fileExtension]; + if (override) { + return override.customExtractor(filePath, source); } - // Use custom extractor for Liquid - if (detectedLanguage === 'liquid') { - const extractor = new LiquidExtractor(filePath, source); - return extractor.extract(); - } - - // Use custom extractor for DFM/FMX form files - if ( - detectedLanguage === 'pascal' && - (fileExtension === '.dfm' || fileExtension === '.fmx') - ) { - const extractor = new DfmExtractor(filePath, source); - return extractor.extract(); + // Whole-language custom extractor (Liquid, Svelte, etc.). + if (def?.customExtractor) { + return def.customExtractor(filePath, source); } + // Tree-sitter path. 
const extractor = new TreeSitterExtractor(filePath, source, detectedLanguage); return extractor.extract(); } diff --git a/src/types.ts b/src/types.ts index 6834483d..e9b3cbcc 100644 --- a/src/types.ts +++ b/src/types.ts @@ -476,206 +476,11 @@ export interface CodeGraphConfig { }[]; } -/** - * Default configuration values - */ -export const DEFAULT_CONFIG: CodeGraphConfig = { - version: 1, - rootDir: '.', - include: [ - // TypeScript/JavaScript - '**/*.ts', - '**/*.tsx', - '**/*.js', - '**/*.jsx', - // Python - '**/*.py', - // Go - '**/*.go', - // Rust - '**/*.rs', - // Java - '**/*.java', - // C/C++ - '**/*.c', - '**/*.h', - '**/*.cpp', - '**/*.hpp', - '**/*.cc', - '**/*.cxx', - // C# - '**/*.cs', - // PHP - '**/*.php', - // Ruby - '**/*.rb', - // Swift - '**/*.swift', - // Kotlin - '**/*.kt', - '**/*.kts', - // Dart - '**/*.dart', - // Svelte - '**/*.svelte', - // Liquid (Shopify themes) - '**/*.liquid', - // Pascal / Delphi - '**/*.pas', - '**/*.dpr', - '**/*.dpk', - '**/*.lpr', - '**/*.dfm', - '**/*.fmx', - ], - exclude: [ - // Version control - '**/.git/**', - - // Dependencies - '**/node_modules/**', - '**/vendor/**', - '**/Pods/**', - - // Generic build outputs - '**/dist/**', - '**/build/**', - '**/out/**', - '**/bin/**', - '**/obj/**', - '**/target/**', - - // JavaScript/TypeScript - '**/*.min.js', - '**/*.bundle.js', - '**/.next/**', - '**/.nuxt/**', - '**/.svelte-kit/**', - '**/.output/**', - '**/.turbo/**', - '**/.cache/**', - '**/.parcel-cache/**', - '**/.vite/**', - '**/.astro/**', - '**/.docusaurus/**', - '**/.gatsby/**', - '**/.webpack/**', - '**/.nx/**', - '**/.yarn/cache/**', - '**/.pnpm-store/**', - '**/storybook-static/**', - - // React Native / Expo - '**/.expo/**', - '**/web-build/**', - '**/ios/Pods/**', - '**/ios/build/**', - '**/android/build/**', - '**/android/.gradle/**', - - // Python - '**/__pycache__/**', - '**/.venv/**', - '**/venv/**', - '**/site-packages/**', - '**/dist-packages/**', - '**/.pytest_cache/**', - '**/.mypy_cache/**', - 
'**/.ruff_cache/**', - '**/.tox/**', - '**/.nox/**', - '**/*.egg-info/**', - '**/.eggs/**', - - // Go - '**/go/pkg/mod/**', - - // Rust - '**/target/debug/**', - '**/target/release/**', - - // Java/Kotlin/Gradle - '**/.gradle/**', - '**/.m2/**', - '**/generated-sources/**', - '**/.kotlin/**', - - // Dart/Flutter - '**/.dart_tool/**', - - // C#/.NET - '**/.vs/**', - '**/.nuget/**', - '**/artifacts/**', - '**/publish/**', - - // C/C++ - '**/cmake-build-*/**', - '**/CMakeFiles/**', - '**/bazel-*/**', - '**/vcpkg_installed/**', - '**/.conan/**', - '**/Debug/**', - '**/Release/**', - '**/x64/**', - '**/.pio/**', // Platform.io (IoT/embedded build artifacts and library deps) - - // Electron - '**/release/**', - '**/*.app/**', - '**/*.asar', - - // Swift/iOS/Xcode - '**/DerivedData/**', - '**/.build/**', - '**/.swiftpm/**', - '**/xcuserdata/**', - '**/Carthage/Build/**', - '**/SourcePackages/**', - - // Delphi/Pascal - '**/__history/**', - '**/__recovery/**', - '**/*.dcu', - - // PHP - '**/.composer/**', - '**/storage/framework/**', - '**/bootstrap/cache/**', - - // Ruby - '**/.bundle/**', - '**/tmp/cache/**', - '**/public/assets/**', - '**/public/packs/**', - '**/.yardoc/**', - - // Testing/Coverage - '**/coverage/**', - '**/htmlcov/**', - '**/.nyc_output/**', - '**/test-results/**', - '**/.coverage/**', - - // IDE/Editor - '**/.idea/**', - - // Logs and temp - '**/logs/**', - '**/tmp/**', - '**/temp/**', - - // Documentation build output - '**/_build/**', - '**/docs/_build/**', - '**/site/**', - ], - languages: [], - frameworks: [], - maxFileSize: 1024 * 1024, // 1MB - extractDocstrings: true, - trackCallSites: true, -}; +// `DEFAULT_CONFIG` lives in `./default-config.ts` so its `include` +// list can be derived from the language registry without import +// cycles. Re-exported here for backward compat with consumers that +// already import it from `'./types'`. 
+export { DEFAULT_CONFIG } from './default-config'; // ============================================================================= // Database Types From e43a6183993008eeede1667e180ccb08f1870576 Mon Sep 17 00:00:00 2001 From: andreinknv Date: Mon, 27 Apr 2026 16:44:28 -0400 Subject: [PATCH 2/9] fix(language-registry): TreeSitterExtractor reads from def.grammar.extractor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewer caught a real bug: the original commit kept the EXTRACTORS map in src/extraction/languages/index.ts as a separate hand-curated registry that TreeSitterExtractor read from. Adding a new grammar-backed language would have required editing EXTRACTORS too, undermining the refactor's stated single-source-of- truth claim. A future contributor missing the EXTRACTORS update would silently produce empty extraction results. Fix: - TreeSitterExtractor now reads its extractor straight off the language def: getLanguageDefByName(this.language)?.grammar?.extractor - EXTRACTORS in languages/index.ts becomes a Proxy that derives lazily from the registry (kept for backward compat — readers unchanged). - Add 16 structural-invariant tests in __tests__/language-registry.test.ts that fail loudly if any derived consumer drifts from the registry: EXTRACTORS / EXTENSION_MAP / detectLanguage / isLanguageSupported / getSupportedLanguages / getLanguageDisplayName all asserted to exactly mirror the registry contents. Adding a new grammar-backed language is now genuinely "one new file + two lines in registry.ts" — no other files to touch. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- __tests__/language-registry.test.ts | 157 +++++++++++++++++++++++++++ src/extraction/languages/index.ts | 101 ++++++++++------- src/extraction/languages/registry.ts | 6 + src/extraction/tree-sitter.ts | 6 +- 4 files changed, 231 insertions(+), 39 deletions(-) create mode 100644 __tests__/language-registry.test.ts diff --git a/__tests__/language-registry.test.ts b/__tests__/language-registry.test.ts new file mode 100644 index 00000000..9afdd59a --- /dev/null +++ b/__tests__/language-registry.test.ts @@ -0,0 +1,157 @@ +/** + * Language registry: structural invariants. + * + * These tests guard against the "parallel list" failure mode that + * the registry refactor exists to prevent. If a future PR adds a + * grammar-backed language but forgets to wire it through one of + * the derived consumers, one of these tests should catch it. + */ +import { describe, it, expect } from 'vitest'; +import { + getLanguageDefs, + getLanguageDefByExtension, + getLanguageDefByName, +} from '../src/extraction/languages/registry'; +import { EXTRACTORS } from '../src/extraction/languages'; +import { + detectLanguage, + isLanguageSupported, + getSupportedLanguages, + getLanguageDisplayName, + EXTENSION_MAP, +} from '../src/extraction/grammars'; + +describe('language registry — single source of truth', () => { + it('has at least the original 19 languages', () => { + const defs = getLanguageDefs(); + expect(defs.length).toBeGreaterThanOrEqual(19); + }); + + it('every def has unique non-empty name', () => { + const names = new Set(); + for (const def of getLanguageDefs()) { + expect(def.name).toBeTruthy(); + expect(names.has(def.name)).toBe(false); + names.add(def.name); + } + }); + + it('extensions are unique across registry (one ext maps to one language)', () => { + const seen = new Map(); + for (const def of getLanguageDefs()) { + for (const ext of def.extensions) { + const lower = ext.toLowerCase(); + if (seen.has(lower)) { + // The .h 
ambiguity (C vs C++) is intentionally pinned to C + // by the registry; tree-sitter.ts has a content-sniff + // override. Anything else duplicating extensions is a bug. + throw new Error( + `Extension ${lower} mapped twice: ${seen.get(lower)} and ${def.name}` + ); + } + seen.set(lower, def.name); + } + } + }); + + it('grammar-backed defs have wasmFile + extractor', () => { + for (const def of getLanguageDefs()) { + if (!def.grammar) continue; + expect(def.grammar.wasmFile).toMatch(/^tree-sitter-.+\.wasm$/); + expect(def.grammar.extractor).toBeDefined(); + } + }); + + it('custom-extractor defs have a customExtractor function', () => { + for (const def of getLanguageDefs()) { + if (def.grammar) continue; // grammar-backed + expect(def.customExtractor).toBeInstanceOf(Function); + } + }); +}); + +describe('derived consumers stay in sync with the registry', () => { + // Catch the "parallel list drift" bug that motivated this refactor. + // If a new language gets added to registry but a derived consumer + // still hard-codes the old set, one of these will fail. + + it('EXTRACTORS contains exactly the grammar-backed languages', () => { + const grammarBacked = getLanguageDefs() + .filter((d) => d.grammar) + .map((d) => d.name) + .sort(); + const extractorKeys = Object.keys(EXTRACTORS).sort(); + expect(extractorKeys).toEqual(grammarBacked); + }); + + it('every grammar-backed extractor matches def.grammar.extractor exactly', () => { + for (const def of getLanguageDefs()) { + if (!def.grammar) continue; + expect(EXTRACTORS[def.name as keyof typeof EXTRACTORS]).toBe(def.grammar.extractor); + } + }); + + it('EXTENSION_MAP entries exactly mirror registry extensions', () => { + const expected = new Map(); + for (const def of getLanguageDefs()) { + for (const ext of def.extensions) { + expected.set(ext.toLowerCase(), def.name); + } + } + for (const [ext, lang] of expected) { + expect(EXTENSION_MAP[ext]).toBe(lang); + } + // Reverse: no extra keys in EXTENSION_MAP. 
+ expect(Object.keys(EXTENSION_MAP).sort()).toEqual([...expected.keys()].sort()); + }); + + it('detectLanguage returns the expected name for every registered extension', () => { + for (const def of getLanguageDefs()) { + for (const ext of def.extensions) { + // .h is pinned to C by the registry; the C++ heuristic only + // applies when source is provided AND looks like C++. + expect(detectLanguage(`x${ext}`)).toBe(def.name); + } + } + }); + + it('isLanguageSupported returns true for every registered language and false for unknown', () => { + for (const def of getLanguageDefs()) { + expect(isLanguageSupported(def.name as never)).toBe(true); + } + expect(isLanguageSupported('unknown' as never)).toBe(false); + }); + + it('getSupportedLanguages returns exactly the registry names', () => { + const fromRegistry = getLanguageDefs().map((d) => d.name).sort(); + const supported = (getSupportedLanguages() as string[]).sort(); + expect(supported).toEqual(fromRegistry); + }); + + it('getLanguageDisplayName uses each defs displayName', () => { + for (const def of getLanguageDefs()) { + expect(getLanguageDisplayName(def.name as never)).toBe(def.displayName); + } + }); +}); + +describe('lookup helpers', () => { + it('getLanguageDefByName returns the def for a registered name', () => { + expect(getLanguageDefByName('typescript')?.displayName).toBe('TypeScript'); + }); + + it('getLanguageDefByName returns undefined for unknown names', () => { + expect(getLanguageDefByName('nonexistent-language-name')).toBeUndefined(); + }); + + it('getLanguageDefByExtension is case-insensitive', () => { + expect(getLanguageDefByExtension('.TS')?.name).toBe('typescript'); + expect(getLanguageDefByExtension('.ts')?.name).toBe('typescript'); + }); + + it('Pascal extensionOverrides routes .dfm and .fmx to a customExtractor', () => { + const def = getLanguageDefByName('pascal'); + expect(def?.extensionOverrides?.['.dfm']?.customExtractor).toBeInstanceOf(Function); + 
expect(def?.extensionOverrides?.['.fmx']?.customExtractor).toBeInstanceOf(Function); + }); +}); diff --git a/src/extraction/languages/index.ts b/src/extraction/languages/index.ts index e5d12ac6..0e35b826 100644 --- a/src/extraction/languages/index.ts +++ b/src/extraction/languages/index.ts @@ -1,44 +1,71 @@ /** - * Per-language extraction configurations. + * Per-language barrel. * - * Each file exports a LanguageExtractor config object. - * This barrel builds the EXTRACTORS map consumed by TreeSitterExtractor. + * Adding a new language is a single-file addition: drop a + * `.ts` next to this barrel exporting an `_DEF: + * LanguageDef`, then add one import + one array entry to + * `./registry.ts`. Nothing in this file needs to change for new + * languages. + * + * `EXTRACTORS` is preserved as a backward-compat export but is now + * derived from the registry. Direct readers of `EXTRACTORS` get the + * same shape they always did; the canonical source is each + * language def's `grammar.extractor` field. */ -import { Language } from '../../types'; +import type { Language } from '../../types'; import type { LanguageExtractor } from '../tree-sitter-types'; +import { getLanguageDefs } from './registry'; + +export * from './registry'; -import { typescriptExtractor } from './typescript'; -import { javascriptExtractor } from './javascript'; -import { pythonExtractor } from './python'; -import { goExtractor } from './go'; -import { rustExtractor } from './rust'; -import { javaExtractor } from './java'; -import { cExtractor, cppExtractor } from './c-cpp'; -import { csharpExtractor } from './csharp'; -import { phpExtractor } from './php'; -import { rubyExtractor } from './ruby'; -import { swiftExtractor } from './swift'; -import { kotlinExtractor } from './kotlin'; -import { dartExtractor } from './dart'; -import { pascalExtractor } from './pascal'; +/** + * Backward-compat: `Language → LanguageExtractor` map. 
Built lazily + * on first read (the registry transitively imports modules that + * import this barrel, so building eagerly would TDZ). + */ +let _extractorsCache: Partial<Record<Language, LanguageExtractor>> | null = null; +function buildExtractors(): Partial<Record<Language, LanguageExtractor>> { + if (_extractorsCache) return _extractorsCache; + const out: Partial<Record<Language, LanguageExtractor>> = {}; + for (const def of getLanguageDefs()) { + if (def.grammar) { + out[def.name as Language] = def.grammar.extractor; + } + } + _extractorsCache = out; + return out; +} -export const EXTRACTORS: Partial<Record<Language, LanguageExtractor>> = { - typescript: typescriptExtractor, - tsx: typescriptExtractor, - javascript: javascriptExtractor, - jsx: javascriptExtractor, - python: pythonExtractor, - go: goExtractor, - rust: rustExtractor, - java: javaExtractor, - c: cExtractor, - cpp: cppExtractor, - csharp: csharpExtractor, - php: phpExtractor, - ruby: rubyExtractor, - swift: swiftExtractor, - kotlin: kotlinExtractor, - dart: dartExtractor, - pascal: pascalExtractor, -}; +/** + * Lazy Proxy keeps the existing `EXTRACTORS[lang]` access pattern + * working without forcing the registry to evaluate at module load + * (which would deadlock on the cyclic import chain through + * tree-sitter.ts). + */ +export const EXTRACTORS: Partial<Record<Language, LanguageExtractor>> = new Proxy( + {} as Partial<Record<Language, LanguageExtractor>>, + { + get(_t, key: string) { + return buildExtractors()[key as Language]; + }, + has(_t, key: string) { + return key in buildExtractors(); + }, + ownKeys() { + return Object.keys(buildExtractors()); + }, + getOwnPropertyDescriptor(_t, key: string) { + const m = buildExtractors(); + if ((key as Language) in m) { + return { + configurable: true, + enumerable: true, + writable: false, + value: m[key as Language], + }; + } + return undefined; + }, + } +); diff --git a/src/extraction/languages/registry.ts b/src/extraction/languages/registry.ts index 1f4ca6ae..7e334b72 100644 --- a/src/extraction/languages/registry.ts +++ b/src/extraction/languages/registry.ts @@ -6,6 +6,12 @@ * `<LANG>_DEF: LanguageDef` constant. * 2.
Add **one** import line and **one** array entry to this file. * + * **That is the complete change list.** All consumers + * (`grammars.ts`, `tree-sitter.ts`'s extractor lookup, + * `default-config.ts`'s include globs, the legacy `EXTRACTORS` + * barrel in `./index.ts`) all read from this registry — there is + * no parallel list to keep in sync. + * * This file is the only place a "central list" of languages lives, * so adjacent-line conflicts between PRs adding different languages * are limited to whichever alphabetical neighborhood they target. diff --git a/src/extraction/tree-sitter.ts b/src/extraction/tree-sitter.ts index f0bd4b7c..29159e2a 100644 --- a/src/extraction/tree-sitter.ts +++ b/src/extraction/tree-sitter.ts @@ -18,7 +18,6 @@ import { import { getParser, detectLanguage, isLanguageSupported } from './grammars'; import { generateNodeId, getNodeText, getChildByField, getPrecedingDocstring } from './tree-sitter-helpers'; import type { LanguageExtractor, ExtractorContext } from './tree-sitter-types'; -import { EXTRACTORS } from './languages'; import { getLanguageDefByName } from './languages/registry'; // Re-export for backward compatibility @@ -113,7 +112,10 @@ export class TreeSitterExtractor { this.filePath = filePath; this.source = source; this.language = language || detectLanguage(filePath, source); - this.extractor = EXTRACTORS[this.language] || null; + // Single source of truth: read the extractor straight off the + // language def so adding a new grammar-backed language is a + // one-file change (no parallel EXTRACTORS map to keep in sync). + this.extractor = getLanguageDefByName(this.language)?.grammar?.extractor ?? 
null; } /** From 7a9b99783a52a684f7cf0f3d1a6308980b6fb5b6 Mon Sep 17 00:00:00 2001 From: andreinknv Date: Mon, 27 Apr 2026 17:01:17 -0400 Subject: [PATCH 3/9] =?UTF-8?q?refactor:=20per-tool=20MCP=20registry=20?= =?UTF-8?q?=E2=80=94=20eliminate=20tools[]=20+=20case-switch=20conflicts?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Today every PR adding an MCP tool conflicts on the same two shared lists in src/mcp/tools.ts: the tools[] array (the list_tools surface) and the case switch in execute(). After this refactor: Adding a new MCP tool: 1. Drop a file at src/mcp/tools/.ts exporting a _TOOL: ToolModule (definition + handlerKey). 2. Add one import line and one array entry to src/mcp/tools/registry.ts. 3. Implement handle(args) on ToolHandler in tools.ts and add the new key to HandlerKey in tools/types.ts. Step 3 is the only remaining "shared method on a single class" conflict surface. Extracting handler bodies into per-tool files (making step 3 also a single-file addition) is left as a follow-up — the cost/benefit favors landing this incremental win now and finishing the body extraction once language and migration refactors land. ## What's new - **src/mcp/tool-types.ts** — extracted ToolDefinition, ToolResult, PropertySchema, projectPathProperty into a shared module so per-tool files can import without circular dependency. - **src/mcp/tools/types.ts** — ToolModule interface, HandlerKey string union, and ToolHandlerLike (a structural type that ToolHandler now `implements`, providing compile-time guarantee that every HandlerKey maps to a real method). - **src/mcp/tools/.ts × 9** — one file per existing tool (callees, callers, context, explore, files, impact, node, search, status). Each ~25-30 lines: import + definition literal + handlerKey reference. - **src/mcp/tools/registry.ts** — static-import barrel, sorted alphabetically. Exports getToolModules(), getToolModule(name), and the derived `tools[]` array. 
- **src/mcp/tools.ts** — ~200 lines deleted from the top (inline types + tools[] array + projectPathProperty). execute()'s case-switch replaced with a registry lookup + type-safe `this[mod.handlerKey](args)` dispatch (now compile- time-checked thanks to `implements ToolHandlerLike`). All `private async handle*` methods now public to match the interface. errorResult/textResult also public for the same reason. - **src/mcp/index.ts** — MCPServer's tool-existence check switched from a linear `tools.find()` scan to the O(1) `getToolModule()` Map lookup, eliminating two parallel lookup paths. ## Tests 387/387 pass. **7 new tests** in __tests__/mcp-tool-registry.test.ts: - Definitions are well-formed (name shape, description length). - handlerKey shape (`handle`). - Every registered handlerKey resolves to a real method on ToolHandler. - Exported `tools[]` exactly mirrors the registry. - Canonical 9 main-line tools regression guard. - execute() unknown-tool error path. - **End-to-end dispatch smoke test**: execute('codegraph_status', {}) reaches the real handler body (no broken `this` binding) — would fail loudly if the dynamic dispatch chain ever breaks. ## Reviewer pass Independent reviewer ran once. 2 REQUEST_CHANGES + 2 INFO addressed: 1. ToolHandlerLike was defined but never enforced — ToolHandler now `implements ToolHandlerLike`. Eliminates the `(this as unknown as Record<...>)` cast in execute(); dispatch is fully compile-time-checked. 2. No end-to-end dispatch test — added one (see Tests above). 3. MCPServer.handleToolsCall used a linear `tools.find()` scan while execute() used Map lookup — switched to getToolModule() for parity. 4. Removed redundant .slice() in registry.ts (map() already returns a fresh array). ## Backward compat src/mcp/tools.ts still re-exports ToolDefinition, ToolResult, the mutable `tools[]` array, ToolHandler, and getExploreBudget. 
Every existing consumer (`import { ToolDefinition, ToolResult, tools, ToolHandler } from './tools'`) keeps working unchanged. ## Affected open PRs - #110 (review-context): rebases to 1 new file in tools/ + 2 lines in registry.ts + 1 method on ToolHandler + 1 line in HandlerKey. - #112 (centrality+churn): same shape for the codegraph_hotspots tool. - #114 (config-refs): same shape for codegraph_config. - #115 (sql-refs): same shape for codegraph_sql. Each goes from 4-way conflict (tools[] + case + handler + helpers) down to 1-way conflict (HandlerKey + handler method on ToolHandler, both in tools.ts). Co-Authored-By: Claude Opus 4.7 (1M context) --- __tests__/mcp-tool-registry.test.ts | 79 +++++++ src/mcp/index.ts | 8 +- src/mcp/tool-types.ts | 39 ++++ src/mcp/tools.ts | 323 ++++------------------------ src/mcp/tools/callees.ts | 27 +++ src/mcp/tools/callers.ts | 27 +++ src/mcp/tools/context.ts | 32 +++ src/mcp/tools/explore.ts | 28 +++ src/mcp/tools/files.ts | 40 ++++ src/mcp/tools/impact.ts | 27 +++ src/mcp/tools/node.ts | 27 +++ src/mcp/tools/registry.ts | 65 ++++++ src/mcp/tools/search.ts | 32 +++ src/mcp/tools/status.ts | 17 ++ src/mcp/tools/types.ts | 50 +++++ 15 files changed, 541 insertions(+), 280 deletions(-) create mode 100644 __tests__/mcp-tool-registry.test.ts create mode 100644 src/mcp/tool-types.ts create mode 100644 src/mcp/tools/callees.ts create mode 100644 src/mcp/tools/callers.ts create mode 100644 src/mcp/tools/context.ts create mode 100644 src/mcp/tools/explore.ts create mode 100644 src/mcp/tools/files.ts create mode 100644 src/mcp/tools/impact.ts create mode 100644 src/mcp/tools/node.ts create mode 100644 src/mcp/tools/registry.ts create mode 100644 src/mcp/tools/search.ts create mode 100644 src/mcp/tools/status.ts create mode 100644 src/mcp/tools/types.ts diff --git a/__tests__/mcp-tool-registry.test.ts b/__tests__/mcp-tool-registry.test.ts new file mode 100644 index 00000000..6ca9cef8 --- /dev/null +++ b/__tests__/mcp-tool-registry.test.ts 
@@ -0,0 +1,79 @@ +/** + * MCP tool registry: structural invariants. + * + * Guards against the failure mode where a future PR adds a + * ToolModule but forgets to implement the matching `handle<Name>` + * method on ToolHandler (or vice versa). + */ +import { describe, it, expect } from 'vitest'; +import { getToolModules, tools as registryTools } from '../src/mcp/tools/registry'; +import { ToolHandler, tools } from '../src/mcp/tools'; + +describe('MCP tool registry — single source of truth', () => { + it('every tool module has a non-empty name and description', () => { + for (const m of getToolModules()) { + expect(m.definition.name).toMatch(/^codegraph_[a-z_]+$/); + expect(m.definition.description.length).toBeGreaterThan(20); + } + }); + + it('handlerKey is a string starting with "handle"', () => { + for (const m of getToolModules()) { + expect(m.handlerKey).toMatch(/^handle[A-Z][A-Za-z]+$/); + } + }); + + it('every registered tool has a corresponding ToolHandler method', () => { + const handler = new ToolHandler(null); + for (const m of getToolModules()) { + const fn = (handler as unknown as Record<string, unknown>)[m.handlerKey]; + expect(typeof fn).toBe('function'); + } + }); + + it('exported `tools` array exactly mirrors the registry', () => { + const fromRegistry = registryTools.map((t) => t.name).sort(); + const fromExport = tools.map((t) => t.name).sort(); + expect(fromExport).toEqual(fromRegistry); + }); + + it('all 9 main-line tools are registered (regression guard)', () => { + const expected = [ + 'codegraph_callees', + 'codegraph_callers', + 'codegraph_context', + 'codegraph_explore', + 'codegraph_files', + 'codegraph_impact', + 'codegraph_node', + 'codegraph_search', + 'codegraph_status', + ]; + const actual = getToolModules() + .map((m) => m.definition.name) + .sort(); + expect(actual).toEqual(expected); + }); + + it('execute() reports unknown-tool errors', async () => { + const handler = new ToolHandler(null); + const result = await
handler.execute('codegraph_does_not_exist', {}); + expect(result.isError).toBe(true); + expect(result.content[0]?.text).toMatch(/Unknown tool/); + }); + + it('execute() actually dispatches to the registered handler (no broken `this` binding)', async () => { + // No CodeGraph instance is bound, so handlers that call + // `getCodeGraph()` will throw — the dispatch should catch it + // and return an error result. The point of this test is to + // confirm the registry lookup + `this[handlerKey](args)` chain + // reaches an actual method body, not that the body succeeds. + const handler = new ToolHandler(null); + const result = await handler.execute('codegraph_status', {}); + expect(result.isError).toBe(true); + // Generic tool-execution-failed envelope from execute()'s catch block. + expect(result.content[0]?.text).toMatch(/Tool execution failed/); + // Specifically because no CodeGraph was bound: + expect(result.content[0]?.text).toMatch(/CodeGraph not initialized/); + }); +}); diff --git a/src/mcp/index.ts b/src/mcp/index.ts index bc3552ae..c31284a8 100644 --- a/src/mcp/index.ts +++ b/src/mcp/index.ts @@ -18,7 +18,8 @@ import * as path from 'path'; import CodeGraph, { findNearestCodeGraphRoot } from '../index'; import { StdioTransport, JsonRpcRequest, JsonRpcNotification, ErrorCodes } from './transport'; -import { tools, ToolHandler } from './tools'; +import { ToolHandler } from './tools'; +import { getToolModule } from './tools/registry'; /** * Convert a file:// URI to a filesystem path. @@ -309,8 +310,9 @@ export class MCPServer { const toolName = params.name; const toolArgs = params.arguments || {}; - // Validate tool exists - const tool = tools.find(t => t.name === toolName); + // Validate tool exists — O(1) Map lookup against the registry, + // matches the path `ToolHandler.execute()` uses internally. 
+ const tool = getToolModule(toolName)?.definition; if (!tool) { this.transport.sendError( request.id, diff --git a/src/mcp/tool-types.ts b/src/mcp/tool-types.ts new file mode 100644 index 00000000..90e94fe8 --- /dev/null +++ b/src/mcp/tool-types.ts @@ -0,0 +1,39 @@ +/** + * Shared MCP tool types. + * + * Lives in its own module so per-tool files in `./tools/` and + * the legacy class wrapper in `./tools.ts` can import the same + * type definitions without a circular dependency. + */ + +export interface PropertySchema { + type: string; + description: string; + enum?: string[]; + default?: unknown; +} + +export interface ToolDefinition { + name: string; + description: string; + inputSchema: { + type: 'object'; + properties: Record<string, PropertySchema>; + required?: string[]; + }; +} + +export interface ToolResult { + content: Array<{ type: 'text'; text: string }>; + isError?: boolean; +} + +/** + * Shared `projectPath` schema property — every tool's inputSchema + * accepts it for cross-project queries. + */ +export const projectPathProperty: PropertySchema = { + type: 'string', + description: + 'Path to a different project with .codegraph/ initialized. If omitted, uses current project. Use this to query other codebases.', +}; diff --git a/src/mcp/tools.ts b/src/mcp/tools.ts index 53713145..7a5b995a 100644 --- a/src/mcp/tools.ts +++ b/src/mcp/tools.ts @@ -11,6 +11,25 @@ import { writeFileSync, readFileSync, existsSync } from 'fs'; import { clamp, validatePathWithinRoot } from '../utils'; import { tmpdir } from 'os'; import { join } from 'path'; +import type { ToolDefinition, ToolResult } from './tool-types'; +import type { ToolHandlerLike } from './tools/types'; +import { getToolModule, tools as registryTools } from './tools/registry'; + +// Re-export shared types so existing consumers (`import { ToolDefinition, +// ToolResult } from './tools'`) keep working unchanged.
+export type { ToolDefinition, ToolResult } from './tool-types'; + +/** + * The MCP `list_tools` array, derived from the per-tool registry + * (`./tools/<name>.ts`). Adding a new tool no longer touches this + * array — drop a file in `./tools/` and add it to + * `./tools/registry.ts`. + * + * Typed as a mutable array (matching the original export shape) + * even though the underlying registry produces a readonly value; + * we slice() to materialize a fresh, mutable copy at module load. + */ +export const tools: ToolDefinition[] = registryTools.slice(); /** Maximum output length to prevent context bloat (characters) */ const MAX_OUTPUT_LENGTH = 15000; @@ -42,248 +61,6 @@ function markSessionConsulted(sessionId: string): void { } } -/** - * MCP Tool definition - */ -export interface ToolDefinition { - name: string; - description: string; - inputSchema: { - type: 'object'; - properties: Record<string, PropertySchema>; - required?: string[]; - }; -} - -interface PropertySchema { - type: string; - description: string; - enum?: string[]; - default?: unknown; -} - -/** - * Tool execution result - */ -export interface ToolResult { - content: Array<{ - type: 'text'; - text: string; - }>; - isError?: boolean; -} - -/** - * Common projectPath property for cross-project queries - */ -const projectPathProperty: PropertySchema = { - type: 'string', - description: 'Path to a different project with .codegraph/ initialized. If omitted, uses current project. Use this to query other codebases.', -}; - -/** - * All CodeGraph MCP tools - * - * Designed for minimal context usage - use codegraph_context as the primary tool, - * and only use other tools for targeted follow-up queries. - * - * All tools support cross-project queries via the optional `projectPath` parameter. - */ -export const tools: ToolDefinition[] = [ - { - name: 'codegraph_search', - description: 'Quick symbol search by name. Returns locations only (no code).
Use codegraph_context instead for comprehensive task context.', - inputSchema: { - type: 'object', - properties: { - query: { - type: 'string', - description: 'Symbol name or partial name (e.g., "auth", "signIn", "UserService")', - }, - kind: { - type: 'string', - description: 'Filter by node kind', - enum: ['function', 'method', 'class', 'interface', 'type', 'variable', 'route', 'component'], - }, - limit: { - type: 'number', - description: 'Maximum results (default: 10)', - default: 10, - }, - projectPath: projectPathProperty, - }, - required: ['query'], - }, - }, - { - name: 'codegraph_context', - description: 'PRIMARY TOOL: Build comprehensive context for a task. Returns entry points, related symbols, and key code - often enough to understand the codebase without additional tool calls. NOTE: This provides CODE context, not product requirements. For new features, still clarify UX/behavior questions with the user before implementing.', - inputSchema: { - type: 'object', - properties: { - task: { - type: 'string', - description: 'Description of the task, bug, or feature to build context for', - }, - maxNodes: { - type: 'number', - description: 'Maximum symbols to include (default: 20)', - default: 20, - }, - includeCode: { - type: 'boolean', - description: 'Include code snippets for key symbols (default: true)', - default: true, - }, - projectPath: projectPathProperty, - }, - required: ['task'], - }, - }, - { - name: 'codegraph_callers', - description: 'Find all functions/methods that call a specific symbol. 
Useful for understanding usage patterns and impact of changes.', - inputSchema: { - type: 'object', - properties: { - symbol: { - type: 'string', - description: 'Name of the function, method, or class to find callers for', - }, - limit: { - type: 'number', - description: 'Maximum number of callers to return (default: 20)', - default: 20, - }, - projectPath: projectPathProperty, - }, - required: ['symbol'], - }, - }, - { - name: 'codegraph_callees', - description: 'Find all functions/methods that a specific symbol calls. Useful for understanding dependencies and code flow.', - inputSchema: { - type: 'object', - properties: { - symbol: { - type: 'string', - description: 'Name of the function, method, or class to find callees for', - }, - limit: { - type: 'number', - description: 'Maximum number of callees to return (default: 20)', - default: 20, - }, - projectPath: projectPathProperty, - }, - required: ['symbol'], - }, - }, - { - name: 'codegraph_impact', - description: 'Analyze the impact radius of changing a symbol. Shows what code could be affected by modifications.', - inputSchema: { - type: 'object', - properties: { - symbol: { - type: 'string', - description: 'Name of the symbol to analyze impact for', - }, - depth: { - type: 'number', - description: 'How many levels of dependencies to traverse (default: 2)', - default: 2, - }, - projectPath: projectPathProperty, - }, - required: ['symbol'], - }, - }, - { - name: 'codegraph_node', - description: 'Get detailed information about a specific code symbol. 
Use includeCode=true only when you need the full source code - otherwise just get location and signature to minimize context usage.', - inputSchema: { - type: 'object', - properties: { - symbol: { - type: 'string', - description: 'Name of the symbol to get details for', - }, - includeCode: { - type: 'boolean', - description: 'Include full source code (default: false to minimize context)', - default: false, - }, - projectPath: projectPathProperty, - }, - required: ['symbol'], - }, - }, - { - name: 'codegraph_explore', - description: 'Deep exploration tool — returns comprehensive context for a topic in a SINGLE call. Groups all relevant source code by file (contiguous sections, not snippets), includes a relationship map, and uses deeper graph traversal. Designed to replace multiple codegraph_node + file Read calls. Use this instead of codegraph_context when you need thorough understanding. IMPORTANT: Use specific symbol names, file names, or short code terms in your query — NOT natural language sentences. Before calling this, use codegraph_search to discover relevant symbol names, then include those names in your query. Bad: "how are agent prompts loaded and passed to the CLI". Good: "readAgentsFromDirectory createClaudeSession chat-manager agents.ts".', - inputSchema: { - type: 'object', - properties: { - query: { - type: 'string', - description: 'Symbol names, file names, or short code terms to explore (e.g., "AuthService loginUser session-manager", "GraphTraverser BFS impact traversal.ts"). 
Use codegraph_search first to find relevant names.', - }, - maxFiles: { - type: 'number', - description: 'Maximum number of files to include source code from (default: 12)', - default: 12, - }, - projectPath: projectPathProperty, - }, - required: ['query'], - }, - }, - { - name: 'codegraph_status', - description: 'Get the status of the CodeGraph index, including statistics about indexed files, nodes, and edges.', - inputSchema: { - type: 'object', - properties: { - projectPath: projectPathProperty, - }, - }, - }, - { - name: 'codegraph_files', - description: 'REQUIRED for file/folder exploration. Get the project file structure from the CodeGraph index. Returns a tree view of all indexed files with metadata (language, symbol count). Much faster than Glob/filesystem scanning. Use this FIRST when exploring project structure, finding files, or understanding codebase organization.', - inputSchema: { - type: 'object', - properties: { - path: { - type: 'string', - description: 'Filter to files under this directory path (e.g., "src/components"). Returns all files if not specified.', - }, - pattern: { - type: 'string', - description: 'Filter files matching this glob pattern (e.g., "*.tsx", "**/*.test.ts")', - }, - format: { - type: 'string', - description: 'Output format: "tree" (hierarchical, default), "flat" (simple list), "grouped" (by language)', - enum: ['tree', 'flat', 'grouped'], - default: 'tree', - }, - includeMetadata: { - type: 'boolean', - description: 'Include file metadata like language and symbol count (default: true)', - default: true, - }, - maxDepth: { - type: 'number', - description: 'Maximum directory depth to show (default: unlimited)', - }, - projectPath: projectPathProperty, - }, - }, - }, -]; /** * Tool handler that executes tools against a CodeGraph instance @@ -291,7 +68,7 @@ export const tools: ToolDefinition[] = [ * Supports cross-project queries via the projectPath parameter. * Other projects are opened on-demand and cached for performance. 
*/ -export class ToolHandler { +export class ToolHandler implements ToolHandlerLike { // Cache of opened CodeGraph instances for cross-project queries private projectCache: Map<string, CodeGraph> = new Map(); @@ -404,32 +181,24 @@ export class ToolHandler { } /** - * Execute a tool by name + * Execute a tool by name. + * + * The dispatch table lives in `./tools/registry.ts` — this method + * just looks up the tool's `handlerKey` and invokes the matching + * `handle<Name>` method on this class. Adding a new tool means + * registering a `ToolModule` (one new file under `./tools/`, + * one entry in the registry) plus implementing + * `handle<Name>(args)` here. */ async execute(toolName: string, args: Record<string, unknown>): Promise<ToolResult> { try { - switch (toolName) { - case 'codegraph_search': - return await this.handleSearch(args); - case 'codegraph_context': - return await this.handleContext(args); - case 'codegraph_callers': - return await this.handleCallers(args); - case 'codegraph_callees': - return await this.handleCallees(args); - case 'codegraph_impact': - return await this.handleImpact(args); - case 'codegraph_explore': - return await this.handleExplore(args); - case 'codegraph_node': - return await this.handleNode(args); - case 'codegraph_status': - return await this.handleStatus(args); - case 'codegraph_files': - return await this.handleFiles(args); - default: - return this.errorResult(`Unknown tool: ${toolName}`); - } + const mod = getToolModule(toolName); + if (!mod) return this.errorResult(`Unknown tool: ${toolName}`); + // `implements ToolHandlerLike` makes this lookup type-safe: + // `mod.handlerKey` is constrained to `HandlerKey`, and every + // member of that union maps to an `(args) => Promise<ToolResult>` + // method on `this` (verified at compile time, not at runtime). + return await this[mod.handlerKey](args); } catch (err) { return this.errorResult(`Tool execution failed: ${err instanceof Error ?
err.message : String(err)}`); } @@ -438,7 +207,7 @@ export class ToolHandler { /** * Handle codegraph_search */ - private async handleSearch(args: Record<string, unknown>): Promise<ToolResult> { + async handleSearch(args: Record<string, unknown>): Promise<ToolResult> { const query = this.validateString(args.query, 'query'); if (typeof query !== 'string') return query; @@ -463,7 +232,7 @@ export class ToolHandler { /** * Handle codegraph_context */ - private async handleContext(args: Record<string, unknown>): Promise<ToolResult> { + async handleContext(args: Record<string, unknown>): Promise<ToolResult> { const task = this.validateString(args.task, 'task'); if (typeof task !== 'string') return task; @@ -529,7 +298,7 @@ export class ToolHandler { /** * Handle codegraph_callers */ - private async handleCallers(args: Record<string, unknown>): Promise<ToolResult> { + async handleCallers(args: Record<string, unknown>): Promise<ToolResult> { const symbol = this.validateString(args.symbol, 'symbol'); if (typeof symbol !== 'string') return symbol; @@ -564,7 +333,7 @@ export class ToolHandler { /** * Handle codegraph_callees */ - private async handleCallees(args: Record<string, unknown>): Promise<ToolResult> { + async handleCallees(args: Record<string, unknown>): Promise<ToolResult> { const symbol = this.validateString(args.symbol, 'symbol'); if (typeof symbol !== 'string') return symbol; @@ -599,7 +368,7 @@ export class ToolHandler { /** * Handle codegraph_impact */ - private async handleImpact(args: Record<string, unknown>): Promise<ToolResult> { + async handleImpact(args: Record<string, unknown>): Promise<ToolResult> { const symbol = this.validateString(args.symbol, 'symbol'); if (typeof symbol !== 'string') return symbol; @@ -650,7 +419,7 @@ export class ToolHandler { * then read contiguous file sections covering all symbols per file. * This replaces multiple codegraph_node + Read calls.
*/ - private async handleExplore(args: Record<string, unknown>): Promise<ToolResult> { + async handleExplore(args: Record<string, unknown>): Promise<ToolResult> { const query = this.validateString(args.query, 'query'); if (typeof query !== 'string') return query; @@ -936,7 +705,7 @@ export class ToolHandler { /** * Handle codegraph_node */ - private async handleNode(args: Record<string, unknown>): Promise<ToolResult> { + async handleNode(args: Record<string, unknown>): Promise<ToolResult> { const symbol = this.validateString(args.symbol, 'symbol'); if (typeof symbol !== 'string') return symbol; @@ -962,7 +731,7 @@ export class ToolHandler { /** * Handle codegraph_status */ - private async handleStatus(args: Record<string, unknown>): Promise<ToolResult> { + async handleStatus(args: Record<string, unknown>): Promise<ToolResult> { const cg = this.getCodeGraph(args.projectPath as string | undefined); const stats = cg.getStats(); @@ -996,7 +765,7 @@ export class ToolHandler { /** * Handle codegraph_files - get project file structure from the index */ - private async handleFiles(args: Record<string, unknown>): Promise<ToolResult> { + async handleFiles(args: Record<string, unknown>): Promise<ToolResult> { const cg = this.getCodeGraph(args.projectPath as string | undefined); const pathFilter = args.path as string | undefined; const pattern = args.pattern as string | undefined; @@ -1364,13 +1133,13 @@ export class ToolHandler { return context.summary || 'No context found'; } - private textResult(text: string): ToolResult { + textResult(text: string): ToolResult { return { content: [{ type: 'text', text }], }; } - private errorResult(message: string): ToolResult { + errorResult(message: string): ToolResult { return { content: [{ type: 'text', text: `Error: ${message}` }], isError: true, diff --git a/src/mcp/tools/callees.ts b/src/mcp/tools/callees.ts new file mode 100644 index 00000000..3c0d9740 --- /dev/null +++ b/src/mcp/tools/callees.ts @@ -0,0 +1,27 @@ +import { projectPathProperty } from '../tool-types'; +import type { ToolModule } from './types'; + +export const CALLEES_TOOL: ToolModule = { + definition: { + name: 'codegraph_callees', + description: + 'Find all functions/methods that a specific
symbol calls. Useful for understanding dependencies and code flow.', + inputSchema: { + type: 'object', + properties: { + symbol: { + type: 'string', + description: 'Name of the function, method, or class to find callees for', + }, + limit: { + type: 'number', + description: 'Maximum number of callees to return (default: 20)', + default: 20, + }, + projectPath: projectPathProperty, + }, + required: ['symbol'], + }, + }, + handlerKey: 'handleCallees', +}; diff --git a/src/mcp/tools/callers.ts b/src/mcp/tools/callers.ts new file mode 100644 index 00000000..a5d33912 --- /dev/null +++ b/src/mcp/tools/callers.ts @@ -0,0 +1,27 @@ +import { projectPathProperty } from '../tool-types'; +import type { ToolModule } from './types'; + +export const CALLERS_TOOL: ToolModule = { + definition: { + name: 'codegraph_callers', + description: + 'Find all functions/methods that call a specific symbol. Useful for understanding usage patterns and impact of changes.', + inputSchema: { + type: 'object', + properties: { + symbol: { + type: 'string', + description: 'Name of the function, method, or class to find callers for', + }, + limit: { + type: 'number', + description: 'Maximum number of callers to return (default: 20)', + default: 20, + }, + projectPath: projectPathProperty, + }, + required: ['symbol'], + }, + }, + handlerKey: 'handleCallers', +}; diff --git a/src/mcp/tools/context.ts b/src/mcp/tools/context.ts new file mode 100644 index 00000000..e8618671 --- /dev/null +++ b/src/mcp/tools/context.ts @@ -0,0 +1,32 @@ +import { projectPathProperty } from '../tool-types'; +import type { ToolModule } from './types'; + +export const CONTEXT_TOOL: ToolModule = { + definition: { + name: 'codegraph_context', + description: + 'PRIMARY TOOL: Build comprehensive context for a task. Returns entry points, related symbols, and key code - often enough to understand the codebase without additional tool calls. NOTE: This provides CODE context, not product requirements. 
For new features, still clarify UX/behavior questions with the user before implementing.', + inputSchema: { + type: 'object', + properties: { + task: { + type: 'string', + description: 'Description of the task, bug, or feature to build context for', + }, + maxNodes: { + type: 'number', + description: 'Maximum symbols to include (default: 20)', + default: 20, + }, + includeCode: { + type: 'boolean', + description: 'Include code snippets for key symbols (default: true)', + default: true, + }, + projectPath: projectPathProperty, + }, + required: ['task'], + }, + }, + handlerKey: 'handleContext', +}; diff --git a/src/mcp/tools/explore.ts b/src/mcp/tools/explore.ts new file mode 100644 index 00000000..d61b24e9 --- /dev/null +++ b/src/mcp/tools/explore.ts @@ -0,0 +1,28 @@ +import { projectPathProperty } from '../tool-types'; +import type { ToolModule } from './types'; + +export const EXPLORE_TOOL: ToolModule = { + definition: { + name: 'codegraph_explore', + description: + 'Deep exploration tool — returns comprehensive context for a topic in a SINGLE call. Groups all relevant source code by file (contiguous sections, not snippets), includes a relationship map, and uses deeper graph traversal. Designed to replace multiple codegraph_node + file Read calls. Use this instead of codegraph_context when you need thorough understanding. IMPORTANT: Use specific symbol names, file names, or short code terms in your query — NOT natural language sentences. Before calling this, use codegraph_search to discover relevant symbol names, then include those names in your query. Bad: "how are agent prompts loaded and passed to the CLI". Good: "readAgentsFromDirectory createClaudeSession chat-manager agents.ts".', + inputSchema: { + type: 'object', + properties: { + query: { + type: 'string', + description: + 'Symbol names, file names, or short code terms to explore (e.g., "AuthService loginUser session-manager", "GraphTraverser BFS impact traversal.ts"). 
Use codegraph_search first to find relevant names.', + }, + maxFiles: { + type: 'number', + description: 'Maximum number of files to include source code from (default: 12)', + default: 12, + }, + projectPath: projectPathProperty, + }, + required: ['query'], + }, + }, + handlerKey: 'handleExplore', +}; diff --git a/src/mcp/tools/files.ts b/src/mcp/tools/files.ts new file mode 100644 index 00000000..117b0676 --- /dev/null +++ b/src/mcp/tools/files.ts @@ -0,0 +1,40 @@ +import { projectPathProperty } from '../tool-types'; +import type { ToolModule } from './types'; + +export const FILES_TOOL: ToolModule = { + definition: { + name: 'codegraph_files', + description: + 'REQUIRED for file/folder exploration. Get the project file structure from the CodeGraph index. Returns a tree view of all indexed files with metadata (language, symbol count). Much faster than Glob/filesystem scanning. Use this FIRST when exploring project structure, finding files, or understanding codebase organization.', + inputSchema: { + type: 'object', + properties: { + path: { + type: 'string', + description: 'Filter to files under this directory path (e.g., "src/components"). 
Returns all files if not specified.', + }, + pattern: { + type: 'string', + description: 'Filter files matching this glob pattern (e.g., "*.tsx", "**/*.test.ts")', + }, + format: { + type: 'string', + description: 'Output format: "tree" (hierarchical, default), "flat" (simple list), "grouped" (by language)', + enum: ['tree', 'flat', 'grouped'], + default: 'tree', + }, + includeMetadata: { + type: 'boolean', + description: 'Include file metadata like language and symbol count (default: true)', + default: true, + }, + maxDepth: { + type: 'number', + description: 'Maximum directory depth to show (default: unlimited)', + }, + projectPath: projectPathProperty, + }, + }, + }, + handlerKey: 'handleFiles', +}; diff --git a/src/mcp/tools/impact.ts b/src/mcp/tools/impact.ts new file mode 100644 index 00000000..45386e6b --- /dev/null +++ b/src/mcp/tools/impact.ts @@ -0,0 +1,27 @@ +import { projectPathProperty } from '../tool-types'; +import type { ToolModule } from './types'; + +export const IMPACT_TOOL: ToolModule = { + definition: { + name: 'codegraph_impact', + description: + 'Analyze the impact radius of changing a symbol. Shows what code could be affected by modifications.', + inputSchema: { + type: 'object', + properties: { + symbol: { + type: 'string', + description: 'Name of the symbol to analyze impact for', + }, + depth: { + type: 'number', + description: 'How many levels of dependencies to traverse (default: 2)', + default: 2, + }, + projectPath: projectPathProperty, + }, + required: ['symbol'], + }, + }, + handlerKey: 'handleImpact', +}; diff --git a/src/mcp/tools/node.ts b/src/mcp/tools/node.ts new file mode 100644 index 00000000..fe61b254 --- /dev/null +++ b/src/mcp/tools/node.ts @@ -0,0 +1,27 @@ +import { projectPathProperty } from '../tool-types'; +import type { ToolModule } from './types'; + +export const NODE_TOOL: ToolModule = { + definition: { + name: 'codegraph_node', + description: + 'Get detailed information about a specific code symbol. 
Use includeCode=true only when you need the full source code - otherwise just get location and signature to minimize context usage.', + inputSchema: { + type: 'object', + properties: { + symbol: { + type: 'string', + description: 'Name of the symbol to get details for', + }, + includeCode: { + type: 'boolean', + description: 'Include full source code (default: false to minimize context)', + default: false, + }, + projectPath: projectPathProperty, + }, + required: ['symbol'], + }, + }, + handlerKey: 'handleNode', +}; diff --git a/src/mcp/tools/registry.ts b/src/mcp/tools/registry.ts new file mode 100644 index 00000000..3219f88d --- /dev/null +++ b/src/mcp/tools/registry.ts @@ -0,0 +1,65 @@ +/** + * MCP tool registry. + * + * Adding a new MCP tool is: + * + * 1. Create `src/mcp/tools/<name>.ts` exporting an + * `<NAME>_TOOL: ToolModule` constant (definition + handlerKey). + * 2. Add **one** import line and **one** array entry to this file. + * 3. Add a `handle<Name>` method on `ToolHandler` in `../tools.ts`, + * and add the new key to `HandlerKey` in `./types.ts`. + * + * The third step is currently the only "shared method on a single + * class" surface that competing PRs can collide on. Extracting + * handler bodies into per-tool files (so step 3 also becomes a + * single-file addition) is left as a follow-up.
+ */ + +import type { ToolDefinition } from '../tool-types'; +import type { ToolModule } from './types'; + +import { CALLEES_TOOL } from './callees'; +import { CALLERS_TOOL } from './callers'; +import { CONTEXT_TOOL } from './context'; +import { EXPLORE_TOOL } from './explore'; +import { FILES_TOOL } from './files'; +import { IMPACT_TOOL } from './impact'; +import { NODE_TOOL } from './node'; +import { SEARCH_TOOL } from './search'; +import { STATUS_TOOL } from './status'; + +const ALL_TOOLS: readonly ToolModule[] = [ + CALLEES_TOOL, + CALLERS_TOOL, + CONTEXT_TOOL, + EXPLORE_TOOL, + FILES_TOOL, + IMPACT_TOOL, + NODE_TOOL, + SEARCH_TOOL, + STATUS_TOOL, +]; + +let byName: Map<string, ToolModule> | null = null; +function ensureIndex(): Map<string, ToolModule> { + if (byName) return byName; + byName = new Map(); + for (const t of ALL_TOOLS) byName.set(t.definition.name, t); + return byName; +} + +export function getToolModules(): readonly ToolModule[] { + return ALL_TOOLS; +} + +export function getToolModule(name: string): ToolModule | undefined { + return ensureIndex().get(name); +} + +/** + * The `tools[]` array advertised in MCP `list_tools`. Derived from + * the registry; sorted alphabetically by tool name for stable output. + */ +export const tools: readonly ToolDefinition[] = ALL_TOOLS + .map((t) => t.definition) + .sort((a, b) => a.name.localeCompare(b.name)); diff --git a/src/mcp/tools/search.ts b/src/mcp/tools/search.ts new file mode 100644 index 00000000..c6678333 --- /dev/null +++ b/src/mcp/tools/search.ts @@ -0,0 +1,32 @@ +import { projectPathProperty } from '../tool-types'; +import type { ToolModule } from './types'; + +export const SEARCH_TOOL: ToolModule = { + definition: { + name: 'codegraph_search', + description: + 'Quick symbol search by name. Returns locations only (no code).
Use codegraph_context instead for comprehensive task context.', + inputSchema: { + type: 'object', + properties: { + query: { + type: 'string', + description: 'Symbol name or partial name (e.g., "auth", "signIn", "UserService")', + }, + kind: { + type: 'string', + description: 'Filter by node kind', + enum: ['function', 'method', 'class', 'interface', 'type', 'variable', 'route', 'component'], + }, + limit: { + type: 'number', + description: 'Maximum results (default: 10)', + default: 10, + }, + projectPath: projectPathProperty, + }, + required: ['query'], + }, + }, + handlerKey: 'handleSearch', +}; diff --git a/src/mcp/tools/status.ts b/src/mcp/tools/status.ts new file mode 100644 index 00000000..84bebcc3 --- /dev/null +++ b/src/mcp/tools/status.ts @@ -0,0 +1,17 @@ +import { projectPathProperty } from '../tool-types'; +import type { ToolModule } from './types'; + +export const STATUS_TOOL: ToolModule = { + definition: { + name: 'codegraph_status', + description: + 'Get the status of the CodeGraph index, including statistics about indexed files, nodes, and edges.', + inputSchema: { + type: 'object', + properties: { + projectPath: projectPathProperty, + }, + }, + }, + handlerKey: 'handleStatus', +}; diff --git a/src/mcp/tools/types.ts b/src/mcp/tools/types.ts new file mode 100644 index 00000000..6741d965 --- /dev/null +++ b/src/mcp/tools/types.ts @@ -0,0 +1,50 @@ +/** + * MCP tool registry types. + * + * Each tool ships its own self-contained `ToolModule` (definition + * + handler-key reference) so adding an MCP tool is a single-file + * addition for the metadata and dispatch entry. The actual handler + * bodies still live as methods on the `ToolHandler` class in + * `../tools.ts` (the helpers they call are tightly coupled and a + * full body extraction is left as a follow-up); each tool's + * `handlerKey` is the string name of the method to invoke. 
+ * + * The registry (`./registry`) imports each module and exposes + * `tools[]` (for `list_tools`) plus a `getToolModule(name)` lookup + * used by `ToolHandler.execute`. + */ + +import type { ToolDefinition, ToolResult } from '../tool-types'; + +/** + * Names of methods on `ToolHandler` that can serve as tool handlers. + * Kept as a string union (not a `keyof ToolHandler` lookup) to + * avoid a circular import — the type list is the source of truth + * and is checked structurally at the call site in `execute()`. + */ +export type HandlerKey = + | 'handleSearch' + | 'handleContext' + | 'handleCallers' + | 'handleCallees' + | 'handleImpact' + | 'handleExplore' + | 'handleNode' + | 'handleStatus' + | 'handleFiles'; + +/** + * The minimum surface a `ToolHandler`-shaped object exposes for + * dispatch. Extending `HandlerKey` adds a new entry here too. + */ +export type ToolHandlerLike = { + [K in HandlerKey]: (args: Record<string, unknown>) => Promise<ToolResult>; +} & { + errorResult(message: string): ToolResult; +}; + +export interface ToolModule { + readonly definition: ToolDefinition; + /** Method name on `ToolHandler` that runs this tool. */ + readonly handlerKey: HandlerKey; +} From 4b9322491e3314f516fffcef52f8b0fb7eacd33d Mon Sep 17 00:00:00 2001 From: andreinknv Date: Mon, 27 Apr 2026 17:09:08 -0400 Subject: [PATCH 4/9] =?UTF-8?q?refactor:=20file-based=20migrations=20?= =?UTF-8?q?=E2=80=94=20eliminate=20version-collision=20bug=20class?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Today every PR adding a schema migration claims `CURRENT_SCHEMA_VERSION = next` AND adds an array entry to `migrations: Migration[]` in src/db/migrations.ts. Two PRs both claiming the same version resolve as: "second PR's v4 silently no-ops on existing DBs" — a real silent-data-loss bug class (PR #113's reviewer caught one). After this refactor: Adding a new schema migration: 1.
Pick the next free 3-digit prefix (`git ls-files 'src/db/migrations/[0-9]*.ts'` shows what's taken).
2. Create `src/db/migrations/<NNN>-<kebab-name>.ts` exporting a `MIGRATION: MigrationModule` (description + up).
3. Add one import line and one entry to `src/db/migrations/index.ts`'s REGISTERED_MODULES array.

Two PRs both creating `004-foo.ts` collide on the FILESYSTEM — the maintainer sees it instantly. No more silent skipped migrations.

## What's new

- `src/db/migrations/types.ts` — `MigrationModule { description, up }` and `Migration extends MigrationModule { version }`.
- `src/db/migrations/002-project-metadata.ts` — extracted v2 body verbatim.
- `src/db/migrations/003-lower-name-index.ts` — extracted v3 body verbatim.
- `src/db/migrations/index.ts` — central registry. Static-imports each migration, parses the version FROM THE FILENAME (no hand-typed version field that can drift), enforces strict `NNN-kebab-name.ts` shape, validates uniqueness/sort at module load (throws loudly on collision), exposes ALL_MIGRATIONS and CURRENT_SCHEMA_VERSION.
- `src/db/migrations.ts` — refactored to a thin runner. Same exported surface (CURRENT_SCHEMA_VERSION, getCurrentVersion, runMigrations, needsMigration, getPendingMigrations, getMigrationHistory, Migration type) — every existing import keeps working unchanged.
- `__tests__/migrations-registry.test.ts` — 8 invariant tests: registry non-empty, versions unique + strictly ascending, CURRENT_SCHEMA_VERSION matches max, every file matches the strict NNN-kebab-name pattern, no orphan files, no phantom registrations.

## Reviewer pass

Independent reviewer ran once. 3 REQUEST_CHANGES + 1 INFO addressed:

1. Hand-typed `version` field in REGISTERED_MODULES could drift from filename. **Fixed**: removed the version field; registry now parses version from filename via FILENAME_PATTERN regex inside validateRegistered.
2. Filename-pattern test was lenient (allowed 4-digit or 1-digit prefixes).
**Fixed**: new "every migration file matches the strict NNN-kebab-name.ts pattern" test catches malformed filenames as orphan-detection-bypassing offenders.
3. `getPendingMigrations` returned `readonly Migration[]`, breaking callers that typed the result as `Migration[]`. **Fixed**: returns a fresh mutable array via `.slice()`.
4. No throw-on-duplicate test for validateRegistered (module evaluation timing). Acknowledged; not added.

## Backward compat

Every existing import works unchanged:

- `import { CURRENT_SCHEMA_VERSION } from './migrations'` ✓
- `import { runMigrations } from './migrations'` ✓
- `import { needsMigration } from './migrations'` ✓
- `import { getMigrationHistory } from './migrations'` ✓
- `import { getPendingMigrations } from './migrations'` — returns mutable Migration[] (preserved)
- `Migration` type — re-exported

## Affected open PRs

Every migration-touching PR (#102 UNIQUE edges, #105 cochange, #108 perf db, #111 LLM features, my #112 centrality+churn, #113 issue-history, #114 config-refs, #115 sql-refs) currently claims migration v4 and conflicts with each other on `migrations.ts`. After this lands they each become:

- 1 new file: `src/db/migrations/<NNN>-<kebab-name>.ts`
- 2 lines in `src/db/migrations/index.ts` (import + array entry)

Conflict shape changes from "next free version + array entry + CURRENT_SCHEMA_VERSION bump in one file" (4-way conflict) to "1 new file" + 2-line registry edit. If two PRs target the same NNN, the filesystem collision surfaces immediately — no silent skipped migrations.
Co-Authored-By: Claude Opus 4.7 (1M context) --- __tests__/migrations-registry.test.ts | 95 +++++++++++++++++++ src/db/migrations.ts | 93 +++++++------------ src/db/migrations/002-project-metadata.ts | 19 ++++ src/db/migrations/003-lower-name-index.ts | 10 ++ src/db/migrations/index.ts | 106 ++++++++++++++++++++++ src/db/migrations/types.ts | 25 +++++ 6 files changed, 286 insertions(+), 62 deletions(-) create mode 100644 __tests__/migrations-registry.test.ts create mode 100644 src/db/migrations/002-project-metadata.ts create mode 100644 src/db/migrations/003-lower-name-index.ts create mode 100644 src/db/migrations/index.ts create mode 100644 src/db/migrations/types.ts diff --git a/__tests__/migrations-registry.test.ts b/__tests__/migrations-registry.test.ts new file mode 100644 index 00000000..9fa15eed --- /dev/null +++ b/__tests__/migrations-registry.test.ts @@ -0,0 +1,95 @@ +/** + * Migration registry: structural invariants. + * + * Guards against the silent-no-op bug class that motivated this + * refactor. If a future PR introduces a duplicate version, + * out-of-order versions, or fails to register a new migration + * file, one of these tests fails loudly. 
+ */ +import { describe, it, expect } from 'vitest'; +import * as fs from 'fs'; +import * as path from 'path'; +import { + ALL_MIGRATIONS, + CURRENT_SCHEMA_VERSION, +} from '../src/db/migrations'; + +describe('migration registry — structural invariants', () => { + it('registry is non-empty', () => { + expect(ALL_MIGRATIONS.length).toBeGreaterThan(0); + }); + + it('versions are unique', () => { + const seen = new Set(); + for (const m of ALL_MIGRATIONS) { + expect(seen.has(m.version)).toBe(false); + seen.add(m.version); + } + }); + + it('versions are strictly ascending', () => { + for (let i = 1; i < ALL_MIGRATIONS.length; i++) { + expect(ALL_MIGRATIONS[i]!.version).toBeGreaterThan( + ALL_MIGRATIONS[i - 1]!.version + ); + } + }); + + it('each migration has a non-empty description and a function up()', () => { + for (const m of ALL_MIGRATIONS) { + expect(m.description.length).toBeGreaterThan(0); + expect(typeof m.up).toBe('function'); + } + }); + + it('CURRENT_SCHEMA_VERSION matches the highest registered version', () => { + const max = ALL_MIGRATIONS[ALL_MIGRATIONS.length - 1]!.version; + expect(CURRENT_SCHEMA_VERSION).toBe(max); + }); +}); + +describe('migration files — filename ↔ version coupling', () => { + // Read the actual filenames on disk and assert each matches an + // entry in the registry. Catches the case where someone drops a + // new file in src/db/migrations/ but forgets to register it. 
+ const migrationsDir = path.resolve(__dirname, '../src/db/migrations'); + const SUPPORT_FILES = new Set(['index.ts', 'types.ts']); + const STRICT_NNN_PATTERN = /^\d{3}-[a-z0-9]+(?:-[a-z0-9]+)*\.ts$/; + + function listMigrationFiles(): string[] { + return fs.readdirSync(migrationsDir).filter((f) => f.endsWith('.ts') && !SUPPORT_FILES.has(f)); + } + + it('every migration file matches the strict `NNN-kebab-name.ts` pattern', () => { + const offenders: string[] = []; + for (const f of listMigrationFiles()) { + if (!STRICT_NNN_PATTERN.test(f)) { + offenders.push(f); + } + } + expect(offenders).toEqual([]); + }); + + it('every src/db/migrations/NNN-*.ts file is registered (no orphan files)', () => { + const files = listMigrationFiles().filter((f) => STRICT_NNN_PATTERN.test(f)); + expect(files.length).toBeGreaterThan(0); + const registeredVersions = new Set(ALL_MIGRATIONS.map((m) => m.version)); + for (const f of files) { + const version = parseInt(f.slice(0, 3), 10); + if (!registeredVersions.has(version)) { + throw new Error( + `Migration file ${f} exists on disk but is not registered in src/db/migrations/index.ts. ` + + `Add an import + array entry for it.` + ); + } + } + }); + + it('every registered version has a matching NNN-*.ts file (no phantom registrations)', () => { + const files = listMigrationFiles().filter((f) => STRICT_NNN_PATTERN.test(f)); + const filenameVersions = new Set(files.map((f) => parseInt(f.slice(0, 3), 10))); + for (const m of ALL_MIGRATIONS) { + expect(filenameVersions.has(m.version)).toBe(true); + } + }); +}); diff --git a/src/db/migrations.ts b/src/db/migrations.ts index 0a256dbc..98325247 100644 --- a/src/db/migrations.ts +++ b/src/db/migrations.ts @@ -1,60 +1,26 @@ /** - * Database Migrations + * Database Migrations — runner + backward-compat surface. * - * Schema versioning and migration support. 
+ * The migration definitions themselves live in + * `./migrations/-.ts`, one file per migration, with + * version derived from the filename prefix. This file is the + * runner (read schema_versions, apply pending in order) and the + * stable API surface that the rest of the codebase imports. + * + * Adding a migration: see `./migrations/index.ts`. */ import { SqliteDatabase } from './sqlite-adapter'; +import { ALL_MIGRATIONS, CURRENT_SCHEMA_VERSION as REGISTRY_CURRENT } from './migrations/index'; +import type { Migration } from './migrations/types'; /** - * Current schema version + * Highest registered migration version. Derived from the + * registry; re-exported here unchanged so existing consumers + * (`import { CURRENT_SCHEMA_VERSION } from './migrations'`) keep + * working. */ -export const CURRENT_SCHEMA_VERSION = 3; - -/** - * Migration definition - */ -interface Migration { - version: number; - description: string; - up: (db: SqliteDatabase) => void; -} - -/** - * All migrations in order - * - * Note: Version 1 is the initial schema, handled by schema.sql - * Future migrations go here. 
- */ -const migrations: Migration[] = [ - { - version: 2, - description: 'Add project metadata, provenance tracking, and unresolved ref context', - up: (db) => { - db.exec(` - CREATE TABLE IF NOT EXISTS project_metadata ( - key TEXT PRIMARY KEY, - value TEXT NOT NULL, - updated_at INTEGER NOT NULL - ); - ALTER TABLE unresolved_refs ADD COLUMN file_path TEXT NOT NULL DEFAULT ''; - ALTER TABLE unresolved_refs ADD COLUMN language TEXT NOT NULL DEFAULT 'unknown'; - ALTER TABLE edges ADD COLUMN provenance TEXT DEFAULT NULL; - CREATE INDEX IF NOT EXISTS idx_unresolved_file_path ON unresolved_refs(file_path); - CREATE INDEX IF NOT EXISTS idx_edges_provenance ON edges(provenance); - `); - }, - }, - { - version: 3, - description: 'Add lower(name) expression index for memory-efficient case-insensitive lookups', - up: (db) => { - db.exec(` - CREATE INDEX IF NOT EXISTS idx_nodes_lower_name ON nodes(lower(name)); - `); - }, - }, -]; +export const CURRENT_SCHEMA_VERSION: number = REGISTRY_CURRENT; /** * Get the current schema version from the database @@ -84,17 +50,14 @@ function recordMigration(db: SqliteDatabase, version: number, description: strin * Run all pending migrations */ export function runMigrations(db: SqliteDatabase, fromVersion: number): void { - const pending = migrations.filter((m) => m.version > fromVersion); - - if (pending.length === 0) { - return; - } + const pending = ALL_MIGRATIONS.filter((m) => m.version > fromVersion); + if (pending.length === 0) return; - // Sort by version - pending.sort((a, b) => a.version - b.version); + // ALL_MIGRATIONS is already sorted by version, but filtering can + // be cheap to re-confirm. 
+ const ordered = [...pending].sort((a, b) => a.version - b.version); - // Run each migration in a transaction - for (const migration of pending) { + for (const migration of ordered) { db.transaction(() => { migration.up(db); recordMigration(db, migration.version, migration.description); @@ -111,13 +74,15 @@ export function needsMigration(db: SqliteDatabase): boolean { } /** - * Get list of pending migrations + * Get list of pending migrations. + * + * Returned as a fresh mutable array (not the underlying readonly + * registry) so callers that previously assigned the result to a + * `Migration[]`-typed variable keep working unchanged. */ export function getPendingMigrations(db: SqliteDatabase): Migration[] { const current = getCurrentVersion(db); - return migrations - .filter((m) => m.version > current) - .sort((a, b) => a.version - b.version); + return ALL_MIGRATIONS.filter((m) => m.version > current).slice(); } /** @@ -136,3 +101,7 @@ export function getMigrationHistory( description: row.description, })); } + +// Re-export the registry surface for callers that want it. 
+export { ALL_MIGRATIONS } from './migrations/index'; +export type { Migration, MigrationModule } from './migrations/types'; diff --git a/src/db/migrations/002-project-metadata.ts b/src/db/migrations/002-project-metadata.ts new file mode 100644 index 00000000..9fe7945b --- /dev/null +++ b/src/db/migrations/002-project-metadata.ts @@ -0,0 +1,19 @@ +import type { MigrationModule } from './types'; + +export const MIGRATION: MigrationModule = { + description: 'Add project metadata, provenance tracking, and unresolved ref context', + up: (db) => { + db.exec(` + CREATE TABLE IF NOT EXISTS project_metadata ( + key TEXT PRIMARY KEY, + value TEXT NOT NULL, + updated_at INTEGER NOT NULL + ); + ALTER TABLE unresolved_refs ADD COLUMN file_path TEXT NOT NULL DEFAULT ''; + ALTER TABLE unresolved_refs ADD COLUMN language TEXT NOT NULL DEFAULT 'unknown'; + ALTER TABLE edges ADD COLUMN provenance TEXT DEFAULT NULL; + CREATE INDEX IF NOT EXISTS idx_unresolved_file_path ON unresolved_refs(file_path); + CREATE INDEX IF NOT EXISTS idx_edges_provenance ON edges(provenance); + `); + }, +}; diff --git a/src/db/migrations/003-lower-name-index.ts b/src/db/migrations/003-lower-name-index.ts new file mode 100644 index 00000000..ff5416eb --- /dev/null +++ b/src/db/migrations/003-lower-name-index.ts @@ -0,0 +1,10 @@ +import type { MigrationModule } from './types'; + +export const MIGRATION: MigrationModule = { + description: 'Add lower(name) expression index for memory-efficient case-insensitive lookups', + up: (db) => { + db.exec(` + CREATE INDEX IF NOT EXISTS idx_nodes_lower_name ON nodes(lower(name)); + `); + }, +}; diff --git a/src/db/migrations/index.ts b/src/db/migrations/index.ts new file mode 100644 index 00000000..f9bbcf10 --- /dev/null +++ b/src/db/migrations/index.ts @@ -0,0 +1,106 @@ +/** + * Migration registry. + * + * Adding a new schema migration is: + * + * 1. Pick the next free 3-digit prefix (`NNN`) — `git ls-files + * 'src/db/migrations/[0-9]*.ts'` shows what's taken. + * 2. 
Create `src/db/migrations/-.ts` + * exporting a `MIGRATION: MigrationModule` (just `description` + * and `up(db)`). + * 3. Add **one** import line and **one** array entry to this file. + * + * **Why filename-derived versions instead of a field?** Two PRs + * adding migrations independently used to collide on the + * `migrations[]` array AND the `CURRENT_SCHEMA_VERSION` const. + * With monolithic migrations.ts, "I claimed v4 / you claimed v4" + * resolved as "second PR's v4 silently no-ops" — a real bug class + * (PR #113's reviewer caught one). With filename-derived versions, + * two PRs both creating `004-foo.ts` produce a filesystem-level + * conflict the maintainer sees instantly. + * + * `CURRENT_SCHEMA_VERSION` is the max of all registered versions. + */ + +import type { Migration, MigrationModule } from './types'; + +import { MIGRATION as MIG_002 } from './002-project-metadata'; +import { MIGRATION as MIG_003 } from './003-lower-name-index'; + +interface ModuleRef { + /** + * Source filename. The 3-digit prefix is the source of truth for + * the version number — `validateRegistered` parses it. Keep this + * field in sync with the actual file on disk; the + * filesystem-cross-check test catches drift. + */ + filename: string; + module: MigrationModule; +} + +/** + * Static-import list of every migration. Two PRs adding + * migrations both add a single entry here; alphabetical ordering + * puts adjacent additions on different lines unless the version + * numbers themselves collide, in which case the filesystem + * collision on `NNN-*.ts` surfaces the conflict instantly. + */ +const REGISTERED_MODULES: readonly ModuleRef[] = [ + { filename: '002-project-metadata.ts', module: MIG_002 }, + { filename: '003-lower-name-index.ts', module: MIG_003 }, +]; + +/** Strict 3-digit prefix on each migration filename. 
*/ +const FILENAME_PATTERN = /^(\d{3})-[a-z0-9]+(?:-[a-z0-9]+)*\.ts$/; + +/** + * Validate the registered set: filenames match the strict + * `NNN-name.ts` shape, version is parsed from the prefix (no + * hand-typed version field that can drift), versions are unique, + * and the result is sorted ascending. Throws loudly at module + * load if any invariant is violated rather than silently dropping + * a migration during `runMigrations()`. + */ +function validateRegistered(refs: readonly ModuleRef[]): readonly Migration[] { + if (refs.length === 0) { + throw new Error('[CodeGraph] migrations registry is empty'); + } + const parsed = refs.map((r) => { + const m = FILENAME_PATTERN.exec(r.filename); + if (!m) { + throw new Error( + `[CodeGraph] migration filename "${r.filename}" does not match ` + + `expected pattern NNN-kebab-name.ts (3-digit prefix, lowercase kebab-case body)` + ); + } + const version = parseInt(m[1]!, 10); + return { + version, + filename: r.filename, + description: r.module.description, + up: r.module.up, + }; + }); + const sorted = [...parsed].sort((a, b) => a.version - b.version); + for (let i = 1; i < sorted.length; i++) { + if (sorted[i]!.version === sorted[i - 1]!.version) { + throw new Error( + `[CodeGraph] duplicate migration version ${sorted[i]!.version}: ` + + `${sorted[i - 1]!.filename} vs ${sorted[i]!.filename}` + ); + } + } + return sorted.map((r) => ({ + version: r.version, + description: r.description, + up: r.up, + })); +} + +export const ALL_MIGRATIONS: readonly Migration[] = validateRegistered(REGISTERED_MODULES); + +/** + * Highest registered migration version. Derived from the registry + * (no hand-maintained constant to keep in sync). 
+ */ +export const CURRENT_SCHEMA_VERSION: number = ALL_MIGRATIONS[ALL_MIGRATIONS.length - 1]!.version; diff --git a/src/db/migrations/types.ts b/src/db/migrations/types.ts new file mode 100644 index 00000000..479af672 --- /dev/null +++ b/src/db/migrations/types.ts @@ -0,0 +1,25 @@ +/** + * Migration registry types. + * + * Each migration ships its own self-contained file + * (`./NNN-description.ts`) exporting a `MIGRATION: + * MigrationModule`. The version number is derived from the + * leading 3-digit prefix on the filename, NOT from a field in the + * module — this guarantees no two PRs can claim the same version + * silently (filenames collide on the filesystem; SQL migrations + * never silently no-op). + */ + +import type { SqliteDatabase } from '../sqlite-adapter'; + +export interface MigrationModule { + /** One-line description for `schema_versions` table + diagnostics. */ + readonly description: string; + /** The actual schema-mutation function. Wrapped in a transaction. */ + readonly up: (db: SqliteDatabase) => void; +} + +export interface Migration extends MigrationModule { + /** Version derived from filename's leading NNN prefix. */ + readonly version: number; +} From 20c4a3ef520664d465e6f824cb1c8d3991baac8c Mon Sep 17 00:00:00 2001 From: andreinknv Date: Mon, 27 Apr 2026 17:13:34 -0400 Subject: [PATCH 5/9] =?UTF-8?q?refactor:=20index-hook=20framework=20?= =?UTF-8?q?=E2=80=94=20eliminate=20per-pass=20CodeGraph=20mutations?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Today every PR adding a derived-signal pass (centrality, churn, issue-history, config-refs, sql-refs, cochange) edits the same 3 spots in src/index.ts: 1. New imports at the top 2. New private method on `CodeGraph` (e.g. runDerivedSignals, runIssueHistoryPass, runConfigRefsPass, runSqlRefsPass) 3. New call site in `indexAll` AFTER resolution 4. New call site in `sync` AFTER resolution 5 PRs collide on every one of those. 
After this refactor:

Adding a new derived-signal pass:

1. Create `src/index-hooks/<name>.ts` exporting a `HOOK: IndexHook` constant with `afterIndexAll` and/or `afterSync` methods.
2. Add one import + one entry to `src/index-hooks/registry.ts`.

`CodeGraph.indexAll` and `sync` invoke the hook runner once; adding a new pass touches only the hook file + the registry. Zero changes to CodeGraph itself.

## What's new

- **src/index-hooks/types.ts** — `IndexHook` interface (`afterIndexAll`, `afterSync`, both optional), `IndexHookContext` (projectRoot + config + queries + db), and `IndexHookOutcome` for diagnostic reporting.
- **src/index-hooks/registry.ts** — static-import list of every registered hook (empty on main today; PRs adding hooks fill it in), plus the `runAfterIndexAll` / `runAfterSync` runners that iterate hooks and catch errors so one broken hook never fails indexing.
- **src/index.ts** — `indexAll` calls `runAfterIndexAll(ctx)` after resolution. `sync` calls `runAfterSync(ctx, result)` after resolution. New private `buildHookContext()` helper exposes a stable read-only context.
- **__tests__/index-hooks.test.ts** — 6 tests covering empty registry, runner shape, and the `afterIndexAll` / `afterSync` contracts.

## Why ship the framework on main with zero registered hooks?

The only consumers of this framework today are 5 unmerged PRs (#105 cochange + my #112-#115). Landing the framework now lets each of those PRs rebase to a 2-line change instead of 8-10 lines mutating CodeGraph on adjacent lines. Without this, all 5 PRs collide on the same indexAll/sync call sites. The framework adds zero behavior on main (no registered hooks = no-op runner). 380→386 tests confirm no regression.
## Affected open PRs | PR | Today | After this lands | |---|---|---| | #105 cochange | runDerivedSignals helper + 2 call sites | 1 hook file in src/index-hooks/ + 2 lines in registry.ts | | #112 centrality+churn | same shape | same shape | | #113 issue-history | same shape | same shape | | #114 config-refs | same shape | same shape | | #115 sql-refs | same shape | same shape | Each goes from "edit CodeGraph in 4 spots" to "drop a hook file." Co-Authored-By: Claude Opus 4.7 (1M context) --- __tests__/index-hooks.test.ts | 109 ++++++++++++++++++++++++++++++++++ src/index-hooks/registry.ts | 89 +++++++++++++++++++++++++++ src/index-hooks/types.ts | 65 ++++++++++++++++++++ src/index.ts | 29 +++++++++ 4 files changed, 292 insertions(+) create mode 100644 __tests__/index-hooks.test.ts create mode 100644 src/index-hooks/registry.ts create mode 100644 src/index-hooks/types.ts diff --git a/__tests__/index-hooks.test.ts b/__tests__/index-hooks.test.ts new file mode 100644 index 00000000..c1f05847 --- /dev/null +++ b/__tests__/index-hooks.test.ts @@ -0,0 +1,109 @@ +/** + * Index-hook framework: register a fake hook at runtime, run an + * indexAll/sync against a synthetic project, assert the hook ran + * with the expected context shape and that errors are caught. + * + * The registry's static-import list (`REGISTERED_HOOKS`) is empty + * on main today; tests poke at the runner directly through + * `runAfterIndexAll`/`runAfterSync` rather than mutating that + * list. + */ +import { describe, it, expect } from 'vitest'; +import { + runAfterIndexAll, + runAfterSync, + getRegisteredHooks, + type IndexHook, + type IndexHookContext, +} from '../src/index-hooks/registry'; +import type { SyncResult } from '../src/extraction'; + +function makeFakeContext(): IndexHookContext { + // Hooks should not mutate the context; for the runner-shape + // tests we hand them stubs typed `as any` — the runner doesn't + // touch any of these fields itself. 
+ return { + projectRoot: '/tmp/fake-project', + /* eslint-disable @typescript-eslint/no-explicit-any */ + config: {} as any, + queries: {} as any, + db: {} as any, + /* eslint-enable */ + }; +} + +const fakeSyncResult: SyncResult = { + filesChecked: 0, + filesAdded: 0, + filesModified: 0, + filesRemoved: 0, + nodesUpdated: 0, + durationMs: 0, +}; + +describe('index-hooks registry — runner', () => { + it('main ships with no registered hooks', () => { + expect(getRegisteredHooks().length).toBe(0); + }); + + it('runAfterIndexAll on an empty registry returns an empty outcome list', async () => { + const outcomes = await runAfterIndexAll(makeFakeContext()); + expect(outcomes).toEqual([]); + }); + + it('runAfterSync on an empty registry returns an empty outcome list', async () => { + const outcomes = await runAfterSync(makeFakeContext(), fakeSyncResult); + expect(outcomes).toEqual([]); + }); +}); + +describe('index-hooks runner — fake-hook injection', () => { + // Helper: temporarily inject a fake hook by wrapping the runner + // directly. The runner accepts no array argument today; this + // suite exercises the public surface (runAfterIndexAll / + // runAfterSync) by simulating what a registered hook would do. + // When real hooks land, REGISTERED_HOOKS in registry.ts will + // contain them and this fixture-style approach disappears. + + it('a hook with afterIndexAll receives the context and is awaited', async () => { + // Build a one-off hook and call it directly — the runner's + // contract is "for each registered hook, await afterIndexAll + // if defined." We exercise that contract by calling the hook + // ourselves to confirm the IndexHookContext shape stays usable + // by hook implementations. 
+ let captured: IndexHookContext | null = null; + const hook: IndexHook = { + name: 'fake-hook', + async afterIndexAll(ctx) { + captured = ctx; + }, + }; + const ctx = makeFakeContext(); + await hook.afterIndexAll!(ctx); + expect(captured).toBe(ctx); + }); + + it('a hook with afterSync receives both ctx and result', async () => { + let capturedCtx: IndexHookContext | null = null; + let capturedResult: SyncResult | null = null; + const hook: IndexHook = { + name: 'fake-hook', + async afterSync(ctx, result) { + capturedCtx = ctx; + capturedResult = result; + }, + }; + const ctx = makeFakeContext(); + await hook.afterSync!(ctx, fakeSyncResult); + expect(capturedCtx).toBe(ctx); + expect(capturedResult).toBe(fakeSyncResult); + }); + + it('a hook missing afterIndexAll is silently skipped', () => { + // Just a typing assertion: an IndexHook without afterIndexAll + // is allowed (both methods are optional). + const hook: IndexHook = { name: 'sync-only' }; + expect(hook.afterIndexAll).toBeUndefined(); + expect(hook.afterSync).toBeUndefined(); + }); +}); diff --git a/src/index-hooks/registry.ts b/src/index-hooks/registry.ts new file mode 100644 index 00000000..d68503ee --- /dev/null +++ b/src/index-hooks/registry.ts @@ -0,0 +1,89 @@ +/** + * Index-hook registry. + * + * Adding a new derived-signal pass: + * + * 1. Create `src/index-hooks/.ts` exporting a + * `HOOK: IndexHook` constant with `afterIndexAll` and/or + * `afterSync` implementations. + * 2. Add **one** import line and **one** array entry to this file. + * + * That's it. `CodeGraph` doesn't need a new private method or + * call site for each pass — the runner inside `runHooks*` walks + * every registered hook automatically. + * + * On main today there are NO hooks registered (this file ships + * the framework only). PRs adding derived-signal passes + * (centrality, churn, issue-history, config-refs, sql-refs, + * cochange) each register their hook here. 
+ */ + +import type { IndexHook, IndexHookContext, IndexHookOutcome } from './types'; +import type { SyncResult } from '../extraction'; +import { logDebug } from '../errors'; + +/** + * Static-import list of every registered hook. + * + * Two PRs adding hooks land their entries on different lines + * (alphabetical neighborhoods rarely collide). When an entry is + * unwanted at runtime, the hook itself can short-circuit on a + * config flag inside its `afterIndexAll`/`afterSync`. + */ +const REGISTERED_HOOKS: readonly IndexHook[] = [ + // PRs adding hooks: append your `import { HOOK as _HOOK } from './';` + // above and your `_HOOK` entry here, alphabetical by name. +]; + +/** + * Run `afterIndexAll` for every registered hook. Errors are + * caught + logged so one broken hook never fails the whole + * index. Returns per-hook outcomes for diagnostics. + */ +export async function runAfterIndexAll( + ctx: IndexHookContext +): Promise { + const out: IndexHookOutcome[] = []; + for (const hook of REGISTERED_HOOKS) { + if (!hook.afterIndexAll) continue; + const start = Date.now(); + try { + await hook.afterIndexAll(ctx); + out.push({ name: hook.name, phase: 'indexAll', durationMs: Date.now() - start }); + } catch (err) { + const e = err instanceof Error ? err : new Error(String(err)); + logDebug(`index-hook "${hook.name}" afterIndexAll failed: ${e.message}`); + out.push({ name: hook.name, phase: 'indexAll', durationMs: Date.now() - start, error: e }); + } + } + return out; +} + +/** Same shape, for `afterSync`. */ +export async function runAfterSync( + ctx: IndexHookContext, + result: SyncResult +): Promise { + const out: IndexHookOutcome[] = []; + for (const hook of REGISTERED_HOOKS) { + if (!hook.afterSync) continue; + const start = Date.now(); + try { + await hook.afterSync(ctx, result); + out.push({ name: hook.name, phase: 'sync', durationMs: Date.now() - start }); + } catch (err) { + const e = err instanceof Error ? 
err : new Error(String(err)); + logDebug(`index-hook "${hook.name}" afterSync failed: ${e.message}`); + out.push({ name: hook.name, phase: 'sync', durationMs: Date.now() - start, error: e }); + } + } + return out; +} + +/** Read access for tests + diagnostic tools. */ +export function getRegisteredHooks(): readonly IndexHook[] { + return REGISTERED_HOOKS; +} + +// Re-export the types so consumers can import everything from one place. +export type { IndexHook, IndexHookContext, IndexHookOutcome } from './types'; diff --git a/src/index-hooks/types.ts b/src/index-hooks/types.ts new file mode 100644 index 00000000..f1c07558 --- /dev/null +++ b/src/index-hooks/types.ts @@ -0,0 +1,65 @@ +/** + * Index-hook types. + * + * `IndexHook`s are derived-signal passes that run AFTER core + * indexing/sync has finished — centrality computation, churn + * mining, issue history, config-ref extraction, SQL call-site + * scanning, co-change graph mining, etc. Today every such PR + * mutates `CodeGraph` directly (private method + call site in + * `indexAll` + call site in `sync`), forcing every-PR conflicts + * on adjacent lines. + * + * After the registry refactor, each pass is its own file: + * - exports a `HOOK: IndexHook` constant + * - registers itself in `./registry.ts` (1 import line + 1 array entry) + * - implements `afterIndexAll` and/or `afterSync` + * + * `CodeGraph` stops growing per-pass methods. The hook runner + * inside `CodeGraph` is a small generic loop that calls every + * registered hook in sequence, swallowing errors so one broken + * hook doesn't fail the whole index/sync. + */ + +import type { CodeGraphConfig } from '../types'; +import type { QueryBuilder } from '../db/queries'; +import type { DatabaseConnection } from '../db'; +import type { SyncResult } from '../extraction'; + +/** + * Per-call context handed to every hook. Stable shape so hooks + * don't need to import private members of `CodeGraph`. 
+ */ +export interface IndexHookContext { + readonly projectRoot: string; + readonly config: CodeGraphConfig; + readonly queries: QueryBuilder; + readonly db: DatabaseConnection; +} + +export interface IndexHook { + /** Stable identifier for logging / opt-out. */ + readonly name: string; + + /** + * Run after a full `indexAll` completes successfully. Treat + * this as a clean-slate signal — clear any cached state your + * pass owns and re-derive from scratch. + */ + afterIndexAll?(ctx: IndexHookContext): Promise | void; + + /** + * Run after `sync` completes. `result.changedFilePaths` (when + * present) is the bounded set of paths touched in this sync; + * hooks should use it to do incremental work where possible. + */ + afterSync?(ctx: IndexHookContext, result: SyncResult): Promise | void; +} + +/** Per-hook outcome reported back from the registry runner. */ +export interface IndexHookOutcome { + readonly name: string; + readonly phase: 'indexAll' | 'sync'; + readonly durationMs: number; + /** Defined when the hook threw; the runner caught it. */ + readonly error?: Error; +} diff --git a/src/index.ts b/src/index.ts index 0ff1e090..1cf55624 100644 --- a/src/index.ts +++ b/src/index.ts @@ -49,6 +49,11 @@ import { GraphTraverser, GraphQueryManager } from './graph'; import { ContextBuilder, createContextBuilder } from './context'; import { Mutex, FileLock } from './utils'; import { FileWatcher, WatchOptions } from './sync'; +import { + runAfterIndexAll as runIndexHooksAfterIndexAll, + runAfterSync as runIndexHooksAfterSync, + type IndexHookContext, +} from './index-hooks/registry'; // Re-export types for consumers export * from './types'; @@ -402,6 +407,13 @@ export class CodeGraph { }); } + // Run registered post-indexAll hooks (centrality, churn, + // issue-history, config-refs, sql-refs, …). Best-effort: + // hook errors are caught + logged inside the runner. 
+ if (result.success) { + await runIndexHooksAfterIndexAll(this.buildHookContext()); + } + return result; } finally { this.fileLock.release(); @@ -409,6 +421,18 @@ export class CodeGraph { }); } + /** + * Build the read-only context handed to every index hook. + */ + private buildHookContext(): IndexHookContext { + return { + projectRoot: this.projectRoot, + config: this.config, + queries: this.queries, + db: this.db, + }; + } + /** * Index specific files * @@ -483,6 +507,11 @@ export class CodeGraph { } } + // Run registered post-sync hooks. Same registry as the + // indexAll path — hooks distinguish via their + // `afterIndexAll` vs `afterSync` methods. + await runIndexHooksAfterSync(this.buildHookContext(), result); + return result; } finally { this.fileLock.release(); From 38887ee4fc2354e6ddc651b23d2f3be9769fbccb Mon Sep 17 00:00:00 2001 From: andreinknv Date: Mon, 27 Apr 2026 17:48:43 -0400 Subject: [PATCH 6/9] feat: PR #112 (centrality + churn + hotspots) on top of refactors Lands centrality (PageRank) and churn (git history) as registered IndexHooks (`afterIndexAll` + `afterSync`) instead of CodeGraph private methods. Adds: - Migration 004: nodes.centrality + files.{commit_count,loc, first_seen_ts,last_touched_ts} + indexes - src/centrality/ + src/churn/ (pure modules) - src/index-hooks/centrality.ts + churn.ts (registered hooks) - CodeGraph public methods: getCentrality, getTopCentralNodes, getCentralityRank, getFileChurn, getHotspots - codegraph_hotspots MCP tool wired through ToolModule registry + handleHotspots on ToolHandler - Updated regression-guard tests (index-hooks, mcp-tool-registry) to reflect newly registered hooks/tools Tests: 440/440 pass. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- __tests__/centrality.test.ts | 134 +++++++++++ __tests__/churn.test.ts | 208 +++++++++++++++++ __tests__/foundation.test.ts | 2 +- __tests__/index-hooks.test.ts | 33 ++- __tests__/mcp-tool-registry.test.ts | 3 +- __tests__/pr19-improvements.test.ts | 2 +- src/centrality/index.ts | 126 +++++++++++ src/churn/index.ts | 259 ++++++++++++++++++++++ src/config.ts | 2 + src/db/migrations/004-centrality-churn.ts | 33 +++ src/db/migrations/index.ts | 2 + src/db/queries.ts | 221 +++++++++++++++++- src/db/schema.sql | 13 +- src/default-config.ts | 2 + src/index-hooks/centrality.ts | 37 ++++ src/index-hooks/churn.ts | 53 +++++ src/index-hooks/registry.ts | 7 +- src/index.ts | 42 ++++ src/mcp/tools.ts | 51 +++++ src/mcp/tools/hotspots.ts | 37 ++++ src/mcp/tools/registry.ts | 2 + src/mcp/tools/types.ts | 3 +- src/types.ts | 37 ++++ 23 files changed, 1294 insertions(+), 15 deletions(-) create mode 100644 __tests__/centrality.test.ts create mode 100644 __tests__/churn.test.ts create mode 100644 src/centrality/index.ts create mode 100644 src/churn/index.ts create mode 100644 src/db/migrations/004-centrality-churn.ts create mode 100644 src/index-hooks/centrality.ts create mode 100644 src/index-hooks/churn.ts create mode 100644 src/mcp/tools/hotspots.ts diff --git a/__tests__/centrality.test.ts b/__tests__/centrality.test.ts new file mode 100644 index 00000000..e45dc858 --- /dev/null +++ b/__tests__/centrality.test.ts @@ -0,0 +1,134 @@ +import { describe, it, expect } from 'vitest'; +import { computePageRank, PR_DAMPING, PR_ITERATIONS } from '../src/centrality'; + +function asNodes(ids: string[]) { + return ids.map((id) => ({ id })); +} + +describe('computePageRank', () => { + it('returns empty result for an empty graph', () => { + const r = computePageRank([], []); + expect(r.scores.size).toBe(0); + expect(r.iterations).toBe(0); + }); + + it('assigns uniform rank to N isolated nodes', () => { + const r = computePageRank(asNodes(['a', 
'b', 'c', 'd']), []); + expect(r.scores.size).toBe(4); + // 4 isolated nodes — all dangling — should each end up with 1/N. + for (const v of r.scores.values()) { + expect(v).toBeCloseTo(0.25, 6); + } + }); + + it('rewards being reached (sinks accumulate rank)', () => { + // a -> b -> c. c has no outgoing, so it accumulates the most. + const r = computePageRank( + asNodes(['a', 'b', 'c']), + [ + { source: 'a', target: 'b' }, + { source: 'b', target: 'c' }, + ] + ); + const a = r.scores.get('a')!; + const b = r.scores.get('b')!; + const c = r.scores.get('c')!; + expect(c).toBeGreaterThan(b); + expect(b).toBeGreaterThan(a); + }); + + it('star: hub ranks above all leaves; leaves are equal', () => { + const leaves = ['l1', 'l2', 'l3', 'l4', 'l5', 'l6', 'l7', 'l8', 'l9']; + const edges = leaves.map((l) => ({ source: l, target: 'hub' })); + const r = computePageRank(asNodes([...leaves, 'hub']), edges); + const hub = r.scores.get('hub')!; + for (const l of leaves) { + const lv = r.scores.get(l)!; + expect(hub).toBeGreaterThan(lv); + } + // Leaves are symmetric — should be within 1e-9. + const first = r.scores.get(leaves[0])!; + for (const l of leaves.slice(1)) { + expect(r.scores.get(l)!).toBeCloseTo(first, 9); + } + }); + + it('cycle: all nodes have approximately equal rank', () => { + const r = computePageRank( + asNodes(['a', 'b', 'c']), + [ + { source: 'a', target: 'b' }, + { source: 'b', target: 'c' }, + { source: 'c', target: 'a' }, + ] + ); + const a = r.scores.get('a')!; + const b = r.scores.get('b')!; + const c = r.scores.get('c')!; + // Symmetric → all equal at convergence. 
+ expect(a).toBeCloseTo(b, 6); + expect(b).toBeCloseTo(c, 6); + }); + + it('total rank sums to ~1 (mass is conserved)', () => { + const r = computePageRank( + asNodes(['a', 'b', 'c', 'd', 'e']), + [ + { source: 'a', target: 'b' }, + { source: 'b', target: 'c' }, + { source: 'd', target: 'c' }, + { source: 'e', target: 'd' }, + { source: 'a', target: 'e' }, + ] + ); + let sum = 0; + for (const v of r.scores.values()) sum += v; + expect(sum).toBeCloseTo(1, 6); + }); + + it('preserves mass across two disconnected components', () => { + const r = computePageRank( + asNodes(['a', 'b', 'c', 'd']), + [ + { source: 'a', target: 'b' }, + { source: 'c', target: 'd' }, + ] + ); + let sum = 0; + for (const v of r.scores.values()) sum += v; + expect(sum).toBeCloseTo(1, 6); + // Within each component, the sink ranks above the source. + expect(r.scores.get('b')!).toBeGreaterThan(r.scores.get('a')!); + expect(r.scores.get('d')!).toBeGreaterThan(r.scores.get('c')!); + }); + + it('drops edges referencing unknown nodes', () => { + // 'ghost' is not in the node set — that edge should be ignored, + // not crash and not pollute scores. + const r = computePageRank( + asNodes(['a', 'b']), + [ + { source: 'a', target: 'b' }, + { source: 'a', target: 'ghost' }, + { source: 'ghost', target: 'b' }, + ] + ); + expect(r.scores.size).toBe(2); + expect(r.scores.get('b')!).toBeGreaterThan(r.scores.get('a')!); + let sum = 0; + for (const v of r.scores.values()) sum += v; + expect(sum).toBeCloseTo(1, 6); + }); + + it('reports iteration count and duration', () => { + const r = computePageRank(asNodes(['a', 'b']), [{ source: 'a', target: 'b' }]); + expect(r.iterations).toBe(PR_ITERATIONS); + expect(r.durationMs).toBeGreaterThanOrEqual(0); + }); + + it('damping constant is the textbook 0.85', () => { + // Sentinel — protects against accidental tuning that would invalidate + // the spike findings the PR was justified on. 
+ expect(PR_DAMPING).toBe(0.85); + }); +}); diff --git a/__tests__/churn.test.ts b/__tests__/churn.test.ts new file mode 100644 index 00000000..fbe279f6 --- /dev/null +++ b/__tests__/churn.test.ts @@ -0,0 +1,208 @@ +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import * as fs from 'fs'; +import * as os from 'os'; +import * as path from 'path'; +import { execFileSync } from 'child_process'; +import { + mineChurn, + getGitHead, + readFileLoc, + MAX_FILES_PER_COMMIT, + LAST_MINED_CHURN_HEAD_KEY, +} from '../src/churn'; + +let HAS_GIT = true; +try { + execFileSync('git', ['--version'], { stdio: 'ignore' }); +} catch { + HAS_GIT = false; +} + +let tempDir: string; + +function git(...args: string[]): string { + return execFileSync('git', args, { + cwd: tempDir, + encoding: 'utf-8', + env: { + ...process.env, + GIT_AUTHOR_NAME: 'Test', + GIT_AUTHOR_EMAIL: 'test@example.com', + GIT_COMMITTER_NAME: 'Test', + GIT_COMMITTER_EMAIL: 'test@example.com', + GIT_AUTHOR_DATE: process.env.GIT_AUTHOR_DATE, + GIT_COMMITTER_DATE: process.env.GIT_COMMITTER_DATE, + }, + stdio: ['pipe', 'pipe', 'pipe'], + }).trim(); +} + +function commitAt(date: string, paths: string[], content?: string) { + for (const p of paths) { + const abs = path.join(tempDir, p); + fs.mkdirSync(path.dirname(abs), { recursive: true }); + fs.writeFileSync(abs, content ?? `data for ${p} at ${date}\n`); + } + git('add', ...paths); + // Pin both author and committer dates so timestamps are deterministic. 
+ process.env.GIT_AUTHOR_DATE = date; + process.env.GIT_COMMITTER_DATE = date; + git('commit', '-m', `commit at ${date}`); + delete process.env.GIT_AUTHOR_DATE; + delete process.env.GIT_COMMITTER_DATE; +} + +beforeEach(() => { + tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-churn-')); + if (HAS_GIT) { + git('init', '-q', '-b', 'main'); + git('config', 'commit.gpgsign', 'false'); + } +}); + +afterEach(() => { + delete process.env.GIT_AUTHOR_DATE; + delete process.env.GIT_COMMITTER_DATE; + fs.rmSync(tempDir, { recursive: true, force: true }); +}); + +describe.skipIf(!HAS_GIT)('mineChurn', () => { + it('returns empty + null head when not in a git repo', () => { + const nonGit = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-nogit-')); + try { + const r = mineChurn(nonGit, new Set(['foo.ts']), null); + expect(r.currentHead).toBeNull(); + expect(r.deltas.size).toBe(0); + expect(r.needsFullRescan).toBe(false); + } finally { + fs.rmSync(nonGit, { recursive: true, force: true }); + } + }); + + it('counts commits per indexed file, ignores files not in index', () => { + commitAt('2025-01-01T00:00:00', ['a.ts', 'b.ts']); + commitAt('2025-01-02T00:00:00', ['a.ts']); + commitAt('2025-01-03T00:00:00', ['a.ts', 'b.ts', 'c.ts']); + + const r = mineChurn(tempDir, new Set(['a.ts', 'b.ts']), null); + expect(r.deltas.get('a.ts')?.commitCountDelta).toBe(3); + expect(r.deltas.get('b.ts')?.commitCountDelta).toBe(2); + expect(r.deltas.has('c.ts')).toBe(false); + }); + + it('records first-seen / last-touched as min/max of commit timestamps', () => { + commitAt('2025-01-01T00:00:00Z', ['a.ts']); + commitAt('2025-06-01T00:00:00Z', ['a.ts']); + commitAt('2025-12-01T00:00:00Z', ['a.ts']); + + const r = mineChurn(tempDir, new Set(['a.ts']), null); + const d = r.deltas.get('a.ts')!; + // 2025-01-01 UTC = 1735689600 + expect(d.firstSeenTs).toBe(1735689600); + // 2025-12-01 UTC = 1764547200 + expect(d.lastTouchedTs).toBe(1764547200); + }); + + it('skips commits touching more than 
MAX_FILES_PER_COMMIT files', () => { + const bigBatch: string[] = []; + for (let i = 0; i < MAX_FILES_PER_COMMIT + 1; i++) bigBatch.push(`f${i}.ts`); + commitAt('2025-01-01T00:00:00Z', bigBatch); + // Then a normal commit on one of the same files. + commitAt('2025-02-01T00:00:00Z', ['f0.ts']); + + const r = mineChurn(tempDir, new Set(bigBatch), null); + // First commit was skipped; only the second one should count. + expect(r.deltas.get('f0.ts')?.commitCountDelta).toBe(1); + // Files only seen in the skipped commit produce no delta at all. + expect(r.deltas.has('f50.ts')).toBe(false); + }); + + it('incremental mining returns only commits since the given sha', () => { + commitAt('2025-01-01T00:00:00Z', ['a.ts']); + const sha1 = getGitHead(tempDir)!; + commitAt('2025-01-02T00:00:00Z', ['a.ts']); + commitAt('2025-01-03T00:00:00Z', ['a.ts']); + + const incr = mineChurn(tempDir, new Set(['a.ts']), sha1); + // Only the two commits *after* sha1 should be counted. + expect(incr.deltas.get('a.ts')?.commitCountDelta).toBe(2); + expect(incr.needsFullRescan).toBe(false); + }); + + it('returns needsFullRescan=true when sinceSha is unreachable', () => { + commitAt('2025-01-01T00:00:00Z', ['a.ts']); + const fakeSha = '0'.repeat(40); + const r = mineChurn(tempDir, new Set(['a.ts']), fakeSha); + expect(r.needsFullRescan).toBe(true); + expect(r.deltas.size).toBe(0); + expect(r.currentHead).not.toBeNull(); + }); + + it('returns empty deltas when sinceSha equals current head (no-op)', () => { + commitAt('2025-01-01T00:00:00Z', ['a.ts']); + const head = getGitHead(tempDir)!; + const r = mineChurn(tempDir, new Set(['a.ts']), head); + expect(r.currentHead).toBe(head); + expect(r.deltas.size).toBe(0); + expect(r.needsFullRescan).toBe(false); + }); + + it('handles paths with spaces and unicode safely (NUL-delimited)', () => { + commitAt('2025-01-01T00:00:00Z', ['name with space.ts']); + commitAt('2025-01-02T00:00:00Z', ['ünïcødë.ts']); + + const r = mineChurn( + tempDir, + new Set(['name 
with space.ts', 'ünïcødë.ts']), + null + ); + expect(r.deltas.get('name with space.ts')?.commitCountDelta).toBe(1); + expect(r.deltas.get('ünïcødë.ts')?.commitCountDelta).toBe(1); + }); + + it('LAST_MINED_CHURN_HEAD_KEY is stable (used as project_metadata key)', () => { + expect(LAST_MINED_CHURN_HEAD_KEY).toBe('last_mined_churn_head'); + }); +}); + +describe('readFileLoc', () => { + it('returns 0 for an empty file', () => { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-loc-')); + try { + const f = path.join(dir, 'empty.txt'); + fs.writeFileSync(f, ''); + expect(readFileLoc(dir, 'empty.txt')).toBe(0); + } finally { + fs.rmSync(dir, { recursive: true, force: true }); + } + }); + + it('counts newline-terminated lines', () => { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-loc-')); + try { + fs.writeFileSync(path.join(dir, 'x.txt'), 'a\nb\nc\n'); + expect(readFileLoc(dir, 'x.txt')).toBe(3); + } finally { + fs.rmSync(dir, { recursive: true, force: true }); + } + }); + + it('counts a final no-newline chunk as one extra line', () => { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-loc-')); + try { + fs.writeFileSync(path.join(dir, 'x.txt'), 'a\nb\nc'); + expect(readFileLoc(dir, 'x.txt')).toBe(3); + } finally { + fs.rmSync(dir, { recursive: true, force: true }); + } + }); + + it('returns 0 for a missing file (does not throw)', () => { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-loc-')); + try { + expect(readFileLoc(dir, 'no-such-file.txt')).toBe(0); + } finally { + fs.rmSync(dir, { recursive: true, force: true }); + } + }); +}); diff --git a/__tests__/foundation.test.ts b/__tests__/foundation.test.ts index 9ee437da..4e8f204a 100644 --- a/__tests__/foundation.test.ts +++ b/__tests__/foundation.test.ts @@ -305,7 +305,7 @@ describe('Database Connection', () => { const version = db.getSchemaVersion(); expect(version).not.toBeNull(); - expect(version?.version).toBe(3); + expect(version?.version).toBe(4); db.close(); }); diff --git 
a/__tests__/index-hooks.test.ts b/__tests__/index-hooks.test.ts index c1f05847..639587f9 100644 --- a/__tests__/index-hooks.test.ts +++ b/__tests__/index-hooks.test.ts @@ -42,18 +42,39 @@ const fakeSyncResult: SyncResult = { }; describe('index-hooks registry — runner', () => { - it('main ships with no registered hooks', () => { - expect(getRegisteredHooks().length).toBe(0); + it('registered hooks expose stable {name, afterIndexAll|afterSync} shape', () => { + const hooks = getRegisteredHooks(); + expect(hooks.length).toBeGreaterThanOrEqual(0); + for (const h of hooks) { + expect(typeof h.name).toBe('string'); + expect(h.afterIndexAll === undefined || typeof h.afterIndexAll === 'function').toBe(true); + expect(h.afterSync === undefined || typeof h.afterSync === 'function').toBe(true); + } }); - it('runAfterIndexAll on an empty registry returns an empty outcome list', async () => { + it('runAfterIndexAll returns one outcome per registered hook, swallowing per-hook errors', async () => { + // Registered hooks will throw on the fake `{} as any` ctx; the + // runner contract is to catch + report each error so one bad + // hook never fails the whole pass. 
const outcomes = await runAfterIndexAll(makeFakeContext()); - expect(outcomes).toEqual([]); + const expectedCount = getRegisteredHooks().filter((h) => h.afterIndexAll).length; + expect(outcomes.length).toBe(expectedCount); + for (const o of outcomes) { + expect(typeof o.name).toBe('string'); + expect(o.phase).toBe('indexAll'); + expect(typeof o.durationMs).toBe('number'); + } }); - it('runAfterSync on an empty registry returns an empty outcome list', async () => { + it('runAfterSync returns one outcome per registered hook, swallowing per-hook errors', async () => { const outcomes = await runAfterSync(makeFakeContext(), fakeSyncResult); - expect(outcomes).toEqual([]); + const expectedCount = getRegisteredHooks().filter((h) => h.afterSync).length; + expect(outcomes.length).toBe(expectedCount); + for (const o of outcomes) { + expect(typeof o.name).toBe('string'); + expect(o.phase).toBe('sync'); + expect(typeof o.durationMs).toBe('number'); + } }); }); diff --git a/__tests__/mcp-tool-registry.test.ts b/__tests__/mcp-tool-registry.test.ts index 6ca9cef8..b8ce3025 100644 --- a/__tests__/mcp-tool-registry.test.ts +++ b/__tests__/mcp-tool-registry.test.ts @@ -37,13 +37,14 @@ describe('MCP tool registry — single source of truth', () => { expect(fromExport).toEqual(fromRegistry); }); - it('all 9 main-line tools are registered (regression guard)', () => { + it('all main-line tools are registered (regression guard)', () => { const expected = [ 'codegraph_callees', 'codegraph_callers', 'codegraph_context', 'codegraph_explore', 'codegraph_files', + 'codegraph_hotspots', 'codegraph_impact', 'codegraph_node', 'codegraph_search', diff --git a/__tests__/pr19-improvements.test.ts b/__tests__/pr19-improvements.test.ts index 5fbe17d7..d43dceb2 100644 --- a/__tests__/pr19-improvements.test.ts +++ b/__tests__/pr19-improvements.test.ts @@ -299,7 +299,7 @@ describe('Best-Candidate Resolution', () => { describe('Schema v2 Migration', () => { it.skipIf(!HAS_SQLITE)('should have correct 
current schema version', async () => { const { CURRENT_SCHEMA_VERSION } = await import('../src/db/migrations'); - expect(CURRENT_SCHEMA_VERSION).toBe(3); + expect(CURRENT_SCHEMA_VERSION).toBe(4); }); it.skipIf(!HAS_SQLITE)('should have migration for version 2', async () => { diff --git a/src/centrality/index.ts b/src/centrality/index.ts new file mode 100644 index 00000000..d03f2206 --- /dev/null +++ b/src/centrality/index.ts @@ -0,0 +1,126 @@ +/** + * Centrality computation + * + * Computes PageRank over the `calls` + `references` subgraph and + * persists each node's score on the `nodes.centrality` column. Pure + * compute — no I/O — so the caller owns reading edges, writing scores, + * and deciding when to re-run. + * + * PageRank is the right shape for "what is structurally important?" + * because it rewards being reached (weighted by the importance of who + * reaches you), not just raw in-degree. A method called once from a + * central interface ranks above a method called many times from a + * leaf script. + * + * Edges of kind `contains` are deliberately excluded — they encode + * lexical containment (file → class → method), which would dominate + * the rank and hide actual reference flow. + * + * Side benefit observed in spike data: PageRank accidentally surfaces + * resolver false-positives. Generic short names (`trim`, `run`) that + * the resolver over-merges across files accumulate edges from many + * sources and float to the top alongside genuine hubs. Useful as a + * diagnostic; not a goal of this module. + */ + +/** Damping factor — fraction of rank propagated through edges each step. */ +export const PR_DAMPING = 0.85; + +/** + * Iteration count. PageRank converges geometrically; 40 iterations puts + * us well below 1e-6 residual on graphs we've seen, with no per-graph + * tuning needed. + */ +export const PR_ITERATIONS = 40; + +/** Edge kinds that contribute to centrality. 
*/
+export const PR_EDGE_KINDS = ['calls', 'references'] as const;
+
+export type PrEdgeKind = (typeof PR_EDGE_KINDS)[number];
+
+export interface CentralityResult {
+  /** nodeId → PageRank score in (0, 1). Sums to ~1.0 across all nodes. */
+  scores: Map<string, number>;
+  /** Iterations actually run (currently always PR_ITERATIONS — kept for forward compat). */
+  iterations: number;
+  /** Wall-clock duration in milliseconds. */
+  durationMs: number;
+}
+
+interface NodeRef {
+  id: string;
+}
+
+interface EdgeRef {
+  source: string;
+  target: string;
+}
+
+/**
+ * Compute PageRank scores for the supplied nodes/edges.
+ *
+ * @param nodes All graph nodes (only `id` is read).
+ * @param edges Edges that contribute to centrality. Caller is
+ *              responsible for filtering to `PR_EDGE_KINDS`.
+ *
+ * Edges referencing unknown node ids are silently dropped — the
+ * underlying graph has FK cascades, so dangling references can only
+ * occur mid-write and are not our problem to fix here.
+ */
+export function computePageRank(nodes: NodeRef[], edges: EdgeRef[]): CentralityResult {
+  const start = Date.now();
+  const N = nodes.length;
+  const scores = new Map<string, number>();
+  if (N === 0) {
+    return { scores, iterations: 0, durationMs: Date.now() - start };
+  }
+
+  // Index nodes for tight numeric loops. Float64Array gives ~3× speedup
+  // over Array(N).fill on million-edge graphs and costs nothing on
+  // smaller ones.
+  const idx = new Map<string, number>();
+  for (let i = 0; i < N; i++) {
+    const n = nodes[i]!;
+    idx.set(n.id, i);
+  }
+
+  const inEdges: number[][] = Array.from({ length: N }, () => []);
+  const outDeg = new Int32Array(N);
+  for (const e of edges) {
+    const s = idx.get(e.source);
+    const t = idx.get(e.target);
+    if (s === undefined || t === undefined) continue;
+    inEdges[t]!.push(s);
+    outDeg[s]!
+= 1; + } + + let pr = new Float64Array(N).fill(1 / N); + const baseline = (1 - PR_DAMPING) / N; + + for (let it = 0; it < PR_ITERATIONS; it++) { + const next = new Float64Array(N).fill(baseline); + + // Distribute the rank of dangling nodes (no outgoing edges) uniformly. + // Without this the total rank decays each iteration. + let danglingSum = 0; + for (let i = 0; i < N; i++) { + if (outDeg[i] === 0) danglingSum += pr[i]!; + } + const danglingShare = (PR_DAMPING * danglingSum) / N; + for (let i = 0; i < N; i++) next[i]! += danglingShare; + + for (let t = 0; t < N; t++) { + const sources = inEdges[t]!; + let s = 0; + for (let k = 0; k < sources.length; k++) { + const src = sources[k]!; + s += pr[src]! / outDeg[src]!; + } + next[t]! += PR_DAMPING * s; + } + pr = next; + } + + for (let i = 0; i < N; i++) scores.set(nodes[i]!.id, pr[i]!); + return { scores, iterations: PR_ITERATIONS, durationMs: Date.now() - start }; +} diff --git a/src/churn/index.ts b/src/churn/index.ts new file mode 100644 index 00000000..1c332886 --- /dev/null +++ b/src/churn/index.ts @@ -0,0 +1,259 @@ +/** + * Per-file churn mining + * + * Reads `git log` to compute four signals per indexed file: + * - commit_count (how often the file gets touched) + * - first_seen_ts (when it entered the codebase) + * - last_touched_ts (how recently it was modified) + * - loc (line count of the current on-disk content) + * + * Combined with PageRank centrality (see ../centrality), these answer + * "where do bugs hide?" — central files that change often are the + * highest-expected-value review targets, validated empirically against + * codegraph's own history (e.g. `src/extraction/tree-sitter.ts`). + * + * Storage strategy: scalar columns on `files` (one row already exists + * per indexed path; adding columns avoids a JOIN on every read). + * + * Incremental update: persist `last_mined_churn_head` in + * project_metadata; on subsequent mines, only enumerate commits in + * `..HEAD`. 
This keeps `sync` fast on long histories. If the + * stored sha is unreachable (force-push, gc), the caller gets + * `needsFullRescan: true` and re-mines from scratch after `clearChurn`. + * + * Rename note: `git log --name-only` (without `--follow`) reports + * post-rename paths only. The pre-rename history is therefore not + * counted toward the new path's `commit_count`. `--follow` would fix + * this but is documented as O(N) per file and shells out individually, + * so v1 accepts the under-count and surfaces it in the doc-comment on + * `commitCount` in types.ts. + */ + +import { execFileSync } from 'child_process'; +import * as fs from 'fs'; +import * as path from 'path'; +import { logDebug } from '../errors'; + +/** + * Skip commits that touch more than this many indexed files. Merge + * commits and mass refactors otherwise inflate every file's + * commit_count without any real coupling signal. + */ +export const MAX_FILES_PER_COMMIT = 50; + +/** Sentinel for `git log --pretty=tformat:`; cannot collide with a path. */ +const COMMIT_HEADER_PREFIX = 'CGCMT-'; + +/** Project-metadata key holding the HEAD SHA of the last mined commit. */ +export const LAST_MINED_CHURN_HEAD_KEY = 'last_mined_churn_head'; + +/** Hard cap on git output we'll buffer (bytes). Matches cochange. */ +const MAX_GIT_BUFFER = 200 * 1024 * 1024; + +/** Wall-clock cap on a single git invocation (ms). */ +const GIT_TIMEOUT_MS = 60_000; + +export interface FileChurnDelta { + path: string; + /** Commits to add to the existing commit_count. */ + commitCountDelta: number; + /** + * Most recent commit timestamp (unix seconds) seen in this delta. + * Caller takes max() with the existing value. + */ + lastTouchedTs: number; + /** + * Earliest commit timestamp (unix seconds) in this delta. Caller + * applies `COALESCE(existing, this)` so the first-seen column only + * gets written once. 
+   */
+  firstSeenTs: number;
+}
+
+export interface ChurnMineResult {
+  deltas: Map<string, FileChurnDelta>;
+  /** HEAD SHA reached by this run; null when not in a git repo. */
+  currentHead: string | null;
+  /**
+   * True when the caller's `sinceSha` was unreachable (force-push, gc).
+   * Caller should `clearChurn()` and re-mine with `sinceSha=null`.
+   */
+  needsFullRescan: boolean;
+}
+
+/**
+ * Get the current HEAD commit SHA, or null when not in a git repo or
+ * the repo has no commits yet.
+ */
+export function getGitHead(rootDir: string): string | null {
+  try {
+    return (
+      execFileSync('git', ['rev-parse', 'HEAD'], {
+        cwd: rootDir,
+        encoding: 'utf-8',
+        timeout: 5000,
+        stdio: ['pipe', 'pipe', 'pipe'],
+      }).trim() || null
+    );
+  } catch {
+    return null;
+  }
+}
+
+/**
+ * Verify that a stored SHA is still reachable from HEAD. After
+ * force-push or `git gc` it can disappear, in which case incremental
+ * mining would silently miss commits.
+ */
+function isShaReachable(rootDir: string, sha: string): boolean {
+  try {
+    execFileSync('git', ['cat-file', '-e', `${sha}^{commit}`], {
+      cwd: rootDir,
+      timeout: 5000,
+      stdio: ['pipe', 'pipe', 'pipe'],
+    });
+    return true;
+  } catch {
+    return false;
+  }
+}
+
+/**
+ * Read the LOC of a file as currently on disk. Cheap; always fresh.
+ *
+ * Counts newline-delimited lines: a file with content `"a\nb\n"`
+ * reports 2; an empty file reports 0; a file ending without a newline
+ * still reports the visible-line count.
+ */
+export function readFileLoc(rootDir: string, relPath: string): number {
+  try {
+    const abs = path.join(rootDir, relPath);
+    const content = fs.readFileSync(abs, 'utf8');
+    if (content.length === 0) return 0;
+    let lines = 0;
+    for (let i = 0; i < content.length; i++) if (content.charCodeAt(i) === 10) lines++;
+    // Trailing chunk without final newline still counts as a line.
+    if (content.charCodeAt(content.length - 1) !== 10) lines++;
+    return lines;
+  } catch {
+    return 0;
+  }
+}
+
+/**
+ * Mine git log for per-file commit metrics.
+ *
+ * @param rootDir      Project root.
+ * @param indexedFiles Paths we care about (deltas only emitted for
+ *                     these). Files outside this set are ignored
+ *                     per-commit so churn doesn't accumulate for
+ *                     paths the index has no other knowledge of.
+ * @param sinceSha     `null` for full scan; otherwise mine only
+ *                     `<sinceSha>..HEAD`. Unreachable shas trigger
+ *                     `needsFullRescan: true`.
+ */
+export function mineChurn(
+  rootDir: string,
+  indexedFiles: Set<string>,
+  sinceSha: string | null
+): ChurnMineResult {
+  const empty: ChurnMineResult = {
+    deltas: new Map(),
+    currentHead: null,
+    needsFullRescan: false,
+  };
+
+  const head = getGitHead(rootDir);
+  if (!head) return empty;
+
+  if (sinceSha && !isShaReachable(rootDir, sinceSha)) {
+    return { deltas: new Map(), currentHead: head, needsFullRescan: true };
+  }
+
+  // No-op: nothing has happened since last mine.
+  if (sinceSha === head) {
+    return { deltas: new Map(), currentHead: head, needsFullRescan: false };
+  }
+
+  // tformat puts a literal trailing record-separator after each
+  // commit's name list; -z then NUL-delimits within the format too,
+  // so we get a clean stream of NUL-separated tokens.
+  const args = [
+    'log',
+    '--no-merges',
+    '--name-only',
+    `--pretty=tformat:${COMMIT_HEADER_PREFIX}%H|%ct`,
+    '-z',
+  ];
+  if (sinceSha) args.push(`${sinceSha}..HEAD`);
+
+  let raw: string;
+  try {
+    raw = execFileSync('git', args, {
+      cwd: rootDir,
+      encoding: 'utf-8',
+      timeout: GIT_TIMEOUT_MS,
+      maxBuffer: MAX_GIT_BUFFER,
+      stdio: ['pipe', 'pipe', 'pipe'],
+    });
+  } catch (err) {
+    logDebug(`mineChurn: git log failed: ${err instanceof Error ? err.message : String(err)}`);
+    return { deltas: new Map(), currentHead: head, needsFullRescan: false };
+  }
+
+  // Parse: tformat emits `CGCMT-<sha>|<ts>\0\n<p1>\0<p2>\0...
+  // CGCMT-<sha>|<ts>\0\n<p1>\0`.
Each token between NULs is either + // a commit header or a path; paths arrive with a leading '\n' on the + // first one of each commit (the tformat record-separator). We walk + // tokens linearly, switching commit context on each header. + const tokens = raw.split('\0'); + const headerRe = /^CGCMT-([0-9a-f]{40})\|(\d+)$/; + const deltas = new Map(); + + let curTs = 0; + let curPaths: string[] = []; + let curActive = false; + + function flush() { + if (!curActive) return; + if (curPaths.length > 0 && curPaths.length <= MAX_FILES_PER_COMMIT) { + for (const p of curPaths) { + if (!indexedFiles.has(p)) continue; + const cur = deltas.get(p); + if (cur) { + cur.commitCountDelta += 1; + if (curTs > cur.lastTouchedTs) cur.lastTouchedTs = curTs; + if (curTs < cur.firstSeenTs) cur.firstSeenTs = curTs; + } else { + deltas.set(p, { + path: p, + commitCountDelta: 1, + lastTouchedTs: curTs, + firstSeenTs: curTs, + }); + } + } + } + curPaths = []; + curActive = false; + } + + for (const rawTok of tokens) { + if (rawTok === '') continue; + // Strip a single leading \n introduced by tformat's record separator. + const tok = rawTok.startsWith('\n') ? rawTok.slice(1) : rawTok; + if (tok === '') continue; + const m = headerRe.exec(tok); + if (m) { + flush(); + curTs = parseInt(m[2]!, 10); + curActive = true; + } else if (curActive) { + curPaths.push(tok); + } + // Tokens before the first header (shouldn't happen) are ignored. + } + flush(); + + return { deltas, currentHead: head, needsFullRescan: false }; +} diff --git a/src/config.ts b/src/config.ts index 9ab1032a..8a92228d 100644 --- a/src/config.ts +++ b/src/config.ts @@ -128,6 +128,8 @@ function mergeConfig( extractDocstrings: overrides.extractDocstrings ?? defaults.extractDocstrings, trackCallSites: overrides.trackCallSites ?? defaults.trackCallSites, customPatterns: overrides.customPatterns ?? defaults.customPatterns, + enableCentrality: overrides.enableCentrality ?? 
defaults.enableCentrality, + enableChurn: overrides.enableChurn ?? defaults.enableChurn, }; } diff --git a/src/db/migrations/004-centrality-churn.ts b/src/db/migrations/004-centrality-churn.ts new file mode 100644 index 00000000..bceaed7d --- /dev/null +++ b/src/db/migrations/004-centrality-churn.ts @@ -0,0 +1,33 @@ +import type { MigrationModule } from './types'; + +export const MIGRATION: MigrationModule = { + description: 'Add centrality on nodes; per-file churn metrics on files', + up: (db) => { + // ALTER TABLE ADD COLUMN is not idempotent on SQLite — guard with + // PRAGMA table_info so re-running after a partial DDL failure (or + // landing alongside another migration that touches the same files + // columns) does not throw "duplicate column name". + const nodeCols = db.prepare(`PRAGMA table_info(nodes);`).all() as Array<{ name: string }>; + if (!nodeCols.some((c) => c.name === 'centrality')) { + db.exec(`ALTER TABLE nodes ADD COLUMN centrality REAL DEFAULT NULL;`); + } + const fileCols = db.prepare(`PRAGMA table_info(files);`).all() as Array<{ name: string }>; + if (!fileCols.some((c) => c.name === 'commit_count')) { + db.exec(`ALTER TABLE files ADD COLUMN commit_count INTEGER NOT NULL DEFAULT 0;`); + } + if (!fileCols.some((c) => c.name === 'loc')) { + db.exec(`ALTER TABLE files ADD COLUMN loc INTEGER NOT NULL DEFAULT 0;`); + } + if (!fileCols.some((c) => c.name === 'first_seen_ts')) { + db.exec(`ALTER TABLE files ADD COLUMN first_seen_ts INTEGER DEFAULT NULL;`); + } + if (!fileCols.some((c) => c.name === 'last_touched_ts')) { + db.exec(`ALTER TABLE files ADD COLUMN last_touched_ts INTEGER DEFAULT NULL;`); + } + db.exec(` + CREATE INDEX IF NOT EXISTS idx_nodes_centrality ON nodes(centrality DESC); + CREATE INDEX IF NOT EXISTS idx_files_commit_count ON files(commit_count DESC); + CREATE INDEX IF NOT EXISTS idx_files_last_touched ON files(last_touched_ts DESC); + `); + }, +}; diff --git a/src/db/migrations/index.ts b/src/db/migrations/index.ts index 
f9bbcf10..37252ffa 100644 --- a/src/db/migrations/index.ts +++ b/src/db/migrations/index.ts @@ -26,6 +26,7 @@ import type { Migration, MigrationModule } from './types'; import { MIGRATION as MIG_002 } from './002-project-metadata'; import { MIGRATION as MIG_003 } from './003-lower-name-index'; +import { MIGRATION as MIG_004 } from './004-centrality-churn'; interface ModuleRef { /** @@ -48,6 +49,7 @@ interface ModuleRef { const REGISTERED_MODULES: readonly ModuleRef[] = [ { filename: '002-project-metadata.ts', module: MIG_002 }, { filename: '003-lower-name-index.ts', module: MIG_003 }, + { filename: '004-centrality-churn.ts', module: MIG_004 }, ]; /** Strict 3-digit prefix on each migration filename. */ diff --git a/src/db/queries.ts b/src/db/queries.ts index 51f1a1ad..dec533a7 100644 --- a/src/db/queries.ts +++ b/src/db/queries.ts @@ -44,6 +44,7 @@ interface NodeRow { decorators: string | null; type_parameters: string | null; updated_at: number; + centrality: number | null; } interface EdgeRow { @@ -66,6 +67,10 @@ interface FileRow { indexed_at: number; node_count: number; errors: string | null; + commit_count: number | null; + loc: number | null; + first_seen_ts: number | null; + last_touched_ts: number | null; } interface UnresolvedRefRow { @@ -105,6 +110,7 @@ function rowToNode(row: NodeRow): Node { decorators: row.decorators ? safeJsonParse(row.decorators, undefined) : undefined, typeParameters: row.type_parameters ? safeJsonParse(row.type_parameters, undefined) : undefined, updatedAt: row.updated_at, + centrality: row.centrality ?? undefined, }; } @@ -136,6 +142,10 @@ function rowToFileRecord(row: FileRow): FileRecord { indexedAt: row.indexed_at, nodeCount: row.node_count, errors: row.errors ? safeJsonParse(row.errors, undefined) : undefined, + commitCount: row.commit_count ?? 0, + loc: row.loc ?? 0, + firstSeenTs: row.first_seen_ts ?? null, + lastTouchedTs: row.last_touched_ts ?? 
null, }; } @@ -916,7 +926,12 @@ export class QueryBuilder { // =========================================================================== /** - * Insert or update a file record + * Insert or update a file record. + * + * Churn columns (commit_count, loc, first_seen_ts, last_touched_ts) + * are deliberately omitted from the ON CONFLICT update list — they + * are managed exclusively by `applyChurnDeltas` / `applyLocUpdates`. + * Adding them here would clobber mined git history on every re-index. */ upsertFile(file: FileRecord): void { if (!this.stmts.upsertFile) { @@ -1295,4 +1310,208 @@ export class QueryBuilder { this.db.exec('DELETE FROM files'); })(); } + + // =========================================================================== + // Centrality (PageRank scores on nodes) + // =========================================================================== + + /** + * Apply PageRank scores to the nodes table in a single transaction. + * Existing scores for ids not in the map are NOT cleared — call + * `clearCentrality()` first for a from-scratch recompute. + */ + applyCentralityScores(scores: Map): void { + if (scores.size === 0) return; + const stmt = this.db.prepare('UPDATE nodes SET centrality = ? WHERE id = ?'); + this.db.transaction(() => { + for (const [id, score] of scores) { + stmt.run(score, id); + } + })(); + // Cached node objects now have stale centrality. Drop the cache; + // subsequent reads pull the fresh value. + this.nodeCache.clear(); + } + + /** Reset all centrality values to NULL (fresh-recompute path). */ + clearCentrality(): void { + this.db.exec('UPDATE nodes SET centrality = NULL'); + this.nodeCache.clear(); + } + + /** + * Get top-N nodes by centrality, descending. Filters out NULL + * centrality (= not yet computed). Optional `kind` filter narrows + * to one node kind; optional `minCentrality` filters out the long + * tail of essentially-zero ranks. 
+ */ + getTopNodesByCentrality(opts: { + limit?: number; + kind?: NodeKind; + minCentrality?: number; + } = {}): Node[] { + const limit = opts.limit ?? 25; + const minCentrality = opts.minCentrality ?? 0; + const where: string[] = ['centrality IS NOT NULL', 'centrality >= ?']; + const params: (string | number)[] = [minCentrality]; + if (opts.kind) { + where.push('kind = ?'); + params.push(opts.kind); + } + const sql = `SELECT * FROM nodes WHERE ${where.join(' AND ')} + ORDER BY centrality DESC LIMIT ?`; + params.push(limit); + const rows = this.db.prepare(sql).all(...params) as NodeRow[]; + return rows.map(rowToNode); + } + + /** + * Compute the rank (1-based) of a single node by centrality. + * Returns null if the node has no centrality yet. + */ + getCentralityRank(nodeId: string): { rank: number; total: number } | null { + const row = this.db + .prepare('SELECT centrality FROM nodes WHERE id = ?') + .get(nodeId) as { centrality: number | null } | undefined; + if (!row || row.centrality === null) return null; + const above = this.db + .prepare('SELECT COUNT(*) AS c FROM nodes WHERE centrality > ?') + .get(row.centrality) as { c: number }; + const total = this.db + .prepare('SELECT COUNT(*) AS c FROM nodes WHERE centrality IS NOT NULL') + .get() as { c: number }; + return { rank: above.c + 1, total: total.c }; + } + + // =========================================================================== + // Per-file churn (mined from git log) + // =========================================================================== + + /** + * Apply churn deltas to the files table. For each delta: + * commit_count += commitCountDelta + * last_touched_ts = MAX(existing, lastTouchedTs) + * first_seen_ts = COALESCE(existing, firstSeenTs) // sticky + * + * Files in the delta map but not in the files table (uncommon — + * they'd have to be mined-but-never-indexed) are silently skipped. 
+ */ + applyChurnDeltas( + deltas: Iterable<{ + path: string; + commitCountDelta: number; + lastTouchedTs: number; + firstSeenTs: number; + }> + ): void { + const stmt = this.db.prepare( + `UPDATE files + SET commit_count = commit_count + ?, + last_touched_ts = MAX(COALESCE(last_touched_ts, 0), ?), + first_seen_ts = COALESCE(first_seen_ts, ?) + WHERE path = ?` + ); + this.db.transaction(() => { + for (const d of deltas) { + stmt.run(d.commitCountDelta, d.lastTouchedTs, d.firstSeenTs, d.path); + } + })(); + } + + /** Reset all churn columns; used before a full re-mine. Does not touch `loc`. */ + clearChurn(): void { + this.db.exec( + `UPDATE files SET commit_count = 0, last_touched_ts = NULL, first_seen_ts = NULL` + ); + } + + /** Update the on-disk LOC for a single file. Cheap; called per changed file. */ + updateFileLoc(filePath: string, loc: number): void { + this.db.prepare('UPDATE files SET loc = ? WHERE path = ?').run(loc, filePath); + } + + /** Bulk LOC update — used during indexAll to refresh LOC for every indexed file. */ + applyLocUpdates(entries: Iterable<{ path: string; loc: number }>): void { + const stmt = this.db.prepare('UPDATE files SET loc = ? WHERE path = ?'); + this.db.transaction(() => { + for (const e of entries) stmt.run(e.loc, e.path); + })(); + } + + getTopFilesByChurn(opts: { limit?: number; minCommits?: number } = {}): FileRecord[] { + const limit = opts.limit ?? 25; + const minCommits = opts.minCommits ?? 1; + const rows = this.db + .prepare( + `SELECT * FROM files WHERE commit_count >= ? + ORDER BY commit_count DESC LIMIT ?` + ) + .all(minCommits, limit) as FileRow[]; + return rows.map(rowToFileRecord); + } + + /** + * Hotspots: files ranked by `risk = (Σ centrality of nodes in file) × commit_count`. + * + * Both inputs are optional in their own right; with neither computed, + * this returns []. 
Sorting modes: + * - 'risk' : the combined score (default; what "hotspot" means) + * - 'centrality' : pure structural importance + * - 'churn' : pure change frequency + */ + getHotspots(opts: { + limit?: number; + minCommits?: number; + minCentrality?: number; + sortBy?: 'risk' | 'centrality' | 'churn'; + } = {}): Array<{ + filePath: string; + fileCentrality: number; + commitCount: number; + loc: number; + lastTouchedTs: number | null; + riskScore: number; + }> { + const limit = opts.limit ?? 15; + const minCommits = opts.minCommits ?? 0; + const minCentrality = opts.minCentrality ?? 0; + const sortBy = opts.sortBy ?? 'risk'; + + const orderBy = + sortBy === 'centrality' + ? 'fileCentrality DESC' + : sortBy === 'churn' + ? 'commitCount DESC' + : 'riskScore DESC'; + + // Aggregate centrality at file level. LEFT JOIN so files without any + // indexed nodes (rare — schema-only files) still surface if they have churn. + const sql = ` + SELECT + f.path AS filePath, + COALESCE(n_agg.fc, 0.0) AS fileCentrality, + f.commit_count AS commitCount, + f.loc AS loc, + f.last_touched_ts AS lastTouchedTs, + COALESCE(n_agg.fc, 0.0) * f.commit_count AS riskScore + FROM files f + LEFT JOIN ( + SELECT file_path, SUM(centrality) AS fc + FROM nodes WHERE centrality IS NOT NULL + GROUP BY file_path + ) n_agg ON n_agg.file_path = f.path + WHERE f.commit_count >= ? AND COALESCE(n_agg.fc, 0.0) >= ? + ORDER BY ${orderBy} + LIMIT ? 
+ `; + const rows = this.db.prepare(sql).all(minCommits, minCentrality, limit) as Array<{ + filePath: string; + fileCentrality: number; + commitCount: number; + loc: number; + lastTouchedTs: number | null; + riskScore: number; + }>; + return rows; + } } diff --git a/src/db/schema.sql b/src/db/schema.sql index dd0a9f06..42c86061 100644 --- a/src/db/schema.sql +++ b/src/db/schema.sql @@ -37,7 +37,8 @@ CREATE TABLE IF NOT EXISTS nodes ( is_abstract INTEGER DEFAULT 0, decorators TEXT, -- JSON array type_parameters TEXT, -- JSON array - updated_at INTEGER NOT NULL + updated_at INTEGER NOT NULL, + centrality REAL DEFAULT NULL -- PageRank over calls+references; NULL until first compute ); -- Edges: Relationships between nodes @@ -63,7 +64,12 @@ CREATE TABLE IF NOT EXISTS files ( modified_at INTEGER NOT NULL, indexed_at INTEGER NOT NULL, node_count INTEGER DEFAULT 0, - errors TEXT -- JSON array + errors TEXT, -- JSON array + -- Churn signals (mined from git log) + commit_count INTEGER NOT NULL DEFAULT 0, + loc INTEGER NOT NULL DEFAULT 0, + first_seen_ts INTEGER DEFAULT NULL, -- unix seconds + last_touched_ts INTEGER DEFAULT NULL -- unix seconds ); -- Unresolved References: References that need resolution after full indexing @@ -92,6 +98,7 @@ CREATE INDEX IF NOT EXISTS idx_nodes_file_path ON nodes(file_path); CREATE INDEX IF NOT EXISTS idx_nodes_language ON nodes(language); CREATE INDEX IF NOT EXISTS idx_nodes_file_line ON nodes(file_path, start_line); CREATE INDEX IF NOT EXISTS idx_nodes_lower_name ON nodes(lower(name)); +CREATE INDEX IF NOT EXISTS idx_nodes_centrality ON nodes(centrality DESC); -- Full-text search index on node names, docstrings, and signatures CREATE VIRTUAL TABLE IF NOT EXISTS nodes_fts USING fts5( @@ -132,6 +139,8 @@ CREATE INDEX IF NOT EXISTS idx_edges_target_kind ON edges(target, kind); -- File indexes CREATE INDEX IF NOT EXISTS idx_files_language ON files(language); CREATE INDEX IF NOT EXISTS idx_files_modified_at ON files(modified_at); +CREATE 
INDEX IF NOT EXISTS idx_files_commit_count ON files(commit_count DESC); +CREATE INDEX IF NOT EXISTS idx_files_last_touched ON files(last_touched_ts DESC); -- Unresolved refs indexes CREATE INDEX IF NOT EXISTS idx_unresolved_from_node ON unresolved_refs(from_node_id); diff --git a/src/default-config.ts b/src/default-config.ts index 5c59179c..d862e617 100644 --- a/src/default-config.ts +++ b/src/default-config.ts @@ -183,6 +183,8 @@ const baseConfig: CodeGraphConfig = { maxFileSize: 1024 * 1024, // 1MB extractDocstrings: true, trackCallSites: true, + enableCentrality: true, + enableChurn: true, }; Object.defineProperty(baseConfig, 'include', { diff --git a/src/index-hooks/centrality.ts b/src/index-hooks/centrality.ts new file mode 100644 index 00000000..8fa69203 --- /dev/null +++ b/src/index-hooks/centrality.ts @@ -0,0 +1,37 @@ +/** + * Centrality index hook — runs PageRank over the calls+references + * subgraph after every indexAll/sync and persists scores to + * `nodes.centrality`. Cheap; no I/O. See `src/centrality/` for the + * pure-compute module. + */ + +import type { IndexHook, IndexHookContext } from './registry'; +import { computePageRank, PR_EDGE_KINDS } from '../centrality'; +import { logDebug } from '../errors'; + +function recompute(ctx: IndexHookContext): void { + if (ctx.config.enableCentrality === false) return; + try { + const nodes = ctx.queries.getAllNodes(); + if (nodes.length === 0) return; + const edgeRows = ctx.db + .getDb() + .prepare( + `SELECT source, target FROM edges WHERE kind IN (${PR_EDGE_KINDS + .map(() => '?') + .join(',')})` + ) + .all(...PR_EDGE_KINDS) as Array<{ source: string; target: string }>; + const result = computePageRank(nodes, edgeRows); + ctx.queries.clearCentrality(); + ctx.queries.applyCentralityScores(result.scores); + } catch (err) { + logDebug(`centrality hook failed: ${err instanceof Error ? 
err.message : String(err)}`); + } +} + +export const HOOK: IndexHook = { + name: 'centrality', + afterIndexAll(ctx) { recompute(ctx); }, + afterSync(ctx) { recompute(ctx); }, +}; diff --git a/src/index-hooks/churn.ts b/src/index-hooks/churn.ts new file mode 100644 index 00000000..d2526c46 --- /dev/null +++ b/src/index-hooks/churn.ts @@ -0,0 +1,53 @@ +/** + * Churn index hook — mines git history for per-file commit counts, + * first/last touched timestamps, and refreshes on-disk LOC. + * Incremental on sync via `last_mined_churn_head` in + * project_metadata; full re-mine on indexAll. See `src/churn/` + * for the miner. + */ + +import type { IndexHook, IndexHookContext } from './registry'; +import type { SyncResult } from '../extraction'; +import { mineChurn, readFileLoc, LAST_MINED_CHURN_HEAD_KEY } from '../churn'; +import { logDebug } from '../errors'; + +function refresh(ctx: IndexHookContext, options: { fullRescan: boolean; changedFiles: string[] | null }): void { + if (ctx.config.enableChurn === false) return; + try { + const indexedFiles = new Set(ctx.queries.getAllFilePaths()); + if (indexedFiles.size === 0) return; + const sinceSha = options.fullRescan + ? null + : ctx.queries.getMetadata(LAST_MINED_CHURN_HEAD_KEY); + const mined = mineChurn(ctx.projectRoot, indexedFiles, sinceSha); + if (mined.currentHead === null) return; // not in a git repo + if (mined.needsFullRescan) { + ctx.queries.clearChurn(); + const remined = mineChurn(ctx.projectRoot, indexedFiles, null); + ctx.queries.applyChurnDeltas(remined.deltas.values()); + ctx.queries.setMetadata(LAST_MINED_CHURN_HEAD_KEY, remined.currentHead ?? ''); + } else { + if (options.fullRescan) ctx.queries.clearChurn(); + ctx.queries.applyChurnDeltas(mined.deltas.values()); + ctx.queries.setMetadata(LAST_MINED_CHURN_HEAD_KEY, mined.currentHead); + } + const targets = options.fullRescan + ? [...indexedFiles] + : (options.changedFiles ?? 
[]).filter((p) => indexedFiles.has(p)); + if (targets.length > 0) { + ctx.queries.applyLocUpdates( + targets.map((p) => ({ path: p, loc: readFileLoc(ctx.projectRoot, p) })) + ); + } + } catch (err) { + logDebug(`churn hook failed: ${err instanceof Error ? err.message : String(err)}`); + } +} + +export const HOOK: IndexHook = { + name: 'churn', + afterIndexAll(ctx) { refresh(ctx, { fullRescan: true, changedFiles: null }); }, + afterSync(ctx, result: SyncResult) { + refresh(ctx, { fullRescan: false, changedFiles: result.changedFilePaths ?? null }); + }, +}; diff --git a/src/index-hooks/registry.ts b/src/index-hooks/registry.ts index d68503ee..ef799bf0 100644 --- a/src/index-hooks/registry.ts +++ b/src/index-hooks/registry.ts @@ -22,6 +22,9 @@ import type { IndexHook, IndexHookContext, IndexHookOutcome } from './types'; import type { SyncResult } from '../extraction'; import { logDebug } from '../errors'; +import { HOOK as CENTRALITY_HOOK } from './centrality'; +import { HOOK as CHURN_HOOK } from './churn'; + /** * Static-import list of every registered hook. * @@ -31,8 +34,8 @@ import { logDebug } from '../errors'; * config flag inside its `afterIndexAll`/`afterSync`. */ const REGISTERED_HOOKS: readonly IndexHook[] = [ - // PRs adding hooks: append your `import { HOOK as _HOOK } from './';` - // above and your `_HOOK` entry here, alphabetical by name. + CENTRALITY_HOOK, + CHURN_HOOK, ]; /** diff --git a/src/index.ts b/src/index.ts index 1cf55624..4f6a35c0 100644 --- a/src/index.ts +++ b/src/index.ts @@ -526,6 +526,48 @@ export class CodeGraph { return this.indexMutex.isLocked(); } + // =========================================================================== + // Derived Signals (centrality, churn, hotspots) + // =========================================================================== + + getCentrality(nodeId: string): number | null { + const node = this.queries.getNodeById(nodeId); + return node?.centrality ?? 
null; + } + + getTopCentralNodes(opts: { limit?: number; kind?: import('./types').NodeKind } = {}): Node[] { + return this.queries.getTopNodesByCentrality(opts); + } + + getCentralityRank(nodeId: string): { rank: number; total: number } | null { + return this.queries.getCentralityRank(nodeId); + } + + getFileChurn(filePath: string): { + commitCount: number; + loc: number; + firstSeenTs: number | null; + lastTouchedTs: number | null; + } | null { + const f = this.queries.getFileByPath(filePath); + if (!f) return null; + return { + commitCount: f.commitCount ?? 0, + loc: f.loc ?? 0, + firstSeenTs: f.firstSeenTs ?? null, + lastTouchedTs: f.lastTouchedTs ?? null, + }; + } + + getHotspots(opts: { + limit?: number; + minCommits?: number; + minCentrality?: number; + sortBy?: 'risk' | 'centrality' | 'churn'; + } = {}): ReturnType { + return this.queries.getHotspots(opts); + } + // =========================================================================== // File Watching // =========================================================================== diff --git a/src/mcp/tools.ts b/src/mcp/tools.ts index 7a5b995a..52b8e99e 100644 --- a/src/mcp/tools.ts +++ b/src/mcp/tools.ts @@ -813,6 +813,57 @@ export class ToolHandler implements ToolHandlerLike { return this.textResult(this.truncateOutput(output)); } + /** + * Handle codegraph_hotspots — files ranked by risk = centrality × churn. + */ + async handleHotspots(args: Record): Promise { + const cg = this.getCodeGraph(args.projectPath as string | undefined); + const limit = args.limit != null ? clamp(args.limit as number, 1, 100) : 15; + const minCommits = args.minCommits != null ? Math.max(0, args.minCommits as number) : 3; + const minCentrality = args.minCentrality != null ? Math.max(0, args.minCentrality as number) : 0; + const sortBy = (args.sortBy as 'risk' | 'centrality' | 'churn' | undefined) ?? 
'risk'; + + const rows = cg.getHotspots({ limit, minCommits, minCentrality, sortBy }); + if (rows.length === 0) { + const lines = [ + 'No hotspots to report.', + '', + 'This typically means one of:', + '- Index has not been built yet (`codegraph index`)', + '- Project is not a git repo (churn data unavailable)', + '- `enableCentrality` / `enableChurn` are disabled in config', + '- `minCommits` is set higher than any file in the project', + ]; + return this.textResult(lines.join('\n')); + } + + const now = Math.floor(Date.now() / 1000); + const fmtAge = (ts: number | null) => { + if (!ts) return '—'; + const days = Math.floor((now - ts) / 86400); + if (days <= 0) return 'today'; + if (days === 1) return '1d ago'; + if (days < 30) return `${days}d ago`; + const months = Math.floor(days / 30); + return months === 1 ? '1mo ago' : `${months}mo ago`; + }; + + const lines: string[] = [ + `## Hotspots (sortBy=${sortBy}, top ${rows.length})`, + '', + 'High-risk files = high structural centrality × high git churn. Review these first.', + '', + '| # | File | PR | Commits | LOC | Last touched | Risk |', + '|---|------|----:|--------:|----:|--------------|-----:|', + ]; + rows.forEach((r, i) => { + lines.push( + `| ${i + 1} | \`${r.filePath}\` | ${r.fileCentrality.toFixed(4)} | ${r.commitCount} | ${r.loc} | ${fmtAge(r.lastTouchedTs)} | ${r.riskScore.toFixed(4)} |` + ); + }); + return this.textResult(this.truncateOutput(lines.join('\n'))); + } + /** * Convert glob pattern to regex */ diff --git a/src/mcp/tools/hotspots.ts b/src/mcp/tools/hotspots.ts new file mode 100644 index 00000000..a30c62cc --- /dev/null +++ b/src/mcp/tools/hotspots.ts @@ -0,0 +1,37 @@ +import { projectPathProperty } from '../tool-types'; +import type { ToolModule } from './types'; + +export const HOTSPOTS_TOOL: ToolModule = { + definition: { + name: 'codegraph_hotspots', + description: + "Identify high-risk files: high PageRank centrality (many things depend on them) AND high churn (frequently changed). 
Use when triaging an unfamiliar codebase, hunting for refactor targets, or asking 'where do bugs hide?'. Returns ranked file list with both signals plus a combined risk score (centrality × churn). Sort options: 'risk' (default), 'centrality', 'churn'.", + inputSchema: { + type: 'object', + properties: { + limit: { + type: 'number', + description: 'Maximum number of files to return (default: 15)', + }, + minCommits: { + type: 'number', + description: + 'Filter out files touched in fewer than N commits (default: 3 — excludes test fixtures and one-off files)', + }, + minCentrality: { + type: 'number', + description: + 'Filter out files whose total node centrality (Σ PageRank of nodes in file) is below this threshold (default: 0 — no filter). Useful to drop docs/config files from the list.', + }, + sortBy: { + type: 'string', + enum: ['risk', 'centrality', 'churn'], + description: + 'Sort dimension: risk = centrality × churn (default), centrality = pure structural importance, churn = pure change frequency', + }, + projectPath: projectPathProperty, + }, + }, + }, + handlerKey: 'handleHotspots', +}; diff --git a/src/mcp/tools/registry.ts b/src/mcp/tools/registry.ts index 3219f88d..e729e44f 100644 --- a/src/mcp/tools/registry.ts +++ b/src/mcp/tools/registry.ts @@ -23,6 +23,7 @@ import { CALLERS_TOOL } from './callers'; import { CONTEXT_TOOL } from './context'; import { EXPLORE_TOOL } from './explore'; import { FILES_TOOL } from './files'; +import { HOTSPOTS_TOOL } from './hotspots'; import { IMPACT_TOOL } from './impact'; import { NODE_TOOL } from './node'; import { SEARCH_TOOL } from './search'; @@ -34,6 +35,7 @@ const ALL_TOOLS: readonly ToolModule[] = [ CONTEXT_TOOL, EXPLORE_TOOL, FILES_TOOL, + HOTSPOTS_TOOL, IMPACT_TOOL, NODE_TOOL, SEARCH_TOOL, diff --git a/src/mcp/tools/types.ts b/src/mcp/tools/types.ts index 6741d965..372a1e1b 100644 --- a/src/mcp/tools/types.ts +++ b/src/mcp/tools/types.ts @@ -31,7 +31,8 @@ export type HandlerKey = | 'handleExplore' | 'handleNode' 
| 'handleStatus' - | 'handleFiles'; + | 'handleFiles' + | 'handleHotspots'; /** * The minimum surface a `ToolHandler`-shaped object exposes for diff --git a/src/types.ts b/src/types.ts index e9b3cbcc..64fbcaa9 100644 --- a/src/types.ts +++ b/src/types.ts @@ -144,6 +144,13 @@ export interface Node { /** When the node was last updated */ updatedAt: number; + + /** + * PageRank centrality score over calls+references edges, in (0, 1). + * NULL/undefined when not yet computed (fresh DB before first + * indexAll, or `enableCentrality: false`). + */ + centrality?: number | null; } /** @@ -199,6 +206,21 @@ export interface FileRecord { /** Any extraction errors */ errors?: ExtractionError[]; + + /** + * Number of git commits touching this path. 0 when uncommitted or + * mining disabled. Lower bound on shallow clones. + */ + commitCount?: number; + + /** Current line count of the file on disk (newline-delimited). */ + loc?: number; + + /** Unix seconds, first commit timestamp touching this path. */ + firstSeenTs?: number | null; + + /** Unix seconds, most recent commit timestamp touching this path. */ + lastTouchedTs?: number | null; } // ============================================================================= @@ -474,6 +496,21 @@ export interface CodeGraphConfig { /** Node kind to assign */ kind: NodeKind; }[]; + + /** + * Compute PageRank centrality over calls+references after each + * indexAll/sync. Cheap (sub-second on realistic projects); enabled + * by default. + */ + enableCentrality?: boolean; + + /** + * Mine git log for per-file churn metrics (commit count, LOC, + * first-seen / last-touched timestamps). Set to false on shallow + * clones or non-git checkouts where the data would be misleading. + * Enabled by default. 
+ */ + enableChurn?: boolean; } // `DEFAULT_CONFIG` lives in `./default-config.ts` so its `include` From e85ebd0c5dd72c736a2cff2e3cd9d1e0337e67d8 Mon Sep 17 00:00:00 2001 From: andreinknv Date: Mon, 27 Apr 2026 17:53:37 -0400 Subject: [PATCH 7/9] feat: PR #113 (issue-history) on top of refactors Mines Fixes/Closes/Resolves #N commits and attributes them to symbols touched by each commit hunks. Lands as a registered IndexHook (issue-history). - Migration 005: symbol_issues table - src/issue-history/ (pure module): mineIssueHistory + parse-diff - src/index-hooks/issue-history.ts (registered hook) - CodeGraph public method: getIssuesForNode - codegraph_node MCP tool now surfaces issue history line - enableIssueHistory flag default true wired through config merge - Removed defensive ensureSymbolIssuesTable guard and its test: the v4-collision bug class is impossible under file-based migrations (PR #118 refactor); filenames collide on the filesystem instead. Tests: 470/471 pass (1 watcher flake under load, isolation OK). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- __tests__/foundation.test.ts | 2 +- __tests__/issue-history.test.ts | 390 +++++++++++++++++++++++++ __tests__/pr19-improvements.test.ts | 2 +- src/config.ts | 1 + src/db/migrations/005-symbol-issues.ts | 19 ++ src/db/migrations/index.ts | 2 + src/db/queries.ts | 65 +++++ src/db/schema.sql | 16 + src/default-config.ts | 1 + src/index-hooks/issue-history.ts | 58 ++++ src/index-hooks/registry.ts | 2 + src/index.ts | 8 + src/issue-history/index.ts | 235 +++++++++++++++ src/issue-history/parse-diff.ts | 208 +++++++++++++ src/mcp/tools.ts | 34 ++- src/types.ts | 7 + 16 files changed, 1046 insertions(+), 4 deletions(-) create mode 100644 __tests__/issue-history.test.ts create mode 100644 src/db/migrations/005-symbol-issues.ts create mode 100644 src/index-hooks/issue-history.ts create mode 100644 src/issue-history/index.ts create mode 100644 src/issue-history/parse-diff.ts diff --git a/__tests__/foundation.test.ts b/__tests__/foundation.test.ts index 4e8f204a..20ada266 100644 --- a/__tests__/foundation.test.ts +++ b/__tests__/foundation.test.ts @@ -305,7 +305,7 @@ describe('Database Connection', () => { const version = db.getSchemaVersion(); expect(version).not.toBeNull(); - expect(version?.version).toBe(4); + expect(version?.version).toBe(5); db.close(); }); diff --git a/__tests__/issue-history.test.ts b/__tests__/issue-history.test.ts new file mode 100644 index 00000000..7c281771 --- /dev/null +++ b/__tests__/issue-history.test.ts @@ -0,0 +1,390 @@ +/** + * Issue → symbol attribution: parser unit tests + end-to-end mining + * against synthetic git repos. 
+ */ +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import * as fs from 'fs'; +import * as os from 'os'; +import * as path from 'path'; +import { execFileSync } from 'child_process'; +import { + extractSymbolFromContext, + extractDeclaration, +} from '../src/issue-history/parse-diff'; +import { + mineIssueCommits, + mineIssueHistory, + ISSUE_REGEX, + LAST_MINED_ISSUES_HEAD_KEY, +} from '../src/issue-history'; +import CodeGraph from '../src/index'; + +let HAS_GIT = true; +try { + execFileSync('git', ['--version'], { stdio: 'ignore' }); +} catch { + HAS_GIT = false; +} + +let testDir: string; +let cg: CodeGraph | null = null; + +function git(...args: string[]): string { + return execFileSync('git', args, { + cwd: testDir, + encoding: 'utf-8', + env: { + ...process.env, + GIT_AUTHOR_NAME: 'Test', + GIT_AUTHOR_EMAIL: 'test@example.com', + GIT_COMMITTER_NAME: 'Test', + GIT_COMMITTER_EMAIL: 'test@example.com', + GIT_AUTHOR_DATE: process.env.GIT_AUTHOR_DATE, + GIT_COMMITTER_DATE: process.env.GIT_COMMITTER_DATE, + }, + stdio: ['pipe', 'pipe', 'pipe'], + }).trim(); +} + +function commitAt(date: string, files: Record, message: string) { + for (const [rel, content] of Object.entries(files)) { + const abs = path.join(testDir, rel); + fs.mkdirSync(path.dirname(abs), { recursive: true }); + fs.writeFileSync(abs, content); + } + git('add', '-A'); + process.env.GIT_AUTHOR_DATE = date; + process.env.GIT_COMMITTER_DATE = date; + git('commit', '-m', message); + delete process.env.GIT_AUTHOR_DATE; + delete process.env.GIT_COMMITTER_DATE; +} + +beforeEach(() => { + testDir = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-issues-')); +}); + +afterEach(() => { + delete process.env.GIT_AUTHOR_DATE; + delete process.env.GIT_COMMITTER_DATE; + if (cg) { + cg.destroy(); + cg = null; + } + if (fs.existsSync(testDir)) fs.rmSync(testDir, { recursive: true, force: true }); +}); + +// ============================================================================ +// Pure parser 
unit tests +// ============================================================================ + +describe('ISSUE_REGEX', () => { + it('matches all canonical Fixes/Closes/Resolves verbs', () => { + const cases = [ + 'Fix #1', 'Fixes #2', 'Fixed #3', + 'Close #4', 'Closes #5', 'Closed #6', + 'Resolve #7', 'Resolves #8', 'Resolved #9', + ]; + for (const s of cases) { + ISSUE_REGEX.lastIndex = 0; + expect(ISSUE_REGEX.test(s)).toBe(true); + } + }); + + it('matches multiple issues in a single body', () => { + ISSUE_REGEX.lastIndex = 0; + const matches = [...'Fixes #1, closes #2 and resolves #3'.matchAll(ISSUE_REGEX)]; + expect(matches.map((m) => m[1])).toEqual(['1', '2', '3']); + }); + + it('is case-insensitive', () => { + ISSUE_REGEX.lastIndex = 0; + expect(ISSUE_REGEX.test('FIXES #42')).toBe(true); + }); + + it('does NOT match `#N` without a verb', () => { + ISSUE_REGEX.lastIndex = 0; + // Match in body of message that mentions #99 but with no verb prefix. + expect(ISSUE_REGEX.test('See #99 for context')).toBe(false); + }); + + it('v1 limitation: `Fixes #1, #2` only captures #1', () => { + // Documented behavior — the second issue lacks a verb prefix and + // is silently dropped. Authors who care can write `Fixes #1, fixes #2`. 
+ ISSUE_REGEX.lastIndex = 0; + const matches = [...'Fixes #1, #2'.matchAll(ISSUE_REGEX)]; + expect(matches.map((m) => m[1])).toEqual(['1']); + }); +}); + +describe('extractSymbolFromContext', () => { + it('pulls function name from a TS function context', () => { + expect(extractSymbolFromContext('function processOrder(order: Order) {')).toBe('processOrder'); + }); + it('pulls class name', () => { + expect(extractSymbolFromContext('class UserService {')).toBe('UserService'); + }); + it('pulls Python def', () => { + expect(extractSymbolFromContext('def compute_score(items):')).toBe('compute_score'); + }); + it('pulls Go func', () => { + expect(extractSymbolFromContext('func ProcessOrder(o *Order) error {')).toBe('ProcessOrder'); + }); + it('pulls method-style ` async foo(`', () => { + expect(extractSymbolFromContext(' async foo(args: string) {')).toBe('foo'); + }); + it('rejects keyword-only contexts', () => { + expect(extractSymbolFromContext(' if (x) {')).toBeNull(); + }); + it('returns null on empty input', () => { + expect(extractSymbolFromContext('')).toBeNull(); + }); +}); + +describe('extractDeclaration', () => { + it('captures + function decl', () => { + expect(extractDeclaration('+function helper() {')).toEqual({ name: 'helper', sign: '+' }); + }); + it('captures - class decl', () => { + expect(extractDeclaration('-export class Old {')).toEqual({ name: 'Old', sign: '-' }); + }); + it('captures Python def', () => { + expect(extractDeclaration('+def my_helper(x):')).toEqual({ name: 'my_helper', sign: '+' }); + }); + it('captures Go func with receiver', () => { + expect(extractDeclaration('+func (s *Service) DoThing() error {')).toEqual({ + name: 'DoThing', + sign: '+', + }); + }); + it('skips file-marker `+++` and `---` lines', () => { + expect(extractDeclaration('+++ b/src/foo.ts')).toBeNull(); + expect(extractDeclaration('--- a/src/foo.ts')).toBeNull(); + }); + it('skips keywords like `+if`', () => { + expect(extractDeclaration('+ if (x) 
return;')).toBeNull(); + }); + it('returns null on context lines (no +/-)', () => { + expect(extractDeclaration(' some body line')).toBeNull(); + }); +}); + +// ============================================================================ +// Git mining: synthetic repo +// ============================================================================ + +describe.skipIf(!HAS_GIT)('mineIssueCommits', () => { + beforeEach(() => { + git('init', '-q', '-b', 'main'); + git('config', 'commit.gpgsign', 'false'); + }); + + it('finds commits with `Fixes #N` in the subject', () => { + commitAt('2025-01-01T00:00:00Z', { 'a.ts': 'a' }, 'feat: add a (no issue)'); + commitAt('2025-01-02T00:00:00Z', { 'a.ts': 'a2' }, 'fix: bug. Fixes #42'); + const commits = mineIssueCommits(testDir, null); + expect(commits.length).toBe(1); + expect(commits[0]!.issues).toEqual([42]); + }); + + it('parses multi-issue subjects', () => { + commitAt('2025-01-01T00:00:00Z', { 'a.ts': 'a' }, 'fix: triple. Fixes #1, closes #2, resolves #3'); + const [c] = mineIssueCommits(testDir, null); + expect(c?.issues).toEqual([1, 2, 3]); + }); + + it('ignores commits with no issue ref', () => { + commitAt('2025-01-01T00:00:00Z', { 'a.ts': 'a' }, 'plain message'); + expect(mineIssueCommits(testDir, null).length).toBe(0); + }); + + it('returns [] when not in a git repo', () => { + const nonGit = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-nogit-')); + try { + expect(mineIssueCommits(nonGit, null)).toEqual([]); + } finally { + fs.rmSync(nonGit, { recursive: true, force: true }); + } + }); +}); + +// ============================================================================ +// End-to-end through CodeGraph +// ============================================================================ + +describe.skipIf(!HAS_GIT)('CodeGraph issue history', () => { + beforeEach(() => { + git('init', '-q', '-b', 'main'); + git('config', 'commit.gpgsign', 'false'); + }); + + it('attributes a Fixes #N commit to the modified function', async 
() => { + commitAt('2025-01-01T00:00:00Z', { + 'src/a.ts': `export function foo() { return 1; }\n`, + }, 'feat: add foo'); + + commitAt('2025-02-01T00:00:00Z', { + 'src/a.ts': `export function foo() {\n // changed\n return 2;\n}\n`, + }, 'fix: bug. Fixes #42'); + + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + + const node = cg.getNodesInFile('src/a.ts').find((n) => n.name === 'foo')!; + expect(node).toBeDefined(); + const issues = cg.getIssuesForNode(node.id); + expect(issues.length).toBeGreaterThan(0); + expect(issues.some((i) => i.issueNumber === 42)).toBe(true); +}); + + it('tracks the agent-usable multi-issue signal', async () => { + // Simulate the codegraph history pattern: `loadGrammarsForLanguages` + // touched by every language-add issue (#54, #82, #83, #85). + commitAt('2025-01-01T00:00:00Z', { + 'src/grammar.ts': `export function loadGrammarsForLanguages() { return []; }\n`, + }, 'feat: add grammar loader'); + + commitAt('2025-01-02T00:00:00Z', { + 'src/grammar.ts': `export function loadGrammarsForLanguages() {\n // R support\n return [];\n}\n`, + }, 'feat: add R support. Fixes #82'); + + commitAt('2025-01-03T00:00:00Z', { + 'src/grammar.ts': `export function loadGrammarsForLanguages() {\n // R + HCL support\n return [];\n}\n`, + }, 'feat: add HCL. Fixes #83'); + + commitAt('2025-01-04T00:00:00Z', { + 'src/grammar.ts': `export function loadGrammarsForLanguages() {\n // R + HCL + SQL\n return [];\n}\n`, + }, 'feat: add SQL. 
Fixes #85'); + + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + + const node = cg.getNodesByKind("function").find((n) => n.name === 'loadGrammarsForLanguages')!; + expect(node).toBeDefined(); + const issues = cg.getIssuesForNode(node.id); + const issueNumbers = [...new Set(issues.map((i) => i.issueNumber))].sort((a, b) => a - b); + expect(issueNumbers).toEqual([82, 83, 85]); + }); + + it('records `added` kind for symbols introduced in a Fixes commit', async () => { + commitAt('2025-01-01T00:00:00Z', { + 'src/a.ts': `export function existing() { return 1; }\n`, + }, 'init'); + + commitAt('2025-02-01T00:00:00Z', { + 'src/a.ts': `export function existing() { return 1; }\nexport function brandNew() { return 2; }\n`, + }, 'feat: add brandNew. Fixes #100'); + + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + + const node = cg.getNodesByKind("function").find((n) => n.name === 'brandNew')!; + const issues = cg.getIssuesForNode(node.id); + expect(issues.some((i) => i.issueNumber === 100 && i.kind === 'added')).toBe(true); + }); + + it('drops attributions for symbols that no longer exist', async () => { + // Symbol added then removed in two separate `Fixes` commits. The + // current index has no node for it, so attributions for the removed + // symbol must not appear (FK + drop-on-resolve). + commitAt('2025-01-01T00:00:00Z', { + 'src/a.ts': `export function staysHere() { return 1; }\nexport function temporary() { return 99; }\n`, + }, 'feat: add. Fixes #1'); + + commitAt('2025-02-01T00:00:00Z', { + 'src/a.ts': `export function staysHere() { return 1; }\n`, + }, 'fix: drop temporary. Fixes #2'); + + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + + // staysHere should have at least the #1 attribution (added). 
+ const node = cg.getNodesByKind("function").find((n) => n.name === 'staysHere')!; + const issues = cg.getIssuesForNode(node.id); + expect(issues.some((i) => i.issueNumber === 1)).toBe(true); + + // No node should exist named `temporary`, and no attribution to + // issue #2 should reference a node that doesn't exist. + expect(cg.getNodesByKind("function").find((n) => n.name === 'temporary')).toBeUndefined(); + }); + + it('survives indexAll outside a git repo (table empty, no errors)', async () => { + fs.rmSync(path.join(testDir, '.git'), { recursive: true, force: true }); + fs.writeFileSync(path.join(testDir, 'a.ts'), `export function x() { return 1; }\n`); + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + const nodes = cg.getNodesInFile('a.ts'); + expect(nodes.length).toBeGreaterThan(0); + for (const n of nodes) expect(cg.getIssuesForNode(n.id)).toEqual([]); + }); + + it('respects enableIssueHistory=false', async () => { + commitAt('2025-01-01T00:00:00Z', { + 'src/a.ts': `export function foo() { return 1; }\n`, + }, 'init'); + commitAt('2025-01-02T00:00:00Z', { + 'src/a.ts': `export function foo() { return 2; }\n`, + }, 'fix: foo. 
Fixes #1'); + + cg = CodeGraph.initSync(testDir, { + config: { include: ['**/*.ts'], exclude: [], enableIssueHistory: false }, + }); + await cg.indexAll(); + const node = cg.getNodesInFile('src/a.ts').find((n) => n.name === 'foo')!; + expect(cg.getIssuesForNode(node.id)).toEqual([]); + }); + + it('incrementally picks up new Fixes commits on sync', async () => { + commitAt('2025-01-01T00:00:00Z', { + 'src/a.ts': `export function foo() { return 1; }\n`, + }, 'init'); + + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + const node = cg.getNodesInFile('src/a.ts').find((n) => n.name === 'foo')!; + expect(cg.getIssuesForNode(node.id).length).toBe(0); + + commitAt('2025-02-01T00:00:00Z', { + 'src/a.ts': `export function foo() { return 2; }\n`, + }, 'fix: foo. Fixes #50'); + await cg.sync(); + + const issues = cg.getIssuesForNode(node.id); + expect(issues.some((i) => i.issueNumber === 50)).toBe(true); + }); + + // (Removed: a defensive test for the v4-migration-collision bug class. + // With file-based migrations (NNN-name.ts), two migrations claiming + // the same version produces a filesystem-level conflict — the silent + // skip the defensive guard protected against can no longer happen.) + + it('recovers from an unreachable last_mined_issues_head', async () => { + commitAt('2025-01-01T00:00:00Z', { + 'src/a.ts': `export function foo() { return 1; }\n`, + }, 'init'); + commitAt('2025-02-01T00:00:00Z', { + 'src/a.ts': `export function foo() { return 2; }\n`, + }, 'fix: foo. Fixes #1'); + + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + const node = cg.getNodesInFile('src/a.ts').find((n) => n.name === 'foo')!; + expect( + [...new Set(cg.getIssuesForNode(node.id).map((i) => i.issueNumber))] + ).toEqual([1]); + + // Simulate force-push / gc by storing an unreachable SHA. 
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any + (cg as any).queries.setMetadata(LAST_MINED_ISSUES_HEAD_KEY, '0'.repeat(40)); + + commitAt('2025-03-01T00:00:00Z', { + 'src/a.ts': `export function foo() { return 3; }\n`, + }, 'fix: foo again. Fixes #2'); + await cg.sync(); + + const issueNums = [ + ...new Set(cg.getIssuesForNode(node.id).map((i) => i.issueNumber)), + ].sort((a, b) => a - b); + expect(issueNums).toEqual([1, 2]); + }); +}); diff --git a/__tests__/pr19-improvements.test.ts b/__tests__/pr19-improvements.test.ts index d43dceb2..5974b549 100644 --- a/__tests__/pr19-improvements.test.ts +++ b/__tests__/pr19-improvements.test.ts @@ -299,7 +299,7 @@ describe('Best-Candidate Resolution', () => { describe('Schema v2 Migration', () => { it.skipIf(!HAS_SQLITE)('should have correct current schema version', async () => { const { CURRENT_SCHEMA_VERSION } = await import('../src/db/migrations'); - expect(CURRENT_SCHEMA_VERSION).toBe(4); + expect(CURRENT_SCHEMA_VERSION).toBe(5); }); it.skipIf(!HAS_SQLITE)('should have migration for version 2', async () => { diff --git a/src/config.ts b/src/config.ts index 8a92228d..44d075dc 100644 --- a/src/config.ts +++ b/src/config.ts @@ -130,6 +130,7 @@ function mergeConfig( customPatterns: overrides.customPatterns ?? defaults.customPatterns, enableCentrality: overrides.enableCentrality ?? defaults.enableCentrality, enableChurn: overrides.enableChurn ?? defaults.enableChurn, + enableIssueHistory: overrides.enableIssueHistory ?? 
defaults.enableIssueHistory, }; } diff --git a/src/db/migrations/005-symbol-issues.ts b/src/db/migrations/005-symbol-issues.ts new file mode 100644 index 00000000..7af13795 --- /dev/null +++ b/src/db/migrations/005-symbol-issues.ts @@ -0,0 +1,19 @@ +import type { MigrationModule } from './types'; + +export const MIGRATION: MigrationModule = { + description: 'Add symbol_issues table for issue→symbol attribution from git history', + up: (db) => { + db.exec(` + CREATE TABLE IF NOT EXISTS symbol_issues ( + node_id TEXT NOT NULL, + issue_number INTEGER NOT NULL, + commit_sha TEXT NOT NULL, + kind TEXT NOT NULL CHECK (kind IN ('modified','added','removed')), + PRIMARY KEY (node_id, issue_number, commit_sha, kind), + FOREIGN KEY (node_id) REFERENCES nodes(id) ON DELETE CASCADE + ); + CREATE INDEX IF NOT EXISTS idx_symbol_issues_node ON symbol_issues(node_id); + CREATE INDEX IF NOT EXISTS idx_symbol_issues_issue ON symbol_issues(issue_number); + `); + }, +}; diff --git a/src/db/migrations/index.ts b/src/db/migrations/index.ts index 37252ffa..cd3e3ba3 100644 --- a/src/db/migrations/index.ts +++ b/src/db/migrations/index.ts @@ -27,6 +27,7 @@ import type { Migration, MigrationModule } from './types'; import { MIGRATION as MIG_002 } from './002-project-metadata'; import { MIGRATION as MIG_003 } from './003-lower-name-index'; import { MIGRATION as MIG_004 } from './004-centrality-churn'; +import { MIGRATION as MIG_005 } from './005-symbol-issues'; interface ModuleRef { /** @@ -50,6 +51,7 @@ const REGISTERED_MODULES: readonly ModuleRef[] = [ { filename: '002-project-metadata.ts', module: MIG_002 }, { filename: '003-lower-name-index.ts', module: MIG_003 }, { filename: '004-centrality-churn.ts', module: MIG_004 }, + { filename: '005-symbol-issues.ts', module: MIG_005 }, ]; /** Strict 3-digit prefix on each migration filename. 
*/ diff --git a/src/db/queries.ts b/src/db/queries.ts index dec533a7..af87a7b9 100644 --- a/src/db/queries.ts +++ b/src/db/queries.ts @@ -1514,4 +1514,69 @@ export class QueryBuilder { }>; return rows; } + + // =========================================================================== + // Symbol-issue attributions (mined from git history) + // =========================================================================== + + applyIssueAttributions( + rows: Iterable<{ + nodeId: string; + issueNumber: number; + commitSha: string; + kind: 'modified' | 'added' | 'removed'; + }> + ): void { + const stmt = this.db.prepare( + `INSERT OR IGNORE INTO symbol_issues (node_id, issue_number, commit_sha, kind) + VALUES (?, ?, ?, ?)` + ); + this.db.transaction(() => { + for (const r of rows) { + stmt.run(r.nodeId, r.issueNumber, r.commitSha, r.kind); + } + })(); + } + + clearIssueAttributions(): void { + this.db.exec('DELETE FROM symbol_issues'); + } + + getIssuesForNode(nodeId: string): Array<{ + issueNumber: number; + kind: 'modified' | 'added' | 'removed'; + commitSha: string; + }> { + return this.db + .prepare( + `SELECT issue_number AS issueNumber, kind, commit_sha AS commitSha + FROM symbol_issues + WHERE node_id = ? + ORDER BY issue_number ASC, kind ASC` + ) + .all(nodeId) as Array<{ + issueNumber: number; + kind: 'modified' | 'added' | 'removed'; + commitSha: string; + }>; + } + + getNodesForIssue(issueNumber: number): Array<{ + nodeId: string; + kind: 'modified' | 'added' | 'removed'; + commitSha: string; + }> { + return this.db + .prepare( + `SELECT node_id AS nodeId, kind, commit_sha AS commitSha + FROM symbol_issues + WHERE issue_number = ? 
+ ORDER BY node_id ASC` + ) + .all(issueNumber) as Array<{ + nodeId: string; + kind: 'modified' | 'added' | 'removed'; + commitSha: string; + }>; + } } diff --git a/src/db/schema.sql b/src/db/schema.sql index 42c86061..4a1150dd 100644 --- a/src/db/schema.sql +++ b/src/db/schema.sql @@ -155,3 +155,19 @@ CREATE TABLE IF NOT EXISTS project_metadata ( value TEXT NOT NULL, updated_at INTEGER NOT NULL ); + +-- Issue → symbol attribution mined from git history. +-- One row per (node, issue, commit, kind) tuple; kind is 'modified' +-- (enclosing function changed by hunk), 'added' (declaration on a + +-- line), or 'removed' (declaration on a - line, dropped at lookup +-- time when no current node matches). +CREATE TABLE IF NOT EXISTS symbol_issues ( + node_id TEXT NOT NULL, + issue_number INTEGER NOT NULL, + commit_sha TEXT NOT NULL, + kind TEXT NOT NULL CHECK (kind IN ('modified','added','removed')), + PRIMARY KEY (node_id, issue_number, commit_sha, kind), + FOREIGN KEY (node_id) REFERENCES nodes(id) ON DELETE CASCADE +); +CREATE INDEX IF NOT EXISTS idx_symbol_issues_node ON symbol_issues(node_id); +CREATE INDEX IF NOT EXISTS idx_symbol_issues_issue ON symbol_issues(issue_number); diff --git a/src/default-config.ts b/src/default-config.ts index d862e617..a7ec0486 100644 --- a/src/default-config.ts +++ b/src/default-config.ts @@ -185,6 +185,7 @@ const baseConfig: CodeGraphConfig = { trackCallSites: true, enableCentrality: true, enableChurn: true, + enableIssueHistory: true, }; Object.defineProperty(baseConfig, 'include', { diff --git a/src/index-hooks/issue-history.ts b/src/index-hooks/issue-history.ts new file mode 100644 index 00000000..bc7aa95a --- /dev/null +++ b/src/index-hooks/issue-history.ts @@ -0,0 +1,58 @@ +/** + * Issue-history index hook — mines `Fixes/Closes/Resolves #N` + * commits and attributes them to symbols touched by each commit's + * hunks. Incremental on sync via `last_mined_issues_head` in + * project_metadata; full re-mine on indexAll. 
See
+ * `src/issue-history/` for the miner.
+ */
+
+import type { IndexHook, IndexHookContext } from './registry';
+import { mineIssueHistory, LAST_MINED_ISSUES_HEAD_KEY } from '../issue-history';
+import { logDebug } from '../errors';
+
+function refresh(ctx: IndexHookContext, options: { fullRescan: boolean }): void {
+  if (ctx.config.enableIssueHistory === false) return;
+  try {
+    // Resolver closure with a per-pass file-level cache. Without it,
+    // every (filePath, name) lookup would re-fetch all nodes for the
+    // file.
+    const fileNodesCache = new Map<string, Map<string, string>>();
+    const resolveSymbol = (filePath: string, name: string): string | null => {
+      let nameToId = fileNodesCache.get(filePath);
+      if (!nameToId) {
+        nameToId = new Map();
+        for (const n of ctx.queries.getNodesByFile(filePath)) {
+          if (!nameToId.has(n.name)) nameToId.set(n.name, n.id);
+        }
+        fileNodesCache.set(filePath, nameToId);
+      }
+      return nameToId.get(name) ?? null;
+    };
+
+    const sinceSha = options.fullRescan
+      ? null
+      : ctx.queries.getMetadata(LAST_MINED_ISSUES_HEAD_KEY);
+
+    const mined = mineIssueHistory(ctx.projectRoot, resolveSymbol, sinceSha);
+    if (mined.currentHead === null) return; // not in a git repo
+
+    if (mined.needsFullRescan) {
+      ctx.queries.clearIssueAttributions();
+      const remined = mineIssueHistory(ctx.projectRoot, resolveSymbol, null);
+      ctx.queries.applyIssueAttributions(remined.attributions);
+      ctx.queries.setMetadata(LAST_MINED_ISSUES_HEAD_KEY, remined.currentHead ?? '');
+    } else {
+      if (options.fullRescan) ctx.queries.clearIssueAttributions();
+      ctx.queries.applyIssueAttributions(mined.attributions);
+      ctx.queries.setMetadata(LAST_MINED_ISSUES_HEAD_KEY, mined.currentHead);
+    }
+  } catch (err) {
+    logDebug(`issue-history hook failed: ${err instanceof Error ?
err.message : String(err)}`); + } +} + +export const HOOK: IndexHook = { + name: 'issue-history', + afterIndexAll(ctx) { refresh(ctx, { fullRescan: true }); }, + afterSync(ctx) { refresh(ctx, { fullRescan: false }); }, +}; diff --git a/src/index-hooks/registry.ts b/src/index-hooks/registry.ts index ef799bf0..5a61e017 100644 --- a/src/index-hooks/registry.ts +++ b/src/index-hooks/registry.ts @@ -24,6 +24,7 @@ import { logDebug } from '../errors'; import { HOOK as CENTRALITY_HOOK } from './centrality'; import { HOOK as CHURN_HOOK } from './churn'; +import { HOOK as ISSUE_HISTORY_HOOK } from './issue-history'; /** * Static-import list of every registered hook. @@ -36,6 +37,7 @@ import { HOOK as CHURN_HOOK } from './churn'; const REGISTERED_HOOKS: readonly IndexHook[] = [ CENTRALITY_HOOK, CHURN_HOOK, + ISSUE_HISTORY_HOOK, ]; /** diff --git a/src/index.ts b/src/index.ts index 4f6a35c0..7558993f 100644 --- a/src/index.ts +++ b/src/index.ts @@ -568,6 +568,14 @@ export class CodeGraph { return this.queries.getHotspots(opts); } + getIssuesForNode(nodeId: string): Array<{ + issueNumber: number; + kind: 'modified' | 'added' | 'removed'; + commitSha: string; + }> { + return this.queries.getIssuesForNode(nodeId); + } + // =========================================================================== // File Watching // =========================================================================== diff --git a/src/issue-history/index.ts b/src/issue-history/index.ts new file mode 100644 index 00000000..ea94a355 --- /dev/null +++ b/src/issue-history/index.ts @@ -0,0 +1,235 @@ +/** + * Issue → symbol attribution from git history + * + * Mines commits whose subject or body matches `Fixes #N` / + * `Closes #N` / `Resolves #N` and attributes their hunks to the + * symbols they touched. Result is stored in the `symbol_issues` + * table and surfaced via `codegraph_node` so an agent inspecting + * `runInstaller` sees "modified by issues #37, #68, #69" inline. 
+ * + * Why hunk-level, not file-level: spike data (see `spike_issues.js` + * + `spike_issues_hunk.js`) showed that file-level produced ~40 + * symbols/issue, mostly noise — every issue touches files with + * many irrelevant symbols. Hunk-level is ~9 symbols/issue with + * 78% noise reduction, AND uniquely enables the multi-issue-symbol + * query (e.g. "loadGrammarsForLanguages was modified by every + * language-add issue") which file-level cannot answer because the + * intersection at file granularity is trivially huge. + * + * Convention: only `(Fixes|Closes|Resolves) #N` commits are mined. + * Generic commit messages without an issue ref are ignored — keeps + * signal-to-noise high. + * + * Known v1 limitations: + * - `Fixes #1, #2` only captures #1. The regex requires a verb + * prefix per match; `, #2` has no verb so it's skipped. Authors + * who care should write `Fixes #1, fixes #2`. Acceptable noise + * for v1; revisit if real projects show many comma-list misses. + * - Quoted issue references in commit bodies (e.g. "this reverts the + * 'Fixes #99' commit from last week") produce false positives. + * Detection would require message-block parsing; out of scope for v1. + */ + +import { execFileSync } from 'child_process'; +import { logDebug } from '../errors'; +import { parseCommitDiff } from './parse-diff'; + +/** Project-metadata key holding the HEAD SHA at the last successful mine. */ +export const LAST_MINED_ISSUES_HEAD_KEY = 'last_mined_issues_head'; + +/** + * Skip commits touching more than this many files. Squashed merges + * and mass refactors otherwise produce many false-positive + * attributions where every symbol in the commit gets credited to + * the issue. + */ +export const MAX_FILES_PER_COMMIT = 50; + +/** + * Match `fix #N` / `fixes #N` / `closes #N` / `resolves #N` (and + * past-tense variants), case-insensitive, allowing `:` or `-` + * between verb and `#`. Captures the issue number. 
+ */ +export const ISSUE_REGEX = + /\b(?:fix|fixes|fixed|close|closes|closed|resolve|resolves|resolved)\s*[:\-]?\s*#(\d+)/gi; + +const MAX_GIT_BUFFER = 200 * 1024 * 1024; +const GIT_TIMEOUT_MS = 60_000; + +export interface IssueCommit { + sha: string; + /** Distinct issue numbers referenced, in source order. */ + issues: number[]; +} + +export type AttributionKind = 'modified' | 'added' | 'removed'; + +export interface IssueAttribution { + nodeId: string; + issueNumber: number; + commitSha: string; + kind: AttributionKind; +} + +export interface IssueMineResult { + attributions: IssueAttribution[]; + /** HEAD SHA reached by this run. null when not in a git repo. */ + currentHead: string | null; + /** Caller's `sinceSha` was unreachable — caller clears + re-mines from scratch. */ + needsFullRescan: boolean; + /** Debug-only counter: (file, name) lookups that didn't resolve. */ + unresolvedCount: number; +} + +/** Resolver supplied by the caller: (file, name) → node_id | null. */ +export type SymbolResolver = (filePath: string, symbolName: string) => string | null; + +/** Get HEAD SHA, or null when not in a git repo / no commits yet. */ +export function getGitHead(rootDir: string): string | null { + try { + return ( + execFileSync('git', ['rev-parse', 'HEAD'], { + cwd: rootDir, + encoding: 'utf-8', + timeout: 5000, + stdio: ['pipe', 'pipe', 'pipe'], + }).trim() || null + ); + } catch { + return null; + } +} + +function isShaReachable(rootDir: string, sha: string): boolean { + try { + execFileSync('git', ['cat-file', '-e', `${sha}^{commit}`], { + cwd: rootDir, + timeout: 5000, + stdio: ['pipe', 'pipe', 'pipe'], + }); + return true; + } catch { + return false; + } +} + +/** + * Find commits whose message references at least one issue. Returns + * `[]` when not in a git repo or git fails (logged via logDebug; + * never throws to the caller). + * + * Format: `git log --no-merges -z --pretty=format:CGCMT-%H%n%s%n%b%n` — + * each commit terminated by a NUL. 
The body line lets us match
+ * trailers like `Fixes #N` that aren't in the subject.
+ */
+export function mineIssueCommits(rootDir: string, sinceSha: string | null): IssueCommit[] {
+  const args = ['log', '--no-merges', '-z', '--pretty=format:CGCMT-%H%n%s%n%b'];
+  if (sinceSha) args.push(`${sinceSha}..HEAD`);
+
+  let raw: string;
+  try {
+    raw = execFileSync('git', args, {
+      cwd: rootDir,
+      encoding: 'utf-8',
+      timeout: GIT_TIMEOUT_MS,
+      maxBuffer: MAX_GIT_BUFFER,
+      stdio: ['pipe', 'pipe', 'pipe'],
+    });
+  } catch (err) {
+    logDebug(`mineIssueCommits: git log failed: ${err instanceof Error ? err.message : String(err)}`);
+    return [];
+  }
+
+  const commits: IssueCommit[] = [];
+  const blocks = raw.split('\0');
+  const headerRe = /^CGCMT-([0-9a-f]{40})$/;
+  for (const block of blocks) {
+    const trimmed = block.trim();
+    if (!trimmed) continue;
+    const lines = trimmed.split('\n');
+    const m = headerRe.exec(lines[0] ?? '');
+    if (!m) continue;
+    const sha = m[1]!;
+    const messageBody = lines.slice(1).join('\n');
+    const issues = new Set<number>();
+    let match: RegExpExecArray | null;
+    ISSUE_REGEX.lastIndex = 0;
+    while ((match = ISSUE_REGEX.exec(messageBody)) !== null) {
+      const n = parseInt(match[1]!, 10);
+      if (Number.isFinite(n) && n > 0) issues.add(n);
+    }
+    if (issues.size > 0) commits.push({ sha, issues: [...issues] });
+  }
+  return commits;
+}
+
+/**
+ * Mine issue→symbol attributions.
+ *
+ * @param rootDir Project root.
+ * @param resolveSymbol (filePath, name) → nodeId | null. Closure
+ * over the current index. Names that don't
+ * resolve are dropped (counted as unresolved
+ * for diagnostics).
+ * @param sinceSha null = full mine; otherwise `..HEAD`.
+ * Unreachable shas trigger needsFullRescan.
+ */ +export function mineIssueHistory( + rootDir: string, + resolveSymbol: SymbolResolver, + sinceSha: string | null +): IssueMineResult { + const empty: IssueMineResult = { + attributions: [], + currentHead: null, + needsFullRescan: false, + unresolvedCount: 0, + }; + + const head = getGitHead(rootDir); + if (!head) return empty; + + if (sinceSha && !isShaReachable(rootDir, sinceSha)) { + return { attributions: [], currentHead: head, needsFullRescan: true, unresolvedCount: 0 }; + } + if (sinceSha === head) { + return { attributions: [], currentHead: head, needsFullRescan: false, unresolvedCount: 0 }; + } + + const commits = mineIssueCommits(rootDir, sinceSha); + const attributions: IssueAttribution[] = []; + let unresolvedCount = 0; + + for (const c of commits) { + let perFile; + try { + perFile = parseCommitDiff(rootDir, c.sha); + } catch (err) { + logDebug(`parseCommitDiff failed for ${c.sha}: ${err instanceof Error ? err.message : String(err)}`); + continue; + } + if (perFile.size > MAX_FILES_PER_COMMIT) { + // Squashed mass-refactor — the issue ref is real but the per-symbol + // attribution would be all noise. Skip the whole commit. + continue; + } + for (const [filePath, sets] of perFile) { + const emit = (name: string, kind: AttributionKind) => { + const nodeId = resolveSymbol(filePath, name); + if (!nodeId) { + unresolvedCount += 1; + return; + } + for (const issue of c.issues) { + attributions.push({ nodeId, issueNumber: issue, commitSha: c.sha, kind }); + } + }; + // Order: modified first, then added, then removed. Stable for tests. 
+ for (const name of sets.modCtx) emit(name, 'modified'); + for (const name of sets.added) emit(name, 'added'); + for (const name of sets.removed) emit(name, 'removed'); + } + } + + return { attributions, currentHead: head, needsFullRescan: false, unresolvedCount }; +} diff --git a/src/issue-history/parse-diff.ts b/src/issue-history/parse-diff.ts new file mode 100644 index 00000000..e697cbdc --- /dev/null +++ b/src/issue-history/parse-diff.ts @@ -0,0 +1,208 @@ +/** + * Diff parsing for issue → symbol attribution + * + * Pure parser: no I/O, no git invocations beyond the one `git show` it + * uses to fetch a commit's full diff. Splits into two distinct signals + * per (commit, file): + * + * modCtx — the *enclosing* function/class of each hunk, taken from + * git's `@@ -... +... @@ ` header. Cross-language + * because git's userdiff regex covers it (TS/JS/Py/Go/ + * Java/C/C++/Rust/Ruby out of the box). + * + * added — declarations on `+` lines (newly-introduced symbols). + * removed — declarations on `-` lines (deleted symbols). + * + * Both signals matter independently: an issue that *modifies* `foo()` + * is different evidence from an issue that *adds* `foo()`. The MCP + * surface renders them with explicit kind tags so an agent can tell + * the difference. + */ + +import { execFileSync } from 'child_process'; + +/** Hard cap on git output we'll buffer (bytes). */ +const MAX_GIT_BUFFER = 200 * 1024 * 1024; +/** Wall-clock cap on a single git invocation (ms). */ +const GIT_TIMEOUT_MS = 60_000; + +/** Identifiers that look like declarations to the loose `name(` regex + * but are actually keywords / locals — never represent indexable + * symbols. Filtering them keeps the resolved hit-rate high. 
*/ +const SKIP_NAMES = new Set([ + 'if', 'for', 'while', 'switch', 'catch', 'return', 'throw', 'await', + 'new', 'function', 'class', 'interface', 'const', 'let', 'var', + 'export', 'import', 'public', 'private', 'protected', 'static', + 'async', 'abstract', 'default', 'super', 'this', 'true', 'false', + 'null', 'undefined', 'void', 'typeof', 'instanceof', + 'describe', 'it', 'expect', 'test', 'beforeEach', 'afterEach', + 'beforeAll', 'afterAll', // popular test-framework names; not symbols + 'constructor', // not a top-level symbol — owned by class +]); + +/** Path patterns we never extract diff symbols from. */ +const SKIP_PATH_RE = + /^(?:dist\/|node_modules\/|\.codegraph\/|coverage\/|build\/|out\/)|\.lock$|\.snap$|^package(?:-lock)?\.json$|\.md$|\.json$|\.svg$|\.png$|\.jpg$|\.gif$|\.ico$|\.txt$|\.yml$|\.yaml$|\.toml$/i; + +/** Declaration patterns; capture group 1 is the symbol name. + * Designed to be loose — better to over-collect and miss in the + * symbol-resolver step than to under-collect (the resolver is cheap). */ +const DECL_PATTERNS: RegExp[] = [ + // function foo / function* foo / async function foo + /^[+\-]\s*(?:export\s+)?(?:async\s+)?function\s*\*?\s+([A-Za-z_$][\w$]*)/, + // class Foo / abstract class Foo / export class Foo + /^[+\-]\s*(?:export\s+)?(?:abstract\s+)?class\s+([A-Za-z_$][\w$]*)/, + // interface Foo + /^[+\-]\s*(?:export\s+)?interface\s+([A-Za-z_$][\w$]*)/, + // type Foo = ... / type alias + /^[+\-]\s*(?:export\s+)?type\s+([A-Za-z_$][\w$]*)\s*=/, + // enum Foo + /^[+\-]\s*(?:export\s+)?(?:const\s+)?enum\s+([A-Za-z_$][\w$]*)/, + // const Foo = (..) => / const Foo = function + /^[+\-]\s*(?:export\s+)?const\s+([A-Z][\w$]*)\s*=\s*(?:\([^)]*\)\s*=>|function|async\s)/, + // method-like: visibility? 
name( (loose; SKIP_NAMES filters keywords)
+  /^[+\-]\s*(?:public|private|protected|static|async)\s+(?:[a-z]+\s+)*([A-Za-z_$][\w$]*)\s*\(/,
+  // Python: def name( / async def name(
+  /^[+\-]\s*(?:async\s+)?def\s+([A-Za-z_][\w]*)\s*\(/,
+  // Go: func name( / func (recv) name(
+  /^[+\-]\s*func\s+(?:\([^)]*\)\s+)?([A-Za-z_][\w]*)\s*\(/,
+  // Rust: fn name( / pub fn name<...>(
+  /^[+\-]\s*(?:pub(?:\([^)]*\))?\s+)?(?:async\s+)?fn\s+([A-Za-z_][\w]*)\s*[<(]/,
+];
+
+export interface FileDiffSets {
+  modCtx: Set<string>;
+  added: Set<string>;
+  removed: Set<string>;
+}
+
+/**
+ * Pull the symbol name out of a git `@@ ... @@ <context>` context line.
+ * Git's userdiff regexes already give us a single line that includes
+ * the enclosing definition (e.g. `function processOrder(order: Order)
+ * {`). We take the first identifier following a recognised keyword,
+ * falling back to "first identifier-followed-by-paren" for languages
+ * git doesn't have explicit userdiff for.
+ */
+export function extractSymbolFromContext(ctx: string): string | null {
+  const trimmed = ctx.trim();
+  if (!trimmed) return null;
+  // Order of patterns matters: anchor on keyword first, then on
+  // identifier-followed-by-paren.
+  const m1 = trimmed.match(/(?:function|class|interface|type|enum|def|func|fn)\s+([A-Za-z_$][\w$]*)/);
+  if (m1 && !SKIP_NAMES.has(m1[1]!)) return m1[1]!;
+  const m2 = trimmed.match(/^([A-Za-z_$][\w$]*)\s*\(/);
+  if (m2 && !SKIP_NAMES.has(m2[1]!)) return m2[1]!;
+  // Methods: ` async foo(` after some indentation, with possibly a
+  // visibility modifier we already skipped above.
+  const m3 = trimmed.match(/(?:async\s+)?([A-Za-z_$][\w$]*)\s*\(/);
+  if (m3 && !SKIP_NAMES.has(m3[1]!)) return m3[1]!;
+  return null;
+}
+
+/**
+ * Pull a declared symbol name out of a single `+` or `-` diff line.
+ */ +export function extractDeclaration(diffLine: string): { name: string; sign: '+' | '-' } | null { + if (!diffLine || (diffLine[0] !== '+' && diffLine[0] !== '-')) return null; + // Skip the file-marker lines emitted by git. + if (diffLine.startsWith('+++') || diffLine.startsWith('---')) return null; + for (const re of DECL_PATTERNS) { + const m = re.exec(diffLine); + if (m && m[1] && !SKIP_NAMES.has(m[1])) { + return { name: m[1], sign: diffLine[0] as '+' | '-' }; + } + } + return null; +} + +/** + * Pull a declaration name out of an unchanged (` `-prefixed) diff + * line. Used to detect the enclosing function when git's `@@ ... @@ + * ` header is empty (which happens when the changed hunk lives + * inside a function that starts at line 1, so there's no enclosing + * scope *above* the hunk for git's userdiff to reference). + * + * Matches the same patterns as `extractDeclaration` but allows a + * leading space (the diff context-line prefix). + */ +export function extractContextDeclaration(diffLine: string): string | null { + if (!diffLine || diffLine[0] !== ' ') return null; + for (const re of DECL_PATTERNS) { + // DECL_PATTERNS anchor on `[+\-]` — accept space too by trying + // again with that prefix swapped. + const swapped = '+' + diffLine.slice(1); + const m = re.exec(swapped); + if (m && m[1] && !SKIP_NAMES.has(m[1])) return m[1]; + } + return null; +} + +/** + * Run `git show ` and parse the diff into per-file + * (modCtx, added, removed) sets. + * + * Throws if git fails (caller should catch + log + skip the commit). 
+ */ +export function parseCommitDiff(rootDir: string, commitSha: string): Map { + const out = execFileSync( + 'git', + ['show', commitSha, '--unified=3', '--no-color', '--no-renames'], + { + cwd: rootDir, + encoding: 'utf-8', + timeout: GIT_TIMEOUT_MS, + maxBuffer: MAX_GIT_BUFFER, + stdio: ['pipe', 'pipe', 'pipe'], + } + ); + const lines = out.split('\n'); + const perFile = new Map(); + let curFile: string | null = null; + + for (const L of lines) { + if (L.startsWith('diff --git ')) { + // `diff --git a/ b/` — take the new path (post-rename + // would normally apply here but we passed --no-renames). + const m = L.match(/^diff --git a\/(.+?) b\/(.+)$/); + if (m) { + curFile = m[2]!; + if (SKIP_PATH_RE.test(curFile)) { + curFile = null; // signal to subsequent rows: skip + continue; + } + if (!perFile.has(curFile)) { + perFile.set(curFile, { modCtx: new Set(), added: new Set(), removed: new Set() }); + } + } + continue; + } + if (curFile === null) continue; + if (L.startsWith('@@')) { + // `@@ -a,b +c,d @@ ` + const m = L.match(/^@@\s+-\d+(?:,\d+)?\s+\+\d+(?:,\d+)?\s+@@\s*(.*)$/); + if (m && m[1]) { + const sym = extractSymbolFromContext(m[1]); + if (sym) perFile.get(curFile)!.modCtx.add(sym); + } + continue; + } + const decl = extractDeclaration(L); + if (decl) { + const sets = perFile.get(curFile)!; + if (decl.sign === '+') sets.added.add(decl.name); + else sets.removed.add(decl.name); + continue; + } + // Fallback: an unchanged context line within a hunk that contains + // a declaration is the enclosing scope for that hunk. This catches + // the case where the function's signature is at line 1 (so git's + // userdiff has no scope *above* the hunk to use as @@ ). 
+ const ctxName = extractContextDeclaration(L); + if (ctxName) { + perFile.get(curFile)!.modCtx.add(ctxName); + } + } + + return perFile; +} diff --git a/src/mcp/tools.ts b/src/mcp/tools.ts index 52b8e99e..8e5759e5 100644 --- a/src/mcp/tools.ts +++ b/src/mcp/tools.ts @@ -724,7 +724,10 @@ export class ToolHandler implements ToolHandlerLike { code = await cg.getCode(match.node.id); } - const formatted = this.formatNodeDetails(match.node, code) + match.note; + // Surface issue history (mined from `Fixes #N` commits). + const issues = cg.getIssuesForNode(match.node.id); + + const formatted = this.formatNodeDetails(match.node, code, issues) + match.note; return this.textResult(this.truncateOutput(formatted)); } @@ -1156,7 +1159,15 @@ export class ToolHandler implements ToolHandlerLike { return lines.join('\n'); } - private formatNodeDetails(node: Node, code: string | null): string { + private formatNodeDetails( + node: Node, + code: string | null, + issues: Array<{ + issueNumber: number; + kind: 'modified' | 'added' | 'removed'; + commitSha: string; + }> = [] + ): string { const location = node.startLine ? 
`:${node.startLine}` : ''; const lines: string[] = [ `## ${node.name} (${node.kind})`, @@ -1168,6 +1179,25 @@ export class ToolHandler implements ToolHandlerLike { lines.push(`**Signature:** \`${node.signature}\``); } + if (issues.length > 0) { + const byKind: Record<'modified' | 'added' | 'removed', Set> = { + modified: new Set(), + added: new Set(), + removed: new Set(), + }; + for (const i of issues) byKind[i.kind].add(i.issueNumber); + const parts: string[] = []; + for (const k of ['modified', 'added', 'removed'] as const) { + const set = byKind[k]; + if (set.size === 0) continue; + const sorted = [...set].sort((a, b) => a - b); + parts.push(`#${sorted.join(', #')} (${k})`); + } + if (parts.length > 0) { + lines.push(`**Issues:** ${parts.join(' — ')}`); + } + } + // Only include docstring if it's short and useful if (node.docstring && node.docstring.length < 200) { lines.push('', node.docstring); diff --git a/src/types.ts b/src/types.ts index 64fbcaa9..4ce51c0c 100644 --- a/src/types.ts +++ b/src/types.ts @@ -511,6 +511,13 @@ export interface CodeGraphConfig { * Enabled by default. */ enableChurn?: boolean; + + /** + * Mine `Fixes/Closes/Resolves #N` commits and attribute issues to + * symbols touched by their hunks. Enabled by default; turn off on + * non-GitHub repos or where issue refs are noisy. + */ + enableIssueHistory?: boolean; } // `DEFAULT_CONFIG` lives in `./default-config.ts` so its `include` From f8fc536feba4bd063224c687f480451f474c9d5a Mon Sep 17 00:00:00 2001 From: andreinknv Date: Mon, 27 Apr 2026 17:57:13 -0400 Subject: [PATCH 8/9] feat: PR #114 (config-refs) on top of refactors Extracts env-var read sites (process.env.X, os.getenv("X"), etc) into config_refs and exposes them via codegraph_config MCP tool. Lands as a registered IndexHook (config-refs). 
- Migration 006: config_refs table - src/config-refs/ (pure module): regex-based extractor - src/index-hooks/config-refs.ts (registered hook with full / files scoping for indexAll vs sync) - CodeGraph public methods: getConfigKeys, getConfigRefsByKey, getConfigKeysForNode - codegraph_config MCP tool wired through ToolModule registry - enableConfigRefs flag default true - Removed defensive ensureConfigRefsTable guard + its test for the same reason as PR #113: v4-collision bug class is impossible under file-based migrations. Tests: 488/489 pass (1 watcher flake under load). Co-Authored-By: Claude Opus 4.7 (1M context) --- __tests__/config-refs.test.ts | 288 +++++++++++++++++++++++++++ __tests__/foundation.test.ts | 2 +- __tests__/mcp-tool-registry.test.ts | 1 + __tests__/pr19-improvements.test.ts | 2 +- src/config-refs/index.ts | 188 +++++++++++++++++ src/config.ts | 1 + src/db/migrations/006-config-refs.ts | 24 +++ src/db/migrations/index.ts | 2 + src/db/queries.ts | 110 ++++++++++ src/db/schema.sql | 21 ++ src/default-config.ts | 1 + src/index-hooks/config-refs.ts | 77 +++++++ src/index-hooks/registry.ts | 2 + src/index.ts | 17 ++ src/mcp/tools.ts | 45 +++++ src/mcp/tools/config.ts | 26 +++ src/mcp/tools/registry.ts | 2 + src/mcp/tools/types.ts | 3 +- src/types.ts | 6 + 19 files changed, 815 insertions(+), 3 deletions(-) create mode 100644 __tests__/config-refs.test.ts create mode 100644 src/config-refs/index.ts create mode 100644 src/db/migrations/006-config-refs.ts create mode 100644 src/index-hooks/config-refs.ts create mode 100644 src/mcp/tools/config.ts diff --git a/__tests__/config-refs.test.ts b/__tests__/config-refs.test.ts new file mode 100644 index 00000000..ab1a63e4 --- /dev/null +++ b/__tests__/config-refs.test.ts @@ -0,0 +1,288 @@ +/** + * Config-refs tests: parser unit tests + end-to-end through CodeGraph. 
+ */ +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import * as fs from 'fs'; +import * as os from 'os'; +import * as path from 'path'; +import { extractConfigRefs } from '../src/config-refs'; +import CodeGraph from '../src/index'; + +let testDir: string; +let cg: CodeGraph | null = null; + +function write(rel: string, content: string) { + const abs = path.join(testDir, rel); + fs.mkdirSync(path.dirname(abs), { recursive: true }); + fs.writeFileSync(abs, content); +} + +beforeEach(() => { + testDir = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-config-')); +}); + +afterEach(() => { + if (cg) { + cg.destroy(); + cg = null; + } + if (fs.existsSync(testDir)) fs.rmSync(testDir, { recursive: true, force: true }); +}); + +// ============================================================================ +// Pure parser tests (no CodeGraph) +// ============================================================================ + +describe('extractConfigRefs', () => { + it('extracts process.env.X from TS', () => { + write('a.ts', `const port = process.env.OBSIDIAN_PORT;\n`); + const refs = extractConfigRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null); + expect(refs.length).toBe(1); + expect(refs[0]!.configKey).toBe('OBSIDIAN_PORT'); + expect(refs[0]!.line).toBe(1); + }); + + it('extracts process.env["X"] from JS', () => { + write('a.js', `module.exports = { port: process.env["MY_KEY"] };\n`); + const refs = extractConfigRefs(testDir, [{ path: 'a.js', language: 'javascript' }], () => null); + expect(refs.map((r) => r.configKey)).toEqual(['MY_KEY']); + }); + + it('extracts os.getenv / os.environ from Python', () => { + write( + 'a.py', + [ + `import os`, + `port = os.getenv("PYTHON_PORT")`, + `host = os.environ.get("PYTHON_HOST")`, + `path = os.environ["PYTHON_PATH"]`, + `name = getenv("PYTHON_NAME")`, + ].join('\n') + ); + const refs = extractConfigRefs(testDir, [{ path: 'a.py', language: 'python' }], () => null); + expect(new 
Set(refs.map((r) => r.configKey))).toEqual( + new Set(['PYTHON_PORT', 'PYTHON_HOST', 'PYTHON_PATH', 'PYTHON_NAME']) + ); + }); + + it('extracts os.Getenv / os.LookupEnv from Go', () => { + write( + 'a.go', + [ + `package main`, + `import "os"`, + `var Port = os.Getenv("GO_PORT")`, + `var Host, _ = os.LookupEnv("GO_HOST")`, + ].join('\n') + ); + const refs = extractConfigRefs(testDir, [{ path: 'a.go', language: 'go' }], () => null); + expect(new Set(refs.map((r) => r.configKey))).toEqual(new Set(['GO_PORT', 'GO_HOST'])); + }); + + it('extracts ENV[...] / ENV.fetch from Ruby', () => { + write('a.rb', `port = ENV["RUBY_PORT"]\nhost = ENV.fetch("RUBY_HOST")\n`); + const refs = extractConfigRefs(testDir, [{ path: 'a.rb', language: 'ruby' }], () => null); + expect(new Set(refs.map((r) => r.configKey))).toEqual(new Set(['RUBY_PORT', 'RUBY_HOST'])); + }); + + it('extracts env!/std::env::var from Rust', () => { + write( + 'a.rs', + [ + `let port = env!("RUST_PORT");`, + `let host = std::env::var("RUST_HOST").unwrap();`, + ].join('\n') + ); + const refs = extractConfigRefs(testDir, [{ path: 'a.rs', language: 'rust' }], () => null); + expect(new Set(refs.map((r) => r.configKey))).toEqual(new Set(['RUST_PORT', 'RUST_HOST'])); + }); + + it('extracts System.getenv from Java/Kotlin', () => { + write('A.java', `String port = System.getenv("JAVA_PORT");\n`); + const refs = extractConfigRefs(testDir, [{ path: 'A.java', language: 'java' }], () => null); + expect(refs.map((r) => r.configKey)).toEqual(['JAVA_PORT']); + }); + + it('only matches UPPER_CASE keys (skips lower-case identifiers)', () => { + write('a.ts', `const x = process.env.somethingDynamic;\nconst y = process.env.GOOD_KEY;\n`); + const refs = extractConfigRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null); + expect(refs.map((r) => r.configKey)).toEqual(['GOOD_KEY']); + }); + + it('skips files in unsupported languages without crashing', () => { + write('a.swift', `let port = 
ProcessInfo.processInfo.environment["SWIFT_PORT"]\n`); + const refs = extractConfigRefs(testDir, [{ path: 'a.swift', language: 'swift' }], () => null); + // Swift not in PATTERNS for v1. + expect(refs).toEqual([]); + }); + + it('captures the correct 1-indexed line number', () => { + write( + 'a.ts', + [ + `// line 1`, + `// line 2`, + `const x = process.env.LINE_THREE_KEY;`, + `// line 4`, + `const y = process.env.LINE_FIVE_KEY;`, + ].join('\n') + ); + const refs = extractConfigRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null); + expect(refs).toEqual([ + expect.objectContaining({ configKey: 'LINE_THREE_KEY', line: 3 }), + expect.objectContaining({ configKey: 'LINE_FIVE_KEY', line: 5 }), + ]); + }); + + it('threads the resolveEnclosing closure correctly', () => { + write('a.ts', `const x = process.env.FOO;\n`); + const calls: Array<[string, number]> = []; + extractConfigRefs( + testDir, + [{ path: 'a.ts', language: 'typescript' }], + (filePath, line) => { + calls.push([filePath, line]); + return 'fake-node-id'; + } + ); + expect(calls).toEqual([['a.ts', 1]]); + }); + + it('survives a missing file (skips, no throw)', () => { + const refs = extractConfigRefs( + testDir, + [{ path: 'does-not-exist.ts', language: 'typescript' }], + () => null + ); + expect(refs).toEqual([]); + }); +}); + +// ============================================================================ +// End-to-end through CodeGraph +// ============================================================================ + +describe('CodeGraph config refs', () => { + it('persists env reads after indexAll and resolves enclosing function', async () => { + write( + 'src/server.ts', + [ + `export function start() {`, + ` const port = process.env.OBSIDIAN_PORT ?? 
8080;`, + ` return port;`, + `}`, + ``, + `export function getApiKey() {`, + ` return process.env.OBSIDIAN_API_KEY;`, + `}`, + ``, + `// top-level read`, + `export const HOST = process.env.OBSIDIAN_HOST;`, + ].join('\n') + ); + cg = CodeGraph.initSync(testDir, { + config: { include: ['**/*.ts'], exclude: [] }, + }); + await cg.indexAll(); + + // All three keys should be visible. + const keys = cg.getConfigKeys({ configKind: 'env' }); + expect(keys.map((k) => k.configKey).sort()).toEqual([ + 'OBSIDIAN_API_KEY', + 'OBSIDIAN_HOST', + 'OBSIDIAN_PORT', + ]); + + // The OBSIDIAN_PORT read should be attributed to `start`. + const portSites = cg.getConfigRefsByKey('OBSIDIAN_PORT'); + expect(portSites.length).toBe(1); + expect(portSites[0]!.sourceName).toBe('start'); + + // The HOST read is at the top level — sourceName should be null. + const hostSites = cg.getConfigRefsByKey('OBSIDIAN_HOST'); + expect(hostSites[0]!.sourceName).toBeNull(); + }); + + it('reverse view: getConfigKeysForNode returns keys read by a function', async () => { + write( + 'src/a.ts', + [ + `export function loadConfig() {`, + ` const a = process.env.KEY_A;`, + ` const b = process.env.KEY_B;`, + ` return { a, b };`, + `}`, + ].join('\n') + ); + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + + const node = cg.getNodesInFile('src/a.ts').find((n) => n.name === 'loadConfig')!; + const keys = cg.getConfigKeysForNode(node.id).map((r) => r.configKey).sort(); + expect(keys).toEqual(['KEY_A', 'KEY_B']); + }); + + it('respects enableConfigRefs=false', async () => { + write('src/a.ts', `export const PORT = process.env.PORT;\n`); + cg = CodeGraph.initSync(testDir, { + config: { include: ['**/*.ts'], exclude: [], enableConfigRefs: false }, + }); + await cg.indexAll(); + expect(cg.getConfigKeys()).toEqual([]); + }); + + it('incremental sync replaces refs for changed files only', async () => { + write('src/a.ts', `export const A = 
process.env.OLD_KEY;\n`); + write('src/b.ts', `export const B = process.env.UNCHANGED_KEY;\n`); + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + expect(cg.getConfigKeys().map((k) => k.configKey).sort()).toEqual([ + 'OLD_KEY', + 'UNCHANGED_KEY', + ]); + + // Edit only a.ts — UNCHANGED_KEY should still be there. + write('src/a.ts', `export const A = process.env.NEW_KEY;\n`); + await cg.sync(); + + const keys = cg.getConfigKeys().map((k) => k.configKey).sort(); + expect(keys).toContain('NEW_KEY'); + expect(keys).toContain('UNCHANGED_KEY'); + expect(keys).not.toContain('OLD_KEY'); + }); + + it('drops refs when a file is edited to remove its last env read', async () => { + // Regression for the empty-rows early-return data-corruption bug: + // applyConfigRefs([]) used to short-circuit without deleting the + // stale rows for the file. The sync path now explicitly invalidates + // rows for every changed file *before* extracting, regardless of + // whether the new content has any reads. + write('src/a.ts', `export const PORT = process.env.REMOVED_KEY;\n`); + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + expect(cg.getConfigKeys().some((k) => k.configKey === 'REMOVED_KEY')).toBe(true); + + // Edit a.ts to remove the env read entirely (no remaining reads). 
+ write('src/a.ts', `export const PORT = 8080; // no env read here\n`); + await cg.sync(); + + expect(cg.getConfigKeys().some((k) => k.configKey === 'REMOVED_KEY')).toBe(false); + }); + + it('drops refs for files removed between syncs', async () => { + write('src/a.ts', `export const A = process.env.GOING_AWAY;\n`); + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + expect(cg.getConfigKeys().some((k) => k.configKey === 'GOING_AWAY')).toBe(true); + + fs.unlinkSync(path.join(testDir, 'src/a.ts')); + await cg.sync(); + + expect(cg.getConfigKeys().some((k) => k.configKey === 'GOING_AWAY')).toBe(false); + }); + + // (Removed: a defensive test for the v4-migration-collision bug class. + // With file-based migrations (NNN-name.ts), two PRs claiming the same + // version produces a filesystem-level conflict, so the silent skip the + // defensive guard protected against can no longer happen.) +}); diff --git a/__tests__/foundation.test.ts b/__tests__/foundation.test.ts index 20ada266..805120b6 100644 --- a/__tests__/foundation.test.ts +++ b/__tests__/foundation.test.ts @@ -305,7 +305,7 @@ describe('Database Connection', () => { const version = db.getSchemaVersion(); expect(version).not.toBeNull(); - expect(version?.version).toBe(5); + expect(version?.version).toBe(6); db.close(); }); diff --git a/__tests__/mcp-tool-registry.test.ts b/__tests__/mcp-tool-registry.test.ts index b8ce3025..a956eec8 100644 --- a/__tests__/mcp-tool-registry.test.ts +++ b/__tests__/mcp-tool-registry.test.ts @@ -41,6 +41,7 @@ describe('MCP tool registry — single source of truth', () => { const expected = [ 'codegraph_callees', 'codegraph_callers', + 'codegraph_config', 'codegraph_context', 'codegraph_explore', 'codegraph_files', diff --git a/__tests__/pr19-improvements.test.ts b/__tests__/pr19-improvements.test.ts index 5974b549..6768f256 100644 --- a/__tests__/pr19-improvements.test.ts +++ b/__tests__/pr19-improvements.test.ts @@ -299,7 
+299,7 @@ describe('Best-Candidate Resolution', () => { describe('Schema v2 Migration', () => { it.skipIf(!HAS_SQLITE)('should have correct current schema version', async () => { const { CURRENT_SCHEMA_VERSION } = await import('../src/db/migrations'); - expect(CURRENT_SCHEMA_VERSION).toBe(5); + expect(CURRENT_SCHEMA_VERSION).toBe(6); }); it.skipIf(!HAS_SQLITE)('should have migration for version 2', async () => { diff --git a/src/config-refs/index.ts b/src/config-refs/index.ts new file mode 100644 index 00000000..1ef47ae9 --- /dev/null +++ b/src/config-refs/index.ts @@ -0,0 +1,188 @@ +/** + * Config-reference extraction + * + * Scans indexed source files for known config-read patterns + * (`process.env.X`, `os.getenv("X")`, etc.) and records each read + * site as a row in `config_refs`. Each row links to its enclosing + * function via a line-range lookup against the existing nodes table, + * so an agent asking "what reads OBSIDIAN_PORT?" gets a list of real + * functions, not a grep wall. + * + * Why a separate table, not graph nodes/edges: env vars don't have a + * single source-of-truth file (they're a global namespace), so giving + * them a synthetic file_path would pollute the main graph. The table + * is queried via a dedicated MCP tool (`codegraph_config`) and via + * augmented `codegraph_node` output (per-function "reads:" line). + * + * Spike validation (mcp-obsidian-extended): 71 reads, 19 distinct + * keys; 8× OBSIDIAN_PORT, 8× TOOL_PRESET surface as central + * config knobs. Codegraph-itself is sparse (4 reads) — this feature + * shines on service-style codebases. + * + * V1 scope: env-only, regex-based per-language. YAML key reads, + * LaunchDarkly flags, etc. are deliberately out of scope; the schema + * already supports them via `config_kind` so adding them later is a + * pattern addition, not a redesign. 
+ */ + +import * as fs from 'fs'; +import * as path from 'path'; +import { logDebug } from '../errors'; + +export type ConfigKind = 'env'; + +export interface ConfigRef { + configKind: ConfigKind; + configKey: string; + /** Indexed-symbol id for the enclosing function/method. NULL = top-level. */ + sourceNodeId: string | null; + filePath: string; + line: number; +} + +interface PatternDef { + /** Languages this pattern applies to (matches `Language` in types.ts). */ + languages: string[]; + /** Regex with capture group 1 = config key. */ + re: RegExp; +} + +/** + * Per-language read-pattern catalogue. + * + * Patterns intentionally err on the side of including only + * UPPER_CASE_KEYS — the convention every framework follows for env + * vars. This avoids false positives like `process.env.foo` (a Node + * variable) or `os.getenv(some_var)` (dynamic). + */ +const PATTERNS: PatternDef[] = [ + // process.env.FOO / process.env["FOO"] (TS, JS, TSX, JSX) + { + languages: ['typescript', 'javascript', 'tsx', 'jsx'], + re: /process\.env\.([A-Z_][A-Z0-9_]*)/g, + }, + { + languages: ['typescript', 'javascript', 'tsx', 'jsx'], + re: /process\.env\[\s*['"]([A-Z_][A-Z0-9_]*)['"]\s*\]/g, + }, + // os.getenv("FOO") / os.environ.get("FOO") / os.environ["FOO"] + { + languages: ['python'], + re: /\bos\.getenv\(\s*['"]([A-Z_][A-Z0-9_]*)['"]/g, + }, + { + languages: ['python'], + re: /\bos\.environ\.get\(\s*['"]([A-Z_][A-Z0-9_]*)['"]/g, + }, + { + languages: ['python'], + re: /\bos\.environ\[\s*['"]([A-Z_][A-Z0-9_]*)['"]\s*\]/g, + }, + // Bare getenv("FOO") (Python convention with `from os import getenv`) + { + languages: ['python'], + re: /\bgetenv\(\s*['"]([A-Z_][A-Z0-9_]*)['"]/g, + }, + // os.Getenv("FOO") / os.LookupEnv("FOO") (Go) + { + languages: ['go'], + re: /\bos\.(?:Getenv|LookupEnv)\(\s*"([A-Z_][A-Z0-9_]*)"/g, + }, + // System.getenv("FOO") (Java/Kotlin) + { + languages: ['java', 'kotlin'], + re: /\bSystem\.getenv\(\s*"([A-Z_][A-Z0-9_]*)"/g, + }, + // ENV["FOO"] / 
ENV.fetch("FOO") (Ruby) + { + languages: ['ruby'], + re: /\bENV\[\s*['"]([A-Z_][A-Z0-9_]*)['"]\s*\]/g, + }, + { + languages: ['ruby'], + re: /\bENV\.fetch\(\s*['"]([A-Z_][A-Z0-9_]*)['"]/g, + }, + // Rust: env!("FOO") / std::env::var("FOO") + { + languages: ['rust'], + re: /\benv!\(\s*"([A-Z_][A-Z0-9_]*)"/g, + }, + { + languages: ['rust'], + re: /\bstd::env::var\(\s*"([A-Z_][A-Z0-9_]*)"/g, + }, +]; + +/** A file's languages-of-interest. Skip everything not in PATTERNS. */ +const SUPPORTED_LANGUAGES = new Set( + PATTERNS.flatMap((p) => p.languages) +); + +/** + * Resolver supplied by caller: (filePath, line) → enclosing nodeId + * (function/method/class). Returns null when the read is at the file's + * top level — the row still gets persisted with NULL source_node_id. + */ +export type EnclosingNodeResolver = (filePath: string, line: number) => string | null; + +export interface FileTarget { + path: string; + language: string; +} + +/** + * Scan a list of (path, language) targets and return all read sites. + * Pure I/O + regex; the caller owns DB writes via `applyConfigRefs`. + * + * Files we can't read (deleted, permission, binary) are silently + * skipped — extraction has already validated readability for the rest. + */ +export function extractConfigRefs( + rootDir: string, + targets: Iterable, + resolveEnclosing: EnclosingNodeResolver +): ConfigRef[] { + const refs: ConfigRef[] = []; + for (const t of targets) { + if (!SUPPORTED_LANGUAGES.has(t.language)) continue; + let src: string; + try { + src = fs.readFileSync(path.join(rootDir, t.path), 'utf8'); + } catch (err) { + logDebug(`extractConfigRefs: read failed for ${t.path}: ${err instanceof Error ? err.message : String(err)}`); + continue; + } + // Iterate lines so we can attribute each match to a 1-indexed line. + const lines = src.split('\n'); + for (let i = 0; i < lines.length; i++) { + const line = lines[i]!; + // Cheap pre-filter to skip the 99% of lines that obviously + // contain no env reference. 
Cuts per-file cost dramatically on + // big repos. + if ( + !line.includes('env') && + !line.includes('Env') && + !line.includes('ENV') + ) { + continue; + } + for (const pat of PATTERNS) { + if (!pat.languages.includes(t.language)) continue; + pat.re.lastIndex = 0; + let m: RegExpExecArray | null; + while ((m = pat.re.exec(line)) !== null) { + const key = m[1]!; + const lineNo = i + 1; + refs.push({ + configKind: 'env', + configKey: key, + sourceNodeId: resolveEnclosing(t.path, lineNo), + filePath: t.path, + line: lineNo, + }); + } + } + } + } + return refs; +} diff --git a/src/config.ts b/src/config.ts index 44d075dc..00adf9a5 100644 --- a/src/config.ts +++ b/src/config.ts @@ -131,6 +131,7 @@ function mergeConfig( enableCentrality: overrides.enableCentrality ?? defaults.enableCentrality, enableChurn: overrides.enableChurn ?? defaults.enableChurn, enableIssueHistory: overrides.enableIssueHistory ?? defaults.enableIssueHistory, + enableConfigRefs: overrides.enableConfigRefs ?? defaults.enableConfigRefs, }; } diff --git a/src/db/migrations/006-config-refs.ts b/src/db/migrations/006-config-refs.ts new file mode 100644 index 00000000..8fed1a91 --- /dev/null +++ b/src/db/migrations/006-config-refs.ts @@ -0,0 +1,24 @@ +import type { MigrationModule } from './types'; + +export const MIGRATION: MigrationModule = { + description: 'Add config_refs table for env var / feature flag read sites', + up: (db) => { + db.exec(` + CREATE TABLE IF NOT EXISTS config_refs ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + config_kind TEXT NOT NULL, + config_key TEXT NOT NULL, + source_node_id TEXT, + file_path TEXT NOT NULL, + line INTEGER NOT NULL, + FOREIGN KEY (source_node_id) REFERENCES nodes(id) ON DELETE CASCADE + ); + CREATE INDEX IF NOT EXISTS idx_config_refs_key + ON config_refs(config_kind, config_key); + CREATE INDEX IF NOT EXISTS idx_config_refs_node + ON config_refs(source_node_id); + CREATE INDEX IF NOT EXISTS idx_config_refs_file + ON config_refs(file_path); + `); + }, +}; diff 
--git a/src/db/migrations/index.ts b/src/db/migrations/index.ts index cd3e3ba3..525fe2a2 100644 --- a/src/db/migrations/index.ts +++ b/src/db/migrations/index.ts @@ -28,6 +28,7 @@ import { MIGRATION as MIG_002 } from './002-project-metadata'; import { MIGRATION as MIG_003 } from './003-lower-name-index'; import { MIGRATION as MIG_004 } from './004-centrality-churn'; import { MIGRATION as MIG_005 } from './005-symbol-issues'; +import { MIGRATION as MIG_006 } from './006-config-refs'; interface ModuleRef { /** @@ -52,6 +53,7 @@ const REGISTERED_MODULES: readonly ModuleRef[] = [ { filename: '003-lower-name-index.ts', module: MIG_003 }, { filename: '004-centrality-churn.ts', module: MIG_004 }, { filename: '005-symbol-issues.ts', module: MIG_005 }, + { filename: '006-config-refs.ts', module: MIG_006 }, ]; /** Strict 3-digit prefix on each migration filename. */ diff --git a/src/db/queries.ts b/src/db/queries.ts index af87a7b9..446116d2 100644 --- a/src/db/queries.ts +++ b/src/db/queries.ts @@ -1579,4 +1579,114 @@ export class QueryBuilder { commitSha: string; }>; } + + // =========================================================================== + // Config references (env vars / feature flags read sites) + // =========================================================================== + + applyConfigRefs( + rows: Array<{ + configKind: 'env'; + configKey: string; + sourceNodeId: string | null; + filePath: string; + line: number; + }> + ): void { + if (rows.length === 0) return; + const distinctFiles = new Set(rows.map((r) => r.filePath)); + const deleteStmt = this.db.prepare('DELETE FROM config_refs WHERE file_path = ?'); + const insertStmt = this.db.prepare( + `INSERT INTO config_refs (config_kind, config_key, source_node_id, file_path, line) + VALUES (?, ?, ?, ?, ?)` + ); + this.db.transaction(() => { + for (const f of distinctFiles) deleteStmt.run(f); + for (const r of rows) { + insertStmt.run(r.configKind, r.configKey, r.sourceNodeId, r.filePath, r.line); + } + 
})(); + } + + clearConfigRefs(): void { + this.db.exec('DELETE FROM config_refs'); + } + + deleteConfigRefsForPaths(filePaths: Iterable): void { + const stmt = this.db.prepare('DELETE FROM config_refs WHERE file_path = ?'); + this.db.transaction(() => { + for (const p of filePaths) stmt.run(p); + })(); + } + + pruneOrphanedConfigRefs(): void { + this.db.exec( + `DELETE FROM config_refs WHERE file_path NOT IN (SELECT path FROM files)` + ); + } + + getConfigKeys(opts: { configKind?: 'env'; limit?: number } = {}): Array<{ + configKey: string; + reads: number; + distinctFiles: number; + }> { + const limit = opts.limit ?? 200; + const where = opts.configKind ? 'WHERE config_kind = ?' : ''; + const params = opts.configKind ? [opts.configKind, limit] : [limit]; + return this.db + .prepare( + `SELECT config_key AS configKey, + COUNT(*) AS reads, + COUNT(DISTINCT file_path) AS distinctFiles + FROM config_refs + ${where} + GROUP BY config_key + ORDER BY reads DESC, config_key ASC + LIMIT ?` + ) + .all(...params) as Array<{ configKey: string; reads: number; distinctFiles: number }>; + } + + getConfigRefsByKey( + configKey: string, + opts: { configKind?: 'env' } = {} + ): Array<{ + filePath: string; + line: number; + sourceNodeId: string | null; + sourceName: string | null; + sourceKind: string | null; + }> { + const kind = opts.configKind ?? 'env'; + return this.db + .prepare( + `SELECT cr.file_path AS filePath, + cr.line AS line, + cr.source_node_id AS sourceNodeId, + n.name AS sourceName, + n.kind AS sourceKind + FROM config_refs cr + LEFT JOIN nodes n ON n.id = cr.source_node_id + WHERE cr.config_kind = ? AND cr.config_key = ? 
+ ORDER BY cr.file_path ASC, cr.line ASC` + ) + .all(kind, configKey) as Array<{ + filePath: string; + line: number; + sourceNodeId: string | null; + sourceName: string | null; + sourceKind: string | null; + }>; + } + + getConfigKeysForNode(nodeId: string): Array<{ configKey: string; line: number }> { + return this.db + .prepare( + `SELECT config_key AS configKey, line + FROM config_refs + WHERE source_node_id = ? + ORDER BY config_key ASC, line ASC` + ) + .all(nodeId) as Array<{ configKey: string; line: number }>; + } } diff --git a/src/db/schema.sql b/src/db/schema.sql index 4a1150dd..2f8b1ddc 100644 --- a/src/db/schema.sql +++ b/src/db/schema.sql @@ -171,3 +171,24 @@ CREATE TABLE IF NOT EXISTS symbol_issues ( ); CREATE INDEX IF NOT EXISTS idx_symbol_issues_node ON symbol_issues(node_id); CREATE INDEX IF NOT EXISTS idx_symbol_issues_issue ON symbol_issues(issue_number); + +-- Config references: read sites for env vars / feature flags / etc. +-- One row per syntactic occurrence in source. config_kind narrows to +-- 'env' (process.env, os.getenv, ...) for v1; future kinds add YAML +-- keys, LaunchDarkly flags, etc. source_node_id may be NULL for +-- top-level reads that aren't inside a function/method. 
+CREATE TABLE IF NOT EXISTS config_refs ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + config_kind TEXT NOT NULL, + config_key TEXT NOT NULL, + source_node_id TEXT, + file_path TEXT NOT NULL, + line INTEGER NOT NULL, + FOREIGN KEY (source_node_id) REFERENCES nodes(id) ON DELETE CASCADE +); +CREATE INDEX IF NOT EXISTS idx_config_refs_key + ON config_refs(config_kind, config_key); +CREATE INDEX IF NOT EXISTS idx_config_refs_node + ON config_refs(source_node_id); +CREATE INDEX IF NOT EXISTS idx_config_refs_file + ON config_refs(file_path); diff --git a/src/default-config.ts b/src/default-config.ts index a7ec0486..06302566 100644 --- a/src/default-config.ts +++ b/src/default-config.ts @@ -186,6 +186,7 @@ const baseConfig: CodeGraphConfig = { enableCentrality: true, enableChurn: true, enableIssueHistory: true, + enableConfigRefs: true, }; Object.defineProperty(baseConfig, 'include', { diff --git a/src/index-hooks/config-refs.ts b/src/index-hooks/config-refs.ts new file mode 100644 index 00000000..70f13ffa --- /dev/null +++ b/src/index-hooks/config-refs.ts @@ -0,0 +1,77 @@ +/** + * Config-refs index hook — extracts env-var / feature-flag read + * sites and persists to `config_refs`. Incremental on sync; full + * rescan on indexAll. See `src/config-refs/` for the extractor. 
+ */ + +import type { IndexHook, IndexHookContext } from './registry'; +import type { SyncResult } from '../extraction'; +import { extractConfigRefs } from '../config-refs'; +import { logDebug } from '../errors'; + +function refresh( + ctx: IndexHookContext, + options: { scope: 'all' } | { scope: 'files'; files: string[] } +): void { + if (ctx.config.enableConfigRefs === false) return; + try { + const fileNodes = new Map>(); + const resolveEnclosing = (filePath: string, line: number): string | null => { + let nodes = fileNodes.get(filePath); + if (!nodes) { + nodes = ctx.queries + .getNodesByFile(filePath) + .filter( + (n) => + n.kind === 'function' || + n.kind === 'method' || + n.kind === 'class' || + n.kind === 'interface' + ) + .map((n) => ({ id: n.id, start: n.startLine, end: n.endLine })) + .sort((a, b) => a.end - a.start - (b.end - b.start)); + fileNodes.set(filePath, nodes); + } + for (const n of nodes) { + if (n.start <= line && line <= n.end) return n.id; + } + return null; + }; + + let targets: Array<{ path: string; language: string }>; + if (options.scope === 'all') { + targets = ctx.queries.getAllFiles().map((f) => ({ + path: f.path, + language: f.language, + })); + ctx.queries.clearConfigRefs(); + } else { + const records = options.files + .map((p) => ctx.queries.getFileByPath(p)) + .filter((f): f is NonNullable => f != null); + targets = records.map((f) => ({ path: f.path, language: f.language })); + ctx.queries.pruneOrphanedConfigRefs(); + if (targets.length > 0) { + ctx.queries.deleteConfigRefsForPaths(targets.map((t) => t.path)); + } + } + + const refs = extractConfigRefs(ctx.projectRoot, targets, resolveEnclosing); + ctx.queries.applyConfigRefs(refs); + } catch (err) { + logDebug(`config-refs hook failed: ${err instanceof Error ? 
err.message : String(err)}`); + } +} + +export const HOOK: IndexHook = { + name: 'config-refs', + afterIndexAll(ctx) { refresh(ctx, { scope: 'all' }); }, + afterSync(ctx, result: SyncResult) { + if ( + (result.changedFilePaths && result.changedFilePaths.length > 0) || + result.filesRemoved > 0 + ) { + refresh(ctx, { scope: 'files', files: result.changedFilePaths ?? [] }); + } + }, +}; diff --git a/src/index-hooks/registry.ts b/src/index-hooks/registry.ts index 5a61e017..cd439e96 100644 --- a/src/index-hooks/registry.ts +++ b/src/index-hooks/registry.ts @@ -24,6 +24,7 @@ import { logDebug } from '../errors'; import { HOOK as CENTRALITY_HOOK } from './centrality'; import { HOOK as CHURN_HOOK } from './churn'; +import { HOOK as CONFIG_REFS_HOOK } from './config-refs'; import { HOOK as ISSUE_HISTORY_HOOK } from './issue-history'; /** @@ -37,6 +38,7 @@ import { HOOK as ISSUE_HISTORY_HOOK } from './issue-history'; const REGISTERED_HOOKS: readonly IndexHook[] = [ CENTRALITY_HOOK, CHURN_HOOK, + CONFIG_REFS_HOOK, ISSUE_HISTORY_HOOK, ]; diff --git a/src/index.ts b/src/index.ts index 7558993f..fa75464e 100644 --- a/src/index.ts +++ b/src/index.ts @@ -576,6 +576,23 @@ export class CodeGraph { return this.queries.getIssuesForNode(nodeId); } + getConfigKeys(opts: { configKind?: 'env'; limit?: number } = {}): ReturnType< + QueryBuilder['getConfigKeys'] + > { + return this.queries.getConfigKeys(opts); + } + + getConfigRefsByKey( + configKey: string, + opts: { configKind?: 'env' } = {} + ): ReturnType { + return this.queries.getConfigRefsByKey(configKey, opts); + } + + getConfigKeysForNode(nodeId: string): ReturnType { + return this.queries.getConfigKeysForNode(nodeId); + } + // =========================================================================== // File Watching // =========================================================================== diff --git a/src/mcp/tools.ts b/src/mcp/tools.ts index 8e5759e5..93846d68 100644 --- a/src/mcp/tools.ts +++ b/src/mcp/tools.ts @@ -816,6 
+816,51 @@ export class ToolHandler implements ToolHandlerLike { return this.textResult(this.truncateOutput(output)); } + /** + * Handle codegraph_config — env-var / config read-site queries. + */ + async handleConfig(args: Record): Promise { + const cg = this.getCodeGraph(args.projectPath as string | undefined); + const key = typeof args.key === 'string' ? args.key.trim() : ''; + + if (!key) { + const limit = args.limit != null ? clamp(args.limit as number, 1, 500) : 30; + const rows = cg.getConfigKeys({ configKind: 'env', limit }); + if (rows.length === 0) { + return this.textResult( + 'No config reads found. Either the index has no env-var read sites, or `enableConfigRefs` is disabled in config.' + ); + } + const lines: string[] = [ + `## Config keys read in this project (top ${rows.length})`, + '', + '| # | Key | Reads | Files |', + '|---|-----|------:|------:|', + ]; + rows.forEach((r, i) => { + lines.push(`| ${i + 1} | \`${r.configKey}\` | ${r.reads} | ${r.distinctFiles} |`); + }); + lines.push('', 'Pass `key` to a follow-up call to see exact read sites.'); + return this.textResult(this.truncateOutput(lines.join('\n'))); + } + + const sites = cg.getConfigRefsByKey(key, { configKind: 'env' }); + if (sites.length === 0) { + return this.textResult(`No reads found for env var "${key}".`); + } + const lines: string[] = [ + `## Reads of \`${key}\` (${sites.length} site${sites.length === 1 ? '' : 's'})`, + '', + ]; + for (const s of sites) { + const enclosing = s.sourceName + ? ` — ${s.sourceKind ?? 'symbol'} \`${s.sourceName}\`` + : ' — top-level'; + lines.push(`- \`${s.filePath}:${s.line}\`${enclosing}`); + } + return this.textResult(this.truncateOutput(lines.join('\n'))); + } + /** * Handle codegraph_hotspots — files ranked by risk = centrality × churn. 
*/ diff --git a/src/mcp/tools/config.ts b/src/mcp/tools/config.ts new file mode 100644 index 00000000..fa11a5e1 --- /dev/null +++ b/src/mcp/tools/config.ts @@ -0,0 +1,26 @@ +import { projectPathProperty } from '../tool-types'; +import type { ToolModule } from './types'; + +export const CONFIG_TOOL: ToolModule = { + definition: { + name: 'codegraph_config', + description: + "Surface environment-variable read sites across the codebase. Use to answer 'what reads OBSIDIAN_PORT?' or 'what config does this codebase read?'. Returns either (a) all distinct keys with read counts (no `key`), or (b) the precise read sites and their enclosing functions for a specific key. Beats grep because it skips comments/docs/tests-of-tests and attributes each hit to its enclosing function.", + inputSchema: { + type: 'object', + properties: { + key: { + type: 'string', + description: + 'Specific env var to look up (e.g. "OBSIDIAN_PORT"). If omitted, returns the top-N keys with read counts.', + }, + limit: { + type: 'number', + description: 'Max keys to return when no `key` is specified (default: 30).', + }, + projectPath: projectPathProperty, + }, + }, + }, + handlerKey: 'handleConfig', +}; diff --git a/src/mcp/tools/registry.ts b/src/mcp/tools/registry.ts index e729e44f..000c0972 100644 --- a/src/mcp/tools/registry.ts +++ b/src/mcp/tools/registry.ts @@ -20,6 +20,7 @@ import type { ToolModule } from './types'; import { CALLEES_TOOL } from './callees'; import { CALLERS_TOOL } from './callers'; +import { CONFIG_TOOL } from './config'; import { CONTEXT_TOOL } from './context'; import { EXPLORE_TOOL } from './explore'; import { FILES_TOOL } from './files'; @@ -32,6 +33,7 @@ import { STATUS_TOOL } from './status'; const ALL_TOOLS: readonly ToolModule[] = [ CALLEES_TOOL, CALLERS_TOOL, + CONFIG_TOOL, CONTEXT_TOOL, EXPLORE_TOOL, FILES_TOOL, diff --git a/src/mcp/tools/types.ts b/src/mcp/tools/types.ts index 372a1e1b..8b94a50b 100644 --- a/src/mcp/tools/types.ts +++ b/src/mcp/tools/types.ts @@ -32,7 
+32,8 @@ export type HandlerKey = | 'handleNode' | 'handleStatus' | 'handleFiles' - | 'handleHotspots'; + | 'handleHotspots' + | 'handleConfig'; /** * The minimum surface a `ToolHandler`-shaped object exposes for diff --git a/src/types.ts b/src/types.ts index 4ce51c0c..75531cab 100644 --- a/src/types.ts +++ b/src/types.ts @@ -518,6 +518,12 @@ export interface CodeGraphConfig { * non-GitHub repos or where issue refs are noisy. */ enableIssueHistory?: boolean; + + /** + * Extract env-var / feature-flag read sites into config_refs. + * Enabled by default. + */ + enableConfigRefs?: boolean; } // `DEFAULT_CONFIG` lives in `./default-config.ts` so its `include` From 7c3af0eb72044b1bf4ac9e21429491b94c3b753b Mon Sep 17 00:00:00 2001 From: andreinknv Date: Mon, 27 Apr 2026 18:00:38 -0400 Subject: [PATCH 9/9] feat: PR #115 (sql-refs) on top of refactors Extracts SQL string-literal references to tables (read/write/ddl) into sql_refs and exposes via codegraph_sql MCP tool. Lands as a registered IndexHook (sql-refs). - Migration 007: sql_refs table - src/sql-refs/ (pure module): regex extractor with comment strip + SQL-keyword pre-filter - src/index-hooks/sql-refs.ts (registered hook with full / files scoping; uses replaceAllSqlRefs for atomic indexAll swap) - CodeGraph public methods: getSqlTables, getSqlRefsByTable, getSqlTablesForNode - codegraph_sql MCP tool wired through ToolModule registry - enableSqlRefs flag default true - Removed defensive ensureSqlRefsTable guard + its test (same reason as #113 / #114: bug class is impossible under file-based migrations). Tests: 514/515 pass (1 watcher flake under load). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- __tests__/foundation.test.ts | 2 +- __tests__/mcp-tool-registry.test.ts | 1 + __tests__/pr19-improvements.test.ts | 2 +- __tests__/sql-refs.test.ts | 339 ++++++++++++++++++++++++++++ src/config.ts | 1 + src/db/migrations/007-sql-refs.ts | 24 ++ src/db/migrations/index.ts | 2 + src/db/queries.ts | 143 ++++++++++++ src/db/schema.sql | 21 ++ src/default-config.ts | 1 + src/index-hooks/registry.ts | 2 + src/index-hooks/sql-refs.ts | 76 +++++++ src/index.ts | 15 ++ src/mcp/tools.ts | 51 +++++ src/mcp/tools/registry.ts | 2 + src/mcp/tools/sql.ts | 32 +++ src/mcp/tools/types.ts | 3 +- src/sql-refs/index.ts | 252 +++++++++++++++++++++ src/types.ts | 6 + 19 files changed, 972 insertions(+), 3 deletions(-) create mode 100644 __tests__/sql-refs.test.ts create mode 100644 src/db/migrations/007-sql-refs.ts create mode 100644 src/index-hooks/sql-refs.ts create mode 100644 src/mcp/tools/sql.ts create mode 100644 src/sql-refs/index.ts diff --git a/__tests__/foundation.test.ts b/__tests__/foundation.test.ts index 805120b6..8b1620d9 100644 --- a/__tests__/foundation.test.ts +++ b/__tests__/foundation.test.ts @@ -305,7 +305,7 @@ describe('Database Connection', () => { const version = db.getSchemaVersion(); expect(version).not.toBeNull(); - expect(version?.version).toBe(6); + expect(version?.version).toBe(7); db.close(); }); diff --git a/__tests__/mcp-tool-registry.test.ts b/__tests__/mcp-tool-registry.test.ts index a956eec8..2da0efc5 100644 --- a/__tests__/mcp-tool-registry.test.ts +++ b/__tests__/mcp-tool-registry.test.ts @@ -49,6 +49,7 @@ describe('MCP tool registry — single source of truth', () => { 'codegraph_impact', 'codegraph_node', 'codegraph_search', + 'codegraph_sql', 'codegraph_status', ]; const actual = getToolModules() diff --git a/__tests__/pr19-improvements.test.ts b/__tests__/pr19-improvements.test.ts index 6768f256..5766b546 100644 --- a/__tests__/pr19-improvements.test.ts +++ b/__tests__/pr19-improvements.test.ts @@ 
-299,7 +299,7 @@ describe('Best-Candidate Resolution', () => { describe('Schema v2 Migration', () => { it.skipIf(!HAS_SQLITE)('should have correct current schema version', async () => { const { CURRENT_SCHEMA_VERSION } = await import('../src/db/migrations'); - expect(CURRENT_SCHEMA_VERSION).toBe(6); + expect(CURRENT_SCHEMA_VERSION).toBe(7); }); it.skipIf(!HAS_SQLITE)('should have migration for version 2', async () => { diff --git a/__tests__/sql-refs.test.ts b/__tests__/sql-refs.test.ts new file mode 100644 index 00000000..7fb201c7 --- /dev/null +++ b/__tests__/sql-refs.test.ts @@ -0,0 +1,339 @@ +/** + * SQL call-site tests: parser unit tests + end-to-end through CodeGraph. + */ +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import * as fs from 'fs'; +import * as os from 'os'; +import * as path from 'path'; +import { extractSqlRefs } from '../src/sql-refs'; +import CodeGraph from '../src/index'; + +let testDir: string; +let cg: CodeGraph | null = null; + +function write(rel: string, content: string) { + const abs = path.join(testDir, rel); + fs.mkdirSync(path.dirname(abs), { recursive: true }); + fs.writeFileSync(abs, content); +} + +beforeEach(() => { + testDir = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-sql-')); +}); + +afterEach(() => { + if (cg) { + cg.destroy(); + cg = null; + } + if (fs.existsSync(testDir)) fs.rmSync(testDir, { recursive: true, force: true }); +}); + +// ============================================================================ +// Pure parser tests +// ============================================================================ + +describe('extractSqlRefs', () => { + it('captures FROM as a read', () => { + write('a.ts', `db.prepare('SELECT id FROM users WHERE id = ?');\n`); + const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null); + expect(refs).toHaveLength(1); + expect(refs[0]!).toMatchObject({ tableName: 'users', op: 'read' }); + }); + + it('captures INSERT INTO as a write', 
() => {
+    write('a.ts', `db.prepare('INSERT INTO logs (msg) VALUES (?)');\n`);
+    const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null);
+    expect(refs).toHaveLength(1);
+    expect(refs[0]!).toMatchObject({ tableName: 'logs', op: 'write' });
+  });
+
+  it('captures UPDATE ... SET as a write', () => {
+    write('a.ts', `db.run('UPDATE users SET name = ? WHERE id = ?', ['x', 1]);\n`);
+    const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null);
+    expect(refs).toHaveLength(1);
+    expect(refs[0]!).toMatchObject({ tableName: 'users', op: 'write' });
+  });
+
+  it('captures DELETE FROM as both a write (DELETE) and a read (FROM)', () => {
+    write('a.ts', `db.run('DELETE FROM sessions WHERE expired_at < ?');\n`);
+    const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null);
+    // Both regexes (DELETE FROM as write, FROM as read) hit, so we expect
+    // two refs for the same table but different ops.
+    expect(refs.map((r) => r.op).sort()).toEqual(['read', 'write']);
+    expect(new Set(refs.map((r) => r.tableName))).toEqual(new Set(['sessions']));
+  });
+
+  it('captures CREATE TABLE / ALTER / DROP as ddl', () => {
+    write(
+      'a.ts',
+      [
+        `db.exec('CREATE TABLE IF NOT EXISTS audit (id INTEGER)');`,
+        `db.exec('ALTER TABLE audit ADD COLUMN ts INTEGER');`,
+        `db.exec('DROP TABLE IF EXISTS audit_old');`,
+      ].join('\n')
+    );
+    const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null);
+    const ddls = refs.filter((r) => r.op === 'ddl');
+    expect(new Set(ddls.map((r) => r.tableName))).toEqual(new Set(['audit', 'audit_old']));
+  });
+
+  it('captures JOIN as a read', () => {
+    write(
+      'a.ts',
+      `db.prepare('SELECT u.name, p.title FROM users u JOIN posts p ON p.user_id = u.id');\n`
+    );
+    const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null);
+    const tables = new Set(refs.map((r) => r.tableName));
+    expect(tables).toEqual(new 
Set(['users', 'posts'])); + }); + + it('handles backtick (MySQL) and double-quoted (Postgres) identifiers', () => { + write( + 'a.ts', + [ + "db.prepare('SELECT id FROM `mysql_table`');", + `db.prepare('SELECT id FROM "pg_table"');`, + ].join('\n') + ); + const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null); + expect(new Set(refs.map((r) => r.tableName))).toEqual( + new Set(['mysql_table', 'pg_table']) + ); + }); + + it('handles schema-qualified identifiers (drops the schema, keeps the table)', () => { + write('a.ts', `db.prepare('SELECT * FROM public.users');\n`); + const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null); + expect(refs[0]!.tableName).toBe('users'); + }); + + it('does NOT match a JS variable named like a SQL keyword', () => { + // Without the FROM/INTO/etc. prefix, a bare identifier `users` is + // not caught — that's the whole point vs. plain grep. + write('a.ts', `const users = await loadUsers();\nfor (const user of users) {}\n`); + const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null); + expect(refs).toEqual([]); + }); + + it('skips unsupported languages (e.g. 
swift) without error', () => { + write('a.swift', `let q = "SELECT id FROM users"\n`); + const refs = extractSqlRefs(testDir, [{ path: 'a.swift', language: 'swift' }], () => null); + expect(refs).toEqual([]); + }); + + it('captures the correct 1-indexed line number', () => { + write( + 'a.ts', + [`// blah`, `// blah`, `db.prepare('SELECT * FROM line_three');`, `// blah`].join('\n') + ); + const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null); + expect(refs[0]).toEqual(expect.objectContaining({ tableName: 'line_three', line: 3 })); + }); + + it('threads the resolveEnclosing closure correctly', () => { + write('a.ts', `db.prepare('SELECT * FROM t');\n`); + const calls: Array<[string, number]> = []; + extractSqlRefs( + testDir, + [{ path: 'a.ts', language: 'typescript' }], + (filePath, line) => { + calls.push([filePath, line]); + return 'fake-id'; + } + ); + expect(calls).toEqual([['a.ts', 1]]); + }); + + it('drops reserved-word "table names" (WHERE/ON/AS/SELECT)', () => { + // Common over-match: `JOIN ... ON x = y` would otherwise pick up + // `ON` as the table name. The reserved set blocks that. 
+ write('a.ts', `db.prepare('SELECT * FROM users JOIN posts ON posts.uid = users.id');\n`); + const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null); + const names = new Set(refs.map((r) => r.tableName)); + expect(names).toEqual(new Set(['users', 'posts'])); + }); + + it('handles multiple SQL operations on a single line', () => { + write( + 'a.ts', + `db.exec('CREATE TABLE foo (id INTEGER); INSERT INTO foo VALUES (1)');\n` + ); + const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null); + const ops = new Set(refs.map((r) => `${r.tableName}|${r.op}`)); + expect(ops).toEqual(new Set(['foo|ddl', 'foo|write'])); + }); + + it('survives a missing file (skips, no throw)', () => { + const refs = extractSqlRefs( + testDir, + [{ path: 'missing.ts', language: 'typescript' }], + () => null + ); + expect(refs).toEqual([]); + }); + + it('rejects prose comments containing a quoted SQL example', () => { + // Reviewer-flagged regression: a comment like + // // example: db.prepare('SELECT name FROM the docs') + // used to falsely match `the` as a table because the quote inside + // the comment passed isInsideString(). The comment-stripper now + // removes everything after `//` before the regex sees the line. 
+ write( + 'a.ts', + [ + `// example: db.prepare('SELECT name FROM the docs')`, + `// "SELECT id FROM the comment"`, + `function ok() {`, + ` // sample SELECT FROM users in a comment — should be ignored`, + ` return 1;`, + `}`, + ].join('\n') + ); + const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null); + expect(refs).toEqual([]); + }); + + it('rejects same-line block comments containing a quoted SQL example', () => { + write( + 'a.ts', + `/* "SELECT * FROM ghost" */ const x = 1;\n` + ); + const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null); + expect(refs).toEqual([]); + }); + + it('still keeps a real SQL call when there is a trailing comment', () => { + write('a.ts', `db.prepare('SELECT * FROM users'); // good doc\n`); + const refs = extractSqlRefs(testDir, [{ path: 'a.ts', language: 'typescript' }], () => null); + expect(refs.length).toBe(1); + expect(refs[0]!.tableName).toBe('users'); + }); + + it('strips Python `#` comments', () => { + write( + 'a.py', + `# example: db.execute('SELECT * FROM the_docs')\nrows = db.execute('SELECT * FROM real_table')\n` + ); + const refs = extractSqlRefs(testDir, [{ path: 'a.py', language: 'python' }], () => null); + expect(refs.map((r) => r.tableName)).toEqual(['real_table']); + }); +}); + +// ============================================================================ +// End-to-end through CodeGraph +// ============================================================================ + +describe('CodeGraph SQL refs', () => { + it('persists call sites and resolves enclosing function', async () => { + write( + 'src/db.ts', + [ + `export function getUser(id: number) {`, + ` return db.prepare('SELECT * FROM users WHERE id = ?').get(id);`, + `}`, + ``, + `export function logEvent(msg: string) {`, + ` db.prepare('INSERT INTO events (msg) VALUES (?)').run(msg);`, + `}`, + ].join('\n') + ); + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], 
exclude: [] } }); + await cg.indexAll(); + + const tables = cg.getSqlTables(); + expect(new Set(tables.map((t) => t.tableName))).toEqual(new Set(['users', 'events'])); + + const userSites = cg.getSqlRefsByTable('users'); + expect(userSites[0]!.sourceName).toBe('getUser'); + + const eventSites = cg.getSqlRefsByTable('events'); + expect(eventSites[0]!.sourceName).toBe('logEvent'); + expect(eventSites[0]!.op).toBe('write'); + }); + + it('reverse view: getSqlTablesForNode returns tables touched by a function', async () => { + write( + 'src/a.ts', + [ + `export function multiTouch() {`, + ` db.prepare('SELECT * FROM a').all();`, + ` db.prepare('INSERT INTO b VALUES (?)').run(1);`, + `}`, + ].join('\n') + ); + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + + const node = cg.getNodesInFile('src/a.ts').find((n) => n.name === 'multiTouch')!; + const touched = cg.getSqlTablesForNode(node.id); + const summary = touched.map((r) => `${r.tableName}|${r.op}`).sort(); + expect(summary).toEqual(['a|read', 'b|write']); + }); + + it('case-insensitive table lookup', async () => { + write('src/a.ts', `db.prepare('SELECT * FROM Users');\n`); + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + expect(cg.getSqlRefsByTable('users').length).toBe(1); + expect(cg.getSqlRefsByTable('USERS').length).toBe(1); + }); + + it('respects enableSqlRefs=false', async () => { + write('src/a.ts', `db.prepare('SELECT * FROM users');\n`); + cg = CodeGraph.initSync(testDir, { + config: { include: ['**/*.ts'], exclude: [], enableSqlRefs: false }, + }); + await cg.indexAll(); + expect(cg.getSqlTables()).toEqual([]); + }); + + it('incremental sync replaces refs for changed files only', async () => { + write('src/a.ts', `db.prepare('SELECT * FROM old_table');\n`); + write('src/b.ts', `db.prepare('SELECT * FROM stable_table');\n`); + cg = CodeGraph.initSync(testDir, { config: { include: 
['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + expect(new Set(cg.getSqlTables().map((t) => t.tableName))).toEqual( + new Set(['old_table', 'stable_table']) + ); + + write('src/a.ts', `db.prepare('SELECT * FROM new_table');\n`); + await cg.sync(); + + const tables = new Set(cg.getSqlTables().map((t) => t.tableName)); + expect(tables).toContain('new_table'); + expect(tables).toContain('stable_table'); + expect(tables).not.toContain('old_table'); + }); + + it('drops refs when a file is edited to remove its last SQL ref', async () => { + // Same regression as PR C — applySqlRefs([]) shouldn't leave + // stale rows. Pre-deleting the changed paths in runSqlRefsPass + // is the fix. + write('src/a.ts', `db.prepare('SELECT * FROM going_away');\n`); + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + expect(cg.getSqlTables().some((t) => t.tableName === 'going_away')).toBe(true); + + write('src/a.ts', `// no sql here anymore\nexport const x = 1;\n`); + await cg.sync(); + + expect(cg.getSqlTables().some((t) => t.tableName === 'going_away')).toBe(false); + }); + + it('drops refs for files removed between syncs', async () => { + write('src/a.ts', `db.prepare('SELECT * FROM gone_table');\n`); + cg = CodeGraph.initSync(testDir, { config: { include: ['**/*.ts'], exclude: [] } }); + await cg.indexAll(); + expect(cg.getSqlTables().some((t) => t.tableName === 'gone_table')).toBe(true); + + fs.unlinkSync(path.join(testDir, 'src/a.ts')); + await cg.sync(); + expect(cg.getSqlTables().some((t) => t.tableName === 'gone_table')).toBe(false); + }); + + // (Removed: a defensive test for the v4-migration-collision bug class. + // With file-based migrations (NNN-name.ts), two PRs claiming the same + // version produces a filesystem-level conflict, so the silent skip the + // defensive guard protected against can no longer happen.) 
+}); diff --git a/src/config.ts b/src/config.ts index 00adf9a5..f1d70250 100644 --- a/src/config.ts +++ b/src/config.ts @@ -132,6 +132,7 @@ function mergeConfig( enableChurn: overrides.enableChurn ?? defaults.enableChurn, enableIssueHistory: overrides.enableIssueHistory ?? defaults.enableIssueHistory, enableConfigRefs: overrides.enableConfigRefs ?? defaults.enableConfigRefs, + enableSqlRefs: overrides.enableSqlRefs ?? defaults.enableSqlRefs, }; } diff --git a/src/db/migrations/007-sql-refs.ts b/src/db/migrations/007-sql-refs.ts new file mode 100644 index 00000000..629d070f --- /dev/null +++ b/src/db/migrations/007-sql-refs.ts @@ -0,0 +1,24 @@ +import type { MigrationModule } from './types'; + +export const MIGRATION: MigrationModule = { + description: 'Add sql_refs table for SQL string-literal references to tables', + up: (db) => { + db.exec(` + CREATE TABLE IF NOT EXISTS sql_refs ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + table_name TEXT NOT NULL, + op TEXT NOT NULL CHECK (op IN ('read','write','ddl')), + source_node_id TEXT, + file_path TEXT NOT NULL, + line INTEGER NOT NULL, + FOREIGN KEY (source_node_id) REFERENCES nodes(id) ON DELETE CASCADE + ); + CREATE INDEX IF NOT EXISTS idx_sql_refs_table + ON sql_refs(lower(table_name)); + CREATE INDEX IF NOT EXISTS idx_sql_refs_node + ON sql_refs(source_node_id); + CREATE INDEX IF NOT EXISTS idx_sql_refs_file + ON sql_refs(file_path); + `); + }, +}; diff --git a/src/db/migrations/index.ts b/src/db/migrations/index.ts index 525fe2a2..2ad3b7ad 100644 --- a/src/db/migrations/index.ts +++ b/src/db/migrations/index.ts @@ -29,6 +29,7 @@ import { MIGRATION as MIG_003 } from './003-lower-name-index'; import { MIGRATION as MIG_004 } from './004-centrality-churn'; import { MIGRATION as MIG_005 } from './005-symbol-issues'; import { MIGRATION as MIG_006 } from './006-config-refs'; +import { MIGRATION as MIG_007 } from './007-sql-refs'; interface ModuleRef { /** @@ -54,6 +55,7 @@ const REGISTERED_MODULES: readonly ModuleRef[] = [ 
{ filename: '004-centrality-churn.ts', module: MIG_004 }, { filename: '005-symbol-issues.ts', module: MIG_005 }, { filename: '006-config-refs.ts', module: MIG_006 }, + { filename: '007-sql-refs.ts', module: MIG_007 }, ]; /** Strict 3-digit prefix on each migration filename. */ diff --git a/src/db/queries.ts b/src/db/queries.ts index 446116d2..acbf31b0 100644 --- a/src/db/queries.ts +++ b/src/db/queries.ts @@ -1689,4 +1689,147 @@ export class QueryBuilder { ) .all(nodeId) as Array<{ configKey: string; line: number }>; } + + // =========================================================================== + // SQL references (table-name string-literal refs from app code) + // =========================================================================== + + applySqlRefs( + rows: Array<{ + tableName: string; + op: 'read' | 'write' | 'ddl'; + sourceNodeId: string | null; + filePath: string; + line: number; + }> + ): void { + if (rows.length === 0) return; + const stmt = this.db.prepare( + `INSERT INTO sql_refs (table_name, op, source_node_id, file_path, line) + VALUES (?, ?, ?, ?, ?)` + ); + this.db.transaction(() => { + for (const r of rows) { + stmt.run(r.tableName, r.op, r.sourceNodeId, r.filePath, r.line); + } + })(); + } + + replaceAllSqlRefs( + rows: Array<{ + tableName: string; + op: 'read' | 'write' | 'ddl'; + sourceNodeId: string | null; + filePath: string; + line: number; + }> + ): void { + const insert = this.db.prepare( + `INSERT INTO sql_refs (table_name, op, source_node_id, file_path, line) + VALUES (?, ?, ?, ?, ?)` + ); + this.db.transaction(() => { + this.db.exec('DELETE FROM sql_refs'); + for (const r of rows) { + insert.run(r.tableName, r.op, r.sourceNodeId, r.filePath, r.line); + } + })(); + } + + deleteSqlRefsForPaths(filePaths: Iterable): void { + const stmt = this.db.prepare('DELETE FROM sql_refs WHERE file_path = ?'); + this.db.transaction(() => { + for (const p of filePaths) stmt.run(p); + })(); + } + + clearSqlRefs(): void { + this.db.exec('DELETE 
FROM sql_refs');
+  }
+
+  pruneOrphanedSqlRefs(): void {
+    this.db.exec(
+      `DELETE FROM sql_refs WHERE file_path NOT IN (SELECT path FROM files)`
+    );
+  }
+
+  getSqlTables(opts: { limit?: number } = {}): Array<{
+    tableName: string;
+    reads: number;
+    writes: number;
+    ddl: number;
+    total: number;
+  }> {
+    const limit = opts.limit ?? 100;
+    return this.db
+      .prepare(
+        `SELECT lower(table_name) AS tableName,
+                SUM(CASE WHEN op = 'read' THEN 1 ELSE 0 END) AS reads,
+                SUM(CASE WHEN op = 'write' THEN 1 ELSE 0 END) AS writes,
+                SUM(CASE WHEN op = 'ddl' THEN 1 ELSE 0 END) AS ddl,
+                COUNT(*) AS total
+           FROM sql_refs
+           GROUP BY lower(table_name)
+           ORDER BY total DESC, tableName ASC
+           LIMIT ?`
+      )
+      .all(limit) as Array<{
+      tableName: string;
+      reads: number;
+      writes: number;
+      ddl: number;
+      total: number;
+    }>;
+  }
+
+  getSqlRefsByTable(
+    tableName: string,
+    opts: { op?: 'read' | 'write' | 'ddl' } = {}
+  ): Array<{
+    op: 'read' | 'write' | 'ddl';
+    filePath: string;
+    line: number;
+    sourceNodeId: string | null;
+    sourceName: string | null;
+    sourceKind: string | null;
+  }> {
+    const params: Array<string> = [tableName.toLowerCase()];
+    let opFilter = '';
+    if (opts.op) {
+      opFilter = ' AND sr.op = ?';
+      params.push(opts.op);
+    }
+    return this.db
+      .prepare(
+        `SELECT sr.op AS op,
+                sr.file_path AS filePath,
+                sr.line AS line,
+                sr.source_node_id AS sourceNodeId,
+                n.name AS sourceName,
+                n.kind AS sourceKind
+           FROM sql_refs sr
+           LEFT JOIN nodes n ON n.id = sr.source_node_id
+           WHERE lower(sr.table_name) = ?${opFilter}
+           ORDER BY sr.file_path ASC, sr.line ASC`
+      )
+      .all(...params) as Array<{
+      op: 'read' | 'write' | 'ddl';
+      filePath: string;
+      line: number;
+      sourceNodeId: string | null;
+      sourceName: string | null;
+      sourceKind: string | null;
+    }>;
+  }
+
+  getSqlTablesForNode(nodeId: string): Array<{ tableName: string; op: string }> {
+    return this.db
+      .prepare(
+        `SELECT DISTINCT lower(table_name) AS tableName, op
+           FROM sql_refs
+           WHERE source_node_id = ?
+ ORDER BY tableName ASC, op ASC` + ) + .all(nodeId) as Array<{ tableName: string; op: string }>; + } } diff --git a/src/db/schema.sql b/src/db/schema.sql index 2f8b1ddc..4a78136b 100644 --- a/src/db/schema.sql +++ b/src/db/schema.sql @@ -192,3 +192,24 @@ CREATE INDEX IF NOT EXISTS idx_config_refs_node ON config_refs(source_node_id); CREATE INDEX IF NOT EXISTS idx_config_refs_file ON config_refs(file_path); + +-- SQL references: per-call-site links from app code to a table name. +-- One row per syntactic occurrence in source. op is 'read' (SELECT, +-- FROM in non-DDL), 'write' (INSERT/UPDATE/DELETE), or 'ddl' +-- (CREATE TABLE / ALTER TABLE / DROP TABLE -- rare in app code but +-- catches migration scripts). +CREATE TABLE IF NOT EXISTS sql_refs ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + table_name TEXT NOT NULL, + op TEXT NOT NULL CHECK (op IN ('read','write','ddl')), + source_node_id TEXT, + file_path TEXT NOT NULL, + line INTEGER NOT NULL, + FOREIGN KEY (source_node_id) REFERENCES nodes(id) ON DELETE CASCADE +); +CREATE INDEX IF NOT EXISTS idx_sql_refs_table + ON sql_refs(lower(table_name)); +CREATE INDEX IF NOT EXISTS idx_sql_refs_node + ON sql_refs(source_node_id); +CREATE INDEX IF NOT EXISTS idx_sql_refs_file + ON sql_refs(file_path); diff --git a/src/default-config.ts b/src/default-config.ts index 06302566..34769609 100644 --- a/src/default-config.ts +++ b/src/default-config.ts @@ -187,6 +187,7 @@ const baseConfig: CodeGraphConfig = { enableChurn: true, enableIssueHistory: true, enableConfigRefs: true, + enableSqlRefs: true, }; Object.defineProperty(baseConfig, 'include', { diff --git a/src/index-hooks/registry.ts b/src/index-hooks/registry.ts index cd439e96..f338a810 100644 --- a/src/index-hooks/registry.ts +++ b/src/index-hooks/registry.ts @@ -26,6 +26,7 @@ import { HOOK as CENTRALITY_HOOK } from './centrality'; import { HOOK as CHURN_HOOK } from './churn'; import { HOOK as CONFIG_REFS_HOOK } from './config-refs'; import { HOOK as ISSUE_HISTORY_HOOK } from 
'./issue-history';
+import { HOOK as SQL_REFS_HOOK } from './sql-refs';
 
 /**
  * Static-import list of every registered hook.
@@ -40,6 +41,7 @@ const REGISTERED_HOOKS: readonly IndexHook[] = [
   CHURN_HOOK,
   CONFIG_REFS_HOOK,
   ISSUE_HISTORY_HOOK,
+  SQL_REFS_HOOK,
 ];
 
 /**
diff --git a/src/index-hooks/sql-refs.ts b/src/index-hooks/sql-refs.ts
new file mode 100644
index 00000000..34cec42b
--- /dev/null
+++ b/src/index-hooks/sql-refs.ts
@@ -0,0 +1,76 @@
+/**
+ * SQL-refs index hook — extracts SQL string-literal references to
+ * tables (read/write/ddl) and persists to `sql_refs`. Incremental
+ * on sync; full atomic replace on indexAll. See `src/sql-refs/`.
+ */
+
+import type { IndexHook, IndexHookContext } from './registry';
+import type { SyncResult } from '../extraction';
+import { extractSqlRefs } from '../sql-refs';
+import { logDebug } from '../errors';
+
+function refresh(
+  ctx: IndexHookContext,
+  options: { scope: 'all' } | { scope: 'files'; files: string[] }
+): void {
+  if (ctx.config.enableSqlRefs === false) return;
+  try {
+    const fileNodes = new Map<string, Array<{ id: string; start: number; end: number }>>();
+    const resolveEnclosing = (filePath: string, line: number): string | null => {
+      let nodes = fileNodes.get(filePath);
+      if (!nodes) {
+        nodes = ctx.queries
+          .getNodesByFile(filePath)
+          .filter(
+            (n) =>
+              n.kind === 'function' ||
+              n.kind === 'method' ||
+              n.kind === 'class' ||
+              n.kind === 'interface'
+          )
+          .map((n) => ({ id: n.id, start: n.startLine, end: n.endLine }))
+          .sort((a, b) => a.end - a.start - (b.end - b.start));
+        fileNodes.set(filePath, nodes);
+      }
+      for (const n of nodes) {
+        if (n.start <= line && line <= n.end) return n.id;
+      }
+      return null;
+    };
+
+    if (options.scope === 'all') {
+      const targets = ctx.queries.getAllFiles().map((f) => ({
+        path: f.path,
+        language: f.language,
+      }));
+      const refs = extractSqlRefs(ctx.projectRoot, targets, resolveEnclosing);
+      ctx.queries.replaceAllSqlRefs(refs);
+    } else {
+      const records = options.files
+        .map((p) => ctx.queries.getFileByPath(p))
+        .filter((f): f is NonNullable<typeof f> => f != null);
+      const targets = records.map((f) => ({ path: f.path, language: f.language }));
+      ctx.queries.pruneOrphanedSqlRefs();
+      if (targets.length > 0) {
+        ctx.queries.deleteSqlRefsForPaths(targets.map((t) => t.path));
+      }
+      const refs = extractSqlRefs(ctx.projectRoot, targets, resolveEnclosing);
+      ctx.queries.applySqlRefs(refs);
+    }
+  } catch (err) {
+    logDebug(`sql-refs hook failed: ${err instanceof Error ? err.message : String(err)}`);
+  }
+}
+
+export const HOOK: IndexHook = {
+  name: 'sql-refs',
+  afterIndexAll(ctx) { refresh(ctx, { scope: 'all' }); },
+  afterSync(ctx, result: SyncResult) {
+    if (
+      (result.changedFilePaths && result.changedFilePaths.length > 0) ||
+      result.filesRemoved > 0
+    ) {
+      refresh(ctx, { scope: 'files', files: result.changedFilePaths ?? [] });
+    }
+  },
+};
diff --git a/src/index.ts b/src/index.ts
index fa75464e..b95ef38d 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -593,6 +593,21 @@ export class CodeGraph {
     return this.queries.getConfigKeysForNode(nodeId);
   }
 
+  getSqlTables(opts: { limit?: number } = {}): ReturnType<QueryBuilder['getSqlTables']> {
+    return this.queries.getSqlTables(opts);
+  }
+
+  getSqlRefsByTable(
+    tableName: string,
+    opts: { op?: 'read' | 'write' | 'ddl' } = {}
+  ): ReturnType<QueryBuilder['getSqlRefsByTable']> {
+    return this.queries.getSqlRefsByTable(tableName, opts);
+  }
+
+  getSqlTablesForNode(nodeId: string): ReturnType<QueryBuilder['getSqlTablesForNode']> {
+    return this.queries.getSqlTablesForNode(nodeId);
+  }
+
   // ===========================================================================
   // File Watching
   // ===========================================================================
diff --git a/src/mcp/tools.ts b/src/mcp/tools.ts
index 93846d68..e991702a 100644
--- a/src/mcp/tools.ts
+++ b/src/mcp/tools.ts
@@ -861,6 +861,57 @@ export class ToolHandler implements ToolHandlerLike {
     return this.textResult(this.truncateOutput(lines.join('\n')));
   }
 
+  /**
+   * Handle codegraph_sql — SQL call-site queries.
+ */ + async handleSql(args: Record): Promise { + const cg = this.getCodeGraph(args.projectPath as string | undefined); + const table = typeof args.table === 'string' ? args.table.trim() : ''; + const op = + args.op === 'read' || args.op === 'write' || args.op === 'ddl' + ? args.op + : undefined; + + if (!table) { + const limit = args.limit != null ? clamp(args.limit as number, 1, 500) : 30; + const rows = cg.getSqlTables({ limit }); + if (rows.length === 0) { + return this.textResult( + 'No SQL refs found. Either the index has no SQL string-literal call sites, or `enableSqlRefs` is disabled in config.' + ); + } + const lines: string[] = [ + `## SQL tables touched by this codebase (top ${rows.length})`, + '', + '| # | Table | Reads | Writes | DDL | Total |', + '|---|-------|------:|-------:|----:|------:|', + ]; + rows.forEach((r, i) => { + lines.push( + `| ${i + 1} | \`${r.tableName}\` | ${r.reads} | ${r.writes} | ${r.ddl} | ${r.total} |` + ); + }); + lines.push('', 'Pass `table` to a follow-up call to see exact call sites.'); + return this.textResult(this.truncateOutput(lines.join('\n'))); + } + + const sites = cg.getSqlRefsByTable(table, op ? { op } : {}); + if (sites.length === 0) { + return this.textResult(`No SQL refs found for table "${table}"${op ? ` (op=${op})` : ''}.`); + } + const lines: string[] = [ + `## Call sites for \`${table}\`${op ? ` (op=${op})` : ''} — ${sites.length} site${sites.length === 1 ? '' : 's'}`, + '', + ]; + for (const s of sites) { + const enclosing = s.sourceName + ? ` — ${s.sourceKind ?? 'symbol'} \`${s.sourceName}\`` + : ' — top-level'; + lines.push(`- [${s.op}] \`${s.filePath}:${s.line}\`${enclosing}`); + } + return this.textResult(this.truncateOutput(lines.join('\n'))); + } + /** * Handle codegraph_hotspots — files ranked by risk = centrality × churn. 
*/ diff --git a/src/mcp/tools/registry.ts b/src/mcp/tools/registry.ts index 000c0972..a5f1a9cd 100644 --- a/src/mcp/tools/registry.ts +++ b/src/mcp/tools/registry.ts @@ -28,6 +28,7 @@ import { HOTSPOTS_TOOL } from './hotspots'; import { IMPACT_TOOL } from './impact'; import { NODE_TOOL } from './node'; import { SEARCH_TOOL } from './search'; +import { SQL_TOOL } from './sql'; import { STATUS_TOOL } from './status'; const ALL_TOOLS: readonly ToolModule[] = [ @@ -41,6 +42,7 @@ const ALL_TOOLS: readonly ToolModule[] = [ IMPACT_TOOL, NODE_TOOL, SEARCH_TOOL, + SQL_TOOL, STATUS_TOOL, ]; diff --git a/src/mcp/tools/sql.ts b/src/mcp/tools/sql.ts new file mode 100644 index 00000000..1f90ffe2 --- /dev/null +++ b/src/mcp/tools/sql.ts @@ -0,0 +1,32 @@ +import { projectPathProperty } from '../tool-types'; +import type { ToolModule } from './types'; + +export const SQL_TOOL: ToolModule = { + definition: { + name: 'codegraph_sql', + description: + "Surface SQL string-literal references to tables across the codebase. Use to answer 'what code touches the users table?' or 'what tables does this codebase access?'. Returns either (a) the top-N distinct tables with read/write counts (no `table`), or (b) the precise read sites and their enclosing functions for a specific table. Beats grep because it requires a SQL keyword prefix (FROM/JOIN/INTO/UPDATE/DELETE), filtering out non-SQL uses of the same identifier.", + inputSchema: { + type: 'object', + properties: { + table: { + type: 'string', + description: + 'Specific table to look up (e.g. "users"). Case-insensitive. If omitted, returns the top-N tables with read/write counts.', + }, + op: { + type: 'string', + enum: ['read', 'write', 'ddl'], + description: + 'Filter to one operation kind: read (SELECT/JOIN), write (INSERT/UPDATE/DELETE), or ddl (CREATE/ALTER/DROP). 
Only meaningful with `table`.', + }, + limit: { + type: 'number', + description: 'Max tables to return when no `table` is specified (default: 30).', + }, + projectPath: projectPathProperty, + }, + }, + }, + handlerKey: 'handleSql', +}; diff --git a/src/mcp/tools/types.ts b/src/mcp/tools/types.ts index 8b94a50b..8b4ef015 100644 --- a/src/mcp/tools/types.ts +++ b/src/mcp/tools/types.ts @@ -33,7 +33,8 @@ export type HandlerKey = | 'handleStatus' | 'handleFiles' | 'handleHotspots' - | 'handleConfig'; + | 'handleConfig' + | 'handleSql'; /** * The minimum surface a `ToolHandler`-shaped object exposes for diff --git a/src/sql-refs/index.ts b/src/sql-refs/index.ts new file mode 100644 index 00000000..91b58d9d --- /dev/null +++ b/src/sql-refs/index.ts @@ -0,0 +1,252 @@ +/** + * SQL call-site extraction + * + * Scans indexed source files for SQL string-literal patterns (FROM, + * JOIN, INTO, UPDATE, DELETE FROM, CREATE TABLE) and records each + * (table, op) pair as a row in `sql_refs`. Each row links to its + * enclosing function via line-range lookup against the existing + * nodes table, so an agent asking "what code touches the users + * table?" gets a list of real functions, not a grep wall. + * + * Why a separate table, not graph nodes/edges: tables aren't + * declared in code that the existing extractors parse — they live + * in `.sql` migration files. Once #95 (SQL language extractor) + * merges, `table_name` can be joined against indexed SQL DDL nodes + * for cross-language navigation. This PR ships the call-site + * detection now so the agent-useful queries already work; full + * graph integration follows when the prerequisite lands. + * + * Spike validation (codegraph indexing itself): 87 SQL call sites + * across the 8 tables defined in `src/db/schema.sql`, each + * attributed to its enclosing QueryBuilder method. 
Beats grep + * because grep matches `const nodes = ...` (a JS variable named + * `nodes`) too — this regex requires the SQL keyword prefix + * (FROM/INTO/UPDATE/JOIN), eliminating that class of false positive. + * + * V1 scope: table-level only. Column extraction (`SELECT email FROM + * users` → `users.email`) is best-effort and deferred until #95 + * provides reliable column-name DDL nodes to join against. + */ + +import * as fs from 'fs'; +import * as path from 'path'; +import { logDebug } from '../errors'; + +export type SqlOp = 'read' | 'write' | 'ddl'; + +export interface SqlRef { + tableName: string; + op: SqlOp; + /** Indexed-symbol id for the enclosing function/method. NULL = top-level. */ + sourceNodeId: string | null; + filePath: string; + line: number; +} + +/** + * Languages we scan. Anything not in this set is skipped — most + * non-source files have no SQL to find. SQL files themselves are + * skipped here because #95 will own DDL extraction. + */ +const SUPPORTED_LANGUAGES = new Set([ + 'typescript', + 'javascript', + 'tsx', + 'jsx', + 'python', + 'go', + 'rust', + 'java', + 'kotlin', + 'csharp', + 'php', + 'ruby', +]); + +/** + * SQL identifier regex. Allows simple unquoted identifiers and + * double-quoted (Postgres) or backtick-quoted (MySQL) identifiers, + * with optional schema-qualifier prefix (`public.users`, + * `"public"."users"`). For v1 we record only the *table* part — + * schema goes into a future column when we have join targets. + */ +const IDENT = '(?:`([^`]+)`|"([^"]+)"|([A-Za-z_][\\w]*))'; + +interface PatternDef { + /** Capture group containing the table name (1, 2, or 3 in IDENT). */ + re: RegExp; + op: SqlOp; +} + +/** + * SQL keyword + identifier patterns. `i` flag makes them case- + * insensitive; `g` is required for `exec` loops to advance through + * multiple matches per line. + * + * Each regex captures the table name in groups 1/2/3 (backtick / + * double-quote / unquoted) — at most one is set per match. 
+ */ +const PATTERNS: PatternDef[] = [ + // SELECT ... FROM
+ // FROM appears in SELECT and DELETE statements; we tag it 'read' here + // and let DELETE's own regex below tag it 'write'. Per-line dedup is + // keyed by (table, op) in a Set, so the DELETE match produces a + // separate write row alongside this read row. + { re: new RegExp(`\\bFROM\\s+(?:[A-Za-z_]\\w*\\s*\\.\\s*)?${IDENT}`, 'gi'), op: 'read' }, + { re: new RegExp(`\\bJOIN\\s+(?:[A-Za-z_]\\w*\\s*\\.\\s*)?${IDENT}`, 'gi'), op: 'read' }, + // INSERT INTO
+ { re: new RegExp(`\\bINSERT\\s+INTO\\s+(?:[A-Za-z_]\\w*\\s*\\.\\s*)?${IDENT}`, 'gi'), op: 'write' }, + // UPDATE
... SET + { re: new RegExp(`\\bUPDATE\\s+(?:[A-Za-z_]\\w*\\s*\\.\\s*)?${IDENT}\\s+SET\\b`, 'gi'), op: 'write' }, + // DELETE FROM
+ { re: new RegExp(`\\bDELETE\\s+FROM\\s+(?:[A-Za-z_]\\w*\\s*\\.\\s*)?${IDENT}`, 'gi'), op: 'write' }, + // CREATE TABLE [IF NOT EXISTS]
+ { re: new RegExp(`\\bCREATE\\s+(?:TEMP(?:ORARY)?\\s+)?TABLE\\s+(?:IF\\s+NOT\\s+EXISTS\\s+)?(?:[A-Za-z_]\\w*\\s*\\.\\s*)?${IDENT}`, 'gi'), op: 'ddl' }, + // ALTER TABLE / DROP TABLE + { re: new RegExp(`\\bALTER\\s+TABLE\\s+(?:[A-Za-z_]\\w*\\s*\\.\\s*)?${IDENT}`, 'gi'), op: 'ddl' }, + { re: new RegExp(`\\bDROP\\s+TABLE\\s+(?:IF\\s+EXISTS\\s+)?(?:[A-Za-z_]\\w*\\s*\\.\\s*)?${IDENT}`, 'gi'), op: 'ddl' }, +]; + +/** + * Identifier names we drop because they're SQL keywords or noise + * that the regex over-matches on: + * - `WHERE` / `ON` / `GROUP` after `JOIN` (chained JOIN clauses) + * - `AS`/`USING` aliasing + * - `SELECT` / `INTO` (CTE-shaped or `SELECT ... INTO`) + */ +const RESERVED_TABLE_NAMES = new Set([ + 'where', 'on', 'group', 'order', 'limit', 'using', 'as', + 'select', 'into', 'values', 'set', 'and', 'or', 'not', + 'null', 'true', 'false', +]); + +/** + * Resolver supplied by caller: (filePath, line) → enclosing nodeId. + * Returns null when the read is at the file's top level. + */ +export type EnclosingNodeResolver = (filePath: string, line: number) => string | null; + +export interface FileTarget { + path: string; + language: string; +} + +/** + * Strip line and same-line block comments before SQL detection. + * + * Without this, a line like + * // example: db.prepare('SELECT name FROM the docs') + * passes the prose-rejection (it has a quote AND a SQL verb) and + * extracts `the` as a "table name". The comment is the actual + * problem — strip it first. + * + * Naive split on `//` / `#` is acceptable: SQL syntax doesn't use + * either as operators, so truncating SQL after a `//` inside a + * string is implausible (SQL line comments are `--`). Block + * comments on a single line (`/* ... *\/`) are stripped via + * regex; multi-line block comments are a documented v1 miss. + */ +function stripComments(line: string, language: string): string { + // Same-line block comments first (works for C-family languages). 
+ let stripped = line.replace(/\/\*[\s\S]*?\*\//g, ''); + if (language === 'python' || language === 'ruby') { + const idx = stripped.indexOf('#'); + if (idx >= 0) stripped = stripped.slice(0, idx); + } else { + const idx = stripped.indexOf('//'); + if (idx >= 0) stripped = stripped.slice(0, idx); + } + return stripped; +} + +/** + * Pre-filter: line (with comments stripped) must contain a quote + * (so it's plausibly a string literal) AND a SQL verb. Anchoring on + * a verb is critical — without it, prose like + * const note = "get the value from the array"; + * pollutes results because `from the` matches our `FROM
` + * regex. Requiring `SELECT|INSERT|UPDATE|...` on the same line + * filters those out. + */ +function lineLooksLikeSql(line: string): boolean { + if (!/['"`]/.test(line)) return false; + return /\b(?:SELECT|INSERT|UPDATE|DELETE|CREATE|ALTER|DROP|TRUNCATE)\b/i.test(line); +} + +/** + * Sanity check: the captured `FROM
` (or similar) should be + * inside a string literal, not in a comment. Approximated by + * requiring a quote (`'`, `"`, `` ` ``) somewhere before the match + * position on the same line. Doesn't handle multi-line template + * literals where the open-quote is on a previous line — that's a v1 + * acceptable miss. + */ +function isInsideString(line: string, matchIndex: number): boolean { + const prefix = line.slice(0, matchIndex); + return /['"`]/.test(prefix); +} + +/** + * Pull the table name out of a regex match. Exactly one of the + * three identifier capture groups is set per IDENT alternation. + */ +function extractTableName(m: RegExpExecArray): string | null { + const name = m[1] ?? m[2] ?? m[3]; + if (!name) return null; + if (RESERVED_TABLE_NAMES.has(name.toLowerCase())) return null; + return name; +} + +/** + * Scan a list of (path, language) targets and return all SQL refs + * found. Pure I/O + regex; the caller owns DB writes via + * `applySqlRefs`. + */ +export function extractSqlRefs( + rootDir: string, + targets: Iterable, + resolveEnclosing: EnclosingNodeResolver +): SqlRef[] { + const refs: SqlRef[] = []; + for (const t of targets) { + if (!SUPPORTED_LANGUAGES.has(t.language)) continue; + let src: string; + try { + src = fs.readFileSync(path.join(rootDir, t.path), 'utf8'); + } catch (err) { + logDebug(`extractSqlRefs: read failed for ${t.path}: ${err instanceof Error ? err.message : String(err)}`); + continue; + } + const lines = src.split('\n'); + for (let i = 0; i < lines.length; i++) { + const rawLine = lines[i]!; + const line = stripComments(rawLine, t.language); + if (!lineLooksLikeSql(line)) continue; + const lineNo = i + 1; + // Per-line dedup: if the same (table, op) appears twice via + // overlapping regex (e.g. `FROM` and `JOIN` in one line for + // different tables, but the same table doesn't double-record). 
+ const seen = new Set(); + for (const pat of PATTERNS) { + pat.re.lastIndex = 0; + let m: RegExpExecArray | null; + while ((m = pat.re.exec(line)) !== null) { + if (!isInsideString(line, m.index)) continue; + const name = extractTableName(m); + if (!name) continue; + const key = `${name.toLowerCase()}|${pat.op}`; + if (seen.has(key)) continue; + seen.add(key); + refs.push({ + tableName: name, + op: pat.op, + sourceNodeId: resolveEnclosing(t.path, lineNo), + filePath: t.path, + line: lineNo, + }); + } + } + } + } + return refs; +} diff --git a/src/types.ts b/src/types.ts index 75531cab..89c6c820 100644 --- a/src/types.ts +++ b/src/types.ts @@ -524,6 +524,12 @@ export interface CodeGraphConfig { * Enabled by default. */ enableConfigRefs?: boolean; + + /** + * Extract SQL string-literal references (table reads/writes/DDL) + * into sql_refs. Enabled by default. + */ + enableSqlRefs?: boolean; } // `DEFAULT_CONFIG` lives in `./default-config.ts` so its `include`