From 0a3b32adc6c8db4b0be746757cb49a6f03202d0b Mon Sep 17 00:00:00 2001 From: andreinknv Date: Mon, 27 Apr 2026 16:27:57 -0400 Subject: [PATCH 1/2] =?UTF-8?q?refactor:=20per-language=20registry=20?= =?UTF-8?q?=E2=80=94=20eliminate=20cross-PR=20conflict=20surface?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adding a new language used to require coordinated edits to 6 shared lists across 4 files (Language union in types.ts; DEFAULT_CONFIG.include; WASM_GRAMMAR_FILES, EXTENSION_MAP, and getLanguageDisplayName in grammars.ts; EXTRACTORS map in languages/index.ts). Two PRs adding different languages typically conflicted on every one of those. After this refactor, adding a new language is: 1. Drop a file at src/extraction/languages/<lang>.ts exporting an <LANG>_DEF: LanguageDef constant. 2. Add ONE import line and ONE array entry to src/extraction/languages/registry.ts (alphabetical position — conflicts between adjacent additions are still possible but rare). That's it. grammars.ts, types.ts, tree-sitter.ts dispatch, and the default include globs are all derived from the registry. ## What's in a LanguageDef ```ts interface LanguageDef { name: string; // canonical id displayName: string; // "Pascal / Delphi" extensions: readonly string[]; // ['.pas', '.dpr', ...] includeGlobs: readonly string[]; grammar?: { wasmFile, vendored?, extractor }; // tree-sitter customExtractor?: (fp, src) => ExtractionResult; // Liquid, Svelte extensionOverrides?: { '.dfm': { customExtractor } }; // Pascal forms } ``` Each existing language file now exports both its `xxxExtractor` (unchanged) AND a new `XXX_DEF`. New files were added for tsx, jsx, svelte, liquid (the latter two wrap their existing custom extractor classes via the customExtractor field). ## Refactored consumers - src/extraction/grammars.ts: WASM_GRAMMAR_FILES removed (was internal-only); EXTENSION_MAP now a Proxy that lazy-builds from the registry on first access (avoids TDZ in cyclic load paths). 
loadGrammarsForLanguages, isLanguageSupported, isGrammarLoaded, getSupportedLanguages, getLanguageDisplayName, detectLanguage — all read from registry. - src/extraction/tree-sitter.ts: extractFromSource's if-chain (svelte / liquid / pascal+.dfm/.fmx) replaced with one lookup: def.extensionOverrides[ext]?.customExtractor || def.customExtractor. Drops direct imports of LiquidExtractor, SvelteExtractor, DfmExtractor. - src/types.ts: DEFAULT_CONFIG moved to src/default-config.ts (cycle break). types.ts re-exports for backward compat. The `include` array is now built lazily from each LanguageDef's includeGlobs. ## What still requires a one-line edit The Language string union in types.ts still hard-codes the known languages (typescript | javascript | … | unknown). New languages added to the registry work at runtime as strings, but adding the literal here is required IF the resolver wants to do exhaustive narrowing on the new language (resolution/index.ts and resolution/import-resolver.ts have a few `language === 'X'` branches). Most new languages don't need such branches. This trade-off keeps strict narrowing for the existing handful of language-specific code paths while making everything else registry-driven. ## Tests 380/380 pass. No new tests; behavior is identical. Existing extraction.test.ts and pr19-improvements.test.ts heavily exercise detectLanguage, isLanguageSupported, getSupportedLanguages, and loadAllGrammars — all green. ## Follow-ups (out of scope) - Auto-discovery in registry.ts via fs.readdirSync — works in built dist/ but vite-node doesn't support extensionless require() of TS source. A small build-time generator could remove the static import list entirely. - Splitting __tests__/extraction.test.ts into per-language test files — eliminates the test-end-of-file conflict surface that every language PR currently hits. 
- Similar registry refactors for: - MCP tool definitions (each tool self-registers; no shared tools[] array or case-switch in execute()) - Migration files (each migration in src/db/migrations/NNN-*.ts; auto-discovered by version) - Index/sync hooks (centrality, churn, issue-history, config-refs, sql-refs, cochange all currently mutate CodeGraph.indexAll/sync; an IndexHook interface would make each pass self-contained) Co-Authored-By: Claude Opus 4.7 (1M context) --- src/default-config.ts | 194 +++++++++++++++++++++++ src/extraction/grammars.ts | 204 +++++++++++------------- src/extraction/languages/c-cpp.ts | 18 +++ src/extraction/languages/csharp.ts | 9 ++ src/extraction/languages/dart.ts | 9 ++ src/extraction/languages/go.ts | 9 ++ src/extraction/languages/java.ts | 9 ++ src/extraction/languages/javascript.ts | 9 ++ src/extraction/languages/jsx.ts | 14 ++ src/extraction/languages/kotlin.ts | 9 ++ src/extraction/languages/liquid.ts | 16 ++ src/extraction/languages/pascal.ts | 27 ++++ src/extraction/languages/php.ts | 9 ++ src/extraction/languages/python.ts | 9 ++ src/extraction/languages/registry.ts | 102 ++++++++++++ src/extraction/languages/ruby.ts | 9 ++ src/extraction/languages/rust.ts | 9 ++ src/extraction/languages/svelte.ts | 15 ++ src/extraction/languages/swift.ts | 9 ++ src/extraction/languages/tsx.ts | 14 ++ src/extraction/languages/types.ts | 83 ++++++++++ src/extraction/languages/typescript.ts | 9 ++ src/extraction/tree-sitter.ts | 31 ++-- src/types.ts | 205 +------------------------ 24 files changed, 697 insertions(+), 334 deletions(-) create mode 100644 src/default-config.ts create mode 100644 src/extraction/languages/jsx.ts create mode 100644 src/extraction/languages/liquid.ts create mode 100644 src/extraction/languages/registry.ts create mode 100644 src/extraction/languages/svelte.ts create mode 100644 src/extraction/languages/tsx.ts create mode 100644 src/extraction/languages/types.ts diff --git a/src/default-config.ts b/src/default-config.ts new 
file mode 100644 index 00000000..5c59179c --- /dev/null +++ b/src/default-config.ts @@ -0,0 +1,194 @@ +/** + * Default project configuration. + * + * Lives in its own file (separate from `types.ts`) because the + * `include` glob list is derived from the language registry — and + * the registry transitively imports `types.ts` via per-language + * files, which would create an evaluation cycle if `default-config` + * were itself imported by `types.ts` eagerly. + * + * **Lazy include resolution.** The `include` array is built on + * first access via a property getter, not at module load. By the + * time anything reads `DEFAULT_CONFIG.include`, the registry has + * fully evaluated, so all language definitions are available. + */ + +import type { CodeGraphConfig } from './types'; +import { getLanguageDefs } from './extraction/languages/registry'; + +let _includeCache: string[] | null = null; +function buildIncludeGlobs(): string[] { + if (_includeCache) return _includeCache; + const seen = new Set(); + const out: string[] = []; + for (const def of getLanguageDefs()) { + for (const glob of def.includeGlobs) { + if (seen.has(glob)) continue; + seen.add(glob); + out.push(glob); + } + } + _includeCache = out; + return out; +} + +const baseConfig: CodeGraphConfig = { + version: 1, + rootDir: '.', + include: [], // populated lazily via the getter below + exclude: [ + // Version control + '**/.git/**', + + // Dependencies + '**/node_modules/**', + '**/vendor/**', + '**/Pods/**', + + // Generic build outputs + '**/dist/**', + '**/build/**', + '**/out/**', + '**/bin/**', + '**/obj/**', + '**/target/**', + + // JavaScript/TypeScript + '**/*.min.js', + '**/*.bundle.js', + '**/.next/**', + '**/.nuxt/**', + '**/.svelte-kit/**', + '**/.output/**', + '**/.turbo/**', + '**/.cache/**', + '**/.parcel-cache/**', + '**/.vite/**', + '**/.astro/**', + '**/.docusaurus/**', + '**/.gatsby/**', + '**/.webpack/**', + '**/.nx/**', + '**/.yarn/cache/**', + '**/.pnpm-store/**', + 
'**/storybook-static/**', + + // React Native / Expo + '**/.expo/**', + '**/web-build/**', + '**/ios/Pods/**', + '**/ios/build/**', + '**/android/build/**', + '**/android/.gradle/**', + + // Python + '**/__pycache__/**', + '**/.venv/**', + '**/venv/**', + '**/site-packages/**', + '**/dist-packages/**', + '**/.pytest_cache/**', + '**/.mypy_cache/**', + '**/.ruff_cache/**', + '**/.tox/**', + '**/.nox/**', + '**/*.egg-info/**', + '**/.eggs/**', + + // Go + '**/go/pkg/mod/**', + + // Rust + '**/target/debug/**', + '**/target/release/**', + + // Java/Kotlin/Gradle + '**/.gradle/**', + '**/.m2/**', + '**/generated-sources/**', + '**/.kotlin/**', + + // Dart/Flutter + '**/.dart_tool/**', + + // C#/.NET + '**/.vs/**', + '**/.nuget/**', + '**/artifacts/**', + '**/publish/**', + + // C/C++ + '**/cmake-build-*/**', + '**/CMakeFiles/**', + '**/bazel-*/**', + '**/vcpkg_installed/**', + '**/.conan/**', + '**/Debug/**', + '**/Release/**', + '**/x64/**', + '**/.pio/**', // Platform.io (IoT/embedded build artifacts and library deps) + + // Electron + '**/release/**', + '**/*.app/**', + '**/*.asar', + + // Swift/iOS/Xcode + '**/DerivedData/**', + '**/.build/**', + '**/.swiftpm/**', + '**/xcuserdata/**', + '**/Carthage/Build/**', + '**/SourcePackages/**', + + // Delphi/Pascal + '**/__history/**', + '**/__recovery/**', + '**/*.dcu', + + // PHP + '**/.composer/**', + '**/storage/framework/**', + '**/bootstrap/cache/**', + + // Ruby + '**/.bundle/**', + '**/tmp/cache/**', + '**/public/assets/**', + '**/public/packs/**', + '**/.yardoc/**', + + // Testing/Coverage + '**/coverage/**', + '**/htmlcov/**', + '**/.nyc_output/**', + '**/test-results/**', + '**/.coverage/**', + + // IDE/Editor + '**/.idea/**', + + // Logs and temp + '**/logs/**', + '**/tmp/**', + '**/temp/**', + + // Documentation build output + '**/_build/**', + '**/docs/_build/**', + '**/site/**', + ], + languages: [], + frameworks: [], + maxFileSize: 1024 * 1024, // 1MB + extractDocstrings: true, + trackCallSites: true, +}; + 
+Object.defineProperty(baseConfig, 'include', { + get: () => buildIncludeGlobs(), + enumerable: true, + configurable: true, +}); + +export const DEFAULT_CONFIG: CodeGraphConfig = baseConfig; diff --git a/src/extraction/grammars.ts b/src/extraction/grammars.ts index df264fb3..5c2aec09 100644 --- a/src/extraction/grammars.ts +++ b/src/extraction/grammars.ts @@ -4,77 +4,63 @@ * Uses web-tree-sitter (WASM) for universal cross-platform support. * Grammars are loaded lazily — only languages actually present in the project * are compiled, keeping V8 WASM memory pressure low on large codebases. + * + * As of the language-registry refactor, all per-language metadata + * (WASM filenames, file extensions, display names, vendored flag) + * lives in `./languages/<lang>.ts` and is auto-collected by + * `./languages/registry.ts`. The constants exported here + * (`EXTENSION_MAP`, `getSupportedLanguages`, `getLanguageDisplayName`) + * remain for backward compat but are derived from the registry. */ import * as path from 'path'; import { Parser, Language as WasmLanguage } from 'web-tree-sitter'; import { Language } from '../types'; +import { getLanguageDefs, getLanguageDefByExtension, getLanguageDefByName } from './languages/registry'; export type GrammarLanguage = Exclude; /** - * WASM filename map — maps each language to its .wasm grammar file - * in the tree-sitter-wasms package. + * File extension → Language mapping, computed lazily on first read. + * + * Cannot be a top-level IIFE: the registry transitively pulls in + * `tree-sitter.ts` (via custom-extractor language defs), which + * imports this file — building the map at module load would TDZ + * against `ALL_DEFS` in the registry. Use the `getExtensionMap()` + * function for an explicit lazy entry point, or read + * `EXTENSION_MAP` (a Proxy that materialises on first property + * access). 
*/ -const WASM_GRAMMAR_FILES: Record = { - typescript: 'tree-sitter-typescript.wasm', - tsx: 'tree-sitter-tsx.wasm', - javascript: 'tree-sitter-javascript.wasm', - jsx: 'tree-sitter-javascript.wasm', - python: 'tree-sitter-python.wasm', - go: 'tree-sitter-go.wasm', - rust: 'tree-sitter-rust.wasm', - java: 'tree-sitter-java.wasm', - c: 'tree-sitter-c.wasm', - cpp: 'tree-sitter-cpp.wasm', - csharp: 'tree-sitter-c_sharp.wasm', - php: 'tree-sitter-php.wasm', - ruby: 'tree-sitter-ruby.wasm', - swift: 'tree-sitter-swift.wasm', - kotlin: 'tree-sitter-kotlin.wasm', - dart: 'tree-sitter-dart.wasm', - pascal: 'tree-sitter-pascal.wasm', -}; +let _extensionMapCache: Record | null = null; +export function getExtensionMap(): Record { + if (_extensionMapCache) return _extensionMapCache; + const out: Record = {}; + for (const def of getLanguageDefs()) { + for (const ext of def.extensions) { + out[ext.toLowerCase()] = def.name as Language; + } + } + _extensionMapCache = out; + return out; +} /** - * File extension to Language mapping + * Backward-compat: a Proxy that lazy-builds the extension map on + * first property access. Existing callers can keep doing + * `EXTENSION_MAP['.ts']` without changes. 
*/ -export const EXTENSION_MAP: Record = { - '.ts': 'typescript', - '.tsx': 'tsx', - '.js': 'javascript', - '.mjs': 'javascript', - '.cjs': 'javascript', - '.jsx': 'jsx', - '.py': 'python', - '.pyw': 'python', - '.go': 'go', - '.rs': 'rust', - '.java': 'java', - '.c': 'c', - '.h': 'c', // Could also be C++, defaulting to C - '.cpp': 'cpp', - '.cc': 'cpp', - '.cxx': 'cpp', - '.hpp': 'cpp', - '.hxx': 'cpp', - '.cs': 'csharp', - '.php': 'php', - '.rb': 'ruby', - '.rake': 'ruby', - '.swift': 'swift', - '.kt': 'kotlin', - '.kts': 'kotlin', - '.dart': 'dart', - '.liquid': 'liquid', - '.svelte': 'svelte', - '.pas': 'pascal', - '.dpr': 'pascal', - '.dpk': 'pascal', - '.lpr': 'pascal', - '.dfm': 'pascal', - '.fmx': 'pascal', -}; +export const EXTENSION_MAP: Record = new Proxy({} as Record, { + get(_t, key: string) { return getExtensionMap()[key]; }, + has(_t, key: string) { return key in getExtensionMap(); }, + ownKeys() { return Object.keys(getExtensionMap()); }, + getOwnPropertyDescriptor(_t, key: string) { + const map = getExtensionMap(); + if (key in map) { + return { configurable: true, enumerable: true, writable: false, value: map[key] }; + } + return undefined; + }, +}); /** * Caches for loaded grammars and parsers @@ -108,21 +94,28 @@ export async function loadGrammarsForLanguages(languages: Language[]): Promise - lang in WASM_GRAMMAR_FILES && - !languageCache.has(lang) && - !unavailableGrammarErrors.has(lang) - ); + // Deduplicate; filter to languages that have a tree-sitter grammar + // (registry's `def.grammar` field) and aren't already loaded. 
+ const seen = new Set(); + const toLoad: Array<{ lang: Language; wasmFile: string; vendored: boolean }> = []; + for (const lang of languages) { + if (seen.has(lang)) continue; + seen.add(lang); + if (languageCache.has(lang) || unavailableGrammarErrors.has(lang)) continue; + const def = getLanguageDefByName(lang); + if (!def?.grammar) continue; + toLoad.push({ + lang, + wasmFile: def.grammar.wasmFile, + vendored: def.grammar.vendored === true, + }); + } // Load grammars sequentially to avoid web-tree-sitter WASM race condition on Node 20+ // See: https://github.com/tree-sitter/tree-sitter/issues/2338 - for (const lang of toLoad) { - const wasmFile = WASM_GRAMMAR_FILES[lang]; + for (const { lang, wasmFile, vendored } of toLoad) { try { - // Pascal ships its own WASM (not in tree-sitter-wasms) - const wasmPath = lang === 'pascal' + const wasmPath = vendored ? path.join(__dirname, 'wasm', wasmFile) : require.resolve(`tree-sitter-wasms/out/${wasmFile}`); const language = await WasmLanguage.load(wasmPath); @@ -140,7 +133,9 @@ export async function loadGrammarsForLanguages(languages: Language[]): Promise { - const allLanguages = Object.keys(WASM_GRAMMAR_FILES) as GrammarLanguage[]; + const allLanguages = getLanguageDefs() + .filter((d) => d.grammar) + .map((d) => d.name as Language); await loadGrammarsForLanguages(allLanguages); } @@ -176,7 +171,8 @@ export function getParser(language: Language): Parser | null { */ export function detectLanguage(filePath: string, source?: string): Language { const ext = filePath.substring(filePath.lastIndexOf('.')).toLowerCase(); - const lang = EXTENSION_MAP[ext] || 'unknown'; + const def = getLanguageDefByExtension(ext); + const lang = (def?.name as Language) ?? 'unknown'; // .h files could be C or C++ — check source content for C++ features if (lang === 'c' && ext === '.h' && source) { @@ -196,29 +192,30 @@ function looksLikeCpp(source: string): boolean { } /** - * Check if a language is supported (has a grammar defined). 
- * Returns true if the grammar exists, even if not yet loaded. + * Check if a language is supported (has a grammar or custom extractor). + * Returns true if a registry entry exists, even if its grammar isn't loaded. */ export function isLanguageSupported(language: Language): boolean { - if (language === 'svelte') return true; // custom extractor (script block delegation) - if (language === 'liquid') return true; // custom regex extractor if (language === 'unknown') return false; - return language in WASM_GRAMMAR_FILES; + return getLanguageDefByName(language) !== undefined; } /** * Check if a grammar has been loaded and is ready for parsing. + * Custom-extractor languages (no `grammar` field) are always "ready". */ export function isGrammarLoaded(language: Language): boolean { - if (language === 'svelte' || language === 'liquid') return true; + const def = getLanguageDefByName(language); + if (!def) return false; + if (!def.grammar) return true; // custom extractor — always available return languageCache.has(language); } /** - * Get all supported languages (those with grammar definitions). + * Get all supported languages from the registry. */ export function getSupportedLanguages(): Language[] { - return [...(Object.keys(WASM_GRAMMAR_FILES) as GrammarLanguage[]), 'svelte', 'liquid']; + return getLanguageDefs().map((d) => d.name as Language); } /** @@ -237,54 +234,33 @@ export function resetParser(language: Language): void { } /** - * Clear parser/grammar caches (useful for testing) + * Clear parser cache (useful for testing). + * + * Note: `languageCache` is intentionally NOT cleared — the WASM + * `Language` modules are expensive to load and stay cached so a + * subsequent `getParser` call can rebuild a fresh `Parser` instance + * without re-reading the .wasm file. To fully re-init, set + * `parserInitialized = false` and call `initGrammars()` again. 
*/ export function clearParserCache(): void { for (const parser of parserCache.values()) { - parser.delete(); + try { parser.delete(); } catch { /* ignore */ } } parserCache.clear(); - // Note: languageCache is NOT cleared — WASM languages persist. - // To fully re-init, set parserInitialized = false and call initGrammars() again. unavailableGrammarErrors.clear(); } /** - * Report grammars that failed to load. + * Get unavailable grammar errors (for diagnostics) */ -export function getUnavailableGrammarErrors(): Partial> { - const out: Partial> = {}; - for (const [language, message] of unavailableGrammarErrors.entries()) { - out[language] = message; - } - return out; +export function getUnavailableGrammarErrors(): Record { + return Object.fromEntries(unavailableGrammarErrors); } /** - * Get language display name + * Human-readable display name (e.g. "TypeScript", "Pascal / Delphi"). + * Unregistered names fall back to "Unknown" for `unknown`, else the name itself. */ export function getLanguageDisplayName(language: Language): string { - const names: Record = { - typescript: 'TypeScript', - javascript: 'JavaScript', - tsx: 'TypeScript (TSX)', - jsx: 'JavaScript (JSX)', - python: 'Python', - go: 'Go', - rust: 'Rust', - java: 'Java', - c: 'C', - cpp: 'C++', - csharp: 'C#', - php: 'PHP', - ruby: 'Ruby', - swift: 'Swift', - kotlin: 'Kotlin', - dart: 'Dart', - svelte: 'Svelte', - liquid: 'Liquid', - pascal: 'Pascal / Delphi', - unknown: 'Unknown', - }; - return names[language] || language; + return getLanguageDefByName(language)?.displayName ?? 
(language === 'unknown' ? 'Unknown' : language); } diff --git a/src/extraction/languages/c-cpp.ts b/src/extraction/languages/c-cpp.ts index 66219d4f..8ed3a9de 100644 --- a/src/extraction/languages/c-cpp.ts +++ b/src/extraction/languages/c-cpp.ts @@ -114,3 +114,21 @@ export const cppExtractor: LanguageExtractor = { return null; }, }; + +import type { LanguageDef } from './types'; +export const C_DEF: LanguageDef = { + name: 'c', + displayName: 'C', + // .h is also listed for C; `detectLanguage` in grammars.ts applies a + // `.h might be C++` heuristic that overrides this on a content-sniff basis. + extensions: ['.c', '.h'], + includeGlobs: ['**/*.c', '**/*.h'], + grammar: { wasmFile: 'tree-sitter-c.wasm', extractor: cExtractor }, +}; +export const CPP_DEF: LanguageDef = { + name: 'cpp', + displayName: 'C++', + extensions: ['.cpp', '.cc', '.cxx', '.hpp', '.hxx'], + includeGlobs: ['**/*.cpp', '**/*.cc', '**/*.cxx', '**/*.hpp', '**/*.hxx'], + grammar: { wasmFile: 'tree-sitter-cpp.wasm', extractor: cppExtractor }, +}; diff --git a/src/extraction/languages/csharp.ts b/src/extraction/languages/csharp.ts index 9de53734..c66aea69 100644 --- a/src/extraction/languages/csharp.ts +++ b/src/extraction/languages/csharp.ts @@ -65,3 +65,12 @@ export const csharpExtractor: LanguageExtractor = { return null; }, }; + +import type { LanguageDef } from './types'; +export const CSHARP_DEF: LanguageDef = { + name: 'csharp', + displayName: 'C#', + extensions: ['.cs'], + includeGlobs: ['**/*.cs'], + grammar: { wasmFile: 'tree-sitter-c_sharp.wasm', extractor: csharpExtractor }, +}; diff --git a/src/extraction/languages/dart.ts b/src/extraction/languages/dart.ts index 5b545d04..d704d826 100644 --- a/src/extraction/languages/dart.ts +++ b/src/extraction/languages/dart.ts @@ -193,3 +193,12 @@ export const dartExtractor: LanguageExtractor = { return undefined; }, }; + +import type { LanguageDef } from './types'; +export const DART_DEF: LanguageDef = { + name: 'dart', + displayName: 'Dart', + extensions: ['.dart'], + includeGlobs: ['**/*.dart'], + 
grammar: { wasmFile: 'tree-sitter-dart.wasm', extractor: dartExtractor }, +}; diff --git a/src/extraction/languages/go.ts b/src/extraction/languages/go.ts index 898e6165..5de68ffa 100644 --- a/src/extraction/languages/go.ts +++ b/src/extraction/languages/go.ts @@ -49,3 +49,12 @@ export const goExtractor: LanguageExtractor = { return match?.[1]; }, }; + +import type { LanguageDef } from './types'; +export const GO_DEF: LanguageDef = { + name: 'go', + displayName: 'Go', + extensions: ['.go'], + includeGlobs: ['**/*.go'], + grammar: { wasmFile: 'tree-sitter-go.wasm', extractor: goExtractor }, +}; diff --git a/src/extraction/languages/java.ts b/src/extraction/languages/java.ts index 638533f0..9613217c 100644 --- a/src/extraction/languages/java.ts +++ b/src/extraction/languages/java.ts @@ -57,3 +57,12 @@ export const javaExtractor: LanguageExtractor = { return null; }, }; + +import type { LanguageDef } from './types'; +export const JAVA_DEF: LanguageDef = { + name: 'java', + displayName: 'Java', + extensions: ['.java'], + includeGlobs: ['**/*.java'], + grammar: { wasmFile: 'tree-sitter-java.wasm', extractor: javaExtractor }, +}; diff --git a/src/extraction/languages/javascript.ts b/src/extraction/languages/javascript.ts index 0a0d6780..946e1c5c 100644 --- a/src/extraction/languages/javascript.ts +++ b/src/extraction/languages/javascript.ts @@ -82,3 +82,12 @@ export const javascriptExtractor: LanguageExtractor = { return null; }, }; + +import type { LanguageDef } from './types'; +export const JAVASCRIPT_DEF: LanguageDef = { + name: 'javascript', + displayName: 'JavaScript', + extensions: ['.js', '.mjs', '.cjs'], + includeGlobs: ['**/*.js'], + grammar: { wasmFile: 'tree-sitter-javascript.wasm', extractor: javascriptExtractor }, +}; diff --git a/src/extraction/languages/jsx.ts b/src/extraction/languages/jsx.ts new file mode 100644 index 00000000..5091ee64 --- /dev/null +++ b/src/extraction/languages/jsx.ts @@ -0,0 +1,14 @@ +/** + * JSX — reuses the JavaScript extractor 
(the JS grammar handles JSX + * via the same `tree-sitter-javascript.wasm` file). + */ +import { javascriptExtractor } from './javascript'; +import type { LanguageDef } from './types'; + +export const JSX_DEF: LanguageDef = { + name: 'jsx', + displayName: 'JavaScript (JSX)', + extensions: ['.jsx'], + includeGlobs: ['**/*.jsx'], + grammar: { wasmFile: 'tree-sitter-javascript.wasm', extractor: javascriptExtractor }, +}; diff --git a/src/extraction/languages/kotlin.ts b/src/extraction/languages/kotlin.ts index 19c38624..77d15609 100644 --- a/src/extraction/languages/kotlin.ts +++ b/src/extraction/languages/kotlin.ts @@ -236,3 +236,12 @@ export const kotlinExtractor: LanguageExtractor = { return null; }, }; + +import type { LanguageDef } from './types'; +export const KOTLIN_DEF: LanguageDef = { + name: 'kotlin', + displayName: 'Kotlin', + extensions: ['.kt', '.kts'], + includeGlobs: ['**/*.kt'], + grammar: { wasmFile: 'tree-sitter-kotlin.wasm', extractor: kotlinExtractor }, +}; diff --git a/src/extraction/languages/liquid.ts b/src/extraction/languages/liquid.ts new file mode 100644 index 00000000..ead2f978 --- /dev/null +++ b/src/extraction/languages/liquid.ts @@ -0,0 +1,16 @@ +/** + * Liquid — custom regex-based extractor for Shopify Liquid templates. + * Tree-sitter has no production-quality Liquid grammar; the + * `LiquidExtractor` does targeted pattern matching for snippet + * includes and Drop variable references. 
+ */ +import { LiquidExtractor } from '../liquid-extractor'; +import type { LanguageDef } from './types'; + +export const LIQUID_DEF: LanguageDef = { + name: 'liquid', + displayName: 'Liquid', + extensions: ['.liquid'], + includeGlobs: ['**/*.liquid'], + customExtractor: (filePath, source) => new LiquidExtractor(filePath, source).extract(), +}; diff --git a/src/extraction/languages/pascal.ts b/src/extraction/languages/pascal.ts index aed6a59f..a196c7b0 100644 --- a/src/extraction/languages/pascal.ts +++ b/src/extraction/languages/pascal.ts @@ -60,3 +60,30 @@ export const pascalExtractor: LanguageExtractor = { return node.type === 'declConst'; }, }; + +import type { LanguageDef } from './types'; +import { DfmExtractor } from '../dfm-extractor'; + +const dfmCustomExtractor = (filePath: string, source: string) => + new DfmExtractor(filePath, source).extract(); + +export const PASCAL_DEF: LanguageDef = { + name: 'pascal', + displayName: 'Pascal / Delphi', + extensions: ['.pas', '.dpr', '.dpk', '.lpr', '.dfm', '.fmx'], + includeGlobs: [ + '**/*.pas', '**/*.dpr', '**/*.dpk', '**/*.lpr', + '**/*.dfm', '**/*.fmx', + ], + grammar: { + wasmFile: 'tree-sitter-pascal.wasm', + vendored: true, + extractor: pascalExtractor, + }, + // .dfm/.fmx are Delphi/FireMonkey form files — declarative property + // definitions, not Pascal source. Route them to the dedicated DfmExtractor. 
+ extensionOverrides: { + '.dfm': { customExtractor: dfmCustomExtractor }, + '.fmx': { customExtractor: dfmCustomExtractor }, + }, +}; diff --git a/src/extraction/languages/php.ts b/src/extraction/languages/php.ts index 1133f979..30271286 100644 --- a/src/extraction/languages/php.ts +++ b/src/extraction/languages/php.ts @@ -103,3 +103,12 @@ export const phpExtractor: LanguageExtractor = { return null; }, }; + +import type { LanguageDef } from './types'; +export const PHP_DEF: LanguageDef = { + name: 'php', + displayName: 'PHP', + extensions: ['.php'], + includeGlobs: ['**/*.php'], + grammar: { wasmFile: 'tree-sitter-php.wasm', extractor: phpExtractor }, +}; diff --git a/src/extraction/languages/python.ts b/src/extraction/languages/python.ts index 77807d66..2cddcf40 100644 --- a/src/extraction/languages/python.ts +++ b/src/extraction/languages/python.ts @@ -51,3 +51,12 @@ export const pythonExtractor: LanguageExtractor = { return null; }, }; + +import type { LanguageDef } from './types'; +export const PYTHON_DEF: LanguageDef = { + name: 'python', + displayName: 'Python', + extensions: ['.py', '.pyw'], + includeGlobs: ['**/*.py'], + grammar: { wasmFile: 'tree-sitter-python.wasm', extractor: pythonExtractor }, +}; diff --git a/src/extraction/languages/registry.ts b/src/extraction/languages/registry.ts new file mode 100644 index 00000000..1f4ca6ae --- /dev/null +++ b/src/extraction/languages/registry.ts @@ -0,0 +1,102 @@ +/** + * Language registry — central import + collection of every per-language + * `LanguageDef`. Adding a new language is: + * + * 1. Create `src/extraction/languages/<lang>.ts` exporting an + * `<LANG>_DEF: LanguageDef` constant. + * 2. Add **one** import line and **one** array entry to this file. + * + * This file is the only place a "central list" of languages lives, + * so adjacent-line conflicts between PRs adding different languages + * are limited to whichever alphabetical neighborhood they target. 
+ * + * Note: an earlier draft used `fs.readdirSync` auto-discovery which + * eliminated even this file, but `require()` of extensionless paths + * doesn't work under vitest's vite-node loader for `.ts` source. A + * generated-barrel build step would restore zero-list-edits and is + * tracked as a follow-up. + */ + +import type { LanguageDef } from './types'; + +// ===================================================================== +// Imports — one per language, alphabetical by name +// ===================================================================== +import { C_DEF, CPP_DEF } from './c-cpp'; +import { CSHARP_DEF } from './csharp'; +import { DART_DEF } from './dart'; +import { GO_DEF } from './go'; +import { JAVA_DEF } from './java'; +import { JAVASCRIPT_DEF } from './javascript'; +import { JSX_DEF } from './jsx'; +import { KOTLIN_DEF } from './kotlin'; +import { LIQUID_DEF } from './liquid'; +import { PASCAL_DEF } from './pascal'; +import { PHP_DEF } from './php'; +import { PYTHON_DEF } from './python'; +import { RUBY_DEF } from './ruby'; +import { RUST_DEF } from './rust'; +import { SVELTE_DEF } from './svelte'; +import { SWIFT_DEF } from './swift'; +import { TSX_DEF } from './tsx'; +import { TYPESCRIPT_DEF } from './typescript'; + +// ===================================================================== +// Registry — alphabetical by name +// ===================================================================== +const ALL_DEFS: readonly LanguageDef[] = [ + C_DEF, + CPP_DEF, + CSHARP_DEF, + DART_DEF, + GO_DEF, + JAVA_DEF, + JAVASCRIPT_DEF, + JSX_DEF, + KOTLIN_DEF, + LIQUID_DEF, + PASCAL_DEF, + PHP_DEF, + PYTHON_DEF, + RUBY_DEF, + RUST_DEF, + SVELTE_DEF, + SWIFT_DEF, + TSX_DEF, + TYPESCRIPT_DEF, +]; + +let byName: Map | null = null; +let byExtension: Map | null = null; + +function ensureIndexes(): void { + if (byName && byExtension) return; + byName = new Map(); + byExtension = new Map(); + for (const def of ALL_DEFS) { + byName.set(def.name, def); + for 
(const ext of def.extensions) { + byExtension.set(ext.toLowerCase(), def); + } + } +} + +export function getLanguageDefs(): readonly LanguageDef[] { + return ALL_DEFS; +} + +export function getLanguageDefByName(name: string): LanguageDef | undefined { + ensureIndexes(); + return byName!.get(name); +} + +export function getLanguageDefByExtension(ext: string): LanguageDef | undefined { + ensureIndexes(); + return byExtension!.get(ext.toLowerCase()); +} + +/** Reset cached indexes. Used by tests; no-op in production paths. */ +export function _resetRegistryCacheForTests(): void { + byName = null; + byExtension = null; +} diff --git a/src/extraction/languages/ruby.ts b/src/extraction/languages/ruby.ts index b5426165..810ac26a 100644 --- a/src/extraction/languages/ruby.ts +++ b/src/extraction/languages/ruby.ts @@ -109,3 +109,12 @@ export const rubyExtractor: LanguageExtractor = { return null; }, }; + +import type { LanguageDef } from './types'; +export const RUBY_DEF: LanguageDef = { + name: 'ruby', + displayName: 'Ruby', + extensions: ['.rb', '.rake'], + includeGlobs: ['**/*.rb'], + grammar: { wasmFile: 'tree-sitter-ruby.wasm', extractor: rubyExtractor }, +}; diff --git a/src/extraction/languages/rust.ts b/src/extraction/languages/rust.ts index 0266a2fd..35c957c0 100644 --- a/src/extraction/languages/rust.ts +++ b/src/extraction/languages/rust.ts @@ -114,3 +114,12 @@ export const rustExtractor: LanguageExtractor = { return null; }, }; + +import type { LanguageDef } from './types'; +export const RUST_DEF: LanguageDef = { + name: 'rust', + displayName: 'Rust', + extensions: ['.rs'], + includeGlobs: ['**/*.rs'], + grammar: { wasmFile: 'tree-sitter-rust.wasm', extractor: rustExtractor }, +}; diff --git a/src/extraction/languages/svelte.ts b/src/extraction/languages/svelte.ts new file mode 100644 index 00000000..7f7ab889 --- /dev/null +++ b/src/extraction/languages/svelte.ts @@ -0,0 +1,15 @@ +/** + * Svelte — custom extractor that delegates the script block back + * through 
the universal extraction pipeline as TypeScript/JavaScript, + * then merges in template-level call references. + */ +import { SvelteExtractor } from '../svelte-extractor'; +import type { LanguageDef } from './types'; + +export const SVELTE_DEF: LanguageDef = { + name: 'svelte', + displayName: 'Svelte', + extensions: ['.svelte'], + includeGlobs: ['**/*.svelte'], + customExtractor: (filePath, source) => new SvelteExtractor(filePath, source).extract(), +}; diff --git a/src/extraction/languages/swift.ts b/src/extraction/languages/swift.ts index 373fa8a9..fe1ac5ce 100644 --- a/src/extraction/languages/swift.ts +++ b/src/extraction/languages/swift.ts @@ -81,3 +81,12 @@ export const swiftExtractor: LanguageExtractor = { return null; }, }; + +import type { LanguageDef } from './types'; +export const SWIFT_DEF: LanguageDef = { + name: 'swift', + displayName: 'Swift', + extensions: ['.swift'], + includeGlobs: ['**/*.swift'], + grammar: { wasmFile: 'tree-sitter-swift.wasm', extractor: swiftExtractor }, +}; diff --git a/src/extraction/languages/tsx.ts b/src/extraction/languages/tsx.ts new file mode 100644 index 00000000..f4cbe536 --- /dev/null +++ b/src/extraction/languages/tsx.ts @@ -0,0 +1,14 @@ +/** + * TSX (TypeScript + JSX) — reuses the TypeScript extractor with a + * dedicated grammar so JSX-specific node types parse correctly. + */ +import { typescriptExtractor } from './typescript'; +import type { LanguageDef } from './types'; + +export const TSX_DEF: LanguageDef = { + name: 'tsx', + displayName: 'TSX', + extensions: ['.tsx'], + includeGlobs: ['**/*.tsx'], + grammar: { wasmFile: 'tree-sitter-tsx.wasm', extractor: typescriptExtractor }, +}; diff --git a/src/extraction/languages/types.ts b/src/extraction/languages/types.ts new file mode 100644 index 00000000..a93e1930 --- /dev/null +++ b/src/extraction/languages/types.ts @@ -0,0 +1,83 @@ +/** + * Per-language registry types. 
+ * + * Each language ships its own self-contained `LanguageDef` (file + * extensions, default-config globs, grammar config, etc.) so that + * adding a new language is a single-file addition rather than 6 + * coordinated edits across `types.ts`, `grammars.ts`, and the + * `extraction/languages/index.ts` barrel. The registry + * (`./registry`) collects them from a static import list. + */ + +import type { LanguageExtractor } from '../tree-sitter-types'; +import type { ExtractionResult } from '../../types'; + +/** + * Custom extraction function for languages that don't fit the + * universal tree-sitter AST shape (Liquid, Svelte, HCL, SQL, + * Pascal DFM/FMX form files). + */ +export type CustomExtractorFn = (filePath: string, source: string) => ExtractionResult; + +export interface GrammarBackedConfig { + /** + * WASM grammar filename. Resolved either against the + * `tree-sitter-wasms` npm package or, if `vendored` is true, + * against `src/extraction/wasm/`. + */ + wasmFile: string; + /** + * True when the WASM is shipped under `src/extraction/wasm/` + * because no pre-built grammar exists in `tree-sitter-wasms`. + */ + vendored?: boolean; + /** + * Per-language tree-sitter extraction config consumed by + * `TreeSitterExtractor`. The existing per-language objects + * (e.g. `typescriptExtractor`) are passed in here unchanged. + */ + extractor: LanguageExtractor; +} + +export interface LanguageDef { + /** + * Canonical language name. Stored as the `language` value on + * `Node`, `Edge`, and `FileRecord` rows. Should match an entry + * in the `Language` union in `src/types.ts` for known + * languages; new registry-only languages are accepted as + * strings at runtime. + */ + name: string; + /** Human-readable display label (e.g. "HCL / Terraform"). */ + displayName: string; + /** + * File extensions, lower-cased, with leading dot. Each + * extension uniquely maps to one language (caller should not + * register the same extension twice).
+ */ + extensions: readonly string[]; + /** + * Default-config include glob patterns. Combined into + * `DEFAULT_CONFIG.include` at registry load. + */ + includeGlobs: readonly string[]; + /** + * Tree-sitter grammar config. Absent for purely-custom + * languages like Liquid (regex-based) and Svelte (script + * delegation). + */ + grammar?: GrammarBackedConfig; + /** + * Whole-language custom extractor. Used when `grammar` is + * absent. If both are present, `extensionOverrides` and + * `customExtractor` win over `grammar`. + */ + customExtractor?: CustomExtractorFn; + /** + * Per-extension override. Used by Pascal where `.dfm`/`.fmx` + * (form files) are extracted by `DfmExtractor` rather than the + * tree-sitter Pascal grammar. Keys are lower-cased extensions + * with the leading dot. + */ + extensionOverrides?: Readonly<Record<string, { customExtractor: CustomExtractorFn }>>; +} diff --git a/src/extraction/languages/typescript.ts b/src/extraction/languages/typescript.ts index 9540dd94..9f82e675 100644 --- a/src/extraction/languages/typescript.ts +++ b/src/extraction/languages/typescript.ts @@ -1,5 +1,6 @@ import { getNodeText, getChildByField } from '../tree-sitter-helpers'; import type { LanguageExtractor } from '../tree-sitter-types'; +import type { LanguageDef } from './types'; export const typescriptExtractor: LanguageExtractor = { functionTypes: ['function_declaration', 'arrow_function', 'function_expression'], @@ -116,3 +117,11 @@ export const typescriptExtractor: LanguageExtractor = { return null; }, }; + +export const TYPESCRIPT_DEF: LanguageDef = { + name: 'typescript', + displayName: 'TypeScript', + extensions: ['.ts'], + includeGlobs: ['**/*.ts'], + grammar: { wasmFile: 'tree-sitter-typescript.wasm', extractor: typescriptExtractor }, +}; diff --git a/src/extraction/tree-sitter.ts b/src/extraction/tree-sitter.ts index 7345d91f..f0bd4b7c 100644 --- a/src/extraction/tree-sitter.ts +++ b/src/extraction/tree-sitter.ts @@ -19,9 +19,7 @@ import { getParser, detectLanguage, isLanguageSupported } from
'./grammars'; import { generateNodeId, getNodeText, getChildByField, getPrecedingDocstring } from './tree-sitter-helpers'; import type { LanguageExtractor, ExtractorContext } from './tree-sitter-types'; import { EXTRACTORS } from './languages'; -import { LiquidExtractor } from './liquid-extractor'; -import { SvelteExtractor } from './svelte-extractor'; -import { DfmExtractor } from './dfm-extractor'; +import { getLanguageDefByName } from './languages/registry'; // Re-export for backward compatibility export { generateNodeId } from './tree-sitter-helpers'; @@ -2319,28 +2317,21 @@ export function extractFromSource( ): ExtractionResult { const detectedLanguage = language || detectLanguage(filePath, source); const fileExtension = path.extname(filePath).toLowerCase(); + const def = getLanguageDefByName(detectedLanguage); - // Use custom extractor for Svelte - if (detectedLanguage === 'svelte') { - const extractor = new SvelteExtractor(filePath, source); - return extractor.extract(); + // Per-extension override wins (e.g. Pascal `.dfm`/`.fmx` route to + // DfmExtractor rather than the tree-sitter Pascal grammar). + const override = def?.extensionOverrides?.[fileExtension]; + if (override) { + return override.customExtractor(filePath, source); } - // Use custom extractor for Liquid - if (detectedLanguage === 'liquid') { - const extractor = new LiquidExtractor(filePath, source); - return extractor.extract(); - } - - // Use custom extractor for DFM/FMX form files - if ( - detectedLanguage === 'pascal' && - (fileExtension === '.dfm' || fileExtension === '.fmx') - ) { - const extractor = new DfmExtractor(filePath, source); - return extractor.extract(); + // Whole-language custom extractor (Liquid, Svelte, etc.). + if (def?.customExtractor) { + return def.customExtractor(filePath, source); } + // Tree-sitter path. 
const extractor = new TreeSitterExtractor(filePath, source, detectedLanguage); return extractor.extract(); } diff --git a/src/types.ts b/src/types.ts index 6834483d..e9b3cbcc 100644 --- a/src/types.ts +++ b/src/types.ts @@ -476,206 +476,11 @@ export interface CodeGraphConfig { }[]; } -/** - * Default configuration values - */ -export const DEFAULT_CONFIG: CodeGraphConfig = { - version: 1, - rootDir: '.', - include: [ - // TypeScript/JavaScript - '**/*.ts', - '**/*.tsx', - '**/*.js', - '**/*.jsx', - // Python - '**/*.py', - // Go - '**/*.go', - // Rust - '**/*.rs', - // Java - '**/*.java', - // C/C++ - '**/*.c', - '**/*.h', - '**/*.cpp', - '**/*.hpp', - '**/*.cc', - '**/*.cxx', - // C# - '**/*.cs', - // PHP - '**/*.php', - // Ruby - '**/*.rb', - // Swift - '**/*.swift', - // Kotlin - '**/*.kt', - '**/*.kts', - // Dart - '**/*.dart', - // Svelte - '**/*.svelte', - // Liquid (Shopify themes) - '**/*.liquid', - // Pascal / Delphi - '**/*.pas', - '**/*.dpr', - '**/*.dpk', - '**/*.lpr', - '**/*.dfm', - '**/*.fmx', - ], - exclude: [ - // Version control - '**/.git/**', - - // Dependencies - '**/node_modules/**', - '**/vendor/**', - '**/Pods/**', - - // Generic build outputs - '**/dist/**', - '**/build/**', - '**/out/**', - '**/bin/**', - '**/obj/**', - '**/target/**', - - // JavaScript/TypeScript - '**/*.min.js', - '**/*.bundle.js', - '**/.next/**', - '**/.nuxt/**', - '**/.svelte-kit/**', - '**/.output/**', - '**/.turbo/**', - '**/.cache/**', - '**/.parcel-cache/**', - '**/.vite/**', - '**/.astro/**', - '**/.docusaurus/**', - '**/.gatsby/**', - '**/.webpack/**', - '**/.nx/**', - '**/.yarn/cache/**', - '**/.pnpm-store/**', - '**/storybook-static/**', - - // React Native / Expo - '**/.expo/**', - '**/web-build/**', - '**/ios/Pods/**', - '**/ios/build/**', - '**/android/build/**', - '**/android/.gradle/**', - - // Python - '**/__pycache__/**', - '**/.venv/**', - '**/venv/**', - '**/site-packages/**', - '**/dist-packages/**', - '**/.pytest_cache/**', - '**/.mypy_cache/**', - 
'**/.ruff_cache/**', - '**/.tox/**', - '**/.nox/**', - '**/*.egg-info/**', - '**/.eggs/**', - - // Go - '**/go/pkg/mod/**', - - // Rust - '**/target/debug/**', - '**/target/release/**', - - // Java/Kotlin/Gradle - '**/.gradle/**', - '**/.m2/**', - '**/generated-sources/**', - '**/.kotlin/**', - - // Dart/Flutter - '**/.dart_tool/**', - - // C#/.NET - '**/.vs/**', - '**/.nuget/**', - '**/artifacts/**', - '**/publish/**', - - // C/C++ - '**/cmake-build-*/**', - '**/CMakeFiles/**', - '**/bazel-*/**', - '**/vcpkg_installed/**', - '**/.conan/**', - '**/Debug/**', - '**/Release/**', - '**/x64/**', - '**/.pio/**', // Platform.io (IoT/embedded build artifacts and library deps) - - // Electron - '**/release/**', - '**/*.app/**', - '**/*.asar', - - // Swift/iOS/Xcode - '**/DerivedData/**', - '**/.build/**', - '**/.swiftpm/**', - '**/xcuserdata/**', - '**/Carthage/Build/**', - '**/SourcePackages/**', - - // Delphi/Pascal - '**/__history/**', - '**/__recovery/**', - '**/*.dcu', - - // PHP - '**/.composer/**', - '**/storage/framework/**', - '**/bootstrap/cache/**', - - // Ruby - '**/.bundle/**', - '**/tmp/cache/**', - '**/public/assets/**', - '**/public/packs/**', - '**/.yardoc/**', - - // Testing/Coverage - '**/coverage/**', - '**/htmlcov/**', - '**/.nyc_output/**', - '**/test-results/**', - '**/.coverage/**', - - // IDE/Editor - '**/.idea/**', - - // Logs and temp - '**/logs/**', - '**/tmp/**', - '**/temp/**', - - // Documentation build output - '**/_build/**', - '**/docs/_build/**', - '**/site/**', - ], - languages: [], - frameworks: [], - maxFileSize: 1024 * 1024, // 1MB - extractDocstrings: true, - trackCallSites: true, -}; +// `DEFAULT_CONFIG` lives in `./default-config.ts` so its `include` +// list can be derived from the language registry without import +// cycles. Re-exported here for backward compat with consumers that +// already import it from `'./types'`. 
+export { DEFAULT_CONFIG } from './default-config'; // ============================================================================= // Database Types From e43a6183993008eeede1667e180ccb08f1870576 Mon Sep 17 00:00:00 2001 From: andreinknv Date: Mon, 27 Apr 2026 16:44:28 -0400 Subject: [PATCH 2/2] fix(language-registry): TreeSitterExtractor reads from def.grammar.extractor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewer caught a real bug: the original commit kept the EXTRACTORS map in src/extraction/languages/index.ts as a separate hand-curated registry that TreeSitterExtractor read from. Adding a new grammar-backed language would have required editing EXTRACTORS too, undermining the refactor's stated single-source-of-truth claim. A future contributor missing the EXTRACTORS update would silently produce empty extraction results. Fix: - TreeSitterExtractor now reads its extractor straight off the language def: getLanguageDefByName(this.language)?.grammar?.extractor - EXTRACTORS in languages/index.ts becomes a Proxy that derives lazily from the registry (kept for backward compat — readers unchanged). - Add 16 structural-invariant tests in __tests__/language-registry.test.ts that fail loudly if any derived consumer drifts from the registry: EXTRACTORS / EXTENSION_MAP / detectLanguage / isLanguageSupported / getSupportedLanguages / getLanguageDisplayName all asserted to exactly mirror the registry contents. Adding a new grammar-backed language is now genuinely "one new file + two lines in registry.ts" — no other files to touch.
Co-Authored-By: Claude Opus 4.7 (1M context) --- __tests__/language-registry.test.ts | 157 +++++++++++++++++++++++++++ src/extraction/languages/index.ts | 101 ++++++++++------- src/extraction/languages/registry.ts | 6 + src/extraction/tree-sitter.ts | 6 +- 4 files changed, 231 insertions(+), 39 deletions(-) create mode 100644 __tests__/language-registry.test.ts diff --git a/__tests__/language-registry.test.ts b/__tests__/language-registry.test.ts new file mode 100644 index 00000000..9afdd59a --- /dev/null +++ b/__tests__/language-registry.test.ts @@ -0,0 +1,157 @@ +/** + * Language registry: structural invariants. + * + * These tests guard against the "parallel list" failure mode that + * the registry refactor exists to prevent. If a future PR adds a + * grammar-backed language but forgets to wire it through one of + * the derived consumers, one of these tests should catch it. + */ +import { describe, it, expect } from 'vitest'; +import { + getLanguageDefs, + getLanguageDefByExtension, + getLanguageDefByName, +} from '../src/extraction/languages/registry'; +import { EXTRACTORS } from '../src/extraction/languages'; +import { + detectLanguage, + isLanguageSupported, + getSupportedLanguages, + getLanguageDisplayName, + EXTENSION_MAP, +} from '../src/extraction/grammars'; + +describe('language registry — single source of truth', () => { + it('has at least the original 19 languages', () => { + const defs = getLanguageDefs(); + expect(defs.length).toBeGreaterThanOrEqual(19); + }); + + it('every def has unique non-empty name', () => { + const names = new Set(); + for (const def of getLanguageDefs()) { + expect(def.name).toBeTruthy(); + expect(names.has(def.name)).toBe(false); + names.add(def.name); + } + }); + + it('extensions are unique across registry (one ext maps to one language)', () => { + const seen = new Map(); + for (const def of getLanguageDefs()) { + for (const ext of def.extensions) { + const lower = ext.toLowerCase(); + if (seen.has(lower)) { + // The .h 
ambiguity (C vs C++) is intentionally pinned to C + // by the registry; tree-sitter.ts has a content-sniff + // override. Anything else duplicating extensions is a bug. + throw new Error( + `Extension ${lower} mapped twice: ${seen.get(lower)} and ${def.name}` + ); + } + seen.set(lower, def.name); + } + } + }); + + it('grammar-backed defs have wasmFile + extractor', () => { + for (const def of getLanguageDefs()) { + if (!def.grammar) continue; + expect(def.grammar.wasmFile).toMatch(/^tree-sitter-.+\.wasm$/); + expect(def.grammar.extractor).toBeDefined(); + } + }); + + it('custom-extractor defs have a customExtractor function', () => { + for (const def of getLanguageDefs()) { + if (def.grammar) continue; // grammar-backed + expect(def.customExtractor).toBeInstanceOf(Function); + } + }); +}); + +describe('derived consumers stay in sync with the registry', () => { + // Catch the "parallel list drift" bug that motivated this refactor. + // If a new language gets added to registry but a derived consumer + // still hard-codes the old set, one of these will fail. + + it('EXTRACTORS contains exactly the grammar-backed languages', () => { + const grammarBacked = getLanguageDefs() + .filter((d) => d.grammar) + .map((d) => d.name) + .sort(); + const extractorKeys = Object.keys(EXTRACTORS).sort(); + expect(extractorKeys).toEqual(grammarBacked); + }); + + it('every grammar-backed extractor matches def.grammar.extractor exactly', () => { + for (const def of getLanguageDefs()) { + if (!def.grammar) continue; + expect(EXTRACTORS[def.name as keyof typeof EXTRACTORS]).toBe(def.grammar.extractor); + } + }); + + it('EXTENSION_MAP entries exactly mirror registry extensions', () => { + const expected = new Map(); + for (const def of getLanguageDefs()) { + for (const ext of def.extensions) { + expected.set(ext.toLowerCase(), def.name); + } + } + for (const [ext, lang] of expected) { + expect(EXTENSION_MAP[ext]).toBe(lang); + } + // Reverse: no extra keys in EXTENSION_MAP. 
+ expect(Object.keys(EXTENSION_MAP).sort()).toEqual([...expected.keys()].sort()); + }); + + it('detectLanguage returns the expected name for every registered extension', () => { + for (const def of getLanguageDefs()) { + for (const ext of def.extensions) { + // .h is pinned to C by the registry; the C++ heuristic only + // applies when source is provided AND looks like C++. + expect(detectLanguage(`x${ext}`)).toBe(def.name); + } + } + }); + + it('isLanguageSupported returns true for every registered language and false for unknown', () => { + for (const def of getLanguageDefs()) { + expect(isLanguageSupported(def.name as never)).toBe(true); + } + expect(isLanguageSupported('unknown' as never)).toBe(false); + }); + + it('getSupportedLanguages returns exactly the registry names', () => { + const fromRegistry = getLanguageDefs().map((d) => d.name).sort(); + const supported = (getSupportedLanguages() as string[]).sort(); + expect(supported).toEqual(fromRegistry); + }); + + it('getLanguageDisplayName uses each defs displayName', () => { + for (const def of getLanguageDefs()) { + expect(getLanguageDisplayName(def.name as never)).toBe(def.displayName); + } + }); +}); + +describe('lookup helpers', () => { + it('getLanguageDefByName returns the def for a registered name', () => { + expect(getLanguageDefByName('typescript')?.displayName).toBe('TypeScript'); + }); + + it('getLanguageDefByName returns undefined for unknown names', () => { + expect(getLanguageDefByName('nonexistent-language-name')).toBeUndefined(); + }); + + it('getLanguageDefByExtension is case-insensitive', () => { + expect(getLanguageDefByExtension('.TS')?.name).toBe('typescript'); + expect(getLanguageDefByExtension('.ts')?.name).toBe('typescript'); + }); + + it('Pascal extensionOverrides routes .dfm and .fmx to a customExtractor', () => { + const def = getLanguageDefByName('pascal'); + expect(def?.extensionOverrides?.['.dfm']?.customExtractor).toBeInstanceOf(Function); + 
expect(def?.extensionOverrides?.['.fmx']?.customExtractor).toBeInstanceOf(Function); + }); +}); diff --git a/src/extraction/languages/index.ts b/src/extraction/languages/index.ts index e5d12ac6..0e35b826 100644 --- a/src/extraction/languages/index.ts +++ b/src/extraction/languages/index.ts @@ -1,44 +1,71 @@ /** - * Per-language extraction configurations. + * Per-language barrel. * - * Each file exports a LanguageExtractor config object. - * This barrel builds the EXTRACTORS map consumed by TreeSitterExtractor. + * Adding a new language is a single-file addition: drop a + * `.ts` next to this barrel exporting an `_DEF: + * LanguageDef`, then add one import + one array entry to + * `./registry.ts`. Nothing in this file needs to change for new + * languages. + * + * `EXTRACTORS` is preserved as a backward-compat export but is now + * derived from the registry. Direct readers of `EXTRACTORS` get the + * same shape they always did; the canonical source is each + * language def's `grammar.extractor` field. */ -import { Language } from '../../types'; +import type { Language } from '../../types'; import type { LanguageExtractor } from '../tree-sitter-types'; +import { getLanguageDefs } from './registry'; + +export * from './registry'; -import { typescriptExtractor } from './typescript'; -import { javascriptExtractor } from './javascript'; -import { pythonExtractor } from './python'; -import { goExtractor } from './go'; -import { rustExtractor } from './rust'; -import { javaExtractor } from './java'; -import { cExtractor, cppExtractor } from './c-cpp'; -import { csharpExtractor } from './csharp'; -import { phpExtractor } from './php'; -import { rubyExtractor } from './ruby'; -import { swiftExtractor } from './swift'; -import { kotlinExtractor } from './kotlin'; -import { dartExtractor } from './dart'; -import { pascalExtractor } from './pascal'; +/** + * Backward-compat: `Language → LanguageExtractor` map. 
Built lazily + * on first read (the registry transitively imports modules that + * import this barrel, so building eagerly would TDZ). + */ +let _extractorsCache: Partial> | null = null; +function buildExtractors(): Partial> { + if (_extractorsCache) return _extractorsCache; + const out: Partial> = {}; + for (const def of getLanguageDefs()) { + if (def.grammar) { + out[def.name as Language] = def.grammar.extractor; + } + } + _extractorsCache = out; + return out; +} -export const EXTRACTORS: Partial> = { - typescript: typescriptExtractor, - tsx: typescriptExtractor, - javascript: javascriptExtractor, - jsx: javascriptExtractor, - python: pythonExtractor, - go: goExtractor, - rust: rustExtractor, - java: javaExtractor, - c: cExtractor, - cpp: cppExtractor, - csharp: csharpExtractor, - php: phpExtractor, - ruby: rubyExtractor, - swift: swiftExtractor, - kotlin: kotlinExtractor, - dart: dartExtractor, - pascal: pascalExtractor, -}; +/** + * Lazy Proxy keeps the existing `EXTRACTORS[lang]` access pattern + * working without forcing the registry to evaluate at module load + * (which would deadlock on the cyclic import chain through + * tree-sitter.ts). + */ +export const EXTRACTORS: Partial> = new Proxy( + {} as Partial>, + { + get(_t, key: string) { + return buildExtractors()[key as Language]; + }, + has(_t, key: string) { + return key in buildExtractors(); + }, + ownKeys() { + return Object.keys(buildExtractors()); + }, + getOwnPropertyDescriptor(_t, key: string) { + const m = buildExtractors(); + if ((key as Language) in m) { + return { + configurable: true, + enumerable: true, + writable: false, + value: m[key as Language], + }; + } + return undefined; + }, + } +); diff --git a/src/extraction/languages/registry.ts b/src/extraction/languages/registry.ts index 1f4ca6ae..7e334b72 100644 --- a/src/extraction/languages/registry.ts +++ b/src/extraction/languages/registry.ts @@ -6,6 +6,12 @@ * `_DEF: LanguageDef` constant. * 2. 
Add **one** import line and **one** array entry to this file. * + * **That is the complete change list.** All consumers + * (`grammars.ts`, `tree-sitter.ts`'s extractor lookup, + * `default-config.ts`'s include globs, the legacy `EXTRACTORS` + * barrel in `./index.ts`) read from this registry — there is + * no parallel list to keep in sync. + * * This file is the only place a "central list" of languages lives, * so adjacent-line conflicts between PRs adding different languages * are limited to whichever alphabetical neighborhood they target. diff --git a/src/extraction/tree-sitter.ts b/src/extraction/tree-sitter.ts index f0bd4b7c..29159e2a 100644 --- a/src/extraction/tree-sitter.ts +++ b/src/extraction/tree-sitter.ts @@ -18,7 +18,6 @@ import { import { getParser, detectLanguage, isLanguageSupported } from './grammars'; import { generateNodeId, getNodeText, getChildByField, getPrecedingDocstring } from './tree-sitter-helpers'; import type { LanguageExtractor, ExtractorContext } from './tree-sitter-types'; -import { EXTRACTORS } from './languages'; import { getLanguageDefByName } from './languages/registry'; // Re-export for backward compatibility @@ -113,7 +112,10 @@ export class TreeSitterExtractor { this.filePath = filePath; this.source = source; this.language = language || detectLanguage(filePath, source); - this.extractor = EXTRACTORS[this.language] || null; + // Single source of truth: read the extractor straight off the + // language def so adding a new grammar-backed language is a + // one-file change (no parallel EXTRACTORS map to keep in sync). + this.extractor = getLanguageDefByName(this.language)?.grammar?.extractor ?? null; } /**