From 3ca2f51f39f1ddad1298f4da852c9d64d8e62c38 Mon Sep 17 00:00:00 2001 From: andreinknv Date: Sun, 26 Apr 2026 00:18:35 -0400 Subject: [PATCH 01/45] feat: index files inside git submodules MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `git ls-files` (used for both the initial scan and incremental sync) does not enter submodules — they appear as gitlink entries with their contents invisible. As a result, source files inside submodules were silently skipped during indexing. Both file-discovery paths now recurse into active submodules: - getGitVisibleFiles (full index) enumerates active submodules via `git submodule foreach --recursive --quiet 'echo "$displaypath"'` and runs `git ls-files -co --exclude-standard` inside each, prefixing the submodule path so files are reported relative to the parent root. - getGitChangedFiles (sync) was refactored to share its status-parsing logic between the parent repo and each submodule. Submodule directory entries that the parent's status emits when a submodule pointer moves (e.g., " m vendor/sub") are filtered out so we don't try to read a directory as a file. Submodule indexing is on by default and can be disabled via `indexSubmodules: false` in CodeGraphConfig — useful for repos with large vendor submodules that should remain unindexed without having to add a path-based exclude. Uninitialized / missing submodules are silently skipped (best-effort enhancement on top of the existing scan). Status output paths are now C-style-unquoted before being used or compared against the submodule directory set, so submodule paths containing spaces or non-ASCII bytes are handled correctly. The parent status command failing still falls back to the full filesystem scan via a null return, preserving the prior contract; only submodule-internal status failures are absorbed silently. Closes #86. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- __tests__/sync.test.ts | 129 ++++++++++++++++++++++++ src/extraction/index.ts | 212 ++++++++++++++++++++++++++++++++++------ src/types.ts | 9 ++ 3 files changed, 322 insertions(+), 28 deletions(-) diff --git a/__tests__/sync.test.ts b/__tests__/sync.test.ts index 8365f630..9a89a8eb 100644 --- a/__tests__/sync.test.ts +++ b/__tests__/sync.test.ts @@ -259,4 +259,133 @@ describe('Sync Module', () => { expect(result.changedFilePaths).toBeUndefined(); }); }); + + describe('Git submodule support', () => { + let parentDir: string; + let submoduleSrc: string; + let cg: CodeGraph; + + function git(cwd: string, ...args: string[]) { + execFileSync('git', args, { cwd, stdio: 'pipe' }); + } + + beforeEach(async () => { + parentDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-submod-parent-')); + submoduleSrc = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-submod-src-')); + + // Build the submodule's source repo first. + git(submoduleSrc, 'init'); + git(submoduleSrc, 'config', 'user.email', 'test@test.com'); + git(submoduleSrc, 'config', 'user.name', 'Test'); + fs.writeFileSync( + path.join(submoduleSrc, 'lib.ts'), + `export function fromSubmodule() { return 'sub'; }` + ); + git(submoduleSrc, 'add', '-A'); + git(submoduleSrc, 'commit', '-m', 'submodule initial'); + + // Build the parent repo and add the submodule. + git(parentDir, 'init'); + git(parentDir, 'config', 'user.email', 'test@test.com'); + git(parentDir, 'config', 'user.name', 'Test'); + + const parentSrc = path.join(parentDir, 'src'); + fs.mkdirSync(parentSrc); + fs.writeFileSync( + path.join(parentSrc, 'main.ts'), + `export function fromParent() { return 'parent'; }` + ); + + // git >= 2.38 blocks file:// submodule sources by default + // (CVE-2022-39253). Pass via -c so it applies to this command only. 
+ git(parentDir, '-c', 'protocol.file.allow=always', 'submodule', 'add', submoduleSrc, 'vendor/sub'); + git(parentDir, 'add', '-A'); + git(parentDir, 'commit', '-m', 'parent initial with submodule'); + + cg = CodeGraph.initSync(parentDir, { + config: { + include: ['**/*.ts'], + exclude: [], + }, + }); + }); + + afterEach(() => { + if (cg) cg.destroy(); + if (fs.existsSync(parentDir)) fs.rmSync(parentDir, { recursive: true, force: true }); + if (fs.existsSync(submoduleSrc)) fs.rmSync(submoduleSrc, { recursive: true, force: true }); + }); + + it('should index files inside a submodule on full index', async () => { + const result = await cg.indexAll(); + + // Both the parent file and the submodule file should be indexed. + expect(result.filesIndexed).toBeGreaterThanOrEqual(2); + const subNodes = cg.searchNodes('fromSubmodule'); + const parentNodes = cg.searchNodes('fromParent'); + expect(subNodes.length).toBeGreaterThan(0); + expect(parentNodes.length).toBeGreaterThan(0); + // The submodule path should be reported relative to the parent root. 
+ expect(subNodes.some((r) => r.node.filePath.startsWith('vendor/sub/'))).toBe(true); + }); + + it('should detect modifications to files inside a submodule via sync', async () => { + await cg.indexAll(); + + fs.writeFileSync( + path.join(parentDir, 'vendor/sub/lib.ts'), + `export function fromSubmodule() { return 'changed'; }` + ); + + const result = await cg.sync(); + + expect(result.filesModified).toBe(1); + expect(result.changedFilePaths).toContain('vendor/sub/lib.ts'); + }); + + it('should detect new untracked files inside a submodule via sync', async () => { + await cg.indexAll(); + + fs.writeFileSync( + path.join(parentDir, 'vendor/sub/newfile.ts'), + `export function added() { return 1; }` + ); + + const result = await cg.sync(); + + expect(result.filesAdded).toBe(1); + expect(result.changedFilePaths).toContain('vendor/sub/newfile.ts'); + }); + + it('should not break when a submodule directory is missing or empty', async () => { + // Wipe the submodule contents to mimic an unfetched submodule + // (this isn't a real `git submodule deinit` — that would also remove + // the .gitmodules entry — but it covers the common "directory exists, + // no .git inside" failure mode). git ls-files inside the empty dir + // errors; the scanner should swallow that and continue with parent files. 
+ fs.rmSync(path.join(parentDir, 'vendor/sub'), { recursive: true, force: true }); + fs.mkdirSync(path.join(parentDir, 'vendor/sub')); + + const result = await cg.indexAll(); + expect(result.errors.filter((e) => e.severity === 'error').length).toBe(0); + expect(cg.searchNodes('fromParent').length).toBeGreaterThan(0); + }); + + it('should skip submodule contents when indexSubmodules is false', async () => { + cg.destroy(); + fs.rmSync(path.join(parentDir, '.codegraph'), { recursive: true, force: true }); + cg = CodeGraph.initSync(parentDir, { + config: { + include: ['**/*.ts'], + exclude: [], + indexSubmodules: false, + }, + }); + + const result = await cg.indexAll(); + expect(cg.searchNodes('fromParent').length).toBeGreaterThan(0); + expect(cg.searchNodes('fromSubmodule').length).toBe(0); + expect(result.filesIndexed).toBe(1); + }); + }); }); diff --git a/src/extraction/index.ts b/src/extraction/index.ts index 4ad056fb..f750e443 100644 --- a/src/extraction/index.ts +++ b/src/extraction/index.ts @@ -123,12 +123,73 @@ export function shouldIncludeFile( return false; } +/** + * Enumerate all initialized submodule paths (recursively), relative to `rootDir`. + * + * Uses `git submodule foreach` so we get exactly the submodules git considers + * active — uninitialized / deinitialized submodules are skipped automatically, + * which is what we want (we can't ls-files inside a directory with no .git). + * + * Returns [] when there are no submodules or when the command fails. Errors + * here are non-fatal: submodule indexing is a best-effort enhancement on top + * of the parent-repo file scan. 
+ */ +function getGitSubmodules(rootDir: string): string[] { + try { + const output = execFileSync( + 'git', + ['submodule', 'foreach', '--recursive', '--quiet', 'echo "$displaypath"'], + { cwd: rootDir, encoding: 'utf-8', timeout: 10000, stdio: ['pipe', 'pipe', 'pipe'] } + ); + const paths: string[] = []; + for (const line of output.split('\n')) { + const trimmed = line.trim(); + if (trimmed) paths.push(normalizePath(trimmed)); + } + return paths; + } catch { + return []; + } +} + +/** + * Run `git ls-files -co --exclude-standard` inside a submodule and return + * paths prefixed back into the parent repo's relative-path namespace. + * Errors are swallowed so one broken submodule doesn't fail the whole scan. + */ +function getSubmoduleFiles(rootDir: string, submodulePath: string): string[] { + try { + const output = execFileSync( + 'git', + ['ls-files', '-co', '--exclude-standard'], + { + cwd: path.join(rootDir, submodulePath), + encoding: 'utf-8', + timeout: 30000, + maxBuffer: 50 * 1024 * 1024, + stdio: ['pipe', 'pipe', 'pipe'], + } + ); + const out: string[] = []; + for (const line of output.split('\n')) { + const trimmed = line.trim(); + if (trimmed) out.push(normalizePath(`${submodulePath}/${trimmed}`)); + } + return out; + } catch { + return []; + } +} + /** * Get all files visible to git (tracked + untracked but not ignored). - * Respects .gitignore at all levels (root, subdirectories). + * Respects .gitignore at all levels (root, subdirectories) and recurses + * into git submodules — `git ls-files` itself does not enter submodules, + * so each one is enumerated separately and its paths are prefixed. + * Pass `indexSubmodules: false` in config to skip the submodule walk. * Returns null on failure (non-git project) so callers can fall back. */ -function getGitVisibleFiles(rootDir: string): Set | null { +function getGitVisibleFiles(rootDir: string, config: CodeGraphConfig): Set | null { try { // Check if the project directory is gitignored by a parent repo. 
// When rootDir lives inside a parent git repo that ignores it, @@ -167,6 +228,18 @@ function getGitVisibleFiles(rootDir: string): Set | null { files.add(normalizePath(trimmed)); } } + + // Recurse into submodules: each submodule has its own git index, and the + // parent repo's ls-files only emits the submodule directory entry, not + // the files inside. + if (config.indexSubmodules !== false) { + for (const submodulePath of getGitSubmodules(rootDir)) { + for (const filePath of getSubmoduleFiles(rootDir, submodulePath)) { + files.add(filePath); + } + } + } + return files; } catch { return null; @@ -185,44 +258,127 @@ interface GitChanges { } /** - * Use `git status` to detect changed files instead of scanning every file. - * Returns null on failure so callers fall back to full scan. + * Decode the C-style-quoted path that `git status --porcelain` emits when + * a path contains spaces, control chars, or non-ASCII bytes (the path is + * wrapped in double quotes and individual bytes are escaped, e.g. + * "vendor/my\\040sub/file" + * Returns the path unchanged if it isn't quoted. 
/**
 * Single-letter escapes git uses inside a C-style-quoted path, mapped to the
 * byte each one stands for.
 */
const GIT_PATH_ESCAPES: ReadonlyMap<string, number> = new Map([
  ['a', 0x07],
  ['b', 0x08],
  ['t', 0x09],
  ['n', 0x0a],
  ['v', 0x0b],
  ['f', 0x0c],
  ['r', 0x0d],
  ['"', 0x22],
  ['\\', 0x5c],
]);

/** True when `ch` is exactly one octal digit (`0`-`7`). */
function isOctalDigit(ch: string): boolean {
  return ch.length === 1 && ch >= '0' && ch <= '7';
}

/**
 * Decode the C-style-quoted path that `git status --porcelain` emits for
 * "unusual" paths: the path is wrapped in double quotes and individual bytes
 * are backslash-escaped, e.g. `"caf\303\251.ts"` for `café.ts`.
 *
 * The quoted body is decoded as a byte sequence and then interpreted as UTF-8:
 *  - runs of unescaped characters contribute their UTF-8 encoding. Encoding
 *    whole runs (rather than pushing UTF-16 code units as if they were bytes)
 *    keeps literal non-ASCII characters intact; git leaves those unescaped
 *    when `core.quotePath` is `false` but still quotes the path if it
 *    contains `"`, `\` or a control character;
 *  - `\NNN` (1-3 octal digits) contributes a single byte;
 *  - `\a \b \t \n \v \f \r \" \\` contribute the corresponding byte;
 *  - any other escaped character is kept literally;
 *  - a trailing lone backslash is dropped.
 *
 * @param raw - One path field exactly as printed by git.
 * @returns The decoded path, or `raw` unchanged when it is not wrapped in
 *          double quotes.
 */
function unquoteGitPath(raw: string): string {
  if (raw.length < 2 || raw[0] !== '"' || raw[raw.length - 1] !== '"') {
    return raw;
  }

  const body = raw.slice(1, -1);
  const chunks: Buffer[] = [];
  let pos = 0;

  while (pos < body.length) {
    // Copy everything up to the next backslash as one UTF-8 encoded run.
    // Runs are cut only at backslashes, so surrogate pairs are never split.
    const slash = body.indexOf('\\', pos);
    const literalEnd = slash === -1 ? body.length : slash;
    if (literalEnd > pos) {
      chunks.push(Buffer.from(body.slice(pos, literalEnd), 'utf-8'));
    }
    if (slash === -1) break;

    const codePoint = body.codePointAt(slash + 1);
    if (codePoint === undefined) break; // trailing lone backslash
    const escaped = String.fromCodePoint(codePoint);
    pos = slash + 1 + escaped.length;

    if (isOctalDigit(escaped)) {
      // Octal escape: up to three digits describing a single byte.
      let octal = escaped;
      while (octal.length < 3 && isOctalDigit(body.charAt(pos))) {
        octal += body.charAt(pos);
        pos++;
      }
      chunks.push(Buffer.from([parseInt(octal, 8) & 0xff]));
      continue;
    }

    const byte = GIT_PATH_ESCAPES.get(escaped);
    chunks.push(byte !== undefined ? Buffer.from([byte]) : Buffer.from(escaped, 'utf-8'));
  }

  return Buffer.concat(chunks).toString('utf-8');
}

/**
 * Run `git status --porcelain --no-renames` in `cwd` and bucket the entries.
 * `pathPrefix`, when non-empty, is prepended to every file path so submodule
 * status output can be reported relative to the parent repo's root.
 * `submoduleDirs` is the set of paths (relative to the parent root) that
 * are themselves submodule directories — the parent repo's status emits
 * a single entry per submodule (e.g. ` m sub`), and we ignore those because
 * the actual file-level changes are picked up by status runs inside each.
 *
 * Returns `true` if the command ran successfully (even if the working tree
 * was clean), `false` if it failed — callers use this to fall back to a
 * full filesystem scan when the parent-repo status is unreliable.
 */
+ */ +function readGitStatus( + cwd: string, + pathPrefix: string, + submoduleDirs: ReadonlySet, + config: CodeGraphConfig, + buckets: GitChanges, +): boolean { + let output: string; try { - const output = execFileSync( + output = execFileSync( 'git', ['status', '--porcelain', '--no-renames'], - { cwd: rootDir, encoding: 'utf-8', timeout: 10000, stdio: ['pipe', 'pipe', 'pipe'] } + { cwd, encoding: 'utf-8', timeout: 10000, stdio: ['pipe', 'pipe', 'pipe'] } ); + } catch { + return false; + } - const modified: string[] = []; - const added: string[] = []; - const deleted: string[] = []; + for (const line of output.split('\n')) { + if (line.length < 4) continue; // Minimum: "XY file" - for (const line of output.split('\n')) { - if (line.length < 4) continue; // Minimum: "XY file" + const statusCode = line.substring(0, 2); + const rawPath = unquoteGitPath(line.substring(3)); + const filePath = pathPrefix + ? normalizePath(`${pathPrefix}/${rawPath}`) + : normalizePath(rawPath); - const statusCode = line.substring(0, 2); - const filePath = normalizePath(line.substring(3)); + // The submodule directory itself shows up as a status entry in the + // parent repo (e.g. " m sub" when the submodule's HEAD has moved); + // skip it — file-level changes are captured by recursing into the submodule. + if (submoduleDirs.has(filePath)) continue; - // Skip files that don't match include/exclude config - if (!shouldIncludeFile(filePath, config)) continue; + // Skip files that don't match include/exclude config + if (!shouldIncludeFile(filePath, config)) continue; - if (statusCode === '??') { - added.push(filePath); - } else if (statusCode.includes('D')) { - deleted.push(filePath); - } else { - // M, MM, AM, A (staged), etc. — treat as modified - modified.push(filePath); - } + if (statusCode === '??') { + buckets.added.push(filePath); + } else if (statusCode.includes('D')) { + buckets.deleted.push(filePath); + } else { + // M, MM, AM, A (staged), etc. 
— treat as modified + buckets.modified.push(filePath); } + } + return true; +} - return { modified, added, deleted }; - } catch { +/** + * Use `git status` to detect changed files instead of scanning every file. + * Returns null on failure so callers fall back to full scan. + * + * Recurses into git submodules: status inside the parent repo only emits + * a directory-level entry for a changed submodule, so we additionally run + * status inside each active submodule to pick up file-level changes. + * Submodule status failures are non-fatal — only a parent-repo failure + * triggers the full-scan fallback. + */ +function getGitChangedFiles(rootDir: string, config: CodeGraphConfig): GitChanges | null { + const submodules = config.indexSubmodules === false ? [] : getGitSubmodules(rootDir); + const submoduleDirs = new Set(submodules); + const buckets: GitChanges = { modified: [], added: [], deleted: [] }; + + if (!readGitStatus(rootDir, '', submoduleDirs, config, buckets)) { return null; } + for (const submodulePath of submodules) { + readGitStatus(path.join(rootDir, submodulePath), submodulePath, submoduleDirs, config, buckets); + } + + return buckets; } /** @@ -243,7 +399,7 @@ export function scanDirectory( onProgress?: (current: number, file: string) => void ): string[] { // Fast path: use git to get all visible files (respects .gitignore everywhere) - const gitFiles = getGitVisibleFiles(rootDir); + const gitFiles = getGitVisibleFiles(rootDir, config); if (gitFiles) { const files: string[] = []; let count = 0; @@ -270,7 +426,7 @@ export async function scanDirectoryAsync( config: CodeGraphConfig, onProgress?: (current: number, file: string) => void ): Promise { - const gitFiles = getGitVisibleFiles(rootDir); + const gitFiles = getGitVisibleFiles(rootDir, config); if (gitFiles) { const files: string[] = []; let count = 0; diff --git a/src/types.ts b/src/types.ts index 6834483d..f8cbf783 100644 --- a/src/types.ts +++ b/src/types.ts @@ -465,6 +465,14 @@ export interface 
CodeGraphConfig { /** Whether to track call sites */ trackCallSites: boolean; + /** + * Whether to recurse into git submodules during indexing and sync. + * Default: true. Set to false to skip submodule contents (useful when + * a submodule pulls in a large vendor tree you don't want indexed — + * adding the path to `exclude` also works). + */ + indexSubmodules?: boolean; + /** Custom symbol patterns to extract */ customPatterns?: { /** Name for this pattern group */ @@ -675,6 +683,7 @@ export const DEFAULT_CONFIG: CodeGraphConfig = { maxFileSize: 1024 * 1024, // 1MB extractDocstrings: true, trackCallSites: true, + indexSubmodules: true, }; // ============================================================================= From 31b827c44164dd08ccddceb670d7cf5612bf1ff5 Mon Sep 17 00:00:00 2001 From: andreinknv Date: Sun, 26 Apr 2026 00:57:35 -0400 Subject: [PATCH 02/45] docs: README setup snippets for non-Claude MCP clients Adds a "Using with Other MCP Clients" section to the README with copy-pastable config for opencode, Cursor, LangChain (MultiServerMCPClient), and the Claude Agent SDK, plus a generic stdio-MCP fallback. Each client gets the exact field names it actually expects (the existing README only documented the Claude Code / ~/.claude.json shape). Notes: - The CodeGraph MCP server speaks stdio only, so the LangChain example explicitly passes `transport: "stdio"` (the issue reporter had been trying to use SSE config) and there's a closing note pointing SSE-only clients at supergateway as a bridge. - The generic-fallback section documents the `--path` flag for clients that don't send a `rootUri` in the initialize request. Closes #65, #79. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 106 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 105 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index fd1ffaba..38a86b12 100644 --- a/README.md +++ b/README.md @@ -316,7 +316,7 @@ fi ## MCP Tools -When running as an MCP server, CodeGraph exposes these tools to Claude Code: +When running as an MCP server, CodeGraph exposes these tools to any MCP-compatible AI assistant: | Tool | Purpose | |------|---------| @@ -331,6 +331,110 @@ When running as an MCP server, CodeGraph exposes these tools to Claude Code: --- +## Using with Other MCP Clients + +The MCP server runs over **stdio** and works with any MCP-compatible client — not just Claude Code. The interactive installer is Claude Code-specific (it writes `~/.claude.json`), so for other clients you'll want the manual setup. + +**Common steps for every client:** + +```bash +npm install -g @colbymchenry/codegraph # so `codegraph` is on PATH +cd your-project +codegraph init -i # initialize + index this project +``` + +Then point your MCP client at `codegraph serve --mcp` using whatever config shape it expects: + +### opencode + +In `opencode.json` (project) or `~/.config/opencode/opencode.json` (global): + +```json +{ + "$schema": "https://opencode.ai/config.json", + "mcp": { + "codegraph": { + "type": "local", + "command": ["codegraph", "serve", "--mcp"], + "enabled": true + } + } +} +``` + +### Cursor + +In `~/.cursor/mcp.json` (global) or `.cursor/mcp.json` (project): + +```json +{ + "mcpServers": { + "codegraph": { + "command": "codegraph", + "args": ["serve", "--mcp"] + } + } +} +``` + +### LangChain (`MultiServerMCPClient`) + +The CodeGraph server speaks stdio, not SSE — pass `transport: "stdio"`: + +```python +from langchain_mcp_adapters.client import MultiServerMCPClient + +client = MultiServerMCPClient({ + "codegraph": { + "command": "codegraph", + "args": ["serve", "--mcp"], + "transport": "stdio", + } +}) 
+tools = await client.get_tools() +``` + +### Claude Agent SDK + +Pass the server in `mcpServers` (TypeScript) or `mcp_servers` (Python) when calling `query()`: + +```python +from claude_agent_sdk import query, ClaudeAgentOptions + +options = ClaudeAgentOptions( + mcp_servers={ + "codegraph": { + "command": "codegraph", + "args": ["serve", "--mcp"], + } + }, + allowed_tools=["mcp__codegraph__*"], +) + +async for message in query(prompt="Where is auth handled?", options=options): + ... +``` + +### Anything else (generic stdio MCP) + +Most MCP clients (Continue, Zed, custom integrations, etc.) accept some variation of `command` + `args`. The values are always: + +| Field | Value | +|-------|-------| +| Command | `codegraph` | +| Args | `["serve", "--mcp"]` | +| Transport | `stdio` | + +The server reads the project root from the MCP `initialize` request's `rootUri` (set by the client when it connects). If your client doesn't send a `rootUri`, pass the project path explicitly: + +```bash +codegraph serve --mcp --path /absolute/path/to/project +``` + +> **Note:** CodeGraph's MCP server does **not** speak SSE/HTTP. If your client only supports `url` + `transport: "sse"`, you'll need to wrap stdio with a bridge like [supergateway](https://github.com/supercorp-ai/supergateway). + +--- + ## Library Usage ```typescript From 7ec4fc5fe3f583b405a6d7b5aa74fd3deb5fddbc Mon Sep 17 00:00:00 2001 From: andreinknv Date: Sun, 26 Apr 2026 01:04:22 -0400 Subject: [PATCH 03/45] docs: cookbook for adding a new language MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds docs/ADDING-A-LANGUAGE.md walking through every step a contributor needs to add a new language extractor: 1. Source a tree-sitter wasm grammar — covers the three real-world paths (already in tree-sitter-wasms, pre-built release artifact, build from source via tree-sitter-cli's bundled wasi-sdk). 2. Probe the AST with a small scratch script before writing code. 3. 
Register in src/types.ts + src/extraction/grammars.ts. 4. Type-check before adding extraction logic. 5. Pick a pattern: LanguageExtractor config (procedural / OO) or a self-contained extractor class (declarative / template / non-OO). 6. Map onto existing NodeKind / EdgeKind values. 7. Tests + end-to-end CLI smoke. 8. PR description checklist. Each section points at the existing extractors as worked examples (R for the OO path, HCL/SQL/Liquid for the custom path, Pascal+DFM for the cross-format case). README.md and CLAUDE.md gain a one-line pointer to the cookbook. Closes #55. Co-Authored-By: Claude Opus 4.7 (1M context) --- CLAUDE.md | 2 + README.md | 2 + docs/ADDING-A-LANGUAGE.md | 463 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 467 insertions(+) create mode 100644 docs/ADDING-A-LANGUAGE.md diff --git a/CLAUDE.md b/CLAUDE.md index 71a50c73..70bdba64 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -100,6 +100,8 @@ SQLite database with: TypeScript, JavaScript, TSX, JSX, Svelte, Python, Go, Rust, Java, C, C++, C#, PHP, Ruby, Swift, Kotlin, Dart, Liquid, Pascal +To add a new language, follow the cookbook at [`docs/ADDING-A-LANGUAGE.md`](docs/ADDING-A-LANGUAGE.md). + ### Node and Edge Types **NodeKind**: `file`, `module`, `class`, `struct`, `interface`, `trait`, `protocol`, `function`, `method`, `property`, `field`, `variable`, `constant`, `enum`, `enum_member`, `type_alias`, `namespace`, `parameter`, `import`, `export`, `route`, `component` diff --git a/README.md b/README.md index fd1ffaba..73e7a7b3 100644 --- a/README.md +++ b/README.md @@ -402,6 +402,8 @@ The `.codegraph/config.json` file controls indexing: | Liquid | `.liquid` | Full support | | Pascal / Delphi | `.pas`, `.dpr`, `.dpk`, `.lpr` | Full support (classes, records, interfaces, enums, DFM/FMX form files) | +Want to add another language? 
See [`docs/ADDING-A-LANGUAGE.md`](docs/ADDING-A-LANGUAGE.md) — it walks through sourcing a tree-sitter grammar, probing the AST, choosing between the OO and self-contained extractor patterns, and the worked examples in the existing extractors. + ## Troubleshooting **"CodeGraph not initialized"** — Run `codegraph init` in your project directory first. diff --git a/docs/ADDING-A-LANGUAGE.md b/docs/ADDING-A-LANGUAGE.md new file mode 100644 index 00000000..189b0e27 --- /dev/null +++ b/docs/ADDING-A-LANGUAGE.md @@ -0,0 +1,463 @@ +# Adding a Language + +This is a cookbook for adding a new language to CodeGraph. It assumes you have a +working dev setup (`npm install` and `npm test` pass). + +There are two patterns. **Pick the one that matches the language you're adding.** + +| Language shape | Pattern | Examples | +|---|---|---| +| Procedural / OO with named functions, classes, methods | **`LanguageExtractor` config** | `python.ts`, `ruby.ts`, `r.ts` | +| Declarative / template / configuration / no named functions | **Custom extractor class** | `hcl-extractor.ts`, `liquid-extractor.ts`, `sql-extractor.ts` | + +The two patterns share the same setup steps (1–4) and only diverge at the extractor +itself (step 5). + +--- + +## 1. Source a tree-sitter wasm grammar + +CodeGraph parses everything via [`web-tree-sitter`](https://www.npmjs.com/package/web-tree-sitter), +so the grammar has to be available as a `.wasm` file. Three options, in order of +preference: + +### 1a. Already in `tree-sitter-wasms` + +The [`tree-sitter-wasms`](https://www.npmjs.com/package/tree-sitter-wasms) npm package +ships pre-built wasms for 30+ common languages. Check `node_modules/tree-sitter-wasms/out/` +after a fresh install: + +```bash +ls node_modules/tree-sitter-wasms/out/ | grep <language> +``` + +If your grammar is there, you're done with this step — just reference the filename. + +### 1b. A pre-built `.wasm` released somewhere else + +Many grammars publish wasms in their GitHub releases (e.g. 
r-lib/tree-sitter-r) or +in a separate npm package (e.g. `@tree-sitter-grammars/tree-sitter-hcl` ships +`tree-sitter-hcl.wasm` directly in the tarball). + +```bash +# GitHub release +curl -sL -o src/extraction/wasm/tree-sitter-foo.wasm \ + https://github.com/.../releases/download/vX.Y.Z/tree-sitter-foo.wasm + +# Inside an npm tarball +mkdir -p /tmp/foo && cd /tmp/foo +curl -sL https://registry.npmjs.org/tree-sitter-foo/-/tree-sitter-foo-X.Y.Z.tgz | tar xz +cp package/tree-sitter-foo.wasm <codegraph-repo>/src/extraction/wasm/ +``` + +Verify the sha256 against the upstream release manifest before committing. + +### 1c. Build from source + +If only the C source is published (e.g. DerekStride/tree-sitter-sql), build the wasm +locally with `tree-sitter-cli`. Recent versions ship their own wasi-sdk and don't need +Docker or local emcc: + +```bash +mkdir /tmp/foo && cd /tmp/foo +curl -sL https://github.com/.../releases/download/vX.Y.Z/tree-sitter-foo.tar.gz | tar xz +npx --yes tree-sitter-cli@latest build --wasm +cp tree-sitter-foo.wasm <codegraph-repo>/src/extraction/wasm/ +``` + +### Where the wasm lives + +- Grammars from the `tree-sitter-wasms` package are loaded directly from there at runtime. +- Other grammars must be **vendored** under `src/extraction/wasm/` so they ship in the + npm package. The build's `copy-assets` script copies every `.wasm` from that + directory into `dist/extraction/wasm/`. + +**License check.** Tree-sitter grammars are usually MIT or Apache-2.0 — confirm before +committing the wasm and note the source/version in the file's header comment so the +provenance is recoverable later. + +--- + +## 2. Probe the AST + +Don't guess at node types. 
Parse a representative sample and dump the tree: + +```js +// scratch/probe.mjs +import { Parser, Language } from 'web-tree-sitter'; +await Parser.init(); +const lang = await Language.load('./src/extraction/wasm/tree-sitter-foo.wasm'); +const parser = new Parser(); +parser.setLanguage(lang); + +const sample = ` +// realistic code here — cover every construct you plan to extract +`; + +const tree = parser.parse(sample); +function dump(n, d = 0, max = 4) { + if (d > max) return; + const text = n.text.length > 60 ? n.text.slice(0, 60).replace(/\n/g, '\\n') + '...' : n.text.replace(/\n/g, '\\n'); + console.log(`${' '.repeat(d)}${n.type} "${text}"`); + for (let i = 0; i < n.namedChildCount; i++) dump(n.namedChild(i), d + 1, max); +} +dump(tree.rootNode); +``` + +```bash +node scratch/probe.mjs +``` + +Cover every construct you plan to extract: function definitions, classes, methods, +imports, assignments, calls, references. Watch for surprises: + +- Some grammars wrap names in extra layers (`identifier > simple_identifier`) +- Field names (`childForFieldName`) often differ from what the docs imply +- Operator nodes can be named, unnamed, or both — call `child(i)` vs `namedChild(i)` + and inspect + +Save the probe output before you start coding — you'll refer to it constantly. + +--- + +## 3. Register the language + +Three files, all small. + +**`src/types.ts`** — add to the `Language` union and to `DEFAULT_CONFIG.include`: + +```ts +export type Language = + | 'typescript' + | ... + | 'foo' // ← add here + | 'unknown'; + +export const DEFAULT_CONFIG: CodeGraphConfig = { + ... + include: [ + ... + '**/*.foo', // ← and here + ], +}; +``` + +**`src/extraction/grammars.ts`** — wire up the wasm path, extension map, and display name: + +```ts +const WASM_GRAMMAR_FILES: Record = { + ... 
+ foo: 'tree-sitter-foo.wasm', +}; + +// If vendored under src/extraction/wasm/ instead of tree-sitter-wasms: +const VENDORED_WASM_LANGUAGES: ReadonlySet<Language> = new Set([ + 'pascal', + 'foo', // ← add here +]); + +export const EXTENSION_MAP: Record<string, Language> = { + ... + '.foo': 'foo', +}; + +// And in getLanguageDisplayName(): +foo: 'Foo', +``` + +**`CLAUDE.md`** — append the language to the "Supported Languages" line so the +LLM-readable architecture doc stays in sync. + +--- + +## 4. Type-check before writing the extractor + +Run `npx tsc --noEmit` now. If it's not clean, the wiring is wrong — fix that +before adding extraction logic, otherwise type errors will pile up. + +--- + +## 5a. Path A — Plug into `LanguageExtractor` + +Use this when the language has named function/class/method declarations (Python, Ruby, +Java, R, etc.). Create `src/extraction/languages/<language>.ts`: + +```ts +import type { LanguageExtractor } from '../tree-sitter-types'; + +export const fooExtractor: LanguageExtractor = { + // Map AST node types → graph kinds. Empty array = "this kind doesn't + // exist in this language." + functionTypes: ['function_definition'], + classTypes: ['class_definition'], + methodTypes: ['function_definition'], // often the same node, dispatched by context + interfaceTypes: [], + structTypes: [], + enumTypes: [], + typeAliasTypes: [], + importTypes: ['import_statement'], + callTypes: ['call'], + variableTypes: ['assignment'], + + // Field names tree-sitter exposes for extractors to read. + nameField: 'name', + bodyField: 'body', + paramsField: 'parameters', + returnField: 'return_type', + + // Optional hooks — implement what you need: + getSignature: (node, source) => { ... }, + isExported: (node, source) => { ... }, + isAsync: (node) => { ... }, + + // Escape hatch: take over a specific node type entirely. Return true to + // tell the core "I handled this, skip default dispatch." 
+ visitNode: (node, ctx) => { + // R uses this to handle `name <- function() {}` because tree-sitter's + // function_definition has no name field — the name is on the LHS of + // the enclosing assignment. + return false; + }, +}; +``` + +Then register it in `src/extraction/languages/index.ts`: + +```ts +import { fooExtractor } from './foo'; + +export const EXTRACTORS: Partial> = { + ... + foo: fooExtractor, +}; +``` + +The core (`TreeSitterExtractor` in `src/extraction/tree-sitter.ts`) does the rest: +walks the AST, dispatches based on your `*Types` arrays, calls your hooks, manages +the scope stack, and emits nodes/edges. + +**Worked example: R** (`src/extraction/languages/r.ts`). R's `function_definition` +has no name (it's anonymous), so `functionTypes` is empty and the `visitNode` hook +intercepts `binary_operator` assignments and emits the function manually via +`ctx.createNode('function', name, ...)`. + +## 5b. Path B — Custom extractor class + +Use this when the language is declarative (HCL, SQL, dbt) or has a fundamentally +different shape than functions/classes/methods (Liquid templates, Pascal `.dfm` form +files). 
Create `src/extraction/-extractor.ts`: + +```ts +import { Node, Edge, ExtractionResult, ExtractionError, UnresolvedReference } from '../types'; +import { generateNodeId, getNodeText } from './tree-sitter-helpers'; +import { getParser } from './grammars'; + +export class FooExtractor { + private filePath: string; + private source: string; + private nodes: Node[] = []; + private edges: Edge[] = []; + private unresolvedReferences: UnresolvedReference[] = []; + private errors: ExtractionError[] = []; + + constructor(filePath: string, source: string) { + this.filePath = filePath; + this.source = source; + } + + extract(): ExtractionResult { + const startTime = Date.now(); + const parser = getParser('foo'); + if (!parser) { + this.errors.push({ message: 'foo grammar not loaded', severity: 'error', code: 'grammar_unavailable' }); + return this.result(startTime); + } + const tree = parser.parse(this.source); + if (!tree) { ... return this.result(startTime); } + + try { + const fileNodeId = this.createFileNode(); + // Walk the AST, emit nodes via this.nodes.push and this.edges.push + // Emit references via this.unresolvedReferences.push so the resolver + // pass can match them across files. + ... + return this.result(startTime); + } finally { + tree.delete(); // ← important: tree-sitter trees back onto WASM memory + } + } + + private result(startTime: number): ExtractionResult { + return { + nodes: this.nodes, + edges: this.edges, + unresolvedReferences: this.unresolvedReferences, + errors: this.errors, + durationMs: Date.now() - startTime, + }; + } +} +``` + +Wire the dispatch in `src/extraction/tree-sitter.ts`: + +```ts +import { FooExtractor } from './foo-extractor'; + +export function extractFromSource(filePath, source, language?) { + ... + if (detectedLanguage === 'foo') { + return new FooExtractor(filePath, source).extract(); + } + ... +} +``` + +**Worked examples:** + +- `src/extraction/hcl-extractor.ts` — Terraform / HCL. Block-based DDL. 
Each + top-level block becomes a node whose qualified name matches the Terraform + reference form (`var.X`, `local.X`, `module.X`, `aws_s3_bucket.foo`) so the + resolver can match references across files automatically. +- `src/extraction/sql-extractor.ts` — SQL DDL. CREATE TABLE / VIEW / FUNCTION / + TRIGGER / TYPE / SCHEMA → graph nodes; foreign keys, view source tables, + trigger target tables and executed functions → edges. +- `src/extraction/liquid-extractor.ts` — Shopify Liquid templates. Regex-based + (no tree-sitter) since the template grammar isn't useful for code intelligence. + +--- + +## 6. Pick `NodeKind` and `EdgeKind` values + +`NodeKind` and `EdgeKind` are fixed unions in `src/types.ts`. Map your language's +constructs onto the closest existing kind rather than introducing new ones — +adding a new kind is a cross-cutting change that touches search, resolution, and +context-building code. + +Common mappings used by recent extractors: + +| Language construct | NodeKind | +|---|---| +| Function / procedure / standalone routine | `function` | +| Method on a class | `method` | +| Class / type / table / declarative resource | `class` | +| Trait / mixin | `trait` | +| Interface / protocol | `interface` | +| Module / package / file-level scope / Terraform module | `module` | +| Namespace / schema / SQL schema / Terraform provider | `namespace` | +| Variable / Terraform variable | `variable` | +| Constant / Terraform local / R top-level binding | `constant` | +| Type alias / SQL composite type | `type_alias` | +| Enum (any) | `enum` | +| Import / library / source / require | `import` | +| Output / re-export / Terraform output | `export` | + +Edges are usually one of: + +| Edge | When | +|---|---| +| `contains` | Parent contains child (file → block, class → method) | +| `calls` | Function/method invokes another | +| `imports` | File pulls in another module/file | +| `references` | Generic mention of another symbol (FK, lookup, attribute access) | +| `extends` 
/ `implements` | Inheritance relationships | + +Emit references through `unresolvedReferences` (with `referenceName` set to a +qualified name that matches what you put on the target node's `qualifiedName`) — +the resolver pass matches them across files using the `name-matcher` and +`import-resolver` modules. + +--- + +## 7. Tests + +Tests live in `__tests__/extraction.test.ts`, grouped by language with a +`describe('<Language> Extraction', ...)` block. Use `extractFromSource` directly +for unit-style tests: + +```ts +import { extractFromSource } from '../src/extraction'; + +describe('Foo Extraction', () => { + describe('Language detection', () => { + it('should detect Foo files', () => { + expect(detectLanguage('main.foo')).toBe('foo'); + }); + }); + + describe('Function extraction', () => { + it('should extract a top-level function', () => { + const code = `function add(a, b) a + b`; + const result = extractFromSource('main.foo', code); + const fn = result.nodes.find((n) => n.kind === 'function' && n.name === 'add'); + expect(fn).toBeDefined(); + }); + }); +}); +``` + +Cover the AST shapes you saw in the probe, especially the surprising ones. Pay +particular attention to: + +- The smallest possible valid program (`expect(...).toBeDefined()` for the file node) +- Each node-kind mapping (one test per emitted kind) +- Reference forms (call edges, FK / cross-file references, imports) +- Anything you intentionally skipped (anonymous lambdas, dynamic imports, etc.) + with a negative assertion so the omission is documented + +Run the suite serialized to avoid the file-watcher tests' parallel flakiness: + +```bash +npx vitest run --no-file-parallelism +``` + +End-to-end smoke test from a fresh fixture before opening the PR: + +```bash +SMOKE=$(mktemp -d) && cat > "$SMOKE/main.foo" <<'EOF' +... realistic input ... 
+EOF +cd "$SMOKE" && git init -q +node <codegraph-repo>/dist/bin/codegraph.js init "$SMOKE" +node <codegraph-repo>/dist/bin/codegraph.js index "$SMOKE" +node <codegraph-repo>/dist/bin/codegraph.js status "$SMOKE" +cd "$SMOKE" && node <codegraph-repo>/dist/bin/codegraph.js query "<symbol-name>" +``` + +The `status` call should report your file under "Files by Language", and `query` +should turn up the symbols you expect at the right line numbers. + +--- + +## 8. Open the PR + +Include in the PR description: + +- The grammar source + version + license + sha256 (if vendored) +- A small worked example showing what gets extracted +- The full test plan (`npm test`, `tsc`, `npm run build`, CLI smoke) +- Any known limitations (constructs not supported, AST quirks, things the grammar + itself can't parse) + +Don't claim support for constructs the grammar can't actually parse — this happens +more often than you'd expect (e.g. `tree-sitter-sql` errors out on `CREATE +PROCEDURE` because procedure-body syntax varies sharply across dialects). Say what +works, say what doesn't, and let reviewers decide. 
+ +--- + +## Reference: existing extractors as templates + +Read these in source order if your language is similar to one of them: + +- **Procedural / OO:** `src/extraction/languages/python.ts` (small, easy to read), + `ruby.ts` (with bare-call detection), `kotlin.ts` (extension functions), + `r.ts` (no `def` keyword — uses `visitNode` hook for assignments) +- **Declarative / config:** `src/extraction/hcl-extractor.ts` (Terraform reference + graph), `sql-extractor.ts` (DDL with FK / view source extraction) +- **Embedded / template:** `src/extraction/svelte-extractor.ts` (delegates to JS + for ` + const code = `\n`; + const result = extractFromSource('Comp.svelte', code); + const fn = result.nodes.find((n) => n.kind === 'function' && n.name === 'add'); + expect(fn).toBeDefined(); + expect(fn?.startLine).toBe(2); + }); + + it('handles multi-line opening tags (script with attributes wrapped)', () => { + // Line 1: + const code = `\nfunction greet() { return "hi"; }\n\n`; + const result = extractFromSource('Comp.svelte', code); + const fn = result.nodes.find((n) => n.kind === 'function' && n.name === 'greet'); + expect(fn).toBeDefined(); + expect(fn?.startLine).toBe(3); + }); + + it('preserves correct line numbers when the script block is offset by template lines', () => { + // Line 1:

Hello

+ // Line 2: + // Line 3: + const code = `

Hello

\n\n\n`; + const result = extractFromSource('Comp.svelte', code); + const fn = result.nodes.find((n) => n.kind === 'function' && n.name === 'bottom'); + expect(fn).toBeDefined(); + expect(fn?.startLine).toBe(4); + }); + + it('handles a single-line script block with no internal newline', () => { + // Line 1: + const code = `\n`; + const result = extractFromSource('Comp.svelte', code); + const fn = result.nodes.find((n) => n.kind === 'function' && n.name === 'inline'); + expect(fn).toBeDefined(); + expect(fn?.startLine).toBe(1); + }); + + it('attributes each block correctly when a file has both module and instance scripts', () => { + // Line 1: + // Line 4: + // Line 5: + const code = + `\n` + + `\n\n`; + const result = extractFromSource('Comp.svelte', code); + const moduleFn = result.nodes.find((n) => n.kind === 'function' && n.name === 'moduleHelper'); + const instanceFn = result.nodes.find((n) => n.kind === 'function' && n.name === 'instanceHelper'); + expect(moduleFn?.startLine).toBe(2); + expect(instanceFn?.startLine).toBe(6); + }); +}); diff --git a/__tests__/watcher.test.ts b/__tests__/watcher.test.ts index f3638e6d..ee732df6 100644 --- a/__tests__/watcher.test.ts +++ b/__tests__/watcher.test.ts @@ -218,6 +218,36 @@ describe('FileWatcher', () => { watcher.stop(); }); + + it('should retry pending changes after a sync failure (no events lost)', async () => { + // First call rejects, subsequent calls resolve. After the initial + // failure, the watcher should retry the same batch on its own — without + // this, transient sync failures (DB locked etc.) would silently drop the + // changes until a new file event happened. 
+ let calls = 0; + const syncFn = vi.fn().mockImplementation(() => { + calls++; + if (calls === 1) return Promise.reject(new Error('transient')); + return Promise.resolve({ filesChanged: 1, durationMs: 5 }); + }); + const onSyncError = vi.fn(); + const onSyncComplete = vi.fn(); + const watcher = new FileWatcher(testDir, baseConfig, syncFn, { + debounceMs: 100, + onSyncError, + onSyncComplete, + }); + + watcher.start(); + fs.writeFileSync(path.join(testDir, 'src', 'test.ts'), 'export const z = 3;'); + + await waitFor(() => onSyncComplete.mock.calls.length > 0, 5000); + expect(onSyncError).toHaveBeenCalledTimes(1); + expect(syncFn).toHaveBeenCalledTimes(2); + expect(onSyncComplete).toHaveBeenCalledWith({ filesChanged: 1, durationMs: 5 }); + + watcher.stop(); + }); }); describe('CodeGraph integration', () => { diff --git a/src/context/index.ts b/src/context/index.ts index 94192377..08f25657 100644 --- a/src/context/index.ts +++ b/src/context/index.ts @@ -286,6 +286,14 @@ export class ContextBuilder { options: FindRelevantContextOptions = {} ): Promise { const opts = { ...DEFAULT_FIND_OPTIONS, ...options }; + // Bound user-supplied limits — `searchLimit` is multiplied by 5 in + // findNodesByExactName (line 312) and feeds several other unbounded + // operations below, so a request with `searchLimit: 1_000_000` would + // pull millions of rows before any filtering. 100 is well above the + // largest legitimate use we've seen. 
+ opts.searchLimit = Math.min(Math.max(1, opts.searchLimit), 100); + opts.maxNodes = Math.min(Math.max(1, opts.maxNodes), 1000); + opts.traversalDepth = Math.min(Math.max(0, opts.traversalDepth), 10); // Start with empty subgraph const nodes = new Map(); diff --git a/src/extraction/svelte-extractor.ts b/src/extraction/svelte-extractor.ts index 5586ee34..323cbe80 100644 --- a/src/extraction/svelte-extractor.ts +++ b/src/extraction/svelte-extractor.ts @@ -135,13 +135,17 @@ export class SvelteExtractor { // Detect module script const isModule = /context\s*=\s*["']module["']/.test(attrs); - // Calculate start line of the script content (line after - const code = `\n`; - const result = extractFromSource('Comp.svelte', code); - const fn = result.nodes.find((n) => n.kind === 'function' && n.name === 'add'); - expect(fn).toBeDefined(); - expect(fn?.startLine).toBe(2); - }); - - it('handles multi-line opening tags (script with attributes wrapped)', () => { - // Line 1: - const code = `\nfunction greet() { return "hi"; }\n\n`; - const result = extractFromSource('Comp.svelte', code); - const fn = result.nodes.find((n) => n.kind === 'function' && n.name === 'greet'); - expect(fn).toBeDefined(); - expect(fn?.startLine).toBe(3); - }); - - it('preserves correct line numbers when the script block is offset by template lines', () => { - // Line 1:

Hello

- // Line 2: - // Line 3: - const code = `

Hello

\n\n\n`; - const result = extractFromSource('Comp.svelte', code); - const fn = result.nodes.find((n) => n.kind === 'function' && n.name === 'bottom'); - expect(fn).toBeDefined(); - expect(fn?.startLine).toBe(4); - }); - - it('handles a single-line script block with no internal newline', () => { - // Line 1: - const code = `\n`; - const result = extractFromSource('Comp.svelte', code); - const fn = result.nodes.find((n) => n.kind === 'function' && n.name === 'inline'); - expect(fn).toBeDefined(); - expect(fn?.startLine).toBe(1); - }); - - it('attributes each block correctly when a file has both module and instance scripts', () => { - // Line 1: - // Line 4: - // Line 5: - const code = - `\n` + - `\n\n`; - const result = extractFromSource('Comp.svelte', code); - const moduleFn = result.nodes.find((n) => n.kind === 'function' && n.name === 'moduleHelper'); - const instanceFn = result.nodes.find((n) => n.kind === 'function' && n.name === 'instanceHelper'); - expect(moduleFn?.startLine).toBe(2); - expect(instanceFn?.startLine).toBe(6); +describe('HCL / Terraform Extraction', () => { + describe('Language detection', () => { + it('should detect HCL/Terraform files', () => { + expect(detectLanguage('main.tf')).toBe('hcl'); + expect(detectLanguage('terraform.tfvars')).toBe('hcl'); + expect(detectLanguage('config.hcl')).toBe('hcl'); + }); + + it('should report HCL as supported', () => { + expect(isLanguageSupported('hcl')).toBe(true); + expect(getSupportedLanguages()).toContain('hcl'); + }); + }); + + describe('Block extraction', () => { + it('should extract a resource block as a class node', () => { + const code = `resource "aws_s3_bucket" "logs" { bucket = "my-logs" }`; + const result = extractFromSource('main.tf', code); + + const node = result.nodes.find((n) => n.qualifiedName === 'aws_s3_bucket.logs'); + expect(node).toBeDefined(); + expect(node?.kind).toBe('class'); + expect(node?.name).toBe('aws_s3_bucket.logs'); + expect(node?.language).toBe('hcl'); + 
expect(node?.signature).toBe('resource "aws_s3_bucket" "logs"'); + }); + + it('should extract a data block with `data.` prefix', () => { + const code = `data "aws_caller_identity" "current" {}`; + const result = extractFromSource('main.tf', code); + + const node = result.nodes.find((n) => n.qualifiedName === 'data.aws_caller_identity.current'); + expect(node).toBeDefined(); + expect(node?.kind).toBe('class'); + expect(node?.name).toBe('aws_caller_identity.current'); + }); + + it('should extract a variable block', () => { + const code = `variable "environment" { type = string }`; + const result = extractFromSource('main.tf', code); + + const node = result.nodes.find((n) => n.qualifiedName === 'var.environment'); + expect(node).toBeDefined(); + expect(node?.kind).toBe('variable'); + expect(node?.name).toBe('environment'); + }); + + it('should extract an output block as an export', () => { + const code = `output "vpc_id" { value = "abc" }`; + const result = extractFromSource('main.tf', code); + + const node = result.nodes.find((n) => n.qualifiedName === 'output.vpc_id'); + expect(node).toBeDefined(); + expect(node?.kind).toBe('export'); + expect(node?.name).toBe('vpc_id'); + }); + + it('should extract a module block', () => { + const code = `module "vpc" { source = "terraform-aws-modules/vpc/aws" }`; + const result = extractFromSource('main.tf', code); + + const node = result.nodes.find((n) => n.qualifiedName === 'module.vpc'); + expect(node).toBeDefined(); + expect(node?.kind).toBe('module'); + expect(node?.name).toBe('vpc'); + }); + + it('should extract a provider block as namespace', () => { + const code = `provider "aws" { region = "us-east-1" }`; + const result = extractFromSource('main.tf', code); + + const node = result.nodes.find((n) => n.qualifiedName === 'provider.aws'); + expect(node).toBeDefined(); + expect(node?.kind).toBe('namespace'); + }); + + it('should split a locals block into one constant per attribute', () => { + const code = `locals { + 
bucket_name = "my-bucket" + retention = 30 +}`; + const result = extractFromSource('main.tf', code); + + const bucketName = result.nodes.find((n) => n.qualifiedName === 'local.bucket_name'); + const retention = result.nodes.find((n) => n.qualifiedName === 'local.retention'); + expect(bucketName?.kind).toBe('constant'); + expect(retention?.kind).toBe('constant'); + }); + + it('should connect blocks to the file via contains edges', () => { + const code = `resource "aws_s3_bucket" "logs" {}`; + const result = extractFromSource('main.tf', code); + + const fileNode = result.nodes.find((n) => n.kind === 'file'); + const resourceNode = result.nodes.find((n) => n.qualifiedName === 'aws_s3_bucket.logs'); + expect(fileNode).toBeDefined(); + expect(resourceNode).toBeDefined(); + const containsEdge = result.edges.find( + (e) => e.source === fileNode!.id && e.target === resourceNode!.id && e.kind === 'contains' + ); + expect(containsEdge).toBeDefined(); + }); + }); + + describe('Reference extraction', () => { + it('should extract var.X references', () => { + const code = `resource "aws_s3_bucket" "logs" { bucket = var.bucket_name }`; + const result = extractFromSource('main.tf', code); + + const ref = result.unresolvedReferences.find((r) => r.referenceName === 'var.bucket_name'); + expect(ref).toBeDefined(); + expect(ref?.referenceKind).toBe('references'); + }); + + it('should extract local.X references', () => { + const code = `resource "aws_s3_bucket" "logs" { tags = local.common_tags }`; + const result = extractFromSource('main.tf', code); + + const ref = result.unresolvedReferences.find((r) => r.referenceName === 'local.common_tags'); + expect(ref).toBeDefined(); + }); + + it('should extract module.X references and stop at the module name', () => { + const code = `output "vpc_id" { value = module.vpc.vpc_id }`; + const result = extractFromSource('main.tf', code); + + const ref = result.unresolvedReferences.find((r) => r.referenceName === 'module.vpc'); + 
expect(ref).toBeDefined(); + // Should NOT emit a reference for the trailing attribute + expect(result.unresolvedReferences.find((r) => r.referenceName === 'module.vpc.vpc_id')).toBeUndefined(); + }); + + it('should extract data.T.N references with both labels', () => { + const code = `output "x" { value = data.aws_caller_identity.current.account_id }`; + const result = extractFromSource('main.tf', code); + + const ref = result.unresolvedReferences.find( + (r) => r.referenceName === 'data.aws_caller_identity.current' + ); + expect(ref).toBeDefined(); + }); + + it('should extract resource references as TYPE.NAME', () => { + const code = `resource "aws_s3_bucket_versioning" "v" { bucket = aws_s3_bucket.logs.id }`; + const result = extractFromSource('main.tf', code); + + const ref = result.unresolvedReferences.find((r) => r.referenceName === 'aws_s3_bucket.logs'); + expect(ref).toBeDefined(); + }); + + it('should extract references inside string interpolations', () => { + const code = 'locals { name = "${var.environment}-${random_id.suffix.hex}" }'; + const result = extractFromSource('main.tf', code); + + const names = result.unresolvedReferences.map((r) => r.referenceName); + expect(names).toContain('var.environment'); + expect(names).toContain('random_id.suffix'); + }); + + it('should ignore references to count, each, self, and path', () => { + const code = `resource "aws_instance" "web" { + count = 3 + tags = { Name = "web-\${count.index}", For = each.value, Self = self.id, P = path.module } +}`; + const result = extractFromSource('main.tf', code); + + const names = result.unresolvedReferences.map((r) => r.referenceName); + expect(names.find((n) => n.startsWith('count.'))).toBeUndefined(); + expect(names.find((n) => n.startsWith('each.'))).toBeUndefined(); + expect(names.find((n) => n.startsWith('self.'))).toBeUndefined(); + expect(names.find((n) => n.startsWith('path.'))).toBeUndefined(); + }); + + it('should ignore for-loop iteration variables', () => { + const 
code = `output "ids" { value = [for s in var.subnets : s.id] }`; + const result = extractFromSource('main.tf', code); + + const names = result.unresolvedReferences.map((r) => r.referenceName); + // var.subnets reference comes through, but `s.id` does NOT + expect(names).toContain('var.subnets'); + expect(names.find((n) => n.startsWith('s.'))).toBeUndefined(); + }); + + it('should ignore key/value bindings in for-object expressions', () => { + const code = `locals { tags = { for k, v in var.input : k => "\${v}-suffix" } }`; + const result = extractFromSource('main.tf', code); + + const names = result.unresolvedReferences.map((r) => r.referenceName); + expect(names).toContain('var.input'); + expect(names.find((n) => n === 'k' || n.startsWith('k.'))).toBeUndefined(); + expect(names.find((n) => n === 'v' || n.startsWith('v.'))).toBeUndefined(); + }); + + it('should emit an imports edge for module source', () => { + const code = `module "vpc" { source = "terraform-aws-modules/vpc/aws" }`; + const result = extractFromSource('main.tf', code); + + const importRef = result.unresolvedReferences.find( + (r) => r.referenceKind === 'imports' && r.referenceName === 'terraform-aws-modules/vpc/aws' + ); + expect(importRef).toBeDefined(); + }); + }); + + describe('Robustness', () => { + it('should handle empty files', () => { + const result = extractFromSource('main.tf', ''); + const fileNode = result.nodes.find((n) => n.kind === 'file'); + expect(fileNode).toBeDefined(); + }); + + it('should handle blocks with no body', () => { + const code = `data "aws_caller_identity" "current" {}`; + const result = extractFromSource('main.tf', code); + expect(result.nodes.find((n) => n.qualifiedName === 'data.aws_caller_identity.current')).toBeDefined(); + }); + + it('should walk nested blocks for references without emitting child nodes', () => { + const code = `resource "aws_s3_bucket_versioning" "v" { + bucket = aws_s3_bucket.logs.id + versioning_configuration { + status = 
var.versioning_status + } +}`; + const result = extractFromSource('main.tf', code); + + // Only one block-level node, plus the file + const blockNodes = result.nodes.filter((n) => n.kind === 'class'); + expect(blockNodes.length).toBe(1); + + // References from the nested block should still be captured + const names = result.unresolvedReferences.map((r) => r.referenceName); + expect(names).toContain('aws_s3_bucket.logs'); + expect(names).toContain('var.versioning_status'); + }); }); }); diff --git a/src/extraction/hcl-extractor.ts b/src/extraction/hcl-extractor.ts new file mode 100644 index 00000000..3d810c88 --- /dev/null +++ b/src/extraction/hcl-extractor.ts @@ -0,0 +1,587 @@ +import type { Node as SyntaxNode } from 'web-tree-sitter'; +import { Node, Edge, ExtractionResult, ExtractionError, UnresolvedReference, NodeKind } from '../types'; +import { generateNodeId, getNodeText } from './tree-sitter-helpers'; +import { getParser } from './grammars'; + +/** + * HclExtractor — extracts a Terraform/HCL file into the graph. + * + * HCL is a declarative configuration language: there are no functions, + * classes, or methods. The unit of structure is the **block**: + * + * [