From 3ca2f51f39f1ddad1298f4da852c9d64d8e62c38 Mon Sep 17 00:00:00 2001 From: andreinknv Date: Sun, 26 Apr 2026 00:18:35 -0400 Subject: [PATCH 01/25] feat: index files inside git submodules MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `git ls-files` (used for both the initial scan and incremental sync) does not enter submodules — they appear as gitlink entries with their contents invisible. As a result, source files inside submodules were silently skipped during indexing. Both file-discovery paths now recurse into active submodules: - getGitVisibleFiles (full index) enumerates active submodules via `git submodule foreach --recursive --quiet 'echo "$displaypath"'` and runs `git ls-files -co --exclude-standard` inside each, prefixing the submodule path so files are reported relative to the parent root. - getGitChangedFiles (sync) was refactored to share its status-parsing logic between the parent repo and each submodule. Submodule directory entries that the parent's status emits when a submodule pointer moves (e.g., " m vendor/sub") are filtered out so we don't try to read a directory as a file. Submodule indexing is on by default and can be disabled via `indexSubmodules: false` in CodeGraphConfig — useful for repos with large vendor submodules that should remain unindexed without having to add a path-based exclude. Uninitialized / missing submodules are silently skipped (best-effort enhancement on top of the existing scan). Status output paths are now C-style-unquoted before being used or compared against the submodule directory set, so submodule paths containing spaces or non-ASCII bytes are handled correctly. The parent status command failing still falls back to the full filesystem scan via a null return, preserving the prior contract; only submodule-internal status failures are absorbed silently. Closes #86. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- __tests__/sync.test.ts | 129 ++++++++++++++++++++++++ src/extraction/index.ts | 212 ++++++++++++++++++++++++++++++++++------ src/types.ts | 9 ++ 3 files changed, 322 insertions(+), 28 deletions(-) diff --git a/__tests__/sync.test.ts b/__tests__/sync.test.ts index 8365f630..9a89a8eb 100644 --- a/__tests__/sync.test.ts +++ b/__tests__/sync.test.ts @@ -259,4 +259,133 @@ describe('Sync Module', () => { expect(result.changedFilePaths).toBeUndefined(); }); }); + + describe('Git submodule support', () => { + let parentDir: string; + let submoduleSrc: string; + let cg: CodeGraph; + + function git(cwd: string, ...args: string[]) { + execFileSync('git', args, { cwd, stdio: 'pipe' }); + } + + beforeEach(async () => { + parentDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-submod-parent-')); + submoduleSrc = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-submod-src-')); + + // Build the submodule's source repo first. + git(submoduleSrc, 'init'); + git(submoduleSrc, 'config', 'user.email', 'test@test.com'); + git(submoduleSrc, 'config', 'user.name', 'Test'); + fs.writeFileSync( + path.join(submoduleSrc, 'lib.ts'), + `export function fromSubmodule() { return 'sub'; }` + ); + git(submoduleSrc, 'add', '-A'); + git(submoduleSrc, 'commit', '-m', 'submodule initial'); + + // Build the parent repo and add the submodule. + git(parentDir, 'init'); + git(parentDir, 'config', 'user.email', 'test@test.com'); + git(parentDir, 'config', 'user.name', 'Test'); + + const parentSrc = path.join(parentDir, 'src'); + fs.mkdirSync(parentSrc); + fs.writeFileSync( + path.join(parentSrc, 'main.ts'), + `export function fromParent() { return 'parent'; }` + ); + + // git >= 2.38 blocks file:// submodule sources by default + // (CVE-2022-39253). Pass via -c so it applies to this command only. 
+ git(parentDir, '-c', 'protocol.file.allow=always', 'submodule', 'add', submoduleSrc, 'vendor/sub'); + git(parentDir, 'add', '-A'); + git(parentDir, 'commit', '-m', 'parent initial with submodule'); + + cg = CodeGraph.initSync(parentDir, { + config: { + include: ['**/*.ts'], + exclude: [], + }, + }); + }); + + afterEach(() => { + if (cg) cg.destroy(); + if (fs.existsSync(parentDir)) fs.rmSync(parentDir, { recursive: true, force: true }); + if (fs.existsSync(submoduleSrc)) fs.rmSync(submoduleSrc, { recursive: true, force: true }); + }); + + it('should index files inside a submodule on full index', async () => { + const result = await cg.indexAll(); + + // Both the parent file and the submodule file should be indexed. + expect(result.filesIndexed).toBeGreaterThanOrEqual(2); + const subNodes = cg.searchNodes('fromSubmodule'); + const parentNodes = cg.searchNodes('fromParent'); + expect(subNodes.length).toBeGreaterThan(0); + expect(parentNodes.length).toBeGreaterThan(0); + // The submodule path should be reported relative to the parent root. 
+ expect(subNodes.some((r) => r.node.filePath.startsWith('vendor/sub/'))).toBe(true); + }); + + it('should detect modifications to files inside a submodule via sync', async () => { + await cg.indexAll(); + + fs.writeFileSync( + path.join(parentDir, 'vendor/sub/lib.ts'), + `export function fromSubmodule() { return 'changed'; }` + ); + + const result = await cg.sync(); + + expect(result.filesModified).toBe(1); + expect(result.changedFilePaths).toContain('vendor/sub/lib.ts'); + }); + + it('should detect new untracked files inside a submodule via sync', async () => { + await cg.indexAll(); + + fs.writeFileSync( + path.join(parentDir, 'vendor/sub/newfile.ts'), + `export function added() { return 1; }` + ); + + const result = await cg.sync(); + + expect(result.filesAdded).toBe(1); + expect(result.changedFilePaths).toContain('vendor/sub/newfile.ts'); + }); + + it('should not break when a submodule directory is missing or empty', async () => { + // Wipe the submodule contents to mimic an unfetched submodule + // (this isn't a real `git submodule deinit` — that would also unregister + // the submodule from .git/config — but it covers the common "directory exists, + // no .git inside" failure mode). git ls-files inside the empty dir + // errors; the scanner should swallow that and continue with parent files. 
+ fs.rmSync(path.join(parentDir, 'vendor/sub'), { recursive: true, force: true }); + fs.mkdirSync(path.join(parentDir, 'vendor/sub')); + + const result = await cg.indexAll(); + expect(result.errors.filter((e) => e.severity === 'error').length).toBe(0); + expect(cg.searchNodes('fromParent').length).toBeGreaterThan(0); + }); + + it('should skip submodule contents when indexSubmodules is false', async () => { + cg.destroy(); + fs.rmSync(path.join(parentDir, '.codegraph'), { recursive: true, force: true }); + cg = CodeGraph.initSync(parentDir, { + config: { + include: ['**/*.ts'], + exclude: [], + indexSubmodules: false, + }, + }); + + const result = await cg.indexAll(); + expect(cg.searchNodes('fromParent').length).toBeGreaterThan(0); + expect(cg.searchNodes('fromSubmodule').length).toBe(0); + expect(result.filesIndexed).toBe(1); + }); + }); }); diff --git a/src/extraction/index.ts b/src/extraction/index.ts index 4ad056fb..f750e443 100644 --- a/src/extraction/index.ts +++ b/src/extraction/index.ts @@ -123,12 +123,73 @@ export function shouldIncludeFile( return false; } +/** + * Enumerate all initialized submodule paths (recursively), relative to `rootDir`. + * + * Uses `git submodule foreach` so we get exactly the submodules git considers + * active — uninitialized / deinitialized submodules are skipped automatically, + * which is what we want (we can't ls-files inside a directory with no .git). + * + * Returns [] when there are no submodules or when the command fails. Errors + * here are non-fatal: submodule indexing is a best-effort enhancement on top + * of the parent-repo file scan. 
+ */ +function getGitSubmodules(rootDir: string): string[] { + try { + const output = execFileSync( + 'git', + ['submodule', 'foreach', '--recursive', '--quiet', 'echo "$displaypath"'], + { cwd: rootDir, encoding: 'utf-8', timeout: 10000, stdio: ['pipe', 'pipe', 'pipe'] } + ); + const paths: string[] = []; + for (const line of output.split('\n')) { + const trimmed = line.trim(); + if (trimmed) paths.push(normalizePath(trimmed)); + } + return paths; + } catch { + return []; + } +} + +/** + * Run `git ls-files -co --exclude-standard` inside a submodule and return + * paths prefixed back into the parent repo's relative-path namespace. + * Errors are swallowed so one broken submodule doesn't fail the whole scan. + */ +function getSubmoduleFiles(rootDir: string, submodulePath: string): string[] { + try { + const output = execFileSync( + 'git', + ['ls-files', '-co', '--exclude-standard'], + { + cwd: path.join(rootDir, submodulePath), + encoding: 'utf-8', + timeout: 30000, + maxBuffer: 50 * 1024 * 1024, + stdio: ['pipe', 'pipe', 'pipe'], + } + ); + const out: string[] = []; + for (const line of output.split('\n')) { + const trimmed = line.trim(); + if (trimmed) out.push(normalizePath(`${submodulePath}/${trimmed}`)); + } + return out; + } catch { + return []; + } +} + /** * Get all files visible to git (tracked + untracked but not ignored). - * Respects .gitignore at all levels (root, subdirectories). + * Respects .gitignore at all levels (root, subdirectories) and recurses + * into git submodules — `git ls-files` itself does not enter submodules, + * so each one is enumerated separately and its paths are prefixed. + * Pass `indexSubmodules: false` in config to skip the submodule walk. * Returns null on failure (non-git project) so callers can fall back. */ -function getGitVisibleFiles(rootDir: string): Set<string> | null { +function getGitVisibleFiles(rootDir: string, config: CodeGraphConfig): Set<string> | null { try { // Check if the project directory is gitignored by a parent repo. 
// When rootDir lives inside a parent git repo that ignores it, @@ -167,6 +228,18 @@ function getGitVisibleFiles(rootDir: string): Set<string> | null { files.add(normalizePath(trimmed)); } } + + // Recurse into submodules: each submodule has its own git index, and the + // parent repo's ls-files only emits the submodule directory entry, not + // the files inside. + if (config.indexSubmodules !== false) { + for (const submodulePath of getGitSubmodules(rootDir)) { + for (const filePath of getSubmoduleFiles(rootDir, submodulePath)) { + files.add(filePath); + } + } + } + return files; } catch { return null; @@ -185,44 +258,127 @@ interface GitChanges { } /** - * Use `git status` to detect changed files instead of scanning every file. - * Returns null on failure so callers fall back to full scan. + * Decode the C-style-quoted path that `git status --porcelain` emits when + * a path contains spaces, control chars, or non-ASCII bytes (the path is + * wrapped in double quotes and individual bytes are escaped, e.g. + * "vendor/caf\303\251/file"). + * Returns the path unchanged if it isn't quoted. 
*/ -function getGitChangedFiles(rootDir: string, config: CodeGraphConfig): GitChanges | null { +function unquoteGitPath(raw: string): string { + if (raw.length < 2 || raw[0] !== '"' || raw[raw.length - 1] !== '"') { + return raw; + } + const body = raw.slice(1, -1); + const bytes: number[] = []; + for (let i = 0; i < body.length; i++) { + const ch = body[i]; + if (ch !== '\\') { + bytes.push(body.charCodeAt(i)); + continue; + } + const next = body[++i]; + if (next === undefined) break; + if (next >= '0' && next <= '7') { + // Octal escape (up to 3 digits) representing a single byte + let octal = next; + let peek = body[i + 1]; + while (octal.length < 3 && peek !== undefined && peek >= '0' && peek <= '7') { + octal += peek; + i++; + peek = body[i + 1]; + } + bytes.push(parseInt(octal, 8)); + } else { + const map: Record<string, number> = { a: 7, b: 8, t: 9, n: 10, v: 11, f: 12, r: 13, '"': 34, '\\': 92 }; + bytes.push(map[next] ?? next.charCodeAt(0)); + } + } + return Buffer.from(bytes).toString('utf-8'); +} + +/** + * Run `git status --porcelain --no-renames` in `cwd` and bucket the entries. + * `pathPrefix`, when non-empty, is prepended to every file path so submodule + * status output can be reported relative to the parent repo's root. + * `submoduleDirs` is the set of paths (relative to the parent root) that + * are themselves submodule directories — the parent repo's status emits + * a single entry per submodule (e.g. ` m sub`), and we ignore those because + * the actual file-level changes are picked up by status runs inside each. + * + * Returns `true` if the command ran successfully (even if the working tree + * was clean), `false` if it failed — callers use this to fall back to a + * full filesystem scan when the parent-repo status is unreliable. 
+ */ +function readGitStatus( + cwd: string, + pathPrefix: string, + submoduleDirs: ReadonlySet<string>, + config: CodeGraphConfig, + buckets: GitChanges, +): boolean { + let output: string; try { - const output = execFileSync( + output = execFileSync( 'git', ['status', '--porcelain', '--no-renames'], - { cwd: rootDir, encoding: 'utf-8', timeout: 10000, stdio: ['pipe', 'pipe', 'pipe'] } + { cwd, encoding: 'utf-8', timeout: 10000, stdio: ['pipe', 'pipe', 'pipe'] } ); + } catch { + return false; + } - const modified: string[] = []; - const added: string[] = []; - const deleted: string[] = []; + for (const line of output.split('\n')) { + if (line.length < 4) continue; // Minimum: "XY file" - for (const line of output.split('\n')) { - if (line.length < 4) continue; // Minimum: "XY file" + const statusCode = line.substring(0, 2); + const rawPath = unquoteGitPath(line.substring(3)); + const filePath = pathPrefix + ? normalizePath(`${pathPrefix}/${rawPath}`) + : normalizePath(rawPath); - const statusCode = line.substring(0, 2); - const filePath = normalizePath(line.substring(3)); + // The submodule directory itself shows up as a status entry in the + // parent repo (e.g. " m sub" when the submodule's HEAD has moved); + // skip it — file-level changes are captured by recursing into the submodule. + if (submoduleDirs.has(filePath)) continue; - // Skip files that don't match include/exclude config - if (!shouldIncludeFile(filePath, config)) continue; + // Skip files that don't match include/exclude config + if (!shouldIncludeFile(filePath, config)) continue; - if (statusCode === '??') { - added.push(filePath); - } else if (statusCode.includes('D')) { - deleted.push(filePath); - } else { - // M, MM, AM, A (staged), etc. — treat as modified - modified.push(filePath); - } + if (statusCode === '??') { + buckets.added.push(filePath); + } else if (statusCode.includes('D')) { + buckets.deleted.push(filePath); + } else { + // M, MM, AM, A (staged), etc. 
— treat as modified + buckets.modified.push(filePath); } + } + return true; +} - return { modified, added, deleted }; - } catch { +/** + * Use `git status` to detect changed files instead of scanning every file. + * Returns null on failure so callers fall back to full scan. + * + * Recurses into git submodules: status inside the parent repo only emits + * a directory-level entry for a changed submodule, so we additionally run + * status inside each active submodule to pick up file-level changes. + * Submodule status failures are non-fatal — only a parent-repo failure + * triggers the full-scan fallback. + */ +function getGitChangedFiles(rootDir: string, config: CodeGraphConfig): GitChanges | null { + const submodules = config.indexSubmodules === false ? [] : getGitSubmodules(rootDir); + const submoduleDirs = new Set(submodules); + const buckets: GitChanges = { modified: [], added: [], deleted: [] }; + + if (!readGitStatus(rootDir, '', submoduleDirs, config, buckets)) { return null; } + for (const submodulePath of submodules) { + readGitStatus(path.join(rootDir, submodulePath), submodulePath, submoduleDirs, config, buckets); + } + + return buckets; } /** @@ -243,7 +399,7 @@ export function scanDirectory( onProgress?: (current: number, file: string) => void ): string[] { // Fast path: use git to get all visible files (respects .gitignore everywhere) - const gitFiles = getGitVisibleFiles(rootDir); + const gitFiles = getGitVisibleFiles(rootDir, config); if (gitFiles) { const files: string[] = []; let count = 0; @@ -270,7 +426,7 @@ export async function scanDirectoryAsync( config: CodeGraphConfig, onProgress?: (current: number, file: string) => void ): Promise<string[]> { - const gitFiles = getGitVisibleFiles(rootDir); + const gitFiles = getGitVisibleFiles(rootDir, config); if (gitFiles) { const files: string[] = []; let count = 0; diff --git a/src/types.ts b/src/types.ts index 6834483d..f8cbf783 100644 --- a/src/types.ts +++ b/src/types.ts @@ -465,6 +465,14 @@ export interface 
CodeGraphConfig { /** Whether to track call sites */ trackCallSites: boolean; + /** + * Whether to recurse into git submodules during indexing and sync. + * Default: true. Set to false to skip submodule contents (useful when + * a submodule pulls in a large vendor tree you don't want indexed — + * adding the path to `exclude` also works). + */ + indexSubmodules?: boolean; + /** Custom symbol patterns to extract */ customPatterns?: { /** Name for this pattern group */ @@ -675,6 +683,7 @@ export const DEFAULT_CONFIG: CodeGraphConfig = { maxFileSize: 1024 * 1024, // 1MB extractDocstrings: true, trackCallSites: true, + indexSubmodules: true, }; // ============================================================================= From 05eb7a53cb84d921e37e425958dfb3affd463406 Mon Sep 17 00:00:00 2001 From: andreinknv Date: Sun, 26 Apr 2026 01:27:34 -0400 Subject: [PATCH 02/25] fix: correctness bugs found in audit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Four user-visible bugs caught by an independent audit pass: 1. Svelte symbols reported on the wrong line src/extraction/svelte-extractor.ts:144 The script-block regex captures content starting with the leading newline that follows `>`, so the inner extractor sees that newline as line 1 of its 1-indexed input and the first real code on line 2. The previous `contentStartLine = scriptTagLine + openingTagLines + 1` was added to that 1-indexed line number, shifting every Svelte symbol's startLine / endLine off by 1. Drops the `+1`. Five regression tests added covering single-line, multi-line opening tag, template-offset, single-line no-newline, and dual module/instance script blocks. 2. Watcher silently dropped pending changes on sync failure src/sync/watcher.ts:177 `hasChanges = false` ran before the sync attempt, so a thrown sync (DB locked, transient FS error) left the pending batch forgotten until a NEW file event arrived. 
Re-set `hasChanges = true` in the catch path so a transient failure schedules a retry on its own. Regression test added (mocks fail-then-succeed, asserts the second call happens without a new file event). 3. Graph traversal default maxDepth was Infinity src/graph/traversal.ts:14, src/types.ts:301 `limit: 1000` capped returned nodes, but during traversal the visited set and BFS/DFS frontier can grow far beyond `limit` on highly connected graphs before the cap kicks in. Default is now 10. Callers who really need exhaustive traversal can still pass `maxDepth: Infinity` explicitly — the JSDoc documents this. This is a public-API behavior change; existing tests pass. Also caps `findPath`'s BFS queue at 100,000 entries (FIND_PATH_MAX_QUEUE) and returns null if exceeded — each entry holds a cloned path array, so on dense graphs the queue could otherwise consume gigabytes before either finding a path or exhausting the search. 4. `findRelevantContext` did not bound caller-supplied limits src/context/index.ts:284 `searchLimit` is multiplied by 5 in `findNodesByExactName` and feeds several other unbounded operations; a caller passing `searchLimit: 1_000_000` would pull millions of rows. Now clamped: searchLimit ∈ [1, 100], maxNodes ∈ [1, 1000], traversalDepth ∈ [0, 10]. Regression test asserts a 1e9 input is bounded. All 387 tests pass serialized; tsc clean. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- __tests__/context.test.ts | 13 ++++++ __tests__/extraction.test.ts | 69 ++++++++++++++++++++++++++++++ __tests__/watcher.test.ts | 30 +++++++++++++ src/context/index.ts | 8 ++++ src/extraction/svelte-extractor.ts | 10 +++-- src/graph/traversal.ts | 23 +++++++++- src/sync/watcher.ts | 12 +++++- src/types.ts | 7 ++- 8 files changed, 165 insertions(+), 7 deletions(-) diff --git a/__tests__/context.test.ts b/__tests__/context.test.ts index 52dae1fe..9a0614aa 100644 --- a/__tests__/context.test.ts +++ b/__tests__/context.test.ts @@ -210,6 +210,19 @@ export function validateEmail(email: string): boolean { expect(result.nodes.size).toBeLessThanOrEqual(5); }); + + it('should clamp absurd searchLimit/maxNodes values to safe upper bounds', async () => { + // Without clamping, the internal `findNodesByExactName` query would + // request `searchLimit * 5` rows — passing 1e9 here would blow out + // memory. The call should complete in normal time and not return more + // than the hard cap on maxNodes (1000). 
+ const result = await cg.findRelevantContext('function', { + searchLimit: 1_000_000_000, + maxNodes: 1_000_000_000, + traversalDepth: 1_000, + }); + expect(result.nodes.size).toBeLessThanOrEqual(1000); + }); }); describe('buildContext()', () => { diff --git a/__tests__/extraction.test.ts b/__tests__/extraction.test.ts index 8a70ffed..a6fd7687 100644 --- a/__tests__/extraction.test.ts +++ b/__tests__/extraction.test.ts @@ -3079,3 +3079,72 @@ describe('Directory Exclusion', () => { expect(files.every((f) => !f.includes('vendor'))).toBe(true); }); }); + +// ============================================================================= +// Svelte line-number regressions (audit fix) +// ============================================================================= + +describe('Svelte line numbering', () => { + it('reports symbol line numbers relative to the .svelte file, not the script content', () => { + // Line 1: + const code = `\n`; + const result = extractFromSource('Comp.svelte', code); + const fn = result.nodes.find((n) => n.kind === 'function' && n.name === 'add'); + expect(fn).toBeDefined(); + expect(fn?.startLine).toBe(2); + }); + + it('handles multi-line opening tags (script with attributes wrapped)', () => { + // Line 1: + const code = `\nfunction greet() { return "hi"; }\n\n`; + const result = extractFromSource('Comp.svelte', code); + const fn = result.nodes.find((n) => n.kind === 'function' && n.name === 'greet'); + expect(fn).toBeDefined(); + expect(fn?.startLine).toBe(3); + }); + + it('preserves correct line numbers when the script block is offset by template lines', () => { + // Line 1:

Hello

+ // Line 2: + // Line 3: + const code = `

Hello

\n\n\n`; + const result = extractFromSource('Comp.svelte', code); + const fn = result.nodes.find((n) => n.kind === 'function' && n.name === 'bottom'); + expect(fn).toBeDefined(); + expect(fn?.startLine).toBe(4); + }); + + it('handles a single-line script block with no internal newline', () => { + // Line 1: + const code = `\n`; + const result = extractFromSource('Comp.svelte', code); + const fn = result.nodes.find((n) => n.kind === 'function' && n.name === 'inline'); + expect(fn).toBeDefined(); + expect(fn?.startLine).toBe(1); + }); + + it('attributes each block correctly when a file has both module and instance scripts', () => { + // Line 1: + // Line 4: + // Line 5: + const code = + `\n` + + `\n\n`; + const result = extractFromSource('Comp.svelte', code); + const moduleFn = result.nodes.find((n) => n.kind === 'function' && n.name === 'moduleHelper'); + const instanceFn = result.nodes.find((n) => n.kind === 'function' && n.name === 'instanceHelper'); + expect(moduleFn?.startLine).toBe(2); + expect(instanceFn?.startLine).toBe(6); + }); +}); diff --git a/__tests__/watcher.test.ts b/__tests__/watcher.test.ts index f3638e6d..ee732df6 100644 --- a/__tests__/watcher.test.ts +++ b/__tests__/watcher.test.ts @@ -218,6 +218,36 @@ describe('FileWatcher', () => { watcher.stop(); }); + + it('should retry pending changes after a sync failure (no events lost)', async () => { + // First call rejects, subsequent calls resolve. After the initial + // failure, the watcher should retry the same batch on its own — without + // this, transient sync failures (DB locked etc.) would silently drop the + // changes until a new file event happened. 
+ let calls = 0; + const syncFn = vi.fn().mockImplementation(() => { + calls++; + if (calls === 1) return Promise.reject(new Error('transient')); + return Promise.resolve({ filesChanged: 1, durationMs: 5 }); + }); + const onSyncError = vi.fn(); + const onSyncComplete = vi.fn(); + const watcher = new FileWatcher(testDir, baseConfig, syncFn, { + debounceMs: 100, + onSyncError, + onSyncComplete, + }); + + watcher.start(); + fs.writeFileSync(path.join(testDir, 'src', 'test.ts'), 'export const z = 3;'); + + await waitFor(() => onSyncComplete.mock.calls.length > 0, 5000); + expect(onSyncError).toHaveBeenCalledTimes(1); + expect(syncFn).toHaveBeenCalledTimes(2); + expect(onSyncComplete).toHaveBeenCalledWith({ filesChanged: 1, durationMs: 5 }); + + watcher.stop(); + }); }); describe('CodeGraph integration', () => { diff --git a/src/context/index.ts b/src/context/index.ts index 94192377..08f25657 100644 --- a/src/context/index.ts +++ b/src/context/index.ts @@ -286,6 +286,14 @@ export class ContextBuilder { options: FindRelevantContextOptions = {} ): Promise { const opts = { ...DEFAULT_FIND_OPTIONS, ...options }; + // Bound user-supplied limits — `searchLimit` is multiplied by 5 in + // findNodesByExactName (line 312) and feeds several other unbounded + // operations below, so a request with `searchLimit: 1_000_000` would + // pull millions of rows before any filtering. 100 is well above the + // largest legitimate use we've seen. 
+ opts.searchLimit = Math.min(Math.max(1, opts.searchLimit), 100); + opts.maxNodes = Math.min(Math.max(1, opts.maxNodes), 1000); + opts.traversalDepth = Math.min(Math.max(0, opts.traversalDepth), 10); // Start with empty subgraph const nodes = new Map(); diff --git a/src/extraction/svelte-extractor.ts b/src/extraction/svelte-extractor.ts index 5586ee34..323cbe80 100644 --- a/src/extraction/svelte-extractor.ts +++ b/src/extraction/svelte-extractor.ts @@ -135,13 +135,17 @@ export class SvelteExtractor { // Detect module script const isModule = /context\s*=\s*["']module["']/.test(attrs); - // Calculate start line of the script content (line after - const code = `\n`; - const result = extractFromSource('Comp.svelte', code); - const fn = result.nodes.find((n) => n.kind === 'function' && n.name === 'add'); - expect(fn).toBeDefined(); - expect(fn?.startLine).toBe(2); - }); - - it('handles multi-line opening tags (script with attributes wrapped)', () => { - // Line 1: - const code = `\nfunction greet() { return "hi"; }\n\n`; - const result = extractFromSource('Comp.svelte', code); - const fn = result.nodes.find((n) => n.kind === 'function' && n.name === 'greet'); - expect(fn).toBeDefined(); - expect(fn?.startLine).toBe(3); - }); - - it('preserves correct line numbers when the script block is offset by template lines', () => { - // Line 1:

Hello

- // Line 2: - // Line 3: - const code = `

Hello

\n\n\n`; - const result = extractFromSource('Comp.svelte', code); - const fn = result.nodes.find((n) => n.kind === 'function' && n.name === 'bottom'); - expect(fn).toBeDefined(); - expect(fn?.startLine).toBe(4); - }); - - it('handles a single-line script block with no internal newline', () => { - // Line 1: - const code = `\n`; - const result = extractFromSource('Comp.svelte', code); - const fn = result.nodes.find((n) => n.kind === 'function' && n.name === 'inline'); - expect(fn).toBeDefined(); - expect(fn?.startLine).toBe(1); - }); - - it('attributes each block correctly when a file has both module and instance scripts', () => { - // Line 1: - // Line 4: - // Line 5: - const code = - `\n` + - `\n\n`; - const result = extractFromSource('Comp.svelte', code); - const moduleFn = result.nodes.find((n) => n.kind === 'function' && n.name === 'moduleHelper'); - const instanceFn = result.nodes.find((n) => n.kind === 'function' && n.name === 'instanceHelper'); - expect(moduleFn?.startLine).toBe(2); - expect(instanceFn?.startLine).toBe(6); +describe('HCL / Terraform Extraction', () => { + describe('Language detection', () => { + it('should detect HCL/Terraform files', () => { + expect(detectLanguage('main.tf')).toBe('hcl'); + expect(detectLanguage('terraform.tfvars')).toBe('hcl'); + expect(detectLanguage('config.hcl')).toBe('hcl'); + }); + + it('should report HCL as supported', () => { + expect(isLanguageSupported('hcl')).toBe(true); + expect(getSupportedLanguages()).toContain('hcl'); + }); + }); + + describe('Block extraction', () => { + it('should extract a resource block as a class node', () => { + const code = `resource "aws_s3_bucket" "logs" { bucket = "my-logs" }`; + const result = extractFromSource('main.tf', code); + + const node = result.nodes.find((n) => n.qualifiedName === 'aws_s3_bucket.logs'); + expect(node).toBeDefined(); + expect(node?.kind).toBe('class'); + expect(node?.name).toBe('aws_s3_bucket.logs'); + expect(node?.language).toBe('hcl'); + 
expect(node?.signature).toBe('resource "aws_s3_bucket" "logs"'); + }); + + it('should extract a data block with `data.` prefix', () => { + const code = `data "aws_caller_identity" "current" {}`; + const result = extractFromSource('main.tf', code); + + const node = result.nodes.find((n) => n.qualifiedName === 'data.aws_caller_identity.current'); + expect(node).toBeDefined(); + expect(node?.kind).toBe('class'); + expect(node?.name).toBe('aws_caller_identity.current'); + }); + + it('should extract a variable block', () => { + const code = `variable "environment" { type = string }`; + const result = extractFromSource('main.tf', code); + + const node = result.nodes.find((n) => n.qualifiedName === 'var.environment'); + expect(node).toBeDefined(); + expect(node?.kind).toBe('variable'); + expect(node?.name).toBe('environment'); + }); + + it('should extract an output block as an export', () => { + const code = `output "vpc_id" { value = "abc" }`; + const result = extractFromSource('main.tf', code); + + const node = result.nodes.find((n) => n.qualifiedName === 'output.vpc_id'); + expect(node).toBeDefined(); + expect(node?.kind).toBe('export'); + expect(node?.name).toBe('vpc_id'); + }); + + it('should extract a module block', () => { + const code = `module "vpc" { source = "terraform-aws-modules/vpc/aws" }`; + const result = extractFromSource('main.tf', code); + + const node = result.nodes.find((n) => n.qualifiedName === 'module.vpc'); + expect(node).toBeDefined(); + expect(node?.kind).toBe('module'); + expect(node?.name).toBe('vpc'); + }); + + it('should extract a provider block as namespace', () => { + const code = `provider "aws" { region = "us-east-1" }`; + const result = extractFromSource('main.tf', code); + + const node = result.nodes.find((n) => n.qualifiedName === 'provider.aws'); + expect(node).toBeDefined(); + expect(node?.kind).toBe('namespace'); + }); + + it('should split a locals block into one constant per attribute', () => { + const code = `locals { + 
bucket_name = "my-bucket" + retention = 30 +}`; + const result = extractFromSource('main.tf', code); + + const bucketName = result.nodes.find((n) => n.qualifiedName === 'local.bucket_name'); + const retention = result.nodes.find((n) => n.qualifiedName === 'local.retention'); + expect(bucketName?.kind).toBe('constant'); + expect(retention?.kind).toBe('constant'); + }); + + it('should connect blocks to the file via contains edges', () => { + const code = `resource "aws_s3_bucket" "logs" {}`; + const result = extractFromSource('main.tf', code); + + const fileNode = result.nodes.find((n) => n.kind === 'file'); + const resourceNode = result.nodes.find((n) => n.qualifiedName === 'aws_s3_bucket.logs'); + expect(fileNode).toBeDefined(); + expect(resourceNode).toBeDefined(); + const containsEdge = result.edges.find( + (e) => e.source === fileNode!.id && e.target === resourceNode!.id && e.kind === 'contains' + ); + expect(containsEdge).toBeDefined(); + }); + }); + + describe('Reference extraction', () => { + it('should extract var.X references', () => { + const code = `resource "aws_s3_bucket" "logs" { bucket = var.bucket_name }`; + const result = extractFromSource('main.tf', code); + + const ref = result.unresolvedReferences.find((r) => r.referenceName === 'var.bucket_name'); + expect(ref).toBeDefined(); + expect(ref?.referenceKind).toBe('references'); + }); + + it('should extract local.X references', () => { + const code = `resource "aws_s3_bucket" "logs" { tags = local.common_tags }`; + const result = extractFromSource('main.tf', code); + + const ref = result.unresolvedReferences.find((r) => r.referenceName === 'local.common_tags'); + expect(ref).toBeDefined(); + }); + + it('should extract module.X references and stop at the module name', () => { + const code = `output "vpc_id" { value = module.vpc.vpc_id }`; + const result = extractFromSource('main.tf', code); + + const ref = result.unresolvedReferences.find((r) => r.referenceName === 'module.vpc'); + 
expect(ref).toBeDefined(); + // Should NOT emit a reference for the trailing attribute + expect(result.unresolvedReferences.find((r) => r.referenceName === 'module.vpc.vpc_id')).toBeUndefined(); + }); + + it('should extract data.T.N references with both labels', () => { + const code = `output "x" { value = data.aws_caller_identity.current.account_id }`; + const result = extractFromSource('main.tf', code); + + const ref = result.unresolvedReferences.find( + (r) => r.referenceName === 'data.aws_caller_identity.current' + ); + expect(ref).toBeDefined(); + }); + + it('should extract resource references as TYPE.NAME', () => { + const code = `resource "aws_s3_bucket_versioning" "v" { bucket = aws_s3_bucket.logs.id }`; + const result = extractFromSource('main.tf', code); + + const ref = result.unresolvedReferences.find((r) => r.referenceName === 'aws_s3_bucket.logs'); + expect(ref).toBeDefined(); + }); + + it('should extract references inside string interpolations', () => { + const code = 'locals { name = "${var.environment}-${random_id.suffix.hex}" }'; + const result = extractFromSource('main.tf', code); + + const names = result.unresolvedReferences.map((r) => r.referenceName); + expect(names).toContain('var.environment'); + expect(names).toContain('random_id.suffix'); + }); + + it('should ignore references to count, each, self, and path', () => { + const code = `resource "aws_instance" "web" { + count = 3 + tags = { Name = "web-\${count.index}", For = each.value, Self = self.id, P = path.module } +}`; + const result = extractFromSource('main.tf', code); + + const names = result.unresolvedReferences.map((r) => r.referenceName); + expect(names.find((n) => n.startsWith('count.'))).toBeUndefined(); + expect(names.find((n) => n.startsWith('each.'))).toBeUndefined(); + expect(names.find((n) => n.startsWith('self.'))).toBeUndefined(); + expect(names.find((n) => n.startsWith('path.'))).toBeUndefined(); + }); + + it('should ignore for-loop iteration variables', () => { + const 
code = `output "ids" { value = [for s in var.subnets : s.id] }`; + const result = extractFromSource('main.tf', code); + + const names = result.unresolvedReferences.map((r) => r.referenceName); + // var.subnets reference comes through, but `s.id` does NOT + expect(names).toContain('var.subnets'); + expect(names.find((n) => n.startsWith('s.'))).toBeUndefined(); + }); + + it('should ignore key/value bindings in for-object expressions', () => { + const code = `locals { tags = { for k, v in var.input : k => "\${v}-suffix" } }`; + const result = extractFromSource('main.tf', code); + + const names = result.unresolvedReferences.map((r) => r.referenceName); + expect(names).toContain('var.input'); + expect(names.find((n) => n === 'k' || n.startsWith('k.'))).toBeUndefined(); + expect(names.find((n) => n === 'v' || n.startsWith('v.'))).toBeUndefined(); + }); + + it('should emit an imports edge for module source', () => { + const code = `module "vpc" { source = "terraform-aws-modules/vpc/aws" }`; + const result = extractFromSource('main.tf', code); + + const importRef = result.unresolvedReferences.find( + (r) => r.referenceKind === 'imports' && r.referenceName === 'terraform-aws-modules/vpc/aws' + ); + expect(importRef).toBeDefined(); + }); + }); + + describe('Robustness', () => { + it('should handle empty files', () => { + const result = extractFromSource('main.tf', ''); + const fileNode = result.nodes.find((n) => n.kind === 'file'); + expect(fileNode).toBeDefined(); + }); + + it('should handle blocks with no body', () => { + const code = `data "aws_caller_identity" "current" {}`; + const result = extractFromSource('main.tf', code); + expect(result.nodes.find((n) => n.qualifiedName === 'data.aws_caller_identity.current')).toBeDefined(); + }); + + it('should walk nested blocks for references without emitting child nodes', () => { + const code = `resource "aws_s3_bucket_versioning" "v" { + bucket = aws_s3_bucket.logs.id + versioning_configuration { + status = 
var.versioning_status + } +}`; + const result = extractFromSource('main.tf', code); + + // Only one block-level node, plus the file + const blockNodes = result.nodes.filter((n) => n.kind === 'class'); + expect(blockNodes.length).toBe(1); + + // References from the nested block should still be captured + const names = result.unresolvedReferences.map((r) => r.referenceName); + expect(names).toContain('aws_s3_bucket.logs'); + expect(names).toContain('var.versioning_status'); + }); }); }); diff --git a/src/extraction/hcl-extractor.ts b/src/extraction/hcl-extractor.ts new file mode 100644 index 00000000..3d810c88 --- /dev/null +++ b/src/extraction/hcl-extractor.ts @@ -0,0 +1,587 @@ +import type { Node as SyntaxNode } from 'web-tree-sitter'; +import { Node, Edge, ExtractionResult, ExtractionError, UnresolvedReference, NodeKind } from '../types'; +import { generateNodeId, getNodeText } from './tree-sitter-helpers'; +import { getParser } from './grammars'; + +/** + * HclExtractor — extracts a Terraform/HCL file into the graph. + * + * HCL is a declarative configuration language: there are no functions, + * classes, or methods. The unit of structure is the **block**: + * + * [