From 4f66d23eee8be40485e66f156a56b79abbcd527d Mon Sep 17 00:00:00 2001 From: andreinknv Date: Sun, 26 Apr 2026 01:43:23 -0400 Subject: [PATCH] fix: defense-in-depth hardening MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bundles seven smaller hardening fixes from the audit pass: 1. Symlink-resistant path validation in sync paths src/extraction/index.ts:1148, 1206, 1293, 1347 src/utils.ts (new validatePathWithinRootReal) getChangedFiles() and sync() now resolve symlinks via realpath before reading, so a regular file swapped for a symlink to outside the project (between scan and read) is rejected rather than followed. The lexical-only validatePathWithinRoot stays for paths that don't yet exist on disk. 2. storeExtractionResult is now atomic src/extraction/index.ts:1018-1098, src/db/queries.ts (new QueryBuilder.transaction) delete + insert + upsert run in a single SQLite transaction, so a process kill mid-write can no longer leave the file's old data wiped while the new data is missing. Nested transactions inside insertNodes/insertEdges/insertUnresolvedRefsBatch/deleteFile work correctly because better-sqlite3 collapses them via SAVEPOINT. 3. recycleWorker is now actually async src/extraction/index.ts:573 Previously declared sync but called via `await`. Now properly awaits worker.terminate() with a 1s timeout (so a wedged WASM worker can't block the caller indefinitely) and force-fires terminate after the timeout to avoid zombie threads. 4. Parse-worker handles unknown message types src/extraction/parse-worker.ts:60-78 When a malformed message arrives with an `id`, the worker sends back a structured error instead of silently dropping it (which previously caused the main thread to block until the per-file timeout). Messages without an id are still ignored — no pending promise to unblock. 5. ReDoS-safe glob matching src/utils.ts (new globToSafeRegex), src/bin/codegraph.ts:1163, src/graph/queries.ts:196 Coalesces runs of `*` so hostile inputs like `*****` map to a single `.*` rather than five chained quantifiers (which would catastrophically backtrack on long inputs). Caps input at 1024 chars. Implementation walks character-by-character with no sentinels — avoids any chance of marker-string collisions with user input. 6. fileUriToPath fallback hardening src/mcp/index.ts:36-40 Catch-block fallback for malformed file:// URIs now runs through path.resolve() so a `file:///../etc/passwd` style input is normalized rather than passed raw to downstream filesystem code. 7. Reference dedup at extraction time + dead-code removal src/extraction/tree-sitter.ts (new dedupeReferences), src/db/queries.ts (deleteUnresolvedByNode removed) A function that calls `foo()` 100 times now produces 1 unresolved reference instead of 100; the resolver collapses duplicates anyway but extraction was wasteful. Removed the never-called deleteUnresolvedByNode helper (FK cascade handles cleanup). 3 new tests for globToSafeRegex; full suite 383/383 passes. Co-Authored-By: Claude Opus 4.7 (1M context) --- __tests__/security.test.ts | 33 +++++++ src/bin/codegraph.ts | 16 ++-- src/db/queries.ts | 22 +++-- src/extraction/index.ts | 153 +++++++++++++++++++++------------ src/extraction/parse-worker.ts | 24 ++++++ src/extraction/tree-sitter.ts | 23 ++++- src/graph/queries.ts | 13 ++- src/mcp/index.ts | 6 +- src/utils.ts | 67 +++++++++++++++ 9 files changed, 272 insertions(+), 85 deletions(-) diff --git a/__tests__/security.test.ts b/__tests__/security.test.ts index 53441d58..1c62e648 100644 --- a/__tests__/security.test.ts +++ b/__tests__/security.test.ts @@ -533,3 +533,36 @@ describe('Symlink Cycle Detection', () => { expect(files).toContain('src/valid.ts'); }); }); + +describe('ReDoS-safe glob matching', () => { + it('coalesces runs of `*` so hostile inputs do not produce nested quantifiers', async () => { + const { globToSafeRegex } = await import('../src/utils'); + // Two or more stars collapse to a single recursive wildcard. This is the + // ReDoS protection: `*****` doesn't expand to `[^/]*[^/]*[^/]*[^/]*[^/]*`, + // which on a long input could catastrophically backtrack. + expect(globToSafeRegex('*****')).toBe('.*'); + expect(globToSafeRegex('**')).toBe('.*'); + + // Even a constructed-from-hostile-input regex matches in linear time. + const regex = new RegExp(`^${globToSafeRegex('*****')}foo$`); + const start = Date.now(); + // 100k 'a's followed by something that doesn't end in 'foo'. + expect(regex.test('a'.repeat(100000) + 'bar')).toBe(false); + expect(Date.now() - start).toBeLessThan(500); + }); + + it('rejects pathologically long glob inputs', async () => { + const { globToSafeRegex } = await import('../src/utils'); + expect(globToSafeRegex('*'.repeat(2000))).toBeNull(); + }); + + it('preserves the standard glob semantics for common patterns', async () => { + const { globToSafeRegex } = await import('../src/utils'); + const body = globToSafeRegex('src/**/*.test.ts'); + expect(body).toBeDefined(); + const regex = new RegExp(`^${body}$`); + expect(regex.test('src/lib/foo.test.ts')).toBe(true); + expect(regex.test('src/lib/foo.ts')).toBe(false); + expect(regex.test('other/src/foo.test.ts')).toBe(false); + }); +}); diff --git a/src/bin/codegraph.ts b/src/bin/codegraph.ts index d118a1fd..44ccc873 100644 --- a/src/bin/codegraph.ts +++ b/src/bin/codegraph.ts @@ -23,6 +23,7 @@ import * as path from 'path'; import * as fs from 'fs'; import { getCodeGraphDir, isInitialized } from '../directory'; import { createShimmerProgress } from '../ui/shimmer-progress'; +import { globToSafeRegex } from '../utils'; // Lazy-load heavy modules (CodeGraph, runInstaller) to keep CLI startup fast. async function loadCodeGraph(): Promise { @@ -1158,16 +1159,15 @@ program /\/spec\//, ]; - // Custom filter pattern + // Custom filter pattern (ReDoS-safe — globToSafeRegex coalesces + // consecutive wildcards so hostile inputs can't produce nested + // quantifiers like `.+.+.+`). let customFilter: RegExp | null = null; if (options.filter) { - // Convert glob to regex: ** → .+, * → [^/]*, . → \. - const regex = options.filter - .replace(/[+[\]{}()^$|\\]/g, '\\$&') - .replace(/\./g, '\\.') - .replace(/\*\*/g, '.+') - .replace(/\*/g, '[^/]*'); - customFilter = new RegExp(regex); + const regexBody = globToSafeRegex(options.filter); + if (regexBody !== null) { + customFilter = new RegExp(regexBody); + } } function isTestFile(filePath: string): boolean { diff --git a/src/db/queries.ts b/src/db/queries.ts index 51f1a1ad..41a27c6b 100644 --- a/src/db/queries.ts +++ b/src/db/queries.ts @@ -170,7 +170,6 @@ export class QueryBuilder { getFileByPath?: SqliteStatement; getAllFiles?: SqliteStatement; insertUnresolved?: SqliteStatement; - deleteUnresolvedByNode?: SqliteStatement; getUnresolvedByName?: SqliteStatement; getNodesByName?: SqliteStatement; getNodesByQualifiedNameExact?: SqliteStatement; @@ -185,6 +184,14 @@ export class QueryBuilder { this.db = db; } + /** + * Execute a callback inside a single SQLite transaction. Useful when a + * caller needs several `QueryBuilder` operations to commit atomically. + */ + transaction(fn: () => T): T { + return this.db.transaction(fn)(); + } + // =========================================================================== // Node Operations // =========================================================================== @@ -1032,17 +1039,8 @@ export class QueryBuilder { insert(); } - /** - * Delete unresolved references from a node - */ - deleteUnresolvedByNode(nodeId: string): void { - if (!this.stmts.deleteUnresolvedByNode) { - this.stmts.deleteUnresolvedByNode = this.db.prepare( - 'DELETE FROM unresolved_refs WHERE from_node_id = ?' - ); - } - this.stmts.deleteUnresolvedByNode.run(nodeId); - } + // (deleteUnresolvedByNode removed — never called; FK cascade on + // nodes(id) → unresolved_refs.from_node_id handles cleanup automatically.) /** * Get unresolved references by name (for resolution) diff --git a/src/extraction/index.ts b/src/extraction/index.ts index 4ad056fb..822c5cc5 100644 --- a/src/extraction/index.ts +++ b/src/extraction/index.ts @@ -20,7 +20,7 @@ import { QueryBuilder } from '../db/queries'; import { extractFromSource } from './tree-sitter'; import { detectLanguage, isLanguageSupported, initGrammars, loadGrammarsForLanguages } from './grammars'; import { logDebug, logWarn } from '../errors'; -import { validatePathWithinRoot, normalizePath } from '../utils'; +import { validatePathWithinRoot, validatePathWithinRootReal, normalizePath } from '../utils'; import picomatch from 'picomatch'; /** @@ -571,14 +571,29 @@ export class ExtractionOrchestrator { * Terminates the current worker and clears the reference so * ensureWorker() will spawn a fresh one on the next call. */ - function recycleWorker(): void { + async function recycleWorker(): Promise { if (!parseWorker) return; log(`Recycling worker after ${workerParseCount} parses (heap: ${Math.round(process.memoryUsage().rss / 1024 / 1024)}MB RSS)`); const w = parseWorker; parseWorker = null; workerParseCount = 0; - // Fire-and-forget: worker.terminate() can hang if WASM is stuck - w.terminate().catch(() => {}); + // worker.terminate() can hang if WASM is stuck — bound the wait so we + // never block the caller's `await` on a wedged worker. The terminate + // promise keeps running in the background so the worker eventually gets + // reaped even if the timeout wins. + let timedOut = false; + try { + await Promise.race([ + w.terminate(), + new Promise((resolve) => setTimeout(() => { timedOut = true; resolve(); }, 1000)), + ]); + } catch { + // ignore — terminate() failing means the worker is already gone + } + if (timedOut) { + // Fire-and-forget: don't leak a zombie if terminate is still pending. + w.terminate().catch(() => {}); + } } async function requestParse(filePath: string, content: string): Promise { @@ -1016,7 +1031,13 @@ export class ExtractionOrchestrator { } /** - * Store extraction result in database + * Store extraction result in database. + * + * The whole sequence (delete existing rows → insert nodes → insert edges → + * insert unresolved refs → upsert file record) runs in a single transaction + * so a process kill mid-write cannot leave the file's old data wiped while + * the new data is missing — either everything from this call commits or + * nothing does. */ private storeExtractionResult( filePath: string, @@ -1033,59 +1054,61 @@ export class ExtractionOrchestrator { return; // No changes } - // Delete existing data for this file - if (existingFile) { - this.queries.deleteFile(filePath); - } - // Filter out nodes with missing required fields before insertion. // This prevents FK violations when edges reference nodes that would // be silently skipped by insertNode() (see issue #42). const validNodes = result.nodes.filter((n) => n.id && n.kind && n.name && n.filePath && n.language); - // Insert nodes - if (validNodes.length > 0) { - this.queries.insertNodes(validNodes); - } + this.queries.transaction(() => { + // Delete existing data for this file + if (existingFile) { + this.queries.deleteFile(filePath); + } - // Filter edges to only reference nodes that were actually inserted - if (result.edges.length > 0) { - const insertedIds = new Set(validNodes.map((n) => n.id)); - const validEdges = result.edges.filter( - (e) => insertedIds.has(e.source) && insertedIds.has(e.target) - ); - if (validEdges.length > 0) { - this.queries.insertEdges(validEdges); + // Insert nodes + if (validNodes.length > 0) { + this.queries.insertNodes(validNodes); } - } - // Insert unresolved references in batch with denormalized filePath/language - if (result.unresolvedReferences.length > 0) { - const insertedIds = new Set(validNodes.map((n) => n.id)); - const refsWithContext = result.unresolvedReferences - .filter((ref) => insertedIds.has(ref.fromNodeId)) - .map((ref) => ({ - ...ref, - filePath: ref.filePath ?? filePath, - language: ref.language ?? language, - })); - if (refsWithContext.length > 0) { - this.queries.insertUnresolvedRefsBatch(refsWithContext); + // Filter edges to only reference nodes that were actually inserted + if (result.edges.length > 0) { + const insertedIds = new Set(validNodes.map((n) => n.id)); + const validEdges = result.edges.filter( + (e) => insertedIds.has(e.source) && insertedIds.has(e.target) + ); + if (validEdges.length > 0) { + this.queries.insertEdges(validEdges); + } } - } - // Insert file record - const fileRecord: FileRecord = { - path: filePath, - contentHash, - language, - size: stats.size, - modifiedAt: stats.mtimeMs, - indexedAt: Date.now(), - nodeCount: result.nodes.length, - errors: result.errors.length > 0 ? result.errors : undefined, - }; - this.queries.upsertFile(fileRecord); + // Insert unresolved references in batch with denormalized filePath/language + if (result.unresolvedReferences.length > 0) { + const insertedIds = new Set(validNodes.map((n) => n.id)); + const refsWithContext = result.unresolvedReferences + .filter((ref) => insertedIds.has(ref.fromNodeId)) + .map((ref) => ({ + ...ref, + filePath: ref.filePath ?? filePath, + language: ref.language ?? language, + })); + if (refsWithContext.length > 0) { + this.queries.insertUnresolvedRefsBatch(refsWithContext); + } + } + + // Insert file record + const fileRecord: FileRecord = { + path: filePath, + contentHash, + language, + size: stats.size, + modifiedAt: stats.mtimeMs, + indexedAt: Date.now(), + nodeCount: result.nodes.length, + errors: result.errors.length > 0 ? result.errors : undefined, + }; + this.queries.upsertFile(fileRecord); + }); } /** @@ -1125,9 +1148,16 @@ export class ExtractionOrchestrator { } } - // Handle modified files — read + hash only these files + // Handle modified files — read + hash only these files. Resolve + // symlinks (validatePathWithinRootReal) so a regular file swapped + // for a symlink to outside the project between scan and read is + // rejected, not followed. for (const filePath of gitChanges.modified) { - const fullPath = path.join(this.rootDir, filePath); + const fullPath = validatePathWithinRootReal(this.rootDir, filePath); + if (!fullPath) { + logWarn('Path traversal blocked during sync', { filePath }); + continue; + } let content: string; try { content = fs.readFileSync(fullPath, 'utf-8'); @@ -1176,9 +1206,13 @@ export class ExtractionOrchestrator { } } - // Find files to add or update + // Find files to add or update (symlink-resistant validation) for (const filePath of currentFiles) { - const fullPath = path.join(this.rootDir, filePath); + const fullPath = validatePathWithinRootReal(this.rootDir, filePath); + if (!fullPath) { + logWarn('Path traversal blocked during sync', { filePath }); + continue; + } let content: string; try { content = fs.readFileSync(fullPath, 'utf-8'); @@ -1260,8 +1294,13 @@ export class ExtractionOrchestrator { } // Modified files — read + hash only these, compare with DB + // (symlink-resistant validation) for (const filePath of gitChanges.modified) { - const fullPath = path.join(this.rootDir, filePath); + const fullPath = validatePathWithinRootReal(this.rootDir, filePath); + if (!fullPath) { + logWarn('Path traversal blocked while detecting changes', { filePath }); + continue; + } let content: string; try { content = fs.readFileSync(fullPath, 'utf-8'); @@ -1309,9 +1348,13 @@ export class ExtractionOrchestrator { } } - // Find added and modified files + // Find added and modified files (symlink-resistant validation) for (const filePath of currentFiles) { - const fullPath = path.join(this.rootDir, filePath); + const fullPath = validatePathWithinRootReal(this.rootDir, filePath); + if (!fullPath) { + logWarn('Path traversal blocked while detecting changes', { filePath }); + continue; + } let content: string; try { content = fs.readFileSync(fullPath, 'utf-8'); diff --git a/src/extraction/parse-worker.ts b/src/extraction/parse-worker.ts index 21b239ca..211cfbf7 100644 --- a/src/extraction/parse-worker.ts +++ b/src/extraction/parse-worker.ts @@ -55,5 +55,29 @@ parentPort!.on('message', async (msg: { type: string; id?: number; filePath?: st } } else if (msg.type === 'shutdown') { parentPort!.postMessage({ type: 'shutdown-ack' }); + } else { + // Unknown message types: when an `id` is present, surface a structured + // error so the in-flight Promise on the main thread fails fast rather + // than blocking until the per-file timeout. Messages without an `id` + // have no pending promise to unblock and are silently ignored — no + // harm done. + const id = msg.id; + if (typeof id === 'number') { + parentPort!.postMessage({ + type: 'parse-result', + id, + result: { + nodes: [], + edges: [], + unresolvedReferences: [], + errors: [{ + message: `Parse worker received unknown message type: ${msg.type}`, + severity: 'error', + code: 'worker_protocol_error', + }], + durationMs: 0, + } satisfies ExtractionResult, + }); + } } }); diff --git a/src/extraction/tree-sitter.ts b/src/extraction/tree-sitter.ts index 7345d91f..2fbda545 100644 --- a/src/extraction/tree-sitter.ts +++ b/src/extraction/tree-sitter.ts @@ -26,6 +26,27 @@ import { DfmExtractor } from './dfm-extractor'; // Re-export for backward compatibility export { generateNodeId } from './tree-sitter-helpers'; +/** + * Deduplicate unresolved references by (fromNodeId, referenceName, + * referenceKind). A function calling `foo()` 100 times pushes 100 refs + * during extraction; the resolver collapses them to one edge eventually + * (edges are unique on `(source, target, kind, line)` and most resolvers + * skip duplicate work), but indexing time and DB churn scale with the + * raw count. Collapsing here keeps the first occurrence's line/column + * (which is typically what users want when "go to call site" surfaces). + */ +function dedupeReferences(refs: UnresolvedReference[]): UnresolvedReference[] { + const seen = new Set(); + const out: UnresolvedReference[] = []; + for (const ref of refs) { + const key = `${ref.fromNodeId}\0${ref.referenceKind}\0${ref.referenceName}`; + if (seen.has(key)) continue; + seen.add(key); + out.push(ref); + } + return out; +} + /** * Extract the name from a node based on language */ @@ -216,7 +237,7 @@ export class TreeSitterExtractor { return { nodes: this.nodes, edges: this.edges, - unresolvedReferences: this.unresolvedReferences, + unresolvedReferences: dedupeReferences(this.unresolvedReferences), errors: this.errors, durationMs: Date.now() - startTime, }; diff --git a/src/graph/queries.ts b/src/graph/queries.ts index c39e2e32..e6d79c51 100644 --- a/src/graph/queries.ts +++ b/src/graph/queries.ts @@ -7,6 +7,7 @@ import { Node, Edge, Context, Subgraph, EdgeKind } from '../types'; import { QueryBuilder } from '../db/queries'; import { GraphTraverser } from './traversal'; +import { globToSafeRegex } from '../utils'; /** * Graph query manager for complex queries @@ -194,13 +195,11 @@ export class GraphQueryManager { * @returns Array of matching nodes */ findByQualifiedName(pattern: string): Node[] { - // Convert glob pattern to regex - const regexPattern = pattern - .replace(/[.+^${}()|[\]\\]/g, '\\$&') - .replace(/\*/g, '.*') - .replace(/\?/g, '.'); - - const regex = new RegExp(`^${regexPattern}$`); + // Convert glob pattern to regex (ReDoS-safe — consecutive wildcards are + // coalesced so hostile inputs can't produce nested quantifiers). + const regexBody = globToSafeRegex(pattern); + if (regexBody === null) return []; + const regex = new RegExp(`^${regexBody}$`); // This is inefficient for large graphs - would need FTS index on qualified_name // For now, use kind-based filtering if possible diff --git a/src/mcp/index.ts b/src/mcp/index.ts index bc3552ae..90f680fe 100644 --- a/src/mcp/index.ts +++ b/src/mcp/index.ts @@ -34,8 +34,10 @@ function fileUriToPath(uri: string): string { } return path.resolve(filePath); } catch { - // Fallback for non-standard URIs - return uri.replace(/^file:\/\/\/?/, ''); + // Fallback for non-standard URIs — still resolve through path.resolve + // so a malformed `file:///../etc/passwd` is normalized rather than + // returned raw to downstream filesystem code. + return path.resolve(uri.replace(/^file:\/\/\/?/, '')); } } diff --git a/src/utils.ts b/src/utils.ts index e75e58e0..22f31232 100644 --- a/src/utils.ts +++ b/src/utils.ts @@ -122,6 +122,36 @@ export function isPathWithinRoot(filePath: string, rootDir: string): boolean { return resolvedPath.startsWith(resolvedRoot + path.sep) || resolvedPath === resolvedRoot; } +/** + * Like validatePathWithinRoot but also resolves symlinks via fs.realpathSync, + * so a regular-looking path that is actually a symlink to outside the root + * is rejected. Returns the resolved real path, or null if the file escapes + * the root or can't be reached. + * + * Costs an extra realpath syscall vs. the lexical-only check, so prefer + * validatePathWithinRoot for hot paths where symlink TOCTOU isn't relevant. + */ +export function validatePathWithinRootReal(projectRoot: string, filePath: string): string | null { + const resolved = path.resolve(projectRoot, filePath); + const normalizedRoot = path.resolve(projectRoot); + if (!resolved.startsWith(normalizedRoot + path.sep) && resolved !== normalizedRoot) { + return null; + } + try { + const realPath = fs.realpathSync(resolved); + const realRoot = fs.realpathSync(normalizedRoot); + if (!realPath.startsWith(realRoot + path.sep) && realPath !== realRoot) { + return null; + } + return realPath; + } catch { + // realpath failures (broken symlink, permissions) — return the lexically- + // resolved path. The downstream readFileSync will fail naturally and the + // caller already handles read errors. + return resolved; + } +} + /** * Like isPathWithinRoot but also resolves symlinks via fs.realpathSync. * @@ -174,6 +204,43 @@ export function normalizePath(filePath: string): string { return filePath.replace(/\\/g, '/'); } +/** + * Convert a simple `*` / `?` / `**` glob to a safe regex source string. + * + * Hardens against catastrophic backtracking: consecutive `*` are coalesced + * to a single wildcard so a hostile input like `*****` doesn't become + * `.*.*.*.*.*` (nested quantifiers blow up on long inputs). Returns null + * if the input would produce a pathologically long pattern. + * + * The output is the *body* of the regex (no anchors); callers add `^` / `$` + * as appropriate for their use case. + */ +export function globToSafeRegex(glob: string): string | null { + if (glob.length > 1024) return null; + // Single pass: walk character-by-character. When we hit a `*` we look + // ahead to coalesce a run, so `**` (any-depth) maps to `.*` and `*` to + // `[^/]*` — and a hostile `*****` collapses to a single `.*` rather + // than five chained quantifiers (which would catastrophically + // backtrack on long inputs). + let out = ''; + for (let i = 0; i < glob.length; i++) { + const ch = glob[i]; + if (ch === '*') { + let runLen = 1; + while (glob[i + runLen] === '*') runLen++; + out += runLen >= 2 ? '.*' : '[^/]*'; + i += runLen - 1; + } else if (ch === '?') { + out += '[^/]'; + } else if (ch && /[.+^${}()|[\]\\]/.test(ch)) { + out += '\\' + ch; + } else if (ch) { + out += ch; + } + } + return out; +} + /** * Cross-process file lock using a lock file with PID tracking. *