From 347b6f7d0a1bf87b975d0b1c628b3d9fd96aa5cd Mon Sep 17 00:00:00 2001 From: Daniel Wise Date: Tue, 3 Mar 2026 21:27:14 -0800 Subject: [PATCH] feat(graph): enrich context graph with symbol nodes and semantic edges Add symbol extraction and canonical symbol IDs for TS/JS files, persist symbol/file graph nodes with normalized edge types (defines, references, imports, calls), and gate rollout via symbol enrichment config. Integrate graph enrichment into indexing, extend retrieval graph-hint traversal with graph_node citations, and add migration + validation tests for stability, directionality, compatibility, and volume thresholds. Archive OpenSpec change deeper-context-graph-enrichment and sync the new context-graph-enrichment spec into main specs. --- .../.openspec.yaml | 0 .../design.md | 0 .../proposal.md | 0 .../specs/context-graph-enrichment/spec.md | 0 .../tasks.md | 30 +++ .../deeper-context-graph-enrichment/tasks.md | 30 --- .../specs/context-graph-enrichment/spec.md | 48 ++++ src/context/graph/config.ts | 3 + src/context/graph/extract.ts | 255 ++++++++++++++++++ src/context/graph/persist.ts | 147 ++++++++++ src/context/graph/types.ts | 64 +++++ src/context/indexer/file-index.ts | 43 +++ src/context/indexer/full-index.ts | 1 + src/context/indexer/incremental.ts | 1 + src/context/retrieval/hybrid.ts | 125 ++++++++- .../0003_context_graph_enrichment.sql | 17 ++ tests/graph-enrichment.test.ts | 189 +++++++++++++ 17 files changed, 916 insertions(+), 37 deletions(-) rename openspec/changes/{deeper-context-graph-enrichment => archive/2026-03-04-deeper-context-graph-enrichment}/.openspec.yaml (100%) rename openspec/changes/{deeper-context-graph-enrichment => archive/2026-03-04-deeper-context-graph-enrichment}/design.md (100%) rename openspec/changes/{deeper-context-graph-enrichment => archive/2026-03-04-deeper-context-graph-enrichment}/proposal.md (100%) rename openspec/changes/{deeper-context-graph-enrichment => 
archive/2026-03-04-deeper-context-graph-enrichment}/specs/context-graph-enrichment/spec.md (100%) create mode 100644 openspec/changes/archive/2026-03-04-deeper-context-graph-enrichment/tasks.md delete mode 100644 openspec/changes/deeper-context-graph-enrichment/tasks.md create mode 100644 openspec/specs/context-graph-enrichment/spec.md create mode 100644 src/context/graph/config.ts create mode 100644 src/context/graph/extract.ts create mode 100644 src/context/graph/persist.ts create mode 100644 src/context/graph/types.ts create mode 100644 src/db/migrations/0003_context_graph_enrichment.sql create mode 100644 tests/graph-enrichment.test.ts diff --git a/openspec/changes/deeper-context-graph-enrichment/.openspec.yaml b/openspec/changes/archive/2026-03-04-deeper-context-graph-enrichment/.openspec.yaml similarity index 100% rename from openspec/changes/deeper-context-graph-enrichment/.openspec.yaml rename to openspec/changes/archive/2026-03-04-deeper-context-graph-enrichment/.openspec.yaml diff --git a/openspec/changes/deeper-context-graph-enrichment/design.md b/openspec/changes/archive/2026-03-04-deeper-context-graph-enrichment/design.md similarity index 100% rename from openspec/changes/deeper-context-graph-enrichment/design.md rename to openspec/changes/archive/2026-03-04-deeper-context-graph-enrichment/design.md diff --git a/openspec/changes/deeper-context-graph-enrichment/proposal.md b/openspec/changes/archive/2026-03-04-deeper-context-graph-enrichment/proposal.md similarity index 100% rename from openspec/changes/deeper-context-graph-enrichment/proposal.md rename to openspec/changes/archive/2026-03-04-deeper-context-graph-enrichment/proposal.md diff --git a/openspec/changes/deeper-context-graph-enrichment/specs/context-graph-enrichment/spec.md b/openspec/changes/archive/2026-03-04-deeper-context-graph-enrichment/specs/context-graph-enrichment/spec.md similarity index 100% rename from 
openspec/changes/deeper-context-graph-enrichment/specs/context-graph-enrichment/spec.md rename to openspec/changes/archive/2026-03-04-deeper-context-graph-enrichment/specs/context-graph-enrichment/spec.md diff --git a/openspec/changes/archive/2026-03-04-deeper-context-graph-enrichment/tasks.md b/openspec/changes/archive/2026-03-04-deeper-context-graph-enrichment/tasks.md new file mode 100644 index 0000000..ac8121c --- /dev/null +++ b/openspec/changes/archive/2026-03-04-deeper-context-graph-enrichment/tasks.md @@ -0,0 +1,30 @@ +## 1. Schema and data model updates + +- [x] 1.1 Add symbol node schema/types with canonical symbol identifier fields and source location metadata +- [x] 1.2 Add normalized edge type enum support for `defines`, `references`, `imports`, and `calls` +- [x] 1.3 Add feature-flag or configuration gate for enabling symbol enrichment rollout + +## 2. Symbol extraction pipeline + +- [x] 2.1 Implement symbol extraction for initial supported languages (TypeScript/JavaScript) in the indexing pipeline +- [x] 2.2 Generate deterministic canonical symbol keys (`<repoRoot>::<path>::<kind>::<name>::<rangeHash>`) during extraction +- [x] 2.3 Add extraction diagnostics and partial-failure handling so unsupported constructs do not halt full indexing + +## 3. Relationship edge generation + +- [x] 3.1 Implement `defines` edge generation from file entities to declared symbols +- [x] 3.2 Implement `references` and `calls` edge generation from analyzed source contexts to target symbols when resolvable +- [x] 3.3 Implement `imports` edge generation between importing context and imported symbol/module entities + +## 4. Graph persistence and query integration + +- [x] 4.1 Persist symbol nodes and semantic edges in graph storage with batch write path support +- [x] 4.2 Update graph query/retrieval surfaces to return symbol nodes and semantic edge traversals +- [x] 4.3 Ensure existing file-level graph query contracts remain unchanged when enrichment is enabled + +## 5. 
Validation, testing, and rollout checks + +- [x] 5.1 Add golden fixture tests for symbol extraction counts and canonical identifier stability +- [x] 5.2 Add tests for required edge presence and directionality (`defines`, `references`, `imports`, `calls`) +- [x] 5.3 Add regression tests verifying file-level query compatibility and non-breaking behavior +- [x] 5.4 Add indexing performance/volume checks and acceptance thresholds for enriched graph data diff --git a/openspec/changes/deeper-context-graph-enrichment/tasks.md b/openspec/changes/deeper-context-graph-enrichment/tasks.md deleted file mode 100644 index 9fc2da4..0000000 --- a/openspec/changes/deeper-context-graph-enrichment/tasks.md +++ /dev/null @@ -1,30 +0,0 @@ -## 1. Schema and data model updates - -- [ ] 1.1 Add symbol node schema/types with canonical symbol identifier fields and source location metadata -- [ ] 1.2 Add normalized edge type enum support for `defines`, `references`, `imports`, and `calls` -- [ ] 1.3 Add feature-flag or configuration gate for enabling symbol enrichment rollout - -## 2. Symbol extraction pipeline - -- [ ] 2.1 Implement symbol extraction for initial supported languages (TypeScript/JavaScript) in the indexing pipeline -- [ ] 2.2 Generate deterministic canonical symbol keys (`<repoRoot>::<path>::<kind>::<name>::<rangeHash>`) during extraction -- [ ] 2.3 Add extraction diagnostics and partial-failure handling so unsupported constructs do not halt full indexing - -## 3. Relationship edge generation - -- [ ] 3.1 Implement `defines` edge generation from file entities to declared symbols -- [ ] 3.2 Implement `references` and `calls` edge generation from analyzed source contexts to target symbols when resolvable -- [ ] 3.3 Implement `imports` edge generation between importing context and imported symbol/module entities - -## 4. 
Graph persistence and query integration - -- [ ] 4.1 Persist symbol nodes and semantic edges in graph storage with batch write path support -- [ ] 4.2 Update graph query/retrieval surfaces to return symbol nodes and semantic edge traversals -- [ ] 4.3 Ensure existing file-level graph query contracts remain unchanged when enrichment is enabled - -## 5. Validation, testing, and rollout checks - -- [ ] 5.1 Add golden fixture tests for symbol extraction counts and canonical identifier stability -- [ ] 5.2 Add tests for required edge presence and directionality (`defines`, `references`, `imports`, `calls`) -- [ ] 5.3 Add regression tests verifying file-level query compatibility and non-breaking behavior -- [ ] 5.4 Add indexing performance/volume checks and acceptance thresholds for enriched graph data diff --git a/openspec/specs/context-graph-enrichment/spec.md b/openspec/specs/context-graph-enrichment/spec.md new file mode 100644 index 0000000..88a1f11 --- /dev/null +++ b/openspec/specs/context-graph-enrichment/spec.md @@ -0,0 +1,48 @@ +# context-graph-enrichment Specification + +## Purpose +TBD - created by archiving change deeper-context-graph-enrichment. Update Purpose after archive. +## Requirements +### Requirement: Extract Symbol Inventory During Indexing +The system SHALL extract a symbol inventory for each indexed source file, including supported symbol kinds, canonical symbol identifiers, names, and source locations. 
+ +#### Scenario: Symbols extracted from a supported file +- **WHEN** the indexer processes a supported language file +- **THEN** the graph pipeline records one symbol entry per discovered symbol with deterministic identifier and location metadata + +#### Scenario: Unsupported syntax does not halt indexing +- **WHEN** symbol extraction encounters an unsupported construct in a file +- **THEN** the system continues indexing remaining files and records extraction diagnostics for the affected file + +### Requirement: Persist Semantic Relationship Edges +The system SHALL persist normalized directed relationship edges among graph entities using the enum: `defines`, `references`, `imports`, and `calls`. + +#### Scenario: Definition edge creation +- **WHEN** a file contains a symbol definition +- **THEN** the graph contains a `defines` edge linking the file entity to the symbol entity + +#### Scenario: Reference and call edge creation +- **WHEN** analysis identifies a symbol reference or call site +- **THEN** the graph contains `references` or `calls` edges from the source symbol or file context to the target symbol when resolvable + +### Requirement: Preserve Existing File-Level Graph Behavior +The system SHALL preserve compatibility for existing file-level graph traversal and consumers while symbol enrichment is enabled. + +#### Scenario: Existing consumer query remains valid +- **WHEN** a consumer executes a pre-existing file-level graph query +- **THEN** the query returns results with unchanged contract and does not require symbol-level filters + +### Requirement: Expose Enriched Graph Data to Query Surfaces +The system SHALL expose symbol nodes and semantic edges to graph query surfaces used by retrieval and impact analysis. 
+ +#### Scenario: Query requests symbol relationships +- **WHEN** a graph query requests relationships for a symbol identifier +- **THEN** the query surface returns connected nodes and edges for `defines`, `references`, `imports`, and `calls` relationship types + +### Requirement: Validate Enrichment Quality and Stability +The system SHALL provide automated validation coverage for symbol extraction and relationship edge generation across representative repositories. + +#### Scenario: Regression suite for enrichment +- **WHEN** CI executes graph enrichment tests +- **THEN** the suite verifies expected symbol counts and required edge presence for golden fixtures without regressing file-level behavior + diff --git a/src/context/graph/config.ts b/src/context/graph/config.ts new file mode 100644 index 0000000..0c3facc --- /dev/null +++ b/src/context/graph/config.ts @@ -0,0 +1,3 @@ +export function isSymbolEnrichmentEnabled(): boolean { + return process.env.DUBSBOT_ENABLE_SYMBOL_ENRICHMENT === '1'; +} diff --git a/src/context/graph/extract.ts b/src/context/graph/extract.ts new file mode 100644 index 0000000..13fcd3b --- /dev/null +++ b/src/context/graph/extract.ts @@ -0,0 +1,255 @@ +import { posix } from 'node:path'; +import { buildCanonicalSymbolId, type ExtractedSymbol, type GraphFileExtraction } from './types'; + +const SUPPORTED_EXTENSIONS = new Set(['.ts', '.tsx', '.js', '.jsx', '.mjs', '.cjs']); + +export function canExtractSymbols(path: string): boolean { + const normalized = path.toLowerCase(); + for (const extension of SUPPORTED_EXTENSIONS) { + if (normalized.endsWith(extension)) { + return true; + } + } + return false; +} + +export function extractGraphDataForFile(input: { + repoRoot: string; + path: string; + content: string; +}): GraphFileExtraction { + const normalizedPath = posix.normalize(input.path); + if (!canExtractSymbols(normalizedPath)) { + return { + symbols: [], + edges: [], + diagnostics: [`unsupported-language:${normalizedPath}`], + }; + } + + 
const symbols: ExtractedSymbol[] = []; + const edges: GraphFileExtraction['edges'] = []; + const diagnostics: string[] = []; + const lines = input.content.split('\n'); + const symbolByName = new Map(); + + for (let lineIndex = 0; lineIndex < lines.length; lineIndex += 1) { + const line = lines[lineIndex]; + const lineNumber = lineIndex + 1; + const trimmed = line.trim(); + if (!trimmed) { + continue; + } + + const functionMatch = line.match(/\bfunction\s+([A-Za-z_$][\w$]*)\s*\(/); + if (functionMatch) { + const symbol = makeSymbol({ + repoRoot: input.repoRoot, + path: normalizedPath, + name: functionMatch[1], + kind: 'function', + line, + lineNumber, + }); + addSymbol(symbols, symbolByName, symbol); + continue; + } + + const classMatch = line.match(/\bclass\s+([A-Za-z_$][\w$]*)\b/); + if (classMatch) { + const symbol = makeSymbol({ + repoRoot: input.repoRoot, + path: normalizedPath, + name: classMatch[1], + kind: 'class', + line, + lineNumber, + }); + addSymbol(symbols, symbolByName, symbol); + continue; + } + + const typeMatch = line.match(/\b(?:interface|type)\s+([A-Za-z_$][\w$]*)\b/); + if (typeMatch) { + const symbol = makeSymbol({ + repoRoot: input.repoRoot, + path: normalizedPath, + name: typeMatch[1], + kind: 'type', + line, + lineNumber, + }); + addSymbol(symbols, symbolByName, symbol); + continue; + } + + const constantMatch = line.match(/\b(?:const|let|var)\s+([A-Za-z_$][\w$]*)\b/); + if (constantMatch) { + const symbol = makeSymbol({ + repoRoot: input.repoRoot, + path: normalizedPath, + name: constantMatch[1], + kind: 'constant', + line, + lineNumber, + }); + addSymbol(symbols, symbolByName, symbol); + continue; + } + + const importMatch = line.match(/\bimport\s+(.+)\s+from\s+['"]([^'"]+)['"]/); + if (importMatch) { + const moduleName = importMatch[2]; + const moduleSymbol = makeSymbol({ + repoRoot: input.repoRoot, + path: normalizedPath, + name: `module:${moduleName}`, + kind: 'module', + line, + lineNumber, + }); + addSymbol(symbols, symbolByName, 
moduleSymbol); + + const importedPart = importMatch[1]; + const names = importedPart + .replace(/[{}]/g, ' ') + .split(',') + .map((entry) => entry.trim()) + .map((entry) => entry.split(/\s+as\s+/i).at(-1) ?? entry) + .map((entry) => entry.trim()) + .filter(Boolean); + for (const name of names) { + const importSymbol = makeSymbol({ + repoRoot: input.repoRoot, + path: normalizedPath, + name, + kind: 'import', + line, + lineNumber, + }); + addSymbol(symbols, symbolByName, importSymbol); + edges.push({ + type: 'imports', + sourceKey: fileNodeKey(input.repoRoot, normalizedPath), + targetKey: importSymbol.id, + confidence: 1, + metadata: { module: moduleName }, + }); + } + } + } + + for (const symbol of symbols) { + edges.push({ + type: 'defines', + sourceKey: fileNodeKey(input.repoRoot, normalizedPath), + targetKey: symbol.id, + confidence: 1, + }); + } + + const knownNames = [...symbolByName.keys()]; + for (let lineIndex = 0; lineIndex < lines.length; lineIndex += 1) { + const line = lines[lineIndex]; + const lineNumber = lineIndex + 1; + for (const match of line.matchAll(/\b([A-Za-z_$][\w$]*)\s*\(/g)) { + const callee = match[1]; + const target = symbolByName.get(callee); + if (!target) { + continue; + } + edges.push({ + type: 'calls', + sourceKey: fileNodeKey(input.repoRoot, normalizedPath), + targetKey: target.id, + confidence: 0.7, + metadata: { line: lineNumber }, + }); + } + + for (const name of knownNames) { + if (!line.includes(name)) { + continue; + } + const target = symbolByName.get(name); + if (!target) { + continue; + } + edges.push({ + type: 'references', + sourceKey: fileNodeKey(input.repoRoot, normalizedPath), + targetKey: target.id, + confidence: 0.5, + metadata: { line: lineNumber }, + }); + } + } + + if (symbols.length === 0) { + diagnostics.push(`no-symbols-detected:${normalizedPath}`); + } + + return { + symbols, + edges: dedupeEdges(edges), + diagnostics, + }; +} + +function addSymbol( + symbols: ExtractedSymbol[], + symbolByName: Map, + symbol: 
ExtractedSymbol +): void { + if (symbolByName.has(symbol.name)) { + return; + } + symbols.push(symbol); + symbolByName.set(symbol.name, symbol); +} + +function makeSymbol(input: { + repoRoot: string; + path: string; + name: string; + kind: ExtractedSymbol['kind']; + line: string; + lineNumber: number; +}): ExtractedSymbol { + const startColumn = Math.max(input.line.indexOf(input.name), 0) + 1; + const endColumn = startColumn + input.name.length; + const location = { + startLine: input.lineNumber, + endLine: input.lineNumber, + startColumn, + endColumn, + }; + return { + id: buildCanonicalSymbolId({ + repoRoot: input.repoRoot, + path: input.path, + kind: input.kind, + name: input.name, + location, + }), + name: input.name, + kind: input.kind, + path: input.path, + location, + }; +} + +function fileNodeKey(repoRoot: string, path: string): string { + return `${repoRoot}::${path}::file`; +} + +function dedupeEdges(edges: GraphFileExtraction['edges']): GraphFileExtraction['edges'] { + const map = new Map(); + for (const edge of edges) { + const key = `${edge.type}|${edge.sourceKey}|${edge.targetKey}`; + if (!map.has(key)) { + map.set(key, edge); + } + } + return [...map.values()]; +} diff --git a/src/context/graph/persist.ts b/src/context/graph/persist.ts new file mode 100644 index 0000000..e3b6923 --- /dev/null +++ b/src/context/graph/persist.ts @@ -0,0 +1,147 @@ +import { createHash } from 'node:crypto'; +import type { DubsbotDb } from '../../db/client'; +import { type GraphFileExtraction, SemanticEdgeTypes } from './types'; + +export async function persistGraphEnrichmentForFile(input: { + db: DubsbotDb; + repoRoot: string; + path: string; + extraction: GraphFileExtraction; +}): Promise { + const fileKey = fileNodeKey(input.repoRoot, input.path); + const prefix = `${input.repoRoot}::${input.path}::`; + + await input.db.exec('BEGIN'); + try { + const scopedNodes = await input.db.query<{ id: string }>( + 'SELECT id FROM context_nodes WHERE node_key LIKE $1', + 
[`${prefix}%`] + ); + const scopedNodeIds = scopedNodes.rows.map((row) => row.id); + + if (scopedNodeIds.length > 0) { + await input.db.query( + 'DELETE FROM context_edges WHERE source_node_id = ANY($1::text[]) OR target_node_id = ANY($1::text[])', + [scopedNodeIds] + ); + await input.db.query( + 'DELETE FROM context_nodes WHERE id = ANY($1::text[]) AND node_key <> $2', + [scopedNodeIds, fileKey] + ); + } + + await upsertNode(input.db, { + id: nodeId(fileKey), + type: 'file', + key: fileKey, + payload: { + repoRoot: input.repoRoot, + path: input.path, + }, + }); + + const idsByKey = new Map([[fileKey, nodeId(fileKey)]]); + for (const symbol of input.extraction.symbols) { + const key = symbol.id; + const id = nodeId(key); + idsByKey.set(key, id); + await upsertNode(input.db, { + id, + type: 'symbol', + key, + payload: { + repoRoot: input.repoRoot, + id: symbol.id, + kind: symbol.kind, + name: symbol.name, + path: symbol.path, + location: symbol.location, + diagnostics: symbol.diagnostics ?? [], + }, + }); + } + + for (const edge of input.extraction.edges) { + if (!SemanticEdgeTypes.includes(edge.type)) { + continue; + } + const sourceNodeId = idsByKey.get(edge.sourceKey); + const targetNodeId = idsByKey.get(edge.targetKey); + if (!sourceNodeId || !targetNodeId) { + continue; + } + const id = edgeId(edge.type, sourceNodeId, targetNodeId); + await input.db.query( + `INSERT INTO context_edges (id, source_node_id, target_node_id, edge_type, weight, payload) + VALUES ($1, $2, $3, $4, $5, $6::jsonb) + ON CONFLICT (id) DO UPDATE SET edge_type = EXCLUDED.edge_type, weight = EXCLUDED.weight, payload = EXCLUDED.payload`, + [ + id, + sourceNodeId, + targetNodeId, + edge.type, + edge.confidence ?? 1, + JSON.stringify(edge.metadata ?? 
{}), + ] + ); + } + + await input.db.exec('COMMIT'); + } catch (error) { + await input.db.exec('ROLLBACK'); + throw error; + } +} + +export async function deleteGraphEnrichmentForFile(input: { + db: DubsbotDb; + repoRoot: string; + path: string; +}): Promise { + const prefix = `${input.repoRoot}::${input.path}::`; + const scopedNodes = await input.db.query<{ id: string }>( + 'SELECT id FROM context_nodes WHERE node_key LIKE $1', + [`${prefix}%`] + ); + const scopedNodeIds = scopedNodes.rows.map((row) => row.id); + if (scopedNodeIds.length === 0) { + return; + } + + await input.db.exec('BEGIN'); + try { + await input.db.query( + 'DELETE FROM context_edges WHERE source_node_id = ANY($1::text[]) OR target_node_id = ANY($1::text[])', + [scopedNodeIds] + ); + await input.db.query('DELETE FROM context_nodes WHERE id = ANY($1::text[])', [scopedNodeIds]); + await input.db.exec('COMMIT'); + } catch (error) { + await input.db.exec('ROLLBACK'); + throw error; + } +} + +async function upsertNode( + db: DubsbotDb, + input: { id: string; type: 'file' | 'symbol'; key: string; payload: Record } +): Promise { + await db.query( + `INSERT INTO context_nodes (id, node_type, node_key, payload) + VALUES ($1, $2, $3, $4::jsonb) + ON CONFLICT (node_key) DO UPDATE SET node_type = EXCLUDED.node_type, payload = EXCLUDED.payload, updated_at = NOW()`, + [input.id, input.type, input.key, JSON.stringify(input.payload)] + ); +} + +function fileNodeKey(repoRoot: string, path: string): string { + return `${repoRoot}::${path}::file`; +} + +function nodeId(key: string): string { + return createHash('sha1').update(key).digest('hex'); +} + +function edgeId(type: string, sourceNodeId: string, targetNodeId: string): string { + return createHash('sha1').update(`${type}|${sourceNodeId}|${targetNodeId}`).digest('hex'); +} diff --git a/src/context/graph/types.ts b/src/context/graph/types.ts new file mode 100644 index 0000000..87a26c1 --- /dev/null +++ b/src/context/graph/types.ts @@ -0,0 +1,64 @@ +import { 
createHash } from 'node:crypto'; + +export const SemanticEdgeTypes = ['defines', 'references', 'imports', 'calls'] as const; +export type SemanticEdgeType = (typeof SemanticEdgeTypes)[number]; + +export const GraphNodeTypes = ['file', 'symbol'] as const; +export type GraphNodeType = (typeof GraphNodeTypes)[number]; + +export const SymbolKinds = [ + 'function', + 'class', + 'method', + 'type', + 'constant', + 'module', + 'import', +] as const; +export type SymbolKind = (typeof SymbolKinds)[number]; + +export type SourceLocation = { + startLine: number; + endLine: number; + startColumn: number; + endColumn: number; +}; + +export type ExtractedSymbol = { + id: string; + name: string; + kind: SymbolKind; + path: string; + location: SourceLocation; + diagnostics?: string[]; +}; + +export type GraphSymbolEdge = { + type: SemanticEdgeType; + sourceKey: string; + targetKey: string; + confidence?: number; + metadata?: Record; +}; + +export type GraphFileExtraction = { + symbols: ExtractedSymbol[]; + edges: GraphSymbolEdge[]; + diagnostics: string[]; +}; + +export function buildCanonicalSymbolId(input: { + repoRoot: string; + path: string; + kind: SymbolKind; + name: string; + location: SourceLocation; +}): string { + const rangeHash = createHash('sha1') + .update( + `${input.location.startLine}:${input.location.startColumn}-${input.location.endLine}:${input.location.endColumn}` + ) + .digest('hex') + .slice(0, 12); + return `${input.repoRoot}::${input.path}::${input.kind}::${input.name}::${rangeHash}`; +} diff --git a/src/context/indexer/file-index.ts b/src/context/indexer/file-index.ts index 1386b9c..883e6e1 100644 --- a/src/context/indexer/file-index.ts +++ b/src/context/indexer/file-index.ts @@ -9,6 +9,9 @@ import { type EmbeddingProvenance, executeEmbeddingWithStrategy, } from '../embedding/engine'; +import { isSymbolEnrichmentEnabled } from '../graph/config'; +import { extractGraphDataForFile } from '../graph/extract'; +import { deleteGraphEnrichmentForFile, 
persistGraphEnrichmentForFile } from '../graph/persist'; import { deterministicEmbedding } from '../retrieval/rerank'; type Chunk = { @@ -24,6 +27,7 @@ export type FileIndexSharedInput = { embedProvider?: ProviderAdapter; embeddingModel?: string; embeddingStrategyId?: string; + symbolEnrichmentEnabled?: boolean; }; export type UpsertFileResult = { @@ -42,6 +46,7 @@ export function createFileIndexHelpers(input: FileIndexSharedInput): { upsertIndexedFileByPath: (relativePath: string) => Promise; deleteIndexedFileByPath: (relativePath: string) => Promise; } { + const symbolEnrichmentEnabled = input.symbolEnrichmentEnabled ?? isSymbolEnrichmentEnabled(); const isStrategyV2 = isEmbeddingStrategyV2Enabled(); const strategyConfig = isStrategyV2 ? loadEmbeddingStrategyConfig() : null; const adapterCache = new Map(); @@ -173,6 +178,31 @@ export function createFileIndexHelpers(input: FileIndexSharedInput): { ]); } + if (symbolEnrichmentEnabled) { + try { + const extraction = extractGraphDataForFile({ + repoRoot: input.repoRoot, + path: relativePath, + content, + }); + await persistGraphEnrichmentForFile({ + db: input.db, + repoRoot: input.repoRoot, + path: relativePath, + extraction, + }); + for (const diagnostic of extraction.diagnostics) { + console.info(`[indexer:graph] ${diagnostic}`); + } + } catch (error) { + console.warn( + `[indexer:graph] extraction failed for ${relativePath}: ${ + error instanceof Error ? error.message : String(error) + }` + ); + } + } + return { status: 'indexed', fileStatus, @@ -197,6 +227,19 @@ export function createFileIndexHelpers(input: FileIndexSharedInput): { ); const chunksDeleted = Number(countRows.rows[0]?.count ?? 
0); await input.db.query('DELETE FROM files WHERE id = $1', [fileId]); + if (symbolEnrichmentEnabled) { + await deleteGraphEnrichmentForFile({ + db: input.db, + repoRoot: input.repoRoot, + path: relativePath, + }).catch((error) => { + console.warn( + `[indexer:graph] cleanup failed for ${relativePath}: ${ + error instanceof Error ? error.message : String(error) + }` + ); + }); + } return { fileDeleted: true, chunksDeleted }; } diff --git a/src/context/indexer/full-index.ts b/src/context/indexer/full-index.ts index f6a2db9..f08b316 100644 --- a/src/context/indexer/full-index.ts +++ b/src/context/indexer/full-index.ts @@ -9,6 +9,7 @@ export async function runFullIndex(input: { embedProvider?: ProviderAdapter; embeddingModel?: string; embeddingStrategyId?: string; + symbolEnrichmentEnabled?: boolean; }): Promise<{ filesIndexed: number; chunksIndexed: number }> { const paths = await fg(['**/*', '!node_modules/**', '!.git/**', '!dist/**', '!coverage/**'], { cwd: input.repoRoot, diff --git a/src/context/indexer/incremental.ts b/src/context/indexer/incremental.ts index e78f000..3975c64 100644 --- a/src/context/indexer/incremental.ts +++ b/src/context/indexer/incremental.ts @@ -44,6 +44,7 @@ export async function runIncrementalIndex(input: { trigger?: IncrementalTrigger; embedProvider?: ProviderAdapter; embeddingModel?: string; + symbolEnrichmentEnabled?: boolean; }): Promise { const fileIndexHelpers = createFileIndexHelpers(input); diff --git a/src/context/retrieval/hybrid.ts b/src/context/retrieval/hybrid.ts index f1a2efc..c06b282 100644 --- a/src/context/retrieval/hybrid.ts +++ b/src/context/retrieval/hybrid.ts @@ -14,6 +14,18 @@ type ChunkRow = { provenance: string | null; }; +type GraphTraversalHit = { + nodeId: string; + nodeKey: string; + nodeType: string; + edgeType: string | null; + connectedNodeId: string | null; + connectedNodeKey: string | null; + connectedNodeType: string | null; + path: string | null; + connectedPath: string | null; +}; + async function 
grepSearch( cwd: string, query: string @@ -62,6 +74,12 @@ export async function runHybridRetrieval(input: { const query = input.query; const lexical = await grepSearch(input.repoRoot, query.lexicalQuery || query.vectorQuery); const queryVector = deterministicEmbedding(query.vectorQuery || query.lexicalQuery); + const graphTraversal = await traverseGraphHints({ + db: input.db, + repoRoot: input.repoRoot, + hints: query.graphHints, + }); + const boostedPaths = new Set(graphTraversal.pathHints); const rows = await input.db.query( `SELECT c.id, c.content, f.path, ce.embedding::text as embedding, ce.provider, ce.model, ce.provenance::text as provenance @@ -80,11 +98,12 @@ export async function runHybridRetrieval(input: { : deterministicEmbedding(row.content); const vectorScore = cosineSimilarity(queryVector, embedding); const lexicalHit = lexical.find((hit) => hit.path === row.path); + const graphScore = boostedPaths.has(row.path) ? 0.6 : 0.25; return { item: row, lexicalScore: lexicalHit ? lexicalHit.score : 0, vectorScore, - graphScore: 0.25, + graphScore, }; }) ).slice(0, query.maxItems); @@ -112,6 +131,7 @@ export async function runHybridRetrieval(input: { embeddingProvenance: safeJsonParse(entry.item.provenance), lexicalScore: entry.lexicalScore, vectorScore: entry.vectorScore, + graphScore: entry.graphScore, rank: index + 1, }, })); @@ -135,17 +155,108 @@ export async function runHybridRetrieval(input: { const bundle = ContextBundleSchema.parse({ query, items, - citations: items.map((item) => ({ - sourceType: 'chunk', - sourceId: item.id, - path: String(item.metadata.path), - score: item.score, - })), + citations: [ + ...items.map((item) => ({ + sourceType: 'chunk', + sourceId: item.id, + path: String(item.metadata.path), + score: item.score, + })), + ...graphTraversal.citations, + ], }); return bundle; } +async function traverseGraphHints(input: { + db: DubsbotDb; + repoRoot: string; + hints: string[]; +}): Promise<{ + pathHints: string[]; + citations: Array<{ 
sourceType: 'graph_node'; sourceId: string; path?: string; score: number }>; +}> { + if (input.hints.length === 0) { + return { pathHints: [], citations: [] }; + } + + const rows = await input.db.query( + `SELECT + n.id AS "nodeId", + n.node_key AS "nodeKey", + n.node_type AS "nodeType", + e.edge_type AS "edgeType", + n2.id AS "connectedNodeId", + n2.node_key AS "connectedNodeKey", + n2.node_type AS "connectedNodeType", + n.payload->>'path' AS "path", + n2.payload->>'path' AS "connectedPath" + FROM context_nodes n + LEFT JOIN context_edges e ON e.source_node_id = n.id + LEFT JOIN context_nodes n2 ON n2.id = e.target_node_id + WHERE n.payload->>'repoRoot' = $1 + AND ( + n.node_key = ANY($2::text[]) + OR n.payload->>'name' = ANY($2::text[]) + OR n.payload->>'path' = ANY($2::text[]) + ) + LIMIT 200`, + [input.repoRoot, input.hints] + ); + + const pathHints = new Set(); + const citations: Array<{ + sourceType: 'graph_node'; + sourceId: string; + path?: string; + score: number; + }> = []; + for (const row of rows.rows) { + if (row.path) { + pathHints.add(row.path); + } + if (row.connectedPath) { + pathHints.add(row.connectedPath); + } + citations.push({ + sourceType: 'graph_node', + sourceId: row.nodeId, + path: row.path ?? undefined, + score: 0.35, + }); + if (row.connectedNodeId) { + citations.push({ + sourceType: 'graph_node', + sourceId: row.connectedNodeId, + path: row.connectedPath ?? 
undefined, + score: 0.3, + }); + } + } + + return { + pathHints: [...pathHints], + citations: dedupeGraphCitations(citations), + }; +} + +function dedupeGraphCitations( + citations: Array<{ sourceType: 'graph_node'; sourceId: string; path?: string; score: number }> +): Array<{ sourceType: 'graph_node'; sourceId: string; path?: string; score: number }> { + const map = new Map< + string, + { sourceType: 'graph_node'; sourceId: string; path?: string; score: number } + >(); + for (const citation of citations) { + const existing = map.get(citation.sourceId); + if (!existing || citation.score > existing.score) { + map.set(citation.sourceId, citation); + } + } + return [...map.values()]; +} + function safeJsonParse(value: string | null): unknown { if (!value) { return null; diff --git a/src/db/migrations/0003_context_graph_enrichment.sql b/src/db/migrations/0003_context_graph_enrichment.sql new file mode 100644 index 0000000..391f80c --- /dev/null +++ b/src/db/migrations/0003_context_graph_enrichment.sql @@ -0,0 +1,17 @@ +ALTER TABLE context_nodes +DROP CONSTRAINT IF EXISTS context_nodes_node_type_check; + +ALTER TABLE context_nodes +ADD CONSTRAINT context_nodes_node_type_check +CHECK (node_type IN ('file', 'symbol')); + +ALTER TABLE context_edges +DROP CONSTRAINT IF EXISTS context_edges_edge_type_check; + +ALTER TABLE context_edges +ADD CONSTRAINT context_edges_edge_type_check +CHECK (edge_type IN ('defines', 'references', 'imports', 'calls')); + +CREATE INDEX IF NOT EXISTS idx_context_nodes_node_type ON context_nodes(node_type); +CREATE INDEX IF NOT EXISTS idx_context_nodes_node_key ON context_nodes(node_key); +CREATE INDEX IF NOT EXISTS idx_context_edges_edge_type ON context_edges(edge_type); diff --git a/tests/graph-enrichment.test.ts b/tests/graph-enrichment.test.ts new file mode 100644 index 0000000..05b93d6 --- /dev/null +++ b/tests/graph-enrichment.test.ts @@ -0,0 +1,189 @@ +import { mkdtemp, readFile, rm, writeFile } from 'node:fs/promises'; +import { tmpdir } 
/**
 * Handles returned by `createFixture`: the temporary repository directory, the
 * database wrapper with migrations applied, and a callback that removes both
 * temporary directories.
 */
type Fixture = {
  repoRoot: string;
  db: DubsbotDb;
  cleanup: () => Promise<void>;
};

/**
 * Builds an isolated fixture for a single test.
 *
 * Creates one temp directory for the repository and one for the PGlite data
 * dir, writes every entry of `files` into the repository directory, then runs
 * migrations 0001–0003 (read from `src/db/migrations` under the current
 * working directory) in order through `db.exec`.
 *
 * @param files Map of file path (relative to the repo root) to file content.
 *   No parent directories are created, so paths must not contain
 *   subdirectories.
 * @returns The fixture handles; callers are responsible for invoking `cleanup`.
 */
async function createFixture(files: Record<string, string>): Promise<Fixture> {
  const root = await mkdtemp(join(tmpdir(), 'dubsbot-graph-'));
  const dbRoot = await mkdtemp(join(tmpdir(), 'dubsbot-graph-db-'));
  for (const [path, content] of Object.entries(files)) {
    await writeFile(join(root, path), content, 'utf8');
  }

  const db = new DubsbotDb(new PGlite(join(dbRoot, 'pgdata')));
  const migrations = [
    '0001_init.sql',
    '0002_embedding_provenance.sql',
    '0003_context_graph_enrichment.sql',
  ];
  // Applied sequentially: 0003 alters tables that the earlier migrations create.
  for (const migration of migrations) {
    const sql = await readFile(join(process.cwd(), 'src/db/migrations', migration), 'utf8');
    await db.exec(sql);
  }

  return {
    repoRoot: root,
    db,
    cleanup: async () => {
      await rm(root, { recursive: true, force: true });
      await rm(dbRoot, { recursive: true, force: true });
    },
  };
}

describe('context graph enrichment', () => {
  // Cleanup callbacks registered by the tests below; drained after each test.
  const cleanups: Array<() => Promise<void>> = [];

  afterEach(async () => {
    // Drain the list before awaiting so that a rejected cleanup cannot leave
    // stale callbacks queued to run again after the next test.
    const pending = cleanups.splice(0);
    await Promise.all(pending.map((cleanup) => cleanup()));
  });

  it('keeps symbol extraction counts and canonical IDs stable for fixtures', () => {
    const content = [
      "import { helper } from './dep';",
      'export function alpha() {',
      ' return helper();',
      '}',
      'const beta = () => alpha();',
    ].join('\n');

    // Extract twice from identical input; IDs and edges must match exactly.
    const first = extractGraphDataForFile({
      repoRoot: '/repo',
      path: 'src/app.ts',
      content,
    });
    const second = extractGraphDataForFile({
      repoRoot: '/repo',
      path: 'src/app.ts',
      content,
    });

    expect(first.symbols.length).toBeGreaterThan(0);
    expect(first.symbols.map((symbol) => symbol.id)).toEqual(
      second.symbols.map((symbol) => symbol.id)
    );
    expect(first.edges.map((edge) => `${edge.type}:${edge.sourceKey}->${edge.targetKey}`)).toEqual(
      second.edges.map((edge) => `${edge.type}:${edge.sourceKey}->${edge.targetKey}`)
    );
  });

  it('creates required edge types with expected directionality', () => {
    const extraction = extractGraphDataForFile({
      repoRoot: '/repo',
      path: 'src/service.ts',
      content: [
        "import { helper } from './dep';",
        'function run() {',
        ' return helper();',
        '}',
      ].join('\n'),
    });

    // Group edges by their type so each required type can be checked on its own.
    const byType = new Map<string, typeof extraction.edges>();
    for (const edge of extraction.edges) {
      byType.set(edge.type, [...(byType.get(edge.type) ?? []), edge]);
    }

    expect(byType.get('defines')?.length ?? 0).toBeGreaterThan(0);
    expect(byType.get('imports')?.length ?? 0).toBeGreaterThan(0);
    expect(byType.get('references')?.length ?? 0).toBeGreaterThan(0);
    expect(byType.get('calls')?.length ?? 0).toBeGreaterThan(0);
    // Every edge must start at a key containing '::file' and must not be a self-loop.
    for (const edge of extraction.edges) {
      expect(edge.sourceKey).toContain('::file');
      expect(edge.targetKey).not.toBe(edge.sourceKey);
    }
  });

  it('preserves file-level retrieval behavior while exposing symbol traversals with graph hints', async () => {
    const fixture = await createFixture({
      'a.ts':
        'export function helper() { return 1; }\nexport function caller(){ return helper(); }\n',
      'b.ts': 'export const value = 2;\n',
    });
    cleanups.push(fixture.cleanup);

    await runFullIndex({
      db: fixture.db,
      repoRoot: fixture.repoRoot,
      symbolEnrichmentEnabled: true,
    });

    const symbolCount = await fixture.db.query<{ count: number | string }>(
      "SELECT COUNT(*)::int AS count FROM context_nodes WHERE node_type = 'symbol'"
    );
    expect(Number(symbolCount.rows[0].count)).toBeGreaterThan(0);

    // Without graph hints, only chunk citations are expected.
    const baseline = await runHybridRetrieval({
      db: fixture.db,
      repoRoot: fixture.repoRoot,
      query: {
        lexicalQuery: 'helper',
        vectorQuery: 'helper function',
        graphHints: [],
        rerank: { method: 'hybrid', topK: 20 },
        maxItems: 5,
      },
    });
    expect(baseline.items.length).toBeGreaterThan(0);
    expect(baseline.citations.every((citation) => citation.sourceType === 'chunk')).toBe(true);

    // With a graph hint, at least one graph_node citation must appear.
    const enriched = await runHybridRetrieval({
      db: fixture.db,
      repoRoot: fixture.repoRoot,
      query: {
        lexicalQuery: 'helper',
        vectorQuery: 'helper function',
        graphHints: ['helper'],
        rerank: { method: 'hybrid', topK: 20 },
        maxItems: 5,
      },
    });
    expect(enriched.items.length).toBeGreaterThan(0);
    expect(enriched.citations.some((citation) => citation.sourceType === 'graph_node')).toBe(true);
  });

  it('meets enrichment volume and runtime acceptance thresholds', async () => {
    // 120 one-line exported functions in a single file.
    const lines = Array.from(
      { length: 120 },
      (_, index) => `export function fn${index}() { return ${index}; }`
    );
    const fixture = await createFixture({
      'volume.ts': `${lines.join('\n')}\n`,
    });
    cleanups.push(fixture.cleanup);

    // Time only the indexing run, not fixture setup.
    const started = performance.now();
    await runFullIndex({
      db: fixture.db,
      repoRoot: fixture.repoRoot,
      symbolEnrichmentEnabled: true,
    });
    const durationMs = performance.now() - started;

    const symbols = await fixture.db.query<{ count: number | string }>(
      "SELECT COUNT(*)::int AS count FROM context_nodes WHERE node_type = 'symbol'"
    );
    const edges = await fixture.db.query<{ count: number | string }>(
      'SELECT COUNT(*)::int AS count FROM context_edges'
    );
    const symbolCount = Number(symbols.rows[0].count);
    const edgeCount = Number(edges.rows[0].count);

    expect(durationMs).toBeLessThan(10_000);
    expect(symbolCount).toBeLessThanOrEqual(lines.length * 2);
    expect(edgeCount).toBeLessThanOrEqual(symbolCount * 6);
  });
});