diff --git a/openspec/changes/true-incremental-indexing/.openspec.yaml b/openspec/changes/archive/2026-03-04-true-incremental-indexing/.openspec.yaml similarity index 100% rename from openspec/changes/true-incremental-indexing/.openspec.yaml rename to openspec/changes/archive/2026-03-04-true-incremental-indexing/.openspec.yaml diff --git a/openspec/changes/true-incremental-indexing/design.md b/openspec/changes/archive/2026-03-04-true-incremental-indexing/design.md similarity index 100% rename from openspec/changes/true-incremental-indexing/design.md rename to openspec/changes/archive/2026-03-04-true-incremental-indexing/design.md diff --git a/openspec/changes/true-incremental-indexing/proposal.md b/openspec/changes/archive/2026-03-04-true-incremental-indexing/proposal.md similarity index 100% rename from openspec/changes/true-incremental-indexing/proposal.md rename to openspec/changes/archive/2026-03-04-true-incremental-indexing/proposal.md diff --git a/openspec/changes/true-incremental-indexing/specs/incremental-indexing/spec.md b/openspec/changes/archive/2026-03-04-true-incremental-indexing/specs/incremental-indexing/spec.md similarity index 100% rename from openspec/changes/true-incremental-indexing/specs/incremental-indexing/spec.md rename to openspec/changes/archive/2026-03-04-true-incremental-indexing/specs/incremental-indexing/spec.md diff --git a/openspec/changes/true-incremental-indexing/tasks.md b/openspec/changes/archive/2026-03-04-true-incremental-indexing/tasks.md similarity index 54% rename from openspec/changes/true-incremental-indexing/tasks.md rename to openspec/changes/archive/2026-03-04-true-incremental-indexing/tasks.md index f30715f..cea2a68 100644 --- a/openspec/changes/true-incremental-indexing/tasks.md +++ b/openspec/changes/archive/2026-03-04-true-incremental-indexing/tasks.md @@ -1,17 +1,17 @@ ## 1. 
Incremental Indexing Core -- [ ] 1.1 Add shared file-index helpers (upsert file, replace chunks/embeddings/docs, delete file by path) for reuse by full and incremental indexers. -- [ ] 1.2 Replace `runIncrementalIndex` broad fallback with a path-targeted pipeline that processes normalized changed paths only. -- [ ] 1.3 Add incremental operation/result types that include inserted/updated/deleted file counts and chunk counters. +- [x] 1.1 Add shared file-index helpers (upsert file, replace chunks/embeddings/docs, delete file by path) for reuse by full and incremental indexers. +- [x] 1.2 Replace `runIncrementalIndex` broad fallback with a path-targeted pipeline that processes normalized changed paths only. +- [x] 1.3 Add incremental operation/result types that include inserted/updated/deleted file counts and chunk counters. ## 2. Delete Handling and Fallback Rules -- [ ] 2.1 Implement explicit delete operations for `unlink` events and missing-path detection during incremental runs. -- [ ] 2.2 Update daemon watcher wiring to pass enough event metadata (fs event type and git-head change context) into incremental indexing. -- [ ] 2.3 Implement narrow fallback policy so only unresolved unscoped git-head transitions can trigger full reindex, with reason metadata/logging. +- [x] 2.1 Implement explicit delete operations for `unlink` events and missing-path detection during incremental runs. +- [x] 2.2 Update daemon watcher wiring to pass enough event metadata (fs event type and git-head change context) into incremental indexing. +- [x] 2.3 Implement narrow fallback policy so only unresolved unscoped git-head transitions can trigger full reindex, with reason metadata/logging. ## 3. Verification -- [ ] 3.1 Add tests for add/change targeted updates to ensure unrelated files are not reindexed. -- [ ] 3.2 Add tests for delete handling to verify stale `files`/`chunks`/`chunk_embeddings`/`bm25_documents` data is removed. 
-- [ ] 3.3 Add tests for fallback gating to verify fs path events never trigger full fallback and unresolved git-head transitions do. +- [x] 3.1 Add tests for add/change targeted updates to ensure unrelated files are not reindexed. +- [x] 3.2 Add tests for delete handling to verify stale `files`/`chunks`/`chunk_embeddings`/`bm25_documents` data is removed. +- [x] 3.3 Add tests for fallback gating to verify fs path events never trigger full fallback and unresolved git-head transitions do. diff --git a/openspec/specs/incremental-indexing/spec.md b/openspec/specs/incremental-indexing/spec.md new file mode 100644 index 0000000..3f86248 --- /dev/null +++ b/openspec/specs/incremental-indexing/spec.md @@ -0,0 +1,52 @@ +# incremental-indexing Specification + +## Purpose +TBD - created by archiving change true-incremental-indexing. Update Purpose after archive. +## Requirements +### Requirement: Incremental indexing MUST process only targeted changed paths +The indexing system MUST support an incremental mode that updates only the files identified in a change set, without scanning or reprocessing unrelated repository paths. + +#### Scenario: Single-file change updates only that file +- **WHEN** incremental indexing is invoked with one changed path +- **THEN** only that path is read, re-chunked, and re-embedded +- **AND** unrelated indexed files remain untouched + +#### Scenario: Duplicate changed paths are coalesced +- **WHEN** incremental indexing receives repeated entries for the same path in one run +- **THEN** the path is processed once +- **AND** result counters reflect a single file operation for that path + +### Requirement: Incremental indexing MUST handle deletes correctly +The indexing system MUST remove indexed records for files that were deleted from the repository, including dependent chunks and retrieval documents. 
+
+#### Scenario: File unlink removes indexed content
+- **WHEN** incremental indexing is invoked for a path marked as deleted
+- **THEN** the indexed file row for that repo/path is removed
+- **AND** all dependent chunks, embeddings, and bm25 documents are removed via cascade or equivalent guarantees
+
+#### Scenario: Missing file during change processing is treated as delete
+- **WHEN** incremental indexing receives a changed path that no longer exists on disk
+- **THEN** the system treats that path as a delete operation
+- **AND** stale indexed content for the path is removed
+
+### Requirement: Full reindex fallback MUST be narrow and explicit
+The indexing system MUST avoid broad full-reindex fallback for normal path-scoped file-system events and MAY fall back only for unscoped events where changed paths cannot be determined safely.
+
+#### Scenario: File-system add/change/unlink does not trigger full fallback
+- **WHEN** incremental indexing is triggered from file-system events with concrete paths
+- **THEN** the run completes through targeted path operations only
+- **AND** full repository indexing is not invoked
+
+#### Scenario: Unscoped git-head change can fall back with reason
+- **WHEN** a git-head transition occurs and changed paths cannot be resolved reliably
+- **THEN** the system runs a single explicit full fallback reindex
+- **AND** emits metadata or logs indicating fallback reason and trigger type
+
+### Requirement: Incremental runs MUST expose operation counts
+The indexing system MUST return operation counters that distinguish inserted/updated/deleted files and indexed chunks for each run.
+ +#### Scenario: Result includes delete counts +- **WHEN** a run processes at least one delete operation +- **THEN** the result includes a non-zero deleted-file count +- **AND** chunk counters reflect removed and/or replaced chunk totals + diff --git a/src/context/indexer/file-index.ts b/src/context/indexer/file-index.ts new file mode 100644 index 0000000..1386b9c --- /dev/null +++ b/src/context/indexer/file-index.ts @@ -0,0 +1,261 @@ +import { createHash, randomUUID } from 'node:crypto'; +import { readFile } from 'node:fs/promises'; +import type { DubsbotDb } from '../../db/client'; +import { createProviderAdapter } from '../../providers'; +import type { ProviderAdapter } from '../../providers/types'; +import { isEmbeddingStrategyV2Enabled, loadEmbeddingStrategyConfig } from '../embedding/config'; +import { + assertEmbeddingSuccess, + type EmbeddingProvenance, + executeEmbeddingWithStrategy, +} from '../embedding/engine'; +import { deterministicEmbedding } from '../retrieval/rerank'; + +type Chunk = { + index: number; + content: string; + startLine: number; + endLine: number; +}; + +export type FileIndexSharedInput = { + db: DubsbotDb; + repoRoot: string; + embedProvider?: ProviderAdapter; + embeddingModel?: string; + embeddingStrategyId?: string; +}; + +export type UpsertFileResult = { + status: 'indexed' | 'missing'; + fileStatus?: 'inserted' | 'updated'; + chunksInserted: number; + chunksDeleted: number; +}; + +export type DeleteFileResult = { + fileDeleted: boolean; + chunksDeleted: number; +}; + +export function createFileIndexHelpers(input: FileIndexSharedInput): { + upsertIndexedFileByPath: (relativePath: string) => Promise; + deleteIndexedFileByPath: (relativePath: string) => Promise; +} { + const isStrategyV2 = isEmbeddingStrategyV2Enabled(); + const strategyConfig = isStrategyV2 ? 
loadEmbeddingStrategyConfig() : null; + const adapterCache = new Map(); + + function getAdapter(provider: string): ProviderAdapter { + const cached = adapterCache.get(provider); + if (cached) { + return cached; + } + const adapter = createProviderAdapter(provider as 'openai' | 'anthropic' | 'google'); + adapterCache.set(provider, adapter); + return adapter; + } + + async function embedContent(chunkContent: string): Promise<{ + embedding: number[]; + provider: string; + model: string; + provenance: EmbeddingProvenance; + }> { + if (isStrategyV2 && strategyConfig) { + const strategyId = input.embeddingStrategyId ?? strategyConfig.defaults.indexing; + const result = await executeEmbeddingWithStrategy({ + config: strategyConfig, + strategyId, + value: chunkContent, + adapterForProvider: getAdapter, + }); + const success = assertEmbeddingSuccess(result); + emitEmbeddingTelemetry(success.provenance); + return { + embedding: success.embedding, + provider: success.provider, + model: success.model, + provenance: success.provenance, + }; + } + + const provider = input.embedProvider ? 'remote' : 'local'; + const model = + input.embeddingModel ?? (input.embedProvider ? 'text-embedding-3-small' : 'deterministic-v1'); + const embedding = + input.embedProvider != null + ? 
(await input.embedProvider.embed({ model, values: [chunkContent] }))[0] + : deterministicEmbedding(chunkContent); + const provenance: EmbeddingProvenance = { + strategyId: 'legacy-default', + attemptPath: [ + { + strategyId: 'legacy-default', + provider, + model, + status: 'success', + }, + ], + fallbackUsed: false, + resolvedBy: { + strategyId: 'legacy-default', + provider, + model, + }, + }; + + return { embedding, provider, model, provenance }; + } + + async function upsertIndexedFileByPath(relativePath: string): Promise { + const absolutePath = `${input.repoRoot}/${relativePath}`; + const content = await readFile(absolutePath, 'utf8').catch(() => null); + if (!content) { + return { status: 'missing', chunksInserted: 0, chunksDeleted: 0 }; + } + + const existingRows = await input.db.query<{ id: string }>( + 'SELECT id FROM files WHERE repo_root = $1 AND path = $2', + [input.repoRoot, relativePath] + ); + const existingFileId = existingRows.rows[0]?.id; + const fileStatus: 'inserted' | 'updated' = existingFileId ? 'updated' : 'inserted'; + + const fileId = existingFileId ?? randomUUID(); + const persistedRows = await input.db.query<{ id: string }>( + `INSERT INTO files (id, repo_root, path, hash, language) + VALUES ($1, $2, $3, $4, $5) + ON CONFLICT (repo_root, path) DO UPDATE SET hash = EXCLUDED.hash, language = EXCLUDED.language, updated_at = NOW() + RETURNING id`, + [fileId, input.repoRoot, relativePath, hashContent(content), detectLanguage(relativePath)] + ); + const persistedFileId = persistedRows.rows[0].id; + + let chunksDeleted = 0; + if (existingFileId) { + const deletedRows = await input.db.query<{ count: number | string }>( + 'SELECT COUNT(*)::int AS count FROM chunks WHERE file_id = $1', + [persistedFileId] + ); + chunksDeleted = Number(deletedRows.rows[0]?.count ?? 
0); + } + + await input.db.query('DELETE FROM chunks WHERE file_id = $1', [persistedFileId]); + + const chunks = chunkFile(content); + for (const chunk of chunks) { + const chunkId = randomUUID(); + await input.db.query( + `INSERT INTO chunks (id, file_id, chunk_index, content, start_line, end_line) + VALUES ($1, $2, $3, $4, $5, $6)`, + [chunkId, persistedFileId, chunk.index, chunk.content, chunk.startLine, chunk.endLine] + ); + + const embedded = await embedContent(chunk.content); + await input.db.query( + `INSERT INTO chunk_embeddings (chunk_id, provider, model, embedding, provenance) + VALUES ($1, $2, $3, $4::jsonb, $5::jsonb) + ON CONFLICT (chunk_id) DO UPDATE SET provider = EXCLUDED.provider, model = EXCLUDED.model, embedding = EXCLUDED.embedding, provenance = EXCLUDED.provenance`, + [ + chunkId, + embedded.provider, + embedded.model, + JSON.stringify(embedded.embedding), + JSON.stringify(embedded.provenance), + ] + ); + + await input.db.query('INSERT INTO bm25_documents (id, chunk_id, body) VALUES ($1, $2, $3)', [ + randomUUID(), + chunkId, + chunk.content, + ]); + } + + return { + status: 'indexed', + fileStatus, + chunksInserted: chunks.length, + chunksDeleted, + }; + } + + async function deleteIndexedFileByPath(relativePath: string): Promise { + const fileRows = await input.db.query<{ id: string }>( + 'SELECT id FROM files WHERE repo_root = $1 AND path = $2', + [input.repoRoot, relativePath] + ); + const fileId = fileRows.rows[0]?.id; + if (!fileId) { + return { fileDeleted: false, chunksDeleted: 0 }; + } + + const countRows = await input.db.query<{ count: number | string }>( + 'SELECT COUNT(*)::int AS count FROM chunks WHERE file_id = $1', + [fileId] + ); + const chunksDeleted = Number(countRows.rows[0]?.count ?? 
0); + await input.db.query('DELETE FROM files WHERE id = $1', [fileId]); + return { fileDeleted: true, chunksDeleted }; + } + + return { + upsertIndexedFileByPath, + deleteIndexedFileByPath, + }; +} + +function hashContent(content: string): string { + return createHash('sha256').update(content).digest('hex'); +} + +function detectLanguage(path: string): string { + const extension = path.split('.').at(-1)?.toLowerCase(); + switch (extension) { + case 'ts': + case 'tsx': + return 'typescript'; + case 'js': + case 'jsx': + return 'javascript'; + case 'py': + return 'python'; + case 'rs': + return 'rust'; + case 'go': + return 'go'; + default: + return 'text'; + } +} + +function chunkFile(content: string, linesPerChunk = 120): Chunk[] { + const lines = content.split('\n'); + const chunks: Chunk[] = []; + for (let i = 0; i < lines.length; i += linesPerChunk) { + const startLine = i + 1; + const endLine = Math.min(i + linesPerChunk, lines.length); + chunks.push({ + index: chunks.length, + content: lines.slice(i, endLine).join('\n'), + startLine, + endLine, + }); + } + return chunks; +} + +function emitEmbeddingTelemetry(provenance: EmbeddingProvenance): void { + if (process.env.DUBSBOT_EMBEDDING_PROVENANCE_LOG !== '1') { + return; + } + const resolved = provenance.resolvedBy + ? 
`${provenance.resolvedBy.provider}:${provenance.resolvedBy.model}` + : 'none'; + console.info( + `[embedding] strategy=${provenance.strategyId} resolved=${resolved} fallback=${provenance.fallbackUsed} attempts=${provenance.attemptPath + .map((attempt) => `${attempt.provider}:${attempt.model}:${attempt.status}`) + .join('>')}` + ); +} diff --git a/src/context/indexer/full-index.ts b/src/context/indexer/full-index.ts index 9bd2390..f6a2db9 100644 --- a/src/context/indexer/full-index.ts +++ b/src/context/indexer/full-index.ts @@ -1,63 +1,7 @@ -import { createHash, randomUUID } from 'node:crypto'; -import { readFile } from 'node:fs/promises'; import fg from 'fast-glob'; import type { DubsbotDb } from '../../db/client'; -import { createProviderAdapter } from '../../providers'; import type { ProviderAdapter } from '../../providers/types'; -import { isEmbeddingStrategyV2Enabled, loadEmbeddingStrategyConfig } from '../embedding/config'; -import { - assertEmbeddingSuccess, - type EmbeddingProvenance, - executeEmbeddingWithStrategy, -} from '../embedding/engine'; -import { deterministicEmbedding } from '../retrieval/rerank'; - -type Chunk = { - index: number; - content: string; - startLine: number; - endLine: number; -}; - -function hashContent(content: string): string { - return createHash('sha256').update(content).digest('hex'); -} - -function detectLanguage(path: string): string { - const extension = path.split('.').at(-1)?.toLowerCase(); - switch (extension) { - case 'ts': - case 'tsx': - return 'typescript'; - case 'js': - case 'jsx': - return 'javascript'; - case 'py': - return 'python'; - case 'rs': - return 'rust'; - case 'go': - return 'go'; - default: - return 'text'; - } -} - -function chunkFile(content: string, linesPerChunk = 120): Chunk[] { - const lines = content.split('\n'); - const chunks: Chunk[] = []; - for (let i = 0; i < lines.length; i += linesPerChunk) { - const startLine = i + 1; - const endLine = Math.min(i + linesPerChunk, lines.length); - 
chunks.push({ - index: chunks.length, - content: lines.slice(i, endLine).join('\n'), - startLine, - endLine, - }); - } - return chunks; -} +import { createFileIndexHelpers } from './file-index'; export async function runFullIndex(input: { db: DubsbotDb; @@ -75,132 +19,17 @@ export async function runFullIndex(input: { let filesIndexed = 0; let chunksIndexed = 0; - - const isStrategyV2 = isEmbeddingStrategyV2Enabled(); - const strategyConfig = isStrategyV2 ? loadEmbeddingStrategyConfig() : null; - const adapterCache = new Map(); - - function getAdapter(provider: string): ProviderAdapter { - const cached = adapterCache.get(provider); - if (cached) { - return cached; - } - const adapter = createProviderAdapter(provider as 'openai' | 'anthropic' | 'google'); - adapterCache.set(provider, adapter); - return adapter; - } + const helpers = createFileIndexHelpers(input); for (const relativePath of paths) { - const absolutePath = `${input.repoRoot}/${relativePath}`; - const content = await readFile(absolutePath, 'utf8').catch(() => null); - if (!content) { + const indexed = await helpers.upsertIndexedFileByPath(relativePath); + if (indexed.status !== 'indexed') { continue; } filesIndexed += 1; - const fileId = randomUUID(); - await input.db.query( - `INSERT INTO files (id, repo_root, path, hash, language) - VALUES ($1, $2, $3, $4, $5) - ON CONFLICT (repo_root, path) DO UPDATE SET hash = EXCLUDED.hash, language = EXCLUDED.language, updated_at = NOW() - RETURNING id`, - [fileId, input.repoRoot, relativePath, hashContent(content), detectLanguage(relativePath)] - ); - - const fileRows = await input.db.query<{ id: string }>( - 'SELECT id FROM files WHERE repo_root = $1 AND path = $2', - [input.repoRoot, relativePath] - ); - const persistedFileId = fileRows.rows[0].id; - - await input.db.query('DELETE FROM chunks WHERE file_id = $1', [persistedFileId]); - - const chunks = chunkFile(content); - for (const chunk of chunks) { - const chunkId = randomUUID(); - chunksIndexed += 1; - - 
await input.db.query( - `INSERT INTO chunks (id, file_id, chunk_index, content, start_line, end_line) - VALUES ($1, $2, $3, $4, $5, $6)`, - [chunkId, persistedFileId, chunk.index, chunk.content, chunk.startLine, chunk.endLine] - ); - - let embedding: number[]; - let provider = input.embedProvider ? 'remote' : 'local'; - let model = input.embeddingModel ?? 'deterministic-v1'; - let provenance: EmbeddingProvenance = { - strategyId: 'legacy-default', - attemptPath: [ - { - strategyId: 'legacy-default', - provider, - model, - status: 'success', - }, - ], - fallbackUsed: false, - resolvedBy: { - strategyId: 'legacy-default', - provider, - model, - }, - }; - - if (isStrategyV2 && strategyConfig) { - const strategyId = input.embeddingStrategyId ?? strategyConfig.defaults.indexing; - const result = await executeEmbeddingWithStrategy({ - config: strategyConfig, - strategyId, - value: chunk.content, - adapterForProvider: getAdapter, - }); - const success = assertEmbeddingSuccess(result); - embedding = success.embedding; - provider = success.provider; - model = success.model; - provenance = success.provenance; - emitEmbeddingTelemetry(success.provenance); - } else { - embedding = - input.embedProvider != null - ? ( - await input.embedProvider.embed({ - model: input.embeddingModel ?? 
'text-embedding-3-small', - values: [chunk.content], - }) - )[0] - : deterministicEmbedding(chunk.content); - } - - await input.db.query( - `INSERT INTO chunk_embeddings (chunk_id, provider, model, embedding, provenance) - VALUES ($1, $2, $3, $4::jsonb, $5::jsonb) - ON CONFLICT (chunk_id) DO UPDATE SET provider = EXCLUDED.provider, model = EXCLUDED.model, embedding = EXCLUDED.embedding, provenance = EXCLUDED.provenance`, - [chunkId, provider, model, JSON.stringify(embedding), JSON.stringify(provenance)] - ); - - await input.db.query('INSERT INTO bm25_documents (id, chunk_id, body) VALUES ($1, $2, $3)', [ - randomUUID(), - chunkId, - chunk.content, - ]); - } + chunksIndexed += indexed.chunksInserted; } return { filesIndexed, chunksIndexed }; } - -function emitEmbeddingTelemetry(provenance: EmbeddingProvenance): void { - if (process.env.DUBSBOT_EMBEDDING_PROVENANCE_LOG !== '1') { - return; - } - const resolved = provenance.resolvedBy - ? `${provenance.resolvedBy.provider}:${provenance.resolvedBy.model}` - : 'none'; - console.info( - `[embedding] strategy=${provenance.strategyId} resolved=${resolved} fallback=${provenance.fallbackUsed} attempts=${provenance.attemptPath - .map((attempt) => `${attempt.provider}:${attempt.model}:${attempt.status}`) - .join('>')}` - ); -} diff --git a/src/context/indexer/incremental.ts b/src/context/indexer/incremental.ts index a9a9d70..e78f000 100644 --- a/src/context/indexer/incremental.ts +++ b/src/context/indexer/incremental.ts @@ -1,22 +1,194 @@ +import { execFile } from 'node:child_process'; +import { isAbsolute, posix, relative, sep } from 'node:path'; +import { promisify } from 'node:util'; import type { DubsbotDb } from '../../db/client'; import type { ProviderAdapter } from '../../providers/types'; +import { createFileIndexHelpers } from './file-index'; import { runFullIndex } from './full-index'; +const execFileAsync = promisify(execFile); + +export type IncrementalPathOperation = { + path: string; + type: 'upsert' | 'delete'; 
+}; + +export type IncrementalTrigger = + | { + source: 'fs'; + event: 'add' | 'change' | 'unlink'; + } + | { + source: 'git-head'; + previous: string; + current: string; + }; + +export type IncrementalIndexResult = { + mode: 'incremental' | 'full-fallback'; + fallbackReason: string | null; + filesIndexed: number; + chunksIndexed: number; + filesInserted: number; + filesUpdated: number; + filesDeleted: number; + chunksInserted: number; + chunksDeleted: number; +}; + export async function runIncrementalIndex(input: { db: DubsbotDb; repoRoot: string; - changedPaths: string[]; + operations?: IncrementalPathOperation[]; + changedPaths?: string[]; + trigger?: IncrementalTrigger; embedProvider?: ProviderAdapter; embeddingModel?: string; -}): Promise<{ filesIndexed: number; chunksIndexed: number }> { - if (input.changedPaths.length === 0) { - return { filesIndexed: 0, chunksIndexed: 0 }; +}): Promise { + const fileIndexHelpers = createFileIndexHelpers(input); + + let operations = + input.operations ?? + (input.changedPaths ?? 
[]).map((path) => ({ + path, + type: 'upsert' as const, + })); + let fallbackReason: string | null = null; + + if (operations.length === 0 && input.trigger?.source === 'git-head') { + const resolved = await resolveGitHeadOperations( + input.repoRoot, + input.trigger.previous, + input.trigger.current + ); + if (resolved) { + operations = resolved; + } else { + fallbackReason = `unresolved-git-head-transition:${input.trigger.previous.slice(0, 8)}->${input.trigger.current.slice(0, 8)}`; + } + } + + if (fallbackReason) { + const full = await runFullIndex({ + db: input.db, + repoRoot: input.repoRoot, + embedProvider: input.embedProvider, + embeddingModel: input.embeddingModel, + }); + console.warn(`[indexer:incremental] falling back to full index (${fallbackReason})`); + return { + mode: 'full-fallback', + fallbackReason, + filesIndexed: full.filesIndexed, + chunksIndexed: full.chunksIndexed, + filesInserted: 0, + filesUpdated: full.filesIndexed, + filesDeleted: 0, + chunksInserted: full.chunksIndexed, + chunksDeleted: 0, + }; + } + + const counters: IncrementalIndexResult = { + mode: 'incremental', + fallbackReason: null, + filesIndexed: 0, + chunksIndexed: 0, + filesInserted: 0, + filesUpdated: 0, + filesDeleted: 0, + chunksInserted: 0, + chunksDeleted: 0, + }; + + for (const operation of coalesceOperations(input.repoRoot, operations)) { + if (operation.type === 'delete') { + const deleted = await fileIndexHelpers.deleteIndexedFileByPath(operation.path); + counters.filesDeleted += deleted.fileDeleted ? 1 : 0; + counters.chunksDeleted += deleted.chunksDeleted; + continue; + } + + const upserted = await fileIndexHelpers.upsertIndexedFileByPath(operation.path); + if (upserted.status === 'missing') { + const deleted = await fileIndexHelpers.deleteIndexedFileByPath(operation.path); + counters.filesDeleted += deleted.fileDeleted ? 
1 : 0; + counters.chunksDeleted += deleted.chunksDeleted; + continue; + } + + counters.filesIndexed += 1; + counters.chunksIndexed += upserted.chunksInserted; + counters.chunksInserted += upserted.chunksInserted; + counters.chunksDeleted += upserted.chunksDeleted; + if (upserted.fileStatus === 'inserted') { + counters.filesInserted += 1; + } else { + counters.filesUpdated += 1; + } } - return runFullIndex({ - db: input.db, - repoRoot: input.repoRoot, - embedProvider: input.embedProvider, - embeddingModel: input.embeddingModel, - }); + return counters; +} + +async function resolveGitHeadOperations( + repoRoot: string, + previous: string, + current: string +): Promise { + try { + const { stdout } = await execFileAsync( + 'git', + ['diff', '--name-status', '--no-renames', previous, current], + { cwd: repoRoot } + ); + const lines = stdout + .split('\n') + .map((line) => line.trim()) + .filter(Boolean); + const operations: IncrementalPathOperation[] = []; + for (const line of lines) { + const [status, ...rawPathParts] = line.split(/\s+/); + const rawPath = rawPathParts.join(' ').trim(); + if (!rawPath) { + continue; + } + operations.push({ + path: rawPath, + type: status.startsWith('D') ? 'delete' : 'upsert', + }); + } + return operations; + } catch { + return null; + } +} + +function coalesceOperations( + repoRoot: string, + operations: IncrementalPathOperation[] +): IncrementalPathOperation[] { + const byPath = new Map(); + for (const operation of operations) { + const normalizedPath = normalizeRepoRelativePath(repoRoot, operation.path); + if (!normalizedPath) { + continue; + } + byPath.set(normalizedPath, { + path: normalizedPath, + type: operation.type, + }); + } + return [...byPath.values()]; +} + +function normalizeRepoRelativePath(repoRoot: string, candidatePath: string): string | null { + const repoRelative = isAbsolute(candidatePath) + ? 
relative(repoRoot, candidatePath) + : candidatePath; + const normalized = posix.normalize(repoRelative.split(sep).join('/')).replace(/^\.\/+/, ''); + if (normalized === '' || normalized === '.' || normalized.startsWith('../')) { + return null; + } + return normalized; } diff --git a/src/daemon/main.ts b/src/daemon/main.ts index 4edf875..c54173a 100644 --- a/src/daemon/main.ts +++ b/src/daemon/main.ts @@ -63,26 +63,27 @@ async function main(): Promise { }); const fsWatcher = new RepoFsWatcher(repoRoot); - fsWatcher.on('change', async ({ path }) => { + fsWatcher.on('change', async ({ path, type }) => { await runIncrementalIndex({ db, repoRoot, - changedPaths: [path], + operations: [{ path, type: type === 'unlink' ? 'delete' : 'upsert' }], + trigger: { source: 'fs', event: type }, embedProvider: provider, }); - await hooks.trigger('file-change', { cwd: repoRoot, payload: { path } }); + await hooks.trigger('file-change', { cwd: repoRoot, payload: { path, type } }); }); fsWatcher.start(); const gitWatcher = new GitWatcher(repoRoot); - gitWatcher.on('change', async () => { + gitWatcher.on('change', async ({ previous, current }) => { await runIncrementalIndex({ db, repoRoot, - changedPaths: ['.git/HEAD'], + trigger: { source: 'git-head', previous, current }, embedProvider: provider, }); - await hooks.trigger('git-head-change', { cwd: repoRoot }); + await hooks.trigger('git-head-change', { cwd: repoRoot, payload: { previous, current } }); }); gitWatcher.start(); diff --git a/tests/incremental-indexing.test.ts b/tests/incremental-indexing.test.ts new file mode 100644 index 0000000..9d882c5 --- /dev/null +++ b/tests/incremental-indexing.test.ts @@ -0,0 +1,162 @@ +import { mkdtemp, readFile, rm, writeFile } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import { PGlite } from '@electric-sql/pglite'; +import { afterEach, describe, expect, it } from 'vitest'; +import { runFullIndex } from '../src/context/indexer/full-index'; 
+import { runIncrementalIndex } from '../src/context/indexer/incremental'; +import { DubsbotDb } from '../src/db/client'; + +type Fixture = { + repoRoot: string; + db: DubsbotDb; + cleanup: () => Promise; +}; + +async function createFixture(files: Record): Promise { + const root = await mkdtemp(join(tmpdir(), 'dubsbot-incremental-')); + const dbRoot = await mkdtemp(join(tmpdir(), 'dubsbot-incremental-db-')); + for (const [path, content] of Object.entries(files)) { + await writeFile(join(root, path), content, 'utf8'); + } + + const db = new DubsbotDb(new PGlite(join(dbRoot, 'pgdata'))); + const migration0001 = await readFile( + join(process.cwd(), 'src/db/migrations/0001_init.sql'), + 'utf8' + ); + const migration0002 = await readFile( + join(process.cwd(), 'src/db/migrations/0002_embedding_provenance.sql'), + 'utf8' + ); + await db.exec(migration0001); + await db.exec(migration0002); + + return { + repoRoot: root, + db, + cleanup: async () => { + await rm(root, { recursive: true, force: true }); + await rm(dbRoot, { recursive: true, force: true }); + }, + }; +} + +async function getChunkIdsForPath( + db: DubsbotDb, + repoRoot: string, + path: string +): Promise { + const rows = await db.query<{ id: string }>( + `SELECT c.id + FROM chunks c + JOIN files f ON f.id = c.file_id + WHERE f.repo_root = $1 AND f.path = $2 + ORDER BY c.chunk_index ASC`, + [repoRoot, path] + ); + return rows.rows.map((row) => row.id); +} + +describe('incremental indexing', () => { + const cleanups: Array<() => Promise> = []; + + afterEach(async () => { + await Promise.all(cleanups.map((cleanup) => cleanup())); + cleanups.length = 0; + }); + + it('updates only targeted changed paths and coalesces duplicates', async () => { + const fixture = await createFixture({ + 'a.ts': 'export const a = 1;\n', + 'b.ts': 'export const b = 1;\n', + }); + cleanups.push(fixture.cleanup); + + await runFullIndex({ db: fixture.db, repoRoot: fixture.repoRoot }); + const beforeUnchangedChunkIds = await 
getChunkIdsForPath(fixture.db, fixture.repoRoot, 'b.ts'); + + await writeFile(join(fixture.repoRoot, 'a.ts'), 'export const a = 2;\n', 'utf8'); + const result = await runIncrementalIndex({ + db: fixture.db, + repoRoot: fixture.repoRoot, + operations: [ + { path: 'a.ts', type: 'upsert' }, + { path: 'a.ts', type: 'upsert' }, + ], + trigger: { source: 'fs', event: 'change' }, + }); + + const afterUnchangedChunkIds = await getChunkIdsForPath(fixture.db, fixture.repoRoot, 'b.ts'); + expect(afterUnchangedChunkIds).toEqual(beforeUnchangedChunkIds); + expect(result).toMatchObject({ + mode: 'incremental', + filesIndexed: 1, + filesInserted: 0, + filesUpdated: 1, + filesDeleted: 0, + }); + }); + + it('removes stale files/chunks/embeddings/documents on delete operations', async () => { + const fixture = await createFixture({ + 'delete-me.ts': 'export const toDelete = true;\n', + }); + cleanups.push(fixture.cleanup); + + await runFullIndex({ db: fixture.db, repoRoot: fixture.repoRoot }); + const initialChunkIds = await getChunkIdsForPath(fixture.db, fixture.repoRoot, 'delete-me.ts'); + expect(initialChunkIds.length).toBeGreaterThan(0); + + await rm(join(fixture.repoRoot, 'delete-me.ts')); + const result = await runIncrementalIndex({ + db: fixture.db, + repoRoot: fixture.repoRoot, + operations: [{ path: 'delete-me.ts', type: 'delete' }], + trigger: { source: 'fs', event: 'unlink' }, + }); + + const filesCount = await fixture.db.query<{ count: number | string }>( + 'SELECT COUNT(*)::int AS count FROM files WHERE repo_root = $1 AND path = $2', + [fixture.repoRoot, 'delete-me.ts'] + ); + expect(Number(filesCount.rows[0].count)).toBe(0); + + for (const chunkId of initialChunkIds) { + const embeddingCount = await fixture.db.query<{ count: number | string }>( + 'SELECT COUNT(*)::int AS count FROM chunk_embeddings WHERE chunk_id = $1', + [chunkId] + ); + const documentCount = await fixture.db.query<{ count: number | string }>( + 'SELECT COUNT(*)::int AS count FROM bm25_documents WHERE 
chunk_id = $1', + [chunkId] + ); + expect(Number(embeddingCount.rows[0].count)).toBe(0); + expect(Number(documentCount.rows[0].count)).toBe(0); + } + + expect(result).toMatchObject({ + mode: 'incremental', + filesDeleted: 1, + }); + expect(result.chunksDeleted).toBeGreaterThan(0); + }); + + it('falls back only for unresolved git-head transitions', async () => { + const fixture = await createFixture({ + 'one.ts': 'export const one = 1;\n', + 'two.ts': 'export const two = 2;\n', + }); + cleanups.push(fixture.cleanup); + + const result = await runIncrementalIndex({ + db: fixture.db, + repoRoot: fixture.repoRoot, + trigger: { source: 'git-head', previous: 'deadbeef', current: 'cafebabe' }, + }); + + expect(result.mode).toBe('full-fallback'); + expect(result.fallbackReason).toContain('unresolved-git-head-transition'); + expect(result.filesIndexed).toBe(2); + }); +});