diff --git a/__tests__/sync.test.ts b/__tests__/sync.test.ts index 8365f630..cb657274 100644 --- a/__tests__/sync.test.ts +++ b/__tests__/sync.test.ts @@ -259,4 +259,140 @@ describe('Sync Module', () => { expect(result.changedFilePaths).toBeUndefined(); }); }); + + // Regression tests for the "stale index after HEAD-moving git operation" + // bug. `git status` only reports working-tree dirtiness vs HEAD, so a + // merge / pull / checkout / rebase / reset (and even post-commit) leaves + // a clean tree and used to trick sync into reporting "up to date" while + // the DB still held pre-operation content hashes. The fix detects HEAD + // movement by comparing current HEAD against a stored last-synced HEAD + // and unioning `git diff` output into the changed-file set. + describe('HEAD-moving git operations', () => { + let testDir: string; + let cg: CodeGraph; + + function git(...args: string[]) { + execFileSync('git', args, { cwd: testDir, stdio: 'pipe' }); + } + + beforeEach(async () => { + testDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-head-move-')); + + git('init'); + git('config', 'user.email', 'test@test.com'); + git('config', 'user.name', 'Test'); + // Pin initial branch name so subsequent checkouts are deterministic + // across git versions that default to master vs main. + git('symbolic-ref', 'HEAD', 'refs/heads/main'); + + const srcDir = path.join(testDir, 'src'); + fs.mkdirSync(srcDir); + fs.writeFileSync( + path.join(srcDir, 'index.ts'), + `export function hello() { return 'world'; }` + ); + + git('add', '-A'); + git('commit', '-m', 'initial'); + + cg = CodeGraph.initSync(testDir, { + config: { include: ['**/*.ts'], exclude: [] }, + }); + await cg.indexAll(); + }); + + afterEach(() => { + if (cg) cg.destroy(); + if (fs.existsSync(testDir)) { + fs.rmSync(testDir, { recursive: true, force: true }); + } + }); + + it('should detect changes brought in by `git merge`', async () => { + // Branch off, modify on the branch, commit, switch back, merge. + git('checkout', '-b', 'feature'); + fs.writeFileSync( + path.join(testDir, 'src', 'index.ts'), + `export function merged() { return 'from-branch'; }` + ); + fs.writeFileSync( + path.join(testDir, 'src', 'added.ts'), + `export function fromBranch() { return 1; }` + ); + git('add', '-A'); + git('commit', '-m', 'feature work'); + git('checkout', 'main'); + git('merge', '--no-ff', 'feature', '-m', 'merge feature'); + + // Working tree is clean post-merge — `git status` shows nothing. + const result = await cg.sync(); + + expect(result.filesModified + result.filesAdded).toBeGreaterThanOrEqual(2); + expect(cg.searchNodes('merged').length).toBeGreaterThan(0); + expect(cg.searchNodes('fromBranch').length).toBeGreaterThan(0); + expect(cg.searchNodes('hello').length).toBe(0); + }); + + it('should detect changes after `git checkout` to a different branch', async () => { + git('checkout', '-b', 'other'); + fs.writeFileSync( + path.join(testDir, 'src', 'index.ts'), + `export function onOther() { return 'other'; }` + ); + git('add', '-A'); + git('commit', '-m', 'other work'); + git('checkout', 'main'); + // We're back on main, where `hello` exists. Before the fix, sync + // here would no-op because the working tree matches HEAD (= main). + // But the index was last synced against `other`, so we expect the + // diff main..other to flow through and bring the index in line + // with the current branch. + git('checkout', 'other'); + + const result = await cg.sync(); + + expect(result.filesModified).toBeGreaterThanOrEqual(1); + expect(cg.searchNodes('onOther').length).toBeGreaterThan(0); + expect(cg.searchNodes('hello').length).toBe(0); + }); + + it('should detect file deletion brought in by a committed change', async () => { + git('rm', path.join('src', 'index.ts')); + git('commit', '-m', 'remove index'); + + const result = await cg.sync(); + + expect(result.filesRemoved).toBe(1); + expect(cg.searchNodes('hello').length).toBe(0); + }); + + it('should fall back to full scan when last-synced HEAD is unreachable', async () => { + // Modify and commit, then rewrite history so the previously-synced + // HEAD (recorded by indexAll in beforeEach) is no longer reachable. + fs.writeFileSync( + path.join(testDir, 'src', 'index.ts'), + `export function rewritten() { return 'rewritten'; }` + ); + git('add', '-A'); + git('commit', '--amend', '-m', 'rewritten'); + // `git gc --prune=now` would sever the orphaned commit, but amending + // already moves HEAD to a new SHA the index has never seen and the + // OLD SHA may or may not be reachable. We verify behavior is correct + // either way: sync brings the index in line with current state. + const result = await cg.sync(); + + expect(result.filesModified + result.filesAdded).toBeGreaterThanOrEqual(1); + expect(cg.searchNodes('rewritten').length).toBeGreaterThan(0); + expect(cg.searchNodes('hello').length).toBe(0); + }); + + it('should still no-op when HEAD has not moved and tree is clean', async () => { + // Sanity: the new HEAD-tracking code must not introduce spurious work. + const result = await cg.sync(); + + expect(result.filesAdded).toBe(0); + expect(result.filesModified).toBe(0); + expect(result.filesRemoved).toBe(0); + }); + }); }); diff --git a/src/extraction/index.ts b/src/extraction/index.ts index 4ad056fb..01c6cfb7 100644 --- a/src/extraction/index.ts +++ b/src/extraction/index.ts @@ -185,44 +185,162 @@ interface GitChanges { } /** - * Use `git status` to detect changed files instead of scanning every file. - * Returns null on failure so callers fall back to full scan. + * Project-metadata key holding the HEAD SHA the index was last synced against. + * Used to detect HEAD-moving operations (merge, pull, checkout, rebase, + * reset, post-commit) that leave the working tree clean — which `git status` + * alone cannot see. */ -function getGitChangedFiles(rootDir: string, config: CodeGraphConfig): GitChanges | null { +export const LAST_SYNCED_HEAD_KEY = 'last_synced_head'; + +interface GitChangesResult { + changes: GitChanges; + /** Current HEAD SHA, or null if not in a git repo or repo has no commits yet. */ + currentHead: string | null; + /** + * True when the previously-synced HEAD is no longer reachable from current + * HEAD (e.g., after a force-push, history rewrite, or `git gc`). Caller + * should treat this as "git history is unreliable here" and fall back to + * a full filesystem scan. + */ + needsFullReindex: boolean; +} + +/** + * Get the current HEAD commit SHA. Returns null when not in a git repo or + * the repo has no commits yet. + */ +export function getGitHead(rootDir: string): string | null { try { - const output = execFileSync( + return execFileSync( + 'git', + ['rev-parse', 'HEAD'], + { cwd: rootDir, encoding: 'utf-8', timeout: 5000, stdio: ['pipe', 'pipe', 'pipe'] } + ).trim() || null; + } catch { + return null; + } +} + +/** + * Detect changed files using git, combining two sources: + * + * 1. `git status --porcelain` — uncommitted edits in the working tree. + * 2. `git diff ..HEAD` — committed changes since last + * sync. This catches operations that move HEAD without dirtying the + * working tree (merge, pull, checkout, rebase, reset, post-commit). + * + * Without (2), a `git merge` (etc.) would silently leave the index stale + * because the working tree is clean and `git status` reports nothing. + * + * Returns null when git is unavailable (non-git project or status failure) + * so the caller falls back to a full filesystem scan. Returns + * `needsFullReindex: true` when the last-synced HEAD is unreachable + * (force-push, gc), which also calls for a full scan. + */ +function getGitChangedFiles( + rootDir: string, + config: CodeGraphConfig, + lastSyncedHead: string | null +): GitChangesResult | null { + let statusOutput: string; + try { + statusOutput = execFileSync( 'git', ['status', '--porcelain', '--no-renames'], { cwd: rootDir, encoding: 'utf-8', timeout: 10000, stdio: ['pipe', 'pipe', 'pipe'] } ); + } catch { + return null; + } - const modified: string[] = []; - const added: string[] = []; - const deleted: string[] = []; + const currentHead = getGitHead(rootDir); + + // Two parallel maps: candidates (files that exist or may exist on disk + // and need an index check) and deletions (files git says were removed). + // Origin distinguishes untracked-add (skip hash compare) from + // modified/committed (do hash compare). + const candidates = new Map(); + const deletions = new Set(); + + for (const line of statusOutput.split('\n')) { + if (line.length < 4) continue; + const code = line.substring(0, 2); + const filePath = normalizePath(line.substring(3)); + if (!shouldIncludeFile(filePath, config)) continue; + + if (code === '??') { + if (!candidates.has(filePath)) candidates.set(filePath, '??'); + } else if (code.includes('D')) { + deletions.add(filePath); + } else { + candidates.set(filePath, 'modified'); + } + } - for (const line of output.split('\n')) { - if (line.length < 4) continue; // Minimum: "XY file" + // Union committed changes since last sync. + if (currentHead && lastSyncedHead && currentHead !== lastSyncedHead) { + // Verify the previously-synced commit is still reachable. If history + // was rewritten (force-push) or pruned (gc), we cannot diff against it + // and must full-reindex. + try { + execFileSync( + 'git', + ['cat-file', '-e', `${lastSyncedHead}^{commit}`], + { cwd: rootDir, encoding: 'utf-8', timeout: 5000, stdio: ['pipe', 'pipe', 'pipe'] } + ); + } catch { + logDebug('Last-synced HEAD unreachable, falling back to full reindex', { lastSyncedHead, currentHead }); + return { changes: { modified: [], added: [], deleted: [] }, currentHead, needsFullReindex: true }; + } - const statusCode = line.substring(0, 2); - const filePath = normalizePath(line.substring(3)); + let diffOutput: string; + try { + // -z: NUL-delimited fields/records, robust against arbitrary path chars. + // --no-renames: keep semantics consistent with the status call above. + diffOutput = execFileSync( + 'git', + ['diff', '--name-status', '--no-renames', '-z', `${lastSyncedHead}..${currentHead}`], + { cwd: rootDir, encoding: 'utf-8', timeout: 30000, maxBuffer: 50 * 1024 * 1024, stdio: ['pipe', 'pipe', 'pipe'] } + ); + } catch { + logDebug('git diff against last-synced HEAD failed, falling back to full reindex', { lastSyncedHead, currentHead }); + return { changes: { modified: [], added: [], deleted: [] }, currentHead, needsFullReindex: true }; + } - // Skip files that don't match include/exclude config + // With -z + --name-status the stream is: status \0 path \0 status \0 path \0 ... + const tokens = diffOutput.split('\0').filter((t) => t.length > 0); + for (let i = 0; i + 1 < tokens.length; i += 2) { + const code = tokens[i]!; + const filePath = normalizePath(tokens[i + 1]!); if (!shouldIncludeFile(filePath, config)) continue; - if (statusCode === '??') { - added.push(filePath); - } else if (statusCode.includes('D')) { - deleted.push(filePath); + if (code.startsWith('D')) { + deletions.add(filePath); } else { - // M, MM, AM, A (staged), etc. — treat as modified - modified.push(filePath); + // A/M/T (and C with --no-renames) — caller will read+hash and let + // the DB lookup decide whether it's truly an add or a modify. + if (!candidates.has(filePath)) candidates.set(filePath, 'modified'); } } + } - return { modified, added, deleted }; - } catch { - return null; + // A file present in both sets exists on disk now (working tree wins over + // recorded deletion — e.g., file deleted in commit, then re-created + // uncommitted). + for (const filePath of candidates.keys()) deletions.delete(filePath); + + const modified: string[] = []; + const added: string[] = []; + for (const [filePath, origin] of candidates) { + if (origin === '??') added.push(filePath); + else modified.push(filePath); } + + return { + changes: { modified, added, deleted: Array.from(deletions) }, + currentHead, + needsFullReindex: false, + }; } /** @@ -856,6 +974,13 @@ export class ExtractionOrchestrator { (parseWorker as import('worker_threads').Worker).terminate().catch(() => {}); } + // Establish a baseline HEAD so the next sync can detect HEAD-moving git + // operations against this index. + const headAfterIndex = getGitHead(this.rootDir); + if (headAfterIndex) { + this.queries.setMetadata(LAST_SYNCED_HEAD_KEY, headAfterIndex); + } + return { success: filesIndexed > 0 || errors.filter((e) => e.severity === 'error').length === 0, filesIndexed, @@ -1109,7 +1234,12 @@ export class ExtractionOrchestrator { }); const filesToIndex: string[] = []; - const gitChanges = getGitChangedFiles(this.rootDir, this.config); + const lastSyncedHead = this.queries.getMetadata(LAST_SYNCED_HEAD_KEY); + const gitResult = getGitChangedFiles(this.rootDir, this.config, lastSyncedHead); + const currentHead = gitResult?.currentHead ?? null; + // When the last-synced HEAD is unreachable we drop to the filesystem + // fallback, which uses on-disk hashes and is correct regardless of git. + const gitChanges = gitResult && !gitResult.needsFullReindex ? gitResult.changes : null; if (gitChanges) { // === Git fast path === @@ -1227,6 +1357,13 @@ export class ExtractionOrchestrator { nodesUpdated += result.nodes.length; } + // Persist current HEAD so the next sync can detect HEAD-moving git + // operations (merge, pull, checkout, rebase, reset, post-commit) even + // when they leave the working tree clean. + if (currentHead) { + this.queries.setMetadata(LAST_SYNCED_HEAD_KEY, currentHead); + } + return { filesChecked, filesAdded, @@ -1243,7 +1380,11 @@ export class ExtractionOrchestrator { * Uses git status as a fast path when available, falling back to full scan. */ getChangedFiles(): { added: string[]; modified: string[]; removed: string[] } { - const gitChanges = getGitChangedFiles(this.rootDir, this.config); + const lastSyncedHead = this.queries.getMetadata(LAST_SYNCED_HEAD_KEY); + const gitResult = getGitChangedFiles(this.rootDir, this.config, lastSyncedHead); + // Unreachable last-synced HEAD → drop to the filesystem fallback, which + // is correct regardless of git history state. + const gitChanges = gitResult && !gitResult.needsFullReindex ? gitResult.changes : null; if (gitChanges) { // === Git fast path ===