Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
136 changes: 136 additions & 0 deletions __tests__/sync.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -259,4 +259,140 @@ describe('Sync Module', () => {
expect(result.changedFilePaths).toBeUndefined();
});
});

// Regression tests for the "stale index after HEAD-moving git operation"
// bug. `git status` only reports working-tree dirtiness vs HEAD, so a
// merge / pull / checkout / rebase / reset (and even post-commit) leaves
// a clean tree and used to trick sync into reporting "up to date" while
// the DB still held pre-operation content hashes. The fix detects HEAD
// movement by comparing current HEAD against a stored last-synced HEAD
// and unioning `git diff` output into the changed-file set.
describe('HEAD-moving git operations', () => {
let testDir: string;
let cg: CodeGraph;

function git(...args: string[]) {
execFileSync('git', args, { cwd: testDir, stdio: 'pipe' });
}

beforeEach(async () => {
testDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-head-move-'));

git('init');
git('config', 'user.email', 'test@test.com');
git('config', 'user.name', 'Test');
// Pin initial branch name so subsequent checkouts are deterministic
// across git versions that default to master vs main.
git('symbolic-ref', 'HEAD', 'refs/heads/main');

const srcDir = path.join(testDir, 'src');
fs.mkdirSync(srcDir);
fs.writeFileSync(
path.join(srcDir, 'index.ts'),
`export function hello() { return 'world'; }`
);

git('add', '-A');
git('commit', '-m', 'initial');

cg = CodeGraph.initSync(testDir, {
config: { include: ['**/*.ts'], exclude: [] },
});
await cg.indexAll();
});

afterEach(() => {
if (cg) cg.destroy();
if (fs.existsSync(testDir)) {
fs.rmSync(testDir, { recursive: true, force: true });
}
});

it('should detect changes brought in by `git merge`', async () => {
// Branch off, modify on the branch, commit, switch back, merge.
git('checkout', '-b', 'feature');
fs.writeFileSync(
path.join(testDir, 'src', 'index.ts'),
`export function merged() { return 'from-branch'; }`
);
fs.writeFileSync(
path.join(testDir, 'src', 'added.ts'),
`export function fromBranch() { return 1; }`
);
git('add', '-A');
git('commit', '-m', 'feature work');
git('checkout', 'main');
git('merge', '--no-ff', 'feature', '-m', 'merge feature');

// Working tree is clean post-merge — `git status` shows nothing.
const result = await cg.sync();

expect(result.filesModified + result.filesAdded).toBeGreaterThanOrEqual(2);
expect(cg.searchNodes('merged').length).toBeGreaterThan(0);
expect(cg.searchNodes('fromBranch').length).toBeGreaterThan(0);
expect(cg.searchNodes('hello').length).toBe(0);
});

it('should detect changes after `git checkout` to a different branch', async () => {
git('checkout', '-b', 'other');
fs.writeFileSync(
path.join(testDir, 'src', 'index.ts'),
`export function onOther() { return 'other'; }`
);
git('add', '-A');
git('commit', '-m', 'other work');
git('checkout', 'main');
// We're back on main, where `hello` exists. Before the fix, sync
// here would no-op because the working tree matches HEAD (= main).
// But the index was last synced against `other`, so we expect the
// diff main..other to flow through and bring the index in line
// with the current branch.
git('checkout', 'other');

const result = await cg.sync();

expect(result.filesModified).toBeGreaterThanOrEqual(1);
expect(cg.searchNodes('onOther').length).toBeGreaterThan(0);
expect(cg.searchNodes('hello').length).toBe(0);
});

it('should detect file deletion brought in by a committed change', async () => {
git('rm', path.join('src', 'index.ts'));
git('commit', '-m', 'remove index');

const result = await cg.sync();

expect(result.filesRemoved).toBe(1);
expect(cg.searchNodes('hello').length).toBe(0);
});

it('should fall back to full scan when last-synced HEAD is unreachable', async () => {
// Modify and commit, then rewrite history so the previously-synced
// HEAD (recorded by indexAll in beforeEach) is no longer reachable.
fs.writeFileSync(
path.join(testDir, 'src', 'index.ts'),
`export function rewritten() { return 'rewritten'; }`
);
git('add', '-A');
git('commit', '--amend', '-m', 'rewritten');
// `git gc --prune=now` would sever the orphaned commit, but amending
// already moves HEAD to a new SHA the index has never seen and the
// OLD SHA may or may not be reachable. We verify behavior is correct
// either way: sync brings the index in line with current state.
const result = await cg.sync();

expect(result.filesModified + result.filesAdded).toBeGreaterThanOrEqual(1);
expect(cg.searchNodes('rewritten').length).toBeGreaterThan(0);
expect(cg.searchNodes('hello').length).toBe(0);
});

it('should still no-op when HEAD has not moved and tree is clean', async () => {
// Sanity: the new HEAD-tracking code must not introduce spurious work.
const result = await cg.sync();

expect(result.filesAdded).toBe(0);
expect(result.filesModified).toBe(0);
expect(result.filesRemoved).toBe(0);
});
});
});
187 changes: 164 additions & 23 deletions src/extraction/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -185,44 +185,162 @@ interface GitChanges {
}

/**
* Use `git status` to detect changed files instead of scanning every file.
* Returns null on failure so callers fall back to full scan.
* Project-metadata key holding the HEAD SHA the index was last synced against.
* Used to detect HEAD-moving operations (merge, pull, checkout, rebase,
* reset, post-commit) that leave the working tree clean — which `git status`
* alone cannot see.
*/
function getGitChangedFiles(rootDir: string, config: CodeGraphConfig): GitChanges | null {
export const LAST_SYNCED_HEAD_KEY = 'last_synced_head';

interface GitChangesResult {
changes: GitChanges;
/** Current HEAD SHA, or null if not in a git repo or repo has no commits yet. */
currentHead: string | null;
/**
* True when the previously-synced HEAD is no longer reachable from current
* HEAD (e.g., after a force-push, history rewrite, or `git gc`). Caller
* should treat this as "git history is unreliable here" and fall back to
* a full filesystem scan.
*/
needsFullReindex: boolean;
}

/**
* Get the current HEAD commit SHA. Returns null when not in a git repo or
* the repo has no commits yet.
*/
export function getGitHead(rootDir: string): string | null {
try {
const output = execFileSync(
return execFileSync(
'git',
['rev-parse', 'HEAD'],
{ cwd: rootDir, encoding: 'utf-8', timeout: 5000, stdio: ['pipe', 'pipe', 'pipe'] }
).trim() || null;
} catch {
return null;
}
}

/**
* Detect changed files using git, combining two sources:
*
* 1. `git status --porcelain` — uncommitted edits in the working tree.
* 2. `git diff <lastSyncedHead>..HEAD` — committed changes since last
* sync. This catches operations that move HEAD without dirtying the
* working tree (merge, pull, checkout, rebase, reset, post-commit).
*
* Without (2), a `git merge` (etc.) would silently leave the index stale
* because the working tree is clean and `git status` reports nothing.
*
* Returns null when git is unavailable (non-git project or status failure)
* so the caller falls back to a full filesystem scan. Returns
* `needsFullReindex: true` when the last-synced HEAD is unreachable
* (force-push, gc), which also calls for a full scan.
*/
function getGitChangedFiles(
rootDir: string,
config: CodeGraphConfig,
lastSyncedHead: string | null
): GitChangesResult | null {
let statusOutput: string;
try {
statusOutput = execFileSync(
'git',
['status', '--porcelain', '--no-renames'],
{ cwd: rootDir, encoding: 'utf-8', timeout: 10000, stdio: ['pipe', 'pipe', 'pipe'] }
);
} catch {
return null;
}

const modified: string[] = [];
const added: string[] = [];
const deleted: string[] = [];
const currentHead = getGitHead(rootDir);

// Two parallel maps: candidates (files that exist or may exist on disk
// and need an index check) and deletions (files git says were removed).
// Origin distinguishes untracked-add (skip hash compare) from
// modified/committed (do hash compare).
const candidates = new Map<string, '??' | 'modified'>();
const deletions = new Set<string>();

for (const line of statusOutput.split('\n')) {
if (line.length < 4) continue;
const code = line.substring(0, 2);
const filePath = normalizePath(line.substring(3));
if (!shouldIncludeFile(filePath, config)) continue;

if (code === '??') {
if (!candidates.has(filePath)) candidates.set(filePath, '??');
} else if (code.includes('D')) {
deletions.add(filePath);
} else {
candidates.set(filePath, 'modified');
}
}

for (const line of output.split('\n')) {
if (line.length < 4) continue; // Minimum: "XY file"
// Union committed changes since last sync.
if (currentHead && lastSyncedHead && currentHead !== lastSyncedHead) {
// Verify the previously-synced commit is still reachable. If history
// was rewritten (force-push) or pruned (gc), we cannot diff against it
// and must full-reindex.
try {
execFileSync(
'git',
['cat-file', '-e', `${lastSyncedHead}^{commit}`],
{ cwd: rootDir, encoding: 'utf-8', timeout: 5000, stdio: ['pipe', 'pipe', 'pipe'] }
);
} catch {
logDebug('Last-synced HEAD unreachable, falling back to full reindex', { lastSyncedHead, currentHead });
return { changes: { modified: [], added: [], deleted: [] }, currentHead, needsFullReindex: true };
}

const statusCode = line.substring(0, 2);
const filePath = normalizePath(line.substring(3));
let diffOutput: string;
try {
// -z: NUL-delimited fields/records, robust against arbitrary path chars.
// --no-renames: keep semantics consistent with the status call above.
diffOutput = execFileSync(
'git',
['diff', '--name-status', '--no-renames', '-z', `${lastSyncedHead}..${currentHead}`],
{ cwd: rootDir, encoding: 'utf-8', timeout: 30000, maxBuffer: 50 * 1024 * 1024, stdio: ['pipe', 'pipe', 'pipe'] }
);
} catch {
logDebug('git diff against last-synced HEAD failed, falling back to full reindex', { lastSyncedHead, currentHead });
return { changes: { modified: [], added: [], deleted: [] }, currentHead, needsFullReindex: true };
}

// Skip files that don't match include/exclude config
// With -z + --name-status the stream is: status \0 path \0 status \0 path \0 ...
const tokens = diffOutput.split('\0').filter((t) => t.length > 0);
for (let i = 0; i + 1 < tokens.length; i += 2) {
const code = tokens[i]!;
const filePath = normalizePath(tokens[i + 1]!);
if (!shouldIncludeFile(filePath, config)) continue;

if (statusCode === '??') {
added.push(filePath);
} else if (statusCode.includes('D')) {
deleted.push(filePath);
if (code.startsWith('D')) {
deletions.add(filePath);
} else {
// M, MM, AM, A (staged), etc. — treat as modified
modified.push(filePath);
// A/M/T (and C with --no-renames) — caller will read+hash and let
// the DB lookup decide whether it's truly an add or a modify.
if (!candidates.has(filePath)) candidates.set(filePath, 'modified');
}
}
}

return { modified, added, deleted };
} catch {
return null;
// A file present in both sets exists on disk now (working tree wins over
// recorded deletion — e.g., file deleted in commit, then re-created
// uncommitted).
for (const filePath of candidates.keys()) deletions.delete(filePath);

const modified: string[] = [];
const added: string[] = [];
for (const [filePath, origin] of candidates) {
if (origin === '??') added.push(filePath);
else modified.push(filePath);
}

return {
changes: { modified, added, deleted: Array.from(deletions) },
currentHead,
needsFullReindex: false,
};
}

/**
Expand Down Expand Up @@ -856,6 +974,13 @@ export class ExtractionOrchestrator {
(parseWorker as import('worker_threads').Worker).terminate().catch(() => {});
}

// Establish a baseline HEAD so the next sync can detect HEAD-moving git
// operations against this index.
const headAfterIndex = getGitHead(this.rootDir);
if (headAfterIndex) {
this.queries.setMetadata(LAST_SYNCED_HEAD_KEY, headAfterIndex);
}

return {
success: filesIndexed > 0 || errors.filter((e) => e.severity === 'error').length === 0,
filesIndexed,
Expand Down Expand Up @@ -1109,7 +1234,12 @@ export class ExtractionOrchestrator {
});

const filesToIndex: string[] = [];
const gitChanges = getGitChangedFiles(this.rootDir, this.config);
const lastSyncedHead = this.queries.getMetadata(LAST_SYNCED_HEAD_KEY);
const gitResult = getGitChangedFiles(this.rootDir, this.config, lastSyncedHead);
const currentHead = gitResult?.currentHead ?? null;
// When the last-synced HEAD is unreachable we drop to the filesystem
// fallback, which uses on-disk hashes and is correct regardless of git.
const gitChanges = gitResult && !gitResult.needsFullReindex ? gitResult.changes : null;

if (gitChanges) {
// === Git fast path ===
Expand Down Expand Up @@ -1227,6 +1357,13 @@ export class ExtractionOrchestrator {
nodesUpdated += result.nodes.length;
}

// Persist current HEAD so the next sync can detect HEAD-moving git
// operations (merge, pull, checkout, rebase, reset, post-commit) even
// when they leave the working tree clean.
if (currentHead) {
this.queries.setMetadata(LAST_SYNCED_HEAD_KEY, currentHead);
}

return {
filesChecked,
filesAdded,
Expand All @@ -1243,7 +1380,11 @@ export class ExtractionOrchestrator {
* Uses git status as a fast path when available, falling back to full scan.
*/
getChangedFiles(): { added: string[]; modified: string[]; removed: string[] } {
const gitChanges = getGitChangedFiles(this.rootDir, this.config);
const lastSyncedHead = this.queries.getMetadata(LAST_SYNCED_HEAD_KEY);
const gitResult = getGitChangedFiles(this.rootDir, this.config, lastSyncedHead);
// Unreachable last-synced HEAD → drop to the filesystem fallback, which
// is correct regardless of git history state.
const gitChanges = gitResult && !gitResult.needsFullReindex ? gitResult.changes : null;

if (gitChanges) {
// === Git fast path ===
Expand Down