Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
200 changes: 200 additions & 0 deletions __tests__/diversify.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,200 @@
/**
* Result Diversification Tests
*
* Verifies the per-file cap on search results: queries that match many
* symbols in one file (the methods of a class) no longer return 10 hits
* from one file, but instead surface representative breadth across files.
*/

import { describe, it, expect, beforeEach, afterEach } from 'vitest';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
import { DatabaseConnection } from '../src/db';
import { QueryBuilder } from '../src/db/queries';
import { diversifyByFile } from '../src/search/query-utils';
import { Node } from '../src/types';

describe('diversifyByFile (unit)', () => {
function r(score: number, name: string, filePath: string) {
return { node: { id: name, name, filePath } as Node, score };
}

it('caps consecutive results from the same file at perFileCap', () => {
const results = [
r(10, 'a1', 'a.ts'),
r(9, 'a2', 'a.ts'),
r(8, 'a3', 'a.ts'),
r(7, 'a4', 'a.ts'),
r(6, 'b1', 'b.ts'),
];
const out = diversifyByFile(results, 5, 2);
expect(out.map((x) => x.node.name)).toEqual(['a1', 'a2', 'b1', 'a3', 'a4']);
// First two from a.ts (cap), then b.ts (different file), then backfill.
});

it('preserves overall ranking when no file dominates', () => {
const results = [
r(10, 'a1', 'a.ts'),
r(9, 'b1', 'b.ts'),
r(8, 'c1', 'c.ts'),
r(7, 'a2', 'a.ts'),
];
const out = diversifyByFile(results, 4, 2);
expect(out.map((x) => x.node.name)).toEqual(['a1', 'b1', 'c1', 'a2']);
});

it('does not lose results — backfills from skipped when limit not yet filled', () => {
// 10 candidates all from one file, limit 5, cap 2: pick 2, backfill 3.
const results = Array.from({ length: 10 }, (_, i) =>
r(10 - i, `n${i}`, 'a.ts')
);
const out = diversifyByFile(results, 5, 2);
expect(out).toHaveLength(5);
expect(out.every((x) => x.node.filePath === 'a.ts')).toBe(true);
});

it('returns the input slice unchanged when perFileCap=0', () => {
const results = [
r(10, 'a1', 'a.ts'),
r(9, 'a2', 'a.ts'),
r(8, 'a3', 'a.ts'),
];
expect(diversifyByFile(results, 3, 0)).toEqual(results);
});

it('returns input unchanged when results.length <= limit and no reordering needed', () => {
const results = [r(10, 'a1', 'a.ts'), r(9, 'a2', 'a.ts')];
expect(diversifyByFile(results, 5, 2)).toEqual(results);
});

it('still reorders within limit when results.length === limit but cap rearranges', () => {
// Same total count as limit, but the cap reorders to surface peer files
// earlier in the list.
const results = [
r(10, 'a1', 'a.ts'),
r(9, 'a2', 'a.ts'),
r(8, 'a3', 'a.ts'),
r(7, 'a4', 'a.ts'),
r(6, 'b1', 'b.ts'),
];
const out = diversifyByFile(results, 5, 2);
// First 2 from a.ts (cap), then b.ts, then backfill a.ts.
expect(out.map((x) => x.node.name)).toEqual(['a1', 'a2', 'b1', 'a3', 'a4']);
});

it('respects the limit even when picked + skipped exceed it', () => {
const results = [
r(10, 'a1', 'a.ts'),
r(9, 'a2', 'a.ts'),
r(8, 'a3', 'a.ts'),
r(7, 'b1', 'b.ts'),
];
const out = diversifyByFile(results, 2, 2);
expect(out).toHaveLength(2);
expect(out.map((x) => x.node.name)).toEqual(['a1', 'a2']);
});

it('always preserves the top-scoring result at position 0', () => {
const results = [
r(100, 'top', 'big.ts'),
r(50, 'big2', 'big.ts'),
r(40, 'big3', 'big.ts'),
r(30, 'big4', 'big.ts'),
r(20, 'other', 'other.ts'),
];
const out = diversifyByFile(results, 3, 2);
expect(out[0].node.name).toBe('top');
});
});

describe('searchNodes per-file diversification (integration)', () => {
let dir: string;
let db: DatabaseConnection;
let q: QueryBuilder;

function makeNode(id: string, name: string, kind: Node['kind'], filePath: string): Node {
return {
id,
kind,
name,
qualifiedName: `${filePath}::${name}`,
filePath,
language: 'typescript',
startLine: 1,
endLine: 1,
startColumn: 0,
endColumn: 0,
updatedAt: Date.now(),
};
}

beforeEach(() => {
dir = fs.mkdtempSync(path.join(os.tmpdir(), 'diversify-search-'));
db = DatabaseConnection.initialize(path.join(dir, 'test.db'));
q = new QueryBuilder(db.getDb());
// Simulate the "10 methods of one class" scenario: a class plus many
// methods all sharing a common token, all in one file. Plus a peer
// file with a sibling implementation.
const nodes: Node[] = [
makeNode('cls', 'DatabaseConnection', 'class', 'src/db.ts'),
makeNode('m1', 'connect', 'method', 'src/db.ts'),
makeNode('m2', 'disconnect', 'method', 'src/db.ts'),
makeNode('m3', 'reconnect', 'method', 'src/db.ts'),
makeNode('m4', 'isConnected', 'method', 'src/db.ts'),
makeNode('m5', 'connectionString', 'property', 'src/db.ts'),
makeNode('peer', 'PoolConnection', 'class', 'src/pool.ts'),
makeNode('peer2', 'connectPool', 'function', 'src/pool.ts'),
];
q.insertNodes(nodes);
});

afterEach(() => {
db.close();
if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true });
});

it('caps results per file at the default (3) so peer files surface', () => {
const results = q.searchNodes('connect', { limit: 5 });
const fromDbTs = results.filter((r) => r.node.filePath === 'src/db.ts').length;
const fromPool = results.filter((r) => r.node.filePath === 'src/pool.ts').length;
expect(fromDbTs).toBeLessThanOrEqual(3); // cap
expect(fromPool).toBeGreaterThanOrEqual(1); // peer file represented
});

it('honors perFileCap: 0 (disabled) — does not enforce a per-file limit', () => {
// Insert a heavy imbalance so dominance is unambiguous: 10 matching
// methods in db.ts, only the existing pool.ts entries elsewhere.
const heavyDb: Node[] = Array.from({ length: 10 }, (_, i) =>
makeNode(`heavy${i}`, `connectVariant${i}`, 'method', 'src/db.ts')
);
q.insertNodes(heavyDb);
const results = q.searchNodes('connect', { limit: 8, perFileCap: 0 });
const fromDbTs = results.filter((r) => r.node.filePath === 'src/db.ts').length;
expect(fromDbTs).toBeGreaterThan(3);
});

it('honors a higher perFileCap', () => {
const results = q.searchNodes('connect', { limit: 6, perFileCap: 5 });
const fromDbTs = results.filter((r) => r.node.filePath === 'src/db.ts').length;
expect(fromDbTs).toBeLessThanOrEqual(5);
});

it('preserves the top-scoring hit even with diversification', () => {
// Class node with the most direct name match is the most relevant —
// diversification must never displace it from #1.
const results = q.searchNodes('DatabaseConnection', { limit: 3 });
expect(results[0].node.name).toBe('DatabaseConnection');
});

it('does not lose results — fills limit by backfilling skipped same-file hits', () => {
// If only one file has matches, all results legitimately come from it.
// The cap should not cause us to return fewer than `limit` results.
const onlyOneFileNodes: Node[] = Array.from({ length: 10 }, (_, i) =>
makeNode(`only${i}`, `solo${i}`, 'function', 'src/only.ts')
);
q.insertNodes(onlyOneFileNodes);
const results = q.searchNodes('solo', { limit: 5 });
expect(results.length).toBe(5);
});
});
9 changes: 8 additions & 1 deletion src/bin/codegraph.ts
Original file line number Diff line number Diff line change
Expand Up @@ -744,10 +744,17 @@ program
} else {
console.log(chalk.bold(`\nSearch Results for "${search}":\n`));

// searchNodes returns BM25 + bonuses, which is unbounded.
// Render each result's score as a fraction of the top hit so
// the user gets a relative confidence (top = 100%) instead of
// nonsensical "10449%" raw weights.
const topScore = results[0]!.score;
for (const result of results) {
const node = result.node;
const location = `${node.filePath}:${node.startLine}`;
const score = chalk.dim(`(${(result.score * 100).toFixed(0)}%)`);
const relPct =
topScore > 0 ? Math.round((result.score / topScore) * 100) : 0;
const score = chalk.dim(`(${relPct}%)`);

console.log(
chalk.cyan(node.kind.padEnd(12)) +
Expand Down
31 changes: 25 additions & 6 deletions src/db/queries.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ import {
SearchResult,
} from '../types';
import { safeJsonParse } from '../utils';
import { kindBonus, nameMatchBonus, scorePathRelevance } from '../search/query-utils';
import { kindBonus, nameMatchBonus, scorePathRelevance, diversifyByFile } from '../search/query-utils';

/**
* Database row types (snake_case from SQLite)
Expand Down Expand Up @@ -478,7 +478,13 @@ export class QueryBuilder {
* 3. Score results based on match quality
*/
searchNodes(query: string, options: SearchOptions = {}): SearchResult[] {
const { kinds, languages, limit = 100, offset = 0 } = options;
const { kinds, languages, limit = 100, offset = 0, perFileCap = 3 } = options;

// Note on over-fetching: searchNodesFTS already over-fetches by 5x
// internally (Math.max(limit*5, 100)) so its own rescoring pass has
// headroom. That same headroom feeds the per-file diversification
// below — no additional outer multiplier needed. Keeping this comment
// here so future readers don't reintroduce a multiplier-on-multiplier.

// First try FTS5 with prefix matching
let results = this.searchNodesFTS(query, { kinds, languages, limit, offset });
Expand Down Expand Up @@ -530,10 +536,23 @@ export class QueryBuilder {
+ nameMatchBonus(r.node.name, query),
}));
results.sort((a, b) => b.score - a.score);
// Trim to requested limit after rescoring
if (results.length > limit) {
results = results.slice(0, limit);
}
}

// Diversification: cap per-file results so the top-K isn't dominated
// by the methods of a single class. Top-scoring hit per file is always
// included; the cap only kicks in for the second-and-onward members
// of the same file. perFileCap=0 disables.
//
// Guard `results.length > limit`: when results <= limit there's
// nothing to drop, so the existing score order is already what the
// caller will see. (`diversifyByFile` is also safe to call here and
// would reorder within the same set, but the existing rescore order
// is already meaningful and we don't want to perturb it without
// benefit.)
if (perFileCap > 0 && results.length > limit) {
results = diversifyByFile(results, limit, perFileCap);
} else if (results.length > limit) {
results = results.slice(0, limit);
}

return results;
Expand Down
46 changes: 46 additions & 0 deletions src/search/query-utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -333,3 +333,49 @@ export function kindBonus(kind: Node['kind']): number {
};
return bonuses[kind] ?? 0;
}

/**
* Cap consecutive results from the same file. Preserves overall ranking:
* the highest-scoring hit from each file is taken first (up to `perFileCap`
* per file), in score order. If `limit` isn't filled after the capped
* pass, the remaining slots are filled with the next-best hits regardless
* of file (preserves correctness — never hides a hit that would have
* otherwise been returned).
*
* Why: queries like `"ExtractionOrchestrator"` return the matching class
* plus 9 of its members from the same file. The first hit is informative;
* the next 9 are implementation detail that pushes peer files (subclasses,
* callers, sibling modules) past the limit. Capping per file surfaces
* representative breadth without losing the top hit.
*/
export function diversifyByFile<T extends { node: Node }>(
results: T[],
limit: number,
perFileCap: number
): T[] {
if (perFileCap <= 0) return results.slice(0, limit);
const perFile = new Map<string, number>();
const picked: T[] = [];
const skipped: T[] = [];
for (const r of results) {
const f = r.node.filePath;
const c = perFile.get(f) ?? 0;
if (c < perFileCap) {
picked.push(r);
perFile.set(f, c + 1);
if (picked.length >= limit) return picked;
} else {
skipped.push(r);
}
}
// Backfill from skipped (in original score order) so we don't return
// fewer results than the caller asked for. This also handles the
// edge case where `results.length <= limit`: nothing was actually
// dropped, but the per-file cap reordered them so peer files appear
// earlier — `picked` first, then any leftover same-file hits.
for (const r of skipped) {
if (picked.length >= limit) break;
picked.push(r);
}
return picked;
}
11 changes: 11 additions & 0 deletions src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,17 @@ export interface SearchOptions {

/** Whether search is case-sensitive */
caseSensitive?: boolean;

/**
* Cap the number of results from any single file before returning.
* Default 3. Set to 0 to disable diversification (return raw ranked
* results, even if 10 of them come from the same class). The class /
* function / interface members of the same file are usually less
* informative as multiple distinct results than as "this file plus
* representative members" — diversification surfaces context across
* the codebase rather than burying the user in one file's internals.
*/
perFileCap?: number;
}

/**
Expand Down