diff --git a/__tests__/diversify.test.ts b/__tests__/diversify.test.ts new file mode 100644 index 00000000..181ee9c5 --- /dev/null +++ b/__tests__/diversify.test.ts @@ -0,0 +1,200 @@ +/** + * Result Diversification Tests + * + * Verifies the per-file cap on search results: queries that match many + * symbols in one file (the methods of a class) no longer return 10 hits + * from one file, but instead surface representative breadth across files. + */ + +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; +import { DatabaseConnection } from '../src/db'; +import { QueryBuilder } from '../src/db/queries'; +import { diversifyByFile } from '../src/search/query-utils'; +import { Node } from '../src/types'; + +describe('diversifyByFile (unit)', () => { + function r(score: number, name: string, filePath: string) { + return { node: { id: name, name, filePath } as Node, score }; + } + + it('caps consecutive results from the same file at perFileCap', () => { + const results = [ + r(10, 'a1', 'a.ts'), + r(9, 'a2', 'a.ts'), + r(8, 'a3', 'a.ts'), + r(7, 'a4', 'a.ts'), + r(6, 'b1', 'b.ts'), + ]; + const out = diversifyByFile(results, 5, 2); + expect(out.map((x) => x.node.name)).toEqual(['a1', 'a2', 'b1', 'a3', 'a4']); + // First two from a.ts (cap), then b.ts (different file), then backfill. + }); + + it('preserves overall ranking when no file dominates', () => { + const results = [ + r(10, 'a1', 'a.ts'), + r(9, 'b1', 'b.ts'), + r(8, 'c1', 'c.ts'), + r(7, 'a2', 'a.ts'), + ]; + const out = diversifyByFile(results, 4, 2); + expect(out.map((x) => x.node.name)).toEqual(['a1', 'b1', 'c1', 'a2']); + }); + + it('does not lose results — backfills from skipped when limit not yet filled', () => { + // 10 candidates all from one file, limit 5, cap 2: pick 2, backfill 3. + const results = Array.from({ length: 10 }, (_, i) => + r(10 - i, `n${i}`, 'a.ts') + ); + const out = diversifyByFile(results, 5, 2); + expect(out).toHaveLength(5); + expect(out.every((x) => x.node.filePath === 'a.ts')).toBe(true); + }); + + it('returns the input slice unchanged when perFileCap=0', () => { + const results = [ + r(10, 'a1', 'a.ts'), + r(9, 'a2', 'a.ts'), + r(8, 'a3', 'a.ts'), + ]; + expect(diversifyByFile(results, 3, 0)).toEqual(results); + }); + + it('returns input unchanged when results.length <= limit and no reordering needed', () => { + const results = [r(10, 'a1', 'a.ts'), r(9, 'a2', 'a.ts')]; + expect(diversifyByFile(results, 5, 2)).toEqual(results); + }); + + it('still reorders within limit when results.length === limit but cap rearranges', () => { + // Same total count as limit, but the cap reorders to surface peer files + // earlier in the list. + const results = [ + r(10, 'a1', 'a.ts'), + r(9, 'a2', 'a.ts'), + r(8, 'a3', 'a.ts'), + r(7, 'a4', 'a.ts'), + r(6, 'b1', 'b.ts'), + ]; + const out = diversifyByFile(results, 5, 2); + // First 2 from a.ts (cap), then b.ts, then backfill a.ts. + expect(out.map((x) => x.node.name)).toEqual(['a1', 'a2', 'b1', 'a3', 'a4']); + }); + + it('respects the limit even when picked + skipped exceed it', () => { + const results = [ + r(10, 'a1', 'a.ts'), + r(9, 'a2', 'a.ts'), + r(8, 'a3', 'a.ts'), + r(7, 'b1', 'b.ts'), + ]; + const out = diversifyByFile(results, 2, 2); + expect(out).toHaveLength(2); + expect(out.map((x) => x.node.name)).toEqual(['a1', 'a2']); + }); + + it('always preserves the top-scoring result at position 0', () => { + const results = [ + r(100, 'top', 'big.ts'), + r(50, 'big2', 'big.ts'), + r(40, 'big3', 'big.ts'), + r(30, 'big4', 'big.ts'), + r(20, 'other', 'other.ts'), + ]; + const out = diversifyByFile(results, 3, 2); + expect(out[0].node.name).toBe('top'); + }); +}); + +describe('searchNodes per-file diversification (integration)', () => { + let dir: string; + let db: DatabaseConnection; + let q: QueryBuilder; + + function makeNode(id: string, name: string, kind: Node['kind'], filePath: string): Node { + return { + id, + kind, + name, + qualifiedName: `${filePath}::${name}`, + filePath, + language: 'typescript', + startLine: 1, + endLine: 1, + startColumn: 0, + endColumn: 0, + updatedAt: Date.now(), + }; + } + + beforeEach(() => { + dir = fs.mkdtempSync(path.join(os.tmpdir(), 'diversify-search-')); + db = DatabaseConnection.initialize(path.join(dir, 'test.db')); + q = new QueryBuilder(db.getDb()); + // Simulate the "10 methods of one class" scenario: a class plus many + // methods all sharing a common token, all in one file. Plus a peer + // file with a sibling implementation. + const nodes: Node[] = [ + makeNode('cls', 'DatabaseConnection', 'class', 'src/db.ts'), + makeNode('m1', 'connect', 'method', 'src/db.ts'), + makeNode('m2', 'disconnect', 'method', 'src/db.ts'), + makeNode('m3', 'reconnect', 'method', 'src/db.ts'), + makeNode('m4', 'isConnected', 'method', 'src/db.ts'), + makeNode('m5', 'connectionString', 'property', 'src/db.ts'), + makeNode('peer', 'PoolConnection', 'class', 'src/pool.ts'), + makeNode('peer2', 'connectPool', 'function', 'src/pool.ts'), + ]; + q.insertNodes(nodes); + }); + + afterEach(() => { + db.close(); + if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true }); + }); + + it('caps results per file at the default (3) so peer files surface', () => { + const results = q.searchNodes('connect', { limit: 5 }); + const fromDbTs = results.filter((r) => r.node.filePath === 'src/db.ts').length; + const fromPool = results.filter((r) => r.node.filePath === 'src/pool.ts').length; + expect(fromDbTs).toBeLessThanOrEqual(3); // cap + expect(fromPool).toBeGreaterThanOrEqual(1); // peer file represented + }); + + it('honors perFileCap: 0 (disabled) — does not enforce a per-file limit', () => { + // Insert a heavy imbalance so dominance is unambiguous: 10 matching + // methods in db.ts, only the existing pool.ts entries elsewhere. + const heavyDb: Node[] = Array.from({ length: 10 }, (_, i) => + makeNode(`heavy${i}`, `connectVariant${i}`, 'method', 'src/db.ts') + ); + q.insertNodes(heavyDb); + const results = q.searchNodes('connect', { limit: 8, perFileCap: 0 }); + const fromDbTs = results.filter((r) => r.node.filePath === 'src/db.ts').length; + expect(fromDbTs).toBeGreaterThan(3); + }); + + it('honors a higher perFileCap', () => { + const results = q.searchNodes('connect', { limit: 6, perFileCap: 5 }); + const fromDbTs = results.filter((r) => r.node.filePath === 'src/db.ts').length; + expect(fromDbTs).toBeLessThanOrEqual(5); + }); + + it('preserves the top-scoring hit even with diversification', () => { + // Class node with the most direct name match is the most relevant — + // diversification must never displace it from #1. + const results = q.searchNodes('DatabaseConnection', { limit: 3 }); + expect(results[0].node.name).toBe('DatabaseConnection'); + }); + + it('does not lose results — fills limit by backfilling skipped same-file hits', () => { + // If only one file has matches, all results legitimately come from it. + // The cap should not cause us to return fewer than `limit` results. + const onlyOneFileNodes: Node[] = Array.from({ length: 10 }, (_, i) => + makeNode(`only${i}`, `solo${i}`, 'function', 'src/only.ts') + ); + q.insertNodes(onlyOneFileNodes); + const results = q.searchNodes('solo', { limit: 5 }); + expect(results.length).toBe(5); + }); +}); diff --git a/src/bin/codegraph.ts b/src/bin/codegraph.ts index d118a1fd..d452e5b9 100644 --- a/src/bin/codegraph.ts +++ b/src/bin/codegraph.ts @@ -744,10 +744,17 @@ program } else { console.log(chalk.bold(`\nSearch Results for "${search}":\n`)); + // searchNodes returns BM25 + bonuses, which is unbounded. + // Render each result's score as a fraction of the top hit so + // the user gets a relative confidence (top = 100%) instead of + // nonsensical "10449%" raw weights. + const topScore = results[0]!.score; for (const result of results) { const node = result.node; const location = `${node.filePath}:${node.startLine}`; - const score = chalk.dim(`(${(result.score * 100).toFixed(0)}%)`); + const relPct = + topScore > 0 ? Math.round((result.score / topScore) * 100) : 0; + const score = chalk.dim(`(${relPct}%)`); console.log( chalk.cyan(node.kind.padEnd(12)) + diff --git a/src/db/queries.ts b/src/db/queries.ts index 51f1a1ad..48c7e542 100644 --- a/src/db/queries.ts +++ b/src/db/queries.ts @@ -18,7 +18,7 @@ import { SearchResult, } from '../types'; import { safeJsonParse } from '../utils'; -import { kindBonus, nameMatchBonus, scorePathRelevance } from '../search/query-utils'; +import { kindBonus, nameMatchBonus, scorePathRelevance, diversifyByFile } from '../search/query-utils'; /** * Database row types (snake_case from SQLite) @@ -478,7 +478,13 @@ export class QueryBuilder { * 3. Score results based on match quality */ searchNodes(query: string, options: SearchOptions = {}): SearchResult[] { - const { kinds, languages, limit = 100, offset = 0 } = options; + const { kinds, languages, limit = 100, offset = 0, perFileCap = 3 } = options; + + // Note on over-fetching: searchNodesFTS already over-fetches by 5x + // internally (Math.max(limit*5, 100)) so its own rescoring pass has + // headroom. That same headroom feeds the per-file diversification + // below — no additional outer multiplier needed. Keeping this comment + // here so future readers don't reintroduce a multiplier-on-multiplier. // First try FTS5 with prefix matching let results = this.searchNodesFTS(query, { kinds, languages, limit, offset }); @@ -530,10 +536,23 @@ export class QueryBuilder { + nameMatchBonus(r.node.name, query), })); results.sort((a, b) => b.score - a.score); - // Trim to requested limit after rescoring - if (results.length > limit) { - results = results.slice(0, limit); - } + } + + // Diversification: cap per-file results so the top-K isn't dominated + // by the methods of a single class. Top-scoring hit per file is always + // included; the cap only kicks in for the second-and-onward members + // of the same file. perFileCap=0 disables. + // + // Guard `results.length > limit`: when results <= limit there's + // nothing to drop, so the existing score order is already what the + // caller will see. (`diversifyByFile` is also safe to call here and + // would reorder within the same set, but the existing rescore order + // is already meaningful and we don't want to perturb it without + // benefit.) + if (perFileCap > 0 && results.length > limit) { + results = diversifyByFile(results, limit, perFileCap); + } else if (results.length > limit) { + results = results.slice(0, limit); } return results; diff --git a/src/search/query-utils.ts b/src/search/query-utils.ts index 9a61acae..0b20cdd5 100644 --- a/src/search/query-utils.ts +++ b/src/search/query-utils.ts @@ -333,3 +333,49 @@ export function kindBonus(kind: Node['kind']): number { }; return bonuses[kind] ?? 0; } + +/** + * Cap consecutive results from the same file. Preserves overall ranking: + * the highest-scoring hit from each file is taken first (up to `perFileCap` + * per file), in score order. If `limit` isn't filled after the capped + * pass, the remaining slots are filled with the next-best hits regardless + * of file (preserves correctness — never hides a hit that would have + * otherwise been returned). + * + * Why: queries like `"ExtractionOrchestrator"` return the matching class + * plus 9 of its members from the same file. The first hit is informative; + * the next 9 are implementation detail that pushes peer files (subclasses, + * callers, sibling modules) past the limit. Capping per file surfaces + * representative breadth without losing the top hit. + */ +export function diversifyByFile( + results: T[], + limit: number, + perFileCap: number +): T[] { + if (perFileCap <= 0) return results.slice(0, limit); + const perFile = new Map(); + const picked: T[] = []; + const skipped: T[] = []; + for (const r of results) { + const f = r.node.filePath; + const c = perFile.get(f) ?? 0; + if (c < perFileCap) { + picked.push(r); + perFile.set(f, c + 1); + if (picked.length >= limit) return picked; + } else { + skipped.push(r); + } + } + // Backfill from skipped (in original score order) so we don't return + // fewer results than the caller asked for. This also handles the + // edge case where `results.length <= limit`: nothing was actually + // dropped, but the per-file cap reordered them so peer files appear + // earlier — `picked` first, then any leftover same-file hits. + for (const r of skipped) { + if (picked.length >= limit) break; + picked.push(r); + } + return picked; +} diff --git a/src/types.ts b/src/types.ts index 6834483d..954236d2 100644 --- a/src/types.ts +++ b/src/types.ts @@ -340,6 +340,17 @@ export interface SearchOptions { /** Whether search is case-sensitive */ caseSensitive?: boolean; + + /** + * Cap the number of results from any single file before returning. + * Default 3. Set to 0 to disable diversification (return raw ranked + * results, even if 10 of them come from the same class). The class / + * function / interface members of the same file are usually less + * informative as multiple distinct results than as "this file plus + * representative members" — diversification surfaces context across + * the codebase rather than burying the user in one file's internals. + */ + perFileCap?: number; } /**