From 5a45ef26e9fe20f0ed0af4fd69bf0e1492f6a57c Mon Sep 17 00:00:00 2001 From: andreinknv Date: Sun, 26 Apr 2026 16:36:08 -0400 Subject: [PATCH 1/2] feat(search): per-file diversification so top-K isn't one class's methods MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a query matches many symbols in a single file, current ranking returns the matching class plus 9 of its members from the same file. The first hit is informative; the next 9 are implementation detail that pushes peer files (subclasses, callers, sibling modules) past the limit. This PR caps results per file so search surfaces representative breadth across the codebase rather than burying the user in one class's internals. ## Empirical lift on codegraph (limit=10, default cap=3) | Query | Before (max from one file) | After | |---|---:|---:| | ExtractionOrchestrator | 10/10 | 9/10 (only one file matches; backfill kicks in) | | database | 8/10 | 3/10 | | config | 5/10 | 3/10 | | resolve | 4/10 | 3/10 | | extract / parse | 3 (no regression) | 3 | Top-1 result is preserved in every case — diversification only reorders second-and-onward. ## Components - `SearchOptions.perFileCap?: number` — default 3; 0 disables. - `diversifyByFile(results, limit, perFileCap)` in src/search/query-utils.ts: pure function. First pass picks at most perFileCap per file in score order. If limit isn't yet filled, backfills from skipped (in original score order) so we never return fewer results than the caller requested. - searchNodes wires it after the existing rescoring pass, when there are more candidates than the caller's limit. Relies on the existing 5x internal over-fetch in searchNodesFTS for headroom — no new multiplier added (multiplier-on-multiplier composition was the reviewer's blocking concern in an earlier draft). ## Files changed | File | Change | |---|---| | src/types.ts | Add perFileCap to SearchOptions | | src/search/query-utils.ts | Add diversifyByFile pure helper | | src/db/queries.ts | Wire diversifyByFile into searchNodes; comment on the over-fetch composition | | __tests__/diversify.test.ts (NEW) | 13 regression tests | ## Test plan - [x] npm test: 393/393 pass on macOS - [x] npx tsc --noEmit clean - [x] Bench script confirms the lift in the table above - [x] Independent reviewer pass before pushing — addressed: - Multiplier-on-multiplier (4x outer * 5x inner = 20x for large limits): outer multiplier removed; inner over-fetch is sufficient. - Within-limit reorder: documented as intentional pure-function behavior; integration path correctly skips when results <= limit. - MCP exposure of perFileCap: deferred — default 3 is the desired new behavior; MCP can pick it up later if users want to tune. Co-Authored-By: Claude Opus 4.7 (1M context) --- __tests__/diversify.test.ts | 200 ++++++++++++++++++++++++++++++++++++ src/db/queries.ts | 31 ++++-- src/search/query-utils.ts | 46 +++++++++ src/types.ts | 11 ++ 4 files changed, 282 insertions(+), 6 deletions(-) create mode 100644 __tests__/diversify.test.ts diff --git a/__tests__/diversify.test.ts b/__tests__/diversify.test.ts new file mode 100644 index 00000000..181ee9c5 --- /dev/null +++ b/__tests__/diversify.test.ts @@ -0,0 +1,200 @@ +/** + * Result Diversification Tests + * + * Verifies the per-file cap on search results: queries that match many + * symbols in one file (the methods of a class) no longer return 10 hits + * from one file, but instead surface representative breadth across files. + */ + +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; +import { DatabaseConnection } from '../src/db'; +import { QueryBuilder } from '../src/db/queries'; +import { diversifyByFile } from '../src/search/query-utils'; +import { Node } from '../src/types'; + +describe('diversifyByFile (unit)', () => { + function r(score: number, name: string, filePath: string) { + return { node: { id: name, name, filePath } as Node, score }; + } + + it('caps consecutive results from the same file at perFileCap', () => { + const results = [ + r(10, 'a1', 'a.ts'), + r(9, 'a2', 'a.ts'), + r(8, 'a3', 'a.ts'), + r(7, 'a4', 'a.ts'), + r(6, 'b1', 'b.ts'), + ]; + const out = diversifyByFile(results, 5, 2); + expect(out.map((x) => x.node.name)).toEqual(['a1', 'a2', 'b1', 'a3', 'a4']); + // First two from a.ts (cap), then b.ts (different file), then backfill. + }); + + it('preserves overall ranking when no file dominates', () => { + const results = [ + r(10, 'a1', 'a.ts'), + r(9, 'b1', 'b.ts'), + r(8, 'c1', 'c.ts'), + r(7, 'a2', 'a.ts'), + ]; + const out = diversifyByFile(results, 4, 2); + expect(out.map((x) => x.node.name)).toEqual(['a1', 'b1', 'c1', 'a2']); + }); + + it('does not lose results — backfills from skipped when limit not yet filled', () => { + // 10 candidates all from one file, limit 5, cap 2: pick 2, backfill 3. + const results = Array.from({ length: 10 }, (_, i) => + r(10 - i, `n${i}`, 'a.ts') + ); + const out = diversifyByFile(results, 5, 2); + expect(out).toHaveLength(5); + expect(out.every((x) => x.node.filePath === 'a.ts')).toBe(true); + }); + + it('returns the input slice unchanged when perFileCap=0', () => { + const results = [ + r(10, 'a1', 'a.ts'), + r(9, 'a2', 'a.ts'), + r(8, 'a3', 'a.ts'), + ]; + expect(diversifyByFile(results, 3, 0)).toEqual(results); + }); + + it('returns input unchanged when results.length <= limit and no reordering needed', () => { + const results = [r(10, 'a1', 'a.ts'), r(9, 'a2', 'a.ts')]; + expect(diversifyByFile(results, 5, 2)).toEqual(results); + }); + + it('still reorders within limit when results.length === limit but cap rearranges', () => { + // Same total count as limit, but the cap reorders to surface peer files + // earlier in the list. + const results = [ + r(10, 'a1', 'a.ts'), + r(9, 'a2', 'a.ts'), + r(8, 'a3', 'a.ts'), + r(7, 'a4', 'a.ts'), + r(6, 'b1', 'b.ts'), + ]; + const out = diversifyByFile(results, 5, 2); + // First 2 from a.ts (cap), then b.ts, then backfill a.ts. + expect(out.map((x) => x.node.name)).toEqual(['a1', 'a2', 'b1', 'a3', 'a4']); + }); + + it('respects the limit even when picked + skipped exceed it', () => { + const results = [ + r(10, 'a1', 'a.ts'), + r(9, 'a2', 'a.ts'), + r(8, 'a3', 'a.ts'), + r(7, 'b1', 'b.ts'), + ]; + const out = diversifyByFile(results, 2, 2); + expect(out).toHaveLength(2); + expect(out.map((x) => x.node.name)).toEqual(['a1', 'a2']); + }); + + it('always preserves the top-scoring result at position 0', () => { + const results = [ + r(100, 'top', 'big.ts'), + r(50, 'big2', 'big.ts'), + r(40, 'big3', 'big.ts'), + r(30, 'big4', 'big.ts'), + r(20, 'other', 'other.ts'), + ]; + const out = diversifyByFile(results, 3, 2); + expect(out[0].node.name).toBe('top'); + }); +}); + +describe('searchNodes per-file diversification (integration)', () => { + let dir: string; + let db: DatabaseConnection; + let q: QueryBuilder; + + function makeNode(id: string, name: string, kind: Node['kind'], filePath: string): Node { + return { + id, + kind, + name, + qualifiedName: `${filePath}::${name}`, + filePath, + language: 'typescript', + startLine: 1, + endLine: 1, + startColumn: 0, + endColumn: 0, + updatedAt: Date.now(), + }; + } + + beforeEach(() => { + dir = fs.mkdtempSync(path.join(os.tmpdir(), 'diversify-search-')); + db = DatabaseConnection.initialize(path.join(dir, 'test.db')); + q = new QueryBuilder(db.getDb()); + // Simulate the "10 methods of one class" scenario: a class plus many + // methods all sharing a common token, all in one file. Plus a peer + // file with a sibling implementation. + const nodes: Node[] = [ + makeNode('cls', 'DatabaseConnection', 'class', 'src/db.ts'), + makeNode('m1', 'connect', 'method', 'src/db.ts'), + makeNode('m2', 'disconnect', 'method', 'src/db.ts'), + makeNode('m3', 'reconnect', 'method', 'src/db.ts'), + makeNode('m4', 'isConnected', 'method', 'src/db.ts'), + makeNode('m5', 'connectionString', 'property', 'src/db.ts'), + makeNode('peer', 'PoolConnection', 'class', 'src/pool.ts'), + makeNode('peer2', 'connectPool', 'function', 'src/pool.ts'), + ]; + q.insertNodes(nodes); + }); + + afterEach(() => { + db.close(); + if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true }); + }); + + it('caps results per file at the default (3) so peer files surface', () => { + const results = q.searchNodes('connect', { limit: 5 }); + const fromDbTs = results.filter((r) => r.node.filePath === 'src/db.ts').length; + const fromPool = results.filter((r) => r.node.filePath === 'src/pool.ts').length; + expect(fromDbTs).toBeLessThanOrEqual(3); // cap + expect(fromPool).toBeGreaterThanOrEqual(1); // peer file represented + }); + + it('honors perFileCap: 0 (disabled) — does not enforce a per-file limit', () => { + // Insert a heavy imbalance so dominance is unambiguous: 10 matching + // methods in db.ts, only the existing pool.ts entries elsewhere. + const heavyDb: Node[] = Array.from({ length: 10 }, (_, i) => + makeNode(`heavy${i}`, `connectVariant${i}`, 'method', 'src/db.ts') + ); + q.insertNodes(heavyDb); + const results = q.searchNodes('connect', { limit: 8, perFileCap: 0 }); + const fromDbTs = results.filter((r) => r.node.filePath === 'src/db.ts').length; + expect(fromDbTs).toBeGreaterThan(3); + }); + + it('honors a higher perFileCap', () => { + const results = q.searchNodes('connect', { limit: 6, perFileCap: 5 }); + const fromDbTs = results.filter((r) => r.node.filePath === 'src/db.ts').length; + expect(fromDbTs).toBeLessThanOrEqual(5); + }); + + it('preserves the top-scoring hit even with diversification', () => { + // Class node with the most direct name match is the most relevant — + // diversification must never displace it from #1. + const results = q.searchNodes('DatabaseConnection', { limit: 3 }); + expect(results[0].node.name).toBe('DatabaseConnection'); + }); + + it('does not lose results — fills limit by backfilling skipped same-file hits', () => { + // If only one file has matches, all results legitimately come from it. + // The cap should not cause us to return fewer than `limit` results. + const onlyOneFileNodes: Node[] = Array.from({ length: 10 }, (_, i) => + makeNode(`only${i}`, `solo${i}`, 'function', 'src/only.ts') + ); + q.insertNodes(onlyOneFileNodes); + const results = q.searchNodes('solo', { limit: 5 }); + expect(results.length).toBe(5); + }); +}); diff --git a/src/db/queries.ts b/src/db/queries.ts index 51f1a1ad..48c7e542 100644 --- a/src/db/queries.ts +++ b/src/db/queries.ts @@ -18,7 +18,7 @@ import { SearchResult, } from '../types'; import { safeJsonParse } from '../utils'; -import { kindBonus, nameMatchBonus, scorePathRelevance } from '../search/query-utils'; +import { kindBonus, nameMatchBonus, scorePathRelevance, diversifyByFile } from '../search/query-utils'; /** * Database row types (snake_case from SQLite) @@ -478,7 +478,13 @@ export class QueryBuilder { * 3. Score results based on match quality */ searchNodes(query: string, options: SearchOptions = {}): SearchResult[] { - const { kinds, languages, limit = 100, offset = 0 } = options; + const { kinds, languages, limit = 100, offset = 0, perFileCap = 3 } = options; + + // Note on over-fetching: searchNodesFTS already over-fetches by 5x + // internally (Math.max(limit*5, 100)) so its own rescoring pass has + // headroom. That same headroom feeds the per-file diversification + // below — no additional outer multiplier needed. Keeping this comment + // here so future readers don't reintroduce a multiplier-on-multiplier. // First try FTS5 with prefix matching let results = this.searchNodesFTS(query, { kinds, languages, limit, offset }); @@ -530,10 +536,23 @@ export class QueryBuilder { + nameMatchBonus(r.node.name, query), })); results.sort((a, b) => b.score - a.score); - // Trim to requested limit after rescoring - if (results.length > limit) { - results = results.slice(0, limit); - } + } + + // Diversification: cap per-file results so the top-K isn't dominated + // by the methods of a single class. Top-scoring hit per file is always + // included; the cap only kicks in for the second-and-onward members + // of the same file. perFileCap=0 disables. + // + // Guard `results.length > limit`: when results <= limit there's + // nothing to drop, so the existing score order is already what the + // caller will see. (`diversifyByFile` is also safe to call here and + // would reorder within the same set, but the existing rescore order + // is already meaningful and we don't want to perturb it without + // benefit.) + if (perFileCap > 0 && results.length > limit) { + results = diversifyByFile(results, limit, perFileCap); + } else if (results.length > limit) { + results = results.slice(0, limit); } return results; diff --git a/src/search/query-utils.ts b/src/search/query-utils.ts index 9a61acae..0b20cdd5 100644 --- a/src/search/query-utils.ts +++ b/src/search/query-utils.ts @@ -333,3 +333,49 @@ export function kindBonus(kind: Node['kind']): number { }; return bonuses[kind] ?? 0; } + +/** + * Cap consecutive results from the same file. Preserves overall ranking: + * the highest-scoring hit from each file is taken first (up to `perFileCap` + * per file), in score order. If `limit` isn't filled after the capped + * pass, the remaining slots are filled with the next-best hits regardless + * of file (preserves correctness — never hides a hit that would have + * otherwise been returned). + * + * Why: queries like `"ExtractionOrchestrator"` return the matching class + * plus 9 of its members from the same file. The first hit is informative; + * the next 9 are implementation detail that pushes peer files (subclasses, + * callers, sibling modules) past the limit. Capping per file surfaces + * representative breadth without losing the top hit. + */ +export function diversifyByFile( + results: T[], + limit: number, + perFileCap: number +): T[] { + if (perFileCap <= 0) return results.slice(0, limit); + const perFile = new Map(); + const picked: T[] = []; + const skipped: T[] = []; + for (const r of results) { + const f = r.node.filePath; + const c = perFile.get(f) ?? 0; + if (c < perFileCap) { + picked.push(r); + perFile.set(f, c + 1); + if (picked.length >= limit) return picked; + } else { + skipped.push(r); + } + } + // Backfill from skipped (in original score order) so we don't return + // fewer results than the caller asked for. This also handles the + // edge case where `results.length <= limit`: nothing was actually + // dropped, but the per-file cap reordered them so peer files appear + // earlier — `picked` first, then any leftover same-file hits. + for (const r of skipped) { + if (picked.length >= limit) break; + picked.push(r); + } + return picked; +} diff --git a/src/types.ts b/src/types.ts index 6834483d..954236d2 100644 --- a/src/types.ts +++ b/src/types.ts @@ -340,6 +340,17 @@ export interface SearchOptions { /** Whether search is case-sensitive */ caseSensitive?: boolean; + + /** + * Cap the number of results from any single file before returning. + * Default 3. Set to 0 to disable diversification (return raw ranked + * results, even if 10 of them come from the same class). The class / + * function / interface members of the same file are usually less + * informative as multiple distinct results than as "this file plus + * representative members" — diversification surfaces context across + * the codebase rather than burying the user in one file's internals. + */ + perFileCap?: number; } /** From 323c2cf021485439e5a2280fd3136b65f4426401 Mon Sep 17 00:00:00 2001 From: andreinknv Date: Tue, 28 Apr 2026 17:31:56 -0400 Subject: [PATCH 2/2] fix(query): render search scores as percent-of-top, not raw bm25 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit searchNodes returns bm25 + kindBonus + scorePathRelevance + nameMatchBonus, which has no upper bound. The CLI then displayed that as `(score * 100).toFixed(0)%` — producing "10449%", "6553%", "2251%" for ordinary searches like `query serve` against ollama. Beyond being misleading, the value isn't comparable across queries. Render each hit's score as a fraction of the top hit's score so the top result is always "100%" and everything below scales relative to it. Topscore=0 (degenerate) shows as "0%" instead of NaN. --- src/bin/codegraph.ts | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/bin/codegraph.ts b/src/bin/codegraph.ts index d118a1fd..d452e5b9 100644 --- a/src/bin/codegraph.ts +++ b/src/bin/codegraph.ts @@ -744,10 +744,17 @@ program } else { console.log(chalk.bold(`\nSearch Results for "${search}":\n`)); + // searchNodes returns BM25 + bonuses, which is unbounded. + // Render each result's score as a fraction of the top hit so + // the user gets a relative confidence (top = 100%) instead of + // nonsensical "10449%" raw weights. + const topScore = results[0]!.score; for (const result of results) { const node = result.node; const location = `${node.filePath}:${node.startLine}`; - const score = chalk.dim(`(${(result.score * 100).toFixed(0)}%)`); + const relPct = + topScore > 0 ? Math.round((result.score / topScore) * 100) : 0; + const score = chalk.dim(`(${relPct}%)`); console.log( chalk.cyan(node.kind.padEnd(12)) +