From 64b1ee2fe48be84918cacebd41fa11b789ee0a41 Mon Sep 17 00:00:00 2001 From: mohammed naji Date: Tue, 12 May 2026 00:35:04 +0400 Subject: [PATCH 1/3] Fix retrieval correctness for production prompts Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- CHANGELOG.md | 7 + README.md | 10 + src/contracts/context-pack-diagnostics.ts | 15 + src/contracts/context-pack.ts | 2 + src/contracts/retrieval-gate.ts | 12 + src/pipeline/detect.ts | 99 +------ src/pipeline/spi/build.ts | 26 +- src/runtime/context-pack-diagnostics.ts | 153 +++++++++- src/runtime/context-pack.ts | 42 ++- src/runtime/retrieval-gate.ts | 115 ++++++- src/runtime/retrieve.ts | 280 ++++++++++++++++-- src/runtime/retrieve/slicing.ts | 96 +++++- src/shared/source-discovery.ts | 244 +++++++++++++++ tests/unit/benchmark-quality.test.ts | 6 +- tests/unit/context-pack-diagnostics.test.ts | 105 +++++++ tests/unit/detect.test.ts | 64 ++-- tests/unit/generate.test.ts | 12 +- tests/unit/retrieval-gate.test.ts | 41 +++ .../retrieve-production-correctness.test.ts | 144 +++++++++ tests/unit/spi-build.test.ts | 8 +- 20 files changed, 1298 insertions(+), 183 deletions(-) create mode 100644 src/shared/source-discovery.ts create mode 100644 tests/unit/retrieve-production-correctness.test.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index 338ca08..566f191 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,13 @@ All notable changes to the TypeScript package will be documented in this file. ## [Unreleased] +### Changed + +- **Default graph discovery is stricter on duplicate/generated paths**: legacy detect and `--spi` now share hard ignores for nested worktrees, VCS metadata, `graphify-out`, dependency stores, and common build/cache outputs, while keeping tests, benchmarks, fixtures, and mocks indexable unless the user excludes them. 
+- **Retrieval exclusions are intent-aware**: prompts like "exclude tests" or "do not include benchmarks" no longer classify as `test` intent, and the parsed excluded domains/terms now suppress matching retrieval candidates instead of only affecting wording. +- **Slice-v1 anchors and traversal are more truthful for production prompts**: literal file-path mentions are distinguished from lexical source-path overlap, explicit `Class.method` prompts anchor the method instead of the class, and pipeline-shaped NestJS prompts now walk backward/forward through controller, service, orchestrator, and persistence paths without exploding into sibling controller methods. +- **Context-pack diagnostics catch semantically wrong packs**: diagnostics now flag excluded-domain selections, polluted source paths, controller-only pipeline packs, missing method anchors/runtime pipeline evidence, and test-dominated production packs. + ## [0.22.0] - 2026-05-11 ### Added diff --git a/README.md b/README.md index 750920e..143aabb 100644 --- a/README.md +++ b/README.md @@ -153,6 +153,16 @@ graphify-ts --help # full surface --- +## Default discovery rules + +`graphify-ts generate` now hard-ignores nested VCS/worktree copies and generated/build output by default: `.worktrees/`, `worktrees/`, `.git/`, `graphify-out/`, `node_modules/`, `dist/`, `build/`, `coverage/`, cache folders, source maps, lock/build artifacts, and temp/log files. + +Tests, benchmarks, fixtures, mocks, and config files are **not** hard-ignored anymore. They are now indexed so retrieval can use them when you ask for them, but production/runtime prompts now soft-penalize them and honor prompt exclusions like "exclude tests, benchmarks, fixtures". + +`.graphifyignore` still adds extra ignore rules, and negated entries such as `!vendor/**` or `!lib/**` can re-include a path that is hard-ignored by default when you intentionally want it indexed. + +--- + ## Trust + limitations Everything stays local by default. 
No telemetry, no cloud upload, no API key required. diff --git a/src/contracts/context-pack-diagnostics.ts b/src/contracts/context-pack-diagnostics.ts index 600d9be..ef2a066 100644 --- a/src/contracts/context-pack-diagnostics.ts +++ b/src/contracts/context-pack-diagnostics.ts @@ -10,6 +10,8 @@ // rather than from a model judgement. That keeps the surface fully // deterministic and CI-asseratable. +import type { SourceDomain } from '../shared/source-discovery.js' + /** The categories of structural problems a context-pack can exhibit. Each * enum value maps to one rule in computeContextPackDiagnostics. */ export type ContextPackDiagnosticKind = @@ -22,6 +24,13 @@ export type ContextPackDiagnosticKind = | 'low_avg_match_score' | 'orphan_nodes' | 'no_graph_signals' + | 'excluded_domain_selected' + | 'test_dominated_pack' + | 'controller_only_pipeline_pack' + | 'missing_method_anchor' + | 'missing_runtime_pipeline' + | 'polluted_source_path_selected' + | 'missing_structural_evidence' export type ContextPackDiagnosticSeverity = 'info' | 'warn' | 'error' @@ -48,6 +57,12 @@ export interface ContextPackQualitySignals { /** token_count from the pack as a fraction of task_contract.budget. * Capped at 1.0 for over-budget packs. */ budget_utilization: number + /** Source-domain distribution across selected nodes. */ + domain_distribution: Partial> + /** Domains the prompt explicitly excluded. */ + excluded_domains: string[] + /** Number of selected nodes from polluted/generated paths. 
*/ + polluted_source_path_count: number } export interface ContextPackDiagnostics { diff --git a/src/contracts/context-pack.ts b/src/contracts/context-pack.ts index f8d3a2e..3ca5af7 100644 --- a/src/contracts/context-pack.ts +++ b/src/contracts/context-pack.ts @@ -1,5 +1,6 @@ import type { RetrievalGateDecision } from './retrieval-gate.js' import type { TaskIntentKind } from './task-intent.js' +import type { SourceDomain } from '../shared/source-discovery.js' export type ContextPackTaskKind = 'explain' | 'review' | 'impact' @@ -93,6 +94,7 @@ export interface ContextPackNode { framework?: string | undefined framework_role?: string | undefined framework_boost?: number | undefined + source_domain?: SourceDomain | undefined evidence_class?: ContextPackEvidenceClass | undefined representation_type?: ContextRepresentationType | undefined representation_reason?: string | undefined diff --git a/src/contracts/retrieval-gate.ts b/src/contracts/retrieval-gate.ts index a1781c5..3652cf5 100644 --- a/src/contracts/retrieval-gate.ts +++ b/src/contracts/retrieval-gate.ts @@ -19,11 +19,23 @@ export type RetrievalIntent = | 'chitchat' | 'unknown' +export type RetrievalExcludedDomain = + | 'test' + | 'benchmark' + | 'fixture' + | 'generated' + | 'docs' + | 'config' + | 'build_artifact' + export interface RetrievalGateSignals { has_pr_diff: boolean has_stack_trace: boolean mentioned_paths: ReadonlyArray mentioned_symbols: ReadonlyArray + excluded_domains?: ReadonlyArray + excluded_terms?: ReadonlyArray + excluded_path_hints?: ReadonlyArray } export interface RetrievalGateDecision { diff --git a/src/pipeline/detect.ts b/src/pipeline/detect.ts index 0ea2657..6d6c513 100644 --- a/src/pipeline/detect.ts +++ b/src/pipeline/detect.ts @@ -1,7 +1,12 @@ -import { Dirent, existsSync, lstatSync, mkdirSync, readFileSync, readdirSync, realpathSync, statSync, writeFileSync } from 'node:fs' +import { Dirent, lstatSync, mkdirSync, readFileSync, readdirSync, realpathSync, statSync, writeFileSync } 
from 'node:fs' import { basename, dirname, extname, relative, resolve, sep } from 'node:path' import { sidecarAwareFileFingerprint } from '../shared/binary-ingest-sidecar.js' +import { + isDiscoveryPathIgnored, + isIgnoredByPatterns, + loadGraphifyignorePatterns, +} from '../shared/source-discovery.js' export const FileType = { CODE: 'code', @@ -109,14 +114,8 @@ const SKIP_DIRS = new Set([ '.venv', 'env', '.env', - 'graphify-out', - 'node_modules', '__pycache__', - '.git', - 'dist', - 'build', 'target', - 'out', 'site-packages', 'lib64', '.pytest_cache', @@ -124,29 +123,11 @@ const SKIP_DIRS = new Set([ '.ruff_cache', '.tox', '.eggs', - 'test', - 'tests', - '__tests__', - 'spec', - 'specs', - 'e2e', - 'cypress', - 'playwright', - 'coverage', 'storybook-static', - 'fixtures', - '__fixtures__', - '__mocks__', - 'mocks', ]) const NOISE_FILE_PATTERNS: RegExp[] = [ - /\.(test|spec)\.(ts|tsx|js|jsx|mjs|cjs)$/i, /\.stories\.(ts|tsx|js|jsx)$/i, - /\.mock\.(ts|tsx|js|jsx)$/i, - /^(vitest|jest|webpack|rollup|vite|babel)\.config\./i, - /^setupTests\.(ts|tsx|js|jsx)$/i, - /^jest\.setup\.(ts|tsx|js|jsx)$/i, ] function isNoiseFile(name: string): boolean { @@ -157,21 +138,6 @@ function toPosixPath(path: string): string { return path.split(sep).join('/') } -function globToRegExp(pattern: string): RegExp { - const wildcardCount = [...pattern].filter((character) => character === '*').length - if (pattern.length > 512 || wildcardCount > 32) { - return /^$/ - } - - const escaped = pattern.replace(/[.+^${}()|[\]\\]/g, '\\$&') - const wildcarded = escaped.replace(/\*/g, '.*').replace(/\?/g, '.') - return new RegExp(`^${wildcarded}$`) -} - -function matchesPattern(value: string, pattern: string): boolean { - return globToRegExp(pattern).test(value) -} - function isNoiseDir(part: string): boolean { return SKIP_DIRS.has(part) || part.endsWith('_venv') || part.endsWith('_env') || part.endsWith('.egg-info') } @@ -234,53 +200,11 @@ export function countWords(path: string): number { } export 
function _loadGraphifyignore(root: string): string[] { - try { - const content = readFileSync(resolve(root, '.graphifyignore'), 'utf8') - return content - .split(/\r?\n/) - .map((line) => line.trim()) - .filter((line) => line.length > 0 && !line.startsWith('#')) - } catch { - return [] - } + return loadGraphifyignorePatterns(root) } export function _isIgnored(path: string, root: string, patterns: string[]): boolean { - if (patterns.length === 0) { - return false - } - - const relativePath = toPosixPath(relative(resolve(root), resolve(path))) - if (relativePath.startsWith('..')) { - return false - } - - const pathParts = relativePath.split('/') - const fileName = basename(path) - - for (const rawPattern of patterns) { - const pattern = rawPattern.replace(/^\/+|\/+$/g, '') - if (!pattern) { - continue - } - - if (matchesPattern(relativePath, pattern) || matchesPattern(fileName, pattern)) { - return true - } - - for (let index = 0; index < pathParts.length; index += 1) { - const part = pathParts[index] - if (!part) { - continue - } - const prefix = pathParts.slice(0, index + 1).join('/') - if (matchesPattern(part, pattern) || matchesPattern(prefix, pattern)) { - return true - } - } - } - - return false + return isIgnoredByPatterns(path, root, patterns) } function isWithinRoot(rootRealPath: string, candidateRealPath: string): boolean { @@ -312,7 +236,7 @@ function visitDirectory( continue } - if (_isIgnored(entryPath, root, ignorePatterns)) { + if (isDiscoveryPathIgnored(entryPath, root, ignorePatterns)) { continue } @@ -390,11 +314,6 @@ function collectFiles(root: string, followSymlinks: boolean, ignorePatterns: str visitDirectory(resolvedRoot, resolvedRoot, followSymlinks, ignorePatterns, [rootRealPath], rootRealPath, files) - const memoryDir = resolve(resolvedRoot, 'graphify-out', 'memory') - if (existsSync(memoryDir)) { - visitDirectory(memoryDir, resolvedRoot, followSymlinks, ignorePatterns, [rootRealPath], rootRealPath, files) - } - return [...new 
Set(files)].sort() } diff --git a/src/pipeline/spi/build.ts b/src/pipeline/spi/build.ts index 5039bbb..d4c0972 100644 --- a/src/pipeline/spi/build.ts +++ b/src/pipeline/spi/build.ts @@ -50,6 +50,10 @@ import { import { dirname, extname, join, relative, resolve } from 'node:path' import ts from 'typescript' +import { + isDiscoveryPathIgnored, + loadGraphifyignorePatterns, +} from '../../shared/source-discovery.js' import type { SemanticProgramIndex, SpiDiagnostic, @@ -85,19 +89,6 @@ export type BuildSpiOptions = { // BuildSpiOptions / buildSpi directly. export type BuildSpiFileLayerOptions = BuildSpiOptions -const SKIP_DIRS = new Set([ - 'node_modules', - 'dist', - 'build', - '.next', - 'coverage', - '.git', - 'graphify-out', - '.test-artifacts', - '.turbo', - '.vercel', -]) - const EXT_TO_LANG: Record = { '.ts': 'typescript', '.tsx': 'tsx', @@ -127,7 +118,8 @@ export function buildSpi(opts: BuildSpiOptions): SemanticProgramIndex { const diagnostics: SpiDiagnostic[] = [] const absPaths: string[] = [] - collectFiles(root, absPaths) + const ignorePatterns = loadGraphifyignorePatterns(root) + collectFiles(root, root, ignorePatterns, absPaths) const pathToFileId = new Map() for (const abs of absPaths) { @@ -199,7 +191,7 @@ export function buildSpi(opts: BuildSpiOptions): SemanticProgramIndex { // buildSpi directly. 
export const buildSpiFileLayer = buildSpi -function collectFiles(dir: string, out: string[]): void { +function collectFiles(root: string, dir: string, ignorePatterns: readonly string[], out: string[]): void { let entries: import('node:fs').Dirent[] try { entries = readdirSync(dir, { withFileTypes: true, encoding: 'utf8' }) @@ -207,11 +199,11 @@ function collectFiles(dir: string, out: string[]): void { return } for (const entry of entries) { - if (SKIP_DIRS.has(entry.name)) continue if (entry.isSymbolicLink()) continue const full = join(dir, entry.name) + if (isDiscoveryPathIgnored(full, root, ignorePatterns)) continue if (entry.isDirectory()) { - collectFiles(full, out) + collectFiles(root, full, ignorePatterns, out) } else if (entry.isFile()) { out.push(full) } diff --git a/src/runtime/context-pack-diagnostics.ts b/src/runtime/context-pack-diagnostics.ts index 99eabe8..2ec96a2 100644 --- a/src/runtime/context-pack-diagnostics.ts +++ b/src/runtime/context-pack-diagnostics.ts @@ -37,6 +37,7 @@ import type { ContextPackDiagnostics, ContextPackQualitySignals, } from '../contracts/context-pack-diagnostics.js' +import { classifySourceDomain, isPollutedSourcePath, type SourceDomain } from '../shared/source-discovery.js' const RULE_WEIGHTS: ReadonlyMap = new Map([ ['missing_required_evidence', 2], @@ -48,6 +49,13 @@ const RULE_WEIGHTS: ReadonlyMap = new Map([ ['low_avg_match_score', 1], ['orphan_nodes', 1], ['no_graph_signals', 1], + ['excluded_domain_selected', 1], + ['test_dominated_pack', 1], + ['controller_only_pipeline_pack', 1], + ['missing_method_anchor', 1], + ['missing_runtime_pipeline', 1], + ['polluted_source_path_selected', 2], + ['missing_structural_evidence', 1], ]) const SEVERITY_ORDER: Record = { @@ -176,6 +184,74 @@ export function computeContextPackDiagnostics( }) } + if (signals.polluted_source_path_count > 0) { + warnings.push({ + kind: 'polluted_source_path_selected', + severity: 'error', + message: 'Pack selected nodes from polluted paths such as 
nested worktrees or generated outputs.', + detail: { count: signals.polluted_source_path_count }, + }) + } + + if (signals.excluded_domains.length > 0) { + const selectedExcludedDomains = Object.entries(signals.domain_distribution) + .filter(([domain, count]) => signals.excluded_domains.includes(domain) && (count ?? 0) > 0) + .map(([domain]) => domain) + if (selectedExcludedDomains.length > 0) { + warnings.push({ + kind: 'excluded_domain_selected', + severity: 'warn', + message: `Pack selected nodes from excluded domains: ${selectedExcludedDomains.join(', ')}.`, + detail: { domains: selectedExcludedDomains }, + }) + } + } + + if (productionPrompt(pack) && dominatedByDomains(signals.domain_distribution, ['test', 'benchmark', 'fixture'])) { + warnings.push({ + kind: 'test_dominated_pack', + severity: 'warn', + message: 'Pack is dominated by test, benchmark, or fixture nodes for a production-oriented prompt.', + detail: { domain_distribution: signals.domain_distribution }, + }) + } + + if (pipelinePrompt(pack) && (pack.coverage.selected_relationships === 0 || signals.relationship_count === 0)) { + warnings.push({ + kind: 'missing_structural_evidence', + severity: 'warn', + message: 'Pack is missing structural relationships for a pipeline-oriented prompt.', + detail: { + selected_relationships: pack.coverage.selected_relationships, + relationship_count: signals.relationship_count, + }, + }) + } + + if (pipelinePrompt(pack) && controllerOnlyPipelinePack(pack)) { + warnings.push({ + kind: 'controller_only_pipeline_pack', + severity: 'warn', + message: 'Pack stayed at controller-level context for a pipeline-oriented prompt.', + }) + } + + if (pipelinePrompt(pack) && missingRuntimePipeline(pack)) { + warnings.push({ + kind: 'missing_runtime_pipeline', + severity: 'warn', + message: 'Pack did not follow the runtime path into service/orchestrator/persistence nodes.', + }) + } + + if (requestedMethodAnchor(pack) && !selectedMethodAnchor(pack)) { + warnings.push({ + kind: 
'missing_method_anchor', + severity: 'warn', + message: 'Prompt requested a specific method anchor but the selected slice did not anchor that method.', + }) + } + warnings.sort((a, b) => { const sevDelta = SEVERITY_ORDER[a.severity] - SEVERITY_ORDER[b.severity] if (sevDelta !== 0) return sevDelta @@ -199,12 +275,19 @@ function computeSignals(pack: CompiledContextPack): ContextPackQualitySignals { let snippetNodes = 0 let scoreSum = 0 let scoreCount = 0 + let pollutedSourcePathCount = 0 + const domainDistribution: Partial> = {} for (const node of pack.nodes) { if (typeof node.snippet === 'string' && node.snippet.length > 0) snippetNodes += 1 if (typeof node.match_score === 'number' && Number.isFinite(node.match_score)) { scoreSum += node.match_score scoreCount += 1 } + const domain = node.source_domain ?? classifySourceDomain(node.source_file) + domainDistribution[domain] = (domainDistribution[domain] ?? 0) + 1 + if (isPollutedSourcePath(node.source_file)) { + pollutedSourcePathCount += 1 + } } const snippetCoverage = nodeCount === 0 ? 0 : snippetNodes / nodeCount @@ -219,13 +302,77 @@ function computeSignals(pack: CompiledContextPack): ContextPackQualitySignals { snippet_coverage: snippetCoverage, avg_match_score: avgMatchScore, budget_utilization: budgetUtilization, + domain_distribution: domainDistribution, + excluded_domains: [...(pack.retrieval_gate?.signals.excluded_domains ?? [])], + polluted_source_path_count: pollutedSourcePathCount, } } +function productionPrompt(pack: CompiledContextPack): boolean { + const prompt = pack.task_contract.prompt?.toLowerCase() ?? '' + return prompt.length > 0 + && /\b(production|runtime|pipeline|service|orchestrator|persistence|repository)\b/.test(prompt) + && pack.retrieval_gate?.intent !== 'test' +} + +function pipelinePrompt(pack: CompiledContextPack): boolean { + return /\b(runtime|pipeline|service|orchestrator|job|agent|scoring|persistence|repository)\b/i.test(pack.task_contract.prompt ?? 
'') +} + +function dominatedByDomains( + distribution: Partial>, + domains: readonly SourceDomain[], +): boolean { + const total = Object.values(distribution).reduce((sum, count) => sum + (count ?? 0), 0) + if (total === 0) { + return false + } + + const dominated = domains.reduce((sum, domain) => sum + (distribution[domain] ?? 0), 0) + return dominated / total >= 0.5 +} + +function controllerOnlyPipelinePack(pack: CompiledContextPack): boolean { + const controllerNodes = pack.nodes.filter((node) => (node.framework_role ?? '').toLowerCase().includes('controller')).length + return controllerNodes > 0 && controllerNodes === pack.nodes.length +} + +function missingRuntimePipeline(pack: CompiledContextPack): boolean { + const pipelineNodeCount = pack.nodes.filter((node) => { + const role = (node.framework_role ?? '').toLowerCase() + const label = node.label.toLowerCase() + return role.includes('service') + || role.includes('provider') + || role.includes('repository') + || role.includes('orchestrator') + || label.includes('service') + || label.includes('orchestrator') + || label.includes('repository') + || label.includes('agent') + }).length + const structuralRelations = pack.relationships.filter((relationship) => ['calls', 'injects', 'depends_on', 'reads_env', 'uses_config'].includes(relationship.relation)).length + return pipelinePrompt(pack) && (pipelineNodeCount === 0 || structuralRelations === 0) +} + +function requestedMethodAnchor(pack: CompiledContextPack): boolean { + const mentionedSymbols = pack.retrieval_gate?.signals.mentioned_symbols ?? [] + return mentionedSymbols.some((symbol) => /(?:\.|#|::)[A-Za-z_$][\w$]*$/.test(symbol) || /\(\)$/.test(symbol)) +} + +function selectedMethodAnchor(pack: CompiledContextPack): boolean { + const mentionedSymbols = pack.retrieval_gate?.signals.mentioned_symbols ?? [] + const anchors = pack.slice?.anchors ?? 
[] + return mentionedSymbols.some((symbol) => { + const normalizedSymbol = symbol.replace(/`/g, '').replace(/\(\)$/, '').toLowerCase() + return anchors.some((anchor) => anchor.label.replace(/`/g, '').replace(/\(\)$/, '').toLowerCase() === normalizedSymbol) + }) +} + function computeQualityScore(warnings: ContextPackDiagnosticWarning[]): number { - let totalWeight = 0 - for (const weight of RULE_WEIGHTS.values()) totalWeight += weight - if (totalWeight === 0) return 1 + // Keep the quality-score denominator stable as diagnostics expand so + // historical scores remain comparable. New warnings still deduct via the + // numerator, but don't dilute the old baseline. + const totalWeight = 10 let triggeredWeight = 0 for (const warning of warnings) { triggeredWeight += RULE_WEIGHTS.get(warning.kind) ?? 1 diff --git a/src/runtime/context-pack.ts b/src/runtime/context-pack.ts index 81ed13c..f330277 100644 --- a/src/runtime/context-pack.ts +++ b/src/runtime/context-pack.ts @@ -24,6 +24,7 @@ import type { } from '../contracts/context-pack.js' import type { RetrievalGateDecision } from '../contracts/retrieval-gate.js' import type { TaskIntentKind } from '../contracts/task-intent.js' +import { classifySourceDomain, type SourceDomain } from '../shared/source-discovery.js' import { estimateQueryTokens } from './serve.js' import { resolveTaskEvidenceRecipe } from './task-evidence-recipes.js' import { selectByValuePerToken, type ValuePerTokenCandidate } from './value-per-token.js' @@ -46,6 +47,7 @@ export interface ContextPackNodeCandidate { type CoverageEntry = CoverageNodeCandidate['entry'] -const TEST_PATH_PATTERN = /(?:^|\/)(?:__tests__|tests?|fixtures?)(?:\/|$)|\.(?:test|spec)\.[^/]+$/i const CONFIG_PATH_PATTERN = /(?:^|\/)(?:config|configs?|settings|env)(?:\/|$)|(?:^|\/)\.env(?:\.[^/]+)?$|(?:^|\/)(?:package|tsconfig|vite|vitest|jest|eslint|prettier|rollup|webpack)\.(?:json|[cm]?js|ts|mjs|cjs)$/i const CONTRACT_PATH_PATTERN = 
/(?:^|\/)(?:contracts?|schemas?|dto|types?|interfaces?|openapi|graphql)(?:\/|$)|(?:^|\/)[^/]*\.d\.ts$/i const CONTRACT_NODE_KINDS = new Set(['interface', 'type', 'type_alias', 'typealias', 'enum', 'schema', 'contract']) @@ -279,10 +281,13 @@ function orderedSemanticCategories( } function isTestEntry(entry: CoverageEntry): boolean { - return TEST_PATH_PATTERN.test(entry.source_file) + return classifySourceDomain(entry.source_file) === 'test' } function isConfigurationEntry(entry: CoverageEntry): boolean { + if (classifySourceDomain(entry.source_file) === 'config') { + return true + } if (CONFIG_PATH_PATTERN.test(entry.source_file)) { return true } @@ -660,6 +665,7 @@ function scoringViewForCandidate(candidate: ContextPackNodeCandidate): Candidate const snippet = candidate.snippet ?? builtEntry().snippet ?? null const framework = candidate.framework ?? builtEntry().framework const frameworkRole = candidate.framework_role ?? builtEntry().framework_role + const sourceDomain = candidate.source_domain ?? builtEntry().source_domain ?? classifySourceDomain(source_file) return { label: candidate.label, @@ -673,6 +679,7 @@ function scoringViewForCandidate(candidate: ContextPackNodeCandidate): Candidate framework_boost: candidate.framework_boost ?? builtEntry().framework_boost ?? 0, + source_domain: sourceDomain, exact_anchor_match: candidate.exact_anchor_match ?? false, direct_symbol_match: candidate.direct_symbol_match ?? false, source_path_match: candidate.source_path_match ?? false, @@ -706,6 +713,26 @@ function looksArtifact(sourceFile: string): boolean { return /(?:^|\/)(?:package-lock\.json|pnpm-lock\.yaml|yarn\.lock|dist\/|build\/|coverage\/|graphify-out\/)/i.test(sourceFile) } +function sourceDomainPenalty(view: CandidateScoringView, taskContract: ContextPackTaskContract): number { + switch (view.source_domain) { + case 'test': + return taskContract.semantic_required.includes('tests') || taskContract.semantic_optional.includes('tests') ? 
0 : 2 + case 'benchmark': + case 'fixture': + return 2 + case 'generated': + case 'build_artifact': + return 3 + case 'docs': + return 1 + case 'config': + return taskContract.semantic_required.includes('configuration') || taskContract.semantic_optional.includes('configuration') ? 0 : 0.5 + case 'production': + case 'unknown': + return 0 + } +} + function looksTypeOnly(view: CoverageEntry): boolean { const sourceFile = view.source_file.toLowerCase() const label = view.label.toLowerCase() @@ -833,6 +860,11 @@ function computeContextCandidateValue( pushUnique(reasons, 'source path match') } + if (view.source_domain !== 'production' && view.source_domain !== 'unknown') { + score += 0.25 + pushUnique(reasons, `${view.source_domain} domain`) + } + if (looksLikeBarrelFile(view.source_file) && !view.exact_anchor_match && !view.source_path_match) { score -= 2.5 pushUnique(penalties, 'barrel export penalty') @@ -864,6 +896,12 @@ function computeContextCandidateValue( pushUnique(penalties, 'hub node penalty') } + const domainPenalty = sourceDomainPenalty(view, taskContract) + if (domainPenalty > 0) { + score -= domainPenalty + pushUnique(penalties, `${view.source_domain.replace('_', ' ')} penalty`) + } + if (exactCodeRequested(taskContract) && (!view.source_file || typeof view.snippet !== 'string' || view.snippet.length === 0)) { score -= 1 pushUnique(penalties, 'missing snippet penalty') diff --git a/src/runtime/retrieval-gate.ts b/src/runtime/retrieval-gate.ts index 0dead9c..e3714f8 100644 --- a/src/runtime/retrieval-gate.ts +++ b/src/runtime/retrieval-gate.ts @@ -22,6 +22,7 @@ import type { RetrievalGateDecision, + RetrievalExcludedDomain, RetrievalGateSignals, RetrievalIntent, RetrievalLevel, @@ -29,6 +30,7 @@ import type { export type { RetrievalGateDecision, + RetrievalExcludedDomain, RetrievalGateSignals, RetrievalIntent, RetrievalLevel, @@ -70,21 +72,38 @@ const PATTERNS: ReadonlyArray<{ intent: RetrievalIntent; re: RegExp }> = [ const PATH_RE = 
/(?:^|\s|`)((?:[\w@./-]+\/)*[\w./@-]+\.[A-Za-z]{1,8})(?=\b|`|$)/g const SYMBOL_BACKTICK_RE = /`([A-Za-z_$][\w$]*(?:\.[A-Za-z_$][\w$]*)*\(?\)?)`/g +const SYMBOL_EXPLICIT_RE = /\b([A-Z][A-Za-z0-9_$]*(?:\.|#|::)[A-Za-z_$][\w$]*\(?\)?|[A-Za-z_$][\w$]{2,}\(\))\b/g const STACK_TRACE_RE = /(?:^|\n)\s*at\s+\S+\s*\([^)]*:\d+(?::\d+)?\)|Error[:\s]\s+\S/ +const EXCLUSION_SPAN_RE = /\b(?:exclude|excluding|ignore|ignoring|omit|omitting|skip|skipping|without|do not include|don't include|not|no)\b\s+(.+?)(?=(?:\s+\b(?:but|while|however|when)\b|[.;\n]|$))/gi + +const EXCLUDED_DOMAIN_HINTS: ReadonlyArray<{ domain: RetrievalExcludedDomain; pattern: RegExp; pathHints: string[] }> = [ + { domain: 'test', pattern: /\b(?:tests?|specs?|coverage|__tests__|e2e|cypress|playwright)\b/i, pathHints: ['test', 'tests', '__tests__', 'spec', 'specs', 'coverage'] }, + { domain: 'benchmark', pattern: /\b(?:bench(?:mark|marks)?|performance|perf)\b/i, pathHints: ['bench', 'benchmark', 'benchmarks', 'perf', 'performance'] }, + { domain: 'fixture', pattern: /\b(?:fixtures?|mocks?|__fixtures__|__mocks__)\b/i, pathHints: ['fixture', 'fixtures', 'mock', 'mocks', '__fixtures__', '__mocks__'] }, + { domain: 'generated', pattern: /\b(?:generated|codegen|__generated__)\b/i, pathHints: ['generated', '__generated__'] }, + { domain: 'docs', pattern: /\b(?:docs?|readme|changelog|markdown|mdx?)\b/i, pathHints: ['docs', 'readme', 'changelog'] }, + { domain: 'config', pattern: /\b(?:config|configs?|settings|env|docker|compose|k8s|helm|package\.json|tsconfig)\b/i, pathHints: ['config', 'configs', 'settings', 'env', 'docker', 'compose', 'k8s', 'helm', 'package.json', 'tsconfig'] }, + { domain: 'build_artifact', pattern: /\b(?:build artifacts?|dist|coverage|graphify-out|node_modules)\b/i, pathHints: ['build', 'dist', 'coverage', 'graphify-out', 'node_modules'] }, +] export function classifyRetrievalLevel(input: RetrievalGateInput): RetrievalGateDecision { const prompt = input.prompt ?? 
'' - const detectedPaths = input.mentionedPaths ?? detectPaths(prompt) - const detectedSymbols = input.mentionedSymbols ?? detectSymbols(prompt) + const exclusions = extractPromptExclusions(prompt) + const positivePrompt = exclusions.positivePrompt + const detectedPaths = input.mentionedPaths ?? detectPaths(positivePrompt) + const detectedSymbols = input.mentionedSymbols ?? detectSymbols(positivePrompt) const hasStackTrace = input.hasStackTrace ?? STACK_TRACE_RE.test(prompt) const hasPrDiff = input.hasPrDiff === true - const intent = input.intent ?? detectIntent(prompt) + const intent = input.intent ?? detectIntent(positivePrompt) const signals: RetrievalGateSignals = { has_pr_diff: hasPrDiff, has_stack_trace: hasStackTrace, mentioned_paths: detectedPaths, mentioned_symbols: detectedSymbols, + ...(exclusions.excludedDomains.length > 0 ? { excluded_domains: exclusions.excludedDomains } : {}), + ...(exclusions.excludedTerms.length > 0 ? { excluded_terms: exclusions.excludedTerms } : {}), + ...(exclusions.excludedPathHints.length > 0 ? 
{ excluded_path_hints: exclusions.excludedPathHints } : {}), } if (input.manualOverride !== undefined) { @@ -184,5 +203,95 @@ function detectSymbols(prompt: string): string[] { for (const match of prompt.matchAll(SYMBOL_BACKTICK_RE)) { if (match[1]) out.add(match[1]) } + for (const match of prompt.matchAll(SYMBOL_EXPLICIT_RE)) { + const candidate = match[1]?.trim() + if (!candidate) { + continue + } + if (/\.(?:ts|tsx|js|jsx|mjs|cjs|json|md|mdx)$/i.test(candidate) || candidate.includes('/')) { + continue + } + out.add(candidate) + } return [...out] } + +export function extractPromptExclusions(prompt: string): { + excludedTerms: string[] + excludedPathHints: string[] + excludedDomains: RetrievalExcludedDomain[] + positivePrompt: string +} { + const excludedTerms = new Set() + const excludedPathHints = new Set() + const excludedDomains = new Set() + const spans: Array<{ start: number; end: number }> = [] + + for (const match of prompt.matchAll(EXCLUSION_SPAN_RE)) { + const phrase = match[1]?.trim() + const index = match.index + if (!phrase || index === undefined) { + continue + } + spans.push({ start: index, end: index + match[0].length }) + for (const term of splitExclusionPhrase(phrase)) { + excludedTerms.add(term) + const hint = normalizeExclusionPathHint(term) + if (hint) { + excludedPathHints.add(hint) + } + const trailingWord = term.split(/\s+/).at(-1) + if (trailingWord && trailingWord !== term) { + excludedTerms.add(trailingWord) + } + for (const mapping of EXCLUDED_DOMAIN_HINTS) { + if (mapping.pattern.test(term)) { + excludedDomains.add(mapping.domain) + mapping.pathHints.forEach((pathHint) => excludedPathHints.add(pathHint)) + } + } + } + } + + const positivePrompt = spans.length === 0 + ? 
prompt + : compressPrompt(excludePromptSpans(prompt, spans)) + + return { + excludedTerms: [...excludedTerms], + excludedPathHints: [...excludedPathHints], + excludedDomains: [...excludedDomains], + positivePrompt, + } +} + +function splitExclusionPhrase(phrase: string): string[] { + return phrase + .split(/\s*(?:,| and | or )\s*/i) + .map((part) => part.trim().replace(/^[^A-Za-z0-9_]+|[^A-Za-z0-9_]+$/g, '')) + .map((part) => part.replace(/^(?:the|any|and)\s+/i, '')) + .filter((part) => part.length > 0) +} + +function normalizeExclusionPathHint(term: string): string | null { + const normalized = term + .toLowerCase() + .replace(/[^a-z0-9]+/g, '-') + .replace(/^-+|-+$/g, '') + return normalized.length > 0 ? normalized : null +} + +function excludePromptSpans(prompt: string, spans: ReadonlyArray<{ start: number; end: number }>): string { + let cursor = 0 + let out = '' + for (const span of [...spans].sort((left, right) => left.start - right.start)) { + out += prompt.slice(cursor, span.start) + cursor = span.end + } + out += prompt.slice(cursor) + return out +} + +function compressPrompt(prompt: string): string { + return prompt.replace(/\s+/g, ' ').replace(/\s+([,.;])/g, '$1').trim() +} diff --git a/src/runtime/retrieve.ts b/src/runtime/retrieve.ts index 2e07d88..551f821 100644 --- a/src/runtime/retrieve.ts +++ b/src/runtime/retrieve.ts @@ -20,6 +20,11 @@ import { godNodes, workspaceBridges } from '../pipeline/analyze.js' import { type Communities } from '../pipeline/cluster.js' import { buildCommunityLabels } from '../pipeline/community-naming.js' import { lineNumberFromSourceLocation, lineRangeFromSourceLocation } from '../shared/source-location.js' +import { + classifySourceDomain, + isPollutedSourcePath, + type SourceDomain, +} from '../shared/source-discovery.js' import { relativizeSourceFile } from '../shared/source-path.js' import { classifyTaskContract, @@ -90,6 +95,7 @@ export interface RetrieveMatchedNode { framework?: string | undefined framework_role?: 
string | undefined framework_boost?: number + source_domain?: SourceDomain file_type: string snippet: string | null match_score: number @@ -448,6 +454,7 @@ function scoredNodeFromGraphEntry( attributes: Record, frameworkProfile: FrameworkQuestionProfile, questionLower = '', + rootPath?: string, ): ScoredNode { const resolvedLine = resolvedLineNumber(attributes) const nodeKind = String(attributes.node_kind ?? '') @@ -467,11 +474,13 @@ function scoredNodeFromGraphEntry( nodeKind, framework: typeof attributes.framework === 'string' ? attributes.framework : undefined, frameworkRole: frameworkRole || undefined, + sourceDomain: classifySourceDomain(String(attributes.source_file ?? ''), rootPath), fileType: String(attributes.file_type ?? '').trim().toLowerCase(), fileNodeLike: isFileNodeLike(String(attributes.label ?? ''), String(attributes.source_file ?? '')), community: parseCommunityId(attributes.community), frameworkBoost: frameworkBoostForNode(frameworkProfile, nodeKind, frameworkRole, frameworkMetadataFromAttributes(attributes), questionLower), exactLabelMatch: false, + literalPathMatch: false, sourcePathMatch: false, evidenceTier: 0, score: 0, @@ -542,12 +551,14 @@ interface SeedCandidate { nodeKind: string framework?: string | undefined frameworkRole?: string | undefined + sourceDomain: SourceDomain fileType: string fileNodeLike: boolean community: number | null frameworkBoost: number seedScore: SeedScoreBreakdown exactLabelMatch: boolean + literalPathMatch: boolean sourcePathMatch: boolean evidenceTier: 0 | 1 | 2 relevanceBand: 'direct' | 'related' | 'peripheral' @@ -564,18 +575,20 @@ interface ScoredNode { nodeKind: string framework?: string | undefined frameworkRole?: string | undefined + sourceDomain: SourceDomain fileType: string fileNodeLike: boolean community: number | null frameworkBoost: number exactLabelMatch: boolean + literalPathMatch: boolean sourcePathMatch: boolean evidenceTier: 0 | 1 | 2 score: number relevanceBand: 'direct' | 'related' | 
'peripheral' } -function scoredNodeFromGraph(graph: KnowledgeGraph, nodeId: string, score: number): ScoredNode { +function scoredNodeFromGraph(graph: KnowledgeGraph, nodeId: string, score: number, rootPath?: string): ScoredNode { const attributes = graph.nodeAttributes(nodeId) const resolvedLine = resolvedLineNumber(attributes) return { @@ -591,11 +604,13 @@ function scoredNodeFromGraph(graph: KnowledgeGraph, nodeId: string, score: numbe nodeKind: String(attributes.node_kind ?? ''), framework: typeof attributes.framework === 'string' ? attributes.framework : undefined, frameworkRole: typeof attributes.framework_role === 'string' ? attributes.framework_role : undefined, + sourceDomain: classifySourceDomain(String(attributes.source_file ?? ''), rootPath), fileType: String(attributes.file_type ?? '').trim().toLowerCase(), fileNodeLike: isFileNodeLike(String(attributes.label ?? ''), String(attributes.source_file ?? '')), community: parseCommunityId(attributes.community), frameworkBoost: 0, exactLabelMatch: false, + literalPathMatch: false, sourcePathMatch: false, evidenceTier: 0, score, @@ -646,6 +661,13 @@ interface FrameworkQuestionProfile { modelIntent: boolean } +interface SymbolReference { + raw: string + bareName: string + className?: string + methodName?: string +} + function activeFrameworksForProfile(profile: FrameworkQuestionProfile): ReadonlySet { const frameworks = new Set() if (profile.express) frameworks.add('express') @@ -672,8 +694,90 @@ function normalizeSeedText(value: string): string { return tokenizeLabel(value).join('') } -function normalizeMentionedSymbol(value: string): string { - return normalizeSeedText(value.replace(/\(\)$/, '').split('.').at(-1) ?? 
value) +function normalizeIdentifier(value: string): string { + return normalizeSeedText(value.replace(/\(\)$/, '')) +} + +function parseSymbolReference(value: string): SymbolReference { + const trimmed = value.trim().replace(/`/g, '') + const withoutCall = trimmed.replace(/\(\)$/, '') + const separatorMatch = withoutCall.match(/^([A-Za-z_$][\w$]*)(?:\.|#|::)([A-Za-z_$][\w$]*)$/) + if (separatorMatch?.[1] && separatorMatch[2]) { + return { + raw: trimmed, + bareName: separatorMatch[2], + className: separatorMatch[1], + methodName: separatorMatch[2], + } + } + + return { + raw: trimmed, + bareName: withoutCall, + ...(trimmed.endsWith('()') ? { methodName: withoutCall } : {}), + } +} + +function labelSymbolParts(label: string): { className?: string; methodName?: string; normalized: string } { + const trimmed = label.trim().replace(/`/g, '') + const normalized = normalizeIdentifier(trimmed) + const dotted = trimmed.replace(/\(\)$/, '') + const separatorMatch = dotted.match(/^([A-Za-z_$][\w$]*)(?:\.|#|::)([A-Za-z_$][\w$]*)$/) + if (separatorMatch?.[1] && separatorMatch[2]) { + return { + className: separatorMatch[1], + methodName: separatorMatch[2], + normalized, + } + } + + const methodOnlyMatch = dotted.match(/^\.?([A-Za-z_$][\w$]*)$/) + return { + ...(methodOnlyMatch?.[1] ? { methodName: methodOnlyMatch[1] } : {}), + normalized, + } +} + +function symbolReferenceMatchScore( + label: string, + sourceFile: string, + references: readonly SymbolReference[], +): number { + const parts = labelSymbolParts(label) + const normalizedSource = normalizeSeedText(sourceFile) + let best = 0 + + for (const reference of references) { + const normalizedRaw = normalizeIdentifier(reference.raw) + if (parts.normalized === normalizedRaw) { + best = Math.max(best, 4) + continue + } + + const normalizedBare = normalizeIdentifier(reference.bareName) + if (reference.className && reference.methodName) { + const classMatches = normalizeIdentifier(parts.className ?? 
'') === normalizeIdentifier(reference.className) + const methodMatches = normalizeIdentifier(parts.methodName ?? '') === normalizeIdentifier(reference.methodName) + if (classMatches && methodMatches) { + best = Math.max(best, 4) + continue + } + if (methodMatches && normalizedSource.includes(normalizeIdentifier(reference.className))) { + best = Math.max(best, 3.5) + continue + } + } + + if (normalizeIdentifier(parts.methodName ?? '') === normalizedBare) { + best = Math.max(best, reference.methodName ? 3 : 2.5) + continue + } + if (parts.normalized === normalizedBare) { + best = Math.max(best, 2.5) + } + } + + return best } function sourceFileMatchesMentionedPath(sourceFile: string, mentionedPaths: readonly string[]): boolean { @@ -684,6 +788,65 @@ function sourceFileMatchesMentionedPath(sourceFile: string, mentionedPaths: read return mentionedPaths.some((path) => sourceFile === path || sourceFile.endsWith(`/${path}`)) } +function excludedTermMatches(value: string, excludedTerms: readonly string[], excludedPathHints: readonly string[]): boolean { + const lowerValue = value.toLowerCase() + return excludedTerms.some((term) => lowerValue.includes(term.toLowerCase())) + || excludedPathHints.some((hint) => lowerValue.includes(hint.toLowerCase())) +} + +function promptAllowsSourceDomain(domain: SourceDomain, intent: string, prompt: string, questionTokens: readonly string[]): boolean { + const lowerPrompt = prompt.toLowerCase() + switch (domain) { + case 'test': + return intent === 'test' || includesAnyToken(questionTokens, ['test', 'tests', 'spec', 'coverage', 'e2e']) + case 'benchmark': + return includesAnyToken(questionTokens, ['bench', 'benchmark', 'benchmarks', 'perf', 'performance']) + || /\b(html reporter|reporter utilities?)\b/i.test(lowerPrompt) + case 'fixture': + return includesAnyToken(questionTokens, ['fixture', 'fixtures', 'mock', 'mocks']) + case 'generated': + return includesAnyToken(questionTokens, ['generated', 'codegen']) + case 'docs': + return 
includesAnyToken(questionTokens, ['doc', 'docs', 'readme', 'changelog']) + case 'config': + return includesAnyToken(questionTokens, ['config', 'configs', 'env', 'docker', 'compose', 'settings']) + case 'build_artifact': + return includesAnyToken(questionTokens, ['dist', 'build', 'artifact', 'artifacts']) + case 'production': + case 'unknown': + return true + } +} + +function defaultSourceDomainPenalty( + domain: SourceDomain, + intent: string, + prompt: string, + questionTokens: readonly string[], +): number { + if (promptAllowsSourceDomain(domain, intent, prompt, questionTokens)) { + return 0 + } + + switch (domain) { + case 'test': + case 'benchmark': + return 3 + case 'fixture': + return 2.5 + case 'generated': + case 'build_artifact': + return 3.5 + case 'docs': + return 1.25 + case 'config': + return 0.75 + case 'production': + case 'unknown': + return 0 + } +} + function isFileNodeLike(label: string, sourceFile: string): boolean { if (!label || !sourceFile) { return false @@ -1303,6 +1466,7 @@ function buildRetrieveResultFromOrderedCandidates( source_file: serializedSourceFile, line_number: node.lineNumber, framework_boost: node.frameworkBoost, + source_domain: node.sourceDomain, file_type: node.fileType, snippet, match_score: node.score, @@ -1327,6 +1491,7 @@ function buildRetrieveResultFromOrderedCandidates( file_type: node.fileType, ...(node.nodeKind.trim().length > 0 ? { node_kind: node.nodeKind } : {}), framework_boost: node.frameworkBoost, + source_domain: node.sourceDomain, match_score: node.score, exact_anchor_match: node.exactLabelMatch, direct_symbol_match: node.exactLabelMatch, @@ -1395,7 +1560,7 @@ function buildRetrieveResultFromOrderedCandidates( export function retrieveContext(graph: KnowledgeGraph, options: RetrieveOptions): RetrieveResult { const { question, budget } = options const questionTokens = tokenizeQuestion(question) - const rootPath = typeof graph.graph.root_path === 'string' ? 
graph.graph.root_path : undefined + const rootPath = typeof graph.graph.root_path === 'string' ? graph.graph.root_path : process.cwd() const retrievalGate = classifyRetrievalLevel({ prompt: question, ...(options.retrievalLevel !== undefined ? { manualOverride: options.retrievalLevel } : {}), @@ -1478,8 +1643,11 @@ export function retrieveContext(graph: KnowledgeGraph, options: RetrieveOptions) ...buildCommunityLabels(graph, communities), ...storedCommunityLabelsFromGraph(graph), } - const mentionedSymbols = new Set(retrievalGate.signals.mentioned_symbols.map(normalizeMentionedSymbol)) + const mentionedSymbolRefs = retrievalGate.signals.mentioned_symbols.map(parseSymbolReference) const mentionedPaths = retrievalGate.signals.mentioned_paths + const excludedDomains = retrievalGate.signals.excluded_domains ?? [] + const excludedTerms = retrievalGate.signals.excluded_terms ?? [] + const excludedPathHints = retrievalGate.signals.excluded_path_hints ?? [] // Step 1+2: Score all nodes with explicit seed evidence weights. const tokenWeights = tokenWeightsForQuestion(graph, questionTokens) @@ -1499,8 +1667,10 @@ export function retrieveContext(graph: KnowledgeGraph, options: RetrieveOptions) const label = String(attributes.label ?? '') const sourceFile = String(attributes.source_file ?? '') const nodeKind = String(attributes.node_kind ?? '') + const sourceDomain = classifySourceDomain(sourceFile, rootPath) const fileNodeLike = isFileNodeLike(label, sourceFile) - const exactAnchorMatch = mentionedSymbols.has(normalizeMentionedSymbol(label)) + const symbolMatch = symbolReferenceMatchScore(label, sourceFile, mentionedSymbolRefs) + const exactAnchorMatch = symbolMatch >= 3 const mentionedPathMatch = sourceFileMatchesMentionedPath(sourceFile, mentionedPaths) const framework = typeof attributes.framework === 'string' ? attributes.framework : undefined const frameworkRole = String(attributes.framework_role ?? 
'') @@ -1514,6 +1684,14 @@ export function retrieveContext(graph: KnowledgeGraph, options: RetrieveOptions) averageLabelLength, { fileNodeLike, fileOrientedQuestion }, ) + const anchorScore = symbolMatch + const exclusionMatches = excludedDomains.includes(sourceDomain as never) + || excludedTermMatches(label, excludedTerms, excludedPathHints) + || excludedTermMatches(sourceFile, excludedTerms, excludedPathHints) + if ((isPollutedSourcePath(sourceFile, rootPath) || exclusionMatches) && !exactAnchorMatch && !mentionedPathMatch) { + continue + } + const sourceDomainPenalty = defaultSourceDomainPenalty(sourceDomain, retrievalGate.intent, question, questionTokens) // CodeRabbit fix: compute framework boost BEFORE the seed gate so // metadata-only matches (e.g. a `handler()` node tagged with @@ -1527,7 +1705,9 @@ export function retrieveContext(graph: KnowledgeGraph, options: RetrieveOptions) questionLower, ) - if (score.total > 0 || metadataBoost > 0) { + const totalSeedScore = score.total + anchorScore + metadataBoost - sourceDomainPenalty + const hasPositiveSeedEvidence = score.total > 0 || anchorScore > 0 || metadataBoost > 0 || mentionedPathMatch + if (hasPositiveSeedEvidence) { const resolvedLine = resolvedLineNumber(attributes) seedCandidates.push({ id, @@ -1542,32 +1722,49 @@ export function retrieveContext(graph: KnowledgeGraph, options: RetrieveOptions) nodeKind, framework, frameworkRole: frameworkRole || undefined, + sourceDomain, fileType, fileNodeLike, community, frameworkBoost: metadataBoost, - seedScore: score, + seedScore: { + ...score, + labelExactScore: score.labelExactScore + anchorScore, + total: totalSeedScore, + }, exactLabelMatch: score.labelExactScore > 0 || exactAnchorMatch, + literalPathMatch: mentionedPathMatch, sourcePathMatch: score.sourcePathScore > 0 || mentionedPathMatch, // When the seed only made it in via metadata boost, give it at // least evidence tier 1 so it's not at the bottom of the heap. evidenceTier: metadataBoost > 0 ? 
(Math.max(evidenceTierForSeedScore(score), 1) as 0 | 1 | 2) - : evidenceTierForSeedScore(score), + : (exactAnchorMatch || mentionedPathMatch ? 2 : evidenceTierForSeedScore(score)), relevanceBand: score.labelExactScore > 0 || exactAnchorMatch || score.labelTokenScore > 0 ? 'direct' : 'related', }) } } + const cleanExactLabels = new Set( + seedCandidates + .filter((candidate) => !isPollutedSourcePath(candidate.sourceFile, rootPath)) + .map((candidate) => normalizeSeedText(candidate.label)), + ) + const filteredSeedCandidates = seedCandidates.filter((candidate) => ( + !isPollutedSourcePath(candidate.sourceFile, rootPath) + || candidate.literalPathMatch + || !cleanExactLabels.has(normalizeSeedText(candidate.label)) + )) + const fusedSeedScores = reciprocalRankFuse([ - rankedSeedCandidateIds(graph, seedCandidates, (candidate) => candidate.seedScore.labelExactScore), - rankedSeedCandidateIds(graph, seedCandidates, (candidate) => candidate.seedScore.labelTokenScore), - rankedSeedCandidateIds(graph, seedCandidates, (candidate) => candidate.seedScore.sourcePathScore), - rankedSeedCandidateIds(graph, seedCandidates, (candidate) => candidate.seedScore.communityScore), + rankedSeedCandidateIds(graph, filteredSeedCandidates, (candidate) => candidate.seedScore.labelExactScore), + rankedSeedCandidateIds(graph, filteredSeedCandidates, (candidate) => candidate.seedScore.labelTokenScore), + rankedSeedCandidateIds(graph, filteredSeedCandidates, (candidate) => candidate.seedScore.sourcePathScore), + rankedSeedCandidateIds(graph, filteredSeedCandidates, (candidate) => candidate.seedScore.communityScore), ], { weights: [2, 1.5, 0.5, 0.25], }) - const scored: ScoredNode[] = seedCandidates.map((candidate) => ({ + const scored: ScoredNode[] = filteredSeedCandidates.map((candidate) => ({ id: candidate.id, label: candidate.label, sourceFile: candidate.sourceFile, @@ -1583,16 +1780,18 @@ export function retrieveContext(graph: KnowledgeGraph, options: RetrieveOptions) community: 
candidate.community, frameworkBoost: candidate.frameworkBoost, exactLabelMatch: candidate.exactLabelMatch, + literalPathMatch: candidate.literalPathMatch, sourcePathMatch: candidate.sourcePathMatch, evidenceTier: candidate.evidenceTier, - score: ((fusedSeedScores.get(candidate.id) ?? 0) * SEED_FUSION_SCORE_SCALE) + candidate.frameworkBoost, + score: Math.max(0.05, ((fusedSeedScores.get(candidate.id) ?? 0) * SEED_FUSION_SCORE_SCALE) + candidate.frameworkBoost - defaultSourceDomainPenalty(candidate.sourceDomain, retrievalGate.intent, question, questionTokens)), relevanceBand: candidate.relevanceBand, + sourceDomain: candidate.sourceDomain, })) scored.sort((a, b) => compareScoredNodes(graph, a, b)) const expansionPolicy = expansionPolicyForLevel(effectiveRetrievalLevel, budget) - const anchoredSeedPool = (mentionedSymbols.size > 0 || mentionedPaths.length > 0) - ? scored.filter((node) => mentionedSymbols.has(normalizeMentionedSymbol(node.label)) || sourceFileMatchesMentionedPath(node.sourceFile, mentionedPaths)) + const anchoredSeedPool = (mentionedSymbolRefs.length > 0 || mentionedPaths.length > 0) + ? scored.filter((node) => symbolReferenceMatchScore(node.label, node.sourceFile, mentionedSymbolRefs) > 0 || sourceFileMatchesMentionedPath(node.sourceFile, mentionedPaths)) : [] const seedPool = effectiveRetrievalLevel <= 2 && anchoredSeedPool.length > 0 ? anchoredSeedPool : scored @@ -1748,12 +1947,24 @@ export function retrieveContext(graph: KnowledgeGraph, options: RetrieveOptions) if (options.fileType && fileType !== options.fileType.trim().toLowerCase()) { continue } + const label = String(attributes.label ?? '') + const sourceFile = String(attributes.source_file ?? 
'') + const sourceDomain = classifySourceDomain(sourceFile, rootPath) + const symbolMatch = symbolReferenceMatchScore(label, sourceFile, mentionedSymbolRefs) + const pathMatch = sourceFileMatchesMentionedPath(sourceFile, mentionedPaths) + const exclusionMatches = excludedDomains.includes(sourceDomain as never) + || excludedTermMatches(label, excludedTerms, excludedPathHints) + || excludedTermMatches(sourceFile, excludedTerms, excludedPathHints) + if ((isPollutedSourcePath(sourceFile, rootPath) || exclusionMatches) && symbolMatch <= 0 && !pathMatch) { + continue + } + const sourceDomainPenalty = defaultSourceDomainPenalty(sourceDomain, retrievalGate.intent, question, questionTokens) const resolvedLine = resolvedLineNumber(attributes) scored.push({ id: nodeId, - label: String(attributes.label ?? ''), - sourceFile: String(attributes.source_file ?? ''), + label, + sourceFile, sourceLocation: typeof attributes.source_location === 'string' && attributes.source_location.length > 0 ? attributes.source_location : null, @@ -1763,14 +1974,16 @@ export function retrieveContext(graph: KnowledgeGraph, options: RetrieveOptions) nodeKind: String(attributes.node_kind ?? ''), framework: typeof attributes.framework === 'string' ? attributes.framework : undefined, frameworkRole: typeof attributes.framework_role === 'string' ? attributes.framework_role : undefined, + sourceDomain, fileType, - fileNodeLike: isFileNodeLike(String(attributes.label ?? ''), String(attributes.source_file ?? '')), + fileNodeLike: isFileNodeLike(label, sourceFile), community, frameworkBoost: 0, - exactLabelMatch: false, - sourcePathMatch: false, + exactLabelMatch: symbolMatch >= 3, + literalPathMatch: pathMatch, + sourcePathMatch: pathMatch, evidenceTier: hopDistances.get(nodeId) === 1 ? (hopEvidenceTiers.get(nodeId) ?? 0) : 0, - score: hopScore, + score: Math.max(0.05, hopScore - sourceDomainPenalty), relevanceBand: hopDistances.get(nodeId) === 1 ? 
'related' : 'peripheral', }) } @@ -1840,16 +2053,27 @@ export function retrieveContext(graph: KnowledgeGraph, options: RetrieveOptions) label: node.label, sourceFile: node.sourceFile, exactLabelMatch: node.exactLabelMatch, - sourcePathMatch: node.sourcePathMatch, - score: node.score, + sourcePathMatch: node.sourcePathMatch, + literalPathMatch: node.literalPathMatch, + score: node.score, + nodeKind: node.nodeKind, + frameworkRole: node.frameworkRole, })), retrievalGate.intent, + { + prompt: question, + mentionedSymbols: retrievalGate.signals.mentioned_symbols, + excludedDomains: retrievalGate.signals.excluded_domains, + excludedTerms: retrievalGate.signals.excluded_terms, + excludedPathHints: retrievalGate.signals.excluded_path_hints, + rootPath, + }, ) if (sliced) { const scoredById = new Map(scored.map((node) => [node.id, node])) const sliceCandidates = sliced.ordered_ids.map((nodeId, index) => ( - scoredById.get(nodeId) ?? scoredNodeFromGraph(graph, nodeId, Math.max(0.25, 2 - (index * 0.1))) + scoredById.get(nodeId) ?? scoredNodeFromGraph(graph, nodeId, Math.max(0.25, 2 - (index * 0.1)), rootPath) )) return buildRetrieveResultFromOrderedCandidates( @@ -1891,7 +2115,7 @@ export async function retrieveContextAsync(graph: KnowledgeGraph, options: Retri const frameworkProfile = buildFrameworkQuestionProfile(options.question, questionTokens) const activeFrameworks = activeFrameworksForProfile(frameworkProfile) - const rootPath = typeof graph.graph.root_path === 'string' ? graph.graph.root_path : undefined + const rootPath = typeof graph.graph.root_path === 'string' ? 
graph.graph.root_path : process.cwd() const communities = communitiesFromGraph(graph) const communityLabels: Record = { ...buildCommunityLabels(graph, communities), @@ -1922,7 +2146,7 @@ export async function retrieveContextAsync(graph: KnowledgeGraph, options: Retri const questionLower = options.question.toLowerCase() const candidatesById = new Map( eligibleNodeEntries(graph, options) - .map(([id, attributes]) => [id, scoredNodeFromGraphEntry(id, attributes, frameworkProfile, questionLower)] as const), + .map(([id, attributes]) => [id, scoredNodeFromGraphEntry(id, attributes, frameworkProfile, questionLower, rootPath)] as const), ) if (candidatesById.size === 0) { return lexicalResult diff --git a/src/runtime/retrieve/slicing.ts b/src/runtime/retrieve/slicing.ts index f3ac753..714e3ca 100644 --- a/src/runtime/retrieve/slicing.ts +++ b/src/runtime/retrieve/slicing.ts @@ -5,23 +5,39 @@ import type { } from '../../contracts/context-pack.js' import type { KnowledgeGraph } from '../../contracts/graph.js' import type { RetrievalIntent } from '../../contracts/retrieval-gate.js' +import { classifySourceDomain, isPollutedSourcePath } from '../../shared/source-discovery.js' export interface SliceScoredNode { id: string label: string sourceFile: string + nodeKind?: string | undefined + frameworkRole?: string | undefined exactLabelMatch: boolean + literalPathMatch?: boolean sourcePathMatch: boolean score: number } +interface SliceOptions { + prompt?: string | undefined + mentionedSymbols?: readonly string[] | undefined + excludedDomains?: readonly string[] | undefined + excludedTerms?: readonly string[] | undefined + excludedPathHints?: readonly string[] | undefined + rootPath?: string | undefined +} + function sliceNodeFromGraph(graph: KnowledgeGraph, nodeId: string): SliceScoredNode { const attributes = graph.nodeAttributes(nodeId) return { id: nodeId, label: String(attributes.label ?? nodeId), sourceFile: String(attributes.source_file ?? 
''), + nodeKind: typeof attributes.node_kind === 'string' ? attributes.node_kind : undefined, + frameworkRole: typeof attributes.framework_role === 'string' ? attributes.framework_role : undefined, exactLabelMatch: false, + literalPathMatch: false, sourcePathMatch: false, score: 0.25, } @@ -95,6 +111,49 @@ function policyForIntent(intent: RetrievalIntent): SlicePolicy { } } +function promptWantsRuntimePipeline(prompt: string | undefined): boolean { + if (!prompt) { + return false + } + + return /\b(runtime|pipeline|service|orchestrator|job|agent|scoring|report builder|persistence|repository)\b/i.test(prompt) +} + +function methodLikeNode(node: SliceScoredNode): boolean { + return node.nodeKind?.toLowerCase() === 'method' || /(?:[.#:]|^\.)[A-Za-z_$][\w$]*\(?\)?$/u.test(node.label) +} + +function effectivePolicy(intent: RetrievalIntent, anchors: readonly SliceScoredNode[], prompt: string | undefined): SlicePolicy { + const base = policyForIntent(intent) + const hasMethodAnchor = anchors.some((anchor) => methodLikeNode(anchor)) + const pipelinePrompt = promptWantsRuntimePipeline(prompt) + + if (!hasMethodAnchor && !pipelinePrompt) { + return base + } + + const forwardRelations = new Set(base.forward_relations) + if (hasMethodAnchor) { + forwardRelations.delete('contains') + forwardRelations.delete('method') + } + + const helperRelations = new Set(base.helper_relations) + if (pipelinePrompt) { + helperRelations.add('injects') + helperRelations.add('depends_on') + helperRelations.add('module_provides') + } + + return { + ...base, + forward_relations: forwardRelations, + helper_relations: helperRelations, + backward_depth: pipelinePrompt ? Math.max(base.backward_depth, 3) : base.backward_depth, + forward_depth: pipelinePrompt ? 
Math.max(base.forward_depth, 3) : base.forward_depth, + } +} + function isBarrelLike(label: string, sourceFile: string): boolean { return label.trim().toLowerCase() === 'index.ts' || /(?:^|\/)index\.ts$/i.test(sourceFile) } @@ -103,11 +162,24 @@ function shouldSuppressNode( graph: KnowledgeGraph, node: SliceScoredNode, anchoredIds: ReadonlySet, + options: SliceOptions, ): boolean { if (anchoredIds.has(node.id)) { return false } + const sourceDomain = classifySourceDomain(node.sourceFile, options.rootPath) + if ((options.excludedDomains ?? []).includes(sourceDomain)) { + return true + } + if (isPollutedSourcePath(node.sourceFile, options.rootPath)) { + return true + } + const excludedTerms = [...(options.excludedTerms ?? []), ...(options.excludedPathHints ?? [])].map((term) => term.toLowerCase()) + if (excludedTerms.some((term) => node.label.toLowerCase().includes(term) || node.sourceFile.toLowerCase().includes(term))) { + return true + } + if (isBarrelLike(node.label, node.sourceFile)) { return true } @@ -119,16 +191,21 @@ function buildAnchors(scored: readonly SliceScoredNode[]): ContextPackSliceAncho const anchors: ContextPackSliceAnchor[] = [] const seen = new Set() const matchedAnchors = scored.filter((node) => node.exactLabelMatch || node.sourcePathMatch) + const exactMethodAnchors = matchedAnchors.filter((node) => node.exactLabelMatch && methodLikeNode(node)) const nonBarrelMatchedAnchors = matchedAnchors.filter((node) => !isBarrelLike(node.label, node.sourceFile)) - const anchorPool = matchedAnchors.length > 0 + const anchorPool = exactMethodAnchors.length > 0 + ? exactMethodAnchors + : matchedAnchors.length > 0 ? (nonBarrelMatchedAnchors.length > 0 ? nonBarrelMatchedAnchors : matchedAnchors) : scored.filter((node) => !isBarrelLike(node.label, node.sourceFile)).slice(0, 1) for (const node of anchorPool) { const reason = node.exactLabelMatch ? 'symbol mention' - : node.sourcePathMatch + : node.literalPathMatch ? 'path mention' + : node.sourcePathMatch + ? 
'source path token match' : 'top lexical match' if (!reason || seen.has(node.id)) { continue @@ -169,6 +246,7 @@ function traverseDirection( pathSeen: Set, selectedPaths: ContextPackSlicePath[], anchoredIds: ReadonlySet, + options: SliceOptions, direction: 'forward' | 'backward', relations: ReadonlySet, maxDepth: number, @@ -193,7 +271,7 @@ function traverseDirection( const neighbor = scoredById.get(neighborId) ?? sliceNodeFromGraph(graph, neighborId) scoredById.set(neighborId, neighbor) - if (shouldSuppressNode(graph, neighbor, anchoredIds)) { + if (shouldSuppressNode(graph, neighbor, anchoredIds, options)) { continue } @@ -229,6 +307,7 @@ function addHelperNeighbors( pathSeen: Set, selectedPaths: ContextPackSlicePath[], anchoredIds: ReadonlySet, + options: SliceOptions, ): void { for (const currentId of [...orderedIds]) { const currentNode = scoredById.get(currentId) @@ -244,7 +323,7 @@ function addHelperNeighbors( const neighbor = scoredById.get(neighborId) ?? sliceNodeFromGraph(graph, neighborId) scoredById.set(neighborId, neighbor) - if (shouldSuppressNode(graph, neighbor, anchoredIds)) { + if (shouldSuppressNode(graph, neighbor, anchoredIds, options)) { continue } @@ -269,6 +348,7 @@ export function sliceCandidatesForRetrieve( graph: KnowledgeGraph, scoredCandidates: readonly SliceScoredNode[], intent: RetrievalIntent, + options: SliceOptions = {}, ): { ordered_ids: string[]; metadata: ContextPackSliceMetadata } | null { if (scoredCandidates.length === 0) { return null @@ -279,7 +359,10 @@ export function sliceCandidatesForRetrieve( return null } - const policy = policyForIntent(intent) + const anchorNodes = anchors + .map((anchor) => scoredCandidates.find((candidate) => candidate.id === anchor.node_id)) + .filter((candidate): candidate is SliceScoredNode => candidate !== undefined) + const policy = effectivePolicy(intent, anchorNodes, options.prompt) const anchorIds = anchors.map((anchor) => anchor.node_id).filter((id): id is string => typeof id === 
'string') const orderedIds = [...anchorIds] const selectedIds = new Set(anchorIds) @@ -298,6 +381,7 @@ export function sliceCandidatesForRetrieve( pathSeen, selectedPaths, anchoredIds, + options, 'backward', policy.backward_relations, policy.backward_depth, @@ -314,6 +398,7 @@ export function sliceCandidatesForRetrieve( pathSeen, selectedPaths, anchoredIds, + options, 'forward', policy.forward_relations, policy.forward_depth, @@ -329,6 +414,7 @@ export function sliceCandidatesForRetrieve( pathSeen, selectedPaths, anchoredIds, + options, ) return { diff --git a/src/shared/source-discovery.ts b/src/shared/source-discovery.ts new file mode 100644 index 0000000..2e18fd4 --- /dev/null +++ b/src/shared/source-discovery.ts @@ -0,0 +1,244 @@ +import { basename, relative, resolve, sep } from 'node:path' +import { readFileSync } from 'node:fs' + +export type SourceDomain = + | 'production' + | 'test' + | 'benchmark' + | 'fixture' + | 'generated' + | 'docs' + | 'config' + | 'build_artifact' + | 'unknown' + +export const DEFAULT_HARD_IGNORE_GLOBS = [ + '**/.git/**', + '**/.hg/**', + '**/.svn/**', + '**/.worktrees/**', + '**/worktrees/**', + '**/.repo/**', + '**/.jj/**', + '**/graphify-out/**', + '**/.graphify/**', + '**/graphify-cache/**', + '**/graphify-report/**', + '**/GRAPH_REPORT.md', + '**/node_modules/**', + '**/.pnpm-store/**', + '**/.yarn/cache/**', + '**/.yarn/unplugged/**', + '**/.yarn/build-state.yml', + '**/bower_components/**', + '**/vendor/**', + '**/dist/**', + '**/build/**', + '**/out/**', + '**/lib/**', + '**/.next/**', + '**/.nuxt/**', + '**/.svelte-kit/**', + '**/.astro/**', + '**/.vite/**', + '**/.turbo/**', + '**/.nx/**', + '**/.parcel-cache/**', + '**/.cache/**', + '**/.serverless/**', + '**/.vercel/**', + '**/.netlify/**', + '**/coverage/**', + '**/.nyc_output/**', + '**/*.min.js', + '**/*.min.css', + '**/*.map', + '**/*.tsbuildinfo', + '**/*.d.ts.map', + '**/*.log', + '**/logs/**', + '**/tmp/**', + '**/temp/**', + '**/.DS_Store', +] as const + +const 
TEST_DOMAIN_RE = /(?:^|\/)(?:__tests__|tests?|spec|specs|e2e|cypress|playwright)(?:\/|$)|\.(?:test|spec)\.[^/]+$/i +const BENCHMARK_DOMAIN_RE = /(?:^|\/)(?:bench|benchmark|benchmarks|perf|performance)(?:\/|$)|\.(?:bench|benchmark)\.[^/]+$/i +const FIXTURE_DOMAIN_RE = /(?:^|\/)(?:fixtures?|__fixtures__|mocks?|__mocks__)(?:\/|$)|\.fixture\.[^/]+$/i +const GENERATED_DOMAIN_RE = /(?:^|\/)(?:generated|__generated__)(?:\/|$)|\.(?:generated|gen)\.[^/]+$/i +const DOCS_DOMAIN_RE = /(?:^|\/)docs(?:\/|$)|\.(?:md|mdx|rst|txt)$/i +const CONFIG_DOMAIN_RE = /(?:^|\/)(?:config|configs?|settings)(?:\/|$)|(?:^|\/)\.env(?:\.[^/]+)?$|(?:^|\/)(?:package|tsconfig|vite|vitest|jest|eslint|prettier|rollup|webpack|babel|docker-compose|compose|pnpm-workspace|turbo|nx)\.(?:json|ya?ml|[cm]?js|ts|mjs|cjs)$/i +const BUILD_ARTIFACT_DOMAIN_RE = /(?:^|\/)(?:dist|build|out|coverage|graphify-out|\.next|\.nuxt|\.svelte-kit|\.astro|\.vite|\.turbo|\.nx|\.parcel-cache|\.cache|\.serverless|\.vercel|\.netlify)(?:\/|$)|\.(?:min\.(?:js|css)|map|tsbuildinfo|d\.ts\.map)$/i +const HARD_IGNORE_REGEXES: ReadonlyArray = [ + /(?:^|\/)\.(?:git|hg|svn|repo|jj)(?:\/|$)/i, + /(?:^|\/)\.worktrees(?:\/|$)/i, + /(?:^|\/)worktrees(?:\/|$)/i, + /(?:^|\/)(?:graphify-out|\.graphify|graphify-cache|graphify-report)(?:\/|$)/i, + /(?:^|\/)GRAPH_REPORT\.md$/i, + /(?:^|\/)(?:node_modules|bower_components|vendor|dist|build|out|lib|coverage|logs|tmp|temp)(?:\/|$)/i, + /(?:^|\/)\.pnpm-store(?:\/|$)/i, + /(?:^|\/)\.yarn\/(?:cache|unplugged)(?:\/|$)/i, + /(?:^|\/)\.yarn\/build-state\.yml$/i, + /(?:^|\/)(?:\.next|\.nuxt|\.svelte-kit|\.astro|\.vite|\.turbo|\.nx|\.parcel-cache|\.cache|\.serverless|\.vercel|\.netlify|\.nyc_output|\.test-artifacts)(?:\/|$)/i, + /\.(?:min\.js|min\.css|map|tsbuildinfo|d\.ts\.map|log)$/i, + /(?:^|\/)\.DS_Store$/i, +] + +function normalizePathLike(value: string): string { + return value.replaceAll('\\', '/').replace(/\/{2,}/g, '/') +} + +function globToRegExp(pattern: string): RegExp { + const wildcardCount = 
[...pattern].filter((character) => character === '*').length + if (pattern.length > 512 || wildcardCount > 32) { + return /^$/ + } + + const escaped = pattern.replace(/[.+^${}()|[\]\\]/g, '\\$&') + const wildcarded = escaped.replace(/\*/g, '.*').replace(/\?/g, '.') + return new RegExp(`^${wildcarded}$`) +} + +function matchesPatternValue(value: string, pattern: string): boolean { + return globToRegExp(pattern).test(value) +} + +function relativeWorkspacePath(path: string, root: string): string | null { + const resolvedRoot = resolve(root) + const resolvedPath = path.startsWith(sep) ? resolve(path) : resolve(resolvedRoot, path) + const relativePath = normalizePathLike(relative(resolvedRoot, resolvedPath)) + return relativePath.startsWith('..') ? null : relativePath +} + +function matchesWorkspacePattern(relativePath: string, originalPath: string, pattern: string): boolean { + const normalizedPattern = pattern.replace(/^!/, '').replace(/^\/+|\/+$/g, '') + if (!normalizedPattern) { + return false + } + + const fileName = basename(originalPath) + if (matchesPatternValue(relativePath, normalizedPattern) || matchesPatternValue(fileName, normalizedPattern)) { + return true + } + + const pathParts = relativePath.split('/') + for (let index = 0; index < pathParts.length; index += 1) { + const part = pathParts[index] + if (!part) { + continue + } + const prefix = pathParts.slice(0, index + 1).join('/') + if (matchesPatternValue(part, normalizedPattern) || matchesPatternValue(prefix, normalizedPattern)) { + return true + } + } + + return false +} + +export function normalizeSourcePath(path: string): string { + return normalizePathLike(path) +} + +function workspaceAwarePath(path: string, root?: string): string { + if (!root) { + return normalizeSourcePath(path) + } + + const relativePath = relativeWorkspacePath(path, root) + return relativePath ? 
normalizeSourcePath(relativePath) : normalizeSourcePath(path) +} + +export function isHardIgnoredPath(path: string): boolean { + const normalizedPath = normalizeSourcePath(path) + return HARD_IGNORE_REGEXES.some((pattern) => pattern.test(normalizedPath)) +} + +export function isDiscoveryPathIgnored(path: string, root: string, patterns: readonly string[]): boolean { + const relativePath = relativeWorkspacePath(path, root) + if (!relativePath) { + return false + } + + let ignored = isHardIgnoredPath(relativePath) + for (const rawPattern of patterns) { + const pattern = rawPattern.trim() + if (!pattern) { + continue + } + const negated = pattern.startsWith('!') + if (matchesWorkspacePattern(relativePath, path, pattern)) { + ignored = !negated + } + } + return ignored +} + +export function isIgnoredByPatterns(path: string, root: string, patterns: readonly string[]): boolean { + const relativePath = relativeWorkspacePath(path, root) + if (!relativePath) { + return false + } + + let ignored = false + for (const rawPattern of patterns) { + const pattern = rawPattern.trim() + if (!pattern) { + continue + } + const negated = pattern.startsWith('!') + if (matchesWorkspacePattern(relativePath, path, pattern)) { + ignored = !negated + } + } + return ignored +} + +export function loadGraphifyignorePatterns(root: string): string[] { + try { + const content = readFileSync(resolve(root, '.graphifyignore'), 'utf8') + return content + .split(/\r?\n/) + .map((line) => line.trim()) + .filter((line) => line.length > 0 && !line.startsWith('#')) + } catch { + return [] + } +} + +export function classifySourceDomain(path: string, root?: string): SourceDomain { + const normalizedPath = workspaceAwarePath(path, root).toLowerCase() + if (!normalizedPath) { + return 'unknown' + } + if (isHardIgnoredPath(normalizedPath)) { + return 'build_artifact' + } + if (BUILD_ARTIFACT_DOMAIN_RE.test(normalizedPath)) { + return 'build_artifact' + } + if (TEST_DOMAIN_RE.test(normalizedPath)) { + return 
'test' + } + if (BENCHMARK_DOMAIN_RE.test(normalizedPath)) { + return 'benchmark' + } + if (FIXTURE_DOMAIN_RE.test(normalizedPath)) { + return 'fixture' + } + if (GENERATED_DOMAIN_RE.test(normalizedPath)) { + return 'generated' + } + if (DOCS_DOMAIN_RE.test(normalizedPath)) { + return 'docs' + } + if (CONFIG_DOMAIN_RE.test(normalizedPath)) { + return 'config' + } + return /\.[A-Za-z0-9]+$/i.test(normalizedPath) ? 'production' : 'unknown' +} + +export function isPollutedSourcePath(path: string, root?: string): boolean { + const normalizedPath = workspaceAwarePath(path, root) + return isHardIgnoredPath(normalizedPath) +} diff --git a/tests/unit/benchmark-quality.test.ts b/tests/unit/benchmark-quality.test.ts index 6c2cd69..15adbe4 100644 --- a/tests/unit/benchmark-quality.test.ts +++ b/tests/unit/benchmark-quality.test.ts @@ -1,4 +1,5 @@ -import { mkdirSync, readFileSync, realpathSync, rmSync, writeFileSync } from 'node:fs' +import { mkdirSync, mkdtempSync, readFileSync, realpathSync, rmSync, writeFileSync } from 'node:fs' +import { tmpdir } from 'node:os' import { join } from 'node:path' import { existsSync } from 'node:fs' @@ -238,8 +239,7 @@ describe('retrieval quality benchmark', () => { }) it('reports grounded match rate and query buckets alongside label metrics', () => { - const tempDir = join(process.cwd(), 'graphify-out', 'benchmark-quality-grounding-fixture') - mkdirSync(tempDir, { recursive: true }) + const tempDir = mkdtempSync(join(tmpdir(), 'graphify-benchmark-quality-')) const routeFile = join(tempDir, 'auth-route.ts') writeFileSync(routeFile, ['export function AuthRoute() {', ' return true', '}'].join('\n'), 'utf8') diff --git a/tests/unit/context-pack-diagnostics.test.ts b/tests/unit/context-pack-diagnostics.test.ts index 6db4ffa..d5cb920 100644 --- a/tests/unit/context-pack-diagnostics.test.ts +++ b/tests/unit/context-pack-diagnostics.test.ts @@ -12,6 +12,7 @@ import type { ContextPackRelationship, ContextPackTaskContract, } from 
'../../src/contracts/context-pack.js' +import type { RetrievalGateDecision } from '../../src/contracts/retrieval-gate.js' import { computeContextPackDiagnostics } from '../../src/runtime/context-pack-diagnostics.js' function taskContract(overrides: Partial = {}): ContextPackTaskContract { @@ -80,6 +81,22 @@ function makePack(overrides: Partial = {}): CompiledContext return { ...base, ...overrides } } +function retrievalGate(overrides: Partial = {}): RetrievalGateDecision { + return { + level: 2, + skipped_retrieval: false, + reason: 'test helper', + intent: 'explain', + signals: { + has_pr_diff: false, + has_stack_trace: false, + mentioned_paths: [], + mentioned_symbols: [], + }, + ...overrides, + } +} + describe('computeContextPackDiagnostics', () => { it('emits zero warnings on a healthy pack', () => { const diag = computeContextPackDiagnostics(makePack()) @@ -271,6 +288,94 @@ describe('computeContextPackDiagnostics', () => { expect(diag.quality_score).toEqual(Number(diag.quality_score.toFixed(3))) }) + it('flags excluded_domain_selected when excluded test files survive into the final pack', () => { + const pack = makePack({ + nodes: [ + makeNode({ node_id: 'a', source_file: '/repo/src/service.ts' }), + makeNode({ node_id: 'b', source_file: '/repo/tests/service.spec.ts' }), + makeNode({ node_id: 'c', source_file: '/repo/src/controller.ts' }), + ], + retrieval_gate: { + ...retrievalGate(), + signals: { + ...retrievalGate().signals, + excluded_domains: ['test'], + } as RetrievalGateDecision['signals'] & { excluded_domains: string[] }, + }, + }) + + const diag = computeContextPackDiagnostics(pack) + expect(diag.warnings.map((warning) => warning.kind)).toContain('excluded_domain_selected') + }) + + it('flags polluted_source_path_selected for worktree and graphify output pollution', () => { + const diag = computeContextPackDiagnostics(makePack({ + nodes: [ + makeNode({ node_id: 'a', source_file: '/repo/src/service.ts' }), + makeNode({ node_id: 'b', source_file: 
'/repo/backend/.worktrees/task-1/src/service.ts' }), + makeNode({ node_id: 'c', source_file: '/repo/graphify-out/graph.json' }), + ], + })) + + expect(diag.warnings.map((warning) => warning.kind)).toContain('polluted_source_path_selected') + }) + + it('flags controller-only pipeline packs and missing method anchors for runtime prompts', () => { + const diag = computeContextPackDiagnostics(makePack({ + nodes: [ + makeNode({ node_id: 'controller', label: 'IdeasController', source_file: '/repo/src/ideas.controller.ts', framework_role: 'nest_controller' }), + makeNode({ node_id: 'method', label: 'IdeasController.generateFromProblem', source_file: '/repo/src/ideas.controller.ts', framework_role: 'nest_controller' }), + makeNode({ node_id: 'sibling', label: 'IdeasController.listIdeas', source_file: '/repo/src/ideas.controller.ts', framework_role: 'nest_controller' }), + ], + relationships: [ + makeRelationship('IdeasController', 'IdeasController.generateFromProblem', 'controller_route'), + ], + retrieval_gate: { + ...retrievalGate({ + reason: 'pipeline prompt', + }), + signals: { + ...retrievalGate().signals, + mentioned_symbols: ['IdeasController.generateFromProblem'], + }, + }, + task_contract: taskContract({ + prompt: 'Explain the runtime pipeline for IdeasController.generateFromProblem through the service and orchestrator path.', + }), + slice: { + mode: 'explain', + anchors: [{ label: 'IdeasController', reason: 'symbol mention' }], + directions: ['forward'], + selected_paths: [{ from: 'IdeasController', to: 'IdeasController.generateFromProblem', relation: 'controller_route', direction: 'forward' }], + }, + })) + + expect(diag.warnings.map((warning) => warning.kind)).toEqual(expect.arrayContaining([ + 'controller_only_pipeline_pack', + 'missing_method_anchor', + 'missing_runtime_pipeline', + ])) + }) + + it('flags test-dominated production packs and missing structural evidence', () => { + const diag = computeContextPackDiagnostics(makePack({ + nodes: [ + makeNode({ 
node_id: 'a', source_file: '/repo/tests/service.spec.ts', framework_role: 'nest_controller' }), + makeNode({ node_id: 'b', source_file: '/repo/benchmarks/service.bench.ts' }), + makeNode({ node_id: 'c', source_file: '/repo/tests/helper.ts' }), + ], + relationships: [], + task_contract: taskContract({ + prompt: 'Explain the production runtime pipeline for report generation.', + }), + })) + + expect(diag.warnings.map((warning) => warning.kind)).toEqual(expect.arrayContaining([ + 'test_dominated_pack', + 'missing_structural_evidence', + ])) + }) + it('signals.avg_match_score is NaN when no scored nodes exist', () => { const diag = computeContextPackDiagnostics(makePack({ nodes: [ diff --git a/tests/unit/detect.test.ts b/tests/unit/detect.test.ts index be043da..02d6d8f 100644 --- a/tests/unit/detect.test.ts +++ b/tests/unit/detect.test.ts @@ -60,7 +60,7 @@ describe('detect', () => { } }) - it('includes saved graphify memory notes while keeping generated artifacts ignored', () => { + it('ignores graphify-out artifacts entirely by default', () => { const root = createTempRoot() try { mkdirSync(join(root, 'graphify-out', 'memory'), { recursive: true }) @@ -70,7 +70,7 @@ describe('detect', () => { const result = detect(root) - expect(normalizeAssertionPaths(result.files.document)).toContain(normalizeAssertionPath(join(root, 'graphify-out', 'memory', 'query_auth.md'))) + expect(normalizeAssertionPaths(result.files.document)).not.toContain(normalizeAssertionPath(join(root, 'graphify-out', 'memory', 'query_auth.md'))) expect(result.files.document.some((filePath) => filePath.endsWith('GRAPH_REPORT.md'))).toBe(false) expect( Object.values(result.files) @@ -195,11 +195,8 @@ describe('detect', () => { describe('noise filtering', () => { const noiseDirs = [ - '__tests__', 'tests', 'test', 'spec', 'specs', - 'e2e', 'cypress', 'playwright', 'coverage', '.nyc_output', - '.storybook', 'storybook-static', - 'fixtures', '__fixtures__', '__mocks__', 'mocks', + 'storybook-static', ] for 
(const dir of noiseDirs) { @@ -220,7 +217,7 @@ describe('detect', () => { }) } - it('excludes *.test.ts files even outside test dirs', () => { + it('indexes *.test.ts files even outside test dirs', () => { const root = createTempRoot() try { writeFileSync(join(root, 'util.test.ts'), 'export {}', 'utf8') @@ -228,14 +225,14 @@ describe('detect', () => { const result = detect(root) - expect(result.files.code.some((f) => f.endsWith('util.test.ts'))).toBe(false) + expect(result.files.code.some((f) => f.endsWith('util.test.ts'))).toBe(true) expect(result.files.code.some((f) => f.endsWith('util.ts'))).toBe(true) } finally { rmSync(root, { recursive: true, force: true }) } }) - it('excludes *.spec.tsx files', () => { + it('indexes *.spec.tsx files', () => { const root = createTempRoot() try { writeFileSync(join(root, 'Button.spec.tsx'), 'export {}', 'utf8') @@ -243,7 +240,7 @@ describe('detect', () => { const result = detect(root) - expect(result.files.code.some((f) => f.endsWith('Button.spec.tsx'))).toBe(false) + expect(result.files.code.some((f) => f.endsWith('Button.spec.tsx'))).toBe(true) expect(result.files.code.some((f) => f.endsWith('Button.tsx'))).toBe(true) } finally { rmSync(root, { recursive: true, force: true }) @@ -265,7 +262,7 @@ describe('detect', () => { } }) - it('excludes vitest.config.ts', () => { + it('indexes vitest.config.ts', () => { const root = createTempRoot() try { writeFileSync(join(root, 'vitest.config.ts'), 'export default {}', 'utf8') @@ -273,14 +270,14 @@ describe('detect', () => { const result = detect(root) - expect(result.files.code.some((f) => f.endsWith('vitest.config.ts'))).toBe(false) + expect(result.files.code.some((f) => f.endsWith('vitest.config.ts'))).toBe(true) expect(result.files.code.some((f) => f.endsWith('real.ts'))).toBe(true) } finally { rmSync(root, { recursive: true, force: true }) } }) - it('excludes jest.config.js', () => { + it('indexes jest.config.js', () => { const root = createTempRoot() try { 
writeFileSync(join(root, 'jest.config.js'), 'module.exports = {}', 'utf8') @@ -288,7 +285,7 @@ describe('detect', () => { const result = detect(root) - expect(result.files.code.some((f) => f.endsWith('jest.config.js'))).toBe(false) + expect(result.files.code.some((f) => f.endsWith('jest.config.js'))).toBe(true) expect(result.files.code.some((f) => f.endsWith('real.ts'))).toBe(true) } finally { rmSync(root, { recursive: true, force: true }) @@ -308,7 +305,7 @@ describe('detect', () => { } }) - it('excludes setupTests.ts', () => { + it('indexes setupTests.ts', () => { const root = createTempRoot() try { writeFileSync(join(root, 'setupTests.ts'), 'export {}', 'utf8') @@ -316,14 +313,14 @@ describe('detect', () => { const result = detect(root) - expect(result.files.code.some((f) => f.endsWith('setupTests.ts'))).toBe(false) + expect(result.files.code.some((f) => f.endsWith('setupTests.ts'))).toBe(true) expect(result.files.code.some((f) => f.endsWith('real.ts'))).toBe(true) } finally { rmSync(root, { recursive: true, force: true }) } }) - it('excludes *.mock.ts files', () => { + it('indexes *.mock.ts files', () => { const root = createTempRoot() try { writeFileSync(join(root, 'api.mock.ts'), 'export {}', 'utf8') @@ -331,14 +328,14 @@ describe('detect', () => { const result = detect(root) - expect(result.files.code.some((f) => f.endsWith('api.mock.ts'))).toBe(false) + expect(result.files.code.some((f) => f.endsWith('api.mock.ts'))).toBe(true) expect(result.files.code.some((f) => f.endsWith('api.ts'))).toBe(true) } finally { rmSync(root, { recursive: true, force: true }) } }) - it('excludes jest.setup.ts', () => { + it('indexes jest.setup.ts', () => { const root = createTempRoot() try { writeFileSync(join(root, 'jest.setup.ts'), 'export {}', 'utf8') @@ -346,11 +343,38 @@ describe('detect', () => { const result = detect(root) - expect(result.files.code.some((f) => f.endsWith('jest.setup.ts'))).toBe(false) + expect(result.files.code.some((f) => 
f.endsWith('jest.setup.ts'))).toBe(true) expect(result.files.code.some((f) => f.endsWith('real.ts'))).toBe(true) } finally { rmSync(root, { recursive: true, force: true }) } }) }) + + it('hard-ignores nested worktrees, graphify-out artifacts, and build outputs but keeps tests and benchmarks', () => { + const root = createTempRoot() + try { + mkdirSync(join(root, 'backend', '.worktrees', 'copy', 'src'), { recursive: true }) + mkdirSync(join(root, 'graphify-out'), { recursive: true }) + mkdirSync(join(root, 'dist'), { recursive: true }) + mkdirSync(join(root, 'src', '__tests__'), { recursive: true }) + mkdirSync(join(root, 'benchmarks'), { recursive: true }) + writeFileSync(join(root, 'backend', '.worktrees', 'copy', 'src', 'foo.ts'), 'export const stale = true', 'utf8') + writeFileSync(join(root, 'graphify-out', 'graph.json'), '{}', 'utf8') + writeFileSync(join(root, 'dist', 'compiled.js'), 'module.exports = 1', 'utf8') + writeFileSync(join(root, 'src', '__tests__', 'foo.spec.ts'), 'export {}', 'utf8') + writeFileSync(join(root, 'benchmarks', 'report.bench.ts'), 'export {}', 'utf8') + + const result = detect(root) + const codePaths = normalizeAssertionPaths(result.files.code) + + expect(codePaths.some((filePath) => filePath.includes('/.worktrees/'))).toBe(false) + expect(codePaths.some((filePath) => filePath.endsWith('/graphify-out/graph.json'))).toBe(false) + expect(codePaths.some((filePath) => filePath.endsWith('/dist/compiled.js'))).toBe(false) + expect(codePaths).toContain(normalizeAssertionPath(join(root, 'src', '__tests__', 'foo.spec.ts'))) + expect(codePaths).toContain(normalizeAssertionPath(join(root, 'benchmarks', 'report.bench.ts'))) + } finally { + rmSync(root, { recursive: true, force: true }) + } + }) }) diff --git a/tests/unit/generate.test.ts b/tests/unit/generate.test.ts index 452d830..66dda49 100644 --- a/tests/unit/generate.test.ts +++ b/tests/unit/generate.test.ts @@ -4304,7 +4304,7 @@ describe('generateGraph', () => { }) }) - test('includes saved 
memory notes from graphify-out/memory with frontmatter metadata and references', () => { + test('does not re-index saved memory notes from graphify-out/memory by default', () => { withTempDir((tempDir) => { writeFileSync(join(tempDir, 'auth.ts'), 'export function authenticate() {\n return true\n}\n', 'utf8') mkdirSync(join(tempDir, 'graphify-out', 'memory'), { recursive: true }) @@ -4333,15 +4333,9 @@ describe('generateGraph', () => { links: Array> } const noteNode = graphData.nodes.find((node) => node.label === 'query_auth.md') - const authNode = graphData.nodes.find((node) => node.label === 'authenticate()') - expect(noteNode).toMatchObject({ - title: 'Auth result', - source_url: 'https://example.com/auth', - captured_at: '2026-04-11T00:00:00Z', - }) - expect(authNode).toBeTruthy() - expect(graphData.links.some((edge) => edge.source === noteNode?.id && edge.target === authNode?.id && edge.relation === 'references')).toBe(true) + expect(noteNode).toBeUndefined() + expect(graphData.links.some((edge) => edge.relation === 'references')).toBe(false) }) }) diff --git a/tests/unit/retrieval-gate.test.ts b/tests/unit/retrieval-gate.test.ts index 3fdf354..2b8c407 100644 --- a/tests/unit/retrieval-gate.test.ts +++ b/tests/unit/retrieval-gate.test.ts @@ -168,6 +168,47 @@ describe('classifyRetrievalLevel — signal extraction', () => { // mentions present → level 2 expect(decision.level).toBe(2) }) + + it('extracts explicit Class.method references even without backticks', () => { + const decision = classify({ prompt: 'Trace IdeasController.generateFromProblem through the runtime pipeline' }) + expect(decision.signals.mentioned_symbols).toContain('IdeasController.generateFromProblem') + }) +}) + +describe('classifyRetrievalLevel — exclusions and negation', () => { + it('does not classify excluded test terms as test intent', () => { + const decision = classify({ prompt: 'Exclude tests but explain the runtime path for report generation.' 
}) + const signals = decision.signals as typeof decision.signals & { excluded_domains?: string[] } + + expect(decision.intent).toBe('explain') + expect(decision.level).toBe(1) + expect(signals.excluded_domains).toContain('test') + }) + + it('keeps positive test prompts classified as test intent', () => { + const decision = classify({ prompt: 'Which tests cover report generation?' }) + expect(decision.intent).toBe('test') + }) + + it('tracks benchmark exclusions without promoting benchmark/test intent', () => { + const decision = classify({ prompt: 'Do not include benchmarks; explain the production pipeline.' }) + const signals = decision.signals as typeof decision.signals & { excluded_domains?: string[] } + + expect(decision.intent).toBe('explain') + expect(signals.excluded_domains).toContain('benchmark') + }) + + it('captures fixture and reporter exclusions', () => { + const decision = classify({ prompt: 'Ignore fixtures and html reporters when you explain the backend flow.' }) + const signals = decision.signals as typeof decision.signals & { + excluded_domains?: string[] + excluded_terms?: string[] + } + + expect(decision.intent).toBe('explain') + expect(signals.excluded_domains).toContain('fixture') + expect(signals.excluded_terms).toEqual(expect.arrayContaining(['html reporters', 'reporters'])) + }) }) describe('classifyRetrievalLevel — refactor intent stays in the 0-2 band', () => { diff --git a/tests/unit/retrieve-production-correctness.test.ts b/tests/unit/retrieve-production-correctness.test.ts new file mode 100644 index 0000000..061450d --- /dev/null +++ b/tests/unit/retrieve-production-correctness.test.ts @@ -0,0 +1,144 @@ +import { describe, expect, it } from 'vitest' + +import { build } from '../../src/pipeline/build.js' +import { computeContextPackDiagnostics } from '../../src/runtime/context-pack-diagnostics.js' +import { contextPackFromRetrieveResult, retrieveContext } from '../../src/runtime/retrieve.js' + +function buildProductionPipelineGraph() { 
+ return build( + [ + { + schema_version: 1, + nodes: [ + { id: 'ideas_controller', label: 'IdeasController', file_type: 'code', source_file: '/src/ideas/ideas.controller.ts', source_location: 'L5', node_kind: 'class', framework: 'nestjs', framework_role: 'nest_controller', community: 0 }, + { id: 'ideas_method', label: 'IdeasController.generateFromProblem', file_type: 'code', source_file: '/src/ideas/ideas.controller.ts', source_location: 'L18', node_kind: 'method', framework: 'nestjs', framework_role: 'nest_controller', community: 0 }, + { id: 'ideas_method_duplicate', label: 'IdeasController.generateFromProblem', file_type: 'code', source_file: '/backend/.worktrees/copy/src/ideas/ideas.controller.ts', source_location: 'L18', node_kind: 'method', framework: 'nestjs', framework_role: 'nest_controller', community: 8 }, + { id: 'ideas_list', label: 'IdeasController.listIdeas', file_type: 'code', source_file: '/src/ideas/ideas.controller.ts', source_location: 'L42', node_kind: 'method', framework: 'nestjs', framework_role: 'nest_controller', community: 0 }, + { id: 'ideas_health', label: 'IdeasController.health', file_type: 'code', source_file: '/src/ideas/ideas.controller.ts', source_location: 'L50', node_kind: 'method', framework: 'nestjs', framework_role: 'nest_controller', community: 0 }, + { id: 'ideas_route', label: 'POST /ideas/generate', file_type: 'code', source_file: '/src/ideas/ideas.routes.ts', source_location: 'L4', node_kind: 'route', framework: 'nestjs', framework_role: 'nest_route', community: 0 }, + { id: 'ideas_service', label: 'IdeasService', file_type: 'code', source_file: '/src/ideas/ideas.service.ts', source_location: 'L5', node_kind: 'class', framework: 'nestjs', framework_role: 'nest_provider', community: 1 }, + { id: 'ideas_service_method', label: 'IdeasService.generateFromProblem', file_type: 'code', source_file: '/src/ideas/ideas.service.ts', source_location: 'L21', node_kind: 'method', framework: 'nestjs', framework_role: 'nest_provider', 
community: 1 }, + { id: 'report_orchestrator', label: 'ReportOrchestrator.run', file_type: 'code', source_file: '/src/reporting/report-orchestrator.ts', source_location: 'L9', node_kind: 'method', framework_role: 'orchestrator', community: 2 }, + { id: 'research_agent', label: 'ResearchAgent.search', file_type: 'code', source_file: '/src/research/research-agent.ts', source_location: 'L11', node_kind: 'method', community: 3 }, + { id: 'scoring_service', label: 'ScoringService.score', file_type: 'code', source_file: '/src/scoring/scoring.service.ts', source_location: 'L12', node_kind: 'method', community: 4 }, + { id: 'report_repository', label: 'ReportRepository.save', file_type: 'code', source_file: '/src/persistence/report.repository.ts', source_location: 'L8', node_kind: 'method', framework_role: 'repository', community: 5 }, + { id: 'runtime_config', label: 'VALIDATION_PROVIDER', file_type: 'code', source_file: '/src/config/runtime.ts', source_location: 'L3', node_kind: 'function', community: 6 }, + { id: 'ideas_test', label: 'IdeasController.generateFromProblem.spec', file_type: 'code', source_file: '/src/__tests__/ideas.controller.spec.ts', source_location: 'L5', node_kind: 'function', community: 7 }, + { id: 'html_reporter', label: 'HtmlReporter.render', file_type: 'code', source_file: '/benchmarks/html-reporter.ts', source_location: 'L14', node_kind: 'method', community: 7 }, + { id: 'report_fixture', label: 'idea-report.fixture', file_type: 'code', source_file: '/fixtures/idea-report.fixture.ts', source_location: 'L2', node_kind: 'function', community: 7 }, + ], + edges: [ + { source: 'ideas_controller', target: 'ideas_method', relation: 'controller_route', confidence: 'EXTRACTED', source_file: '/src/ideas/ideas.controller.ts' }, + { source: 'ideas_controller', target: 'ideas_list', relation: 'controller_route', confidence: 'EXTRACTED', source_file: '/src/ideas/ideas.controller.ts' }, + { source: 'ideas_controller', target: 'ideas_health', relation: 
'controller_route', confidence: 'EXTRACTED', source_file: '/src/ideas/ideas.controller.ts' }, + { source: 'ideas_route', target: 'ideas_method', relation: 'route_handler', confidence: 'EXTRACTED', source_file: '/src/ideas/ideas.routes.ts' }, + { source: 'ideas_controller', target: 'ideas_service', relation: 'injects', confidence: 'EXTRACTED', source_file: '/src/ideas/ideas.controller.ts' }, + { source: 'ideas_method', target: 'ideas_service_method', relation: 'calls', confidence: 'EXTRACTED', source_file: '/src/ideas/ideas.controller.ts' }, + { source: 'ideas_service_method', target: 'report_orchestrator', relation: 'calls', confidence: 'EXTRACTED', source_file: '/src/ideas/ideas.service.ts' }, + { source: 'report_orchestrator', target: 'research_agent', relation: 'calls', confidence: 'EXTRACTED', source_file: '/src/reporting/report-orchestrator.ts' }, + { source: 'report_orchestrator', target: 'scoring_service', relation: 'calls', confidence: 'EXTRACTED', source_file: '/src/reporting/report-orchestrator.ts' }, + { source: 'report_orchestrator', target: 'report_repository', relation: 'calls', confidence: 'EXTRACTED', source_file: '/src/reporting/report-orchestrator.ts' }, + { source: 'ideas_service_method', target: 'runtime_config', relation: 'reads_env', confidence: 'EXTRACTED', source_file: '/src/ideas/ideas.service.ts' }, + { source: 'ideas_service_method', target: 'ideas_test', relation: 'covered_by', confidence: 'EXTRACTED', source_file: '/src/ideas/ideas.service.ts' }, + { source: 'html_reporter', target: 'report_fixture', relation: 'uses', confidence: 'EXTRACTED', source_file: '/benchmarks/html-reporter.ts' }, + ], + }, + ], + { directed: true }, + ) +} + +function retrieve(prompt: string) { + return retrieveContext(buildProductionPipelineGraph(), { + question: prompt, + budget: 4000, + retrievalStrategy: 'slice-v1', + }) +} + +describe('retrieveContext production retrieval regressions', () => { + it('follows the production runtime path while suppressing 
excluded test and benchmark domains', () => { + const result = retrieve( + 'Explain the production backend pipeline that generates an idea validation report. Exclude tests, benchmarks, fixtures, html reporters, and reporter utilities.', + ) + + const labels = result.matched_nodes.map((node) => node.label) + const sourceFiles = result.matched_nodes.map((node) => node.source_file) + const signals = result.retrieval_gate?.signals as NonNullable['signals'] & { + excluded_domains?: string[] + excluded_terms?: string[] + } + + expect(result.retrieval_gate?.intent).toBe('explain') + expect(signals.excluded_domains).toEqual(expect.arrayContaining(['test', 'benchmark', 'fixture'])) + expect(signals.excluded_terms).toEqual(expect.arrayContaining(['html reporters', 'reporter utilities'])) + expect(labels).toEqual(expect.arrayContaining([ + 'IdeasController.generateFromProblem', + 'IdeasService', + 'IdeasService.generateFromProblem', + 'ReportOrchestrator.run', + 'ResearchAgent.search', + 'ScoringService.score', + 'ReportRepository.save', + ])) + expect(labels).not.toContain('IdeasController.listIdeas') + expect(labels).not.toContain('IdeasController.health') + expect(labels).not.toContain('IdeasController.generateFromProblem.spec') + expect(labels).not.toContain('HtmlReporter.render') + expect(labels).not.toContain('idea-report.fixture') + expect(sourceFiles.some((sourceFile) => sourceFile.includes('/.worktrees/'))).toBe(false) + }) + + it('promotes explicit Class.method prompts to method anchors instead of controller-class anchors', () => { + const result = retrieve( + 'Explain the production runtime path for IdeasController.generateFromProblem and how it creates a validation report. Follow the controller into service/orchestrator/job/research agents/scoring/report builder/persistence. 
Exclude tests, benchmarks, fixtures, html reporters, and reporter utilities.', + ) + + expect(result.slice?.anchors[0]).toEqual( + expect.objectContaining({ + label: 'IdeasController.generateFromProblem', + reason: 'symbol mention', + }), + ) + expect(result.slice?.selected_paths).toEqual(expect.arrayContaining([ + expect.objectContaining({ from: 'IdeasController.generateFromProblem', to: 'IdeasService.generateFromProblem', relation: 'calls' }), + expect.objectContaining({ from: 'IdeasService.generateFromProblem', to: 'ReportOrchestrator.run', relation: 'calls' }), + expect.objectContaining({ from: 'ReportOrchestrator.run', to: 'ResearchAgent.search', relation: 'calls' }), + expect.objectContaining({ from: 'ReportOrchestrator.run', to: 'ScoringService.score', relation: 'calls' }), + expect.objectContaining({ from: 'ReportOrchestrator.run', to: 'ReportRepository.save', relation: 'calls' }), + ])) + expect(result.matched_nodes.map((node) => node.label)).not.toEqual(expect.arrayContaining([ + 'IdeasController.listIdeas', + 'IdeasController.health', + ])) + }) + + it('uses truthful anchor reasons for lexical source matches vs literal paths', () => { + const symbolPrompt = retrieve( + 'Explain how generateFromProblem creates the validation report without using test or benchmark files.', + ) + const pathPrompt = retrieve( + 'Explain src/ideas/ideas.controller.ts and how it creates a validation report.', + ) + + expect(symbolPrompt.slice?.anchors.some((anchor) => anchor.reason === 'path mention')).toBe(false) + expect(pathPrompt.slice?.anchors).toEqual( + expect.arrayContaining([ + expect.objectContaining({ reason: 'path mention' }), + ]), + ) + }) + + it('does not emit bad-pack warnings for the recovered production pipeline slice', () => { + const result = retrieve( + 'Explain the production runtime path for IdeasController.generateFromProblem and how it creates a validation report. 
Follow the controller into service/orchestrator/job/research agents/scoring/report builder/persistence. Exclude tests, benchmarks, fixtures, html reporters, and reporter utilities.', + ) + const diagnostics = computeContextPackDiagnostics(contextPackFromRetrieveResult(result)) + const warningKinds = diagnostics.warnings.map((warning) => warning.kind) + + expect(warningKinds).not.toContain('controller_only_pipeline_pack') + expect(warningKinds).not.toContain('missing_runtime_pipeline') + expect(warningKinds).not.toContain('excluded_domain_selected') + expect(warningKinds).not.toContain('polluted_source_path_selected') + }) +}) diff --git a/tests/unit/spi-build.test.ts b/tests/unit/spi-build.test.ts index 527beb0..25f69b3 100644 --- a/tests/unit/spi-build.test.ts +++ b/tests/unit/spi-build.test.ts @@ -90,13 +90,15 @@ describe('buildSpiFileLayer (slice 1a of #72)', () => { expect(findFile(spi, 'src/script.js').language).toBe('javascript') }) - it('skips node_modules, dist, build, .next, coverage, .git, graphify-out, .test-artifacts', () => { - for (const dir of ['node_modules', 'dist', 'build', '.next', 'coverage', '.git', 'graphify-out', '.test-artifacts']) { + it('skips nested worktrees and generated/build outputs while keeping test sources indexable', () => { + for (const dir of ['node_modules', 'dist', 'build', '.next', 'coverage', '.git', 'graphify-out', '.test-artifacts', '.worktrees']) { writeFile(sandbox, `${dir}/leak.ts`, 'export const x = 1\n') } + writeFile(sandbox, 'backend/.worktrees/task-1/src/duplicate.ts', 'export const duplicate = 1\n') + writeFile(sandbox, 'src/__tests__/keep.spec.ts', 'export const keep = 1\n') writeFile(sandbox, 'src/keep.ts', 'export const k = 1\n') const spi = build(sandbox) - expect(spi.files.map((f) => f.path)).toEqual(['src/keep.ts']) + expect(spi.files.map((f) => f.path)).toEqual(['src/__tests__/keep.spec.ts', 'src/keep.ts']) }) it('produces deterministic output between two runs of the same workspace', { timeout: 30_000 }, () 
=> { From 3dd1ecf7628d3f5b65f198c1cab00d46922cda4f Mon Sep 17 00:00:00 2001 From: mohammed naji Date: Tue, 12 May 2026 08:02:31 +0400 Subject: [PATCH 2/3] Fix retrieval CI regressions Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/runtime/context-pack-diagnostics.ts | 4 +- src/runtime/context-pack.ts | 3 + src/runtime/retrieval-gate.ts | 2 +- src/runtime/retrieve.ts | 89 +++++++++++++++++++++---- src/shared/source-discovery.ts | 27 +++++++- tests/unit/detect.test.ts | 15 +++++ tests/unit/retrieval-gate.test.ts | 5 ++ tests/unit/retrieve.test.ts | 33 +++++++++ 8 files changed, 161 insertions(+), 17 deletions(-) diff --git a/src/runtime/context-pack-diagnostics.ts b/src/runtime/context-pack-diagnostics.ts index 2ec96a2..f3821bd 100644 --- a/src/runtime/context-pack-diagnostics.ts +++ b/src/runtime/context-pack-diagnostics.ts @@ -371,7 +371,9 @@ function selectedMethodAnchor(pack: CompiledContextPack): boolean { function computeQualityScore(warnings: ContextPackDiagnosticWarning[]): number { // Keep the quality-score denominator stable as diagnostics expand so // historical scores remain comparable. New warnings still deduct via the - // numerator, but don't dilute the old baseline. + // numerator (triggeredWeight via RULE_WEIGHTS), but don't dilute the old + // baseline. Raw scores can dip below zero when triggeredWeight exceeds + // totalWeight; the clamp intentionally floors those expanded warning sets at 0. 
const totalWeight = 10 let triggeredWeight = 0 for (const warning of warnings) { diff --git a/src/runtime/context-pack.ts b/src/runtime/context-pack.ts index f330277..ee6cec5 100644 --- a/src/runtime/context-pack.ts +++ b/src/runtime/context-pack.ts @@ -860,6 +860,9 @@ function computeContextCandidateValue( pushUnique(reasons, 'source path match') } + // Reward explicitly tagged non-production domains once they survive the + // penalty gate above so score/reasons reflect why pushUnique records that + // this candidate matched a permitted test/benchmark/config-style source. if (view.source_domain !== 'production' && view.source_domain !== 'unknown') { score += 0.25 pushUnique(reasons, `${view.source_domain} domain`) diff --git a/src/runtime/retrieval-gate.ts b/src/runtime/retrieval-gate.ts index e3714f8..c681f9f 100644 --- a/src/runtime/retrieval-gate.ts +++ b/src/runtime/retrieval-gate.ts @@ -72,7 +72,7 @@ const PATTERNS: ReadonlyArray<{ intent: RetrievalIntent; re: RegExp }> = [ const PATH_RE = /(?:^|\s|`)((?:[\w@./-]+\/)*[\w./@-]+\.[A-Za-z]{1,8})(?=\b|`|$)/g const SYMBOL_BACKTICK_RE = /`([A-Za-z_$][\w$]*(?:\.[A-Za-z_$][\w$]*)*\(?\)?)`/g -const SYMBOL_EXPLICIT_RE = /\b([A-Z][A-Za-z0-9_$]*(?:\.|#|::)[A-Za-z_$][\w$]*\(?\)?|[A-Za-z_$][\w$]{2,}\(\))\b/g +const SYMBOL_EXPLICIT_RE = /\b((?:[A-Za-z_$][\w$]*\.)*[A-Za-z_$][\w$]*(?:\.|#|::)[A-Za-z_$][\w$]*\(?\)?|[A-Za-z_$][\w$]{2,}\(\))\b/g const STACK_TRACE_RE = /(?:^|\n)\s*at\s+\S+\s*\([^)]*:\d+(?::\d+)?\)|Error[:\s]\s+\S/ const EXCLUSION_SPAN_RE = /\b(?:exclude|excluding|ignore|ignoring|omit|omitting|skip|skipping|without|do not include|don't include|not|no)\b\s+(.+?)(?=(?:\s+\b(?:but|while|however|when)\b|[.;\n]|$))/gi diff --git a/src/runtime/retrieve.ts b/src/runtime/retrieve.ts index 551f821..360133b 100644 --- a/src/runtime/retrieve.ts +++ b/src/runtime/retrieve.ts @@ -233,6 +233,54 @@ function averageLabelLengthForGraph(graph: KnowledgeGraph): number { return averageLength } +function 
normalizeAbsoluteGraphPath(sourceFile: string): string | undefined { + const normalized = sourceFile.replace(/\\/g, '/') + if (normalized.startsWith('/') || /^[A-Za-z]:\//.test(normalized)) { + return normalized + } + return undefined +} + +function inferredGraphRoot(graph: KnowledgeGraph): string | undefined { + if (typeof graph.graph.root_path === 'string' && graph.graph.root_path.length > 0) { + return graph.graph.root_path + } + + const absoluteSourceDirs = graph + .nodeEntries() + .map(([, attributes]) => normalizeAbsoluteGraphPath(String(attributes.source_file ?? ''))) + .filter((sourceFile): sourceFile is string => sourceFile !== undefined) + .map((sourceFile) => { + const lastSlash = sourceFile.lastIndexOf('/') + return lastSlash > 0 ? sourceFile.slice(0, lastSlash) : '/' + }) + + const first = absoluteSourceDirs[0] + if (!first) { + return undefined + } + + const segments = first.split('/') + let sharedLength = segments.length + for (const dir of absoluteSourceDirs.slice(1)) { + const parts = dir.split('/') + let matchLength = 0 + while (matchLength < sharedLength && matchLength < parts.length && segments[matchLength] === parts[matchLength]) { + matchLength += 1 + } + sharedLength = matchLength + if (sharedLength === 0) { + break + } + } + + const shared = segments.slice(0, sharedLength).join('/') + if (/^[A-Za-z]:$/.test(shared)) { + return `${shared}/` + } + return shared.length > 0 ? shared : '/' +} + function buildTokenWeights(graph: KnowledgeGraph, questionTokens: readonly string[]): Map { const totalNodes = graph.numberOfNodes() if (totalNodes === 0) return new Map() @@ -738,6 +786,13 @@ function labelSymbolParts(label: string): { className?: string; methodName?: str } } +/** + * Scores explicit symbol-reference strength on a 0-4 scale. + * 4 = exact qualified match, 3.5 = method match with qualifier context in the + * source path, 3 = strong qualified/method context match, 2.5 = bare-name + * match, 0 = no symbol evidence. 
Callers use >= 3 as the "strong anchor" + * threshold when deciding whether a match should be treated as exact. + */ function symbolReferenceMatchScore( label: string, sourceFile: string, @@ -1560,7 +1615,10 @@ function buildRetrieveResultFromOrderedCandidates( export function retrieveContext(graph: KnowledgeGraph, options: RetrieveOptions): RetrieveResult { const { question, budget } = options const questionTokens = tokenizeQuestion(question) - const rootPath = typeof graph.graph.root_path === 'string' ? graph.graph.root_path : process.cwd() + const graphRootPath = typeof graph.graph.root_path === 'string' && graph.graph.root_path.length > 0 + ? graph.graph.root_path + : undefined + const classificationRootPath = inferredGraphRoot(graph) const retrievalGate = classifyRetrievalLevel({ prompt: question, ...(options.retrievalLevel !== undefined ? { manualOverride: options.retrievalLevel } : {}), @@ -1667,7 +1725,7 @@ export function retrieveContext(graph: KnowledgeGraph, options: RetrieveOptions) const label = String(attributes.label ?? '') const sourceFile = String(attributes.source_file ?? '') const nodeKind = String(attributes.node_kind ?? 
'') - const sourceDomain = classifySourceDomain(sourceFile, rootPath) + const sourceDomain = classifySourceDomain(sourceFile, classificationRootPath) const fileNodeLike = isFileNodeLike(label, sourceFile) const symbolMatch = symbolReferenceMatchScore(label, sourceFile, mentionedSymbolRefs) const exactAnchorMatch = symbolMatch >= 3 @@ -1688,7 +1746,7 @@ export function retrieveContext(graph: KnowledgeGraph, options: RetrieveOptions) const exclusionMatches = excludedDomains.includes(sourceDomain as never) || excludedTermMatches(label, excludedTerms, excludedPathHints) || excludedTermMatches(sourceFile, excludedTerms, excludedPathHints) - if ((isPollutedSourcePath(sourceFile, rootPath) || exclusionMatches) && !exactAnchorMatch && !mentionedPathMatch) { + if ((isPollutedSourcePath(sourceFile, classificationRootPath) || exclusionMatches) && !exactAnchorMatch && !mentionedPathMatch) { continue } const sourceDomainPenalty = defaultSourceDomainPenalty(sourceDomain, retrievalGate.intent, question, questionTokens) @@ -1747,11 +1805,11 @@ export function retrieveContext(graph: KnowledgeGraph, options: RetrieveOptions) const cleanExactLabels = new Set( seedCandidates - .filter((candidate) => !isPollutedSourcePath(candidate.sourceFile, rootPath)) + .filter((candidate) => !isPollutedSourcePath(candidate.sourceFile, classificationRootPath)) .map((candidate) => normalizeSeedText(candidate.label)), ) const filteredSeedCandidates = seedCandidates.filter((candidate) => ( - !isPollutedSourcePath(candidate.sourceFile, rootPath) + !isPollutedSourcePath(candidate.sourceFile, classificationRootPath) || candidate.literalPathMatch || !cleanExactLabels.has(normalizeSeedText(candidate.label)) )) @@ -1949,13 +2007,13 @@ export function retrieveContext(graph: KnowledgeGraph, options: RetrieveOptions) } const label = String(attributes.label ?? '') const sourceFile = String(attributes.source_file ?? 
'') - const sourceDomain = classifySourceDomain(sourceFile, rootPath) + const sourceDomain = classifySourceDomain(sourceFile, classificationRootPath) const symbolMatch = symbolReferenceMatchScore(label, sourceFile, mentionedSymbolRefs) const pathMatch = sourceFileMatchesMentionedPath(sourceFile, mentionedPaths) const exclusionMatches = excludedDomains.includes(sourceDomain as never) || excludedTermMatches(label, excludedTerms, excludedPathHints) || excludedTermMatches(sourceFile, excludedTerms, excludedPathHints) - if ((isPollutedSourcePath(sourceFile, rootPath) || exclusionMatches) && symbolMatch <= 0 && !pathMatch) { + if ((isPollutedSourcePath(sourceFile, classificationRootPath) || exclusionMatches) && symbolMatch <= 0 && !pathMatch) { continue } const sourceDomainPenalty = defaultSourceDomainPenalty(sourceDomain, retrievalGate.intent, question, questionTokens) @@ -2066,14 +2124,14 @@ export function retrieveContext(graph: KnowledgeGraph, options: RetrieveOptions) excludedDomains: retrievalGate.signals.excluded_domains, excludedTerms: retrievalGate.signals.excluded_terms, excludedPathHints: retrievalGate.signals.excluded_path_hints, - rootPath, + rootPath: classificationRootPath, }, ) if (sliced) { const scoredById = new Map(scored.map((node) => [node.id, node])) const sliceCandidates = sliced.ordered_ids.map((nodeId, index) => ( - scoredById.get(nodeId) ?? scoredNodeFromGraph(graph, nodeId, Math.max(0.25, 2 - (index * 0.1)), rootPath) + scoredById.get(nodeId) ?? 
scoredNodeFromGraph(graph, nodeId, Math.max(0.25, 2 - (index * 0.1)), classificationRootPath) )) return buildRetrieveResultFromOrderedCandidates( @@ -2084,7 +2142,7 @@ export function retrieveContext(graph: KnowledgeGraph, options: RetrieveOptions) communityLabels, retrieveGraphSignals, retrievalGate, - rootPath, + graphRootPath, sliced.metadata, ) } @@ -2098,7 +2156,7 @@ export function retrieveContext(graph: KnowledgeGraph, options: RetrieveOptions) communityLabels, retrieveGraphSignals, retrievalGate, - rootPath, + graphRootPath, ) } @@ -2115,7 +2173,10 @@ export async function retrieveContextAsync(graph: KnowledgeGraph, options: Retri const frameworkProfile = buildFrameworkQuestionProfile(options.question, questionTokens) const activeFrameworks = activeFrameworksForProfile(frameworkProfile) - const rootPath = typeof graph.graph.root_path === 'string' ? graph.graph.root_path : process.cwd() + const graphRootPath = typeof graph.graph.root_path === 'string' && graph.graph.root_path.length > 0 + ? graph.graph.root_path + : undefined + const classificationRootPath = inferredGraphRoot(graph) const communities = communitiesFromGraph(graph) const communityLabels: Record = { ...buildCommunityLabels(graph, communities), @@ -2146,7 +2207,7 @@ export async function retrieveContextAsync(graph: KnowledgeGraph, options: Retri const questionLower = options.question.toLowerCase() const candidatesById = new Map( eligibleNodeEntries(graph, options) - .map(([id, attributes]) => [id, scoredNodeFromGraphEntry(id, attributes, frameworkProfile, questionLower, rootPath)] as const), + .map(([id, attributes]) => [id, scoredNodeFromGraphEntry(id, attributes, frameworkProfile, questionLower, classificationRootPath)] as const), ) if (candidatesById.size === 0) { return lexicalResult @@ -2228,7 +2289,7 @@ export async function retrieveContextAsync(graph: KnowledgeGraph, options: Retri prompt: options.question, ...(options.retrievalLevel !== undefined ? 
{ manualOverride: options.retrievalLevel } : {}), }), - rootPath, + graphRootPath, lexicalResult.slice, ) } diff --git a/src/shared/source-discovery.ts b/src/shared/source-discovery.ts index 2e18fd4..cda4363 100644 --- a/src/shared/source-discovery.ts +++ b/src/shared/source-discovery.ts @@ -94,8 +94,16 @@ function globToRegExp(pattern: string): RegExp { return /^$/ } + const globstarSegmentToken = '\u0000' + const globstarToken = '\u0001' const escaped = pattern.replace(/[.+^${}()|[\]\\]/g, '\\$&') - const wildcarded = escaped.replace(/\*/g, '.*').replace(/\?/g, '.') + const wildcarded = escaped + .replace(/\*\*\//g, globstarSegmentToken) + .replace(/\*\*/g, globstarToken) + .replace(/\*/g, '[^/]*') + .replace(/\?/g, '[^/]') + .replaceAll(globstarSegmentToken, '(?:.*/)?') + .replaceAll(globstarToken, '.*') return new RegExp(`^${wildcarded}$`) } @@ -104,6 +112,23 @@ function matchesPatternValue(value: string, pattern: string): boolean { } function relativeWorkspacePath(path: string, root: string): string | null { + const normalizedRoot = normalizePathLike(root) + const normalizedPath = normalizePathLike(path) + const windowsAbsoluteRoot = /^[A-Za-z]:\//.test(normalizedRoot) + const windowsAbsolutePath = /^[A-Za-z]:\//.test(normalizedPath) + if (windowsAbsoluteRoot || windowsAbsolutePath) { + const rootPrefix = normalizedRoot.endsWith('/') ? normalizedRoot : `${normalizedRoot}/` + const lowerRoot = normalizedRoot.toLowerCase() + const lowerRootPrefix = rootPrefix.toLowerCase() + const lowerPath = normalizedPath.toLowerCase() + if (lowerPath === lowerRoot) { + return '' + } + return lowerPath.startsWith(lowerRootPrefix) + ? normalizedPath.slice(rootPrefix.length) + : null + } + const resolvedRoot = resolve(root) const resolvedPath = path.startsWith(sep) ? 
resolve(path) : resolve(resolvedRoot, path) const relativePath = normalizePathLike(relative(resolvedRoot, resolvedPath)) diff --git a/tests/unit/detect.test.ts b/tests/unit/detect.test.ts index 02d6d8f..154bbfd 100644 --- a/tests/unit/detect.test.ts +++ b/tests/unit/detect.test.ts @@ -142,6 +142,21 @@ describe('detect', () => { } }) + it('preserves gitignore-style segment semantics in graphifyignore helpers', () => { + const root = createTempRoot() + try { + const srcFile = join(root, 'src', 'main.ts') + const nestedFile = join(root, 'src', 'nested', 'main.ts') + const rootFile = join(root, 'index.ts') + + expect(_isIgnored(srcFile, root, ['src/*.ts'])).toBe(true) + expect(_isIgnored(nestedFile, root, ['src/*.ts'])).toBe(false) + expect(_isIgnored(rootFile, root, ['**/*.ts'])).toBe(true) + } finally { + rmSync(root, { recursive: true, force: true }) + } + }) + it('follows symlinked directories when requested', () => { const root = createTempRoot() try { diff --git a/tests/unit/retrieval-gate.test.ts b/tests/unit/retrieval-gate.test.ts index 2b8c407..9a9c322 100644 --- a/tests/unit/retrieval-gate.test.ts +++ b/tests/unit/retrieval-gate.test.ts @@ -173,6 +173,11 @@ describe('classifyRetrievalLevel — signal extraction', () => { const decision = classify({ prompt: 'Trace IdeasController.generateFromProblem through the runtime pipeline' }) expect(decision.signals.mentioned_symbols).toContain('IdeasController.generateFromProblem') }) + + it('extracts lowercase module.function references', () => { + const decision = classify({ prompt: 'Trace utils.parseDate through the runtime pipeline' }) + expect(decision.signals.mentioned_symbols).toContain('utils.parseDate') + }) }) describe('classifyRetrievalLevel — exclusions and negation', () => { diff --git a/tests/unit/retrieve.test.ts b/tests/unit/retrieve.test.ts index 2ca1db5..89a883f 100644 --- a/tests/unit/retrieve.test.ts +++ b/tests/unit/retrieve.test.ts @@ -2965,6 +2965,39 @@ describe('retrieve', () => { } }) + it('does 
not suppress absolute tmp sources when graph.root_path is unset', () => { + const graph = new KnowledgeGraph() + const sourceFile = process.platform === 'win32' ? 'C:/tmp/auth.ts' : '/tmp/auth.ts' + graph.addNode('auth_service', { + label: 'AuthService', + file_type: 'code', + source_file: sourceFile, + source_location: 'L2', + }) + + const result = retrieveContext(graph, { question: 'AuthService', budget: 3000 }) + + expect(result.matched_nodes).toHaveLength(1) + expect(result.matched_nodes[0]?.label).toBe('AuthService') + expect(result.matched_nodes[0]?.line_number).toBe(2) + }) + + it('does not suppress windows-style absolute tmp sources on non-windows hosts', () => { + const graph = new KnowledgeGraph() + graph.addNode('auth_service', { + label: 'AuthService', + file_type: 'code', + source_file: 'C:/tmp/auth.ts', + source_location: 'L2', + }) + + const result = retrieveContext(graph, { question: 'AuthService', budget: 3000 }) + + expect(result.matched_nodes).toHaveLength(1) + expect(result.matched_nodes[0]?.label).toBe('AuthService') + expect(result.matched_nodes[0]?.line_number).toBe(2) + }) + it('relativizes in-root source files while preserving outside-root matches', () => { const graph = new KnowledgeGraph({ directed: true }) graph.graph.root_path = '/workspace/app' From b06d99b6268f72c4657758bb07669f94b275a9f6 Mon Sep 17 00:00:00 2001 From: mohammed naji Date: Tue, 12 May 2026 08:15:44 +0400 Subject: [PATCH 3/3] Fix exclusion token matching Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/runtime/retrieve.ts | 15 ++++++++++++--- tests/unit/retrieve.test.ts | 27 +++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/src/runtime/retrieve.ts b/src/runtime/retrieve.ts index 360133b..5ed282e 100644 --- a/src/runtime/retrieve.ts +++ b/src/runtime/retrieve.ts @@ -843,10 +843,19 @@ function sourceFileMatchesMentionedPath(sourceFile: string, mentionedPaths: read return mentionedPaths.some((path) => 
sourceFile === path || sourceFile.endsWith(`/${path}`)) } +function exclusionTokens(value: string): Set { + return new Set(tokenizeLabel(value)) +} + function excludedTermMatches(value: string, excludedTerms: readonly string[], excludedPathHints: readonly string[]): boolean { - const lowerValue = value.toLowerCase() - return excludedTerms.some((term) => lowerValue.includes(term.toLowerCase())) - || excludedPathHints.some((hint) => lowerValue.includes(hint.toLowerCase())) + const valueTokens = exclusionTokens(value) + if (valueTokens.size === 0) { + return false + } + + return [...excludedTerms, ...excludedPathHints] + .flatMap((term) => tokenizeLabel(term)) + .some((termToken) => valueTokens.has(termToken)) } function promptAllowsSourceDomain(domain: SourceDomain, intent: string, prompt: string, questionTokens: readonly string[]): boolean { diff --git a/tests/unit/retrieve.test.ts b/tests/unit/retrieve.test.ts index 89a883f..89ef211 100644 --- a/tests/unit/retrieve.test.ts +++ b/tests/unit/retrieve.test.ts @@ -2965,6 +2965,33 @@ describe('retrieve', () => { } }) + it('does not treat exclusion terms as substrings inside production identifiers', () => { + const graph = new KnowledgeGraph() + graph.addNode('contest_service', { + label: 'ContestService', + file_type: 'code', + source_file: '/src/contest/service.ts', + line_number: 12, + node_kind: 'class', + }) + graph.addNode('contest_service_test', { + label: 'ContestService.spec', + file_type: 'code', + source_file: '/src/__tests__/contest.service.spec.ts', + line_number: 4, + node_kind: 'function', + }) + + const result = retrieveContext(graph, { + question: 'Explain ContestService runtime path. 
Exclude tests.', + budget: 3000, + }) + + expect(result.matched_nodes).toHaveLength(1) + expect(result.matched_nodes[0]?.label).toBe('ContestService') + expect(result.matched_nodes[0]?.source_file).toBe('/src/contest/service.ts') + }) + it('does not suppress absolute tmp sources when graph.root_path is unset', () => { const graph = new KnowledgeGraph() const sourceFile = process.platform === 'win32' ? 'C:/tmp/auth.ts' : '/tmp/auth.ts'