From c8b330d8e16097b3d0395322e07505a0089edd82 Mon Sep 17 00:00:00 2001 From: Joel Teply Date: Fri, 17 Apr 2026 18:05:05 -0500 Subject: [PATCH 1/2] =?UTF-8?q?feat(rag):=20Phase=201=20=E2=80=94=20stable?= =?UTF-8?q?-first=20ordering=20for=20prefix-reuse=20(issue=20#918)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds PromptTier enum (INVARIANT / SEMI_STABLE / VOLATILE) and makes every RAGSource declare its tier. RAGComposer sorts collected sections deterministically by (tier, sourceName) before returning. Why: today the composer's parallel section assembly produces a different byte order on every chat call. llama-server / DMR's prefix-KV-cache reuse never fires, so each turn reprocesses the full 14k-token prompt from scratch (~35s prompt eval at 400 tok/s). With deterministic ordering AND stable bytes within each tier, the unchanging INVARIANT prefix gets reused — only the VOLATILE suffix needs evaluation. Expected: ~70× faster prompt eval per turn for repeat-context turns. Architecture (per docs/architecture/MULTIMODAL-WORKER-AND-PREFIX-REUSE.md): - INVARIANT: persona identity, tool definitions, recipe rules, docs (PersonaIdentity, ToolDefinitions, CodeTool, Documentation, ToolMethodology, ProjectContext) - SEMI_STABLE: history, memories, participants, governance — append-only (ConversationHistory, LiveRoomAwareness, Governance, OpenProposals, SentinelAwareness, GlobalAwareness, SocialMediaRAG, SemanticMemory) - VOLATILE: latest message, audio chunks, current activity, UI state (ActivityContext, CodebaseSearch, MediaArtifact, VoiceConversation, WidgetContext) Implementation note: tier is a class-level declaration on each RAGSource (required field, no Option<>). Sources return Omit<RAGSection, 'tier'> from load() and fromBatchResult(); RAGComposer injects the source's declared tier when wrapping the section. Single-source-of-truth classification per source — no per-return-statement repetition.
Phases 2 (slot pinning) and 3 (composition cache) build on this. Phase 4 (multimodal content parts) depends on #917 ModelMetadata. tsc clean. Branch: feature/prefix-reuse-and-multimodal off main. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/system/rag/shared/RAGComposer.ts | 44 ++++++++++++++++-- src/system/rag/shared/RAGSource.ts | 46 +++++++++++++++++-- src/system/rag/shared/RAGTypes.ts | 32 +++++++++++++ .../rag/sources/ActivityContextSource.ts | 4 +- src/system/rag/sources/CodeToolSource.ts | 6 ++- .../rag/sources/CodebaseSearchSource.ts | 4 +- .../rag/sources/ConversationHistorySource.ts | 6 ++- src/system/rag/sources/DocumentationSource.ts | 4 +- .../rag/sources/GlobalAwarenessSource.ts | 10 ++-- src/system/rag/sources/GovernanceSource.ts | 4 +- .../rag/sources/LiveRoomAwarenessSource.ts | 6 ++- src/system/rag/sources/MediaArtifactSource.ts | 4 +- src/system/rag/sources/OpenProposalsSource.ts | 6 ++- .../rag/sources/PersonaIdentitySource.ts | 6 ++- .../rag/sources/ProjectContextSource.ts | 12 +++-- .../rag/sources/SemanticMemorySource.ts | 8 ++-- .../rag/sources/SentinelAwarenessSource.ts | 4 +- .../rag/sources/SocialMediaRAGSource.ts | 8 ++-- .../rag/sources/ToolDefinitionsSource.ts | 10 ++-- .../rag/sources/ToolMethodologySource.ts | 4 +- .../rag/sources/VoiceConversationSource.ts | 6 ++- src/system/rag/sources/WidgetContextSource.ts | 6 ++- 22 files changed, 191 insertions(+), 49 deletions(-) diff --git a/src/system/rag/shared/RAGComposer.ts b/src/system/rag/shared/RAGComposer.ts index d7b24f63b..9f7991423 100644 --- a/src/system/rag/shared/RAGComposer.ts +++ b/src/system/rag/shared/RAGComposer.ts @@ -22,6 +22,15 @@ */ import type { RAGSource, RAGSourceContext, RAGSection, RAGCompositionResult } from './RAGSource'; +import { PromptTier } from './RAGTypes'; + +/** Sort key for tiers — smaller numbers concatenate first. + * INVARIANT before SEMI_STABLE before VOLATILE. See PromptTier docs. 
*/ +const TIER_ORDER: Record = { + [PromptTier.INVARIANT]: 0, + [PromptTier.SEMI_STABLE]: 1, + [PromptTier.VOLATILE]: 2, +}; import type { RagSourceRequest, RagComposeResult, RagSourceResult } from '../../../shared/generated/rag'; import { Logger } from '../../core/logging/Logger'; import { TimingHarness } from '../../core/shared/TimingHarness'; @@ -230,6 +239,19 @@ export class RAGComposer { failedSources.push({ source: result.source, error: result.error }); } } + // Deterministic ordering: sections sorted by (tier, sourceName) so the + // assembled prompt's bytes are identical across requests with identical + // section contents. This is the prerequisite for llama-server / DMR + // prefix-KV-cache reuse — without stable ordering, the same logical + // prompt has different bytes per turn and the cache misses every time. + // Tier order: INVARIANT first, then SEMI_STABLE, then VOLATILE — see + // PromptTier in RAGTypes.ts and docs/architecture/MULTIMODAL-WORKER-AND-PREFIX-REUSE.md + sections.sort((a, b) => { + const tierOrder = TIER_ORDER[a.tier] - TIER_ORDER[b.tier]; + if (tierOrder !== 0) return tierOrder; + // Within a tier: alphabetical by source name. Stable, total order. + return a.sourceName.localeCompare(b.sourceName); + }); timer.mark('collect_results'); // Log ALL source timings for performance diagnosis @@ -316,8 +338,13 @@ export class RAGComposer { if (rustResult.success) { // Convert via source's fromBatchResult method + // Tier injection: same single-source-of-truth as TS path — + // the source's class declaration provides tier; we inject it + // here so the section conforms to RAGSection regardless of + // whether the source's fromBatchResult included it. 
if (sourceInfo.source.fromBatchResult) { - const section = sourceInfo.source.fromBatchResult(rustResult, rustResult.load_time_ms); + const rawSection = sourceInfo.source.fromBatchResult(rustResult, rustResult.load_time_ms); + const section: RAGSection = { ...rawSection, tier: sourceInfo.source.tier }; results.push({ success: true, section, @@ -326,9 +353,11 @@ export class RAGComposer { }); } else { // Fallback: basic conversion + const rawSection = this.defaultFromBatchResult(sourceInfo.source.name, rustResult); + const section: RAGSection = { ...rawSection, tier: sourceInfo.source.tier }; results.push({ success: true, - section: this.defaultFromBatchResult(sourceInfo.source.name, rustResult), + section, sourceName: sourceInfo.source.name, loadTime: rustResult.load_time_ms }); @@ -441,10 +470,14 @@ export class RAGComposer { sourceTimer.setMeta('budget', budget); try { - const section = await source.load(context, budget); + const rawSection = await source.load(context, budget); sourceTimer.mark('load'); - sourceTimer.setMeta('tokenCount', section.tokenCount); + sourceTimer.setMeta('tokenCount', rawSection.tokenCount); const record = sourceTimer.finish(); + // Inject tier from the source's declaration. Sources don't re-state + // their tier on every return; the class-level declaration is the + // single source of truth, applied here. + const section: RAGSection = { ...rawSection, tier: source.tier }; return { success: true, section, sourceName: source.name, loadTime: record.totalMs }; } catch (error: any) { sourceTimer.setError(error.message); @@ -457,8 +490,9 @@ export class RAGComposer { /** * Default conversion from Rust RagSourceResult to TypeScript RAGSection. * Used when source doesn't implement fromBatchResult. + * Returns without `tier` — caller injects from the source's declaration. 
*/ - private defaultFromBatchResult(sourceName: string, result: RagSourceResult): RAGSection { + private defaultFromBatchResult(sourceName: string, result: RagSourceResult): Omit { // Combine all sections into a single content block const content = result.sections .map(s => s.content) diff --git a/src/system/rag/shared/RAGSource.ts b/src/system/rag/shared/RAGSource.ts index 995b6859f..0a46dd7d1 100644 --- a/src/system/rag/shared/RAGSource.ts +++ b/src/system/rag/shared/RAGSource.ts @@ -19,6 +19,10 @@ import type { UUID } from '../../core/types/CrossPlatformUUID'; import type { RAGBuildOptions, LLMMessage, RAGArtifact, PersonaMemory, PersonaIdentity, RecipeStrategy } from './RAGTypes'; +import { PromptTier } from './RAGTypes'; + +// Re-export so source files only need one import +export { PromptTier } from './RAGTypes'; /** * Context passed to each RAGSource for loading @@ -70,6 +74,9 @@ export interface RAGSourceContext { export interface RAGSection { /** Source that produced this section */ readonly sourceName: string; + /** Tier this section belongs to — drives stable-byte-prefix ordering. + * Mirrored from the producing source's declared tier. */ + readonly tier: PromptTier; /** Estimated token count */ readonly tokenCount: number; /** Time taken to load (ms) */ @@ -105,6 +112,29 @@ export interface RAGSource { */ readonly priority: number; + /** + * Tier — INVARIANT / SEMI_STABLE / VOLATILE. + * Required. Drives stable-byte-prefix prompt assembly so llama-server + * reuses KV cache for the unchanging region instead of reprocessing + * the full prompt every turn. + * + * Classification rules: + * - INVARIANT — system prompt fragments, recipe rules, role identity, + * tool definitions. Bytes must be identical across thousands of turns + * for the same persona+recipe. NO timestamps, NO request IDs, NO + * per-request volatile data. + * - SEMI_STABLE — conversation history, memories, participants, + * governance. 
Grows monotonically — append-only relative to the + * previous turn. Earlier bytes never rewritten. + * - VOLATILE — current message, audio chunks, current timestamp, + * per-request observations. The only region the server reprocesses + * token-by-token. + * + * If you can't decide, the source probably mixes tiers and should be + * split into separate sources at the right granularity. + */ + readonly tier: PromptTier; + /** * Default budget allocation as percentage (0-100). * Total across all sources should roughly equal 100. @@ -126,11 +156,16 @@ export interface RAGSource { * Load data from this source. * Called in parallel with other applicable sources. * + * Returns the section without the `tier` field — RAGComposer injects + * the source's declared `tier` into the section after load completes. + * This keeps source implementations focused on what they produce + * rather than re-asserting their tier on every return. + * * @param context - Context for loading * @param allocatedBudget - Token budget allocated to this source - * @returns Section of RAG context + * @returns Section of RAG context (tier added by composer) */ - load(context: RAGSourceContext, allocatedBudget: number): Promise; + load(context: RAGSourceContext, allocatedBudget: number): Promise>; /** * Whether this source produces identical results for all personas in the same room. @@ -168,11 +203,14 @@ export interface RAGSource { * Only called if supportsBatching is true. * Transforms the typed Rust result into the RAGSection format. * + * Returns the section without `tier` — RAGComposer injects the source's + * declared tier after conversion, same as the non-batched path. 
+ * * @param result - The result from Rust's rag/compose endpoint * @param loadTimeMs - How long the load took - * @returns The RAGSection to include in the composition result + * @returns The RAGSection (without tier) to include in the composition result */ - fromBatchResult?(result: RagSourceResult, loadTimeMs: number): RAGSection; + fromBatchResult?(result: RagSourceResult, loadTimeMs: number): Omit; } // Re-export Rust-generated types for batch support diff --git a/src/system/rag/shared/RAGTypes.ts b/src/system/rag/shared/RAGTypes.ts index 351f5d293..3ab546333 100644 --- a/src/system/rag/shared/RAGTypes.ts +++ b/src/system/rag/shared/RAGTypes.ts @@ -18,6 +18,38 @@ import type { RecipeToolDeclaration } from '../../recipes/shared/RecipeTypes'; */ export type RAGDomain = 'chat' | 'academy' | 'game' | 'code' | 'analysis'; +/** + * Prompt tier — declares how often a RAG source's contribution changes between + * requests. Drives stable-byte-prefix prompt assembly so llama-server / vllm / + * DMR can reuse the KV cache for the invariant region instead of reprocessing + * the full prompt every turn. + * + * The contract: a section's bytes must be byte-identical across requests for + * sources at the same tier with the same inputs. INVARIANT and SEMI_STABLE + * sources MUST NOT contain timestamps, request IDs, or any per-request + * volatile data. Those go in VOLATILE only. + * + * Final assembly order is always: INVARIANT → SEMI_STABLE → VOLATILE. + * Within each tier, sources are sorted by name (alphabetical) so the byte + * order is fully deterministic. + * + * See: docs/architecture/MULTIMODAL-WORKER-AND-PREFIX-REUSE.md (Part 1) + */ +export const enum PromptTier { + /** Persona system prompt, recipe rules, role identity, tool definitions. + * Changes ~weekly when persona/recipe is edited. Identical bytes across + * thousands of turns for the same persona+recipe. 
*/ + INVARIANT = 'invariant', + /** Conversation history, active genome adapters, participants, governance + * state. Grows monotonically — new content APPENDS to the existing + * prefix, doesn't rewrite earlier bytes. */ + SEMI_STABLE = 'semi_stable', + /** Latest user message, audio chunks, current timestamp, last-second + * pressure observations. Changes every request. The only region the + * server actually has to reprocess token-by-token. */ + VOLATILE = 'volatile', +} + /** * Model capabilities that affect RAG context building * Determines how artifacts (images, etc.) are processed diff --git a/src/system/rag/sources/ActivityContextSource.ts b/src/system/rag/sources/ActivityContextSource.ts index 1c6cac5e1..5ce9f2599 100644 --- a/src/system/rag/sources/ActivityContextSource.ts +++ b/src/system/rag/sources/ActivityContextSource.ts @@ -8,6 +8,7 @@ */ import type { RAGSource, RAGSection, RAGSourceContext } from '../shared/RAGSource'; +import { PromptTier } from '../shared/RAGSource'; import type { RecipeStrategy } from '../shared/RAGTypes'; import type { RecipeToolDeclaration } from '../../recipes/shared/RecipeTypes'; import { ORM } from '../../../daemons/data-daemon/server/ORM'; @@ -23,6 +24,7 @@ import { isSlowLocalModel } from '../../shared/ModelContextWindows'; */ export class ActivityContextSource implements RAGSource { readonly name = 'activity'; + readonly tier = PromptTier.VOLATILE; readonly isShared = true; // Medium priority - important for guided interactions @@ -36,7 +38,7 @@ export class ActivityContextSource implements RAGSource { return true; } - async load(context: RAGSourceContext, allocatedBudget: number): Promise { + async load(context: RAGSourceContext, allocatedBudget: number): Promise> { const startTime = Date.now(); try { diff --git a/src/system/rag/sources/CodeToolSource.ts b/src/system/rag/sources/CodeToolSource.ts index 40dbe2d54..4e8e202c6 100644 --- a/src/system/rag/sources/CodeToolSource.ts +++ 
b/src/system/rag/sources/CodeToolSource.ts @@ -14,6 +14,7 @@ */ import type { RAGSource, RAGSourceContext, RAGSection } from '../shared/RAGSource'; +import { PromptTier } from '../shared/RAGSource'; import { PersonaToolRegistry } from '../../user/server/modules/PersonaToolRegistry'; import { Logger } from '../../core/logging/Logger'; @@ -70,6 +71,7 @@ const CODE_TOOL_GROUPS: readonly CodeToolGroup[] = [ export class CodeToolSource implements RAGSource { readonly name = 'code-tools'; + readonly tier = PromptTier.INVARIANT; readonly priority = 50; // Medium — below conversation/widget, above learning config readonly defaultBudgetPercent = 5; @@ -84,7 +86,7 @@ export class CodeToolSource implements RAGSource { return tools.some(t => t.name.startsWith('code/')); } - async load(context: RAGSourceContext, allocatedBudget: number): Promise { + async load(context: RAGSourceContext, allocatedBudget: number): Promise> { const startTime = performance.now(); try { @@ -253,7 +255,7 @@ export class CodeToolSource implements RAGSource { return tools.filter(t => t.name.startsWith('code/')).length; } - private emptySection(startTime: number, error?: string): RAGSection { + private emptySection(startTime: number, error?: string): Omit { return { sourceName: this.name, tokenCount: 0, diff --git a/src/system/rag/sources/CodebaseSearchSource.ts b/src/system/rag/sources/CodebaseSearchSource.ts index 38553fcbe..e8c6faa9a 100644 --- a/src/system/rag/sources/CodebaseSearchSource.ts +++ b/src/system/rag/sources/CodebaseSearchSource.ts @@ -13,6 +13,7 @@ */ import type { RAGSource, RAGSourceContext, RAGSection } from '../shared/RAGSource'; +import { PromptTier } from '../shared/RAGSource'; import { getCodebaseIndexer } from '../services/CodebaseIndexer'; import { Logger } from '../../core/logging/Logger'; @@ -29,6 +30,7 @@ const RELEVANCE_THRESHOLD = 0.35; export class CodebaseSearchSource implements RAGSource { readonly name = 'codebase-search'; + readonly tier = PromptTier.VOLATILE; 
readonly priority = 55; readonly defaultBudgetPercent = 8; readonly isShared = true; @@ -43,7 +45,7 @@ export class CodebaseSearchSource implements RAGSource { return currentMessage.length >= MIN_QUERY_LENGTH; } - async load(context: RAGSourceContext, allocatedBudget: number): Promise { + async load(context: RAGSourceContext, allocatedBudget: number): Promise> { const startTime = Date.now(); const query = context.options?.currentMessage?.content as string; diff --git a/src/system/rag/sources/ConversationHistorySource.ts b/src/system/rag/sources/ConversationHistorySource.ts index 6e8101098..7a5a43345 100644 --- a/src/system/rag/sources/ConversationHistorySource.ts +++ b/src/system/rag/sources/ConversationHistorySource.ts @@ -10,6 +10,7 @@ */ import type { RAGSource, RAGSourceContext, RAGSection } from '../shared/RAGSource'; +import { PromptTier } from '../shared/RAGSource'; import type { LLMMessage } from '../shared/RAGTypes'; import { ORM } from '../../../daemons/data-daemon/server/ORM'; import { ChatMessageEntity, type MediaItem } from '../../data/entities/ChatMessageEntity'; @@ -145,6 +146,7 @@ interface InflightEntry { export class ConversationHistorySource implements RAGSource { readonly name = 'conversation-history'; + readonly tier = PromptTier.SEMI_STABLE; readonly priority = 80; // High - conversation is core context readonly defaultBudgetPercent = 25; // Gets largest share of budget @@ -224,7 +226,7 @@ export class ConversationHistorySource implements RAGSource { return true; } - async load(context: RAGSourceContext, allocatedBudget: number): Promise { + async load(context: RAGSourceContext, allocatedBudget: number): Promise> { const startTime = performance.now(); ConversationHistorySource.initEventSubscription(); @@ -564,7 +566,7 @@ export class ConversationHistorySource implements RAGSource { return []; } - private emptySection(startTime: number, error?: string): RAGSection { + private emptySection(startTime: number, error?: string): Omit { return { 
sourceName: this.name, tokenCount: 0, diff --git a/src/system/rag/sources/DocumentationSource.ts b/src/system/rag/sources/DocumentationSource.ts index 97cb27a4e..fd3444077 100644 --- a/src/system/rag/sources/DocumentationSource.ts +++ b/src/system/rag/sources/DocumentationSource.ts @@ -14,6 +14,7 @@ */ import type { RAGSource, RAGSourceContext, RAGSection } from '../shared/RAGSource'; +import { PromptTier } from '../shared/RAGSource'; import { Logger } from '../../core/logging/Logger'; import * as fs from 'fs/promises'; import * as path from 'path'; @@ -45,6 +46,7 @@ const DOC_CHAPTERS: readonly Omit[] = [ export class DocumentationSource implements RAGSource { readonly name = 'documentation'; + readonly tier = PromptTier.INVARIANT; readonly priority = 35; readonly defaultBudgetPercent = 5; readonly isShared = true; @@ -62,7 +64,7 @@ export class DocumentationSource implements RAGSource { return true; } - async load(context: RAGSourceContext, allocatedBudget: number): Promise { + async load(context: RAGSourceContext, allocatedBudget: number): Promise> { const startTime = performance.now(); try { diff --git a/src/system/rag/sources/GlobalAwarenessSource.ts b/src/system/rag/sources/GlobalAwarenessSource.ts index 08d4eaef2..bbccb2c1c 100644 --- a/src/system/rag/sources/GlobalAwarenessSource.ts +++ b/src/system/rag/sources/GlobalAwarenessSource.ts @@ -23,6 +23,7 @@ */ import type { RAGSource, RAGSourceContext, RAGSection } from '../shared/RAGSource'; +import { PromptTier } from '../shared/RAGSource'; import type { RagSourceRequest, RagSourceResult, ConsciousnessSourceMetadata } from '../../../shared/generated/rag'; import type { PersonaUser } from '../../user/server/PersonaUser'; import { Logger } from '../../core/logging/Logger'; @@ -62,6 +63,7 @@ export function getConsciousness(personaId: string): boolean { export class GlobalAwarenessSource implements RAGSource { readonly name = 'global-awareness'; + readonly tier = PromptTier.SEMI_STABLE; readonly priority = 85; 
// After identity (95), before conversation (80) readonly defaultBudgetPercent = 5; readonly isShared = true; @@ -107,7 +109,7 @@ export class GlobalAwarenessSource implements RAGSource { * Convert Rust RagSourceResult to TypeScript RAGSection. * Maps consciousness result to systemPromptSection. */ - fromBatchResult(result: RagSourceResult, loadTimeMs: number): RAGSection { + fromBatchResult(result: RagSourceResult, loadTimeMs: number): Omit { // Consciousness result has formatted_prompt in the first section const formattedPrompt = result.sections .map(s => s.content) @@ -142,7 +144,7 @@ export class GlobalAwarenessSource implements RAGSource { * Note: When batching is enabled, this method is typically not called. * RAGComposer uses getBatchRequest() + fromBatchResult() instead. */ - async load(context: RAGSourceContext, _allocatedBudget: number): Promise { + async load(context: RAGSourceContext, _allocatedBudget: number): Promise> { const startTime = performance.now(); try { @@ -227,7 +229,7 @@ export class GlobalAwarenessSource implements RAGSource { } } - private createEmptySection(loadTimeMs: number): RAGSection { + private createEmptySection(loadTimeMs: number): Omit { return { sourceName: this.name, tokenCount: 0, @@ -236,7 +238,7 @@ export class GlobalAwarenessSource implements RAGSource { }; } - private createErrorSection(loadTimeMs: number, error: string): RAGSection { + private createErrorSection(loadTimeMs: number, error: string): Omit { return { sourceName: this.name, tokenCount: 0, diff --git a/src/system/rag/sources/GovernanceSource.ts b/src/system/rag/sources/GovernanceSource.ts index db15a9145..9d7069484 100644 --- a/src/system/rag/sources/GovernanceSource.ts +++ b/src/system/rag/sources/GovernanceSource.ts @@ -7,6 +7,7 @@ */ import type { RAGSource, RAGSection, RAGSourceContext } from '../shared/RAGSource'; +import { PromptTier } from '../shared/RAGSource'; import { isSlowLocalModel, getContextWindow } from '../../shared/ModelContextWindows'; /** 
@@ -58,6 +59,7 @@ You can propose collective decisions with collaboration/decision/propose and vot */ export class GovernanceSource implements RAGSource { readonly name = 'governance'; + readonly tier = PromptTier.SEMI_STABLE; readonly isShared = true; // Low priority - governance examples are nice-to-have, not critical @@ -78,7 +80,7 @@ export class GovernanceSource implements RAGSource { return true; } - async load(context: RAGSourceContext, allocatedBudget: number): Promise { + async load(context: RAGSourceContext, allocatedBudget: number): Promise> { const startTime = Date.now(); // Determine which version to use based on budget and model capability diff --git a/src/system/rag/sources/LiveRoomAwarenessSource.ts b/src/system/rag/sources/LiveRoomAwarenessSource.ts index 3f7422265..54b40e198 100644 --- a/src/system/rag/sources/LiveRoomAwarenessSource.ts +++ b/src/system/rag/sources/LiveRoomAwarenessSource.ts @@ -16,6 +16,7 @@ */ import type { RAGSource, RAGSection, RAGSourceContext } from '../shared/RAGSource'; +import { PromptTier } from '../shared/RAGSource'; import type { RAGArtifact } from '../shared/RAGTypes'; import { ORM } from '../../../daemons/data-daemon/server/ORM'; import { CallEntity, type CallParticipant } from '../../data/entities/CallEntity'; @@ -27,6 +28,7 @@ const log = Logger.create('LiveRoomAwarenessSource', 'rag'); export class LiveRoomAwarenessSource implements RAGSource { readonly name = 'live-room-awareness'; + readonly tier = PromptTier.SEMI_STABLE; readonly priority = 30; readonly defaultBudgetPercent = 3; @@ -41,7 +43,7 @@ export class LiveRoomAwarenessSource implements RAGSource { return true; } - async load(context: RAGSourceContext, allocatedBudget: number): Promise { + async load(context: RAGSourceContext, allocatedBudget: number): Promise> { const startTime = performance.now(); const call = this.getActiveCall(); @@ -200,7 +202,7 @@ export class LiveRoomAwarenessSource implements RAGSource { } } - private emptySection(loadTimeMs: 
number): RAGSection { + private emptySection(loadTimeMs: number): Omit { return { sourceName: this.name, tokenCount: 0, diff --git a/src/system/rag/sources/MediaArtifactSource.ts b/src/system/rag/sources/MediaArtifactSource.ts index 99f1dcbb7..f66cbb3ff 100644 --- a/src/system/rag/sources/MediaArtifactSource.ts +++ b/src/system/rag/sources/MediaArtifactSource.ts @@ -18,6 +18,7 @@ */ import type { RAGSource, RAGSourceContext, RAGSection } from '../shared/RAGSource'; +import { PromptTier } from '../shared/RAGSource'; import { type RAGArtifact, type MediaArtifactMetadata, hasMediaMetadata } from '../shared/RAGTypes'; import { VisionDescriptionService } from '../../vision/VisionDescriptionService'; import { ConversationHistorySource } from './ConversationHistorySource'; @@ -31,6 +32,7 @@ const TOKENS_PER_IMAGE_BASE64 = 1000; export class MediaArtifactSource implements RAGSource { readonly name = 'media-artifacts'; + readonly tier = PromptTier.VOLATILE; readonly priority = 65; readonly defaultBudgetPercent = 5; @@ -39,7 +41,7 @@ export class MediaArtifactSource implements RAGSource { return context.options.includeArtifacts !== false; } - async load(context: RAGSourceContext, allocatedBudget: number): Promise { + async load(context: RAGSourceContext, allocatedBudget: number): Promise> { const startTime = performance.now(); // Scan window for media. 
Balance between finding images in chatty rooms and diff --git a/src/system/rag/sources/OpenProposalsSource.ts b/src/system/rag/sources/OpenProposalsSource.ts index 246a01812..a860d0798 100644 --- a/src/system/rag/sources/OpenProposalsSource.ts +++ b/src/system/rag/sources/OpenProposalsSource.ts @@ -11,6 +11,7 @@ */ import type { RAGSource, RAGSection, RAGSourceContext } from '../shared/RAGSource'; +import { PromptTier } from '../shared/RAGSource'; import { ORM } from '../../../daemons/data-daemon/server/ORM'; import type { DecisionProposalEntity, DecisionOption, RankedVote } from '../../data/entities/DecisionProposalEntity'; import type { DataRecord } from '../../../daemons/data-daemon/shared/DataStorageAdapter'; @@ -32,7 +33,7 @@ function formatProposal(record: DataRecord): string { ${options}`; } -const EMPTY_SECTION: RAGSection = { +const EMPTY_SECTION: Omit = { sourceName: 'open-proposals', tokenCount: 0, loadTimeMs: 0, @@ -41,6 +42,7 @@ const EMPTY_SECTION: RAGSection = { export class OpenProposalsSource implements RAGSource { readonly name = 'open-proposals'; + readonly tier = PromptTier.SEMI_STABLE; readonly priority = 25; readonly defaultBudgetPercent = 3; @@ -48,7 +50,7 @@ export class OpenProposalsSource implements RAGSource { return true; } - async load(context: RAGSourceContext, allocatedBudget: number): Promise { + async load(context: RAGSourceContext, allocatedBudget: number): Promise> { const startTime = Date.now(); if (allocatedBudget < 30) { diff --git a/src/system/rag/sources/PersonaIdentitySource.ts b/src/system/rag/sources/PersonaIdentitySource.ts index 55427b73b..019ace400 100644 --- a/src/system/rag/sources/PersonaIdentitySource.ts +++ b/src/system/rag/sources/PersonaIdentitySource.ts @@ -18,6 +18,7 @@ */ import type { RAGSource, RAGSourceContext, RAGSection } from '../shared/RAGSource'; +import { PromptTier } from '../shared/RAGSource'; import type { PersonaIdentity } from '../shared/RAGTypes'; import { ORM } from 
'../../../daemons/data-daemon/server/ORM'; import { UserEntity } from '../../data/entities/UserEntity'; @@ -28,6 +29,7 @@ const log = Logger.create('PersonaIdentitySource', 'rag'); export class PersonaIdentitySource implements RAGSource { readonly name = 'persona-identity'; + readonly tier = PromptTier.INVARIANT; readonly priority = 95; // Critical - must be included readonly defaultBudgetPercent = 20; @@ -91,7 +93,7 @@ export class PersonaIdentitySource implements RAGSource { return true; } - async load(context: RAGSourceContext, allocatedBudget: number): Promise { + async load(context: RAGSourceContext, allocatedBudget: number): Promise> { const startTime = performance.now(); try { @@ -373,7 +375,7 @@ LIMITS: // ── Helpers ────────────────────────────────────────────────────── - private defaultSection(startTime: number, error?: string): RAGSection { + private defaultSection(startTime: number, error?: string): Omit { const defaultIdentity: PersonaIdentity = { name: 'AI Assistant', systemPrompt: 'You are a helpful AI assistant participating in a group chat.' 
diff --git a/src/system/rag/sources/ProjectContextSource.ts b/src/system/rag/sources/ProjectContextSource.ts index 6d4163005..9e7e5287f 100644 --- a/src/system/rag/sources/ProjectContextSource.ts +++ b/src/system/rag/sources/ProjectContextSource.ts @@ -21,6 +21,7 @@ */ import type { RAGSource, RAGSourceContext, RAGSection } from '../shared/RAGSource'; +import { PromptTier } from '../shared/RAGSource'; import { WorkspaceStrategy } from '../../code/server/WorkspaceStrategy'; import { ProjectDetector, type ProjectType } from '../../code/server/ProjectDetector'; import { Logger } from '../../core/logging/Logger'; @@ -33,6 +34,7 @@ const log = Logger.create('ProjectContextSource', 'rag'); export class ProjectContextSource implements RAGSource { readonly name = 'project-context'; + readonly tier = PromptTier.INVARIANT; readonly priority = 70; readonly defaultBudgetPercent = 5; readonly isShared = true; @@ -46,8 +48,8 @@ export class ProjectContextSource implements RAGSource { * 14×5 = 70 synchronous shell calls per RAG cycle. * Single-flight coalescing prevents thundering herd on cache miss. 
*/ - private static _contextCache: Map = new Map(); - private static _contextInflight: Map> = new Map(); + private static _contextCache: Map; cachedAt: number }> = new Map(); + private static _contextInflight: Map>> = new Map(); private static readonly CONTEXT_CACHE_TTL_MS = 30_000; isApplicable(context: RAGSourceContext): boolean { @@ -59,7 +61,7 @@ export class ProjectContextSource implements RAGSource { return ProjectContextSource.isMainRepoGit(); } - async load(context: RAGSourceContext, allocatedBudget: number): Promise { + async load(context: RAGSourceContext, allocatedBudget: number): Promise> { const startTime = performance.now(); const wsMeta = WorkspaceStrategy.getProjectForPersona(context.personaId); @@ -104,7 +106,7 @@ export class ProjectContextSource implements RAGSource { isPersonalWorkspace: boolean, initialBranch: string, startTime: number, - ): Promise { + ): Promise> { try { // Resolve branch — from workspace metadata or live git query @@ -371,7 +373,7 @@ export class ProjectContextSource implements RAGSource { return `## ${label}: ${projectType.description}\nBranch: ${branch}\n${gitStatus}`; } - private emptySection(startTime: number, error?: string): RAGSection { + private emptySection(startTime: number, error?: string): Omit { return { sourceName: this.name, tokenCount: 0, diff --git a/src/system/rag/sources/SemanticMemorySource.ts b/src/system/rag/sources/SemanticMemorySource.ts index 9af52e9f6..5da453c32 100644 --- a/src/system/rag/sources/SemanticMemorySource.ts +++ b/src/system/rag/sources/SemanticMemorySource.ts @@ -18,6 +18,7 @@ */ import type { RAGSource, RAGSourceContext, RAGSection } from '../shared/RAGSource'; +import { PromptTier } from '../shared/RAGSource'; import type { RagSourceRequest, RagSourceResult } from '../../../shared/generated/rag'; import type { PersonaMemory } from '../shared/RAGTypes'; import { TieredMemoryCache } from '../cache/TieredMemoryCache'; @@ -30,6 +31,7 @@ const TOKENS_PER_MEMORY_ESTIMATE = 80; export class 
SemanticMemorySource implements RAGSource { readonly name = 'semantic-memory'; + readonly tier = PromptTier.SEMI_STABLE; readonly priority = 60; // Medium-high - memories inform persona behavior readonly defaultBudgetPercent = 12; readonly supportsBatching = true; // Participate in batched Rust IPC @@ -69,7 +71,7 @@ export class SemanticMemorySource implements RAGSource { * Convert Rust RagSourceResult to TypeScript RAGSection. * Maps Rust's memory format back to PersonaMemory[]. */ - fromBatchResult(result: RagSourceResult, loadTimeMs: number): RAGSection { + fromBatchResult(result: RagSourceResult, loadTimeMs: number): Omit { // Extract memories from sections const memories: PersonaMemory[] = []; @@ -122,7 +124,7 @@ export class SemanticMemorySource implements RAGSource { }; } - async load(context: RAGSourceContext, allocatedBudget: number): Promise { + async load(context: RAGSourceContext, allocatedBudget: number): Promise> { const startTime = performance.now(); const maxMemories = Math.max(3, Math.floor(allocatedBudget / TOKENS_PER_MEMORY_ESTIMATE)); @@ -195,7 +197,7 @@ export class SemanticMemorySource implements RAGSource { return validTypes.includes(type) ? 
type as PersonaMemory['type'] : 'observation'; } - private emptySection(startTime: number, error?: string): RAGSection { + private emptySection(startTime: number, error?: string): Omit { return { sourceName: this.name, tokenCount: 0, diff --git a/src/system/rag/sources/SentinelAwarenessSource.ts b/src/system/rag/sources/SentinelAwarenessSource.ts index b6dc186a4..e7e8681a4 100644 --- a/src/system/rag/sources/SentinelAwarenessSource.ts +++ b/src/system/rag/sources/SentinelAwarenessSource.ts @@ -16,12 +16,14 @@ */ import type { RAGSource, RAGSection, RAGSourceContext } from '../shared/RAGSource'; +import { PromptTier } from '../shared/RAGSource'; import { TemplateRegistry } from '../../sentinel/pipelines/TemplateRegistry'; import { sentinelEventBridge } from '../../sentinel/SentinelEventBridge'; import { isSlowLocalModel, getContextWindow } from '../../shared/ModelContextWindows'; export class SentinelAwarenessSource implements RAGSource { readonly name = 'sentinel-awareness'; + readonly tier = PromptTier.SEMI_STABLE; readonly isShared = true; readonly priority = 58; readonly defaultBudgetPercent = 8; @@ -36,7 +38,7 @@ export class SentinelAwarenessSource implements RAGSource { return true; } - async load(context: RAGSourceContext, allocatedBudget: number): Promise { + async load(context: RAGSourceContext, allocatedBudget: number): Promise> { const startTime = Date.now(); const modelId = context.options?.modelId; diff --git a/src/system/rag/sources/SocialMediaRAGSource.ts b/src/system/rag/sources/SocialMediaRAGSource.ts index 7a46797f8..e6501e32d 100644 --- a/src/system/rag/sources/SocialMediaRAGSource.ts +++ b/src/system/rag/sources/SocialMediaRAGSource.ts @@ -25,6 +25,7 @@ */ import type { RAGSource, RAGSourceContext, RAGSection } from '../shared/RAGSource'; +import { PromptTier } from '../shared/RAGSource'; import type { SocialNotification, SocialProfile } from '@system/social/shared/SocialMediaTypes'; import type { ISocialMediaProvider } from 
'@system/social/shared/ISocialMediaProvider'; import { SocialCredentialEntity } from '@system/social/shared/SocialCredentialEntity'; @@ -54,6 +55,7 @@ interface ResolvedCredential { export class SocialMediaRAGSource implements RAGSource { readonly name = 'social-media'; + readonly tier = PromptTier.SEMI_STABLE; readonly priority = 55; readonly defaultBudgetPercent = 3; @@ -97,7 +99,7 @@ export class SocialMediaRAGSource implements RAGSource { * If HUD is cached, returns it. If not, returns empty section. * Background warmup loop handles populating the cache. */ - async load(context: RAGSourceContext, _allocatedBudget: number): Promise { + async load(context: RAGSourceContext, _allocatedBudget: number): Promise> { const startTime = performance.now(); // Register this persona for background warmup @@ -461,7 +463,7 @@ export class SocialMediaRAGSource implements RAGSource { ]); } - private emptySection(startTime: number): RAGSection { + private emptySection(startTime: number): Omit { return { sourceName: this.name, tokenCount: 0, @@ -470,7 +472,7 @@ export class SocialMediaRAGSource implements RAGSource { }; } - private errorSection(startTime: number, error: string): RAGSection { + private errorSection(startTime: number, error: string): Omit { return { sourceName: this.name, tokenCount: 0, diff --git a/src/system/rag/sources/ToolDefinitionsSource.ts b/src/system/rag/sources/ToolDefinitionsSource.ts index df7ed5a35..6a0cea59d 100644 --- a/src/system/rag/sources/ToolDefinitionsSource.ts +++ b/src/system/rag/sources/ToolDefinitionsSource.ts @@ -19,6 +19,7 @@ */ import type { RAGSource, RAGSourceContext, RAGSection } from '../shared/RAGSource'; +import { PromptTier } from '../shared/RAGSource'; import type { NativeToolSpec } from '../../../daemons/ai-provider-daemon/shared/AIProviderTypesV2'; import type { RecipeToolDeclaration } from '../../recipes/shared/RecipeTypes'; import { PersonaToolRegistry } from '../../user/server/modules/PersonaToolRegistry'; @@ -35,6 +36,7 @@ 
const log = Logger.create('ToolDefinitionsSource', 'rag'); export class ToolDefinitionsSource implements RAGSource { readonly name = 'tool-definitions'; + readonly tier = PromptTier.INVARIANT; readonly priority = 45; readonly defaultBudgetPercent = 10; @@ -46,7 +48,7 @@ export class ToolDefinitionsSource implements RAGSource { return true; } - async load(context: RAGSourceContext, allocatedBudget: number): Promise { + async load(context: RAGSourceContext, allocatedBudget: number): Promise> { const startTime = performance.now(); try { @@ -94,7 +96,7 @@ export class ToolDefinitionsSource implements RAGSource { toolDefinitions: ToolDefinition[], allocatedBudget: number, startTime: number - ): RAGSection { + ): Omit { // Exclude meta-tools — models with native tool calling don't need discovery tools. // search_tools/list_tools cause infinite loops where models search instead of act. const META_TOOLS = new Set(['search_tools', 'list_tools', 'working_memory']); @@ -178,7 +180,7 @@ export class ToolDefinitionsSource implements RAGSource { toolDefinitions: ToolDefinition[], allocatedBudget: number, startTime: number - ): RAGSection { + ): Omit { // Exclude chat/send when responding in a chat room (same as native path) if (context.roomId) { toolDefinitions = toolDefinitions.filter(t => t.name !== 'collaboration/chat/send'); @@ -382,7 +384,7 @@ RESPOND WITH TOOL CALLS, NOT DESCRIPTIONS.`; return Math.ceil(text.length / 4); } - private emptySection(startTime: number, error?: string): RAGSection { + private emptySection(startTime: number, error?: string): Omit { return { sourceName: this.name, tokenCount: 0, diff --git a/src/system/rag/sources/ToolMethodologySource.ts b/src/system/rag/sources/ToolMethodologySource.ts index 0ad57d275..528930536 100644 --- a/src/system/rag/sources/ToolMethodologySource.ts +++ b/src/system/rag/sources/ToolMethodologySource.ts @@ -17,6 +17,7 @@ */ import type { RAGSource, RAGSourceContext, RAGSection } from '../shared/RAGSource'; +import { 
PromptTier } from '../shared/RAGSource'; import { PersonaToolRegistry } from '../../user/server/modules/PersonaToolRegistry'; import { Logger } from '../../core/logging/Logger'; @@ -65,6 +66,7 @@ const TOOL_CATEGORIES: readonly ToolCategory[] = [ export class ToolMethodologySource implements RAGSource { readonly name = 'tool-methodology'; + readonly tier = PromptTier.INVARIANT; readonly priority = 48; readonly defaultBudgetPercent = 3; @@ -78,7 +80,7 @@ export class ToolMethodologySource implements RAGSource { return tools.some(t => TOOL_CATEGORIES.some(cat => t.name.startsWith(cat.prefix))); } - async load(context: RAGSourceContext, allocatedBudget: number): Promise { + async load(context: RAGSourceContext, allocatedBudget: number): Promise> { const startTime = performance.now(); try { diff --git a/src/system/rag/sources/VoiceConversationSource.ts b/src/system/rag/sources/VoiceConversationSource.ts index ff9328cb9..8e9a5f324 100644 --- a/src/system/rag/sources/VoiceConversationSource.ts +++ b/src/system/rag/sources/VoiceConversationSource.ts @@ -14,6 +14,7 @@ */ import type { RAGSource, RAGSourceContext, RAGSection } from '../shared/RAGSource'; +import { PromptTier } from '../shared/RAGSource'; import type { LLMMessage } from '../shared/RAGTypes'; import { extractSentiment, formatEmotionLabel } from '../shared/TextSentiment'; import { Logger } from '../../core/logging/Logger'; @@ -53,6 +54,7 @@ export function unregisterVoiceOrchestrator(): void { export class VoiceConversationSource implements RAGSource { readonly name = 'voice-conversation'; + readonly tier = PromptTier.VOLATILE; readonly priority = 85; readonly defaultBudgetPercent = 30; @@ -67,7 +69,7 @@ export class VoiceConversationSource implements RAGSource { return hasVoiceSession && hasOrchestrator; } - async load(context: RAGSourceContext, allocatedBudget: number): Promise { + async load(context: RAGSourceContext, allocatedBudget: number): Promise> { const startTime = performance.now(); if 
(!voiceOrchestrator) { @@ -254,7 +256,7 @@ You may speak for as long as needed to complete your thought. Natural conversati return breakdown; } - private emptySection(startTime: number, error?: string): RAGSection { + private emptySection(startTime: number, error?: string): Omit { return { sourceName: this.name, tokenCount: 0, diff --git a/src/system/rag/sources/WidgetContextSource.ts b/src/system/rag/sources/WidgetContextSource.ts index c6bd37926..9da769ac9 100644 --- a/src/system/rag/sources/WidgetContextSource.ts +++ b/src/system/rag/sources/WidgetContextSource.ts @@ -11,6 +11,7 @@ */ import type { RAGSource, RAGSourceContext, RAGSection } from '../shared/RAGSource'; +import { PromptTier } from '../shared/RAGSource'; import { WidgetContextService } from '../services/WidgetContextService'; import { Logger } from '../../core/logging/Logger'; @@ -18,6 +19,7 @@ const log = Logger.create('WidgetContextSource', 'rag'); export class WidgetContextSource implements RAGSource { readonly name = 'widget-context'; + readonly tier = PromptTier.VOLATILE; readonly priority = 75; // High - UI context is very relevant readonly defaultBudgetPercent = 5; readonly isShared = true; @@ -27,7 +29,7 @@ export class WidgetContextSource implements RAGSource { return !!(context.options.widgetContext || context.sessionId); } - async load(context: RAGSourceContext, _allocatedBudget: number): Promise { + async load(context: RAGSourceContext, _allocatedBudget: number): Promise> { const startTime = performance.now(); try { @@ -90,7 +92,7 @@ Use this context to provide relevant, contextual assistance. 
`.trim(); } - private emptySection(startTime: number, error?: string): RAGSection { + private emptySection(startTime: number, error?: string): Omit { return { sourceName: this.name, tokenCount: 0, From 203fb6534d108db3349b68fd82808ff64e686a70 Mon Sep 17 00:00:00 2001 From: Joel Teply Date: Fri, 17 Apr 2026 16:14:11 -0500 Subject: [PATCH 2/2] =?UTF-8?q?fix(rag):=20throttle=20codebase=20indexing?= =?UTF-8?q?=20=E2=80=94=20stop=20saturating=20event=20loop=20on=20boot?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CodebaseIndexer ran 64-chunk batches back-to-back with NO yield between batches. Each batch took ~1.5s and grew RSS by ~80MB. With 5000+ chunks in src/, that's 78+ batches × 1.5s = 2+ minutes of total event-loop saturation immediately after every boot. Local personas couldn't respond, voice couldn't connect, anything that needed the bus was blocked until indexing finished. Two changes: - Batch size 64→16 (smaller per-batch RSS hit, ~4× more chances for other IO to interleave between IPC roundtrips) - 50ms pause between batches via setTimeout (yields the event loop so chat/voice/personas can process while indexing runs) The throughput cost is small (16 vs 64 chunks per IPC) and the inter-batch pause is invisible at human timescales. The chat-arrival latency win is huge — system is responsive within seconds of boot instead of minutes. The deeper fix is querying GpuPressureWatcher / ResourcePressureWatcher before each batch and backing off when pressure is high — same principle Joel called out for InferenceCoordinator slot capacity. That's a follow-up; this is the floor. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/system/rag/services/CodebaseIndexer.ts | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/src/system/rag/services/CodebaseIndexer.ts b/src/system/rag/services/CodebaseIndexer.ts index 19a2c8646..5b281725f 100644 --- a/src/system/rag/services/CodebaseIndexer.ts +++ b/src/system/rag/services/CodebaseIndexer.ts @@ -28,8 +28,17 @@ const log = Logger.create('CodebaseIndexer', 'rag'); /** Maximum content length per chunk (chars). Longer chunks are split. */ const MAX_CHUNK_CHARS = 2000; -/** Batch size for embedding generation — one Rust IPC call per batch */ -const EMBEDDING_BATCH_SIZE = 64; +/** Batch size for embedding generation — one Rust IPC call per batch. + * Was 64; dropped to 16 because 64 × ~80MB-per-batch RSS growth saturated + * the event loop and starved chat for ~2min after every boot on M5. + * 16 gives Rust IPC the ~4× headroom to interleave with persona inference. */ +const EMBEDDING_BATCH_SIZE = 16; + +/** Pause between batches (ms) to yield the event loop and let the Rust + * IPC pipeline drain. Without this, the indexer blocks chat and live for + * the full duration. 50ms is small enough to not visibly slow indexing + * but big enough that other IO can interleave. */ +const EMBEDDING_BATCH_PAUSE_MS = 50; /** File extensions to index */ const INDEXABLE_EXTENSIONS = new Set(['.ts', '.md', '.js']); @@ -224,6 +233,14 @@ export class CodebaseIndexer { log.error(`Embedding batch ${i}-${i + batch.length} failed: ${err}`); errors.push({ file: `batch-${i}`, error: String(err) }); } + + // Yield to other IO between batches. Without this, the indexer + // monopolises the event loop and chat/voice/personas all stall + // for the full indexing duration. Chat-arrival latency matters + // more than indexing throughput. 
+ if (i + EMBEDDING_BATCH_SIZE < allChunks.length) { + await new Promise(resolve => setTimeout(resolve, EMBEDDING_BATCH_PAUSE_MS)); + } } // Any write to code_index invalidates the in-memory query cache.