From c8b330d8e16097b3d0395322e07505a0089edd82 Mon Sep 17 00:00:00 2001
From: Joel Teply <joel@cambriantech.com>
Date: Fri, 17 Apr 2026 18:05:05 -0500
Subject: [PATCH 1/2] =?UTF-8?q?feat(rag):=20Phase=201=20=E2=80=94=20stable?=
 =?UTF-8?q?-first=20ordering=20for=20prefix-reuse=20(issue=20#918)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds PromptTier enum (INVARIANT / SEMI_STABLE / VOLATILE) and makes
every RAGSource declare its tier. RAGComposer sorts collected sections
deterministically by (tier, sourceName) before returning.

Why: today the composer's parallel section assembly produces a different
byte order on every chat call. llama-server / DMR's prefix-KV-cache
reuse never fires, so each turn reprocesses the full 14k-token prompt
from scratch (~35s prompt eval at 400 tok/s). With deterministic
ordering AND stable bytes within each tier, the unchanged prefix
(INVARIANT, plus the already-seen part of SEMI_STABLE) gets reused —
only newly appended content and the VOLATILE suffix need evaluation.
Expected: ~70× faster prompt eval per turn for repeat-context turns.

Architecture (per docs/architecture/MULTIMODAL-WORKER-AND-PREFIX-REUSE.md):
- INVARIANT: persona identity, tool definitions, recipe rules, docs
  (PersonaIdentity, ToolDefinitions, CodeTool, Documentation,
   ToolMethodology, ProjectContext)
- SEMI_STABLE: history, memories, participants, governance — append-only
  (ConversationHistory, LiveRoomAwareness, Governance, OpenProposals,
   SentinelAwareness, GlobalAwareness, SocialMediaRAG, SemanticMemory)
- VOLATILE: latest message, audio chunks, current activity, UI state
  (ActivityContext, CodebaseSearch, MediaArtifact, VoiceConversation,
   WidgetContext)

Implementation note: tier is a class-level declaration on each RAGSource
(a required field — it is not optional). Sources return
Omit<RAGSection, 'tier'> from load() and fromBatchResult(); RAGComposer
injects the source's declared tier when wrapping the section. This gives
a single source of truth per source — no per-return-statement repetition.

Phases 2 (slot pinning) and 3 (composition cache) build on this.
Phase 4 (multimodal content parts) depends on #917 ModelMetadata.

tsc clean. Branch: feature/prefix-reuse-and-multimodal off main.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/system/rag/shared/RAGComposer.ts          | 44 ++++++++++++++++--
 src/system/rag/shared/RAGSource.ts            | 46 +++++++++++++++++--
 src/system/rag/shared/RAGTypes.ts             | 32 +++++++++++++
 .../rag/sources/ActivityContextSource.ts      |  4 +-
 src/system/rag/sources/CodeToolSource.ts      |  6 ++-
 .../rag/sources/CodebaseSearchSource.ts       |  4 +-
 .../rag/sources/ConversationHistorySource.ts  |  6 ++-
 src/system/rag/sources/DocumentationSource.ts |  4 +-
 .../rag/sources/GlobalAwarenessSource.ts      | 10 ++--
 src/system/rag/sources/GovernanceSource.ts    |  4 +-
 .../rag/sources/LiveRoomAwarenessSource.ts    |  6 ++-
 src/system/rag/sources/MediaArtifactSource.ts |  4 +-
 src/system/rag/sources/OpenProposalsSource.ts |  6 ++-
 .../rag/sources/PersonaIdentitySource.ts      |  6 ++-
 .../rag/sources/ProjectContextSource.ts       | 12 +++--
 .../rag/sources/SemanticMemorySource.ts       |  8 ++--
 .../rag/sources/SentinelAwarenessSource.ts    |  4 +-
 .../rag/sources/SocialMediaRAGSource.ts       |  8 ++--
 .../rag/sources/ToolDefinitionsSource.ts      | 10 ++--
 .../rag/sources/ToolMethodologySource.ts      |  4 +-
 .../rag/sources/VoiceConversationSource.ts    |  6 ++-
 src/system/rag/sources/WidgetContextSource.ts |  6 ++-
 22 files changed, 191 insertions(+), 49 deletions(-)

diff --git a/src/system/rag/shared/RAGComposer.ts b/src/system/rag/shared/RAGComposer.ts
index d7b24f63b..9f7991423 100644
--- a/src/system/rag/shared/RAGComposer.ts
+++ b/src/system/rag/shared/RAGComposer.ts
@@ -22,6 +22,15 @@
  */
 
 import type { RAGSource, RAGSourceContext, RAGSection, RAGCompositionResult } from './RAGSource';
+import { PromptTier } from './RAGTypes';
+
+/** Rank of each tier in the assembled prompt — sections with a lower rank are
+ * placed first: INVARIANT, then SEMI_STABLE, then VOLATILE. See PromptTier docs. */
+const TIER_ORDER: Record<PromptTier, number> = {
+  [PromptTier.INVARIANT]: 0,
+  [PromptTier.SEMI_STABLE]: 1,
+  [PromptTier.VOLATILE]: 2,
+};
 import type { RagSourceRequest, RagComposeResult, RagSourceResult } from '../../../shared/generated/rag';
 import { Logger } from '../../core/logging/Logger';
 import { TimingHarness } from '../../core/shared/TimingHarness';
@@ -230,6 +239,19 @@ export class RAGComposer {
         failedSources.push({ source: result.source, error: result.error });
       }
     }
+    // Deterministic ordering: sections sorted by (tier, sourceName) so the
+    // assembled prompt's bytes are identical across requests with identical
+    // section contents. This is the prerequisite for llama-server / DMR
+    // prefix-KV-cache reuse — without stable ordering, the same logical
+    // prompt has different bytes per turn and the cache misses every time.
+    // Tier order: INVARIANT first, then SEMI_STABLE, then VOLATILE — see
+    // PromptTier in RAGTypes.ts and docs/architecture/MULTIMODAL-WORKER-AND-PREFIX-REUSE.md
+    sections.sort((a, b) => {
+      const tierOrder = TIER_ORDER[a.tier] - TIER_ORDER[b.tier];
+      if (tierOrder !== 0) return tierOrder;
+      // Within a tier: code-unit order by name — localeCompare varies with host locale/ICU.
+      return a.sourceName < b.sourceName ? -1 : a.sourceName > b.sourceName ? 1 : 0;
+    });
     timer.mark('collect_results');
 
     // Log ALL source timings for performance diagnosis
@@ -316,8 +338,13 @@ export class RAGComposer {
 
         if (rustResult.success) {
           // Convert via source's fromBatchResult method
+          // Tier injection: same single-source-of-truth as TS path —
+          // the source's class declaration provides tier; we inject it
+          // here so the section conforms to RAGSection regardless of
+          // whether the source's fromBatchResult included it.
           if (sourceInfo.source.fromBatchResult) {
-            const section = sourceInfo.source.fromBatchResult(rustResult, rustResult.load_time_ms);
+            const rawSection = sourceInfo.source.fromBatchResult(rustResult, rustResult.load_time_ms);
+            const section: RAGSection = { ...rawSection, tier: sourceInfo.source.tier };
             results.push({
               success: true,
               section,
@@ -326,9 +353,11 @@ export class RAGComposer {
             });
           } else {
             // Fallback: basic conversion
+            const rawSection = this.defaultFromBatchResult(sourceInfo.source.name, rustResult);
+            const section: RAGSection = { ...rawSection, tier: sourceInfo.source.tier };
             results.push({
               success: true,
-              section: this.defaultFromBatchResult(sourceInfo.source.name, rustResult),
+              section,
               sourceName: sourceInfo.source.name,
               loadTime: rustResult.load_time_ms
             });
@@ -441,10 +470,14 @@ export class RAGComposer {
     sourceTimer.setMeta('budget', budget);
 
     try {
-      const section = await source.load(context, budget);
+      const rawSection = await source.load(context, budget);
       sourceTimer.mark('load');
-      sourceTimer.setMeta('tokenCount', section.tokenCount);
+      sourceTimer.setMeta('tokenCount', rawSection.tokenCount);
       const record = sourceTimer.finish();
+      // Inject tier from the source's declaration. Sources don't re-state
+      // their tier on every return; the class-level declaration is the
+      // single source of truth, applied here.
+      const section: RAGSection = { ...rawSection, tier: source.tier };
       return { success: true, section, sourceName: source.name, loadTime: record.totalMs };
     } catch (error: any) {
       sourceTimer.setError(error.message);
@@ -457,8 +490,9 @@ export class RAGComposer {
   /**
    * Default conversion from Rust RagSourceResult to TypeScript RAGSection.
    * Used when source doesn't implement fromBatchResult.
+   * Returns without `tier` — caller injects from the source's declaration.
    */
-  private defaultFromBatchResult(sourceName: string, result: RagSourceResult): RAGSection {
+  private defaultFromBatchResult(sourceName: string, result: RagSourceResult): Omit<RAGSection, 'tier'> {
     // Combine all sections into a single content block
     const content = result.sections
       .map(s => s.content)
diff --git a/src/system/rag/shared/RAGSource.ts b/src/system/rag/shared/RAGSource.ts
index 995b6859f..0a46dd7d1 100644
--- a/src/system/rag/shared/RAGSource.ts
+++ b/src/system/rag/shared/RAGSource.ts
@@ -19,6 +19,10 @@
 
 import type { UUID } from '../../core/types/CrossPlatformUUID';
 import type { RAGBuildOptions, LLMMessage, RAGArtifact, PersonaMemory, PersonaIdentity, RecipeStrategy } from './RAGTypes';
+import { PromptTier } from './RAGTypes';
+
+// Re-export so source files only need one import
+export { PromptTier } from './RAGTypes';
 
 /**
  * Context passed to each RAGSource for loading
@@ -70,6 +74,9 @@ export interface RAGSourceContext {
 export interface RAGSection {
   /** Source that produced this section */
   readonly sourceName: string;
+  /** Tier this section belongs to — drives stable-byte-prefix ordering.
+   * Mirrored from the producing source's declared tier. */
+  readonly tier: PromptTier;
   /** Estimated token count */
   readonly tokenCount: number;
   /** Time taken to load (ms) */
@@ -105,6 +112,29 @@ export interface RAGSource {
    */
   readonly priority: number;
 
+  /**
+   * Tier — INVARIANT / SEMI_STABLE / VOLATILE.
+   * Required. Drives stable-byte-prefix prompt assembly so llama-server
+   * reuses KV cache for the unchanging region instead of reprocessing
+   * the full prompt every turn.
+   *
+   * Classification rules:
+   * - INVARIANT — system prompt fragments, recipe rules, role identity,
+   *   tool definitions. Bytes must be identical across thousands of turns
+   *   for the same persona+recipe. NO timestamps, NO request IDs, NO
+   *   per-request volatile data.
+   * - SEMI_STABLE — conversation history, memories, participants,
+   *   governance. Grows monotonically — append-only relative to the
+   *   previous turn. Earlier bytes never rewritten.
+   * - VOLATILE — current message, audio chunks, current timestamp,
+   *   per-request observations. The only region the server reprocesses
+   *   token-by-token.
+   *
+   * If you can't decide, the source probably mixes tiers and should be
+   * split into separate sources at the right granularity.
+   */
+  readonly tier: PromptTier;
+
   /**
    * Default budget allocation as percentage (0-100).
    * Total across all sources should roughly equal 100.
@@ -126,11 +156,16 @@ export interface RAGSource {
    * Load data from this source.
    * Called in parallel with other applicable sources.
    *
+   * Returns the section without the `tier` field — RAGComposer injects
+   * the source's declared `tier` into the section after load completes.
+   * This keeps source implementations focused on what they produce
+   * rather than re-asserting their tier on every return.
+   *
    * @param context - Context for loading
    * @param allocatedBudget - Token budget allocated to this source
-   * @returns Section of RAG context
+   * @returns Section of RAG context (tier added by composer)
    */
-  load(context: RAGSourceContext, allocatedBudget: number): Promise<RAGSection>;
+  load(context: RAGSourceContext, allocatedBudget: number): Promise<Omit<RAGSection, 'tier'>>;
 
   /**
    * Whether this source produces identical results for all personas in the same room.
@@ -168,11 +203,14 @@ export interface RAGSource {
    * Only called if supportsBatching is true.
    * Transforms the typed Rust result into the RAGSection format.
    *
+   * Returns the section without `tier` — RAGComposer injects the source's
+   * declared tier after conversion, same as the non-batched path.
+   *
    * @param result - The result from Rust's rag/compose endpoint
    * @param loadTimeMs - How long the load took
-   * @returns The RAGSection to include in the composition result
+   * @returns The RAGSection (without tier) to include in the composition result
    */
-  fromBatchResult?(result: RagSourceResult, loadTimeMs: number): RAGSection;
+  fromBatchResult?(result: RagSourceResult, loadTimeMs: number): Omit<RAGSection, 'tier'>;
 }
 
 // Re-export Rust-generated types for batch support
diff --git a/src/system/rag/shared/RAGTypes.ts b/src/system/rag/shared/RAGTypes.ts
index 351f5d293..3ab546333 100644
--- a/src/system/rag/shared/RAGTypes.ts
+++ b/src/system/rag/shared/RAGTypes.ts
@@ -18,6 +18,38 @@ import type { RecipeToolDeclaration } from '../../recipes/shared/RecipeTypes';
  */
 export type RAGDomain = 'chat' | 'academy' | 'game' | 'code' | 'analysis';
 
+/**
+ * Prompt tier — declares how often a RAG source's contribution changes between
+ * requests. Drives stable-byte-prefix prompt assembly so llama-server / vllm /
+ * DMR can reuse the KV cache for the invariant region instead of reprocessing
+ * the full prompt every turn.
+ *
+ * The contract: an INVARIANT section is byte-identical across requests with
+ * the same inputs, and a SEMI_STABLE section only ever appends to its earlier
+ * bytes. Neither may contain timestamps, request IDs, or other per-request
+ * data — that belongs in VOLATILE only.
+ *
+ * Final assembly order is always INVARIANT → SEMI_STABLE → VOLATILE; within a
+ * tier, sections are ordered by source name so the byte order is deterministic.
+ * Plain (non-const) enum: it is imported and re-exported as a runtime value.
+ *
+ * See: docs/architecture/MULTIMODAL-WORKER-AND-PREFIX-REUSE.md (Part 1)
+ */
+export enum PromptTier {
+  /** Persona system prompt, recipe rules, role identity, tool definitions.
+   * Changes ~weekly when persona/recipe is edited. Identical bytes across
+   * thousands of turns for the same persona+recipe. */
+  INVARIANT = 'invariant',
+  /** Conversation history, active genome adapters, participants, governance
+   * state. Grows monotonically — new content APPENDS to the existing
+   * prefix, doesn't rewrite earlier bytes. */
+  SEMI_STABLE = 'semi_stable',
+  /** Latest user message, audio chunks, current timestamp, last-second
+   * pressure observations. Changes every request. The only region the
+   * server actually has to reprocess token-by-token. */
+  VOLATILE = 'volatile',
+}
+
 /**
  * Model capabilities that affect RAG context building
  * Determines how artifacts (images, etc.) are processed
diff --git a/src/system/rag/sources/ActivityContextSource.ts b/src/system/rag/sources/ActivityContextSource.ts
index 1c6cac5e1..5ce9f2599 100644
--- a/src/system/rag/sources/ActivityContextSource.ts
+++ b/src/system/rag/sources/ActivityContextSource.ts
@@ -8,6 +8,7 @@
  */
 
 import type { RAGSource, RAGSection, RAGSourceContext } from '../shared/RAGSource';
+import { PromptTier } from '../shared/RAGSource';
 import type { RecipeStrategy } from '../shared/RAGTypes';
 import type { RecipeToolDeclaration } from '../../recipes/shared/RecipeTypes';
 import { ORM } from '../../../daemons/data-daemon/server/ORM';
@@ -23,6 +24,7 @@ import { isSlowLocalModel } from '../../shared/ModelContextWindows';
  */
 export class ActivityContextSource implements RAGSource {
   readonly name = 'activity';
+  readonly tier = PromptTier.VOLATILE;
   readonly isShared = true;
 
   // Medium priority - important for guided interactions
@@ -36,7 +38,7 @@ export class ActivityContextSource implements RAGSource {
     return true;
   }
 
-  async load(context: RAGSourceContext, allocatedBudget: number): Promise<RAGSection> {
+  async load(context: RAGSourceContext, allocatedBudget: number): Promise<Omit<RAGSection, 'tier'>> {
     const startTime = Date.now();
 
     try {
diff --git a/src/system/rag/sources/CodeToolSource.ts b/src/system/rag/sources/CodeToolSource.ts
index 40dbe2d54..4e8e202c6 100644
--- a/src/system/rag/sources/CodeToolSource.ts
+++ b/src/system/rag/sources/CodeToolSource.ts
@@ -14,6 +14,7 @@
  */
 
 import type { RAGSource, RAGSourceContext, RAGSection } from '../shared/RAGSource';
+import { PromptTier } from '../shared/RAGSource';
 import { PersonaToolRegistry } from '../../user/server/modules/PersonaToolRegistry';
 import { Logger } from '../../core/logging/Logger';
 
@@ -70,6 +71,7 @@ const CODE_TOOL_GROUPS: readonly CodeToolGroup[] = [
 
 export class CodeToolSource implements RAGSource {
   readonly name = 'code-tools';
+  readonly tier = PromptTier.INVARIANT;
   readonly priority = 50;  // Medium — below conversation/widget, above learning config
   readonly defaultBudgetPercent = 5;
 
@@ -84,7 +86,7 @@ export class CodeToolSource implements RAGSource {
     return tools.some(t => t.name.startsWith('code/'));
   }
 
-  async load(context: RAGSourceContext, allocatedBudget: number): Promise<RAGSection> {
+  async load(context: RAGSourceContext, allocatedBudget: number): Promise<Omit<RAGSection, 'tier'>> {
     const startTime = performance.now();
 
     try {
@@ -253,7 +255,7 @@ export class CodeToolSource implements RAGSource {
     return tools.filter(t => t.name.startsWith('code/')).length;
   }
 
-  private emptySection(startTime: number, error?: string): RAGSection {
+  private emptySection(startTime: number, error?: string): Omit<RAGSection, 'tier'> {
     return {
       sourceName: this.name,
       tokenCount: 0,
diff --git a/src/system/rag/sources/CodebaseSearchSource.ts b/src/system/rag/sources/CodebaseSearchSource.ts
index 38553fcbe..e8c6faa9a 100644
--- a/src/system/rag/sources/CodebaseSearchSource.ts
+++ b/src/system/rag/sources/CodebaseSearchSource.ts
@@ -13,6 +13,7 @@
  */
 
 import type { RAGSource, RAGSourceContext, RAGSection } from '../shared/RAGSource';
+import { PromptTier } from '../shared/RAGSource';
 import { getCodebaseIndexer } from '../services/CodebaseIndexer';
 import { Logger } from '../../core/logging/Logger';
 
@@ -29,6 +30,7 @@ const RELEVANCE_THRESHOLD = 0.35;
 
 export class CodebaseSearchSource implements RAGSource {
   readonly name = 'codebase-search';
+  readonly tier = PromptTier.VOLATILE;
   readonly priority = 55;
   readonly defaultBudgetPercent = 8;
   readonly isShared = true;
@@ -43,7 +45,7 @@ export class CodebaseSearchSource implements RAGSource {
     return currentMessage.length >= MIN_QUERY_LENGTH;
   }
 
-  async load(context: RAGSourceContext, allocatedBudget: number): Promise<RAGSection> {
+  async load(context: RAGSourceContext, allocatedBudget: number): Promise<Omit<RAGSection, 'tier'>> {
     const startTime = Date.now();
     const query = context.options?.currentMessage?.content as string;
 
diff --git a/src/system/rag/sources/ConversationHistorySource.ts b/src/system/rag/sources/ConversationHistorySource.ts
index 6e8101098..7a5a43345 100644
--- a/src/system/rag/sources/ConversationHistorySource.ts
+++ b/src/system/rag/sources/ConversationHistorySource.ts
@@ -10,6 +10,7 @@
  */
 
 import type { RAGSource, RAGSourceContext, RAGSection } from '../shared/RAGSource';
+import { PromptTier } from '../shared/RAGSource';
 import type { LLMMessage } from '../shared/RAGTypes';
 import { ORM } from '../../../daemons/data-daemon/server/ORM';
 import { ChatMessageEntity, type MediaItem } from '../../data/entities/ChatMessageEntity';
@@ -145,6 +146,7 @@ interface InflightEntry {
 
 export class ConversationHistorySource implements RAGSource {
   readonly name = 'conversation-history';
+  readonly tier = PromptTier.SEMI_STABLE;
   readonly priority = 80;  // High - conversation is core context
   readonly defaultBudgetPercent = 25;  // Gets largest share of budget
 
@@ -224,7 +226,7 @@ export class ConversationHistorySource implements RAGSource {
     return true;
   }
 
-  async load(context: RAGSourceContext, allocatedBudget: number): Promise<RAGSection> {
+  async load(context: RAGSourceContext, allocatedBudget: number): Promise<Omit<RAGSection, 'tier'>> {
     const startTime = performance.now();
     ConversationHistorySource.initEventSubscription();
 
@@ -564,7 +566,7 @@ export class ConversationHistorySource implements RAGSource {
     return [];
   }
 
-  private emptySection(startTime: number, error?: string): RAGSection {
+  private emptySection(startTime: number, error?: string): Omit<RAGSection, 'tier'> {
     return {
       sourceName: this.name,
       tokenCount: 0,
diff --git a/src/system/rag/sources/DocumentationSource.ts b/src/system/rag/sources/DocumentationSource.ts
index 97cb27a4e..fd3444077 100644
--- a/src/system/rag/sources/DocumentationSource.ts
+++ b/src/system/rag/sources/DocumentationSource.ts
@@ -14,6 +14,7 @@
  */
 
 import type { RAGSource, RAGSourceContext, RAGSection } from '../shared/RAGSource';
+import { PromptTier } from '../shared/RAGSource';
 import { Logger } from '../../core/logging/Logger';
 import * as fs from 'fs/promises';
 import * as path from 'path';
@@ -45,6 +46,7 @@ const DOC_CHAPTERS: readonly Omit<DocChapter, 'count'>[] = [
 
 export class DocumentationSource implements RAGSource {
   readonly name = 'documentation';
+  readonly tier = PromptTier.INVARIANT;
   readonly priority = 35;
   readonly defaultBudgetPercent = 5;
   readonly isShared = true;
@@ -62,7 +64,7 @@ export class DocumentationSource implements RAGSource {
     return true;
   }
 
-  async load(context: RAGSourceContext, allocatedBudget: number): Promise<RAGSection> {
+  async load(context: RAGSourceContext, allocatedBudget: number): Promise<Omit<RAGSection, 'tier'>> {
     const startTime = performance.now();
 
     try {
diff --git a/src/system/rag/sources/GlobalAwarenessSource.ts b/src/system/rag/sources/GlobalAwarenessSource.ts
index 08d4eaef2..bbccb2c1c 100644
--- a/src/system/rag/sources/GlobalAwarenessSource.ts
+++ b/src/system/rag/sources/GlobalAwarenessSource.ts
@@ -23,6 +23,7 @@
  */
 
 import type { RAGSource, RAGSourceContext, RAGSection } from '../shared/RAGSource';
+import { PromptTier } from '../shared/RAGSource';
 import type { RagSourceRequest, RagSourceResult, ConsciousnessSourceMetadata } from '../../../shared/generated/rag';
 import type { PersonaUser } from '../../user/server/PersonaUser';
 import { Logger } from '../../core/logging/Logger';
@@ -62,6 +63,7 @@ export function getConsciousness(personaId: string): boolean {
 
 export class GlobalAwarenessSource implements RAGSource {
   readonly name = 'global-awareness';
+  readonly tier = PromptTier.SEMI_STABLE;
   readonly priority = 85;  // After identity (95), before conversation (80)
   readonly defaultBudgetPercent = 5;
   readonly isShared = true;
@@ -107,7 +109,7 @@ export class GlobalAwarenessSource implements RAGSource {
    * Convert Rust RagSourceResult to TypeScript RAGSection.
    * Maps consciousness result to systemPromptSection.
    */
-  fromBatchResult(result: RagSourceResult, loadTimeMs: number): RAGSection {
+  fromBatchResult(result: RagSourceResult, loadTimeMs: number): Omit<RAGSection, 'tier'> {
     // Consciousness result has formatted_prompt in the first section
     const formattedPrompt = result.sections
       .map(s => s.content)
@@ -142,7 +144,7 @@ export class GlobalAwarenessSource implements RAGSource {
    * Note: When batching is enabled, this method is typically not called.
    * RAGComposer uses getBatchRequest() + fromBatchResult() instead.
    */
-  async load(context: RAGSourceContext, _allocatedBudget: number): Promise<RAGSection> {
+  async load(context: RAGSourceContext, _allocatedBudget: number): Promise<Omit<RAGSection, 'tier'>> {
     const startTime = performance.now();
 
     try {
@@ -227,7 +229,7 @@ export class GlobalAwarenessSource implements RAGSource {
     }
   }
 
-  private createEmptySection(loadTimeMs: number): RAGSection {
+  private createEmptySection(loadTimeMs: number): Omit<RAGSection, 'tier'> {
     return {
       sourceName: this.name,
       tokenCount: 0,
@@ -236,7 +238,7 @@ export class GlobalAwarenessSource implements RAGSource {
     };
   }
 
-  private createErrorSection(loadTimeMs: number, error: string): RAGSection {
+  private createErrorSection(loadTimeMs: number, error: string): Omit<RAGSection, 'tier'> {
     return {
       sourceName: this.name,
       tokenCount: 0,
diff --git a/src/system/rag/sources/GovernanceSource.ts b/src/system/rag/sources/GovernanceSource.ts
index db15a9145..9d7069484 100644
--- a/src/system/rag/sources/GovernanceSource.ts
+++ b/src/system/rag/sources/GovernanceSource.ts
@@ -7,6 +7,7 @@
  */
 
 import type { RAGSource, RAGSection, RAGSourceContext } from '../shared/RAGSource';
+import { PromptTier } from '../shared/RAGSource';
 import { isSlowLocalModel, getContextWindow } from '../../shared/ModelContextWindows';
 
 /**
@@ -58,6 +59,7 @@ You can propose collective decisions with collaboration/decision/propose and vot
  */
 export class GovernanceSource implements RAGSource {
   readonly name = 'governance';
+  readonly tier = PromptTier.SEMI_STABLE;
   readonly isShared = true;
 
   // Low priority - governance examples are nice-to-have, not critical
@@ -78,7 +80,7 @@ export class GovernanceSource implements RAGSource {
     return true;
   }
 
-  async load(context: RAGSourceContext, allocatedBudget: number): Promise<RAGSection> {
+  async load(context: RAGSourceContext, allocatedBudget: number): Promise<Omit<RAGSection, 'tier'>> {
     const startTime = Date.now();
 
     // Determine which version to use based on budget and model capability
diff --git a/src/system/rag/sources/LiveRoomAwarenessSource.ts b/src/system/rag/sources/LiveRoomAwarenessSource.ts
index 3f7422265..54b40e198 100644
--- a/src/system/rag/sources/LiveRoomAwarenessSource.ts
+++ b/src/system/rag/sources/LiveRoomAwarenessSource.ts
@@ -16,6 +16,7 @@
  */
 
 import type { RAGSource, RAGSection, RAGSourceContext } from '../shared/RAGSource';
+import { PromptTier } from '../shared/RAGSource';
 import type { RAGArtifact } from '../shared/RAGTypes';
 import { ORM } from '../../../daemons/data-daemon/server/ORM';
 import { CallEntity, type CallParticipant } from '../../data/entities/CallEntity';
@@ -27,6 +28,7 @@ const log = Logger.create('LiveRoomAwarenessSource', 'rag');
 
 export class LiveRoomAwarenessSource implements RAGSource {
   readonly name = 'live-room-awareness';
+  readonly tier = PromptTier.SEMI_STABLE;
   readonly priority = 30;
   readonly defaultBudgetPercent = 3;
 
@@ -41,7 +43,7 @@ export class LiveRoomAwarenessSource implements RAGSource {
     return true;
   }
 
-  async load(context: RAGSourceContext, allocatedBudget: number): Promise<RAGSection> {
+  async load(context: RAGSourceContext, allocatedBudget: number): Promise<Omit<RAGSection, 'tier'>> {
     const startTime = performance.now();
 
     const call = this.getActiveCall();
@@ -200,7 +202,7 @@ export class LiveRoomAwarenessSource implements RAGSource {
     }
   }
 
-  private emptySection(loadTimeMs: number): RAGSection {
+  private emptySection(loadTimeMs: number): Omit<RAGSection, 'tier'> {
     return {
       sourceName: this.name,
       tokenCount: 0,
diff --git a/src/system/rag/sources/MediaArtifactSource.ts b/src/system/rag/sources/MediaArtifactSource.ts
index 99f1dcbb7..f66cbb3ff 100644
--- a/src/system/rag/sources/MediaArtifactSource.ts
+++ b/src/system/rag/sources/MediaArtifactSource.ts
@@ -18,6 +18,7 @@
  */
 
 import type { RAGSource, RAGSourceContext, RAGSection } from '../shared/RAGSource';
+import { PromptTier } from '../shared/RAGSource';
 import { type RAGArtifact, type MediaArtifactMetadata, hasMediaMetadata } from '../shared/RAGTypes';
 import { VisionDescriptionService } from '../../vision/VisionDescriptionService';
 import { ConversationHistorySource } from './ConversationHistorySource';
@@ -31,6 +32,7 @@ const TOKENS_PER_IMAGE_BASE64 = 1000;
 
 export class MediaArtifactSource implements RAGSource {
   readonly name = 'media-artifacts';
+  readonly tier = PromptTier.VOLATILE;
   readonly priority = 65;
   readonly defaultBudgetPercent = 5;
 
@@ -39,7 +41,7 @@ export class MediaArtifactSource implements RAGSource {
     return context.options.includeArtifacts !== false;
   }
 
-  async load(context: RAGSourceContext, allocatedBudget: number): Promise<RAGSection> {
+  async load(context: RAGSourceContext, allocatedBudget: number): Promise<Omit<RAGSection, 'tier'>> {
     const startTime = performance.now();
 
     // Scan window for media. Balance between finding images in chatty rooms and
diff --git a/src/system/rag/sources/OpenProposalsSource.ts b/src/system/rag/sources/OpenProposalsSource.ts
index 246a01812..a860d0798 100644
--- a/src/system/rag/sources/OpenProposalsSource.ts
+++ b/src/system/rag/sources/OpenProposalsSource.ts
@@ -11,6 +11,7 @@
  */
 
 import type { RAGSource, RAGSection, RAGSourceContext } from '../shared/RAGSource';
+import { PromptTier } from '../shared/RAGSource';
 import { ORM } from '../../../daemons/data-daemon/server/ORM';
 import type { DecisionProposalEntity, DecisionOption, RankedVote } from '../../data/entities/DecisionProposalEntity';
 import type { DataRecord } from '../../../daemons/data-daemon/shared/DataStorageAdapter';
@@ -32,7 +33,7 @@ function formatProposal(record: DataRecord<DecisionProposalEntity>): string {
 ${options}`;
 }
 
-const EMPTY_SECTION: RAGSection = {
+const EMPTY_SECTION: Omit<RAGSection, 'tier'> = {
   sourceName: 'open-proposals',
   tokenCount: 0,
   loadTimeMs: 0,
@@ -41,6 +42,7 @@ const EMPTY_SECTION: RAGSection = {
 
 export class OpenProposalsSource implements RAGSource {
   readonly name = 'open-proposals';
+  readonly tier = PromptTier.SEMI_STABLE;
   readonly priority = 25;
   readonly defaultBudgetPercent = 3;
 
@@ -48,7 +50,7 @@ export class OpenProposalsSource implements RAGSource {
     return true;
   }
 
-  async load(context: RAGSourceContext, allocatedBudget: number): Promise<RAGSection> {
+  async load(context: RAGSourceContext, allocatedBudget: number): Promise<Omit<RAGSection, 'tier'>> {
     const startTime = Date.now();
 
     if (allocatedBudget < 30) {
diff --git a/src/system/rag/sources/PersonaIdentitySource.ts b/src/system/rag/sources/PersonaIdentitySource.ts
index 55427b73b..019ace400 100644
--- a/src/system/rag/sources/PersonaIdentitySource.ts
+++ b/src/system/rag/sources/PersonaIdentitySource.ts
@@ -18,6 +18,7 @@
  */
 
 import type { RAGSource, RAGSourceContext, RAGSection } from '../shared/RAGSource';
+import { PromptTier } from '../shared/RAGSource';
 import type { PersonaIdentity } from '../shared/RAGTypes';
 import { ORM } from '../../../daemons/data-daemon/server/ORM';
 import { UserEntity } from '../../data/entities/UserEntity';
@@ -28,6 +29,7 @@ const log = Logger.create('PersonaIdentitySource', 'rag');
 
 export class PersonaIdentitySource implements RAGSource {
   readonly name = 'persona-identity';
+  readonly tier = PromptTier.INVARIANT;
   readonly priority = 95;  // Critical - must be included
   readonly defaultBudgetPercent = 20;
 
@@ -91,7 +93,7 @@ export class PersonaIdentitySource implements RAGSource {
     return true;
   }
 
-  async load(context: RAGSourceContext, allocatedBudget: number): Promise<RAGSection> {
+  async load(context: RAGSourceContext, allocatedBudget: number): Promise<Omit<RAGSection, 'tier'>> {
     const startTime = performance.now();
 
     try {
@@ -373,7 +375,7 @@ LIMITS:
 
   // ── Helpers ──────────────────────────────────────────────────────
 
-  private defaultSection(startTime: number, error?: string): RAGSection {
+  private defaultSection(startTime: number, error?: string): Omit<RAGSection, 'tier'> {
     const defaultIdentity: PersonaIdentity = {
       name: 'AI Assistant',
       systemPrompt: 'You are a helpful AI assistant participating in a group chat.'
diff --git a/src/system/rag/sources/ProjectContextSource.ts b/src/system/rag/sources/ProjectContextSource.ts
index 6d4163005..9e7e5287f 100644
--- a/src/system/rag/sources/ProjectContextSource.ts
+++ b/src/system/rag/sources/ProjectContextSource.ts
@@ -21,6 +21,7 @@
  */
 
 import type { RAGSource, RAGSourceContext, RAGSection } from '../shared/RAGSource';
+import { PromptTier } from '../shared/RAGSource';
 import { WorkspaceStrategy } from '../../code/server/WorkspaceStrategy';
 import { ProjectDetector, type ProjectType } from '../../code/server/ProjectDetector';
 import { Logger } from '../../core/logging/Logger';
@@ -33,6 +34,7 @@ const log = Logger.create('ProjectContextSource', 'rag');
 
 export class ProjectContextSource implements RAGSource {
   readonly name = 'project-context';
+  readonly tier = PromptTier.INVARIANT;
   readonly priority = 70;
   readonly defaultBudgetPercent = 5;
   readonly isShared = true;
@@ -46,8 +48,8 @@ export class ProjectContextSource implements RAGSource {
    * 14×5 = 70 synchronous shell calls per RAG cycle.
    * Single-flight coalescing prevents thundering herd on cache miss.
    */
-  private static _contextCache: Map<string, { section: RAGSection; cachedAt: number }> = new Map();
-  private static _contextInflight: Map<string, Promise<RAGSection>> = new Map();
+  private static _contextCache: Map<string, { section: Omit<RAGSection, 'tier'>; cachedAt: number }> = new Map();
+  private static _contextInflight: Map<string, Promise<Omit<RAGSection, 'tier'>>> = new Map();
   private static readonly CONTEXT_CACHE_TTL_MS = 30_000;
 
   isApplicable(context: RAGSourceContext): boolean {
@@ -59,7 +61,7 @@ export class ProjectContextSource implements RAGSource {
     return ProjectContextSource.isMainRepoGit();
   }
 
-  async load(context: RAGSourceContext, allocatedBudget: number): Promise<RAGSection> {
+  async load(context: RAGSourceContext, allocatedBudget: number): Promise<Omit<RAGSection, 'tier'>> {
     const startTime = performance.now();
 
     const wsMeta = WorkspaceStrategy.getProjectForPersona(context.personaId);
@@ -104,7 +106,7 @@ export class ProjectContextSource implements RAGSource {
     isPersonalWorkspace: boolean,
     initialBranch: string,
     startTime: number,
-  ): Promise<RAGSection> {
+  ): Promise<Omit<RAGSection, 'tier'>> {
 
     try {
       // Resolve branch — from workspace metadata or live git query
@@ -371,7 +373,7 @@ export class ProjectContextSource implements RAGSource {
     return `## ${label}: ${projectType.description}\nBranch: ${branch}\n${gitStatus}`;
   }
 
-  private emptySection(startTime: number, error?: string): RAGSection {
+  private emptySection(startTime: number, error?: string): Omit<RAGSection, 'tier'> {
     return {
       sourceName: this.name,
       tokenCount: 0,
diff --git a/src/system/rag/sources/SemanticMemorySource.ts b/src/system/rag/sources/SemanticMemorySource.ts
index 9af52e9f6..5da453c32 100644
--- a/src/system/rag/sources/SemanticMemorySource.ts
+++ b/src/system/rag/sources/SemanticMemorySource.ts
@@ -18,6 +18,7 @@
  */
 
 import type { RAGSource, RAGSourceContext, RAGSection } from '../shared/RAGSource';
+import { PromptTier } from '../shared/RAGSource';
 import type { RagSourceRequest, RagSourceResult } from '../../../shared/generated/rag';
 import type { PersonaMemory } from '../shared/RAGTypes';
 import { TieredMemoryCache } from '../cache/TieredMemoryCache';
@@ -30,6 +31,7 @@ const TOKENS_PER_MEMORY_ESTIMATE = 80;
 
 export class SemanticMemorySource implements RAGSource {
   readonly name = 'semantic-memory';
+  readonly tier = PromptTier.SEMI_STABLE;
   readonly priority = 60;  // Medium-high - memories inform persona behavior
   readonly defaultBudgetPercent = 12;
   readonly supportsBatching = true;  // Participate in batched Rust IPC
@@ -69,7 +71,7 @@ export class SemanticMemorySource implements RAGSource {
    * Convert Rust RagSourceResult to TypeScript RAGSection.
    * Maps Rust's memory format back to PersonaMemory[].
    */
-  fromBatchResult(result: RagSourceResult, loadTimeMs: number): RAGSection {
+  fromBatchResult(result: RagSourceResult, loadTimeMs: number): Omit<RAGSection, 'tier'> {
     // Extract memories from sections
     const memories: PersonaMemory[] = [];
 
@@ -122,7 +124,7 @@ export class SemanticMemorySource implements RAGSource {
     };
   }
 
-  async load(context: RAGSourceContext, allocatedBudget: number): Promise<RAGSection> {
+  async load(context: RAGSourceContext, allocatedBudget: number): Promise<Omit<RAGSection, 'tier'>> {
     const startTime = performance.now();
     const maxMemories = Math.max(3, Math.floor(allocatedBudget / TOKENS_PER_MEMORY_ESTIMATE));
 
@@ -195,7 +197,7 @@ export class SemanticMemorySource implements RAGSource {
     return validTypes.includes(type) ? type as PersonaMemory['type'] : 'observation';
   }
 
-  private emptySection(startTime: number, error?: string): RAGSection {
+  private emptySection(startTime: number, error?: string): Omit<RAGSection, 'tier'> {
     return {
       sourceName: this.name,
       tokenCount: 0,
diff --git a/src/system/rag/sources/SentinelAwarenessSource.ts b/src/system/rag/sources/SentinelAwarenessSource.ts
index b6dc186a4..e7e8681a4 100644
--- a/src/system/rag/sources/SentinelAwarenessSource.ts
+++ b/src/system/rag/sources/SentinelAwarenessSource.ts
@@ -16,12 +16,14 @@
  */
 
 import type { RAGSource, RAGSection, RAGSourceContext } from '../shared/RAGSource';
+import { PromptTier } from '../shared/RAGSource';
 import { TemplateRegistry } from '../../sentinel/pipelines/TemplateRegistry';
 import { sentinelEventBridge } from '../../sentinel/SentinelEventBridge';
 import { isSlowLocalModel, getContextWindow } from '../../shared/ModelContextWindows';
 
 export class SentinelAwarenessSource implements RAGSource {
   readonly name = 'sentinel-awareness';
+  readonly tier = PromptTier.SEMI_STABLE;
   readonly isShared = true;
   readonly priority = 58;
   readonly defaultBudgetPercent = 8;
@@ -36,7 +38,7 @@ export class SentinelAwarenessSource implements RAGSource {
     return true;
   }
 
-  async load(context: RAGSourceContext, allocatedBudget: number): Promise<RAGSection> {
+  async load(context: RAGSourceContext, allocatedBudget: number): Promise<Omit<RAGSection, 'tier'>> {
     const startTime = Date.now();
 
     const modelId = context.options?.modelId;
diff --git a/src/system/rag/sources/SocialMediaRAGSource.ts b/src/system/rag/sources/SocialMediaRAGSource.ts
index 7a46797f8..e6501e32d 100644
--- a/src/system/rag/sources/SocialMediaRAGSource.ts
+++ b/src/system/rag/sources/SocialMediaRAGSource.ts
@@ -25,6 +25,7 @@
  */
 
 import type { RAGSource, RAGSourceContext, RAGSection } from '../shared/RAGSource';
+import { PromptTier } from '../shared/RAGSource';
 import type { SocialNotification, SocialProfile } from '@system/social/shared/SocialMediaTypes';
 import type { ISocialMediaProvider } from '@system/social/shared/ISocialMediaProvider';
 import { SocialCredentialEntity } from '@system/social/shared/SocialCredentialEntity';
@@ -54,6 +55,7 @@ interface ResolvedCredential {
 
 export class SocialMediaRAGSource implements RAGSource {
   readonly name = 'social-media';
+  readonly tier = PromptTier.SEMI_STABLE;
   readonly priority = 55;
   readonly defaultBudgetPercent = 3;
 
@@ -97,7 +99,7 @@ export class SocialMediaRAGSource implements RAGSource {
    * If HUD is cached, returns it. If not, returns empty section.
    * Background warmup loop handles populating the cache.
    */
-  async load(context: RAGSourceContext, _allocatedBudget: number): Promise<RAGSection> {
+  async load(context: RAGSourceContext, _allocatedBudget: number): Promise<Omit<RAGSection, 'tier'>> {
     const startTime = performance.now();
 
     // Register this persona for background warmup
@@ -461,7 +463,7 @@ export class SocialMediaRAGSource implements RAGSource {
     ]);
   }
 
-  private emptySection(startTime: number): RAGSection {
+  private emptySection(startTime: number): Omit<RAGSection, 'tier'> {
     return {
       sourceName: this.name,
       tokenCount: 0,
@@ -470,7 +472,7 @@ export class SocialMediaRAGSource implements RAGSource {
     };
   }
 
-  private errorSection(startTime: number, error: string): RAGSection {
+  private errorSection(startTime: number, error: string): Omit<RAGSection, 'tier'> {
     return {
       sourceName: this.name,
       tokenCount: 0,
diff --git a/src/system/rag/sources/ToolDefinitionsSource.ts b/src/system/rag/sources/ToolDefinitionsSource.ts
index df7ed5a35..6a0cea59d 100644
--- a/src/system/rag/sources/ToolDefinitionsSource.ts
+++ b/src/system/rag/sources/ToolDefinitionsSource.ts
@@ -19,6 +19,7 @@
  */
 
 import type { RAGSource, RAGSourceContext, RAGSection } from '../shared/RAGSource';
+import { PromptTier } from '../shared/RAGSource';
 import type { NativeToolSpec } from '../../../daemons/ai-provider-daemon/shared/AIProviderTypesV2';
 import type { RecipeToolDeclaration } from '../../recipes/shared/RecipeTypes';
 import { PersonaToolRegistry } from '../../user/server/modules/PersonaToolRegistry';
@@ -35,6 +36,7 @@ const log = Logger.create('ToolDefinitionsSource', 'rag');
 
 export class ToolDefinitionsSource implements RAGSource {
   readonly name = 'tool-definitions';
+  readonly tier = PromptTier.INVARIANT;
   readonly priority = 45;
   readonly defaultBudgetPercent = 10;
 
@@ -46,7 +48,7 @@ export class ToolDefinitionsSource implements RAGSource {
     return true;
   }
 
-  async load(context: RAGSourceContext, allocatedBudget: number): Promise<RAGSection> {
+  async load(context: RAGSourceContext, allocatedBudget: number): Promise<Omit<RAGSection, 'tier'>> {
     const startTime = performance.now();
 
     try {
@@ -94,7 +96,7 @@ export class ToolDefinitionsSource implements RAGSource {
     toolDefinitions: ToolDefinition[],
     allocatedBudget: number,
     startTime: number
-  ): RAGSection {
+  ): Omit<RAGSection, 'tier'> {
     // Exclude meta-tools — models with native tool calling don't need discovery tools.
     // search_tools/list_tools cause infinite loops where models search instead of act.
     const META_TOOLS = new Set(['search_tools', 'list_tools', 'working_memory']);
@@ -178,7 +180,7 @@ export class ToolDefinitionsSource implements RAGSource {
     toolDefinitions: ToolDefinition[],
     allocatedBudget: number,
     startTime: number
-  ): RAGSection {
+  ): Omit<RAGSection, 'tier'> {
     // Exclude chat/send when responding in a chat room (same as native path)
     if (context.roomId) {
       toolDefinitions = toolDefinitions.filter(t => t.name !== 'collaboration/chat/send');
@@ -382,7 +384,7 @@ RESPOND WITH TOOL CALLS, NOT DESCRIPTIONS.`;
     return Math.ceil(text.length / 4);
   }
 
-  private emptySection(startTime: number, error?: string): RAGSection {
+  private emptySection(startTime: number, error?: string): Omit<RAGSection, 'tier'> {
     return {
       sourceName: this.name,
       tokenCount: 0,
diff --git a/src/system/rag/sources/ToolMethodologySource.ts b/src/system/rag/sources/ToolMethodologySource.ts
index 0ad57d275..528930536 100644
--- a/src/system/rag/sources/ToolMethodologySource.ts
+++ b/src/system/rag/sources/ToolMethodologySource.ts
@@ -17,6 +17,7 @@
  */
 
 import type { RAGSource, RAGSourceContext, RAGSection } from '../shared/RAGSource';
+import { PromptTier } from '../shared/RAGSource';
 import { PersonaToolRegistry } from '../../user/server/modules/PersonaToolRegistry';
 import { Logger } from '../../core/logging/Logger';
 
@@ -65,6 +66,7 @@ const TOOL_CATEGORIES: readonly ToolCategory[] = [
 
 export class ToolMethodologySource implements RAGSource {
   readonly name = 'tool-methodology';
+  readonly tier = PromptTier.INVARIANT;
   readonly priority = 48;
   readonly defaultBudgetPercent = 3;
 
@@ -78,7 +80,7 @@ export class ToolMethodologySource implements RAGSource {
     return tools.some(t => TOOL_CATEGORIES.some(cat => t.name.startsWith(cat.prefix)));
   }
 
-  async load(context: RAGSourceContext, allocatedBudget: number): Promise<RAGSection> {
+  async load(context: RAGSourceContext, allocatedBudget: number): Promise<Omit<RAGSection, 'tier'>> {
     const startTime = performance.now();
 
     try {
diff --git a/src/system/rag/sources/VoiceConversationSource.ts b/src/system/rag/sources/VoiceConversationSource.ts
index ff9328cb9..8e9a5f324 100644
--- a/src/system/rag/sources/VoiceConversationSource.ts
+++ b/src/system/rag/sources/VoiceConversationSource.ts
@@ -14,6 +14,7 @@
  */
 
 import type { RAGSource, RAGSourceContext, RAGSection } from '../shared/RAGSource';
+import { PromptTier } from '../shared/RAGSource';
 import type { LLMMessage } from '../shared/RAGTypes';
 import { extractSentiment, formatEmotionLabel } from '../shared/TextSentiment';
 import { Logger } from '../../core/logging/Logger';
@@ -53,6 +54,7 @@ export function unregisterVoiceOrchestrator(): void {
 
 export class VoiceConversationSource implements RAGSource {
   readonly name = 'voice-conversation';
+  readonly tier = PromptTier.VOLATILE;
   readonly priority = 85;
   readonly defaultBudgetPercent = 30;
 
@@ -67,7 +69,7 @@ export class VoiceConversationSource implements RAGSource {
     return hasVoiceSession && hasOrchestrator;
   }
 
-  async load(context: RAGSourceContext, allocatedBudget: number): Promise<RAGSection> {
+  async load(context: RAGSourceContext, allocatedBudget: number): Promise<Omit<RAGSection, 'tier'>> {
     const startTime = performance.now();
 
     if (!voiceOrchestrator) {
@@ -254,7 +256,7 @@ You may speak for as long as needed to complete your thought. Natural conversati
     return breakdown;
   }
 
-  private emptySection(startTime: number, error?: string): RAGSection {
+  private emptySection(startTime: number, error?: string): Omit<RAGSection, 'tier'> {
     return {
       sourceName: this.name,
       tokenCount: 0,
diff --git a/src/system/rag/sources/WidgetContextSource.ts b/src/system/rag/sources/WidgetContextSource.ts
index c6bd37926..9da769ac9 100644
--- a/src/system/rag/sources/WidgetContextSource.ts
+++ b/src/system/rag/sources/WidgetContextSource.ts
@@ -11,6 +11,7 @@
  */
 
 import type { RAGSource, RAGSourceContext, RAGSection } from '../shared/RAGSource';
+import { PromptTier } from '../shared/RAGSource';
 import { WidgetContextService } from '../services/WidgetContextService';
 import { Logger } from '../../core/logging/Logger';
 
@@ -18,6 +19,7 @@ const log = Logger.create('WidgetContextSource', 'rag');
 
 export class WidgetContextSource implements RAGSource {
   readonly name = 'widget-context';
+  readonly tier = PromptTier.VOLATILE;
   readonly priority = 75;  // High - UI context is very relevant
   readonly defaultBudgetPercent = 5;
   readonly isShared = true;
@@ -27,7 +29,7 @@ export class WidgetContextSource implements RAGSource {
     return !!(context.options.widgetContext || context.sessionId);
   }
 
-  async load(context: RAGSourceContext, _allocatedBudget: number): Promise<RAGSection> {
+  async load(context: RAGSourceContext, _allocatedBudget: number): Promise<Omit<RAGSection, 'tier'>> {
     const startTime = performance.now();
 
     try {
@@ -90,7 +92,7 @@ Use this context to provide relevant, contextual assistance.
 `.trim();
   }
 
-  private emptySection(startTime: number, error?: string): RAGSection {
+  private emptySection(startTime: number, error?: string): Omit<RAGSection, 'tier'> {
     return {
       sourceName: this.name,
       tokenCount: 0,

From 203fb6534d108db3349b68fd82808ff64e686a70 Mon Sep 17 00:00:00 2001
From: Joel Teply <joel@cambriantech.com>
Date: Fri, 17 Apr 2026 16:14:11 -0500
Subject: [PATCH 2/2] =?UTF-8?q?fix(rag):=20throttle=20codebase=20indexing?=
 =?UTF-8?q?=20=E2=80=94=20stop=20saturating=20event=20loop=20on=20boot?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CodebaseIndexer ran batches of 64 chunks back-to-back with NO yield
between batches. Each batch took ~1.5s and grew RSS by ~80MB. With
5000+ chunks in src/, that's 78+ batches × 1.5s ≈ 2 minutes of total
event-loop saturation immediately after every boot. Local personas
couldn't respond, voice couldn't connect, anything that needed the bus
was blocked until indexing finished.

Two changes:
- Batch size 64→16 (smaller per-batch RSS hit, ~4× more chances
  for other IO to interleave between IPC roundtrips)
- 50ms pause between batches via setTimeout (yields the event loop
  so chat/voice/personas can process while indexing runs)

The throughput cost is small (16 vs 64 chunks per IPC call) and the
inter-batch pause is invisible at human timescales. The chat-arrival
latency win is huge — the system is responsive within seconds of boot
instead of minutes.

The deeper fix is querying GpuPressureWatcher / ResourcePressureWatcher
before each batch and backing off when pressure is high — same
principle Joel called out for InferenceCoordinator slot capacity.
That's a follow-up; this is the floor.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/system/rag/services/CodebaseIndexer.ts | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/src/system/rag/services/CodebaseIndexer.ts b/src/system/rag/services/CodebaseIndexer.ts
index 19a2c8646..5b281725f 100644
--- a/src/system/rag/services/CodebaseIndexer.ts
+++ b/src/system/rag/services/CodebaseIndexer.ts
@@ -28,8 +28,17 @@ const log = Logger.create('CodebaseIndexer', 'rag');
 /** Maximum content length per chunk (chars). Longer chunks are split. */
 const MAX_CHUNK_CHARS = 2000;
 
-/** Batch size for embedding generation — one Rust IPC call per batch */
-const EMBEDDING_BATCH_SIZE = 64;
+/** Batch size for embedding generation — one Rust IPC call per batch.
+ * Was 64; dropped to 16 because back-to-back 64-chunk batches (~80MB RSS
+ * growth each) saturated the event loop and starved chat for ~2min after every
+ * boot on M5. 16 gives persona inference ~4× more chances to interleave. */
+const EMBEDDING_BATCH_SIZE = 16;
+
+/** Pause between batches (ms) to yield the event loop and let the Rust
+ * IPC pipeline drain. Without this, the indexer blocks chat and live for
+ * the full duration. 50ms is small enough to not visibly slow indexing
+ * but big enough that other IO can interleave. */
+const EMBEDDING_BATCH_PAUSE_MS = 50;
 
 /** File extensions to index */
 const INDEXABLE_EXTENSIONS = new Set(['.ts', '.md', '.js']);
@@ -224,6 +233,14 @@ export class CodebaseIndexer {
         log.error(`Embedding batch ${i}-${i + batch.length} failed: ${err}`);
         errors.push({ file: `batch-${i}`, error: String(err) });
       }
+
+      // Yield to other IO between batches. Without this, the indexer
+      // monopolises the event loop and chat/voice/personas all stall
+      // for the full indexing duration. Chat-arrival latency matters
+      // more than indexing throughput.
+      if (i + EMBEDDING_BATCH_SIZE < allChunks.length) {
+        await new Promise(resolve => setTimeout(resolve, EMBEDDING_BATCH_PAUSE_MS));
+      }
     }
 
     // Any write to code_index invalidates the in-memory query cache.