Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 19 additions & 2 deletions src/system/rag/services/CodebaseIndexer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,17 @@ const log = Logger.create('CodebaseIndexer', 'rag');
/** Maximum content length per chunk (chars). Longer chunks are split. */
const MAX_CHUNK_CHARS = 2000;

/** Batch size for embedding generation — one Rust IPC call per batch */
const EMBEDDING_BATCH_SIZE = 64;
/** Batch size for embedding generation — one Rust IPC call per batch.
* Was 64; dropped to 16 because 64 × ~80MB-per-batch RSS growth saturated
* the event loop and starved chat for ~2min after every boot on M5.
* 16 gives Rust IPC the ~4× headroom to interleave with persona inference. */
const EMBEDDING_BATCH_SIZE = 16;

/** Pause between batches (ms) to yield the event loop and let the Rust
* IPC pipeline drain. Without this, the indexer blocks chat and live for
* the full duration. 50ms is small enough to not visibly slow indexing
* but big enough that other IO can interleave. */
const EMBEDDING_BATCH_PAUSE_MS = 50;

/** File extensions to index */
const INDEXABLE_EXTENSIONS = new Set(['.ts', '.md', '.js']);
Expand Down Expand Up @@ -224,6 +233,14 @@ export class CodebaseIndexer {
log.error(`Embedding batch ${i}-${i + batch.length} failed: ${err}`);
errors.push({ file: `batch-${i}`, error: String(err) });
}

// Yield to other IO between batches. Without this, the indexer
// monopolises the event loop and chat/voice/personas all stall
// for the full indexing duration. Chat-arrival latency matters
// more than indexing throughput.
if (i + EMBEDDING_BATCH_SIZE < allChunks.length) {
await new Promise(resolve => setTimeout(resolve, EMBEDDING_BATCH_PAUSE_MS));
}
}

// Any write to code_index invalidates the in-memory query cache.
Expand Down
44 changes: 39 additions & 5 deletions src/system/rag/shared/RAGComposer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,15 @@
*/

import type { RAGSource, RAGSourceContext, RAGSection, RAGCompositionResult } from './RAGSource';
import { PromptTier } from './RAGTypes';

/** Sort key for tiers — smaller numbers concatenate first in the assembled
 * prompt: INVARIANT before SEMI_STABLE before VOLATILE. Consumed by the
 * `sections.sort(...)` comparator in this file so prompt byte order is
 * fully deterministic (the prerequisite for prefix-KV-cache reuse).
 * Typed as Record<PromptTier, number> so adding a new tier to PromptTier
 * without ranking it here is a compile-time error. See PromptTier docs. */
const TIER_ORDER: Record<PromptTier, number> = {
  [PromptTier.INVARIANT]: 0,
  [PromptTier.SEMI_STABLE]: 1,
  [PromptTier.VOLATILE]: 2,
};
import type { RagSourceRequest, RagComposeResult, RagSourceResult } from '../../../shared/generated/rag';
import { Logger } from '../../core/logging/Logger';
import { TimingHarness } from '../../core/shared/TimingHarness';
Expand Down Expand Up @@ -230,6 +239,19 @@ export class RAGComposer {
failedSources.push({ source: result.source, error: result.error });
}
}
// Deterministic ordering: sections sorted by (tier, sourceName) so the
// assembled prompt's bytes are identical across requests with identical
// section contents. This is the prerequisite for llama-server / DMR
// prefix-KV-cache reuse — without stable ordering, the same logical
// prompt has different bytes per turn and the cache misses every time.
// Tier order: INVARIANT first, then SEMI_STABLE, then VOLATILE — see
// PromptTier in RAGTypes.ts and docs/architecture/MULTIMODAL-WORKER-AND-PREFIX-REUSE.md
sections.sort((a, b) => {
const tierOrder = TIER_ORDER[a.tier] - TIER_ORDER[b.tier];
if (tierOrder !== 0) return tierOrder;
// Within a tier: alphabetical by source name. Stable, total order.
return a.sourceName.localeCompare(b.sourceName);
});
timer.mark('collect_results');

// Log ALL source timings for performance diagnosis
Expand Down Expand Up @@ -316,8 +338,13 @@ export class RAGComposer {

if (rustResult.success) {
// Convert via source's fromBatchResult method
// Tier injection: same single-source-of-truth as TS path —
// the source's class declaration provides tier; we inject it
// here so the section conforms to RAGSection regardless of
// whether the source's fromBatchResult included it.
if (sourceInfo.source.fromBatchResult) {
const section = sourceInfo.source.fromBatchResult(rustResult, rustResult.load_time_ms);
const rawSection = sourceInfo.source.fromBatchResult(rustResult, rustResult.load_time_ms);
const section: RAGSection = { ...rawSection, tier: sourceInfo.source.tier };
results.push({
success: true,
section,
Expand All @@ -326,9 +353,11 @@ export class RAGComposer {
});
} else {
// Fallback: basic conversion
const rawSection = this.defaultFromBatchResult(sourceInfo.source.name, rustResult);
const section: RAGSection = { ...rawSection, tier: sourceInfo.source.tier };
results.push({
success: true,
section: this.defaultFromBatchResult(sourceInfo.source.name, rustResult),
section,
sourceName: sourceInfo.source.name,
loadTime: rustResult.load_time_ms
});
Expand Down Expand Up @@ -441,10 +470,14 @@ export class RAGComposer {
sourceTimer.setMeta('budget', budget);

try {
const section = await source.load(context, budget);
const rawSection = await source.load(context, budget);
sourceTimer.mark('load');
sourceTimer.setMeta('tokenCount', section.tokenCount);
sourceTimer.setMeta('tokenCount', rawSection.tokenCount);
const record = sourceTimer.finish();
// Inject tier from the source's declaration. Sources don't re-state
// their tier on every return; the class-level declaration is the
// single source of truth, applied here.
const section: RAGSection = { ...rawSection, tier: source.tier };
return { success: true, section, sourceName: source.name, loadTime: record.totalMs };
} catch (error: any) {
sourceTimer.setError(error.message);
Expand All @@ -457,8 +490,9 @@ export class RAGComposer {
/**
* Default conversion from Rust RagSourceResult to TypeScript RAGSection.
* Used when source doesn't implement fromBatchResult.
* Returns without `tier` — caller injects from the source's declaration.
*/
private defaultFromBatchResult(sourceName: string, result: RagSourceResult): RAGSection {
private defaultFromBatchResult(sourceName: string, result: RagSourceResult): Omit<RAGSection, 'tier'> {
// Combine all sections into a single content block
const content = result.sections
.map(s => s.content)
Expand Down
46 changes: 42 additions & 4 deletions src/system/rag/shared/RAGSource.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@

import type { UUID } from '../../core/types/CrossPlatformUUID';
import type { RAGBuildOptions, LLMMessage, RAGArtifact, PersonaMemory, PersonaIdentity, RecipeStrategy } from './RAGTypes';
import { PromptTier } from './RAGTypes';

// Re-export so source files only need one import
export { PromptTier } from './RAGTypes';
Comment on lines +24 to +25
Copy link

Copilot AI Apr 17, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

PromptTier is declared as a const enum (erased at emit), but this file re-exports it as a runtime export (export { PromptTier } from './RAGTypes'). With module: ES2020, this can produce a runtime ESM error because ./RAGTypes will not actually export PromptTier. Fix by either (a) making PromptTier a non-const enum (or a const PromptTier = {...} as const object) so it exists at runtime, or (b) removing this re-export and importing PromptTier directly from RAGTypes in the sources.

Suggested change
// Re-export so source files only need one import
export { PromptTier } from './RAGTypes';
// Keep PromptTier imported for use within this file; do not re-export it here
// because `const enum` members are erased at emit and are not safe runtime ESM exports.

Copilot uses AI. Check for mistakes.

/**
* Context passed to each RAGSource for loading
Expand Down Expand Up @@ -70,6 +74,9 @@ export interface RAGSourceContext {
export interface RAGSection {
/** Source that produced this section */
readonly sourceName: string;
/** Tier this section belongs to — drives stable-byte-prefix ordering.
* Mirrored from the producing source's declared tier. */
readonly tier: PromptTier;
/** Estimated token count */
readonly tokenCount: number;
/** Time taken to load (ms) */
Expand Down Expand Up @@ -105,6 +112,29 @@ export interface RAGSource {
*/
readonly priority: number;

/**
* Tier — INVARIANT / SEMI_STABLE / VOLATILE.
* Required. Drives stable-byte-prefix prompt assembly so llama-server
* reuses KV cache for the unchanging region instead of reprocessing
* the full prompt every turn.
*
* Classification rules:
* - INVARIANT — system prompt fragments, recipe rules, role identity,
* tool definitions. Bytes must be identical across thousands of turns
* for the same persona+recipe. NO timestamps, NO request IDs, NO
* per-request volatile data.
* - SEMI_STABLE — conversation history, memories, participants,
* governance. Grows monotonically — append-only relative to the
* previous turn. Earlier bytes never rewritten.
* - VOLATILE — current message, audio chunks, current timestamp,
* per-request observations. The only region the server reprocesses
* token-by-token.
*
* If you can't decide, the source probably mixes tiers and should be
* split into separate sources at the right granularity.
*/
readonly tier: PromptTier;

/**
* Default budget allocation as percentage (0-100).
* Total across all sources should roughly equal 100.
Expand All @@ -126,11 +156,16 @@ export interface RAGSource {
* Load data from this source.
* Called in parallel with other applicable sources.
*
* Returns the section without the `tier` field — RAGComposer injects
* the source's declared `tier` into the section after load completes.
* This keeps source implementations focused on what they produce
* rather than re-asserting their tier on every return.
*
* @param context - Context for loading
* @param allocatedBudget - Token budget allocated to this source
* @returns Section of RAG context
* @returns Section of RAG context (tier added by composer)
*/
load(context: RAGSourceContext, allocatedBudget: number): Promise<RAGSection>;
load(context: RAGSourceContext, allocatedBudget: number): Promise<Omit<RAGSection, 'tier'>>;

/**
* Whether this source produces identical results for all personas in the same room.
Expand Down Expand Up @@ -168,11 +203,14 @@ export interface RAGSource {
* Only called if supportsBatching is true.
* Transforms the typed Rust result into the RAGSection format.
*
* Returns the section without `tier` — RAGComposer injects the source's
* declared tier after conversion, same as the non-batched path.
*
* @param result - The result from Rust's rag/compose endpoint
* @param loadTimeMs - How long the load took
* @returns The RAGSection to include in the composition result
* @returns The RAGSection (without tier) to include in the composition result
*/
fromBatchResult?(result: RagSourceResult, loadTimeMs: number): RAGSection;
fromBatchResult?(result: RagSourceResult, loadTimeMs: number): Omit<RAGSection, 'tier'>;
}

// Re-export Rust-generated types for batch support
Expand Down
32 changes: 32 additions & 0 deletions src/system/rag/shared/RAGTypes.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,38 @@ import type { RecipeToolDeclaration } from '../../recipes/shared/RecipeTypes';
*/
export type RAGDomain = 'chat' | 'academy' | 'game' | 'code' | 'analysis';

/**
 * Prompt tier — declares how often a RAG source's contribution changes between
 * requests. Drives stable-byte-prefix prompt assembly so llama-server / vllm /
 * DMR can reuse the KV cache for the invariant region instead of reprocessing
 * the full prompt every turn.
 *
 * The contract: a section's bytes must be byte-identical across requests for
 * sources at the same tier with the same inputs. INVARIANT and SEMI_STABLE
 * sources MUST NOT contain timestamps, request IDs, or any per-request
 * volatile data. Those go in VOLATILE only.
 *
 * Final assembly order is always: INVARIANT → SEMI_STABLE → VOLATILE.
 * Within each tier, sources are sorted by name (alphabetical) so the byte
 * order is fully deterministic.
 *
 * NOTE: deliberately a regular (non-const) string enum, NOT `const enum`.
 * `const enum` members are erased at emit, so the runtime re-export in
 * RAGSource.ts (`export { PromptTier } from './RAGTypes'`) would reference
 * a binding that does not exist in the emitted ESM module, and `const enum`
 * is also incompatible with `isolatedModules` builds. A runtime enum keeps
 * both the value export and the nominal type intact.
 *
 * See: docs/architecture/MULTIMODAL-WORKER-AND-PREFIX-REUSE.md (Part 1)
 */
export enum PromptTier {
  /** Persona system prompt, recipe rules, role identity, tool definitions.
   * Changes ~weekly when persona/recipe is edited. Identical bytes across
   * thousands of turns for the same persona+recipe. */
  INVARIANT = 'invariant',
  /** Conversation history, active genome adapters, participants, governance
   * state. Grows monotonically — new content APPENDS to the existing
   * prefix, doesn't rewrite earlier bytes. */
  SEMI_STABLE = 'semi_stable',
  /** Latest user message, audio chunks, current timestamp, last-second
   * pressure observations. Changes every request. The only region the
   * server actually has to reprocess token-by-token. */
  VOLATILE = 'volatile',
}

/**
* Model capabilities that affect RAG context building
* Determines how artifacts (images, etc.) are processed
Expand Down
4 changes: 3 additions & 1 deletion src/system/rag/sources/ActivityContextSource.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
*/

import type { RAGSource, RAGSection, RAGSourceContext } from '../shared/RAGSource';
import { PromptTier } from '../shared/RAGSource';
import type { RecipeStrategy } from '../shared/RAGTypes';
import type { RecipeToolDeclaration } from '../../recipes/shared/RecipeTypes';
import { ORM } from '../../../daemons/data-daemon/server/ORM';
Expand All @@ -23,6 +24,7 @@ import { isSlowLocalModel } from '../../shared/ModelContextWindows';
*/
export class ActivityContextSource implements RAGSource {
readonly name = 'activity';
readonly tier = PromptTier.VOLATILE;
readonly isShared = true;

// Medium priority - important for guided interactions
Expand All @@ -36,7 +38,7 @@ export class ActivityContextSource implements RAGSource {
return true;
}

async load(context: RAGSourceContext, allocatedBudget: number): Promise<RAGSection> {
async load(context: RAGSourceContext, allocatedBudget: number): Promise<Omit<RAGSection, 'tier'>> {
const startTime = Date.now();

try {
Expand Down
6 changes: 4 additions & 2 deletions src/system/rag/sources/CodeToolSource.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
*/

import type { RAGSource, RAGSourceContext, RAGSection } from '../shared/RAGSource';
import { PromptTier } from '../shared/RAGSource';
import { PersonaToolRegistry } from '../../user/server/modules/PersonaToolRegistry';
import { Logger } from '../../core/logging/Logger';

Expand Down Expand Up @@ -70,6 +71,7 @@ const CODE_TOOL_GROUPS: readonly CodeToolGroup[] = [

export class CodeToolSource implements RAGSource {
readonly name = 'code-tools';
readonly tier = PromptTier.INVARIANT;
readonly priority = 50; // Medium — below conversation/widget, above learning config
readonly defaultBudgetPercent = 5;

Expand All @@ -84,7 +86,7 @@ export class CodeToolSource implements RAGSource {
return tools.some(t => t.name.startsWith('code/'));
}

async load(context: RAGSourceContext, allocatedBudget: number): Promise<RAGSection> {
async load(context: RAGSourceContext, allocatedBudget: number): Promise<Omit<RAGSection, 'tier'>> {
const startTime = performance.now();

try {
Expand Down Expand Up @@ -253,7 +255,7 @@ export class CodeToolSource implements RAGSource {
return tools.filter(t => t.name.startsWith('code/')).length;
}

private emptySection(startTime: number, error?: string): RAGSection {
private emptySection(startTime: number, error?: string): Omit<RAGSection, 'tier'> {
return {
sourceName: this.name,
tokenCount: 0,
Expand Down
4 changes: 3 additions & 1 deletion src/system/rag/sources/CodebaseSearchSource.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
*/

import type { RAGSource, RAGSourceContext, RAGSection } from '../shared/RAGSource';
import { PromptTier } from '../shared/RAGSource';
import { getCodebaseIndexer } from '../services/CodebaseIndexer';
import { Logger } from '../../core/logging/Logger';

Expand All @@ -29,6 +30,7 @@ const RELEVANCE_THRESHOLD = 0.35;

export class CodebaseSearchSource implements RAGSource {
readonly name = 'codebase-search';
readonly tier = PromptTier.VOLATILE;
readonly priority = 55;
readonly defaultBudgetPercent = 8;
readonly isShared = true;
Expand All @@ -43,7 +45,7 @@ export class CodebaseSearchSource implements RAGSource {
return currentMessage.length >= MIN_QUERY_LENGTH;
}

async load(context: RAGSourceContext, allocatedBudget: number): Promise<RAGSection> {
async load(context: RAGSourceContext, allocatedBudget: number): Promise<Omit<RAGSection, 'tier'>> {
const startTime = Date.now();
const query = context.options?.currentMessage?.content as string;

Expand Down
6 changes: 4 additions & 2 deletions src/system/rag/sources/ConversationHistorySource.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
*/

import type { RAGSource, RAGSourceContext, RAGSection } from '../shared/RAGSource';
import { PromptTier } from '../shared/RAGSource';
import type { LLMMessage } from '../shared/RAGTypes';
import { ORM } from '../../../daemons/data-daemon/server/ORM';
import { ChatMessageEntity, type MediaItem } from '../../data/entities/ChatMessageEntity';
Expand Down Expand Up @@ -145,6 +146,7 @@ interface InflightEntry {

export class ConversationHistorySource implements RAGSource {
readonly name = 'conversation-history';
readonly tier = PromptTier.SEMI_STABLE;
readonly priority = 80; // High - conversation is core context
readonly defaultBudgetPercent = 25; // Gets largest share of budget

Expand Down Expand Up @@ -224,7 +226,7 @@ export class ConversationHistorySource implements RAGSource {
return true;
}

async load(context: RAGSourceContext, allocatedBudget: number): Promise<RAGSection> {
async load(context: RAGSourceContext, allocatedBudget: number): Promise<Omit<RAGSection, 'tier'>> {
const startTime = performance.now();
ConversationHistorySource.initEventSubscription();

Expand Down Expand Up @@ -564,7 +566,7 @@ export class ConversationHistorySource implements RAGSource {
return [];
}

private emptySection(startTime: number, error?: string): RAGSection {
private emptySection(startTime: number, error?: string): Omit<RAGSection, 'tier'> {
return {
sourceName: this.name,
tokenCount: 0,
Expand Down
4 changes: 3 additions & 1 deletion src/system/rag/sources/DocumentationSource.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
*/

import type { RAGSource, RAGSourceContext, RAGSection } from '../shared/RAGSource';
import { PromptTier } from '../shared/RAGSource';
import { Logger } from '../../core/logging/Logger';
import * as fs from 'fs/promises';
import * as path from 'path';
Expand Down Expand Up @@ -45,6 +46,7 @@ const DOC_CHAPTERS: readonly Omit<DocChapter, 'count'>[] = [

export class DocumentationSource implements RAGSource {
readonly name = 'documentation';
readonly tier = PromptTier.INVARIANT;
readonly priority = 35;
readonly defaultBudgetPercent = 5;
readonly isShared = true;
Expand All @@ -62,7 +64,7 @@ export class DocumentationSource implements RAGSource {
return true;
}

async load(context: RAGSourceContext, allocatedBudget: number): Promise<RAGSection> {
async load(context: RAGSourceContext, allocatedBudget: number): Promise<Omit<RAGSection, 'tier'>> {
const startTime = performance.now();

try {
Expand Down
Loading
Loading