Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 40 additions & 19 deletions src/system/rag/builders/ChatRAGBuilder.ts
Original file line number Diff line number Diff line change
Expand Up @@ -317,21 +317,37 @@ export class ChatRAGBuilder extends RAGBuilder {
// 2.4. Inject RAG source context into system prompt — GENERIC LOOP
// Each RAGSource provides a systemPromptSection. We inject them all without
// knowing source names. Adding a new source requires ZERO changes here.
//
// Phase 1.5 (issue #918): the assembly order is enforced for stable
// byte-prefix prompts so llama-server / DMR can reuse KV cache. Order:
// 1. Identity systemPrompt (INVARIANT — already in finalIdentity)
// 2. Tool definitions (INVARIANT — moved here from end)
// 3. Loop iterates systemPromptSections in tier-sorted order
//       (Phase 1's RAGComposer sort guarantees Map iteration order is
//       INVARIANT → SEMI_STABLE → VOLATILE, alphabetical within each tier)
// 4. Human presence (VOLATILE — moved here from start)
// Volatile content lives only in the suffix; the INVARIANT prefix is
// byte-identical across thousands of turns for the same persona+recipe.
const finalIdentity = { ...identity };

// 2.4.1. Inject human presence awareness (which room each user is viewing)
// This is NOT a RAG source — it's lightweight synchronous state, always injected.
const allPresence = HumanPresenceTracker.allPresence;
if (allPresence.length > 0) {
const lines = allPresence.map(p => {
const viewingThis = p.roomId === contextId;
return `- ${p.displayName} is viewing: ${p.roomName}${viewingThis ? ' (this room — they can see your response in real-time)' : ''}`;
});
finalIdentity.systemPrompt = finalIdentity.systemPrompt +
`\n\n## HUMAN PRESENCE\n${lines.join('\n')}`;
// 2.4.1. Inject INVARIANT tool definitions FIRST (after identity).
// Tool definitions are INVARIANT per the source classification — they
// change only when the tool catalog itself changes, not per request.
// Putting them at the top of the prefix maximizes the byte-stable
// region that DMR can reuse.
const toolDefinitionsPrompt = systemPromptSections.get('tool-definitions');
let injectedCount = 0;
if (!isSmallContext && toolDefinitionsPrompt) {
finalIdentity.systemPrompt += toolDefinitionsPrompt;
injectedCount++;
this.log(`🔧 ChatRAGBuilder: Injected tool definitions (INVARIANT, byte-stable prefix region)`);
}

// 2.4.2. Inject all RAG source systemPromptSections generically
// 2.4.2. Inject all OTHER RAG source systemPromptSections in tier order.
//
// The Map iteration order matches the (tier, sourceName) sort that
// RAGComposer applied to result.sections in Phase 1 — Map preserves
// insertion order, and extractFromComposition inserts in that order.
//
// Sources with wrapper instructions — the section content gets wrapped with
// additional context instructions. Eventually these wrappers should move INTO
Expand All @@ -348,10 +364,9 @@ export class ChatRAGBuilder extends RAGBuilder {
// Codebase search is critical — if someone asks about code, they need the answer.
const ALWAYS_INJECT = new Set(['codebase-search']);

// Tool definitions are injected separately (native specs vs XML have different paths)
// Tool definitions already injected above; skip in the generic loop.
const SKIP_GENERIC = new Set(['tool-definitions']);

let injectedCount = 0;
for (const [sourceName, section] of systemPromptSections) {
if (SKIP_GENERIC.has(sourceName)) continue;
if (isSmallContext && !ALWAYS_INJECT.has(sourceName)) continue;
Expand All @@ -362,12 +377,18 @@ export class ChatRAGBuilder extends RAGBuilder {
this.log(`🔧 ChatRAGBuilder: Injected ${sourceName} into system prompt`);
}

// 2.4.3. Inject XML tool definitions for text-based providers (budget-aware via ToolDefinitionsSource)
const toolDefinitionsPrompt = systemPromptSections.get('tool-definitions');
if (!isSmallContext && toolDefinitionsPrompt) {
finalIdentity.systemPrompt += toolDefinitionsPrompt;
injectedCount++;
this.log(`🔧 ChatRAGBuilder: Injected tool definitions into system prompt (XML format)`);
// 2.4.3. Inject VOLATILE human presence LAST.
// HumanPresenceTracker is not a RAGSource but its content is volatile
// (changes when any user switches rooms). It must live in the suffix,
// never in the byte-stable prefix region.
const allPresence = HumanPresenceTracker.allPresence;
if (allPresence.length > 0) {
const lines = allPresence.map(p => {
const viewingThis = p.roomId === contextId;
return `- ${p.displayName} is viewing: ${p.roomName}${viewingThis ? ' (this room — they can see your response in real-time)' : ''}`;
});
finalIdentity.systemPrompt = finalIdentity.systemPrompt +
`\n\n## HUMAN PRESENCE\n${lines.join('\n')}`;
}

if (isSmallContext) {
Expand Down
21 changes: 19 additions & 2 deletions src/system/rag/services/CodebaseIndexer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,17 @@ const log = Logger.create('CodebaseIndexer', 'rag');
/** Maximum content length per chunk (chars). Longer chunks are split. */
const MAX_CHUNK_CHARS = 2000;

/** Batch size for embedding generation — one Rust IPC call per batch */
const EMBEDDING_BATCH_SIZE = 64;
/** Batch size for embedding generation — one Rust IPC call per batch.
 * Was 64; dropped to 16 because batches of 64 (~80MB RSS growth per batch)
 * saturated the event loop and starved chat for ~2min after every boot on M5.
 * 16 gives the Rust IPC pipeline roughly 4× more headroom to interleave with
 * persona inference. */
const EMBEDDING_BATCH_SIZE = 16;

/** Pause between batches (ms) to yield the event loop and let the Rust
* IPC pipeline drain. Without this, the indexer blocks chat and live for
* the full duration. 50ms is small enough to not visibly slow indexing
* but big enough that other IO can interleave. */
const EMBEDDING_BATCH_PAUSE_MS = 50;

/** File extensions to index */
const INDEXABLE_EXTENSIONS = new Set(['.ts', '.md', '.js']);
Expand Down Expand Up @@ -224,6 +233,14 @@ export class CodebaseIndexer {
log.error(`Embedding batch ${i}-${i + batch.length} failed: ${err}`);
errors.push({ file: `batch-${i}`, error: String(err) });
}

// Yield to other IO between batches. Without this, the indexer
// monopolises the event loop and chat/voice/personas all stall
// for the full indexing duration. Chat-arrival latency matters
// more than indexing throughput.
if (i + EMBEDDING_BATCH_SIZE < allChunks.length) {
await new Promise(resolve => setTimeout(resolve, EMBEDDING_BATCH_PAUSE_MS));
}
}

// Any write to code_index invalidates the in-memory query cache.
Expand Down
44 changes: 39 additions & 5 deletions src/system/rag/shared/RAGComposer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,15 @@
*/

import type { RAGSource, RAGSourceContext, RAGSection, RAGCompositionResult } from './RAGSource';
import { PromptTier } from './RAGTypes';

/** Sort key for tiers — smaller numbers concatenate first.
* INVARIANT before SEMI_STABLE before VOLATILE. See PromptTier docs. */
const TIER_ORDER: Record<PromptTier, number> = {
[PromptTier.INVARIANT]: 0,
[PromptTier.SEMI_STABLE]: 1,
[PromptTier.VOLATILE]: 2,
};
import type { RagSourceRequest, RagComposeResult, RagSourceResult } from '../../../shared/generated/rag';
import { Logger } from '../../core/logging/Logger';
import { TimingHarness } from '../../core/shared/TimingHarness';
Expand Down Expand Up @@ -230,6 +239,19 @@ export class RAGComposer {
failedSources.push({ source: result.source, error: result.error });
}
}
// Deterministic ordering: sections sorted by (tier, sourceName) so the
// assembled prompt's bytes are identical across requests with identical
// section contents. This is the prerequisite for llama-server / DMR
// prefix-KV-cache reuse — without stable ordering, the same logical
// prompt has different bytes per turn and the cache misses every time.
// Tier order: INVARIANT first, then SEMI_STABLE, then VOLATILE — see
// PromptTier in RAGTypes.ts and docs/architecture/MULTIMODAL-WORKER-AND-PREFIX-REUSE.md
sections.sort((a, b) => {
const tierOrder = TIER_ORDER[a.tier] - TIER_ORDER[b.tier];
if (tierOrder !== 0) return tierOrder;
// Within a tier: alphabetical by source name. Stable, total order.
return a.sourceName.localeCompare(b.sourceName);
});
timer.mark('collect_results');

// Log ALL source timings for performance diagnosis
Expand Down Expand Up @@ -316,8 +338,13 @@ export class RAGComposer {

if (rustResult.success) {
// Convert via source's fromBatchResult method
// Tier injection: same single-source-of-truth as TS path —
// the source's class declaration provides tier; we inject it
// here so the section conforms to RAGSection regardless of
// whether the source's fromBatchResult included it.
if (sourceInfo.source.fromBatchResult) {
const section = sourceInfo.source.fromBatchResult(rustResult, rustResult.load_time_ms);
const rawSection = sourceInfo.source.fromBatchResult(rustResult, rustResult.load_time_ms);
const section: RAGSection = { ...rawSection, tier: sourceInfo.source.tier };
results.push({
success: true,
section,
Expand All @@ -326,9 +353,11 @@ export class RAGComposer {
});
} else {
// Fallback: basic conversion
const rawSection = this.defaultFromBatchResult(sourceInfo.source.name, rustResult);
const section: RAGSection = { ...rawSection, tier: sourceInfo.source.tier };
results.push({
success: true,
section: this.defaultFromBatchResult(sourceInfo.source.name, rustResult),
section,
sourceName: sourceInfo.source.name,
loadTime: rustResult.load_time_ms
});
Expand Down Expand Up @@ -441,10 +470,14 @@ export class RAGComposer {
sourceTimer.setMeta('budget', budget);

try {
const section = await source.load(context, budget);
const rawSection = await source.load(context, budget);
sourceTimer.mark('load');
sourceTimer.setMeta('tokenCount', section.tokenCount);
sourceTimer.setMeta('tokenCount', rawSection.tokenCount);
const record = sourceTimer.finish();
// Inject tier from the source's declaration. Sources don't re-state
// their tier on every return; the class-level declaration is the
// single source of truth, applied here.
const section: RAGSection = { ...rawSection, tier: source.tier };
return { success: true, section, sourceName: source.name, loadTime: record.totalMs };
} catch (error: any) {
sourceTimer.setError(error.message);
Expand All @@ -457,8 +490,9 @@ export class RAGComposer {
/**
* Default conversion from Rust RagSourceResult to TypeScript RAGSection.
* Used when source doesn't implement fromBatchResult.
* Returns without `tier` — caller injects from the source's declaration.
*/
private defaultFromBatchResult(sourceName: string, result: RagSourceResult): RAGSection {
private defaultFromBatchResult(sourceName: string, result: RagSourceResult): Omit<RAGSection, 'tier'> {
// Combine all sections into a single content block
const content = result.sections
.map(s => s.content)
Expand Down
46 changes: 42 additions & 4 deletions src/system/rag/shared/RAGSource.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@

import type { UUID } from '../../core/types/CrossPlatformUUID';
import type { RAGBuildOptions, LLMMessage, RAGArtifact, PersonaMemory, PersonaIdentity, RecipeStrategy } from './RAGTypes';
import { PromptTier } from './RAGTypes';

// Re-export so source files only need one import
export { PromptTier } from './RAGTypes';
Comment on lines 21 to +25
Copy link

Copilot AI Apr 18, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The file imports and re-exports PromptTier as a runtime symbol (import { PromptTier } ... + export { PromptTier } ...), but PromptTier is a const enum (erased at emit). In ESM this can cause a hard runtime failure when the re-exported binding doesn’t exist. Safer options: (a) make PromptTier a normal enum, or (b) remove the runtime re-export and have consumers import type { PromptTier } / use string literals.

Suggested change
import type { RAGBuildOptions, LLMMessage, RAGArtifact, PersonaMemory, PersonaIdentity, RecipeStrategy } from './RAGTypes';
import { PromptTier } from './RAGTypes';
// Re-export so source files only need one import
export { PromptTier } from './RAGTypes';
import type { RAGBuildOptions, LLMMessage, RAGArtifact, PersonaMemory, PersonaIdentity, RecipeStrategy, PromptTier } from './RAGTypes';
// Re-export so source files only need one import
export type { PromptTier } from './RAGTypes';

Copilot uses AI. Check for mistakes.

/**
* Context passed to each RAGSource for loading
Expand Down Expand Up @@ -70,6 +74,9 @@ export interface RAGSourceContext {
export interface RAGSection {
/** Source that produced this section */
readonly sourceName: string;
/** Tier this section belongs to — drives stable-byte-prefix ordering.
* Mirrored from the producing source's declared tier. */
readonly tier: PromptTier;
/** Estimated token count */
readonly tokenCount: number;
/** Time taken to load (ms) */
Expand Down Expand Up @@ -105,6 +112,29 @@ export interface RAGSource {
*/
readonly priority: number;

/**
* Tier — INVARIANT / SEMI_STABLE / VOLATILE.
* Required. Drives stable-byte-prefix prompt assembly so llama-server
* reuses KV cache for the unchanging region instead of reprocessing
* the full prompt every turn.
*
* Classification rules:
* - INVARIANT — system prompt fragments, recipe rules, role identity,
* tool definitions. Bytes must be identical across thousands of turns
* for the same persona+recipe. NO timestamps, NO request IDs, NO
* per-request volatile data.
* - SEMI_STABLE — conversation history, memories, participants,
* governance. Grows monotonically — append-only relative to the
* previous turn. Earlier bytes never rewritten.
* - VOLATILE — current message, audio chunks, current timestamp,
* per-request observations. The only region the server reprocesses
* token-by-token.
*
* If you can't decide, the source probably mixes tiers and should be
* split into separate sources at the right granularity.
*/
readonly tier: PromptTier;

/**
* Default budget allocation as percentage (0-100).
* Total across all sources should roughly equal 100.
Expand All @@ -126,11 +156,16 @@ export interface RAGSource {
* Load data from this source.
* Called in parallel with other applicable sources.
*
* Returns the section without the `tier` field — RAGComposer injects
* the source's declared `tier` into the section after load completes.
* This keeps source implementations focused on what they produce
* rather than re-asserting their tier on every return.
*
* @param context - Context for loading
* @param allocatedBudget - Token budget allocated to this source
* @returns Section of RAG context
* @returns Section of RAG context (tier added by composer)
*/
load(context: RAGSourceContext, allocatedBudget: number): Promise<RAGSection>;
load(context: RAGSourceContext, allocatedBudget: number): Promise<Omit<RAGSection, 'tier'>>;

/**
* Whether this source produces identical results for all personas in the same room.
Expand Down Expand Up @@ -168,11 +203,14 @@ export interface RAGSource {
* Only called if supportsBatching is true.
* Transforms the typed Rust result into the RAGSection format.
*
* Returns the section without `tier` — RAGComposer injects the source's
* declared tier after conversion, same as the non-batched path.
*
* @param result - The result from Rust's rag/compose endpoint
* @param loadTimeMs - How long the load took
* @returns The RAGSection to include in the composition result
* @returns The RAGSection (without tier) to include in the composition result
*/
fromBatchResult?(result: RagSourceResult, loadTimeMs: number): RAGSection;
fromBatchResult?(result: RagSourceResult, loadTimeMs: number): Omit<RAGSection, 'tier'>;
}

// Re-export Rust-generated types for batch support
Expand Down
32 changes: 32 additions & 0 deletions src/system/rag/shared/RAGTypes.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,38 @@ import type { RecipeToolDeclaration } from '../../recipes/shared/RecipeTypes';
*/
export type RAGDomain = 'chat' | 'academy' | 'game' | 'code' | 'analysis';

/**
* Prompt tier — declares how often a RAG source's contribution changes between
* requests. Drives stable-byte-prefix prompt assembly so llama-server / vllm /
* DMR can reuse the KV cache for the invariant region instead of reprocessing
* the full prompt every turn.
*
* The contract: a section's bytes must be byte-identical across requests for
* sources at the same tier with the same inputs. INVARIANT and SEMI_STABLE
* sources MUST NOT contain timestamps, request IDs, or any per-request
* volatile data. Those go in VOLATILE only.
*
* Final assembly order is always: INVARIANT → SEMI_STABLE → VOLATILE.
* Within each tier, sources are sorted by name (alphabetical) so the byte
* order is fully deterministic.
Comment on lines +33 to +34
Copy link

Copilot AI Apr 18, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The PromptTier doc says “Within each tier, sources are sorted by name (alphabetical)”, but ChatRAGBuilder now explicitly hoists tool-definitions ahead of other INVARIANT sources. Either update this doc to reflect the actual consumer-side ordering rules, or move all ordering decisions into the composer so the documented contract matches the emitted prompt order.

Suggested change
* Within each tier, sources are sorted by name (alphabetical) so the byte
* order is fully deterministic.
* Within a tier, source order must be deterministic. Consumers may apply
* tier-specific ordering rules before any fallback alphabetical ordering; for
* example, `tool-definitions` is hoisted ahead of other INVARIANT sources.

Copilot uses AI. Check for mistakes.
*
* See: docs/architecture/MULTIMODAL-WORKER-AND-PREFIX-REUSE.md (Part 1)
*/
export const enum PromptTier {
/** Persona system prompt, recipe rules, role identity, tool definitions.
* Changes ~weekly when persona/recipe is edited. Identical bytes across
* thousands of turns for the same persona+recipe. */
INVARIANT = 'invariant',
/** Conversation history, active genome adapters, participants, governance
* state. Grows monotonically — new content APPENDS to the existing
* prefix, doesn't rewrite earlier bytes. */
SEMI_STABLE = 'semi_stable',
/** Latest user message, audio chunks, current timestamp, last-second
* pressure observations. Changes every request. The only region the
* server actually has to reprocess token-by-token. */
VOLATILE = 'volatile',
}
Comment on lines +38 to +51
Copy link

Copilot AI Apr 18, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

PromptTier is declared as a const enum, but it’s also imported/re-exported as a value (export { PromptTier } ...). Since const enums are erased in JS output, this can break ESM/bundler consumers with “module does not provide an export named 'PromptTier'”. Consider switching PromptTier to a normal export enum (or as const object + union type), or make the re-export/imports type-only and stop re-exporting it as a runtime value.

Copilot uses AI. Check for mistakes.

/**
* Model capabilities that affect RAG context building
* Determines how artifacts (images, etc.) are processed
Expand Down
4 changes: 3 additions & 1 deletion src/system/rag/sources/ActivityContextSource.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
*/

import type { RAGSource, RAGSection, RAGSourceContext } from '../shared/RAGSource';
import { PromptTier } from '../shared/RAGSource';
import type { RecipeStrategy } from '../shared/RAGTypes';
import type { RecipeToolDeclaration } from '../../recipes/shared/RecipeTypes';
import { ORM } from '../../../daemons/data-daemon/server/ORM';
Expand All @@ -23,6 +24,7 @@ import { isSlowLocalModel } from '../../shared/ModelContextWindows';
*/
export class ActivityContextSource implements RAGSource {
readonly name = 'activity';
readonly tier = PromptTier.VOLATILE;
readonly isShared = true;

// Medium priority - important for guided interactions
Expand All @@ -36,7 +38,7 @@ export class ActivityContextSource implements RAGSource {
return true;
}

async load(context: RAGSourceContext, allocatedBudget: number): Promise<RAGSection> {
async load(context: RAGSourceContext, allocatedBudget: number): Promise<Omit<RAGSection, 'tier'>> {
const startTime = Date.now();

try {
Expand Down
6 changes: 4 additions & 2 deletions src/system/rag/sources/CodeToolSource.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
*/

import type { RAGSource, RAGSourceContext, RAGSection } from '../shared/RAGSource';
import { PromptTier } from '../shared/RAGSource';
import { PersonaToolRegistry } from '../../user/server/modules/PersonaToolRegistry';
import { Logger } from '../../core/logging/Logger';

Expand Down Expand Up @@ -70,6 +71,7 @@ const CODE_TOOL_GROUPS: readonly CodeToolGroup[] = [

export class CodeToolSource implements RAGSource {
readonly name = 'code-tools';
readonly tier = PromptTier.INVARIANT;
readonly priority = 50; // Medium — below conversation/widget, above learning config
readonly defaultBudgetPercent = 5;

Expand All @@ -84,7 +86,7 @@ export class CodeToolSource implements RAGSource {
return tools.some(t => t.name.startsWith('code/'));
}

async load(context: RAGSourceContext, allocatedBudget: number): Promise<RAGSection> {
async load(context: RAGSourceContext, allocatedBudget: number): Promise<Omit<RAGSection, 'tier'>> {
const startTime = performance.now();

try {
Expand Down Expand Up @@ -253,7 +255,7 @@ export class CodeToolSource implements RAGSource {
return tools.filter(t => t.name.startsWith('code/')).length;
}

private emptySection(startTime: number, error?: string): RAGSection {
private emptySection(startTime: number, error?: string): Omit<RAGSection, 'tier'> {
return {
sourceName: this.name,
tokenCount: 0,
Expand Down
Loading
Loading