Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 40 additions & 19 deletions src/system/rag/builders/ChatRAGBuilder.ts
Original file line number Diff line number Diff line change
Expand Up @@ -317,21 +317,37 @@ export class ChatRAGBuilder extends RAGBuilder {
// 2.4. Inject RAG source context into system prompt — GENERIC LOOP
// Each RAGSource provides a systemPromptSection. We inject them all without
// knowing source names. Adding a new source requires ZERO changes here.
//
// Phase 1.5 (issue #918): the assembly order is enforced for stable
// byte-prefix prompts so llama-server / DMR can reuse KV cache. Order:
// 1. Identity systemPrompt (INVARIANT — already in finalIdentity)
// 2. Tool definitions (INVARIANT — moved here from end)
// 3. Loop iterates systemPromptSections in tier-sorted order
//       (Phase 1's RAGComposer sort guarantees Map iteration order is
//       INVARIANT → SEMI_STABLE → VOLATILE, alphabetical within each tier)
// 4. Human presence (VOLATILE — moved here from start)
// Volatile content lives only in the suffix; the INVARIANT prefix is
// byte-identical across thousands of turns for the same persona+recipe.
const finalIdentity = { ...identity };

// 2.4.1. Inject human presence awareness (which room each user is viewing)
// This is NOT a RAG source — it's lightweight synchronous state, always injected.
const allPresence = HumanPresenceTracker.allPresence;
if (allPresence.length > 0) {
const lines = allPresence.map(p => {
const viewingThis = p.roomId === contextId;
return `- ${p.displayName} is viewing: ${p.roomName}${viewingThis ? ' (this room — they can see your response in real-time)' : ''}`;
});
finalIdentity.systemPrompt = finalIdentity.systemPrompt +
`\n\n## HUMAN PRESENCE\n${lines.join('\n')}`;
// 2.4.1. Inject INVARIANT tool definitions FIRST (after identity).
// Tool definitions are INVARIANT per the source classification — they
// change only when the tool catalog itself changes, not per request.
// Putting them at the top of the prefix maximizes the byte-stable
// region that DMR can reuse.
const toolDefinitionsPrompt = systemPromptSections.get('tool-definitions');
let injectedCount = 0;
if (!isSmallContext && toolDefinitionsPrompt) {
finalIdentity.systemPrompt += toolDefinitionsPrompt;
injectedCount++;
this.log(`🔧 ChatRAGBuilder: Injected tool definitions (INVARIANT, byte-stable prefix region)`);
}

// 2.4.2. Inject all RAG source systemPromptSections generically
// 2.4.2. Inject all OTHER RAG source systemPromptSections in tier order.
//
// The Map iteration order matches the (tier, sourceName) sort that
// RAGComposer applied to result.sections in Phase 1 — Map preserves
// insertion order, and extractFromComposition inserts in that order.
//
// Sources with wrapper instructions — the section content gets wrapped with
// additional context instructions. Eventually these wrappers should move INTO
Expand All @@ -348,10 +364,9 @@ export class ChatRAGBuilder extends RAGBuilder {
// Codebase search is critical — if someone asks about code, they need the answer.
const ALWAYS_INJECT = new Set(['codebase-search']);

// Tool definitions are injected separately (native specs vs XML have different paths)
// Tool definitions already injected above; skip in the generic loop.
const SKIP_GENERIC = new Set(['tool-definitions']);

let injectedCount = 0;
for (const [sourceName, section] of systemPromptSections) {
if (SKIP_GENERIC.has(sourceName)) continue;
if (isSmallContext && !ALWAYS_INJECT.has(sourceName)) continue;
Expand All @@ -362,12 +377,18 @@ export class ChatRAGBuilder extends RAGBuilder {
this.log(`🔧 ChatRAGBuilder: Injected ${sourceName} into system prompt`);
}

// 2.4.3. Inject XML tool definitions for text-based providers (budget-aware via ToolDefinitionsSource)
const toolDefinitionsPrompt = systemPromptSections.get('tool-definitions');
if (!isSmallContext && toolDefinitionsPrompt) {
finalIdentity.systemPrompt += toolDefinitionsPrompt;
injectedCount++;
this.log(`🔧 ChatRAGBuilder: Injected tool definitions into system prompt (XML format)`);
// 2.4.3. Inject VOLATILE human presence LAST.
// HumanPresenceTracker is not a RAGSource but its content is volatile
// (changes when any user switches rooms). It must live in the suffix,
// never in the byte-stable prefix region.
const allPresence = HumanPresenceTracker.allPresence;
if (allPresence.length > 0) {
const lines = allPresence.map(p => {
const viewingThis = p.roomId === contextId;
return `- ${p.displayName} is viewing: ${p.roomName}${viewingThis ? ' (this room — they can see your response in real-time)' : ''}`;
});
finalIdentity.systemPrompt = finalIdentity.systemPrompt +
`\n\n## HUMAN PRESENCE\n${lines.join('\n')}`;
}

if (isSmallContext) {
Expand Down
21 changes: 19 additions & 2 deletions src/system/rag/services/CodebaseIndexer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,17 @@ const log = Logger.create('CodebaseIndexer', 'rag');
/** Maximum content length per chunk (chars). Longer chunks are split. */
const MAX_CHUNK_CHARS = 2000;

/** Batch size for embedding generation — one Rust IPC call per batch */
const EMBEDDING_BATCH_SIZE = 64;
/** Batch size for embedding generation — one Rust IPC call per batch.
 * Was 64; dropped to 16 because batches of 64 (~80MB RSS growth per batch)
 * saturated the event loop and starved chat for ~2min after every boot on M5.
 * 16 gives the Rust IPC pipeline roughly 4× more headroom to interleave with
 * persona inference. */
const EMBEDDING_BATCH_SIZE = 16;

/** Pause between batches (ms) to yield the event loop and let the Rust
* IPC pipeline drain. Without this, the indexer blocks chat and live for
* the full duration. 50ms is small enough to not visibly slow indexing
* but big enough that other IO can interleave. */
const EMBEDDING_BATCH_PAUSE_MS = 50;

/** File extensions to index */
const INDEXABLE_EXTENSIONS = new Set(['.ts', '.md', '.js']);
Expand Down Expand Up @@ -224,6 +233,14 @@ export class CodebaseIndexer {
log.error(`Embedding batch ${i}-${i + batch.length} failed: ${err}`);
errors.push({ file: `batch-${i}`, error: String(err) });
}

// Yield to other IO between batches. Without this, the indexer
// monopolises the event loop and chat/voice/personas all stall
// for the full indexing duration. Chat-arrival latency matters
// more than indexing throughput.
if (i + EMBEDDING_BATCH_SIZE < allChunks.length) {
await new Promise(resolve => setTimeout(resolve, EMBEDDING_BATCH_PAUSE_MS));
}
}

// Any write to code_index invalidates the in-memory query cache.
Expand Down
44 changes: 39 additions & 5 deletions src/system/rag/shared/RAGComposer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,15 @@
*/

import type { RAGSource, RAGSourceContext, RAGSection, RAGCompositionResult } from './RAGSource';
import { PromptTier } from './RAGTypes';

/** Sort key for tiers — smaller numbers concatenate first.
* INVARIANT before SEMI_STABLE before VOLATILE. See PromptTier docs. */
const TIER_ORDER: Record<PromptTier, number> = {
[PromptTier.INVARIANT]: 0,
[PromptTier.SEMI_STABLE]: 1,
[PromptTier.VOLATILE]: 2,
};
import type { RagSourceRequest, RagComposeResult, RagSourceResult } from '../../../shared/generated/rag';
import { Logger } from '../../core/logging/Logger';
import { TimingHarness } from '../../core/shared/TimingHarness';
Expand Down Expand Up @@ -230,6 +239,19 @@ export class RAGComposer {
failedSources.push({ source: result.source, error: result.error });
}
}
// Deterministic ordering: sections sorted by (tier, sourceName) so the
// assembled prompt's bytes are identical across requests with identical
// section contents. This is the prerequisite for llama-server / DMR
// prefix-KV-cache reuse — without stable ordering, the same logical
// prompt has different bytes per turn and the cache misses every time.
// Tier order: INVARIANT first, then SEMI_STABLE, then VOLATILE — see
// PromptTier in RAGTypes.ts and docs/architecture/MULTIMODAL-WORKER-AND-PREFIX-REUSE.md
sections.sort((a, b) => {
const tierOrder = TIER_ORDER[a.tier] - TIER_ORDER[b.tier];
if (tierOrder !== 0) return tierOrder;
// Within a tier: alphabetical by source name. Stable, total order.
return a.sourceName.localeCompare(b.sourceName);
});
timer.mark('collect_results');

// Log ALL source timings for performance diagnosis
Expand Down Expand Up @@ -316,8 +338,13 @@ export class RAGComposer {

if (rustResult.success) {
// Convert via source's fromBatchResult method
// Tier injection: same single-source-of-truth as TS path —
// the source's class declaration provides tier; we inject it
// here so the section conforms to RAGSection regardless of
// whether the source's fromBatchResult included it.
if (sourceInfo.source.fromBatchResult) {
const section = sourceInfo.source.fromBatchResult(rustResult, rustResult.load_time_ms);
const rawSection = sourceInfo.source.fromBatchResult(rustResult, rustResult.load_time_ms);
const section: RAGSection = { ...rawSection, tier: sourceInfo.source.tier };
results.push({
success: true,
section,
Expand All @@ -326,9 +353,11 @@ export class RAGComposer {
});
} else {
// Fallback: basic conversion
const rawSection = this.defaultFromBatchResult(sourceInfo.source.name, rustResult);
const section: RAGSection = { ...rawSection, tier: sourceInfo.source.tier };
results.push({
success: true,
section: this.defaultFromBatchResult(sourceInfo.source.name, rustResult),
section,
sourceName: sourceInfo.source.name,
loadTime: rustResult.load_time_ms
});
Expand Down Expand Up @@ -441,10 +470,14 @@ export class RAGComposer {
sourceTimer.setMeta('budget', budget);

try {
const section = await source.load(context, budget);
const rawSection = await source.load(context, budget);
sourceTimer.mark('load');
sourceTimer.setMeta('tokenCount', section.tokenCount);
sourceTimer.setMeta('tokenCount', rawSection.tokenCount);
const record = sourceTimer.finish();
// Inject tier from the source's declaration. Sources don't re-state
// their tier on every return; the class-level declaration is the
// single source of truth, applied here.
const section: RAGSection = { ...rawSection, tier: source.tier };
return { success: true, section, sourceName: source.name, loadTime: record.totalMs };
} catch (error: any) {
sourceTimer.setError(error.message);
Expand All @@ -457,8 +490,9 @@ export class RAGComposer {
/**
* Default conversion from Rust RagSourceResult to TypeScript RAGSection.
* Used when source doesn't implement fromBatchResult.
* Returns without `tier` — caller injects from the source's declaration.
*/
private defaultFromBatchResult(sourceName: string, result: RagSourceResult): RAGSection {
private defaultFromBatchResult(sourceName: string, result: RagSourceResult): Omit<RAGSection, 'tier'> {
// Combine all sections into a single content block
const content = result.sections
.map(s => s.content)
Expand Down
46 changes: 42 additions & 4 deletions src/system/rag/shared/RAGSource.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@

import type { UUID } from '../../core/types/CrossPlatformUUID';
import type { RAGBuildOptions, LLMMessage, RAGArtifact, PersonaMemory, PersonaIdentity, RecipeStrategy } from './RAGTypes';
import { PromptTier } from './RAGTypes';

// Re-export so source files only need one import
export { PromptTier } from './RAGTypes';
Comment on lines 21 to +25
Copy link

Copilot AI Apr 18, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The file imports and re-exports PromptTier as a runtime symbol (import { PromptTier } ... + export { PromptTier } ...), but PromptTier is a const enum (erased at emit). In ESM this can cause a hard runtime failure when the re-exported binding doesn’t exist. Safer options: (a) make PromptTier a normal enum, or (b) remove the runtime re-export and have consumers import type { PromptTier } / use string literals.

Suggested change
import type { RAGBuildOptions, LLMMessage, RAGArtifact, PersonaMemory, PersonaIdentity, RecipeStrategy } from './RAGTypes';
import { PromptTier } from './RAGTypes';
// Re-export so source files only need one import
export { PromptTier } from './RAGTypes';
import type { RAGBuildOptions, LLMMessage, RAGArtifact, PersonaMemory, PersonaIdentity, RecipeStrategy, PromptTier } from './RAGTypes';
// Re-export so source files only need one import
export type { PromptTier } from './RAGTypes';

Copilot uses AI. Check for mistakes.

/**
* Context passed to each RAGSource for loading
Expand Down Expand Up @@ -70,6 +74,9 @@ export interface RAGSourceContext {
export interface RAGSection {
/** Source that produced this section */
readonly sourceName: string;
/** Tier this section belongs to — drives stable-byte-prefix ordering.
* Mirrored from the producing source's declared tier. */
readonly tier: PromptTier;
/** Estimated token count */
readonly tokenCount: number;
/** Time taken to load (ms) */
Expand Down Expand Up @@ -105,6 +112,29 @@ export interface RAGSource {
*/
readonly priority: number;

/**
* Tier — INVARIANT / SEMI_STABLE / VOLATILE.
* Required. Drives stable-byte-prefix prompt assembly so llama-server
* reuses KV cache for the unchanging region instead of reprocessing
* the full prompt every turn.
*
* Classification rules:
* - INVARIANT — system prompt fragments, recipe rules, role identity,
* tool definitions. Bytes must be identical across thousands of turns
* for the same persona+recipe. NO timestamps, NO request IDs, NO
* per-request volatile data.
* - SEMI_STABLE — conversation history, memories, participants,
* governance. Grows monotonically — append-only relative to the
* previous turn. Earlier bytes never rewritten.
* - VOLATILE — current message, audio chunks, current timestamp,
* per-request observations. The only region the server reprocesses
* token-by-token.
*
* If you can't decide, the source probably mixes tiers and should be
* split into separate sources at the right granularity.
*/
readonly tier: PromptTier;

/**
* Default budget allocation as percentage (0-100).
* Total across all sources should roughly equal 100.
Expand All @@ -126,11 +156,16 @@ export interface RAGSource {
* Load data from this source.
* Called in parallel with other applicable sources.
*
* Returns the section without the `tier` field — RAGComposer injects
* the source's declared `tier` into the section after load completes.
* This keeps source implementations focused on what they produce
* rather than re-asserting their tier on every return.
*
* @param context - Context for loading
* @param allocatedBudget - Token budget allocated to this source
* @returns Section of RAG context
* @returns Section of RAG context (tier added by composer)
*/
load(context: RAGSourceContext, allocatedBudget: number): Promise<RAGSection>;
load(context: RAGSourceContext, allocatedBudget: number): Promise<Omit<RAGSection, 'tier'>>;

/**
* Whether this source produces identical results for all personas in the same room.
Expand Down Expand Up @@ -168,11 +203,14 @@ export interface RAGSource {
* Only called if supportsBatching is true.
* Transforms the typed Rust result into the RAGSection format.
*
* Returns the section without `tier` — RAGComposer injects the source's
* declared tier after conversion, same as the non-batched path.
*
* @param result - The result from Rust's rag/compose endpoint
* @param loadTimeMs - How long the load took
* @returns The RAGSection to include in the composition result
* @returns The RAGSection (without tier) to include in the composition result
*/
fromBatchResult?(result: RagSourceResult, loadTimeMs: number): RAGSection;
fromBatchResult?(result: RagSourceResult, loadTimeMs: number): Omit<RAGSection, 'tier'>;
}

// Re-export Rust-generated types for batch support
Expand Down
32 changes: 32 additions & 0 deletions src/system/rag/shared/RAGTypes.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,38 @@ import type { RecipeToolDeclaration } from '../../recipes/shared/RecipeTypes';
*/
export type RAGDomain = 'chat' | 'academy' | 'game' | 'code' | 'analysis';

/**
* Prompt tier — declares how often a RAG source's contribution changes between
* requests. Drives stable-byte-prefix prompt assembly so llama-server / vllm /
* DMR can reuse the KV cache for the invariant region instead of reprocessing
* the full prompt every turn.
*
* The contract: a section's bytes must be byte-identical across requests for
* sources at the same tier with the same inputs. INVARIANT and SEMI_STABLE
* sources MUST NOT contain timestamps, request IDs, or any per-request
* volatile data. Those go in VOLATILE only.
*
* Final assembly order is always: INVARIANT → SEMI_STABLE → VOLATILE.
* Within each tier, sources are sorted by name (alphabetical) so the byte
* order is fully deterministic.
Comment on lines +33 to +34
Copy link

Copilot AI Apr 18, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The PromptTier doc says “Within each tier, sources are sorted by name (alphabetical)”, but ChatRAGBuilder now explicitly hoists tool-definitions ahead of other INVARIANT sources. Either update this doc to reflect the actual consumer-side ordering rules, or move all ordering decisions into the composer so the documented contract matches the emitted prompt order.

Suggested change
* Within each tier, sources are sorted by name (alphabetical) so the byte
* order is fully deterministic.
* Within a tier, source order must be deterministic. Consumers may apply
* tier-specific ordering rules before any fallback alphabetical ordering; for
* example, `tool-definitions` is hoisted ahead of other INVARIANT sources.

Copilot uses AI. Check for mistakes.
*
* See: docs/architecture/MULTIMODAL-WORKER-AND-PREFIX-REUSE.md (Part 1)
*/
export const enum PromptTier {
/** Persona system prompt, recipe rules, role identity, tool definitions.
* Changes ~weekly when persona/recipe is edited. Identical bytes across
* thousands of turns for the same persona+recipe. */
INVARIANT = 'invariant',
/** Conversation history, active genome adapters, participants, governance
* state. Grows monotonically — new content APPENDS to the existing
* prefix, doesn't rewrite earlier bytes. */
SEMI_STABLE = 'semi_stable',
/** Latest user message, audio chunks, current timestamp, last-second
* pressure observations. Changes every request. The only region the
* server actually has to reprocess token-by-token. */
VOLATILE = 'volatile',
}
Comment on lines +38 to +51
Copy link

Copilot AI Apr 18, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

PromptTier is declared as a const enum, but it’s also imported/re-exported as a value (export { PromptTier } ...). Since const enums are erased in JS output, this can break ESM/bundler consumers with “module does not provide an export named 'PromptTier'”. Consider switching PromptTier to a normal export enum (or as const object + union type), or make the re-export/imports type-only and stop re-exporting it as a runtime value.

Copilot uses AI. Check for mistakes.

/**
* Model capabilities that affect RAG context building
* Determines how artifacts (images, etc.) are processed
Expand Down
4 changes: 3 additions & 1 deletion src/system/rag/sources/ActivityContextSource.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
*/

import type { RAGSource, RAGSection, RAGSourceContext } from '../shared/RAGSource';
import { PromptTier } from '../shared/RAGSource';
import type { RecipeStrategy } from '../shared/RAGTypes';
import type { RecipeToolDeclaration } from '../../recipes/shared/RecipeTypes';
import { ORM } from '../../../daemons/data-daemon/server/ORM';
Expand All @@ -23,6 +24,7 @@ import { isSlowLocalModel } from '../../shared/ModelContextWindows';
*/
export class ActivityContextSource implements RAGSource {
readonly name = 'activity';
readonly tier = PromptTier.VOLATILE;
readonly isShared = true;

// Medium priority - important for guided interactions
Expand All @@ -36,7 +38,7 @@ export class ActivityContextSource implements RAGSource {
return true;
}

async load(context: RAGSourceContext, allocatedBudget: number): Promise<RAGSection> {
async load(context: RAGSourceContext, allocatedBudget: number): Promise<Omit<RAGSection, 'tier'>> {
const startTime = Date.now();

try {
Expand Down
6 changes: 4 additions & 2 deletions src/system/rag/sources/CodeToolSource.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
*/

import type { RAGSource, RAGSourceContext, RAGSection } from '../shared/RAGSource';
import { PromptTier } from '../shared/RAGSource';
import { PersonaToolRegistry } from '../../user/server/modules/PersonaToolRegistry';
import { Logger } from '../../core/logging/Logger';

Expand Down Expand Up @@ -70,6 +71,7 @@ const CODE_TOOL_GROUPS: readonly CodeToolGroup[] = [

export class CodeToolSource implements RAGSource {
readonly name = 'code-tools';
readonly tier = PromptTier.INVARIANT;
readonly priority = 50; // Medium — below conversation/widget, above learning config
readonly defaultBudgetPercent = 5;

Expand All @@ -84,7 +86,7 @@ export class CodeToolSource implements RAGSource {
return tools.some(t => t.name.startsWith('code/'));
}

async load(context: RAGSourceContext, allocatedBudget: number): Promise<RAGSection> {
async load(context: RAGSourceContext, allocatedBudget: number): Promise<Omit<RAGSection, 'tier'>> {
const startTime = performance.now();

try {
Expand Down Expand Up @@ -253,7 +255,7 @@ export class CodeToolSource implements RAGSource {
return tools.filter(t => t.name.startsWith('code/')).length;
}

private emptySection(startTime: number, error?: string): RAGSection {
private emptySection(startTime: number, error?: string): Omit<RAGSection, 'tier'> {
return {
sourceName: this.name,
tokenCount: 0,
Expand Down
Loading
Loading