Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 19 additions & 2 deletions src/system/rag/services/CodebaseIndexer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,17 @@ const log = Logger.create('CodebaseIndexer', 'rag');
/** Maximum content length per chunk (chars). Longer chunks are split. */
const MAX_CHUNK_CHARS = 2000;

/** Batch size for embedding generation — one Rust IPC call per batch */
const EMBEDDING_BATCH_SIZE = 64;
/** Batch size for embedding generation — one Rust IPC call per batch.
* Was 64; dropped to 16 because 64 × ~80MB-per-batch RSS growth saturated
* the event loop and starved chat for ~2min after every boot on M5.
* 16 gives Rust IPC the ~4× headroom to interleave with persona inference. */
const EMBEDDING_BATCH_SIZE = 16;

/** Pause between batches (ms) to yield the event loop and let the Rust
* IPC pipeline drain. Without this, the indexer blocks chat and live for
* the full duration. 50ms is small enough to not visibly slow indexing
* but big enough that other IO can interleave. */
const EMBEDDING_BATCH_PAUSE_MS = 50;

/** File extensions to index */
const INDEXABLE_EXTENSIONS = new Set(['.ts', '.md', '.js']);
Expand Down Expand Up @@ -224,6 +233,14 @@ export class CodebaseIndexer {
log.error(`Embedding batch ${i}-${i + batch.length} failed: ${err}`);
errors.push({ file: `batch-${i}`, error: String(err) });
}

// Yield to other IO between batches. Without this, the indexer
// monopolises the event loop and chat/voice/personas all stall
// for the full indexing duration. Chat-arrival latency matters
// more than indexing throughput.
if (i + EMBEDDING_BATCH_SIZE < allChunks.length) {
await new Promise(resolve => setTimeout(resolve, EMBEDDING_BATCH_PAUSE_MS));
}
}

// Any write to code_index invalidates the in-memory query cache.
Expand Down
44 changes: 39 additions & 5 deletions src/system/rag/shared/RAGComposer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,15 @@
*/

import type { RAGSource, RAGSourceContext, RAGSection, RAGCompositionResult } from './RAGSource';
import { PromptTier } from './RAGTypes';

/** Sort key for tiers — smaller numbers concatenate first in the assembled
 * prompt: INVARIANT before SEMI_STABLE before VOLATILE. Consumed by the
 * `sections.sort(...)` comparator in this file so prompt byte order is
 * fully deterministic (the prerequisite for prefix-KV-cache reuse).
 * Typed as Record<PromptTier, number> so adding a new tier to PromptTier
 * without ranking it here is a compile-time error. See PromptTier docs. */
const TIER_ORDER: Record<PromptTier, number> = {
  [PromptTier.INVARIANT]: 0,
  [PromptTier.SEMI_STABLE]: 1,
  [PromptTier.VOLATILE]: 2,
};
import type { RagSourceRequest, RagComposeResult, RagSourceResult } from '../../../shared/generated/rag';
import { Logger } from '../../core/logging/Logger';
import { TimingHarness } from '../../core/shared/TimingHarness';
Expand Down Expand Up @@ -230,6 +239,19 @@ export class RAGComposer {
failedSources.push({ source: result.source, error: result.error });
}
}
// Deterministic ordering: sections sorted by (tier, sourceName) so the
// assembled prompt's bytes are identical across requests with identical
// section contents. This is the prerequisite for llama-server / DMR
// prefix-KV-cache reuse — without stable ordering, the same logical
// prompt has different bytes per turn and the cache misses every time.
// Tier order: INVARIANT first, then SEMI_STABLE, then VOLATILE — see
// PromptTier in RAGTypes.ts and docs/architecture/MULTIMODAL-WORKER-AND-PREFIX-REUSE.md
sections.sort((a, b) => {
const tierOrder = TIER_ORDER[a.tier] - TIER_ORDER[b.tier];
if (tierOrder !== 0) return tierOrder;
// Within a tier: alphabetical by source name. Stable, total order.
return a.sourceName.localeCompare(b.sourceName);
});
timer.mark('collect_results');

// Log ALL source timings for performance diagnosis
Expand Down Expand Up @@ -316,8 +338,13 @@ export class RAGComposer {

if (rustResult.success) {
// Convert via source's fromBatchResult method
// Tier injection: same single-source-of-truth as TS path —
// the source's class declaration provides tier; we inject it
// here so the section conforms to RAGSection regardless of
// whether the source's fromBatchResult included it.
if (sourceInfo.source.fromBatchResult) {
const section = sourceInfo.source.fromBatchResult(rustResult, rustResult.load_time_ms);
const rawSection = sourceInfo.source.fromBatchResult(rustResult, rustResult.load_time_ms);
const section: RAGSection = { ...rawSection, tier: sourceInfo.source.tier };
results.push({
success: true,
section,
Expand All @@ -326,9 +353,11 @@ export class RAGComposer {
});
} else {
// Fallback: basic conversion
const rawSection = this.defaultFromBatchResult(sourceInfo.source.name, rustResult);
const section: RAGSection = { ...rawSection, tier: sourceInfo.source.tier };
results.push({
success: true,
section: this.defaultFromBatchResult(sourceInfo.source.name, rustResult),
section,
sourceName: sourceInfo.source.name,
loadTime: rustResult.load_time_ms
});
Expand Down Expand Up @@ -441,10 +470,14 @@ export class RAGComposer {
sourceTimer.setMeta('budget', budget);

try {
const section = await source.load(context, budget);
const rawSection = await source.load(context, budget);
sourceTimer.mark('load');
sourceTimer.setMeta('tokenCount', section.tokenCount);
sourceTimer.setMeta('tokenCount', rawSection.tokenCount);
const record = sourceTimer.finish();
// Inject tier from the source's declaration. Sources don't re-state
// their tier on every return; the class-level declaration is the
// single source of truth, applied here.
const section: RAGSection = { ...rawSection, tier: source.tier };
return { success: true, section, sourceName: source.name, loadTime: record.totalMs };
} catch (error: any) {
sourceTimer.setError(error.message);
Expand All @@ -457,8 +490,9 @@ export class RAGComposer {
/**
* Default conversion from Rust RagSourceResult to TypeScript RAGSection.
* Used when source doesn't implement fromBatchResult.
* Returns without `tier` — caller injects from the source's declaration.
*/
private defaultFromBatchResult(sourceName: string, result: RagSourceResult): RAGSection {
private defaultFromBatchResult(sourceName: string, result: RagSourceResult): Omit<RAGSection, 'tier'> {
// Combine all sections into a single content block
const content = result.sections
.map(s => s.content)
Expand Down
46 changes: 42 additions & 4 deletions src/system/rag/shared/RAGSource.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@

import type { UUID } from '../../core/types/CrossPlatformUUID';
import type { RAGBuildOptions, LLMMessage, RAGArtifact, PersonaMemory, PersonaIdentity, RecipeStrategy } from './RAGTypes';
import { PromptTier } from './RAGTypes';

// Re-export so source files only need one import
export { PromptTier } from './RAGTypes';
Comment on lines +24 to +25
Copy link

Copilot AI Apr 17, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

PromptTier is declared as a const enum (erased at emit), but this file re-exports it as a runtime export (export { PromptTier } from './RAGTypes'). With module: ES2020, this can produce a runtime ESM error because ./RAGTypes will not actually export PromptTier. Fix by either (a) making PromptTier a non-const enum (or a const PromptTier = {...} as const object) so it exists at runtime, or (b) removing this re-export and importing PromptTier directly from RAGTypes in the sources.

Suggested change
// Re-export so source files only need one import
export { PromptTier } from './RAGTypes';
// Keep PromptTier imported for use within this file; do not re-export it here
// because `const enum` members are erased at emit and are not safe runtime ESM exports.

Copilot uses AI. Check for mistakes.

/**
* Context passed to each RAGSource for loading
Expand Down Expand Up @@ -70,6 +74,9 @@ export interface RAGSourceContext {
export interface RAGSection {
/** Source that produced this section */
readonly sourceName: string;
/** Tier this section belongs to — drives stable-byte-prefix ordering.
* Mirrored from the producing source's declared tier. */
readonly tier: PromptTier;
/** Estimated token count */
readonly tokenCount: number;
/** Time taken to load (ms) */
Expand Down Expand Up @@ -105,6 +112,29 @@ export interface RAGSource {
*/
readonly priority: number;

/**
* Tier — INVARIANT / SEMI_STABLE / VOLATILE.
* Required. Drives stable-byte-prefix prompt assembly so llama-server
* reuses KV cache for the unchanging region instead of reprocessing
* the full prompt every turn.
*
* Classification rules:
* - INVARIANT — system prompt fragments, recipe rules, role identity,
* tool definitions. Bytes must be identical across thousands of turns
* for the same persona+recipe. NO timestamps, NO request IDs, NO
* per-request volatile data.
* - SEMI_STABLE — conversation history, memories, participants,
* governance. Grows monotonically — append-only relative to the
* previous turn. Earlier bytes never rewritten.
* - VOLATILE — current message, audio chunks, current timestamp,
* per-request observations. The only region the server reprocesses
* token-by-token.
*
* If you can't decide, the source probably mixes tiers and should be
* split into separate sources at the right granularity.
*/
readonly tier: PromptTier;

/**
* Default budget allocation as percentage (0-100).
* Total across all sources should roughly equal 100.
Expand All @@ -126,11 +156,16 @@ export interface RAGSource {
* Load data from this source.
* Called in parallel with other applicable sources.
*
* Returns the section without the `tier` field — RAGComposer injects
* the source's declared `tier` into the section after load completes.
* This keeps source implementations focused on what they produce
* rather than re-asserting their tier on every return.
*
* @param context - Context for loading
* @param allocatedBudget - Token budget allocated to this source
* @returns Section of RAG context
* @returns Section of RAG context (tier added by composer)
*/
load(context: RAGSourceContext, allocatedBudget: number): Promise<RAGSection>;
load(context: RAGSourceContext, allocatedBudget: number): Promise<Omit<RAGSection, 'tier'>>;

/**
* Whether this source produces identical results for all personas in the same room.
Expand Down Expand Up @@ -168,11 +203,14 @@ export interface RAGSource {
* Only called if supportsBatching is true.
* Transforms the typed Rust result into the RAGSection format.
*
* Returns the section without `tier` — RAGComposer injects the source's
* declared tier after conversion, same as the non-batched path.
*
* @param result - The result from Rust's rag/compose endpoint
* @param loadTimeMs - How long the load took
* @returns The RAGSection to include in the composition result
* @returns The RAGSection (without tier) to include in the composition result
*/
fromBatchResult?(result: RagSourceResult, loadTimeMs: number): RAGSection;
fromBatchResult?(result: RagSourceResult, loadTimeMs: number): Omit<RAGSection, 'tier'>;
}

// Re-export Rust-generated types for batch support
Expand Down
32 changes: 32 additions & 0 deletions src/system/rag/shared/RAGTypes.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,38 @@ import type { RecipeToolDeclaration } from '../../recipes/shared/RecipeTypes';
*/
export type RAGDomain = 'chat' | 'academy' | 'game' | 'code' | 'analysis';

/**
 * Prompt tier — declares how often a RAG source's contribution changes between
 * requests. Drives stable-byte-prefix prompt assembly so llama-server / vllm /
 * DMR can reuse the KV cache for the invariant region instead of reprocessing
 * the full prompt every turn.
 *
 * The contract: a section's bytes must be byte-identical across requests for
 * sources at the same tier with the same inputs. INVARIANT and SEMI_STABLE
 * sources MUST NOT contain timestamps, request IDs, or any per-request
 * volatile data. Those go in VOLATILE only.
 *
 * Final assembly order is always: INVARIANT → SEMI_STABLE → VOLATILE.
 * Within each tier, sources are sorted by name (alphabetical) so the byte
 * order is fully deterministic.
 *
 * NOTE: deliberately a regular (non-const) string enum, NOT `const enum`.
 * `const enum` members are erased at emit, so the runtime re-export in
 * RAGSource.ts (`export { PromptTier } from './RAGTypes'`) would reference
 * a binding that does not exist in the emitted ESM module, and `const enum`
 * is also incompatible with `isolatedModules` builds. A runtime enum keeps
 * both the value export and the nominal type intact.
 *
 * See: docs/architecture/MULTIMODAL-WORKER-AND-PREFIX-REUSE.md (Part 1)
 */
export enum PromptTier {
  /** Persona system prompt, recipe rules, role identity, tool definitions.
   * Changes ~weekly when persona/recipe is edited. Identical bytes across
   * thousands of turns for the same persona+recipe. */
  INVARIANT = 'invariant',
  /** Conversation history, active genome adapters, participants, governance
   * state. Grows monotonically — new content APPENDS to the existing
   * prefix, doesn't rewrite earlier bytes. */
  SEMI_STABLE = 'semi_stable',
  /** Latest user message, audio chunks, current timestamp, last-second
   * pressure observations. Changes every request. The only region the
   * server actually has to reprocess token-by-token. */
  VOLATILE = 'volatile',
}

/**
* Model capabilities that affect RAG context building
* Determines how artifacts (images, etc.) are processed
Expand Down
4 changes: 3 additions & 1 deletion src/system/rag/sources/ActivityContextSource.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
*/

import type { RAGSource, RAGSection, RAGSourceContext } from '../shared/RAGSource';
import { PromptTier } from '../shared/RAGSource';
import type { RecipeStrategy } from '../shared/RAGTypes';
import type { RecipeToolDeclaration } from '../../recipes/shared/RecipeTypes';
import { ORM } from '../../../daemons/data-daemon/server/ORM';
Expand All @@ -23,6 +24,7 @@ import { isSlowLocalModel } from '../../shared/ModelContextWindows';
*/
export class ActivityContextSource implements RAGSource {
readonly name = 'activity';
readonly tier = PromptTier.VOLATILE;
readonly isShared = true;

// Medium priority - important for guided interactions
Expand All @@ -36,7 +38,7 @@ export class ActivityContextSource implements RAGSource {
return true;
}

async load(context: RAGSourceContext, allocatedBudget: number): Promise<RAGSection> {
async load(context: RAGSourceContext, allocatedBudget: number): Promise<Omit<RAGSection, 'tier'>> {
const startTime = Date.now();

try {
Expand Down
6 changes: 4 additions & 2 deletions src/system/rag/sources/CodeToolSource.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
*/

import type { RAGSource, RAGSourceContext, RAGSection } from '../shared/RAGSource';
import { PromptTier } from '../shared/RAGSource';
import { PersonaToolRegistry } from '../../user/server/modules/PersonaToolRegistry';
import { Logger } from '../../core/logging/Logger';

Expand Down Expand Up @@ -70,6 +71,7 @@ const CODE_TOOL_GROUPS: readonly CodeToolGroup[] = [

export class CodeToolSource implements RAGSource {
readonly name = 'code-tools';
readonly tier = PromptTier.INVARIANT;
readonly priority = 50; // Medium — below conversation/widget, above learning config
readonly defaultBudgetPercent = 5;

Expand All @@ -84,7 +86,7 @@ export class CodeToolSource implements RAGSource {
return tools.some(t => t.name.startsWith('code/'));
}

async load(context: RAGSourceContext, allocatedBudget: number): Promise<RAGSection> {
async load(context: RAGSourceContext, allocatedBudget: number): Promise<Omit<RAGSection, 'tier'>> {
const startTime = performance.now();

try {
Expand Down Expand Up @@ -253,7 +255,7 @@ export class CodeToolSource implements RAGSource {
return tools.filter(t => t.name.startsWith('code/')).length;
}

private emptySection(startTime: number, error?: string): RAGSection {
private emptySection(startTime: number, error?: string): Omit<RAGSection, 'tier'> {
return {
sourceName: this.name,
tokenCount: 0,
Expand Down
4 changes: 3 additions & 1 deletion src/system/rag/sources/CodebaseSearchSource.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
*/

import type { RAGSource, RAGSourceContext, RAGSection } from '../shared/RAGSource';
import { PromptTier } from '../shared/RAGSource';
import { getCodebaseIndexer } from '../services/CodebaseIndexer';
import { Logger } from '../../core/logging/Logger';

Expand All @@ -29,6 +30,7 @@ const RELEVANCE_THRESHOLD = 0.35;

export class CodebaseSearchSource implements RAGSource {
readonly name = 'codebase-search';
readonly tier = PromptTier.VOLATILE;
readonly priority = 55;
readonly defaultBudgetPercent = 8;
readonly isShared = true;
Expand All @@ -43,7 +45,7 @@ export class CodebaseSearchSource implements RAGSource {
return currentMessage.length >= MIN_QUERY_LENGTH;
}

async load(context: RAGSourceContext, allocatedBudget: number): Promise<RAGSection> {
async load(context: RAGSourceContext, allocatedBudget: number): Promise<Omit<RAGSection, 'tier'>> {
const startTime = Date.now();
const query = context.options?.currentMessage?.content as string;

Expand Down
6 changes: 4 additions & 2 deletions src/system/rag/sources/ConversationHistorySource.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
*/

import type { RAGSource, RAGSourceContext, RAGSection } from '../shared/RAGSource';
import { PromptTier } from '../shared/RAGSource';
import type { LLMMessage } from '../shared/RAGTypes';
import { ORM } from '../../../daemons/data-daemon/server/ORM';
import { ChatMessageEntity, type MediaItem } from '../../data/entities/ChatMessageEntity';
Expand Down Expand Up @@ -145,6 +146,7 @@ interface InflightEntry {

export class ConversationHistorySource implements RAGSource {
readonly name = 'conversation-history';
readonly tier = PromptTier.SEMI_STABLE;
readonly priority = 80; // High - conversation is core context
readonly defaultBudgetPercent = 25; // Gets largest share of budget

Expand Down Expand Up @@ -224,7 +226,7 @@ export class ConversationHistorySource implements RAGSource {
return true;
}

async load(context: RAGSourceContext, allocatedBudget: number): Promise<RAGSection> {
async load(context: RAGSourceContext, allocatedBudget: number): Promise<Omit<RAGSection, 'tier'>> {
const startTime = performance.now();
ConversationHistorySource.initEventSubscription();

Expand Down Expand Up @@ -564,7 +566,7 @@ export class ConversationHistorySource implements RAGSource {
return [];
}

private emptySection(startTime: number, error?: string): RAGSection {
private emptySection(startTime: number, error?: string): Omit<RAGSection, 'tier'> {
return {
sourceName: this.name,
tokenCount: 0,
Expand Down
4 changes: 3 additions & 1 deletion src/system/rag/sources/DocumentationSource.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
*/

import type { RAGSource, RAGSourceContext, RAGSection } from '../shared/RAGSource';
import { PromptTier } from '../shared/RAGSource';
import { Logger } from '../../core/logging/Logger';
import * as fs from 'fs/promises';
import * as path from 'path';
Expand Down Expand Up @@ -45,6 +46,7 @@ const DOC_CHAPTERS: readonly Omit<DocChapter, 'count'>[] = [

export class DocumentationSource implements RAGSource {
readonly name = 'documentation';
readonly tier = PromptTier.INVARIANT;
readonly priority = 35;
readonly defaultBudgetPercent = 5;
readonly isShared = true;
Expand All @@ -62,7 +64,7 @@ export class DocumentationSource implements RAGSource {
return true;
}

async load(context: RAGSourceContext, allocatedBudget: number): Promise<RAGSection> {
async load(context: RAGSourceContext, allocatedBudget: number): Promise<Omit<RAGSection, 'tier'>> {
const startTime = performance.now();

try {
Expand Down
Loading
Loading