diff --git a/app/L0/_all/mod/_core/agent_prompt/AGENTS.md b/app/L0/_all/mod/_core/agent_prompt/AGENTS.md index fd4a6202..a31f9c35 100644 --- a/app/L0/_all/mod/_core/agent_prompt/AGENTS.md +++ b/app/L0/_all/mod/_core/agent_prompt/AGENTS.md @@ -26,6 +26,7 @@ Current shared runtime contract: - normalized prompt items should cache `valueTokenCount` alongside the normalized string value so repeated prompt builds can reuse tokenizer results for the same item body - prompt-budget ratios are stored as percentages of the configured model `maxTokens`, with `system`, `history`, and `transient` required to total 100 while `singleMessage` is a separate percentage of the history budget - long prompt contributors may be trimmed through the shared middle-replacement placeholder emitted by `trimPromptLongMessage(...)`; the placeholder must keep a stable `space.chat.readLongMessage({ id, from, to })` instruction so the active chat runtime can expose the removed text on demand during that turn +- the `<<N characters removed ...>>` counter inside that placeholder must stay byte-stable across turns even when the removed-character count changes order of magnitude, so prefix prompt-cache backends (llama.cpp `--prompt-cache`, qwen serve, vLLM prefix cache, etc.) 
keep their warm cache for everything after the placeholder instead of forcing a full prompt re-prefill on every turn; `buildPromptLongMessagePlaceholder(...)` enforces this by padding the counter to a fixed decimal width with leading zeros - part-level prompt-budget trimming should build a one-shot thresholded multi-contributor plan that trims only contributors whose planned cut is at least `250` tokens; system and transient consumers may then fall back to one combined section-body trim when contributor-level trims would all be smaller than that threshold - `installPromptItemAccess(...)` must keep full prompt-item text in runtime-only memory while publishing only redacted `space.chat.promptItems` metadata plus `readLongMessage(...)` on the live chat namespace - this module is prompt-builder-agnostic; callers must provide `buildPromptInput(context)` and may optionally provide `updatePromptHistory({ context, historyMessages, options, prompt, promptInput })` diff --git a/app/L0/_all/mod/_core/agent_prompt/prompt-items.js b/app/L0/_all/mod/_core/agent_prompt/prompt-items.js index 37df5f18..440cc959 100644 --- a/app/L0/_all/mod/_core/agent_prompt/prompt-items.js +++ b/app/L0/_all/mod/_core/agent_prompt/prompt-items.js @@ -533,6 +533,18 @@ export function buildPromptOverflowTrimPlan(contributors = [], overflowTokens, o }; } +// Pad the removed-chars counter to a fixed width so the placeholder string has +// a byte-stable length regardless of how many characters were trimmed. Each +// time the trimmer runs the counter changes (because chars-needed drifts with +// content and token budget), and without padding that changes the byte offset +// of every token after the placeholder. For backends with prefix prompt-cache +// reuse (llama.cpp `--prompt-cache`, qwen serve, vLLM prefix cache, etc.) 
the +// drifting placeholder forces a full prompt re-prefill on every turn instead +// of a warm-cache continuation — turning a 10s warm reply into multi-minute +// full PP for long-context sessions. 10 digits cover up to ~10 billion +// characters which is far above any realistic message length. +const LONG_MESSAGE_REMOVED_CHARS_PAD_WIDTH = 10; + export function buildPromptLongMessagePlaceholder({ id, removedChars } = {}) { const normalizedId = Number.isFinite(Number(id)) ? Math.max(1, Math.round(Number(id))) : 0; const normalizedRemovedChars = Number.isFinite(Number(removedChars)) @@ -543,7 +555,12 @@ export function buildPromptLongMessagePlaceholder({ id, removedChars } = {}) { return ""; } - return `<<${normalizedRemovedChars} characters removed to optimize context, read with space.chat.readLongMessage({id: ${normalizedId}, from: 0, to:${LONG_MESSAGE_DEFAULT_TO}})>>`; + const paddedRemovedChars = String(normalizedRemovedChars).padStart( + LONG_MESSAGE_REMOVED_CHARS_PAD_WIDTH, + "0" + ); + + return `<<${paddedRemovedChars} characters removed to optimize context, read with space.chat.readLongMessage({id: ${normalizedId}, from: 0, to:${LONG_MESSAGE_DEFAULT_TO}})>>`; } export function trimPromptLongMessage(text, options = {}) { diff --git a/tests/prompt_budget_trim_test.mjs b/tests/prompt_budget_trim_test.mjs index 548a56df..826b3edf 100644 --- a/tests/prompt_budget_trim_test.mjs +++ b/tests/prompt_budget_trim_test.mjs @@ -196,3 +196,36 @@ test("system part falls back to section-body compression when no contributor can assert.match(plan.sectionContributor.originalValueText, /\n\n/u); assert.equal(plan.sectionContributor.tokenCount > 0, true); }); + +test("buildPromptLongMessagePlaceholder pads removedChars to a fixed byte width across orders of magnitude", () => { + // Prefix prompt-cache backends (llama.cpp `--prompt-cache`, qwen serve, + // vLLM prefix cache, etc.) key cache hits on a byte-stable prompt prefix. 
+ // The placeholder appears inside every trimmed message and changes every + // turn as the trimmer recalculates how many characters to drop. Without + // padding, the counter `<<N characters removed ...>>` changes byte length + // each turn (3 -> 4 digits when N crosses 1000), shifting every byte after + // the placeholder and forcing a full prompt re-prefill instead of a warm- + // cache continuation. Lock the placeholder length to be byte-stable + // regardless of the counter's magnitude so the cache prefix stays valid + // across turns. + const placeholderSmall = buildPromptLongMessagePlaceholder({ id: 1, removedChars: 9 }); + const placeholderMedium = buildPromptLongMessagePlaceholder({ id: 1, removedChars: 4096 }); + const placeholderLarge = buildPromptLongMessagePlaceholder({ id: 1, removedChars: 500_000 }); + const placeholderHuge = buildPromptLongMessagePlaceholder({ id: 1, removedChars: 1_000_000_000 }); + + assert.equal(placeholderSmall.length, placeholderMedium.length); + assert.equal(placeholderSmall.length, placeholderLarge.length); + assert.equal(placeholderSmall.length, placeholderHuge.length); +}); + +test("buildPromptLongMessagePlaceholder keeps the counter readable as a decimal number with leading zeros", () => { + const placeholder = buildPromptLongMessagePlaceholder({ id: 7, removedChars: 42 }); + + // The counter should still be parseable as a positive integer after the + // leading-zero pad so anything that inspects it (logs, debug overlays, + // future tooling) does not need to special-case the format. + const match = placeholder.match(/<<(\d+) characters removed/u); + + assert.ok(match, "placeholder should start with `<< characters removed`"); + assert.equal(Number(match[1]), 42); +});