agent0ai · nsyring · May 12, 2026
diff --git a/app/L0/_all/mod/_core/agent_prompt/AGENTS.md b/app/L0/_all/mod/_core/agent_prompt/AGENTS.md
@@ -26,6 +26,7 @@ Current shared runtime contract:
 - normalized prompt items should cache `valueTokenCount` alongside the normalized string value so repeated prompt builds can reuse tokenizer results for the same item body
 - prompt-budget ratios are stored as percentages of the configured model `maxTokens`, with `system`, `history`, and `transient` required to total 100 while `singleMessage` is a separate percentage of the history budget
 - long prompt contributors may be trimmed through the shared middle-replacement placeholder emitted by `trimPromptLongMessage(...)`; the placeholder must keep a stable `space.chat.readLongMessage({ id, from, to })` instruction so the active chat runtime can expose the removed text on demand during that turn
+- the `<<NNNNNNNNNN characters removed ...>>` counter inside that placeholder must keep a fixed byte length across turns even when the removed-character count changes by an order of magnitude, so prefix prompt-cache backends (llama.cpp `--prompt-cache`, qwen serve, vLLM prefix cache, etc.) keep their warm cache for everything after the placeholder instead of forcing a full prompt re-prefill on every turn; `buildPromptLongMessagePlaceholder(...)` enforces this by left-padding the counter with zeros to a fixed width of 10 decimal digits
 - part-level prompt-budget trimming should build a one-shot thresholded multi-contributor plan that trims only contributors whose planned cut is at least `250` tokens; system and transient consumers may then fall back to one combined section-body trim when contributor-level trims would all be smaller than that threshold
 - `installPromptItemAccess(...)` must keep full prompt-item text in runtime-only memory while publishing only redacted `space.chat.promptItems` metadata plus `readLongMessage(...)` on the live chat namespace
 - this module is prompt-builder-agnostic; callers must provide `buildPromptInput(context)` and may optionally provide `updatePromptHistory({ context, historyMessages, options, prompt, promptInput })`

diff --git a/app/L0/_all/mod/_core/agent_prompt/prompt-items.js b/app/L0/_all/mod/_core/agent_prompt/prompt-items.js
@@ -533,6 +533,18 @@ export function buildPromptOverflowTrimPlan(contributors = [], overflowTokens, o
   };
 }
 
+// Zero-pad the removed-chars counter to a fixed decimal width so the
+// placeholder keeps a constant length however many characters were trimmed.
+// The count drifts with content and token budget each time the trimmer runs,
+// and an unpadded counter that gains or loses a digit shifts the byte offset
+// of everything after the placeholder. On backends with prefix prompt-cache
+// reuse (llama.cpp `--prompt-cache`, qwen serve, vLLM prefix cache, etc.)
+// that drift forces a full prompt re-prefill instead of a warm-cache
+// continuation — a ~10s warm reply becomes a multi-minute full prompt-
+// processing pass in long sessions. Only the length is fixed; digits still
+// change. 10 digits hold counts up to 9,999,999,999, beyond any real message.
+const LONG_MESSAGE_REMOVED_CHARS_PAD_WIDTH = 10;
+
 export function buildPromptLongMessagePlaceholder({ id, removedChars } = {}) {
   const normalizedId = Number.isFinite(Number(id)) ? Math.max(1, Math.round(Number(id))) : 0;
   const normalizedRemovedChars = Number.isFinite(Number(removedChars))
@@ -543,7 +555,12 @@ export function buildPromptLongMessagePlaceholder({ id, removedChars } = {}) {
     return "";
   }
 
-  return `<<${normalizedRemovedChars} characters removed to optimize context, read with space.chat.readLongMessage({id: ${normalizedId}, from: 0, to:${LONG_MESSAGE_DEFAULT_TO}})>>`;
+  const paddedRemovedChars = String(normalizedRemovedChars).padStart(
+    LONG_MESSAGE_REMOVED_CHARS_PAD_WIDTH,
+    "0"
+  );
+
+  return `<<${paddedRemovedChars} characters removed to optimize context, read with space.chat.readLongMessage({id: ${normalizedId}, from: 0, to:${LONG_MESSAGE_DEFAULT_TO}})>>`;
 }
 
 export function trimPromptLongMessage(text, options = {}) {

diff --git a/tests/prompt_budget_trim_test.mjs b/tests/prompt_budget_trim_test.mjs
@@ -196,3 +196,36 @@ test("system part falls back to section-body compression when no contributor can
   assert.match(plan.sectionContributor.originalValueText, /\n\n/u);
   assert.equal(plan.sectionContributor.tokenCount > 0, true);
 });
+
+test("buildPromptLongMessagePlaceholder pads removedChars to a fixed byte width across orders of magnitude", () => {
+  // Prefix prompt-cache backends (llama.cpp `--prompt-cache`, qwen serve,
+  // vLLM prefix cache, etc.) key cache hits on a byte-stable prompt prefix.
+  // The removed-character count can differ from turn to turn as the trimmer
+  // recalculates how much to drop. Without padding, `<<N characters removed
+  // ...>>` changes byte length whenever N gains or loses a digit (e.g. 3 -> 4
+  // digits when N crosses 1000), shifting every byte after the placeholder
+  // and forcing a full prompt re-prefill instead of a warm-cache
+  // continuation. This test pins the length only (the digits still differ):
+  // it must be identical for counts of 1, 4, 6 and 10 digits, where 10
+  // digits is the widest count that still fits the 10-digit pad width.
+  const placeholderSmall = buildPromptLongMessagePlaceholder({ id: 1, removedChars: 9 });
+  const placeholderMedium = buildPromptLongMessagePlaceholder({ id: 1, removedChars: 4096 });
+  const placeholderLarge = buildPromptLongMessagePlaceholder({ id: 1, removedChars: 500_000 });
+  const placeholderHuge = buildPromptLongMessagePlaceholder({ id: 1, removedChars: 1_000_000_000 });
+
+  assert.equal(placeholderSmall.length, placeholderMedium.length);
+  assert.equal(placeholderSmall.length, placeholderLarge.length);
+  assert.equal(placeholderSmall.length, placeholderHuge.length);
+});
+
+test("buildPromptLongMessagePlaceholder keeps the counter readable as a decimal number with leading zeros", () => {
+  const placeholder = buildPromptLongMessagePlaceholder({ id: 7, removedChars: 42 });
+
+  // The counter should still be parseable as a positive integer after the
+  // leading-zero pad so anything that inspects it (logs, debug overlays,
+  // future tooling) does not need to special-case the format.
+  const match = placeholder.match(/<<(\d+) characters removed/u);
+
+  assert.ok(match, "placeholder should start with `<<<digits> characters removed`");
+  assert.equal(Number(match[1]), 42);
+});