From 77b4738fbea8e2553aba77b631a6cbf1e895139c Mon Sep 17 00:00:00 2001
From: "Syring, Nikolas" <nikolas.syring@rittec.de>
Date: Tue, 12 May 2026 17:10:20 +0200
Subject: [PATCH] fix(agent-prompt): pad long-message placeholder counter to a
 byte-stable width

`buildPromptLongMessagePlaceholder(...)` emits the middle-replacement string
that `trimPromptLongMessage(...)` substitutes into long prompt contributors
when the active chat surface needs to fit a turn into the configured prompt
budget. The placeholder carries a `<<N characters removed to optimize context,
read with space.chat.readLongMessage({id: N, from: 0, to: N})>>` instruction
so the model can still pull the removed text on demand for the current turn.

The unpadded decimal counter inside that placeholder changes byte length
whenever the recalculated removed-character count gains or loses a digit
(the count drifts with content and budget, and can grow or shrink by an
order of magnitude across turns of a long conversation). Because the
placeholder lives inside every trimmed history message, that drifting
counter shifts the byte offset of every token after the placeholder in
the prompt.

For backends that reuse a prefix prompt cache (llama.cpp `--prompt-cache`,
qwen serve, vLLM prefix cache, and similar inference servers) cache hits
key off a byte-stable prompt prefix. With the unpadded counter the prefix
no longer matches across turns and the server falls back to a full prompt
re-prefill instead of a warm-cache continuation. For long-context Talk
sessions (~135k tokens) that turns a 10-second warm reply into a
multi-minute full prompt-processing (PP) pass and frequently trips
client-side request timeouts before the model even starts streaming.

Pad the counter to a fixed decimal width with leading zeros via
`String(n).padStart(10, "0")`. Ten digits cover up to ~10 billion
characters which is well above any realistic message length, the counter
stays parseable as a positive integer for logs and debug overlays, and
the placeholder now has the same byte length regardless of whether 9,
4096, 500 000, or 1 000 000 000 characters were dropped.

Add `tests/prompt_budget_trim_test.mjs` regression coverage that asserts
identical placeholder length for four removed-character counts spanning
nine orders of magnitude (9 to 1 000 000 000), plus a counter-parseability
guard. Document the byte-stable counter contract in
`_core/agent_prompt/AGENTS.md` so the rationale survives future
placeholder refactors.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 app/L0/_all/mod/_core/agent_prompt/AGENTS.md  |  1 +
 .../mod/_core/agent_prompt/prompt-items.js    | 19 ++++++++++-
 tests/prompt_budget_trim_test.mjs             | 33 +++++++++++++++++++
 3 files changed, 52 insertions(+), 1 deletion(-)
diff --git a/app/L0/_all/mod/_core/agent_prompt/AGENTS.md b/app/L0/_all/mod/_core/agent_prompt/AGENTS.md
index fd4a6202..a31f9c35 100644
--- a/app/L0/_all/mod/_core/agent_prompt/AGENTS.md
+++ b/app/L0/_all/mod/_core/agent_prompt/AGENTS.md
@@ -26,6 +26,7 @@ Current shared runtime contract:
 - normalized prompt items should cache `valueTokenCount` alongside the normalized string value so repeated prompt builds can reuse tokenizer results for the same item body
 - prompt-budget ratios are stored as percentages of the configured model `maxTokens`, with `system`, `history`, and `transient` required to total 100 while `singleMessage` is a separate percentage of the history budget
 - long prompt contributors may be trimmed through the shared middle-replacement placeholder emitted by `trimPromptLongMessage(...)`; the placeholder must keep a stable `space.chat.readLongMessage({ id, from, to })` instruction so the active chat runtime can expose the removed text on demand during that turn
+- the `<<NNNNNNNNNN characters removed ...>>` counter inside that placeholder must keep a fixed byte width across turns even when the removed-character count changes by orders of magnitude, so prefix prompt-cache backends (llama.cpp `--prompt-cache`, qwen serve, vLLM prefix cache, etc.) keep their warm cache for everything after the placeholder instead of forcing a full prompt re-prefill on every turn; `buildPromptLongMessagePlaceholder(...)` enforces this by padding the counter to a fixed decimal width with leading zeros
 - part-level prompt-budget trimming should build a one-shot thresholded multi-contributor plan that trims only contributors whose planned cut is at least `250` tokens; system and transient consumers may then fall back to one combined section-body trim when contributor-level trims would all be smaller than that threshold
 - `installPromptItemAccess(...)` must keep full prompt-item text in runtime-only memory while publishing only redacted `space.chat.promptItems` metadata plus `readLongMessage(...)` on the live chat namespace
 - this module is prompt-builder-agnostic; callers must provide `buildPromptInput(context)` and may optionally provide `updatePromptHistory({ context, historyMessages, options, prompt, promptInput })`
diff --git a/app/L0/_all/mod/_core/agent_prompt/prompt-items.js b/app/L0/_all/mod/_core/agent_prompt/prompt-items.js
index 37df5f18..440cc959 100644
--- a/app/L0/_all/mod/_core/agent_prompt/prompt-items.js
+++ b/app/L0/_all/mod/_core/agent_prompt/prompt-items.js
@@ -533,6 +533,18 @@ export function buildPromptOverflowTrimPlan(contributors = [], overflowTokens, o
   };
 }
 
+// Pad the removed-chars counter to a fixed width so the placeholder string has
+// a byte-stable length regardless of how many characters were trimmed. Each
+// time the trimmer runs the counter changes (because chars-needed drifts with
+// content and token budget), and without padding that changes the byte offset
+// of every token after the placeholder. For backends with prefix prompt-cache
+// reuse (llama.cpp `--prompt-cache`, qwen serve, vLLM prefix cache, etc.) the
+// drifting placeholder forces a full prompt re-prefill on every turn instead
+// of a warm-cache continuation — turning a 10s warm reply into multi-minute
+// full PP for long-context sessions. 10 digits cover up to ~10 billion
+// characters which is far above any realistic message length.
+const LONG_MESSAGE_REMOVED_CHARS_PAD_WIDTH = 10;
+
 export function buildPromptLongMessagePlaceholder({ id, removedChars } = {}) {
   const normalizedId = Number.isFinite(Number(id)) ? Math.max(1, Math.round(Number(id))) : 0;
   const normalizedRemovedChars = Number.isFinite(Number(removedChars))
@@ -543,7 +555,12 @@ export function buildPromptLongMessagePlaceholder({ id, removedChars } = {}) {
     return "";
   }
 
-  return `<<${normalizedRemovedChars} characters removed to optimize context, read with space.chat.readLongMessage({id: ${normalizedId}, from: 0, to:${LONG_MESSAGE_DEFAULT_TO}})>>`;
+  const paddedRemovedChars = String(normalizedRemovedChars).padStart(
+    LONG_MESSAGE_REMOVED_CHARS_PAD_WIDTH,
+    "0"
+  );
+
+  return `<<${paddedRemovedChars} characters removed to optimize context, read with space.chat.readLongMessage({id: ${normalizedId}, from: 0, to:${LONG_MESSAGE_DEFAULT_TO}})>>`;
 }
 
 export function trimPromptLongMessage(text, options = {}) {
diff --git a/tests/prompt_budget_trim_test.mjs b/tests/prompt_budget_trim_test.mjs
index 548a56df..826b3edf 100644
--- a/tests/prompt_budget_trim_test.mjs
+++ b/tests/prompt_budget_trim_test.mjs
@@ -196,3 +196,36 @@ test("system part falls back to section-body compression when no contributor can
   assert.match(plan.sectionContributor.originalValueText, /\n\n/u);
   assert.equal(plan.sectionContributor.tokenCount > 0, true);
 });
+
+test("buildPromptLongMessagePlaceholder pads removedChars to a fixed byte width across orders of magnitude", () => {
+  // Prefix prompt-cache backends (llama.cpp `--prompt-cache`, qwen serve,
+  // vLLM prefix cache, etc.) key cache hits on a byte-stable prompt prefix.
+  // The placeholder appears inside every trimmed message and changes every
+  // turn as the trimmer recalculates how many characters to drop. Without
+  // padding, the counter `<<N characters removed ...>>` changes byte length
+  // each turn (3 -> 4 digits when N crosses 1000), shifting every byte after
+  // the placeholder and forcing a full prompt re-prefill instead of a warm-
+  // cache continuation. Lock the placeholder length to be byte-stable
+  // regardless of the counter's magnitude so the cache prefix stays valid
+  // across turns.
+  const placeholderSmall = buildPromptLongMessagePlaceholder({ id: 1, removedChars: 9 });
+  const placeholderMedium = buildPromptLongMessagePlaceholder({ id: 1, removedChars: 4096 });
+  const placeholderLarge = buildPromptLongMessagePlaceholder({ id: 1, removedChars: 500_000 });
+  const placeholderHuge = buildPromptLongMessagePlaceholder({ id: 1, removedChars: 1_000_000_000 });
+
+  assert.equal(placeholderSmall.length, placeholderMedium.length);
+  assert.equal(placeholderSmall.length, placeholderLarge.length);
+  assert.equal(placeholderSmall.length, placeholderHuge.length);
+});
+
+test("buildPromptLongMessagePlaceholder keeps the counter readable as a decimal number with leading zeros", () => {
+  const placeholder = buildPromptLongMessagePlaceholder({ id: 7, removedChars: 42 });
+
+  // The counter should still be parseable as a positive integer after the
+  // leading-zero pad so anything that inspects it (logs, debug overlays,
+  // future tooling) does not need to special-case the format.
+  const match = placeholder.match(/<<(\d+) characters removed/u);
+
+  assert.ok(match, "placeholder should start with `<<<digits> characters removed`");
+  assert.equal(Number(match[1]), 42);
+});