From 8481a52c0ce23cc07ea92f8618c20cf97466d7e7 Mon Sep 17 00:00:00 2001
From: Kevin Kent
Date: Mon, 13 Apr 2026 22:08:40 -0400
Subject: [PATCH] Fix inline summarization prompt cache misses

The background inline summarizer forks messages from the main render but
applies different post-processing than the tool calling loop:

- Main agent call: stripInternalToolCallIds + validateToolMessages
  (filters orphaned tool results that lack matching assistant tool calls)
- Background summarizer: stripInternalToolCallIds only

After a summarization is applied, prompt-tsx re-renders the conversation
with summarized history. Tool results that referenced tool calls from the
now-summarized rounds become orphaned. The main call filters these out via
validateToolMessages, but the background summarizer keeps them. This causes
the message arrays to diverge, breaking prefix-based prompt caching (e.g.,
Anthropic's cache_control).

The divergence specifically occurs on the 2nd+ summarization in the same
turn, because the 1st summarization creates the orphaned messages that the
2nd summarization's forked copy includes but the main call filters out.
This explains the observed pattern of 0% cache hit rate on 2nd+
summarizations while 1st summarizations get 65-98% hits.

Fix: apply validateToolMessagesCore to the forked messages in the
background summarizer, matching the main call's processing pipeline.

Also move addCacheBreakpoints() to run before _startBackgroundSummarization
to ensure cache breakpoint ordering is deterministic (code clarity
improvement).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../src/extension/intents/node/agentIntent.ts | 35 ++++++++++++------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/extensions/copilot/src/extension/intents/node/agentIntent.ts b/extensions/copilot/src/extension/intents/node/agentIntent.ts index a2cdae7fa7de8..0b0c434982068 100644 --- a/extensions/copilot/src/extension/intents/node/agentIntent.ts +++ b/extensions/copilot/src/extension/intents/node/agentIntent.ts @@ -658,7 +658,21 @@ export class AgentIntentInvocation extends EditCodeIntentInvocation implements I )); } + const lastMessage = result.messages.at(-1); + if (lastMessage?.role === Raw.ChatRole.User) { + const currentTurn = promptContext.conversation?.getLatestTurn(); + if (currentTurn && !currentTurn.getMetadata(RenderedUserMessageMetadata)) { + currentTurn.setMetadata(new RenderedUserMessageMetadata(lastMessage.content)); + } + } + + addCacheBreakpoints(result.messages); + // Post-render: kick off background compaction at ≥ 80% if idle. + // This must run AFTER addCacheBreakpoints so that the messages + // forwarded to the background summarizer include cache breakpoints, + // making the prompt prefix byte-identical to the main agent fetch + // and enabling prompt cache hits on the summarization call. if (summarizationEnabled && backgroundSummarizer && !didSummarizeThisIteration) { const postRenderRatio = baseBudget > 0 ? 
(result.tokenCount + toolTokens) / baseBudget @@ -682,16 +696,6 @@ export class AgentIntentInvocation extends EditCodeIntentInvocation implements I } } - const lastMessage = result.messages.at(-1); - if (lastMessage?.role === Raw.ChatRole.User) { - const currentTurn = promptContext.conversation?.getLatestTurn(); - if (currentTurn && !currentTurn.getMetadata(RenderedUserMessageMetadata)) { - currentTurn.setMetadata(new RenderedUserMessageMetadata(lastMessage.content)); - } - } - - addCacheBreakpoints(result.messages); - if (this.request.command === 'error') { // Should trigger a 400 result.messages.push({ @@ -805,9 +809,14 @@ export class AgentIntentInvocation extends EditCodeIntentInvocation implements I try { if (useInlineSummarization) { // Inline mode: fork the exact messages from the main render - // and append a summary user message. The prompt prefix is - // byte-identical to the main agent loop for cache hits. - const strippedMainMessages = ToolCallingLoop.stripInternalToolCallIds(mainRenderMessages); + // and append a summary user message. The prompt prefix must + // be byte-identical to the main agent fetch for cache hits. + // Apply the same post-processing as the tool calling loop + // (strip internal IDs + validate/filter orphaned tool messages) + // so the message arrays match exactly. + const strippedMainMessages = ToolCallingLoop.validateToolMessagesCore( + ToolCallingLoop.stripInternalToolCallIds(mainRenderMessages), + ).messages; const summaryMsgResult = await renderPromptElement( this.instantiationService, this.endpoint,