From 8481a52c0ce23cc07ea92f8618c20cf97466d7e7 Mon Sep 17 00:00:00 2001
From: Kevin Kent <kevinkent@microsoft.com>
Date: Mon, 13 Apr 2026 22:08:40 -0400
Subject: [PATCH] Fix inline summarization prompt cache misses

The background inline summarizer forks messages from the main render
but applies different post-processing than the tool calling loop:

- Main agent call: stripInternalToolCallIds + validateToolMessages
  (filters orphaned tool results that lack matching assistant tool calls)
- Background summarizer: stripInternalToolCallIds only

After a summarization is applied, prompt-tsx re-renders the conversation
with summarized history. Tool results that referenced tool calls from
the now-summarized rounds become orphaned. The main call filters these
out via validateToolMessages, but the background summarizer keeps them.
This causes the message arrays to diverge, breaking prefix-based prompt
caching (e.g., Anthropic's cache_control).

The divergence specifically occurs on the 2nd+ summarization in the
same turn, because the 1st summarization creates the orphaned messages
that the 2nd summarization's forked copy includes but the main call
filters out. This explains the observed pattern of 0% cache hit rate
on 2nd+ summarizations while 1st summarizations get 65-98% hits.

Fix: apply validateToolMessagesCore to the forked messages in the
background summarizer, matching the main call's processing pipeline.

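Also move addCacheBreakpoints() (together with the
RenderedUserMessageMetadata capture that precedes it) to run before
_startBackgroundSummarization, so that cache breakpoint ordering is
deterministic and the messages forwarded to the background summarizer
already include the same cache breakpoints as the main agent fetch.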

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../src/extension/intents/node/agentIntent.ts | 35 ++++++++++++-------
 1 file changed, 22 insertions(+), 13 deletions(-)

diff --git a/extensions/copilot/src/extension/intents/node/agentIntent.ts b/extensions/copilot/src/extension/intents/node/agentIntent.ts
index a2cdae7fa7de8..0b0c434982068 100644
--- a/extensions/copilot/src/extension/intents/node/agentIntent.ts
+++ b/extensions/copilot/src/extension/intents/node/agentIntent.ts
@@ -658,7 +658,21 @@ export class AgentIntentInvocation extends EditCodeIntentInvocation implements I
 			));
 		}
 
+		const lastMessage = result.messages.at(-1);
+		if (lastMessage?.role === Raw.ChatRole.User) {
+			const currentTurn = promptContext.conversation?.getLatestTurn();
+			if (currentTurn && !currentTurn.getMetadata(RenderedUserMessageMetadata)) {
+				currentTurn.setMetadata(new RenderedUserMessageMetadata(lastMessage.content));
+			}
+		}
+
+		addCacheBreakpoints(result.messages);
+
 		// Post-render: kick off background compaction at ≥ 80% if idle.
+		// This must run AFTER addCacheBreakpoints so that the messages
+		// forwarded to the background summarizer include cache breakpoints,
+		// making the prompt prefix byte-identical to the main agent fetch
+		// and enabling prompt cache hits on the summarization call.
 		if (summarizationEnabled && backgroundSummarizer && !didSummarizeThisIteration) {
 			const postRenderRatio = baseBudget > 0
 				? (result.tokenCount + toolTokens) / baseBudget
@@ -682,16 +696,6 @@ export class AgentIntentInvocation extends EditCodeIntentInvocation implements I
 			}
 		}
 
-		const lastMessage = result.messages.at(-1);
-		if (lastMessage?.role === Raw.ChatRole.User) {
-			const currentTurn = promptContext.conversation?.getLatestTurn();
-			if (currentTurn && !currentTurn.getMetadata(RenderedUserMessageMetadata)) {
-				currentTurn.setMetadata(new RenderedUserMessageMetadata(lastMessage.content));
-			}
-		}
-
-		addCacheBreakpoints(result.messages);
-
 		if (this.request.command === 'error') {
 			// Should trigger a 400
 			result.messages.push({
@@ -805,9 +809,14 @@ export class AgentIntentInvocation extends EditCodeIntentInvocation implements I
 			try {
 				if (useInlineSummarization) {
 					// Inline mode: fork the exact messages from the main render
-					// and append a summary user message. The prompt prefix is
-					// byte-identical to the main agent loop for cache hits.
-					const strippedMainMessages = ToolCallingLoop.stripInternalToolCallIds(mainRenderMessages);
+					// and append a summary user message. The prompt prefix must
+					// be byte-identical to the main agent fetch for cache hits.
+					// Apply the same post-processing as the tool calling loop
+					// (strip internal IDs + validate/filter orphaned tool messages)
+					// so the message arrays match exactly.
+					const strippedMainMessages = ToolCallingLoop.validateToolMessagesCore(
+						ToolCallingLoop.stripInternalToolCallIds(mainRenderMessages),
+					).messages;
 					const summaryMsgResult = await renderPromptElement(
 						this.instantiationService,
 						this.endpoint,