From 6402aed9282b44a798d49586b29fa3990329ff80 Mon Sep 17 00:00:00 2001
From: li-xiu-qi <lixiuqixiaoke@qq.com>
Date: Wed, 17 Jun 2026 17:26:56 +0800
Subject: [PATCH] fix(agent-core): cap compaction output tokens when
 maxOutputSize is undefined

The compaction worker in full.ts did not pass maxOutputSize to
resolveCompletionBudget, so computeCompletionBudgetCap fell back to
the full context window size as max_completion_tokens. Forwarding
maxOutputSize alone is not enough, because it is undefined by default
for most models: max_tokens would still equal max_context_tokens,
which causes APIContextOverflowError on providers that do not
auto-clamp max_tokens server-side.

This change:
- Passes maxOutputSize to resolveCompletionBudget (aligning with the
  main loop in index.ts, same as PR #482)
- Adds a conservative fallback cap of min(floor(maxCtx / 4), 8192) when
  maxOutputSize is undefined and the context window size is known,
  ensuring compaction never requests the full context window as
  output tokens
- Adds tests covering the compaction budget resolution scenarios

Resolve #834
---
 .changeset/compaction-output-token-cap.md     |   5 +
 .../agent-core/src/agent/compaction/full.ts   |  10 ++
 .../compaction-overflow-verification.test.ts  | 140 ++++++++++++++++++
 .../test/utils/completion-budget.test.ts      |  89 +++++++++++
 4 files changed, 244 insertions(+)
 create mode 100644 .changeset/compaction-output-token-cap.md
 create mode 100644 packages/agent-core/test/utils/compaction-overflow-verification.test.ts

diff --git a/.changeset/compaction-output-token-cap.md b/.changeset/compaction-output-token-cap.md
new file mode 100644
index 000000000..6f26f5c64
--- /dev/null
+++ b/.changeset/compaction-output-token-cap.md
@@ -0,0 +1,5 @@
+---
+"@moonshot-ai/agent-core": patch
+---
+
+Cap compaction output tokens to a conservative fallback when maxOutputSize is not configured, preventing APIContextOverflowError on providers that do not auto-clamp max_tokens.
diff --git a/packages/agent-core/src/agent/compaction/full.ts b/packages/agent-core/src/agent/compaction/full.ts
index e444aee52..65b99a80f 100644
--- a/packages/agent-core/src/agent/compaction/full.ts
+++ b/packages/agent-core/src/agent/compaction/full.ts
@@ -261,9 +261,19 @@ export class FullCompaction {
       await this.triggerPreCompactHook(data, tokensBefore, signal);
 
       const model = this.agent.config.model;
+      const maxOutputSize = this.agent.config.maxOutputSize;
+      const maxCtx = this.agent.config.modelCapabilities.max_context_tokens ?? 0;
+      // When maxOutputSize is not configured (the default), fall back to a
+      // conservative cap so compaction never requests the full context window
+      // as max_completion_tokens. 1/4 of the context window (capped at 8192)
+      // is generous for a summary while preventing overflow on providers that
+      // do not auto-clamp max_tokens server-side.
+      const compactionOutputCap =
+        maxOutputSize ?? (maxCtx > 0 ? Math.min(Math.floor(maxCtx / 4), 8192) : undefined);
       const provider = applyCompletionBudget({
         provider: this.agent.config.provider,
         budget: resolveCompletionBudget({
+          maxOutputSize: compactionOutputCap,
           reservedContextSize: this.agent.kimiConfig?.loopControl?.reservedContextSize,
         }),
         capability: this.agent.config.modelCapabilities,
diff --git a/packages/agent-core/test/utils/compaction-overflow-verification.test.ts b/packages/agent-core/test/utils/compaction-overflow-verification.test.ts
new file mode 100644
index 000000000..1072f0071
--- /dev/null
+++ b/packages/agent-core/test/utils/compaction-overflow-verification.test.ts
@@ -0,0 +1,140 @@
+import { describe, expect, it } from 'vitest';
+
+import {
+  applyCompletionBudget,
+  computeCompletionBudgetCap,
+  resolveCompletionBudget,
+} from '../../src/utils/completion-budget';
+
+import type { ChatProvider, ModelCapability } from '@moonshot-ai/kosong';
+
+/** Builds a tool-capable ModelCapability with no media input or thinking support. */
+function makeCapability(maxContextTokens: number): ModelCapability {
+  const mediaInputs = { image_in: false, video_in: false, audio_in: false };
+  return {
+    ...mediaInputs,
+    thinking: false,
+    tool_use: true,
+    max_context_tokens: maxContextTokens,
+  };
+}
+
+function makeMockProvider(): { provider: ChatProvider; getCap: () => number | null } {
+  let lastCap: number | null = null; // last value handed to withMaxCompletionTokens
+  const provider = {
+    name: 'mock',
+    modelName: 'mock-model',
+    thinkingEffort: null,
+    generate: (() => undefined) as unknown as ChatProvider['generate'],
+    withThinking: (() => undefined) as unknown as ChatProvider['withThinking'],
+    withMaxCompletionTokens: ((tokens: number) => {
+      lastCap = tokens;
+      return { ...provider, _cap: tokens } as unknown as ChatProvider; // copy tagged with the cap
+    }) as unknown as (n: number) => ChatProvider,
+  } as ChatProvider;
+  return { provider, getCap: () => lastCap };
+}
+
+/**
+ * Simulates the ORIGINAL compactionRound() budget logic (before the fix), which does NOT pass
+ * maxOutputSize to resolveCompletionBudget. Returns the cap the mock recorded, or 0 if none.
+ */
+function originalCompactionMaxTokens(args: {
+  maxOutputSize?: number;
+  maxCtx: number;
+  reservedContextSize?: number;
+}): number {
+  const { provider, getCap } = makeMockProvider();
+  applyCompletionBudget({
+    provider,
+    budget: resolveCompletionBudget({
+      reservedContextSize: args.reservedContextSize, // maxOutputSize intentionally omitted — this is the bug
+      env: {}, // keep the simulation hermetic: an ambient KIMI_MODEL_MAX_COMPLETION_TOKENS must not leak in
+    }),
+    capability: makeCapability(args.maxCtx),
+  });
+  return getCap() ?? 0;
+}
+
+/**
+ * Simulates the PATCHED compactionRound() budget logic: passes maxOutputSize or the fallback cap.
+ */
+function patchedCompactionMaxTokens(args: {
+  maxOutputSize?: number;
+  maxCtx: number;
+  reservedContextSize?: number;
+}): number {
+  const compactionOutputCap =
+    args.maxOutputSize ?? (args.maxCtx > 0 ? Math.min(Math.floor(args.maxCtx / 4), 8192) : undefined);
+  const { provider, getCap } = makeMockProvider();
+  applyCompletionBudget({
+    provider,
+    budget: resolveCompletionBudget({
+      maxOutputSize: compactionOutputCap,
+      reservedContextSize: args.reservedContextSize,
+      env: {}, // keep the simulation hermetic: an ambient KIMI_MODEL_MAX_COMPLETION_TOKENS must not leak in
+    }),
+    capability: makeCapability(args.maxCtx),
+  });
+  return getCap() ?? 0; // 0 means the mock provider never received a cap
+}
+
+describe('compaction overflow verification (before vs after fix)', () => {
+  // Simulated compaction input size: a typical compaction prompt contains
+  // the entire conversation history being compacted.
+  const COMPACTION_INPUT_TOKENS = 80_000;
+
+  const testModels = [
+    {
+      name: 'stepfun/step-3.7-flash (maxOutputSize not configured)',
+      maxOutputSize: undefined,
+      maxCtx: 256_000,
+      reservedContextSize: 50_000,
+    },
+    {
+      name: 'kimi-for-coding (maxOutputSize not configured)',
+      maxOutputSize: undefined,
+      maxCtx: 262_144,
+      reservedContextSize: 50_000,
+    },
+    {
+      name: 'zhipu/glm-5.2 (maxOutputSize=131072)',
+      maxOutputSize: 131_072,
+      maxCtx: 1_000_000,
+      reservedContextSize: 50_000,
+    },
+  ];
+
+  for (const model of testModels) {
+    it(`${model.name}: original overflows, patched is safe`, () => {
+      const origCap = originalCompactionMaxTokens(model);
+      const patchedCap = patchedCompactionMaxTokens(model);
+      const origTotal = COMPACTION_INPUT_TOKENS + origCap;
+      const patchedTotal = COMPACTION_INPUT_TOKENS + patchedCap;
+
+      // --- Original code ---
+      // The original compaction code does NOT pass maxOutputSize to
+      // resolveCompletionBudget, so it always falls back to using the full
+      // context window as max_completion_tokens — regardless of whether
+      // maxOutputSize is configured. This is the core bug.
+      expect(origCap).toBe(model.maxCtx); // bug: always uses full context as max_tokens
+      expect(origTotal).toBeGreaterThan(model.maxCtx); // overflow!
+
+      // --- Patched code ---
+      // The patched code always uses a safe cap (either maxOutputSize or min(maxCtx/4, 8192))
+      expect(patchedTotal).toBeLessThanOrEqual(model.maxCtx);
+      expect(patchedCap).toBeLessThan(model.maxCtx);
+
+      // The helpers return 0 when the mock provider never receives a cap, which
+      // would let the checks above pass vacuously, so require a positive cap.
+      expect(patchedCap).toBeGreaterThan(0);
+
+      // The patched cap must never exceed the configured maxOutputSize, or the
+      // 8192-token fallback ceiling when maxOutputSize is not configured
+      // (for the unconfigured models above, min(floor(maxCtx / 4), 8192) is
+      // 8192 because both context windows are far larger than 32768 tokens).
+      const expectedCeiling = model.maxOutputSize ?? 8192;
+      expect(patchedCap).toBeLessThanOrEqual(expectedCeiling);
+    });
+  }
+});
diff --git a/packages/agent-core/test/utils/completion-budget.test.ts b/packages/agent-core/test/utils/completion-budget.test.ts
index 7df91f5d0..f89034889 100644
--- a/packages/agent-core/test/utils/completion-budget.test.ts
+++ b/packages/agent-core/test/utils/completion-budget.test.ts
@@ -248,3 +248,92 @@ describe('resolveCompletionBudget', () => {
     expect(budget?.fallback).toBe(32000);
   });
 });
+
+describe('compaction budget resolution', () => {
+  // Mirrors the output-cap selection in full.ts compactionRound():
+  //   const compactionOutputCap =
+  //     maxOutputSize ?? (maxCtx > 0 ? Math.min(Math.floor(maxCtx / 4), 8192) : undefined);
+  // so that, with no explicit maxOutputSize, compaction asks for a bounded
+  // max_completion_tokens rather than the entire context window.
+  function resolveCompactionBudget(args: {
+    readonly maxOutputSize?: number;
+    readonly maxCtx: number;
+    readonly reservedContextSize?: number;
+    readonly env?: NodeJS.ProcessEnv;
+  }): ReturnType<typeof resolveCompletionBudget> {
+    const { maxOutputSize, maxCtx, reservedContextSize, env } = args;
+    const fallbackCap = maxCtx > 0 ? Math.min(Math.floor(maxCtx / 4), 8192) : undefined;
+    return resolveCompletionBudget({
+      maxOutputSize: maxOutputSize ?? fallbackCap,
+      reservedContextSize,
+      env,
+    });
+  }
+
+  it('uses a conservative fallback cap when maxOutputSize is undefined', () => {
+    const resolved = resolveCompactionBudget({
+      maxCtx: 262_144,
+      reservedContextSize: 50_000,
+      env: {},
+    });
+    // floor(262144 / 4) = 65536, which the 8192 ceiling clamps down to 8192
+    expect(resolved?.hardCap).toBe(8192);
+  });
+
+  it('caps at 8192 even for very large context windows', () => {
+    const resolved = resolveCompactionBudget({
+      maxCtx: 1_000_000,
+      reservedContextSize: 50_000,
+      env: {},
+    });
+    expect(resolved?.hardCap).toBe(8192);
+  });
+
+  it('uses 1/4 of context when context is small', () => {
+    const resolved = resolveCompactionBudget({
+      maxCtx: 20_000,
+      reservedContextSize: 50_000,
+      env: {},
+    });
+    // floor(20000 / 4) = 5000, already below the 8192 ceiling, so 5000 is kept
+    expect(resolved?.hardCap).toBe(5000);
+  });
+
+  it('uses explicit maxOutputSize when configured', () => {
+    const resolved = resolveCompactionBudget({
+      maxOutputSize: 131_072,
+      maxCtx: 1_000_000,
+      reservedContextSize: 50_000,
+      env: {},
+    });
+    expect(resolved?.hardCap).toBe(131_072);
+  });
+
+  it('respects KIMI_MODEL_MAX_COMPLETION_TOKENS over the fallback cap', () => {
+    const resolved = resolveCompactionBudget({
+      maxCtx: 262_144,
+      reservedContextSize: 50_000,
+      env: { KIMI_MODEL_MAX_COMPLETION_TOKENS: '4096' },
+    });
+    expect(resolved?.hardCap).toBe(4096);
+  });
+
+  it('produces a hardCap that computeCompletionBudgetCap will use instead of maxCtx', () => {
+    const maxCtx = 262_144;
+    const resolved = resolveCompactionBudget({
+      maxCtx,
+      reservedContextSize: 50_000,
+      env: {},
+    });
+    // A hardCap (rather than only a fallback) must be present on the budget
+    expect(resolved?.hardCap).toBeDefined();
+    expect(resolved?.hardCap).not.toBe(maxCtx);
+    // The computed cap must come from that hardCap, not from the context window
+    const effectiveCap = computeCompletionBudgetCap({
+      budget: resolved!,
+      capability: makeCapability(maxCtx),
+    });
+    expect(effectiveCap).toBe(8192);
+    expect(effectiveCap).toBeLessThan(maxCtx);
+  });
+});