MoonshotAI · li-xiu-qi · Jun 17, 2026
diff --git a/.changeset/compaction-output-token-cap.md b/.changeset/compaction-output-token-cap.md
@@ -0,0 +1,5 @@
+---
+"@moonshot-ai/agent-core": patch
+---
+
+Pass an output-token cap to compaction requests: use the configured maxOutputSize, or fall back to a conservative min(max_context_tokens / 4, 8192) when it is not configured, preventing APIContextOverflowError on providers that do not auto-clamp max_tokens.
diff --git a/packages/agent-core/src/agent/compaction/full.ts b/packages/agent-core/src/agent/compaction/full.ts
@@ -261,9 +261,19 @@ export class FullCompaction {
       await this.triggerPreCompactHook(data, tokensBefore, signal);
 
       const model = this.agent.config.model;
+      const maxOutputSize = this.agent.config.maxOutputSize;
+      const maxCtx = this.agent.config.modelCapabilities.max_context_tokens ?? 0;
+      // When maxOutputSize is not configured (the default), fall back to
+      // min(floor(max_context_tokens / 4), 8192) so compaction never requests
+      // the full context window as max_completion_tokens; that is generous for
+      // a summary yet avoids overflow on providers that do not auto-clamp
+      // max_tokens server-side. Unknown context size (0) leaves the cap unset.
+      const compactionOutputCap =
+        maxOutputSize ?? (maxCtx > 0 ? Math.min(Math.floor(maxCtx / 4), 8192) : undefined);
       const provider = applyCompletionBudget({
         provider: this.agent.config.provider,
         budget: resolveCompletionBudget({
+          maxOutputSize: compactionOutputCap,
           reservedContextSize: this.agent.kimiConfig?.loopControl?.reservedContextSize,
         }),
         capability: this.agent.config.modelCapabilities,

diff --git a/packages/agent-core/test/utils/compaction-overflow-verification.test.ts b/packages/agent-core/test/utils/compaction-overflow-verification.test.ts
@@ -0,0 +1,140 @@
+import { describe, expect, it } from 'vitest';
+
+import {
+  applyCompletionBudget,
+  computeCompletionBudgetCap,
+  resolveCompletionBudget,
+} from '../../src/utils/completion-budget';
+
+import type { ChatProvider, ModelCapability } from '@moonshot-ai/kosong';
+
+function makeCapability(maxContextTokens: number): ModelCapability { // Test fixture: capability record for a given context window size.
+  return {
+    image_in: false,
+    video_in: false,
+    audio_in: false,
+    thinking: false,
+    tool_use: true, // the only capability flag that is enabled
+    max_context_tokens: maxContextTokens, // the only field that varies between callers
+  };
+}
+
+function makeMockProvider(): { provider: ChatProvider; getCap: () => number | null } { // Mock provider that records the cap passed to withMaxCompletionTokens.
+  let cap: number | null = null; // stays null until withMaxCompletionTokens is called
+  const provider = {
+    name: 'mock',
+    modelName: 'mock-model',
+    thinkingEffort: null,
+    generate: (() => {}) as unknown as ChatProvider['generate'], // no-op stub
+    withThinking: (() => {}) as unknown as ChatProvider['withThinking'], // no-op stub
+    withMaxCompletionTokens: ((n: number) => {
+      cap = n; // remember the most recently requested cap for getCap()
+      return { ...provider, _cap: n } as unknown as ChatProvider; // shallow copy tagged with the cap
+    }) as unknown as (n: number) => ChatProvider,
+  } as ChatProvider;
+  return { provider, getCap: () => cap }; // getCap reads the closure variable, not the returned copy
+}
+
+/**
+ * Replays the pre-fix compactionRound() budget call: maxOutputSize is accepted
+ * but never forwarded to resolveCompletionBudget. Returns the cap the mock saw.
+ */
+function originalCompactionMaxTokens(args: {
+  maxOutputSize?: number; // deliberately ignored below
+  maxCtx: number; // becomes max_context_tokens of the capability
+  reservedContextSize?: number;
+}): number {
+  const { provider, getCap } = makeMockProvider();
+  applyCompletionBudget({
+    provider,
+    budget: resolveCompletionBudget({
+      // maxOutputSize intentionally omitted — this is the bug
+      reservedContextSize: args.reservedContextSize,
+    }),
+    capability: makeCapability(args.maxCtx),
+  });
+  return getCap() ?? 0; // 0 when withMaxCompletionTokens was never invoked
+}
+
+/**
+ * Replays the patched compactionRound() budget call: forwards maxOutputSize, or
+ * min(floor(maxCtx / 4), 8192) when it is undefined. Returns the cap the mock saw.
+ */
+function patchedCompactionMaxTokens(args: {
+  maxOutputSize?: number; // an explicit value is forwarded unchanged
+  maxCtx: number; // becomes max_context_tokens of the capability and feeds the fallback
+  reservedContextSize?: number;
+}): number {
+  const compactionOutputCap = // undefined only when maxOutputSize is unset and maxCtx <= 0
+    args.maxOutputSize ?? (args.maxCtx > 0 ? Math.min(Math.floor(args.maxCtx / 4), 8192) : undefined);
+  const { provider, getCap } = makeMockProvider();
+  applyCompletionBudget({
+    provider,
+    budget: resolveCompletionBudget({
+      maxOutputSize: compactionOutputCap,
+      reservedContextSize: args.reservedContextSize,
+    }),
+    capability: makeCapability(args.maxCtx),
+  });
+  return getCap() ?? 0; // 0 when withMaxCompletionTokens was never invoked
+}
+
+describe('compaction overflow verification (before vs after fix)', () => {
+  // Assumed prompt size for a single compaction request; a typical compaction
+  // prompt contains the entire conversation history being compacted.
+  const COMPACTION_INPUT_TOKENS = 80_000;
+
+  const testModels = [ // scenarios fed to both the pre-fix and the patched helper
+    {
+      name: 'stepfun/step-3.7-flash (maxOutputSize not configured)',
+      maxOutputSize: undefined,
+      maxCtx: 256_000,
+      reservedContextSize: 50_000,
+    },
+    {
+      name: 'kimi-for-coding (maxOutputSize not configured)',
+      maxOutputSize: undefined,
+      maxCtx: 262_144,
+      reservedContextSize: 50_000,
+    },
+    {
+      name: 'zhipu/glm-5.2 (maxOutputSize=131072)',
+      maxOutputSize: 131_072,
+      maxCtx: 1_000_000,
+      reservedContextSize: 50_000,
+    },
+  ];
+
+  for (const model of testModels) {
+    it(`${model.name}: original overflows, patched is safe`, () => {
+      const origCap = originalCompactionMaxTokens(model);
+      const patchedCap = patchedCompactionMaxTokens(model);
+      const origTotal = COMPACTION_INPUT_TOKENS + origCap; // prompt plus requested completion tokens
+      const patchedTotal = COMPACTION_INPUT_TOKENS + patchedCap;
+
+      // --- Original code ---
+      // The original compaction code does NOT pass maxOutputSize to
+      // resolveCompletionBudget, so it always falls back to using the full
+      // context window as max_completion_tokens — regardless of whether
+      // maxOutputSize is configured. This is the core bug.
+      expect(origCap).toBe(model.maxCtx); // bug: always uses full context as max_tokens
+      expect(origTotal).toBeGreaterThan(model.maxCtx); // overflow!
+
+      // --- Patched code ---
+      // The patched code always uses a safe cap (either maxOutputSize or min(floor(maxCtx / 4), 8192))
+      expect(patchedTotal).toBeLessThanOrEqual(model.maxCtx);
+      expect(patchedCap).toBeLessThan(model.maxCtx);
+
+      // Print a comparison table for manual verification. NOTE(review): this logs on every test run — consider removing.
+      console.log(`
+  ${model.name}
+    max_context_tokens:    ${model.maxCtx.toLocaleString()}
+    maxOutputSize:         ${model.maxOutputSize?.toLocaleString() ?? 'undefined'}
+    compaction input est:  ${COMPACTION_INPUT_TOKENS.toLocaleString()}
+
+    Original:  max_tokens=${origCap.toLocaleString()}  total=${origTotal.toLocaleString()}  overflow=${origTotal > model.maxCtx ? 'YES ❌' : 'NO'}
+    Patched:   max_tokens=${patchedCap.toLocaleString()}  total=${patchedTotal.toLocaleString()}  overflow=${patchedTotal > model.maxCtx ? 'YES ❌' : 'NO ✅'}
+`);
+    });
+  }
+});
diff --git a/packages/agent-core/test/utils/completion-budget.test.ts b/packages/agent-core/test/utils/completion-budget.test.ts
@@ -248,3 +248,92 @@ describe('resolveCompletionBudget', () => {
     expect(budget?.fallback).toBe(32000);
   });
 });
+
+describe('compaction budget resolution', () => {
+  // Mirrors the budget resolution logic from full.ts compactionRound():
+  //   const compactionOutputCap =
+  //     maxOutputSize ?? (maxCtx > 0 ? Math.min(Math.floor(maxCtx / 4), 8192) : undefined);
+  // so compaction never requests the full context window as max_completion_tokens when maxOutputSize is unset.
+  // NOTE(review): the formula is copied here, not imported from full.ts — a regression there would go unnoticed.
+  function resolveCompactionBudget(args: {
+    readonly maxOutputSize?: number;
+    readonly maxCtx: number;
+    readonly reservedContextSize?: number;
+    readonly env?: NodeJS.ProcessEnv; // forwarded as-is to resolveCompletionBudget
+  }): ReturnType<typeof resolveCompletionBudget> {
+    const compactionOutputCap = // undefined only when maxOutputSize is unset and maxCtx <= 0
+      args.maxOutputSize ?? (args.maxCtx > 0 ? Math.min(Math.floor(args.maxCtx / 4), 8192) : undefined);
+    return resolveCompletionBudget({
+      maxOutputSize: compactionOutputCap,
+      reservedContextSize: args.reservedContextSize,
+      env: args.env,
+    });
+  }
+
+  it('uses a conservative fallback cap when maxOutputSize is undefined', () => {
+    const budget = resolveCompactionBudget({
+      maxCtx: 262_144,
+      reservedContextSize: 50_000,
+      env: {},
+    });
+    // 262144 / 4 = 65536, min(65536, 8192) = 8192
+    expect(budget?.hardCap).toBe(8192);
+  });
+
+  it('caps at 8192 even for very large context windows', () => {
+    const budget = resolveCompactionBudget({
+      maxCtx: 1_000_000,
+      reservedContextSize: 50_000,
+      env: {},
+    });
+    expect(budget?.hardCap).toBe(8192); // 1000000 / 4 = 250000, min(250000, 8192) = 8192
+  });
+
+  it('uses 1/4 of context when context is small', () => {
+    const budget = resolveCompactionBudget({
+      maxCtx: 20_000,
+      reservedContextSize: 50_000, // NOTE(review): larger than maxCtx (20k) in this case
+      env: {},
+    });
+    // 20000 / 4 = 5000, min(5000, 8192) = 5000
+    expect(budget?.hardCap).toBe(5000);
+  });
+
+  it('uses explicit maxOutputSize when configured', () => {
+    const budget = resolveCompactionBudget({
+      maxOutputSize: 131_072,
+      maxCtx: 1_000_000,
+      reservedContextSize: 50_000,
+      env: {},
+    });
+    expect(budget?.hardCap).toBe(131_072); // explicit value is not clamped to 8192
+  });
+
+  it('respects KIMI_MODEL_MAX_COMPLETION_TOKENS over the fallback cap', () => {
+    const budget = resolveCompactionBudget({
+      maxCtx: 262_144,
+      reservedContextSize: 50_000,
+      env: { KIMI_MODEL_MAX_COMPLETION_TOKENS: '4096' }, // the fallback alone would yield 8192
+    });
+    expect(budget?.hardCap).toBe(4096);
+  });
+
+  it('produces a hardCap that computeCompletionBudgetCap will use instead of maxCtx', () => {
+    const maxCtx = 262_144;
+    const budget = resolveCompactionBudget({
+      maxCtx,
+      reservedContextSize: 50_000,
+      env: {},
+    });
+    // The budget should have a hardCap, not just a fallback
+    expect(budget?.hardCap).toBeDefined();
+    expect(budget?.hardCap).not.toBe(maxCtx);
+    // computeCompletionBudgetCap should use the hardCap, not the context window
+    const cap = computeCompletionBudgetCap({
+      budget: budget!,
+      capability: makeCapability(maxCtx), // makeCapability is not defined in this hunk — presumably declared earlier in this test file; verify
+    });
+    expect(cap).toBe(8192);
+    expect(cap).toBeLessThan(maxCtx);
+  });
+});