MoonshotAI · mikkihugo · Jun 15, 2026 · Jun 15, 2026 · Jun 15, 2026 · chatgpt-codex-connector
diff --git a/.changeset/deepseek-inline-tool-calls.md b/.changeset/deepseek-inline-tool-calls.md
@@ -0,0 +1,7 @@
+---
+"@moonshot-ai/kosong": patch
+---
+
+Parse DeepSeek-format inline tool calls when an OpenAI-compatible backend leaves them unstructured.
+
+DeepSeek-architecture models (deepseek-v3/r1 and derivatives such as cogito) emit tool calls as special tokens rather than OpenAI `tool_calls`. DeepSeek's own API structures these server-side, but many compatible deployments — self-hosted vLLM/SGLang/llama.cpp, ollama, and some proxies — leak the raw `<|tool▁calls▁begin|>…` tokens into the assistant content, leaving the agent with nothing to dispatch and the turn dead-ending. The OpenAI chat-completions provider now detects that case, parses the tokens into structured tool calls, and strips them from the visible text — but only when the backend returned no structured call, so it stays a no-op for providers that already do the right thing.
diff --git a/packages/kosong/src/providers/deepseek-inline-tool-calls.ts b/packages/kosong/src/providers/deepseek-inline-tool-calls.ts
@@ -0,0 +1,158 @@
+import type { ToolCall } from '#/message';
+
+/**
+ * Defensive fallback for DeepSeek-V3 style **inline** tool calls.
+ *
+ * DeepSeek-architecture models (deepseek-v3/r1 and derivatives such as
+ * cogito-2.1) emit tool calls in a special-token format rather than as OpenAI
+ * `tool_calls`. The official DeepSeek API parses this server-side and returns
+ * structured `tool_calls`, but many OpenAI-compatible deployments — self-hosted
+ * vLLM / SGLang / llama.cpp, ollama, and some proxies — do NOT, and instead leak
+ * the raw tokens into the assistant `content`:
+ *
+ *   <|tool▁calls▁begin|>
+ *     <|tool▁call▁begin|>function<|tool▁sep|>NAME
+ *     ```json
+ *     { ...arguments... }
+ *     ```<|tool▁call▁end|>      (repeated for parallel calls)
+ *   <|tool▁calls▁end|>
+ *
+ * The bar is either ASCII `|` (U+007C, as ollama-cloud emits) or the tokenizer's
+ * full-width `｜` (U+FF5C, as raw vLLM/SGLang/llama.cpp leaks); separators are
+ * `▁` (U+2581). The outer `…calls▁begin…` wrapper is sometimes omitted and the
+ * block starts straight at a per-call `…call▁begin…` (see vllm-project/vllm#21727),
+ * so detection anchors on either boundary.
+ *
+ * When this happens the agent sees no tool call and the turn dead-ends. This
+ * module parses those tokens client-side so the call can still be dispatched. It
+ * is applied ONLY when the provider returned no structured tool call AND a block
+ * boundary is present, so it is a no-op for every well-behaved provider/model.
+ */
+
+const SEP = '▁'; // U+2581 (LOWER ONE EIGHTH BLOCK)
+const BAR = '[|｜]'; // regex character class: ASCII U+007C or full-width U+FF5C
+
+/** Regex source for a single `<|tool▁…|>` special token, accepting either bar. */
+const sentinel = (body: string): string => `<${BAR}tool${SEP}${body}${BAR}>`;
+/** First block boundary: a calls-begin OR a call-begin token, either bar. */
+const BLOCK_START = new RegExp(sentinel(`calls?${SEP}begin`));
+
+// One call: <…call▁begin…>[function]<…sep…>NAME ```[json] {ARGS} ``` — pieces separated by \s*
+const CALL_RE = new RegExp(
+  [sentinel(`call${SEP}begin`), '(?:function)?', sentinel('sep'), '([A-Za-z0-9_.-]+)', '```(?:json)?']
+    .join('\\s*') + '\\s*([\\s\\S]*?)```',
+  'g',
+);
+
+/** Canonical ASCII calls-begin sentinel (documentation / convenience). */
+export const DEEPSEEK_TOOL_CALLS_BEGIN = `<|tool${SEP}calls${SEP}begin|>`;
+const MAX_MARKER_LEN = DEEPSEEK_TOOL_CALLS_BEGIN.length; // longest marker the stream filter may hold back
+
+/** Offset of the earliest DeepSeek tool-call block boundary in `content`, or -1 if none. */
+export function firstBlockStart(content: string): number {
+  // BLOCK_START is non-global, so `exec` always scans from the start of `content`.
+  return BLOCK_START.exec(content)?.index ?? -1;
+}
+
+/**
+ * Convert DeepSeek inline tool-call tokens found in assistant `content` into
+ * structured {@link ToolCall}s. A call whose fenced argument text is not valid
+ * JSON is dropped, so a partly corrupted emission still yields its good calls.
+ */
+export function parseDeepSeekInlineToolCalls(content: string): ToolCall[] {
+  const parsed: ToolCall[] = [];
+  // Non-string input, or no block boundary at all: nothing to parse.
+  if (typeof content !== 'string' || firstBlockStart(content) < 0) {
+    return parsed;
+  }
+  // `matchAll` walks a private clone of CALL_RE, leaving the shared `lastIndex` alone.
+  for (const hit of content.matchAll(CALL_RE)) {
+    const name = (hit[1] ?? '').trim();
+    const args = (hit[2] ?? '').trim();
+    if (name.length === 0) continue;
+    try {
+      JSON.parse(args);
+    } catch {
+      continue;
+    }
+    parsed.push({ type: 'function', id: crypto.randomUUID(), name, arguments: args });
+  }
+  return parsed;
+}
+
+/**
+ * Streaming text filter for leaked DeepSeek tool-call tokens. Deltas flow to the
+ * caller as they arrive until a block-start marker appears; from then on nothing
+ * more is shown, while the complete content keeps being recorded so that
+ * {@link parseDeepSeekInlineToolCalls} can parse it once the stream is over.
+ *
+ * A short tail is always withheld in case a marker is split across two deltas.
+ * When the provider emits a structured tool call ({@link releaseHoldback}) that
+ * tail is handed back and later deltas pass through untouched, which keeps part
+ * ordering intact for well-behaved providers.
+ */
+export class DeepSeekInlineToolCallFilter {
+  /** Text received but not yet emitted while scanning for a block-start marker. */
+  private pending = '';
+  /** Concatenation of every delta pushed so far. */
+  private seen = '';
+  /** `scanning` moves once, to `suppressing` (marker seen) or `passthrough`. */
+  private mode: 'scanning' | 'suppressing' | 'passthrough' = 'scanning';
+
+  /** Feed a content delta; returns the text safe to yield now (possibly empty). */
+  push(delta: string): string {
+    this.seen += delta;
+    if (this.mode === 'passthrough') return delta;
+    if (this.mode === 'suppressing') return '';
+    const scanned = this.pending + delta;
+    const markerAt = firstBlockStart(scanned);
+    if (markerAt >= 0) {
+      // Marker found: emit the text in front of it and go silent from here on.
+      this.mode = 'suppressing';
+      this.pending = '';
+      return scanned.slice(0, markerAt);
+    }
+
+    // No marker yet: keep the last MAX_MARKER_LEN - 1 characters (a marker could
+    // still be completed by the next delta) and emit whatever precedes them.
+    const emitLen = Math.max(0, scanned.length - (MAX_MARKER_LEN - 1));
+    this.pending = scanned.slice(emitLen);
+    return scanned.slice(0, emitLen);
+  }
+
+  /**
+   * Call when the provider emits a structured tool call, after which no inline
+   * leak is expected. Returns the withheld text (so a short preamble is not
+   * reordered after the tool-call parts) and switches to verbatim passthrough.
+   *
+   * Does nothing, and returns '', once suppression has begun: flipping to
+   * passthrough then would leak the rest of the inline block as visible text.
+   */
+  releaseHoldback(): string {
+    if (this.mode === 'suppressing') return '';
+    this.mode = 'passthrough';
+    return this.drain();
+  }
+
+  /** Text still withheld at stream end ('' if a block was suppressed); cleared, so idempotent. */
+  flush(): string {
+    return this.mode === 'suppressing' ? '' : this.drain();
+  }
+
+  /** Whether a block-start marker was seen (and the rest of content suppressed). */
+  get sawToolBlock(): boolean {
+    return this.mode === 'suppressing';
+  }
+
+  /** Full accumulated content (for parsing). */
+  get content(): string {
+    return this.seen;
+  }
+
+  /** Hand back the withheld text and forget it. */
+  private drain(): string {
+    const held = this.pending;
+    this.pending = '';
+    return held;
+  }
+}
diff --git a/packages/kosong/src/providers/openai-legacy.ts b/packages/kosong/src/providers/openai-legacy.ts
@@ -30,6 +30,11 @@ import {
   convertChatCompletionStreamToolCall,
   type BufferedChatCompletionToolCall,
 } from './chat-completions-stream';
+import {
+  DeepSeekInlineToolCallFilter,
+  firstBlockStart,
+  parseDeepSeekInlineToolCalls,
+} from './deepseek-inline-tool-calls';
 import {
   mergeRequestHeaders,
   requireProviderApiKey,
@@ -361,20 +366,36 @@ export class OpenAILegacyStreamedMessage implements StreamedMessage {
       yield { type: 'think', think: reasoning } satisfies StreamedMessagePart;
     }
 
-    if (message.content) {
-      yield { type: 'text', text: message.content } satisfies StreamedMessagePart;
+    const structuredToolCalls = (message.tool_calls ?? []).filter(isFunctionToolCall);
+    const content = typeof message.content === 'string' ? message.content : '';
+
+    // Fallback: a backend served a DeepSeek-format model but left its inline
+    // tool-call tokens in `content` instead of structuring them as `tool_calls`.
+    // Strip the block from visible text whenever a block boundary is present (and
+    // the provider returned no structured call) — even if some blocks fail to
+    // parse — so the raw tokens never render. Parse what we can into dispatchable
+    // tool calls. No-op when absent or already structured.
+    const blockStart = structuredToolCalls.length === 0 ? firstBlockStart(content) : -1;
+    const inlineToolCalls = blockStart >= 0 ? parseDeepSeekInlineToolCalls(content) : [];
+
+    if (content.length > 0) {
+      const text = blockStart >= 0 ? content.slice(0, blockStart) : content;
+      if (text.length > 0) {
+        yield { type: 'text', text } satisfies StreamedMessagePart;
+      }
     }
 
-    if (message.tool_calls) {
-      for (const toolCall of message.tool_calls) {
-        if (!isFunctionToolCall(toolCall)) continue;
-        yield {
-          type: 'function',
-          id: toolCall.id || crypto.randomUUID(),
-          name: toolCall.function.name,
-          arguments: toolCall.function.arguments,
-        } satisfies ToolCall;
-      }
+    for (const toolCall of structuredToolCalls) {
+      yield {
+        type: 'function',
+        id: toolCall.id || crypto.randomUUID(),
+        name: toolCall.function.name,
+        arguments: toolCall.function.arguments,
+      } satisfies ToolCall;
+    }
+
+    for (const toolCall of inlineToolCalls) {
+      yield toolCall;
     }
   }
 
@@ -383,6 +404,8 @@ export class OpenAILegacyStreamedMessage implements StreamedMessage {
     reasoningKey: string | undefined,
   ): AsyncGenerator<StreamedMessagePart> {
     const bufferedToolCalls = new Map<number | string, BufferedChatCompletionToolCall>();
+    const inlineFilter = new DeepSeekInlineToolCallFilter();
+    let sawStructuredToolCall = false;
 
     try {
       for await (const chunk of response) {
@@ -416,19 +439,46 @@ export class OpenAILegacyStreamedMessage implements StreamedMessage {
           yield { type: 'think', think: reasoning } satisfies StreamedMessagePart;
         }
 
-        // text content
+        // text content — funnel through the inline filter so a leaked DeepSeek
+        // tool-call block is stripped from visible text (and captured for parsing
+        // once the stream ends) instead of being shown to the user.
         if (delta.content) {
-          yield { type: 'text', text: delta.content } satisfies StreamedMessagePart;
+          const visible = inlineFilter.push(delta.content);
+          if (visible.length > 0) {
+            yield { type: 'text', text: visible } satisfies StreamedMessagePart;
+          }
         }
 
         // tool calls — preserve `index` on every yielded part so the generate
         // loop can route interleaved argument deltas from parallel tool calls.
+        if (delta.tool_calls && delta.tool_calls.length > 0 && !sawStructuredToolCall) {
+          sawStructuredToolCall = true;
+          // A structured tool call means no inline leak is possible: release any
+          // held-back preamble text now so it isn't reordered after the call parts.
+          const released = inlineFilter.releaseHoldback();
+          if (released.length > 0) {
+            yield { type: 'text', text: released } satisfies StreamedMessagePart;
+          }
+        }
         for (const toolCall of delta.tool_calls ?? []) {
           for (const part of convertChatCompletionStreamToolCall(toolCall, bufferedToolCalls)) {
             yield part;
           }
         }
       }
+
+      // Flush any text held back for partial begin-marker detection.
+      const tail = inlineFilter.flush();
+      if (tail.length > 0) {
+        yield { type: 'text', text: tail } satisfies StreamedMessagePart;
+      }
+      // Fallback: the backend served a DeepSeek-format model but left its inline
+      // tool-call tokens in `content` instead of structuring them. Parse them.
+      if (!sawStructuredToolCall && inlineFilter.sawToolBlock) {
+        for (const toolCall of parseDeepSeekInlineToolCalls(inlineFilter.content)) {
+          yield toolCall;
+        }
+      }
     } catch (error: unknown) {
       throw convertOpenAIError(error);
     }