From b7c35de0b03636e641f53ccc8e89a2484c93d89a Mon Sep 17 00:00:00 2001 From: Mikael Hugo Date: Mon, 15 Jun 2026 22:54:21 +0200 Subject: [PATCH 1/3] fix(kosong): parse DeepSeek inline tool calls left unstructured by OpenAI-compatible backends MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DeepSeek-architecture models (deepseek-v3/r1 and derivatives like cogito) emit tool calls as special tokens rather than OpenAI tool_calls. DeepSeek's own API structures these server-side, but many compatible deployments — self-hosted vLLM/SGLang/llama.cpp, ollama, some proxies — leak the raw <|tool_calls_begin|> tokens into the assistant content, so the agent sees no tool call and the turn dead-ends. The OpenAI chat-completions provider now parses those tokens into structured tool calls and strips them from visible text, but only when the backend returned no structured call — a no-op for providers that already do the right thing. Covers both streaming (marker-aware, so a split begin-token is still caught) and non-streaming responses. --- .changeset/deepseek-inline-tool-calls.md | 7 ++ .../providers/deepseek-inline-tool-calls.ts | 111 ++++++++++++++++++ .../kosong/src/providers/openai-legacy.ts | 74 +++++++++--- .../deepseek-inline-tool-calls.test.ts | 79 +++++++++++++ 4 files changed, 257 insertions(+), 14 deletions(-) create mode 100644 .changeset/deepseek-inline-tool-calls.md create mode 100644 packages/kosong/src/providers/deepseek-inline-tool-calls.ts create mode 100644 packages/kosong/test/providers/deepseek-inline-tool-calls.test.ts diff --git a/.changeset/deepseek-inline-tool-calls.md b/.changeset/deepseek-inline-tool-calls.md new file mode 100644 index 000000000..426839a99 --- /dev/null +++ b/.changeset/deepseek-inline-tool-calls.md @@ -0,0 +1,7 @@ +--- +"@moonshot-ai/kosong": patch +--- + +Parse DeepSeek-format inline tool calls when an OpenAI-compatible backend leaves them unstructured. 
+ +DeepSeek-architecture models (deepseek-v3/r1 and derivatives such as cogito) emit tool calls as special tokens rather than OpenAI `tool_calls`. DeepSeek's own API structures these server-side, but many compatible deployments — self-hosted vLLM/SGLang/llama.cpp, ollama, and some proxies — leak the raw `<|tool▁calls▁begin|>…` tokens into the assistant content, leaving the agent with nothing to dispatch and the turn dead-ending. The OpenAI chat-completions provider now detects that case, parses the tokens into structured tool calls, and strips them from the visible text — but only when the backend returned no structured call, so it stays a no-op for providers that already do the right thing. diff --git a/packages/kosong/src/providers/deepseek-inline-tool-calls.ts b/packages/kosong/src/providers/deepseek-inline-tool-calls.ts new file mode 100644 index 000000000..35092cd86 --- /dev/null +++ b/packages/kosong/src/providers/deepseek-inline-tool-calls.ts @@ -0,0 +1,111 @@ +import type { ToolCall } from '#/message'; + +/** + * Defensive fallback for DeepSeek-V3 style **inline** tool calls. + * + * DeepSeek-architecture models (deepseek-v3/r1 and derivatives such as + * cogito-2.1) emit tool calls in a special-token format rather than as OpenAI + * `tool_calls`. The official DeepSeek API parses this server-side and returns + * structured `tool_calls`, but many OpenAI-compatible deployments — self-hosted + * vLLM / SGLang / llama.cpp, ollama, and some proxies — do NOT, and instead leak + * the raw tokens into the assistant `content`: + * + * <|tool▁calls▁begin|> + * <|tool▁call▁begin|>function<|tool▁sep|>NAME + * ```json + * { ...arguments... } + * ```<|tool▁call▁end|> (repeated for parallel calls) + * <|tool▁calls▁end|> + * + * (the bars are ASCII U+007C, the separators are U+2581). When that happens the + * agent sees no tool call and the turn dead-ends. This module parses those tokens + * client-side so the call can still be dispatched. 
It is applied ONLY when the + * provider returned no structured tool call AND the begin token is present, so it + * is a no-op for every well-behaved provider/model. + */ + +const SEP = '▁'; // ▁ +export const DEEPSEEK_TOOL_CALLS_BEGIN = `<|tool${SEP}calls${SEP}begin|>`; + +// <|tool▁call▁begin|>[function]<|tool▁sep|>NAME ```[json] {ARGS} ``` +const CALL_RE = new RegExp( + `<\\|tool${SEP}call${SEP}begin\\|>\\s*(?:function)?\\s*<\\|tool${SEP}sep\\|>\\s*([A-Za-z0-9_.-]+)\\s*` + + '```(?:json)?\\s*([\\s\\S]*?)```', + 'g', +); + +/** + * Parse DeepSeek inline tool-call tokens from assistant content into structured + * {@link ToolCall}s. Calls whose argument block is not valid JSON are skipped, + * so a partially corrupted emission yields the calls it can rather than throwing. + */ +export function parseDeepSeekInlineToolCalls(content: string): ToolCall[] { + if (typeof content !== 'string' || !content.includes(DEEPSEEK_TOOL_CALLS_BEGIN)) { + return []; + } + const calls: ToolCall[] = []; + CALL_RE.lastIndex = 0; + let match: RegExpExecArray | null; + while ((match = CALL_RE.exec(content)) !== null) { + const name = (match[1] ?? '').trim(); + const args = (match[2] ?? '').trim(); + if (name.length === 0) continue; + try { + JSON.parse(args); + } catch { + continue; + } + calls.push({ type: 'function', id: crypto.randomUUID(), name, arguments: args }); + } + return calls; +} + +/** + * Streaming-safe filter that lets text deltas through live until the DeepSeek + * tool-call block begins, then suppresses the raw tokens (so they never reach the + * UI) while still accumulating the full content for {@link parseDeepSeekInlineToolCalls}. + * + * A small trailing holdback covers a begin-marker that straddles two deltas. 
+ */ +export class DeepSeekInlineToolCallFilter { + private readonly marker = DEEPSEEK_TOOL_CALLS_BEGIN; + private buffer = ''; + private full = ''; + private suppressing = false; + + /** Feed a content delta; returns the text safe to yield now (possibly empty). */ + push(delta: string): string { + this.full += delta; + if (this.suppressing) return ''; + this.buffer += delta; + const idx = this.buffer.indexOf(this.marker); + if (idx >= 0) { + const out = this.buffer.slice(0, idx); + this.suppressing = true; + this.buffer = ''; + return out; + } + const holdback = this.marker.length - 1; + if (this.buffer.length > holdback) { + const out = this.buffer.slice(0, this.buffer.length - holdback); + this.buffer = this.buffer.slice(this.buffer.length - holdback); + return out; + } + return ''; + } + + /** Remaining buffered text once the stream ends (empty if a block was suppressed). */ + flush(): string { + return this.suppressing ? '' : this.buffer; + } + + /** Whether the begin marker was seen. */ + get sawToolBlock(): boolean { + return this.suppressing; + } + + /** Full accumulated content (for parsing). 
*/ + get content(): string { + return this.full; + } +} diff --git a/packages/kosong/src/providers/openai-legacy.ts b/packages/kosong/src/providers/openai-legacy.ts index c9987a5f5..20c60bc05 100644 --- a/packages/kosong/src/providers/openai-legacy.ts +++ b/packages/kosong/src/providers/openai-legacy.ts @@ -30,6 +30,11 @@ import { convertChatCompletionStreamToolCall, type BufferedChatCompletionToolCall, } from './chat-completions-stream'; +import { + DeepSeekInlineToolCallFilter, + parseDeepSeekInlineToolCalls, + DEEPSEEK_TOOL_CALLS_BEGIN, +} from './deepseek-inline-tool-calls'; import { mergeRequestHeaders, requireProviderApiKey, @@ -361,20 +366,38 @@ export class OpenAILegacyStreamedMessage implements StreamedMessage { yield { type: 'think', think: reasoning } satisfies StreamedMessagePart; } - if (message.content) { - yield { type: 'text', text: message.content } satisfies StreamedMessagePart; + const structuredToolCalls = (message.tool_calls ?? []).filter(isFunctionToolCall); + + // Fallback: a backend served a DeepSeek-format model but left its inline + // tool-call tokens in `content` instead of structuring them as `tool_calls`. + // Parse them so the call is still dispatched (no-op when absent or when the + // provider already structured the call). + const inlineToolCalls = + structuredToolCalls.length === 0 && typeof message.content === 'string' + ? parseDeepSeekInlineToolCalls(message.content) + : []; + + if (typeof message.content === 'string' && message.content.length > 0) { + const text = + inlineToolCalls.length > 0 + ? 
message.content.slice(0, message.content.indexOf(DEEPSEEK_TOOL_CALLS_BEGIN)) + : message.content; + if (text.length > 0) { + yield { type: 'text', text } satisfies StreamedMessagePart; + } } - if (message.tool_calls) { - for (const toolCall of message.tool_calls) { - if (!isFunctionToolCall(toolCall)) continue; - yield { - type: 'function', - id: toolCall.id || crypto.randomUUID(), - name: toolCall.function.name, - arguments: toolCall.function.arguments, - } satisfies ToolCall; - } + for (const toolCall of structuredToolCalls) { + yield { + type: 'function', + id: toolCall.id || crypto.randomUUID(), + name: toolCall.function.name, + arguments: toolCall.function.arguments, + } satisfies ToolCall; + } + + for (const toolCall of inlineToolCalls) { + yield toolCall; } } @@ -383,6 +406,8 @@ export class OpenAILegacyStreamedMessage implements StreamedMessage { reasoningKey: string | undefined, ): AsyncGenerator { const bufferedToolCalls = new Map(); + const inlineFilter = new DeepSeekInlineToolCallFilter(); + let sawStructuredToolCall = false; try { for await (const chunk of response) { @@ -416,19 +441,40 @@ export class OpenAILegacyStreamedMessage implements StreamedMessage { yield { type: 'think', think: reasoning } satisfies StreamedMessagePart; } - // text content + // text content — funnel through the inline filter so a leaked DeepSeek + // tool-call block is stripped from visible text (and captured for parsing + // once the stream ends) instead of being shown to the user. if (delta.content) { - yield { type: 'text', text: delta.content } satisfies StreamedMessagePart; + const visible = inlineFilter.push(delta.content); + if (visible.length > 0) { + yield { type: 'text', text: visible } satisfies StreamedMessagePart; + } } // tool calls — preserve `index` on every yielded part so the generate // loop can route interleaved argument deltas from parallel tool calls. 
+ if (delta.tool_calls && delta.tool_calls.length > 0) { + sawStructuredToolCall = true; + } for (const toolCall of delta.tool_calls ?? []) { for (const part of convertChatCompletionStreamToolCall(toolCall, bufferedToolCalls)) { yield part; } } } + + // Flush any text held back for partial begin-marker detection. + const tail = inlineFilter.flush(); + if (tail.length > 0) { + yield { type: 'text', text: tail } satisfies StreamedMessagePart; + } + // Fallback: the backend served a DeepSeek-format model but left its inline + // tool-call tokens in `content` instead of structuring them. Parse them. + if (!sawStructuredToolCall && inlineFilter.sawToolBlock) { + for (const toolCall of parseDeepSeekInlineToolCalls(inlineFilter.content)) { + yield toolCall; + } + } } catch (error: unknown) { throw convertOpenAIError(error); } diff --git a/packages/kosong/test/providers/deepseek-inline-tool-calls.test.ts b/packages/kosong/test/providers/deepseek-inline-tool-calls.test.ts new file mode 100644 index 000000000..8a3d05500 --- /dev/null +++ b/packages/kosong/test/providers/deepseek-inline-tool-calls.test.ts @@ -0,0 +1,79 @@ +import { + DEEPSEEK_TOOL_CALLS_BEGIN, + DeepSeekInlineToolCallFilter, + parseDeepSeekInlineToolCalls, +} from '#/providers/deepseek-inline-tool-calls'; +import { describe, expect, it } from 'vitest'; + +const SEP = '▁'; +const callBlock = (name: string, args: string) => + `<|tool${SEP}call${SEP}begin|>function<|tool${SEP}sep|>${name}\n\`\`\`json\n${args}\n\`\`\`<|tool${SEP}call${SEP}end|>`; +const wrap = (...blocks: string[]) => + `${DEEPSEEK_TOOL_CALLS_BEGIN}${blocks.join('')}<|tool${SEP}calls${SEP}end|>`; + +describe('parseDeepSeekInlineToolCalls', () => { + it('returns no calls when the begin token is absent', () => { + expect(parseDeepSeekInlineToolCalls('A plain assistant answer.')).toEqual([]); + expect(parseDeepSeekInlineToolCalls('')).toEqual([]); + }); + + it('parses a single inline tool call', () => { + const calls = 
parseDeepSeekInlineToolCalls(wrap(callBlock('read_file', '{"path": "app.js"}'))); + expect(calls).toHaveLength(1); + expect(calls[0]).toMatchObject({ + type: 'function', + name: 'read_file', + arguments: '{"path": "app.js"}', + }); + expect(typeof calls[0]?.id).toBe('string'); + }); + + it('parses parallel inline tool calls in order', () => { + const calls = parseDeepSeekInlineToolCalls( + wrap(callBlock('Read', '{"path":"a.js"}'), callBlock('Grep', '{"pattern":"foo"}')), + ); + expect(calls.map((c) => c.name)).toEqual(['Read', 'Grep']); + expect(calls.map((c) => c.arguments)).toEqual(['{"path":"a.js"}', '{"pattern":"foo"}']); + }); + + it('skips a call whose argument block is not valid JSON', () => { + const calls = parseDeepSeekInlineToolCalls( + wrap(callBlock('Read', '{"path": broken'), callBlock('Grep', '{"pattern":"x"}')), + ); + expect(calls.map((c) => c.name)).toEqual(['Grep']); + }); +}); + +describe('DeepSeekInlineToolCallFilter', () => { + it('passes ordinary text through and never suppresses', () => { + const f = new DeepSeekInlineToolCallFilter(); + let out = f.push('Hello, '); + out += f.push('world.'); + out += f.flush(); + expect(out).toBe('Hello, world.'); + expect(f.sawToolBlock).toBe(false); + }); + + it('emits text before the block and suppresses the tokens', () => { + const f = new DeepSeekInlineToolCallFilter(); + const content = `Reading now. ${wrap(callBlock('read_file', '{"path":"a.js"}'))}`; + let out = ''; + out += f.push(content); + out += f.flush(); + expect(out).toBe('Reading now. 
'); + expect(f.sawToolBlock).toBe(true); + expect(f.content).toBe(content); + expect(parseDeepSeekInlineToolCalls(f.content)).toHaveLength(1); + }); + + it('detects a begin marker split across deltas', () => { + const f = new DeepSeekInlineToolCallFilter(); + const mid = Math.floor(DEEPSEEK_TOOL_CALLS_BEGIN.length / 2); + let out = ''; + out += f.push(`ok ${DEEPSEEK_TOOL_CALLS_BEGIN.slice(0, mid)}`); + out += f.push(`${DEEPSEEK_TOOL_CALLS_BEGIN.slice(mid)}rest`); + out += f.flush(); + expect(out).toBe('ok '); + expect(f.sawToolBlock).toBe(true); + }); +}); From 60eab4d8499847fe42c05b7a8c359471e6013dc5 Mon Sep 17 00:00:00 2001 From: Mikael Hugo Date: Mon, 15 Jun 2026 23:03:27 +0200 Subject: [PATCH 2/3] address review: strip malformed inline blocks; make filter.flush idempotent MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Non-stream path: strip the DeepSeek token block from visible text whenever the begin token is present, not only when a call parses — so a malformed emission never renders raw tokens to the user. - DeepSeekInlineToolCallFilter.flush() clears its buffer, so it is idempotent and a reused instance can't re-emit. - Tests for both, plus non-stream provider coverage (valid and malformed blocks). --- .../providers/deepseek-inline-tool-calls.ts | 10 +++- .../kosong/src/providers/openai-legacy.ts | 25 +++++---- .../deepseek-inline-tool-calls.test.ts | 56 +++++++++++++++++++ 3 files changed, 77 insertions(+), 14 deletions(-) diff --git a/packages/kosong/src/providers/deepseek-inline-tool-calls.ts b/packages/kosong/src/providers/deepseek-inline-tool-calls.ts index 35092cd86..1160d1ee8 100644 --- a/packages/kosong/src/providers/deepseek-inline-tool-calls.ts +++ b/packages/kosong/src/providers/deepseek-inline-tool-calls.ts @@ -94,9 +94,15 @@ export class DeepSeekInlineToolCallFilter { return ''; } - /** Remaining buffered text once the stream ends (empty if a block was suppressed). 
*/ + /** + * Remaining buffered text once the stream ends (empty if a block was + * suppressed). Idempotent: the buffer is cleared, so a second call returns ''. + */ flush(): string { - return this.suppressing ? '' : this.buffer; + if (this.suppressing) return ''; + const out = this.buffer; + this.buffer = ''; + return out; } /** Whether the begin marker was seen. */ diff --git a/packages/kosong/src/providers/openai-legacy.ts b/packages/kosong/src/providers/openai-legacy.ts index 20c60bc05..46ad5af62 100644 --- a/packages/kosong/src/providers/openai-legacy.ts +++ b/packages/kosong/src/providers/openai-legacy.ts @@ -367,21 +367,22 @@ export class OpenAILegacyStreamedMessage implements StreamedMessage { } const structuredToolCalls = (message.tool_calls ?? []).filter(isFunctionToolCall); + const content = typeof message.content === 'string' ? message.content : ''; // Fallback: a backend served a DeepSeek-format model but left its inline // tool-call tokens in `content` instead of structuring them as `tool_calls`. - // Parse them so the call is still dispatched (no-op when absent or when the - // provider already structured the call). - const inlineToolCalls = - structuredToolCalls.length === 0 && typeof message.content === 'string' - ? parseDeepSeekInlineToolCalls(message.content) - : []; - - if (typeof message.content === 'string' && message.content.length > 0) { - const text = - inlineToolCalls.length > 0 - ? message.content.slice(0, message.content.indexOf(DEEPSEEK_TOOL_CALLS_BEGIN)) - : message.content; + // Strip the block from visible text whenever the begin token is present (and + // the provider returned no structured call) — even if some blocks fail to + // parse — so the raw tokens never render. Parse what we can into dispatchable + // tool calls. No-op when absent or already structured. + const hasInlineBlock = + structuredToolCalls.length === 0 && content.includes(DEEPSEEK_TOOL_CALLS_BEGIN); + const inlineToolCalls = hasInlineBlock ? 
parseDeepSeekInlineToolCalls(content) : []; + + if (content.length > 0) { + const text = hasInlineBlock + ? content.slice(0, content.indexOf(DEEPSEEK_TOOL_CALLS_BEGIN)) + : content; if (text.length > 0) { yield { type: 'text', text } satisfies StreamedMessagePart; } diff --git a/packages/kosong/test/providers/deepseek-inline-tool-calls.test.ts b/packages/kosong/test/providers/deepseek-inline-tool-calls.test.ts index 8a3d05500..72b62447a 100644 --- a/packages/kosong/test/providers/deepseek-inline-tool-calls.test.ts +++ b/packages/kosong/test/providers/deepseek-inline-tool-calls.test.ts @@ -1,8 +1,10 @@ +import type { StreamedMessagePart } from '#/message'; import { DEEPSEEK_TOOL_CALLS_BEGIN, DeepSeekInlineToolCallFilter, parseDeepSeekInlineToolCalls, } from '#/providers/deepseek-inline-tool-calls'; +import { OpenAILegacyStreamedMessage } from '#/providers/openai-legacy'; import { describe, expect, it } from 'vitest'; const SEP = '▁'; @@ -76,4 +78,58 @@ describe('DeepSeekInlineToolCallFilter', () => { expect(out).toBe('ok '); expect(f.sawToolBlock).toBe(true); }); + + it('flush is idempotent — a second call returns empty', () => { + const f = new DeepSeekInlineToolCallFilter(); + f.push('held'); + expect(f.flush()).toBe('held'); + expect(f.flush()).toBe(''); + }); + + it('suppresses a malformed block too (it has the begin token but no parseable call)', () => { + const f = new DeepSeekInlineToolCallFilter(); + const malformed = `${DEEPSEEK_TOOL_CALLS_BEGIN}<|tool${SEP}call${SEP}begin|>function<|tool${SEP}sep|>read_file\n\`\`\`json\n{ broken`; + let out = f.push(`note ${malformed}`); + out += f.flush(); + expect(out).toBe('note '); + expect(f.sawToolBlock).toBe(true); + expect(parseDeepSeekInlineToolCalls(f.content)).toEqual([]); + }); +}); + +describe('OpenAILegacyStreamedMessage inline-tool fallback (non-stream)', () => { + const nonStream = (content: string) => + new OpenAILegacyStreamedMessage( + { id: 'cmpl_test', choices: [{ index: 0, message: { role: 
'assistant', content }, finish_reason: 'stop' }] } as never, + false, + undefined, + ); + const collect = async (sm: AsyncIterable): Promise => { + const parts: StreamedMessagePart[] = []; + for await (const part of sm) parts.push(part); + return parts; + }; + const textOf = (parts: StreamedMessagePart[]) => + parts + .filter((p): p is Extract => p.type === 'text') + .map((p) => p.text) + .join(''); + + it('parses a leaked block into a function part and strips the tokens', async () => { + const parts = await collect(nonStream(`Reading. ${wrap(callBlock('read_file', '{"path":"a.js"}'))}`)); + expect(textOf(parts)).toBe('Reading. '); + const fns = parts.filter((p): p is Extract => p.type === 'function'); + expect(fns).toHaveLength(1); + expect(fns[0]?.name).toBe('read_file'); + }); + + it('strips the tokens of a malformed block even though no call is dispatched', async () => { + const parts = await collect( + nonStream(`Reading. ${DEEPSEEK_TOOL_CALLS_BEGIN}<|tool${SEP}call${SEP}begin|>function<|tool${SEP}sep|>read_file\n\`\`\`json\n{ broken`), + ); + const text = textOf(parts); + expect(text).toBe('Reading. '); + expect(text).not.toContain(DEEPSEEK_TOOL_CALLS_BEGIN); + expect(parts.some((p) => p.type === 'function')).toBe(false); + }); }); From 42d0a7ab8702e07f140733b4fe51fa7ef838ad70 Mon Sep 17 00:00:00 2001 From: Mikael Hugo Date: Mon, 15 Jun 2026 23:15:58 +0200 Subject: [PATCH 3/3] address review: full-width sentinels, omitted calls-begin, stream ordering Bot reviewers (Codex + advreview panel) flagged: - Real DeepSeek/vLLM leaks use full-width sentinels (U+FF5C) and can omit the outer calls-begin token. Detection is now bar-agnostic and anchors on either the calls-begin or a per-call call-begin boundary, so the self-hosted vLLM/SGLang/llama.cpp case is actually recovered. - A short text preamble before a structured tool_calls delta could be reordered after the call: a structured call now releases held text first, in order. 
- releaseHoldback() is a no-op once suppression has begun, so a structured call arriving mid-block can't flip the filter to passthrough and leak the rest of the raw tokens. Covered by tests (full-width, omitted wrapper, reorder, mid-block guard). --- .../providers/deepseek-inline-tool-calls.ts | 75 +++++++++++++---- .../kosong/src/providers/openai-legacy.ts | 21 +++-- .../deepseek-inline-tool-calls.test.ts | 84 ++++++++++++++++++- 3 files changed, 152 insertions(+), 28 deletions(-) diff --git a/packages/kosong/src/providers/deepseek-inline-tool-calls.ts b/packages/kosong/src/providers/deepseek-inline-tool-calls.ts index 1160d1ee8..b24c7c70e 100644 --- a/packages/kosong/src/providers/deepseek-inline-tool-calls.ts +++ b/packages/kosong/src/providers/deepseek-inline-tool-calls.ts @@ -17,30 +17,50 @@ import type { ToolCall } from '#/message'; * ```<|tool▁call▁end|> (repeated for parallel calls) * <|tool▁calls▁end|> * - * (the bars are ASCII U+007C, the separators are U+2581). When that happens the - * agent sees no tool call and the turn dead-ends. This module parses those tokens - * client-side so the call can still be dispatched. It is applied ONLY when the - * provider returned no structured tool call AND the begin token is present, so it - * is a no-op for every well-behaved provider/model. + * The bar is either ASCII `|` (U+007C, as ollama-cloud emits) or the tokenizer's + * full-width `|` (U+FF5C, as raw vLLM/SGLang/llama.cpp leaks); separators are + * `▁` (U+2581). The outer `…calls▁begin…` wrapper is sometimes omitted and the + * block starts straight at a per-call `…call▁begin…` (see vllm-project/vllm#21727), + * so detection anchors on either boundary. + * + * When this happens the agent sees no tool call and the turn dead-ends. This + * module parses those tokens client-side so the call can still be dispatched. 
It + * is applied ONLY when the provider returned no structured tool call AND a block + * boundary is present, so it is a no-op for every well-behaved provider/model. */ -const SEP = '▁'; // ▁ -export const DEEPSEEK_TOOL_CALLS_BEGIN = `<|tool${SEP}calls${SEP}begin|>`; +const SEP = '▁'; // U+2581 +const BAR = '[||]'; // ASCII U+007C or full-width U+FF5C -// <|tool▁call▁begin|>[function]<|tool▁sep|>NAME ```[json] {ARGS} ``` +/** First block boundary: a calls-begin OR a call-begin token, either bar. */ +const BLOCK_START = new RegExp(`<${BAR}tool${SEP}calls?${SEP}begin${BAR}>`); + +// One call: <…call▁begin…>[function]<…sep…>NAME ```[json] {ARGS} ``` const CALL_RE = new RegExp( - `<\\|tool${SEP}call${SEP}begin\\|>\\s*(?:function)?\\s*<\\|tool${SEP}sep\\|>\\s*([A-Za-z0-9_.-]+)\\s*` + + `<${BAR}tool${SEP}call${SEP}begin${BAR}>\\s*(?:function)?\\s*<${BAR}tool${SEP}sep${BAR}>\\s*([A-Za-z0-9_.-]+)\\s*` + '```(?:json)?\\s*([\\s\\S]*?)```', 'g', ); +/** Canonical ASCII calls-begin sentinel (documentation / convenience). */ +export const DEEPSEEK_TOOL_CALLS_BEGIN = `<|tool${SEP}calls${SEP}begin|>`; + +// Longest sentinel we might hold back while detecting a marker split across deltas. +const MAX_MARKER_LEN = DEEPSEEK_TOOL_CALLS_BEGIN.length; + +/** Index of the first DeepSeek tool-call block boundary in `content`, or -1. */ +export function firstBlockStart(content: string): number { + const match = BLOCK_START.exec(content); + return match ? match.index : -1; +} + /** * Parse DeepSeek inline tool-call tokens from assistant content into structured * {@link ToolCall}s. Calls whose argument block is not valid JSON are skipped, * so a partially corrupted emission yields the calls it can rather than throwing. 
*/ export function parseDeepSeekInlineToolCalls(content: string): ToolCall[] { - if (typeof content !== 'string' || !content.includes(DEEPSEEK_TOOL_CALLS_BEGIN)) { + if (typeof content !== 'string' || firstBlockStart(content) < 0) { return []; } const calls: ToolCall[] = []; @@ -61,31 +81,35 @@ export function parseDeepSeekInlineToolCalls(content: string): ToolCall[] { } /** - * Streaming-safe filter that lets text deltas through live until the DeepSeek + * Streaming-safe filter that lets text deltas through live until a DeepSeek * tool-call block begins, then suppresses the raw tokens (so they never reach the - * UI) while still accumulating the full content for {@link parseDeepSeekInlineToolCalls}. + * UI) while accumulating the full content for {@link parseDeepSeekInlineToolCalls}. * - * A small trailing holdback covers a begin-marker that straddles two deltas. + * A small trailing holdback covers a block-start marker that straddles two deltas. + * Once the provider emits a structured tool call ({@link releaseHoldback}) no + * inline leak is possible, so the filter releases any held text and passes the + * rest through verbatim — preserving ordering for well-behaved providers. */ export class DeepSeekInlineToolCallFilter { - private readonly marker = DEEPSEEK_TOOL_CALLS_BEGIN; private buffer = ''; private full = ''; private suppressing = false; + private passthrough = false; /** Feed a content delta; returns the text safe to yield now (possibly empty). 
*/ push(delta: string): string { this.full += delta; + if (this.passthrough) return delta; if (this.suppressing) return ''; this.buffer += delta; - const idx = this.buffer.indexOf(this.marker); + const idx = firstBlockStart(this.buffer); if (idx >= 0) { const out = this.buffer.slice(0, idx); this.suppressing = true; this.buffer = ''; return out; } - const holdback = this.marker.length - 1; + const holdback = MAX_MARKER_LEN - 1; if (this.buffer.length > holdback) { const out = this.buffer.slice(0, this.buffer.length - holdback); this.buffer = this.buffer.slice(this.buffer.length - holdback); @@ -94,6 +118,23 @@ export class DeepSeekInlineToolCallFilter { return ''; } + /** + * The provider emitted a structured tool call, so no inline leak is possible. + * Release any held-back text (in order) and stop buffering subsequent deltas — + * this keeps a short preamble from being reordered after the tool-call parts. + * + * No-op once suppression has begun: if an inline block was already detected we + * must keep stripping it rather than flip to passthrough (which would leak the + * remainder of the block as visible text). + */ + releaseHoldback(): string { + if (this.suppressing) return ''; + this.passthrough = true; + const out = this.buffer; + this.buffer = ''; + return out; + } + /** * Remaining buffered text once the stream ends (empty if a block was * suppressed). Idempotent: the buffer is cleared, so a second call returns ''. @@ -105,7 +146,7 @@ export class DeepSeekInlineToolCallFilter { return out; } - /** Whether the begin marker was seen. */ + /** Whether a block-start marker was seen (and the rest of content suppressed). 
*/ get sawToolBlock(): boolean { return this.suppressing; } diff --git a/packages/kosong/src/providers/openai-legacy.ts b/packages/kosong/src/providers/openai-legacy.ts index 46ad5af62..b9c914f5f 100644 --- a/packages/kosong/src/providers/openai-legacy.ts +++ b/packages/kosong/src/providers/openai-legacy.ts @@ -32,8 +32,8 @@ import { } from './chat-completions-stream'; import { DeepSeekInlineToolCallFilter, + firstBlockStart, parseDeepSeekInlineToolCalls, - DEEPSEEK_TOOL_CALLS_BEGIN, } from './deepseek-inline-tool-calls'; import { mergeRequestHeaders, @@ -371,18 +371,15 @@ export class OpenAILegacyStreamedMessage implements StreamedMessage { // Fallback: a backend served a DeepSeek-format model but left its inline // tool-call tokens in `content` instead of structuring them as `tool_calls`. - // Strip the block from visible text whenever the begin token is present (and + // Strip the block from visible text whenever a block boundary is present (and // the provider returned no structured call) — even if some blocks fail to // parse — so the raw tokens never render. Parse what we can into dispatchable // tool calls. No-op when absent or already structured. - const hasInlineBlock = - structuredToolCalls.length === 0 && content.includes(DEEPSEEK_TOOL_CALLS_BEGIN); - const inlineToolCalls = hasInlineBlock ? parseDeepSeekInlineToolCalls(content) : []; + const blockStart = structuredToolCalls.length === 0 ? firstBlockStart(content) : -1; + const inlineToolCalls = blockStart >= 0 ? parseDeepSeekInlineToolCalls(content) : []; if (content.length > 0) { - const text = hasInlineBlock - ? content.slice(0, content.indexOf(DEEPSEEK_TOOL_CALLS_BEGIN)) - : content; + const text = blockStart >= 0 ? 
content.slice(0, blockStart) : content; if (text.length > 0) { yield { type: 'text', text } satisfies StreamedMessagePart; } @@ -454,8 +451,14 @@ export class OpenAILegacyStreamedMessage implements StreamedMessage { // tool calls — preserve `index` on every yielded part so the generate // loop can route interleaved argument deltas from parallel tool calls. - if (delta.tool_calls && delta.tool_calls.length > 0) { + if (delta.tool_calls && delta.tool_calls.length > 0 && !sawStructuredToolCall) { sawStructuredToolCall = true; + // A structured tool call means no inline leak is possible: release any + // held-back preamble text now so it isn't reordered after the call parts. + const released = inlineFilter.releaseHoldback(); + if (released.length > 0) { + yield { type: 'text', text: released } satisfies StreamedMessagePart; + } } for (const toolCall of delta.tool_calls ?? []) { for (const part of convertChatCompletionStreamToolCall(toolCall, bufferedToolCalls)) { diff --git a/packages/kosong/test/providers/deepseek-inline-tool-calls.test.ts b/packages/kosong/test/providers/deepseek-inline-tool-calls.test.ts index 72b62447a..8eb37a022 100644 --- a/packages/kosong/test/providers/deepseek-inline-tool-calls.test.ts +++ b/packages/kosong/test/providers/deepseek-inline-tool-calls.test.ts @@ -2,14 +2,17 @@ import type { StreamedMessagePart } from '#/message'; import { DEEPSEEK_TOOL_CALLS_BEGIN, DeepSeekInlineToolCallFilter, + firstBlockStart, parseDeepSeekInlineToolCalls, } from '#/providers/deepseek-inline-tool-calls'; import { OpenAILegacyStreamedMessage } from '#/providers/openai-legacy'; import { describe, expect, it } from 'vitest'; const SEP = '▁'; -const callBlock = (name: string, args: string) => - `<|tool${SEP}call${SEP}begin|>function<|tool${SEP}sep|>${name}\n\`\`\`json\n${args}\n\`\`\`<|tool${SEP}call${SEP}end|>`; +// `bar` lets the same helpers build both the ASCII (U+007C) form ollama emits and +// the full-width (U+FF5C) form raw vLLM/SGLang leaks. 
+const callBlock = (name: string, args: string, bar = '|') => + `<${bar}tool${SEP}call${SEP}begin${bar}>function<${bar}tool${SEP}sep${bar}>${name}\n\`\`\`json\n${args}\n\`\`\`<${bar}tool${SEP}call${SEP}end${bar}>`; const wrap = (...blocks: string[]) => `${DEEPSEEK_TOOL_CALLS_BEGIN}${blocks.join('')}<|tool${SEP}calls${SEP}end|>`; @@ -44,6 +47,27 @@ describe('parseDeepSeekInlineToolCalls', () => { ); expect(calls.map((c) => c.name)).toEqual(['Grep']); }); + + it('parses the full-width (U+FF5C) sentinel form raw vLLM leaks emit', () => { + const fw = `<|tool${SEP}calls${SEP}begin|>${callBlock('read_file', '{"path":"a.js"}', '|')}<|tool${SEP}calls${SEP}end|>`; + const calls = parseDeepSeekInlineToolCalls(fw); + expect(calls).toHaveLength(1); + expect(calls[0]?.name).toBe('read_file'); + }); + + it('parses when the outer calls-begin wrapper is omitted (starts at call-begin)', () => { + const calls = parseDeepSeekInlineToolCalls(callBlock('read_file', '{"path":"a.js"}')); + expect(calls).toHaveLength(1); + expect(calls[0]?.name).toBe('read_file'); + }); +}); + +describe('firstBlockStart', () => { + it('locates either boundary, both bars, and returns -1 otherwise', () => { + expect(firstBlockStart('plain text')).toBe(-1); + expect(firstBlockStart(`go ${DEEPSEEK_TOOL_CALLS_BEGIN}…`)).toBe(3); + expect(firstBlockStart(`go <|tool${SEP}call${SEP}begin|>…`)).toBe(3); + }); }); describe('DeepSeekInlineToolCallFilter', () => { @@ -95,6 +119,62 @@ describe('DeepSeekInlineToolCallFilter', () => { expect(f.sawToolBlock).toBe(true); expect(parseDeepSeekInlineToolCalls(f.content)).toEqual([]); }); + + it('releaseHoldback returns held text and then passes the rest through', () => { + const f = new DeepSeekInlineToolCallFilter(); + expect(f.push('Hi. ')).toBe(''); // shorter than the holdback — held + expect(f.releaseHoldback()).toBe('Hi. 
'); + expect(f.push('more')).toBe('more'); // passthrough now + }); + + it('does not leak a block when a structured call arrives after suppression began', () => { + const f = new DeepSeekInlineToolCallFilter(); + expect(f.push(`go ${DEEPSEEK_TOOL_CALLS_BEGIN}<|tool${SEP}call${SEP}begin|>`)).toBe('go '); + expect(f.sawToolBlock).toBe(true); + // A structured tool call arrives mid-block: releaseHoldback must NOT flip to + // passthrough, or the rest of the raw tokens would leak as visible text. + expect(f.releaseHoldback()).toBe(''); + expect(f.push(`function<|tool${SEP}sep|>read_file`)).toBe(''); + expect(f.flush()).toBe(''); + }); +}); + +describe('OpenAILegacyStreamedMessage inline-tool fallback (stream)', () => { + const streamed = (chunks: unknown[]) => + new OpenAILegacyStreamedMessage( + (async function* () { + for (const c of chunks) yield c; + })() as never, + true, + undefined, + ); + + it('keeps a short text preamble before structured tool-call parts (no reorder)', async () => { + // "Hi. " (4 chars) is shorter than the holdback, so it is buffered; the + // structured tool_calls delta must release it first, not after. + const sm = streamed([ + { id: 'c', choices: [{ index: 0, delta: { content: 'Hi. ' } }] }, + { + id: 'c', + choices: [ + { + index: 0, + delta: { + tool_calls: [ + { index: 0, id: 'call_1', function: { name: 'read_file', arguments: '{"path":"a.js"}' } }, + ], + }, + }, + ], + }, + { id: 'c', choices: [{ index: 0, delta: {}, finish_reason: 'tool_calls' }] }, + ]); + const parts: StreamedMessagePart[] = []; + for await (const part of sm) parts.push(part); + const types = parts.map((p) => p.type); + expect(types.indexOf('text')).toBeLessThan(types.indexOf('function')); + expect(parts.find((p) => p.type === 'text')).toMatchObject({ text: 'Hi. ' }); + }); }); describe('OpenAILegacyStreamedMessage inline-tool fallback (non-stream)', () => {