diff --git a/docs/features/k2_v3.md b/docs/features/k2_v3.md
index 901083d7095f..84bcb4d47ced 100644
--- a/docs/features/k2_v3.md
+++ b/docs/features/k2_v3.md
@@ -1,7 +1,7 @@
# K2-V3 Reasoning and Tool Calling
This page documents how to use the LLM360 `k2_v3` reasoning parser and the
-multi-format tool calling parser from the LLM360 `v0.12.0-ifm` branch.
+`k2_v3` multi-format tool calling parser from the LLM360 `v0.12.0-ifm-0518` branch.
!!! note
This workflow is specific to the LLM360 fork and branch. It is not the
@@ -14,7 +14,7 @@ Use the LLM360 fork for this setup:
```bash
git clone https://github.com/LLM360/vllm.git
cd vllm
-git switch v0.12.0-ifm
+git switch v0.12.0-ifm-0518
VLLM_USE_PRECOMPILED=1 uv pip install --editable .
```
@@ -30,7 +30,7 @@ vllm serve YOUR_K2_V3_MODEL \
--port 8000 \
--reasoning-parser k2_v3 \
--enable-auto-tool-choice \
- --tool-call-parser multi_format
+ --tool-call-parser k2_v3
```
## Required Settings
@@ -38,18 +38,19 @@ vllm serve YOUR_K2_V3_MODEL \
Use these settings to enable the K2 reasoning parser and tool calling flow:
- `--reasoning-parser k2_v3`
-- `--tool-call-parser multi_format`
+- `--tool-call-parser k2_v3`
- `--enable-auto-tool-choice`
At request time, pass the chat template kwargs through `extra_body`:
-- `reasoning_effort`: optional, defaults to `high` (` ... `)
-- `tool_format`: selects the tool-call output format used by the parser
+- `reasoning_effort`: optional, defaults to `high`
+ (` ... `)
+- `tool_call_format`: selects the tool-call output format used by the parser (defaults to `xml`)
## OpenAI-Compatible Client Example
The example below uses `YOUR_K2_V3_MODEL` with `tool_choice="auto"` and
-sets both `tool_format` and `reasoning_effort` in `chat_template_kwargs`.
+sets both `tool_call_format` and `reasoning_effort` in `chat_template_kwargs`.
```python
from openai import OpenAI
@@ -84,7 +85,7 @@ resp = client.chat.completions.create(
tool_choice="auto",
extra_body={
"chat_template_kwargs": {
- "tool_format": "default",
+ "tool_call_format": "xml",
"reasoning_effort": "high",
}
},
@@ -98,9 +99,9 @@ print("tool_calls:", message.tool_calls)
## Summary
-For the LLM360 `v0.12.0-ifm` branch:
+For the LLM360 `v0.12.0-ifm-0518` branch:
- use `k2_v3` as the reasoning parser
-- use `multi_format` as the tool-call parser
-- use `high` / `` as the default reasoning effort unless you override it
-- set `tool_format` in request-time `chat_template_kwargs`
+- use `k2_v3` as the tool-call parser
+- use `high` / `` as the default reasoning effort unless you override it
+- set `tool_call_format` in request-time `chat_template_kwargs`
diff --git a/tests/entrypoints/openai/test_tool_parser_kwargs.py b/tests/entrypoints/openai/test_tool_parser_kwargs.py
index b3fb934a15da..b7b4e9bc080b 100644
--- a/tests/entrypoints/openai/test_tool_parser_kwargs.py
+++ b/tests/entrypoints/openai/test_tool_parser_kwargs.py
@@ -59,7 +59,7 @@ def _xgrammar_getattr(name: str):
class KwargAwareToolParser(ToolParser):
def __init__(self, tokenizer, chat_template_kwargs=None):
super().__init__(tokenizer)
- self.tool_format = (chat_template_kwargs or {}).get("tool_format")
+ self.tool_call_format = (chat_template_kwargs or {}).get("tool_call_format")
def extract_tool_calls(self, model_output, request):
return ExtractedToolCallInformation(
@@ -67,7 +67,7 @@ def extract_tool_calls(self, model_output, request):
tool_calls=[
ToolCall(
function=FunctionCall(
- name=self.tool_format or "missing",
+ name=self.tool_call_format or "missing",
arguments="{}",
)
)
@@ -148,7 +148,7 @@ def test_parse_tool_calls_from_content_passes_chat_template_kwargs():
enable_auto_tools=True,
tool_parser_cls=KwargAwareToolParser,
content="noop()",
- chat_template_kwargs={"tool_format": "python"},
+ chat_template_kwargs={"tool_call_format": "python"},
)
assert content is None
@@ -167,7 +167,7 @@ def test_parse_tool_calls_from_content_keeps_legacy_parsers_compatible():
enable_auto_tools=True,
tool_parser_cls=LegacyToolParser,
content="noop()",
- chat_template_kwargs={"tool_format": "python"},
+ chat_template_kwargs={"tool_call_format": "python"},
)
assert content is None
diff --git a/tests/entrypoints/openai/tool_parsers/test_multi_format_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_multi_format_tool_parser.py
index a67215651389..83ac97615db2 100644
--- a/tests/entrypoints/openai/tool_parsers/test_multi_format_tool_parser.py
+++ b/tests/entrypoints/openai/tool_parsers/test_multi_format_tool_parser.py
@@ -32,10 +32,10 @@ def decode(self, token_ids):
return "".join(reverse_vocab[token_id] for token_id in token_ids)
-def make_parser(tool_format: str) -> ToolParser:
+def make_parser(tool_call_format: str) -> ToolParser:
return ToolParserManager.get_tool_parser("multi_format")(
FakeTokenizer(),
- chat_template_kwargs={"tool_format": tool_format},
+ chat_template_kwargs={"tool_call_format": tool_call_format},
)
@@ -78,7 +78,7 @@ def make_schema_request() -> ChatCompletionRequest:
)
-def test_missing_tool_format_defaults_to_xml():
+def test_missing_tool_call_format_defaults_to_xml():
parser = make_parser_with_kwargs({})
extracted = run_tool_extraction_nonstreaming(
@@ -220,19 +220,25 @@ def test_ifm_xml_typed_format_uses_arg_type_without_schema():
@pytest.mark.parametrize(
- "tool_format",
+ "tool_call_format",
["default", "typed_xml", "XML", "xllm_typed", "xml ", ""],
)
-def test_tool_format_requires_exact_supported_value(tool_format: str):
+def test_tool_call_format_requires_exact_supported_value(tool_call_format: str):
with pytest.raises(ValueError, match="Use one of these exact values"):
- make_parser_with_kwargs({"tool_call_format": tool_format})
+ make_parser_with_kwargs({"tool_call_format": tool_call_format})
-def test_tool_format_must_be_a_string():
+def test_tool_call_format_must_be_a_string():
with pytest.raises(ValueError, match="must be a string"):
make_parser_with_kwargs({"tool_call_format": 123})
+@pytest.mark.parametrize("arg_name", ["tool_format", "tool_calling_format"])
+def test_legacy_tool_format_arguments_are_rejected(arg_name: str):
+ with pytest.raises(ValueError, match=f"Unsupported argument: {arg_name}"):
+ make_parser_with_kwargs({arg_name: "xml"})
+
+
def test_k2_v3_parser_alias_uses_ifm_formats():
parser = ToolParserManager.get_tool_parser("k2_v3")(
FakeTokenizer(),
@@ -255,6 +261,52 @@ def test_k2_v3_parser_alias_uses_ifm_formats():
}
+def test_k2_v3_parser_strips_0518_ifm_reasoning_prefix():
+ parser = ToolParserManager.get_tool_parser("k2_v3")(
+ FakeTokenizer(),
+ chat_template_kwargs={"tool_call_format": "xml"},
+ )
+
+ extracted = run_tool_extraction_nonstreaming(
+ parser,
+ "need lookup\n"
+ "\n"
+ "get_weather"
+ "city"
+ "Tokyo"
+ "\n"
+ "",
+ make_request(),
+ )
+
+ assert extracted.tools_called
+ assert extracted.content is None
+ assert extracted.tool_calls[0].function.name == "get_weather"
+ assert json.loads(extracted.tool_calls[0].function.arguments) == {"city": "Tokyo"}
+
+
+def test_k2_v3_parser_does_not_strip_legacy_reasoning_prefix():
+ parser = ToolParserManager.get_tool_parser("k2_v3")(
+ FakeTokenizer(),
+ chat_template_kwargs={"tool_call_format": "xml"},
+ )
+
+ extracted = run_tool_extraction_nonstreaming(
+ parser,
+ "legacy reasoning\n"
+ "get_weather"
+ "city"
+ "Tokyo"
+ "",
+ make_request(),
+ )
+
+ assert extracted.tools_called
+ assert extracted.content == "legacy reasoning\n"
+ assert extracted.tool_calls[0].function.name == "get_weather"
+ assert json.loads(extracted.tool_calls[0].function.arguments) == {"city": "Tokyo"}
+
+
def test_minimax_format_extracts_inline_invokes():
parser = make_parser("minimax")
diff --git a/tests/reasoning/test_k2_v3_reasoning_parser.py b/tests/reasoning/test_k2_v3_reasoning_parser.py
index 9ffa66d8b6f7..6a02f56b3a57 100644
--- a/tests/reasoning/test_k2_v3_reasoning_parser.py
+++ b/tests/reasoning/test_k2_v3_reasoning_parser.py
@@ -9,9 +9,9 @@
PARSER_NAME = "k2_v3"
EFFORT_TOKENS = {
- "high": ("", ""),
- "medium": ("", ""),
- "low": ("", ""),
+ "high": ("", ""),
+ "medium": ("", ""),
+ "low": ("", ""),
}
@@ -190,21 +190,21 @@ def test_reasoning(
def test_default_effort_is_high(k2_v3_tokenizer):
- """Parser with no reasoning_effort should use /<\/think>."""
+ """Parser with no reasoning_effort should use /."""
parser = ReasoningParserManager.get_reasoning_parser(PARSER_NAME)(k2_v3_tokenizer)
- assert parser.start_token == ""
- assert parser.end_token == ""
+ assert parser.start_token == ""
+ assert parser.end_token == ""
def test_none_effort_falls_back_to_high(k2_v3_tokenizer):
"""reasoning_effort='none' should fall back to high tokens."""
parser = _make_parser(k2_v3_tokenizer, "none")
- assert parser.start_token == ""
- assert parser.end_token == ""
+ assert parser.start_token == ""
+ assert parser.end_token == ""
def test_unknown_effort_falls_back_to_high(k2_v3_tokenizer):
"""Unknown effort value should fall back to high tokens."""
parser = _make_parser(k2_v3_tokenizer, "ultra")
- assert parser.start_token == ""
- assert parser.end_token == ""
+ assert parser.start_token == ""
+ assert parser.end_token == ""
diff --git a/vllm/entrypoints/openai/tool_parsers/multi_format_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/multi_format_tool_parser.py
index 30b476315c78..e31a8b614183 100644
--- a/vllm/entrypoints/openai/tool_parsers/multi_format_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/multi_format_tool_parser.py
@@ -78,6 +78,14 @@ class MultiFormatToolParser(ToolParser):
r"(.*?)",
re.DOTALL,
)
+ _IFM_REASONING_PREFIX_REGEX = re.compile(
+ r"\A\s*(?:"
+ r".*?|"
+ r".*?|"
+ r".*?"
+ r")\s*",
+ re.DOTALL,
+ )
_GLM_BLOCK_REGEX = re.compile(
r"(.*?)",
@@ -96,11 +104,17 @@ def __init__(
super().__init__(tokenizer)
chat_template_kwargs = chat_template_kwargs or {}
- raw_tool_format = "xml"
- for key in ("tool_call_format", "tool_calling_format", "tool_format"):
- if key in chat_template_kwargs and chat_template_kwargs[key] is not None:
- raw_tool_format = chat_template_kwargs[key]
- break
+ if "tool_calling_format" in chat_template_kwargs:
+ raise ValueError(
+ "Unsupported argument: tool_calling_format. "
+ "Use tool_call_format with one of: json, xml, xml_typed."
+ )
+ if "tool_format" in chat_template_kwargs:
+ raise ValueError(
+ "Unsupported argument: tool_format. "
+ "Use tool_call_format with one of: json, xml, xml_typed."
+ )
+ raw_tool_format = chat_template_kwargs.get("tool_call_format", "xml")
self.tool_format = self._validate_tool_format(raw_tool_format)
self._delegate: ToolParser | None = None
@@ -115,13 +129,13 @@ def __init__(
def _validate_tool_format(cls, tool_format: Any) -> str:
if not isinstance(tool_format, str):
raise ValueError(
- "tool_format/tool_call_format must be a string. "
+ "tool_call_format must be a string. "
f"Got {type(tool_format).__name__}."
)
if tool_format not in cls._SUPPORTED_TOOL_FORMATS:
supported_formats = ", ".join(sorted(cls._SUPPORTED_TOOL_FORMATS))
raise ValueError(
- f"Unsupported tool_format/tool_call_format '{tool_format}'. "
+ f"Unsupported tool_call_format '{tool_format}'. "
"Use one of these exact values: "
f"{supported_formats}."
)
@@ -159,7 +173,7 @@ def extract_tool_calls(
return self._extract_python_tool_calls(model_output)
except Exception:
logger.exception(
- "Error extracting tool calls for tool_format=%s.",
+ "Error extracting tool calls for tool_call_format=%s.",
self.tool_format,
)
@@ -200,11 +214,21 @@ def _json_or_string(value: str) -> Any:
except json.JSONDecodeError:
return value
- @staticmethod
- def _prefix_content(model_output: str, first_tool_index: int | None) -> str | None:
+ @classmethod
+ def _strip_ifm_reasoning_prefix(cls, content: str) -> str:
+ while match := cls._IFM_REASONING_PREFIX_REGEX.match(content):
+ content = content[match.end() :]
+ return content
+
+ @classmethod
+ def _prefix_content(
+ cls,
+ model_output: str,
+ first_tool_index: int | None,
+ ) -> str | None:
if first_tool_index is None or first_tool_index <= 0:
return None
- content = model_output[:first_tool_index]
+ content = cls._strip_ifm_reasoning_prefix(model_output[:first_tool_index])
return content if content.strip() else None
@staticmethod
@@ -690,4 +714,4 @@ def _extract_python_tool_calls(
class K2V3ToolParser(MultiFormatToolParser):
- """K2-V3 alias for the IFM-aware multi-format parser."""
+ """K2-V3 parser for BBQ 0518 IFM tool-call and reasoning tokens."""
diff --git a/vllm/reasoning/k2_v3_reasoning_parser.py b/vllm/reasoning/k2_v3_reasoning_parser.py
index de70bd5a7f30..e1bb23f49e5b 100644
--- a/vllm/reasoning/k2_v3_reasoning_parser.py
+++ b/vllm/reasoning/k2_v3_reasoning_parser.py
@@ -10,10 +10,10 @@ class K2V3ReasoningParser(DeepSeekR1ReasoningParser):
Reasoning parser for the K2-v3 model family.
K2-v3 supports three reasoning effort levels, each using different
- think tokens:
- - high (default): /
- - medium: /
- - low: /
+ IFM think tokens:
+ - high (default): /
+ - medium: /
+ - low: /
The effort level is selected via the ``reasoning_effort`` parameter
in ``chat_template_kwargs``. The chat template inserts the start
@@ -22,9 +22,9 @@ class K2V3ReasoningParser(DeepSeekR1ReasoningParser):
"""
_EFFORT_TOKENS: dict[str, tuple[str, str]] = {
- "high": ("", ""),
- "medium": ("", ""),
- "low": ("", ""),
+ "high": ("", ""),
+ "medium": ("", ""),
+ "low": ("", ""),
}
def __init__(self, tokenizer: TokenizerLike, *args, **kwargs):