diff --git a/docs/features/k2_v3.md b/docs/features/k2_v3.md index 901083d7095f..84bcb4d47ced 100644 --- a/docs/features/k2_v3.md +++ b/docs/features/k2_v3.md @@ -1,7 +1,7 @@ # K2-V3 Reasoning and Tool Calling This page documents how to use the LLM360 `k2_v3` reasoning parser and the -multi-format tool calling parser from the LLM360 `v0.12.0-ifm` branch. +multi-format tool calling parser from the LLM360 `v0.12.0-ifm-0518` branch. !!! note This workflow is specific to the LLM360 fork and branch. It is not the @@ -14,7 +14,7 @@ Use the LLM360 fork for this setup: ```bash git clone https://github.com/LLM360/vllm.git cd vllm -git switch v0.12.0-ifm +git switch v0.12.0-ifm-0518 VLLM_USE_PRECOMPILED=1 uv pip install --editable . ``` @@ -30,7 +30,7 @@ vllm serve YOUR_K2_V3_MODEL \ --port 8000 \ --reasoning-parser k2_v3 \ --enable-auto-tool-choice \ - --tool-call-parser multi_format + --tool-call-parser k2_v3 ``` ## Required Settings @@ -38,18 +38,19 @@ vllm serve YOUR_K2_V3_MODEL \ Use these settings to enable the K2 reasoning parser and tool calling flow: - `--reasoning-parser k2_v3` -- `--tool-call-parser multi_format` +- `--tool-call-parser k2_v3` - `--enable-auto-tool-choice` At request time, pass the chat template kwargs through `extra_body`: -- `reasoning_effort`: optional, defaults to `high` (` ... `) -- `tool_format`: selects the tool-call output format used by the parser +- `reasoning_effort`: optional, defaults to `high` + (` ... `) +- `tool_call_format`: selects the tool-call output format used by the parser ## OpenAI-Compatible Client Example The example below uses `YOUR_K2_V3_MODEL` with `tool_choice="auto"` and -sets both `tool_format` and `reasoning_effort` in `chat_template_kwargs`. +sets both `tool_call_format` and `reasoning_effort` in `chat_template_kwargs`. ```python from openai import OpenAI @@ -84,7 +85,7 @@ resp = client.chat.completions.create( tool_choice="auto", extra_body={ "chat_template_kwargs": { - "tool_format": "default", + "tool_call_format": "xml", "reasoning_effort": "high", } }, @@ -98,9 +99,9 @@ print("tool_calls:", message.tool_calls) ## Summary -For the LLM360 `v0.12.0-ifm` branch: +For the LLM360 `v0.12.0-ifm-0518` branch: - use `k2_v3` as the reasoning parser -- use `multi_format` as the tool-call parser -- use `high` / `` as the default reasoning effort unless you override it -- set `tool_format` in request-time `chat_template_kwargs` +- use `k2_v3` as the tool-call parser +- use `high` / `` as the default reasoning effort unless you override it +- set `tool_call_format` in request-time `chat_template_kwargs` diff --git a/tests/entrypoints/openai/test_tool_parser_kwargs.py b/tests/entrypoints/openai/test_tool_parser_kwargs.py index b3fb934a15da..b7b4e9bc080b 100644 --- a/tests/entrypoints/openai/test_tool_parser_kwargs.py +++ b/tests/entrypoints/openai/test_tool_parser_kwargs.py @@ -59,7 +59,7 @@ def _xgrammar_getattr(name: str): class KwargAwareToolParser(ToolParser): def __init__(self, tokenizer, chat_template_kwargs=None): super().__init__(tokenizer) - self.tool_format = (chat_template_kwargs or {}).get("tool_format") + self.tool_call_format = (chat_template_kwargs or {}).get("tool_call_format") def extract_tool_calls(self, model_output, request): return ExtractedToolCallInformation( @@ -67,7 +67,7 @@ def extract_tool_calls(self, model_output, request): tool_calls=[ ToolCall( function=FunctionCall( - name=self.tool_format or "missing", + name=self.tool_call_format or "missing", arguments="{}", ) ) @@ -148,7 +148,7 @@ def test_parse_tool_calls_from_content_passes_chat_template_kwargs(): enable_auto_tools=True, tool_parser_cls=KwargAwareToolParser, content="noop()", - chat_template_kwargs={"tool_format": "python"}, + chat_template_kwargs={"tool_call_format": "python"}, ) assert content is None @@ -167,7 +167,7 @@ def test_parse_tool_calls_from_content_keeps_legacy_parsers_compatible(): enable_auto_tools=True, tool_parser_cls=LegacyToolParser, content="noop()", - chat_template_kwargs={"tool_format": "python"}, + chat_template_kwargs={"tool_call_format": "python"}, ) assert content is None diff --git a/tests/entrypoints/openai/tool_parsers/test_multi_format_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_multi_format_tool_parser.py index a67215651389..83ac97615db2 100644 --- a/tests/entrypoints/openai/tool_parsers/test_multi_format_tool_parser.py +++ b/tests/entrypoints/openai/tool_parsers/test_multi_format_tool_parser.py @@ -32,10 +32,10 @@ def decode(self, token_ids): return "".join(reverse_vocab[token_id] for token_id in token_ids) -def make_parser(tool_format: str) -> ToolParser: +def make_parser(tool_call_format: str) -> ToolParser: return ToolParserManager.get_tool_parser("multi_format")( FakeTokenizer(), - chat_template_kwargs={"tool_format": tool_format}, + chat_template_kwargs={"tool_call_format": tool_call_format}, ) @@ -78,7 +78,7 @@ def make_schema_request() -> ChatCompletionRequest: ) -def test_missing_tool_format_defaults_to_xml(): +def test_missing_tool_call_format_defaults_to_xml(): parser = make_parser_with_kwargs({}) extracted = run_tool_extraction_nonstreaming( @@ -220,19 +220,25 @@ def test_ifm_xml_typed_format_uses_arg_type_without_schema(): @pytest.mark.parametrize( - "tool_format", + "tool_call_format", ["default", "typed_xml", "XML", "xllm_typed", "xml ", ""], ) -def test_tool_format_requires_exact_supported_value(tool_format: str): +def test_tool_call_format_requires_exact_supported_value(tool_call_format: str): with pytest.raises(ValueError, match="Use one of these exact values"): - make_parser_with_kwargs({"tool_call_format": tool_format}) + make_parser_with_kwargs({"tool_call_format": tool_call_format}) -def test_tool_format_must_be_a_string(): +def test_tool_call_format_must_be_a_string(): with pytest.raises(ValueError, match="must be a string"): make_parser_with_kwargs({"tool_call_format": 123}) +@pytest.mark.parametrize("arg_name", ["tool_format", "tool_calling_format"]) +def test_legacy_tool_format_arguments_are_rejected(arg_name: str): + with pytest.raises(ValueError, match=f"Unsupported argument: {arg_name}"): + make_parser_with_kwargs({arg_name: "xml"}) + + def test_k2_v3_parser_alias_uses_ifm_formats(): parser = ToolParserManager.get_tool_parser("k2_v3")( FakeTokenizer(), @@ -255,6 +261,52 @@ def test_k2_v3_parser_alias_uses_ifm_formats(): } +def test_k2_v3_parser_strips_0518_ifm_reasoning_prefix(): + parser = ToolParserManager.get_tool_parser("k2_v3")( + FakeTokenizer(), + chat_template_kwargs={"tool_call_format": "xml"}, + ) + + extracted = run_tool_extraction_nonstreaming( + parser, + "need lookup\n" + "\n" + "get_weather" + "city" + "Tokyo" + "\n" + "", + make_request(), + ) + + assert extracted.tools_called + assert extracted.content is None + assert extracted.tool_calls[0].function.name == "get_weather" + assert json.loads(extracted.tool_calls[0].function.arguments) == {"city": "Tokyo"} + + +def test_k2_v3_parser_does_not_strip_legacy_reasoning_prefix(): + parser = ToolParserManager.get_tool_parser("k2_v3")( + FakeTokenizer(), + chat_template_kwargs={"tool_call_format": "xml"}, + ) + + extracted = run_tool_extraction_nonstreaming( + parser, + "legacy reasoning\n" + "get_weather" + "city" + "Tokyo" + "", + make_request(), + ) + + assert extracted.tools_called + assert extracted.content == "legacy reasoning\n" + assert extracted.tool_calls[0].function.name == "get_weather" + assert json.loads(extracted.tool_calls[0].function.arguments) == {"city": "Tokyo"} + + def test_minimax_format_extracts_inline_invokes(): parser = make_parser("minimax") diff --git a/tests/reasoning/test_k2_v3_reasoning_parser.py b/tests/reasoning/test_k2_v3_reasoning_parser.py index 9ffa66d8b6f7..6a02f56b3a57 100644 --- a/tests/reasoning/test_k2_v3_reasoning_parser.py +++ b/tests/reasoning/test_k2_v3_reasoning_parser.py @@ -9,9 +9,9 @@ PARSER_NAME = "k2_v3" EFFORT_TOKENS = { - "high": ("", ""), - "medium": ("", ""), - "low": ("", ""), + "high": ("", ""), + "medium": ("", ""), + "low": ("", ""), } @@ -190,21 +190,21 @@ def test_reasoning( def test_default_effort_is_high(k2_v3_tokenizer): - """Parser with no reasoning_effort should use /<\/think>.""" + """Parser with no reasoning_effort should use /.""" parser = ReasoningParserManager.get_reasoning_parser(PARSER_NAME)(k2_v3_tokenizer) - assert parser.start_token == "" - assert parser.end_token == "" + assert parser.start_token == "" + assert parser.end_token == "" def test_none_effort_falls_back_to_high(k2_v3_tokenizer): """reasoning_effort='none' should fall back to high tokens.""" parser = _make_parser(k2_v3_tokenizer, "none") - assert parser.start_token == "" - assert parser.end_token == "" + assert parser.start_token == "" + assert parser.end_token == "" def test_unknown_effort_falls_back_to_high(k2_v3_tokenizer): """Unknown effort value should fall back to high tokens.""" parser = _make_parser(k2_v3_tokenizer, "ultra") - assert parser.start_token == "" - assert parser.end_token == "" + assert parser.start_token == "" + assert parser.end_token == "" diff --git a/vllm/entrypoints/openai/tool_parsers/multi_format_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/multi_format_tool_parser.py index 30b476315c78..e31a8b614183 100644 --- a/vllm/entrypoints/openai/tool_parsers/multi_format_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/multi_format_tool_parser.py @@ -78,6 +78,14 @@ class MultiFormatToolParser(ToolParser): r"(.*?)", re.DOTALL, ) + _IFM_REASONING_PREFIX_REGEX = re.compile( + r"\A\s*(?:" + r".*?|" + r".*?|" + r".*?" + r")\s*", + re.DOTALL, + ) _GLM_BLOCK_REGEX = re.compile( r"(.*?)", @@ -96,11 +104,17 @@ def __init__( super().__init__(tokenizer) chat_template_kwargs = chat_template_kwargs or {} - raw_tool_format = "xml" - for key in ("tool_call_format", "tool_calling_format", "tool_format"): - if key in chat_template_kwargs and chat_template_kwargs[key] is not None: - raw_tool_format = chat_template_kwargs[key] - break + if "tool_calling_format" in chat_template_kwargs: + raise ValueError( + "Unsupported argument: tool_calling_format. " + "Use tool_call_format with one of: json, xml, xml_typed." + ) + if "tool_format" in chat_template_kwargs: + raise ValueError( + "Unsupported argument: tool_format. " + "Use tool_call_format with one of: json, xml, xml_typed." + ) + raw_tool_format = chat_template_kwargs.get("tool_call_format", "xml") self.tool_format = self._validate_tool_format(raw_tool_format) self._delegate: ToolParser | None = None @@ -115,13 +129,13 @@ def __init__( def _validate_tool_format(cls, tool_format: Any) -> str: if not isinstance(tool_format, str): raise ValueError( - "tool_format/tool_call_format must be a string. " + "tool_call_format must be a string. " f"Got {type(tool_format).__name__}." ) if tool_format not in cls._SUPPORTED_TOOL_FORMATS: supported_formats = ", ".join(sorted(cls._SUPPORTED_TOOL_FORMATS)) raise ValueError( - f"Unsupported tool_format/tool_call_format '{tool_format}'. " + f"Unsupported tool_call_format '{tool_format}'. " "Use one of these exact values: " f"{supported_formats}." ) @@ -159,7 +173,7 @@ def extract_tool_calls( return self._extract_python_tool_calls(model_output) except Exception: logger.exception( - "Error extracting tool calls for tool_format=%s.", + "Error extracting tool calls for tool_call_format=%s.", self.tool_format, ) @@ -200,11 +214,21 @@ def _json_or_string(value: str) -> Any: except json.JSONDecodeError: return value - @staticmethod - def _prefix_content(model_output: str, first_tool_index: int | None) -> str | None: + @classmethod + def _strip_ifm_reasoning_prefix(cls, content: str) -> str: + while match := cls._IFM_REASONING_PREFIX_REGEX.match(content): + content = content[match.end() :] + return content + + @classmethod + def _prefix_content( + cls, + model_output: str, + first_tool_index: int | None, + ) -> str | None: if first_tool_index is None or first_tool_index <= 0: return None - content = model_output[:first_tool_index] + content = cls._strip_ifm_reasoning_prefix(model_output[:first_tool_index]) return content if content.strip() else None @staticmethod @@ -690,4 +714,4 @@ def _extract_python_tool_calls( class K2V3ToolParser(MultiFormatToolParser): - """K2-V3 alias for the IFM-aware multi-format parser.""" + """K2-V3 parser for BBQ 0518 IFM tool-call and reasoning tokens.""" diff --git a/vllm/reasoning/k2_v3_reasoning_parser.py b/vllm/reasoning/k2_v3_reasoning_parser.py index de70bd5a7f30..e1bb23f49e5b 100644 --- a/vllm/reasoning/k2_v3_reasoning_parser.py +++ b/vllm/reasoning/k2_v3_reasoning_parser.py @@ -10,10 +10,10 @@ class K2V3ReasoningParser(DeepSeekR1ReasoningParser): Reasoning parser for the K2-v3 model family. K2-v3 supports three reasoning effort levels, each using different - think tokens: - - high (default): / - - medium: / - - low: / + IFM think tokens: + - high (default): / + - medium: / + - low: / The effort level is selected via the ``reasoning_effort`` parameter in ``chat_template_kwargs``. The chat template inserts the start @@ -22,9 +22,9 @@ class K2V3ReasoningParser(DeepSeekR1ReasoningParser): """ _EFFORT_TOKENS: dict[str, tuple[str, str]] = { - "high": ("", ""), - "medium": ("", ""), - "low": ("", ""), + "high": ("", ""), + "medium": ("", ""), + "low": ("", ""), } def __init__(self, tokenizer: TokenizerLike, *args, **kwargs):