LLM360 · hanseungwook · May 21, 2026
diff --git a/docs/features/k2_v3.md b/docs/features/k2_v3.md
@@ -1,7 +1,7 @@
 # K2-V3 Reasoning and Tool Calling
 
 This page documents how to use the LLM360 `k2_v3` reasoning parser and the
-multi-format tool calling parser from the LLM360 `v0.12.0-ifm` branch.
+multi-format `k2_v3` tool-calling parser from the LLM360 `v0.12.0-ifm-0518` branch.
 
 !!! note
     This workflow is specific to the LLM360 fork and branch. It is not the
@@ -14,7 +14,7 @@ Use the LLM360 fork for this setup:
 ```bash
 git clone https://github.com/LLM360/vllm.git
 cd vllm
-git switch v0.12.0-ifm
+git switch v0.12.0-ifm-0518
 VLLM_USE_PRECOMPILED=1 uv pip install --editable .
 ```
 
@@ -30,26 +30,27 @@ vllm serve YOUR_K2_V3_MODEL \
     --port 8000 \
     --reasoning-parser k2_v3 \
     --enable-auto-tool-choice \
-    --tool-call-parser multi_format
+    --tool-call-parser k2_v3
 ```
 
 ## Required Settings
 
 Use these settings to enable the K2 reasoning parser and tool calling flow:
 
 - `--reasoning-parser k2_v3`
-- `--tool-call-parser multi_format`
+- `--tool-call-parser k2_v3`
 - `--enable-auto-tool-choice`
 
 At request time, pass the chat template kwargs through `extra_body`:
 
-- `reasoning_effort`: optional, defaults to `high` (`<think> ... </think>`)
-- `tool_format`: selects the tool-call output format used by the parser
+- `reasoning_effort`: optional, defaults to `high`
+  (`<ifm|think> ... </ifm|think>`)
+- `tool_call_format`: selects the tool-call output format used by the parser (parser default: `xml`)
 
 ## OpenAI-Compatible Client Example
 
 The example below uses `YOUR_K2_V3_MODEL` with `tool_choice="auto"` and
-sets both `tool_format` and `reasoning_effort` in `chat_template_kwargs`.
+sets both `tool_call_format` and `reasoning_effort` in `chat_template_kwargs`.
 
 ```python
 from openai import OpenAI
@@ -84,7 +85,7 @@ resp = client.chat.completions.create(
     tool_choice="auto",
     extra_body={
         "chat_template_kwargs": {
-            "tool_format": "default",
+            "tool_call_format": "xml",
             "reasoning_effort": "high",
         }
     },
@@ -98,9 +99,9 @@ print("tool_calls:", message.tool_calls)
 
 ## Summary
 
-For the LLM360 `v0.12.0-ifm` branch:
+For the LLM360 `v0.12.0-ifm-0518` branch:
 
 - use `k2_v3` as the reasoning parser
-- use `multi_format` as the tool-call parser
-- use `high` / `<think>` as the default reasoning effort unless you override it
-- set `tool_format` in request-time `chat_template_kwargs`
+- use `k2_v3` as the tool-call parser
+- use `high` / `<ifm|think>` as the default reasoning effort unless you override it
+- set `tool_call_format` in request-time `chat_template_kwargs`
diff --git a/tests/entrypoints/openai/test_tool_parser_kwargs.py b/tests/entrypoints/openai/test_tool_parser_kwargs.py
@@ -59,15 +59,15 @@ def _xgrammar_getattr(name: str):
 class KwargAwareToolParser(ToolParser):
     def __init__(self, tokenizer, chat_template_kwargs=None):
         super().__init__(tokenizer)
-        self.tool_format = (chat_template_kwargs or {}).get("tool_format")
+        self.tool_call_format = (chat_template_kwargs or {}).get("tool_call_format")
 
     def extract_tool_calls(self, model_output, request):
         return ExtractedToolCallInformation(
             tools_called=True,
             tool_calls=[
                 ToolCall(
                     function=FunctionCall(
-                        name=self.tool_format or "missing",
+                        name=self.tool_call_format or "missing",
                         arguments="{}",
                     )
                 )
@@ -148,7 +148,7 @@ def test_parse_tool_calls_from_content_passes_chat_template_kwargs():
         enable_auto_tools=True,
         tool_parser_cls=KwargAwareToolParser,
         content="<function_calls>noop()</function_calls>",
-        chat_template_kwargs={"tool_format": "python"},
+        chat_template_kwargs={"tool_call_format": "python"},
     )
 
     assert content is None
@@ -167,7 +167,7 @@ def test_parse_tool_calls_from_content_keeps_legacy_parsers_compatible():
         enable_auto_tools=True,
         tool_parser_cls=LegacyToolParser,
         content="<function_calls>noop()</function_calls>",
-        chat_template_kwargs={"tool_format": "python"},
+        chat_template_kwargs={"tool_call_format": "python"},
     )
 
     assert content is None

diff --git a/tests/entrypoints/openai/tool_parsers/test_multi_format_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_multi_format_tool_parser.py
@@ -32,10 +32,10 @@ def decode(self, token_ids):
         return "".join(reverse_vocab[token_id] for token_id in token_ids)
 
 
-def make_parser(tool_format: str) -> ToolParser:
+def make_parser(tool_call_format: str) -> ToolParser:
     return ToolParserManager.get_tool_parser("multi_format")(
         FakeTokenizer(),
-        chat_template_kwargs={"tool_format": tool_format},
+        chat_template_kwargs={"tool_call_format": tool_call_format},
     )
 
 
@@ -78,7 +78,7 @@ def make_schema_request() -> ChatCompletionRequest:
     )
 
 
-def test_missing_tool_format_defaults_to_xml():
+def test_missing_tool_call_format_defaults_to_xml():
     parser = make_parser_with_kwargs({})
 
     extracted = run_tool_extraction_nonstreaming(
@@ -220,19 +220,25 @@ def test_ifm_xml_typed_format_uses_arg_type_without_schema():
 
 
 @pytest.mark.parametrize(
-    "tool_format",
+    "tool_call_format",
     ["default", "typed_xml", "XML", "xllm_typed", "xml ", ""],
 )
-def test_tool_format_requires_exact_supported_value(tool_format: str):
+def test_tool_call_format_requires_exact_supported_value(tool_call_format: str):
     with pytest.raises(ValueError, match="Use one of these exact values"):
-        make_parser_with_kwargs({"tool_call_format": tool_format})
+        make_parser_with_kwargs({"tool_call_format": tool_call_format})
 
 
-def test_tool_format_must_be_a_string():
+def test_tool_call_format_must_be_a_string():
     with pytest.raises(ValueError, match="must be a string"):
         make_parser_with_kwargs({"tool_call_format": 123})
 
 
+@pytest.mark.parametrize("arg_name", ["tool_format", "tool_calling_format"])
+def test_legacy_tool_format_arguments_are_rejected(arg_name: str):
+    with pytest.raises(ValueError, match=f"Unsupported argument: {arg_name}"):
+        make_parser_with_kwargs({arg_name: "xml"})
+
+
 def test_k2_v3_parser_alias_uses_ifm_formats():
     parser = ToolParserManager.get_tool_parser("k2_v3")(
         FakeTokenizer(),
@@ -255,6 +261,52 @@ def test_k2_v3_parser_alias_uses_ifm_formats():
     }
 
 
+def test_k2_v3_parser_strips_0518_ifm_reasoning_prefix():
+    parser = ToolParserManager.get_tool_parser("k2_v3")(
+        FakeTokenizer(),
+        chat_template_kwargs={"tool_call_format": "xml"},
+    )
+
+    extracted = run_tool_extraction_nonstreaming(
+        parser,
+        "<ifm|think>need lookup</ifm|think>\n"
+        "<ifm|tool_calls>\n"
+        "<ifm|tool_call>get_weather"
+        "<ifm|arg_key>city</ifm|arg_key>"
+        "<ifm|arg_value>Tokyo</ifm|arg_value>"
+        "</ifm|tool_call>\n"
+        "</ifm|tool_calls>",
+        make_request(),
+    )
+
+    assert extracted.tools_called
+    assert extracted.content is None
+    assert extracted.tool_calls[0].function.name == "get_weather"
+    assert json.loads(extracted.tool_calls[0].function.arguments) == {"city": "Tokyo"}
+
+
+def test_k2_v3_parser_does_not_strip_legacy_reasoning_prefix():
+    parser = ToolParserManager.get_tool_parser("k2_v3")(
+        FakeTokenizer(),
+        chat_template_kwargs={"tool_call_format": "xml"},
+    )
+
+    extracted = run_tool_extraction_nonstreaming(
+        parser,
+        "<think>legacy reasoning</think>\n"
+        "<ifm|tool_call>get_weather"
+        "<ifm|arg_key>city</ifm|arg_key>"
+        "<ifm|arg_value>Tokyo</ifm|arg_value>"
+        "</ifm|tool_call>",
+        make_request(),
+    )
+
+    assert extracted.tools_called
+    assert extracted.content == "<think>legacy reasoning</think>\n"
+    assert extracted.tool_calls[0].function.name == "get_weather"
+    assert json.loads(extracted.tool_calls[0].function.arguments) == {"city": "Tokyo"}
+
+
 def test_minimax_format_extracts_inline_invokes():
     parser = make_parser("minimax")
 

diff --git a/tests/reasoning/test_k2_v3_reasoning_parser.py b/tests/reasoning/test_k2_v3_reasoning_parser.py
@@ -9,9 +9,9 @@
 PARSER_NAME = "k2_v3"
 
 EFFORT_TOKENS = {
-    "high": ("<think>", "</think>"),
-    "medium": ("<think_fast>", "</think_fast>"),
-    "low": ("<think_faster>", "</think_faster>"),
+    "high": ("<ifm|think>", "</ifm|think>"),
+    "medium": ("<ifm|think_fast>", "</ifm|think_fast>"),
+    "low": ("<ifm|think_faster>", "</ifm|think_faster>"),
 }
 
 
@@ -190,21 +190,21 @@ def test_reasoning(
 
 
 def test_default_effort_is_high(k2_v3_tokenizer):
-    """Parser with no reasoning_effort should use <think>/<\/think>."""
+    """Parser with no reasoning_effort should use <ifm|think>/</ifm|think>."""
     parser = ReasoningParserManager.get_reasoning_parser(PARSER_NAME)(k2_v3_tokenizer)
-    assert parser.start_token == "<think>"
-    assert parser.end_token == "</think>"
+    assert parser.start_token == "<ifm|think>"
+    assert parser.end_token == "</ifm|think>"
 
 
 def test_none_effort_falls_back_to_high(k2_v3_tokenizer):
     """reasoning_effort='none' should fall back to high tokens."""
     parser = _make_parser(k2_v3_tokenizer, "none")
-    assert parser.start_token == "<think>"
-    assert parser.end_token == "</think>"
+    assert parser.start_token == "<ifm|think>"
+    assert parser.end_token == "</ifm|think>"
 
 
 def test_unknown_effort_falls_back_to_high(k2_v3_tokenizer):
     """Unknown effort value should fall back to high tokens."""
     parser = _make_parser(k2_v3_tokenizer, "ultra")
-    assert parser.start_token == "<think>"
-    assert parser.end_token == "</think>"
+    assert parser.start_token == "<ifm|think>"
+    assert parser.end_token == "</ifm|think>"
diff --git a/vllm/entrypoints/openai/tool_parsers/multi_format_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/multi_format_tool_parser.py
@@ -78,6 +78,14 @@ class MultiFormatToolParser(ToolParser):
         r"<ifm\|arg_value>(.*?)</ifm\|arg_value>",
         re.DOTALL,
     )
+    _IFM_REASONING_PREFIX_REGEX = re.compile(
+        r"\A\s*(?:"
+        r"<ifm\|think>.*?</ifm\|think>|"
+        r"<ifm\|think_fast>.*?</ifm\|think_fast>|"
+        r"<ifm\|think_faster>.*?</ifm\|think_faster>"
+        r")\s*",
+        re.DOTALL,
+    )
 
     _GLM_BLOCK_REGEX = re.compile(
         r"<tool_call>(.*?)</tool_call>",
@@ -96,11 +104,17 @@ def __init__(
         super().__init__(tokenizer)
 
         chat_template_kwargs = chat_template_kwargs or {}
-        raw_tool_format = "xml"
-        for key in ("tool_call_format", "tool_calling_format", "tool_format"):
-            if key in chat_template_kwargs and chat_template_kwargs[key] is not None:
-                raw_tool_format = chat_template_kwargs[key]
-                break
+        if "tool_calling_format" in chat_template_kwargs:
+            raise ValueError(
+                "Unsupported argument: tool_calling_format. "
+                "Use tool_call_format with one of: json, xml, xml_typed."
+            )
+        if "tool_format" in chat_template_kwargs:
+            raise ValueError(
+                "Unsupported argument: tool_format. "
+                "Use tool_call_format with one of: json, xml, xml_typed."
+            )
+        raw_tool_format = chat_template_kwargs.get("tool_call_format", "xml")
         self.tool_format = self._validate_tool_format(raw_tool_format)
         self._delegate: ToolParser | None = None
 
@@ -115,13 +129,13 @@ def __init__(
     def _validate_tool_format(cls, tool_format: Any) -> str:
         if not isinstance(tool_format, str):
             raise ValueError(
-                "tool_format/tool_call_format must be a string. "
+                "tool_call_format must be a string. "
                 f"Got {type(tool_format).__name__}."
             )
         if tool_format not in cls._SUPPORTED_TOOL_FORMATS:
             supported_formats = ", ".join(sorted(cls._SUPPORTED_TOOL_FORMATS))
             raise ValueError(
-                f"Unsupported tool_format/tool_call_format '{tool_format}'. "
+                f"Unsupported tool_call_format '{tool_format}'. "
                 "Use one of these exact values: "
                 f"{supported_formats}."
             )
@@ -159,7 +173,7 @@ def extract_tool_calls(
                 return self._extract_python_tool_calls(model_output)
         except Exception:
             logger.exception(
-                "Error extracting tool calls for tool_format=%s.",
+                "Error extracting tool calls for tool_call_format=%s.",
                 self.tool_format,
             )
 
@@ -200,11 +214,21 @@ def _json_or_string(value: str) -> Any:
         except json.JSONDecodeError:
             return value
 
-    @staticmethod
-    def _prefix_content(model_output: str, first_tool_index: int | None) -> str | None:
+    @classmethod
+    def _strip_ifm_reasoning_prefix(cls, content: str) -> str:
+        while match := cls._IFM_REASONING_PREFIX_REGEX.match(content):
+            content = content[match.end() :]
+        return content
+
+    @classmethod
+    def _prefix_content(
+        cls,
+        model_output: str,
+        first_tool_index: int | None,
+    ) -> str | None:
         if first_tool_index is None or first_tool_index <= 0:
             return None
-        content = model_output[:first_tool_index]
+        content = cls._strip_ifm_reasoning_prefix(model_output[:first_tool_index])
         return content if content.strip() else None
 
     @staticmethod
@@ -690,4 +714,4 @@ def _extract_python_tool_calls(
 
 
 class K2V3ToolParser(MultiFormatToolParser):
-    """K2-V3 alias for the IFM-aware multi-format parser."""
+    """K2-V3 parser for BBQ 0518 IFM tool-call and reasoning tokens."""
diff --git a/vllm/reasoning/k2_v3_reasoning_parser.py b/vllm/reasoning/k2_v3_reasoning_parser.py
@@ -10,10 +10,10 @@ class K2V3ReasoningParser(DeepSeekR1ReasoningParser):
     Reasoning parser for the K2-v3 model family.
 
     K2-v3 supports three reasoning effort levels, each using different
-    think tokens:
-      - high (default): <think> / </think>
-      - medium:         <think_fast> / </think_fast>
-      - low:            <think_faster> / </think_faster>
+    IFM think tokens:
+      - high (default): <ifm|think> / </ifm|think>
+      - medium:         <ifm|think_fast> / </ifm|think_fast>
+      - low:            <ifm|think_faster> / </ifm|think_faster>
 
     The effort level is selected via the ``reasoning_effort`` parameter
     in ``chat_template_kwargs``.  The chat template inserts the start
@@ -22,9 +22,9 @@ class K2V3ReasoningParser(DeepSeekR1ReasoningParser):
     """
 
     _EFFORT_TOKENS: dict[str, tuple[str, str]] = {
-        "high": ("<think>", "</think>"),
-        "medium": ("<think_fast>", "</think_fast>"),
-        "low": ("<think_faster>", "</think_faster>"),
+        "high": ("<ifm|think>", "</ifm|think>"),
+        "medium": ("<ifm|think_fast>", "</ifm|think_fast>"),
+        "low": ("<ifm|think_faster>", "</ifm|think_faster>"),
     }
 
     def __init__(self, tokenizer: TokenizerLike, *args, **kwargs):