Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 13 additions & 12 deletions docs/features/k2_v3.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# K2-V3 Reasoning and Tool Calling

This page documents how to use the LLM360 `k2_v3` reasoning parser and the
multi-format tool calling parser from the LLM360 `v0.12.0-ifm` branch.
multi-format tool calling parser from the LLM360 `v0.12.0-ifm-0518` branch.

!!! note
This workflow is specific to the LLM360 fork and branch. It is not the
Expand All @@ -14,7 +14,7 @@ Use the LLM360 fork for this setup:
```bash
git clone https://github.com/LLM360/vllm.git
cd vllm
git switch v0.12.0-ifm
git switch v0.12.0-ifm-0518
VLLM_USE_PRECOMPILED=1 uv pip install --editable .
```

Expand All @@ -30,26 +30,27 @@ vllm serve YOUR_K2_V3_MODEL \
--port 8000 \
--reasoning-parser k2_v3 \
--enable-auto-tool-choice \
--tool-call-parser multi_format
--tool-call-parser k2_v3
```

## Required Settings

Use these settings to enable the K2 reasoning parser and tool calling flow:

- `--reasoning-parser k2_v3`
- `--tool-call-parser multi_format`
- `--tool-call-parser k2_v3`
- `--enable-auto-tool-choice`

At request time, pass the chat template kwargs through `extra_body`:

- `reasoning_effort`: optional, defaults to `high` (`<think> ... </think>`)
- `tool_format`: selects the tool-call output format used by the parser
- `reasoning_effort`: optional, defaults to `high`
(`<ifm|think> ... </ifm|think>`)
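- `tool_call_format`: selects the tool-call output format used by the parser;
  the parser falls back to `xml` when it is omitted and rejects the legacy
  `tool_format` / `tool_calling_format` keys with an error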

## OpenAI-Compatible Client Example

The example below uses `YOUR_K2_V3_MODEL` with `tool_choice="auto"` and
sets both `tool_format` and `reasoning_effort` in `chat_template_kwargs`.
sets both `tool_call_format` and `reasoning_effort` in `chat_template_kwargs`.

```python
from openai import OpenAI
Expand Down Expand Up @@ -84,7 +85,7 @@ resp = client.chat.completions.create(
tool_choice="auto",
extra_body={
"chat_template_kwargs": {
"tool_format": "default",
"tool_call_format": "xml",
"reasoning_effort": "high",
}
},
Expand All @@ -98,9 +99,9 @@ print("tool_calls:", message.tool_calls)

## Summary

For the LLM360 `v0.12.0-ifm` branch:
For the LLM360 `v0.12.0-ifm-0518` branch:

- use `k2_v3` as the reasoning parser
- use `multi_format` as the tool-call parser
- use `high` / `<think>` as the default reasoning effort unless you override it
- set `tool_format` in request-time `chat_template_kwargs`
- use `k2_v3` as the tool-call parser
- use `high` / `<ifm|think>` as the default reasoning effort unless you override it
- set `tool_call_format` in request-time `chat_template_kwargs`
8 changes: 4 additions & 4 deletions tests/entrypoints/openai/test_tool_parser_kwargs.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,15 +59,15 @@ def _xgrammar_getattr(name: str):
class KwargAwareToolParser(ToolParser):
def __init__(self, tokenizer, chat_template_kwargs=None):
super().__init__(tokenizer)
self.tool_format = (chat_template_kwargs or {}).get("tool_format")
self.tool_call_format = (chat_template_kwargs or {}).get("tool_call_format")

def extract_tool_calls(self, model_output, request):
return ExtractedToolCallInformation(
tools_called=True,
tool_calls=[
ToolCall(
function=FunctionCall(
name=self.tool_format or "missing",
name=self.tool_call_format or "missing",
arguments="{}",
)
)
Expand Down Expand Up @@ -148,7 +148,7 @@ def test_parse_tool_calls_from_content_passes_chat_template_kwargs():
enable_auto_tools=True,
tool_parser_cls=KwargAwareToolParser,
content="<function_calls>noop()</function_calls>",
chat_template_kwargs={"tool_format": "python"},
chat_template_kwargs={"tool_call_format": "python"},
)

assert content is None
Expand All @@ -167,7 +167,7 @@ def test_parse_tool_calls_from_content_keeps_legacy_parsers_compatible():
enable_auto_tools=True,
tool_parser_cls=LegacyToolParser,
content="<function_calls>noop()</function_calls>",
chat_template_kwargs={"tool_format": "python"},
chat_template_kwargs={"tool_call_format": "python"},
)

assert content is None
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,10 @@ def decode(self, token_ids):
return "".join(reverse_vocab[token_id] for token_id in token_ids)


def make_parser(tool_format: str) -> ToolParser:
def make_parser(tool_call_format: str) -> ToolParser:
return ToolParserManager.get_tool_parser("multi_format")(
FakeTokenizer(),
chat_template_kwargs={"tool_format": tool_format},
chat_template_kwargs={"tool_call_format": tool_call_format},
)


Expand Down Expand Up @@ -78,7 +78,7 @@ def make_schema_request() -> ChatCompletionRequest:
)


def test_missing_tool_format_defaults_to_xml():
def test_missing_tool_call_format_defaults_to_xml():
parser = make_parser_with_kwargs({})

extracted = run_tool_extraction_nonstreaming(
Expand Down Expand Up @@ -220,19 +220,25 @@ def test_ifm_xml_typed_format_uses_arg_type_without_schema():


@pytest.mark.parametrize(
"tool_format",
"tool_call_format",
["default", "typed_xml", "XML", "xllm_typed", "xml ", ""],
)
def test_tool_format_requires_exact_supported_value(tool_format: str):
def test_tool_call_format_requires_exact_supported_value(tool_call_format: str):
with pytest.raises(ValueError, match="Use one of these exact values"):
make_parser_with_kwargs({"tool_call_format": tool_format})
make_parser_with_kwargs({"tool_call_format": tool_call_format})


def test_tool_format_must_be_a_string():
def test_tool_call_format_must_be_a_string():
with pytest.raises(ValueError, match="must be a string"):
make_parser_with_kwargs({"tool_call_format": 123})


@pytest.mark.parametrize("arg_name", ["tool_format", "tool_calling_format"])
def test_legacy_tool_format_arguments_are_rejected(arg_name: str):
    """Legacy kwarg spellings must raise ``ValueError`` naming the bad key."""
    legacy_kwargs = {arg_name: "xml"}
    expected_error = f"Unsupported argument: {arg_name}"

    with pytest.raises(ValueError, match=expected_error):
        make_parser_with_kwargs(legacy_kwargs)


def test_k2_v3_parser_alias_uses_ifm_formats():
parser = ToolParserManager.get_tool_parser("k2_v3")(
FakeTokenizer(),
Expand All @@ -255,6 +261,52 @@ def test_k2_v3_parser_alias_uses_ifm_formats():
}


def test_k2_v3_parser_strips_0518_ifm_reasoning_prefix():
    """A leading ``<ifm|think>`` block must not be reported as content."""
    parser_cls = ToolParserManager.get_tool_parser("k2_v3")
    parser = parser_cls(
        FakeTokenizer(),
        chat_template_kwargs={"tool_call_format": "xml"},
    )
    # Reasoning block first, then a wrapped IFM tool call with one argument.
    model_output = (
        "<ifm|think>need lookup</ifm|think>\n"
        "<ifm|tool_calls>\n"
        "<ifm|tool_call>get_weather"
        "<ifm|arg_key>city</ifm|arg_key>"
        "<ifm|arg_value>Tokyo</ifm|arg_value>"
        "</ifm|tool_call>\n"
        "</ifm|tool_calls>"
    )

    extracted = run_tool_extraction_nonstreaming(
        parser,
        model_output,
        make_request(),
    )

    assert extracted.tools_called
    assert extracted.content is None
    first_call = extracted.tool_calls[0].function
    assert first_call.name == "get_weather"
    assert json.loads(first_call.arguments) == {"city": "Tokyo"}


def test_k2_v3_parser_does_not_strip_legacy_reasoning_prefix():
    """A legacy ``<think>`` prefix is kept verbatim as content."""
    parser_cls = ToolParserManager.get_tool_parser("k2_v3")
    parser = parser_cls(
        FakeTokenizer(),
        chat_template_kwargs={"tool_call_format": "xml"},
    )
    # Legacy (non-IFM) reasoning tags followed by a bare IFM tool call.
    model_output = (
        "<think>legacy reasoning</think>\n"
        "<ifm|tool_call>get_weather"
        "<ifm|arg_key>city</ifm|arg_key>"
        "<ifm|arg_value>Tokyo</ifm|arg_value>"
        "</ifm|tool_call>"
    )

    extracted = run_tool_extraction_nonstreaming(
        parser,
        model_output,
        make_request(),
    )

    assert extracted.tools_called
    assert extracted.content == "<think>legacy reasoning</think>\n"
    first_call = extracted.tool_calls[0].function
    assert first_call.name == "get_weather"
    assert json.loads(first_call.arguments) == {"city": "Tokyo"}


def test_minimax_format_extracts_inline_invokes():
parser = make_parser("minimax")

Expand Down
20 changes: 10 additions & 10 deletions tests/reasoning/test_k2_v3_reasoning_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@
PARSER_NAME = "k2_v3"

EFFORT_TOKENS = {
"high": ("<think>", "</think>"),
"medium": ("<think_fast>", "</think_fast>"),
"low": ("<think_faster>", "</think_faster>"),
"high": ("<ifm|think>", "</ifm|think>"),
"medium": ("<ifm|think_fast>", "</ifm|think_fast>"),
"low": ("<ifm|think_faster>", "</ifm|think_faster>"),
}


Expand Down Expand Up @@ -190,21 +190,21 @@ def test_reasoning(


def test_default_effort_is_high(k2_v3_tokenizer):
"""Parser with no reasoning_effort should use <think>/<\/think>."""
"""Parser with no reasoning_effort should use <ifm|think>/</ifm|think>."""
parser = ReasoningParserManager.get_reasoning_parser(PARSER_NAME)(k2_v3_tokenizer)
assert parser.start_token == "<think>"
assert parser.end_token == "</think>"
assert parser.start_token == "<ifm|think>"
assert parser.end_token == "</ifm|think>"


def test_none_effort_falls_back_to_high(k2_v3_tokenizer):
"""reasoning_effort='none' should fall back to high tokens."""
parser = _make_parser(k2_v3_tokenizer, "none")
assert parser.start_token == "<think>"
assert parser.end_token == "</think>"
assert parser.start_token == "<ifm|think>"
assert parser.end_token == "</ifm|think>"


def test_unknown_effort_falls_back_to_high(k2_v3_tokenizer):
"""Unknown effort value should fall back to high tokens."""
parser = _make_parser(k2_v3_tokenizer, "ultra")
assert parser.start_token == "<think>"
assert parser.end_token == "</think>"
assert parser.start_token == "<ifm|think>"
assert parser.end_token == "</ifm|think>"
48 changes: 36 additions & 12 deletions vllm/entrypoints/openai/tool_parsers/multi_format_tool_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,14 @@ class MultiFormatToolParser(ToolParser):
r"<ifm\|arg_value>(.*?)</ifm\|arg_value>",
re.DOTALL,
)
# Matches one complete IFM reasoning block (<ifm|think>, <ifm|think_fast> or
# <ifm|think_faster>, each with its matching closing tag) anchored at the very
# start of the string, together with any whitespace before and after it.
# re.DOTALL lets the lazy ``.*?`` body span newlines.
_IFM_REASONING_PREFIX_REGEX = re.compile(
r"\A\s*(?:"
r"<ifm\|think>.*?</ifm\|think>|"
r"<ifm\|think_fast>.*?</ifm\|think_fast>|"
r"<ifm\|think_faster>.*?</ifm\|think_faster>"
r")\s*",
re.DOTALL,
)

_GLM_BLOCK_REGEX = re.compile(
r"<tool_call>(.*?)</tool_call>",
Expand All @@ -96,11 +104,17 @@ def __init__(
super().__init__(tokenizer)

chat_template_kwargs = chat_template_kwargs or {}
raw_tool_format = "xml"
for key in ("tool_call_format", "tool_calling_format", "tool_format"):
if key in chat_template_kwargs and chat_template_kwargs[key] is not None:
raw_tool_format = chat_template_kwargs[key]
break
if "tool_calling_format" in chat_template_kwargs:
raise ValueError(
"Unsupported argument: tool_calling_format. "
"Use tool_call_format with one of: json, xml, xml_typed."
)
if "tool_format" in chat_template_kwargs:
raise ValueError(
"Unsupported argument: tool_format. "
"Use tool_call_format with one of: json, xml, xml_typed."
)
raw_tool_format = chat_template_kwargs.get("tool_call_format", "xml")
self.tool_format = self._validate_tool_format(raw_tool_format)
self._delegate: ToolParser | None = None

Expand All @@ -115,13 +129,13 @@ def __init__(
def _validate_tool_format(cls, tool_format: Any) -> str:
if not isinstance(tool_format, str):
raise ValueError(
"tool_format/tool_call_format must be a string. "
"tool_call_format must be a string. "
f"Got {type(tool_format).__name__}."
)
if tool_format not in cls._SUPPORTED_TOOL_FORMATS:
supported_formats = ", ".join(sorted(cls._SUPPORTED_TOOL_FORMATS))
raise ValueError(
f"Unsupported tool_format/tool_call_format '{tool_format}'. "
f"Unsupported tool_call_format '{tool_format}'. "
"Use one of these exact values: "
f"{supported_formats}."
)
Expand Down Expand Up @@ -159,7 +173,7 @@ def extract_tool_calls(
return self._extract_python_tool_calls(model_output)
except Exception:
logger.exception(
"Error extracting tool calls for tool_format=%s.",
"Error extracting tool calls for tool_call_format=%s.",
self.tool_format,
)

Expand Down Expand Up @@ -200,11 +214,21 @@ def _json_or_string(value: str) -> Any:
except json.JSONDecodeError:
return value

@staticmethod
def _prefix_content(model_output: str, first_tool_index: int | None) -> str | None:
@classmethod
def _strip_ifm_reasoning_prefix(cls, content: str) -> str:
    """Return ``content`` with every leading IFM reasoning block removed.

    ``_IFM_REASONING_PREFIX_REGEX`` is matched at the start of the text
    repeatedly; each matched prefix is dropped until it no longer matches.
    """
    prefix_pattern = cls._IFM_REASONING_PREFIX_REGEX
    remaining = content
    while True:
        hit = prefix_pattern.match(remaining)
        if not hit:
            return remaining
        remaining = remaining[hit.end() :]

@classmethod
def _prefix_content(
cls,
model_output: str,
first_tool_index: int | None,
) -> str | None:
if first_tool_index is None or first_tool_index <= 0:
return None
content = model_output[:first_tool_index]
content = cls._strip_ifm_reasoning_prefix(model_output[:first_tool_index])
return content if content.strip() else None

@staticmethod
Expand Down Expand Up @@ -690,4 +714,4 @@ def _extract_python_tool_calls(


class K2V3ToolParser(MultiFormatToolParser):
"""K2-V3 alias for the IFM-aware multi-format parser."""
"""K2-V3 parser for BBQ 0518 IFM tool-call and reasoning tokens."""
14 changes: 7 additions & 7 deletions vllm/reasoning/k2_v3_reasoning_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@ class K2V3ReasoningParser(DeepSeekR1ReasoningParser):
Reasoning parser for the K2-v3 model family.

K2-v3 supports three reasoning effort levels, each using different
think tokens:
- high (default): <think> / </think>
- medium: <think_fast> / </think_fast>
- low: <think_faster> / </think_faster>
IFM think tokens:
- high (default): <ifm|think> / </ifm|think>
- medium: <ifm|think_fast> / </ifm|think_fast>
- low: <ifm|think_faster> / </ifm|think_faster>

The effort level is selected via the ``reasoning_effort`` parameter
in ``chat_template_kwargs``. The chat template inserts the start
Expand All @@ -22,9 +22,9 @@ class K2V3ReasoningParser(DeepSeekR1ReasoningParser):
"""

_EFFORT_TOKENS: dict[str, tuple[str, str]] = {
"high": ("<think>", "</think>"),
"medium": ("<think_fast>", "</think_fast>"),
"low": ("<think_faster>", "</think_faster>"),
"high": ("<ifm|think>", "</ifm|think>"),
"medium": ("<ifm|think_fast>", "</ifm|think_fast>"),
"low": ("<ifm|think_faster>", "</ifm|think_faster>"),
}

def __init__(self, tokenizer: TokenizerLike, *args, **kwargs):
Expand Down