diff --git a/docs/llm/reference.md b/docs/llm/reference.md index 2cb6b3e901..ba34253340 100644 --- a/docs/llm/reference.md +++ b/docs/llm/reference.md @@ -396,7 +396,7 @@ There are several known limitations which are expected to be addressed in the co Some servable types introduce additional limitations: ### Stateful servable limitations -- `finish_reason` not supported (always set to `stop`), +- `finish_reason=length` is not supported (the only possible values are `stop` and `tool_calls`), - `logprobs` not supported, - sequential request processing (only one request is handled at a time), - only a single response can be returned. Parameter `n` is not supported. diff --git a/docs/model_server_rest_api_chat.md b/docs/model_server_rest_api_chat.md index 4ab8c1cded..472350d943 100644 --- a/docs/model_server_rest_api_chat.md +++ b/docs/model_server_rest_api_chat.md @@ -245,7 +245,7 @@ If any of those parameters is not specified and request is made to Prompt Lookup | choices.message.content | ✅ | ✅ | string | The contents of the message. | | choices.message.reasoning_content | ✅ | ❌ | string | If model supports reasoning and is deployed with appropriate response parser, the reasoning part of the output is stored in the field. | | choices.message.tool_calls | ✅ | ✅ | array | The tool calls generated by the model, such as function calls. | -| choices.finish_reason | ✅ | ✅ | string or null | The reason the model stopped generating tokens. This will be `stop` if the model hit a natural stop point or a provided stop sequence, `length` if the maximum number of tokens specified in the request was reached, or `null` when generation continues (streaming). | +| choices.finish_reason | ✅ | ✅ | string or null | The reason the model stopped generating tokens. 
This will be `stop` if the model hit a natural stop point or a provided stop sequence, `length` if the maximum number of tokens specified in the request was reached, `tool_calls` if stopped due to a tool call, or `null` when generation continues (streaming). | | choices.logprobs | ⚠️ | ✅ | object or null | Log probability information for the choice. **_In current version, only one logprob per token can be returned._** | | created | ✅ | ✅ | string | The Unix timestamp (in seconds) of when the chat completion was created. | | model | ✅ | ✅ | string | The model used for the chat completion. | diff --git a/docs/model_server_rest_api_completions.md b/docs/model_server_rest_api_completions.md index 7e05209f93..1a8ba9257d 100644 --- a/docs/model_server_rest_api_completions.md +++ b/docs/model_server_rest_api_completions.md @@ -124,7 +124,7 @@ Note that below parameters are valid only for prompt lookup pipeline. Add `"prom | choices | ✅ | ✅ | array | A list of chat completion choices. Can be more than one if `n` is greater than 1 (beam search or multinomial samplings). | | choices.index | ✅ | ✅ | integer | The index of the choice in the list of choices. | | choices.text | ✅ | ✅ | string | A chat completion text generated by the model. | -| choices.finish_reason | ✅ | ✅ | string or null | The reason the model stopped generating tokens. This will be `stop` if the model hit a natural stop point or a provided stop sequence, `length` if the maximum number of tokens specified in the request was reached, or `null` when generation continues (streaming). | +| choices.finish_reason | ✅ | ✅ | string or null | The reason the model stopped generating tokens. This will be `stop` if the model hit a natural stop point or a provided stop sequence, `length` if the maximum number of tokens specified in the request was reached, `tool_calls` if stopped due to a tool call, or `null` when generation continues (streaming). 
| | choices.logprobs | ⚠️ | ✅ | object or null | Log probability information for the choice. **_In current version, only one logprob per token can be returned _** | | created | ✅ | ✅ | string | The Unix timestamp (in seconds) of when the chat completion was created. | | model | ✅ | ✅ | string | The model used for the chat completion. | diff --git a/src/llm/apis/openai_completions.cpp b/src/llm/apis/openai_completions.cpp index ae2492abd7..acb0a76d04 100644 --- a/src/llm/apis/openai_completions.cpp +++ b/src/llm/apis/openai_completions.cpp @@ -848,6 +848,30 @@ void updateUsage(CompletionUsageStatistics& usage, const std::vector& g usage.completionTokens -= usage.promptTokens; } +static std::optional mapFinishReason(ov::genai::GenerationFinishReason finishReason, bool hasToolCalls) { + // GenerationFinishReason::TOOL_CALLS is not available in GenAI yet. + // Use feature detection based on presence of tool calls as a workaround until GenAI exposes TOOL_CALLS. + if (hasToolCalls && finishReason == ov::genai::GenerationFinishReason::STOP) { + return "tool_calls"; + } + switch (finishReason) { + case ov::genai::GenerationFinishReason::STOP: + return "stop"; + case ov::genai::GenerationFinishReason::LENGTH: + return "length"; + default: + return std::nullopt; + } +} + +static bool hasToolCallsInStreamingDelta(const rapidjson::Document& delta) { + if (!delta.HasMember("delta") || !delta["delta"].IsObject()) { + return false; + } + const auto& deltaObj = delta["delta"]; + return deltaObj.HasMember("tool_calls") && deltaObj["tool_calls"].IsArray(); +} + ParsedOutput OpenAIChatCompletionsHandler::parseOutputIfNeeded(const std::vector& generatedIds) { OVMS_PROFILE_FUNCTION(); ParsedOutput parsedOutput; @@ -878,22 +902,13 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(const std::vect // finish_reason: string; // "stop" => natural stop point due to stopping criteria // "length" => due to reaching max_tokens parameter + // "tool_calls" => generation stopped 
due to generated tool calls - std::string finishReason; - switch (generationOutput.finish_reason) { - case ov::genai::GenerationFinishReason::STOP: - finishReason = "stop"; - break; - case ov::genai::GenerationFinishReason::LENGTH: - finishReason = "length"; - break; - default: - finishReason = "unknown"; + std::optional finishReason = mapFinishReason(generationOutput.finish_reason, !parsedOutput.toolCalls.empty()); + if (!finishReason.has_value()) { SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Unknown finish reason: {}", static_cast(generationOutput.finish_reason)); - break; } - jsonResponse.FinishReason(finishReason); - + jsonResponse.FinishReason(finishReason.value_or("unknown")); // index: integer; Choice index, only n=1 supported anyway jsonResponse.Index(index++); @@ -1005,8 +1020,9 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(const ov::genai updateUsage(usage, tokens, request.echo); ParsedOutput parsedOutput = parseOutputIfNeeded(tokens); jsonResponse.StartObject(); - // finish_reason: string; always "stop" for this method - jsonResponse.FinishReason("stop"); + // finish_reason: "stop" in regular scenario, "tool_calls" if output contains tool calls + auto finishReason = mapFinishReason(ov::genai::GenerationFinishReason::STOP, !parsedOutput.toolCalls.empty()); + jsonResponse.FinishReason(finishReason.value_or("unknown")); // index: integer; Choice index, only n=1 supported anyway jsonResponse.Index(index++); @@ -1058,11 +1074,13 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(const ov::genai jsonResponse.StartArray("choices"); int index = 0; usage.completionTokens = completionTokens; + for (int i = 0; i < results.texts.size(); i++) { const std::string& text = results.texts[i]; SPDLOG_LOGGER_TRACE(llm_calculator_logger, "Generated text: {}", text); jsonResponse.StartObject(); // finish_reason: string; always "stop" for this method + // tool_calls from VLM legacy pipeline are unsupported due to lack of tokens in API, so 
finish reason cannot be "tool_calls" jsonResponse.FinishReason("stop"); // index: integer; Choice index, only n=1 supported anyway jsonResponse.Index(index++); @@ -1121,6 +1139,7 @@ std::string OpenAIChatCompletionsHandler::serializeStreamingChunk(const std::str Value choices(kArrayType); Value choice(kObjectType); + bool hasToolCalls = false; // choices: array of size N, where N is related to n request parameter choices.SetArray(); @@ -1129,19 +1148,9 @@ std::string OpenAIChatCompletionsHandler::serializeStreamingChunk(const std::str // "stop" => natural stop point due to stopping criteria // "length" => due to reaching max_tokens parameter // "content_filter" => when produced restricted output (not supported) - // "tool_calls" => generation stopped and waiting for tool output (not supported) + // "tool_calls" => generation stopped and waiting for tool output // "function_call" => deprecated // null - natural scenario when the generation has not completed yet - switch (finishReason) { - case ov::genai::GenerationFinishReason::STOP: - choice.AddMember("finish_reason", "stop", allocator); - break; - case ov::genai::GenerationFinishReason::LENGTH: - choice.AddMember("finish_reason", "length", allocator); - break; - default: - choice.AddMember("finish_reason", Value(), allocator); - } // index: integer; Choice index, only n=1 supported anyway choice.AddMember("index", 0, allocator); // logprobs: object/null; Log probability information for the choice. 
TODO @@ -1155,6 +1164,7 @@ std::string OpenAIChatCompletionsHandler::serializeStreamingChunk(const std::str if (delta->HasMember("delta")) { // Deep copy the "delta" member value into the choice object choice.AddMember("delta", Value((*delta)["delta"], allocator), allocator); + hasToolCalls = hasToolCallsInStreamingDelta(*delta); } } else { @@ -1167,6 +1177,13 @@ std::string OpenAIChatCompletionsHandler::serializeStreamingChunk(const std::str choice.AddMember("text", Value(chunkResponse.c_str(), allocator), allocator); } + auto serializedFinishReason = mapFinishReason(finishReason, hasToolCalls); + if (serializedFinishReason.has_value()) { + choice.AddMember("finish_reason", Value(serializedFinishReason.value().c_str(), allocator), allocator); + } else { + choice.AddMember("finish_reason", Value(rapidjson::kNullType), allocator); + } + choices.PushBack(choice, allocator); doc.AddMember("choices", choices, allocator); diff --git a/src/llm/visual_language_model/legacy/servable.cpp b/src/llm/visual_language_model/legacy/servable.cpp index 5a09a24390..7970257947 100644 --- a/src/llm/visual_language_model/legacy/servable.cpp +++ b/src/llm/visual_language_model/legacy/servable.cpp @@ -81,7 +81,9 @@ absl::Status VisualLanguageModelLegacyServable::parseRequest(std::shared_ptrapiHandler = std::make_shared(*legacyExecutionContext->payload.parsedJson, legacyExecutionContext->endpoint, std::chrono::system_clock::now(), - getProperties()->tokenizer); + getProperties()->tokenizer, + getProperties()->toolParserName, + getProperties()->reasoningParserName); auto& config = ovms::Config::instance(); auto status = executionContext->apiHandler->parseRequest(getProperties()->maxTokensLimit, getProperties()->bestOfLimit, getProperties()->maxModelLength, config.getServerSettings().allowedLocalMediaPath, config.getServerSettings().allowedMediaDomains); diff --git a/src/test/http_openai_handler_test.cpp b/src/test/http_openai_handler_test.cpp index fec2009867..9f0f26bff6 100644 --- 
a/src/test/http_openai_handler_test.cpp +++ b/src/test/http_openai_handler_test.cpp @@ -31,6 +31,8 @@ #include "test_utils.hpp" #include "platform_utils.hpp" +const std::string llama3TokenizerPathForHandlerTests = getGenericFullPathForSrcTest("/ovms/src/test/llm_testing/unsloth/Llama-3.1-8B-Instruct"); + class HttpOpenAIHandlerTest : public ::testing::Test { protected: ovms::Server& server = ovms::Server::instance(); @@ -402,6 +404,167 @@ class HttpOpenAIHandlerParsingTest : public ::testing::Test { } }; +static std::vector createLlama3ToolCallTokens(ov::genai::Tokenizer& tokenizer) { + std::string toolCall = "<|python_tag|>" + R"({"name": "example_tool", "parameters": {"arg1": "value1", "arg2": 42}})"; + auto generatedTensor = tokenizer.encode(toolCall, ov::genai::add_special_tokens(true)).input_ids; + std::vector generatedTokens(generatedTensor.data(), generatedTensor.data() + generatedTensor.get_size()); + return generatedTokens; +} + +TEST_F(HttpOpenAIHandlerParsingTest, serializeStreamingChunkReturnsIntermediateNullAndFinallyToolCallsFinishReason) { + std::shared_ptr llama3Tokenizer = std::make_shared(llama3TokenizerPathForHandlerTests); + std::string json = R"({ + "model": "llama", + "stream": true, + "messages": [{"role": "user", "content": "What is weather?"}], + "tools": [{ + "type": "function", + "function": { + "name": "get_humidity", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string"} + } + } + } + }] + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + + auto apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *llama3Tokenizer, "llama3"); + uint32_t maxTokensLimit = 100; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + + std::vector> stream = { + {"<|python_tag|>", ov::genai::GenerationFinishReason::NONE}, + {"{\"", 
ov::genai::GenerationFinishReason::NONE}, + {"name", ov::genai::GenerationFinishReason::NONE}, + {"\":", ov::genai::GenerationFinishReason::NONE}, + {" \"", ov::genai::GenerationFinishReason::NONE}, + {"get", ov::genai::GenerationFinishReason::NONE}, + {"_humidity", ov::genai::GenerationFinishReason::NONE}, + {"\",", ov::genai::GenerationFinishReason::NONE}, + {" \"", ov::genai::GenerationFinishReason::NONE}, + {"parameters", ov::genai::GenerationFinishReason::NONE}, + {"\":", ov::genai::GenerationFinishReason::NONE}, + {" {\"", ov::genai::GenerationFinishReason::NONE}, + {"location", ov::genai::GenerationFinishReason::NONE}, + {"\":", ov::genai::GenerationFinishReason::NONE}, + {" \"", ov::genai::GenerationFinishReason::NONE}, + {"Paris\"}}", ov::genai::GenerationFinishReason::STOP}, + }; + + std::vector serializedChunks; + for (const auto& [chunk, finishReason] : stream) { + std::string serialized = apiHandler->serializeStreamingChunk(chunk, finishReason); + if (!serialized.empty()) { + serializedChunks.push_back(serialized); + } + } + ASSERT_FALSE(serializedChunks.empty()); + const std::string& lastChunk = serializedChunks.back(); + ASSERT_NE(lastChunk.find("\"tool_calls\""), std::string::npos) << lastChunk; + ASSERT_NE(lastChunk.find("\"finish_reason\":\"tool_calls\""), std::string::npos) << lastChunk; + // Verify that intermediate chunks with NONE finish_reason are serialized correctly + ASSERT_GE(serializedChunks.size(), 2u); + for (size_t i = 0; i + 1 < serializedChunks.size(); ++i) { + const std::string& chunkStr = serializedChunks[i]; + ASSERT_NE(chunkStr.find("\"finish_reason\":null"), std::string::npos) << chunkStr; + } +} + +TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseGenerationOutputReturnsToolCallsFinishReason) { + std::shared_ptr llama3Tokenizer = std::make_shared(llama3TokenizerPathForHandlerTests); + std::string json = R"({ + "model": "llama", + "stream": false, + "messages": [{"role": "user", "content": "What is weather?"}], + 
"tools": [{ + "type": "function", + "function": { + "name": "example_tool", + "parameters": {"type": "object"} + } + }] + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + + auto apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *llama3Tokenizer, "llama3"); + uint32_t maxTokensLimit = 100; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + + ov::genai::GenerationOutput generationOutput; + generationOutput.generated_ids = createLlama3ToolCallTokens(*llama3Tokenizer); + generationOutput.finish_reason = ov::genai::GenerationFinishReason::STOP; // Change it once GenAI introduces tool_calls finish reason + std::string serialized = apiHandler->serializeUnaryResponse(std::vector{generationOutput}); + + ASSERT_NE(serialized.find("\"finish_reason\":\"tool_calls\""), std::string::npos) << serialized; + ASSERT_NE(serialized.find("\"tool_calls\":[{"), std::string::npos) << serialized; +} + +TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseEncodedResultsReturnsToolCallsFinishReason) { + std::shared_ptr llama3Tokenizer = std::make_shared(llama3TokenizerPathForHandlerTests); + std::string json = R"({ + "model": "llama", + "stream": false, + "messages": [{"role": "user", "content": "What is weather?"}], + "tools": [{ + "type": "function", + "function": { + "name": "example_tool", + "parameters": {"type": "object"} + } + }] + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + + auto apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *llama3Tokenizer, "llama3"); + uint32_t maxTokensLimit = 100; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + + ov::genai::EncodedResults results; + results.tokens = 
{createLlama3ToolCallTokens(*llama3Tokenizer)}; + std::string serialized = apiHandler->serializeUnaryResponse(results); + + ASSERT_NE(serialized.find("\"finish_reason\":\"tool_calls\""), std::string::npos) << serialized; + ASSERT_NE(serialized.find("\"tool_calls\":[{"), std::string::npos) << serialized; +} + +// This is unsupported, once we have tool calling for VLM legacy pipeline, change the test +TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseVLMSupportsToolCallsFinishReason_Unsupported) { + std::string json = R"({ + "model": "llama", + "stream": false, + "messages": [{"role": "user", "content": "What is weather?"}] + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + + auto apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); + uint32_t maxTokensLimit = 100; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + + ov::genai::VLMDecodedResults results; + results.texts = {"dummy"}; + std::string serialized = apiHandler->serializeUnaryResponse(results, 1); + + // ASSERT_NE(serialized.find("\"finish_reason\":\"tool_calls\""), std::string::npos) << serialized; + ASSERT_NE(serialized.find("\"finish_reason\":\"stop\""), std::string::npos) << serialized; +} + TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesSucceedsBase64) { std::string json = R"({ "model": "llama",