From 1d9d0a7556a29e4a6aaf53517d62ac37783bb1f7 Mon Sep 17 00:00:00 2001 From: Damian Kalinowski Date: Mon, 16 Feb 2026 17:16:22 +0100 Subject: [PATCH 01/11] save --- .../continuous_batching/servable.cpp | 18 +++++++++++++++++- .../legacy/legacy_executor.cpp | 4 +++- .../visual_language_model/legacy/servable.cpp | 18 +++++++++++++++++- 3 files changed, 37 insertions(+), 3 deletions(-) diff --git a/src/llm/visual_language_model/continuous_batching/servable.cpp b/src/llm/visual_language_model/continuous_batching/servable.cpp index 6a38d3e4bb..bb0ceb5e3f 100644 --- a/src/llm/visual_language_model/continuous_batching/servable.cpp +++ b/src/llm/visual_language_model/continuous_batching/servable.cpp @@ -93,7 +93,23 @@ absl::Status VisualLanguageModelServable::prepareInputs(std::shared_ptrinputText = properties->tokenizer.apply_chat_template(chatHistory, add_generation_prompt); + ov::genai::JsonContainer tools = ov::genai::JsonContainer::from_json_string(R"([ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get current weather by city", + "parameters": { + "type": "object", + "properties": { + "city": {"type": "string"} + }, + "required": ["city"] + } + } + } +])"); + vlmExecutionContext->inputText = properties->tokenizer.apply_chat_template(chatHistory, add_generation_prompt, {}, tools); } else { return absl::InvalidArgumentError("Unsupported endpoint"); } diff --git a/src/llm/visual_language_model/legacy/legacy_executor.cpp b/src/llm/visual_language_model/legacy/legacy_executor.cpp index 1e1ce255cb..84cfb513de 100644 --- a/src/llm/visual_language_model/legacy/legacy_executor.cpp +++ b/src/llm/visual_language_model/legacy/legacy_executor.cpp @@ -18,6 +18,8 @@ #include "servable.hpp" #include "vector" +#include + namespace ovms { VisualLanguageModelLegacyExecutor::VisualLanguageModelLegacyExecutor(std::shared_ptr pipe) { this->pipe = std::move(pipe); @@ -40,7 +42,7 @@ void VisualLanguageModelLegacyExecutor::processRequest() { } else { SPDLOG_LOGGER_TRACE(llm_executor_logger, "Generation started"); try { - requestExecutionContext->results = pipe->generate(requestExecutionContext->inputText, requestExecutionContext->inputImages, requestExecutionContext->generationConfigBuilder->getConfig(), requestExecutionContext->textStreamer); + requestExecutionContext->results = pipe->generate(requestExecutionContext->inputText, requestExecutionContext->inputImages, generationConfigBuilder->getConfig(), requestExecutionContext->textStreamer); } catch (std::exception& e) { requestExecutionContext->success = false; SPDLOG_LOGGER_ERROR(llm_executor_logger, "VLM pipeline generation failed: {}.", e.what()); diff --git a/src/llm/visual_language_model/legacy/servable.cpp b/src/llm/visual_language_model/legacy/servable.cpp index 5a09a24390..e26f785724 100644 --- a/src/llm/visual_language_model/legacy/servable.cpp +++ b/src/llm/visual_language_model/legacy/servable.cpp @@ -252,7 +252,23 @@ absl::Status VisualLanguageModelLegacyServable::prepareInputs(std::shared_ptrinputText = properties->tokenizer.apply_chat_template(chatHistory, add_generation_prompt); + ov::genai::JsonContainer tools = ov::genai::JsonContainer::from_json_string(R"([ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get current weather by city", + "parameters": { + "type": "object", + "properties": { + "city": {"type": "string"} + }, + "required": ["city"] + } + } + } +])"); + vlmExecutionContext->inputText = properties->tokenizer.apply_chat_template(chatHistory, add_generation_prompt, {}, tools); } else { return absl::InvalidArgumentError("Unsupported endpoint"); } From 3721f501343c57a970916d9b237c4b1d62b4f5e3 Mon Sep 17 00:00:00 2001 From: Damian Kalinowski Date: Tue, 17 Feb 2026 12:03:04 +0100 Subject: [PATCH 02/11] save --- src/llm/visual_language_model/continuous_batching/servable.cpp | 1 + src/llm/visual_language_model/legacy/legacy_executor.cpp | 2 +- src/llm/visual_language_model/legacy/servable.cpp | 3 ++- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/llm/visual_language_model/continuous_batching/servable.cpp b/src/llm/visual_language_model/continuous_batching/servable.cpp index bb0ceb5e3f..6001c9de25 100644 --- a/src/llm/visual_language_model/continuous_batching/servable.cpp +++ b/src/llm/visual_language_model/continuous_batching/servable.cpp @@ -110,6 +110,7 @@ absl::Status VisualLanguageModelServable::prepareInputs(std::shared_ptrinputText = properties->tokenizer.apply_chat_template(chatHistory, add_generation_prompt, {}, tools); + //vlmExecutionContext->inputText = properties->tokenizer.apply_chat_template(chatHistory, add_generation_prompt, {}); } else { return absl::InvalidArgumentError("Unsupported endpoint"); } diff --git a/src/llm/visual_language_model/legacy/legacy_executor.cpp b/src/llm/visual_language_model/legacy/legacy_executor.cpp index 84cfb513de..2e6ec331cf 100644 --- a/src/llm/visual_language_model/legacy/legacy_executor.cpp +++ b/src/llm/visual_language_model/legacy/legacy_executor.cpp @@ -42,7 +42,7 @@ void VisualLanguageModelLegacyExecutor::processRequest() { } else { SPDLOG_LOGGER_TRACE(llm_executor_logger, "Generation started"); try { - requestExecutionContext->results = pipe->generate(requestExecutionContext->inputText, requestExecutionContext->inputImages, generationConfigBuilder->getConfig(), requestExecutionContext->textStreamer); + requestExecutionContext->results = pipe->generate(requestExecutionContext->inputText, requestExecutionContext->inputImages, requestExecutionContext->generationConfigBuilder->getConfig(), requestExecutionContext->textStreamer); } catch (std::exception& e) { requestExecutionContext->success = false; SPDLOG_LOGGER_ERROR(llm_executor_logger, "VLM pipeline generation failed: {}.", e.what()); diff --git a/src/llm/visual_language_model/legacy/servable.cpp b/src/llm/visual_language_model/legacy/servable.cpp index e26f785724..8545f9de0e 100644 --- a/src/llm/visual_language_model/legacy/servable.cpp +++ b/src/llm/visual_language_model/legacy/servable.cpp @@ -268,7 +268,8 @@ absl::Status VisualLanguageModelLegacyServable::prepareInputs(std::shared_ptrinputText = properties->tokenizer.apply_chat_template(chatHistory, add_generation_prompt, {}, tools); + //vlmExecutionContext->inputText = properties->tokenizer.apply_chat_template(chatHistory, add_generation_prompt, {}, tools); + vlmExecutionContext->inputText = properties->tokenizer.apply_chat_template(chatHistory, add_generation_prompt, {}); } else { return absl::InvalidArgumentError("Unsupported endpoint"); } From 2f930ed9a4188f2e5f77f6b0bb9255929672d995 Mon Sep 17 00:00:00 2001 From: Damian Kalinowski Date: Tue, 17 Feb 2026 12:51:11 +0100 Subject: [PATCH 03/11] save --- src/llm/apis/openai_completions.cpp | 67 +++++++++++++++++++ src/llm/apis/openai_completions.hpp | 1 + src/llm/apis/openai_request.hpp | 3 + src/llm/servable.cpp | 1 + .../continuous_batching/servable.cpp | 25 ++----- .../visual_language_model/legacy/servable.cpp | 1 + 6 files changed, 80 insertions(+), 18 deletions(-) diff --git a/src/llm/apis/openai_completions.cpp b/src/llm/apis/openai_completions.cpp index ae2492abd7..9f9b463c40 100644 --- a/src/llm/apis/openai_completions.cpp +++ b/src/llm/apis/openai_completions.cpp @@ -17,6 +17,7 @@ #include "openai_completions.hpp" #include +#include #include #include "src/port/rapidjson_stringbuffer.hpp" #include "src/port/rapidjson_writer.hpp" @@ -44,6 +45,51 @@ namespace ovms { constexpr size_t DEFAULT_MAX_STOP_WORDS = 16; // same as deep-seek +namespace { + +ov::genai::JsonContainer rapidJsonValueToJsonContainer(const rapidjson::Value& value) { + if (value.IsNull()) { + return ov::genai::JsonContainer(nullptr); + } + if (value.IsBool()) { + return ov::genai::JsonContainer(value.GetBool()); + } + if (value.IsInt64()) { + return ov::genai::JsonContainer(value.GetInt64()); + } + if (value.IsUint64()) { + auto uintValue = value.GetUint64(); + if (uintValue <= static_cast(std::numeric_limits::max())) { + return ov::genai::JsonContainer(static_cast(uintValue)); + } + return ov::genai::JsonContainer(static_cast(uintValue)); + } + if (value.IsDouble()) { + return ov::genai::JsonContainer(value.GetDouble()); + } + if (value.IsString()) { + return ov::genai::JsonContainer(std::string(value.GetString(), value.GetStringLength())); + } + if (value.IsArray()) { + ov::genai::JsonContainer arrayContainer = ov::genai::JsonContainer::array(); + for (const auto& item : value.GetArray()) { + arrayContainer.push_back(rapidJsonValueToJsonContainer(item)); + } + return arrayContainer; + } + if (value.IsObject()) { + ov::genai::JsonContainer objectContainer = ov::genai::JsonContainer::object(); + for (auto member = value.MemberBegin(); member != value.MemberEnd(); ++member) { + const std::string key(member->name.GetString(), member->name.GetStringLength()); + objectContainer[key] = rapidJsonValueToJsonContainer(member->value); + } + return objectContainer; + } + throw std::invalid_argument("Unsupported JSON value type"); +} + +} // namespace + absl::Status OpenAIChatCompletionsHandler::parseCompletionsPart() { // prompt: string auto it = doc.FindMember("prompt"); @@ -430,6 +476,23 @@ absl::Status OpenAIChatCompletionsHandler::parseTools() { } request.toolChoice = tool_choice; + request.tools = std::nullopt; + if (it != doc.MemberEnd() && !it->value.IsNull()) { + try { + request.tools = rapidJsonValueToJsonContainer(it->value); + } catch (const std::exception& e) { + SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Direct tools conversion to JsonContainer failed: {}. Falling back to JSON string conversion.", e.what()); + try { + rapidjson::StringBuffer toolsBuffer; + rapidjson::Writer toolsWriter(toolsBuffer); + it->value.Accept(toolsWriter); + request.tools = ov::genai::JsonContainer::from_json_string(toolsBuffer.GetString()); + } catch (const std::exception& fallbackEx) { + SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Fallback tools conversion failed: {}", fallbackEx.what()); + return absl::InvalidArgumentError(absl::StrCat("Invalid tools payload: ", fallbackEx.what())); + } + } + } if (jsonChanged) { StringBuffer buffer; Writer writer(buffer); @@ -466,6 +529,10 @@ std::optional OpenAIChatCompletionsHandler::getResponseFormat() con return request.responseFormat; } +const std::optional& OpenAIChatCompletionsHandler::getTools() const { + return request.tools; +} + std::string convertOpenAIResponseFormatToStructuralTagStringFormat(const rapidjson::Value& openAIFormat) { // Build the new object: {"type": "structural_tag", "format": } // If response_format has {"json_schema": {"schema": {...}}}, flatten it to {"json_schema": {...}} diff --git a/src/llm/apis/openai_completions.hpp b/src/llm/apis/openai_completions.hpp index 7292d99c01..3b55e35573 100644 --- a/src/llm/apis/openai_completions.hpp +++ b/src/llm/apis/openai_completions.hpp @@ -102,6 +102,7 @@ class OpenAIChatCompletionsHandler { ov::genai::ChatHistory& getChatHistory(); std::optional getMaxTokens() const; std::optional getResponseFormat() const; + const std::optional& getTools() const; bool isStream() const; std::string getModel() const; diff --git a/src/llm/apis/openai_request.hpp b/src/llm/apis/openai_request.hpp index de355c12a1..dfb2151506 100644 --- a/src/llm/apis/openai_request.hpp +++ b/src/llm/apis/openai_request.hpp @@ -25,6 +25,7 @@ #include #include +#include #include #include "src/port/rapidjson_document.hpp" @@ -78,6 +79,8 @@ struct OpenAIChatCompletionsRequest { std::optional responseFormat{std::nullopt}; // Map that holds tool names and schemas for their arguments ToolsSchemas_t toolNameSchemaMap; + // Full tools payload in JSON form for passing directly to tokenizer chat template. + std::optional tools{std::nullopt}; // Holds value for tool_choice field as described in https://platform.openai.com/docs/api-reference/chat/create#chat_create-tool_choice std::string toolChoice; diff --git a/src/llm/servable.cpp b/src/llm/servable.cpp index 3c463a5aa9..e7aa2c8e01 100644 --- a/src/llm/servable.cpp +++ b/src/llm/servable.cpp @@ -156,6 +156,7 @@ absl::Status GenAiServable::parseRequest(std::shared_ptr& executionContext) { if (executionContext->apiHandler == nullptr) { return absl::Status(absl::StatusCode::kInvalidArgument, "API handler is not initialized"); diff --git a/src/llm/visual_language_model/continuous_batching/servable.cpp b/src/llm/visual_language_model/continuous_batching/servable.cpp index 6001c9de25..09aab47abc 100644 --- a/src/llm/visual_language_model/continuous_batching/servable.cpp +++ b/src/llm/visual_language_model/continuous_batching/servable.cpp @@ -62,6 +62,7 @@ std::shared_ptr VisualLanguageModelServable::getPropert return properties; } +// Continuous Batching VLM absl::Status VisualLanguageModelServable::prepareInputs(std::shared_ptr& executionContext) { auto vlmExecutionContext = std::static_pointer_cast(executionContext); if (vlmExecutionContext->apiHandler == nullptr) { @@ -93,24 +94,12 @@ absl::Status VisualLanguageModelServable::prepareInputs(std::shared_ptrinputText = properties->tokenizer.apply_chat_template(chatHistory, add_generation_prompt, {}, tools); - //vlmExecutionContext->inputText = properties->tokenizer.apply_chat_template(chatHistory, add_generation_prompt, {}); + const auto& tools = vlmExecutionContext->apiHandler->getTools(); + if (tools.has_value()) { + vlmExecutionContext->inputText = properties->tokenizer.apply_chat_template(chatHistory, add_generation_prompt, {}, tools); + } else { + vlmExecutionContext->inputText = properties->tokenizer.apply_chat_template(chatHistory, add_generation_prompt, {}); + } } else { return absl::InvalidArgumentError("Unsupported endpoint"); } diff --git a/src/llm/visual_language_model/legacy/servable.cpp b/src/llm/visual_language_model/legacy/servable.cpp index 8545f9de0e..62838dd57f 100644 --- a/src/llm/visual_language_model/legacy/servable.cpp +++ b/src/llm/visual_language_model/legacy/servable.cpp @@ -222,6 +222,7 @@ absl::Status VisualLanguageModelLegacyServable::preparePartialResponse(std::shar return absl::OkStatus(); } +// Legacy VLM absl::Status VisualLanguageModelLegacyServable::prepareInputs(std::shared_ptr& executionContext) { auto vlmExecutionContext = std::static_pointer_cast(executionContext); if (vlmExecutionContext->apiHandler == nullptr) { From 94203b06db0dd4fb898973674f3cefc4df570d96 Mon Sep 17 00:00:00 2001 From: Damian Kalinowski Date: Tue, 17 Feb 2026 13:20:29 +0100 Subject: [PATCH 04/11] save --- src/llm/apis/openai_completions.cpp | 2 ++ src/llm/servable.cpp | 7 +++- .../visual_language_model/legacy/servable.cpp | 35 ++++++++----------- .../legacy/servable_initializer.cpp | 9 +++++ 4 files changed, 32 insertions(+), 21 deletions(-) diff --git a/src/llm/apis/openai_completions.cpp b/src/llm/apis/openai_completions.cpp index 9f9b463c40..713466bc24 100644 --- a/src/llm/apis/openai_completions.cpp +++ b/src/llm/apis/openai_completions.cpp @@ -1214,7 +1214,9 @@ std::string OpenAIChatCompletionsHandler::serializeStreamingChunk(const std::str // logprobs: object/null; Log probability information for the choice. TODO choice.AddMember("logprobs", Value(), allocator); if (endpoint == Endpoint::CHAT_COMPLETIONS) { + SPDLOG_INFO("If have output parser, we will parseChunk via it"); if (outputParser != nullptr) { + SPDLOG_INFO("Passing chunk via output parser: {}", chunkResponse); std::optional delta = outputParser->parseChunk(chunkResponse, areToolsAvailable(), finishReason); if (!delta.has_value()) { return ""; diff --git a/src/llm/servable.cpp b/src/llm/servable.cpp index e7aa2c8e01..4b66ae12be 100644 --- a/src/llm/servable.cpp +++ b/src/llm/servable.cpp @@ -183,8 +183,13 @@ absl::Status GenAiServable::prepareInputs(std::shared_ptrapiHandler->getChatHistory(); constexpr bool add_generation_prompt = true; // confirm it should be hardcoded + const auto& tools = executionContext->apiHandler->getTools(); try { - inputText = getProperties()->tokenizer.apply_chat_template(chatHistory, add_generation_prompt); + if (tools.has_value()) { + inputText = getProperties()->tokenizer.apply_chat_template(chatHistory, add_generation_prompt, {}, tools); + } else { + inputText = getProperties()->tokenizer.apply_chat_template(chatHistory, add_generation_prompt); + } } catch (const std::exception& e) { SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Failed to apply chat template: {}", e.what()); return absl::Status(absl::StatusCode::kInvalidArgument, "Failed to apply chat template. The model either does not have chat template or has an invalid one."); diff --git a/src/llm/visual_language_model/legacy/servable.cpp b/src/llm/visual_language_model/legacy/servable.cpp index 62838dd57f..a7a7f5080f 100644 --- a/src/llm/visual_language_model/legacy/servable.cpp +++ b/src/llm/visual_language_model/legacy/servable.cpp @@ -81,7 +81,9 @@ absl::Status VisualLanguageModelLegacyServable::parseRequest(std::shared_ptrapiHandler = std::make_shared(*legacyExecutionContext->payload.parsedJson, legacyExecutionContext->endpoint, std::chrono::system_clock::now(), - getProperties()->tokenizer); + getProperties()->tokenizer, + getProperties()->toolParserName, + getProperties()->reasoningParserName); auto& config = ovms::Config::instance(); auto status = executionContext->apiHandler->parseRequest(getProperties()->maxTokensLimit, getProperties()->bestOfLimit, getProperties()->maxModelLength, config.getServerSettings().allowedLocalMediaPath, config.getServerSettings().allowedMediaDomains); @@ -101,7 +103,12 @@ absl::Status VisualLanguageModelLegacyServable::parseRequest(std::shared_ptrtextStreamer = std::make_shared(getProperties()->tokenizer, callback); + ov::AnyMap streamerConfig; + if (legacyExecutionContext->apiHandler->getOutputParser() != nullptr && + (legacyExecutionContext->apiHandler->getOutputParser()->requiresStreamingWithSpecialTokens())) { + streamerConfig.insert(ov::genai::skip_special_tokens(false)); + } + legacyExecutionContext->textStreamer = std::make_shared(getProperties()->tokenizer, callback, streamerConfig); } legacyExecutionContext->generationConfigBuilder = std::make_shared(getProperties()->baseGenerationConfig, getProperties()->toolParserName, @@ -253,24 +260,12 @@ absl::Status VisualLanguageModelLegacyServable::prepareInputs(std::shared_ptrinputText = properties->tokenizer.apply_chat_template(chatHistory, add_generation_prompt, {}, tools); - vlmExecutionContext->inputText = properties->tokenizer.apply_chat_template(chatHistory, add_generation_prompt, {}); + const auto& tools = vlmExecutionContext->apiHandler->getTools(); + if (tools.has_value()) { + vlmExecutionContext->inputText = properties->tokenizer.apply_chat_template(chatHistory, add_generation_prompt, {}, tools); + } else { + vlmExecutionContext->inputText = properties->tokenizer.apply_chat_template(chatHistory, add_generation_prompt, {}); + } } else { return absl::InvalidArgumentError("Unsupported endpoint"); } diff --git a/src/llm/visual_language_model/legacy/servable_initializer.cpp b/src/llm/visual_language_model/legacy/servable_initializer.cpp index 5fddcbc98e..ec8bfd327a 100644 --- a/src/llm/visual_language_model/legacy/servable_initializer.cpp +++ b/src/llm/visual_language_model/legacy/servable_initializer.cpp @@ -53,6 +53,14 @@ Status VisualLanguageModelLegacyServableInitializer::initialize(std::shared_ptr< if (std::filesystem::exists(modelGenerationConfigPath)) { properties->baseGenerationConfig = ov::genai::GenerationConfig(modelGenerationConfigPath.string()); } + + if (nodeOptions.has_tool_parser()) { + properties->toolParserName = nodeOptions.tool_parser(); + } + + if (nodeOptions.has_reasoning_parser()) { + properties->reasoningParserName = nodeOptions.reasoning_parser(); + } properties->schedulerConfig.max_num_batched_tokens = nodeOptions.max_num_batched_tokens(); properties->schedulerConfig.cache_size = nodeOptions.cache_size(); properties->schedulerConfig.dynamic_split_fuse = nodeOptions.dynamic_split_fuse(); @@ -90,6 +98,7 @@ Status VisualLanguageModelLegacyServableInitializer::initialize(std::shared_ptr< } properties->bestOfLimit = nodeOptions.best_of_limit(); properties->maxModelLength = parseMaxModelLength(parsedModelsPath); + properties->enableToolGuidedGeneration = nodeOptions.enable_tool_guided_generation(); return StatusCode::OK; } From beb8a3f5f7961f2ee4627535ca3fb0c72eb7d256 Mon Sep 17 00:00:00 2001 From: Damian Kalinowski Date: Tue, 17 Feb 2026 13:23:25 +0100 Subject: [PATCH 05/11] test --- src/llm/apis/openai_completions.cpp | 2 - src/test/http_openai_handler_test.cpp | 67 +++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 2 deletions(-) diff --git a/src/llm/apis/openai_completions.cpp b/src/llm/apis/openai_completions.cpp index 713466bc24..9f9b463c40 100644 --- a/src/llm/apis/openai_completions.cpp +++ b/src/llm/apis/openai_completions.cpp @@ -1214,9 +1214,7 @@ std::string OpenAIChatCompletionsHandler::serializeStreamingChunk(const std::str // logprobs: object/null; Log probability information for the choice. TODO choice.AddMember("logprobs", Value(), allocator); if (endpoint == Endpoint::CHAT_COMPLETIONS) { - SPDLOG_INFO("If have output parser, we will parseChunk via it"); if (outputParser != nullptr) { - SPDLOG_INFO("Passing chunk via output parser: {}", chunkResponse); std::optional delta = outputParser->parseChunk(chunkResponse, areToolsAvailable(), finishReason); if (!delta.has_value()) { return ""; diff --git a/src/test/http_openai_handler_test.cpp b/src/test/http_openai_handler_test.cpp index fec2009867..f185302ba0 100644 --- a/src/test/http_openai_handler_test.cpp +++ b/src/test/http_openai_handler_test.cpp @@ -1175,6 +1175,73 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParseRequestWithTools_Provided1_ChoiceNone) assertRequestWithTools(providedTools, toolsChoice, expectedJson); } +TEST_F(HttpOpenAIHandlerParsingTest, ParseRequestWithTools_PopulatesToolsJsonContainer) { + std::string json = R"({ + "model": "llama", + "messages": [ + { + "role": "user", + "content": "What is the weather?" + } + ], + "tools": [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get weather", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string" + } + }, + "required": ["location"] + } + } + } + ] + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + std::shared_ptr apiHandler = + std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); + + ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + const auto& tools = apiHandler->getTools(); + ASSERT_TRUE(tools.has_value()); + EXPECT_TRUE(tools->is_array()); + ASSERT_EQ(tools->size(), 1); + ASSERT_TRUE((*tools)[0]["function"]["name"].as_string().has_value()); + EXPECT_EQ((*tools)[0]["function"]["name"].as_string().value(), "get_weather"); +} + +TEST_F(HttpOpenAIHandlerParsingTest, OutputParserInitializationDependsOnParserNames) { + std::string json = R"({ + "model": "llama", + "messages": [ + { + "role": "user", + "content": "hello" + } + ] + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + + auto withoutParserNames = std::make_shared( + doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); + EXPECT_EQ(withoutParserNames->getOutputParser(), nullptr); + + auto withParserNames = std::make_shared( + doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer, "llama3", ""); + EXPECT_NE(withParserNames->getOutputParser(), nullptr); +} + // Provide get_weather1, get_weather2, get_weather3 but take only first one - get_weather1 TEST_F(HttpOpenAIHandlerParsingTest, ParseRequestWithTools_Provided3_ChoiceFirst) { std::string providedTools = R"( From 180e53d25b9a7415493205ab8bdfdf0c706a3def Mon Sep 17 00:00:00 2001 From: Damian Kalinowski Date: Tue, 17 Feb 2026 13:27:50 +0100 Subject: [PATCH 06/11] tests --- src/llm/apis/openai_completions.cpp | 7 ++ .../legacy/legacy_executor.cpp | 4 +- src/test/http_openai_handler_test.cpp | 2 +- .../complete_flow_test.cpp | 85 +++++++++++++++++++ 4 files changed, 94 insertions(+), 4 deletions(-) diff --git a/src/llm/apis/openai_completions.cpp b/src/llm/apis/openai_completions.cpp index 9f9b463c40..da292ce388 100644 --- a/src/llm/apis/openai_completions.cpp +++ b/src/llm/apis/openai_completions.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include "src/port/rapidjson_stringbuffer.hpp" #include "src/port/rapidjson_writer.hpp" #include @@ -54,6 +55,12 @@ ov::genai::JsonContainer rapidJsonValueToJsonContainer(const rapidjson::Value& v if (value.IsBool()) { return ov::genai::JsonContainer(value.GetBool()); } + if (value.IsInt()) { + return ov::genai::JsonContainer(value.GetInt()); + } + if (value.IsUint()) { + return ov::genai::JsonContainer(static_cast(value.GetUint())); + } if (value.IsInt64()) { return ov::genai::JsonContainer(value.GetInt64()); } diff --git a/src/llm/visual_language_model/legacy/legacy_executor.cpp b/src/llm/visual_language_model/legacy/legacy_executor.cpp index 2e6ec331cf..a21c799cec 100644 --- a/src/llm/visual_language_model/legacy/legacy_executor.cpp +++ b/src/llm/visual_language_model/legacy/legacy_executor.cpp @@ -16,9 +16,7 @@ #include "legacy_executor.hpp" #include "servable.hpp" -#include "vector" - -#include +#include namespace ovms { VisualLanguageModelLegacyExecutor::VisualLanguageModelLegacyExecutor(std::shared_ptr pipe) { diff --git a/src/test/http_openai_handler_test.cpp b/src/test/http_openai_handler_test.cpp index f185302ba0..915800f178 100644 --- a/src/test/http_openai_handler_test.cpp +++ b/src/test/http_openai_handler_test.cpp @@ -1238,7 +1238,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, OutputParserInitializationDependsOnParserNa EXPECT_EQ(withoutParserNames->getOutputParser(), nullptr); auto withParserNames = std::make_shared( - doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer, "llama3", ""); + doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer, "llama3", ""); EXPECT_NE(withParserNames->getOutputParser(), nullptr); } diff --git a/src/test/llm/visual_language_model/complete_flow_test.cpp b/src/test/llm/visual_language_model/complete_flow_test.cpp index 4afbd99f31..4dc22d6fa3 100644 --- a/src/test/llm/visual_language_model/complete_flow_test.cpp +++ b/src/test/llm/visual_language_model/complete_flow_test.cpp @@ -260,6 +260,50 @@ TEST_P(VLMServableExecutionTestParameterized, UnaryRestrictedTagUsed) { ovms::StatusCode::MEDIAPIPE_EXECUTION_ERROR); } +TEST_P(VLMServableExecutionTestParameterized, unaryBasicWithTools) { + auto modelName = GetParam(); + std::vector> fields = { + {"temperature", "0.0"}, + {"stream", "false"}, + {"max_tokens", "5"}, + {"ignore_eos", "true"}, + {"tool_choice", R"("auto")"}, + {"tools", R"([ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get weather by city", + "parameters": { + "type": "object", + "properties": { + "city": { + "type": "string" + } + }, + "required": ["city"] + } + } + } + ])"}}; + std::string requestBody = createRequestBody(modelName, fields); + + ASSERT_EQ( + handler->dispatchToProcessor(endpointChatCompletions, requestBody, &response, comp, responseComponents, writer, multiPartParser), + ovms::StatusCode::OK); + + parsedResponse.Parse(response.c_str()); + ASSERT_TRUE(parsedResponse.IsObject()); + ASSERT_TRUE(parsedResponse.HasMember("choices")); + ASSERT_TRUE(parsedResponse["choices"].IsArray()); + ASSERT_EQ(parsedResponse["choices"].Capacity(), 1); + ASSERT_TRUE(parsedResponse["choices"][0].HasMember("message")); + ASSERT_TRUE(parsedResponse["choices"][0]["message"].IsObject()); + ASSERT_TRUE(parsedResponse["choices"][0]["message"]["content"].IsString()); + EXPECT_STREQ(parsedResponse["object"].GetString(), "chat.completion"); + EXPECT_STREQ(parsedResponse["model"].GetString(), modelName.c_str()); +} + // Stream flow TEST_P(VLMServableExecutionTestParameterized, streamBasic) { @@ -361,6 +405,47 @@ TEST_P(VLMServableExecutionTestParameterized, streamRestrictedTagUsed) { ovms::StatusCode::PARTIAL_END); } +TEST_P(VLMServableExecutionTestParameterized, streamBasicWithTools) { + auto modelName = GetParam(); + std::vector> fields = { + {"temperature", "0.0"}, + {"stream", "true"}, + {"max_tokens", "5"}, + {"ignore_eos", "true"}, + {"tool_choice", R"("auto")"}, + {"tools", R"([ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get weather by city", + "parameters": { + "type": "object", + "properties": { + "city": { + "type": "string" + } + }, + "required": ["city"] + } + } + } + ])"}}; + std::string requestBody = createRequestBody(modelName, fields); + + std::vector responses; + EXPECT_CALL(*writer, PartialReply(::testing::_)) + .WillRepeatedly([&responses](std::string response) { + responses.push_back(response); + }); + EXPECT_CALL(*writer, PartialReplyEnd()).Times(1); + + ASSERT_EQ( + handler->dispatchToProcessor(endpointChatCompletions, requestBody, &response, comp, responseComponents, writer, multiPartParser), + ovms::StatusCode::PARTIAL_END); + ASSERT_FALSE(responses.empty()); +} + INSTANTIATE_TEST_SUITE_P( VLMServableExecutionTests, VLMServableExecutionTestParameterized, From 36f522fd622cede9457f9db03d2a6835510d2d2c Mon Sep 17 00:00:00 2001 From: Damian Kalinowski Date: Thu, 19 Feb 2026 12:18:15 +0100 Subject: [PATCH 07/11] hack --- src/llm/apis/openai_completions.cpp | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/src/llm/apis/openai_completions.cpp b/src/llm/apis/openai_completions.cpp index da292ce388..7adfce35e2 100644 --- a/src/llm/apis/openai_completions.cpp +++ b/src/llm/apis/openai_completions.cpp @@ -1135,6 +1135,25 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(const ov::genai for (int i = 0; i < results.texts.size(); i++) { const std::string& text = results.texts[i]; SPDLOG_LOGGER_TRACE(llm_calculator_logger, "Generated text: {}", text); + + // Workaround to use OVMS unary parsers: get tokens from string + // This way we have detokenized text from GenAI and calculate tokens, to further convert back to text again, in parseOutputIfNeeded... + auto result = tokenizer.encode(text); + auto& input_ids = result.input_ids; + if (input_ids.get_shape().size() != 2) + throw std::runtime_error("input_ids should have 2 dimensions"); + if (input_ids.get_shape()[0] != 1) + throw std::runtime_error("input_ids should have 1 batch size"); + if (input_ids.get_element_type() != ov::element::i64) + throw std::runtime_error("input_ids should have i64 element type"); + + int64_t* input_ids_data = reinterpret_cast(input_ids.data()); + std::vector tokens(input_ids_data, input_ids_data + input_ids.get_shape()[1]); + + SPDLOG_LOGGER_TRACE(llm_calculator_logger, "Generated tokens: {}", tokens); + updateUsage(usage, tokens, request.echo); + ParsedOutput parsedOutput = parseOutputIfNeeded(tokens); + jsonResponse.StartObject(); // finish_reason: string; always "stop" for this method jsonResponse.FinishReason("stop"); @@ -1142,16 +1161,10 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(const ov::genai jsonResponse.Index(index++); // logprobs: object/null; Log probability information for the choice. TODO - // message: object if (endpoint == Endpoint::CHAT_COMPLETIONS) { - jsonResponse.StartObject("message"); - jsonResponse.String("content", text); - jsonResponse.String("role", "assistant"); // TODO - hardcoded - // TODO: tools_call - // TODO: function_call (deprecated) - jsonResponse.EndObject(); + jsonResponse.MessageObject(parsedOutput); } else if (endpoint == Endpoint::COMPLETIONS) { - jsonResponse.String("text", text); + jsonResponse.Text(parsedOutput); } // finish message object From 7e94c18e0eaf96ef558a6cda5a330b08c787765c Mon Sep 17 00:00:00 2001 From: Damian Kalinowski Date: Thu, 19 Feb 2026 14:15:00 +0100 Subject: [PATCH 08/11] save --- src/llm/servable.cpp | 1 - src/llm/visual_language_model/continuous_batching/servable.cpp | 1 - src/llm/visual_language_model/legacy/servable.cpp | 1 - src/test/llm/visual_language_model/complete_flow_test.cpp | 2 +- 4 files changed, 1 insertion(+), 4 deletions(-) diff --git a/src/llm/servable.cpp b/src/llm/servable.cpp index 4b66ae12be..3af794e492 100644 --- a/src/llm/servable.cpp +++ b/src/llm/servable.cpp @@ -156,7 +156,6 @@ absl::Status GenAiServable::parseRequest(std::shared_ptr& executionContext) { if (executionContext->apiHandler == nullptr) { return absl::Status(absl::StatusCode::kInvalidArgument, "API handler is not initialized"); diff --git a/src/llm/visual_language_model/continuous_batching/servable.cpp b/src/llm/visual_language_model/continuous_batching/servable.cpp index 09aab47abc..4f8ce9710f 100644 --- a/src/llm/visual_language_model/continuous_batching/servable.cpp +++ b/src/llm/visual_language_model/continuous_batching/servable.cpp @@ -62,7 +62,6 @@ std::shared_ptr VisualLanguageModelServable::getPropert return properties; } -// Continuous Batching VLM absl::Status VisualLanguageModelServable::prepareInputs(std::shared_ptr& executionContext) { auto vlmExecutionContext = std::static_pointer_cast(executionContext); if (vlmExecutionContext->apiHandler == nullptr) { diff --git a/src/llm/visual_language_model/legacy/servable.cpp b/src/llm/visual_language_model/legacy/servable.cpp index a7a7f5080f..5c695c00e2 100644 --- a/src/llm/visual_language_model/legacy/servable.cpp +++ b/src/llm/visual_language_model/legacy/servable.cpp @@ -229,7 +229,6 @@ absl::Status VisualLanguageModelLegacyServable::preparePartialResponse(std::shar return absl::OkStatus(); } -// Legacy VLM absl::Status VisualLanguageModelLegacyServable::prepareInputs(std::shared_ptr& executionContext) { auto vlmExecutionContext = std::static_pointer_cast(executionContext); if (vlmExecutionContext->apiHandler == nullptr) { diff --git a/src/test/llm/visual_language_model/complete_flow_test.cpp b/src/test/llm/visual_language_model/complete_flow_test.cpp index 4dc22d6fa3..e976518f76 100644 --- a/src/test/llm/visual_language_model/complete_flow_test.cpp +++ b/src/test/llm/visual_language_model/complete_flow_test.cpp @@ -267,7 +267,7 @@ TEST_P(VLMServableExecutionTestParameterized, unaryBasicWithTools) { {"stream", "false"}, {"max_tokens", "5"}, {"ignore_eos", "true"}, - {"tool_choice", R"("auto")"}, + {"tool_choice", "auto"}, {"tools", R"([ { "type": "function", From 21ab2fd8fd1a65fc28d358739ee135d8d075b300 Mon Sep 17 00:00:00 2001 From: Damian Kalinowski Date: Thu, 19 Feb 2026 15:29:05 +0100 Subject: [PATCH 09/11] fix test --- src/llm/apis/openai_completions.cpp | 4 ++-- src/llm/apis/openai_completions.hpp | 2 +- src/llm/visual_language_model/legacy/servable.cpp | 8 +------- src/test/llm/visual_language_model/complete_flow_test.cpp | 2 +- 4 files changed, 5 insertions(+), 11 deletions(-) diff --git a/src/llm/apis/openai_completions.cpp b/src/llm/apis/openai_completions.cpp index 7adfce35e2..2f6ac16944 100644 --- a/src/llm/apis/openai_completions.cpp +++ b/src/llm/apis/openai_completions.cpp @@ -917,6 +917,7 @@ absl::Status OpenAIChatCompletionsHandler::parseRequest(std::optional void updateUsage(CompletionUsageStatistics& usage, const std::vector& generatedIds, bool echoPrompt) { OVMS_PROFILE_FUNCTION(); + SPDLOG_INFO("Echo prompt: {}", echoPrompt); usage.completionTokens += generatedIds.size(); if (echoPrompt) usage.completionTokens -= usage.promptTokens; @@ -1123,7 +1124,7 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(const ov::genai return jsonResponse.ToString(); } -std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(const ov::genai::VLMDecodedResults& results, size_t completionTokens) { +std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(const ov::genai::VLMDecodedResults& results) { OVMS_PROFILE_FUNCTION(); OpenAiJsonResponse jsonResponse; jsonResponse.StartObject(); @@ -1131,7 +1132,6 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(const ov::genai // choices: array of size N, where N is related to n request parameter jsonResponse.StartArray("choices"); int index = 0; - usage.completionTokens = completionTokens; for (int i = 0; i < results.texts.size(); i++) { const std::string& text = results.texts[i]; SPDLOG_LOGGER_TRACE(llm_calculator_logger, "Generated text: {}", text); diff --git a/src/llm/apis/openai_completions.hpp b/src/llm/apis/openai_completions.hpp index 3b55e35573..51264d6ad7 100644 --- a/src/llm/apis/openai_completions.hpp +++ b/src/llm/apis/openai_completions.hpp @@ -121,7 +121,7 @@ class OpenAIChatCompletionsHandler { std::string serializeUnaryResponse(const std::vector& generationOutputs); std::string serializeUnaryResponse(const ov::genai::EncodedResults& results); // VLMDecodedResults does not contain tokens that we can count, so we need to pass completionTokens in order to provide correct usage statistics - std::string serializeUnaryResponse(const ov::genai::VLMDecodedResults& results, size_t completionTokens); + std::string serializeUnaryResponse(const ov::genai::VLMDecodedResults& results); std::string serializeStreamingChunk(const std::string& chunkResponse, ov::genai::GenerationFinishReason finishReason); std::string serializeStreamingUsageChunk(); }; diff --git a/src/llm/visual_language_model/legacy/servable.cpp b/src/llm/visual_language_model/legacy/servable.cpp index 5c695c00e2..f104c08c4e 100644 --- a/src/llm/visual_language_model/legacy/servable.cpp +++ b/src/llm/visual_language_model/legacy/servable.cpp @@ -157,13 +157,7 @@ absl::Status VisualLanguageModelLegacyServable::prepareCompleteResponse(std::sha if (legacyExecutionContext->payload.client->isDisconnected()) { return absl::CancelledError(); } - size_t completionTokens = 0; - for (std::string text : legacyExecutionContext->results.texts) { - auto tokensTensor = properties->tokenizer.encode(text, ov::genai::add_special_tokens(false)).input_ids; - completionTokens += tokensTensor.get_size(); - } - SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Generated tokens number: {}", completionTokens); - executionContext->response = executionContext->apiHandler->serializeUnaryResponse(legacyExecutionContext->results, completionTokens); + executionContext->response = executionContext->apiHandler->serializeUnaryResponse(legacyExecutionContext->results); SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Complete unary response: {}", executionContext->response); return absl::OkStatus(); } diff --git a/src/test/llm/visual_language_model/complete_flow_test.cpp b/src/test/llm/visual_language_model/complete_flow_test.cpp index e976518f76..4dc22d6fa3 100644 --- a/src/test/llm/visual_language_model/complete_flow_test.cpp +++ b/src/test/llm/visual_language_model/complete_flow_test.cpp @@ -267,7 +267,7 @@ TEST_P(VLMServableExecutionTestParameterized, unaryBasicWithTools) { {"stream", "false"}, {"max_tokens", "5"}, {"ignore_eos", "true"}, - {"tool_choice", "auto"}, + {"tool_choice", R"("auto")"}, {"tools", R"([ { "type": "function", From 2ef6bb1422363a973d8a0c02e138ef6e903c0efa Mon Sep 17 00:00:00 2001 From: Damian Kalinowski Date: Thu, 19 Feb 2026 15:54:51 +0100 Subject: [PATCH 10/11] save --- src/test/http_openai_handler_test.cpp | 95 +++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) diff --git a/src/test/http_openai_handler_test.cpp b/src/test/http_openai_handler_test.cpp index 915800f178..e91fe30a13 100644 --- a/src/test/http_openai_handler_test.cpp +++ b/src/test/http_openai_handler_test.cpp @@ -1242,6 +1242,101 @@ TEST_F(HttpOpenAIHandlerParsingTest, OutputParserInitializationDependsOnParserNa EXPECT_NE(withParserNames->getOutputParser(), nullptr); } +TEST_F(HttpOpenAIHandlerParsingTest, SerializeUnaryResponseVLMDecodedResultsWithToolParser) { + std::string json = R"({ + "model": "llama", + "messages": [ + { + "role": "user", + "content": "hello" + } + ], + "tools": [ + { + "type": "function", + "function": { + "name": "get_weather", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string" + } + }, + "required": ["location"] + } + } + } + ] + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + + uint32_t maxTokensLimit = 64; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + + auto apiHandler = std::make_shared( + doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer, "hermes3", ""); + + ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + + ov::genai::VLMDecodedResults results; + results.texts.push_back( + "I will call a tool.{\"name\":\"get_weather\",\"arguments\":{\"location\":\"Paris\"}}"); + + std::string serialized = apiHandler->serializeUnaryResponse(results); + + rapidjson::Document responseDoc; + responseDoc.Parse(serialized.c_str()); + ASSERT_FALSE(responseDoc.HasParseError()); + ASSERT_TRUE(responseDoc.IsObject()); + + ASSERT_TRUE(responseDoc.HasMember("choices")); + ASSERT_TRUE(responseDoc["choices"].IsArray()); + ASSERT_EQ(responseDoc["choices"].Size(), 1); + + const auto& choice = responseDoc["choices"][0]; + ASSERT_TRUE(choice.IsObject()); + ASSERT_TRUE(choice.HasMember("finish_reason")); + ASSERT_TRUE(choice["finish_reason"].IsString()); + EXPECT_STREQ(choice["finish_reason"].GetString(), "stop"); + + ASSERT_TRUE(choice.HasMember("message")); + ASSERT_TRUE(choice["message"].IsObject()); + const auto& message = choice["message"]; + + ASSERT_TRUE(message.HasMember("content")); + ASSERT_TRUE(message["content"].IsString()); + EXPECT_STREQ(message["content"].GetString(), "I will call a tool."); + + ASSERT_TRUE(message.HasMember("tool_calls")); + ASSERT_TRUE(message["tool_calls"].IsArray()); + ASSERT_EQ(message["tool_calls"].Size(), 1); + + const auto& toolCall = message["tool_calls"][0]; + ASSERT_TRUE(toolCall.IsObject()); + ASSERT_TRUE(toolCall.HasMember("id")); + ASSERT_TRUE(toolCall["id"].IsString()); + EXPECT_GT(std::string(toolCall["id"].GetString()).size(), 0); + ASSERT_TRUE(toolCall.HasMember("function")); + ASSERT_TRUE(toolCall["function"].IsObject()); + ASSERT_TRUE(toolCall["function"].HasMember("name")); + EXPECT_STREQ(toolCall["function"]["name"].GetString(), "get_weather"); + ASSERT_TRUE(toolCall["function"].HasMember("arguments")); + EXPECT_STREQ(toolCall["function"]["arguments"].GetString(), "{\"location\":\"Paris\"}"); + + ASSERT_TRUE(responseDoc.HasMember("object")); + EXPECT_STREQ(responseDoc["object"].GetString(), "chat.completion"); + ASSERT_TRUE(responseDoc.HasMember("model")); + EXPECT_STREQ(responseDoc["model"].GetString(), "llama"); + + ASSERT_TRUE(responseDoc.HasMember("usage")); + ASSERT_TRUE(responseDoc["usage"].IsObject()); + ASSERT_TRUE(responseDoc["usage"].HasMember("completion_tokens")); + EXPECT_GT(responseDoc["usage"]["completion_tokens"].GetInt(), 0); +} + // Provide get_weather1, get_weather2, get_weather3 but take only first one - get_weather1 TEST_F(HttpOpenAIHandlerParsingTest, ParseRequestWithTools_Provided3_ChoiceFirst) { std::string providedTools = R"( From c2d5eb5f70b0475867e521d908e68b86adc10065 Mon Sep 17 00:00:00 2001 From: Damian Kalinowski Date: Fri, 20 Feb 2026 13:10:44 +0100 Subject: [PATCH 11/11] address review --- src/llm/apis/openai_completions.cpp | 42 +++++++++---------- src/llm/apis/openai_completions.hpp | 4 +- src/llm/apis/openai_request.hpp | 3 -- src/llm/servable.cpp | 6 ++- .../continuous_batching/servable.cpp | 8 +++- .../visual_language_model/legacy/servable.cpp | 8 +++- src/test/http_openai_handler_test.cpp | 6 ++- 7 files changed, 45 insertions(+), 32 deletions(-) diff --git a/src/llm/apis/openai_completions.cpp b/src/llm/apis/openai_completions.cpp index 2f6ac16944..8398011fbe 100644 --- a/src/llm/apis/openai_completions.cpp +++ b/src/llm/apis/openai_completions.cpp @@ -483,23 +483,6 @@ absl::Status OpenAIChatCompletionsHandler::parseTools() { } request.toolChoice = tool_choice; - request.tools = std::nullopt; - if (it != doc.MemberEnd() && !it->value.IsNull()) { - try { - request.tools = rapidJsonValueToJsonContainer(it->value); - } catch (const std::exception& e) { - SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Direct tools conversion to JsonContainer failed: {}. Falling back to JSON string conversion.", e.what()); - try { - rapidjson::StringBuffer toolsBuffer; - rapidjson::Writer toolsWriter(toolsBuffer); - it->value.Accept(toolsWriter); - request.tools = ov::genai::JsonContainer::from_json_string(toolsBuffer.GetString()); - } catch (const std::exception& fallbackEx) { - SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Fallback tools conversion failed: {}", fallbackEx.what()); - return absl::InvalidArgumentError(absl::StrCat("Invalid tools payload: ", fallbackEx.what())); - } - } - } if (jsonChanged) { StringBuffer buffer; Writer writer(buffer); @@ -509,6 +492,27 @@ absl::Status OpenAIChatCompletionsHandler::parseTools() { return absl::OkStatus(); } +absl::StatusOr> OpenAIChatCompletionsHandler::parseToolsToJsonContainer() { + auto it = doc.FindMember("tools"); + if (it == doc.MemberEnd() || it->value.IsNull()) { + return std::nullopt; + } + try { + return rapidJsonValueToJsonContainer(it->value); + } catch (const std::exception& e) { + SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Direct tools conversion to JsonContainer failed: {}. Falling back to JSON string conversion.", e.what()); + try { + rapidjson::StringBuffer toolsBuffer; + rapidjson::Writer toolsWriter(toolsBuffer); + it->value.Accept(toolsWriter); + return ov::genai::JsonContainer::from_json_string(toolsBuffer.GetString()); + } catch (const std::exception& fallbackEx) { + SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Fallback tools conversion failed: {}", fallbackEx.what()); + return absl::InvalidArgumentError(absl::StrCat("Invalid tools payload: ", fallbackEx.what())); + } + } +} + const bool OpenAIChatCompletionsHandler::areToolsAvailable() const { return !request.toolNameSchemaMap.empty(); } @@ -536,10 +540,6 @@ std::optional OpenAIChatCompletionsHandler::getResponseFormat() con return request.responseFormat; } -const std::optional& OpenAIChatCompletionsHandler::getTools() const { - return request.tools; -} - std::string convertOpenAIResponseFormatToStructuralTagStringFormat(const rapidjson::Value& openAIFormat) { // Build the new object: {"type": "structural_tag", "format": } // If response_format has {"json_schema": {"schema": {...}}}, flatten it to {"json_schema": {...}} diff --git a/src/llm/apis/openai_completions.hpp b/src/llm/apis/openai_completions.hpp index 51264d6ad7..9e48701d7f 100644 --- a/src/llm/apis/openai_completions.hpp +++ b/src/llm/apis/openai_completions.hpp @@ -26,6 +26,7 @@ #include #include +#include #include #include #include @@ -34,6 +35,7 @@ #pragma warning(push) #pragma warning(disable : 6001 4324 6385 6386) #include "absl/status/status.h" +#include "absl/status/statusor.h" #pragma warning(pop) #include "../io_processing/output_parser.hpp" #include "openai_request.hpp" @@ -102,7 +104,6 @@ class OpenAIChatCompletionsHandler { ov::genai::ChatHistory& getChatHistory(); std::optional getMaxTokens() const; std::optional getResponseFormat() const; - const std::optional& getTools() const; bool isStream() const; std::string getModel() const; @@ -116,6 +117,7 @@ class OpenAIChatCompletionsHandler { absl::Status parseRequest(std::optional maxTokensLimit, uint32_t bestOfLimit, std::optional maxModelLength, std::optional allowedLocalMediaPath = std::nullopt, std::optional> allowedMediaDomains = std::nullopt); absl::Status parseMessages(std::optional allowedLocalMediaPath = std::nullopt, std::optional> allowedMediaDomains = std::nullopt); absl::Status parseTools(); + absl::StatusOr> parseToolsToJsonContainer(); const bool areToolsAvailable() const; std::string serializeUnaryResponse(const std::vector& generationOutputs); diff --git a/src/llm/apis/openai_request.hpp b/src/llm/apis/openai_request.hpp index dfb2151506..de355c12a1 100644 --- a/src/llm/apis/openai_request.hpp +++ b/src/llm/apis/openai_request.hpp @@ -25,7 +25,6 @@ #include #include -#include #include #include "src/port/rapidjson_document.hpp" @@ -79,8 +78,6 @@ struct OpenAIChatCompletionsRequest { std::optional responseFormat{std::nullopt}; // Map that holds tool names and schemas for their arguments ToolsSchemas_t toolNameSchemaMap; - // Full tools payload in JSON form for passing directly to tokenizer chat template. - std::optional tools{std::nullopt}; // Holds value for tool_choice field as described in https://platform.openai.com/docs/api-reference/chat/create#chat_create-tool_choice std::string toolChoice; diff --git a/src/llm/servable.cpp b/src/llm/servable.cpp index 3af794e492..02d1fd0fd8 100644 --- a/src/llm/servable.cpp +++ b/src/llm/servable.cpp @@ -182,7 +182,11 @@ absl::Status GenAiServable::prepareInputs(std::shared_ptrapiHandler->getChatHistory(); constexpr bool add_generation_prompt = true; // confirm it should be hardcoded - const auto& tools = executionContext->apiHandler->getTools(); + auto toolsStatus = executionContext->apiHandler->parseToolsToJsonContainer(); + if (!toolsStatus.ok()) { + return toolsStatus.status(); + } + const auto& tools = toolsStatus.value(); try { if (tools.has_value()) { inputText = getProperties()->tokenizer.apply_chat_template(chatHistory, add_generation_prompt, {}, tools); diff --git a/src/llm/visual_language_model/continuous_batching/servable.cpp b/src/llm/visual_language_model/continuous_batching/servable.cpp index 4f8ce9710f..bd292ada00 100644 --- a/src/llm/visual_language_model/continuous_batching/servable.cpp +++ b/src/llm/visual_language_model/continuous_batching/servable.cpp @@ -93,11 +93,15 @@ absl::Status VisualLanguageModelServable::prepareInputs(std::shared_ptrapiHandler->getTools(); + auto toolsStatus = vlmExecutionContext->apiHandler->parseToolsToJsonContainer(); + if (!toolsStatus.ok()) { + return toolsStatus.status(); + } + const auto& tools = toolsStatus.value(); if (tools.has_value()) { vlmExecutionContext->inputText = properties->tokenizer.apply_chat_template(chatHistory, add_generation_prompt, {}, tools); } else { - vlmExecutionContext->inputText = properties->tokenizer.apply_chat_template(chatHistory, add_generation_prompt, {}); + vlmExecutionContext->inputText = properties->tokenizer.apply_chat_template(chatHistory, add_generation_prompt); } } else { return absl::InvalidArgumentError("Unsupported endpoint"); diff --git a/src/llm/visual_language_model/legacy/servable.cpp b/src/llm/visual_language_model/legacy/servable.cpp index f104c08c4e..fed304bf80 100644 --- a/src/llm/visual_language_model/legacy/servable.cpp +++ b/src/llm/visual_language_model/legacy/servable.cpp @@ -253,11 +253,15 @@ absl::Status VisualLanguageModelLegacyServable::prepareInputs(std::shared_ptrapiHandler->getTools(); + auto toolsStatus = vlmExecutionContext->apiHandler->parseToolsToJsonContainer(); + if (!toolsStatus.ok()) { + return toolsStatus.status(); + } + const auto& tools = toolsStatus.value(); if (tools.has_value()) { vlmExecutionContext->inputText = properties->tokenizer.apply_chat_template(chatHistory, add_generation_prompt, {}, tools); } else { - vlmExecutionContext->inputText = properties->tokenizer.apply_chat_template(chatHistory, add_generation_prompt, {}); + vlmExecutionContext->inputText = properties->tokenizer.apply_chat_template(chatHistory, add_generation_prompt); } } else { return absl::InvalidArgumentError("Unsupported endpoint"); diff --git a/src/test/http_openai_handler_test.cpp b/src/test/http_openai_handler_test.cpp index e91fe30a13..0ddb241fa3 100644 --- a/src/test/http_openai_handler_test.cpp +++ b/src/test/http_openai_handler_test.cpp @@ -1175,7 +1175,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParseRequestWithTools_Provided1_ChoiceNone) assertRequestWithTools(providedTools, toolsChoice, expectedJson); } -TEST_F(HttpOpenAIHandlerParsingTest, ParseRequestWithTools_PopulatesToolsJsonContainer) { +TEST_F(HttpOpenAIHandlerParsingTest, ParseRequestWithTools_ParsesToolsJsonContainerOnDemand) { std::string json = R"({ "model": "llama", "messages": [ @@ -1212,7 +1212,9 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParseRequestWithTools_PopulatesToolsJsonCon std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); - const auto& tools = apiHandler->getTools(); + auto toolsStatus = apiHandler->parseToolsToJsonContainer(); + ASSERT_TRUE(toolsStatus.ok()); + const auto& tools = toolsStatus.value(); ASSERT_TRUE(tools.has_value()); EXPECT_TRUE(tools->is_array()); ASSERT_EQ(tools->size(), 1);