Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 97 additions & 10 deletions src/llm/apis/openai_completions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@
#include "openai_completions.hpp"

#include <cmath>
#include <limits>
#include <memory>
#include <stdexcept>
#include "src/port/rapidjson_stringbuffer.hpp"
#include "src/port/rapidjson_writer.hpp"
#include <set>
Expand All @@ -44,6 +46,57 @@ namespace ovms {

constexpr size_t DEFAULT_MAX_STOP_WORDS = 16; // same as deep-seek

namespace {

// Recursively converts a rapidjson DOM node into an ov::genai::JsonContainer.
// Note: JsonContainer has no unsigned 64-bit representation, so uint64 values
// above the int64 maximum are converted to double (with possible precision loss).
// Throws std::invalid_argument for value types that cannot be represented.
ov::genai::JsonContainer rapidJsonValueToJsonContainer(const rapidjson::Value& value) {
    switch (value.GetType()) {
    case rapidjson::kNullType:
        return ov::genai::JsonContainer(nullptr);
    case rapidjson::kFalseType:
    case rapidjson::kTrueType:
        return ov::genai::JsonContainer(value.GetBool());
    case rapidjson::kNumberType:
        if (value.IsInt()) {
            return ov::genai::JsonContainer(value.GetInt());
        }
        if (value.IsUint()) {
            return ov::genai::JsonContainer(static_cast<int64_t>(value.GetUint()));
        }
        if (value.IsInt64()) {
            return ov::genai::JsonContainer(value.GetInt64());
        }
        if (value.IsUint64()) {
            const uint64_t u64 = value.GetUint64();
            // JsonContainer only supports signed 64-bit integers; fall back to
            // double when the value does not fit into int64.
            if (u64 <= static_cast<uint64_t>(std::numeric_limits<int64_t>::max())) {
                return ov::genai::JsonContainer(static_cast<int64_t>(u64));
            }
            return ov::genai::JsonContainer(static_cast<double>(u64));
        }
        if (value.IsDouble()) {
            return ov::genai::JsonContainer(value.GetDouble());
        }
        break;
    case rapidjson::kStringType:
        return ov::genai::JsonContainer(std::string(value.GetString(), value.GetStringLength()));
    case rapidjson::kArrayType: {
        auto arrayResult = ov::genai::JsonContainer::array();
        for (const auto& element : value.GetArray()) {
            arrayResult.push_back(rapidJsonValueToJsonContainer(element));
        }
        return arrayResult;
    }
    case rapidjson::kObjectType: {
        auto objectResult = ov::genai::JsonContainer::object();
        for (const auto& member : value.GetObject()) {
            const std::string key(member.name.GetString(), member.name.GetStringLength());
            objectResult[key] = rapidJsonValueToJsonContainer(member.value);
        }
        return objectResult;
    }
    default:
        break;
    }
    throw std::invalid_argument("Unsupported JSON value type");
}

} // namespace

absl::Status OpenAIChatCompletionsHandler::parseCompletionsPart() {
// prompt: string
auto it = doc.FindMember("prompt");
Expand Down Expand Up @@ -439,6 +492,27 @@ absl::Status OpenAIChatCompletionsHandler::parseTools() {
return absl::OkStatus();
}

// Extracts the "tools" member of the request document as a genai JsonContainer.
// Returns std::nullopt when the field is absent or null. The direct DOM-to-container
// conversion is attempted first; if it throws, the value is serialized back to a JSON
// string and handed to JsonContainer::from_json_string. An InvalidArgumentError is
// returned only when both conversion paths fail.
absl::StatusOr<std::optional<ov::genai::JsonContainer>> OpenAIChatCompletionsHandler::parseToolsToJsonContainer() {
    const auto toolsMember = doc.FindMember("tools");
    if (toolsMember == doc.MemberEnd() || toolsMember->value.IsNull()) {
        return std::nullopt;
    }
    try {
        return rapidJsonValueToJsonContainer(toolsMember->value);
    } catch (const std::exception& directEx) {
        // Fall through to the string-based conversion below.
        SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Direct tools conversion to JsonContainer failed: {}. Falling back to JSON string conversion.", directEx.what());
    }
    try {
        rapidjson::StringBuffer buffer;
        rapidjson::Writer<rapidjson::StringBuffer> writer(buffer);
        toolsMember->value.Accept(writer);
        return ov::genai::JsonContainer::from_json_string(buffer.GetString());
    } catch (const std::exception& fallbackEx) {
        SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Fallback tools conversion failed: {}", fallbackEx.what());
        return absl::InvalidArgumentError(absl::StrCat("Invalid tools payload: ", fallbackEx.what()));
    }
}

// Reports whether any tool name->schema entries are present on the request
// (presumably populated by parseTools() — verify against that method).
// NOTE(review): the top-level `const` on the `bool` return value is redundant;
// dropping it would require changing the header declaration as well.
const bool OpenAIChatCompletionsHandler::areToolsAvailable() const {
    return !request.toolNameSchemaMap.empty();
}
Expand Down Expand Up @@ -843,6 +917,7 @@ absl::Status OpenAIChatCompletionsHandler::parseRequest(std::optional<uint32_t>

void updateUsage(CompletionUsageStatistics& usage, const std::vector<int64_t>& generatedIds, bool echoPrompt) {
OVMS_PROFILE_FUNCTION();
SPDLOG_INFO("Echo prompt: {}", echoPrompt);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Logging on INFO?

usage.completionTokens += generatedIds.size();
if (echoPrompt)
usage.completionTokens -= usage.promptTokens;
Expand Down Expand Up @@ -1049,35 +1124,47 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(const ov::genai
return jsonResponse.ToString();
}

std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(const ov::genai::VLMDecodedResults& results, size_t completionTokens) {
std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(const ov::genai::VLMDecodedResults& results) {
OVMS_PROFILE_FUNCTION();
OpenAiJsonResponse jsonResponse;
jsonResponse.StartObject();

// choices: array of size N, where N is related to n request parameter
jsonResponse.StartArray("choices");
int index = 0;
usage.completionTokens = completionTokens;
for (int i = 0; i < results.texts.size(); i++) {
const std::string& text = results.texts[i];
SPDLOG_LOGGER_TRACE(llm_calculator_logger, "Generated text: {}", text);

// Workaround to use OVMS unary parsers: get tokens from string
// This way we have detokenized text from GenAI and calculate tokens, to further convert back to text again, in parseOutputIfNeeded...
auto result = tokenizer.encode(text);
auto& input_ids = result.input_ids;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

those are only new tokens right? prompt not included?
just checking as input_ids name here is a bit confusing

if (input_ids.get_shape().size() != 2)
throw std::runtime_error("input_ids should have 2 dimensions");
if (input_ids.get_shape()[0] != 1)
throw std::runtime_error("input_ids should have 1 batch size");
if (input_ids.get_element_type() != ov::element::i64)
throw std::runtime_error("input_ids should have i64 element type");

int64_t* input_ids_data = reinterpret_cast<int64_t*>(input_ids.data());
std::vector<int64_t> tokens(input_ids_data, input_ids_data + input_ids.get_shape()[1]);

SPDLOG_LOGGER_TRACE(llm_calculator_logger, "Generated tokens: {}", tokens);
updateUsage(usage, tokens, request.echo);
ParsedOutput parsedOutput = parseOutputIfNeeded(tokens);

jsonResponse.StartObject();
// finish_reason: string; always "stop" for this method
jsonResponse.FinishReason("stop");
// index: integer; Choice index, only n=1 supported anyway
jsonResponse.Index(index++);
// logprobs: object/null; Log probability information for the choice. TODO

// message: object
if (endpoint == Endpoint::CHAT_COMPLETIONS) {
jsonResponse.StartObject("message");
jsonResponse.String("content", text);
jsonResponse.String("role", "assistant"); // TODO - hardcoded
// TODO: tools_call
// TODO: function_call (deprecated)
jsonResponse.EndObject();
jsonResponse.MessageObject(parsedOutput);
} else if (endpoint == Endpoint::COMPLETIONS) {
jsonResponse.String("text", text);
jsonResponse.Text(parsedOutput);
}

// finish message object
Expand Down
5 changes: 4 additions & 1 deletion src/llm/apis/openai_completions.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@

#include <openvino/genai/generation_config.hpp>
#include <openvino/genai/generation_handle.hpp>
#include <openvino/genai/json_container.hpp>
#include <openvino/genai/llm_pipeline.hpp>
#include <openvino/genai/tokenizer.hpp>
#include <openvino/genai/visual_language/pipeline.hpp>
Expand All @@ -34,6 +35,7 @@
#pragma warning(push)
#pragma warning(disable : 6001 4324 6385 6386)
#include "absl/status/status.h"
#include "absl/status/statusor.h"
#pragma warning(pop)
#include "../io_processing/output_parser.hpp"
#include "openai_request.hpp"
Expand Down Expand Up @@ -115,12 +117,13 @@ class OpenAIChatCompletionsHandler {
absl::Status parseRequest(std::optional<uint32_t> maxTokensLimit, uint32_t bestOfLimit, std::optional<uint32_t> maxModelLength, std::optional<std::string> allowedLocalMediaPath = std::nullopt, std::optional<std::vector<std::string>> allowedMediaDomains = std::nullopt);
absl::Status parseMessages(std::optional<std::string> allowedLocalMediaPath = std::nullopt, std::optional<std::vector<std::string>> allowedMediaDomains = std::nullopt);
absl::Status parseTools();
absl::StatusOr<std::optional<ov::genai::JsonContainer>> parseToolsToJsonContainer();
const bool areToolsAvailable() const;

std::string serializeUnaryResponse(const std::vector<ov::genai::GenerationOutput>& generationOutputs);
std::string serializeUnaryResponse(const ov::genai::EncodedResults& results);
// VLMDecodedResults does not contain tokens that we can count, so we need to pass completionTokens in order to provide correct usage statistics
std::string serializeUnaryResponse(const ov::genai::VLMDecodedResults& results, size_t completionTokens);
std::string serializeUnaryResponse(const ov::genai::VLMDecodedResults& results);
std::string serializeStreamingChunk(const std::string& chunkResponse, ov::genai::GenerationFinishReason finishReason);
std::string serializeStreamingUsageChunk();
};
Expand Down
11 changes: 10 additions & 1 deletion src/llm/servable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -182,8 +182,17 @@ absl::Status GenAiServable::prepareInputs(std::shared_ptr<GenAiServableExecution
#else
ov::genai::ChatHistory& chatHistory = executionContext->apiHandler->getChatHistory();
constexpr bool add_generation_prompt = true; // confirm it should be hardcoded
auto toolsStatus = executionContext->apiHandler->parseToolsToJsonContainer();
if (!toolsStatus.ok()) {
return toolsStatus.status();
}
const auto& tools = toolsStatus.value();
try {
inputText = getProperties()->tokenizer.apply_chat_template(chatHistory, add_generation_prompt);
if (tools.has_value()) {
inputText = getProperties()->tokenizer.apply_chat_template(chatHistory, add_generation_prompt, {}, tools);
} else {
inputText = getProperties()->tokenizer.apply_chat_template(chatHistory, add_generation_prompt);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

from different comment:

this is custom chat template. if empty string is provided, internal one is used.
Should we add it here as well?

}
} catch (const std::exception& e) {
SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Failed to apply chat template: {}", e.what());
return absl::Status(absl::StatusCode::kInvalidArgument, "Failed to apply chat template. The model either does not have chat template or has an invalid one.");
Expand Down
11 changes: 10 additions & 1 deletion src/llm/visual_language_model/continuous_batching/servable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,16 @@ absl::Status VisualLanguageModelServable::prepareInputs(std::shared_ptr<GenAiSer
}

constexpr bool add_generation_prompt = true; // confirm it should be hardcoded
vlmExecutionContext->inputText = properties->tokenizer.apply_chat_template(chatHistory, add_generation_prompt);
auto toolsStatus = vlmExecutionContext->apiHandler->parseToolsToJsonContainer();
if (!toolsStatus.ok()) {
return toolsStatus.status();
}
const auto& tools = toolsStatus.value();
if (tools.has_value()) {
vlmExecutionContext->inputText = properties->tokenizer.apply_chat_template(chatHistory, add_generation_prompt, {}, tools);
} else {
vlmExecutionContext->inputText = properties->tokenizer.apply_chat_template(chatHistory, add_generation_prompt);
}
} else {
return absl::InvalidArgumentError("Unsupported endpoint");
}
Expand Down
2 changes: 1 addition & 1 deletion src/llm/visual_language_model/legacy/legacy_executor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

#include "legacy_executor.hpp"
#include "servable.hpp"
#include "vector"
#include <vector>

namespace ovms {
VisualLanguageModelLegacyExecutor::VisualLanguageModelLegacyExecutor(std::shared_ptr<ov::genai::VLMPipeline> pipe) {
Expand Down
30 changes: 20 additions & 10 deletions src/llm/visual_language_model/legacy/servable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,9 @@ absl::Status VisualLanguageModelLegacyServable::parseRequest(std::shared_ptr<Gen
legacyExecutionContext->apiHandler = std::make_shared<OpenAIChatCompletionsHandler>(*legacyExecutionContext->payload.parsedJson,
legacyExecutionContext->endpoint,
std::chrono::system_clock::now(),
getProperties()->tokenizer);
getProperties()->tokenizer,
getProperties()->toolParserName,
getProperties()->reasoningParserName);
auto& config = ovms::Config::instance();

auto status = executionContext->apiHandler->parseRequest(getProperties()->maxTokensLimit, getProperties()->bestOfLimit, getProperties()->maxModelLength, config.getServerSettings().allowedLocalMediaPath, config.getServerSettings().allowedMediaDomains);
Expand All @@ -101,7 +103,12 @@ absl::Status VisualLanguageModelLegacyServable::parseRequest(std::shared_ptr<Gen
}
return ov::genai::StreamingStatus::RUNNING;
};
legacyExecutionContext->textStreamer = std::make_shared<ov::genai::TextStreamer>(getProperties()->tokenizer, callback);
ov::AnyMap streamerConfig;
if (legacyExecutionContext->apiHandler->getOutputParser() != nullptr &&
(legacyExecutionContext->apiHandler->getOutputParser()->requiresStreamingWithSpecialTokens())) {
streamerConfig.insert(ov::genai::skip_special_tokens(false));
}
legacyExecutionContext->textStreamer = std::make_shared<ov::genai::TextStreamer>(getProperties()->tokenizer, callback, streamerConfig);
}
legacyExecutionContext->generationConfigBuilder = std::make_shared<GenerationConfigBuilder>(getProperties()->baseGenerationConfig,
getProperties()->toolParserName,
Expand Down Expand Up @@ -150,13 +157,7 @@ absl::Status VisualLanguageModelLegacyServable::prepareCompleteResponse(std::sha
if (legacyExecutionContext->payload.client->isDisconnected()) {
return absl::CancelledError();
}
size_t completionTokens = 0;
for (std::string text : legacyExecutionContext->results.texts) {
auto tokensTensor = properties->tokenizer.encode(text, ov::genai::add_special_tokens(false)).input_ids;
completionTokens += tokensTensor.get_size();
}
SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Generated tokens number: {}", completionTokens);
executionContext->response = executionContext->apiHandler->serializeUnaryResponse(legacyExecutionContext->results, completionTokens);
executionContext->response = executionContext->apiHandler->serializeUnaryResponse(legacyExecutionContext->results);
SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Complete unary response: {}", executionContext->response);
return absl::OkStatus();
}
Expand Down Expand Up @@ -252,7 +253,16 @@ absl::Status VisualLanguageModelLegacyServable::prepareInputs(std::shared_ptr<Ge
}

constexpr bool add_generation_prompt = true; // confirm it should be hardcoded
vlmExecutionContext->inputText = properties->tokenizer.apply_chat_template(chatHistory, add_generation_prompt);
auto toolsStatus = vlmExecutionContext->apiHandler->parseToolsToJsonContainer();
if (!toolsStatus.ok()) {
return toolsStatus.status();
}
const auto& tools = toolsStatus.value();
if (tools.has_value()) {
vlmExecutionContext->inputText = properties->tokenizer.apply_chat_template(chatHistory, add_generation_prompt, {}, tools);
} else {
vlmExecutionContext->inputText = properties->tokenizer.apply_chat_template(chatHistory, add_generation_prompt);
}
} else {
return absl::InvalidArgumentError("Unsupported endpoint");
}
Expand Down
9 changes: 9 additions & 0 deletions src/llm/visual_language_model/legacy/servable_initializer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,14 @@ Status VisualLanguageModelLegacyServableInitializer::initialize(std::shared_ptr<
if (std::filesystem::exists(modelGenerationConfigPath)) {
properties->baseGenerationConfig = ov::genai::GenerationConfig(modelGenerationConfigPath.string());
}

if (nodeOptions.has_tool_parser()) {
properties->toolParserName = nodeOptions.tool_parser();
}

if (nodeOptions.has_reasoning_parser()) {
properties->reasoningParserName = nodeOptions.reasoning_parser();
}
properties->schedulerConfig.max_num_batched_tokens = nodeOptions.max_num_batched_tokens();
properties->schedulerConfig.cache_size = nodeOptions.cache_size();
properties->schedulerConfig.dynamic_split_fuse = nodeOptions.dynamic_split_fuse();
Expand Down Expand Up @@ -90,6 +98,7 @@ Status VisualLanguageModelLegacyServableInitializer::initialize(std::shared_ptr<
}
properties->bestOfLimit = nodeOptions.best_of_limit();
properties->maxModelLength = parseMaxModelLength(parsedModelsPath);
properties->enableToolGuidedGeneration = nodeOptions.enable_tool_guided_generation();
return StatusCode::OK;
}

Expand Down
Loading