From 5abb01aa34aec3fbf53825aa7d8748a2da7ce2b4 Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Wed, 17 Dec 2025 17:24:27 +0100 Subject: [PATCH] Initial support of Kokoro model --- Dockerfile.redhat | 5 +- Dockerfile.ubuntu | 4 +- src/BUILD | 1 + src/audio/audio_utils.cpp | 31 ++ src/audio/audio_utils.hpp | 2 + src/audio/kokoro/BUILD | 60 ++++ src/audio/kokoro/kokoro_calculator.cc | 320 ++++++++++++++++++ src/audio/kokoro/kokoro_calculator.proto | 33 ++ src/audio/kokoro/kokoro_servable.hpp | 197 +++++++++++ src/logging.cpp | 4 + src/logging.hpp | 1 + .../mediapipegraphdefinition.cpp | 23 ++ .../mediapipegraphdefinition.hpp | 8 +- .../mediapipegraphexecutor.cpp | 4 +- .../mediapipegraphexecutor.hpp | 3 + 15 files changed, 692 insertions(+), 4 deletions(-) create mode 100644 src/audio/kokoro/BUILD create mode 100644 src/audio/kokoro/kokoro_calculator.cc create mode 100644 src/audio/kokoro/kokoro_calculator.proto create mode 100644 src/audio/kokoro/kokoro_servable.hpp diff --git a/Dockerfile.redhat b/Dockerfile.redhat index bc574eaaf2..41e02ecc12 100644 --- a/Dockerfile.redhat +++ b/Dockerfile.redhat @@ -127,7 +127,9 @@ RUN dnf install -y -d6 \ python3.12 \ python3.12-devel \ python3.12-pip \ - libicu-devel && \ + libicu-devel \ + espeak-ng \ + espeak-ng-devel && \ dnf clean all WORKDIR / @@ -416,6 +418,7 @@ RUN if [ -f /usr/bin/dnf ] ; then export DNF_TOOL=dnf ; echo -e "max_parallel_do if ! 
[[ $debug_bazel_flags == *"py_off"* ]]; then \ $DNF_TOOL install -y python3.12-libs --setopt=install_weak_deps=0 --nodocs; \ fi ; \ + $DNF_TOOL install -y espeak-ng --setopt=install_weak_deps=0 --nodocs; \ $DNF_TOOL install -y shadow-utils; \ $DNF_TOOL clean all ; \ cp -v /etc/ssl/certs/ca-bundle.crt /etc/ssl/certs/ca-certificates.crt ; \ diff --git a/Dockerfile.ubuntu b/Dockerfile.ubuntu index 15e47daf20..d80087c646 100644 --- a/Dockerfile.ubuntu +++ b/Dockerfile.ubuntu @@ -100,6 +100,8 @@ RUN if [ "$BASE_OS" == "ubuntu24" ] ; then apt-get update && \ apt-get clean && rm -rf /var/lib/apt/lists/* ; fi ENV PIP_BREAK_SYSTEM_PACKAGES=1 RUN apt-get update && apt-get install --no-install-recommends -y \ + espeak-ng \ + libespeak-ng-dev \ libgflags-dev \ bc \ ca-certificates \ @@ -413,7 +415,7 @@ ARG INSTALL_DRIVER_VERSION="24.39.31294" COPY ./install_ubuntu_gpu_drivers.sh /tmp/install_gpu_drivers.sh # hadolint ignore=DL3003,SC2164 RUN apt-get update ; \ - apt-get install -y --no-install-recommends curl ca-certificates libxml2 || exit 1; \ + apt-get install -y --no-install-recommends curl ca-certificates libxml2 espeak-ng espeak-ng-data || exit 1; \ if [ "$GPU" == "1" ] ; then \ /tmp/install_gpu_drivers.sh ; \ fi ; \ diff --git a/src/BUILD b/src/BUILD index 71321ca7ee..0318099727 100644 --- a/src/BUILD +++ b/src/BUILD @@ -563,6 +563,7 @@ ovms_cc_library( "//src/image_gen:image_gen_calculator", "//src/audio/speech_to_text:s2t_calculator", "//src/audio/text_to_speech:t2s_calculator", + "//src/audio/kokoro:kokoro_calculator", "//src/audio:audio_utils", "//src/image_gen:imagegen_init", "//src/llm:openai_completions_api_handler", diff --git a/src/audio/audio_utils.cpp b/src/audio/audio_utils.cpp index 77b38e70df..01daafb351 100644 --- a/src/audio/audio_utils.cpp +++ b/src/audio/audio_utils.cpp @@ -188,3 +188,34 @@ void prepareAudioOutput(void** ppData, size_t& pDataSize, uint16_t bitsPerSample auto outputPreparationTime = (timer.elapsed(OUTPUT_PREPARATION)) / 1000; 
SPDLOG_LOGGER_DEBUG(t2s_calculator_logger, "Output preparation time: {} ms", outputPreparationTime); } + +// Encodes speechSize mono float samples from waveformPtr into an in-memory 24 kHz IEEE-float RIFF/WAV buffer returned through ppData/pDataSize (released by the caller with drwav_free); throws std::runtime_error on failure. +void prepareAudioOutputKokoro(void** ppData, size_t& pDataSize, uint16_t bitsPerSample, size_t speechSize, const float* waveformPtr) { + enum : unsigned int { + OUTPUT_PREPARATION, + TIMER_END + }; + Timer timer; + timer.start(OUTPUT_PREPARATION); + drwav_data_format format; + format.container = drwav_container_riff; + format.format = DR_WAVE_FORMAT_IEEE_FLOAT; + format.channels = 1; + format.sampleRate = 24000; // assume it is always 24 kHz + format.bitsPerSample = bitsPerSample; + drwav wav; + size_t totalSamples = speechSize * format.channels; + + auto status = drwav_init_memory_write(&wav, ppData, &pDataSize, &format, nullptr); + if (status == DRWAV_FALSE) { + throw std::runtime_error("Failed to initialize WAV memory writer"); // writer setup failed; no frames were attempted yet + } + drwav_uint64 framesWritten = drwav_write_pcm_frames(&wav, totalSamples, waveformPtr); + if (framesWritten != totalSamples) { + throw std::runtime_error("Failed to write all frames"); // NOTE(review): this path skips drwav_uninit(&wav) + } + drwav_uninit(&wav); + timer.stop(OUTPUT_PREPARATION); + auto outputPreparationTime = (timer.elapsed(OUTPUT_PREPARATION)) / 1000; + SPDLOG_LOGGER_DEBUG(t2s_calculator_logger, "Output preparation time: {} ms", outputPreparationTime); +} \ No newline at end of file diff --git a/src/audio/audio_utils.hpp b/src/audio/audio_utils.hpp index cbeea8b457..874e83dca4 100644 --- a/src/audio/audio_utils.hpp +++ b/src/audio/audio_utils.hpp @@ -25,3 +25,5 @@ bool isWavBuffer(const std::string buf); std::vector readWav(const std::string_view& wavData); std::vector readMp3(const std::string_view& mp3Data); void prepareAudioOutput(void** ppData, size_t& pDataSize, uint16_t bitsPerSample, size_t speechSize, const float* waveformPtr); +void prepareAudioOutputKokoro(void** ppData, size_t& pDataSize, uint16_t bitsPerSample, size_t speechSize, const float* waveformPtr); + diff --git a/src/audio/kokoro/BUILD b/src/audio/kokoro/BUILD new file mode 100644 index 
0000000000..d7d3b64b1a --- /dev/null +++ b/src/audio/kokoro/BUILD @@ -0,0 +1,60 @@ +# +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +load("@mediapipe//mediapipe/framework/port:build_config.bzl", "mediapipe_cc_proto_library", "mediapipe_proto_library") +load("//:common_settings.bzl", "ovms_cc_library") + +ovms_cc_library( + name = "kokoro_servable", + hdrs = ["kokoro_servable.hpp"], + deps= ["//third_party:openvino", + "//src:libovms_ovinferrequestsqueue", + ], + visibility = ["//visibility:public"], + alwayslink = 1, +) + +ovms_cc_library( + name = "kokoro_calculator", + srcs = ["kokoro_calculator.cc"], + deps = [ + "@mediapipe//mediapipe/framework:calculator_framework", + "//src:httppayload", + "//src:libovmslogging", + "kokoro_calculator_cc_proto", + "//src/port:dr_audio", + "//src/port:rapidjson_stringbuffer", + "//src/port:rapidjson_writer", + ":kokoro_servable", + "//third_party:genai", + "//src/audio:audio_utils", + "//src:executingstreamidguard", + "//src:model_metric_reporter", + "//third_party/espeak_ng:espeak_ng", + ], + visibility = ["//visibility:public"], + alwayslink = 1, +) + +mediapipe_proto_library( + name = "kokoro_calculator_proto", + srcs = ["kokoro_calculator.proto"], + visibility = ["//visibility:private"], + deps = [ + "@mediapipe//mediapipe/framework:calculator_options_proto", + "@mediapipe//mediapipe/framework:calculator_proto", + ], +) diff --git a/src/audio/kokoro/kokoro_calculator.cc 
b/src/audio/kokoro/kokoro_calculator.cc new file mode 100644 index 0000000000..986dd92fab --- /dev/null +++ b/src/audio/kokoro/kokoro_calculator.cc @@ -0,0 +1,320 @@ +//***************************************************************************** +// Copyright 2025 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//***************************************************************************** +#include +#include +#include +#include +#include +#include +#include + +#pragma warning(push) +#pragma warning(disable : 4005 4309 6001 6385 6386 6326 6011 6246 4456 6246) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#include "mediapipe/framework/calculator_framework.h" +#include "mediapipe/framework/port/canonical_errors.h" +#pragma GCC diagnostic pop +#pragma warning(pop) + +#include "src/audio/audio_utils.hpp" +#include "src/http_payload.hpp" +#include "src/logging.hpp" +#include "src/port/dr_audio.hpp" + +#include "../../model_metric_reporter.hpp" +#include "../../executingstreamidguard.hpp" + +#pragma warning(push) +#pragma warning(disable : 6001 4324 6385 6386) +#include "absl/strings/escaping.h" +#include "absl/strings/str_cat.h" +#pragma warning(pop) + +#include + +#include "kokoro_servable.hpp" + +#ifdef _WIN32 +#include +#include +#endif + +using namespace ovms; + +namespace { + +#ifndef espeakPHONEMES_IPA +#define espeakPHONEMES_IPA 0x02 +#endif +#ifndef espeakPHONEMES_NO_STRESS +#define 
espeakPHONEMES_NO_STRESS 0x08 +#endif + +void espeakPhonemizeAll(const std::string& textUtf8, std::string& outIpa, bool noStress = true) { + outIpa.clear(); + auto& espeak = ovms::EspeakInstance::instance(); + if (!espeak.isReady()) { + SPDLOG_ERROR("eSpeak not initialized"); + return; + } + + std::lock_guard guard(espeak.mutex()); + + const int mode = espeakPHONEMES_IPA | (noStress ? espeakPHONEMES_NO_STRESS : 0); + const void* pos = static_cast(textUtf8.c_str()); + const char* endPtr = static_cast(pos) + textUtf8.size(); + std::string rawIpa; + + while (pos && static_cast(pos) < endPtr) { + const char* ipaChunk = espeak_TextToPhonemes(&pos, espeakCHARS_UTF8, mode); + if (ipaChunk && *ipaChunk) { + if (!rawIpa.empty()) { + rawIpa.push_back(' '); + } + rawIpa.append(ipaChunk); + } + } + + // Strip combining diacriticals (U+0300..U+036F) and collapse spaces + std::string cleaned; + for (size_t i = 0; i < rawIpa.size(); ++i) { + unsigned char c = static_cast(rawIpa[i]); + if (i + 1 < rawIpa.size()) { + unsigned char next = static_cast(rawIpa[i + 1]); + if ((c == 0xCC && next >= 0x80) || (c == 0xCD && next <= 0xAF)) { + i++; + continue; + } + } + cleaned.push_back(c); + } + + bool lastSpace = false; + for (char c : cleaned) { + if (std::isspace(static_cast(c))) { + if (!lastSpace) { + outIpa.push_back(' '); + lastSpace = true; + } + } else { + outIpa.push_back(c); + lastSpace = false; + } + } + + if (!outIpa.empty() && std::isspace(static_cast(outIpa.back()))) { + outIpa.pop_back(); + } + + SPDLOG_DEBUG("IPA phonemes: '{}' (length: {})", outIpa, outIpa.size()); +} + +size_t utf8CharLen(unsigned char lead) { + if (lead < 0x80) return 1; + if ((lead >> 5) == 0x6) return 2; + if ((lead >> 4) == 0xE) return 3; + if ((lead >> 3) == 0x1E) return 4; + return 1; +} + +void tokenize(const std::string& textUtf8, + std::vector& tokenIds, + const ovms::VocabIndex& ix) { + tokenIds.clear(); + size_t pos = 0; + const size_t n = textUtf8.size(); + + while (pos < n) { + size_t maxTry 
= std::min(ix.max_token_bytes, n - pos); + int foundId = -1; + size_t foundLen = 0; + + for (size_t len = maxTry; len > 0; --len) { + auto it = ix.by_token.find(std::string(textUtf8.data() + pos, len)); + if (it != ix.by_token.end()) { + foundId = it->second; + foundLen = len; + break; + } + } + + if (foundId >= 0) { + tokenIds.push_back(foundId); + pos += foundLen; + } else { + const unsigned char lead = static_cast(textUtf8[pos]); + const size_t adv = utf8CharLen(lead); + SPDLOG_WARN("Tokenizer: unknown bytes at pos {}: '{}'", + pos, std::string(textUtf8.data() + pos, std::min(adv, n - pos))); + pos += std::min(adv, n - pos); + } + } + SPDLOG_DEBUG("Tokenize: produced {} ids", tokenIds.size()); +} +} // namespace + +namespace mediapipe { + +const std::string KOKORO_SESSION_SIDE_PACKET_TAG = "KOKORO_NODE_RESOURCES"; + +class KokoroCalculator : public CalculatorBase { + static const std::string INPUT_TAG_NAME; + static const std::string OUTPUT_TAG_NAME; + +public: + static absl::Status GetContract(CalculatorContract* cc) { + RET_CHECK(!cc->Inputs().GetTags().empty()); + RET_CHECK(!cc->Outputs().GetTags().empty()); + cc->Inputs().Tag(INPUT_TAG_NAME).Set(); + cc->InputSidePackets().Tag(KOKORO_SESSION_SIDE_PACKET_TAG).Set(); + cc->Outputs().Tag(OUTPUT_TAG_NAME).Set(); + return absl::OkStatus(); + } + + absl::Status Close(CalculatorContext* cc) final { + SPDLOG_LOGGER_DEBUG(kokoro_calculator_logger, "KokoroCalculator [Node: {}] Close", cc->NodeName()); + return absl::OkStatus(); + } + + absl::Status Open(CalculatorContext* cc) final { + SPDLOG_LOGGER_DEBUG(kokoro_calculator_logger, "KokoroCalculator [Node: {}] Open", cc->NodeName()); + return absl::OkStatus(); + } + + absl::Status Process(CalculatorContext* cc) final { + SPDLOG_LOGGER_DEBUG(kokoro_calculator_logger, "KokoroCalculator [Node: {}] Process start", cc->NodeName()); + + KokoroServableMap servablesMap = cc->InputSidePackets() + .Tag(KOKORO_SESSION_SIDE_PACKET_TAG).Get(); + auto servableIt = 
servablesMap.find(cc->NodeName()); + RET_CHECK(servableIt != servablesMap.end()) + << "Could not find initialized Kokoro node named: " << cc->NodeName(); + auto servable = servableIt->second; + + const auto& payload = cc->Inputs().Tag(INPUT_TAG_NAME).Get(); + auto it = payload.parsedJson->FindMember("input"); + RET_CHECK(it != payload.parsedJson->MemberEnd()) << "Missing 'input' in request"; + RET_CHECK(it->value.IsString()) << "'input' must be a string"; + const std::string text = it->value.GetString(); + + // Text -> IPA phonemization + std::string phonemes; + espeakPhonemizeAll(text, phonemes, /*noStress=*/true); + SPDLOG_DEBUG("Input text: '{}', IPA phonemes ({} chars): '{}'", text, phonemes.size(), phonemes); + + // IPA -> Kokoro token IDs + const auto& vocabIx = servable->getVocabIndex(); + std::vector> inputTokens(1); + tokenize(phonemes, inputTokens[0], vocabIx); + + // Prepend PAD token (id=0) - Kokoro model requires BOS/PAD at start + inputTokens[0].insert(inputTokens[0].begin(), 0); + + // Append EOS (period token = 4) if not already present + if (inputTokens[0].empty() || inputTokens[0].back() != 4) { + inputTokens[0].push_back(4); + } + + // Voice embedding + std::vector voice = { + -0.2296, 0.1835, -0.0069, -0.1240, -0.2505, 0.0112, -0.0759, -0.1650, + -0.2665, -0.1965, 0.0242, -0.1667, 0.3524, 0.2140, 0.3069, -0.3377, + -0.0878, -0.0477, 0.0813, -0.2135, -0.2340, -0.1971, 0.0200, 0.0145, + 0.0016, 0.2596, -0.2665, 0.1434, 0.0503, 0.0867, 0.1905, -0.1281, + 0.0658, -0.0639, -0.0920, 0.2444, -0.1506, -0.2197, 0.1385, 0.2133, + -0.0755, -0.0188, -0.0142, 0.2301, -0.0776, -0.0748, 0.0172, 0.0430, + -0.1009, 0.1519, 0.1137, 0.0641, 0.2264, 0.1911, -0.0205, 0.2578, + 0.2210, -0.0784, -0.0235, -0.0547, 0.2191, -0.1623, -0.2416, 0.0076, + 0.0574, 0.2186, 0.0080, 0.0473, 0.0972, 0.0286, 0.1324, 0.0686, + 0.2652, -0.2237, -0.0980, -0.1693, -0.1866, 0.2273, 0.2008, -0.0683, + 0.0957, 0.0623, -0.1891, 0.1620, 0.1811, -0.0516, -0.0800, -0.1416, + -0.2374, 
-0.1892, 0.1726, -0.0690, -0.0300, 0.0467, -0.2811, -0.1603, + 0.0342, -0.1054, -0.0604, -0.0475, -0.0908, -0.1286, 0.1105, -0.1186, + 0.0582, 0.1887, 0.0345, 0.2081, 0.1404, -0.2532, 0.0026, 0.0402, + 0.0812, -0.0512, 0.0128, 0.0084, -0.0970, -0.0362, 0.0036, -0.0720, + -0.0850, 0.0221, -0.1037, 0.0569, 0.0187, -0.0649, -0.0288, -0.1795, + 0.0045, 0.2535, 0.6751, 0.1578, -0.0966, 0.1516, 0.2109, 0.2033, + -0.2155, -0.1783, 0.0836, -0.1050, 0.0676, -0.0237, 0.0387, -0.2564, + 0.1891, 0.1305, -0.3239, -0.1312, 0.2723, 0.0745, 0.1335, 0.0302, + 0.0172, 0.2207, 0.0215, -0.0379, -0.1954, 0.4944, 0.2905, -0.0306, + 0.2858, 0.2341, 0.0545, 0.4626, 0.2947, 0.3802, 0.2820, 0.1557, + 0.1743, -0.1410, 0.0986, 0.4751, -0.2146, 0.3530, -0.2357, -0.5626, + -0.0617, 0.2190, 0.0992, -0.2365, 0.3726, 0.2092, 0.1660, 0.1928, + 0.5731, -0.1734, -0.0816, -0.3191, -0.1871, -0.2217, -0.0112, 0.1261, + 0.1601, 0.3835, 0.0451, -0.1927, -0.1116, 0.2204, -0.0379, -0.0094, + -0.0455, -0.4831, -0.3345, -0.2119, 0.4803, 0.1214, 0.1723, 0.2605, + 0.0051, -0.2587, 0.0511, -0.1318, 0.0227, -0.0645, 0.2573, -0.0205, + 0.0665, -0.3562, -0.6070, 0.4191, 0.0351, 0.2033, -0.5508, -0.1415, + -0.1249, -0.0986, -0.1120, -0.1187, 0.0600, 0.1974, 0.5017, -0.0247, + -0.2986, 0.3983, -0.1159, -0.4275, -0.0164, -0.3783, 0.0717, 0.1478, + -0.1144, 0.2292, 0.2741, 0.4309, -0.1611, 0.0755, -0.0981, 0.4584, + -0.2061, -0.0787, -0.1779, 0.2275, -0.1742, -0.2230, -0.1739, 0.0646 + }; + + auto& ids = inputTokens[0]; + + auto inputIdsTensor = ov::Tensor{ov::element::i64, ov::Shape{1, ids.size()}}; + auto refS = ov::Tensor{ov::element::f32, ov::Shape{1, voice.size()}}; + auto speed = ov::Tensor{ov::element::f32, ov::Shape{1}}; + + *reinterpret_cast(speed.data()) = 0.5f; + std::copy(ids.data(), ids.data() + ids.size(), + reinterpret_cast(inputIdsTensor.data())); + std::copy(voice.data(), voice.data() + voice.size(), + reinterpret_cast(refS.data())); + + // Inference + ModelMetricReporter unused(nullptr, nullptr, 
"unused", 1); + auto executingStreamIdGuard = + std::make_unique(servable->getInferRequestsQueue(), unused); + ov::InferRequest& inferRequest = executingStreamIdGuard->getInferRequest(); + + inferRequest.set_tensor("input_ids", inputIdsTensor); + inferRequest.set_tensor("103", refS); + inferRequest.set_tensor("speed", speed); + inferRequest.start_async(); + inferRequest.wait(); + + // Collect audio output + auto out = inferRequest.get_tensor(inferRequest.get_compiled_model().outputs()[0]); + RET_CHECK(out.get_shape().size() == 1); + RET_CHECK(out.get_element_type() == ov::element::f32); + const size_t samples = out.get_shape()[0]; + const float* data = out.data(); + + SPDLOG_DEBUG("Model output: {} audio samples ({:.2f}s at 24kHz)", + samples, static_cast(samples) / 24000.0f); + + void* wavDataPtr = nullptr; + size_t wavSize = 0; + prepareAudioOutputKokoro(&wavDataPtr, wavSize, 32, samples, data); + + auto output = std::make_unique(reinterpret_cast(wavDataPtr), wavSize); + drwav_free(wavDataPtr, NULL); + + cc->Outputs().Tag(OUTPUT_TAG_NAME).Add(output.release(), cc->InputTimestamp()); + SPDLOG_LOGGER_DEBUG(kokoro_calculator_logger, "KokoroCalculator [Node: {}] Process end", cc->NodeName()); + return absl::OkStatus(); + } +}; + +const std::string KokoroCalculator::INPUT_TAG_NAME{"HTTP_REQUEST_PAYLOAD"}; +const std::string KokoroCalculator::OUTPUT_TAG_NAME{"HTTP_RESPONSE_PAYLOAD"}; + +REGISTER_CALCULATOR(KokoroCalculator); + +} // namespace mediapipe diff --git a/src/audio/kokoro/kokoro_calculator.proto b/src/audio/kokoro/kokoro_calculator.proto new file mode 100644 index 0000000000..d9fc1b4bd9 --- /dev/null +++ b/src/audio/kokoro/kokoro_calculator.proto @@ -0,0 +1,33 @@ +//***************************************************************************** +// Copyright 2025 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//***************************************************************************** + +syntax = "proto2"; +package mediapipe; + +import "mediapipe/framework/calculator.proto"; + + +message KokoroCalculatorOptions { + extend mediapipe.CalculatorOptions { + // https://github.com/google/mediapipe/issues/634 have to be unique in app + // no rule to obtain this + optional KokoroCalculatorOptions ext = 116423799; + } + + required string models_path = 1; + optional string target_device = 2; + optional string plugin_config = 3; +} diff --git a/src/audio/kokoro/kokoro_servable.hpp b/src/audio/kokoro/kokoro_servable.hpp new file mode 100644 index 0000000000..3e42bd0db4 --- /dev/null +++ b/src/audio/kokoro/kokoro_servable.hpp @@ -0,0 +1,197 @@ +//***************************************************************************** +// Copyright 2025 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+//***************************************************************************** +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#pragma warning(push) +#pragma warning(disable : 4005 4309 6001 6385 6386 6326 6011 4005 4456 6246) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#include "mediapipe/framework/calculator_graph.h" +#pragma GCC diagnostic pop +#pragma warning(pop) + +#include "openvino/runtime/core.hpp" +#include "../../ovinferrequestsqueue.hpp" + +#include +#include + +#include "src/audio/kokoro/kokoro_calculator.pb.h" +#include "src/logging.hpp" + +namespace ovms { + +struct VocabIndex { + std::unordered_map by_token; + size_t max_token_bytes = 1; +}; + +class EspeakInstance { +public: + static EspeakInstance& instance() { + static EspeakInstance inst; + return inst; + } + + bool isReady() const { return ready_; } + std::mutex& mutex() { return mutex_; } + +private: + EspeakInstance() { + ready_ = tryInit(); + if (!ready_) { + SPDLOG_ERROR("eSpeak-NG initialization failed (data path or voice not found)"); + } else { + SPDLOG_INFO("eSpeak-NG initialized successfully"); + } + } + + ~EspeakInstance() { + if (ready_) { + espeak_Terminate(); + } + } + + EspeakInstance(const EspeakInstance&) = delete; + EspeakInstance& operator=(const EspeakInstance&) = delete; + + bool tryInit() { + auto try_path = [](const char* path) -> bool { + int sr = espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS, + 0, path, + espeakINITIALIZE_DONT_EXIT); + if (sr <= 0) return false; + if (espeak_SetVoiceByName("en") != EE_OK && + espeak_SetVoiceByName("en-us") != EE_OK) { + return false; + } + return true; + }; + + if (try_path(nullptr)) return true; + + static const char* ngPaths[] = { + "/usr/share/espeak-ng-data", + "/opt/homebrew/share/espeak-ng-data", + "/usr/local/share/espeak-ng-data", + "espeak-ng-data", + nullptr + }; + for (int i = 0; ngPaths[i]; ++i) + if (try_path(ngPaths[i])) return true; + + 
static const char* esPaths[] = { + "/usr/share/espeak-data", + "/usr/local/share/espeak-data", + "espeak-data", + nullptr + }; + for (int i = 0; esPaths[i]; ++i) + if (try_path(esPaths[i])) return true; + + return false; + } + + bool ready_ = false; + std::mutex mutex_; +}; + +struct KokoroServable { + std::filesystem::path parsedModelsPath; + std::shared_ptr model; + ov::CompiledModel compiledModel; + std::unique_ptr inferRequestsQueue; + VocabIndex vocabIndex; + + KokoroServable(const std::string& modelDir, const std::string& targetDevice, const std::string& graphPath) { + EspeakInstance::instance(); + + auto fsModelsPath = std::filesystem::path(modelDir); + if (fsModelsPath.is_relative()) { + parsedModelsPath = (std::filesystem::path(graphPath) / fsModelsPath); + } else { + parsedModelsPath = fsModelsPath; + } + + vocabIndex = loadVocabFromConfig(parsedModelsPath); + + ov::AnyMap properties; + ov::Core core; + auto m_model = core.read_model(parsedModelsPath / std::filesystem::path("openvino_model.xml"), {}, properties); + compiledModel = core.compile_model(m_model, targetDevice, properties); + inferRequestsQueue = std::make_unique(compiledModel, 5); + } + + OVInferRequestsQueue& getInferRequestsQueue() { + return *inferRequestsQueue; + } + + const VocabIndex& getVocabIndex() const { + return vocabIndex; + } + +private: + static VocabIndex loadVocabFromConfig(const std::filesystem::path& modelDir) { + VocabIndex ix; + auto configPath = modelDir / "config.json"; + std::ifstream ifs(configPath); + if (!ifs.is_open()) { + SPDLOG_ERROR("Failed to open Kokoro config: {}", configPath.string()); + return ix; + } + + std::stringstream buffer; + buffer << ifs.rdbuf(); + std::string jsonStr = buffer.str(); + + rapidjson::Document doc; + doc.Parse(jsonStr.c_str()); + if (doc.HasParseError()) { + SPDLOG_ERROR("Failed to parse Kokoro config JSON: {}", configPath.string()); + return ix; + } + + if (!doc.HasMember("vocab") || !doc["vocab"].IsObject()) { + SPDLOG_ERROR("Kokoro 
config missing 'vocab' object: {}", configPath.string()); + return ix; + } + + const auto& vocab = doc["vocab"]; + ix.by_token.reserve(vocab.MemberCount()); + for (auto it = vocab.MemberBegin(); it != vocab.MemberEnd(); ++it) { + if (!it->name.IsString() || !it->value.IsInt()) continue; + std::string token = it->name.GetString(); + int id = it->value.GetInt(); + ix.by_token.emplace(token, id); + ix.max_token_bytes = std::max(ix.max_token_bytes, token.size()); + } + + SPDLOG_INFO("Loaded Kokoro vocabulary: {} tokens, max_token_bytes={}", + ix.by_token.size(), ix.max_token_bytes); + return ix; + } +}; + +using KokoroServableMap = std::unordered_map>; +} // namespace ovms diff --git a/src/logging.cpp b/src/logging.cpp index e89fce9a07..9d058d82dc 100644 --- a/src/logging.cpp +++ b/src/logging.cpp @@ -35,6 +35,7 @@ std::shared_ptr llm_executor_logger = std::make_shared llm_calculator_logger = std::make_shared("llm_calculator"); std::shared_ptr s2t_calculator_logger = std::make_shared("s2t_calculator"); std::shared_ptr t2s_calculator_logger = std::make_shared("t2s_calculator"); +std::shared_ptr kokoro_calculator_logger = std::make_shared("kokoro_calculator"); std::shared_ptr embeddings_calculator_logger = std::make_shared("embeddings_calculator"); std::shared_ptr rerank_calculator_logger = std::make_shared("rerank_calculator"); #endif @@ -78,6 +79,7 @@ static void register_loggers(const std::string& log_level, std::vectorset_pattern(default_pattern); s2t_calculator_logger->set_pattern(default_pattern); t2s_calculator_logger->set_pattern(default_pattern); + kokoro_calculator_logger->set_pattern(default_pattern); rerank_calculator_logger->set_pattern(default_pattern); embeddings_calculator_logger->set_pattern(default_pattern); #endif @@ -98,6 +100,7 @@ static void register_loggers(const std::string& log_level, std::vectorsinks().push_back(sink); s2t_calculator_logger->sinks().push_back(sink); t2s_calculator_logger->sinks().push_back(sink); + 
kokoro_calculator_logger->sinks().push_back(sink); rerank_calculator_logger->sinks().push_back(sink); embeddings_calculator_logger->sinks().push_back(sink); #endif @@ -119,6 +122,7 @@ static void register_loggers(const std::string& log_level, std::vector llm_executor_logger; extern std::shared_ptr llm_calculator_logger; extern std::shared_ptr s2t_calculator_logger; extern std::shared_ptr t2s_calculator_logger; +extern std::shared_ptr kokoro_calculator_logger; extern std::shared_ptr embeddings_calculator_logger; extern std::shared_ptr rerank_calculator_logger; #endif diff --git a/src/mediapipe_internal/mediapipegraphdefinition.cpp b/src/mediapipe_internal/mediapipegraphdefinition.cpp index 9047765e75..e1436b5891 100644 --- a/src/mediapipe_internal/mediapipegraphdefinition.cpp +++ b/src/mediapipe_internal/mediapipegraphdefinition.cpp @@ -63,6 +63,7 @@ const std::string MediapipeGraphDefinition::LLM_NODE_CALCULATOR_NAME{"LLMCalcula const std::string MediapipeGraphDefinition::IMAGE_GEN_CALCULATOR_NAME{"ImageGenCalculator"}; const std::string MediapipeGraphDefinition::STT_NODE_CALCULATOR_NAME{"S2tCalculator"}; const std::string MediapipeGraphDefinition::TTS_NODE_CALCULATOR_NAME{"T2sCalculator"}; +const std::string MediapipeGraphDefinition::KOKORO_NODE_CALCULATOR_NAME{"KokoroCalculator"}; const std::string MediapipeGraphDefinition::EMBEDDINGS_NODE_CALCULATOR_NAME{"EmbeddingsCalculatorOV"}; const std::string MediapipeGraphDefinition::RERANK_NODE_CALCULATOR_NAME{"RerankCalculatorOV"}; @@ -625,6 +626,28 @@ Status MediapipeGraphDefinition::initializeNodes() { return StatusCode::MEDIAPIPE_GRAPH_CONFIG_FILE_INVALID; } } + if (endsWith(config.node(i).calculator(), KOKORO_NODE_CALCULATOR_NAME)) { + auto& kokoroServableMap = this->sidePacketMaps.kokoroServableMap; + ResourcesCleaningGuard kokoroServablesCleaningGuard(kokoroServableMap); + if (!config.node(i).node_options().size()) { + SPDLOG_LOGGER_ERROR(modelmanager_logger, "Kokoro node missing options in graph: {}. 
", this->name); + return StatusCode::LLM_NODE_MISSING_OPTIONS; + } + if (config.node(i).name().empty()) { + SPDLOG_LOGGER_ERROR(modelmanager_logger, "Kokoro node name is missing in graph: {}. ", this->name); + return StatusCode::LLM_NODE_MISSING_NAME; + } + std::string nodeName = config.node(i).name(); + if (kokoroServableMap.find(nodeName) != kokoroServableMap.end()) { + SPDLOG_LOGGER_ERROR(modelmanager_logger, "Kokoro node name: {} already used in graph: {}. ", nodeName, this->name); + return StatusCode::LLM_NODE_NAME_ALREADY_EXISTS; + } + mediapipe::KokoroCalculatorOptions nodeOptions; + config.node(i).node_options(0).UnpackTo(&nodeOptions); + std::shared_ptr servable = std::make_shared(nodeOptions.models_path(), nodeOptions.target_device(), mgconfig.getBasePath()); + kokoroServableMap.insert(std::pair>(nodeName, std::move(servable))); + kokoroServablesCleaningGuard.disableCleaning(); + } } return StatusCode::OK; } diff --git a/src/mediapipe_internal/mediapipegraphdefinition.hpp b/src/mediapipe_internal/mediapipegraphdefinition.hpp index 14c9e0679f..1067ca7d42 100644 --- a/src/mediapipe_internal/mediapipegraphdefinition.hpp +++ b/src/mediapipe_internal/mediapipegraphdefinition.hpp @@ -48,6 +48,7 @@ #include "../rerank/rerank_servable.hpp" #include "../audio/speech_to_text/s2t_servable.hpp" #include "../audio/text_to_speech/t2s_servable.hpp" +#include "../audio/kokoro/kokoro_servable.hpp" namespace ovms { class MediapipeGraphDefinitionUnloadGuard; @@ -66,6 +67,7 @@ using GenAiServableMap = std::unordered_map>; using SttServableMap = std::unordered_map>; using TtsServableMap = std::unordered_map>; +using KokoroServableMap = std::unordered_map>; using EmbeddingsServableMap = std::unordered_map>; using ImageGenerationPipelinesMap = std::unordered_map>; @@ -77,6 +79,7 @@ struct GraphSidePackets { RerankServableMap rerankServableMap; SttServableMap sttServableMap; TtsServableMap ttsServableMap; + KokoroServableMap kokoroServableMap; void clear() { 
pythonNodeResourcesMap.clear(); genAiServableMap.clear(); @@ -85,6 +88,7 @@ struct GraphSidePackets { rerankServableMap.clear(); sttServableMap.clear(); ttsServableMap.clear(); + kokoroServableMap.clear(); } bool empty() { return (pythonNodeResourcesMap.empty() && @@ -93,7 +97,8 @@ struct GraphSidePackets { embeddingsServableMap.empty() && rerankServableMap.empty() && sttServableMap.empty() && - ttsServableMap.empty()); + ttsServableMap.empty() && + kokoroServableMap.empty()); } }; @@ -136,6 +141,7 @@ class MediapipeGraphDefinition { static const std::string RERANK_NODE_CALCULATOR_NAME; static const std::string STT_NODE_CALCULATOR_NAME; static const std::string TTS_NODE_CALCULATOR_NAME; + static const std::string KOKORO_NODE_CALCULATOR_NAME; Status waitForLoaded(std::unique_ptr& unloadGuard, const uint32_t waitForLoadedTimeoutMicroseconds = WAIT_FOR_LOADED_DEFAULT_TIMEOUT_MICROSECONDS); // Pipelines are not versioned and any available definition has constant version equal 1. diff --git a/src/mediapipe_internal/mediapipegraphexecutor.cpp b/src/mediapipe_internal/mediapipegraphexecutor.cpp index 93b53fdf8e..b2016ac3aa 100644 --- a/src/mediapipe_internal/mediapipegraphexecutor.cpp +++ b/src/mediapipe_internal/mediapipegraphexecutor.cpp @@ -49,6 +49,7 @@ MediapipeGraphExecutor::MediapipeGraphExecutor( const RerankServableMap& rerankServableMap, const SttServableMap& sttServableMap, const TtsServableMap& ttsServableMap, + const KokoroServableMap& kokoroServableMap, PythonBackend* pythonBackend, MediapipeServableMetricReporter* mediapipeServableMetricReporter) : name(name), @@ -58,7 +59,7 @@ MediapipeGraphExecutor::MediapipeGraphExecutor( outputTypes(std::move(outputTypes)), inputNames(std::move(inputNames)), outputNames(std::move(outputNames)), - sidePacketMaps({pythonNodeResourcesMap, llmNodeResourcesMap, {}, embeddingsServableMap, rerankServableMap, sttServableMap, ttsServableMap}), + sidePacketMaps({pythonNodeResourcesMap, llmNodeResourcesMap, {}, 
embeddingsServableMap, rerankServableMap, sttServableMap, ttsServableMap, kokoroServableMap}), pythonBackend(pythonBackend), currentStreamTimestamp(STARTING_TIMESTAMP), mediapipeServableMetricReporter(mediapipeServableMetricReporter) {} @@ -92,6 +93,7 @@ const std::string MediapipeGraphExecutor::EMBEDDINGS_SESSION_SIDE_PACKET_TAG = " const std::string MediapipeGraphExecutor::RERANK_SESSION_SIDE_PACKET_TAG = "rerank_servable"; const std::string MediapipeGraphExecutor::STT_SESSION_SIDE_PACKET_TAG = "s2t_servable"; const std::string MediapipeGraphExecutor::TTS_SESSION_SIDE_PACKET_TAG = "t2s_servable"; +const std::string MediapipeGraphExecutor::KOKORO_SESSION_SIDE_PACKET_TAG = "kokoro_servable"; const ::mediapipe::Timestamp MediapipeGraphExecutor::STARTING_TIMESTAMP = ::mediapipe::Timestamp(0); } // namespace ovms diff --git a/src/mediapipe_internal/mediapipegraphexecutor.hpp b/src/mediapipe_internal/mediapipegraphexecutor.hpp index c165469395..af2e8d08e6 100644 --- a/src/mediapipe_internal/mediapipegraphexecutor.hpp +++ b/src/mediapipe_internal/mediapipegraphexecutor.hpp @@ -95,6 +95,7 @@ class MediapipeGraphExecutor { static const std::string RERANK_SESSION_SIDE_PACKET_TAG; static const std::string STT_SESSION_SIDE_PACKET_TAG; static const std::string TTS_SESSION_SIDE_PACKET_TAG; + static const std::string KOKORO_SESSION_SIDE_PACKET_TAG; static const ::mediapipe::Timestamp STARTING_TIMESTAMP; MediapipeGraphExecutor(const std::string& name, const std::string& version, const ::mediapipe::CalculatorGraphConfig& config, @@ -107,6 +108,7 @@ class MediapipeGraphExecutor { const RerankServableMap& rerankServableMap, const SttServableMap& sttServableMap, const TtsServableMap& ttsServableMap, + const KokoroServableMap& kokoroServableMap, PythonBackend* pythonBackend, MediapipeServableMetricReporter* mediapipeServableMetricReporter); MediapipeGraphExecutor(const std::string& name, const std::string& version, const ::mediapipe::CalculatorGraphConfig& config, @@ -157,6 +159,7 @@ 
class MediapipeGraphExecutor { inputSidePackets[RERANK_SESSION_SIDE_PACKET_TAG] = mediapipe::MakePacket(this->sidePacketMaps.rerankServableMap).At(STARTING_TIMESTAMP); inputSidePackets[STT_SESSION_SIDE_PACKET_TAG] = mediapipe::MakePacket(this->sidePacketMaps.sttServableMap).At(STARTING_TIMESTAMP); inputSidePackets[TTS_SESSION_SIDE_PACKET_TAG] = mediapipe::MakePacket(this->sidePacketMaps.ttsServableMap).At(STARTING_TIMESTAMP); + inputSidePackets[KOKORO_SESSION_SIDE_PACKET_TAG] = mediapipe::MakePacket(this->sidePacketMaps.kokoroServableMap).At(STARTING_TIMESTAMP); MP_RETURN_ON_FAIL(graph.StartRun(inputSidePackets), std::string("start MediaPipe graph: ") + this->name, StatusCode::MEDIAPIPE_GRAPH_START_ERROR);