From 5abb01aa34aec3fbf53825aa7d8748a2da7ce2b4 Mon Sep 17 00:00:00 2001
From: Michal Kulakowski <michal.kulakowski@intel.com>
Date: Wed, 17 Dec 2025 17:24:27 +0100
Subject: [PATCH] Initial support of Kokoro model

---
 Dockerfile.redhat                             |   5 +-
 Dockerfile.ubuntu                             |   4 +-
 src/BUILD                                     |   1 +
 src/audio/audio_utils.cpp                     |  31 ++
 src/audio/audio_utils.hpp                     |   2 +
 src/audio/kokoro/BUILD                        |  60 ++++
 src/audio/kokoro/kokoro_calculator.cc         | 320 ++++++++++++++++++
 src/audio/kokoro/kokoro_calculator.proto      |  33 ++
 src/audio/kokoro/kokoro_servable.hpp          | 197 +++++++++++
 src/logging.cpp                               |   4 +
 src/logging.hpp                               |   1 +
 .../mediapipegraphdefinition.cpp              |  23 ++
 .../mediapipegraphdefinition.hpp              |   8 +-
 .../mediapipegraphexecutor.cpp                |   4 +-
 .../mediapipegraphexecutor.hpp                |   3 +
 15 files changed, 692 insertions(+), 4 deletions(-)
 create mode 100644 src/audio/kokoro/BUILD
 create mode 100644 src/audio/kokoro/kokoro_calculator.cc
 create mode 100644 src/audio/kokoro/kokoro_calculator.proto
 create mode 100644 src/audio/kokoro/kokoro_servable.hpp

diff --git a/Dockerfile.redhat b/Dockerfile.redhat
index bc574eaaf2..41e02ecc12 100644
--- a/Dockerfile.redhat
+++ b/Dockerfile.redhat
@@ -127,7 +127,9 @@ RUN dnf install -y -d6 \
             python3.12 \
             python3.12-devel \
             python3.12-pip \
-            libicu-devel && \
+            libicu-devel \
+            espeak-ng \
+            espeak-ng-devel && \
             dnf clean all
 
 WORKDIR /
@@ -416,6 +418,7 @@ RUN if [ -f /usr/bin/dnf ] ; then export DNF_TOOL=dnf ; echo -e "max_parallel_do
     if ! [[ $debug_bazel_flags == *"py_off"* ]]; then \
         $DNF_TOOL install -y python3.12-libs --setopt=install_weak_deps=0 --nodocs; \
     fi ; \
+    $DNF_TOOL install -y espeak-ng --setopt=install_weak_deps=0 --nodocs; \
     $DNF_TOOL install -y shadow-utils; \
     $DNF_TOOL clean all ; \
     cp -v /etc/ssl/certs/ca-bundle.crt /etc/ssl/certs/ca-certificates.crt ; \
diff --git a/Dockerfile.ubuntu b/Dockerfile.ubuntu
index 15e47daf20..d80087c646 100644
--- a/Dockerfile.ubuntu
+++ b/Dockerfile.ubuntu
@@ -100,6 +100,8 @@ RUN if [ "$BASE_OS" == "ubuntu24" ] ; then apt-get update && \
     apt-get clean && rm -rf /var/lib/apt/lists/* ; fi
 ENV PIP_BREAK_SYSTEM_PACKAGES=1
 RUN apt-get update && apt-get install --no-install-recommends -y \
+            espeak-ng \
+            libespeak-ng-dev \
             libgflags-dev \
             bc \
             ca-certificates \
@@ -413,7 +415,7 @@ ARG INSTALL_DRIVER_VERSION="24.39.31294"
 COPY ./install_ubuntu_gpu_drivers.sh /tmp/install_gpu_drivers.sh
 # hadolint ignore=DL3003,SC2164
 RUN apt-get update ; \
-    apt-get install -y --no-install-recommends curl ca-certificates libxml2 || exit 1; \
+    apt-get install -y --no-install-recommends curl ca-certificates libxml2 espeak-ng espeak-ng-data || exit 1; \
     if [ "$GPU" == "1" ] ; then \
 	/tmp/install_gpu_drivers.sh ; \
     fi ; \
diff --git a/src/BUILD b/src/BUILD
index 71321ca7ee..0318099727 100644
--- a/src/BUILD
+++ b/src/BUILD
@@ -563,6 +563,7 @@ ovms_cc_library(
                 "//src/image_gen:image_gen_calculator",
                 "//src/audio/speech_to_text:s2t_calculator",
                 "//src/audio/text_to_speech:t2s_calculator",
+                "//src/audio/kokoro:kokoro_calculator",
                 "//src/audio:audio_utils",
                 "//src/image_gen:imagegen_init",
                 "//src/llm:openai_completions_api_handler",
diff --git a/src/audio/audio_utils.cpp b/src/audio/audio_utils.cpp
index 77b38e70df..01daafb351 100644
--- a/src/audio/audio_utils.cpp
+++ b/src/audio/audio_utils.cpp
@@ -188,3 +188,34 @@ void prepareAudioOutput(void** ppData, size_t& pDataSize, uint16_t bitsPerSample
     auto outputPreparationTime = (timer.elapsed<std::chrono::microseconds>(OUTPUT_PREPARATION)) / 1000;
     SPDLOG_LOGGER_DEBUG(t2s_calculator_logger, "Output preparation time: {} ms", outputPreparationTime);
 }
+
+
+// Encodes a mono 24 kHz IEEE-float waveform as an in-memory RIFF/WAV file; on success the caller owns *ppData
+// (pDataSize bytes) and releases it with drwav_free(). Throws std::runtime_error on failure, leaving no buffer behind.
+void prepareAudioOutputKokoro(void** ppData, size_t& pDataSize, uint16_t bitsPerSample, size_t speechSize, const float* waveformPtr) {
+    enum : unsigned int { OUTPUT_PREPARATION, TIMER_END };
+    Timer<TIMER_END> timer;
+    timer.start(OUTPUT_PREPARATION);
+    drwav_data_format format;
+    format.container = drwav_container_riff;
+    format.format = DR_WAVE_FORMAT_IEEE_FLOAT;
+    format.channels = 1;
+    format.sampleRate = 24000;  // assume it is always 24 KHz
+    format.bitsPerSample = bitsPerSample;
+    drwav wav;
+    size_t totalSamples = speechSize * format.channels;
+    auto status = drwav_init_memory_write(&wav, ppData, &pDataSize, &format, nullptr);
+    if (status == DRWAV_FALSE) {
+        throw std::runtime_error("Failed to initialize in-memory WAV writer");
+    }
+    drwav_uint64 framesWritten = drwav_write_pcm_frames(&wav, totalSamples, waveformPtr);
+    drwav_uninit(&wav);  // always finalize the writer, even after a short write
+    if (framesWritten != totalSamples) {
+        drwav_free(*ppData, nullptr);  // the partial buffer would otherwise leak once we throw
+        *ppData = nullptr;
+        throw std::runtime_error("Failed to write all frames");
+    }
+    timer.stop(OUTPUT_PREPARATION);
+    auto outputPreparationTime = (timer.elapsed<std::chrono::microseconds>(OUTPUT_PREPARATION)) / 1000;
+    SPDLOG_LOGGER_DEBUG(t2s_calculator_logger, "Output preparation time: {} ms", outputPreparationTime);
+}
\ No newline at end of file
diff --git a/src/audio/audio_utils.hpp b/src/audio/audio_utils.hpp
index cbeea8b457..874e83dca4 100644
--- a/src/audio/audio_utils.hpp
+++ b/src/audio/audio_utils.hpp
@@ -25,3 +25,5 @@ bool isWavBuffer(const std::string buf);
 std::vector<float> readWav(const std::string_view& wavData);
 std::vector<float> readMp3(const std::string_view& mp3Data);
 void prepareAudioOutput(void** ppData, size_t& pDataSize, uint16_t bitsPerSample, size_t speechSize, const float* waveformPtr);
+void prepareAudioOutputKokoro(void** ppData, size_t& pDataSize, uint16_t bitsPerSample, size_t speechSize, const float* waveformPtr);
+
diff --git a/src/audio/kokoro/BUILD b/src/audio/kokoro/BUILD
new file mode 100644
index 0000000000..d7d3b64b1a
--- /dev/null
+++ b/src/audio/kokoro/BUILD
@@ -0,0 +1,60 @@
+#
+# Copyright (c) 2025 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+load("@mediapipe//mediapipe/framework/port:build_config.bzl", "mediapipe_cc_proto_library", "mediapipe_proto_library")
+load("//:common_settings.bzl", "ovms_cc_library")
+# Header-only library (hdrs only, no srcs) that exposes kokoro_servable.hpp.
+ovms_cc_library(
+    name = "kokoro_servable",
+    hdrs = ["kokoro_servable.hpp"],
+    deps= ["//third_party:openvino",  # NOTE(review): the header also includes espeak-ng, rapidjson, logging and kokoro_calculator.pb.h; confirm those reach it via dependents
+    "//src:libovms_ovinferrequestsqueue",
+    ],
+    visibility = ["//visibility:public"],
+    alwayslink = 1,
+)
+# Library built from kokoro_calculator.cc; listed as a dependency in src/BUILD.
+ovms_cc_library(
+    name = "kokoro_calculator",
+    srcs = ["kokoro_calculator.cc"],
+    deps = [
+        "@mediapipe//mediapipe/framework:calculator_framework",
+        "//src:httppayload",
+        "//src:libovmslogging",
+        "kokoro_calculator_cc_proto",  # no rule with this exact name in this file; presumably generated by mediapipe_proto_library below - TODO confirm
+        "//src/port:dr_audio",
+        "//src/port:rapidjson_stringbuffer",
+        "//src/port:rapidjson_writer",
+        ":kokoro_servable",
+        "//third_party:genai",
+        "//src/audio:audio_utils",
+        "//src:executingstreamidguard",
+        "//src:model_metric_reporter",
+        "//third_party/espeak_ng:espeak_ng",
+    ],
+    visibility = ["//visibility:public"],
+    alwayslink = 1,
+)
+# Proto targets for kokoro_calculator.proto (the calculator's options message).
+mediapipe_proto_library(
+    name = "kokoro_calculator_proto",
+    srcs = ["kokoro_calculator.proto"],
+    visibility = ["//visibility:private"],
+    deps = [
+        "@mediapipe//mediapipe/framework:calculator_options_proto",
+        "@mediapipe//mediapipe/framework:calculator_proto",
+    ],
+)
diff --git a/src/audio/kokoro/kokoro_calculator.cc b/src/audio/kokoro/kokoro_calculator.cc
new file mode 100644
index 0000000000..986dd92fab
--- /dev/null
+++ b/src/audio/kokoro/kokoro_calculator.cc
@@ -0,0 +1,320 @@
+//*****************************************************************************
+// Copyright 2025 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//*****************************************************************************
+#include <algorithm>
+#include <cstdint>
+#include <fstream>
+#include <mutex>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#pragma warning(push)
+#pragma warning(disable : 4005 4309 6001 6385 6386 6326 6011 6246 4456 6246)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+#include "mediapipe/framework/calculator_framework.h"
+#include "mediapipe/framework/port/canonical_errors.h"
+#pragma GCC diagnostic pop
+#pragma warning(pop)
+
+#include "src/audio/audio_utils.hpp"
+#include "src/http_payload.hpp"
+#include "src/logging.hpp"
+#include "src/port/dr_audio.hpp"
+
+#include "../../model_metric_reporter.hpp"
+#include "../../executingstreamidguard.hpp"
+
+#pragma warning(push)
+#pragma warning(disable : 6001 4324 6385 6386)
+#include "absl/strings/escaping.h"
+#include "absl/strings/str_cat.h"
+#pragma warning(pop)
+
+#include <espeak-ng/speak_lib.h>
+
+#include "kokoro_servable.hpp"
+
+#ifdef _WIN32
+#include <fcntl.h>
+#include <io.h>
+#endif
+
+using namespace ovms;
+
+namespace {
+
+#ifndef espeakPHONEMES_IPA
+#define espeakPHONEMES_IPA 0x02
+#endif
+#ifndef espeakPHONEMES_NO_STRESS
+#define espeakPHONEMES_NO_STRESS 0x08
+#endif
+// Converts UTF-8 text into a space-separated IPA phoneme string via eSpeak; outIpa stays empty if eSpeak is not ready.
+void espeakPhonemizeAll(const std::string& textUtf8, std::string& outIpa, bool noStress = true) {
+    outIpa.clear();
+    auto& espeak = ovms::EspeakInstance::instance();
+    if (!espeak.isReady()) {
+        SPDLOG_ERROR("eSpeak not initialized");
+        return;
+    }
+
+    std::lock_guard<std::mutex> guard(espeak.mutex());
+    // NOTE(review): espeakPHONEMES_NO_STRESS falls back to a locally defined 0x08 - confirm eSpeak interprets that bit as intended.
+    const int mode = espeakPHONEMES_IPA | (noStress ? espeakPHONEMES_NO_STRESS : 0);
+    const void* pos = static_cast<const void*>(textUtf8.c_str());
+    const char* endPtr = static_cast<const char*>(pos) + textUtf8.size();
+    std::string rawIpa;
+    // Each call is handed &pos; the loop ends once pos is null or reaches endPtr. Chunks are joined with one space.
+    while (pos && static_cast<const char*>(pos) < endPtr) {
+        const char* ipaChunk = espeak_TextToPhonemes(&pos, espeakCHARS_UTF8, mode);
+        if (ipaChunk && *ipaChunk) {
+            if (!rawIpa.empty()) {
+                rawIpa.push_back(' ');
+            }
+            rawIpa.append(ipaChunk);
+        }
+    }
+    // Strip combining diacriticals (U+0300..U+036F) and collapse spaces.
+    // In UTF-8 that range is CC 80..BF plus CD 80..AF; the second byte must be a real continuation byte.
+    std::string cleaned;
+    for (size_t i = 0; i < rawIpa.size(); ++i) {
+        unsigned char c = static_cast<unsigned char>(rawIpa[i]);
+        if (i + 1 < rawIpa.size()) {
+            unsigned char next = static_cast<unsigned char>(rawIpa[i + 1]);
+            if (next >= 0x80 && ((c == 0xCC && next <= 0xBF) || (c == 0xCD && next <= 0xAF))) {
+                i++;
+                continue;
+            }
+        }
+        cleaned.push_back(c);
+    }
+
+    bool lastSpace = false;
+    for (char c : cleaned) {
+        if (std::isspace(static_cast<unsigned char>(c))) {
+            if (!lastSpace) {
+                outIpa.push_back(' ');
+                lastSpace = true;
+            }
+        } else {
+            outIpa.push_back(c);
+            lastSpace = false;
+        }
+    }
+
+    if (!outIpa.empty() && std::isspace(static_cast<unsigned char>(outIpa.back()))) {
+        outIpa.pop_back();
+    }
+
+    SPDLOG_DEBUG("IPA phonemes: '{}' (length: {})", outIpa, outIpa.size());
+}
+// Returns the byte length of a UTF-8 sequence, judged from its lead byte alone.
+size_t utf8CharLen(unsigned char lead) {
+    if (lead < 0x80) return 1;  // 0xxxxxxx: single-byte (ASCII)
+    if ((lead >> 5) == 0x6) return 2;  // 110xxxxx
+    if ((lead >> 4) == 0xE) return 3;  // 1110xxxx
+    if ((lead >> 3) == 0x1E) return 4;  // 11110xxx
+    return 1;  // any other byte (e.g. a continuation byte): step over one byte
+}
+// Greedy longest-match tokenizer: fills tokenIds with vocabulary ids for textUtf8, skipping characters that match no token.
+void tokenize(const std::string& textUtf8,
+              std::vector<int64_t>& tokenIds,
+              const ovms::VocabIndex& ix) {
+    tokenIds.clear();
+    size_t pos = 0;
+    const size_t n = textUtf8.size();
+
+    while (pos < n) {
+        size_t maxTry = std::min(ix.max_token_bytes, n - pos);  // never look past the end of the input
+        int foundId = -1;
+        size_t foundLen = 0;
+
+        for (size_t len = maxTry; len > 0; --len) {  // longest candidate first
+            auto it = ix.by_token.find(std::string(textUtf8.data() + pos, len));
+            if (it != ix.by_token.end()) {
+                foundId = it->second;
+                foundLen = len;
+                break;
+            }
+        }
+
+        if (foundId >= 0) {
+            tokenIds.push_back(foundId);
+            pos += foundLen;
+        } else {
+            // No token starts here: warn and skip one UTF-8 character (clamped to the remaining bytes).
+            const unsigned char lead = static_cast<unsigned char>(textUtf8[pos]);
+            const size_t adv = utf8CharLen(lead);
+            SPDLOG_WARN("Tokenizer: unknown bytes at pos {}: '{}'",
+                        pos, std::string(textUtf8.data() + pos, std::min(adv, n - pos)));
+            pos += std::min(adv, n - pos);
+        }
+    }
+    SPDLOG_DEBUG("Tokenize: produced {} ids", tokenIds.size());
+}
+}  // namespace
+
+namespace mediapipe {
+
+const std::string KOKORO_SESSION_SIDE_PACKET_TAG = "KOKORO_NODE_RESOURCES";
+// MediaPipe calculator that synthesizes speech for the "input" string of an HTTP JSON request with a Kokoro model.
+class KokoroCalculator : public CalculatorBase {
+    static const std::string INPUT_TAG_NAME;
+    static const std::string OUTPUT_TAG_NAME;
+
+public:
+    static absl::Status GetContract(CalculatorContract* cc) {  // declares the stream and side-packet types
+        RET_CHECK(!cc->Inputs().GetTags().empty());
+        RET_CHECK(!cc->Outputs().GetTags().empty());
+        cc->Inputs().Tag(INPUT_TAG_NAME).Set<ovms::HttpPayload>();
+        cc->InputSidePackets().Tag(KOKORO_SESSION_SIDE_PACKET_TAG).Set<KokoroServableMap>();
+        cc->Outputs().Tag(OUTPUT_TAG_NAME).Set<std::string>();
+        return absl::OkStatus();
+    }
+
+    absl::Status Close(CalculatorContext* cc) final {  // only logs
+        SPDLOG_LOGGER_DEBUG(kokoro_calculator_logger, "KokoroCalculator [Node: {}] Close", cc->NodeName());
+        return absl::OkStatus();
+    }
+
+    absl::Status Open(CalculatorContext* cc) final {  // only logs
+        SPDLOG_LOGGER_DEBUG(kokoro_calculator_logger, "KokoroCalculator [Node: {}] Open", cc->NodeName());
+        return absl::OkStatus();
+    }
+    // Runs text -> IPA phonemes -> token ids -> model inference -> WAV bytes for one request packet.
+    absl::Status Process(CalculatorContext* cc) final {
+        SPDLOG_LOGGER_DEBUG(kokoro_calculator_logger, "KokoroCalculator [Node: {}] Process start", cc->NodeName());
+        // Bind by reference: there is no need to copy the whole servable map for every request.
+        const KokoroServableMap& servablesMap = cc->InputSidePackets()
+            .Tag(KOKORO_SESSION_SIDE_PACKET_TAG).Get<KokoroServableMap>();
+        auto servableIt = servablesMap.find(cc->NodeName());
+        RET_CHECK(servableIt != servablesMap.end())
+            << "Could not find initialized Kokoro node named: " << cc->NodeName();
+        auto servable = servableIt->second;
+
+        const auto& payload = cc->Inputs().Tag(INPUT_TAG_NAME).Get<ovms::HttpPayload>();
+        RET_CHECK(payload.parsedJson && payload.parsedJson->IsObject()) << "Request body must be a JSON object";
+        auto it = payload.parsedJson->FindMember("input");
+        RET_CHECK(it != payload.parsedJson->MemberEnd()) << "Missing 'input' in request";
+        RET_CHECK(it->value.IsString()) << "'input' must be a string";
+        const std::string text = it->value.GetString();
+        // Text -> IPA phonemization
+        std::string phonemes;
+        espeakPhonemizeAll(text, phonemes, /*noStress=*/true);
+        SPDLOG_DEBUG("Input text: '{}', IPA phonemes ({} chars): '{}'", text, phonemes.size(), phonemes);
+
+        // IPA -> Kokoro token IDs
+        const auto& vocabIx = servable->getVocabIndex();
+        std::vector<std::vector<int64_t>> inputTokens(1);
+        tokenize(phonemes, inputTokens[0], vocabIx);
+
+        // Prepend PAD token (id=0) - Kokoro model requires BOS/PAD at start
+        inputTokens[0].insert(inputTokens[0].begin(), 0);
+
+        // Append EOS (period token = 4) if not already present
+        if (inputTokens[0].back() != 4) {  // back() is safe: PAD was just prepended, so the vector is never empty
+            inputTokens[0].push_back(4);
+        }
+
+        // Voice embedding (hard-coded); built once and shared by all requests
+        static const std::vector<float> voice = {
+            -0.2296, 0.1835, -0.0069, -0.1240, -0.2505, 0.0112, -0.0759, -0.1650,
+            -0.2665, -0.1965, 0.0242, -0.1667, 0.3524, 0.2140, 0.3069, -0.3377,
+            -0.0878, -0.0477, 0.0813, -0.2135, -0.2340, -0.1971, 0.0200, 0.0145,
+            0.0016, 0.2596, -0.2665, 0.1434, 0.0503, 0.0867, 0.1905, -0.1281,
+            0.0658, -0.0639, -0.0920, 0.2444, -0.1506, -0.2197, 0.1385, 0.2133,
+            -0.0755, -0.0188, -0.0142, 0.2301, -0.0776, -0.0748, 0.0172, 0.0430,
+            -0.1009, 0.1519, 0.1137, 0.0641, 0.2264, 0.1911, -0.0205, 0.2578,
+            0.2210, -0.0784, -0.0235, -0.0547, 0.2191, -0.1623, -0.2416, 0.0076,
+            0.0574, 0.2186, 0.0080, 0.0473, 0.0972, 0.0286, 0.1324, 0.0686,
+            0.2652, -0.2237, -0.0980, -0.1693, -0.1866, 0.2273, 0.2008, -0.0683,
+            0.0957, 0.0623, -0.1891, 0.1620, 0.1811, -0.0516, -0.0800, -0.1416,
+            -0.2374, -0.1892, 0.1726, -0.0690, -0.0300, 0.0467, -0.2811, -0.1603,
+            0.0342, -0.1054, -0.0604, -0.0475, -0.0908, -0.1286, 0.1105, -0.1186,
+            0.0582, 0.1887, 0.0345, 0.2081, 0.1404, -0.2532, 0.0026, 0.0402,
+            0.0812, -0.0512, 0.0128, 0.0084, -0.0970, -0.0362, 0.0036, -0.0720,
+            -0.0850, 0.0221, -0.1037, 0.0569, 0.0187, -0.0649, -0.0288, -0.1795,
+            0.0045, 0.2535, 0.6751, 0.1578, -0.0966, 0.1516, 0.2109, 0.2033,
+            -0.2155, -0.1783, 0.0836, -0.1050, 0.0676, -0.0237, 0.0387, -0.2564,
+            0.1891, 0.1305, -0.3239, -0.1312, 0.2723, 0.0745, 0.1335, 0.0302,
+            0.0172, 0.2207, 0.0215, -0.0379, -0.1954, 0.4944, 0.2905, -0.0306,
+            0.2858, 0.2341, 0.0545, 0.4626, 0.2947, 0.3802, 0.2820, 0.1557,
+            0.1743, -0.1410, 0.0986, 0.4751, -0.2146, 0.3530, -0.2357, -0.5626,
+            -0.0617, 0.2190, 0.0992, -0.2365, 0.3726, 0.2092, 0.1660, 0.1928,
+            0.5731, -0.1734, -0.0816, -0.3191, -0.1871, -0.2217, -0.0112, 0.1261,
+            0.1601, 0.3835, 0.0451, -0.1927, -0.1116, 0.2204, -0.0379, -0.0094,
+            -0.0455, -0.4831, -0.3345, -0.2119, 0.4803, 0.1214, 0.1723, 0.2605,
+            0.0051, -0.2587, 0.0511, -0.1318, 0.0227, -0.0645, 0.2573, -0.0205,
+            0.0665, -0.3562, -0.6070, 0.4191, 0.0351, 0.2033, -0.5508, -0.1415,
+            -0.1249, -0.0986, -0.1120, -0.1187, 0.0600, 0.1974, 0.5017, -0.0247,
+            -0.2986, 0.3983, -0.1159, -0.4275, -0.0164, -0.3783, 0.0717, 0.1478,
+            -0.1144, 0.2292, 0.2741, 0.4309, -0.1611, 0.0755, -0.0981, 0.4584,
+            -0.2061, -0.0787, -0.1779, 0.2275, -0.1742, -0.2230, -0.1739, 0.0646
+        };
+
+        auto& ids = inputTokens[0];
+
+        auto inputIdsTensor = ov::Tensor{ov::element::i64, ov::Shape{1, ids.size()}};
+        auto refS = ov::Tensor{ov::element::f32, ov::Shape{1, voice.size()}};
+        auto speed = ov::Tensor{ov::element::f32, ov::Shape{1}};
+
+        *reinterpret_cast<float*>(speed.data()) = 0.5f;  // NOTE(review): fixed speed factor; confirm 0.5 is intended
+        std::copy(ids.data(), ids.data() + ids.size(),
+                  reinterpret_cast<int64_t*>(inputIdsTensor.data()));
+        std::copy(voice.data(), voice.data() + voice.size(),
+                  reinterpret_cast<float*>(refS.data()));
+
+        // Inference
+        ModelMetricReporter unused(nullptr, nullptr, "unused", 1);
+        auto executingStreamIdGuard =
+            std::make_unique<ExecutingStreamIdGuard>(servable->getInferRequestsQueue(), unused);
+        ov::InferRequest& inferRequest = executingStreamIdGuard->getInferRequest();
+
+        inferRequest.set_tensor("input_ids", inputIdsTensor);
+        inferRequest.set_tensor("103", refS);  // NOTE(review): input addressed by the literal tensor name "103"; verify against the exported model
+        inferRequest.set_tensor("speed", speed);
+        inferRequest.start_async();
+        inferRequest.wait();
+
+        // Collect audio output
+        auto out = inferRequest.get_tensor(inferRequest.get_compiled_model().outputs()[0]);
+        RET_CHECK(out.get_shape().size() == 1);
+        RET_CHECK(out.get_element_type() == ov::element::f32);
+        const size_t samples = out.get_shape()[0];
+        const float* data = out.data<float>();
+
+        SPDLOG_DEBUG("Model output: {} audio samples ({:.2f}s at 24kHz)",
+                     samples, static_cast<float>(samples) / 24000.0f);
+
+        void* wavDataPtr = nullptr;
+        size_t wavSize = 0;
+        prepareAudioOutputKokoro(&wavDataPtr, wavSize, 32, samples, data);
+
+        auto output = std::make_unique<std::string>(reinterpret_cast<char*>(wavDataPtr), wavSize);
+        drwav_free(wavDataPtr, NULL);
+
+        cc->Outputs().Tag(OUTPUT_TAG_NAME).Add(output.release(), cc->InputTimestamp());
+        SPDLOG_LOGGER_DEBUG(kokoro_calculator_logger, "KokoroCalculator [Node: {}] Process end", cc->NodeName());
+        return absl::OkStatus();
+    }
+};
+
+const std::string KokoroCalculator::INPUT_TAG_NAME{"HTTP_REQUEST_PAYLOAD"};
+const std::string KokoroCalculator::OUTPUT_TAG_NAME{"HTTP_RESPONSE_PAYLOAD"};
+
+REGISTER_CALCULATOR(KokoroCalculator);
+
+}  // namespace mediapipe
diff --git a/src/audio/kokoro/kokoro_calculator.proto b/src/audio/kokoro/kokoro_calculator.proto
new file mode 100644
index 0000000000..d9fc1b4bd9
--- /dev/null
+++ b/src/audio/kokoro/kokoro_calculator.proto
@@ -0,0 +1,33 @@
+//*****************************************************************************
+// Copyright 2025 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//*****************************************************************************
+
+syntax = "proto2";
+package mediapipe;
+
+import "mediapipe/framework/calculator.proto";
+
+
+message KokoroCalculatorOptions {
+  extend mediapipe.CalculatorOptions {
+    // The extension field number has to be unique within the application
+    // (https://github.com/google/mediapipe/issues/634); there is no rule for choosing it.
+    optional KokoroCalculatorOptions ext = 116423799;
+    }
+
+    required string models_path = 1;  // presumably the Kokoro model directory; TODO confirm how it is consumed
+    optional string target_device = 2;  // presumably the OpenVINO device name; TODO confirm
+    optional string plugin_config = 3;  // NOTE(review): purpose not evident from this file; document the expected format
+}
diff --git a/src/audio/kokoro/kokoro_servable.hpp b/src/audio/kokoro/kokoro_servable.hpp
new file mode 100644
index 0000000000..3e42bd0db4
--- /dev/null
+++ b/src/audio/kokoro/kokoro_servable.hpp
@@ -0,0 +1,197 @@
+//*****************************************************************************
+// Copyright 2025 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//*****************************************************************************
+#pragma once
+
+#include <fstream>
+#include <memory>
+#include <mutex>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#pragma warning(push)
+#pragma warning(disable : 4005 4309 6001 6385 6386 6326 6011 4005 4456 6246)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+#include "mediapipe/framework/calculator_graph.h"
+#pragma GCC diagnostic pop
+#pragma warning(pop)
+
+#include "openvino/runtime/core.hpp"
+#include "../../ovinferrequestsqueue.hpp"
+
+#include <espeak-ng/speak_lib.h>
+#include <rapidjson/document.h>
+
+#include "src/audio/kokoro/kokoro_calculator.pb.h"
+#include "src/logging.hpp"
+
+namespace ovms {
+
+struct VocabIndex {  // token-to-id lookup table for the Kokoro vocabulary
+    std::unordered_map<std::string, int> by_token;  // token text -> token id
+    size_t max_token_bytes = 1;  // byte length of the longest token (never below 1)
+};
+// Singleton wrapper around eSpeak-NG: initializes the library once and exposes a mutex for callers to serialize on.
+class EspeakInstance {
+public:
+    static EspeakInstance& instance() {  // constructed on first call
+        static EspeakInstance inst;
+        return inst;
+    }
+
+    bool isReady() const { return ready_; }  // true when initialization and voice selection succeeded
+    std::mutex& mutex() { return mutex_; }
+
+private:
+    EspeakInstance() {
+        ready_ = tryInit();
+        if (!ready_) {
+            SPDLOG_ERROR("eSpeak-NG initialization failed (data path or voice not found)");
+        } else {
+            SPDLOG_INFO("eSpeak-NG initialized successfully");
+        }
+    }
+
+    ~EspeakInstance() {
+        if (ready_) {
+            espeak_Terminate();
+        }
+    }
+
+    EspeakInstance(const EspeakInstance&) = delete;
+    EspeakInstance& operator=(const EspeakInstance&) = delete;
+
+    bool tryInit() {  // tries the default data path, then the known espeak-ng and espeak locations
+        // Initializes eSpeak from one data path (nullptr is tried first) and selects an English voice.
+        auto try_path = [](const char* path) -> bool {
+            int sr = espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS, 0, path, espeakINITIALIZE_DONT_EXIT);
+            if (sr <= 0) return false;
+            if (espeak_SetVoiceByName("en") == EE_OK ||
+                espeak_SetVoiceByName("en-us") == EE_OK) {
+                return true;
+            }
+            espeak_Terminate();  // undo the successful init before another data path is tried
+            return false;
+        };
+
+        if (try_path(nullptr)) return true;
+
+        static const char* ngPaths[] = {
+            "/usr/share/espeak-ng-data",
+            "/opt/homebrew/share/espeak-ng-data",
+            "/usr/local/share/espeak-ng-data",
+            "espeak-ng-data",
+            nullptr
+        };
+        for (int i = 0; ngPaths[i]; ++i)
+            if (try_path(ngPaths[i])) return true;
+
+        static const char* esPaths[] = {
+            "/usr/share/espeak-data",
+            "/usr/local/share/espeak-data",
+            "espeak-data",
+            nullptr
+        };
+        for (int i = 0; esPaths[i]; ++i)
+            if (try_path(esPaths[i])) return true;
+
+        return false;
+    }
+
+    bool ready_ = false;
+    std::mutex mutex_;
+};
+// Per-node Kokoro resources: the compiled OpenVINO model, its infer-request queue and the token vocabulary.
+struct KokoroServable {
+    std::filesystem::path parsedModelsPath;  // modelDir, resolved against graphPath when relative
+    std::shared_ptr<ov::Model> model;  // NOTE(review): never assigned here; the constructor reads into a local instead
+    ov::CompiledModel compiledModel;
+    std::unique_ptr<OVInferRequestsQueue> inferRequestsQueue;
+    VocabIndex vocabIndex;
+
+    KokoroServable(const std::string& modelDir, const std::string& targetDevice, const std::string& graphPath) {
+        EspeakInstance::instance();  // trigger eSpeak initialization now; the result is not checked here
+
+        auto fsModelsPath = std::filesystem::path(modelDir);
+        if (fsModelsPath.is_relative()) {
+            parsedModelsPath = (std::filesystem::path(graphPath) / fsModelsPath);
+        } else {
+            parsedModelsPath = fsModelsPath;
+        }
+
+        vocabIndex = loadVocabFromConfig(parsedModelsPath);  // stays empty if config.json cannot be used
+
+        ov::AnyMap properties;
+        ov::Core core;
+        auto m_model = core.read_model(parsedModelsPath / std::filesystem::path("openvino_model.xml"), {}, properties);
+        compiledModel = core.compile_model(m_model, targetDevice, properties);
+        inferRequestsQueue = std::make_unique<OVInferRequestsQueue>(compiledModel, 5);  // presumably 5 = number of infer requests; TODO confirm
+    }
+
+    OVInferRequestsQueue& getInferRequestsQueue() {
+        return *inferRequestsQueue;
+    }
+
+    const VocabIndex& getVocabIndex() const {
+        return vocabIndex;
+    }
+
+private:
+    static VocabIndex loadVocabFromConfig(const std::filesystem::path& modelDir) {  // reads "vocab" from <modelDir>/config.json
+        VocabIndex ix;  // returned as-is (empty) on every failure path below
+        auto configPath = modelDir / "config.json";
+        std::ifstream ifs(configPath);
+        if (!ifs.is_open()) {
+            SPDLOG_ERROR("Failed to open Kokoro config: {}", configPath.string());
+            return ix;
+        }
+
+        std::stringstream buffer;
+        buffer << ifs.rdbuf();
+        std::string jsonStr = buffer.str();
+
+        rapidjson::Document doc;
+        doc.Parse(jsonStr.c_str());
+        if (doc.HasParseError()) {
+            SPDLOG_ERROR("Failed to parse Kokoro config JSON: {}", configPath.string());
+            return ix;
+        }
+        // HasMember is only valid on an object, so check the root type first.
+        if (!doc.IsObject() || !doc.HasMember("vocab") || !doc["vocab"].IsObject()) {
+            SPDLOG_ERROR("Kokoro config missing 'vocab' object: {}", configPath.string());
+            return ix;
+        }
+
+        const auto& vocab = doc["vocab"];
+        ix.by_token.reserve(vocab.MemberCount());
+        for (auto it = vocab.MemberBegin(); it != vocab.MemberEnd(); ++it) {
+            if (!it->name.IsString() || !it->value.IsInt()) continue;  // skip malformed entries
+            std::string token = it->name.GetString();
+            int id = it->value.GetInt();
+            ix.by_token.emplace(token, id);
+            ix.max_token_bytes = std::max(ix.max_token_bytes, token.size());
+        }
+
+        SPDLOG_INFO("Loaded Kokoro vocabulary: {} tokens, max_token_bytes={}",
+                     ix.by_token.size(), ix.max_token_bytes);
+        return ix;
+    }
+};
+
+using KokoroServableMap = std::unordered_map<std::string, std::shared_ptr<KokoroServable>>;
+}  // namespace ovms
diff --git a/src/logging.cpp b/src/logging.cpp
index e89fce9a07..9d058d82dc 100644
--- a/src/logging.cpp
+++ b/src/logging.cpp
@@ -35,6 +35,7 @@ std::shared_ptr<spdlog::logger> llm_executor_logger = std::make_shared<spdlog::l
 std::shared_ptr<spdlog::logger> llm_calculator_logger = std::make_shared<spdlog::logger>("llm_calculator");
 std::shared_ptr<spdlog::logger> s2t_calculator_logger = std::make_shared<spdlog::logger>("s2t_calculator");
 std::shared_ptr<spdlog::logger> t2s_calculator_logger = std::make_shared<spdlog::logger>("t2s_calculator");
+std::shared_ptr<spdlog::logger> kokoro_calculator_logger = std::make_shared<spdlog::logger>("kokoro_calculator");
 std::shared_ptr<spdlog::logger> embeddings_calculator_logger = std::make_shared<spdlog::logger>("embeddings_calculator");
 std::shared_ptr<spdlog::logger> rerank_calculator_logger = std::make_shared<spdlog::logger>("rerank_calculator");
 #endif
@@ -78,6 +79,7 @@ static void register_loggers(const std::string& log_level, std::vector<spdlog::s
     llm_calculator_logger->set_pattern(default_pattern);
     s2t_calculator_logger->set_pattern(default_pattern);
     t2s_calculator_logger->set_pattern(default_pattern);
+    kokoro_calculator_logger->set_pattern(default_pattern);
     rerank_calculator_logger->set_pattern(default_pattern);
     embeddings_calculator_logger->set_pattern(default_pattern);
 #endif
@@ -98,6 +100,7 @@ static void register_loggers(const std::string& log_level, std::vector<spdlog::s
         llm_calculator_logger->sinks().push_back(sink);
         s2t_calculator_logger->sinks().push_back(sink);
         t2s_calculator_logger->sinks().push_back(sink);
+        kokoro_calculator_logger->sinks().push_back(sink);
         rerank_calculator_logger->sinks().push_back(sink);
         embeddings_calculator_logger->sinks().push_back(sink);
 #endif
@@ -119,6 +122,7 @@ static void register_loggers(const std::string& log_level, std::vector<spdlog::s
     set_log_level(log_level, llm_calculator_logger);
     set_log_level(log_level, s2t_calculator_logger);
     set_log_level(log_level, t2s_calculator_logger);
+    set_log_level(log_level, kokoro_calculator_logger);
     set_log_level(log_level, rerank_calculator_logger);
     set_log_level(log_level, embeddings_calculator_logger);
 #endif
diff --git a/src/logging.hpp b/src/logging.hpp
index 011458fe49..bcbf987f30 100644
--- a/src/logging.hpp
+++ b/src/logging.hpp
@@ -38,6 +38,7 @@ extern std::shared_ptr<spdlog::logger> llm_executor_logger;
 extern std::shared_ptr<spdlog::logger> llm_calculator_logger;
 extern std::shared_ptr<spdlog::logger> s2t_calculator_logger;
 extern std::shared_ptr<spdlog::logger> t2s_calculator_logger;
+extern std::shared_ptr<spdlog::logger> kokoro_calculator_logger;
 extern std::shared_ptr<spdlog::logger> embeddings_calculator_logger;
 extern std::shared_ptr<spdlog::logger> rerank_calculator_logger;
 #endif
diff --git a/src/mediapipe_internal/mediapipegraphdefinition.cpp b/src/mediapipe_internal/mediapipegraphdefinition.cpp
index 9047765e75..e1436b5891 100644
--- a/src/mediapipe_internal/mediapipegraphdefinition.cpp
+++ b/src/mediapipe_internal/mediapipegraphdefinition.cpp
@@ -63,6 +63,7 @@ const std::string MediapipeGraphDefinition::LLM_NODE_CALCULATOR_NAME{"LLMCalcula
 const std::string MediapipeGraphDefinition::IMAGE_GEN_CALCULATOR_NAME{"ImageGenCalculator"};
 const std::string MediapipeGraphDefinition::STT_NODE_CALCULATOR_NAME{"S2tCalculator"};
 const std::string MediapipeGraphDefinition::TTS_NODE_CALCULATOR_NAME{"T2sCalculator"};
+const std::string MediapipeGraphDefinition::KOKORO_NODE_CALCULATOR_NAME{"KokoroCalculator"};
 const std::string MediapipeGraphDefinition::EMBEDDINGS_NODE_CALCULATOR_NAME{"EmbeddingsCalculatorOV"};
 const std::string MediapipeGraphDefinition::RERANK_NODE_CALCULATOR_NAME{"RerankCalculatorOV"};
 
@@ -625,6 +626,34 @@ Status MediapipeGraphDefinition::initializeNodes() {
                 return StatusCode::MEDIAPIPE_GRAPH_CONFIG_FILE_INVALID;
             }
         }
+        // Kokoro node: create one KokoroServable per node and register it under the node name.
+        if (endsWith(config.node(i).calculator(), KOKORO_NODE_CALCULATOR_NAME)) {
+            auto& kokoroServableMap = this->sidePacketMaps.kokoroServableMap;
+            // NOTE(review): guard presumably clears the map on early return unless disableCleaning() is called - confirm.
+            ResourcesCleaningGuard<KokoroServableMap> kokoroServablesCleaningGuard(kokoroServableMap);
+            if (!config.node(i).node_options().size()) {
+                SPDLOG_LOGGER_ERROR(modelmanager_logger, "Kokoro node missing options in graph: {}. ", this->name);
+                return StatusCode::LLM_NODE_MISSING_OPTIONS;
+            }
+            if (config.node(i).name().empty()) {
+                SPDLOG_LOGGER_ERROR(modelmanager_logger, "Kokoro node name is missing in graph: {}. ", this->name);
+                return StatusCode::LLM_NODE_MISSING_NAME;
+            }
+            std::string nodeName = config.node(i).name();
+            if (kokoroServableMap.find(nodeName) != kokoroServableMap.end()) {
+                SPDLOG_LOGGER_ERROR(modelmanager_logger, "Kokoro node name: {} already used in graph: {}. ", nodeName, this->name);
+                return StatusCode::LLM_NODE_NAME_ALREADY_EXISTS;
+            }
+            mediapipe::KokoroCalculatorOptions nodeOptions;
+            // UnpackTo() reports failure via its bool result; reject the graph instead of building a servable from default options.
+            if (!config.node(i).node_options(0).UnpackTo(&nodeOptions)) {
+                SPDLOG_LOGGER_ERROR(modelmanager_logger, "Failed to unpack Kokoro calculator options in graph: {}. ", this->name);
+                return StatusCode::MEDIAPIPE_GRAPH_CONFIG_FILE_INVALID;
+            }
+            std::shared_ptr<KokoroServable> servable = std::make_shared<KokoroServable>(nodeOptions.models_path(), nodeOptions.target_device(), mgconfig.getBasePath());
+            kokoroServableMap.emplace(nodeName, std::move(servable));
+            kokoroServablesCleaningGuard.disableCleaning();
+        }
     }
     return StatusCode::OK;
 }
diff --git a/src/mediapipe_internal/mediapipegraphdefinition.hpp b/src/mediapipe_internal/mediapipegraphdefinition.hpp
index 14c9e0679f..1067ca7d42 100644
--- a/src/mediapipe_internal/mediapipegraphdefinition.hpp
+++ b/src/mediapipe_internal/mediapipegraphdefinition.hpp
@@ -48,6 +48,7 @@
 #include "../rerank/rerank_servable.hpp"
 #include "../audio/speech_to_text/s2t_servable.hpp"
 #include "../audio/text_to_speech/t2s_servable.hpp"
+#include "../audio/kokoro/kokoro_servable.hpp"
 
 namespace ovms {
 class MediapipeGraphDefinitionUnloadGuard;
@@ -66,6 +67,7 @@ using GenAiServableMap = std::unordered_map<std::string, std::shared_ptr<GenAiSe
 using RerankServableMap = std::unordered_map<std::string, std::shared_ptr<RerankServable>>;
 using SttServableMap = std::unordered_map<std::string, std::shared_ptr<SttServable>>;
 using TtsServableMap = std::unordered_map<std::string, std::shared_ptr<TtsServable>>;
+using KokoroServableMap = std::unordered_map<std::string, std::shared_ptr<KokoroServable>>;
 using EmbeddingsServableMap = std::unordered_map<std::string, std::shared_ptr<EmbeddingsServable>>;
 using ImageGenerationPipelinesMap = std::unordered_map<std::string, std::shared_ptr<ImageGenerationPipelines>>;
 
@@ -77,6 +79,7 @@ struct GraphSidePackets {
     RerankServableMap rerankServableMap;
     SttServableMap sttServableMap;
     TtsServableMap ttsServableMap;
+    KokoroServableMap kokoroServableMap;
     void clear() {
         pythonNodeResourcesMap.clear();
         genAiServableMap.clear();
@@ -85,6 +88,7 @@ struct GraphSidePackets {
         rerankServableMap.clear();
         sttServableMap.clear();
         ttsServableMap.clear();
+        kokoroServableMap.clear();
     }
     bool empty() {
         return (pythonNodeResourcesMap.empty() &&
@@ -93,7 +97,8 @@ struct GraphSidePackets {
                 embeddingsServableMap.empty() &&
                 rerankServableMap.empty() &&
                 sttServableMap.empty() &&
-                ttsServableMap.empty());
+                ttsServableMap.empty() &&
+                kokoroServableMap.empty());
     }
 };
 
@@ -136,6 +141,7 @@ class MediapipeGraphDefinition {
     static const std::string RERANK_NODE_CALCULATOR_NAME;
     static const std::string STT_NODE_CALCULATOR_NAME;
     static const std::string TTS_NODE_CALCULATOR_NAME;
+    static const std::string KOKORO_NODE_CALCULATOR_NAME;
     Status waitForLoaded(std::unique_ptr<MediapipeGraphDefinitionUnloadGuard>& unloadGuard, const uint32_t waitForLoadedTimeoutMicroseconds = WAIT_FOR_LOADED_DEFAULT_TIMEOUT_MICROSECONDS);
 
     // Pipelines are not versioned and any available definition has constant version equal 1.
diff --git a/src/mediapipe_internal/mediapipegraphexecutor.cpp b/src/mediapipe_internal/mediapipegraphexecutor.cpp
index 93b53fdf8e..b2016ac3aa 100644
--- a/src/mediapipe_internal/mediapipegraphexecutor.cpp
+++ b/src/mediapipe_internal/mediapipegraphexecutor.cpp
@@ -49,6 +49,7 @@ MediapipeGraphExecutor::MediapipeGraphExecutor(
     const RerankServableMap& rerankServableMap,
     const SttServableMap& sttServableMap,
     const TtsServableMap& ttsServableMap,
+    const KokoroServableMap& kokoroServableMap,
     PythonBackend* pythonBackend,
     MediapipeServableMetricReporter* mediapipeServableMetricReporter) :
     name(name),
@@ -58,7 +59,7 @@ MediapipeGraphExecutor::MediapipeGraphExecutor(
     outputTypes(std::move(outputTypes)),
     inputNames(std::move(inputNames)),
     outputNames(std::move(outputNames)),
-    sidePacketMaps({pythonNodeResourcesMap, llmNodeResourcesMap, {}, embeddingsServableMap, rerankServableMap, sttServableMap, ttsServableMap}),
+    sidePacketMaps({pythonNodeResourcesMap, llmNodeResourcesMap, {}, embeddingsServableMap, rerankServableMap, sttServableMap, ttsServableMap, kokoroServableMap}),
     pythonBackend(pythonBackend),
     currentStreamTimestamp(STARTING_TIMESTAMP),
     mediapipeServableMetricReporter(mediapipeServableMetricReporter) {}
@@ -92,6 +93,7 @@ const std::string MediapipeGraphExecutor::EMBEDDINGS_SESSION_SIDE_PACKET_TAG = "
 const std::string MediapipeGraphExecutor::RERANK_SESSION_SIDE_PACKET_TAG = "rerank_servable";
 const std::string MediapipeGraphExecutor::STT_SESSION_SIDE_PACKET_TAG = "s2t_servable";
 const std::string MediapipeGraphExecutor::TTS_SESSION_SIDE_PACKET_TAG = "t2s_servable";
+const std::string MediapipeGraphExecutor::KOKORO_SESSION_SIDE_PACKET_TAG = "kokoro_servable";
 const ::mediapipe::Timestamp MediapipeGraphExecutor::STARTING_TIMESTAMP = ::mediapipe::Timestamp(0);
 
 }  // namespace ovms
diff --git a/src/mediapipe_internal/mediapipegraphexecutor.hpp b/src/mediapipe_internal/mediapipegraphexecutor.hpp
index c165469395..af2e8d08e6 100644
--- a/src/mediapipe_internal/mediapipegraphexecutor.hpp
+++ b/src/mediapipe_internal/mediapipegraphexecutor.hpp
@@ -95,6 +95,7 @@ class MediapipeGraphExecutor {
     static const std::string RERANK_SESSION_SIDE_PACKET_TAG;
     static const std::string STT_SESSION_SIDE_PACKET_TAG;
     static const std::string TTS_SESSION_SIDE_PACKET_TAG;
+    static const std::string KOKORO_SESSION_SIDE_PACKET_TAG;
     static const ::mediapipe::Timestamp STARTING_TIMESTAMP;
 
     MediapipeGraphExecutor(const std::string& name, const std::string& version, const ::mediapipe::CalculatorGraphConfig& config,
@@ -107,6 +108,7 @@ class MediapipeGraphExecutor {
         const RerankServableMap& rerankServableMap,
         const SttServableMap& sttServableMap,
         const TtsServableMap& ttsServableMap,
+        const KokoroServableMap& kokoroServableMap,
         PythonBackend* pythonBackend,
         MediapipeServableMetricReporter* mediapipeServableMetricReporter);
     MediapipeGraphExecutor(const std::string& name, const std::string& version, const ::mediapipe::CalculatorGraphConfig& config,
@@ -157,6 +159,7 @@ class MediapipeGraphExecutor {
         inputSidePackets[RERANK_SESSION_SIDE_PACKET_TAG] = mediapipe::MakePacket<RerankServableMap>(this->sidePacketMaps.rerankServableMap).At(STARTING_TIMESTAMP);
         inputSidePackets[STT_SESSION_SIDE_PACKET_TAG] = mediapipe::MakePacket<SttServableMap>(this->sidePacketMaps.sttServableMap).At(STARTING_TIMESTAMP);
         inputSidePackets[TTS_SESSION_SIDE_PACKET_TAG] = mediapipe::MakePacket<TtsServableMap>(this->sidePacketMaps.ttsServableMap).At(STARTING_TIMESTAMP);
+        inputSidePackets[KOKORO_SESSION_SIDE_PACKET_TAG] = mediapipe::MakePacket<KokoroServableMap>(this->sidePacketMaps.kokoroServableMap).At(STARTING_TIMESTAMP);
 
         MP_RETURN_ON_FAIL(graph.StartRun(inputSidePackets), std::string("start MediaPipe graph: ") + this->name, StatusCode::MEDIAPIPE_GRAPH_START_ERROR);