openvinotoolkit · michalkulakowski · Dec 17, 2025 · Copilot · Feb 19, 2026
diff --git a/Dockerfile.redhat b/Dockerfile.redhat
@@ -127,7 +127,9 @@ RUN dnf install -y -d6 \
             python3.12 \
             python3.12-devel \
             python3.12-pip \
-            libicu-devel && \
+            libicu-devel \
+            espeak-ng \
+            espeak-ng-devel && \
             dnf clean all
 
 WORKDIR /
@@ -416,6 +418,7 @@ RUN if [ -f /usr/bin/dnf ] ; then export DNF_TOOL=dnf ; echo -e "max_parallel_do
     if ! [[ $debug_bazel_flags == *"py_off"* ]]; then \
         $DNF_TOOL install -y python3.12-libs --setopt=install_weak_deps=0 --nodocs; \
     fi ; \
+    $DNF_TOOL install -y espeak-ng --setopt=install_weak_deps=0 --nodocs; \
     $DNF_TOOL install -y shadow-utils; \
     $DNF_TOOL clean all ; \
     cp -v /etc/ssl/certs/ca-bundle.crt /etc/ssl/certs/ca-certificates.crt ; \

diff --git a/Dockerfile.ubuntu b/Dockerfile.ubuntu
@@ -100,6 +100,8 @@ RUN if [ "$BASE_OS" == "ubuntu24" ] ; then apt-get update && \
     apt-get clean && rm -rf /var/lib/apt/lists/* ; fi
 ENV PIP_BREAK_SYSTEM_PACKAGES=1
 RUN apt-get update && apt-get install --no-install-recommends -y \
+            espeak-ng \
+            libespeak-ng-dev \
             libgflags-dev \
             bc \
             ca-certificates \
@@ -413,7 +415,7 @@ ARG INSTALL_DRIVER_VERSION="24.39.31294"
 COPY ./install_ubuntu_gpu_drivers.sh /tmp/install_gpu_drivers.sh
 # hadolint ignore=DL3003,SC2164
 RUN apt-get update ; \
-    apt-get install -y --no-install-recommends curl ca-certificates libxml2 || exit 1; \
+    apt-get install -y --no-install-recommends curl ca-certificates libxml2 espeak-ng espeak-ng-data || exit 1; \
     if [ "$GPU" == "1" ] ; then \
 	/tmp/install_gpu_drivers.sh ; \
     fi ; \

diff --git a/src/BUILD b/src/BUILD
@@ -563,6 +563,7 @@ ovms_cc_library(
                 "//src/image_gen:image_gen_calculator",
                 "//src/audio/speech_to_text:s2t_calculator",
                 "//src/audio/text_to_speech:t2s_calculator",
+                "//src/audio/kokoro:kokoro_calculator",
                 "//src/audio:audio_utils",
                 "//src/image_gen:imagegen_init",
                 "//src/llm:openai_completions_api_handler",

diff --git a/src/audio/audio_utils.cpp b/src/audio/audio_utils.cpp
@@ -188,3 +188,45 @@ void prepareAudioOutput(void** ppData, size_t& pDataSize, uint16_t bitsPerSample
     auto outputPreparationTime = (timer.elapsed<std::chrono::microseconds>(OUTPUT_PREPARATION)) / 1000;
     SPDLOG_LOGGER_DEBUG(t2s_calculator_logger, "Output preparation time: {} ms", outputPreparationTime);
 }
+
+// Encodes a mono float waveform as an in-memory RIFF/WAVE buffer (IEEE float, 24 kHz) for the Kokoro pipeline.
+//   ppData        - out: start of the encoded buffer produced by dr_wav's memory writer
+//   pDataSize     - out: size of the encoded buffer in bytes
+//   bitsPerSample - sample width stored in the WAV header; waveformPtr holds floats, so 32 is presumably expected (TODO confirm)
+//   speechSize    - number of samples available at waveformPtr
+//   waveformPtr   - samples to encode
+// Throws std::runtime_error when the writer cannot be initialized or when not all frames get written;
+// in the latter case the partially written buffer is released and both outputs are reset.
+void prepareAudioOutputKokoro(void** ppData, size_t& pDataSize, uint16_t bitsPerSample, size_t speechSize, const float* waveformPtr) {
+    enum : unsigned int {
+        OUTPUT_PREPARATION,
+        TIMER_END
+    };
+    Timer<TIMER_END> timer;
+    timer.start(OUTPUT_PREPARATION);
+    drwav_data_format format;
+    format.container = drwav_container_riff;
+    format.format = DR_WAVE_FORMAT_IEEE_FLOAT;
+    format.channels = 1;
+    format.sampleRate = 24000;  // assume it is always 24 KHz
+    format.bitsPerSample = bitsPerSample;
+    drwav wav;
+    size_t totalSamples = speechSize * format.channels;
+
+    auto status = drwav_init_memory_write(&wav, ppData, &pDataSize, &format, nullptr);
+    if (status == DRWAV_FALSE) {
+        throw std::runtime_error("Failed to initialize WAV memory writer");
+    }
+    drwav_uint64 framesWritten = drwav_write_pcm_frames(&wav, totalSamples, waveformPtr);
+    // Close the writer before inspecting the result so it is never left open when we throw.
+    drwav_uninit(&wav);
+    if (framesWritten != totalSamples) {
+        drwav_free(*ppData, nullptr);
+        *ppData = nullptr;
+        pDataSize = 0;
+        throw std::runtime_error("Failed to write all frames");
+    }
+    timer.stop(OUTPUT_PREPARATION);
+    auto outputPreparationTime = (timer.elapsed<std::chrono::microseconds>(OUTPUT_PREPARATION)) / 1000;
+    SPDLOG_LOGGER_DEBUG(kokoro_calculator_logger, "Output preparation time: {} ms", outputPreparationTime);
+}
diff --git a/src/audio/audio_utils.hpp b/src/audio/audio_utils.hpp
@@ -25,3 +25,6 @@ bool isWavBuffer(const std::string buf);
 std::vector<float> readWav(const std::string_view& wavData);
 std::vector<float> readMp3(const std::string_view& mp3Data);
 void prepareAudioOutput(void** ppData, size_t& pDataSize, uint16_t bitsPerSample, size_t speechSize, const float* waveformPtr);
+// Kokoro variant of prepareAudioOutput: writes waveformPtr (speechSize float samples) as a 24 kHz mono
+// IEEE-float WAV into *ppData / pDataSize. Throws std::runtime_error on encoding failure.
+void prepareAudioOutputKokoro(void** ppData, size_t& pDataSize, uint16_t bitsPerSample, size_t speechSize, const float* waveformPtr);
diff --git a/src/audio/kokoro/BUILD b/src/audio/kokoro/BUILD
@@ -0,0 +1,64 @@
+#
+# Copyright (c) 2025 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+load("@mediapipe//mediapipe/framework/port:build_config.bzl", "mediapipe_cc_proto_library", "mediapipe_proto_library")
+load("//:common_settings.bzl", "ovms_cc_library")
+
+# Header-only target exposing kokoro_servable.hpp.
+ovms_cc_library(
+    name = "kokoro_servable",
+    hdrs = ["kokoro_servable.hpp"],
+    deps = [
+        "//third_party:openvino",
+        "//src:libovms_ovinferrequestsqueue",
+    ],
+    visibility = ["//visibility:public"],
+    alwayslink = 1,
+)
+
+# Kokoro calculator built from kokoro_calculator.cc.
+ovms_cc_library(
+    name = "kokoro_calculator",
+    srcs = ["kokoro_calculator.cc"],
+    deps = [
+        "@mediapipe//mediapipe/framework:calculator_framework",
+        "//src:httppayload",
+        "//src:libovmslogging",
+        ":kokoro_calculator_cc_proto",
+        "//src/port:dr_audio",
+        "//src/port:rapidjson_stringbuffer",
+        "//src/port:rapidjson_writer",
+        ":kokoro_servable",
+        "//third_party:genai",
+        "//src/audio:audio_utils",
+        "//src:executingstreamidguard",
+        "//src:model_metric_reporter",
+        "//third_party/espeak_ng:espeak_ng",
+    ],
+    visibility = ["//visibility:public"],
+    alwayslink = 1,
+)
+
+# Calculator options proto built from kokoro_calculator.proto; kept private to this package.
+mediapipe_proto_library(
+    name = "kokoro_calculator_proto",
+    srcs = ["kokoro_calculator.proto"],
+    visibility = ["//visibility:private"],
+    deps = [
+        "@mediapipe//mediapipe/framework:calculator_options_proto",
+        "@mediapipe//mediapipe/framework:calculator_proto",
+    ],
+)