From f333aa5e885d04ed68a6b4b98f6c3a7c3cd10621 Mon Sep 17 00:00:00 2001
From: adhikjoshi <adhik3035@gmail.com>
Date: Sun, 22 Feb 2026 06:33:44 +0530
Subject: [PATCH] feat: Add ModelsLab TTS provider
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds ModelsLab as a new TTS backend for VideoLingo, following the same
per-file pattern as the existing SiliconFlow, OpenAI, Azure, and Edge
TTS integrations.

Changes:
- core/tts_backend/modelslab_tts.py  — new provider module
- core/tts_backend/tts_main.py       — import + dispatch case
- config.yaml                        — modelslab_tts config section
- core/st_utils/sidebar_setting.py   — Streamlit API key + voice picker

Features:
- 10 voices: Bella, Antoni, Elli, Josh, Arnold, Adam, Sam, Rachel, Domi, Gigi
- Polling: handles 'processing' status (blocking poll every 5 s, up to 5 min)
- 2 500-character limit enforced automatically (longer text is truncated)
- 3-retry logic via @except_handler decorator (same as other providers)
- WAV output — compatible with VideoLingo's audio pipeline

Setup:
  1. Get an API key at https://modelslab.com
  2. Set modelslab_tts.api_key in config.yaml (or via the Streamlit sidebar)
  3. Select 'modelslab_tts' as the TTS Method and choose a voice
---
 config.yaml                       |   9 ++-
 core/st_utils/sidebar_setting.py  |  16 ++++-
 core/tts_backend/modelslab_tts.py | 112 ++++++++++++++++++++++++++++++
 core/tts_backend/tts_main.py      |   3 +
 4 files changed, 138 insertions(+), 2 deletions(-)
 create mode 100644 core/tts_backend/modelslab_tts.py

diff --git a/config.yaml b/config.yaml
index 099b4bdb..8ca4f075 100644
--- a/config.yaml
+++ b/config.yaml
@@ -70,7 +70,7 @@ reflect_translate: true
 pause_before_translate: false
 
 ## ======================== Dubbing Settings ======================== ##
-# TTS selection [sf_fish_tts, openai_tts, gpt_sovits, azure_tts, fish_tts, edge_tts, custom_tts]
+# TTS selection [sf_fish_tts, openai_tts, gpt_sovits, azure_tts, fish_tts, edge_tts, custom_tts, modelslab_tts]
 tts_method: 'azure_tts'
 
 # SiliconFlow FishTTS
@@ -119,6 +119,13 @@ gpt_sovits:
 f5tts:
   302_api: 'YOUR_302_API_KEY'
 
+# ModelsLab TTS configuration
+# Sign up at https://modelslab.com to get your API key
+modelslab_tts:
+  api_key: 'YOUR_MODELSLAB_API_KEY'
+  # Voice options: Bella, Antoni, Elli, Josh, Arnold, Adam, Sam, Rachel, Domi, Gigi
+  voice: 'Bella'
+
 # *Audio speed range
 speed_factor:
   min: 1
diff --git a/core/st_utils/sidebar_setting.py b/core/st_utils/sidebar_setting.py
index 5c5d24aa..020be91e 100644
--- a/core/st_utils/sidebar_setting.py
+++ b/core/st_utils/sidebar_setting.py
@@ -84,7 +84,7 @@ def page_setting():
             update_key("burn_subtitles", burn_subtitles)
             st.rerun()
     with st.expander(t("Dubbing Settings"), expanded=True):
-        tts_methods = ["azure_tts", "openai_tts", "fish_tts", "sf_fish_tts", "edge_tts", "gpt_sovits", "custom_tts", "sf_cosyvoice2", "f5tts"]
+        tts_methods = ["azure_tts", "openai_tts", "fish_tts", "sf_fish_tts", "edge_tts", "gpt_sovits", "custom_tts", "sf_cosyvoice2", "f5tts", "modelslab_tts"]
         select_tts = st.selectbox(t("TTS Method"), options=tts_methods, index=tts_methods.index(load_key("tts_method")))
         if select_tts != load_key("tts_method"):
             update_key("tts_method", select_tts)
@@ -151,6 +151,20 @@ def page_setting():
         
         elif select_tts == "f5tts":
             config_input("302ai API", "f5tts.302_api")
+
+        elif select_tts == "modelslab_tts":
+            config_input(t("ModelsLab API Key"), "modelslab_tts.api_key")
+            from core.tts_backend.modelslab_tts import VOICE_OPTIONS
+            voice_names = list(VOICE_OPTIONS.keys())
+            current_voice = load_key("modelslab_tts.voice") or "Bella"
+            selected_voice = st.selectbox(
+                t("ModelsLab Voice"),
+                options=voice_names,
+                index=voice_names.index(current_voice) if current_voice in voice_names else 0,
+            )
+            if selected_voice != load_key("modelslab_tts.voice"):
+                update_key("modelslab_tts.voice", selected_voice)
+                st.rerun()
         
 def check_api():
     try:
diff --git a/core/tts_backend/modelslab_tts.py b/core/tts_backend/modelslab_tts.py
new file mode 100644
index 00000000..88b9a1fc
--- /dev/null
+++ b/core/tts_backend/modelslab_tts.py
@@ -0,0 +1,112 @@
+import time
+import requests
+from pathlib import Path
+
+from core.utils import load_key, except_handler
+
+# ModelsLab TTS API endpoints: job submission, and polling a pending job by request id
+_API_URL = "https://modelslab.com/api/v6/voice/text_to_speech"
+_FETCH_URL = "https://modelslab.com/api/v6/voice/fetch/{}"
+
+# Voice name → numeric voice_id sent to the API (NOTE(review): confirm ids against docs)
+# Full list: https://docs.modelslab.com/text-to-audio/realtime/tts
+VOICE_OPTIONS = {
+    "Bella":   1,
+    "Antoni":  2,
+    "Elli":    3,
+    "Josh":    4,
+    "Arnold":  5,
+    "Adam":    6,
+    "Sam":     7,
+    "Rachel":  8,
+    "Domi":    9,
+    "Gigi":    10,
+}
+
+
+def _extract_audio_url(body: dict) -> str:
+    """Return the audio URL of a 'success' response; raise RuntimeError if absent."""
+    output = body.get("output") or body.get("output_url")
+    url = output[0] if isinstance(output, list) else output
+    if not url:
+        raise RuntimeError(f"ModelsLab returned 'success' without an audio URL: {body}")
+    return url
+
+
+@except_handler("Failed to generate audio using ModelsLab TTS", retry=3, delay=2)
+def modelslab_tts(text: str, save_path: str) -> None:
+    """
+    Generate speech using the ModelsLab TTS API and save it to ``save_path``.
+
+    Config keys (in config.yaml under ``modelslab_tts``):
+        api_key  — ModelsLab API key (https://modelslab.com)
+        voice    — Voice name, one of the keys of ``VOICE_OPTIONS`` (default: Bella)
+
+    Text beyond 2 500 characters is cut off (with a printed warning). Responses
+    with ``status == "processing"`` are polled every 5 s, at most 60 times.
+    Raises ValueError (unknown voice), RuntimeError (API error, or success without
+    an audio URL), TimeoutError (still processing after the last poll) or a
+    ``requests`` exception; how these surface depends on ``except_handler``.
+    """
+    api_key = load_key("modelslab_tts.api_key")
+    voice_name = load_key("modelslab_tts.voice") or "Bella"
+
+    if voice_name not in VOICE_OPTIONS:
+        raise ValueError(
+            f"Invalid voice '{voice_name}'. Choose from: {', '.join(VOICE_OPTIONS)}"
+        )
+
+    # Enforce the API character limit; warn so the cut is not silent.
+    if len(text) > 2500:
+        print(f"ModelsLab TTS: text truncated from {len(text)} to 2500 characters")
+        text = text[:2500]
+
+    payload = {
+        "key": api_key,
+        "prompt": text,
+        "language": "English",
+        "voice_id": VOICE_OPTIONS[voice_name],
+        "audio_format": "wav",
+    }
+    response = requests.post(_API_URL, json=payload, timeout=(15, 60))
+    response.raise_for_status()
+    data = response.json()
+    status = data.get("status")
+
+    if status == "success":
+        audio_url = _extract_audio_url(data)
+    elif status == "processing":
+        request_id = data.get("id")
+        if not request_id:
+            raise RuntimeError("ModelsLab returned 'processing' but no request id")
+        fetch_url = _FETCH_URL.format(request_id)
+
+        for _ in range(60):  # poll for up to 5 minutes
+            time.sleep(5)
+            fetch_resp = requests.post(fetch_url, json={"key": api_key}, timeout=(10, 30))
+            fetch_resp.raise_for_status()
+            fetch_data = fetch_resp.json()
+            fetch_status = fetch_data.get("status")
+            if fetch_status == "success":
+                audio_url = _extract_audio_url(fetch_data)
+                break
+            if fetch_status != "processing":
+                raise RuntimeError(f"ModelsLab fetch error: {fetch_data}")
+        else:
+            raise TimeoutError("ModelsLab TTS timed out waiting for audio generation")
+    else:
+        raise RuntimeError(f"ModelsLab TTS API error: {data}")
+
+    # Download and save the generated audio
+    audio_resp = requests.get(audio_url, timeout=(10, 120))
+    audio_resp.raise_for_status()
+
+    save_file = Path(save_path)
+    save_file.parent.mkdir(parents=True, exist_ok=True)
+    save_file.write_bytes(audio_resp.content)
+
+    print(f"ModelsLab TTS: audio saved to {save_file}")
+
+
+if __name__ == "__main__":
+    modelslab_tts("Hello! This is a test of ModelsLab text-to-speech.", "test_modelslab.wav")
diff --git a/core/tts_backend/tts_main.py b/core/tts_backend/tts_main.py
index 49cc0c29..c8a16fd1 100644
--- a/core/tts_backend/tts_main.py
+++ b/core/tts_backend/tts_main.py
@@ -13,6 +13,7 @@
 from core.tts_backend.custom_tts import custom_tts
 from core.prompts import get_correct_text_prompt
 from core.tts_backend._302_f5tts import f5_tts_for_videolingo
+from core.tts_backend.modelslab_tts import modelslab_tts
 from core.utils import *
 
 def clean_text_for_tts(text):
@@ -64,6 +65,8 @@ def tts_main(text, save_as, number, task_df):
                 cosyvoice_tts_for_videolingo(text, save_as, number, task_df)
             elif TTS_METHOD == 'f5tts':
                 f5_tts_for_videolingo(text, save_as, number, task_df)
+            elif TTS_METHOD == 'modelslab_tts':
+                modelslab_tts(text, save_as)
                 
             # Check generated audio duration
             duration = get_audio_duration(save_as)