From f333aa5e885d04ed68a6b4b98f6c3a7c3cd10621 Mon Sep 17 00:00:00 2001 From: adhikjoshi Date: Sun, 22 Feb 2026 06:33:44 +0530 Subject: [PATCH] feat: Add ModelsLab TTS provider MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds ModelsLab as a new TTS backend for VideoLingo, following the same per-file pattern as the existing SiliconFlow, OpenAI, Azure, and Edge TTS integrations. Changes: - core/tts_backend/modelslab_tts.py — new provider module - core/tts_backend/tts_main.py — import + dispatch case - config.yaml — modelslab_tts config section - core/st_utils/sidebar_setting.py — Streamlit API key + voice picker Features: - 10 voices: Bella, Antoni, Elli, Josh, Arnold, Adam, Sam, Rachel, Domi, Gigi - Async polling: handles 'processing' status (polls every 5 s, up to 5 min) - 2 500-character limit enforced automatically - 3-retry logic via @except_handler decorator (same as other providers) - WAV output — compatible with VideoLingo's audio pipeline Setup: 1. Get an API key at https://modelslab.com 2. Set modelslab_tts.api_key in config.yaml (or via the Streamlit sidebar) 3. 
Select 'modelslab_tts' as the TTS Method and choose a voice --- config.yaml | 9 ++- core/st_utils/sidebar_setting.py | 16 ++++- core/tts_backend/modelslab_tts.py | 112 ++++++++++++++++++++++++++++++ core/tts_backend/tts_main.py | 3 + 4 files changed, 138 insertions(+), 2 deletions(-) create mode 100644 core/tts_backend/modelslab_tts.py diff --git a/config.yaml b/config.yaml index 099b4bdb..8ca4f075 100644 --- a/config.yaml +++ b/config.yaml @@ -70,7 +70,7 @@ reflect_translate: true pause_before_translate: false ## ======================== Dubbing Settings ======================== ## -# TTS selection [sf_fish_tts, openai_tts, gpt_sovits, azure_tts, fish_tts, edge_tts, custom_tts] +# TTS selection [sf_fish_tts, openai_tts, gpt_sovits, azure_tts, fish_tts, edge_tts, custom_tts, modelslab_tts] tts_method: 'azure_tts' # SiliconFlow FishTTS @@ -119,6 +119,13 @@ gpt_sovits: f5tts: 302_api: 'YOUR_302_API_KEY' +# ModelsLab TTS configuration +# Sign up at https://modelslab.com to get your API key +modelslab_tts: + api_key: 'YOUR_MODELSLAB_API_KEY' + # Voice options: Bella, Antoni, Elli, Josh, Arnold, Adam, Sam, Rachel, Domi, Gigi + voice: 'Bella' + # *Audio speed range speed_factor: min: 1 diff --git a/core/st_utils/sidebar_setting.py b/core/st_utils/sidebar_setting.py index 5c5d24aa..020be91e 100644 --- a/core/st_utils/sidebar_setting.py +++ b/core/st_utils/sidebar_setting.py @@ -84,7 +84,7 @@ def page_setting(): update_key("burn_subtitles", burn_subtitles) st.rerun() with st.expander(t("Dubbing Settings"), expanded=True): - tts_methods = ["azure_tts", "openai_tts", "fish_tts", "sf_fish_tts", "edge_tts", "gpt_sovits", "custom_tts", "sf_cosyvoice2", "f5tts"] + tts_methods = ["azure_tts", "openai_tts", "fish_tts", "sf_fish_tts", "edge_tts", "gpt_sovits", "custom_tts", "sf_cosyvoice2", "f5tts", "modelslab_tts"] select_tts = st.selectbox(t("TTS Method"), options=tts_methods, index=tts_methods.index(load_key("tts_method"))) if select_tts != load_key("tts_method"): 
update_key("tts_method", select_tts) @@ -151,6 +151,20 @@ def page_setting(): elif select_tts == "f5tts": config_input("302ai API", "f5tts.302_api") + + elif select_tts == "modelslab_tts": + config_input(t("ModelsLab API Key"), "modelslab_tts.api_key") + from core.tts_backend.modelslab_tts import VOICE_OPTIONS + voice_names = list(VOICE_OPTIONS.keys()) + current_voice = load_key("modelslab_tts.voice") or "Bella" + selected_voice = st.selectbox( + t("ModelsLab Voice"), + options=voice_names, + index=voice_names.index(current_voice) if current_voice in voice_names else 0, + ) + if selected_voice != load_key("modelslab_tts.voice"): + update_key("modelslab_tts.voice", selected_voice) + st.rerun() def check_api(): try:
diff --git a/core/tts_backend/modelslab_tts.py b/core/tts_backend/modelslab_tts.py
new file mode 100644
index 00000000..88b9a1fc
--- /dev/null
+++ b/core/tts_backend/modelslab_tts.py
@@ -0,0 +1,147 @@
+import time
+import requests
+from pathlib import Path
+from typing import Optional
+
+from core.utils import load_key, except_handler
+
+# ModelsLab TTS API endpoints
+_API_URL = "https://modelslab.com/api/v6/voice/text_to_speech"
+_FETCH_URL = "https://modelslab.com/api/v6/voice/fetch/{}"
+
+# Request limits and polling schedule (60 polls x 5 s = 5 minutes)
+_MAX_CHARS = 2500
+_POLL_INTERVAL_S = 5
+_MAX_POLLS = 60
+
+# Voice name → voice_id mapping
+# Full list: https://docs.modelslab.com/text-to-audio/realtime/tts
+VOICE_OPTIONS = {
+    "Bella": 1,
+    "Antoni": 2,
+    "Elli": 3,
+    "Josh": 4,
+    "Arnold": 5,
+    "Adam": 6,
+    "Sam": 7,
+    "Rachel": 8,
+    "Domi": 9,
+    "Gigi": 10,
+}
+
+
+def _extract_audio_url(data: dict) -> Optional[str]:
+    """Return the first audio URL found in a ModelsLab response, or None."""
+    output = data.get("output") or data.get("output_url")
+    if isinstance(output, list):
+        # Guard the empty list so a malformed response cannot raise IndexError.
+        return output[0] if output else None
+    return output or None
+
+
+def _poll_for_audio_url(request_id, api_key: str) -> str:
+    """
+    Poll the fetch endpoint until the queued request yields an audio URL.
+
+    Raises RuntimeError on a missing request id, a non-"processing" failure
+    status, or a "success" response without a URL; raises TimeoutError once
+    the polling budget is exhausted.
+    """
+    if not request_id:
+        raise RuntimeError("ModelsLab returned 'processing' but no request id")
+
+    fetch_url = _FETCH_URL.format(request_id)
+    fetch_payload = {"key": api_key}
+
+    for _ in range(_MAX_POLLS):
+        time.sleep(_POLL_INTERVAL_S)
+        fetch_resp = requests.post(fetch_url, json=fetch_payload, timeout=(10, 30))
+        fetch_resp.raise_for_status()
+        fetch_data = fetch_resp.json()
+        status = fetch_data.get("status")
+
+        if status == "success":
+            audio_url = _extract_audio_url(fetch_data)
+            if not audio_url:
+                # Report the real problem instead of a misleading timeout.
+                raise RuntimeError(f"ModelsLab fetch returned no audio URL: {fetch_data}")
+            return audio_url
+        if status != "processing":
+            raise RuntimeError(f"ModelsLab fetch error: {fetch_data}")
+
+    raise TimeoutError("ModelsLab TTS timed out waiting for audio generation")
+
+
+@except_handler("Failed to generate audio using ModelsLab TTS", retry=3, delay=2)
+def modelslab_tts(text: str, save_path: str) -> None:
+    """
+    Generate speech using the ModelsLab TTS API and save it as a WAV file.
+
+    Config keys (in config.yaml under ``modelslab_tts``):
+        api_key — ModelsLab API key (https://modelslab.com)
+        voice   — Voice name, one of: Bella, Antoni, Elli, Josh, Arnold, Adam,
+                  Sam, Rachel, Domi, Gigi (default: Bella)
+
+    ModelsLab TTS has a 2 500-character limit per request; longer text is
+    truncated and a notice is printed.
+    Responses with ``status == "processing"`` are polled every 5 s for up to 5 min.
+
+    Raises ValueError for an empty API key or unknown voice, RuntimeError for
+    API errors or a response without audio, and TimeoutError when polling
+    runs out; all of these pass through the ``except_handler`` decorator.
+    """
+    api_key = load_key("modelslab_tts.api_key")
+    voice_name = load_key("modelslab_tts.voice") or "Bella"
+
+    if not api_key:
+        raise ValueError("ModelsLab API key is not set (modelslab_tts.api_key)")
+    if voice_name not in VOICE_OPTIONS:
+        raise ValueError(
+            f"Invalid voice '{voice_name}'. Choose from: {', '.join(VOICE_OPTIONS)}"
+        )
+    voice_id = VOICE_OPTIONS[voice_name]
+
+    # Enforce API character limit, but never drop text silently
+    if len(text) > _MAX_CHARS:
+        print(f"ModelsLab TTS: text truncated from {len(text)} to {_MAX_CHARS} characters")
+        text = text[:_MAX_CHARS]
+
+    payload = {
+        "key": api_key,
+        "prompt": text,
+        # NOTE(review): language is hard-coded; confirm this is intended for
+        # non-English dubbing targets.
+        "language": "English",
+        "voice_id": voice_id,
+        "audio_format": "wav",
+    }
+
+    response = requests.post(_API_URL, json=payload, timeout=(15, 60))
+    response.raise_for_status()
+    data = response.json()
+    status = data.get("status")
+
+    if status == "success":
+        audio_url = _extract_audio_url(data)
+        if not audio_url:
+            raise RuntimeError(f"ModelsLab TTS returned no audio URL: {data}")
+    elif status == "processing":
+        audio_url = _poll_for_audio_url(data.get("id"), api_key)
+    else:
+        raise RuntimeError(f"ModelsLab TTS API error: {data}")
+
+    # Download and save the generated audio
+    audio_resp = requests.get(audio_url, timeout=(10, 120))
+    audio_resp.raise_for_status()
+    if not audio_resp.content:
+        raise RuntimeError("ModelsLab TTS download returned an empty audio file")
+
+    save_file = Path(save_path)
+    save_file.parent.mkdir(parents=True, exist_ok=True)
+    save_file.write_bytes(audio_resp.content)
+
+    print(f"ModelsLab TTS: audio saved to {save_file}")
+
+
+if __name__ == "__main__":
+    modelslab_tts("Hello! This is a test of ModelsLab text-to-speech.", "test_modelslab.wav")
diff --git a/core/tts_backend/tts_main.py b/core/tts_backend/tts_main.py index 49cc0c29..c8a16fd1 100644 --- a/core/tts_backend/tts_main.py +++ b/core/tts_backend/tts_main.py @@ -13,6 +13,7 @@ from core.tts_backend.custom_tts import custom_tts from core.prompts import get_correct_text_prompt from core.tts_backend._302_f5tts import f5_tts_for_videolingo +from core.tts_backend.modelslab_tts import modelslab_tts from core.utils import * def clean_text_for_tts(text): @@ -64,6 +65,8 @@ def tts_main(text, save_as, number, task_df): cosyvoice_tts_for_videolingo(text, save_as, number, task_df) elif TTS_METHOD == 'f5tts': f5_tts_for_videolingo(text, save_as, number, task_df) + elif TTS_METHOD == 'modelslab_tts': + modelslab_tts(text, save_as) # Check generated audio duration duration = get_audio_duration(save_as)