Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ reflect_translate: true
pause_before_translate: false

## ======================== Dubbing Settings ======================== ##
# TTS selection [sf_fish_tts, openai_tts, gpt_sovits, azure_tts, fish_tts, edge_tts, custom_tts]
# TTS selection [sf_fish_tts, openai_tts, gpt_sovits, azure_tts, fish_tts, edge_tts, custom_tts, sf_cosyvoice2, f5tts, modelslab_tts]
tts_method: 'azure_tts'

# SiliconFlow FishTTS
Expand Down Expand Up @@ -119,6 +119,13 @@ gpt_sovits:
f5tts:
302_api: 'YOUR_302_API_KEY'

# ModelsLab TTS configuration
# Sign up at https://modelslab.com to get your API key
modelslab_tts:
api_key: 'YOUR_MODELSLAB_API_KEY'
# Voice options: Bella, Antoni, Elli, Josh, Arnold, Adam, Sam, Rachel, Domi, Gigi
voice: 'Bella'

# *Audio speed range
speed_factor:
min: 1
Expand Down
16 changes: 15 additions & 1 deletion core/st_utils/sidebar_setting.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def page_setting():
update_key("burn_subtitles", burn_subtitles)
st.rerun()
with st.expander(t("Dubbing Settings"), expanded=True):
tts_methods = ["azure_tts", "openai_tts", "fish_tts", "sf_fish_tts", "edge_tts", "gpt_sovits", "custom_tts", "sf_cosyvoice2", "f5tts"]
tts_methods = ["azure_tts", "openai_tts", "fish_tts", "sf_fish_tts", "edge_tts", "gpt_sovits", "custom_tts", "sf_cosyvoice2", "f5tts", "modelslab_tts"]
select_tts = st.selectbox(t("TTS Method"), options=tts_methods, index=tts_methods.index(load_key("tts_method")))
if select_tts != load_key("tts_method"):
update_key("tts_method", select_tts)
Expand Down Expand Up @@ -151,6 +151,20 @@ def page_setting():

elif select_tts == "f5tts":
config_input("302ai API", "f5tts.302_api")

elif select_tts == "modelslab_tts":
config_input(t("ModelsLab API Key"), "modelslab_tts.api_key")
from core.tts_backend.modelslab_tts import VOICE_OPTIONS
voice_names = list(VOICE_OPTIONS.keys())
current_voice = load_key("modelslab_tts.voice") or "Bella"
selected_voice = st.selectbox(
t("ModelsLab Voice"),
options=voice_names,
index=voice_names.index(current_voice) if current_voice in voice_names else 0,
)
if selected_voice != load_key("modelslab_tts.voice"):
update_key("modelslab_tts.voice", selected_voice)
st.rerun()

def check_api():
try:
Expand Down
112 changes: 112 additions & 0 deletions core/tts_backend/modelslab_tts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
import time
import requests
from pathlib import Path

from core.utils import load_key, except_handler

# ModelsLab endpoints: the first submits a synthesis request, the second is
# formatted with a request id to fetch a result that is still being generated.
_API_URL = "https://modelslab.com/api/v6/voice/text_to_speech"
_FETCH_URL = "https://modelslab.com/api/v6/voice/fetch/{}"

# Voice display name -> numeric voice_id sent to the API.
# IDs are consecutive, starting at 1, in the order the names are listed here.
# Full list: https://docs.modelslab.com/text-to-audio/realtime/tts
VOICE_OPTIONS = {
    voice: voice_id
    for voice_id, voice in enumerate(
        (
            "Bella",
            "Antoni",
            "Elli",
            "Josh",
            "Arnold",
            "Adam",
            "Sam",
            "Rachel",
            "Domi",
            "Gigi",
        ),
        start=1,
    )
}


def _extract_audio_url(result: dict) -> str:
    """Return the audio URL carried by a successful ModelsLab response.

    The URL is read from ``output`` and, when that is empty, from
    ``output_url``. Either field may hold a string or a list, in which case
    the first item is used.

    Raises:
        RuntimeError: If no non-empty string URL can be found, so callers
            never go on to download from ``None``.
    """
    output = result.get("output") or result.get("output_url")
    if isinstance(output, list):
        output = output[0] if output else None
    if not isinstance(output, str) or not output:
        raise RuntimeError(f"ModelsLab TTS returned no audio URL: {result}")
    return output


def _poll_audio_url(request_id, api_key: str, interval: float = 5, attempts: int = 60) -> str:
    """Poll the fetch endpoint until the queued audio is ready and return its URL.

    Sleeps ``interval`` seconds before each of at most ``attempts`` requests
    (5 s x 60 = roughly 5 minutes of waiting by default).

    Raises:
        RuntimeError: If the API reports a status other than ``"success"``
            or ``"processing"``, or reports success without an audio URL.
        TimeoutError: If the audio is still processing after the last attempt.
    """
    fetch_url = _FETCH_URL.format(request_id)
    fetch_payload = {"key": api_key}

    for _ in range(attempts):
        time.sleep(interval)
        fetch_resp = requests.post(fetch_url, json=fetch_payload, timeout=(10, 30))
        fetch_resp.raise_for_status()
        fetch_data = fetch_resp.json()

        fetch_status = fetch_data.get("status")
        if fetch_status == "success":
            return _extract_audio_url(fetch_data)
        if fetch_status != "processing":
            raise RuntimeError(f"ModelsLab fetch error: {fetch_data}")

    raise TimeoutError("ModelsLab TTS timed out waiting for audio generation")


@except_handler("Failed to generate audio using ModelsLab TTS", retry=3, delay=2)
def modelslab_tts(text: str, save_path: str) -> None:
    """
    Generate speech using the ModelsLab TTS API and save it as a WAV file.

    Config keys (in config.yaml under ``modelslab_tts``):
        api_key — ModelsLab API key (https://modelslab.com)
        voice   — Voice name, one of: Bella, Antoni, Elli, Josh, Arnold, Adam,
                  Sam, Rachel, Domi, Gigi (default: Bella)

    Args:
        text: Text to synthesize. Anything beyond 2500 characters is dropped
            (with a printed warning) because of the per-request limit.
        save_path: Destination file; missing parent directories are created.

    Raises:
        ValueError: If the API key is empty or the configured voice is unknown.
        RuntimeError: If the API reports an error, omits the request id or the
            audio URL, or the downloaded audio is empty.
        TimeoutError: If a queued request is still processing after polling
            every 5 s for about 5 minutes.
        requests.HTTPError: If any HTTP call returns an error status.

    Exceptions pass through the ``except_handler`` decorator, configured here
    with ``retry=3, delay=2``.
    """
    max_chars = 2500  # ModelsLab per-request character limit

    api_key = load_key("modelslab_tts.api_key")
    if not api_key:
        # Fail before any network call instead of sending an unauthenticated request.
        raise ValueError(
            "ModelsLab API key is not set (config key 'modelslab_tts.api_key')"
        )

    voice_name = load_key("modelslab_tts.voice") or "Bella"
    if voice_name not in VOICE_OPTIONS:
        raise ValueError(
            f"Invalid voice '{voice_name}'. Choose from: {', '.join(VOICE_OPTIONS)}"
        )
    voice_id = VOICE_OPTIONS[voice_name]

    # Enforce the API character limit, but make the loss of text visible.
    if len(text) > max_chars:
        print(
            f"ModelsLab TTS: text truncated from {len(text)} to {max_chars} characters"
        )
        text = text[:max_chars]

    payload = {
        "key": api_key,
        "prompt": text,
        "language": "English",
        "voice_id": voice_id,
        "audio_format": "wav",
    }

    response = requests.post(_API_URL, json=payload, timeout=(15, 60))
    response.raise_for_status()
    data = response.json()

    status = data.get("status")
    if status == "success":
        audio_url = _extract_audio_url(data)
    elif status == "processing":
        # The job was queued; its result has to be fetched by request id.
        request_id = data.get("id")
        if not request_id:
            raise RuntimeError("ModelsLab returned 'processing' but no request id")
        audio_url = _poll_audio_url(request_id, api_key)
    else:
        raise RuntimeError(f"ModelsLab TTS API error: {data}")

    # Download and save the generated audio
    audio_resp = requests.get(audio_url, timeout=(10, 120))
    audio_resp.raise_for_status()
    if not audio_resp.content:
        # Do not leave a zero-byte file on disk for later steps to read.
        raise RuntimeError(f"ModelsLab TTS returned empty audio from {audio_url}")

    save_file = Path(save_path)
    save_file.parent.mkdir(parents=True, exist_ok=True)
    save_file.write_bytes(audio_resp.content)

    print(f"ModelsLab TTS: audio saved to {save_file}")


if __name__ == "__main__":
    # Manual smoke test: synthesize one sentence into the working directory.
    _demo_text = "Hello! This is a test of ModelsLab text-to-speech."
    _demo_path = "test_modelslab.wav"
    modelslab_tts(_demo_text, _demo_path)
3 changes: 3 additions & 0 deletions core/tts_backend/tts_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from core.tts_backend.custom_tts import custom_tts
from core.prompts import get_correct_text_prompt
from core.tts_backend._302_f5tts import f5_tts_for_videolingo
from core.tts_backend.modelslab_tts import modelslab_tts
from core.utils import *

def clean_text_for_tts(text):
Expand Down Expand Up @@ -64,6 +65,8 @@ def tts_main(text, save_as, number, task_df):
cosyvoice_tts_for_videolingo(text, save_as, number, task_df)
elif TTS_METHOD == 'f5tts':
f5_tts_for_videolingo(text, save_as, number, task_df)
elif TTS_METHOD == 'modelslab_tts':
modelslab_tts(text, save_as)

# Check generated audio duration
duration = get_audio_duration(save_as)
Expand Down