Huanshere · octo-patch · Mar 17, 2026
diff --git a/config.yaml b/config.yaml
@@ -70,7 +70,7 @@ reflect_translate: true
 pause_before_translate: false
 
 ## ======================== Dubbing Settings ======================== ##
-# TTS selection [sf_fish_tts, openai_tts, gpt_sovits, azure_tts, fish_tts, edge_tts, custom_tts]
+# TTS selection [sf_fish_tts, openai_tts, gpt_sovits, azure_tts, fish_tts, edge_tts, custom_tts, sf_cosyvoice2, f5tts, minimax_tts]
 tts_method: 'azure_tts'
 
 # SiliconFlow FishTTS
@@ -119,6 +119,12 @@ gpt_sovits:
 f5tts:
   302_api: 'YOUR_302_API_KEY'
 
+# MiniMax TTS configuration; get an API key from https://www.minimax.io
+minimax_tts:
+  api_key: 'YOUR_MINIMAX_API_KEY'
+  model: 'speech-2.8-hd'
+  voice: 'English_Graceful_Lady'
+
 # *Audio speed range
 speed_factor:
   min: 1

diff --git a/core/st_utils/sidebar_setting.py b/core/st_utils/sidebar_setting.py
@@ -84,7 +84,7 @@ def page_setting():
             update_key("burn_subtitles", burn_subtitles)
             st.rerun()
     with st.expander(t("Dubbing Settings"), expanded=True):
-        tts_methods = ["azure_tts", "openai_tts", "fish_tts", "sf_fish_tts", "edge_tts", "gpt_sovits", "custom_tts", "sf_cosyvoice2", "f5tts"]
+        tts_methods = ["azure_tts", "openai_tts", "fish_tts", "sf_fish_tts", "edge_tts", "gpt_sovits", "custom_tts", "sf_cosyvoice2", "f5tts", "minimax_tts"]
         select_tts = st.selectbox(t("TTS Method"), options=tts_methods, index=tts_methods.index(load_key("tts_method")))
         if select_tts != load_key("tts_method"):
             update_key("tts_method", select_tts)
@@ -151,6 +151,41 @@ def page_setting():
 
         elif select_tts == "f5tts":
             config_input("302ai API", "f5tts.302_api")
+
+        elif select_tts == "minimax_tts":
+            config_input(t("MiniMax API Key"), "minimax_tts.api_key")
+            minimax_voices = {
+                "English_Graceful_Lady": t("English Female, Graceful"),
+                "English_Insightful_Speaker": t("English Male, Insightful"),
+                "English_radiant_girl": t("English Female, Radiant"),
+                "English_Persuasive_Man": t("English Male, Persuasive"),
+                "English_Lucky_Robot": t("English, Robot"),
+                "Wise_Woman": t("Female, Wise"),
+                "Friendly_Person": t("Friendly"),
+                "Inspirational_girl": t("Female, Inspirational"),
+                "Deep_Voice_Man": t("Male, Deep Voice"),
+                "sweet_girl": t("Female, Sweet"),
+                "cute_boy": t("Male, Cute"),
+                "lovely_girl": t("Female, Lovely"),
+            }
+            selected_voice = st.selectbox(
+                t("MiniMax Voice"),
+                options=list(minimax_voices.keys()),
+                format_func=lambda x: minimax_voices[x],
+                index=list(minimax_voices.keys()).index(load_key("minimax_tts.voice")) if load_key("minimax_tts.voice") in minimax_voices else 0
+            )
+            if selected_voice != load_key("minimax_tts.voice"):
+                update_key("minimax_tts.voice", selected_voice)
+                st.rerun()
+            minimax_models = ["speech-2.8-hd", "speech-2.8-turbo"]
+            selected_model = st.selectbox(
+                t("MiniMax TTS Model"),
+                options=minimax_models,
+                index=minimax_models.index(load_key("minimax_tts.model")) if load_key("minimax_tts.model") in minimax_models else 0
+            )
+            if selected_model != load_key("minimax_tts.model"):
+                update_key("minimax_tts.model", selected_model)
+                st.rerun()
 
 def check_api():
     try:

diff --git a/core/tts_backend/minimax_tts.py b/core/tts_backend/minimax_tts.py
@@ -0,0 +1,105 @@
+import io
+import requests
+from pathlib import Path
+from pydub import AudioSegment
+from core.utils import load_key, except_handler
+
+BASE_URL = "https://api.minimax.io/v1/t2a_v2"
+# Alternate host for the same endpoint path; defined for reference, not used below.
+BASE_URL_CN = "https://api.minimaxi.com/v1/t2a_v2"
+
+# Voice ids accepted by minimax_tts(); any other configured voice raises ValueError.
+VOICE_LIST = [
+    "English_Graceful_Lady",
+    "English_Insightful_Speaker",
+    "English_radiant_girl",
+    "English_Persuasive_Man",
+    "English_Lucky_Robot",
+    "Wise_Woman",
+    "Friendly_Person",
+    "Inspirational_girl",
+    "Deep_Voice_Man",
+    "sweet_girl",
+    "cute_boy",
+    "lovely_girl",
+]
+
+# Known model names; minimax_tts() does not validate the configured model against this list.
+MODEL_LIST = ["speech-2.8-hd", "speech-2.8-turbo"]
+
+# refer to: https://platform.minimax.io/docs/api-reference/speech-t2a-http
+
+@except_handler("Failed to generate audio using MiniMax TTS", retry=3, delay=1)
+def minimax_tts(text, save_path):
+    """Synthesize ``text`` with the MiniMax T2A v2 HTTP API and save it as WAV.
+
+    The API key, voice and model are read from the ``minimax_tts.*`` config
+    keys. The service is asked for MP3 audio, which is read hex-encoded from
+    the JSON response and converted to WAV with pydub.
+
+    Args:
+        text: Text to synthesize.
+        save_path: Destination of the WAV file; missing parent directories
+            are created.
+
+    Raises:
+        ValueError: If the configured voice is not in ``VOICE_LIST`` or the
+            response carries no audio payload.
+        requests.HTTPError: If the API answers with an HTTP error status.
+
+    Exceptions raised here pass through the ``except_handler`` decorator
+    (retry=3, delay=1); how they reach the caller depends on that decorator.
+    """
+    api_key = load_key("minimax_tts.api_key")
+    voice = load_key("minimax_tts.voice")
+    model = load_key("minimax_tts.model")
+
+    if voice not in VOICE_LIST:
+        raise ValueError(f"Invalid voice: {voice}. Please choose from {VOICE_LIST}")
+
+    payload = {
+        "model": model,
+        "text": text,
+        "stream": False,
+        "voice_setting": {
+            "voice_id": voice,
+            "speed": 1.0,
+            "vol": 1.0,
+            "pitch": 0,
+        },
+        "audio_setting": {
+            "format": "mp3",
+            "sample_rate": 32000,
+        },
+    }
+
+    headers = {
+        "Authorization": f"Bearer {api_key}",
+        "Content-Type": "application/json",
+    }
+
+    response = requests.post(BASE_URL, headers=headers, json=payload, timeout=60)
+    response.raise_for_status()
+
+    result = response.json()
+    # Guard against a null/non-dict "data" or an empty "audio" field so that a
+    # malformed or error response raises the descriptive ValueError below
+    # rather than a TypeError from a membership test on None.
+    data = result.get("data") if isinstance(result, dict) else None
+    audio_hex = data.get("audio") if isinstance(data, dict) else None
+    if not audio_hex:
+        raise ValueError(f"Unexpected API response: {result}")
+
+    audio_bytes = bytes.fromhex(audio_hex)
+
+    # Convert mp3 to wav using pydub
+    speech_file_path = Path(save_path)
+    speech_file_path.parent.mkdir(parents=True, exist_ok=True)
+
+    audio = AudioSegment.from_mp3(io.BytesIO(audio_bytes))
+    audio.export(save_path, format="wav")
+    print(f"Audio saved to {speech_file_path}")
+
+
+if __name__ == "__main__":
+    minimax_tts("Hi! Welcome to VideoLingo!", "test.wav")
diff --git a/core/tts_backend/tts_main.py b/core/tts_backend/tts_main.py
@@ -11,6 +11,7 @@
 from core.tts_backend.edge_tts import edge_tts
 from core.tts_backend.sf_cosyvoice2 import cosyvoice_tts_for_videolingo
 from core.tts_backend.custom_tts import custom_tts
+from core.tts_backend.minimax_tts import minimax_tts
 from core.prompts import get_correct_text_prompt
 from core.tts_backend._302_f5tts import f5_tts_for_videolingo
 from core.utils import *
@@ -64,6 +65,8 @@ def tts_main(text, save_as, number, task_df):
                 cosyvoice_tts_for_videolingo(text, save_as, number, task_df)
             elif TTS_METHOD == 'f5tts':
                 f5_tts_for_videolingo(text, save_as, number, task_df)
+            elif TTS_METHOD == 'minimax_tts':
+                minimax_tts(text, save_as)
 
             # Check generated audio duration
             duration = get_audio_duration(save_as)

diff --git a/docs/pages/docs/start.en-US.md b/docs/pages/docs/start.en-US.md
@@ -34,9 +34,11 @@ VideoLingo provides multiple TTS integration methods. Here's a comparison (skip
 | 🎙️ SiliconFlow FishTTS | [SiliconFlow](https://cloud.siliconflow.cn/i/ttKDEsxE) | Voice Clone | Unstable cloning effect | 😃 | 😃 |
 | 🗣 Edge TTS | Local | Completely free | Average effect | 😐 | 😐 |
 | 🗣️ GPT-SoVITS | Local | Best voice cloning | Only supports Chinese/English, requires local inference, complex setup | 🏆 | 🚫 |
+| 🔊 MiniMax TTS | [MiniMax](https://www.minimax.io) | HD quality, many voices | Requires API key | 😃 | 🤩 |
 
 - For SiliconFlow FishTTS, get key from [SiliconFlow](https://cloud.siliconflow.cn/i/ttKDEsxE), note that cloning feature requires paid credits;
 - For OpenAI TTS, Azure TTS, and Fish TTS, use [302AI](https://gpt302.saaslink.net/C2oHR9) - one API key provides access to all three services
+- For MiniMax TTS, get an API key from [MiniMax](https://www.minimax.io); it supports the `speech-2.8-hd` (recommended) and `speech-2.8-turbo` models
 > Wanna use your own TTS? Modify in `core/all_tts_functions/custom_tts.py`!
 
 <details>

diff --git a/docs/pages/docs/start.zh-CN.md b/docs/pages/docs/start.zh-CN.md
@@ -34,9 +34,11 @@ VideoLingo提供了多种 tts 接入方式，以下是对比（如不使用配
 | 🎙️ SiliconFlow FishTTS | [硅基流动](https://cloud.siliconflow.cn/i/ttKDEsxE) | 语音克隆 | 克隆效果不稳定 | 😃 | 😃 |
 | 🗣 Edge TTS | 本地 | 完全免费 | 效果一般 | 😐 | 😐 |
 | 🗣️ GPT-SoVITS | 本地 | 最强语音克隆 | 只支持中英文，需要本地训练推理，配置麻烦 | 🏆 | 🚫 |
+| 🔊 MiniMax TTS | [MiniMax](https://www.minimax.io) | 高清音质，多种音色 | 需要API密钥 | 😃 | 🤩 |
 
 - SiliconFlow FishTTS 请在 [硅基流动](https://cloud.siliconflow.cn/i/ttKDEsxE) 获取key，注意克隆功能需要付费充值积分；
 - OpenAI TTS、Azure TTS 和 Fish TTS，仅支持 [302AI](https://gpt302.saaslink.net/C2oHR9) - 一个 API key 即可使用所有服务
+- MiniMax TTS 请在 [MiniMax](https://www.minimax.io) 获取API密钥，支持 `speech-2.8-hd`（推荐）和 `speech-2.8-turbo` 模型
 > 现在还可以在 `core/all_tts_functions/custom_tts.py` 里自定义tts渠道！
 
 <details>

diff --git a/tests/__init__.py b/tests/__init__.py