Huanshere · 0000sir · Dec 2, 2024 · Dec 2, 2024 · Dec 2, 2024 · Feb 6, 2025
diff --git a/config.yaml b/config.yaml
@@ -85,6 +85,21 @@ fish_tts:
     'AD学姐': '7f92f8afb8ec43bf81429cc1c9199cb1'
     '丁真': '54a5170264694bfc8e9ad98df7bd89c3'
 
+cosyvoice:
+  api_url: ''
+  speaker: "英文女"
+  speed: '0.7'
+
+cosyvoice_cloud:
+  api_key: ''
+  model: ''
+  speaker: 'longfei'
+  speed: '1.0'
+
+sambert:
+  api_key: ''
+  speaker: 'sambert-zhihao-v1'
+
 # *Audio speed range
 speed_factor:
   min: 1

diff --git a/core/all_tts_functions/cosyvoice_cloud.py b/core/all_tts_functions/cosyvoice_cloud.py
@@ -0,0 +1,45 @@
+# coding=utf-8
+import os, sys
+import dashscope
+from dashscope.audio.tts_v2 import AudioFormat, SpeechSynthesizer
+sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
+from core.config_utils import load_key
+
+def cosyvoice_cloud(text: str, save_path: str):
+    """Synthesize ``text`` with the DashScope CosyVoice API and write it to ``save_path``.
+
+    The API key, voice and speech rate are read from the ``cosyvoice_cloud``
+    section of the configuration. ``cosyvoice_cloud.model`` is used when it is
+    set; an empty value falls back to ``cosyvoice-v1``.
+
+    Args:
+        text: Text to synthesize.
+        save_path: Destination file for the returned audio bytes.
+
+    Raises:
+        RuntimeError: If the synthesizer returns no audio data.
+    """
+    dashscope.api_key = load_key("cosyvoice_cloud.api_key")
+    # The config ships with an empty model; treat that as "use the default".
+    model = load_key("cosyvoice_cloud.model") or "cosyvoice-v1"
+    voice = load_key("cosyvoice_cloud.speaker")
+    # The speed is stored as a string in the config (e.g. '1.0').
+    speech_rate = float(load_key("cosyvoice_cloud.speed"))
+    synthesizer = SpeechSynthesizer(
+        model=model,
+        voice=voice,
+        speech_rate=speech_rate,
+        format=AudioFormat.WAV_22050HZ_MONO_16BIT,
+    )
+    audio = synthesizer.call(text)
+    request_id = synthesizer.get_last_request_id()
+    print('requestId: ', request_id)
+    if not audio:
+        # Fail loudly instead of handing None to f.write() or saving an empty file.
+        raise RuntimeError(f"CosyVoice returned no audio data (requestId: {request_id})")
+    with open(save_path, 'wb') as f:
+        f.write(audio)
+
+if __name__ == "__main__":
+    text = """操作柔软物体非常难"""
+    cosyvoice_cloud(text, "output/cosyvoice-cloud.wav")
diff --git a/core/all_tts_functions/cosyvoice_tts.py b/core/all_tts_functions/cosyvoice_tts.py
@@ -0,0 +1,61 @@
+import requests
+import os, sys
+sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
+from core.config_utils import load_key
+
+def cosyvoice_tts(text: str, save_path: str):
+    """Synthesize ``text`` through a CosyVoice HTTP API and save the audio to ``save_path``.
+
+    The endpoint, speaker and speed are read from the ``cosyvoice`` section of
+    the configuration. ``cosyvoice.api_url`` is posted to as-is, so it must be
+    the full endpoint URL (e.g. ``http://<host>:9872/tts``).
+
+    Args:
+        text: Text to synthesize.
+        save_path: Destination file for the WAV data returned by the server.
+
+    Raises:
+        ValueError: If ``cosyvoice.api_url`` is empty.
+        RuntimeError: If the server answers with a non-200 status code.
+    """
+    url = load_key("cosyvoice.api_url")
+    if not url:
+        raise ValueError("cosyvoice.api_url is not configured")
+    data = {
+        "text": text,
+        "role": load_key("cosyvoice.speaker"),
+        # The speed is stored as a string in the config (e.g. '0.7').
+        "speed": float(load_key("cosyvoice.speed")),
+    }
+
+    # Very long timeout (one hour); presumably synthesis of long texts is slow.
+    response = requests.post(url, data=data, timeout=3600)
+
+    if response.status_code != 200:
+        # Raise instead of printing so the caller does not continue without an
+        # output file; use .text because the error body may not be valid JSON.
+        raise RuntimeError(f"CosyVoice request failed ({response.status_code}): {response.text}")
+
+    # The response body is the WAV data stream and can be saved directly.
+    with open(save_path, 'wb') as f:
+        f.write(response.content)
+
+if __name__ == "__main__":
+    text = """
+    豫章故郡，洪都新府。星分翼轸，地接衡庐。襟三江而带五湖，控蛮荆而引瓯越。物华天宝，龙光射牛斗之墟；人杰地灵，徐孺下陈蕃之榻。雄州雾列，俊采星驰。台隍枕夷夏之交，宾主尽东南之美。都督阎公之雅望，棨戟遥临；宇文新州之懿范，襜帷暂驻。十旬休假，胜友如云；千里逢迎，高朋满座。腾蛟起凤，孟学士之词宗；紫电青霜，王将军之武库。家君作宰，路出名区；童子何知，躬逢胜饯。
+    时维九月，序属三秋。潦水尽而寒潭清，烟光凝而暮山紫。俨骖騑于上路，访风景于崇阿。临帝子之长洲，得天人之旧馆。层峦耸翠，上出重霄；飞阁流丹，下临无地。鹤汀凫渚，穷岛屿之萦回；桂殿兰宫，即冈峦之体势。
+    披绣闼，俯雕甍，山原旷其盈视，川泽纡其骇瞩。闾阎扑地，钟鸣鼎食之家；舸舰弥津，青雀黄龙之舳。云销雨霁，彩彻区明。落霞与孤鹜齐飞，秋水共长天一色。渔舟唱晚，响穷彭蠡之滨，雁阵惊寒，声断衡阳之浦。
+    遥襟甫畅，逸兴遄飞。爽籁发而清风生，纤歌凝而白云遏。睢园绿竹，气凌彭泽之樽；邺水朱华，光照临川之笔。四美具，二难并。穷睇眄于中天，极娱游于暇日。天高地迥，觉宇宙之无穷；兴尽悲来，识盈虚之有数。望长安于日下，目吴会于云间。地势极而南溟深，天柱高而北辰远。关山难越，谁悲失路之人；萍水相逢，尽是他乡之客。怀帝阍而不见，奉宣室以何年？
+    嗟乎！时运不齐，命途多舛。冯唐易老，李广难封。屈贾谊于长沙，非无圣主；窜梁鸿于海曲，岂乏明时？所赖君子见机，达人知命。老当益壮，宁移白首之心？穷且益坚，不坠青云之志。酌贪泉而觉爽，处涸辙以犹欢。北海虽赊，扶摇可接；东隅已逝，桑榆非晚。孟尝高洁，空余报国之情；阮籍猖狂，岂效穷途之哭！
+    勃，三尺微命，一介书生。无路请缨，等终军之弱冠；有怀投笔，慕宗悫之长风。舍簪笏于百龄，奉晨昏于万里。非谢家之宝树，接孟氏之芳邻。他日趋庭，叨陪鲤对；今兹捧袂，喜托龙门。杨意不逢，抚凌云而自惜；钟期既遇，奏流水以何惭？
+    呜呼！胜地不常，盛筵难再；兰亭已矣，梓泽丘墟。临别赠言，幸承恩于伟饯；登高作赋，是所望于群公。敢竭鄙怀，恭疏短引；一言均赋，四韵俱成。请洒潘江，各倾陆海云尔。
+　　 滕王高阁临江渚，佩玉鸣鸾罢歌舞。
+　　 画栋朝飞南浦云，珠帘暮卷西山雨。
+　　 闲云潭影日悠悠，物换星移几度秋。
+　　 阁中帝子今何在？槛外长江空自流。
+    """
+    cosyvoice_tts(text, "output/cosyvoice_tts_cn.wav")
+    text = """
+    One winter day, the queen was doing needle work while gazing out her ebony window at the new fallen snow.  A bird flew by the window startling the queen and she pricked her finger.  A single drop of blood fell on the snow outside her window.  As she looked at the blood on the snow she said to herself, "Oh, how I wish that I had a daughter that had skin as white as snow, lips as red as blood, and hair as black as ebony."  
+    """
+    # cosyvoice_tts(text, "output/cosyvoice_tts_en.wav")
diff --git a/core/all_tts_functions/sambert_cloud.py b/core/all_tts_functions/sambert_cloud.py
@@ -0,0 +1,39 @@
+# coding=utf-8
+import os,sys
+import dashscope
+from dashscope.audio.tts import SpeechSynthesizer
+sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
+from core.config_utils import load_key
+
+def sambert_cloud(text: str, save_path: str):
+    """Synthesize ``text`` with the DashScope Sambert API and write it to ``save_path``.
+
+    The API key and the speaker are read from the ``sambert`` section of the
+    configuration. The speaker value (e.g. ``sambert-zhihao-v1``) is passed to
+    the API as the model name.
+
+    Args:
+        text: Text to synthesize.
+        save_path: Destination file for the returned audio data.
+
+    Raises:
+        RuntimeError: If the response carries no audio data.
+    """
+    dashscope.api_key = load_key("sambert.api_key")
+    # The model is the configured speaker, not the API key.
+    model = load_key("sambert.speaker")
+    result = SpeechSynthesizer.call(model=model,
+                                    text=text,
+                                    sample_rate=48000)
+    audio = result.get_audio_data()
+    if audio is None:
+        # Raise so the caller does not continue without an output file.
+        raise RuntimeError('ERROR: response is %s' % (result.get_response()))
+    with open(save_path, 'wb') as f:
+        f.write(audio)
+    # len() reports the payload size; sys.getsizeof() would include object overhead.
+    print('SUCCESS: get audio data: %dbytes in %s' % (len(audio), save_path))
+
+if __name__ == "__main__":
+    text = "Striking drums and gongs was also prohibited. Playing the huqin was also forbidden. Even playing secretly at home was forbidden."
+    sambert_cloud(text, "output/sambert-cloud.wav")
diff --git a/core/all_tts_functions/tts_main.py b/core/all_tts_functions/tts_main.py
@@ -13,6 +13,9 @@
 from core.all_tts_functions.azure_tts import azure_tts
 from core.ask_gpt import ask_gpt
 from core.prompts_storage import get_correct_text_prompt
+from core.all_tts_functions.cosyvoice_tts import cosyvoice_tts
+from core.all_tts_functions.cosyvoice_cloud import cosyvoice_cloud
+from core.all_tts_functions.sambert_cloud import sambert_cloud
 
 def clean_text_for_tts(text):
     """Remove problematic characters for TTS"""
@@ -21,6 +24,7 @@ def clean_text_for_tts(text):
         text = text.replace(char, '')
     return text.strip()
 
+
 def tts_main(text, save_as, number, task_df):
     text = clean_text_for_tts(text)
     # 检查文本是否为空或单字符，单字符配音容易触发bug
@@ -37,7 +41,7 @@ def tts_main(text, save_as, number, task_df):
 
     print(f"Generating <{text}...>")
     TTS_METHOD = load_key("tts_method")
-    
+
     max_retries = 3
     for attempt in range(max_retries):
         try:
@@ -55,6 +59,12 @@ def tts_main(text, save_as, number, task_df):
                 azure_tts(text, save_as)
             elif TTS_METHOD == 'sf_fish_tts':
                 siliconflow_fish_tts_for_videolingo(text, save_as, number, task_df)
+            elif TTS_METHOD == 'cosyvoice':
+                cosyvoice_tts(text, save_as)
+            elif TTS_METHOD == 'cosyvoice_cloud':
+                cosyvoice_cloud(text, save_as)
+            elif TTS_METHOD == 'sambert':
+                sambert_cloud(text, save_as)
 
             # 检查生成的音频时长
             duration = get_audio_duration(save_as)

diff --git a/core/step10_gen_audio.py b/core/step10_gen_audio.py
@@ -39,6 +39,9 @@ def adjust_audio_speed(input_file: str, output_file: str, speed_factor: float) -
     if abs(speed_factor - 1.0) < 0.001:
         shutil.copy2(input_file, output_file)
         return
+
+    if os.path.exists(output_file):
+        return
 
     atempo = speed_factor
     cmd = ['ffmpeg', '-i', input_file, '-filter:a', f'atempo={atempo}', '-y', output_file]
@@ -99,7 +102,7 @@ def generate_tts_audio(tasks_df: pd.DataFrame) -> pd.DataFrame:
                 raise e
 
         # for gpt_sovits, do not use parallel to avoid mistakes
-        max_workers = load_key("max_workers") if load_key("tts_method") != "gpt_sovits" else 1
+        max_workers = int(load_key("max_workers")) if load_key("tts_method") != "gpt_sovits" else 1
         # parallel processing for remaining tasks
         if len(tasks_df) > warmup_size:
             remaining_tasks = tasks_df.iloc[warmup_size:].copy()

diff --git a/core/step3_2_splitbymeaning.py b/core/step3_2_splitbymeaning.py
@@ -117,7 +117,7 @@ def split_sentences_by_meaning():
     nlp = init_nlp()
     # 🔄 process sentences multiple times to ensure all are split
     for retry_attempt in range(3):
-        sentences = parallel_split_sentences(sentences, max_length=load_key("max_split_length"), max_workers=load_key("max_workers"), nlp=nlp, retry_attempt=retry_attempt)
+        sentences = parallel_split_sentences(sentences, max_length=load_key("max_split_length"), max_workers=int(load_key("max_workers")), nlp=nlp, retry_attempt=retry_attempt)
 
     # 💾 save results
     with open('output/log/sentence_splitbymeaning.txt', 'w', encoding='utf-8') as f:

diff --git a/core/step4_2_translate_all.py b/core/step4_2_translate_all.py
@@ -77,7 +77,7 @@ def translate_all():
         transient=True,
     ) as progress:
         task = progress.add_task("[cyan]Translating chunks...", total=len(chunks))
-        with concurrent.futures.ThreadPoolExecutor(max_workers=load_key("max_workers")) as executor:
+        with concurrent.futures.ThreadPoolExecutor(max_workers=int(load_key("max_workers"))) as executor:
             futures = []
             for i, chunk in enumerate(chunks):
                 future = executor.submit(translate_chunk, chunk, chunks, theme_prompt, i)

diff --git a/core/step5_splitforsub.py b/core/step5_splitforsub.py
@@ -94,7 +94,7 @@ def process(i):
         tr_lines[i] = tr_parts
         remerged_tr_lines[i] = tr_remerged
 
-    with concurrent.futures.ThreadPoolExecutor(max_workers=load_key("max_workers")) as executor:
+    with concurrent.futures.ThreadPoolExecutor(max_workers=int(load_key("max_workers"))) as executor:
         executor.map(process, to_split)
 
     # Flatten `src_lines` and `tr_lines`

diff --git a/core/step9_extract_refer_audio.py b/core/step9_extract_refer_audio.py
@@ -23,6 +23,9 @@ def time_to_samples(time_str, sr):
 
 def extract_audio(audio_data, sr, start_time, end_time, out_file):
     """Simplified audio extraction function"""
+    if os.path.exists(out_file):
+        rprint(f"Output file {out_file} exists, skipping.")
+        return
     start = time_to_samples(start_time, sr)
     end = time_to_samples(end_time, sr)
     sf.write(out_file, audio_data[start:end], sr)

diff --git a/st_components/sidebar_setting.py b/st_components/sidebar_setting.py
@@ -74,7 +74,8 @@ def page_setting():
             update_key("resolution", resolution)
 
     with st.expander("Dubbing Settings", expanded=True):
-        tts_methods = ["sf_fish_tts", "openai_tts", "azure_tts", "gpt_sovits", "fish_tts"]
+        config_input("Max Workers", "max_workers")
+        tts_methods = ["sf_fish_tts", "openai_tts", "azure_tts", "gpt_sovits", "fish_tts", "cosyvoice", "cosyvoice_cloud", "sambert"]
         selected_tts_method = st.selectbox("TTS Method", options=tts_methods, index=tts_methods.index(load_key("tts_method")))
         if selected_tts_method != load_key("tts_method"):
             update_key("tts_method", selected_tts_method)
@@ -130,6 +131,41 @@ def page_setting():
             )
             if selected_refer_mode != load_key("gpt_sovits.refer_mode"):
                 update_key("gpt_sovits.refer_mode", selected_refer_mode)
+        elif selected_tts_method == "cosyvoice":
+            config_input("API URL", "cosyvoice.api_url")
+            speakers = ["中文男","中文女","英文男","英文女","粤语女","日语男","韩语女"]
+            speaker = st.selectbox("发音人", options=speakers, index=speakers.index(load_key("cosyvoice.speaker")))
+            if speaker != load_key("cosyvoice.speaker"):
+                update_key("cosyvoice.speaker", speaker)
+            config_input("语速", "cosyvoice.speed")
+        elif selected_tts_method == "cosyvoice_cloud":
+            config_input("API KEY", "cosyvoice_cloud.api_key", help="Dashscope API KEY")
+            speakers = ["longwan","longcheng","longhua","longxiaochun","longxiaoxia","longxiaocheng","longxiaobai",
+            "longlaotie","longshu","longshuo","longjing","longmiao","longyue","longyuan","longfei","longjielidou",
+            "longtong","longxiang","loongstella","loongbella"]
+            speaker = st.selectbox("发音人", options=speakers, index=speakers.index(load_key("cosyvoice_cloud.speaker")), help="可以在 https://help.aliyun.com/zh/model-studio/developer-reference/timbre-list 查看不同发音人效果")
+            if speaker != load_key("cosyvoice_cloud.speaker"):
+                update_key("cosyvoice_cloud.speaker", speaker)
+            config_input("语速", "cosyvoice_cloud.speed")
+        elif selected_tts_method == "sambert":
+            st.info("Sambert语音合成API基于达摩院改良的自回归韵律模型，支持文本至语音的实时流式合成。")
+            speakers = ["sambert-zhinan-v1","sambert-zhiqi-v1","sambert-zhichu-v1","sambert-zhide-v1","sambert-zhijia-v1",
+            "sambert-zhiru-v1","sambert-zhiqian-v1","sambert-zhixiang-v1","sambert-zhiwei-v1",
+            "sambert-zhihao-v1","sambert-zhijing-v1","sambert-zhiming-v1","sambert-zhimo-v1",
+            "sambert-zhina-v1","sambert-zhishu-v1","sambert-zhistella-v1","sambert-zhiting-v1",
+            "sambert-zhixiao-v1","sambert-zhiya-v1","sambert-zhiye-v1","sambert-zhiying-v1",
+            "sambert-zhiyuan-v1","sambert-zhiyue-v1","sambert-zhigui-v1","sambert-zhishuo-v1",
+            "sambert-zhimiao-emo-v1","sambert-zhimao-v1","sambert-zhilun-v1","sambert-zhifei-v1",
+            "sambert-zhida-v1","sambert-camila-v1","sambert-perla-v1","sambert-indah-v1","sambert-clara-v1",
+            "sambert-hanna-v1","sambert-beth-v1","sambert-betty-v1","sambert-cally-v1","sambert-cindy-v1",
+            "sambert-eva-v1","sambert-donna-v1","sambert-brian-v1","sambert-waan-v1"]
+            config_input("API KEY", "sambert.api_key", help="Dashscope API KEY")
+            # config_input("音色", "sambert.speaker")
+            speaker = st.selectbox("发音人", options=speakers, index=speakers.index(load_key("sambert.speaker")),
+                help="访问 https://help.aliyun.com/zh/model-studio/developer-reference/model-list 查看不同说话人特性")
+            if speaker != load_key("sambert.speaker"):
+                update_key("sambert.speaker", speaker)
+
 
 def check_api():
     try: