diff --git a/config.yaml b/config.yaml
index 371fa1a0..64a857cc 100644
--- a/config.yaml
+++ b/config.yaml
@@ -85,6 +85,21 @@ fish_tts:
   'AD学姐': '7f92f8afb8ec43bf81429cc1c9199cb1'
   '丁真': '54a5170264694bfc8e9ad98df7bd89c3'
+cosyvoice:
+  api_url: ''
+  speaker: "英文女"
+  speed: '0.7'
+
+cosyvoice_cloud:
+  api_key: ''
+  model: ''
+  speaker: 'longfei'
+  speed: '1.0'
+
+sambert:
+  api_key: ''
+  speaker: 'sambert-zhihao-v1'
+
 # *Audio speed range
 speed_factor:
   min: 1
diff --git a/core/all_tts_functions/cosyvoice_cloud.py b/core/all_tts_functions/cosyvoice_cloud.py
new file mode 100644
index 00000000..d80e14eb
--- /dev/null
+++ b/core/all_tts_functions/cosyvoice_cloud.py
@@ -0,0 +1,21 @@
+# coding=utf-8
+import os, sys
+import dashscope
+from dashscope.audio.tts_v2 import *
+sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
+from core.config_utils import load_key
+
+def cosyvoice_cloud(text: str, save_path: str):
+    dashscope.api_key = load_key("cosyvoice_cloud.api_key")
+    model = "cosyvoice-v1"
+    voice = load_key("cosyvoice_cloud.speaker")
+    speech_rate = float(load_key("cosyvoice_cloud.speed"))
+    synthesizer = SpeechSynthesizer(model=model, voice=voice, speech_rate=speech_rate, format=AudioFormat.WAV_22050HZ_MONO_16BIT)
+    audio = synthesizer.call(text)
+    print('requestId: ', synthesizer.get_last_request_id())
+    with open(save_path, 'wb') as f:
+        f.write(audio)
+
+if __name__ == "__main__":
+    text = """操作柔软物体非常难"""
+    cosyvoice_cloud(text, "output/cosyvoice-cloud.wav")
\ No newline at end of file
diff --git a/core/all_tts_functions/cosyvoice_tts.py b/core/all_tts_functions/cosyvoice_tts.py
new file mode 100644
index 00000000..cb60b2ed
--- /dev/null
+++ b/core/all_tts_functions/cosyvoice_tts.py
@@ -0,0 +1,43 @@
+import requests
+import os, sys
+sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
+from core.config_utils import load_key
+
+def cosyvoice_tts(text: str, save_path: str):
+    url = load_key("cosyvoice.api_url")
+    speaker = load_key("cosyvoice.speaker")
+    data={
+        "text": text,
+        "role": speaker,
+        "speed": float(load_key("cosyvoice.speed"))
+    }
+
+    response=requests.post(url,data=data,timeout=3600)
+
+    if response.status_code!=200:
+        # 出错了
+        print(response.json())
+    else:
+        # 返回的wav数据流,可直接保存
+        with open(save_path,'wb') as f:
+            f.write(response.content)
+
+if __name__ == "__main__":
+    text = """
+    豫章故郡,洪都新府。星分翼轸,地接衡庐。襟三江而带五湖,控蛮荆而引瓯越。物华天宝,龙光射牛斗之墟;人杰地灵,徐孺下陈蕃之榻。雄州雾列,俊采星驰。台隍枕夷夏之交,宾主尽东南之美。都督阎公之雅望,棨戟遥临;宇文新州之懿范,襜帷暂驻。十旬休假,胜友如云;千里逢迎,高朋满座。腾蛟起凤,孟学士之词宗;紫电青霜,王将军之武库。家君作宰,路出名区;童子何知,躬逢胜饯。
+    时维九月,序属三秋。潦水尽而寒潭清,烟光凝而暮山紫。俨骖騑于上路,访风景于崇阿。临帝子之长洲,得天人之旧馆。层峦耸翠,上出重霄;飞阁流丹,下临无地。鹤汀凫渚,穷岛屿之萦回;桂殿兰宫,即冈峦之体势。
+    披绣闼,俯雕甍,山原旷其盈视,川泽纡其骇瞩。闾阎扑地,钟鸣鼎食之家;舸舰弥津,青雀黄龙之舳。云销雨霁,彩彻区明。落霞与孤鹜齐飞,秋水共长天一色。渔舟唱晚,响穷彭蠡之滨,雁阵惊寒,声断衡阳之浦。
+    遥襟甫畅,逸兴遄飞。爽籁发而清风生,纤歌凝而白云遏。睢园绿竹,气凌彭泽之樽;邺水朱华,光照临川之笔。四美具,二难并。穷睇眄于中天,极娱游于暇日。天高地迥,觉宇宙之无穷;兴尽悲来,识盈虚之有数。望长安于日下,目吴会于云间。地势极而南溟深,天柱高而北辰远。关山难越,谁悲失路之人;萍水相逢,尽是他乡之客。怀帝阍而不见,奉宣室以何年?
+    嗟乎!时运不齐,命途多舛。冯唐易老,李广难封。屈贾谊于长沙,非无圣主;窜梁鸿于海曲,岂乏明时?所赖君子见机,达人知命。老当益壮,宁移白首之心?穷且益坚,不坠青云之志。酌贪泉而觉爽,处涸辙以犹欢。北海虽赊,扶摇可接;东隅已逝,桑榆非晚。孟尝高洁,空余报国之情;阮籍猖狂,岂效穷途之哭!
+    勃,三尺微命,一介书生。无路请缨,等终军之弱冠;有怀投笔,慕宗悫之长风。舍簪笏于百龄,奉晨昏于万里。非谢家之宝树,接孟氏之芳邻。他日趋庭,叨陪鲤对;今兹捧袂,喜托龙门。杨意不逢,抚凌云而自惜;钟期既遇,奏流水以何惭?
+    呜呼!胜地不常,盛筵难再;兰亭已矣,梓泽丘墟。临别赠言,幸承恩于伟饯;登高作赋,是所望于群公。敢竭鄙怀,恭疏短引;一言均赋,四韵俱成。请洒潘江,各倾陆海云尔。
+      滕王高阁临江渚,佩玉鸣鸾罢歌舞。
+      画栋朝飞南浦云,珠帘暮卷西山雨。
+      闲云潭影日悠悠,物换星移几度秋。
+      阁中帝子今何在?槛外长江空自流。
+    """
+    cosyvoice_tts(text, "output/cosyvoice_tts_cn.wav")
+    text = """
+    One winter day, the queen was doing needle work while gazing out her ebony window at the new fallen snow. A bird flew by the window startling the queen and she pricked her finger. A single drop of blood fell on the snow outside her window. As she looked at the blood on the snow she said to herself, "Oh, how I wish that I had a daughter that had skin as white as snow, lips as red as blood, and hair as black as ebony."
+    """
+    # cosyvoice_tts(text, "output/cosyvoice_tts_en.wav")
\ No newline at end of file
diff --git a/core/all_tts_functions/sambert_cloud.py b/core/all_tts_functions/sambert_cloud.py
new file mode 100644
index 00000000..fe109222
--- /dev/null
+++ b/core/all_tts_functions/sambert_cloud.py
@@ -0,0 +1,23 @@
+# coding=utf-8
+import os,sys
+import dashscope
+from dashscope.audio.tts import SpeechSynthesizer
+sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
+from core.config_utils import load_key
+
+def sambert_cloud(text: str, save_path: str):
+    dashscope.api_key = load_key("sambert.api_key")
+    model = load_key("sambert.speaker")
+    result = SpeechSynthesizer.call(model=model,
+                                    text=text,
+                                    sample_rate=48000)
+    if result.get_audio_data() is not None:
+        with open(save_path, 'wb') as f:
+            f.write(result.get_audio_data())
+        print('SUCCESS: get audio data: %dbytes in %s' % (sys.getsizeof(result.get_audio_data()), save_path))
+    else:
+        print('ERROR: response is %s' % (result.get_response()))
+
+if __name__ == "__main__":
+    text = "Striking drums and gongs was also prohibited. Playing the huqin was also forbidden. Even playing secretly at home was forbidden."
+    sambert_cloud(text, "output/sambert-cloud.wav")
\ No newline at end of file
diff --git a/core/all_tts_functions/tts_main.py b/core/all_tts_functions/tts_main.py
index a5476e18..2a7aacba 100644
--- a/core/all_tts_functions/tts_main.py
+++ b/core/all_tts_functions/tts_main.py
@@ -13,6 +13,9 @@ from core.all_tts_functions.azure_tts import azure_tts
 from core.ask_gpt import ask_gpt
 from core.prompts_storage import get_correct_text_prompt
+from core.all_tts_functions.cosyvoice_tts import cosyvoice_tts
+from core.all_tts_functions.cosyvoice_cloud import cosyvoice_cloud
+from core.all_tts_functions.sambert_cloud import sambert_cloud
 
 def clean_text_for_tts(text):
     """Remove problematic characters for TTS"""
@@ -21,6 +24,7 @@ def clean_text_for_tts(text):
         text = text.replace(char, '')
     return text.strip()
 
+
 def tts_main(text, save_as, number, task_df):
     text = clean_text_for_tts(text)
     # 检查文本是否为空或单字符,单字符配音容易触发bug
@@ -37,7 +41,7 @@ def tts_main(text, save_as, number, task_df):
     print(f"Generating <{text}...>")
 
     TTS_METHOD = load_key("tts_method")
-    
+
     max_retries = 3
     for attempt in range(max_retries):
         try:
@@ -55,6 +59,12 @@ def tts_main(text, save_as, number, task_df):
                 azure_tts(text, save_as)
             elif TTS_METHOD == 'sf_fish_tts':
                 siliconflow_fish_tts_for_videolingo(text, save_as, number, task_df)
+            elif TTS_METHOD == 'cosyvoice':
+                cosyvoice_tts(text, save_as)
+            elif TTS_METHOD == 'cosyvoice_cloud':
+                cosyvoice_cloud(text, save_as)
+            elif TTS_METHOD == 'sambert':
+                sambert_cloud(text, save_as)
 
             # 检查生成的音频时长
             duration = get_audio_duration(save_as)
diff --git a/core/step10_gen_audio.py b/core/step10_gen_audio.py
index e3f76d51..5f9946d2 100644
--- a/core/step10_gen_audio.py
+++ b/core/step10_gen_audio.py
@@ -39,6 +39,9 @@ def adjust_audio_speed(input_file: str, output_file: str, speed_factor: float) -
     if abs(speed_factor - 1.0) < 0.001:
         shutil.copy2(input_file, output_file)
         return
+
+    if os.path.exists(output_file):
+        return
     atempo = speed_factor
     cmd = ['ffmpeg', '-i', input_file,
            '-filter:a', f'atempo={atempo}', '-y', output_file]
@@ -99,7 +102,7 @@ def generate_tts_audio(tasks_df: pd.DataFrame) -> pd.DataFrame:
             raise e
 
     # for gpt_sovits, do not use parallel to avoid mistakes
-    max_workers = load_key("max_workers") if load_key("tts_method") != "gpt_sovits" else 1
+    max_workers = int(load_key("max_workers")) if load_key("tts_method") != "gpt_sovits" else 1
     # parallel processing for remaining tasks
     if len(tasks_df) > warmup_size:
         remaining_tasks = tasks_df.iloc[warmup_size:].copy()
diff --git a/core/step3_2_splitbymeaning.py b/core/step3_2_splitbymeaning.py
index ba35fdd3..962b0507 100644
--- a/core/step3_2_splitbymeaning.py
+++ b/core/step3_2_splitbymeaning.py
@@ -117,7 +117,7 @@ def split_sentences_by_meaning():
     nlp = init_nlp()
     # 🔄 process sentences multiple times to ensure all are split
     for retry_attempt in range(3):
-        sentences = parallel_split_sentences(sentences, max_length=load_key("max_split_length"), max_workers=load_key("max_workers"), nlp=nlp, retry_attempt=retry_attempt)
+        sentences = parallel_split_sentences(sentences, max_length=load_key("max_split_length"), max_workers=int(load_key("max_workers")), nlp=nlp, retry_attempt=retry_attempt)
 
     # 💾 save results
     with open('output/log/sentence_splitbymeaning.txt', 'w', encoding='utf-8') as f:
diff --git a/core/step4_2_translate_all.py b/core/step4_2_translate_all.py
index 8c9ac707..c77ab4bd 100644
--- a/core/step4_2_translate_all.py
+++ b/core/step4_2_translate_all.py
@@ -77,7 +77,7 @@ def translate_all():
         transient=True,
     ) as progress:
         task = progress.add_task("[cyan]Translating chunks...", total=len(chunks))
-        with concurrent.futures.ThreadPoolExecutor(max_workers=load_key("max_workers")) as executor:
+        with concurrent.futures.ThreadPoolExecutor(max_workers=int(load_key("max_workers"))) as executor:
             futures = []
             for i, chunk in enumerate(chunks):
                 future = executor.submit(translate_chunk, chunk, chunks, theme_prompt, i)
diff --git a/core/step5_splitforsub.py b/core/step5_splitforsub.py
index 2150406c..1a9a6386 100644
--- a/core/step5_splitforsub.py
+++ b/core/step5_splitforsub.py
@@ -94,7 +94,7 @@ def process(i):
             tr_lines[i] = tr_parts
             remerged_tr_lines[i] = tr_remerged
 
-    with concurrent.futures.ThreadPoolExecutor(max_workers=load_key("max_workers")) as executor:
+    with concurrent.futures.ThreadPoolExecutor(max_workers=int(load_key("max_workers"))) as executor:
         executor.map(process, to_split)
 
     # Flatten `src_lines` and `tr_lines`
diff --git a/core/step9_extract_refer_audio.py b/core/step9_extract_refer_audio.py
index 74777404..e278bc85 100644
--- a/core/step9_extract_refer_audio.py
+++ b/core/step9_extract_refer_audio.py
@@ -23,6 +23,9 @@ def time_to_samples(time_str, sr):
 
 def extract_audio(audio_data, sr, start_time, end_time, out_file):
     """Simplified audio extraction function"""
+    if os.path.exists(out_file):
+        rprint(f"Output file {out_file} exists, skipping.")
+        return
     start = time_to_samples(start_time, sr)
     end = time_to_samples(end_time, sr)
     sf.write(out_file, audio_data[start:end], sr)
diff --git a/st_components/sidebar_setting.py b/st_components/sidebar_setting.py
index b0d88ec5..a944abf5 100644
--- a/st_components/sidebar_setting.py
+++ b/st_components/sidebar_setting.py
@@ -74,7 +74,8 @@ def page_setting():
         update_key("resolution", resolution)
 
     with st.expander("Dubbing Settings", expanded=True):
-        tts_methods = ["sf_fish_tts", "openai_tts", "azure_tts", "gpt_sovits", "fish_tts"]
+        config_input("Max Workers", "max_workers")
+        tts_methods = ["sf_fish_tts", "openai_tts", "azure_tts", "gpt_sovits", "fish_tts", "cosyvoice", "cosyvoice_cloud", "sambert"]
         selected_tts_method = st.selectbox("TTS Method", options=tts_methods, index=tts_methods.index(load_key("tts_method")))
         if selected_tts_method != load_key("tts_method"):
             update_key("tts_method", selected_tts_method)
@@ -130,6 +131,41 @@ def page_setting():
             )
             if selected_refer_mode != load_key("gpt_sovits.refer_mode"):
                 update_key("gpt_sovits.refer_mode", selected_refer_mode)
+        elif selected_tts_method == "cosyvoice":
+            config_input("API URL", "cosyvoice.api_url")
+            speakers = ["中文男","中文女","英文男","英文女","粤语女","日语男","韩语女"]
+            speaker = st.selectbox("发音人", options=speakers, index=speakers.index(load_key("cosyvoice.speaker")))
+            if speaker != load_key("cosyvoice.speaker"):
+                update_key("cosyvoice.speaker", speaker)
+            config_input("语速", "cosyvoice.speed")
+        elif selected_tts_method == "cosyvoice_cloud":
+            config_input("API KEY", "cosyvoice_cloud.api_key", help="Dashscope API KEY")
+            speakers = ["longwan","longcheng","longhua","longxiaochun","longxiaoxia","longxiaocheng","longxiaobai",
+                        "longlaotie","longshu","longshuo","longjing","longmiao","longyue","longyuan","longfei","longjielidou",
+                        "longtong","longxiang","loongstella","loongbella"]
+            speaker = st.selectbox("发音人", options=speakers, index=speakers.index(load_key("cosyvoice_cloud.speaker")), help="可以在 https://help.aliyun.com/zh/model-studio/developer-reference/timbre-list 查看不同发音人效果")
+            if speaker != load_key("cosyvoice_cloud.speaker"):
+                update_key("cosyvoice_cloud.speaker", speaker)
+            config_input("语速", "cosyvoice_cloud.speed")
+        elif selected_tts_method == "sambert":
+            st.info("Sambert语音合成API基于达摩院改良的自回归韵律模型,支持文本至语音的实时流式合成。")
+            speakers = ["sambert-zhinan-v1","sambert-zhiqi-v1","sambert-zhichu-v1","sambert-zhide-v1","sambert-zhijia-v1",
+                        "sambert-zhiru-v1","sambert-zhiqian-v1","sambert-zhixiang-v1","sambert-zhiwei-v1",
+                        "sambert-zhihao-v1","sambert-zhijing-v1","sambert-zhiming-v1","sambert-zhimo-v1",
+                        "sambert-zhina-v1","sambert-zhishu-v1","sambert-zhistella-v1","sambert-zhiting-v1",
+                        "sambert-zhixiao-v1","sambert-zhiya-v1","sambert-zhiye-v1","sambert-zhiying-v1",
+                        "sambert-zhiyuan-v1","sambert-zhiyue-v1","sambert-zhigui-v1","sambert-zhishuo-v1",
+                        "sambert-zhimiao-emo-v1","sambert-zhimao-v1","sambert-zhilun-v1","sambert-zhifei-v1",
+                        "sambert-zhida-v1","sambert-camila-v1","sambert-perla-v1","sambert-indah-v1","sambert-clara-v1",
+                        "sambert-hanna-v1","sambert-beth-v1","sambert-betty-v1","sambert-cally-v1","sambert-cindy-v1",
+                        "sambert-eva-v1","sambert-donna-v1","sambert-brian-v1","sambert-waan-v1"]
+            config_input("API KEY", "sambert.api_key", help="Dashscope API KEY")
+            # config_input("音色", "sambert.speaker")
+            speaker = st.selectbox("发音人", options=speakers, index=speakers.index(load_key("sambert.speaker")),
+                                   help="访问 https://help.aliyun.com/zh/model-studio/developer-reference/model-list 查看不同说话人特性")
+            if speaker != load_key("sambert.speaker"):
+                update_key("sambert.speaker", speaker)
+
 
 def check_api():
     try: