Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,21 @@ fish_tts:
'AD学姐': '7f92f8afb8ec43bf81429cc1c9199cb1'
'丁真': '54a5170264694bfc8e9ad98df7bd89c3'

cosyvoice:
api_url: ''
speaker: "英文女"
speed: '0.7'

cosyvoice_cloud:
api_key: ''
model: ''
speaker: 'longfei'
speed: '1.0'

sambert:
api_key: ''
speaker: 'sambert-zhihao-v1'

# *Audio speed range
speed_factor:
min: 1
Expand Down
21 changes: 21 additions & 0 deletions core/all_tts_functions/cosyvoice_cloud.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# coding=utf-8
import os, sys
import dashscope
from dashscope.audio.tts_v2 import *
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from core.config_utils import load_key

def cosyvoice_cloud(text: str, save_path: str):
    """Synthesize ``text`` with DashScope CosyVoice and write the audio to ``save_path``.

    Reads ``cosyvoice_cloud.api_key``, ``cosyvoice_cloud.model``,
    ``cosyvoice_cloud.speaker`` and ``cosyvoice_cloud.speed`` from the
    project config via ``load_key``.

    Args:
        text: Text to synthesize.
        save_path: Destination file; the audio is requested in the
            ``AudioFormat.WAV_22050HZ_MONO_16BIT`` format.

    Raises:
        RuntimeError: If the synthesizer returns no audio data.
    """
    dashscope.api_key = load_key("cosyvoice_cloud.api_key")
    # The config ships with an empty ``model`` value; fall back to the
    # previously hard-coded default so existing setups keep working.
    model = load_key("cosyvoice_cloud.model") or "cosyvoice-v1"
    voice = load_key("cosyvoice_cloud.speaker")
    # The speed is stored as a string in the config (e.g. '1.0').
    speech_rate = float(load_key("cosyvoice_cloud.speed"))

    synthesizer = SpeechSynthesizer(
        model=model,
        voice=voice,
        speech_rate=speech_rate,
        format=AudioFormat.WAV_22050HZ_MONO_16BIT,
    )
    audio = synthesizer.call(text)
    request_id = synthesizer.get_last_request_id()
    print('requestId: ', request_id)

    # Check before opening the file: opening in 'wb' mode truncates it, and
    # an empty file left behind could be mistaken for a finished result.
    if not audio:
        raise RuntimeError(
            f"CosyVoice cloud returned no audio data (requestId: {request_id})"
        )

    with open(save_path, 'wb') as f:
        f.write(audio)

# Manual smoke test: synthesize one short Chinese sentence to a WAV file.
if __name__ == "__main__":
    cosyvoice_cloud("操作柔软物体非常难", "output/cosyvoice-cloud.wav")
44 changes: 44 additions & 0 deletions core/all_tts_functions/cosyvoice_tts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import requests
import os, sys
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
from core.config_utils import load_key

def cosyvoice_tts(text: str, save_path: str):
    """Synthesize ``text`` through a CosyVoice HTTP service and save the WAV.

    Reads ``cosyvoice.api_url``, ``cosyvoice.speaker`` and
    ``cosyvoice.speed`` from the project config via ``load_key``.

    Args:
        text: Text to synthesize.
        save_path: Destination file for the returned WAV byte stream.

    Raises:
        ValueError: If ``cosyvoice.api_url`` is empty.
        RuntimeError: If the service answers with a non-200 status code.
    """
    # NOTE(review): the URL is posted to as-is, so it is assumed to be the
    # full endpoint (the previously hard-coded address ended in "/tts") -
    # confirm against the deployed service.
    url = load_key("cosyvoice.api_url")
    if not url:
        raise ValueError("cosyvoice.api_url is not configured")

    payload = {
        "text": text,
        "role": load_key("cosyvoice.speaker"),
        # The speed is stored as a string in the config (e.g. '0.7').
        "speed": float(load_key("cosyvoice.speed")),
    }

    # Generous timeout: long inputs can take a long time to synthesize.
    response = requests.post(url, data=payload, timeout=3600)

    if response.status_code != 200:
        # Raise instead of printing so the caller can tell the request
        # failed; use the raw text because the error body may not be JSON.
        raise RuntimeError(
            f"CosyVoice TTS request failed with HTTP {response.status_code}: "
            f"{response.text}"
        )

    # The response body is the WAV data stream and can be saved directly.
    with open(save_path, 'wb') as f:
        f.write(response.content)

# Manual smoke test: runs only when this file is executed as a script.
if __name__ == "__main__":
    # Long classical-Chinese sample, synthesized to output/cosyvoice_tts_cn.wav.
    text = """
豫章故郡,洪都新府。星分翼轸,地接衡庐。襟三江而带五湖,控蛮荆而引瓯越。物华天宝,龙光射牛斗之墟;人杰地灵,徐孺下陈蕃之榻。雄州雾列,俊采星驰。台隍枕夷夏之交,宾主尽东南之美。都督阎公之雅望,棨戟遥临;宇文新州之懿范,襜帷暂驻。十旬休假,胜友如云;千里逢迎,高朋满座。腾蛟起凤,孟学士之词宗;紫电青霜,王将军之武库。家君作宰,路出名区;童子何知,躬逢胜饯。
时维九月,序属三秋。潦水尽而寒潭清,烟光凝而暮山紫。俨骖騑于上路,访风景于崇阿。临帝子之长洲,得天人之旧馆。层峦耸翠,上出重霄;飞阁流丹,下临无地。鹤汀凫渚,穷岛屿之萦回;桂殿兰宫,即冈峦之体势。
披绣闼,俯雕甍,山原旷其盈视,川泽纡其骇瞩。闾阎扑地,钟鸣鼎食之家;舸舰弥津,青雀黄龙之舳。云销雨霁,彩彻区明。落霞与孤鹜齐飞,秋水共长天一色。渔舟唱晚,响穷彭蠡之滨,雁阵惊寒,声断衡阳之浦。
遥襟甫畅,逸兴遄飞。爽籁发而清风生,纤歌凝而白云遏。睢园绿竹,气凌彭泽之樽;邺水朱华,光照临川之笔。四美具,二难并。穷睇眄于中天,极娱游于暇日。天高地迥,觉宇宙之无穷;兴尽悲来,识盈虚之有数。望长安于日下,目吴会于云间。地势极而南溟深,天柱高而北辰远。关山难越,谁悲失路之人;萍水相逢,尽是他乡之客。怀帝阍而不见,奉宣室以何年?
嗟乎!时运不齐,命途多舛。冯唐易老,李广难封。屈贾谊于长沙,非无圣主;窜梁鸿于海曲,岂乏明时?所赖君子见机,达人知命。老当益壮,宁移白首之心?穷且益坚,不坠青云之志。酌贪泉而觉爽,处涸辙以犹欢。北海虽赊,扶摇可接;东隅已逝,桑榆非晚。孟尝高洁,空余报国之情;阮籍猖狂,岂效穷途之哭!
勃,三尺微命,一介书生。无路请缨,等终军之弱冠;有怀投笔,慕宗悫之长风。舍簪笏于百龄,奉晨昏于万里。非谢家之宝树,接孟氏之芳邻。他日趋庭,叨陪鲤对;今兹捧袂,喜托龙门。杨意不逢,抚凌云而自惜;钟期既遇,奏流水以何惭?
呜呼!胜地不常,盛筵难再;兰亭已矣,梓泽丘墟。临别赠言,幸承恩于伟饯;登高作赋,是所望于群公。敢竭鄙怀,恭疏短引;一言均赋,四韵俱成。请洒潘江,各倾陆海云尔。
   滕王高阁临江渚,佩玉鸣鸾罢歌舞。
   画栋朝飞南浦云,珠帘暮卷西山雨。
   闲云潭影日悠悠,物换星移几度秋。
   阁中帝子今何在?槛外长江空自流。
"""
    cosyvoice_tts(text, "output/cosyvoice_tts_cn.wav")
    # English sample; it is assigned but not synthesized because the call
    # below is commented out.
    text = """
One winter day, the queen was doing needle work while gazing out her ebony window at the new fallen snow. A bird flew by the window startling the queen and she pricked her finger. A single drop of blood fell on the snow outside her window. As she looked at the blood on the snow she said to herself, "Oh, how I wish that I had a daughter that had skin as white as snow, lips as red as blood, and hair as black as ebony."
"""
    # cosyvoice_tts(text, "output/cosyvoice_tts_en.wav")
23 changes: 23 additions & 0 deletions core/all_tts_functions/sambert_cloud.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# coding=utf-8
import os,sys
import dashscope
from dashscope.audio.tts import SpeechSynthesizer
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from core.config_utils import load_key

def sambert_cloud(text: str, save_path: str):
    """Synthesize ``text`` with DashScope Sambert and write the audio to ``save_path``.

    Reads ``sambert.api_key`` and ``sambert.speaker`` from the project
    config via ``load_key``. The speaker value (e.g. ``sambert-zhihao-v1``)
    is passed to the service as the ``model`` argument.

    Args:
        text: Text to synthesize.
        save_path: Destination file for the returned audio bytes.

    Raises:
        RuntimeError: If the service returns no audio data.
    """
    dashscope.api_key = load_key("sambert.api_key")
    # The model must come from the speaker setting; the original code read
    # the API key here, sending the secret as the model name.
    model = load_key("sambert.speaker")
    result = SpeechSynthesizer.call(model=model, text=text, sample_rate=48000)

    audio = result.get_audio_data()
    if audio is None:
        # Raise instead of printing so the caller can tell synthesis failed.
        raise RuntimeError('ERROR: response is %s' % (result.get_response()))

    with open(save_path, 'wb') as f:
        f.write(audio)
    # len() gives the audio payload size; sys.getsizeof would also count
    # the Python object overhead.
    print('SUCCESS: get audio data: %dbytes in %s' % (len(audio), save_path))

# Manual smoke test: runs only when this file is executed as a script.
if __name__ == "__main__":
    text = "Striking drums and gongs was also prohibited. Playing the huqin was also forbidden. Even playing secretly at home was forbidden."
    # Use a Sambert-specific file name; the previous path was copied from
    # the CosyVoice cloud demo and would overwrite that demo's output.
    sambert_cloud(text, "output/sambert-cloud.wav")
12 changes: 11 additions & 1 deletion core/all_tts_functions/tts_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@
from core.all_tts_functions.azure_tts import azure_tts
from core.ask_gpt import ask_gpt
from core.prompts_storage import get_correct_text_prompt
from core.all_tts_functions.cosyvoice_tts import cosyvoice_tts
from core.all_tts_functions.cosyvoice_cloud import cosyvoice_cloud
from core.all_tts_functions.sambert_cloud import sambert_cloud

def clean_text_for_tts(text):
"""Remove problematic characters for TTS"""
Expand All @@ -21,6 +24,7 @@ def clean_text_for_tts(text):
text = text.replace(char, '')
return text.strip()


def tts_main(text, save_as, number, task_df):
text = clean_text_for_tts(text)
# 检查文本是否为空或单字符,单字符配音容易触发bug
Expand All @@ -37,7 +41,7 @@ def tts_main(text, save_as, number, task_df):

print(f"Generating <{text}...>")
TTS_METHOD = load_key("tts_method")

max_retries = 3
for attempt in range(max_retries):
try:
Expand All @@ -55,6 +59,12 @@ def tts_main(text, save_as, number, task_df):
azure_tts(text, save_as)
elif TTS_METHOD == 'sf_fish_tts':
siliconflow_fish_tts_for_videolingo(text, save_as, number, task_df)
elif TTS_METHOD == 'cosyvoice':
cosyvoice_tts(text, save_as)
elif TTS_METHOD == 'cosyvoice_cloud':
cosyvoice_cloud(text, save_as)
elif TTS_METHOD == 'sambert':
sambert_cloud(text, save_as)

# 检查生成的音频时长
duration = get_audio_duration(save_as)
Expand Down
5 changes: 4 additions & 1 deletion core/step10_gen_audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ def adjust_audio_speed(input_file: str, output_file: str, speed_factor: float) -
if abs(speed_factor - 1.0) < 0.001:
shutil.copy2(input_file, output_file)
return

if os.path.exists(output_file):
return

atempo = speed_factor
cmd = ['ffmpeg', '-i', input_file, '-filter:a', f'atempo={atempo}', '-y', output_file]
Expand Down Expand Up @@ -99,7 +102,7 @@ def generate_tts_audio(tasks_df: pd.DataFrame) -> pd.DataFrame:
raise e

# for gpt_sovits, do not use parallel to avoid mistakes
max_workers = load_key("max_workers") if load_key("tts_method") != "gpt_sovits" else 1
max_workers = int(load_key("max_workers")) if load_key("tts_method") != "gpt_sovits" else 1
# parallel processing for remaining tasks
if len(tasks_df) > warmup_size:
remaining_tasks = tasks_df.iloc[warmup_size:].copy()
Expand Down
2 changes: 1 addition & 1 deletion core/step3_2_splitbymeaning.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ def split_sentences_by_meaning():
nlp = init_nlp()
# 🔄 process sentences multiple times to ensure all are split
for retry_attempt in range(3):
sentences = parallel_split_sentences(sentences, max_length=load_key("max_split_length"), max_workers=load_key("max_workers"), nlp=nlp, retry_attempt=retry_attempt)
sentences = parallel_split_sentences(sentences, max_length=load_key("max_split_length"), max_workers=int(load_key("max_workers")), nlp=nlp, retry_attempt=retry_attempt)

# 💾 save results
with open('output/log/sentence_splitbymeaning.txt', 'w', encoding='utf-8') as f:
Expand Down
2 changes: 1 addition & 1 deletion core/step4_2_translate_all.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def translate_all():
transient=True,
) as progress:
task = progress.add_task("[cyan]Translating chunks...", total=len(chunks))
with concurrent.futures.ThreadPoolExecutor(max_workers=load_key("max_workers")) as executor:
with concurrent.futures.ThreadPoolExecutor(max_workers=int(load_key("max_workers"))) as executor:
futures = []
for i, chunk in enumerate(chunks):
future = executor.submit(translate_chunk, chunk, chunks, theme_prompt, i)
Expand Down
2 changes: 1 addition & 1 deletion core/step5_splitforsub.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ def process(i):
tr_lines[i] = tr_parts
remerged_tr_lines[i] = tr_remerged

with concurrent.futures.ThreadPoolExecutor(max_workers=load_key("max_workers")) as executor:
with concurrent.futures.ThreadPoolExecutor(max_workers=int(load_key("max_workers"))) as executor:
executor.map(process, to_split)

# Flatten `src_lines` and `tr_lines`
Expand Down
3 changes: 3 additions & 0 deletions core/step9_extract_refer_audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ def time_to_samples(time_str, sr):

def extract_audio(audio_data, sr, start_time, end_time, out_file):
"""Simplified audio extraction function"""
if os.path.exists(out_file):
rprint(f"Output file {out_file} exists, skipping.")
return
start = time_to_samples(start_time, sr)
end = time_to_samples(end_time, sr)
sf.write(out_file, audio_data[start:end], sr)
Expand Down
38 changes: 37 additions & 1 deletion st_components/sidebar_setting.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,8 @@ def page_setting():
update_key("resolution", resolution)

with st.expander("Dubbing Settings", expanded=True):
tts_methods = ["sf_fish_tts", "openai_tts", "azure_tts", "gpt_sovits", "fish_tts"]
config_input("Max Workers", "max_workers")
tts_methods = ["sf_fish_tts", "openai_tts", "azure_tts", "gpt_sovits", "fish_tts", "cosyvoice", "cosyvoice_cloud", "sambert"]
selected_tts_method = st.selectbox("TTS Method", options=tts_methods, index=tts_methods.index(load_key("tts_method")))
if selected_tts_method != load_key("tts_method"):
update_key("tts_method", selected_tts_method)
Expand Down Expand Up @@ -130,6 +131,41 @@ def page_setting():
)
if selected_refer_mode != load_key("gpt_sovits.refer_mode"):
update_key("gpt_sovits.refer_mode", selected_refer_mode)
elif selected_tts_method == "cosyvoice":
config_input("API URL", "cosyvoice.api_url")
speakers = ["中文男","中文女","英文男","英文女","粤语女","日语男","韩语女"]
speaker = st.selectbox("发音人", options=speakers, index=speakers.index(load_key("cosyvoice.speaker")))
if speaker != load_key("cosyvoice.speaker"):
update_key("cosyvoice.speaker", speaker)
config_input("语速", "cosyvoice.speed")
elif selected_tts_method == "cosyvoice_cloud":
config_input("API KEY", "cosyvoice_cloud.api_key", help="Dashscope API KEY")
speakers = ["longwan","longcheng","longhua","longxiaochun","longxiaoxia","longxiaocheng","longxiaobai",
"longlaotie","longshu","longshuo","longjing","longmiao","longyue","longyuan","longfei","longjielidou",
"longtong","longxiang","loongstella","loongbella"]
speaker = st.selectbox("发音人", options=speakers, index=speakers.index(load_key("cosyvoice_cloud.speaker")), help="可以在 https://help.aliyun.com/zh/model-studio/developer-reference/timbre-list 查看不同发音人效果")
if speaker != load_key("cosyvoice_cloud.speaker"):
update_key("cosyvoice_cloud.speaker", speaker)
config_input("语速", "cosyvoice_cloud.speed")
elif selected_tts_method == "sambert":
st.info("Sambert语音合成API基于达摩院改良的自回归韵律模型,支持文本至语音的实时流式合成。")
speakers = ["sambert-zhinan-v1","sambert-zhiqi-v1","sambert-zhichu-v1","sambert-zhide-v1","sambert-zhijia-v1",
"sambert-zhiru-v1","sambert-zhiqian-v1","sambert-zhixiang-v1","sambert-zhiwei-v1",
"sambert-zhihao-v1","sambert-zhijing-v1","sambert-zhiming-v1","sambert-zhimo-v1",
"sambert-zhina-v1","sambert-zhishu-v1","sambert-zhistella-v1","sambert-zhiting-v1",
"sambert-zhixiao-v1","sambert-zhiya-v1","sambert-zhiye-v1","sambert-zhiying-v1",
"sambert-zhiyuan-v1","sambert-zhiyue-v1","sambert-zhigui-v1","sambert-zhishuo-v1",
"sambert-zhimiao-emo-v1","sambert-zhimao-v1","sambert-zhilun-v1","sambert-zhifei-v1",
"sambert-zhida-v1","sambert-camila-v1","sambert-perla-v1","sambert-indah-v1","sambert-clara-v1",
"sambert-hanna-v1","sambert-beth-v1","sambert-betty-v1","sambert-cally-v1","sambert-cindy-v1",
"sambert-eva-v1","sambert-donna-v1","sambert-brian-v1","sambert-waan-v1"]
config_input("API KEY", "sambert.api_key", help="Dashscope API KEY")
# config_input("音色", "sambert.speaker")
speaker = st.selectbox("发音人", options=speakers, index=speakers.index(load_key("sambert.speaker")),
help="访问 https://help.aliyun.com/zh/model-studio/developer-reference/model-list 查看不同说话人特性")
if speaker != load_key("sambert.speaker"):
update_key("sambert.speaker", speaker)


def check_api():
try:
Expand Down