Spaces: Upload app.py

app.py CHANGED
@@ -9,25 +9,10 @@ logging.getLogger("charset_normalizer").setLevel(logging.ERROR)
 logging.getLogger("torchaudio._extension").setLevel(logging.ERROR)
 import pdb
 
-if os.path.exists("./gweight.txt"):
-    with open("./gweight.txt", 'r',encoding="utf-8") as file:
-        gweight_data = file.read()
-        gpt_path = os.environ.get(
-            "gpt_path", gweight_data)
-else:
-    gpt_path = os.environ.get(
-        "gpt_path", "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt")
-
-if os.path.exists("./sweight.txt"):
-    with open("./sweight.txt", 'r',encoding="utf-8") as file:
-        sweight_data = file.read()
-        sovits_path = os.environ.get("sovits_path", sweight_data)
-else:
-    sovits_path = os.environ.get("sovits_path", "GPT_SoVITS/pretrained_models/s2G488k.pth")
-# gpt_path = os.environ.get(
-#     "gpt_path", "pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
-# )
-# sovits_path = os.environ.get("sovits_path", "pretrained_models/s2G488k.pth")
+gpt_path = os.environ.get(
+    "gpt_path", "models/Taffy/Taffy-e5.ckpt"
+)
+sovits_path = os.environ.get("sovits_path", "models/Taffy/Taffy_e20_s1020.pth")
 cnhubert_base_path = os.environ.get(
     "cnhubert_base_path", "pretrained_models/chinese-hubert-base"
 )
@@ -36,8 +21,6 @@ bert_path = os.environ.get(
 )
 infer_ttswebui = os.environ.get("infer_ttswebui", 9872)
 infer_ttswebui = int(infer_ttswebui)
-is_share = os.environ.get("is_share", "False")
-is_share=eval(is_share)
 if "_CUDA_VISIBLE_DEVICES" in os.environ:
     os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"]
 is_half = eval(os.environ.get("is_half", "True"))
@@ -47,6 +30,10 @@ import numpy as np
 import librosa,torch
 from feature_extractor import cnhubert
 cnhubert.cnhubert_base_path=cnhubert_base_path
+import ssl
+ssl._create_default_https_context = ssl._create_unverified_context
+import nltk
+nltk.download('cmudict')
 
 from module.models import SynthesizerTrn
 from AR.models.t2s_lightning_module import Text2SemanticLightningModule
@@ -55,17 +42,12 @@ from text.cleaner import clean_text
 from time import time as ttime
 from module.mel_processing import spectrogram_torch
 from my_utils import load_audio
-from tools.i18n.i18n import I18nAuto
-i18n = I18nAuto()
 
-os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'
+device = "cuda" if torch.cuda.is_available() else "cpu"
 
-if torch.cuda.is_available():
-    device = "cuda"
-elif torch.backends.mps.is_available():
-    device = "mps"
-else:
-    device = "cpu"
+is_half = eval(
+    os.environ.get("is_half", "True" if torch.cuda.is_available() else "False")
+)
 
 tokenizer = AutoTokenizer.from_pretrained(bert_path)
 bert_model = AutoModelForMaskedLM.from_pretrained(bert_path)
@@ -74,11 +56,12 @@ if is_half == True:
 else:
     bert_model = bert_model.to(device)
 
+
 def get_bert_feature(text, word2ph):
     with torch.no_grad():
         inputs = tokenizer(text, return_tensors="pt")
         for i in inputs:
-            inputs[i] = inputs[i].to(device)
+            inputs[i] = inputs[i].to(device)  # inputs are long tensors, so precision is not a concern; dtype follows bert_model
         res = bert_model(**inputs, output_hidden_states=True)
         res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()[1:-1]
         assert len(word2ph) == len(text)
@@ -116,7 +99,6 @@ class DictToAttrRecursive(dict):
         except KeyError:
             raise AttributeError(f"Attribute {item} not found")
 
-
 ssl_model = cnhubert.get_model()
 if is_half == True:
     ssl_model = ssl_model.half().to(device)
@@ -143,7 +125,6 @@ def change_sovits_weights(sovits_path):
     vq_model = vq_model.to(device)
     vq_model.eval()
     print(vq_model.load_state_dict(dict_s2["weight"], strict=False))
-    with open("./sweight.txt","w",encoding="utf-8")as f:f.write(sovits_path)
 change_sovits_weights(sovits_path)
 
 def change_gpt_weights(gpt_path):
@@ -160,9 +141,9 @@ def change_gpt_weights(gpt_path):
     t2s_model.eval()
     total = sum([param.nelement() for param in t2s_model.parameters()])
     print("Number of parameter: %.2fM" % (total / 1e6))
-    with open("./gweight.txt","w",encoding="utf-8")as f:f.write(gpt_path)
 change_gpt_weights(gpt_path)
 
+
 def get_spepc(hps, filename):
     audio = load_audio(filename, int(hps.data.sampling_rate))
     audio = torch.FloatTensor(audio)
@@ -211,8 +192,6 @@ def clean_text_inf(text, language):
     phones = cleaned_text_to_sequence(phones)
 
     return phones, word2ph, norm_text
-
-
 def get_bert_inf(phones, word2ph, norm_text, language):
     if language == "zh":
         bert = get_bert_feature(norm_text, word2ph).to(device)
@@ -292,7 +271,7 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
     t1 = ttime()
     prompt_language = dict_language[prompt_language]
     text_language = dict_language[text_language]
-
+
     if prompt_language == "en":
         phones1, word2ph1, norm_text1 = clean_text_inf(prompt_text, prompt_language)
     else:
@@ -309,7 +288,7 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
         bert1 = get_bert_inf(phones1, word2ph1, norm_text1, prompt_language)
     else:
         bert1 = nonen_get_bert_inf(prompt_text, prompt_language)
-
+
     for text in texts:
         # avoid errors caused by blank lines in the target text
         if (len(text.strip()) == 0):
@@ -323,7 +302,6 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
         bert2 = get_bert_inf(phones2, word2ph2, norm_text2, text_language)
     else:
         bert2 = nonen_get_bert_inf(text, text_language)
-
     bert = torch.cat([bert1, bert2], 1)
 
     all_phoneme_ids = torch.LongTensor(phones1 + phones2).to(device).unsqueeze(0)
@@ -446,96 +424,86 @@ def cut2(inp):
 def cut3(inp):
     inp = inp.strip("\n")
     return "\n".join(["%s。" % item for item in inp.strip("。").split("。")])
-
-
-def custom_sort_key(s):
-    # split the string into digit and non-digit runs
-    parts = re.split('(\d+)', s)
-    # convert digit runs to int so names sort numerically
-    parts = [int(part) if part.isdigit() else part for part in parts]
-    return parts
-
-def change_choices():
-    SoVITS_names, GPT_names = get_weights_names()
-    return {"choices": sorted(SoVITS_names, key=custom_sort_key), "__type__": "update"}, {"choices": sorted(GPT_names, key=custom_sort_key), "__type__": "update"}
-
-pretrained_sovits_name = "GPT_SoVITS/pretrained_models/s2G488k.pth"
-pretrained_gpt_name = "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
-SoVITS_weight_root = "SoVITS_weights"
-GPT_weight_root = "GPT_weights"
-os.makedirs(SoVITS_weight_root, exist_ok=True)
-os.makedirs(GPT_weight_root, exist_ok=True)
-
-
-def get_weights_names():
-    SoVITS_names = [pretrained_sovits_name]
-    for name in os.listdir(SoVITS_weight_root):
-        if name.endswith(".pth"):SoVITS_names.append("%s/%s"%(SoVITS_weight_root,name))
-    GPT_names = [pretrained_gpt_name]
-    for name in os.listdir(GPT_weight_root):
-        if name.endswith(".ckpt"): GPT_names.append("%s/%s"%(GPT_weight_root,name))
-    return SoVITS_names,GPT_names
-SoVITS_names,GPT_names = get_weights_names()
+
+def scan_audio_files(folder_path):
+    """Scan the given folder and return its audio file list."""
+    return [f for f in os.listdir(folder_path) if f.endswith('.wav')]
+
+def load_audio_text_mappings(folder_path, list_file_name):
+    text_to_audio_mappings = {}
+    audio_to_text_mappings = {}
+    with open(os.path.join(folder_path, list_file_name), 'r', encoding='utf-8') as file:
+        for line in file:
+            parts = line.strip().split('|')
+            if len(parts) >= 4:
+                audio_file_name = parts[0]
+                text = parts[3]
+                audio_file_path = os.path.join(folder_path, audio_file_name)
+                text_to_audio_mappings[text] = audio_file_path
+                audio_to_text_mappings[audio_file_path] = text
+    return text_to_audio_mappings, audio_to_text_mappings
+
+audio_folder_path = 'audio/Taffy'
+text_to_audio_mappings, audio_to_text_mappings = load_audio_text_mappings(audio_folder_path, 'Taffy.list')
 
 with gr.Blocks(title="GPT-SoVITS WebUI") as app:
-    gr.Markdown(
-        value=i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. <br>如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.")
-    )
+    gr.Markdown(value="""
+    # <center>[AI Taffy] online voice generation (GPT-SoVITS)\n
+
+    ### <center>Model author: Xz乔希 https://space.bilibili.com/5859321\n
+    ### <center>GPT-SoVITS online collection: https://www.modelscope.cn/studios/xzjosh/GPT-SoVITS\n
+    ### <center>Dataset download: https://huggingface.co/datasets/XzJosh/audiodataset\n
+    ### <center>Voice owner: 永雏塔菲 https://space.bilibili.com/1265680561\n
+    ### <center>GPT-SoVITS project: https://github.com/RVC-Boss/GPT-SoVITS\n
+    ### <center>Strictly follow the law when using this model! When publishing derivative works, credit this project's author and link, and state that the work was AI-generated with GPT-SoVITS!\n
+    ### <center>⚠️ The online demo is unstable and slow to generate; downloading the model for local inference is strongly recommended!\n
+    """)
+    # with gr.Tabs():
+    #     with gr.TabItem(i18n("伴奏人声分离&去混响&去回声")):
    with gr.Group():
-        gr.Markdown(value=i18n("模型切换"))
-        with gr.Row():
-            GPT_dropdown = gr.Dropdown(label=i18n("GPT模型列表"), choices=sorted(GPT_names, key=custom_sort_key), value=gpt_path,interactive=True)
-            SoVITS_dropdown = gr.Dropdown(label=i18n("SoVITS模型列表"), choices=sorted(SoVITS_names, key=custom_sort_key), value=sovits_path,interactive=True)
-            refresh_button = gr.Button(i18n("刷新模型路径"), variant="primary")
-            refresh_button.click(fn=change_choices, inputs=[], outputs=[SoVITS_dropdown, GPT_dropdown])
-            SoVITS_dropdown.change(change_sovits_weights,[SoVITS_dropdown],[])
-            GPT_dropdown.change(change_gpt_weights,[GPT_dropdown],[])
-        gr.Markdown(value=i18n("*请上传并填写参考信息"))
+        gr.Markdown(value="*Reference audio selection (required)")
         with gr.Row():
-            inp_ref = gr.Audio(label=i18n("请上传参考音频"), type="filepath")
-            prompt_text = gr.Textbox(label=i18n("参考音频的文本"), value="")
-            prompt_language = gr.Dropdown(
-                label=i18n("参考音频的语种"), choices=[i18n("中文"), i18n("英文"), i18n("日文")], value=i18n("中文")
-            )
-        gr.Markdown(value=i18n("*请填写需要合成的目标文本"))
+            audio_select = gr.Dropdown(label="Choose a reference audio (longer clips are not recommended)", choices=list(text_to_audio_mappings.keys()))
+            ref_audio = gr.Audio(label="Reference audio preview")
+            ref_text = gr.Textbox(label="Reference audio text")
+
+        # callback that refreshes the reference text and audio
+        def update_ref_text_and_audio(selected_text):
+            audio_path = text_to_audio_mappings.get(selected_text, "")
+            return selected_text, audio_path
+
+        # bind dropdown changes to the update callback
+        audio_select.change(update_ref_text_and_audio, [audio_select], [ref_text, ref_audio])
+
+        # remaining Gradio components
+        prompt_language = gr.Dropdown(
+            label="Reference audio language", choices=["中文", "英文", "日文"], value="中文"
+        )
+        gr.Markdown(value="*Enter the target text to synthesize")
         with gr.Row():
-            text = gr.Textbox(label=i18n("需要合成的文本"), value="")
+            text = gr.Textbox(label="Text to synthesize", value="")
             text_language = gr.Dropdown(
-                label=i18n("需要合成的语种"), choices=[i18n("中文"), i18n("英文"), i18n("日文")], value=i18n("中文")
-            )
-            how_to_cut = gr.Radio(
-                label=i18n("怎么切"),
-                choices=[i18n("不切"),i18n("凑五句一切"),i18n("凑50字一切"),i18n("按中文句号。切"),i18n("按英文句号.切"),],
-                value=i18n("凑50字一切"),
-                interactive=True,
+                label="Language to synthesize", choices=["中文", "英文", "日文"], value="中文"
             )
-            inference_button = gr.Button(i18n("合成语音"), variant="primary")
-            output = gr.Audio(label=i18n("输出的语音"))
-
+            inference_button = gr.Button("Synthesize", variant="primary")
+            output = gr.Audio(label="Output audio")
         inference_button.click(
             get_tts_wav,
-            [inp_ref, prompt_text, prompt_language, text, text_language, how_to_cut],
+            [audio_select, ref_text, prompt_language, text, text_language],
             [output],
         )
 
-        gr.Markdown(value=i18n("文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。"))
-        with gr.Row():
-            text_inp = gr.Textbox(label=i18n("需要合成的切分前文本"),value="")
-            button1 = gr.Button(i18n("凑五句一切"), variant="primary")
-            button2 = gr.Button(i18n("凑50字一切"), variant="primary")
-            button3 = gr.Button(i18n("按中文句号。切"), variant="primary")
-            button4 = gr.Button(i18n("按英文句号.切"), variant="primary")
-            text_opt = gr.Textbox(label=i18n("切分后文本"), value="")
-            button1.click(cut1, [text_inp], [text_opt])
-            button2.click(cut2, [text_inp], [text_opt])
-            button3.click(cut3, [text_inp], [text_opt])
-            button4.click(cut4, [text_inp], [text_opt])
-        gr.Markdown(value=i18n("后续将支持混合语种编码文本输入。"))
-
-app.queue(concurrency_count=511, max_size=1022).launch(
-    server_name="0.0.0.0",
-    inbrowser=True,
-    share=is_share,
-    server_port=infer_ttswebui,
-    quiet=True,
-)
+
+        gr.Markdown(value="Text splitting tool. Very long text may not synthesize well, so splitting it first is recommended. Synthesis is done per line of the text, and the pieces are then concatenated.")
+        with gr.Row():
+            text_inp = gr.Textbox(label="Text to split before synthesis", value="")
+            button1 = gr.Button("Cut every 5 sentences", variant="primary")
+            button2 = gr.Button("Cut every 50 characters", variant="primary")
+            button3 = gr.Button("Cut at Chinese period 。", variant="primary")
+            text_opt = gr.Textbox(label="Split text", value="")
+            button1.click(cut1, [text_inp], [text_opt])
+            button2.click(cut2, [text_inp], [text_opt])
+            button3.click(cut3, [text_inp], [text_opt])
+
+app.queue(max_size=10)
+app.launch(inbrowser=True)
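The new reference-audio picker is driven by the pipe-delimited annotation list that load_audio_text_mappings reads. A minimal sketch of that format under the diff's own rules (only fields 0 and 3 are used; the sample line and the meaning of the middle fields are assumptions, not taken from the Space's actual Taffy.list):

import os

# Hypothetical sample entry; real lines live in audio/Taffy/Taffy.list.
# In GPT-SoVITS-style lists the middle fields are typically speaker and
# language, but this app ignores them entirely.
sample_line = "vo_001.wav|Taffy|ZH|今天天气真好。"

parts = sample_line.strip().split("|")
if len(parts) >= 4:
    audio_file_name, text = parts[0], parts[3]   # the only fields the app reads
    audio_file_path = os.path.join("audio/Taffy", audio_file_name)
    print(text, "->", audio_file_path)

Note that text is the dictionary key, so two list entries with identical text would silently overwrite each other in text_to_audio_mappings.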
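Although the commit hard-codes the Taffy checkpoints, both paths are still wrapped in os.environ.get, so other weights can be supplied at launch without editing app.py. A minimal sketch; the paths below are placeholders, not checkpoints shipped with this Space:

import os

# Set these before app.py executes its os.environ.get calls.
os.environ["gpt_path"] = "models/SomeVoice/some-voice-e5.ckpt"      # hypothetical path
os.environ["sovits_path"] = "models/SomeVoice/some_voice_e20.pth"   # hypothetical path
os.environ["is_half"] = "False"  # read via eval(), so pass exactly "True" or "False"

Since is_half (and, before this commit, is_share) is parsed with eval(), any Python expression placed in the variable would execute; a plain string comparison such as os.environ.get("is_half", "True") == "True" would be a safer parse.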
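The dropdown wiring relies on Gradio's convention that a .change callback returns one value per output component, in order. A sketch of that contract in plain Python, mirroring update_ref_text_and_audio with a stub mapping (the entry is made up):

# Stub standing in for text_to_audio_mappings.
text_to_audio = {"示例文本。": "audio/Taffy/example.wav"}

def on_select(selected_text):
    # First return value fills the ref_text Textbox, second fills ref_audio.
    return selected_text, text_to_audio.get(selected_text, "")

print(on_select("示例文本。"))  # ('示例文本。', 'audio/Taffy/example.wav')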