Upload app.py
app.py CHANGED
@@ -9,25 +9,10 @@ logging.getLogger("charset_normalizer").setLevel(logging.ERROR)
 logging.getLogger("torchaudio._extension").setLevel(logging.ERROR)
 import pdb
 
-if os.path.exists("./gweight.txt"):
-    with open("./gweight.txt", 'r', encoding="utf-8") as file:
-        gweight_data = file.read()
-        gpt_path = os.environ.get(
-            "gpt_path", gweight_data)
-else:
-    gpt_path = os.environ.get(
-        "gpt_path", "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt")
-
-if os.path.exists("./sweight.txt"):
-    with open("./sweight.txt", 'r', encoding="utf-8") as file:
-        sweight_data = file.read()
-        sovits_path = os.environ.get("sovits_path", sweight_data)
-else:
-    sovits_path = os.environ.get("sovits_path", "GPT_SoVITS/pretrained_models/s2G488k.pth")
-# gpt_path = os.environ.get(
-#     "gpt_path", "pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
-# )
-# sovits_path = os.environ.get("sovits_path", "pretrained_models/s2G488k.pth")
+gpt_path = os.environ.get(
+    "gpt_path", "models/Taffy/Taffy-e5.ckpt"
+)
+sovits_path = os.environ.get("sovits_path", "models/Taffy/Taffy_e20_s1020.pth")
 cnhubert_base_path = os.environ.get(
     "cnhubert_base_path", "pretrained_models/chinese-hubert-base"
 )
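The startup logic collapses from the last-used-weights lookup to two fixed Taffy checkpoints, but the `gpt_path` and `sovits_path` environment overrides survive, so alternative weights can still be injected without editing the file. A minimal launcher sketch (the `models/MyVoice/...` paths are hypothetical):

```python
import os
import subprocess

# Hypothetical checkpoint paths; anything readable by the loaders works here.
env = dict(os.environ)
env["gpt_path"] = "models/MyVoice/MyVoice-e10.ckpt"
env["sovits_path"] = "models/MyVoice/MyVoice_e20.pth"
subprocess.run(["python", "app.py"], env=env, check=True)
```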
@@ -36,8 +21,6 @@ bert_path = os.environ.get(
 )
 infer_ttswebui = os.environ.get("infer_ttswebui", 9872)
 infer_ttswebui = int(infer_ttswebui)
-is_share = os.environ.get("is_share", "False")
-is_share=eval(is_share)
 if "_CUDA_VISIBLE_DEVICES" in os.environ:
     os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"]
 is_half = eval(os.environ.get("is_half", "True"))
@@ -47,6 +30,10 @@ import numpy as np
 import librosa,torch
 from feature_extractor import cnhubert
 cnhubert.cnhubert_base_path=cnhubert_base_path
+import ssl
+ssl._create_default_https_context = ssl._create_unverified_context
+import nltk
+nltk.download('cmudict')
 
 from module.models import SynthesizerTrn
 from AR.models.t2s_lightning_module import Text2SemanticLightningModule
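The new `ssl` lines are a blunt fix for certificate errors when NLTK fetches `cmudict` inside the Space: they disable verification for every later HTTPS request in the process, and the corpus is re-downloaded on every start. A more scoped sketch, assuming the same goal:

```python
import ssl
import nltk

try:
    nltk.data.find("corpora/cmudict")  # skip the download when already cached
except LookupError:
    _default = ssl._create_default_https_context
    ssl._create_default_https_context = ssl._create_unverified_context
    try:
        nltk.download("cmudict")  # one-off unverified fetch
    finally:
        ssl._create_default_https_context = _default  # restore verification
```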
@@ -55,17 +42,12 @@ from text.cleaner import clean_text
 from time import time as ttime
 from module.mel_processing import spectrogram_torch
 from my_utils import load_audio
-from tools.i18n.i18n import I18nAuto
-i18n = I18nAuto()
 
-os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'
+device = "cuda" if torch.cuda.is_available() else "cpu"
 
-if torch.cuda.is_available():
-    device = "cuda"
-elif torch.backends.mps.is_available():
-    device = "mps"
-else:
-    device = "cpu"
+is_half = eval(
+    os.environ.get("is_half", "True" if torch.cuda.is_available() else "False")
+)
 
 tokenizer = AutoTokenizer.from_pretrained(bert_path)
 bert_model = AutoModelForMaskedLM.from_pretrained(bert_path)
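The rewrite drops the MPS branch and keeps `eval` for parsing `is_half`, which executes whatever string the environment supplies. A plain string comparison (sketch) preserves the same CUDA-dependent default without that risk:

```python
import os
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
# str(torch.cuda.is_available()) reproduces the "True"/"False" default above
is_half = os.environ.get("is_half", str(torch.cuda.is_available())).lower() == "true"
```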
@@ -74,11 +56,12 @@ if is_half == True:
 else:
     bert_model = bert_model.to(device)
 
+
 def get_bert_feature(text, word2ph):
     with torch.no_grad():
         inputs = tokenizer(text, return_tensors="pt")
         for i in inputs:
-            inputs[i] = inputs[i].to(device)
+            inputs[i] = inputs[i].to(device)  # inputs are LongTensors, so precision is not a concern; dtype follows bert_model
         res = bert_model(**inputs, output_hidden_states=True)
         res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()[1:-1]
         assert len(word2ph) == len(text)
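The added comment (translated above) is worth unpacking: tokenizer outputs are integer id tensors, and integer inputs pass through a half-precision model unchanged; the floating-point weights alone determine the output dtype. A standalone check (sketch):

```python
import torch

ids = torch.tensor([[101, 2769, 102]])     # token ids are int64 (LongTensor)
emb = torch.nn.Embedding(30000, 8).half()  # half-precision weights
print(ids.dtype)                           # torch.int64
print(emb(ids).dtype)                      # torch.float16 -- no cast of ids needed
```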
@@ -116,7 +99,6 @@ class DictToAttrRecursive(dict):
     except KeyError:
         raise AttributeError(f"Attribute {item} not found")
 
-
 ssl_model = cnhubert.get_model()
 if is_half == True:
     ssl_model = ssl_model.half().to(device)
@@ -143,7 +125,6 @@ def change_sovits_weights(sovits_path):
     vq_model = vq_model.to(device)
     vq_model.eval()
     print(vq_model.load_state_dict(dict_s2["weight"], strict=False))
-    with open("./sweight.txt","w",encoding="utf-8")as f:f.write(sovits_path)
 change_sovits_weights(sovits_path)
 
 def change_gpt_weights(gpt_path):
@@ -160,9 +141,9 @@ def change_gpt_weights(gpt_path):
     t2s_model.eval()
     total = sum([param.nelement() for param in t2s_model.parameters()])
     print("Number of parameter: %.2fM" % (total / 1e6))
-    with open("./gweight.txt","w",encoding="utf-8")as f:f.write(gpt_path)
 change_gpt_weights(gpt_path)
 
+
 def get_spepc(hps, filename):
     audio = load_audio(filename, int(hps.data.sampling_rate))
     audio = torch.FloatTensor(audio)
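Both loaders lose their side effect of recording the last-loaded checkpoint in `sweight.txt`/`gweight.txt`, which the deleted startup blocks used to read back; with fixed Taffy weights nothing depends on that anymore. If the persistence were ever wanted again, a sketch with a hypothetical helper:

```python
def remember_weights(path, store):
    # hypothetical helper mirroring the removed one-line writes
    with open(store, "w", encoding="utf-8") as f:
        f.write(path)

# e.g. call at the end of the respective loaders:
# remember_weights(sovits_path, "./sweight.txt")
# remember_weights(gpt_path, "./gweight.txt")
```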
@@ -211,8 +192,6 @@ def clean_text_inf(text, language):
     phones = cleaned_text_to_sequence(phones)
 
     return phones, word2ph, norm_text
-
-
 def get_bert_inf(phones, word2ph, norm_text, language):
     if language == "zh":
         bert = get_bert_feature(norm_text, word2ph).to(device)
@@ -292,7 +271,7 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
     t1 = ttime()
     prompt_language = dict_language[prompt_language]
     text_language = dict_language[text_language]
-
+
     if prompt_language == "en":
         phones1, word2ph1, norm_text1 = clean_text_inf(prompt_text, prompt_language)
     else:
@@ -309,7 +288,7 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
         bert1 = get_bert_inf(phones1, word2ph1, norm_text1, prompt_language)
     else:
         bert1 = nonen_get_bert_inf(prompt_text, prompt_language)
-
+
     for text in texts:
         # skip blank lines in the target text, which would otherwise raise errors
         if (len(text.strip()) == 0):
@@ -323,7 +302,6 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
             bert2 = get_bert_inf(phones2, word2ph2, norm_text2, text_language)
         else:
             bert2 = nonen_get_bert_inf(text, text_language)
-
         bert = torch.cat([bert1, bert2], 1)
 
         all_phoneme_ids = torch.LongTensor(phones1 + phones2).to(device).unsqueeze(0)
@@ -446,96 +424,86 @@ def cut2(inp):
 def cut3(inp):
     inp = inp.strip("\n")
     return "\n".join(["%s。" % item for item in inp.strip("。").split("。")])
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-def get_weights_names():
-    SoVITS_names = [pretrained_sovits_name]
-    for name in os.listdir(SoVITS_weight_root):
-        if name.endswith(".pth"):SoVITS_names.append("%s/%s"%(SoVITS_weight_root,name))
-    GPT_names = [pretrained_gpt_name]
-    for name in os.listdir(GPT_weight_root):
-        if name.endswith(".ckpt"): GPT_names.append("%s/%s"%(GPT_weight_root,name))
-    return SoVITS_names,GPT_names
-SoVITS_names,GPT_names = get_weights_names()
+
+def scan_audio_files(folder_path):
+    """ Scan the given folder for audio files """
+    return [f for f in os.listdir(folder_path) if f.endswith('.wav')]
+
+def load_audio_text_mappings(folder_path, list_file_name):
+    text_to_audio_mappings = {}
+    audio_to_text_mappings = {}
+    with open(os.path.join(folder_path, list_file_name), 'r', encoding='utf-8') as file:
+        for line in file:
+            parts = line.strip().split('|')
+            if len(parts) >= 4:
+                audio_file_name = parts[0]
+                text = parts[3]
+                audio_file_path = os.path.join(folder_path, audio_file_name)
+                text_to_audio_mappings[text] = audio_file_path
+                audio_to_text_mappings[audio_file_path] = text
+    return text_to_audio_mappings, audio_to_text_mappings
+
+audio_folder_path = 'audio/Taffy'
+text_to_audio_mappings, audio_to_text_mappings = load_audio_text_mappings(audio_folder_path, 'Taffy.list')
 
 with gr.Blocks(title="GPT-SoVITS WebUI") as app:
-    gr.Markdown(
-
-
+    gr.Markdown(value="""
+# <center>【AI塔菲】在线语音生成(GPT-SoVITS)\n
+
+### <center>模型作者:Xz乔希 https://space.bilibili.com/5859321\n
+### <center>GPT-SoVITS在线合集:https://www.modelscope.cn/studios/xzjosh/GPT-SoVITS\n
+### <center>数据集下载:https://huggingface.co/datasets/XzJosh/audiodataset\n
+### <center>声音归属:永雏塔菲 https://space.bilibili.com/1265680561\n
+### <center>GPT-SoVITS项目:https://github.com/RVC-Boss/GPT-SoVITS\n
+### <center>使用本模型请严格遵守法律法规!发布二创作品请标注本项目作者及链接、作品使用GPT-SoVITS AI生成!\n
+### <center>⚠️在线端不稳定且生成速度较慢,强烈建议下载模型本地推理!\n
+""")
+    # with gr.Tabs():
+    #     with gr.TabItem(i18n("伴奏人声分离&去混响&去回声")):
     with gr.Group():
-        gr.Markdown(value=
-        with gr.Row():
-            GPT_dropdown = gr.Dropdown(label=i18n("GPT模型列表"), choices=sorted(GPT_names, key=custom_sort_key), value=gpt_path,interactive=True)
-            SoVITS_dropdown = gr.Dropdown(label=i18n("SoVITS模型列表"), choices=sorted(SoVITS_names, key=custom_sort_key), value=sovits_path,interactive=True)
-            refresh_button = gr.Button(i18n("刷新模型路径"), variant="primary")
-            refresh_button.click(fn=change_choices, inputs=[], outputs=[SoVITS_dropdown, GPT_dropdown])
-            SoVITS_dropdown.change(change_sovits_weights,[SoVITS_dropdown],[])
-            GPT_dropdown.change(change_gpt_weights,[GPT_dropdown],[])
-        gr.Markdown(value=i18n("*请上传并填写参考信息"))
+        gr.Markdown(value="*参考音频选择(必选)")
         with gr.Row():
-
-
-
-
-
-
+            audio_select = gr.Dropdown(label="选择参考音频(不建议选较长的)", choices=list(text_to_audio_mappings.keys()))
+            ref_audio = gr.Audio(label="参考音频试听")
+            ref_text = gr.Textbox(label="参考音频文本")
+
+        # update the reference text and audio when a reference clip is picked
+        def update_ref_text_and_audio(selected_text):
+            audio_path = text_to_audio_mappings.get(selected_text, "")
+            return selected_text, audio_path
+
+        # bind dropdown changes to the update function
+        audio_select.change(update_ref_text_and_audio, [audio_select], [ref_text, ref_audio])
+
+        # remaining Gradio components and wiring
+        prompt_language = gr.Dropdown(
+            label="参考音频语种", choices=["中文", "英文", "日文"], value="中文"
+        )
+        gr.Markdown(value="*请填写需要合成的目标文本")
         with gr.Row():
-            text = gr.Textbox(label=
+            text = gr.Textbox(label="需要合成的文本", value="")
             text_language = gr.Dropdown(
-                label=
-            )
-            how_to_cut = gr.Radio(
-                label=i18n("怎么切"),
-                choices=[i18n("不切"),i18n("凑五句一切"),i18n("凑50字一切"),i18n("按中文句号。切"),i18n("按英文句号.切"),],
-                value=i18n("凑50字一切"),
-                interactive=True,
+                label="需要合成的语种", choices=["中文", "英文", "日文"], value="中文"
             )
-            inference_button = gr.Button(
-            output = gr.Audio(label=
-
+            inference_button = gr.Button("合成语音", variant="primary")
+            output = gr.Audio(label="输出的语音")
         inference_button.click(
             get_tts_wav,
-            [
+            [audio_select, ref_text, prompt_language, text, text_language],
             [output],
         )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-app.queue(concurrency_count=511, max_size=1022).launch(
-    server_name="0.0.0.0",
-    inbrowser=True,
-    share=is_share,
-    server_port=infer_ttswebui,
-    quiet=True,
-)
+
+    gr.Markdown(value="文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。")
+    with gr.Row():
+        text_inp = gr.Textbox(label="需要合成的切分前文本", value="")
+        button1 = gr.Button("凑五句一切", variant="primary")
+        button2 = gr.Button("凑50字一切", variant="primary")
+        button3 = gr.Button("按中文句号。切", variant="primary")
+        text_opt = gr.Textbox(label="切分后文本", value="")
+        button1.click(cut1, [text_inp], [text_opt])
+        button2.click(cut2, [text_inp], [text_opt])
+        button3.click(cut3, [text_inp], [text_opt])
+
+app.queue(max_size=10)
+app.launch(inbrowser=True)
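The new UI replaces free-form reference-audio upload with a dropdown backed by `audio/Taffy/Taffy.list`. `load_audio_text_mappings` reads one record per line, split on `|`, and consumes only fields 0 (file name) and 3 (transcript); judging by those indices the layout follows the GPT-SoVITS training-list convention of `audio|speaker|language|text`, though the meaning of the middle fields is an assumption here. A sketch of a compatible record:

```python
# Hypothetical Taffy.list line; only parts[0] and parts[3] are consumed.
record = "taffy_001.wav|Taffy|ZH|晚上好哇,塔菲来啦!"
parts = record.strip().split('|')
print(parts[0])  # taffy_001.wav -> joined with audio/Taffy to build the clip path
print(parts[3])  # transcript shown in the reference-text textbox
```

Note also that the old public launch, `app.queue(concurrency_count=511, max_size=1022)` bound to `0.0.0.0` with `share=is_share`, gives way to a modest `app.queue(max_size=10)` and a plain local `app.launch(inbrowser=True)`.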