Kit-Lemonfoot committed on
Commit
c8d8351
1 Parent(s): 9b54243

GPTSV_FI Update Part 1

.gitignore CHANGED
@@ -10,3 +10,8 @@ reference
  GPT_weights
  SoVITS_weights
  TEMP
+ PortableGit
+ ffmpeg.exe
+ ffprobe.exe
+ tmp_audio
+ trained
api.py CHANGED
@@ -1,559 +1,734 @@
- """
- # api.py usage
-
- ` python api.py -dr "123.wav" -dt "一二三。" -dl "zh" `
-
- ## Command-line arguments:
-
- `-s` - `SoVITS model path, can be set in config.py`
- `-g` - `GPT model path, can be set in config.py`
-
- Used when a request does not carry its own reference audio:
- `-dr` - `default reference audio path`
- `-dt` - `default reference audio text`
- `-dl` - `default reference audio language, "中文","英文","日文","zh","en","ja"`
-
- `-d` - `inference device, "cuda","cpu"`
- `-a` - `bind address, default "127.0.0.1"`
- `-p` - `bind port, default 9880, can be set in config.py`
- `-fp` - `override config.py and use full precision`
- `-hp` - `override config.py and use half precision`
-
- `-hb` - `cnhubert path`
- `-b` - `bert path`
-
- ## Endpoints:
-
- ### Inference
-
- endpoint: `/`
-
- Using the reference audio given on the command line:
- GET:
- `http://127.0.0.1:9880?text=先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。&text_language=zh`
- POST:
- ```json
- {
-     "text": "先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。",
-     "text_language": "zh"
- }
- ```
-
- Specifying the reference audio for a single request:
- GET:
- `http://127.0.0.1:9880?refer_wav_path=123.wav&prompt_text=一二三。&prompt_language=zh&text=先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。&text_language=zh`
- POST:
- ```json
- {
-     "refer_wav_path": "123.wav",
-     "prompt_text": "一二三。",
-     "prompt_language": "zh",
-     "text": "先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。",
-     "text_language": "zh"
- }
- ```
-
- RESP:
- success: returns the wav audio stream directly, http code 200
- failure: returns json with the error message, http code 400
-
-
- ### Change the default reference audio
-
- endpoint: `/change_refer`
-
- Same keys as the inference endpoint
-
- GET:
- `http://127.0.0.1:9880/change_refer?refer_wav_path=123.wav&prompt_text=一二三。&prompt_language=zh`
- POST:
- ```json
- {
-     "refer_wav_path": "123.wav",
-     "prompt_text": "一二三。",
-     "prompt_language": "zh"
- }
- ```
-
- RESP:
- success: json, http code 200
- failure: json, 400
-
-
- ### Command control
-
- endpoint: `/control`
-
- command:
- "restart": restart the service
- "exit": stop the service
-
- GET:
- `http://127.0.0.1:9880/control?command=restart`
- POST:
- ```json
- {
-     "command": "restart"
- }
- ```
-
- RESP:
-
- """
-
-
- import argparse
- import os
- import sys
-
- now_dir = os.getcwd()
- sys.path.append(now_dir)
- sys.path.append("%s/GPT_SoVITS" % (now_dir))
-
- import signal
- from time import time as ttime
- import torch
- import librosa
- import soundfile as sf
- from fastapi import FastAPI, Request, HTTPException
- from fastapi.responses import StreamingResponse, JSONResponse
- import uvicorn
- from transformers import AutoModelForMaskedLM, AutoTokenizer
- import numpy as np
- from feature_extractor import cnhubert
- from io import BytesIO
- from module.models import SynthesizerTrn
- from AR.models.t2s_lightning_module import Text2SemanticLightningModule
- from text import cleaned_text_to_sequence
- from text.cleaner import clean_text
- from module.mel_processing import spectrogram_torch
- from my_utils import load_audio
- import config as global_config
-
- g_config = global_config.Config()
-
- # AVAILABLE_COMPUTE = "cuda" if torch.cuda.is_available() else "cpu"
-
- parser = argparse.ArgumentParser(description="GPT-SoVITS api")
-
- parser.add_argument("-s", "--sovits_path", type=str, default=g_config.sovits_path, help="SoVITS model path")
- parser.add_argument("-g", "--gpt_path", type=str, default=g_config.gpt_path, help="GPT model path")
-
- parser.add_argument("-dr", "--default_refer_path", type=str, default="", help="default reference audio path")
- parser.add_argument("-dt", "--default_refer_text", type=str, default="", help="default reference audio text")
- parser.add_argument("-dl", "--default_refer_language", type=str, default="", help="default reference audio language")
-
- parser.add_argument("-d", "--device", type=str, default=g_config.infer_device, help="cuda / cpu")
- parser.add_argument("-a", "--bind_addr", type=str, default="0.0.0.0", help="default: 0.0.0.0")
- parser.add_argument("-p", "--port", type=int, default=g_config.api_port, help="default: 9880")
- parser.add_argument("-fp", "--full_precision", action="store_true", default=False, help="override config.is_half to False, use full precision")
- parser.add_argument("-hp", "--half_precision", action="store_true", default=False, help="override config.is_half to True, use half precision")
- # the boolean flags are used as `python ./api.py -fp ...`
- # which gives full_precision==True, half_precision==False
-
- parser.add_argument("-hb", "--hubert_path", type=str, default=g_config.cnhubert_path, help="override config.cnhubert_path")
- parser.add_argument("-b", "--bert_path", type=str, default=g_config.bert_path, help="override config.bert_path")
-
- args = parser.parse_args()
-
- sovits_path = args.sovits_path
- gpt_path = args.gpt_path
-
-
- class DefaultRefer:
-     def __init__(self, path, text, language):
-         self.path = path
-         self.text = text
-         self.language = language
-
-     def is_ready(self) -> bool:
-         return is_full(self.path, self.text, self.language)
-
-
- default_refer = DefaultRefer(args.default_refer_path, args.default_refer_text, args.default_refer_language)
-
- device = args.device
- port = args.port
- host = args.bind_addr
-
- if sovits_path == "":
-     sovits_path = g_config.pretrained_sovits_path
-     print(f"[WARN] No SoVITS model path given, falling back to: {sovits_path}")
- if gpt_path == "":
-     gpt_path = g_config.pretrained_gpt_path
-     print(f"[WARN] No GPT model path given, falling back to: {gpt_path}")
-
- # default reference audio, used when the caller provides none / an incomplete set of reference-audio parameters
- if default_refer.path == "" or default_refer.text == "" or default_refer.language == "":
-     default_refer.path, default_refer.text, default_refer.language = "", "", ""
-     print("[INFO] No default reference audio specified")
- else:
-     print(f"[INFO] Default reference audio path: {default_refer.path}")
-     print(f"[INFO] Default reference audio text: {default_refer.text}")
-     print(f"[INFO] Default reference audio language: {default_refer.language}")
-
- is_half = g_config.is_half
- if args.full_precision:
-     is_half = False
- if args.half_precision:
-     is_half = True
- if args.full_precision and args.half_precision:
-     is_half = g_config.is_half  # contradictory flags: fall back to the config value
-
- print(f"[INFO] Half precision: {is_half}")
-
- cnhubert_base_path = args.hubert_path
- bert_path = args.bert_path
-
- cnhubert.cnhubert_base_path = cnhubert_base_path
- tokenizer = AutoTokenizer.from_pretrained(bert_path)
- bert_model = AutoModelForMaskedLM.from_pretrained(bert_path)
- if is_half:
-     bert_model = bert_model.half().to(device)
- else:
-     bert_model = bert_model.to(device)
-
-
- def is_empty(*items):  # returns False if any item is non-empty
-     for item in items:
-         if item is not None and item != "":
-             return False
-     return True
-
-
- def is_full(*items):  # returns False if any item is empty
-     for item in items:
-         if item is None or item == "":
-             return False
-     return True
-
- def change_sovits_weights(sovits_path):
-     global vq_model, hps
-     dict_s2 = torch.load(sovits_path, map_location="cpu")
-     hps = dict_s2["config"]
-     hps = DictToAttrRecursive(hps)
-     hps.model.semantic_frame_rate = "25hz"
-     vq_model = SynthesizerTrn(
-         hps.data.filter_length // 2 + 1,
-         hps.train.segment_size // hps.data.hop_length,
-         n_speakers=hps.data.n_speakers,
-         **hps.model
-     )
-     if ("pretrained" not in sovits_path):
-         del vq_model.enc_q
-     if is_half == True:
-         vq_model = vq_model.half().to(device)
-     else:
-         vq_model = vq_model.to(device)
-     vq_model.eval()
-     print(vq_model.load_state_dict(dict_s2["weight"], strict=False))
-     with open("./sweight.txt", "w", encoding="utf-8") as f:
-         f.write(sovits_path)
- def change_gpt_weights(gpt_path):
-     global hz, max_sec, t2s_model, config
-     hz = 50
-     dict_s1 = torch.load(gpt_path, map_location="cpu")
-     config = dict_s1["config"]
-     max_sec = config["data"]["max_sec"]
-     t2s_model = Text2SemanticLightningModule(config, "****", is_train=False)
-     t2s_model.load_state_dict(dict_s1["weight"])
-     if is_half == True:
-         t2s_model = t2s_model.half()
-     t2s_model = t2s_model.to(device)
-     t2s_model.eval()
-     total = sum([param.nelement() for param in t2s_model.parameters()])
-     print("Number of parameters: %.2fM" % (total / 1e6))
-     with open("./gweight.txt", "w", encoding="utf-8") as f: f.write(gpt_path)
-
-
- def get_bert_feature(text, word2ph):
-     with torch.no_grad():
-         inputs = tokenizer(text, return_tensors="pt")
-         for i in inputs:
-             inputs[i] = inputs[i].to(device)  # inputs are long tensors, so precision simply follows bert_model
-         res = bert_model(**inputs, output_hidden_states=True)
-         res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()[1:-1]
-     assert len(word2ph) == len(text)
-     phone_level_feature = []
-     for i in range(len(word2ph)):
-         repeat_feature = res[i].repeat(word2ph[i], 1)
-         phone_level_feature.append(repeat_feature)
-     phone_level_feature = torch.cat(phone_level_feature, dim=0)
-     # if(is_half==True):phone_level_feature=phone_level_feature.half()
-     return phone_level_feature.T
-
-
- n_semantic = 1024
- dict_s2 = torch.load(sovits_path, map_location="cpu")
- hps = dict_s2["config"]
-
-
- class DictToAttrRecursive:
-     def __init__(self, input_dict):
-         for key, value in input_dict.items():
-             if isinstance(value, dict):
-                 # recurse when the value is itself a dict
-                 setattr(self, key, DictToAttrRecursive(value))
-             else:
-                 setattr(self, key, value)
-
-
- hps = DictToAttrRecursive(hps)
- hps.model.semantic_frame_rate = "25hz"
- dict_s1 = torch.load(gpt_path, map_location="cpu")
- config = dict_s1["config"]
- ssl_model = cnhubert.get_model()
- if is_half:
-     ssl_model = ssl_model.half().to(device)
- else:
-     ssl_model = ssl_model.to(device)
-
- vq_model = SynthesizerTrn(
-     hps.data.filter_length // 2 + 1,
-     hps.train.segment_size // hps.data.hop_length,
-     n_speakers=hps.data.n_speakers,
-     **hps.model)
- if is_half:
-     vq_model = vq_model.half().to(device)
- else:
-     vq_model = vq_model.to(device)
- vq_model.eval()
- print(vq_model.load_state_dict(dict_s2["weight"], strict=False))
- hz = 50
- max_sec = config['data']['max_sec']
- t2s_model = Text2SemanticLightningModule(config, "****", is_train=False)
- t2s_model.load_state_dict(dict_s1["weight"])
- if is_half:
-     t2s_model = t2s_model.half()
- t2s_model = t2s_model.to(device)
- t2s_model.eval()
- total = sum([param.nelement() for param in t2s_model.parameters()])
- print("Number of parameters: %.2fM" % (total / 1e6))
-
-
- def get_spepc(hps, filename):
-     audio = load_audio(filename, int(hps.data.sampling_rate))
-     audio = torch.FloatTensor(audio)
-     audio_norm = audio
-     audio_norm = audio_norm.unsqueeze(0)
-     spec = spectrogram_torch(audio_norm, hps.data.filter_length, hps.data.sampling_rate, hps.data.hop_length,
-                              hps.data.win_length, center=False)
-     return spec
-
-
- dict_language = {
-     "中文": "zh",
-     "英文": "en",
-     "日文": "ja",
-     "ZH": "zh",
-     "EN": "en",
-     "JA": "ja",
-     "zh": "zh",
-     "en": "en",
-     "ja": "ja"
- }
-
-
- def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language):
-     t0 = ttime()
-     prompt_text = prompt_text.strip("\n")
-     prompt_language, text = prompt_language, text.strip("\n")
-     zero_wav = np.zeros(int(hps.data.sampling_rate * 0.3), dtype=np.float16 if is_half == True else np.float32)
-     with torch.no_grad():
-         wav16k, sr = librosa.load(ref_wav_path, sr=16000)
-         wav16k = torch.from_numpy(wav16k)
-         zero_wav_torch = torch.from_numpy(zero_wav)
-         if (is_half == True):
-             wav16k = wav16k.half().to(device)
-             zero_wav_torch = zero_wav_torch.half().to(device)
-         else:
-             wav16k = wav16k.to(device)
-             zero_wav_torch = zero_wav_torch.to(device)
-         wav16k = torch.cat([wav16k, zero_wav_torch])
-         ssl_content = ssl_model.model(wav16k.unsqueeze(0))["last_hidden_state"].transpose(1, 2)  # .float()
-         codes = vq_model.extract_latent(ssl_content)
-         prompt_semantic = codes[0, 0]
-     t1 = ttime()
-     prompt_language = dict_language[prompt_language]
-     text_language = dict_language[text_language]
-     phones1, word2ph1, norm_text1 = clean_text(prompt_text, prompt_language)
-     phones1 = cleaned_text_to_sequence(phones1)
-     texts = text.split("\n")
-     audio_opt = []
-
-     for text in texts:
-         phones2, word2ph2, norm_text2 = clean_text(text, text_language)
-         phones2 = cleaned_text_to_sequence(phones2)
-         if (prompt_language == "zh"):
-             bert1 = get_bert_feature(norm_text1, word2ph1).to(device)
-         else:
-             bert1 = torch.zeros((1024, len(phones1)), dtype=torch.float16 if is_half == True else torch.float32).to(
-                 device)
-         if (text_language == "zh"):
-             bert2 = get_bert_feature(norm_text2, word2ph2).to(device)
-         else:
-             bert2 = torch.zeros((1024, len(phones2))).to(bert1)
-         bert = torch.cat([bert1, bert2], 1)
-
-         all_phoneme_ids = torch.LongTensor(phones1 + phones2).to(device).unsqueeze(0)
-         bert = bert.to(device).unsqueeze(0)
-         all_phoneme_len = torch.tensor([all_phoneme_ids.shape[-1]]).to(device)
-         prompt = prompt_semantic.unsqueeze(0).to(device)
-         t2 = ttime()
-         with torch.no_grad():
-             # pred_semantic = t2s_model.model.infer(
-             pred_semantic, idx = t2s_model.model.infer_panel(
-                 all_phoneme_ids,
-                 all_phoneme_len,
-                 prompt,
-                 bert,
-                 # prompt_phone_len=ph_offset,
-                 top_k=config['inference']['top_k'],
-                 early_stop_num=hz * max_sec)
-         t3 = ttime()
-         # print(pred_semantic.shape,idx)
-         pred_semantic = pred_semantic[:, -idx:].unsqueeze(0)  # .unsqueeze(0)  # mq needs one extra unsqueeze
-         refer = get_spepc(hps, ref_wav_path)  # .to(device)
-         if (is_half == True):
-             refer = refer.half().to(device)
-         else:
-             refer = refer.to(device)
-         # audio = vq_model.decode(pred_semantic, all_phoneme_ids, refer).detach().cpu().numpy()[0, 0]
-         audio = \
-             vq_model.decode(pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0),
-                             refer).detach().cpu().numpy()[
-                 0, 0]  # try reconstructing without the prompt part
-         audio_opt.append(audio)
-         audio_opt.append(zero_wav)
-         t4 = ttime()
-     print("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3))
-     yield hps.data.sampling_rate, (np.concatenate(audio_opt, 0) * 32768).astype(np.int16)
-
-
- def handle_control(command):
-     if command == "restart":
-         os.execl(g_config.python_exec, g_config.python_exec, *sys.argv)
-     elif command == "exit":
-         os.kill(os.getpid(), signal.SIGTERM)
-         exit(0)
-
-
- def handle_change(path, text, language):
-     if is_empty(path, text, language):
-         return JSONResponse({"code": 400, "message": 'Missing one of the following parameters: "path", "text", "language"'}, status_code=400)
-
-     if path is not None and path != "":  # update only the fields that were provided
-         default_refer.path = path
-     if text is not None and text != "":
-         default_refer.text = text
-     if language is not None and language != "":
-         default_refer.language = language
-
-     print(f"[INFO] Current default reference audio path: {default_refer.path}")
-     print(f"[INFO] Current default reference audio text: {default_refer.text}")
-     print(f"[INFO] Current default reference audio language: {default_refer.language}")
-     print(f"[INFO] is_ready: {default_refer.is_ready()}")
-
-     return JSONResponse({"code": 0, "message": "Success"}, status_code=200)
-
-
- def handle(refer_wav_path, prompt_text, prompt_language, text, text_language):
-     if (
-             refer_wav_path == "" or refer_wav_path is None
-             or prompt_text == "" or prompt_text is None
-             or prompt_language == "" or prompt_language is None
-     ):
-         refer_wav_path, prompt_text, prompt_language = (
-             default_refer.path,
-             default_refer.text,
-             default_refer.language,
-         )
-         if not default_refer.is_ready():
-             return JSONResponse({"code": 400, "message": "No reference audio given and no default preset"}, status_code=400)
-
-     with torch.no_grad():
-         gen = get_tts_wav(
-             refer_wav_path, prompt_text, prompt_language, text, text_language
-         )
-         sampling_rate, audio_data = next(gen)
-
-     wav = BytesIO()
-     sf.write(wav, audio_data, sampling_rate, format="wav")
-     wav.seek(0)
-
-     torch.cuda.empty_cache()
-     return StreamingResponse(wav, media_type="audio/wav")
-
-
- app = FastAPI()
-
- # added by clark ----- 2024-02-21
- # the model can be swapped after startup, so one api instance can serve requests for different speakers
- @app.post("/set_model")
- async def set_model(request: Request):
-     json_post_raw = await request.json()
-     global gpt_path
-     gpt_path = json_post_raw.get("gpt_model_path")
-     global sovits_path
-     sovits_path = json_post_raw.get("sovits_model_path")
-     print("gptpath" + gpt_path + ";vitspath" + sovits_path)
-     change_sovits_weights(sovits_path)
-     change_gpt_weights(gpt_path)
-     return "ok"
- # end of the addition ------
-
- @app.post("/control")
- async def control(request: Request):
-     json_post_raw = await request.json()
-     return handle_control(json_post_raw.get("command"))
-
-
- @app.get("/control")
- async def control(command: str = None):
-     return handle_control(command)
-
-
- @app.post("/change_refer")
- async def change_refer(request: Request):
-     json_post_raw = await request.json()
-     return handle_change(
-         json_post_raw.get("refer_wav_path"),
-         json_post_raw.get("prompt_text"),
-         json_post_raw.get("prompt_language")
-     )
-
-
- @app.get("/change_refer")
- async def change_refer(
-         refer_wav_path: str = None,
-         prompt_text: str = None,
-         prompt_language: str = None
- ):
-     return handle_change(refer_wav_path, prompt_text, prompt_language)
-
-
- @app.post("/")
- async def tts_endpoint(request: Request):
-     json_post_raw = await request.json()
-     return handle(
-         json_post_raw.get("refer_wav_path"),
-         json_post_raw.get("prompt_text"),
-         json_post_raw.get("prompt_language"),
-         json_post_raw.get("text"),
-         json_post_raw.get("text_language"),
-     )
-
-
- @app.get("/")
- async def tts_endpoint(
-         refer_wav_path: str = None,
-         prompt_text: str = None,
-         prompt_language: str = None,
-         text: str = None,
-         text_language: str = None,
- ):
-     return handle(refer_wav_path, prompt_text, prompt_language, text, text_language)
-
-
- if __name__ == "__main__":
-     uvicorn.run(app, host=host, port=port, workers=1)
+ """
+ # api.py usage
+
+ ` python api.py -dr "123.wav" -dt "一二三。" -dl "zh" `
+
+ ## Command-line arguments:
+
+ `-s` - `SoVITS model path, can be set in config.py`
+ `-g` - `GPT model path, can be set in config.py`
+
+ Used when a request does not carry its own reference audio:
+ `-dr` - `default reference audio path`
+ `-dt` - `default reference audio text`
+ `-dl` - `default reference audio language, "中文","英文","日文","zh","en","ja"`
+
+ `-d` - `inference device, "cuda","cpu"`
+ `-a` - `bind address, default "127.0.0.1"`
+ `-p` - `bind port, default 9880, can be set in config.py`
+ `-fp` - `override config.py and use full precision`
+ `-hp` - `override config.py and use half precision`
+ `-sm` - `streaming response mode, off by default, "close","c", "normal","n", "keepalive","k"`
+ `-mt` - `audio encoding of the response, ogg by default when streaming, wav otherwise, "wav", "ogg", "aac"`
+ `-cp` - `text split punctuation, empty by default, passed as a string such as ",.,。"`
+
+ `-hb` - `cnhubert path`
+ `-b` - `bert path`
+
+ ## Endpoints:
+
+ ### Inference
+
+ endpoint: `/`
+
+ Using the reference audio given on the command line:
+ GET:
+ `http://127.0.0.1:9880?text=先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。&text_language=zh`
+ POST:
+ ```json
+ {
+     "text": "先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。",
+     "text_language": "zh"
+ }
+ ```
+
+ Using the command-line reference audio and setting the split punctuation:
+ GET:
+ `http://127.0.0.1:9880?text=先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。&text_language=zh&cut_punc=,。`
+ POST:
+ ```json
+ {
+     "text": "先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。",
+     "text_language": "zh",
+     "cut_punc": ",。"
+ }
+ ```
+
+ Specifying the reference audio for a single request:
+ GET:
+ `http://127.0.0.1:9880?refer_wav_path=123.wav&prompt_text=一二三。&prompt_language=zh&text=先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。&text_language=zh`
+ POST:
+ ```json
+ {
+     "refer_wav_path": "123.wav",
+     "prompt_text": "一二三。",
+     "prompt_language": "zh",
+     "text": "先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。",
+     "text_language": "zh"
+ }
+ ```
+
+ RESP:
+ success: returns the wav audio stream directly, http code 200
+ failure: returns json with the error message, http code 400
+
+
+ ### Change the default reference audio
+
+ endpoint: `/change_refer`
+
+ Same keys as the inference endpoint
+
+ GET:
+ `http://127.0.0.1:9880/change_refer?refer_wav_path=123.wav&prompt_text=一二三。&prompt_language=zh`
+ POST:
+ ```json
+ {
+     "refer_wav_path": "123.wav",
+     "prompt_text": "一二三。",
+     "prompt_language": "zh"
+ }
+ ```
+
+ RESP:
+ success: json, http code 200
+ failure: json, 400
+
+
+ ### Command control
+
+ endpoint: `/control`
+
+ command:
+ "restart": restart the service
+ "exit": stop the service
+
+ GET:
+ `http://127.0.0.1:9880/control?command=restart`
+ POST:
+ ```json
+ {
+     "command": "restart"
+ }
+ ```
+
+ RESP:
+
+ """
+
+
+ import argparse
+ import os, re
+ import sys
+ import signal
+ import LangSegment
+ from time import time as ttime
+ import torch
+ import librosa
+ import soundfile as sf
+ from fastapi import FastAPI, Request, HTTPException
+ from fastapi.responses import StreamingResponse, JSONResponse
+ import uvicorn
+ from transformers import AutoModelForMaskedLM, AutoTokenizer
+ import numpy as np
+ from feature_extractor import cnhubert
+ from io import BytesIO
+ from module.models import SynthesizerTrn
+ from AR.models.t2s_lightning_module import Text2SemanticLightningModule
+ from text import cleaned_text_to_sequence
+ from text.cleaner import clean_text
+ from module.mel_processing import spectrogram_torch
+ from my_utils import load_audio
+ import config as global_config
+ import logging.config  # dictConfig below needs logging.config imported explicitly
+ import subprocess
+
+
+ class DefaultRefer:
+     def __init__(self, path, text, language):
+         self.path = path
+         self.text = text
+         self.language = language
+
+     def is_ready(self) -> bool:
+         return is_full(self.path, self.text, self.language)
+
+
+ def is_empty(*items):  # returns False if any item is non-empty
+     for item in items:
+         if item is not None and item != "":
+             return False
+     return True
+
+
+ def is_full(*items):  # returns False if any item is empty
+     for item in items:
+         if item is None or item == "":
+             return False
+     return True
+
+
+ def change_sovits_weights(sovits_path):
+     global vq_model, hps
+     dict_s2 = torch.load(sovits_path, map_location="cpu")
+     hps = dict_s2["config"]
+     hps = DictToAttrRecursive(hps)
+     hps.model.semantic_frame_rate = "25hz"
+     vq_model = SynthesizerTrn(
+         hps.data.filter_length // 2 + 1,
+         hps.train.segment_size // hps.data.hop_length,
+         n_speakers=hps.data.n_speakers,
+         **hps.model
+     )
+     if ("pretrained" not in sovits_path):
+         del vq_model.enc_q
+     if is_half == True:
+         vq_model = vq_model.half().to(device)
+     else:
+         vq_model = vq_model.to(device)
+     vq_model.eval()
+     vq_model.load_state_dict(dict_s2["weight"], strict=False)
+
+
+ def change_gpt_weights(gpt_path):
+     global hz, max_sec, t2s_model, config
+     hz = 50
+     dict_s1 = torch.load(gpt_path, map_location="cpu")
+     config = dict_s1["config"]
+     max_sec = config["data"]["max_sec"]
+     t2s_model = Text2SemanticLightningModule(config, "****", is_train=False)
+     t2s_model.load_state_dict(dict_s1["weight"])
+     if is_half == True:
+         t2s_model = t2s_model.half()
+     t2s_model = t2s_model.to(device)
+     t2s_model.eval()
+     total = sum([param.nelement() for param in t2s_model.parameters()])
+     logger.info("Number of parameters: %.2fM" % (total / 1e6))
+
+
+ def get_bert_feature(text, word2ph):
+     with torch.no_grad():
+         inputs = tokenizer(text, return_tensors="pt")
+         for i in inputs:
+             inputs[i] = inputs[i].to(device)  # inputs are long tensors, so precision simply follows bert_model
+         res = bert_model(**inputs, output_hidden_states=True)
+         res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()[1:-1]
+     assert len(word2ph) == len(text)
+     phone_level_feature = []
+     for i in range(len(word2ph)):
+         repeat_feature = res[i].repeat(word2ph[i], 1)
+         phone_level_feature.append(repeat_feature)
+     phone_level_feature = torch.cat(phone_level_feature, dim=0)
+     # if(is_half==True):phone_level_feature=phone_level_feature.half()
+     return phone_level_feature.T
+
+
+ def clean_text_inf(text, language):
+     phones, word2ph, norm_text = clean_text(text, language)
+     phones = cleaned_text_to_sequence(phones)
+     return phones, word2ph, norm_text
+
+
+ def get_bert_inf(phones, word2ph, norm_text, language):
+     language = language.replace("all_", "")
+     if language == "zh":
+         bert = get_bert_feature(norm_text, word2ph).to(device)  # .to(dtype)
+     else:
+         bert = torch.zeros(
+             (1024, len(phones)),
+             dtype=torch.float16 if is_half == True else torch.float32,
+         ).to(device)
+
+     return bert
+
+
+ def get_phones_and_bert(text, language):
+     if language in {"en", "all_zh", "all_ja"}:
+         language = language.replace("all_", "")
+         if language == "en":
+             LangSegment.setfilters(["en"])
+             formattext = " ".join(tmp["text"] for tmp in LangSegment.getTexts(text))
+         else:
+             # Chinese and Japanese kanji cannot be told apart, so trust the user's choice of language
+             formattext = text
+         while "  " in formattext:
+             formattext = formattext.replace("  ", " ")
+         phones, word2ph, norm_text = clean_text_inf(formattext, language)
+         if language == "zh":
+             bert = get_bert_feature(norm_text, word2ph).to(device)
+         else:
+             bert = torch.zeros(
+                 (1024, len(phones)),
+                 dtype=torch.float16 if is_half == True else torch.float32,
+             ).to(device)
+     elif language in {"zh", "ja", "auto"}:
+         textlist = []
+         langlist = []
+         LangSegment.setfilters(["zh", "ja", "en", "ko"])
+         if language == "auto":
+             for tmp in LangSegment.getTexts(text):
+                 if tmp["lang"] == "ko":
+                     langlist.append("zh")
+                     textlist.append(tmp["text"])
+                 else:
+                     langlist.append(tmp["lang"])
+                     textlist.append(tmp["text"])
+         else:
+             for tmp in LangSegment.getTexts(text):
+                 if tmp["lang"] == "en":
+                     langlist.append(tmp["lang"])
+                 else:
+                     # Chinese and Japanese kanji cannot be told apart, so trust the user's choice of language
+                     langlist.append(language)
+                 textlist.append(tmp["text"])
+         # logger.info(textlist)
+         # logger.info(langlist)
+         phones_list = []
+         bert_list = []
+         norm_text_list = []
+         for i in range(len(textlist)):
+             lang = langlist[i]
+             phones, word2ph, norm_text = clean_text_inf(textlist[i], lang)
+             bert = get_bert_inf(phones, word2ph, norm_text, lang)
+             phones_list.append(phones)
+             norm_text_list.append(norm_text)
+             bert_list.append(bert)
+         bert = torch.cat(bert_list, dim=1)
+         phones = sum(phones_list, [])
+         norm_text = ''.join(norm_text_list)
+
+     return phones, bert.to(torch.float16 if is_half == True else torch.float32), norm_text
+
+
+ class DictToAttrRecursive:
+     def __init__(self, input_dict):
+         for key, value in input_dict.items():
+             if isinstance(value, dict):
+                 # recurse when the value is itself a dict
+                 setattr(self, key, DictToAttrRecursive(value))
+             else:
+                 setattr(self, key, value)
+
+
+ def get_spepc(hps, filename):
+     audio = load_audio(filename, int(hps.data.sampling_rate))
+     audio = torch.FloatTensor(audio)
+     audio_norm = audio
+     audio_norm = audio_norm.unsqueeze(0)
+     spec = spectrogram_torch(audio_norm, hps.data.filter_length, hps.data.sampling_rate, hps.data.hop_length,
+                              hps.data.win_length, center=False)
+     return spec
+
+
+ def pack_audio(audio_bytes, data, rate):
+     if media_type == "ogg":
+         audio_bytes = pack_ogg(audio_bytes, data, rate)
+     elif media_type == "aac":
+         audio_bytes = pack_aac(audio_bytes, data, rate)
+     else:
+         # wav cannot be streamed, so buffer raw PCM for now
+         audio_bytes = pack_raw(audio_bytes, data, rate)
+
+     return audio_bytes
+
+
+ def pack_ogg(audio_bytes, data, rate):
+     with sf.SoundFile(audio_bytes, mode='w', samplerate=rate, channels=1, format='ogg') as audio_file:
+         audio_file.write(data)
+
+     return audio_bytes
+
+
+ def pack_raw(audio_bytes, data, rate):
+     audio_bytes.write(data.tobytes())
+
+     return audio_bytes
+
+
+ def pack_wav(audio_bytes, rate):
+     data = np.frombuffer(audio_bytes.getvalue(), dtype=np.int16)
+     wav_bytes = BytesIO()
+     sf.write(wav_bytes, data, rate, format='wav')
+
+     return wav_bytes
+
+
+ def pack_aac(audio_bytes, data, rate):
+     process = subprocess.Popen([
+         'ffmpeg',
+         '-f', 's16le',  # input: 16-bit signed little-endian PCM
+         '-ar', str(rate),  # sample rate
+         '-ac', '1',  # mono
+         '-i', 'pipe:0',  # read input from the pipe
+         '-c:a', 'aac',  # encode audio as AAC
+         '-b:a', '192k',  # bitrate
+         '-vn',  # no video
+         '-f', 'adts',  # output an ADTS AAC stream
+         'pipe:1'  # write output to the pipe
+     ], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+     out, _ = process.communicate(input=data.tobytes())
+     audio_bytes.write(out)
+
+     return audio_bytes
+
+
+ def read_clean_buffer(audio_bytes):
+     audio_chunk = audio_bytes.getvalue()
+     audio_bytes.truncate(0)
+     audio_bytes.seek(0)
+
+     return audio_bytes, audio_chunk
+
+
+ def cut_text(text, punc):
+     punc_list = [p for p in punc if p in {",", ".", ";", "?", "!", "、", ",", "。", "?", "!", ";", ":", "…"}]
+     if len(punc_list) > 0:
+         punds = r"[" + "".join(punc_list) + r"]"
+         text = text.strip("\n")
+         items = re.split(f"({punds})", text)
+         mergeitems = ["".join(group) for group in zip(items[::2], items[1::2])]
+         # keep the text intact when there is no punctuation, or none at the very end
+         if len(items) % 2 == 1:
+             mergeitems.append(items[-1])
+         text = "\n".join(mergeitems)
+
+     while "\n\n" in text:
+         text = text.replace("\n\n", "\n")
+
+     return text
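As a quick illustration of the splitting rule above (assuming the module is loaded), `cut_text` breaks the text at each punctuation mark in the given set and keeps an unterminated tail:

```python
print(cut_text("一二,三四。五六", ",。"))
# 一二,
# 三四。
# 五六
```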
+
+
+ def only_punc(text):
+     return not any(t.isalnum() or t.isalpha() for t in text)
+
+
+ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language):
+     t0 = ttime()
+     prompt_text = prompt_text.strip("\n")
+     prompt_language, text = prompt_language, text.strip("\n")
+     zero_wav = np.zeros(int(hps.data.sampling_rate * 0.3), dtype=np.float16 if is_half == True else np.float32)
+     with torch.no_grad():
+         wav16k, sr = librosa.load(ref_wav_path, sr=16000)
+         wav16k = torch.from_numpy(wav16k)
+         zero_wav_torch = torch.from_numpy(zero_wav)
+         if (is_half == True):
+             wav16k = wav16k.half().to(device)
+             zero_wav_torch = zero_wav_torch.half().to(device)
+         else:
+             wav16k = wav16k.to(device)
+             zero_wav_torch = zero_wav_torch.to(device)
+         wav16k = torch.cat([wav16k, zero_wav_torch])
+         ssl_content = ssl_model.model(wav16k.unsqueeze(0))["last_hidden_state"].transpose(1, 2)  # .float()
+         codes = vq_model.extract_latent(ssl_content)
+         prompt_semantic = codes[0, 0]
+     t1 = ttime()
+     prompt_language = dict_language[prompt_language.lower()]
+     text_language = dict_language[text_language.lower()]
+     phones1, bert1, norm_text1 = get_phones_and_bert(prompt_text, prompt_language)
+     texts = text.split("\n")
+     audio_bytes = BytesIO()
+
+     for text in texts:
+         # crude guard: skip punctuation-only segments so they cannot leak the reference audio
+         if only_punc(text):
+             continue
+
+         audio_opt = []
+         phones2, bert2, norm_text2 = get_phones_and_bert(text, text_language)
+         bert = torch.cat([bert1, bert2], 1)
+
+         all_phoneme_ids = torch.LongTensor(phones1 + phones2).to(device).unsqueeze(0)
+         bert = bert.to(device).unsqueeze(0)
+         all_phoneme_len = torch.tensor([all_phoneme_ids.shape[-1]]).to(device)
+         prompt = prompt_semantic.unsqueeze(0).to(device)
+         t2 = ttime()
+         with torch.no_grad():
+             # pred_semantic = t2s_model.model.infer(
+             pred_semantic, idx = t2s_model.model.infer_panel(
+                 all_phoneme_ids,
+                 all_phoneme_len,
+                 prompt,
+                 bert,
+                 # prompt_phone_len=ph_offset,
+                 top_k=config['inference']['top_k'],
+                 early_stop_num=hz * max_sec)
+         t3 = ttime()
+         # print(pred_semantic.shape,idx)
+         pred_semantic = pred_semantic[:, -idx:].unsqueeze(0)  # .unsqueeze(0)  # mq needs one extra unsqueeze
+         refer = get_spepc(hps, ref_wav_path)  # .to(device)
+         if (is_half == True):
+             refer = refer.half().to(device)
+         else:
+             refer = refer.to(device)
+         # audio = vq_model.decode(pred_semantic, all_phoneme_ids, refer).detach().cpu().numpy()[0, 0]
+         audio = \
+             vq_model.decode(pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0),
+                             refer).detach().cpu().numpy()[
+                 0, 0]  # try reconstructing without the prompt part
+         audio_opt.append(audio)
+         audio_opt.append(zero_wav)
+         t4 = ttime()
+         audio_bytes = pack_audio(audio_bytes, (np.concatenate(audio_opt, 0) * 32768).astype(np.int16), hps.data.sampling_rate)
+         # logger.info("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3))
+         if stream_mode == "normal":
+             audio_bytes, audio_chunk = read_clean_buffer(audio_bytes)
+             yield audio_chunk
+
+     if stream_mode != "normal":
+         if media_type == "wav":
+             audio_bytes = pack_wav(audio_bytes, hps.data.sampling_rate)
+         yield audio_bytes.getvalue()
+
+
+ def handle_control(command):
+     if command == "restart":
+         os.execl(g_config.python_exec, g_config.python_exec, *sys.argv)
+     elif command == "exit":
+         os.kill(os.getpid(), signal.SIGTERM)
+         exit(0)
+
+
+ def handle_change(path, text, language):
+     if is_empty(path, text, language):
+         return JSONResponse({"code": 400, "message": 'Missing one of the following parameters: "path", "text", "language"'}, status_code=400)
+
+     if path is not None and path != "":  # update only the fields that were provided
+         default_refer.path = path
+     if text is not None and text != "":
+         default_refer.text = text
+     if language is not None and language != "":
+         default_refer.language = language
+
+     logger.info(f"Current default reference audio path: {default_refer.path}")
+     logger.info(f"Current default reference audio text: {default_refer.text}")
+     logger.info(f"Current default reference audio language: {default_refer.language}")
+     logger.info(f"is_ready: {default_refer.is_ready()}")
+
+     return JSONResponse({"code": 0, "message": "Success"}, status_code=200)
+
+
+ def handle(refer_wav_path, prompt_text, prompt_language, text, text_language, cut_punc):
+     if (
+             refer_wav_path == "" or refer_wav_path is None
+             or prompt_text == "" or prompt_text is None
+             or prompt_language == "" or prompt_language is None
+     ):
+         refer_wav_path, prompt_text, prompt_language = (
+             default_refer.path,
+             default_refer.text,
+             default_refer.language,
+         )
+         if not default_refer.is_ready():
+             return JSONResponse({"code": 400, "message": "No reference audio given and no default preset"}, status_code=400)
+
+     if cut_punc is None:
+         text = cut_text(text, default_cut_punc)
+     else:
+         text = cut_text(text, cut_punc)
+
+     return StreamingResponse(get_tts_wav(refer_wav_path, prompt_text, prompt_language, text, text_language), media_type="audio/" + media_type)
+
+
+ # --------------------------------
+ # Initialization
+ # --------------------------------
+ now_dir = os.getcwd()
+ sys.path.append(now_dir)
+ sys.path.append("%s/GPT_SoVITS" % (now_dir))
+
+ dict_language = {
+     "中文": "all_zh",
+     "英文": "en",
+     "日文": "all_ja",
+     "中英混合": "zh",
+     "日英混合": "ja",
+     "多语种混合": "auto",  # "auto" splits the text and detects the language of each segment
+     "all_zh": "all_zh",
+     "en": "en",
+     "all_ja": "all_ja",
+     "zh": "zh",
+     "ja": "ja",
+     "auto": "auto",
+ }
+
+ # logger
+ logging.config.dictConfig(uvicorn.config.LOGGING_CONFIG)
+ logger = logging.getLogger('uvicorn')
+
+ # load the config
+ g_config = global_config.Config()
+
+ # parse arguments
+ parser = argparse.ArgumentParser(description="GPT-SoVITS api")
+
+ parser.add_argument("-s", "--sovits_path", type=str, default=g_config.sovits_path, help="SoVITS model path")
+ parser.add_argument("-g", "--gpt_path", type=str, default=g_config.gpt_path, help="GPT model path")
+ parser.add_argument("-dr", "--default_refer_path", type=str, default="", help="default reference audio path")
+ parser.add_argument("-dt", "--default_refer_text", type=str, default="", help="default reference audio text")
+ parser.add_argument("-dl", "--default_refer_language", type=str, default="", help="default reference audio language")
+ parser.add_argument("-d", "--device", type=str, default=g_config.infer_device, help="cuda / cpu")
+ parser.add_argument("-a", "--bind_addr", type=str, default="0.0.0.0", help="default: 0.0.0.0")
+ parser.add_argument("-p", "--port", type=int, default=g_config.api_port, help="default: 9880")
+ parser.add_argument("-fp", "--full_precision", action="store_true", default=False, help="override config.is_half to False, use full precision")
+ parser.add_argument("-hp", "--half_precision", action="store_true", default=False, help="override config.is_half to True, use half precision")
+ # the boolean flags are used as `python ./api.py -fp ...`
+ # which gives full_precision==True, half_precision==False
+ parser.add_argument("-sm", "--stream_mode", type=str, default="close", help="streaming response mode, close / normal / keepalive")
+ parser.add_argument("-mt", "--media_type", type=str, default="wav", help="audio encoding format, wav / ogg / aac")
+ parser.add_argument("-cp", "--cut_punc", type=str, default="", help="text split punctuation, chosen from ,.;?!、,。?!;:…")
+ # e.g. split on the common sentence-ending marks with `python ./api.py -cp ".?!。?!"`
+ parser.add_argument("-hb", "--hubert_path", type=str, default=g_config.cnhubert_path, help="override config.cnhubert_path")
+ parser.add_argument("-b", "--bert_path", type=str, default=g_config.bert_path, help="override config.bert_path")
+
+ args = parser.parse_args()
+ sovits_path = args.sovits_path
+ gpt_path = args.gpt_path
+ device = args.device
+ port = args.port
+ host = args.bind_addr
+ cnhubert_base_path = args.hubert_path
+ bert_path = args.bert_path
+ default_cut_punc = args.cut_punc
+
+ # apply the argument configuration
+ default_refer = DefaultRefer(args.default_refer_path, args.default_refer_text, args.default_refer_language)
+
+ # check the model paths
+ if sovits_path == "":
+     sovits_path = g_config.pretrained_sovits_path
+     logger.warning(f"No SoVITS model path given, falling back to: {sovits_path}")
+ if gpt_path == "":
+     gpt_path = g_config.pretrained_gpt_path
+     logger.warning(f"No GPT model path given, falling back to: {gpt_path}")
+
+ # default reference audio, used when the caller provides none / an incomplete set of reference-audio parameters
+ if default_refer.path == "" or default_refer.text == "" or default_refer.language == "":
+     default_refer.path, default_refer.text, default_refer.language = "", "", ""
+     logger.info("No default reference audio specified")
+ else:
+     logger.info(f"Default reference audio path: {default_refer.path}")
+     logger.info(f"Default reference audio text: {default_refer.text}")
+     logger.info(f"Default reference audio language: {default_refer.language}")
+
+ # resolve half precision
+ is_half = g_config.is_half
+ if args.full_precision:
+     is_half = False
+ if args.half_precision:
+     is_half = True
+ if args.full_precision and args.half_precision:
+     is_half = g_config.is_half  # contradictory flags: fall back to the config value
+ logger.info(f"Half precision: {is_half}")
+
+ # streaming response mode
+ if args.stream_mode.lower() in ["normal", "n"]:
+     stream_mode = "normal"
+     logger.info("Streaming responses enabled")
+ else:
+     stream_mode = "close"
+
+ # audio encoding format
+ if args.media_type.lower() in ["aac", "ogg"]:
+     media_type = args.media_type.lower()
+ elif stream_mode == "close":
+     media_type = "wav"
+ else:
+     media_type = "ogg"
+ logger.info(f"Encoding format: {media_type}")
+
+ # initialize the models
+ cnhubert.cnhubert_base_path = cnhubert_base_path
+ tokenizer = AutoTokenizer.from_pretrained(bert_path)
+ bert_model = AutoModelForMaskedLM.from_pretrained(bert_path)
+ ssl_model = cnhubert.get_model()
+ if is_half:
+     bert_model = bert_model.half().to(device)
+     ssl_model = ssl_model.half().to(device)
+ else:
+     bert_model = bert_model.to(device)
+     ssl_model = ssl_model.to(device)
+ change_sovits_weights(sovits_path)
+ change_gpt_weights(gpt_path)
+
+
+ # --------------------------------
+ # API endpoints
+ # --------------------------------
+ app = FastAPI()
+
+ @app.post("/set_model")
+ async def set_model(request: Request):
+     json_post_raw = await request.json()
+     global gpt_path
+     gpt_path = json_post_raw.get("gpt_model_path")
+     global sovits_path
+     sovits_path = json_post_raw.get("sovits_model_path")
+     logger.info("gptpath" + gpt_path + ";vitspath" + sovits_path)
+     change_sovits_weights(sovits_path)
+     change_gpt_weights(gpt_path)
+     return "ok"
+
+
+ @app.post("/control")
+ async def control(request: Request):
+     json_post_raw = await request.json()
+     return handle_control(json_post_raw.get("command"))
+
+
+ @app.get("/control")
+ async def control(command: str = None):
+     return handle_control(command)
+
+
+ @app.post("/change_refer")
+ async def change_refer(request: Request):
+     json_post_raw = await request.json()
+     return handle_change(
+         json_post_raw.get("refer_wav_path"),
+         json_post_raw.get("prompt_text"),
+         json_post_raw.get("prompt_language")
+     )
+
+
+ @app.get("/change_refer")
+ async def change_refer(
+         refer_wav_path: str = None,
+         prompt_text: str = None,
+         prompt_language: str = None
+ ):
+     return handle_change(refer_wav_path, prompt_text, prompt_language)
+
+
+ @app.post("/")
+ async def tts_endpoint(request: Request):
+     json_post_raw = await request.json()
+     return handle(
+         json_post_raw.get("refer_wav_path"),
+         json_post_raw.get("prompt_text"),
+         json_post_raw.get("prompt_language"),
+         json_post_raw.get("text"),
+         json_post_raw.get("text_language"),
+         json_post_raw.get("cut_punc"),
+     )
+
+
+ @app.get("/")
+ async def tts_endpoint(
+         refer_wav_path: str = None,
+         prompt_text: str = None,
+         prompt_language: str = None,
+         text: str = None,
+         text_language: str = None,
+         cut_punc: str = None,
+ ):
+     return handle(refer_wav_path, prompt_text, prompt_language, text, text_language, cut_punc)
+
+
+ if __name__ == "__main__":
+     uvicorn.run(app, host=host, port=port, workers=1)
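Because this revision keeps the `/set_model` endpoint, one server process can switch speakers at runtime before requesting audio. A rough sketch (the weight paths below are placeholders, not files shipped with this commit):

```python
import requests

requests.post("http://127.0.0.1:9880/set_model", json={
    "gpt_model_path": "GPT_weights/example-e15.ckpt",           # placeholder path
    "sovits_model_path": "SoVITS_weights/example_e8_s200.pth",  # placeholder path
})
```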
api_v2.py ADDED
@@ -0,0 +1,453 @@
+ """
+ # WebAPI documentation
+
+ ` python api_v2.py -a 127.0.0.1 -p 9880 -c GPT_SoVITS/configs/tts_infer.yaml `
+
+ ## Command-line arguments:
+ `-a` - `bind address, default "127.0.0.1"`
+ `-p` - `bind port, default 9880`
+ `-c` - `TTS config file path, default "GPT_SoVITS/configs/tts_infer.yaml"`
+
+ ## Endpoints:
+
+ ### Inference
+
+ endpoint: `/tts`
+ GET:
+ ```
+ http://127.0.0.1:9880/tts?text=先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。&text_lang=zh&ref_audio_path=archive_jingyuan_1.wav&prompt_lang=zh&prompt_text=我是「罗浮」云骑将军景元。不必拘谨,「将军」只是一时的身份,你称呼我景元便可&text_split_method=cut5&batch_size=1&media_type=wav&streaming_mode=true
+ ```
+
+ POST:
+ ```json
+ {
+     "text": "",                   # str.(required) text to be synthesized
+     "text_lang": "",              # str.(required) language of the text to be synthesized
+     "ref_audio_path": "",         # str.(required) reference audio path
+     "prompt_text": "",            # str.(optional) prompt text for the reference audio
+     "prompt_lang": "",            # str.(required) language of the prompt text for the reference audio
+     "top_k": 5,                   # int.(optional) top k sampling
+     "top_p": 1,                   # float.(optional) top p sampling
+     "temperature": 1,             # float.(optional) temperature for sampling
+     "text_split_method": "cut5",  # str.(optional) text split method, see text_segmentation_method.py for details.
+     "batch_size": 1,              # int.(optional) batch size for inference
+     "batch_threshold": 0.75,      # float.(optional) threshold for batch splitting.
+     "split_bucket": true,         # bool.(optional) whether to split the batch into multiple buckets.
+     "speed_factor": 1.0,          # float.(optional) control the speed of the synthesized audio.
+     "fragment_interval": 0.3,     # float.(optional) control the interval of the audio fragments.
+     "seed": -1,                   # int.(optional) random seed for reproducibility.
+     "media_type": "wav",          # str.(optional) media type of the output audio, supports "wav", "raw", "ogg", "aac".
+     "streaming_mode": false,      # bool.(optional) whether to return a streaming response.
+     "parallel_infer": true,       # bool.(optional) whether to use parallel inference.
+     "repetition_penalty": 1.35    # float.(optional) repetition penalty for the T2S model.
+ }
+ ```
+
+ RESP:
+ success: returns the wav audio stream directly, http code 200
+ failure: returns json with the error message, http code 400
+
+ ### Command control
+
+ endpoint: `/control`
+
+ command:
+ "restart": restart the service
+ "exit": stop the service
+
+ GET:
+ ```
+ http://127.0.0.1:9880/control?command=restart
+ ```
+ POST:
+ ```json
+ {
+     "command": "restart"
+ }
+ ```
+
+ RESP: none
+
+
+ ### Switch GPT model
+
+ endpoint: `/set_gpt_weights`
+
+ GET:
+ ```
+ http://127.0.0.1:9880/set_gpt_weights?weights_path=GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt
+ ```
+ RESP:
+ success: returns "success", http code 200
+ failure: returns json with the error message, http code 400
+
+
+ ### Switch SoVITS model
+
+ endpoint: `/set_sovits_weights`
+
+ GET:
+ ```
+ http://127.0.0.1:9880/set_sovits_weights?weights_path=GPT_SoVITS/pretrained_models/s2G488k.pth
+ ```
+
+ RESP:
+ success: returns "success", http code 200
+ failure: returns json with the error message, http code 400
+
+ """
+ import os
+ import sys
+ import traceback
+ from typing import Generator
+
+ now_dir = os.getcwd()
+ sys.path.append(now_dir)
+ sys.path.append("%s/GPT_SoVITS" % (now_dir))
+
+ import argparse
+ import subprocess
+ import wave
+ import signal
+ import numpy as np
+ import soundfile as sf
+ from fastapi import FastAPI, Request, HTTPException, Response, UploadFile, File
+ from fastapi.responses import StreamingResponse, JSONResponse
+ import uvicorn
+ from io import BytesIO
+ from tools.i18n.i18n import I18nAuto
+ from GPT_SoVITS.TTS_infer_pack.TTS import TTS, TTS_Config
+ from GPT_SoVITS.TTS_infer_pack.text_segmentation_method import get_method_names as get_cut_method_names
+ from pydantic import BaseModel
+ # print(sys.path)
+ i18n = I18nAuto()
+ cut_method_names = get_cut_method_names()
+
+ parser = argparse.ArgumentParser(description="GPT-SoVITS api")
+ parser.add_argument("-c", "--tts_config", type=str, default="GPT_SoVITS/configs/tts_infer.yaml", help="path to the tts_infer config")
+ parser.add_argument("-a", "--bind_addr", type=str, default="127.0.0.1", help="default: 127.0.0.1")
+ parser.add_argument("-p", "--port", type=int, default=9880, help="default: 9880")
+ args = parser.parse_args()
+ config_path = args.tts_config
+ # device = args.device
+ port = args.port
+ host = args.bind_addr
+ argv = sys.argv
+
+ if config_path in [None, ""]:
+     config_path = "GPT_SoVITS/configs/tts_infer.yaml"
+
+ tts_config = TTS_Config(config_path)
+ tts_pipeline = TTS(tts_config)
+
+ APP = FastAPI()
+
+
+ class TTS_Request(BaseModel):
+     text: str = None
+     text_lang: str = None
+     ref_audio_path: str = None
+     prompt_lang: str = None
+     prompt_text: str = ""
+     top_k: int = 5
+     top_p: float = 1
+     temperature: float = 1
+     text_split_method: str = "cut5"
+     batch_size: int = 1
+     batch_threshold: float = 0.75
+     split_bucket: bool = True
+     speed_factor: float = 1.0
+     fragment_interval: float = 0.3
+     seed: int = -1
+     media_type: str = "wav"
+     streaming_mode: bool = False
+     parallel_infer: bool = True
+     repetition_penalty: float = 1.35
+
+
+ # modified from https://github.com/RVC-Boss/GPT-SoVITS/pull/894/files
+ def pack_ogg(io_buffer: BytesIO, data: np.ndarray, rate: int):
+     with sf.SoundFile(io_buffer, mode='w', samplerate=rate, channels=1, format='ogg') as audio_file:
+         audio_file.write(data)
+     return io_buffer
+
+
+ def pack_raw(io_buffer: BytesIO, data: np.ndarray, rate: int):
+     io_buffer.write(data.tobytes())
+     return io_buffer
+
+
+ def pack_wav(io_buffer: BytesIO, data: np.ndarray, rate: int):
+     io_buffer = BytesIO()
+     sf.write(io_buffer, data, rate, format='wav')
+     return io_buffer
+
+
+ def pack_aac(io_buffer: BytesIO, data: np.ndarray, rate: int):
+     process = subprocess.Popen([
+         'ffmpeg',
+         '-f', 's16le',  # input: 16-bit signed little-endian PCM
+         '-ar', str(rate),  # sample rate
+         '-ac', '1',  # mono
+         '-i', 'pipe:0',  # read input from the pipe
+         '-c:a', 'aac',  # encode audio as AAC
+         '-b:a', '192k',  # bitrate
+         '-vn',  # no video
+         '-f', 'adts',  # output an ADTS AAC stream
+         'pipe:1'  # write output to the pipe
+     ], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+     out, _ = process.communicate(input=data.tobytes())
+     io_buffer.write(out)
+     return io_buffer
+
+
+ def pack_audio(io_buffer: BytesIO, data: np.ndarray, rate: int, media_type: str):
+     if media_type == "ogg":
+         io_buffer = pack_ogg(io_buffer, data, rate)
+     elif media_type == "aac":
+         io_buffer = pack_aac(io_buffer, data, rate)
+     elif media_type == "wav":
+         io_buffer = pack_wav(io_buffer, data, rate)
+     else:
+         io_buffer = pack_raw(io_buffer, data, rate)
+     io_buffer.seek(0)
+     return io_buffer
+
+
+ # from https://huggingface.co/spaces/coqui/voice-chat-with-mistral/blob/main/app.py
+ def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=32000):
+     # This creates a wave header and appends the frame input.
+     # It should come first in a streaming wav file;
+     # later frames should not carry it (or you will hear artifacts at each chunk start).
+     wav_buf = BytesIO()
+     with wave.open(wav_buf, "wb") as vfout:
+         vfout.setnchannels(channels)
+         vfout.setsampwidth(sample_width)
+         vfout.setframerate(sample_rate)
+         vfout.writeframes(frame_input)
+
+     wav_buf.seek(0)
+     return wav_buf.read()
+
+
+ def handle_control(command: str):
+     if command == "restart":
+         os.execl(sys.executable, sys.executable, *argv)
+     elif command == "exit":
+         os.kill(os.getpid(), signal.SIGTERM)
+         exit(0)
+
+
+ def check_params(req: dict):
+     text: str = req.get("text", "")
+     text_lang: str = req.get("text_lang", "")
+     ref_audio_path: str = req.get("ref_audio_path", "")
+     streaming_mode: bool = req.get("streaming_mode", False)
+     media_type: str = req.get("media_type", "wav")
+     prompt_lang: str = req.get("prompt_lang", "")
+     text_split_method: str = req.get("text_split_method", "cut5")
+
+     if ref_audio_path in [None, ""]:
+         return JSONResponse(status_code=400, content={"message": "ref_audio_path is required"})
+     if text in [None, ""]:
+         return JSONResponse(status_code=400, content={"message": "text is required"})
+     if text_lang in [None, ""]:
+         return JSONResponse(status_code=400, content={"message": "text_lang is required"})
+     elif text_lang.lower() not in tts_config.languages:
+         return JSONResponse(status_code=400, content={"message": "text_lang is not supported"})
+     if prompt_lang in [None, ""]:
+         return JSONResponse(status_code=400, content={"message": "prompt_lang is required"})
+     elif prompt_lang.lower() not in tts_config.languages:
+         return JSONResponse(status_code=400, content={"message": "prompt_lang is not supported"})
+     if media_type not in ["wav", "raw", "ogg", "aac"]:
+         return JSONResponse(status_code=400, content={"message": "media_type is not supported"})
+     elif media_type == "ogg" and not streaming_mode:
+         return JSONResponse(status_code=400, content={"message": "ogg format is not supported in non-streaming mode"})
+
+     if text_split_method not in cut_method_names:
+         return JSONResponse(status_code=400, content={"message": f"text_split_method:{text_split_method} is not supported"})
+
+     return None
+
+
+ async def tts_handle(req: dict):
+     """
+     Text to speech handler.
+
+     Args:
+         req (dict):
+             {
+                 "text": "",                   # str.(required) text to be synthesized
+                 "text_lang": "",              # str.(required) language of the text to be synthesized
+                 "ref_audio_path": "",         # str.(required) reference audio path
+                 "prompt_text": "",            # str.(optional) prompt text for the reference audio
+                 "prompt_lang": "",            # str.(required) language of the prompt text for the reference audio
+                 "top_k": 5,                   # int. top k sampling
+                 "top_p": 1,                   # float. top p sampling
+                 "temperature": 1,             # float. temperature for sampling
+                 "text_split_method": "cut5",  # str. text split method, see text_segmentation_method.py for details.
+                 "batch_size": 1,              # int. batch size for inference
+                 "batch_threshold": 0.75,      # float. threshold for batch splitting.
+                 "split_bucket": True,         # bool. whether to split the batch into multiple buckets.
+                 "speed_factor": 1.0,          # float. control the speed of the synthesized audio.
+                 "fragment_interval": 0.3,     # float. control the interval of the audio fragments.
+                 "seed": -1,                   # int. random seed for reproducibility.
+                 "media_type": "wav",          # str. media type of the output audio, supports "wav", "raw", "ogg", "aac".
+                 "streaming_mode": False,      # bool. whether to return a streaming response.
+                 "parallel_infer": True,       # bool.(optional) whether to use parallel inference.
+                 "repetition_penalty": 1.35    # float.(optional) repetition penalty for the T2S model.
+             }
+     returns:
+         StreamingResponse: audio stream response.
+     """
+
+     streaming_mode = req.get("streaming_mode", False)
+     media_type = req.get("media_type", "wav")
+
+     check_res = check_params(req)
+     if check_res is not None:
+         return check_res
+
+     if streaming_mode:
+         req["return_fragment"] = True
+
+     try:
+         tts_generator = tts_pipeline.run(req)
+
+         if streaming_mode:
+             def streaming_generator(tts_generator: Generator, media_type: str):
+                 if media_type == "wav":
+                     yield wave_header_chunk()
+                     media_type = "raw"
+                 for sr, chunk in tts_generator:
+                     yield pack_audio(BytesIO(), chunk, sr, media_type).getvalue()
+             # _media_type = f"audio/{media_type}" if not (streaming_mode and media_type in ["wav", "raw"]) else f"audio/x-{media_type}"
+             return StreamingResponse(streaming_generator(tts_generator, media_type), media_type=f"audio/{media_type}")
+
+         else:
+             sr, audio_data = next(tts_generator)
+             audio_data = pack_audio(BytesIO(), audio_data, sr, media_type).getvalue()
+             return Response(audio_data, media_type=f"audio/{media_type}")
+     except Exception as e:
+         return JSONResponse(status_code=400, content={"message": "tts failed", "Exception": str(e)})
+
+
+ @APP.get("/control")
+ async def control(command: str = None):
+     if command is None:
+         return JSONResponse(status_code=400, content={"message": "command is required"})
+     handle_control(command)
+
+
+ @APP.get("/tts")
+ async def tts_get_endpoint(
+         text: str = None,
+         text_lang: str = None,
+         ref_audio_path: str = None,
+         prompt_lang: str = None,
+         prompt_text: str = "",
+         top_k: int = 5,
+         top_p: float = 1,
+         temperature: float = 1,
+         text_split_method: str = "cut0",
+         batch_size: int = 1,
+         batch_threshold: float = 0.75,
+         split_bucket: bool = True,
+         speed_factor: float = 1.0,
+         fragment_interval: float = 0.3,
+         seed: int = -1,
+         media_type: str = "wav",
+         streaming_mode: bool = False,
+         parallel_infer: bool = True,
+         repetition_penalty: float = 1.35
+ ):
+     req = {
+         "text": text,
+         "text_lang": text_lang.lower(),
+         "ref_audio_path": ref_audio_path,
+         "prompt_text": prompt_text,
+         "prompt_lang": prompt_lang.lower(),
+         "top_k": top_k,
+         "top_p": top_p,
+         "temperature": temperature,
+         "text_split_method": text_split_method,
+         "batch_size": int(batch_size),
+         "batch_threshold": float(batch_threshold),
+         "speed_factor": float(speed_factor),
+         "split_bucket": split_bucket,
+         "fragment_interval": fragment_interval,
+         "seed": seed,
+         "media_type": media_type,
+         "streaming_mode": streaming_mode,
+         "parallel_infer": parallel_infer,
+         "repetition_penalty": float(repetition_penalty)
+     }
+     return await tts_handle(req)
+
+
+ @APP.post("/tts")
+ async def tts_post_endpoint(request: TTS_Request):
+     req = request.dict()
+     return await tts_handle(req)
+
+
+ @APP.get("/set_refer_audio")
+ async def set_refer_audio(refer_audio_path: str = None):
+     try:
+         tts_pipeline.set_ref_audio(refer_audio_path)
+     except Exception as e:
+         return JSONResponse(status_code=400, content={"message": "set refer audio failed", "Exception": str(e)})
+     return JSONResponse(status_code=200, content={"message": "success"})
+
+
+ # @APP.post("/set_refer_audio")
+ # async def set_refer_audio_post(audio_file: UploadFile = File(...)):
+ #     try:
+ #         # check the file type to make sure it is an audio file
+ #         if not audio_file.content_type.startswith("audio/"):
+ #             return JSONResponse(status_code=400, content={"message": "file type is not supported"})
+ #
+ #         os.makedirs("uploaded_audio", exist_ok=True)
+ #         save_path = os.path.join("uploaded_audio", audio_file.filename)
+ #         # save the audio file to a directory on the server
+ #         with open(save_path, "wb") as buffer:
+ #             buffer.write(await audio_file.read())
+ #
+ #         tts_pipeline.set_ref_audio(save_path)
+ #     except Exception as e:
+ #         return JSONResponse(status_code=400, content={"message": "set refer audio failed", "Exception": str(e)})
+ #     return JSONResponse(status_code=200, content={"message": "success"})
+
+ @APP.get("/set_gpt_weights")
+ async def set_gpt_weights(weights_path: str = None):
425
+ try:
426
+ if weights_path in ["", None]:
427
+ return JSONResponse(status_code=400, content={"message": "gpt weight path is required"})
428
+ tts_pipeline.init_t2s_weights(weights_path)
429
+ except Exception as e:
430
+ return JSONResponse(status_code=400, content={"message": f"change gpt weight failed", "Exception": str(e)})
431
+
432
+ return JSONResponse(status_code=200, content={"message": "success"})
433
+
434
+
435
+ @APP.get("/set_sovits_weights")
436
+ async def set_sovits_weights(weights_path: str = None):
437
+ try:
438
+ if weights_path in ["", None]:
439
+ return JSONResponse(status_code=400, content={"message": "sovits weight path is required"})
440
+ tts_pipeline.init_vits_weights(weights_path)
441
+ except Exception as e:
442
+ return JSONResponse(status_code=400, content={"message": f"change sovits weight failed", "Exception": str(e)})
443
+ return JSONResponse(status_code=200, content={"message": "success"})
444
+
445
+
446
+
447
+ if __name__ == "__main__":
448
+ try:
449
+ uvicorn.run(APP, host=host, port=port, workers=1)
450
+ except Exception as e:
451
+ traceback.print_exc()
452
+ os.kill(os.getpid(), signal.SIGTERM)
453
+ exit(0)
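
With these endpoints in place, the API can be exercised end to end. Below is a minimal client sketch, assuming the server is listening on the default 127.0.0.1:9880 bind; the reference-audio path and texts are placeholders:

```python
import requests

BASE = "http://127.0.0.1:9880"  # assumed default bind address/port

payload = {
    "text": "Hello there.",
    "text_lang": "en",
    "ref_audio_path": "123.wav",       # hypothetical reference audio on the server
    "prompt_text": "one two three.",
    "prompt_lang": "en",
    "media_type": "wav",
    "streaming_mode": False,
}

resp = requests.post(f"{BASE}/tts", json=payload)
if resp.status_code == 200:
    with open("out.wav", "wb") as f:
        f.write(resp.content)          # non-streaming: the whole wav body at once
else:
    print(resp.json())                 # error JSON with http 400

# Streaming variant: consume fragments as they arrive.
with requests.post(f"{BASE}/tts", json={**payload, "streaming_mode": True}, stream=True) as r:
    with open("out_stream.wav", "wb") as f:
        for chunk in r.iter_content(chunk_size=4096):
            f.write(chunk)
```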
config.py CHANGED
@@ -1,66 +1,66 @@
+ import sys, os
+ 
+ import torch
+ 
+ # Models specified for inference
+ sovits_path = ""
+ gpt_path = ""
+ is_half_str = os.environ.get("is_half", "True")
+ is_half = is_half_str.lower() == 'true'
+ is_share_str = os.environ.get("is_share", "False")
+ is_share = is_share_str.lower() == 'true'
+ 
+ cnhubert_path = "GPT_SoVITS/pretrained_models/chinese-hubert-base"
+ bert_path = "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large"
+ pretrained_sovits_path = "GPT_SoVITS/pretrained_models/s2G488k.pth"
+ pretrained_gpt_path = "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
+ 
+ exp_root = "logs"
+ python_exec = sys.executable or "python"
+ if torch.cuda.is_available():
+     infer_device = "cuda"
+ else:
+     infer_device = "cpu"
+ 
+ webui_port_main = 9874
+ webui_port_uvr5 = 9873
+ webui_port_infer_tts = 9872
+ webui_port_subfix = 9871
+ 
+ api_port = 9880
+ 
+ if infer_device == "cuda":
+     gpu_name = torch.cuda.get_device_name(0)
+     if (
+         ("16" in gpu_name and "V100" not in gpu_name.upper())
+         or "P40" in gpu_name.upper()
+         or "P10" in gpu_name.upper()
+         or "1060" in gpu_name
+         or "1070" in gpu_name
+         or "1080" in gpu_name
+     ):
+         is_half = False
+ 
+ if infer_device == "cpu":
+     is_half = False
+ 
+ class Config:
+     def __init__(self):
+         self.sovits_path = sovits_path
+         self.gpt_path = gpt_path
+         self.is_half = is_half
+ 
+         self.cnhubert_path = cnhubert_path
+         self.bert_path = bert_path
+         self.pretrained_sovits_path = pretrained_sovits_path
+         self.pretrained_gpt_path = pretrained_gpt_path
+ 
+         self.exp_root = exp_root
+         self.python_exec = python_exec
+         self.infer_device = infer_device
+ 
+         self.webui_port_main = webui_port_main
+         self.webui_port_uvr5 = webui_port_uvr5
+         self.webui_port_infer_tts = webui_port_infer_tts
+         self.webui_port_subfix = webui_port_subfix
+ 
+         self.api_port = api_port
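
Because config.py reads `is_half` and `is_share` from the environment at import time, precision and Gradio sharing can be toggled per run without editing the file. A small sketch of that pattern:

```python
import os

# Must be set before config is imported.
os.environ["is_half"] = "False"   # force full precision
os.environ["is_share"] = "True"   # ask the WebUIs to create public Gradio links

import config
print(config.Config().is_half)    # -> False (unless the GPU check already forced it)
```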
go-webui.bat CHANGED
@@ -1,2 +1,2 @@
+ runtime\python.exe webui.py
+ pause
requirements.txt CHANGED
@@ -1,4 +1,4 @@
- numpy # OpenVoice: numpy==1.22.0
+ numpy
  scipy
  tensorboard
  librosa==0.9.2
@@ -25,14 +25,4 @@ jieba_fast
  jieba
  LangSegment>=0.2.0
  Faster_Whisper
- wordsegment
- faster-whisper==0.9.0
- pydub==0.25.1
- wavmark==0.0.3
- eng_to_ipa==0.0.2
- inflect==7.0.0
- unidecode==1.3.7
- whisper-timestamped==1.14.2
- openai
- python-dotenv
- langid==1.1.6
+ wordsegment
tools/asr/fasterwhisper_asr.py CHANGED
@@ -1,18 +1,16 @@
  import argparse
  import os
- os.environ["HF_ENDPOINT"]="https://hf-mirror.com"
  import traceback
- import requests
- from glob import glob
- import torch
  
+ os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
+ os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
+ 
+ import torch
  from faster_whisper import WhisperModel
  from tqdm import tqdm
  
  from tools.asr.config import check_fw_local_models
  
- os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
- 
  language_code_list = [
      "af", "am", "ar", "as", "az",
      "ba", "be", "bg", "bn", "bo",
@@ -36,7 +34,7 @@ language_code_list = [
      "vi", "yi", "yo", "zh", "yue",
      "auto"]
  
- def execute_asr(input_folder, output_folder, model_size, language,precision):
+ def execute_asr(input_folder, output_folder, model_size, language, precision):
      if '-local' in model_size:
          model_size = model_size[:-6]
          model_path = f'tools/asr/models/faster-whisper-{model_size}'
@@ -50,17 +48,18 @@ def execute_asr(input_folder, output_folder, model_size, language, precision):
          model = WhisperModel(model_path, device=device, compute_type=precision)
      except:
          return print(traceback.format_exc())
+ 
+     input_file_names = os.listdir(input_folder)
+     input_file_names.sort()
+ 
      output = []
      output_file_name = os.path.basename(input_folder)
-     output_file_path = os.path.abspath(f'{output_folder}/{output_file_name}.list')
- 
-     if not os.path.exists(output_folder):
-         os.makedirs(output_folder)
- 
-     for file in tqdm(glob(os.path.join(input_folder, '**/*.wav'), recursive=True)):
+ 
+     for file_name in tqdm(input_file_names):
          try:
+             file_path = os.path.join(input_folder, file_name)
              segments, info = model.transcribe(
-                 audio = file,
+                 audio = file_path,
                  beam_size = 5,
                  vad_filter = True,
                  vad_parameters = dict(min_silence_duration_ms=700),
@@ -68,18 +67,23 @@ def execute_asr(input_folder, output_folder, model_size, language, precision):
              text = ''
  
              if info.language == "zh":
-                 print("检测为中文文本,转funasr处理")
+                 print("检测为中文文本, 转 FunASR 处理")
                  if("only_asr" not in globals()):
-                     from tools.asr.funasr_asr import only_asr  # no need to import (and download) the model for English-only runs
-                 text = only_asr(file)
+                     from tools.asr.funasr_asr import \
+                         only_asr  # no need to import (and download) the model for English-only runs
+                 text = only_asr(file_path)
  
              if text == '':
                  for segment in segments:
                      text += segment.text
-             output.append(f"{file}|{output_file_name}|{info.language.upper()}|{text}")
+             output.append(f"{file_path}|{output_file_name}|{info.language.upper()}|{text}")
          except:
              return print(traceback.format_exc())
- 
+ 
+     output_folder = output_folder or "output/asr_opt"
+     os.makedirs(output_folder, exist_ok=True)
+     output_file_path = os.path.abspath(f'{output_folder}/{output_file_name}.list')
+ 
      with open(output_file_path, "w", encoding="utf-8") as f:
          f.write("\n".join(output))
      print(f"ASR 任务完成->标注文件路径: {output_file_path}\n")
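
For reference, the script can be driven the same way webui.py's `open_asr` assembles its command line; the folder names and model size below are illustrative, not required values:

```python
import subprocess, sys

subprocess.run([
    sys.executable, "tools/asr/fasterwhisper_asr.py",
    "-i", "output/slicer_opt",   # hypothetical folder of sliced wavs
    "-o", "output/asr_opt",      # falls back to output/asr_opt when empty, per the code above
    "-s", "large-v3",            # assumed available model size (or a "-local" variant)
    "-l", "en",                  # or "auto"
    "-p", "float16",
], check=True)
```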
tools/asr/funasr_asr.py CHANGED
@@ -38,10 +38,11 @@ def execute_asr(input_folder, output_folder, model_size, language):
      output = []
      output_file_name = os.path.basename(input_folder)
  
-     for name in tqdm(input_file_names):
+     for file_name in tqdm(input_file_names):
          try:
-             text = model.generate(input="%s/%s"%(input_folder, name))[0]["text"]
-             output.append(f"{input_folder}/{name}|{output_file_name}|{language.upper()}|{text}")
+             file_path = os.path.join(input_folder, file_name)
+             text = model.generate(input=file_path)[0]["text"]
+             output.append(f"{file_path}|{output_file_name}|{language.upper()}|{text}")
          except:
              print(traceback.format_exc())
  
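
Both ASR scripts now emit the same pipe-separated annotation layout, one utterance per line. A sketch of what a line looks like and how downstream code can split it (values illustrative):

```python
# One line of the generated .list annotation file:
line = "output/slicer_opt/vocal_001.wav|slicer_opt|ZH|今天天气不错。"
wav_path, folder_name, lang, text = line.split("|", 3)
```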
tools/cmd-denoise.py CHANGED
@@ -1,29 +1,29 @@
+ import os, argparse
+ 
+ from modelscope.pipelines import pipeline
+ from modelscope.utils.constant import Tasks
+ from tqdm import tqdm
+ 
+ path_denoise = 'tools/denoise-model/speech_frcrn_ans_cirm_16k'
+ path_denoise = path_denoise if os.path.exists(path_denoise) else "damo/speech_frcrn_ans_cirm_16k"
+ ans = pipeline(Tasks.acoustic_noise_suppression, model=path_denoise)
+ def execute_denoise(input_folder, output_folder):
+     os.makedirs(output_folder, exist_ok=True)
+     # print(input_folder)
+     # print(list(os.listdir(input_folder).sort()))
+     for name in tqdm(os.listdir(input_folder)):
+         ans("%s/%s" % (input_folder, name), output_path='%s/%s' % (output_folder, name))
+ 
+ if __name__ == '__main__':
+     parser = argparse.ArgumentParser()
+     parser.add_argument("-i", "--input_folder", type=str, required=True,
+                         help="Path to the folder containing WAV files.")
+     parser.add_argument("-o", "--output_folder", type=str, required=True,
+                         help="Output folder to store transcriptions.")
+     parser.add_argument("-p", "--precision", type=str, default='float16', choices=['float16', 'float32'],
+                         help="fp16 or fp32")  # not wired up yet
+     cmd = parser.parse_args()
+     execute_denoise(
+         input_folder = cmd.input_folder,
+         output_folder = cmd.output_folder,
      )
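
Invoking the denoiser matches its argparse definition above; the paths here are illustrative:

```python
import subprocess, sys

subprocess.run([
    sys.executable, "tools/cmd-denoise.py",
    "-i", "output/slicer_opt",     # hypothetical input folder of wavs
    "-o", "output/denoise_opt",
    "-p", "float16",               # parsed but, per the comment above, not wired up yet
], check=True)
```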
tools/i18n/i18n.py CHANGED
@@ -4,7 +4,7 @@ import os
  
  
  def load_language_list(language):
-     with open(f"./i18n/locale/zh_CN.json", "r", encoding="utf-8") as f:
+     with open(f"./i18n/locale/{language}.json", "r", encoding="utf-8") as f:
          language_list = json.load(f)
      return language_list
  
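
This one-line fix makes `load_language_list` honor the requested locale instead of always loading zh_CN. A minimal sketch of the effect; the locale name is an assumption about which files exist under i18n/locale/:

```python
from tools.i18n.i18n import load_language_list

mapping = load_language_list("en_US")   # now reads ./i18n/locale/en_US.json (assumed to exist)
print(mapping.get("UVR5已开启"))         # -> the English rendering of that key, per the locale file
```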
tools/my_utils.py CHANGED
@@ -1,31 +1,31 @@
+ import platform, os, traceback
+ import ffmpeg
+ import numpy as np
+ 
+ 
+ def load_audio(file, sr):
+     try:
+         # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26
+         # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
+         # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
+         file = clean_path(file)  # guard against pasted paths carrying stray spaces, quotes, or newlines
+         if os.path.exists(file) == False:
+             raise RuntimeError(
+                 "You input a wrong audio path that does not exist, please fix it!"
+             )
+         out, _ = (
+             ffmpeg.input(file, threads=0)
+             .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
+             .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
+         )
+     except Exception as e:
+         traceback.print_exc()
+         raise RuntimeError(f"Failed to load audio: {e}")
+ 
+     return np.frombuffer(out, np.float32).flatten()
+ 
+ 
+ def clean_path(path_str):
+     if platform.system() == 'Windows':
+         path_str = path_str.replace('/', '\\')
+     return path_str.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
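
A short usage sketch of the two helpers; the path is hypothetical:

```python
from tools.my_utils import clean_path, load_audio

# clean_path strips the stray quotes/whitespace that copy-pasted paths often carry
path = clean_path(' "C:/data/sample.wav" \n')
audio = load_audio(path, 32000)   # mono float32 ndarray resampled to 32 kHz
```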
tools/slice_audio.py CHANGED
@@ -1,48 +1,48 @@
+ import os, sys, numpy as np
+ import traceback
+ from scipy.io import wavfile
+ # parent_directory = os.path.dirname(os.path.abspath(__file__))
+ # sys.path.append(parent_directory)
+ from my_utils import load_audio
+ from slicer2 import Slicer
+ 
+ def slice(inp, opt_root, threshold, min_length, min_interval, hop_size, max_sil_kept, _max, alpha, i_part, all_part):
+     os.makedirs(opt_root, exist_ok=True)
+     if os.path.isfile(inp):
+         input = [inp]
+     elif os.path.isdir(inp):
+         input = [os.path.join(inp, name) for name in sorted(list(os.listdir(inp)))]
+     else:
+         return "输入路径存在但既不是文件也不是文件夹"
+     slicer = Slicer(
+         sr=32000,                        # sample rate the long audio is decoded at
+         threshold=int(threshold),        # frames quieter than this (dB) are candidate silence cut points
+         min_length=int(min_length),      # minimum clip length; a too-short head is merged with following clips until exceeding it
+         min_interval=int(min_interval),  # minimum interval between cuts
+         hop_size=int(hop_size),          # hop of the volume curve; smaller = finer resolution but more compute (finer is not always better)
+         max_sil_kept=int(max_sil_kept),  # maximum silence kept around each cut
+     )
+     _max = float(_max)
+     alpha = float(alpha)
+     for inp_path in input[int(i_part)::int(all_part)]:
+         # print(inp_path)
+         try:
+             name = os.path.basename(inp_path)
+             audio = load_audio(inp_path, 32000)
+             # print(audio.shape)
+             for chunk, start, end in slicer.slice(audio):  # start and end are sample offsets
+                 tmp_max = np.abs(chunk).max()
+                 if tmp_max > 1: chunk /= tmp_max
+                 chunk = (chunk / tmp_max * (_max * alpha)) + (1 - alpha) * chunk
+                 wavfile.write(
+                     "%s/%s_%010d_%010d.wav" % (opt_root, name, start, end),
+                     32000,
+                     # chunk.astype(np.float32),
+                     (chunk * 32767).astype(np.int16),
+                 )
+         except:
+             print(inp_path, "->fail->", traceback.format_exc())
+     return "执行完毕,请检查输出文件"
+ 
+ print(slice(*sys.argv[1:]))
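
Launching the slicer as webui.py's `open_slice` does, with a single worker; all argument values below are illustrative:

```python
import subprocess, sys

subprocess.run([
    sys.executable, "tools/slice_audio.py",
    "input_audio",        # inp: a file or a folder (hypothetical)
    "output/slicer_opt",  # opt_root
    "-34", "4000", "300", "10", "500",   # threshold, min_length, min_interval, hop_size, max_sil_kept
    "0.9", "0.25",                       # _max, alpha
    "0", "1",                            # i_part, all_part (single worker)
], check=True)
```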
tools/slicer2.py CHANGED
@@ -1,261 +1,261 @@
+ import numpy as np
+ 
+ 
+ # This function is obtained from librosa.
+ def get_rms(
+     y,
+     frame_length=2048,
+     hop_length=512,
+     pad_mode="constant",
+ ):
+     padding = (int(frame_length // 2), int(frame_length // 2))
+     y = np.pad(y, padding, mode=pad_mode)
+ 
+     axis = -1
+     # put our new within-frame axis at the end for now
+     out_strides = y.strides + tuple([y.strides[axis]])
+     # Reduce the shape on the framing axis
+     x_shape_trimmed = list(y.shape)
+     x_shape_trimmed[axis] -= frame_length - 1
+     out_shape = tuple(x_shape_trimmed) + tuple([frame_length])
+     xw = np.lib.stride_tricks.as_strided(y, shape=out_shape, strides=out_strides)
+     if axis < 0:
+         target_axis = axis - 1
+     else:
+         target_axis = axis + 1
+     xw = np.moveaxis(xw, -1, target_axis)
+     # Downsample along the target axis
+     slices = [slice(None)] * xw.ndim
+     slices[axis] = slice(0, None, hop_length)
+     x = xw[tuple(slices)]
+ 
+     # Calculate power
+     power = np.mean(np.abs(x) ** 2, axis=-2, keepdims=True)
+ 
+     return np.sqrt(power)
+ 
+ 
+ class Slicer:
+     def __init__(
+         self,
+         sr: int,
+         threshold: float = -40.0,
+         min_length: int = 5000,
+         min_interval: int = 300,
+         hop_size: int = 20,
+         max_sil_kept: int = 5000,
+     ):
+         if not min_length >= min_interval >= hop_size:
+             raise ValueError(
+                 "The following condition must be satisfied: min_length >= min_interval >= hop_size"
+             )
+         if not max_sil_kept >= hop_size:
+             raise ValueError(
+                 "The following condition must be satisfied: max_sil_kept >= hop_size"
+             )
+         min_interval = sr * min_interval / 1000
+         self.threshold = 10 ** (threshold / 20.0)
+         self.hop_size = round(sr * hop_size / 1000)
+         self.win_size = min(round(min_interval), 4 * self.hop_size)
+         self.min_length = round(sr * min_length / 1000 / self.hop_size)
+         self.min_interval = round(min_interval / self.hop_size)
+         self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size)
+ 
+     def _apply_slice(self, waveform, begin, end):
+         if len(waveform.shape) > 1:
+             return waveform[
+                 :, begin * self.hop_size : min(waveform.shape[1], end * self.hop_size)
+             ]
+         else:
+             return waveform[
+                 begin * self.hop_size : min(waveform.shape[0], end * self.hop_size)
+             ]
+ 
+     # @timeit
+     def slice(self, waveform):
+         if len(waveform.shape) > 1:
+             samples = waveform.mean(axis=0)
+         else:
+             samples = waveform
+         if samples.shape[0] <= self.min_length:
+             # keep the (chunk, start, end) contract used below and relied on by slice_audio.py
+             return [[waveform, 0, int(samples.shape[0])]]
+         rms_list = get_rms(
+             y=samples, frame_length=self.win_size, hop_length=self.hop_size
+         ).squeeze(0)
+         sil_tags = []
+         silence_start = None
+         clip_start = 0
+         for i, rms in enumerate(rms_list):
+             # Keep looping while frame is silent.
+             if rms < self.threshold:
+                 # Record start of silent frames.
+                 if silence_start is None:
+                     silence_start = i
+                 continue
+             # Keep looping while frame is not silent and silence start has not been recorded.
+             if silence_start is None:
+                 continue
+             # Clear recorded silence start if interval is not enough or clip is too short
+             is_leading_silence = silence_start == 0 and i > self.max_sil_kept
+             need_slice_middle = (
+                 i - silence_start >= self.min_interval
+                 and i - clip_start >= self.min_length
+             )
+             if not is_leading_silence and not need_slice_middle:
+                 silence_start = None
+                 continue
+             # Need slicing. Record the range of silent frames to be removed.
+             if i - silence_start <= self.max_sil_kept:
+                 pos = rms_list[silence_start : i + 1].argmin() + silence_start
+                 if silence_start == 0:
+                     sil_tags.append((0, pos))
+                 else:
+                     sil_tags.append((pos, pos))
+                 clip_start = pos
+             elif i - silence_start <= self.max_sil_kept * 2:
+                 pos = rms_list[
+                     i - self.max_sil_kept : silence_start + self.max_sil_kept + 1
+                 ].argmin()
+                 pos += i - self.max_sil_kept
+                 pos_l = (
+                     rms_list[
+                         silence_start : silence_start + self.max_sil_kept + 1
+                     ].argmin()
+                     + silence_start
+                 )
+                 pos_r = (
+                     rms_list[i - self.max_sil_kept : i + 1].argmin()
+                     + i
+                     - self.max_sil_kept
+                 )
+                 if silence_start == 0:
+                     sil_tags.append((0, pos_r))
+                     clip_start = pos_r
+                 else:
+                     sil_tags.append((min(pos_l, pos), max(pos_r, pos)))
+                     clip_start = max(pos_r, pos)
+             else:
+                 pos_l = (
+                     rms_list[
+                         silence_start : silence_start + self.max_sil_kept + 1
+                     ].argmin()
+                     + silence_start
+                 )
+                 pos_r = (
+                     rms_list[i - self.max_sil_kept : i + 1].argmin()
+                     + i
+                     - self.max_sil_kept
+                 )
+                 if silence_start == 0:
+                     sil_tags.append((0, pos_r))
+                 else:
+                     sil_tags.append((pos_l, pos_r))
+                 clip_start = pos_r
+             silence_start = None
+         # Deal with trailing silence.
+         total_frames = rms_list.shape[0]
+         if (
+             silence_start is not None
+             and total_frames - silence_start >= self.min_interval
+         ):
+             silence_end = min(total_frames, silence_start + self.max_sil_kept)
+             pos = rms_list[silence_start : silence_end + 1].argmin() + silence_start
+             sil_tags.append((pos, total_frames + 1))
+         # Apply and return slices.
+         #### audio + start time + end time
+         if len(sil_tags) == 0:
+             return [[waveform, 0, int(total_frames * self.hop_size)]]
+         else:
+             chunks = []
+             if sil_tags[0][0] > 0:
+                 chunks.append([self._apply_slice(waveform, 0, sil_tags[0][0]), 0, int(sil_tags[0][0] * self.hop_size)])
+             for i in range(len(sil_tags) - 1):
+                 chunks.append(
+                     [self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0]), int(sil_tags[i][1] * self.hop_size), int(sil_tags[i + 1][0] * self.hop_size)]
+                 )
+             if sil_tags[-1][1] < total_frames:
+                 chunks.append(
+                     [self._apply_slice(waveform, sil_tags[-1][1], total_frames), int(sil_tags[-1][1] * self.hop_size), int(total_frames * self.hop_size)]
+                 )
+             return chunks
+ 
+ 
+ def main():
+     import os.path
+     from argparse import ArgumentParser
+ 
+     import librosa
+     import soundfile
+ 
+     parser = ArgumentParser()
+     parser.add_argument("audio", type=str, help="The audio to be sliced")
+     parser.add_argument(
+         "--out", type=str, help="Output directory of the sliced audio clips"
+     )
+     parser.add_argument(
+         "--db_thresh",
+         type=float,
+         required=False,
+         default=-40,
+         help="The dB threshold for silence detection",
+     )
+     parser.add_argument(
+         "--min_length",
+         type=int,
+         required=False,
+         default=5000,
+         help="The minimum milliseconds required for each sliced audio clip",
+     )
+     parser.add_argument(
+         "--min_interval",
+         type=int,
+         required=False,
+         default=300,
+         help="The minimum milliseconds for a silence part to be sliced",
+     )
+     parser.add_argument(
+         "--hop_size",
+         type=int,
+         required=False,
+         default=10,
+         help="Frame length in milliseconds",
+     )
+     parser.add_argument(
+         "--max_sil_kept",
+         type=int,
+         required=False,
+         default=500,
+         help="The maximum silence length kept around the sliced clip, presented in milliseconds",
+     )
+     args = parser.parse_args()
+     out = args.out
+     if out is None:
+         out = os.path.dirname(os.path.abspath(args.audio))
+     audio, sr = librosa.load(args.audio, sr=None, mono=False)
+     slicer = Slicer(
+         sr=sr,
+         threshold=args.db_thresh,
+         min_length=args.min_length,
+         min_interval=args.min_interval,
+         hop_size=args.hop_size,
+         max_sil_kept=args.max_sil_kept,
+     )
+     chunks = slicer.slice(audio)
+     if not os.path.exists(out):
+         os.makedirs(out)
+     # slice() returns (chunk, start, end) triples in this variant, so unpack them here
+     for i, (chunk, start, end) in enumerate(chunks):
+         if len(chunk.shape) > 1:
+             chunk = chunk.T
+         soundfile.write(
+             os.path.join(
+                 out,
+                 "%s_%d.wav"
+                 % (os.path.basename(args.audio).rsplit(".", maxsplit=1)[0], i),
+             ),
+             chunk,
+             sr,
+         )
+ 
+ 
+ if __name__ == "__main__":
+     main()
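
Using the Slicer class directly mirrors what main() and slice_audio.py do; note that slice() returns (chunk, start, end) triples in this repo's variant, with start/end in samples. A sketch, assuming it is run with tools/ on the import path (as slice_audio.py is) and with a hypothetical input file:

```python
import librosa
import soundfile
from slicer2 import Slicer

audio, sr = librosa.load("long_recording.wav", sr=None, mono=False)  # hypothetical file
slicer = Slicer(sr=sr, threshold=-40.0, min_length=5000, min_interval=300,
                hop_size=20, max_sil_kept=500)
for i, (chunk, start, end) in enumerate(slicer.slice(audio)):
    if len(chunk.shape) > 1:
        chunk = chunk.T                      # soundfile expects (frames, channels)
    soundfile.write(f"clip_{i}_{start}_{end}.wav", chunk, sr)
```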
tools/subfix_webui.py CHANGED
@@ -493,6 +493,6 @@ if __name__ == "__main__":
          server_name="0.0.0.0",
          inbrowser=True,
          quiet=True,
-         share=True,
+         share=eval(args.is_share),
          server_port=int(args.webui_port_subfix)
      )
tools/uvr5/lib/lib_v5/modelparams/4band_v3.json CHANGED
@@ -1,54 +1,54 @@
+ {
+     "bins": 672,
+     "unstable_bins": 8,
+     "reduction_bins": 530,
+     "band": {
+         "1": {
+             "sr": 7350,
+             "hl": 80,
+             "n_fft": 640,
+             "crop_start": 0,
+             "crop_stop": 85,
+             "lpf_start": 25,
+             "lpf_stop": 53,
+             "res_type": "polyphase"
+         },
+         "2": {
+             "sr": 7350,
+             "hl": 80,
+             "n_fft": 320,
+             "crop_start": 4,
+             "crop_stop": 87,
+             "hpf_start": 25,
+             "hpf_stop": 12,
+             "lpf_start": 31,
+             "lpf_stop": 62,
+             "res_type": "polyphase"
+         },
+         "3": {
+             "sr": 14700,
+             "hl": 160,
+             "n_fft": 512,
+             "crop_start": 17,
+             "crop_stop": 216,
+             "hpf_start": 48,
+             "hpf_stop": 24,
+             "lpf_start": 139,
+             "lpf_stop": 210,
+             "res_type": "polyphase"
+         },
+         "4": {
+             "sr": 44100,
+             "hl": 480,
+             "n_fft": 960,
+             "crop_start": 78,
+             "crop_stop": 383,
+             "hpf_start": 130,
+             "hpf_stop": 86,
+             "res_type": "kaiser_fast"
+         }
+     },
+     "sr": 44100,
+     "pre_filter_start": 668,
+     "pre_filter_stop": 672
  }
tools/uvr5/webui.py CHANGED
@@ -73,8 +73,7 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format
              os.path.basename(inp_path),
          )
          os.system(
-             "ffmpeg -i %s -vn -acodec pcm_s16le -ac 2 -ar 44100 %s -y"
-             % (inp_path, tmp_path)
+             f'ffmpeg -i "{inp_path}" -vn -acodec pcm_s16le -ac 2 -ar 44100 "{tmp_path}" -y'
          )
          inp_path = tmp_path
          try:
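
The quoting added above protects paths containing spaces. An equivalent sketch that sidesteps shell quoting entirely by passing the same ffmpeg flags as a list (paths hypothetical):

```python
import subprocess

inp_path = "input with spaces.mp3"        # hypothetical source file
tmp_path = "TEMP/input with spaces.wav"   # hypothetical converted output
subprocess.run([
    "ffmpeg", "-i", inp_path, "-vn", "-acodec", "pcm_s16le",
    "-ac", "2", "-ar", "44100", tmp_path, "-y",
], check=True)
```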
webui.py CHANGED
@@ -1,878 +1,878 @@
- import os,shutil,sys,pdb,re
- now_dir = os.getcwd()
- sys.path.insert(0, now_dir)
- import json,yaml,warnings,torch
- import platform
- import psutil
- import signal
- 
- warnings.filterwarnings("ignore")
- torch.manual_seed(233333)
- tmp = os.path.join(now_dir, "TEMP")
- os.makedirs(tmp, exist_ok=True)
- os.environ["TEMP"] = tmp
- if(os.path.exists(tmp)):
-     for name in os.listdir(tmp):
-         if(name=="jieba.cache"):continue
-         path="%s/%s"%(tmp,name)
-         delete=os.remove if os.path.isfile(path) else shutil.rmtree
-         try:
-             delete(path)
-         except Exception as e:
-             print(str(e))
-             pass
- import site
- site_packages_roots = []
- for path in site.getsitepackages():
-     if "packages" in path:
-         site_packages_roots.append(path)
- if(site_packages_roots==[]):site_packages_roots=["%s/runtime/Lib/site-packages" % now_dir]
- #os.environ["OPENBLAS_NUM_THREADS"] = "4"
- os.environ["no_proxy"] = "localhost, 127.0.0.1, ::1"
- os.environ["all_proxy"] = ""
- for site_packages_root in site_packages_roots:
-     if os.path.exists(site_packages_root):
-         try:
-             with open("%s/users.pth" % (site_packages_root), "w") as f:
-                 f.write(
-                     "%s\n%s/tools\n%s/tools/damo_asr\n%s/GPT_SoVITS\n%s/tools/uvr5"
-                     % (now_dir, now_dir, now_dir, now_dir, now_dir)
-                 )
-             break
-         except PermissionError:
-             pass
- from tools import my_utils
- import traceback
- import shutil
- import pdb
- import gradio as gr
- from subprocess import Popen
- import signal
- from config import python_exec,infer_device,is_half,exp_root,webui_port_main,webui_port_infer_tts,webui_port_uvr5,webui_port_subfix,is_share
- from tools.i18n.i18n import I18nAuto
- i18n = I18nAuto()
- from scipy.io import wavfile
- from tools.my_utils import load_audio
- from multiprocessing import cpu_count
- 
- # os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'  # fall back to CPU for steps MPS does not support
- 
- n_cpu=cpu_count()
- 
- ngpu = torch.cuda.device_count()
- gpu_infos = []
- mem = []
- if_gpu_ok = False
- 
- # Check whether an NVIDIA GPU usable for training and accelerated inference is present
- if torch.cuda.is_available() or ngpu != 0:
-     for i in range(ngpu):
-         gpu_name = torch.cuda.get_device_name(i)
-         if any(value in gpu_name.upper() for value in ["10","16","20","30","40","A2","A3","A4","P4","A50","500","A60","70","80","90","M4","T4","TITAN","L4","4060"]):
-             # A10#A100#V100#A40#P40#M40#K80#A4500
-             if_gpu_ok = True  # at least one usable NVIDIA GPU
-             gpu_infos.append("%s\t%s" % (i, gpu_name))
-             mem.append(int(torch.cuda.get_device_properties(i).total_memory/ 1024/ 1024/ 1024+ 0.4))
- # # Check whether MPS acceleration is supported
- # if torch.backends.mps.is_available():
- #     if_gpu_ok = True
- #     gpu_infos.append("%s\t%s" % ("0", "Apple GPU"))
- #     mem.append(psutil.virtual_memory().total/ 1024 / 1024 / 1024)  # in practice, using system RAM as VRAM does not run out of memory
- 
- if if_gpu_ok and len(gpu_infos) > 0:
-     gpu_info = "\n".join(gpu_infos)
-     default_batch_size = min(mem) // 2
- else:
-     gpu_info = ("%s\t%s" % ("0", "CPU"))
-     gpu_infos.append("%s\t%s" % ("0", "CPU"))
-     default_batch_size = psutil.virtual_memory().total/ 1024 / 1024 / 1024 / 2
- gpus = "-".join([i[0] for i in gpu_infos])
- 
- pretrained_sovits_name="GPT_SoVITS/pretrained_models/s2G488k.pth"
- pretrained_gpt_name="GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
- def get_weights_names():
-     SoVITS_names = [pretrained_sovits_name]
-     for name in os.listdir(SoVITS_weight_root):
-         if name.endswith(".pth"):SoVITS_names.append(name)
-     GPT_names = [pretrained_gpt_name]
-     for name in os.listdir(GPT_weight_root):
-         if name.endswith(".ckpt"): GPT_names.append(name)
-     return SoVITS_names,GPT_names
- SoVITS_weight_root="SoVITS_weights"
- GPT_weight_root="GPT_weights"
- os.makedirs(SoVITS_weight_root,exist_ok=True)
- os.makedirs(GPT_weight_root,exist_ok=True)
- SoVITS_names,GPT_names = get_weights_names()
- 
- def custom_sort_key(s):
-     # Split the string into digit and non-digit parts with a regex
-     parts = re.split('(\d+)', s)
-     # Convert the digit parts to integers; leave the non-digit parts unchanged
-     parts = [int(part) if part.isdigit() else part for part in parts]
-     return parts
- 
- def change_choices():
-     SoVITS_names, GPT_names = get_weights_names()
-     return {"choices": sorted(SoVITS_names,key=custom_sort_key), "__type__": "update"}, {"choices": sorted(GPT_names,key=custom_sort_key), "__type__": "update"}
- 
- p_label=None
- p_uvr5=None
- p_asr=None
- p_denoise=None
- p_tts_inference=None
- 
- def kill_proc_tree(pid, including_parent=True):
-     try:
-         parent = psutil.Process(pid)
-     except psutil.NoSuchProcess:
-         # Process already terminated
-         return
- 
-     children = parent.children(recursive=True)
-     for child in children:
-         try:
-             os.kill(child.pid, signal.SIGTERM)  # or signal.SIGKILL
-         except OSError:
-             pass
-     if including_parent:
-         try:
-             os.kill(parent.pid, signal.SIGTERM)  # or signal.SIGKILL
-         except OSError:
-             pass
- 
- system=platform.system()
- def kill_process(pid):
-     if(system=="Windows"):
-         cmd = "taskkill /t /f /pid %s" % pid
-         os.system(cmd)
-     else:
-         kill_proc_tree(pid)
- 
- 
- def change_label(if_label,path_list):
-     global p_label
-     if(if_label==True and p_label==None):
-         path_list=my_utils.clean_path(path_list)
-         cmd = '"%s" tools/subfix_webui.py --load_list "%s" --webui_port %s --is_share %s'%(python_exec,path_list,webui_port_subfix,is_share)
-         yield i18n("打标工具WebUI已开启")
-         print(cmd)
-         p_label = Popen(cmd, shell=True)
-     elif(if_label==False and p_label!=None):
-         kill_process(p_label.pid)
-         p_label=None
-         yield i18n("打标工具WebUI已关闭")
- 
- def change_uvr5(if_uvr5):
-     global p_uvr5
-     if(if_uvr5==True and p_uvr5==None):
-         cmd = '"%s" tools/uvr5/webui.py "%s" %s %s %s'%(python_exec,infer_device,is_half,webui_port_uvr5,is_share)
-         yield i18n("UVR5已开启")
-         print(cmd)
-         p_uvr5 = Popen(cmd, shell=True)
-     elif(if_uvr5==False and p_uvr5!=None):
-         kill_process(p_uvr5.pid)
-         p_uvr5=None
-         yield i18n("UVR5已关闭")
- 
- def change_tts_inference(if_tts,bert_path,cnhubert_base_path,gpu_number,gpt_path,sovits_path):
-     global p_tts_inference
-     if(if_tts==True and p_tts_inference==None):
-         os.environ["gpt_path"]=gpt_path if "/" in gpt_path else "%s/%s"%(GPT_weight_root,gpt_path)
-         os.environ["sovits_path"]=sovits_path if "/" in sovits_path else "%s/%s"%(SoVITS_weight_root,sovits_path)
-         os.environ["cnhubert_base_path"]=cnhubert_base_path
-         os.environ["bert_path"]=bert_path
-         os.environ["_CUDA_VISIBLE_DEVICES"]=gpu_number
-         os.environ["is_half"]=str(is_half)
-         os.environ["infer_ttswebui"]=str(webui_port_infer_tts)
-         os.environ["is_share"]=str(is_share)
-         cmd = '"%s" GPT_SoVITS/inference_webui.py'%(python_exec)
-         yield i18n("TTS推理进程已开启")
-         print(cmd)
-         p_tts_inference = Popen(cmd, shell=True)
-     elif(if_tts==False and p_tts_inference!=None):
-         kill_process(p_tts_inference.pid)
-         p_tts_inference=None
-         yield i18n("TTS推理进程已关闭")
- 
197
- from tools.asr.config import asr_dict
198
- def open_asr(asr_inp_dir, asr_opt_dir, asr_model, asr_model_size, asr_lang):
199
- global p_asr
200
- if(p_asr==None):
201
- asr_inp_dir=my_utils.clean_path(asr_inp_dir)
202
- cmd = f'"{python_exec}" tools/asr/{asr_dict[asr_model]["path"]}'
203
- cmd += f' -i "{asr_inp_dir}"'
204
- cmd += f' -o "{asr_opt_dir}"'
205
- cmd += f' -s {asr_model_size}'
206
- cmd += f' -l {asr_lang}'
207
- cmd += " -p %s"%("float16"if is_half==True else "float32")
208
-
209
- yield "ASR任务开启:%s"%cmd,{"__type__":"update","visible":False},{"__type__":"update","visible":True}
210
- print(cmd)
211
- p_asr = Popen(cmd, shell=True)
212
- p_asr.wait()
213
- p_asr=None
214
- yield f"ASR任务完成, 查看终端进行下一步",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
215
- else:
216
- yield "已有正在进行的ASR任务,需先终止才能开启下一次任务",{"__type__":"update","visible":False},{"__type__":"update","visible":True}
217
- # return None
218
-
219
- def close_asr():
220
- global p_asr
221
- if(p_asr!=None):
222
- kill_process(p_asr.pid)
223
- p_asr=None
224
- return "已终止ASR进程",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
225
- def open_denoise(denoise_inp_dir, denoise_opt_dir):
226
- global p_denoise
227
- if(p_denoise==None):
228
- denoise_inp_dir=my_utils.clean_path(denoise_inp_dir)
229
- denoise_opt_dir=my_utils.clean_path(denoise_opt_dir)
230
- cmd = '"%s" tools/cmd-denoise.py -i "%s" -o "%s" -p %s'%(python_exec,denoise_inp_dir,denoise_opt_dir,"float16"if is_half==True else "float32")
231
-
232
- yield "语音降噪任务开启:%s"%cmd,{"__type__":"update","visible":False},{"__type__":"update","visible":True}
233
- print(cmd)
234
- p_denoise = Popen(cmd, shell=True)
235
- p_denoise.wait()
236
- p_denoise=None
237
- yield f"语音降噪任务完成, 查看终端进行下一步",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
238
- else:
239
- yield "已有正在进行的语音降噪任务,需先终止才能开启下一次任务",{"__type__":"update","visible":False},{"__type__":"update","visible":True}
240
- # return None
241
-
242
- def close_denoise():
243
- global p_denoise
244
- if(p_denoise!=None):
245
- kill_process(p_denoise.pid)
246
- p_denoise=None
247
- return "已终止语音降噪进程",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
248
-
249
- p_train_SoVITS=None
250
- def open1Ba(batch_size,total_epoch,exp_name,text_low_lr_rate,if_save_latest,if_save_every_weights,save_every_epoch,gpu_numbers1Ba,pretrained_s2G,pretrained_s2D):
251
- global p_train_SoVITS
252
- if(p_train_SoVITS==None):
253
- with open("GPT_SoVITS/configs/s2.json")as f:
254
- data=f.read()
255
- data=json.loads(data)
256
- s2_dir="%s/%s"%(exp_root,exp_name)
257
- os.makedirs("%s/logs_s2"%(s2_dir),exist_ok=True)
258
- if(is_half==False):
259
- data["train"]["fp16_run"]=False
260
- batch_size=max(1,batch_size//2)
261
- data["train"]["batch_size"]=batch_size
262
- data["train"]["epochs"]=total_epoch
263
- data["train"]["text_low_lr_rate"]=text_low_lr_rate
264
- data["train"]["pretrained_s2G"]=pretrained_s2G
265
- data["train"]["pretrained_s2D"]=pretrained_s2D
266
- data["train"]["if_save_latest"]=if_save_latest
267
- data["train"]["if_save_every_weights"]=if_save_every_weights
268
- data["train"]["save_every_epoch"]=save_every_epoch
269
- data["train"]["gpu_numbers"]=gpu_numbers1Ba
270
- data["data"]["exp_dir"]=data["s2_ckpt_dir"]=s2_dir
271
- data["save_weight_dir"]=SoVITS_weight_root
272
- data["name"]=exp_name
273
- tmp_config_path="%s/tmp_s2.json"%tmp
274
- with open(tmp_config_path,"w")as f:f.write(json.dumps(data))
275
-
276
- cmd = '"%s" GPT_SoVITS/s2_train.py --config "%s"'%(python_exec,tmp_config_path)
277
- yield "SoVITS训练开始:%s"%cmd,{"__type__":"update","visible":False},{"__type__":"update","visible":True}
278
- print(cmd)
279
- p_train_SoVITS = Popen(cmd, shell=True)
280
- p_train_SoVITS.wait()
281
- p_train_SoVITS=None
282
- yield "SoVITS训练完成",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
283
- else:
284
- yield "已有正在进行的SoVITS训练任务,需先终止才能开启下一次任务",{"__type__":"update","visible":False},{"__type__":"update","visible":True}
285
-
286
- def close1Ba():
287
- global p_train_SoVITS
288
- if(p_train_SoVITS!=None):
289
- kill_process(p_train_SoVITS.pid)
290
- p_train_SoVITS=None
291
- return "已终止SoVITS训练",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
292
-
293
- p_train_GPT=None
294
- def open1Bb(batch_size,total_epoch,exp_name,if_dpo,if_save_latest,if_save_every_weights,save_every_epoch,gpu_numbers,pretrained_s1):
295
- global p_train_GPT
296
- if(p_train_GPT==None):
297
- with open("GPT_SoVITS/configs/s1longer.yaml")as f:
298
- data=f.read()
299
- data=yaml.load(data, Loader=yaml.FullLoader)
300
- s1_dir="%s/%s"%(exp_root,exp_name)
301
- os.makedirs("%s/logs_s1"%(s1_dir),exist_ok=True)
302
- if(is_half==False):
303
- data["train"]["precision"]="32"
304
- batch_size = max(1, batch_size // 2)
305
- data["train"]["batch_size"]=batch_size
306
- data["train"]["epochs"]=total_epoch
307
- data["pretrained_s1"]=pretrained_s1
308
- data["train"]["save_every_n_epoch"]=save_every_epoch
309
- data["train"]["if_save_every_weights"]=if_save_every_weights
310
- data["train"]["if_save_latest"]=if_save_latest
311
- data["train"]["if_dpo"]=if_dpo
312
- data["train"]["half_weights_save_dir"]=GPT_weight_root
313
- data["train"]["exp_name"]=exp_name
314
- data["train_semantic_path"]="%s/6-name2semantic.tsv"%s1_dir
315
- data["train_phoneme_path"]="%s/2-name2text.txt"%s1_dir
316
- data["output_dir"]="%s/logs_s1"%s1_dir
317
-
318
- os.environ["_CUDA_VISIBLE_DEVICES"]=gpu_numbers.replace("-",",")
319
- os.environ["hz"]="25hz"
320
- tmp_config_path="%s/tmp_s1.yaml"%tmp
321
- with open(tmp_config_path, "w") as f:f.write(yaml.dump(data, default_flow_style=False))
322
- # cmd = '"%s" GPT_SoVITS/s1_train.py --config_file "%s" --train_semantic_path "%s/6-name2semantic.tsv" --train_phoneme_path "%s/2-name2text.txt" --output_dir "%s/logs_s1"'%(python_exec,tmp_config_path,s1_dir,s1_dir,s1_dir)
323
- cmd = '"%s" GPT_SoVITS/s1_train.py --config_file "%s" '%(python_exec,tmp_config_path)
324
- yield "GPT训练开始:%s"%cmd,{"__type__":"update","visible":False},{"__type__":"update","visible":True}
325
- print(cmd)
326
- p_train_GPT = Popen(cmd, shell=True)
327
- p_train_GPT.wait()
328
- p_train_GPT=None
329
- yield "GPT训练完成",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
330
- else:
331
- yield "已有正在进行的GPT训练任务,需先终止才能开启下一次任务",{"__type__":"update","visible":False},{"__type__":"update","visible":True}
332
-
333
- def close1Bb():
334
- global p_train_GPT
335
- if(p_train_GPT!=None):
336
- kill_process(p_train_GPT.pid)
337
- p_train_GPT=None
338
- return "已终止GPT训练",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
339
-
340
- ps_slice=[]
341
- def open_slice(inp,opt_root,threshold,min_length,min_interval,hop_size,max_sil_kept,_max,alpha,n_parts):
342
- global ps_slice
343
- inp = my_utils.clean_path(inp)
344
- opt_root = my_utils.clean_path(opt_root)
345
- if(os.path.exists(inp)==False):
346
- yield "输入路径不存在",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
347
- return
348
- if os.path.isfile(inp):n_parts=1
349
- elif os.path.isdir(inp):pass
350
- else:
351
- yield "输入路径存在但既不���文件也不是文件夹",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
352
- return
353
- if (ps_slice == []):
354
- for i_part in range(n_parts):
355
- cmd = '"%s" tools/slice_audio.py "%s" "%s" %s %s %s %s %s %s %s %s %s''' % (python_exec,inp, opt_root, threshold, min_length, min_interval, hop_size, max_sil_kept, _max, alpha, i_part, n_parts)
356
- print(cmd)
357
- p = Popen(cmd, shell=True)
358
- ps_slice.append(p)
359
- yield "切割执行中", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
360
- for p in ps_slice:
361
- p.wait()
362
- ps_slice=[]
363
- yield "切割结束",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
364
- else:
365
- yield "已有正在进行的切割任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
366
-
367
- def close_slice():
368
- global ps_slice
369
- if (ps_slice != []):
370
- for p_slice in ps_slice:
371
- try:
372
- kill_process(p_slice.pid)
373
- except:
374
- traceback.print_exc()
375
- ps_slice=[]
376
- return "已终止所有切割进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
377
-
378
- ps1a=[]
- def open1a(inp_text,inp_wav_dir,exp_name,gpu_numbers,bert_pretrained_dir):
-     global ps1a
-     inp_text = my_utils.clean_path(inp_text)
-     inp_wav_dir = my_utils.clean_path(inp_wav_dir)
-     if (ps1a == []):
-         opt_dir="%s/%s"%(exp_root,exp_name)
-         config={
-             "inp_text":inp_text,
-             "inp_wav_dir":inp_wav_dir,
-             "exp_name":exp_name,
-             "opt_dir":opt_dir,
-             "bert_pretrained_dir":bert_pretrained_dir,
-         }
-         gpu_names=gpu_numbers.split("-")
-         all_parts=len(gpu_names)
-         for i_part in range(all_parts):
-             config.update(
-                 {
-                     "i_part": str(i_part),
-                     "all_parts": str(all_parts),
-                     "_CUDA_VISIBLE_DEVICES": gpu_names[i_part],
-                     "is_half": str(is_half)
-                 }
-             )
-             os.environ.update(config)
-             cmd = '"%s" GPT_SoVITS/prepare_datasets/1-get-text.py'%python_exec
-             print(cmd)
-             p = Popen(cmd, shell=True)
-             ps1a.append(p)
-         yield "文本进程执行中", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
-         for p in ps1a:
-             p.wait()
-         opt = []
-         for i_part in range(all_parts):
-             txt_path = "%s/2-name2text-%s.txt" % (opt_dir, i_part)
-             with open(txt_path, "r", encoding="utf8") as f:
-                 opt += f.read().strip("\n").split("\n")
-             os.remove(txt_path)
-         path_text = "%s/2-name2text.txt" % opt_dir
-         with open(path_text, "w", encoding="utf8") as f:
-             f.write("\n".join(opt) + "\n")
-         ps1a=[]
-         yield "文本进程结束",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
-     else:
-         yield "已有正在进行的文本任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
-
- def close1a():
-     global ps1a
-     if (ps1a != []):
-         for p1a in ps1a:
-             try:
-                 kill_process(p1a.pid)
-             except:
-                 traceback.print_exc()
-         ps1a=[]
-     return "已终止所有1a进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
-
- ps1b=[]
- def open1b(inp_text,inp_wav_dir,exp_name,gpu_numbers,ssl_pretrained_dir):
-     global ps1b
-     inp_text = my_utils.clean_path(inp_text)
-     inp_wav_dir = my_utils.clean_path(inp_wav_dir)
-     if (ps1b == []):
-         config={
-             "inp_text":inp_text,
-             "inp_wav_dir":inp_wav_dir,
-             "exp_name":exp_name,
-             "opt_dir":"%s/%s"%(exp_root,exp_name),
-             "cnhubert_base_dir":ssl_pretrained_dir,
-             "is_half": str(is_half)
-         }
-         gpu_names=gpu_numbers.split("-")
-         all_parts=len(gpu_names)
-         for i_part in range(all_parts):
-             config.update(
-                 {
-                     "i_part": str(i_part),
-                     "all_parts": str(all_parts),
-                     "_CUDA_VISIBLE_DEVICES": gpu_names[i_part],
-                 }
-             )
-             os.environ.update(config)
-             cmd = '"%s" GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py'%python_exec
-             print(cmd)
-             p = Popen(cmd, shell=True)
-             ps1b.append(p)
-         yield "SSL提取进程执行中", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
-         for p in ps1b:
-             p.wait()
-         ps1b=[]
-         yield "SSL提取进程结束",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
-     else:
-         yield "已有正在进行的SSL提取任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
-
- def close1b():
-     global ps1b
-     if (ps1b != []):
-         for p1b in ps1b:
-             try:
-                 kill_process(p1b.pid)
-             except:
-                 traceback.print_exc()
-         ps1b=[]
-     return "已终止所有1b进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
-
- ps1c=[]
- def open1c(inp_text,exp_name,gpu_numbers,pretrained_s2G_path):
-     global ps1c
-     inp_text = my_utils.clean_path(inp_text)
-     if (ps1c == []):
-         opt_dir="%s/%s"%(exp_root,exp_name)
-         config={
-             "inp_text":inp_text,
-             "exp_name":exp_name,
-             "opt_dir":opt_dir,
-             "pretrained_s2G":pretrained_s2G_path,
-             "s2config_path":"GPT_SoVITS/configs/s2.json",
-             "is_half": str(is_half)
-         }
-         gpu_names=gpu_numbers.split("-")
-         all_parts=len(gpu_names)
-         for i_part in range(all_parts):
-             config.update(
-                 {
-                     "i_part": str(i_part),
-                     "all_parts": str(all_parts),
-                     "_CUDA_VISIBLE_DEVICES": gpu_names[i_part],
-                 }
-             )
-             os.environ.update(config)
-             cmd = '"%s" GPT_SoVITS/prepare_datasets/3-get-semantic.py'%python_exec
-             print(cmd)
-             p = Popen(cmd, shell=True)
-             ps1c.append(p)
-         yield "语义token提取进程执行中", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
-         for p in ps1c:
-             p.wait()
-         opt = ["item_name\tsemantic_audio"]
-         path_semantic = "%s/6-name2semantic.tsv" % opt_dir
-         for i_part in range(all_parts):
-             semantic_path = "%s/6-name2semantic-%s.tsv" % (opt_dir, i_part)
-             with open(semantic_path, "r", encoding="utf8") as f:
-                 opt += f.read().strip("\n").split("\n")
-             os.remove(semantic_path)
-         with open(path_semantic, "w", encoding="utf8") as f:
-             f.write("\n".join(opt) + "\n")
-         ps1c=[]
-         yield "语义token提取进程结束",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
-     else:
-         yield "已有正在进行的语义token提取任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
-
- def close1c():
-     global ps1c
-     if (ps1c != []):
-         for p1c in ps1c:
-             try:
-                 kill_process(p1c.pid)
-             except:
-                 traceback.print_exc()
-         ps1c=[]
-     return "已终止所有语义token进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
- #####inp_text,inp_wav_dir,exp_name,gpu_numbers1a,gpu_numbers1Ba,gpu_numbers1c,bert_pretrained_dir,cnhubert_base_dir,pretrained_s2G
- ps1abc=[]
- def open1abc(inp_text,inp_wav_dir,exp_name,gpu_numbers1a,gpu_numbers1Ba,gpu_numbers1c,bert_pretrained_dir,ssl_pretrained_dir,pretrained_s2G_path):
-     global ps1abc
-     inp_text = my_utils.clean_path(inp_text)
-     inp_wav_dir = my_utils.clean_path(inp_wav_dir)
-     if (ps1abc == []):
-         opt_dir="%s/%s"%(exp_root,exp_name)
-         try:
-             #############################1a
-             path_text="%s/2-name2text.txt" % opt_dir
-             if(os.path.exists(path_text)==False or (os.path.exists(path_text)==True and len(open(path_text,"r",encoding="utf8").read().strip("\n").split("\n"))<2)):
-                 config={
-                     "inp_text":inp_text,
-                     "inp_wav_dir":inp_wav_dir,
-                     "exp_name":exp_name,
-                     "opt_dir":opt_dir,
-                     "bert_pretrained_dir":bert_pretrained_dir,
-                     "is_half": str(is_half)
-                 }
-                 gpu_names=gpu_numbers1a.split("-")
-                 all_parts=len(gpu_names)
-                 for i_part in range(all_parts):
-                     config.update(
-                         {
-                             "i_part": str(i_part),
-                             "all_parts": str(all_parts),
-                             "_CUDA_VISIBLE_DEVICES": gpu_names[i_part],
-                         }
-                     )
-                     os.environ.update(config)
-                     cmd = '"%s" GPT_SoVITS/prepare_datasets/1-get-text.py'%python_exec
-                     print(cmd)
-                     p = Popen(cmd, shell=True)
-                     ps1abc.append(p)
-                 yield "进度:1a-ing", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
-                 for p in ps1abc:p.wait()
-
-                 opt = []
-                 for i_part in range(all_parts):#txt_path="%s/2-name2text-%s.txt"%(opt_dir,i_part)
-                     txt_path = "%s/2-name2text-%s.txt" % (opt_dir, i_part)
-                     with open(txt_path, "r",encoding="utf8") as f:
-                         opt += f.read().strip("\n").split("\n")
-                     os.remove(txt_path)
-                 with open(path_text, "w",encoding="utf8") as f:
-                     f.write("\n".join(opt) + "\n")
-
-             yield "进度:1a-done", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
-             ps1abc=[]
-             #############################1b
-             config={
-                 "inp_text":inp_text,
-                 "inp_wav_dir":inp_wav_dir,
-                 "exp_name":exp_name,
-                 "opt_dir":opt_dir,
-                 "cnhubert_base_dir":ssl_pretrained_dir,
-             }
-             gpu_names=gpu_numbers1Ba.split("-")
-             all_parts=len(gpu_names)
-             for i_part in range(all_parts):
-                 config.update(
-                     {
-                         "i_part": str(i_part),
-                         "all_parts": str(all_parts),
-                         "_CUDA_VISIBLE_DEVICES": gpu_names[i_part],
-                     }
-                 )
-                 os.environ.update(config)
-                 cmd = '"%s" GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py'%python_exec
-                 print(cmd)
-                 p = Popen(cmd, shell=True)
-                 ps1abc.append(p)
-             yield "进度:1a-done, 1b-ing", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
-             for p in ps1abc:p.wait()
-             yield "进度:1a1b-done", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
-             ps1abc=[]
-             #############################1c
-             path_semantic = "%s/6-name2semantic.tsv" % opt_dir
-             if(os.path.exists(path_semantic)==False or (os.path.exists(path_semantic)==True and os.path.getsize(path_semantic)<31)):
-                 config={
-                     "inp_text":inp_text,
-                     "exp_name":exp_name,
-                     "opt_dir":opt_dir,
-                     "pretrained_s2G":pretrained_s2G_path,
-                     "s2config_path":"GPT_SoVITS/configs/s2.json",
-                 }
-                 gpu_names=gpu_numbers1c.split("-")
-                 all_parts=len(gpu_names)
-                 for i_part in range(all_parts):
-                     config.update(
-                         {
-                             "i_part": str(i_part),
-                             "all_parts": str(all_parts),
-                             "_CUDA_VISIBLE_DEVICES": gpu_names[i_part],
-                         }
-                     )
-                     os.environ.update(config)
-                     cmd = '"%s" GPT_SoVITS/prepare_datasets/3-get-semantic.py'%python_exec
-                     print(cmd)
-                     p = Popen(cmd, shell=True)
-                     ps1abc.append(p)
-                 yield "进度:1a1b-done, 1cing", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
-                 for p in ps1abc:p.wait()
-
-                 opt = ["item_name\tsemantic_audio"]
-                 for i_part in range(all_parts):
-                     semantic_path = "%s/6-name2semantic-%s.tsv" % (opt_dir, i_part)
-                     with open(semantic_path, "r",encoding="utf8") as f:
-                         opt += f.read().strip("\n").split("\n")
-                     os.remove(semantic_path)
-                 with open(path_semantic, "w",encoding="utf8") as f:
-                     f.write("\n".join(opt) + "\n")
-             yield "进度:all-done", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
-             ps1abc = []
-             yield "一键三连进程结束", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
-         except:
-             traceback.print_exc()
-             close1abc()
-             yield "一键三连中途报错", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
-     else:
-         yield "已有正在进行的一键三连任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
-
- def close1abc():
-     global ps1abc
-     if (ps1abc != []):
-         for p1abc in ps1abc:
-             try:
-                 kill_process(p1abc.pid)
-             except:
-                 traceback.print_exc()
-         ps1abc=[]
-     return "已终止所有一键三连进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
-
- with gr.Blocks(title="GPT-SoVITS WebUI") as app:
-     gr.Markdown(
-         value=
-             i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. <br>如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录<b>LICENSE</b>.")
-     )
-     gr.Markdown(
-         value=
-             i18n("中文教程文档:https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e")
-     )
-
-     with gr.Tabs():
-         with gr.TabItem(i18n("0-前置数据集获取工具")):#提前随机切片防止uvr5爆内存->uvr5->slicer->asr->打标
-             gr.Markdown(value=i18n("0a-UVR5人声伴奏分离&去混响去延迟工具"))
-             with gr.Row():
-                 if_uvr5 = gr.Checkbox(label=i18n("是否开启UVR5-WebUI"),show_label=True)
-                 uvr5_info = gr.Textbox(label=i18n("UVR5进程输出信息"))
-             gr.Markdown(value=i18n("0b-语音切分工具"))
-             with gr.Row():
-                 with gr.Row():
-                     slice_inp_path=gr.Textbox(label=i18n("音频自动切分输入路径,可文件可文件夹"),value="")
-                     slice_opt_root=gr.Textbox(label=i18n("切分后的子音频的输出根目录"),value="output/slicer_opt")
-                     threshold=gr.Textbox(label=i18n("threshold:音量小于这个值视作静音的备选切割点"),value="-34")
-                     min_length=gr.Textbox(label=i18n("min_length:每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值"),value="4000")
-                     min_interval=gr.Textbox(label=i18n("min_interval:最短切割间隔"),value="300")
-                     hop_size=gr.Textbox(label=i18n("hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)"),value="10")
-                     max_sil_kept=gr.Textbox(label=i18n("max_sil_kept:切完后静音最多留多长"),value="500")
-                 with gr.Row():
-                     open_slicer_button=gr.Button(i18n("开启语音切割"), variant="primary",visible=True)
-                     close_slicer_button=gr.Button(i18n("终止语音切割"), variant="primary",visible=False)
-                     _max=gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("max:归一化后最大值多少"),value=0.9,interactive=True)
-                     alpha=gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("alpha_mix:混多少比例归一化后音频进来"),value=0.25,interactive=True)
-                     n_process=gr.Slider(minimum=1,maximum=n_cpu,step=1,label=i18n("切割使用的进程数"),value=4,interactive=True)
-                     slicer_info = gr.Textbox(label=i18n("语音切割进程输出信息"))
-             gr.Markdown(value=i18n("0bb-语音降噪工具"))
-             with gr.Row():
-                 open_denoise_button = gr.Button(i18n("开启语音降噪"), visible=True)
-                 close_denoise_button = gr.Button(i18n("终止语音降噪进程"), variant="primary",visible=False)
-                 denoise_input_dir=gr.Textbox(label=i18n("降噪音频文件输入文件夹"),value="")
-                 denoise_output_dir=gr.Textbox(label=i18n("降噪结果输出文件夹"),value="output/denoise_opt")
-                 denoise_info = gr.Textbox(label=i18n("语音降噪进程输出信息"))
-             gr.Markdown(value=i18n("0c-中文批量离线ASR工具"))
-             with gr.Row():
-                 open_asr_button = gr.Button(i18n("开启离线批量ASR"), variant="primary",visible=True)
-                 close_asr_button = gr.Button(i18n("终止ASR进程"), variant="primary",visible=False)
-                 with gr.Column():
-                     with gr.Row():
-                         asr_inp_dir = gr.Textbox(
-                             label=i18n("输入文件夹路径"),
-                             value="output/slicer_opt",
-                             interactive=True,
-                         )
-                         asr_opt_dir = gr.Textbox(
-                             label = i18n("输出文件夹路径"),
-                             value = "output/asr_opt",
-                             interactive = True,
-                         )
-                     with gr.Row():
-                         asr_model = gr.Dropdown(
-                             label = i18n("ASR 模型"),
-                             choices = list(asr_dict.keys()),
-                             interactive = True,
-                             value="达摩 ASR (中文)"
-                         )
-                         asr_size = gr.Dropdown(
-                             label = i18n("ASR 模型尺寸"),
-                             choices = ["large"],
-                             interactive = True,
-                             value="large"
-                         )
-                         asr_lang = gr.Dropdown(
-                             label = i18n("ASR 语言设置"),
-                             choices = ["zh"],
-                             interactive = True,
-                             value="zh"
-                         )
-                     with gr.Row():
-                         asr_info = gr.Textbox(label=i18n("ASR进程输出信息"))
-
-             def change_lang_choices(key): #根据选择的模型修改可选的语言
-                 # return gr.Dropdown(choices=asr_dict[key]['lang'])
-                 return {"__type__": "update", "choices": asr_dict[key]['lang'],"value":asr_dict[key]['lang'][0]}
-             def change_size_choices(key): # 根据选择的模型修改可选的模型尺寸
-                 # return gr.Dropdown(choices=asr_dict[key]['size'])
-                 return {"__type__": "update", "choices": asr_dict[key]['size']}
-             asr_model.change(change_lang_choices, [asr_model], [asr_lang])
-             asr_model.change(change_size_choices, [asr_model], [asr_size])
-
-             gr.Markdown(value=i18n("0d-语音文本校对标注工具"))
-             with gr.Row():
-                 if_label = gr.Checkbox(label=i18n("是否开启打标WebUI"),show_label=True)
-                 path_list = gr.Textbox(
-                     label=i18n(".list标注文件的路径"),
-                     value="output/asr_opt/slicer_opt.list",
-                     interactive=True,
-                 )
-                 label_info = gr.Textbox(label=i18n("打标工具进程输出信息"))
-             if_label.change(change_label, [if_label,path_list], [label_info])
-             if_uvr5.change(change_uvr5, [if_uvr5], [uvr5_info])
-             open_asr_button.click(open_asr, [asr_inp_dir, asr_opt_dir, asr_model, asr_size, asr_lang], [asr_info,open_asr_button,close_asr_button])
-             close_asr_button.click(close_asr, [], [asr_info,open_asr_button,close_asr_button])
-             open_slicer_button.click(open_slice, [slice_inp_path,slice_opt_root,threshold,min_length,min_interval,hop_size,max_sil_kept,_max,alpha,n_process], [slicer_info,open_slicer_button,close_slicer_button])
-             close_slicer_button.click(close_slice, [], [slicer_info,open_slicer_button,close_slicer_button])
-             open_denoise_button.click(open_denoise, [denoise_input_dir,denoise_output_dir], [denoise_info,open_denoise_button,close_denoise_button])
-             close_denoise_button.click(close_denoise, [], [denoise_info,open_denoise_button,close_denoise_button])
-
-         with gr.TabItem(i18n("1-GPT-SoVITS-TTS")):
-             with gr.Row():
-                 exp_name = gr.Textbox(label=i18n("*实验/模型名"), value="xxx", interactive=True)
-                 gpu_info = gr.Textbox(label=i18n("显卡信息"), value=gpu_info, visible=True, interactive=False)
-                 pretrained_s2G = gr.Textbox(label=i18n("预训练的SoVITS-G模型路径"), value="GPT_SoVITS/pretrained_models/s2G488k.pth", interactive=True)
-                 pretrained_s2D = gr.Textbox(label=i18n("预训练的SoVITS-D模型路径"), value="GPT_SoVITS/pretrained_models/s2D488k.pth", interactive=True)
-                 pretrained_s1 = gr.Textbox(label=i18n("预训练的GPT模型路径"), value="GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt", interactive=True)
-             with gr.TabItem(i18n("1A-训练集格式化工具")):
-                 gr.Markdown(value=i18n("输出logs/实验名目录下应有23456开头的文件和文件夹"))
-                 with gr.Row():
-                     inp_text = gr.Textbox(label=i18n("*文本标注文件"),value="output/asr_opt/slicer_opt.list",interactive=True)
-                     inp_wav_dir = gr.Textbox(
-                         label=i18n("*训练集音频文件目录"),
-                         value="output/slicer_opt",
-                         interactive=True,
-                         placeholder=i18n("填切割后音频所在目录!读取的音频文件完整路径=该目录-拼接-list文件里波形对应的文件名(不是全路径)。如果留空则使用.list文件里的绝对全路径。")
-                     )
-                 gr.Markdown(value=i18n("1Aa-文本内容"))
-                 with gr.Row():
-                     gpu_numbers1a = gr.Textbox(label=i18n("GPU卡号以-分割,每个卡号一个进程"),value="%s-%s"%(gpus,gpus),interactive=True)
-                     bert_pretrained_dir = gr.Textbox(label=i18n("预训练的中文BERT模型路径"),value="GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large",interactive=False)
-                     button1a_open = gr.Button(i18n("开启文本获取"), variant="primary",visible=True)
-                     button1a_close = gr.Button(i18n("终止文本获取进程"), variant="primary",visible=False)
-                     info1a=gr.Textbox(label=i18n("文本进程输出信息"))
-                 gr.Markdown(value=i18n("1Ab-SSL自监督特征提取"))
-                 with gr.Row():
-                     gpu_numbers1Ba = gr.Textbox(label=i18n("GPU卡号以-分割,每个卡号一个进程"),value="%s-%s"%(gpus,gpus),interactive=True)
-                     cnhubert_base_dir = gr.Textbox(label=i18n("预训练的SSL模型路径"),value="GPT_SoVITS/pretrained_models/chinese-hubert-base",interactive=False)
-                     button1b_open = gr.Button(i18n("开启SSL提取"), variant="primary",visible=True)
-                     button1b_close = gr.Button(i18n("终止SSL提取进程"), variant="primary",visible=False)
-                     info1b=gr.Textbox(label=i18n("SSL进程输出信息"))
-                 gr.Markdown(value=i18n("1Ac-语义token提取"))
-                 with gr.Row():
-                     gpu_numbers1c = gr.Textbox(label=i18n("GPU卡号以-分割,每个卡号一个进程"),value="%s-%s"%(gpus,gpus),interactive=True)
-                     button1c_open = gr.Button(i18n("开启语义token提取"), variant="primary",visible=True)
-                     button1c_close = gr.Button(i18n("终止语义token提取进程"), variant="primary",visible=False)
-                     info1c=gr.Textbox(label=i18n("语义token提取进程输出信息"))
-                 gr.Markdown(value=i18n("1Aabc-训练集格式化一键三连"))
-                 with gr.Row():
-                     button1abc_open = gr.Button(i18n("开启一键三连"), variant="primary",visible=True)
-                     button1abc_close = gr.Button(i18n("终止一键三连"), variant="primary",visible=False)
-                     info1abc=gr.Textbox(label=i18n("一键三连进程输出信息"))
-                 button1a_open.click(open1a, [inp_text,inp_wav_dir,exp_name,gpu_numbers1a,bert_pretrained_dir], [info1a,button1a_open,button1a_close])
-                 button1a_close.click(close1a, [], [info1a,button1a_open,button1a_close])
-                 button1b_open.click(open1b, [inp_text,inp_wav_dir,exp_name,gpu_numbers1Ba,cnhubert_base_dir], [info1b,button1b_open,button1b_close])
-                 button1b_close.click(close1b, [], [info1b,button1b_open,button1b_close])
-                 button1c_open.click(open1c, [inp_text,exp_name,gpu_numbers1c,pretrained_s2G], [info1c,button1c_open,button1c_close])
-                 button1c_close.click(close1c, [], [info1c,button1c_open,button1c_close])
-                 button1abc_open.click(open1abc, [inp_text,inp_wav_dir,exp_name,gpu_numbers1a,gpu_numbers1Ba,gpu_numbers1c,bert_pretrained_dir,cnhubert_base_dir,pretrained_s2G], [info1abc,button1abc_open,button1abc_close])
-                 button1abc_close.click(close1abc, [], [info1abc,button1abc_open,button1abc_close])
-             with gr.TabItem(i18n("1B-微调训练")):
-                 gr.Markdown(value=i18n("1Ba-SoVITS训练。用于分享的模型文件输出在SoVITS_weights下。"))
-                 with gr.Row():
-                     batch_size = gr.Slider(minimum=1,maximum=40,step=1,label=i18n("每张显卡的batch_size"),value=default_batch_size,interactive=True)
-                     total_epoch = gr.Slider(minimum=1,maximum=25,step=1,label=i18n("总训练轮数total_epoch,不建议太高"),value=8,interactive=True)
-                     text_low_lr_rate = gr.Slider(minimum=0.2,maximum=0.6,step=0.05,label=i18n("文本模块学习率权重"),value=0.4,interactive=True)
-                     save_every_epoch = gr.Slider(minimum=1,maximum=25,step=1,label=i18n("保存频率save_every_epoch"),value=4,interactive=True)
-                     if_save_latest = gr.Checkbox(label=i18n("是否仅保存最新的ckpt文件以节省硬盘空间"), value=True, interactive=True, show_label=True)
-                     if_save_every_weights = gr.Checkbox(label=i18n("是否在每次保存时间点将最终小模型保存至weights文件夹"), value=True, interactive=True, show_label=True)
-                     gpu_numbers1Ba = gr.Textbox(label=i18n("GPU卡号以-分割,每个卡号一个进程"), value="%s" % (gpus), interactive=True)
-                 with gr.Row():
-                     button1Ba_open = gr.Button(i18n("开启SoVITS训练"), variant="primary",visible=True)
-                     button1Ba_close = gr.Button(i18n("终止SoVITS训练"), variant="primary",visible=False)
-                     info1Ba=gr.Textbox(label=i18n("SoVITS训练进程输出信息"))
-                 gr.Markdown(value=i18n("1Bb-GPT训练。用于分享的模型文件输出在GPT_weights下。"))
-                 with gr.Row():
-                     batch_size1Bb = gr.Slider(minimum=1,maximum=40,step=1,label=i18n("每张显卡的batch_size"),value=default_batch_size,interactive=True)
-                     total_epoch1Bb = gr.Slider(minimum=2,maximum=50,step=1,label=i18n("总训练轮数total_epoch"),value=15,interactive=True)
-                     if_dpo = gr.Checkbox(label=i18n("是否开启dpo训练选项(实验性)"), value=False, interactive=True, show_label=True)
-                     if_save_latest1Bb = gr.Checkbox(label=i18n("是否仅保存最新的ckpt文件以节省硬盘空间"), value=True, interactive=True, show_label=True)
-                     if_save_every_weights1Bb = gr.Checkbox(label=i18n("是否在每次保存时间点将最终小模型保存至weights文件夹"), value=True, interactive=True, show_label=True)
-                     save_every_epoch1Bb = gr.Slider(minimum=1,maximum=50,step=1,label=i18n("保存频率save_every_epoch"),value=5,interactive=True)
-                     gpu_numbers1Bb = gr.Textbox(label=i18n("GPU卡号以-分割,每个卡号一个进程"), value="%s" % (gpus), interactive=True)
-                 with gr.Row():
-                     button1Bb_open = gr.Button(i18n("开启GPT训练"), variant="primary",visible=True)
-                     button1Bb_close = gr.Button(i18n("终止GPT训练"), variant="primary",visible=False)
-                     info1Bb=gr.Textbox(label=i18n("GPT训练进程输出信息"))
-                 button1Ba_open.click(open1Ba, [batch_size,total_epoch,exp_name,text_low_lr_rate,if_save_latest,if_save_every_weights,save_every_epoch,gpu_numbers1Ba,pretrained_s2G,pretrained_s2D], [info1Ba,button1Ba_open,button1Ba_close])
-                 button1Ba_close.click(close1Ba, [], [info1Ba,button1Ba_open,button1Ba_close])
-                 button1Bb_open.click(open1Bb, [batch_size1Bb,total_epoch1Bb,exp_name,if_dpo,if_save_latest1Bb,if_save_every_weights1Bb,save_every_epoch1Bb,gpu_numbers1Bb,pretrained_s1], [info1Bb,button1Bb_open,button1Bb_close])
-                 button1Bb_close.click(close1Bb, [], [info1Bb,button1Bb_open,button1Bb_close])
-             with gr.TabItem(i18n("1C-推理")):
-                 gr.Markdown(value=i18n("选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模,体验5秒Zero Shot TTS用。"))
-                 with gr.Row():
-                     GPT_dropdown = gr.Dropdown(label=i18n("*GPT模型列表"), choices=sorted(GPT_names,key=custom_sort_key),value=pretrained_gpt_name,interactive=True)
-                     SoVITS_dropdown = gr.Dropdown(label=i18n("*SoVITS模型列表"), choices=sorted(SoVITS_names,key=custom_sort_key),value=pretrained_sovits_name,interactive=True)
-                     gpu_number_1C=gr.Textbox(label=i18n("GPU卡号,只能填1个整数"), value=gpus, interactive=True)
-                     refresh_button = gr.Button(i18n("刷新模型路径"), variant="primary")
-                     refresh_button.click(fn=change_choices,inputs=[],outputs=[SoVITS_dropdown,GPT_dropdown])
-                 with gr.Row():
-                     if_tts = gr.Checkbox(label=i18n("是否开启TTS推理WebUI"), show_label=True)
-                     tts_info = gr.Textbox(label=i18n("TTS推理WebUI进程输出信息"))
-                 if_tts.change(change_tts_inference, [if_tts,bert_pretrained_dir,cnhubert_base_dir,gpu_number_1C,GPT_dropdown,SoVITS_dropdown], [tts_info])
-         with gr.TabItem(i18n("2-GPT-SoVITS-变声")):gr.Markdown(value=i18n("施工中,请静候佳音"))
-     app.queue(concurrency_count=511, max_size=1022).launch(
-         server_name="0.0.0.0",
-         inbrowser=True,
-         share=True,
-         server_port=webui_port_main,
-         quiet=True,
-     )

+ import os,shutil,sys,pdb,re
+ now_dir = os.getcwd()
+ sys.path.insert(0, now_dir)
+ import json,yaml,warnings,torch
+ import platform
+ import psutil
+ import signal
+
+ warnings.filterwarnings("ignore")
+ torch.manual_seed(233333)
+ tmp = os.path.join(now_dir, "TEMP")
+ os.makedirs(tmp, exist_ok=True)
+ os.environ["TEMP"] = tmp
+ if(os.path.exists(tmp)):
+     for name in os.listdir(tmp):
+         if(name=="jieba.cache"):continue
+         path="%s/%s"%(tmp,name)
+         delete=os.remove if os.path.isfile(path) else shutil.rmtree
+         try:
+             delete(path)
+         except Exception as e:
+             print(str(e))
+             pass
+ import site
+ site_packages_roots = []
+ for path in site.getsitepackages():
+     if "packages" in path:
+         site_packages_roots.append(path)
+ if(site_packages_roots==[]):site_packages_roots=["%s/runtime/Lib/site-packages" % now_dir]
+ #os.environ["OPENBLAS_NUM_THREADS"] = "4"
+ os.environ["no_proxy"] = "localhost, 127.0.0.1, ::1"
+ os.environ["all_proxy"] = ""
+ for site_packages_root in site_packages_roots:
+     if os.path.exists(site_packages_root):
+         try:
+             with open("%s/users.pth" % (site_packages_root), "w") as f:
+                 f.write(
+                     "%s\n%s/tools\n%s/tools/damo_asr\n%s/GPT_SoVITS\n%s/tools/uvr5"
+                     % (now_dir, now_dir, now_dir, now_dir, now_dir)
+                 )
+             break
+         except PermissionError:
+             pass
+ from tools import my_utils
+ import traceback
+ import shutil
+ import pdb
+ import gradio as gr
+ from subprocess import Popen
+ import signal
+ from config import python_exec,infer_device,is_half,exp_root,webui_port_main,webui_port_infer_tts,webui_port_uvr5,webui_port_subfix,is_share
+ from tools.i18n.i18n import I18nAuto
+ i18n = I18nAuto()
+ from scipy.io import wavfile
+ from tools.my_utils import load_audio
+ from multiprocessing import cpu_count
+
+ # os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # 当遇到mps不支持的步骤时使用cpu
+
+ n_cpu=cpu_count()
+
+ ngpu = torch.cuda.device_count()
+ gpu_infos = []
+ mem = []
+ if_gpu_ok = False
+
+ # 判断是否有能用来训练和加速推理的N卡
+ if torch.cuda.is_available() or ngpu != 0:
+     for i in range(ngpu):
+         gpu_name = torch.cuda.get_device_name(i)
+         if any(value in gpu_name.upper()for value in ["10","16","20","30","40","A2","A3","A4","P4","A50","500","A60","70","80","90","M4","T4","TITAN","L4","4060"]):
+             # A10#A100#V100#A40#P40#M40#K80#A4500
+             if_gpu_ok = True  # 至少有一张能用的N卡
+             gpu_infos.append("%s\t%s" % (i, gpu_name))
+             mem.append(int(torch.cuda.get_device_properties(i).total_memory/ 1024/ 1024/ 1024+ 0.4))
+ # # 判断是否支持mps加速
+ # if torch.backends.mps.is_available():
+ #     if_gpu_ok = True
+ #     gpu_infos.append("%s\t%s" % ("0", "Apple GPU"))
+ #     mem.append(psutil.virtual_memory().total/ 1024 / 1024 / 1024) # 实测使用系统内存作为显存不会爆显存
+
+ if if_gpu_ok and len(gpu_infos) > 0:
+     gpu_info = "\n".join(gpu_infos)
+     default_batch_size = min(mem) // 2
+ else:
+     gpu_info = ("%s\t%s" % ("0", "CPU"))
+     gpu_infos.append("%s\t%s" % ("0", "CPU"))
+     default_batch_size = psutil.virtual_memory().total/ 1024 / 1024 / 1024 / 2
+ gpus = "-".join([i[0] for i in gpu_infos])
+
+ pretrained_sovits_name="GPT_SoVITS/pretrained_models/s2G488k.pth"
+ pretrained_gpt_name="GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
+ def get_weights_names():
+     SoVITS_names = [pretrained_sovits_name]
+     for name in os.listdir(SoVITS_weight_root):
+         if name.endswith(".pth"):SoVITS_names.append(name)
+     GPT_names = [pretrained_gpt_name]
+     for name in os.listdir(GPT_weight_root):
+         if name.endswith(".ckpt"): GPT_names.append(name)
+     return SoVITS_names,GPT_names
+ SoVITS_weight_root="SoVITS_weights"
+ GPT_weight_root="GPT_weights"
+ os.makedirs(SoVITS_weight_root,exist_ok=True)
+ os.makedirs(GPT_weight_root,exist_ok=True)
+ SoVITS_names,GPT_names = get_weights_names()
+
+ def custom_sort_key(s):
+     # 使用正则表达式提取字符串中的数字部分和非数字部分
+     parts = re.split('(\d+)', s)
+     # 将数字部分转换为整数,非数字部分保持不变
+     parts = [int(part) if part.isdigit() else part for part in parts]
+     return parts
+
+ def change_choices():
+     SoVITS_names, GPT_names = get_weights_names()
+     return {"choices": sorted(SoVITS_names,key=custom_sort_key), "__type__": "update"}, {"choices": sorted(GPT_names,key=custom_sort_key), "__type__": "update"}
+
+ p_label=None
+ p_uvr5=None
+ p_asr=None
+ p_denoise=None
+ p_tts_inference=None
+
+ def kill_proc_tree(pid, including_parent=True):
+     try:
+         parent = psutil.Process(pid)
+     except psutil.NoSuchProcess:
+         # Process already terminated
+         return
+
+     children = parent.children(recursive=True)
+     for child in children:
+         try:
+             os.kill(child.pid, signal.SIGTERM)  # or signal.SIGKILL
+         except OSError:
+             pass
+     if including_parent:
+         try:
+             os.kill(parent.pid, signal.SIGTERM)  # or signal.SIGKILL
+         except OSError:
+             pass
+
+ system=platform.system()
+ def kill_process(pid):
+     if(system=="Windows"):
+         cmd = "taskkill /t /f /pid %s" % pid
+         os.system(cmd)
+     else:
+         kill_proc_tree(pid)
+
+
+ def change_label(if_label,path_list):
+     global p_label
+     if(if_label==True and p_label==None):
+         path_list=my_utils.clean_path(path_list)
+         cmd = '"%s" tools/subfix_webui.py --load_list "%s" --webui_port %s --is_share %s'%(python_exec,path_list,webui_port_subfix,is_share)
+         yield i18n("打标工具WebUI已开启")
+         print(cmd)
+         p_label = Popen(cmd, shell=True)
+     elif(if_label==False and p_label!=None):
+         kill_process(p_label.pid)
+         p_label=None
+         yield i18n("打标工具WebUI已关闭")
+
+ def change_uvr5(if_uvr5):
+     global p_uvr5
+     if(if_uvr5==True and p_uvr5==None):
+         cmd = '"%s" tools/uvr5/webui.py "%s" %s %s %s'%(python_exec,infer_device,is_half,webui_port_uvr5,is_share)
+         yield i18n("UVR5已开启")
+         print(cmd)
+         p_uvr5 = Popen(cmd, shell=True)
+     elif(if_uvr5==False and p_uvr5!=None):
+         kill_process(p_uvr5.pid)
+         p_uvr5=None
+         yield i18n("UVR5已关闭")
+
+ def change_tts_inference(if_tts,bert_path,cnhubert_base_path,gpu_number,gpt_path,sovits_path):
+     global p_tts_inference
+     if(if_tts==True and p_tts_inference==None):
+         os.environ["gpt_path"]=gpt_path if "/" in gpt_path else "%s/%s"%(GPT_weight_root,gpt_path)
+         os.environ["sovits_path"]=sovits_path if "/"in sovits_path else "%s/%s"%(SoVITS_weight_root,sovits_path)
+         os.environ["cnhubert_base_path"]=cnhubert_base_path
+         os.environ["bert_path"]=bert_path
+         os.environ["_CUDA_VISIBLE_DEVICES"]=gpu_number
+         os.environ["is_half"]=str(is_half)
+         os.environ["infer_ttswebui"]=str(webui_port_infer_tts)
+         os.environ["is_share"]=str(is_share)
+         cmd = '"%s" GPT_SoVITS/inference_webui.py'%(python_exec)
+         yield i18n("TTS推理进程已开启")
+         print(cmd)
+         p_tts_inference = Popen(cmd, shell=True)
+     elif(if_tts==False and p_tts_inference!=None):
+         kill_process(p_tts_inference.pid)
+         p_tts_inference=None
+         yield i18n("TTS推理进程已关闭")
+
+ from tools.asr.config import asr_dict
+ def open_asr(asr_inp_dir, asr_opt_dir, asr_model, asr_model_size, asr_lang):
+     global p_asr
+     if(p_asr==None):
+         asr_inp_dir=my_utils.clean_path(asr_inp_dir)
+         cmd = f'"{python_exec}" tools/asr/{asr_dict[asr_model]["path"]}'
+         cmd += f' -i "{asr_inp_dir}"'
+         cmd += f' -o "{asr_opt_dir}"'
+         cmd += f' -s {asr_model_size}'
+         cmd += f' -l {asr_lang}'
+         cmd += " -p %s"%("float16"if is_half==True else "float32")
+
+         yield "ASR任务开启:%s"%cmd,{"__type__":"update","visible":False},{"__type__":"update","visible":True}
+         print(cmd)
+         p_asr = Popen(cmd, shell=True)
+         p_asr.wait()
+         p_asr=None
+         yield f"ASR任务完成, 查看终端进行下一步",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
+     else:
+         yield "已有正在进行的ASR任务,需先终止才能开启下一次任务",{"__type__":"update","visible":False},{"__type__":"update","visible":True}
+         # return None
+
+ def close_asr():
+     global p_asr
+     if(p_asr!=None):
+         kill_process(p_asr.pid)
+         p_asr=None
+     return "已终止ASR进程",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
+ def open_denoise(denoise_inp_dir, denoise_opt_dir):
+     global p_denoise
+     if(p_denoise==None):
+         denoise_inp_dir=my_utils.clean_path(denoise_inp_dir)
+         denoise_opt_dir=my_utils.clean_path(denoise_opt_dir)
+         cmd = '"%s" tools/cmd-denoise.py -i "%s" -o "%s" -p %s'%(python_exec,denoise_inp_dir,denoise_opt_dir,"float16"if is_half==True else "float32")
+
+         yield "语音降噪任务开启:%s"%cmd,{"__type__":"update","visible":False},{"__type__":"update","visible":True}
+         print(cmd)
+         p_denoise = Popen(cmd, shell=True)
+         p_denoise.wait()
+         p_denoise=None
+         yield f"语音降噪任务完成, 查看终端进行下一步",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
+     else:
+         yield "已有正在进行的语音降噪任务,需先终止才能开启下一次任务",{"__type__":"update","visible":False},{"__type__":"update","visible":True}
+         # return None
+
+ def close_denoise():
+     global p_denoise
+     if(p_denoise!=None):
+         kill_process(p_denoise.pid)
+         p_denoise=None
+     return "已终止语音降噪进程",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
+
+ p_train_SoVITS=None
+ def open1Ba(batch_size,total_epoch,exp_name,text_low_lr_rate,if_save_latest,if_save_every_weights,save_every_epoch,gpu_numbers1Ba,pretrained_s2G,pretrained_s2D):
+     global p_train_SoVITS
+     if(p_train_SoVITS==None):
+         with open("GPT_SoVITS/configs/s2.json")as f:
+             data=f.read()
+             data=json.loads(data)
+         s2_dir="%s/%s"%(exp_root,exp_name)
+         os.makedirs("%s/logs_s2"%(s2_dir),exist_ok=True)
+         if(is_half==False):
+             data["train"]["fp16_run"]=False
+             batch_size=max(1,batch_size//2)
+         data["train"]["batch_size"]=batch_size
+         data["train"]["epochs"]=total_epoch
+         data["train"]["text_low_lr_rate"]=text_low_lr_rate
+         data["train"]["pretrained_s2G"]=pretrained_s2G
+         data["train"]["pretrained_s2D"]=pretrained_s2D
+         data["train"]["if_save_latest"]=if_save_latest
+         data["train"]["if_save_every_weights"]=if_save_every_weights
+         data["train"]["save_every_epoch"]=save_every_epoch
+         data["train"]["gpu_numbers"]=gpu_numbers1Ba
+         data["data"]["exp_dir"]=data["s2_ckpt_dir"]=s2_dir
+         data["save_weight_dir"]=SoVITS_weight_root
+         data["name"]=exp_name
+         tmp_config_path="%s/tmp_s2.json"%tmp
+         with open(tmp_config_path,"w")as f:f.write(json.dumps(data))
+
+         cmd = '"%s" GPT_SoVITS/s2_train.py --config "%s"'%(python_exec,tmp_config_path)
+         yield "SoVITS训练开始:%s"%cmd,{"__type__":"update","visible":False},{"__type__":"update","visible":True}
+         print(cmd)
+         p_train_SoVITS = Popen(cmd, shell=True)
+         p_train_SoVITS.wait()
+         p_train_SoVITS=None
+         yield "SoVITS训练完成",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
+     else:
+         yield "已有正在进行的SoVITS训练任务,需先终止才能开启下一次任务",{"__type__":"update","visible":False},{"__type__":"update","visible":True}
+
+ def close1Ba():
+     global p_train_SoVITS
+     if(p_train_SoVITS!=None):
+         kill_process(p_train_SoVITS.pid)
+         p_train_SoVITS=None
+     return "已终止SoVITS训练",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
+
+ p_train_GPT=None
+ def open1Bb(batch_size,total_epoch,exp_name,if_dpo,if_save_latest,if_save_every_weights,save_every_epoch,gpu_numbers,pretrained_s1):
+     global p_train_GPT
+     if(p_train_GPT==None):
+         with open("GPT_SoVITS/configs/s1longer.yaml")as f:
+             data=f.read()
+             data=yaml.load(data, Loader=yaml.FullLoader)
+         s1_dir="%s/%s"%(exp_root,exp_name)
+         os.makedirs("%s/logs_s1"%(s1_dir),exist_ok=True)
+         if(is_half==False):
+             data["train"]["precision"]="32"
+             batch_size = max(1, batch_size // 2)
+         data["train"]["batch_size"]=batch_size
+         data["train"]["epochs"]=total_epoch
+         data["pretrained_s1"]=pretrained_s1
+         data["train"]["save_every_n_epoch"]=save_every_epoch
+         data["train"]["if_save_every_weights"]=if_save_every_weights
+         data["train"]["if_save_latest"]=if_save_latest
+         data["train"]["if_dpo"]=if_dpo
+         data["train"]["half_weights_save_dir"]=GPT_weight_root
+         data["train"]["exp_name"]=exp_name
+         data["train_semantic_path"]="%s/6-name2semantic.tsv"%s1_dir
+         data["train_phoneme_path"]="%s/2-name2text.txt"%s1_dir
+         data["output_dir"]="%s/logs_s1"%s1_dir
+
+         os.environ["_CUDA_VISIBLE_DEVICES"]=gpu_numbers.replace("-",",")
+         os.environ["hz"]="25hz"
+         tmp_config_path="%s/tmp_s1.yaml"%tmp
+         with open(tmp_config_path, "w") as f:f.write(yaml.dump(data, default_flow_style=False))
+         # cmd = '"%s" GPT_SoVITS/s1_train.py --config_file "%s" --train_semantic_path "%s/6-name2semantic.tsv" --train_phoneme_path "%s/2-name2text.txt" --output_dir "%s/logs_s1"'%(python_exec,tmp_config_path,s1_dir,s1_dir,s1_dir)
+         cmd = '"%s" GPT_SoVITS/s1_train.py --config_file "%s" '%(python_exec,tmp_config_path)
+         yield "GPT训练开始:%s"%cmd,{"__type__":"update","visible":False},{"__type__":"update","visible":True}
+         print(cmd)
+         p_train_GPT = Popen(cmd, shell=True)
+         p_train_GPT.wait()
+         p_train_GPT=None
+         yield "GPT训练完成",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
+     else:
+         yield "已有正在进行的GPT训练任务,需先终止才能开启下一次任务",{"__type__":"update","visible":False},{"__type__":"update","visible":True}
+
+ def close1Bb():
+     global p_train_GPT
+     if(p_train_GPT!=None):
+         kill_process(p_train_GPT.pid)
+         p_train_GPT=None
+     return "已终止GPT训练",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
+
+ ps_slice=[]
+ def open_slice(inp,opt_root,threshold,min_length,min_interval,hop_size,max_sil_kept,_max,alpha,n_parts):
+     global ps_slice
+     inp = my_utils.clean_path(inp)
+     opt_root = my_utils.clean_path(opt_root)
+     if(os.path.exists(inp)==False):
+         yield "输入路径不存在",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
+         return
+     if os.path.isfile(inp):n_parts=1
+     elif os.path.isdir(inp):pass
+     else:
+         yield "输入路径存在但既不是文件也不是文件夹",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
+         return
+     if (ps_slice == []):
+         for i_part in range(n_parts):
+             cmd = '"%s" tools/slice_audio.py "%s" "%s" %s %s %s %s %s %s %s %s %s''' % (python_exec,inp, opt_root, threshold, min_length, min_interval, hop_size, max_sil_kept, _max, alpha, i_part, n_parts)
+             print(cmd)
+             p = Popen(cmd, shell=True)
+             ps_slice.append(p)
+         yield "切割执行中", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
+         for p in ps_slice:
+             p.wait()
+         ps_slice=[]
+         yield "切割结束",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
+     else:
+         yield "已有正在进行的切割任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
+
+ def close_slice():
+     global ps_slice
+     if (ps_slice != []):
+         for p_slice in ps_slice:
+             try:
+                 kill_process(p_slice.pid)
+             except:
+                 traceback.print_exc()
+         ps_slice=[]
+     return "已终止所有切割进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
+
+ ps1a=[]
+ def open1a(inp_text,inp_wav_dir,exp_name,gpu_numbers,bert_pretrained_dir):
+     global ps1a
+     inp_text = my_utils.clean_path(inp_text)
+     inp_wav_dir = my_utils.clean_path(inp_wav_dir)
+     if (ps1a == []):
+         opt_dir="%s/%s"%(exp_root,exp_name)
+         config={
+             "inp_text":inp_text,
+             "inp_wav_dir":inp_wav_dir,
+             "exp_name":exp_name,
+             "opt_dir":opt_dir,
+             "bert_pretrained_dir":bert_pretrained_dir,
+         }
+         gpu_names=gpu_numbers.split("-")
+         all_parts=len(gpu_names)
+         for i_part in range(all_parts):
+             config.update(
+                 {
+                     "i_part": str(i_part),
+                     "all_parts": str(all_parts),
+                     "_CUDA_VISIBLE_DEVICES": gpu_names[i_part],
+                     "is_half": str(is_half)
+                 }
+             )
+             os.environ.update(config)
+             cmd = '"%s" GPT_SoVITS/prepare_datasets/1-get-text.py'%python_exec
+             print(cmd)
+             p = Popen(cmd, shell=True)
+             ps1a.append(p)
+         yield "文本进程执行中", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
+         for p in ps1a:
+             p.wait()
+         opt = []
+         for i_part in range(all_parts):
+             txt_path = "%s/2-name2text-%s.txt" % (opt_dir, i_part)
+             with open(txt_path, "r", encoding="utf8") as f:
+                 opt += f.read().strip("\n").split("\n")
+             os.remove(txt_path)
+         path_text = "%s/2-name2text.txt" % opt_dir
+         with open(path_text, "w", encoding="utf8") as f:
+             f.write("\n".join(opt) + "\n")
+         ps1a=[]
+         yield "文本进程结束",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
+     else:
+         yield "已有正在进行的文本任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
+
+ def close1a():
+     global ps1a
+     if (ps1a != []):
+         for p1a in ps1a:
+             try:
+                 kill_process(p1a.pid)
+             except:
+                 traceback.print_exc()
+         ps1a=[]
+     return "已终止所有1a进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
+
+ ps1b=[]
+ def open1b(inp_text,inp_wav_dir,exp_name,gpu_numbers,ssl_pretrained_dir):
+     global ps1b
+     inp_text = my_utils.clean_path(inp_text)
+     inp_wav_dir = my_utils.clean_path(inp_wav_dir)
+     if (ps1b == []):
+         config={
+             "inp_text":inp_text,
+             "inp_wav_dir":inp_wav_dir,
+             "exp_name":exp_name,
+             "opt_dir":"%s/%s"%(exp_root,exp_name),
+             "cnhubert_base_dir":ssl_pretrained_dir,
+             "is_half": str(is_half)
+         }
+         gpu_names=gpu_numbers.split("-")
+         all_parts=len(gpu_names)
+         for i_part in range(all_parts):
+             config.update(
+                 {
+                     "i_part": str(i_part),
+                     "all_parts": str(all_parts),
+                     "_CUDA_VISIBLE_DEVICES": gpu_names[i_part],
+                 }
+             )
+             os.environ.update(config)
+             cmd = '"%s" GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py'%python_exec
+             print(cmd)
+             p = Popen(cmd, shell=True)
+             ps1b.append(p)
+         yield "SSL提取进程执行中", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
+         for p in ps1b:
+             p.wait()
+         ps1b=[]
+         yield "SSL提取进程结束",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
+     else:
+         yield "已有正在进行的SSL提取任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
+
+ def close1b():
+     global ps1b
+     if (ps1b != []):
+         for p1b in ps1b:
+             try:
+                 kill_process(p1b.pid)
+             except:
+                 traceback.print_exc()
+         ps1b=[]
+     return "已终止所有1b进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
+
+ ps1c=[]
+ def open1c(inp_text,exp_name,gpu_numbers,pretrained_s2G_path):
+     global ps1c
+     inp_text = my_utils.clean_path(inp_text)
+     if (ps1c == []):
+         opt_dir="%s/%s"%(exp_root,exp_name)
+         config={
+             "inp_text":inp_text,
+             "exp_name":exp_name,
+             "opt_dir":opt_dir,
+             "pretrained_s2G":pretrained_s2G_path,
+             "s2config_path":"GPT_SoVITS/configs/s2.json",
+             "is_half": str(is_half)
+         }
+         gpu_names=gpu_numbers.split("-")
+         all_parts=len(gpu_names)
+         for i_part in range(all_parts):
+             config.update(
+                 {
+                     "i_part": str(i_part),
+                     "all_parts": str(all_parts),
+                     "_CUDA_VISIBLE_DEVICES": gpu_names[i_part],
+                 }
+             )
+             os.environ.update(config)
+             cmd = '"%s" GPT_SoVITS/prepare_datasets/3-get-semantic.py'%python_exec
+             print(cmd)
+             p = Popen(cmd, shell=True)
+             ps1c.append(p)
+         yield "语义token提取进程执行中", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
+         for p in ps1c:
+             p.wait()
+         opt = ["item_name\tsemantic_audio"]
+         path_semantic = "%s/6-name2semantic.tsv" % opt_dir
+         for i_part in range(all_parts):
+             semantic_path = "%s/6-name2semantic-%s.tsv" % (opt_dir, i_part)
+             with open(semantic_path, "r", encoding="utf8") as f:
+                 opt += f.read().strip("\n").split("\n")
+             os.remove(semantic_path)
+         with open(path_semantic, "w", encoding="utf8") as f:
+             f.write("\n".join(opt) + "\n")
+         ps1c=[]
+         yield "语义token提取进程结束",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
+     else:
+         yield "已有正在进行的语义token提取任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
+
+ def close1c():
+     global ps1c
+     if (ps1c != []):
+         for p1c in ps1c:
+             try:
+                 kill_process(p1c.pid)
+             except:
+                 traceback.print_exc()
+         ps1c=[]
+     return "已终止所有语义token进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
+ #####inp_text,inp_wav_dir,exp_name,gpu_numbers1a,gpu_numbers1Ba,gpu_numbers1c,bert_pretrained_dir,cnhubert_base_dir,pretrained_s2G
+ ps1abc=[]
+ def open1abc(inp_text,inp_wav_dir,exp_name,gpu_numbers1a,gpu_numbers1Ba,gpu_numbers1c,bert_pretrained_dir,ssl_pretrained_dir,pretrained_s2G_path):
+     global ps1abc
+     inp_text = my_utils.clean_path(inp_text)
+     inp_wav_dir = my_utils.clean_path(inp_wav_dir)
+     if (ps1abc == []):
+         opt_dir="%s/%s"%(exp_root,exp_name)
+         try:
+             #############################1a
+             path_text="%s/2-name2text.txt" % opt_dir
+             if(os.path.exists(path_text)==False or (os.path.exists(path_text)==True and len(open(path_text,"r",encoding="utf8").read().strip("\n").split("\n"))<2)):
+                 config={
+                     "inp_text":inp_text,
+                     "inp_wav_dir":inp_wav_dir,
+                     "exp_name":exp_name,
+                     "opt_dir":opt_dir,
+                     "bert_pretrained_dir":bert_pretrained_dir,
+                     "is_half": str(is_half)
+                 }
+                 gpu_names=gpu_numbers1a.split("-")
+                 all_parts=len(gpu_names)
+                 for i_part in range(all_parts):
+                     config.update(
+                         {
+                             "i_part": str(i_part),
+                             "all_parts": str(all_parts),
+                             "_CUDA_VISIBLE_DEVICES": gpu_names[i_part],
+                         }
+                     )
+                     os.environ.update(config)
+                     cmd = '"%s" GPT_SoVITS/prepare_datasets/1-get-text.py'%python_exec
+                     print(cmd)
+                     p = Popen(cmd, shell=True)
+                     ps1abc.append(p)
+                 yield "进度:1a-ing", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
+                 for p in ps1abc:p.wait()
+
+                 opt = []
+                 for i_part in range(all_parts):#txt_path="%s/2-name2text-%s.txt"%(opt_dir,i_part)
+                     txt_path = "%s/2-name2text-%s.txt" % (opt_dir, i_part)
+                     with open(txt_path, "r",encoding="utf8") as f:
+                         opt += f.read().strip("\n").split("\n")
+                     os.remove(txt_path)
+                 with open(path_text, "w",encoding="utf8") as f:
+                     f.write("\n".join(opt) + "\n")
+
+             yield "进度:1a-done", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
+             ps1abc=[]
+             #############################1b
+             config={
+                 "inp_text":inp_text,
+                 "inp_wav_dir":inp_wav_dir,
+                 "exp_name":exp_name,
+                 "opt_dir":opt_dir,
+                 "cnhubert_base_dir":ssl_pretrained_dir,
+             }
+             gpu_names=gpu_numbers1Ba.split("-")
+             all_parts=len(gpu_names)
+             for i_part in range(all_parts):
+                 config.update(
+                     {
+                         "i_part": str(i_part),
+                         "all_parts": str(all_parts),
+                         "_CUDA_VISIBLE_DEVICES": gpu_names[i_part],
+                     }
+                 )
+                 os.environ.update(config)
+                 cmd = '"%s" GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py'%python_exec
+                 print(cmd)
+                 p = Popen(cmd, shell=True)
+                 ps1abc.append(p)
+             yield "进度:1a-done, 1b-ing", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
+             for p in ps1abc:p.wait()
+             yield "进度:1a1b-done", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
+             ps1abc=[]
+             #############################1c
+             path_semantic = "%s/6-name2semantic.tsv" % opt_dir
+             if(os.path.exists(path_semantic)==False or (os.path.exists(path_semantic)==True and os.path.getsize(path_semantic)<31)):
+                 config={
+                     "inp_text":inp_text,
+                     "exp_name":exp_name,
+                     "opt_dir":opt_dir,
+                     "pretrained_s2G":pretrained_s2G_path,
+                     "s2config_path":"GPT_SoVITS/configs/s2.json",
+                 }
+                 gpu_names=gpu_numbers1c.split("-")
+                 all_parts=len(gpu_names)
+                 for i_part in range(all_parts):
+                     config.update(
+                         {
+                             "i_part": str(i_part),
+                             "all_parts": str(all_parts),
+                             "_CUDA_VISIBLE_DEVICES": gpu_names[i_part],
+                         }
+                     )
+                     os.environ.update(config)
+                     cmd = '"%s" GPT_SoVITS/prepare_datasets/3-get-semantic.py'%python_exec
+                     print(cmd)
+                     p = Popen(cmd, shell=True)
+                     ps1abc.append(p)
+                 yield "进度:1a1b-done, 1cing", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
+                 for p in ps1abc:p.wait()
+
+                 opt = ["item_name\tsemantic_audio"]
+                 for i_part in range(all_parts):
+                     semantic_path = "%s/6-name2semantic-%s.tsv" % (opt_dir, i_part)
+                     with open(semantic_path, "r",encoding="utf8") as f:
+                         opt += f.read().strip("\n").split("\n")
+                     os.remove(semantic_path)
+                 with open(path_semantic, "w",encoding="utf8") as f:
+                     f.write("\n".join(opt) + "\n")
+             yield "进度:all-done", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
+             ps1abc = []
+             yield "一键三连进程结束", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
+         except:
+             traceback.print_exc()
+             close1abc()
+             yield "一键三连中途报错", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
+     else:
+         yield "已有正在进行的一键三连任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
+
+ def close1abc():
+     global ps1abc
+     if (ps1abc != []):
+         for p1abc in ps1abc:
+             try:
+                 kill_process(p1abc.pid)
+             except:
+                 traceback.print_exc()
+         ps1abc=[]
+     return "已终止所有一键三连进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
+
+ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
674
+ gr.Markdown(
675
+ value=
676
+ i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. <br>如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录<b>LICENSE</b>.")
677
+ )
678
+ gr.Markdown(
679
+ value=
680
+ i18n("中文教程文档:https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e")
681
+ )
682
+
683
+ with gr.Tabs():
684
+ with gr.TabItem(i18n("0-前置数据集获取工具")):#提前随机切片防止uvr5爆内存->uvr5->slicer->asr->打标
685
+ gr.Markdown(value=i18n("0a-UVR5人声伴奏分离&去混响去延迟工具"))
686
+ with gr.Row():
687
+ if_uvr5 = gr.Checkbox(label=i18n("是否开启UVR5-WebUI"),show_label=True)
688
+ uvr5_info = gr.Textbox(label=i18n("UVR5进程输出信息"))
689
+ gr.Markdown(value=i18n("0b-语音切分工具"))
690
+ with gr.Row():
691
+ with gr.Row():
692
+ slice_inp_path=gr.Textbox(label=i18n("音频自动切分输入路径,可文件可文件夹"),value="")
693
+ slice_opt_root=gr.Textbox(label=i18n("切分后的子音频的输出根目录"),value="output/slicer_opt")
694
+ threshold=gr.Textbox(label=i18n("threshold:音量小于这个值视作静音的备选切割点"),value="-34")
695
+ min_length=gr.Textbox(label=i18n("min_length:每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值"),value="4000")
696
+ min_interval=gr.Textbox(label=i18n("min_interval:最短切割间隔"),value="300")
697
+ hop_size=gr.Textbox(label=i18n("hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)"),value="10")
698
+ max_sil_kept=gr.Textbox(label=i18n("max_sil_kept:切完后静音最多留多长"),value="500")
699
+ with gr.Row():
700
+ open_slicer_button=gr.Button(i18n("开启语音切割"), variant="primary",visible=True)
701
+ close_slicer_button=gr.Button(i18n("终止语音切割"), variant="primary",visible=False)
702
+ _max=gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("max:归一化后最大值多少"),value=0.9,interactive=True)
703
+ alpha=gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("alpha_mix:混多少比例归一化后音频进来"),value=0.25,interactive=True)
704
+ n_process=gr.Slider(minimum=1,maximum=n_cpu,step=1,label=i18n("切割使用的进程数"),value=4,interactive=True)
705
+ slicer_info = gr.Textbox(label=i18n("语音切割进���输出信息"))
706
+ gr.Markdown(value=i18n("0bb-语音降噪工具"))
707
+ with gr.Row():
708
+ open_denoise_button = gr.Button(i18n("开启语音降噪"), variant="primary",visible=True)
709
+ close_denoise_button = gr.Button(i18n("终止语音降噪进程"), variant="primary",visible=False)
710
+ denoise_input_dir=gr.Textbox(label=i18n("降噪音频文件输入文件夹"),value="")
711
+ denoise_output_dir=gr.Textbox(label=i18n("降噪结果输出文件夹"),value="output/denoise_opt")
712
+ denoise_info = gr.Textbox(label=i18n("语音降噪进程输出信息"))
713
+ gr.Markdown(value=i18n("0c-中文批量离线ASR工具"))
714
+ with gr.Row():
715
+ open_asr_button = gr.Button(i18n("开启离线批量ASR"), variant="primary",visible=True)
716
+ close_asr_button = gr.Button(i18n("终止ASR进程"), variant="primary",visible=False)
717
+ with gr.Column():
718
+ with gr.Row():
719
+ asr_inp_dir = gr.Textbox(
720
+ label=i18n("输入文件夹路径"),
721
+ value="D:\\GPT-SoVITS\\raw\\xxx",
722
+ interactive=True,
723
+ )
724
+ asr_opt_dir = gr.Textbox(
725
+ label = i18n("输出文件夹路径"),
726
+ value = "output/asr_opt",
727
+ interactive = True,
728
+ )
729
+ with gr.Row():
730
+ asr_model = gr.Dropdown(
731
+ label = i18n("ASR 模型"),
732
+ choices = list(asr_dict.keys()),
733
+ interactive = True,
734
+ value="达摩 ASR (中文)"
735
+ )
736
+ asr_size = gr.Dropdown(
737
+ label = i18n("ASR 模型尺寸"),
738
+ choices = ["large"],
739
+ interactive = True,
740
+ value="large"
741
+ )
742
+ asr_lang = gr.Dropdown(
743
+ label = i18n("ASR 语言设置"),
744
+ choices = ["zh"],
745
+ interactive = True,
746
+ value="zh"
747
+ )
748
+ with gr.Row():
749
+ asr_info = gr.Textbox(label=i18n("ASR进程输出信息"))
750
+
751
+ def change_lang_choices(key): #根据选择的模型修改可选的语言
752
+ # return gr.Dropdown(choices=asr_dict[key]['lang'])
753
+ return {"__type__": "update", "choices": asr_dict[key]['lang'],"value":asr_dict[key]['lang'][0]}
754
+ def change_size_choices(key): # 根据选择的模型修改可选的模型尺寸
755
+ # return gr.Dropdown(choices=asr_dict[key]['size'])
756
+ return {"__type__": "update", "choices": asr_dict[key]['size']}
757
+ asr_model.change(change_lang_choices, [asr_model], [asr_lang])
758
+ asr_model.change(change_size_choices, [asr_model], [asr_size])
+
+ gr.Markdown(value=i18n("0d-语音文本校对标注工具"))
+ with gr.Row():
+ if_label = gr.Checkbox(label=i18n("是否开启打标WebUI"),show_label=True)
+ path_list = gr.Textbox(
+ label=i18n(".list标注文件的路径"),
+ value="D:\\RVC1006\\GPT-SoVITS\\raw\\xxx.list",
+ interactive=True,
+ )
+ label_info = gr.Textbox(label=i18n("打标工具进程输出信息"))
+ if_label.change(change_label, [if_label,path_list], [label_info])
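+ # Toggling the checkbox presumably starts (or kills) the proofreading/labeling
+ # WebUI as a subprocess on the given .list file, so transcripts can be
+ # corrected by hand before training.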
+ if_uvr5.change(change_uvr5, [if_uvr5], [uvr5_info])
+ open_asr_button.click(open_asr, [asr_inp_dir, asr_opt_dir, asr_model, asr_size, asr_lang], [asr_info,open_asr_button,close_asr_button])
+ close_asr_button.click(close_asr, [], [asr_info,open_asr_button,close_asr_button])
+ open_slicer_button.click(open_slice, [slice_inp_path,slice_opt_root,threshold,min_length,min_interval,hop_size,max_sil_kept,_max,alpha,n_process], [slicer_info,open_slicer_button,close_slicer_button])
+ close_slicer_button.click(close_slice, [], [slicer_info,open_slicer_button,close_slicer_button])
+ open_denoise_button.click(open_denoise, [denoise_input_dir,denoise_output_dir], [denoise_info,open_denoise_button,close_denoise_button])
+ close_denoise_button.click(close_denoise, [], [denoise_info,open_denoise_button,close_denoise_button])
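+ # Shared pattern for every tool on this tab: each open_* handler launches the
+ # task as a subprocess and flips the open/close buttons' visibility, while the
+ # matching close_* handler terminates it, streaming status into the info box.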
777
+
778
+ with gr.TabItem(i18n("1-GPT-SoVITS-TTS")):
779
+ with gr.Row():
780
+ exp_name = gr.Textbox(label=i18n("*实验/模型名"), value="xxx", interactive=True)
781
+ gpu_info = gr.Textbox(label=i18n("显卡信息"), value=gpu_info, visible=True, interactive=False)
782
+ pretrained_s2G = gr.Textbox(label=i18n("预训练的SoVITS-G模型路径"), value="GPT_SoVITS/pretrained_models/s2G488k.pth", interactive=True)
783
+ pretrained_s2D = gr.Textbox(label=i18n("预训练的SoVITS-D模型路径"), value="GPT_SoVITS/pretrained_models/s2D488k.pth", interactive=True)
784
+ pretrained_s1 = gr.Textbox(label=i18n("预训练的GPT模型路径"), value="GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt", interactive=True)
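+ # Fine-tuning starts from these base checkpoints: s2G/s2D are the SoVITS
+ # generator and discriminator, and s1 is the autoregressive GPT (text-to-
+ # semantic) model. Note the gpu_info textbox rebinds the gpu_info string
+ # variable it displays, shadowing it from here on.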
785
+ with gr.TabItem(i18n("1A-训练集格式化工具")):
786
+ gr.Markdown(value=i18n("输出logs/实验名目录下应有23456开头的文件和文件夹"))
787
+ with gr.Row():
788
+ inp_text = gr.Textbox(label=i18n("*文本标注文件"),value=r"D:\RVC1006\GPT-SoVITS\raw\xxx.list",interactive=True)
789
+ inp_wav_dir = gr.Textbox(
790
+ label=i18n("*训练集音频文件目录"),
791
+ # value=r"D:\RVC1006\GPT-SoVITS\raw\xxx",
792
+ interactive=True,
793
+ placeholder=i18n("填切割后音频所在目录!读取的音频文件完整路径=该目录-拼接-list文件里波形对应的文件名(不是全路径)。如果留空则使用.list文件里的绝对全路径。")
794
+ )
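+ # The .list file presumably follows the usual GPT-SoVITS annotation format,
+ # one utterance per line (example values here are hypothetical):
+ #     vocal_001.wav|speaker_name|zh|今天天气真好。
+ # When inp_wav_dir is set, each audio path is resolved as inp_wav_dir joined
+ # with the file name from the .list line; when left empty, the .list entry is
+ # treated as an absolute path.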
+ gr.Markdown(value=i18n("1Aa-文本内容"))
+ with gr.Row():
+ gpu_numbers1a = gr.Textbox(label=i18n("GPU卡号以-分割,每个卡号一个进程"),value="%s-%s"%(gpus,gpus),interactive=True)
+ bert_pretrained_dir = gr.Textbox(label=i18n("预训练的中文BERT模型路径"),value="GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large",interactive=False)
+ button1a_open = gr.Button(i18n("开启文本获取"), variant="primary",visible=True)
+ button1a_close = gr.Button(i18n("终止文本获取进程"), variant="primary",visible=False)
+ info1a=gr.Textbox(label=i18n("文本进程输出信息"))
+ gr.Markdown(value=i18n("1Ab-SSL自监督特征提取"))
+ with gr.Row():
+ gpu_numbers1Ba = gr.Textbox(label=i18n("GPU卡号以-分割,每个卡号一个进程"),value="%s-%s"%(gpus,gpus),interactive=True)
+ cnhubert_base_dir = gr.Textbox(label=i18n("预训练的SSL模型路径"),value="GPT_SoVITS/pretrained_models/chinese-hubert-base",interactive=False)
+ button1b_open = gr.Button(i18n("开启SSL提取"), variant="primary",visible=True)
+ button1b_close = gr.Button(i18n("终止SSL提取进程"), variant="primary",visible=False)
+ info1b=gr.Textbox(label=i18n("SSL进程输出信息"))
+ gr.Markdown(value=i18n("1Ac-语义token提取"))
+ with gr.Row():
+ gpu_numbers1c = gr.Textbox(label=i18n("GPU卡号以-分割,每个卡号一个进程"),value="%s-%s"%(gpus,gpus),interactive=True)
+ button1c_open = gr.Button(i18n("开启语义token提取"), variant="primary",visible=True)
+ button1c_close = gr.Button(i18n("终止语义token提取进程"), variant="primary",visible=False)
+ info1c=gr.Textbox(label=i18n("语义token提取进程输出信息"))
+ gr.Markdown(value=i18n("1Aabc-训练集格式化一键三连"))
+ with gr.Row():
+ button1abc_open = gr.Button(i18n("开启一键三连"), variant="primary",visible=True)
+ button1abc_close = gr.Button(i18n("终止一键三连"), variant="primary",visible=False)
+ info1abc=gr.Textbox(label=i18n("一键三连进程输出信息"))
+ button1a_open.click(open1a, [inp_text,inp_wav_dir,exp_name,gpu_numbers1a,bert_pretrained_dir], [info1a,button1a_open,button1a_close])
+ button1a_close.click(close1a, [], [info1a,button1a_open,button1a_close])
+ button1b_open.click(open1b, [inp_text,inp_wav_dir,exp_name,gpu_numbers1Ba,cnhubert_base_dir], [info1b,button1b_open,button1b_close])
+ button1b_close.click(close1b, [], [info1b,button1b_open,button1b_close])
+ button1c_open.click(open1c, [inp_text,exp_name,gpu_numbers1c,pretrained_s2G], [info1c,button1c_open,button1c_close])
+ button1c_close.click(close1c, [], [info1c,button1c_open,button1c_close])
+ button1abc_open.click(open1abc, [inp_text,inp_wav_dir,exp_name,gpu_numbers1a,gpu_numbers1Ba,gpu_numbers1c,bert_pretrained_dir,cnhubert_base_dir,pretrained_s2G], [info1abc,button1abc_open,button1abc_close])
+ button1abc_close.click(close1abc, [], [info1abc,button1abc_open,button1abc_close])
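+ # Dataset formatting is three passes: 1Aa extracts BERT text features from the
+ # transcripts, 1Ab extracts cnhubert SSL features from the audio, and 1Ac
+ # quantizes those features into semantic tokens using the pretrained SoVITS-G.
+ # "一键三连" (one-click) simply runs all three in sequence.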
+ with gr.TabItem(i18n("1B-微调训练")):
+ gr.Markdown(value=i18n("1Ba-SoVITS训练。用于分享的模型文件输出在SoVITS_weights下。"))
+ with gr.Row():
+ batch_size = gr.Slider(minimum=1,maximum=40,step=1,label=i18n("每张显卡的batch_size"),value=default_batch_size,interactive=True)
+ total_epoch = gr.Slider(minimum=1,maximum=25,step=1,label=i18n("总训练轮数total_epoch,不建议太高"),value=8,interactive=True)
+ text_low_lr_rate = gr.Slider(minimum=0.2,maximum=0.6,step=0.05,label=i18n("文本模块学习率权重"),value=0.4,interactive=True)
+ save_every_epoch = gr.Slider(minimum=1,maximum=25,step=1,label=i18n("保存频率save_every_epoch"),value=4,interactive=True)
+ if_save_latest = gr.Checkbox(label=i18n("是否仅保存最新的ckpt文件以节省硬盘空间"), value=True, interactive=True, show_label=True)
+ if_save_every_weights = gr.Checkbox(label=i18n("是否在每次保存时间点将最终小模型保存至weights文件夹"), value=True, interactive=True, show_label=True)
+ gpu_numbers1Ba = gr.Textbox(label=i18n("GPU卡号以-分割,每个卡号一个进程"), value="%s" % (gpus), interactive=True)
+ with gr.Row():
+ button1Ba_open = gr.Button(i18n("开启SoVITS训练"), variant="primary",visible=True)
+ button1Ba_close = gr.Button(i18n("终止SoVITS训练"), variant="primary",visible=False)
+ info1Ba=gr.Textbox(label=i18n("SoVITS训练进程输出信息"))
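+ # text_low_lr_rate scales the learning rate applied to the text-encoder
+ # parameters relative to the rest of the SoVITS model, which presumably keeps
+ # the text side from drifting too far on small fine-tuning sets.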
+ gr.Markdown(value=i18n("1Bb-GPT训练。用于分享的模型文件输出在GPT_weights下。"))
+ with gr.Row():
+ batch_size1Bb = gr.Slider(minimum=1,maximum=40,step=1,label=i18n("每张显卡的batch_size"),value=default_batch_size,interactive=True)
+ total_epoch1Bb = gr.Slider(minimum=2,maximum=50,step=1,label=i18n("总训练轮数total_epoch"),value=15,interactive=True)
+ if_dpo = gr.Checkbox(label=i18n("是否开启dpo训练选项(实验性)"), value=False, interactive=True, show_label=True)
+ if_save_latest1Bb = gr.Checkbox(label=i18n("是否仅保存最新的ckpt文件以节省硬盘空间"), value=True, interactive=True, show_label=True)
+ if_save_every_weights1Bb = gr.Checkbox(label=i18n("是否在每次保存时间点将最终小模型保存至weights文件夹"), value=True, interactive=True, show_label=True)
+ save_every_epoch1Bb = gr.Slider(minimum=1,maximum=50,step=1,label=i18n("保存频率save_every_epoch"),value=5,interactive=True)
+ gpu_numbers1Bb = gr.Textbox(label=i18n("GPU卡号以-分割,每个卡号一个进程"), value="%s" % (gpus), interactive=True)
+ with gr.Row():
+ button1Bb_open = gr.Button(i18n("开启GPT训练"), variant="primary",visible=True)
+ button1Bb_close = gr.Button(i18n("终止GPT训练"), variant="primary",visible=False)
+ info1Bb=gr.Textbox(label=i18n("GPT训练进程输出信息"))
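+ # if_dpo enables the experimental DPO (Direct Preference Optimization) path
+ # for GPT training; it is off by default and, as the label notes, experimental.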
+ button1Ba_open.click(open1Ba, [batch_size,total_epoch,exp_name,text_low_lr_rate,if_save_latest,if_save_every_weights,save_every_epoch,gpu_numbers1Ba,pretrained_s2G,pretrained_s2D], [info1Ba,button1Ba_open,button1Ba_close])
+ button1Ba_close.click(close1Ba, [], [info1Ba,button1Ba_open,button1Ba_close])
+ button1Bb_open.click(open1Bb, [batch_size1Bb,total_epoch1Bb,exp_name,if_dpo,if_save_latest1Bb,if_save_every_weights1Bb,save_every_epoch1Bb,gpu_numbers1Bb,pretrained_s1], [info1Bb,button1Bb_open,button1Bb_close])
+ button1Bb_close.click(close1Bb, [], [info1Bb,button1Bb_open,button1Bb_close])
+ with gr.TabItem(i18n("1C-推理")):
+ gr.Markdown(value=i18n("选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模,体验5秒Zero Shot TTS用。"))
+ with gr.Row():
+ GPT_dropdown = gr.Dropdown(label=i18n("*GPT模型列表"), choices=sorted(GPT_names,key=custom_sort_key),value=pretrained_gpt_name,interactive=True)
+ SoVITS_dropdown = gr.Dropdown(label=i18n("*SoVITS模型列表"), choices=sorted(SoVITS_names,key=custom_sort_key),value=pretrained_sovits_name,interactive=True)
+ gpu_number_1C=gr.Textbox(label=i18n("GPU卡号,只能填1个整数"), value=gpus, interactive=True)
+ refresh_button = gr.Button(i18n("刷新模型路径"), variant="primary")
+ refresh_button.click(fn=change_choices,inputs=[],outputs=[SoVITS_dropdown,GPT_dropdown])
+ with gr.Row():
+ if_tts = gr.Checkbox(label=i18n("是否开启TTS推理WebUI"), show_label=True)
+ tts_info = gr.Textbox(label=i18n("TTS推理WebUI进程输出信息"))
+ if_tts.change(change_tts_inference, [if_tts,bert_pretrained_dir,cnhubert_base_dir,gpu_number_1C,GPT_dropdown,SoVITS_dropdown], [tts_info])
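+ # Checking the box spawns the standalone TTS inference WebUI with the selected
+ # GPT/SoVITS checkpoints; the paths are presumably handed to the subprocess
+ # through environment variables, so unchecking and re-checking picks up a new
+ # model selection.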
+ with gr.TabItem(i18n("2-GPT-SoVITS-变声")):gr.Markdown(value=i18n("施工中,请静候佳音"))
+ app.queue(concurrency_count=511, max_size=1022).launch(
+ server_name="0.0.0.0",
+ inbrowser=True,
+ share=is_share,
+ server_port=webui_port_main,
+ quiet=True,
+ )
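+ # Gradio 3.x queue settings: concurrency_count caps simultaneously running
+ # events and max_size caps the number of queued requests; binding to 0.0.0.0
+ # exposes the UI on all interfaces at webui_port_main. (concurrency_count was
+ # removed in Gradio 4, so this call is version-sensitive.)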