# -*- coding: utf-8 -*- import traceback import torch from scipy.io import wavfile import edge_tts import subprocess import gradio as gr import gradio.processing_utils as gr_pu import io import os import logging import time from pathlib import Path import re import json import argparse import librosa import matplotlib.pyplot as plt import numpy as np import soundfile from inference import infer_tool from inference import slicer from inference.infer_tool import Svc logging.getLogger('numba').setLevel(logging.WARNING) chunks_dict = infer_tool.read_temp("inference/chunks_temp.json") logging.getLogger('numba').setLevel(logging.WARNING) logging.getLogger('markdown_it').setLevel(logging.WARNING) logging.getLogger('urllib3').setLevel(logging.WARNING) logging.getLogger('matplotlib').setLevel(logging.WARNING) logging.getLogger('multipart').setLevel(logging.WARNING) model = None spk = None debug = False class HParams(): def __init__(self, **kwargs): for k, v in kwargs.items(): if type(v) == dict: v = HParams(**v) self[k] = v def keys(self): return self.__dict__.keys() def items(self): return self.__dict__.items() def values(self): return self.__dict__.values() def __len__(self): return len(self.__dict__) def __getitem__(self, key): return getattr(self, key) def __setitem__(self, key, value): return setattr(self, key, value) def __contains__(self, key): return key in self.__dict__ def __repr__(self): return self.__dict__.__repr__() def get_hparams_from_file(config_path): with open(config_path, "r", encoding="utf-8") as f: data = f.read() config = json.loads(data) hparams = HParams(**config) return hparams def vc_fn(sid, input_audio, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, f0_predictor, enhancer_adaptive_key, cr_threshold): try: if input_audio is None: raise gr.Error("你需要上傳音頻") if model is None: raise gr.Error("你需要指定模型") sampling_rate, audio = input_audio # print(audio.shape,sampling_rate) audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32) if len(audio.shape) > 1: audio = librosa.to_mono(audio.transpose(1, 0)) temp_path = "temp.wav" soundfile.write(temp_path, audio, sampling_rate, format="wav") _audio = model.slice_inference(temp_path, sid, vc_transform, slice_db, cluster_ratio, auto_f0, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, f0_predictor, enhancer_adaptive_key, cr_threshold) model.clear_empty() os.remove(temp_path) # 構建保存文件的路徑,並保存到results文件夾內 try: timestamp = str(int(time.time())) filename = sid + "_" + timestamp + ".wav" # output_file = os.path.join("./results", filename) # soundfile.write(output_file, _audio, model.target_sample, format="wav") soundfile.write('/tmp/'+filename, _audio, model.target_sample, format="wav") # return f"推理成功,音頻文件保存為results/{filename}", (model.target_sample, _audio) return f"推理成功,音頻文件保存為{filename}", (model.target_sample, _audio) except Exception as e: if debug: traceback.print_exc() return f"文件保存失敗,請手動保存", (model.target_sample, _audio) except Exception as e: if debug: traceback.print_exc() raise gr.Error(e) def tts_func(_text, _rate, _voice): # 使用edge-tts把文字轉成音頻 # voice = "zh-CN-XiaoyiNeural"#女性,較高音 # voice = "zh-CN-YunxiNeural"#男性 voice = "zh-CN-YunxiNeural" # 男性 if (_voice == "女"): voice = "zh-CN-XiaoyiNeural" output_file = "/tmp/"+_text[0:10]+".wav" # communicate = edge_tts.Communicate(_text, voice) # await communicate.save(output_file) if _rate >= 0: ratestr = "+{:.0%}".format(_rate) elif _rate < 0: ratestr = "{:.0%}".format(_rate) # 減號自帶 p = subprocess.Popen("python -m edge_tts " + " --text "+_text + " --write-media "+output_file + " --voice "+voice + " --rate="+ratestr, shell=True, stdout=subprocess.PIPE, stdin=subprocess.PIPE) p.wait() return output_file def text_clear(text): return re.sub(r"[\n\,\(\) ]", "", text) def vc_fn2(sid, input_audio, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, text2tts, tts_rate, tts_voice, f0_predictor, enhancer_adaptive_key, cr_threshold): # 使用edge-tts把文字轉成音頻 text2tts = text_clear(text2tts) output_file = tts_func(text2tts, tts_rate, tts_voice) # 調整採樣率 sr2 = 44100 wav, sr = librosa.load(output_file) wav2 = librosa.resample(wav, orig_sr=sr, target_sr=sr2) save_path2 = text2tts[0:10]+"_44k"+".wav" wavfile.write(save_path2, sr2, (wav2 * np.iinfo(np.int16).max).astype(np.int16) ) # 讀取音頻 sample_rate, data = gr_pu.audio_from_file(save_path2) vc_input = (sample_rate, data) a, b = vc_fn(sid, vc_input, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, f0_predictor, enhancer_adaptive_key, cr_threshold) os.remove(output_file) os.remove(save_path2) return a, b models_info = [ { "description": """ 這個模型包含李永樂老師AI語音模型。\n\n Space採用CPU推理,速度極慢,建議下載模型本地GPU推理。\n\n """, "model_path": "model/G_4800.pth", "config_path": "model/config.json", } ] model_inferall = [] if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--share", action="store_true", default=False, help="share gradio app") # 一定要設置的部份 parser.add_argument('-cl', '--clip', type=float, default=0, help='音頻強製切片,預設0為自動切片,單位為秒/s') parser.add_argument('-n', '--clean_names', type=str, nargs='+', default=["君の知らない物語-src.wav"], help='wav文件名列表,放在raw文件夾下') parser.add_argument('-t', '--trans', type=int, nargs='+', default=[0], help='音高調整,支持正負(半音)') parser.add_argument('-s', '--spk_list', type=str, nargs='+', default=['nen'], help='合成目標說話人名稱') # 可選項部份 parser.add_argument('-a', '--auto_predict_f0', action='store_true', default=False, help='語音轉換自動預測音高,轉換歌聲時不要打開這個會嚴重跑調') parser.add_argument('-cm', '--cluster_model_path', type=str, default="logs/44k/kmeans_10000.pt", help='聚類模型路徑,如果沒有訓練聚類則隨便填') parser.add_argument('-cr', '--cluster_infer_ratio', type=float, default=0, help='聚類方案佔比,範圍0-1,若沒有訓練聚類模型則預設0即可') parser.add_argument('-lg', '--linear_gradient', type=float, default=0, help='兩段音頻切片的交叉淡入長度,如果強製切片後出現人聲不連貫可調整該數值,如果連貫建議採用預設值0,單位為秒') parser.add_argument('-f0p', '--f0_predictor', type=str, default="pm", help='選擇F0預測器,可選擇crepe,pm,dio,harvest,預設為pm(註意:crepe為原F0使用均值濾波器)') parser.add_argument('-eh', '--enhance', action='store_true', default=False, help='是否使用NSF_HIFIGAN增強器,該選項對部份訓練集少的模型有一定的音質增強效果,但是對訓練好的模型有反麵效果,預設關閉') parser.add_argument('-shd', '--shallow_diffusion', action='store_true', default=False, help='是否使用淺層擴散,使用後可解決一部份電音問題,預設關閉,該選項打開時,NSF_HIFIGAN增強器將會被禁止') # 淺擴散設置 parser.add_argument('-dm', '--diffusion_model_path', type=str, default="logs/44k/diffusion/model_0.pt", help='擴散模型路徑') parser.add_argument('-dc', '--diffusion_config_path', type=str, default="logs/44k/diffusion/config.yaml", help='擴散模型配置文件路徑') parser.add_argument('-ks', '--k_step', type=int, default=100, help='擴散步數,越大越接近擴散模型的結果,預設100') parser.add_argument('-od', '--only_diffusion', action='store_true', default=False, help='純擴散模式,該模式不會加載sovits模型,以擴散模型推理') # 不用動的部份 parser.add_argument('-sd', '--slice_db', type=int, default=-40, help='預設-40,嘈雜的音頻可以-30,幹聲保留呼吸可以-50') parser.add_argument('-d', '--device', type=str, default=None, help='推理設備,None則為自動選擇cpu和gpu') parser.add_argument('-ns', '--noice_scale', type=float, default=0.4, help='噪音級別,會影響咬字和音質,較為玄學') parser.add_argument('-p', '--pad_seconds', type=float, default=0.5, help='推理音頻pad秒數,由於未知原因開頭結尾會有異響,pad一小段靜音段後就不會出現') parser.add_argument('-wf', '--wav_format', type=str, default='flac', help='音頻輸出格式') parser.add_argument('-lgr', '--linear_gradient_retain', type=float, default=0.75, help='自動音頻切片後,需要捨棄每段切片的頭尾。該參數設置交叉長度保留的比例,範圍0-1,左開右閉') parser.add_argument('-eak', '--enhancer_adaptive_key', type=int, default=0, help='使增強器適應更高的音域(單位為半音數)|預設為0') parser.add_argument('-ft', '--f0_filter_threshold', type=float, default=0.05, help='F0過濾閾值,隻有使用crepe時有效. 數值範圍從0-1. 降低該值可減少跑調概率,但會增加啞音') args = parser.parse_args() categories = ["李永樂老師AI語音模型"] others = { "PCR vits-fast-fineturning": "https://huggingface.co/spaces/FrankZxShen/vits-fast-finetuning-pcr", } for info in models_info: config_path = info['config_path'] model_path = info['model_path'] description = info['description'] clean_names = args.clean_names trans = args.trans spk_list = list(get_hparams_from_file(config_path).spk.keys()) slice_db = args.slice_db wav_format = args.wav_format auto_predict_f0 = args.auto_predict_f0 cluster_infer_ratio = args.cluster_infer_ratio noice_scale = args.noice_scale pad_seconds = args.pad_seconds clip = args.clip lg = args.linear_gradient lgr = args.linear_gradient_retain f0p = args.f0_predictor enhance = args.enhance enhancer_adaptive_key = args.enhancer_adaptive_key cr_threshold = args.f0_filter_threshold diffusion_model_path = args.diffusion_model_path diffusion_config_path = args.diffusion_config_path k_step = args.k_step only_diffusion = args.only_diffusion shallow_diffusion = args.shallow_diffusion model = Svc(model_path, config_path, args.device, args.cluster_model_path, enhance, diffusion_model_path, diffusion_config_path, shallow_diffusion, only_diffusion) model_inferall.append((description, spk_list, model)) app = gr.Blocks() with app: gr.Markdown( "#